Merge pull request #2 from HaoLiHaiO/feat/combine-scripts

Make image dl optional with a flag
2024-07-19 08:52:01 -04:00
parent a3ac64ab5d c54a2ac1cd
commit f84a142f4b
4 changed files with 36 additions and 68 deletions
--- a/.gitignore
+++ b/.gitignore
@@ -0,0 +1,2 @@
+.venv
+md_output/
--- a/README.md
+++ b/README.md
@@ -20,6 +20,19 @@ pip3 install -r requirements.txt
 python3 wiki-to-md.py <topic_name>
 ```

+Downloading the images is optional. The option is set to `yes` by default;
+set it to `no` to skip the image download.
+
+```bash
+python3 wiki-to-md.py --dl-image=no <topic_name>
+```
+
+For help:
+
+```bash
+python3 wiki-to-md.py --help
+```
+
 ## Output

 The output is a Markdown file with the same name as the topic name under the newly created directory `md_output` if using `wiki-to-md.py`. If you want to download images too, use the `wiki-to-md-images.py` file and the images will be placed inside `md_output/images/`.
--- a/wiki-to-md-images.py
+++ b/wiki-to-md-images.py
@@ -6,7 +6,7 @@ import requests
 import urllib.parse


-def generate_markdown(topic):
+def generate_markdown(topic, download_images):
    try:
        page = wikipedia.page(topic)
    except wikipedia.exceptions.DisambiguationError as e:
@@ -32,21 +32,22 @@ def generate_markdown(topic):
    output_directory = "md_output"
    os.makedirs(output_directory, exist_ok=True)

-    # Create a directory for image files
-    image_directory = os.path.join(output_directory, "images")
-    os.makedirs(image_directory, exist_ok=True)
+    if download_images:
+        # Create a directory for image files
+        image_directory = os.path.join(output_directory, "images")
+        os.makedirs(image_directory, exist_ok=True)

-    for image_url in page.images:
-        image_filename = urllib.parse.unquote(os.path.basename(image_url))
-        image_path = os.path.join(image_directory, image_filename)
-        image_data = requests.get(image_url).content
-        with open(image_path, "wb") as image_file:
-            image_file.write(image_data)
-        markdown_text += f"![{image_filename}](./images/{image_filename})\n"
+        for image_url in page.images:
+            image_filename = urllib.parse.unquote(os.path.basename(image_url))
+            image_path = os.path.join(image_directory, image_filename)
+            image_data = requests.get(image_url).content
+            with open(image_path, "wb") as image_file:
+                image_file.write(image_data)
+            markdown_text += f"![{image_filename}](./images/{image_filename})\n"

    filename = os.path.join(output_directory, f'{topic.replace(" ", "_")}.md')

-    with open(filename, "w") as md_file:
+    with open(filename, "w", encoding="utf-8") as md_file:
        md_file.write(markdown_text)

    print(f"Markdown file created: {filename}")
@@ -61,9 +62,16 @@ parser.add_argument(
    type=str,
    help="The topic to generate a markdown file for.",
 )
+parser.add_argument(
+    "--dl-image",
+    choices=['yes', 'no'],
+    default='yes',
+    help="Specify whether to download images (yes or no).",
+)

 args = parser.parse_args()

 topic = f"{args.topic}"
+download_images = args.dl_image == 'yes'

-generate_markdown(topic)
+generate_markdown(topic, download_images)
--- a/wiki-to-md.py
+++ b/wiki-to-md.py
@@ -1,55 +0,0 @@
-import os
-import wikipedia
-import argparse
-import re
-
-
-def generate_markdown(topic):
-    try:
-        page = wikipedia.page(topic)
-    except wikipedia.exceptions.DisambiguationError as e:
-        print(e.options)
-        return None
-    except wikipedia.exceptions.PageError:
-        print(f"Page not found for the topic: {topic}")
-        return None
-
-    markdown_text = f"# {topic}\n\n"
-
-    page_content = re.sub(r"=== ([^=]+) ===", r"### \1", page.content)
-    page_content = re.sub(r"== ([^=]+) ==", r"## \1", page_content)
-
-    sections = re.split(r"\n(## .*)\n", page_content)
-    for i in range(0, len(sections), 2):
-        if i + 1 < len(sections) and any(
-            line.strip() for line in sections[i + 1].split("\n")
-        ):
-            markdown_text += f"{sections[i]}\n{sections[i+1]}\n\n"
-
-    # Create a directory for markdown files
-    directory = "md_output"
-    os.makedirs(directory, exist_ok=True)
-
-    filename = os.path.join(directory, f"{topic.replace(' ', '_')}.md")
-
-    with open(filename, "w") as md_file:
-        md_file.write(markdown_text)
-
-    print(f"Markdown file created: {filename}")
-    return filename
-
-
-parser = argparse.ArgumentParser(
-    description="Generate a markdown file for a provided topic."
-)
-parser.add_argument(
-    "topic",
-    type=str,
-    help="The topic to generate a markdown file for.",
-)
-
-args = parser.parse_args()
-
-topic = f"{args.topic}"
-
-generate_markdown(topic)