Python 脚本：将 WordPress 文章转换为 Markdown

我使用以下脚本将旧的基于 WordPress 的 techoverflow.net 文章转换为 Markdown 格式，以便与 Hugo 一起使用。它们使用 beautifulsoup4 和 markdownify 库来解析 HTML 内容并将其转换为 Markdown。

第 1 部分：获取 WordPress 分类

wp_to_md_part1.py

#!/usr/bin/env python3
from tqdm import tqdm
import requests
import json
import base64
import os

# Blog base URL and the user name used for the REST API
URL = "https://myblog.com"
USER = "uli"

# The password is read from the environment; abort early if it is missing
PASSWORD = os.environ.get("WORDPRESS_API_PASSWORD")
if PASSWORD is None:
    raise ValueError("WORDPRESS_API_PASSWORD environment variable is not set")

# HTTP Basic authentication header value: b"Basic " + base64("user:password")
auth_header = b"Basic " + base64.b64encode((USER + ":" + PASSWORD).encode("utf-8"))

def page_numbers():
    """Infinite generator of page numbers: yields 1, 2, 3, ... and never stops"""
    current = 0
    while True:
        current += 1
        yield current

categories = []
for page in tqdm(page_numbers()):
    # Fetch the next page of categories (up to per_page=100 entries per request)
    categories_page = requests.get(
        f"{URL}/wp-json/wp/v2/categories",
        params={"page": page, "per_page": 100, "context": "edit"},
        headers={"Authorization": auth_header},
    ).json()
    # A JSON object instead of a list is an error response. Extending the
    # list with it would silently add its keys and could keep looping forever.
    if isinstance(categories_page, dict):
        raise RuntimeError(f"WordPress API error on page {page}: {categories_page}")
    # An empty page means we are past the last page
    if not categories_page:
        break
    # No error -> add categories
    categories += categories_page

# Save all collected categories as JSON
with open("categories.json", "w") as outfile:
    json.dump(categories, outfile)

#!/usr/bin/env python3
from tqdm import tqdm
import requests
import json
import base64
import os

# Blog base URL and the user name used for the REST API
URL = "https://myblog.com"
USER = "uli"

# The password is read from the environment; abort early if it is missing
PASSWORD = os.environ.get("WORDPRESS_API_PASSWORD")
if PASSWORD is None:
    raise ValueError("WORDPRESS_API_PASSWORD environment variable is not set")

# HTTP Basic authentication header value: b"Basic " + base64("user:password")
auth_header = b"Basic " + base64.b64encode((USER + ":" + PASSWORD).encode("utf-8"))

def page_numbers():
    """Infinite generator of page numbers: yields 1, 2, 3, ... and never stops"""
    current = 0
    while True:
        current += 1
        yield current

categories = []
for page in tqdm(page_numbers()):
    # Fetch the next page of categories (up to per_page=100 entries per request)
    categories_page = requests.get(
        f"{URL}/wp-json/wp/v2/categories",
        params={"page": page, "per_page": 100, "context": "edit"},
        headers={"Authorization": auth_header},
    ).json()
    # A JSON object instead of a list is an error response. Extending the
    # list with it would silently add its keys and could keep looping forever.
    if isinstance(categories_page, dict):
        raise RuntimeError(f"WordPress API error on page {page}: {categories_page}")
    # An empty page means we are past the last page
    if not categories_page:
        break
    # No error -> add categories
    categories += categories_page

# Save all collected categories as JSON
with open("categories.json", "w") as outfile:
    json.dump(categories, outfile)

第 2 部分：获取 WordPress 文章

wp_to_md_part2.py

#!/usr/bin/env python3
from tqdm import tqdm
import requests
import json
import base64
import os

# Blog base URL and the user name used for the REST API
URL = "https://techoverflow.net"
USER = "uli"

# The password is read from the environment; abort early if it is missing
PASSWORD = os.environ.get("WORDPRESS_API_PASSWORD")
if PASSWORD is None:
    raise ValueError("WORDPRESS_API_PASSWORD environment variable is not set")

# HTTP Basic authentication header value: b"Basic " + base64("user:password")
auth_header = b"Basic " + base64.b64encode((USER + ":" + PASSWORD).encode("utf-8"))

def page_numbers():
    """Infinite generator of page numbers: yields 1, 2, 3, ... and never stops"""
    current = 0
    while True:
        current += 1
        yield current

posts = []
for page in tqdm(page_numbers()):
    # Fetch the next page of posts (up to per_page=100 entries per request)
    posts_page = requests.get(
        f"{URL}/wp-json/wp/v2/posts",
        params={"page": page, "per_page": 100, "context": "edit"},
        headers={"Authorization": auth_header},
    ).json()
    # A JSON object instead of a list is an error response
    if isinstance(posts_page, dict):
        if posts_page.get("code") == "rest_post_invalid_page_number":  # Found last page
            break
        # Any other error must not be appended to the post list: extending the
        # list with a dict would add its keys and could keep looping forever.
        raise RuntimeError(f"WordPress API error on page {page}: {posts_page}")
    # An empty page also means there is nothing more to fetch
    if not posts_page:
        break
    # No error -> add posts
    posts += posts_page

# Save all collected posts as JSON
with open("posts.json", "w") as outfile:
    json.dump(posts, outfile)

#!/usr/bin/env python3
from tqdm import tqdm
import requests
import json
import base64
import os

# Blog base URL and the user name used for the REST API
URL = "https://techoverflow.net"
USER = "uli"

# The password is read from the environment; abort early if it is missing
PASSWORD = os.environ.get("WORDPRESS_API_PASSWORD")
if PASSWORD is None:
    raise ValueError("WORDPRESS_API_PASSWORD environment variable is not set")

# HTTP Basic authentication header value: b"Basic " + base64("user:password")
auth_header = b"Basic " + base64.b64encode((USER + ":" + PASSWORD).encode("utf-8"))

def page_numbers():
    """Infinite generator of page numbers: yields 1, 2, 3, ... and never stops"""
    current = 0
    while True:
        current += 1
        yield current

posts = []
for page in tqdm(page_numbers()):
    # Fetch the next page of posts (up to per_page=100 entries per request)
    posts_page = requests.get(
        f"{URL}/wp-json/wp/v2/posts",
        params={"page": page, "per_page": 100, "context": "edit"},
        headers={"Authorization": auth_header},
    ).json()
    # A JSON object instead of a list is an error response
    if isinstance(posts_page, dict):
        if posts_page.get("code") == "rest_post_invalid_page_number":  # Found last page
            break
        # Any other error must not be appended to the post list: extending the
        # list with a dict would add its keys and could keep looping forever.
        raise RuntimeError(f"WordPress API error on page {page}: {posts_page}")
    # An empty page also means there is nothing more to fetch
    if not posts_page:
        break
    # No error -> add posts
    posts += posts_page

# Save all collected posts as JSON
with open("posts.json", "w") as outfile:
    json.dump(posts, outfile)

第 3 部分：将保存的文章转换为 Markdown

wp_to_md_part3.py

#!/usr/bin/env python3
import requests
import json
import re
import os
import os.path
import markdownify
from datetime import datetime

# Load the previously exported posts
with open("posts.json") as infile:
    posts = json.loads(infile.read())

# Load the previously exported categories
with open("categories.json") as infile:
    categories = json.loads(infile.read())

# Lookup table: category ID -> category name
category_map = dict((entry["id"], entry["name"]) for entry in categories)

def code_language_callback(elem):
    """
    Return the language to use for a converted code block.

    Passed to markdownify as code_language_callback. The language is read
    from the "data-enlighter-language" attribute of elem; an empty string
    is returned if the element does not have that attribute.
    """
    return elem.attrs.get("data-enlighter-language", "")

def export_post(the_post):
    """
    Convert one WordPress post to a Markdown file with front matter.

    Writes content/post/<year>/<month>/<slug>.md (front matter followed by
    the content converted via markdownify) and, for debugging, the
    unconverted content to ../raw_content/post/<year>/<month>/<slug>.html.

    the_post: post dict as loaded from posts.json. The keys "title", "slug",
        "date", "categories", "content", "id" and "author" are read.

    Raises ValueError if the author ID is unknown and KeyError if one of the
    post's category IDs is not present in category_map.
    """
    title = the_post["title"]["raw"]
    # Escape backslashes and double quotes for the double-quoted title value
    escaped_title = title.replace("\\", "\\\\").replace('"', '\\"')

    # Remove escaped special characters
    slug = re.sub(r"%[a-f0-9]{2}", "", the_post["slug"])

    date_str = the_post["date"]
    date = datetime.fromisoformat(date_str)

    post_categories = [category_map[cat_id] for cat_id in the_post["categories"]]
    content = the_post["content"]["raw"]
    has_latex = "[latex]" in content

    wp_post_id = the_post["id"]

    # Map numeric author IDs to display names
    author_names = {
        1: "Uli Köhler",
        2: "Yann Spöri",
        4: "Joshua Simon",
        6: "Tobias Gutmann",
    }
    author = the_post["author"]
    author = author_names.get(author, author)
    if isinstance(author, int):
        raise ValueError(f"Unknown author ID: {author}")

    # Build categories. This stays empty for posts without categories so that
    # no literal "None" line ends up in the front matter.
    category_header = ""
    if post_categories:
        category_header = "categories:\n" + "".join(
            f"    - {category}\n" for category in post_categories
        )

    # Build header
    header = f"""---
title: "{escaped_title}"
date: {date_str}
slug: "{slug}"
author: {author}
katex: {has_latex}

wordpress_import:
    wp_post_id: {wp_post_id}

{category_header}
---\n\n"""
    # Filter content
    filtered_content = content.replace("\r\n", "\n")
    filtered_content = markdownify.markdownify(filtered_content, code_language_callback=code_language_callback)

    # Export Markdown. The encoding is explicit because the header contains
    # non-ASCII author names and must not depend on the platform default.
    filename = f"content/post/{date.year}/{date.month}/{slug}.md"
    os.makedirs(os.path.dirname(filename), exist_ok=True)
    print(f"Writing to {filename}")
    with open(filename, "w", encoding="utf-8") as outfile:
        outfile.write(header + filtered_content)

    # Export raw HTML for debugging
    raw_filename = f"../raw_content/post/{date.year}/{date.month}/{slug}.html"
    os.makedirs(os.path.dirname(raw_filename), exist_ok=True)
    with open(raw_filename, "w", encoding="utf-8") as outfile:
        outfile.write(content)

# Export every loaded post (iterate the list directly, no copy needed)
for post in posts:
    export_post(post)

#!/usr/bin/env python3
import requests
import json
import re
import os
import os.path
import markdownify
from datetime import datetime

# Load the previously exported posts
with open("posts.json") as infile:
    posts = json.loads(infile.read())

# Load the previously exported categories
with open("categories.json") as infile:
    categories = json.loads(infile.read())

# Lookup table: category ID -> category name
category_map = dict((entry["id"], entry["name"]) for entry in categories)

def code_language_callback(elem):
    """
    Return the language to use for a converted code block.

    Passed to markdownify as code_language_callback. The language is read
    from the "data-enlighter-language" attribute of elem; an empty string
    is returned if the element does not have that attribute.
    """
    return elem.attrs.get("data-enlighter-language", "")

def export_post(the_post):
    """
    Convert one WordPress post to a Markdown file with front matter.

    Writes content/post/<year>/<month>/<slug>.md (front matter followed by
    the content converted via markdownify) and, for debugging, the
    unconverted content to ../raw_content/post/<year>/<month>/<slug>.html.

    the_post: post dict as loaded from posts.json. The keys "title", "slug",
        "date", "categories", "content", "id" and "author" are read.

    Raises ValueError if the author ID is unknown and KeyError if one of the
    post's category IDs is not present in category_map.
    """
    title = the_post["title"]["raw"]
    # Escape backslashes and double quotes for the double-quoted title value
    escaped_title = title.replace("\\", "\\\\").replace('"', '\\"')

    # Remove escaped special characters
    slug = re.sub(r"%[a-f0-9]{2}", "", the_post["slug"])

    date_str = the_post["date"]
    date = datetime.fromisoformat(date_str)

    post_categories = [category_map[cat_id] for cat_id in the_post["categories"]]
    content = the_post["content"]["raw"]
    has_latex = "[latex]" in content

    wp_post_id = the_post["id"]

    # Map numeric author IDs to display names
    author_names = {
        1: "Uli Köhler",
        2: "Yann Spöri",
        4: "Joshua Simon",
        6: "Tobias Gutmann",
    }
    author = the_post["author"]
    author = author_names.get(author, author)
    if isinstance(author, int):
        raise ValueError(f"Unknown author ID: {author}")

    # Build categories. This stays empty for posts without categories so that
    # no literal "None" line ends up in the front matter.
    category_header = ""
    if post_categories:
        category_header = "categories:\n" + "".join(
            f"    - {category}\n" for category in post_categories
        )

    # Build header
    header = f"""---
title: "{escaped_title}"
date: {date_str}
slug: "{slug}"
author: {author}
katex: {has_latex}

wordpress_import:
    wp_post_id: {wp_post_id}

{category_header}
---\n\n"""
    # Filter content
    filtered_content = content.replace("\r\n", "\n")
    filtered_content = markdownify.markdownify(filtered_content, code_language_callback=code_language_callback)

    # Export Markdown. The encoding is explicit because the header contains
    # non-ASCII author names and must not depend on the platform default.
    filename = f"content/post/{date.year}/{date.month}/{slug}.md"
    os.makedirs(os.path.dirname(filename), exist_ok=True)
    print(f"Writing to {filename}")
    with open(filename, "w", encoding="utf-8") as outfile:
        outfile.write(header + filtered_content)

    # Export raw HTML for debugging
    raw_filename = f"../raw_content/post/{date.year}/{date.month}/{slug}.html"
    os.makedirs(os.path.dirname(raw_filename), exist_ok=True)
    with open(raw_filename, "w", encoding="utf-8") as outfile:
        outfile.write(content)

# Export every loaded post (iterate the list directly, no copy needed)
for post in posts:
    export_post(post)

Check out similar posts by category: Python, Hugo

If this post helped you, please consider buying me a coffee or donating via PayPal to support research & publishing of new posts on TechOverflow

Buy me a coffee