Python script to convert WordPress posts to Markdown
I used this script to convert the old WordPress-based techoverflow.net
posts to Markdown format for use with Hugo. It uses the beautifulsoup4
and markdownify libraries to parse the HTML content and convert it to Markdown.
Part 1: Fetch WordPress categories
#!/usr/bin/env python3
from tqdm import tqdm
import requests
import json
import base64
import os
# Connection settings for the WordPress REST API
URL = "https://myblog.com"
USER = "uli"
# The password is read from the environment so it is not stored in the script
PASSWORD = os.environ.get("WORDPRESS_API_PASSWORD")
if PASSWORD is None:
    raise ValueError("WORDPRESS_API_PASSWORD environment variable is not set")
# Compute basic authentication header: "Basic " + base64("user:password")
auth_header = b"Basic " + base64.b64encode(f"{USER}:{PASSWORD}".encode("utf-8"))
def page_numbers():
    """Infinite generator of page numbers: yields 1, 2, 3, ... without end."""
    num = 1
    while True:
        yield num
        num += 1
categories = []
for page in tqdm(page_numbers()):
    # Fetch the next page of categories (up to 100 per request)
    response = requests.get(
        f"{URL}/wp-json/wp/v2/categories",
        params={"page": page, "per_page": 100, "context": "edit"},
        headers={"Authorization": auth_header},
        timeout=60,  # do not hang forever on a stalled connection
    )
    categories_page = response.json()
    # A successful response is a JSON list. A JSON object is an error report
    # (e.g. failed authentication); appending it would add its keys to the
    # result and the loop would never terminate, so fail loudly instead.
    if isinstance(categories_page, dict):
        raise RuntimeError(f"WordPress API error on page {page}: {categories_page}")
    # An empty page means there are no more categories
    if not categories_page:
        break
    # Non-empty page -> add categories
    categories += categories_page
with open("categories.json", "w", encoding="utf-8") as outfile:
    json.dump(categories, outfile)
Part 2: Fetch WordPress posts
#!/usr/bin/env python3
from tqdm import tqdm
import requests
import json
import base64
import os
# Connection settings for the WordPress REST API
URL = "https://techoverflow.net"
USER = "uli"
# The password is read from the environment so it is not stored in the script
PASSWORD = os.environ.get("WORDPRESS_API_PASSWORD")
if PASSWORD is None:
    raise ValueError("WORDPRESS_API_PASSWORD environment variable is not set")
# Compute basic authentication header: "Basic " + base64("user:password")
auth_header = b"Basic " + base64.b64encode(f"{USER}:{PASSWORD}".encode("utf-8"))
def page_numbers():
    """Infinite generator of page numbers: yields 1, 2, 3, ... without end."""
    num = 1
    while True:
        yield num
        num += 1
posts = []
for page in tqdm(page_numbers()):
    # Fetch the next page of posts (up to 100 per request)
    response = requests.get(
        f"{URL}/wp-json/wp/v2/posts",
        params={"page": page, "per_page": 100, "context": "edit"},
        headers={"Authorization": auth_header},
        timeout=60,  # do not hang forever on a stalled connection
    )
    posts_page = response.json()
    # A JSON object (instead of a list of posts) is an error report
    if isinstance(posts_page, dict):
        # Check for "last page" error code
        if posts_page.get("code") == "rest_post_invalid_page_number":
            break
        # Any other error (e.g. failed authentication) must not be appended:
        # its keys would end up in the post list and the loop would never end.
        raise RuntimeError(f"WordPress API error on page {page}: {posts_page}")
    # Guard against an endless loop if the server answers with an empty page
    if not posts_page:
        break
    # No error code -> add posts
    posts += posts_page
with open("posts.json", "w", encoding="utf-8") as outfile:
    json.dump(posts, outfile)
Part 3: Convert saved posts to Markdown
#!/usr/bin/env python3
import requests
import json
import re
import os
import os.path
import markdownify
from datetime import datetime
# Load the posts and categories previously saved as JSON
with open("posts.json", encoding="utf-8") as infile:
    posts = json.load(infile)
with open("categories.json", encoding="utf-8") as infile:
    categories = json.load(infile)
# Map category ID -> category name, used to resolve the IDs stored in each post
category_map = {category["id"]: category["name"] for category in categories}
def code_language_callback(elem):
    """
    Return the code language declared on an HTML element.

    Passed to markdownify as its ``code_language_callback``.

    :param elem: Element exposing an ``attrs`` mapping of its HTML attributes
    :return: The value of the ``data-enlighter-language`` attribute,
             or an empty string if the attribute is not present
    """
    return elem.attrs.get("data-enlighter-language", "")
def export_post(the_post):
    """
    Convert a single WordPress post to a Markdown file with YAML front matter.

    Writes the converted post to ``content/post/<year>/<month>/<slug>.md`` and
    the unmodified post content to ``../raw_content/post/<year>/<month>/<slug>.html``
    (for debugging). Missing directories are created.

    :param the_post: Post dictionary as loaded from posts.json. The keys read
                     are "title"/"raw", "slug", "date", "categories",
                     "content"/"raw", "id" and "author".
    :raises ValueError: If the post's author ID is not one of the known IDs
    :raises KeyError: If a category ID of the post is not in ``category_map``
    """
    title = the_post["title"]["raw"]
    # Escape backslashes and double quotes for the double-quoted YAML title
    escaped_title = title.replace("\\", "\\\\").replace('"', '\\"')
    # Remove escaped (percent-encoded) special characters from the slug
    slug = re.sub(r"%[a-f0-9]{2}", "", the_post["slug"])
    date_str = the_post["date"]
    date = datetime.fromisoformat(date_str)
    post_categories = [category_map[cat_id] for cat_id in the_post["categories"]]
    content = the_post["content"]["raw"]
    has_latex = "[latex]" in content
    wp_post_id = the_post["id"]
    # Resolve the numeric author ID to a display name
    author_names = {
        1: "Uli Köhler",
        2: "Yann Spöri",
        4: "Joshua Simon",
        6: "Tobias Gutmann",
    }
    author = the_post["author"]
    author = author_names.get(author, author)
    if isinstance(author, int):
        raise ValueError(f"Unknown author ID: {author}")
    # Build header (YAML front matter)
    header_lines = [
        "---",
        f'title: "{escaped_title}"',
        f"date: {date_str}",
        f'slug: "{slug}"',
        f"author: {author}",
        f"katex: {has_latex}",
        "wordpress_import:",
        # Indented so that the ID is nested under wordpress_import
        f"  wp_post_id: {wp_post_id}",
    ]
    # Only emit the categories key if the post has categories. Otherwise
    # nothing is written (previously a literal "None" line ended up in the header).
    if post_categories:
        header_lines.append("categories:")
        header_lines.extend(f" - {category}" for category in post_categories)
    header_lines.append("---")
    header = "\n".join(header_lines) + "\n\n"
    # Filter content: normalize line endings, then convert HTML to Markdown
    filtered_content = content.replace("\r\n", "\n")
    filtered_content = markdownify.markdownify(filtered_content, code_language_callback=code_language_callback)
    # Export Markdown
    filename = f"content/post/{date.year}/{date.month}/{slug}.md"
    os.makedirs(os.path.dirname(filename), exist_ok=True)
    print(f"Writing to {filename}")
    # Explicit UTF-8: the header may contain non-ASCII author names
    with open(filename, "w", encoding="utf-8") as outfile:
        outfile.write(header + filtered_content)
    # Export raw HTML for debugging
    raw_filename = f"../raw_content/post/{date.year}/{date.month}/{slug}.html"
    os.makedirs(os.path.dirname(raw_filename), exist_ok=True)
    with open(raw_filename, "w", encoding="utf-8") as outfile:
        outfile.write(content)
# Convert every saved post (iterate the list directly, no copy needed)
for post in posts:
    export_post(post)
If this post helped you, please consider buying me a coffee or donating via PayPal to support research & publishing of new posts on TechOverflow.