jekyll-to-hugo/app/converter/wordpress_markdown.py

169 lines
5.1 KiB
Python
Raw Normal View History

2023-05-29 18:35:38 +00:00
from pathlib import Path
import yaml
from bs4 import BeautifulSoup, Tag
2023-05-29 18:58:32 +00:00
from app import utils
from app.config import Configurator
2023-05-29 18:35:38 +00:00
from app.utils import key_error_silence
class WordpressMarkdownConverter:
    """
    Markdown converter that converts jekyll posts to hugo posts.

    The converter rewrites the YAML front matter (drops a fixed set of
    keys, normalises ``guid``, overrides ``author``) and post-processes the
    body (link rewriting, HTML wrapper removal, YouTube shortcodes).
    """

    # Front matter keys that are removed from every post header.
    _OBSOLETE_HEADER_KEYS = (
        "restapi_import_id",
        "original_post_id",
        "timeline_notification",
        "wordads_ufa",
    )

    def __init__(self, configurator: Configurator):
        """
        Initializes the WordpressMarkdownConverter

        Parameters
        ----------
        configurator : Configurator
            The configurator instance. Its ``converter_options`` attribute
            is read for ``author_rewrite`` and ``links_rewrite``.
        """
        utils.guard_against_none(configurator, "configurator")
        self.configurator = configurator

    def fix_hugo_header(self, header: dict) -> dict:
        """
        Fix the Hugo header

        The header is modified in place and also returned.

        Parameters
        ----------
        header : dict
            The header to fix

        Returns
        -------
        dict
            The fixed header
        """
        # pop() with a default tolerates posts that lack any of these keys.
        for key in self._OBSOLETE_HEADER_KEYS:
            header.pop(key, None)

        # Not every post carries a guid; only rewrite it when it is present
        # (and textual) instead of failing with a KeyError.
        guid = header.get("guid")
        if isinstance(guid, str):
            header["guid"] = guid.replace("http://localhost", "")

        header["author"] = self.configurator.converter_options.author_rewrite
        return header

    @staticmethod
    def _youtube_shortcode(tag: Tag):
        """Return the Hugo youtube shortcode line for *tag*, or None.

        None is returned when the tag is not marked with the
        ``is-provider-youtube`` class or when no iframe with a ``src`` can be
        found after it.
        """
        if "is-provider-youtube" not in tag.attrs.get("class", []):
            return None

        iframe = tag.find_next("iframe")
        video_link = iframe.attrs.get("src") if iframe is not None else None
        if not video_link:
            return None

        # The video id is the last path segment, without the query string.
        video_id = video_link.rsplit("/", 1)[-1].split("?")[0]
        return f"{{{{< youtube {video_id} >}}}}\n"

    def remove_html_tags(self, post_lines):
        """
        Strip HTML wrapper tags from the given lines

        Every line is parsed on its own. Plain text is kept as is, YouTube
        embeds become a ``youtube`` shortcode, and any other tag is replaced
        by the (recursively processed) text of its children.

        Parameters
        ----------
        post_lines : list of str
            The lines (or HTML fragments) to process

        Returns
        -------
        list of str
            The processed fragments; one input line may yield several
            fragments, and an empty input line yields ``"\\n"``.
        """
        fixed_lines = []
        for line in post_lines:
            if line == "":
                fixed_lines.append("\n")
                continue

            # Name the parser explicitly: otherwise bs4 picks whichever
            # parser happens to be installed (and warns about it), which
            # makes the output depend on the environment.
            soup = BeautifulSoup(line, "html.parser")
            for content in soup.contents:
                if not isinstance(content, Tag):
                    # Add the content as is.
                    fixed_lines.append(str(content))
                    continue

                # Check if it is a youtube video and add it as a shortcode.
                shortcode = self._youtube_shortcode(content)
                if shortcode is not None:
                    fixed_lines.append(shortcode)
                    continue

                # Fix unknown tags: drop the tag itself, keep its children.
                # A youtube wrapper without a usable iframe also ends up
                # here rather than aborting the conversion.
                children = [str(child) for child in content.contents]
                fixed_lines.extend(self.remove_html_tags(children))
        return fixed_lines

    def convert_post_content(self, post_content: str) -> str:
        """
        Converts the post content

        Parameters
        ----------
        post_content : str
            The post content

        Returns
        -------
        str
            The converted post content
        """
        # fix link
        for task in self.configurator.converter_options.links_rewrite:
            source_link = task.get("source")
            target_link = task.get("target")
            # Incomplete rewrite rules are skipped.
            if not source_link or not target_link:
                continue
            post_content = post_content.replace(source_link, target_link)

        # fix unknown tags
        post_lines = post_content.split("\n")
        fixed_lines = self.remove_html_tags(post_lines)
        return "\n".join(fixed_lines)

    def read_jekyll_post(self, path: Path) -> str:
        """
        Read a Jekyll post from the specified path

        Parameters
        ----------
        path : Path
            The path to the Jekyll post

        Returns
        -------
        str
            The complete file contents (front matter and body)
        """
        # Explicit encoding so the result does not depend on the platform's
        # default locale.
        with open(path, "r", encoding="utf-8") as fh:
            return fh.read()

    def write_hugo_post(self, output_path: Path, post_header: dict, post_content: str):
        """
        Write a Hugo post to the specified path

        Parameters
        ----------
        output_path : Path
            The path to the Hugo post
        post_header : dict
            The post header
        post_content : str
            The post content
        """
        # ensure that output path exists
        output_path.parent.mkdir(parents=True, exist_ok=True)

        with open(output_path, "w", encoding="utf-8") as fo:
            header = ["---\n", yaml.dump(post_header), "---\n"]
            fo.writelines(header)
            fo.write(post_content)

    def convert_jekyll_to_hugo(self, jekyll_post_path: Path, hugo_post_output: Path):
        """
        Convert a Jekyll post to a Hugo post

        The converted post is written into ``hugo_post_output`` under the
        same file name as the source post.

        Parameters
        ----------
        jekyll_post_path : Path
            The path to the Jekyll post
        hugo_post_output : Path
            The directory the Hugo post is written to

        Raises
        ------
        IndexError
            If the file does not contain a ``---`` delimited front matter.
        """
        contents = self.read_jekyll_post(jekyll_post_path)

        # Split once into: text before the first "---", the front matter,
        # and the body. maxsplit=2 keeps any later "---" inside the body.
        parts = contents.split("---", 2)

        # fix header (an empty front matter parses to None -> use {})
        header = yaml.safe_load(parts[1]) or {}
        fixed_header = self.fix_hugo_header(header)

        # fix content
        post_content = parts[2].lstrip()
        fixed_post_content = self.convert_post_content(post_content)

        self.write_hugo_post(
            hugo_post_output.joinpath(jekyll_post_path.name),
            fixed_header,
            fixed_post_content,
        )