2023-05-29 18:35:38 +00:00
|
|
|
import yaml
|
|
|
|
from bs4 import BeautifulSoup, Tag
|
|
|
|
|
2023-05-29 18:58:32 +00:00
|
|
|
from app import utils
|
|
|
|
from app.config import Configurator
|
2023-06-02 06:58:20 +00:00
|
|
|
from app.converter.regex_heuristics import RegexHeuristics
|
2023-05-31 16:29:47 +00:00
|
|
|
from app.converter.tags_heuristics import convert_figure_tag_to_shortcode
|
2023-05-31 15:30:30 +00:00
|
|
|
from app.io.reader import IoReader
|
|
|
|
from app.io.writer import IoWriter
|
2023-05-29 18:35:38 +00:00
|
|
|
from app.utils import key_error_silence
|
|
|
|
|
|
|
|
|
|
|
|
class WordpressMarkdownConverter:
    """
    Markdown converter that converts jekyll posts to hugo posts.

    A post is expected to be a YAML front matter section delimited by
    ``---`` lines, followed by the Markdown body. The front matter is
    rewritten by ``fix_header`` and the body by ``convert_post_content``.
    """

    def __init__(self, configurator: Configurator):
        """
        Initializes the WordpressMarkdownConverter

        Parameters
        ----------
        configurator : Configurator
            The configurator instance. Its ``converter_options`` are read by
            the conversion methods and it is also handed to the
            ``RegexHeuristics`` helper.
        """
        # NOTE(review): presumably raises when ``configurator`` is None --
        # confirm against app.utils.guard_against_none.
        utils.guard_against_none(configurator, "configurator")

        self.configurator = configurator
        self.regex_heuristics = RegexHeuristics(configurator)

    def fix_header(self, header: dict) -> dict:
        """
        Fix the Hugo header

        The header is modified in place: the fields listed in
        ``converter_options.header_fields_drop`` are removed, the
        ``http://localhost`` prefix is stripped from ``guid`` and ``author``
        is replaced with ``converter_options.author_rewrite``.

        Parameters
        ----------
        header : dict
            The header to fix

        Returns
        -------
        dict
            The fixed header (the same object that was passed in)
        """
        options = self.configurator.converter_options

        # NOTE(review): key_error_silence presumably suppresses KeyError so
        # that missing fields are simply skipped -- confirm in app.utils.
        for field in options.header_fields_drop:
            with key_error_silence():
                del header[field]

        # rewrite header fields
        with key_error_silence():
            header["guid"] = header["guid"].replace("http://localhost", "")
        with key_error_silence():
            # The option lookup stays inside the guard, as in the original.
            header["author"] = options.author_rewrite

        return header

    def fix_pre_content(self, post_lines: list[str]) -> list[str]:
        """
        Fixes the pre content from the post lines when enclosed in backticks code blocks.

        Only blocks opened and closed by a bare ```` ``` ```` line are
        handled. The first body line of such a block is passed through the
        regex heuristics; when the heuristics return a falsy value that line
        is dropped. All other lines, including both fences, are copied
        unchanged. A fence without a matching closing fence is left as is.

        Parameters
        ----------
        post_lines : list[str]
            The lines of the post content

        Returns
        -------
        list[str]
            A new list with the fixed lines
        """
        fence = "```"
        fixed_lines = []
        index = 0
        while index < len(post_lines):
            line = post_lines[index]
            fixed_lines.append(line)
            index += 1
            if line != fence:
                continue

            try:
                closing = post_lines.index(fence, index)
            except ValueError:
                # Unterminated fence: keep the remaining lines untouched.
                continue

            body = post_lines[index:closing]
            # An empty block has no body line; the closing fence must never
            # be sent through the heuristics.
            if body:
                regex_line = self.regex_heuristics.handle_regex_heuristics(
                    str(body[0])
                )
                if regex_line:
                    fixed_lines.append(regex_line)
                fixed_lines.extend(body[1:])
            fixed_lines.append(post_lines[closing])
            index = closing + 1

        return fixed_lines

    def fix_html_tags(self, post_lines: list[str]) -> list[str]:
        """
        Fixes the html tags from the post lines.

        Lines inside a fenced code block (any line starting with
        ```` ``` ```` toggles the block) are copied unchanged. Empty lines
        become ``"\\n"``. Every other line is parsed as HTML: tags are handled
        by ``_fix_html_tag`` and plain text is passed through the regex
        heuristics.

        Parameters
        ----------
        post_lines : list[str]
            The lines to fix

        Returns
        -------
        list[str]
            A new list with the fixed lines
        """
        fixed_lines = []
        is_in_code_block = False
        for line in post_lines:
            if line.startswith("```"):
                is_in_code_block = not is_in_code_block
                fixed_lines.append(line)
                continue

            if is_in_code_block:
                fixed_lines.append(line)
                continue

            # Treat empty string as a new line.
            if line == "":
                fixed_lines.append("\n")
                continue

            soup = BeautifulSoup(line, features="html.parser")
            for content in soup.contents:
                if isinstance(content, Tag):
                    self._fix_html_tag(content, fixed_lines)
                else:
                    # Add the content.
                    fixed_lines.append(
                        self.regex_heuristics.handle_regex_heuristics(str(content))
                    )

        return fixed_lines

    def _fix_html_tag(self, content: Tag, fixed_lines: list):
        """
        Fixes the html tag.

        Parameters
        ----------
        content : Tag
            The parsed html tag
        fixed_lines : list
            The output list; results are appended to it in place
        """
        # Check if it is a YouTube video and add it as a shortcode.
        if "is-provider-youtube" in content.attrs.get("class", []):
            convert_figure_tag_to_shortcode(content, fixed_lines)
            return

        # Fix unknown tags: drop the tag itself and recursively fix its
        # children, which are joined back into a single line.
        tags = [str(child) for child in content.contents]
        if not tags:
            return
        fixed_tags = self.fix_html_tags(tags)
        if fixed_tags:
            fixed_lines.append("".join(fixed_tags))

    def convert_post_content(self, post_content: str) -> str:
        """
        Converts the post content

        Applies the configured link rewrites to the whole text, then fixes
        the code blocks and html tags line by line.

        Parameters
        ----------
        post_content : str
            The post content

        Returns
        -------
        str
            The converted post content
        """
        # fix link
        for task in self.configurator.converter_options.links_rewrite:
            source_link = task.get("source")
            target_link = task.get("target")
            # Skip incomplete rewrite rules.
            if not source_link or not target_link:
                continue
            post_content = post_content.replace(source_link, target_link)

        # fix unknown tags
        post_lines = post_content.split("\n")
        fixed_lines = self.fix_pre_content(post_lines)
        fixed_lines = self.fix_html_tags(fixed_lines)

        return "\n".join(fixed_lines)

    def read_jekyll_post(self, reader: IoReader):
        """
        Read a Jekyll post from the reader.

        Parameters
        ----------
        reader : IoReader
            The IoReader instance for reading.

        Returns
        -------
        The value returned by ``reader.read()``.
        """
        # read source
        return reader.read()

    def write_hugo_post(self, writer: IoWriter, post_header: dict, post_content: str):
        """
        Write a Hugo post to the specified writer.

        The header is dumped as YAML between two ``---`` lines, followed by
        the post content.

        Parameters
        ----------
        writer : IoWriter
            The IoWriter instance for writing.
        post_header : dict
            The post header
        post_content : str
            The post content
        """
        data = ["---\n", yaml.dump(post_header), "---\n", post_content]
        writer.write("".join(data))

    def convert_jekyll_to_hugo(self, reader: IoReader, writer: IoWriter):
        """
        Convert a Jekyll post to a Hugo post

        Parameters
        ----------
        reader : IoReader
            The IoReader instance for reading.
        writer : IoWriter
            The IoWriter instance for writing.

        Raises
        ------
        IndexError
            If the post does not contain the two ``---`` front matter
            delimiters.
        """
        contents = self.read_jekyll_post(reader)

        # Split once into "<preamble>", "<front matter>", "<body>"; limiting
        # the split keeps any later '---' in the body intact.
        parts = contents.split("---", 2)

        # fix header
        header = yaml.safe_load(parts[1])
        if header is None:
            # Empty front matter loads as None; treat it as an empty header.
            header = {}
        fixed_header = self.fix_header(header)

        # fix content
        post_content = parts[2].lstrip()
        fixed_post_content = self.convert_post_content(post_content)

        self.write_hugo_post(
            writer,
            fixed_header,
            fixed_post_content,
        )
|