# jekyll-to-hugo/app/converter/wordpress_markdown.py

import yaml
from bs4 import BeautifulSoup, Tag

from app import utils
from app.config import Configurator
from app.converter.regex_heuristics import RegexHeuristics
from app.converter.tags_heuristics import convert_figure_tag_to_shortcode
from app.io.reader import IoReader
from app.io.writer import IoWriter
from app.utils import key_error_silence


class WordpressMarkdownConverter:
    """
    Markdown converter that converts Jekyll posts to Hugo posts.
    """

    def __init__(self, configurator: Configurator):
        """
        Initializes the WordpressMarkdownConverter

        Parameters
        ----------
        configurator : Configurator
            The configurator instance.
        """
        utils.guard_against_none(configurator, "configurator")
        self.configurator = configurator
        self.regex_heuristics = RegexHeuristics(configurator)

    def fix_header(self, header: dict) -> dict:
        """
        Fix the Hugo header

        Parameters
        ----------
        header : dict
            The header to fix

        Returns
        -------
        dict
            The fixed header
        """
        for field in self.configurator.converter_options.header_fields_drop:
            with key_error_silence():
                del header[field]
        # rewrite header fields
        with key_error_silence():
            header["guid"] = header["guid"].replace("http://localhost", "")
        with key_error_silence():
            header["author"] = self.configurator.converter_options.author_rewrite
        return header

    def fix_pre_content(self, post_lines: list[str]) -> list[str]:
        """
        Fixes the pre content from the post lines when enclosed in backticks code blocks.
        """
        fixed_lines = []
        index = 0
        while index < len(post_lines):
            line = post_lines[index]
            if line == "```":
                found_enclosing = False
                search_index = index + 1
                while search_index < len(post_lines):
                    if post_lines[search_index] == "```":
                        found_enclosing = True
                        break
                    search_index += 1
                if found_enclosing:
                    for line_index, line in enumerate(
                        post_lines[index : search_index + 1]
                    ):
                        if line_index == 1:
                            regex_line = self.regex_heuristics.handle_regex_heuristics(
                                str(line)
                            )
                            if regex_line:
                                fixed_lines.append(regex_line)
                        else:
                            fixed_lines.append(line)
                    index = search_index + 1
                    continue
            index += 1
            fixed_lines.append(line)
        return fixed_lines

    def fix_html_tags(self, post_lines):
        """
        Fixes the html tags from the post lines.
        """
        fixed_lines = []
        is_in_code_block = False
        for line in post_lines:
            # Enter code block mode when detecting a line that starts with ```
            # and exit when detecting a line that starts with ```.
            if line.startswith("```"):
                if is_in_code_block:
                    is_in_code_block = False
                else:
                    is_in_code_block = True
                fixed_lines.append(line)
                continue

            # Skip modifying the line if it is in code block mode.
            if is_in_code_block:
                fixed_lines.append(line)
                continue

            # Treat empty string as a new line.
            if line == "":
                fixed_lines.append("\n")
                continue

            # Parse the line as html and remove the HTML tags from it.
            soup = BeautifulSoup(line, features="html.parser")
            for content in soup.contents:
                if isinstance(content, Tag):
                    # found html tag
                    self._fix_html_tag(content, fixed_lines)
                else:
                    # found text, add it to the fixed lines
                    fixed_lines.append(
                        self.regex_heuristics.handle_regex_heuristics(str(content))
                    )
        return fixed_lines

    def _fix_html_tag(self, content: Tag, fixed_lines: list):
        """
        Fixes the html tag.
        """
        # Check if tag is a YouTube video and add it as a shortcode.
        if "is-provider-youtube" in content.attrs.get("class", []):
            convert_figure_tag_to_shortcode(content, fixed_lines)
        # Fix unknown tags by removing the tag and only add inner content.
        # content.contents is a list of all the inner content of the tag.
        else:
            tags = list(map(str, content.contents))
            if tags:
                # recursively fix the inner content of the tag.
                fixed_tags = self.fix_html_tags(tags)
                if fixed_tags:
                    fixed_lines.append("".join(fixed_tags))

    def convert_post_content(self, post_content: str) -> str:
        """
        Converts the post content

        Parameters
        ----------
        post_content : str
            The post content

        Returns
        -------
        str
            The converted post content
        """
        # fix links inside post content with simple replace
        for task in self.configurator.converter_options.links_rewrite:
            source_link = task.get("source")
            target_link = task.get("target")
            if not source_link or not target_link:
                continue
            post_content = post_content.replace(source_link, target_link)

        # fix unknown tags
        post_lines = post_content.split("\n")
        fixed_lines = self.fix_pre_content(post_lines)
        fixed_lines = self.fix_html_tags(fixed_lines)

        return "\n".join(fixed_lines)

    def read_jekyll_post(self, reader: IoReader):
        """
        Read a Jekyll post from the reader.

        Parameters
        ----------
        reader : IoReader
            The IoReader instance for reading.
        """
        # read source
        return reader.read()

    def write_hugo_post(self, writer: IoWriter, post_header: dict, post_content: str):
        """
        Write a Hugo post to the specified writer.

        Parameters
        ----------
        writer : IoWriter
            The IoWriter instance for writing.
        post_header : dict
            The post header
        post_content : str
            The post content
        """
        data = ["---\n", yaml.dump(post_header), "---\n", post_content]
        writer.write("".join(data))

    def convert_jekyll_to_hugo(self, reader: IoReader, writer: IoWriter):
        """
        Convert a Jekyll post to a Hugo post

        Parameters
        ----------
        reader : IoReader
            The IoReader instance for reading.
        writer : IoWriter
            The IoWriter instance for writing.
        """
        contents = self.read_jekyll_post(reader)

        # fix header
        header = yaml.safe_load(contents.split("---")[1])
        fixed_header = self.fix_header(header)
        # fix content
        post_content = contents.split("---", 2)[2].lstrip()
        fixed_post_content = self.convert_post_content(post_content)

        self.write_hugo_post(
            writer,
            fixed_header,
            fixed_post_content,
        )