add fix for code blocks

This commit is contained in:
Denis-Cosmin NUTIU 2023-06-02 09:58:20 +03:00
parent 3c20d726cf
commit efbe771799
5 changed files with 125 additions and 9 deletions

View file

@ -4,7 +4,7 @@
<content url="file://$MODULE_DIR$"> <content url="file://$MODULE_DIR$">
<excludeFolder url="file://$MODULE_DIR$/venv" /> <excludeFolder url="file://$MODULE_DIR$/venv" />
</content> </content>
<orderEntry type="jdk" jdkName="Python 3.11 (jekyll-to-hugo)" jdkType="Python SDK" /> <orderEntry type="inheritedJdk" />
<orderEntry type="sourceFolder" forTests="false" /> <orderEntry type="sourceFolder" forTests="false" />
</component> </component>
</module> </module>

View file

@ -1,4 +1,4 @@
<?xml version="1.0" encoding="UTF-8"?> <?xml version="1.0" encoding="UTF-8"?>
<project version="4"> <project version="4">
<component name="ProjectRootManager" version="2" project-jdk-name="Python 3.11 (jekyll-to-hugo)" project-jdk-type="Python SDK" /> <component name="ProjectRootManager" version="2" project-jdk-name="Python 3.10 (jekyll-to-hugo)" project-jdk-type="Python SDK" />
</project> </project>

View file

@ -1,5 +1,34 @@
def handle_regex_heuristics(line: str) -> str: import re
from app import utils
class RegexHeuristics:
    """
    Regex heuristics class for modifying a line using regex rules.

    Each rule maps a regex pattern to a callback. The callback receives the
    match object and returns the rewritten line.
    """

    # Matches a line that starts with an opening or closing pre tag. The
    # text after the tag is captured as "content"; a trailing closing pre tag
    # is optional and excluded from it. Written as a raw string so that the
    # backslash is not treated as an (invalid) string escape sequence.
    _PRE_TAG_PATTERN = r"^(</*pre.*?>)(?P<content>.*?)(<\/pre>)?$"

    def __init__(self, configurator):
        """
        Initializes the regex heuristics.

        :param configurator: The configurator, stored on the instance. It is
            validated with utils.guard_against_none (presumably raises when
            it is None - confirm against app.utils).
        """
        utils.guard_against_none(configurator, "configurator")
        self.configurator = configurator
        self._rules = {
            self._PRE_TAG_PATTERN: self._remove_pre_tag,
        }

    def _remove_pre_tag(self, match) -> str:
        """
        Removes the pre tag from the match.

        :param match: A match object produced by the pre tag pattern.
        :return: The text captured by the "content" group.
        """
        return match.group("content")

    def handle_regex_heuristics(self, line: str) -> str:
        """
        Manipulates a line by using regex heuristics.

        :param line: The line to manipulate.
        :return: The result of the first rule whose pattern matches at the
            start of the line, or the line unchanged when no rule matches.
        """
        for regex, callback in self._rules.items():
            match = re.match(regex, line)
            if match:
                return callback(match)
        # Fall through only after every rule has been tried.
        return line

View file

@ -3,7 +3,7 @@ from bs4 import BeautifulSoup, Tag
from app import utils from app import utils
from app.config import Configurator from app.config import Configurator
from app.converter.regex_heuristics import handle_regex_heuristics from app.converter.regex_heuristics import RegexHeuristics
from app.converter.tags_heuristics import convert_figure_tag_to_shortcode from app.converter.tags_heuristics import convert_figure_tag_to_shortcode
from app.io.reader import IoReader from app.io.reader import IoReader
from app.io.writer import IoWriter from app.io.writer import IoWriter
@ -26,6 +26,7 @@ class WordpressMarkdownConverter:
""" """
utils.guard_against_none(configurator, "configurator") utils.guard_against_none(configurator, "configurator")
self.configurator = configurator self.configurator = configurator
self.regex_heuristics = RegexHeuristics(configurator)
def fix_header(self, header: dict) -> dict: def fix_header(self, header: dict) -> dict:
""" """
@ -51,12 +52,60 @@ class WordpressMarkdownConverter:
header["author"] = self.configurator.converter_options.author_rewrite header["author"] = self.configurator.converter_options.author_rewrite
return header return header
def fix_pre_content(self, post_lines: list[str]) -> list[str]:
    """
    Fixes the pre content from the post lines when enclosed in backticks code blocks.

    A code block is a pair of lines that are exactly three backticks. The
    first content line of each block is passed through the regex heuristics.
    When that rewrite leaves an empty string the line is dropped, unless the
    line was already blank, in which case it is kept. All other lines,
    including an opening fence that is never closed, are copied unchanged.

    :param post_lines: The post content split into lines.
    :return: A new list with the fixed lines; post_lines is not modified.
    """
    fence = "```"
    fixed_lines = []
    index = 0
    while index < len(post_lines):
        line = post_lines[index]
        index += 1
        if line != fence:
            fixed_lines.append(line)
            continue
        try:
            closing_index = post_lines.index(fence, index)
        except ValueError:
            # Unterminated fence: keep it as an ordinary line.
            fixed_lines.append(line)
            continue
        fixed_lines.append(line)
        block_content = post_lines[index:closing_index]
        if block_content:
            first_line = block_content[0]
            rewritten = self.regex_heuristics.handle_regex_heuristics(
                str(first_line)
            )
            # Drop the line only when the heuristics emptied it; a line that
            # was blank to begin with is part of the code block.
            if rewritten or not first_line:
                fixed_lines.append(rewritten)
            fixed_lines.extend(block_content[1:])
        fixed_lines.append(post_lines[closing_index])
        index = closing_index + 1
    return fixed_lines
def fix_html_tags(self, post_lines): def fix_html_tags(self, post_lines):
""" """
Fixes the html tags from the post lines. Fixes the html tags from the post lines.
""" """
fixed_lines = [] fixed_lines = []
is_in_code_block = False
for line in post_lines: for line in post_lines:
if line.startswith("```"):
if is_in_code_block:
is_in_code_block = False
else:
is_in_code_block = True
fixed_lines.append(line)
continue
if is_in_code_block:
fixed_lines.append(line)
continue
# Treat empty string as a new line.
if line == "": if line == "":
fixed_lines.append("\n") fixed_lines.append("\n")
continue continue
@ -66,10 +115,12 @@ class WordpressMarkdownConverter:
self._fix_html_tag(content, fixed_lines) self._fix_html_tag(content, fixed_lines)
else: else:
# Add the content. # Add the content.
fixed_lines.append(handle_regex_heuristics(str(content))) fixed_lines.append(
self.regex_heuristics.handle_regex_heuristics(str(content))
)
return fixed_lines return fixed_lines
def _fix_html_tag(self, content, fixed_lines): def _fix_html_tag(self, content: Tag, fixed_lines: list):
""" """
Fixes the html tag. Fixes the html tag.
""" """
@ -108,7 +159,8 @@ class WordpressMarkdownConverter:
# fix unknown tags # fix unknown tags
post_lines = post_content.split("\n") post_lines = post_content.split("\n")
fixed_lines = self.fix_html_tags(post_lines) fixed_lines = self.fix_pre_content(post_lines)
fixed_lines = self.fix_html_tags(fixed_lines)
return "\n".join(fixed_lines) return "\n".join(fixed_lines)

View file

@ -105,3 +105,38 @@ def test_fix_html_tags_stripe_tag(input_lines, expected_lines):
) )
converter = WordpressMarkdownConverter(configurator) converter = WordpressMarkdownConverter(configurator)
assert converter.fix_html_tags(input_lines) == expected_lines assert converter.fix_html_tags(input_lines) == expected_lines
@pytest.mark.parametrize(
    "input_lines, expected_lines",
    [
        # No backtick fence: the lines are returned as they are.
        (
            ["<pre>", "<p>Te<span>st</span></p>", "</pre>"],
            ["<pre>", "<p>Te<span>st</span></p>", "</pre>"],
        ),
        # Pre tag and code share the first fenced line: only the tag goes.
        (
            [
                "```",
                '<pre class="wp-block-syntaxhighlighter-code"> <ItemGroup>',
                "```",
            ],
            ["```", " <ItemGroup>", "```"],
        ),
        # Pre tag alone on the first fenced line: the whole line goes.
        (
            [
                "```",
                '<pre class="wp-block-syntaxhighlighter-code">',
                "<ItemGroup>",
                "```",
            ],
            ["```", "<ItemGroup>", "```"],
        ),
    ],
)
def test_fix_pre_content(input_lines, expected_lines):
    """
    Checks fix_pre_content against each parametrized input/expected pair.
    """
    options = ConverterOptions()
    fake_configurator = make_fake_configurator(
        "wordpress_markdown_converter",
        options,
    )
    actual_lines = WordpressMarkdownConverter(fake_configurator).fix_pre_content(
        input_lines
    )
    assert actual_lines == expected_lines