From ea589afabc4410dd3f30ae5a0df4a47a5820b3d7 Mon Sep 17 00:00:00 2001 From: Denis Nutiu Date: Wed, 31 May 2023 19:29:47 +0300 Subject: [PATCH] test remove_html_tags for usecase: detect_youtube_links --- app/config.py | 6 +-- app/converter/regex_heuristics.py | 5 +++ app/converter/tags_heuristics.py | 8 ++++ app/converter/wordpress_markdown.py | 39 +++++++++++-------- .../converter/wordpress_markdown_test.py | 38 ++++++++++++++---- 5 files changed, 68 insertions(+), 28 deletions(-) create mode 100644 app/converter/regex_heuristics.py create mode 100644 app/converter/tags_heuristics.py diff --git a/app/config.py b/app/config.py index dd1d678..d01d28c 100644 --- a/app/config.py +++ b/app/config.py @@ -27,9 +27,9 @@ class ConverterOptions(BaseModel): Will drop the specified header fields from the posts. """ - author_rewrite: str - links_rewrite: list[dict] - header_fields_drop: list[str] + author_rewrite: str = "" + links_rewrite: list[dict] = [] + header_fields_drop: list[str] = [] class Configurator(BaseSettings): diff --git a/app/converter/regex_heuristics.py b/app/converter/regex_heuristics.py new file mode 100644 index 0000000..2d03e21 --- /dev/null +++ b/app/converter/regex_heuristics.py @@ -0,0 +1,5 @@ +def handle_regex_heuristics(line: str) -> str: + """ + Manipulates a line without tags by using regex heuristics. + """ + return line diff --git a/app/converter/tags_heuristics.py b/app/converter/tags_heuristics.py new file mode 100644 index 0000000..84343bf --- /dev/null +++ b/app/converter/tags_heuristics.py @@ -0,0 +1,8 @@ +def convert_figure_tag_to_shortcode(content, fixed_lines): + """ + Converts the figure tag that has 'is-provider-youtube' class to a YouTube shortcode. + """ + video_link = content.findNext("iframe").attrs["src"] + video_id_part = video_link.rsplit("/") + video_id = video_id_part[-1].split("?")[0] + fixed_lines.append(f"{{{{< youtube {video_id} >}}}}\n") diff --git a/app/converter/wordpress_markdown.py b/app/converter/wordpress_markdown.py index e90657d..4b8696a 100644 --- a/app/converter/wordpress_markdown.py +++ b/app/converter/wordpress_markdown.py @@ -3,6 +3,8 @@ from bs4 import BeautifulSoup, Tag from app import utils from app.config import Configurator +from app.converter.regex_heuristics import handle_regex_heuristics +from app.converter.tags_heuristics import convert_figure_tag_to_shortcode from app.io.reader import IoReader from app.io.writer import IoWriter from app.utils import key_error_silence @@ -25,7 +27,7 @@ class WordpressMarkdownConverter: utils.guard_against_none(configurator, "configurator") self.configurator = configurator - def fix_hugo_header(self, header: dict) -> dict: + def fix_header(self, header: dict) -> dict: """ Fix the Hugo header @@ -58,24 +60,27 @@ class WordpressMarkdownConverter: soup = BeautifulSoup(line, features="html.parser") for content in soup.contents: if isinstance(content, Tag): - # Check if it is a youtube video and add it as a shortcode. - if "is-provider-youtube" in content.attrs.get("class", []): - video_link = content.findNext("iframe").attrs["src"] - video_id_part = video_link.rsplit("/") - video_id = video_id_part[-1].split("?")[0] - fixed_lines.append(f"{{{{< youtube {video_id} >}}}}\n") - # Fix unknown tags. - else: - tags = list(map(str, content.contents)) - if tags: - fixed_tags = self.remove_html_tags(tags) - if fixed_tags: - fixed_lines.extend(fixed_tags) + self._fix_html_tag(content, fixed_lines) else: - # Add the content as is. - fixed_lines.append(str(content)) + # Add the content. + fixed_lines.append(handle_regex_heuristics(content)) return fixed_lines + def _fix_html_tag(self, content, fixed_lines): + """ + Fixes the html tag. + """ + # Check if it is a YouTube video and add it as a shortcode. + if "is-provider-youtube" in content.attrs.get("class", []): + convert_figure_tag_to_shortcode(content, fixed_lines) + # Fix unknown tags. + else: + tags = list(map(str, content.contents)) + if tags: + fixed_tags = self.remove_html_tags(tags) + if fixed_tags: + fixed_lines.extend(fixed_tags) + def convert_post_content(self, post_content: str) -> str: """ Converts the post content @@ -147,7 +152,7 @@ class WordpressMarkdownConverter: # fix header header = yaml.safe_load(contents.split("---")[1]) - fixed_header = self.fix_hugo_header(header) + fixed_header = self.fix_header(header) # fix content post_content = contents.split("---", 2)[2].lstrip() fixed_post_content = self.convert_post_content(post_content) diff --git a/app/tests/converter/wordpress_markdown_test.py b/app/tests/converter/wordpress_markdown_test.py index 4e3d414..a36d2ef 100644 --- a/app/tests/converter/wordpress_markdown_test.py +++ b/app/tests/converter/wordpress_markdown_test.py @@ -14,17 +14,15 @@ from app.tests.utils import make_fake_configurator ("NucuLabs.dev", {"author": "Denis"}, {"author": "NucuLabs.dev"}), ], ) -def test_fix_hugo_header_rewrite_author(author_rewrite, input_header, expected_header): +def test_header_rewrite_author(author_rewrite, input_header, expected_header): configurator = make_fake_configurator( "wordpress_markdown_converter", ConverterOptions( author_rewrite=author_rewrite, - links_rewrite=[], - header_fields_drop=[], ), ) converter = WordpressMarkdownConverter(configurator) - assert converter.fix_hugo_header(input_header) == expected_header + assert converter.fix_header(input_header) == expected_header @pytest.mark.parametrize( @@ -36,14 +34,38 @@ def test_fix_hugo_header_rewrite_author(author_rewrite, input_header, expected_h ([], {"a": 1, "b": 2, "c": 3}, {"author": "", "a": 1, "b": 2, "c": 3}), ], ) -def test_fix_hugo_header_fields_drop(header_fields_drop, input_header, expected_header): +def test_header_fields_drop(header_fields_drop, input_header, expected_header): configurator = make_fake_configurator( "wordpress_markdown_converter", ConverterOptions( - author_rewrite="", - links_rewrite=[], header_fields_drop=header_fields_drop, ), ) converter = WordpressMarkdownConverter(configurator) - assert converter.fix_hugo_header(input_header) == expected_header + assert converter.fix_header(input_header) == expected_header + + +@pytest.mark.parametrize( + "input_lines, expected_lines", + [ + ([], []), + ([""], ["\n"]), + ( + [ + '
Thanks!' + ], + ["{{< youtube X5865VHcGmQ >}}\n", "Thanks!"], + ), + ( + ["Hello https://youtu.be/jv40aJbRjjY?list=RDjv40aJbRjjY Done"], + ["Hello https://youtu.be/jv40aJbRjjY?list=RDjv40aJbRjjY Done"], + ), + ], +) +def test_remove_html_tags_detect_youtube_links(input_lines, expected_lines): + configurator = make_fake_configurator( + "wordpress_markdown_converter", + ConverterOptions(), + ) + converter = WordpressMarkdownConverter(configurator) + assert converter.remove_html_tags(input_lines) == expected_lines