test remove_html_tags for usecase: detect_youtube_links

This commit is contained in:
Denis-Cosmin Nutiu 2023-05-31 19:29:47 +03:00
parent 5f4566e598
commit ea589afabc
5 changed files with 68 additions and 28 deletions

View file

@ -27,9 +27,9 @@ class ConverterOptions(BaseModel):
Will drop the specified header fields from the posts. Will drop the specified header fields from the posts.
""" """
author_rewrite: str author_rewrite: str = ""
links_rewrite: list[dict] links_rewrite: list[dict] = []
header_fields_drop: list[str] header_fields_drop: list[str] = []
class Configurator(BaseSettings): class Configurator(BaseSettings):

View file

@ -0,0 +1,5 @@
def handle_regex_heuristics(line: str) -> str:
    """
    Apply regex heuristics to a line that is not wrapped in an HTML tag.

    No heuristics are implemented yet, so the line is handed back unchanged.
    """
    return line

View file

@ -0,0 +1,8 @@
def convert_figure_tag_to_shortcode(content, fixed_lines):
    """
    Converts the figure tag that has 'is-provider-youtube' class to a YouTube shortcode.

    The video id is the last path segment of the ``src`` URL of the first
    ``iframe`` found after ``content``, with any query string or fragment
    removed.

    Args:
        content: The tag to convert; it must provide ``find_next`` the way
            BeautifulSoup tags do.
        fixed_lines: Output list. The shortcode line is appended to it in place.

    Raises:
        AttributeError: If no iframe is found (``find_next`` returned ``None``).
        KeyError: If the iframe has no ``src`` attribute.
    """
    video_link = content.find_next("iframe").attrs["src"]
    # Strip the query string and fragment *before* looking for the last path
    # segment: they can contain "/" themselves (e.g. "?origin=https://site"),
    # which would otherwise be mistaken for the video id.
    video_path = video_link.split("?", 1)[0].split("#", 1)[0]
    # Tolerate a trailing slash so ".../embed/<id>/" still yields "<id>".
    video_id = video_path.rstrip("/").rsplit("/", 1)[-1]
    fixed_lines.append(f"{{{{< youtube {video_id} >}}}}\n")

View file

@ -3,6 +3,8 @@ from bs4 import BeautifulSoup, Tag
from app import utils from app import utils
from app.config import Configurator from app.config import Configurator
from app.converter.regex_heuristics import handle_regex_heuristics
from app.converter.tags_heuristics import convert_figure_tag_to_shortcode
from app.io.reader import IoReader from app.io.reader import IoReader
from app.io.writer import IoWriter from app.io.writer import IoWriter
from app.utils import key_error_silence from app.utils import key_error_silence
@ -25,7 +27,7 @@ class WordpressMarkdownConverter:
utils.guard_against_none(configurator, "configurator") utils.guard_against_none(configurator, "configurator")
self.configurator = configurator self.configurator = configurator
def fix_hugo_header(self, header: dict) -> dict: def fix_header(self, header: dict) -> dict:
""" """
Fix the Hugo header Fix the Hugo header
@ -58,12 +60,19 @@ class WordpressMarkdownConverter:
soup = BeautifulSoup(line, features="html.parser") soup = BeautifulSoup(line, features="html.parser")
for content in soup.contents: for content in soup.contents:
if isinstance(content, Tag): if isinstance(content, Tag):
# Check if it is a youtube video and add it as a shortcode. self._fix_html_tag(content, fixed_lines)
else:
# Add the content.
fixed_lines.append(handle_regex_heuristics(content))
return fixed_lines
def _fix_html_tag(self, content, fixed_lines):
"""
Fixes the html tag.
"""
# Check if it is a YouTube video and add it as a shortcode.
if "is-provider-youtube" in content.attrs.get("class", []): if "is-provider-youtube" in content.attrs.get("class", []):
video_link = content.findNext("iframe").attrs["src"] convert_figure_tag_to_shortcode(content, fixed_lines)
video_id_part = video_link.rsplit("/")
video_id = video_id_part[-1].split("?")[0]
fixed_lines.append(f"{{{{< youtube {video_id} >}}}}\n")
# Fix unknown tags. # Fix unknown tags.
else: else:
tags = list(map(str, content.contents)) tags = list(map(str, content.contents))
@ -71,10 +80,6 @@ class WordpressMarkdownConverter:
fixed_tags = self.remove_html_tags(tags) fixed_tags = self.remove_html_tags(tags)
if fixed_tags: if fixed_tags:
fixed_lines.extend(fixed_tags) fixed_lines.extend(fixed_tags)
else:
# Add the content as is.
fixed_lines.append(str(content))
return fixed_lines
def convert_post_content(self, post_content: str) -> str: def convert_post_content(self, post_content: str) -> str:
""" """
@ -147,7 +152,7 @@ class WordpressMarkdownConverter:
# fix header # fix header
header = yaml.safe_load(contents.split("---")[1]) header = yaml.safe_load(contents.split("---")[1])
fixed_header = self.fix_hugo_header(header) fixed_header = self.fix_header(header)
# fix content # fix content
post_content = contents.split("---", 2)[2].lstrip() post_content = contents.split("---", 2)[2].lstrip()
fixed_post_content = self.convert_post_content(post_content) fixed_post_content = self.convert_post_content(post_content)

View file

@ -14,17 +14,15 @@ from app.tests.utils import make_fake_configurator
("NucuLabs.dev", {"author": "Denis"}, {"author": "NucuLabs.dev"}), ("NucuLabs.dev", {"author": "Denis"}, {"author": "NucuLabs.dev"}),
], ],
) )
def test_fix_hugo_header_rewrite_author(author_rewrite, input_header, expected_header): def test_header_rewrite_author(author_rewrite, input_header, expected_header):
configurator = make_fake_configurator( configurator = make_fake_configurator(
"wordpress_markdown_converter", "wordpress_markdown_converter",
ConverterOptions( ConverterOptions(
author_rewrite=author_rewrite, author_rewrite=author_rewrite,
links_rewrite=[],
header_fields_drop=[],
), ),
) )
converter = WordpressMarkdownConverter(configurator) converter = WordpressMarkdownConverter(configurator)
assert converter.fix_hugo_header(input_header) == expected_header assert converter.fix_header(input_header) == expected_header
@pytest.mark.parametrize( @pytest.mark.parametrize(
@ -36,14 +34,38 @@ def test_fix_hugo_header_rewrite_author(author_rewrite, input_header, expected_h
([], {"a": 1, "b": 2, "c": 3}, {"author": "", "a": 1, "b": 2, "c": 3}), ([], {"a": 1, "b": 2, "c": 3}, {"author": "", "a": 1, "b": 2, "c": 3}),
], ],
) )
def test_fix_hugo_header_fields_drop(header_fields_drop, input_header, expected_header): def test_header_fields_drop(header_fields_drop, input_header, expected_header):
configurator = make_fake_configurator( configurator = make_fake_configurator(
"wordpress_markdown_converter", "wordpress_markdown_converter",
ConverterOptions( ConverterOptions(
author_rewrite="",
links_rewrite=[],
header_fields_drop=header_fields_drop, header_fields_drop=header_fields_drop,
), ),
) )
converter = WordpressMarkdownConverter(configurator) converter = WordpressMarkdownConverter(configurator)
assert converter.fix_hugo_header(input_header) == expected_header assert converter.fix_header(input_header) == expected_header
@pytest.mark.parametrize(
    "input_lines, expected_lines",
    [
        # No input lines, no output lines.
        ([], []),
        # An empty line comes back as a bare newline.
        ([""], ["\n"]),
        # An embedded YouTube figure becomes a shortcode; trailing text is kept.
        (
            [
                '<figure class="wp-block-embed is-type-video is-provider-youtube wp-block-embed-youtube wp-embed-aspect-16-9 wp-has-aspect-ratio"><div class="wp-block-embed__wrapper"><iframe allow="accelerometer; autoplay; clipboard-write; encrypted-media; gyroscope; picture-in-picture; web-share" allowfullscreen="" frameborder="0" height="281" loading="lazy" src="https://www.youtube.com/embed/X5865VHcGmQ?feature=oembed" title="Command Line Tools: fzf 🌸" width="500"></iframe></div></figure>Thanks!'
            ],
            ["{{< youtube X5865VHcGmQ >}}\n", "Thanks!"],
        ),
        # A bare YouTube link inside plain text is expected to pass through unchanged.
        (
            ["Hello https://youtu.be/jv40aJbRjjY?list=RDjv40aJbRjjY Done"],
            ["Hello https://youtu.be/jv40aJbRjjY?list=RDjv40aJbRjjY Done"],
        ),
    ],
)
def test_remove_html_tags_detect_youtube_links(input_lines, expected_lines):
    """Check remove_html_tags output for YouTube embeds and plain YouTube links."""
    converter = WordpressMarkdownConverter(
        make_fake_configurator("wordpress_markdown_converter", ConverterOptions())
    )
    actual_lines = converter.remove_html_tags(input_lines)
    assert actual_lines == expected_lines