test remove_html_tags for usecase: detect_youtube_links
This commit is contained in:
parent
5f4566e598
commit
ea589afabc
5 changed files with 68 additions and 28 deletions
|
@ -27,9 +27,9 @@ class ConverterOptions(BaseModel):
|
|||
Will drop the specified header fields from the posts.
|
||||
"""
|
||||
|
||||
author_rewrite: str
|
||||
links_rewrite: list[dict]
|
||||
header_fields_drop: list[str]
|
||||
author_rewrite: str = ""
|
||||
links_rewrite: list[dict] = []
|
||||
header_fields_drop: list[str] = []
|
||||
|
||||
|
||||
class Configurator(BaseSettings):
|
||||
|
|
5
app/converter/regex_heuristics.py
Normal file
5
app/converter/regex_heuristics.py
Normal file
|
@ -0,0 +1,5 @@
|
|||
def handle_regex_heuristics(line: str) -> str:
    """
    Hook for regex-based clean-ups of a line that carries no HTML tags.

    No heuristic is applied at the moment: the line is handed back exactly
    as it was received.

    :param line: the tag-free line to process.
    :return: the very same line, unchanged.
    """
    # Placeholder: regex rules are meant to be plugged in here.
    return line
|
8
app/converter/tags_heuristics.py
Normal file
8
app/converter/tags_heuristics.py
Normal file
|
@ -0,0 +1,8 @@
|
|||
def convert_figure_tag_to_shortcode(content, fixed_lines):
    """
    Converts the figure tag that has 'is-provider-youtube' class to a YouTube shortcode.

    The video id is the last path segment of the ``src`` URL of the first
    ``iframe`` found after ``content``. The resulting shortcode line is
    appended to ``fixed_lines`` in place.

    If no ``iframe`` with a usable ``src`` is found, the original markup
    (``str(content)``) is appended instead, so nothing is dropped silently.

    :param content: tag-like object exposing ``find_next`` (presumably a
        BeautifulSoup ``Tag`` - confirm against the caller).
    :param fixed_lines: list of output lines; mutated in place.
    :return: None.
    """
    # Imported locally so this function stays self-contained.
    from urllib.parse import urlsplit

    iframe = content.find_next("iframe")
    video_link = iframe.attrs.get("src") if iframe is not None else None

    video_id = ""
    if video_link:
        # Parse the URL properly: only the path is inspected, so a "/" in the
        # query string or a "#fragment" cannot leak into the video id, and a
        # trailing "/" does not produce an empty id.
        path = urlsplit(video_link).path
        video_id = path.rstrip("/").rsplit("/", 1)[-1]

    if not video_id:
        # Nothing to embed: keep the raw markup rather than crashing or
        # emitting a shortcode without an id.
        fixed_lines.append(str(content))
        return

    fixed_lines.append(f"{{{{< youtube {video_id} >}}}}\n")
|
|
@ -3,6 +3,8 @@ from bs4 import BeautifulSoup, Tag
|
|||
|
||||
from app import utils
|
||||
from app.config import Configurator
|
||||
from app.converter.regex_heuristics import handle_regex_heuristics
|
||||
from app.converter.tags_heuristics import convert_figure_tag_to_shortcode
|
||||
from app.io.reader import IoReader
|
||||
from app.io.writer import IoWriter
|
||||
from app.utils import key_error_silence
|
||||
|
@ -25,7 +27,7 @@ class WordpressMarkdownConverter:
|
|||
utils.guard_against_none(configurator, "configurator")
|
||||
self.configurator = configurator
|
||||
|
||||
def fix_hugo_header(self, header: dict) -> dict:
|
||||
def fix_header(self, header: dict) -> dict:
|
||||
"""
|
||||
Fix the Hugo header
|
||||
|
||||
|
@ -58,24 +60,27 @@ class WordpressMarkdownConverter:
|
|||
soup = BeautifulSoup(line, features="html.parser")
|
||||
for content in soup.contents:
|
||||
if isinstance(content, Tag):
|
||||
# Check if it is a youtube video and add it as a shortcode.
|
||||
if "is-provider-youtube" in content.attrs.get("class", []):
|
||||
video_link = content.findNext("iframe").attrs["src"]
|
||||
video_id_part = video_link.rsplit("/")
|
||||
video_id = video_id_part[-1].split("?")[0]
|
||||
fixed_lines.append(f"{{{{< youtube {video_id} >}}}}\n")
|
||||
# Fix unknown tags.
|
||||
else:
|
||||
tags = list(map(str, content.contents))
|
||||
if tags:
|
||||
fixed_tags = self.remove_html_tags(tags)
|
||||
if fixed_tags:
|
||||
fixed_lines.extend(fixed_tags)
|
||||
self._fix_html_tag(content, fixed_lines)
|
||||
else:
|
||||
# Add the content as is.
|
||||
fixed_lines.append(str(content))
|
||||
# Add the content.
|
||||
fixed_lines.append(handle_regex_heuristics(content))
|
||||
return fixed_lines
|
||||
|
||||
def _fix_html_tag(self, content, fixed_lines):
    """
    Fixes the html tag.

    A tag whose ``class`` attribute contains 'is-provider-youtube' is handed
    to ``convert_figure_tag_to_shortcode``. For any other tag, its children
    are stringified and passed through ``self.remove_html_tags``; whatever
    comes back is added to ``fixed_lines``.

    :param content: the tag to process; must expose ``attrs`` and ``contents``.
    :param fixed_lines: list of output lines; mutated in place.
    :return: None.
    """
    css_classes = content.attrs.get("class", [])

    # YouTube embeds become a shortcode and need no further processing.
    if "is-provider-youtube" in css_classes:
        convert_figure_tag_to_shortcode(content, fixed_lines)
        return

    # Unknown tag: recurse into its children.
    children = [str(child) for child in content.contents]
    if not children:
        return

    cleaned_children = self.remove_html_tags(children)
    if cleaned_children:
        fixed_lines.extend(cleaned_children)
|
||||
|
||||
def convert_post_content(self, post_content: str) -> str:
|
||||
"""
|
||||
Converts the post content
|
||||
|
@ -147,7 +152,7 @@ class WordpressMarkdownConverter:
|
|||
|
||||
# fix header
|
||||
header = yaml.safe_load(contents.split("---")[1])
|
||||
fixed_header = self.fix_hugo_header(header)
|
||||
fixed_header = self.fix_header(header)
|
||||
# fix content
|
||||
post_content = contents.split("---", 2)[2].lstrip()
|
||||
fixed_post_content = self.convert_post_content(post_content)
|
||||
|
|
|
@ -14,17 +14,15 @@ from app.tests.utils import make_fake_configurator
|
|||
("NucuLabs.dev", {"author": "Denis"}, {"author": "NucuLabs.dev"}),
|
||||
],
|
||||
)
|
||||
def test_fix_hugo_header_rewrite_author(author_rewrite, input_header, expected_header):
|
||||
def test_header_rewrite_author(author_rewrite, input_header, expected_header):
|
||||
configurator = make_fake_configurator(
|
||||
"wordpress_markdown_converter",
|
||||
ConverterOptions(
|
||||
author_rewrite=author_rewrite,
|
||||
links_rewrite=[],
|
||||
header_fields_drop=[],
|
||||
),
|
||||
)
|
||||
converter = WordpressMarkdownConverter(configurator)
|
||||
assert converter.fix_hugo_header(input_header) == expected_header
|
||||
assert converter.fix_header(input_header) == expected_header
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
|
@ -36,14 +34,38 @@ def test_fix_hugo_header_rewrite_author(author_rewrite, input_header, expected_h
|
|||
([], {"a": 1, "b": 2, "c": 3}, {"author": "", "a": 1, "b": 2, "c": 3}),
|
||||
],
|
||||
)
|
||||
def test_fix_hugo_header_fields_drop(header_fields_drop, input_header, expected_header):
|
||||
def test_header_fields_drop(header_fields_drop, input_header, expected_header):
|
||||
configurator = make_fake_configurator(
|
||||
"wordpress_markdown_converter",
|
||||
ConverterOptions(
|
||||
author_rewrite="",
|
||||
links_rewrite=[],
|
||||
header_fields_drop=header_fields_drop,
|
||||
),
|
||||
)
|
||||
converter = WordpressMarkdownConverter(configurator)
|
||||
assert converter.fix_hugo_header(input_header) == expected_header
|
||||
assert converter.fix_header(input_header) == expected_header
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
    "input_lines, expected_lines",
    [
        ([], []),
        ([""], ["\n"]),
        (
            [
                '<figure class="wp-block-embed is-type-video is-provider-youtube wp-block-embed-youtube wp-embed-aspect-16-9 wp-has-aspect-ratio"><div class="wp-block-embed__wrapper"><iframe allow="accelerometer; autoplay; clipboard-write; encrypted-media; gyroscope; picture-in-picture; web-share" allowfullscreen="" frameborder="0" height="281" loading="lazy" src="https://www.youtube.com/embed/X5865VHcGmQ?feature=oembed" title="Command Line Tools: fzf 🌸" width="500"></iframe></div></figure>Thanks!'
            ],
            ["{{< youtube X5865VHcGmQ >}}\n", "Thanks!"],
        ),
        (
            ["Hello https://youtu.be/jv40aJbRjjY?list=RDjv40aJbRjjY Done"],
            ["Hello https://youtu.be/jv40aJbRjjY?list=RDjv40aJbRjjY Done"],
        ),
    ],
)
def test_remove_html_tags_detect_youtube_links(input_lines, expected_lines):
    """
    Checks remove_html_tags on YouTube-related input.

    Cases: no lines, a single empty line, an embedded YouTube figure followed
    by trailing text, and a plain line containing a bare youtu.be link.
    """
    # Default options: every converter option keeps its default value.
    options = ConverterOptions()
    configurator = make_fake_configurator("wordpress_markdown_converter", options)
    converter = WordpressMarkdownConverter(configurator)

    actual_lines = converter.remove_html_tags(input_lines)

    assert actual_lines == expected_lines
|
||||
|
|
Loading…
Reference in a new issue