Test remove_html_tags for use case: detect_youtube_links
This commit is contained in:
parent
5f4566e598
commit
ea589afabc
5 changed files with 68 additions and 28 deletions
|
@ -27,9 +27,9 @@ class ConverterOptions(BaseModel):
|
||||||
Will drop the specified header fields from the posts.
|
Will drop the specified header fields from the posts.
|
||||||
"""
|
"""
|
||||||
|
|
||||||
author_rewrite: str
|
author_rewrite: str = ""
|
||||||
links_rewrite: list[dict]
|
links_rewrite: list[dict] = []
|
||||||
header_fields_drop: list[str]
|
header_fields_drop: list[str] = []
|
||||||
|
|
||||||
|
|
||||||
class Configurator(BaseSettings):
|
class Configurator(BaseSettings):
|
||||||
|
|
5
app/converter/regex_heuristics.py
Normal file
5
app/converter/regex_heuristics.py
Normal file
|
@ -0,0 +1,5 @@
|
||||||
|
def handle_regex_heuristics(line: str) -> str:
    """
    Applies the regex heuristics to a line that contains no tags.

    No heuristic is implemented yet, so the very same line is handed back.
    """
    untouched_line = line
    return untouched_line
|
8
app/converter/tags_heuristics.py
Normal file
8
app/converter/tags_heuristics.py
Normal file
|
@ -0,0 +1,8 @@
|
||||||
|
def convert_figure_tag_to_shortcode(content, fixed_lines: list[str]) -> None:
    """
    Converts the figure tag that has 'is-provider-youtube' class to a YouTube shortcode.

    The video id is the last path segment of the ``src`` attribute of the first
    ``<iframe>`` nested inside ``content``. The resulting shortcode line is
    appended to ``fixed_lines``.

    When no id can be extracted (no nested iframe, no ``src`` attribute, or a
    URL without a path) the tag is appended as ``str(content)`` instead, so the
    embed markup is kept rather than dropped or replaced by a broken shortcode.

    :param content: The parsed figure tag; it must offer ``find(name)`` and the
        found element must expose an ``attrs`` mapping.
    :param fixed_lines: The list of already converted lines, mutated in place.
    """
    from urllib.parse import urlsplit

    # Search only inside the figure, so an iframe that merely follows the
    # figure is never mistaken for its video.
    iframe = content.find("iframe")
    video_link = (iframe.attrs.get("src") or "") if iframe is not None else ""

    # Work on the URL path only: this drops the query string and fragment, and
    # stripping the trailing slash keeps the last segment from being empty.
    video_id = urlsplit(video_link).path.rstrip("/").rsplit("/", 1)[-1]

    if not video_id:
        fixed_lines.append(str(content))
        return

    fixed_lines.append(f"{{{{< youtube {video_id} >}}}}\n")
|
|
@ -3,6 +3,8 @@ from bs4 import BeautifulSoup, Tag
|
||||||
|
|
||||||
from app import utils
|
from app import utils
|
||||||
from app.config import Configurator
|
from app.config import Configurator
|
||||||
|
from app.converter.regex_heuristics import handle_regex_heuristics
|
||||||
|
from app.converter.tags_heuristics import convert_figure_tag_to_shortcode
|
||||||
from app.io.reader import IoReader
|
from app.io.reader import IoReader
|
||||||
from app.io.writer import IoWriter
|
from app.io.writer import IoWriter
|
||||||
from app.utils import key_error_silence
|
from app.utils import key_error_silence
|
||||||
|
@ -25,7 +27,7 @@ class WordpressMarkdownConverter:
|
||||||
utils.guard_against_none(configurator, "configurator")
|
utils.guard_against_none(configurator, "configurator")
|
||||||
self.configurator = configurator
|
self.configurator = configurator
|
||||||
|
|
||||||
def fix_hugo_header(self, header: dict) -> dict:
|
def fix_header(self, header: dict) -> dict:
|
||||||
"""
|
"""
|
||||||
Fix the Hugo header
|
Fix the Hugo header
|
||||||
|
|
||||||
|
@ -58,12 +60,19 @@ class WordpressMarkdownConverter:
|
||||||
soup = BeautifulSoup(line, features="html.parser")
|
soup = BeautifulSoup(line, features="html.parser")
|
||||||
for content in soup.contents:
|
for content in soup.contents:
|
||||||
if isinstance(content, Tag):
|
if isinstance(content, Tag):
|
||||||
# Check if it is a youtube video and add it as a shortcode.
|
self._fix_html_tag(content, fixed_lines)
|
||||||
|
else:
|
||||||
|
# Add the content.
|
||||||
|
fixed_lines.append(handle_regex_heuristics(content))
|
||||||
|
return fixed_lines
|
||||||
|
|
||||||
|
def _fix_html_tag(self, content, fixed_lines):
|
||||||
|
"""
|
||||||
|
Fixes the html tag.
|
||||||
|
"""
|
||||||
|
# Check if it is a YouTube video and add it as a shortcode.
|
||||||
if "is-provider-youtube" in content.attrs.get("class", []):
|
if "is-provider-youtube" in content.attrs.get("class", []):
|
||||||
video_link = content.findNext("iframe").attrs["src"]
|
convert_figure_tag_to_shortcode(content, fixed_lines)
|
||||||
video_id_part = video_link.rsplit("/")
|
|
||||||
video_id = video_id_part[-1].split("?")[0]
|
|
||||||
fixed_lines.append(f"{{{{< youtube {video_id} >}}}}\n")
|
|
||||||
# Fix unknown tags.
|
# Fix unknown tags.
|
||||||
else:
|
else:
|
||||||
tags = list(map(str, content.contents))
|
tags = list(map(str, content.contents))
|
||||||
|
@ -71,10 +80,6 @@ class WordpressMarkdownConverter:
|
||||||
fixed_tags = self.remove_html_tags(tags)
|
fixed_tags = self.remove_html_tags(tags)
|
||||||
if fixed_tags:
|
if fixed_tags:
|
||||||
fixed_lines.extend(fixed_tags)
|
fixed_lines.extend(fixed_tags)
|
||||||
else:
|
|
||||||
# Add the content as is.
|
|
||||||
fixed_lines.append(str(content))
|
|
||||||
return fixed_lines
|
|
||||||
|
|
||||||
def convert_post_content(self, post_content: str) -> str:
|
def convert_post_content(self, post_content: str) -> str:
|
||||||
"""
|
"""
|
||||||
|
@ -147,7 +152,7 @@ class WordpressMarkdownConverter:
|
||||||
|
|
||||||
# fix header
|
# fix header
|
||||||
header = yaml.safe_load(contents.split("---")[1])
|
header = yaml.safe_load(contents.split("---")[1])
|
||||||
fixed_header = self.fix_hugo_header(header)
|
fixed_header = self.fix_header(header)
|
||||||
# fix content
|
# fix content
|
||||||
post_content = contents.split("---", 2)[2].lstrip()
|
post_content = contents.split("---", 2)[2].lstrip()
|
||||||
fixed_post_content = self.convert_post_content(post_content)
|
fixed_post_content = self.convert_post_content(post_content)
|
||||||
|
|
|
@ -14,17 +14,15 @@ from app.tests.utils import make_fake_configurator
|
||||||
("NucuLabs.dev", {"author": "Denis"}, {"author": "NucuLabs.dev"}),
|
("NucuLabs.dev", {"author": "Denis"}, {"author": "NucuLabs.dev"}),
|
||||||
],
|
],
|
||||||
)
|
)
|
||||||
def test_fix_hugo_header_rewrite_author(author_rewrite, input_header, expected_header):
|
def test_header_rewrite_author(author_rewrite, input_header, expected_header):
|
||||||
configurator = make_fake_configurator(
|
configurator = make_fake_configurator(
|
||||||
"wordpress_markdown_converter",
|
"wordpress_markdown_converter",
|
||||||
ConverterOptions(
|
ConverterOptions(
|
||||||
author_rewrite=author_rewrite,
|
author_rewrite=author_rewrite,
|
||||||
links_rewrite=[],
|
|
||||||
header_fields_drop=[],
|
|
||||||
),
|
),
|
||||||
)
|
)
|
||||||
converter = WordpressMarkdownConverter(configurator)
|
converter = WordpressMarkdownConverter(configurator)
|
||||||
assert converter.fix_hugo_header(input_header) == expected_header
|
assert converter.fix_header(input_header) == expected_header
|
||||||
|
|
||||||
|
|
||||||
@pytest.mark.parametrize(
|
@pytest.mark.parametrize(
|
||||||
|
@ -36,14 +34,38 @@ def test_fix_hugo_header_rewrite_author(author_rewrite, input_header, expected_h
|
||||||
([], {"a": 1, "b": 2, "c": 3}, {"author": "", "a": 1, "b": 2, "c": 3}),
|
([], {"a": 1, "b": 2, "c": 3}, {"author": "", "a": 1, "b": 2, "c": 3}),
|
||||||
],
|
],
|
||||||
)
|
)
|
||||||
def test_fix_hugo_header_fields_drop(header_fields_drop, input_header, expected_header):
|
def test_header_fields_drop(header_fields_drop, input_header, expected_header):
|
||||||
configurator = make_fake_configurator(
|
configurator = make_fake_configurator(
|
||||||
"wordpress_markdown_converter",
|
"wordpress_markdown_converter",
|
||||||
ConverterOptions(
|
ConverterOptions(
|
||||||
author_rewrite="",
|
|
||||||
links_rewrite=[],
|
|
||||||
header_fields_drop=header_fields_drop,
|
header_fields_drop=header_fields_drop,
|
||||||
),
|
),
|
||||||
)
|
)
|
||||||
converter = WordpressMarkdownConverter(configurator)
|
converter = WordpressMarkdownConverter(configurator)
|
||||||
assert converter.fix_hugo_header(input_header) == expected_header
|
assert converter.fix_header(input_header) == expected_header
|
||||||
|
|
||||||
|
|
||||||
|
_YOUTUBE_FIGURE_LINE = '<figure class="wp-block-embed is-type-video is-provider-youtube wp-block-embed-youtube wp-embed-aspect-16-9 wp-has-aspect-ratio"><div class="wp-block-embed__wrapper"><iframe allow="accelerometer; autoplay; clipboard-write; encrypted-media; gyroscope; picture-in-picture; web-share" allowfullscreen="" frameborder="0" height="281" loading="lazy" src="https://www.youtube.com/embed/X5865VHcGmQ?feature=oembed" title="Command Line Tools: fzf 🌸" width="500"></iframe></div></figure>Thanks!'


@pytest.mark.parametrize(
    "input_lines, expected_lines",
    [
        ([], []),
        ([""], ["\n"]),
        (
            [_YOUTUBE_FIGURE_LINE],
            ["{{< youtube X5865VHcGmQ >}}\n", "Thanks!"],
        ),
        (
            ["Hello https://youtu.be/jv40aJbRjjY?list=RDjv40aJbRjjY Done"],
            ["Hello https://youtu.be/jv40aJbRjjY?list=RDjv40aJbRjjY Done"],
        ),
    ],
)
def test_remove_html_tags_detect_youtube_links(input_lines, expected_lines):
    """
    Checks remove_html_tags against YouTube related lines.

    An embedded YouTube figure is expected to turn into a shortcode followed by
    the trailing text, while a plain youtu.be link is expected to stay as it is.
    """
    # Default options are enough here: no rewrite or drop rule is exercised.
    default_options = ConverterOptions()
    fake_configurator = make_fake_configurator(
        "wordpress_markdown_converter",
        default_options,
    )
    converter = WordpressMarkdownConverter(fake_configurator)

    actual_lines = converter.remove_html_tags(input_lines)

    assert actual_lines == expected_lines
|
||||||
|
|
Loading…
Reference in a new issue