add config option for RegexHeuristics

This commit is contained in:
Denis-Cosmin NUTIU 2023-06-02 11:12:24 +03:00
parent f4696b35b2
commit ce06c201ee
3 changed files with 32 additions and 5 deletions

View file

@ -13,6 +13,16 @@ def yaml_config_settings_source(settings: BaseSettings):
return yaml.safe_load(fh) return yaml.safe_load(fh)
class RegexHeuristics(BaseModel):
    """
    Per-heuristic switches for the regex-based line modifications.

    Every field toggles a single heuristic: True means the option is
    enabled, False means the option is disabled.
    """

    # Switch for the heuristic that removes the pre tag; on by default.
    remove_pre_tag: bool = True
class ConverterOptions(BaseModel): class ConverterOptions(BaseModel):
""" """
Converter options. Converter options.
@ -30,6 +40,8 @@ class ConverterOptions(BaseModel):
author_rewrite: str = "" author_rewrite: str = ""
links_rewrite: list[dict] = [] links_rewrite: list[dict] = []
header_fields_drop: list[str] = [] header_fields_drop: list[str] = []
enable_regex_heuristics: bool = True
regex_heuristics: RegexHeuristics = RegexHeuristics()
class Configurator(BaseSettings): class Configurator(BaseSettings):

View file

@ -1,7 +1,10 @@
import re import re
from collections import namedtuple
from app import utils from app import utils
# Pairs a rule's handler (`callback`) with the option `name` that is looked up
# in the regex heuristics options to decide whether the rule is applied.
RegexCallback = namedtuple("RegexCallback", ["callback", "name"])
class RegexHeuristics: class RegexHeuristics:
""" """
@ -12,11 +15,16 @@ class RegexHeuristics:
utils.guard_against_none(configurator, "configurator") utils.guard_against_none(configurator, "configurator")
self.configurator = configurator self.configurator = configurator
self._regex_options = (
self.configurator.converter_options.regex_heuristics.dict()
)
self._rules = { self._rules = {
"^(</*pre.*?>)`{0,3}(?P<content>.*?)(<\/pre>)?$": self._remove_pre_tag, "^(</*pre.*?>)`{0,3}(?P<content>.*?)(<\/pre>)?$": RegexCallback(
self._remove_pre_tag, "remove_pre_tag"
),
} }
def _remove_pre_tag(self, match) -> str: def _remove_pre_tag(self, match: re.Match) -> str:
""" """
Removes the pre tag from the match. Removes the pre tag from the match.
""" """
@ -26,9 +34,15 @@ class RegexHeuristics:
""" """
Manipulates a line by using regex heuristics. Manipulates a line by using regex heuristics.
""" """
if not self.configurator.converter_options.enable_regex_heuristics:
return line
for regex, callback in self._rules.items(): for regex, callback in self._rules.items():
option_enabled = self._regex_options.get(callback.name, False)
if not option_enabled:
continue
match = re.match(regex, line) match = re.match(regex, line)
if match: if match:
return callback(match) line = callback.callback(match)
else:
return line return line

View file

@ -3,6 +3,7 @@ source_path: "/Users/dnutiu/PycharmProjects/jekyll-to-hugo/my_test_data/_posts"
output_path: "/Users/dnutiu/NucuLabsProjects/NucuLabsDevBlog/content/posts" output_path: "/Users/dnutiu/NucuLabsProjects/NucuLabsDevBlog/content/posts"
converter: "wordpress_markdown_converter" converter: "wordpress_markdown_converter"
converter_options: converter_options:
enable_regex_heuristics: true
author_rewrite: "Denis Nuțiu" author_rewrite: "Denis Nuțiu"
links_rewrite: links_rewrite:
- source: "http://localhost/" - source: "http://localhost/"