diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..550bf39 --- /dev/null +++ b/.gitignore @@ -0,0 +1,290 @@ +# Created by https://www.toptal.com/developers/gitignore/api/pycharm,python +# Edit at https://www.toptal.com/developers/gitignore?templates=pycharm,python + +### PyCharm ### +# Covers JetBrains IDEs: IntelliJ, RubyMine, PhpStorm, AppCode, PyCharm, CLion, Android Studio, WebStorm and Rider +# Reference: https://intellij-support.jetbrains.com/hc/en-us/articles/206544839 + +# User-specific stuff +.idea/**/workspace.xml +.idea/**/tasks.xml +.idea/**/usage.statistics.xml +.idea/**/dictionaries +.idea/**/shelf + +# AWS User-specific +.idea/**/aws.xml + +# Generated files +.idea/**/contentModel.xml + +# Sensitive or high-churn files +.idea/**/dataSources/ +.idea/**/dataSources.ids +.idea/**/dataSources.local.xml +.idea/**/sqlDataSources.xml +.idea/**/dynamic.xml +.idea/**/uiDesigner.xml +.idea/**/dbnavigator.xml + +# Gradle +.idea/**/gradle.xml +.idea/**/libraries + +# Gradle and Maven with auto-import +# When using Gradle or Maven with auto-import, you should exclude module files, +# since they will be recreated, and may cause churn. Uncomment if using +# auto-import. 
+# .idea/artifacts +# .idea/compiler.xml +# .idea/jarRepositories.xml +# .idea/modules.xml +# .idea/*.iml +# .idea/modules +# *.iml +# *.ipr + +# CMake +cmake-build-*/ + +# Mongo Explorer plugin +.idea/**/mongoSettings.xml + +# File-based project format +*.iws + +# IntelliJ +out/ + +# mpeltonen/sbt-idea plugin +.idea_modules/ + +# JIRA plugin +atlassian-ide-plugin.xml + +# Cursive Clojure plugin +.idea/replstate.xml + +# SonarLint plugin +.idea/sonarlint/ + +# Crashlytics plugin (for Android Studio and IntelliJ) +com_crashlytics_export_strings.xml +crashlytics.properties +crashlytics-build.properties +fabric.properties + +# Editor-based Rest Client +.idea/httpRequests + +# Android studio 3.1+ serialized cache file +.idea/caches/build_file_checksums.ser + +### PyCharm Patch ### +# Comment Reason: https://github.com/joeblau/gitignore.io/issues/186#issuecomment-215987721 + +# *.iml +# modules.xml +# .idea/misc.xml +# *.ipr + +# Sonarlint plugin +# https://plugins.jetbrains.com/plugin/7973-sonarlint +.idea/**/sonarlint/ + +# SonarQube Plugin +# https://plugins.jetbrains.com/plugin/7238-sonarqube-community-plugin +.idea/**/sonarIssues.xml + +# Markdown Navigator plugin +# https://plugins.jetbrains.com/plugin/7896-markdown-navigator-enhanced +.idea/**/markdown-navigator.xml +.idea/**/markdown-navigator-enh.xml +.idea/**/markdown-navigator/ + +# Cache file creation bug +# See https://youtrack.jetbrains.com/issue/JBR-2257 +.idea/$CACHE_FILE$ + +# CodeStream plugin +# https://plugins.jetbrains.com/plugin/12206-codestream +.idea/codestream.xml + +# Azure Toolkit for IntelliJ plugin +# https://plugins.jetbrains.com/plugin/8053-azure-toolkit-for-intellij +.idea/**/azureSettings.xml + +### Python ### +# Byte-compiled / optimized / DLL files +__pycache__/ +*.py[cod] +*$py.class + +# C extensions +*.so + +# Distribution / packaging +.Python +build/ +develop-eggs/ +dist/ +downloads/ +eggs/ +.eggs/ +lib/ +lib64/ +parts/ +sdist/ +var/ +wheels/ +share/python-wheels/ +*.egg-info/ 
+.installed.cfg +*.egg +MANIFEST + +# PyInstaller +# Usually these files are written by a python script from a template +# before PyInstaller builds the exe, so as to inject date/other infos into it. +*.manifest +*.spec + +# Installer logs +pip-log.txt +pip-delete-this-directory.txt + +# Unit test / coverage reports +htmlcov/ +.tox/ +.nox/ +.coverage +.coverage.* +.cache +nosetests.xml +coverage.xml +*.cover +*.py,cover +.hypothesis/ +.pytest_cache/ +cover/ + +# Translations +*.mo +*.pot + +# Django stuff: +*.log +local_settings.py +db.sqlite3 +db.sqlite3-journal + +# Flask stuff: +instance/ +.webassets-cache + +# Scrapy stuff: +.scrapy + +# Sphinx documentation +docs/_build/ + +# PyBuilder +.pybuilder/ +target/ + +# Jupyter Notebook +.ipynb_checkpoints + +# IPython +profile_default/ +ipython_config.py + +# pyenv +# For a library or package, you might want to ignore these files since the code is +# intended to run in multiple environments; otherwise, check them in: +# .python-version + +# pipenv +# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. +# However, in case of collaboration, if having platform-specific dependencies or dependencies +# having no cross-platform support, pipenv may install dependencies that don't work, or not +# install all needed dependencies. +#Pipfile.lock + +# poetry +# Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control. +# This is especially recommended for binary packages to ensure reproducibility, and is more +# commonly ignored for libraries. +# https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control +#poetry.lock + +# pdm +# Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control. +#pdm.lock +# pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it +# in version control. 
+# https://pdm.fming.dev/#use-with-ide +.pdm.toml + +# PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm +__pypackages__/ + +# Celery stuff +celerybeat-schedule +celerybeat.pid + +# SageMath parsed files +*.sage.py + +# Environments +.env +.venv +env/ +venv/ +ENV/ +env.bak/ +venv.bak/ + +# Spyder project settings +.spyderproject +.spyproject + +# Rope project settings +.ropeproject + +# mkdocs documentation +/site + +# mypy +.mypy_cache/ +.dmypy.json +dmypy.json + +# Pyre type checker +.pyre/ + +# pytype static type analyzer +.pytype/ + +# Cython debug symbols +cython_debug/ + +# PyCharm +# JetBrains specific template is maintained in a separate JetBrains.gitignore that can +# be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore +# and can be added to the global gitignore or merged into this file. For a more nuclear +# option (not recommended) you can uncomment the following to ignore the entire idea folder. +#.idea/ + +### Python Patch ### +# Poetry local configuration file - https://python-poetry.org/docs/configuration/#local-configuration +poetry.toml + +# ruff +.ruff_cache/ + +# LSP config files +pyrightconfig.json +.my_test_data +my_test_data/ +# End of https://www.toptal.com/developers/gitignore/api/pycharm,python \ No newline at end of file diff --git a/.idea/.gitignore b/.idea/.gitignore new file mode 100644 index 0000000..13566b8 --- /dev/null +++ b/.idea/.gitignore @@ -0,0 +1,8 @@ +# Default ignored files +/shelf/ +/workspace.xml +# Editor-based HTTP Client requests +/httpRequests/ +# Datasource local storage ignored files +/dataSources/ +/dataSources.local.xml diff --git a/.idea/inspectionProfiles/profiles_settings.xml b/.idea/inspectionProfiles/profiles_settings.xml new file mode 100644 index 0000000..105ce2d --- /dev/null +++ b/.idea/inspectionProfiles/profiles_settings.xml @@ -0,0 +1,6 @@ + + + + \ No newline at end of file diff --git a/.idea/jekyll-to-hugo.iml 
b/.idea/jekyll-to-hugo.iml
new file mode 100644
index 0000000..8428c11
--- /dev/null
+++ b/.idea/jekyll-to-hugo.iml
@@ -0,0 +1,10 @@
+
+
+
+
+
+
+
+
+
+
\ No newline at end of file
diff --git a/.idea/misc.xml b/.idea/misc.xml
new file mode 100644
index 0000000..8cf1f08
--- /dev/null
+++ b/.idea/misc.xml
@@ -0,0 +1,4 @@
+
+
+
+
\ No newline at end of file
diff --git a/.idea/modules.xml b/.idea/modules.xml
new file mode 100644
index 0000000..9bec349
--- /dev/null
+++ b/.idea/modules.xml
@@ -0,0 +1,8 @@
+
+
+
+
+
+
+
+
\ No newline at end of file
diff --git a/.idea/vcs.xml b/.idea/vcs.xml
new file mode 100644
index 0000000..35eb1dd
--- /dev/null
+++ b/.idea/vcs.xml
@@ -0,0 +1,6 @@
+
+
+
+
+
+
\ No newline at end of file
diff --git a/Makefile b/Makefile
new file mode 100644
index 0000000..8158c13
--- /dev/null
+++ b/Makefile
@@ -0,0 +1,6 @@
+# Formats the code
+format:
+	black . && isort -r .
+# Run tests
+test:
+	pytest .
\ No newline at end of file
diff --git a/app/__init__.py b/app/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/app/converter/__init__.py b/app/converter/__init__.py
new file mode 100644
index 0000000..a679a33
--- /dev/null
+++ b/app/converter/__init__.py
@@ -0,0 +1,2 @@
+from app.converter.converter import Converter
+from app.converter.wordpress_markdown import WordpressMarkdownConverter
diff --git a/app/converter/converter.py b/app/converter/converter.py
new file mode 100644
index 0000000..0d04ab0
--- /dev/null
+++ b/app/converter/converter.py
@@ -0,0 +1,64 @@
+import os
+from pathlib import Path
+
+from app import utils
+from app.converter.wordpress_markdown import WordpressMarkdownConverter
+
+
+class Converter:
+    """
+    Convert Jekyll posts to Hugo posts
+    """
+
+    def __init__(self, jekyll_posts_path: str, hugo_posts_path: str):
+        """
+        Initializes the converter
+
+        Parameters
+        ----------
+        jekyll_posts_path : str
+            The path to the Jekyll posts
+        hugo_posts_path : str
+            The path to the Hugo posts
+
+        Raises
+        ------
+        ValueError
+            If either path is None, not a string or empty
+        """
+        utils.guard_against_none_or_empty_str(jekyll_posts_path, "jekyll_posts_path")
+        utils.guard_against_none_or_empty_str(hugo_posts_path, "hugo_posts_path")
+
+        self._jekyll_posts_path = jekyll_posts_path
+        self._hugo_posts_path = hugo_posts_path
+
+        # The converter that converts the markdown
+        self.markdown_converter = WordpressMarkdownConverter()
+
+    def convert(self):
+        """
+        Converts the Jekyll posts to Hugo posts
+
+        Only the files directly inside the Jekyll posts directory are
+        converted; sub-directories are not visited.
+
+        Raises
+        ------
+        NotADirectoryError
+            If the Jekyll posts path is not an existing directory
+        """
+        source_path = Path(self._jekyll_posts_path)
+        output_path = Path(self._hugo_posts_path)
+        # os.walk yields nothing for a missing directory, which would make
+        # next() leak a bare StopIteration; fail with a clear error instead.
+        if not source_path.is_dir():
+            raise NotADirectoryError(f"{source_path} is not a directory")
+        # Make sure the destination exists before any post is written.
+        output_path.mkdir(parents=True, exist_ok=True)
+        _, _, files = next(os.walk(source_path))
+        # Sorted so that posts are processed in a deterministic order.
+        for file in sorted(files):
+            self.markdown_converter.convert_jekyll_to_hugo(
+                source_path / file,
+                output_path,
+            )
diff --git a/app/converter/wordpress_markdown.py b/app/converter/wordpress_markdown.py
new file mode 100644
index 0000000..76b366e
--- /dev/null
+++ b/app/converter/wordpress_markdown.py
@@ -0,0 +1,189 @@
+from pathlib import Path
+
+import yaml
+from bs4 import BeautifulSoup, Tag
+
+from app.utils import key_error_silence
+
+
+class WordpressMarkdownConverter:
+    """
+    Markdown converter that converts jekyll posts to hugo posts.
+    """
+
+    def fix_hugo_header(self, header: dict) -> dict:
+        """
+        Fix the Hugo header
+
+        Drops the WordPress-only keys, removes ``http://localhost`` from
+        the guid (when the post has one) and sets the author.
+        The header is modified in place and returned.
+
+        Parameters
+        ----------
+        header : dict
+            The header to fix
+
+        Returns
+        -------
+        dict
+            The fixed header
+        """
+        wordpress_only_keys = (
+            "restapi_import_id",
+            "original_post_id",
+            "timeline_notification",
+            "wordads_ufa",
+        )
+        for key in wordpress_only_keys:
+            with key_error_silence():
+                del header[key]
+        # Not every post carries a guid; only rewrite it when it is there.
+        guid = header.get("guid")
+        if isinstance(guid, str):
+            header["guid"] = guid.replace("http://localhost", "")
+        header["author"] = "Denis Nuțiu"
+        return header
+
+    def remove_html_tags(self, post_lines):
+        """
+        Unwrap the HTML tags found in the post lines
+
+        Empty lines become a newline, YouTube embed blocks become a Hugo
+        ``youtube`` shortcode, every other tag is replaced by its
+        (recursively unwrapped) children and plain text is kept as is.
+
+        Parameters
+        ----------
+        post_lines : list
+            The lines of the post content
+
+        Returns
+        -------
+        list
+            The lines without HTML tags
+        """
+        fixed_lines = []
+        for line in post_lines:
+            if line == "":
+                fixed_lines.append("\n")
+                continue
+            # Name the parser explicitly: otherwise bs4 picks whichever
+            # parser happens to be installed and warns about it.
+            soup = BeautifulSoup(line, "html.parser")
+            for content in soup.contents:
+                if not isinstance(content, Tag):
+                    fixed_lines.append(str(content))
+                    continue
+                if "is-provider-youtube" in content.attrs.get("class", []):
+                    video_link = content.find_next("iframe").attrs["src"]
+                    # The video id is the last path segment, query removed
+                    video_id = video_link.rsplit("/", 1)[-1].split("?")[0]
+                    fixed_lines.append(f"{{{{< youtube {video_id} >}}}}\n")
+                    continue
+                tags = list(map(str, content.contents))
+                if tags:
+                    fixed_lines.extend(self.remove_html_tags(tags))
+        return fixed_lines
+
+    def convert_post_content(self, post_content: str) -> str:
+        """
+        Converts the post content
+
+        Parameters
+        ----------
+        post_content : str
+            The post content
+
+        Returns
+        -------
+        str
+            The converted post content
+        """
+        # fix link
+        post_content = post_content.replace("http://localhost/", "/")
+        post_content = post_content.replace(
+            "https://nuculabs.wordpress.com/", "https://nuculabs.dev/posts/"
+        )
+        # fix unknown tags
+        post_lines = post_content.split("\n")
+        fixed_lines = self.remove_html_tags(post_lines)
+
+        return "\n".join(fixed_lines)
+
+    def read_jekyll_post(self, path: Path) -> str:
+        """
+        Read a Jekyll post from the specified path
+
+        Parameters
+        ----------
+        path : Path
+            The path to the Jekyll post
+
+        Returns
+        -------
+        str
+            The contents of the post
+        """
+        # read source as UTF-8 instead of relying on the platform default
+        # encoding
+        with open(path, "r", encoding="utf-8") as fh:
+            contents = fh.read()
+        return contents
+
+    def write_hugo_post(self, output_path, post_header: dict, post_content: str):
+        """
+        Write a Hugo post to the specified path
+
+        Parameters
+        ----------
+        output_path : Path
+            The path to the Hugo post
+        post_header : dict
+            The post header
+        post_content : str
+            The post content
+        """
+        with open(output_path, "w", encoding="utf-8") as fo:
+            # allow_unicode keeps values such as the author name readable
+            # instead of dumping them as escape sequences
+            header = ["---\n", yaml.dump(post_header, allow_unicode=True), "---\n"]
+            fo.writelines(header)
+            fo.write(post_content)
+
+    def convert_jekyll_to_hugo(self, jekyll_post_path: Path, hugo_post_output: Path):
+        """
+        Convert a Jekyll post to a Hugo post
+
+        Parameters
+        ----------
+        jekyll_post_path : Path
+            The path to the Jekyll post
+        hugo_post_output : Path
+            The path to the Hugo post
+
+        Raises
+        ------
+        ValueError
+            If the post has no ``---`` delimited front matter
+        """
+        contents = self.read_jekyll_post(jekyll_post_path)
+
+        # Split once into (preamble, header, content); maxsplit keeps any
+        # later "---" inside the content.
+        parts = contents.split("---", 2)
+        if len(parts) < 3:
+            raise ValueError(f"{jekyll_post_path} has no front matter")
+
+        # fix header
+        header = yaml.safe_load(parts[1]) or {}
+        fixed_header = self.fix_hugo_header(header)
+        # fix content
+        post_content = parts[2].lstrip()
+        fixed_post_content = self.convert_post_content(post_content)
+
+        self.write_hugo_post(
+            hugo_post_output.joinpath(jekyll_post_path.name),
+            fixed_header,
+            fixed_post_content,
+        )
diff --git a/app/tests/__init__.py b/app/tests/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/app/tests/utils_test.py b/app/tests/utils_test.py
new file mode 100644
index 0000000..60294f3
--- /dev/null
+++ b/app/tests/utils_test.py
@@ -0,0 +1,35 @@
+import pytest
+
+from app.utils import key_error_silence, guard_against_none_or_empty_str
+
+
+def test_key_error_silence():
+    # Test that the context manager silences the exception
+    with key_error_silence():
+        raise KeyError
+    # Test that the context manager does not silence other exceptions
+    with pytest.raises(ValueError):
+        with key_error_silence():
+            raise ValueError
+
+
+@pytest.mark.parametrize(
+    "input_data",
+    [
+        (""),
+        (None),
+        (1),
+        (True),
+        (False),
+        ({}),
+        ([]),
+        ({"a": 1}),
+    ],
+)
+def test_guard_against_none_or_empty_str(input_data):
+    with pytest.raises(ValueError):
+        guard_against_none_or_empty_str(input_data, "test")
+
+
+def test_guard_against_none_or_empty_str_happy():
+    guard_against_none_or_empty_str("a", "test")
diff --git a/app/utils.py b/app/utils.py
new file mode 100644
index 0000000..8880fd4
--- /dev/null
+++ b/app/utils.py
@@ -0,0 +1,33 @@
+import contextlib
+
+
+@contextlib.contextmanager
+def key_error_silence():
+    """
+    Context manager that silences KeyError exceptions.
+    """
+    try:
+        yield
+    except KeyError:
+        pass
+
+
+def guard_against_none_or_empty_str(value: str, name: str):
+    """
+    Guard against None, non-string or empty string values.
+
+    Parameters
+    ----------
+    value : str
+        The value to check.
+    name : str
+        The name of the value, used in the error message.
+
+    Raises
+    ------
+    ValueError
+        If the value is not a string or is an empty string.
+    """
+    # isinstance already rejects None, so no separate None check is needed
+    if not isinstance(value, str) or value == "":
+        raise ValueError(f"{name} cannot be None or empty")
diff --git a/main.py b/main.py
new file mode 100644
index 0000000..2fc1642
--- /dev/null
+++ b/main.py
@@ -0,0 +1,30 @@
+import logging
+import sys
+
+from app.converter import Converter
+
+
+def main():
+    """
+    Convert the Jekyll posts given on the command line to Hugo posts
+    """
+    # Logging configuration
+    logging.basicConfig(
+        format="%(asctime)s %(process)d %(levelname)s %(message)s",
+        level=logging.INFO,
+        datefmt="%Y-%m-%d %H:%M:%S",
+    )
+    logger = logging.getLogger(__name__)
+
+    # Exactly two arguments are expected: the source and the output path
+    if len(sys.argv) != 3:
+        logger.error("Usage: python main.py <jekyll_posts_path> <hugo_posts_path>")
+        sys.exit(1)
+
+    # Converter
+    converter = Converter(sys.argv[1], sys.argv[2])
+    converter.convert()
+
+
+if __name__ == "__main__":
+    main()
diff --git a/readme.md b/readme.md
new file mode 100644
index 0000000..06e6ac7
--- /dev/null
+++ b/readme.md
@@ -0,0 +1,15 @@
+# Jekyll to Hugo Converter
+
+Jekyll to Hugo Converter is a simple tool to convert Jekyll posts to Hugo posts.
+
+You can also use it to convert your WordPress blog into a Hugo blog. Tutorial coming soon.
+ +## Usage + +```bash +pip install -r requirements.txt +python3 main.py <jekyll_posts_path> <hugo_posts_path> +``` + +--- +Made with ❤️ by [NucuLabs.dev](https://nuculabs.dev) \ No newline at end of file diff --git a/requirements-dev.txt b/requirements-dev.txt new file mode 100644 index 0000000..6854b1f --- /dev/null +++ b/requirements-dev.txt @@ -0,0 +1,2 @@ +black==23.3.0 +pdoc==13.1.1 \ No newline at end of file diff --git a/requirements-test.txt b/requirements-test.txt new file mode 100644 index 0000000..a6510db --- /dev/null +++ b/requirements-test.txt @@ -0,0 +1 @@ +pytest==7.3.1 \ No newline at end of file diff --git a/requirements.txt b/requirements.txt new file mode 100644 index 0000000..8dbee84 --- /dev/null +++ b/requirements.txt @@ -0,0 +1,3 @@ +beautifulsoup4==4.12.2 +PyYAML==6.0 +soupsieve==2.4.1