initial commit

This commit is contained in:
Denis-Cosmin Nutiu 2023-05-29 21:35:38 +03:00
parent c953bc7dff
commit 8afd96294f
20 changed files with 643 additions and 0 deletions

290
.gitignore vendored Normal file
View file

@ -0,0 +1,290 @@
# Created by https://www.toptal.com/developers/gitignore/api/pycharm,python
# Edit at https://www.toptal.com/developers/gitignore?templates=pycharm,python
### PyCharm ###
# Covers JetBrains IDEs: IntelliJ, RubyMine, PhpStorm, AppCode, PyCharm, CLion, Android Studio, WebStorm and Rider
# Reference: https://intellij-support.jetbrains.com/hc/en-us/articles/206544839
# User-specific stuff
.idea/**/workspace.xml
.idea/**/tasks.xml
.idea/**/usage.statistics.xml
.idea/**/dictionaries
.idea/**/shelf
# AWS User-specific
.idea/**/aws.xml
# Generated files
.idea/**/contentModel.xml
# Sensitive or high-churn files
.idea/**/dataSources/
.idea/**/dataSources.ids
.idea/**/dataSources.local.xml
.idea/**/sqlDataSources.xml
.idea/**/dynamic.xml
.idea/**/uiDesigner.xml
.idea/**/dbnavigator.xml
# Gradle
.idea/**/gradle.xml
.idea/**/libraries
# Gradle and Maven with auto-import
# When using Gradle or Maven with auto-import, you should exclude module files,
# since they will be recreated, and may cause churn. Uncomment if using
# auto-import.
# .idea/artifacts
# .idea/compiler.xml
# .idea/jarRepositories.xml
# .idea/modules.xml
# .idea/*.iml
# .idea/modules
# *.iml
# *.ipr
# CMake
cmake-build-*/
# Mongo Explorer plugin
.idea/**/mongoSettings.xml
# File-based project format
*.iws
# IntelliJ
out/
# mpeltonen/sbt-idea plugin
.idea_modules/
# JIRA plugin
atlassian-ide-plugin.xml
# Cursive Clojure plugin
.idea/replstate.xml
# SonarLint plugin
.idea/sonarlint/
# Crashlytics plugin (for Android Studio and IntelliJ)
com_crashlytics_export_strings.xml
crashlytics.properties
crashlytics-build.properties
fabric.properties
# Editor-based Rest Client
.idea/httpRequests
# Android studio 3.1+ serialized cache file
.idea/caches/build_file_checksums.ser
### PyCharm Patch ###
# Comment Reason: https://github.com/joeblau/gitignore.io/issues/186#issuecomment-215987721
# *.iml
# modules.xml
# .idea/misc.xml
# *.ipr
# Sonarlint plugin
# https://plugins.jetbrains.com/plugin/7973-sonarlint
.idea/**/sonarlint/
# SonarQube Plugin
# https://plugins.jetbrains.com/plugin/7238-sonarqube-community-plugin
.idea/**/sonarIssues.xml
# Markdown Navigator plugin
# https://plugins.jetbrains.com/plugin/7896-markdown-navigator-enhanced
.idea/**/markdown-navigator.xml
.idea/**/markdown-navigator-enh.xml
.idea/**/markdown-navigator/
# Cache file creation bug
# See https://youtrack.jetbrains.com/issue/JBR-2257
.idea/$CACHE_FILE$
# CodeStream plugin
# https://plugins.jetbrains.com/plugin/12206-codestream
.idea/codestream.xml
# Azure Toolkit for IntelliJ plugin
# https://plugins.jetbrains.com/plugin/8053-azure-toolkit-for-intellij
.idea/**/azureSettings.xml
### Python ###
# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class
# C extensions
*.so
# Distribution / packaging
.Python
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
share/python-wheels/
*.egg-info/
.installed.cfg
*.egg
MANIFEST
# PyInstaller
# Usually these files are written by a python script from a template
# before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec
# Installer logs
pip-log.txt
pip-delete-this-directory.txt
# Unit test / coverage reports
htmlcov/
.tox/
.nox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*.cover
*.py,cover
.hypothesis/
.pytest_cache/
cover/
# Translations
*.mo
*.pot
# Django stuff:
*.log
local_settings.py
db.sqlite3
db.sqlite3-journal
# Flask stuff:
instance/
.webassets-cache
# Scrapy stuff:
.scrapy
# Sphinx documentation
docs/_build/
# PyBuilder
.pybuilder/
target/
# Jupyter Notebook
.ipynb_checkpoints
# IPython
profile_default/
ipython_config.py
# pyenv
# For a library or package, you might want to ignore these files since the code is
# intended to run in multiple environments; otherwise, check them in:
# .python-version
# pipenv
# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
# However, in case of collaboration, if having platform-specific dependencies or dependencies
# having no cross-platform support, pipenv may install dependencies that don't work, or not
# install all needed dependencies.
#Pipfile.lock
# poetry
# Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
# This is especially recommended for binary packages to ensure reproducibility, and is more
# commonly ignored for libraries.
# https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
#poetry.lock
# pdm
# Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
#pdm.lock
# pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
# in version control.
# https://pdm.fming.dev/#use-with-ide
.pdm.toml
# PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
__pypackages__/
# Celery stuff
celerybeat-schedule
celerybeat.pid
# SageMath parsed files
*.sage.py
# Environments
.env
.venv
env/
venv/
ENV/
env.bak/
venv.bak/
# Spyder project settings
.spyderproject
.spyproject
# Rope project settings
.ropeproject
# mkdocs documentation
/site
# mypy
.mypy_cache/
.dmypy.json
dmypy.json
# Pyre type checker
.pyre/
# pytype static type analyzer
.pytype/
# Cython debug symbols
cython_debug/
# PyCharm
# JetBrains specific template is maintained in a separate JetBrains.gitignore that can
# be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
# and can be added to the global gitignore or merged into this file. For a more nuclear
# option (not recommended) you can uncomment the following to ignore the entire idea folder.
#.idea/
### Python Patch ###
# Poetry local configuration file - https://python-poetry.org/docs/configuration/#local-configuration
poetry.toml
# ruff
.ruff_cache/
# LSP config files
pyrightconfig.json
.my_test_data
my_test_data/
# End of https://www.toptal.com/developers/gitignore/api/pycharm,python

8
.idea/.gitignore vendored Normal file
View file

@ -0,0 +1,8 @@
# Default ignored files
/shelf/
/workspace.xml
# Editor-based HTTP Client requests
/httpRequests/
# Datasource local storage ignored files
/dataSources/
/dataSources.local.xml

View file

@ -0,0 +1,6 @@
<component name="InspectionProjectProfileManager">
<settings>
<option name="USE_PROJECT_PROFILE" value="false" />
<version value="1.0" />
</settings>
</component>

10
.idea/jekyll-to-hugo.iml Normal file
View file

@ -0,0 +1,10 @@
<?xml version="1.0" encoding="UTF-8"?>
<module type="PYTHON_MODULE" version="4">
<component name="NewModuleRootManager">
<content url="file://$MODULE_DIR$">
<excludeFolder url="file://$MODULE_DIR$/venv" />
</content>
<orderEntry type="jdk" jdkName="Python 3.11 (jekyll-to-hugo)" jdkType="Python SDK" />
<orderEntry type="sourceFolder" forTests="false" />
</component>
</module>

4
.idea/misc.xml Normal file
View file

@ -0,0 +1,4 @@
<?xml version="1.0" encoding="UTF-8"?>
<project version="4">
<component name="ProjectRootManager" version="2" project-jdk-name="Python 3.11 (jekyll-to-hugo)" project-jdk-type="Python SDK" />
</project>

8
.idea/modules.xml Normal file
View file

@ -0,0 +1,8 @@
<?xml version="1.0" encoding="UTF-8"?>
<project version="4">
<component name="ProjectModuleManager">
<modules>
<module fileurl="file://$PROJECT_DIR$/.idea/jekyll-to-hugo.iml" filepath="$PROJECT_DIR$/.idea/jekyll-to-hugo.iml" />
</modules>
</component>
</project>

6
.idea/vcs.xml Normal file
View file

@ -0,0 +1,6 @@
<?xml version="1.0" encoding="UTF-8"?>
<project version="4">
<component name="VcsDirectoryMappings">
<mapping directory="" vcs="Git" />
</component>
</project>

6
Makefile Normal file
View file

@ -0,0 +1,6 @@
# Neither target produces a file of the same name; declaring them phony keeps
# make from skipping a rule when a file or directory called "format" or "test"
# exists in the project root.
.PHONY: format test

# Formats the code
format:
	black . && isort -r .

# Run tests
test:
	pytest .

0
app/__init__.py Normal file
View file

View file

@ -0,0 +1,2 @@
# Re-export the converter classes at package level.
# Explicit relative imports are used because Python 3 does not resolve bare
# sibling-module names from inside a package (PEP 328).
from .converter import Converter
from .wordpress_markdown import WordpressMarkdownConverter

__all__ = ["Converter", "WordpressMarkdownConverter"]

View file

@ -0,0 +1,45 @@
import os
from pathlib import Path
from app import utils
from app.converter.wordpress_markdown import WordpressMarkdownConverter
class Converter:
    """
    Convert Jekyll posts to Hugo posts
    """

    def __init__(self, jekyll_posts_path: str, hugo_posts_path: str):
        """
        Initializes the converter

        Parameters
        ----------
        jekyll_posts_path : str
            The path to the Jekyll posts
        hugo_posts_path : str
            The path to the Hugo posts

        Raises
        ------
        ValueError
            If either path is None, not a string, or empty.
        """
        utils.guard_against_none_or_empty_str(jekyll_posts_path, "jekyll_posts_path")
        utils.guard_against_none_or_empty_str(hugo_posts_path, "hugo_posts_path")

        self._jekyll_posts_path = jekyll_posts_path
        self._hugo_posts_path = hugo_posts_path

        # The converter that converts the markdown
        self.markdown_converter = WordpressMarkdownConverter()

    def convert(self):
        """
        Converts the Jekyll posts to Hugo posts

        Every regular file directly inside the Jekyll posts directory is
        handed to the markdown converter; sub-directories are not descended
        into. The Hugo output directory is created when it does not exist.

        Raises
        ------
        NotADirectoryError
            If the Jekyll posts path does not point to an existing directory.
        """
        source_dir = Path(self._jekyll_posts_path)
        output_dir = Path(self._hugo_posts_path)

        # Fail with a descriptive error instead of the bare StopIteration that
        # next(os.walk(...)) produces for a missing directory.
        if not source_dir.is_dir():
            raise NotADirectoryError(
                f"Jekyll posts path is not a directory: {source_dir}"
            )

        # Writing a post fails when its parent directory is missing.
        output_dir.mkdir(parents=True, exist_ok=True)

        # Sorted so that runs are reproducible regardless of filesystem order.
        source_files = sorted(
            entry for entry in source_dir.iterdir() if entry.is_file()
        )
        for source_file in source_files:
            self.markdown_converter.convert_jekyll_to_hugo(
                source_file,
                output_dir,
            )

View file

@ -0,0 +1,145 @@
from pathlib import Path
import yaml
from bs4 import BeautifulSoup, Tag
from app.utils import key_error_silence
class WordpressMarkdownConverter:
    """
    Markdown converter that converts jekyll posts to hugo posts.
    """

    # Front-matter keys that are removed from every post header.
    _DROPPED_HEADER_KEYS = (
        "restapi_import_id",
        "original_post_id",
        "timeline_notification",
        "wordads_ufa",
    )

    def fix_hugo_header(self, header: dict) -> dict:
        """
        Fix the Hugo header

        The header is modified in place and also returned.

        Parameters
        ----------
        header : dict
            The header to fix

        Returns
        -------
        dict
            The fixed header
        """
        for key in self._DROPPED_HEADER_KEYS:
            header.pop(key, None)

        # Not every post carries a guid; only rewrite it when present.
        guid = header.get("guid")
        if isinstance(guid, str):
            header["guid"] = guid.replace("http://localhost", "")

        header["author"] = "Denis Nuțiu"
        return header

    def remove_html_tags(self, post_lines):
        """
        Strip HTML tags from the given lines, keeping their text content.

        Elements carrying the ``is-provider-youtube`` class are replaced with
        a Hugo ``youtube`` shortcode built from the next iframe's ``src``.
        All other tags are unwrapped recursively.

        Parameters
        ----------
        post_lines : list of str
            The lines (or HTML fragments) to clean

        Returns
        -------
        list of str
            The cleaned fragments
        """
        fixed_lines = []
        for line in post_lines:
            if line == "":
                fixed_lines.append("\n")
                continue

            # Name the parser explicitly: otherwise bs4 picks whichever parser
            # happens to be installed and emits a warning.
            soup = BeautifulSoup(line, "html.parser")
            for content in soup.contents:
                if not isinstance(content, Tag):
                    fixed_lines.append(str(content))
                    continue

                if "is-provider-youtube" in content.attrs.get("class", []):
                    iframe = content.find_next("iframe")
                    video_link = iframe.attrs.get("src") if iframe is not None else None
                    if video_link:
                        # The video id is the last path segment, without the query string.
                        video_id = video_link.rsplit("/", 1)[-1].split("?", 1)[0]
                        fixed_lines.append(f"{{{{< youtube {video_id} >}}}}\n")
                        continue
                    # No usable iframe: fall through and unwrap like any other tag.

                children = [str(child) for child in content.contents]
                if children:
                    fixed_lines.extend(self.remove_html_tags(children))
        return fixed_lines

    def convert_post_content(self, post_content: str) -> str:
        """
        Converts the post content

        Parameters
        ----------
        post_content : str
            The post content

        Returns
        -------
        str
            The converted post content
        """
        # fix link
        post_content = post_content.replace("http://localhost/", "/")
        post_content = post_content.replace(
            "https://nuculabs.wordpress.com/", "https://nuculabs.dev/posts/"
        )
        # fix unknown tags
        post_lines = post_content.split("\n")
        fixed_lines = self.remove_html_tags(post_lines)
        return "\n".join(fixed_lines)

    def read_jekyll_post(self, path: Path) -> str:
        """
        Read a Jekyll post from the specified path

        Parameters
        ----------
        path : Path
            The path to the Jekyll post

        Returns
        -------
        str
            The full text of the post
        """
        # Explicit encoding so the result does not depend on the platform locale.
        with open(path, "r", encoding="utf-8") as fh:
            return fh.read()

    def write_hugo_post(self, output_path, post_header: dict, post_content: str):
        """
        Write a Hugo post to the specified path

        Parameters
        ----------
        output_path : Path
            The path to the Hugo post
        post_header : dict
            The post header
        post_content : str
            The post content
        """
        with open(output_path, "w", encoding="utf-8") as fo:
            # allow_unicode keeps non-ASCII header values readable instead of
            # escaping them as \uXXXX sequences.
            header = ["---\n", yaml.dump(post_header, allow_unicode=True), "---\n"]
            fo.writelines(header)
            fo.write(post_content)

    def convert_jekyll_to_hugo(self, jekyll_post_path: Path, hugo_post_output: Path):
        """
        Convert a Jekyll post to a Hugo post

        Parameters
        ----------
        jekyll_post_path : Path
            The path to the Jekyll post
        hugo_post_output : Path
            The directory the Hugo post is written to; the file keeps the
            name of the Jekyll post

        Raises
        ------
        ValueError
            If the post has no ``---`` delimited front matter, or the front
            matter is not a YAML mapping.
        """
        contents = self.read_jekyll_post(jekyll_post_path)

        # Expected layout: <prefix>---<yaml header>---<body>
        parts = contents.split("---", 2)
        if len(parts) < 3:
            raise ValueError(
                f"{jekyll_post_path} does not contain a '---' delimited front matter"
            )
        _, raw_header, raw_body = parts

        # fix header
        header = yaml.safe_load(raw_header)
        if header is None:
            # An empty front matter parses to None.
            header = {}
        if not isinstance(header, dict):
            raise ValueError(f"{jekyll_post_path} front matter is not a mapping")
        fixed_header = self.fix_hugo_header(header)

        # fix content
        fixed_post_content = self.convert_post_content(raw_body.lstrip())

        self.write_hugo_post(
            hugo_post_output.joinpath(jekyll_post_path.name),
            fixed_header,
            fixed_post_content,
        )

0
app/tests/__init__.py Normal file
View file

35
app/tests/utils_test.py Normal file
View file

@ -0,0 +1,35 @@
import pytest
from app.utils import key_error_silence, guard_against_none_or_empty_str
def test_key_error_silence():
    """KeyError is swallowed by the context manager; other exceptions propagate."""
    # A KeyError raised inside the managed block must not escape.
    with key_error_silence():
        raise KeyError

    # Any other exception type still reaches the caller.
    with pytest.raises(ValueError), key_error_silence():
        raise ValueError
@pytest.mark.parametrize(
    "input_data",
    ["", None, 1, True, False, {}, [], {"a": 1}],
)
def test_guard_against_none_or_empty_str(input_data):
    """The empty string, None and every non-string value are rejected."""
    with pytest.raises(ValueError):
        guard_against_none_or_empty_str(input_data, "test")
def test_guard_against_none_or_empty_str_happy():
    """A non-empty string passes the guard without raising."""
    guard_against_none_or_empty_str(value="a", name="test")

28
app/utils.py Normal file
View file

@ -0,0 +1,28 @@
import contextlib
@contextlib.contextmanager
def key_error_silence():
    """
    Context manager that swallows a KeyError raised inside its block.

    Every other exception type propagates unchanged.
    """
    with contextlib.suppress(KeyError):
        yield
def guard_against_none_or_empty_str(value: str, name: str):
    """
    Reject anything that is not a non-empty string.

    Parameters:
    ----------
    value: str
        The value to check.
    name: str
        The name of the value, used in the error message.

    Raises:
    ------
    ValueError
        If value is None, is not a str, or is the empty string.
    """
    is_non_empty_str = isinstance(value, str) and value != ""
    if not is_non_empty_str:
        raise ValueError(f"{name} cannot be None or empty")

29
main.py Normal file
View file

@ -0,0 +1,29 @@
import logging
import sys
from app.converter import Converter
def main():
    """
    Command line entry point.

    Configures logging, checks that exactly two arguments were given (the
    Jekyll source path and the Hugo output path) and runs the conversion.
    Logs a usage message and exits with status 1 on a wrong argument count.
    """
    # Logging configuration
    logging.basicConfig(
        level=logging.INFO,
        format="%(asctime)s %(process)d %(levelname)s %(message)s",
        datefmt="%Y-%m-%d %H:%M:%S",
    )
    log = logging.getLogger(__name__)

    # Everything after the script name; exactly two values are required.
    cli_args = sys.argv[1:]
    if len(cli_args) != 2:
        log.error(
            "Usage: python main.py <source path to jekyll posts> <output path to hugo posts>"
        )
        sys.exit(1)

    # Converter
    source_path, output_path = cli_args
    Converter(source_path, output_path).convert()
# Run the conversion only when this file is executed as a script, not when it is imported.
if __name__ == "__main__":
    main()

15
readme.md Normal file
View file

@ -0,0 +1,15 @@
# Jekyll to Hugo Converter
Jekyll to Hugo Converter is a simple tool to convert Jekyll posts to Hugo posts.
You can also use it to convert your WordPress blog into a Hugo blog. Tutorial coming soon.
## Usage
```bash
pip install -r requirements.txt
python3 main.py <jekyll_post_path> <hugo_post_path>
```
---
Made with ❤️ by [NucuLabs.dev](https://nuculabs.dev)

2
requirements-dev.txt Normal file
View file

@ -0,0 +1,2 @@
black==23.3.0
pdoc==13.1.1

1
requirements-test.txt Normal file
View file

@ -0,0 +1 @@
pytest==7.3.1

3
requirements.txt Normal file
View file

@ -0,0 +1,3 @@
beautifulsoup4==4.12.2
PyYAML==6.0
soupsieve==2.4.1