import os
from pathlib import Path
from bs4 import BeautifulSoup
from markdownify import markdownify, MarkdownConverter
class IndentedCodeMarkdownConverter(MarkdownConverter):
    """Markdown converter that renders ``<pre>`` blocks as indented text.

    Only the ``<pre>`` hook is overridden here; every other element is
    handled by the ``MarkdownConverter`` base class.
    """

    def convert_pre(self, el, text, convert_as_inline):
        """Return *text* with each line prefixed by the indent string.

        ``el`` and ``convert_as_inline`` are part of the hook signature but
        are not used.  Falsy *text* yields ``''``.  Otherwise trailing
        whitespace is stripped, every remaining line is prefixed, and the
        result ends with exactly one newline.
        """
        if not text:
            return ''
        # NOTE(review): Markdown indented code blocks normally use a
        # four-space prefix -- confirm the width of this literal is intended.
        prefixed = '\n'.join(' ' + row for row in text.rstrip().splitlines())
        return prefixed + '\n'
def main():
    """Regenerate ``src/syntax_help.txt`` from the generated HTML docs.

    Reads ``target/doc/minijinja/syntax/index.html`` (resolved relative to
    the parent of this script's directory), takes the ``docblock`` inside
    ``#main-content``, cleans the markup up, converts it to Markdown wrapped
    at 80 columns and writes the result to ``src/syntax_help.txt`` next to
    this script.

    Raises:
        OSError: if the HTML input cannot be read or the output cannot be
            written (e.g. ``FileNotFoundError`` when the docs are missing).
        RuntimeError: if the page lacks the ``#main-content`` element, its
            ``docblock``, or the ``h2#synopsis`` heading.
    """
    doc_path = (
        Path(__file__).parent.parent
        / "target"
        / "doc"
        / "minijinja"
        / "syntax"
        / "index.html"
    )
    html_content = doc_path.read_text(encoding="utf-8")

    soup = BeautifulSoup(html_content, "html.parser")
    main_section = soup.find(id="main-content")
    if main_section is None:
        raise RuntimeError(f"no element with id 'main-content' in {doc_path}")
    main_content = main_section.find(class_="docblock")
    if main_content is None:
        raise RuntimeError(f"no 'docblock' element in #main-content of {doc_path}")

    # Remove ``a.doc-anchor`` elements entirely (markup and text).
    for anchor in main_content.select("a.doc-anchor"):
        anchor.decompose()

    # Every remaining link is flattened to a <span> holding only its text.
    for link in main_content.find_all("a"):
        span = soup.new_tag("span")
        span.string = link.text
        link.replace_with(span)

    # Look the synopsis heading up *before* headings are renamed below;
    # afterwards it is no longer an <h2>.
    synopsis_h2 = main_content.find("h2", id="synopsis")
    if synopsis_h2 is None:
        raise RuntimeError(f"no <h2 id='synopsis'> in the docblock of {doc_path}")

    # Promote every heading by one level (h2 -> h1, ..., h6 -> h5).
    for tag in main_content.find_all(['h2', 'h3', 'h4', 'h5', 'h6']):
        tag.name = f'h{int(tag.name[1]) - 1}'

    # The synopsis heading becomes the document title and everything that
    # precedes it at the same nesting level is dropped.
    synopsis_h2.string = 'Syntax Reference'
    for element in list(synopsis_h2.previous_siblings):
        if isinstance(element, str):
            element.extract()
        else:
            element.decompose()

    # Collapse the whitespace-only text after each paragraph into a single
    # newline (only when something still follows the paragraph).
    for paragraph in main_content.find_all('p'):
        next_sibling = paragraph.next_sibling
        while isinstance(next_sibling, str) and next_sibling.strip() == '':
            next_sibling.extract()
            next_sibling = paragraph.next_sibling
        if next_sibling is not None:
            paragraph.insert_after('\n')

    # Strip whitespace-only text nodes from the top level.  Iterate over a
    # snapshot: ``extract()`` mutates ``contents``, and walking the live list
    # would skip the node right after each removed one.
    for element in list(main_content.contents):
        if isinstance(element, str) and element.strip() == '':
            element.extract()

    # Trim trailing newlines from <code> elements that expose a single string.
    for code_block in main_content.find_all('code'):
        code_text = code_block.string
        if code_text and code_text.endswith('\n'):
            code_block.string = code_text.rstrip('\n')

    # Drop the ``div.example-wrap`` wrappers but keep their children.
    for example_wrap in main_content.select('div.example-wrap'):
        example_wrap.unwrap()

    # Replace <code> inside headings with plain <span> text.  ``get_text()``
    # is used because ``.string`` is None when the tag has several children,
    # and assigning None as a tag's string fails.
    for headline in main_content.find_all(['h1', 'h2', 'h3', 'h4', 'h5', 'h6']):
        for code_tag in headline.find_all('code'):
            span_tag = soup.new_tag('span')
            span_tag.string = code_tag.get_text()
            code_tag.replace_with(span_tag)

    # Insert a <br> after every <pre>.
    # NOTE(review): presumably this keeps the converted code block separated
    # from the text that follows -- confirm against the generated output.
    for pre_tag in main_content.find_all('pre'):
        pre_tag.insert_after(soup.new_tag('br'))

    # Remove paragraphs whose first <strong> starts with "Feature:".
    for paragraph in main_content.find_all('p'):
        strong = paragraph.strong
        if strong and strong.string and strong.string.startswith('Feature:'):
            paragraph.decompose()

    markdown_content = IndentedCodeMarkdownConverter(
        escape_underscores=False,
        escape_asterisks=False,
        escape_misc=False,
        wrap=True,
        wrap_width=80,
    ).convert(str(main_content))

    output_path = Path(__file__).parent / "src" / "syntax_help.txt"
    # Trailing whitespace is stripped, so the file ends without a newline.
    output_path.write_text(markdown_content.rstrip(), encoding='utf-8')
    print("Regenerated", output_path)
# Run the regeneration only when this file is executed as a script, not when
# it is imported as a module.
if __name__ == "__main__":
    main()