from __future__ import annotations
import argparse
import re
import sys
from pathlib import Path
# Width threshold in characters: lines longer than this are handed to the
# reflow step, and it is the default wrap width used there.
MAX_LINE_WIDTH = 160
# Known misspellings (keys) and their corrections (values). Each key is
# replaced only where it appears as a whole word.
_TYPO_MAP: dict[str, str] = {
"varous": "various",
"runtim": "runtime",
}
# Tokeniser used when wrapping long lines. Compiled with re.VERBOSE, so the
# whitespace and the "#" comment inside the pattern are not part of the match.
# Alternatives are tried in order: a markdown link, an inline code span, then
# any run of non-whitespace. Listing links and code spans first keeps them as
# single tokens even when they contain spaces.
_TOKEN_RE = re.compile(
r"""
\[[^\]]*\]\([^)]*\) # markdown link: [text](url)
| `[^`]+` | \S+ """,
re.VERBOSE,
)
# Matches a line that begins with "## [" (e.g. a "## [1.2.3]" release heading).
_VERSION_RE = re.compile(r"^## \[")
# Matches a GitHub pull-request link such as "[#123](https://github.com/o/r/pull/123)".
# The captured group is the number from the link text, not from the URL.
_PR_LINK_RE = re.compile(r"\[#(\d+)\]\(https://github\.com/[^)]+/pull/\d+\)")
# Matches optional leading whitespace followed by a GitHub commit link whose
# text is a back-ticked, 7-character lowercase hex hash.
_COMMIT_LINK_RE = re.compile(r"\s*\[`[a-f0-9]{7}`\]\(https://github\.com/[^)]+/commit/[a-f0-9]+\)")
# Matches a "* " list marker at the start of a line; group 1 is the indentation.
_STAR_LIST_RE = re.compile(r"^(\s*)\* ")
# Matches a "-" list marker followed by two or more whitespace characters;
# group 1 is the indentation plus the dash.
_LIST_MARKER_SPACE_RE = re.compile(r"^(\s*-)\s{2,}")
def _max_pr_number(entry: str) -> int:
    """Return the largest pull-request number linked from *entry*.

    Every match of ``_PR_LINK_RE`` contributes the number captured from its
    link text. When *entry* contains no pull-request link the result is ``0``.
    """
    linked_numbers = (int(match.group(1)) for match in _PR_LINK_RE.finditer(entry))
    return max(linked_numbers, default=0)
def _compact_entry(line: str, *, strip_breaking: bool = False) -> str:
    """Return *line* with commit links removed and trailing whitespace trimmed.

    Args:
        line: A changelog entry line.
        strip_breaking: When true, also drop the first ``[**breaking**] ``
            marker (including its trailing space) from the entry.

    Returns:
        The compacted entry text.
    """
    compacted = _COMMIT_LINK_RE.sub("", line).rstrip()
    if not strip_breaking:
        return compacted
    # Only the first marker is removed; any later occurrence is left intact.
    return compacted.replace("[**breaking**] ", "", 1)
def _extract_section_summaries(
    section: list[str],
) -> tuple[list[str], list[str]]:
    """Collect summary entries from the lines of one changelog section.

    Only lines that start with ``"- "`` (top-level list items) are considered.
    An entry carrying the ``[**breaking**]`` marker goes into the breaking
    list; an entry containing a pull-request link goes into the PR list. An
    entry that has both appears in both lists. Every collected entry is
    compacted via ``_compact_entry`` with the breaking marker stripped.

    Args:
        section: The lines of a single section.

    Returns:
        A ``(pr_entries, breaking_entries)`` tuple, each in source order.
    """
    merged_prs: list[str] = []
    breaking: list[str] = []
    top_level_items = (row for row in section if row.startswith("- "))
    for item in top_level_items:
        compact = _compact_entry(item, strip_breaking=True)
        # Both checks look at the original item, not the compacted text.
        if "[**breaking**]" in item:
            breaking.append(compact)
        if _PR_LINK_RE.search(item) is not None:
            merged_prs.append(compact)
    return merged_prs, breaking
def _inject_summary_sections(text: str) -> str:
    """Insert per-release summary blocks below each version heading.

    The text is split into sections at lines matching ``_VERSION_RE``. For
    each section that does not already contain a summary heading, the
    breaking-change and pull-request entries found in that section are
    written as ``### ⚠️ Breaking Changes`` / ``### Merged Pull Requests``
    blocks in front of the first non-blank line after the version heading.
    Pull-request entries are ordered by descending PR number.

    Args:
        text: Full changelog text.

    Returns:
        The text with summary blocks inserted, or *text* unchanged when it
        contains no version heading.
    """
    lines = text.split("\n")
    heading_rows = [row for row, content in enumerate(lines) if _VERSION_RE.match(content)]
    if not heading_rows:
        return text

    # Pair every heading with the row where its section stops.
    section_bounds = list(zip(heading_rows, heading_rows[1:] + [len(lines)]))

    # Work bottom-up so an insertion never shifts the rows of the sections
    # that are still waiting to be processed.
    for start, end in reversed(section_bounds):
        section = lines[start:end]
        already_summarised = any(
            "### Merged Pull Requests" in row or "### ⚠️ Breaking Changes" in row
            for row in section
        )
        if already_summarised:
            continue

        pr_entries, breaking_entries = _extract_section_summaries(section)
        if not (pr_entries or breaking_entries):
            continue
        pr_entries.sort(key=_max_pr_number, reverse=True)

        # First non-blank row after the heading, or the section end if none.
        insert_at = next(
            (row for row in range(start + 1, end) if lines[row].strip() != ""),
            end,
        )

        block: list[str] = []
        for title, entries in (
            ("### ⚠️ Breaking Changes", breaking_entries),
            ("### Merged Pull Requests", pr_entries),
        ):
            if entries:
                block += [title, "", *entries, ""]
        lines[insert_at:insert_at] = block

    return "\n".join(lines)
def _reflow_line(line: str, max_width: int = MAX_LINE_WIDTH) -> str:
    """Wrap *line* so that each physical line fits within *max_width*.

    The content is split with ``_TOKEN_RE``, which keeps markdown links and
    inline code spans together as single tokens, and the tokens are greedily
    packed onto lines joined by single spaces. A token that is itself wider
    than *max_width* is placed on a line of its own rather than being split.

    Args:
        line: A single line of text (no embedded newlines expected).
        max_width: Maximum width of each output line, in characters.

    Returns:
        *line* unchanged when it already fits or has no tokens; otherwise the
        wrapped text with physical lines separated by ``"\\n"``.
    """
    if len(line) <= max_width:
        return line

    stripped = line.lstrip()
    indent = line[: len(line) - len(stripped)]

    if stripped.startswith(("- ", "* ")):
        marker = stripped[:2]
        first_prefix = indent + marker
        content = stripped[2:]
        # Continuation lines must start at the list item's content column,
        # i.e. be indented by the full width of the marker, so that they are
        # unambiguously part of the same item.
        cont_indent = indent + " " * len(marker)
    else:
        first_prefix = indent
        content = stripped
        cont_indent = indent

    tokens = _TOKEN_RE.findall(content)
    if not tokens:
        return line

    wrapped: list[str] = []
    current = first_prefix + tokens[0]
    for token in tokens[1:]:
        candidate = f"{current} {token}"
        if len(candidate) <= max_width:
            current = candidate
        else:
            wrapped.append(current)
            current = cont_indent + token
    wrapped.append(current)
    return "\n".join(wrapped)
def _deindent_orphan(line: str, lines: list[str], idx: int) -> str:
stripped = line.lstrip()
if not (line.startswith(" ") and stripped.startswith("- ")):
return line
our_indent = len(line) - len(stripped)
nearest_parent_indent: int | None = None
for j in range(idx - 1, -1, -1):
prev = lines[j]
if not prev.strip():
continue if prev.startswith(" "):
prev_stripped = prev.lstrip()
if prev_stripped.startswith(("- ", "* ")):
parent_indent = len(prev) - len(prev_stripped)
if our_indent > parent_indent and nearest_parent_indent is None:
nearest_parent_indent = parent_indent
continue is_list_parent = prev.startswith(("- ", "* "))
if is_list_parent:
base = nearest_parent_indent + 2 if nearest_parent_indent is not None else 2
return " " * base + stripped
return line[2:] if nearest_parent_indent is not None else stripped
return line[2:] if nearest_parent_indent is not None else stripped
def _needs_blank_before(stripped: str, result: list[str]) -> bool:
if not stripped.startswith("- ") or not result or not result[-1].strip():
return False
prev = result[-1].lstrip()
return not prev.startswith(("-", "#"))
def _fix_typos(text: str) -> str:
    """Return *text* with every whole-word typo from ``_TYPO_MAP`` corrected.

    The entries are applied one after another in the map's iteration order;
    each typo is matched only between word boundaries.
    """
    corrected = text
    for wrong, right in _TYPO_MAP.items():
        whole_word = re.compile(r"\b" + re.escape(wrong) + r"\b")
        corrected = whole_word.sub(right, corrected)
    return corrected
def postprocess(path: Path) -> None:
    """Rewrite the changelog at *path* in place with markdown hygiene applied.

    Steps, in order:

    1. Read the file as UTF-8, fix known typos and inject summary sections.
    2. Walk the lines. Fenced code blocks are copied through verbatim, except
       that an opening fence gets a blank line in front of it when the
       previous output line is non-blank, and a bare opening fence becomes
       a ``text`` fence.
    3. Outside code blocks: ``* `` markers become ``- ``, extra whitespace
       after a ``-`` marker is collapsed, list indentation is normalised, a
       blank line is inserted before a list that follows other text, and
       lines longer than ``MAX_LINE_WIDTH`` are reflowed.
    4. Write the result back as UTF-8 with exactly one trailing newline.

    Args:
        path: Location of the changelog file to rewrite.
    """
    source = _inject_summary_sections(_fix_typos(path.read_text(encoding="utf-8")))
    lines = source.split("\n")

    result: list[str] = []
    inside_fence = False
    for idx, raw in enumerate(lines):
        unindented = raw.lstrip()

        if unindented.startswith("```"):
            if inside_fence:
                inside_fence = False
            else:
                inside_fence = True
                if result and result[-1].strip():
                    result.append("")
                if unindented == "```":
                    raw = raw.replace("```", "```text", 1)
            result.append(raw)
            continue

        if inside_fence:
            result.append(raw)
            continue

        fixed = _STAR_LIST_RE.sub(r"\1- ", raw)
        fixed = _LIST_MARKER_SPACE_RE.sub(r"\1 ", fixed)
        # The backward scan runs over the original, unprocessed lines.
        fixed = _deindent_orphan(fixed, lines, idx)

        if _needs_blank_before(fixed.lstrip(), result):
            result.append("")
        result.append(_reflow_line(fixed) if len(fixed) > MAX_LINE_WIDTH else fixed)

    output = "\n".join(result).rstrip("\n") + "\n"
    path.write_text(output, encoding="utf-8")
def main() -> None:
    """Command-line entry point.

    Accepts an optional positional ``path`` (default ``CHANGELOG.md``) and
    post-processes that file. When the path is not an existing file, an error
    is printed to standard error and the process exits with status 1.
    """
    parser = argparse.ArgumentParser(
        prog="postprocess-changelog",
        description="Apply markdown hygiene to a git-cliff generated CHANGELOG.md.",
    )
    parser.add_argument(
        "path",
        nargs="?",
        default="CHANGELOG.md",
        help="Path to CHANGELOG.md (default: CHANGELOG.md)",
    )
    changelog = Path(parser.parse_args().path)

    if changelog.is_file():
        postprocess(changelog)
        return

    print(f"Error: {changelog} not found", file=sys.stderr)
    sys.exit(1)


if __name__ == "__main__":
    main()