from __future__ import annotations
import argparse
import re
import sys
from pathlib import Path
MAX_LINE_WIDTH = 160
_TYPO_MAP: dict[str, str] = {
"deniest": "denies",
"varous": "various",
"runtim": "runtime",
}
_TOKEN_RE = re.compile(
r"""
\[[^\]]*\]\([^)]*\) # markdown link: [text](url)
| `[^`]+` | \S+ """,
re.VERBOSE,
)
_VERSION_RE = re.compile(r"^## \[")
_PR_LINK_RE = re.compile(r"\[#(\d+)\]\(https://github\.com/[^)]+/pull/\d+\)")
_COMMIT_LINK_RE = re.compile(r"\s*\[`[a-f0-9]{7}`\]\(https://github\.com/[^)]+/commit/[a-f0-9]+\)")
_STAR_LIST_RE = re.compile(r"^(\s*)\* ")
_LIST_MARKER_SPACE_RE = re.compile(r"^(\s*-)\s{2,}")
_INDENTED_ATX_HEADING_RE = re.compile(r"^(?P<indent>\s+)#{1,6}\s+(?P<title>.*?)(?:\s+#+\s*)?$")
_SQUASH_HEADING_RE = re.compile(r"^(?P<indent>\s*)-\s+(?P<prefix>[A-Za-z]+(?:\([^)]+\))?!?):\s+(?P<title>.+?)\s*$")
_SQUASH_HEADING_LABELS: dict[str, str] = {
"feat": "Added",
"fix": "Fixed",
"perf": "Performance",
"refactor": "Changed",
"test": "Changed",
"style": "Changed",
"build": "Maintenance",
"chore": "Maintenance",
"ci": "Maintenance",
"doc": "Documentation",
"docs": "Documentation",
"added": "Added",
"fixed": "Fixed",
"changed": "Changed",
"performance": "Performance",
"documentation": "Documentation",
"maintenance": "Maintenance",
"deprecated": "Deprecated",
"removed": "Removed",
}
def _plain_summary(text: str) -> str:
    """Normalize a changelog entry to a lowercase, link-free summary string."""
    without_links = _PR_LINK_RE.sub("", _COMMIT_LINK_RE.sub("", text))
    without_marker = re.sub(r"^\s*[-*]\s+", "", without_links)
    without_prefix = re.sub(r"^[A-Za-z]+(?:\([^)]+\))?!?:\s+", "", without_marker)
    collapsed = re.sub(r"\s+", " ", without_prefix)
    return collapsed.strip().casefold()
def _squash_heading_parts(line: str) -> tuple[str, str, str] | None:
    """Split a conventional-commit list entry into (indent, label, title).

    Returns None when the line carries a commit link, does not match the
    squash-heading shape, uses an unknown type prefix, or has an empty title.
    """
    if _COMMIT_LINK_RE.search(line):
        return None
    found = _SQUASH_HEADING_RE.match(line)
    if not found:
        return None
    # Strip any "(scope)" and trailing "!" before looking up the label.
    kind = re.sub(r"\([^)]+\)", "", found.group("prefix")).rstrip("!").casefold()
    heading_label = _SQUASH_HEADING_LABELS.get(kind)
    if heading_label is None:
        return None
    body = found.group("title").strip()
    if not body:
        return None
    capitalized = body[0].upper() + body[1:]
    return found.group("indent"), heading_label, capitalized
def _normalize_squash_heading(line: str, *, nested: bool = False) -> str:
    """Rewrite a squash-commit heading as a bold label line, else return as-is."""
    extracted = _squash_heading_parts(line)
    if extracted is None:
        return line
    indent, label, title = extracted
    if nested and not indent:
        indent = " "
    return f"{indent}**{label}: {title}**"
def _is_duplicate_squash_heading(line: str, parent_summary: str | None) -> bool:
    """Return True when *line* merely restates its parent entry's summary."""
    if parent_summary is None:
        return False
    extracted = _squash_heading_parts(line)
    if extracted is None:
        return False
    return _plain_summary(extracted[2]) == parent_summary
def _is_isolated_body_heading(lines: list[str], idx: int) -> bool:
prev_is_blank = idx > 0 and not lines[idx - 1].strip()
next_is_blank = idx + 1 < len(lines) and not lines[idx + 1].strip()
return prev_is_blank and next_is_blank
def _max_pr_number(entry: str) -> int:
    """Largest PR number linked in *entry*; 0 when no PR links are present."""
    found = _PR_LINK_RE.findall(entry)
    if not found:
        return 0
    return max(int(num) for num in found)
def _compact_entry(line: str, *, strip_breaking: bool = False) -> str:
    """Drop commit links (and optionally the breaking marker) from a line."""
    compacted = _COMMIT_LINK_RE.sub("", line).rstrip()
    if not strip_breaking:
        return compacted
    return compacted.replace("[**breaking**] ", "", 1)
def _extract_section_summaries(
    section: list[str],
) -> tuple[list[str], list[str]]:
    """Collect (PR-linked entries, breaking-change entries) from a section."""
    with_pr: list[str] = []
    breaking: list[str] = []
    for entry in section:
        if not entry.startswith("- "):
            continue
        compacted = _compact_entry(entry, strip_breaking=True)
        if "[**breaking**]" in entry:
            breaking.append(compacted)
        if _PR_LINK_RE.search(entry):
            with_pr.append(compacted)
    return with_pr, breaking
def _inject_summary_sections(text: str) -> str:
    """Insert Breaking-Changes / Merged-PR summary blocks into each version.

    Sections that already contain either summary heading are left untouched.
    Insertion runs back-to-front so earlier boundary indices stay valid while
    the list grows.
    """
    lines = text.split("\n")
    starts = [i for i, candidate in enumerate(lines) if _VERSION_RE.match(candidate)]
    if not starts:
        return text
    bounds = list(zip(starts, starts[1:] + [len(lines)]))
    for start, end in reversed(bounds):
        section = lines[start:end]
        already_done = any(
            "### Merged Pull Requests" in s or "### ⚠️ Breaking Changes" in s
            for s in section
        )
        if already_done:
            continue
        pr_entries, breaking_entries = _extract_section_summaries(section)
        if not (pr_entries or breaking_entries):
            continue
        pr_entries.sort(key=_max_pr_number, reverse=True)
        # Skip the heading itself plus any blank lines that follow it.
        insert_at = start + 1
        while insert_at < end and not lines[insert_at].strip():
            insert_at += 1
        block: list[str] = []
        if breaking_entries:
            block.extend(["### ⚠️ Breaking Changes", "", *breaking_entries, ""])
        if pr_entries:
            block.extend(["### Merged Pull Requests", "", *pr_entries, ""])
        lines[insert_at:insert_at] = block
    return "\n".join(lines)
def _reflow_line(line: str, max_width: int = MAX_LINE_WIDTH) -> str:
    """Wrap an over-long line at *max_width*, preserving list-item shape.

    Markdown links and inline code spans are treated as unbreakable tokens.
    Lines already within the limit (or with no tokens) come back unchanged.
    """
    if len(line) <= max_width:
        return line
    body = line.lstrip()
    indent = line[: len(line) - len(body)]
    if body.startswith(("- ", "* ")):
        head = indent + body[:2]
        body = body[2:]
        rest_indent = indent + " "
    else:
        head = indent
        rest_indent = indent
    tokens = _TOKEN_RE.findall(body)
    if not tokens:
        return line
    wrapped: list[str] = []
    current = head + tokens[0]
    for tok in tokens[1:]:
        if len(current) + 1 + len(tok) <= max_width:
            current = f"{current} {tok}"
        else:
            wrapped.append(current)
            current = rest_indent + tok
    wrapped.append(current)
    return "\n".join(wrapped)
def _deindent_orphan(line: str, lines: list[str], idx: int) -> str:
stripped = line.lstrip()
if not (line.startswith(" ") and stripped.startswith("- ")):
return line
our_indent = len(line) - len(stripped)
nearest_parent_indent: int | None = None
for j in range(idx - 1, -1, -1):
prev = lines[j]
if not prev.strip():
continue if prev.startswith(" "):
prev_stripped = prev.lstrip()
if prev_stripped.startswith(("- ", "* ")):
parent_indent = len(prev) - len(prev_stripped)
if our_indent > parent_indent and nearest_parent_indent is None:
nearest_parent_indent = parent_indent
continue is_list_parent = prev.startswith(("- ", "* "))
if is_list_parent:
base = nearest_parent_indent + 2 if nearest_parent_indent is not None else 2
return " " * base + stripped
return line[2:] if nearest_parent_indent is not None else stripped
return line[2:] if nearest_parent_indent is not None else stripped
def _needs_blank_before(stripped: str, result: list[str]) -> bool:
if not stripped.startswith("- ") or not result or not result[-1].strip():
return False
prev = result[-1].lstrip()
return not prev.startswith(("-", "#"))
def _fix_typos(text: str) -> str:
    """Replace each known typo (whole-word matches only) with its correction."""
    for wrong, right in _TYPO_MAP.items():
        pattern = rf"\b{re.escape(wrong)}\b"
        text = re.sub(pattern, right, text)
    return text
def _normalize_indented_heading(line: str) -> str:
    """Convert an indented ATX heading to a bold line; pass others through."""
    found = _INDENTED_ATX_HEADING_RE.match(line)
    if found is None:
        return line
    heading_text = found.group("title").strip()
    if not heading_text:
        return line
    return f"{found.group('indent')}**{heading_text}**"
def _process_code_fence(line: str, result: list[str], in_code_block: bool) -> tuple[bool, bool]:
stripped = line.lstrip()
if not stripped.startswith("```"):
return False, in_code_block
if not in_code_block:
in_code_block = True
if result and result[-1].strip():
result.append("")
if stripped == "```":
line = line.replace("```", "```text", 1)
else:
in_code_block = False
result.append(line)
return True, in_code_block
def postprocess(path: Path) -> None:
    """Apply markdown hygiene fixes to the changelog at *path*, in place.

    Pipeline: fix known typos, inject per-version summary sections, then
    stream line by line to normalize list markers, headings, orphan indents,
    code fences, blank-line separation, and line width.

    Fix: the original recomputed ``stripped`` twice before its only use at
    the blank-separator check; the dead assignments are removed and the
    value is computed once after all line transforms.
    """
    text = path.read_text(encoding="utf-8")
    text = _fix_typos(text)
    text = _inject_summary_sections(text)
    lines = text.split("\n")
    result: list[str] = []
    in_code_block = False
    # Normalized summary of the most recent commit-linked top-level entry;
    # used to spot squash-commit body headings that merely repeat it.
    current_entry_summary: str | None = None
    drop_next_blank = False
    for idx, line in enumerate(lines):
        handled, in_code_block = _process_code_fence(line, result, in_code_block)
        if handled:
            continue
        if in_code_block:
            # Inside a fence: copy verbatim, no markdown rewriting.
            result.append(line)
            continue
        line = _STAR_LIST_RE.sub(r"\1- ", line)
        line = _LIST_MARKER_SPACE_RE.sub(r"\1 ", line)
        if line.startswith("- ") and _COMMIT_LINK_RE.search(line):
            current_entry_summary = _plain_summary(line)
        elif line.startswith(("### ", "## ", "# ")):
            # A new heading resets the entry context.
            current_entry_summary = None
        is_isolated_body_heading = _is_isolated_body_heading(lines, idx)
        if is_isolated_body_heading and _is_duplicate_squash_heading(line, current_entry_summary):
            # Drop the duplicated heading and, if one precedes it, the blank
            # line that follows it as well.
            drop_next_blank = bool(result and not result[-1].strip())
            continue
        if drop_next_blank and not line.strip():
            drop_next_blank = False
            continue
        drop_next_blank = False
        line = _deindent_orphan(line, lines, idx)
        line = _normalize_indented_heading(line)
        if is_isolated_body_heading:
            line = _normalize_squash_heading(line, nested=current_entry_summary is not None)
        stripped = line.lstrip()
        if _needs_blank_before(stripped, result):
            result.append("")
        if len(line) > MAX_LINE_WIDTH:
            result.append(_reflow_line(line))
        else:
            result.append(line)
    text = "\n".join(result)
    # Guarantee exactly one trailing newline.
    text = text.rstrip("\n") + "\n"
    path.write_text(text, encoding="utf-8")
def main() -> None:
    """CLI entry point: validate the changelog path and post-process it."""
    parser = argparse.ArgumentParser(
        prog="postprocess-changelog",
        description="Apply markdown hygiene to a git-cliff generated CHANGELOG.md.",
    )
    parser.add_argument(
        "path",
        nargs="?",
        default="CHANGELOG.md",
        help="Path to CHANGELOG.md (default: CHANGELOG.md)",
    )
    parsed = parser.parse_args()
    target = Path(parsed.path)
    if not target.is_file():
        print(f"Error: {target} not found", file=sys.stderr)
        sys.exit(1)
    postprocess(target)


if __name__ == "__main__":
    main()