from __future__ import annotations
import argparse
import re
import sys
from pathlib import Path
MAX_LINE_WIDTH = 160
_TOKEN_RE = re.compile(
r"""
\[[^\]]*\]\([^)]*\) # markdown link: [text](url)
| `[^`]+` | \S+ """,
re.VERBOSE,
)
_VERSION_RE = re.compile(
r"^## \[(?:v?\d+\.\d+\.\d+(?:-[0-9A-Za-z.-]+)?(?:\+[0-9A-Za-z.-]+)?|Unreleased)\]"
r"(?:\s+-\s+\d{4}-\d{2}-\d{2})?\s*$"
)
_PR_LINK_RE = re.compile(r"\[#(\d+)\]\(https://github\.com/[^)]+/pull/\d+\)")
_COMMIT_LINK_RE = re.compile(r"\s*\[`[a-f0-9]{7}`\]\(https://github\.com/[^)]+/commit/[a-f0-9]+\)")
_BREAKING_MARKER_RE = re.compile(r"^\s*(?:[-*]\s+)?\[?\*\*breaking\*\*\]?\s*", re.IGNORECASE)
_STAR_LIST_RE = re.compile(r"^(\s*)\* ")
_LIST_MARKER_SPACE_RE = re.compile(r"^(\s*-)\s{2,}")
_INDENTED_ATX_HEADING_RE = re.compile(r"^(?P<indent>\s+)#{1,6}\s+(?P<title>.*?)(?:\s+#+\s*)?$")
_SQUASH_HEADING_RE = re.compile(r"^(?P<indent>\s*)-\s+(?P<prefix>[A-Za-z]+(?:\([^)]+\))?!?):\s+(?P<title>.+?)\s*$")
_CHANGELOG_SECTION_HEADINGS = {
"Added",
"Changed",
"Deprecated",
"Dependencies",
"Documentation",
"Fixed",
"Maintenance",
"Merged Pull Requests",
"Performance",
"Removed",
"Security",
"⚠️ Breaking Changes",
}
_ENTRY_HEADING_RE = re.compile(r"^(?P<level>#{2,6})\s+(?P<title>.*?)(?:\s+#+\s*)?$")
_ESCAPED_EMAIL_RE = re.compile(r"<(?P<email>[A-Z0-9._%+-]+@[A-Z0-9.-]+\.[A-Z]{2,})>", re.IGNORECASE)
_SQUASH_HEADING_LABELS: dict[str, str] = {
"feat": "Added",
"fix": "Fixed",
"perf": "Performance",
"refactor": "Changed",
"test": "Changed",
"style": "Changed",
"build": "Maintenance",
"chore": "Maintenance",
"ci": "Maintenance",
"doc": "Documentation",
"docs": "Documentation",
"added": "Added",
"fixed": "Fixed",
"changed": "Changed",
"performance": "Performance",
"documentation": "Documentation",
"maintenance": "Maintenance",
"deprecated": "Deprecated",
"removed": "Removed",
}
def _plain_summary(text: str) -> str:
text = _BREAKING_MARKER_RE.sub("", text)
text = _COMMIT_LINK_RE.sub("", text)
text = _PR_LINK_RE.sub("", text)
text = re.sub(r"^\s*[-*]\s+", "", text)
text = re.sub(r"^[A-Za-z]+(?:\([^)]+\))?!?:\s+", "", text)
return re.sub(r"\s+", " ", text).strip().casefold()
def _squash_heading_parts(line: str) -> tuple[str, str, str] | None:
if _COMMIT_LINK_RE.search(line):
return None
match = _SQUASH_HEADING_RE.match(line)
if match is None:
return None
raw_prefix = match.group("prefix")
kind = re.sub(r"\([^)]+\)", "", raw_prefix).rstrip("!").casefold()
label = _SQUASH_HEADING_LABELS.get(kind)
if label is None:
return None
title = match.group("title").strip()
if not title:
return None
return match.group("indent"), label, title[0].upper() + title[1:]
def _normalize_squash_heading(line: str, *, nested: bool = False) -> str:
parts = _squash_heading_parts(line)
if parts is None:
return line
indent, label, title = parts
if nested and not indent:
indent = " "
return f"{indent}**{label}: {title}**"
def _is_duplicate_squash_heading(line: str, parent_summary: str | None) -> bool:
parts = _squash_heading_parts(line)
if parts is None or parent_summary is None:
return False
_, _, title = parts
return _plain_summary(title) == parent_summary
def _is_isolated_body_heading(lines: list[str], idx: int) -> bool:
prev_is_blank = idx > 0 and not lines[idx - 1].strip()
next_is_blank = idx + 1 < len(lines) and not lines[idx + 1].strip()
return prev_is_blank and next_is_blank
def _is_squash_heading_candidate(lines: list[str], idx: int) -> bool:
return _squash_heading_parts(lines[idx]) is not None and _is_isolated_body_heading(lines, idx)
def _max_pr_number(entry: str) -> int:
numbers = [int(m) for m in _PR_LINK_RE.findall(entry)]
return max(numbers) if numbers else 0
def _compact_entry(line: str, *, strip_breaking: bool = False) -> str:
result = _COMMIT_LINK_RE.sub("", line).rstrip()
if strip_breaking:
bullet = result[:2] if result.startswith(("- ", "* ")) else ""
body = result[2:] if bullet else result
result = bullet + _BREAKING_MARKER_RE.sub("", body, count=1)
return result
def _append_unique(entries: list[str], entry: str) -> None:
if entry not in entries:
entries.append(entry)
def _extract_section_summaries(
section: list[str],
) -> tuple[list[str], list[str]]:
pr_entries: list[str] = []
breaking_entries: list[str] = []
for sline in section:
if not sline.startswith(("- ", "* ")):
continue
is_breaking = bool(_BREAKING_MARKER_RE.search(sline))
has_pr = bool(_PR_LINK_RE.search(sline))
if is_breaking:
_append_unique(breaking_entries, _compact_entry(sline, strip_breaking=True))
if has_pr:
_append_unique(pr_entries, _compact_entry(sline, strip_breaking=True))
return pr_entries, breaking_entries
def _inject_summary_sections(text: str) -> str:
lines = text.split("\n")
boundaries: list[int] = []
for i, line in enumerate(lines):
if _VERSION_RE.match(line):
boundaries.append(i)
if not boundaries:
return text
for sec_idx in reversed(range(len(boundaries))):
start = boundaries[sec_idx]
end = boundaries[sec_idx + 1] if sec_idx + 1 < len(boundaries) else len(lines)
section = lines[start:end]
if any("### Merged Pull Requests" in s or "### ⚠️ Breaking Changes" in s for s in section):
continue
pr_entries, breaking_entries = _extract_section_summaries(section)
if not pr_entries and not breaking_entries:
continue
pr_entries.sort(key=_max_pr_number, reverse=True)
insert_at = start + 1
while insert_at < end and lines[insert_at].strip() == "":
insert_at += 1
block: list[str] = []
if breaking_entries:
block.append("### ⚠️ Breaking Changes")
block.append("")
block.extend(breaking_entries)
block.append("")
if pr_entries:
block.append("### Merged Pull Requests")
block.append("")
block.extend(pr_entries)
block.append("")
lines[insert_at:insert_at] = block
return "\n".join(lines)
def _reflow_line(line: str, max_width: int = MAX_LINE_WIDTH) -> str:
if len(line) <= max_width:
return line
stripped = line.lstrip()
indent = line[: len(line) - len(stripped)]
if stripped.startswith(("- ", "* ")):
first_prefix = indent + stripped[:2]
content = stripped[2:]
cont_indent = indent + " "
else:
first_prefix = indent
content = stripped
cont_indent = indent
tokens = _TOKEN_RE.findall(content)
if not tokens:
return line
lines: list[str] = []
current = first_prefix + tokens[0]
for token in tokens[1:]:
candidate = current + " " + token
if len(candidate) <= max_width:
current = candidate
else:
lines.append(current)
current = cont_indent + token
lines.append(current)
return "\n".join(lines)
def _deindent_orphan(line: str, lines: list[str], idx: int) -> str:
stripped = line.lstrip()
if not (line.startswith(" ") and stripped.startswith("- ")):
return line
our_indent = len(line) - len(stripped)
nearest_parent_indent: int | None = None
for j in range(idx - 1, -1, -1):
prev = lines[j]
if not prev.strip():
continue if prev.startswith(" "):
prev_stripped = prev.lstrip()
if prev_stripped.startswith(("- ", "* ")):
if _is_squash_heading_candidate(lines, j):
continue
parent_indent = len(prev) - len(prev_stripped)
if our_indent > parent_indent and nearest_parent_indent is None:
nearest_parent_indent = parent_indent
continue is_list_parent = prev.startswith(("- ", "* "))
if is_list_parent:
base = nearest_parent_indent + 2 if nearest_parent_indent is not None else 2
return " " * base + stripped
return line[2:] if nearest_parent_indent is not None else stripped
return line[2:] if nearest_parent_indent is not None else stripped
def _normalize_list_continuation_indent(line: str, lines: list[str], idx: int) -> str:
stripped = line.lstrip()
if not line.startswith(" ") or not stripped or stripped.startswith(("- ", "* ")):
return line
our_indent = len(line) - len(stripped)
for j in range(idx - 1, -1, -1):
prev = lines[j]
if not prev.strip():
continue
prev_stripped = prev.lstrip()
if prev_stripped.startswith(("- ", "* ")):
parent_indent = len(prev) - len(prev_stripped)
expected_indent = parent_indent + 2
if our_indent > expected_indent:
return " " * expected_indent + stripped
return line
if not prev.startswith(" "):
return line
return line
def _list_item_indent(line: str) -> int | None:
stripped = line.lstrip()
if not stripped.startswith(("- ", "* ")):
return None
return len(line) - len(stripped)
def _has_previous_peer_list_item(lines: list[str], idx: int, peer_indent: int) -> bool:
for j in range(idx - 1, -1, -1):
prev = lines[j]
if not prev.strip():
continue
prev_indent = _list_item_indent(prev)
if prev_indent == peer_indent:
return True
if prev_indent is not None or prev.startswith(" "):
continue
return False
return False
def _next_list_item_indent(lines: list[str], idx: int) -> int | None:
for next_line in lines[idx + 1 :]:
if not next_line.strip():
continue
return _list_item_indent(next_line)
return None
def _is_blank_between_peer_list_items(lines: list[str], idx: int) -> bool:
if lines[idx].strip():
return False
next_indent = _next_list_item_indent(lines, idx)
if next_indent is None:
return False
return _has_previous_peer_list_item(lines, idx, next_indent)
def _normalize_email_autolinks(line: str) -> str:
return _ESCAPED_EMAIL_RE.sub(r"<\g<email>>", line)
def _needs_blank_before(line: str, lines: list[str], idx: int, result: list[str]) -> bool:
stripped = line.lstrip()
if not stripped.startswith("- ") or not result or not result[-1].strip():
return False
prev = result[-1].lstrip()
if prev.startswith("#"):
return False
if prev.startswith("- "):
return False
current_indent = len(line) - len(stripped)
return not _has_previous_peer_list_item(result, len(result), current_indent)
def _normalize_indented_heading(line: str) -> str:
match = _INDENTED_ATX_HEADING_RE.match(line)
if match is None:
return line
title = match.group("title").strip()
if not title:
return line
return f"{match.group('indent')}**{title}**"
def _normalize_entry_heading(line: str) -> str:
match = _ENTRY_HEADING_RE.match(line)
if match is None:
return line
level = match.group("level")
title = match.group("title").strip()
if not title or level.startswith("####"):
return line
if level == "##" and (_VERSION_RE.match(line) or title == "Archives"):
return line
if level == "###" and title in _CHANGELOG_SECTION_HEADINGS:
return line
return f"#### {title}"
def normalize_entry_headings_text(text: str) -> str:
result: list[str] = []
in_code_block = False
for line in text.split("\n"):
if line.lstrip().startswith("```"):
result.append(line)
in_code_block = not in_code_block
continue
result.append(line if in_code_block else _normalize_entry_heading(line))
return "\n".join(result).rstrip("\n") + "\n"
def _process_code_fence(line: str, result: list[str], in_code_block: bool, next_line: str | None) -> tuple[bool, bool]:
stripped = line.lstrip()
if not stripped.startswith("```"):
return False, in_code_block
if not in_code_block:
in_code_block = True
if result and result[-1].strip():
result.append("")
if stripped == "```":
line = line.replace("```", "```text", 1)
else:
in_code_block = False
result.append(line)
if not in_code_block and next_line is not None and next_line.strip():
result.append("")
return True, in_code_block
def _update_entry_summary(line: str, current_entry_summary: str | None) -> str | None:
if line.startswith("- ") and _COMMIT_LINK_RE.search(line):
return _plain_summary(line)
if line.startswith(("### ", "## ", "# ")):
return None
return current_entry_summary
def _should_skip_duplicate_heading(
line: str,
result: list[str],
current_entry_summary: str | None,
is_isolated_body_heading: bool,
) -> tuple[bool, bool]:
if is_isolated_body_heading and _is_duplicate_squash_heading(line, current_entry_summary):
return True, bool(result and not result[-1].strip())
return False, False
def _normalize_body_line(line: str, lines: list[str], idx: int, result: list[str], current_entry_summary: str | None) -> str:
is_isolated_body_heading = _is_isolated_body_heading(lines, idx)
line = _deindent_orphan(line, lines, idx)
line = _normalize_list_continuation_indent(line, lines, idx)
line = _normalize_indented_heading(line)
line = _normalize_entry_heading(line)
if is_isolated_body_heading:
line = _normalize_squash_heading(line, nested=current_entry_summary is not None)
if _needs_blank_before(line, lines, idx, result):
result.append("")
return _reflow_line(line) if len(line) > MAX_LINE_WIDTH else line
def postprocess_text(text: str) -> str:
text = _inject_summary_sections(text)
lines = text.split("\n")
result: list[str] = []
in_code_block = False
current_entry_summary: str | None = None
drop_next_blank = False
for idx, raw_line in enumerate(lines):
line = raw_line
next_line = lines[idx + 1] if idx + 1 < len(lines) else None
handled, in_code_block = _process_code_fence(line, result, in_code_block, next_line)
if handled:
continue
if in_code_block:
result.append(line)
continue
if _is_blank_between_peer_list_items(lines, idx):
continue
line = _STAR_LIST_RE.sub(r"\1- ", line)
line = _LIST_MARKER_SPACE_RE.sub(r"\1 ", line)
line = _normalize_email_autolinks(line)
current_entry_summary = _update_entry_summary(line, current_entry_summary)
is_isolated_body_heading = _is_isolated_body_heading(lines, idx)
should_skip, next_drop_blank = _should_skip_duplicate_heading(
line,
result,
current_entry_summary,
is_isolated_body_heading,
)
if should_skip:
drop_next_blank = next_drop_blank
continue
if drop_next_blank and not line.strip():
drop_next_blank = False
continue
drop_next_blank = False
result.append(_normalize_body_line(line, lines, idx, result, current_entry_summary))
text = "\n".join(result)
return text.rstrip("\n") + "\n"
def postprocess(path: Path) -> None:
text = path.read_text(encoding="utf-8")
text = postprocess_text(text)
path.write_text(text, encoding="utf-8")
def main() -> None:
parser = argparse.ArgumentParser(
prog="postprocess-changelog",
description="Apply markdown hygiene to a git-cliff generated CHANGELOG.md.",
)
parser.add_argument(
"path",
nargs="?",
default="CHANGELOG.md",
help="Path to CHANGELOG.md (default: CHANGELOG.md)",
)
args = parser.parse_args()
changelog = Path(args.path)
if not changelog.is_file():
print(f"Error: {changelog} not found", file=sys.stderr)
sys.exit(1)
postprocess(changelog)
if __name__ == "__main__":
main()