import argparse
import json
import os
import re
import sys
from dataclasses import dataclass
from pathlib import Path
from typing import List, Tuple, Optional
@dataclass
class Section:
    """A Markdown heading together with the lines that follow it.

    Line numbers are 0-based indexes into the list of lines that was parsed.
    """

    # Heading text with the leading '#' markers and surrounding whitespace removed.
    title: str
    # Heading depth: the number of '#' characters (2 for "##", 3 for "###", ...).
    level: int
    # Index of the heading line itself.
    start_line: int
    # Index of the last line belonging to this section: the line before the
    # next heading, or the last line of the input.
    end_line: int
    # Raw lines of the section, starting with the heading line.
    content: List[str]
@dataclass
class ChapterAnalysis:
    """Outcome of inspecting one chapter file for a possible split."""

    # Chapter file that was inspected.
    file_path: Path
    # Total number of lines read from the file.
    line_count: int
    # Only the level-2 ("##") sections found in the file, in file order.
    h2_sections: List[Section]
    # True when the chapter met one of the split criteria.
    should_split: bool
    # Human-readable explanation of the decision, shown to the user.
    reason: str
def parse_markdown_sections(lines: List[str]) -> List[Section]:
    """Break a Markdown document into one Section per heading of level 2-6.

    Each section runs from its heading line up to (but not including) the
    next heading of *any* level 2-6, so a "##" section ends where its first
    "###" child begins. Lines that precede the first matching heading belong
    to no section and are not returned. Level-1 ("#") headings are not
    treated as section starts.

    Args:
        lines: The document, one entry per line.

    Returns:
        Sections in document order, with 0-based ``start_line``/``end_line``
        indexes into ``lines``.
    """
    heading_re = re.compile(r'^(#{2,6})\s+(.+)$')
    found = []
    open_section = None

    for index, text in enumerate(lines):
        hit = heading_re.match(text)

        if hit is None:
            # Ordinary line: it belongs to whichever section is open, if any.
            if open_section:
                open_section.content.append(text)
            continue

        # A new heading closes the section that was being collected.
        if open_section:
            open_section.end_line = index - 1
            found.append(open_section)

        hashes, raw_title = hit.group(1), hit.group(2)
        open_section = Section(
            title=raw_title.strip(),
            level=len(hashes),
            start_line=index,
            end_line=index,
            content=[text],
        )

    # The final section extends to the end of the document.
    if open_section:
        open_section.end_line = len(lines) - 1
        found.append(open_section)

    return found
def analyze_chapter(file_path: Path, min_lines: int, min_h2_sections: int,
                    always_split_lines: int = 800,
                    large_file_lines: int = 600) -> ChapterAnalysis:
    """Decide whether a chapter file should be split into sub-pages.

    A chapter is split when it has at least one H2 section AND any of:
      * more than ``always_split_lines`` lines,
      * more than ``large_file_lines`` lines,
      * at least ``min_lines`` lines and at least ``min_h2_sections`` H2s.

    A file without any H2 heading is never split, however long it is:
    splitting works on H2 boundaries, so there would be nothing to move out.

    Args:
        file_path: Markdown chapter to inspect (read as UTF-8).
        min_lines: Line-count threshold for the combined lines/H2 rule.
        min_h2_sections: H2-count threshold for the combined lines/H2 rule.
        always_split_lines: Line count above which the "Always split" reason
            is reported.
        large_file_lines: Line count above which the "Large file" reason is
            reported.

    Returns:
        A ChapterAnalysis carrying the line count, the H2 sections, the
        decision and a human-readable reason.
    """
    with open(file_path, 'r', encoding='utf-8') as f:
        lines = f.readlines()

    line_count = len(lines)
    sections = parse_markdown_sections(lines)
    h2_sections = [s for s in sections if s.level == 2]
    h2_count = len(h2_sections)

    should_split = False
    reason = f"Keep as single file ({line_count} lines, {h2_count} H2s)"

    # Size alone is not enough: without H2 headings there is no boundary to
    # split on, so such files always stay as they are.
    if h2_count > 0:
        if line_count > always_split_lines:
            should_split = True
            reason = f"Always split (>{always_split_lines} lines)"
        elif line_count > large_file_lines:
            should_split = True
            reason = f"Large file (>{large_file_lines} lines)"
        elif line_count >= min_lines and h2_count >= min_h2_sections:
            should_split = True
            reason = f">={min_lines} lines and >={min_h2_sections} H2 sections"

    return ChapterAnalysis(
        file_path=file_path,
        line_count=line_count,
        h2_sections=h2_sections,
        should_split=should_split,
        reason=reason,
    )
def generate_filename(title: str) -> str:
    """Turn a section title into a lowercase, hyphenated ``.md`` filename.

    Whitespace runs become a single hyphen, every character other than
    ``a-z``, ``0-9`` and ``-`` is dropped, repeated hyphens are collapsed and
    leading/trailing hyphens are trimmed. If nothing usable is left (for
    example a title made only of punctuation or non-ASCII characters) the
    name falls back to ``section.md`` instead of the hidden file ``.md``.

    Args:
        title: Heading text of the section.

    Returns:
        The filename, always ending in ``.md`` and never empty before the
        extension.
    """
    slug = title.lower()
    slug = re.sub(r'\s+', '-', slug)
    slug = re.sub(r'[^a-z0-9-]', '', slug)
    # Dropping punctuation between words ("foo & bar") leaves "foo--bar";
    # collapse those runs and trim hyphens left at either end.
    slug = re.sub(r'-{2,}', '-', slug).strip('-')
    return f"{slug or 'section'}.md"
def _unique_filename(candidate: str, taken: set) -> str:
    """Return ``candidate``, or a numbered variant of it, not yet in ``taken``.

    The chosen name is added to ``taken`` before returning.
    """
    stem = candidate[:-3] if candidate.endswith(".md") else candidate
    if not stem:
        # Guards against an empty slug producing the hidden file ".md".
        stem = "section"
    unique = f"{stem}.md"
    counter = 2
    while unique in taken:
        unique = f"{stem}-{counter}.md"
        counter += 1
    taken.add(unique)
    return unique


def split_chapter(analysis: ChapterAnalysis, preserve_index_sections: int, book_dir: Path) -> dict:
    """Write a chapter out as ``<chapter>/index.md`` plus one file per extra H2.

    The text before the first H2, followed by the first
    ``preserve_index_sections`` H2 sections, goes to ``index.md``. Every
    remaining H2 section gets its own file, and ``index.md`` ends with a
    list of links to those files. The original chapter file is left in place.

    Each H2 section is taken from the file as the span from its heading up to
    the next H2 heading (or end of file), so nested H3-H6 subsections stay
    with their parent. ``Section.content`` is not used for this because it
    stops at the first nested heading.

    Args:
        analysis: Result of analysing the chapter; ``h2_sections[*].start_line``
            is assumed to still index into the file as it is now on disk.
        preserve_index_sections: Number of leading H2 sections kept in
            ``index.md``. Negative values are treated as 0.
        book_dir: mdBook root; output goes to ``book_dir/src/<chapter stem>/``.

    Returns:
        A dict with ``chapter_name``, ``subsection_count``,
        ``subsection_files`` (list of ``{'filename', 'title'}``) and
        ``index_sections`` (titles kept in ``index.md``).
    """
    chapter_name = analysis.file_path.stem
    chapter_dir = book_dir / "src" / chapter_name

    with open(analysis.file_path, 'r', encoding='utf-8') as f:
        lines = f.readlines()

    chapter_dir.mkdir(parents=True, exist_ok=True)

    h2_sections = analysis.h2_sections
    # A negative count would make the slices below wrap around from the end.
    keep = max(preserve_index_sections, 0)

    # Full extent of every H2 section: from its heading to the next H2 heading.
    starts = [section.start_line for section in h2_sections]
    ends = starts[1:] + [len(lines)]
    bodies = [lines[start:end] for start, end in zip(starts, ends)]

    # Everything before the first H2 is the introduction; with no H2 at all
    # the whole file is, so nothing is dropped.
    intro_end = starts[0] if starts else len(lines)

    index_sections = h2_sections[:keep]
    subsection_sections = h2_sections[keep:]

    index_content = list(lines[:intro_end])
    for body in bodies[:keep]:
        index_content.extend(body)

    # Pick each filename once so the links and the files written always agree,
    # and so duplicate titles (or a section called "Index") cannot overwrite
    # another file in the chapter directory.
    taken = {"index.md"}
    subsection_files = [
        {
            'filename': _unique_filename(generate_filename(section.title), taken),
            'title': section.title,
        }
        for section in subsection_sections
    ]

    if subsection_files:
        index_content.append("\n## Additional Topics\n\n")
        index_content.append("See also:\n")
        for entry in subsection_files:
            index_content.append(f"- [{entry['title']}]({entry['filename']})\n")

    with open(chapter_dir / "index.md", 'w', encoding='utf-8') as f:
        f.writelines(index_content)

    for entry, body in zip(subsection_files, bodies[keep:]):
        with open(chapter_dir / entry['filename'], 'w', encoding='utf-8') as f:
            f.writelines(body)

    return {
        'chapter_name': chapter_name,
        'subsection_count': len(subsection_files),
        'subsection_files': subsection_files,
        'index_sections': [s.title for s in index_sections],
    }
def update_summary_md(book_dir: Path, split_results: List[dict], dry_run: bool):
    """Point SUMMARY.md at the split chapters and list their sub-pages.

    Every line containing a link ``[Title](<chapter>.md)`` for a chapter in
    ``split_results`` is replaced by a ``- [Title](<chapter>/index.md)`` entry
    followed by one nested entry per generated sub-page. All other lines are
    kept unchanged.

    Args:
        book_dir: mdBook root; the file edited is ``book_dir/src/SUMMARY.md``.
        split_results: Dicts with ``chapter_name`` and ``subsection_files``
            (each a dict with ``filename`` and ``title``).
        dry_run: When true the file is read and processed but not written.
    """
    summary_path = book_dir / "src" / "SUMMARY.md"
    with open(summary_path, 'r', encoding='utf-8') as f:
        lines = f.readlines()

    # Build each chapter's link pattern once rather than once per line.
    link_patterns = []
    for result in split_results:
        chapter_name = result['chapter_name']
        pattern = re.compile(rf'\[([^\]]+)\]\({re.escape(chapter_name)}\.md\)')
        link_patterns.append((result, pattern))

    new_lines = []
    for line in lines:
        for result, pattern in link_patterns:
            match = pattern.search(line)
            if not match:
                continue

            chapter_name = result['chapter_name']
            title = match.group(1)
            # Reuse the line's own leading whitespace so tab-indented
            # summaries keep their tabs.
            indent_str = line[:len(line) - len(line.lstrip())]
            # A nested list item must start at the parent's content column,
            # i.e. two columns past the "- " marker; a single extra space
            # would be parsed as a sibling item instead.
            child_indent = indent_str + '  '

            new_lines.append(f"{indent_str}- [{title}]({chapter_name}/index.md)\n")
            for subsection in result['subsection_files']:
                new_lines.append(
                    f"{child_indent}- [{subsection['title']}]({chapter_name}/{subsection['filename']})\n"
                )
            break
        else:
            # No split chapter is referenced on this line.
            new_lines.append(line)

    if not dry_run:
        with open(summary_path, 'w', encoding='utf-8') as f:
            f.writelines(new_lines)
def main():
    """Command-line entry point: analyse chapters, split the large ones.

    Scans ``<book-dir>/src/*.md`` (except SUMMARY.md and chapters that already
    have a same-named directory), reports which chapters qualify for a split,
    and - unless ``--dry-run`` is given - writes the split files, updates
    SUMMARY.md and deletes the original chapter files.

    Exits with status 1 when the ``src`` directory or SUMMARY.md is missing,
    and with an argparse usage error when ``--preserve-index-sections`` is
    negative.
    """
    parser = argparse.ArgumentParser(description='Organize large mdBook chapters into subsections')
    parser.add_argument('--book-dir', default='book', help='Path to mdBook root directory')
    parser.add_argument('--min-h2-sections', type=int, default=6, help='Minimum H2 sections to trigger split')
    parser.add_argument('--min-lines', type=int, default=400, help='Minimum lines to trigger split')
    parser.add_argument('--preserve-index-sections', type=int, default=2, help='Number of H2 sections to keep in index.md')
    parser.add_argument('--dry-run', action='store_true', help='Preview changes without applying them')
    args = parser.parse_args()

    # A negative count would make the section slices below wrap around.
    if args.preserve_index_sections < 0:
        parser.error("--preserve-index-sections must be >= 0")

    book_dir = Path(args.book_dir)
    src_dir = book_dir / "src"
    if not src_dir.exists():
        print(f"Error: Book directory '{src_dir}' not found", file=sys.stderr)
        sys.exit(1)
    if not (src_dir / "SUMMARY.md").exists():
        print(f"Error: SUMMARY.md not found in '{src_dir}'", file=sys.stderr)
        sys.exit(1)

    print(f"Analyzing chapters in {src_dir}...\n")
    analyses = []
    for md_file in sorted(src_dir.glob("*.md")):
        if md_file.name == "SUMMARY.md":
            continue
        # A directory named after the chapter means it was split earlier.
        chapter_dir = src_dir / md_file.stem
        if chapter_dir.exists() and chapter_dir.is_dir():
            print(f"✓ {md_file.name}: Already organized (skipping)")
            continue
        analysis = analyze_chapter(md_file, args.min_lines, args.min_h2_sections)
        analyses.append(analysis)
        status = "→ Would split" if analysis.should_split else "→ Keep as-is"
        print(f"✓ {md_file.name} ({analysis.line_count} lines, {len(analysis.h2_sections)} H2s) {status}")
        if analysis.should_split:
            print(f" Reason: {analysis.reason}")

    chapters_to_split = [a for a in analyses if a.should_split]
    if not chapters_to_split:
        print("\nNo chapters need to be split.")
        return

    print(f"\n{'Preview of' if args.dry_run else 'Applying'} changes:\n")
    split_results = []
    for analysis in chapters_to_split:
        if args.dry_run:
            print(f"Would split: {analysis.file_path.name}")
            print(f" → {analysis.file_path.stem}/index.md (Introduction + {args.preserve_index_sections} sections)")
            for section in analysis.h2_sections[args.preserve_index_sections:]:
                filename = generate_filename(section.title)
                print(f" → {analysis.file_path.stem}/{filename} ({section.title})")
        else:
            result = split_chapter(analysis, args.preserve_index_sections, book_dir)
            split_results.append(result)
            print(f"✓ Split {analysis.file_path.name} → {result['chapter_name']}/ ({result['subsection_count']} subsections)")
            print(f" - index.md ({', '.join(result['index_sections'])})")
            for subsection in result['subsection_files']:
                print(f" - {subsection['filename']}")

    # split_results is only populated outside dry-run mode.
    if split_results:
        update_summary_md(book_dir, split_results, args.dry_run)
        # Report success only once the file has actually been rewritten.
        print("\n✓ Updated SUMMARY.md with nested structure")

    # Originals are removed last, after their replacements and the summary
    # update have been written.
    if not args.dry_run:
        for analysis in chapters_to_split:
            analysis.file_path.unlink()
            print(f"✓ Deleted {analysis.file_path.name}")

    tense = "would be " if args.dry_run else ""
    # A chapter with fewer H2s than --preserve-index-sections contributes
    # zero subsection files, not a negative number.
    total_subsections = sum(
        max(0, len(a.h2_sections) - args.preserve_index_sections)
        for a in chapters_to_split
    )
    print("\nSummary:")
    print(f" - {len(chapters_to_split)} chapters {tense}split")
    print(f" - {total_subsections} subsection files {tense}created")
    print(f" - SUMMARY.md {tense}updated")
    if args.dry_run:
        print("\nRun without --dry-run to apply changes.")
# Run the CLI only when executed as a script, not when imported.
if __name__ == "__main__":
    main()