import os
import re
import sys
from datetime import datetime, timedelta
from pathlib import Path
from typing import List, Tuple, Dict, Set
# --- Configuration -----------------------------------------------------------

# Root of the documentation tree: <repo>/docs relative to this script.
DOCS_DIR = Path(__file__).parent.parent / "docs"
# TODO documents older than this (by mtime or **Created** date) are flagged stale.
MAX_TODO_AGE_DAYS = 90
# Header fields every document must declare as "**Field**: value".
REQUIRED_HEADER_FIELDS = ["Status", "Type", "Created", "Updated"]
# Closed vocabularies for the Status / Type header fields.
VALID_STATUSES = ["Draft", "Active", "Archived", "Deprecated", "TODO"]
VALID_TYPES = ["Guide", "Reference", "Decision", "Specification", "Index", "TODO"]
# Glob used to discover documents.
MARKDOWN_PATTERN = "*.md"
# Directory names excluded from validation wherever they appear in a path.
EXCLUDE_DIRS = {"archive", "templates"}


class DocumentationValidator:
    """Validate a Markdown documentation tree.

    Checks performed by :meth:`validate`:
      * required header fields and valid Status/Type vocabularies,
      * stale TODO documents (by mtime and by ``**Created**`` date),
      * broken relative ``*.md`` links,
      * documents nothing links to (orphans; README.md files are exempt),
      * file-naming conventions (kebab-case or ALL_CAPS stems).

    Problems are accumulated in ``self.errors`` (fail the run) and
    ``self.warnings`` (reported only); results are printed to stdout.
    """

    # Pre-compiled patterns, hoisted out of the per-file loops.
    # Header = everything between the H1 title line and the first "---" line.
    _HEADER_RE = re.compile(r'^#[^#].*?\n(.*?)^---', re.MULTILINE | re.DOTALL)
    _STATUS_RE = re.compile(r'\*\*Status\*\*:\s*(\w+)')
    _TYPE_RE = re.compile(r'\*\*Type\*\*:\s*(\w+)')
    _CREATED_RE = re.compile(r'\*\*Created\*\*:\s*(\d{4}-\d{2}-\d{2})')
    _LINK_RE = re.compile(r'\[([^\]]+)\]\(([^)]+)\)')
    _KEBAB_NAME_RE = re.compile(r'^[a-z0-9-]+$')
    _UPPER_NAME_RE = re.compile(r'^[A-Z_]+$')

    def __init__(self, docs_dir: Path):
        # Resolve up front: link targets are resolved when checked, so the
        # collected file paths must be rooted at the same resolved base or the
        # orphan-check set difference never matches (a relative or symlinked
        # docs_dir previously made every document look orphaned).
        self.docs_dir = docs_dir.resolve()
        self.errors: List[str] = []
        self.warnings: List[str] = []
        self.all_files: Set[Path] = set()
        self.linked_files: Set[Path] = set()

    def validate(self) -> bool:
        """Run every check, print a report, and return True iff no errors."""
        print(f"Validating documentation in: {self.docs_dir}")
        print("-" * 60)
        self._collect_markdown_files()
        self._check_document_headers()
        self._check_todo_staleness()
        self._check_internal_links()
        self._check_orphaned_documents()
        self._check_file_naming()
        self._report_results()
        return len(self.errors) == 0

    def _collect_markdown_files(self):
        """Gather every Markdown file under docs_dir, skipping EXCLUDE_DIRS."""
        for md_file in self.docs_dir.rglob(MARKDOWN_PATTERN):
            # Skip anything whose path contains an excluded directory name.
            if any(exclude in md_file.parts for exclude in EXCLUDE_DIRS):
                continue
            self.all_files.add(md_file)

    def _check_document_headers(self):
        """Verify each document's header block: required fields and vocabularies."""
        print("Checking document headers...")
        for file_path in self.all_files:
            rel = file_path.relative_to(self.docs_dir)
            try:
                content = file_path.read_text(encoding='utf-8')
            except OSError as e:
                # Unreadable file is an error; keep checking the rest.
                self.errors.append(f"{rel}: Error reading file: {e}")
                continue
            header_match = self._HEADER_RE.search(content)
            if not header_match:
                self.warnings.append(f"{rel}: Missing document header")
                continue
            header = header_match.group(1)
            for field in REQUIRED_HEADER_FIELDS:
                if f"**{field}**:" not in header:
                    self.warnings.append(f"{rel}: Missing header field '{field}'")
            # A present-but-invalid Status/Type is an error, not a warning.
            status_match = self._STATUS_RE.search(header)
            if status_match and status_match.group(1) not in VALID_STATUSES:
                self.errors.append(f"{rel}: Invalid status '{status_match.group(1)}'")
            type_match = self._TYPE_RE.search(header)
            if type_match and type_match.group(1) not in VALID_TYPES:
                self.errors.append(f"{rel}: Invalid type '{type_match.group(1)}'")

    def _check_todo_staleness(self):
        """Flag TODO documents older than MAX_TODO_AGE_DAYS.

        Age is judged both by filesystem mtime and, when the header carries a
        ``**Created**: YYYY-MM-DD`` field, by that creation date.
        """
        print("Checking TODO staleness...")
        todo_dir = self.docs_dir / "todo"
        if not todo_dir.exists():
            return
        now = datetime.now()
        cutoff_date = now - timedelta(days=MAX_TODO_AGE_DAYS)
        for todo_file in todo_dir.rglob(MARKDOWN_PATTERN):
            if "archive" in todo_file.parts:
                continue
            mtime = datetime.fromtimestamp(todo_file.stat().st_mtime)
            if mtime < cutoff_date:
                age_days = (now - mtime).days
                self.warnings.append(f"Stale TODO: {todo_file.relative_to(self.docs_dir)} ({age_days} days old)")
            try:
                content = todo_file.read_text(encoding='utf-8')
                created_match = self._CREATED_RE.search(content)
                if created_match:
                    created_date = datetime.strptime(created_match.group(1), '%Y-%m-%d')
                    if created_date < cutoff_date:
                        age_days = (now - created_date).days
                        self.warnings.append(f"Stale TODO by creation date: {todo_file.relative_to(self.docs_dir)} ({age_days} days old)")
            except (OSError, ValueError):
                # Best effort only: OSError = unreadable file, ValueError =
                # malformed date. Neither is fatal for a staleness heuristic.
                pass

    def _check_internal_links(self):
        """Verify relative Markdown links resolve to existing files.

        External (http/https/mailto) and pure-anchor links are skipped. A
        '#fragment' suffix on a relative link is ignored, so 'guide.md#intro'
        is validated against guide.md.
        """
        print("Checking internal links...")
        for file_path in self.all_files:
            rel = file_path.relative_to(self.docs_dir)
            try:
                content = file_path.read_text(encoding='utf-8')
            except OSError as e:
                self.errors.append(f"{rel}: Error checking links: {e}")
                continue
            for _link_text, link_target in self._LINK_RE.findall(content):
                if link_target.startswith(('http://', 'https://', 'mailto:', '#')):
                    continue
                # Drop any anchor so "foo.md#section" is still checked.
                path_part = link_target.split('#', 1)[0]
                if not path_part.endswith('.md'):
                    continue
                target_path = (file_path.parent / path_part).resolve()
                # Record in-tree targets even if broken, so the orphan check
                # does not double-report a document behind a broken link.
                if target_path.is_relative_to(self.docs_dir):
                    self.linked_files.add(target_path)
                if not target_path.exists():
                    self.errors.append(f"{rel}: Broken link to '{link_target}'")

    def _check_orphaned_documents(self):
        """Warn about documents that nothing links to (README.md is exempt)."""
        print("Checking for orphaned documents...")
        # README files are entry points; treat them as implicitly linked.
        for readme in self.docs_dir.rglob("README.md"):
            self.linked_files.add(readme)
        for orphan in self.all_files - self.linked_files:
            if "archive" not in orphan.parts:
                self.warnings.append(f"Orphaned document: {orphan.relative_to(self.docs_dir)}")

    def _check_file_naming(self):
        """Warn about stems outside the kebab-case / ALL_CAPS conventions."""
        print("Checking file naming conventions...")
        for file_path in self.all_files:
            filename = file_path.stem
            # Accepted: kebab-case ("user-guide") or ALL_CAPS ("README", "API_SPEC").
            if not self._KEBAB_NAME_RE.match(filename):
                if not self._UPPER_NAME_RE.match(filename):
                    self.warnings.append(f"Non-standard filename: {file_path.name}")
            # Separately flag punctuation, except the underscore in ALL_CAPS names.
            if any(char in filename for char in ['_', '.', ' ', '@', '!', '#', '$', '%', '^', '&', '*']):
                if not (self._UPPER_NAME_RE.match(filename) and '_' in filename):
                    self.warnings.append(f"Special characters in filename: {file_path.name}")

    def _report_results(self):
        """Print errors, warnings, and a summary to stdout."""
        print("\n" + "=" * 60)
        print("VALIDATION RESULTS")
        print("=" * 60)
        print(f"\nTotal files checked: {len(self.all_files)}")
        print(f"Errors found: {len(self.errors)}")
        print(f"Warnings found: {len(self.warnings)}")
        if self.errors:
            print("\nERRORS:")
            for error in sorted(self.errors):
                print(f" ❌ {error}")
        if self.warnings:
            print("\nWARNINGS:")
            for warning in sorted(self.warnings):
                print(f" ⚠️ {warning}")
        if not self.errors and not self.warnings:
            print("\n✅ All documentation validation checks passed!")
        print("\nSUMMARY:")
        print(f" - Document coverage: {len(self.linked_files)}/{len(self.all_files)} files linked")
        print(f" - TODO items checked: {len([f for f in self.all_files if 'todo' in f.parts])}")
        print(f" - Validation status: {'PASSED' if not self.errors else 'FAILED'}")
def main():
    """Entry point: validate DOCS_DIR and exit 0 on success, 1 on failure."""
    exit_code = 0 if DocumentationValidator(DOCS_DIR).validate() else 1
    sys.exit(exit_code)


if __name__ == "__main__":
    main()