import argparse
import re
import sys
from pathlib import Path
from typing import List, Tuple
def extract_document_title(lines: List[str]) -> str:
    """Return the text of the first level-one Markdown heading.

    Each line is stripped of surrounding whitespace before it is examined.
    A line qualifies when it begins with ``'# '``; because the second
    character must be a space, deeper headings (``##``, ``###``, ...) can
    never qualify.

    Args:
        lines: The document, one entry per line.

    Returns:
        The heading text with the ``'# '`` prefix and surrounding whitespace
        removed, or an empty string when no such heading exists.
    """
    stripped_lines = (raw.strip() for raw in lines)
    top_level = (text[2:].strip() for text in stripped_lines if text.startswith('# '))
    return next(top_level, "")
def remove_copyright_boilerplate(lines: List[str]) -> Tuple[List[str], int]:
    """Drop the leading front matter that ends at the support-site line.

    Only the first 80 lines are scanned. Once a line matching the copyright
    notice has been seen, the first line (the same one or a later one) that
    mentions the support site marks the end of the boilerplate. Everything
    from the start of the document through that line is removed.

    Args:
        lines: The document, one entry per line.

    Returns:
        A ``(remaining_lines, removed_count)`` tuple. When no boilerplate is
        detected the input list is returned unchanged with a count of 0.
    """
    notice_re = re.compile(r'© 2015-2024.*AVEVA Group Limited', re.IGNORECASE)
    support_re = re.compile(r'softwaresupport\.aveva\.com')

    notice_seen = False
    for position, text in enumerate(lines[:80]):
        if not notice_seen and notice_re.search(text):
            notice_seen = True
        # The support line only counts after (or on) the copyright line.
        if notice_seen and support_re.search(text):
            cut = position + 1
            return lines[cut:], cut
    return lines, 0
def remove_page_boundaries(lines: List[str], doc_title: str) -> Tuple[List[str], int]:
    """Strip page-break artifacts (footer, ``Page N`` marker, running header).

    For every line that, once stripped, is exactly ``Page <number>``:

    * the removal starts at the nearest line beginning with ``©`` found in
      the five lines before the marker, or at the marker itself if none;
    * the removal extends forward over at most the next 19 lines for as long
      as they look like a running header: blank lines, lines that contain or
      are contained in ``doc_title``, or lines shorter than 100 characters
      with none of the tokens ``• ** ` [ ] ( )``. A line starting with ``#``
      or any other line stops the scan and is kept.

    Overlapping or touching removal ranges are merged before deletion.

    Args:
        lines: The document, one entry per line.
        doc_title: Title text used to recognise repeated header lines; an
            empty string disables the title comparison.

    Returns:
        A ``(remaining_lines, removed_count)`` tuple.
    """
    page_marker = re.compile(r'^Page\s+\d+\s*$')
    footer_marker = re.compile(r'^©')
    markup_tokens = ['•', '**', '`', '[', ']', '(', ')']
    total = len(lines)

    def looks_like_header(text: str) -> bool:
        # Non-empty, non-heading text that is safe to drop after a page marker.
        if doc_title and (doc_title in text or text in doc_title):
            return True
        return len(text) < 100 and not any(tok in text for tok in markup_tokens)

    spans = []
    for marker_at, raw in enumerate(lines):
        if not page_marker.match(raw.strip()):
            continue

        # Walk backwards (up to five lines) looking for the page footer.
        first = marker_at
        for back in range(marker_at - 1, max(marker_at - 6, -1), -1):
            if footer_marker.match(lines[back].strip()):
                first = back
                break

        # Walk forwards until real content shows up; if none appears inside
        # the window, the whole window is dropped.
        window_end = min(marker_at + 20, total)
        stop = window_end
        for ahead in range(marker_at + 1, window_end):
            text = lines[ahead].strip()
            if text.startswith('#') or (text and not looks_like_header(text)):
                stop = ahead
                break

        spans.append((first, stop))

    # Coalesce ranges that overlap or touch so nothing is counted twice.
    merged = []
    for first, stop in sorted(spans):
        if merged and first <= merged[-1][1]:
            prev_first, prev_stop = merged[-1]
            merged[-1] = (prev_first, max(prev_stop, stop))
        else:
            merged.append((first, stop))

    kept = []
    cursor = 0
    for first, stop in merged:
        kept.extend(lines[cursor:first])
        cursor = stop
    kept.extend(lines[cursor:])

    dropped = sum(stop - first for first, stop in merged)
    return kept, dropped
def remove_toc_section(lines: List[str]) -> Tuple[List[str], int]:
    """Remove the table of contents introduced by a ``# Contents`` heading.

    The section starts at the line that, once stripped, equals
    ``# Contents`` and runs up to (but not including) the next level-one
    heading. If another ``# Contents`` line appears before that heading, the
    section start moves to it. If no level-one heading follows, the section
    extends to the end of the document.

    Args:
        lines: The document, one entry per line.

    Returns:
        A ``(remaining_lines, removed_count)`` tuple. When there is no
        ``# Contents`` heading the input list is returned unchanged with a
        count of 0.
    """
    begin = None
    finish = None
    for position, raw in enumerate(lines):
        text = raw.strip()
        if text == '# Contents':
            begin = position
        elif begin is not None and text.startswith('# '):
            # '# ' can only be a level-one heading: '##...' has no space here.
            finish = position
            break

    if begin is None:
        return lines, 0
    if finish is None:
        finish = len(lines)
    return lines[:begin] + lines[finish:], finish - begin
def remove_chapter_headings(lines: List[str]) -> Tuple[List[str], int]:
    """Drop heading lines that consist solely of ``Chapter <number>``.

    A line is dropped when, after stripping, it is one to six ``#``
    characters, whitespace, the word ``Chapter``, whitespace and digits.

    Args:
        lines: The document, one entry per line.

    Returns:
        A ``(remaining_lines, removed_count)`` tuple.
    """
    heading_re = re.compile(r'^#{1,6}\s+Chapter\s+\d+\s*$')
    kept = [raw for raw in lines if not heading_re.match(raw.strip())]
    return kept, len(lines) - len(kept)
def replace_bold_bullets(lines: List[str]) -> Tuple[List[str], int]:
    """Turn bold bullet glyphs (``**•**``) into Markdown list dashes.

    Every occurrence within a line is replaced; the count reports how many
    lines changed, not how many glyphs were replaced.

    Args:
        lines: The document, one entry per line.

    Returns:
        A ``(new_lines, changed_line_count)`` tuple.
    """
    converted = [raw.replace('**•**', '-') for raw in lines]
    changed = sum(1 for before, after in zip(lines, converted) if before != after)
    return converted, changed
def remove_strikethrough(lines: List[str]) -> Tuple[List[str], int]:
    """Unwrap ``~~text~~`` strikethrough markers, keeping the inner text.

    Only spans whose inner text is non-empty and contains no ``~`` are
    unwrapped. The count reports how many lines changed.

    Args:
        lines: The document, one entry per line.

    Returns:
        A ``(new_lines, changed_line_count)`` tuple.
    """
    strike_re = re.compile(r'~~([^~]+)~~')
    cleaned = []
    touched = 0
    for raw in lines:
        # Each substitution drops four tildes, so any hit changes the line.
        unwrapped, hits = strike_re.subn(r'\1', raw)
        if hits:
            touched += 1
        cleaned.append(unwrapped)
    return cleaned, touched
def collapse_blank_lines(lines: List[str]) -> Tuple[List[str], int]:
    """Limit every run of blank lines to at most two.

    Lines that are empty or whitespace-only count as blank and are always
    emitted as empty strings. Runs of one or two blanks keep their length;
    runs of three or more are cut down to two, including a run at the end
    of the document.

    Args:
        lines: The document, one entry per line.

    Returns:
        A ``(new_lines, dropped_blank_count)`` tuple.
    """
    output = []
    squeezed = 0
    pending = 0  # length of the blank run currently being accumulated

    for raw in lines:
        if raw.strip() == '':
            pending += 1
            continue
        # A non-blank line ends the run: flush at most two blanks.
        output.extend([''] * min(pending, 2))
        squeezed += max(pending - 2, 0)
        pending = 0
        output.append(raw)

    # Flush a run that reaches the end of the document.
    output.extend([''] * min(pending, 2))
    squeezed += max(pending - 2, 0)
    return output, squeezed
def clean_markdown_file(file_path: Path) -> dict:
    """Clean one Markdown file in place and report what was changed.

    The cleaning rules run in a fixed order: copyright boilerplate, page
    boundaries, table of contents, chapter headings, bold bullets,
    strikethrough markers, blank-line collapsing. The document title used
    by the page-boundary rule is extracted after the boilerplate has been
    removed.

    Before the file is overwritten, a backup is written next to it with
    ``.bak`` appended to the existing suffix. The backup is a byte-for-byte
    copy of the original, so line endings (for example CRLF) survive.
    An existing backup with the same name is overwritten.

    Args:
        file_path: Path of the UTF-8 encoded Markdown file to clean.

    Returns:
        A dict with ``original_lines``, ``final_lines``, ``total_removed``
        and ``removed_by_rule`` (a dict mapping each rule key to its count).

    Raises:
        OSError: If the file cannot be read or the outputs cannot be written.
        UnicodeDecodeError: If the file is not valid UTF-8.
    """
    # Keep the raw bytes so the backup is exact. Decoding through a
    # text-mode read would normalise line endings and lose the original form.
    raw_bytes = file_path.read_bytes()
    content = raw_bytes.decode('utf-8')
    lines = content.splitlines()
    original_line_count = len(lines)

    stats = {
        'original_lines': original_line_count,
        'removed_by_rule': {},
    }
    by_rule = stats['removed_by_rule']

    lines, removed = remove_copyright_boilerplate(lines)
    by_rule['1_copyright_boilerplate'] = removed

    # The title is read after the boilerplate is gone so front matter
    # cannot be mistaken for it.
    doc_title = extract_document_title(lines)
    lines, removed = remove_page_boundaries(lines, doc_title)
    by_rule['2_page_boundaries'] = removed

    lines, removed = remove_toc_section(lines)
    by_rule['3_toc_section'] = removed

    lines, removed = remove_chapter_headings(lines)
    by_rule['4_chapter_headings'] = removed

    lines, replaced = replace_bold_bullets(lines)
    by_rule['5_bold_bullets_replaced'] = replaced

    lines, replaced = remove_strikethrough(lines)
    by_rule['6_strikethrough_removed'] = replaced

    lines, collapsed = collapse_blank_lines(lines)
    by_rule['7_blank_lines_collapsed'] = collapsed

    stats['final_lines'] = len(lines)
    stats['total_removed'] = original_line_count - len(lines)

    backup_path = file_path.with_suffix(file_path.suffix + '.bak')
    backup_path.write_bytes(raw_bytes)

    with open(file_path, 'w', encoding='utf-8') as f:
        f.write('\n'.join(lines))
        # End the file with a newline unless the result is empty.
        if lines:
            f.write('\n')

    return stats
def process_path(path: Path) -> List[Tuple[Path, dict]]:
    """Clean a single Markdown file or every ``*.md`` file in a directory.

    * A regular file is cleaned only if its suffix is ``.md``; otherwise a
      notice is printed to stderr and it is skipped.
    * A directory is searched non-recursively for ``*.md`` entries, which
      are processed in sorted order. Entries that are not regular files
      (for example a sub-directory named ``notes.md``) are ignored so they
      cannot abort the run.
    * Any other path prints an error to stderr and exits with status 1.

    Args:
        path: File or directory to process.

    Returns:
        A list of ``(file_path, stats)`` tuples, one per cleaned file; empty
        when nothing was cleaned.

    Raises:
        SystemExit: If ``path`` is neither a file nor a directory.
    """
    results = []
    if path.is_file():
        if path.suffix == '.md':
            results.append((path, clean_markdown_file(path)))
        else:
            print(f"Skipping non-markdown file: {path}", file=sys.stderr)
    elif path.is_dir():
        # glob() also yields directories whose name ends in .md; trying to
        # open those as files would raise and stop the whole batch.
        md_files = sorted(entry for entry in path.glob('*.md') if entry.is_file())
        if not md_files:
            print(f"No .md files found in {path}", file=sys.stderr)
        for md_file in md_files:
            results.append((md_file, clean_markdown_file(md_file)))
    else:
        print(f"Path does not exist: {path}", file=sys.stderr)
        sys.exit(1)
    return results
def main():
    """Command-line entry point: clean the given path and print a summary.

    Parses one positional ``path`` argument, processes it, and prints the
    per-file line counts followed by every rule whose count is above zero.
    Exits with status 1 when no file was processed.
    """
    cli = argparse.ArgumentParser(
        description='Clean PDF-converted markdown files by removing conversion artifacts.'
    )
    cli.add_argument(
        'path',
        type=Path,
        help='Path to a markdown file or directory containing markdown files'
    )
    target = cli.parse_args().path

    results = process_path(target)
    if not results:
        print("No files processed.", file=sys.stderr)
        sys.exit(1)

    print(f"\nProcessed {len(results)} file(s):\n")
    for file_path, stats in results:
        # Only rules that actually did something are listed.
        active_rules = [
            (rule, count)
            for rule, count in stats['removed_by_rule'].items()
            if count > 0
        ]
        print(f"{file_path.name}:")
        print(f" Original lines: {stats['original_lines']}")
        print(f" Final lines: {stats['final_lines']}")
        print(f" Total removed: {stats['total_removed']}")
        print(" Details:")
        for rule, count in active_rules:
            print(f" {rule}: {count}")
        print()
# Run the command-line interface only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == '__main__':
    main()