use std::sync::LazyLock;
use regex::Regex;
// All patterns below are hardcoded and compiled once at first use, so the
// `expect` calls can only fire on a programmer error, never on user input.

/// AVEVA copyright line, e.g. "© 2015-2024 ... AVEVA Group Limited".
/// Case-insensitive; accepts a hyphen or an en dash (U+2013) between years.
static COPYRIGHT_RE: LazyLock<Regex> = LazyLock::new(|| {
    Regex::new(r"(?i)©\s*\d{4}[-\u{2013}]\d{4}.*AVEVA Group Limited")
        .expect("hardcoded copyright regex")
});
/// AVEVA support-site URL; marks the end of the copyright header block.
static SUPPORT_RE: LazyLock<Regex> =
    LazyLock::new(|| Regex::new(r"softwaresupport\.aveva\.com").expect("hardcoded support regex"));
/// A line that is exactly "Page N" — a page-break marker from PDF conversion.
static PAGE_RE: LazyLock<Regex> =
    LazyLock::new(|| Regex::new(r"^Page\s+\d+\s*$").expect("hardcoded page regex"));
/// A line beginning with "©" — the per-page copyright footer.
static PAGE_COPYRIGHT_RE: LazyLock<Regex> =
    LazyLock::new(|| Regex::new(r"^©").expect("hardcoded page copyright regex"));
/// A markdown heading (any level) whose entire text is "Chapter N".
static CHAPTER_RE: LazyLock<Regex> =
    LazyLock::new(|| Regex::new(r"^#{1,6}\s+Chapter\s+\d+\s*$").expect("hardcoded chapter regex"));
/// A ~~strikethrough~~ span; capture group 1 is the inner text to keep.
static STRIKE_RE: LazyLock<Regex> =
    LazyLock::new(|| Regex::new(r"~~([^~]+)~~").expect("hardcoded strikethrough regex"));
/// Shared state made available to every cleaning rule.
pub struct CleaningContext {
    /// Text of the first H1 heading in the document. Empty until
    /// `clean_markdown` extracts it (after the copyright rule has run);
    /// used by the page-boundary rule to recognise title-echo lines.
    pub doc_title: String,
}
/// A single named, taggable transformation over the document's lines.
pub struct CleaningRule {
    /// Stable identifier, used in log output and special-cased by
    /// `clean_markdown` (title extraction after `copyright_boilerplate`).
    pub name: &'static str,
    /// Human-readable summary of what the rule removes or rewrites.
    pub description: &'static str,
    /// Selection tags: a rule runs when it shares at least one tag with the
    /// caller's filter, or when the filter is empty.
    pub tags: &'static [&'static str],
    /// The transformation; mutates the line buffer in place and returns the
    /// number of changes (lines removed or lines rewritten) it made.
    pub apply: fn(&mut Vec<String>, &CleaningContext) -> usize,
}
/// Registry of every cleaning rule, in application order.
///
/// Order matters: `copyright_boilerplate` runs before `page_boundaries`, so
/// `clean_markdown` can extract the document title in between — the
/// page-boundary rule relies on `CleaningContext::doc_title` to spot
/// title-echo lines.
static ALL_RULES: &[CleaningRule] = &[
    CleaningRule {
        name: "copyright_boilerplate",
        description: "Remove AVEVA copyright header block",
        tags: &["aveva", "pdf"],
        apply: rule_copyright_boilerplate,
    },
    CleaningRule {
        name: "page_boundaries",
        description: "Remove Page N + title echo blocks from PDF conversion",
        tags: &["aveva", "pdf"],
        apply: rule_page_boundaries,
    },
    CleaningRule {
        name: "toc_section",
        description: "Remove # Contents through next H1",
        tags: &["generic", "pdf"],
        apply: rule_toc_section,
    },
    CleaningRule {
        name: "chapter_headings",
        description: "Strip bare 'Chapter N' headings",
        tags: &["generic", "pdf"],
        apply: rule_chapter_headings,
    },
    CleaningRule {
        name: "bold_bullets",
        description: "Replace **•** with -",
        tags: &["generic", "pdf"],
        apply: rule_bold_bullets,
    },
    CleaningRule {
        name: "strikethrough",
        description: "Remove ~~text~~ strikethrough markers",
        tags: &["generic"],
        apply: rule_strikethrough,
    },
    CleaningRule {
        name: "blank_lines",
        description: "Collapse 3+ consecutive blank lines to 2",
        tags: &["generic"],
        apply: rule_blank_lines,
    },
];
/// Run all matching cleaning rules over `input` and return cleaned markdown.
///
/// `tags` selects which rules run: a rule applies when it shares at least one
/// tag with `tags`, or unconditionally when `tags` is empty. The returned
/// string is always newline-terminated.
pub fn clean_markdown(input: &str, tags: &[&str]) -> String {
    let _span = tracing::info_span!("clean_markdown").entered();
    let mut lines: Vec<String> = input.lines().map(String::from).collect();
    let mut ctx = CleaningContext {
        doc_title: String::new(),
    };
    let selected = ALL_RULES
        .iter()
        .filter(|rule| tags.is_empty() || rule.tags.iter().any(|t| tags.contains(t)));
    for rule in selected {
        let changes = (rule.apply)(&mut lines, &ctx);
        if changes > 0 {
            tracing::info!(rule = rule.name, changes, "Cleaning rule applied");
        }
        // The copyright header sits above the real H1; once it has been
        // stripped we can learn the document title for later rules
        // (page-boundary title-echo detection).
        if rule.name == "copyright_boilerplate" {
            ctx.doc_title = extract_doc_title(&lines);
        }
    }
    let mut output = lines.join("\n");
    if !output.ends_with('\n') {
        output.push('\n');
    }
    output
}
/// Return the text of the first H1 ("# ...") heading, trimmed, or an empty
/// string when the document has none. Lines like "# #foo" (heading text that
/// itself starts with '#') are skipped.
fn extract_doc_title(lines: &[String]) -> String {
    lines
        .iter()
        .filter_map(|line| line.trim().strip_prefix("# "))
        .find(|heading| !heading.starts_with('#'))
        .map(|heading| heading.trim().to_string())
        .unwrap_or_default()
}
/// Strip the leading AVEVA copyright block from a converted PDF.
///
/// Scans the first 80 lines; once the copyright notice has been seen,
/// everything from the top of the document through the support-site URL line
/// (inclusive) is removed. Returns the number of lines removed (0 when the
/// block is absent).
fn rule_copyright_boilerplate(lines: &mut Vec<String>, _ctx: &CleaningContext) -> usize {
    let scan_limit = lines.len().min(80);
    let mut seen_copyright = false;
    let boundary = lines[..scan_limit].iter().position(|line| {
        if COPYRIGHT_RE.is_match(line) {
            seen_copyright = true;
        }
        // The block ends at the first support-URL line at or after the notice.
        seen_copyright && SUPPORT_RE.is_match(line)
    });
    match boundary {
        Some(idx) => {
            lines.drain(..=idx);
            idx + 1
        }
        None => 0,
    }
}
/// Remove "Page N" markers plus the surrounding page-break boilerplate that
/// PDF conversion leaves behind: an optional "©" footer line shortly before
/// the marker, and the blank / title-echo / short plain lines after it.
/// Returns the total number of lines removed.
fn rule_page_boundaries(lines: &mut Vec<String>, ctx: &CleaningContext) -> usize {
    // Indices of every standalone "Page N" line.
    let page_indices: Vec<usize> = lines
        .iter()
        .enumerate()
        .filter(|(_, line)| PAGE_RE.is_match(line.trim()))
        .map(|(i, _)| i)
        .collect();
    if page_indices.is_empty() {
        return 0;
    }
    // Half-open removal ranges [start, end), one per page marker.
    let mut ranges: Vec<(usize, usize)> = Vec::new();
    for &page_idx in &page_indices {
        // Extend backwards: a "©" footer within the 5 lines above the marker
        // belongs to the same page break.
        let mut start_idx = page_idx;
        for i in (page_idx.saturating_sub(5)..page_idx).rev() {
            if PAGE_COPYRIGHT_RE.is_match(lines[i].trim()) {
                start_idx = i;
                break;
            }
        }
        // Extend forwards over up to ~20 lines of residue; stop at the first
        // heading or anything that looks like real content.
        let mut end_idx = page_idx + 1;
        let scan_end = lines.len().min(page_idx + 20);
        for (i, line) in lines[page_idx + 1..scan_end].iter().enumerate() {
            let i = i + page_idx + 1; // rebase to an absolute line index
            let trimmed = line.trim();
            if trimmed.is_empty() {
                end_idx = i + 1;
                continue;
            }
            if trimmed.starts_with('#') {
                // Never swallow a heading.
                break;
            }
            // Echo of the document title (containment checked both ways, as
            // conversion may truncate either the echo or the title).
            if !ctx.doc_title.is_empty()
                && (ctx.doc_title.contains(trimmed) || trimmed.contains(&ctx.doc_title))
            {
                end_idx = i + 1;
                continue;
            }
            // Short line with no markdown markup: presumed header/footer
            // residue rather than body text.
            if trimmed.len() < 100
                && !trimmed.contains('•')
                && !trimmed.contains("**")
                && !trimmed.contains('`')
                && !trimmed.contains('[')
            {
                end_idx = i + 1;
                continue;
            }
            break; // real content reached
        }
        ranges.push((start_idx, end_idx));
    }
    // Merge overlapping/adjacent ranges so no line is drained twice.
    ranges.sort_by_key(|r| r.0);
    let mut merged: Vec<(usize, usize)> = Vec::new();
    for (start, end) in ranges {
        if let Some(last) = merged.last_mut() {
            if start <= last.1 {
                last.1 = last.1.max(end);
                continue;
            }
        }
        merged.push((start, end));
    }
    // Drain back-to-front so earlier range indices remain valid.
    let mut removed = 0;
    for (start, end) in merged.into_iter().rev() {
        let count = end - start;
        lines.drain(start..end);
        removed += count;
    }
    removed
}
/// Remove a table-of-contents section: everything from a "# Contents" line up
/// to (but not including) the next H1, or to the end of the document if no H1
/// follows. Returns the number of lines removed.
fn rule_toc_section(lines: &mut Vec<String>, _ctx: &CleaningContext) -> usize {
    let mut contents_at: Option<usize> = None;
    let mut next_h1_at: Option<usize> = None;
    for (idx, raw) in lines.iter().enumerate() {
        let text = raw.trim();
        if text == "# Contents" {
            contents_at = Some(idx);
        } else if contents_at.is_some() && text.starts_with("# ") && !text.starts_with("## ") {
            next_h1_at = Some(idx);
            break;
        }
    }
    let Some(start) = contents_at else { return 0 };
    let end = next_h1_at.unwrap_or(lines.len());
    lines.drain(start..end);
    end - start
}
/// Delete standalone "Chapter N" headings left over from PDF conversion.
/// Returns how many heading lines were dropped.
fn rule_chapter_headings(lines: &mut Vec<String>, _ctx: &CleaningContext) -> usize {
    let original_len = lines.len();
    lines.retain(|line| !CHAPTER_RE.is_match(line.trim()));
    original_len - lines.len()
}
/// Rewrite bolded bullet glyphs ("**•**") as plain markdown dashes.
/// Returns the number of lines that were rewritten.
// `&mut Vec<String>` is required by the `CleaningRule::apply` fn-pointer type.
#[allow(clippy::ptr_arg)]
fn rule_bold_bullets(lines: &mut Vec<String>, _ctx: &CleaningContext) -> usize {
    lines.iter_mut().fold(0, |rewritten, line| {
        if line.contains("**•**") {
            *line = line.replace("**•**", "-");
            rewritten + 1
        } else {
            rewritten
        }
    })
}
/// Unwrap ~~strikethrough~~ spans, keeping the inner text.
/// Returns the number of lines that were changed.
// `&mut Vec<String>` is required by the `CleaningRule::apply` fn-pointer type.
#[allow(clippy::ptr_arg)]
fn rule_strikethrough(lines: &mut Vec<String>, _ctx: &CleaningContext) -> usize {
    let mut changed = 0;
    for line in lines.iter_mut() {
        // A match always shrinks the line (the tildes are removed), so
        // "matched" and "changed" are equivalent here.
        if STRIKE_RE.is_match(line) {
            let stripped = STRIKE_RE.replace_all(line, "$1").into_owned();
            *line = stripped;
            changed += 1;
        }
    }
    changed
}
/// Collapse runs of 3+ consecutive blank lines down to exactly 2.
/// Returns the number of blank lines dropped. (Runs of 1-2 blanks — and a
/// trailing run — are preserved as-is, just capped at 2.)
fn rule_blank_lines(lines: &mut Vec<String>, _ctx: &CleaningContext) -> usize {
    // Emit at most two blank lines for a run of `run` blanks; returns how
    // many blanks were dropped. Shared by the in-loop and trailing flushes
    // (the original code duplicated this logic verbatim).
    fn flush_blanks(run: usize, out: &mut Vec<String>) -> usize {
        let emit = run.min(2);
        out.extend(std::iter::repeat_with(String::new).take(emit));
        run - emit
    }
    let mut result = Vec::with_capacity(lines.len());
    let mut run = 0usize;
    let mut collapsed = 0usize;
    for line in lines.iter() {
        if line.trim().is_empty() {
            run += 1;
        } else {
            collapsed += flush_blanks(run, &mut result);
            run = 0;
            result.push(line.clone());
        }
    }
    // Flush a trailing run of blanks at end-of-document.
    collapsed += flush_blanks(run, &mut result);
    *lines = result;
    collapsed
}
#[cfg(test)]
mod tests {
    use super::*;

    /// Default context with no document title set.
    fn ctx() -> CleaningContext {
        CleaningContext {
            doc_title: String::new(),
        }
    }

    #[test]
    fn test_copyright_boilerplate() {
        let mut lines: Vec<String> = vec![
            "Some header".into(),
            "© 2015-2024 by AVEVA Group Limited".into(),
            "All rights reserved".into(),
            "softwaresupport.aveva.com".into(),
            "# Real Content".into(),
        ];
        let removed = rule_copyright_boilerplate(&mut lines, &ctx());
        // Everything through the support-URL line is removed, including the
        // header line above the copyright notice.
        assert_eq!(removed, 4);
        assert_eq!(lines.len(), 1);
        assert_eq!(lines[0], "# Real Content");
    }

    #[test]
    fn test_copyright_boilerplate_not_found() {
        let mut lines: Vec<String> = vec!["# Normal Document".into(), "Content".into()];
        let removed = rule_copyright_boilerplate(&mut lines, &ctx());
        assert_eq!(removed, 0);
        assert_eq!(lines.len(), 2);
    }

    #[test]
    fn test_toc_section() {
        let mut lines: Vec<String> = vec![
            "# Contents".into(),
            "- Item 1".into(),
            "- Item 2".into(),
            "# Introduction".into(),
            "Real content".into(),
        ];
        let removed = rule_toc_section(&mut lines, &ctx());
        // The terminating H1 itself is kept.
        assert_eq!(removed, 3);
        assert_eq!(lines[0], "# Introduction");
    }

    #[test]
    fn test_chapter_headings() {
        let mut lines: Vec<String> = vec![
            "# Overview".into(),
            "### Chapter 1".into(),
            "Content here".into(),
            "## Chapter 2".into(),
            "More content".into(),
        ];
        let removed = rule_chapter_headings(&mut lines, &ctx());
        assert_eq!(removed, 2);
        assert_eq!(lines.len(), 3);
        assert_eq!(lines[0], "# Overview");
        assert_eq!(lines[1], "Content here");
        assert_eq!(lines[2], "More content");
    }

    #[test]
    fn test_bold_bullets() {
        let mut lines: Vec<String> = vec!["**•** First item".into(), "Normal line".into()];
        let replaced = rule_bold_bullets(&mut lines, &ctx());
        assert_eq!(replaced, 1);
        assert_eq!(lines[0], "- First item");
    }

    #[test]
    fn test_strikethrough() {
        let mut lines: Vec<String> = vec!["This is ~~deleted~~ text".into()];
        let replaced = rule_strikethrough(&mut lines, &ctx());
        assert_eq!(replaced, 1);
        assert_eq!(lines[0], "This is deleted text");
    }

    #[test]
    fn test_blank_lines_collapse() {
        let mut lines: Vec<String> = vec![
            "Line 1".into(),
            "".into(),
            "".into(),
            "".into(),
            "".into(),
            "Line 2".into(),
        ];
        let collapsed = rule_blank_lines(&mut lines, &ctx());
        // A run of 4 blanks is capped at 2, so 2 are dropped.
        assert_eq!(collapsed, 2);
        assert_eq!(lines.len(), 4);
        assert_eq!(lines[0], "Line 1");
        assert_eq!(lines[1], "");
        assert_eq!(lines[2], "");
        assert_eq!(lines[3], "Line 2");
    }

    #[test]
    fn test_blank_lines_no_collapse() {
        // Runs of exactly 2 blanks are left untouched.
        let mut lines: Vec<String> = vec!["Line 1".into(), "".into(), "".into(), "Line 2".into()];
        let collapsed = rule_blank_lines(&mut lines, &ctx());
        assert_eq!(collapsed, 0);
        assert_eq!(lines.len(), 4);
    }

    #[test]
    fn test_clean_markdown_all_tags() {
        // Empty tag filter runs every rule.
        let input = "# Contents\n- TOC\n# Real Title\nContent\n\n\n\n\nMore content\n";
        let result = clean_markdown(input, &[]);
        assert!(!result.contains("# Contents"));
        assert!(result.contains("# Real Title"));
        assert!(!result.contains("\n\n\n\n"));
    }

    #[test]
    fn test_clean_markdown_tag_filter() {
        // The "generic" tag excludes the aveva-only copyright rule, so the
        // copyright block must survive.
        let input =
            "© 2015-2024 by AVEVA Group Limited\nAll rights\nsoftwaresupport.aveva.com\n# Title\n";
        let result = clean_markdown(input, &["generic"]);
        assert!(result.contains("AVEVA"));
    }

    #[test]
    fn test_page_boundaries() {
        let mut lines: Vec<String> = vec![
            "Content before".into(),
            "© AVEVA 2024".into(),
            "".into(),
            "Page 1".into(),
            "".into(),
            "# Next Section".into(),
        ];
        let ctx = CleaningContext {
            doc_title: "My Doc".to_string(),
        };
        let removed = rule_page_boundaries(&mut lines, &ctx);
        assert!(removed > 0);
        // The heading after the page break must be preserved.
        assert!(lines.iter().any(|l| l.contains("# Next Section")));
    }
}