use std::collections::HashMap;
use super::banner::normalize_banner;
/// Exclusive upper bound, in trimmed bytes of section text, under which a heading made of a
/// single ASCII letter is classified as noise by `classify_noise_heading`.
const NOISE_HEADING_EMPTY_PROSE_BYTES: usize = 200;
/// Exclusive upper bound, in trimmed bytes of section text, under which a heading that reads as
/// a Roman numeral (letters `ivxlcdm`, either case) is classified as noise by
/// `classify_noise_heading`.
const NOISE_HEADING_ROMAN_MAX_PROSE_BYTES: usize = 1000;
/// A normalized heading title must occur strictly more than this many times in one document
/// before `strip_noise_headings` drops every occurrence as a repeated banner.
const REPETITION_THRESHOLD: usize = 4;
/// Demotes "noise" headings in a Markdown document to plain text lines.
///
/// A heading is any line of the form `#+ <title>`. Headings are flagged as noise in two ways:
///
/// 1. **Repetition** – the title's `normalize_banner` key is non-empty and occurs more than
///    `REPETITION_THRESHOLD` times among all headings of the document.
/// 2. **Shape** – every other heading is handed to `classify_noise_heading` together with the
///    number of trimmed bytes found between it and the next heading (or the end of the input).
///
/// A flagged heading is not deleted: its line is replaced by the bare, trimmed title (the `#`
/// markers and surrounding whitespace are removed). Every other line is copied through
/// unchanged, and lines stay joined by `'\n'`. When nothing is flagged the input is returned
/// as-is.
pub(super) fn strip_noise_headings(md: &str) -> String {
    // Compile the pattern once per process instead of on every call.
    static HEADING_RE: std::sync::OnceLock<regex::Regex> = std::sync::OnceLock::new();
    let heading_re = HEADING_RE.get_or_init(|| {
        regex::Regex::new(r"^(#+)\s+(.+?)\s*$").expect("heading pattern is a valid regex literal")
    });

    // `split('\n')` always yields at least one item, so `lines` is never empty.
    let lines: Vec<&str> = md.split('\n').collect();

    // (line index, trimmed title, normalized repetition key) for every heading line.
    // The key is computed once here and reused for both counting and lookup.
    let headings: Vec<(usize, String, String)> = lines
        .iter()
        .enumerate()
        .filter_map(|(i, line)| {
            let caps = heading_re.captures(line)?;
            let title = caps[2].trim().to_string();
            let key = normalize_banner(&title);
            Some((i, title, key))
        })
        .collect();

    // How often each non-empty normalized title occurs among the headings.
    let mut repetition_counts: HashMap<&str, usize> = HashMap::new();
    for (_, _, key) in &headings {
        if !key.is_empty() {
            *repetition_counts.entry(key.as_str()).or_insert(0) += 1;
        }
    }

    let mut drop_decision: HashMap<usize, NoiseDecision> = HashMap::new();
    for (hi, (line_i, title, key)) in headings.iter().enumerate() {
        let repeated = !key.is_empty()
            && repetition_counts.get(key.as_str()).copied().unwrap_or(0) > REPETITION_THRESHOLD;

        let decision = if repeated {
            Some(NoiseDecision {
                title: title.clone(),
            })
        } else {
            // The section runs up to the next heading of any kind, or to the end of the input.
            let section_end = headings.get(hi + 1).map_or(lines.len(), |(j, _, _)| *j);
            let prose_bytes: usize = lines[*line_i + 1..section_end]
                .iter()
                .map(|l| l.trim().len())
                .sum();
            classify_noise_heading(title, prose_bytes)
        };

        if let Some(decision) = decision {
            drop_decision.insert(*line_i, decision);
        }
    }

    if drop_decision.is_empty() {
        return md.to_string();
    }

    let mut out = String::with_capacity(md.len());
    for (i, line) in lines.iter().enumerate() {
        if i > 0 {
            out.push('\n');
        }
        match drop_decision.get(&i) {
            // Keep the heading's text but lose the `#` markers.
            Some(decision) => out.push_str(&decision.title),
            None => out.push_str(line),
        }
    }
    out
}
/// Outcome recorded for a heading that was judged to be noise.
///
/// Derives `Debug`, `Clone`, `PartialEq` and `Eq` so decisions can be printed and compared
/// directly in assertions.
#[derive(Debug, Clone, PartialEq, Eq)]
struct NoiseDecision {
    /// Trimmed heading text (no `#` markers) that is written in place of the heading line.
    title: String,
}
/// Decides whether a heading title is layout noise rather than a real section heading.
///
/// `title` is the heading text and `prose_bytes` the amount of text in the section below it.
/// Returns `Some(NoiseDecision)` carrying the trimmed title when any rule matches, and `None`
/// otherwise (including for an empty or whitespace-only title):
///
/// * the title has no letters and no ASCII digits (pure punctuation / symbols);
/// * the title consists only of ASCII digits, `.`, `-`, `/` and spaces, with at least one digit;
/// * the title is a single ASCII letter and `prose_bytes` is below
///   `NOISE_HEADING_EMPTY_PROSE_BYTES`;
/// * the title is a well-formed Roman numeral (either case) and `prose_bytes` is below
///   `NOISE_HEADING_ROMAN_MAX_PROSE_BYTES`.
fn classify_noise_heading(title: &str, prose_bytes: usize) -> Option<NoiseDecision> {
    let t = title.trim();
    if t.is_empty() {
        return None;
    }

    let has_digit = t.chars().any(|c| c.is_ascii_digit());
    let has_letter = t.chars().any(char::is_alphabetic);

    let punctuation_only = !has_letter && !has_digit;

    // The allowed character set already excludes letters, so only the digit check is needed.
    let date_like = has_digit
        && t.chars()
            .all(|c| c.is_ascii_digit() || matches!(c, '.' | '-' | '/' | ' '));

    let mut chars = t.chars();
    let single_letter_stub = matches!(
        (chars.next(), chars.next()),
        (Some(c), None) if c.is_ascii_alphabetic()
    ) && prose_bytes < NOISE_HEADING_EMPTY_PROSE_BYTES;

    // Validate the numeral's structure so ordinary words spelled with the letters
    // `ivxlcdm` ("Civil", "Mild", "Did", ...) are not mistaken for page numbers.
    let roman_stub = is_roman_numeral(t) && prose_bytes < NOISE_HEADING_ROMAN_MAX_PROSE_BYTES;

    if punctuation_only || date_like || single_letter_stub || roman_stub {
        Some(NoiseDecision {
            title: t.to_string(),
        })
    } else {
        None
    }
}

/// Returns `true` when `s` is a non-empty, well-formed Roman numeral in either letter case.
///
/// Accepts any number of leading `M`s followed by hundreds, tens and units written either
/// subtractively (`CM`, `XL`, `IV`, ...) or additively with up to four repeats (`IIII`).
fn is_roman_numeral(s: &str) -> bool {
    // (nine, four, five, one) spellings for the hundreds, tens and units places.
    const PLACES: [(&str, &str, char, char); 3] = [
        ("CM", "CD", 'D', 'C'),
        ("XC", "XL", 'L', 'X'),
        ("IX", "IV", 'V', 'I'),
    ];

    if s.is_empty() {
        return false;
    }

    let upper = s.to_ascii_uppercase();
    let mut rest = upper.trim_start_matches('M');
    for (nine, four, five, one) in PLACES {
        rest = if let Some(tail) = rest.strip_prefix(nine) {
            tail
        } else if let Some(tail) = rest.strip_prefix(four) {
            tail
        } else {
            let tail = rest.strip_prefix(five).unwrap_or(rest);
            // Only ASCII characters are counted, so the count is also a valid byte offset.
            let ones = tail.chars().take_while(|&c| c == one).take(4).count();
            &tail[ones..]
        };
    }
    // Anything left over means the letters were not in numeral order.
    rest.is_empty()
}
#[cfg(test)]
mod tests {
    use super::*;

    /// Clips `s` to at most 80 bytes so failure messages stay readable.
    fn preview(s: &str) -> &str {
        &s[..s.len().min(80)]
    }

    /// A lone `i` heading above a short section loses its `#` markers; a normal heading stays.
    #[test]
    fn roman_numeral_heading_with_empty_section_dropped() {
        let out = strip_noise_headings("## i\nZÁMĚRNĚ NEPOUŽITO\n\n## Real heading\nBody text.");
        let heading_survived = out.contains("## i\n");
        assert!(
            !heading_survived,
            "expected '## i' heading dropped, got: {out}"
        );
        assert!(out.contains("i\nZÁMĚRNĚ NEPOUŽITO"), "got: {out}");
        assert!(out.contains("## Real heading"), "got: {out}");
    }

    /// A Roman-numeral heading above a long section is left untouched.
    #[test]
    fn roman_numeral_heading_with_real_prose_kept() {
        let mut input = String::from("## iv\n");
        input.push_str(&"x".repeat(1500));
        input.push('\n');
        let out = strip_noise_headings(&input);
        assert!(
            out.starts_with("## iv\n"),
            "expected '## iv' heading kept (section is long), got: {out}"
        );
    }

    /// The same heading repeated ten times is treated as a running header and demoted,
    /// while every section body survives.
    #[test]
    fn running_header_dropped_by_repetition() {
        let input: String = (0..10)
            .map(|i| format!("## PŘEDPIS L14\nChapter body {i} text content.\n\n"))
            .collect();
        let out = strip_noise_headings(&input);
        assert!(
            !out.contains("## PŘEDPIS L14"),
            "all '## PŘEDPIS L14' headings should be dropped, got: {out}"
        );
        assert!(out.contains("PŘEDPIS L14\n"), "got: {out}");
        for i in 0..10 {
            let needle = format!("Chapter body {i}");
            assert!(out.contains(&needle), "lost body {i}: {out}");
        }
    }

    /// Titles that differ only in their number count as repetitions of one banner.
    #[test]
    fn title_with_varying_digits_collapses() {
        let input: String = (1..=6)
            .map(|i| format!("## Dopl. {i}\nstub {i}\n\n"))
            .collect();
        let out = strip_noise_headings(&input);
        for i in 1..=6 {
            let heading_line = format!("## Dopl. {i}\n");
            assert!(
                !out.contains(&heading_line),
                "expected '## Dopl. {i}' dropped, got: {out}"
            );
        }
    }

    /// Six distinct headings are all preserved.
    #[test]
    fn non_repeated_heading_kept() {
        const NAMES: [&str; 6] = ["Alpha", "Beta", "Gamma", "Delta", "Epsilon", "Zeta"];
        let input = NAMES
            .iter()
            .map(|name| format!("## {name}\nbody."))
            .collect::<Vec<_>>()
            .join("\n\n");
        let out = strip_noise_headings(&input);
        for name in NAMES {
            let heading = format!("## {name}");
            assert!(
                out.contains(&heading),
                "expected '## {name}' kept, got: {out}"
            );
        }
    }

    /// Headings made only of dashes are demoted.
    #[test]
    fn punctuation_only_heading_dropped() {
        let hyphen_only = strip_noise_headings("## -\n\n## Real\nbody.");
        assert!(
            !hyphen_only.contains("## -\n"),
            "expected '## -' dropped, got: {out}",
            out = hyphen_only
        );

        let mixed_dashes = strip_noise_headings("## – -\n\n## Real\nbody.");
        assert!(
            !mixed_dashes.contains("## – -"),
            "expected '## – -' dropped, got: {out}",
            out = mixed_dashes
        );
    }

    /// A Roman-numeral heading above a table-of-contents style body is demoted.
    #[test]
    fn roman_numeral_with_toc_body_dropped() {
        let body: String = (3..=5)
            .flat_map(|chapter| (1..=40).map(move |sub| format!("{chapter} - {sub}\n")))
            .collect();
        let input = format!("## vi\n{}", body);
        let out = strip_noise_headings(&input);
        assert!(
            !out.starts_with("## vi"),
            "expected '## vi' dropped (TOC body), got first 80 chars: {}",
            preview(&out)
        );
        assert!(out.starts_with("vi\n"), "got: {}", preview(&out));
    }

    /// A date-shaped heading is demoted even when followed by a sizeable section.
    #[test]
    fn date_heading_always_dropped() {
        let mut input = String::from("## 25 . 12 . 2025\n");
        input.push_str(&"x".repeat(500));
        input.push('\n');
        let out = strip_noise_headings(&input);
        assert!(
            !out.starts_with("## 25"),
            "expected date heading dropped, got: {out}"
        );
        assert!(out.starts_with("25 . 12 . 2025\n"), "got: {out}");
    }

    /// A single-letter heading above an empty section is demoted.
    #[test]
    fn single_letter_heading_with_empty_section_dropped() {
        let out = strip_noise_headings("## c\n\n## Next\nBody.");
        let heading_survived = out.contains("## c\n");
        assert!(
            !heading_survived,
            "expected '## c' heading dropped, got: {out}"
        );
        assert!(out.starts_with("c\n"), "got: {out}");
    }
}