use crate::types::{ParsedPage, ProjectedLine};
use super::paragraphs::collapse_whitespace;
// Fraction of the page height that forms the header band: a line whose
// `bbox.y` is at most `page_height * HEADER_BAND_FRACTION` is a header candidate.
const HEADER_BAND_FRACTION: f32 = 0.12;
// Fraction of the page height that forms the footer band: a line whose bottom
// edge (`bbox.y + bbox.height`) is at least `page_height * (1 - FOOTER_BAND_FRACTION)`
// is a footer candidate.
const FOOTER_BAND_FRACTION: f32 = 0.12;
// Share of the document's pages on which a normalized line must appear (within
// one band) before it is treated as a repeating header/footer.
const HEADER_FOOTER_MIN_FRACTION: f32 = 0.5;
// Documents with fewer pages than this skip repetition detection entirely; the
// value is also the lower bound of the per-line page-count threshold.
const HEADER_FOOTER_MIN_PAGES: usize = 2;
/// Produces the comparison key used to spot lines that repeat across pages.
///
/// The input is passed through `collapse_whitespace`, lower-cased, and every
/// maximal run of ASCII digits is replaced by a single `#`, so that
/// `"Page 1 of 6"` and `"PAGE 12 OF 6"` map to the same key.
pub(super) fn normalize_for_repetition(s: &str) -> String {
    let lowered = collapse_whitespace(s).to_lowercase();
    let mut normalized = String::with_capacity(lowered.len());
    let mut prev_was_digit = false;

    for ch in lowered.chars() {
        let is_digit = ch.is_ascii_digit();
        match (is_digit, prev_was_digit) {
            // Still inside a digit run: the placeholder was already emitted.
            (true, true) => {}
            // First digit of a run: emit the placeholder once.
            (true, false) => normalized.push('#'),
            // Any other character is copied through unchanged.
            (false, _) => normalized.push(ch),
        }
        prev_was_digit = is_digit;
    }

    normalized
}
/// Collects the normalized text of lines that repeat in the header or footer
/// band across the pages of a document.
///
/// A line is a header candidate when its `bbox.y` is at most
/// `page_height * HEADER_BAND_FRACTION`, and a footer candidate when its bottom
/// edge is at least `page_height * (1 - FOOTER_BAND_FRACTION)`. Candidates are
/// keyed by band and by [`normalize_for_repetition`] text; a key is accepted
/// when it was seen on at least
/// `max(ceil(pages.len() * HEADER_FOOTER_MIN_FRACTION), HEADER_FOOTER_MIN_PAGES)`
/// distinct `page_number`s.
///
/// Returns an empty set when there are fewer than `HEADER_FOOTER_MIN_PAGES`
/// pages. The returned set holds only the normalized text; the band tag is
/// dropped, so header and footer entries share one namespace.
///
/// Pages whose `page_height` is not positive are ignored: with a zero height
/// the footer cutoff collapses to 0 and every line on the page would count as
/// a footer candidate. They still count towards `pages.len()` for the threshold.
pub fn compute_header_footer_set(pages: &[ParsedPage]) -> std::collections::HashSet<String> {
    use std::collections::{HashMap, HashSet};

    let mut set: HashSet<String> = HashSet::new();
    if pages.len() < HEADER_FOOTER_MIN_PAGES {
        return set;
    }

    // (band tag, normalized text) -> distinct page numbers the line was seen on.
    let mut counts: HashMap<(char, String), HashSet<usize>> = HashMap::new();
    for page in pages {
        // Without a usable height the bands are meaningless (see doc comment).
        if page.page_height <= 0.0 {
            continue;
        }
        let header_cutoff = page.page_height * HEADER_BAND_FRACTION;
        let footer_cutoff = page.page_height * (1.0 - FOOTER_BAND_FRACTION);

        for line in &page.projected_lines {
            // Test the geometry first so body lines are never normalized.
            let in_header = line.bbox.y <= header_cutoff;
            let in_footer = line.bbox.y + line.bbox.height >= footer_cutoff;
            if !(in_header || in_footer) {
                continue;
            }

            let text = line.text.trim();
            if text.is_empty() {
                continue;
            }
            let norm = normalize_for_repetition(text);
            if norm.is_empty() {
                continue;
            }

            // A line may sit in both bands on a very short page; record both.
            if in_header {
                counts
                    .entry(('h', norm.clone()))
                    .or_default()
                    .insert(page.page_number);
            }
            if in_footer {
                counts
                    .entry(('f', norm))
                    .or_default()
                    .insert(page.page_number);
            }
        }
    }

    let threshold = (pages.len() as f32 * HEADER_FOOTER_MIN_FRACTION)
        .ceil()
        .max(HEADER_FOOTER_MIN_PAGES as f32) as usize;

    set.extend(
        counts
            .into_iter()
            .filter(|(_, pages_seen)| pages_seen.len() >= threshold)
            .map(|((_, norm), _)| norm),
    );
    set
}
// Single-page chrome detection: a line counts as "top band" when its bottom
// edge is at most `page_height * SP_TOP_BAND_FRACTION`.
const SP_TOP_BAND_FRACTION: f32 = 0.15;
// Single-page chrome detection: a line counts as "bottom band" when its top
// edge is at least `page_height * (1 - SP_BOTTOM_BAND_FRACTION)`.
const SP_BOTTOM_BAND_FRACTION: f32 = 0.15;
// Multiplier applied to the reference size (body font size, or the line's own
// height as a fallback) to obtain the vertical gap a band line must keep from
// the nearest non-chrome line to be considered isolated.
const SP_ISOLATION_GAP_RATIO: f32 = 1.0;
/// Returns `true` when `text` looks like page furniture ("chrome") rather
/// than content.
///
/// The text is trimmed and matched case-insensitively against, in order:
/// URL / DOI markers, publisher boilerplate phrases, copyright notices,
/// a `page <digit>` prefix, a bare number of at most four digits, a
/// word-initial `vol` followed by `.`, space or `,` with a digit somewhere
/// after it, and finally a short line (at most 120 bytes) that contains both
/// a year and a digit range. Empty text never matches.
fn matches_chrome_pattern(text: &str) -> bool {
    const LINK_MARKERS: &[&str] = &[
        "http://",
        "https://",
        " www.",
        "doi:",
        "doi.org/",
        "dx.doi.org",
    ];
    const PUBLISHER_PHRASES: &[&str] = &[
        "please cite this article",
        "contents lists available at",
        "available online at",
        "downloaded from",
    ];
    const RIGHTS_PHRASES: &[&str] = &["copyright ", "all rights reserved"];

    /// True when `haystack` contains at least one of `needles`.
    fn mentions_any(haystack: &str, needles: &[&str]) -> bool {
        needles.iter().any(|&needle| haystack.contains(needle))
    }

    let t = text.trim();
    if t.is_empty() {
        return false;
    }
    let lower = t.to_lowercase();

    // Links and DOIs.
    if lower.starts_with("www.") || mentions_any(&lower, LINK_MARKERS) {
        return true;
    }

    // Publisher boilerplate and rights notices.
    if mentions_any(&lower, PUBLISHER_PHRASES)
        || t.contains('©')
        || mentions_any(&lower, RIGHTS_PHRASES)
    {
        return true;
    }

    // "page 12 ..." style labels: the prefix must be followed by a digit.
    let first_after_page = lower
        .strip_prefix("page ")
        .and_then(|rest| rest.chars().next());
    if matches!(first_after_page, Some(c) if c.is_ascii_digit()) {
        return true;
    }

    // A bare page number such as "9" or "1234".
    if t.len() <= 4 && t.bytes().all(|b| b.is_ascii_digit()) {
        return true;
    }

    // "vol. 12" / "vol 3" / "vol, 7": word-initial, with a digit later on.
    let lb = lower.as_bytes();
    let cites_volume = lb.windows(4).enumerate().any(|(at, quad)| {
        let word_start = at == 0 || !lb[at - 1].is_ascii_alphanumeric();
        word_start
            && quad.starts_with(b"vol")
            && matches!(quad[3], b'.' | b' ' | b',')
            && lb[at + 4..].iter().any(u8::is_ascii_digit)
    });
    if cites_volume {
        return true;
    }

    // Short journal-citation style line: a year plus a page range.
    t.len() <= 120 && has_year(&lower) && has_digit_range(t)
}
/// Reports whether `lower` contains a standalone four-digit year of the form
/// `19xx` or `20xx`.
///
/// "Standalone" means the byte before the year (if any) and the byte after it
/// (if any) are not ASCII alphanumerics, so `"x2023"`, `"20231"` and `"2023a"`
/// do not count while `"(2017)"` and `"ms-2024-07"` do.
fn has_year(lower: &str) -> bool {
    let raw = lower.as_bytes();

    raw.windows(4).enumerate().any(|(start, quad)| {
        let century_ok = matches!((quad[0], quad[1]), (b'1', b'9') | (b'2', b'0'));
        let tail_ok = quad[2].is_ascii_digit() && quad[3].is_ascii_digit();
        let left_boundary = start == 0 || !raw[start - 1].is_ascii_alphanumeric();
        let right_boundary = match raw.get(start + 4) {
            Some(next) => !next.is_ascii_alphanumeric(),
            None => true,
        };
        left_boundary && century_ok && tail_ok && right_boundary
    })
}
/// Reports whether `t` contains a numeric range such as `1-9`, `12 – 34` or
/// `2017—2018`: a run of ASCII digits, optional spaces, a hyphen / en dash /
/// em dash, optional spaces, and another run of ASCII digits.
///
/// A pair whose first number is a four-digit `19xx`/`20xx` year and whose
/// second number has at most two digits (for example `2024-07`) is treated as
/// a date fragment, not a range; scanning resumes after the second number.
fn has_digit_range(t: &str) -> bool {
    /// Index of the first char at or after `from` that fails `keep`
    /// (or `chars.len()` when every remaining char passes).
    fn run_end(chars: &[char], from: usize, keep: impl Fn(char) -> bool) -> usize {
        chars[from..]
            .iter()
            .position(|&c| !keep(c))
            .map_or(chars.len(), |offset| from + offset)
    }

    let glyphs: Vec<char> = t.chars().collect();
    let mut pos = 0;

    while pos < glyphs.len() {
        if !glyphs[pos].is_ascii_digit() {
            pos += 1;
            continue;
        }

        // First number, then optional spaces up to a would-be dash.
        let first_end = run_end(&glyphs, pos, |c| c.is_ascii_digit());
        let dash_at = run_end(&glyphs, first_end, |c| c == ' ');
        let has_dash = matches!(glyphs.get(dash_at).copied(), Some('-' | '–' | '—'));
        if !has_dash {
            pos = first_end;
            continue;
        }

        // Optional spaces after the dash, then the second number must start.
        let second_start = run_end(&glyphs, dash_at + 1, |c| c == ' ');
        let second_is_number =
            matches!(glyphs.get(second_start), Some(c) if c.is_ascii_digit());
        if !second_is_number {
            pos = first_end;
            continue;
        }
        let second_end = run_end(&glyphs, second_start, |c| c.is_ascii_digit());

        let first = &glyphs[pos..first_end];
        let first_is_year =
            first.len() == 4 && matches!((first[0], first[1]), ('1', '9') | ('2', '0'));
        let second_len = second_end - second_start;
        if !(first_is_year && second_len <= 2) {
            return true;
        }

        // Year-month style pair: skip past both numbers and keep looking.
        pos = second_end;
    }

    false
}
/// Finds lines on a single page that look like page chrome (URLs, citation
/// footers, page numbers, ...) without relying on cross-page repetition.
///
/// A line is reported when all of the following hold:
/// * its trimmed text is non-empty and at most 200 characters long;
/// * it lies entirely in the top band (bottom edge at most
///   `page_height * SP_TOP_BAND_FRACTION`) or entirely in the bottom band (top
///   edge at least `page_height * (1 - SP_BOTTOM_BAND_FRACTION)`);
/// * [`matches_chrome_pattern`] accepts its text;
/// * it is isolated: no non-chrome line on the body side of it (below a top
///   line, above a bottom line) starts closer than
///   `SP_ISOLATION_GAP_RATIO * body_size`. When `body_size` is not positive,
///   the line's own height (at least 1.0) is used instead.
///
/// Returns the indices into `page.projected_lines` of the lines to strip. The
/// set is empty for a page with no lines or a non-positive `page_height`.
pub fn detect_single_page_chrome(
    page: &ParsedPage,
    body_size: f32,
) -> std::collections::HashSet<usize> {
    use std::collections::HashSet;

    let mut out: HashSet<usize> = HashSet::new();
    let lines = &page.projected_lines;
    let h = page.page_height;
    if lines.is_empty() || h <= 0.0 {
        return out;
    }

    let top_cutoff = h * SP_TOP_BAND_FRACTION;
    let bottom_cutoff = h * (1.0 - SP_BOTTOM_BAND_FRACTION);

    // (top edge, bottom edge) of every line, computed once.
    let spans: Vec<(f32, f32)> = lines
        .iter()
        .map(|l| (l.bbox.y, l.bbox.y + l.bbox.height))
        .collect();

    // Pattern matching lower-cases and scans the text, so each line's verdict
    // is memoised and only computed for lines that are actually consulted.
    let mut chrome_cache: Vec<Option<bool>> = vec![None; lines.len()];
    let mut is_chrome = |i: usize| -> bool {
        *chrome_cache[i].get_or_insert_with(|| matches_chrome_pattern(lines[i].text.trim()))
    };

    for (idx, line) in lines.iter().enumerate() {
        let text = line.text.trim();
        if text.is_empty() {
            continue;
        }

        let (own_top, own_bottom) = spans[idx];
        let in_top = own_bottom <= top_cutoff;
        let in_bottom = own_top >= bottom_cutoff;
        if !(in_top || in_bottom) {
            continue;
        }

        // Cheap per-line rejections first; the neighbour scan below is O(lines).
        if text.chars().count() > 200 || !is_chrome(idx) {
            continue;
        }

        let gap_ref = if body_size > 0.0 {
            body_size
        } else {
            line.bbox.height.max(1.0)
        };
        let required_gap = SP_ISOLATION_GAP_RATIO * gap_ref;

        // The line is crowded when some non-chrome line on its body side sits
        // closer than the required gap. Other chrome lines are ignored so that
        // stacked chrome lines do not shield each other. When `beyond` holds,
        // `gap` is never NaN, so `<` is the exact negation of `>=` here.
        let crowded = spans.iter().enumerate().any(|(j, &(top, bottom))| {
            let (beyond, gap) = if in_top {
                (top > own_bottom, top - own_bottom)
            } else {
                (bottom < own_top, own_top - bottom)
            };
            j != idx && beyond && gap < required_gap && !is_chrome(j)
        });
        if crowded {
            continue;
        }

        out.insert(idx);
    }

    out
}
/// Decides whether `line` is one of the repeating headers/footers recorded in
/// `header_footer`.
///
/// The line must lie in the header band (`bbox.y` at most
/// `page_height * HEADER_BAND_FRACTION`) or the footer band (bottom edge at
/// least `page_height * (1 - FOOTER_BAND_FRACTION)`), and its
/// [`normalize_for_repetition`] form must be present in `header_footer`.
///
/// Returns `false` when the set is empty or when `page.page_height` is not
/// positive. With a zero height both cutoffs collapse to 0 and every line
/// would be "in band", so body text that happens to match an entry would be
/// misclassified.
pub(super) fn is_header_or_footer(
    line: &ProjectedLine,
    page: &ParsedPage,
    header_footer: &std::collections::HashSet<String>,
) -> bool {
    if header_footer.is_empty() || page.page_height <= 0.0 {
        return false;
    }

    let header_cutoff = page.page_height * HEADER_BAND_FRACTION;
    let footer_cutoff = page.page_height * (1.0 - FOOTER_BAND_FRACTION);
    let line_bottom = line.bbox.y + line.bbox.height;

    let in_band = line.bbox.y <= header_cutoff || line_bottom >= footer_cutoff;
    if !in_band {
        return false;
    }

    // Only normalize (allocates) once the cheap geometric test has passed.
    header_footer.contains(&normalize_for_repetition(line.text.trim()))
}
// Unit tests for repetition-based header/footer detection and for the
// single-page chrome heuristics. Fixtures come from `test_helpers`, which is
// not visible from this file.
#[cfg(test)]
mod tests {
// NOTE(review): `header_footer_page` presumably takes (page number, header
// text, footer text, body text) — confirm against test_helpers.
use super::super::test_helpers::header_footer_page;
use super::*;
// Digit runs collapse to a single '#' and case is folded.
#[test]
fn normalize_collapses_digits_and_case() {
assert_eq!(normalize_for_repetition("Page 1 of 6"), "page # of #");
assert_eq!(normalize_for_repetition("PAGE 12 OF 6"), "page # of #");
assert_eq!(normalize_for_repetition("Confidential"), "confidential");
}
// The same header on every page, and footers that differ only in their page
// number, must both end up in the set in normalized form.
#[test]
fn detects_repeating_header_and_footer() {
let pages = vec![
header_footer_page(1, "Acme Confidential", "Page 1 of 3", "Body one."),
header_footer_page(2, "Acme Confidential", "Page 2 of 3", "Body two."),
header_footer_page(3, "Acme Confidential", "Page 3 of 3", "Body three."),
];
let set = compute_header_footer_set(&pages);
assert!(set.contains("acme confidential"));
assert!(set.contains("page # of #"));
}
// Fewer than HEADER_FOOTER_MIN_PAGES pages: detection is skipped entirely.
#[test]
fn skips_repetition_check_on_single_page() {
let pages = vec![header_footer_page(1, "Solo", "footer", "body")];
let set = compute_header_footer_set(&pages);
assert!(set.is_empty());
}
// Body text repeated on every page must not be reported.
// NOTE(review): lines 0 and 2 are presumably the header and footer produced by
// the helper — confirm. Their per-page suffix is a digit, so after
// normalization they are identical across pages ("unique header #"); only the
// body text is asserted on here.
#[test]
fn body_text_not_classified_as_header() {
let mut pages = Vec::new();
for n in 1..=3 {
let mut p = header_footer_page(n, "unique header", "unique footer", "shared body text");
p.projected_lines[0].text = format!("unique header {n}");
p.projected_lines[2].text = format!("unique footer {n}");
pages.push(p);
}
let set = compute_header_footer_set(&pages);
assert!(!set.contains("shared body text"));
}
// NOTE(review): `line` presumably takes (text, x, y, height, font size) and
// `page` wraps lines in a page of fixed height — confirm against test_helpers.
use super::super::test_helpers::{line, page};
// Positive and negative samples for the text-only chrome heuristics, plus
// direct checks that year-month fragments are not treated as digit ranges.
#[test]
fn chrome_pattern_recognizes_common_signatures() {
assert!(matches_chrome_pattern("http://example.com/foo"));
assert!(matches_chrome_pattern("www.nature.com/scientificreports/"));
assert!(matches_chrome_pattern(
"Please cite this article in press as: ..."
));
assert!(matches_chrome_pattern("Page 12 of 24"));
assert!(matches_chrome_pattern("9"));
assert!(matches_chrome_pattern("© 2023 Acme Corp"));
assert!(matches_chrome_pattern(
"Cell Chemical Biology 24, 1–9, November 16, 2017"
));
assert!(!matches_chrome_pattern(
"The quick brown fox jumps over the lazy dog."
));
assert!(!matches_chrome_pattern("Introduction"));
// A year without a digit range is not enough.
assert!(!matches_chrome_pattern("Acme Annual Report 2023"));
// Contains a year-like "2024-07" but no genuine range.
assert!(!matches_chrome_pattern(
"SERFF Tracking #: FBLB-134215544 State Tracking #: Company Tracking #: MS-2024-07"
));
assert!(!has_digit_range("Filed 2024-07"));
assert!(!has_digit_range("Generated 2025-01-23"));
assert!(has_digit_range("Vol 24, 1-9, 2017"));
}
// A URL near the top, well separated from the body, is stripped; the body
// lines are kept.
#[test]
fn detects_top_url_chrome_on_single_page() {
let lines = vec![
line("www.nature.com/scientificreports/", 50.0, 20.0, 10.0, 10.0),
line("Main Body Title", 50.0, 200.0, 14.0, 14.0),
line("Body prose line one.", 50.0, 220.0, 10.0, 10.0),
line("Body prose line two.", 50.0, 232.0, 10.0, 10.0),
];
let p = page(lines);
let strip = detect_single_page_chrome(&p, 10.0);
assert!(strip.contains(&0), "top-band URL should strip");
assert!(!strip.contains(&1));
assert!(!strip.contains(&2));
}
// A journal citation (year + page range) near the bottom is stripped.
#[test]
fn detects_bottom_journal_citation_chrome() {
let lines = vec![
line("Body line.", 50.0, 300.0, 10.0, 10.0),
line("More body.", 50.0, 312.0, 10.0, 10.0),
line(
"Cell Chemical Biology 24, 1–9, November 16, 2017",
50.0,
770.0,
10.0,
10.0,
),
];
let p = page(lines);
let strip = detect_single_page_chrome(&p, 10.0);
assert!(strip.contains(&2), "bottom journal-cite should strip");
assert!(!strip.contains(&0));
}
// Position alone is not enough: a title in the top band that matches no chrome
// pattern must be kept.
#[test]
fn preserves_title_at_top_without_chrome_pattern() {
let lines = vec![
line("My Important Document", 50.0, 30.0, 18.0, 18.0),
line("Author Name", 50.0, 60.0, 10.0, 10.0),
line("Body prose here.", 50.0, 200.0, 10.0, 10.0),
];
let p = page(lines);
let strip = detect_single_page_chrome(&p, 10.0);
assert!(
strip.is_empty(),
"title without chrome pattern must survive, got {:?}",
strip
);
}
// A chrome-looking line followed immediately by body text (gap smaller than
// the required isolation gap) is treated as part of the body.
#[test]
fn chrome_with_no_isolation_gap_is_not_stripped() {
let lines = vec![
line("http://example.com/foo", 50.0, 20.0, 10.0, 10.0),
line("Body line right after.", 50.0, 32.0, 10.0, 10.0),
line("Continuing body.", 50.0, 44.0, 10.0, 10.0),
];
let p = page(lines);
let strip = detect_single_page_chrome(&p, 10.0);
assert!(
strip.is_empty(),
"no isolation gap means it's part of body, got {:?}",
strip
);
}
}