use super::*;
use crate::PdfFigure as Figure;
use crate::element::{ImageElement, PageElement, TextElement};
use crate::headings::FontSignature;
use crate::zone::BBox;
/// Returns the one font signature shared by every text fixture in this module.
///
/// All text elements built by `text_at` carry this same signature, so tests here
/// never differ by font.
fn fs() -> FontSignature {
    // NOTE(review): presumably (size, bold, italic) — confirm against `FontSignature::new`.
    let size = 10.0;
    let flags = (false, false);
    FontSignature::new(size, flags.0, flags.1)
}
/// Builds a text `PageElement` spanning `left..right` horizontally and centred
/// vertically on `y_center`.
///
/// The vertical extent is derived from `height`: `top` is `y_center` plus half the
/// height and `bottom` is `y_center` minus half the height, so `top` is the larger
/// coordinate for a positive height. The element always uses the signature from `fs()`.
fn text_at(text: &str, left: f32, right: f32, y_center: f32, height: f32) -> PageElement {
    let half_height = height / 2.0;
    let element = TextElement {
        text: String::from(text),
        sig: fs(),
        left,
        right,
        top: y_center + half_height,
        bottom: y_center - half_height,
    };
    PageElement::Text(element)
}
/// Builds a page bounding box with a fixed horizontal extent of `0.0..600.0` and the
/// given vertical edges.
///
/// `y_top` and `y_bottom` are stored unchanged as the box's `top` and `bottom`.
fn page_bbox(y_top: f32, y_bottom: f32) -> BBox {
    // Every fixture page in this module shares the same width.
    let (left, right) = (0.0, 600.0);
    BBox {
        left,
        right,
        top: y_top,
        bottom: y_bottom,
    }
}
/// Builds a page whose only element is one text line containing `header`.
///
/// The line spans x = 50..550 and is 12 units tall, centred at y = 790.
fn header_only_page(header: &str) -> Vec<PageElement> {
    let header_line = text_at(header, 50.0, 550.0, 790.0, 12.0);
    vec![header_line]
}
/// Headers that alternate their word order from page to page (and end in a word unique
/// to each page) must yield no full-line banner, but their shared words must be
/// reported as banner tokens. A line made only of those tokens must then be stripped.
#[test]
fn alternating_headers_caught_via_tokens() {
    // One distinct trailing word per page, so no two header lines are identical.
    let unique_words = [
        "alpha", "bravo", "charlie", "delta", "echo", "foxtrot", "golf", "hotel", "india",
        "juliet", "kilo", "lima",
    ];

    // Even-indexed pages lead with HLAVA, odd-indexed pages lead with PŘEDPIS.
    let per_page_elements: Vec<Vec<PageElement>> = unique_words
        .iter()
        .enumerate()
        .map(|(page_idx, word)| {
            let header = if page_idx % 2 == 0 {
                format!("HLAVA PŘEDPIS L14 {}", word)
            } else {
                format!("PŘEDPIS L14 HLAVA {}", word)
            };
            header_only_page(&header)
        })
        .collect();
    let per_page_bbox: Vec<BBox> = unique_words
        .iter()
        .map(|_| page_bbox(800.0, 0.0))
        .collect();

    let (lines, tokens, image_hashes) = detect_banners(&per_page_elements, &per_page_bbox);

    assert!(
        lines.is_empty(),
        "expected no full-line banner matches, got {:?}",
        lines
    );
    assert!(
        tokens.contains("HLAVA"),
        "expected HLAVA in banner_tokens, got {:?}",
        tokens
    );
    assert!(
        tokens.contains("PŘEDPIS"),
        "expected PŘEDPIS in banner_tokens, got {:?}",
        tokens
    );

    // A fresh page whose single line consists solely of the two expected banner tokens.
    let probe_page = header_only_page("HLAVA PŘEDPIS");
    let probe_bbox = page_bbox(800.0, 0.0);
    let cleaned = strip_banners(probe_page, probe_bbox, &lines, &tokens, &image_hashes);

    assert!(
        cleaned.is_empty(),
        "alternating-header line should be stripped via token coverage; got {} elements",
        cleaned.len()
    );
}
/// A date footer repeated on every page must be detected as a banner line and removed
/// by `strip_banners`, leaving only the body text.
#[test]
fn date_footer_token_stripped() {
    let n_pages = 8;

    // Every page carries the same date footer near the bottom plus one body line.
    let per_page_elements: Vec<Vec<PageElement>> = (0..n_pages)
        .map(|_| {
            vec![
                text_at("25 . 12 . 2025", 250.0, 350.0, 10.0, 12.0),
                text_at("body content", 50.0, 550.0, 400.0, 12.0),
            ]
        })
        .collect();
    let per_page_bbox: Vec<BBox> = (0..n_pages).map(|_| page_bbox(800.0, 0.0)).collect();

    let (lines, tokens, image_hashes) = detect_banners(&per_page_elements, &per_page_bbox);

    // The detected line is expected to contain the literal two characters `\d`.
    // NOTE(review): presumably digits are normalized to that placeholder — confirm
    // against `detect_banners`.
    let has_normalized_date = lines.iter().any(|line| line.contains("\\d"));
    assert!(
        has_normalized_date,
        "expected normalized date in banner_lines, got {:?}",
        lines
    );

    let probe_page = vec![
        text_at("25 . 12 . 2025", 250.0, 350.0, 10.0, 12.0),
        text_at("body content", 50.0, 550.0, 400.0, 12.0),
    ];
    let probe_bbox = page_bbox(800.0, 0.0);
    let cleaned = strip_banners(probe_page, probe_bbox, &lines, &tokens, &image_hashes);

    assert_eq!(
        cleaned.len(),
        1,
        "date footer should be stripped; expected only body element to remain"
    );
    match &cleaned[0] {
        PageElement::Text(kept) => assert_eq!(kept.text, "body content"),
        _ => panic!("expected text element after strip"),
    }
}
/// Builds an image `PageElement` spanning `left..right` horizontally and centred
/// vertically on `y_center`.
///
/// The wrapped figure is a PNG placeholder: it carries the given `hash`, an empty byte
/// buffer and no name. `top` and `bottom` sit half of `height` above and below
/// `y_center` respectively.
fn image_at(hash: &str, left: f32, right: f32, y_center: f32, height: f32) -> PageElement {
    let half_height = height / 2.0;
    let figure = Figure {
        hash: String::from(hash),
        mime_type: String::from("image/png"),
        bytes: vec![],
        name: None,
    };
    PageElement::Image(ImageElement {
        figure,
        left,
        right,
        top: y_center + half_height,
        bottom: y_center - half_height,
    })
}
/// An image with the same hash placed near the top of every page must be reported in
/// the banner image hashes and removed by `strip_banners`, leaving the body text.
#[test]
fn image_banner_stripped_across_pages() {
    // Four pages: the same logo centred at y = 790, plus body text unique to each page.
    let mut per_page_elements: Vec<Vec<PageElement>> = (0..4)
        .map(|page_idx| {
            vec![
                image_at("logo-hash-0xDEADBEEF", 50.0, 150.0, 790.0, 20.0),
                text_at(
                    &format!("body content {}", page_idx),
                    50.0,
                    550.0,
                    400.0,
                    12.0,
                ),
            ]
        })
        .collect();
    let per_page_bbox: Vec<BBox> = (0..4).map(|_| page_bbox(800.0, 0.0)).collect();

    let (lines, tokens, image_hashes) = detect_banners(&per_page_elements, &per_page_bbox);

    assert!(
        image_hashes.contains("logo-hash-0xDEADBEEF"),
        "expected logo hash in banner_image_hashes, got {:?}",
        image_hashes
    );

    // Move the first page out of the fixture (leaving an empty Vec behind) because
    // `strip_banners` takes the page by value.
    let first_page = std::mem::take(&mut per_page_elements[0]);
    let cleaned = strip_banners(
        first_page,
        page_bbox(800.0, 0.0),
        &lines,
        &tokens,
        &image_hashes,
    );

    assert_eq!(cleaned.len(), 1, "logo image should be stripped");
    match &cleaned[0] {
        PageElement::Text(kept) => assert_eq!(kept.text, "body content 0"),
        other => panic!("expected text-only body, got {:?}", element_kind(other)),
    }
}
/// Returns a short static label for the element's variant: `"text"` or `"image"`.
///
/// The tests use it in panic messages to name an unexpected element.
fn element_kind(el: &PageElement) -> &'static str {
    // Wildcard payload patterns bind nothing, so matching through the dereference
    // does not move out of the borrowed element.
    match *el {
        PageElement::Image(_) => "image",
        PageElement::Text(_) => "text",
    }
}
/// An image that repeats on every page but sits mid-page must not be classified as a
/// banner image.
#[test]
fn body_image_not_stripped() {
    // Four pages: the same image centred at y = 400 on a page spanning 0..800,
    // plus body text unique to each page.
    let per_page_elements: Vec<Vec<PageElement>> = (0..4)
        .map(|page_idx| {
            vec![
                image_at("body-image-hash", 200.0, 400.0, 400.0, 100.0),
                text_at(
                    &format!("body content {}", page_idx),
                    50.0,
                    550.0,
                    500.0,
                    12.0,
                ),
            ]
        })
        .collect();
    let per_page_bbox: Vec<BBox> = (0..4).map(|_| page_bbox(800.0, 0.0)).collect();

    // Only the image-hash result matters for this test.
    let (_lines, _tokens, image_hashes) = detect_banners(&per_page_elements, &per_page_bbox);

    let flagged = image_hashes.contains("body-image-hash");
    assert!(
        !flagged,
        "body-band image must NOT be classified as banner"
    );
}
/// A lone digit near the bottom edge of the page must be removed by
/// `strip_bottom_band_bare_digits`, while the body text is kept.
#[test]
fn bottom_band_bare_digit_dropped() {
    let body = text_at("body content", 50.0, 550.0, 400.0, 12.0);
    // The digit "7" is centred at y = 10 on a page spanning 0..800.
    let page_number = text_at("7", 295.0, 305.0, 10.0, 12.0);
    let bbox = page_bbox(800.0, 0.0);

    let cleaned = strip_bottom_band_bare_digits(vec![body, page_number], bbox);

    assert_eq!(cleaned.len(), 1, "bare-digit footer should be stripped");
    match &cleaned[0] {
        PageElement::Text(kept) => assert_eq!(kept.text, "body content"),
        other => panic!("expected body text, got {:?}", element_kind(other)),
    }
}
/// Text near the bottom edge of the page that is not a bare digit must be left in place
/// by `strip_bottom_band_bare_digits`.
#[test]
fn bottom_band_non_digit_kept() {
    let body = text_at("body content", 50.0, 550.0, 400.0, 12.0);
    // Same vertical position as a page-number footer (y = 10), but non-digit text.
    let footer = text_at("Confidential", 100.0, 500.0, 10.0, 12.0);
    let bbox = page_bbox(800.0, 0.0);

    let cleaned = strip_bottom_band_bare_digits(vec![body, footer], bbox);

    assert_eq!(cleaned.len(), 2, "non-digit content must be preserved");
}
/// A bare digit located mid-page must be left in place by
/// `strip_bottom_band_bare_digits`.
#[test]
fn body_band_bare_digit_kept() {
    let label = text_at("Section", 50.0, 100.0, 400.0, 12.0);
    // The digit shares the label's vertical centre (y = 400) on a page spanning 0..800.
    let number = text_at("7", 110.0, 120.0, 400.0, 12.0);
    let bbox = page_bbox(800.0, 0.0);

    let cleaned = strip_bottom_band_bare_digits(vec![label, number], bbox);

    assert_eq!(cleaned.len(), 2, "body-band digit must be preserved");
}