use std::collections::{HashMap, HashSet};
use super::element::{PageElement, TextElement};
use super::zone::{BBox, Bounded};
/// Returns the smallest box that encloses the bounding boxes of all `elements`.
///
/// The union keeps the smallest `left`/`bottom` and the largest `right`/`top`
/// seen across the elements. An empty slice yields `BBox::empty()`.
pub(super) fn bbox_of_elements(elements: &[PageElement]) -> BBox {
    let Some((first, rest)) = elements.split_first() else {
        return BBox::empty();
    };
    // Seed with the first element's box, then widen it with every other one.
    rest.iter().fold(first.bbox(), |union, element| {
        let next = element.bbox();
        BBox {
            left: union.left.min(next.left),
            right: union.right.max(next.right),
            top: union.top.max(next.top),
            bottom: union.bottom.min(next.bottom),
        }
    })
}
/// Fraction of the page bbox height, measured inward from the top edge and
/// from the bottom edge, that is treated as the banner band.
pub(super) const BANNER_BAND_FRACTION: f32 = 0.10;
/// Minimum number of pages required before `detect_banners` reports anything;
/// also the lower bound of its recurrence threshold.
const BANNER_MIN_PAGES: usize = 3;
/// Minimum length, counted in `char`s, of a whitespace-separated token kept by
/// `tokenize_banner`.
const BANNER_MIN_TOKEN_LEN: usize = 4;
/// Finds text lines, tokens and image hashes that recur in the top/bottom
/// bands of enough pages to be treated as banners.
///
/// For every page, the band text is collected with `band_lines`, normalized
/// with `normalize_banner` and split with `tokenize_banner`; images are
/// considered when their vertical centre lies within `BANNER_BAND_FRACTION`
/// of the page bbox height from the top or bottom edge. Each distinct line,
/// token and image hash is counted **at most once per page**, so the counts
/// are page counts and can be compared against the page-based threshold
/// `max(BANNER_MIN_PAGES, ceil(n_pages / 4))`.
///
/// Returns `(lines, tokens, image_hashes)`: the normalized lines, the tokens
/// and the image hashes whose page count reaches the threshold. All three
/// sets are empty when fewer than `BANNER_MIN_PAGES` pages are supplied.
///
/// # Panics
///
/// Panics if `per_page_bbox` has fewer entries than `per_page_elements`
/// (only checked once at least `BANNER_MIN_PAGES` pages are supplied).
pub(super) fn detect_banners(
    per_page_elements: &[Vec<PageElement>],
    per_page_bbox: &[BBox],
) -> (HashSet<String>, HashSet<String>, HashSet<String>) {
    let n_pages = per_page_elements.len();
    if n_pages < BANNER_MIN_PAGES {
        return (HashSet::new(), HashSet::new(), HashSet::new());
    }
    // Fail up front with a clear message instead of an index panic mid-loop.
    assert!(
        per_page_bbox.len() >= n_pages,
        "detect_banners: {} page bboxes supplied for {} pages",
        per_page_bbox.len(),
        n_pages
    );
    let threshold = BANNER_MIN_PAGES.max(n_pages.div_ceil(4));

    let mut line_counts: HashMap<String, usize> = HashMap::new();
    let mut token_counts: HashMap<String, usize> = HashMap::new();
    let mut image_counts: HashMap<String, usize> = HashMap::new();

    for (elements, bbox) in per_page_elements.iter().zip(per_page_bbox) {
        // Deduplicate per page: a line repeated within one page (e.g. in both
        // header and footer) must count as a single page, not several.
        let mut page_lines: HashSet<String> = HashSet::new();
        let mut page_tokens: HashSet<String> = HashSet::new();
        for line in band_lines(elements, bbox) {
            let normalized = normalize_banner(&line);
            if normalized.is_empty() {
                continue;
            }
            page_tokens.extend(tokenize_banner(&normalized));
            page_lines.insert(normalized);
        }

        let mut page_image_hashes: HashSet<String> = HashSet::new();
        let height = bbox.height();
        if height > 0.0 {
            let top_cutoff = bbox.top - height * BANNER_BAND_FRACTION;
            let bottom_cutoff = bbox.bottom + height * BANNER_BAND_FRACTION;
            for el in elements {
                if let PageElement::Image(im) = el {
                    let cy = (im.top + im.bottom) / 2.0;
                    if cy >= top_cutoff || cy <= bottom_cutoff {
                        page_image_hashes.insert(im.figure.hash.clone());
                    }
                }
            }
        }

        for line in page_lines {
            *line_counts.entry(line).or_default() += 1;
        }
        for tok in page_tokens {
            *token_counts.entry(tok).or_default() += 1;
        }
        for hash in page_image_hashes {
            *image_counts.entry(hash).or_default() += 1;
        }
    }

    // Keep only the keys seen on at least `threshold` pages.
    let recurring = |counts: HashMap<String, usize>| -> HashSet<String> {
        counts
            .into_iter()
            .filter(|(_, pages)| *pages >= threshold)
            .map(|(key, _)| key)
            .collect()
    };

    (
        recurring(line_counts),
        recurring(token_counts),
        recurring(image_counts),
    )
}
/// Collects the text found in the top or bottom band of a page and returns it
/// as visual lines, each joined into one string.
///
/// A text element belongs to a band when its vertical centre is within
/// `BANNER_BAND_FRACTION` of the bbox height from the top or the bottom edge.
/// Band elements are ordered by descending vertical centre (ties by `left`)
/// and grouped: an element joins the current line when its centre is within
/// half of its own height (height floored at 1.0) of the centre of the
/// element that opened the line. Each line is then ordered by `left`, its
/// trimmed non-empty texts are joined with single spaces, and lines that end
/// up empty are dropped.
///
/// Returns an empty vector when the bbox height is not positive or no text
/// lies in either band.
fn band_lines(elements: &[PageElement], bbox: &BBox) -> Vec<String> {
    use std::cmp::Ordering;

    let page_height = bbox.height();
    if page_height <= 0.0 {
        return Vec::new();
    }
    let upper_edge = bbox.top - page_height * BANNER_BAND_FRACTION;
    let lower_edge = bbox.bottom + page_height * BANNER_BAND_FRACTION;

    let mut in_band: Vec<&TextElement> = Vec::new();
    for element in elements {
        if let PageElement::Text(text) = element {
            let mid = text.y_center();
            if mid >= upper_edge || mid <= lower_edge {
                in_band.push(text);
            }
        }
    }
    if in_band.is_empty() {
        return Vec::new();
    }

    // Highest centre first; left-to-right among equal centres.
    in_band.sort_by(|a, b| {
        let vertical = b
            .y_center()
            .partial_cmp(&a.y_center())
            .unwrap_or(Ordering::Equal);
        let horizontal = a.left.partial_cmp(&b.left).unwrap_or(Ordering::Equal);
        vertical.then(horizontal)
    });

    let mut groups: Vec<Vec<&TextElement>> = Vec::new();
    // Vertical centre of the element that opened the most recent group.
    let mut anchor_y: Option<f32> = None;
    for text in in_band {
        let mid = text.y_center();
        let tolerance = (text.top - text.bottom).abs().max(1.0) * 0.5;
        let joins_current = matches!(anchor_y, Some(y) if (mid - y).abs() <= tolerance);
        if joins_current {
            // `anchor_y` is only set after a group was pushed, so one exists.
            if let Some(group) = groups.last_mut() {
                group.push(text);
            }
        } else {
            anchor_y = Some(mid);
            groups.push(vec![text]);
        }
    }

    let mut rendered: Vec<String> = Vec::new();
    for mut group in groups {
        group.sort_by(|a, b| a.left.partial_cmp(&b.left).unwrap_or(Ordering::Equal));
        let pieces: Vec<&str> = group
            .iter()
            .map(|t| t.text.trim())
            .filter(|piece| !piece.is_empty())
            .collect();
        let joined = pieces.join(" ");
        if !joined.is_empty() {
            rendered.push(joined);
        }
    }
    rendered
}
/// Normalizes a banner line so that lines differing only in numbers or
/// spacing compare equal.
///
/// Leading/trailing whitespace is removed, every run of whitespace becomes a
/// single space, and every run of consecutive ASCII digits is replaced by the
/// two characters `\d`. Blank input yields an empty string.
pub(super) fn normalize_banner(s: &str) -> String {
    let mut normalized = String::with_capacity(s.len());
    // Digit runs never span whitespace, so each word can be handled alone.
    for word in s.split_whitespace() {
        if !normalized.is_empty() {
            normalized.push(' ');
        }
        let mut in_digit_run = false;
        for ch in word.chars() {
            let is_digit = ch.is_ascii_digit();
            if !is_digit {
                normalized.push(ch);
            } else if !in_digit_run {
                // Emit the placeholder once, at the start of the run.
                normalized.push_str("\\d");
            }
            in_digit_run = is_digit;
        }
    }
    normalized
}
/// Splits an already-normalized banner line on whitespace and keeps the
/// words that are at least `BANNER_MIN_TOKEN_LEN` `char`s long, in their
/// original order.
fn tokenize_banner(normalized: &str) -> Vec<String> {
    let mut tokens = Vec::new();
    for word in normalized.split_whitespace() {
        if word.chars().count() >= BANNER_MIN_TOKEN_LEN {
            tokens.push(word.to_owned());
        }
    }
    tokens
}
/// Removes banner images and banner text lines from the top/bottom bands of
/// one page, returning the remaining elements in their original order.
///
/// * `elements` – the page's elements; returned unchanged when nothing matches.
/// * `bbox` – the page bounds; a band is the outer `BANNER_BAND_FRACTION` of
///   its height at the top and at the bottom.
/// * `banner_lines` – normalized lines to drop on an exact match.
/// * `banner_tokens` – tokens; a line is dropped when it yields at least one
///   token and every one of its tokens is in this set.
/// * `banner_image_hashes` – hashes of images to drop when they sit in a band.
///
/// Band membership is decided by an element's vertical centre. Band text is
/// grouped into lines: elements are ordered by descending centre and an
/// element joins the current line when its centre is within half of its own
/// height (floored at 1.0) of the centre of the element that opened the line.
/// A matching line removes every text element that belongs to it.
///
/// Elements are returned untouched when all three sets are empty or the bbox
/// height is not positive.
pub(super) fn strip_banners(
    elements: Vec<PageElement>,
    bbox: BBox,
    banner_lines: &HashSet<String>,
    banner_tokens: &HashSet<String>,
    banner_image_hashes: &HashSet<String>,
) -> Vec<PageElement> {
    use std::cmp::Ordering;

    let nothing_to_match =
        banner_lines.is_empty() && banner_tokens.is_empty() && banner_image_hashes.is_empty();
    if nothing_to_match {
        return elements;
    }
    let page_height = bbox.height();
    if page_height <= 0.0 {
        return elements;
    }
    let upper_edge = bbox.top - page_height * BANNER_BAND_FRACTION;
    let lower_edge = bbox.bottom + page_height * BANNER_BAND_FRACTION;
    let in_band = |y: f32| y >= upper_edge || y <= lower_edge;

    // One pass: mark banner images and gather band text with its position.
    let mut doomed: HashSet<usize> = HashSet::new();
    let mut band_texts: Vec<(usize, &TextElement)> = Vec::new();
    for (pos, element) in elements.iter().enumerate() {
        if let PageElement::Image(image) = element {
            let mid = (image.top + image.bottom) / 2.0;
            if in_band(mid) && banner_image_hashes.contains(&image.figure.hash) {
                doomed.insert(pos);
            }
        } else if let PageElement::Text(text) = element {
            if in_band(text.y_center()) {
                band_texts.push((pos, text));
            }
        }
    }
    if band_texts.is_empty() && doomed.is_empty() {
        return elements;
    }

    // Highest centre first; left-to-right among equal centres.
    band_texts.sort_by(|a, b| {
        let vertical = b
            .1
            .y_center()
            .partial_cmp(&a.1.y_center())
            .unwrap_or(Ordering::Equal);
        let horizontal = a.1.left.partial_cmp(&b.1.left).unwrap_or(Ordering::Equal);
        vertical.then(horizontal)
    });

    let mut groups: Vec<Vec<(usize, &TextElement)>> = Vec::new();
    // Vertical centre of the element that opened the most recent group.
    let mut anchor_y: Option<f32> = None;
    for entry in band_texts {
        let text = entry.1;
        let mid = text.y_center();
        let tolerance = (text.top - text.bottom).abs().max(1.0) * 0.5;
        let joins_current = matches!(anchor_y, Some(y) if (mid - y).abs() <= tolerance);
        if joins_current {
            // `anchor_y` is only set after a group was pushed, so one exists.
            if let Some(group) = groups.last_mut() {
                group.push(entry);
            }
        } else {
            anchor_y = Some(mid);
            groups.push(vec![entry]);
        }
    }

    for mut group in groups {
        group.sort_by(|a, b| a.1.left.partial_cmp(&b.1.left).unwrap_or(Ordering::Equal));
        let pieces: Vec<&str> = group
            .iter()
            .map(|(_, t)| t.text.trim())
            .filter(|piece| !piece.is_empty())
            .collect();
        let key = normalize_banner(&pieces.join(" "));

        let whole_line_known = banner_lines.contains(&key);
        let every_token_known = if key.is_empty() || banner_tokens.is_empty() {
            false
        } else {
            let tokens = tokenize_banner(&key);
            !tokens.is_empty() && tokens.iter().all(|tok| banner_tokens.contains(tok))
        };

        if whole_line_known || every_token_known {
            doomed.extend(group.iter().map(|&(pos, _)| pos));
        }
    }

    elements
        .into_iter()
        .enumerate()
        .filter(|(pos, _)| !doomed.contains(pos))
        .map(|(_, element)| element)
        .collect()
}
/// Fraction of the page bbox height, measured upward from the bottom edge,
/// inside which `strip_bottom_band_bare_digits` removes bare-digit text.
const FOOTER_DIGIT_BAND_FRACTION: f32 = 0.08;
/// Drops text elements near the bottom of the page whose trimmed text is one
/// to three ASCII digits.
///
/// Only text whose vertical centre is at or below
/// `bbox.bottom + height * FOOTER_DIGIT_BAND_FRACTION` is examined; all other
/// elements, including every non-text element, are kept in their original
/// order. The input is returned unchanged when the bbox height is not
/// positive.
pub(super) fn strip_bottom_band_bare_digits(
    elements: Vec<PageElement>,
    bbox: BBox,
) -> Vec<PageElement> {
    let page_height = bbox.height();
    if page_height <= 0.0 {
        return elements;
    }
    let band_ceiling = bbox.bottom + page_height * FOOTER_DIGIT_BAND_FRACTION;

    let mut elements = elements;
    elements.retain(|element| {
        let PageElement::Text(text) = element else {
            return true;
        };
        if text.y_center() > band_ceiling {
            return true;
        }
        let content = text.text.trim();
        // All-ASCII-digit content has one byte per character, so the byte
        // length is the digit count.
        let bare_digits =
            (1..=3).contains(&content.len()) && content.bytes().all(|b| b.is_ascii_digit());
        !bare_digits
    });
    elements
}
// Unit tests for this module are kept in the separate file `banner_tests.rs`
// and are only compiled for test builds.
#[cfg(test)]
#[path = "banner_tests.rs"]
mod tests;