use crate::layout::TextSpan;
/// Coarse layout category that `classify_region` assigns to a group of text spans.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum RegionClass {
/// Lines are long on average and more than half of them span most of the region width.
Prose,
/// Lines often open with a numbered marker, or their left edges show a hanging indent.
Reference,
/// Lines carry very little text on average.
Table,
/// A large share of lines contain a wide horizontal gap between neighbouring spans.
Form,
/// Fallback: the region is too small to judge, or none of the other rules matched.
Mixed,
}
impl RegionClass {
    /// Returns `true` for [`RegionClass::Prose`] and [`RegionClass::Reference`], and `false`
    /// for every other class.
    ///
    /// NOTE(review): judging by the name, callers use this to decide whether a column may
    /// be reordered; that call site is not visible from this file.
    pub fn is_reorderable_column(self) -> bool {
        // Exhaustive on purpose: adding a variant forces a decision here.
        match self {
            RegionClass::Prose | RegionClass::Reference => true,
            RegionClass::Table | RegionClass::Form | RegionClass::Mixed => false,
        }
    }
}
/// Per-line summary produced by `cluster_lines` for a group of spans that share a baseline.
struct LineStat {
/// Top coordinate of the first span placed on the line; later spans are compared to it.
top: f32,
/// Smallest left edge among the line's spans.
left: f32,
/// Largest right edge among the line's spans.
right: f32,
/// Total number of non-whitespace characters across the line's spans.
nonws_chars: usize,
/// Text of the span treated as the line's leading span, with leading whitespace trimmed.
lead_text: String,
/// Left edge of every span on the line; `cluster_lines` sorts these ascending before returning.
span_lefts: Vec<f32>,
/// Right edge of every span, kept in the same order as `span_lefts` (index `i` is the same span).
span_rights: Vec<f32>,
}
/// Classifies the spans selected by `indices` into a coarse [`RegionClass`].
///
/// The spans are grouped into text lines, a handful of per-line statistics are gathered,
/// and the first matching rule below wins:
///
/// 1. fewer than 6 spans, a region no wider than 10 units, or fewer than 6 lines → `Mixed`
/// 2. fewer than 10 non-whitespace characters per line on average → `Table`
/// 3. at least 40% of lines contain a label/value gap → `Form`
/// 4. more than 12 characters per line on average and either at least 30% of lines start
///    with a numbered marker or the left edges form a hanging indent → `Reference`
/// 5. more than 20 characters per line on average and more than half the lines span at
///    least 60% of the region width → `Prose`
/// 6. otherwise → `Mixed`
///
/// # Panics
///
/// Panics if any entry of `indices` is out of bounds for `spans`.
pub fn classify_region(spans: &[TextSpan], indices: &[usize]) -> RegionClass {
    if indices.len() < 6 {
        return RegionClass::Mixed;
    }

    // Horizontal extent of the whole region.
    let (x_lo, x_hi) = indices
        .iter()
        .fold((f32::MAX, f32::MIN), |(lo, hi), &i| {
            let bbox = &spans[i].bbox;
            (lo.min(bbox.left()), hi.max(bbox.right()))
        });
    let region_width = x_hi - x_lo;
    if region_width <= 10.0 {
        return RegionClass::Mixed;
    }

    let med_h = median_height(spans, indices).max(1.0);
    let lines = cluster_lines(spans, indices, med_h);
    let line_count = lines.len();
    if line_count < 6 {
        return RegionClass::Mixed;
    }

    // Per-line statistics; each one is an independent pass over the clustered lines.
    let total_chars: usize = lines.iter().map(|line| line.nonws_chars).sum();
    let wide_lines = lines
        .iter()
        .filter(|line| (line.right - line.left).max(0.0) >= region_width * 0.6)
        .count();
    let numbered_lines = lines
        .iter()
        .filter(|line| starts_numbered_entry(&line.lead_text))
        .count();
    let form_lines = lines
        .iter()
        .filter(|line| line_has_label_value_gap(line, region_width))
        .count();
    let left_edges: Vec<f32> = lines.iter().map(|line| line.left).collect();

    let per_line = line_count as f32;
    let mean_chars = total_chars as f32 / per_line;
    let numbered_frac = numbered_lines as f32 / per_line;
    let form_frac = form_lines as f32 / per_line;
    let mostly_wide = wide_lines * 2 > line_count;

    // Rule order matters: sparse lines are called a table before anything else is tried.
    if mean_chars < 10.0 {
        return RegionClass::Table;
    }
    if form_frac >= 0.4 {
        return RegionClass::Form;
    }
    let reference_like =
        mean_chars > 12.0 && (numbered_frac >= 0.3 || has_hanging_indent(&left_edges, med_h));
    if reference_like {
        return RegionClass::Reference;
    }
    if mean_chars > 20.0 && mostly_wide {
        RegionClass::Prose
    } else {
        RegionClass::Mixed
    }
}
/// Returns the median absolute bounding-box height of the selected spans.
///
/// Heights that are not strictly positive (including NaN) are ignored. When nothing is
/// left the function returns `1.0`. For an even number of samples the upper of the two
/// middle values is returned.
fn median_height(spans: &[TextSpan], indices: &[usize]) -> f32 {
    let mut heights = Vec::with_capacity(indices.len());
    for &idx in indices {
        let h = spans[idx].bbox.height.abs();
        if h > 0.0 {
            heights.push(h);
        }
    }
    if heights.is_empty() {
        return 1.0;
    }
    // Every retained value is > 0 and therefore not NaN, so the total order agrees with
    // the usual numeric order and stability is irrelevant.
    heights.sort_unstable_by(f32::total_cmp);
    heights[heights.len() / 2]
}
/// Groups the selected spans into text lines and summarises each line.
///
/// Spans are visited top to bottom (ties broken left to right). A span joins the line
/// currently being built when its top lies within `0.6 * med_h` of that line's first
/// span; otherwise it opens a new line. For every line the function records the
/// horizontal extent, the non-whitespace character count, the text of the leftmost span
/// (`lead_text`, leading whitespace trimmed) and the per-span left/right edges sorted by
/// left edge.
///
/// # Panics
///
/// Panics if any entry of `indices` is out of bounds for `spans`.
fn cluster_lines(spans: &[TextSpan], indices: &[usize], med_h: f32) -> Vec<LineStat> {
    let mut order: Vec<usize> = indices.to_vec();
    // `total_cmp` keeps the comparator a total order even if a coordinate is NaN;
    // `sort_by` may panic when handed an inconsistent ordering.
    order.sort_by(|&a, &b| {
        let (box_a, box_b) = (&spans[a].bbox, &spans[b].bbox);
        box_a
            .top()
            .total_cmp(&box_b.top())
            .then_with(|| box_a.left().total_cmp(&box_b.left()))
    });

    let tol = med_h * 0.6;
    let mut lines: Vec<LineStat> = Vec::new();
    for &i in &order {
        let s = &spans[i];
        let (s_left, s_right, s_top) = (s.bbox.left(), s.bbox.right(), s.bbox.top());
        let nonws = s.text.chars().filter(|c| !c.is_whitespace()).count();
        match lines.last_mut() {
            Some(l) if (s_top - l.top).abs() <= tol => {
                // Spans on one line are not guaranteed to arrive left to right (their
                // tops may differ slightly), so compare against the running minimum
                // *before* it is updated; comparing against the first span pushed would
                // let a middle span steal the lead text from the true leftmost one.
                if s_left < l.left {
                    l.lead_text = s.text.trim_start().to_string();
                }
                l.left = l.left.min(s_left);
                l.right = l.right.max(s_right);
                l.nonws_chars += nonws;
                l.span_lefts.push(s_left);
                l.span_rights.push(s_right);
            }
            _ => lines.push(LineStat {
                top: s_top,
                left: s_left,
                right: s_right,
                nonws_chars: nonws,
                lead_text: s.text.trim_start().to_string(),
                span_lefts: vec![s_left],
                span_rights: vec![s_right],
            }),
        }
    }

    // Re-order each line's span edges left to right, keeping left/right pairs together.
    for l in &mut lines {
        let mut paired: Vec<(f32, f32)> = l
            .span_lefts
            .iter()
            .copied()
            .zip(l.span_rights.iter().copied())
            .collect();
        paired.sort_by(|a, b| a.0.total_cmp(&b.0));
        let (lefts, rights): (Vec<f32>, Vec<f32>) = paired.into_iter().unzip();
        l.span_lefts = lefts;
        l.span_rights = rights;
    }
    lines
}
/// Reports whether `lead` opens with a list/citation style number marker.
///
/// Two shapes are accepted:
/// * `[` or `(` immediately followed by an ASCII digit, e.g. `[12]` or `(3)`;
/// * one to three ASCII digits immediately followed by `.` or `)`, e.g. `7.` or `12)`.
///
/// No whitespace is skipped; callers pass text that is already trimmed at the start.
fn starts_numbered_entry(lead: &str) -> bool {
    let bytes = lead.as_bytes();
    match bytes {
        [] => false,
        [b'[' | b'(', second, ..] if second.is_ascii_digit() => true,
        _ => {
            // Look at no more than four bytes so that a run of four digits is rejected.
            let run = bytes
                .iter()
                .take(4)
                .take_while(|byte| byte.is_ascii_digit())
                .count();
            (1..=3).contains(&run) && matches!(bytes.get(run).copied(), Some(b'.' | b')'))
        }
    }
}
/// Reports whether the line contains a horizontal gap of at least 25% of `region_width`
/// between one span and the text to its left.
///
/// Expects `l.span_lefts` / `l.span_rights` to be ordered left to right, which is how
/// `cluster_lines` leaves them. The gap is measured from the furthest right edge seen so
/// far rather than from the immediately preceding span, so a wide span that overlaps
/// later, shorter spans does not create a phantom gap. Lines with fewer than two spans
/// never have a gap.
fn line_has_label_value_gap(l: &LineStat, region_width: f32) -> bool {
    let threshold = region_width * 0.25;
    let mut covered_to = match l.span_rights.first() {
        Some(&right) => right,
        None => return false,
    };
    for (&left, &right) in l.span_lefts.iter().zip(&l.span_rights).skip(1) {
        if left - covered_to >= threshold {
            return true;
        }
        covered_to = covered_to.max(right);
    }
    false
}
/// Detects a hanging-indent pattern in the left edges of a run of lines.
///
/// With `l0` the smallest left edge, the pattern is present when at least a quarter of
/// the lines start within `0.5 * med_h` of `l0` and at least a quarter start between
/// `0.8 * med_h` and `5 * med_h` to the right of it. Fewer than 6 lines never qualify.
fn has_hanging_indent(left_edges: &[f32], med_h: f32) -> bool {
    let n = left_edges.len();
    if n < 6 {
        return false;
    }

    let margin = left_edges.iter().copied().fold(f32::MAX, f32::min);
    let flush_tol = med_h * 0.5;
    let indent_band = (med_h * 0.8)..=(med_h * 5.0);

    let mut flush = 0usize;
    let mut indented = 0usize;
    for &edge in left_edges {
        let offset = edge - margin;
        // The two tests are independent; neither excludes the other.
        if offset.abs() <= flush_tol {
            flush += 1;
        }
        if indent_band.contains(&offset) {
            indented += 1;
        }
    }

    flush * 4 >= n && indented * 4 >= n
}
#[cfg(test)]
mod tests {
    use super::*;
    use crate::geometry::Rect;

    /// Builds a span with the given text and box; the font size mirrors the box height.
    fn span(text: &str, left: f32, top: f32, width: f32, height: f32) -> TextSpan {
        let bbox = Rect::new(left, top, width, height);
        TextSpan {
            text: text.to_string(),
            bbox,
            font_size: height,
            ..Default::default()
        }
    }

    /// A 10-unit-high line of `chars` filler characters, 5 units wide per character.
    fn prose_line(top: f32, left: f32, chars: usize) -> TextSpan {
        span(&"x".repeat(chars), left, top, chars as f32 * 5.0, 10.0)
    }

    /// Classifies every span in the slice.
    fn classify(spans: &[TextSpan]) -> RegionClass {
        let every_index: Vec<usize> = (0..spans.len()).collect();
        classify_region(spans, &every_index)
    }

    #[test]
    fn classify_dense_results_is_prose() {
        let mut spans = Vec::with_capacity(10);
        for row in 0..10 {
            spans.push(prose_line(row as f32 * 12.0, 0.0, 40));
        }
        assert_eq!(classify(&spans), RegionClass::Prose);
    }

    #[test]
    fn classify_numbered_references_is_reference() {
        let mut spans = Vec::with_capacity(8);
        for row in 0..8 {
            let entry = format!("{}. Author A, Title of the work, Journal", row + 1);
            spans.push(span(&entry, 0.0, row as f32 * 12.0, 180.0, 10.0));
        }
        assert_eq!(classify(&spans), RegionClass::Reference);
    }

    #[test]
    fn classify_hanging_indent_references_is_reference() {
        // Four entries, each a flush first line followed by an indented continuation.
        let spans: Vec<TextSpan> = (0..4)
            .flat_map(|entry| {
                let base = entry as f32 * 24.0;
                [
                    span(
                        "Smith J, Some long reference entry title here",
                        0.0,
                        base,
                        200.0,
                        10.0,
                    ),
                    span(
                        "continuation of the reference line indented",
                        15.0,
                        base + 12.0,
                        180.0,
                        10.0,
                    ),
                ]
            })
            .collect();
        assert_eq!(classify(&spans), RegionClass::Reference);
    }

    #[test]
    fn classify_table_cells_is_table() {
        // Six rows of two short numeric cells.
        let spans: Vec<TextSpan> = (0..6)
            .flat_map(|row| {
                let top = row as f32 * 12.0;
                [
                    span("12.3", 0.0, top, 18.0, 10.0),
                    span("45.6", 60.0, top, 18.0, 10.0),
                ]
            })
            .collect();
        assert_eq!(classify(&spans), RegionClass::Table);
    }

    #[test]
    fn classify_form_label_value_is_form() {
        // Eight rows: a label on the left and a value far to the right.
        let mut all = Vec::with_capacity(16);
        for row in 0..8 {
            let top = row as f32 * 12.0;
            let mut label = span("Wages, salaries, tips, etc.", 0.0, top, 90.0, 10.0);
            // The label text is replaced after construction; only this text is classified.
            label.text = "Wages, salaries, tips".to_string();
            all.push(label);
            all.push(span("1,234", 200.0, top, 30.0, 10.0));
        }
        assert_eq!(classify(&all), RegionClass::Form);
    }

    #[test]
    fn classify_single_paragraph_is_mixed() {
        let mut spans = Vec::with_capacity(4);
        for row in 0..4 {
            spans.push(prose_line(row as f32 * 12.0, 0.0, 40));
        }
        assert_eq!(classify(&spans), RegionClass::Mixed);
    }

    #[test]
    fn classify_empty_or_tiny_is_mixed() {
        assert_eq!(classify(&[]), RegionClass::Mixed);

        let mut spans = Vec::with_capacity(3);
        for row in 0..3 {
            spans.push(prose_line(row as f32 * 12.0, 0.0, 10));
        }
        assert_eq!(classify(&spans), RegionClass::Mixed);
    }
}