use kurbo::{Affine, BezPath};
use pdf_render::pdf_interpret::cmap::BfString;
use pdf_render::pdf_interpret::font::Glyph;
use pdf_render::pdf_interpret::{
BlendMode, ClipPath, Device, GlyphDrawMode, Image, Paint, PathDrawMode, SoftMask,
};
use std::cmp::Ordering;
/// Absolute lower bound on the vertical tolerance used when grouping spans into bands.
const BAND_Y_TOLERANCE: f64 = 5.0;
/// Fraction of a font size / span height used as the vertical band tolerance.
const BAND_Y_FRACTION: f64 = 0.30;
/// A vertical gap of at least this multiple of the typical line spacing qualifies
/// as a horizontal cut candidate in `try_horizontal_cut`.
const PARAGRAPH_BREAK_LINE_SPACING_MULTIPLIER: f64 = 1.8;
/// Magnitude the accumulated text adjustment must reach to count as a space signal.
const TJ_SPACE_THRESHOLD_UNITS: f32 = 100.0;
/// Confidence contributed by the text-adjustment signal.
const TJ_SIGNAL_WEIGHT: f64 = 0.95;
/// Confidence contributed by the geometric gap signal.
const GAP_SIGNAL_WEIGHT: f64 = 0.80;
/// Confidence contributed by the character-class transition heuristic.
const HEURISTIC_SIGNAL_WEIGHT: f64 = 0.60;
/// Total confidence required before a space is inserted between merged glyphs.
const SPACE_CONSENSUS_THRESHOLD: f64 = 0.75;
/// Gap signal reference: this fraction of the cached median glyph width.
const GAP_TO_MEDIAN_CHAR_FRACTION: f64 = 0.30;
/// Gap signal reference when no median glyph width is cached: this fraction of the font size.
const GAP_TO_FONT_SIZE_FALLBACK_FRACTION: f64 = 0.15;
/// Lower clamp for the adaptive column gap threshold.
const COLUMN_GAP_THRESHOLD_MIN: f64 = 10.0;
/// Upper clamp for the adaptive column gap threshold.
const COLUMN_GAP_THRESHOLD_MAX: f64 = 40.0;
/// Multiplier applied to the median gap when no clear break between gap sizes is found.
const COLUMN_GAP_MEDIAN_MULTIPLIER: f64 = 3.0;
/// Column gap threshold returned when no positive gaps were observed at all.
const COLUMN_GAP_THRESHOLD_FALLBACK: f64 = 20.0;
/// Minimum tolerance when comparing column boundaries of consecutive bands.
const COLUMN_GAP_MATCH_TOLERANCE: f64 = 12.0;
/// Minimum number of bands with column gaps for a region to count as columnar.
const MIN_COLUMN_GAPPED_BANDS: usize = 3;
/// Minimum share of a region's bands that must show the column gaps themselves.
const MIN_COLUMN_GAP_SUPPORT: f64 = 0.80;
/// Minimum share of non-empty column slices that must be "dense" in a columnar region.
const MIN_DENSE_SLICE_RATIO: f64 = 0.35;
/// A run of text together with the position and size it was drawn at.
#[derive(Debug, Clone)]
pub struct TextSpan {
    /// The decoded text of the run.
    pub text: String,
    /// Horizontal position of the start of the run.
    pub x: f64,
    /// Vertical position of the run.
    pub y: f64,
    /// Measured width of the run.
    pub width: f64,
    /// Height of the run.
    pub height: f64,
    /// Font size of the run.
    pub font_size: f64,
}

impl TextSpan {
    /// Right edge, using whichever is larger: the measured width or the
    /// character-count estimate.
    fn right(&self) -> f64 {
        let effective_width = self.width.max(self.estimated_width());
        self.x + effective_width
    }

    /// Right edge based on the measured width only.
    fn measured_right(&self) -> f64 {
        self.x + self.width
    }

    /// Rough width estimate: half the font size per character, and half the
    /// font size for an empty span.
    fn estimated_width(&self) -> f64 {
        let half_em = self.font_size * 0.5;
        match self.text.chars().count() {
            0 => half_em,
            chars => half_em * chars as f64,
        }
    }
}
/// An ordered group of spans that is rendered as one piece of text.
#[derive(Debug, Clone)]
pub struct TextBlock {
    /// Spans in reading order.
    pub spans: Vec<TextSpan>,
}

impl TextBlock {
    /// Joins the span texts in order.
    ///
    /// Between neighbours, a space is inserted when the horizontal gap exceeds a
    /// quarter of the previous span's font size. When the gap is tiny (at most
    /// 12% of the font size) and the next span begins by repeating the word that
    /// ends the previous one, only the non-repeated remainder is appended.
    pub fn text(&self) -> String {
        let Some(first) = self.spans.first() else {
            return String::new();
        };
        let mut joined = first.text.clone();
        for neighbours in self.spans.windows(2) {
            let (left, right) = (&neighbours[0], &neighbours[1]);
            let gap = right.x - left.measured_right();
            // Only look for a repeated word when the spans touch or overlap.
            let repeated_tail = if gap <= left.font_size * 0.12 {
                trim_overlapping_word_prefix(&left.text, &right.text)
            } else {
                None
            };
            match repeated_tail {
                Some(remainder) => joined.push_str(&remainder),
                None => {
                    if gap > left.font_size * 0.25 {
                        joined.push(' ');
                    }
                    joined.push_str(&right.text);
                }
            }
        }
        joined
    }
}
/// A horizontal band: spans whose `y` values were close enough to be grouped together.
#[derive(Debug, Clone)]
struct TextBand {
    /// Representative `y` of the band.
    y: f64,
    /// Spans assigned to the band.
    spans: Vec<TextSpan>,
}

impl TextBand {
    /// Starts a band from a single span, adopting that span's `y`.
    fn new(span: TextSpan) -> Self {
        let y = span.y;
        Self {
            y,
            spans: vec![span],
        }
    }

    /// Sorts spans by ascending `x` (ties: descending `y`), then merges overprinted duplicates.
    fn sort_spans(&mut self) {
        self.spans.sort_by(|lhs, rhs| {
            let by_x = lhs.x.partial_cmp(&rhs.x).unwrap_or(Ordering::Equal);
            by_x.then_with(|| rhs.y.partial_cmp(&lhs.y).unwrap_or(Ordering::Equal))
        });
        collapse_overprinted_spans(&mut self.spans);
    }

    /// The whole band as a single left-to-right block.
    fn row_block(&self) -> TextBlock {
        let mut ordered = self.spans.clone();
        ordered.sort_by(|lhs, rhs| lhs.x.partial_cmp(&rhs.x).unwrap_or(Ordering::Equal));
        TextBlock { spans: ordered }
    }

    /// Smallest span `x` in the band (`+inf` when the band has no spans).
    fn left(&self) -> f64 {
        let mut leftmost = f64::INFINITY;
        for span in &self.spans {
            leftmost = leftmost.min(span.x);
        }
        leftmost
    }

    /// Largest span right edge in the band (`-inf` when the band has no spans).
    fn right(&self) -> f64 {
        let mut rightmost = f64::NEG_INFINITY;
        for span in &self.spans {
            rightmost = rightmost.max(span.right());
        }
        rightmost
    }

    /// Horizontal extent of the band, never negative.
    fn width(&self) -> f64 {
        let extent = self.right() - self.left();
        extent.max(0.0)
    }

    /// Midpoints of every gap that is at least `column_gap_threshold` wide.
    fn gap_midpoints(&self, column_gap_threshold: f64) -> Vec<f64> {
        self.gaps(column_gap_threshold)
            .iter()
            .map(|gap| (gap.start + gap.end) * 0.5)
            .collect()
    }

    /// Gaps of at least `column_gap_threshold` between x-ordered spans, measured
    /// from the furthest right edge seen so far.
    fn gaps(&self, column_gap_threshold: f64) -> Vec<BandGap> {
        if self.spans.len() < 2 {
            return Vec::new();
        }
        let mut ordered = self.spans.clone();
        ordered.sort_by(|lhs, rhs| lhs.x.partial_cmp(&rhs.x).unwrap_or(Ordering::Equal));

        let mut found = Vec::new();
        let mut frontier = ordered[0].right();
        for span in &ordered[1..] {
            if span.x - frontier >= column_gap_threshold {
                found.push(BandGap {
                    start: frontier,
                    end: span.x,
                });
            }
            frontier = frontier.max(span.right());
        }
        found
    }

    /// Distributes the spans over `boundaries.len() + 1` columns by the x-centre of
    /// each span; every column is sorted by `x`.
    fn split_by_boundaries(&self, boundaries: &[f64]) -> Vec<Vec<TextSpan>> {
        let mut columns: Vec<Vec<TextSpan>> = vec![Vec::new(); boundaries.len() + 1];
        for span in &self.spans {
            let center_x = span.x + span.width.max(span.estimated_width()) * 0.5;
            // First boundary to the right of the centre decides the column.
            let slot = boundaries
                .iter()
                .position(|boundary| center_x < *boundary)
                .unwrap_or(boundaries.len());
            columns[slot].push(span.clone());
        }
        for column in columns.iter_mut() {
            column.sort_by(|lhs, rhs| lhs.x.partial_cmp(&rhs.x).unwrap_or(Ordering::Equal));
        }
        columns
    }

    /// Returns the column index when every span lies in one and the same column,
    /// no span straddles a boundary, and the band is at most 80% as wide as that
    /// column; otherwise `None`.
    fn fits_single_column(
        &self,
        boundaries: &[f64],
        region_left: f64,
        region_right: f64,
    ) -> Option<usize> {
        let mut shared_column: Option<usize> = None;
        for span in &self.spans {
            let (left, right) = (span.x, span.right());
            let straddles = boundaries
                .iter()
                .any(|boundary| left < *boundary && right > *boundary);
            if straddles {
                return None;
            }
            let center_x = left + (right - left) * 0.5;
            let slot = boundaries
                .iter()
                .position(|boundary| center_x < *boundary)
                .unwrap_or(boundaries.len());
            // The first span fixes the column; later spans must agree with it.
            if *shared_column.get_or_insert(slot) != slot {
                return None;
            }
        }
        let slot = shared_column?;

        // Column edges are the region edges with the boundaries in between.
        let lower_edge = if slot == 0 {
            region_left
        } else {
            boundaries[slot - 1]
        };
        let upper_edge = if slot == boundaries.len() {
            region_right
        } else {
            boundaries[slot]
        };
        let column_width = (upper_edge - lower_edge).max(0.0);
        if column_width <= 0.0 || self.width() > column_width * 0.8 {
            None
        } else {
            Some(slot)
        }
    }
}
/// A horizontal gap between spans of one band, as produced by `TextBand::gaps`.
#[derive(Debug, Clone, Copy)]
struct BandGap {
/// Furthest right edge seen before the gap.
start: f64,
/// `x` of the span that follows the gap.
end: f64,
}
pub(crate) struct TextExtractionDevice {
spans: Vec<TextSpan>,
last_y: f64,
last_end_x: f64,
pending_tj_offset: f32,
glyph_widths: Vec<f64>,
cached_median_char_width: f64,
}
const MEDIAN_REFRESH: usize = 32;
impl Default for TextExtractionDevice {
fn default() -> Self {
Self::new()
}
}
impl TextExtractionDevice {
pub fn new() -> Self {
Self {
spans: Vec::new(),
last_y: f64::NEG_INFINITY,
last_end_x: f64::NEG_INFINITY,
pending_tj_offset: 0.0,
glyph_widths: Vec::new(),
cached_median_char_width: 0.0,
}
}
fn refresh_median_char_width(&mut self) {
if self.glyph_widths.is_empty() {
self.cached_median_char_width = 0.0;
return;
}
let mut sorted = self.glyph_widths.clone();
sorted.sort_by(|a, b| a.partial_cmp(b).unwrap_or(Ordering::Equal));
self.cached_median_char_width = sorted[sorted.len() / 2];
}
fn evaluate_space_consensus(
&self,
gap: f64,
font_size: f64,
prev_text: &str,
next_text: &str,
) -> bool {
let mut confidence = 0.0;
if self.pending_tj_offset.abs() >= TJ_SPACE_THRESHOLD_UNITS {
confidence += TJ_SIGNAL_WEIGHT;
}
let gap_reference = if self.cached_median_char_width > 0.0 {
self.cached_median_char_width * GAP_TO_MEDIAN_CHAR_FRACTION
} else {
font_size * GAP_TO_FONT_SIZE_FALLBACK_FRACTION
};
if gap > gap_reference {
confidence += GAP_SIGNAL_WEIGHT;
}
if let (Some(prev_last), Some(next_first)) =
(prev_text.chars().last(), next_text.chars().next())
{
let camel = prev_last.is_lowercase() && next_first.is_uppercase();
let digit_to_letter = prev_last.is_ascii_digit() && next_first.is_alphabetic();
let letter_to_digit = prev_last.is_alphabetic() && next_first.is_ascii_digit();
if camel || digit_to_letter || letter_to_digit {
confidence += HEURISTIC_SIGNAL_WEIGHT;
}
}
confidence >= SPACE_CONSENSUS_THRESHOLD
}
pub fn into_text(self) -> String {
let blocks = group_spans_into_blocks(self.spans);
let lines: Vec<String> = blocks.iter().map(|b| b.text()).collect();
let stitched = stitch_hyphenated_lines(&lines);
normalize_text_output(&stitched)
}
pub fn into_blocks(self) -> Vec<TextBlock> {
group_spans_into_blocks(self.spans)
}
#[allow(dead_code)]
pub(crate) fn into_spans(self) -> Vec<TextSpan> {
self.spans
}
}
/// Only glyph drawing and text adjustments are acted upon; all other device
/// callbacks are no-ops.
impl Device<'_> for TextExtractionDevice {
    fn set_soft_mask(&mut self, _: Option<SoftMask<'_>>) {}

    fn set_blend_mode(&mut self, _: BlendMode) {}

    fn draw_path(&mut self, _: &BezPath, _: Affine, _: &Paint<'_>, _: &PathDrawMode) {}

    fn push_clip_path(&mut self, _: &ClipPath) {}

    fn push_transparency_group(&mut self, _: f32, _: Option<SoftMask<'_>>, _: BlendMode) {}

    fn draw_image(&mut self, _: Image<'_, '_>, _: Affine) {}

    fn pop_clip_path(&mut self) {}

    fn pop_transparency_group(&mut self) {}

    /// Records one glyph: it is appended to the previous span when it sits on the
    /// same line directly after it, otherwise it starts a new span. Glyphs
    /// without a Unicode mapping are ignored.
    fn draw_glyph(
        &mut self,
        glyph: &Glyph<'_>,
        transform: Affine,
        glyph_transform: Affine,
        _paint: &Paint<'_>,
        _draw_mode: &GlyphDrawMode,
    ) {
        let Some(unicode) = glyph.as_unicode() else {
            return;
        };
        let text = match unicode {
            BfString::Char(c) => c.to_string(),
            BfString::String(s) => s,
        };

        // Position is the translation part of the combined transform; the scale
        // is the length of its first column.
        let [m0, m1, _, _, x, y] = (transform * glyph_transform).as_coeffs();
        let glyph_scale = (m0.powi(2) + m1.powi(2)).sqrt().abs();
        let font_size = glyph_scale * 1000.0;
        let glyph_width = estimate_glyph_width(glyph, font_size).max(font_size * 0.25);
        let glyph_end_x = x + glyph_width;

        // Sample at most 4096 widths, refreshing the cached median periodically.
        if self.glyph_widths.len() < 4096 {
            self.glyph_widths.push(glyph_width);
            if self.glyph_widths.len() % MEDIAN_REFRESH == 0 {
                self.refresh_median_char_width();
            }
        }

        let line_tolerance = font_size.max(BAND_Y_TOLERANCE) * 0.35;
        let same_line = (y - self.last_y).abs() <= line_tolerance;
        let gap = x - self.last_end_x;
        let adjacent = same_line && gap >= -font_size * 0.25 && gap < font_size * 0.5;

        // `Some(space?)` when the glyph continues the previous span. This must be
        // evaluated before the pending adjustment is reset below, because the
        // space consensus reads it.
        let continuation: Option<bool> = if adjacent {
            self.spans.last().map(|prev| {
                !prev.text.ends_with(' ')
                    && !text.starts_with(' ')
                    && self.evaluate_space_consensus(gap, font_size, &prev.text, &text)
            })
        } else {
            None
        };

        // Bookkeeping is the same whether the glyph is merged or starts a span.
        self.last_y = y;
        self.last_end_x = glyph_end_x;
        self.pending_tj_offset = 0.0;

        if let Some(insert_space) = continuation {
            let prev = self
                .spans
                .last_mut()
                .expect("continuation implies a previous span");
            if insert_space {
                prev.text.push(' ');
            }
            prev.text.push_str(&text);
            prev.width = prev.width.max(glyph_end_x - prev.x);
            prev.height = prev.height.max(font_size);
        } else {
            self.spans.push(TextSpan {
                text,
                x,
                y,
                width: glyph_width,
                height: font_size,
                font_size,
            });
        }
    }

    /// Accumulates a text adjustment until the next glyph is drawn.
    fn text_adjustment(&mut self, amount: f32) {
        self.pending_tj_offset += amount;
    }
}
/// Estimates a glyph's width at `font_size`.
///
/// Outline glyphs with a known advance width use `advance / 1000 * font_size`;
/// everything else (including Type3 glyphs) falls back to half the font size.
fn estimate_glyph_width(glyph: &Glyph<'_>, font_size: f64) -> f64 {
    let fallback = font_size * 0.5;
    match glyph {
        Glyph::Outline(outline) => match outline.advance_width() {
            Some(advance) => advance as f64 / 1000.0 * font_size,
            None => fallback,
        },
        Glyph::Type3(_) => fallback,
    }
}
/// Merges each span into its predecessor when the two are overprint duplicates
/// (see `spans_are_overprint_duplicates`).
///
/// The merged span covers both extents, averages the two `y` values, keeps the
/// larger height and font size, and keeps the text with more characters (on a
/// tie, the incoming text only when the incoming span is wider).
fn collapse_overprinted_spans(spans: &mut Vec<TextSpan>) {
    if spans.len() < 2 {
        return;
    }
    let mut kept_spans: Vec<TextSpan> = Vec::with_capacity(spans.len());
    for incoming in std::mem::take(spans) {
        let duplicates_previous = kept_spans
            .last()
            .is_some_and(|kept| spans_are_overprint_duplicates(kept, &incoming));
        if !duplicates_previous {
            kept_spans.push(incoming);
            continue;
        }

        let kept = kept_spans
            .last_mut()
            .expect("a duplicate implies a previous span");
        let kept_chars = kept.text.chars().count();
        let incoming_chars = incoming.text.chars().count();
        let prefer_incoming = incoming_chars > kept_chars
            || (incoming_chars == kept_chars && incoming.width > kept.width);

        // Extents are taken before any field of `kept` is modified.
        let left = kept.x.min(incoming.x);
        let right = kept.right().max(incoming.right());
        kept.x = left;
        kept.y = (kept.y + incoming.y) * 0.5;
        kept.width = (right - left).max(kept.width).max(incoming.width);
        kept.height = kept.height.max(incoming.height);
        kept.font_size = kept.font_size.max(incoming.font_size);
        if prefer_incoming {
            kept.text = incoming.text;
        }
    }
    *spans = kept_spans;
}
/// Returns `true` when two spans look like the same text printed twice.
///
/// Both trimmed texts must be non-empty and one must be a prefix of the other,
/// the baselines must lie within 12% of the larger font size, and the
/// horizontal overlap must cover at least 85% of the narrower span.
fn spans_are_overprint_duplicates(lhs: &TextSpan, rhs: &TextSpan) -> bool {
    let (lhs_text, rhs_text) = (lhs.text.trim(), rhs.text.trim());
    if lhs_text.is_empty() || rhs_text.is_empty() {
        return false;
    }

    let baseline_slack = lhs.font_size.max(rhs.font_size) * 0.12;
    let same_baseline = (lhs.y - rhs.y).abs() <= baseline_slack;

    let (lhs_left, lhs_right) = (lhs.x, lhs.right());
    let (rhs_left, rhs_right) = (rhs.x, rhs.right());
    let overlap = (lhs_right.min(rhs_right) - lhs_left.max(rhs_left)).max(0.0);
    // The narrower width is floored at 1.0 to keep the ratio well defined.
    let narrower = (lhs_right - lhs_left)
        .min(rhs_right - rhs_left)
        .max(1.0);
    let heavily_overlaps = overlap / narrower >= 0.85;

    // Equal texts are covered by the prefix test as well.
    let related_text = lhs_text.starts_with(rhs_text) || rhs_text.starts_with(lhs_text);

    same_baseline && heavily_overlaps && related_text
}
/// Detects a word repeated across a span boundary.
///
/// Looks for the longest run of at least four alphanumeric characters that both
/// ends `prev` (ignoring trailing whitespace) and starts `curr` (ignoring
/// leading whitespace), and that is a whole word on both sides. On success the
/// part of `curr` following the repeated word is returned.
fn trim_overlapping_word_prefix(prev: &str, curr: &str) -> Option<String> {
    let tail: Vec<char> = prev.trim_end().chars().collect();
    let head: Vec<char> = curr.trim_start().chars().collect();
    let longest = tail.len().min(head.len());

    (4..=longest)
        .rev()
        .find(|&len| {
            let start = tail.len() - len;
            let shared = &head[..len];
            &tail[start..] == shared
                && shared.iter().all(|ch| ch.is_alphanumeric())
                // The repeated run must not be the middle of a longer word.
                && (start == 0 || !tail[start - 1].is_alphanumeric())
                && (len == head.len() || !head[len].is_alphanumeric())
        })
        .map(|len| head[len..].iter().collect())
}
/// Derives the minimum gap width that should count as a column gap.
///
/// All positive gaps between x-ordered spans of every band are collected, then:
/// 1. no gaps at all → `COLUMN_GAP_THRESHOLD_FALLBACK`;
/// 2. even the smallest gap exceeds the minimum → 75% of that gap;
/// 3. otherwise the midpoint of the neighbouring pair of sorted gaps with the
///    largest size ratio (above 1.5) is used;
/// 4. failing that, the median gap times `COLUMN_GAP_MEDIAN_MULTIPLIER`.
///
/// Results of steps 2–4 are clamped to the configured min/max.
fn compute_adaptive_column_gap(bands: &[TextBand]) -> f64 {
    let clamp_threshold =
        |value: f64| value.clamp(COLUMN_GAP_THRESHOLD_MIN, COLUMN_GAP_THRESHOLD_MAX);

    let mut gaps: Vec<f64> = Vec::new();
    for band in bands.iter().filter(|band| band.spans.len() >= 2) {
        let mut ordered = band.spans.clone();
        ordered.sort_by(|lhs, rhs| lhs.x.partial_cmp(&rhs.x).unwrap_or(Ordering::Equal));
        let mut frontier = ordered[0].right();
        for span in &ordered[1..] {
            let gap = span.x - frontier;
            if gap > 0.0 {
                gaps.push(gap);
            }
            frontier = frontier.max(span.right());
        }
    }
    if gaps.is_empty() {
        return COLUMN_GAP_THRESHOLD_FALLBACK;
    }
    gaps.sort_by(|a, b| a.partial_cmp(b).unwrap_or(Ordering::Equal));

    let smallest = gaps[0];
    if smallest > COLUMN_GAP_THRESHOLD_MIN {
        return clamp_threshold(smallest * 0.75);
    }

    // Look for the sharpest jump between consecutive gap sizes.
    let mut sharpest_ratio = 1.5f64;
    let mut break_threshold = 0.0f64;
    for pair in gaps.windows(2) {
        let (smaller, larger) = (pair[0], pair[1]);
        if smaller > 0.5 {
            let ratio = larger / smaller;
            if ratio > sharpest_ratio {
                sharpest_ratio = ratio;
                break_threshold = (smaller + larger) * 0.5;
            }
        }
    }
    if break_threshold > 0.0 {
        return clamp_threshold(break_threshold);
    }

    let middle = gaps.len() / 2;
    let median = if gaps.len() % 2 == 0 {
        (gaps[middle - 1] + gaps[middle]) * 0.5
    } else {
        gaps[middle]
    };
    clamp_threshold(median * COLUMN_GAP_MEDIAN_MULTIPLIER)
}
/// Typical text measurements of a set of spans, used to scale layout thresholds.
#[derive(Debug, Clone, Copy)]
struct PageStats {
    /// Median span font size.
    median_font_size: f64,
    /// Median width per character.
    #[allow(dead_code)]
    median_char_width: f64,
    /// Typical distance between neighbouring bands. Despite the name this is the
    /// first-quartile spacing; 0.0 when fewer than two bands exist.
    median_line_spacing: f64,
}

impl PageStats {
    /// Computes the statistics for `spans`; an empty slice yields the defaults
    /// 12.0 / 6.0 / 0.0.
    fn from_spans(spans: &[TextSpan]) -> Self {
        fn sort_ascending(values: &mut [f64]) {
            values.sort_by(|a, b| a.partial_cmp(b).unwrap_or(Ordering::Equal));
        }

        if spans.is_empty() {
            return Self {
                median_font_size: 12.0,
                median_char_width: 6.0,
                median_line_spacing: 0.0,
            };
        }

        let mut font_sizes: Vec<f64> = spans.iter().map(|span| span.font_size).collect();
        sort_ascending(&mut font_sizes);
        let median_font_size = font_sizes[font_sizes.len() / 2];

        // Width per character, from spans that have both text and a positive width.
        let mut per_char_widths: Vec<f64> = spans
            .iter()
            .map(|span| (span.width, span.text.chars().count()))
            .filter(|&(width, chars)| chars > 0 && width > 0.0)
            .map(|(width, chars)| width / chars as f64)
            .collect();
        sort_ascending(&mut per_char_widths);
        let median_char_width = per_char_widths
            .get(per_char_widths.len() / 2)
            .copied()
            .unwrap_or(median_font_size * 0.5);

        // Walk the y values from largest to smallest; a value further than the
        // tolerance from the last recorded one starts a new band.
        let band_tolerance = (median_font_size * BAND_Y_FRACTION).max(BAND_Y_TOLERANCE);
        let mut descending_ys: Vec<f64> = spans.iter().map(|span| span.y).collect();
        descending_ys.sort_by(|a, b| b.partial_cmp(a).unwrap_or(Ordering::Equal));
        let mut band_ys: Vec<f64> = Vec::new();
        for y in descending_ys {
            let opens_band = match band_ys.last() {
                Some(previous) => (previous - y).abs() > band_tolerance,
                None => true,
            };
            if opens_band {
                band_ys.push(y);
            }
        }

        // With fewer than two bands there are no spacings and the result is 0.0.
        let mut spacings: Vec<f64> = band_ys
            .windows(2)
            .map(|pair| (pair[0] - pair[1]).abs())
            .collect();
        sort_ascending(&mut spacings);
        let median_line_spacing = spacings.get(spacings.len() / 4).copied().unwrap_or(0.0);

        Self {
            median_font_size,
            median_char_width,
            median_line_spacing,
        }
    }
}
/// Recursion depth at which `xy_cut_recursive` stops cutting and uses band-based grouping.
const XY_CUT_MAX_DEPTH: usize = 12;
/// Fraction of the region width that feeds into the minimum vertical-cut gap.
const XY_CUT_VERTICAL_GAP_REGION_FRACTION: f64 = 0.04;
/// Absolute lower bound for the minimum vertical-cut gap.
const XY_CUT_VERTICAL_GAP_FLOOR: f64 = 10.0;
/// Minimum horizontal-cut gap as a multiple of the median font size, used when
/// no line spacing could be measured.
const XY_CUT_HORIZONTAL_GAP_FONT_MULTIPLIER: f64 = 1.8;
/// Minimum number of spans each side of a vertical cut must contain.
const XY_CUT_MIN_SPANS_PER_COLUMN: usize = 2;
/// Minimum average number of characters per band on each side of a vertical cut.
const XY_CUT_MIN_CHARS_PER_BAND: f64 = 8.0;
/// Groups spans into reading-order blocks via recursive XY-cut, with statistics
/// computed once over all spans.
fn group_spans_into_blocks(spans: Vec<TextSpan>) -> Vec<TextBlock> {
    if spans.is_empty() {
        Vec::new()
    } else {
        let page_stats = PageStats::from_spans(&spans);
        xy_cut_recursive(spans, 0, &page_stats)
    }
}
/// Recursively splits `spans` at the widest qualifying whitespace gap.
///
/// Both a vertical and a horizontal cut are attempted; when both succeed the
/// one with the larger gap wins (vertical on a tie). Each resulting group is
/// processed recursively in order. When no cut applies, or the depth limit is
/// reached, the spans are grouped band by band instead.
fn xy_cut_recursive(spans: Vec<TextSpan>, depth: usize, stats: &PageStats) -> Vec<TextBlock> {
    if spans.is_empty() {
        return Vec::new();
    }
    if depth >= XY_CUT_MAX_DEPTH {
        return band_based_blocks(spans, stats);
    }

    let vertical = try_vertical_cut(&spans, stats);
    let horizontal = try_horizontal_cut(&spans, stats);
    let partition = match (vertical, horizontal) {
        (Some((columns, column_gap)), Some((rows, row_gap))) => {
            Some(if column_gap >= row_gap { columns } else { rows })
        }
        (Some((groups, _)), None) | (None, Some((groups, _))) => Some(groups),
        (None, None) => None,
    };

    match partition {
        Some(groups) => groups
            .into_iter()
            .flat_map(|group| xy_cut_recursive(group, depth + 1, stats))
            .collect(),
        None => band_based_blocks(spans, stats),
    }
}
/// Groups spans into blocks band by band; used by `xy_cut_recursive` when no
/// cut applies or the depth limit is reached. Delegates to the legacy grouping.
fn band_based_blocks(spans: Vec<TextSpan>, stats: &PageStats) -> Vec<TextBlock> {
group_spans_into_blocks_legacy_with_stats(spans, stats)
}
/// Median (upper middle) font size of `spans`; 12.0 when there are none.
#[allow(dead_code)]
fn median_font_size(spans: &[TextSpan]) -> f64 {
    let mut ordered: Vec<f64> = spans.iter().map(|span| span.font_size).collect();
    ordered.sort_by(|a, b| a.partial_cmp(b).unwrap_or(Ordering::Equal));
    ordered.get(ordered.len() / 2).copied().unwrap_or(12.0)
}
/// Tries to split `spans` into a left and a right column.
///
/// The widest gap in the union of all span x-intervals that reaches the minimum
/// gap is used as the cut. The cut is rejected unless both sides are dense
/// enough and single-sided bands are narrow enough. On success returns
/// `[left, right]` and the gap width.
fn try_vertical_cut(spans: &[TextSpan], stats: &PageStats) -> Option<(Vec<Vec<TextSpan>>, f64)> {
    if spans.len() < 2 * XY_CUT_MIN_SPANS_PER_COLUMN {
        return None;
    }

    let mut region_left = f64::INFINITY;
    let mut region_right = f64::NEG_INFINITY;
    for span in spans {
        region_left = region_left.min(span.x);
        region_right = region_right.max(span.right());
    }
    let region_width = region_right - region_left;
    if region_width <= 0.0 {
        return None;
    }

    // Minimum gap: the smaller of the adaptive threshold and a size-based floor,
    // but never below the absolute floor.
    let bands = group_spans_into_bands_with_stats(spans.to_vec(), stats);
    let adaptive = compute_adaptive_column_gap(&bands);
    let size_floor = stats
        .median_font_size
        .max(region_width * XY_CUT_VERTICAL_GAP_REGION_FRACTION)
        .max(XY_CUT_VERTICAL_GAP_FLOOR);
    let min_gap = adaptive.min(size_floor).max(XY_CUT_VERTICAL_GAP_FLOOR);

    // Sweep the x-intervals left to right, tracking the covered frontier.
    let mut intervals: Vec<(f64, f64)> = spans
        .iter()
        .map(|span| (span.x, span.right().max(span.x + 0.001)))
        .collect();
    intervals.sort_by(|a, b| a.0.partial_cmp(&b.0).unwrap_or(Ordering::Equal));

    let mut frontier = intervals[0].1;
    let mut widest: Option<(f64, f64)> = None;
    for &(left, right) in &intervals[1..] {
        if left > frontier {
            let gap = left - frontier;
            // Earlier gaps win ties.
            if gap >= min_gap && widest.map_or(true, |(best, _)| best < gap) {
                widest = Some((gap, (frontier + left) * 0.5));
            }
        }
        frontier = frontier.max(right);
    }
    let (gap_size, cut_x) = widest?;

    let (left_group, right_group): (Vec<TextSpan>, Vec<TextSpan>) = spans
        .iter()
        .cloned()
        .partition(|span| span.x + (span.right() - span.x) * 0.5 < cut_x);

    if !columns_are_dense(&left_group, &right_group, stats) {
        return None;
    }
    if !columns_are_band_aligned(spans, cut_x, region_left, region_right, stats) {
        return None;
    }
    Some((vec![left_group, right_group], gap_size))
}
/// Checks that bands lying entirely on one side of `cut_x` are not too wide.
///
/// A band with spans on both sides always passes. A one-sided band fails the
/// check when it is wider than 70% of that side's width.
fn columns_are_band_aligned(
    spans: &[TextSpan],
    cut_x: f64,
    region_left: f64,
    region_right: f64,
    stats: &PageStats,
) -> bool {
    const MAX_SINGLE_SIDE_FRACTION: f64 = 0.70;

    let left_limit = (cut_x - region_left).max(1.0) * MAX_SINGLE_SIDE_FRACTION;
    let right_limit = (region_right - cut_x).max(1.0) * MAX_SINGLE_SIDE_FRACTION;
    let is_left = |span: &TextSpan| span.x + (span.right() - span.x) * 0.5 < cut_x;

    group_spans_into_bands_with_stats(spans.to_vec(), stats)
        .iter()
        .all(|band| {
            let has_left = band.spans.iter().any(|span| is_left(span));
            let has_right = band.spans.iter().any(|span| !is_left(span));
            if has_left && has_right {
                return true;
            }
            let band_width = band.width();
            let left_too_wide = has_left && band_width > left_limit;
            let right_too_wide = has_right && band_width > right_limit;
            !left_too_wide && !right_too_wide
        })
}
/// Checks that both candidate columns carry enough text: each needs the minimum
/// number of spans and a sufficient average number of characters per band.
fn columns_are_dense(left: &[TextSpan], right: &[TextSpan], stats: &PageStats) -> bool {
    [left, right].iter().all(|column| {
        if column.len() < XY_CUT_MIN_SPANS_PER_COLUMN {
            return false;
        }
        let band_count = group_spans_into_bands_with_stats(column.to_vec(), stats).len();
        if band_count == 0 {
            return false;
        }
        let total_chars: usize = column.iter().map(|span| span.text.chars().count()).sum();
        total_chars as f64 / band_count as f64 >= XY_CUT_MIN_CHARS_PER_BAND
    })
}
/// Tries to split `spans` into an upper and a lower group.
///
/// Spans are scanned in descending `y`; the widest vertical gap between
/// consecutive bands that reaches the minimum gap becomes the cut. On success
/// returns `[top, bottom]` (spans with `y` above the cut first) and the gap size.
fn try_horizontal_cut(spans: &[TextSpan], stats: &PageStats) -> Option<(Vec<Vec<TextSpan>>, f64)> {
    if spans.len() < 2 {
        return None;
    }

    let mut ordered = spans.to_vec();
    ordered.sort_by(|lhs, rhs| {
        let by_y_desc = rhs.y.partial_cmp(&lhs.y).unwrap_or(Ordering::Equal);
        by_y_desc.then_with(|| lhs.x.partial_cmp(&rhs.x).unwrap_or(Ordering::Equal))
    });

    // Prefer the measured line spacing; fall back to the font size.
    let min_gap = if stats.median_line_spacing > 0.0 {
        stats.median_line_spacing * PARAGRAPH_BREAK_LINE_SPACING_MULTIPLIER
    } else {
        stats.median_font_size * XY_CUT_HORIZONTAL_GAP_FONT_MULTIPLIER
    };
    let tolerance = stats.median_font_size * BAND_Y_FRACTION;

    let mut widest: Option<(f64, f64)> = None;
    let mut band_bottom = ordered[0].y;
    for span in &ordered[1..] {
        // Still inside the current band: only extend its lower edge.
        if (band_bottom - span.y).abs() <= tolerance {
            band_bottom = band_bottom.min(span.y);
            continue;
        }
        let gap = band_bottom - span.y;
        // Earlier gaps win ties.
        if gap >= min_gap && widest.map_or(true, |(best, _)| best < gap) {
            widest = Some((gap, (band_bottom + span.y) * 0.5));
        }
        band_bottom = span.y;
    }
    let (gap_size, cut_y) = widest?;

    let (top_group, bottom_group): (Vec<TextSpan>, Vec<TextSpan>) =
        spans.iter().cloned().partition(|span| span.y > cut_y);
    if top_group.is_empty() || bottom_group.is_empty() {
        return None;
    }
    Some((vec![top_group, bottom_group], gap_size))
}
/// Band-based block grouping that computes its own statistics from `spans`.
#[allow(dead_code)]
fn group_spans_into_blocks_legacy(spans: Vec<TextSpan>) -> Vec<TextBlock> {
let bands = group_spans_into_bands(spans);
group_spans_into_blocks_legacy_from_bands(bands)
}
/// Band-based block grouping that uses caller-supplied statistics for banding.
fn group_spans_into_blocks_legacy_with_stats(
spans: Vec<TextSpan>,
stats: &PageStats,
) -> Vec<TextBlock> {
let bands = group_spans_into_bands_with_stats(spans, stats);
group_spans_into_blocks_legacy_from_bands(bands)
}
/// Turns bands into blocks, emitting column regions column by column.
///
/// A band without column gaps becomes one row block. A band with gaps seeds a
/// candidate region that is extended over following bands as long as they show
/// matching gaps or fit inside a single column. If the region qualifies as
/// columnar its blocks are emitted per column; otherwise only the seed band is
/// emitted as a row and scanning resumes with the next band.
fn group_spans_into_blocks_legacy_from_bands(bands: Vec<TextBand>) -> Vec<TextBlock> {
    if bands.is_empty() {
        return Vec::new();
    }
    let column_gap_threshold = compute_adaptive_column_gap(&bands);
    let mut blocks = Vec::new();
    let mut cursor = 0;

    while let Some(seed) = bands.get(cursor) {
        let mut boundaries = seed.gap_midpoints(column_gap_threshold);
        if boundaries.is_empty() {
            blocks.push(seed.row_block());
            cursor += 1;
            continue;
        }

        let mut members = vec![cursor];
        let mut gapped_band_count = 1usize;
        let (mut region_left, mut region_right) = (seed.left(), seed.right());
        let mut lookahead = cursor + 1;

        while let Some(candidate) = bands.get(lookahead) {
            let candidate_midpoints = candidate.gap_midpoints(column_gap_threshold);
            if candidate_midpoints.is_empty() {
                // A gapless band may join only if it sits inside one column.
                if candidate
                    .fits_single_column(&boundaries, region_left, region_right)
                    .is_none()
                {
                    break;
                }
            } else {
                if !boundaries_match(&boundaries, &candidate_midpoints, column_gap_threshold) {
                    break;
                }
                update_boundaries(&mut boundaries, &candidate_midpoints, gapped_band_count);
                gapped_band_count += 1;
                region_left = region_left.min(candidate.left());
                region_right = region_right.max(candidate.right());
            }
            members.push(lookahead);
            lookahead += 1;
        }

        if region_is_columnar(&bands, &members, &boundaries, gapped_band_count) {
            append_column_region_blocks(&bands, &members, &boundaries, &mut blocks);
            cursor = lookahead;
        } else {
            blocks.push(seed.row_block());
            cursor += 1;
        }
    }
    blocks
}
/// Groups spans into bands using statistics computed from those same spans.
fn group_spans_into_bands(spans: Vec<TextSpan>) -> Vec<TextBand> {
let stats = PageStats::from_spans(&spans);
group_spans_into_bands_with_stats(spans, &stats)
}
/// Groups spans into bands of similar `y`.
///
/// Spans are visited in descending `y` (ties: ascending `x`). Each joins the
/// first existing band whose `y` lies within the tolerance, updating that
/// band's `y` to the running mean; otherwise it opens a new band. Every band is
/// then sorted and de-duplicated, and the bands are returned in descending `y`.
fn group_spans_into_bands_with_stats(mut spans: Vec<TextSpan>, stats: &PageStats) -> Vec<TextBand> {
    if spans.is_empty() {
        return Vec::new();
    }
    spans.sort_by(|lhs, rhs| {
        let by_y_desc = rhs.y.partial_cmp(&lhs.y).unwrap_or(Ordering::Equal);
        by_y_desc.then_with(|| lhs.x.partial_cmp(&rhs.x).unwrap_or(Ordering::Equal))
    });

    let page_tolerance = (stats.median_font_size * BAND_Y_FRACTION).max(BAND_Y_TOLERANCE);
    let mut bands: Vec<TextBand> = Vec::new();
    for span in spans {
        // Tall spans get a proportionally larger tolerance.
        let tolerance = (span.height * BAND_Y_FRACTION)
            .max(page_tolerance)
            .max(BAND_Y_TOLERANCE);
        let host = bands
            .iter()
            .position(|band| (band.y - span.y).abs() <= tolerance);
        match host {
            Some(index) => {
                let band = &mut bands[index];
                let members = band.spans.len() as f64;
                band.y = (band.y * members + span.y) / (members + 1.0);
                band.spans.push(span);
            }
            None => bands.push(TextBand::new(span)),
        }
    }

    bands.iter_mut().for_each(TextBand::sort_spans);
    bands.sort_by(|lhs, rhs| rhs.y.partial_cmp(&lhs.y).unwrap_or(Ordering::Equal));
    bands
}
/// Returns `true` when both lists have the same number of boundaries and each
/// pair differs by at most the tolerance (1.5× the column gap threshold, kept
/// between `COLUMN_GAP_MATCH_TOLERANCE` and 60).
fn boundaries_match(boundaries: &[f64], gap_midpoints: &[f64], column_gap_threshold: f64) -> bool {
    if boundaries.len() != gap_midpoints.len() {
        return false;
    }
    let tolerance = (column_gap_threshold * 1.5)
        .max(COLUMN_GAP_MATCH_TOLERANCE)
        .min(60.0);
    boundaries
        .iter()
        .zip(gap_midpoints)
        .all(|(expected, observed)| (expected - observed).abs() <= tolerance)
}
/// Folds new gap midpoints into the running mean of each boundary, where
/// `seen_gapped_bands` is the number of samples already averaged. Boundaries
/// without a corresponding midpoint are left untouched.
fn update_boundaries(boundaries: &mut [f64], gap_midpoints: &[f64], seen_gapped_bands: usize) {
    let weight = seen_gapped_bands as f64;
    boundaries
        .iter_mut()
        .zip(gap_midpoints)
        .for_each(|(running_mean, sample)| {
            *running_mean = (*running_mean * weight + sample) / (weight + 1.0);
        });
}
/// Decides whether a candidate region should be read column by column.
///
/// Requires at least one boundary, enough bands that show the gaps themselves
/// (in absolute number and as a share of the region), text in every column,
/// at least `boundaries.len() + 2` non-empty column slices, and a sufficient
/// share of "dense" slices (two or more spans, or eight or more characters).
fn region_is_columnar(
    bands: &[TextBand],
    band_indices: &[usize],
    boundaries: &[f64],
    gapped_band_count: usize,
) -> bool {
    if boundaries.is_empty()
        || band_indices.is_empty()
        || gapped_band_count < MIN_COLUMN_GAPPED_BANDS
    {
        return false;
    }
    let gap_support = gapped_band_count as f64 / band_indices.len() as f64;
    if gap_support < MIN_COLUMN_GAP_SUPPORT {
        return false;
    }

    let mut column_populated = vec![false; boundaries.len() + 1];
    let mut non_empty_slices = 0usize;
    let mut dense_slices = 0usize;
    for &band_idx in band_indices {
        let slices = bands[band_idx].split_by_boundaries(boundaries);
        for (populated, slice) in column_populated.iter_mut().zip(&slices) {
            if slice.is_empty() {
                continue;
            }
            non_empty_slices += 1;
            *populated = true;
            let char_count: usize = slice.iter().map(|span| span.text.chars().count()).sum();
            if slice.len() >= 2 || char_count >= 8 {
                dense_slices += 1;
            }
        }
    }

    if non_empty_slices < boundaries.len() + 2 || column_populated.contains(&false) {
        return false;
    }
    dense_slices as f64 / non_empty_slices as f64 >= MIN_DENSE_SLICE_RATIO
}
/// Appends the blocks of a columnar region to `blocks` in column-major order.
///
/// Every band listed in `band_indices` is split at `boundaries`; each non-empty
/// slice becomes one block. All blocks of the first column are emitted (in band
/// order) before those of the second column, and so on.
///
/// Blocks are collected per column directly rather than threading sentinel
/// marker spans through a flat span list, so no span value can be mistaken for
/// a band separator.
fn append_column_region_blocks(
    bands: &[TextBand],
    band_indices: &[usize],
    boundaries: &[f64],
    blocks: &mut Vec<TextBlock>,
) {
    let column_count = boundaries.len() + 1;
    let mut blocks_per_column: Vec<Vec<TextBlock>> =
        (0..column_count).map(|_| Vec::new()).collect();

    for &band_idx in band_indices {
        // `split_by_boundaries` yields exactly one slice per column.
        let slices = bands[band_idx].split_by_boundaries(boundaries);
        for (column, mut slice) in blocks_per_column.iter_mut().zip(slices) {
            if slice.is_empty() {
                continue;
            }
            slice.sort_by(|a, b| a.x.partial_cmp(&b.x).unwrap_or(Ordering::Equal));
            column.push(TextBlock { spans: slice });
        }
    }

    blocks.extend(blocks_per_column.into_iter().flatten());
}
/// Joins lines with `'\n'`, except where the text so far ends in a hyphenated
/// word fragment that the next line continues: then the hyphen is dropped and
/// the next line (without leading whitespace) is appended directly.
fn stitch_hyphenated_lines(lines: &[String]) -> String {
    let mut remaining = lines.iter();
    let Some(first) = remaining.next() else {
        return String::new();
    };
    let mut stitched = first.clone();
    for line in remaining {
        let continuation = line.trim_start();
        if is_hyphen_wrap_candidate(&stitched, continuation) {
            // Remove the trailing hyphen before gluing the word together.
            stitched.pop();
            stitched.push_str(continuation);
        } else {
            stitched.push('\n');
            stitched.push_str(line);
        }
    }
    stitched
}
/// Returns `true` when `accumulated` ends in a word fragment broken by a hyphen
/// that `next_trimmed` appears to continue.
///
/// Requires a trailing `'-'` preceded by at least three alphabetic characters,
/// and `next_trimmed` to start with at least three ASCII lowercase letters.
fn is_hyphen_wrap_candidate(accumulated: &str, next_trimmed: &str) -> bool {
    let Some(stem) = accumulated.strip_suffix('-') else {
        return false;
    };
    let trailing_letters = stem
        .chars()
        .rev()
        .take_while(|c| c.is_alphabetic())
        .count();
    let leading_lowercase = next_trimmed
        .chars()
        .take_while(char::is_ascii_lowercase)
        .count();
    trailing_letters >= 3 && leading_lowercase >= 3
}
/// Normalises extracted text.
///
/// * Trailing whitespace is removed from every line.
/// * Trailing blank lines are dropped.
/// * Runs of blank lines are limited to two.
/// * Non-empty output always ends with exactly one `'\n'`.
///
/// Form feed (U+000C) counts as whitespace for `str::trim_end`, so a line that
/// holds nothing but a form feed is treated as a blank line; a form feed at the
/// start of a line with other content is kept.
pub(crate) fn normalize_text_output(text: &str) -> String {
    let mut lines: Vec<&str> = text.split('\n').map(str::trim_end).collect();
    while lines.last().is_some_and(|line| line.is_empty()) {
        lines.pop();
    }
    if lines.is_empty() {
        return String::new();
    }

    // Every kept line gains a newline, so reserve one byte beyond the input.
    let mut result = String::with_capacity(text.len() + 1);
    let mut consecutive_empty = 0u32;
    for line in lines {
        if line.is_empty() {
            consecutive_empty += 1;
            if consecutive_empty <= 2 {
                result.push('\n');
            }
        } else {
            consecutive_empty = 0;
            result.push_str(line);
            result.push('\n');
        }
    }
    result
}
#[cfg(test)]
mod tests {
use super::*;
fn span(text: &str, x: f64, y: f64, width: f64) -> TextSpan {
TextSpan {
text: text.into(),
x,
y,
width,
height: 12.0,
font_size: 12.0,
}
}
fn block_texts(spans: Vec<TextSpan>) -> Vec<String> {
group_spans_into_blocks(spans)
.into_iter()
.map(|block| block.text())
.collect()
}
#[test]
fn empty_device_produces_empty_text() {
let dev = TextExtractionDevice::new();
assert!(dev.into_text().is_empty());
}
#[test]
fn single_column_stays_row_major() {
let texts = block_texts(vec![
span("Single Column Line 1", 40.0, 700.0, 140.0),
span("Single Column Line 2", 40.0, 684.0, 140.0),
span("Single Column Line 3", 40.0, 668.0, 140.0),
]);
assert_eq!(
texts,
vec![
"Single Column Line 1",
"Single Column Line 2",
"Single Column Line 3",
]
);
}
#[test]
fn two_column_region_reads_column_major() {
let texts = block_texts(vec![
span("Header", 200.0, 740.0, 80.0),
span("Left column line one", 40.0, 700.0, 115.0),
span("Right column line one", 320.0, 700.0, 120.0),
span("Left column line two", 40.0, 684.0, 115.0),
span("Right column line two", 320.0, 684.0, 120.0),
span("Left column line three", 40.0, 668.0, 125.0),
span("Right column line three", 320.0, 668.0, 130.0),
span("Footer", 200.0, 620.0, 80.0),
]);
assert_eq!(
texts,
vec![
"Header",
"Left column line one",
"Left column line two",
"Left column line three",
"Right column line one",
"Right column line two",
"Right column line three",
"Footer",
]
);
}
#[test]
fn mixed_single_and_multi_column_regions_preserve_shared_bands() {
let texts = block_texts(vec![
span("Intro paragraph", 40.0, 740.0, 180.0),
span("L1 words here", 40.0, 700.0, 110.0),
span("R1 words here", 320.0, 700.0, 110.0),
span("L2 words here", 40.0, 684.0, 110.0),
span("R2 words here", 320.0, 684.0, 110.0),
span("L3 words here", 40.0, 668.0, 110.0),
span("R3 words here", 320.0, 668.0, 110.0),
span("Outro paragraph", 40.0, 620.0, 180.0),
]);
assert_eq!(
texts,
vec![
"Intro paragraph",
"L1 words here",
"L2 words here",
"L3 words here",
"R1 words here",
"R2 words here",
"R3 words here",
"Outro paragraph",
]
);
}
#[test]
fn short_table_like_rows_fall_back_to_row_major() {
let texts = block_texts(vec![
span("Name", 40.0, 700.0, 30.0),
span("Age", 320.0, 700.0, 20.0),
span("Alice", 40.0, 684.0, 35.0),
span("30", 320.0, 684.0, 15.0),
span("Bob", 40.0, 668.0, 24.0),
span("25", 320.0, 668.0, 15.0),
]);
assert_eq!(texts, vec!["Name Age", "Alice 30", "Bob 25"]);
}
/// Three bands with spans at x = 40, 220 and 400: the expected output lists
/// each column in full, left to right.
#[test]
fn three_column_regions_are_supported() {
    let page = vec![
        span("Column one line one", 40.0, 700.0, 105.0),
        span("Column two line one", 220.0, 700.0, 105.0),
        span("Column three line one", 400.0, 700.0, 120.0),
        span("Column one line two", 40.0, 684.0, 105.0),
        span("Column two line two", 220.0, 684.0, 105.0),
        span("Column three line two", 400.0, 684.0, 120.0),
        span("Column one line three", 40.0, 668.0, 120.0),
        span("Column two line three", 220.0, 668.0, 120.0),
        span("Column three line three", 400.0, 668.0, 135.0),
    ];
    let extracted = block_texts(page);
    let expected = vec![
        "Column one line one",
        "Column one line two",
        "Column one line three",
        "Column two line one",
        "Column two line two",
        "Column two line three",
        "Column three line one",
        "Column three line two",
        "Column three line three",
    ];
    assert_eq!(extracted, expected);
}
/// Two single-letter spans separated horizontally (x = 0 and x = 20) must be
/// joined with a space by `TextBlock::text`.
#[test]
fn text_block_concatenation_spaced() {
    let first = span("A", 0.0, 0.0, 6.0);
    let second = span("B", 20.0, 0.0, 6.0);
    let block = TextBlock {
        spans: vec![first, second],
    };
    let joined = block.text();
    assert_eq!(joined, "A B");
}
/// With one span per band there is no intra-band gap to measure, and the
/// result must equal `COLUMN_GAP_THRESHOLD_FALLBACK`.
#[test]
fn adaptive_column_gap_fallback_for_no_gaps() {
    let upper = TextBand::new(span("Hello", 40.0, 700.0, 80.0));
    let lower = TextBand::new(span("World", 40.0, 684.0, 80.0));
    let bands = vec![upper, lower];
    let deviation = (compute_adaptive_column_gap(&bands) - COLUMN_GAP_THRESHOLD_FALLBACK).abs();
    assert!(deviation < 0.01);
}
/// Three bands, each with three spans whose x/width arguments leave 4.0
/// between neighbours (40+30 -> 74, 74+30 -> 108). The threshold must land in
/// `10.0..=14.0`; the assertion message names ~12 as the target.
// NOTE(review): ~12 presumably comes from median gap x COLUMN_GAP_MEDIAN_MULTIPLIER
// (4.0 x 3.0) - confirm against compute_adaptive_column_gap.
#[test]
fn adaptive_column_gap_uses_median() {
    let mut bands = Vec::new();
    for y in [700.0, 684.0, 668.0] {
        let mut band = TextBand::new(span("word1", 40.0, y, 30.0));
        band.spans.push(span("word2", 74.0, y, 30.0));
        band.spans.push(span("word3", 108.0, y, 30.0));
        bands.push(band);
    }
    let threshold = compute_adaptive_column_gap(&bands);
    assert!(
        (10.0..=14.0).contains(&threshold),
        "expected ~12, got {threshold}"
    );
}
/// Four bands whose two spans sit 2.0 apart by their x/width arguments
/// (0+18 -> 20). The result must equal `COLUMN_GAP_THRESHOLD_MIN`.
#[test]
fn adaptive_column_gap_clamps_to_min() {
    let bands: Vec<_> = [700.0, 684.0, 668.0, 652.0]
        .iter()
        .map(|&y| {
            let mut band = TextBand::new(span("abc", 0.0, y, 18.0));
            band.spans.push(span("def", 20.0, y, 18.0));
            band
        })
        .collect();
    let threshold = compute_adaptive_column_gap(&bands);
    let deviation = (threshold - COLUMN_GAP_THRESHOLD_MIN).abs();
    assert!(
        deviation < 0.01,
        "expected {COLUMN_GAP_THRESHOLD_MIN}, got {threshold}"
    );
}
/// A single band whose two spans are 50.0 apart by their x/width arguments
/// (0+30 -> 80). The expected threshold is 37.5, which the assertion message
/// describes as 0.75 x 50.
#[test]
fn adaptive_column_gap_all_large_gaps_uses_fraction_of_min() {
    let left = span("Left", 0.0, 700.0, 30.0);
    let right = span("Right", 80.0, 700.0, 30.0);
    let mut band = TextBand::new(left);
    band.spans.push(right);
    let bands = vec![band];
    let threshold = compute_adaptive_column_gap(&bands);
    let deviation = (threshold - 37.5).abs();
    assert!(
        deviation < 0.01,
        "expected 37.5 (0.75×50), got {threshold}"
    );
}
/// Trailing whitespace before each newline is removed.
#[test]
fn normalize_trims_trailing_whitespace_per_line() {
    let normalized = normalize_text_output("hello \nworld \n");
    assert_eq!(normalized, "hello\nworld\n");
}
/// A run of five newlines is reduced to three.
#[test]
fn normalize_collapses_excess_newlines() {
    let normalized = normalize_text_output("hello\n\n\n\n\nworld\n");
    assert_eq!(normalized, "hello\n\n\nworld\n");
}
/// A blank line between two paragraphs passes through unchanged.
#[test]
fn normalize_preserves_double_newline() {
    let input = "paragraph one\n\nparagraph two\n";
    let normalized = normalize_text_output(input);
    assert_eq!(normalized, "paragraph one\n\nparagraph two\n");
}
/// A form-feed character (`\x0C`) between pages is kept in the output.
#[test]
fn normalize_preserves_form_feed() {
    let input = "page1\n\n\x0Cpage2\n";
    let normalized = normalize_text_output(input);
    assert_eq!(normalized, "page1\n\n\x0Cpage2\n");
}
/// Input without a final newline gains one.
#[test]
fn normalize_adds_trailing_newline() {
    let normalized = normalize_text_output("hello");
    assert_eq!(normalized, "hello\n");
}
/// Empty input stays empty (no newline is appended).
#[test]
fn normalize_empty_input() {
    let normalized = normalize_text_output("");
    assert_eq!(normalized, "");
}
/// Input consisting only of spaces and newlines normalizes to the empty string.
#[test]
fn normalize_only_whitespace() {
    let normalized = normalize_text_output(" \n \n");
    assert_eq!(normalized, "");
}
/// A word split across lines with a trailing hyphen ("aver-" / "age") is
/// rejoined into a single line.
#[test]
fn hyphen_stitch_joins_wrapped_word() {
    let lines: Vec<String> = vec![String::from("the aver-"), String::from("age rainfall")];
    let stitched = stitch_hyphenated_lines(&lines);
    assert_eq!(stitched, "the average rainfall");
}
/// Leading whitespace on the continuation line does not end up inside the
/// rejoined word.
#[test]
fn hyphen_stitch_handles_leading_whitespace() {
    let lines: Vec<String> = vec![String::from("pre-"), String::from(" dict the outcome")];
    let stitched = stitch_hyphenated_lines(&lines);
    assert_eq!(stitched, "predict the outcome");
}
/// A continuation line that starts with a capital letter is left on its own
/// line and the hyphen is kept.
#[test]
fn hyphen_stitch_capital_continuation_not_stitched() {
    let lines: Vec<String> = vec![String::from("Section three-"), String::from("Summary here")];
    let stitched = stitch_hyphenated_lines(&lines);
    assert_eq!(stitched, "Section three-\nSummary here");
}
/// A line consisting of a lone dash is not merged with the line after it.
#[test]
fn hyphen_stitch_bullet_dash_not_stitched() {
    let lines: Vec<String> = vec![
        String::from("Items:"),
        String::from("-"),
        String::from("milk"),
    ];
    let stitched = stitch_hyphenated_lines(&lines);
    assert_eq!(stitched, "Items:\n-\nmilk");
}
/// A hyphen that follows digits ("42-") does not trigger stitching.
#[test]
fn hyphen_stitch_numeric_range_not_stitched() {
    let lines: Vec<String> = vec![String::from("page 42-"), String::from("seventy")];
    let stitched = stitch_hyphenated_lines(&lines);
    assert_eq!(stitched, "page 42-\nseventy");
}
/// A two-letter fragment before the hyphen ("re-") is left unstitched.
#[test]
fn hyphen_stitch_short_prefix_not_stitched() {
    let lines: Vec<String> = vec![String::from("re-"), String::from("organize")];
    let stitched = stitch_hyphenated_lines(&lines);
    assert_eq!(stitched, "re-\norganize");
}
/// A two-letter first word on the continuation line ("an") is left unstitched.
#[test]
fn hyphen_stitch_short_continuation_not_stitched() {
    let lines: Vec<String> = vec![String::from("counter-"), String::from("an example")];
    let stitched = stitch_hyphenated_lines(&lines);
    assert_eq!(stitched, "counter-\nan example");
}
/// A hyphen in the middle of a line ("real-time") is untouched.
#[test]
fn hyphen_stitch_compound_word_midline_preserved() {
    let lines: Vec<String> = vec![String::from("real-time system")];
    let stitched = stitch_hyphenated_lines(&lines);
    assert_eq!(stitched, "real-time system");
}
/// A single line without hyphens is returned as-is.
#[test]
fn hyphen_stitch_single_line_unchanged() {
    let lines: Vec<String> = vec![String::from("only line")];
    let stitched = stitch_hyphenated_lines(&lines);
    assert_eq!(stitched, "only line");
}
/// An empty list of lines yields the empty string.
#[test]
fn hyphen_stitch_empty_input() {
    let no_lines: Vec<String> = Vec::new();
    let stitched = stitch_hyphenated_lines(&no_lines);
    assert_eq!(stitched, "");
}
/// Test helper: builds a `TextExtractionDevice` whose cached median character
/// width equals `median`.
///
/// It pushes `MEDIAN_REFRESH` copies of `median` into `glyph_widths`, calls
/// `refresh_median_char_width`, and asserts that the cache now holds `median`.
fn make_device_with_median(median: f64) -> TextExtractionDevice {
    let mut device = TextExtractionDevice::new();
    for _ in 0..MEDIAN_REFRESH {
        device.glyph_widths.push(median);
    }
    device.refresh_median_char_width();
    let cache_error = (device.cached_median_char_width - median).abs();
    assert!(cache_error < 1e-9);
    device
}
/// A pending TJ offset of 250 with a 0.5 gap is enough for the consensus to
/// report a space.
#[test]
fn consensus_inserts_space_on_strong_tj_offset_alone() {
    let mut device = make_device_with_median(6.0);
    device.pending_tj_offset = 250.0;
    let wants_space = device.evaluate_space_consensus(0.5, 12.0, "Hello", "World");
    assert!(wants_space);
}
/// With no TJ offset set, a 2.5 gap against a median char width of 6.0 yields
/// a space.
#[test]
fn consensus_inserts_space_on_geometric_gap_alone() {
    let device = make_device_with_median(6.0);
    let wants_space = device.evaluate_space_consensus(2.5, 12.0, "hello", "world");
    assert!(wants_space);
}
/// A 0.5 gap between "fi" and "lm" must not produce a space.
#[test]
fn consensus_no_space_on_kerning_gap() {
    let device = make_device_with_median(6.0);
    let wants_space = device.evaluate_space_consensus(0.5, 12.0, "fi", "lm");
    assert!(!wants_space);
}
/// A 2.5 gap between "helloWorld" and the capitalised "Inc" yields a space.
#[test]
fn consensus_inserts_space_on_camel_case_plus_gap() {
    let device = make_device_with_median(6.0);
    let wants_space = device.evaluate_space_consensus(2.5, 12.0, "helloWorld", "Inc");
    assert!(wants_space);
}
/// A 2.5 gap between "123" and "abc" yields a space.
#[test]
fn consensus_inserts_space_on_digit_letter_transition_with_gap() {
    let device = make_device_with_median(6.0);
    let wants_space = device.evaluate_space_consensus(2.5, 12.0, "123", "abc");
    assert!(wants_space);
}
/// A lower-to-upper case transition ("camel" / "Case") with only a 0.5 gap
/// must not produce a space.
#[test]
fn consensus_heuristic_alone_is_insufficient() {
    let device = make_device_with_median(6.0);
    let wants_space = device.evaluate_space_consensus(0.5, 12.0, "camel", "Case");
    assert!(!wants_space);
}
/// On a fresh device (no glyph widths recorded) at font size 12, a 1.9 gap
/// yields a space and a 1.5 gap does not.
// NOTE(review): the cut-off is presumably font_size x GAP_TO_FONT_SIZE_FALLBACK_FRACTION
// (12 x 0.15 = 1.8) - confirm against evaluate_space_consensus.
#[test]
fn consensus_falls_back_to_font_size_when_no_median() {
    let device = TextExtractionDevice::new();
    let above_cutoff = device.evaluate_space_consensus(1.9, 12.0, "a", "b");
    assert!(above_cutoff);
    let below_cutoff = device.evaluate_space_consensus(1.5, 12.0, "a", "b");
    assert!(!below_cutoff);
}
/// A pending TJ offset of 50 with a 0.5 gap must not produce a space.
#[test]
fn consensus_ignores_tiny_tj_offsets() {
    let mut device = make_device_with_median(6.0);
    device.pending_tj_offset = 50.0;
    let wants_space = device.evaluate_space_consensus(0.5, 12.0, "Hello", "World");
    assert!(!wants_space);
}
/// A pending TJ offset of -250 is treated like a large positive one: a space
/// is reported despite the 0.5 gap.
#[test]
fn consensus_accepts_negative_tj_offsets() {
    let mut device = make_device_with_median(6.0);
    device.pending_tj_offset = -250.0;
    let wants_space = device.evaluate_space_consensus(0.5, 12.0, "Hello", "World");
    assert!(wants_space);
}
/// Two consecutive `text_adjustment` calls (120 then 140) sum to 260 in
/// `pending_tj_offset` when no glyph is drawn between them.
#[test]
fn text_adjustment_accumulates_until_glyph() {
    let mut device = TextExtractionDevice::new();
    device.text_adjustment(120.0);
    device.text_adjustment(140.0);
    let drift = (device.pending_tj_offset - 260.0).abs();
    assert!(drift < 1e-6);
}
/// Headline, three two-column bands and a footer: the headline must be the
/// first block, the footer the last, and the final left-column line must
/// precede the first right-column line.
#[test]
fn xy_cut_header_body_footer_with_two_columns() {
    let texts = block_texts(vec![
        span("HEADLINE TITLE", 180.0, 760.0, 120.0),
        span("Left col line A", 40.0, 700.0, 110.0),
        span("Right col line A", 320.0, 700.0, 115.0),
        span("Left col line B", 40.0, 684.0, 110.0),
        span("Right col line B", 320.0, 684.0, 115.0),
        span("Left col line C", 40.0, 668.0, 110.0),
        span("Right col line C", 320.0, 668.0, 115.0),
        span("FOOTER LINE TEXT", 180.0, 600.0, 120.0),
    ]);
    assert_eq!(texts.first().map(String::as_str), Some("HEADLINE TITLE"));
    assert_eq!(texts.last().map(String::as_str), Some("FOOTER LINE TEXT"));
    // Panic with the missing text and the full output instead of a bare
    // `Option::unwrap` message, so a failure is diagnosable from the log.
    let index_of = |needle: &str| {
        texts
            .iter()
            .position(|s| s == needle)
            .unwrap_or_else(|| panic!("block {needle:?} missing from output: {texts:?}"))
    };
    let left_c_idx = index_of("Left col line C");
    let right_a_idx = index_of("Right col line A");
    assert!(
        left_c_idx < right_a_idx,
        "expected column-major ordering in body: {texts:?}"
    );
}
/// Two short table rows are expected row by row ("Name Age", "Alice 30"), not
/// split into columns.
#[test]
fn xy_cut_rejects_column_split_on_table_rows() {
    let cells = vec![
        span("Name", 40.0, 700.0, 30.0),
        span("Age", 320.0, 700.0, 20.0),
        span("Alice", 40.0, 684.0, 35.0),
        span("30", 320.0, 684.0, 15.0),
    ];
    let rows = block_texts(cells);
    let expected = vec!["Name Age", "Alice 30"];
    assert_eq!(rows, expected);
}
/// A full-width intro line above two short two-column bands: the first
/// extracted block must contain the intro text.
#[test]
fn xy_cut_rejects_column_split_when_one_band_is_full_width() {
    let texts = block_texts(vec![
        span(
            "Full width intro spanning both columns here",
            40.0,
            740.0,
            360.0,
        ),
        span("Left A", 40.0, 700.0, 50.0),
        span("Right A", 320.0, 700.0, 50.0),
        span("Left B", 40.0, 684.0, 50.0),
        span("Right B", 320.0, 684.0, 50.0),
    ]);
    // Use `first()` rather than `texts[0]` so that empty output fails with the
    // descriptive message below instead of an index-out-of-bounds panic.
    let intro_is_first =
        matches!(texts.first(), Some(first) if first.contains("Full width intro"));
    assert!(
        intro_is_first,
        "expected full-width intro first: {texts:?}"
    );
}
/// Two lines 60 units apart vertically come back as two separate blocks, in
/// the order they were supplied.
#[test]
fn xy_cut_horizontal_split_for_zone_boundaries() {
    let paragraphs = vec![
        span("First paragraph body text", 40.0, 740.0, 200.0),
        span("Second paragraph body", 40.0, 680.0, 180.0),
    ];
    let texts = block_texts(paragraphs);
    assert_eq!(texts.len(), 2);
    let (upper, lower) = (&texts[0], &texts[1]);
    assert!(upper.starts_with("First"));
    assert!(lower.starts_with("Second"));
}
/// A page with exactly one span yields exactly one block with that text.
#[test]
fn xy_cut_recursion_terminates_with_single_span() {
    let lone_span = span("Only one span on the page", 40.0, 700.0, 180.0);
    let texts = block_texts(vec![lone_span]);
    let expected = vec!["Only one span on the page"];
    assert_eq!(texts, expected);
}
/// Spans with font sizes 8, 12 and 24 have a median font size of 12.
#[test]
fn median_font_size_handles_mixed_sizes() {
    // Every span shares position and width; `height` mirrors `font_size`.
    let sized = |label: &str, size: f64| TextSpan {
        text: label.into(),
        x: 0.0,
        y: 0.0,
        width: 10.0,
        height: size,
        font_size: size,
    };
    let spans = vec![sized("small", 8.0), sized("medium", 12.0), sized("large", 24.0)];
    let median = median_font_size(&spans);
    assert!((median - 12.0).abs() < 1e-9);
}
/// Two bands that each have a span on either side of x = 200 are accepted as
/// band-aligned columns.
#[test]
fn columns_band_aligned_accepts_aligned_columns() {
    let spans = vec![
        span("L1", 40.0, 700.0, 60.0),
        span("R1", 300.0, 700.0, 60.0),
        span("L2", 40.0, 684.0, 60.0),
        span("R2", 300.0, 684.0, 60.0),
    ];
    let stats = PageStats::from_spans(&spans);
    let aligned = columns_are_band_aligned(&spans, 200.0, 40.0, 360.0, &stats);
    assert!(aligned);
}
/// A 280-wide banner starting at x = 40 above a single two-span band causes
/// the alignment check at x = 200 to be rejected.
#[test]
fn columns_band_aligned_rejects_wide_single_side_band() {
    let spans = vec![
        span("Wide banner line across top", 40.0, 740.0, 280.0),
        span("L1", 40.0, 700.0, 60.0),
        span("R1", 300.0, 700.0, 60.0),
    ];
    let stats = PageStats::from_spans(&spans);
    let aligned = columns_are_band_aligned(&spans, 200.0, 40.0, 360.0, &stats);
    assert!(!aligned);
}
/// For three lines spaced 20 apart in y, with widths of 10 per character,
/// `PageStats` reports a median char width of 10 and a median line spacing of
/// 20. The median font size is asserted to be 12.
// NOTE(review): the font size of 12 presumably comes from the `span` helper's
// default - confirm against its definition.
#[test]
fn page_stats_computes_median_values() {
    let lines = vec![
        span("one", 40.0, 700.0, 30.0),
        span("two", 40.0, 680.0, 30.0),
        span("three", 40.0, 660.0, 50.0),
    ];
    let stats = PageStats::from_spans(&lines);
    let font_size_error = (stats.median_font_size - 12.0).abs();
    let char_width_error = (stats.median_char_width - 10.0).abs();
    let line_spacing_error = (stats.median_line_spacing - 20.0).abs();
    assert!(font_size_error < 1e-9);
    assert!(char_width_error < 1e-9);
    assert!(line_spacing_error < 1e-9);
}
/// With no spans, `PageStats` reports font size 12, char width 6 and a line
/// spacing of exactly 0.
#[test]
fn page_stats_handles_empty_input() {
    let stats = PageStats::from_spans(&[]);
    let font_size_error = (stats.median_font_size - 12.0).abs();
    let char_width_error = (stats.median_char_width - 6.0).abs();
    assert!(font_size_error < 1e-9);
    assert!(char_width_error < 1e-9);
    assert_eq!(stats.median_line_spacing, 0.0);
}
/// Three bands of four spans each; by their x/width arguments the spans are
/// 4.0 apart within a pair (140 -> 144, 336 -> 340) and 12.0 apart between
/// the pairs (224 -> 236). The output must have at least six blocks, and the
/// first one must contain "Lorem".
#[test]
fn narrow_gutter_detected_with_adaptive_threshold() {
    let mut spans = Vec::new();
    for y in [700.0, 684.0, 668.0] {
        spans.extend([
            span("Lorem ipsum", 40.0, y, 100.0),
            span("dolor sit", 144.0, y, 80.0),
            span("amet consec", 236.0, y, 100.0),
            span("tetur adipi", 340.0, y, 80.0),
        ]);
    }
    let texts = block_texts(spans);
    let block_count = texts.len();
    assert!(
        block_count >= 6,
        "expected column-major output, got {texts:?}"
    );
    let leading_block = &texts[0];
    assert!(
        leading_block.contains("Lorem"),
        "first block should be left column: {texts:?}"
    );
}
/// A two-span banner line above three bands of three columns whose middle and
/// right x positions vary from band to band: the banner is expected as one
/// joined block, followed by each column in full, left to right.
#[test]
fn xy_cut_leaf_falls_back_to_legacy_columns_for_header_plus_three_columns() {
    let page = vec![
        span("73022", 45.0, 750.0, 70.0),
        span("Federal Register banner", 125.6, 750.0, 260.0),
        span("Left column line one", 45.0, 725.0, 140.0),
        span("Middle column line one", 222.0, 725.0, 140.0),
        span("Right column line one", 399.0, 725.0, 120.0),
        span("Left column line two", 45.0, 715.0, 140.0),
        span("Middle column line two", 210.0, 715.0, 152.0),
        span("Right column line two", 388.0, 715.0, 132.0),
        span("Left column line three", 45.0, 705.0, 140.0),
        span("Middle column line three", 235.0, 705.0, 135.0),
        span("Right column line three", 408.0, 705.0, 118.0),
    ];
    let extracted = block_texts(page);
    let expected = vec![
        "73022 Federal Register banner",
        "Left column line one",
        "Left column line two",
        "Left column line three",
        "Middle column line one",
        "Middle column line two",
        "Middle column line three",
        "Right column line one",
        "Right column line two",
        "Right column line three",
    ];
    assert_eq!(extracted, expected);
}
/// Copies of the same text drawn at x offsets a fraction of a unit apart must
/// appear once in the output, both for a whole repeated line and for a
/// repeated word inside a line.
#[test]
fn overlapping_fake_bold_spans_collapse_to_single_copy() {
    let page = vec![
        // Line 1: the whole line drawn four times.
        span("1 This is fakebold text.", 25.9, 785.3, 320.0),
        span("1 This is fakebold text.", 26.2, 785.3, 320.0),
        span("1 This is fakebold text.", 26.4, 785.3, 320.0),
        span("1 This is fakebold text.", 26.7, 785.3, 320.0),
        // Line 2: only the trailing word is repeated.
        span("2 This is a fakebold", 27.0, 714.8, 142.0),
        span(" fakebold", 169.8, 714.8, 70.0),
        span(" fakebold", 170.1, 714.8, 70.0),
        span(" fakebold word.", 170.4, 714.8, 110.0),
    ];
    let extracted = block_texts(page);
    let expected = vec!["1 This is fakebold text.", "2 This is a fakebold word."];
    assert_eq!(extracted, expected);
}
}