Skip to main content

pdf_engine/
text.rs

1//! Text extraction via a custom Device implementation.
2
3use kurbo::{Affine, BezPath};
4use pdf_render::pdf_interpret::cmap::BfString;
5use pdf_render::pdf_interpret::font::Glyph;
6use pdf_render::pdf_interpret::{
7    BlendMode, ClipPath, Device, GlyphDrawMode, Image, Paint, PathDrawMode, SoftMask,
8};
9use std::cmp::Ordering;
10
11/// Minimum Y tolerance for grouping spans into horizontal bands. The
12/// effective tolerance is typically `median_font_size * BAND_Y_FRACTION`
13/// per ANN[r17/TEX4]; this constant acts as the absolute floor.
14const BAND_Y_TOLERANCE: f64 = 5.0;
15/// Fraction of the page's median font size used as the band-Y
16/// tolerance. Empirically 0.30× works across common typography —
17/// below typical leading (~1.2×) so adjacent lines never collapse,
18/// above sub-pixel baseline drift.
19const BAND_Y_FRACTION: f64 = 0.30;
20/// Multiplier applied to median line spacing to derive the horizontal
21/// paragraph-break cut threshold. Normal line-to-line progression is
22/// ~1.0× the median; paragraph breaks typically show 1.5× or more.
23const PARAGRAPH_BREAK_LINE_SPACING_MULTIPLIER: f64 = 1.8;
24
25// ANN[r17/TEX1] Multi-signal consensus thresholds.
26// The previous single-threshold scheme (gap > 0.15 * font_size) missed
27// word boundaries when kerning or narrow fonts produced small measured
28// gaps even though the PDF emitted an explicit TJ backward shift, and
29// over-emitted spaces for condensed fonts where 0.15em of kerning is
30// well below an actual word space. The consensus system weights three
31// signals and inserts a space when the combined confidence exceeds
32// SPACE_CONSENSUS_THRESHOLD.
33/// Raw TJ backward adjustment (positive in PDF TJ units) that is
34/// definitively a word break. Matches pdftotext / MuPDF heuristics —
35/// a space glyph is typically emitted as either a literal 0x20 or as
36/// a TJ adjustment of around 250 1/1000 em. 100 units is a safely
37/// conservative floor.
38const TJ_SPACE_THRESHOLD_UNITS: f32 = 100.0;
39/// Weight of the TJ offset signal when confidence is high.
40const TJ_SIGNAL_WEIGHT: f64 = 0.95;
41/// Weight of the purely geometric gap signal.
42const GAP_SIGNAL_WEIGHT: f64 = 0.80;
43/// Weight of character-heuristic signals (CamelCase, digit↔letter).
44const HEURISTIC_SIGNAL_WEIGHT: f64 = 0.60;
45/// Combined weight at which a space is inserted.
46const SPACE_CONSENSUS_THRESHOLD: f64 = 0.75;
47/// Fraction of a median character width above which a gap contributes
48/// to the geometric signal (pdf_oxide uses ~0.30).
49const GAP_TO_MEDIAN_CHAR_FRACTION: f64 = 0.30;
50/// Fallback gap fraction relative to `font_size` when the running
51/// median character width has not yet been established.
52const GAP_TO_FONT_SIZE_FALLBACK_FRACTION: f64 = 0.15;
53
54/// Minimum horizontal gap treated as a column gutter (adaptive fallback).
55const COLUMN_GAP_THRESHOLD_MIN: f64 = 10.0;
56/// Maximum adaptive column gap threshold.
57const COLUMN_GAP_THRESHOLD_MAX: f64 = 40.0;
58/// Multiplier applied to median inter-word gap to derive column threshold.
59const COLUMN_GAP_MEDIAN_MULTIPLIER: f64 = 3.0;
60/// Fallback column gap threshold when median cannot be computed.
61const COLUMN_GAP_THRESHOLD_FALLBACK: f64 = 20.0;
62/// Maximum drift allowed when matching gutters across neighboring bands.
63const COLUMN_GAP_MATCH_TOLERANCE: f64 = 12.0;
64/// Minimum number of gapped bands required before we enable column mode.
65const MIN_COLUMN_GAPPED_BANDS: usize = 3;
66/// Minimum fraction of bands in a region that must expose the shared gutters.
67const MIN_COLUMN_GAP_SUPPORT: f64 = 0.80;
68/// Minimum fraction of non-empty column slices that must look like prose.
69const MIN_DENSE_SLICE_RATIO: f64 = 0.35;
70
71/// A single text span at a specific position.
72#[derive(Debug, Clone)]
73pub struct TextSpan {
74    /// The extracted text.
75    pub text: String,
76    /// X position in user space.
77    pub x: f64,
78    /// Y position in user space.
79    pub y: f64,
80    /// Approximate bounding-box width in user space.
81    pub width: f64,
82    /// Approximate bounding-box height in user space.
83    pub height: f64,
84    /// Font size (approximate, from transform).
85    pub font_size: f64,
86}
87
88impl TextSpan {
89    /// Conservative right edge using whichever is wider: measured or estimated.
90    /// Used by column detection to avoid underestimating span extent.
91    fn right(&self) -> f64 {
92        self.x + self.width.max(self.estimated_width())
93    }
94
95    /// Right edge from measured glyph positions only.
96    fn measured_right(&self) -> f64 {
97        self.x + self.width
98    }
99
100    fn estimated_width(&self) -> f64 {
101        let char_count = self.text.chars().count() as f64;
102        if char_count <= 0.0 {
103            self.font_size * 0.5
104        } else {
105            self.font_size * 0.5 * char_count
106        }
107    }
108}
109
110/// A block of text (grouped by reading order).
111#[derive(Debug, Clone)]
112pub struct TextBlock {
113    /// Spans within this block, sorted by position.
114    pub spans: Vec<TextSpan>,
115}
116
117impl TextBlock {
118    /// Concatenate all spans into a single string.
119    ///
120    /// Spans that are close together are joined without a separator;
121    /// a space is inserted when the gap between spans exceeds half
122    /// the average character width.
123    pub fn text(&self) -> String {
124        if self.spans.is_empty() {
125            return String::new();
126        }
127        let mut result = self.spans[0].text.clone();
128        for pair in self.spans.windows(2) {
129            let prev = &pair[0];
130            let curr = &pair[1];
131            let expected_end = prev.measured_right();
132            let gap = curr.x - expected_end;
133            if gap <= prev.font_size * 0.12 {
134                if let Some(trimmed) = trim_overlapping_word_prefix(&prev.text, &curr.text) {
135                    result.push_str(&trimmed);
136                    continue;
137                }
138            }
139            if gap > prev.font_size * 0.25 {
140                result.push(' ');
141            }
142            result.push_str(&curr.text);
143        }
144        result
145    }
146}
147
148#[derive(Debug, Clone)]
149struct TextBand {
150    y: f64,
151    spans: Vec<TextSpan>,
152}
153
154impl TextBand {
155    fn new(span: TextSpan) -> Self {
156        Self {
157            y: span.y,
158            spans: vec![span],
159        }
160    }
161
162    fn sort_spans(&mut self) {
163        self.spans.sort_by(|a, b| {
164            a.x.partial_cmp(&b.x)
165                .unwrap_or(Ordering::Equal)
166                .then_with(|| b.y.partial_cmp(&a.y).unwrap_or(Ordering::Equal))
167        });
168        collapse_overprinted_spans(&mut self.spans);
169    }
170
171    fn row_block(&self) -> TextBlock {
172        let mut spans = self.spans.clone();
173        spans.sort_by(|a, b| a.x.partial_cmp(&b.x).unwrap_or(Ordering::Equal));
174        TextBlock { spans }
175    }
176
177    fn left(&self) -> f64 {
178        self.spans
179            .iter()
180            .map(|span| span.x)
181            .fold(f64::INFINITY, f64::min)
182    }
183
184    fn right(&self) -> f64 {
185        self.spans
186            .iter()
187            .map(TextSpan::right)
188            .fold(f64::NEG_INFINITY, f64::max)
189    }
190
191    fn width(&self) -> f64 {
192        (self.right() - self.left()).max(0.0)
193    }
194
195    fn gap_midpoints(&self, column_gap_threshold: f64) -> Vec<f64> {
196        self.gaps(column_gap_threshold)
197            .into_iter()
198            .map(|gap| (gap.start + gap.end) * 0.5)
199            .collect()
200    }
201
202    fn gaps(&self, column_gap_threshold: f64) -> Vec<BandGap> {
203        if self.spans.len() < 2 {
204            return Vec::new();
205        }
206
207        let mut spans = self.spans.clone();
208        spans.sort_by(|a, b| a.x.partial_cmp(&b.x).unwrap_or(Ordering::Equal));
209
210        let mut gaps = Vec::new();
211        let mut prev_right = spans[0].right();
212        for span in spans.iter().skip(1) {
213            let gap = span.x - prev_right;
214            if gap >= column_gap_threshold {
215                gaps.push(BandGap {
216                    start: prev_right,
217                    end: span.x,
218                });
219            }
220            prev_right = prev_right.max(span.right());
221        }
222
223        gaps
224    }
225
226    fn split_by_boundaries(&self, boundaries: &[f64]) -> Vec<Vec<TextSpan>> {
227        let mut columns = vec![Vec::new(); boundaries.len() + 1];
228        for span in &self.spans {
229            let center_x = span.x + span.width.max(span.estimated_width()) * 0.5;
230            let column_idx = boundaries
231                .iter()
232                .position(|boundary| center_x < *boundary)
233                .unwrap_or(boundaries.len());
234            columns[column_idx].push(span.clone());
235        }
236
237        for spans in &mut columns {
238            spans.sort_by(|a, b| a.x.partial_cmp(&b.x).unwrap_or(Ordering::Equal));
239        }
240
241        columns
242    }
243
244    fn fits_single_column(
245        &self,
246        boundaries: &[f64],
247        region_left: f64,
248        region_right: f64,
249    ) -> Option<usize> {
250        let mut column_idx: Option<usize> = None;
251        for span in &self.spans {
252            let left = span.x;
253            let right = span.right();
254            if boundaries
255                .iter()
256                .any(|boundary| left < *boundary && right > *boundary)
257            {
258                return None;
259            }
260
261            let center_x = left + (right - left) * 0.5;
262            let idx = boundaries
263                .iter()
264                .position(|boundary| center_x < *boundary)
265                .unwrap_or(boundaries.len());
266            match column_idx {
267                Some(existing) if existing != idx => return None,
268                Some(_) => {}
269                None => column_idx = Some(idx),
270            }
271        }
272        let idx = column_idx?;
273        let mut edges = Vec::with_capacity(boundaries.len() + 2);
274        edges.push(region_left);
275        edges.extend_from_slice(boundaries);
276        edges.push(region_right);
277
278        let column_width = (edges[idx + 1] - edges[idx]).max(0.0);
279        if column_width <= 0.0 || self.width() > column_width * 0.8 {
280            return None;
281        }
282
283        Some(idx)
284    }
285}
286
287#[derive(Debug, Clone, Copy)]
288struct BandGap {
289    start: f64,
290    end: f64,
291}
292
293/// A Device implementation that captures text from draw_glyph calls.
294///
295/// ANN[r17/TEX1][r17/TEX3] Space detection uses a multi-signal consensus
296/// rather than a single geometric threshold. Three signals vote:
297///   1. `pending_tj_offset`  — raw TJ backward shift surfaced by the
298///      interpreter (confidence 0.95). This is the definitive word-break
299///      signal used by pdftotext / MuPDF.
300///   2. geometric gap        — measured horizontal distance between the
301///      previous glyph's right edge and this glyph's origin (confidence
302///      0.80). Compared against the running median glyph width rather
303///      than a flat em-fraction so condensed/wide fonts are handled
304///      uniformly.
305///   3. character heuristic  — CamelCase transition or digit↔letter
306///      transition at the merge point (confidence 0.60). Catches cases
307///      where the writer relied on typography (e.g. table cells glued
308///      with zero gap: `Qty1Price$5`).
309///
310/// A space is inserted when the weighted sum meets SPACE_CONSENSUS_THRESHOLD.
311/// Span accumulation still merges adjacent glyphs into one TextSpan (TEX3)
312/// so downstream reading-order logic sees logical text runs, not individual
313/// character positions.
314pub(crate) struct TextExtractionDevice {
315    spans: Vec<TextSpan>,
316    last_y: f64,
317    last_end_x: f64,
318    /// TJ adjustment in raw 1/1000 em units since the last glyph was
319    /// drawn. Positive values = backward shift (i.e., explicit horizontal
320    /// space). Reset every time a glyph is drawn.
321    pending_tj_offset: f32,
322    /// Running sample of measured glyph widths used as the adaptive
323    /// reference for the geometric gap signal. Cheap to maintain and
324    /// avoids having to re-walk all spans per decision.
325    glyph_widths: Vec<f64>,
326    /// Cached median glyph width (kept fresh every `MEDIAN_REFRESH`
327    /// insertions). Zero = not yet established, caller falls back to
328    /// font-size scaling.
329    cached_median_char_width: f64,
330}
331
332const MEDIAN_REFRESH: usize = 32;
333
334impl Default for TextExtractionDevice {
335    fn default() -> Self {
336        Self::new()
337    }
338}
339
340impl TextExtractionDevice {
341    /// Create a new text extraction device.
342    pub fn new() -> Self {
343        Self {
344            spans: Vec::new(),
345            last_y: f64::NEG_INFINITY,
346            last_end_x: f64::NEG_INFINITY,
347            pending_tj_offset: 0.0,
348            glyph_widths: Vec::new(),
349            cached_median_char_width: 0.0,
350        }
351    }
352
353    /// Refresh the cached median char width. Called lazily from
354    /// `draw_glyph` to keep the hot path cheap.
355    fn refresh_median_char_width(&mut self) {
356        if self.glyph_widths.is_empty() {
357            self.cached_median_char_width = 0.0;
358            return;
359        }
360        let mut sorted = self.glyph_widths.clone();
361        sorted.sort_by(|a, b| a.partial_cmp(b).unwrap_or(Ordering::Equal));
362        self.cached_median_char_width = sorted[sorted.len() / 2];
363    }
364
365    /// Decide whether a space should be glued between two glyphs within
366    /// the same span. Returns (insert_space, start_new_span).
367    fn evaluate_space_consensus(
368        &self,
369        gap: f64,
370        font_size: f64,
371        prev_text: &str,
372        next_text: &str,
373    ) -> bool {
374        let mut confidence = 0.0;
375
376        // Signal 1 — TJ offset (highest confidence). Raw units; a full
377        // space is ~250. Anything over TJ_SPACE_THRESHOLD_UNITS counts.
378        if self.pending_tj_offset.abs() >= TJ_SPACE_THRESHOLD_UNITS {
379            confidence += TJ_SIGNAL_WEIGHT;
380        }
381
382        // Signal 2 — geometric gap. Prefer the adaptive median-char-width
383        // reference; fall back to font-size when the median hasn't been
384        // established yet (first few glyphs on a page).
385        let gap_reference = if self.cached_median_char_width > 0.0 {
386            self.cached_median_char_width * GAP_TO_MEDIAN_CHAR_FRACTION
387        } else {
388            font_size * GAP_TO_FONT_SIZE_FALLBACK_FRACTION
389        };
390        if gap > gap_reference {
391            confidence += GAP_SIGNAL_WEIGHT;
392        }
393
394        // Signal 3 — character-class transitions. Only checked when the
395        // previous span ends with a character and the incoming text starts
396        // with one; avoids double-counting with punctuation.
397        if let (Some(prev_last), Some(next_first)) =
398            (prev_text.chars().last(), next_text.chars().next())
399        {
400            let camel = prev_last.is_lowercase() && next_first.is_uppercase();
401            let digit_to_letter = prev_last.is_ascii_digit() && next_first.is_alphabetic();
402            let letter_to_digit = prev_last.is_alphabetic() && next_first.is_ascii_digit();
403            if camel || digit_to_letter || letter_to_digit {
404                confidence += HEURISTIC_SIGNAL_WEIGHT;
405            }
406        }
407
408        confidence >= SPACE_CONSENSUS_THRESHOLD
409    }
410
411    /// Consume the device and return extracted text as a single string.
412    pub fn into_text(self) -> String {
413        let blocks = group_spans_into_blocks(self.spans);
414        let lines: Vec<String> = blocks.iter().map(|b| b.text()).collect();
415        let stitched = stitch_hyphenated_lines(&lines);
416        normalize_text_output(&stitched)
417    }
418
419    /// Consume the device and return text blocks.
420    pub fn into_blocks(self) -> Vec<TextBlock> {
421        group_spans_into_blocks(self.spans)
422    }
423
424    /// Consume the device and return raw spans.
425    #[allow(dead_code)]
426    pub(crate) fn into_spans(self) -> Vec<TextSpan> {
427        self.spans
428    }
429}
430
431impl Device<'_> for TextExtractionDevice {
432    fn set_soft_mask(&mut self, _: Option<SoftMask<'_>>) {}
433    fn set_blend_mode(&mut self, _: BlendMode) {}
434    fn draw_path(&mut self, _: &BezPath, _: Affine, _: &Paint<'_>, _: &PathDrawMode) {}
435    fn push_clip_path(&mut self, _: &ClipPath) {}
436    fn push_transparency_group(&mut self, _: f32, _: Option<SoftMask<'_>>, _: BlendMode) {}
437    fn draw_image(&mut self, _: Image<'_, '_>, _: Affine) {}
438    fn pop_clip_path(&mut self) {}
439    fn pop_transparency_group(&mut self) {}
440
441    fn draw_glyph(
442        &mut self,
443        glyph: &Glyph<'_>,
444        transform: Affine,
445        glyph_transform: Affine,
446        _paint: &Paint<'_>,
447        _draw_mode: &GlyphDrawMode,
448    ) {
449        let text = match glyph.as_unicode() {
450            Some(BfString::Char(c)) => c.to_string(),
451            Some(BfString::String(s)) => s,
452            None => return,
453        };
454
455        let composed = transform * glyph_transform;
456        let coeffs = composed.as_coeffs();
457        let x = coeffs[4];
458        let y = coeffs[5];
459        let glyph_scale = (coeffs[0].powi(2) + coeffs[1].powi(2)).sqrt().abs();
460        let font_size = glyph_scale * 1000.0;
461        let glyph_width = estimate_glyph_width(glyph, font_size).max(font_size * 0.25);
462        let glyph_end_x = x + glyph_width;
463
464        // ANN[r17/TEX4] Feed the running sample used to derive the adaptive
465        // median character width. Capped to protect against pathological
466        // pages with hundreds of thousands of glyphs.
467        if self.glyph_widths.len() < 4096 {
468            self.glyph_widths.push(glyph_width);
469            if self.glyph_widths.len().is_multiple_of(MEDIAN_REFRESH) {
470                self.refresh_median_char_width();
471            }
472        }
473
474        let same_line = (y - self.last_y).abs() <= font_size.max(BAND_Y_TOLERANCE) * 0.35;
475        let gap = x - self.last_end_x;
476        let adjacent = same_line && gap >= -font_size * 0.25 && gap < font_size * 0.5;
477
478        if adjacent && !self.spans.is_empty() {
479            // ANN[r17/TEX1] Multi-signal consensus replaces the prior
480            // single-threshold rule (`gap > 0.15 * font_size`). The
481            // consensus evaluates TJ offset, geometric gap, and
482            // character-class transitions; a space is inserted only
483            // when the weighted sum meets SPACE_CONSENSUS_THRESHOLD.
484            // Decision is computed before the mutable borrow of `last`
485            // to keep the borrow checker happy.
486            let want_space = {
487                let last = self.spans.last().expect("checked non-empty");
488                !last.text.ends_with(' ')
489                    && !text.starts_with(' ')
490                    && self.evaluate_space_consensus(gap, font_size, &last.text, &text)
491            };
492            let last = self.spans.last_mut().expect("checked non-empty");
493            if want_space {
494                last.text.push(' ');
495            }
496            last.text.push_str(&text);
497            last.width = last.width.max(glyph_end_x - last.x);
498            last.height = last.height.max(font_size);
499            self.last_y = y;
500            self.last_end_x = glyph_end_x;
501            // ANN[r17/TEX1] Consume the TJ signal: it only counts for
502            // the one merge it preceded.
503            self.pending_tj_offset = 0.0;
504            return;
505        }
506
507        self.last_y = y;
508        self.last_end_x = glyph_end_x;
509        // ANN[r17/TEX1] Non-adjacent glyph starts a fresh span, so any
510        // pending TJ offset is about within-span word breaks and no longer
511        // meaningful here.
512        self.pending_tj_offset = 0.0;
513
514        self.spans.push(TextSpan {
515            text,
516            x,
517            y,
518            width: glyph_width,
519            height: font_size,
520            font_size,
521        });
522    }
523
524    // ANN[r17/TEX1] Record TJ offsets. Accumulate because a single
525    // inter-substring gap may be expressed as multiple numeric entries
526    // (rare, but legal per PDF §9.4.3). The next draw_glyph consumes
527    // the sum.
528    fn text_adjustment(&mut self, amount: f32) {
529        self.pending_tj_offset += amount;
530    }
531}
532
533fn estimate_glyph_width(glyph: &Glyph<'_>, font_size: f64) -> f64 {
534    match glyph {
535        Glyph::Outline(outline) => outline
536            .advance_width()
537            .map(|width| width as f64 / 1000.0 * font_size)
538            .unwrap_or(font_size * 0.5),
539        Glyph::Type3(_) => font_size * 0.5,
540    }
541}
542
543/// Collapse fake-bold / overprint duplicates inside one band.
544///
545/// Real-word corpus failures such as 0105.pdf draw the same text several times
546/// with sub-point x drift to simulate heavier weight. Text extraction should
547/// keep the most informative span once rather than concatenate every overprint.
548fn collapse_overprinted_spans(spans: &mut Vec<TextSpan>) {
549    if spans.len() < 2 {
550        return;
551    }
552
553    let mut deduped: Vec<TextSpan> = Vec::with_capacity(spans.len());
554    for span in spans.drain(..) {
555        if let Some(last) = deduped.last_mut() {
556            if spans_are_overprint_duplicates(last, &span) {
557                let choose_incoming = span.text.chars().count() > last.text.chars().count()
558                    || (span.text.chars().count() == last.text.chars().count()
559                        && span.width > last.width);
560                let preferred_text = if choose_incoming {
561                    span.text.clone()
562                } else {
563                    last.text.clone()
564                };
565                let left = last.x.min(span.x);
566                let right = last.right().max(span.right());
567                last.x = left;
568                last.y = (last.y + span.y) * 0.5;
569                last.width = (right - left).max(last.width).max(span.width);
570                last.height = last.height.max(span.height);
571                last.font_size = last.font_size.max(span.font_size);
572                last.text = preferred_text;
573                continue;
574            }
575        }
576
577        deduped.push(span);
578    }
579
580    *spans = deduped;
581}
582
583fn spans_are_overprint_duplicates(lhs: &TextSpan, rhs: &TextSpan) -> bool {
584    let lhs_text = lhs.text.trim();
585    let rhs_text = rhs.text.trim();
586    if lhs_text.is_empty() || rhs_text.is_empty() {
587        return false;
588    }
589
590    let same_baseline = (lhs.y - rhs.y).abs() <= lhs.font_size.max(rhs.font_size) * 0.12;
591    if !same_baseline {
592        return false;
593    }
594
595    let lhs_left = lhs.x;
596    let lhs_right = lhs.right();
597    let rhs_left = rhs.x;
598    let rhs_right = rhs.right();
599    let overlap = (lhs_right.min(rhs_right) - lhs_left.max(rhs_left)).max(0.0);
600    let min_width = (lhs_right - lhs_left).min(rhs_right - rhs_left).max(1.0);
601    let heavily_overlaps = overlap / min_width >= 0.85;
602    if !heavily_overlaps {
603        return false;
604    }
605
606    lhs_text == rhs_text || lhs_text.starts_with(rhs_text) || rhs_text.starts_with(lhs_text)
607}
608
609fn trim_overlapping_word_prefix(prev: &str, curr: &str) -> Option<String> {
610    let prev_chars: Vec<char> = prev.trim_end().chars().collect();
611    let curr_chars: Vec<char> = curr.trim_start().chars().collect();
612    let max = prev_chars.len().min(curr_chars.len());
613
614    for len in (4..=max).rev() {
615        let prev_start = prev_chars.len() - len;
616        if prev_chars[prev_start..] != curr_chars[..len] {
617            continue;
618        }
619
620        if !curr_chars[..len].iter().all(|ch| ch.is_alphanumeric()) {
621            continue;
622        }
623
624        let prev_boundary = prev_start == 0 || !prev_chars[prev_start - 1].is_alphanumeric();
625        let curr_boundary = len == curr_chars.len() || !curr_chars[len].is_alphanumeric();
626        if !prev_boundary || !curr_boundary {
627            continue;
628        }
629
630        return Some(curr_chars[len..].iter().collect());
631    }
632
633    None
634}
635
636/// Compute an adaptive column gap threshold from a set of bands.
637///
638/// Collects all positive inter-span gaps within each band, computes the
639/// median, and returns `COLUMN_GAP_MEDIAN_MULTIPLIER × median`, clamped to
640/// `[COLUMN_GAP_THRESHOLD_MIN, COLUMN_GAP_THRESHOLD_MAX]`.  Falls back to
641/// `COLUMN_GAP_THRESHOLD_FALLBACK` when there are no measurable gaps.
642fn compute_adaptive_column_gap(bands: &[TextBand]) -> f64 {
643    let mut all_gaps: Vec<f64> = Vec::new();
644
645    for band in bands {
646        if band.spans.len() < 2 {
647            continue;
648        }
649        let mut sorted = band.spans.clone();
650        sorted.sort_by(|a, b| a.x.partial_cmp(&b.x).unwrap_or(Ordering::Equal));
651        let mut prev_right = sorted[0].right();
652        for span in sorted.iter().skip(1) {
653            let gap = span.x - prev_right;
654            if gap > 0.0 {
655                all_gaps.push(gap);
656            }
657            prev_right = prev_right.max(span.right());
658        }
659    }
660
661    if all_gaps.is_empty() {
662        return COLUMN_GAP_THRESHOLD_FALLBACK;
663    }
664
665    all_gaps.sort_by(|a, b| a.partial_cmp(b).unwrap_or(Ordering::Equal));
666
667    let min_gap = all_gaps[0];
668
669    // When all inter-span gaps are already large (> MIN threshold), they are
670    // likely all column gaps — the draw_glyph merger absorbed word-level
671    // spaces into span text.  Use a fraction of the smallest gap so that
672    // ALL column gaps exceed the threshold.
673    if min_gap > COLUMN_GAP_THRESHOLD_MIN {
674        return (min_gap * 0.75).clamp(COLUMN_GAP_THRESHOLD_MIN, COLUMN_GAP_THRESHOLD_MAX);
675    }
676
677    // Look for a natural break: the largest relative jump between consecutive
678    // sorted gaps separates word-level gaps from column gaps.
679    let mut best_break_threshold = 0.0f64;
680    let mut best_ratio = 1.5f64; // require at least 1.5× jump
681    for pair in all_gaps.windows(2) {
682        if pair[0] > 0.5 {
683            let ratio = pair[1] / pair[0];
684            if ratio > best_ratio {
685                best_ratio = ratio;
686                best_break_threshold = (pair[0] + pair[1]) * 0.5;
687            }
688        }
689    }
690
691    if best_break_threshold > 0.0 {
692        return best_break_threshold.clamp(COLUMN_GAP_THRESHOLD_MIN, COLUMN_GAP_THRESHOLD_MAX);
693    }
694
695    // Fallback: median × multiplier.
696    let mid = all_gaps.len() / 2;
697    let median = if all_gaps.len().is_multiple_of(2) {
698        (all_gaps[mid - 1] + all_gaps[mid]) * 0.5
699    } else {
700        all_gaps[mid]
701    };
702
703    (median * COLUMN_GAP_MEDIAN_MULTIPLIER)
704        .clamp(COLUMN_GAP_THRESHOLD_MIN, COLUMN_GAP_THRESHOLD_MAX)
705}
706
707/// Group spans into reading-order blocks, using column-aware reordering when
708/// a contiguous region repeatedly exposes the same gutters.
709/// Per-page adaptive parameters derived from the span set before any
710/// grouping happens. Centralising these here (TEX4) means the rest of
711/// the pipeline — band grouping, XY-Cut cuts, in-block space insertion
712/// — all speak the same typographic baseline for this specific page,
713/// rather than each helper reaching for an independent fixed constant.
714#[derive(Debug, Clone, Copy)]
715struct PageStats {
716    /// Median font size across all spans (pt).
717    median_font_size: f64,
718    /// Median measured character width (pt). Zero-guarded fallback is
719    /// 0.5 × median_font_size when there aren't enough samples.
720    /// Currently populated for diagnostics / future tuning; allow dead_code
721    /// under `-D warnings` until a reader is added.
722    #[allow(dead_code)]
723    median_char_width: f64,
724    /// Tight line-to-line spacing (25th percentile of pairwise band
725    /// gaps), representing the body-text leading on this page. The
726    /// quartile is used instead of the median so large paragraph /
727    /// zone gaps don't inflate the baseline. Zero if the page has
728    /// only one band.
729    median_line_spacing: f64,
730}
731
732impl PageStats {
733    fn from_spans(spans: &[TextSpan]) -> Self {
734        if spans.is_empty() {
735            return Self {
736                median_font_size: 12.0,
737                median_char_width: 6.0,
738                median_line_spacing: 0.0,
739            };
740        }
741
742        // Median font size.
743        let mut sizes: Vec<f64> = spans.iter().map(|s| s.font_size).collect();
744        sizes.sort_by(|a, b| a.partial_cmp(b).unwrap_or(Ordering::Equal));
745        let median_font_size = sizes[sizes.len() / 2];
746
747        // Median char width — measured width / char count, per span.
748        let mut char_widths: Vec<f64> = spans
749            .iter()
750            .filter_map(|s| {
751                let chars = s.text.chars().count();
752                if chars > 0 && s.width > 0.0 {
753                    Some(s.width / chars as f64)
754                } else {
755                    None
756                }
757            })
758            .collect();
759        let median_char_width = if char_widths.is_empty() {
760            median_font_size * 0.5
761        } else {
762            char_widths.sort_by(|a, b| a.partial_cmp(b).unwrap_or(Ordering::Equal));
763            char_widths[char_widths.len() / 2]
764        };
765
766        // Median line spacing — pairwise gaps between consecutive band
767        // y-values.
768        let band_tolerance = (median_font_size * BAND_Y_FRACTION).max(BAND_Y_TOLERANCE);
769        let mut ys: Vec<f64> = spans.iter().map(|s| s.y).collect();
770        ys.sort_by(|a, b| b.partial_cmp(a).unwrap_or(Ordering::Equal));
771        let mut band_ys: Vec<f64> = Vec::new();
772        for y in ys {
773            if band_ys
774                .last()
775                .map(|prev: &f64| (prev - y).abs() > band_tolerance)
776                .unwrap_or(true)
777            {
778                band_ys.push(y);
779            }
780        }
781        // ANN[r17/TEX4] "Line spacing" here means the TIGHT line-to-line
782        // gap inside a text block — not the median of all gaps. Using
783        // the median drags the estimate up when the page has
784        // paragraph / zone breaks (which are the very gaps the
785        // paragraph-break threshold is supposed to EXCEED). The 25th
786        // percentile is the smallest gap that still shows up in more
787        // than one place on the page; it captures body-text leading
788        // robustly even when large zone gaps dominate.
789        let median_line_spacing = if band_ys.len() < 2 {
790            0.0
791        } else {
792            let mut spacings: Vec<f64> = band_ys
793                .windows(2)
794                .map(|pair| (pair[0] - pair[1]).abs())
795                .collect();
796            spacings.sort_by(|a, b| a.partial_cmp(b).unwrap_or(Ordering::Equal));
797            let q1_index = spacings.len() / 4;
798            spacings[q1_index]
799        };
800
801        Self {
802            median_font_size,
803            median_char_width,
804            median_line_spacing,
805        }
806    }
807}
808
809// ANN[r17/TEX2] Maximum recursion depth for XY-Cut. Any real page layout
810// is decomposable in well under 10 alternating cuts; the cap guards
811// against pathological inputs where the cut predicate keeps triggering
812// due to floating-point drift.
813const XY_CUT_MAX_DEPTH: usize = 12;
814/// Minimum fraction of a region's width that a vertical gap must reach
815/// before it qualifies as a column gutter.
816const XY_CUT_VERTICAL_GAP_REGION_FRACTION: f64 = 0.04;
817/// Floor (in pt) for vertical gap regardless of region width. Matches
818/// the previous `COLUMN_GAP_THRESHOLD_MIN` and keeps XY-Cut conservative
819/// on narrow regions (sidebars, tall columns).
820const XY_CUT_VERTICAL_GAP_FLOOR: f64 = 10.0;
821/// Multiplier applied to median font size to produce the horizontal-gap
822/// threshold. 1.8 × line-height matches typical paragraph spacing.
823const XY_CUT_HORIZONTAL_GAP_FONT_MULTIPLIER: f64 = 1.8;
824/// Minimum number of spans a column must contain before it is eligible
825/// for acceptance — one-span "columns" are almost always sidebar noise
826/// or table-cell fragments.
827const XY_CUT_MIN_SPANS_PER_COLUMN: usize = 2;
828/// Average characters per band a column must have before it's accepted
829/// as dense prose (vs. a short-cell table column).
830const XY_CUT_MIN_CHARS_PER_BAND: f64 = 8.0;
831
832/// ANN[r17/TEX2][r17/TEX4] Top-level grouping uses recursive XY-Cut
833/// with a density guard. Per-page stats are computed once up front so
834/// every decision downstream speaks the same typographic baseline.
835fn group_spans_into_blocks(spans: Vec<TextSpan>) -> Vec<TextBlock> {
836    if spans.is_empty() {
837        return Vec::new();
838    }
839    let stats = PageStats::from_spans(&spans);
840    xy_cut_recursive(spans, 0, &stats)
841}
842
843fn xy_cut_recursive(spans: Vec<TextSpan>, depth: usize, stats: &PageStats) -> Vec<TextBlock> {
844    if spans.is_empty() {
845        return Vec::new();
846    }
847    if depth >= XY_CUT_MAX_DEPTH {
848        return band_based_blocks(spans, stats);
849    }
850
851    // ANN[r17/TEX2] Pick whichever direction has the largest qualifying
852    // gap. Always cutting vertically first breaks layouts where a
853    // footer sits in the mid-x range — it would attach to the left
854    // column instead of being recognized as a page-level zone. The
855    // "largest gap wins" rule is the standard XY-Cut tie-breaker used
856    // by academic OCR literature and matches pdf_oxide.
857    let vcut = try_vertical_cut(&spans, stats);
858    let hcut = try_horizontal_cut(&spans, stats);
859
860    let (chosen, _) = match (vcut, hcut) {
861        (Some((v_groups, v_gap)), Some((h_groups, h_gap))) => {
862            if v_gap >= h_gap {
863                (Some(v_groups), v_gap)
864            } else {
865                (Some(h_groups), h_gap)
866            }
867        }
868        (Some((v_groups, v_gap)), None) => (Some(v_groups), v_gap),
869        (None, Some((h_groups, h_gap))) => (Some(h_groups), h_gap),
870        (None, None) => (None, 0.0),
871    };
872
873    if let Some(groups) = chosen {
874        let mut out = Vec::new();
875        for group in groups {
876            out.extend(xy_cut_recursive(group, depth + 1, stats));
877        }
878        return out;
879    }
880
881    band_based_blocks(spans, stats)
882}
883
884/// Emit per-band row blocks without any column detection. Used as the
885/// leaf of XY-Cut recursion — at this point the region either has no
886/// further cuts or the density guard refused them.
887fn band_based_blocks(spans: Vec<TextSpan>, stats: &PageStats) -> Vec<TextBlock> {
888    // XY-Cut can miss recurring gutters when a small number of bands span the
889    // full page width (e.g. a running header above a 3-column body). In that
890    // case, fall back to the older band/gutter detector inside the leaf region
891    // instead of flattening everything row-major.
892    group_spans_into_blocks_legacy_with_stats(spans, stats)
893}
894
895/// Median font-size helper. Currently unreferenced after `PageStats` took over
896/// the typography baseline computation; kept available for future tuning paths.
897#[allow(dead_code)]
898fn median_font_size(spans: &[TextSpan]) -> f64 {
899    if spans.is_empty() {
900        return 12.0;
901    }
902    let mut sizes: Vec<f64> = spans.iter().map(|s| s.font_size).collect();
903    sizes.sort_by(|a, b| a.partial_cmp(b).unwrap_or(Ordering::Equal));
904    sizes[sizes.len() / 2]
905}
906
907/// Attempt a vertical (column) cut. Returns the span groups plus the
908/// gap size (in pt) if a suitable gutter is found AND the density +
909/// alignment guards accept.
910///
911/// ANN[r17/TEX2] Three guards together avoid false-positive columns:
912///   1. `min_gap` is the MAX of (median_font, 4% of region width, 10pt)
913///      — deliberately lower than `median_font * 2` so narrow-gutter
914///      academic papers (12pt gutters, common in print) are still
915///      detected.
916///   2. `columns_are_dense` rejects column splits where either side
917///      has <2 spans or <8 chars/band — catches table cells.
918///   3. `columns_are_band_aligned` rejects cuts where any band would
919///      end up on only one side of the cut while being wider than
920///      ~70% of that side's column width — catches full-width
921///      paragraphs (Intro / Outro) that accidentally sit in the
922///      left-column x-range.
923fn try_vertical_cut(spans: &[TextSpan], stats: &PageStats) -> Option<(Vec<Vec<TextSpan>>, f64)> {
924    if spans.len() < 2 * XY_CUT_MIN_SPANS_PER_COLUMN {
925        return None;
926    }
927
928    let region_left = spans.iter().map(|s| s.x).fold(f64::INFINITY, f64::min);
929    let region_right = spans
930        .iter()
931        .map(TextSpan::right)
932        .fold(f64::NEG_INFINITY, f64::max);
933    let region_width = region_right - region_left;
934    if region_width <= 0.0 {
935        return None;
936    }
937
938    // ANN[r17/TEX2][r17/TEX4] Threshold uses the ADAPTIVE median-word-gap
939    // from the bands rather than a flat font-size multiple. Narrow-gutter
940    // academic layouts have 12pt gutters next to 4pt word spaces — the
941    // adaptive threshold scales with the actual typography used on this
942    // page. Clamped to `XY_CUT_VERTICAL_GAP_FLOOR` to avoid firing on
943    // ordinary inter-word spaces when character advance data is noisy.
944    // median_font and the width fraction act only as safety rails for
945    // pathological inputs.
946    let bands = group_spans_into_bands_with_stats(spans.to_vec(), stats);
947    let adaptive = compute_adaptive_column_gap(&bands);
948    let floor = stats
949        .median_font_size
950        .max(region_width * XY_CUT_VERTICAL_GAP_REGION_FRACTION)
951        .max(XY_CUT_VERTICAL_GAP_FLOOR);
952    let min_gap = adaptive.min(floor).max(XY_CUT_VERTICAL_GAP_FLOOR);
953
954    // Intervals [x_left, x_right] of every span; we look for an x value
955    // that is free of ALL intervals (full-height gap).
956    let mut intervals: Vec<(f64, f64)> = spans
957        .iter()
958        .map(|s| (s.x, s.right().max(s.x + 0.001)))
959        .collect();
960    intervals.sort_by(|a, b| a.0.partial_cmp(&b.0).unwrap_or(Ordering::Equal));
961
962    let mut cursor = intervals[0].1;
963    let mut best_gap: Option<(f64, f64)> = None; // (gap_size, cut_x)
964    for (left, right) in intervals.iter().skip(1) {
965        if *left > cursor {
966            let gap = *left - cursor;
967            if gap >= min_gap {
968                match best_gap {
969                    Some((best, _)) if best >= gap => {}
970                    _ => {
971                        let cut_x = (cursor + *left) * 0.5;
972                        best_gap = Some((gap, cut_x));
973                    }
974                }
975            }
976        }
977        cursor = cursor.max(*right);
978    }
979
980    let (gap_size, cut_x) = best_gap?;
981
982    // Split spans around the cut. A span whose midpoint is < cut_x
983    // belongs to the left group.
984    let mut left_group = Vec::new();
985    let mut right_group = Vec::new();
986    for span in spans {
987        let midpoint = span.x + (span.right() - span.x) * 0.5;
988        if midpoint < cut_x {
989            left_group.push(span.clone());
990        } else {
991            right_group.push(span.clone());
992        }
993    }
994
995    if !columns_are_dense(&left_group, &right_group, stats) {
996        return None;
997    }
998    if !columns_are_band_aligned(spans, cut_x, region_left, region_right, stats) {
999        return None;
1000    }
1001
1002    Some((vec![left_group, right_group], gap_size))
1003}
1004
1005/// ANN[r17/TEX2] Reject a vertical cut when any band sits on only one
1006/// side of the cut AND occupies more than ~70% of that side's column
1007/// width. Such bands are almost certainly full-width paragraphs that
1008/// happened to align with the left margin of one column, and forcing
1009/// them into that column re-orders them relative to text that follows.
1010fn columns_are_band_aligned(
1011    spans: &[TextSpan],
1012    cut_x: f64,
1013    region_left: f64,
1014    region_right: f64,
1015    stats: &PageStats,
1016) -> bool {
1017    let left_width = (cut_x - region_left).max(1.0);
1018    let right_width = (region_right - cut_x).max(1.0);
1019
1020    // Threshold chosen empirically: paragraph bodies in columnar
1021    // layouts usually fill ~60-70% of their column; anything wider
1022    // than 0.7× is a page-level element masquerading as column
1023    // content. Empirically validated on the 281-doc VPS text corpus
1024    // 2026-05-09: tuning to 0.80 / 0.90 caused regressions on
1025    // UNKNOWN_TEXT_DIFF docs (column false-positives) that net
1026    // outweighed the LINE_ORDER gains. Threshold restored to 0.70.
1027    const MAX_SINGLE_SIDE_FRACTION: f64 = 0.70;
1028
1029    let bands = group_spans_into_bands_with_stats(spans.to_vec(), stats);
1030    for band in &bands {
1031        let mut has_left = false;
1032        let mut has_right = false;
1033        for span in &band.spans {
1034            let midpoint = span.x + (span.right() - span.x) * 0.5;
1035            if midpoint < cut_x {
1036                has_left = true;
1037            } else {
1038                has_right = true;
1039            }
1040        }
1041        if has_left && has_right {
1042            continue; // Band straddles columns → fine.
1043        }
1044        let band_width = band.width();
1045        if has_left && band_width > left_width * MAX_SINGLE_SIDE_FRACTION {
1046            return false;
1047        }
1048        if has_right && band_width > right_width * MAX_SINGLE_SIDE_FRACTION {
1049            return false;
1050        }
1051    }
1052    true
1053}
1054
1055/// Density guard — reject column splits that look like tables (few,
1056/// short spans per column). A column is "dense" when it has at least
1057/// MIN_SPANS_PER_COLUMN spans and the average character count per band
1058/// exceeds MIN_CHARS_PER_BAND.
1059fn columns_are_dense(left: &[TextSpan], right: &[TextSpan], stats: &PageStats) -> bool {
1060    for col in [left, right] {
1061        if col.len() < XY_CUT_MIN_SPANS_PER_COLUMN {
1062            return false;
1063        }
1064        let bands = group_spans_into_bands_with_stats(col.to_vec(), stats);
1065        if bands.is_empty() {
1066            return false;
1067        }
1068        let total_chars: usize = col.iter().map(|s| s.text.chars().count()).sum();
1069        let chars_per_band = total_chars as f64 / bands.len() as f64;
1070        if chars_per_band < XY_CUT_MIN_CHARS_PER_BAND {
1071            return false;
1072        }
1073    }
1074    true
1075}
1076
1077/// Attempt a horizontal (zone / paragraph) cut. Unlike vertical cuts
1078/// this does NOT need a density guard — splitting top-from-bottom
1079/// cannot re-order content.
1080fn try_horizontal_cut(spans: &[TextSpan], stats: &PageStats) -> Option<(Vec<Vec<TextSpan>>, f64)> {
1081    if spans.len() < 2 {
1082        return None;
1083    }
1084    // Sort by descending y (PDF y grows upward).
1085    let mut sorted = spans.to_vec();
1086    sorted.sort_by(|a, b| {
1087        b.y.partial_cmp(&a.y)
1088            .unwrap_or(Ordering::Equal)
1089            .then_with(|| a.x.partial_cmp(&b.x).unwrap_or(Ordering::Equal))
1090    });
1091
1092    // ANN[r17/TEX4] Paragraph / zone cuts scale with MEDIAN LINE
1093    // SPACING when available — this is the typographically correct
1094    // baseline (paragraph break ≈ 1.8 × line-spacing). When the page
1095    // has only one band, or stats haven't observed spacing yet, fall
1096    // back to the font-size multiple the legacy path used.
1097    let min_gap = if stats.median_line_spacing > 0.0 {
1098        stats.median_line_spacing * PARAGRAPH_BREAK_LINE_SPACING_MULTIPLIER
1099    } else {
1100        stats.median_font_size * XY_CUT_HORIZONTAL_GAP_FONT_MULTIPLIER
1101    };
1102
1103    // Look for the largest gap between consecutive span y-values.
1104    let mut best: Option<(f64, f64)> = None; // (gap_size, cut_y)
1105    let tolerance = stats.median_font_size * BAND_Y_FRACTION;
1106    let mut band_bottom = sorted[0].y;
1107
1108    for span in sorted.iter().skip(1) {
1109        if (band_bottom - span.y).abs() <= tolerance {
1110            band_bottom = band_bottom.min(span.y);
1111            continue;
1112        }
1113        let gap = band_bottom - span.y;
1114        if gap >= min_gap {
1115            let cut_y = (band_bottom + span.y) * 0.5;
1116            match best {
1117                Some((best_gap, _)) if best_gap >= gap => {}
1118                _ => best = Some((gap, cut_y)),
1119            }
1120        }
1121        band_bottom = span.y;
1122    }
1123
1124    let (gap_size, cut_y) = best?;
1125
1126    let mut top_group = Vec::new();
1127    let mut bottom_group = Vec::new();
1128    for span in spans {
1129        if span.y > cut_y {
1130            top_group.push(span.clone());
1131        } else {
1132            bottom_group.push(span.clone());
1133        }
1134    }
1135    if top_group.is_empty() || bottom_group.is_empty() {
1136        return None;
1137    }
1138    Some((vec![top_group, bottom_group], gap_size))
1139}
1140
1141/// Legacy band+column-detection path, kept for reference and as the
1142/// fallback inside `band_based_blocks` test coverage. Not currently
1143/// used — XY-Cut supersedes it.
1144#[allow(dead_code)]
1145fn group_spans_into_blocks_legacy(spans: Vec<TextSpan>) -> Vec<TextBlock> {
1146    let bands = group_spans_into_bands(spans);
1147    group_spans_into_blocks_legacy_from_bands(bands)
1148}
1149
1150fn group_spans_into_blocks_legacy_with_stats(
1151    spans: Vec<TextSpan>,
1152    stats: &PageStats,
1153) -> Vec<TextBlock> {
1154    let bands = group_spans_into_bands_with_stats(spans, stats);
1155    group_spans_into_blocks_legacy_from_bands(bands)
1156}
1157
1158fn group_spans_into_blocks_legacy_from_bands(bands: Vec<TextBand>) -> Vec<TextBlock> {
1159    if bands.is_empty() {
1160        return Vec::new();
1161    }
1162
1163    let column_gap_threshold = compute_adaptive_column_gap(&bands);
1164
1165    let mut blocks = Vec::new();
1166    let mut idx = 0;
1167
1168    while idx < bands.len() {
1169        let gap_midpoints = bands[idx].gap_midpoints(column_gap_threshold);
1170        if gap_midpoints.is_empty() {
1171            blocks.push(bands[idx].row_block());
1172            idx += 1;
1173            continue;
1174        }
1175
1176        let mut boundaries = gap_midpoints.clone();
1177        let mut band_indices = vec![idx];
1178        let mut gapped_band_count = 1usize;
1179        let mut region_left = bands[idx].left();
1180        let mut region_right = bands[idx].right();
1181        let mut next_idx = idx + 1;
1182
1183        while next_idx < bands.len() {
1184            let next_band = &bands[next_idx];
1185            let next_gap_midpoints = next_band.gap_midpoints(column_gap_threshold);
1186            if next_gap_midpoints.is_empty() {
1187                if next_band
1188                    .fits_single_column(&boundaries, region_left, region_right)
1189                    .is_some()
1190                {
1191                    band_indices.push(next_idx);
1192                    next_idx += 1;
1193                    continue;
1194                }
1195                break;
1196            }
1197
1198            if !boundaries_match(&boundaries, &next_gap_midpoints, column_gap_threshold) {
1199                break;
1200            }
1201
1202            update_boundaries(&mut boundaries, &next_gap_midpoints, gapped_band_count);
1203            gapped_band_count += 1;
1204            band_indices.push(next_idx);
1205            region_left = region_left.min(next_band.left());
1206            region_right = region_right.max(next_band.right());
1207            next_idx += 1;
1208        }
1209
1210        if region_is_columnar(&bands, &band_indices, &boundaries, gapped_band_count) {
1211            append_column_region_blocks(&bands, &band_indices, &boundaries, &mut blocks);
1212            idx = next_idx;
1213        } else {
1214            blocks.push(bands[idx].row_block());
1215            idx += 1;
1216        }
1217    }
1218
1219    blocks
1220}
1221
1222/// Legacy wrapper used by call sites that haven't been handed PageStats.
1223/// It derives stats locally. Prefer `group_spans_into_bands_with_stats`
1224/// inside the XY-Cut pipeline to avoid recomputing the stats per call.
1225fn group_spans_into_bands(spans: Vec<TextSpan>) -> Vec<TextBand> {
1226    let stats = PageStats::from_spans(&spans);
1227    group_spans_into_bands_with_stats(spans, &stats)
1228}
1229
1230fn group_spans_into_bands_with_stats(mut spans: Vec<TextSpan>, stats: &PageStats) -> Vec<TextBand> {
1231    if spans.is_empty() {
1232        return Vec::new();
1233    }
1234
1235    spans.sort_by(|a, b| {
1236        b.y.partial_cmp(&a.y)
1237            .unwrap_or(Ordering::Equal)
1238            .then_with(|| a.x.partial_cmp(&b.x).unwrap_or(Ordering::Equal))
1239    });
1240
1241    // ANN[r17/TEX4] Band tolerance scales with this page's median font
1242    // size rather than a fixed 5pt floor. Single-page spreads with
1243    // huge display fonts (24pt+) previously merged unrelated lines; a
1244    // fractional threshold keeps that from happening without hurting
1245    // body-text pages.
1246    let page_tolerance = (stats.median_font_size * BAND_Y_FRACTION).max(BAND_Y_TOLERANCE);
1247
1248    let mut bands: Vec<TextBand> = Vec::new();
1249
1250    for span in spans {
1251        let tolerance = (span.height * BAND_Y_FRACTION)
1252            .max(page_tolerance)
1253            .max(BAND_Y_TOLERANCE);
1254        if let Some(band) = bands
1255            .iter_mut()
1256            .find(|band| (band.y - span.y).abs() <= tolerance)
1257        {
1258            let span_count = band.spans.len() as f64;
1259            band.y = (band.y * span_count + span.y) / (span_count + 1.0);
1260            band.spans.push(span);
1261        } else {
1262            bands.push(TextBand::new(span));
1263        }
1264    }
1265
1266    for band in &mut bands {
1267        band.sort_spans();
1268    }
1269
1270    bands.sort_by(|a, b| b.y.partial_cmp(&a.y).unwrap_or(Ordering::Equal));
1271    bands
1272}
1273
1274fn boundaries_match(boundaries: &[f64], gap_midpoints: &[f64], column_gap_threshold: f64) -> bool {
1275    let tolerance = (column_gap_threshold * 1.5).clamp(COLUMN_GAP_MATCH_TOLERANCE, 60.0);
1276    boundaries.len() == gap_midpoints.len()
1277        && boundaries
1278            .iter()
1279            .zip(gap_midpoints)
1280            .all(|(lhs, rhs)| (lhs - rhs).abs() <= tolerance)
1281}
1282
1283fn update_boundaries(boundaries: &mut [f64], gap_midpoints: &[f64], seen_gapped_bands: usize) {
1284    for (boundary, midpoint) in boundaries.iter_mut().zip(gap_midpoints) {
1285        *boundary =
1286            (*boundary * seen_gapped_bands as f64 + midpoint) / (seen_gapped_bands as f64 + 1.0);
1287    }
1288}
1289
1290fn region_is_columnar(
1291    bands: &[TextBand],
1292    band_indices: &[usize],
1293    boundaries: &[f64],
1294    gapped_band_count: usize,
1295) -> bool {
1296    if boundaries.is_empty()
1297        || gapped_band_count < MIN_COLUMN_GAPPED_BANDS
1298        || band_indices.is_empty()
1299        || (gapped_band_count as f64 / band_indices.len() as f64) < MIN_COLUMN_GAP_SUPPORT
1300    {
1301        return false;
1302    }
1303
1304    let mut non_empty_slices = 0usize;
1305    let mut dense_slices = 0usize;
1306    let mut slices_per_column = vec![0usize; boundaries.len() + 1];
1307
1308    for &band_idx in band_indices {
1309        let slices = bands[band_idx].split_by_boundaries(boundaries);
1310        for (column_idx, slice) in slices.iter().enumerate() {
1311            if slice.is_empty() {
1312                continue;
1313            }
1314
1315            non_empty_slices += 1;
1316            slices_per_column[column_idx] += 1;
1317
1318            let char_count = slice
1319                .iter()
1320                .map(|span| span.text.chars().count())
1321                .sum::<usize>();
1322            if slice.len() >= 2 || char_count >= 8 {
1323                dense_slices += 1;
1324            }
1325        }
1326    }
1327
1328    if non_empty_slices < boundaries.len() + 2 {
1329        return false;
1330    }
1331
1332    if slices_per_column.contains(&0) {
1333        return false;
1334    }
1335
1336    (dense_slices as f64 / non_empty_slices as f64) >= MIN_DENSE_SLICE_RATIO
1337}
1338
1339fn append_column_region_blocks(
1340    bands: &[TextBand],
1341    band_indices: &[usize],
1342    boundaries: &[f64],
1343    blocks: &mut Vec<TextBlock>,
1344) {
1345    let column_count = boundaries.len() + 1;
1346    let mut column_bands = vec![Vec::<TextSpan>::new(); column_count];
1347
1348    for &band_idx in band_indices {
1349        let slices = bands[band_idx].split_by_boundaries(boundaries);
1350        for (column_idx, slice) in slices.into_iter().enumerate() {
1351            if slice.is_empty() {
1352                continue;
1353            }
1354            column_bands[column_idx].push(TextSpan {
1355                text: String::new(),
1356                x: 0.0,
1357                y: 0.0,
1358                width: 0.0,
1359                height: 0.0,
1360                font_size: 0.0,
1361            });
1362            let marker_idx = column_bands[column_idx].len() - 1;
1363            column_bands[column_idx][marker_idx] = TextSpan {
1364                text: String::new(),
1365                x: f64::NEG_INFINITY,
1366                y: bands[band_idx].y,
1367                width: 0.0,
1368                height: 0.0,
1369                font_size: 0.0,
1370            };
1371            column_bands[column_idx].extend(slice);
1372        }
1373    }
1374
1375    for spans in column_bands {
1376        let mut current: Vec<TextSpan> = Vec::new();
1377        for span in spans {
1378            if span.x == f64::NEG_INFINITY {
1379                if !current.is_empty() {
1380                    current.sort_by(|a, b| a.x.partial_cmp(&b.x).unwrap_or(Ordering::Equal));
1381                    blocks.push(TextBlock {
1382                        spans: std::mem::take(&mut current),
1383                    });
1384                }
1385                continue;
1386            }
1387            current.push(span);
1388        }
1389        if !current.is_empty() {
1390            current.sort_by(|a, b| a.x.partial_cmp(&b.x).unwrap_or(Ordering::Equal));
1391            blocks.push(TextBlock { spans: current });
1392        }
1393    }
1394}
1395
1396/// Join per-block lines, stitching end-of-line hyphenated word-wraps the
1397/// way pdftotext / MuPDF / PDFBox do.
1398///
1399/// Trigger conditions (all must hold):
1400/// 1. Previous line ends with `-` preceded by an alphabetic character.
1401/// 2. The alphabetic suffix before the `-` has >= 3 characters.
1402/// 3. The next line (trimmed) starts with an ASCII lowercase letter.
1403/// 4. The lowercase prefix of the next line has >= 3 characters.
1404///
1405/// When triggered, the trailing `-` is removed and the two halves are
1406/// concatenated without a space or newline.
1407///
1408/// This avoids false positives on compound words ("real-time"), bullet
1409/// lists, numeric ranges ("42-"), and short fragments.
1410fn stitch_hyphenated_lines(lines: &[String]) -> String {
1411    let mut out = String::new();
1412    for (idx, line) in lines.iter().enumerate() {
1413        if idx == 0 {
1414            out.push_str(line);
1415            continue;
1416        }
1417
1418        let next_trimmed = line.trim_start();
1419
1420        // Check the accumulated output for end-of-line hyphen pattern
1421        let should_merge = is_hyphen_wrap_candidate(&out, next_trimmed);
1422
1423        if should_merge {
1424            out.pop(); // drop the trailing '-'
1425            out.push_str(next_trimmed);
1426        } else {
1427            out.push('\n');
1428            out.push_str(line);
1429        }
1430    }
1431    out
1432}
1433
1434/// Check if the accumulated text ends with a hyphen-wrap pattern and the
1435/// continuation is a valid merge target.
1436fn is_hyphen_wrap_candidate(accumulated: &str, next_trimmed: &str) -> bool {
1437    // Must end with '-'
1438    if !accumulated.ends_with('-') {
1439        return false;
1440    }
1441
1442    // Character before '-' must be alphabetic
1443    let before_hyphen = accumulated.chars().rev().nth(1);
1444    if !before_hyphen.is_some_and(|c| c.is_alphabetic()) {
1445        return false;
1446    }
1447
1448    // Count consecutive alphabetic chars before the '-' (the word fragment)
1449    let alpha_prefix_len = accumulated
1450        .chars()
1451        .rev()
1452        .skip(1) // skip the '-'
1453        .take_while(|c| c.is_alphabetic())
1454        .count();
1455    if alpha_prefix_len < 3 {
1456        return false;
1457    }
1458
1459    // Next line must start with lowercase ASCII
1460    let first_next = next_trimmed.chars().next();
1461    if !first_next.is_some_and(|c| c.is_ascii_lowercase()) {
1462        return false;
1463    }
1464
1465    // Count consecutive lowercase chars at start of next line
1466    let next_alpha_len = next_trimmed
1467        .chars()
1468        .take_while(|c| c.is_ascii_lowercase())
1469        .count();
1470    if next_alpha_len < 3 {
1471        return false;
1472    }
1473
1474    true
1475}
1476
1477/// Normalize extracted text to match pdftotext conventions.
1478///
1479/// 1. Trim trailing whitespace from each line.
1480/// 2. Collapse runs of more than two consecutive newlines into exactly two.
1481/// 3. Preserve form-feed characters (`\x0C`) as page separators.
1482/// 4. End with a single trailing newline (or empty for empty input).
1483pub(crate) fn normalize_text_output(text: &str) -> String {
1484    if text.is_empty() {
1485        return String::new();
1486    }
1487
1488    let mut lines: Vec<&str> = Vec::new();
1489    for line in text.split('\n') {
1490        lines.push(line.trim_end());
1491    }
1492
1493    // Remove trailing empty lines (we'll add exactly one \n at the end)
1494    while lines.last() == Some(&"") {
1495        lines.pop();
1496    }
1497
1498    if lines.is_empty() {
1499        return String::new();
1500    }
1501
1502    let mut result = String::with_capacity(text.len());
1503    let mut consecutive_empty = 0u32;
1504
1505    for (i, line) in lines.iter().enumerate() {
1506        if line.is_empty() || *line == "\x0C" {
1507            if line.is_empty() {
1508                consecutive_empty += 1;
1509                // Collapse >2 consecutive blank lines to 2
1510                if consecutive_empty <= 2 {
1511                    result.push('\n');
1512                }
1513            } else {
1514                // Bare form-feed line
1515                consecutive_empty = 0;
1516                result.push_str(line);
1517                if i + 1 < lines.len() {
1518                    result.push('\n');
1519                }
1520            }
1521        } else {
1522            // Both form-feed-prefixed and regular lines are emitted as-is.
1523            consecutive_empty = 0;
1524            result.push_str(line);
1525            if i + 1 < lines.len() {
1526                result.push('\n');
1527            }
1528        }
1529    }
1530
1531    // Ensure single trailing newline
1532    if !result.is_empty() && !result.ends_with('\n') {
1533        result.push('\n');
1534    }
1535
1536    result
1537}
1538
1539#[cfg(test)]
1540mod tests {
1541    use super::*;
1542
1543    fn span(text: &str, x: f64, y: f64, width: f64) -> TextSpan {
1544        TextSpan {
1545            text: text.into(),
1546            x,
1547            y,
1548            width,
1549            height: 12.0,
1550            font_size: 12.0,
1551        }
1552    }
1553
1554    fn block_texts(spans: Vec<TextSpan>) -> Vec<String> {
1555        group_spans_into_blocks(spans)
1556            .into_iter()
1557            .map(|block| block.text())
1558            .collect()
1559    }
1560
1561    #[test]
1562    fn empty_device_produces_empty_text() {
1563        let dev = TextExtractionDevice::new();
1564        assert!(dev.into_text().is_empty());
1565    }
1566
1567    #[test]
1568    fn single_column_stays_row_major() {
1569        let texts = block_texts(vec![
1570            span("Single Column Line 1", 40.0, 700.0, 140.0),
1571            span("Single Column Line 2", 40.0, 684.0, 140.0),
1572            span("Single Column Line 3", 40.0, 668.0, 140.0),
1573        ]);
1574
1575        assert_eq!(
1576            texts,
1577            vec![
1578                "Single Column Line 1",
1579                "Single Column Line 2",
1580                "Single Column Line 3",
1581            ]
1582        );
1583    }
1584
1585    #[test]
1586    fn two_column_region_reads_column_major() {
1587        let texts = block_texts(vec![
1588            span("Header", 200.0, 740.0, 80.0),
1589            span("Left column line one", 40.0, 700.0, 115.0),
1590            span("Right column line one", 320.0, 700.0, 120.0),
1591            span("Left column line two", 40.0, 684.0, 115.0),
1592            span("Right column line two", 320.0, 684.0, 120.0),
1593            span("Left column line three", 40.0, 668.0, 125.0),
1594            span("Right column line three", 320.0, 668.0, 130.0),
1595            span("Footer", 200.0, 620.0, 80.0),
1596        ]);
1597
1598        assert_eq!(
1599            texts,
1600            vec![
1601                "Header",
1602                "Left column line one",
1603                "Left column line two",
1604                "Left column line three",
1605                "Right column line one",
1606                "Right column line two",
1607                "Right column line three",
1608                "Footer",
1609            ]
1610        );
1611    }
1612
1613    #[test]
1614    fn mixed_single_and_multi_column_regions_preserve_shared_bands() {
1615        let texts = block_texts(vec![
1616            span("Intro paragraph", 40.0, 740.0, 180.0),
1617            span("L1 words here", 40.0, 700.0, 110.0),
1618            span("R1 words here", 320.0, 700.0, 110.0),
1619            span("L2 words here", 40.0, 684.0, 110.0),
1620            span("R2 words here", 320.0, 684.0, 110.0),
1621            span("L3 words here", 40.0, 668.0, 110.0),
1622            span("R3 words here", 320.0, 668.0, 110.0),
1623            span("Outro paragraph", 40.0, 620.0, 180.0),
1624        ]);
1625
1626        assert_eq!(
1627            texts,
1628            vec![
1629                "Intro paragraph",
1630                "L1 words here",
1631                "L2 words here",
1632                "L3 words here",
1633                "R1 words here",
1634                "R2 words here",
1635                "R3 words here",
1636                "Outro paragraph",
1637            ]
1638        );
1639    }
1640
1641    #[test]
1642    fn short_table_like_rows_fall_back_to_row_major() {
1643        let texts = block_texts(vec![
1644            span("Name", 40.0, 700.0, 30.0),
1645            span("Age", 320.0, 700.0, 20.0),
1646            span("Alice", 40.0, 684.0, 35.0),
1647            span("30", 320.0, 684.0, 15.0),
1648            span("Bob", 40.0, 668.0, 24.0),
1649            span("25", 320.0, 668.0, 15.0),
1650        ]);
1651
1652        assert_eq!(texts, vec!["Name Age", "Alice 30", "Bob 25"]);
1653    }
1654
1655    #[test]
1656    fn three_column_regions_are_supported() {
1657        let texts = block_texts(vec![
1658            span("Column one line one", 40.0, 700.0, 105.0),
1659            span("Column two line one", 220.0, 700.0, 105.0),
1660            span("Column three line one", 400.0, 700.0, 120.0),
1661            span("Column one line two", 40.0, 684.0, 105.0),
1662            span("Column two line two", 220.0, 684.0, 105.0),
1663            span("Column three line two", 400.0, 684.0, 120.0),
1664            span("Column one line three", 40.0, 668.0, 120.0),
1665            span("Column two line three", 220.0, 668.0, 120.0),
1666            span("Column three line three", 400.0, 668.0, 135.0),
1667        ]);
1668
1669        assert_eq!(
1670            texts,
1671            vec![
1672                "Column one line one",
1673                "Column one line two",
1674                "Column one line three",
1675                "Column two line one",
1676                "Column two line two",
1677                "Column two line three",
1678                "Column three line one",
1679                "Column three line two",
1680                "Column three line three",
1681            ]
1682        );
1683    }
1684
1685    #[test]
1686    fn text_block_concatenation_spaced() {
1687        let block = TextBlock {
1688            spans: vec![span("A", 0.0, 0.0, 6.0), span("B", 20.0, 0.0, 6.0)],
1689        };
1690        assert_eq!(block.text(), "A B");
1691    }
1692
1693    #[test]
1694    fn adaptive_column_gap_fallback_for_no_gaps() {
1695        // Single-span bands produce no measurable gaps → fallback
1696        let bands = vec![
1697            TextBand::new(span("Hello", 40.0, 700.0, 80.0)),
1698            TextBand::new(span("World", 40.0, 684.0, 80.0)),
1699        ];
1700        let threshold = compute_adaptive_column_gap(&bands);
1701        assert!((threshold - COLUMN_GAP_THRESHOLD_FALLBACK).abs() < 0.01);
1702    }
1703
1704    #[test]
1705    fn adaptive_column_gap_uses_median() {
1706        // Three bands with word gaps of ~4pt each → median ≈ 4, threshold = 12
1707        let mut bands = Vec::new();
1708        for y in [700.0, 684.0, 668.0] {
1709            let mut band = TextBand::new(span("word1", 40.0, y, 30.0));
1710            band.spans.push(span("word2", 74.0, y, 30.0)); // gap = 4
1711            band.spans.push(span("word3", 108.0, y, 30.0)); // gap = 4
1712            bands.push(band);
1713        }
1714        let threshold = compute_adaptive_column_gap(&bands);
1715        // median gap = 4, × 3 = 12, clamped to [10, 40] → 12
1716        assert!(
1717            threshold >= 10.0 && threshold <= 14.0,
1718            "expected ~12, got {threshold}"
1719        );
1720    }
1721
1722    #[test]
1723    fn adaptive_column_gap_clamps_to_min() {
1724        // Tight gaps (2pt) across many bands → median = 2, 3×2 = 6 → clamped to 10
1725        let mut bands = Vec::new();
1726        for y in [700.0, 684.0, 668.0, 652.0] {
1727            let mut band = TextBand::new(span("abc", 0.0, y, 18.0));
1728            // right of "abc" = max(18, 12*0.5*3=18) = 18; gap = 20-18 = 2
1729            band.spans.push(span("def", 20.0, y, 18.0));
1730            bands.push(band);
1731        }
1732        let threshold = compute_adaptive_column_gap(&bands);
1733        assert!(
1734            (threshold - COLUMN_GAP_THRESHOLD_MIN).abs() < 0.01,
1735            "expected {COLUMN_GAP_THRESHOLD_MIN}, got {threshold}"
1736        );
1737    }
1738
1739    #[test]
1740    fn adaptive_column_gap_all_large_gaps_uses_fraction_of_min() {
1741        // When all gaps are large (> MIN), threshold = 0.75 × min_gap.
1742        let mut band = TextBand::new(span("Left", 0.0, 700.0, 30.0));
1743        band.spans.push(span("Right", 80.0, 700.0, 30.0)); // gap = 50
1744        let bands = vec![band];
1745        let threshold = compute_adaptive_column_gap(&bands);
1746        assert!(
1747            (threshold - 37.5).abs() < 0.01,
1748            "expected 37.5 (0.75×50), got {threshold}"
1749        );
1750    }
1751
1752    #[test]
1753    fn normalize_trims_trailing_whitespace_per_line() {
1754        assert_eq!(
1755            normalize_text_output("hello   \nworld  \n"),
1756            "hello\nworld\n"
1757        );
1758    }
1759
1760    #[test]
1761    fn normalize_collapses_excess_newlines() {
1762        // >2 blank lines collapse to 2 (meaning 3 \n in a row: line, blank, blank)
1763        assert_eq!(
1764            normalize_text_output("hello\n\n\n\n\nworld\n"),
1765            "hello\n\n\nworld\n"
1766        );
1767    }
1768
1769    #[test]
1770    fn normalize_preserves_double_newline() {
1771        assert_eq!(
1772            normalize_text_output("paragraph one\n\nparagraph two\n"),
1773            "paragraph one\n\nparagraph two\n"
1774        );
1775    }
1776
1777    #[test]
1778    fn normalize_preserves_form_feed() {
1779        assert_eq!(
1780            normalize_text_output("page1\n\n\x0Cpage2\n"),
1781            "page1\n\n\x0Cpage2\n"
1782        );
1783    }
1784
1785    #[test]
1786    fn normalize_adds_trailing_newline() {
1787        assert_eq!(normalize_text_output("hello"), "hello\n");
1788    }
1789
1790    #[test]
1791    fn normalize_empty_input() {
1792        assert_eq!(normalize_text_output(""), "");
1793    }
1794
1795    #[test]
1796    fn normalize_only_whitespace() {
1797        assert_eq!(normalize_text_output("   \n  \n"), "");
1798    }
1799
1800    // --- Hyphen stitching tests ---
1801
1802    #[test]
1803    fn hyphen_stitch_joins_wrapped_word() {
1804        let lines = vec!["the aver-".into(), "age rainfall".into()];
1805        assert_eq!(stitch_hyphenated_lines(&lines), "the average rainfall");
1806    }
1807
1808    #[test]
1809    fn hyphen_stitch_handles_leading_whitespace() {
1810        let lines = vec!["pre-".into(), "   dict the outcome".into()];
1811        // "pre" is only 3 chars → meets >= 3 guard
1812        assert_eq!(stitch_hyphenated_lines(&lines), "predict the outcome");
1813    }
1814
1815    #[test]
1816    fn hyphen_stitch_capital_continuation_not_stitched() {
1817        let lines = vec!["Section three-".into(), "Summary here".into()];
1818        assert_eq!(
1819            stitch_hyphenated_lines(&lines),
1820            "Section three-\nSummary here"
1821        );
1822    }
1823
1824    #[test]
1825    fn hyphen_stitch_bullet_dash_not_stitched() {
1826        // "-" alone: char before hyphen is not alphabetic
1827        let lines = vec!["Items:".into(), "-".into(), "milk".into()];
1828        assert_eq!(stitch_hyphenated_lines(&lines), "Items:\n-\nmilk");
1829    }
1830
1831    #[test]
1832    fn hyphen_stitch_numeric_range_not_stitched() {
1833        // "42-" — char before hyphen is digit, not alphabetic
1834        let lines = vec!["page 42-".into(), "seventy".into()];
1835        assert_eq!(stitch_hyphenated_lines(&lines), "page 42-\nseventy");
1836    }
1837
1838    #[test]
1839    fn hyphen_stitch_short_prefix_not_stitched() {
1840        // "re-" only 2 alpha chars before hyphen → below 3-char guard
1841        let lines = vec!["re-".into(), "organize".into()];
1842        assert_eq!(stitch_hyphenated_lines(&lines), "re-\norganize");
1843    }
1844
1845    #[test]
1846    fn hyphen_stitch_short_continuation_not_stitched() {
1847        // Next line starts with "an" (2 chars) → below 3-char guard
1848        let lines = vec!["counter-".into(), "an example".into()];
1849        assert_eq!(stitch_hyphenated_lines(&lines), "counter-\nan example");
1850    }
1851
1852    #[test]
1853    fn hyphen_stitch_compound_word_midline_preserved() {
1854        // "real-time" is mid-line, not end-of-line — no stitching applies
1855        // because stitch only operates on line boundaries
1856        let lines = vec!["real-time system".into()];
1857        assert_eq!(stitch_hyphenated_lines(&lines), "real-time system");
1858    }
1859
1860    #[test]
1861    fn hyphen_stitch_single_line_unchanged() {
1862        let lines = vec!["only line".into()];
1863        assert_eq!(stitch_hyphenated_lines(&lines), "only line");
1864    }
1865
1866    #[test]
1867    fn hyphen_stitch_empty_input() {
1868        let lines: Vec<String> = vec![];
1869        assert_eq!(stitch_hyphenated_lines(&lines), "");
1870    }
1871
1872    // --- TEX1 multi-signal space consensus tests ---
1873
1874    fn make_device_with_median(median: f64) -> TextExtractionDevice {
1875        let mut dev = TextExtractionDevice::new();
1876        // Seed enough samples for the median to resolve to `median`.
1877        for _ in 0..MEDIAN_REFRESH {
1878            dev.glyph_widths.push(median);
1879        }
1880        dev.refresh_median_char_width();
1881        assert!((dev.cached_median_char_width - median).abs() < 1e-9);
1882        dev
1883    }
1884
1885    #[test]
1886    fn consensus_inserts_space_on_strong_tj_offset_alone() {
1887        // Gap is below the geometric threshold, but the TJ offset is large
1888        // enough that the consensus must still fire.
1889        let mut dev = make_device_with_median(6.0);
1890        dev.pending_tj_offset = 250.0; // full em-space
1891        assert!(dev.evaluate_space_consensus(0.5, 12.0, "Hello", "World"));
1892    }
1893
1894    #[test]
1895    fn consensus_inserts_space_on_geometric_gap_alone() {
1896        // No TJ, no character transition, but a clearly wide geometric gap.
1897        let dev = make_device_with_median(6.0);
1898        // gap > 0.3 * 6.0 = 1.8 → fires gap signal (0.80), below threshold
1899        // on its own? 0.80 < 0.75 threshold? No, 0.80 > 0.75, so it fires.
1900        assert!(dev.evaluate_space_consensus(2.5, 12.0, "hello", "world"));
1901    }
1902
1903    #[test]
1904    fn consensus_no_space_on_kerning_gap() {
1905        // Small kerning-size gap with no other signals must not inject a
1906        // space (regression guard against false-positive spaces inside
1907        // tightly kerned words).
1908        let dev = make_device_with_median(6.0);
1909        assert!(!dev.evaluate_space_consensus(0.5, 12.0, "fi", "lm"));
1910    }
1911
1912    #[test]
1913    fn consensus_inserts_space_on_camel_case_plus_gap() {
1914        // CamelCase heuristic (0.60) alone doesn't reach threshold, but a
1915        // moderate gap (0.60 gap + 0.60 heuristic if gap fires) should.
1916        // Here gap = 2.5 > 1.8 → gap fires → total 0.80 + 0.60 = 1.40.
1917        let dev = make_device_with_median(6.0);
1918        assert!(dev.evaluate_space_consensus(2.5, 12.0, "helloWorld", "Inc"));
1919    }
1920
1921    #[test]
1922    fn consensus_inserts_space_on_digit_letter_transition_with_gap() {
1923        let dev = make_device_with_median(6.0);
1924        assert!(dev.evaluate_space_consensus(2.5, 12.0, "123", "abc"));
1925    }
1926
1927    #[test]
1928    fn consensus_heuristic_alone_is_insufficient() {
1929        // Heuristic (0.60) on its own is below the 0.75 threshold — the
1930        // design deliberately requires a second corroborating signal to
1931        // avoid gluing spaces into existing CamelCase identifiers that
1932        // have no geometric break.
1933        let dev = make_device_with_median(6.0);
1934        assert!(!dev.evaluate_space_consensus(0.5, 12.0, "camel", "Case"));
1935    }
1936
1937    #[test]
1938    fn consensus_falls_back_to_font_size_when_no_median() {
1939        // No samples → median is 0; geometric reference uses font-size.
1940        let dev = TextExtractionDevice::new();
1941        // gap 1.9 > 0.15 * 12.0 = 1.8 → gap signal fires
1942        assert!(dev.evaluate_space_consensus(1.9, 12.0, "a", "b"));
1943        // gap 1.5 < 1.8 → no signal
1944        assert!(!dev.evaluate_space_consensus(1.5, 12.0, "a", "b"));
1945    }
1946
1947    #[test]
1948    fn consensus_ignores_tiny_tj_offsets() {
1949        // TJ offsets below the threshold are kerning, not word breaks.
1950        let mut dev = make_device_with_median(6.0);
1951        dev.pending_tj_offset = 50.0;
1952        assert!(!dev.evaluate_space_consensus(0.5, 12.0, "Hello", "World"));
1953    }
1954
1955    #[test]
1956    fn consensus_accepts_negative_tj_offsets() {
1957        // A negative TJ offset still represents an explicit inter-substring
1958        // shift and counts toward the consensus (|amount| check).
1959        let mut dev = make_device_with_median(6.0);
1960        dev.pending_tj_offset = -250.0;
1961        assert!(dev.evaluate_space_consensus(0.5, 12.0, "Hello", "World"));
1962    }
1963
1964    #[test]
1965    fn text_adjustment_accumulates_until_glyph() {
1966        let mut dev = TextExtractionDevice::new();
1967        dev.text_adjustment(120.0);
1968        dev.text_adjustment(140.0);
1969        assert!((dev.pending_tj_offset - 260.0).abs() < 1e-6);
1970    }
1971
1972    // --- TEX2 XY-Cut tests ---
1973
1974    #[test]
1975    fn xy_cut_header_body_footer_with_two_columns() {
1976        // Header and footer sit in the mid-x range that would
1977        // accidentally fall into a left-column bucket with a naive
1978        // vertical-first cut. The largest-gap-first rule plus the
1979        // alignment guard ensure header and footer bracket the
1980        // columnar body.
1981        let texts = block_texts(vec![
1982            span("HEADLINE TITLE", 180.0, 760.0, 120.0),
1983            span("Left col line A", 40.0, 700.0, 110.0),
1984            span("Right col line A", 320.0, 700.0, 115.0),
1985            span("Left col line B", 40.0, 684.0, 110.0),
1986            span("Right col line B", 320.0, 684.0, 115.0),
1987            span("Left col line C", 40.0, 668.0, 110.0),
1988            span("Right col line C", 320.0, 668.0, 115.0),
1989            span("FOOTER LINE TEXT", 180.0, 600.0, 120.0),
1990        ]);
1991        assert_eq!(texts.first().map(String::as_str), Some("HEADLINE TITLE"));
1992        assert_eq!(texts.last().map(String::as_str), Some("FOOTER LINE TEXT"));
1993        // Left column lines all come before right column lines.
1994        let left_c_idx = texts.iter().position(|s| s == "Left col line C").unwrap();
1995        let right_a_idx = texts.iter().position(|s| s == "Right col line A").unwrap();
1996        assert!(
1997            left_c_idx < right_a_idx,
1998            "expected column-major ordering in body: {texts:?}"
1999        );
2000    }
2001
2002    #[test]
2003    fn xy_cut_rejects_column_split_on_table_rows() {
2004        // The density guard must still reject the 280pt inter-cell gap
2005        // in a short-cell table, preserving row-major reading order.
2006        let texts = block_texts(vec![
2007            span("Name", 40.0, 700.0, 30.0),
2008            span("Age", 320.0, 700.0, 20.0),
2009            span("Alice", 40.0, 684.0, 35.0),
2010            span("30", 320.0, 684.0, 15.0),
2011        ]);
2012        assert_eq!(texts, vec!["Name Age", "Alice 30"]);
2013    }
2014
2015    #[test]
2016    fn xy_cut_rejects_column_split_when_one_band_is_full_width() {
2017        // The alignment guard catches a full-width paragraph that
2018        // would otherwise be forced into the left column of a 2-column
2019        // region below it.
2020        let texts = block_texts(vec![
2021            span(
2022                "Full width intro spanning both columns here",
2023                40.0,
2024                740.0,
2025                360.0,
2026            ),
2027            span("Left A", 40.0, 700.0, 50.0),
2028            span("Right A", 320.0, 700.0, 50.0),
2029            span("Left B", 40.0, 684.0, 50.0),
2030            span("Right B", 320.0, 684.0, 50.0),
2031        ]);
2032        assert!(
2033            texts[0].contains("Full width intro"),
2034            "expected full-width intro first: {texts:?}"
2035        );
2036    }
2037
2038    #[test]
2039    fn xy_cut_horizontal_split_for_zone_boundaries() {
2040        // Pure horizontal cut on a single-column page with a big
2041        // vertical gap between paragraphs — the cut fires and both
2042        // paragraphs stay in their own blocks.
2043        let texts = block_texts(vec![
2044            span("First paragraph body text", 40.0, 740.0, 200.0),
2045            span("Second paragraph body", 40.0, 680.0, 180.0),
2046        ]);
2047        assert_eq!(texts.len(), 2);
2048        assert!(texts[0].starts_with("First"));
2049        assert!(texts[1].starts_with("Second"));
2050    }
2051
2052    #[test]
2053    fn xy_cut_recursion_terminates_with_single_span() {
2054        let texts = block_texts(vec![span("Only one span on the page", 40.0, 700.0, 180.0)]);
2055        assert_eq!(texts, vec!["Only one span on the page"]);
2056    }
2057
2058    #[test]
2059    fn median_font_size_handles_mixed_sizes() {
2060        let spans = vec![
2061            TextSpan {
2062                text: "small".into(),
2063                x: 0.0,
2064                y: 0.0,
2065                width: 10.0,
2066                height: 8.0,
2067                font_size: 8.0,
2068            },
2069            TextSpan {
2070                text: "medium".into(),
2071                x: 0.0,
2072                y: 0.0,
2073                width: 10.0,
2074                height: 12.0,
2075                font_size: 12.0,
2076            },
2077            TextSpan {
2078                text: "large".into(),
2079                x: 0.0,
2080                y: 0.0,
2081                width: 10.0,
2082                height: 24.0,
2083                font_size: 24.0,
2084            },
2085        ];
2086        assert!((median_font_size(&spans) - 12.0).abs() < 1e-9);
2087    }
2088
2089    #[test]
2090    fn columns_band_aligned_accepts_aligned_columns() {
2091        let spans = vec![
2092            span("L1", 40.0, 700.0, 60.0),
2093            span("R1", 300.0, 700.0, 60.0),
2094            span("L2", 40.0, 684.0, 60.0),
2095            span("R2", 300.0, 684.0, 60.0),
2096        ];
2097        let stats = PageStats::from_spans(&spans);
2098        // cut_x between 100 and 300 → 200. Every band straddles the cut.
2099        assert!(columns_are_band_aligned(&spans, 200.0, 40.0, 360.0, &stats));
2100    }
2101
2102    #[test]
2103    fn columns_band_aligned_rejects_wide_single_side_band() {
2104        let spans = vec![
2105            span("Wide banner line across top", 40.0, 740.0, 280.0),
2106            span("L1", 40.0, 700.0, 60.0),
2107            span("R1", 300.0, 700.0, 60.0),
2108        ];
2109        let stats = PageStats::from_spans(&spans);
2110        // cut_x = 200. Banner only in left group (midpoint < 200). Width
2111        // exceeds 0.7 × left column width → rejected.
2112        assert!(!columns_are_band_aligned(
2113            &spans, 200.0, 40.0, 360.0, &stats
2114        ));
2115    }
2116
2117    #[test]
2118    fn page_stats_computes_median_values() {
2119        let spans = vec![
2120            span("one", 40.0, 700.0, 30.0),
2121            span("two", 40.0, 680.0, 30.0),
2122            span("three", 40.0, 660.0, 50.0),
2123        ];
2124        let stats = PageStats::from_spans(&spans);
2125        assert!((stats.median_font_size - 12.0).abs() < 1e-9);
2126        // char width = width / chars. one=30/3=10, two=30/3=10, three=50/5=10. median=10.
2127        assert!((stats.median_char_width - 10.0).abs() < 1e-9);
2128        // line spacing: bands at 700, 680, 660. gaps = 20, 20. median = 20.
2129        assert!((stats.median_line_spacing - 20.0).abs() < 1e-9);
2130    }
2131
2132    #[test]
2133    fn page_stats_handles_empty_input() {
2134        let stats = PageStats::from_spans(&[]);
2135        assert!((stats.median_font_size - 12.0).abs() < 1e-9);
2136        assert!((stats.median_char_width - 6.0).abs() < 1e-9);
2137        assert_eq!(stats.median_line_spacing, 0.0);
2138    }
2139
2140    #[test]
2141    fn narrow_gutter_detected_with_adaptive_threshold() {
2142        // Academic paper layout: 12pt gutter between columns.
2143        // With old fixed 20pt threshold, this was not detected as columnar.
2144        // With adaptive: median word gap ~4pt, threshold = 12pt → detects 12pt gutter.
2145        let mut spans = Vec::new();
2146        for y in [700.0, 684.0, 668.0] {
2147            // Left column: two words with 4pt gap, ending at x=145
2148            spans.push(span("Lorem ipsum", 40.0, y, 100.0));
2149            spans.push(span("dolor sit", 144.0, y, 80.0));
2150            // Right column starts at 236 (gap = 12pt from 224)
2151            spans.push(span("amet consec", 236.0, y, 100.0));
2152            spans.push(span("tetur adipi", 340.0, y, 80.0));
2153        }
2154        let texts = block_texts(spans);
2155        // Should detect 2-column layout and read column-major
2156        assert!(
2157            texts.len() >= 6,
2158            "expected column-major output, got {texts:?}"
2159        );
2160        // First three blocks should be left column lines
2161        assert!(
2162            texts[0].contains("Lorem"),
2163            "first block should be left column: {texts:?}"
2164        );
2165    }
2166
2167    #[test]
2168    fn xy_cut_leaf_falls_back_to_legacy_columns_for_header_plus_three_columns() {
2169        let texts = block_texts(vec![
2170            span("73022", 45.0, 750.0, 70.0),
2171            span("Federal Register banner", 125.6, 750.0, 260.0),
2172            span("Left column line one", 45.0, 725.0, 140.0),
2173            span("Middle column line one", 222.0, 725.0, 140.0),
2174            span("Right column line one", 399.0, 725.0, 120.0),
2175            span("Left column line two", 45.0, 715.0, 140.0),
2176            span("Middle column line two", 210.0, 715.0, 152.0),
2177            span("Right column line two", 388.0, 715.0, 132.0),
2178            span("Left column line three", 45.0, 705.0, 140.0),
2179            span("Middle column line three", 235.0, 705.0, 135.0),
2180            span("Right column line three", 408.0, 705.0, 118.0),
2181        ]);
2182
2183        assert_eq!(
2184            texts,
2185            vec![
2186                "73022 Federal Register banner",
2187                "Left column line one",
2188                "Left column line two",
2189                "Left column line three",
2190                "Middle column line one",
2191                "Middle column line two",
2192                "Middle column line three",
2193                "Right column line one",
2194                "Right column line two",
2195                "Right column line three",
2196            ]
2197        );
2198    }
2199
2200    #[test]
2201    fn overlapping_fake_bold_spans_collapse_to_single_copy() {
2202        let texts = block_texts(vec![
2203            span("1 This is fakebold text.", 25.9, 785.3, 320.0),
2204            span("1 This is fakebold text.", 26.2, 785.3, 320.0),
2205            span("1 This is fakebold text.", 26.4, 785.3, 320.0),
2206            span("1 This is fakebold text.", 26.7, 785.3, 320.0),
2207            span("2 This is a fakebold", 27.0, 714.8, 142.0),
2208            span(" fakebold", 169.8, 714.8, 70.0),
2209            span(" fakebold", 170.1, 714.8, 70.0),
2210            span(" fakebold word.", 170.4, 714.8, 110.0),
2211        ]);
2212
2213        assert_eq!(
2214            texts,
2215            vec!["1 This is fakebold text.", "2 This is a fakebold word.",]
2216        );
2217    }
2218}