Skip to main content

pdf_engine/
text.rs

1//! Text extraction via a custom Device implementation.
2
3use kurbo::{Affine, BezPath};
4use pdf_render::pdf_interpret::cmap::BfString;
5use pdf_render::pdf_interpret::font::Glyph;
6use pdf_render::pdf_interpret::{
7    BlendMode, ClipPath, Device, GlyphDrawMode, Image, Paint, PathDrawMode, SoftMask,
8};
9use std::cmp::Ordering;
10
11/// Minimum Y tolerance for grouping spans into horizontal bands. The
12/// effective tolerance is typically `median_font_size * BAND_Y_FRACTION`
13/// per ANN[r17/TEX4]; this constant acts as the absolute floor.
14const BAND_Y_TOLERANCE: f64 = 5.0;
15/// Fraction of the page's median font size used as the band-Y
16/// tolerance. Empirically 0.30× works across common typography —
17/// below typical leading (~1.2×) so adjacent lines never collapse,
18/// above sub-pixel baseline drift.
19const BAND_Y_FRACTION: f64 = 0.30;
20/// Multiplier applied to median line spacing to derive the horizontal
21/// paragraph-break cut threshold. Normal line-to-line progression is
22/// ~1.0× the median; paragraph breaks typically show 1.5× or more.
23const PARAGRAPH_BREAK_LINE_SPACING_MULTIPLIER: f64 = 1.8;
24
25// ANN[r17/TEX1] Multi-signal consensus thresholds.
26// The previous single-threshold scheme (gap > 0.15 * font_size) missed
27// word boundaries when kerning or narrow fonts produced small measured
28// gaps even though the PDF emitted an explicit TJ backward shift, and
29// over-emitted spaces for condensed fonts where 0.15em of kerning is
30// well below an actual word space. The consensus system weights three
31// signals and inserts a space when the combined confidence exceeds
32// SPACE_CONSENSUS_THRESHOLD.
33/// Raw TJ backward adjustment (positive in PDF TJ units) that is
34/// definitively a word break. Matches pdftotext / MuPDF heuristics —
35/// a space glyph is typically emitted as either a literal 0x20 or as
36/// a TJ adjustment of around 250 1/1000 em. 100 units is a safely
37/// conservative floor.
38const TJ_SPACE_THRESHOLD_UNITS: f32 = 100.0;
39/// Weight of the TJ offset signal when confidence is high.
40const TJ_SIGNAL_WEIGHT: f64 = 0.95;
41/// Weight of the purely geometric gap signal.
42const GAP_SIGNAL_WEIGHT: f64 = 0.80;
43/// Weight of character-heuristic signals (CamelCase, digit↔letter).
44const HEURISTIC_SIGNAL_WEIGHT: f64 = 0.60;
45/// Combined weight at which a space is inserted.
46const SPACE_CONSENSUS_THRESHOLD: f64 = 0.75;
47/// Fraction of a median character width above which a gap contributes
48/// to the geometric signal (pdf_oxide uses ~0.30).
49const GAP_TO_MEDIAN_CHAR_FRACTION: f64 = 0.30;
50/// Fallback gap fraction relative to `font_size` when the running
51/// median character width has not yet been established.
52const GAP_TO_FONT_SIZE_FALLBACK_FRACTION: f64 = 0.15;
53
54/// Minimum horizontal gap treated as a column gutter (adaptive fallback).
55const COLUMN_GAP_THRESHOLD_MIN: f64 = 10.0;
56/// Maximum adaptive column gap threshold.
57const COLUMN_GAP_THRESHOLD_MAX: f64 = 40.0;
58/// Multiplier applied to median inter-word gap to derive column threshold.
59const COLUMN_GAP_MEDIAN_MULTIPLIER: f64 = 3.0;
60/// Fallback column gap threshold when median cannot be computed.
61const COLUMN_GAP_THRESHOLD_FALLBACK: f64 = 20.0;
62/// Maximum drift allowed when matching gutters across neighboring bands.
63const COLUMN_GAP_MATCH_TOLERANCE: f64 = 12.0;
64/// Minimum number of gapped bands required before we enable column mode.
65const MIN_COLUMN_GAPPED_BANDS: usize = 3;
66/// Minimum fraction of bands in a region that must expose the shared gutters.
67const MIN_COLUMN_GAP_SUPPORT: f64 = 0.80;
68/// Minimum fraction of non-empty column slices that must look like prose.
69const MIN_DENSE_SLICE_RATIO: f64 = 0.35;
70
71/// A single text span at a specific position.
72#[derive(Debug, Clone)]
73pub struct TextSpan {
74    /// The extracted text.
75    pub text: String,
76    /// X position in user space.
77    pub x: f64,
78    /// Y position in user space.
79    pub y: f64,
80    /// Approximate bounding-box width in user space.
81    pub width: f64,
82    /// Approximate bounding-box height in user space.
83    pub height: f64,
84    /// Font size (approximate, from transform).
85    pub font_size: f64,
86}
87
88impl TextSpan {
89    /// Conservative right edge using whichever is wider: measured or estimated.
90    /// Used by column detection to avoid underestimating span extent.
91    fn right(&self) -> f64 {
92        self.x + self.width.max(self.estimated_width())
93    }
94
95    /// Right edge from measured glyph positions only.
96    fn measured_right(&self) -> f64 {
97        self.x + self.width
98    }
99
100    fn estimated_width(&self) -> f64 {
101        let char_count = self.text.chars().count() as f64;
102        if char_count <= 0.0 {
103            self.font_size * 0.5
104        } else {
105            self.font_size * 0.5 * char_count
106        }
107    }
108}
109
110/// A block of text (grouped by reading order).
111#[derive(Debug, Clone)]
112pub struct TextBlock {
113    /// Spans within this block, sorted by position.
114    pub spans: Vec<TextSpan>,
115}
116
117impl TextBlock {
118    /// Concatenate all spans into a single string.
119    ///
120    /// Spans that are close together are joined without a separator;
121    /// a space is inserted when the gap between spans exceeds half
122    /// the average character width.
123    pub fn text(&self) -> String {
124        if self.spans.is_empty() {
125            return String::new();
126        }
127        let mut result = self.spans[0].text.clone();
128        for pair in self.spans.windows(2) {
129            let prev = &pair[0];
130            let curr = &pair[1];
131            let expected_end = prev.measured_right();
132            let gap = curr.x - expected_end;
133            if gap <= prev.font_size * 0.12 {
134                if let Some(trimmed) = trim_overlapping_word_prefix(&prev.text, &curr.text) {
135                    result.push_str(&trimmed);
136                    continue;
137                }
138            }
139            if gap > prev.font_size * 0.25 {
140                result.push(' ');
141            }
142            result.push_str(&curr.text);
143        }
144        result
145    }
146}
147
148#[derive(Debug, Clone)]
149struct TextBand {
150    y: f64,
151    spans: Vec<TextSpan>,
152}
153
154impl TextBand {
155    fn new(span: TextSpan) -> Self {
156        Self {
157            y: span.y,
158            spans: vec![span],
159        }
160    }
161
162    fn sort_spans(&mut self) {
163        self.spans.sort_by(|a, b| {
164            a.x.partial_cmp(&b.x)
165                .unwrap_or(Ordering::Equal)
166                .then_with(|| b.y.partial_cmp(&a.y).unwrap_or(Ordering::Equal))
167        });
168        collapse_overprinted_spans(&mut self.spans);
169    }
170
171    fn row_block(&self) -> TextBlock {
172        let mut spans = self.spans.clone();
173        spans.sort_by(|a, b| a.x.partial_cmp(&b.x).unwrap_or(Ordering::Equal));
174        TextBlock { spans }
175    }
176
177    fn left(&self) -> f64 {
178        self.spans
179            .iter()
180            .map(|span| span.x)
181            .fold(f64::INFINITY, f64::min)
182    }
183
184    fn right(&self) -> f64 {
185        self.spans
186            .iter()
187            .map(TextSpan::right)
188            .fold(f64::NEG_INFINITY, f64::max)
189    }
190
191    fn width(&self) -> f64 {
192        (self.right() - self.left()).max(0.0)
193    }
194
195    fn gap_midpoints(&self, column_gap_threshold: f64) -> Vec<f64> {
196        self.gaps(column_gap_threshold)
197            .into_iter()
198            .map(|gap| (gap.start + gap.end) * 0.5)
199            .collect()
200    }
201
202    fn gaps(&self, column_gap_threshold: f64) -> Vec<BandGap> {
203        if self.spans.len() < 2 {
204            return Vec::new();
205        }
206
207        let mut spans = self.spans.clone();
208        spans.sort_by(|a, b| a.x.partial_cmp(&b.x).unwrap_or(Ordering::Equal));
209
210        let mut gaps = Vec::new();
211        let mut prev_right = spans[0].right();
212        for span in spans.iter().skip(1) {
213            let gap = span.x - prev_right;
214            if gap >= column_gap_threshold {
215                gaps.push(BandGap {
216                    start: prev_right,
217                    end: span.x,
218                });
219            }
220            prev_right = prev_right.max(span.right());
221        }
222
223        gaps
224    }
225
226    fn split_by_boundaries(&self, boundaries: &[f64]) -> Vec<Vec<TextSpan>> {
227        let mut columns = vec![Vec::new(); boundaries.len() + 1];
228        for span in &self.spans {
229            let center_x = span.x + span.width.max(span.estimated_width()) * 0.5;
230            let column_idx = boundaries
231                .iter()
232                .position(|boundary| center_x < *boundary)
233                .unwrap_or(boundaries.len());
234            columns[column_idx].push(span.clone());
235        }
236
237        for spans in &mut columns {
238            spans.sort_by(|a, b| a.x.partial_cmp(&b.x).unwrap_or(Ordering::Equal));
239        }
240
241        columns
242    }
243
244    fn fits_single_column(
245        &self,
246        boundaries: &[f64],
247        region_left: f64,
248        region_right: f64,
249    ) -> Option<usize> {
250        let mut column_idx: Option<usize> = None;
251        for span in &self.spans {
252            let left = span.x;
253            let right = span.right();
254            if boundaries
255                .iter()
256                .any(|boundary| left < *boundary && right > *boundary)
257            {
258                return None;
259            }
260
261            let center_x = left + (right - left) * 0.5;
262            let idx = boundaries
263                .iter()
264                .position(|boundary| center_x < *boundary)
265                .unwrap_or(boundaries.len());
266            match column_idx {
267                Some(existing) if existing != idx => return None,
268                Some(_) => {}
269                None => column_idx = Some(idx),
270            }
271        }
272        let idx = column_idx?;
273        let mut edges = Vec::with_capacity(boundaries.len() + 2);
274        edges.push(region_left);
275        edges.extend_from_slice(boundaries);
276        edges.push(region_right);
277
278        let column_width = (edges[idx + 1] - edges[idx]).max(0.0);
279        if column_width <= 0.0 || self.width() > column_width * 0.8 {
280            return None;
281        }
282
283        Some(idx)
284    }
285}
286
287#[derive(Debug, Clone, Copy)]
288struct BandGap {
289    start: f64,
290    end: f64,
291}
292
293/// A Device implementation that captures text from draw_glyph calls.
294///
295/// ANN[r17/TEX1][r17/TEX3] Space detection uses a multi-signal consensus
296/// rather than a single geometric threshold. Three signals vote:
297///   1. `pending_tj_offset`  — raw TJ backward shift surfaced by the
298///      interpreter (confidence 0.95). This is the definitive word-break
299///      signal used by pdftotext / MuPDF.
300///   2. geometric gap        — measured horizontal distance between the
301///      previous glyph's right edge and this glyph's origin (confidence
302///      0.80). Compared against the running median glyph width rather
303///      than a flat em-fraction so condensed/wide fonts are handled
304///      uniformly.
305///   3. character heuristic  — CamelCase transition or digit↔letter
306///      transition at the merge point (confidence 0.60). Catches cases
307///      where the writer relied on typography (e.g. table cells glued
308///      with zero gap: `Qty1Price$5`).
309///
310/// A space is inserted when the weighted sum meets SPACE_CONSENSUS_THRESHOLD.
311/// Span accumulation still merges adjacent glyphs into one TextSpan (TEX3)
312/// so downstream reading-order logic sees logical text runs, not individual
313/// character positions.
314pub(crate) struct TextExtractionDevice {
315    spans: Vec<TextSpan>,
316    last_y: f64,
317    last_end_x: f64,
318    /// TJ adjustment in raw 1/1000 em units since the last glyph was
319    /// drawn. Positive values = backward shift (i.e., explicit horizontal
320    /// space). Reset every time a glyph is drawn.
321    pending_tj_offset: f32,
322    /// Running sample of measured glyph widths used as the adaptive
323    /// reference for the geometric gap signal. Cheap to maintain and
324    /// avoids having to re-walk all spans per decision.
325    glyph_widths: Vec<f64>,
326    /// Cached median glyph width (kept fresh every `MEDIAN_REFRESH`
327    /// insertions). Zero = not yet established, caller falls back to
328    /// font-size scaling.
329    cached_median_char_width: f64,
330}
331
332const MEDIAN_REFRESH: usize = 32;
333
334impl Default for TextExtractionDevice {
335    fn default() -> Self {
336        Self::new()
337    }
338}
339
340impl TextExtractionDevice {
341    /// Create a new text extraction device.
342    pub fn new() -> Self {
343        Self {
344            spans: Vec::new(),
345            last_y: f64::NEG_INFINITY,
346            last_end_x: f64::NEG_INFINITY,
347            pending_tj_offset: 0.0,
348            glyph_widths: Vec::new(),
349            cached_median_char_width: 0.0,
350        }
351    }
352
353    /// Refresh the cached median char width. Called lazily from
354    /// `draw_glyph` to keep the hot path cheap.
355    fn refresh_median_char_width(&mut self) {
356        if self.glyph_widths.is_empty() {
357            self.cached_median_char_width = 0.0;
358            return;
359        }
360        let mut sorted = self.glyph_widths.clone();
361        sorted.sort_by(|a, b| a.partial_cmp(b).unwrap_or(Ordering::Equal));
362        self.cached_median_char_width = sorted[sorted.len() / 2];
363    }
364
365    /// Decide whether a space should be glued between two glyphs within
366    /// the same span. Returns (insert_space, start_new_span).
367    fn evaluate_space_consensus(
368        &self,
369        gap: f64,
370        font_size: f64,
371        prev_text: &str,
372        next_text: &str,
373    ) -> bool {
374        let mut confidence = 0.0;
375
376        // Signal 1 — TJ offset (highest confidence). Raw units; a full
377        // space is ~250. Anything over TJ_SPACE_THRESHOLD_UNITS counts.
378        if self.pending_tj_offset.abs() >= TJ_SPACE_THRESHOLD_UNITS {
379            confidence += TJ_SIGNAL_WEIGHT;
380        }
381
382        // Signal 2 — geometric gap. Prefer the adaptive median-char-width
383        // reference; fall back to font-size when the median hasn't been
384        // established yet (first few glyphs on a page).
385        let gap_reference = if self.cached_median_char_width > 0.0 {
386            self.cached_median_char_width * GAP_TO_MEDIAN_CHAR_FRACTION
387        } else {
388            font_size * GAP_TO_FONT_SIZE_FALLBACK_FRACTION
389        };
390        if gap > gap_reference {
391            confidence += GAP_SIGNAL_WEIGHT;
392        }
393
394        // Signal 3 — character-class transitions. Only checked when the
395        // previous span ends with a character and the incoming text starts
396        // with one; avoids double-counting with punctuation.
397        if let (Some(prev_last), Some(next_first)) =
398            (prev_text.chars().last(), next_text.chars().next())
399        {
400            let camel = prev_last.is_lowercase() && next_first.is_uppercase();
401            let digit_to_letter = prev_last.is_ascii_digit() && next_first.is_alphabetic();
402            let letter_to_digit = prev_last.is_alphabetic() && next_first.is_ascii_digit();
403            if camel || digit_to_letter || letter_to_digit {
404                confidence += HEURISTIC_SIGNAL_WEIGHT;
405            }
406        }
407
408        confidence >= SPACE_CONSENSUS_THRESHOLD
409    }
410
411    /// Consume the device and return extracted text as a single string.
412    pub fn into_text(self) -> String {
413        let blocks = group_spans_into_blocks(self.spans);
414        let lines: Vec<String> = blocks.iter().map(|b| b.text()).collect();
415        let stitched = stitch_hyphenated_lines(&lines);
416        normalize_text_output(&stitched)
417    }
418
419    /// Consume the device and return text blocks.
420    pub fn into_blocks(self) -> Vec<TextBlock> {
421        group_spans_into_blocks(self.spans)
422    }
423
424    /// Consume the device and return raw spans.
425    #[allow(dead_code)]
426    pub(crate) fn into_spans(self) -> Vec<TextSpan> {
427        self.spans
428    }
429}
430
431impl Device<'_> for TextExtractionDevice {
432    fn set_soft_mask(&mut self, _: Option<SoftMask<'_>>) {}
433    fn set_blend_mode(&mut self, _: BlendMode) {}
434    fn draw_path(&mut self, _: &BezPath, _: Affine, _: &Paint<'_>, _: &PathDrawMode) {}
435    fn push_clip_path(&mut self, _: &ClipPath) {}
436    fn push_transparency_group(&mut self, _: f32, _: Option<SoftMask<'_>>, _: BlendMode) {}
437    fn draw_image(&mut self, _: Image<'_, '_>, _: Affine) {}
438    fn pop_clip_path(&mut self) {}
439    fn pop_transparency_group(&mut self) {}
440
441    fn draw_glyph(
442        &mut self,
443        glyph: &Glyph<'_>,
444        transform: Affine,
445        glyph_transform: Affine,
446        _paint: &Paint<'_>,
447        _draw_mode: &GlyphDrawMode,
448    ) {
449        let text = match glyph.as_unicode() {
450            Some(BfString::Char(c)) => c.to_string(),
451            Some(BfString::String(s)) => s,
452            None => return,
453        };
454
455        let composed = transform * glyph_transform;
456        let coeffs = composed.as_coeffs();
457        let x = coeffs[4];
458        let y = coeffs[5];
459        let glyph_scale = (coeffs[0].powi(2) + coeffs[1].powi(2)).sqrt().abs();
460        let font_size = glyph_scale * 1000.0;
461        let glyph_width = estimate_glyph_width(glyph, font_size).max(font_size * 0.25);
462        let glyph_end_x = x + glyph_width;
463
464        // ANN[r17/TEX4] Feed the running sample used to derive the adaptive
465        // median character width. Capped to protect against pathological
466        // pages with hundreds of thousands of glyphs.
467        if self.glyph_widths.len() < 4096 {
468            self.glyph_widths.push(glyph_width);
469            if self.glyph_widths.len().is_multiple_of(MEDIAN_REFRESH) {
470                self.refresh_median_char_width();
471            }
472        }
473
474        let same_line = (y - self.last_y).abs() <= font_size.max(BAND_Y_TOLERANCE) * 0.35;
475        let gap = x - self.last_end_x;
476        let adjacent = same_line && gap >= -font_size * 0.25 && gap < font_size * 0.5;
477
478        if adjacent && !self.spans.is_empty() {
479            // ANN[r17/TEX1] Multi-signal consensus replaces the prior
480            // single-threshold rule (`gap > 0.15 * font_size`). The
481            // consensus evaluates TJ offset, geometric gap, and
482            // character-class transitions; a space is inserted only
483            // when the weighted sum meets SPACE_CONSENSUS_THRESHOLD.
484            // Decision is computed before the mutable borrow of `last`
485            // to keep the borrow checker happy.
486            let want_space = {
487                let last = self.spans.last().expect("checked non-empty");
488                !last.text.ends_with(' ')
489                    && !text.starts_with(' ')
490                    && self.evaluate_space_consensus(gap, font_size, &last.text, &text)
491            };
492            let last = self.spans.last_mut().expect("checked non-empty");
493            if want_space {
494                last.text.push(' ');
495            }
496            last.text.push_str(&text);
497            last.width = last.width.max(glyph_end_x - last.x);
498            last.height = last.height.max(font_size);
499            self.last_y = y;
500            self.last_end_x = glyph_end_x;
501            // ANN[r17/TEX1] Consume the TJ signal: it only counts for
502            // the one merge it preceded.
503            self.pending_tj_offset = 0.0;
504            return;
505        }
506
507        self.last_y = y;
508        self.last_end_x = glyph_end_x;
509        // ANN[r17/TEX1] Non-adjacent glyph starts a fresh span, so any
510        // pending TJ offset is about within-span word breaks and no longer
511        // meaningful here.
512        self.pending_tj_offset = 0.0;
513
514        self.spans.push(TextSpan {
515            text,
516            x,
517            y,
518            width: glyph_width,
519            height: font_size,
520            font_size,
521        });
522    }
523
524    // ANN[r17/TEX1] Record TJ offsets. Accumulate because a single
525    // inter-substring gap may be expressed as multiple numeric entries
526    // (rare, but legal per PDF §9.4.3). The next draw_glyph consumes
527    // the sum.
528    fn text_adjustment(&mut self, amount: f32) {
529        self.pending_tj_offset += amount;
530    }
531}
532
533fn estimate_glyph_width(glyph: &Glyph<'_>, font_size: f64) -> f64 {
534    match glyph {
535        Glyph::Outline(outline) => outline
536            .advance_width()
537            .map(|width| width as f64 / 1000.0 * font_size)
538            .unwrap_or(font_size * 0.5),
539        Glyph::Type3(_) => font_size * 0.5,
540    }
541}
542
543/// Collapse fake-bold / overprint duplicates inside one band.
544///
545/// Real-word corpus failures such as 0105.pdf draw the same text several times
546/// with sub-point x drift to simulate heavier weight. Text extraction should
547/// keep the most informative span once rather than concatenate every overprint.
548fn collapse_overprinted_spans(spans: &mut Vec<TextSpan>) {
549    if spans.len() < 2 {
550        return;
551    }
552
553    let mut deduped: Vec<TextSpan> = Vec::with_capacity(spans.len());
554    for span in spans.drain(..) {
555        if let Some(last) = deduped.last_mut() {
556            if spans_are_overprint_duplicates(last, &span) {
557                let choose_incoming = span.text.chars().count() > last.text.chars().count()
558                    || (span.text.chars().count() == last.text.chars().count()
559                        && span.width > last.width);
560                let preferred_text = if choose_incoming {
561                    span.text.clone()
562                } else {
563                    last.text.clone()
564                };
565                let left = last.x.min(span.x);
566                let right = last.right().max(span.right());
567                last.x = left;
568                last.y = (last.y + span.y) * 0.5;
569                last.width = (right - left).max(last.width).max(span.width);
570                last.height = last.height.max(span.height);
571                last.font_size = last.font_size.max(span.font_size);
572                last.text = preferred_text;
573                continue;
574            }
575        }
576
577        deduped.push(span);
578    }
579
580    *spans = deduped;
581}
582
583fn spans_are_overprint_duplicates(lhs: &TextSpan, rhs: &TextSpan) -> bool {
584    let lhs_text = lhs.text.trim();
585    let rhs_text = rhs.text.trim();
586    if lhs_text.is_empty() || rhs_text.is_empty() {
587        return false;
588    }
589
590    let same_baseline = (lhs.y - rhs.y).abs() <= lhs.font_size.max(rhs.font_size) * 0.12;
591    if !same_baseline {
592        return false;
593    }
594
595    let lhs_left = lhs.x;
596    let lhs_right = lhs.right();
597    let rhs_left = rhs.x;
598    let rhs_right = rhs.right();
599    let overlap = (lhs_right.min(rhs_right) - lhs_left.max(rhs_left)).max(0.0);
600    let min_width = (lhs_right - lhs_left).min(rhs_right - rhs_left).max(1.0);
601    let heavily_overlaps = overlap / min_width >= 0.85;
602    if !heavily_overlaps {
603        return false;
604    }
605
606    lhs_text == rhs_text || lhs_text.starts_with(rhs_text) || rhs_text.starts_with(lhs_text)
607}
608
609fn trim_overlapping_word_prefix(prev: &str, curr: &str) -> Option<String> {
610    let prev_chars: Vec<char> = prev.trim_end().chars().collect();
611    let curr_chars: Vec<char> = curr.trim_start().chars().collect();
612    let max = prev_chars.len().min(curr_chars.len());
613
614    for len in (4..=max).rev() {
615        let prev_start = prev_chars.len() - len;
616        if prev_chars[prev_start..] != curr_chars[..len] {
617            continue;
618        }
619
620        if !curr_chars[..len].iter().all(|ch| ch.is_alphanumeric()) {
621            continue;
622        }
623
624        let prev_boundary = prev_start == 0 || !prev_chars[prev_start - 1].is_alphanumeric();
625        let curr_boundary = len == curr_chars.len() || !curr_chars[len].is_alphanumeric();
626        if !prev_boundary || !curr_boundary {
627            continue;
628        }
629
630        return Some(curr_chars[len..].iter().collect());
631    }
632
633    None
634}
635
636/// Compute an adaptive column gap threshold from a set of bands.
637///
638/// Collects all positive inter-span gaps within each band, computes the
639/// median, and returns `COLUMN_GAP_MEDIAN_MULTIPLIER × median`, clamped to
640/// `[COLUMN_GAP_THRESHOLD_MIN, COLUMN_GAP_THRESHOLD_MAX]`.  Falls back to
641/// `COLUMN_GAP_THRESHOLD_FALLBACK` when there are no measurable gaps.
642fn compute_adaptive_column_gap(bands: &[TextBand]) -> f64 {
643    let mut all_gaps: Vec<f64> = Vec::new();
644
645    for band in bands {
646        if band.spans.len() < 2 {
647            continue;
648        }
649        let mut sorted = band.spans.clone();
650        sorted.sort_by(|a, b| a.x.partial_cmp(&b.x).unwrap_or(Ordering::Equal));
651        let mut prev_right = sorted[0].right();
652        for span in sorted.iter().skip(1) {
653            let gap = span.x - prev_right;
654            if gap > 0.0 {
655                all_gaps.push(gap);
656            }
657            prev_right = prev_right.max(span.right());
658        }
659    }
660
661    if all_gaps.is_empty() {
662        return COLUMN_GAP_THRESHOLD_FALLBACK;
663    }
664
665    all_gaps.sort_by(|a, b| a.partial_cmp(b).unwrap_or(Ordering::Equal));
666
667    let min_gap = all_gaps[0];
668
669    // When all inter-span gaps are already large (> MIN threshold), they are
670    // likely all column gaps — the draw_glyph merger absorbed word-level
671    // spaces into span text.  Use a fraction of the smallest gap so that
672    // ALL column gaps exceed the threshold.
673    if min_gap > COLUMN_GAP_THRESHOLD_MIN {
674        return (min_gap * 0.75).clamp(COLUMN_GAP_THRESHOLD_MIN, COLUMN_GAP_THRESHOLD_MAX);
675    }
676
677    // Look for a natural break: the largest relative jump between consecutive
678    // sorted gaps separates word-level gaps from column gaps.
679    let mut best_break_threshold = 0.0f64;
680    let mut best_ratio = 1.5f64; // require at least 1.5× jump
681    for pair in all_gaps.windows(2) {
682        if pair[0] > 0.5 {
683            let ratio = pair[1] / pair[0];
684            if ratio > best_ratio {
685                best_ratio = ratio;
686                best_break_threshold = (pair[0] + pair[1]) * 0.5;
687            }
688        }
689    }
690
691    if best_break_threshold > 0.0 {
692        return best_break_threshold.clamp(COLUMN_GAP_THRESHOLD_MIN, COLUMN_GAP_THRESHOLD_MAX);
693    }
694
695    // Fallback: median × multiplier.
696    let mid = all_gaps.len() / 2;
697    let median = if all_gaps.len().is_multiple_of(2) {
698        (all_gaps[mid - 1] + all_gaps[mid]) * 0.5
699    } else {
700        all_gaps[mid]
701    };
702
703    (median * COLUMN_GAP_MEDIAN_MULTIPLIER)
704        .clamp(COLUMN_GAP_THRESHOLD_MIN, COLUMN_GAP_THRESHOLD_MAX)
705}
706
707/// Group spans into reading-order blocks, using column-aware reordering when
708/// a contiguous region repeatedly exposes the same gutters.
709/// Per-page adaptive parameters derived from the span set before any
710/// grouping happens. Centralising these here (TEX4) means the rest of
711/// the pipeline — band grouping, XY-Cut cuts, in-block space insertion
712/// — all speak the same typographic baseline for this specific page,
713/// rather than each helper reaching for an independent fixed constant.
714#[derive(Debug, Clone, Copy)]
715struct PageStats {
716    /// Median font size across all spans (pt).
717    median_font_size: f64,
718    /// Median measured character width (pt). Zero-guarded fallback is
719    /// 0.5 × median_font_size when there aren't enough samples.
720    /// Currently populated for diagnostics / future tuning; allow dead_code
721    /// under `-D warnings` until a reader is added.
722    #[allow(dead_code)]
723    median_char_width: f64,
724    /// Tight line-to-line spacing (25th percentile of pairwise band
725    /// gaps), representing the body-text leading on this page. The
726    /// quartile is used instead of the median so large paragraph /
727    /// zone gaps don't inflate the baseline. Zero if the page has
728    /// only one band.
729    median_line_spacing: f64,
730}
731
732impl PageStats {
733    fn from_spans(spans: &[TextSpan]) -> Self {
734        if spans.is_empty() {
735            return Self {
736                median_font_size: 12.0,
737                median_char_width: 6.0,
738                median_line_spacing: 0.0,
739            };
740        }
741
742        // Median font size.
743        let mut sizes: Vec<f64> = spans.iter().map(|s| s.font_size).collect();
744        sizes.sort_by(|a, b| a.partial_cmp(b).unwrap_or(Ordering::Equal));
745        let median_font_size = sizes[sizes.len() / 2];
746
747        // Median char width — measured width / char count, per span.
748        let mut char_widths: Vec<f64> = spans
749            .iter()
750            .filter_map(|s| {
751                let chars = s.text.chars().count();
752                if chars > 0 && s.width > 0.0 {
753                    Some(s.width / chars as f64)
754                } else {
755                    None
756                }
757            })
758            .collect();
759        let median_char_width = if char_widths.is_empty() {
760            median_font_size * 0.5
761        } else {
762            char_widths.sort_by(|a, b| a.partial_cmp(b).unwrap_or(Ordering::Equal));
763            char_widths[char_widths.len() / 2]
764        };
765
766        // Median line spacing — pairwise gaps between consecutive band
767        // y-values.
768        let band_tolerance = (median_font_size * BAND_Y_FRACTION).max(BAND_Y_TOLERANCE);
769        let mut ys: Vec<f64> = spans.iter().map(|s| s.y).collect();
770        ys.sort_by(|a, b| b.partial_cmp(a).unwrap_or(Ordering::Equal));
771        let mut band_ys: Vec<f64> = Vec::new();
772        for y in ys {
773            if band_ys
774                .last()
775                .map(|prev: &f64| (prev - y).abs() > band_tolerance)
776                .unwrap_or(true)
777            {
778                band_ys.push(y);
779            }
780        }
781        // ANN[r17/TEX4] "Line spacing" here means the TIGHT line-to-line
782        // gap inside a text block — not the median of all gaps. Using
783        // the median drags the estimate up when the page has
784        // paragraph / zone breaks (which are the very gaps the
785        // paragraph-break threshold is supposed to EXCEED). The 25th
786        // percentile is the smallest gap that still shows up in more
787        // than one place on the page; it captures body-text leading
788        // robustly even when large zone gaps dominate.
789        let median_line_spacing = if band_ys.len() < 2 {
790            0.0
791        } else {
792            let mut spacings: Vec<f64> = band_ys
793                .windows(2)
794                .map(|pair| (pair[0] - pair[1]).abs())
795                .collect();
796            spacings.sort_by(|a, b| a.partial_cmp(b).unwrap_or(Ordering::Equal));
797            let q1_index = spacings.len() / 4;
798            spacings[q1_index]
799        };
800
801        Self {
802            median_font_size,
803            median_char_width,
804            median_line_spacing,
805        }
806    }
807}
808
809// ANN[r17/TEX2] Maximum recursion depth for XY-Cut. Any real page layout
810// is decomposable in well under 10 alternating cuts; the cap guards
811// against pathological inputs where the cut predicate keeps triggering
812// due to floating-point drift.
813const XY_CUT_MAX_DEPTH: usize = 12;
814/// Minimum fraction of a region's width that a vertical gap must reach
815/// before it qualifies as a column gutter.
816const XY_CUT_VERTICAL_GAP_REGION_FRACTION: f64 = 0.04;
817/// Floor (in pt) for vertical gap regardless of region width. Matches
818/// the previous `COLUMN_GAP_THRESHOLD_MIN` and keeps XY-Cut conservative
819/// on narrow regions (sidebars, tall columns).
820const XY_CUT_VERTICAL_GAP_FLOOR: f64 = 10.0;
821/// Multiplier applied to median font size to produce the horizontal-gap
822/// threshold. 1.8 × line-height matches typical paragraph spacing.
823const XY_CUT_HORIZONTAL_GAP_FONT_MULTIPLIER: f64 = 1.8;
824/// Minimum number of spans a column must contain before it is eligible
825/// for acceptance — one-span "columns" are almost always sidebar noise
826/// or table-cell fragments.
827const XY_CUT_MIN_SPANS_PER_COLUMN: usize = 2;
828/// Average characters per band a column must have before it's accepted
829/// as dense prose (vs. a short-cell table column).
830const XY_CUT_MIN_CHARS_PER_BAND: f64 = 8.0;
831
832/// ANN[r17/TEX2][r17/TEX4] Top-level grouping uses recursive XY-Cut
833/// with a density guard. Per-page stats are computed once up front so
834/// every decision downstream speaks the same typographic baseline.
835fn group_spans_into_blocks(spans: Vec<TextSpan>) -> Vec<TextBlock> {
836    if spans.is_empty() {
837        return Vec::new();
838    }
839    let stats = PageStats::from_spans(&spans);
840    xy_cut_recursive(spans, 0, &stats)
841}
842
843fn xy_cut_recursive(spans: Vec<TextSpan>, depth: usize, stats: &PageStats) -> Vec<TextBlock> {
844    if spans.is_empty() {
845        return Vec::new();
846    }
847    if depth >= XY_CUT_MAX_DEPTH {
848        return band_based_blocks(spans, stats);
849    }
850
851    // ANN[r17/TEX2] Pick whichever direction has the largest qualifying
852    // gap. Always cutting vertically first breaks layouts where a
853    // footer sits in the mid-x range — it would attach to the left
854    // column instead of being recognized as a page-level zone. The
855    // "largest gap wins" rule is the standard XY-Cut tie-breaker used
856    // by academic OCR literature and matches pdf_oxide.
857    let vcut = try_vertical_cut(&spans, stats);
858    let hcut = try_horizontal_cut(&spans, stats);
859
860    let (chosen, _) = match (vcut, hcut) {
861        (Some((v_groups, v_gap)), Some((h_groups, h_gap))) => {
862            if v_gap >= h_gap {
863                (Some(v_groups), v_gap)
864            } else {
865                (Some(h_groups), h_gap)
866            }
867        }
868        (Some((v_groups, v_gap)), None) => (Some(v_groups), v_gap),
869        (None, Some((h_groups, h_gap))) => (Some(h_groups), h_gap),
870        (None, None) => (None, 0.0),
871    };
872
873    if let Some(groups) = chosen {
874        let mut out = Vec::new();
875        for group in groups {
876            out.extend(xy_cut_recursive(group, depth + 1, stats));
877        }
878        return out;
879    }
880
881    band_based_blocks(spans, stats)
882}
883
884/// Emit per-band row blocks without any column detection. Used as the
885/// leaf of XY-Cut recursion — at this point the region either has no
886/// further cuts or the density guard refused them.
887fn band_based_blocks(spans: Vec<TextSpan>, stats: &PageStats) -> Vec<TextBlock> {
888    // XY-Cut can miss recurring gutters when a small number of bands span the
889    // full page width (e.g. a running header above a 3-column body). In that
890    // case, fall back to the older band/gutter detector inside the leaf region
891    // instead of flattening everything row-major.
892    group_spans_into_blocks_legacy_with_stats(spans, stats)
893}
894
895/// Median font-size helper. Currently unreferenced after `PageStats` took over
896/// the typography baseline computation; kept available for future tuning paths.
897#[allow(dead_code)]
898fn median_font_size(spans: &[TextSpan]) -> f64 {
899    if spans.is_empty() {
900        return 12.0;
901    }
902    let mut sizes: Vec<f64> = spans.iter().map(|s| s.font_size).collect();
903    sizes.sort_by(|a, b| a.partial_cmp(b).unwrap_or(Ordering::Equal));
904    sizes[sizes.len() / 2]
905}
906
907/// Attempt a vertical (column) cut. Returns the span groups plus the
908/// gap size (in pt) if a suitable gutter is found AND the density +
909/// alignment guards accept.
910///
911/// ANN[r17/TEX2] Three guards together avoid false-positive columns:
912///   1. `min_gap` is the MAX of (median_font, 4% of region width, 10pt)
913///      — deliberately lower than `median_font * 2` so narrow-gutter
914///      academic papers (12pt gutters, common in print) are still
915///      detected.
916///   2. `columns_are_dense` rejects column splits where either side
917///      has <2 spans or <8 chars/band — catches table cells.
918///   3. `columns_are_band_aligned` rejects cuts where any band would
919///      end up on only one side of the cut while being wider than
920///      ~70% of that side's column width — catches full-width
921///      paragraphs (Intro / Outro) that accidentally sit in the
922///      left-column x-range.
923fn try_vertical_cut(spans: &[TextSpan], stats: &PageStats) -> Option<(Vec<Vec<TextSpan>>, f64)> {
924    if spans.len() < 2 * XY_CUT_MIN_SPANS_PER_COLUMN {
925        return None;
926    }
927
928    let region_left = spans.iter().map(|s| s.x).fold(f64::INFINITY, f64::min);
929    let region_right = spans
930        .iter()
931        .map(TextSpan::right)
932        .fold(f64::NEG_INFINITY, f64::max);
933    let region_width = region_right - region_left;
934    if region_width <= 0.0 {
935        return None;
936    }
937
938    // ANN[r17/TEX2][r17/TEX4] Threshold uses the ADAPTIVE median-word-gap
939    // from the bands rather than a flat font-size multiple. Narrow-gutter
940    // academic layouts have 12pt gutters next to 4pt word spaces — the
941    // adaptive threshold scales with the actual typography used on this
942    // page. Clamped to `XY_CUT_VERTICAL_GAP_FLOOR` to avoid firing on
943    // ordinary inter-word spaces when character advance data is noisy.
944    // median_font and the width fraction act only as safety rails for
945    // pathological inputs.
946    let bands = group_spans_into_bands_with_stats(spans.to_vec(), stats);
947    let adaptive = compute_adaptive_column_gap(&bands);
948    let floor = stats
949        .median_font_size
950        .max(region_width * XY_CUT_VERTICAL_GAP_REGION_FRACTION)
951        .max(XY_CUT_VERTICAL_GAP_FLOOR);
952    let min_gap = adaptive.min(floor).max(XY_CUT_VERTICAL_GAP_FLOOR);
953
954    // Intervals [x_left, x_right] of every span; we look for an x value
955    // that is free of ALL intervals (full-height gap).
956    let mut intervals: Vec<(f64, f64)> = spans
957        .iter()
958        .map(|s| (s.x, s.right().max(s.x + 0.001)))
959        .collect();
960    intervals.sort_by(|a, b| a.0.partial_cmp(&b.0).unwrap_or(Ordering::Equal));
961
962    let mut cursor = intervals[0].1;
963    let mut best_gap: Option<(f64, f64)> = None; // (gap_size, cut_x)
964    for (left, right) in intervals.iter().skip(1) {
965        if *left > cursor {
966            let gap = *left - cursor;
967            if gap >= min_gap {
968                match best_gap {
969                    Some((best, _)) if best >= gap => {}
970                    _ => {
971                        let cut_x = (cursor + *left) * 0.5;
972                        best_gap = Some((gap, cut_x));
973                    }
974                }
975            }
976        }
977        cursor = cursor.max(*right);
978    }
979
980    let (gap_size, cut_x) = best_gap?;
981
982    // Split spans around the cut. A span whose midpoint is < cut_x
983    // belongs to the left group.
984    let mut left_group = Vec::new();
985    let mut right_group = Vec::new();
986    for span in spans {
987        let midpoint = span.x + (span.right() - span.x) * 0.5;
988        if midpoint < cut_x {
989            left_group.push(span.clone());
990        } else {
991            right_group.push(span.clone());
992        }
993    }
994
995    if !columns_are_dense(&left_group, &right_group, stats) {
996        return None;
997    }
998    if !columns_are_band_aligned(spans, cut_x, region_left, region_right, stats) {
999        return None;
1000    }
1001
1002    Some((vec![left_group, right_group], gap_size))
1003}
1004
1005/// ANN[r17/TEX2] Reject a vertical cut when any band sits on only one
1006/// side of the cut AND occupies more than ~70% of that side's column
1007/// width. Such bands are almost certainly full-width paragraphs that
1008/// happened to align with the left margin of one column, and forcing
1009/// them into that column re-orders them relative to text that follows.
1010fn columns_are_band_aligned(
1011    spans: &[TextSpan],
1012    cut_x: f64,
1013    region_left: f64,
1014    region_right: f64,
1015    stats: &PageStats,
1016) -> bool {
1017    let left_width = (cut_x - region_left).max(1.0);
1018    let right_width = (region_right - cut_x).max(1.0);
1019
1020    // Threshold chosen empirically: paragraph bodies in columnar
1021    // layouts usually fill ~60-70% of their column; anything wider
1022    // than 0.7× is a page-level element masquerading as column
1023    // content.
1024    const MAX_SINGLE_SIDE_FRACTION: f64 = 0.70;
1025
1026    let bands = group_spans_into_bands_with_stats(spans.to_vec(), stats);
1027    for band in &bands {
1028        let mut has_left = false;
1029        let mut has_right = false;
1030        for span in &band.spans {
1031            let midpoint = span.x + (span.right() - span.x) * 0.5;
1032            if midpoint < cut_x {
1033                has_left = true;
1034            } else {
1035                has_right = true;
1036            }
1037        }
1038        if has_left && has_right {
1039            continue; // Band straddles columns → fine.
1040        }
1041        let band_width = band.width();
1042        if has_left && band_width > left_width * MAX_SINGLE_SIDE_FRACTION {
1043            return false;
1044        }
1045        if has_right && band_width > right_width * MAX_SINGLE_SIDE_FRACTION {
1046            return false;
1047        }
1048    }
1049    true
1050}
1051
1052/// Density guard — reject column splits that look like tables (few,
1053/// short spans per column). A column is "dense" when it has at least
1054/// MIN_SPANS_PER_COLUMN spans and the average character count per band
1055/// exceeds MIN_CHARS_PER_BAND.
1056fn columns_are_dense(left: &[TextSpan], right: &[TextSpan], stats: &PageStats) -> bool {
1057    for col in [left, right] {
1058        if col.len() < XY_CUT_MIN_SPANS_PER_COLUMN {
1059            return false;
1060        }
1061        let bands = group_spans_into_bands_with_stats(col.to_vec(), stats);
1062        if bands.is_empty() {
1063            return false;
1064        }
1065        let total_chars: usize = col.iter().map(|s| s.text.chars().count()).sum();
1066        let chars_per_band = total_chars as f64 / bands.len() as f64;
1067        if chars_per_band < XY_CUT_MIN_CHARS_PER_BAND {
1068            return false;
1069        }
1070    }
1071    true
1072}
1073
1074/// Attempt a horizontal (zone / paragraph) cut. Unlike vertical cuts
1075/// this does NOT need a density guard — splitting top-from-bottom
1076/// cannot re-order content.
1077fn try_horizontal_cut(spans: &[TextSpan], stats: &PageStats) -> Option<(Vec<Vec<TextSpan>>, f64)> {
1078    if spans.len() < 2 {
1079        return None;
1080    }
1081    // Sort by descending y (PDF y grows upward).
1082    let mut sorted = spans.to_vec();
1083    sorted.sort_by(|a, b| {
1084        b.y.partial_cmp(&a.y)
1085            .unwrap_or(Ordering::Equal)
1086            .then_with(|| a.x.partial_cmp(&b.x).unwrap_or(Ordering::Equal))
1087    });
1088
1089    // ANN[r17/TEX4] Paragraph / zone cuts scale with MEDIAN LINE
1090    // SPACING when available — this is the typographically correct
1091    // baseline (paragraph break ≈ 1.8 × line-spacing). When the page
1092    // has only one band, or stats haven't observed spacing yet, fall
1093    // back to the font-size multiple the legacy path used.
1094    let min_gap = if stats.median_line_spacing > 0.0 {
1095        stats.median_line_spacing * PARAGRAPH_BREAK_LINE_SPACING_MULTIPLIER
1096    } else {
1097        stats.median_font_size * XY_CUT_HORIZONTAL_GAP_FONT_MULTIPLIER
1098    };
1099
1100    // Look for the largest gap between consecutive span y-values.
1101    let mut best: Option<(f64, f64)> = None; // (gap_size, cut_y)
1102    let tolerance = stats.median_font_size * BAND_Y_FRACTION;
1103    let mut band_bottom = sorted[0].y;
1104
1105    for span in sorted.iter().skip(1) {
1106        if (band_bottom - span.y).abs() <= tolerance {
1107            band_bottom = band_bottom.min(span.y);
1108            continue;
1109        }
1110        let gap = band_bottom - span.y;
1111        if gap >= min_gap {
1112            let cut_y = (band_bottom + span.y) * 0.5;
1113            match best {
1114                Some((best_gap, _)) if best_gap >= gap => {}
1115                _ => best = Some((gap, cut_y)),
1116            }
1117        }
1118        band_bottom = span.y;
1119    }
1120
1121    let (gap_size, cut_y) = best?;
1122
1123    let mut top_group = Vec::new();
1124    let mut bottom_group = Vec::new();
1125    for span in spans {
1126        if span.y > cut_y {
1127            top_group.push(span.clone());
1128        } else {
1129            bottom_group.push(span.clone());
1130        }
1131    }
1132    if top_group.is_empty() || bottom_group.is_empty() {
1133        return None;
1134    }
1135    Some((vec![top_group, bottom_group], gap_size))
1136}
1137
1138/// Legacy band+column-detection path, kept for reference and as the
1139/// fallback inside `band_based_blocks` test coverage. Not currently
1140/// used — XY-Cut supersedes it.
1141#[allow(dead_code)]
1142fn group_spans_into_blocks_legacy(spans: Vec<TextSpan>) -> Vec<TextBlock> {
1143    let bands = group_spans_into_bands(spans);
1144    group_spans_into_blocks_legacy_from_bands(bands)
1145}
1146
1147fn group_spans_into_blocks_legacy_with_stats(
1148    spans: Vec<TextSpan>,
1149    stats: &PageStats,
1150) -> Vec<TextBlock> {
1151    let bands = group_spans_into_bands_with_stats(spans, stats);
1152    group_spans_into_blocks_legacy_from_bands(bands)
1153}
1154
1155fn group_spans_into_blocks_legacy_from_bands(bands: Vec<TextBand>) -> Vec<TextBlock> {
1156    if bands.is_empty() {
1157        return Vec::new();
1158    }
1159
1160    let column_gap_threshold = compute_adaptive_column_gap(&bands);
1161
1162    let mut blocks = Vec::new();
1163    let mut idx = 0;
1164
1165    while idx < bands.len() {
1166        let gap_midpoints = bands[idx].gap_midpoints(column_gap_threshold);
1167        if gap_midpoints.is_empty() {
1168            blocks.push(bands[idx].row_block());
1169            idx += 1;
1170            continue;
1171        }
1172
1173        let mut boundaries = gap_midpoints.clone();
1174        let mut band_indices = vec![idx];
1175        let mut gapped_band_count = 1usize;
1176        let mut region_left = bands[idx].left();
1177        let mut region_right = bands[idx].right();
1178        let mut next_idx = idx + 1;
1179
1180        while next_idx < bands.len() {
1181            let next_band = &bands[next_idx];
1182            let next_gap_midpoints = next_band.gap_midpoints(column_gap_threshold);
1183            if next_gap_midpoints.is_empty() {
1184                if next_band
1185                    .fits_single_column(&boundaries, region_left, region_right)
1186                    .is_some()
1187                {
1188                    band_indices.push(next_idx);
1189                    next_idx += 1;
1190                    continue;
1191                }
1192                break;
1193            }
1194
1195            if !boundaries_match(&boundaries, &next_gap_midpoints, column_gap_threshold) {
1196                break;
1197            }
1198
1199            update_boundaries(&mut boundaries, &next_gap_midpoints, gapped_band_count);
1200            gapped_band_count += 1;
1201            band_indices.push(next_idx);
1202            region_left = region_left.min(next_band.left());
1203            region_right = region_right.max(next_band.right());
1204            next_idx += 1;
1205        }
1206
1207        if region_is_columnar(&bands, &band_indices, &boundaries, gapped_band_count) {
1208            append_column_region_blocks(&bands, &band_indices, &boundaries, &mut blocks);
1209            idx = next_idx;
1210        } else {
1211            blocks.push(bands[idx].row_block());
1212            idx += 1;
1213        }
1214    }
1215
1216    blocks
1217}
1218
1219/// Legacy wrapper used by call sites that haven't been handed PageStats.
1220/// It derives stats locally. Prefer `group_spans_into_bands_with_stats`
1221/// inside the XY-Cut pipeline to avoid recomputing the stats per call.
1222fn group_spans_into_bands(spans: Vec<TextSpan>) -> Vec<TextBand> {
1223    let stats = PageStats::from_spans(&spans);
1224    group_spans_into_bands_with_stats(spans, &stats)
1225}
1226
1227fn group_spans_into_bands_with_stats(mut spans: Vec<TextSpan>, stats: &PageStats) -> Vec<TextBand> {
1228    if spans.is_empty() {
1229        return Vec::new();
1230    }
1231
1232    spans.sort_by(|a, b| {
1233        b.y.partial_cmp(&a.y)
1234            .unwrap_or(Ordering::Equal)
1235            .then_with(|| a.x.partial_cmp(&b.x).unwrap_or(Ordering::Equal))
1236    });
1237
1238    // ANN[r17/TEX4] Band tolerance scales with this page's median font
1239    // size rather than a fixed 5pt floor. Single-page spreads with
1240    // huge display fonts (24pt+) previously merged unrelated lines; a
1241    // fractional threshold keeps that from happening without hurting
1242    // body-text pages.
1243    let page_tolerance = (stats.median_font_size * BAND_Y_FRACTION).max(BAND_Y_TOLERANCE);
1244
1245    let mut bands: Vec<TextBand> = Vec::new();
1246
1247    for span in spans {
1248        let tolerance = (span.height * BAND_Y_FRACTION)
1249            .max(page_tolerance)
1250            .max(BAND_Y_TOLERANCE);
1251        if let Some(band) = bands
1252            .iter_mut()
1253            .find(|band| (band.y - span.y).abs() <= tolerance)
1254        {
1255            let span_count = band.spans.len() as f64;
1256            band.y = (band.y * span_count + span.y) / (span_count + 1.0);
1257            band.spans.push(span);
1258        } else {
1259            bands.push(TextBand::new(span));
1260        }
1261    }
1262
1263    for band in &mut bands {
1264        band.sort_spans();
1265    }
1266
1267    bands.sort_by(|a, b| b.y.partial_cmp(&a.y).unwrap_or(Ordering::Equal));
1268    bands
1269}
1270
1271fn boundaries_match(boundaries: &[f64], gap_midpoints: &[f64], column_gap_threshold: f64) -> bool {
1272    let tolerance = (column_gap_threshold * 1.5).clamp(COLUMN_GAP_MATCH_TOLERANCE, 60.0);
1273    boundaries.len() == gap_midpoints.len()
1274        && boundaries
1275            .iter()
1276            .zip(gap_midpoints)
1277            .all(|(lhs, rhs)| (lhs - rhs).abs() <= tolerance)
1278}
1279
1280fn update_boundaries(boundaries: &mut [f64], gap_midpoints: &[f64], seen_gapped_bands: usize) {
1281    for (boundary, midpoint) in boundaries.iter_mut().zip(gap_midpoints) {
1282        *boundary =
1283            (*boundary * seen_gapped_bands as f64 + midpoint) / (seen_gapped_bands as f64 + 1.0);
1284    }
1285}
1286
1287fn region_is_columnar(
1288    bands: &[TextBand],
1289    band_indices: &[usize],
1290    boundaries: &[f64],
1291    gapped_band_count: usize,
1292) -> bool {
1293    if boundaries.is_empty()
1294        || gapped_band_count < MIN_COLUMN_GAPPED_BANDS
1295        || band_indices.is_empty()
1296        || (gapped_band_count as f64 / band_indices.len() as f64) < MIN_COLUMN_GAP_SUPPORT
1297    {
1298        return false;
1299    }
1300
1301    let mut non_empty_slices = 0usize;
1302    let mut dense_slices = 0usize;
1303    let mut slices_per_column = vec![0usize; boundaries.len() + 1];
1304
1305    for &band_idx in band_indices {
1306        let slices = bands[band_idx].split_by_boundaries(boundaries);
1307        for (column_idx, slice) in slices.iter().enumerate() {
1308            if slice.is_empty() {
1309                continue;
1310            }
1311
1312            non_empty_slices += 1;
1313            slices_per_column[column_idx] += 1;
1314
1315            let char_count = slice
1316                .iter()
1317                .map(|span| span.text.chars().count())
1318                .sum::<usize>();
1319            if slice.len() >= 2 || char_count >= 8 {
1320                dense_slices += 1;
1321            }
1322        }
1323    }
1324
1325    if non_empty_slices < boundaries.len() + 2 {
1326        return false;
1327    }
1328
1329    if slices_per_column.contains(&0) {
1330        return false;
1331    }
1332
1333    (dense_slices as f64 / non_empty_slices as f64) >= MIN_DENSE_SLICE_RATIO
1334}
1335
1336fn append_column_region_blocks(
1337    bands: &[TextBand],
1338    band_indices: &[usize],
1339    boundaries: &[f64],
1340    blocks: &mut Vec<TextBlock>,
1341) {
1342    let column_count = boundaries.len() + 1;
1343    let mut column_bands = vec![Vec::<TextSpan>::new(); column_count];
1344
1345    for &band_idx in band_indices {
1346        let slices = bands[band_idx].split_by_boundaries(boundaries);
1347        for (column_idx, slice) in slices.into_iter().enumerate() {
1348            if slice.is_empty() {
1349                continue;
1350            }
1351            column_bands[column_idx].push(TextSpan {
1352                text: String::new(),
1353                x: 0.0,
1354                y: 0.0,
1355                width: 0.0,
1356                height: 0.0,
1357                font_size: 0.0,
1358            });
1359            let marker_idx = column_bands[column_idx].len() - 1;
1360            column_bands[column_idx][marker_idx] = TextSpan {
1361                text: String::new(),
1362                x: f64::NEG_INFINITY,
1363                y: bands[band_idx].y,
1364                width: 0.0,
1365                height: 0.0,
1366                font_size: 0.0,
1367            };
1368            column_bands[column_idx].extend(slice);
1369        }
1370    }
1371
1372    for spans in column_bands {
1373        let mut current: Vec<TextSpan> = Vec::new();
1374        for span in spans {
1375            if span.x == f64::NEG_INFINITY {
1376                if !current.is_empty() {
1377                    current.sort_by(|a, b| a.x.partial_cmp(&b.x).unwrap_or(Ordering::Equal));
1378                    blocks.push(TextBlock {
1379                        spans: std::mem::take(&mut current),
1380                    });
1381                }
1382                continue;
1383            }
1384            current.push(span);
1385        }
1386        if !current.is_empty() {
1387            current.sort_by(|a, b| a.x.partial_cmp(&b.x).unwrap_or(Ordering::Equal));
1388            blocks.push(TextBlock { spans: current });
1389        }
1390    }
1391}
1392
1393/// Join per-block lines, stitching end-of-line hyphenated word-wraps the
1394/// way pdftotext / MuPDF / PDFBox do.
1395///
1396/// Trigger conditions (all must hold):
1397/// 1. Previous line ends with `-` preceded by an alphabetic character.
1398/// 2. The alphabetic suffix before the `-` has >= 3 characters.
1399/// 3. The next line (trimmed) starts with an ASCII lowercase letter.
1400/// 4. The lowercase prefix of the next line has >= 3 characters.
1401///
1402/// When triggered, the trailing `-` is removed and the two halves are
1403/// concatenated without a space or newline.
1404///
1405/// This avoids false positives on compound words ("real-time"), bullet
1406/// lists, numeric ranges ("42-"), and short fragments.
1407fn stitch_hyphenated_lines(lines: &[String]) -> String {
1408    let mut out = String::new();
1409    for (idx, line) in lines.iter().enumerate() {
1410        if idx == 0 {
1411            out.push_str(line);
1412            continue;
1413        }
1414
1415        let next_trimmed = line.trim_start();
1416
1417        // Check the accumulated output for end-of-line hyphen pattern
1418        let should_merge = is_hyphen_wrap_candidate(&out, next_trimmed);
1419
1420        if should_merge {
1421            out.pop(); // drop the trailing '-'
1422            out.push_str(next_trimmed);
1423        } else {
1424            out.push('\n');
1425            out.push_str(line);
1426        }
1427    }
1428    out
1429}
1430
1431/// Check if the accumulated text ends with a hyphen-wrap pattern and the
1432/// continuation is a valid merge target.
1433fn is_hyphen_wrap_candidate(accumulated: &str, next_trimmed: &str) -> bool {
1434    // Must end with '-'
1435    if !accumulated.ends_with('-') {
1436        return false;
1437    }
1438
1439    // Character before '-' must be alphabetic
1440    let before_hyphen = accumulated.chars().rev().nth(1);
1441    if !before_hyphen.is_some_and(|c| c.is_alphabetic()) {
1442        return false;
1443    }
1444
1445    // Count consecutive alphabetic chars before the '-' (the word fragment)
1446    let alpha_prefix_len = accumulated
1447        .chars()
1448        .rev()
1449        .skip(1) // skip the '-'
1450        .take_while(|c| c.is_alphabetic())
1451        .count();
1452    if alpha_prefix_len < 3 {
1453        return false;
1454    }
1455
1456    // Next line must start with lowercase ASCII
1457    let first_next = next_trimmed.chars().next();
1458    if !first_next.is_some_and(|c| c.is_ascii_lowercase()) {
1459        return false;
1460    }
1461
1462    // Count consecutive lowercase chars at start of next line
1463    let next_alpha_len = next_trimmed
1464        .chars()
1465        .take_while(|c| c.is_ascii_lowercase())
1466        .count();
1467    if next_alpha_len < 3 {
1468        return false;
1469    }
1470
1471    true
1472}
1473
1474/// Normalize extracted text to match pdftotext conventions.
1475///
1476/// 1. Trim trailing whitespace from each line.
1477/// 2. Collapse runs of more than two consecutive newlines into exactly two.
1478/// 3. Preserve form-feed characters (`\x0C`) as page separators.
1479/// 4. End with a single trailing newline (or empty for empty input).
1480pub(crate) fn normalize_text_output(text: &str) -> String {
1481    if text.is_empty() {
1482        return String::new();
1483    }
1484
1485    let mut lines: Vec<&str> = Vec::new();
1486    for line in text.split('\n') {
1487        lines.push(line.trim_end());
1488    }
1489
1490    // Remove trailing empty lines (we'll add exactly one \n at the end)
1491    while lines.last() == Some(&"") {
1492        lines.pop();
1493    }
1494
1495    if lines.is_empty() {
1496        return String::new();
1497    }
1498
1499    let mut result = String::with_capacity(text.len());
1500    let mut consecutive_empty = 0u32;
1501
1502    for (i, line) in lines.iter().enumerate() {
1503        if line.is_empty() || *line == "\x0C" {
1504            if line.is_empty() {
1505                consecutive_empty += 1;
1506                // Collapse >2 consecutive blank lines to 2
1507                if consecutive_empty <= 2 {
1508                    result.push('\n');
1509                }
1510            } else {
1511                // Bare form-feed line
1512                consecutive_empty = 0;
1513                result.push_str(line);
1514                if i + 1 < lines.len() {
1515                    result.push('\n');
1516                }
1517            }
1518        } else {
1519            // Both form-feed-prefixed and regular lines are emitted as-is.
1520            consecutive_empty = 0;
1521            result.push_str(line);
1522            if i + 1 < lines.len() {
1523                result.push('\n');
1524            }
1525        }
1526    }
1527
1528    // Ensure single trailing newline
1529    if !result.is_empty() && !result.ends_with('\n') {
1530        result.push('\n');
1531    }
1532
1533    result
1534}
1535
1536#[cfg(test)]
1537mod tests {
1538    use super::*;
1539
1540    fn span(text: &str, x: f64, y: f64, width: f64) -> TextSpan {
1541        TextSpan {
1542            text: text.into(),
1543            x,
1544            y,
1545            width,
1546            height: 12.0,
1547            font_size: 12.0,
1548        }
1549    }
1550
1551    fn block_texts(spans: Vec<TextSpan>) -> Vec<String> {
1552        group_spans_into_blocks(spans)
1553            .into_iter()
1554            .map(|block| block.text())
1555            .collect()
1556    }
1557
1558    #[test]
1559    fn empty_device_produces_empty_text() {
1560        let dev = TextExtractionDevice::new();
1561        assert!(dev.into_text().is_empty());
1562    }
1563
1564    #[test]
1565    fn single_column_stays_row_major() {
1566        let texts = block_texts(vec![
1567            span("Single Column Line 1", 40.0, 700.0, 140.0),
1568            span("Single Column Line 2", 40.0, 684.0, 140.0),
1569            span("Single Column Line 3", 40.0, 668.0, 140.0),
1570        ]);
1571
1572        assert_eq!(
1573            texts,
1574            vec![
1575                "Single Column Line 1",
1576                "Single Column Line 2",
1577                "Single Column Line 3",
1578            ]
1579        );
1580    }
1581
1582    #[test]
1583    fn two_column_region_reads_column_major() {
1584        let texts = block_texts(vec![
1585            span("Header", 200.0, 740.0, 80.0),
1586            span("Left column line one", 40.0, 700.0, 115.0),
1587            span("Right column line one", 320.0, 700.0, 120.0),
1588            span("Left column line two", 40.0, 684.0, 115.0),
1589            span("Right column line two", 320.0, 684.0, 120.0),
1590            span("Left column line three", 40.0, 668.0, 125.0),
1591            span("Right column line three", 320.0, 668.0, 130.0),
1592            span("Footer", 200.0, 620.0, 80.0),
1593        ]);
1594
1595        assert_eq!(
1596            texts,
1597            vec![
1598                "Header",
1599                "Left column line one",
1600                "Left column line two",
1601                "Left column line three",
1602                "Right column line one",
1603                "Right column line two",
1604                "Right column line three",
1605                "Footer",
1606            ]
1607        );
1608    }
1609
1610    #[test]
1611    fn mixed_single_and_multi_column_regions_preserve_shared_bands() {
1612        let texts = block_texts(vec![
1613            span("Intro paragraph", 40.0, 740.0, 180.0),
1614            span("L1 words here", 40.0, 700.0, 110.0),
1615            span("R1 words here", 320.0, 700.0, 110.0),
1616            span("L2 words here", 40.0, 684.0, 110.0),
1617            span("R2 words here", 320.0, 684.0, 110.0),
1618            span("L3 words here", 40.0, 668.0, 110.0),
1619            span("R3 words here", 320.0, 668.0, 110.0),
1620            span("Outro paragraph", 40.0, 620.0, 180.0),
1621        ]);
1622
1623        assert_eq!(
1624            texts,
1625            vec![
1626                "Intro paragraph",
1627                "L1 words here",
1628                "L2 words here",
1629                "L3 words here",
1630                "R1 words here",
1631                "R2 words here",
1632                "R3 words here",
1633                "Outro paragraph",
1634            ]
1635        );
1636    }
1637
1638    #[test]
1639    fn short_table_like_rows_fall_back_to_row_major() {
1640        let texts = block_texts(vec![
1641            span("Name", 40.0, 700.0, 30.0),
1642            span("Age", 320.0, 700.0, 20.0),
1643            span("Alice", 40.0, 684.0, 35.0),
1644            span("30", 320.0, 684.0, 15.0),
1645            span("Bob", 40.0, 668.0, 24.0),
1646            span("25", 320.0, 668.0, 15.0),
1647        ]);
1648
1649        assert_eq!(texts, vec!["Name Age", "Alice 30", "Bob 25"]);
1650    }
1651
1652    #[test]
1653    fn three_column_regions_are_supported() {
1654        let texts = block_texts(vec![
1655            span("Column one line one", 40.0, 700.0, 105.0),
1656            span("Column two line one", 220.0, 700.0, 105.0),
1657            span("Column three line one", 400.0, 700.0, 120.0),
1658            span("Column one line two", 40.0, 684.0, 105.0),
1659            span("Column two line two", 220.0, 684.0, 105.0),
1660            span("Column three line two", 400.0, 684.0, 120.0),
1661            span("Column one line three", 40.0, 668.0, 120.0),
1662            span("Column two line three", 220.0, 668.0, 120.0),
1663            span("Column three line three", 400.0, 668.0, 135.0),
1664        ]);
1665
1666        assert_eq!(
1667            texts,
1668            vec![
1669                "Column one line one",
1670                "Column one line two",
1671                "Column one line three",
1672                "Column two line one",
1673                "Column two line two",
1674                "Column two line three",
1675                "Column three line one",
1676                "Column three line two",
1677                "Column three line three",
1678            ]
1679        );
1680    }
1681
1682    #[test]
1683    fn text_block_concatenation_spaced() {
1684        let block = TextBlock {
1685            spans: vec![span("A", 0.0, 0.0, 6.0), span("B", 20.0, 0.0, 6.0)],
1686        };
1687        assert_eq!(block.text(), "A B");
1688    }
1689
1690    #[test]
1691    fn adaptive_column_gap_fallback_for_no_gaps() {
1692        // Single-span bands produce no measurable gaps → fallback
1693        let bands = vec![
1694            TextBand::new(span("Hello", 40.0, 700.0, 80.0)),
1695            TextBand::new(span("World", 40.0, 684.0, 80.0)),
1696        ];
1697        let threshold = compute_adaptive_column_gap(&bands);
1698        assert!((threshold - COLUMN_GAP_THRESHOLD_FALLBACK).abs() < 0.01);
1699    }
1700
1701    #[test]
1702    fn adaptive_column_gap_uses_median() {
1703        // Three bands with word gaps of ~4pt each → median ≈ 4, threshold = 12
1704        let mut bands = Vec::new();
1705        for y in [700.0, 684.0, 668.0] {
1706            let mut band = TextBand::new(span("word1", 40.0, y, 30.0));
1707            band.spans.push(span("word2", 74.0, y, 30.0)); // gap = 4
1708            band.spans.push(span("word3", 108.0, y, 30.0)); // gap = 4
1709            bands.push(band);
1710        }
1711        let threshold = compute_adaptive_column_gap(&bands);
1712        // median gap = 4, × 3 = 12, clamped to [10, 40] → 12
1713        assert!(
1714            threshold >= 10.0 && threshold <= 14.0,
1715            "expected ~12, got {threshold}"
1716        );
1717    }
1718
1719    #[test]
1720    fn adaptive_column_gap_clamps_to_min() {
1721        // Tight gaps (2pt) across many bands → median = 2, 3×2 = 6 → clamped to 10
1722        let mut bands = Vec::new();
1723        for y in [700.0, 684.0, 668.0, 652.0] {
1724            let mut band = TextBand::new(span("abc", 0.0, y, 18.0));
1725            // right of "abc" = max(18, 12*0.5*3=18) = 18; gap = 20-18 = 2
1726            band.spans.push(span("def", 20.0, y, 18.0));
1727            bands.push(band);
1728        }
1729        let threshold = compute_adaptive_column_gap(&bands);
1730        assert!(
1731            (threshold - COLUMN_GAP_THRESHOLD_MIN).abs() < 0.01,
1732            "expected {COLUMN_GAP_THRESHOLD_MIN}, got {threshold}"
1733        );
1734    }
1735
1736    #[test]
1737    fn adaptive_column_gap_all_large_gaps_uses_fraction_of_min() {
1738        // When all gaps are large (> MIN), threshold = 0.75 × min_gap.
1739        let mut band = TextBand::new(span("Left", 0.0, 700.0, 30.0));
1740        band.spans.push(span("Right", 80.0, 700.0, 30.0)); // gap = 50
1741        let bands = vec![band];
1742        let threshold = compute_adaptive_column_gap(&bands);
1743        assert!(
1744            (threshold - 37.5).abs() < 0.01,
1745            "expected 37.5 (0.75×50), got {threshold}"
1746        );
1747    }
1748
1749    #[test]
1750    fn normalize_trims_trailing_whitespace_per_line() {
1751        assert_eq!(
1752            normalize_text_output("hello   \nworld  \n"),
1753            "hello\nworld\n"
1754        );
1755    }
1756
1757    #[test]
1758    fn normalize_collapses_excess_newlines() {
1759        // >2 blank lines collapse to 2 (meaning 3 \n in a row: line, blank, blank)
1760        assert_eq!(
1761            normalize_text_output("hello\n\n\n\n\nworld\n"),
1762            "hello\n\n\nworld\n"
1763        );
1764    }
1765
1766    #[test]
1767    fn normalize_preserves_double_newline() {
1768        assert_eq!(
1769            normalize_text_output("paragraph one\n\nparagraph two\n"),
1770            "paragraph one\n\nparagraph two\n"
1771        );
1772    }
1773
1774    #[test]
1775    fn normalize_preserves_form_feed() {
1776        assert_eq!(
1777            normalize_text_output("page1\n\n\x0Cpage2\n"),
1778            "page1\n\n\x0Cpage2\n"
1779        );
1780    }
1781
1782    #[test]
1783    fn normalize_adds_trailing_newline() {
1784        assert_eq!(normalize_text_output("hello"), "hello\n");
1785    }
1786
1787    #[test]
1788    fn normalize_empty_input() {
1789        assert_eq!(normalize_text_output(""), "");
1790    }
1791
1792    #[test]
1793    fn normalize_only_whitespace() {
1794        assert_eq!(normalize_text_output("   \n  \n"), "");
1795    }
1796
1797    // --- Hyphen stitching tests ---
1798
1799    #[test]
1800    fn hyphen_stitch_joins_wrapped_word() {
1801        let lines = vec!["the aver-".into(), "age rainfall".into()];
1802        assert_eq!(stitch_hyphenated_lines(&lines), "the average rainfall");
1803    }
1804
1805    #[test]
1806    fn hyphen_stitch_handles_leading_whitespace() {
1807        let lines = vec!["pre-".into(), "   dict the outcome".into()];
1808        // "pre" is only 3 chars → meets >= 3 guard
1809        assert_eq!(stitch_hyphenated_lines(&lines), "predict the outcome");
1810    }
1811
1812    #[test]
1813    fn hyphen_stitch_capital_continuation_not_stitched() {
1814        let lines = vec!["Section three-".into(), "Summary here".into()];
1815        assert_eq!(
1816            stitch_hyphenated_lines(&lines),
1817            "Section three-\nSummary here"
1818        );
1819    }
1820
1821    #[test]
1822    fn hyphen_stitch_bullet_dash_not_stitched() {
1823        // "-" alone: char before hyphen is not alphabetic
1824        let lines = vec!["Items:".into(), "-".into(), "milk".into()];
1825        assert_eq!(stitch_hyphenated_lines(&lines), "Items:\n-\nmilk");
1826    }
1827
1828    #[test]
1829    fn hyphen_stitch_numeric_range_not_stitched() {
1830        // "42-" — char before hyphen is digit, not alphabetic
1831        let lines = vec!["page 42-".into(), "seventy".into()];
1832        assert_eq!(stitch_hyphenated_lines(&lines), "page 42-\nseventy");
1833    }
1834
1835    #[test]
1836    fn hyphen_stitch_short_prefix_not_stitched() {
1837        // "re-" only 2 alpha chars before hyphen → below 3-char guard
1838        let lines = vec!["re-".into(), "organize".into()];
1839        assert_eq!(stitch_hyphenated_lines(&lines), "re-\norganize");
1840    }
1841
1842    #[test]
1843    fn hyphen_stitch_short_continuation_not_stitched() {
1844        // Next line starts with "an" (2 chars) → below 3-char guard
1845        let lines = vec!["counter-".into(), "an example".into()];
1846        assert_eq!(stitch_hyphenated_lines(&lines), "counter-\nan example");
1847    }
1848
1849    #[test]
1850    fn hyphen_stitch_compound_word_midline_preserved() {
1851        // "real-time" is mid-line, not end-of-line — no stitching applies
1852        // because stitch only operates on line boundaries
1853        let lines = vec!["real-time system".into()];
1854        assert_eq!(stitch_hyphenated_lines(&lines), "real-time system");
1855    }
1856
1857    #[test]
1858    fn hyphen_stitch_single_line_unchanged() {
1859        let lines = vec!["only line".into()];
1860        assert_eq!(stitch_hyphenated_lines(&lines), "only line");
1861    }
1862
1863    #[test]
1864    fn hyphen_stitch_empty_input() {
1865        let lines: Vec<String> = vec![];
1866        assert_eq!(stitch_hyphenated_lines(&lines), "");
1867    }
1868
1869    // --- TEX1 multi-signal space consensus tests ---
1870
1871    fn make_device_with_median(median: f64) -> TextExtractionDevice {
1872        let mut dev = TextExtractionDevice::new();
1873        // Seed enough samples for the median to resolve to `median`.
1874        for _ in 0..MEDIAN_REFRESH {
1875            dev.glyph_widths.push(median);
1876        }
1877        dev.refresh_median_char_width();
1878        assert!((dev.cached_median_char_width - median).abs() < 1e-9);
1879        dev
1880    }
1881
1882    #[test]
1883    fn consensus_inserts_space_on_strong_tj_offset_alone() {
1884        // Gap is below the geometric threshold, but the TJ offset is large
1885        // enough that the consensus must still fire.
1886        let mut dev = make_device_with_median(6.0);
1887        dev.pending_tj_offset = 250.0; // full em-space
1888        assert!(dev.evaluate_space_consensus(0.5, 12.0, "Hello", "World"));
1889    }
1890
1891    #[test]
1892    fn consensus_inserts_space_on_geometric_gap_alone() {
1893        // No TJ, no character transition, but a clearly wide geometric gap.
1894        let dev = make_device_with_median(6.0);
1895        // gap > 0.3 * 6.0 = 1.8 → fires gap signal (0.80), below threshold
1896        // on its own? 0.80 < 0.75 threshold? No, 0.80 > 0.75, so it fires.
1897        assert!(dev.evaluate_space_consensus(2.5, 12.0, "hello", "world"));
1898    }
1899
1900    #[test]
1901    fn consensus_no_space_on_kerning_gap() {
1902        // Small kerning-size gap with no other signals must not inject a
1903        // space (regression guard against false-positive spaces inside
1904        // tightly kerned words).
1905        let dev = make_device_with_median(6.0);
1906        assert!(!dev.evaluate_space_consensus(0.5, 12.0, "fi", "lm"));
1907    }
1908
1909    #[test]
1910    fn consensus_inserts_space_on_camel_case_plus_gap() {
1911        // CamelCase heuristic (0.60) alone doesn't reach threshold, but a
1912        // moderate gap (0.60 gap + 0.60 heuristic if gap fires) should.
1913        // Here gap = 2.5 > 1.8 → gap fires → total 0.80 + 0.60 = 1.40.
1914        let dev = make_device_with_median(6.0);
1915        assert!(dev.evaluate_space_consensus(2.5, 12.0, "helloWorld", "Inc"));
1916    }
1917
1918    #[test]
1919    fn consensus_inserts_space_on_digit_letter_transition_with_gap() {
1920        let dev = make_device_with_median(6.0);
1921        assert!(dev.evaluate_space_consensus(2.5, 12.0, "123", "abc"));
1922    }
1923
1924    #[test]
1925    fn consensus_heuristic_alone_is_insufficient() {
1926        // Heuristic (0.60) on its own is below the 0.75 threshold — the
1927        // design deliberately requires a second corroborating signal to
1928        // avoid gluing spaces into existing CamelCase identifiers that
1929        // have no geometric break.
1930        let dev = make_device_with_median(6.0);
1931        assert!(!dev.evaluate_space_consensus(0.5, 12.0, "camel", "Case"));
1932    }
1933
1934    #[test]
1935    fn consensus_falls_back_to_font_size_when_no_median() {
1936        // No samples → median is 0; geometric reference uses font-size.
1937        let dev = TextExtractionDevice::new();
1938        // gap 1.9 > 0.15 * 12.0 = 1.8 → gap signal fires
1939        assert!(dev.evaluate_space_consensus(1.9, 12.0, "a", "b"));
1940        // gap 1.5 < 1.8 → no signal
1941        assert!(!dev.evaluate_space_consensus(1.5, 12.0, "a", "b"));
1942    }
1943
1944    #[test]
1945    fn consensus_ignores_tiny_tj_offsets() {
1946        // TJ offsets below the threshold are kerning, not word breaks.
1947        let mut dev = make_device_with_median(6.0);
1948        dev.pending_tj_offset = 50.0;
1949        assert!(!dev.evaluate_space_consensus(0.5, 12.0, "Hello", "World"));
1950    }
1951
1952    #[test]
1953    fn consensus_accepts_negative_tj_offsets() {
1954        // A negative TJ offset still represents an explicit inter-substring
1955        // shift and counts toward the consensus (|amount| check).
1956        let mut dev = make_device_with_median(6.0);
1957        dev.pending_tj_offset = -250.0;
1958        assert!(dev.evaluate_space_consensus(0.5, 12.0, "Hello", "World"));
1959    }
1960
1961    #[test]
1962    fn text_adjustment_accumulates_until_glyph() {
1963        let mut dev = TextExtractionDevice::new();
1964        dev.text_adjustment(120.0);
1965        dev.text_adjustment(140.0);
1966        assert!((dev.pending_tj_offset - 260.0).abs() < 1e-6);
1967    }
1968
1969    // --- TEX2 XY-Cut tests ---
1970
1971    #[test]
1972    fn xy_cut_header_body_footer_with_two_columns() {
1973        // Header and footer sit in the mid-x range that would
1974        // accidentally fall into a left-column bucket with a naive
1975        // vertical-first cut. The largest-gap-first rule plus the
1976        // alignment guard ensure header and footer bracket the
1977        // columnar body.
1978        let texts = block_texts(vec![
1979            span("HEADLINE TITLE", 180.0, 760.0, 120.0),
1980            span("Left col line A", 40.0, 700.0, 110.0),
1981            span("Right col line A", 320.0, 700.0, 115.0),
1982            span("Left col line B", 40.0, 684.0, 110.0),
1983            span("Right col line B", 320.0, 684.0, 115.0),
1984            span("Left col line C", 40.0, 668.0, 110.0),
1985            span("Right col line C", 320.0, 668.0, 115.0),
1986            span("FOOTER LINE TEXT", 180.0, 600.0, 120.0),
1987        ]);
1988        assert_eq!(texts.first().map(String::as_str), Some("HEADLINE TITLE"));
1989        assert_eq!(texts.last().map(String::as_str), Some("FOOTER LINE TEXT"));
1990        // Left column lines all come before right column lines.
1991        let left_c_idx = texts.iter().position(|s| s == "Left col line C").unwrap();
1992        let right_a_idx = texts.iter().position(|s| s == "Right col line A").unwrap();
1993        assert!(
1994            left_c_idx < right_a_idx,
1995            "expected column-major ordering in body: {texts:?}"
1996        );
1997    }
1998
1999    #[test]
2000    fn xy_cut_rejects_column_split_on_table_rows() {
2001        // The density guard must still reject the 280pt inter-cell gap
2002        // in a short-cell table, preserving row-major reading order.
2003        let texts = block_texts(vec![
2004            span("Name", 40.0, 700.0, 30.0),
2005            span("Age", 320.0, 700.0, 20.0),
2006            span("Alice", 40.0, 684.0, 35.0),
2007            span("30", 320.0, 684.0, 15.0),
2008        ]);
2009        assert_eq!(texts, vec!["Name Age", "Alice 30"]);
2010    }
2011
2012    #[test]
2013    fn xy_cut_rejects_column_split_when_one_band_is_full_width() {
2014        // The alignment guard catches a full-width paragraph that
2015        // would otherwise be forced into the left column of a 2-column
2016        // region below it.
2017        let texts = block_texts(vec![
2018            span(
2019                "Full width intro spanning both columns here",
2020                40.0,
2021                740.0,
2022                360.0,
2023            ),
2024            span("Left A", 40.0, 700.0, 50.0),
2025            span("Right A", 320.0, 700.0, 50.0),
2026            span("Left B", 40.0, 684.0, 50.0),
2027            span("Right B", 320.0, 684.0, 50.0),
2028        ]);
2029        assert!(
2030            texts[0].contains("Full width intro"),
2031            "expected full-width intro first: {texts:?}"
2032        );
2033    }
2034
2035    #[test]
2036    fn xy_cut_horizontal_split_for_zone_boundaries() {
2037        // Pure horizontal cut on a single-column page with a big
2038        // vertical gap between paragraphs — the cut fires and both
2039        // paragraphs stay in their own blocks.
2040        let texts = block_texts(vec![
2041            span("First paragraph body text", 40.0, 740.0, 200.0),
2042            span("Second paragraph body", 40.0, 680.0, 180.0),
2043        ]);
2044        assert_eq!(texts.len(), 2);
2045        assert!(texts[0].starts_with("First"));
2046        assert!(texts[1].starts_with("Second"));
2047    }
2048
2049    #[test]
2050    fn xy_cut_recursion_terminates_with_single_span() {
2051        let texts = block_texts(vec![span("Only one span on the page", 40.0, 700.0, 180.0)]);
2052        assert_eq!(texts, vec!["Only one span on the page"]);
2053    }
2054
2055    #[test]
2056    fn median_font_size_handles_mixed_sizes() {
2057        let spans = vec![
2058            TextSpan {
2059                text: "small".into(),
2060                x: 0.0,
2061                y: 0.0,
2062                width: 10.0,
2063                height: 8.0,
2064                font_size: 8.0,
2065            },
2066            TextSpan {
2067                text: "medium".into(),
2068                x: 0.0,
2069                y: 0.0,
2070                width: 10.0,
2071                height: 12.0,
2072                font_size: 12.0,
2073            },
2074            TextSpan {
2075                text: "large".into(),
2076                x: 0.0,
2077                y: 0.0,
2078                width: 10.0,
2079                height: 24.0,
2080                font_size: 24.0,
2081            },
2082        ];
2083        assert!((median_font_size(&spans) - 12.0).abs() < 1e-9);
2084    }
2085
2086    #[test]
2087    fn columns_band_aligned_accepts_aligned_columns() {
2088        let spans = vec![
2089            span("L1", 40.0, 700.0, 60.0),
2090            span("R1", 300.0, 700.0, 60.0),
2091            span("L2", 40.0, 684.0, 60.0),
2092            span("R2", 300.0, 684.0, 60.0),
2093        ];
2094        let stats = PageStats::from_spans(&spans);
2095        // cut_x between 100 and 300 → 200. Every band straddles the cut.
2096        assert!(columns_are_band_aligned(&spans, 200.0, 40.0, 360.0, &stats));
2097    }
2098
2099    #[test]
2100    fn columns_band_aligned_rejects_wide_single_side_band() {
2101        let spans = vec![
2102            span("Wide banner line across top", 40.0, 740.0, 280.0),
2103            span("L1", 40.0, 700.0, 60.0),
2104            span("R1", 300.0, 700.0, 60.0),
2105        ];
2106        let stats = PageStats::from_spans(&spans);
2107        // cut_x = 200. Banner only in left group (midpoint < 200). Width
2108        // exceeds 0.7 × left column width → rejected.
2109        assert!(!columns_are_band_aligned(
2110            &spans, 200.0, 40.0, 360.0, &stats
2111        ));
2112    }
2113
2114    #[test]
2115    fn page_stats_computes_median_values() {
2116        let spans = vec![
2117            span("one", 40.0, 700.0, 30.0),
2118            span("two", 40.0, 680.0, 30.0),
2119            span("three", 40.0, 660.0, 50.0),
2120        ];
2121        let stats = PageStats::from_spans(&spans);
2122        assert!((stats.median_font_size - 12.0).abs() < 1e-9);
2123        // char width = width / chars. one=30/3=10, two=30/3=10, three=50/5=10. median=10.
2124        assert!((stats.median_char_width - 10.0).abs() < 1e-9);
2125        // line spacing: bands at 700, 680, 660. gaps = 20, 20. median = 20.
2126        assert!((stats.median_line_spacing - 20.0).abs() < 1e-9);
2127    }
2128
2129    #[test]
2130    fn page_stats_handles_empty_input() {
2131        let stats = PageStats::from_spans(&[]);
2132        assert!((stats.median_font_size - 12.0).abs() < 1e-9);
2133        assert!((stats.median_char_width - 6.0).abs() < 1e-9);
2134        assert_eq!(stats.median_line_spacing, 0.0);
2135    }
2136
2137    #[test]
2138    fn narrow_gutter_detected_with_adaptive_threshold() {
2139        // Academic paper layout: 12pt gutter between columns.
2140        // With old fixed 20pt threshold, this was not detected as columnar.
2141        // With adaptive: median word gap ~4pt, threshold = 12pt → detects 12pt gutter.
2142        let mut spans = Vec::new();
2143        for y in [700.0, 684.0, 668.0] {
2144            // Left column: two words with 4pt gap, ending at x=145
2145            spans.push(span("Lorem ipsum", 40.0, y, 100.0));
2146            spans.push(span("dolor sit", 144.0, y, 80.0));
2147            // Right column starts at 236 (gap = 12pt from 224)
2148            spans.push(span("amet consec", 236.0, y, 100.0));
2149            spans.push(span("tetur adipi", 340.0, y, 80.0));
2150        }
2151        let texts = block_texts(spans);
2152        // Should detect 2-column layout and read column-major
2153        assert!(
2154            texts.len() >= 6,
2155            "expected column-major output, got {texts:?}"
2156        );
2157        // First three blocks should be left column lines
2158        assert!(
2159            texts[0].contains("Lorem"),
2160            "first block should be left column: {texts:?}"
2161        );
2162    }
2163
2164    #[test]
2165    fn xy_cut_leaf_falls_back_to_legacy_columns_for_header_plus_three_columns() {
2166        let texts = block_texts(vec![
2167            span("73022", 45.0, 750.0, 70.0),
2168            span("Federal Register banner", 125.6, 750.0, 260.0),
2169            span("Left column line one", 45.0, 725.0, 140.0),
2170            span("Middle column line one", 222.0, 725.0, 140.0),
2171            span("Right column line one", 399.0, 725.0, 120.0),
2172            span("Left column line two", 45.0, 715.0, 140.0),
2173            span("Middle column line two", 210.0, 715.0, 152.0),
2174            span("Right column line two", 388.0, 715.0, 132.0),
2175            span("Left column line three", 45.0, 705.0, 140.0),
2176            span("Middle column line three", 235.0, 705.0, 135.0),
2177            span("Right column line three", 408.0, 705.0, 118.0),
2178        ]);
2179
2180        assert_eq!(
2181            texts,
2182            vec![
2183                "73022 Federal Register banner",
2184                "Left column line one",
2185                "Left column line two",
2186                "Left column line three",
2187                "Middle column line one",
2188                "Middle column line two",
2189                "Middle column line three",
2190                "Right column line one",
2191                "Right column line two",
2192                "Right column line three",
2193            ]
2194        );
2195    }
2196
2197    #[test]
2198    fn overlapping_fake_bold_spans_collapse_to_single_copy() {
2199        let texts = block_texts(vec![
2200            span("1 This is fakebold text.", 25.9, 785.3, 320.0),
2201            span("1 This is fakebold text.", 26.2, 785.3, 320.0),
2202            span("1 This is fakebold text.", 26.4, 785.3, 320.0),
2203            span("1 This is fakebold text.", 26.7, 785.3, 320.0),
2204            span("2 This is a fakebold", 27.0, 714.8, 142.0),
2205            span(" fakebold", 169.8, 714.8, 70.0),
2206            span(" fakebold", 170.1, 714.8, 70.0),
2207            span(" fakebold word.", 170.4, 714.8, 110.0),
2208        ]);
2209
2210        assert_eq!(
2211            texts,
2212            vec!["1 This is fakebold text.", "2 This is a fakebold word.",]
2213        );
2214    }
2215}