Skip to main content

pdf_engine/
text.rs

1//! Text extraction via a custom Device implementation.
2
3use kurbo::{Affine, BezPath};
4use pdf_render::pdf_interpret::cmap::BfString;
5use pdf_render::pdf_interpret::font::Glyph;
6use pdf_render::pdf_interpret::{
7    BlendMode, ClipPath, Device, GlyphDrawMode, Image, Paint, PathDrawMode, SoftMask,
8};
9use std::cmp::Ordering;
10
11/// Minimum Y tolerance for grouping spans into horizontal bands. The
12/// effective tolerance is typically `median_font_size * BAND_Y_FRACTION`
13/// per ANN[r17/TEX4]; this constant acts as the absolute floor.
14const BAND_Y_TOLERANCE: f64 = 5.0;
15/// Fraction of the page's median font size used as the band-Y
16/// tolerance. Empirically 0.30× works across common typography —
17/// below typical leading (~1.2×) so adjacent lines never collapse,
18/// above sub-pixel baseline drift.
19const BAND_Y_FRACTION: f64 = 0.30;
20/// Multiplier applied to median line spacing to derive the horizontal
21/// paragraph-break cut threshold. Normal line-to-line progression is
22/// ~1.0× the median; paragraph breaks typically show 1.5× or more.
23const PARAGRAPH_BREAK_LINE_SPACING_MULTIPLIER: f64 = 1.8;
24
25// ANN[r17/TEX1] Multi-signal consensus thresholds.
26// The previous single-threshold scheme (gap > 0.15 * font_size) missed
27// word boundaries when kerning or narrow fonts produced small measured
28// gaps even though the PDF emitted an explicit TJ backward shift, and
29// over-emitted spaces for condensed fonts where 0.15em of kerning is
30// well below an actual word space. The consensus system weights three
31// signals and inserts a space when the combined confidence exceeds
32// SPACE_CONSENSUS_THRESHOLD.
33/// Raw TJ backward adjustment (positive in PDF TJ units) that is
34/// definitively a word break. Matches pdftotext / MuPDF heuristics —
35/// a space glyph is typically emitted as either a literal 0x20 or as
36/// a TJ adjustment of around 250 1/1000 em. 100 units is a safely
37/// conservative floor.
38const TJ_SPACE_THRESHOLD_UNITS: f32 = 100.0;
39/// Weight of the TJ offset signal when confidence is high.
40const TJ_SIGNAL_WEIGHT: f64 = 0.95;
41/// Weight of the purely geometric gap signal.
42const GAP_SIGNAL_WEIGHT: f64 = 0.80;
43/// Weight of character-heuristic signals (CamelCase, digit↔letter).
44const HEURISTIC_SIGNAL_WEIGHT: f64 = 0.60;
45/// Combined weight at which a space is inserted.
46const SPACE_CONSENSUS_THRESHOLD: f64 = 0.75;
47/// Fraction of a median character width above which a gap contributes
48/// to the geometric signal (pdf_oxide uses ~0.30).
49const GAP_TO_MEDIAN_CHAR_FRACTION: f64 = 0.30;
50/// Fallback gap fraction relative to `font_size` when the running
51/// median character width has not yet been established.
52const GAP_TO_FONT_SIZE_FALLBACK_FRACTION: f64 = 0.15;
53
54/// Minimum horizontal gap treated as a column gutter (adaptive fallback).
55const COLUMN_GAP_THRESHOLD_MIN: f64 = 10.0;
56/// Maximum adaptive column gap threshold.
57const COLUMN_GAP_THRESHOLD_MAX: f64 = 40.0;
58/// Multiplier applied to median inter-word gap to derive column threshold.
59const COLUMN_GAP_MEDIAN_MULTIPLIER: f64 = 3.0;
60/// Fallback column gap threshold when median cannot be computed.
61const COLUMN_GAP_THRESHOLD_FALLBACK: f64 = 20.0;
62/// Maximum drift allowed when matching gutters across neighboring bands.
63const COLUMN_GAP_MATCH_TOLERANCE: f64 = 12.0;
64/// Minimum number of gapped bands required before we enable column mode.
65const MIN_COLUMN_GAPPED_BANDS: usize = 3;
66/// Minimum fraction of bands in a region that must expose the shared gutters.
67const MIN_COLUMN_GAP_SUPPORT: f64 = 0.80;
68/// Minimum fraction of non-empty column slices that must look like prose.
69const MIN_DENSE_SLICE_RATIO: f64 = 0.35;
70
71/// A single text span at a specific position.
72#[derive(Debug, Clone)]
73pub struct TextSpan {
74    /// The extracted text.
75    pub text: String,
76    /// X position in user space.
77    pub x: f64,
78    /// Y position in user space.
79    pub y: f64,
80    /// Approximate bounding-box width in user space.
81    pub width: f64,
82    /// Approximate bounding-box height in user space.
83    pub height: f64,
84    /// Font size (approximate, from transform).
85    pub font_size: f64,
86}
87
88impl TextSpan {
89    /// Conservative right edge using whichever is wider: measured or estimated.
90    /// Used by column detection to avoid underestimating span extent.
91    fn right(&self) -> f64 {
92        self.x + self.width.max(self.estimated_width())
93    }
94
95    /// Right edge from measured glyph positions only.
96    fn measured_right(&self) -> f64 {
97        self.x + self.width
98    }
99
100    fn estimated_width(&self) -> f64 {
101        let char_count = self.text.chars().count() as f64;
102        if char_count <= 0.0 {
103            self.font_size * 0.5
104        } else {
105            self.font_size * 0.5 * char_count
106        }
107    }
108}
109
110/// A block of text (grouped by reading order).
111#[derive(Debug, Clone)]
112pub struct TextBlock {
113    /// Spans within this block, sorted by position.
114    pub spans: Vec<TextSpan>,
115}
116
117impl TextBlock {
118    /// Concatenate all spans into a single string.
119    ///
120    /// Spans that are close together are joined without a separator;
121    /// a space is inserted when the gap between spans exceeds half
122    /// the average character width.
123    pub fn text(&self) -> String {
124        if self.spans.is_empty() {
125            return String::new();
126        }
127        let mut result = self.spans[0].text.clone();
128        for pair in self.spans.windows(2) {
129            let prev = &pair[0];
130            let curr = &pair[1];
131            let expected_end = prev.measured_right();
132            let gap = curr.x - expected_end;
133            if gap <= prev.font_size * 0.12 {
134                if let Some(trimmed) = trim_overlapping_word_prefix(&prev.text, &curr.text) {
135                    result.push_str(&trimmed);
136                    continue;
137                }
138            }
139            if gap > prev.font_size * 0.25 {
140                result.push(' ');
141            }
142            result.push_str(&curr.text);
143        }
144        result
145    }
146}
147
148#[derive(Debug, Clone)]
149struct TextBand {
150    y: f64,
151    spans: Vec<TextSpan>,
152}
153
154impl TextBand {
155    fn new(span: TextSpan) -> Self {
156        Self {
157            y: span.y,
158            spans: vec![span],
159        }
160    }
161
162    fn sort_spans(&mut self) {
163        self.spans.sort_by(|a, b| {
164            a.x.partial_cmp(&b.x)
165                .unwrap_or(Ordering::Equal)
166                .then_with(|| b.y.partial_cmp(&a.y).unwrap_or(Ordering::Equal))
167        });
168        collapse_overprinted_spans(&mut self.spans);
169    }
170
171    fn row_block(&self) -> TextBlock {
172        let mut spans = self.spans.clone();
173        spans.sort_by(|a, b| a.x.partial_cmp(&b.x).unwrap_or(Ordering::Equal));
174        TextBlock { spans }
175    }
176
177    fn left(&self) -> f64 {
178        self.spans
179            .iter()
180            .map(|span| span.x)
181            .fold(f64::INFINITY, f64::min)
182    }
183
184    fn right(&self) -> f64 {
185        self.spans
186            .iter()
187            .map(TextSpan::right)
188            .fold(f64::NEG_INFINITY, f64::max)
189    }
190
191    fn width(&self) -> f64 {
192        (self.right() - self.left()).max(0.0)
193    }
194
195    fn gap_midpoints(&self, column_gap_threshold: f64) -> Vec<f64> {
196        self.gaps(column_gap_threshold)
197            .into_iter()
198            .map(|gap| (gap.start + gap.end) * 0.5)
199            .collect()
200    }
201
202    fn gaps(&self, column_gap_threshold: f64) -> Vec<BandGap> {
203        if self.spans.len() < 2 {
204            return Vec::new();
205        }
206
207        let mut spans = self.spans.clone();
208        spans.sort_by(|a, b| a.x.partial_cmp(&b.x).unwrap_or(Ordering::Equal));
209
210        let mut gaps = Vec::new();
211        let mut prev_right = spans[0].right();
212        for span in spans.iter().skip(1) {
213            let gap = span.x - prev_right;
214            if gap >= column_gap_threshold {
215                gaps.push(BandGap {
216                    start: prev_right,
217                    end: span.x,
218                });
219            }
220            prev_right = prev_right.max(span.right());
221        }
222
223        gaps
224    }
225
226    fn split_by_boundaries(&self, boundaries: &[f64]) -> Vec<Vec<TextSpan>> {
227        let mut columns = vec![Vec::new(); boundaries.len() + 1];
228        for span in &self.spans {
229            let center_x = span.x + span.width.max(span.estimated_width()) * 0.5;
230            let column_idx = boundaries
231                .iter()
232                .position(|boundary| center_x < *boundary)
233                .unwrap_or(boundaries.len());
234            columns[column_idx].push(span.clone());
235        }
236
237        for spans in &mut columns {
238            spans.sort_by(|a, b| a.x.partial_cmp(&b.x).unwrap_or(Ordering::Equal));
239        }
240
241        columns
242    }
243
244    fn fits_single_column(
245        &self,
246        boundaries: &[f64],
247        region_left: f64,
248        region_right: f64,
249    ) -> Option<usize> {
250        let mut column_idx: Option<usize> = None;
251        for span in &self.spans {
252            let left = span.x;
253            let right = span.right();
254            if boundaries
255                .iter()
256                .any(|boundary| left < *boundary && right > *boundary)
257            {
258                return None;
259            }
260
261            let center_x = left + (right - left) * 0.5;
262            let idx = boundaries
263                .iter()
264                .position(|boundary| center_x < *boundary)
265                .unwrap_or(boundaries.len());
266            match column_idx {
267                Some(existing) if existing != idx => return None,
268                Some(_) => {}
269                None => column_idx = Some(idx),
270            }
271        }
272        let idx = column_idx?;
273        let mut edges = Vec::with_capacity(boundaries.len() + 2);
274        edges.push(region_left);
275        edges.extend_from_slice(boundaries);
276        edges.push(region_right);
277
278        let column_width = (edges[idx + 1] - edges[idx]).max(0.0);
279        if column_width <= 0.0 || self.width() > column_width * 0.8 {
280            return None;
281        }
282
283        Some(idx)
284    }
285}
286
287#[derive(Debug, Clone, Copy)]
288struct BandGap {
289    start: f64,
290    end: f64,
291}
292
293/// A Device implementation that captures text from draw_glyph calls.
294///
295/// ANN[r17/TEX1][r17/TEX3] Space detection uses a multi-signal consensus
296/// rather than a single geometric threshold. Three signals vote:
297///   1. `pending_tj_offset`  — raw TJ backward shift surfaced by the
298///      interpreter (confidence 0.95). This is the definitive word-break
299///      signal used by pdftotext / MuPDF.
300///   2. geometric gap        — measured horizontal distance between the
301///      previous glyph's right edge and this glyph's origin (confidence
302///      0.80). Compared against the running median glyph width rather
303///      than a flat em-fraction so condensed/wide fonts are handled
304///      uniformly.
305///   3. character heuristic  — CamelCase transition or digit↔letter
306///      transition at the merge point (confidence 0.60). Catches cases
307///      where the writer relied on typography (e.g. table cells glued
308///      with zero gap: `Qty1Price$5`).
309///
310/// A space is inserted when the weighted sum meets SPACE_CONSENSUS_THRESHOLD.
311/// Span accumulation still merges adjacent glyphs into one TextSpan (TEX3)
312/// so downstream reading-order logic sees logical text runs, not individual
313/// character positions.
314pub(crate) struct TextExtractionDevice {
315    spans: Vec<TextSpan>,
316    last_y: f64,
317    last_end_x: f64,
318    /// TJ adjustment in raw 1/1000 em units since the last glyph was
319    /// drawn. Positive values = backward shift (i.e., explicit horizontal
320    /// space). Reset every time a glyph is drawn.
321    pending_tj_offset: f32,
322    /// Running sample of measured glyph widths used as the adaptive
323    /// reference for the geometric gap signal. Cheap to maintain and
324    /// avoids having to re-walk all spans per decision.
325    glyph_widths: Vec<f64>,
326    /// Cached median glyph width (kept fresh every `MEDIAN_REFRESH`
327    /// insertions). Zero = not yet established, caller falls back to
328    /// font-size scaling.
329    cached_median_char_width: f64,
330}
331
332const MEDIAN_REFRESH: usize = 32;
333
334impl Default for TextExtractionDevice {
335    fn default() -> Self {
336        Self::new()
337    }
338}
339
340impl TextExtractionDevice {
341    /// Create a new text extraction device.
342    pub fn new() -> Self {
343        Self {
344            spans: Vec::new(),
345            last_y: f64::NEG_INFINITY,
346            last_end_x: f64::NEG_INFINITY,
347            pending_tj_offset: 0.0,
348            glyph_widths: Vec::new(),
349            cached_median_char_width: 0.0,
350        }
351    }
352
353    /// Refresh the cached median char width. Called lazily from
354    /// `draw_glyph` to keep the hot path cheap.
355    fn refresh_median_char_width(&mut self) {
356        if self.glyph_widths.is_empty() {
357            self.cached_median_char_width = 0.0;
358            return;
359        }
360        let mut sorted = self.glyph_widths.clone();
361        sorted.sort_by(|a, b| a.partial_cmp(b).unwrap_or(Ordering::Equal));
362        self.cached_median_char_width = sorted[sorted.len() / 2];
363    }
364
365    /// Decide whether a space should be glued between two glyphs within
366    /// the same span. Returns (insert_space, start_new_span).
367    fn evaluate_space_consensus(
368        &self,
369        gap: f64,
370        font_size: f64,
371        prev_text: &str,
372        next_text: &str,
373    ) -> bool {
374        let mut confidence = 0.0;
375
376        // Signal 1 — TJ offset (highest confidence). Raw units; a full
377        // space is ~250. Anything over TJ_SPACE_THRESHOLD_UNITS counts.
378        if self.pending_tj_offset.abs() >= TJ_SPACE_THRESHOLD_UNITS {
379            confidence += TJ_SIGNAL_WEIGHT;
380        }
381
382        // Signal 2 — geometric gap. Prefer the adaptive median-char-width
383        // reference; fall back to font-size when the median hasn't been
384        // established yet (first few glyphs on a page).
385        let gap_reference = if self.cached_median_char_width > 0.0 {
386            self.cached_median_char_width * GAP_TO_MEDIAN_CHAR_FRACTION
387        } else {
388            font_size * GAP_TO_FONT_SIZE_FALLBACK_FRACTION
389        };
390        if gap > gap_reference {
391            confidence += GAP_SIGNAL_WEIGHT;
392        }
393
394        // Signal 3 — character-class transitions. Only checked when the
395        // previous span ends with a character and the incoming text starts
396        // with one; avoids double-counting with punctuation.
397        if let (Some(prev_last), Some(next_first)) =
398            (prev_text.chars().last(), next_text.chars().next())
399        {
400            let camel = prev_last.is_lowercase() && next_first.is_uppercase();
401            let digit_to_letter = prev_last.is_ascii_digit() && next_first.is_alphabetic();
402            let letter_to_digit = prev_last.is_alphabetic() && next_first.is_ascii_digit();
403            if camel || digit_to_letter || letter_to_digit {
404                confidence += HEURISTIC_SIGNAL_WEIGHT;
405            }
406        }
407
408        confidence >= SPACE_CONSENSUS_THRESHOLD
409    }
410
411    /// Consume the device and return extracted text as a single string.
412    pub fn into_text(self) -> String {
413        let blocks = group_spans_into_blocks(self.spans);
414        let lines: Vec<String> = blocks.iter().map(|b| b.text()).collect();
415        let stitched = stitch_hyphenated_lines(&lines);
416        normalize_text_output(&stitched)
417    }
418
419    /// Consume the device and return text blocks.
420    pub fn into_blocks(self) -> Vec<TextBlock> {
421        group_spans_into_blocks(self.spans)
422    }
423
424    /// Consume the device and return raw spans.
425    #[allow(dead_code)]
426    pub(crate) fn into_spans(self) -> Vec<TextSpan> {
427        self.spans
428    }
429}
430
431impl Device<'_> for TextExtractionDevice {
432    fn set_soft_mask(&mut self, _: Option<SoftMask<'_>>) {}
433    fn set_blend_mode(&mut self, _: BlendMode) {}
434    fn draw_path(&mut self, _: &BezPath, _: Affine, _: &Paint<'_>, _: &PathDrawMode) {}
435    fn push_clip_path(&mut self, _: &ClipPath) {}
436    fn push_transparency_group(&mut self, _: f32, _: Option<SoftMask<'_>>, _: BlendMode) {}
437    fn draw_image(&mut self, _: Image<'_, '_>, _: Affine) {}
438    fn pop_clip_path(&mut self) {}
439    fn pop_transparency_group(&mut self) {}
440
441    fn draw_glyph(
442        &mut self,
443        glyph: &Glyph<'_>,
444        transform: Affine,
445        glyph_transform: Affine,
446        _paint: &Paint<'_>,
447        _draw_mode: &GlyphDrawMode,
448    ) {
449        let text = match glyph.as_unicode() {
450            Some(BfString::Char(c)) => c.to_string(),
451            Some(BfString::String(s)) => s,
452            None => return,
453        };
454
455        let composed = transform * glyph_transform;
456        let coeffs = composed.as_coeffs();
457        let x = coeffs[4];
458        let y = coeffs[5];
459        let glyph_scale = (coeffs[0].powi(2) + coeffs[1].powi(2)).sqrt().abs();
460        let font_size = glyph_scale * 1000.0;
461        let glyph_width = estimate_glyph_width(glyph, font_size).max(font_size * 0.25);
462        let glyph_end_x = x + glyph_width;
463
464        // ANN[r17/TEX4] Feed the running sample used to derive the adaptive
465        // median character width. Capped to protect against pathological
466        // pages with hundreds of thousands of glyphs.
467        if self.glyph_widths.len() < 4096 {
468            self.glyph_widths.push(glyph_width);
469            if self.glyph_widths.len() % MEDIAN_REFRESH == 0 {
470                self.refresh_median_char_width();
471            }
472        }
473
474        let same_line = (y - self.last_y).abs() <= font_size.max(BAND_Y_TOLERANCE) * 0.35;
475        let gap = x - self.last_end_x;
476        let adjacent = same_line && gap >= -font_size * 0.25 && gap < font_size * 0.5;
477
478        if adjacent && !self.spans.is_empty() {
479            // ANN[r17/TEX1] Multi-signal consensus replaces the prior
480            // single-threshold rule (`gap > 0.15 * font_size`). The
481            // consensus evaluates TJ offset, geometric gap, and
482            // character-class transitions; a space is inserted only
483            // when the weighted sum meets SPACE_CONSENSUS_THRESHOLD.
484            // Decision is computed before the mutable borrow of `last`
485            // to keep the borrow checker happy.
486            let want_space = {
487                let last = self.spans.last().expect("checked non-empty");
488                !last.text.ends_with(' ')
489                    && !text.starts_with(' ')
490                    && self.evaluate_space_consensus(gap, font_size, &last.text, &text)
491            };
492            let last = self.spans.last_mut().expect("checked non-empty");
493            if want_space {
494                last.text.push(' ');
495            }
496            last.text.push_str(&text);
497            last.width = last.width.max(glyph_end_x - last.x);
498            last.height = last.height.max(font_size);
499            self.last_y = y;
500            self.last_end_x = glyph_end_x;
501            // ANN[r17/TEX1] Consume the TJ signal: it only counts for
502            // the one merge it preceded.
503            self.pending_tj_offset = 0.0;
504            return;
505        }
506
507        self.last_y = y;
508        self.last_end_x = glyph_end_x;
509        // ANN[r17/TEX1] Non-adjacent glyph starts a fresh span, so any
510        // pending TJ offset is about within-span word breaks and no longer
511        // meaningful here.
512        self.pending_tj_offset = 0.0;
513
514        self.spans.push(TextSpan {
515            text,
516            x,
517            y,
518            width: glyph_width,
519            height: font_size,
520            font_size,
521        });
522    }
523
524    // ANN[r17/TEX1] Record TJ offsets. Accumulate because a single
525    // inter-substring gap may be expressed as multiple numeric entries
526    // (rare, but legal per PDF §9.4.3). The next draw_glyph consumes
527    // the sum.
528    fn text_adjustment(&mut self, amount: f32) {
529        self.pending_tj_offset += amount;
530    }
531}
532
533fn estimate_glyph_width(glyph: &Glyph<'_>, font_size: f64) -> f64 {
534    match glyph {
535        Glyph::Outline(outline) => outline
536            .advance_width()
537            .map(|width| width as f64 / 1000.0 * font_size)
538            .unwrap_or(font_size * 0.5),
539        Glyph::Type3(_) => font_size * 0.5,
540    }
541}
542
543/// Collapse fake-bold / overprint duplicates inside one band.
544///
545/// Real-word corpus failures such as 0105.pdf draw the same text several times
546/// with sub-point x drift to simulate heavier weight. Text extraction should
547/// keep the most informative span once rather than concatenate every overprint.
548fn collapse_overprinted_spans(spans: &mut Vec<TextSpan>) {
549    if spans.len() < 2 {
550        return;
551    }
552
553    let mut deduped: Vec<TextSpan> = Vec::with_capacity(spans.len());
554    for span in spans.drain(..) {
555        if let Some(last) = deduped.last_mut() {
556            if spans_are_overprint_duplicates(last, &span) {
557                let choose_incoming = span.text.chars().count() > last.text.chars().count()
558                    || (span.text.chars().count() == last.text.chars().count()
559                        && span.width > last.width);
560                let preferred_text = if choose_incoming {
561                    span.text.clone()
562                } else {
563                    last.text.clone()
564                };
565                let left = last.x.min(span.x);
566                let right = last.right().max(span.right());
567                last.x = left;
568                last.y = (last.y + span.y) * 0.5;
569                last.width = (right - left).max(last.width).max(span.width);
570                last.height = last.height.max(span.height);
571                last.font_size = last.font_size.max(span.font_size);
572                last.text = preferred_text;
573                continue;
574            }
575        }
576
577        deduped.push(span);
578    }
579
580    *spans = deduped;
581}
582
583fn spans_are_overprint_duplicates(lhs: &TextSpan, rhs: &TextSpan) -> bool {
584    let lhs_text = lhs.text.trim();
585    let rhs_text = rhs.text.trim();
586    if lhs_text.is_empty() || rhs_text.is_empty() {
587        return false;
588    }
589
590    let same_baseline = (lhs.y - rhs.y).abs() <= lhs.font_size.max(rhs.font_size) * 0.12;
591    if !same_baseline {
592        return false;
593    }
594
595    let lhs_left = lhs.x;
596    let lhs_right = lhs.right();
597    let rhs_left = rhs.x;
598    let rhs_right = rhs.right();
599    let overlap = (lhs_right.min(rhs_right) - lhs_left.max(rhs_left)).max(0.0);
600    let min_width = (lhs_right - lhs_left).min(rhs_right - rhs_left).max(1.0);
601    let heavily_overlaps = overlap / min_width >= 0.85;
602    if !heavily_overlaps {
603        return false;
604    }
605
606    lhs_text == rhs_text || lhs_text.starts_with(rhs_text) || rhs_text.starts_with(lhs_text)
607}
608
609fn trim_overlapping_word_prefix(prev: &str, curr: &str) -> Option<String> {
610    let prev_chars: Vec<char> = prev.trim_end().chars().collect();
611    let curr_chars: Vec<char> = curr.trim_start().chars().collect();
612    let max = prev_chars.len().min(curr_chars.len());
613
614    for len in (4..=max).rev() {
615        let prev_start = prev_chars.len() - len;
616        if prev_chars[prev_start..] != curr_chars[..len] {
617            continue;
618        }
619
620        if !curr_chars[..len].iter().all(|ch| ch.is_alphanumeric()) {
621            continue;
622        }
623
624        let prev_boundary = prev_start == 0 || !prev_chars[prev_start - 1].is_alphanumeric();
625        let curr_boundary = len == curr_chars.len() || !curr_chars[len].is_alphanumeric();
626        if !prev_boundary || !curr_boundary {
627            continue;
628        }
629
630        return Some(curr_chars[len..].iter().collect());
631    }
632
633    None
634}
635
636/// Compute an adaptive column gap threshold from a set of bands.
637///
638/// Collects all positive inter-span gaps within each band, computes the
639/// median, and returns `COLUMN_GAP_MEDIAN_MULTIPLIER × median`, clamped to
640/// `[COLUMN_GAP_THRESHOLD_MIN, COLUMN_GAP_THRESHOLD_MAX]`.  Falls back to
641/// `COLUMN_GAP_THRESHOLD_FALLBACK` when there are no measurable gaps.
642fn compute_adaptive_column_gap(bands: &[TextBand]) -> f64 {
643    let mut all_gaps: Vec<f64> = Vec::new();
644
645    for band in bands {
646        if band.spans.len() < 2 {
647            continue;
648        }
649        let mut sorted = band.spans.clone();
650        sorted.sort_by(|a, b| a.x.partial_cmp(&b.x).unwrap_or(Ordering::Equal));
651        let mut prev_right = sorted[0].right();
652        for span in sorted.iter().skip(1) {
653            let gap = span.x - prev_right;
654            if gap > 0.0 {
655                all_gaps.push(gap);
656            }
657            prev_right = prev_right.max(span.right());
658        }
659    }
660
661    if all_gaps.is_empty() {
662        return COLUMN_GAP_THRESHOLD_FALLBACK;
663    }
664
665    all_gaps.sort_by(|a, b| a.partial_cmp(b).unwrap_or(Ordering::Equal));
666
667    let min_gap = all_gaps[0];
668
669    // When all inter-span gaps are already large (> MIN threshold), they are
670    // likely all column gaps — the draw_glyph merger absorbed word-level
671    // spaces into span text.  Use a fraction of the smallest gap so that
672    // ALL column gaps exceed the threshold.
673    if min_gap > COLUMN_GAP_THRESHOLD_MIN {
674        return (min_gap * 0.75).clamp(COLUMN_GAP_THRESHOLD_MIN, COLUMN_GAP_THRESHOLD_MAX);
675    }
676
677    // Look for a natural break: the largest relative jump between consecutive
678    // sorted gaps separates word-level gaps from column gaps.
679    let mut best_break_threshold = 0.0f64;
680    let mut best_ratio = 1.5f64; // require at least 1.5× jump
681    for pair in all_gaps.windows(2) {
682        if pair[0] > 0.5 {
683            let ratio = pair[1] / pair[0];
684            if ratio > best_ratio {
685                best_ratio = ratio;
686                best_break_threshold = (pair[0] + pair[1]) * 0.5;
687            }
688        }
689    }
690
691    if best_break_threshold > 0.0 {
692        return best_break_threshold.clamp(COLUMN_GAP_THRESHOLD_MIN, COLUMN_GAP_THRESHOLD_MAX);
693    }
694
695    // Fallback: median × multiplier.
696    let mid = all_gaps.len() / 2;
697    let median = if all_gaps.len() % 2 == 0 {
698        (all_gaps[mid - 1] + all_gaps[mid]) * 0.5
699    } else {
700        all_gaps[mid]
701    };
702
703    (median * COLUMN_GAP_MEDIAN_MULTIPLIER)
704        .clamp(COLUMN_GAP_THRESHOLD_MIN, COLUMN_GAP_THRESHOLD_MAX)
705}
706
707/// Group spans into reading-order blocks, using column-aware reordering when
708/// a contiguous region repeatedly exposes the same gutters.
709/// Per-page adaptive parameters derived from the span set before any
710/// grouping happens. Centralising these here (TEX4) means the rest of
711/// the pipeline — band grouping, XY-Cut cuts, in-block space insertion
712/// — all speak the same typographic baseline for this specific page,
713/// rather than each helper reaching for an independent fixed constant.
714#[derive(Debug, Clone, Copy)]
715struct PageStats {
716    /// Median font size across all spans (pt).
717    median_font_size: f64,
718    /// Median measured character width (pt). Zero-guarded fallback is
719    /// 0.5 × median_font_size when there aren't enough samples.
720    /// Currently populated for diagnostics / future tuning; allow dead_code
721    /// under `-D warnings` until a reader is added.
722    #[allow(dead_code)]
723    median_char_width: f64,
724    /// Tight line-to-line spacing (25th percentile of pairwise band
725    /// gaps), representing the body-text leading on this page. The
726    /// quartile is used instead of the median so large paragraph /
727    /// zone gaps don't inflate the baseline. Zero if the page has
728    /// only one band.
729    median_line_spacing: f64,
730}
731
732impl PageStats {
733    fn from_spans(spans: &[TextSpan]) -> Self {
734        if spans.is_empty() {
735            return Self {
736                median_font_size: 12.0,
737                median_char_width: 6.0,
738                median_line_spacing: 0.0,
739            };
740        }
741
742        // Median font size.
743        let mut sizes: Vec<f64> = spans.iter().map(|s| s.font_size).collect();
744        sizes.sort_by(|a, b| a.partial_cmp(b).unwrap_or(Ordering::Equal));
745        let median_font_size = sizes[sizes.len() / 2];
746
747        // Median char width — measured width / char count, per span.
748        let mut char_widths: Vec<f64> = spans
749            .iter()
750            .filter_map(|s| {
751                let chars = s.text.chars().count();
752                if chars > 0 && s.width > 0.0 {
753                    Some(s.width / chars as f64)
754                } else {
755                    None
756                }
757            })
758            .collect();
759        let median_char_width = if char_widths.is_empty() {
760            median_font_size * 0.5
761        } else {
762            char_widths.sort_by(|a, b| a.partial_cmp(b).unwrap_or(Ordering::Equal));
763            char_widths[char_widths.len() / 2]
764        };
765
766        // Median line spacing — pairwise gaps between consecutive band
767        // y-values.
768        let band_tolerance = (median_font_size * BAND_Y_FRACTION).max(BAND_Y_TOLERANCE);
769        let mut ys: Vec<f64> = spans.iter().map(|s| s.y).collect();
770        ys.sort_by(|a, b| b.partial_cmp(a).unwrap_or(Ordering::Equal));
771        let mut band_ys: Vec<f64> = Vec::new();
772        for y in ys {
773            if band_ys
774                .last()
775                .map(|prev: &f64| (prev - y).abs() > band_tolerance)
776                .unwrap_or(true)
777            {
778                band_ys.push(y);
779            }
780        }
781        // ANN[r17/TEX4] "Line spacing" here means the TIGHT line-to-line
782        // gap inside a text block — not the median of all gaps. Using
783        // the median drags the estimate up when the page has
784        // paragraph / zone breaks (which are the very gaps the
785        // paragraph-break threshold is supposed to EXCEED). The 25th
786        // percentile is the smallest gap that still shows up in more
787        // than one place on the page; it captures body-text leading
788        // robustly even when large zone gaps dominate.
789        let median_line_spacing = if band_ys.len() < 2 {
790            0.0
791        } else {
792            let mut spacings: Vec<f64> = band_ys
793                .windows(2)
794                .map(|pair| (pair[0] - pair[1]).abs())
795                .collect();
796            spacings.sort_by(|a, b| a.partial_cmp(b).unwrap_or(Ordering::Equal));
797            let q1_index = spacings.len() / 4;
798            spacings[q1_index]
799        };
800
801        Self {
802            median_font_size,
803            median_char_width,
804            median_line_spacing,
805        }
806    }
807}
808
809// ANN[r17/TEX2] Maximum recursion depth for XY-Cut. Any real page layout
810// is decomposable in well under 10 alternating cuts; the cap guards
811// against pathological inputs where the cut predicate keeps triggering
812// due to floating-point drift.
813const XY_CUT_MAX_DEPTH: usize = 12;
814/// Minimum fraction of a region's width that a vertical gap must reach
815/// before it qualifies as a column gutter.
816const XY_CUT_VERTICAL_GAP_REGION_FRACTION: f64 = 0.04;
817/// Floor (in pt) for vertical gap regardless of region width. Matches
818/// the previous `COLUMN_GAP_THRESHOLD_MIN` and keeps XY-Cut conservative
819/// on narrow regions (sidebars, tall columns).
820const XY_CUT_VERTICAL_GAP_FLOOR: f64 = 10.0;
821/// Multiplier applied to median font size to produce the horizontal-gap
822/// threshold. 1.8 × line-height matches typical paragraph spacing.
823const XY_CUT_HORIZONTAL_GAP_FONT_MULTIPLIER: f64 = 1.8;
824/// Minimum number of spans a column must contain before it is eligible
825/// for acceptance — one-span "columns" are almost always sidebar noise
826/// or table-cell fragments.
827const XY_CUT_MIN_SPANS_PER_COLUMN: usize = 2;
828/// Average characters per band a column must have before it's accepted
829/// as dense prose (vs. a short-cell table column).
830const XY_CUT_MIN_CHARS_PER_BAND: f64 = 8.0;
831
832/// ANN[r17/TEX2][r17/TEX4] Top-level grouping uses recursive XY-Cut
833/// with a density guard. Per-page stats are computed once up front so
834/// every decision downstream speaks the same typographic baseline.
835fn group_spans_into_blocks(spans: Vec<TextSpan>) -> Vec<TextBlock> {
836    if spans.is_empty() {
837        return Vec::new();
838    }
839    let stats = PageStats::from_spans(&spans);
840    xy_cut_recursive(spans, 0, &stats)
841}
842
843fn xy_cut_recursive(spans: Vec<TextSpan>, depth: usize, stats: &PageStats) -> Vec<TextBlock> {
844    if spans.is_empty() {
845        return Vec::new();
846    }
847    if depth >= XY_CUT_MAX_DEPTH {
848        return band_based_blocks(spans, stats);
849    }
850
851    // ANN[r17/TEX2] Pick whichever direction has the largest qualifying
852    // gap. Always cutting vertically first breaks layouts where a
853    // footer sits in the mid-x range — it would attach to the left
854    // column instead of being recognized as a page-level zone. The
855    // "largest gap wins" rule is the standard XY-Cut tie-breaker used
856    // by academic OCR literature and matches pdf_oxide.
857    let vcut = try_vertical_cut(&spans, stats);
858    let hcut = try_horizontal_cut(&spans, stats);
859
860    let (chosen, _) = match (vcut, hcut) {
861        (Some((v_groups, v_gap)), Some((h_groups, h_gap))) => {
862            if v_gap >= h_gap {
863                (Some(v_groups), v_gap)
864            } else {
865                (Some(h_groups), h_gap)
866            }
867        }
868        (Some((v_groups, v_gap)), None) => (Some(v_groups), v_gap),
869        (None, Some((h_groups, h_gap))) => (Some(h_groups), h_gap),
870        (None, None) => (None, 0.0),
871    };
872
873    if let Some(groups) = chosen {
874        let mut out = Vec::new();
875        for group in groups {
876            out.extend(xy_cut_recursive(group, depth + 1, stats));
877        }
878        return out;
879    }
880
881    band_based_blocks(spans, stats)
882}
883
884/// Emit per-band row blocks without any column detection. Used as the
885/// leaf of XY-Cut recursion — at this point the region either has no
886/// further cuts or the density guard refused them.
887fn band_based_blocks(spans: Vec<TextSpan>, stats: &PageStats) -> Vec<TextBlock> {
888    // XY-Cut can miss recurring gutters when a small number of bands span the
889    // full page width (e.g. a running header above a 3-column body). In that
890    // case, fall back to the older band/gutter detector inside the leaf region
891    // instead of flattening everything row-major.
892    group_spans_into_blocks_legacy_with_stats(spans, stats)
893}
894
895/// Median font-size helper. Currently unreferenced after `PageStats` took over
896/// the typography baseline computation; kept available for future tuning paths.
897#[allow(dead_code)]
898fn median_font_size(spans: &[TextSpan]) -> f64 {
899    if spans.is_empty() {
900        return 12.0;
901    }
902    let mut sizes: Vec<f64> = spans.iter().map(|s| s.font_size).collect();
903    sizes.sort_by(|a, b| a.partial_cmp(b).unwrap_or(Ordering::Equal));
904    sizes[sizes.len() / 2]
905}
906
907/// Attempt a vertical (column) cut. Returns the span groups plus the
908/// gap size (in pt) if a suitable gutter is found AND the density +
909/// alignment guards accept.
910///
911/// ANN[r17/TEX2] Three guards together avoid false-positive columns:
912///   1. `min_gap` is the MAX of (median_font, 4% of region width, 10pt)
913///      — deliberately lower than `median_font * 2` so narrow-gutter
914///      academic papers (12pt gutters, common in print) are still
915///      detected.
916///   2. `columns_are_dense` rejects column splits where either side
917///      has <2 spans or <8 chars/band — catches table cells.
918///   3. `columns_are_band_aligned` rejects cuts where any band would
919///      end up on only one side of the cut while being wider than
920///      ~70% of that side's column width — catches full-width
921///      paragraphs (Intro / Outro) that accidentally sit in the
922///      left-column x-range.
923fn try_vertical_cut(spans: &[TextSpan], stats: &PageStats) -> Option<(Vec<Vec<TextSpan>>, f64)> {
924    if spans.len() < 2 * XY_CUT_MIN_SPANS_PER_COLUMN {
925        return None;
926    }
927
928    let region_left = spans.iter().map(|s| s.x).fold(f64::INFINITY, f64::min);
929    let region_right = spans
930        .iter()
931        .map(TextSpan::right)
932        .fold(f64::NEG_INFINITY, f64::max);
933    let region_width = region_right - region_left;
934    if region_width <= 0.0 {
935        return None;
936    }
937
938    // ANN[r17/TEX2][r17/TEX4] Threshold uses the ADAPTIVE median-word-gap
939    // from the bands rather than a flat font-size multiple. Narrow-gutter
940    // academic layouts have 12pt gutters next to 4pt word spaces — the
941    // adaptive threshold scales with the actual typography used on this
942    // page. Clamped to `XY_CUT_VERTICAL_GAP_FLOOR` to avoid firing on
943    // ordinary inter-word spaces when character advance data is noisy.
944    // median_font and the width fraction act only as safety rails for
945    // pathological inputs.
946    let bands = group_spans_into_bands_with_stats(spans.to_vec(), stats);
947    let adaptive = compute_adaptive_column_gap(&bands);
948    let floor = stats
949        .median_font_size
950        .max(region_width * XY_CUT_VERTICAL_GAP_REGION_FRACTION)
951        .max(XY_CUT_VERTICAL_GAP_FLOOR);
952    let min_gap = adaptive.min(floor).max(XY_CUT_VERTICAL_GAP_FLOOR);
953
954    // Intervals [x_left, x_right] of every span; we look for an x value
955    // that is free of ALL intervals (full-height gap).
956    let mut intervals: Vec<(f64, f64)> = spans
957        .iter()
958        .map(|s| (s.x, s.right().max(s.x + 0.001)))
959        .collect();
960    intervals.sort_by(|a, b| a.0.partial_cmp(&b.0).unwrap_or(Ordering::Equal));
961
962    let mut cursor = intervals[0].1;
963    let mut best_gap: Option<(f64, f64)> = None; // (gap_size, cut_x)
964    for (left, right) in intervals.iter().skip(1) {
965        if *left > cursor {
966            let gap = *left - cursor;
967            if gap >= min_gap {
968                match best_gap {
969                    Some((best, _)) if best >= gap => {}
970                    _ => {
971                        let cut_x = (cursor + *left) * 0.5;
972                        best_gap = Some((gap, cut_x));
973                    }
974                }
975            }
976        }
977        cursor = cursor.max(*right);
978    }
979
980    let (gap_size, cut_x) = best_gap?;
981
982    // Split spans around the cut. A span whose midpoint is < cut_x
983    // belongs to the left group.
984    let mut left_group = Vec::new();
985    let mut right_group = Vec::new();
986    for span in spans {
987        let midpoint = span.x + (span.right() - span.x) * 0.5;
988        if midpoint < cut_x {
989            left_group.push(span.clone());
990        } else {
991            right_group.push(span.clone());
992        }
993    }
994
995    if !columns_are_dense(&left_group, &right_group, stats) {
996        return None;
997    }
998    if !columns_are_band_aligned(spans, cut_x, region_left, region_right, stats) {
999        return None;
1000    }
1001
1002    Some((vec![left_group, right_group], gap_size))
1003}
1004
1005/// ANN[r17/TEX2] Reject a vertical cut when any band sits on only one
1006/// side of the cut AND occupies more than ~70% of that side's column
1007/// width. Such bands are almost certainly full-width paragraphs that
1008/// happened to align with the left margin of one column, and forcing
1009/// them into that column re-orders them relative to text that follows.
1010fn columns_are_band_aligned(
1011    spans: &[TextSpan],
1012    cut_x: f64,
1013    region_left: f64,
1014    region_right: f64,
1015    stats: &PageStats,
1016) -> bool {
1017    let left_width = (cut_x - region_left).max(1.0);
1018    let right_width = (region_right - cut_x).max(1.0);
1019
1020    // Threshold chosen empirically: paragraph bodies in columnar
1021    // layouts usually fill ~60-70% of their column; anything wider
1022    // than 0.7× is a page-level element masquerading as column
1023    // content.
1024    const MAX_SINGLE_SIDE_FRACTION: f64 = 0.70;
1025
1026    let bands = group_spans_into_bands_with_stats(spans.to_vec(), stats);
1027    for band in &bands {
1028        let mut has_left = false;
1029        let mut has_right = false;
1030        for span in &band.spans {
1031            let midpoint = span.x + (span.right() - span.x) * 0.5;
1032            if midpoint < cut_x {
1033                has_left = true;
1034            } else {
1035                has_right = true;
1036            }
1037        }
1038        if has_left && has_right {
1039            continue; // Band straddles columns → fine.
1040        }
1041        let band_width = band.width();
1042        if has_left && band_width > left_width * MAX_SINGLE_SIDE_FRACTION {
1043            return false;
1044        }
1045        if has_right && band_width > right_width * MAX_SINGLE_SIDE_FRACTION {
1046            return false;
1047        }
1048    }
1049    true
1050}
1051
1052/// Density guard — reject column splits that look like tables (few,
1053/// short spans per column). A column is "dense" when it has at least
1054/// MIN_SPANS_PER_COLUMN spans and the average character count per band
1055/// exceeds MIN_CHARS_PER_BAND.
1056fn columns_are_dense(left: &[TextSpan], right: &[TextSpan], stats: &PageStats) -> bool {
1057    for col in [left, right] {
1058        if col.len() < XY_CUT_MIN_SPANS_PER_COLUMN {
1059            return false;
1060        }
1061        let bands = group_spans_into_bands_with_stats(col.to_vec(), stats);
1062        if bands.is_empty() {
1063            return false;
1064        }
1065        let total_chars: usize = col.iter().map(|s| s.text.chars().count()).sum();
1066        let chars_per_band = total_chars as f64 / bands.len() as f64;
1067        if chars_per_band < XY_CUT_MIN_CHARS_PER_BAND {
1068            return false;
1069        }
1070    }
1071    true
1072}
1073
1074/// Attempt a horizontal (zone / paragraph) cut. Unlike vertical cuts
1075/// this does NOT need a density guard — splitting top-from-bottom
1076/// cannot re-order content.
1077fn try_horizontal_cut(spans: &[TextSpan], stats: &PageStats) -> Option<(Vec<Vec<TextSpan>>, f64)> {
1078    if spans.len() < 2 {
1079        return None;
1080    }
1081    // Sort by descending y (PDF y grows upward).
1082    let mut sorted = spans.to_vec();
1083    sorted.sort_by(|a, b| {
1084        b.y.partial_cmp(&a.y)
1085            .unwrap_or(Ordering::Equal)
1086            .then_with(|| a.x.partial_cmp(&b.x).unwrap_or(Ordering::Equal))
1087    });
1088
1089    // ANN[r17/TEX4] Paragraph / zone cuts scale with MEDIAN LINE
1090    // SPACING when available — this is the typographically correct
1091    // baseline (paragraph break ≈ 1.8 × line-spacing). When the page
1092    // has only one band, or stats haven't observed spacing yet, fall
1093    // back to the font-size multiple the legacy path used.
1094    let min_gap = if stats.median_line_spacing > 0.0 {
1095        stats.median_line_spacing * PARAGRAPH_BREAK_LINE_SPACING_MULTIPLIER
1096    } else {
1097        stats.median_font_size * XY_CUT_HORIZONTAL_GAP_FONT_MULTIPLIER
1098    };
1099
1100    // Look for the largest gap between consecutive span y-values.
1101    let mut best: Option<(f64, f64)> = None; // (gap_size, cut_y)
1102    let tolerance = stats.median_font_size * BAND_Y_FRACTION;
1103    let mut band_bottom = sorted[0].y;
1104
1105    for span in sorted.iter().skip(1) {
1106        if (band_bottom - span.y).abs() <= tolerance {
1107            band_bottom = band_bottom.min(span.y);
1108            continue;
1109        }
1110        let gap = band_bottom - span.y;
1111        if gap >= min_gap {
1112            let cut_y = (band_bottom + span.y) * 0.5;
1113            match best {
1114                Some((best_gap, _)) if best_gap >= gap => {}
1115                _ => best = Some((gap, cut_y)),
1116            }
1117        }
1118        band_bottom = span.y;
1119    }
1120
1121    let (gap_size, cut_y) = best?;
1122
1123    let mut top_group = Vec::new();
1124    let mut bottom_group = Vec::new();
1125    for span in spans {
1126        if span.y > cut_y {
1127            top_group.push(span.clone());
1128        } else {
1129            bottom_group.push(span.clone());
1130        }
1131    }
1132    if top_group.is_empty() || bottom_group.is_empty() {
1133        return None;
1134    }
1135    Some((vec![top_group, bottom_group], gap_size))
1136}
1137
1138/// Legacy band+column-detection path, kept for reference and as the
1139/// fallback inside `band_based_blocks` test coverage. Not currently
1140/// used — XY-Cut supersedes it.
1141#[allow(dead_code)]
1142fn group_spans_into_blocks_legacy(spans: Vec<TextSpan>) -> Vec<TextBlock> {
1143    let bands = group_spans_into_bands(spans);
1144    group_spans_into_blocks_legacy_from_bands(bands)
1145}
1146
1147fn group_spans_into_blocks_legacy_with_stats(
1148    spans: Vec<TextSpan>,
1149    stats: &PageStats,
1150) -> Vec<TextBlock> {
1151    let bands = group_spans_into_bands_with_stats(spans, stats);
1152    group_spans_into_blocks_legacy_from_bands(bands)
1153}
1154
1155fn group_spans_into_blocks_legacy_from_bands(bands: Vec<TextBand>) -> Vec<TextBlock> {
1156    if bands.is_empty() {
1157        return Vec::new();
1158    }
1159
1160    let column_gap_threshold = compute_adaptive_column_gap(&bands);
1161
1162    let mut blocks = Vec::new();
1163    let mut idx = 0;
1164
1165    while idx < bands.len() {
1166        let gap_midpoints = bands[idx].gap_midpoints(column_gap_threshold);
1167        if gap_midpoints.is_empty() {
1168            blocks.push(bands[idx].row_block());
1169            idx += 1;
1170            continue;
1171        }
1172
1173        let mut boundaries = gap_midpoints.clone();
1174        let mut band_indices = vec![idx];
1175        let mut gapped_band_count = 1usize;
1176        let mut region_left = bands[idx].left();
1177        let mut region_right = bands[idx].right();
1178        let mut next_idx = idx + 1;
1179
1180        while next_idx < bands.len() {
1181            let next_band = &bands[next_idx];
1182            let next_gap_midpoints = next_band.gap_midpoints(column_gap_threshold);
1183            if next_gap_midpoints.is_empty() {
1184                if next_band
1185                    .fits_single_column(&boundaries, region_left, region_right)
1186                    .is_some()
1187                {
1188                    band_indices.push(next_idx);
1189                    next_idx += 1;
1190                    continue;
1191                }
1192                break;
1193            }
1194
1195            if !boundaries_match(&boundaries, &next_gap_midpoints, column_gap_threshold) {
1196                break;
1197            }
1198
1199            update_boundaries(&mut boundaries, &next_gap_midpoints, gapped_band_count);
1200            gapped_band_count += 1;
1201            band_indices.push(next_idx);
1202            region_left = region_left.min(next_band.left());
1203            region_right = region_right.max(next_band.right());
1204            next_idx += 1;
1205        }
1206
1207        if region_is_columnar(&bands, &band_indices, &boundaries, gapped_band_count) {
1208            append_column_region_blocks(&bands, &band_indices, &boundaries, &mut blocks);
1209            idx = next_idx;
1210        } else {
1211            blocks.push(bands[idx].row_block());
1212            idx += 1;
1213        }
1214    }
1215
1216    blocks
1217}
1218
1219/// Legacy wrapper used by call sites that haven't been handed PageStats.
1220/// It derives stats locally. Prefer `group_spans_into_bands_with_stats`
1221/// inside the XY-Cut pipeline to avoid recomputing the stats per call.
1222fn group_spans_into_bands(spans: Vec<TextSpan>) -> Vec<TextBand> {
1223    let stats = PageStats::from_spans(&spans);
1224    group_spans_into_bands_with_stats(spans, &stats)
1225}
1226
1227fn group_spans_into_bands_with_stats(mut spans: Vec<TextSpan>, stats: &PageStats) -> Vec<TextBand> {
1228    if spans.is_empty() {
1229        return Vec::new();
1230    }
1231
1232    spans.sort_by(|a, b| {
1233        b.y.partial_cmp(&a.y)
1234            .unwrap_or(Ordering::Equal)
1235            .then_with(|| a.x.partial_cmp(&b.x).unwrap_or(Ordering::Equal))
1236    });
1237
1238    // ANN[r17/TEX4] Band tolerance scales with this page's median font
1239    // size rather than a fixed 5pt floor. Single-page spreads with
1240    // huge display fonts (24pt+) previously merged unrelated lines; a
1241    // fractional threshold keeps that from happening without hurting
1242    // body-text pages.
1243    let page_tolerance = (stats.median_font_size * BAND_Y_FRACTION).max(BAND_Y_TOLERANCE);
1244
1245    let mut bands: Vec<TextBand> = Vec::new();
1246
1247    for span in spans {
1248        let tolerance = (span.height * BAND_Y_FRACTION)
1249            .max(page_tolerance)
1250            .max(BAND_Y_TOLERANCE);
1251        if let Some(band) = bands
1252            .iter_mut()
1253            .find(|band| (band.y - span.y).abs() <= tolerance)
1254        {
1255            let span_count = band.spans.len() as f64;
1256            band.y = (band.y * span_count + span.y) / (span_count + 1.0);
1257            band.spans.push(span);
1258        } else {
1259            bands.push(TextBand::new(span));
1260        }
1261    }
1262
1263    for band in &mut bands {
1264        band.sort_spans();
1265    }
1266
1267    bands.sort_by(|a, b| b.y.partial_cmp(&a.y).unwrap_or(Ordering::Equal));
1268    bands
1269}
1270
1271fn boundaries_match(boundaries: &[f64], gap_midpoints: &[f64], column_gap_threshold: f64) -> bool {
1272    let tolerance = (column_gap_threshold * 1.5)
1273        .max(COLUMN_GAP_MATCH_TOLERANCE)
1274        .min(60.0);
1275    boundaries.len() == gap_midpoints.len()
1276        && boundaries
1277            .iter()
1278            .zip(gap_midpoints)
1279            .all(|(lhs, rhs)| (lhs - rhs).abs() <= tolerance)
1280}
1281
1282fn update_boundaries(boundaries: &mut [f64], gap_midpoints: &[f64], seen_gapped_bands: usize) {
1283    for (boundary, midpoint) in boundaries.iter_mut().zip(gap_midpoints) {
1284        *boundary =
1285            (*boundary * seen_gapped_bands as f64 + midpoint) / (seen_gapped_bands as f64 + 1.0);
1286    }
1287}
1288
1289fn region_is_columnar(
1290    bands: &[TextBand],
1291    band_indices: &[usize],
1292    boundaries: &[f64],
1293    gapped_band_count: usize,
1294) -> bool {
1295    if boundaries.is_empty()
1296        || gapped_band_count < MIN_COLUMN_GAPPED_BANDS
1297        || band_indices.is_empty()
1298        || (gapped_band_count as f64 / band_indices.len() as f64) < MIN_COLUMN_GAP_SUPPORT
1299    {
1300        return false;
1301    }
1302
1303    let mut non_empty_slices = 0usize;
1304    let mut dense_slices = 0usize;
1305    let mut slices_per_column = vec![0usize; boundaries.len() + 1];
1306
1307    for &band_idx in band_indices {
1308        let slices = bands[band_idx].split_by_boundaries(boundaries);
1309        for (column_idx, slice) in slices.iter().enumerate() {
1310            if slice.is_empty() {
1311                continue;
1312            }
1313
1314            non_empty_slices += 1;
1315            slices_per_column[column_idx] += 1;
1316
1317            let char_count = slice
1318                .iter()
1319                .map(|span| span.text.chars().count())
1320                .sum::<usize>();
1321            if slice.len() >= 2 || char_count >= 8 {
1322                dense_slices += 1;
1323            }
1324        }
1325    }
1326
1327    if non_empty_slices < boundaries.len() + 2 {
1328        return false;
1329    }
1330
1331    if slices_per_column.contains(&0) {
1332        return false;
1333    }
1334
1335    (dense_slices as f64 / non_empty_slices as f64) >= MIN_DENSE_SLICE_RATIO
1336}
1337
1338fn append_column_region_blocks(
1339    bands: &[TextBand],
1340    band_indices: &[usize],
1341    boundaries: &[f64],
1342    blocks: &mut Vec<TextBlock>,
1343) {
1344    let column_count = boundaries.len() + 1;
1345    let mut column_bands = vec![Vec::<TextSpan>::new(); column_count];
1346
1347    for &band_idx in band_indices {
1348        let slices = bands[band_idx].split_by_boundaries(boundaries);
1349        for (column_idx, slice) in slices.into_iter().enumerate() {
1350            if slice.is_empty() {
1351                continue;
1352            }
1353            column_bands[column_idx].push(TextSpan {
1354                text: String::new(),
1355                x: 0.0,
1356                y: 0.0,
1357                width: 0.0,
1358                height: 0.0,
1359                font_size: 0.0,
1360            });
1361            let marker_idx = column_bands[column_idx].len() - 1;
1362            column_bands[column_idx][marker_idx] = TextSpan {
1363                text: String::new(),
1364                x: f64::NEG_INFINITY,
1365                y: bands[band_idx].y,
1366                width: 0.0,
1367                height: 0.0,
1368                font_size: 0.0,
1369            };
1370            column_bands[column_idx].extend(slice);
1371        }
1372    }
1373
1374    for spans in column_bands {
1375        let mut current: Vec<TextSpan> = Vec::new();
1376        for span in spans {
1377            if span.x == f64::NEG_INFINITY {
1378                if !current.is_empty() {
1379                    current.sort_by(|a, b| a.x.partial_cmp(&b.x).unwrap_or(Ordering::Equal));
1380                    blocks.push(TextBlock {
1381                        spans: std::mem::take(&mut current),
1382                    });
1383                }
1384                continue;
1385            }
1386            current.push(span);
1387        }
1388        if !current.is_empty() {
1389            current.sort_by(|a, b| a.x.partial_cmp(&b.x).unwrap_or(Ordering::Equal));
1390            blocks.push(TextBlock { spans: current });
1391        }
1392    }
1393}
1394
1395/// Join per-block lines, stitching end-of-line hyphenated word-wraps the
1396/// way pdftotext / MuPDF / PDFBox do.
1397///
1398/// Trigger conditions (all must hold):
1399/// 1. Previous line ends with `-` preceded by an alphabetic character.
1400/// 2. The alphabetic suffix before the `-` has >= 3 characters.
1401/// 3. The next line (trimmed) starts with an ASCII lowercase letter.
1402/// 4. The lowercase prefix of the next line has >= 3 characters.
1403///
1404/// When triggered, the trailing `-` is removed and the two halves are
1405/// concatenated without a space or newline.
1406///
1407/// This avoids false positives on compound words ("real-time"), bullet
1408/// lists, numeric ranges ("42-"), and short fragments.
1409fn stitch_hyphenated_lines(lines: &[String]) -> String {
1410    let mut out = String::new();
1411    for (idx, line) in lines.iter().enumerate() {
1412        if idx == 0 {
1413            out.push_str(line);
1414            continue;
1415        }
1416
1417        let next_trimmed = line.trim_start();
1418
1419        // Check the accumulated output for end-of-line hyphen pattern
1420        let should_merge = is_hyphen_wrap_candidate(&out, next_trimmed);
1421
1422        if should_merge {
1423            out.pop(); // drop the trailing '-'
1424            out.push_str(next_trimmed);
1425        } else {
1426            out.push('\n');
1427            out.push_str(line);
1428        }
1429    }
1430    out
1431}
1432
1433/// Check if the accumulated text ends with a hyphen-wrap pattern and the
1434/// continuation is a valid merge target.
1435fn is_hyphen_wrap_candidate(accumulated: &str, next_trimmed: &str) -> bool {
1436    // Must end with '-'
1437    if !accumulated.ends_with('-') {
1438        return false;
1439    }
1440
1441    // Character before '-' must be alphabetic
1442    let before_hyphen = accumulated.chars().rev().nth(1);
1443    if !before_hyphen.is_some_and(|c| c.is_alphabetic()) {
1444        return false;
1445    }
1446
1447    // Count consecutive alphabetic chars before the '-' (the word fragment)
1448    let alpha_prefix_len = accumulated
1449        .chars()
1450        .rev()
1451        .skip(1) // skip the '-'
1452        .take_while(|c| c.is_alphabetic())
1453        .count();
1454    if alpha_prefix_len < 3 {
1455        return false;
1456    }
1457
1458    // Next line must start with lowercase ASCII
1459    let first_next = next_trimmed.chars().next();
1460    if !first_next.is_some_and(|c| c.is_ascii_lowercase()) {
1461        return false;
1462    }
1463
1464    // Count consecutive lowercase chars at start of next line
1465    let next_alpha_len = next_trimmed
1466        .chars()
1467        .take_while(|c| c.is_ascii_lowercase())
1468        .count();
1469    if next_alpha_len < 3 {
1470        return false;
1471    }
1472
1473    true
1474}
1475
1476/// Normalize extracted text to match pdftotext conventions.
1477///
1478/// 1. Trim trailing whitespace from each line.
1479/// 2. Collapse runs of more than two consecutive newlines into exactly two.
1480/// 3. Preserve form-feed characters (`\x0C`) as page separators.
1481/// 4. End with a single trailing newline (or empty for empty input).
1482pub(crate) fn normalize_text_output(text: &str) -> String {
1483    if text.is_empty() {
1484        return String::new();
1485    }
1486
1487    let mut lines: Vec<&str> = Vec::new();
1488    for line in text.split('\n') {
1489        lines.push(line.trim_end());
1490    }
1491
1492    // Remove trailing empty lines (we'll add exactly one \n at the end)
1493    while lines.last() == Some(&"") {
1494        lines.pop();
1495    }
1496
1497    if lines.is_empty() {
1498        return String::new();
1499    }
1500
1501    let mut result = String::with_capacity(text.len());
1502    let mut consecutive_empty = 0u32;
1503
1504    for (i, line) in lines.iter().enumerate() {
1505        if line.is_empty() || *line == "\x0C" {
1506            if line.is_empty() {
1507                consecutive_empty += 1;
1508                // Collapse >2 consecutive blank lines to 2
1509                if consecutive_empty <= 2 {
1510                    result.push('\n');
1511                }
1512            } else {
1513                // Bare form-feed line
1514                consecutive_empty = 0;
1515                result.push_str(line);
1516                if i + 1 < lines.len() {
1517                    result.push('\n');
1518                }
1519            }
1520        } else {
1521            // Check if line starts with form-feed
1522            if line.starts_with('\x0C') {
1523                consecutive_empty = 0;
1524                result.push_str(line);
1525            } else {
1526                consecutive_empty = 0;
1527                result.push_str(line);
1528            }
1529            if i + 1 < lines.len() {
1530                result.push('\n');
1531            }
1532        }
1533    }
1534
1535    // Ensure single trailing newline
1536    if !result.is_empty() && !result.ends_with('\n') {
1537        result.push('\n');
1538    }
1539
1540    result
1541}
1542
1543#[cfg(test)]
1544mod tests {
1545    use super::*;
1546
1547    fn span(text: &str, x: f64, y: f64, width: f64) -> TextSpan {
1548        TextSpan {
1549            text: text.into(),
1550            x,
1551            y,
1552            width,
1553            height: 12.0,
1554            font_size: 12.0,
1555        }
1556    }
1557
1558    fn block_texts(spans: Vec<TextSpan>) -> Vec<String> {
1559        group_spans_into_blocks(spans)
1560            .into_iter()
1561            .map(|block| block.text())
1562            .collect()
1563    }
1564
1565    #[test]
1566    fn empty_device_produces_empty_text() {
1567        let dev = TextExtractionDevice::new();
1568        assert!(dev.into_text().is_empty());
1569    }
1570
1571    #[test]
1572    fn single_column_stays_row_major() {
1573        let texts = block_texts(vec![
1574            span("Single Column Line 1", 40.0, 700.0, 140.0),
1575            span("Single Column Line 2", 40.0, 684.0, 140.0),
1576            span("Single Column Line 3", 40.0, 668.0, 140.0),
1577        ]);
1578
1579        assert_eq!(
1580            texts,
1581            vec![
1582                "Single Column Line 1",
1583                "Single Column Line 2",
1584                "Single Column Line 3",
1585            ]
1586        );
1587    }
1588
1589    #[test]
1590    fn two_column_region_reads_column_major() {
1591        let texts = block_texts(vec![
1592            span("Header", 200.0, 740.0, 80.0),
1593            span("Left column line one", 40.0, 700.0, 115.0),
1594            span("Right column line one", 320.0, 700.0, 120.0),
1595            span("Left column line two", 40.0, 684.0, 115.0),
1596            span("Right column line two", 320.0, 684.0, 120.0),
1597            span("Left column line three", 40.0, 668.0, 125.0),
1598            span("Right column line three", 320.0, 668.0, 130.0),
1599            span("Footer", 200.0, 620.0, 80.0),
1600        ]);
1601
1602        assert_eq!(
1603            texts,
1604            vec![
1605                "Header",
1606                "Left column line one",
1607                "Left column line two",
1608                "Left column line three",
1609                "Right column line one",
1610                "Right column line two",
1611                "Right column line three",
1612                "Footer",
1613            ]
1614        );
1615    }
1616
1617    #[test]
1618    fn mixed_single_and_multi_column_regions_preserve_shared_bands() {
1619        let texts = block_texts(vec![
1620            span("Intro paragraph", 40.0, 740.0, 180.0),
1621            span("L1 words here", 40.0, 700.0, 110.0),
1622            span("R1 words here", 320.0, 700.0, 110.0),
1623            span("L2 words here", 40.0, 684.0, 110.0),
1624            span("R2 words here", 320.0, 684.0, 110.0),
1625            span("L3 words here", 40.0, 668.0, 110.0),
1626            span("R3 words here", 320.0, 668.0, 110.0),
1627            span("Outro paragraph", 40.0, 620.0, 180.0),
1628        ]);
1629
1630        assert_eq!(
1631            texts,
1632            vec![
1633                "Intro paragraph",
1634                "L1 words here",
1635                "L2 words here",
1636                "L3 words here",
1637                "R1 words here",
1638                "R2 words here",
1639                "R3 words here",
1640                "Outro paragraph",
1641            ]
1642        );
1643    }
1644
1645    #[test]
1646    fn short_table_like_rows_fall_back_to_row_major() {
1647        let texts = block_texts(vec![
1648            span("Name", 40.0, 700.0, 30.0),
1649            span("Age", 320.0, 700.0, 20.0),
1650            span("Alice", 40.0, 684.0, 35.0),
1651            span("30", 320.0, 684.0, 15.0),
1652            span("Bob", 40.0, 668.0, 24.0),
1653            span("25", 320.0, 668.0, 15.0),
1654        ]);
1655
1656        assert_eq!(texts, vec!["Name Age", "Alice 30", "Bob 25"]);
1657    }
1658
1659    #[test]
1660    fn three_column_regions_are_supported() {
1661        let texts = block_texts(vec![
1662            span("Column one line one", 40.0, 700.0, 105.0),
1663            span("Column two line one", 220.0, 700.0, 105.0),
1664            span("Column three line one", 400.0, 700.0, 120.0),
1665            span("Column one line two", 40.0, 684.0, 105.0),
1666            span("Column two line two", 220.0, 684.0, 105.0),
1667            span("Column three line two", 400.0, 684.0, 120.0),
1668            span("Column one line three", 40.0, 668.0, 120.0),
1669            span("Column two line three", 220.0, 668.0, 120.0),
1670            span("Column three line three", 400.0, 668.0, 135.0),
1671        ]);
1672
1673        assert_eq!(
1674            texts,
1675            vec![
1676                "Column one line one",
1677                "Column one line two",
1678                "Column one line three",
1679                "Column two line one",
1680                "Column two line two",
1681                "Column two line three",
1682                "Column three line one",
1683                "Column three line two",
1684                "Column three line three",
1685            ]
1686        );
1687    }
1688
1689    #[test]
1690    fn text_block_concatenation_spaced() {
1691        let block = TextBlock {
1692            spans: vec![span("A", 0.0, 0.0, 6.0), span("B", 20.0, 0.0, 6.0)],
1693        };
1694        assert_eq!(block.text(), "A B");
1695    }
1696
1697    #[test]
1698    fn adaptive_column_gap_fallback_for_no_gaps() {
1699        // Single-span bands produce no measurable gaps → fallback
1700        let bands = vec![
1701            TextBand::new(span("Hello", 40.0, 700.0, 80.0)),
1702            TextBand::new(span("World", 40.0, 684.0, 80.0)),
1703        ];
1704        let threshold = compute_adaptive_column_gap(&bands);
1705        assert!((threshold - COLUMN_GAP_THRESHOLD_FALLBACK).abs() < 0.01);
1706    }
1707
1708    #[test]
1709    fn adaptive_column_gap_uses_median() {
1710        // Three bands with word gaps of ~4pt each → median ≈ 4, threshold = 12
1711        let mut bands = Vec::new();
1712        for y in [700.0, 684.0, 668.0] {
1713            let mut band = TextBand::new(span("word1", 40.0, y, 30.0));
1714            band.spans.push(span("word2", 74.0, y, 30.0)); // gap = 4
1715            band.spans.push(span("word3", 108.0, y, 30.0)); // gap = 4
1716            bands.push(band);
1717        }
1718        let threshold = compute_adaptive_column_gap(&bands);
1719        // median gap = 4, × 3 = 12, clamped to [10, 40] → 12
1720        assert!(
1721            threshold >= 10.0 && threshold <= 14.0,
1722            "expected ~12, got {threshold}"
1723        );
1724    }
1725
1726    #[test]
1727    fn adaptive_column_gap_clamps_to_min() {
1728        // Tight gaps (2pt) across many bands → median = 2, 3×2 = 6 → clamped to 10
1729        let mut bands = Vec::new();
1730        for y in [700.0, 684.0, 668.0, 652.0] {
1731            let mut band = TextBand::new(span("abc", 0.0, y, 18.0));
1732            // right of "abc" = max(18, 12*0.5*3=18) = 18; gap = 20-18 = 2
1733            band.spans.push(span("def", 20.0, y, 18.0));
1734            bands.push(band);
1735        }
1736        let threshold = compute_adaptive_column_gap(&bands);
1737        assert!(
1738            (threshold - COLUMN_GAP_THRESHOLD_MIN).abs() < 0.01,
1739            "expected {COLUMN_GAP_THRESHOLD_MIN}, got {threshold}"
1740        );
1741    }
1742
1743    #[test]
1744    fn adaptive_column_gap_all_large_gaps_uses_fraction_of_min() {
1745        // When all gaps are large (> MIN), threshold = 0.75 × min_gap.
1746        let mut band = TextBand::new(span("Left", 0.0, 700.0, 30.0));
1747        band.spans.push(span("Right", 80.0, 700.0, 30.0)); // gap = 50
1748        let bands = vec![band];
1749        let threshold = compute_adaptive_column_gap(&bands);
1750        assert!(
1751            (threshold - 37.5).abs() < 0.01,
1752            "expected 37.5 (0.75×50), got {threshold}"
1753        );
1754    }
1755
1756    #[test]
1757    fn normalize_trims_trailing_whitespace_per_line() {
1758        assert_eq!(
1759            normalize_text_output("hello   \nworld  \n"),
1760            "hello\nworld\n"
1761        );
1762    }
1763
1764    #[test]
1765    fn normalize_collapses_excess_newlines() {
1766        // >2 blank lines collapse to 2 (meaning 3 \n in a row: line, blank, blank)
1767        assert_eq!(
1768            normalize_text_output("hello\n\n\n\n\nworld\n"),
1769            "hello\n\n\nworld\n"
1770        );
1771    }
1772
1773    #[test]
1774    fn normalize_preserves_double_newline() {
1775        assert_eq!(
1776            normalize_text_output("paragraph one\n\nparagraph two\n"),
1777            "paragraph one\n\nparagraph two\n"
1778        );
1779    }
1780
1781    #[test]
1782    fn normalize_preserves_form_feed() {
1783        assert_eq!(
1784            normalize_text_output("page1\n\n\x0Cpage2\n"),
1785            "page1\n\n\x0Cpage2\n"
1786        );
1787    }
1788
1789    #[test]
1790    fn normalize_adds_trailing_newline() {
1791        assert_eq!(normalize_text_output("hello"), "hello\n");
1792    }
1793
1794    #[test]
1795    fn normalize_empty_input() {
1796        assert_eq!(normalize_text_output(""), "");
1797    }
1798
1799    #[test]
1800    fn normalize_only_whitespace() {
1801        assert_eq!(normalize_text_output("   \n  \n"), "");
1802    }
1803
1804    // --- Hyphen stitching tests ---
1805
1806    #[test]
1807    fn hyphen_stitch_joins_wrapped_word() {
1808        let lines = vec!["the aver-".into(), "age rainfall".into()];
1809        assert_eq!(stitch_hyphenated_lines(&lines), "the average rainfall");
1810    }
1811
1812    #[test]
1813    fn hyphen_stitch_handles_leading_whitespace() {
1814        let lines = vec!["pre-".into(), "   dict the outcome".into()];
1815        // "pre" is only 3 chars → meets >= 3 guard
1816        assert_eq!(stitch_hyphenated_lines(&lines), "predict the outcome");
1817    }
1818
1819    #[test]
1820    fn hyphen_stitch_capital_continuation_not_stitched() {
1821        let lines = vec!["Section three-".into(), "Summary here".into()];
1822        assert_eq!(
1823            stitch_hyphenated_lines(&lines),
1824            "Section three-\nSummary here"
1825        );
1826    }
1827
1828    #[test]
1829    fn hyphen_stitch_bullet_dash_not_stitched() {
1830        // "-" alone: char before hyphen is not alphabetic
1831        let lines = vec!["Items:".into(), "-".into(), "milk".into()];
1832        assert_eq!(stitch_hyphenated_lines(&lines), "Items:\n-\nmilk");
1833    }
1834
1835    #[test]
1836    fn hyphen_stitch_numeric_range_not_stitched() {
1837        // "42-" — char before hyphen is digit, not alphabetic
1838        let lines = vec!["page 42-".into(), "seventy".into()];
1839        assert_eq!(stitch_hyphenated_lines(&lines), "page 42-\nseventy");
1840    }
1841
1842    #[test]
1843    fn hyphen_stitch_short_prefix_not_stitched() {
1844        // "re-" only 2 alpha chars before hyphen → below 3-char guard
1845        let lines = vec!["re-".into(), "organize".into()];
1846        assert_eq!(stitch_hyphenated_lines(&lines), "re-\norganize");
1847    }
1848
1849    #[test]
1850    fn hyphen_stitch_short_continuation_not_stitched() {
1851        // Next line starts with "an" (2 chars) → below 3-char guard
1852        let lines = vec!["counter-".into(), "an example".into()];
1853        assert_eq!(stitch_hyphenated_lines(&lines), "counter-\nan example");
1854    }
1855
1856    #[test]
1857    fn hyphen_stitch_compound_word_midline_preserved() {
1858        // "real-time" is mid-line, not end-of-line — no stitching applies
1859        // because stitch only operates on line boundaries
1860        let lines = vec!["real-time system".into()];
1861        assert_eq!(stitch_hyphenated_lines(&lines), "real-time system");
1862    }
1863
1864    #[test]
1865    fn hyphen_stitch_single_line_unchanged() {
1866        let lines = vec!["only line".into()];
1867        assert_eq!(stitch_hyphenated_lines(&lines), "only line");
1868    }
1869
1870    #[test]
1871    fn hyphen_stitch_empty_input() {
1872        let lines: Vec<String> = vec![];
1873        assert_eq!(stitch_hyphenated_lines(&lines), "");
1874    }
1875
1876    // --- TEX1 multi-signal space consensus tests ---
1877
1878    fn make_device_with_median(median: f64) -> TextExtractionDevice {
1879        let mut dev = TextExtractionDevice::new();
1880        // Seed enough samples for the median to resolve to `median`.
1881        for _ in 0..MEDIAN_REFRESH {
1882            dev.glyph_widths.push(median);
1883        }
1884        dev.refresh_median_char_width();
1885        assert!((dev.cached_median_char_width - median).abs() < 1e-9);
1886        dev
1887    }
1888
1889    #[test]
1890    fn consensus_inserts_space_on_strong_tj_offset_alone() {
1891        // Gap is below the geometric threshold, but the TJ offset is large
1892        // enough that the consensus must still fire.
1893        let mut dev = make_device_with_median(6.0);
1894        dev.pending_tj_offset = 250.0; // full em-space
1895        assert!(dev.evaluate_space_consensus(0.5, 12.0, "Hello", "World"));
1896    }
1897
1898    #[test]
1899    fn consensus_inserts_space_on_geometric_gap_alone() {
1900        // No TJ, no character transition, but a clearly wide geometric gap.
1901        let dev = make_device_with_median(6.0);
1902        // gap > 0.3 * 6.0 = 1.8 → fires gap signal (0.80), below threshold
1903        // on its own? 0.80 < 0.75 threshold? No, 0.80 > 0.75, so it fires.
1904        assert!(dev.evaluate_space_consensus(2.5, 12.0, "hello", "world"));
1905    }
1906
1907    #[test]
1908    fn consensus_no_space_on_kerning_gap() {
1909        // Small kerning-size gap with no other signals must not inject a
1910        // space (regression guard against false-positive spaces inside
1911        // tightly kerned words).
1912        let dev = make_device_with_median(6.0);
1913        assert!(!dev.evaluate_space_consensus(0.5, 12.0, "fi", "lm"));
1914    }
1915
1916    #[test]
1917    fn consensus_inserts_space_on_camel_case_plus_gap() {
1918        // CamelCase heuristic (0.60) alone doesn't reach threshold, but a
1919        // moderate gap (0.60 gap + 0.60 heuristic if gap fires) should.
1920        // Here gap = 2.5 > 1.8 → gap fires → total 0.80 + 0.60 = 1.40.
1921        let dev = make_device_with_median(6.0);
1922        assert!(dev.evaluate_space_consensus(2.5, 12.0, "helloWorld", "Inc"));
1923    }
1924
1925    #[test]
1926    fn consensus_inserts_space_on_digit_letter_transition_with_gap() {
1927        let dev = make_device_with_median(6.0);
1928        assert!(dev.evaluate_space_consensus(2.5, 12.0, "123", "abc"));
1929    }
1930
1931    #[test]
1932    fn consensus_heuristic_alone_is_insufficient() {
1933        // Heuristic (0.60) on its own is below the 0.75 threshold — the
1934        // design deliberately requires a second corroborating signal to
1935        // avoid gluing spaces into existing CamelCase identifiers that
1936        // have no geometric break.
1937        let dev = make_device_with_median(6.0);
1938        assert!(!dev.evaluate_space_consensus(0.5, 12.0, "camel", "Case"));
1939    }
1940
1941    #[test]
1942    fn consensus_falls_back_to_font_size_when_no_median() {
1943        // No samples → median is 0; geometric reference uses font-size.
1944        let dev = TextExtractionDevice::new();
1945        // gap 1.9 > 0.15 * 12.0 = 1.8 → gap signal fires
1946        assert!(dev.evaluate_space_consensus(1.9, 12.0, "a", "b"));
1947        // gap 1.5 < 1.8 → no signal
1948        assert!(!dev.evaluate_space_consensus(1.5, 12.0, "a", "b"));
1949    }
1950
1951    #[test]
1952    fn consensus_ignores_tiny_tj_offsets() {
1953        // TJ offsets below the threshold are kerning, not word breaks.
1954        let mut dev = make_device_with_median(6.0);
1955        dev.pending_tj_offset = 50.0;
1956        assert!(!dev.evaluate_space_consensus(0.5, 12.0, "Hello", "World"));
1957    }
1958
1959    #[test]
1960    fn consensus_accepts_negative_tj_offsets() {
1961        // A negative TJ offset still represents an explicit inter-substring
1962        // shift and counts toward the consensus (|amount| check).
1963        let mut dev = make_device_with_median(6.0);
1964        dev.pending_tj_offset = -250.0;
1965        assert!(dev.evaluate_space_consensus(0.5, 12.0, "Hello", "World"));
1966    }
1967
1968    #[test]
1969    fn text_adjustment_accumulates_until_glyph() {
1970        let mut dev = TextExtractionDevice::new();
1971        dev.text_adjustment(120.0);
1972        dev.text_adjustment(140.0);
1973        assert!((dev.pending_tj_offset - 260.0).abs() < 1e-6);
1974    }
1975
1976    // --- TEX2 XY-Cut tests ---
1977
1978    #[test]
1979    fn xy_cut_header_body_footer_with_two_columns() {
1980        // Header and footer sit in the mid-x range that would
1981        // accidentally fall into a left-column bucket with a naive
1982        // vertical-first cut. The largest-gap-first rule plus the
1983        // alignment guard ensure header and footer bracket the
1984        // columnar body.
1985        let texts = block_texts(vec![
1986            span("HEADLINE TITLE", 180.0, 760.0, 120.0),
1987            span("Left col line A", 40.0, 700.0, 110.0),
1988            span("Right col line A", 320.0, 700.0, 115.0),
1989            span("Left col line B", 40.0, 684.0, 110.0),
1990            span("Right col line B", 320.0, 684.0, 115.0),
1991            span("Left col line C", 40.0, 668.0, 110.0),
1992            span("Right col line C", 320.0, 668.0, 115.0),
1993            span("FOOTER LINE TEXT", 180.0, 600.0, 120.0),
1994        ]);
1995        assert_eq!(texts.first().map(String::as_str), Some("HEADLINE TITLE"));
1996        assert_eq!(texts.last().map(String::as_str), Some("FOOTER LINE TEXT"));
1997        // Left column lines all come before right column lines.
1998        let left_c_idx = texts.iter().position(|s| s == "Left col line C").unwrap();
1999        let right_a_idx = texts.iter().position(|s| s == "Right col line A").unwrap();
2000        assert!(
2001            left_c_idx < right_a_idx,
2002            "expected column-major ordering in body: {texts:?}"
2003        );
2004    }
2005
2006    #[test]
2007    fn xy_cut_rejects_column_split_on_table_rows() {
2008        // The density guard must still reject the 280pt inter-cell gap
2009        // in a short-cell table, preserving row-major reading order.
2010        let texts = block_texts(vec![
2011            span("Name", 40.0, 700.0, 30.0),
2012            span("Age", 320.0, 700.0, 20.0),
2013            span("Alice", 40.0, 684.0, 35.0),
2014            span("30", 320.0, 684.0, 15.0),
2015        ]);
2016        assert_eq!(texts, vec!["Name Age", "Alice 30"]);
2017    }
2018
2019    #[test]
2020    fn xy_cut_rejects_column_split_when_one_band_is_full_width() {
2021        // The alignment guard catches a full-width paragraph that
2022        // would otherwise be forced into the left column of a 2-column
2023        // region below it.
2024        let texts = block_texts(vec![
2025            span(
2026                "Full width intro spanning both columns here",
2027                40.0,
2028                740.0,
2029                360.0,
2030            ),
2031            span("Left A", 40.0, 700.0, 50.0),
2032            span("Right A", 320.0, 700.0, 50.0),
2033            span("Left B", 40.0, 684.0, 50.0),
2034            span("Right B", 320.0, 684.0, 50.0),
2035        ]);
2036        assert!(
2037            texts[0].contains("Full width intro"),
2038            "expected full-width intro first: {texts:?}"
2039        );
2040    }
2041
2042    #[test]
2043    fn xy_cut_horizontal_split_for_zone_boundaries() {
2044        // Pure horizontal cut on a single-column page with a big
2045        // vertical gap between paragraphs — the cut fires and both
2046        // paragraphs stay in their own blocks.
2047        let texts = block_texts(vec![
2048            span("First paragraph body text", 40.0, 740.0, 200.0),
2049            span("Second paragraph body", 40.0, 680.0, 180.0),
2050        ]);
2051        assert_eq!(texts.len(), 2);
2052        assert!(texts[0].starts_with("First"));
2053        assert!(texts[1].starts_with("Second"));
2054    }
2055
2056    #[test]
2057    fn xy_cut_recursion_terminates_with_single_span() {
2058        let texts = block_texts(vec![span("Only one span on the page", 40.0, 700.0, 180.0)]);
2059        assert_eq!(texts, vec!["Only one span on the page"]);
2060    }
2061
2062    #[test]
2063    fn median_font_size_handles_mixed_sizes() {
2064        let spans = vec![
2065            TextSpan {
2066                text: "small".into(),
2067                x: 0.0,
2068                y: 0.0,
2069                width: 10.0,
2070                height: 8.0,
2071                font_size: 8.0,
2072            },
2073            TextSpan {
2074                text: "medium".into(),
2075                x: 0.0,
2076                y: 0.0,
2077                width: 10.0,
2078                height: 12.0,
2079                font_size: 12.0,
2080            },
2081            TextSpan {
2082                text: "large".into(),
2083                x: 0.0,
2084                y: 0.0,
2085                width: 10.0,
2086                height: 24.0,
2087                font_size: 24.0,
2088            },
2089        ];
2090        assert!((median_font_size(&spans) - 12.0).abs() < 1e-9);
2091    }
2092
2093    #[test]
2094    fn columns_band_aligned_accepts_aligned_columns() {
2095        let spans = vec![
2096            span("L1", 40.0, 700.0, 60.0),
2097            span("R1", 300.0, 700.0, 60.0),
2098            span("L2", 40.0, 684.0, 60.0),
2099            span("R2", 300.0, 684.0, 60.0),
2100        ];
2101        let stats = PageStats::from_spans(&spans);
2102        // cut_x between 100 and 300 → 200. Every band straddles the cut.
2103        assert!(columns_are_band_aligned(&spans, 200.0, 40.0, 360.0, &stats));
2104    }
2105
2106    #[test]
2107    fn columns_band_aligned_rejects_wide_single_side_band() {
2108        let spans = vec![
2109            span("Wide banner line across top", 40.0, 740.0, 280.0),
2110            span("L1", 40.0, 700.0, 60.0),
2111            span("R1", 300.0, 700.0, 60.0),
2112        ];
2113        let stats = PageStats::from_spans(&spans);
2114        // cut_x = 200. Banner only in left group (midpoint < 200). Width
2115        // exceeds 0.7 × left column width → rejected.
2116        assert!(!columns_are_band_aligned(
2117            &spans, 200.0, 40.0, 360.0, &stats
2118        ));
2119    }
2120
2121    #[test]
2122    fn page_stats_computes_median_values() {
2123        let spans = vec![
2124            span("one", 40.0, 700.0, 30.0),
2125            span("two", 40.0, 680.0, 30.0),
2126            span("three", 40.0, 660.0, 50.0),
2127        ];
2128        let stats = PageStats::from_spans(&spans);
2129        assert!((stats.median_font_size - 12.0).abs() < 1e-9);
2130        // char width = width / chars. one=30/3=10, two=30/3=10, three=50/5=10. median=10.
2131        assert!((stats.median_char_width - 10.0).abs() < 1e-9);
2132        // line spacing: bands at 700, 680, 660. gaps = 20, 20. median = 20.
2133        assert!((stats.median_line_spacing - 20.0).abs() < 1e-9);
2134    }
2135
2136    #[test]
2137    fn page_stats_handles_empty_input() {
2138        let stats = PageStats::from_spans(&[]);
2139        assert!((stats.median_font_size - 12.0).abs() < 1e-9);
2140        assert!((stats.median_char_width - 6.0).abs() < 1e-9);
2141        assert_eq!(stats.median_line_spacing, 0.0);
2142    }
2143
2144    #[test]
2145    fn narrow_gutter_detected_with_adaptive_threshold() {
2146        // Academic paper layout: 12pt gutter between columns.
2147        // With old fixed 20pt threshold, this was not detected as columnar.
2148        // With adaptive: median word gap ~4pt, threshold = 12pt → detects 12pt gutter.
2149        let mut spans = Vec::new();
2150        for y in [700.0, 684.0, 668.0] {
2151            // Left column: two words with 4pt gap, ending at x=145
2152            spans.push(span("Lorem ipsum", 40.0, y, 100.0));
2153            spans.push(span("dolor sit", 144.0, y, 80.0));
2154            // Right column starts at 236 (gap = 12pt from 224)
2155            spans.push(span("amet consec", 236.0, y, 100.0));
2156            spans.push(span("tetur adipi", 340.0, y, 80.0));
2157        }
2158        let texts = block_texts(spans);
2159        // Should detect 2-column layout and read column-major
2160        assert!(
2161            texts.len() >= 6,
2162            "expected column-major output, got {texts:?}"
2163        );
2164        // First three blocks should be left column lines
2165        assert!(
2166            texts[0].contains("Lorem"),
2167            "first block should be left column: {texts:?}"
2168        );
2169    }
2170
2171    #[test]
2172    fn xy_cut_leaf_falls_back_to_legacy_columns_for_header_plus_three_columns() {
2173        let texts = block_texts(vec![
2174            span("73022", 45.0, 750.0, 70.0),
2175            span("Federal Register banner", 125.6, 750.0, 260.0),
2176            span("Left column line one", 45.0, 725.0, 140.0),
2177            span("Middle column line one", 222.0, 725.0, 140.0),
2178            span("Right column line one", 399.0, 725.0, 120.0),
2179            span("Left column line two", 45.0, 715.0, 140.0),
2180            span("Middle column line two", 210.0, 715.0, 152.0),
2181            span("Right column line two", 388.0, 715.0, 132.0),
2182            span("Left column line three", 45.0, 705.0, 140.0),
2183            span("Middle column line three", 235.0, 705.0, 135.0),
2184            span("Right column line three", 408.0, 705.0, 118.0),
2185        ]);
2186
2187        assert_eq!(
2188            texts,
2189            vec![
2190                "73022 Federal Register banner",
2191                "Left column line one",
2192                "Left column line two",
2193                "Left column line three",
2194                "Middle column line one",
2195                "Middle column line two",
2196                "Middle column line three",
2197                "Right column line one",
2198                "Right column line two",
2199                "Right column line three",
2200            ]
2201        );
2202    }
2203
2204    #[test]
2205    fn overlapping_fake_bold_spans_collapse_to_single_copy() {
2206        let texts = block_texts(vec![
2207            span("1 This is fakebold text.", 25.9, 785.3, 320.0),
2208            span("1 This is fakebold text.", 26.2, 785.3, 320.0),
2209            span("1 This is fakebold text.", 26.4, 785.3, 320.0),
2210            span("1 This is fakebold text.", 26.7, 785.3, 320.0),
2211            span("2 This is a fakebold", 27.0, 714.8, 142.0),
2212            span(" fakebold", 169.8, 714.8, 70.0),
2213            span(" fakebold", 170.1, 714.8, 70.0),
2214            span(" fakebold word.", 170.4, 714.8, 110.0),
2215        ]);
2216
2217        assert_eq!(
2218            texts,
2219            vec!["1 This is fakebold text.", "2 This is a fakebold word.",]
2220        );
2221    }
2222}