Skip to main content

pdf_engine/
text.rs

1//! Text extraction via a custom Device implementation.
2
3use kurbo::{Affine, BezPath, Shape};
4use pdf_render::pdf_interpret::cmap::BfString;
5use pdf_render::pdf_interpret::font::Glyph;
6use pdf_render::pdf_interpret::{
7    BlendMode, ClipPath, Device, GlyphDrawMode, Image, Paint, PathDrawMode, SoftMask,
8};
9use std::cmp::Ordering;
10
/// Absolute floor for the Y tolerance used when grouping spans into
/// horizontal bands. Per ANN[r17/TEX4] the effective tolerance is
/// normally `median_font_size * BAND_Y_FRACTION`; this constant is the
/// lower bound on that value.
const BAND_Y_TOLERANCE: f64 = 5.0;
/// Share of the page's median font size that serves as the band-Y
/// tolerance. 0.30× was chosen empirically: it stays under typical
/// leading (~1.2×), so neighbouring lines never collapse into one band,
/// while remaining larger than sub-pixel baseline drift.
const BAND_Y_FRACTION: f64 = 0.30;
/// Factor applied to the median line spacing to obtain the horizontal
/// paragraph-break cut threshold. Ordinary line-to-line progression sits
/// near 1.0× the median, whereas paragraph breaks usually show 1.5× or
/// more.
const PARAGRAPH_BREAK_LINE_SPACING_MULTIPLIER: f64 = 1.8;
24
// ANN[r17/TEX1] Thresholds for the multi-signal space consensus.
// The earlier scheme relied on one threshold (gap > 0.15 * font_size).
// It missed word boundaries whenever kerning or a narrow font shrank the
// measured gap even though the PDF carried an explicit TJ backward
// shift, and it over-emitted spaces in condensed fonts, where 0.15em of
// kerning is far below a real word space. The consensus scheme weights
// three signals and inserts a space once their combined confidence
// reaches SPACE_CONSENSUS_THRESHOLD.
/// Raw TJ backward adjustment (positive in PDF TJ units) regarded as a
/// definite word break. Follows the pdftotext / MuPDF heuristics: a
/// space is usually emitted either as a literal 0x20 or as a TJ
/// adjustment of roughly 250 1/1000 em, so 100 units is a safely
/// conservative floor.
const TJ_SPACE_THRESHOLD_UNITS: f32 = 100.0;
/// Confidence contributed by the TJ offset signal when it fires.
const TJ_SIGNAL_WEIGHT: f64 = 0.95;
/// Confidence contributed by the purely geometric gap signal.
const GAP_SIGNAL_WEIGHT: f64 = 0.80;
/// Confidence contributed by the character-heuristic signals
/// (CamelCase, digit↔letter).
const HEURISTIC_SIGNAL_WEIGHT: f64 = 0.60;
/// Combined confidence at which a space is inserted.
const SPACE_CONSENSUS_THRESHOLD: f64 = 0.75;
/// A gap larger than this fraction of the median character width
/// contributes to the geometric signal (pdf_oxide uses ~0.30).
const GAP_TO_MEDIAN_CHAR_FRACTION: f64 = 0.30;
/// Gap fraction relative to `font_size`, used as the fallback while the
/// running median character width has not been established yet.
const GAP_TO_FONT_SIZE_FALLBACK_FRACTION: f64 = 0.15;
53
/// Lower clamp of the adaptive column-gutter threshold: the smallest
/// horizontal gap that is treated as a column gutter.
const COLUMN_GAP_THRESHOLD_MIN: f64 = 10.0;
/// Upper clamp of the adaptive column-gap threshold.
const COLUMN_GAP_THRESHOLD_MAX: f64 = 40.0;
/// Factor applied to the median inter-word gap to obtain the column
/// threshold.
const COLUMN_GAP_MEDIAN_MULTIPLIER: f64 = 3.0;
/// Column gap threshold used when no median can be computed.
const COLUMN_GAP_THRESHOLD_FALLBACK: f64 = 20.0;
/// Largest drift tolerated when matching gutters across neighbouring
/// bands.
const COLUMN_GAP_MATCH_TOLERANCE: f64 = 12.0;
/// Number of gapped bands that must exist before column mode is enabled.
const MIN_COLUMN_GAPPED_BANDS: usize = 3;
/// Smallest fraction of the bands in a region that must expose the
/// shared gutters.
const MIN_COLUMN_GAP_SUPPORT: f64 = 0.80;
/// Smallest fraction of non-empty column slices that must look like
/// prose.
const MIN_DENSE_SLICE_RATIO: f64 = 0.35;
70
/// Provenance of a text span's width: taken from real font metrics or
/// merely estimated.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Default)]
#[cfg_attr(feature = "serde", derive(serde::Serialize))]
pub enum WidthSource {
    /// The width comes from the font's actual glyph advance (hmtx, CFF,
    /// Type1 charstring).
    Metric,
    /// No glyph metric was available, so the width is an estimate of
    /// 50 % of the font size.
    #[default]
    Estimate,
}

impl WidthSource {
    /// Returns the stable label (`"Metric"` / `"Estimate"`) shared by the
    /// SDK bindings and the JSON wire form. This method is the single
    /// source of truth for the string; the `serde` derive on the enum
    /// yields identical values.
    pub fn as_str(&self) -> &'static str {
        match *self {
            Self::Metric => "Metric",
            Self::Estimate => "Estimate",
        }
    }
}
93
/// Selects how much per-glyph geometry the extractor produces.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Default)]
pub enum GeometryMode {
    /// Only advance-based bounds are produced. This is the default.
    #[default]
    Basic,
    /// Additionally produces full tight glyph bounds derived from the
    /// outline paths.
    RichGeometry,
}
103
/// Records where a per-glyph bounding box came from.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Default)]
pub enum BoundsSource {
    /// Bounds derived from the glyph advance. This is the default.
    #[default]
    Advance,
    /// Tight bounds derived from the glyph's outline path.
    Tight,
    /// Fallback bounds, used when no outline was available.
    Estimate,
}
115
/// Vertical metrics of a font, expressed in /1000 em (1000 units-per-em)
/// and read from the embedded font's OS/2 / hhea tables via skrifa.
/// `ascent` is positive (above the baseline) and `descent` is negative
/// (below it). A span carries `None` instead of this struct when no
/// embedded font binary exposes metrics (non-embedded standard-14, CFF,
/// Type1, Type3).
#[derive(Debug, Clone, Copy, PartialEq, Default)]
#[cfg_attr(feature = "serde", derive(serde::Serialize))]
pub struct FontMetrics {
    /// Distance the font rises above the baseline, in /1000 em.
    pub ascent: f64,
    /// Distance the font drops below the baseline, in /1000 em; the
    /// value is negative.
    pub descent: f64,
    /// Cap height in /1000 em, if the font provides one. Serialized as
    /// `capHeight` and omitted when absent.
    #[cfg_attr(
        feature = "serde",
        serde(rename = "capHeight", skip_serializing_if = "Option::is_none")
    )]
    pub cap_height: Option<f64>,
    /// x-height in /1000 em, if the font provides one. Serialized as
    /// `xHeight` and omitted when absent.
    #[cfg_attr(
        feature = "serde",
        serde(rename = "xHeight", skip_serializing_if = "Option::is_none")
    )]
    pub x_height: Option<f64>,
}
141
142/// A single text span at a specific position.
143#[derive(Debug, Clone, Default)]
144pub struct TextSpan {
145    /// The extracted text.
146    pub text: String,
147    /// X position in user space.
148    pub x: f64,
149    /// Y position in user space.
150    pub y: f64,
151    /// Approximate bounding-box width in user space.
152    pub width: f64,
153    /// Approximate bounding-box height in user space.
154    pub height: f64,
155    /// Font size (approximate, from transform).
156    pub font_size: f64,
157
158    // ---- G1 read-only metadata (added 2026-05; backward-compatible) ----
159    /// PostScript name of the font, with any 6-character subset prefix stripped
160    /// (e.g. `Helvetica-Bold`, `TimesNewRomanPS-BoldMT`). `None` for Type1
161    /// standard-14 fonts and Type3 fonts where no embedded font data is
162    /// available through the public `pdf-interpret` API.
163    pub font_name: Option<String>,
164    /// Inferred bold style: `weight >= 700` or PostScript name suggests bold
165    /// ("bold", "demi", "semibold", "heavy", "black"). Defaults `false` when
166    /// no descriptor data is reachable.
167    pub is_bold: bool,
168    /// Inferred italic style: FontDescriptor /Italic flag set or PostScript
169    /// name suggests italic/oblique/slant. Defaults `false` when no descriptor
170    /// data is reachable.
171    pub is_italic: bool,
172    /// Fill color as sRGB RGBA, derived from `Paint::Color(c).to_rgba().to_rgba8()`
173    /// at the moment of glyph paint. `None` for `Paint::Pattern` (tiling /
174    /// shading) — the editor falls back to "auto" in that case.
175    pub color: Option<[u8; 4]>,
176
177    // ---- G2 glyph-level metrics (added 2026-05) ----
178    /// Whether glyph widths were measured from real font advance data or estimated.
179    pub width_source: WidthSource,
180    /// Per-glyph bounding boxes in user-space, one entry per source glyph.
181    /// `[x0, y0, x1, y1]` with y0 < y1 (PDF coordinate frame).
182    pub char_bounds: Vec<[f64; 4]>,
183
184    // ---- M3 rich geometry (added 2026-06; backward-compatible) ----
185    #[doc(hidden)]
186    pub geometry_mode: GeometryMode,
187    #[doc(hidden)]
188    pub bounds_source: BoundsSource,
189    #[doc(hidden)]
190    pub tight_char_bounds: Vec<[f64; 4]>,
191    #[doc(hidden)]
192    pub glyph_advances: Vec<f64>,
193    #[doc(hidden)]
194    pub glyph_bounds_sources: Vec<BoundsSource>,
195
196    // ---- Golf 1 typographic metadata (added 2026-06; backward-compatible) ----
197    /// Full affine transform of the span's first glyph in user space,
198    /// `[a, b, c, d, e, f]` (kurbo coeffs of CTM × text-matrix). Captures
199    /// rotation and shear that `(x, y, font_size)` discards. `None` for spans
200    /// not built from a glyph (e.g. synthetic markers).
201    pub transform: Option<[f64; 6]>,
202    /// Numeric font weight (~100–900) from the embedded font's OS/2 table when
203    /// available. `None` for non-embedded / standard-14 fonts.
204    pub font_weight: Option<u16>,
205    /// Whether the font is serif, when determinable from embedded font data.
206    /// `None` when no embedded descriptor is reachable.
207    pub is_serif: Option<bool>,
208    /// Whether the font is monospace, when determinable from embedded font data.
209    /// `None` when no embedded descriptor is reachable.
210    pub is_monospace: Option<bool>,
211    /// Coarse PDF text render mode: `0` fill, `1` stroke, `3` invisible — the
212    /// only three values the renderer's `GlyphDrawMode` expresses (the
213    /// fill+stroke / clip modes 2 and 4–7 are not distinguished). Reflects the
214    /// span's first glyph. `None` for spans not built from a glyph.
215    pub render_mode: Option<u8>,
216
217    // ---- Golf 2 font metrics (added 2026-06; backward-compatible) ----
218    /// Vertical font metrics (ascent/descent, optional cap/x-height) in /1000
219    /// em from the embedded font binary. `None` for non-embedded fonts.
220    pub font_metrics: Option<FontMetrics>,
221}
222
223impl TextSpan {
224    /// Conservative right edge using whichever is wider: measured or estimated.
225    /// Used by column detection to avoid underestimating span extent.
226    fn right(&self) -> f64 {
227        self.x + self.width.max(self.estimated_width())
228    }
229
230    /// Right edge from measured glyph positions only.
231    fn measured_right(&self) -> f64 {
232        self.x + self.width
233    }
234
235    fn estimated_width(&self) -> f64 {
236        let char_count = self.text.chars().count() as f64;
237        if char_count <= 0.0 {
238            self.font_size * 0.5
239        } else {
240            self.font_size * 0.5 * char_count
241        }
242    }
243}
244
245/// A block of text (grouped by reading order).
246#[derive(Debug, Clone)]
247pub struct TextBlock {
248    /// Spans within this block, sorted by position.
249    pub spans: Vec<TextSpan>,
250}
251
252impl TextBlock {
253    /// Concatenate all spans into a single string.
254    ///
255    /// Spans that are close together are joined without a separator;
256    /// a space is inserted when the gap between spans exceeds half
257    /// the average character width.
258    pub fn text(&self) -> String {
259        if self.spans.is_empty() {
260            return String::new();
261        }
262        let mut result = self.spans[0].text.clone();
263        for pair in self.spans.windows(2) {
264            let prev = &pair[0];
265            let curr = &pair[1];
266            let expected_end = prev.measured_right();
267            let gap = curr.x - expected_end;
268            if gap <= prev.font_size * 0.12 {
269                if let Some(trimmed) = trim_overlapping_word_prefix(&prev.text, &curr.text) {
270                    result.push_str(&trimmed);
271                    continue;
272                }
273            }
274            if gap > prev.font_size * 0.25 {
275                result.push(' ');
276            }
277            result.push_str(&curr.text);
278        }
279        result
280    }
281}
282
283#[derive(Debug, Clone)]
284struct TextBand {
285    y: f64,
286    spans: Vec<TextSpan>,
287}
288
289impl TextBand {
290    fn new(span: TextSpan) -> Self {
291        Self {
292            y: span.y,
293            spans: vec![span],
294        }
295    }
296
297    fn sort_spans(&mut self) {
298        self.spans.sort_by(|a, b| {
299            a.x.partial_cmp(&b.x)
300                .unwrap_or(Ordering::Equal)
301                .then_with(|| b.y.partial_cmp(&a.y).unwrap_or(Ordering::Equal))
302        });
303        collapse_overprinted_spans(&mut self.spans);
304    }
305
306    fn row_block(&self) -> TextBlock {
307        let mut spans = self.spans.clone();
308        spans.sort_by(|a, b| a.x.partial_cmp(&b.x).unwrap_or(Ordering::Equal));
309        TextBlock { spans }
310    }
311
312    fn left(&self) -> f64 {
313        self.spans
314            .iter()
315            .map(|span| span.x)
316            .fold(f64::INFINITY, f64::min)
317    }
318
319    fn right(&self) -> f64 {
320        self.spans
321            .iter()
322            .map(TextSpan::right)
323            .fold(f64::NEG_INFINITY, f64::max)
324    }
325
326    fn width(&self) -> f64 {
327        (self.right() - self.left()).max(0.0)
328    }
329
330    fn gap_midpoints(&self, column_gap_threshold: f64) -> Vec<f64> {
331        self.gaps(column_gap_threshold)
332            .into_iter()
333            .map(|gap| (gap.start + gap.end) * 0.5)
334            .collect()
335    }
336
337    fn gaps(&self, column_gap_threshold: f64) -> Vec<BandGap> {
338        if self.spans.len() < 2 {
339            return Vec::new();
340        }
341
342        let mut spans = self.spans.clone();
343        spans.sort_by(|a, b| a.x.partial_cmp(&b.x).unwrap_or(Ordering::Equal));
344
345        let mut gaps = Vec::new();
346        let mut prev_right = spans[0].right();
347        for span in spans.iter().skip(1) {
348            let gap = span.x - prev_right;
349            if gap >= column_gap_threshold {
350                gaps.push(BandGap {
351                    start: prev_right,
352                    end: span.x,
353                });
354            }
355            prev_right = prev_right.max(span.right());
356        }
357
358        gaps
359    }
360
361    fn split_by_boundaries(&self, boundaries: &[f64]) -> Vec<Vec<TextSpan>> {
362        let mut columns = vec![Vec::new(); boundaries.len() + 1];
363        for span in &self.spans {
364            let center_x = span.x + span.width.max(span.estimated_width()) * 0.5;
365            let column_idx = boundaries
366                .iter()
367                .position(|boundary| center_x < *boundary)
368                .unwrap_or(boundaries.len());
369            columns[column_idx].push(span.clone());
370        }
371
372        for spans in &mut columns {
373            spans.sort_by(|a, b| a.x.partial_cmp(&b.x).unwrap_or(Ordering::Equal));
374        }
375
376        columns
377    }
378
379    fn fits_single_column(
380        &self,
381        boundaries: &[f64],
382        region_left: f64,
383        region_right: f64,
384    ) -> Option<usize> {
385        let mut column_idx: Option<usize> = None;
386        for span in &self.spans {
387            let left = span.x;
388            let right = span.right();
389            if boundaries
390                .iter()
391                .any(|boundary| left < *boundary && right > *boundary)
392            {
393                return None;
394            }
395
396            let center_x = left + (right - left) * 0.5;
397            let idx = boundaries
398                .iter()
399                .position(|boundary| center_x < *boundary)
400                .unwrap_or(boundaries.len());
401            match column_idx {
402                Some(existing) if existing != idx => return None,
403                Some(_) => {}
404                None => column_idx = Some(idx),
405            }
406        }
407        let idx = column_idx?;
408        let mut edges = Vec::with_capacity(boundaries.len() + 2);
409        edges.push(region_left);
410        edges.extend_from_slice(boundaries);
411        edges.push(region_right);
412
413        let column_width = (edges[idx + 1] - edges[idx]).max(0.0);
414        if column_width <= 0.0 || self.width() > column_width * 0.8 {
415            return None;
416        }
417
418        Some(idx)
419    }
420}
421
/// Horizontal gap between neighbouring content within one band.
#[derive(Debug, Clone, Copy)]
struct BandGap {
    /// Left edge of the gap: the running right edge of the content
    /// before it.
    start: f64,
    /// Right edge of the gap: the `x` of the span that follows it.
    end: f64,
}
427
428/// A Device implementation that captures text from draw_glyph calls.
429///
430/// ANN[r17/TEX1][r17/TEX3] Space detection uses a multi-signal consensus
431/// rather than a single geometric threshold. Three signals vote:
432///   1. `pending_tj_offset`  — raw TJ backward shift surfaced by the
433///      interpreter (confidence 0.95). This is the definitive word-break
434///      signal used by pdftotext / MuPDF.
435///   2. geometric gap        — measured horizontal distance between the
436///      previous glyph's right edge and this glyph's origin (confidence
437///      0.80). Compared against the running median glyph width rather
438///      than a flat em-fraction so condensed/wide fonts are handled
439///      uniformly.
440///   3. character heuristic  — CamelCase transition or digit↔letter
441///      transition at the merge point (confidence 0.60). Catches cases
442///      where the writer relied on typography (e.g. table cells glued
443///      with zero gap: `Qty1Price$5`).
444///
445/// A space is inserted when the weighted sum meets SPACE_CONSENSUS_THRESHOLD.
446/// Span accumulation still merges adjacent glyphs into one TextSpan (TEX3)
447/// so downstream reading-order logic sees logical text runs, not individual
448/// character positions.
449#[doc(hidden)]
450pub struct TextExtractionDevice {
451    spans: Vec<TextSpan>,
452    last_y: f64,
453    last_end_x: f64,
454    /// TJ adjustment in raw 1/1000 em units since the last glyph was
455    /// drawn. Positive values = backward shift (i.e., explicit horizontal
456    /// space). Reset every time a glyph is drawn.
457    pending_tj_offset: f32,
458    /// Running sample of measured glyph widths used as the adaptive
459    /// reference for the geometric gap signal. Cheap to maintain and
460    /// avoids having to re-walk all spans per decision.
461    glyph_widths: Vec<f64>,
462    /// Cached median glyph width (kept fresh every `MEDIAN_REFRESH`
463    /// insertions). Zero = not yet established, caller falls back to
464    /// font-size scaling.
465    cached_median_char_width: f64,
466    /// Geometry extraction mode — Basic (advance bounds) or RichGeometry (tight bounds).
467    geometry_mode: GeometryMode,
468    /// Per-glyph data recorded during extraction for deferred tight-bounds
469    /// computation. Only populated in RichGeometry mode. The index into this
470    /// vec corresponds 1:1 with the glyph order across all spans.
471    deferred_rich_glyphs: Vec<DeferredGlyph>,
472}
473
474/// Lightweight glyph data recorded during extraction for deferred tight-bounds
475/// computation. Avoids computing bounds on the hot path.
476#[derive(Clone)]
477struct DeferredGlyph {
478    /// Transform coefficients [a,b,c,d,e,f] from composed (CTM * glyph_transform)
479    coeffs: [f64; 6],
480    /// Font size in points
481    font_size: f64,
482    /// Advance width in page-space points
483    glyph_width: f64,
484    /// Whether this glyph needs transform-then-bound (rotation/shear)
485    needs_exact: bool,
486    /// The glyph outline path in raw glyph space. Retained ONLY for
487    /// rotated/sheared glyphs (`needs_exact`), where a tight bound requires
488    /// bounding the transformed path; `None` for axis-aligned glyphs (the
489    /// `font_bbox` suffices) and for ink-less glyphs (spaces, Type3).
490    outline: Option<BezPath>,
491    /// Pre-computed bounding box of the outline in raw glyph space. `None` when
492    /// the glyph has no ink (space, Type3, or empty outline) → Estimate.
493    font_bbox: Option<kurbo::Rect>,
494}
495
/// The cached median character width is refreshed each time the number
/// of recorded glyph widths is a multiple of this value.
const MEDIAN_REFRESH: usize = 32;
497
498impl Default for TextExtractionDevice {
499    fn default() -> Self {
500        Self::new()
501    }
502}
503
504impl TextExtractionDevice {
505    /// Create a new text extraction device with Basic geometry (default).
506    pub fn new() -> Self {
507        Self::with_mode(GeometryMode::Basic)
508    }
509
510    /// Create a new text extraction device with the given geometry mode.
511    pub fn with_mode(geometry_mode: GeometryMode) -> Self {
512        Self {
513            spans: Vec::new(),
514            last_y: f64::NEG_INFINITY,
515            last_end_x: f64::NEG_INFINITY,
516            pending_tj_offset: 0.0,
517            glyph_widths: Vec::new(),
518            cached_median_char_width: 0.0,
519            geometry_mode,
520            deferred_rich_glyphs: Vec::new(),
521        }
522    }
523
524    /// Refresh the cached median char width. Called lazily from
525    /// `draw_glyph` to keep the hot path cheap.
526    fn refresh_median_char_width(&mut self) {
527        if self.glyph_widths.is_empty() {
528            self.cached_median_char_width = 0.0;
529            return;
530        }
531        let mut sorted = self.glyph_widths.clone();
532        sorted.sort_by(|a, b| a.partial_cmp(b).unwrap_or(Ordering::Equal));
533        self.cached_median_char_width = sorted[sorted.len() / 2];
534    }
535
536    /// Decide whether a space should be glued between two glyphs within
537    /// the same span. Returns (insert_space, start_new_span).
538    fn evaluate_space_consensus(
539        &self,
540        gap: f64,
541        font_size: f64,
542        prev_text: &str,
543        next_text: &str,
544    ) -> bool {
545        let mut confidence = 0.0;
546
547        // Signal 1 — TJ offset (highest confidence). Raw units; a full
548        // space is ~250. Anything over TJ_SPACE_THRESHOLD_UNITS counts.
549        if self.pending_tj_offset.abs() >= TJ_SPACE_THRESHOLD_UNITS {
550            confidence += TJ_SIGNAL_WEIGHT;
551        }
552
553        // Signal 2 — geometric gap. Prefer the adaptive median-char-width
554        // reference; fall back to font-size when the median hasn't been
555        // established yet (first few glyphs on a page).
556        let gap_reference = if self.cached_median_char_width > 0.0 {
557            self.cached_median_char_width * GAP_TO_MEDIAN_CHAR_FRACTION
558        } else {
559            font_size * GAP_TO_FONT_SIZE_FALLBACK_FRACTION
560        };
561        if gap > gap_reference {
562            confidence += GAP_SIGNAL_WEIGHT;
563        }
564
565        // Signal 3 — character-class transitions. Only checked when the
566        // previous span ends with a character and the incoming text starts
567        // with one; avoids double-counting with punctuation.
568        if let (Some(prev_last), Some(next_first)) =
569            (prev_text.chars().last(), next_text.chars().next())
570        {
571            let camel = prev_last.is_lowercase() && next_first.is_uppercase();
572            let digit_to_letter = prev_last.is_ascii_digit() && next_first.is_alphabetic();
573            let letter_to_digit = prev_last.is_alphabetic() && next_first.is_ascii_digit();
574            if camel || digit_to_letter || letter_to_digit {
575                confidence += HEURISTIC_SIGNAL_WEIGHT;
576            }
577        }
578
579        confidence >= SPACE_CONSENSUS_THRESHOLD
580    }
581
582    /// Consume the device and return extracted text as a single string.
583    pub fn into_text(mut self) -> String {
584        if self.geometry_mode == GeometryMode::RichGeometry {
585            self.compute_tight_bounds();
586        }
587        let blocks = group_spans_into_blocks(self.spans);
588        let lines: Vec<String> = blocks.iter().map(|b| b.text()).collect();
589        let stitched = stitch_hyphenated_lines(&lines);
590        normalize_text_output(&stitched)
591    }
592
593    /// Consume the device and return text blocks.
594    pub fn into_blocks(mut self) -> Vec<TextBlock> {
595        if self.geometry_mode == GeometryMode::RichGeometry {
596            self.compute_tight_bounds();
597        }
598        group_spans_into_blocks(self.spans)
599    }
600
601    /// Consume the device and return raw spans.
602    #[allow(dead_code)]
603    pub(crate) fn into_spans(mut self) -> Vec<TextSpan> {
604        if self.geometry_mode == GeometryMode::RichGeometry {
605            self.compute_tight_bounds();
606        }
607        self.spans
608    }
609
610    /// Record glyph-level data for deferred tight-bounds computation.
611    /// Called from draw_glyph in RichGeometry mode only.
612    fn record_deferred_glyph(
613        &mut self,
614        glyph: &Glyph<'_>,
615        composed: &Affine,
616        font_size: f64,
617        glyph_width: f64,
618    ) {
619        let coeffs = composed.as_coeffs();
620        let is_rotated = coeffs[1].abs() > ROTATION_EPSILON || coeffs[2].abs() > ROTATION_EPSILON;
621        let is_sheared = (coeffs[0] - coeffs[3]).abs() > SHEAR_EPSILON;
622        let needs_exact = is_rotated || is_sheared;
623
624        // Capture the glyph's ink bbox in raw glyph space. The full outline path
625        // is retained only when `needs_exact` (rotated/sheared), so a tight bound
626        // can bound the transformed path; axis-aligned glyphs keep only the bbox
627        // (enveloping its transformed corners is exact) to bound memory on large
628        // pages. Ink-less glyphs (spaces, Type3) carry no bbox → Estimate later.
629        let (outline, font_bbox) = match glyph {
630            Glyph::Outline(o) => {
631                let path = o.outline();
632                let bb = path.bounding_box();
633                if bb.width() <= 0.0 && bb.height() <= 0.0 {
634                    (None, None)
635                } else if needs_exact {
636                    (Some(path), Some(bb))
637                } else {
638                    (None, Some(bb))
639                }
640            }
641            Glyph::Type3(_) => (None, None),
642        };
643
644        self.deferred_rich_glyphs.push(DeferredGlyph {
645            coeffs,
646            font_size,
647            glyph_width,
648            needs_exact,
649            outline,
650            font_bbox,
651        });
652    }
653
654    /// Compute tight bounds for all recorded glyphs and populate the spans.
655    /// Called once after all glyphs have been extracted.
656    fn compute_tight_bounds(&mut self) {
657        let deferred = std::mem::take(&mut self.deferred_rich_glyphs);
658        if deferred.is_empty() {
659            return;
660        }
661
662        let mut idx = 0usize;
663
664        for span in &mut self.spans {
665            let n = span.char_bounds.len();
666            if n == 0 {
667                continue;
668            }
669
670            let mut span_bounds_source = BoundsSource::Advance;
671
672            for _gi in 0..n {
673                if idx >= deferred.len() {
674                    break;
675                }
676                let dg = &deferred[idx];
677                idx += 1;
678
679                let composed = Affine::new(dg.coeffs);
680                let (tight_bound, source) = if let Some(font_bbox) = dg.font_bbox {
681                    // Glyph has ink. The outline/bbox is in raw glyph space and
682                    // `composed` (CTM * glyph_transform) maps it straight to page
683                    // space — exactly as the renderer fills the raw outline, with
684                    // NO upem pre-scaling. Applying any extra font_size/1000 here
685                    // would double-scale the bound.
686                    if dg.needs_exact {
687                        // Rotated/sheared: bound the transformed outline path so
688                        // the box stays tight (the transformed bbox would be
689                        // looser). Falls back to enveloping the bbox corners if
690                        // the path was not retained.
691                        let page_bbox = match dg.outline {
692                            Some(ref path) => (composed * path.clone()).bounding_box(),
693                            None => {
694                                let raw = [font_bbox.x0, font_bbox.y0, font_bbox.x1, font_bbox.y1];
695                                let b = transform_bbox_corners(&raw, &composed);
696                                kurbo::Rect::new(b[0], b[1], b[2], b[3])
697                            }
698                        };
699                        (
700                            [page_bbox.x0, page_bbox.y0, page_bbox.x1, page_bbox.y1],
701                            BoundsSource::Tight,
702                        )
703                    } else {
704                        // Axis-aligned scale + translate: enveloping the
705                        // transformed bbox corners is exact.
706                        let raw = [font_bbox.x0, font_bbox.y0, font_bbox.x1, font_bbox.y1];
707                        (transform_bbox_corners(&raw, &composed), BoundsSource::Tight)
708                    }
709                } else {
710                    // No ink (space, Type3, missing outline): fall back to the
711                    // advance box in page space, in the same frame as the span's
712                    // char_bounds ([x, y, x + advance, y + font_size]).
713                    let x = dg.coeffs[4];
714                    let y = dg.coeffs[5];
715                    (
716                        [x, y, x + dg.glyph_width, y + dg.font_size],
717                        BoundsSource::Estimate,
718                    )
719                };
720
721                span.tight_char_bounds.push(tight_bound);
722                span.glyph_advances.push(dg.glyph_width);
723                span.glyph_bounds_sources.push(source);
724
725                if source == BoundsSource::Tight && span_bounds_source != BoundsSource::Estimate {
726                    span_bounds_source = BoundsSource::Tight;
727                } else if source == BoundsSource::Estimate {
728                    span_bounds_source = BoundsSource::Estimate;
729                }
730            }
731
732            span.bounds_source = span_bounds_source;
733        }
734    }
735}
736
737impl Device<'_> for TextExtractionDevice {
738    fn set_soft_mask(&mut self, _: Option<SoftMask<'_>>) {}
739    fn set_blend_mode(&mut self, _: BlendMode) {}
740    fn draw_path(&mut self, _: &BezPath, _: Affine, _: &Paint<'_>, _: &PathDrawMode) {}
741    fn push_clip_path(&mut self, _: &ClipPath) {}
742    fn push_transparency_group(&mut self, _: f32, _: Option<SoftMask<'_>>, _: BlendMode) {}
743    fn draw_image(&mut self, _: Image<'_, '_>, _: Affine) {}
744    fn pop_clip_path(&mut self) {}
745    fn pop_transparency_group(&mut self) {}
746
747    fn draw_glyph(
748        &mut self,
749        glyph: &Glyph<'_>,
750        transform: Affine,
751        glyph_transform: Affine,
752        paint: &Paint<'_>,
753        draw_mode: &GlyphDrawMode,
754    ) {
755        let text = match glyph.as_unicode() {
756            Some(BfString::Char(c)) => c.to_string(),
757            Some(BfString::String(s)) => s,
758            None => return,
759        };
760
761        let composed = transform * glyph_transform;
762        let coeffs = composed.as_coeffs();
763        let x = coeffs[4];
764        let y = coeffs[5];
765        let glyph_scale = (coeffs[0].powi(2) + coeffs[1].powi(2)).sqrt().abs();
766        let font_size = glyph_scale * 1000.0;
767
768        // G2: distinguish real advance (Metric) from estimate (Estimate).
769        let (glyph_width, glyph_ws) = glyph_width_and_source(glyph, font_size);
770        let glyph_end_x = x + glyph_width;
771        let glyph_bound = [x, y, glyph_end_x, y + font_size];
772
773        let style = derive_glyph_style(glyph);
774        let color = paint_to_rgba(paint);
775
776        // M3: record glyph data for deferred tight-bounds computation.
777        if self.geometry_mode == GeometryMode::RichGeometry {
778            self.record_deferred_glyph(glyph, &composed, font_size, glyph_width);
779        }
780
781        // ANN[r17/TEX4] Feed the running sample used to derive the adaptive
782        // median character width. Capped to protect against pathological
783        // pages with hundreds of thousands of glyphs.
784        if self.glyph_widths.len() < 4096 {
785            self.glyph_widths.push(glyph_width);
786            if self.glyph_widths.len().is_multiple_of(MEDIAN_REFRESH) {
787                self.refresh_median_char_width();
788            }
789        }
790
791        let same_line = (y - self.last_y).abs() <= font_size.max(BAND_Y_TOLERANCE) * 0.35;
792        let gap = x - self.last_end_x;
793        let adjacent = same_line && gap >= -font_size * 0.25 && gap < font_size * 0.5;
794
795        // G1: only merge into the previous span when font + style + color
796        // match. Otherwise the editor's style toolbar would render the wrong
797        // state for the cursor position.
798        let style_matches = self
799            .spans
800            .last()
801            .map(|last| {
802                last.font_name == style.font_name
803                    && last.is_bold == style.is_bold
804                    && last.is_italic == style.is_italic
805                    && last.color == color
806            })
807            .unwrap_or(false);
808
809        if adjacent && !self.spans.is_empty() && style_matches {
810            // ANN[r17/TEX1] Multi-signal consensus replaces the prior
811            // single-threshold rule (`gap > 0.15 * font_size`). The
812            // consensus evaluates TJ offset, geometric gap, and
813            // character-class transitions; a space is inserted only
814            // when the weighted sum meets SPACE_CONSENSUS_THRESHOLD.
815            // Decision is computed before the mutable borrow of `last`
816            // to keep the borrow checker happy.
817            let want_space = {
818                let last = self.spans.last().expect("checked non-empty");
819                !last.text.ends_with(' ')
820                    && !text.starts_with(' ')
821                    && self.evaluate_space_consensus(gap, font_size, &last.text, &text)
822            };
823            let last = self.spans.last_mut().expect("checked non-empty");
824            if want_space {
825                last.text.push(' ');
826            }
827            last.text.push_str(&text);
828            last.width = last.width.max(glyph_end_x - last.x);
829            last.height = last.height.max(font_size);
830            // G2: append char_bound; downgrade width_source if this glyph is Estimate.
831            last.char_bounds.push(glyph_bound);
832            if glyph_ws == WidthSource::Estimate {
833                last.width_source = WidthSource::Estimate;
834            }
835            self.last_y = y;
836            self.last_end_x = glyph_end_x;
837            // ANN[r17/TEX1] Consume the TJ signal: it only counts for
838            // the one merge it preceded.
839            self.pending_tj_offset = 0.0;
840            return;
841        }
842
843        self.last_y = y;
844        self.last_end_x = glyph_end_x;
845        // ANN[r17/TEX1] Non-adjacent glyph starts a fresh span, so any
846        // pending TJ offset is about within-span word breaks and no longer
847        // meaningful here.
848        self.pending_tj_offset = 0.0;
849
850        let span = TextSpan {
851            text,
852            x,
853            y,
854            width: glyph_width,
855            height: font_size,
856            font_size,
857            font_name: style.font_name,
858            is_bold: style.is_bold,
859            is_italic: style.is_italic,
860            color,
861            width_source: glyph_ws,
862            char_bounds: vec![glyph_bound],
863            transform: Some(coeffs),
864            font_weight: style.font_weight,
865            is_serif: style.is_serif,
866            is_monospace: style.is_monospace,
867            render_mode: Some(render_mode_from_draw_mode(draw_mode)),
868            font_metrics: style.font_metrics,
869            geometry_mode: self.geometry_mode,
870            bounds_source: BoundsSource::Advance,
871            tight_char_bounds: Vec::new(),
872            glyph_advances: Vec::new(),
873            glyph_bounds_sources: Vec::new(),
874        };
875
876        self.spans.push(span);
877    }
878
879    // ANN[r17/TEX1] Record TJ offsets. Accumulate because a single
880    // inter-substring gap may be expressed as multiple numeric entries
881    // (rare, but legal per PDF §9.4.3). The next draw_glyph consumes
882    // the sum.
883    fn text_adjustment(&mut self, amount: f32) {
884        self.pending_tj_offset += amount;
885    }
886}
887
/// Style metadata derived from a `Glyph` for the G1 text-run extension.
///
/// `Default` yields the "nothing known" style (no name, not bold, not
/// italic, no descriptor data); `derive_glyph_style` uses it for Type3 glyphs.
#[derive(Debug, Default, Clone)]
struct GlyphStyle {
    /// PostScript font name with any subset prefix stripped; `None` when the
    /// name is empty or unavailable.
    font_name: Option<String>,
    /// Bold as inferred from the descriptor weight (>= 700) and/or the name.
    is_bold: bool,
    /// Italic as inferred from the descriptor flag and/or the name.
    is_italic: bool,
    /// Numeric weight from embedded font data, if available.
    font_weight: Option<u16>,
    /// Serif flag from embedded font data, if available.
    is_serif: Option<bool>,
    /// Monospace flag from embedded font data, if available.
    is_monospace: Option<bool>,
    /// Vertical font metrics from embedded font data, if available.
    font_metrics: Option<FontMetrics>,
}
903
/// Strip a font-subset tag (e.g. `AAAAAA+Helvetica` → `Helvetica`).
///
/// Per the PDF specification a subset tag is exactly six uppercase letters
/// followed by `+`. The prefix is removed only when it has that shape, so a
/// genuine name that merely contains a `+` after six other characters
/// (e.g. `Source+Sans`) is returned unchanged.
fn strip_subset_prefix(name: &str) -> &str {
    match name.split_once('+') {
        Some((tag, rest)) if tag.len() == 6 && tag.bytes().all(|b| b.is_ascii_uppercase()) => {
            rest
        }
        _ => name,
    }
}
911
/// Guess `(bold, italic)` from a PostScript font name when no descriptor
/// flags are reachable. The rules are intended to mirror the ones
/// `pdf-interpret` applies in `FallbackFontQuery::new`.
///
/// Matching is a case-insensitive (ASCII) substring search.
fn name_style_hints(name: &str) -> (bool, bool) {
    const ITALIC_MARKERS: [&str; 3] = ["italic", "oblique", "slant"];
    const BOLD_MARKERS: [&str; 5] = ["bold", "demi", "semibold", "heavy", "black"];

    /// True when `haystack` contains at least one of `markers`.
    fn contains_any(haystack: &str, markers: &[&str]) -> bool {
        markers.iter().any(|marker| haystack.contains(*marker))
    }

    let folded = name.to_ascii_lowercase();
    let is_bold = contains_any(&folded, &BOLD_MARKERS);
    let is_italic = contains_any(&folded, &ITALIC_MARKERS);
    (is_bold, is_italic)
}
925
926fn derive_glyph_style(glyph: &Glyph<'_>) -> GlyphStyle {
927    match glyph {
928        Glyph::Outline(outline) => {
929            if let Some(data) = outline.font_data() {
930                let raw = data.postscript_name.as_deref().unwrap_or("");
931                let name = strip_subset_prefix(raw).to_string();
932                let weight_bold = data.weight.is_some_and(|w| w >= 700);
933                let (name_bold, name_italic) = name_style_hints(&name);
934                GlyphStyle {
935                    font_name: if name.is_empty() { None } else { Some(name) },
936                    is_bold: weight_bold || name_bold,
937                    is_italic: data.is_italic || name_italic,
938                    font_weight: data.weight.map(|w| w.clamp(1, 1000) as u16),
939                    is_serif: Some(data.is_serif),
940                    is_monospace: Some(data.is_monospace),
941                    font_metrics: match (data.ascent, data.descent) {
942                        (Some(ascent), Some(descent)) => Some(FontMetrics {
943                            ascent,
944                            descent,
945                            cap_height: data.cap_height,
946                            x_height: data.x_height,
947                        }),
948                        _ => None,
949                    },
950                }
951            } else {
952                // Type1 / non-embedded font — descriptor not surfaced
953                // via font_data(). Fall back to the name-only
954                // accessor which works for standard-14 fallbacks,
955                // and use the canonical Standard-14 AFM metrics
956                // when the outline font resolves to a known face.
957                let raw = outline.postscript_name().unwrap_or_default();
958                let name = strip_subset_prefix(&raw).to_string();
959                let (name_bold, name_italic) = name_style_hints(&name);
960                let metrics = outline.font_metrics().map(|(a, d, c, x)| FontMetrics {
961                    ascent: a,
962                    descent: d,
963                    cap_height: c,
964                    x_height: x,
965                });
966                GlyphStyle {
967                    font_name: if name.is_empty() { None } else { Some(name) },
968                    is_bold: name_bold,
969                    is_italic: name_italic,
970                    // Non-embedded font: descriptor metrics are not reachable.
971                    font_weight: None,
972                    is_serif: None,
973                    is_monospace: None,
974                    font_metrics: metrics,
975                }
976            }
977        }
978        Glyph::Type3(_) => GlyphStyle::default(),
979    }
980}
981
982fn paint_to_rgba(paint: &Paint<'_>) -> Option<[u8; 4]> {
983    match paint {
984        Paint::Color(c) => Some(c.to_rgba().to_rgba8()),
985        Paint::Pattern(_) => None,
986    }
987}
988
989/// Map the renderer's coarse glyph draw mode to a PDF text render-mode code.
990///
991/// NOTE: the interpreter collapses the eight PDF text render modes (Tr 0–7)
992/// into three visibility classes, so only `{0, 1, 3}` are representable here —
993/// fill (0), stroke (1), and invisible (3). The fill+stroke and clip modes
994/// (2, 4–7) are not distinguished. This is sufficient to identify and filter
995/// invisible (mode-3) OCR/overlay text; recovering the full `Tr` range would
996/// require threading it through the `Device` trait (out of scope).
997fn render_mode_from_draw_mode(mode: &GlyphDrawMode) -> u8 {
998    match mode {
999        GlyphDrawMode::Fill => 0,
1000        GlyphDrawMode::Stroke(_) => 1,
1001        GlyphDrawMode::Invisible => 3,
1002    }
1003}
1004
#[cfg(test)]
mod render_mode_tests {
    use super::render_mode_from_draw_mode;
    use pdf_render::pdf_interpret::{GlyphDrawMode, StrokeProps};

    /// Every draw mode maps to its expected code, and that code is always
    /// one of 0, 1 or 3 — never 2 or 4–7.
    #[test]
    fn render_mode_is_only_zero_one_three() {
        let expectations = [
            (GlyphDrawMode::Fill, 0u8),
            (GlyphDrawMode::Stroke(StrokeProps::default()), 1),
            (GlyphDrawMode::Invisible, 3),
        ];

        for (mode, expected) in expectations {
            let code = render_mode_from_draw_mode(&mode);
            assert_eq!(code, expected);
            assert!(matches!(code, 0 | 1 | 3));
        }
    }
}
1028
1029/// Returns `(advance_in_user_space, WidthSource)` for a glyph.
1030///
1031/// Uses the real advance from `OutlineGlyph::advance_width()` when available
1032/// (returns `WidthSource::Metric`); falls back to 50% em otherwise
1033/// (`WidthSource::Estimate`). The result is clamped to at least 25% em so
1034/// invisible-glyph outliers do not collapse spans.
1035fn glyph_width_and_source(glyph: &Glyph<'_>, font_size: f64) -> (f64, WidthSource) {
1036    match glyph {
1037        Glyph::Outline(outline) => {
1038            if let Some(w) = outline.advance_width() {
1039                let advance = (w as f64 / 1000.0 * font_size).max(font_size * 0.25);
1040                (advance, WidthSource::Metric)
1041            } else {
1042                (font_size * 0.5, WidthSource::Estimate)
1043            }
1044        }
1045        Glyph::Type3(_) => (font_size * 0.5, WidthSource::Estimate),
1046    }
1047}
1048
/// Threshold for detecting rotation or shear in the composed transform.
/// Any |b| or |c| above this is treated as rotation; any difference between
/// a and d above this is treated as non-uniform scale (= shear potential).
const ROTATION_EPSILON: f64 = 1e-6;
/// NOTE(review): this constant had no documentation of its own. It is
/// presumably the tolerance for the a-vs-d (non-uniform scale / shear)
/// comparison mentioned above — confirm at the use site.
const SHEAR_EPSILON: f64 = 1e-3;
1054
1055/// Transform a bounding-box `[x0,y0,x1,y1]` (in local space) to page space
1056/// by applying the affine to its four corners and taking the envelope.
1057fn transform_bbox_corners(local_bbox: &[f64; 4], affine: &Affine) -> [f64; 4] {
1058    use kurbo::Point;
1059    let corners = [
1060        *affine * Point::new(local_bbox[0], local_bbox[1]),
1061        *affine * Point::new(local_bbox[2], local_bbox[1]),
1062        *affine * Point::new(local_bbox[2], local_bbox[3]),
1063        *affine * Point::new(local_bbox[0], local_bbox[3]),
1064    ];
1065    let x0 = corners.iter().map(|p| p.x).fold(f64::INFINITY, f64::min);
1066    let y0 = corners.iter().map(|p| p.y).fold(f64::INFINITY, f64::min);
1067    let x1 = corners
1068        .iter()
1069        .map(|p| p.x)
1070        .fold(f64::NEG_INFINITY, f64::max);
1071    let y1 = corners
1072        .iter()
1073        .map(|p| p.y)
1074        .fold(f64::NEG_INFINITY, f64::max);
1075    [x0, y0, x1, y1]
1076}
1077
1078/// Collapse fake-bold / overprint duplicates inside one band.
1079///
1080/// Real-word corpus failures such as 0105.pdf draw the same text several times
1081/// with sub-point x drift to simulate heavier weight. Text extraction should
1082/// keep the most informative span once rather than concatenate every overprint.
1083fn collapse_overprinted_spans(spans: &mut Vec<TextSpan>) {
1084    if spans.len() < 2 {
1085        return;
1086    }
1087
1088    let mut deduped: Vec<TextSpan> = Vec::with_capacity(spans.len());
1089    for span in spans.drain(..) {
1090        if let Some(last) = deduped.last_mut() {
1091            if spans_are_overprint_duplicates(last, &span) {
1092                let choose_incoming = span.text.chars().count() > last.text.chars().count()
1093                    || (span.text.chars().count() == last.text.chars().count()
1094                        && span.width > last.width);
1095                let preferred_text = if choose_incoming {
1096                    span.text.clone()
1097                } else {
1098                    last.text.clone()
1099                };
1100                let left = last.x.min(span.x);
1101                let right = last.right().max(span.right());
1102                last.x = left;
1103                last.y = (last.y + span.y) * 0.5;
1104                last.width = (right - left).max(last.width).max(span.width);
1105                last.height = last.height.max(span.height);
1106                last.font_size = last.font_size.max(span.font_size);
1107                last.text = preferred_text;
1108                continue;
1109            }
1110        }
1111
1112        deduped.push(span);
1113    }
1114
1115    *spans = deduped;
1116}
1117
1118fn spans_are_overprint_duplicates(lhs: &TextSpan, rhs: &TextSpan) -> bool {
1119    let lhs_text = lhs.text.trim();
1120    let rhs_text = rhs.text.trim();
1121    if lhs_text.is_empty() || rhs_text.is_empty() {
1122        return false;
1123    }
1124
1125    let same_baseline = (lhs.y - rhs.y).abs() <= lhs.font_size.max(rhs.font_size) * 0.12;
1126    if !same_baseline {
1127        return false;
1128    }
1129
1130    let lhs_left = lhs.x;
1131    let lhs_right = lhs.right();
1132    let rhs_left = rhs.x;
1133    let rhs_right = rhs.right();
1134    let overlap = (lhs_right.min(rhs_right) - lhs_left.max(rhs_left)).max(0.0);
1135    let min_width = (lhs_right - lhs_left).min(rhs_right - rhs_left).max(1.0);
1136    let heavily_overlaps = overlap / min_width >= 0.85;
1137    if !heavily_overlaps {
1138        return false;
1139    }
1140
1141    lhs_text == rhs_text || lhs_text.starts_with(rhs_text) || rhs_text.starts_with(lhs_text)
1142}
1143
/// If `curr` begins by repeating the word(s) that `prev` ends with, return
/// `curr` with that repeated prefix removed.
///
/// `prev` is trimmed at the end and `curr` at the start before comparing.
/// The longest overlap wins. An overlap qualifies only when it is at least
/// four characters long, purely alphanumeric, and bounded by a
/// non-alphanumeric character (or the string edge) on both sides — i.e. it
/// is a whole word, not the tail of one. Returns `None` when nothing
/// qualifies.
fn trim_overlapping_word_prefix(prev: &str, curr: &str) -> Option<String> {
    let tail: Vec<char> = prev.trim_end().chars().collect();
    let head: Vec<char> = curr.trim_start().chars().collect();
    let longest = tail.len().min(head.len());

    (4..=longest).rev().find_map(|len| {
        let (before, suffix) = tail.split_at(tail.len() - len);
        let (shared, rest) = head.split_at(len);

        let is_whole_word_overlap = suffix == shared
            && shared.iter().all(|ch| ch.is_alphanumeric())
            && !before.last().is_some_and(|ch| ch.is_alphanumeric())
            && !rest.first().is_some_and(|ch| ch.is_alphanumeric());

        is_whole_word_overlap.then(|| rest.iter().collect())
    })
}
1170
1171/// Compute an adaptive column gap threshold from a set of bands.
1172///
1173/// Collects all positive inter-span gaps within each band, computes the
1174/// median, and returns `COLUMN_GAP_MEDIAN_MULTIPLIER × median`, clamped to
1175/// `[COLUMN_GAP_THRESHOLD_MIN, COLUMN_GAP_THRESHOLD_MAX]`.  Falls back to
1176/// `COLUMN_GAP_THRESHOLD_FALLBACK` when there are no measurable gaps.
1177fn compute_adaptive_column_gap(bands: &[TextBand]) -> f64 {
1178    let mut all_gaps: Vec<f64> = Vec::new();
1179
1180    for band in bands {
1181        if band.spans.len() < 2 {
1182            continue;
1183        }
1184        let mut sorted = band.spans.clone();
1185        sorted.sort_by(|a, b| a.x.partial_cmp(&b.x).unwrap_or(Ordering::Equal));
1186        let mut prev_right = sorted[0].right();
1187        for span in sorted.iter().skip(1) {
1188            let gap = span.x - prev_right;
1189            if gap > 0.0 {
1190                all_gaps.push(gap);
1191            }
1192            prev_right = prev_right.max(span.right());
1193        }
1194    }
1195
1196    if all_gaps.is_empty() {
1197        return COLUMN_GAP_THRESHOLD_FALLBACK;
1198    }
1199
1200    all_gaps.sort_by(|a, b| a.partial_cmp(b).unwrap_or(Ordering::Equal));
1201
1202    let min_gap = all_gaps[0];
1203
1204    // When all inter-span gaps are already large (> MIN threshold), they are
1205    // likely all column gaps — the draw_glyph merger absorbed word-level
1206    // spaces into span text.  Use a fraction of the smallest gap so that
1207    // ALL column gaps exceed the threshold.
1208    if min_gap > COLUMN_GAP_THRESHOLD_MIN {
1209        return (min_gap * 0.75).clamp(COLUMN_GAP_THRESHOLD_MIN, COLUMN_GAP_THRESHOLD_MAX);
1210    }
1211
1212    // Look for a natural break: the largest relative jump between consecutive
1213    // sorted gaps separates word-level gaps from column gaps.
1214    let mut best_break_threshold = 0.0f64;
1215    let mut best_ratio = 1.5f64; // require at least 1.5× jump
1216    for pair in all_gaps.windows(2) {
1217        if pair[0] > 0.5 {
1218            let ratio = pair[1] / pair[0];
1219            if ratio > best_ratio {
1220                best_ratio = ratio;
1221                best_break_threshold = (pair[0] + pair[1]) * 0.5;
1222            }
1223        }
1224    }
1225
1226    if best_break_threshold > 0.0 {
1227        return best_break_threshold.clamp(COLUMN_GAP_THRESHOLD_MIN, COLUMN_GAP_THRESHOLD_MAX);
1228    }
1229
1230    // Fallback: median × multiplier.
1231    let mid = all_gaps.len() / 2;
1232    let median = if all_gaps.len().is_multiple_of(2) {
1233        (all_gaps[mid - 1] + all_gaps[mid]) * 0.5
1234    } else {
1235        all_gaps[mid]
1236    };
1237
1238    (median * COLUMN_GAP_MEDIAN_MULTIPLIER)
1239        .clamp(COLUMN_GAP_THRESHOLD_MIN, COLUMN_GAP_THRESHOLD_MAX)
1240}
1241
/// Per-page adaptive parameters derived from the span set before any
/// grouping happens. Centralising these here (TEX4) means the rest of
/// the pipeline — band grouping, XY-Cut cuts, in-block space insertion
/// — all speak the same typographic baseline for this specific page,
/// rather than each helper reaching for an independent fixed constant.
#[derive(Debug, Clone, Copy)]
struct PageStats {
    /// Median font size across all spans (pt).
    median_font_size: f64,
    /// Median measured character width (pt), taken per span as
    /// width / character count. Falls back to 0.5 × median_font_size
    /// when no span yields a usable sample.
    /// Currently populated for diagnostics / future tuning; allow dead_code
    /// under `-D warnings` until a reader is added.
    #[allow(dead_code)]
    median_char_width: f64,
    /// Tight line-to-line spacing (25th percentile of pairwise band
    /// gaps), representing the body-text leading on this page. The
    /// quartile is used instead of the median so large paragraph /
    /// zone gaps don't inflate the baseline. Zero if the page has
    /// only one band.
    median_line_spacing: f64,
}
1266
1267impl PageStats {
1268    fn from_spans(spans: &[TextSpan]) -> Self {
1269        if spans.is_empty() {
1270            return Self {
1271                median_font_size: 12.0,
1272                median_char_width: 6.0,
1273                median_line_spacing: 0.0,
1274            };
1275        }
1276
1277        // Median font size.
1278        let mut sizes: Vec<f64> = spans.iter().map(|s| s.font_size).collect();
1279        sizes.sort_by(|a, b| a.partial_cmp(b).unwrap_or(Ordering::Equal));
1280        let median_font_size = sizes[sizes.len() / 2];
1281
1282        // Median char width — measured width / char count, per span.
1283        let mut char_widths: Vec<f64> = spans
1284            .iter()
1285            .filter_map(|s| {
1286                let chars = s.text.chars().count();
1287                if chars > 0 && s.width > 0.0 {
1288                    Some(s.width / chars as f64)
1289                } else {
1290                    None
1291                }
1292            })
1293            .collect();
1294        let median_char_width = if char_widths.is_empty() {
1295            median_font_size * 0.5
1296        } else {
1297            char_widths.sort_by(|a, b| a.partial_cmp(b).unwrap_or(Ordering::Equal));
1298            char_widths[char_widths.len() / 2]
1299        };
1300
1301        // Median line spacing — pairwise gaps between consecutive band
1302        // y-values.
1303        let band_tolerance = (median_font_size * BAND_Y_FRACTION).max(BAND_Y_TOLERANCE);
1304        let mut ys: Vec<f64> = spans.iter().map(|s| s.y).collect();
1305        ys.sort_by(|a, b| b.partial_cmp(a).unwrap_or(Ordering::Equal));
1306        let mut band_ys: Vec<f64> = Vec::new();
1307        for y in ys {
1308            if band_ys
1309                .last()
1310                .map(|prev: &f64| (prev - y).abs() > band_tolerance)
1311                .unwrap_or(true)
1312            {
1313                band_ys.push(y);
1314            }
1315        }
1316        // ANN[r17/TEX4] "Line spacing" here means the TIGHT line-to-line
1317        // gap inside a text block — not the median of all gaps. Using
1318        // the median drags the estimate up when the page has
1319        // paragraph / zone breaks (which are the very gaps the
1320        // paragraph-break threshold is supposed to EXCEED). The 25th
1321        // percentile is the smallest gap that still shows up in more
1322        // than one place on the page; it captures body-text leading
1323        // robustly even when large zone gaps dominate.
1324        let median_line_spacing = if band_ys.len() < 2 {
1325            0.0
1326        } else {
1327            let mut spacings: Vec<f64> = band_ys
1328                .windows(2)
1329                .map(|pair| (pair[0] - pair[1]).abs())
1330                .collect();
1331            spacings.sort_by(|a, b| a.partial_cmp(b).unwrap_or(Ordering::Equal));
1332            let q1_index = spacings.len() / 4;
1333            spacings[q1_index]
1334        };
1335
1336        Self {
1337            median_font_size,
1338            median_char_width,
1339            median_line_spacing,
1340        }
1341    }
1342}
1343
/// ANN[r17/TEX2] Maximum recursion depth for XY-Cut. Any real page layout
/// is decomposable in well under 10 alternating cuts; the cap guards
/// against pathological inputs where the cut predicate keeps triggering
/// due to floating-point drift.
const XY_CUT_MAX_DEPTH: usize = 12;
/// Minimum fraction of a region's width that a vertical gap must reach
/// before it qualifies as a column gutter.
const XY_CUT_VERTICAL_GAP_REGION_FRACTION: f64 = 0.04;
/// Floor (in pt) for vertical gap regardless of region width. Matches
/// the previous `COLUMN_GAP_THRESHOLD_MIN` and keeps XY-Cut conservative
/// on narrow regions (sidebars, tall columns).
const XY_CUT_VERTICAL_GAP_FLOOR: f64 = 10.0;
/// Multiplier applied to median font size to produce the horizontal-gap
/// threshold. 1.8 × line-height matches typical paragraph spacing.
const XY_CUT_HORIZONTAL_GAP_FONT_MULTIPLIER: f64 = 1.8;
/// Minimum number of spans a column must contain before it is eligible
/// for acceptance — one-span "columns" are almost always sidebar noise
/// or table-cell fragments.
const XY_CUT_MIN_SPANS_PER_COLUMN: usize = 2;
/// Average characters per band a column must have before it's accepted
/// as dense prose (vs. a short-cell table column).
const XY_CUT_MIN_CHARS_PER_BAND: f64 = 8.0;
1366
1367/// ANN[r17/TEX2][r17/TEX4] Top-level grouping uses recursive XY-Cut
1368/// with a density guard. Per-page stats are computed once up front so
1369/// every decision downstream speaks the same typographic baseline.
1370fn group_spans_into_blocks(spans: Vec<TextSpan>) -> Vec<TextBlock> {
1371    if spans.is_empty() {
1372        return Vec::new();
1373    }
1374    let stats = PageStats::from_spans(&spans);
1375    xy_cut_recursive(spans, 0, &stats)
1376}
1377
1378fn xy_cut_recursive(spans: Vec<TextSpan>, depth: usize, stats: &PageStats) -> Vec<TextBlock> {
1379    if spans.is_empty() {
1380        return Vec::new();
1381    }
1382    if depth >= XY_CUT_MAX_DEPTH {
1383        return band_based_blocks(spans, stats);
1384    }
1385
1386    // ANN[r17/TEX2] Pick whichever direction has the largest qualifying
1387    // gap. Always cutting vertically first breaks layouts where a
1388    // footer sits in the mid-x range — it would attach to the left
1389    // column instead of being recognized as a page-level zone. The
1390    // "largest gap wins" rule is the standard XY-Cut tie-breaker used
1391    // by academic OCR literature and matches pdf_oxide.
1392    let vcut = try_vertical_cut(&spans, stats);
1393    let hcut = try_horizontal_cut(&spans, stats);
1394
1395    let (chosen, _) = match (vcut, hcut) {
1396        (Some((v_groups, v_gap)), Some((h_groups, h_gap))) => {
1397            if v_gap >= h_gap {
1398                (Some(v_groups), v_gap)
1399            } else {
1400                (Some(h_groups), h_gap)
1401            }
1402        }
1403        (Some((v_groups, v_gap)), None) => (Some(v_groups), v_gap),
1404        (None, Some((h_groups, h_gap))) => (Some(h_groups), h_gap),
1405        (None, None) => (None, 0.0),
1406    };
1407
1408    if let Some(groups) = chosen {
1409        let mut out = Vec::new();
1410        for group in groups {
1411            out.extend(xy_cut_recursive(group, depth + 1, stats));
1412        }
1413        return out;
1414    }
1415
1416    band_based_blocks(spans, stats)
1417}
1418
/// Leaf of the XY-Cut recursion: turn a region that will not be cut any
/// further into blocks. Reached when no cut qualified or the depth cap was
/// hit. Delegates to the legacy band/gutter grouping, passing the page
/// statistics through unchanged.
fn band_based_blocks(spans: Vec<TextSpan>, stats: &PageStats) -> Vec<TextBlock> {
    // XY-Cut can miss recurring gutters when a small number of bands span the
    // full page width (e.g. a running header above a 3-column body). In that
    // case, fall back to the older band/gutter detector inside the leaf region
    // instead of flattening everything row-major.
    group_spans_into_blocks_legacy_with_stats(spans, stats)
}
1429
1430/// Median font-size helper. Currently unreferenced after `PageStats` took over
1431/// the typography baseline computation; kept available for future tuning paths.
1432#[allow(dead_code)]
1433fn median_font_size(spans: &[TextSpan]) -> f64 {
1434    if spans.is_empty() {
1435        return 12.0;
1436    }
1437    let mut sizes: Vec<f64> = spans.iter().map(|s| s.font_size).collect();
1438    sizes.sort_by(|a, b| a.partial_cmp(b).unwrap_or(Ordering::Equal));
1439    sizes[sizes.len() / 2]
1440}
1441
1442/// Attempt a vertical (column) cut. Returns the span groups plus the
1443/// gap size (in pt) if a suitable gutter is found AND the density +
1444/// alignment guards accept.
1445///
1446/// ANN[r17/TEX2] Three guards together avoid false-positive columns:
1447///   1. `min_gap` is the MAX of (median_font, 4% of region width, 10pt)
1448///      — deliberately lower than `median_font * 2` so narrow-gutter
1449///      academic papers (12pt gutters, common in print) are still
1450///      detected.
1451///   2. `columns_are_dense` rejects column splits where either side
1452///      has <2 spans or <8 chars/band — catches table cells.
1453///   3. `columns_are_band_aligned` rejects cuts where any band would
1454///      end up on only one side of the cut while being wider than
1455///      ~70% of that side's column width — catches full-width
1456///      paragraphs (Intro / Outro) that accidentally sit in the
1457///      left-column x-range.
1458fn try_vertical_cut(spans: &[TextSpan], stats: &PageStats) -> Option<(Vec<Vec<TextSpan>>, f64)> {
1459    if spans.len() < 2 * XY_CUT_MIN_SPANS_PER_COLUMN {
1460        return None;
1461    }
1462
1463    let region_left = spans.iter().map(|s| s.x).fold(f64::INFINITY, f64::min);
1464    let region_right = spans
1465        .iter()
1466        .map(TextSpan::right)
1467        .fold(f64::NEG_INFINITY, f64::max);
1468    let region_width = region_right - region_left;
1469    if region_width <= 0.0 {
1470        return None;
1471    }
1472
1473    // ANN[r17/TEX2][r17/TEX4] Threshold uses the ADAPTIVE median-word-gap
1474    // from the bands rather than a flat font-size multiple. Narrow-gutter
1475    // academic layouts have 12pt gutters next to 4pt word spaces — the
1476    // adaptive threshold scales with the actual typography used on this
1477    // page. Clamped to `XY_CUT_VERTICAL_GAP_FLOOR` to avoid firing on
1478    // ordinary inter-word spaces when character advance data is noisy.
1479    // median_font and the width fraction act only as safety rails for
1480    // pathological inputs.
1481    let bands = group_spans_into_bands_with_stats(spans.to_vec(), stats);
1482    let adaptive = compute_adaptive_column_gap(&bands);
1483    let floor = stats
1484        .median_font_size
1485        .max(region_width * XY_CUT_VERTICAL_GAP_REGION_FRACTION)
1486        .max(XY_CUT_VERTICAL_GAP_FLOOR);
1487    let min_gap = adaptive.min(floor).max(XY_CUT_VERTICAL_GAP_FLOOR);
1488
1489    // Intervals [x_left, x_right] of every span; we look for an x value
1490    // that is free of ALL intervals (full-height gap).
1491    let mut intervals: Vec<(f64, f64)> = spans
1492        .iter()
1493        .map(|s| (s.x, s.right().max(s.x + 0.001)))
1494        .collect();
1495    intervals.sort_by(|a, b| a.0.partial_cmp(&b.0).unwrap_or(Ordering::Equal));
1496
1497    let mut cursor = intervals[0].1;
1498    let mut best_gap: Option<(f64, f64)> = None; // (gap_size, cut_x)
1499    for (left, right) in intervals.iter().skip(1) {
1500        if *left > cursor {
1501            let gap = *left - cursor;
1502            if gap >= min_gap {
1503                match best_gap {
1504                    Some((best, _)) if best >= gap => {}
1505                    _ => {
1506                        let cut_x = (cursor + *left) * 0.5;
1507                        best_gap = Some((gap, cut_x));
1508                    }
1509                }
1510            }
1511        }
1512        cursor = cursor.max(*right);
1513    }
1514
1515    let (gap_size, cut_x) = best_gap?;
1516
1517    // Split spans around the cut. A span whose midpoint is < cut_x
1518    // belongs to the left group.
1519    let mut left_group = Vec::new();
1520    let mut right_group = Vec::new();
1521    for span in spans {
1522        let midpoint = span.x + (span.right() - span.x) * 0.5;
1523        if midpoint < cut_x {
1524            left_group.push(span.clone());
1525        } else {
1526            right_group.push(span.clone());
1527        }
1528    }
1529
1530    if !columns_are_dense(&left_group, &right_group, stats) {
1531        return None;
1532    }
1533    if !columns_are_band_aligned(spans, cut_x, region_left, region_right, stats) {
1534        return None;
1535    }
1536
1537    Some((vec![left_group, right_group], gap_size))
1538}
1539
1540/// ANN[r17/TEX2] Reject a vertical cut when any band sits on only one
1541/// side of the cut AND occupies more than ~70% of that side's column
1542/// width. Such bands are almost certainly full-width paragraphs that
1543/// happened to align with the left margin of one column, and forcing
1544/// them into that column re-orders them relative to text that follows.
1545fn columns_are_band_aligned(
1546    spans: &[TextSpan],
1547    cut_x: f64,
1548    region_left: f64,
1549    region_right: f64,
1550    stats: &PageStats,
1551) -> bool {
1552    let left_width = (cut_x - region_left).max(1.0);
1553    let right_width = (region_right - cut_x).max(1.0);
1554
1555    // Threshold chosen empirically: paragraph bodies in columnar
1556    // layouts usually fill ~60-70% of their column; anything wider
1557    // than 0.7× is a page-level element masquerading as column
1558    // content.
1559    const MAX_SINGLE_SIDE_FRACTION: f64 = 0.70;
1560
1561    let bands = group_spans_into_bands_with_stats(spans.to_vec(), stats);
1562    for band in &bands {
1563        let mut has_left = false;
1564        let mut has_right = false;
1565        for span in &band.spans {
1566            let midpoint = span.x + (span.right() - span.x) * 0.5;
1567            if midpoint < cut_x {
1568                has_left = true;
1569            } else {
1570                has_right = true;
1571            }
1572        }
1573        if has_left && has_right {
1574            continue; // Band straddles columns → fine.
1575        }
1576        let band_width = band.width();
1577        if has_left && band_width > left_width * MAX_SINGLE_SIDE_FRACTION {
1578            return false;
1579        }
1580        if has_right && band_width > right_width * MAX_SINGLE_SIDE_FRACTION {
1581            return false;
1582        }
1583    }
1584    true
1585}
1586
1587/// Density guard — reject column splits that look like tables (few,
1588/// short spans per column). A column is "dense" when it has at least
1589/// MIN_SPANS_PER_COLUMN spans and the average character count per band
1590/// exceeds MIN_CHARS_PER_BAND.
1591fn columns_are_dense(left: &[TextSpan], right: &[TextSpan], stats: &PageStats) -> bool {
1592    for col in [left, right] {
1593        if col.len() < XY_CUT_MIN_SPANS_PER_COLUMN {
1594            return false;
1595        }
1596        let bands = group_spans_into_bands_with_stats(col.to_vec(), stats);
1597        if bands.is_empty() {
1598            return false;
1599        }
1600        let total_chars: usize = col.iter().map(|s| s.text.chars().count()).sum();
1601        let chars_per_band = total_chars as f64 / bands.len() as f64;
1602        if chars_per_band < XY_CUT_MIN_CHARS_PER_BAND {
1603            return false;
1604        }
1605    }
1606    true
1607}
1608
1609/// Attempt a horizontal (zone / paragraph) cut. Unlike vertical cuts
1610/// this does NOT need a density guard — splitting top-from-bottom
1611/// cannot re-order content.
1612fn try_horizontal_cut(spans: &[TextSpan], stats: &PageStats) -> Option<(Vec<Vec<TextSpan>>, f64)> {
1613    if spans.len() < 2 {
1614        return None;
1615    }
1616    // Sort by descending y (PDF y grows upward).
1617    let mut sorted = spans.to_vec();
1618    sorted.sort_by(|a, b| {
1619        b.y.partial_cmp(&a.y)
1620            .unwrap_or(Ordering::Equal)
1621            .then_with(|| a.x.partial_cmp(&b.x).unwrap_or(Ordering::Equal))
1622    });
1623
1624    // ANN[r17/TEX4] Paragraph / zone cuts scale with MEDIAN LINE
1625    // SPACING when available — this is the typographically correct
1626    // baseline (paragraph break ≈ 1.8 × line-spacing). When the page
1627    // has only one band, or stats haven't observed spacing yet, fall
1628    // back to the font-size multiple the legacy path used.
1629    let min_gap = if stats.median_line_spacing > 0.0 {
1630        stats.median_line_spacing * PARAGRAPH_BREAK_LINE_SPACING_MULTIPLIER
1631    } else {
1632        stats.median_font_size * XY_CUT_HORIZONTAL_GAP_FONT_MULTIPLIER
1633    };
1634
1635    // Look for the largest gap between consecutive span y-values.
1636    let mut best: Option<(f64, f64)> = None; // (gap_size, cut_y)
1637    let tolerance = stats.median_font_size * BAND_Y_FRACTION;
1638    let mut band_bottom = sorted[0].y;
1639
1640    for span in sorted.iter().skip(1) {
1641        if (band_bottom - span.y).abs() <= tolerance {
1642            band_bottom = band_bottom.min(span.y);
1643            continue;
1644        }
1645        let gap = band_bottom - span.y;
1646        if gap >= min_gap {
1647            let cut_y = (band_bottom + span.y) * 0.5;
1648            match best {
1649                Some((best_gap, _)) if best_gap >= gap => {}
1650                _ => best = Some((gap, cut_y)),
1651            }
1652        }
1653        band_bottom = span.y;
1654    }
1655
1656    let (gap_size, cut_y) = best?;
1657
1658    let mut top_group = Vec::new();
1659    let mut bottom_group = Vec::new();
1660    for span in spans {
1661        if span.y > cut_y {
1662            top_group.push(span.clone());
1663        } else {
1664            bottom_group.push(span.clone());
1665        }
1666    }
1667    if top_group.is_empty() || bottom_group.is_empty() {
1668        return None;
1669    }
1670    Some((vec![top_group, bottom_group], gap_size))
1671}
1672
1673/// Legacy band+column-detection path, kept for reference and as the
1674/// fallback inside `band_based_blocks` test coverage. Not currently
1675/// used — XY-Cut supersedes it.
1676#[allow(dead_code)]
1677fn group_spans_into_blocks_legacy(spans: Vec<TextSpan>) -> Vec<TextBlock> {
1678    let bands = group_spans_into_bands(spans);
1679    group_spans_into_blocks_legacy_from_bands(bands)
1680}
1681
1682fn group_spans_into_blocks_legacy_with_stats(
1683    spans: Vec<TextSpan>,
1684    stats: &PageStats,
1685) -> Vec<TextBlock> {
1686    let bands = group_spans_into_bands_with_stats(spans, stats);
1687    group_spans_into_blocks_legacy_from_bands(bands)
1688}
1689
1690fn group_spans_into_blocks_legacy_from_bands(bands: Vec<TextBand>) -> Vec<TextBlock> {
1691    if bands.is_empty() {
1692        return Vec::new();
1693    }
1694
1695    let column_gap_threshold = compute_adaptive_column_gap(&bands);
1696
1697    let mut blocks = Vec::new();
1698    let mut idx = 0;
1699
1700    while idx < bands.len() {
1701        let gap_midpoints = bands[idx].gap_midpoints(column_gap_threshold);
1702        if gap_midpoints.is_empty() {
1703            blocks.push(bands[idx].row_block());
1704            idx += 1;
1705            continue;
1706        }
1707
1708        let mut boundaries = gap_midpoints.clone();
1709        let mut band_indices = vec![idx];
1710        let mut gapped_band_count = 1usize;
1711        let mut region_left = bands[idx].left();
1712        let mut region_right = bands[idx].right();
1713        let mut next_idx = idx + 1;
1714
1715        while next_idx < bands.len() {
1716            let next_band = &bands[next_idx];
1717            let next_gap_midpoints = next_band.gap_midpoints(column_gap_threshold);
1718            if next_gap_midpoints.is_empty() {
1719                if next_band
1720                    .fits_single_column(&boundaries, region_left, region_right)
1721                    .is_some()
1722                {
1723                    band_indices.push(next_idx);
1724                    next_idx += 1;
1725                    continue;
1726                }
1727                break;
1728            }
1729
1730            if !boundaries_match(&boundaries, &next_gap_midpoints, column_gap_threshold) {
1731                break;
1732            }
1733
1734            update_boundaries(&mut boundaries, &next_gap_midpoints, gapped_band_count);
1735            gapped_band_count += 1;
1736            band_indices.push(next_idx);
1737            region_left = region_left.min(next_band.left());
1738            region_right = region_right.max(next_band.right());
1739            next_idx += 1;
1740        }
1741
1742        if region_is_columnar(&bands, &band_indices, &boundaries, gapped_band_count) {
1743            append_column_region_blocks(&bands, &band_indices, &boundaries, &mut blocks);
1744            idx = next_idx;
1745        } else {
1746            blocks.push(bands[idx].row_block());
1747            idx += 1;
1748        }
1749    }
1750
1751    blocks
1752}
1753
1754/// Legacy wrapper used by call sites that haven't been handed PageStats.
1755/// It derives stats locally. Prefer `group_spans_into_bands_with_stats`
1756/// inside the XY-Cut pipeline to avoid recomputing the stats per call.
1757fn group_spans_into_bands(spans: Vec<TextSpan>) -> Vec<TextBand> {
1758    let stats = PageStats::from_spans(&spans);
1759    group_spans_into_bands_with_stats(spans, &stats)
1760}
1761
1762fn group_spans_into_bands_with_stats(mut spans: Vec<TextSpan>, stats: &PageStats) -> Vec<TextBand> {
1763    if spans.is_empty() {
1764        return Vec::new();
1765    }
1766
1767    spans.sort_by(|a, b| {
1768        b.y.partial_cmp(&a.y)
1769            .unwrap_or(Ordering::Equal)
1770            .then_with(|| a.x.partial_cmp(&b.x).unwrap_or(Ordering::Equal))
1771    });
1772
1773    // ANN[r17/TEX4] Band tolerance scales with this page's median font
1774    // size rather than a fixed 5pt floor. Single-page spreads with
1775    // huge display fonts (24pt+) previously merged unrelated lines; a
1776    // fractional threshold keeps that from happening without hurting
1777    // body-text pages.
1778    let page_tolerance = (stats.median_font_size * BAND_Y_FRACTION).max(BAND_Y_TOLERANCE);
1779
1780    let mut bands: Vec<TextBand> = Vec::new();
1781
1782    for span in spans {
1783        let tolerance = (span.height * BAND_Y_FRACTION)
1784            .max(page_tolerance)
1785            .max(BAND_Y_TOLERANCE);
1786        if let Some(band) = bands
1787            .iter_mut()
1788            .find(|band| (band.y - span.y).abs() <= tolerance)
1789        {
1790            let span_count = band.spans.len() as f64;
1791            band.y = (band.y * span_count + span.y) / (span_count + 1.0);
1792            band.spans.push(span);
1793        } else {
1794            bands.push(TextBand::new(span));
1795        }
1796    }
1797
1798    for band in &mut bands {
1799        band.sort_spans();
1800    }
1801
1802    bands.sort_by(|a, b| b.y.partial_cmp(&a.y).unwrap_or(Ordering::Equal));
1803    bands
1804}
1805
1806fn boundaries_match(boundaries: &[f64], gap_midpoints: &[f64], column_gap_threshold: f64) -> bool {
1807    let tolerance = (column_gap_threshold * 1.5).clamp(COLUMN_GAP_MATCH_TOLERANCE, 60.0);
1808    boundaries.len() == gap_midpoints.len()
1809        && boundaries
1810            .iter()
1811            .zip(gap_midpoints)
1812            .all(|(lhs, rhs)| (lhs - rhs).abs() <= tolerance)
1813}
1814
/// Fold one more band's gap midpoints into the running mean of the
/// column boundaries, in place.
///
/// `seen_gapped_bands` is the number of bands already averaged into
/// `boundaries`; each boundary becomes
/// `(boundary * seen + midpoint) / (seen + 1)`. Pairs are matched by
/// position, and extra entries on either side are left untouched.
fn update_boundaries(boundaries: &mut [f64], gap_midpoints: &[f64], seen_gapped_bands: usize) {
    let prior_weight = seen_gapped_bands as f64;
    for (slot, &sample) in boundaries.iter_mut().zip(gap_midpoints) {
        *slot = (*slot * prior_weight + sample) / (prior_weight + 1.0);
    }
}
1821
1822fn region_is_columnar(
1823    bands: &[TextBand],
1824    band_indices: &[usize],
1825    boundaries: &[f64],
1826    gapped_band_count: usize,
1827) -> bool {
1828    if boundaries.is_empty()
1829        || gapped_band_count < MIN_COLUMN_GAPPED_BANDS
1830        || band_indices.is_empty()
1831        || (gapped_band_count as f64 / band_indices.len() as f64) < MIN_COLUMN_GAP_SUPPORT
1832    {
1833        return false;
1834    }
1835
1836    let mut non_empty_slices = 0usize;
1837    let mut dense_slices = 0usize;
1838    let mut slices_per_column = vec![0usize; boundaries.len() + 1];
1839
1840    for &band_idx in band_indices {
1841        let slices = bands[band_idx].split_by_boundaries(boundaries);
1842        for (column_idx, slice) in slices.iter().enumerate() {
1843            if slice.is_empty() {
1844                continue;
1845            }
1846
1847            non_empty_slices += 1;
1848            slices_per_column[column_idx] += 1;
1849
1850            let char_count = slice
1851                .iter()
1852                .map(|span| span.text.chars().count())
1853                .sum::<usize>();
1854            if slice.len() >= 2 || char_count >= 8 {
1855                dense_slices += 1;
1856            }
1857        }
1858    }
1859
1860    if non_empty_slices < boundaries.len() + 2 {
1861        return false;
1862    }
1863
1864    if slices_per_column.contains(&0) {
1865        return false;
1866    }
1867
1868    (dense_slices as f64 / non_empty_slices as f64) >= MIN_DENSE_SLICE_RATIO
1869}
1870
1871fn append_column_region_blocks(
1872    bands: &[TextBand],
1873    band_indices: &[usize],
1874    boundaries: &[f64],
1875    blocks: &mut Vec<TextBlock>,
1876) {
1877    let column_count = boundaries.len() + 1;
1878    let mut column_bands = vec![Vec::<TextSpan>::new(); column_count];
1879
1880    for &band_idx in band_indices {
1881        let slices = bands[band_idx].split_by_boundaries(boundaries);
1882        for (column_idx, slice) in slices.into_iter().enumerate() {
1883            if slice.is_empty() {
1884                continue;
1885            }
1886            column_bands[column_idx].push(TextSpan::default());
1887            let marker_idx = column_bands[column_idx].len() - 1;
1888            column_bands[column_idx][marker_idx] = TextSpan {
1889                x: f64::NEG_INFINITY,
1890                y: bands[band_idx].y,
1891                ..TextSpan::default()
1892            };
1893            column_bands[column_idx].extend(slice);
1894        }
1895    }
1896
1897    for spans in column_bands {
1898        let mut current: Vec<TextSpan> = Vec::new();
1899        for span in spans {
1900            if span.x == f64::NEG_INFINITY {
1901                if !current.is_empty() {
1902                    current.sort_by(|a, b| a.x.partial_cmp(&b.x).unwrap_or(Ordering::Equal));
1903                    blocks.push(TextBlock {
1904                        spans: std::mem::take(&mut current),
1905                    });
1906                }
1907                continue;
1908            }
1909            current.push(span);
1910        }
1911        if !current.is_empty() {
1912            current.sort_by(|a, b| a.x.partial_cmp(&b.x).unwrap_or(Ordering::Equal));
1913            blocks.push(TextBlock { spans: current });
1914        }
1915    }
1916}
1917
1918/// Join per-block lines, stitching end-of-line hyphenated word-wraps the
1919/// way pdftotext / MuPDF / PDFBox do.
1920///
1921/// Trigger conditions (all must hold):
1922/// 1. Previous line ends with `-` preceded by an alphabetic character.
1923/// 2. The alphabetic suffix before the `-` has >= 3 characters.
1924/// 3. The next line (trimmed) starts with an ASCII lowercase letter.
1925/// 4. The lowercase prefix of the next line has >= 3 characters.
1926///
1927/// When triggered, the trailing `-` is removed and the two halves are
1928/// concatenated without a space or newline.
1929///
1930/// This avoids false positives on compound words ("real-time"), bullet
1931/// lists, numeric ranges ("42-"), and short fragments.
1932fn stitch_hyphenated_lines(lines: &[String]) -> String {
1933    let mut out = String::new();
1934    for (idx, line) in lines.iter().enumerate() {
1935        if idx == 0 {
1936            out.push_str(line);
1937            continue;
1938        }
1939
1940        let next_trimmed = line.trim_start();
1941
1942        // Check the accumulated output for end-of-line hyphen pattern
1943        let should_merge = is_hyphen_wrap_candidate(&out, next_trimmed);
1944
1945        if should_merge {
1946            out.pop(); // drop the trailing '-'
1947            out.push_str(next_trimmed);
1948        } else {
1949            out.push('\n');
1950            out.push_str(line);
1951        }
1952    }
1953    out
1954}
1955
/// Decide whether `next_trimmed` continues a word that was
/// hyphen-wrapped at the end of `accumulated`.
///
/// Returns `true` only when `accumulated` ends with `-`, the run of
/// alphabetic characters directly before that `-` is at least 3 long,
/// and `next_trimmed` begins with at least 3 ASCII lowercase letters.
fn is_hyphen_wrap_candidate(accumulated: &str, next_trimmed: &str) -> bool {
    /// Minimum length of the word fragment on either side of the break.
    const MIN_FRAGMENT_CHARS: usize = 3;

    // Without a trailing '-' there is nothing to stitch.
    let Some(before_hyphen) = accumulated.strip_suffix('-') else {
        return false;
    };

    // Length of the word fragment that immediately precedes the '-'.
    // A non-alphabetic character there yields 0 and fails the check.
    let fragment_len = before_hyphen
        .chars()
        .rev()
        .take_while(|c| c.is_alphabetic())
        .count();
    if fragment_len < MIN_FRAGMENT_CHARS {
        return false;
    }

    // Length of the lowercase ASCII run that opens the next line. An
    // uppercase or non-letter start yields 0 and fails the check.
    let continuation_len = next_trimmed
        .chars()
        .take_while(char::is_ascii_lowercase)
        .count();

    continuation_len >= MIN_FRAGMENT_CHARS
}
1998
/// Normalize extracted text to match pdftotext conventions.
///
/// 1. Trim trailing whitespace from each line. Form feeds are kept:
///    `str::trim_end` would strip them because U+000C counts as
///    whitespace.
/// 2. Collapse runs of more than two consecutive blank lines into
///    exactly two (at most three `\n` in a row).
/// 3. Preserve form-feed characters (`\x0C`) as page separators,
///    whether they lead a line, end a line, or stand alone on a line.
/// 4. Drop trailing blank lines and end with a single trailing newline
///    (or return an empty string when nothing but whitespace remains).
pub(crate) fn normalize_text_output(text: &str) -> String {
    if text.is_empty() {
        return String::new();
    }

    // Strip trailing whitespace per line, but never the form feed: it
    // is a page separator, not padding.
    let mut lines: Vec<&str> = text
        .split('\n')
        .map(|line| line.trim_end_matches(|c: char| c.is_whitespace() && c != '\x0C'))
        .collect();

    // Trailing blank lines are dropped; exactly one '\n' is added back
    // at the very end.
    while lines.last().is_some_and(|line| line.is_empty()) {
        lines.pop();
    }
    if lines.is_empty() {
        return String::new();
    }

    // +1 leaves room for the final newline when the input lacks one.
    let mut result = String::with_capacity(text.len() + 1);
    let mut blank_run = 0usize;
    let last_index = lines.len() - 1;

    for (index, line) in lines.iter().enumerate() {
        if line.is_empty() {
            // Emit at most two blank lines per run; drop the rest.
            blank_run += 1;
            if blank_run <= 2 {
                result.push('\n');
            }
            continue;
        }

        // Text lines and form-feed lines are emitted verbatim.
        blank_run = 0;
        result.push_str(line);
        if index < last_index {
            result.push('\n');
        }
    }

    // The last kept line is non-empty, so the result does not yet end
    // with a newline; the guard keeps that invariant explicit.
    if !result.ends_with('\n') {
        result.push('\n');
    }

    result
}
2060
2061#[cfg(test)]
2062mod tests {
2063    use super::*;
2064
2065    fn span(text: &str, x: f64, y: f64, width: f64) -> TextSpan {
2066        TextSpan {
2067            text: text.into(),
2068            x,
2069            y,
2070            width,
2071            height: 12.0,
2072            font_size: 12.0,
2073            ..TextSpan::default()
2074        }
2075    }
2076
2077    fn block_texts(spans: Vec<TextSpan>) -> Vec<String> {
2078        group_spans_into_blocks(spans)
2079            .into_iter()
2080            .map(|block| block.text())
2081            .collect()
2082    }
2083
2084    #[test]
2085    fn empty_device_produces_empty_text() {
2086        let dev = TextExtractionDevice::new();
2087        assert!(dev.into_text().is_empty());
2088    }
2089
2090    #[test]
2091    fn single_column_stays_row_major() {
2092        let texts = block_texts(vec![
2093            span("Single Column Line 1", 40.0, 700.0, 140.0),
2094            span("Single Column Line 2", 40.0, 684.0, 140.0),
2095            span("Single Column Line 3", 40.0, 668.0, 140.0),
2096        ]);
2097
2098        assert_eq!(
2099            texts,
2100            vec![
2101                "Single Column Line 1",
2102                "Single Column Line 2",
2103                "Single Column Line 3",
2104            ]
2105        );
2106    }
2107
2108    #[test]
2109    fn two_column_region_reads_column_major() {
2110        let texts = block_texts(vec![
2111            span("Header", 200.0, 740.0, 80.0),
2112            span("Left column line one", 40.0, 700.0, 115.0),
2113            span("Right column line one", 320.0, 700.0, 120.0),
2114            span("Left column line two", 40.0, 684.0, 115.0),
2115            span("Right column line two", 320.0, 684.0, 120.0),
2116            span("Left column line three", 40.0, 668.0, 125.0),
2117            span("Right column line three", 320.0, 668.0, 130.0),
2118            span("Footer", 200.0, 620.0, 80.0),
2119        ]);
2120
2121        assert_eq!(
2122            texts,
2123            vec![
2124                "Header",
2125                "Left column line one",
2126                "Left column line two",
2127                "Left column line three",
2128                "Right column line one",
2129                "Right column line two",
2130                "Right column line three",
2131                "Footer",
2132            ]
2133        );
2134    }
2135
2136    #[test]
2137    fn mixed_single_and_multi_column_regions_preserve_shared_bands() {
2138        let texts = block_texts(vec![
2139            span("Intro paragraph", 40.0, 740.0, 180.0),
2140            span("L1 words here", 40.0, 700.0, 110.0),
2141            span("R1 words here", 320.0, 700.0, 110.0),
2142            span("L2 words here", 40.0, 684.0, 110.0),
2143            span("R2 words here", 320.0, 684.0, 110.0),
2144            span("L3 words here", 40.0, 668.0, 110.0),
2145            span("R3 words here", 320.0, 668.0, 110.0),
2146            span("Outro paragraph", 40.0, 620.0, 180.0),
2147        ]);
2148
2149        assert_eq!(
2150            texts,
2151            vec![
2152                "Intro paragraph",
2153                "L1 words here",
2154                "L2 words here",
2155                "L3 words here",
2156                "R1 words here",
2157                "R2 words here",
2158                "R3 words here",
2159                "Outro paragraph",
2160            ]
2161        );
2162    }
2163
2164    #[test]
2165    fn short_table_like_rows_fall_back_to_row_major() {
2166        let texts = block_texts(vec![
2167            span("Name", 40.0, 700.0, 30.0),
2168            span("Age", 320.0, 700.0, 20.0),
2169            span("Alice", 40.0, 684.0, 35.0),
2170            span("30", 320.0, 684.0, 15.0),
2171            span("Bob", 40.0, 668.0, 24.0),
2172            span("25", 320.0, 668.0, 15.0),
2173        ]);
2174
2175        assert_eq!(texts, vec!["Name Age", "Alice 30", "Bob 25"]);
2176    }
2177
2178    #[test]
2179    fn three_column_regions_are_supported() {
2180        let texts = block_texts(vec![
2181            span("Column one line one", 40.0, 700.0, 105.0),
2182            span("Column two line one", 220.0, 700.0, 105.0),
2183            span("Column three line one", 400.0, 700.0, 120.0),
2184            span("Column one line two", 40.0, 684.0, 105.0),
2185            span("Column two line two", 220.0, 684.0, 105.0),
2186            span("Column three line two", 400.0, 684.0, 120.0),
2187            span("Column one line three", 40.0, 668.0, 120.0),
2188            span("Column two line three", 220.0, 668.0, 120.0),
2189            span("Column three line three", 400.0, 668.0, 135.0),
2190        ]);
2191
2192        assert_eq!(
2193            texts,
2194            vec![
2195                "Column one line one",
2196                "Column one line two",
2197                "Column one line three",
2198                "Column two line one",
2199                "Column two line two",
2200                "Column two line three",
2201                "Column three line one",
2202                "Column three line two",
2203                "Column three line three",
2204            ]
2205        );
2206    }
2207
2208    #[test]
2209    fn text_block_concatenation_spaced() {
2210        let block = TextBlock {
2211            spans: vec![span("A", 0.0, 0.0, 6.0), span("B", 20.0, 0.0, 6.0)],
2212        };
2213        assert_eq!(block.text(), "A B");
2214    }
2215
2216    #[test]
2217    fn adaptive_column_gap_fallback_for_no_gaps() {
2218        // Single-span bands produce no measurable gaps → fallback
2219        let bands = vec![
2220            TextBand::new(span("Hello", 40.0, 700.0, 80.0)),
2221            TextBand::new(span("World", 40.0, 684.0, 80.0)),
2222        ];
2223        let threshold = compute_adaptive_column_gap(&bands);
2224        assert!((threshold - COLUMN_GAP_THRESHOLD_FALLBACK).abs() < 0.01);
2225    }
2226
2227    #[test]
2228    fn adaptive_column_gap_uses_median() {
2229        // Three bands with word gaps of ~4pt each → median ≈ 4, threshold = 12
2230        let mut bands = Vec::new();
2231        for y in [700.0, 684.0, 668.0] {
2232            let mut band = TextBand::new(span("word1", 40.0, y, 30.0));
2233            band.spans.push(span("word2", 74.0, y, 30.0)); // gap = 4
2234            band.spans.push(span("word3", 108.0, y, 30.0)); // gap = 4
2235            bands.push(band);
2236        }
2237        let threshold = compute_adaptive_column_gap(&bands);
2238        // median gap = 4, × 3 = 12, clamped to [10, 40] → 12
2239        assert!(
2240            (10.0..=14.0).contains(&threshold),
2241            "expected ~12, got {threshold}"
2242        );
2243    }
2244
2245    #[test]
2246    fn adaptive_column_gap_clamps_to_min() {
2247        // Tight gaps (2pt) across many bands → median = 2, 3×2 = 6 → clamped to 10
2248        let mut bands = Vec::new();
2249        for y in [700.0, 684.0, 668.0, 652.0] {
2250            let mut band = TextBand::new(span("abc", 0.0, y, 18.0));
2251            // right of "abc" = max(18, 12*0.5*3=18) = 18; gap = 20-18 = 2
2252            band.spans.push(span("def", 20.0, y, 18.0));
2253            bands.push(band);
2254        }
2255        let threshold = compute_adaptive_column_gap(&bands);
2256        assert!(
2257            (threshold - COLUMN_GAP_THRESHOLD_MIN).abs() < 0.01,
2258            "expected {COLUMN_GAP_THRESHOLD_MIN}, got {threshold}"
2259        );
2260    }
2261
2262    #[test]
2263    fn adaptive_column_gap_all_large_gaps_uses_fraction_of_min() {
2264        // When all gaps are large (> MIN), threshold = 0.75 × min_gap.
2265        let mut band = TextBand::new(span("Left", 0.0, 700.0, 30.0));
2266        band.spans.push(span("Right", 80.0, 700.0, 30.0)); // gap = 50
2267        let bands = vec![band];
2268        let threshold = compute_adaptive_column_gap(&bands);
2269        assert!(
2270            (threshold - 37.5).abs() < 0.01,
2271            "expected 37.5 (0.75×50), got {threshold}"
2272        );
2273    }
2274
2275    #[test]
2276    fn normalize_trims_trailing_whitespace_per_line() {
2277        assert_eq!(
2278            normalize_text_output("hello   \nworld  \n"),
2279            "hello\nworld\n"
2280        );
2281    }
2282
2283    #[test]
2284    fn normalize_collapses_excess_newlines() {
2285        // >2 blank lines collapse to 2 (meaning 3 \n in a row: line, blank, blank)
2286        assert_eq!(
2287            normalize_text_output("hello\n\n\n\n\nworld\n"),
2288            "hello\n\n\nworld\n"
2289        );
2290    }
2291
2292    #[test]
2293    fn normalize_preserves_double_newline() {
2294        assert_eq!(
2295            normalize_text_output("paragraph one\n\nparagraph two\n"),
2296            "paragraph one\n\nparagraph two\n"
2297        );
2298    }
2299
2300    #[test]
2301    fn normalize_preserves_form_feed() {
2302        assert_eq!(
2303            normalize_text_output("page1\n\n\x0Cpage2\n"),
2304            "page1\n\n\x0Cpage2\n"
2305        );
2306    }
2307
2308    #[test]
2309    fn normalize_adds_trailing_newline() {
2310        assert_eq!(normalize_text_output("hello"), "hello\n");
2311    }
2312
2313    #[test]
2314    fn normalize_empty_input() {
2315        assert_eq!(normalize_text_output(""), "");
2316    }
2317
2318    #[test]
2319    fn normalize_only_whitespace() {
2320        assert_eq!(normalize_text_output("   \n  \n"), "");
2321    }
2322
2323    // --- Hyphen stitching tests ---
2324
2325    #[test]
2326    fn hyphen_stitch_joins_wrapped_word() {
2327        let lines = vec!["the aver-".into(), "age rainfall".into()];
2328        assert_eq!(stitch_hyphenated_lines(&lines), "the average rainfall");
2329    }
2330
2331    #[test]
2332    fn hyphen_stitch_handles_leading_whitespace() {
2333        let lines = vec!["pre-".into(), "   dict the outcome".into()];
2334        // "pre" is only 3 chars → meets >= 3 guard
2335        assert_eq!(stitch_hyphenated_lines(&lines), "predict the outcome");
2336    }
2337
2338    #[test]
2339    fn hyphen_stitch_capital_continuation_not_stitched() {
2340        let lines = vec!["Section three-".into(), "Summary here".into()];
2341        assert_eq!(
2342            stitch_hyphenated_lines(&lines),
2343            "Section three-\nSummary here"
2344        );
2345    }
2346
#[test]
fn hyphen_stitch_bullet_dash_not_stitched() {
    // A lone "-" has no alphabetic character before the hyphen, so it is
    // treated as a bullet and never merged with the following line.
    let lines = vec![
        String::from("Items:"),
        String::from("-"),
        String::from("milk"),
    ];
    let stitched = stitch_hyphenated_lines(&lines);
    assert_eq!(stitched, "Items:\n-\nmilk");
}
2353
#[test]
fn hyphen_stitch_numeric_range_not_stitched() {
    // In "42-" the character before the hyphen is a digit, not a letter,
    // so no stitching takes place.
    let lines = vec![String::from("page 42-"), String::from("seventy")];
    let stitched = stitch_hyphenated_lines(&lines);
    assert_eq!(stitched, "page 42-\nseventy");
}
2360
#[test]
fn hyphen_stitch_short_prefix_not_stitched() {
    // "re-" has only 2 alphabetic chars before the hyphen, which is below
    // the 3-char guard.
    let lines = vec![String::from("re-"), String::from("organize")];
    let stitched = stitch_hyphenated_lines(&lines);
    assert_eq!(stitched, "re-\norganize");
}
2367
#[test]
fn hyphen_stitch_short_continuation_not_stitched() {
    // The continuation begins with "an" (2 chars), which is below the
    // 3-char guard, so the hyphen and line break are preserved.
    let lines = vec![String::from("counter-"), String::from("an example")];
    let stitched = stitch_hyphenated_lines(&lines);
    assert_eq!(stitched, "counter-\nan example");
}
2374
#[test]
fn hyphen_stitch_compound_word_midline_preserved() {
    // The hyphen in "real-time" sits in the middle of a line rather than
    // at a line boundary, so stitching has nothing to act on.
    let lines = vec![String::from("real-time system")];
    let stitched = stitch_hyphenated_lines(&lines);
    assert_eq!(stitched, "real-time system");
}
2382
#[test]
fn hyphen_stitch_single_line_unchanged() {
    // A lone line without any hyphen is returned verbatim.
    let lines = vec![String::from("only line")];
    let stitched = stitch_hyphenated_lines(&lines);
    assert_eq!(stitched, "only line");
}
2388
#[test]
fn hyphen_stitch_empty_input() {
    // No lines in, empty string out.
    let no_lines: Vec<String> = Vec::new();
    let stitched = stitch_hyphenated_lines(&no_lines);
    assert_eq!(stitched, "");
}
2394
2395    // --- TEX1 multi-signal space consensus tests ---
2396
2397    fn make_device_with_median(median: f64) -> TextExtractionDevice {
2398        let mut dev = TextExtractionDevice::new();
2399        // Seed enough samples for the median to resolve to `median`.
2400        for _ in 0..MEDIAN_REFRESH {
2401            dev.glyph_widths.push(median);
2402        }
2403        dev.refresh_median_char_width();
2404        assert!((dev.cached_median_char_width - median).abs() < 1e-9);
2405        dev
2406    }
2407
#[test]
fn consensus_inserts_space_on_strong_tj_offset_alone() {
    // The 0.5 gap is below the geometric threshold, so only the large TJ
    // offset can make the consensus fire here.
    let mut device = make_device_with_median(6.0);
    device.pending_tj_offset = 250.0; // 250/1000 em: a typical word-space shift
    let inserts_space = device.evaluate_space_consensus(0.5, 12.0, "Hello", "World");
    assert!(inserts_space);
}
2416
#[test]
fn consensus_inserts_space_on_geometric_gap_alone() {
    // No TJ offset and no character-class transition: only the gap votes.
    // With a 6.0 median the gap cutoff is 0.3 * 6.0 = 1.8; 2.5 exceeds it,
    // and the gap signal's 0.80 weight alone clears the 0.75 threshold.
    let device = make_device_with_median(6.0);
    let inserts_space = device.evaluate_space_consensus(2.5, 12.0, "hello", "world");
    assert!(inserts_space);
}
2425
#[test]
fn consensus_no_space_on_kerning_gap() {
    // Regression guard: a kerning-sized gap with no other signal must not
    // produce a space inside a tightly kerned word.
    let device = make_device_with_median(6.0);
    let inserts_space = device.evaluate_space_consensus(0.5, 12.0, "fi", "lm");
    assert!(!inserts_space);
}
2434
#[test]
fn consensus_inserts_space_on_camel_case_plus_gap() {
    // The CamelCase heuristic (0.60) cannot reach the threshold alone, but
    // here the gap 2.5 > 1.8 also fires, giving 0.80 + 0.60 = 1.40.
    let device = make_device_with_median(6.0);
    let inserts_space = device.evaluate_space_consensus(2.5, 12.0, "helloWorld", "Inc");
    assert!(inserts_space);
}
2443
#[test]
fn consensus_inserts_space_on_digit_letter_transition_with_gap() {
    // A 2.5 gap between a digit run ("123") and a letter run ("abc").
    let device = make_device_with_median(6.0);
    let inserts_space = device.evaluate_space_consensus(2.5, 12.0, "123", "abc");
    assert!(inserts_space);
}
2449
#[test]
fn consensus_heuristic_alone_is_insufficient() {
    // By design the heuristic weight (0.60) sits below the 0.75 threshold:
    // a second corroborating signal is required, so CamelCase identifiers
    // with no geometric break do not get spaces glued into them.
    let device = make_device_with_median(6.0);
    let inserts_space = device.evaluate_space_consensus(0.5, 12.0, "camel", "Case");
    assert!(!inserts_space);
}
2459
#[test]
fn consensus_falls_back_to_font_size_when_no_median() {
    // With no width samples the median is 0, so the geometric reference
    // comes from the font size: cutoff = 0.15 * 12.0 = 1.8.
    let device = TextExtractionDevice::new();

    // 1.9 is above the cutoff, so the gap signal fires.
    let above_cutoff = device.evaluate_space_consensus(1.9, 12.0, "a", "b");
    assert!(above_cutoff);

    // 1.5 is below the cutoff, so no signal fires.
    let below_cutoff = device.evaluate_space_consensus(1.5, 12.0, "a", "b");
    assert!(!below_cutoff);
}
2469
#[test]
fn consensus_ignores_tiny_tj_offsets() {
    // A sub-threshold TJ offset is kerning, not a word break.
    let mut device = make_device_with_median(6.0);
    device.pending_tj_offset = 50.0;
    let inserts_space = device.evaluate_space_consensus(0.5, 12.0, "Hello", "World");
    assert!(!inserts_space);
}
2477
#[test]
fn consensus_accepts_negative_tj_offsets() {
    // The TJ check is on |amount|: a negative offset is still an explicit
    // inter-substring shift and counts toward the consensus.
    let mut device = make_device_with_median(6.0);
    device.pending_tj_offset = -250.0;
    let inserts_space = device.evaluate_space_consensus(0.5, 12.0, "Hello", "World");
    assert!(inserts_space);
}
2486
#[test]
fn text_adjustment_accumulates_until_glyph() {
    // Two consecutive adjustments with no glyph in between must sum up
    // in `pending_tj_offset` (120 + 140 = 260).
    let mut device = TextExtractionDevice::new();
    for amount in [120.0, 140.0] {
        device.text_adjustment(amount);
    }
    let accumulated = device.pending_tj_offset;
    assert!((accumulated - 260.0).abs() < 1e-6);
}
2494
2495    // --- TEX2 XY-Cut tests ---
2496
#[test]
fn xy_cut_header_body_footer_with_two_columns() {
    // The header and footer sit in a mid-x range that a naive
    // vertical-first cut would accidentally bucket with the left column.
    // The largest-gap-first rule together with the alignment guard should
    // make them bracket the two-column body instead.
    let page = vec![
        span("HEADLINE TITLE", 180.0, 760.0, 120.0),
        span("Left col line A", 40.0, 700.0, 110.0),
        span("Right col line A", 320.0, 700.0, 115.0),
        span("Left col line B", 40.0, 684.0, 110.0),
        span("Right col line B", 320.0, 684.0, 115.0),
        span("Left col line C", 40.0, 668.0, 110.0),
        span("Right col line C", 320.0, 668.0, 115.0),
        span("FOOTER LINE TEXT", 180.0, 600.0, 120.0),
    ];
    let texts = block_texts(page);

    let first = texts.first().map(String::as_str);
    assert_eq!(first, Some("HEADLINE TITLE"));
    let last = texts.last().map(String::as_str);
    assert_eq!(last, Some("FOOTER LINE TEXT"));

    // Column-major body: the last left line precedes the first right line.
    let index_of = |needle: &str| texts.iter().position(|s| s == needle).unwrap();
    let left_c_idx = index_of("Left col line C");
    let right_a_idx = index_of("Right col line A");
    assert!(
        left_c_idx < right_a_idx,
        "expected column-major ordering in body: {texts:?}"
    );
}
2524
#[test]
fn xy_cut_rejects_column_split_on_table_rows() {
    // Short-cell table: cells start 280pt apart (x = 40 vs x = 320),
    // leaving an inter-cell gap of roughly 245-250pt. The density guard
    // must still reject a column split so rows read in row-major order.
    let table = vec![
        span("Name", 40.0, 700.0, 30.0),
        span("Age", 320.0, 700.0, 20.0),
        span("Alice", 40.0, 684.0, 35.0),
        span("30", 320.0, 684.0, 15.0),
    ];
    let texts = block_texts(table);
    let expected = vec!["Name Age", "Alice 30"];
    assert_eq!(texts, expected);
}
2537
#[test]
fn xy_cut_rejects_column_split_when_one_band_is_full_width() {
    // A full-width paragraph sits above a 2-column region. The alignment
    // guard must keep it from being forced into the left column.
    let intro = span(
        "Full width intro spanning both columns here",
        40.0,
        740.0,
        360.0,
    );
    let page = vec![
        intro,
        span("Left A", 40.0, 700.0, 50.0),
        span("Right A", 320.0, 700.0, 50.0),
        span("Left B", 40.0, 684.0, 50.0),
        span("Right B", 320.0, 684.0, 50.0),
    ];
    let texts = block_texts(page);
    let intro_is_first = texts[0].contains("Full width intro");
    assert!(
        intro_is_first,
        "expected full-width intro first: {texts:?}"
    );
}
2560
#[test]
fn xy_cut_horizontal_split_for_zone_boundaries() {
    // Single-column page with a large vertical gap between two
    // paragraphs: the horizontal cut should fire and leave each
    // paragraph in its own block.
    let page = vec![
        span("First paragraph body text", 40.0, 740.0, 200.0),
        span("Second paragraph body", 40.0, 680.0, 180.0),
    ];
    let texts = block_texts(page);
    assert_eq!(texts.len(), 2);
    let (upper, lower) = (&texts[0], &texts[1]);
    assert!(upper.starts_with("First"));
    assert!(lower.starts_with("Second"));
}
2574
#[test]
fn xy_cut_recursion_terminates_with_single_span() {
    // A page holding one span must come back as exactly that one block.
    let lone = span("Only one span on the page", 40.0, 700.0, 180.0);
    let texts = block_texts(vec![lone]);
    let expected = vec!["Only one span on the page"];
    assert_eq!(texts, expected);
}
2580
#[test]
fn median_font_size_handles_mixed_sizes() {
    // Three spans at 8pt, 12pt and 24pt: the median must be 12pt.
    // Each span's height mirrors its font size; width is a fixed 10.0.
    let mut spans = Vec::new();
    for (label, size) in [("small", 8.0), ("medium", 12.0), ("large", 24.0)] {
        spans.push(TextSpan {
            text: label.into(),
            width: 10.0,
            height: size,
            font_size: size,
            ..TextSpan::default()
        });
    }
    let median = median_font_size(&spans);
    assert!((median - 12.0).abs() < 1e-9);
}
2608
#[test]
fn columns_band_aligned_accepts_aligned_columns() {
    // Two bands, each with a left span (40..100) and a right span
    // (300..360). The cut at x = 200 lies in the gap, and every band has
    // content on both sides of it.
    let spans = vec![
        span("L1", 40.0, 700.0, 60.0),
        span("R1", 300.0, 700.0, 60.0),
        span("L2", 40.0, 684.0, 60.0),
        span("R2", 300.0, 684.0, 60.0),
    ];
    let stats = PageStats::from_spans(&spans);
    let aligned = columns_are_band_aligned(&spans, 200.0, 40.0, 360.0, &stats);
    assert!(aligned);
}
2621
#[test]
fn columns_band_aligned_rejects_wide_single_side_band() {
    // With the cut at x = 200 the banner (midpoint < 200) falls only in
    // the left group, yet its width exceeds 0.7 x the left column width,
    // so the column split must be rejected.
    let spans = vec![
        span("Wide banner line across top", 40.0, 740.0, 280.0),
        span("L1", 40.0, 700.0, 60.0),
        span("R1", 300.0, 700.0, 60.0),
    ];
    let stats = PageStats::from_spans(&spans);
    let aligned = columns_are_band_aligned(&spans, 200.0, 40.0, 360.0, &stats);
    assert!(!aligned);
}
2636
#[test]
fn page_stats_computes_median_values() {
    let spans = vec![
        span("one", 40.0, 700.0, 30.0),
        span("two", 40.0, 680.0, 30.0),
        span("three", 40.0, 660.0, 50.0),
    ];
    let stats = PageStats::from_spans(&spans);

    let font_size_err = (stats.median_font_size - 12.0).abs();
    assert!(font_size_err < 1e-9);

    // Char width is width / char count: 30/3, 30/3 and 50/5 are all 10,
    // so the median is 10.
    let char_width_err = (stats.median_char_width - 10.0).abs();
    assert!(char_width_err < 1e-9);

    // Bands at y = 700, 680, 660 give gaps of 20 and 20; median is 20.
    let line_spacing_err = (stats.median_line_spacing - 20.0).abs();
    assert!(line_spacing_err < 1e-9);
}
2651
#[test]
fn page_stats_handles_empty_input() {
    // With no spans the stats are expected to be 12.0 font size,
    // 6.0 char width and zero line spacing.
    let stats = PageStats::from_spans(&[]);
    let font_size_err = (stats.median_font_size - 12.0).abs();
    let char_width_err = (stats.median_char_width - 6.0).abs();
    assert!(font_size_err < 1e-9);
    assert!(char_width_err < 1e-9);
    assert_eq!(stats.median_line_spacing, 0.0);
}
2659
#[test]
fn narrow_gutter_detected_with_adaptive_threshold() {
    // Academic-paper layout with a 12pt gutter between columns. The old
    // fixed 20pt threshold did not see this as columnar; the adaptive
    // threshold (median word gap ~4pt, threshold = 12pt) should.
    //
    // Per row, as (text, x, width):
    //   left column:  40..140, then 144..224 (4pt word gap)
    //   right column: 236..336, then 340..420 (starts 12pt after 224)
    let row_layout = [
        ("Lorem ipsum", 40.0, 100.0),
        ("dolor sit", 144.0, 80.0),
        ("amet consec", 236.0, 100.0),
        ("tetur adipi", 340.0, 80.0),
    ];
    let mut spans = Vec::new();
    for y in [700.0, 684.0, 668.0] {
        for (text, x, width) in row_layout {
            spans.push(span(text, x, y, width));
        }
    }
    let texts = block_texts(spans);

    // A detected 2-column layout, read column-major, yields >= 6 blocks.
    let block_count = texts.len();
    assert!(
        block_count >= 6,
        "expected column-major output, got {texts:?}"
    );
    // The very first block must come from the left column.
    let starts_in_left_column = texts[0].contains("Lorem");
    assert!(
        starts_in_left_column,
        "first block should be left column: {texts:?}"
    );
}
2686
#[test]
fn xy_cut_leaf_falls_back_to_legacy_columns_for_header_plus_three_columns() {
    // A two-span header band above three ragged-left columns of three
    // lines each. Expected reading order: the merged header, then each
    // column top-to-bottom, left to right.
    let page = vec![
        span("73022", 45.0, 750.0, 70.0),
        span("Federal Register banner", 125.6, 750.0, 260.0),
        span("Left column line one", 45.0, 725.0, 140.0),
        span("Middle column line one", 222.0, 725.0, 140.0),
        span("Right column line one", 399.0, 725.0, 120.0),
        span("Left column line two", 45.0, 715.0, 140.0),
        span("Middle column line two", 210.0, 715.0, 152.0),
        span("Right column line two", 388.0, 715.0, 132.0),
        span("Left column line three", 45.0, 705.0, 140.0),
        span("Middle column line three", 235.0, 705.0, 135.0),
        span("Right column line three", 408.0, 705.0, 118.0),
    ];
    let expected = vec![
        "73022 Federal Register banner",
        "Left column line one",
        "Left column line two",
        "Left column line three",
        "Middle column line one",
        "Middle column line two",
        "Middle column line three",
        "Right column line one",
        "Right column line two",
        "Right column line three",
    ];

    let texts = block_texts(page);
    assert_eq!(texts, expected);
}
2719
#[test]
fn overlapping_fake_bold_spans_collapse_to_single_copy() {
    // Fake bold here is the same text repeated at slightly shifted x
    // offsets. Line 1 repeats the whole line four times; line 2 repeats
    // only the word " fakebold". Each line must come out exactly once.
    let overprinted = vec![
        span("1 This is fakebold text.", 25.9, 785.3, 320.0),
        span("1 This is fakebold text.", 26.2, 785.3, 320.0),
        span("1 This is fakebold text.", 26.4, 785.3, 320.0),
        span("1 This is fakebold text.", 26.7, 785.3, 320.0),
        span("2 This is a fakebold", 27.0, 714.8, 142.0),
        span(" fakebold", 169.8, 714.8, 70.0),
        span(" fakebold", 170.1, 714.8, 70.0),
        span(" fakebold word.", 170.4, 714.8, 110.0),
    ];
    let expected = vec!["1 This is fakebold text.", "2 This is a fakebold word."];

    let texts = block_texts(overprinted);
    assert_eq!(texts, expected);
}
2738
2739    // ---- G1: read-only metadata field tests ----
2740
#[test]
fn g1_default_text_span_has_empty_metadata() {
    // A default span carries no font name, no colour and no style flags.
    let blank = TextSpan::default();
    assert!(blank.font_name.is_none());
    assert!(!blank.is_bold);
    assert!(!blank.is_italic);
    assert!(blank.color.is_none());
}
2749
#[test]
fn g1_strip_subset_prefix_handles_six_char_prefix() {
    let cases = [
        // A six-character prefix before `+` is stripped.
        ("AAAAAA+Helvetica", "Helvetica"),
        // A prefix that is not six characters long is kept verbatim.
        ("ABC+Helvetica", "ABC+Helvetica"),
        // A name without `+` is unchanged.
        ("Helvetica-Bold", "Helvetica-Bold"),
    ];
    for (raw, expected) in cases {
        assert_eq!(strip_subset_prefix(raw), expected);
    }
}
2758
#[test]
fn g1_name_style_hints_match_pdf_interpret_rules() {
    // Each case is (font name, (is_bold, is_italic)).
    let cases = [
        ("Helvetica-Bold", (true, false)),
        ("Times-Italic", (false, true)),
        ("MyFont-BoldOblique", (true, true)),
        ("Helvetica", (false, false)),
        // Semibold / Demi / Heavy / Black variants count as bold.
        ("Roboto-DemiBold", (true, false)),
        ("Roboto-Black", (true, false)),
        // Oblique / slant variants count as italic.
        ("Roboto-Oblique", (false, true)),
        ("MyFont-Slanted", (false, true)),
    ];
    for (font_name, expected) in cases {
        assert_eq!(name_style_hints(font_name), expected);
    }
}
2772
2773    // ---- G2: widthSource + char_bounds on TextSpan ----
2774
#[test]
fn g2_default_text_span_has_estimate_width_source() {
    // A default span reports `Estimate` and carries no char bounds.
    let blank = TextSpan::default();
    assert!(blank.char_bounds.is_empty());
    assert_eq!(blank.width_source, WidthSource::Estimate);
}
2781
/// A single-glyph span must carry exactly one `char_bounds` entry whose
/// horizontal extent equals the span's `x` .. `x + width` and whose
/// height equals the font size.
#[test]
fn g2_single_glyph_span_has_one_char_bound() {
    let glyph_span = TextSpan {
        text: "A".into(),
        x: 10.0,
        y: 100.0,
        width: 7.22,
        height: 10.0,
        font_size: 10.0,
        width_source: WidthSource::Metric,
        char_bounds: vec![[10.0, 100.0, 17.22, 110.0]],
        ..TextSpan::default()
    };

    assert_eq!(glyph_span.char_bounds.len(), 1);

    let [left, bottom, right, top] = glyph_span.char_bounds[0];
    // The left edge is the span origin; the right edge is origin + width.
    assert!((left - glyph_span.x).abs() < 0.001);
    assert!((right - (glyph_span.x + glyph_span.width)).abs() < 0.001);
    // The vertical extent equals the font size.
    assert!((top - bottom - glyph_span.font_size).abs() < 0.001);
}
2804
/// Merging an estimated glyph into a metric span downgrades the span's
/// `width_source` to `Estimate` while its `char_bounds` keep growing.
#[test]
fn g2_merged_span_degrades_width_source_on_estimate() {
    let mut merged = TextSpan {
        char_bounds: vec![[0.0, 0.0, 7.0, 10.0]],
        width_source: WidthSource::Metric,
        ..TextSpan::default()
    };

    // Mimic the merge step by hand: append the new glyph's bound and
    // downgrade because that second glyph had no advance.
    merged.width_source = WidthSource::Estimate;
    merged.char_bounds.push([7.0, 0.0, 12.0, 10.0]);

    assert_eq!(merged.char_bounds.len(), 2);
    assert_eq!(merged.width_source, WidthSource::Estimate);
}
2822
/// The two `WidthSource` variants have the expected `Debug` names, are
/// distinct from each other, and the default is `Estimate`.
#[test]
fn g2_width_source_variants_are_correct() {
    for (variant, debug_name) in [
        (WidthSource::Metric, "Metric"),
        (WidthSource::Estimate, "Estimate"),
    ] {
        assert_eq!(format!("{variant:?}"), debug_name);
    }
    assert_ne!(WidthSource::Metric, WidthSource::Estimate);
    assert_eq!(WidthSource::default(), WidthSource::Estimate);
}
2831
2832    // ---- M3: RichGeometry / tight glyph bounds tests ----
2833
#[test]
fn m3_default_text_span_has_basic_mode() {
    // A default span is in Basic mode with advance-derived bounds and
    // none of the rich-geometry vectors populated.
    let blank = TextSpan::default();
    assert_eq!(blank.geometry_mode, GeometryMode::Basic);
    assert_eq!(blank.bounds_source, BoundsSource::Advance);
    let rich_fields_empty = blank.tight_char_bounds.is_empty();
    assert!(rich_fields_empty);
    let advances_empty = blank.glyph_advances.is_empty();
    assert!(advances_empty);
}
2842
#[test]
fn m3_geometry_mode_default_is_basic() {
    // `Default` for the geometry mode must select `Basic`.
    let default_mode = GeometryMode::default();
    assert_eq!(default_mode, GeometryMode::Basic);
}
2847
#[test]
fn m3_bounds_source_default_is_advance() {
    // `Default` for the bounds source must select `Advance`.
    let default_source = BoundsSource::default();
    assert_eq!(default_source, BoundsSource::Advance);
}
2852
#[test]
fn m3_geometry_mode_variants_are_distinct() {
    // The two modes compare unequal and keep their `Debug` names.
    assert_ne!(GeometryMode::Basic, GeometryMode::RichGeometry);
    for (mode, debug_name) in [
        (GeometryMode::Basic, "Basic"),
        (GeometryMode::RichGeometry, "RichGeometry"),
    ] {
        assert_eq!(format!("{mode:?}"), debug_name);
    }
}
2859
#[test]
fn m3_bounds_source_variants_are_distinct() {
    // Neighbouring variants compare unequal, and each variant keeps its
    // `Debug` name.
    assert_ne!(BoundsSource::Advance, BoundsSource::Tight);
    assert_ne!(BoundsSource::Tight, BoundsSource::Estimate);
    for (source, debug_name) in [
        (BoundsSource::Advance, "Advance"),
        (BoundsSource::Tight, "Tight"),
        (BoundsSource::Estimate, "Estimate"),
    ] {
        assert_eq!(format!("{source:?}"), debug_name);
    }
}
2868
#[test]
fn m3_text_span_rich_geometry_preserves_existing_fields() {
    // A fully populated RichGeometry span: the pre-M3 fields must read
    // back as given, alongside the new M3 fields.
    let rich = TextSpan {
        text: "Hello".into(),
        x: 10.0,
        y: 100.0,
        width: 30.0,
        height: 12.0,
        font_size: 12.0,
        font_name: Some("Helvetica".into()),
        is_bold: true,
        is_italic: false,
        color: Some([0, 0, 0, 255]),
        width_source: WidthSource::Metric,
        char_bounds: vec![[10.0, 100.0, 40.0, 112.0]],
        geometry_mode: GeometryMode::RichGeometry,
        bounds_source: BoundsSource::Tight,
        tight_char_bounds: vec![[8.5, 98.5, 41.5, 114.0]],
        glyph_advances: vec![30.0],
        glyph_bounds_sources: vec![BoundsSource::Tight],
        ..TextSpan::default()
    };

    // Pre-existing fields.
    assert_eq!(rich.text, "Hello");
    let x_err = (rich.x - 10.0).abs();
    let y_err = (rich.y - 100.0).abs();
    assert!(x_err < 0.001);
    assert!(y_err < 0.001);
    assert_eq!(rich.width_source, WidthSource::Metric);
    assert_eq!(rich.char_bounds.len(), 1);

    // M3 additions.
    assert_eq!(rich.geometry_mode, GeometryMode::RichGeometry);
    assert_eq!(rich.bounds_source, BoundsSource::Tight);
    assert_eq!(rich.tight_char_bounds.len(), 1);
    assert_eq!(rich.glyph_advances.len(), 1);
    let advance_err = (rich.glyph_advances[0] - 30.0).abs();
    assert!(advance_err < 0.001);
}
2906
#[test]
fn m3_text_span_basic_mode_does_not_populate_tight_fields() {
    // A Basic-mode span built without rich data leaves both the tight
    // bounds and the glyph advances empty.
    let basic = TextSpan {
        geometry_mode: GeometryMode::Basic,
        bounds_source: BoundsSource::Advance,
        text: "A".into(),
        x: 0.0,
        y: 0.0,
        width: 7.0,
        height: 10.0,
        font_size: 10.0,
        ..TextSpan::default()
    };

    let tight_empty = basic.tight_char_bounds.is_empty();
    let advances_empty = basic.glyph_advances.is_empty();
    assert!(tight_empty);
    assert!(advances_empty);
}
2924
#[test]
fn m3_tight_char_bounds_are_optional_in_basic_mode() {
    // Basic-mode spans are valid without any tight-bounds data: both
    // rich vectors stay empty when they are not supplied.
    let basic = TextSpan {
        geometry_mode: GeometryMode::Basic,
        text: "test".into(),
        x: 0.0,
        y: 0.0,
        width: 40.0,
        height: 12.0,
        font_size: 12.0,
        ..TextSpan::default()
    };
    let tight_empty = basic.tight_char_bounds.is_empty();
    let advances_empty = basic.glyph_advances.is_empty();
    assert!(tight_empty);
    assert!(advances_empty);
}
2941
#[test]
fn m3_test_span_constructor_preserves_all_fields() {
    // The `span` test helper used by the older tests must keep producing
    // Basic-mode spans with advance bounds and no tight bounds.
    let built = span("hello", 10.0, 100.0, 30.0);
    assert_eq!(built.geometry_mode, GeometryMode::Basic);
    assert_eq!(built.bounds_source, BoundsSource::Advance);
    let tight_empty = built.tight_char_bounds.is_empty();
    assert!(tight_empty);
}
2950
#[test]
fn m3_width_source_unchanged_by_m3() {
    // WidthSource must be independent of RichGeometry. The previous
    // version compared each variant with itself, which can never fail
    // and is flagged by `clippy::eq_op`; these checks involve a
    // RichGeometry span instead.

    // An explicitly set width source survives on a RichGeometry span.
    let rich_metric = TextSpan {
        geometry_mode: GeometryMode::RichGeometry,
        width_source: WidthSource::Metric,
        ..Default::default()
    };
    assert_eq!(rich_metric.width_source, WidthSource::Metric);

    // Switching the mode alone leaves the default width source in place.
    let rich_default = TextSpan {
        geometry_mode: GeometryMode::RichGeometry,
        ..Default::default()
    };
    assert_eq!(rich_default.width_source, WidthSource::Estimate);

    // The enum default itself is still `Estimate`.
    assert_eq!(WidthSource::default(), WidthSource::Estimate);
}
2958
#[test]
fn m3_transform_bbox_corners_identity() {
    // The identity transform must hand the bbox back unchanged.
    let bbox = [0.0, 0.0, 10.0, 12.0];
    let transform = kurbo::Affine::IDENTITY;
    let result = transform_bbox_corners(&bbox, &transform);

    let expected = [0.0, 0.0, 10.0, 12.0];
    for (i, want) in expected.iter().enumerate() {
        assert!((result[i] - *want).abs() < 0.001);
    }
}
2969
#[test]
fn m3_transform_bbox_corners_translation() {
    // Translating by (50, 100) shifts every corner by that offset.
    let bbox = [0.0, 0.0, 10.0, 12.0];
    let shift = kurbo::Affine::translate((50.0, 100.0));
    let result = transform_bbox_corners(&bbox, &shift);

    let expected = [50.0, 100.0, 60.0, 112.0];
    for (i, want) in expected.iter().enumerate() {
        assert!((result[i] - *want).abs() < 0.001);
    }
}
2980
#[test]
fn m3_transform_bbox_corners_scale() {
    // A uniform 2x scale doubles the far corner; the origin stays put.
    let bbox = [0.0, 0.0, 10.0, 12.0];
    let doubling = kurbo::Affine::scale(2.0);
    let result = transform_bbox_corners(&bbox, &doubling);

    let expected = [0.0, 0.0, 20.0, 24.0];
    for (i, want) in expected.iter().enumerate() {
        assert!((result[i] - *want).abs() < 0.001);
    }
}
2991
#[test]
fn m3_transform_bbox_corners_with_negative_bbox() {
    // Descender glyphs have a negative y0 in font space; negative
    // coordinates must scale like any others.
    let bbox = [-5.0, -200.0, 15.0, 800.0];
    let font_scale = kurbo::Affine::scale(0.012); // 12pt font at 1000-upem
    let result = transform_bbox_corners(&bbox, &font_scale);

    let expected = [-0.06, -2.4, 0.18, 9.6];
    for (i, want) in expected.iter().enumerate() {
        assert!((result[i] - *want).abs() < 0.01);
    }
}
3003
#[test]
fn m3_tight_char_bounds_descender_detection() {
    // For a glyph with a descender, the tight bound's y0 lies below the
    // span's baseline y.
    let descender = TextSpan {
        text: "g".into(),
        x: 100.0,
        y: 200.0, // baseline
        width: 8.0,
        height: 12.0,
        font_size: 12.0,
        geometry_mode: GeometryMode::RichGeometry,
        bounds_source: BoundsSource::Tight,
        char_bounds: vec![[100.0, 200.0, 108.0, 212.0]],
        tight_char_bounds: vec![[99.0, 197.5, 109.0, 210.0]],
        glyph_advances: vec![8.0],
        ..TextSpan::default()
    };

    // Tight y0 (197.5) is below the baseline (200.0).
    let tight_y0 = descender.tight_char_bounds[0][1];
    assert!(
        tight_y0 < descender.y,
        "descender should extend below baseline"
    );
}
3028
#[test]
fn m3_tight_char_bounds_ascender_detection() {
    // For a glyph with an ascender, the tight bound's y1 lies above the
    // top of the advance box (span y + height).
    let ascender = TextSpan {
        text: "f".into(),
        x: 100.0,
        y: 200.0,
        width: 7.0,
        height: 12.0,
        font_size: 12.0,
        geometry_mode: GeometryMode::RichGeometry,
        bounds_source: BoundsSource::Tight,
        char_bounds: vec![[100.0, 200.0, 107.0, 212.0]],
        tight_char_bounds: vec![[95.0, 200.0, 108.0, 214.0]],
        glyph_advances: vec![7.0],
        ..TextSpan::default()
    };

    // Tight y1 (214.0) against advance top (200.0 + 12.0 = 212.0).
    let advance_top = ascender.y + ascender.height;
    let tight_top = ascender.tight_char_bounds[0][3];
    assert!(
        tight_top > advance_top,
        "ascender should extend above advance bounds"
    );
}
3054
#[test]
fn m3_tight_char_bounds_differ_from_advance() {
    // Tight bounds must describe a different box than advance bounds:
    // at least one coordinate differs by more than 0.01.
    let glyph_span = TextSpan {
        text: "A".into(),
        x: 10.0,
        y: 100.0,
        width: 7.22,
        height: 10.0,
        font_size: 10.0,
        geometry_mode: GeometryMode::RichGeometry,
        bounds_source: BoundsSource::Tight,
        char_bounds: vec![[10.0, 100.0, 17.22, 110.0]], // advance
        tight_char_bounds: vec![[9.5, 100.2, 17.5, 109.5]], // tight (different)
        glyph_advances: vec![7.22],
        ..TextSpan::default()
    };

    let advance_box = glyph_span.char_bounds[0];
    let tight_box = glyph_span.tight_char_bounds[0];
    let differs = advance_box
        .iter()
        .zip(tight_box.iter())
        .any(|(a, t)| (a - t).abs() > 0.01);
    assert!(differs, "tight bounds should differ from advance bounds");
}
3081
#[test]
fn m3_device_new_uses_basic_mode() {
    // `new()` yields a Basic-mode device with no deferred rich glyphs.
    let device = TextExtractionDevice::new();
    let nothing_deferred = device.deferred_rich_glyphs.is_empty();
    assert_eq!(device.geometry_mode, GeometryMode::Basic);
    assert!(nothing_deferred);
}
3088
#[test]
fn m3_device_with_mode_rich_geometry() {
    // `with_mode(RichGeometry)` records the requested mode and starts
    // with no deferred rich glyphs.
    let device = TextExtractionDevice::with_mode(GeometryMode::RichGeometry);
    let nothing_deferred = device.deferred_rich_glyphs.is_empty();
    assert_eq!(device.geometry_mode, GeometryMode::RichGeometry);
    assert!(nothing_deferred);
}
3095
/// Returns a synthetic rectangular glyph outline in raw design space
/// (x in 100..600, y in 0..700) together with its bounding box.
///
/// The renderer (`renderer.rs::fill_glyph`) fills the raw outline with
/// `composed = transform * glyph_transform` and applies NO upem
/// pre-scaling, so a tight glyph's page-space bbox MUST equal
/// `(composed * outline).bounding_box()`. Tests use that as the
/// ground-truth oracle to guard against scaling regressions.
#[cfg(test)]
fn synthetic_glyph_rect() -> (BezPath, kurbo::Rect) {
    let mut outline = BezPath::new();
    outline.move_to((100.0, 0.0));
    // The remaining corners, in the same winding order as before.
    for corner in [(600.0, 0.0), (600.0, 700.0), (100.0, 700.0)] {
        outline.line_to(corner);
    }
    outline.close_path();
    let bounds = outline.bounding_box();
    (outline, bounds)
}
3113
3114    #[test]
3115    fn m3_basic_mode_skips_tight_bounds_computation() {
3116        // Basic mode must NOT compute tight bounds, even when a span is present.
3117        let mut dev = TextExtractionDevice::new();
3118        dev.spans.push(TextSpan {
3119            text: "A".into(),
3120            char_bounds: vec![[0.0, 0.0, 7.0, 10.0]],
3121            ..Default::default()
3122        });
3123        let spans = dev.into_spans();
3124        assert_eq!(spans.len(), 1);
3125        assert!(spans[0].tight_char_bounds.is_empty());
3126        assert!(spans[0].glyph_bounds_sources.is_empty());
3127        assert_eq!(spans[0].bounds_source, BoundsSource::Advance);
3128    }
3129
3130    #[test]
3131    fn m3_rich_geometry_tight_bounds_match_rendered_outline() {
3132        // Axis-aligned scale 0.01: the page-space tight bbox must equal the
3133        // bbox of the transformed outline — i.e. design [100,600]x[0,700]
3134        // scaled by 0.01 → [1.0, 0.0, 6.0, 7.0]. A double-scale regression
3135        // (e.g. an extra font_size/1000) would yield ~[0.01, 0, 0.06, 0.07].
3136        let (path, bbox) = synthetic_glyph_rect();
3137        let composed = Affine::scale(0.01);
3138        let oracle = (composed * path.clone()).bounding_box();
3139
3140        let mut dev = TextExtractionDevice::with_mode(GeometryMode::RichGeometry);
3141        dev.spans.push(TextSpan {
3142            text: "A".into(),
3143            char_bounds: vec![[0.0, 0.0, 7.0, 10.0]],
3144            geometry_mode: GeometryMode::RichGeometry,
3145            ..Default::default()
3146        });
3147        dev.deferred_rich_glyphs.push(DeferredGlyph {
3148            coeffs: composed.as_coeffs(),
3149            font_size: 10.0,
3150            glyph_width: 7.0,
3151            needs_exact: false,
3152            outline: Some(path),
3153            font_bbox: Some(bbox),
3154        });
3155
3156        let spans = dev.into_spans();
3157        assert_eq!(spans[0].glyph_bounds_sources, vec![BoundsSource::Tight]);
3158        assert_eq!(spans[0].bounds_source, BoundsSource::Tight);
3159        let tb = spans[0].tight_char_bounds[0];
3160        assert!(
3161            (tb[0] - oracle.x0).abs() < 1e-6,
3162            "x0: got {} want {}",
3163            tb[0],
3164            oracle.x0
3165        );
3166        assert!(
3167            (tb[1] - oracle.y0).abs() < 1e-6,
3168            "y0: got {} want {}",
3169            tb[1],
3170            oracle.y0
3171        );
3172        assert!(
3173            (tb[2] - oracle.x1).abs() < 1e-6,
3174            "x1: got {} want {}",
3175            tb[2],
3176            oracle.x1
3177        );
3178        assert!(
3179            (tb[3] - oracle.y1).abs() < 1e-6,
3180            "y1: got {} want {}",
3181            tb[3],
3182            oracle.y1
3183        );
3184        // Sanity: tight height (~7) is commensurate with the advance font_size
3185        // (10), not ~100x smaller — the regression this test exists to catch.
3186        assert!(
3187            tb[3] - tb[1] > 1.0,
3188            "tight height must be commensurate with font size"
3189        );
3190    }
3191
3192    #[test]
3193    fn m3_rich_geometry_tight_bounds_rotation_is_exact() {
3194        // 90° rotation routes through the needs_exact (transform-then-bound)
3195        // path. Oracle = bbox of the rotated outline.
3196        let (path, bbox) = synthetic_glyph_rect();
3197        let composed = Affine::scale(0.01) * Affine::rotate(std::f64::consts::FRAC_PI_2);
3198        let oracle = (composed * path.clone()).bounding_box();
3199
3200        let mut dev = TextExtractionDevice::with_mode(GeometryMode::RichGeometry);
3201        dev.spans.push(TextSpan {
3202            text: "A".into(),
3203            char_bounds: vec![[0.0, 0.0, 7.0, 10.0]],
3204            geometry_mode: GeometryMode::RichGeometry,
3205            ..Default::default()
3206        });
3207        dev.deferred_rich_glyphs.push(DeferredGlyph {
3208            coeffs: composed.as_coeffs(),
3209            font_size: 10.0,
3210            glyph_width: 7.0,
3211            needs_exact: true,
3212            outline: Some(path),
3213            font_bbox: Some(bbox),
3214        });
3215
3216        let spans = dev.into_spans();
3217        assert_eq!(spans[0].glyph_bounds_sources, vec![BoundsSource::Tight]);
3218        let tb = spans[0].tight_char_bounds[0];
3219        assert!(
3220            (tb[0] - oracle.x0).abs() < 1e-6,
3221            "x0: got {} want {}",
3222            tb[0],
3223            oracle.x0
3224        );
3225        assert!(
3226            (tb[1] - oracle.y0).abs() < 1e-6,
3227            "y0: got {} want {}",
3228            tb[1],
3229            oracle.y0
3230        );
3231        assert!(
3232            (tb[2] - oracle.x1).abs() < 1e-6,
3233            "x1: got {} want {}",
3234            tb[2],
3235            oracle.x1
3236        );
3237        assert!(
3238            (tb[3] - oracle.y1).abs() < 1e-6,
3239            "y1: got {} want {}",
3240            tb[3],
3241            oracle.y1
3242        );
3243    }
3244
3245    #[test]
3246    fn m3_empty_device_rich_geometry_no_panic() {
3247        let dev = TextExtractionDevice::with_mode(GeometryMode::RichGeometry);
3248        let text = dev.into_text();
3249        assert!(text.is_empty());
3250    }
3251
3252    #[test]
3253    fn m3_bounds_source_estimate_fallback() {
3254        // Verify that BoundsSource::Estimate can be set on a span.
3255        let s = TextSpan {
3256            geometry_mode: GeometryMode::RichGeometry,
3257            bounds_source: BoundsSource::Estimate,
3258            ..Default::default()
3259        };
3260        assert_eq!(s.bounds_source, BoundsSource::Estimate);
3261        assert_eq!(s.geometry_mode, GeometryMode::RichGeometry);
3262        // Fallback still has empty tight fields (no glyphs).
3263        assert!(s.tight_char_bounds.is_empty());
3264    }
3265
3266    #[test]
3267    fn m3_multiple_glyphs_tight_bounds_consistent_count() {
3268        // Simulate a span with 3 glyphs — tight_char_bounds and glyph_advances
3269        // should have the same length.
3270        let s = TextSpan {
3271            text: "abc".into(),
3272            x: 0.0,
3273            y: 0.0,
3274            width: 25.0,
3275            height: 12.0,
3276            font_size: 12.0,
3277            geometry_mode: GeometryMode::RichGeometry,
3278            bounds_source: BoundsSource::Tight,
3279            char_bounds: vec![[0., 0., 8., 12.], [8., 0., 16., 12.], [16., 0., 25., 12.]],
3280            tight_char_bounds: vec![
3281                [-1., -2., 9., 14.],
3282                [7., -2., 17., 14.],
3283                [15., -2., 26., 14.],
3284            ],
3285            glyph_advances: vec![8.0, 8.0, 9.0],
3286            ..Default::default()
3287        };
3288
3289        assert_eq!(s.tight_char_bounds.len(), 3);
3290        assert_eq!(s.glyph_advances.len(), 3);
3291        assert_eq!(s.char_bounds.len(), 3);
3292        // Every glyph has advance >= 0.
3293        for &adv in &s.glyph_advances {
3294            assert!(adv >= 0.0, "glyph advance must be non-negative");
3295        }
3296    }
3297}