Skip to main content

pdf_engine/
text.rs

1//! Text extraction via a custom Device implementation.
2
3use kurbo::{Affine, BezPath};
4use pdf_render::pdf_interpret::cmap::BfString;
5use pdf_render::pdf_interpret::font::Glyph;
6use pdf_render::pdf_interpret::{
7    BlendMode, ClipPath, Device, GlyphDrawMode, Image, Paint, PathDrawMode, SoftMask,
8};
9use std::cmp::Ordering;
10
/// Absolute floor for the vertical tolerance used when grouping spans into
/// horizontal bands. Per ANN[r17/TEX4] the working tolerance is normally
/// `median_font_size * BAND_Y_FRACTION`; this constant is the lower bound
/// that tolerance is never allowed to drop below.
const BAND_Y_TOLERANCE: f64 = 5.0;
/// Share of the page's median font size that becomes the band-Y tolerance.
/// 0.30× was picked empirically: it stays under typical leading (~1.2×), so
/// neighbouring lines are not merged, yet it is large enough to absorb
/// sub-pixel baseline drift.
const BAND_Y_FRACTION: f64 = 0.30;
/// Factor applied to the median line spacing to obtain the cut threshold for
/// horizontal paragraph breaks. Ordinary line-to-line progression sits near
/// 1.0× the median, whereas paragraph breaks usually measure 1.5× or more.
const PARAGRAPH_BREAK_LINE_SPACING_MULTIPLIER: f64 = 1.8;
24
// ANN[r17/TEX1] Thresholds for the multi-signal space-detection consensus.
//
// The earlier rule used one geometric threshold (gap > 0.15 * font_size).
// It lost word boundaries when kerning or narrow fonts shrank the measured
// gap even though the PDF carried an explicit TJ shift, and it produced
// spurious spaces in condensed fonts where 0.15em of kerning is far less
// than a real word space. The consensus scheme adds up the weights of the
// signals that fire and inserts a space once the total reaches
// SPACE_CONSENSUS_THRESHOLD.
//
// NOTE(review): with the values below, the TJ signal (0.95) and the gap
// signal (0.80) each reach the threshold (0.75) on their own, while the
// heuristic signal (0.60) cannot reach it alone — so the heuristic never
// changes the final decision. Confirm whether that calibration is intended.

/// Magnitude of a raw TJ adjustment (in TJ units, 1/1000 em) that is treated
/// as a definite word break. The original note records that this mirrors
/// pdftotext / MuPDF heuristics: a space is usually emitted either as a
/// literal 0x20 or as a TJ adjustment of roughly 250 units, so 100 units is
/// a conservative floor.
const TJ_SPACE_THRESHOLD_UNITS: f32 = 100.0;
/// Confidence contributed by the TJ offset signal when it fires.
const TJ_SIGNAL_WEIGHT: f64 = 0.95;
/// Confidence contributed by the purely geometric gap signal.
const GAP_SIGNAL_WEIGHT: f64 = 0.80;
/// Confidence contributed by character-class heuristics (CamelCase,
/// digit↔letter transitions).
const HEURISTIC_SIGNAL_WEIGHT: f64 = 0.60;
/// Total confidence at or above which a space is inserted.
const SPACE_CONSENSUS_THRESHOLD: f64 = 0.75;
/// Fraction of the median character width that a gap must exceed to count
/// towards the geometric signal (pdf_oxide uses ~0.30).
const GAP_TO_MEDIAN_CHAR_FRACTION: f64 = 0.30;
/// Gap fraction relative to `font_size`, used instead of the median-based
/// reference while no median character width has been established yet.
const GAP_TO_FONT_SIZE_FALLBACK_FRACTION: f64 = 0.15;
53
/// Smallest horizontal gap that may be treated as a column gutter; lower
/// bound of the adaptive column-gap threshold.
const COLUMN_GAP_THRESHOLD_MIN: f64 = 10.0;
/// Upper bound of the adaptive column-gap threshold.
const COLUMN_GAP_THRESHOLD_MAX: f64 = 40.0;
/// Factor applied to the median inter-word gap to derive the column
/// threshold.
const COLUMN_GAP_MEDIAN_MULTIPLIER: f64 = 3.0;
/// Column-gap threshold used when no median gap can be computed.
const COLUMN_GAP_THRESHOLD_FALLBACK: f64 = 20.0;
/// Largest drift tolerated when matching gutters between neighbouring bands.
const COLUMN_GAP_MATCH_TOLERANCE: f64 = 12.0;
/// Number of bands with gaps that must be present before column mode is
/// enabled.
const MIN_COLUMN_GAPPED_BANDS: usize = 3;
/// Smallest fraction of a region's bands that must expose the shared
/// gutters.
const MIN_COLUMN_GAP_SUPPORT: f64 = 0.80;
/// Smallest fraction of non-empty column slices that must look like prose.
const MIN_DENSE_SLICE_RATIO: f64 = 0.35;
70
/// Records how a span's width was obtained: from real font metrics or from
/// a size-based guess.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Default)]
pub enum WidthSource {
    /// The width comes from the font's own glyph advance (hmtx, CFF, Type1
    /// charstring).
    Metric,
    /// No glyph metric was available, so the width was guessed as 50 % of
    /// the font size. This is the default variant.
    #[default]
    Estimate,
}
80
81/// A single text span at a specific position.
82#[derive(Debug, Clone, Default)]
83pub struct TextSpan {
84    /// The extracted text.
85    pub text: String,
86    /// X position in user space.
87    pub x: f64,
88    /// Y position in user space.
89    pub y: f64,
90    /// Approximate bounding-box width in user space.
91    pub width: f64,
92    /// Approximate bounding-box height in user space.
93    pub height: f64,
94    /// Font size (approximate, from transform).
95    pub font_size: f64,
96
97    // ---- G1 read-only metadata (added 2026-05; backward-compatible) ----
98    /// PostScript name of the font, with any 6-character subset prefix stripped
99    /// (e.g. `Helvetica-Bold`, `TimesNewRomanPS-BoldMT`). `None` for Type1
100    /// standard-14 fonts and Type3 fonts where no embedded font data is
101    /// available through the public `pdf-interpret` API.
102    pub font_name: Option<String>,
103    /// Inferred bold style: `weight >= 700` or PostScript name suggests bold
104    /// ("bold", "demi", "semibold", "heavy", "black"). Defaults `false` when
105    /// no descriptor data is reachable.
106    pub is_bold: bool,
107    /// Inferred italic style: FontDescriptor /Italic flag set or PostScript
108    /// name suggests italic/oblique/slant. Defaults `false` when no descriptor
109    /// data is reachable.
110    pub is_italic: bool,
111    /// Fill color as sRGB RGBA, derived from `Paint::Color(c).to_rgba().to_rgba8()`
112    /// at the moment of glyph paint. `None` for `Paint::Pattern` (tiling /
113    /// shading) — the editor falls back to "auto" in that case.
114    pub color: Option<[u8; 4]>,
115
116    // ---- G2 glyph-level metrics (added 2026-05) ----
117    /// Whether glyph widths were measured from real font advance data or estimated.
118    pub width_source: WidthSource,
119    /// Per-glyph bounding boxes in user-space, one entry per source glyph.
120    /// `[x0, y0, x1, y1]` with y0 < y1 (PDF coordinate frame).
121    pub char_bounds: Vec<[f64; 4]>,
122}
123
124impl TextSpan {
125    /// Conservative right edge using whichever is wider: measured or estimated.
126    /// Used by column detection to avoid underestimating span extent.
127    fn right(&self) -> f64 {
128        self.x + self.width.max(self.estimated_width())
129    }
130
131    /// Right edge from measured glyph positions only.
132    fn measured_right(&self) -> f64 {
133        self.x + self.width
134    }
135
136    fn estimated_width(&self) -> f64 {
137        let char_count = self.text.chars().count() as f64;
138        if char_count <= 0.0 {
139            self.font_size * 0.5
140        } else {
141            self.font_size * 0.5 * char_count
142        }
143    }
144}
145
146/// A block of text (grouped by reading order).
147#[derive(Debug, Clone)]
148pub struct TextBlock {
149    /// Spans within this block, sorted by position.
150    pub spans: Vec<TextSpan>,
151}
152
153impl TextBlock {
154    /// Concatenate all spans into a single string.
155    ///
156    /// Spans that are close together are joined without a separator;
157    /// a space is inserted when the gap between spans exceeds half
158    /// the average character width.
159    pub fn text(&self) -> String {
160        if self.spans.is_empty() {
161            return String::new();
162        }
163        let mut result = self.spans[0].text.clone();
164        for pair in self.spans.windows(2) {
165            let prev = &pair[0];
166            let curr = &pair[1];
167            let expected_end = prev.measured_right();
168            let gap = curr.x - expected_end;
169            if gap <= prev.font_size * 0.12 {
170                if let Some(trimmed) = trim_overlapping_word_prefix(&prev.text, &curr.text) {
171                    result.push_str(&trimmed);
172                    continue;
173                }
174            }
175            if gap > prev.font_size * 0.25 {
176                result.push(' ');
177            }
178            result.push_str(&curr.text);
179        }
180        result
181    }
182}
183
184#[derive(Debug, Clone)]
185struct TextBand {
186    y: f64,
187    spans: Vec<TextSpan>,
188}
189
190impl TextBand {
191    fn new(span: TextSpan) -> Self {
192        Self {
193            y: span.y,
194            spans: vec![span],
195        }
196    }
197
198    fn sort_spans(&mut self) {
199        self.spans.sort_by(|a, b| {
200            a.x.partial_cmp(&b.x)
201                .unwrap_or(Ordering::Equal)
202                .then_with(|| b.y.partial_cmp(&a.y).unwrap_or(Ordering::Equal))
203        });
204        collapse_overprinted_spans(&mut self.spans);
205    }
206
207    fn row_block(&self) -> TextBlock {
208        let mut spans = self.spans.clone();
209        spans.sort_by(|a, b| a.x.partial_cmp(&b.x).unwrap_or(Ordering::Equal));
210        TextBlock { spans }
211    }
212
213    fn left(&self) -> f64 {
214        self.spans
215            .iter()
216            .map(|span| span.x)
217            .fold(f64::INFINITY, f64::min)
218    }
219
220    fn right(&self) -> f64 {
221        self.spans
222            .iter()
223            .map(TextSpan::right)
224            .fold(f64::NEG_INFINITY, f64::max)
225    }
226
227    fn width(&self) -> f64 {
228        (self.right() - self.left()).max(0.0)
229    }
230
231    fn gap_midpoints(&self, column_gap_threshold: f64) -> Vec<f64> {
232        self.gaps(column_gap_threshold)
233            .into_iter()
234            .map(|gap| (gap.start + gap.end) * 0.5)
235            .collect()
236    }
237
238    fn gaps(&self, column_gap_threshold: f64) -> Vec<BandGap> {
239        if self.spans.len() < 2 {
240            return Vec::new();
241        }
242
243        let mut spans = self.spans.clone();
244        spans.sort_by(|a, b| a.x.partial_cmp(&b.x).unwrap_or(Ordering::Equal));
245
246        let mut gaps = Vec::new();
247        let mut prev_right = spans[0].right();
248        for span in spans.iter().skip(1) {
249            let gap = span.x - prev_right;
250            if gap >= column_gap_threshold {
251                gaps.push(BandGap {
252                    start: prev_right,
253                    end: span.x,
254                });
255            }
256            prev_right = prev_right.max(span.right());
257        }
258
259        gaps
260    }
261
262    fn split_by_boundaries(&self, boundaries: &[f64]) -> Vec<Vec<TextSpan>> {
263        let mut columns = vec![Vec::new(); boundaries.len() + 1];
264        for span in &self.spans {
265            let center_x = span.x + span.width.max(span.estimated_width()) * 0.5;
266            let column_idx = boundaries
267                .iter()
268                .position(|boundary| center_x < *boundary)
269                .unwrap_or(boundaries.len());
270            columns[column_idx].push(span.clone());
271        }
272
273        for spans in &mut columns {
274            spans.sort_by(|a, b| a.x.partial_cmp(&b.x).unwrap_or(Ordering::Equal));
275        }
276
277        columns
278    }
279
280    fn fits_single_column(
281        &self,
282        boundaries: &[f64],
283        region_left: f64,
284        region_right: f64,
285    ) -> Option<usize> {
286        let mut column_idx: Option<usize> = None;
287        for span in &self.spans {
288            let left = span.x;
289            let right = span.right();
290            if boundaries
291                .iter()
292                .any(|boundary| left < *boundary && right > *boundary)
293            {
294                return None;
295            }
296
297            let center_x = left + (right - left) * 0.5;
298            let idx = boundaries
299                .iter()
300                .position(|boundary| center_x < *boundary)
301                .unwrap_or(boundaries.len());
302            match column_idx {
303                Some(existing) if existing != idx => return None,
304                Some(_) => {}
305                None => column_idx = Some(idx),
306            }
307        }
308        let idx = column_idx?;
309        let mut edges = Vec::with_capacity(boundaries.len() + 2);
310        edges.push(region_left);
311        edges.extend_from_slice(boundaries);
312        edges.push(region_right);
313
314        let column_width = (edges[idx + 1] - edges[idx]).max(0.0);
315        if column_width <= 0.0 || self.width() > column_width * 0.8 {
316            return None;
317        }
318
319        Some(idx)
320    }
321}
322
/// Horizontal gap inside a band, from the right edge of the covered area
/// (`start`) to the origin of the next span (`end`).
#[derive(Debug, Clone, Copy)]
struct BandGap {
    /// X coordinate where the gap begins.
    start: f64,
    /// X coordinate where the gap ends.
    end: f64,
}
328
329/// A Device implementation that captures text from draw_glyph calls.
330///
331/// ANN[r17/TEX1][r17/TEX3] Space detection uses a multi-signal consensus
332/// rather than a single geometric threshold. Three signals vote:
333///   1. `pending_tj_offset`  — raw TJ backward shift surfaced by the
334///      interpreter (confidence 0.95). This is the definitive word-break
335///      signal used by pdftotext / MuPDF.
336///   2. geometric gap        — measured horizontal distance between the
337///      previous glyph's right edge and this glyph's origin (confidence
338///      0.80). Compared against the running median glyph width rather
339///      than a flat em-fraction so condensed/wide fonts are handled
340///      uniformly.
341///   3. character heuristic  — CamelCase transition or digit↔letter
342///      transition at the merge point (confidence 0.60). Catches cases
343///      where the writer relied on typography (e.g. table cells glued
344///      with zero gap: `Qty1Price$5`).
345///
346/// A space is inserted when the weighted sum meets SPACE_CONSENSUS_THRESHOLD.
347/// Span accumulation still merges adjacent glyphs into one TextSpan (TEX3)
348/// so downstream reading-order logic sees logical text runs, not individual
349/// character positions.
350pub(crate) struct TextExtractionDevice {
351    spans: Vec<TextSpan>,
352    last_y: f64,
353    last_end_x: f64,
354    /// TJ adjustment in raw 1/1000 em units since the last glyph was
355    /// drawn. Positive values = backward shift (i.e., explicit horizontal
356    /// space). Reset every time a glyph is drawn.
357    pending_tj_offset: f32,
358    /// Running sample of measured glyph widths used as the adaptive
359    /// reference for the geometric gap signal. Cheap to maintain and
360    /// avoids having to re-walk all spans per decision.
361    glyph_widths: Vec<f64>,
362    /// Cached median glyph width (kept fresh every `MEDIAN_REFRESH`
363    /// insertions). Zero = not yet established, caller falls back to
364    /// font-size scaling.
365    cached_median_char_width: f64,
366}
367
368const MEDIAN_REFRESH: usize = 32;
369
370impl Default for TextExtractionDevice {
371    fn default() -> Self {
372        Self::new()
373    }
374}
375
376impl TextExtractionDevice {
377    /// Create a new text extraction device.
378    pub fn new() -> Self {
379        Self {
380            spans: Vec::new(),
381            last_y: f64::NEG_INFINITY,
382            last_end_x: f64::NEG_INFINITY,
383            pending_tj_offset: 0.0,
384            glyph_widths: Vec::new(),
385            cached_median_char_width: 0.0,
386        }
387    }
388
389    /// Refresh the cached median char width. Called lazily from
390    /// `draw_glyph` to keep the hot path cheap.
391    fn refresh_median_char_width(&mut self) {
392        if self.glyph_widths.is_empty() {
393            self.cached_median_char_width = 0.0;
394            return;
395        }
396        let mut sorted = self.glyph_widths.clone();
397        sorted.sort_by(|a, b| a.partial_cmp(b).unwrap_or(Ordering::Equal));
398        self.cached_median_char_width = sorted[sorted.len() / 2];
399    }
400
401    /// Decide whether a space should be glued between two glyphs within
402    /// the same span. Returns (insert_space, start_new_span).
403    fn evaluate_space_consensus(
404        &self,
405        gap: f64,
406        font_size: f64,
407        prev_text: &str,
408        next_text: &str,
409    ) -> bool {
410        let mut confidence = 0.0;
411
412        // Signal 1 — TJ offset (highest confidence). Raw units; a full
413        // space is ~250. Anything over TJ_SPACE_THRESHOLD_UNITS counts.
414        if self.pending_tj_offset.abs() >= TJ_SPACE_THRESHOLD_UNITS {
415            confidence += TJ_SIGNAL_WEIGHT;
416        }
417
418        // Signal 2 — geometric gap. Prefer the adaptive median-char-width
419        // reference; fall back to font-size when the median hasn't been
420        // established yet (first few glyphs on a page).
421        let gap_reference = if self.cached_median_char_width > 0.0 {
422            self.cached_median_char_width * GAP_TO_MEDIAN_CHAR_FRACTION
423        } else {
424            font_size * GAP_TO_FONT_SIZE_FALLBACK_FRACTION
425        };
426        if gap > gap_reference {
427            confidence += GAP_SIGNAL_WEIGHT;
428        }
429
430        // Signal 3 — character-class transitions. Only checked when the
431        // previous span ends with a character and the incoming text starts
432        // with one; avoids double-counting with punctuation.
433        if let (Some(prev_last), Some(next_first)) =
434            (prev_text.chars().last(), next_text.chars().next())
435        {
436            let camel = prev_last.is_lowercase() && next_first.is_uppercase();
437            let digit_to_letter = prev_last.is_ascii_digit() && next_first.is_alphabetic();
438            let letter_to_digit = prev_last.is_alphabetic() && next_first.is_ascii_digit();
439            if camel || digit_to_letter || letter_to_digit {
440                confidence += HEURISTIC_SIGNAL_WEIGHT;
441            }
442        }
443
444        confidence >= SPACE_CONSENSUS_THRESHOLD
445    }
446
447    /// Consume the device and return extracted text as a single string.
448    pub fn into_text(self) -> String {
449        let blocks = group_spans_into_blocks(self.spans);
450        let lines: Vec<String> = blocks.iter().map(|b| b.text()).collect();
451        let stitched = stitch_hyphenated_lines(&lines);
452        normalize_text_output(&stitched)
453    }
454
455    /// Consume the device and return text blocks.
456    pub fn into_blocks(self) -> Vec<TextBlock> {
457        group_spans_into_blocks(self.spans)
458    }
459
460    /// Consume the device and return raw spans.
461    #[allow(dead_code)]
462    pub(crate) fn into_spans(self) -> Vec<TextSpan> {
463        self.spans
464    }
465}
466
467impl Device<'_> for TextExtractionDevice {
468    fn set_soft_mask(&mut self, _: Option<SoftMask<'_>>) {}
469    fn set_blend_mode(&mut self, _: BlendMode) {}
470    fn draw_path(&mut self, _: &BezPath, _: Affine, _: &Paint<'_>, _: &PathDrawMode) {}
471    fn push_clip_path(&mut self, _: &ClipPath) {}
472    fn push_transparency_group(&mut self, _: f32, _: Option<SoftMask<'_>>, _: BlendMode) {}
473    fn draw_image(&mut self, _: Image<'_, '_>, _: Affine) {}
474    fn pop_clip_path(&mut self) {}
475    fn pop_transparency_group(&mut self) {}
476
477    fn draw_glyph(
478        &mut self,
479        glyph: &Glyph<'_>,
480        transform: Affine,
481        glyph_transform: Affine,
482        paint: &Paint<'_>,
483        _draw_mode: &GlyphDrawMode,
484    ) {
485        let text = match glyph.as_unicode() {
486            Some(BfString::Char(c)) => c.to_string(),
487            Some(BfString::String(s)) => s,
488            None => return,
489        };
490
491        let composed = transform * glyph_transform;
492        let coeffs = composed.as_coeffs();
493        let x = coeffs[4];
494        let y = coeffs[5];
495        let glyph_scale = (coeffs[0].powi(2) + coeffs[1].powi(2)).sqrt().abs();
496        let font_size = glyph_scale * 1000.0;
497
498        // G2: distinguish real advance (Metric) from estimate (Estimate).
499        let (glyph_width, glyph_ws) = glyph_width_and_source(glyph, font_size);
500        let glyph_end_x = x + glyph_width;
501        let glyph_bound = [x, y, glyph_end_x, y + font_size];
502
503        let style = derive_glyph_style(glyph);
504        let color = paint_to_rgba(paint);
505
506        // ANN[r17/TEX4] Feed the running sample used to derive the adaptive
507        // median character width. Capped to protect against pathological
508        // pages with hundreds of thousands of glyphs.
509        if self.glyph_widths.len() < 4096 {
510            self.glyph_widths.push(glyph_width);
511            if self.glyph_widths.len().is_multiple_of(MEDIAN_REFRESH) {
512                self.refresh_median_char_width();
513            }
514        }
515
516        let same_line = (y - self.last_y).abs() <= font_size.max(BAND_Y_TOLERANCE) * 0.35;
517        let gap = x - self.last_end_x;
518        let adjacent = same_line && gap >= -font_size * 0.25 && gap < font_size * 0.5;
519
520        // G1: only merge into the previous span when font + style + color
521        // match. Otherwise the editor's style toolbar would render the wrong
522        // state for the cursor position.
523        let style_matches = self
524            .spans
525            .last()
526            .map(|last| {
527                last.font_name == style.font_name
528                    && last.is_bold == style.is_bold
529                    && last.is_italic == style.is_italic
530                    && last.color == color
531            })
532            .unwrap_or(false);
533
534        if adjacent && !self.spans.is_empty() && style_matches {
535            // ANN[r17/TEX1] Multi-signal consensus replaces the prior
536            // single-threshold rule (`gap > 0.15 * font_size`). The
537            // consensus evaluates TJ offset, geometric gap, and
538            // character-class transitions; a space is inserted only
539            // when the weighted sum meets SPACE_CONSENSUS_THRESHOLD.
540            // Decision is computed before the mutable borrow of `last`
541            // to keep the borrow checker happy.
542            let want_space = {
543                let last = self.spans.last().expect("checked non-empty");
544                !last.text.ends_with(' ')
545                    && !text.starts_with(' ')
546                    && self.evaluate_space_consensus(gap, font_size, &last.text, &text)
547            };
548            let last = self.spans.last_mut().expect("checked non-empty");
549            if want_space {
550                last.text.push(' ');
551            }
552            last.text.push_str(&text);
553            last.width = last.width.max(glyph_end_x - last.x);
554            last.height = last.height.max(font_size);
555            // G2: append char_bound; downgrade width_source if this glyph is Estimate.
556            last.char_bounds.push(glyph_bound);
557            if glyph_ws == WidthSource::Estimate {
558                last.width_source = WidthSource::Estimate;
559            }
560            self.last_y = y;
561            self.last_end_x = glyph_end_x;
562            // ANN[r17/TEX1] Consume the TJ signal: it only counts for
563            // the one merge it preceded.
564            self.pending_tj_offset = 0.0;
565            return;
566        }
567
568        self.last_y = y;
569        self.last_end_x = glyph_end_x;
570        // ANN[r17/TEX1] Non-adjacent glyph starts a fresh span, so any
571        // pending TJ offset is about within-span word breaks and no longer
572        // meaningful here.
573        self.pending_tj_offset = 0.0;
574
575        self.spans.push(TextSpan {
576            text,
577            x,
578            y,
579            width: glyph_width,
580            height: font_size,
581            font_size,
582            font_name: style.font_name,
583            is_bold: style.is_bold,
584            is_italic: style.is_italic,
585            color,
586            width_source: glyph_ws,
587            char_bounds: vec![glyph_bound],
588        });
589    }
590
591    // ANN[r17/TEX1] Record TJ offsets. Accumulate because a single
592    // inter-substring gap may be expressed as multiple numeric entries
593    // (rare, but legal per PDF §9.4.3). The next draw_glyph consumes
594    // the sum.
595    fn text_adjustment(&mut self, amount: f32) {
596        self.pending_tj_offset += amount;
597    }
598}
599
/// Font-style metadata extracted from a `Glyph` for the G1 text-run
/// extension. The default value carries no name and neither style flag.
#[derive(Debug, Default, Clone)]
struct GlyphStyle {
    /// Font name with any subset prefix removed, if a non-empty one exists.
    font_name: Option<String>,
    /// Whether the font is considered bold.
    is_bold: bool,
    /// Whether the font is considered italic.
    is_italic: bool,
}
607
/// Strip a font-subset tag (e.g. `AAAAAA+Helvetica` → `Helvetica`).
///
/// A subset tag is exactly six ASCII uppercase letters followed by `+`.
/// Only such a tag is removed; any other text in front of the first `+`
/// (wrong length, lowercase letters, digits, …) is regarded as part of the
/// real font name and the input is returned unchanged, so a name such as
/// `Symbol+Extra` is not truncated to `Extra`.
fn strip_subset_prefix(name: &str) -> &str {
    match name.split_once('+') {
        Some((tag, rest)) if tag.len() == 6 && tag.bytes().all(|b| b.is_ascii_uppercase()) => {
            rest
        }
        _ => name,
    }
}
615
/// Guess `(bold, italic)` from a PostScript font name, for use when no
/// descriptor flags are reachable. The check is a case-insensitive (ASCII)
/// substring search for a fixed list of markers.
///
/// NOTE(review): the original comment states these rules match those of
/// `pdf-interpret`'s `FallbackFontQuery::new`; that cannot be confirmed
/// from this file.
fn name_style_hints(name: &str) -> (bool, bool) {
    const BOLD_MARKERS: [&str; 5] = ["bold", "demi", "semibold", "heavy", "black"];
    const ITALIC_MARKERS: [&str; 3] = ["italic", "oblique", "slant"];

    let folded = name.to_ascii_lowercase();
    let mentions_any = |markers: &[&str]| markers.iter().any(|marker| folded.contains(*marker));

    (mentions_any(&BOLD_MARKERS), mentions_any(&ITALIC_MARKERS))
}
629
630fn derive_glyph_style(glyph: &Glyph<'_>) -> GlyphStyle {
631    match glyph {
632        Glyph::Outline(outline) => {
633            if let Some(data) = outline.font_data() {
634                let raw = data.postscript_name.as_deref().unwrap_or("");
635                let name = strip_subset_prefix(raw).to_string();
636                let weight_bold = data.weight.is_some_and(|w| w >= 700);
637                let (name_bold, name_italic) = name_style_hints(&name);
638                GlyphStyle {
639                    font_name: if name.is_empty() { None } else { Some(name) },
640                    is_bold: weight_bold || name_bold,
641                    is_italic: data.is_italic || name_italic,
642                }
643            } else {
644                // Type1 / non-embedded font — descriptor not surfaced
645                // via font_data(). Fall back to the name-only
646                // accessor which works for standard-14 fallbacks.
647                let raw = outline.postscript_name().unwrap_or_default();
648                let name = strip_subset_prefix(&raw).to_string();
649                let (name_bold, name_italic) = name_style_hints(&name);
650                GlyphStyle {
651                    font_name: if name.is_empty() { None } else { Some(name) },
652                    is_bold: name_bold,
653                    is_italic: name_italic,
654                }
655            }
656        }
657        Glyph::Type3(_) => GlyphStyle::default(),
658    }
659}
660
661fn paint_to_rgba(paint: &Paint<'_>) -> Option<[u8; 4]> {
662    match paint {
663        Paint::Color(c) => Some(c.to_rgba().to_rgba8()),
664        Paint::Pattern(_) => None,
665    }
666}
667
668/// Returns `(advance_in_user_space, WidthSource)` for a glyph.
669///
670/// Uses the real advance from `OutlineGlyph::advance_width()` when available
671/// (returns `WidthSource::Metric`); falls back to 50% em otherwise
672/// (`WidthSource::Estimate`). The result is clamped to at least 25% em so
673/// invisible-glyph outliers do not collapse spans.
674fn glyph_width_and_source(glyph: &Glyph<'_>, font_size: f64) -> (f64, WidthSource) {
675    match glyph {
676        Glyph::Outline(outline) => {
677            if let Some(w) = outline.advance_width() {
678                let advance = (w as f64 / 1000.0 * font_size).max(font_size * 0.25);
679                (advance, WidthSource::Metric)
680            } else {
681                (font_size * 0.5, WidthSource::Estimate)
682            }
683        }
684        Glyph::Type3(_) => (font_size * 0.5, WidthSource::Estimate),
685    }
686}
687
688/// Collapse fake-bold / overprint duplicates inside one band.
689///
690/// Real-word corpus failures such as 0105.pdf draw the same text several times
691/// with sub-point x drift to simulate heavier weight. Text extraction should
692/// keep the most informative span once rather than concatenate every overprint.
693fn collapse_overprinted_spans(spans: &mut Vec<TextSpan>) {
694    if spans.len() < 2 {
695        return;
696    }
697
698    let mut deduped: Vec<TextSpan> = Vec::with_capacity(spans.len());
699    for span in spans.drain(..) {
700        if let Some(last) = deduped.last_mut() {
701            if spans_are_overprint_duplicates(last, &span) {
702                let choose_incoming = span.text.chars().count() > last.text.chars().count()
703                    || (span.text.chars().count() == last.text.chars().count()
704                        && span.width > last.width);
705                let preferred_text = if choose_incoming {
706                    span.text.clone()
707                } else {
708                    last.text.clone()
709                };
710                let left = last.x.min(span.x);
711                let right = last.right().max(span.right());
712                last.x = left;
713                last.y = (last.y + span.y) * 0.5;
714                last.width = (right - left).max(last.width).max(span.width);
715                last.height = last.height.max(span.height);
716                last.font_size = last.font_size.max(span.font_size);
717                last.text = preferred_text;
718                continue;
719            }
720        }
721
722        deduped.push(span);
723    }
724
725    *spans = deduped;
726}
727
728fn spans_are_overprint_duplicates(lhs: &TextSpan, rhs: &TextSpan) -> bool {
729    let lhs_text = lhs.text.trim();
730    let rhs_text = rhs.text.trim();
731    if lhs_text.is_empty() || rhs_text.is_empty() {
732        return false;
733    }
734
735    let same_baseline = (lhs.y - rhs.y).abs() <= lhs.font_size.max(rhs.font_size) * 0.12;
736    if !same_baseline {
737        return false;
738    }
739
740    let lhs_left = lhs.x;
741    let lhs_right = lhs.right();
742    let rhs_left = rhs.x;
743    let rhs_right = rhs.right();
744    let overlap = (lhs_right.min(rhs_right) - lhs_left.max(rhs_left)).max(0.0);
745    let min_width = (lhs_right - lhs_left).min(rhs_right - rhs_left).max(1.0);
746    let heavily_overlaps = overlap / min_width >= 0.85;
747    if !heavily_overlaps {
748        return false;
749    }
750
751    lhs_text == rhs_text || lhs_text.starts_with(rhs_text) || rhs_text.starts_with(lhs_text)
752}
753
/// Detect a whole word that `prev` ends with and `curr` starts with, and
/// return what remains of `curr` after that word.
///
/// Trailing whitespace of `prev` and leading whitespace of `curr` are
/// ignored. The shared run must be at least four characters long, consist
/// only of alphanumeric characters, and be delimited as a word on both
/// sides (start of `prev` or a non-alphanumeric character before it; end
/// of `curr` or a non-alphanumeric character after it). The longest such
/// run wins. Returns `None` when no run qualifies.
fn trim_overlapping_word_prefix(prev: &str, curr: &str) -> Option<String> {
    let tail: Vec<char> = prev.trim_end().chars().collect();
    let head: Vec<char> = curr.trim_start().chars().collect();
    let longest = tail.len().min(head.len());

    (4..=longest)
        .rev()
        .find(|&len| {
            let start = tail.len() - len;
            let shared = &head[..len];

            tail[start..] == head[..len]
                && shared.iter().all(|ch| ch.is_alphanumeric())
                && (start == 0 || !tail[start - 1].is_alphanumeric())
                && (len == head.len() || !head[len].is_alphanumeric())
        })
        .map(|len| head[len..].iter().collect())
}
780
781/// Compute an adaptive column gap threshold from a set of bands.
782///
783/// Collects all positive inter-span gaps within each band, computes the
784/// median, and returns `COLUMN_GAP_MEDIAN_MULTIPLIER × median`, clamped to
785/// `[COLUMN_GAP_THRESHOLD_MIN, COLUMN_GAP_THRESHOLD_MAX]`.  Falls back to
786/// `COLUMN_GAP_THRESHOLD_FALLBACK` when there are no measurable gaps.
787fn compute_adaptive_column_gap(bands: &[TextBand]) -> f64 {
788    let mut all_gaps: Vec<f64> = Vec::new();
789
790    for band in bands {
791        if band.spans.len() < 2 {
792            continue;
793        }
794        let mut sorted = band.spans.clone();
795        sorted.sort_by(|a, b| a.x.partial_cmp(&b.x).unwrap_or(Ordering::Equal));
796        let mut prev_right = sorted[0].right();
797        for span in sorted.iter().skip(1) {
798            let gap = span.x - prev_right;
799            if gap > 0.0 {
800                all_gaps.push(gap);
801            }
802            prev_right = prev_right.max(span.right());
803        }
804    }
805
806    if all_gaps.is_empty() {
807        return COLUMN_GAP_THRESHOLD_FALLBACK;
808    }
809
810    all_gaps.sort_by(|a, b| a.partial_cmp(b).unwrap_or(Ordering::Equal));
811
812    let min_gap = all_gaps[0];
813
814    // When all inter-span gaps are already large (> MIN threshold), they are
815    // likely all column gaps — the draw_glyph merger absorbed word-level
816    // spaces into span text.  Use a fraction of the smallest gap so that
817    // ALL column gaps exceed the threshold.
818    if min_gap > COLUMN_GAP_THRESHOLD_MIN {
819        return (min_gap * 0.75).clamp(COLUMN_GAP_THRESHOLD_MIN, COLUMN_GAP_THRESHOLD_MAX);
820    }
821
822    // Look for a natural break: the largest relative jump between consecutive
823    // sorted gaps separates word-level gaps from column gaps.
824    let mut best_break_threshold = 0.0f64;
825    let mut best_ratio = 1.5f64; // require at least 1.5× jump
826    for pair in all_gaps.windows(2) {
827        if pair[0] > 0.5 {
828            let ratio = pair[1] / pair[0];
829            if ratio > best_ratio {
830                best_ratio = ratio;
831                best_break_threshold = (pair[0] + pair[1]) * 0.5;
832            }
833        }
834    }
835
836    if best_break_threshold > 0.0 {
837        return best_break_threshold.clamp(COLUMN_GAP_THRESHOLD_MIN, COLUMN_GAP_THRESHOLD_MAX);
838    }
839
840    // Fallback: median × multiplier.
841    let mid = all_gaps.len() / 2;
842    let median = if all_gaps.len().is_multiple_of(2) {
843        (all_gaps[mid - 1] + all_gaps[mid]) * 0.5
844    } else {
845        all_gaps[mid]
846    };
847
848    (median * COLUMN_GAP_MEDIAN_MULTIPLIER)
849        .clamp(COLUMN_GAP_THRESHOLD_MIN, COLUMN_GAP_THRESHOLD_MAX)
850}
851
/// Per-page adaptive parameters derived from the span set before any
/// grouping happens. Centralising these here (TEX4) means the rest of
/// the pipeline — band grouping, XY-Cut cuts, in-block space insertion
/// — all speak the same typographic baseline for this specific page,
/// rather than each helper reaching for an independent fixed constant.
///
/// Built by `PageStats::from_spans`; an empty span set yields the
/// defaults 12.0 / 6.0 / 0.0.
#[derive(Debug, Clone, Copy)]
struct PageStats {
    /// Median font size across all spans (pt). For an even number of
    /// spans this is the upper of the two middle values.
    median_font_size: f64,
    /// Median measured character width (pt), i.e. span width divided by
    /// character count. Falls back to 0.5 × median_font_size when no span
    /// has both text and a positive width.
    /// Currently populated for diagnostics / future tuning; allow dead_code
    /// under `-D warnings` until a reader is added.
    #[allow(dead_code)]
    median_char_width: f64,
    /// Tight line-to-line spacing (25th percentile of pairwise band
    /// gaps), representing the body-text leading on this page. The
    /// quartile is used instead of the median so large paragraph /
    /// zone gaps don't inflate the baseline. Zero if the page has
    /// only one band. NOTE: despite the field name this is NOT the
    /// median.
    median_line_spacing: f64,
}
876
877impl PageStats {
878    fn from_spans(spans: &[TextSpan]) -> Self {
879        if spans.is_empty() {
880            return Self {
881                median_font_size: 12.0,
882                median_char_width: 6.0,
883                median_line_spacing: 0.0,
884            };
885        }
886
887        // Median font size.
888        let mut sizes: Vec<f64> = spans.iter().map(|s| s.font_size).collect();
889        sizes.sort_by(|a, b| a.partial_cmp(b).unwrap_or(Ordering::Equal));
890        let median_font_size = sizes[sizes.len() / 2];
891
892        // Median char width — measured width / char count, per span.
893        let mut char_widths: Vec<f64> = spans
894            .iter()
895            .filter_map(|s| {
896                let chars = s.text.chars().count();
897                if chars > 0 && s.width > 0.0 {
898                    Some(s.width / chars as f64)
899                } else {
900                    None
901                }
902            })
903            .collect();
904        let median_char_width = if char_widths.is_empty() {
905            median_font_size * 0.5
906        } else {
907            char_widths.sort_by(|a, b| a.partial_cmp(b).unwrap_or(Ordering::Equal));
908            char_widths[char_widths.len() / 2]
909        };
910
911        // Median line spacing — pairwise gaps between consecutive band
912        // y-values.
913        let band_tolerance = (median_font_size * BAND_Y_FRACTION).max(BAND_Y_TOLERANCE);
914        let mut ys: Vec<f64> = spans.iter().map(|s| s.y).collect();
915        ys.sort_by(|a, b| b.partial_cmp(a).unwrap_or(Ordering::Equal));
916        let mut band_ys: Vec<f64> = Vec::new();
917        for y in ys {
918            if band_ys
919                .last()
920                .map(|prev: &f64| (prev - y).abs() > band_tolerance)
921                .unwrap_or(true)
922            {
923                band_ys.push(y);
924            }
925        }
926        // ANN[r17/TEX4] "Line spacing" here means the TIGHT line-to-line
927        // gap inside a text block — not the median of all gaps. Using
928        // the median drags the estimate up when the page has
929        // paragraph / zone breaks (which are the very gaps the
930        // paragraph-break threshold is supposed to EXCEED). The 25th
931        // percentile is the smallest gap that still shows up in more
932        // than one place on the page; it captures body-text leading
933        // robustly even when large zone gaps dominate.
934        let median_line_spacing = if band_ys.len() < 2 {
935            0.0
936        } else {
937            let mut spacings: Vec<f64> = band_ys
938                .windows(2)
939                .map(|pair| (pair[0] - pair[1]).abs())
940                .collect();
941            spacings.sort_by(|a, b| a.partial_cmp(b).unwrap_or(Ordering::Equal));
942            let q1_index = spacings.len() / 4;
943            spacings[q1_index]
944        };
945
946        Self {
947            median_font_size,
948            median_char_width,
949            median_line_spacing,
950        }
951    }
952}
953
/// ANN[r17/TEX2] Maximum recursion depth for XY-Cut. Any real page layout
/// is decomposable in well under 10 alternating cuts; the cap guards
/// against pathological inputs where the cut predicate keeps triggering
/// due to floating-point drift. Once the cap is reached
/// `xy_cut_recursive` stops cutting and hands the region to
/// `band_based_blocks`.
const XY_CUT_MAX_DEPTH: usize = 12;
/// Fraction of a region's width that feeds the vertical-gap safety rail
/// in `try_vertical_cut`. The rail is the largest of the median font
/// size, this fraction × region width and `XY_CUT_VERTICAL_GAP_FLOOR`;
/// the adaptive column-gap threshold is capped at that rail.
const XY_CUT_VERTICAL_GAP_REGION_FRACTION: f64 = 0.04;
/// Floor (in pt) for vertical gap regardless of region width. Matches
/// the previous `COLUMN_GAP_THRESHOLD_MIN` and keeps XY-Cut conservative
/// on narrow regions (sidebars, tall columns).
const XY_CUT_VERTICAL_GAP_FLOOR: f64 = 10.0;
/// Multiplier applied to median font size to produce the horizontal-gap
/// threshold. Only used by `try_horizontal_cut` as a fallback when no
/// line spacing was measured (`median_line_spacing` is zero); otherwise
/// the threshold is `PARAGRAPH_BREAK_LINE_SPACING_MULTIPLIER` × line
/// spacing.
const XY_CUT_HORIZONTAL_GAP_FONT_MULTIPLIER: f64 = 1.8;
/// Minimum number of spans a column must contain before it is eligible
/// for acceptance — one-span "columns" are almost always sidebar noise
/// or table-cell fragments.
const XY_CUT_MIN_SPANS_PER_COLUMN: usize = 2;
/// Average characters per band a column must have before it's accepted
/// as dense prose (vs. a short-cell table column).
const XY_CUT_MIN_CHARS_PER_BAND: f64 = 8.0;
976
977/// ANN[r17/TEX2][r17/TEX4] Top-level grouping uses recursive XY-Cut
978/// with a density guard. Per-page stats are computed once up front so
979/// every decision downstream speaks the same typographic baseline.
980fn group_spans_into_blocks(spans: Vec<TextSpan>) -> Vec<TextBlock> {
981    if spans.is_empty() {
982        return Vec::new();
983    }
984    let stats = PageStats::from_spans(&spans);
985    xy_cut_recursive(spans, 0, &stats)
986}
987
988fn xy_cut_recursive(spans: Vec<TextSpan>, depth: usize, stats: &PageStats) -> Vec<TextBlock> {
989    if spans.is_empty() {
990        return Vec::new();
991    }
992    if depth >= XY_CUT_MAX_DEPTH {
993        return band_based_blocks(spans, stats);
994    }
995
996    // ANN[r17/TEX2] Pick whichever direction has the largest qualifying
997    // gap. Always cutting vertically first breaks layouts where a
998    // footer sits in the mid-x range — it would attach to the left
999    // column instead of being recognized as a page-level zone. The
1000    // "largest gap wins" rule is the standard XY-Cut tie-breaker used
1001    // by academic OCR literature and matches pdf_oxide.
1002    let vcut = try_vertical_cut(&spans, stats);
1003    let hcut = try_horizontal_cut(&spans, stats);
1004
1005    let (chosen, _) = match (vcut, hcut) {
1006        (Some((v_groups, v_gap)), Some((h_groups, h_gap))) => {
1007            if v_gap >= h_gap {
1008                (Some(v_groups), v_gap)
1009            } else {
1010                (Some(h_groups), h_gap)
1011            }
1012        }
1013        (Some((v_groups, v_gap)), None) => (Some(v_groups), v_gap),
1014        (None, Some((h_groups, h_gap))) => (Some(h_groups), h_gap),
1015        (None, None) => (None, 0.0),
1016    };
1017
1018    if let Some(groups) = chosen {
1019        let mut out = Vec::new();
1020        for group in groups {
1021            out.extend(xy_cut_recursive(group, depth + 1, stats));
1022        }
1023        return out;
1024    }
1025
1026    band_based_blocks(spans, stats)
1027}
1028
/// Leaf of the XY-Cut recursion: turn a region that will not be cut any
/// further into blocks. Reached when no cut qualified (none found, or the
/// guards refused it) or when the recursion depth cap was hit.
///
/// Note that this is NOT a plain row-major flattening: the region is
/// delegated to the legacy band/gutter detector, which may still emit
/// column-ordered blocks.
fn band_based_blocks(spans: Vec<TextSpan>, stats: &PageStats) -> Vec<TextBlock> {
    // XY-Cut can miss recurring gutters when a small number of bands span the
    // full page width (e.g. a running header above a 3-column body). In that
    // case, fall back to the older band/gutter detector inside the leaf region
    // instead of flattening everything row-major.
    group_spans_into_blocks_legacy_with_stats(spans, stats)
}
1039
1040/// Median font-size helper. Currently unreferenced after `PageStats` took over
1041/// the typography baseline computation; kept available for future tuning paths.
1042#[allow(dead_code)]
1043fn median_font_size(spans: &[TextSpan]) -> f64 {
1044    if spans.is_empty() {
1045        return 12.0;
1046    }
1047    let mut sizes: Vec<f64> = spans.iter().map(|s| s.font_size).collect();
1048    sizes.sort_by(|a, b| a.partial_cmp(b).unwrap_or(Ordering::Equal));
1049    sizes[sizes.len() / 2]
1050}
1051
1052/// Attempt a vertical (column) cut. Returns the span groups plus the
1053/// gap size (in pt) if a suitable gutter is found AND the density +
1054/// alignment guards accept.
1055///
1056/// ANN[r17/TEX2] Three guards together avoid false-positive columns:
1057///   1. `min_gap` is the MAX of (median_font, 4% of region width, 10pt)
1058///      — deliberately lower than `median_font * 2` so narrow-gutter
1059///      academic papers (12pt gutters, common in print) are still
1060///      detected.
1061///   2. `columns_are_dense` rejects column splits where either side
1062///      has <2 spans or <8 chars/band — catches table cells.
1063///   3. `columns_are_band_aligned` rejects cuts where any band would
1064///      end up on only one side of the cut while being wider than
1065///      ~70% of that side's column width — catches full-width
1066///      paragraphs (Intro / Outro) that accidentally sit in the
1067///      left-column x-range.
1068fn try_vertical_cut(spans: &[TextSpan], stats: &PageStats) -> Option<(Vec<Vec<TextSpan>>, f64)> {
1069    if spans.len() < 2 * XY_CUT_MIN_SPANS_PER_COLUMN {
1070        return None;
1071    }
1072
1073    let region_left = spans.iter().map(|s| s.x).fold(f64::INFINITY, f64::min);
1074    let region_right = spans
1075        .iter()
1076        .map(TextSpan::right)
1077        .fold(f64::NEG_INFINITY, f64::max);
1078    let region_width = region_right - region_left;
1079    if region_width <= 0.0 {
1080        return None;
1081    }
1082
1083    // ANN[r17/TEX2][r17/TEX4] Threshold uses the ADAPTIVE median-word-gap
1084    // from the bands rather than a flat font-size multiple. Narrow-gutter
1085    // academic layouts have 12pt gutters next to 4pt word spaces — the
1086    // adaptive threshold scales with the actual typography used on this
1087    // page. Clamped to `XY_CUT_VERTICAL_GAP_FLOOR` to avoid firing on
1088    // ordinary inter-word spaces when character advance data is noisy.
1089    // median_font and the width fraction act only as safety rails for
1090    // pathological inputs.
1091    let bands = group_spans_into_bands_with_stats(spans.to_vec(), stats);
1092    let adaptive = compute_adaptive_column_gap(&bands);
1093    let floor = stats
1094        .median_font_size
1095        .max(region_width * XY_CUT_VERTICAL_GAP_REGION_FRACTION)
1096        .max(XY_CUT_VERTICAL_GAP_FLOOR);
1097    let min_gap = adaptive.min(floor).max(XY_CUT_VERTICAL_GAP_FLOOR);
1098
1099    // Intervals [x_left, x_right] of every span; we look for an x value
1100    // that is free of ALL intervals (full-height gap).
1101    let mut intervals: Vec<(f64, f64)> = spans
1102        .iter()
1103        .map(|s| (s.x, s.right().max(s.x + 0.001)))
1104        .collect();
1105    intervals.sort_by(|a, b| a.0.partial_cmp(&b.0).unwrap_or(Ordering::Equal));
1106
1107    let mut cursor = intervals[0].1;
1108    let mut best_gap: Option<(f64, f64)> = None; // (gap_size, cut_x)
1109    for (left, right) in intervals.iter().skip(1) {
1110        if *left > cursor {
1111            let gap = *left - cursor;
1112            if gap >= min_gap {
1113                match best_gap {
1114                    Some((best, _)) if best >= gap => {}
1115                    _ => {
1116                        let cut_x = (cursor + *left) * 0.5;
1117                        best_gap = Some((gap, cut_x));
1118                    }
1119                }
1120            }
1121        }
1122        cursor = cursor.max(*right);
1123    }
1124
1125    let (gap_size, cut_x) = best_gap?;
1126
1127    // Split spans around the cut. A span whose midpoint is < cut_x
1128    // belongs to the left group.
1129    let mut left_group = Vec::new();
1130    let mut right_group = Vec::new();
1131    for span in spans {
1132        let midpoint = span.x + (span.right() - span.x) * 0.5;
1133        if midpoint < cut_x {
1134            left_group.push(span.clone());
1135        } else {
1136            right_group.push(span.clone());
1137        }
1138    }
1139
1140    if !columns_are_dense(&left_group, &right_group, stats) {
1141        return None;
1142    }
1143    if !columns_are_band_aligned(spans, cut_x, region_left, region_right, stats) {
1144        return None;
1145    }
1146
1147    Some((vec![left_group, right_group], gap_size))
1148}
1149
1150/// ANN[r17/TEX2] Reject a vertical cut when any band sits on only one
1151/// side of the cut AND occupies more than ~70% of that side's column
1152/// width. Such bands are almost certainly full-width paragraphs that
1153/// happened to align with the left margin of one column, and forcing
1154/// them into that column re-orders them relative to text that follows.
1155fn columns_are_band_aligned(
1156    spans: &[TextSpan],
1157    cut_x: f64,
1158    region_left: f64,
1159    region_right: f64,
1160    stats: &PageStats,
1161) -> bool {
1162    let left_width = (cut_x - region_left).max(1.0);
1163    let right_width = (region_right - cut_x).max(1.0);
1164
1165    // Threshold chosen empirically: paragraph bodies in columnar
1166    // layouts usually fill ~60-70% of their column; anything wider
1167    // than 0.7× is a page-level element masquerading as column
1168    // content.
1169    const MAX_SINGLE_SIDE_FRACTION: f64 = 0.70;
1170
1171    let bands = group_spans_into_bands_with_stats(spans.to_vec(), stats);
1172    for band in &bands {
1173        let mut has_left = false;
1174        let mut has_right = false;
1175        for span in &band.spans {
1176            let midpoint = span.x + (span.right() - span.x) * 0.5;
1177            if midpoint < cut_x {
1178                has_left = true;
1179            } else {
1180                has_right = true;
1181            }
1182        }
1183        if has_left && has_right {
1184            continue; // Band straddles columns → fine.
1185        }
1186        let band_width = band.width();
1187        if has_left && band_width > left_width * MAX_SINGLE_SIDE_FRACTION {
1188            return false;
1189        }
1190        if has_right && band_width > right_width * MAX_SINGLE_SIDE_FRACTION {
1191            return false;
1192        }
1193    }
1194    true
1195}
1196
1197/// Density guard — reject column splits that look like tables (few,
1198/// short spans per column). A column is "dense" when it has at least
1199/// MIN_SPANS_PER_COLUMN spans and the average character count per band
1200/// exceeds MIN_CHARS_PER_BAND.
1201fn columns_are_dense(left: &[TextSpan], right: &[TextSpan], stats: &PageStats) -> bool {
1202    for col in [left, right] {
1203        if col.len() < XY_CUT_MIN_SPANS_PER_COLUMN {
1204            return false;
1205        }
1206        let bands = group_spans_into_bands_with_stats(col.to_vec(), stats);
1207        if bands.is_empty() {
1208            return false;
1209        }
1210        let total_chars: usize = col.iter().map(|s| s.text.chars().count()).sum();
1211        let chars_per_band = total_chars as f64 / bands.len() as f64;
1212        if chars_per_band < XY_CUT_MIN_CHARS_PER_BAND {
1213            return false;
1214        }
1215    }
1216    true
1217}
1218
1219/// Attempt a horizontal (zone / paragraph) cut. Unlike vertical cuts
1220/// this does NOT need a density guard — splitting top-from-bottom
1221/// cannot re-order content.
1222fn try_horizontal_cut(spans: &[TextSpan], stats: &PageStats) -> Option<(Vec<Vec<TextSpan>>, f64)> {
1223    if spans.len() < 2 {
1224        return None;
1225    }
1226    // Sort by descending y (PDF y grows upward).
1227    let mut sorted = spans.to_vec();
1228    sorted.sort_by(|a, b| {
1229        b.y.partial_cmp(&a.y)
1230            .unwrap_or(Ordering::Equal)
1231            .then_with(|| a.x.partial_cmp(&b.x).unwrap_or(Ordering::Equal))
1232    });
1233
1234    // ANN[r17/TEX4] Paragraph / zone cuts scale with MEDIAN LINE
1235    // SPACING when available — this is the typographically correct
1236    // baseline (paragraph break ≈ 1.8 × line-spacing). When the page
1237    // has only one band, or stats haven't observed spacing yet, fall
1238    // back to the font-size multiple the legacy path used.
1239    let min_gap = if stats.median_line_spacing > 0.0 {
1240        stats.median_line_spacing * PARAGRAPH_BREAK_LINE_SPACING_MULTIPLIER
1241    } else {
1242        stats.median_font_size * XY_CUT_HORIZONTAL_GAP_FONT_MULTIPLIER
1243    };
1244
1245    // Look for the largest gap between consecutive span y-values.
1246    let mut best: Option<(f64, f64)> = None; // (gap_size, cut_y)
1247    let tolerance = stats.median_font_size * BAND_Y_FRACTION;
1248    let mut band_bottom = sorted[0].y;
1249
1250    for span in sorted.iter().skip(1) {
1251        if (band_bottom - span.y).abs() <= tolerance {
1252            band_bottom = band_bottom.min(span.y);
1253            continue;
1254        }
1255        let gap = band_bottom - span.y;
1256        if gap >= min_gap {
1257            let cut_y = (band_bottom + span.y) * 0.5;
1258            match best {
1259                Some((best_gap, _)) if best_gap >= gap => {}
1260                _ => best = Some((gap, cut_y)),
1261            }
1262        }
1263        band_bottom = span.y;
1264    }
1265
1266    let (gap_size, cut_y) = best?;
1267
1268    let mut top_group = Vec::new();
1269    let mut bottom_group = Vec::new();
1270    for span in spans {
1271        if span.y > cut_y {
1272            top_group.push(span.clone());
1273        } else {
1274            bottom_group.push(span.clone());
1275        }
1276    }
1277    if top_group.is_empty() || bottom_group.is_empty() {
1278        return None;
1279    }
1280    Some((vec![top_group, bottom_group], gap_size))
1281}
1282
1283/// Legacy band+column-detection path, kept for reference and as the
1284/// fallback inside `band_based_blocks` test coverage. Not currently
1285/// used — XY-Cut supersedes it.
1286#[allow(dead_code)]
1287fn group_spans_into_blocks_legacy(spans: Vec<TextSpan>) -> Vec<TextBlock> {
1288    let bands = group_spans_into_bands(spans);
1289    group_spans_into_blocks_legacy_from_bands(bands)
1290}
1291
1292fn group_spans_into_blocks_legacy_with_stats(
1293    spans: Vec<TextSpan>,
1294    stats: &PageStats,
1295) -> Vec<TextBlock> {
1296    let bands = group_spans_into_bands_with_stats(spans, stats);
1297    group_spans_into_blocks_legacy_from_bands(bands)
1298}
1299
/// Legacy band/gutter detector: walk `bands` in order and emit blocks,
/// switching to column-major output wherever a run of consecutive bands
/// repeatedly exposes the same gap positions.
///
/// Bands without any gap above the adaptive threshold become plain row
/// blocks. A band with gaps seeds a candidate column region, which is
/// extended downwards while the following bands either show matching gaps
/// or fit inside a single column. The region is emitted column by column
/// only if `region_is_columnar` accepts it; otherwise just the seed band
/// is emitted as a row and scanning resumes at the next band.
fn group_spans_into_blocks_legacy_from_bands(bands: Vec<TextBand>) -> Vec<TextBlock> {
    if bands.is_empty() {
        return Vec::new();
    }

    // One threshold for the whole band set, shared by every gap query below.
    let column_gap_threshold = compute_adaptive_column_gap(&bands);

    let mut blocks = Vec::new();
    let mut idx = 0;

    while idx < bands.len() {
        // NOTE(review): `gap_midpoints` is defined elsewhere — presumably the
        // x midpoints of the gaps in this band that exceed the threshold.
        let gap_midpoints = bands[idx].gap_midpoints(column_gap_threshold);
        if gap_midpoints.is_empty() {
            // No gutter candidate in this band → emit it as a single row.
            blocks.push(bands[idx].row_block());
            idx += 1;
            continue;
        }

        // Seed a candidate column region with this band. `boundaries` holds
        // the running-average gutter positions, `gapped_band_count` the
        // number of bands that contributed gaps to that average.
        let mut boundaries = gap_midpoints.clone();
        let mut band_indices = vec![idx];
        let mut gapped_band_count = 1usize;
        let mut region_left = bands[idx].left();
        let mut region_right = bands[idx].right();
        let mut next_idx = idx + 1;

        while next_idx < bands.len() {
            let next_band = &bands[next_idx];
            let next_gap_midpoints = next_band.gap_midpoints(column_gap_threshold);
            if next_gap_midpoints.is_empty() {
                // A gap-free band may still join the region when it fits in
                // one column (e.g. a short last line); it does not update
                // the boundaries or the region extent.
                if next_band
                    .fits_single_column(&boundaries, region_left, region_right)
                    .is_some()
                {
                    band_indices.push(next_idx);
                    next_idx += 1;
                    continue;
                }
                break;
            }

            // Gaps at different positions (or a different number of gaps)
            // end the region.
            if !boundaries_match(&boundaries, &next_gap_midpoints, column_gap_threshold) {
                break;
            }

            // Fold the new gap positions into the running averages before
            // bumping the count the average is weighted by.
            update_boundaries(&mut boundaries, &next_gap_midpoints, gapped_band_count);
            gapped_band_count += 1;
            band_indices.push(next_idx);
            region_left = region_left.min(next_band.left());
            region_right = region_right.max(next_band.right());
            next_idx += 1;
        }

        if region_is_columnar(&bands, &band_indices, &boundaries, gapped_band_count) {
            // Accepted: emit the whole region column-major and skip past it.
            append_column_region_blocks(&bands, &band_indices, &boundaries, &mut blocks);
            idx = next_idx;
        } else {
            // Rejected: only the seed band is consumed, so the bands that
            // followed it are re-examined as potential seeds themselves.
            blocks.push(bands[idx].row_block());
            idx += 1;
        }
    }

    blocks
}
1363
1364/// Legacy wrapper used by call sites that haven't been handed PageStats.
1365/// It derives stats locally. Prefer `group_spans_into_bands_with_stats`
1366/// inside the XY-Cut pipeline to avoid recomputing the stats per call.
1367fn group_spans_into_bands(spans: Vec<TextSpan>) -> Vec<TextBand> {
1368    let stats = PageStats::from_spans(&spans);
1369    group_spans_into_bands_with_stats(spans, &stats)
1370}
1371
1372fn group_spans_into_bands_with_stats(mut spans: Vec<TextSpan>, stats: &PageStats) -> Vec<TextBand> {
1373    if spans.is_empty() {
1374        return Vec::new();
1375    }
1376
1377    spans.sort_by(|a, b| {
1378        b.y.partial_cmp(&a.y)
1379            .unwrap_or(Ordering::Equal)
1380            .then_with(|| a.x.partial_cmp(&b.x).unwrap_or(Ordering::Equal))
1381    });
1382
1383    // ANN[r17/TEX4] Band tolerance scales with this page's median font
1384    // size rather than a fixed 5pt floor. Single-page spreads with
1385    // huge display fonts (24pt+) previously merged unrelated lines; a
1386    // fractional threshold keeps that from happening without hurting
1387    // body-text pages.
1388    let page_tolerance = (stats.median_font_size * BAND_Y_FRACTION).max(BAND_Y_TOLERANCE);
1389
1390    let mut bands: Vec<TextBand> = Vec::new();
1391
1392    for span in spans {
1393        let tolerance = (span.height * BAND_Y_FRACTION)
1394            .max(page_tolerance)
1395            .max(BAND_Y_TOLERANCE);
1396        if let Some(band) = bands
1397            .iter_mut()
1398            .find(|band| (band.y - span.y).abs() <= tolerance)
1399        {
1400            let span_count = band.spans.len() as f64;
1401            band.y = (band.y * span_count + span.y) / (span_count + 1.0);
1402            band.spans.push(span);
1403        } else {
1404            bands.push(TextBand::new(span));
1405        }
1406    }
1407
1408    for band in &mut bands {
1409        band.sort_spans();
1410    }
1411
1412    bands.sort_by(|a, b| b.y.partial_cmp(&a.y).unwrap_or(Ordering::Equal));
1413    bands
1414}
1415
1416fn boundaries_match(boundaries: &[f64], gap_midpoints: &[f64], column_gap_threshold: f64) -> bool {
1417    let tolerance = (column_gap_threshold * 1.5).clamp(COLUMN_GAP_MATCH_TOLERANCE, 60.0);
1418    boundaries.len() == gap_midpoints.len()
1419        && boundaries
1420            .iter()
1421            .zip(gap_midpoints)
1422            .all(|(lhs, rhs)| (lhs - rhs).abs() <= tolerance)
1423}
1424
/// Fold one more band's gap midpoints into the running-average column
/// boundaries, in place.
///
/// `seen_gapped_bands` is the number of bands already averaged into
/// `boundaries`; each boundary becomes
/// `(boundary × seen + midpoint) / (seen + 1)`. When the slices differ in
/// length only the leading pairs are updated.
fn update_boundaries(boundaries: &mut [f64], gap_midpoints: &[f64], seen_gapped_bands: usize) {
    let seen = seen_gapped_bands as f64;
    boundaries
        .iter_mut()
        .zip(gap_midpoints)
        .for_each(|(average, &sample)| *average = (*average * seen + sample) / (seen + 1.0));
}
1431
1432fn region_is_columnar(
1433    bands: &[TextBand],
1434    band_indices: &[usize],
1435    boundaries: &[f64],
1436    gapped_band_count: usize,
1437) -> bool {
1438    if boundaries.is_empty()
1439        || gapped_band_count < MIN_COLUMN_GAPPED_BANDS
1440        || band_indices.is_empty()
1441        || (gapped_band_count as f64 / band_indices.len() as f64) < MIN_COLUMN_GAP_SUPPORT
1442    {
1443        return false;
1444    }
1445
1446    let mut non_empty_slices = 0usize;
1447    let mut dense_slices = 0usize;
1448    let mut slices_per_column = vec![0usize; boundaries.len() + 1];
1449
1450    for &band_idx in band_indices {
1451        let slices = bands[band_idx].split_by_boundaries(boundaries);
1452        for (column_idx, slice) in slices.iter().enumerate() {
1453            if slice.is_empty() {
1454                continue;
1455            }
1456
1457            non_empty_slices += 1;
1458            slices_per_column[column_idx] += 1;
1459
1460            let char_count = slice
1461                .iter()
1462                .map(|span| span.text.chars().count())
1463                .sum::<usize>();
1464            if slice.len() >= 2 || char_count >= 8 {
1465                dense_slices += 1;
1466            }
1467        }
1468    }
1469
1470    if non_empty_slices < boundaries.len() + 2 {
1471        return false;
1472    }
1473
1474    if slices_per_column.contains(&0) {
1475        return false;
1476    }
1477
1478    (dense_slices as f64 / non_empty_slices as f64) >= MIN_DENSE_SLICE_RATIO
1479}
1480
1481fn append_column_region_blocks(
1482    bands: &[TextBand],
1483    band_indices: &[usize],
1484    boundaries: &[f64],
1485    blocks: &mut Vec<TextBlock>,
1486) {
1487    let column_count = boundaries.len() + 1;
1488    let mut column_bands = vec![Vec::<TextSpan>::new(); column_count];
1489
1490    for &band_idx in band_indices {
1491        let slices = bands[band_idx].split_by_boundaries(boundaries);
1492        for (column_idx, slice) in slices.into_iter().enumerate() {
1493            if slice.is_empty() {
1494                continue;
1495            }
1496            column_bands[column_idx].push(TextSpan::default());
1497            let marker_idx = column_bands[column_idx].len() - 1;
1498            column_bands[column_idx][marker_idx] = TextSpan {
1499                x: f64::NEG_INFINITY,
1500                y: bands[band_idx].y,
1501                ..TextSpan::default()
1502            };
1503            column_bands[column_idx].extend(slice);
1504        }
1505    }
1506
1507    for spans in column_bands {
1508        let mut current: Vec<TextSpan> = Vec::new();
1509        for span in spans {
1510            if span.x == f64::NEG_INFINITY {
1511                if !current.is_empty() {
1512                    current.sort_by(|a, b| a.x.partial_cmp(&b.x).unwrap_or(Ordering::Equal));
1513                    blocks.push(TextBlock {
1514                        spans: std::mem::take(&mut current),
1515                    });
1516                }
1517                continue;
1518            }
1519            current.push(span);
1520        }
1521        if !current.is_empty() {
1522            current.sort_by(|a, b| a.x.partial_cmp(&b.x).unwrap_or(Ordering::Equal));
1523            blocks.push(TextBlock { spans: current });
1524        }
1525    }
1526}
1527
1528/// Join per-block lines, stitching end-of-line hyphenated word-wraps the
1529/// way pdftotext / MuPDF / PDFBox do.
1530///
1531/// Trigger conditions (all must hold):
1532/// 1. Previous line ends with `-` preceded by an alphabetic character.
1533/// 2. The alphabetic suffix before the `-` has >= 3 characters.
1534/// 3. The next line (trimmed) starts with an ASCII lowercase letter.
1535/// 4. The lowercase prefix of the next line has >= 3 characters.
1536///
1537/// When triggered, the trailing `-` is removed and the two halves are
1538/// concatenated without a space or newline.
1539///
1540/// This avoids false positives on compound words ("real-time"), bullet
1541/// lists, numeric ranges ("42-"), and short fragments.
1542fn stitch_hyphenated_lines(lines: &[String]) -> String {
1543    let mut out = String::new();
1544    for (idx, line) in lines.iter().enumerate() {
1545        if idx == 0 {
1546            out.push_str(line);
1547            continue;
1548        }
1549
1550        let next_trimmed = line.trim_start();
1551
1552        // Check the accumulated output for end-of-line hyphen pattern
1553        let should_merge = is_hyphen_wrap_candidate(&out, next_trimmed);
1554
1555        if should_merge {
1556            out.pop(); // drop the trailing '-'
1557            out.push_str(next_trimmed);
1558        } else {
1559            out.push('\n');
1560            out.push_str(line);
1561        }
1562    }
1563    out
1564}
1565
/// Decide whether `accumulated` ends in a word fragment that was
/// hyphenated at a line break and `next_trimmed` continues that word.
///
/// True only when `accumulated` ends with `-`, the run of alphabetic
/// (Unicode) characters directly before the hyphen is at least three
/// characters long, and `next_trimmed` begins with a run of at least
/// three ASCII lowercase letters.
fn is_hyphen_wrap_candidate(accumulated: &str, next_trimmed: &str) -> bool {
    const MIN_FRAGMENT_LEN: usize = 3;

    let Some(before_hyphen) = accumulated.strip_suffix('-') else {
        return false;
    };

    // Length of the word fragment that precedes the hyphen. A zero-length
    // run also covers the "character before '-' is not a letter" case.
    let fragment_len = before_hyphen
        .chars()
        .rev()
        .take_while(|c| c.is_alphabetic())
        .count();

    // Length of the lowercase run that opens the continuation line.
    let continuation_len = next_trimmed
        .chars()
        .take_while(char::is_ascii_lowercase)
        .count();

    fragment_len >= MIN_FRAGMENT_LEN && continuation_len >= MIN_FRAGMENT_LEN
}
1608
/// Normalize extracted text to match pdftotext conventions.
///
/// 1. Trim trailing whitespace from each line, except form-feed characters
///    (`\x0C`), which are preserved as page separators wherever they occur.
/// 2. Collapse runs of more than two consecutive blank lines into exactly
///    two blank lines (i.e. at most three `\n` in a row).
/// 3. Drop trailing blank lines and end with a single trailing newline.
///
/// Returns an empty string when the input is empty or contains nothing but
/// whitespace other than form feeds.
pub(crate) fn normalize_text_output(text: &str) -> String {
    const FORM_FEED: char = '\x0C';
    /// Maximum number of consecutive blank lines kept in the output.
    const MAX_BLANK_RUN: usize = 2;

    // `char::is_whitespace` is true for form feed, so a plain `trim_end`
    // would silently delete page separators that end a line or stand alone.
    let mut lines: Vec<&str> = text
        .split('\n')
        .map(|line| line.trim_end_matches(|c: char| c.is_whitespace() && c != FORM_FEED))
        .collect();

    // Remove trailing blank lines; every kept line gets exactly one '\n'.
    while lines.last().is_some_and(|line| line.is_empty()) {
        lines.pop();
    }

    let mut result = String::with_capacity(text.len() + 1);
    let mut blank_run = 0usize;

    for line in lines {
        if line.is_empty() {
            blank_run += 1;
            // Blank lines beyond the allowed run are dropped.
            if blank_run <= MAX_BLANK_RUN {
                result.push('\n');
            }
        } else {
            // Regular lines, form-feed-prefixed lines and bare form-feed
            // lines are all emitted verbatim.
            blank_run = 0;
            result.push_str(line);
            result.push('\n');
        }
    }

    result
}
1670
1671#[cfg(test)]
1672mod tests {
1673    use super::*;
1674
1675    fn span(text: &str, x: f64, y: f64, width: f64) -> TextSpan {
1676        TextSpan {
1677            text: text.into(),
1678            x,
1679            y,
1680            width,
1681            height: 12.0,
1682            font_size: 12.0,
1683            ..TextSpan::default()
1684        }
1685    }
1686
1687    fn block_texts(spans: Vec<TextSpan>) -> Vec<String> {
1688        group_spans_into_blocks(spans)
1689            .into_iter()
1690            .map(|block| block.text())
1691            .collect()
1692    }
1693
1694    #[test]
1695    fn empty_device_produces_empty_text() {
1696        let dev = TextExtractionDevice::new();
1697        assert!(dev.into_text().is_empty());
1698    }
1699
1700    #[test]
1701    fn single_column_stays_row_major() {
1702        let texts = block_texts(vec![
1703            span("Single Column Line 1", 40.0, 700.0, 140.0),
1704            span("Single Column Line 2", 40.0, 684.0, 140.0),
1705            span("Single Column Line 3", 40.0, 668.0, 140.0),
1706        ]);
1707
1708        assert_eq!(
1709            texts,
1710            vec![
1711                "Single Column Line 1",
1712                "Single Column Line 2",
1713                "Single Column Line 3",
1714            ]
1715        );
1716    }
1717
1718    #[test]
1719    fn two_column_region_reads_column_major() {
1720        let texts = block_texts(vec![
1721            span("Header", 200.0, 740.0, 80.0),
1722            span("Left column line one", 40.0, 700.0, 115.0),
1723            span("Right column line one", 320.0, 700.0, 120.0),
1724            span("Left column line two", 40.0, 684.0, 115.0),
1725            span("Right column line two", 320.0, 684.0, 120.0),
1726            span("Left column line three", 40.0, 668.0, 125.0),
1727            span("Right column line three", 320.0, 668.0, 130.0),
1728            span("Footer", 200.0, 620.0, 80.0),
1729        ]);
1730
1731        assert_eq!(
1732            texts,
1733            vec![
1734                "Header",
1735                "Left column line one",
1736                "Left column line two",
1737                "Left column line three",
1738                "Right column line one",
1739                "Right column line two",
1740                "Right column line three",
1741                "Footer",
1742            ]
1743        );
1744    }
1745
1746    #[test]
1747    fn mixed_single_and_multi_column_regions_preserve_shared_bands() {
1748        let texts = block_texts(vec![
1749            span("Intro paragraph", 40.0, 740.0, 180.0),
1750            span("L1 words here", 40.0, 700.0, 110.0),
1751            span("R1 words here", 320.0, 700.0, 110.0),
1752            span("L2 words here", 40.0, 684.0, 110.0),
1753            span("R2 words here", 320.0, 684.0, 110.0),
1754            span("L3 words here", 40.0, 668.0, 110.0),
1755            span("R3 words here", 320.0, 668.0, 110.0),
1756            span("Outro paragraph", 40.0, 620.0, 180.0),
1757        ]);
1758
1759        assert_eq!(
1760            texts,
1761            vec![
1762                "Intro paragraph",
1763                "L1 words here",
1764                "L2 words here",
1765                "L3 words here",
1766                "R1 words here",
1767                "R2 words here",
1768                "R3 words here",
1769                "Outro paragraph",
1770            ]
1771        );
1772    }
1773
1774    #[test]
1775    fn short_table_like_rows_fall_back_to_row_major() {
1776        let texts = block_texts(vec![
1777            span("Name", 40.0, 700.0, 30.0),
1778            span("Age", 320.0, 700.0, 20.0),
1779            span("Alice", 40.0, 684.0, 35.0),
1780            span("30", 320.0, 684.0, 15.0),
1781            span("Bob", 40.0, 668.0, 24.0),
1782            span("25", 320.0, 668.0, 15.0),
1783        ]);
1784
1785        assert_eq!(texts, vec!["Name Age", "Alice 30", "Bob 25"]);
1786    }
1787
1788    #[test]
1789    fn three_column_regions_are_supported() {
1790        let texts = block_texts(vec![
1791            span("Column one line one", 40.0, 700.0, 105.0),
1792            span("Column two line one", 220.0, 700.0, 105.0),
1793            span("Column three line one", 400.0, 700.0, 120.0),
1794            span("Column one line two", 40.0, 684.0, 105.0),
1795            span("Column two line two", 220.0, 684.0, 105.0),
1796            span("Column three line two", 400.0, 684.0, 120.0),
1797            span("Column one line three", 40.0, 668.0, 120.0),
1798            span("Column two line three", 220.0, 668.0, 120.0),
1799            span("Column three line three", 400.0, 668.0, 135.0),
1800        ]);
1801
1802        assert_eq!(
1803            texts,
1804            vec![
1805                "Column one line one",
1806                "Column one line two",
1807                "Column one line three",
1808                "Column two line one",
1809                "Column two line two",
1810                "Column two line three",
1811                "Column three line one",
1812                "Column three line two",
1813                "Column three line three",
1814            ]
1815        );
1816    }
1817
1818    #[test]
1819    fn text_block_concatenation_spaced() {
1820        let block = TextBlock {
1821            spans: vec![span("A", 0.0, 0.0, 6.0), span("B", 20.0, 0.0, 6.0)],
1822        };
1823        assert_eq!(block.text(), "A B");
1824    }
1825
1826    #[test]
1827    fn adaptive_column_gap_fallback_for_no_gaps() {
1828        // Single-span bands produce no measurable gaps → fallback
1829        let bands = vec![
1830            TextBand::new(span("Hello", 40.0, 700.0, 80.0)),
1831            TextBand::new(span("World", 40.0, 684.0, 80.0)),
1832        ];
1833        let threshold = compute_adaptive_column_gap(&bands);
1834        assert!((threshold - COLUMN_GAP_THRESHOLD_FALLBACK).abs() < 0.01);
1835    }
1836
1837    #[test]
1838    fn adaptive_column_gap_uses_median() {
1839        // Three bands with word gaps of ~4pt each → median ≈ 4, threshold = 12
1840        let mut bands = Vec::new();
1841        for y in [700.0, 684.0, 668.0] {
1842            let mut band = TextBand::new(span("word1", 40.0, y, 30.0));
1843            band.spans.push(span("word2", 74.0, y, 30.0)); // gap = 4
1844            band.spans.push(span("word3", 108.0, y, 30.0)); // gap = 4
1845            bands.push(band);
1846        }
1847        let threshold = compute_adaptive_column_gap(&bands);
1848        // median gap = 4, × 3 = 12, clamped to [10, 40] → 12
1849        assert!(
1850            (10.0..=14.0).contains(&threshold),
1851            "expected ~12, got {threshold}"
1852        );
1853    }
1854
1855    #[test]
1856    fn adaptive_column_gap_clamps_to_min() {
1857        // Tight gaps (2pt) across many bands → median = 2, 3×2 = 6 → clamped to 10
1858        let mut bands = Vec::new();
1859        for y in [700.0, 684.0, 668.0, 652.0] {
1860            let mut band = TextBand::new(span("abc", 0.0, y, 18.0));
1861            // right of "abc" = max(18, 12*0.5*3=18) = 18; gap = 20-18 = 2
1862            band.spans.push(span("def", 20.0, y, 18.0));
1863            bands.push(band);
1864        }
1865        let threshold = compute_adaptive_column_gap(&bands);
1866        assert!(
1867            (threshold - COLUMN_GAP_THRESHOLD_MIN).abs() < 0.01,
1868            "expected {COLUMN_GAP_THRESHOLD_MIN}, got {threshold}"
1869        );
1870    }
1871
1872    #[test]
1873    fn adaptive_column_gap_all_large_gaps_uses_fraction_of_min() {
1874        // When all gaps are large (> MIN), threshold = 0.75 × min_gap.
1875        let mut band = TextBand::new(span("Left", 0.0, 700.0, 30.0));
1876        band.spans.push(span("Right", 80.0, 700.0, 30.0)); // gap = 50
1877        let bands = vec![band];
1878        let threshold = compute_adaptive_column_gap(&bands);
1879        assert!(
1880            (threshold - 37.5).abs() < 0.01,
1881            "expected 37.5 (0.75×50), got {threshold}"
1882        );
1883    }
1884
1885    #[test]
1886    fn normalize_trims_trailing_whitespace_per_line() {
1887        assert_eq!(
1888            normalize_text_output("hello   \nworld  \n"),
1889            "hello\nworld\n"
1890        );
1891    }
1892
1893    #[test]
1894    fn normalize_collapses_excess_newlines() {
1895        // >2 blank lines collapse to 2 (meaning 3 \n in a row: line, blank, blank)
1896        assert_eq!(
1897            normalize_text_output("hello\n\n\n\n\nworld\n"),
1898            "hello\n\n\nworld\n"
1899        );
1900    }
1901
1902    #[test]
1903    fn normalize_preserves_double_newline() {
1904        assert_eq!(
1905            normalize_text_output("paragraph one\n\nparagraph two\n"),
1906            "paragraph one\n\nparagraph two\n"
1907        );
1908    }
1909
1910    #[test]
1911    fn normalize_preserves_form_feed() {
1912        assert_eq!(
1913            normalize_text_output("page1\n\n\x0Cpage2\n"),
1914            "page1\n\n\x0Cpage2\n"
1915        );
1916    }
1917
1918    #[test]
1919    fn normalize_adds_trailing_newline() {
1920        assert_eq!(normalize_text_output("hello"), "hello\n");
1921    }
1922
1923    #[test]
1924    fn normalize_empty_input() {
1925        assert_eq!(normalize_text_output(""), "");
1926    }
1927
1928    #[test]
1929    fn normalize_only_whitespace() {
1930        assert_eq!(normalize_text_output("   \n  \n"), "");
1931    }
1932
1933    // --- Hyphen stitching tests ---
1934
1935    #[test]
1936    fn hyphen_stitch_joins_wrapped_word() {
1937        let lines = vec!["the aver-".into(), "age rainfall".into()];
1938        assert_eq!(stitch_hyphenated_lines(&lines), "the average rainfall");
1939    }
1940
1941    #[test]
1942    fn hyphen_stitch_handles_leading_whitespace() {
1943        let lines = vec!["pre-".into(), "   dict the outcome".into()];
1944        // "pre" is only 3 chars → meets >= 3 guard
1945        assert_eq!(stitch_hyphenated_lines(&lines), "predict the outcome");
1946    }
1947
1948    #[test]
1949    fn hyphen_stitch_capital_continuation_not_stitched() {
1950        let lines = vec!["Section three-".into(), "Summary here".into()];
1951        assert_eq!(
1952            stitch_hyphenated_lines(&lines),
1953            "Section three-\nSummary here"
1954        );
1955    }
1956
1957    #[test]
1958    fn hyphen_stitch_bullet_dash_not_stitched() {
1959        // "-" alone: char before hyphen is not alphabetic
1960        let lines = vec!["Items:".into(), "-".into(), "milk".into()];
1961        assert_eq!(stitch_hyphenated_lines(&lines), "Items:\n-\nmilk");
1962    }
1963
1964    #[test]
1965    fn hyphen_stitch_numeric_range_not_stitched() {
1966        // "42-" — char before hyphen is digit, not alphabetic
1967        let lines = vec!["page 42-".into(), "seventy".into()];
1968        assert_eq!(stitch_hyphenated_lines(&lines), "page 42-\nseventy");
1969    }
1970
1971    #[test]
1972    fn hyphen_stitch_short_prefix_not_stitched() {
1973        // "re-" only 2 alpha chars before hyphen → below 3-char guard
1974        let lines = vec!["re-".into(), "organize".into()];
1975        assert_eq!(stitch_hyphenated_lines(&lines), "re-\norganize");
1976    }
1977
1978    #[test]
1979    fn hyphen_stitch_short_continuation_not_stitched() {
1980        // Next line starts with "an" (2 chars) → below 3-char guard
1981        let lines = vec!["counter-".into(), "an example".into()];
1982        assert_eq!(stitch_hyphenated_lines(&lines), "counter-\nan example");
1983    }
1984
1985    #[test]
1986    fn hyphen_stitch_compound_word_midline_preserved() {
1987        // "real-time" is mid-line, not end-of-line — no stitching applies
1988        // because stitch only operates on line boundaries
1989        let lines = vec!["real-time system".into()];
1990        assert_eq!(stitch_hyphenated_lines(&lines), "real-time system");
1991    }
1992
1993    #[test]
1994    fn hyphen_stitch_single_line_unchanged() {
1995        let lines = vec!["only line".into()];
1996        assert_eq!(stitch_hyphenated_lines(&lines), "only line");
1997    }
1998
1999    #[test]
2000    fn hyphen_stitch_empty_input() {
2001        let lines: Vec<String> = vec![];
2002        assert_eq!(stitch_hyphenated_lines(&lines), "");
2003    }
2004
2005    // --- TEX1 multi-signal space consensus tests ---
2006
2007    fn make_device_with_median(median: f64) -> TextExtractionDevice {
2008        let mut dev = TextExtractionDevice::new();
2009        // Seed enough samples for the median to resolve to `median`.
2010        for _ in 0..MEDIAN_REFRESH {
2011            dev.glyph_widths.push(median);
2012        }
2013        dev.refresh_median_char_width();
2014        assert!((dev.cached_median_char_width - median).abs() < 1e-9);
2015        dev
2016    }
2017
2018    #[test]
2019    fn consensus_inserts_space_on_strong_tj_offset_alone() {
2020        // Gap is below the geometric threshold, but the TJ offset is large
2021        // enough that the consensus must still fire.
2022        let mut dev = make_device_with_median(6.0);
2023        dev.pending_tj_offset = 250.0; // full em-space
2024        assert!(dev.evaluate_space_consensus(0.5, 12.0, "Hello", "World"));
2025    }
2026
2027    #[test]
2028    fn consensus_inserts_space_on_geometric_gap_alone() {
2029        // No TJ, no character transition, but a clearly wide geometric gap.
2030        let dev = make_device_with_median(6.0);
2031        // gap > 0.3 * 6.0 = 1.8 → fires gap signal (0.80), below threshold
2032        // on its own? 0.80 < 0.75 threshold? No, 0.80 > 0.75, so it fires.
2033        assert!(dev.evaluate_space_consensus(2.5, 12.0, "hello", "world"));
2034    }
2035
2036    #[test]
2037    fn consensus_no_space_on_kerning_gap() {
2038        // Small kerning-size gap with no other signals must not inject a
2039        // space (regression guard against false-positive spaces inside
2040        // tightly kerned words).
2041        let dev = make_device_with_median(6.0);
2042        assert!(!dev.evaluate_space_consensus(0.5, 12.0, "fi", "lm"));
2043    }
2044
2045    #[test]
2046    fn consensus_inserts_space_on_camel_case_plus_gap() {
2047        // CamelCase heuristic (0.60) alone doesn't reach threshold, but a
2048        // moderate gap (0.60 gap + 0.60 heuristic if gap fires) should.
2049        // Here gap = 2.5 > 1.8 → gap fires → total 0.80 + 0.60 = 1.40.
2050        let dev = make_device_with_median(6.0);
2051        assert!(dev.evaluate_space_consensus(2.5, 12.0, "helloWorld", "Inc"));
2052    }
2053
2054    #[test]
2055    fn consensus_inserts_space_on_digit_letter_transition_with_gap() {
2056        let dev = make_device_with_median(6.0);
2057        assert!(dev.evaluate_space_consensus(2.5, 12.0, "123", "abc"));
2058    }
2059
2060    #[test]
2061    fn consensus_heuristic_alone_is_insufficient() {
2062        // Heuristic (0.60) on its own is below the 0.75 threshold — the
2063        // design deliberately requires a second corroborating signal to
2064        // avoid gluing spaces into existing CamelCase identifiers that
2065        // have no geometric break.
2066        let dev = make_device_with_median(6.0);
2067        assert!(!dev.evaluate_space_consensus(0.5, 12.0, "camel", "Case"));
2068    }
2069
2070    #[test]
2071    fn consensus_falls_back_to_font_size_when_no_median() {
2072        // No samples → median is 0; geometric reference uses font-size.
2073        let dev = TextExtractionDevice::new();
2074        // gap 1.9 > 0.15 * 12.0 = 1.8 → gap signal fires
2075        assert!(dev.evaluate_space_consensus(1.9, 12.0, "a", "b"));
2076        // gap 1.5 < 1.8 → no signal
2077        assert!(!dev.evaluate_space_consensus(1.5, 12.0, "a", "b"));
2078    }
2079
2080    #[test]
2081    fn consensus_ignores_tiny_tj_offsets() {
2082        // TJ offsets below the threshold are kerning, not word breaks.
2083        let mut dev = make_device_with_median(6.0);
2084        dev.pending_tj_offset = 50.0;
2085        assert!(!dev.evaluate_space_consensus(0.5, 12.0, "Hello", "World"));
2086    }
2087
2088    #[test]
2089    fn consensus_accepts_negative_tj_offsets() {
2090        // A negative TJ offset still represents an explicit inter-substring
2091        // shift and counts toward the consensus (|amount| check).
2092        let mut dev = make_device_with_median(6.0);
2093        dev.pending_tj_offset = -250.0;
2094        assert!(dev.evaluate_space_consensus(0.5, 12.0, "Hello", "World"));
2095    }
2096
2097    #[test]
2098    fn text_adjustment_accumulates_until_glyph() {
2099        let mut dev = TextExtractionDevice::new();
2100        dev.text_adjustment(120.0);
2101        dev.text_adjustment(140.0);
2102        assert!((dev.pending_tj_offset - 260.0).abs() < 1e-6);
2103    }
2104
2105    // --- TEX2 XY-Cut tests ---
2106
2107    #[test]
2108    fn xy_cut_header_body_footer_with_two_columns() {
2109        // Header and footer sit in the mid-x range that would
2110        // accidentally fall into a left-column bucket with a naive
2111        // vertical-first cut. The largest-gap-first rule plus the
2112        // alignment guard ensure header and footer bracket the
2113        // columnar body.
2114        let texts = block_texts(vec![
2115            span("HEADLINE TITLE", 180.0, 760.0, 120.0),
2116            span("Left col line A", 40.0, 700.0, 110.0),
2117            span("Right col line A", 320.0, 700.0, 115.0),
2118            span("Left col line B", 40.0, 684.0, 110.0),
2119            span("Right col line B", 320.0, 684.0, 115.0),
2120            span("Left col line C", 40.0, 668.0, 110.0),
2121            span("Right col line C", 320.0, 668.0, 115.0),
2122            span("FOOTER LINE TEXT", 180.0, 600.0, 120.0),
2123        ]);
2124        assert_eq!(texts.first().map(String::as_str), Some("HEADLINE TITLE"));
2125        assert_eq!(texts.last().map(String::as_str), Some("FOOTER LINE TEXT"));
2126        // Left column lines all come before right column lines.
2127        let left_c_idx = texts.iter().position(|s| s == "Left col line C").unwrap();
2128        let right_a_idx = texts.iter().position(|s| s == "Right col line A").unwrap();
2129        assert!(
2130            left_c_idx < right_a_idx,
2131            "expected column-major ordering in body: {texts:?}"
2132        );
2133    }
2134
2135    #[test]
2136    fn xy_cut_rejects_column_split_on_table_rows() {
2137        // The density guard must still reject the 280pt inter-cell gap
2138        // in a short-cell table, preserving row-major reading order.
2139        let texts = block_texts(vec![
2140            span("Name", 40.0, 700.0, 30.0),
2141            span("Age", 320.0, 700.0, 20.0),
2142            span("Alice", 40.0, 684.0, 35.0),
2143            span("30", 320.0, 684.0, 15.0),
2144        ]);
2145        assert_eq!(texts, vec!["Name Age", "Alice 30"]);
2146    }
2147
2148    #[test]
2149    fn xy_cut_rejects_column_split_when_one_band_is_full_width() {
2150        // The alignment guard catches a full-width paragraph that
2151        // would otherwise be forced into the left column of a 2-column
2152        // region below it.
2153        let texts = block_texts(vec![
2154            span(
2155                "Full width intro spanning both columns here",
2156                40.0,
2157                740.0,
2158                360.0,
2159            ),
2160            span("Left A", 40.0, 700.0, 50.0),
2161            span("Right A", 320.0, 700.0, 50.0),
2162            span("Left B", 40.0, 684.0, 50.0),
2163            span("Right B", 320.0, 684.0, 50.0),
2164        ]);
2165        assert!(
2166            texts[0].contains("Full width intro"),
2167            "expected full-width intro first: {texts:?}"
2168        );
2169    }
2170
2171    #[test]
2172    fn xy_cut_horizontal_split_for_zone_boundaries() {
2173        // Pure horizontal cut on a single-column page with a big
2174        // vertical gap between paragraphs — the cut fires and both
2175        // paragraphs stay in their own blocks.
2176        let texts = block_texts(vec![
2177            span("First paragraph body text", 40.0, 740.0, 200.0),
2178            span("Second paragraph body", 40.0, 680.0, 180.0),
2179        ]);
2180        assert_eq!(texts.len(), 2);
2181        assert!(texts[0].starts_with("First"));
2182        assert!(texts[1].starts_with("Second"));
2183    }
2184
2185    #[test]
2186    fn xy_cut_recursion_terminates_with_single_span() {
2187        let texts = block_texts(vec![span("Only one span on the page", 40.0, 700.0, 180.0)]);
2188        assert_eq!(texts, vec!["Only one span on the page"]);
2189    }
2190
2191    #[test]
2192    fn median_font_size_handles_mixed_sizes() {
2193        let spans = vec![
2194            TextSpan {
2195                text: "small".into(),
2196                width: 10.0,
2197                height: 8.0,
2198                font_size: 8.0,
2199                ..TextSpan::default()
2200            },
2201            TextSpan {
2202                text: "medium".into(),
2203                width: 10.0,
2204                height: 12.0,
2205                font_size: 12.0,
2206                ..TextSpan::default()
2207            },
2208            TextSpan {
2209                text: "large".into(),
2210                width: 10.0,
2211                height: 24.0,
2212                font_size: 24.0,
2213                ..TextSpan::default()
2214            },
2215        ];
2216        assert!((median_font_size(&spans) - 12.0).abs() < 1e-9);
2217    }
2218
2219    #[test]
2220    fn columns_band_aligned_accepts_aligned_columns() {
2221        let spans = vec![
2222            span("L1", 40.0, 700.0, 60.0),
2223            span("R1", 300.0, 700.0, 60.0),
2224            span("L2", 40.0, 684.0, 60.0),
2225            span("R2", 300.0, 684.0, 60.0),
2226        ];
2227        let stats = PageStats::from_spans(&spans);
2228        // cut_x between 100 and 300 → 200. Every band straddles the cut.
2229        assert!(columns_are_band_aligned(&spans, 200.0, 40.0, 360.0, &stats));
2230    }
2231
2232    #[test]
2233    fn columns_band_aligned_rejects_wide_single_side_band() {
2234        let spans = vec![
2235            span("Wide banner line across top", 40.0, 740.0, 280.0),
2236            span("L1", 40.0, 700.0, 60.0),
2237            span("R1", 300.0, 700.0, 60.0),
2238        ];
2239        let stats = PageStats::from_spans(&spans);
2240        // cut_x = 200. Banner only in left group (midpoint < 200). Width
2241        // exceeds 0.7 × left column width → rejected.
2242        assert!(!columns_are_band_aligned(
2243            &spans, 200.0, 40.0, 360.0, &stats
2244        ));
2245    }
2246
2247    #[test]
2248    fn page_stats_computes_median_values() {
2249        let spans = vec![
2250            span("one", 40.0, 700.0, 30.0),
2251            span("two", 40.0, 680.0, 30.0),
2252            span("three", 40.0, 660.0, 50.0),
2253        ];
2254        let stats = PageStats::from_spans(&spans);
2255        assert!((stats.median_font_size - 12.0).abs() < 1e-9);
2256        // char width = width / chars. one=30/3=10, two=30/3=10, three=50/5=10. median=10.
2257        assert!((stats.median_char_width - 10.0).abs() < 1e-9);
2258        // line spacing: bands at 700, 680, 660. gaps = 20, 20. median = 20.
2259        assert!((stats.median_line_spacing - 20.0).abs() < 1e-9);
2260    }
2261
2262    #[test]
2263    fn page_stats_handles_empty_input() {
2264        let stats = PageStats::from_spans(&[]);
2265        assert!((stats.median_font_size - 12.0).abs() < 1e-9);
2266        assert!((stats.median_char_width - 6.0).abs() < 1e-9);
2267        assert_eq!(stats.median_line_spacing, 0.0);
2268    }
2269
2270    #[test]
2271    fn narrow_gutter_detected_with_adaptive_threshold() {
2272        // Academic paper layout: 12pt gutter between columns.
2273        // With old fixed 20pt threshold, this was not detected as columnar.
2274        // With adaptive: median word gap ~4pt, threshold = 12pt → detects 12pt gutter.
2275        let mut spans = Vec::new();
2276        for y in [700.0, 684.0, 668.0] {
2277            // Left column: two words with 4pt gap, ending at x=145
2278            spans.push(span("Lorem ipsum", 40.0, y, 100.0));
2279            spans.push(span("dolor sit", 144.0, y, 80.0));
2280            // Right column starts at 236 (gap = 12pt from 224)
2281            spans.push(span("amet consec", 236.0, y, 100.0));
2282            spans.push(span("tetur adipi", 340.0, y, 80.0));
2283        }
2284        let texts = block_texts(spans);
2285        // Should detect 2-column layout and read column-major
2286        assert!(
2287            texts.len() >= 6,
2288            "expected column-major output, got {texts:?}"
2289        );
2290        // First three blocks should be left column lines
2291        assert!(
2292            texts[0].contains("Lorem"),
2293            "first block should be left column: {texts:?}"
2294        );
2295    }
2296
2297    #[test]
2298    fn xy_cut_leaf_falls_back_to_legacy_columns_for_header_plus_three_columns() {
2299        let texts = block_texts(vec![
2300            span("73022", 45.0, 750.0, 70.0),
2301            span("Federal Register banner", 125.6, 750.0, 260.0),
2302            span("Left column line one", 45.0, 725.0, 140.0),
2303            span("Middle column line one", 222.0, 725.0, 140.0),
2304            span("Right column line one", 399.0, 725.0, 120.0),
2305            span("Left column line two", 45.0, 715.0, 140.0),
2306            span("Middle column line two", 210.0, 715.0, 152.0),
2307            span("Right column line two", 388.0, 715.0, 132.0),
2308            span("Left column line three", 45.0, 705.0, 140.0),
2309            span("Middle column line three", 235.0, 705.0, 135.0),
2310            span("Right column line three", 408.0, 705.0, 118.0),
2311        ]);
2312
2313        assert_eq!(
2314            texts,
2315            vec![
2316                "73022 Federal Register banner",
2317                "Left column line one",
2318                "Left column line two",
2319                "Left column line three",
2320                "Middle column line one",
2321                "Middle column line two",
2322                "Middle column line three",
2323                "Right column line one",
2324                "Right column line two",
2325                "Right column line three",
2326            ]
2327        );
2328    }
2329
2330    #[test]
2331    fn overlapping_fake_bold_spans_collapse_to_single_copy() {
2332        let texts = block_texts(vec![
2333            span("1 This is fakebold text.", 25.9, 785.3, 320.0),
2334            span("1 This is fakebold text.", 26.2, 785.3, 320.0),
2335            span("1 This is fakebold text.", 26.4, 785.3, 320.0),
2336            span("1 This is fakebold text.", 26.7, 785.3, 320.0),
2337            span("2 This is a fakebold", 27.0, 714.8, 142.0),
2338            span(" fakebold", 169.8, 714.8, 70.0),
2339            span(" fakebold", 170.1, 714.8, 70.0),
2340            span(" fakebold word.", 170.4, 714.8, 110.0),
2341        ]);
2342
2343        assert_eq!(
2344            texts,
2345            vec!["1 This is fakebold text.", "2 This is a fakebold word.",]
2346        );
2347    }
2348
2349    // ---- G1: read-only metadata field tests ----
2350
2351    #[test]
2352    fn g1_default_text_span_has_empty_metadata() {
2353        let s = TextSpan::default();
2354        assert_eq!(s.font_name, None);
2355        assert!(!s.is_bold);
2356        assert!(!s.is_italic);
2357        assert_eq!(s.color, None);
2358    }
2359
2360    #[test]
2361    fn g1_strip_subset_prefix_handles_six_char_prefix() {
2362        assert_eq!(strip_subset_prefix("AAAAAA+Helvetica"), "Helvetica");
2363        // Non-6-char prefix → keep verbatim.
2364        assert_eq!(strip_subset_prefix("ABC+Helvetica"), "ABC+Helvetica");
2365        // No `+` → unchanged.
2366        assert_eq!(strip_subset_prefix("Helvetica-Bold"), "Helvetica-Bold");
2367    }
2368
2369    #[test]
2370    fn g1_name_style_hints_match_pdf_interpret_rules() {
2371        assert_eq!(name_style_hints("Helvetica-Bold"), (true, false));
2372        assert_eq!(name_style_hints("Times-Italic"), (false, true));
2373        assert_eq!(name_style_hints("MyFont-BoldOblique"), (true, true));
2374        assert_eq!(name_style_hints("Helvetica"), (false, false));
2375        // Semibold / Demi / Heavy / Black variants → bold.
2376        assert_eq!(name_style_hints("Roboto-DemiBold"), (true, false));
2377        assert_eq!(name_style_hints("Roboto-Black"), (true, false));
2378        // Oblique / slant variants → italic.
2379        assert_eq!(name_style_hints("Roboto-Oblique"), (false, true));
2380        assert_eq!(name_style_hints("MyFont-Slanted"), (false, true));
2381    }
2382
2383    // ---- G2: widthSource + char_bounds on TextSpan ----
2384
2385    #[test]
2386    fn g2_default_text_span_has_estimate_width_source() {
2387        let s = TextSpan::default();
2388        assert_eq!(s.width_source, WidthSource::Estimate);
2389        assert!(s.char_bounds.is_empty());
2390    }
2391
2392    /// Verify that a single-glyph span has exactly one char_bound entry
2393    /// and that the bound matches the span's x / width.
2394    #[test]
2395    fn g2_single_glyph_span_has_one_char_bound() {
2396        let s = TextSpan {
2397            text: "A".into(),
2398            x: 10.0,
2399            y: 100.0,
2400            width: 7.22,
2401            height: 10.0,
2402            font_size: 10.0,
2403            width_source: WidthSource::Metric,
2404            char_bounds: vec![[10.0, 100.0, 17.22, 110.0]],
2405            ..Default::default()
2406        };
2407
2408        assert_eq!(s.char_bounds.len(), 1);
2409        let [x0, y0, x1, y1] = s.char_bounds[0];
2410        assert!((x0 - 10.0).abs() < 0.001);
2411        assert!((x1 - 17.22).abs() < 0.001);
2412        assert!((y1 - y0 - s.font_size).abs() < 0.001);
2413    }
2414
2415    /// When merging glyphs into a span the width_source degrades to Estimate
2416    /// if any glyph was estimated.
2417    #[test]
2418    fn g2_merged_span_degrades_width_source_on_estimate() {
2419        let mut s = TextSpan {
2420            width_source: WidthSource::Metric,
2421            char_bounds: vec![[0.0, 0.0, 7.0, 10.0]],
2422            ..Default::default()
2423        };
2424
2425        // Simulate what draw_glyph does on merge: push bound + downgrade.
2426        s.char_bounds.push([7.0, 0.0, 12.0, 10.0]);
2427        s.width_source = WidthSource::Estimate; // second glyph had no advance
2428
2429        assert_eq!(s.width_source, WidthSource::Estimate);
2430        assert_eq!(s.char_bounds.len(), 2);
2431    }
2432
2433    /// Verify WidthSource enum serialises to the two expected string literals.
2434    #[test]
2435    fn g2_width_source_variants_are_correct() {
2436        assert_eq!(format!("{:?}", WidthSource::Metric), "Metric");
2437        assert_eq!(format!("{:?}", WidthSource::Estimate), "Estimate");
2438        assert_ne!(WidthSource::Metric, WidthSource::Estimate);
2439        assert_eq!(WidthSource::default(), WidthSource::Estimate);
2440    }
2441}