oxidize_pdf/text/
extraction.rs

1//! Text extraction from PDF content streams
2//!
3//! This module provides functionality to extract text from PDF pages,
4//! handling text positioning, transformations, and basic encodings.
5
6use crate::graphics::Color;
7use crate::parser::content::{ContentOperation, ContentParser, TextElement};
8use crate::parser::document::PdfDocument;
9use crate::parser::objects::{PdfDictionary, PdfObject};
10use crate::parser::page_tree::ParsedPage;
11use crate::parser::ParseResult;
12use crate::text::extraction_cmap::{CMapTextExtractor, FontInfo};
13use std::collections::HashMap;
14use std::io::{Read, Seek};
15
/// Tunable settings that control how text is pulled out of a page.
#[derive(Debug, Clone)]
pub struct ExtractionOptions {
    /// Keep the source layout (spacing and positioning) in the output.
    pub preserve_layout: bool,
    /// Smallest gap that yields a space character (in text space units).
    /// The line-merging pass multiplies this by the fragment's font size
    /// whenever no space-glyph metric is known for the font.
    pub space_threshold: f64,
    /// Fraction of the current font size above which a numeric `TJ` kerning
    /// offset is turned into an implicit `U+0020`.
    ///
    /// A `TJ` kern moves the text matrix by `-adjustment/1000 * font_size`
    /// and paints no glyph. Many PDFs (academic publishers, LaTeX, kerned
    /// typography) express inter-word gaps purely through wide negative
    /// kerns instead of literal space bytes, so one `U+0020` is inserted
    /// whenever the synthesised advance is greater than
    /// `tj_space_threshold * font_size`.
    ///
    /// The default of `0.2` (200 milli-em) sits between typical intra-word
    /// kerning (10-50 milli-em) and the width of a `space` glyph in most
    /// fonts (250-300 milli-em). Smaller values pick up tighter spaces;
    /// larger values cut false positives in fonts with unusually wide
    /// kerning. It is kept apart from `space_threshold` (which governs the
    /// post-glyph gap between separate text-show operators) because a `TJ`
    /// kern is measured without any glyph advance baseline and needs a more
    /// sensitive limit (issue #272).
    pub tj_space_threshold: f64,
    /// Smallest vertical distance that yields a newline (in text space units).
    pub newline_threshold: f64,
    /// Order text fragments by position (helps with multi-column layouts).
    pub sort_by_position: bool,
    /// Detect columns and handle them.
    pub detect_columns: bool,
    /// Separation that distinguishes two columns (in page units).
    pub column_threshold: f64,
    /// Join words that were hyphenated at a line end.
    pub merge_hyphenated: bool,
    /// Record every space-insertion decision on the owning `TextFragment`
    /// (default: `false`). Disabled: zero overhead. Enabled: fills
    /// `TextFragment::space_decisions`.
    pub track_space_decisions: bool,
    /// Rebuild visual lines and paragraphs out of the raw fragments emitted
    /// by PDF text-show operators. When `true`, fragments sharing a baseline
    /// are merged into single-line fragments, and consecutive lines with
    /// normal leading are then merged into paragraph-level fragments. The
    /// partition pipeline relies on this to emit Element values per
    /// paragraph instead of per `Tj` (see
    /// [issue #261](https://github.com/bzsanti/oxidizePdf/issues/261)).
    ///
    /// Defaults to `false` so direct `extract_text` callers keep their
    /// existing output. The `PdfDocument::partition*` entry points force it
    /// to `true`.
    pub reconstruct_paragraphs: bool,
    /// Keep content found inside `/Artifact` marked-content scopes (page
    /// headers, footers, watermarks, decorative content). Defaults to
    /// `false`: Artifact content is dropped, as the PDF/UA conformance level
    /// recommends for accessibility tooling and as RAG callers consistently
    /// want (issue #269 Phase 1). Set to `true` when page furniture matters
    /// (e.g. forensic auditing, redaction tools).
    pub include_artifacts: bool,
}

impl Default for ExtractionOptions {
    /// Baseline configuration: position sorting and hyphen merging enabled,
    /// every other opt-in feature disabled.
    fn default() -> Self {
        Self {
            // Numeric thresholds.
            space_threshold: 0.3,
            tj_space_threshold: 0.2,
            newline_threshold: 10.0,
            column_threshold: 50.0,
            // Behaviours that are on unless the caller turns them off.
            sort_by_position: true,
            merge_hyphenated: true,
            // Opt-in behaviours.
            preserve_layout: false,
            detect_columns: false,
            track_space_decisions: false,
            reconstruct_paragraphs: false,
            include_artifacts: false,
        }
    }
}
90
91/// Extracted text with position information
92#[derive(Debug, Clone)]
93pub struct ExtractedText {
94    /// The extracted text content
95    pub text: String,
96    /// Text fragments with position information (if preserve_layout is true)
97    pub fragments: Vec<TextFragment>,
98}
99
/// Record of one "insert a space here?" decision taken during extraction.
/// Filled in only when [`ExtractionOptions::track_space_decisions`] is `true`.
#[derive(Debug, Clone)]
pub struct SpaceDecision {
    /// Character offset within the extracted text.
    pub offset: usize,
    /// Measured horizontal gap (dx), in text space units.
    pub dx: f64,
    /// Threshold that was in effect at this point.
    pub threshold: f64,
    /// Confidence, `|dx - threshold| / threshold`, clamped to [0.0, 1.0].
    pub confidence: f64,
    /// `true` when a space was inserted.
    pub inserted: bool,
}
115
116/// A fragment of text with position information
117#[derive(Debug, Clone)]
118pub struct TextFragment {
119    /// Text content
120    pub text: String,
121    /// X position in page coordinates
122    pub x: f64,
123    /// Y position in page coordinates
124    pub y: f64,
125    /// Width of the text
126    pub width: f64,
127    /// Height of the text
128    pub height: f64,
129    /// Font size
130    pub font_size: f64,
131    /// Font name (if known) - used for kerning-aware text spacing
132    pub font_name: Option<String>,
133    /// Whether the font is bold (detected from font name)
134    pub is_bold: bool,
135    /// Whether the font is italic (detected from font name)
136    pub is_italic: bool,
137    /// Fill color of the text (from graphics state)
138    pub color: Option<Color>,
139    /// Space insertion decisions (empty unless `track_space_decisions` is true).
140    pub space_decisions: Vec<SpaceDecision>,
141    /// Marked-content identifier from the innermost ancestor BDC with `/MCID`
142    /// (issue #269 Phase 1). `None` for non-tagged PDFs, which preserves the
143    /// pre-Phase-1 grouping behavior (`None == None` collapses to legacy keys).
144    pub mcid: Option<u32>,
145    /// Structural tag of the owning BDC (e.g. `"P"`, `"H1"`, `"Figure"`,
146    /// `"Artifact"`). Set on the same ancestor that supplied `mcid`. Phase 3
147    /// will consume this for partitioner classification; Phase 1 only carries it.
148    pub struct_tag: Option<String>,
149}
150
/// A single frame of the marked-content stack kept by `TextState`.
///
/// The PDF marked-content operators (BDC/BMC/EMC) form a balanced LIFO stack
/// per content stream. A frame stores the tag (`"P"`, `"H1"`, `"Artifact"`,
/// …), the optional `MCID` used for fragment grouping, the optional
/// `/ActualText` substitution string, and an `is_artifact` flag that is
/// inherited from any ancestor (so a `/P` nested inside `/Artifact` is still
/// filtered out).
#[derive(Debug, Clone)]
struct MarkedContentEntry {
    /// BDC/BMC tag (e.g. `"P"`, `"Figure"`, `"Artifact"`, `"Span"`).
    tag: String,
    /// Value of `/MCID <int>` when the BDC props contain one.
    mcid: Option<u32>,
    /// Decoded `/ActualText (...)` when present. Decoding happens once at
    /// BDC time (UTF-16BE BOM detection in `decode_pdf_string`), not once
    /// per fragment.
    #[allow(dead_code)] // Task 9 reads this via pending_actualtext flush path
    actual_text: Option<String>,
    /// `true` when this frame's tag is `"Artifact"` OR some ancestor on the
    /// stack had `is_artifact == true` at push time. Thanks to this
    /// inheritance the emitter only has to look at the innermost frame to
    /// decide on filtering.
    is_artifact: bool,
}
175
176/// A pending ActualText run. Created when a BDC pushes an entry with
177/// `actual_text == Some(_)`; drained and emitted as a single synthetic
178/// `TextFragment` when the matching EMC pops the entry.
179///
180/// Spec §3a/§4 (collapse-on-EMC): per-`Tj` emission inside an ActualText
181/// scope is suppressed; on scope close we emit one fragment whose `text`
182/// is the substitution string, `x`/`y` is the first `Tj` origin, and
183/// `width` is the sum of suppressed text widths.
184#[derive(Debug, Clone)]
185struct PendingActualText {
186    /// Substitution text from the BDC's `/ActualText` (already decoded).
187    text: String,
188    /// Pen origin of the first suppressed `Tj` (page-space).
189    first_x: f64,
190    /// Same for Y.
191    first_y: f64,
192    /// Accumulated effective width of suppressed `Tj` runs.
193    width: f64,
194    /// Effective font size at the time the first `Tj` was suppressed.
195    font_size: f64,
196    /// Font name + style at first `Tj`. Set on first suppression.
197    font_name: Option<String>,
198    /// Bold/italic from the font name at first suppression.
199    is_bold: bool,
200    is_italic: bool,
201    /// Fill color at first suppression.
202    color: Option<Color>,
203    /// Depth in `mc_stack` at which this run was opened. When the entry at
204    /// this depth is popped, the pending run is flushed.
205    stack_depth: usize,
206    /// Whether a `Tj`/`TJ`/`'`/`"` has been observed yet inside the scope.
207    /// Until the first one fires, the run has no origin to record.
208    populated: bool,
209}
210
211/// Text extraction state
212struct TextState {
213    /// Current text matrix
214    text_matrix: [f64; 6],
215    /// Current text line matrix
216    text_line_matrix: [f64; 6],
217    /// Current transformation matrix (CTM)
218    ctm: [f64; 6],
219    /// Text leading (line spacing)
220    leading: f64,
221    /// Character spacing
222    char_space: f64,
223    /// Word spacing
224    word_space: f64,
225    /// Horizontal scaling
226    horizontal_scale: f64,
227    /// Text rise
228    text_rise: f64,
229    /// Current font size
230    font_size: f64,
231    /// Current font name
232    font_name: Option<String>,
233    /// Render mode (0 = fill, 1 = stroke, etc.)
234    render_mode: u8,
235    /// Fill color (for text rendering)
236    fill_color: Option<Color>,
237    /// Graphics state stack for `q`/`Q` operators. Each entry holds the CTM
238    /// and other graphics state items that the text extractor needs to restore.
239    /// Per PDF spec §8.4.4, `q` pushes the full graphics state and `Q` pops it;
240    /// here we save only the fields that influence text extraction.
241    saved_states: Vec<SavedGraphicsState>,
242    /// Marked-content stack (issue #269 Phase 1). Pushed on BMC/BDC,
243    /// popped on EMC. Empty on entry to each page.
244    mc_stack: Vec<MarkedContentEntry>,
245    /// Pending ActualText run if any BDC ancestor declared `/ActualText`.
246    /// At most one active run at a time — nested ActualText replaces the
247    /// outer (innermost wins, per spec §4).
248    pending_actualtext: Option<PendingActualText>,
249}
250
251/// Subset of graphics state saved by `q` and restored by `Q` (issue #262).
252#[derive(Clone)]
253struct SavedGraphicsState {
254    ctm: [f64; 6],
255    fill_color: Option<Color>,
256}
257
258/// Mutable accumulator threaded through `process_operations` so the op loop
259/// can be driven recursively (page content stream → Form XObjects) while
260/// carrying text state, position, and accumulated output. Bundled into one
261/// struct so the op match moves verbatim into the recursive method (#319).
262struct OpRunState {
263    state: TextState,
264    in_text_object: bool,
265    last_x: f64,
266    last_y: f64,
267    extracted_text: String,
268    fragments: Vec<TextFragment>,
269}
270
271impl Default for TextState {
272    fn default() -> Self {
273        Self {
274            text_matrix: [1.0, 0.0, 0.0, 1.0, 0.0, 0.0],
275            text_line_matrix: [1.0, 0.0, 0.0, 1.0, 0.0, 0.0],
276            ctm: [1.0, 0.0, 0.0, 1.0, 0.0, 0.0],
277            leading: 0.0,
278            char_space: 0.0,
279            word_space: 0.0,
280            horizontal_scale: 100.0,
281            text_rise: 0.0,
282            font_size: 0.0,
283            font_name: None,
284            render_mode: 0,
285            fill_color: None,
286            saved_states: Vec::new(),
287            mc_stack: Vec::new(),
288            pending_actualtext: None,
289        }
290    }
291}
292
/// Parse font style (bold/italic) from font name
///
/// Detects bold and italic styles from common font naming patterns.
/// Works with PostScript font names (e.g., "Helvetica-Bold", "Times-BoldItalic")
/// and TrueType names (e.g., "Arial Bold", "Courier Oblique").
///
/// Composite (Type 0) fonts are frequently named `<BaseFont>-Identity-H` or
/// `<BaseFont>-Identity-V`, i.e. the base font name followed by the CMap
/// name. That trailing CMap name says nothing about style, so it is removed
/// before the patterns are applied; otherwise the `-i` abbreviation rule
/// would report every such font as italic.
///
/// # Examples
///
/// ```
/// use oxidize_pdf::text::extraction::parse_font_style;
///
/// assert_eq!(parse_font_style("Helvetica-Bold"), (true, false));
/// assert_eq!(parse_font_style("Times-BoldItalic"), (true, true));
/// assert_eq!(parse_font_style("Courier"), (false, false));
/// assert_eq!(parse_font_style("Arial-Italic"), (false, true));
/// assert_eq!(parse_font_style("Ryumin-Light-Identity-H"), (false, false));
/// ```
///
/// # Returns
///
/// Tuple of (is_bold, is_italic)
pub fn parse_font_style(font_name: &str) -> (bool, bool) {
    // CMap-name suffixes (lowercase) that are not part of the style.
    const CMAP_SUFFIXES: [&str; 2] = ["-identity-h", "-identity-v"];

    let name_lower = font_name.to_lowercase();

    // Drop a trailing Identity CMap name so that "-identity" is not mistaken
    // for the "-i" italic abbreviation.
    let style_part = CMAP_SUFFIXES
        .iter()
        .find_map(|&suffix| name_lower.strip_suffix(suffix))
        .unwrap_or(name_lower.as_str());

    // Detect bold from common patterns
    let is_bold = style_part.contains("bold")
        || style_part.contains("-b")
        || style_part.contains(" b ")
        || style_part.ends_with(" b");

    // Detect italic/oblique from common patterns
    let is_italic = style_part.contains("italic")
        || style_part.contains("oblique")
        || style_part.contains("-i")
        || style_part.contains(" i ")
        || style_part.ends_with(" i");

    (is_bold, is_italic)
}
331
332/// Text extractor for PDF pages with CMap support
333pub struct TextExtractor {
334    options: ExtractionOptions,
335    /// Font cache for the current page (name-keyed, rebuilt per page since names are page-local)
336    font_cache: HashMap<String, FontInfo>,
337    /// Persistent font cache keyed by PDF object reference — avoids re-parsing the same font
338    /// object across pages. Most multi-page PDFs reuse the same font objects.
339    font_object_cache: HashMap<(u32, u16), FontInfo>,
340}
341
342impl TextExtractor {
343    /// Create a new text extractor with default options
344    pub fn new() -> Self {
345        Self {
346            options: ExtractionOptions::default(),
347            font_cache: HashMap::new(),
348            font_object_cache: HashMap::new(),
349        }
350    }
351
352    /// Create a text extractor with custom options
353    pub fn with_options(options: ExtractionOptions) -> Self {
354        Self {
355            options,
356            font_cache: HashMap::new(),
357            font_object_cache: HashMap::new(),
358        }
359    }
360
361    /// Run the full fragment-merge chain used by the partition pipeline:
362    /// kerning fix → line reconstruction → paragraph reconstruction.
363    ///
364    /// Honors `ExtractionOptions::reconstruct_paragraphs`: when `false`, only
365    /// `merge_close_fragments` (the kerning fix) runs and the input is
366    /// returned at fragment granularity.
367    ///
368    /// This method is `pub` so the integration test in
369    /// `tests/paragraph_reconstruction_test.rs` can exercise it without going
370    /// through a PDF file. Production callers should prefer
371    /// `PdfDocument::partition()` and friends, which use this internally.
372    pub fn merge_fragments_for_partition(&self, fragments: &[TextFragment]) -> Vec<TextFragment> {
373        let kerning_fixed = self.merge_close_fragments(fragments);
374        if !self.options.reconstruct_paragraphs {
375            return kerning_fixed;
376        }
377        let lines = self.merge_into_lines(&kerning_fixed);
378        self.merge_into_paragraphs(&lines)
379    }
380
381    /// Group fragments by baseline into single-line fragments.
382    ///
383    /// Two fragments are on the same line when their Y centers differ by less
384    /// than `0.2 * min(head.height, frag.height)`. The 0.2 ratio absorbs
385    /// sub-point baseline jitter from text-matrix arithmetic while keeping
386    /// tightly-spaced visual rows (e.g. table cells whose baselines are
387    /// separated by ~2-3pt at 9pt font) on distinct logical lines — see
388    /// issue #265.
389    ///
390    /// Fragments are grouped by `(row_id, Y_bucket, mcid)`, where `row_id`
391    /// comes from `assign_row_ids` (increments on Y-up-jumps in emission
392    /// order). Within a line the tie-break is emission index for tagged PDFs
393    /// (any fragment carries an mcid — ISO 32000 mandates logical order) and
394    /// X coordinate for non-tagged PDFs. A space is inserted between adjacent
395    /// fragments when the X gap exceeds `space_threshold * font_size`.
396    ///
397    /// The output bounding box for each line is the axis-aligned union of the
398    /// input fragments' bounding boxes; `font_size` and `font_name` are
399    /// inherited from the line's first fragment.
400    fn merge_into_lines(&self, fragments: &[TextFragment]) -> Vec<TextFragment> {
401        if fragments.is_empty() {
402            return Vec::new();
403        }
404
405        // Pre-pass: assign row_id from Y-up-jumps in emission order. This
406        // disambiguates columns in multi-column layouts where a single outer
407        // BDC makes mcid uniform across visually distinct columns. See
408        // `docs/superpowers/specs/2026-05-23-issue-265-line-interleaving-design.md`.
409        let row_ids = assign_row_ids(fragments);
410
411        // Whether this page has at least one tagged (mcid-carrying) fragment.
412        // `.any()` returns true if even one fragment has mcid=Some; the within-line
413        // tie-break then uses emission index for the whole page rather than X.
414        // See `docs/superpowers/specs/2026-05-23-issue-265-line-interleaving-design.md`.
415        //
416        // For tagged PDFs (PDF/UA, ISO 32000-2 tagged), the content stream delivers
417        // text in logical reading order, so within a visual line we preserve emission
418        // order rather than sorting by X. Out-of-left-to-right glyph placement
419        // (common in typeset tagged PDFs where the PDF author lays out glyphs via
420        // non-monotone Td/Tm operators) is correctly rendered by keeping emission order.
421        //
422        // For non-tagged PDFs (all mcid=None), we retain the X-sort fallback
423        // because many generators emit glyphs in arbitrary (often right-to-left
424        // or random) order and only the X coordinate gives reading order.
425        let is_tagged = fragments.iter().any(|f| f.mcid.is_some());
426
427        // Sort for line GROUPING only: row_id, then Y descending, then X.
428        // row_id keeps fragments from different visual rows in separate
429        // Y-bucket groups; Y descending puts higher-on-page lines first. The
430        // X tie-break only makes same-line fragments adjacent for grouping —
431        // the authoritative reading order WITHIN each line is decided per line
432        // below (#302 symptom 1), so this grouping order is not the final order.
433        let mut indexed: Vec<(u32, usize, &TextFragment)> = row_ids
434            .iter()
435            .copied()
436            .zip(fragments.iter().enumerate())
437            .map(|(rid, (idx, f))| (rid, idx, f))
438            .collect();
439        indexed.sort_by(|a, b| {
440            a.0.cmp(&b.0)
441                .then(b.2.y.total_cmp(&a.2.y))
442                .then(a.2.x.total_cmp(&b.2.x))
443        });
444
445        // Group into visual lines, carrying each fragment's emission index so
446        // the per-line ordering decision below can restore emission order.
447        let mut lines: Vec<Vec<(usize, &TextFragment)>> = Vec::new();
448        let mut last_seen_row_id: Option<u32> = None;
449        for (rid, idx, frag) in indexed {
450            let same_batch = last_seen_row_id == Some(rid);
451            let placed = same_batch
452                && lines.last_mut().is_some_and(|line| {
453                    let head = line[0].1;
454                    let tol = (head.height.min(frag.height)) * 0.2;
455                    (head.y - frag.y).abs() < tol && head.mcid == frag.mcid
456                });
457            if placed {
458                lines.last_mut().unwrap().push((idx, frag));
459            } else {
460                lines.push(vec![(idx, frag)]);
461                last_seen_row_id = Some(rid);
462            }
463        }
464
465        // Decide reading order per visual line (#302 symptom 1).
466        //
467        // X-sort is wrong when one line mixes fonts whose glyph metrics differ
468        // (e.g. an italic particle symbol set in roman body text): the producer
469        // gives the font-switched run an x-origin that falls INSIDE the x-span
470        // of its neighbours, so sorting by x interleaves it
471        // ("to the Z boson" -> "tZboso theon"). The content stream still emits
472        // these runs in correct reading order, so when a line's emission order
473        // has no DISJOINT backward x-step (only span overlaps, or is already
474        // x-monotone) we keep emission order. A disjoint backward step signals
475        // a genuinely scrambled stream (right-to-left / random generators), for
476        // which x-order stays authoritative. Deciding per line — not per
477        // column — prevents one scrambled line from forcing x-sort on the rest.
478        lines
479            .into_iter()
480            .map(|mut line| {
481                if is_tagged || line_prefers_emission_order(&line) {
482                    line.sort_by_key(|&(idx, _)| idx);
483                } else {
484                    line.sort_by(|a, b| a.1.x.total_cmp(&b.1.x));
485                }
486                let frags: Vec<&TextFragment> = line.into_iter().map(|(_, f)| f).collect();
487                self.build_line_fragment(frags)
488            })
489            .collect()
490    }
491
492    /// Space-glyph advance for `font_name` in text space (point units at
493    /// `font_size`), or `None` when unknown. Prefers the font's embedded
494    /// `/Widths` entry for code 32; falls back to the Adobe Core-14 AFM space
495    /// width for the standard base fonts (Times/Helvetica/Courier/Symbol/
496    /// ZapfDingbats), which ship no `/Widths` array (#302 symptom 2).
497    fn font_space_advance(&self, font_name: Option<&str>, font_size: f64) -> Option<f64> {
498        let info = self.font_cache.get(font_name?)?;
499        if let Some(ref widths) = info.metrics.widths {
500            let first = info.metrics.first_char.unwrap_or(0);
501            if first <= 32 {
502                if let Some(&w) = widths.get((32 - first) as usize) {
503                    if w > 0.0 {
504                        return Some(w / 1000.0 * font_size);
505                    }
506                }
507            }
508        }
509        standard_14_space_width(&info.name).map(|em| em / 1000.0 * font_size)
510    }
511
512    /// Minimum inter-fragment x-gap that counts as a word space for `frag`.
513    /// Anchored to the font's real space-glyph advance when known — word gaps
514    /// scale with the font's space metric, not with a fixed fraction of font
515    /// size — falling back to `space_threshold * font_size` otherwise. Tightly
516    /// set justified text (e.g. Standard-14 Times body) has word gaps near
517    /// 0.2em, far below the legacy 0.3*font_size, which dropped spaces
518    /// ("thequadrupletis"); a font with a 250-unit space then gets a 0.125em
519    /// threshold instead (#302 symptom 2).
520    fn space_gap_threshold(&self, frag: &TextFragment) -> f64 {
521        match self.font_space_advance(frag.font_name.as_deref(), frag.font_size) {
522            Some(adv) if adv > 0.0 => 0.5 * adv,
523            _ => self.options.space_threshold * frag.font_size,
524        }
525    }
526
527    /// Assemble one visual line's fragments into a single line `TextFragment`,
528    /// inserting a space between consecutive fragments whose x-gap exceeds the
529    /// font-anchored [`space_gap_threshold`](Self::space_gap_threshold).
530    fn build_line_fragment(&self, line: Vec<&TextFragment>) -> TextFragment {
531        let head = line[0];
532        let mut text = String::new();
533        let mut x_min = head.x;
534        let mut x_max = head.x + head.width;
535        let mut y_min = head.y;
536        let mut y_max = head.y + head.height;
537
538        for (i, frag) in line.iter().enumerate() {
539            if i > 0 {
540                let prev = line[i - 1];
541                let gap = frag.x - (prev.x + prev.width);
542                if gap > self.space_gap_threshold(frag) {
543                    text.push(' ');
544                }
545            }
546            text.push_str(&frag.text);
547            x_min = x_min.min(frag.x);
548            x_max = x_max.max(frag.x + frag.width);
549            y_min = y_min.min(frag.y);
550            y_max = y_max.max(frag.y + frag.height);
551        }
552
553        TextFragment {
554            text,
555            x: x_min,
556            y: y_min,
557            width: x_max - x_min,
558            height: y_max - y_min,
559            font_size: head.font_size,
560            font_name: head.font_name.clone(),
561            is_bold: head.is_bold,
562            is_italic: head.is_italic,
563            color: head.color,
564            space_decisions: Vec::new(),
565            mcid: head.mcid,
566            struct_tag: head.struct_tag.clone(),
567        }
568    }
569
570    /// Group consecutive lines into paragraphs based on vertical gap.
571    ///
572    /// Two consecutive lines are part of the same paragraph when the vertical
573    /// gap between them is less than 1.5× the median line height in the
574    /// input. Hyphenated line breaks (previous line ends with `-` and
575    /// `merge_hyphenated` is set) join without a separator and drop the
576    /// hyphen; otherwise lines join with `'\n'`.
577    fn merge_into_paragraphs(&self, lines: &[TextFragment]) -> Vec<TextFragment> {
578        if lines.is_empty() {
579            return Vec::new();
580        }
581
582        // Median line height — robust to outliers
583        let mut heights: Vec<f64> = lines.iter().map(|l| l.height).collect();
584        heights.sort_by(f64::total_cmp);
585        let median_h = heights[heights.len() / 2];
586        let max_paragraph_gap = median_h * 1.5;
587
588        let mut paragraphs: Vec<TextFragment> = Vec::new();
589        let mut current = lines[0].clone();
590
591        for line in &lines[1..] {
592            let prev_bottom = current.y;
593            let line_top = line.y + line.height;
594            let gap = prev_bottom - line_top;
595
596            if gap < 0.0 || gap > max_paragraph_gap || current.mcid != line.mcid {
597                paragraphs.push(current);
598                current = line.clone();
599                continue;
600            }
601
602            // Same paragraph — join
603            let joined_text = if self.options.merge_hyphenated && current.text.ends_with('-') {
604                let mut s = current.text.clone();
605                s.pop(); // drop trailing hyphen
606                s.push_str(&line.text);
607                s
608            } else {
609                format!("{}\n{}", current.text, line.text)
610            };
611
612            let x_min = current.x.min(line.x);
613            let x_max = (current.x + current.width).max(line.x + line.width);
614            let y_min = current.y.min(line.y);
615            let y_max = (current.y + current.height).max(line.y + line.height);
616
617            current = TextFragment {
618                text: joined_text,
619                x: x_min,
620                y: y_min,
621                width: x_max - x_min,
622                height: y_max - y_min,
623                font_size: current.font_size,
624                font_name: current.font_name.clone(),
625                is_bold: current.is_bold,
626                is_italic: current.is_italic,
627                color: current.color,
628                space_decisions: Vec::new(),
629                mcid: current.mcid,
630                struct_tag: current.struct_tag.clone(),
631            };
632        }
633        paragraphs.push(current);
634
635        paragraphs
636    }
637
638    /// Extract text from a PDF document
639    pub fn extract_from_document<R: Read + Seek>(
640        &mut self,
641        document: &PdfDocument<R>,
642    ) -> ParseResult<Vec<ExtractedText>> {
643        let page_count = document.page_count()?;
644        let mut results = Vec::new();
645
646        for i in 0..page_count {
647            let text = self.extract_from_page(document, i)?;
648            results.push(text);
649        }
650
651        Ok(results)
652    }
653
654    /// Extract text from a specific page
655    pub fn extract_from_page<R: Read + Seek>(
656        &mut self,
657        document: &PdfDocument<R>,
658        page_index: u32,
659    ) -> ParseResult<ExtractedText> {
660        // Get the page
661        let page = document.get_page(page_index)?;
662
663        // Extract font resources first
664        {
665            let _span = tracing::info_span!("font_resources").entered();
666            self.extract_font_resources(&page, document)?;
667        }
668
669        // Get content streams
670        let streams = {
671            let _span = tracing::info_span!("stream_decompress").entered();
672            page.content_streams_with_document(document)?
673        };
674
675        let extracted_text = String::new();
676        let fragments = Vec::new();
677        let state = TextState::default();
678        let in_text_object = false;
679        let last_x = 0.0;
680        let last_y = 0.0;
681
682        // Page resources (owned) for XObject + /Properties lookup during
683        // recursive Form XObject extraction (issue #319).
684        let page_resources: Option<crate::parser::objects::PdfDictionary> =
685            if let Some(rr) = page.dict.get("Resources").and_then(|o| o.as_reference()) {
686                document
687                    .get_object(rr.0, rr.1)
688                    .ok()
689                    .and_then(|o| o.as_dict().cloned())
690            } else {
691                page.get_resources().cloned()
692            };
693
694        let mut run = OpRunState {
695            state,
696            in_text_object,
697            last_x,
698            last_y,
699            extracted_text,
700            fragments,
701        };
702
703        // Process each content stream
704        for (stream_idx, stream_data) in streams.iter().enumerate() {
705            let operations = match {
706                let _span = tracing::info_span!("content_parse").entered();
707                ContentParser::parse_content(stream_data)
708            } {
709                Ok(ops) => ops,
710                Err(e) => {
711                    // Enhanced diagnostic logging for content stream parsing failures
712                    tracing::debug!(
713                        "Warning: Failed to parse content stream on page {}, stream {}/{}",
714                        page_index + 1,
715                        stream_idx + 1,
716                        streams.len()
717                    );
718                    tracing::debug!("         Error: {}", e);
719                    tracing::debug!("         Stream size: {} bytes", stream_data.len());
720
721                    // Show first 100 bytes for diagnosis (or less if stream is smaller)
722                    let preview_len = stream_data.len().min(100);
723                    let preview = String::from_utf8_lossy(&stream_data[..preview_len]);
724                    tracing::debug!(
725                        "         Stream preview (first {} bytes): {:?}",
726                        preview_len,
727                        preview.chars().take(80).collect::<String>()
728                    );
729
730                    // Continue processing other streams
731                    continue;
732                }
733            };
734
735            run = self.process_operations(
736                operations,
737                document,
738                page_resources.as_ref(),
739                run,
740                page_index,
741                0,
742            )?;
743        }
744
745        let OpRunState {
746            mut extracted_text,
747            mut fragments,
748            ..
749        } = run;
750        {
751            let _span = tracing::info_span!("layout_finalize").entered();
752
753            // Sort and process fragments if requested — but ONLY when we're not
754            // going to run merge_into_lines later. merge_into_lines does its
755            // own (row_id, y, x) sort that needs pre-sort emission order to
756            // detect Y-up-jumps for column splitting (issue #265). For the
757            // legacy path with reconstruct_paragraphs=false, the early sort is
758            // still required because nothing downstream reorders fragments.
759            if self.options.sort_by_position
760                && !self.options.reconstruct_paragraphs
761                && !fragments.is_empty()
762            {
763                self.sort_and_merge_fragments(&mut fragments);
764            }
765
766            // Merge close fragments to eliminate spacing artifacts (kerning fix)
767            if self.options.preserve_layout && !fragments.is_empty() {
768                fragments = self.merge_close_fragments(&fragments);
769            }
770
771            // Reconstruct visual lines and paragraphs from raw fragments.
772            // Required for the partition pipeline to produce Element values at
773            // paragraph granularity (issue #261).
774            if self.options.reconstruct_paragraphs && !fragments.is_empty() {
775                let lines = self.merge_into_lines(&fragments);
776                fragments = self.merge_into_paragraphs(&lines);
777            }
778
779            // Reconstruct text from sorted fragments if layout is preserved
780            if self.options.preserve_layout && !fragments.is_empty() {
781                extracted_text = self.reconstruct_text_from_fragments(&fragments);
782            }
783        }
784
785        Ok(ExtractedText {
786            text: extracted_text,
787            fragments,
788        })
789    }
790
791    /// Run a content-stream operation list, recursing into Form XObjects so
792    /// text drawn inside a `Do`-painted Form XObject is extracted (issue #319).
793    #[allow(clippy::too_many_arguments)]
794    fn process_operations<R: Read + Seek>(
795        &mut self,
796        operations: Vec<ContentOperation>,
797        document: &PdfDocument<R>,
798        resources: Option<&crate::parser::objects::PdfDictionary>,
799        run: OpRunState,
800        page_index: u32,
801        depth: u8,
802    ) -> ParseResult<OpRunState> {
803        let OpRunState {
804            mut state,
805            mut in_text_object,
806            mut last_x,
807            mut last_y,
808            mut extracted_text,
809            mut fragments,
810        } = run;
811
812        let page_properties: Option<&crate::parser::objects::PdfDictionary> =
813            resources.and_then(|res| match res.get("Properties") {
814                Some(crate::parser::objects::PdfObject::Dictionary(d)) => Some(d),
815                _ => None,
816            });
817
818        let _ops_span = tracing::info_span!("text_ops_loop").entered();
819        for op in operations {
820            match op {
821                ContentOperation::BeginText => {
822                    in_text_object = true;
823                    // Reset text matrix to identity
824                    state.text_matrix = [1.0, 0.0, 0.0, 1.0, 0.0, 0.0];
825                    state.text_line_matrix = [1.0, 0.0, 0.0, 1.0, 0.0, 0.0];
826                }
827
828                ContentOperation::EndText => {
829                    in_text_object = false;
830                }
831
832                ContentOperation::SetTextMatrix(a, b, c, d, e, f) => {
833                    state.text_matrix =
834                        [a as f64, b as f64, c as f64, d as f64, e as f64, f as f64];
835                    state.text_line_matrix =
836                        [a as f64, b as f64, c as f64, d as f64, e as f64, f as f64];
837                }
838
839                ContentOperation::MoveText(tx, ty) => {
840                    // Update text matrix by translation
841                    let new_matrix = multiply_matrix(
842                        &[1.0, 0.0, 0.0, 1.0, tx as f64, ty as f64],
843                        &state.text_line_matrix,
844                    );
845                    state.text_matrix = new_matrix;
846                    state.text_line_matrix = new_matrix;
847                }
848
849                ContentOperation::NextLine => {
850                    // Move to next line using current leading
851                    let new_matrix = multiply_matrix(
852                        &[1.0, 0.0, 0.0, 1.0, 0.0, -state.leading],
853                        &state.text_line_matrix,
854                    );
855                    state.text_matrix = new_matrix;
856                    state.text_line_matrix = new_matrix;
857                }
858
859                ContentOperation::ShowText(text) => {
860                    if in_text_object {
861                        let text_bytes = &text;
862                        let decoded = self.decode_text(text_bytes, &state)?;
863
864                        // Pen origin in user space = (CTM × text_matrix)(0, 0).
865                        let (x, y) = text_origin(&state);
866
867                        // Mirror the gate inside `emit_text_fragment` so that
868                        // `.text` and `.fragments` stay consistent for pages
869                        // wrapped in an `/Artifact` marked-content scope —
870                        // issue #330.
871                        let skip_text = skip_artifact_text(&state, self.options.include_artifacts);
872
873                        // Add spacing based on position change
874                        if !skip_text && !extracted_text.is_empty() {
875                            let dx = x - last_x;
876                            let dy = (y - last_y).abs();
877
878                            if dy > self.options.newline_threshold {
879                                extracted_text.push('\n');
880                            } else if dx > self.options.space_threshold * state.font_size {
881                                extracted_text.push(' ');
882                            }
883                        }
884
885                        if !skip_text {
886                            extracted_text.push_str(&decoded);
887                        }
888
889                        // Get font info for accurate width calculation.
890                        // Width comes from the char codes (`text_bytes`), not
891                        // the decoded Unicode: the Widths array is code-indexed
892                        // (issue #302).
893                        let text_width = {
894                            let font_info = state
895                                .font_name
896                                .as_ref()
897                                .and_then(|name| self.font_cache.get(name));
898                            calculate_text_width_from_codes(
899                                text_bytes,
900                                &decoded,
901                                state.font_size,
902                                font_info,
903                            )
904                        };
905
906                        if self.options.preserve_layout {
907                            emit_text_fragment(
908                                &mut fragments,
909                                &decoded,
910                                text_width,
911                                x,
912                                y,
913                                &mut state,
914                                self.options.include_artifacts,
915                            );
916                        }
917
918                        // Update position for next text
919                        last_x = x + text_width;
920                        last_y = y;
921
922                        // Update text matrix for next show operation
923                        let tx = text_width * state.horizontal_scale / 100.0;
924                        state.text_matrix =
925                            multiply_matrix(&[1.0, 0.0, 0.0, 1.0, tx, 0.0], &state.text_matrix);
926                    }
927                }
928
929                ContentOperation::ShowTextArray(array) => {
930                    if in_text_object {
931                        for item in array {
932                            match item {
933                                TextElement::Text(text_bytes) => {
934                                    let decoded = self.decode_text(&text_bytes, &state)?;
935                                    // Mirror the gate inside `emit_text_fragment`
936                                    // so `.text` and `.fragments` stay consistent
937                                    // for Artifact scopes (issue #330).
938                                    let skip_text =
939                                        skip_artifact_text(&state, self.options.include_artifacts);
940                                    if !skip_text {
941                                        extracted_text.push_str(&decoded);
942                                    }
943
944                                    let text_width = {
945                                        let font_info = state
946                                            .font_name
947                                            .as_ref()
948                                            .and_then(|name| self.font_cache.get(name));
949                                        calculate_text_width_from_codes(
950                                            &text_bytes,
951                                            &decoded,
952                                            state.font_size,
953                                            font_info,
954                                        )
955                                    };
956
957                                    if self.options.preserve_layout {
958                                        let (x, y) = text_origin(&state);
959                                        emit_text_fragment(
960                                            &mut fragments,
961                                            &decoded,
962                                            text_width,
963                                            x,
964                                            y,
965                                            &mut state,
966                                            self.options.include_artifacts,
967                                        );
968                                    }
969
970                                    let tx = text_width * state.horizontal_scale / 100.0;
971                                    state.text_matrix = multiply_matrix(
972                                        &[1.0, 0.0, 0.0, 1.0, tx, 0.0],
973                                        &state.text_matrix,
974                                    );
975                                }
976                                TextElement::Spacing(adjustment) => {
977                                    // Text position adjustment: a negative value widens the
978                                    // gap (shifts the pen forward). When the synthesised
979                                    // forward advance exceeds `tj_space_threshold * font_size`
980                                    // we treat the kern as an implicit `U+0020` (issue #272):
981                                    // many PDFs encode word breaks purely as wide negative
982                                    // kerns and never emit a literal space byte.
983                                    let tx = -(adjustment as f64) / 1000.0 * state.font_size;
984
985                                    let skip_tj_space =
986                                        skip_artifact_text(&state, self.options.include_artifacts);
987                                    if !skip_tj_space
988                                        && tx > self.options.tj_space_threshold * state.font_size
989                                        && !extracted_text.is_empty()
990                                        && !extracted_text.ends_with(' ')
991                                    {
992                                        extracted_text.push(' ');
993
994                                        // Skip the fragment-level emission while an
995                                        // ActualText scope is pending: the synthesised
996                                        // space is a heuristic, not real content, and
997                                        // emitting it would call `emit_text_fragment`
998                                        // whose ActualText short-circuit would inflate
999                                        // `pending.width` and set `pending.populated`
1000                                        // even though no real `Tj` has fired yet. The
1001                                        // EMC flush will supply the canonical fragment
1002                                        // text from the override (Phase 1 #269 contract).
1003                                        if self.options.preserve_layout
1004                                            && state.pending_actualtext.is_none()
1005                                        {
1006                                            // Emit a synthetic single-space fragment at the
1007                                            // current pen origin so downstream layout merges
1008                                            // (e.g. `merge_close_fragments`) see the gap as
1009                                            // explicit content rather than as a sub-threshold
1010                                            // x-jump. Width = the kern advance so the next
1011                                            // text fragment begins flush against it.
1012                                            let (sx, sy) = text_origin(&state);
1013                                            emit_text_fragment(
1014                                                &mut fragments,
1015                                                " ",
1016                                                tx,
1017                                                sx,
1018                                                sy,
1019                                                &mut state,
1020                                                self.options.include_artifacts,
1021                                            );
1022                                        }
1023                                    }
1024
1025                                    state.text_matrix = multiply_matrix(
1026                                        &[1.0, 0.0, 0.0, 1.0, tx, 0.0],
1027                                        &state.text_matrix,
1028                                    );
1029                                }
1030                            }
1031                        }
1032                    }
1033                }
1034
1035                ContentOperation::NextLineShowText(text) => {
1036                    if in_text_object {
1037                        // ' = T* then Tj string. Advance line matrix by -leading.
1038                        let new_matrix = multiply_matrix(
1039                            &[1.0, 0.0, 0.0, 1.0, 0.0, -state.leading],
1040                            &state.text_line_matrix,
1041                        );
1042                        state.text_matrix = new_matrix;
1043                        state.text_line_matrix = new_matrix;
1044
1045                        let decoded = self.decode_text(&text, &state)?;
1046                        let (x, y) = text_origin(&state);
1047
1048                        // Mirror the artifact gate (issue #330).
1049                        let skip_text = skip_artifact_text(&state, self.options.include_artifacts);
1050                        if !skip_text {
1051                            if !extracted_text.is_empty() {
1052                                extracted_text.push('\n');
1053                            }
1054                            extracted_text.push_str(&decoded);
1055                        }
1056
1057                        let text_width = {
1058                            let font_info = state
1059                                .font_name
1060                                .as_ref()
1061                                .and_then(|name| self.font_cache.get(name));
1062                            calculate_text_width_from_codes(
1063                                &text,
1064                                &decoded,
1065                                state.font_size,
1066                                font_info,
1067                            )
1068                        };
1069
1070                        if self.options.preserve_layout {
1071                            emit_text_fragment(
1072                                &mut fragments,
1073                                &decoded,
1074                                text_width,
1075                                x,
1076                                y,
1077                                &mut state,
1078                                self.options.include_artifacts,
1079                            );
1080                        }
1081
1082                        last_x = x + text_width;
1083                        last_y = y;
1084
1085                        let tx = text_width * state.horizontal_scale / 100.0;
1086                        state.text_matrix =
1087                            multiply_matrix(&[1.0, 0.0, 0.0, 1.0, tx, 0.0], &state.text_matrix);
1088                    }
1089                }
1090
1091                ContentOperation::SetSpacingNextLineShowText(word_space, char_space, text) => {
1092                    if in_text_object {
1093                        // " = aw Tw, ac Tc, then ' string. ISO 32000-1 §9.4.3.
1094                        // The variant fields mirror the spec field names:
1095                        // (word_spacing, char_spacing, text).
1096                        state.word_space = word_space as f64;
1097                        state.char_space = char_space as f64;
1098
1099                        let new_matrix = multiply_matrix(
1100                            &[1.0, 0.0, 0.0, 1.0, 0.0, -state.leading],
1101                            &state.text_line_matrix,
1102                        );
1103                        state.text_matrix = new_matrix;
1104                        state.text_line_matrix = new_matrix;
1105
1106                        let decoded = self.decode_text(&text, &state)?;
1107                        let (x, y) = text_origin(&state);
1108
1109                        // Mirror the artifact gate (issue #330).
1110                        let skip_text = skip_artifact_text(&state, self.options.include_artifacts);
1111                        if !skip_text {
1112                            if !extracted_text.is_empty() {
1113                                extracted_text.push('\n');
1114                            }
1115                            extracted_text.push_str(&decoded);
1116                        }
1117
1118                        let text_width = {
1119                            let font_info = state
1120                                .font_name
1121                                .as_ref()
1122                                .and_then(|name| self.font_cache.get(name));
1123                            calculate_text_width_from_codes(
1124                                &text,
1125                                &decoded,
1126                                state.font_size,
1127                                font_info,
1128                            )
1129                        };
1130
1131                        if self.options.preserve_layout {
1132                            emit_text_fragment(
1133                                &mut fragments,
1134                                &decoded,
1135                                text_width,
1136                                x,
1137                                y,
1138                                &mut state,
1139                                self.options.include_artifacts,
1140                            );
1141                        }
1142
1143                        last_x = x + text_width;
1144                        last_y = y;
1145
1146                        let tx = text_width * state.horizontal_scale / 100.0;
1147                        state.text_matrix =
1148                            multiply_matrix(&[1.0, 0.0, 0.0, 1.0, tx, 0.0], &state.text_matrix);
1149                    }
1150                }
1151
1152                ContentOperation::SetFont(name, size) => {
1153                    state.font_name = Some(name);
1154                    state.font_size = size as f64;
1155                }
1156
1157                ContentOperation::SetLeading(leading) => {
1158                    state.leading = leading as f64;
1159                }
1160
1161                ContentOperation::SetCharSpacing(spacing) => {
1162                    state.char_space = spacing as f64;
1163                }
1164
1165                ContentOperation::SetWordSpacing(spacing) => {
1166                    state.word_space = spacing as f64;
1167                }
1168
1169                ContentOperation::SetHorizontalScaling(scale) => {
1170                    state.horizontal_scale = scale as f64;
1171                }
1172
1173                ContentOperation::SetTextRise(rise) => {
1174                    state.text_rise = rise as f64;
1175                }
1176
1177                ContentOperation::SetTextRenderMode(mode) => {
1178                    state.render_mode = mode as u8;
1179                }
1180
1181                ContentOperation::SetTransformMatrix(a, b, c, d, e, f) => {
1182                    // Update CTM: new_ctm = concat_matrix * current_ctm
1183                    let [a0, b0, c0, d0, e0, f0] = state.ctm;
1184                    let a = a as f64;
1185                    let b = b as f64;
1186                    let c = c as f64;
1187                    let d = d as f64;
1188                    let e = e as f64;
1189                    let f = f as f64;
1190                    state.ctm = [
1191                        a * a0 + b * c0,
1192                        a * b0 + b * d0,
1193                        c * a0 + d * c0,
1194                        c * b0 + d * d0,
1195                        e * a0 + f * c0 + e0,
1196                        e * b0 + f * d0 + f0,
1197                    ];
1198                }
1199
1200                // Graphics state stack (issue #262). `q` snapshots the
1201                // current CTM and fill_color; `Q` restores the most recent
1202                // snapshot. Without these, every `cm` accumulates onto the
1203                // CTM forever, producing absurd page-space coordinates and
1204                // wrong font_size scaling on PDFs that nest graphics state.
1205                ContentOperation::SaveGraphicsState => {
1206                    state.saved_states.push(SavedGraphicsState {
1207                        ctm: state.ctm,
1208                        fill_color: state.fill_color,
1209                    });
1210                }
1211                ContentOperation::RestoreGraphicsState => {
1212                    if let Some(saved) = state.saved_states.pop() {
1213                        state.ctm = saved.ctm;
1214                        state.fill_color = saved.fill_color;
1215                    }
1216                    // Unbalanced Q (pop on empty stack) is silently ignored
1217                    // to keep extraction robust to malformed PDFs.
1218                }
1219
1220                // Color operations (Phase 4: Color extraction)
1221                ContentOperation::SetNonStrokingGray(gray) => {
1222                    state.fill_color = Some(Color::gray(gray as f64));
1223                }
1224
1225                ContentOperation::SetNonStrokingRGB(r, g, b) => {
1226                    state.fill_color = Some(Color::rgb(r as f64, g as f64, b as f64));
1227                }
1228
1229                ContentOperation::SetNonStrokingCMYK(c, m, y, k) => {
1230                    state.fill_color = Some(Color::cmyk(c as f64, m as f64, y as f64, k as f64));
1231                }
1232
1233                // Issue #269 Phase 1: marked-content operators
1234                ContentOperation::BeginMarkedContent(tag) => {
1235                    let parent_artifact = state.mc_stack.last().is_some_and(|e| e.is_artifact);
1236                    state.mc_stack.push(MarkedContentEntry {
1237                        is_artifact: tag == "Artifact" || parent_artifact,
1238                        tag,
1239                        mcid: None,
1240                        actual_text: None,
1241                    });
1242                }
1243
1244                ContentOperation::BeginMarkedContentWithProps(tag, props) => {
1245                    let parent_artifact = state.mc_stack.last().is_some_and(|e| e.is_artifact);
1246                    let (mcid, actual_text) = resolve_props(&props, page_properties);
1247
1248                    // If this scope declares ActualText, open a pending run that will be
1249                    // flushed on the matching EMC. Suppresses per-Tj emission inside the
1250                    // scope (innermost-ActualText-wins per spec §4).
1251                    if let Some(ref text) = actual_text {
1252                        state.pending_actualtext = Some(PendingActualText {
1253                            text: text.clone(),
1254                            first_x: 0.0,
1255                            first_y: 0.0,
1256                            width: 0.0,
1257                            font_size: state.font_size,
1258                            font_name: state.font_name.clone(),
1259                            is_bold: false, // overwritten on first Tj
1260                            is_italic: false,
1261                            color: state.fill_color,
1262                            stack_depth: state.mc_stack.len(), // BEFORE the push below
1263                            populated: false,
1264                        });
1265                    }
1266
1267                    state.mc_stack.push(MarkedContentEntry {
1268                        is_artifact: tag == "Artifact" || parent_artifact,
1269                        tag,
1270                        mcid,
1271                        actual_text,
1272                    });
1273                }
1274
1275                ContentOperation::EndMarkedContent => {
1276                    let popped_depth = state.mc_stack.len();
1277                    if state.mc_stack.pop().is_none() {
1278                        // Unbalanced EMC — log and ignore. Real PDFs occasionally emit
1279                        // dangling EMC (e.g. from incremental updates). We must not panic.
1280                        tracing::debug!(
1281                            "extraction: EMC with empty marked-content stack on page {}",
1282                            page_index + 1
1283                        );
1284                    } else if let Some(pending) = state.pending_actualtext.as_ref() {
1285                        // If we just closed the scope that opened the pending run, flush it.
1286                        if pending.stack_depth + 1 == popped_depth {
1287                            let run = state.pending_actualtext.take().unwrap();
1288                            if run.populated && self.options.preserve_layout {
1289                                let (mcid, struct_tag) = innermost_mc_tag(&state.mc_stack);
1290                                let in_artifact = state.mc_stack.iter().any(|e| e.is_artifact);
1291                                if !in_artifact || self.options.include_artifacts {
1292                                    fragments.push(TextFragment {
1293                                        text: run.text,
1294                                        x: run.first_x,
1295                                        y: run.first_y,
1296                                        width: run.width,
1297                                        height: run.font_size,
1298                                        font_size: run.font_size,
1299                                        font_name: run.font_name,
1300                                        is_bold: run.is_bold,
1301                                        is_italic: run.is_italic,
1302                                        color: run.color,
1303                                        space_decisions: Vec::new(),
1304                                        mcid,
1305                                        struct_tag,
1306                                    });
1307                                }
1308                            }
1309                        }
1310                    }
1311                }
1312
1313                ContentOperation::PaintXObject(name) => {
1314                    // Issue #319: recurse into Form XObjects. `Do` paints a
1315                    // Form XObject in an implicit q/Q, with the XObject's
1316                    // /Matrix composed onto the CTM and its own /Resources
1317                    // fonts in scope. Without this, text drawn inside the
1318                    // XObject (the page body, for RML2PDF "inclPDF" output)
1319                    // is never extracted.
1320                    const MAX_XOBJECT_DEPTH: u8 = 12;
1321                    if depth < MAX_XOBJECT_DEPTH {
1322                        if let Some((xobj_ops, xobj_res, matrix)) =
1323                            self.load_form_xobject(resources, &name, document)
1324                        {
1325                            let saved_ctm = state.ctm;
1326                            let saved_fill = state.fill_color;
1327                            let saved_stack = state.saved_states.len();
1328                            let saved_fonts = self.font_cache.clone();
1329
1330                            if let Some(m) = matrix {
1331                                let [a0, b0, c0, d0, e0, f0] = state.ctm;
1332                                let [a, b, c, d, e, f] = m;
1333                                state.ctm = [
1334                                    a * a0 + b * c0,
1335                                    a * b0 + b * d0,
1336                                    c * a0 + d * c0,
1337                                    c * b0 + d * d0,
1338                                    e * a0 + f * c0 + e0,
1339                                    e * b0 + f * d0 + f0,
1340                                ];
1341                            }
1342                            if let Some(ref xr) = xobj_res {
1343                                self.cache_fonts_from_resources::<R>(xr, document);
1344                            }
1345
1346                            let sub = OpRunState {
1347                                state,
1348                                in_text_object: false,
1349                                last_x,
1350                                last_y,
1351                                extracted_text,
1352                                fragments,
1353                            };
1354                            let mut out = self.process_operations(
1355                                xobj_ops,
1356                                document,
1357                                xobj_res.as_ref(),
1358                                sub,
1359                                page_index,
1360                                depth + 1,
1361                            )?;
1362
1363                            out.state.ctm = saved_ctm;
1364                            out.state.fill_color = saved_fill;
1365                            out.state.saved_states.truncate(saved_stack);
1366                            self.font_cache = saved_fonts;
1367
1368                            state = out.state;
1369                            last_x = out.last_x;
1370                            last_y = out.last_y;
1371                            extracted_text = out.extracted_text;
1372                            fragments = out.fragments;
1373                        }
1374                    }
1375                }
1376                _ => {
1377                    // Other operations don't affect text extraction
1378                }
1379            }
1380        }
1381
1382        Ok(OpRunState {
1383            state,
1384            in_text_object,
1385            last_x,
1386            last_y,
1387            extracted_text,
1388            fragments,
1389        })
1390    }
1391
1392    /// Load a Form XObject by name: parsed operations, resolved /Resources,
1393    /// and optional /Matrix. None for image XObjects or anything unparseable.
1394    fn load_form_xobject<R: Read + Seek>(
1395        &self,
1396        resources: Option<&crate::parser::objects::PdfDictionary>,
1397        name: &str,
1398        document: &PdfDocument<R>,
1399    ) -> Option<(
1400        Vec<ContentOperation>,
1401        Option<crate::parser::objects::PdfDictionary>,
1402        Option<[f64; 6]>,
1403    )> {
1404        use crate::parser::objects::PdfObject;
1405        let res = resources?;
1406        let xobjects = match res.get("XObject")? {
1407            PdfObject::Dictionary(d) => d.clone(),
1408            PdfObject::Reference(n, g) => match document.get_object(*n, *g).ok()? {
1409                PdfObject::Dictionary(d) => d,
1410                _ => return None,
1411            },
1412            _ => return None,
1413        };
1414        let (n, g) = xobjects.get(name)?.as_reference()?;
1415        let obj = document.get_object(n, g).ok()?;
1416        let stream = obj.as_stream()?;
1417        if stream
1418            .dict
1419            .get("Subtype")
1420            .and_then(|o| o.as_name())
1421            .map(|nm| nm.0.as_str())
1422            != Some("Form")
1423        {
1424            return None;
1425        }
1426        let data = stream.decode(&Default::default()).ok()?;
1427        let ops = ContentParser::parse_content(&data).ok()?;
1428        let xobj_res = match stream.dict.get("Resources") {
1429            Some(PdfObject::Dictionary(d)) => Some(d.clone()),
1430            Some(PdfObject::Reference(rn, rg)) => document
1431                .get_object(*rn, *rg)
1432                .ok()
1433                .and_then(|o| o.as_dict().cloned()),
1434            _ => None,
1435        };
1436        let matrix = stream
1437            .dict
1438            .get("Matrix")
1439            .and_then(|o| o.as_array())
1440            .and_then(|a| {
1441                if a.0.len() == 6 {
1442                    let mut m = [0.0f64; 6];
1443                    for (i, slot) in m.iter_mut().enumerate() {
1444                        *slot = a.0[i]
1445                            .as_real()
1446                            .or_else(|| a.0[i].as_integer().map(|x| x as f64))?;
1447                    }
1448                    Some(m)
1449                } else {
1450                    None
1451                }
1452            });
1453        Some((ops, xobj_res, matrix))
1454    }
1455
1456    /// Sort text fragments by position and merge them appropriately
1457    fn sort_and_merge_fragments(&self, fragments: &mut [TextFragment]) {
1458        // Sort fragments by Y position (top to bottom) then X position (left to right).
1459        //
1460        // We quantize Y into bands of `newline_threshold` width so that fragments
1461        // on the "same line" get identical Y keys. This ensures the comparator is
1462        // a strict total order (transitive), which Rust's sort algorithm requires.
1463        // Without quantization, threshold-based "same line" detection breaks
1464        // transitivity: A≈B and B≈C does NOT imply A≈C.
1465        let threshold = self.options.newline_threshold;
1466        fragments.sort_by(|a, b| {
1467            // Quantize Y to nearest band (PDF Y increases upward, so negate first)
1468            let band_a = if threshold > 0.0 {
1469                (-a.y / threshold).round()
1470            } else {
1471                -a.y
1472            };
1473            let band_b = if threshold > 0.0 {
1474                (-b.y / threshold).round()
1475            } else {
1476                -b.y
1477            };
1478
1479            // Compare by Y band (top to bottom), then by X within same band
1480            band_a.total_cmp(&band_b).then_with(|| a.x.total_cmp(&b.x))
1481        });
1482
1483        // Detect columns if requested
1484        if self.options.detect_columns {
1485            self.detect_and_sort_columns(fragments);
1486        }
1487    }
1488
1489    /// Detect columns and re-sort fragments accordingly
1490    fn detect_and_sort_columns(&self, fragments: &mut [TextFragment]) {
1491        // Group fragments by approximate Y position
1492        let mut lines: Vec<Vec<&mut TextFragment>> = Vec::new();
1493        let mut current_line: Vec<&mut TextFragment> = Vec::new();
1494        let mut last_y = f64::INFINITY;
1495
1496        for fragment in fragments.iter_mut() {
1497            let fragment_y = fragment.y;
1498            if (last_y - fragment_y).abs() > self.options.newline_threshold
1499                && !current_line.is_empty()
1500            {
1501                lines.push(current_line);
1502                current_line = Vec::new();
1503            }
1504            current_line.push(fragment);
1505            last_y = fragment_y;
1506        }
1507        if !current_line.is_empty() {
1508            lines.push(current_line);
1509        }
1510
1511        // Detect column boundaries
1512        let mut column_boundaries = vec![0.0];
1513        for line in &lines {
1514            if line.len() > 1 {
1515                for i in 0..line.len() - 1 {
1516                    let gap = line[i + 1].x - (line[i].x + line[i].width);
1517                    if gap > self.options.column_threshold {
1518                        let boundary = line[i].x + line[i].width + gap / 2.0;
1519                        if !column_boundaries
1520                            .iter()
1521                            .any(|&b| (b - boundary).abs() < 10.0)
1522                        {
1523                            column_boundaries.push(boundary);
1524                        }
1525                    }
1526                }
1527            }
1528        }
1529        column_boundaries.sort_by(|a, b| a.total_cmp(b));
1530
1531        // Re-sort fragments by column then Y position
1532        if column_boundaries.len() > 1 {
1533            fragments.sort_by(|a, b| {
1534                // Determine column for each fragment
1535                let col_a = column_boundaries
1536                    .iter()
1537                    .position(|&boundary| a.x < boundary)
1538                    .unwrap_or(column_boundaries.len())
1539                    - 1;
1540                let col_b = column_boundaries
1541                    .iter()
1542                    .position(|&boundary| b.x < boundary)
1543                    .unwrap_or(column_boundaries.len())
1544                    - 1;
1545
1546                if col_a != col_b {
1547                    col_a.cmp(&col_b)
1548                } else {
1549                    // Same column, sort by Y position
1550                    b.y.total_cmp(&a.y)
1551                }
1552            });
1553        }
1554    }
1555
1556    /// Reconstruct text from sorted fragments
1557    fn reconstruct_text_from_fragments(&self, fragments: &[TextFragment]) -> String {
1558        // First, merge consecutive fragments that are very close together
1559        let merged_fragments = self.merge_close_fragments(fragments);
1560
1561        let mut result = String::new();
1562        let mut last_y = f64::INFINITY;
1563        let mut last_x = 0.0;
1564        let mut last_line_ended_with_hyphen = false;
1565
1566        for fragment in &merged_fragments {
1567            // Check if we need a newline
1568            let y_diff = (last_y - fragment.y).abs();
1569            if !result.is_empty() && y_diff > self.options.newline_threshold {
1570                // Handle hyphenation
1571                if self.options.merge_hyphenated && last_line_ended_with_hyphen {
1572                    // Remove the hyphen and don't add newline
1573                    if result.ends_with('-') {
1574                        result.pop();
1575                    }
1576                } else {
1577                    result.push('\n');
1578                }
1579            } else if !result.is_empty() {
1580                // Check if we need a space
1581                let x_gap = fragment.x - last_x;
1582                if x_gap > self.options.space_threshold * fragment.font_size {
1583                    result.push(' ');
1584                }
1585            }
1586
1587            result.push_str(&fragment.text);
1588            last_line_ended_with_hyphen = fragment.text.ends_with('-');
1589            last_y = fragment.y;
1590            last_x = fragment.x + fragment.width;
1591        }
1592
1593        result
1594    }
1595
1596    /// Merge fragments that are very close together on the same line
1597    /// This fixes artifacts like "IN VO ICE" -> "INVOICE"
1598    fn merge_close_fragments(&self, fragments: &[TextFragment]) -> Vec<TextFragment> {
1599        if fragments.is_empty() {
1600            return Vec::new();
1601        }
1602
1603        let mut merged = Vec::new();
1604        let mut current = fragments[0].clone();
1605
1606        for fragment in &fragments[1..] {
1607            // Check if this fragment is on the same line and very close
1608            let y_diff = (current.y - fragment.y).abs();
1609            let x_gap = fragment.x - (current.x + current.width);
1610
1611            // Y-tolerance for same-line merging.
1612            //
1613            // Legacy path (`reconstruct_paragraphs=false`): fragments arrive
1614            // after `sort_and_merge_fragments` which quantizes Y into 10pt bands.
1615            // All same-band fragments share nearly identical Y, so 1.0pt is enough.
1616            //
1617            // Reconstruct-paragraphs path (`reconstruct_paragraphs=true`): fragments
1618            // arrive in emission order. Inline superscripts (e.g. citation numbers
1619            // raised via `Td` operators) have Y deltas of 3-4pt for 10pt body text.
1620            // Without a wider tolerance, each superscript becomes its own fragment
1621            // → line proliferation (issue #265 follow-up). Use 0.5 * font_size,
1622            // which captures typical superscript/subscript offsets (typically
1623            // 0.33-0.4 * font_size from baseline) and stays below the row_id
1624            // threshold (also 0.5 * font_size) so adjacent rows are not collapsed.
1625            let y_tol = if self.options.reconstruct_paragraphs {
1626                // Defend against malformed PDFs that emit text before any `Tf` font
1627                // operator (font_size=0 in TextState initial). 0.5 * 0 = 0 would
1628                // prevent any merge, even at identical Y. Fall back to the legacy
1629                // 1.0pt threshold in that case so the path is at least as forgiving
1630                // as the non-reconstruct path.
1631                let base = 0.5 * current.font_size.min(fragment.font_size);
1632                if base > 0.0 {
1633                    base
1634                } else {
1635                    1.0
1636                }
1637            } else {
1638                1.0
1639            };
1640
1641            let should_merge = y_diff < y_tol
1642                && x_gap >= 0.0  // Fragment is to the right
1643                && x_gap < fragment.font_size * 0.5 // Gap less than 50% of font size
1644                && current.mcid == fragment.mcid;
1645
1646            if should_merge {
1647                // Merge this fragment into current, preserving word boundaries
1648                // when the gap exceeds the font-anchored space threshold.
1649                if x_gap > self.space_gap_threshold(fragment) {
1650                    current.text.push(' ');
1651                }
1652                current.text.push_str(&fragment.text);
1653                current.width = (fragment.x + fragment.width) - current.x;
1654            } else {
1655                // Start a new fragment
1656                merged.push(current);
1657                current = fragment.clone();
1658            }
1659        }
1660
1661        merged.push(current);
1662        merged
1663    }
1664
1665    /// Extract font resources from page
1666    ///
1667    /// Clears the per-page name cache (font names are page-local in PDF), but
1668    /// reuses previously parsed font objects via `font_object_cache` to avoid
1669    /// re-parsing the same font object across multiple pages.
1670    fn extract_font_resources<R: Read + Seek>(
1671        &mut self,
1672        page: &ParsedPage,
1673        document: &PdfDocument<R>,
1674    ) -> ParseResult<()> {
1675        // Clear per-page name mapping (font names like /F1 are page-local)
1676        self.font_cache.clear();
1677
1678        // Try to get resources manually from page dictionary first
1679        // This is necessary because ParsedPage.get_resources() may not always work
1680        if let Some(res_ref) = page.dict.get("Resources").and_then(|o| o.as_reference()) {
1681            if let Ok(PdfObject::Dictionary(resources)) = document.get_object(res_ref.0, res_ref.1)
1682            {
1683                self.cache_fonts_from_resources::<R>(&resources, document);
1684            }
1685        } else if let Some(resources) = page.get_resources() {
1686            // Fallback to get_resources() if Resources is not a reference
1687            self.cache_fonts_from_resources::<R>(resources, document);
1688        }
1689
1690        Ok(())
1691    }
1692
1693    /// Cache every font declared in a page's `/Resources` `/Font` dictionary.
1694    ///
1695    /// `/Font` itself may be either an inline dictionary or an indirect
1696    /// reference (`/Font 191 0 R`); both are common in real PDFs (e.g. the
1697    /// ATLAS Higgs paper references it). Resolving the reference is required —
1698    /// otherwise the font cache stays empty, decoding loses ToUnicode, and
1699    /// glyph widths fall back to a flat estimate that scrambles multi-column
1700    /// layout (issue #302).
1701    fn cache_fonts_from_resources<R: Read + Seek>(
1702        &mut self,
1703        resources: &PdfDictionary,
1704        document: &PdfDocument<R>,
1705    ) {
1706        let font_dict = match resources.get("Font") {
1707            Some(PdfObject::Dictionary(dict)) => Some(dict.clone()),
1708            Some(PdfObject::Reference(num, gen)) => match document.get_object(*num, *gen) {
1709                Ok(PdfObject::Dictionary(dict)) => Some(dict),
1710                _ => None,
1711            },
1712            _ => None,
1713        };
1714
1715        if let Some(font_dict) = font_dict {
1716            for (font_name, font_obj) in font_dict.0.iter() {
1717                if let Some(font_ref) = font_obj.as_reference() {
1718                    self.cache_font_by_ref::<R>(&font_name.0, font_ref, document);
1719                }
1720            }
1721        }
1722    }
1723
1724    /// Cache a font, reusing the persistent object cache when possible.
1725    fn cache_font_by_ref<R: Read + Seek>(
1726        &mut self,
1727        font_name: &str,
1728        font_ref: (u32, u16),
1729        document: &PdfDocument<R>,
1730    ) {
1731        // Check persistent object cache first — avoids re-parsing across pages
1732        if let Some(cached) = self.font_object_cache.get(&font_ref) {
1733            self.font_cache
1734                .insert(font_name.to_string(), cached.clone());
1735            tracing::debug!(
1736                "Reused cached font object ({}, {}): {} (ToUnicode: {})",
1737                font_ref.0,
1738                font_ref.1,
1739                font_name,
1740                cached.to_unicode.is_some()
1741            );
1742            return;
1743        }
1744
1745        // Parse font object
1746        if let Ok(PdfObject::Dictionary(font_dict)) = document.get_object(font_ref.0, font_ref.1) {
1747            let mut cmap_extractor: CMapTextExtractor<R> = CMapTextExtractor::new();
1748            if let Ok(font_info) = cmap_extractor.extract_font_info(&font_dict, document) {
1749                let has_to_unicode = font_info.to_unicode.is_some();
1750                // Store in persistent cache
1751                self.font_object_cache.insert(font_ref, font_info.clone());
1752                // Store in per-page name cache
1753                self.font_cache.insert(font_name.to_string(), font_info);
1754                tracing::debug!(
1755                    "Parsed and cached font ({}, {}): {} (ToUnicode: {})",
1756                    font_ref.0,
1757                    font_ref.1,
1758                    font_name,
1759                    has_to_unicode
1760                );
1761            }
1762        }
1763    }
1764
1765    /// Decode text using the current font encoding and ToUnicode mapping
1766    fn decode_text(&self, text: &[u8], state: &TextState) -> ParseResult<String> {
1767        use crate::text::encoding::TextEncoding;
1768
1769        // First, try to use cached font information with ToUnicode CMap
1770        if let Some(ref font_name) = state.font_name {
1771            if let Some(font_info) = self.font_cache.get(font_name) {
1772                // Try CMap-based decoding first (free function — no allocation)
1773                if let Ok(decoded) =
1774                    crate::text::extraction_cmap::decode_text_with_font(text, font_info)
1775                {
1776                    // Only accept if we got meaningful text (not all null bytes or garbage)
1777                    if !decoded.trim().is_empty()
1778                        && !decoded.chars().all(|c| c == '\0' || c.is_ascii_control())
1779                    {
1780                        // Apply sanitization to remove control characters (Issue #116)
1781                        let sanitized = sanitize_extracted_text(&decoded);
1782                        tracing::debug!(
1783                            "Successfully decoded text using CMap for font {}: {:?} -> \"{}\"",
1784                            font_name,
1785                            text,
1786                            sanitized
1787                        );
1788                        return Ok(sanitized);
1789                    }
1790                }
1791
1792                tracing::debug!(
1793                    "CMap decoding failed or produced garbage for font {}, falling back to encoding",
1794                    font_name
1795                );
1796            }
1797        }
1798
1799        // Fall back to encoding-based decoding
1800        let encoding = if let Some(ref font_name) = state.font_name {
1801            match font_name.to_lowercase().as_str() {
1802                name if name.contains("macroman") => TextEncoding::MacRomanEncoding,
1803                name if name.contains("winansi") => TextEncoding::WinAnsiEncoding,
1804                name if name.contains("standard") => TextEncoding::StandardEncoding,
1805                name if name.contains("pdfdoc") => TextEncoding::PdfDocEncoding,
1806                _ => {
1807                    // Default based on common patterns
1808                    if font_name.starts_with("Times")
1809                        || font_name.starts_with("Helvetica")
1810                        || font_name.starts_with("Courier")
1811                    {
1812                        TextEncoding::WinAnsiEncoding // Most common for standard fonts
1813                    } else {
1814                        TextEncoding::PdfDocEncoding // Safe default
1815                    }
1816                }
1817            }
1818        } else {
1819            TextEncoding::WinAnsiEncoding // Default for most PDFs
1820        };
1821
1822        let fallback_result = encoding.decode(text);
1823        // Apply sanitization to remove control characters (Issue #116)
1824        let sanitized = sanitize_extracted_text(&fallback_result);
1825        tracing::debug!(
1826            "Fallback encoding decoding: {:?} -> \"{}\"",
1827            text,
1828            sanitized
1829        );
1830        Ok(sanitized)
1831    }
1832}
1833
1834impl Default for TextExtractor {
1835    fn default() -> Self {
1836        Self::new()
1837    }
1838}
1839
1840/// Emit a `TextFragment` for one decoded text-show event under `preserve_layout`.
1841///
1842/// Encapsulates the style-derivation + push sequence shared by every
1843/// text-show operator handler in `extract_from_page` (`Tj`, `TJ`, `'`,
1844/// `"`). The caller supplies the pen origin `(x, y)` already mapped to
1845/// user space (typically via `text_origin(&state)`); doing so avoids the
1846/// double `multiply_matrix + transform_point` that prior versions did
1847/// (handler computed it for `last_x`/`last_y`, then this fn recomputed
1848/// it on the same `state`).
1849///
1850/// Skips emission when an ancestor in the marked-content stack is `/Artifact`
1851/// and `include_artifacts` is false. When a pending ActualText run is
1852/// active in the current scope, accumulates the text-width contribution and
1853/// records the first origin instead of pushing a fragment (the run is flushed
1854/// once on EMC, see Task 8's EndMarkedContent handler).
1855///
1856/// `mcid` and `struct_tag` come from the innermost ancestor on the stack that
1857/// declared `/MCID`; non-tagged content leaves both as `None`.
1858/// Whether the current marked-content stack should suppress text emission.
1859///
1860/// Mirrors the gate inside [`emit_text_fragment`]: when an ancestor in the
1861/// stack is `/Artifact` and the caller has not opted into artifact content
1862/// via `include_artifacts`, neither `.text` nor `.fragments` should receive
1863/// the run. Used by the four show-text operator arms to keep `extracted_text`
1864/// and `fragments` symmetric — a page whose entire content is an
1865/// `/Artifact BMC … EMC` scope (the common pattern for screen-reader-skipped
1866/// disclaimers / footers / decorative tagged-PDF content) used to surface
1867/// text in `.text` while leaving `.fragments` empty, silently dropping the
1868/// page from `partition_with(...)` / `rag_chunks(...)` (issue #330).
1869fn skip_artifact_text(state: &TextState, include_artifacts: bool) -> bool {
1870    !include_artifacts && state.mc_stack.iter().any(|e| e.is_artifact)
1871}
1872
1873fn emit_text_fragment(
1874    fragments: &mut Vec<TextFragment>,
1875    decoded: &str,
1876    text_width: f64,
1877    x: f64,
1878    y: f64,
1879    state: &mut TextState,
1880    include_artifacts: bool,
1881) {
1882    if decoded.is_empty() {
1883        return;
1884    }
1885
1886    // Artifact filter (default: skip emission for Artifact subtrees).
1887    if !include_artifacts && state.mc_stack.iter().any(|e| e.is_artifact) {
1888        return;
1889    }
1890
1891    let (is_bold, is_italic) = state
1892        .font_name
1893        .as_ref()
1894        .map(|name| parse_font_style(name))
1895        .unwrap_or((false, false));
1896
1897    // Issue #262: font_size, height, and width must be in page space so that
1898    // downstream heuristics (line/paragraph reconstruction, header/footer zone
1899    // detection, table detection) reason about real geometry. `x` and `y` are
1900    // already page-space (caller transforms via `text_origin`); we still need
1901    // to scale the size/width fields by the combined `text_matrix × CTM`.
1902    let combined = multiply_matrix(&state.text_matrix, &state.ctm);
1903    let x_scale = (combined[0] * combined[0] + combined[1] * combined[1]).sqrt();
1904    let y_scale = (combined[2] * combined[2] + combined[3] * combined[3]).sqrt();
1905    let effective_width = text_width * x_scale;
1906    let effective_size = state.font_size * y_scale;
1907
1908    // If a pending ActualText run is active in the current scope, accumulate
1909    // into it instead of emitting a fragment now. The run is flushed on the
1910    // matching EMC by the EndMarkedContent arm (Task 8).
1911    // Hoist font_name/fill_color reads before taking &mut on pending_actualtext
1912    // to avoid borrow-checker conflicts with the disjoint fields.
1913    let local_font_name = state.font_name.clone();
1914    let local_fill_color = state.fill_color;
1915    if let Some(pending) = state.pending_actualtext.as_mut() {
1916        if !pending.populated {
1917            pending.first_x = x;
1918            pending.first_y = y;
1919            pending.font_size = effective_size;
1920            pending.font_name = local_font_name;
1921            pending.is_bold = is_bold;
1922            pending.is_italic = is_italic;
1923            pending.color = local_fill_color;
1924            pending.populated = true;
1925        }
1926        pending.width += effective_width;
1927        return;
1928    }
1929
1930    let (mcid, struct_tag) = innermost_mc_tag(&state.mc_stack);
1931
1932    fragments.push(TextFragment {
1933        text: decoded.to_owned(),
1934        x,
1935        y,
1936        width: effective_width,
1937        height: effective_size,
1938        font_size: effective_size,
1939        font_name: state.font_name.clone(),
1940        is_bold,
1941        is_italic,
1942        color: state.fill_color,
1943        space_decisions: Vec::new(),
1944        mcid,
1945        struct_tag,
1946    });
1947}
1948
1949/// Pen origin (user-space coordinates) of the next glyph in the current
1950/// text state.
1951///
1952/// Per ISO 32000-1 §8.3.4, the text rendering matrix is `Tm × CTM` (row-vector
1953/// convention). `multiply_matrix(a, b)` returns the matrix that applies `a`
1954/// first and then `b`, so the correct composition is
1955/// `multiply_matrix(text_matrix, ctm)`. Prior to issue #262 this used the
1956/// reverse order which gave correct results only when the CTM was an identity
1957/// or pure-translation matrix; non-uniform CTM scaling produced wrong origins.
1958fn text_origin(state: &TextState) -> (f64, f64) {
1959    let combined = multiply_matrix(&state.text_matrix, &state.ctm);
1960    transform_point(0.0, 0.0, &combined)
1961}
1962
/// Compose two PDF affine matrices given as `[a, b, c, d, e, f]`.
///
/// The result applies `a` first and `b` second (row-vector convention), i.e.
/// it is the product `a × b` of the corresponding 3×3 matrices.
fn multiply_matrix(a: &[f64; 6], b: &[f64; 6]) -> [f64; 6] {
    let [a0, a1, a2, a3, a4, a5] = *a;
    let [b0, b1, b2, b3, b4, b5] = *b;
    [
        // Linear part.
        a0 * b0 + a1 * b2,
        a0 * b1 + a1 * b3,
        a2 * b0 + a3 * b2,
        a2 * b1 + a3 * b3,
        // Translation: `a`'s offset mapped through `b`, plus `b`'s offset.
        a4 * b0 + a5 * b2 + b4,
        a4 * b1 + a5 * b3 + b5,
    ]
}
1974
/// Decode a PDF string operand into Rust `String`.
///
/// PDF strings inside marked-content properties (notably `/ActualText`)
/// may be encoded as:
///
/// - **UTF-16BE with BOM**: leading `0xFE 0xFF`, then big-endian 16-bit
///   code units. This is the canonical encoding for non-ASCII ActualText
///   (e.g. `fi` ligature, Greek/math symbols). Decoded via
///   `String::from_utf16_lossy` so invalid surrogate pairs become `U+FFFD`
///   rather than panicking. A trailing odd byte is ignored.
/// - **UTF-8 with BOM** (PDF 2.0): leading `0xEF 0xBB 0xBF`, then UTF-8.
///   Decoded via `String::from_utf8_lossy`, so malformed sequences become
///   `U+FFFD`. The BOM itself is not part of the result.
/// - **PDFDocEncoding** (the catch-all for non-BOM bytes). For the ASCII
///   subset (0x20-0x7E) PDFDocEncoding is identical to Latin-1. We
///   conservatively map byte-by-byte to `char`. A future revision can
///   plug in the full PDFDocEncoding table if a real PDF emerges with
///   high-bit characters in ActualText *without* a BOM (rare; most
///   producers emit a BOM when going outside ASCII).
fn decode_pdf_string(bytes: &[u8]) -> String {
    const UTF16BE_BOM: [u8; 2] = [0xFE, 0xFF];
    const UTF8_BOM: [u8; 3] = [0xEF, 0xBB, 0xBF];

    if let Some(payload) = bytes.strip_prefix(&UTF16BE_BOM[..]) {
        // `chunks_exact` drops an incomplete final code unit, if any.
        let code_units: Vec<u16> = payload
            .chunks_exact(2)
            .map(|pair| u16::from_be_bytes([pair[0], pair[1]]))
            .collect();
        String::from_utf16_lossy(&code_units)
    } else if let Some(payload) = bytes.strip_prefix(&UTF8_BOM[..]) {
        String::from_utf8_lossy(payload).into_owned()
    } else {
        // Latin-1 style mapping: each byte becomes the code point of equal value.
        bytes.iter().map(|&b| char::from(b)).collect()
    }
}
2003
2004/// Resolve a `MarkedContentProps` to `(mcid, actual_text)`.
2005///
2006/// For `Inline` props, walk the map: `/MCID` (Integer, must fit in `u32`)
2007/// becomes `mcid`; `/ActualText` (String) is decoded via `decode_pdf_string`.
2008///
2009/// For `ResourceRef(name)`, look up `properties.get(name)`. If found and
2010/// it's a Dictionary, extract `/MCID` and `/ActualText` from there. If
2011/// not found (or the named entry is not a dict), return `(None, None)`
2012/// — a malformed reference must not abort extraction.
2013fn resolve_props(
2014    props: &crate::parser::content::MarkedContentProps,
2015    properties: Option<&crate::parser::objects::PdfDictionary>,
2016) -> (Option<u32>, Option<String>) {
2017    use crate::parser::content::{MarkedContentProps, MarkedContentValue};
2018
2019    let map_mcid_actual =
2020        |map: &std::collections::HashMap<String, MarkedContentValue>| -> (Option<u32>, Option<String>) {
2021            let mcid = match map.get("MCID") {
2022                Some(MarkedContentValue::Integer(n)) if *n >= 0 && *n <= u32::MAX as i64 => {
2023                    Some(*n as u32)
2024                }
2025                _ => None,
2026            };
2027            let actual = match map.get("ActualText") {
2028                Some(MarkedContentValue::String(bytes)) => Some(decode_pdf_string(bytes)),
2029                _ => None,
2030            };
2031            (mcid, actual)
2032        };
2033
2034    match props {
2035        MarkedContentProps::Inline(map) => map_mcid_actual(map),
2036        MarkedContentProps::ResourceRef(name) => {
2037            let Some(properties) = properties else {
2038                return (None, None);
2039            };
2040            let Some(entry) = properties.get(name) else {
2041                return (None, None);
2042            };
2043            let crate::parser::objects::PdfObject::Dictionary(dict) = entry else {
2044                return (None, None);
2045            };
2046            let mcid = dict.get("MCID").and_then(|o| match o {
2047                crate::parser::objects::PdfObject::Integer(n)
2048                    if *n >= 0 && *n <= u32::MAX as i64 =>
2049                {
2050                    Some(*n as u32)
2051                }
2052                _ => None,
2053            });
2054            let actual_text = dict.get("ActualText").and_then(|o| match o {
2055                crate::parser::objects::PdfObject::String(s) => {
2056                    Some(decode_pdf_string(s.as_bytes()))
2057                }
2058                _ => None,
2059            });
2060            (mcid, actual_text)
2061        }
2062    }
2063}
2064
2065/// Walk the marked-content stack from innermost (top) outward, returning the
2066/// first entry's `(mcid, tag)` pair whose `mcid` is `Some`. Returns
2067/// `(None, None)` when no ancestor declared an MCID — typical of non-tagged
2068/// PDFs, in which case the `None == None` grouping-key invariant preserves
2069/// legacy behaviour.
2070fn innermost_mc_tag(stack: &[MarkedContentEntry]) -> (Option<u32>, Option<String>) {
2071    stack
2072        .iter()
2073        .rev()
2074        .find(|e| e.mcid.is_some())
2075        .map_or((None, None), |e| (e.mcid, Some(e.tag.clone())))
2076}
2077
/// Apply a 2-D affine transform to the point `(x, y)`.
///
/// `matrix` is read as `[a, b, c, d, e, f]` and the result is
/// `(a*x + c*y + e, b*x + d*y + f)`.
fn transform_point(x: f64, y: f64, matrix: &[f64; 6]) -> (f64, f64) {
    let [a, b, c, d, e, f] = *matrix;
    (a * x + c * y + e, b * x + d * y + f)
}
2084
2085/// Calculate text width using actual font metrics (including kerning)
2086fn calculate_text_width(text: &str, font_size: f64, font_info: Option<&FontInfo>) -> f64 {
2087    // If we have font metrics, use them for accurate width calculation
2088    if let Some(font) = font_info {
2089        if let Some(ref widths) = font.metrics.widths {
2090            let first_char = font.metrics.first_char.unwrap_or(0);
2091            let last_char = font.metrics.last_char.unwrap_or(255);
2092            let missing_width = font.metrics.missing_width.unwrap_or(500.0);
2093
2094            let mut total_width = 0.0;
2095            let mut chars = text.chars().peekable();
2096
2097            while let Some(ch) = chars.next() {
2098                let char_code = ch as u32;
2099
2100                // Get width from Widths array or use missing_width
2101                let width = if char_code >= first_char && char_code <= last_char {
2102                    let index = (char_code - first_char) as usize;
2103                    widths.get(index).copied().unwrap_or(missing_width)
2104                } else {
2105                    missing_width
2106                };
2107
2108                // Convert from glyph space (1/1000 units) to user space
2109                total_width += width / 1000.0 * font_size;
2110
2111                // Apply kerning if available (for character pairs)
2112                if let Some(ref kerning) = font.metrics.kerning {
2113                    if let Some(&next_ch) = chars.peek() {
2114                        let next_char = next_ch as u32;
2115                        if let Some(&kern_value) = kerning.get(&(char_code, next_char)) {
2116                            // Kerning is in FUnits (1/1000), convert to user space
2117                            total_width += kern_value / 1000.0 * font_size;
2118                        }
2119                    }
2120                }
2121            }
2122
2123            return total_width;
2124        }
2125    }
2126
2127    // Fallback to simplified calculation if no metrics available
2128    text.len() as f64 * font_size * 0.5
2129}
2130
2131/// Compute advance width from the original character **codes**, not the decoded
2132/// Unicode text.
2133///
2134/// A simple font's `Widths` array is indexed by character code (`first_char..=
2135/// last_char`), i.e. the byte value in the content stream — not by the Unicode
2136/// codepoint the code decodes to. [`calculate_text_width`] indexes by the decoded
2137/// codepoint (`ch as u32`), which is correct only when code == codepoint (ASCII /
2138/// WinAnsi fonts). For custom-encoded fonts (Type1 with `Differences`, embedded
2139/// Computer Modern in LaTeX PDFs, ToUnicode remaps) the codepoint diverges from
2140/// the code, so the wrong slot — or `missing_width` — is read, desyncing glyph
2141/// advance and scrambling word order once fragments are sorted by position
2142/// (issue #302).
2143///
2144/// `decoded` is the already-decoded text for this run; it is only consulted for
2145/// composite (Type0) fonts, whose multi-byte codes cannot be indexed byte-wise
2146/// and whose width path is unchanged here to avoid regressing CJK extraction.
2147fn calculate_text_width_from_codes(
2148    codes: &[u8],
2149    decoded: &str,
2150    font_size: f64,
2151    font_info: Option<&FontInfo>,
2152) -> f64 {
2153    // Composite (Type0) fonts use multi-byte codes; a single byte is not a code,
2154    // so byte-indexed width lookup is invalid. Preserve the existing decoded-based
2155    // behavior for them.
2156    let is_composite =
2157        font_info.is_some_and(|f| f.font_type == "Type0" || f.descendant_font.is_some());
2158    if is_composite {
2159        return calculate_text_width(decoded, font_size, font_info);
2160    }
2161
2162    if let Some(font) = font_info {
2163        if let Some(ref widths) = font.metrics.widths {
2164            let first_char = font.metrics.first_char.unwrap_or(0);
2165            let last_char = font.metrics.last_char.unwrap_or(255);
2166            let missing_width = font.metrics.missing_width.unwrap_or(500.0);
2167
2168            let mut total_width = 0.0;
2169            let mut iter = codes.iter().peekable();
2170            while let Some(&byte) = iter.next() {
2171                let code = byte as u32;
2172                let width = if code >= first_char && code <= last_char {
2173                    widths
2174                        .get((code - first_char) as usize)
2175                        .copied()
2176                        .unwrap_or(missing_width)
2177                } else {
2178                    missing_width
2179                };
2180                total_width += width / 1000.0 * font_size;
2181
2182                // Kerning is keyed by code pair, consistent with code-based widths.
2183                if let Some(ref kerning) = font.metrics.kerning {
2184                    if let Some(&next_byte) = iter.peek() {
2185                        if let Some(&kern_value) = kerning.get(&(code, *next_byte as u32)) {
2186                            total_width += kern_value / 1000.0 * font_size;
2187                        }
2188                    }
2189                }
2190            }
2191
2192            return total_width;
2193        }
2194    }
2195
2196    // No metrics: one fallback width per code (byte), the simple-font glyph count.
2197    codes.len() as f64 * font_size * 0.5
2198}
2199
/// Clean up extracted text by replacing or dropping control characters.
///
/// This function addresses Issue #116 where extracted text contains NUL bytes (`\0`)
/// and ETX characters (`\u{3}`) where spaces should appear.
///
/// # Behavior
///
/// - `\0` (NUL) becomes a space
/// - `\u{3}` (ETX) is dropped wherever it occurs, so the common `\0\u{3}`
///   word-separator pattern yields exactly one space
/// - Other ASCII control characters (0x01-0x1F and 0x7F) are removed, except:
///   - `\t` (0x09) - Tab
///   - `\n` (0x0A) - Line feed
///   - `\r` (0x0D) - Carriage return
/// - Consecutive spaces (including those produced from NUL) collapse into a
///   single space; a space directly following a tab is dropped as well
///
/// # Examples
///
/// ```
/// use oxidize_pdf::text::extraction::sanitize_extracted_text;
///
/// // Issue #116 pattern: NUL+ETX as word separator
/// let dirty = "a\0\u{3}sergeant\0\u{3}and";
/// assert_eq!(sanitize_extracted_text(dirty), "a sergeant and");
///
/// // Standalone NUL becomes space
/// let with_nul = "word\0another";
/// assert_eq!(sanitize_extracted_text(with_nul), "word another");
///
/// // Clean text passes through unchanged
/// let clean = "Normal text";
/// assert_eq!(sanitize_extracted_text(clean), "Normal text");
/// ```
pub fn sanitize_extracted_text(text: &str) -> String {
    // Output never grows beyond the input, so its length is a safe capacity.
    let mut cleaned = String::with_capacity(text.len());
    // True while the next space must be suppressed: the last emitted
    // character was a space or a tab.
    let mut suppress_space = false;

    for ch in text.chars() {
        match ch {
            // NUL and a literal space are both word separators; emit at most one.
            '\0' | ' ' => {
                if !suppress_space {
                    cleaned.push(' ');
                    suppress_space = true;
                }
            }

            // Allowed whitespace is kept verbatim. Line breaks re-enable a
            // following space; a tab keeps suppressing it.
            '\t' | '\n' | '\r' => {
                cleaned.push(ch);
                suppress_space = ch == '\t';
            }

            // Every remaining ASCII control character (ETX included) is
            // dropped without affecting the space-collapsing state.
            c if c.is_ascii_control() => {}

            // Ordinary text.
            _ => {
                cleaned.push(ch);
                suppress_space = false;
            }
        }
    }

    cleaned
}
2292
2293/// Assign a logical row identifier to each fragment based on Y-up-jumps in
2294/// emission order. Used by `merge_into_lines` to distinguish columns in
2295/// multi-column layouts where a single outer BDC scope makes mcid uniform.
2296///
2297/// Increments `row_id` whenever the next fragment's Y exceeds the previous
2298/// by more than `max(font_size * 0.5, 2.0)`. Superscripts (small positive
2299/// deltas) and normal line descents (negative deltas) leave `row_id`
2300/// unchanged. See `docs/superpowers/specs/2026-05-23-issue-265-line-interleaving-design.md`.
2301///
2302/// # Invariants
2303/// Returns a `Vec<u32>` with exactly `fragments.len()` elements — one
2304/// row id per input fragment, in input order. Callers may safely `.zip(fragments)`.
2305fn assign_row_ids(fragments: &[TextFragment]) -> Vec<u32> {
2306    let mut result = Vec::with_capacity(fragments.len());
2307    let mut row_id: u32 = 0;
2308    let mut prev_y: Option<f64> = None;
2309    for frag in fragments {
2310        if let Some(py) = prev_y {
2311            let delta = frag.y - py;
2312            // Threshold anchored to the arriving fragment's font_size; for the
2313            // symmetric same-font case (body→body, same font) this is equivalent
2314            // to anchoring to the previous fragment.
2315            let threshold = (frag.font_size * 0.5).max(2.0);
2316            if delta > threshold {
2317                row_id += 1;
2318            }
2319        }
2320        result.push(row_id);
2321        prev_y = Some(frag.y);
2322    }
2323    debug_assert_eq!(
2324        result.len(),
2325        fragments.len(),
2326        "assign_row_ids: output length must equal input length"
2327    );
2328    result
2329}
2330
2331/// Decide whether a single visual line should be read in emission order.
2332///
2333/// `line` holds `(emission_index, fragment)` pairs for one visual line in any
2334/// order. Returns `true` when, walked in emission order, the line has no
2335/// DISJOINT backward x-step — i.e. no fragment lands entirely to the LEFT of
2336/// everything emitted so far on the line. Such a left jump is the signature of
2337/// a genuinely scrambled stream (right-to-left / random generators), for which
2338/// x-order is authoritative.
2339///
2340/// The comparison is against the line's running left edge, not the immediately
2341/// preceding fragment: dense bodies are split into sub-word glyph runs, so a
2342/// run that legitimately backfills the line (a font-switched math symbol, or a
2343/// word whose run starts left of the previous short run — #302 symptom 1 /
2344/// #305) overlaps the *covered span* even when it does not overlap the single
2345/// fragment right before it. As long as it does not jump past the line's left
2346/// edge, emission order is preserved. Lines that are already x-monotone in
2347/// emission satisfy this trivially and decode identically under either policy.
2348fn line_prefers_emission_order(line: &[(usize, &TextFragment)]) -> bool {
2349    if line.len() < 2 {
2350        return true;
2351    }
2352    let mut em: Vec<&(usize, &TextFragment)> = line.iter().collect();
2353    em.sort_by_key(|&&(idx, _)| idx);
2354    let mut min_start = em[0].1.x;
2355    for &&(_, f) in &em[1..] {
2356        let end = f.x + f.width;
2357        // A fragment whose right edge is at or left of the leftmost glyph seen
2358        // so far is a true backward jump — emission order is not reading order.
2359        if end <= min_start {
2360            return false;
2361        }
2362        min_start = min_start.min(f.x);
2363    }
2364    true
2365}
2366
/// Width of the space glyph (1000-em units) for the Adobe Core-14 base fonts,
/// looked up from a `/BaseFont` name.
///
/// Everything up to and including the last `+` is discarded first, which
/// strips subset prefixes (`ABCDEF+`). The remainder is matched
/// case-insensitively, in this order:
///
/// | name contains / equals      | width |
/// |-----------------------------|-------|
/// | contains `courier`          | 600   |
/// | contains `helvetica`/`arial`| 278   |
/// | contains `times`            | 250   |
/// | equals `symbol`             | 250   |
/// | contains `dingbats`         | 278   |
///
/// Substring matching is what lets common substitute names (Arial→Helvetica,
/// TimesNewRoman→Times, CourierNew→Courier) map to their metric-compatible
/// base. Unknown fonts return `None`, which leaves the caller on its
/// fixed-fraction fallback. These fonts legitimately ship no `/Widths` array,
/// so their space metric is only available here.
fn standard_14_space_width(base_font: &str) -> Option<f64> {
    let family = base_font
        .rsplit_once('+')
        .map_or(base_font, |(_, tail)| tail)
        .to_ascii_lowercase();

    let width = match family.as_str() {
        n if n.contains("courier") => 600.0,
        n if n.contains("helvetica") || n.contains("arial") => 278.0,
        n if n.contains("times") => 250.0,
        "symbol" => 250.0,
        // "zapfdingbats" itself contains "dingbats", so one check covers both.
        n if n.contains("dingbats") => 278.0,
        _ => return None,
    };
    Some(width)
}
2390
2391#[cfg(test)]
2392mod tests {
2393    use super::*;
2394
2395    #[test]
2396    fn test_matrix_multiplication() {
2397        let identity = [1.0, 0.0, 0.0, 1.0, 0.0, 0.0];
2398        let translation = [1.0, 0.0, 0.0, 1.0, 10.0, 20.0];
2399
2400        let result = multiply_matrix(&identity, &translation);
2401        assert_eq!(result, translation);
2402
2403        let result2 = multiply_matrix(&translation, &identity);
2404        assert_eq!(result2, translation);
2405    }
2406
2407    #[test]
2408    fn test_transform_point() {
2409        let translation = [1.0, 0.0, 0.0, 1.0, 10.0, 20.0];
2410        let (x, y) = transform_point(5.0, 5.0, &translation);
2411        assert_eq!(x, 15.0);
2412        assert_eq!(y, 25.0);
2413    }
2414
2415    #[test]
2416    fn test_extraction_options_default() {
2417        let options = ExtractionOptions::default();
2418        assert!(!options.preserve_layout);
2419        assert_eq!(options.space_threshold, 0.3);
2420        assert_eq!(options.newline_threshold, 10.0);
2421        assert!(options.sort_by_position);
2422        assert!(!options.detect_columns);
2423        assert_eq!(options.column_threshold, 50.0);
2424        assert!(options.merge_hyphenated);
2425    }
2426
2427    #[test]
2428    fn test_extraction_options_custom() {
2429        let options = ExtractionOptions {
2430            preserve_layout: true,
2431            space_threshold: 0.5,
2432            tj_space_threshold: 0.15,
2433            newline_threshold: 15.0,
2434            sort_by_position: false,
2435            detect_columns: true,
2436            column_threshold: 75.0,
2437            merge_hyphenated: false,
2438            track_space_decisions: false,
2439            reconstruct_paragraphs: false,
2440            include_artifacts: false,
2441        };
2442        assert!(options.preserve_layout);
2443        assert_eq!(options.space_threshold, 0.5);
2444        assert_eq!(options.tj_space_threshold, 0.15);
2445        assert_eq!(options.newline_threshold, 15.0);
2446        assert!(!options.sort_by_position);
2447        assert!(options.detect_columns);
2448        assert_eq!(options.column_threshold, 75.0);
2449        assert!(!options.merge_hyphenated);
2450    }
2451
2452    #[test]
2453    fn test_parse_font_style_bold() {
2454        // PostScript style
2455        assert_eq!(parse_font_style("Helvetica-Bold"), (true, false));
2456        assert_eq!(parse_font_style("TimesNewRoman-Bold"), (true, false));
2457
2458        // TrueType style
2459        assert_eq!(parse_font_style("Arial Bold"), (true, false));
2460        assert_eq!(parse_font_style("Calibri Bold"), (true, false));
2461
2462        // Short form
2463        assert_eq!(parse_font_style("Helvetica-B"), (true, false));
2464    }
2465
2466    #[test]
2467    fn test_parse_font_style_italic() {
2468        // PostScript style
2469        assert_eq!(parse_font_style("Helvetica-Italic"), (false, true));
2470        assert_eq!(parse_font_style("Times-Oblique"), (false, true));
2471
2472        // TrueType style
2473        assert_eq!(parse_font_style("Arial Italic"), (false, true));
2474        assert_eq!(parse_font_style("Courier Oblique"), (false, true));
2475
2476        // Short form
2477        assert_eq!(parse_font_style("Helvetica-I"), (false, true));
2478    }
2479
2480    #[test]
2481    fn test_parse_font_style_bold_italic() {
2482        assert_eq!(parse_font_style("Helvetica-BoldItalic"), (true, true));
2483        assert_eq!(parse_font_style("Times-BoldOblique"), (true, true));
2484        assert_eq!(parse_font_style("Arial Bold Italic"), (true, true));
2485    }
2486
2487    #[test]
2488    fn test_parse_font_style_regular() {
2489        assert_eq!(parse_font_style("Helvetica"), (false, false));
2490        assert_eq!(parse_font_style("Times-Roman"), (false, false));
2491        assert_eq!(parse_font_style("Courier"), (false, false));
2492        assert_eq!(parse_font_style("Arial"), (false, false));
2493    }
2494
2495    #[test]
2496    fn test_parse_font_style_edge_cases() {
2497        // Empty and unusual cases
2498        assert_eq!(parse_font_style(""), (false, false));
2499        assert_eq!(parse_font_style("UnknownFont"), (false, false));
2500
2501        // Case insensitive
2502        assert_eq!(parse_font_style("HELVETICA-BOLD"), (true, false));
2503        assert_eq!(parse_font_style("times-ITALIC"), (false, true));
2504    }
2505
2506    #[test]
2507    fn test_text_fragment() {
2508        let fragment = TextFragment {
2509            text: "Hello".to_string(),
2510            x: 100.0,
2511            y: 200.0,
2512            width: 50.0,
2513            height: 12.0,
2514            font_size: 10.0,
2515            font_name: None,
2516            is_bold: false,
2517            is_italic: false,
2518            color: None,
2519            space_decisions: Vec::new(),
2520            mcid: None,
2521            struct_tag: None,
2522        };
2523        assert_eq!(fragment.text, "Hello");
2524        assert_eq!(fragment.x, 100.0);
2525        assert_eq!(fragment.y, 200.0);
2526        assert_eq!(fragment.width, 50.0);
2527        assert_eq!(fragment.height, 12.0);
2528        assert_eq!(fragment.font_size, 10.0);
2529    }
2530
2531    #[test]
2532    fn test_extracted_text() {
2533        let fragments = vec![
2534            TextFragment {
2535                text: "Hello".to_string(),
2536                x: 100.0,
2537                y: 200.0,
2538                width: 50.0,
2539                height: 12.0,
2540                font_size: 10.0,
2541                font_name: None,
2542                is_bold: false,
2543                is_italic: false,
2544                color: None,
2545                space_decisions: Vec::new(),
2546                mcid: None,
2547                struct_tag: None,
2548            },
2549            TextFragment {
2550                text: "World".to_string(),
2551                x: 160.0,
2552                y: 200.0,
2553                width: 50.0,
2554                height: 12.0,
2555                font_size: 10.0,
2556                font_name: None,
2557                is_bold: false,
2558                is_italic: false,
2559                color: None,
2560                space_decisions: Vec::new(),
2561                mcid: None,
2562                struct_tag: None,
2563            },
2564        ];
2565
2566        let extracted = ExtractedText {
2567            text: "Hello World".to_string(),
2568            fragments: fragments,
2569        };
2570
2571        assert_eq!(extracted.text, "Hello World");
2572        assert_eq!(extracted.fragments.len(), 2);
2573        assert_eq!(extracted.fragments[0].text, "Hello");
2574        assert_eq!(extracted.fragments[1].text, "World");
2575    }
2576
2577    #[test]
2578    fn test_text_state_default() {
2579        let state = TextState::default();
2580        assert_eq!(state.text_matrix, [1.0, 0.0, 0.0, 1.0, 0.0, 0.0]);
2581        assert_eq!(state.text_line_matrix, [1.0, 0.0, 0.0, 1.0, 0.0, 0.0]);
2582        assert_eq!(state.ctm, [1.0, 0.0, 0.0, 1.0, 0.0, 0.0]);
2583        assert_eq!(state.leading, 0.0);
2584        assert_eq!(state.char_space, 0.0);
2585        assert_eq!(state.word_space, 0.0);
2586        assert_eq!(state.horizontal_scale, 100.0);
2587        assert_eq!(state.text_rise, 0.0);
2588        assert_eq!(state.font_size, 0.0);
2589        assert!(state.font_name.is_none());
2590        assert_eq!(state.render_mode, 0);
2591    }
2592
2593    #[test]
2594    fn test_matrix_operations() {
2595        // Test rotation matrix
2596        let rotation = [0.0, 1.0, -1.0, 0.0, 0.0, 0.0]; // 90 degree rotation
2597        let (x, y) = transform_point(1.0, 0.0, &rotation);
2598        assert_eq!(x, 0.0);
2599        assert_eq!(y, 1.0);
2600
2601        // Test scaling matrix
2602        let scale = [2.0, 0.0, 0.0, 3.0, 0.0, 0.0];
2603        let (x, y) = transform_point(5.0, 5.0, &scale);
2604        assert_eq!(x, 10.0);
2605        assert_eq!(y, 15.0);
2606
2607        // Test complex transformation
2608        let complex = [2.0, 1.0, 1.0, 2.0, 10.0, 20.0];
2609        let (x, y) = transform_point(1.0, 1.0, &complex);
2610        assert_eq!(x, 13.0); // 2*1 + 1*1 + 10
2611        assert_eq!(y, 23.0); // 1*1 + 2*1 + 20
2612    }
2613
2614    #[test]
2615    fn test_text_extractor_new() {
2616        let extractor = TextExtractor::new();
2617        let options = extractor.options;
2618        assert!(!options.preserve_layout);
2619        assert_eq!(options.space_threshold, 0.3);
2620        assert_eq!(options.newline_threshold, 10.0);
2621        assert!(options.sort_by_position);
2622        assert!(!options.detect_columns);
2623        assert_eq!(options.column_threshold, 50.0);
2624        assert!(options.merge_hyphenated);
2625    }
2626
2627    #[test]
2628    fn test_text_extractor_with_options() {
2629        let options = ExtractionOptions {
2630            preserve_layout: true,
2631            space_threshold: 0.3,
2632            tj_space_threshold: 0.2,
2633            newline_threshold: 12.0,
2634            sort_by_position: false,
2635            detect_columns: true,
2636            column_threshold: 60.0,
2637            merge_hyphenated: false,
2638            track_space_decisions: false,
2639            reconstruct_paragraphs: false,
2640            include_artifacts: false,
2641        };
2642        let extractor = TextExtractor::with_options(options.clone());
2643        assert_eq!(extractor.options.preserve_layout, options.preserve_layout);
2644        assert_eq!(extractor.options.space_threshold, options.space_threshold);
2645        assert_eq!(
2646            extractor.options.newline_threshold,
2647            options.newline_threshold
2648        );
2649        assert_eq!(extractor.options.sort_by_position, options.sort_by_position);
2650        assert_eq!(extractor.options.detect_columns, options.detect_columns);
2651        assert_eq!(extractor.options.column_threshold, options.column_threshold);
2652        assert_eq!(extractor.options.merge_hyphenated, options.merge_hyphenated);
2653    }
2654
2655    // =========================================================================
2656    // RIGOROUS TESTS FOR FONT METRICS TEXT WIDTH CALCULATION
2657    // =========================================================================
2658
2659    #[test]
2660    fn test_calculate_text_width_with_no_font_info() {
2661        // Test fallback: should use simplified calculation
2662        let width = calculate_text_width("Hello", 12.0, None);
2663
2664        // Expected: 5 chars * 12.0 * 0.5 = 30.0
2665        assert_eq!(
2666            width, 30.0,
2667            "Without font info, should use simplified calculation: len * font_size * 0.5"
2668        );
2669    }
2670
2671    #[test]
2672    fn test_calculate_text_width_with_empty_metrics() {
2673        use crate::text::extraction_cmap::{FontInfo, FontMetrics};
2674
2675        // Font with no widths array
2676        let font_info = FontInfo {
2677            name: "TestFont".to_string(),
2678            font_type: "Type1".to_string(),
2679            encoding: None,
2680            to_unicode: None,
2681            differences: None,
2682            descendant_font: None,
2683            cid_to_gid_map: None,
2684            cid_ordering: None,
2685            metrics: FontMetrics {
2686                first_char: None,
2687                last_char: None,
2688                widths: None,
2689                missing_width: Some(500.0),
2690                kerning: None,
2691            },
2692            cid_encoding: None,
2693        };
2694
2695        let width = calculate_text_width("Hello", 12.0, Some(&font_info));
2696
2697        // Should fall back to simplified calculation
2698        assert_eq!(
2699            width, 30.0,
2700            "Without widths array, should fall back to simplified calculation"
2701        );
2702    }
2703
2704    #[test]
2705    fn test_calculate_text_width_with_complete_metrics() {
2706        use crate::text::extraction_cmap::{FontInfo, FontMetrics};
2707
2708        // Font with complete metrics for ASCII range 32-126
2709        // Simulate typical Helvetica widths (in 1/1000 units)
2710        let mut widths = vec![0.0; 95]; // 95 chars from 32 to 126
2711
2712        // Set specific widths for "Hello" (H=722, e=556, l=278, o=611)
2713        widths[72 - 32] = 722.0; // 'H' is ASCII 72
2714        widths[101 - 32] = 556.0; // 'e' is ASCII 101
2715        widths[108 - 32] = 278.0; // 'l' is ASCII 108
2716        widths[111 - 32] = 611.0; // 'o' is ASCII 111
2717
2718        let font_info = FontInfo {
2719            name: "Helvetica".to_string(),
2720            font_type: "Type1".to_string(),
2721            encoding: None,
2722            to_unicode: None,
2723            differences: None,
2724            descendant_font: None,
2725            cid_to_gid_map: None,
2726            cid_ordering: None,
2727            metrics: FontMetrics {
2728                first_char: Some(32),
2729                last_char: Some(126),
2730                widths: Some(widths),
2731                missing_width: Some(500.0),
2732                kerning: None,
2733            },
2734            cid_encoding: None,
2735        };
2736
2737        let width = calculate_text_width("Hello", 12.0, Some(&font_info));
2738
2739        // Expected calculation (widths in glyph space / 1000 * font_size):
2740        // H: 722/1000 * 12 = 8.664
2741        // e: 556/1000 * 12 = 6.672
2742        // l: 278/1000 * 12 = 3.336
2743        // l: 278/1000 * 12 = 3.336
2744        // o: 611/1000 * 12 = 7.332
2745        // Total: 29.34
2746        let expected = (722.0 + 556.0 + 278.0 + 278.0 + 611.0) / 1000.0 * 12.0;
2747        let tolerance = 0.0001; // Floating point tolerance
2748        assert!(
2749            (width - expected).abs() < tolerance,
2750            "Should calculate width using actual character metrics: expected {}, got {}, diff {}",
2751            expected,
2752            width,
2753            (width - expected).abs()
2754        );
2755
2756        // Verify it's different from simplified calculation
2757        let simplified = 5.0 * 12.0 * 0.5; // 30.0
2758        assert_ne!(
2759            width, simplified,
2760            "Metrics-based calculation should differ from simplified (30.0)"
2761        );
2762    }
2763
2764    #[test]
2765    fn width_from_codes_uses_char_code_not_decoded_unicode() {
2766        use crate::text::extraction_cmap::{FontInfo, FontMetrics};
2767
2768        // Simple Type1 font with a code-indexed Widths array: code 1 -> 1000,
2769        // code 2 -> 100. A custom encoding decodes code 1 -> 'm' (U+006D) and
2770        // code 2 -> 'i' (U+0069), so the decoded Unicode codepoints (109, 105)
2771        // are far from the codes (1, 2). The advance width MUST come from the
2772        // codes; indexing the Widths array by the decoded Unicode codepoint
2773        // reads out-of-range -> missing_width, desyncing glyph advance on
2774        // custom-encoded fonts (issue #302, Higgs/Computer-Modern scramble).
2775        let font_info = FontInfo {
2776            name: "F1".to_string(),
2777            font_type: "Type1".to_string(),
2778            encoding: None,
2779            to_unicode: None,
2780            differences: None,
2781            descendant_font: None,
2782            cid_to_gid_map: None,
2783            cid_ordering: None,
2784            metrics: FontMetrics {
2785                first_char: Some(1),
2786                last_char: Some(2),
2787                widths: Some(vec![1000.0, 100.0]),
2788                missing_width: Some(500.0),
2789                kerning: None,
2790            },
2791            cid_encoding: None,
2792        };
2793
2794        let codes = [1u8, 2u8];
2795        let decoded = "mi"; // what decode_text produced for these codes
2796        let width = calculate_text_width_from_codes(&codes, decoded, 10.0, Some(&font_info));
2797        let expected = (1000.0 + 100.0) / 1000.0 * 10.0; // 11.0
2798        assert!(
2799            (width - expected).abs() < 1e-6,
2800            "width must come from char codes: expected {expected}, got {width}"
2801        );
2802
2803        // The decoded-Unicode-indexed path is the bug: 109 and 105 are outside
2804        // [1,2] so both fall back to missing_width -> (500+500)/1000*10 = 10.0.
2805        let buggy = calculate_text_width(decoded, 10.0, Some(&font_info));
2806        assert_eq!(buggy, 10.0);
2807        assert_ne!(
2808            width, buggy,
2809            "code-based width must differ from the Unicode-indexed bug"
2810        );
2811    }
2812
2813    #[test]
2814    fn test_calculate_text_width_character_outside_range() {
2815        use crate::text::extraction_cmap::{FontInfo, FontMetrics};
2816
2817        // Font with narrow range (only covers 'A'-'Z')
2818        let widths = vec![722.0; 26]; // All uppercase letters same width
2819
2820        let font_info = FontInfo {
2821            name: "TestFont".to_string(),
2822            font_type: "Type1".to_string(),
2823            encoding: None,
2824            to_unicode: None,
2825            differences: None,
2826            descendant_font: None,
2827            cid_to_gid_map: None,
2828            cid_ordering: None,
2829            metrics: FontMetrics {
2830                first_char: Some(65), // 'A'
2831                last_char: Some(90),  // 'Z'
2832                widths: Some(widths),
2833                missing_width: Some(500.0),
2834                kerning: None,
2835            },
2836            cid_encoding: None,
2837        };
2838
2839        // Test with character outside range
2840        let width = calculate_text_width("A1", 10.0, Some(&font_info));
2841
2842        // Expected:
2843        // 'A' (65) is in range: 722/1000 * 10 = 7.22
2844        // '1' (49) is outside range: missing_width 500/1000 * 10 = 5.0
2845        // Total: 12.22
2846        let expected = (722.0 / 1000.0 * 10.0) + (500.0 / 1000.0 * 10.0);
2847        assert_eq!(
2848            width, expected,
2849            "Should use missing_width for characters outside range"
2850        );
2851    }
2852
2853    #[test]
2854    fn test_calculate_text_width_missing_width_in_array() {
2855        use crate::text::extraction_cmap::{FontInfo, FontMetrics};
2856
2857        // Font with incomplete widths array (some characters have 0.0)
2858        let mut widths = vec![500.0; 95]; // Default width
2859        widths[10] = 0.0; // Character at index 10 has no width defined
2860
2861        let font_info = FontInfo {
2862            name: "TestFont".to_string(),
2863            font_type: "Type1".to_string(),
2864            encoding: None,
2865            to_unicode: None,
2866            differences: None,
2867            descendant_font: None,
2868            cid_to_gid_map: None,
2869            cid_ordering: None,
2870            metrics: FontMetrics {
2871                first_char: Some(32),
2872                last_char: Some(126),
2873                widths: Some(widths),
2874                missing_width: Some(600.0),
2875                kerning: None,
2876            },
2877            cid_encoding: None,
2878        };
2879
2880        // Character 42 (index 10 from first_char 32)
2881        let char_code = 42u8 as char; // '*'
2882        let text = char_code.to_string();
2883        let width = calculate_text_width(&text, 10.0, Some(&font_info));
2884
2885        // Character is in range but width is 0.0, should NOT fall back to missing_width
2886        // (0.0 is a valid width for zero-width characters)
2887        assert_eq!(
2888            width, 0.0,
2889            "Should use 0.0 width from array, not missing_width"
2890        );
2891    }
2892
2893    #[test]
2894    fn test_calculate_text_width_empty_string() {
2895        use crate::text::extraction_cmap::{FontInfo, FontMetrics};
2896
2897        let font_info = FontInfo {
2898            name: "TestFont".to_string(),
2899            font_type: "Type1".to_string(),
2900            encoding: None,
2901            to_unicode: None,
2902            differences: None,
2903            descendant_font: None,
2904            cid_to_gid_map: None,
2905            cid_ordering: None,
2906            metrics: FontMetrics {
2907                first_char: Some(32),
2908                last_char: Some(126),
2909                widths: Some(vec![500.0; 95]),
2910                missing_width: Some(500.0),
2911                kerning: None,
2912            },
2913            cid_encoding: None,
2914        };
2915
2916        let width = calculate_text_width("", 12.0, Some(&font_info));
2917        assert_eq!(width, 0.0, "Empty string should have zero width");
2918
2919        // Also test without font info
2920        let width_no_font = calculate_text_width("", 12.0, None);
2921        assert_eq!(
2922            width_no_font, 0.0,
2923            "Empty string should have zero width (no font)"
2924        );
2925    }
2926
2927    #[test]
2928    fn test_calculate_text_width_unicode_characters() {
2929        use crate::text::extraction_cmap::{FontInfo, FontMetrics};
2930
2931        // Font with limited ASCII range
2932        let font_info = FontInfo {
2933            name: "TestFont".to_string(),
2934            font_type: "Type1".to_string(),
2935            encoding: None,
2936            to_unicode: None,
2937            differences: None,
2938            descendant_font: None,
2939            cid_to_gid_map: None,
2940            cid_ordering: None,
2941            metrics: FontMetrics {
2942                first_char: Some(32),
2943                last_char: Some(126),
2944                widths: Some(vec![500.0; 95]),
2945                missing_width: Some(600.0),
2946                kerning: None,
2947            },
2948            cid_encoding: None,
2949        };
2950
2951        // Test with Unicode characters outside ASCII range
2952        let width = calculate_text_width("Ñ", 10.0, Some(&font_info));
2953
2954        // 'Ñ' (U+00D1, code 209) is outside range, should use missing_width
2955        // Expected: 600/1000 * 10 = 6.0
2956        assert_eq!(
2957            width, 6.0,
2958            "Unicode character outside range should use missing_width"
2959        );
2960    }
2961
2962    #[test]
2963    fn test_calculate_text_width_different_font_sizes() {
2964        use crate::text::extraction_cmap::{FontInfo, FontMetrics};
2965
2966        let font_info = FontInfo {
2967            name: "TestFont".to_string(),
2968            font_type: "Type1".to_string(),
2969            encoding: None,
2970            to_unicode: None,
2971            differences: None,
2972            descendant_font: None,
2973            cid_to_gid_map: None,
2974            cid_ordering: None,
2975            metrics: FontMetrics {
2976                first_char: Some(65), // 'A'
2977                last_char: Some(65),  // 'A'
2978                widths: Some(vec![722.0]),
2979                missing_width: Some(500.0),
2980                kerning: None,
2981            },
2982            cid_encoding: None,
2983        };
2984
2985        // Test same character with different font sizes
2986        let width_10 = calculate_text_width("A", 10.0, Some(&font_info));
2987        let width_20 = calculate_text_width("A", 20.0, Some(&font_info));
2988
2989        // Widths should scale linearly with font size
2990        assert_eq!(width_10, 722.0 / 1000.0 * 10.0);
2991        assert_eq!(width_20, 722.0 / 1000.0 * 20.0);
2992        assert_eq!(
2993            width_20,
2994            width_10 * 2.0,
2995            "Width should scale linearly with font size"
2996        );
2997    }
2998
2999    #[test]
3000    fn test_calculate_text_width_proportional_vs_monospace() {
3001        use crate::text::extraction_cmap::{FontInfo, FontMetrics};
3002
3003        // Simulate proportional font (different widths)
3004        let proportional_widths = vec![278.0, 556.0, 722.0]; // i, m, W
3005        let proportional_font = FontInfo {
3006            name: "Helvetica".to_string(),
3007            font_type: "Type1".to_string(),
3008            encoding: None,
3009            to_unicode: None,
3010            differences: None,
3011            descendant_font: None,
3012            cid_to_gid_map: None,
3013            cid_ordering: None,
3014            metrics: FontMetrics {
3015                first_char: Some(105), // 'i'
3016                last_char: Some(107),  // covers i, j, k
3017                widths: Some(proportional_widths),
3018                missing_width: Some(500.0),
3019                kerning: None,
3020            },
3021            cid_encoding: None,
3022        };
3023
3024        // Simulate monospace font (same width)
3025        let monospace_widths = vec![600.0, 600.0, 600.0];
3026        let monospace_font = FontInfo {
3027            name: "Courier".to_string(),
3028            font_type: "Type1".to_string(),
3029            encoding: None,
3030            to_unicode: None,
3031            differences: None,
3032            descendant_font: None,
3033            cid_to_gid_map: None,
3034            cid_ordering: None,
3035            metrics: FontMetrics {
3036                first_char: Some(105),
3037                last_char: Some(107),
3038                widths: Some(monospace_widths),
3039                missing_width: Some(600.0),
3040                kerning: None,
3041            },
3042            cid_encoding: None,
3043        };
3044
3045        let prop_width = calculate_text_width("i", 12.0, Some(&proportional_font));
3046        let mono_width = calculate_text_width("i", 12.0, Some(&monospace_font));
3047
3048        // Proportional 'i' should be narrower than monospace 'i'
3049        assert!(
3050            prop_width < mono_width,
3051            "Proportional 'i' ({}) should be narrower than monospace 'i' ({})",
3052            prop_width,
3053            mono_width
3054        );
3055    }
3056
3057    // =========================================================================
3058    // CRITICAL KERNING TESTS (Issue #87 - Quality Agent Required)
3059    // =========================================================================
3060
3061    #[test]
3062    fn test_calculate_text_width_with_kerning() {
3063        use crate::text::extraction_cmap::{FontInfo, FontMetrics};
3064        use std::collections::HashMap;
3065
3066        // Create a font with kerning pairs
3067        let mut widths = vec![500.0; 95]; // ASCII 32-126
3068        widths[65 - 32] = 722.0; // 'A'
3069        widths[86 - 32] = 722.0; // 'V'
3070        widths[87 - 32] = 944.0; // 'W'
3071
3072        let mut kerning = HashMap::new();
3073        // Typical kerning pairs (in FUnits, 1/1000)
3074        kerning.insert((65, 86), -50.0); // 'A' + 'V' → tighten by 50 FUnits
3075        kerning.insert((65, 87), -40.0); // 'A' + 'W' → tighten by 40 FUnits
3076
3077        let font_info = FontInfo {
3078            name: "Helvetica".to_string(),
3079            font_type: "Type1".to_string(),
3080            encoding: None,
3081            to_unicode: None,
3082            differences: None,
3083            descendant_font: None,
3084            cid_to_gid_map: None,
3085            cid_ordering: None,
3086            metrics: FontMetrics {
3087                first_char: Some(32),
3088                last_char: Some(126),
3089                widths: Some(widths),
3090                missing_width: Some(500.0),
3091                kerning: Some(kerning),
3092            },
3093            cid_encoding: None,
3094        };
3095
3096        // Test "AV" with kerning
3097        let width_av = calculate_text_width("AV", 12.0, Some(&font_info));
3098        // Expected: (722 + 722)/1000 * 12 + (-50/1000 * 12)
3099        //         = 17.328 - 0.6 = 16.728
3100        let expected_av = (722.0 + 722.0) / 1000.0 * 12.0 + (-50.0 / 1000.0 * 12.0);
3101        let tolerance = 0.0001;
3102        assert!(
3103            (width_av - expected_av).abs() < tolerance,
3104            "AV with kerning: expected {}, got {}, diff {}",
3105            expected_av,
3106            width_av,
3107            (width_av - expected_av).abs()
3108        );
3109
3110        // Test "AW" with different kerning value
3111        let width_aw = calculate_text_width("AW", 12.0, Some(&font_info));
3112        // Expected: (722 + 944)/1000 * 12 + (-40/1000 * 12)
3113        //         = 19.992 - 0.48 = 19.512
3114        let expected_aw = (722.0 + 944.0) / 1000.0 * 12.0 + (-40.0 / 1000.0 * 12.0);
3115        assert!(
3116            (width_aw - expected_aw).abs() < tolerance,
3117            "AW with kerning: expected {}, got {}, diff {}",
3118            expected_aw,
3119            width_aw,
3120            (width_aw - expected_aw).abs()
3121        );
3122
3123        // Test "VA" with NO kerning (pair not in HashMap)
3124        let width_va = calculate_text_width("VA", 12.0, Some(&font_info));
3125        // Expected: (722 + 722)/1000 * 12 = 17.328 (no kerning adjustment)
3126        let expected_va = (722.0 + 722.0) / 1000.0 * 12.0;
3127        assert!(
3128            (width_va - expected_va).abs() < tolerance,
3129            "VA without kerning: expected {}, got {}, diff {}",
3130            expected_va,
3131            width_va,
3132            (width_va - expected_va).abs()
3133        );
3134
3135        // Verify kerning makes a measurable difference
3136        assert!(
3137            width_av < width_va,
3138            "AV with kerning ({}) should be narrower than VA without kerning ({})",
3139            width_av,
3140            width_va
3141        );
3142    }
3143
3144    #[test]
3145    fn test_parse_truetype_kern_table_minimal() {
3146        use crate::text::extraction_cmap::parse_truetype_kern_table;
3147
3148        // Complete TrueType font with kern table (Format 0, 2 kerning pairs)
3149        // Structure:
3150        // 1. Offset table (12 bytes)
3151        // 2. Table directory (2 tables: 'head' and 'kern', each 16 bytes = 32 total)
3152        // 3. 'head' table data (54 bytes)
3153        // 4. 'kern' table data (30 bytes)
3154        // Total: 128 bytes
3155        let mut ttf_data = vec![
3156            // Offset table
3157            0x00, 0x01, 0x00, 0x00, // scaler type: TrueType
3158            0x00, 0x02, // numTables: 2
3159            0x00, 0x20, // searchRange: 32
3160            0x00, 0x01, // entrySelector: 1
3161            0x00, 0x00, // rangeShift: 0
3162        ];
3163
3164        // Table directory entry 1: 'head' table
3165        ttf_data.extend_from_slice(b"head"); // tag
3166        ttf_data.extend_from_slice(&[0x00, 0x00, 0x00, 0x00]); // checksum
3167        ttf_data.extend_from_slice(&[0x00, 0x00, 0x00, 0x2C]); // offset: 44 (12 + 32)
3168        ttf_data.extend_from_slice(&[0x00, 0x00, 0x00, 0x36]); // length: 54
3169
3170        // Table directory entry 2: 'kern' table
3171        ttf_data.extend_from_slice(b"kern"); // tag
3172        ttf_data.extend_from_slice(&[0x00, 0x00, 0x00, 0x00]); // checksum
3173        ttf_data.extend_from_slice(&[0x00, 0x00, 0x00, 0x62]); // offset: 98 (44 + 54)
3174        ttf_data.extend_from_slice(&[0x00, 0x00, 0x00, 0x1E]); // length: 30 (actual kern table size)
3175
3176        // 'head' table data (54 bytes of zeros - minimal valid head table)
3177        ttf_data.extend_from_slice(&[0u8; 54]);
3178
3179        // 'kern' table data (34 bytes)
3180        ttf_data.extend_from_slice(&[
3181            // Kern table header
3182            0x00, 0x00, // version: 0
3183            0x00, 0x01, // nTables: 1
3184            // Subtable header
3185            0x00, 0x00, // version: 0
3186            0x00, 0x1A, // length: 26 bytes (header 6 + nPairs data 8 + pairs 2*6=12)
3187            0x00, 0x00, // coverage: 0x0000 (Format 0 in lower byte, horizontal)
3188            0x00, 0x02, // nPairs: 2
3189            0x00, 0x08, // searchRange: 8
3190            0x00, 0x00, // entrySelector: 0
3191            0x00, 0x04, // rangeShift: 4
3192            // Kerning pair 1: A + V → -50
3193            0x00, 0x41, // left glyph: 65 ('A')
3194            0x00, 0x56, // right glyph: 86 ('V')
3195            0xFF, 0xCE, // value: -50 (signed 16-bit big-endian)
3196            // Kerning pair 2: A + W → -40
3197            0x00, 0x41, // left glyph: 65 ('A')
3198            0x00, 0x57, // right glyph: 87 ('W')
3199            0xFF, 0xD8, // value: -40 (signed 16-bit big-endian)
3200        ]);
3201
3202        let result = parse_truetype_kern_table(&ttf_data);
3203        assert!(
3204            result.is_ok(),
3205            "Should parse minimal kern table successfully: {:?}",
3206            result.err()
3207        );
3208
3209        let kerning_map = result.unwrap();
3210        assert_eq!(kerning_map.len(), 2, "Should extract 2 kerning pairs");
3211
3212        // Verify pair 1: A + V → -50
3213        assert_eq!(
3214            kerning_map.get(&(65, 86)),
3215            Some(&-50.0),
3216            "Should have A+V kerning pair with value -50"
3217        );
3218
3219        // Verify pair 2: A + W → -40
3220        assert_eq!(
3221            kerning_map.get(&(65, 87)),
3222            Some(&-40.0),
3223            "Should have A+W kerning pair with value -40"
3224        );
3225    }
3226
3227    #[test]
3228    fn test_parse_kern_table_no_kern_table() {
3229        use crate::text::extraction_cmap::extract_truetype_kerning;
3230
3231        // TrueType font data WITHOUT a 'kern' table
3232        // Structure:
3233        // - Offset table: scaler type + numTables + searchRange + entrySelector + rangeShift
3234        // - Table directory: 1 entry for 'head' table (not 'kern')
3235        let ttf_data = vec![
3236            // Offset table
3237            0x00, 0x01, 0x00, 0x00, // scaler type: TrueType
3238            0x00, 0x01, // numTables: 1
3239            0x00, 0x10, // searchRange: 16
3240            0x00, 0x00, // entrySelector: 0
3241            0x00, 0x00, // rangeShift: 0
3242            // Table directory entry: 'head' table (not 'kern')
3243            b'h', b'e', b'a', b'd', // tag: 'head'
3244            0x00, 0x00, 0x00, 0x00, // checksum
3245            0x00, 0x00, 0x00, 0x1C, // offset: 28
3246            0x00, 0x00, 0x00, 0x36, // length: 54
3247            // Mock 'head' table data (54 bytes of zeros)
3248            0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
3249            0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
3250            0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
3251            0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
3252        ];
3253
3254        let result = extract_truetype_kerning(&ttf_data);
3255        assert!(
3256            result.is_ok(),
3257            "Should gracefully handle missing kern table"
3258        );
3259
3260        let kerning_map = result.unwrap();
3261        assert!(
3262            kerning_map.is_empty(),
3263            "Should return empty HashMap when no kern table exists"
3264        );
3265    }
3266
3267    // Helper for paragraph-reconstruction unit tests. TextFragment has 11
3268    // fields so a helper keeps the test bodies focused on geometry.
3269    fn tf(text: &str, x: f64, y: f64, width: f64, font_size: f64) -> TextFragment {
3270        TextFragment {
3271            text: text.to_string(),
3272            x,
3273            y,
3274            width,
3275            height: font_size,
3276            font_size,
3277            font_name: None,
3278            is_bold: false,
3279            is_italic: false,
3280            color: None,
3281            space_decisions: Vec::new(),
3282            mcid: None,
3283            struct_tag: None,
3284        }
3285    }
3286
3287    #[test]
3288    fn merge_into_lines_groups_same_baseline_fragments() {
3289        let extractor = TextExtractor::with_options(ExtractionOptions {
3290            reconstruct_paragraphs: true,
3291            ..Default::default()
3292        });
3293        let input = vec![
3294            tf("Hello", 50.0, 400.0, 30.0, 12.0),
3295            tf("world", 90.0, 400.0, 30.0, 12.0),
3296            tf("now.", 130.0, 400.0, 25.0, 12.0),
3297            tf("Next", 50.0, 386.0, 30.0, 12.0),
3298            tf("line.", 90.0, 386.0, 25.0, 12.0),
3299        ];
3300        let lines = extractor.merge_into_lines(&input);
3301        assert_eq!(
3302            lines.len(),
3303            2,
3304            "two distinct baselines must produce two line fragments"
3305        );
3306        assert_eq!(
3307            lines[0].text, "Hello world now.",
3308            "first line concatenated with spaces"
3309        );
3310        assert_eq!(lines[1].text, "Next line.", "second line concatenated");
3311    }
3312
3313    #[test]
3314    fn merge_into_lines_inserts_space_only_when_gap_exceeds_threshold() {
3315        let extractor = TextExtractor::with_options(ExtractionOptions {
3316            reconstruct_paragraphs: true,
3317            space_threshold: 0.3,
3318            ..Default::default()
3319        });
3320        // Gap of 4pt at font_size 12 = 0.33x — above threshold 0.3
3321        let with_gap = vec![
3322            tf("AB", 50.0, 400.0, 10.0, 12.0),
3323            tf("CD", 64.0, 400.0, 10.0, 12.0),
3324        ];
3325        let lines = extractor.merge_into_lines(&with_gap);
3326        assert_eq!(
3327            lines[0].text, "AB CD",
3328            "gap above threshold must insert space"
3329        );
3330
3331        // Gap of 1pt = 0.083x — below threshold
3332        let tight = vec![
3333            tf("AB", 50.0, 400.0, 10.0, 12.0),
3334            tf("CD", 61.0, 400.0, 10.0, 12.0),
3335        ];
3336        let lines = extractor.merge_into_lines(&tight);
3337        assert_eq!(lines[0].text, "ABCD", "tight gap must NOT insert space");
3338    }
3339
3340    #[test]
3341    fn standard_14_space_width_maps_base_fonts_and_substitutes() {
3342        // Adobe Core-14 AFM space advances, with subset prefixes stripped and
3343        // metric-compatible substitutes folded in (#302 symptom 2).
3344        assert_eq!(super::standard_14_space_width("Times-Roman"), Some(250.0));
3345        assert_eq!(
3346            super::standard_14_space_width("Times-BoldItalic"),
3347            Some(250.0)
3348        );
3349        assert_eq!(super::standard_14_space_width("Helvetica"), Some(278.0));
3350        assert_eq!(super::standard_14_space_width("Courier-Bold"), Some(600.0));
3351        assert_eq!(super::standard_14_space_width("Symbol"), Some(250.0));
3352        assert_eq!(super::standard_14_space_width("ZapfDingbats"), Some(278.0));
3353        // subset prefix stripped
3354        assert_eq!(
3355            super::standard_14_space_width("ABCDEF+Times-Roman"),
3356            Some(250.0)
3357        );
3358        // metric-compatible substitutes
3359        assert_eq!(super::standard_14_space_width("Arial-BoldMT"), Some(278.0));
3360        assert_eq!(
3361            super::standard_14_space_width("TimesNewRomanPSMT"),
3362            Some(250.0)
3363        );
3364        assert_eq!(
3365            super::standard_14_space_width("CourierNewPSMT"),
3366            Some(600.0)
3367        );
3368        // unknown / embedded fonts fall through to the caller's fallback
3369        assert_eq!(super::standard_14_space_width("Poppins-Regular"), None);
3370        assert_eq!(super::standard_14_space_width("VUNXGH+Calibri"), None);
3371    }
3372
3373    #[test]
3374    fn merge_into_lines_keeps_emission_order_for_font_switch_overlap() {
3375        // #302 symptom 1: a font-switched glyph (e.g. the italic particle
3376        // symbol "Z" in "to the Z boson") is positioned by the producer with
3377        // an x-origin that falls INSIDE the x-span of the preceding roman run
3378        // ("to the"). The content stream still delivers it in correct reading
3379        // order. Sorting a row purely by x-origin interleaves the overlapping
3380        // fragment, yielding "Zto the" instead of "to theZ". When a row's only
3381        // backward emission steps are span overlaps (not disjoint jumps),
3382        // emission order is the authoritative reading order.
3383        let extractor = TextExtractor::with_options(ExtractionOptions {
3384            reconstruct_paragraphs: true,
3385            ..Default::default()
3386        });
3387        // emission order = reading order; "Z" overlaps "to t" + "he" in x.
3388        let row = vec![
3389            tf("to t", 455.5, 400.0, 12.0, 10.0), // 455.5 .. 467.5
3390            tf("he", 467.5, 400.0, 10.0, 10.0),   // 467.5 .. 477.5
3391            tf("Z", 455.3, 400.0, 23.0, 10.0),    // 455.3 .. 478.3 (overlaps both)
3392        ];
3393        let lines = extractor.merge_into_lines(&row);
3394        assert_eq!(lines.len(), 1);
3395        assert_eq!(
3396            lines[0].text, "to theZ",
3397            "overlapping font-switch fragment must keep emission (reading) order"
3398        );
3399    }
3400
3401    #[test]
3402    fn merge_into_lines_keeps_emission_when_run_backfills_covered_span() {
3403        // #305: dense justified body text is split into sub-word fragments by
3404        // the font's arbitrary glyph runs. A later word ("described", x 492..537)
3405        // is emitted with a backward x-origin that lands INSIDE the span already
3406        // covered by the line ("...selections", 479..521), but does NOT overlap
3407        // the short immediately-preceding fragment ("s", 517..521). Emission is
3408        // still the reading order, so the line must keep it — the overlap test
3409        // has to consider the line's running extent, not just the previous
3410        // fragment. (Real case: Higgs p5 "kinematic selections described in".)
3411        let extractor = TextExtractor::with_options(ExtractionOptions {
3412            reconstruct_paragraphs: true,
3413            ..Default::default()
3414        });
3415        let row = vec![
3416            tf("selection", 479.0, 400.0, 38.0, 8.0), // 479..517
3417            tf("s", 517.0, 400.0, 4.0, 8.0),          // 517..521  short predecessor
3418            tf("d", 492.0, 400.0, 4.0, 8.0),          // 492..496  backfill, no overlap with "s"
3419            tf("escribed", 496.0, 400.0, 41.0, 8.0),  // 496..537
3420        ];
3421        let lines = extractor.merge_into_lines(&row);
3422        assert_eq!(
3423            lines[0].text, "selectionsdescribed",
3424            "a run that backfills the line's covered span must keep emission order"
3425        );
3426    }
3427
3428    #[test]
3429    fn merge_into_lines_uses_x_order_for_disjoint_backward_jump() {
3430        // Guard: a genuinely scrambled non-tagged stream (fragments emitted
3431        // out of x-order at DISJOINT positions, e.g. right-to-left or random
3432        // generators) must still be reordered by x. Here "the" is emitted
3433        // after "boson" with no span overlap, so x-order is authoritative.
3434        let extractor = TextExtractor::with_options(ExtractionOptions {
3435            reconstruct_paragraphs: true,
3436            ..Default::default()
3437        });
3438        let row = vec![
3439            tf("boson", 100.0, 400.0, 28.0, 10.0), // 100 .. 128
3440            tf("the", 80.0, 400.0, 15.0, 10.0),    // 80 .. 95 (disjoint, left of boson)
3441        ];
3442        let lines = extractor.merge_into_lines(&row);
3443        assert_eq!(lines.len(), 1);
3444        assert_eq!(
3445            lines[0].text, "the boson",
3446            "disjoint backward emission jump must be reordered by x"
3447        );
3448    }
3449
3450    #[test]
3451    fn merge_into_lines_unioned_bounding_box() {
3452        let extractor = TextExtractor::with_options(ExtractionOptions {
3453            reconstruct_paragraphs: true,
3454            ..Default::default()
3455        });
3456        let input = vec![
3457            tf("A", 50.0, 400.0, 10.0, 12.0),
3458            tf("B", 100.0, 400.0, 10.0, 12.0),
3459        ];
3460        let lines = extractor.merge_into_lines(&input);
3461        assert_eq!(lines.len(), 1);
3462        assert!((lines[0].x - 50.0).abs() < 0.01);
3463        assert!(
3464            (lines[0].width - 60.0).abs() < 0.01,
3465            "width must span 50->110"
3466        );
3467    }
3468
3469    #[test]
3470    fn assign_row_ids_monotone_y_descending_keeps_zero() {
3471        let frags = vec![
3472            tf("A", 50.0, 400.0, 10.0, 9.0),
3473            tf("B", 50.0, 395.0, 10.0, 9.0),
3474            tf("C", 50.0, 390.0, 10.0, 9.0),
3475        ];
3476        let row_ids = super::assign_row_ids(&frags);
3477        assert_eq!(row_ids, vec![0u32, 0, 0]);
3478    }
3479
3480    #[test]
3481    fn assign_row_ids_increments_on_y_up_jump_above_threshold() {
3482        // font_size=9 → threshold = max(4.5, 2.0) = 4.5
3483        // deltas: 395-400=-5, 420-395=+25 (>4.5)
3484        let frags = vec![
3485            tf("A", 50.0, 400.0, 10.0, 9.0),
3486            tf("B", 50.0, 395.0, 10.0, 9.0),
3487            tf("C", 50.0, 420.0, 10.0, 9.0),
3488        ];
3489        let row_ids = super::assign_row_ids(&frags);
3490        assert_eq!(row_ids, vec![0u32, 0, 1]);
3491    }
3492
3493    #[test]
3494    fn assign_row_ids_ignores_superscript_within_threshold() {
3495        // font_size=9 → threshold 4.5. delta 2.5 must NOT trigger.
3496        let frags = vec![
3497            tf("A", 50.0, 400.0, 10.0, 9.0),
3498            tf("^2", 60.0, 402.5, 5.0, 9.0),
3499            tf("B", 65.0, 395.0, 10.0, 9.0),
3500        ];
3501        let row_ids = super::assign_row_ids(&frags);
3502        assert_eq!(row_ids, vec![0u32, 0, 0]);
3503    }
3504
3505    #[test]
3506    fn assign_row_ids_floor_2pt_for_small_fonts() {
3507        // font_size=3 → font_size*0.5 = 1.5; floor lifts threshold to 2.0
3508        // delta = +2.5 > 2.0 must trigger.
3509        let frags = vec![
3510            tf("A", 50.0, 100.0, 10.0, 3.0),
3511            tf("B", 50.0, 102.5, 10.0, 3.0),
3512        ];
3513        let row_ids = super::assign_row_ids(&frags);
3514        assert_eq!(row_ids, vec![0u32, 1]);
3515    }
3516
3517    #[test]
3518    fn assign_row_ids_empty_slice_returns_empty() {
3519        let frags: Vec<TextFragment> = vec![];
3520        let row_ids = super::assign_row_ids(&frags);
3521        assert!(row_ids.is_empty(), "empty input must yield empty output");
3522    }
3523
3524    #[test]
3525    fn merge_into_lines_splits_two_columns_emitted_sequentially() {
3526        let extractor = TextExtractor::with_options(ExtractionOptions {
3527            reconstruct_paragraphs: true,
3528            ..Default::default()
3529        });
3530        // Emission order: col1.l1, col1.l2 (Y monotone down), then col2.l1
3531        // (Y jumps UP by 10 > threshold 5 for font 10pt), col2.l2.
3532        let input = vec![
3533            tf("col1-top", 50.0, 400.0, 80.0, 10.0),
3534            tf("col1-bot", 50.0, 395.0, 80.0, 10.0),
3535            tf("col2-top", 200.0, 405.0, 80.0, 10.0),
3536            tf("col2-bot", 200.0, 400.0, 80.0, 10.0),
3537        ];
3538        let lines = extractor.merge_into_lines(&input);
3539        assert_eq!(
3540            lines.len(),
3541            4,
3542            "two columns at near-identical Y must split into 4 lines"
3543        );
3544        // row_id=0 batch first (col1), then row_id=1 (col2). Within each batch, Y desc.
3545        assert_eq!(lines[0].text, "col1-top");
3546        assert_eq!(lines[0].y, 400.0);
3547        assert_eq!(lines[1].text, "col1-bot");
3548        assert_eq!(lines[1].y, 395.0);
3549        assert_eq!(lines[2].text, "col2-top");
3550        assert_eq!(lines[2].y, 405.0);
3551        assert_eq!(lines[3].text, "col2-bot");
3552        assert_eq!(lines[3].y, 400.0);
3553    }
3554
3555    #[test]
3556    fn merge_into_lines_preserves_single_column_continuation() {
3557        let extractor = TextExtractor::with_options(ExtractionOptions {
3558            reconstruct_paragraphs: true,
3559            ..Default::default()
3560        });
3561        // Single column: same Y continuation (X grows), then next line down.
3562        let input = vec![
3563            tf("Hello", 50.0, 400.0, 30.0, 10.0),
3564            tf("world", 90.0, 400.0, 30.0, 10.0),
3565            tf("next-line", 50.0, 395.0, 70.0, 10.0),
3566        ];
3567        let lines = extractor.merge_into_lines(&input);
3568        assert_eq!(
3569            lines.len(),
3570            2,
3571            "single column continuation must collapse to 2 lines"
3572        );
3573        assert!(lines[0].text.contains("Hello"));
3574        assert!(lines[0].text.contains("world"));
3575        assert_eq!(lines[1].text, "next-line");
3576    }
3577
3578    #[test]
3579    fn merge_into_lines_splits_columns_with_uniform_mcid() {
3580        // Regression guard for #265 root cause: NCSC page 12 has a single
3581        // outer BDC, so every fragment has mcid=Some(0). Column separation
3582        // must come from row_id alone, not from mcid.
3583        let extractor = TextExtractor::with_options(ExtractionOptions {
3584            reconstruct_paragraphs: true,
3585            ..Default::default()
3586        });
3587        let mut frags = vec![
3588            tf("col1-top", 50.0, 400.0, 80.0, 10.0),
3589            tf("col1-bot", 50.0, 395.0, 80.0, 10.0),
3590            tf("col2-top", 200.0, 405.0, 80.0, 10.0),
3591            tf("col2-bot", 200.0, 400.0, 80.0, 10.0),
3592        ];
3593        for f in &mut frags {
3594            f.mcid = Some(0);
3595        }
3596        let lines = extractor.merge_into_lines(&frags);
3597        assert_eq!(
3598            lines.len(),
3599            4,
3600            "uniform mcid must not prevent row_id-based column split (NCSC root cause)"
3601        );
3602        assert_eq!(lines[0].text, "col1-top");
3603        assert_eq!(lines[1].text, "col1-bot");
3604        assert_eq!(lines[2].text, "col2-top");
3605        assert_eq!(lines[3].text, "col2-bot");
3606    }
3607
3608    #[test]
3609    fn merge_close_fragments_superscript_merges_when_reconstruct_paragraphs() {
3610        let extractor = TextExtractor::with_options(ExtractionOptions {
3611            reconstruct_paragraphs: true,
3612            ..Default::default()
3613        });
3614        // Citation superscript: body text at y=400, raised digit at y=403.5
3615        // (3.5pt above baseline for 10pt font). y_tol = 0.5 * 10 = 5.0 > 3.5
3616        // and x_gap = 4pt < 10*0.5 = 5pt, so the superscript must merge into
3617        // the body fragment.
3618        let frags = vec![
3619            tf("body-text", 50.0, 400.0, 25.0, 10.0),
3620            tf("1", 79.0, 403.5, 4.0, 10.0),
3621        ];
3622        let merged = extractor.merge_close_fragments(&frags);
3623        assert_eq!(
3624            merged.len(),
3625            1,
3626            "superscript within 5pt of baseline must merge in reconstruct path"
3627        );
3628        assert!(merged[0].text.contains("body-text"));
3629        assert!(merged[0].text.contains("1"));
3630    }
3631
3632    #[test]
3633    fn merge_close_fragments_superscript_does_not_merge_in_legacy_path() {
3634        let extractor = TextExtractor::with_options(ExtractionOptions {
3635            reconstruct_paragraphs: false,
3636            ..Default::default()
3637        });
3638        // Legacy path: y_tol=1.0 fixed. A 3.5pt delta must NOT merge.
3639        let frags = vec![
3640            tf("body-text", 50.0, 400.0, 25.0, 10.0),
3641            tf("1", 79.0, 403.5, 4.0, 10.0),
3642        ];
3643        let merged = extractor.merge_close_fragments(&frags);
3644        assert_eq!(
3645            merged.len(),
3646            2,
3647            "3.5pt Y delta exceeds legacy 1.0pt threshold; superscript stays separate"
3648        );
3649    }
3650
3651    #[test]
3652    fn merge_into_paragraphs_groups_consecutive_lines() {
3653        let extractor = TextExtractor::with_options(ExtractionOptions {
3654            reconstruct_paragraphs: true,
3655            ..Default::default()
3656        });
3657        // Three lines, 14pt leading (line height 12pt, gap 2pt)
3658        let lines = vec![
3659            tf("Line one.", 50.0, 400.0, 60.0, 12.0),
3660            tf("Line two.", 50.0, 386.0, 60.0, 12.0),
3661            tf("Line three.", 50.0, 372.0, 70.0, 12.0),
3662        ];
3663        let paragraphs = extractor.merge_into_paragraphs(&lines);
3664        assert_eq!(paragraphs.len(), 1);
3665        assert_eq!(paragraphs[0].text, "Line one.\nLine two.\nLine three.");
3666    }
3667
3668    #[test]
3669    fn merge_into_paragraphs_splits_on_large_vertical_gap() {
3670        let extractor = TextExtractor::with_options(ExtractionOptions {
3671            reconstruct_paragraphs: true,
3672            ..Default::default()
3673        });
3674        let lines = vec![
3675            tf("P1L1.", 50.0, 400.0, 40.0, 12.0),
3676            tf("P1L2.", 50.0, 386.0, 40.0, 12.0),
3677            tf("P2L1.", 50.0, 300.0, 40.0, 12.0),
3678        ];
3679        let paragraphs = extractor.merge_into_paragraphs(&lines);
3680        assert_eq!(paragraphs.len(), 2);
3681        assert_eq!(paragraphs[0].text, "P1L1.\nP1L2.");
3682        assert_eq!(paragraphs[1].text, "P2L1.");
3683    }
3684
3685    #[test]
3686    fn merge_into_paragraphs_drops_hyphen_when_merge_hyphenated() {
3687        let extractor = TextExtractor::with_options(ExtractionOptions {
3688            reconstruct_paragraphs: true,
3689            merge_hyphenated: true,
3690            ..Default::default()
3691        });
3692        let lines = vec![
3693            tf("Kryp-", 50.0, 400.0, 30.0, 12.0),
3694            tf("tographie", 50.0, 386.0, 60.0, 12.0),
3695        ];
3696        let paragraphs = extractor.merge_into_paragraphs(&lines);
3697        assert_eq!(paragraphs.len(), 1);
3698        assert_eq!(
3699            paragraphs[0].text, "Kryptographie",
3700            "hyphen elided, no newline inserted"
3701        );
3702    }
3703
3704    #[test]
3705    fn decode_pdf_string_utf16be_bom_decodes_fi_ligature() {
3706        let bytes = [0xFE, 0xFF, 0x00, 0x66, 0x00, 0x69];
3707        assert_eq!(super::decode_pdf_string(&bytes), "fi");
3708    }
3709
3710    #[test]
3711    fn decode_pdf_string_ascii_pdfdocencoding_passthrough() {
3712        let bytes = b"page 12";
3713        assert_eq!(super::decode_pdf_string(bytes), "page 12");
3714    }
3715
3716    #[test]
3717    fn decode_pdf_string_empty_input_returns_empty() {
3718        assert_eq!(super::decode_pdf_string(&[]), "");
3719    }
3720
3721    #[test]
3722    fn decode_pdf_string_lone_bom_returns_empty() {
3723        // BOM only, no code units after.
3724        assert_eq!(super::decode_pdf_string(&[0xFE, 0xFF]), "");
3725    }
3726
3727    #[test]
3728    fn resolve_props_extracts_integer_mcid() {
3729        use crate::parser::content::{MarkedContentProps, MarkedContentValue};
3730        use std::collections::HashMap;
3731        let mut map = HashMap::new();
3732        map.insert("MCID".to_string(), MarkedContentValue::Integer(7));
3733        let props = MarkedContentProps::Inline(map);
3734
3735        let (mcid, actual) = super::resolve_props(&props, None);
3736        assert_eq!(mcid, Some(7));
3737        assert_eq!(actual, None);
3738    }
3739
3740    #[test]
3741    fn resolve_props_decodes_utf16be_actualtext() {
3742        use crate::parser::content::{MarkedContentProps, MarkedContentValue};
3743        use std::collections::HashMap;
3744        let mut map = HashMap::new();
3745        map.insert(
3746            "ActualText".to_string(),
3747            MarkedContentValue::String(vec![0xFE, 0xFF, 0x00, 0x66, 0x00, 0x69]),
3748        );
3749        let props = MarkedContentProps::Inline(map);
3750
3751        let (mcid, actual) = super::resolve_props(&props, None);
3752        assert_eq!(mcid, None);
3753        assert_eq!(actual.as_deref(), Some("fi"));
3754    }
3755
3756    #[test]
3757    fn resolve_props_returns_none_for_unresolvable_resource_ref() {
3758        use crate::parser::content::MarkedContentProps;
3759        let props = MarkedContentProps::ResourceRef("PropsName".to_string());
3760        let (mcid, actual) = super::resolve_props(&props, None);
3761        assert_eq!((mcid, actual), (None, None));
3762    }
3763
3764    #[test]
3765    fn resolve_props_negative_mcid_rejected() {
3766        use crate::parser::content::{MarkedContentProps, MarkedContentValue};
3767        use std::collections::HashMap;
3768        // MCID is unsigned per ISO 32000-1; negative integer is malformed.
3769        let mut map = HashMap::new();
3770        map.insert("MCID".to_string(), MarkedContentValue::Integer(-1));
3771        let props = MarkedContentProps::Inline(map);
3772
3773        let (mcid, _) = super::resolve_props(&props, None);
3774        assert_eq!(mcid, None);
3775    }
3776
3777    #[test]
3778    fn resolve_props_resource_ref_overflow_mcid_rejected() {
3779        // ISO 32000-1 §14.7.4: MCID is an unsigned 32-bit integer. A
3780        // PdfObject::Integer holds an i64, so a malformed PDF can carry an
3781        // out-of-range MCID. The ResourceRef path must reject those rather
3782        // than wrap silently via `as u32`. Mirrors the Inline-path guard
3783        // already covered by `resolve_props_negative_mcid_rejected`.
3784        use crate::parser::content::MarkedContentProps;
3785        use crate::parser::objects::{PdfDictionary, PdfObject};
3786
3787        let mut inner = PdfDictionary::new();
3788        inner.insert("MCID".to_string(), PdfObject::Integer(i64::MAX));
3789
3790        let mut properties = PdfDictionary::new();
3791        properties.insert("PropsName".to_string(), PdfObject::Dictionary(inner));
3792
3793        let props = MarkedContentProps::ResourceRef("PropsName".to_string());
3794        let (mcid, _) = super::resolve_props(&props, Some(&properties));
3795        assert_eq!(mcid, None);
3796    }
3797}
oxidize_pdf/text/extraction.rs

oxidize_pdf/text/
extraction.rs