pdfluent_extract/
text.rs

1//! Text extraction with character-level position tracking.
2//!
3//! Parses content stream text operators (Tj, TJ, Tm, Td, TD, T*, Tc, Tw, Tz, TL, Ts, ', ")
4//! to extract text with positional information.
5
6use crate::error::{ExtractError, Result};
7use lopdf::content::{Content, Operation};
8use lopdf::{Document, Object, ObjectId};
9use std::collections::HashMap;
10use std::sync::OnceLock;
11
/// Fallback glyph width, expressed as a fraction of the font size, for glyphs
/// that have no font metric to measure against.
const APPROX_CHAR_WIDTH: f64 = 0.5;
14
/// Provenance of a text span's width: measured from the font, or guessed.
#[derive(Clone, Copy, Debug, Default, PartialEq, Eq)]
pub enum WidthSource {
    /// The width comes from real glyph advance data in the font
    /// (`/Widths`, hmtx, CFF).
    Metric,
    /// No glyph metric was available; the width is the
    /// `APPROX_CHAR_WIDTH × font_size` estimate. This is the default.
    #[default]
    Estimate,
}
24
25/// A block of text extracted from a page.
26#[derive(Debug, Clone, Default)]
27pub struct TextBlock {
28    /// The extracted text content.
29    pub text: String,
30    /// The page number (1-based).
31    pub page: u32,
32    /// Bounding box [x0, y0, x1, y1] in PDF coordinates.
33    pub bbox: [f64; 4],
34    /// Font resource name (e.g. "F1") — preserved for backward compatibility.
35    pub font_name: String,
36    /// Font size in points.
37    pub font_size: f64,
38    /// `/ActualText` override from a surrounding marked-content sequence (BDC),
39    /// if any. PDF/UA-compliant authors use this to provide the canonical
40    /// reading-order text for ligatures or other glyph clusters whose visual
41    /// `text` does not match the intended characters.
42    pub actual_text: Option<String>,
43
44    // ---- G1 read-only metadata (added 2026-05; backward-compatible) ----
45    /// `BaseFont` of the active font dict with any 6-character subset prefix
46    /// stripped (e.g. `Helvetica-Bold`). `None` when the font dict has no
47    /// `BaseFont` entry or could not be resolved.
48    pub base_font: Option<String>,
49    /// Inferred bold style. Set when any of:
50    ///   - `FontDescriptor /FontWeight >= 700`
51    ///   - `FontDescriptor /Flags` ForceBold bit (bit 19) is set
52    ///   - the (subset-stripped) BaseFont name contains "bold", "demi",
53    ///     "semibold", "heavy", or "black"
54    pub is_bold: bool,
55    /// Inferred italic style. Set when any of:
56    ///   - `FontDescriptor /Flags` Italic bit (bit 7) is set
57    ///   - the BaseFont name contains "italic", "oblique", or "slant"
58    pub is_italic: bool,
59    /// Active non-stroking fill color as sRGB RGBA at the moment the show
60    /// operator emitted this block. `None` for color spaces we cannot map
61    /// without rendering (ICC, Lab, Pattern, Separation, DeviceN).
62    pub color: Option<[u8; 4]>,
63
64    // ---- G2 glyph-level metrics ----
65    /// Whether glyph widths were measured from real font advance data or estimated.
66    pub width_source: WidthSource,
67    /// Per-glyph bounding boxes in PDF user-space coordinates, one entry per
68    /// source glyph (not per Unicode scalar — ligatures count as one).
69    /// `[x0, y0, x1, y1]` where y0 < y1 (PDF coordinate system, y increasing up).
70    pub char_bounds: Vec<[f64; 4]>,
71}
72
/// One character together with where it sits on its page.
#[derive(Clone, Debug)]
pub struct PositionedChar {
    /// The character itself.
    pub ch: char,
    /// 1-based page number.
    pub page: u32,
    /// Bounding box in PDF coordinates, laid out as `[x0, y0, x1, y1]`.
    pub bbox: [f64; 4],
}
83
/// Snapshot of the graphics state pushed by `q` and popped by `Q`.
#[derive(Clone, Debug)]
struct GraphicsState {
    /// Current transformation matrix at the time of the snapshot.
    ctm: [f64; 6],
    /// G1: non-stroking fill colour captured at `q`.
    fill_color: Option<[u8; 4]>,
}
91
92/// Internal text state tracker.
93#[derive(Debug, Clone)]
94struct TextState {
95    /// Text matrix.
96    tm: [f64; 6],
97    /// Text line matrix.
98    tlm: [f64; 6],
99    /// Current font name.
100    font_name: String,
101    /// Current font size.
102    font_size: f64,
103    /// Character spacing (Tc).
104    tc: f64,
105    /// Word spacing (Tw).
106    tw: f64,
107    /// Horizontal scaling (Tz), as a percentage.
108    th: f64,
109    /// Text leading (TL).
110    tl: f64,
111    /// Text rise (Ts).
112    ts: f64,
113    /// Graphics state stack.
114    gs_stack: Vec<GraphicsState>,
115    /// Current transformation matrix.
116    ctm: [f64; 6],
117    /// G1: active non-stroking fill color as sRGB RGBA.
118    /// PDF §8.6.5.3 initial value: DeviceGray 0.0 (black, fully opaque).
119    /// `None` once an unsupported color space takes over (ICC, Lab,
120    /// Pattern, Separation, DeviceN) until a supported setter re-establishes
121    /// a concrete value.
122    fill_color: Option<[u8; 4]>,
123}
124
125impl Default for TextState {
126    fn default() -> Self {
127        Self {
128            tm: [1.0, 0.0, 0.0, 1.0, 0.0, 0.0],
129            tlm: [1.0, 0.0, 0.0, 1.0, 0.0, 0.0],
130            font_name: String::new(),
131            font_size: 12.0,
132            tc: 0.0,
133            tw: 0.0,
134            th: 100.0,
135            tl: 0.0,
136            ts: 0.0,
137            gs_stack: Vec::new(),
138            ctm: [1.0, 0.0, 0.0, 1.0, 0.0, 0.0],
139            fill_color: Some([0, 0, 0, 255]),
140        }
141    }
142}
143
/// Everything the extractor needs to know about one font in order to decode
/// its strings and measure its glyphs.
#[derive(Clone)]
struct FontInfo {
    /// Set when the font uses 2-byte CID encoding (Identity-H/V or another
    /// CID CMap).
    is_cid: bool,
    /// ToUnicode CMap, mapping character codes to Unicode strings. The key
    /// is a 2-byte big-endian value for CID fonts and a 1-byte code for
    /// simple fonts.
    to_unicode: HashMap<u32, String>,
    /// Code → char table for simple fonts, derived from BaseEncoding plus
    /// Differences. Indexed by byte code (0..255); an entry holds the
    /// Unicode char when it is known.
    encoding_map: [Option<char>; 256],
    /// Flags the codes whose `Differences` glyph name is `ct`.
    /// There is no precomposed Unicode codepoint for the `ct` ligature, so
    /// `encoding_map` cannot hold it as a `char`. For a flagged code,
    /// decoding emits a single private-use marker scalar and records where
    /// that marker came from, which keeps the glyph advance one codepoint
    /// wide. Only the ligature-decomposition layer expands the marker to
    /// "ct", so genuine U+E007 values delivered by ToUnicode CMaps are not
    /// clobbered.
    ct_codes: [bool; 256],
    /// G1: `BaseFont` of the font dictionary, subset prefix removed.
    base_font: Option<String>,
    /// G1: inferred bold style, taken from the FontDescriptor (Flags
    /// ForceBold bit 19, FontWeight ≥ 700) or from BaseFont name heuristics.
    is_bold: bool,
    /// G1: inferred italic style, taken from the FontDescriptor (Flags
    /// Italic bit 7) or from BaseFont name heuristics.
    is_italic: bool,

    // ---- G2: per-code advance widths (in 1/1000 em units) ----
    /// Simple fonts: advance width in glyph units (1/1000 em) for each
    /// char code 0–255, built from `FirstChar` and the `/Widths` array.
    /// All entries are `None` when the font dictionary lacks `/Widths` or
    /// its data is malformed.
    simple_widths: Box<[Option<f32>; 256]>,
    /// Default CID glyph width (PDF §9.7.4.3), used for CIDs that have no
    /// explicit entry in `cid_widths`. The PDF default is 1000.
    cid_default_width: f32,
    /// Type0 fonts: CID → advance width in glyph units, built from the `/W`
    /// array of the first DescendantFont.
    cid_widths: HashMap<u32, f32>,
}
184
/// Strip a font-subset tag (e.g. `AAAAAA+Helvetica` → `Helvetica`).
///
/// A subset tag is exactly six uppercase ASCII letters followed by `+`
/// (PDF 1.7 §9.6.4). Names whose text before the first `+` is anything else
/// (wrong length, lowercase letters, digits) are returned unchanged, so a
/// regular font name that merely contains a `+` is not truncated.
fn strip_subset_prefix(name: &str) -> &str {
    match name.split_once('+') {
        Some((tag, rest)) if tag.len() == 6 && tag.bytes().all(|b| b.is_ascii_uppercase()) => {
            rest
        }
        _ => name,
    }
}
192
/// Style bits of the Font Descriptor `/Flags` entry (PDF 1.7 §9.8.2).
/// The spec numbers bits starting at 1, so Italic (bit 7) and ForceBold
/// (bit 19) correspond to the masks 0x40 and 0x40000 of a 0-indexed u32.
const FONT_FLAG_ITALIC: u32 = 0x40;
const FONT_FLAG_FORCE_BOLD: u32 = 0x4_0000;
198
199/// Derive `(base_font, is_bold, is_italic)` from a font dict + its descriptor.
200/// All three values are best-effort; missing data is the documented fallback
201/// per ROUND1_API_DESIGN.md.
202fn derive_font_style(doc: &Document, font: &lopdf::Dictionary) -> (Option<String>, bool, bool) {
203    let base_font_raw = font.get(b"BaseFont").ok().and_then(|o| match o {
204        Object::Name(n) => Some(String::from_utf8_lossy(n).to_string()),
205        _ => None,
206    });
207    let base_font = base_font_raw
208        .as_deref()
209        .map(|s| strip_subset_prefix(s).to_string());
210
211    let (name_bold, name_italic) = base_font
212        .as_deref()
213        .map(name_style_hints)
214        .unwrap_or((false, false));
215
216    let descriptor = resolve_dict(doc, font, b"FontDescriptor");
217    let (desc_bold, desc_italic) = descriptor
218        .as_ref()
219        .map(|d| {
220            let weight_bold = d
221                .get(b"FontWeight")
222                .ok()
223                .and_then(|o| match o {
224                    Object::Integer(i) => Some(*i as u32),
225                    Object::Real(r) => Some(*r as u32),
226                    _ => None,
227                })
228                .is_some_and(|w| w >= 700);
229            let flags = d
230                .get(b"Flags")
231                .ok()
232                .and_then(|o| match o {
233                    Object::Integer(i) => Some(*i as u32),
234                    Object::Real(r) => Some(*r as u32),
235                    _ => None,
236                })
237                .unwrap_or(0);
238            let flag_italic = (flags & FONT_FLAG_ITALIC) != 0;
239            let flag_force_bold = (flags & FONT_FLAG_FORCE_BOLD) != 0;
240            (weight_bold || flag_force_bold, flag_italic)
241        })
242        .unwrap_or((false, false));
243
244    (
245        base_font,
246        desc_bold || name_bold,
247        desc_italic || name_italic,
248    )
249}
250
/// Map a PDF colour component from the unit interval onto a `u8` channel.
/// Values outside [0.0, 1.0] are clamped first; the result is rounded to the
/// nearest integer.
fn clamp_unit_to_u8(v: f64) -> u8 {
    let unit = v.clamp(0.0, 1.0);
    let scaled = (unit * 255.0).round();
    scaled as u8
}
255
256/// Convert DeviceCMYK to sRGB via the standard non-color-managed formula.
257/// Accurate enough for editor UI; full ICC-managed paths render through
258/// the engine and use `moxcms`.
259fn cmyk_to_rgba(c: f64, m: f64, y: f64, k: f64) -> [u8; 4] {
260    let c = c.clamp(0.0, 1.0);
261    let m = m.clamp(0.0, 1.0);
262    let y = y.clamp(0.0, 1.0);
263    let k = k.clamp(0.0, 1.0);
264    let r = (1.0 - c) * (1.0 - k);
265    let g = (1.0 - m) * (1.0 - k);
266    let b = (1.0 - y) * (1.0 - k);
267    [
268        clamp_unit_to_u8(r),
269        clamp_unit_to_u8(g),
270        clamp_unit_to_u8(b),
271        255,
272    ]
273}
274
/// Guess `(bold, italic)` from a PostScript / BaseFont name by looking for
/// well-known style keywords, case-insensitively. Matches
/// `pdf-interpret::FallbackFontQuery::new` rules.
fn name_style_hints(name: &str) -> (bool, bool) {
    const BOLD_MARKERS: [&str; 5] = ["bold", "demi", "semibold", "heavy", "black"];
    const ITALIC_MARKERS: [&str; 3] = ["italic", "oblique", "slant"];

    let folded = name.to_ascii_lowercase();
    let mentions_any = |markers: &[&str]| markers.iter().any(|marker| folded.contains(*marker));

    let is_bold = mentions_any(&BOLD_MARKERS);
    let is_italic = mentions_any(&ITALIC_MARKERS);
    (is_bold, is_italic)
}
287
288/// Look up the glyph advance width for `char_code` in `fi`, returning
289/// `Some(width_in_glyph_units)` (1/1000 em) when font metric data is
290/// present, or `None` when only the 50%-em estimate is available.
291///
292/// For CID fonts the 2-byte char code is passed as-is (big-endian u32).
293fn font_glyph_advance(fi: &FontInfo, char_code: u32) -> Option<f32> {
294    if fi.is_cid {
295        fi.cid_widths
296            .get(&char_code)
297            .copied()
298            .or(if fi.cid_default_width > 0.0 {
299                Some(fi.cid_default_width)
300            } else {
301                None
302            })
303    } else {
304        fi.simple_widths.get(char_code as usize).and_then(|w| *w)
305    }
306}
307
308/// Returns `true` when `fi` has at least one real glyph-width entry so we
309/// can set `WidthSource::Metric` on the resulting block.
310fn fi_has_metrics(fi: &FontInfo) -> bool {
311    if fi.is_cid {
312        // DW is always a real value from the PDF (default 1000 when missing).
313        true
314    } else {
315        fi.simple_widths.iter().any(|w| w.is_some())
316    }
317}
318
319/// Compute per-glyph advance widths (in user space) for a raw PDF byte string.
320///
321/// Returns `(advances, width_source)` where `advances` contains one entry per
322/// source glyph (1-byte for simple fonts, 2-byte pairs for CID).  Each entry
323/// already factors in `Tz` (horizontal scaling) and `Tc`/`Tw` spacing.
324///
325/// The caller is responsible for applying TJ kerning offsets *between* calls.
326fn compute_glyph_advances(
327    bytes: &[u8],
328    fi: &FontInfo,
329    font_size: f64,
330    tc: f64,
331    tw: f64,
332    th: f64,
333) -> (Vec<f64>, WidthSource) {
334    let has_metrics = fi_has_metrics(fi);
335    let width_source = if has_metrics {
336        WidthSource::Metric
337    } else {
338        WidthSource::Estimate
339    };
340    let approx = APPROX_CHAR_WIDTH * 1000.0; // fallback in glyph units
341
342    let mut advances = Vec::new();
343
344    if fi.is_cid {
345        let mut i = 0;
346        while i + 1 < bytes.len() {
347            let code = u16::from_be_bytes([bytes[i], bytes[i + 1]]) as u32;
348            i += 2;
349            let glyph_w = font_glyph_advance(fi, code).unwrap_or(approx as f32);
350            let adv = glyph_w as f64 / 1000.0 * font_size * (th / 100.0) + tc;
351            advances.push(adv);
352        }
353    } else {
354        for &b in bytes {
355            let glyph_w = if has_metrics {
356                font_glyph_advance(fi, b as u32).unwrap_or(approx as f32)
357            } else {
358                approx as f32
359            };
360            let extra_word = if b == 0x20 { tw } else { 0.0 };
361            let adv = glyph_w as f64 / 1000.0 * font_size * (th / 100.0) + tc + extra_word;
362            advances.push(adv);
363        }
364    }
365
366    (advances, width_source)
367}
368
369/// Compact text-state parameters passed into the char-bounds helpers.
370struct GlyphCtx {
371    font_size: f64,
372    tc: f64,
373    tw: f64,
374    th: f64,
375}
376
377/// Build char bounds for a Tj string. Returns `(char_bounds, width_source, total_advance)`.
378fn char_bounds_from_bytes(
379    bytes: &[u8],
380    fi: &FontInfo,
381    x: f64,
382    y: f64,
383    ctx: &GlyphCtx,
384) -> (Vec<[f64; 4]>, WidthSource, f64) {
385    let (advances, ws) = compute_glyph_advances(bytes, fi, ctx.font_size, ctx.tc, ctx.tw, ctx.th);
386    let mut cx = x;
387    let mut bounds = Vec::with_capacity(advances.len());
388    for adv in &advances {
389        bounds.push([cx, y, cx + adv, y + ctx.font_size]);
390        cx += adv;
391    }
392    (bounds, ws, cx - x)
393}
394
395/// Build char bounds for a TJ array, returning `(char_bounds, width_source, x_end)`.
396///
397/// TJ items are either strings (glyph runs) or numbers (kerning offsets in
398/// thousandths of an em).  Kerning offsets shift the current position and are
399/// NOT included as separate char_bounds entries.
400fn char_bounds_from_tj_array(
401    arr: &[Object],
402    fi: &FontInfo,
403    x_start: f64,
404    y: f64,
405    ctx: &GlyphCtx,
406) -> (Vec<[f64; 4]>, WidthSource, f64) {
407    let font_size = ctx.font_size;
408    let tc = ctx.tc;
409    let tw = ctx.tw;
410    let th = ctx.th;
411    let has_metrics = fi_has_metrics(fi);
412    let width_source = if has_metrics {
413        WidthSource::Metric
414    } else {
415        WidthSource::Estimate
416    };
417    let approx = APPROX_CHAR_WIDTH * 1000.0_f64;
418
419    let mut bounds = Vec::new();
420    let mut cx = x_start;
421
422    for item in arr {
423        match item {
424            Object::String(bytes, _) => {
425                if fi.is_cid {
426                    let mut i = 0;
427                    while i + 1 < bytes.len() {
428                        let code = u16::from_be_bytes([bytes[i], bytes[i + 1]]) as u32;
429                        i += 2;
430                        let glyph_w = font_glyph_advance(fi, code).unwrap_or(approx as f32) as f64;
431                        let adv = glyph_w / 1000.0 * font_size * (th / 100.0) + tc;
432                        bounds.push([cx, y, cx + adv, y + font_size]);
433                        cx += adv;
434                    }
435                } else {
436                    for &b in bytes.iter() {
437                        let glyph_w = if has_metrics {
438                            font_glyph_advance(fi, b as u32).unwrap_or(approx as f32) as f64
439                        } else {
440                            approx
441                        };
442                        let extra_word = if b == 0x20 { tw } else { 0.0 };
443                        let adv = glyph_w / 1000.0 * font_size * (th / 100.0) + tc + extra_word;
444                        bounds.push([cx, y, cx + adv, y + font_size]);
445                        cx += adv;
446                    }
447                }
448            }
449            _ => {
450                if let Some(adj) = as_number(item) {
451                    // Negative shifts right (standard kerning), positive shifts left.
452                    cx -= adj / 1000.0 * font_size * (th / 100.0);
453                }
454            }
455        }
456    }
457
458    (bounds, width_source, cx)
459}
460
461/// Build a map from font resource name (e.g. "F1") to FontInfo for a page.
462fn build_font_map(doc: &Document, page_id: ObjectId) -> HashMap<String, FontInfo> {
463    let mut map = HashMap::new();
464
465    // Get the page's Resources dictionary (may be inherited from parent Pages node).
466    let resources = get_page_resources(doc, page_id);
467    let font_dict = match resources.and_then(|res| match res.get(b"Font").ok()? {
468        Object::Dictionary(d) => Some(d.clone()),
469        Object::Reference(r) => match doc.get_object(*r).ok()? {
470            Object::Dictionary(d) => Some(d.clone()),
471            _ => None,
472        },
473        _ => None,
474    }) {
475        Some(d) => d,
476        None => return map,
477    };
478
479    for (name_bytes, value) in font_dict.iter() {
480        let font_name = String::from_utf8_lossy(name_bytes).to_string();
481
482        // Resolve the font dictionary.
483        let font = match value {
484            Object::Reference(r) => match doc.get_object(*r).ok() {
485                Some(Object::Dictionary(d)) => d.clone(),
486                _ => continue,
487            },
488            Object::Dictionary(d) => d.clone(),
489            _ => continue,
490        };
491
492        let subtype = font
493            .get(b"Subtype")
494            .ok()
495            .and_then(|o| match o {
496                Object::Name(n) => Some(String::from_utf8_lossy(n).to_string()),
497                _ => None,
498            })
499            .unwrap_or_default();
500
501        let is_cid = subtype == "Type0";
502
503        // Parse ToUnicode CMap if present.
504        let to_unicode = parse_to_unicode_from_font(doc, &font);
505
506        // For Type0 fonts, also check DescendantFonts for ToUnicode.
507        let to_unicode = if to_unicode.is_empty() && is_cid {
508            if let Ok(Object::Array(descendants)) = font.get(b"DescendantFonts") {
509                descendants
510                    .iter()
511                    .find_map(|d| {
512                        let desc_dict = match d {
513                            Object::Reference(r) => match doc.get_object(*r).ok()? {
514                                Object::Dictionary(d) => d,
515                                _ => return None,
516                            },
517                            Object::Dictionary(d) => d,
518                            _ => return None,
519                        };
520                        let tu = parse_to_unicode_from_font(doc, desc_dict);
521                        if tu.is_empty() {
522                            None
523                        } else {
524                            Some(tu)
525                        }
526                    })
527                    .unwrap_or_default()
528            } else {
529                HashMap::new()
530            }
531        } else {
532            to_unicode
533        };
534
535        // Build encoding map for simple fonts (from Encoding + Differences).
536        let (encoding_map, ct_codes) = if !is_cid {
537            build_encoding_map(doc, &font)
538        } else {
539            ([None; 256], [false; 256])
540        };
541
542        let (base_font, is_bold, is_italic) = derive_font_style(doc, &font);
543
544        let (simple_widths, cid_default_width, cid_widths) = parse_font_widths(doc, &font, is_cid);
545
546        map.insert(
547            font_name,
548            FontInfo {
549                is_cid,
550                to_unicode,
551                encoding_map,
552                ct_codes,
553                base_font,
554                is_bold,
555                is_italic,
556                simple_widths,
557                cid_default_width,
558                cid_widths,
559            },
560        );
561    }
562
563    map
564}
565
566/// Parse glyph advance widths from a font dictionary.
567///
568/// Returns `(simple_widths, cid_default_width, cid_widths)`.
569/// - `simple_widths`: per-code widths for simple fonts (1/1000 em; code 0–255).
570/// - `cid_default_width`: `/DW` from the DescendantFont (default 1000).
571/// - `cid_widths`: CID→width map from `/W` array of the DescendantFont.
572fn parse_font_widths(
573    doc: &Document,
574    font: &lopdf::Dictionary,
575    is_cid: bool,
576) -> (Box<[Option<f32>; 256]>, f32, HashMap<u32, f32>) {
577    if is_cid {
578        let (dw, cid_map) = parse_cid_widths(doc, font);
579        return (Box::new([None; 256]), dw, cid_map);
580    }
581
582    // Simple font: /FirstChar + /Widths
583    let first_char = match font.get(b"FirstChar").ok() {
584        Some(Object::Integer(n)) => *n as usize,
585        _ => return (Box::new([None; 256]), 1000.0, HashMap::new()),
586    };
587
588    let widths_arr = match font.get(b"Widths").ok() {
589        Some(Object::Array(a)) => a.clone(),
590        Some(Object::Reference(r)) => match doc.get_object(*r).ok() {
591            Some(Object::Array(a)) => a.clone(),
592            _ => return (Box::new([None; 256]), 1000.0, HashMap::new()),
593        },
594        _ => return (Box::new([None; 256]), 1000.0, HashMap::new()),
595    };
596
597    let mut simple_widths = Box::new([None::<f32>; 256]);
598    for (i, obj) in widths_arr.iter().enumerate() {
599        let code = first_char + i;
600        if code > 255 {
601            break;
602        }
603        let w = match obj {
604            Object::Integer(n) => *n as f32,
605            Object::Real(n) => *n,
606            _ => continue,
607        };
608        simple_widths[code] = Some(w);
609    }
610
611    (simple_widths, 1000.0, HashMap::new())
612}
613
614/// Parse the `/W` array (run-length CID width encoding) and `/DW` from a
615/// Type0 font's DescendantFonts array.
616fn parse_cid_widths(doc: &Document, font: &lopdf::Dictionary) -> (f32, HashMap<u32, f32>) {
617    let descendants = match font.get(b"DescendantFonts").ok() {
618        Some(Object::Array(a)) => a.clone(),
619        Some(Object::Reference(r)) => match doc.get_object(*r).ok() {
620            Some(Object::Array(a)) => a.clone(),
621            _ => return (1000.0, HashMap::new()),
622        },
623        _ => return (1000.0, HashMap::new()),
624    };
625
626    let desc_dict = match descendants.first() {
627        Some(Object::Reference(r)) => match doc.get_object(*r).ok() {
628            Some(Object::Dictionary(d)) => d.clone(),
629            _ => return (1000.0, HashMap::new()),
630        },
631        Some(Object::Dictionary(d)) => d.clone(),
632        _ => return (1000.0, HashMap::new()),
633    };
634
635    let dw = match desc_dict.get(b"DW").ok() {
636        Some(Object::Integer(n)) => *n as f32,
637        Some(Object::Real(n)) => *n,
638        _ => 1000.0,
639    };
640
641    let w_arr = match desc_dict.get(b"W").ok() {
642        Some(Object::Array(a)) => a.clone(),
643        Some(Object::Reference(r)) => match doc.get_object(*r).ok() {
644            Some(Object::Array(a)) => a.clone(),
645            _ => return (dw, HashMap::new()),
646        },
647        _ => return (dw, HashMap::new()),
648    };
649
650    // PDF §9.7.4.3  /W: [ c [w1 w2 …] | c1 c2 w | … ]
651    let mut map = HashMap::new();
652    let mut idx = 0;
653    while idx < w_arr.len() {
654        let cid_start = match &w_arr[idx] {
655            Object::Integer(n) => *n as u32,
656            _ => {
657                idx += 1;
658                continue;
659            }
660        };
661        idx += 1;
662        if idx >= w_arr.len() {
663            break;
664        }
665        match &w_arr[idx] {
666            Object::Array(widths) => {
667                // [ cid [w1 w2 …] ] — one width per consecutive CID
668                for (i, wobj) in widths.iter().enumerate() {
669                    let w = match wobj {
670                        Object::Integer(n) => *n as f32,
671                        Object::Real(n) => *n,
672                        _ => continue,
673                    };
674                    map.insert(cid_start + i as u32, w);
675                }
676                idx += 1;
677            }
678            Object::Integer(c2) => {
679                // [ cid_first cid_last w ] — same width for a range
680                let cid_end = *c2 as u32;
681                idx += 1;
682                if idx >= w_arr.len() {
683                    break;
684                }
685                let w = match &w_arr[idx] {
686                    Object::Integer(n) => *n as f32,
687                    Object::Real(n) => *n,
688                    _ => {
689                        idx += 1;
690                        continue;
691                    }
692                };
693                for cid in cid_start..=cid_end {
694                    map.insert(cid, w);
695                }
696                idx += 1;
697            }
698            _ => {
699                idx += 1;
700            }
701        }
702    }
703
704    (dw, map)
705}
706
707/// Parse the ToUnicode CMap from a font dictionary.
708fn parse_to_unicode_from_font(doc: &Document, font: &lopdf::Dictionary) -> HashMap<u32, String> {
709    let tu_obj = match font.get(b"ToUnicode").ok() {
710        Some(Object::Reference(r)) => doc.get_object(*r).ok(),
711        Some(obj) => Some(obj),
712        None => return HashMap::new(),
713    };
714
715    let stream_bytes = match tu_obj {
716        Some(Object::Stream(ref s)) => s
717            .decompressed_content()
718            .ok()
719            .unwrap_or_else(|| s.content.clone()),
720        _ => return HashMap::new(),
721    };
722
723    parse_to_unicode_cmap(&stream_bytes)
724}
725
726/// Parse a ToUnicode CMap stream into a code→Unicode mapping.
727///
728/// Handles both `beginbfchar` and `beginbfrange` sections.
729fn parse_to_unicode_cmap(data: &[u8]) -> HashMap<u32, String> {
730    let text = String::from_utf8_lossy(data);
731    let mut map = HashMap::new();
732
733    // Parse beginbfchar sections: <srcCode> <dstString>
734    for section in text.split("beginbfchar") {
735        let section = match section.split("endbfchar").next() {
736            Some(s) => s,
737            None => continue,
738        };
739        let tokens = extract_hex_tokens(section);
740        for pair in tokens.chunks(2) {
741            if pair.len() == 2 {
742                let code = parse_hex_u32(&pair[0]);
743                let unicode = hex_to_unicode_string(&pair[1]);
744                map.insert(code, unicode);
745            }
746        }
747    }
748
749    // Parse beginbfrange sections: <srcLo> <srcHi> <dstStart> or <srcLo> <srcHi> [<dst1> <dst2> ...]
750    for section in text.split("beginbfrange") {
751        let section = match section.split("endbfrange").next() {
752            Some(s) => s,
753            None => continue,
754        };
755
756        // Tokenize: extract hex tokens and array brackets
757        let mut chars = section.chars().peekable();
758        let mut tokens: Vec<String> = Vec::new();
759        let mut arrays: Vec<Vec<String>> = Vec::new();
760        let mut in_array = false;
761        let mut current_array: Vec<String> = Vec::new();
762
763        while let Some(&ch) = chars.peek() {
764            if ch == '<' {
765                chars.next();
766                let hex: String = chars
767                    .by_ref()
768                    .take_while(|&c| c != '>')
769                    .filter(|c| !c.is_whitespace())
770                    .collect();
771                if in_array {
772                    current_array.push(hex);
773                } else {
774                    tokens.push(hex);
775                }
776            } else if ch == '[' {
777                chars.next();
778                in_array = true;
779                current_array = Vec::new();
780            } else if ch == ']' {
781                chars.next();
782                in_array = false;
783                arrays.push(std::mem::take(&mut current_array));
784                tokens.push(String::new()); // placeholder for array position
785            } else {
786                chars.next();
787            }
788        }
789
790        // Process range entries: every 3 tokens = (lo, hi, dst_or_array)
791        let mut array_idx = 0;
792        let mut i = 0;
793        while i + 2 < tokens.len() {
794            let lo = parse_hex_u32(&tokens[i]);
795            let hi = parse_hex_u32(&tokens[i + 1]);
796
797            if tokens[i + 2].is_empty() {
798                // Array destination
799                if array_idx < arrays.len() {
800                    let arr = &arrays[array_idx];
801                    for (offset, dst) in arr.iter().enumerate() {
802                        let code = lo + offset as u32;
803                        if code <= hi {
804                            map.insert(code, hex_to_unicode_string(dst));
805                        }
806                    }
807                    array_idx += 1;
808                }
809            } else {
810                // Single start value — increment for each code in range
811                let dst_start = parse_hex_u32(&tokens[i + 2]);
812                let dst_len = tokens[i + 2].len();
813                for code in lo..=hi {
814                    let dst_val = dst_start + (code - lo);
815                    let s = if dst_len <= 4 {
816                        // BMP character
817                        char::from_u32(dst_val)
818                            .map(|c| c.to_string())
819                            .unwrap_or_default()
820                    } else {
821                        // Multi-byte: treat as UTF-16BE pairs
822                        let hex = format!("{:0>width$X}", dst_val, width = dst_len);
823                        hex_to_unicode_string(&hex)
824                    };
825                    map.insert(code, s);
826                }
827            }
828            i += 3;
829        }
830    }
831
832    map
833}
834
/// Collect the payload of every `<...>` token in `text`, in order of appearance.
///
/// Whitespace inside a token is dropped, so `<00 41>` yields `"0041"`. A `<`
/// seen while a token is already open restarts that token, and a `>` with no
/// open token is ignored. Payloads are not validated as hexadecimal.
fn extract_hex_tokens(text: &str) -> Vec<String> {
    let mut tokens = Vec::new();
    // `Some(buffer)` while we are between a `<` and its closing `>`.
    let mut open: Option<String> = None;

    for ch in text.chars() {
        match ch {
            '<' => open = Some(String::new()),
            '>' => {
                if let Some(finished) = open.take() {
                    tokens.push(finished);
                }
            }
            _ => {
                if let Some(buffer) = open.as_mut() {
                    if !ch.is_whitespace() {
                        buffer.push(ch);
                    }
                }
            }
        }
    }

    tokens
}
853
/// Interpret `hex` as a base-16 integer (e.g. `"0041"` → 65).
///
/// Anything `u32::from_str_radix` rejects — an empty string, a non-hex
/// character, a value above `u32::MAX` — yields 0 instead of an error.
fn parse_hex_u32(hex: &str) -> u32 {
    let parsed = u32::from_str_radix(hex, 16);
    parsed.unwrap_or_default()
}
858
/// Convert a hex string to a Unicode string (interpreting as UTF-16BE pairs).
///
/// The input is split into two-character groups, each parsed as one byte; a
/// trailing single character is parsed as a one-digit byte. Groups that are
/// not valid hex are skipped. The resulting bytes are then interpreted as:
///
/// - an even count ≥ 2: UTF-16BE code units, decoded lossily (unpaired
///   surrogates become U+FFFD);
/// - exactly one byte: the code point of that value;
/// - anything else (no bytes, or an odd count ≥ 3): the empty string.
///
/// The string is walked as raw bytes rather than sliced by byte offsets, so
/// input containing multi-byte (non-ASCII) characters is skipped like any
/// other invalid group instead of panicking on a char-boundary violation.
fn hex_to_unicode_string(hex: &str) -> String {
    let bytes: Vec<u8> = hex
        .as_bytes()
        .chunks(2)
        .filter_map(|group| {
            // A group that splits a multi-byte character is not valid UTF-8
            // and therefore cannot be hex either.
            let digits = std::str::from_utf8(group).ok()?;
            u8::from_str_radix(digits, 16).ok()
        })
        .collect();

    match bytes.as_slice() {
        [] => String::new(),
        [single] => char::from(*single).to_string(),
        _ => {
            let pairs = bytes.chunks_exact(2);
            if !pairs.remainder().is_empty() {
                // Odd byte count: not a whole number of UTF-16 code units.
                return String::new();
            }
            let units: Vec<u16> = pairs
                .map(|pair| u16::from_be_bytes([pair[0], pair[1]]))
                .collect();
            String::from_utf16_lossy(&units)
        }
    }
}
881
882/// Get the Resources dictionary for a page (resolves inheritance from parent Pages).
883fn get_page_resources(doc: &Document, page_id: ObjectId) -> Option<lopdf::Dictionary> {
884    let page = match doc.get_object(page_id).ok()? {
885        Object::Dictionary(d) => d.clone(),
886        _ => return None,
887    };
888
889    // Direct Resources on the page.
890    if let Some(res) = resolve_dict(doc, &page, b"Resources") {
891        return Some(res);
892    }
893
894    // Walk up the parent chain (Pages nodes) for inherited Resources.
895    let mut current = page;
896    for _ in 0..20 {
897        let parent_ref = match current.get(b"Parent").ok()? {
898            Object::Reference(r) => *r,
899            _ => break,
900        };
901        let parent = match doc.get_object(parent_ref).ok()? {
902            Object::Dictionary(d) => d.clone(),
903            _ => break,
904        };
905        if let Some(res) = resolve_dict(doc, &parent, b"Resources") {
906            return Some(res);
907        }
908        current = parent;
909    }
910
911    None
912}
913
914/// Resolve a dictionary entry that may be inline or an indirect reference.
915fn resolve_dict(doc: &Document, dict: &lopdf::Dictionary, key: &[u8]) -> Option<lopdf::Dictionary> {
916    match dict.get(key).ok()? {
917        Object::Dictionary(d) => Some(d.clone()),
918        Object::Reference(r) => match doc.get_object(*r).ok()? {
919            Object::Dictionary(d) => Some(d.clone()),
920            _ => None,
921        },
922        _ => None,
923    }
924}
925
926/// Build a 256-entry encoding map from a font's /Encoding dictionary.
927///
928/// Handles:
929/// - Named base encodings: WinAnsiEncoding, MacRomanEncoding, MacExpertEncoding
930/// - Differences arrays: `[code1 /name1 /name2 ... codeN /nameN ...]`
931/// - Glyph name → Unicode via AGL (Adobe Glyph List) lookup
932fn build_encoding_map(
933    doc: &Document,
934    font: &lopdf::Dictionary,
935) -> ([Option<char>; 256], [bool; 256]) {
936    let mut table = [None::<char>; 256];
937    let mut ct_codes = [false; 256];
938
939    let encoding = match font.get(b"Encoding").ok() {
940        Some(obj) => obj,
941        None => return (table, ct_codes),
942    };
943
944    match encoding {
945        Object::Name(name) => {
946            // Named encoding (e.g. "WinAnsiEncoding").
947            let name_str = String::from_utf8_lossy(name);
948            apply_base_encoding(&mut table, &name_str);
949        }
950        Object::Reference(r) => match doc.get_object(*r) {
951            Ok(Object::Dictionary(enc_dict)) => {
952                parse_encoding_dict(doc, enc_dict, &mut table, &mut ct_codes);
953            }
954            Ok(Object::Name(name)) => {
955                let name_str = String::from_utf8_lossy(name);
956                apply_base_encoding(&mut table, &name_str);
957            }
958            _ => {}
959        },
960        Object::Dictionary(enc_dict) => {
961            parse_encoding_dict(doc, enc_dict, &mut table, &mut ct_codes);
962        }
963        _ => {}
964    }
965
966    (table, ct_codes)
967}
968
969/// Parse an Encoding dictionary with optional BaseEncoding and Differences.
970fn parse_encoding_dict(
971    doc: &Document,
972    enc_dict: &lopdf::Dictionary,
973    table: &mut [Option<char>; 256],
974    ct_codes: &mut [bool; 256],
975) {
976    // Apply BaseEncoding first.
977    if let Ok(Object::Name(base)) = enc_dict.get(b"BaseEncoding") {
978        let base_str = String::from_utf8_lossy(base);
979        apply_base_encoding(table, &base_str);
980    }
981
982    // Apply Differences array: [code /name1 /name2 ... code /name3 ...]
983    let diffs = match enc_dict.get(b"Differences").ok() {
984        Some(Object::Array(arr)) => arr.clone(),
985        Some(Object::Reference(r)) => match doc.get_object(*r).ok() {
986            Some(Object::Array(arr)) => arr.clone(),
987            _ => return,
988        },
989        _ => return,
990    };
991
992    let mut code: Option<u32> = None;
993    for item in &diffs {
994        match item {
995            Object::Integer(n) => {
996                code = Some(*n as u32);
997            }
998            Object::Name(name) => {
999                if let Some(c) = code {
1000                    if c < 256 {
1001                        let glyph = String::from_utf8_lossy(name);
1002                        apply_glyph_name(&glyph, c as usize, table, ct_codes);
1003                    }
1004                    code = Some(c + 1);
1005                }
1006            }
1007            Object::Reference(r) => {
1008                // Indirect name reference (rare).
1009                if let Ok(Object::Name(name)) = doc.get_object(*r) {
1010                    if let Some(c) = code {
1011                        if c < 256 {
1012                            let glyph = String::from_utf8_lossy(name);
1013                            apply_glyph_name(&glyph, c as usize, table, ct_codes);
1014                        }
1015                        code = Some(c + 1);
1016                    }
1017                }
1018            }
1019            _ => {}
1020        }
1021    }
1022}
1023
1024/// Resolve a glyph name to either a single Unicode codepoint (stored in
1025/// `table`) or the `ct` ligature marker (recorded in `ct_codes`). The `ct`
1026/// ligature has no precomposed Unicode codepoint, so it cannot live in
1027/// `table`; it is tracked separately so that decoding can emit one internal
1028/// marker and preserve one-glyph text advance before later decomposition.
1029fn apply_glyph_name(
1030    glyph: &str,
1031    code: usize,
1032    table: &mut [Option<char>; 256],
1033    ct_codes: &mut [bool; 256],
1034) {
1035    if glyph == "ct" {
1036        // Clear any prior char mapping at this slot — the ct flag is the
1037        // sole source of truth for this code.
1038        table[code] = None;
1039        ct_codes[code] = true;
1040        return;
1041    }
1042    if let Some(ch) = glyph_name_to_unicode(glyph) {
1043        table[code] = Some(ch);
1044        ct_codes[code] = false;
1045    }
1046}
1047
1048/// Apply a named base encoding to the table.
1049fn apply_base_encoding(table: &mut [Option<char>; 256], name: &str) {
1050    let source = match name {
1051        "WinAnsiEncoding" => winansi_encoding(),
1052        "MacRomanEncoding" => mac_roman_encoding(),
1053        _ => return,
1054    };
1055    for (i, &ch) in source.iter().enumerate() {
1056        if ch != '\0' {
1057            table[i] = Some(ch);
1058        }
1059    }
1060}
1061
/// Lazily-built WinAnsiEncoding (cp1252) table, indexed by byte value.
///
/// Entries hold the Unicode character for each code; `'\0'` marks a code the
/// table leaves undefined (most control codes, 0x7F, and the unassigned
/// cp1252 slots 0x81, 0x8D, 0x8F, 0x90 and 0x9D).
fn winansi_encoding() -> &'static [char; 256] {
    static TABLE: OnceLock<[char; 256]> = OnceLock::new();
    TABLE.get_or_init(|| {
        std::array::from_fn(|code| match code {
            // Whitespace controls kept for common usage.
            0x09 => '\t',
            0x0A => '\n',
            0x0D => '\r',
            // cp1252 assignments in 0x80..=0x9F.
            0x80 => '\u{20AC}', // Euro sign
            0x82 => '\u{201A}', // Single low-9 quotation mark
            0x83 => '\u{0192}', // Latin small letter f with hook
            0x84 => '\u{201E}', // Double low-9 quotation mark
            0x85 => '\u{2026}', // Horizontal ellipsis
            0x86 => '\u{2020}', // Dagger
            0x87 => '\u{2021}', // Double dagger
            0x88 => '\u{02C6}', // Modifier letter circumflex accent
            0x89 => '\u{2030}', // Per mille sign
            0x8A => '\u{0160}', // Latin capital letter S with caron
            0x8B => '\u{2039}', // Single left-pointing angle quotation mark
            0x8C => '\u{0152}', // Latin capital ligature OE
            0x8E => '\u{017D}', // Latin capital letter Z with caron
            0x91 => '\u{2018}', // Left single quotation mark
            0x92 => '\u{2019}', // Right single quotation mark
            0x93 => '\u{201C}', // Left double quotation mark
            0x94 => '\u{201D}', // Right double quotation mark
            0x95 => '\u{2022}', // Bullet
            0x96 => '\u{2013}', // En dash
            0x97 => '\u{2014}', // Em dash
            0x98 => '\u{02DC}', // Small tilde
            0x99 => '\u{2122}', // Trade mark sign
            0x9A => '\u{0161}', // Latin small letter s with caron
            0x9B => '\u{203A}', // Single right-pointing angle quotation mark
            0x9C => '\u{0153}', // Latin small ligature oe
            0x9E => '\u{017E}', // Latin small letter z with caron
            0x9F => '\u{0178}', // Latin capital letter Y with diaeresis
            // Printable ASCII and the Latin-1 supplement are identity-mapped;
            // both ranges fit in a byte, so the cast cannot truncate.
            0x20..=0x7E | 0xA0..=0xFF => char::from(code as u8),
            _ => '\0',
        })
    })
}
1115
/// Lazily-built MacRomanEncoding table, indexed by byte value.
///
/// Printable ASCII plus tab, line feed and carriage return map to themselves,
/// codes 0x80..=0xFF map to the Unicode characters listed below, and every
/// other code is `'\0'` (undefined).
fn mac_roman_encoding() -> &'static [char; 256] {
    // Unicode values for codes 0x80..=0xFF, eight codes per row.
    const HIGH_HALF: [char; 128] = [
        // 0x80
        '\u{00C4}', '\u{00C5}', '\u{00C7}', '\u{00C9}', '\u{00D1}', '\u{00D6}', '\u{00DC}', '\u{00E1}',
        // 0x88
        '\u{00E0}', '\u{00E2}', '\u{00E4}', '\u{00E3}', '\u{00E5}', '\u{00E7}', '\u{00E9}', '\u{00E8}',
        // 0x90
        '\u{00EA}', '\u{00EB}', '\u{00ED}', '\u{00EC}', '\u{00EE}', '\u{00EF}', '\u{00F1}', '\u{00F3}',
        // 0x98
        '\u{00F2}', '\u{00F4}', '\u{00F6}', '\u{00F5}', '\u{00FA}', '\u{00F9}', '\u{00FB}', '\u{00FC}',
        // 0xA0
        '\u{2020}', '\u{00B0}', '\u{00A2}', '\u{00A3}', '\u{00A7}', '\u{2022}', '\u{00B6}', '\u{00DF}',
        // 0xA8
        '\u{00AE}', '\u{00A9}', '\u{2122}', '\u{00B4}', '\u{00A8}', '\u{2260}', '\u{00C6}', '\u{00D8}',
        // 0xB0
        '\u{221E}', '\u{00B1}', '\u{2264}', '\u{2265}', '\u{00A5}', '\u{00B5}', '\u{2202}', '\u{2211}',
        // 0xB8
        '\u{220F}', '\u{03C0}', '\u{222B}', '\u{00AA}', '\u{00BA}', '\u{03A9}', '\u{00E6}', '\u{00F8}',
        // 0xC0
        '\u{00BF}', '\u{00A1}', '\u{00AC}', '\u{221A}', '\u{0192}', '\u{2248}', '\u{2206}', '\u{00AB}',
        // 0xC8
        '\u{00BB}', '\u{2026}', '\u{00A0}', '\u{00C0}', '\u{00C3}', '\u{00D5}', '\u{0152}', '\u{0153}',
        // 0xD0
        '\u{2013}', '\u{2014}', '\u{201C}', '\u{201D}', '\u{2018}', '\u{2019}', '\u{00F7}', '\u{25CA}',
        // 0xD8
        '\u{00FF}', '\u{0178}', '\u{2044}', '\u{20AC}', '\u{2039}', '\u{203A}', '\u{FB01}', '\u{FB02}',
        // 0xE0
        '\u{2021}', '\u{00B7}', '\u{201A}', '\u{201E}', '\u{2030}', '\u{00C2}', '\u{00CA}', '\u{00C1}',
        // 0xE8
        '\u{00CB}', '\u{00C8}', '\u{00CD}', '\u{00CE}', '\u{00CF}', '\u{00CC}', '\u{00D3}', '\u{00D4}',
        // 0xF0
        '\u{F8FF}', '\u{00D2}', '\u{00DA}', '\u{00DB}', '\u{00D9}', '\u{0131}', '\u{02C6}', '\u{02DC}',
        // 0xF8
        '\u{00AF}', '\u{02D8}', '\u{02D9}', '\u{02DA}', '\u{00B8}', '\u{02DD}', '\u{02DB}', '\u{02C7}',
    ];

    static TABLE: OnceLock<[char; 256]> = OnceLock::new();
    TABLE.get_or_init(|| {
        let mut table = ['\0'; 256];
        // Identity-mapped codes: printable ASCII plus the usual whitespace controls.
        for byte in (0x20..=0x7Eu8).chain([b'\t', b'\n', b'\r']) {
            table[usize::from(byte)] = char::from(byte);
        }
        table[0x80..].copy_from_slice(&HIGH_HALF);
        table
    })
}
1156
1157/// Map an Adobe Glyph List (AGL) name to a Unicode character.
1158///
1159/// Handles:
1160/// - `uniXXXX` names (4+ hex digits after "uni")
1161/// - `uXXXX` / `uXXXXX` names
1162/// - Common AGL names (top ~250 entries covering >99% of real-world PDFs)
1163///
1164/// The `ct` ligature has no precomposed Unicode codepoint and is handled
1165/// separately via the `ct_codes` side-table on `FontInfo` — see
1166/// [`apply_glyph_name`] — rather than through any PUA sentinel here.
1167fn glyph_name_to_unicode(name: &str) -> Option<char> {
1168    // Handle uniXXXX / uXXXXX patterns.
1169    if name.starts_with("uni") && name.len() >= 7 {
1170        return u32::from_str_radix(&name[3..7], 16)
1171            .ok()
1172            .and_then(char::from_u32);
1173    }
1174    if name.starts_with('u') && name.len() >= 5 && name[1..].chars().all(|c| c.is_ascii_hexdigit())
1175    {
1176        return u32::from_str_radix(&name[1..], 16)
1177            .ok()
1178            .and_then(char::from_u32);
1179    }
1180
1181    if let Some(c) = agl_table().get(name).copied() {
1182        return Some(c);
1183    }
1184
1185    match name {
1186        "st" => Some('\u{FB06}'),
1187        "longst" => Some('\u{FB05}'),
1188        _ => None,
1189    }
1190}
1191
/// Lazily-built lookup from common Adobe Glyph List names to characters.
///
/// Covers printable ASCII, the Latin-1 accented letters, the Latin `f`
/// ligatures, common punctuation and symbols, superscripts/fractions, and a
/// handful of math glyphs. The map is built once on first use.
fn agl_table() -> &'static HashMap<&'static str, char> {
    // Single-letter glyph names map to themselves. Slicing this constant
    // gives each letter a `&'static str` key without 52 separate tuples.
    const LETTERS: &str = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz";

    // Every entry whose name is not a single ASCII letter.
    const NAMED: &[(&str, char)] = &[
        // ASCII punctuation and digits
        ("space", ' '),
        ("exclam", '!'),
        ("quotedbl", '"'),
        ("numbersign", '#'),
        ("dollar", '$'),
        ("percent", '%'),
        ("ampersand", '&'),
        ("quotesingle", '\''),
        ("parenleft", '('),
        ("parenright", ')'),
        ("asterisk", '*'),
        ("plus", '+'),
        ("comma", ','),
        ("hyphen", '-'),
        ("period", '.'),
        ("slash", '/'),
        ("zero", '0'),
        ("one", '1'),
        ("two", '2'),
        ("three", '3'),
        ("four", '4'),
        ("five", '5'),
        ("six", '6'),
        ("seven", '7'),
        ("eight", '8'),
        ("nine", '9'),
        ("colon", ':'),
        ("semicolon", ';'),
        ("less", '<'),
        ("equal", '='),
        ("greater", '>'),
        ("question", '?'),
        ("at", '@'),
        ("bracketleft", '['),
        ("backslash", '\\'),
        ("bracketright", ']'),
        ("asciicircum", '^'),
        ("underscore", '_'),
        ("grave", '`'),
        ("braceleft", '{'),
        ("bar", '|'),
        ("braceright", '}'),
        ("asciitilde", '~'),
        // Latin extended
        ("Agrave", '\u{00C0}'),
        ("Aacute", '\u{00C1}'),
        ("Acircumflex", '\u{00C2}'),
        ("Atilde", '\u{00C3}'),
        ("Adieresis", '\u{00C4}'),
        ("Aring", '\u{00C5}'),
        ("AE", '\u{00C6}'),
        ("Ccedilla", '\u{00C7}'),
        ("Egrave", '\u{00C8}'),
        ("Eacute", '\u{00C9}'),
        ("Ecircumflex", '\u{00CA}'),
        ("Edieresis", '\u{00CB}'),
        ("Igrave", '\u{00CC}'),
        ("Iacute", '\u{00CD}'),
        ("Icircumflex", '\u{00CE}'),
        ("Idieresis", '\u{00CF}'),
        ("Eth", '\u{00D0}'),
        ("Ntilde", '\u{00D1}'),
        ("Ograve", '\u{00D2}'),
        ("Oacute", '\u{00D3}'),
        ("Ocircumflex", '\u{00D4}'),
        ("Otilde", '\u{00D5}'),
        ("Odieresis", '\u{00D6}'),
        ("Ugrave", '\u{00D9}'),
        ("Uacute", '\u{00DA}'),
        ("Ucircumflex", '\u{00DB}'),
        ("Udieresis", '\u{00DC}'),
        ("Yacute", '\u{00DD}'),
        ("Thorn", '\u{00DE}'),
        ("germandbls", '\u{00DF}'),
        ("agrave", '\u{00E0}'),
        ("aacute", '\u{00E1}'),
        ("acircumflex", '\u{00E2}'),
        ("atilde", '\u{00E3}'),
        ("adieresis", '\u{00E4}'),
        ("aring", '\u{00E5}'),
        ("ae", '\u{00E6}'),
        ("ccedilla", '\u{00E7}'),
        ("egrave", '\u{00E8}'),
        ("eacute", '\u{00E9}'),
        ("ecircumflex", '\u{00EA}'),
        ("edieresis", '\u{00EB}'),
        ("igrave", '\u{00EC}'),
        ("iacute", '\u{00ED}'),
        ("icircumflex", '\u{00EE}'),
        ("idieresis", '\u{00EF}'),
        ("eth", '\u{00F0}'),
        ("ntilde", '\u{00F1}'),
        ("ograve", '\u{00F2}'),
        ("oacute", '\u{00F3}'),
        ("ocircumflex", '\u{00F4}'),
        ("otilde", '\u{00F5}'),
        ("odieresis", '\u{00F6}'),
        ("ugrave", '\u{00F9}'),
        ("uacute", '\u{00FA}'),
        ("ucircumflex", '\u{00FB}'),
        ("udieresis", '\u{00FC}'),
        ("yacute", '\u{00FD}'),
        ("thorn", '\u{00FE}'),
        ("ydieresis", '\u{00FF}'),
        // Ligatures and special
        ("fi", '\u{FB01}'),
        ("fl", '\u{FB02}'),
        ("ff", '\u{FB00}'),
        ("ffi", '\u{FB03}'),
        ("ffl", '\u{FB04}'),
        // Punctuation and symbols
        ("endash", '\u{2013}'),
        ("emdash", '\u{2014}'),
        ("bullet", '\u{2022}'),
        ("ellipsis", '\u{2026}'),
        ("quoteleft", '\u{2018}'),
        ("quoteright", '\u{2019}'),
        ("quotedblleft", '\u{201C}'),
        ("quotedblright", '\u{201D}'),
        ("quotesinglebase", '\u{201A}'),
        ("quotesinglbase", '\u{201A}'),
        ("quotedblbase", '\u{201E}'),
        ("dagger", '\u{2020}'),
        ("daggerdbl", '\u{2021}'),
        ("perthousand", '\u{2030}'),
        ("guilsinglleft", '\u{2039}'),
        ("guilsinglright", '\u{203A}'),
        ("guillemotleft", '\u{00AB}'),
        ("guillemotright", '\u{00BB}'),
        ("trademark", '\u{2122}'),
        ("copyright", '\u{00A9}'),
        ("registered", '\u{00AE}'),
        ("degree", '\u{00B0}'),
        ("plusminus", '\u{00B1}'),
        ("multiply", '\u{00D7}'),
        ("divide", '\u{00F7}'),
        ("fraction", '\u{2044}'),
        ("Euro", '\u{20AC}'),
        ("sterling", '\u{00A3}'),
        ("yen", '\u{00A5}'),
        ("cent", '\u{00A2}'),
        ("currency", '\u{00A4}'),
        ("section", '\u{00A7}'),
        ("paragraph", '\u{00B6}'),
        ("brokenbar", '\u{00A6}'),
        ("ordfeminine", '\u{00AA}'),
        ("ordmasculine", '\u{00BA}'),
        ("exclamdown", '\u{00A1}'),
        ("questiondown", '\u{00BF}'),
        ("logicalnot", '\u{00AC}'),
        ("mu", '\u{00B5}'),
        ("macron", '\u{00AF}'),
        ("acute", '\u{00B4}'),
        ("cedilla", '\u{00B8}'),
        ("dieresis", '\u{00A8}'),
        ("circumflex", '\u{02C6}'),
        ("tilde", '\u{02DC}'),
        ("caron", '\u{02C7}'),
        ("ring", '\u{02DA}'),
        ("breve", '\u{02D8}'),
        ("dotaccent", '\u{02D9}'),
        ("hungarumlaut", '\u{02DD}'),
        ("ogonek", '\u{02DB}'),
        ("nbspace", '\u{00A0}'),
        ("nonbreakingspace", '\u{00A0}'),
        ("softhyphen", '\u{00AD}'),
        ("periodcentered", '\u{00B7}'),
        ("middot", '\u{00B7}'),
        ("florin", '\u{0192}'),
        ("OE", '\u{0152}'),
        ("oe", '\u{0153}'),
        ("Scaron", '\u{0160}'),
        ("scaron", '\u{0161}'),
        ("Zcaron", '\u{017D}'),
        ("zcaron", '\u{017E}'),
        ("Ydieresis", '\u{0178}'),
        ("Lslash", '\u{0141}'),
        ("lslash", '\u{0142}'),
        ("Oslash", '\u{00D8}'),
        ("oslash", '\u{00F8}'),
        ("dotlessi", '\u{0131}'),
        // Superscripts / subscripts
        ("onesuperior", '\u{00B9}'),
        ("twosuperior", '\u{00B2}'),
        ("threesuperior", '\u{00B3}'),
        ("onequarter", '\u{00BC}'),
        ("onehalf", '\u{00BD}'),
        ("threequarters", '\u{00BE}'),
        // Math
        ("minus", '\u{2212}'),
        ("notequal", '\u{2260}'),
        ("lessequal", '\u{2264}'),
        ("greaterequal", '\u{2265}'),
        ("infinity", '\u{221E}'),
        ("partialdiff", '\u{2202}'),
        ("summation", '\u{2211}'),
        ("product", '\u{220F}'),
        ("integral", '\u{222B}'),
        ("radical", '\u{221A}'),
        ("approxequal", '\u{2248}'),
        ("Delta", '\u{0394}'),
        ("lozenge", '\u{25CA}'),
        ("pi", '\u{03C0}'),
        ("Omega", '\u{03A9}'),
    ];

    static TABLE: OnceLock<HashMap<&'static str, char>> = OnceLock::new();
    TABLE.get_or_init(|| {
        let mut map = HashMap::with_capacity(LETTERS.len() + NAMED.len());
        map.extend(
            LETTERS
                .char_indices()
                .map(|(at, letter)| (&LETTERS[at..at + 1], letter)),
        );
        map.extend(NAMED.iter().copied());
        map
    })
}
1457
/// Internal placeholder character emitted for a byte whose `ct_codes` entry
/// is set, because the `ct` ligature has no precomposed Unicode codepoint.
///
/// U+E007 lies in the Private Use Area, so the marker is only expanded to
/// `"ct"` when the accompanying origin flag is also set (see
/// [`decompose_ligature_char_with_origin`]); a U+E007 without that flag is
/// not treated as a `ct` ligature.
const CT_LIGATURE_MARKER: char = '\u{E007}';
1459
/// Text decoded from a PDF string, paired with a per-character flag saying
/// whether that character stands for a `ct` ligature glyph.
#[derive(Clone, Default)]
struct DecodedPdfString {
    /// Decoded characters, in string order.
    text: String,
    /// One flag per `char` of `text`, in the same order; every method below
    /// keeps the two in step.
    ct_origins: Vec<bool>,
}

impl DecodedPdfString {
    /// Wrap already-decoded text; none of its characters is flagged as `ct`.
    fn from_text(text: String) -> Self {
        let mut decoded = Self {
            ct_origins: Vec::new(),
            text,
        };
        decoded
            .ct_origins
            .resize(decoded.text.chars().count(), false);
        decoded
    }

    /// Append one character together with its `ct`-origin flag.
    fn push_char(&mut self, ch: char, is_ct_origin: bool) {
        self.ct_origins.push(is_ct_origin);
        self.text.push(ch);
    }

    /// Append every character of `s`, all flagged as not `ct`.
    fn push_str(&mut self, s: &str) {
        self.text.push_str(s);
        let grown = self.ct_origins.len() + s.chars().count();
        self.ct_origins.resize(grown, false);
    }

    /// Append another decoded string, keeping its flags.
    fn extend(&mut self, other: DecodedPdfString) {
        let DecodedPdfString { text, ct_origins } = other;
        self.text += &text;
        self.ct_origins.extend(ct_origins);
    }

    /// `true` when no text has been decoded.
    fn is_empty(&self) -> bool {
        self.text.is_empty()
    }

    /// Number of recorded flags, i.e. characters tracked so far.
    fn glyph_count(&self) -> usize {
        self.ct_origins.len()
    }

    /// Iterate over `(character, is_ct_origin)` pairs in order.
    fn iter(&self) -> impl Iterator<Item = (char, bool)> + '_ {
        self.ct_origins
            .iter()
            .zip(self.text.chars())
            .map(|(&flag, ch)| (ch, flag))
    }
}
1500
1501/// Decode a PDF string using the font's ToUnicode CMap (if available),
1502/// preserving origin metadata for internal `ct` ligature markers.
1503fn decode_pdf_string_with_font_marked(
1504    bytes: &[u8],
1505    font_info: Option<&FontInfo>,
1506) -> DecodedPdfString {
1507    // Check for UTF-16BE BOM first — always takes priority.
1508    if bytes.len() >= 2 && bytes[0] == 0xFE && bytes[1] == 0xFF {
1509        let chars: Vec<u16> = bytes[2..]
1510            .chunks(2)
1511            .filter_map(|chunk| {
1512                if chunk.len() == 2 {
1513                    Some(u16::from_be_bytes([chunk[0], chunk[1]]))
1514                } else {
1515                    None
1516                }
1517            })
1518            .collect();
1519        return DecodedPdfString::from_text(String::from_utf16_lossy(&chars));
1520    }
1521
1522    if let Some(info) = font_info {
1523        if info.is_cid && !info.to_unicode.is_empty() {
1524            // CID font: decode 2-byte codes via ToUnicode.
1525            let mut result = DecodedPdfString::default();
1526            let mut i = 0;
1527            while i + 1 < bytes.len() {
1528                let code = u16::from_be_bytes([bytes[i], bytes[i + 1]]) as u32;
1529                if let Some(s) = info.to_unicode.get(&code) {
1530                    result.push_str(s);
1531                } else {
1532                    // Fallback: try direct Unicode interpretation.
1533                    if let Some(ch) = char::from_u32(code) {
1534                        if !ch.is_control() || ch == ' ' || ch == '\t' || ch == '\n' {
1535                            result.push_char(ch, false);
1536                        }
1537                    }
1538                }
1539                i += 2;
1540            }
1541            return result;
1542        }
1543
1544        if !info.is_cid && !info.to_unicode.is_empty() {
1545            // Simple font with ToUnicode: decode 1-byte codes.
1546            let mut result = DecodedPdfString::default();
1547            for &b in bytes {
1548                if let Some(s) = info.to_unicode.get(&(b as u32)) {
1549                    result.push_str(s);
1550                } else if info.ct_codes[b as usize] {
1551                    result.push_char(CT_LIGATURE_MARKER, true);
1552                } else if let Some(ch) = info.encoding_map[b as usize] {
1553                    result.push_char(ch, false);
1554                } else {
1555                    let ch = b as char;
1556                    if is_printable_or_space(ch) {
1557                        result.push_char(ch, false);
1558                    }
1559                }
1560            }
1561            return result;
1562        }
1563
1564        // Simple font with encoding map but no ToUnicode.
1565        if !info.is_cid
1566            && (info.encoding_map.iter().any(|c| c.is_some()) || info.ct_codes.iter().any(|f| *f))
1567        {
1568            let mut result = DecodedPdfString::default();
1569            for &b in bytes {
1570                if info.ct_codes[b as usize] {
1571                    result.push_char(CT_LIGATURE_MARKER, true);
1572                } else if let Some(ch) = info.encoding_map[b as usize] {
1573                    result.push_char(ch, false);
1574                } else {
1575                    let ch = b as char;
1576                    if is_printable_or_space(ch) {
1577                        result.push_char(ch, false);
1578                    }
1579                }
1580            }
1581            return result;
1582        }
1583    }
1584
1585    // Fallback: PDFDocEncoding (ASCII + Latin-1), skipping control chars.
1586    let mut result = DecodedPdfString::default();
1587    for &b in bytes {
1588        let ch = b as char;
1589        if is_printable_or_space(ch) {
1590            result.push_char(ch, false);
1591        }
1592    }
1593    result
1594}
1595
/// Decide whether `ch` may be kept in extracted text.
///
/// Accepts tab, line feed, carriage return, and every character at or above
/// U+0020 (space). The remaining C0 control characters (NUL, BEL, ...) are
/// rejected because they corrupt XML output.
fn is_printable_or_space(ch: char) -> bool {
    matches!(ch, '\t' | '\n' | '\r') || ch >= ' '
}
1602
/// Whether to expand precomposed-ligature characters and known ligature
/// glyph names into their constituent ASCII characters during text
/// extraction.
///
/// Default-on so that text-search, copy-paste and accessibility flows see
/// the human-readable string ("office") rather than an opaque glyph cluster
/// ("o\u{FB03}ce", where U+FB03 is the `ffi` ligature).
///
/// Internal for now — the public surface stays unchanged; future
/// configuration would surface this on a higher-level type without breaking
/// the current API.
const LIGATURE_DECOMP: bool = true;
1611
/// Expand one of the seven Latin ligatures U+FB00..=U+FB06 into plain
/// letters. Every other character yields `None`, telling the caller to keep
/// the original codepoint.
///
/// The `ct` ligature is deliberately absent: it has no precomposed Unicode
/// codepoint and is represented by an internal marker plus origin metadata so
/// that it cannot collide with legitimate PUA values from ToUnicode CMaps.
fn decompose_ligature_char(c: char) -> Option<&'static str> {
    // Indexed by offset from U+FB00. U+FB05 (long s + t) and U+FB06 (st) both
    // become "st" for search/copy-paste purposes; the historical long-s
    // spelling is not preserved.
    const EXPANSIONS: [&str; 7] = ["ff", "fi", "fl", "ffi", "ffl", "st", "st"];

    let offset = (c as u32).checked_sub(0xFB00)?;
    EXPANSIONS.get(offset as usize).copied()
}
1633
1634fn decompose_ligature_char_with_origin(c: char, is_ct_origin: bool) -> Option<&'static str> {
1635    if is_ct_origin && c == CT_LIGATURE_MARKER {
1636        Some("ct")
1637    } else {
1638        decompose_ligature_char(c)
1639    }
1640}
1641
1642/// Decompose a single precomposed-ligature character into its constituent
1643/// string. Combines the hardcoded Latin set ([`decompose_ligature_char`])
1644/// with an NFKD fallback over the rest of the Alphabetic Presentation
1645/// Forms block (U+FB00..U+FB4F) — Hebrew, Arabic, Armenian forms — so
1646/// that string-level [`decompose_ligatures`] and the per-glyph
1647/// `push_glyph_positioned` path agree on what counts as a decomposable
1648/// glyph.
1649fn decompose_glyph_to_string(c: char) -> Option<String> {
1650    use unicode_normalization::UnicodeNormalization;
1651    if let Some(s) = decompose_ligature_char(c) {
1652        return Some(s.to_string());
1653    }
1654    if matches!(c, '\u{FB00}'..='\u{FB4F}') {
1655        let nfkd: String = c.nfkd().collect();
1656        // Some codepoints in this block have no compatibility decomposition
1657        // and would round-trip back as themselves; only treat them as
1658        // decomposable when NFKD actually maps them, including legitimate
1659        // single-codepoint mappings such as Hebrew presentation forms.
1660        if nfkd != c.to_string() {
1661            return Some(nfkd);
1662        }
1663    }
1664    None
1665}
1666
1667/// Apply ligature decomposition to a string. Hardcoded mappings cover the
1668/// six common Latin ligatures (fi, fl, ffi, ffl, ff, st). For any other
1669/// character in the Alphabetic Presentation Forms block (U+FB00..U+FB4F)
1670/// not covered above — Hebrew, Arabic, Armenian forms — apply NFKD locally
1671/// to that character only. NFKD is *not* applied globally to avoid breaking
1672/// legitimate composed accented characters elsewhere in the string.
1673fn decompose_ligatures(s: &str) -> String {
1674    let mut out = String::with_capacity(s.len());
1675    for c in s.chars() {
1676        if let Some(replacement) = decompose_glyph_to_string(c) {
1677            out.push_str(&replacement);
1678        } else {
1679            out.push(c);
1680        }
1681    }
1682    out
1683}
1684
1685fn decompose_decoded_ligatures(s: &DecodedPdfString) -> String {
1686    let mut out = String::with_capacity(s.text.len());
1687    for (c, is_ct_origin) in s.iter() {
1688        if let Some(replacement) = decompose_ligature_char_with_origin(c, is_ct_origin) {
1689            out.push_str(replacement);
1690        } else if let Some(replacement) = decompose_glyph_to_string(c) {
1691            out.push_str(&replacement);
1692        } else {
1693            out.push(c);
1694        }
1695    }
1696    out
1697}
1698
1699/// Apply [`decompose_ligatures`] when the feature is enabled; otherwise
1700/// return the input unchanged. Centralizes the gate so callers don't
1701/// repeat the `if LIGATURE_DECOMP` check at every emission site.
1702fn maybe_decompose_decoded(s: &DecodedPdfString) -> String {
1703    if LIGATURE_DECOMP {
1704        if s.ct_origins.iter().any(|origin| *origin) {
1705            decompose_decoded_ligatures(s)
1706        } else {
1707            decompose_ligatures(&s.text)
1708        }
1709    } else {
1710        s.text.clone()
1711    }
1712}
1713
1714/// Extract text blocks from a specific page.
1715pub fn extract_page_blocks(doc: &Document, page_num: u32) -> Vec<TextBlock> {
1716    let pages = doc.get_pages();
1717    let Some(&page_id) = pages.get(&page_num) else {
1718        return Vec::new();
1719    };
1720
1721    let font_map = build_font_map(doc, page_id);
1722    let resources = get_page_resources(doc, page_id).unwrap_or_default();
1723    if let Ok(content_bytes) = get_page_content_bytes(doc, page_id) {
1724        if let Ok(content) = Content::decode(&content_bytes) {
1725            return extract_blocks_from_ops_inner(
1726                &content.operations,
1727                page_num,
1728                &font_map,
1729                Some((doc, &resources)),
1730                0,
1731                None,
1732            );
1733        }
1734    }
1735
1736    Vec::new()
1737}
1738
1739/// Extract text blocks from a page identified by its object ID.
1740///
1741/// Unlike [`extract_page_blocks`], this function does not call `get_pages()`
1742/// internally. Use this when the caller already holds a page map from a
1743/// prior `doc.get_pages()` call to avoid redundant page-tree traversals.
1744pub fn extract_blocks_from_page_id(
1745    doc: &Document,
1746    page_id: lopdf::ObjectId,
1747    page_num: u32,
1748) -> Vec<TextBlock> {
1749    let font_map = build_font_map(doc, page_id);
1750    let resources = get_page_resources(doc, page_id).unwrap_or_default();
1751    if let Ok(content_bytes) = get_page_content_bytes(doc, page_id) {
1752        if let Ok(content) = Content::decode(&content_bytes) {
1753            return extract_blocks_from_ops_inner(
1754                &content.operations,
1755                page_num,
1756                &font_map,
1757                Some((doc, &resources)),
1758                0,
1759                None,
1760            );
1761        }
1762    }
1763    Vec::new()
1764}
1765
1766/// Extract text blocks from all pages of a document.
1767pub fn extract_text(doc: &Document) -> Vec<TextBlock> {
1768    let pages = doc.get_pages();
1769    let mut blocks = Vec::new();
1770
1771    for (&page_num, &page_id) in &pages {
1772        let font_map = build_font_map(doc, page_id);
1773        let resources = get_page_resources(doc, page_id).unwrap_or_default();
1774        if let Ok(content_bytes) = get_page_content_bytes(doc, page_id) {
1775            if let Ok(content) = Content::decode(&content_bytes) {
1776                let page_blocks = extract_blocks_from_ops_inner(
1777                    &content.operations,
1778                    page_num,
1779                    &font_map,
1780                    Some((doc, &resources)),
1781                    0,
1782                    None,
1783                );
1784                blocks.extend(page_blocks);
1785            }
1786        }
1787    }
1788
1789    blocks
1790}
1791
1792/// Extract text from a specific page as a plain string.
1793pub fn extract_page_text(doc: &Document, page_num: u32) -> Result<String> {
1794    let pages = doc.get_pages();
1795    let total = pages.len() as u32;
1796
1797    if page_num == 0 || page_num > total {
1798        return Err(ExtractError::PageOutOfRange(page_num, total));
1799    }
1800
1801    let page_id = *pages
1802        .get(&page_num)
1803        .ok_or(ExtractError::PageOutOfRange(page_num, total))?;
1804
1805    let font_map = build_font_map(doc, page_id);
1806    let resources = get_page_resources(doc, page_id).unwrap_or_default();
1807    let content_bytes = get_page_content_bytes(doc, page_id).unwrap_or_default();
1808    let content = match Content::decode(&content_bytes) {
1809        Ok(c) => c,
1810        Err(_) => return Ok(String::new()),
1811    };
1812
1813    let blocks = extract_blocks_from_ops_inner(
1814        &content.operations,
1815        page_num,
1816        &font_map,
1817        Some((doc, &resources)),
1818        0,
1819        None,
1820    );
1821    let text = blocks
1822        .iter()
1823        .map(|b| b.text.as_str())
1824        .collect::<Vec<_>>()
1825        .join("");
1826
1827    Ok(text)
1828}
1829
1830/// Extract positioned characters from a specific page.
1831pub fn extract_positioned_chars(doc: &Document, page_num: u32) -> Result<Vec<PositionedChar>> {
1832    let pages = doc.get_pages();
1833    let total = pages.len() as u32;
1834
1835    if page_num == 0 || page_num > total {
1836        return Err(ExtractError::PageOutOfRange(page_num, total));
1837    }
1838
1839    let page_id = *pages
1840        .get(&page_num)
1841        .ok_or(ExtractError::PageOutOfRange(page_num, total))?;
1842
1843    let font_map = build_font_map(doc, page_id);
1844    let content_bytes = get_page_content_bytes(doc, page_id).unwrap_or_default();
1845    let content = match Content::decode(&content_bytes) {
1846        Ok(c) => c,
1847        Err(_) => return Ok(Vec::new()),
1848    };
1849
1850    let chars = extract_chars_from_ops(&content.operations, page_num, &font_map);
1851    Ok(chars)
1852}
1853
1854/// Get content stream bytes for a page.
1855fn get_page_content_bytes(doc: &Document, page_id: ObjectId) -> std::result::Result<Vec<u8>, ()> {
1856    doc.get_page_content(page_id).map_err(|_| ())
1857}
1858
/// A single frame of the marked-content stack.
///
/// Marked-content sequences in a PDF content stream may nest
/// (`BDC … BDC … EMC … EMC`): every `BDC`/`BMC` pushes a frame and every
/// `EMC` pops one. For any glyph emitted inside a range, the innermost frame
/// whose `actual_text` is `Some(_)` takes precedence.
#[derive(Clone, Debug, Default)]
struct MarkedContentEntry {
    /// The `/ActualText` value taken from the property dictionary, decoded
    /// as a PDF text string (UTF-16BE when BOM-prefixed, otherwise
    /// PDFDocEncoding/Latin-1). `None` when the frame carries no override.
    actual_text: Option<String>,
}
1869
1870/// Resolve the property dict for a `BDC` operator. The second operand is
1871/// either an inline dictionary or a `/Name` referring to an entry in
1872/// `Resources/Properties`.
1873fn resolve_bdc_properties(
1874    doc_and_resources: Option<(&Document, &lopdf::Dictionary)>,
1875    operand: &Object,
1876) -> Option<lopdf::Dictionary> {
1877    match operand {
1878        Object::Dictionary(d) => Some(d.clone()),
1879        Object::Reference(r) => doc_and_resources.and_then(|(doc, _)| {
1880            doc.get_object(*r).ok().and_then(|o| match o {
1881                Object::Dictionary(d) => Some(d.clone()),
1882                _ => None,
1883            })
1884        }),
1885        Object::Name(n) => {
1886            let (doc, resources) = doc_and_resources?;
1887            let props = resolve_dict(doc, resources, b"Properties")?;
1888            match props.get(n.as_slice()).ok()? {
1889                Object::Dictionary(d) => Some(d.clone()),
1890                Object::Reference(r) => match doc.get_object(*r).ok()? {
1891                    Object::Dictionary(d) => Some(d.clone()),
1892                    _ => None,
1893                },
1894                _ => None,
1895            }
1896        }
1897        _ => None,
1898    }
1899}
1900
1901/// Decode a PDF text string (BOM-detected UTF-16BE / UTF-8, otherwise
1902/// PDFDocEncoding-ish Latin-1). Used for the `/ActualText` value, which is
1903/// stored as a regular text string — not as a glyph-coded string.
1904fn decode_pdf_text_string(bytes: &[u8]) -> String {
1905    if bytes.len() >= 2 && bytes[0] == 0xFE && bytes[1] == 0xFF {
1906        let u16s: Vec<u16> = bytes[2..]
1907            .chunks_exact(2)
1908            .map(|c| u16::from_be_bytes([c[0], c[1]]))
1909            .collect();
1910        return String::from_utf16_lossy(&u16s);
1911    }
1912    if bytes.len() >= 3 && bytes[0] == 0xEF && bytes[1] == 0xBB && bytes[2] == 0xBF {
1913        return String::from_utf8_lossy(&bytes[3..]).into_owned();
1914    }
1915    bytes
1916        .iter()
1917        .filter_map(|&b| {
1918            let ch = b as char;
1919            if is_printable_or_space(ch) {
1920                Some(ch)
1921            } else {
1922                None
1923            }
1924        })
1925        .collect()
1926}
1927
1928/// Extract `/ActualText` from a BDC property dict, if present.
1929fn extract_actual_text(props: &lopdf::Dictionary) -> Option<String> {
1930    let obj = props.get(b"ActualText").ok()?;
1931    match obj {
1932        Object::String(bytes, _) => Some(decode_pdf_text_string(bytes)),
1933        _ => None,
1934    }
1935}
1936
1937/// Innermost `actual_text` from the stack, falling back to an inherited
1938/// value supplied by the caller. The inherited value carries the parent
1939/// invocation's top-of-stack `/ActualText` across `Do` recursion into a
1940/// Form XObject, so text emitted inside the XObject's `BT`...`ET` blocks
1941/// is tagged with the surrounding `BDC`/`EMC` pair's `/ActualText`.
1942fn current_actual_text(stack: &[MarkedContentEntry], inherited: Option<&str>) -> Option<String> {
1943    stack
1944        .iter()
1945        .rev()
1946        .find_map(|e| e.actual_text.clone())
1947        .or_else(|| inherited.map(|s| s.to_string()))
1948}
1949
1950/// Extract text blocks from a list of operations, handling Form XObject
1951/// recursion via `Do`. `inherited_actual_text` carries the parent's active
1952/// `/ActualText` binding into the recursion so a `BDC` ... `Do` ... `EMC`
1953/// sequence keeps tagging text emitted inside the invoked Form XObject.
1954/// Top-level callers pass `None`.
1955fn extract_blocks_from_ops_inner(
1956    ops: &[Operation],
1957    page: u32,
1958    font_map: &HashMap<String, FontInfo>,
1959    doc_and_resources: Option<(&Document, &lopdf::Dictionary)>,
1960    depth: u32,
1961    inherited_actual_text: Option<&str>,
1962) -> Vec<TextBlock> {
1963    let mut state = TextState::default();
1964    let mut blocks = Vec::new();
1965    let mut mc_stack: Vec<MarkedContentEntry> = Vec::new();
1966
1967    for op in ops {
1968        match op.operator.as_str() {
1969            "q" => {
1970                state.gs_stack.push(GraphicsState {
1971                    ctm: state.ctm,
1972                    fill_color: state.fill_color,
1973                });
1974            }
1975            "Q" => {
1976                if let Some(gs) = state.gs_stack.pop() {
1977                    state.ctm = gs.ctm;
1978                    state.fill_color = gs.fill_color;
1979                }
1980            }
1981            "cm" => {
1982                if let Some(m) = extract_matrix(&op.operands) {
1983                    state.ctm = multiply_matrix(&state.ctm, &m);
1984                }
1985            }
1986            // G1: non-stroking fill color setters. PDF §8.6.5: `rg` r g b;
1987            // `g` gray; `k` c m y k. Stroking variants (RG/G/K) don't
1988            // affect text fill so we accept and ignore them. `sc`/`scn` are
1989            // honored when operand count makes the device space unambiguous
1990            // (1 = Gray, 3 = RGB, 4 = CMYK). `cs`/`CS` mark color unknown
1991            // because ICC/Lab/Pattern can't be converted from operands alone.
1992            "rg" if op.operands.len() >= 3 => {
1993                if let (Some(r), Some(g), Some(b)) = (
1994                    op.operands.first().and_then(as_number),
1995                    op.operands.get(1).and_then(as_number),
1996                    op.operands.get(2).and_then(as_number),
1997                ) {
1998                    state.fill_color = Some([
1999                        clamp_unit_to_u8(r),
2000                        clamp_unit_to_u8(g),
2001                        clamp_unit_to_u8(b),
2002                        255,
2003                    ]);
2004                }
2005            }
2006            "g" => {
2007                if let Some(v) = op.operands.first().and_then(as_number) {
2008                    let byte = clamp_unit_to_u8(v);
2009                    state.fill_color = Some([byte, byte, byte, 255]);
2010                }
2011            }
2012            "k" if op.operands.len() >= 4 => {
2013                if let (Some(c), Some(m), Some(y), Some(kk)) = (
2014                    op.operands.first().and_then(as_number),
2015                    op.operands.get(1).and_then(as_number),
2016                    op.operands.get(2).and_then(as_number),
2017                    op.operands.get(3).and_then(as_number),
2018                ) {
2019                    state.fill_color = Some(cmyk_to_rgba(c, m, y, kk));
2020                }
2021            }
2022            "RG" | "G" | "K" => {}
2023            "sc" | "scn" => {
2024                state.fill_color = match op.operands.len() {
2025                    1 => op.operands.first().and_then(as_number).map(|v| {
2026                        let b = clamp_unit_to_u8(v);
2027                        [b, b, b, 255]
2028                    }),
2029                    3 => {
2030                        let r = op.operands.first().and_then(as_number);
2031                        let g = op.operands.get(1).and_then(as_number);
2032                        let b = op.operands.get(2).and_then(as_number);
2033                        match (r, g, b) {
2034                            (Some(r), Some(g), Some(b)) => Some([
2035                                clamp_unit_to_u8(r),
2036                                clamp_unit_to_u8(g),
2037                                clamp_unit_to_u8(b),
2038                                255,
2039                            ]),
2040                            _ => None,
2041                        }
2042                    }
2043                    4 => {
2044                        let c = op.operands.first().and_then(as_number);
2045                        let m = op.operands.get(1).and_then(as_number);
2046                        let y = op.operands.get(2).and_then(as_number);
2047                        let kk = op.operands.get(3).and_then(as_number);
2048                        match (c, m, y, kk) {
2049                            (Some(c), Some(m), Some(y), Some(kk)) => {
2050                                Some(cmyk_to_rgba(c, m, y, kk))
2051                            }
2052                            _ => None,
2053                        }
2054                    }
2055                    _ => None,
2056                };
2057            }
2058            "cs" | "CS" => {
2059                state.fill_color = None;
2060            }
2061            "BT" => {
2062                state.tm = [1.0, 0.0, 0.0, 1.0, 0.0, 0.0];
2063                state.tlm = [1.0, 0.0, 0.0, 1.0, 0.0, 0.0];
2064            }
2065            "Tf" if op.operands.len() >= 2 => {
2066                if let Object::Name(ref name) = op.operands[0] {
2067                    state.font_name = String::from_utf8_lossy(name).to_string();
2068                }
2069                if let Some(size) = as_number(&op.operands[1]) {
2070                    state.font_size = size;
2071                }
2072            }
2073            "Tc" => {
2074                if let Some(v) = op.operands.first().and_then(as_number) {
2075                    state.tc = v;
2076                }
2077            }
2078            "Tw" => {
2079                if let Some(v) = op.operands.first().and_then(as_number) {
2080                    state.tw = v;
2081                }
2082            }
2083            "Tz" => {
2084                if let Some(v) = op.operands.first().and_then(as_number) {
2085                    state.th = v;
2086                }
2087            }
2088            "TL" => {
2089                if let Some(v) = op.operands.first().and_then(as_number) {
2090                    state.tl = v;
2091                }
2092            }
2093            "Ts" => {
2094                if let Some(v) = op.operands.first().and_then(as_number) {
2095                    state.ts = v;
2096                }
2097            }
2098            "Td" if op.operands.len() >= 2 => {
2099                let tx = as_number(&op.operands[0]).unwrap_or(0.0);
2100                let ty = as_number(&op.operands[1]).unwrap_or(0.0);
2101                let new_tlm = multiply_matrix(&state.tlm, &[1.0, 0.0, 0.0, 1.0, tx, ty]);
2102                state.tlm = new_tlm;
2103                state.tm = new_tlm;
2104            }
2105            "TD" if op.operands.len() >= 2 => {
2106                let tx = as_number(&op.operands[0]).unwrap_or(0.0);
2107                let ty = as_number(&op.operands[1]).unwrap_or(0.0);
2108                state.tl = -ty;
2109                let new_tlm = multiply_matrix(&state.tlm, &[1.0, 0.0, 0.0, 1.0, tx, ty]);
2110                state.tlm = new_tlm;
2111                state.tm = new_tlm;
2112            }
2113            "Tm" => {
2114                if let Some(m) = extract_matrix(&op.operands) {
2115                    state.tm = m;
2116                    state.tlm = m;
2117                }
2118            }
2119            "T*" => {
2120                let new_tlm = multiply_matrix(&state.tlm, &[1.0, 0.0, 0.0, 1.0, 0.0, -state.tl]);
2121                state.tlm = new_tlm;
2122                state.tm = new_tlm;
2123            }
2124            "Tj" => {
2125                let fi = font_map.get(&state.font_name);
2126                if let Some(text) = extract_decoded_string_operand_with_font(&op.operands, fi) {
2127                    let raw = op.operands.iter().find_map(|o| {
2128                        if let Object::String(b, _) = o {
2129                            Some(b.as_slice())
2130                        } else {
2131                            None
2132                        }
2133                    });
2134
2135                    // G2: compute char_bounds + width_source from raw bytes.
2136                    let x = state.tm[4];
2137                    let y = state.tm[5];
2138                    let gctx = GlyphCtx {
2139                        font_size: state.font_size,
2140                        tc: state.tc,
2141                        tw: state.tw,
2142                        th: state.th,
2143                    };
2144                    let (char_bounds, width_source, total_adv) = match (raw, fi) {
2145                        (Some(raw), Some(fi)) => {
2146                            let (cb, ws, adv) = char_bounds_from_bytes(raw, fi, x, y, &gctx);
2147                            (cb, ws, adv)
2148                        }
2149                        _ => {
2150                            let cw = state.font_size * APPROX_CHAR_WIDTH * (state.th / 100.0);
2151                            let n = text.glyph_count();
2152                            let cb = (0..n)
2153                                .map(|i| {
2154                                    let cx = x + i as f64 * cw;
2155                                    [cx, y, cx + cw, y + state.font_size]
2156                                })
2157                                .collect();
2158                            (cb, WidthSource::Estimate, n as f64 * cw)
2159                        }
2160                    };
2161
2162                    // Skip empty text blocks — empty blocks cause phantom table
2163                    // detection with all-empty cells (#651).
2164                    if !text.is_empty() {
2165                        let display_text = maybe_decompose_decoded(&text);
2166                        blocks.push(TextBlock {
2167                            text: display_text,
2168                            page,
2169                            bbox: [x, y, x + total_adv, y + state.font_size],
2170                            font_name: state.font_name.clone(),
2171                            font_size: state.font_size,
2172                            actual_text: current_actual_text(&mc_stack, inherited_actual_text),
2173                            base_font: font_map
2174                                .get(&state.font_name)
2175                                .and_then(|fi| fi.base_font.clone()),
2176                            is_bold: font_map.get(&state.font_name).is_some_and(|fi| fi.is_bold),
2177                            is_italic: font_map
2178                                .get(&state.font_name)
2179                                .is_some_and(|fi| fi.is_italic),
2180                            color: state.fill_color,
2181                            width_source,
2182                            char_bounds,
2183                        });
2184                    }
2185
2186                    state.tm[4] = x + total_adv;
2187                }
2188            }
2189            "TJ" => {
2190                if let Some(Object::Array(ref arr)) = op.operands.first() {
2191                    let x_start = state.tm[4];
2192                    let y = state.tm[5];
2193                    let fi = font_map.get(&state.font_name);
2194
2195                    // G2: compute char_bounds + width_source from the raw TJ array.
2196                    let gctx = GlyphCtx {
2197                        font_size: state.font_size,
2198                        tc: state.tc,
2199                        tw: state.tw,
2200                        th: state.th,
2201                    };
2202                    let (char_bounds, width_source, x_end) = match fi {
2203                        Some(fi) => char_bounds_from_tj_array(arr, fi, x_start, y, &gctx),
2204                        None => {
2205                            let cw = state.font_size * APPROX_CHAR_WIDTH * (state.th / 100.0);
2206                            // Estimate: count glyphs from string items only.
2207                            let mut cx = x_start;
2208                            let mut cb = Vec::new();
2209                            for item in arr {
2210                                if let Object::String(bytes, _) = item {
2211                                    let n = if fi.map(|f| f.is_cid).unwrap_or(false) {
2212                                        bytes.len() / 2
2213                                    } else {
2214                                        bytes.len()
2215                                    };
2216                                    for _ in 0..n {
2217                                        cb.push([cx, y, cx + cw, y + state.font_size]);
2218                                        cx += cw + state.tc;
2219                                    }
2220                                } else if let Some(adj) = as_number(item) {
2221                                    cx -= adj / 1000.0 * state.font_size;
2222                                }
2223                            }
2224                            (cb, WidthSource::Estimate, cx)
2225                        }
2226                    };
2227
2228                    // Decode combined text for the block.
2229                    let mut combined_text = DecodedPdfString::default();
2230                    for item in arr {
2231                        if let Object::String(bytes, _) = item {
2232                            combined_text.extend(decode_pdf_string_with_font_marked(bytes, fi));
2233                        }
2234                    }
2235
2236                    state.tm[4] = x_end;
2237
2238                    if !combined_text.is_empty() {
2239                        blocks.push(TextBlock {
2240                            text: maybe_decompose_decoded(&combined_text),
2241                            page,
2242                            bbox: [x_start, y, x_end, y + state.font_size],
2243                            font_name: state.font_name.clone(),
2244                            font_size: state.font_size,
2245                            actual_text: current_actual_text(&mc_stack, inherited_actual_text),
2246                            base_font: font_map
2247                                .get(&state.font_name)
2248                                .and_then(|fi| fi.base_font.clone()),
2249                            is_bold: font_map.get(&state.font_name).is_some_and(|fi| fi.is_bold),
2250                            is_italic: font_map
2251                                .get(&state.font_name)
2252                                .is_some_and(|fi| fi.is_italic),
2253                            color: state.fill_color,
2254                            width_source,
2255                            char_bounds,
2256                        });
2257                    }
2258                }
2259            }
2260            "'" => {
2261                // Move to next line and show text.
2262                let new_tlm = multiply_matrix(&state.tlm, &[1.0, 0.0, 0.0, 1.0, 0.0, -state.tl]);
2263                state.tlm = new_tlm;
2264                state.tm = new_tlm;
2265
2266                let fi = font_map.get(&state.font_name);
2267                if let Some(text) = extract_decoded_string_operand_with_font(&op.operands, fi) {
2268                    let raw = op.operands.iter().find_map(|o| {
2269                        if let Object::String(b, _) = o {
2270                            Some(b.as_slice())
2271                        } else {
2272                            None
2273                        }
2274                    });
2275                    let x = state.tm[4];
2276                    let y = state.tm[5];
2277                    let gctx = GlyphCtx {
2278                        font_size: state.font_size,
2279                        tc: state.tc,
2280                        tw: state.tw,
2281                        th: state.th,
2282                    };
2283                    let (char_bounds, width_source, total_adv) = match (raw, fi) {
2284                        (Some(raw), Some(fi)) => {
2285                            let (cb, ws, adv) = char_bounds_from_bytes(raw, fi, x, y, &gctx);
2286                            (cb, ws, adv)
2287                        }
2288                        _ => {
2289                            let cw = state.font_size * APPROX_CHAR_WIDTH * (state.th / 100.0);
2290                            let n = text.glyph_count();
2291                            let cb = (0..n)
2292                                .map(|i| {
2293                                    let cx = x + i as f64 * cw;
2294                                    [cx, y, cx + cw, y + state.font_size]
2295                                })
2296                                .collect();
2297                            (cb, WidthSource::Estimate, n as f64 * cw)
2298                        }
2299                    };
2300
2301                    if !text.is_empty() {
2302                        let display_text = maybe_decompose_decoded(&text);
2303                        blocks.push(TextBlock {
2304                            text: display_text,
2305                            page,
2306                            bbox: [x, y, x + total_adv, y + state.font_size],
2307                            font_name: state.font_name.clone(),
2308                            font_size: state.font_size,
2309                            actual_text: current_actual_text(&mc_stack, inherited_actual_text),
2310                            base_font: font_map
2311                                .get(&state.font_name)
2312                                .and_then(|fi| fi.base_font.clone()),
2313                            is_bold: font_map.get(&state.font_name).is_some_and(|fi| fi.is_bold),
2314                            is_italic: font_map
2315                                .get(&state.font_name)
2316                                .is_some_and(|fi| fi.is_italic),
2317                            color: state.fill_color,
2318                            width_source,
2319                            char_bounds,
2320                        });
2321                    }
2322
2323                    state.tm[4] = x + total_adv;
2324                }
2325            }
2326            "\"" if op.operands.len() >= 3 => {
2327                // Set word/char spacing, move to next line, show text.
2328                if let Some(tw) = as_number(&op.operands[0]) {
2329                    state.tw = tw;
2330                }
2331                if let Some(tc) = as_number(&op.operands[1]) {
2332                    state.tc = tc;
2333                }
2334
2335                let new_tlm = multiply_matrix(&state.tlm, &[1.0, 0.0, 0.0, 1.0, 0.0, -state.tl]);
2336                state.tlm = new_tlm;
2337                state.tm = new_tlm;
2338
2339                let fi = font_map.get(&state.font_name);
2340                if let Some(text) = extract_decoded_string_operand_with_font(&op.operands[2..], fi)
2341                {
2342                    let raw = op.operands[2..].iter().find_map(|o| {
2343                        if let Object::String(b, _) = o {
2344                            Some(b.as_slice())
2345                        } else {
2346                            None
2347                        }
2348                    });
2349                    let x = state.tm[4];
2350                    let y = state.tm[5];
2351                    let gctx = GlyphCtx {
2352                        font_size: state.font_size,
2353                        tc: state.tc,
2354                        tw: state.tw,
2355                        th: state.th,
2356                    };
2357                    let (char_bounds, width_source, total_adv) = match (raw, fi) {
2358                        (Some(raw), Some(fi)) => {
2359                            let (cb, ws, adv) = char_bounds_from_bytes(raw, fi, x, y, &gctx);
2360                            (cb, ws, adv)
2361                        }
2362                        _ => {
2363                            let cw = state.font_size * APPROX_CHAR_WIDTH * (state.th / 100.0);
2364                            let n = text.glyph_count();
2365                            let cb = (0..n)
2366                                .map(|i| {
2367                                    let cx = x + i as f64 * cw;
2368                                    [cx, y, cx + cw, y + state.font_size]
2369                                })
2370                                .collect();
2371                            (cb, WidthSource::Estimate, n as f64 * cw)
2372                        }
2373                    };
2374
2375                    if !text.is_empty() {
2376                        let display_text = maybe_decompose_decoded(&text);
2377                        blocks.push(TextBlock {
2378                            text: display_text,
2379                            page,
2380                            bbox: [x, y, x + total_adv, y + state.font_size],
2381                            font_name: state.font_name.clone(),
2382                            font_size: state.font_size,
2383                            actual_text: current_actual_text(&mc_stack, inherited_actual_text),
2384                            base_font: font_map
2385                                .get(&state.font_name)
2386                                .and_then(|fi| fi.base_font.clone()),
2387                            is_bold: font_map.get(&state.font_name).is_some_and(|fi| fi.is_bold),
2388                            is_italic: font_map
2389                                .get(&state.font_name)
2390                                .is_some_and(|fi| fi.is_italic),
2391                            color: state.fill_color,
2392                            width_source,
2393                            char_bounds,
2394                        });
2395                    }
2396
2397                    state.tm[4] = x + total_adv;
2398                }
2399            }
2400            "BMC" => {
2401                // Begin marked-content (no property list). Push an empty entry
2402                // so EMC pops the right level — without this, nested BMC inside
2403                // a BDC/ActualText range would pop the BDC entry early.
2404                mc_stack.push(MarkedContentEntry::default());
2405            }
2406            "BDC" => {
2407                // Begin marked-content with property list. Operand 1 is the
2408                // tag (Name); operand 2 is either an inline dict or a Name
2409                // resolved via Resources/Properties.
2410                let actual_text = op
2411                    .operands
2412                    .get(1)
2413                    .and_then(|o| resolve_bdc_properties(doc_and_resources, o))
2414                    .as_ref()
2415                    .and_then(extract_actual_text);
2416                mc_stack.push(MarkedContentEntry { actual_text });
2417            }
2418            "EMC" => {
2419                // End marked-content. Use pop (not unwrap) so a malformed
2420                // stream with an unmatched EMC doesn't panic.
2421                mc_stack.pop();
2422            }
2423            "Do" if depth < 5 => {
2424                // Invoke Form XObject — recurse into its content stream.
2425                // Propagate the active /ActualText binding (top-of-stack, or
2426                // whatever this invocation itself inherited) so text inside
2427                // the XObject is tagged with the surrounding BDC/EMC pair.
2428                if let Some((doc, resources)) = doc_and_resources {
2429                    if let Some(Object::Name(ref xobj_name)) = op.operands.first() {
2430                        let xobj_name_str = String::from_utf8_lossy(xobj_name);
2431                        let inherited_for_child =
2432                            current_actual_text(&mc_stack, inherited_actual_text);
2433                        if let Some(xobj_blocks) = extract_form_xobject_text(
2434                            doc,
2435                            resources,
2436                            &xobj_name_str,
2437                            page,
2438                            font_map,
2439                            depth,
2440                            inherited_for_child.as_deref(),
2441                        ) {
2442                            blocks.extend(xobj_blocks);
2443                        }
2444                    }
2445                }
2446            }
2447            _ => {}
2448        }
2449    }
2450
2451    blocks
2452}
2453
2454/// Extract text from a Form XObject referenced by name in the page's
2455/// Resources/XObject dict. `inherited_actual_text` is the active
2456/// `/ActualText` binding from the calling context (parent stack +
2457/// whatever the parent itself inherited), so text emitted inside this
2458/// XObject's `BT`...`ET` blocks is tagged correctly when the `Do`
2459/// operator sits between a `BDC` and matching `EMC`.
2460fn extract_form_xobject_text(
2461    doc: &Document,
2462    resources: &lopdf::Dictionary,
2463    name: &str,
2464    page: u32,
2465    font_map: &HashMap<String, FontInfo>,
2466    depth: u32,
2467    inherited_actual_text: Option<&str>,
2468) -> Option<Vec<TextBlock>> {
2469    // Look up the XObject in the Resources dictionary.
2470    let xobj_dict = match resources.get(b"XObject").ok()? {
2471        Object::Dictionary(d) => d.clone(),
2472        Object::Reference(r) => match doc.get_object(*r).ok()? {
2473            Object::Dictionary(d) => d.clone(),
2474            _ => return None,
2475        },
2476        _ => return None,
2477    };
2478
2479    let xobj_ref = match xobj_dict.get(name.as_bytes()).ok()? {
2480        Object::Reference(r) => *r,
2481        _ => return None,
2482    };
2483
2484    let stream = match doc.get_object(xobj_ref).ok()? {
2485        Object::Stream(s) => s.clone(),
2486        _ => return None,
2487    };
2488
2489    // Verify it's a Form XObject (not Image).
2490    let subtype = stream
2491        .dict
2492        .get(b"Subtype")
2493        .ok()
2494        .and_then(|o| match o {
2495            Object::Name(n) => Some(String::from_utf8_lossy(n).to_string()),
2496            _ => None,
2497        })
2498        .unwrap_or_default();
2499    if subtype != "Form" {
2500        return None;
2501    }
2502
2503    // Decode the content stream.
2504    let content_bytes = stream
2505        .decompressed_content()
2506        .ok()
2507        .unwrap_or_else(|| stream.content.clone());
2508    let content = Content::decode(&content_bytes).ok()?;
2509
2510    // Build font map: XObject's own fonts take priority over inherited page fonts.
2511    // A Form XObject may reuse font names (e.g. /F1) with different encodings.
2512    let mut xobj_font_map = font_map.clone();
2513    if let Some(xobj_resources) = resolve_dict(doc, &stream.dict, b"Resources") {
2514        if let Some(xobj_fonts) = resolve_dict(doc, &xobj_resources, b"Font") {
2515            for (name_bytes, value) in xobj_fonts.iter() {
2516                let fname = String::from_utf8_lossy(name_bytes).to_string();
2517                if let Some(fi) = build_font_info_from_value(doc, value) {
2518                    xobj_font_map.insert(fname, fi);
2519                }
2520            }
2521        }
2522    }
2523
2524    // Use the XObject's own Resources dict for recursive Do lookups.
2525    let xobj_resources =
2526        resolve_dict(doc, &stream.dict, b"Resources").unwrap_or_else(|| resources.clone());
2527
2528    Some(extract_blocks_from_ops_inner(
2529        &content.operations,
2530        page,
2531        &xobj_font_map,
2532        Some((doc, &xobj_resources)),
2533        depth + 1,
2534        inherited_actual_text,
2535    ))
2536}
2537
2538/// Build a FontInfo from a font dictionary value (used for XObject font resolution).
2539fn build_font_info_from_value(doc: &Document, value: &Object) -> Option<FontInfo> {
2540    let font = match value {
2541        Object::Reference(r) => match doc.get_object(*r).ok()? {
2542            Object::Dictionary(d) => d.clone(),
2543            _ => return None,
2544        },
2545        Object::Dictionary(d) => d.clone(),
2546        _ => return None,
2547    };
2548
2549    let subtype = font
2550        .get(b"Subtype")
2551        .ok()
2552        .and_then(|o| match o {
2553            Object::Name(n) => Some(String::from_utf8_lossy(n).to_string()),
2554            _ => None,
2555        })
2556        .unwrap_or_default();
2557
2558    let is_cid = subtype == "Type0";
2559    let mut to_unicode = parse_to_unicode_from_font(doc, &font);
2560
2561    if to_unicode.is_empty() && is_cid {
2562        if let Ok(Object::Array(descendants)) = font.get(b"DescendantFonts") {
2563            for d in descendants {
2564                let desc_dict = match d {
2565                    Object::Reference(r) => {
2566                        // Don't use `?` here — a bad reference in one descendant
2567                        // should not abort the entire font info construction.
2568                        let Some(Object::Dictionary(d)) = doc.get_object(*r).ok() else {
2569                            continue;
2570                        };
2571                        d
2572                    }
2573                    Object::Dictionary(d) => d,
2574                    _ => continue,
2575                };
2576                let tu = parse_to_unicode_from_font(doc, desc_dict);
2577                if !tu.is_empty() {
2578                    to_unicode = tu;
2579                    break;
2580                }
2581            }
2582        }
2583    }
2584
2585    let (encoding_map, ct_codes) = if !is_cid {
2586        build_encoding_map(doc, &font)
2587    } else {
2588        ([None; 256], [false; 256])
2589    };
2590
2591    let (base_font, is_bold, is_italic) = derive_font_style(doc, &font);
2592    let (simple_widths, cid_default_width, cid_widths) = parse_font_widths(doc, &font, is_cid);
2593
2594    Some(FontInfo {
2595        is_cid,
2596        to_unicode,
2597        encoding_map,
2598        ct_codes,
2599        base_font,
2600        is_bold,
2601        is_italic,
2602        simple_widths,
2603        cid_default_width,
2604        cid_widths,
2605    })
2606}
2607
2608/// Emit one or more `PositionedChar` for a single glyph, decomposing
2609/// ligatures by splitting the glyph's `char_w` advance evenly across the
2610/// constituent characters. The text matrix advances exactly once (per
2611/// glyph), preserving the original layout — only the per-character bbox is
2612/// subdivided.
2613///
2614/// Uses the unified [`decompose_glyph_to_string`] helper so that the
2615/// per-glyph decomposition matches the string-level
2616/// [`decompose_ligatures`] used by `extract_page_text` / `TextBlock`.
2617/// Without this alignment, presentation-form codepoints outside the
2618/// hardcoded Latin set (e.g. Armenian U+FB13) would expand in extracted
2619/// text but stay as a single PositionedChar, leaving consumers that pair
2620/// text offsets with bounding boxes misaligned.
2621fn push_glyph_positioned(
2622    chars: &mut Vec<PositionedChar>,
2623    state: &mut TextState,
2624    page: u32,
2625    glyph: char,
2626    is_ct_origin: bool,
2627    char_w: f64,
2628) {
2629    let (gx, gy) = apply_ctm(state);
2630    let constituents: Option<String> = if LIGATURE_DECOMP {
2631        // Try ct-origin marker first (preserves U+E007 → "ct" only when it
2632        // came from a /ct glyph-name path), then fall back to the unified
2633        // decomposition helper which covers the Latin set + FB00–FB4F NFKD.
2634        if let Some(s) = decompose_ligature_char_with_origin(glyph, is_ct_origin) {
2635            Some(s.to_string())
2636        } else {
2637            decompose_glyph_to_string(glyph)
2638        }
2639    } else {
2640        None
2641    };
2642    match constituents {
2643        None => {
2644            chars.push(PositionedChar {
2645                ch: glyph,
2646                page,
2647                bbox: [gx, gy, gx + char_w, gy + state.font_size],
2648            });
2649        }
2650        Some(s) => {
2651            let n = s.chars().count() as f64;
2652            let part_w = char_w / n;
2653            for (i, c) in s.chars().enumerate() {
2654                let x = gx + part_w * i as f64;
2655                chars.push(PositionedChar {
2656                    ch: c,
2657                    page,
2658                    bbox: [x, gy, x + part_w, gy + state.font_size],
2659                });
2660            }
2661        }
2662    }
2663    state.tm[4] += char_w + state.tc;
2664}
2665
2666/// Extract positioned characters from operations.
2667fn extract_chars_from_ops(
2668    ops: &[Operation],
2669    page: u32,
2670    font_map: &HashMap<String, FontInfo>,
2671) -> Vec<PositionedChar> {
2672    let mut state = TextState::default();
2673    let mut chars = Vec::new();
2674
2675    for op in ops {
2676        match op.operator.as_str() {
2677            "q" => {
2678                state.gs_stack.push(GraphicsState {
2679                    ctm: state.ctm,
2680                    fill_color: state.fill_color,
2681                });
2682            }
2683            "Q" => {
2684                if let Some(gs) = state.gs_stack.pop() {
2685                    state.ctm = gs.ctm;
2686                    state.fill_color = gs.fill_color;
2687                }
2688            }
2689            "cm" => {
2690                if let Some(m) = extract_matrix(&op.operands) {
2691                    state.ctm = multiply_matrix(&state.ctm, &m);
2692                }
2693            }
2694            // G1: non-stroking fill color setters. PDF §8.6.5: `rg` r g b;
2695            // `g` gray; `k` c m y k. Stroking variants (RG/G/K) don't
2696            // affect text fill so we accept and ignore them. `sc`/`scn` are
2697            // honored when operand count makes the device space unambiguous
2698            // (1 = Gray, 3 = RGB, 4 = CMYK). `cs`/`CS` mark color unknown
2699            // because ICC/Lab/Pattern can't be converted from operands alone.
2700            "rg" if op.operands.len() >= 3 => {
2701                if let (Some(r), Some(g), Some(b)) = (
2702                    op.operands.first().and_then(as_number),
2703                    op.operands.get(1).and_then(as_number),
2704                    op.operands.get(2).and_then(as_number),
2705                ) {
2706                    state.fill_color = Some([
2707                        clamp_unit_to_u8(r),
2708                        clamp_unit_to_u8(g),
2709                        clamp_unit_to_u8(b),
2710                        255,
2711                    ]);
2712                }
2713            }
2714            "g" => {
2715                if let Some(v) = op.operands.first().and_then(as_number) {
2716                    let byte = clamp_unit_to_u8(v);
2717                    state.fill_color = Some([byte, byte, byte, 255]);
2718                }
2719            }
2720            "k" if op.operands.len() >= 4 => {
2721                if let (Some(c), Some(m), Some(y), Some(kk)) = (
2722                    op.operands.first().and_then(as_number),
2723                    op.operands.get(1).and_then(as_number),
2724                    op.operands.get(2).and_then(as_number),
2725                    op.operands.get(3).and_then(as_number),
2726                ) {
2727                    state.fill_color = Some(cmyk_to_rgba(c, m, y, kk));
2728                }
2729            }
2730            "RG" | "G" | "K" => {}
2731            "sc" | "scn" => {
2732                state.fill_color = match op.operands.len() {
2733                    1 => op.operands.first().and_then(as_number).map(|v| {
2734                        let b = clamp_unit_to_u8(v);
2735                        [b, b, b, 255]
2736                    }),
2737                    3 => {
2738                        let r = op.operands.first().and_then(as_number);
2739                        let g = op.operands.get(1).and_then(as_number);
2740                        let b = op.operands.get(2).and_then(as_number);
2741                        match (r, g, b) {
2742                            (Some(r), Some(g), Some(b)) => Some([
2743                                clamp_unit_to_u8(r),
2744                                clamp_unit_to_u8(g),
2745                                clamp_unit_to_u8(b),
2746                                255,
2747                            ]),
2748                            _ => None,
2749                        }
2750                    }
2751                    4 => {
2752                        let c = op.operands.first().and_then(as_number);
2753                        let m = op.operands.get(1).and_then(as_number);
2754                        let y = op.operands.get(2).and_then(as_number);
2755                        let kk = op.operands.get(3).and_then(as_number);
2756                        match (c, m, y, kk) {
2757                            (Some(c), Some(m), Some(y), Some(kk)) => {
2758                                Some(cmyk_to_rgba(c, m, y, kk))
2759                            }
2760                            _ => None,
2761                        }
2762                    }
2763                    _ => None,
2764                };
2765            }
2766            "cs" | "CS" => {
2767                state.fill_color = None;
2768            }
2769            "BT" => {
2770                state.tm = [1.0, 0.0, 0.0, 1.0, 0.0, 0.0];
2771                state.tlm = [1.0, 0.0, 0.0, 1.0, 0.0, 0.0];
2772            }
2773            "Tf" if op.operands.len() >= 2 => {
2774                if let Object::Name(ref name) = op.operands[0] {
2775                    state.font_name = String::from_utf8_lossy(name).to_string();
2776                }
2777                if let Some(size) = as_number(&op.operands[1]) {
2778                    state.font_size = size;
2779                }
2780            }
2781            "Tc" => {
2782                if let Some(v) = op.operands.first().and_then(as_number) {
2783                    state.tc = v;
2784                }
2785            }
2786            "Tw" => {
2787                if let Some(v) = op.operands.first().and_then(as_number) {
2788                    state.tw = v;
2789                }
2790            }
2791            "Tz" => {
2792                if let Some(v) = op.operands.first().and_then(as_number) {
2793                    state.th = v;
2794                }
2795            }
2796            "TL" => {
2797                if let Some(v) = op.operands.first().and_then(as_number) {
2798                    state.tl = v;
2799                }
2800            }
2801            "Ts" => {
2802                if let Some(v) = op.operands.first().and_then(as_number) {
2803                    state.ts = v;
2804                }
2805            }
2806            "Td" if op.operands.len() >= 2 => {
2807                let tx = as_number(&op.operands[0]).unwrap_or(0.0);
2808                let ty = as_number(&op.operands[1]).unwrap_or(0.0);
2809                let new_tlm = multiply_matrix(&state.tlm, &[1.0, 0.0, 0.0, 1.0, tx, ty]);
2810                state.tlm = new_tlm;
2811                state.tm = new_tlm;
2812            }
2813            "TD" if op.operands.len() >= 2 => {
2814                let tx = as_number(&op.operands[0]).unwrap_or(0.0);
2815                let ty = as_number(&op.operands[1]).unwrap_or(0.0);
2816                state.tl = -ty;
2817                let new_tlm = multiply_matrix(&state.tlm, &[1.0, 0.0, 0.0, 1.0, tx, ty]);
2818                state.tlm = new_tlm;
2819                state.tm = new_tlm;
2820            }
2821            "Tm" => {
2822                if let Some(m) = extract_matrix(&op.operands) {
2823                    state.tm = m;
2824                    state.tlm = m;
2825                }
2826            }
2827            "T*" => {
2828                let new_tlm = multiply_matrix(&state.tlm, &[1.0, 0.0, 0.0, 1.0, 0.0, -state.tl]);
2829                state.tlm = new_tlm;
2830                state.tm = new_tlm;
2831            }
2832            "Tj" => {
2833                let fi = font_map.get(&state.font_name);
2834                if let Some(text) = extract_decoded_string_operand_with_font(&op.operands, fi) {
2835                    let char_w = state.font_size * APPROX_CHAR_WIDTH * (state.th / 100.0);
2836                    for (ch, is_ct_origin) in text.iter() {
2837                        push_glyph_positioned(
2838                            &mut chars,
2839                            &mut state,
2840                            page,
2841                            ch,
2842                            is_ct_origin,
2843                            char_w,
2844                        );
2845                    }
2846                }
2847            }
2848            "TJ" => {
2849                if let Some(Object::Array(ref arr)) = op.operands.first() {
2850                    let char_w = state.font_size * APPROX_CHAR_WIDTH * (state.th / 100.0);
2851                    let fi = font_map.get(&state.font_name);
2852                    for item in arr {
2853                        match item {
2854                            Object::String(bytes, _) => {
2855                                let text = decode_pdf_string_with_font_marked(bytes, fi);
2856                                for (ch, is_ct_origin) in text.iter() {
2857                                    push_glyph_positioned(
2858                                        &mut chars,
2859                                        &mut state,
2860                                        page,
2861                                        ch,
2862                                        is_ct_origin,
2863                                        char_w,
2864                                    );
2865                                }
2866                            }
2867                            _ => {
2868                                if let Some(adj) = as_number(item) {
2869                                    state.tm[4] -= adj / 1000.0 * state.font_size;
2870                                }
2871                            }
2872                        }
2873                    }
2874                }
2875            }
2876            "'" => {
2877                let new_tlm = multiply_matrix(&state.tlm, &[1.0, 0.0, 0.0, 1.0, 0.0, -state.tl]);
2878                state.tlm = new_tlm;
2879                state.tm = new_tlm;
2880
2881                let fi = font_map.get(&state.font_name);
2882                if let Some(text) = extract_decoded_string_operand_with_font(&op.operands, fi) {
2883                    let char_w = state.font_size * APPROX_CHAR_WIDTH * (state.th / 100.0);
2884                    for (ch, is_ct_origin) in text.iter() {
2885                        push_glyph_positioned(
2886                            &mut chars,
2887                            &mut state,
2888                            page,
2889                            ch,
2890                            is_ct_origin,
2891                            char_w,
2892                        );
2893                    }
2894                }
2895            }
2896            "\"" if op.operands.len() >= 3 => {
2897                if let Some(tw) = as_number(&op.operands[0]) {
2898                    state.tw = tw;
2899                }
2900                if let Some(tc) = as_number(&op.operands[1]) {
2901                    state.tc = tc;
2902                }
2903
2904                let new_tlm = multiply_matrix(&state.tlm, &[1.0, 0.0, 0.0, 1.0, 0.0, -state.tl]);
2905                state.tlm = new_tlm;
2906                state.tm = new_tlm;
2907
2908                let fi = font_map.get(&state.font_name);
2909                if let Some(text) = extract_decoded_string_operand_with_font(&op.operands[2..], fi)
2910                {
2911                    let char_w = state.font_size * APPROX_CHAR_WIDTH * (state.th / 100.0);
2912                    for (ch, is_ct_origin) in text.iter() {
2913                        push_glyph_positioned(
2914                            &mut chars,
2915                            &mut state,
2916                            page,
2917                            ch,
2918                            is_ct_origin,
2919                            char_w,
2920                        );
2921                    }
2922                }
2923            }
2924            _ => {}
2925        }
2926    }
2927
2928    chars
2929}
2930
2931/// Apply the current transformation matrix (CTM) to the text matrix position,
2932/// returning (x, y) in page/user space.
2933///
2934/// Mirrors the `compute_x` / `compute_y` logic in pdf-manip's `text_run.rs` so
2935/// that character positions emitted by `extract_chars_from_ops` use the same
2936/// coordinate space as `TextRun.x` / `TextRun.y` from `extract_text_runs`.
2937/// Without this, PDFs whose content streams set a non-identity CTM via `cm`
2938/// produce mismatched coordinates, breaking the spatial redaction fallback.
2939#[inline]
2940fn apply_ctm(state: &TextState) -> (f64, f64) {
2941    let x = state.ctm[0] * state.tm[4] + state.ctm[2] * state.tm[5] + state.ctm[4];
2942    let y = state.ctm[1] * state.tm[4] + state.ctm[3] * state.tm[5] + state.ctm[5];
2943    (x, y)
2944}
2945
2946/// Extract the first string operand, decoding via font's ToUnicode CMap if available.
2947fn extract_decoded_string_operand_with_font(
2948    operands: &[Object],
2949    font_info: Option<&FontInfo>,
2950) -> Option<DecodedPdfString> {
2951    for op in operands {
2952        if let Object::String(bytes, _) = op {
2953            return Some(decode_pdf_string_with_font_marked(bytes, font_info));
2954        }
2955    }
2956    None
2957}
2958
2959/// Convert a PDF object to a number (f64).
2960fn as_number(obj: &Object) -> Option<f64> {
2961    match obj {
2962        Object::Integer(i) => Some(*i as f64),
2963        Object::Real(f) => Some(*f as f64),
2964        _ => None,
2965    }
2966}
2967
2968/// Extract a 6-element transformation matrix from operands.
2969fn extract_matrix(operands: &[Object]) -> Option<[f64; 6]> {
2970    if operands.len() < 6 {
2971        return None;
2972    }
2973    let a = as_number(&operands[0])?;
2974    let b = as_number(&operands[1])?;
2975    let c = as_number(&operands[2])?;
2976    let d = as_number(&operands[3])?;
2977    let e = as_number(&operands[4])?;
2978    let f = as_number(&operands[5])?;
2979    Some([a, b, c, d, e, f])
2980}
2981
/// Compute the product `m1 × m2` of two affine transformation matrices.
///
/// Each matrix is stored as `[a, b, c, d, e, f]`, the six variable entries of
/// the 3x3 matrix whose rows are `[a b 0]`, `[c d 0]`, `[e f 1]`.
fn multiply_matrix(m1: &[f64; 6], m2: &[f64; 6]) -> [f64; 6] {
    let [a1, b1, c1, d1, e1, f1] = *m1;
    let [a2, b2, c2, d2, e2, f2] = *m2;
    [
        a1 * a2 + b1 * c2,
        a1 * b2 + b1 * d2,
        c1 * a2 + d1 * c2,
        c1 * b2 + d1 * d2,
        e1 * a2 + f1 * c2 + e2,
        e1 * b2 + f1 * d2 + f2,
    ]
}
2993
2994#[cfg(test)]
2995mod tests {
2996    use super::*;
2997    use lopdf::{dictionary, Document, Object, Stream};
2998
2999    /// Helper: create a minimal doc with text content.
3000    fn make_doc_with_text(content: &[u8]) -> Document {
3001        let mut doc = Document::with_version("1.7");
3002
3003        let content_stream = Stream::new(dictionary! {}, content.to_vec());
3004        let content_id = doc.add_object(Object::Stream(content_stream));
3005
3006        let page_dict = dictionary! {
3007            "Type" => "Page",
3008            "MediaBox" => vec![0.into(), 0.into(), 612.into(), 792.into()],
3009            "Contents" => Object::Reference(content_id),
3010        };
3011        let page_id = doc.add_object(Object::Dictionary(page_dict));
3012
3013        let pages_dict = dictionary! {
3014            "Type" => "Pages",
3015            "Kids" => vec![Object::Reference(page_id)],
3016            "Count" => 1_i64,
3017        };
3018        let pages_id = doc.add_object(Object::Dictionary(pages_dict));
3019
3020        if let Ok(Object::Dictionary(ref mut d)) = doc.get_object_mut(page_id) {
3021            d.set("Parent", Object::Reference(pages_id));
3022        }
3023
3024        let catalog = dictionary! {
3025            "Type" => "Catalog",
3026            "Pages" => Object::Reference(pages_id),
3027        };
3028        let catalog_id = doc.add_object(Object::Dictionary(catalog));
3029        doc.trailer.set("Root", Object::Reference(catalog_id));
3030
3031        doc
3032    }
3033
3034    #[test]
3035    fn extract_simple_text() {
3036        let doc = make_doc_with_text(b"BT /F1 12 Tf (Hello World) Tj ET");
3037        let blocks = extract_text(&doc);
3038        assert_eq!(blocks.len(), 1);
3039        assert_eq!(blocks[0].text, "Hello World");
3040        assert_eq!(blocks[0].page, 1);
3041        assert_eq!(blocks[0].font_size, 12.0);
3042    }
3043
3044    #[test]
3045    fn extract_page_text_single() {
3046        let doc = make_doc_with_text(b"BT /F1 12 Tf (Hello) Tj ET");
3047        let text = extract_page_text(&doc, 1).unwrap();
3048        assert_eq!(text, "Hello");
3049    }
3050
3051    #[test]
3052    fn extract_page_text_out_of_range() {
3053        let doc = make_doc_with_text(b"BT /F1 12 Tf (Hello) Tj ET");
3054        let result = extract_page_text(&doc, 5);
3055        assert!(result.is_err());
3056    }
3057
3058    #[test]
3059    fn extract_positioned_chars_basic() {
3060        let doc = make_doc_with_text(b"BT /F1 12 Tf (AB) Tj ET");
3061        let chars = extract_positioned_chars(&doc, 1).unwrap();
3062        assert_eq!(chars.len(), 2);
3063        assert_eq!(chars[0].ch, 'A');
3064        assert_eq!(chars[1].ch, 'B');
3065        assert_eq!(chars[0].page, 1);
3066        // Second char should be positioned after the first.
3067        assert!(chars[1].bbox[0] > chars[0].bbox[0]);
3068    }
3069
3070    #[test]
3071    fn extract_tj_array() {
3072        let doc = make_doc_with_text(b"BT /F1 12 Tf [(He) -100 (llo)] TJ ET");
3073        let blocks = extract_text(&doc);
3074        assert_eq!(blocks.len(), 1);
3075        assert_eq!(blocks[0].text, "Hello");
3076    }
3077
3078    #[test]
3079    fn empty_page_extracts_no_text() {
3080        let doc = make_doc_with_text(b"q Q");
3081        let blocks = extract_text(&doc);
3082        assert!(blocks.is_empty());
3083    }
3084
3085    #[test]
3086    fn multiline_text_extraction() {
3087        let doc = make_doc_with_text(b"BT /F1 12 Tf 12 TL (Line1) Tj T* (Line2) Tj ET");
3088        let blocks = extract_text(&doc);
3089        assert_eq!(blocks.len(), 2);
3090        assert_eq!(blocks[0].text, "Line1");
3091        assert_eq!(blocks[1].text, "Line2");
3092    }
3093
3094    /// `/ActualText` on a BDC marked-content sequence overrides the glyph
3095    /// reading of the inner Tj. Glyph text is preserved on `text`; the
3096    /// canonical reading lives on `actual_text`.
3097    #[test]
3098    fn actual_text_basic_bdc_override() {
3099        let doc = make_doc_with_text(b"BT /F1 12 Tf /Span <</ActualText (fi)>> BDC (X) Tj EMC ET");
3100        let blocks = extract_text(&doc);
3101        assert_eq!(blocks.len(), 1);
3102        assert_eq!(blocks[0].text, "X");
3103        assert_eq!(blocks[0].actual_text.as_deref(), Some("fi"));
3104    }
3105
3106    /// Nested BDCs: the innermost `/ActualText` wins inside its range, and
3107    /// the outer override is restored after the inner EMC. A flat flag would
3108    /// leak the inner value past its scope; the stack must pop one level.
3109    #[test]
3110    fn actual_text_nested_bdc_inner_overrides_outer() {
3111        let doc = make_doc_with_text(
3112            b"BT /F1 12 Tf \
3113              /Span <</ActualText (outer)>> BDC \
3114                (A) Tj \
3115                /Span <</ActualText (inner)>> BDC \
3116                  (B) Tj \
3117                EMC \
3118                (C) Tj \
3119              EMC ET",
3120        );
3121        let blocks = extract_text(&doc);
3122        assert_eq!(blocks.len(), 3);
3123        assert_eq!(blocks[0].text, "A");
3124        assert_eq!(blocks[0].actual_text.as_deref(), Some("outer"));
3125        assert_eq!(blocks[1].text, "B");
3126        assert_eq!(blocks[1].actual_text.as_deref(), Some("inner"));
3127        assert_eq!(blocks[2].text, "C");
3128        assert_eq!(blocks[2].actual_text.as_deref(), Some("outer"));
3129    }
3130
3131    /// BMC (no property list) still pushes a stack entry so the matching EMC
3132    /// pops the right level — otherwise an EMC inside a BDC range would leak.
3133    #[test]
3134    fn actual_text_bmc_does_not_leak_emc() {
3135        let doc = make_doc_with_text(
3136            b"BT /F1 12 Tf \
3137              /Span <</ActualText (X)>> BDC \
3138                /Artifact BMC (in) Tj EMC \
3139                (out) Tj \
3140              EMC \
3141              (after) Tj ET",
3142        );
3143        let blocks = extract_text(&doc);
3144        assert_eq!(blocks.len(), 3);
3145        assert_eq!(blocks[0].actual_text.as_deref(), Some("X"));
3146        assert_eq!(blocks[1].actual_text.as_deref(), Some("X"));
3147        assert_eq!(blocks[2].actual_text, None);
3148    }
3149
3150    /// Build a one-page Document whose page invokes a Form XObject (via
3151    /// `/Fm0 Do`) wrapped in a `BDC` / `EMC` pair. `xobj_content` is the
3152    /// content stream placed inside the Form XObject; `page_pre` runs
3153    /// before the `Do`, and `page_post` runs after it (still inside the
3154    /// surrounding `BDC` / `EMC`). Used to drive issue #1358 regression.
3155    fn make_doc_with_xobj_inside_bdc(
3156        page_pre: &[u8],
3157        xobj_content: &[u8],
3158        page_post: &[u8],
3159        actual_text: &str,
3160    ) -> Document {
3161        let mut doc = Document::with_version("1.7");
3162
3163        let xobj_dict = dictionary! {
3164            "Type" => "XObject",
3165            "Subtype" => "Form",
3166            "BBox" => vec![0.into(), 0.into(), 100.into(), 100.into()],
3167        };
3168        let xobj_stream = Stream::new(xobj_dict, xobj_content.to_vec());
3169        let xobj_id = doc.add_object(Object::Stream(xobj_stream));
3170
3171        // Page content: BT /F1 12 Tf /Span <</ActualText(...)>> BDC <pre> /Fm0 Do <post> EMC ET
3172        let mut page_content = Vec::<u8>::new();
3173        page_content.extend_from_slice(b"BT /F1 12 Tf /Span <</ActualText (");
3174        page_content.extend_from_slice(actual_text.as_bytes());
3175        page_content.extend_from_slice(b")>> BDC ");
3176        page_content.extend_from_slice(page_pre);
3177        page_content.extend_from_slice(b" /Fm0 Do ");
3178        page_content.extend_from_slice(page_post);
3179        page_content.extend_from_slice(b" EMC ET");
3180
3181        let content_stream = Stream::new(dictionary! {}, page_content);
3182        let content_id = doc.add_object(Object::Stream(content_stream));
3183
3184        let resources = dictionary! {
3185            "XObject" => dictionary! { "Fm0" => Object::Reference(xobj_id) },
3186        };
3187        let resources_id = doc.add_object(Object::Dictionary(resources));
3188
3189        let page_dict = dictionary! {
3190            "Type" => "Page",
3191            "MediaBox" => vec![0.into(), 0.into(), 612.into(), 792.into()],
3192            "Contents" => Object::Reference(content_id),
3193            "Resources" => Object::Reference(resources_id),
3194        };
3195        let page_id = doc.add_object(Object::Dictionary(page_dict));
3196
3197        let pages_dict = dictionary! {
3198            "Type" => "Pages",
3199            "Kids" => vec![Object::Reference(page_id)],
3200            "Count" => 1_i64,
3201        };
3202        let pages_id = doc.add_object(Object::Dictionary(pages_dict));
3203
3204        if let Ok(Object::Dictionary(ref mut d)) = doc.get_object_mut(page_id) {
3205            d.set("Parent", Object::Reference(pages_id));
3206        }
3207
3208        let catalog = dictionary! {
3209            "Type" => "Catalog",
3210            "Pages" => Object::Reference(pages_id),
3211        };
3212        let catalog_id = doc.add_object(Object::Dictionary(catalog));
3213        doc.trailer.set("Root", Object::Reference(catalog_id));
3214
3215        doc
3216    }
3217
3218    /// Regression guard for issue #1358: when a `Do` operator invoking a
3219    /// Form XObject sits inside a `BDC` / `EMC` pair carrying
3220    /// `/ActualText`, text emitted by the XObject's own `Tj`/`TJ`
3221    /// operators must inherit that `/ActualText` value. Previously the
3222    /// recursive `extract_blocks_from_ops_inner` reset the marked-content
3223    /// stack and lost the surrounding context.
3224    #[test]
3225    fn actual_text_propagates_into_form_xobject_recursion() {
3226        let doc = make_doc_with_xobj_inside_bdc(
3227            b"(pre) Tj",
3228            b"BT /F1 12 Tf (inside) Tj ET",
3229            b"(post) Tj",
3230            "wrapped",
3231        );
3232        let blocks = extract_text(&doc);
3233        assert_eq!(blocks.len(), 3);
3234        assert_eq!(blocks[0].text, "pre");
3235        assert_eq!(blocks[0].actual_text.as_deref(), Some("wrapped"));
3236        // The block emitted inside the Form XObject's BT/ET must inherit
3237        // the surrounding /ActualText — not silently become None.
3238        assert_eq!(blocks[1].text, "inside");
3239        assert_eq!(
3240            blocks[1].actual_text.as_deref(),
3241            Some("wrapped"),
3242            "Form XObject text lost surrounding /ActualText (issue #1358)"
3243        );
3244        assert_eq!(blocks[2].text, "post");
3245        assert_eq!(blocks[2].actual_text.as_deref(), Some("wrapped"));
3246    }
3247
3248    /// Inverse regression: when a `Do` invokes a Form XObject whose own
3249    /// content has its own `BDC` ... `EMC` pair with a different
3250    /// `/ActualText`, the inner pair takes precedence inside the XObject
3251    /// and the outer pair resumes after the inner `EMC`. This guards
3252    /// against the inherited binding clobbering legitimate inner
3253    /// marked-content state.
3254    #[test]
3255    fn actual_text_inner_xobj_bdc_overrides_inherited() {
3256        let doc = make_doc_with_xobj_inside_bdc(
3257            b"",
3258            b"BT /F1 12 Tf \
3259              /Span <</ActualText (inner)>> BDC (B) Tj EMC \
3260              (after) Tj ET",
3261            b"",
3262            "outer",
3263        );
3264        let blocks = extract_text(&doc);
3265        assert_eq!(blocks.len(), 2);
3266        assert_eq!(blocks[0].text, "B");
3267        assert_eq!(blocks[0].actual_text.as_deref(), Some("inner"));
3268        assert_eq!(blocks[1].text, "after");
3269        assert_eq!(blocks[1].actual_text.as_deref(), Some("outer"));
3270    }
3271
3272    /// `/ActualText` encoded as UTF-16BE (with BOM) decodes to the proper
3273    /// Unicode string. Real PDFs almost always use this form.
3274    #[test]
3275    fn actual_text_utf16be_bom_decodes() {
3276        // <FEFF00660069> = UTF-16BE BOM + "fi"
3277        let doc = make_doc_with_text(
3278            b"BT /F1 12 Tf /Span <</ActualText <FEFF00660069> >> BDC (X) Tj EMC ET",
3279        );
3280        let blocks = extract_text(&doc);
3281        assert_eq!(blocks.len(), 1);
3282        assert_eq!(blocks[0].actual_text.as_deref(), Some("fi"));
3283    }
3284
3285    // -- M4-LIG-01: ligature decomposition -------------------------------
3286
3287    #[test]
3288    fn ligature_ff_decomposes() {
3289        assert_eq!(decompose_ligature_char('\u{FB00}'), Some("ff"));
3290        assert_eq!(decompose_ligatures("\u{FB00}"), "ff");
3291        assert_eq!(decompose_ligatures("o\u{FB00}ice"), "office");
3292    }
3293
3294    #[test]
3295    fn ligature_fi_decomposes() {
3296        assert_eq!(decompose_ligature_char('\u{FB01}'), Some("fi"));
3297        assert_eq!(decompose_ligatures("\u{FB01}"), "fi");
3298        assert_eq!(decompose_ligatures("of\u{FB01}ce"), "office");
3299    }
3300
3301    #[test]
3302    fn ligature_fl_decomposes() {
3303        assert_eq!(decompose_ligature_char('\u{FB02}'), Some("fl"));
3304        assert_eq!(decompose_ligatures("\u{FB02}ame"), "flame");
3305    }
3306
3307    #[test]
3308    fn ligature_ffi_decomposes() {
3309        assert_eq!(decompose_ligature_char('\u{FB03}'), Some("ffi"));
3310        assert_eq!(decompose_ligatures("o\u{FB03}ce"), "office");
3311    }
3312
3313    #[test]
3314    fn ligature_ffl_decomposes() {
3315        assert_eq!(decompose_ligature_char('\u{FB04}'), Some("ffl"));
3316        assert_eq!(decompose_ligatures("ba\u{FB04}e"), "baffle");
3317    }
3318
3319    /// `st` has both U+FB05 (long-s + t, archaic) and U+FB06 (regular st);
3320    /// both decompose to "st" — historical spelling is not preserved since
3321    /// the goal is text-search/copy-paste, not typographic round-trip.
3322    /// Glyph name `st` also routes through this codepoint.
3323    #[test]
3324    fn ligature_st_decomposes() {
3325        assert_eq!(decompose_ligature_char('\u{FB06}'), Some("st"));
3326        assert_eq!(decompose_ligature_char('\u{FB05}'), Some("st"));
3327        assert_eq!(decompose_ligatures("fa\u{FB06}"), "fast");
3328        // Glyph name path: `st` → U+FB06 → "st"
3329        assert_eq!(glyph_name_to_unicode("st"), Some('\u{FB06}'));
3330    }
3331
3332    /// `ct` has no precomposed Unicode codepoint, so `glyph_name_to_unicode`
3333    /// reports `None`; the encoding-map builder records the slot in the
3334    /// `ct_codes` side-table instead. Decoding emits one internal marker
3335    /// scalar for that code, then the ligature layer expands only markers
3336    /// whose origin came from `ct_codes`. This avoids clobbering legitimate
3337    /// U+E007 values arriving through ToUnicode CMaps.
3338    #[test]
3339    fn ligature_ct_via_glyph_name_emits_string() {
3340        assert_eq!(glyph_name_to_unicode("ct"), None);
3341        // End-to-end: byte 1 → /ct → "ct" in extracted text.
3342        let doc = make_doc_with_ligature_font(b"BT /F1 12 Tf <6101> Tj ET", "ct");
3343        let blocks = extract_text(&doc);
3344        assert_eq!(blocks.len(), 1);
3345        assert_eq!(blocks[0].text, "act");
3346    }
3347
3348    /// Regression guard for issue #1357: a ToUnicode CMap that legitimately
3349    /// maps a code to U+E007 must NOT be silently rewritten to "ct". The
3350    /// previous PUA-sentinel implementation clobbered every U+E007 it saw,
3351    /// regardless of origin.
3352    #[test]
3353    fn tounicode_pua_e007_is_preserved() {
3354        let doc = make_doc_with_tounicode_cmap(b"BT /F1 12 Tf <01> Tj ET", 0x01, 0xE007);
3355        let blocks = extract_text(&doc);
3356        assert_eq!(blocks.len(), 1);
3357        assert_eq!(blocks[0].text, "\u{E007}");
3358        assert_ne!(blocks[0].text, "ct");
3359    }
3360
3361    /// A `/ct` glyph must advance the text matrix as one glyph even though
3362    /// it expands to two extracted characters. Otherwise following glyphs in
3363    /// the same show operator drift right by one extra glyph width.
3364    #[test]
3365    fn ligature_ct_positioned_chars_advance_as_single_glyph() {
3366        // <610162>: a (0x61), ct (0x01 via /Differences), b (0x62)
3367        let doc = make_doc_with_ligature_font(b"BT /F1 12 Tf <610162> Tj ET", "ct");
3368        let chars = extract_positioned_chars(&doc, 1).unwrap();
3369        let extracted: String = chars.iter().map(|c| c.ch).collect();
3370        assert_eq!(extracted, "actb");
3371        assert_eq!(chars.len(), 4);
3372
3373        let char_w = 12.0 * APPROX_CHAR_WIDTH;
3374        assert!((chars[1].bbox[0] - char_w).abs() < 1e-6);
3375        assert!((chars[3].bbox[0] - (2.0 * char_w)).abs() < 1e-6);
3376    }
3377
3378    /// Build a one-page Document with a font whose /Encoding /Differences
3379    /// remaps byte 1 to the given ligature glyph name. Used by the golden
3380    /// "office" fixture below to drive ligature decomposition end-to-end
3381    /// through the text extractor.
3382    fn make_doc_with_ligature_font(content: &[u8], glyph_name: &str) -> Document {
3383        let mut doc = Document::with_version("1.7");
3384
3385        let content_stream = Stream::new(dictionary! {}, content.to_vec());
3386        let content_id = doc.add_object(Object::Stream(content_stream));
3387
3388        let encoding = dictionary! {
3389            "Type" => "Encoding",
3390            "BaseEncoding" => "WinAnsiEncoding",
3391            "Differences" => vec![
3392                1_i64.into(),
3393                Object::Name(glyph_name.as_bytes().to_vec()),
3394            ],
3395        };
3396        let encoding_id = doc.add_object(Object::Dictionary(encoding));
3397
3398        let font = dictionary! {
3399            "Type" => "Font",
3400            "Subtype" => "Type1",
3401            "BaseFont" => "Helvetica",
3402            "Encoding" => Object::Reference(encoding_id),
3403        };
3404        let font_id = doc.add_object(Object::Dictionary(font));
3405
3406        let resources = dictionary! {
3407            "Font" => dictionary! { "F1" => Object::Reference(font_id) },
3408        };
3409        let resources_id = doc.add_object(Object::Dictionary(resources));
3410
3411        let page_dict = dictionary! {
3412            "Type" => "Page",
3413            "MediaBox" => vec![0.into(), 0.into(), 612.into(), 792.into()],
3414            "Contents" => Object::Reference(content_id),
3415            "Resources" => Object::Reference(resources_id),
3416        };
3417        let page_id = doc.add_object(Object::Dictionary(page_dict));
3418
3419        let pages_dict = dictionary! {
3420            "Type" => "Pages",
3421            "Kids" => vec![Object::Reference(page_id)],
3422            "Count" => 1_i64,
3423        };
3424        let pages_id = doc.add_object(Object::Dictionary(pages_dict));
3425
3426        if let Ok(Object::Dictionary(ref mut d)) = doc.get_object_mut(page_id) {
3427            d.set("Parent", Object::Reference(pages_id));
3428        }
3429
3430        let catalog = dictionary! {
3431            "Type" => "Catalog",
3432            "Pages" => Object::Reference(pages_id),
3433        };
3434        let catalog_id = doc.add_object(Object::Dictionary(catalog));
3435        doc.trailer.set("Root", Object::Reference(catalog_id));
3436
3437        doc
3438    }
3439
3440    /// Golden fixture: byte sequence `o + ffi + c + e` rendered through a
3441    /// font whose /Differences remaps byte 1 → /ffi → U+FB03. End-to-end
3442    /// the extractor should produce "office", not "o\u{FB03}ce". This is
3443    /// the single test that exercises the full pipeline: glyph→unicode →
3444    /// encoding map → decode → ligature decomposition.
3445    #[test]
3446    fn ligature_office_golden_ffi() {
3447        // Hex string <6F016365>: o (0x6F), ffi (0x01), c (0x63), e (0x65)
3448        let doc = make_doc_with_ligature_font(b"BT /F1 12 Tf <6F016365> Tj ET", "ffi");
3449        let blocks = extract_text(&doc);
3450        assert_eq!(blocks.len(), 1);
3451        assert_eq!(blocks[0].text, "office");
3452    }
3453
3454    /// Build a one-page Document whose font has a ToUnicode CMap mapping a
3455    /// single 1-byte source code to an explicit Unicode scalar. Used to
3456    /// drive the regression guard for issue #1357: a ToUnicode-supplied
3457    /// codepoint must round-trip through the extractor unchanged.
3458    fn make_doc_with_tounicode_cmap(content: &[u8], src: u8, dst: u32) -> Document {
3459        let mut doc = Document::with_version("1.7");
3460
3461        let content_stream = Stream::new(dictionary! {}, content.to_vec());
3462        let content_id = doc.add_object(Object::Stream(content_stream));
3463
3464        // Minimal ToUnicode CMap with a single bfchar mapping.
3465        let cmap_text = format!(
3466            "/CIDInit /ProcSet findresource begin\n\
3467             12 dict begin\n\
3468             begincmap\n\
3469             /CMapType 2 def\n\
3470             1 beginbfchar\n\
3471             <{:02X}> <{:04X}>\n\
3472             endbfchar\n\
3473             endcmap CMapName currentdict /CMap defineresource pop end end",
3474            src, dst
3475        );
3476        let cmap_stream = Stream::new(dictionary! {}, cmap_text.into_bytes());
3477        let cmap_id = doc.add_object(Object::Stream(cmap_stream));
3478
3479        let font = dictionary! {
3480            "Type" => "Font",
3481            "Subtype" => "Type1",
3482            "BaseFont" => "Helvetica",
3483            "Encoding" => "WinAnsiEncoding",
3484            "ToUnicode" => Object::Reference(cmap_id),
3485        };
3486        let font_id = doc.add_object(Object::Dictionary(font));
3487
3488        let resources = dictionary! {
3489            "Font" => dictionary! { "F1" => Object::Reference(font_id) },
3490        };
3491        let resources_id = doc.add_object(Object::Dictionary(resources));
3492
3493        let page_dict = dictionary! {
3494            "Type" => "Page",
3495            "MediaBox" => vec![0.into(), 0.into(), 612.into(), 792.into()],
3496            "Contents" => Object::Reference(content_id),
3497            "Resources" => Object::Reference(resources_id),
3498        };
3499        let page_id = doc.add_object(Object::Dictionary(page_dict));
3500
3501        let pages_dict = dictionary! {
3502            "Type" => "Pages",
3503            "Kids" => vec![Object::Reference(page_id)],
3504            "Count" => 1_i64,
3505        };
3506        let pages_id = doc.add_object(Object::Dictionary(pages_dict));
3507
3508        if let Ok(Object::Dictionary(ref mut d)) = doc.get_object_mut(page_id) {
3509            d.set("Parent", Object::Reference(pages_id));
3510        }
3511
3512        let catalog = dictionary! {
3513            "Type" => "Catalog",
3514            "Pages" => Object::Reference(pages_id),
3515        };
3516        let catalog_id = doc.add_object(Object::Dictionary(catalog));
3517        doc.trailer.set("Root", Object::Reference(catalog_id));
3518
3519        doc
3520    }
3521
3522    /// PositionedChar: a ligature glyph splits its bbox evenly across the
3523    /// constituent characters so character offsets stay within the glyph's
3524    /// horizontal footprint. The matrix advances exactly once per glyph,
3525    /// not once per constituent — so layout is preserved.
3526    #[test]
3527    fn ligature_positioned_chars_split_bbox() {
3528        let doc = make_doc_with_ligature_font(b"BT /F1 12 Tf <01> Tj ET", "ffi");
3529        let chars = extract_positioned_chars(&doc, 1).unwrap();
3530        assert_eq!(chars.len(), 3, "ffi → 3 chars");
3531        assert_eq!(chars[0].ch, 'f');
3532        assert_eq!(chars[1].ch, 'f');
3533        assert_eq!(chars[2].ch, 'i');
3534        // Bboxes should tile horizontally.
3535        assert!(chars[0].bbox[2] <= chars[1].bbox[0] + 1e-9);
3536        assert!(chars[1].bbox[2] <= chars[2].bbox[0] + 1e-9);
3537        // And total width should equal one glyph's char_w.
3538        let total = chars[2].bbox[2] - chars[0].bbox[0];
3539        let expected = 12.0 * APPROX_CHAR_WIDTH;
3540        assert!((total - expected).abs() < 1e-6);
3541    }
3542
3543    // -- M4-FOLLOWUP-03: PositionedChar decomposition + Tj bbox geometry ---
3544
3545    /// Regression guard for issue #1359 (problem 1): ligatures outside the
3546    /// hardcoded Latin set must decompose in PositionedChar output the same
3547    /// way they decompose in extracted text. Previously
3548    /// `push_glyph_positioned` only ran the hardcoded Latin map while
3549    /// extracted text additionally applied an NFKD fallback over
3550    /// FB00..FB4F, leaving counts misaligned for Armenian/Hebrew/Arabic
3551    /// presentation forms.
3552    #[test]
3553    fn positioned_chars_match_extracted_text_for_armenian_ligature() {
3554        // U+FB13 (Armenian small ligature men now) NFKDs to U+0574 U+0576.
3555        // The font's /Differences slot 1 maps to glyph name uniFB13.
3556        let doc = make_doc_with_ligature_font(b"BT /F1 12 Tf <01> Tj ET", "uniFB13");
3557        let blocks = extract_text(&doc);
3558        let chars = extract_positioned_chars(&doc, 1).unwrap();
3559        assert_eq!(blocks.len(), 1);
3560        // Both paths must agree on the constituent count.
3561        assert_eq!(
3562            blocks[0].text.chars().count(),
3563            chars.len(),
3564            "PositionedChar count drifted from extract_text() char count \
3565             (issue #1359 problem 1): blocks[0].text = {:?}, chars = {:?}",
3566            blocks[0].text,
3567            chars.iter().map(|c| c.ch).collect::<String>(),
3568        );
3569        // And on the actual characters.
3570        let chars_str: String = chars.iter().map(|c| c.ch).collect();
3571        assert_eq!(blocks[0].text, chars_str);
3572        // Total bbox span equals one glyph's char_w (the matrix advanced once).
3573        let total = chars.last().unwrap().bbox[2] - chars.first().unwrap().bbox[0];
3574        let expected = 12.0 * APPROX_CHAR_WIDTH;
3575        assert!((total - expected).abs() < 1e-6);
3576    }
3577
3578    /// Hebrew presentation forms may NFKD-map to a single base Hebrew
3579    /// codepoint. Those mappings are still semantic decompositions and must
3580    /// not be dropped just because the output has one scalar.
3581    #[test]
3582    fn hebrew_presentation_form_extracts_single_codepoint_nfkd_mapping() {
3583        // U+FB21 (Hebrew letter wide alef) NFKDs to U+05D0.
3584        let doc = make_doc_with_ligature_font(b"BT /F1 12 Tf <01> Tj ET", "uniFB21");
3585        let blocks = extract_text(&doc);
3586        assert_eq!(blocks.len(), 1);
3587        assert_eq!(blocks[0].text, "\u{05D0}");
3588        assert_eq!(
3589            decompose_glyph_to_string('\u{FB21}').as_deref(),
3590            Some("\u{05D0}")
3591        );
3592    }
3593
3594    /// The positioned-character stream must expose the same text characters
3595    /// as extract_text for single-codepoint Hebrew presentation-form
3596    /// decompositions.
3597    #[test]
3598    fn positioned_chars_match_extracted_text_for_hebrew_presentation_form() {
3599        let doc = make_doc_with_ligature_font(b"BT /F1 12 Tf <01> Tj ET", "uniFB21");
3600        let blocks = extract_text(&doc);
3601        let chars = extract_positioned_chars(&doc, 1).unwrap();
3602        let chars_str: String = chars.iter().map(|c| c.ch).collect();
3603
3604        assert_eq!(blocks.len(), 1);
3605        assert_eq!(blocks[0].text, "\u{05D0}");
3606        assert_eq!(chars_str, blocks[0].text);
3607        assert_eq!(blocks[0].text.chars().count(), chars.len());
3608    }
3609
3610    /// Regression guard for issue #1359 (problem 2): a Tj operator that
3611    /// emits a single ligature glyph must produce a TextBlock whose bbox
3612    /// width equals one rendered glyph's footprint, not the byte length
3613    /// of the decomposed display string. Previously `display_text.len()`
3614    /// over-counted because ligature expansion grows the string while
3615    /// the text matrix advances exactly once per source glyph.
3616    #[test]
3617    fn tj_block_width_matches_rendered_glyph_count() {
3618        let doc = make_doc_with_ligature_font(b"BT /F1 12 Tf <01> Tj ET", "ffi");
3619        let blocks = extract_text(&doc);
3620        assert_eq!(blocks.len(), 1);
3621        assert_eq!(blocks[0].text, "ffi");
3622        let width = blocks[0].bbox[2] - blocks[0].bbox[0];
3623        let expected = 12.0 * APPROX_CHAR_WIDTH;
3624        assert!(
3625            (width - expected).abs() < 1e-6,
3626            "Tj bbox width {} != one glyph's char_w {} (issue #1359 problem 2)",
3627            width,
3628            expected,
3629        );
3630    }
3631
3632    /// Cross-operator consistency: a `Tj` block and a `TJ` block emitting
3633    /// the same single-glyph ligature must produce TextBlocks with equal
3634    /// bbox widths. Pre-fix, Tj used `display_text.len()` (3 for "ffi")
3635    /// and TJ used `x_end - x_start` (1 glyph advance) — so the same input
3636    /// rendered as ~3× wider in Tj than in TJ.
3637    #[test]
3638    fn tj_and_tj_array_bbox_widths_agree_for_ligature() {
3639        let tj_doc = make_doc_with_ligature_font(b"BT /F1 12 Tf <01> Tj ET", "ffi");
3640        let tj_blocks = extract_text(&tj_doc);
3641        let tj_doc_arr = make_doc_with_ligature_font(b"BT /F1 12 Tf [<01>] TJ ET", "ffi");
3642        let tj_arr_blocks = extract_text(&tj_doc_arr);
3643        assert_eq!(tj_blocks.len(), 1);
3644        assert_eq!(tj_arr_blocks.len(), 1);
3645        let tj_w = tj_blocks[0].bbox[2] - tj_blocks[0].bbox[0];
3646        let tj_arr_w = tj_arr_blocks[0].bbox[2] - tj_arr_blocks[0].bbox[0];
3647        assert!(
3648            (tj_w - tj_arr_w).abs() < 1e-6,
3649            "Tj bbox width {} disagrees with TJ bbox width {} for the same \
3650             single-glyph ligature input (issue #1359 problem 2)",
3651            tj_w,
3652            tj_arr_w,
3653        );
3654    }
3655
3656    // ---- G1: text-run metadata extraction tests ----
3657
3658    /// Build a doc with a single /F1 font resource carrying the given
3659    /// BaseFont and an optional FontDescriptor /Flags value.
3660    fn make_doc_with_font(content: &[u8], base_font: &str, desc_flags: Option<u32>) -> Document {
3661        let mut doc = Document::with_version("1.7");
3662
3663        let mut font_dict = dictionary! {
3664            "Type" => "Font",
3665            "Subtype" => "Type1",
3666            "BaseFont" => Object::Name(base_font.as_bytes().to_vec()),
3667        };
3668        if let Some(flags) = desc_flags {
3669            let descriptor = dictionary! {
3670                "Type" => "FontDescriptor",
3671                "FontName" => Object::Name(base_font.as_bytes().to_vec()),
3672                "Flags" => Object::Integer(flags as i64),
3673            };
3674            let desc_id = doc.add_object(Object::Dictionary(descriptor));
3675            font_dict.set("FontDescriptor", Object::Reference(desc_id));
3676        }
3677        let font_id = doc.add_object(Object::Dictionary(font_dict));
3678
3679        let resources = dictionary! {
3680            "Font" => dictionary! { "F1" => Object::Reference(font_id) },
3681        };
3682        let resources_id = doc.add_object(Object::Dictionary(resources));
3683
3684        let content_stream = Stream::new(dictionary! {}, content.to_vec());
3685        let content_id = doc.add_object(Object::Stream(content_stream));
3686
3687        let page_dict = dictionary! {
3688            "Type" => "Page",
3689            "MediaBox" => vec![0.into(), 0.into(), 612.into(), 792.into()],
3690            "Contents" => Object::Reference(content_id),
3691            "Resources" => Object::Reference(resources_id),
3692        };
3693        let page_id = doc.add_object(Object::Dictionary(page_dict));
3694
3695        let pages_dict = dictionary! {
3696            "Type" => "Pages",
3697            "Kids" => vec![Object::Reference(page_id)],
3698            "Count" => 1_i64,
3699        };
3700        let pages_id = doc.add_object(Object::Dictionary(pages_dict));
3701
3702        if let Ok(Object::Dictionary(ref mut d)) = doc.get_object_mut(page_id) {
3703            d.set("Parent", Object::Reference(pages_id));
3704        }
3705
3706        let catalog = dictionary! {
3707            "Type" => "Catalog",
3708            "Pages" => Object::Reference(pages_id),
3709        };
3710        let catalog_id = doc.add_object(Object::Dictionary(catalog));
3711        doc.trailer.set("Root", Object::Reference(catalog_id));
3712
3713        doc
3714    }
3715
3716    #[test]
3717    fn g1_tj_carries_basefont_metadata() {
3718        let doc = make_doc_with_font(b"BT /F1 12 Tf (Hello) Tj ET", "Helvetica", None);
3719        let blocks = extract_text(&doc);
3720        assert_eq!(blocks.len(), 1);
3721        assert_eq!(blocks[0].text, "Hello");
3722        assert_eq!(blocks[0].base_font.as_deref(), Some("Helvetica"));
3723        assert!(!blocks[0].is_bold);
3724        assert!(!blocks[0].is_italic);
3725    }
3726
3727    #[test]
3728    fn g1_tj_array_carries_basefont_metadata() {
3729        let doc = make_doc_with_font(b"BT /F1 12 Tf [(He) -100 (llo)] TJ ET", "Helvetica", None);
3730        let blocks = extract_text(&doc);
3731        assert_eq!(blocks.len(), 1);
3732        assert_eq!(blocks[0].text, "Hello");
3733        assert_eq!(blocks[0].base_font.as_deref(), Some("Helvetica"));
3734    }
3735
3736    #[test]
3737    fn g1_bold_inferred_from_basefont_name() {
3738        let doc = make_doc_with_font(b"BT /F1 12 Tf (Bold) Tj ET", "Helvetica-Bold", None);
3739        let blocks = extract_text(&doc);
3740        assert_eq!(blocks.len(), 1);
3741        assert!(blocks[0].is_bold);
3742        assert!(!blocks[0].is_italic);
3743        assert_eq!(blocks[0].base_font.as_deref(), Some("Helvetica-Bold"));
3744    }
3745
3746    #[test]
3747    fn g1_italic_inferred_from_basefont_name() {
3748        let doc = make_doc_with_font(b"BT /F1 12 Tf (Slanted) Tj ET", "Times-Italic", None);
3749        let blocks = extract_text(&doc);
3750        assert_eq!(blocks.len(), 1);
3751        assert!(blocks[0].is_italic);
3752        assert!(!blocks[0].is_bold);
3753    }
3754
3755    #[test]
3756    fn g1_subset_prefix_stripped_from_basefont() {
3757        let doc = make_doc_with_font(
3758            b"BT /F1 12 Tf (Subset) Tj ET",
3759            "ABCDEF+Helvetica-Bold",
3760            None,
3761        );
3762        let blocks = extract_text(&doc);
3763        assert_eq!(blocks.len(), 1);
3764        assert_eq!(blocks[0].base_font.as_deref(), Some("Helvetica-Bold"));
3765        assert!(blocks[0].is_bold);
3766    }
3767
3768    #[test]
3769    fn g1_italic_inferred_from_descriptor_flag() {
3770        // Flags bit 7 (1<<6 = 0x40) = Italic.
3771        let doc = make_doc_with_font(b"BT /F1 12 Tf (X) Tj ET", "Custom-Roman", Some(0x40));
3772        let blocks = extract_text(&doc);
3773        assert_eq!(blocks.len(), 1);
3774        assert!(blocks[0].is_italic);
3775        // Name has no "italic" hint; only the flag signals it.
3776        assert_eq!(blocks[0].base_font.as_deref(), Some("Custom-Roman"));
3777    }
3778
3779    #[test]
3780    fn g1_rg_color_propagates_to_block() {
3781        // 1 0 0 rg → solid red. Blocks emitted after the setter carry it.
3782        let doc = make_doc_with_font(b"1 0 0 rg BT /F1 12 Tf (Red) Tj ET", "Helvetica", None);
3783        let blocks = extract_text(&doc);
3784        assert_eq!(blocks.len(), 1);
3785        assert_eq!(blocks[0].color, Some([255, 0, 0, 255]));
3786    }
3787
3788    #[test]
3789    fn g1_gray_color_via_g_operator() {
3790        // 0.5 g → mid-gray broadcast to RGBA.
3791        let doc = make_doc_with_font(b"0.5 g BT /F1 12 Tf (Gray) Tj ET", "Helvetica", None);
3792        let blocks = extract_text(&doc);
3793        assert_eq!(blocks.len(), 1);
3794        let c = blocks[0].color.expect("expected gray color");
3795        // Round-trip tolerance: 0.5 * 255 ≈ 128.
3796        assert_eq!(c[3], 255);
3797        assert_eq!(c[0], c[1]);
3798        assert_eq!(c[1], c[2]);
3799        assert!((c[0] as i32 - 128).abs() <= 1);
3800    }
3801
3802    #[test]
3803    fn g1_default_color_is_black_for_compliant_initial_state() {
3804        // No color setter — PDF §8.6.5.3 initial fill = DeviceGray 0 = black.
3805        let doc = make_doc_with_font(b"BT /F1 12 Tf (Default) Tj ET", "Helvetica", None);
3806        let blocks = extract_text(&doc);
3807        assert_eq!(blocks.len(), 1);
3808        assert_eq!(blocks[0].color, Some([0, 0, 0, 255]));
3809    }
3810
3811    #[test]
3812    fn g1_cmyk_color_via_k_operator() {
3813        // Pure cyan: 1 0 0 0 k → (1-c)*(1-k)=0, (1-m)*(1-k)=1, (1-y)*(1-k)=1
3814        let doc = make_doc_with_font(b"1 0 0 0 k BT /F1 12 Tf (Cyan) Tj ET", "Helvetica", None);
3815        let blocks = extract_text(&doc);
3816        assert_eq!(blocks.len(), 1);
3817        let c = blocks[0].color.expect("k should set color");
3818        assert_eq!(c, [0, 255, 255, 255]);
3819    }
3820
3821    #[test]
3822    fn g1_tj_and_tj_array_emit_same_metadata() {
3823        // Identical font/color setup; differ only in show operator.
3824        let tj_doc =
3825            make_doc_with_font(b"1 0 0 rg BT /F1 12 Tf (Hi) Tj ET", "Helvetica-Bold", None);
3826        let tj_array_doc = make_doc_with_font(
3827            b"1 0 0 rg BT /F1 12 Tf [(Hi)] TJ ET",
3828            "Helvetica-Bold",
3829            None,
3830        );
3831        let a = extract_text(&tj_doc);
3832        let b = extract_text(&tj_array_doc);
3833        assert_eq!(a.len(), 1);
3834        assert_eq!(b.len(), 1);
3835        assert_eq!(a[0].base_font, b[0].base_font);
3836        assert_eq!(a[0].is_bold, b[0].is_bold);
3837        assert_eq!(a[0].is_italic, b[0].is_italic);
3838        assert_eq!(a[0].color, b[0].color);
3839    }
3840
3841    #[test]
3842    fn g1_color_restored_across_q_capital_q() {
3843        // Color set inside q/Q must not leak out.
3844        let doc = make_doc_with_font(
3845            b"q 1 0 0 rg Q BT /F1 12 Tf (Outside) Tj ET",
3846            "Helvetica",
3847            None,
3848        );
3849        let blocks = extract_text(&doc);
3850        assert_eq!(blocks.len(), 1);
3851        // Outside the saved scope, the initial black is what we see.
3852        assert_eq!(blocks[0].color, Some([0, 0, 0, 255]));
3853    }
3854
3855    #[test]
3856    fn g1_no_font_resource_means_no_base_font() {
3857        // Existing helper doesn't add a Font resource — confirms the
3858        // documented fallback path: base_font None, flags false.
3859        let doc = make_doc_with_text(b"BT /F1 12 Tf (NoFont) Tj ET");
3860        let blocks = extract_text(&doc);
3861        assert_eq!(blocks.len(), 1);
3862        assert_eq!(blocks[0].base_font, None);
3863        assert!(!blocks[0].is_bold);
3864        assert!(!blocks[0].is_italic);
3865        // Color still tracks: default initial black.
3866        assert_eq!(blocks[0].color, Some([0, 0, 0, 255]));
3867        // Legacy resource alias preserved.
3868        assert_eq!(blocks[0].font_name, "F1");
3869    }
3870
3871    #[test]
3872    fn g1_unknown_color_space_marks_color_none() {
3873        // /DeviceN cs followed by scn with 5 operands → can't safely map.
3874        let doc = make_doc_with_font(
3875            b"/Cs1 cs 0.5 0.5 0.5 0.5 0.5 scn BT /F1 12 Tf (X) Tj ET",
3876            "Helvetica",
3877            None,
3878        );
3879        let blocks = extract_text(&doc);
3880        assert_eq!(blocks.len(), 1);
3881        assert_eq!(blocks[0].color, None);
3882    }
3883
3884    // ---- G2: widths and char_bounds tests ----
3885
3886    /// Build a doc where font F1 has a /Widths array starting at FirstChar 65 ('A').
3887    fn make_doc_with_widths(content: &[u8], first_char: i64, widths: Vec<i64>) -> Document {
3888        let mut doc = Document::with_version("1.7");
3889
3890        let widths_obj = Object::Array(widths.iter().map(|w| Object::Integer(*w)).collect());
3891
3892        let font_dict = dictionary! {
3893            "Type" => "Font",
3894            "Subtype" => "Type1",
3895            "BaseFont" => Object::Name(b"TestFont".to_vec()),
3896            "FirstChar" => Object::Integer(first_char),
3897            "LastChar" => Object::Integer(first_char + widths.len() as i64 - 1),
3898            "Widths" => widths_obj,
3899        };
3900        let font_id = doc.add_object(Object::Dictionary(font_dict));
3901
3902        let resources = dictionary! {
3903            "Font" => dictionary! { "F1" => Object::Reference(font_id) },
3904        };
3905        let resources_id = doc.add_object(Object::Dictionary(resources));
3906
3907        let content_stream = Stream::new(dictionary! {}, content.to_vec());
3908        let content_id = doc.add_object(Object::Stream(content_stream));
3909
3910        let page_dict = dictionary! {
3911            "Type" => "Page",
3912            "MediaBox" => vec![0.into(), 0.into(), 612.into(), 792.into()],
3913            "Contents" => Object::Reference(content_id),
3914            "Resources" => Object::Reference(resources_id),
3915        };
3916        let page_id = doc.add_object(Object::Dictionary(page_dict));
3917
3918        let pages_dict = dictionary! {
3919            "Type" => "Pages",
3920            "Kids" => vec![Object::Reference(page_id)],
3921            "Count" => 1_i64,
3922        };
3923        let pages_id = doc.add_object(Object::Dictionary(pages_dict));
3924
3925        if let Ok(Object::Dictionary(ref mut d)) = doc.get_object_mut(page_id) {
3926            d.set("Parent", Object::Reference(pages_id));
3927        }
3928
3929        let catalog = dictionary! {
3930            "Type" => "Catalog",
3931            "Pages" => Object::Reference(pages_id),
3932        };
3933        let catalog_id = doc.add_object(Object::Dictionary(catalog));
3934        doc.trailer.set("Root", Object::Reference(catalog_id));
3935
3936        doc
3937    }
3938
3939    #[test]
3940    fn g2_no_widths_gives_estimate_source() {
3941        // Font dict with no /Widths → WidthSource::Estimate.
3942        let doc = make_doc_with_font(b"BT /F1 10 Tf (A) Tj ET", "Helvetica", None);
3943        let blocks = extract_text(&doc);
3944        assert_eq!(blocks.len(), 1);
3945        assert_eq!(blocks[0].width_source, WidthSource::Estimate);
3946    }
3947
3948    #[test]
3949    fn g2_widths_array_gives_metric_source() {
3950        // 'A' = 0x41 = 65; single-char string 'A', width 722 glyph units.
3951        // advance = 722/1000 * 10pt = 7.22 user units.
3952        let doc = make_doc_with_widths(b"BT /F1 10 Tf (A) Tj ET", 65, vec![722]);
3953        let blocks = extract_text(&doc);
3954        assert_eq!(blocks.len(), 1);
3955        assert_eq!(blocks[0].width_source, WidthSource::Metric);
3956        // char_bounds should have exactly one entry.
3957        assert_eq!(blocks[0].char_bounds.len(), 1);
3958        let [x0, _y0, x1, _y1] = blocks[0].char_bounds[0];
3959        // advance = 722/1000 * 10 = 7.22; x0 should be at origin (0).
3960        let advance = x1 - x0;
3961        assert!((advance - 7.22).abs() < 0.01, "advance={advance:.4}");
3962    }
3963
3964    #[test]
3965    fn g2_monospace_uniform_char_bounds() {
3966        // Courier-like: all glyphs width 600. Five chars → five equal-width bounds.
3967        // 'A'=65,'B'=66,'C'=67,'D'=68,'E'=69 — widths all 600.
3968        let widths: Vec<i64> = vec![600, 600, 600, 600, 600];
3969        let doc = make_doc_with_widths(b"BT /F1 12 Tf (ABCDE) Tj ET", 65, widths);
3970        let blocks = extract_text(&doc);
3971        assert_eq!(blocks.len(), 1);
3972        assert_eq!(blocks[0].width_source, WidthSource::Metric);
3973        assert_eq!(blocks[0].char_bounds.len(), 5);
3974        let expected_adv = 600.0 / 1000.0 * 12.0; // 7.2
3975        for i in 0..5 {
3976            let [x0, _, x1, _] = blocks[0].char_bounds[i];
3977            let adv = x1 - x0;
3978            assert!(
3979                (adv - expected_adv).abs() < 0.01,
3980                "char {i}: advance={adv:.4} expected={expected_adv:.4}"
3981            );
3982        }
3983    }
3984
3985    #[test]
3986    fn g2_proportional_char_bounds_are_contiguous() {
3987        // Proportional font: 'A'=722, 'B'=667, 'C'=667.
3988        let widths: Vec<i64> = vec![722, 667, 667];
3989        let doc = make_doc_with_widths(b"BT /F1 10 Tf (ABC) Tj ET", 65, widths);
3990        let blocks = extract_text(&doc);
3991        assert_eq!(blocks.len(), 1);
3992        assert_eq!(blocks[0].char_bounds.len(), 3);
3993        // Bounds must be contiguous: x1 of char[i] == x0 of char[i+1]
3994        // (tc=0, tw=0 default → no spacing gaps).
3995        for i in 0..2 {
3996            let x1_prev = blocks[0].char_bounds[i][2];
3997            let x0_next = blocks[0].char_bounds[i + 1][0];
3998            assert!(
3999                (x1_prev - x0_next).abs() < 0.001,
4000                "gap between char {i} and {}: {:.4}",
4001                i + 1,
4002                x0_next - x1_prev
4003            );
4004        }
4005    }
4006
4007    #[test]
4008    fn g2_tj_kerning_shifts_subsequent_bounds() {
4009        // TJ: [(A) -200 (B)] — 'A' then 200 units kern then 'B'.
4010        // 'A'=722, 'B'=667 (font size 10 → /1000).
4011        // After 'A': x = 722/1000*10 = 7.22.
4012        // Kerning: -200/1000*10 = +2.0 (negative kern shifts right: x -= -2 = x += 2).
4013        // Wait: TJ negative = move RIGHT. adj = -200 → x -= -200/1000*10 = x += 2.0.
4014        // 'B' starts at 7.22 + 2.0 = 9.22.
4015        let widths: Vec<i64> = vec![722, 667];
4016        let doc = make_doc_with_widths(b"BT /F1 10 Tf [(A) -200 (B)] TJ ET", 65, widths);
4017        let blocks = extract_text(&doc);
4018        assert_eq!(blocks.len(), 1);
4019        assert_eq!(blocks[0].char_bounds.len(), 2);
4020        let [x0_a, _, x1_a, _] = blocks[0].char_bounds[0];
4021        let [x0_b, _, _, _] = blocks[0].char_bounds[1];
4022        // 'A' advance
4023        assert!(
4024            (x1_a - x0_a - 7.22).abs() < 0.01,
4025            "A advance={:.4}",
4026            x1_a - x0_a
4027        );
4028        // Gap includes kerning: x0_b should be further right than x1_a.
4029        assert!(x0_b > x1_a, "B should start after A ends");
4030        let gap = x0_b - x1_a;
4031        assert!(
4032            (gap - 2.0).abs() < 0.01,
4033            "kerning gap={gap:.4} expected=2.0"
4034        );
4035    }
4036
4037    #[test]
4038    fn g2_subset_font_prefix_does_not_affect_widths() {
4039        // Subset prefix on BaseFont should not prevent width lookup.
4040        let widths: Vec<i64> = vec![722];
4041        let doc = make_doc_with_widths(b"BT /F1 10 Tf (A) Tj ET", 65, widths);
4042        let blocks = extract_text(&doc);
4043        assert_eq!(blocks.len(), 1);
4044        // With widths present, source is always Metric.
4045        assert_eq!(blocks[0].width_source, WidthSource::Metric);
4046        assert_eq!(blocks[0].char_bounds.len(), 1);
4047    }
4048
4049    #[test]
4050    fn g2_char_bounds_count_equals_glyph_count_not_unicode_len() {
4051        // A ligature encoded as a single byte but decomposed to 2 Unicode chars.
4052        // char_bounds must have ONE entry (source glyph), not two.
4053        // We rely on the APPROX path for this test (no /Widths → Estimate).
4054        let doc = make_doc_with_font(b"BT /F1 12 Tf (A) Tj ET", "TestFont", None);
4055        let blocks = extract_text(&doc);
4056        assert_eq!(blocks.len(), 1);
4057        // Single source byte → exactly one char_bound.
4058        assert_eq!(blocks[0].char_bounds.len(), 1);
4059    }
4060}
pdfluent_extract/text.rs

pdfluent_extract/
text.rs