pdfluent_extract/
text.rs

1//! Text extraction with character-level position tracking.
2//!
3//! Parses content stream text operators (Tj, TJ, Tm, Td, TD, T*, Tc, Tw, Tz, TL, Ts, ', ")
4//! to extract text with positional information.
5
6use crate::error::{ExtractError, Result};
7use lopdf::content::{Content, Operation};
8use lopdf::{Document, Object, ObjectId};
9use std::collections::HashMap;
10use std::sync::OnceLock;
11
/// Approximate character width as a fraction of font size.
///
/// A value of `0.5` means a glyph is treated as half the font size wide.
/// NOTE(review): presumably used as a fallback advance when real glyph
/// metrics are unavailable — the consumers are outside this part of the file.
const APPROX_CHAR_WIDTH: f64 = 0.5;
14
/// A block of text extracted from a page.
///
/// Carries the decoded text together with where it sits on the page and the
/// font it was drawn with.
#[derive(Debug, Clone)]
pub struct TextBlock {
    /// The extracted text content.
    pub text: String,
    /// The page number (1-based).
    pub page: u32,
    /// Bounding box [x0, y0, x1, y1] in PDF coordinates.
    pub bbox: [f64; 4],
    /// Font name used for this text block.
    pub font_name: String,
    /// Font size in points.
    pub font_size: f64,
    /// `/ActualText` override from a surrounding marked-content sequence (BDC),
    /// if any. PDF/UA-compliant authors use this to provide the canonical
    /// reading-order text for ligatures or other glyph clusters whose visual
    /// `text` does not match the intended characters.
    ///
    /// `None` when no override applies to this block.
    pub actual_text: Option<String>,
}
34
/// A single character with its position on the page.
///
/// This is the character-level counterpart of [`TextBlock`]: one decoded
/// `char` plus the page and box it occupies.
#[derive(Debug, Clone)]
pub struct PositionedChar {
    /// The character.
    pub ch: char,
    /// The page number (1-based).
    pub page: u32,
    /// Bounding box [x0, y0, x1, y1] in PDF coordinates.
    pub bbox: [f64; 4],
}
45
/// Internal graphics state for save/restore (q/Q).
///
/// Only the transformation matrix is captured; snapshots are kept in
/// `TextState::gs_stack`.
#[derive(Debug, Clone)]
struct GraphicsState {
    /// Transformation matrix held by this snapshot, as six numbers.
    /// NOTE(review): presumably in PDF `[a b c d e f]` order — confirm against
    /// the code that multiplies matrices.
    ctm: [f64; 6],
}
51
/// Internal text state tracker.
///
/// Holds the text-related parameters set by the content-stream operators
/// listed in the module documentation, plus the current transformation
/// matrix and the stack of saved graphics states. Initial values are
/// provided by the `Default` implementation.
#[derive(Debug, Clone)]
struct TextState {
    /// Text matrix.
    tm: [f64; 6],
    /// Text line matrix.
    tlm: [f64; 6],
    /// Current font name.
    font_name: String,
    /// Current font size.
    font_size: f64,
    /// Character spacing (Tc).
    tc: f64,
    /// Word spacing (Tw).
    tw: f64,
    /// Horizontal scaling (Tz), as a percentage (so `100.0` means unscaled).
    th: f64,
    /// Text leading (TL).
    tl: f64,
    /// Text rise (Ts).
    ts: f64,
    /// Graphics state stack (one [`GraphicsState`] per pending save).
    gs_stack: Vec<GraphicsState>,
    /// Current transformation matrix.
    ctm: [f64; 6],
}
78
79impl Default for TextState {
80    fn default() -> Self {
81        Self {
82            tm: [1.0, 0.0, 0.0, 1.0, 0.0, 0.0],
83            tlm: [1.0, 0.0, 0.0, 1.0, 0.0, 0.0],
84            font_name: String::new(),
85            font_size: 12.0,
86            tc: 0.0,
87            tw: 0.0,
88            th: 100.0,
89            tl: 0.0,
90            ts: 0.0,
91            gs_stack: Vec::new(),
92            ctm: [1.0, 0.0, 0.0, 1.0, 0.0, 0.0],
93        }
94    }
95}
96
/// Per-font information for text decoding.
///
/// One value is built per entry of a page's `/Font` resource dictionary by
/// `build_font_map`.
#[derive(Clone)]
struct FontInfo {
    /// True if the font uses 2-byte CID encoding (Identity-H/V or other CID CMaps).
    /// `build_font_map` sets this when the font's `/Subtype` is `Type0`.
    is_cid: bool,
    /// ToUnicode CMap: maps character code(s) to Unicode string.
    /// For CID fonts the key is a 2-byte big-endian value; for simple fonts it's a 1-byte code.
    /// Empty when the font has no usable ToUnicode stream.
    to_unicode: HashMap<u32, String>,
    /// Encoding-based code→char map for simple fonts (derived from BaseEncoding + Differences).
    /// Index = byte code (0..255), value = Unicode char if known.
    /// All `None` for CID fonts.
    encoding_map: [Option<char>; 256],
    /// Side-table marking codes whose `Differences` glyph name is `ct`.
    /// The `ct` ligature has no precomposed Unicode codepoint, so it cannot
    /// be expressed as a `char` in `encoding_map`. When this flag is set,
    /// decoding emits one private-use marker scalar and records that marker's
    /// origin so glyph advance remains one codepoint wide. The marker expands
    /// to "ct" only in the ligature-decomposition layer, avoiding clobbering
    /// legitimate U+E007 values that arrive through ToUnicode CMaps.
    /// All `false` for CID fonts.
    ct_codes: [bool; 256],
}
117
118/// Build a map from font resource name (e.g. "F1") to FontInfo for a page.
119fn build_font_map(doc: &Document, page_id: ObjectId) -> HashMap<String, FontInfo> {
120    let mut map = HashMap::new();
121
122    // Get the page's Resources dictionary (may be inherited from parent Pages node).
123    let resources = get_page_resources(doc, page_id);
124    let font_dict = match resources.and_then(|res| match res.get(b"Font").ok()? {
125        Object::Dictionary(d) => Some(d.clone()),
126        Object::Reference(r) => match doc.get_object(*r).ok()? {
127            Object::Dictionary(d) => Some(d.clone()),
128            _ => None,
129        },
130        _ => None,
131    }) {
132        Some(d) => d,
133        None => return map,
134    };
135
136    for (name_bytes, value) in font_dict.iter() {
137        let font_name = String::from_utf8_lossy(name_bytes).to_string();
138
139        // Resolve the font dictionary.
140        let font = match value {
141            Object::Reference(r) => match doc.get_object(*r).ok() {
142                Some(Object::Dictionary(d)) => d.clone(),
143                _ => continue,
144            },
145            Object::Dictionary(d) => d.clone(),
146            _ => continue,
147        };
148
149        let subtype = font
150            .get(b"Subtype")
151            .ok()
152            .and_then(|o| match o {
153                Object::Name(n) => Some(String::from_utf8_lossy(n).to_string()),
154                _ => None,
155            })
156            .unwrap_or_default();
157
158        let is_cid = subtype == "Type0";
159
160        // Parse ToUnicode CMap if present.
161        let to_unicode = parse_to_unicode_from_font(doc, &font);
162
163        // For Type0 fonts, also check DescendantFonts for ToUnicode.
164        let to_unicode = if to_unicode.is_empty() && is_cid {
165            if let Ok(Object::Array(descendants)) = font.get(b"DescendantFonts") {
166                descendants
167                    .iter()
168                    .find_map(|d| {
169                        let desc_dict = match d {
170                            Object::Reference(r) => match doc.get_object(*r).ok()? {
171                                Object::Dictionary(d) => d,
172                                _ => return None,
173                            },
174                            Object::Dictionary(d) => d,
175                            _ => return None,
176                        };
177                        let tu = parse_to_unicode_from_font(doc, desc_dict);
178                        if tu.is_empty() {
179                            None
180                        } else {
181                            Some(tu)
182                        }
183                    })
184                    .unwrap_or_default()
185            } else {
186                HashMap::new()
187            }
188        } else {
189            to_unicode
190        };
191
192        // Build encoding map for simple fonts (from Encoding + Differences).
193        let (encoding_map, ct_codes) = if !is_cid {
194            build_encoding_map(doc, &font)
195        } else {
196            ([None; 256], [false; 256])
197        };
198
199        map.insert(
200            font_name,
201            FontInfo {
202                is_cid,
203                to_unicode,
204                encoding_map,
205                ct_codes,
206            },
207        );
208    }
209
210    map
211}
212
213/// Parse the ToUnicode CMap from a font dictionary.
214fn parse_to_unicode_from_font(doc: &Document, font: &lopdf::Dictionary) -> HashMap<u32, String> {
215    let tu_obj = match font.get(b"ToUnicode").ok() {
216        Some(Object::Reference(r)) => doc.get_object(*r).ok(),
217        Some(obj) => Some(obj),
218        None => return HashMap::new(),
219    };
220
221    let stream_bytes = match tu_obj {
222        Some(Object::Stream(ref s)) => s
223            .decompressed_content()
224            .ok()
225            .unwrap_or_else(|| s.content.clone()),
226        _ => return HashMap::new(),
227    };
228
229    parse_to_unicode_cmap(&stream_bytes)
230}
231
232/// Parse a ToUnicode CMap stream into a code→Unicode mapping.
233///
234/// Handles both `beginbfchar` and `beginbfrange` sections.
235fn parse_to_unicode_cmap(data: &[u8]) -> HashMap<u32, String> {
236    let text = String::from_utf8_lossy(data);
237    let mut map = HashMap::new();
238
239    // Parse beginbfchar sections: <srcCode> <dstString>
240    for section in text.split("beginbfchar") {
241        let section = match section.split("endbfchar").next() {
242            Some(s) => s,
243            None => continue,
244        };
245        let tokens = extract_hex_tokens(section);
246        for pair in tokens.chunks(2) {
247            if pair.len() == 2 {
248                let code = parse_hex_u32(&pair[0]);
249                let unicode = hex_to_unicode_string(&pair[1]);
250                map.insert(code, unicode);
251            }
252        }
253    }
254
255    // Parse beginbfrange sections: <srcLo> <srcHi> <dstStart> or <srcLo> <srcHi> [<dst1> <dst2> ...]
256    for section in text.split("beginbfrange") {
257        let section = match section.split("endbfrange").next() {
258            Some(s) => s,
259            None => continue,
260        };
261
262        // Tokenize: extract hex tokens and array brackets
263        let mut chars = section.chars().peekable();
264        let mut tokens: Vec<String> = Vec::new();
265        let mut arrays: Vec<Vec<String>> = Vec::new();
266        let mut in_array = false;
267        let mut current_array: Vec<String> = Vec::new();
268
269        while let Some(&ch) = chars.peek() {
270            if ch == '<' {
271                chars.next();
272                let hex: String = chars
273                    .by_ref()
274                    .take_while(|&c| c != '>')
275                    .filter(|c| !c.is_whitespace())
276                    .collect();
277                if in_array {
278                    current_array.push(hex);
279                } else {
280                    tokens.push(hex);
281                }
282            } else if ch == '[' {
283                chars.next();
284                in_array = true;
285                current_array = Vec::new();
286            } else if ch == ']' {
287                chars.next();
288                in_array = false;
289                arrays.push(std::mem::take(&mut current_array));
290                tokens.push(String::new()); // placeholder for array position
291            } else {
292                chars.next();
293            }
294        }
295
296        // Process range entries: every 3 tokens = (lo, hi, dst_or_array)
297        let mut array_idx = 0;
298        let mut i = 0;
299        while i + 2 < tokens.len() {
300            let lo = parse_hex_u32(&tokens[i]);
301            let hi = parse_hex_u32(&tokens[i + 1]);
302
303            if tokens[i + 2].is_empty() {
304                // Array destination
305                if array_idx < arrays.len() {
306                    let arr = &arrays[array_idx];
307                    for (offset, dst) in arr.iter().enumerate() {
308                        let code = lo + offset as u32;
309                        if code <= hi {
310                            map.insert(code, hex_to_unicode_string(dst));
311                        }
312                    }
313                    array_idx += 1;
314                }
315            } else {
316                // Single start value — increment for each code in range
317                let dst_start = parse_hex_u32(&tokens[i + 2]);
318                let dst_len = tokens[i + 2].len();
319                for code in lo..=hi {
320                    let dst_val = dst_start + (code - lo);
321                    let s = if dst_len <= 4 {
322                        // BMP character
323                        char::from_u32(dst_val)
324                            .map(|c| c.to_string())
325                            .unwrap_or_default()
326                    } else {
327                        // Multi-byte: treat as UTF-16BE pairs
328                        let hex = format!("{:0>width$X}", dst_val, width = dst_len);
329                        hex_to_unicode_string(&hex)
330                    };
331                    map.insert(code, s);
332                }
333            }
334            i += 3;
335        }
336    }
337
338    map
339}
340
/// Collect the contents of every `<...>` group in `text`.
///
/// Whitespace inside a group is dropped. A `<` always starts a fresh group
/// (discarding any unfinished one), a `>` outside a group is ignored, and an
/// unterminated trailing group is not returned.
fn extract_hex_tokens(text: &str) -> Vec<String> {
    let mut tokens = Vec::new();
    // `Some(buffer)` while between '<' and '>'.
    let mut open: Option<String> = None;

    for ch in text.chars() {
        match ch {
            '<' => open = Some(String::new()),
            '>' => {
                if let Some(done) = open.take() {
                    tokens.push(done);
                }
            }
            c if !c.is_whitespace() => {
                if let Some(buf) = open.as_mut() {
                    buf.push(c);
                }
            }
            _ => {}
        }
    }

    tokens
}
359
/// Interpret `hex` as a base-16 number (e.g., "0041" → 65).
///
/// Anything that is not a valid `u32` in hexadecimal — empty input, stray
/// characters, too many digits — yields 0.
fn parse_hex_u32(hex: &str) -> u32 {
    u32::from_str_radix(hex, 16).unwrap_or_default()
}
364
365/// Convert a hex string to a Unicode string (interpreting as UTF-16BE pairs).
366///
367/// Phase 3 Round 7: slice the underlying bytes instead of the `&str` so we
368/// cannot panic on a non-char-boundary index when the input contains
369/// non-ASCII fragments (e.g. a stray `U+FFFD` produced by an upstream
370/// font decode). Bytes that do not form a valid 2-byte ASCII hex pair are
371/// skipped by `from_utf8` + `from_str_radix`, matching the previous
372/// best-effort behaviour without panicking.
373fn hex_to_unicode_string(hex: &str) -> String {
374    let hex_bytes = hex.as_bytes();
375    let bytes: Vec<u8> = (0..hex_bytes.len())
376        .step_by(2)
377        .filter_map(|i| {
378            let end = (i + 2).min(hex_bytes.len());
379            std::str::from_utf8(&hex_bytes[i..end])
380                .ok()
381                .and_then(|s| u8::from_str_radix(s, 16).ok())
382        })
383        .collect();
384
385    if bytes.len() >= 2 && bytes.len().is_multiple_of(2) {
386        // Interpret as UTF-16BE
387        let u16s: Vec<u16> = bytes
388            .chunks(2)
389            .map(|c| u16::from_be_bytes([c[0], c[1]]))
390            .collect();
391        String::from_utf16_lossy(&u16s)
392    } else if bytes.len() == 1 {
393        char::from_u32(bytes[0] as u32)
394            .map(|c| c.to_string())
395            .unwrap_or_default()
396    } else {
397        String::new()
398    }
399}
400
/// Returns `true` when an extracted text body is non-empty but almost
/// certainly not human-readable.
///
/// Fonts embedded without a working `ToUnicode` CMap often yield output that
/// is a perfectly valid `String` yet cannot be read or searched — for
/// example text made up almost entirely of punctuation and control bytes.
/// This helper lets callers tell "extraction succeeded but is garbled" apart
/// from "extraction succeeded with real text".
///
/// # Heuristic
///
/// Only the first 500 characters are inspected:
///
/// - with fewer than 30 non-whitespace characters the sample is too short
///   to judge and the result is `false`;
/// - otherwise the result is `true` when ASCII alphabetic characters make up
///   less than 25% of the inspected sample (whitespace included in the
///   denominator).
///
/// # Limitations
///
/// This is a deliberately cheap pre-filter, not a content-quality oracle.
/// It targets output dominated by punctuation / control bytes (the shape
/// commonly seen when pdf-extract falls back to raw glyph codes). It does
/// **not** detect monoalphabetic-cipher / Caesar-shifted garbling, where
/// every glyph maps to some other letter and the letter ratio is preserved;
/// that needs a linguistic / frequency-distribution model and is deferred to
/// a future ADR (see `docs/design/text_extraction_diagnostics.md`). Pages
/// that legitimately hold only numbers or symbols (e.g. some accounting
/// tables) can be false positives, which is expected to be rare; callers
/// needing stronger guarantees should add their own domain-specific anchors.
///
/// This is a read-only diagnostic. It does not alter extraction and is not
/// used by `extract_page_text` / `extract_text` / `extract_page_blocks`;
/// consumers opt in explicitly.
pub fn suspected_garbled(text: &str) -> bool {
    const SAMPLE_CHARS: usize = 500;
    const MIN_BODY_CHARS: usize = 30;
    const MIN_ALPHA_RATIO: f64 = 0.25;

    let mut inspected = 0usize;
    let mut visible = 0usize;
    let mut letters = 0usize;
    for ch in text.chars().take(SAMPLE_CHARS) {
        inspected += 1;
        if !ch.is_whitespace() {
            visible += 1;
        }
        if ch.is_ascii_alphabetic() {
            letters += 1;
        }
    }

    if visible < MIN_BODY_CHARS {
        return false;
    }
    (letters as f64 / inspected as f64) < MIN_ALPHA_RATIO
}
450
// Unit tests for `suspected_garbled`. They cover the "too short to judge"
// early exit, ordinary English, and two strings described below as samples
// observed in real documents: one that the heuristic deliberately does NOT
// flag (letter-substitution garbling) and one that it does (punctuation-heavy
// garbling).
#[cfg(test)]
mod __suspected_garbled_unit_tests {
    use super::suspected_garbled;

    // No characters at all -> below the 30-character minimum.
    #[test]
    fn empty_string_is_not_garbled() {
        assert!(!suspected_garbled(""));
    }

    // Whitespace does not count towards the 30-character minimum.
    #[test]
    fn whitespace_only_is_not_garbled() {
        assert!(!suspected_garbled("   \n\t  "));
    }

    #[test]
    fn short_non_empty_is_not_garbled_under_threshold() {
        // Fewer than 30 non-whitespace chars -> heuristic returns false.
        assert!(!suspected_garbled("Hi there!"));
    }

    // Ordinary prose is mostly ASCII letters, well above the 25% cut-off.
    #[test]
    fn real_english_is_not_garbled() {
        let text =
            "Department of Homeland Security U.S. Citizenship and Immigration Services Form I-129. \
             Filing fees apply per applicant.";
        assert!(!suspected_garbled(text));
    }

    #[test]
    fn caesar_shifted_is_documented_limitation() {
        // Real Caesar-shifted output observed on vps_141f12dfefe76741.
        // A 1-to-1 letter substitution preserves the alpha-character ratio,
        // so the ASCII-alpha-ratio heuristic returns false. This test pins
        // the limitation: detecting Caesar / monoalphabetic-cipher garbled
        // text requires a stronger linguistic model and is deferred to a
        // future ADR (see docs/design/text_extraction_diagnostics.md).
        let text =
            "3(5)250$1&(%21'6HHLQVWUXFWLRQVRQUHYHUVH'$7(%21'(;(&87('0XVWEHVDPHRUODWHUWKDQGDWHRIFRQWUDFW20%1R3XEOLF";
        assert!(!suspected_garbled(text));
    }

    #[test]
    fn custom_encoding_garbled_is_detected() {
        // Real custom-encoding output observed on vps_7dbbe9d92ee7c3b9 page 1 via lopdf.
        // This shape — extracted text dominated by punctuation / control bytes
        // with few real letters — is exactly what the ASCII-alpha-ratio
        // heuristic targets.
        let text =
            "           !\" !   !#   $%&\u{2019}\"((\")*+,$&\"+-(&$&.$%&-&/\"/&0, &$*$%&/$%&/$\u{2019}$&  *   1     -2  $23%  $23\"";
        assert!(suspected_garbled(text));
    }

    #[test]
    fn mostly_numbers_text_is_not_falsely_flagged() {
        // 80+ chars of letters >> 25% ASCII alpha -> not garbled.
        let text =
            "Invoice 12345  Total 67890.00 USD  Date 2026-05-13  Vendor name and contact details here.";
        assert!(!suspected_garbled(text));
    }
}
511
512/// Get the Resources dictionary for a page (resolves inheritance from parent Pages).
513fn get_page_resources(doc: &Document, page_id: ObjectId) -> Option<lopdf::Dictionary> {
514    let page = match doc.get_object(page_id).ok()? {
515        Object::Dictionary(d) => d.clone(),
516        _ => return None,
517    };
518
519    // Direct Resources on the page.
520    if let Some(res) = resolve_dict(doc, &page, b"Resources") {
521        return Some(res);
522    }
523
524    // Walk up the parent chain (Pages nodes) for inherited Resources.
525    let mut current = page;
526    for _ in 0..20 {
527        let parent_ref = match current.get(b"Parent").ok()? {
528            Object::Reference(r) => *r,
529            _ => break,
530        };
531        let parent = match doc.get_object(parent_ref).ok()? {
532            Object::Dictionary(d) => d.clone(),
533            _ => break,
534        };
535        if let Some(res) = resolve_dict(doc, &parent, b"Resources") {
536            return Some(res);
537        }
538        current = parent;
539    }
540
541    None
542}
543
544/// Resolve a dictionary entry that may be inline or an indirect reference.
545fn resolve_dict(doc: &Document, dict: &lopdf::Dictionary, key: &[u8]) -> Option<lopdf::Dictionary> {
546    match dict.get(key).ok()? {
547        Object::Dictionary(d) => Some(d.clone()),
548        Object::Reference(r) => match doc.get_object(*r).ok()? {
549            Object::Dictionary(d) => Some(d.clone()),
550            _ => None,
551        },
552        _ => None,
553    }
554}
555
556/// Build a 256-entry encoding map from a font's /Encoding dictionary.
557///
558/// Handles:
559/// - Named base encodings: WinAnsiEncoding, MacRomanEncoding, MacExpertEncoding
560/// - Differences arrays: `[code1 /name1 /name2 ... codeN /nameN ...]`
561/// - Glyph name → Unicode via AGL (Adobe Glyph List) lookup
562fn build_encoding_map(
563    doc: &Document,
564    font: &lopdf::Dictionary,
565) -> ([Option<char>; 256], [bool; 256]) {
566    let mut table = [None::<char>; 256];
567    let mut ct_codes = [false; 256];
568
569    let encoding = match font.get(b"Encoding").ok() {
570        Some(obj) => obj,
571        None => return (table, ct_codes),
572    };
573
574    match encoding {
575        Object::Name(name) => {
576            // Named encoding (e.g. "WinAnsiEncoding").
577            let name_str = String::from_utf8_lossy(name);
578            apply_base_encoding(&mut table, &name_str);
579        }
580        Object::Reference(r) => match doc.get_object(*r) {
581            Ok(Object::Dictionary(enc_dict)) => {
582                parse_encoding_dict(doc, enc_dict, &mut table, &mut ct_codes);
583            }
584            Ok(Object::Name(name)) => {
585                let name_str = String::from_utf8_lossy(name);
586                apply_base_encoding(&mut table, &name_str);
587            }
588            _ => {}
589        },
590        Object::Dictionary(enc_dict) => {
591            parse_encoding_dict(doc, enc_dict, &mut table, &mut ct_codes);
592        }
593        _ => {}
594    }
595
596    (table, ct_codes)
597}
598
599/// Parse an Encoding dictionary with optional BaseEncoding and Differences.
600fn parse_encoding_dict(
601    doc: &Document,
602    enc_dict: &lopdf::Dictionary,
603    table: &mut [Option<char>; 256],
604    ct_codes: &mut [bool; 256],
605) {
606    // Apply BaseEncoding first.
607    if let Ok(Object::Name(base)) = enc_dict.get(b"BaseEncoding") {
608        let base_str = String::from_utf8_lossy(base);
609        apply_base_encoding(table, &base_str);
610    }
611
612    // Apply Differences array: [code /name1 /name2 ... code /name3 ...]
613    let diffs = match enc_dict.get(b"Differences").ok() {
614        Some(Object::Array(arr)) => arr.clone(),
615        Some(Object::Reference(r)) => match doc.get_object(*r).ok() {
616            Some(Object::Array(arr)) => arr.clone(),
617            _ => return,
618        },
619        _ => return,
620    };
621
622    let mut code: Option<u32> = None;
623    for item in &diffs {
624        match item {
625            Object::Integer(n) => {
626                code = Some(*n as u32);
627            }
628            Object::Name(name) => {
629                if let Some(c) = code {
630                    if c < 256 {
631                        let glyph = String::from_utf8_lossy(name);
632                        apply_glyph_name(&glyph, c as usize, table, ct_codes);
633                    }
634                    code = Some(c + 1);
635                }
636            }
637            Object::Reference(r) => {
638                // Indirect name reference (rare).
639                if let Ok(Object::Name(name)) = doc.get_object(*r) {
640                    if let Some(c) = code {
641                        if c < 256 {
642                            let glyph = String::from_utf8_lossy(name);
643                            apply_glyph_name(&glyph, c as usize, table, ct_codes);
644                        }
645                        code = Some(c + 1);
646                    }
647                }
648            }
649            _ => {}
650        }
651    }
652}
653
654/// Resolve a glyph name to either a single Unicode codepoint (stored in
655/// `table`) or the `ct` ligature marker (recorded in `ct_codes`). The `ct`
656/// ligature has no precomposed Unicode codepoint, so it cannot live in
657/// `table`; it is tracked separately so that decoding can emit one internal
658/// marker and preserve one-glyph text advance before later decomposition.
659fn apply_glyph_name(
660    glyph: &str,
661    code: usize,
662    table: &mut [Option<char>; 256],
663    ct_codes: &mut [bool; 256],
664) {
665    if glyph == "ct" {
666        // Clear any prior char mapping at this slot — the ct flag is the
667        // sole source of truth for this code.
668        table[code] = None;
669        ct_codes[code] = true;
670        return;
671    }
672    if let Some(ch) = glyph_name_to_unicode(glyph) {
673        table[code] = Some(ch);
674        ct_codes[code] = false;
675    }
676}
677
678/// Apply a named base encoding to the table.
679fn apply_base_encoding(table: &mut [Option<char>; 256], name: &str) {
680    let source = match name {
681        "WinAnsiEncoding" => winansi_encoding(),
682        "MacRomanEncoding" => mac_roman_encoding(),
683        _ => return,
684    };
685    for (i, &ch) in source.iter().enumerate() {
686        if ch != '\0' {
687            table[i] = Some(ch);
688        }
689    }
690}
691
/// WinAnsiEncoding table (cp1252-based), indexed by byte code.
///
/// Entries left as `'\0'` are codes this table does not define. The table is
/// built once on first use and shared afterwards.
///
/// It follows Windows code page 1252 with one PDF-specific deviation: in
/// PDF's WinAnsiEncoding code 0xAD selects the ordinary `hyphen` glyph, so it
/// maps to U+002D rather than cp1252's invisible U+00AD SOFT HYPHEN —
/// otherwise hyphens drawn through that code would vanish from extracted
/// text.
fn winansi_encoding() -> &'static [char; 256] {
    static TABLE: OnceLock<[char; 256]> = OnceLock::new();
    TABLE.get_or_init(|| {
        let mut t = ['\0'; 256];
        // ASCII range is identity.
        for i in 0x20..=0x7Eu8 {
            t[i as usize] = i as char;
        }
        // Control chars mapped to common usage.
        t[0x09] = '\t';
        t[0x0A] = '\n';
        t[0x0D] = '\r';
        // cp1252 upper range (0x80-0x9F); 0x81, 0x8D, 0x8F, 0x90 and 0x9D
        // are unassigned and stay '\0'.
        let cp1252: [(u8, char); 27] = [
            (0x80, '\u{20AC}'), // Euro sign
            (0x82, '\u{201A}'), // Single low-9 quotation mark
            (0x83, '\u{0192}'), // Latin small letter f with hook
            (0x84, '\u{201E}'), // Double low-9 quotation mark
            (0x85, '\u{2026}'), // Horizontal ellipsis
            (0x86, '\u{2020}'), // Dagger
            (0x87, '\u{2021}'), // Double dagger
            (0x88, '\u{02C6}'), // Modifier letter circumflex accent
            (0x89, '\u{2030}'), // Per mille sign
            (0x8A, '\u{0160}'), // Latin capital letter S with caron
            (0x8B, '\u{2039}'), // Single left-pointing angle quotation mark
            (0x8C, '\u{0152}'), // Latin capital ligature OE
            (0x8E, '\u{017D}'), // Latin capital letter Z with caron
            (0x91, '\u{2018}'), // Left single quotation mark
            (0x92, '\u{2019}'), // Right single quotation mark
            (0x93, '\u{201C}'), // Left double quotation mark
            (0x94, '\u{201D}'), // Right double quotation mark
            (0x95, '\u{2022}'), // Bullet
            (0x96, '\u{2013}'), // En dash
            (0x97, '\u{2014}'), // Em dash
            (0x98, '\u{02DC}'), // Small tilde
            (0x99, '\u{2122}'), // Trade mark sign
            (0x9A, '\u{0161}'), // Latin small letter s with caron
            (0x9B, '\u{203A}'), // Single right-pointing angle quotation mark
            (0x9C, '\u{0153}'), // Latin small ligature oe
            (0x9E, '\u{017E}'), // Latin small letter z with caron
            (0x9F, '\u{0178}'), // Latin capital letter Y with diaeresis
        ];
        for (code, ch) in cp1252 {
            t[code as usize] = ch;
        }
        // 0xA0-0xFF: same as Unicode Latin-1 supplement.
        for i in 0xA0..=0xFFu8 {
            t[usize::from(i)] = char::from(i);
        }
        // PDF WinAnsiEncoding assigns the visible `hyphen` glyph to 0xAD.
        t[0xAD] = '-';
        t
    })
}
745
/// MacRomanEncoding table, indexed by byte code.
///
/// Entries left as `'\0'` are codes this table does not define. The table is
/// built once on first use (via `OnceLock`) and shared afterwards.
fn mac_roman_encoding() -> &'static [char; 256] {
    static TABLE: OnceLock<[char; 256]> = OnceLock::new();
    TABLE.get_or_init(|| {
        let mut t = ['\0'; 256];
        // ASCII range is identity.
        for i in 0x20..=0x7Eu8 {
            t[i as usize] = i as char;
        }
        // Tab, line feed and carriage return map to themselves.
        t[0x09] = '\t';
        t[0x0A] = '\n';
        t[0x0D] = '\r';
        // Mac Roman 0x80-0xFF mapping to Unicode.
        // Entry `i` of this array is the character for code `0x80 + i`.
        let mac_upper: [char; 128] = [
            '\u{00C4}', '\u{00C5}', '\u{00C7}', '\u{00C9}', '\u{00D1}', '\u{00D6}', '\u{00DC}',
            '\u{00E1}', '\u{00E0}', '\u{00E2}', '\u{00E4}', '\u{00E3}', '\u{00E5}', '\u{00E7}',
            '\u{00E9}', '\u{00E8}', '\u{00EA}', '\u{00EB}', '\u{00ED}', '\u{00EC}', '\u{00EE}',
            '\u{00EF}', '\u{00F1}', '\u{00F3}', '\u{00F2}', '\u{00F4}', '\u{00F6}', '\u{00F5}',
            '\u{00FA}', '\u{00F9}', '\u{00FB}', '\u{00FC}', '\u{2020}', '\u{00B0}', '\u{00A2}',
            '\u{00A3}', '\u{00A7}', '\u{2022}', '\u{00B6}', '\u{00DF}', '\u{00AE}', '\u{00A9}',
            '\u{2122}', '\u{00B4}', '\u{00A8}', '\u{2260}', '\u{00C6}', '\u{00D8}', '\u{221E}',
            '\u{00B1}', '\u{2264}', '\u{2265}', '\u{00A5}', '\u{00B5}', '\u{2202}', '\u{2211}',
            '\u{220F}', '\u{03C0}', '\u{222B}', '\u{00AA}', '\u{00BA}', '\u{03A9}', '\u{00E6}',
            '\u{00F8}', '\u{00BF}', '\u{00A1}', '\u{00AC}', '\u{221A}', '\u{0192}', '\u{2248}',
            '\u{2206}', '\u{00AB}', '\u{00BB}', '\u{2026}', '\u{00A0}', '\u{00C0}', '\u{00C3}',
            '\u{00D5}', '\u{0152}', '\u{0153}', '\u{2013}', '\u{2014}', '\u{201C}', '\u{201D}',
            '\u{2018}', '\u{2019}', '\u{00F7}', '\u{25CA}', '\u{00FF}', '\u{0178}', '\u{2044}',
            '\u{20AC}', '\u{2039}', '\u{203A}', '\u{FB01}', '\u{FB02}', '\u{2021}', '\u{00B7}',
            '\u{201A}', '\u{201E}', '\u{2030}', '\u{00C2}', '\u{00CA}', '\u{00C1}', '\u{00CB}',
            '\u{00C8}', '\u{00CD}', '\u{00CE}', '\u{00CF}', '\u{00CC}', '\u{00D3}', '\u{00D4}',
            '\u{F8FF}', '\u{00D2}', '\u{00DA}', '\u{00DB}', '\u{00D9}', '\u{0131}', '\u{02C6}',
            '\u{02DC}', '\u{00AF}', '\u{02D8}', '\u{02D9}', '\u{02DA}', '\u{00B8}', '\u{02DD}',
            '\u{02DB}', '\u{02C7}',
        ];
        for (i, &ch) in mac_upper.iter().enumerate() {
            t[0x80 + i] = ch;
        }
        t
    })
}
786
787/// Map an Adobe Glyph List (AGL) name to a Unicode character.
788///
789/// Handles:
790/// - `uniXXXX` names (4+ hex digits after "uni")
791/// - `uXXXX` / `uXXXXX` names
792/// - Common AGL names (top ~250 entries covering >99% of real-world PDFs)
793///
794/// The `ct` ligature has no precomposed Unicode codepoint and is handled
795/// separately via the `ct_codes` side-table on `FontInfo` — see
796/// [`apply_glyph_name`] — rather than through any PUA sentinel here.
797fn glyph_name_to_unicode(name: &str) -> Option<char> {
798    // Handle uniXXXX / uXXXXX patterns.
799    if name.starts_with("uni") && name.len() >= 7 {
800        return u32::from_str_radix(&name[3..7], 16)
801            .ok()
802            .and_then(char::from_u32);
803    }
804    if name.starts_with('u') && name.len() >= 5 && name[1..].chars().all(|c| c.is_ascii_hexdigit())
805    {
806        return u32::from_str_radix(&name[1..], 16)
807            .ok()
808            .and_then(char::from_u32);
809    }
810
811    if let Some(c) = agl_table().get(name).copied() {
812        return Some(c);
813    }
814
815    match name {
816        "st" => Some('\u{FB06}'),
817        "longst" => Some('\u{FB05}'),
818        _ => None,
819    }
820}
821
/// Built-in Adobe Glyph List table (most common ~250 entries).
///
/// The map is built on first use and cached for the life of the process.
/// Glyph names are case-sensitive keys; each maps to a single `char`.
fn agl_table() -> &'static HashMap<&'static str, char> {
    /// `A`..`Z` and `a`..`z`: each letter's glyph name is the letter itself.
    const ASCII_LETTERS: &str = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz";

    /// Every other glyph name, grouped by category.
    const NAMED: &[(&str, char)] = &[
        // ASCII punctuation, digits and symbols
        ("space", ' '),
        ("exclam", '!'),
        ("quotedbl", '"'),
        ("numbersign", '#'),
        ("dollar", '$'),
        ("percent", '%'),
        ("ampersand", '&'),
        ("quotesingle", '\''),
        ("parenleft", '('),
        ("parenright", ')'),
        ("asterisk", '*'),
        ("plus", '+'),
        ("comma", ','),
        ("hyphen", '-'),
        ("period", '.'),
        ("slash", '/'),
        ("zero", '0'),
        ("one", '1'),
        ("two", '2'),
        ("three", '3'),
        ("four", '4'),
        ("five", '5'),
        ("six", '6'),
        ("seven", '7'),
        ("eight", '8'),
        ("nine", '9'),
        ("colon", ':'),
        ("semicolon", ';'),
        ("less", '<'),
        ("equal", '='),
        ("greater", '>'),
        ("question", '?'),
        ("at", '@'),
        ("bracketleft", '['),
        ("backslash", '\\'),
        ("bracketright", ']'),
        ("asciicircum", '^'),
        ("underscore", '_'),
        ("grave", '`'),
        ("braceleft", '{'),
        ("bar", '|'),
        ("braceright", '}'),
        ("asciitilde", '~'),
        // Latin extended
        ("Agrave", '\u{00C0}'),
        ("Aacute", '\u{00C1}'),
        ("Acircumflex", '\u{00C2}'),
        ("Atilde", '\u{00C3}'),
        ("Adieresis", '\u{00C4}'),
        ("Aring", '\u{00C5}'),
        ("AE", '\u{00C6}'),
        ("Ccedilla", '\u{00C7}'),
        ("Egrave", '\u{00C8}'),
        ("Eacute", '\u{00C9}'),
        ("Ecircumflex", '\u{00CA}'),
        ("Edieresis", '\u{00CB}'),
        ("Igrave", '\u{00CC}'),
        ("Iacute", '\u{00CD}'),
        ("Icircumflex", '\u{00CE}'),
        ("Idieresis", '\u{00CF}'),
        ("Eth", '\u{00D0}'),
        ("Ntilde", '\u{00D1}'),
        ("Ograve", '\u{00D2}'),
        ("Oacute", '\u{00D3}'),
        ("Ocircumflex", '\u{00D4}'),
        ("Otilde", '\u{00D5}'),
        ("Odieresis", '\u{00D6}'),
        ("Ugrave", '\u{00D9}'),
        ("Uacute", '\u{00DA}'),
        ("Ucircumflex", '\u{00DB}'),
        ("Udieresis", '\u{00DC}'),
        ("Yacute", '\u{00DD}'),
        ("Thorn", '\u{00DE}'),
        ("germandbls", '\u{00DF}'),
        ("agrave", '\u{00E0}'),
        ("aacute", '\u{00E1}'),
        ("acircumflex", '\u{00E2}'),
        ("atilde", '\u{00E3}'),
        ("adieresis", '\u{00E4}'),
        ("aring", '\u{00E5}'),
        ("ae", '\u{00E6}'),
        ("ccedilla", '\u{00E7}'),
        ("egrave", '\u{00E8}'),
        ("eacute", '\u{00E9}'),
        ("ecircumflex", '\u{00EA}'),
        ("edieresis", '\u{00EB}'),
        ("igrave", '\u{00EC}'),
        ("iacute", '\u{00ED}'),
        ("icircumflex", '\u{00EE}'),
        ("idieresis", '\u{00EF}'),
        ("eth", '\u{00F0}'),
        ("ntilde", '\u{00F1}'),
        ("ograve", '\u{00F2}'),
        ("oacute", '\u{00F3}'),
        ("ocircumflex", '\u{00F4}'),
        ("otilde", '\u{00F5}'),
        ("odieresis", '\u{00F6}'),
        ("ugrave", '\u{00F9}'),
        ("uacute", '\u{00FA}'),
        ("ucircumflex", '\u{00FB}'),
        ("udieresis", '\u{00FC}'),
        ("yacute", '\u{00FD}'),
        ("thorn", '\u{00FE}'),
        ("ydieresis", '\u{00FF}'),
        // Ligatures and special
        ("fi", '\u{FB01}'),
        ("fl", '\u{FB02}'),
        ("ff", '\u{FB00}'),
        ("ffi", '\u{FB03}'),
        ("ffl", '\u{FB04}'),
        // Punctuation and symbols
        ("endash", '\u{2013}'),
        ("emdash", '\u{2014}'),
        ("bullet", '\u{2022}'),
        ("ellipsis", '\u{2026}'),
        ("quoteleft", '\u{2018}'),
        ("quoteright", '\u{2019}'),
        ("quotedblleft", '\u{201C}'),
        ("quotedblright", '\u{201D}'),
        ("quotesinglebase", '\u{201A}'),
        ("quotesinglbase", '\u{201A}'),
        ("quotedblbase", '\u{201E}'),
        ("dagger", '\u{2020}'),
        ("daggerdbl", '\u{2021}'),
        ("perthousand", '\u{2030}'),
        ("guilsinglleft", '\u{2039}'),
        ("guilsinglright", '\u{203A}'),
        ("guillemotleft", '\u{00AB}'),
        ("guillemotright", '\u{00BB}'),
        ("trademark", '\u{2122}'),
        ("copyright", '\u{00A9}'),
        ("registered", '\u{00AE}'),
        ("degree", '\u{00B0}'),
        ("plusminus", '\u{00B1}'),
        ("multiply", '\u{00D7}'),
        ("divide", '\u{00F7}'),
        ("fraction", '\u{2044}'),
        ("Euro", '\u{20AC}'),
        ("sterling", '\u{00A3}'),
        ("yen", '\u{00A5}'),
        ("cent", '\u{00A2}'),
        ("currency", '\u{00A4}'),
        ("section", '\u{00A7}'),
        ("paragraph", '\u{00B6}'),
        ("brokenbar", '\u{00A6}'),
        ("ordfeminine", '\u{00AA}'),
        ("ordmasculine", '\u{00BA}'),
        ("exclamdown", '\u{00A1}'),
        ("questiondown", '\u{00BF}'),
        ("logicalnot", '\u{00AC}'),
        ("mu", '\u{00B5}'),
        ("macron", '\u{00AF}'),
        ("acute", '\u{00B4}'),
        ("cedilla", '\u{00B8}'),
        ("dieresis", '\u{00A8}'),
        ("circumflex", '\u{02C6}'),
        ("tilde", '\u{02DC}'),
        ("caron", '\u{02C7}'),
        ("ring", '\u{02DA}'),
        ("breve", '\u{02D8}'),
        ("dotaccent", '\u{02D9}'),
        ("hungarumlaut", '\u{02DD}'),
        ("ogonek", '\u{02DB}'),
        ("nbspace", '\u{00A0}'),
        ("nonbreakingspace", '\u{00A0}'),
        ("softhyphen", '\u{00AD}'),
        ("periodcentered", '\u{00B7}'),
        ("middot", '\u{00B7}'),
        ("florin", '\u{0192}'),
        ("OE", '\u{0152}'),
        ("oe", '\u{0153}'),
        ("Scaron", '\u{0160}'),
        ("scaron", '\u{0161}'),
        ("Zcaron", '\u{017D}'),
        ("zcaron", '\u{017E}'),
        ("Ydieresis", '\u{0178}'),
        ("Lslash", '\u{0141}'),
        ("lslash", '\u{0142}'),
        ("Oslash", '\u{00D8}'),
        ("oslash", '\u{00F8}'),
        ("dotlessi", '\u{0131}'),
        // Superscripts / subscripts
        ("onesuperior", '\u{00B9}'),
        ("twosuperior", '\u{00B2}'),
        ("threesuperior", '\u{00B3}'),
        ("onequarter", '\u{00BC}'),
        ("onehalf", '\u{00BD}'),
        ("threequarters", '\u{00BE}'),
        // Math
        ("minus", '\u{2212}'),
        ("notequal", '\u{2260}'),
        ("lessequal", '\u{2264}'),
        ("greaterequal", '\u{2265}'),
        ("infinity", '\u{221E}'),
        ("partialdiff", '\u{2202}'),
        ("summation", '\u{2211}'),
        ("product", '\u{220F}'),
        ("integral", '\u{222B}'),
        ("radical", '\u{221A}'),
        ("approxequal", '\u{2248}'),
        ("Delta", '\u{0394}'),
        ("lozenge", '\u{25CA}'),
        ("pi", '\u{03C0}'),
        ("Omega", '\u{03A9}'),
    ];

    static TABLE: OnceLock<HashMap<&'static str, char>> = OnceLock::new();
    TABLE.get_or_init(|| {
        let mut table: HashMap<&'static str, char> =
            HashMap::with_capacity(NAMED.len() + ASCII_LETTERS.len());
        table.extend(NAMED.iter().copied());
        // Single-letter names: key is the one-character slice, value the letter.
        for (offset, letter) in ASCII_LETTERS.char_indices() {
            table.insert(&ASCII_LETTERS[offset..=offset], letter);
        }
        table
    })
}
1087
/// Internal stand-in character for the `ct` ligature, which has no
/// precomposed Unicode code point. The value sits in the Private Use Area,
/// so on its own it is ambiguous: it only means `ct` when the glyph's origin
/// flag in [`DecodedPdfString`] is `true` (see
/// [`decompose_ligature_char_with_origin`]). A U+E007 that arrives through a
/// font's ToUnicode mapping is pushed with the flag `false` and is therefore
/// not rewritten to `ct`.
const CT_LIGATURE_MARKER: char = '\u{E007}';
1089
/// Decoded text plus one origin flag per character.
///
/// `ct_origins[i]` is `true` when the `i`-th `char` of `text` was emitted as
/// the internal `ct` ligature marker; every mutator here keeps the two in
/// lock-step (one flag per `char`, not per byte).
#[derive(Clone, Default)]
struct DecodedPdfString {
    text: String,
    ct_origins: Vec<bool>,
}

impl DecodedPdfString {
    /// Wrap already-decoded text; none of its characters is a `ct` marker.
    fn from_text(text: String) -> Self {
        let ct_origins = text.chars().map(|_| false).collect();
        Self { text, ct_origins }
    }

    /// Append one character together with its origin flag.
    fn push_char(&mut self, ch: char, is_ct_origin: bool) {
        self.ct_origins.push(is_ct_origin);
        self.text.push(ch);
    }

    /// Append every character of `s`, each flagged as not `ct`-originated.
    fn push_str(&mut self, s: &str) {
        self.text.push_str(s);
        self.ct_origins.extend(s.chars().map(|_| false));
    }

    /// Append another decoded string, carrying its flags over unchanged.
    fn extend(&mut self, other: DecodedPdfString) {
        let DecodedPdfString { text, ct_origins } = other;
        self.text.push_str(&text);
        self.ct_origins.extend(ct_origins);
    }

    /// `true` when no text has been decoded.
    fn is_empty(&self) -> bool {
        self.text.is_empty()
    }

    /// Number of decoded characters (one flag is stored per character).
    fn glyph_count(&self) -> usize {
        self.ct_origins.len()
    }

    /// Iterate `(character, is_ct_origin)` pairs in text order.
    fn iter(&self) -> impl Iterator<Item = (char, bool)> + '_ {
        let flags = self.ct_origins.iter().copied();
        self.text.chars().zip(flags)
    }
}
1130
1131/// Decode a PDF string using the font's ToUnicode CMap (if available),
1132/// preserving origin metadata for internal `ct` ligature markers.
1133fn decode_pdf_string_with_font_marked(
1134    bytes: &[u8],
1135    font_info: Option<&FontInfo>,
1136) -> DecodedPdfString {
1137    // Check for UTF-16BE BOM first — always takes priority.
1138    if bytes.len() >= 2 && bytes[0] == 0xFE && bytes[1] == 0xFF {
1139        let chars: Vec<u16> = bytes[2..]
1140            .chunks(2)
1141            .filter_map(|chunk| {
1142                if chunk.len() == 2 {
1143                    Some(u16::from_be_bytes([chunk[0], chunk[1]]))
1144                } else {
1145                    None
1146                }
1147            })
1148            .collect();
1149        return DecodedPdfString::from_text(String::from_utf16_lossy(&chars));
1150    }
1151
1152    if let Some(info) = font_info {
1153        if info.is_cid && !info.to_unicode.is_empty() {
1154            // CID font: decode 2-byte codes via ToUnicode.
1155            let mut result = DecodedPdfString::default();
1156            let mut i = 0;
1157            while i + 1 < bytes.len() {
1158                let code = u16::from_be_bytes([bytes[i], bytes[i + 1]]) as u32;
1159                if let Some(s) = info.to_unicode.get(&code) {
1160                    result.push_str(s);
1161                } else {
1162                    // Fallback: try direct Unicode interpretation.
1163                    if let Some(ch) = char::from_u32(code) {
1164                        if !ch.is_control() || ch == ' ' || ch == '\t' || ch == '\n' {
1165                            result.push_char(ch, false);
1166                        }
1167                    }
1168                }
1169                i += 2;
1170            }
1171            return result;
1172        }
1173
1174        if !info.is_cid && !info.to_unicode.is_empty() {
1175            // Simple font with ToUnicode: decode 1-byte codes.
1176            let mut result = DecodedPdfString::default();
1177            for &b in bytes {
1178                if let Some(s) = info.to_unicode.get(&(b as u32)) {
1179                    result.push_str(s);
1180                } else if info.ct_codes[b as usize] {
1181                    result.push_char(CT_LIGATURE_MARKER, true);
1182                } else if let Some(ch) = info.encoding_map[b as usize] {
1183                    result.push_char(ch, false);
1184                } else {
1185                    let ch = b as char;
1186                    if is_printable_or_space(ch) {
1187                        result.push_char(ch, false);
1188                    }
1189                }
1190            }
1191            return result;
1192        }
1193
1194        // Simple font with encoding map but no ToUnicode.
1195        if !info.is_cid
1196            && (info.encoding_map.iter().any(|c| c.is_some()) || info.ct_codes.iter().any(|f| *f))
1197        {
1198            let mut result = DecodedPdfString::default();
1199            for &b in bytes {
1200                if info.ct_codes[b as usize] {
1201                    result.push_char(CT_LIGATURE_MARKER, true);
1202                } else if let Some(ch) = info.encoding_map[b as usize] {
1203                    result.push_char(ch, false);
1204                } else {
1205                    let ch = b as char;
1206                    if is_printable_or_space(ch) {
1207                        result.push_char(ch, false);
1208                    }
1209                }
1210            }
1211            return result;
1212        }
1213    }
1214
1215    // Fallback: PDFDocEncoding (ASCII + Latin-1), skipping control chars.
1216    let mut result = DecodedPdfString::default();
1217    for &b in bytes {
1218        let ch = b as char;
1219        if is_printable_or_space(ch) {
1220            result.push_char(ch, false);
1221        }
1222    }
1223    result
1224}
1225
/// Returns `true` for tab, line feed, carriage return and every character
/// at or above U+0020 (space); `false` for the remaining C0 control
/// characters such as NUL and BEL, which corrupt XML output.
fn is_printable_or_space(ch: char) -> bool {
    matches!(ch, '\t' | '\n' | '\r') || ch >= ' '
}
1232
/// Whether to expand precomposed-ligature characters and known ligature
/// glyph names into their constituent characters during text extraction.
/// Default-on so that text-search, copy-paste and accessibility flows see
/// the human-readable string ("office") rather than an opaque glyph cluster
/// ("o\u{FB03}ce").
///
/// Consulted by [`maybe_decompose_decoded`], which returns the decoded text
/// unchanged when this is `false`.
///
/// Internal for now — the public surface stays unchanged; future
/// configuration would surface this on a higher-level type without breaking
/// the current API.
const LIGATURE_DECOMP: bool = true;
1241
/// Expand one of the Latin ligature code points U+FB00..=U+FB06 into the
/// letters it stands for. Any other character yields `None`, in which case
/// the caller keeps the original code point.
///
/// The `ct` ligature is not handled here: it has no precomposed Unicode
/// code point, so it travels as an internal marker plus origin metadata
/// that cannot collide with legitimate PUA values from ToUnicode CMaps.
fn decompose_ligature_char(c: char) -> Option<&'static str> {
    // Indexed by offset from U+FB00. U+FB05 (long s + t) and U+FB06 (st)
    // both become "st" for search/copy-paste purposes; the historical
    // long-s spelling is not preserved.
    const EXPANSIONS: [&str; 7] = ["ff", "fi", "fl", "ffi", "ffl", "st", "st"];

    let offset = u32::from(c).checked_sub(0xFB00)?;
    EXPANSIONS.get(offset as usize).copied()
}
1263
1264fn decompose_ligature_char_with_origin(c: char, is_ct_origin: bool) -> Option<&'static str> {
1265    if is_ct_origin && c == CT_LIGATURE_MARKER {
1266        Some("ct")
1267    } else {
1268        decompose_ligature_char(c)
1269    }
1270}
1271
1272/// Decompose a single precomposed-ligature character into its constituent
1273/// string. Combines the hardcoded Latin set ([`decompose_ligature_char`])
1274/// with an NFKD fallback over the rest of the Alphabetic Presentation
1275/// Forms block (U+FB00..U+FB4F) — Hebrew, Arabic, Armenian forms — so
1276/// that string-level [`decompose_ligatures`] and the per-glyph
1277/// `push_glyph_positioned` path agree on what counts as a decomposable
1278/// glyph.
1279fn decompose_glyph_to_string(c: char) -> Option<String> {
1280    use unicode_normalization::UnicodeNormalization;
1281    if let Some(s) = decompose_ligature_char(c) {
1282        return Some(s.to_string());
1283    }
1284    if matches!(c, '\u{FB00}'..='\u{FB4F}') {
1285        let nfkd: String = c.nfkd().collect();
1286        // Some codepoints in this block have no compatibility decomposition
1287        // and would round-trip back as themselves; only treat them as
1288        // decomposable when NFKD actually maps them, including legitimate
1289        // single-codepoint mappings such as Hebrew presentation forms.
1290        if nfkd != c.to_string() {
1291            return Some(nfkd);
1292        }
1293    }
1294    None
1295}
1296
1297/// Apply ligature decomposition to a string. Hardcoded mappings cover the
1298/// six common Latin ligatures (fi, fl, ffi, ffl, ff, st). For any other
1299/// character in the Alphabetic Presentation Forms block (U+FB00..U+FB4F)
1300/// not covered above — Hebrew, Arabic, Armenian forms — apply NFKD locally
1301/// to that character only. NFKD is *not* applied globally to avoid breaking
1302/// legitimate composed accented characters elsewhere in the string.
1303fn decompose_ligatures(s: &str) -> String {
1304    let mut out = String::with_capacity(s.len());
1305    for c in s.chars() {
1306        if let Some(replacement) = decompose_glyph_to_string(c) {
1307            out.push_str(&replacement);
1308        } else {
1309            out.push(c);
1310        }
1311    }
1312    out
1313}
1314
1315fn decompose_decoded_ligatures(s: &DecodedPdfString) -> String {
1316    let mut out = String::with_capacity(s.text.len());
1317    for (c, is_ct_origin) in s.iter() {
1318        if let Some(replacement) = decompose_ligature_char_with_origin(c, is_ct_origin) {
1319            out.push_str(replacement);
1320        } else if let Some(replacement) = decompose_glyph_to_string(c) {
1321            out.push_str(&replacement);
1322        } else {
1323            out.push(c);
1324        }
1325    }
1326    out
1327}
1328
1329/// Apply [`decompose_ligatures`] when the feature is enabled; otherwise
1330/// return the input unchanged. Centralizes the gate so callers don't
1331/// repeat the `if LIGATURE_DECOMP` check at every emission site.
1332fn maybe_decompose_decoded(s: &DecodedPdfString) -> String {
1333    if LIGATURE_DECOMP {
1334        if s.ct_origins.iter().any(|origin| *origin) {
1335            decompose_decoded_ligatures(s)
1336        } else {
1337            decompose_ligatures(&s.text)
1338        }
1339    } else {
1340        s.text.clone()
1341    }
1342}
1343
1344/// Extract text blocks from a specific page.
1345pub fn extract_page_blocks(doc: &Document, page_num: u32) -> Vec<TextBlock> {
1346    let pages = doc.get_pages();
1347    let Some(&page_id) = pages.get(&page_num) else {
1348        return Vec::new();
1349    };
1350
1351    let font_map = build_font_map(doc, page_id);
1352    let resources = get_page_resources(doc, page_id).unwrap_or_default();
1353    if let Ok(content_bytes) = get_page_content_bytes(doc, page_id) {
1354        if let Ok(content) = Content::decode(&content_bytes) {
1355            return extract_blocks_from_ops_inner(
1356                &content.operations,
1357                page_num,
1358                &font_map,
1359                Some((doc, &resources)),
1360                0,
1361                None,
1362            );
1363        }
1364    }
1365
1366    Vec::new()
1367}
1368
1369/// Extract text blocks from a page identified by its object ID.
1370///
1371/// Unlike [`extract_page_blocks`], this function does not call `get_pages()`
1372/// internally. Use this when the caller already holds a page map from a
1373/// prior `doc.get_pages()` call to avoid redundant page-tree traversals.
1374pub fn extract_blocks_from_page_id(
1375    doc: &Document,
1376    page_id: lopdf::ObjectId,
1377    page_num: u32,
1378) -> Vec<TextBlock> {
1379    let font_map = build_font_map(doc, page_id);
1380    let resources = get_page_resources(doc, page_id).unwrap_or_default();
1381    if let Ok(content_bytes) = get_page_content_bytes(doc, page_id) {
1382        if let Ok(content) = Content::decode(&content_bytes) {
1383            return extract_blocks_from_ops_inner(
1384                &content.operations,
1385                page_num,
1386                &font_map,
1387                Some((doc, &resources)),
1388                0,
1389                None,
1390            );
1391        }
1392    }
1393    Vec::new()
1394}
1395
1396/// Extract text blocks from all pages of a document.
1397pub fn extract_text(doc: &Document) -> Vec<TextBlock> {
1398    let pages = doc.get_pages();
1399    let mut blocks = Vec::new();
1400
1401    for (&page_num, &page_id) in &pages {
1402        let font_map = build_font_map(doc, page_id);
1403        let resources = get_page_resources(doc, page_id).unwrap_or_default();
1404        if let Ok(content_bytes) = get_page_content_bytes(doc, page_id) {
1405            if let Ok(content) = Content::decode(&content_bytes) {
1406                let page_blocks = extract_blocks_from_ops_inner(
1407                    &content.operations,
1408                    page_num,
1409                    &font_map,
1410                    Some((doc, &resources)),
1411                    0,
1412                    None,
1413                );
1414                blocks.extend(page_blocks);
1415            }
1416        }
1417    }
1418
1419    blocks
1420}
1421
1422/// Extract text from a specific page as a plain string.
1423pub fn extract_page_text(doc: &Document, page_num: u32) -> Result<String> {
1424    let pages = doc.get_pages();
1425    let total = pages.len() as u32;
1426
1427    if page_num == 0 || page_num > total {
1428        return Err(ExtractError::PageOutOfRange(page_num, total));
1429    }
1430
1431    let page_id = *pages
1432        .get(&page_num)
1433        .ok_or(ExtractError::PageOutOfRange(page_num, total))?;
1434
1435    let font_map = build_font_map(doc, page_id);
1436    let resources = get_page_resources(doc, page_id).unwrap_or_default();
1437    let content_bytes = get_page_content_bytes(doc, page_id).unwrap_or_default();
1438    let content = match Content::decode(&content_bytes) {
1439        Ok(c) => c,
1440        Err(_) => return Ok(String::new()),
1441    };
1442
1443    let blocks = extract_blocks_from_ops_inner(
1444        &content.operations,
1445        page_num,
1446        &font_map,
1447        Some((doc, &resources)),
1448        0,
1449        None,
1450    );
1451    let text = blocks
1452        .iter()
1453        .map(|b| b.text.as_str())
1454        .collect::<Vec<_>>()
1455        .join("");
1456
1457    Ok(text)
1458}
1459
1460/// Extract positioned characters from a specific page.
1461pub fn extract_positioned_chars(doc: &Document, page_num: u32) -> Result<Vec<PositionedChar>> {
1462    let pages = doc.get_pages();
1463    let total = pages.len() as u32;
1464
1465    if page_num == 0 || page_num > total {
1466        return Err(ExtractError::PageOutOfRange(page_num, total));
1467    }
1468
1469    let page_id = *pages
1470        .get(&page_num)
1471        .ok_or(ExtractError::PageOutOfRange(page_num, total))?;
1472
1473    let font_map = build_font_map(doc, page_id);
1474    let content_bytes = get_page_content_bytes(doc, page_id).unwrap_or_default();
1475    let content = match Content::decode(&content_bytes) {
1476        Ok(c) => c,
1477        Err(_) => return Ok(Vec::new()),
1478    };
1479
1480    let chars = extract_chars_from_ops(&content.operations, page_num, &font_map);
1481    Ok(chars)
1482}
1483
1484/// Get content stream bytes for a page.
1485fn get_page_content_bytes(doc: &Document, page_id: ObjectId) -> std::result::Result<Vec<u8>, ()> {
1486    doc.get_page_content(page_id).map_err(|_| ())
1487}
1488
/// One entry on the marked-content stack. PDF marked-content sequences nest
/// (e.g. `BDC … BDC … EMC … EMC`); each `BDC`/`BMC` pushes, `EMC` pops. The
/// innermost entry with `actual_text = Some(_)` wins for any glyph emitted
/// inside that range.
///
/// The derived `Default` produces an entry with `actual_text = None`.
#[derive(Debug, Clone, Default)]
struct MarkedContentEntry {
    /// `/ActualText` value from the property dict, decoded as a PDF text
    /// string (UTF-16BE if BOM-prefixed, otherwise PDFDocEncoding/Latin-1).
    /// `None` entries are skipped by [`current_actual_text`], which keeps
    /// searching outward through the enclosing entries.
    actual_text: Option<String>,
}
1499
1500/// Resolve the property dict for a `BDC` operator. The second operand is
1501/// either an inline dictionary or a `/Name` referring to an entry in
1502/// `Resources/Properties`.
1503fn resolve_bdc_properties(
1504    doc_and_resources: Option<(&Document, &lopdf::Dictionary)>,
1505    operand: &Object,
1506) -> Option<lopdf::Dictionary> {
1507    match operand {
1508        Object::Dictionary(d) => Some(d.clone()),
1509        Object::Reference(r) => doc_and_resources.and_then(|(doc, _)| {
1510            doc.get_object(*r).ok().and_then(|o| match o {
1511                Object::Dictionary(d) => Some(d.clone()),
1512                _ => None,
1513            })
1514        }),
1515        Object::Name(n) => {
1516            let (doc, resources) = doc_and_resources?;
1517            let props = resolve_dict(doc, resources, b"Properties")?;
1518            match props.get(n.as_slice()).ok()? {
1519                Object::Dictionary(d) => Some(d.clone()),
1520                Object::Reference(r) => match doc.get_object(*r).ok()? {
1521                    Object::Dictionary(d) => Some(d.clone()),
1522                    _ => None,
1523                },
1524                _ => None,
1525            }
1526        }
1527        _ => None,
1528    }
1529}
1530
1531/// Decode a PDF text string (BOM-detected UTF-16BE / UTF-8, otherwise
1532/// PDFDocEncoding-ish Latin-1). Used for the `/ActualText` value, which is
1533/// stored as a regular text string — not as a glyph-coded string.
1534fn decode_pdf_text_string(bytes: &[u8]) -> String {
1535    if bytes.len() >= 2 && bytes[0] == 0xFE && bytes[1] == 0xFF {
1536        let u16s: Vec<u16> = bytes[2..]
1537            .chunks_exact(2)
1538            .map(|c| u16::from_be_bytes([c[0], c[1]]))
1539            .collect();
1540        return String::from_utf16_lossy(&u16s);
1541    }
1542    if bytes.len() >= 3 && bytes[0] == 0xEF && bytes[1] == 0xBB && bytes[2] == 0xBF {
1543        return String::from_utf8_lossy(&bytes[3..]).into_owned();
1544    }
1545    bytes
1546        .iter()
1547        .filter_map(|&b| {
1548            let ch = b as char;
1549            if is_printable_or_space(ch) {
1550                Some(ch)
1551            } else {
1552                None
1553            }
1554        })
1555        .collect()
1556}
1557
1558/// Extract `/ActualText` from a BDC property dict, if present.
1559fn extract_actual_text(props: &lopdf::Dictionary) -> Option<String> {
1560    let obj = props.get(b"ActualText").ok()?;
1561    match obj {
1562        Object::String(bytes, _) => Some(decode_pdf_text_string(bytes)),
1563        _ => None,
1564    }
1565}
1566
1567/// Innermost `actual_text` from the stack, falling back to an inherited
1568/// value supplied by the caller. The inherited value carries the parent
1569/// invocation's top-of-stack `/ActualText` across `Do` recursion into a
1570/// Form XObject, so text emitted inside the XObject's `BT`...`ET` blocks
1571/// is tagged with the surrounding `BDC`/`EMC` pair's `/ActualText`.
1572fn current_actual_text(stack: &[MarkedContentEntry], inherited: Option<&str>) -> Option<String> {
1573    stack
1574        .iter()
1575        .rev()
1576        .find_map(|e| e.actual_text.clone())
1577        .or_else(|| inherited.map(|s| s.to_string()))
1578}
1579
1580/// Extract text blocks from a list of operations, handling Form XObject
1581/// recursion via `Do`. `inherited_actual_text` carries the parent's active
1582/// `/ActualText` binding into the recursion so a `BDC` ... `Do` ... `EMC`
1583/// sequence keeps tagging text emitted inside the invoked Form XObject.
1584/// Top-level callers pass `None`.
1585fn extract_blocks_from_ops_inner(
1586    ops: &[Operation],
1587    page: u32,
1588    font_map: &HashMap<String, FontInfo>,
1589    doc_and_resources: Option<(&Document, &lopdf::Dictionary)>,
1590    depth: u32,
1591    inherited_actual_text: Option<&str>,
1592) -> Vec<TextBlock> {
1593    let mut state = TextState::default();
1594    let mut blocks = Vec::new();
1595    let mut mc_stack: Vec<MarkedContentEntry> = Vec::new();
1596
1597    for op in ops {
1598        match op.operator.as_str() {
1599            "q" => {
1600                state.gs_stack.push(GraphicsState { ctm: state.ctm });
1601            }
1602            "Q" => {
1603                if let Some(gs) = state.gs_stack.pop() {
1604                    state.ctm = gs.ctm;
1605                }
1606            }
1607            "cm" => {
1608                if let Some(m) = extract_matrix(&op.operands) {
1609                    state.ctm = multiply_matrix(&state.ctm, &m);
1610                }
1611            }
1612            "BT" => {
1613                state.tm = [1.0, 0.0, 0.0, 1.0, 0.0, 0.0];
1614                state.tlm = [1.0, 0.0, 0.0, 1.0, 0.0, 0.0];
1615            }
1616            "Tf" if op.operands.len() >= 2 => {
1617                if let Object::Name(ref name) = op.operands[0] {
1618                    state.font_name = String::from_utf8_lossy(name).to_string();
1619                }
1620                if let Some(size) = as_number(&op.operands[1]) {
1621                    state.font_size = size;
1622                }
1623            }
1624            "Tc" => {
1625                if let Some(v) = op.operands.first().and_then(as_number) {
1626                    state.tc = v;
1627                }
1628            }
1629            "Tw" => {
1630                if let Some(v) = op.operands.first().and_then(as_number) {
1631                    state.tw = v;
1632                }
1633            }
1634            "Tz" => {
1635                if let Some(v) = op.operands.first().and_then(as_number) {
1636                    state.th = v;
1637                }
1638            }
1639            "TL" => {
1640                if let Some(v) = op.operands.first().and_then(as_number) {
1641                    state.tl = v;
1642                }
1643            }
1644            "Ts" => {
1645                if let Some(v) = op.operands.first().and_then(as_number) {
1646                    state.ts = v;
1647                }
1648            }
1649            "Td" if op.operands.len() >= 2 => {
1650                let tx = as_number(&op.operands[0]).unwrap_or(0.0);
1651                let ty = as_number(&op.operands[1]).unwrap_or(0.0);
1652                let new_tlm = multiply_matrix(&state.tlm, &[1.0, 0.0, 0.0, 1.0, tx, ty]);
1653                state.tlm = new_tlm;
1654                state.tm = new_tlm;
1655            }
1656            "TD" if op.operands.len() >= 2 => {
1657                let tx = as_number(&op.operands[0]).unwrap_or(0.0);
1658                let ty = as_number(&op.operands[1]).unwrap_or(0.0);
1659                state.tl = -ty;
1660                let new_tlm = multiply_matrix(&state.tlm, &[1.0, 0.0, 0.0, 1.0, tx, ty]);
1661                state.tlm = new_tlm;
1662                state.tm = new_tlm;
1663            }
1664            "Tm" => {
1665                if let Some(m) = extract_matrix(&op.operands) {
1666                    state.tm = m;
1667                    state.tlm = m;
1668                }
1669            }
1670            "T*" => {
1671                let new_tlm = multiply_matrix(&state.tlm, &[1.0, 0.0, 0.0, 1.0, 0.0, -state.tl]);
1672                state.tlm = new_tlm;
1673                state.tm = new_tlm;
1674            }
1675            "Tj" => {
1676                let fi = font_map.get(&state.font_name);
1677                if let Some(text) = extract_decoded_string_operand_with_font(&op.operands, fi) {
1678                    let char_w = state.font_size * APPROX_CHAR_WIDTH * (state.th / 100.0);
1679
1680                    // Skip empty text blocks (consistent with TJ handler) — empty
1681                    // blocks carry position but no data, causing phantom table
1682                    // detection with all-empty cells (#651).
1683                    if !text.is_empty() {
1684                        let x = state.tm[4];
1685                        let y = state.tm[5];
1686                        // Bbox width tracks the *rendered* glyph footprint
1687                        // (one char_w per source glyph), not the byte- or
1688                        // char-length of the decomposed display string.
1689                        let text_width = text.glyph_count() as f64 * char_w;
1690                        let display_text = maybe_decompose_decoded(&text);
1691                        blocks.push(TextBlock {
1692                            text: display_text,
1693                            page,
1694                            bbox: [x, y, x + text_width, y + state.font_size],
1695                            font_name: state.font_name.clone(),
1696                            font_size: state.font_size,
1697                            actual_text: current_actual_text(&mc_stack, inherited_actual_text),
1698                        });
1699                    }
1700
1701                    // Advance text position.
1702                    for _ in text.iter() {
1703                        state.tm[4] += char_w + state.tc;
1704                    }
1705                }
1706            }
1707            "TJ" => {
1708                if let Some(Object::Array(ref arr)) = op.operands.first() {
1709                    let x_start = state.tm[4];
1710                    let y = state.tm[5];
1711                    let char_w = state.font_size * APPROX_CHAR_WIDTH * (state.th / 100.0);
1712                    let mut combined_text = DecodedPdfString::default();
1713                    let fi = font_map.get(&state.font_name);
1714
1715                    for item in arr {
1716                        match item {
1717                            Object::String(bytes, _) => {
1718                                let text = decode_pdf_string_with_font_marked(bytes, fi);
1719                                for _ in text.iter() {
1720                                    state.tm[4] += char_w + state.tc;
1721                                }
1722                                combined_text.extend(text);
1723                            }
1724                            _ => {
1725                                if let Some(adj) = as_number(item) {
1726                                    // Negative values move right, positive move left.
1727                                    state.tm[4] -= adj / 1000.0 * state.font_size;
1728                                }
1729                            }
1730                        }
1731                    }
1732
1733                    if !combined_text.is_empty() {
1734                        let x_end = state.tm[4];
1735                        blocks.push(TextBlock {
1736                            text: maybe_decompose_decoded(&combined_text),
1737                            page,
1738                            bbox: [x_start, y, x_end, y + state.font_size],
1739                            font_name: state.font_name.clone(),
1740                            font_size: state.font_size,
1741                            actual_text: current_actual_text(&mc_stack, inherited_actual_text),
1742                        });
1743                    }
1744                }
1745            }
1746            "'" => {
1747                // Move to next line and show text.
1748                let new_tlm = multiply_matrix(&state.tlm, &[1.0, 0.0, 0.0, 1.0, 0.0, -state.tl]);
1749                state.tlm = new_tlm;
1750                state.tm = new_tlm;
1751
1752                let fi = font_map.get(&state.font_name);
1753                if let Some(text) = extract_decoded_string_operand_with_font(&op.operands, fi) {
1754                    let char_w = state.font_size * APPROX_CHAR_WIDTH * (state.th / 100.0);
1755
1756                    if !text.is_empty() {
1757                        let x = state.tm[4];
1758                        let y = state.tm[5];
1759                        // Bbox width tracks rendered glyph footprint (one
1760                        // char_w per source glyph) — match the Tj path.
1761                        let text_width = text.glyph_count() as f64 * char_w;
1762                        let display_text = maybe_decompose_decoded(&text);
1763                        blocks.push(TextBlock {
1764                            text: display_text,
1765                            page,
1766                            bbox: [x, y, x + text_width, y + state.font_size],
1767                            font_name: state.font_name.clone(),
1768                            font_size: state.font_size,
1769                            actual_text: current_actual_text(&mc_stack, inherited_actual_text),
1770                        });
1771                    }
1772
1773                    for _ in text.iter() {
1774                        state.tm[4] += char_w + state.tc;
1775                    }
1776                }
1777            }
1778            "\"" if op.operands.len() >= 3 => {
1779                // Set word/char spacing, move to next line, show text.
1780                if let Some(tw) = as_number(&op.operands[0]) {
1781                    state.tw = tw;
1782                }
1783                if let Some(tc) = as_number(&op.operands[1]) {
1784                    state.tc = tc;
1785                }
1786
1787                let new_tlm = multiply_matrix(&state.tlm, &[1.0, 0.0, 0.0, 1.0, 0.0, -state.tl]);
1788                state.tlm = new_tlm;
1789                state.tm = new_tlm;
1790
1791                let fi = font_map.get(&state.font_name);
1792                if let Some(text) = extract_decoded_string_operand_with_font(&op.operands[2..], fi)
1793                {
1794                    let char_w = state.font_size * APPROX_CHAR_WIDTH * (state.th / 100.0);
1795
1796                    if !text.is_empty() {
1797                        let x = state.tm[4];
1798                        let y = state.tm[5];
1799                        // Bbox width tracks rendered glyph footprint
1800                        // (one char_w per source glyph) — match Tj path.
1801                        let text_width = text.glyph_count() as f64 * char_w;
1802                        let display_text = maybe_decompose_decoded(&text);
1803                        blocks.push(TextBlock {
1804                            text: display_text,
1805                            page,
1806                            bbox: [x, y, x + text_width, y + state.font_size],
1807                            font_name: state.font_name.clone(),
1808                            font_size: state.font_size,
1809                            actual_text: current_actual_text(&mc_stack, inherited_actual_text),
1810                        });
1811                    }
1812
1813                    for _ in text.iter() {
1814                        state.tm[4] += char_w + state.tc;
1815                    }
1816                }
1817            }
1818            "BMC" => {
1819                // Begin marked-content (no property list). Push an empty entry
1820                // so EMC pops the right level — without this, nested BMC inside
1821                // a BDC/ActualText range would pop the BDC entry early.
1822                mc_stack.push(MarkedContentEntry::default());
1823            }
1824            "BDC" => {
1825                // Begin marked-content with property list. Operand 1 is the
1826                // tag (Name); operand 2 is either an inline dict or a Name
1827                // resolved via Resources/Properties.
1828                let actual_text = op
1829                    .operands
1830                    .get(1)
1831                    .and_then(|o| resolve_bdc_properties(doc_and_resources, o))
1832                    .as_ref()
1833                    .and_then(extract_actual_text);
1834                mc_stack.push(MarkedContentEntry { actual_text });
1835            }
1836            "EMC" => {
1837                // End marked-content. Use pop (not unwrap) so a malformed
1838                // stream with an unmatched EMC doesn't panic.
1839                mc_stack.pop();
1840            }
1841            "Do" if depth < 5 => {
1842                // Invoke Form XObject — recurse into its content stream.
1843                // Propagate the active /ActualText binding (top-of-stack, or
1844                // whatever this invocation itself inherited) so text inside
1845                // the XObject is tagged with the surrounding BDC/EMC pair.
1846                if let Some((doc, resources)) = doc_and_resources {
1847                    if let Some(Object::Name(ref xobj_name)) = op.operands.first() {
1848                        let xobj_name_str = String::from_utf8_lossy(xobj_name);
1849                        let inherited_for_child =
1850                            current_actual_text(&mc_stack, inherited_actual_text);
1851                        if let Some(xobj_blocks) = extract_form_xobject_text(
1852                            doc,
1853                            resources,
1854                            &xobj_name_str,
1855                            page,
1856                            font_map,
1857                            depth,
1858                            inherited_for_child.as_deref(),
1859                        ) {
1860                            blocks.extend(xobj_blocks);
1861                        }
1862                    }
1863                }
1864            }
1865            _ => {}
1866        }
1867    }
1868
1869    blocks
1870}
1871
1872/// Extract text from a Form XObject referenced by name in the page's
1873/// Resources/XObject dict. `inherited_actual_text` is the active
1874/// `/ActualText` binding from the calling context (parent stack +
1875/// whatever the parent itself inherited), so text emitted inside this
1876/// XObject's `BT`...`ET` blocks is tagged correctly when the `Do`
1877/// operator sits between a `BDC` and matching `EMC`.
1878fn extract_form_xobject_text(
1879    doc: &Document,
1880    resources: &lopdf::Dictionary,
1881    name: &str,
1882    page: u32,
1883    font_map: &HashMap<String, FontInfo>,
1884    depth: u32,
1885    inherited_actual_text: Option<&str>,
1886) -> Option<Vec<TextBlock>> {
1887    // Look up the XObject in the Resources dictionary.
1888    let xobj_dict = match resources.get(b"XObject").ok()? {
1889        Object::Dictionary(d) => d.clone(),
1890        Object::Reference(r) => match doc.get_object(*r).ok()? {
1891            Object::Dictionary(d) => d.clone(),
1892            _ => return None,
1893        },
1894        _ => return None,
1895    };
1896
1897    let xobj_ref = match xobj_dict.get(name.as_bytes()).ok()? {
1898        Object::Reference(r) => *r,
1899        _ => return None,
1900    };
1901
1902    let stream = match doc.get_object(xobj_ref).ok()? {
1903        Object::Stream(s) => s.clone(),
1904        _ => return None,
1905    };
1906
1907    // Verify it's a Form XObject (not Image).
1908    let subtype = stream
1909        .dict
1910        .get(b"Subtype")
1911        .ok()
1912        .and_then(|o| match o {
1913            Object::Name(n) => Some(String::from_utf8_lossy(n).to_string()),
1914            _ => None,
1915        })
1916        .unwrap_or_default();
1917    if subtype != "Form" {
1918        return None;
1919    }
1920
1921    // Decode the content stream.
1922    let content_bytes = stream
1923        .decompressed_content()
1924        .ok()
1925        .unwrap_or_else(|| stream.content.clone());
1926    let content = Content::decode(&content_bytes).ok()?;
1927
1928    // Build font map: XObject's own fonts take priority over inherited page fonts.
1929    // A Form XObject may reuse font names (e.g. /F1) with different encodings.
1930    let mut xobj_font_map = font_map.clone();
1931    if let Some(xobj_resources) = resolve_dict(doc, &stream.dict, b"Resources") {
1932        if let Some(xobj_fonts) = resolve_dict(doc, &xobj_resources, b"Font") {
1933            for (name_bytes, value) in xobj_fonts.iter() {
1934                let fname = String::from_utf8_lossy(name_bytes).to_string();
1935                if let Some(fi) = build_font_info_from_value(doc, value) {
1936                    xobj_font_map.insert(fname, fi);
1937                }
1938            }
1939        }
1940    }
1941
1942    // Use the XObject's own Resources dict for recursive Do lookups.
1943    let xobj_resources =
1944        resolve_dict(doc, &stream.dict, b"Resources").unwrap_or_else(|| resources.clone());
1945
1946    Some(extract_blocks_from_ops_inner(
1947        &content.operations,
1948        page,
1949        &xobj_font_map,
1950        Some((doc, &xobj_resources)),
1951        depth + 1,
1952        inherited_actual_text,
1953    ))
1954}
1955
1956/// Build a FontInfo from a font dictionary value (used for XObject font resolution).
1957fn build_font_info_from_value(doc: &Document, value: &Object) -> Option<FontInfo> {
1958    let font = match value {
1959        Object::Reference(r) => match doc.get_object(*r).ok()? {
1960            Object::Dictionary(d) => d.clone(),
1961            _ => return None,
1962        },
1963        Object::Dictionary(d) => d.clone(),
1964        _ => return None,
1965    };
1966
1967    let subtype = font
1968        .get(b"Subtype")
1969        .ok()
1970        .and_then(|o| match o {
1971            Object::Name(n) => Some(String::from_utf8_lossy(n).to_string()),
1972            _ => None,
1973        })
1974        .unwrap_or_default();
1975
1976    let is_cid = subtype == "Type0";
1977    let mut to_unicode = parse_to_unicode_from_font(doc, &font);
1978
1979    if to_unicode.is_empty() && is_cid {
1980        if let Ok(Object::Array(descendants)) = font.get(b"DescendantFonts") {
1981            for d in descendants {
1982                let desc_dict = match d {
1983                    Object::Reference(r) => {
1984                        // Don't use `?` here — a bad reference in one descendant
1985                        // should not abort the entire font info construction.
1986                        let Some(Object::Dictionary(d)) = doc.get_object(*r).ok() else {
1987                            continue;
1988                        };
1989                        d
1990                    }
1991                    Object::Dictionary(d) => d,
1992                    _ => continue,
1993                };
1994                let tu = parse_to_unicode_from_font(doc, desc_dict);
1995                if !tu.is_empty() {
1996                    to_unicode = tu;
1997                    break;
1998                }
1999            }
2000        }
2001    }
2002
2003    let (encoding_map, ct_codes) = if !is_cid {
2004        build_encoding_map(doc, &font)
2005    } else {
2006        ([None; 256], [false; 256])
2007    };
2008
2009    Some(FontInfo {
2010        is_cid,
2011        to_unicode,
2012        encoding_map,
2013        ct_codes,
2014    })
2015}
2016
2017/// Emit one or more `PositionedChar` for a single glyph, decomposing
2018/// ligatures by splitting the glyph's `char_w` advance evenly across the
2019/// constituent characters. The text matrix advances exactly once (per
2020/// glyph), preserving the original layout — only the per-character bbox is
2021/// subdivided.
2022///
2023/// Uses the unified [`decompose_glyph_to_string`] helper so that the
2024/// per-glyph decomposition matches the string-level
2025/// [`decompose_ligatures`] used by `extract_page_text` / `TextBlock`.
2026/// Without this alignment, presentation-form codepoints outside the
2027/// hardcoded Latin set (e.g. Armenian U+FB13) would expand in extracted
2028/// text but stay as a single PositionedChar, leaving consumers that pair
2029/// text offsets with bounding boxes misaligned.
2030fn push_glyph_positioned(
2031    chars: &mut Vec<PositionedChar>,
2032    state: &mut TextState,
2033    page: u32,
2034    glyph: char,
2035    is_ct_origin: bool,
2036    char_w: f64,
2037) {
2038    let (gx, gy) = apply_ctm(state);
2039    let constituents: Option<String> = if LIGATURE_DECOMP {
2040        // Try ct-origin marker first (preserves U+E007 → "ct" only when it
2041        // came from a /ct glyph-name path), then fall back to the unified
2042        // decomposition helper which covers the Latin set + FB00–FB4F NFKD.
2043        if let Some(s) = decompose_ligature_char_with_origin(glyph, is_ct_origin) {
2044            Some(s.to_string())
2045        } else {
2046            decompose_glyph_to_string(glyph)
2047        }
2048    } else {
2049        None
2050    };
2051    match constituents {
2052        None => {
2053            chars.push(PositionedChar {
2054                ch: glyph,
2055                page,
2056                bbox: [gx, gy, gx + char_w, gy + state.font_size],
2057            });
2058        }
2059        Some(s) => {
2060            let n = s.chars().count() as f64;
2061            let part_w = char_w / n;
2062            for (i, c) in s.chars().enumerate() {
2063                let x = gx + part_w * i as f64;
2064                chars.push(PositionedChar {
2065                    ch: c,
2066                    page,
2067                    bbox: [x, gy, x + part_w, gy + state.font_size],
2068                });
2069            }
2070        }
2071    }
2072    state.tm[4] += char_w + state.tc;
2073}
2074
2075/// Extract positioned characters from operations.
2076fn extract_chars_from_ops(
2077    ops: &[Operation],
2078    page: u32,
2079    font_map: &HashMap<String, FontInfo>,
2080) -> Vec<PositionedChar> {
2081    let mut state = TextState::default();
2082    let mut chars = Vec::new();
2083
2084    for op in ops {
2085        match op.operator.as_str() {
2086            "q" => {
2087                state.gs_stack.push(GraphicsState { ctm: state.ctm });
2088            }
2089            "Q" => {
2090                if let Some(gs) = state.gs_stack.pop() {
2091                    state.ctm = gs.ctm;
2092                }
2093            }
2094            "cm" => {
2095                if let Some(m) = extract_matrix(&op.operands) {
2096                    state.ctm = multiply_matrix(&state.ctm, &m);
2097                }
2098            }
2099            "BT" => {
2100                state.tm = [1.0, 0.0, 0.0, 1.0, 0.0, 0.0];
2101                state.tlm = [1.0, 0.0, 0.0, 1.0, 0.0, 0.0];
2102            }
2103            "Tf" if op.operands.len() >= 2 => {
2104                if let Object::Name(ref name) = op.operands[0] {
2105                    state.font_name = String::from_utf8_lossy(name).to_string();
2106                }
2107                if let Some(size) = as_number(&op.operands[1]) {
2108                    state.font_size = size;
2109                }
2110            }
2111            "Tc" => {
2112                if let Some(v) = op.operands.first().and_then(as_number) {
2113                    state.tc = v;
2114                }
2115            }
2116            "Tw" => {
2117                if let Some(v) = op.operands.first().and_then(as_number) {
2118                    state.tw = v;
2119                }
2120            }
2121            "Tz" => {
2122                if let Some(v) = op.operands.first().and_then(as_number) {
2123                    state.th = v;
2124                }
2125            }
2126            "TL" => {
2127                if let Some(v) = op.operands.first().and_then(as_number) {
2128                    state.tl = v;
2129                }
2130            }
2131            "Ts" => {
2132                if let Some(v) = op.operands.first().and_then(as_number) {
2133                    state.ts = v;
2134                }
2135            }
2136            "Td" if op.operands.len() >= 2 => {
2137                let tx = as_number(&op.operands[0]).unwrap_or(0.0);
2138                let ty = as_number(&op.operands[1]).unwrap_or(0.0);
2139                let new_tlm = multiply_matrix(&state.tlm, &[1.0, 0.0, 0.0, 1.0, tx, ty]);
2140                state.tlm = new_tlm;
2141                state.tm = new_tlm;
2142            }
2143            "TD" if op.operands.len() >= 2 => {
2144                let tx = as_number(&op.operands[0]).unwrap_or(0.0);
2145                let ty = as_number(&op.operands[1]).unwrap_or(0.0);
2146                state.tl = -ty;
2147                let new_tlm = multiply_matrix(&state.tlm, &[1.0, 0.0, 0.0, 1.0, tx, ty]);
2148                state.tlm = new_tlm;
2149                state.tm = new_tlm;
2150            }
2151            "Tm" => {
2152                if let Some(m) = extract_matrix(&op.operands) {
2153                    state.tm = m;
2154                    state.tlm = m;
2155                }
2156            }
2157            "T*" => {
2158                let new_tlm = multiply_matrix(&state.tlm, &[1.0, 0.0, 0.0, 1.0, 0.0, -state.tl]);
2159                state.tlm = new_tlm;
2160                state.tm = new_tlm;
2161            }
2162            "Tj" => {
2163                let fi = font_map.get(&state.font_name);
2164                if let Some(text) = extract_decoded_string_operand_with_font(&op.operands, fi) {
2165                    let char_w = state.font_size * APPROX_CHAR_WIDTH * (state.th / 100.0);
2166                    for (ch, is_ct_origin) in text.iter() {
2167                        push_glyph_positioned(
2168                            &mut chars,
2169                            &mut state,
2170                            page,
2171                            ch,
2172                            is_ct_origin,
2173                            char_w,
2174                        );
2175                    }
2176                }
2177            }
2178            "TJ" => {
2179                if let Some(Object::Array(ref arr)) = op.operands.first() {
2180                    let char_w = state.font_size * APPROX_CHAR_WIDTH * (state.th / 100.0);
2181                    let fi = font_map.get(&state.font_name);
2182                    for item in arr {
2183                        match item {
2184                            Object::String(bytes, _) => {
2185                                let text = decode_pdf_string_with_font_marked(bytes, fi);
2186                                for (ch, is_ct_origin) in text.iter() {
2187                                    push_glyph_positioned(
2188                                        &mut chars,
2189                                        &mut state,
2190                                        page,
2191                                        ch,
2192                                        is_ct_origin,
2193                                        char_w,
2194                                    );
2195                                }
2196                            }
2197                            _ => {
2198                                if let Some(adj) = as_number(item) {
2199                                    state.tm[4] -= adj / 1000.0 * state.font_size;
2200                                }
2201                            }
2202                        }
2203                    }
2204                }
2205            }
2206            "'" => {
2207                let new_tlm = multiply_matrix(&state.tlm, &[1.0, 0.0, 0.0, 1.0, 0.0, -state.tl]);
2208                state.tlm = new_tlm;
2209                state.tm = new_tlm;
2210
2211                let fi = font_map.get(&state.font_name);
2212                if let Some(text) = extract_decoded_string_operand_with_font(&op.operands, fi) {
2213                    let char_w = state.font_size * APPROX_CHAR_WIDTH * (state.th / 100.0);
2214                    for (ch, is_ct_origin) in text.iter() {
2215                        push_glyph_positioned(
2216                            &mut chars,
2217                            &mut state,
2218                            page,
2219                            ch,
2220                            is_ct_origin,
2221                            char_w,
2222                        );
2223                    }
2224                }
2225            }
2226            "\"" if op.operands.len() >= 3 => {
2227                if let Some(tw) = as_number(&op.operands[0]) {
2228                    state.tw = tw;
2229                }
2230                if let Some(tc) = as_number(&op.operands[1]) {
2231                    state.tc = tc;
2232                }
2233
2234                let new_tlm = multiply_matrix(&state.tlm, &[1.0, 0.0, 0.0, 1.0, 0.0, -state.tl]);
2235                state.tlm = new_tlm;
2236                state.tm = new_tlm;
2237
2238                let fi = font_map.get(&state.font_name);
2239                if let Some(text) = extract_decoded_string_operand_with_font(&op.operands[2..], fi)
2240                {
2241                    let char_w = state.font_size * APPROX_CHAR_WIDTH * (state.th / 100.0);
2242                    for (ch, is_ct_origin) in text.iter() {
2243                        push_glyph_positioned(
2244                            &mut chars,
2245                            &mut state,
2246                            page,
2247                            ch,
2248                            is_ct_origin,
2249                            char_w,
2250                        );
2251                    }
2252                }
2253            }
2254            _ => {}
2255        }
2256    }
2257
2258    chars
2259}
2260
2261/// Apply the current transformation matrix (CTM) to the text matrix position,
2262/// returning (x, y) in page/user space.
2263///
2264/// Mirrors the `compute_x` / `compute_y` logic in pdf-manip's `text_run.rs` so
2265/// that character positions emitted by `extract_chars_from_ops` use the same
2266/// coordinate space as `TextRun.x` / `TextRun.y` from `extract_text_runs`.
2267/// Without this, PDFs whose content streams set a non-identity CTM via `cm`
2268/// produce mismatched coordinates, breaking the spatial redaction fallback.
2269#[inline]
2270fn apply_ctm(state: &TextState) -> (f64, f64) {
2271    let x = state.ctm[0] * state.tm[4] + state.ctm[2] * state.tm[5] + state.ctm[4];
2272    let y = state.ctm[1] * state.tm[4] + state.ctm[3] * state.tm[5] + state.ctm[5];
2273    (x, y)
2274}
2275
2276/// Extract the first string operand, decoding via font's ToUnicode CMap if available.
2277fn extract_decoded_string_operand_with_font(
2278    operands: &[Object],
2279    font_info: Option<&FontInfo>,
2280) -> Option<DecodedPdfString> {
2281    for op in operands {
2282        if let Object::String(bytes, _) = op {
2283            return Some(decode_pdf_string_with_font_marked(bytes, font_info));
2284        }
2285    }
2286    None
2287}
2288
2289/// Convert a PDF object to a number (f64).
2290fn as_number(obj: &Object) -> Option<f64> {
2291    match obj {
2292        Object::Integer(i) => Some(*i as f64),
2293        Object::Real(f) => Some(*f as f64),
2294        _ => None,
2295    }
2296}
2297
2298/// Extract a 6-element transformation matrix from operands.
2299fn extract_matrix(operands: &[Object]) -> Option<[f64; 6]> {
2300    if operands.len() < 6 {
2301        return None;
2302    }
2303    let a = as_number(&operands[0])?;
2304    let b = as_number(&operands[1])?;
2305    let c = as_number(&operands[2])?;
2306    let d = as_number(&operands[3])?;
2307    let e = as_number(&operands[4])?;
2308    let f = as_number(&operands[5])?;
2309    Some([a, b, c, d, e, f])
2310}
2311
/// Compute the product `m1 x m2` of two affine transformation matrices.
///
/// Each matrix is stored as `[a, b, c, d, e, f]`, standing for the 3x3
/// matrix with rows `[a b 0]`, `[c d 0]`, `[e f 1]`. The result is in the
/// same six-element form.
fn multiply_matrix(m1: &[f64; 6], m2: &[f64; 6]) -> [f64; 6] {
    let [a1, b1, c1, d1, e1, f1] = *m1;
    let [a2, b2, c2, d2, e2, f2] = *m2;
    [
        // Linear part: the 2x2 product.
        a1 * a2 + b1 * c2,
        a1 * b2 + b1 * d2,
        c1 * a2 + d1 * c2,
        c1 * b2 + d1 * d2,
        // Translation: m1's offset pushed through m2, plus m2's own offset.
        e1 * a2 + f1 * c2 + e2,
        e1 * b2 + f1 * d2 + f2,
    ]
}
2323
2324#[cfg(test)]
2325mod tests {
2326    use super::*;
2327    use lopdf::{dictionary, Document, Object, Stream};
2328
2329    /// Phase 3 Round 7 — `hex_to_unicode_string` must not panic on
2330    /// inputs that contain multi-byte UTF-8 fragments (e.g. a stray
2331    /// `U+FFFD` produced by an upstream font decode). The previous
2332    /// implementation panicked at `&hex[i..i + 2]` on a non-char
2333    /// boundary; the new implementation slices the underlying bytes
2334    /// and skips invalid UTF-8 fragments gracefully.
2335    #[test]
2336    fn hex_to_unicode_string_does_not_panic_on_replacement_char_input() {
2337        let _ = hex_to_unicode_string("\u{FFFD}F");
2338        let _ = hex_to_unicode_string("0041\u{FFFD}");
2339        let _ = hex_to_unicode_string("\u{FFFD}\u{FFFD}");
2340        let _ = hex_to_unicode_string("\u{FFFD}");
2341        // Sanity: valid hex still decodes (UTF-16BE 0041 == 'A').
2342        assert_eq!(hex_to_unicode_string("0041"), "A");
2343    }
2344
2345    /// Helper: create a minimal doc with text content.
2346    fn make_doc_with_text(content: &[u8]) -> Document {
2347        let mut doc = Document::with_version("1.7");
2348
2349        let content_stream = Stream::new(dictionary! {}, content.to_vec());
2350        let content_id = doc.add_object(Object::Stream(content_stream));
2351
2352        let page_dict = dictionary! {
2353            "Type" => "Page",
2354            "MediaBox" => vec![0.into(), 0.into(), 612.into(), 792.into()],
2355            "Contents" => Object::Reference(content_id),
2356        };
2357        let page_id = doc.add_object(Object::Dictionary(page_dict));
2358
2359        let pages_dict = dictionary! {
2360            "Type" => "Pages",
2361            "Kids" => vec![Object::Reference(page_id)],
2362            "Count" => 1_i64,
2363        };
2364        let pages_id = doc.add_object(Object::Dictionary(pages_dict));
2365
2366        if let Ok(Object::Dictionary(ref mut d)) = doc.get_object_mut(page_id) {
2367            d.set("Parent", Object::Reference(pages_id));
2368        }
2369
2370        let catalog = dictionary! {
2371            "Type" => "Catalog",
2372            "Pages" => Object::Reference(pages_id),
2373        };
2374        let catalog_id = doc.add_object(Object::Dictionary(catalog));
2375        doc.trailer.set("Root", Object::Reference(catalog_id));
2376
2377        doc
2378    }
2379
2380    #[test]
2381    fn extract_simple_text() {
2382        let doc = make_doc_with_text(b"BT /F1 12 Tf (Hello World) Tj ET");
2383        let blocks = extract_text(&doc);
2384        assert_eq!(blocks.len(), 1);
2385        assert_eq!(blocks[0].text, "Hello World");
2386        assert_eq!(blocks[0].page, 1);
2387        assert_eq!(blocks[0].font_size, 12.0);
2388    }
2389
2390    #[test]
2391    fn extract_page_text_single() {
2392        let doc = make_doc_with_text(b"BT /F1 12 Tf (Hello) Tj ET");
2393        let text = extract_page_text(&doc, 1).unwrap();
2394        assert_eq!(text, "Hello");
2395    }
2396
2397    #[test]
2398    fn extract_page_text_out_of_range() {
2399        let doc = make_doc_with_text(b"BT /F1 12 Tf (Hello) Tj ET");
2400        let result = extract_page_text(&doc, 5);
2401        assert!(result.is_err());
2402    }
2403
2404    #[test]
2405    fn extract_positioned_chars_basic() {
2406        let doc = make_doc_with_text(b"BT /F1 12 Tf (AB) Tj ET");
2407        let chars = extract_positioned_chars(&doc, 1).unwrap();
2408        assert_eq!(chars.len(), 2);
2409        assert_eq!(chars[0].ch, 'A');
2410        assert_eq!(chars[1].ch, 'B');
2411        assert_eq!(chars[0].page, 1);
2412        // Second char should be positioned after the first.
2413        assert!(chars[1].bbox[0] > chars[0].bbox[0]);
2414    }
2415
2416    #[test]
2417    fn extract_tj_array() {
2418        let doc = make_doc_with_text(b"BT /F1 12 Tf [(He) -100 (llo)] TJ ET");
2419        let blocks = extract_text(&doc);
2420        assert_eq!(blocks.len(), 1);
2421        assert_eq!(blocks[0].text, "Hello");
2422    }
2423
2424    /// WORD_SPLIT behaviour-baseline (text-mechanisms pack
2425    /// `benchmarks/text_mechanisms/WORD_SPLIT/`).
2426    ///
2427    /// This test documents the **current** TJ-array tracking handling
2428    /// for several tracking magnitudes. It is intentionally a
2429    /// behaviour snapshot, not a correctness assertion. The
2430    /// pdf-extract layer combines TJ-array string pieces into one
2431    /// block regardless of tracking — it is `pdf-engine`'s
2432    /// `TextExtractionDevice` (multi-signal-consensus) that decides
2433    /// whether to insert a space between glyphs at the page-text
2434    /// level. Both layers are documented here so that any future
2435    /// change to the tracking threshold can be detected and
2436    /// reviewed.
2437    ///
2438    /// If a future fix changes pdf-extract behaviour for TJ tracking,
2439    /// THIS test will need updating. That update is intentional: we
2440    /// want a deliberate behaviour change, not an accidental one.
2441    #[test]
2442    fn word_split_baseline_pdf_extract_layer_combines_tj_pieces() {
2443        // Small tracking (-100): pdf-extract combines into one block.
2444        let doc1 = make_doc_with_text(
2445            b"BT /F1 12 Tf [(Range) -100 (Rs)] TJ ET");
2446        let b1 = extract_text(&doc1);
2447        assert_eq!(b1.len(), 1, "small tracking: 1 block expected");
2448        assert_eq!(b1[0].text, "RangeRs",
2449                   "pdf-extract combines TJ pieces irrespective of tracking");
2450
2451        // Large negative tracking (-2000): pdf-extract STILL combines.
2452        // The per-glyph spacing observed in CLI output comes from the
2453        // pdf-engine TextExtractionDevice consensus, not from this
2454        // pdf-extract path.
2455        let doc2 = make_doc_with_text(
2456            b"BT /F1 12 Tf [(Range) -2000 (Rs)] TJ ET");
2457        let b2 = extract_text(&doc2);
2458        assert_eq!(b2.len(), 1,
2459                   "large tracking: still 1 block at pdf-extract layer");
2460        assert_eq!(b2[0].text, "RangeRs",
2461                   "pdf-extract layer is unaffected by tracking magnitude");
2462
2463        // Multiple separate Tj operations DO produce multiple blocks
2464        // — that's a different pattern (BT/ET-block boundary).
2465        let doc3 = make_doc_with_text(
2466            b"BT /F1 12 Tf (Range) Tj (Rs) Tj ET");
2467        let b3 = extract_text(&doc3);
2468        assert_eq!(b3.len(), 2, "two Tj's = two blocks");
2469        assert_eq!(b3[0].text, "Range");
2470        assert_eq!(b3[1].text, "Rs");
2471    }
2472
2473    #[test]
2474    fn empty_page_extracts_no_text() {
2475        let doc = make_doc_with_text(b"q Q");
2476        let blocks = extract_text(&doc);
2477        assert!(blocks.is_empty());
2478    }
2479
2480    #[test]
2481    fn multiline_text_extraction() {
2482        let doc = make_doc_with_text(b"BT /F1 12 Tf 12 TL (Line1) Tj T* (Line2) Tj ET");
2483        let blocks = extract_text(&doc);
2484        assert_eq!(blocks.len(), 2);
2485        assert_eq!(blocks[0].text, "Line1");
2486        assert_eq!(blocks[1].text, "Line2");
2487    }
2488
2489    /// `/ActualText` on a BDC marked-content sequence overrides the glyph
2490    /// reading of the inner Tj. Glyph text is preserved on `text`; the
2491    /// canonical reading lives on `actual_text`.
2492    #[test]
2493    fn actual_text_basic_bdc_override() {
2494        let doc = make_doc_with_text(b"BT /F1 12 Tf /Span <</ActualText (fi)>> BDC (X) Tj EMC ET");
2495        let blocks = extract_text(&doc);
2496        assert_eq!(blocks.len(), 1);
2497        assert_eq!(blocks[0].text, "X");
2498        assert_eq!(blocks[0].actual_text.as_deref(), Some("fi"));
2499    }
2500
2501    /// Nested BDCs: the innermost `/ActualText` wins inside its range, and
2502    /// the outer override is restored after the inner EMC. A flat flag would
2503    /// leak the inner value past its scope; the stack must pop one level.
2504    #[test]
2505    fn actual_text_nested_bdc_inner_overrides_outer() {
2506        let doc = make_doc_with_text(
2507            b"BT /F1 12 Tf \
2508              /Span <</ActualText (outer)>> BDC \
2509                (A) Tj \
2510                /Span <</ActualText (inner)>> BDC \
2511                  (B) Tj \
2512                EMC \
2513                (C) Tj \
2514              EMC ET",
2515        );
2516        let blocks = extract_text(&doc);
2517        assert_eq!(blocks.len(), 3);
2518        assert_eq!(blocks[0].text, "A");
2519        assert_eq!(blocks[0].actual_text.as_deref(), Some("outer"));
2520        assert_eq!(blocks[1].text, "B");
2521        assert_eq!(blocks[1].actual_text.as_deref(), Some("inner"));
2522        assert_eq!(blocks[2].text, "C");
2523        assert_eq!(blocks[2].actual_text.as_deref(), Some("outer"));
2524    }
2525
2526    /// BMC (no property list) still pushes a stack entry so the matching EMC
2527    /// pops the right level — otherwise an EMC inside a BDC range would leak.
2528    #[test]
2529    fn actual_text_bmc_does_not_leak_emc() {
2530        let doc = make_doc_with_text(
2531            b"BT /F1 12 Tf \
2532              /Span <</ActualText (X)>> BDC \
2533                /Artifact BMC (in) Tj EMC \
2534                (out) Tj \
2535              EMC \
2536              (after) Tj ET",
2537        );
2538        let blocks = extract_text(&doc);
2539        assert_eq!(blocks.len(), 3);
2540        assert_eq!(blocks[0].actual_text.as_deref(), Some("X"));
2541        assert_eq!(blocks[1].actual_text.as_deref(), Some("X"));
2542        assert_eq!(blocks[2].actual_text, None);
2543    }
2544
2545    /// Build a one-page Document whose page invokes a Form XObject (via
2546    /// `/Fm0 Do`) wrapped in a `BDC` / `EMC` pair. `xobj_content` is the
2547    /// content stream placed inside the Form XObject; `page_pre` runs
2548    /// before the `Do`, and `page_post` runs after it (still inside the
2549    /// surrounding `BDC` / `EMC`). Used to drive issue #1358 regression.
2550    fn make_doc_with_xobj_inside_bdc(
2551        page_pre: &[u8],
2552        xobj_content: &[u8],
2553        page_post: &[u8],
2554        actual_text: &str,
2555    ) -> Document {
2556        let mut doc = Document::with_version("1.7");
2557
2558        let xobj_dict = dictionary! {
2559            "Type" => "XObject",
2560            "Subtype" => "Form",
2561            "BBox" => vec![0.into(), 0.into(), 100.into(), 100.into()],
2562        };
2563        let xobj_stream = Stream::new(xobj_dict, xobj_content.to_vec());
2564        let xobj_id = doc.add_object(Object::Stream(xobj_stream));
2565
2566        // Page content: BT /F1 12 Tf /Span <</ActualText(...)>> BDC <pre> /Fm0 Do <post> EMC ET
2567        let mut page_content = Vec::<u8>::new();
2568        page_content.extend_from_slice(b"BT /F1 12 Tf /Span <</ActualText (");
2569        page_content.extend_from_slice(actual_text.as_bytes());
2570        page_content.extend_from_slice(b")>> BDC ");
2571        page_content.extend_from_slice(page_pre);
2572        page_content.extend_from_slice(b" /Fm0 Do ");
2573        page_content.extend_from_slice(page_post);
2574        page_content.extend_from_slice(b" EMC ET");
2575
2576        let content_stream = Stream::new(dictionary! {}, page_content);
2577        let content_id = doc.add_object(Object::Stream(content_stream));
2578
2579        let resources = dictionary! {
2580            "XObject" => dictionary! { "Fm0" => Object::Reference(xobj_id) },
2581        };
2582        let resources_id = doc.add_object(Object::Dictionary(resources));
2583
2584        let page_dict = dictionary! {
2585            "Type" => "Page",
2586            "MediaBox" => vec![0.into(), 0.into(), 612.into(), 792.into()],
2587            "Contents" => Object::Reference(content_id),
2588            "Resources" => Object::Reference(resources_id),
2589        };
2590        let page_id = doc.add_object(Object::Dictionary(page_dict));
2591
2592        let pages_dict = dictionary! {
2593            "Type" => "Pages",
2594            "Kids" => vec![Object::Reference(page_id)],
2595            "Count" => 1_i64,
2596        };
2597        let pages_id = doc.add_object(Object::Dictionary(pages_dict));
2598
2599        if let Ok(Object::Dictionary(ref mut d)) = doc.get_object_mut(page_id) {
2600            d.set("Parent", Object::Reference(pages_id));
2601        }
2602
2603        let catalog = dictionary! {
2604            "Type" => "Catalog",
2605            "Pages" => Object::Reference(pages_id),
2606        };
2607        let catalog_id = doc.add_object(Object::Dictionary(catalog));
2608        doc.trailer.set("Root", Object::Reference(catalog_id));
2609
2610        doc
2611    }
2612
2613    /// Regression guard for issue #1358: when a `Do` operator invoking a
2614    /// Form XObject sits inside a `BDC` / `EMC` pair carrying
2615    /// `/ActualText`, text emitted by the XObject's own `Tj`/`TJ`
2616    /// operators must inherit that `/ActualText` value. Previously the
2617    /// recursive `extract_blocks_from_ops_inner` reset the marked-content
2618    /// stack and lost the surrounding context.
2619    #[test]
2620    fn actual_text_propagates_into_form_xobject_recursion() {
2621        let doc = make_doc_with_xobj_inside_bdc(
2622            b"(pre) Tj",
2623            b"BT /F1 12 Tf (inside) Tj ET",
2624            b"(post) Tj",
2625            "wrapped",
2626        );
2627        let blocks = extract_text(&doc);
2628        assert_eq!(blocks.len(), 3);
2629        assert_eq!(blocks[0].text, "pre");
2630        assert_eq!(blocks[0].actual_text.as_deref(), Some("wrapped"));
2631        // The block emitted inside the Form XObject's BT/ET must inherit
2632        // the surrounding /ActualText — not silently become None.
2633        assert_eq!(blocks[1].text, "inside");
2634        assert_eq!(
2635            blocks[1].actual_text.as_deref(),
2636            Some("wrapped"),
2637            "Form XObject text lost surrounding /ActualText (issue #1358)"
2638        );
2639        assert_eq!(blocks[2].text, "post");
2640        assert_eq!(blocks[2].actual_text.as_deref(), Some("wrapped"));
2641    }
2642
2643    /// Inverse regression: when a `Do` invokes a Form XObject whose own
2644    /// content has its own `BDC` ... `EMC` pair with a different
2645    /// `/ActualText`, the inner pair takes precedence inside the XObject
2646    /// and the outer pair resumes after the inner `EMC`. This guards
2647    /// against the inherited binding clobbering legitimate inner
2648    /// marked-content state.
2649    #[test]
2650    fn actual_text_inner_xobj_bdc_overrides_inherited() {
2651        let doc = make_doc_with_xobj_inside_bdc(
2652            b"",
2653            b"BT /F1 12 Tf \
2654              /Span <</ActualText (inner)>> BDC (B) Tj EMC \
2655              (after) Tj ET",
2656            b"",
2657            "outer",
2658        );
2659        let blocks = extract_text(&doc);
2660        assert_eq!(blocks.len(), 2);
2661        assert_eq!(blocks[0].text, "B");
2662        assert_eq!(blocks[0].actual_text.as_deref(), Some("inner"));
2663        assert_eq!(blocks[1].text, "after");
2664        assert_eq!(blocks[1].actual_text.as_deref(), Some("outer"));
2665    }
2666
2667    /// `/ActualText` encoded as UTF-16BE (with BOM) decodes to the proper
2668    /// Unicode string. Real PDFs almost always use this form.
2669    #[test]
2670    fn actual_text_utf16be_bom_decodes() {
2671        // <FEFF00660069> = UTF-16BE BOM + "fi"
2672        let doc = make_doc_with_text(
2673            b"BT /F1 12 Tf /Span <</ActualText <FEFF00660069> >> BDC (X) Tj EMC ET",
2674        );
2675        let blocks = extract_text(&doc);
2676        assert_eq!(blocks.len(), 1);
2677        assert_eq!(blocks[0].actual_text.as_deref(), Some("fi"));
2678    }
2679
2680    // -- M4-LIG-01: ligature decomposition -------------------------------
2681
2682    #[test]
2683    fn ligature_ff_decomposes() {
2684        assert_eq!(decompose_ligature_char('\u{FB00}'), Some("ff"));
2685        assert_eq!(decompose_ligatures("\u{FB00}"), "ff");
2686        assert_eq!(decompose_ligatures("o\u{FB00}ice"), "office");
2687    }
2688
2689    #[test]
2690    fn ligature_fi_decomposes() {
2691        assert_eq!(decompose_ligature_char('\u{FB01}'), Some("fi"));
2692        assert_eq!(decompose_ligatures("\u{FB01}"), "fi");
2693        assert_eq!(decompose_ligatures("of\u{FB01}ce"), "office");
2694    }
2695
2696    #[test]
2697    fn ligature_fl_decomposes() {
2698        assert_eq!(decompose_ligature_char('\u{FB02}'), Some("fl"));
2699        assert_eq!(decompose_ligatures("\u{FB02}ame"), "flame");
2700    }
2701
2702    #[test]
2703    fn ligature_ffi_decomposes() {
2704        assert_eq!(decompose_ligature_char('\u{FB03}'), Some("ffi"));
2705        assert_eq!(decompose_ligatures("o\u{FB03}ce"), "office");
2706    }
2707
2708    #[test]
2709    fn ligature_ffl_decomposes() {
2710        assert_eq!(decompose_ligature_char('\u{FB04}'), Some("ffl"));
2711        assert_eq!(decompose_ligatures("ba\u{FB04}e"), "baffle");
2712    }
2713
2714    /// `st` has both U+FB05 (long-s + t, archaic) and U+FB06 (regular st);
2715    /// both decompose to "st" — historical spelling is not preserved since
2716    /// the goal is text-search/copy-paste, not typographic round-trip.
2717    /// Glyph name `st` also routes through this codepoint.
2718    #[test]
2719    fn ligature_st_decomposes() {
2720        assert_eq!(decompose_ligature_char('\u{FB06}'), Some("st"));
2721        assert_eq!(decompose_ligature_char('\u{FB05}'), Some("st"));
2722        assert_eq!(decompose_ligatures("fa\u{FB06}"), "fast");
2723        // Glyph name path: `st` → U+FB06 → "st"
2724        assert_eq!(glyph_name_to_unicode("st"), Some('\u{FB06}'));
2725    }
2726
2727    /// `ct` has no precomposed Unicode codepoint, so `glyph_name_to_unicode`
2728    /// reports `None`; the encoding-map builder records the slot in the
2729    /// `ct_codes` side-table instead. Decoding emits one internal marker
2730    /// scalar for that code, then the ligature layer expands only markers
2731    /// whose origin came from `ct_codes`. This avoids clobbering legitimate
2732    /// U+E007 values arriving through ToUnicode CMaps.
2733    #[test]
2734    fn ligature_ct_via_glyph_name_emits_string() {
2735        assert_eq!(glyph_name_to_unicode("ct"), None);
2736        // End-to-end: byte 1 → /ct → "ct" in extracted text.
2737        let doc = make_doc_with_ligature_font(b"BT /F1 12 Tf <6101> Tj ET", "ct");
2738        let blocks = extract_text(&doc);
2739        assert_eq!(blocks.len(), 1);
2740        assert_eq!(blocks[0].text, "act");
2741    }
2742
2743    /// Regression guard for issue #1357: a ToUnicode CMap that legitimately
2744    /// maps a code to U+E007 must NOT be silently rewritten to "ct". The
2745    /// previous PUA-sentinel implementation clobbered every U+E007 it saw,
2746    /// regardless of origin.
2747    #[test]
2748    fn tounicode_pua_e007_is_preserved() {
2749        let doc = make_doc_with_tounicode_cmap(b"BT /F1 12 Tf <01> Tj ET", 0x01, 0xE007);
2750        let blocks = extract_text(&doc);
2751        assert_eq!(blocks.len(), 1);
2752        assert_eq!(blocks[0].text, "\u{E007}");
2753        assert_ne!(blocks[0].text, "ct");
2754    }
2755
2756    /// A `/ct` glyph must advance the text matrix as one glyph even though
2757    /// it expands to two extracted characters. Otherwise following glyphs in
2758    /// the same show operator drift right by one extra glyph width.
2759    #[test]
2760    fn ligature_ct_positioned_chars_advance_as_single_glyph() {
2761        // <610162>: a (0x61), ct (0x01 via /Differences), b (0x62)
2762        let doc = make_doc_with_ligature_font(b"BT /F1 12 Tf <610162> Tj ET", "ct");
2763        let chars = extract_positioned_chars(&doc, 1).unwrap();
2764        let extracted: String = chars.iter().map(|c| c.ch).collect();
2765        assert_eq!(extracted, "actb");
2766        assert_eq!(chars.len(), 4);
2767
2768        let char_w = 12.0 * APPROX_CHAR_WIDTH;
2769        assert!((chars[1].bbox[0] - char_w).abs() < 1e-6);
2770        assert!((chars[3].bbox[0] - (2.0 * char_w)).abs() < 1e-6);
2771    }
2772
2773    /// Build a one-page Document with a font whose /Encoding /Differences
2774    /// remaps byte 1 to the given ligature glyph name. Used by the golden
2775    /// "office" fixture below to drive ligature decomposition end-to-end
2776    /// through the text extractor.
2777    fn make_doc_with_ligature_font(content: &[u8], glyph_name: &str) -> Document {
2778        let mut doc = Document::with_version("1.7");
2779
2780        let content_stream = Stream::new(dictionary! {}, content.to_vec());
2781        let content_id = doc.add_object(Object::Stream(content_stream));
2782
2783        let encoding = dictionary! {
2784            "Type" => "Encoding",
2785            "BaseEncoding" => "WinAnsiEncoding",
2786            "Differences" => vec![
2787                1_i64.into(),
2788                Object::Name(glyph_name.as_bytes().to_vec()),
2789            ],
2790        };
2791        let encoding_id = doc.add_object(Object::Dictionary(encoding));
2792
2793        let font = dictionary! {
2794            "Type" => "Font",
2795            "Subtype" => "Type1",
2796            "BaseFont" => "Helvetica",
2797            "Encoding" => Object::Reference(encoding_id),
2798        };
2799        let font_id = doc.add_object(Object::Dictionary(font));
2800
2801        let resources = dictionary! {
2802            "Font" => dictionary! { "F1" => Object::Reference(font_id) },
2803        };
2804        let resources_id = doc.add_object(Object::Dictionary(resources));
2805
2806        let page_dict = dictionary! {
2807            "Type" => "Page",
2808            "MediaBox" => vec![0.into(), 0.into(), 612.into(), 792.into()],
2809            "Contents" => Object::Reference(content_id),
2810            "Resources" => Object::Reference(resources_id),
2811        };
2812        let page_id = doc.add_object(Object::Dictionary(page_dict));
2813
2814        let pages_dict = dictionary! {
2815            "Type" => "Pages",
2816            "Kids" => vec![Object::Reference(page_id)],
2817            "Count" => 1_i64,
2818        };
2819        let pages_id = doc.add_object(Object::Dictionary(pages_dict));
2820
2821        if let Ok(Object::Dictionary(ref mut d)) = doc.get_object_mut(page_id) {
2822            d.set("Parent", Object::Reference(pages_id));
2823        }
2824
2825        let catalog = dictionary! {
2826            "Type" => "Catalog",
2827            "Pages" => Object::Reference(pages_id),
2828        };
2829        let catalog_id = doc.add_object(Object::Dictionary(catalog));
2830        doc.trailer.set("Root", Object::Reference(catalog_id));
2831
2832        doc
2833    }
2834
2835    /// Golden fixture: byte sequence `o + ffi + c + e` rendered through a
2836    /// font whose /Differences remaps byte 1 → /ffi → U+FB03. End-to-end
2837    /// the extractor should produce "office", not "o\u{FB03}ce". This is
2838    /// the single test that exercises the full pipeline: glyph→unicode →
2839    /// encoding map → decode → ligature decomposition.
2840    #[test]
2841    fn ligature_office_golden_ffi() {
2842        // Hex string <6F016365>: o (0x6F), ffi (0x01), c (0x63), e (0x65)
2843        let doc = make_doc_with_ligature_font(b"BT /F1 12 Tf <6F016365> Tj ET", "ffi");
2844        let blocks = extract_text(&doc);
2845        assert_eq!(blocks.len(), 1);
2846        assert_eq!(blocks[0].text, "office");
2847    }
2848
2849    /// Build a one-page Document whose font has a ToUnicode CMap mapping a
2850    /// single 1-byte source code to an explicit Unicode scalar. Used to
2851    /// drive the regression guard for issue #1357: a ToUnicode-supplied
2852    /// codepoint must round-trip through the extractor unchanged.
2853    fn make_doc_with_tounicode_cmap(content: &[u8], src: u8, dst: u32) -> Document {
2854        let mut doc = Document::with_version("1.7");
2855
2856        let content_stream = Stream::new(dictionary! {}, content.to_vec());
2857        let content_id = doc.add_object(Object::Stream(content_stream));
2858
2859        // Minimal ToUnicode CMap with a single bfchar mapping.
2860        let cmap_text = format!(
2861            "/CIDInit /ProcSet findresource begin\n\
2862             12 dict begin\n\
2863             begincmap\n\
2864             /CMapType 2 def\n\
2865             1 beginbfchar\n\
2866             <{:02X}> <{:04X}>\n\
2867             endbfchar\n\
2868             endcmap CMapName currentdict /CMap defineresource pop end end",
2869            src, dst
2870        );
2871        let cmap_stream = Stream::new(dictionary! {}, cmap_text.into_bytes());
2872        let cmap_id = doc.add_object(Object::Stream(cmap_stream));
2873
2874        let font = dictionary! {
2875            "Type" => "Font",
2876            "Subtype" => "Type1",
2877            "BaseFont" => "Helvetica",
2878            "Encoding" => "WinAnsiEncoding",
2879            "ToUnicode" => Object::Reference(cmap_id),
2880        };
2881        let font_id = doc.add_object(Object::Dictionary(font));
2882
2883        let resources = dictionary! {
2884            "Font" => dictionary! { "F1" => Object::Reference(font_id) },
2885        };
2886        let resources_id = doc.add_object(Object::Dictionary(resources));
2887
2888        let page_dict = dictionary! {
2889            "Type" => "Page",
2890            "MediaBox" => vec![0.into(), 0.into(), 612.into(), 792.into()],
2891            "Contents" => Object::Reference(content_id),
2892            "Resources" => Object::Reference(resources_id),
2893        };
2894        let page_id = doc.add_object(Object::Dictionary(page_dict));
2895
2896        let pages_dict = dictionary! {
2897            "Type" => "Pages",
2898            "Kids" => vec![Object::Reference(page_id)],
2899            "Count" => 1_i64,
2900        };
2901        let pages_id = doc.add_object(Object::Dictionary(pages_dict));
2902
2903        if let Ok(Object::Dictionary(ref mut d)) = doc.get_object_mut(page_id) {
2904            d.set("Parent", Object::Reference(pages_id));
2905        }
2906
2907        let catalog = dictionary! {
2908            "Type" => "Catalog",
2909            "Pages" => Object::Reference(pages_id),
2910        };
2911        let catalog_id = doc.add_object(Object::Dictionary(catalog));
2912        doc.trailer.set("Root", Object::Reference(catalog_id));
2913
2914        doc
2915    }
2916
2917    /// PositionedChar: a ligature glyph splits its bbox evenly across the
2918    /// constituent characters so character offsets stay within the glyph's
2919    /// horizontal footprint. The matrix advances exactly once per glyph,
2920    /// not once per constituent — so layout is preserved.
2921    #[test]
2922    fn ligature_positioned_chars_split_bbox() {
2923        let doc = make_doc_with_ligature_font(b"BT /F1 12 Tf <01> Tj ET", "ffi");
2924        let chars = extract_positioned_chars(&doc, 1).unwrap();
2925        assert_eq!(chars.len(), 3, "ffi → 3 chars");
2926        assert_eq!(chars[0].ch, 'f');
2927        assert_eq!(chars[1].ch, 'f');
2928        assert_eq!(chars[2].ch, 'i');
2929        // Bboxes should tile horizontally.
2930        assert!(chars[0].bbox[2] <= chars[1].bbox[0] + 1e-9);
2931        assert!(chars[1].bbox[2] <= chars[2].bbox[0] + 1e-9);
2932        // And total width should equal one glyph's char_w.
2933        let total = chars[2].bbox[2] - chars[0].bbox[0];
2934        let expected = 12.0 * APPROX_CHAR_WIDTH;
2935        assert!((total - expected).abs() < 1e-6);
2936    }
2937
2938    // -- M4-FOLLOWUP-03: PositionedChar decomposition + Tj bbox geometry ---
2939
2940    /// Regression guard for issue #1359 (problem 1): ligatures outside the
2941    /// hardcoded Latin set must decompose in PositionedChar output the same
2942    /// way they decompose in extracted text. Previously
2943    /// `push_glyph_positioned` only ran the hardcoded Latin map while
2944    /// extracted text additionally applied an NFKD fallback over
2945    /// FB00..FB4F, leaving counts misaligned for Armenian/Hebrew/Arabic
2946    /// presentation forms.
2947    #[test]
2948    fn positioned_chars_match_extracted_text_for_armenian_ligature() {
2949        // U+FB13 (Armenian small ligature men now) NFKDs to U+0574 U+0576.
2950        // The font's /Differences slot 1 maps to glyph name uniFB13.
2951        let doc = make_doc_with_ligature_font(b"BT /F1 12 Tf <01> Tj ET", "uniFB13");
2952        let blocks = extract_text(&doc);
2953        let chars = extract_positioned_chars(&doc, 1).unwrap();
2954        assert_eq!(blocks.len(), 1);
2955        // Both paths must agree on the constituent count.
2956        assert_eq!(
2957            blocks[0].text.chars().count(),
2958            chars.len(),
2959            "PositionedChar count drifted from extract_text() char count \
2960             (issue #1359 problem 1): blocks[0].text = {:?}, chars = {:?}",
2961            blocks[0].text,
2962            chars.iter().map(|c| c.ch).collect::<String>(),
2963        );
2964        // And on the actual characters.
2965        let chars_str: String = chars.iter().map(|c| c.ch).collect();
2966        assert_eq!(blocks[0].text, chars_str);
2967        // Total bbox span equals one glyph's char_w (the matrix advanced once).
2968        let total = chars.last().unwrap().bbox[2] - chars.first().unwrap().bbox[0];
2969        let expected = 12.0 * APPROX_CHAR_WIDTH;
2970        assert!((total - expected).abs() < 1e-6);
2971    }
2972
2973    /// Hebrew presentation forms may NFKD-map to a single base Hebrew
2974    /// codepoint. Those mappings are still semantic decompositions and must
2975    /// not be dropped just because the output has one scalar.
2976    #[test]
2977    fn hebrew_presentation_form_extracts_single_codepoint_nfkd_mapping() {
2978        // U+FB21 (Hebrew letter wide alef) NFKDs to U+05D0.
2979        let doc = make_doc_with_ligature_font(b"BT /F1 12 Tf <01> Tj ET", "uniFB21");
2980        let blocks = extract_text(&doc);
2981        assert_eq!(blocks.len(), 1);
2982        assert_eq!(blocks[0].text, "\u{05D0}");
2983        assert_eq!(
2984            decompose_glyph_to_string('\u{FB21}').as_deref(),
2985            Some("\u{05D0}")
2986        );
2987    }
2988
2989    /// The positioned-character stream must expose the same text characters
2990    /// as extract_text for single-codepoint Hebrew presentation-form
2991    /// decompositions.
2992    #[test]
2993    fn positioned_chars_match_extracted_text_for_hebrew_presentation_form() {
2994        let doc = make_doc_with_ligature_font(b"BT /F1 12 Tf <01> Tj ET", "uniFB21");
2995        let blocks = extract_text(&doc);
2996        let chars = extract_positioned_chars(&doc, 1).unwrap();
2997        let chars_str: String = chars.iter().map(|c| c.ch).collect();
2998
2999        assert_eq!(blocks.len(), 1);
3000        assert_eq!(blocks[0].text, "\u{05D0}");
3001        assert_eq!(chars_str, blocks[0].text);
3002        assert_eq!(blocks[0].text.chars().count(), chars.len());
3003    }
3004
3005    /// Regression guard for issue #1359 (problem 2): a Tj operator that
3006    /// emits a single ligature glyph must produce a TextBlock whose bbox
3007    /// width equals one rendered glyph's footprint, not the byte length
3008    /// of the decomposed display string. Previously `display_text.len()`
3009    /// over-counted because ligature expansion grows the string while
3010    /// the text matrix advances exactly once per source glyph.
3011    #[test]
3012    fn tj_block_width_matches_rendered_glyph_count() {
3013        let doc = make_doc_with_ligature_font(b"BT /F1 12 Tf <01> Tj ET", "ffi");
3014        let blocks = extract_text(&doc);
3015        assert_eq!(blocks.len(), 1);
3016        assert_eq!(blocks[0].text, "ffi");
3017        let width = blocks[0].bbox[2] - blocks[0].bbox[0];
3018        let expected = 12.0 * APPROX_CHAR_WIDTH;
3019        assert!(
3020            (width - expected).abs() < 1e-6,
3021            "Tj bbox width {} != one glyph's char_w {} (issue #1359 problem 2)",
3022            width,
3023            expected,
3024        );
3025    }
3026
3027    /// Cross-operator consistency: a `Tj` block and a `TJ` block emitting
3028    /// the same single-glyph ligature must produce TextBlocks with equal
3029    /// bbox widths. Pre-fix, Tj used `display_text.len()` (3 for "ffi")
3030    /// and TJ used `x_end - x_start` (1 glyph advance) — so the same input
3031    /// rendered as ~3× wider in Tj than in TJ.
3032    #[test]
3033    fn tj_and_tj_array_bbox_widths_agree_for_ligature() {
3034        let tj_doc = make_doc_with_ligature_font(b"BT /F1 12 Tf <01> Tj ET", "ffi");
3035        let tj_blocks = extract_text(&tj_doc);
3036        let tj_doc_arr = make_doc_with_ligature_font(b"BT /F1 12 Tf [<01>] TJ ET", "ffi");
3037        let tj_arr_blocks = extract_text(&tj_doc_arr);
3038        assert_eq!(tj_blocks.len(), 1);
3039        assert_eq!(tj_arr_blocks.len(), 1);
3040        let tj_w = tj_blocks[0].bbox[2] - tj_blocks[0].bbox[0];
3041        let tj_arr_w = tj_arr_blocks[0].bbox[2] - tj_arr_blocks[0].bbox[0];
3042        assert!(
3043            (tj_w - tj_arr_w).abs() < 1e-6,
3044            "Tj bbox width {} disagrees with TJ bbox width {} for the same \
3045             single-glyph ligature input (issue #1359 problem 2)",
3046            tj_w,
3047            tj_arr_w,
3048        );
3049    }
3050}
pdfluent_extract/text.rs

pdfluent_extract/
text.rs