pdfluent_extract/
text.rs

1//! Text extraction with character-level position tracking.
2//!
3//! Parses content stream text operators (Tj, TJ, Tm, Td, TD, T*, Tc, Tw, Tz, TL, Ts, ', ")
4//! to extract text with positional information.
5
6use crate::error::{ExtractError, Result};
7use lopdf::content::{Content, Operation};
8use lopdf::{Document, Object, ObjectId};
9use std::collections::HashMap;
10use std::sync::OnceLock;
11
/// Rough estimate of one character's width, expressed as a fraction of the
/// font size (0.5 means half the font size per character).
const APPROX_CHAR_WIDTH: f64 = 0.5;
14
/// A run of text extracted from one page, with its placement and font
/// attributes.
#[derive(Clone, Debug)]
pub struct TextBlock {
    /// Text content of the block.
    pub text: String,
    /// Page the block was found on; numbering starts at 1.
    pub page: u32,
    /// Bounding box as `[x0, y0, x1, y1]`, in PDF coordinates.
    pub bbox: [f64; 4],
    /// Name of the font used for this block.
    pub font_name: String,
    /// Font size, in points.
    pub font_size: f64,
    /// `/ActualText` override taken from a surrounding marked-content
    /// sequence (BDC), when present. PDF/UA-compliant authors supply it as
    /// the canonical reading-order text for ligatures and other glyph
    /// clusters whose visual `text` differs from the intended characters.
    pub actual_text: Option<String>,
}
34
/// One character together with where it sits on the page.
#[derive(Clone, Debug)]
pub struct PositionedChar {
    /// The character itself.
    pub ch: char,
    /// Page the character was found on; numbering starts at 1.
    pub page: u32,
    /// Bounding box as `[x0, y0, x1, y1]`, in PDF coordinates.
    pub bbox: [f64; 4],
}
45
/// Snapshot of the graphics state kept for save/restore (`q` / `Q`).
#[derive(Clone, Debug)]
struct GraphicsState {
    /// Transformation matrix captured in this snapshot (six coefficients).
    ctm: [f64; 6],
}
51
52/// Internal text state tracker.
53#[derive(Debug, Clone)]
54struct TextState {
55    /// Text matrix.
56    tm: [f64; 6],
57    /// Text line matrix.
58    tlm: [f64; 6],
59    /// Current font name.
60    font_name: String,
61    /// Current font size.
62    font_size: f64,
63    /// Character spacing (Tc).
64    tc: f64,
65    /// Word spacing (Tw).
66    tw: f64,
67    /// Horizontal scaling (Tz), as a percentage.
68    th: f64,
69    /// Text leading (TL).
70    tl: f64,
71    /// Text rise (Ts).
72    ts: f64,
73    /// Graphics state stack.
74    gs_stack: Vec<GraphicsState>,
75    /// Current transformation matrix.
76    ctm: [f64; 6],
77}
78
79impl Default for TextState {
80    fn default() -> Self {
81        Self {
82            tm: [1.0, 0.0, 0.0, 1.0, 0.0, 0.0],
83            tlm: [1.0, 0.0, 0.0, 1.0, 0.0, 0.0],
84            font_name: String::new(),
85            font_size: 12.0,
86            tc: 0.0,
87            tw: 0.0,
88            th: 100.0,
89            tl: 0.0,
90            ts: 0.0,
91            gs_stack: Vec::new(),
92            ctm: [1.0, 0.0, 0.0, 1.0, 0.0, 0.0],
93        }
94    }
95}
96
/// Everything needed to decode the character codes of one font.
#[derive(Clone)]
struct FontInfo {
    /// Set when the font uses 2-byte CID encoding (Identity-H/V or another
    /// CID CMap).
    is_cid: bool,
    /// ToUnicode CMap, mapping character codes to Unicode strings. CID fonts
    /// key it by the 2-byte big-endian value, simple fonts by the 1-byte code.
    to_unicode: HashMap<u32, String>,
    /// Code→char table for simple fonts, derived from BaseEncoding plus
    /// Differences. Indexed by byte code (0..255); `None` means unknown.
    encoding_map: [Option<char>; 256],
    /// Flags the codes whose `Differences` glyph name is `ct`.
    ///
    /// The `ct` ligature has no precomposed Unicode codepoint, so it cannot
    /// be stored as a `char` in `encoding_map`. For a flagged code, decoding
    /// emits a single private-use marker scalar and records where that
    /// marker came from, which keeps the glyph advance one codepoint wide.
    /// The marker is expanded to "ct" only in the ligature-decomposition
    /// layer, so legitimate U+E007 values arriving through ToUnicode CMaps
    /// are not clobbered.
    ct_codes: [bool; 256],
}
117
118/// Build a map from font resource name (e.g. "F1") to FontInfo for a page.
119fn build_font_map(doc: &Document, page_id: ObjectId) -> HashMap<String, FontInfo> {
120    let mut map = HashMap::new();
121
122    // Get the page's Resources dictionary (may be inherited from parent Pages node).
123    let resources = get_page_resources(doc, page_id);
124    let font_dict = match resources.and_then(|res| match res.get(b"Font").ok()? {
125        Object::Dictionary(d) => Some(d.clone()),
126        Object::Reference(r) => match doc.get_object(*r).ok()? {
127            Object::Dictionary(d) => Some(d.clone()),
128            _ => None,
129        },
130        _ => None,
131    }) {
132        Some(d) => d,
133        None => return map,
134    };
135
136    for (name_bytes, value) in font_dict.iter() {
137        let font_name = String::from_utf8_lossy(name_bytes).to_string();
138
139        // Resolve the font dictionary.
140        let font = match value {
141            Object::Reference(r) => match doc.get_object(*r).ok() {
142                Some(Object::Dictionary(d)) => d.clone(),
143                _ => continue,
144            },
145            Object::Dictionary(d) => d.clone(),
146            _ => continue,
147        };
148
149        let subtype = font
150            .get(b"Subtype")
151            .ok()
152            .and_then(|o| match o {
153                Object::Name(n) => Some(String::from_utf8_lossy(n).to_string()),
154                _ => None,
155            })
156            .unwrap_or_default();
157
158        let is_cid = subtype == "Type0";
159
160        // Parse ToUnicode CMap if present.
161        let to_unicode = parse_to_unicode_from_font(doc, &font);
162
163        // For Type0 fonts, also check DescendantFonts for ToUnicode.
164        let to_unicode = if to_unicode.is_empty() && is_cid {
165            if let Ok(Object::Array(descendants)) = font.get(b"DescendantFonts") {
166                descendants
167                    .iter()
168                    .find_map(|d| {
169                        let desc_dict = match d {
170                            Object::Reference(r) => match doc.get_object(*r).ok()? {
171                                Object::Dictionary(d) => d,
172                                _ => return None,
173                            },
174                            Object::Dictionary(d) => d,
175                            _ => return None,
176                        };
177                        let tu = parse_to_unicode_from_font(doc, desc_dict);
178                        if tu.is_empty() {
179                            None
180                        } else {
181                            Some(tu)
182                        }
183                    })
184                    .unwrap_or_default()
185            } else {
186                HashMap::new()
187            }
188        } else {
189            to_unicode
190        };
191
192        // Build encoding map for simple fonts (from Encoding + Differences).
193        let (encoding_map, ct_codes) = if !is_cid {
194            build_encoding_map(doc, &font)
195        } else {
196            ([None; 256], [false; 256])
197        };
198
199        map.insert(
200            font_name,
201            FontInfo {
202                is_cid,
203                to_unicode,
204                encoding_map,
205                ct_codes,
206            },
207        );
208    }
209
210    map
211}
212
213/// Parse the ToUnicode CMap from a font dictionary.
214fn parse_to_unicode_from_font(doc: &Document, font: &lopdf::Dictionary) -> HashMap<u32, String> {
215    let tu_obj = match font.get(b"ToUnicode").ok() {
216        Some(Object::Reference(r)) => doc.get_object(*r).ok(),
217        Some(obj) => Some(obj),
218        None => return HashMap::new(),
219    };
220
221    let stream_bytes = match tu_obj {
222        Some(Object::Stream(ref s)) => s
223            .decompressed_content()
224            .ok()
225            .unwrap_or_else(|| s.content.clone()),
226        _ => return HashMap::new(),
227    };
228
229    parse_to_unicode_cmap(&stream_bytes)
230}
231
232/// Parse a ToUnicode CMap stream into a code→Unicode mapping.
233///
234/// Handles both `beginbfchar` and `beginbfrange` sections.
235fn parse_to_unicode_cmap(data: &[u8]) -> HashMap<u32, String> {
236    let text = String::from_utf8_lossy(data);
237    let mut map = HashMap::new();
238
239    // Parse beginbfchar sections: <srcCode> <dstString>
240    for section in text.split("beginbfchar") {
241        let section = match section.split("endbfchar").next() {
242            Some(s) => s,
243            None => continue,
244        };
245        let tokens = extract_hex_tokens(section);
246        for pair in tokens.chunks(2) {
247            if pair.len() == 2 {
248                let code = parse_hex_u32(&pair[0]);
249                let unicode = hex_to_unicode_string(&pair[1]);
250                map.insert(code, unicode);
251            }
252        }
253    }
254
255    // Parse beginbfrange sections: <srcLo> <srcHi> <dstStart> or <srcLo> <srcHi> [<dst1> <dst2> ...]
256    for section in text.split("beginbfrange") {
257        let section = match section.split("endbfrange").next() {
258            Some(s) => s,
259            None => continue,
260        };
261
262        // Tokenize: extract hex tokens and array brackets
263        let mut chars = section.chars().peekable();
264        let mut tokens: Vec<String> = Vec::new();
265        let mut arrays: Vec<Vec<String>> = Vec::new();
266        let mut in_array = false;
267        let mut current_array: Vec<String> = Vec::new();
268
269        while let Some(&ch) = chars.peek() {
270            if ch == '<' {
271                chars.next();
272                let hex: String = chars
273                    .by_ref()
274                    .take_while(|&c| c != '>')
275                    .filter(|c| !c.is_whitespace())
276                    .collect();
277                if in_array {
278                    current_array.push(hex);
279                } else {
280                    tokens.push(hex);
281                }
282            } else if ch == '[' {
283                chars.next();
284                in_array = true;
285                current_array = Vec::new();
286            } else if ch == ']' {
287                chars.next();
288                in_array = false;
289                arrays.push(std::mem::take(&mut current_array));
290                tokens.push(String::new()); // placeholder for array position
291            } else {
292                chars.next();
293            }
294        }
295
296        // Process range entries: every 3 tokens = (lo, hi, dst_or_array)
297        let mut array_idx = 0;
298        let mut i = 0;
299        while i + 2 < tokens.len() {
300            let lo = parse_hex_u32(&tokens[i]);
301            let hi = parse_hex_u32(&tokens[i + 1]);
302
303            if tokens[i + 2].is_empty() {
304                // Array destination
305                if array_idx < arrays.len() {
306                    let arr = &arrays[array_idx];
307                    for (offset, dst) in arr.iter().enumerate() {
308                        let code = lo + offset as u32;
309                        if code <= hi {
310                            map.insert(code, hex_to_unicode_string(dst));
311                        }
312                    }
313                    array_idx += 1;
314                }
315            } else {
316                // Single start value — increment for each code in range
317                let dst_start = parse_hex_u32(&tokens[i + 2]);
318                let dst_len = tokens[i + 2].len();
319                for code in lo..=hi {
320                    let dst_val = dst_start + (code - lo);
321                    let s = if dst_len <= 4 {
322                        // BMP character
323                        char::from_u32(dst_val)
324                            .map(|c| c.to_string())
325                            .unwrap_or_default()
326                    } else {
327                        // Multi-byte: treat as UTF-16BE pairs
328                        let hex = format!("{:0>width$X}", dst_val, width = dst_len);
329                        hex_to_unicode_string(&hex)
330                    };
331                    map.insert(code, s);
332                }
333            }
334            i += 3;
335        }
336    }
337
338    map
339}
340
/// Collect the contents of every `<…>` group in `text`, in order.
///
/// Whitespace inside a group is dropped. A `<` seen while a group is open
/// restarts that group, and a group still open at the end of `text` is
/// discarded.
fn extract_hex_tokens(text: &str) -> Vec<String> {
    let mut found = Vec::new();
    // `Some(buffer)` while inside `<…>`, `None` otherwise.
    let mut open: Option<String> = None;

    for c in text.chars() {
        match c {
            '<' => open = Some(String::new()),
            '>' => {
                if let Some(token) = open.take() {
                    found.push(token);
                }
            }
            _ if c.is_whitespace() => {}
            _ => {
                if let Some(buffer) = open.as_mut() {
                    buffer.push(c);
                }
            }
        }
    }

    found
}
359
/// Read a hexadecimal string as a `u32` (e.g. "0041" → 65).
///
/// Anything that does not parse, including the empty string and values too
/// large for 32 bits, yields 0.
fn parse_hex_u32(hex: &str) -> u32 {
    u32::from_str_radix(hex, 16).unwrap_or_default()
}
364
365/// Convert a hex string to a Unicode string (interpreting as UTF-16BE pairs).
366fn hex_to_unicode_string(hex: &str) -> String {
367    let bytes: Vec<u8> = (0..hex.len())
368        .step_by(2)
369        .filter_map(|i| u8::from_str_radix(&hex[i..i + 2.min(hex.len() - i)], 16).ok())
370        .collect();
371
372    if bytes.len() >= 2 && bytes.len().is_multiple_of(2) {
373        // Interpret as UTF-16BE
374        let u16s: Vec<u16> = bytes
375            .chunks(2)
376            .map(|c| u16::from_be_bytes([c[0], c[1]]))
377            .collect();
378        String::from_utf16_lossy(&u16s)
379    } else if bytes.len() == 1 {
380        char::from_u32(bytes[0] as u32)
381            .map(|c| c.to_string())
382            .unwrap_or_default()
383    } else {
384        String::new()
385    }
386}
387
388/// Get the Resources dictionary for a page (resolves inheritance from parent Pages).
389fn get_page_resources(doc: &Document, page_id: ObjectId) -> Option<lopdf::Dictionary> {
390    let page = match doc.get_object(page_id).ok()? {
391        Object::Dictionary(d) => d.clone(),
392        _ => return None,
393    };
394
395    // Direct Resources on the page.
396    if let Some(res) = resolve_dict(doc, &page, b"Resources") {
397        return Some(res);
398    }
399
400    // Walk up the parent chain (Pages nodes) for inherited Resources.
401    let mut current = page;
402    for _ in 0..20 {
403        let parent_ref = match current.get(b"Parent").ok()? {
404            Object::Reference(r) => *r,
405            _ => break,
406        };
407        let parent = match doc.get_object(parent_ref).ok()? {
408            Object::Dictionary(d) => d.clone(),
409            _ => break,
410        };
411        if let Some(res) = resolve_dict(doc, &parent, b"Resources") {
412            return Some(res);
413        }
414        current = parent;
415    }
416
417    None
418}
419
420/// Resolve a dictionary entry that may be inline or an indirect reference.
421fn resolve_dict(doc: &Document, dict: &lopdf::Dictionary, key: &[u8]) -> Option<lopdf::Dictionary> {
422    match dict.get(key).ok()? {
423        Object::Dictionary(d) => Some(d.clone()),
424        Object::Reference(r) => match doc.get_object(*r).ok()? {
425            Object::Dictionary(d) => Some(d.clone()),
426            _ => None,
427        },
428        _ => None,
429    }
430}
431
432/// Build a 256-entry encoding map from a font's /Encoding dictionary.
433///
434/// Handles:
435/// - Named base encodings: WinAnsiEncoding, MacRomanEncoding, MacExpertEncoding
436/// - Differences arrays: `[code1 /name1 /name2 ... codeN /nameN ...]`
437/// - Glyph name → Unicode via AGL (Adobe Glyph List) lookup
438fn build_encoding_map(
439    doc: &Document,
440    font: &lopdf::Dictionary,
441) -> ([Option<char>; 256], [bool; 256]) {
442    let mut table = [None::<char>; 256];
443    let mut ct_codes = [false; 256];
444
445    let encoding = match font.get(b"Encoding").ok() {
446        Some(obj) => obj,
447        None => return (table, ct_codes),
448    };
449
450    match encoding {
451        Object::Name(name) => {
452            // Named encoding (e.g. "WinAnsiEncoding").
453            let name_str = String::from_utf8_lossy(name);
454            apply_base_encoding(&mut table, &name_str);
455        }
456        Object::Reference(r) => match doc.get_object(*r) {
457            Ok(Object::Dictionary(enc_dict)) => {
458                parse_encoding_dict(doc, enc_dict, &mut table, &mut ct_codes);
459            }
460            Ok(Object::Name(name)) => {
461                let name_str = String::from_utf8_lossy(name);
462                apply_base_encoding(&mut table, &name_str);
463            }
464            _ => {}
465        },
466        Object::Dictionary(enc_dict) => {
467            parse_encoding_dict(doc, enc_dict, &mut table, &mut ct_codes);
468        }
469        _ => {}
470    }
471
472    (table, ct_codes)
473}
474
475/// Parse an Encoding dictionary with optional BaseEncoding and Differences.
476fn parse_encoding_dict(
477    doc: &Document,
478    enc_dict: &lopdf::Dictionary,
479    table: &mut [Option<char>; 256],
480    ct_codes: &mut [bool; 256],
481) {
482    // Apply BaseEncoding first.
483    if let Ok(Object::Name(base)) = enc_dict.get(b"BaseEncoding") {
484        let base_str = String::from_utf8_lossy(base);
485        apply_base_encoding(table, &base_str);
486    }
487
488    // Apply Differences array: [code /name1 /name2 ... code /name3 ...]
489    let diffs = match enc_dict.get(b"Differences").ok() {
490        Some(Object::Array(arr)) => arr.clone(),
491        Some(Object::Reference(r)) => match doc.get_object(*r).ok() {
492            Some(Object::Array(arr)) => arr.clone(),
493            _ => return,
494        },
495        _ => return,
496    };
497
498    let mut code: Option<u32> = None;
499    for item in &diffs {
500        match item {
501            Object::Integer(n) => {
502                code = Some(*n as u32);
503            }
504            Object::Name(name) => {
505                if let Some(c) = code {
506                    if c < 256 {
507                        let glyph = String::from_utf8_lossy(name);
508                        apply_glyph_name(&glyph, c as usize, table, ct_codes);
509                    }
510                    code = Some(c + 1);
511                }
512            }
513            Object::Reference(r) => {
514                // Indirect name reference (rare).
515                if let Ok(Object::Name(name)) = doc.get_object(*r) {
516                    if let Some(c) = code {
517                        if c < 256 {
518                            let glyph = String::from_utf8_lossy(name);
519                            apply_glyph_name(&glyph, c as usize, table, ct_codes);
520                        }
521                        code = Some(c + 1);
522                    }
523                }
524            }
525            _ => {}
526        }
527    }
528}
529
530/// Resolve a glyph name to either a single Unicode codepoint (stored in
531/// `table`) or the `ct` ligature marker (recorded in `ct_codes`). The `ct`
532/// ligature has no precomposed Unicode codepoint, so it cannot live in
533/// `table`; it is tracked separately so that decoding can emit one internal
534/// marker and preserve one-glyph text advance before later decomposition.
535fn apply_glyph_name(
536    glyph: &str,
537    code: usize,
538    table: &mut [Option<char>; 256],
539    ct_codes: &mut [bool; 256],
540) {
541    if glyph == "ct" {
542        // Clear any prior char mapping at this slot — the ct flag is the
543        // sole source of truth for this code.
544        table[code] = None;
545        ct_codes[code] = true;
546        return;
547    }
548    if let Some(ch) = glyph_name_to_unicode(glyph) {
549        table[code] = Some(ch);
550        ct_codes[code] = false;
551    }
552}
553
554/// Apply a named base encoding to the table.
555fn apply_base_encoding(table: &mut [Option<char>; 256], name: &str) {
556    let source = match name {
557        "WinAnsiEncoding" => winansi_encoding(),
558        "MacRomanEncoding" => mac_roman_encoding(),
559        _ => return,
560    };
561    for (i, &ch) in source.iter().enumerate() {
562        if ch != '\0' {
563            table[i] = Some(ch);
564        }
565    }
566}
567
/// Lazily built WinAnsiEncoding (cp1252) table.
///
/// Indexed by byte code; `'\0'` marks a code without a mapping.
fn winansi_encoding() -> &'static [char; 256] {
    static TABLE: OnceLock<[char; 256]> = OnceLock::new();
    TABLE.get_or_init(|| {
        let mut table = ['\0'; 256];
        for (code, slot) in table.iter_mut().enumerate() {
            *slot = match code {
                // Control characters mapped to their common usage.
                0x09 => '\t',
                0x0A => '\n',
                0x0D => '\r',
                // Printable ASCII and 0xA0-0xFF (Latin-1 supplement) are identity.
                0x20..=0x7E | 0xA0..=0xFF => char::from_u32(code as u32).unwrap_or('\0'),
                // cp1252 additions in 0x80-0x9F.
                0x80 => '\u{20AC}', // Euro sign
                0x82 => '\u{201A}', // Single low-9 quotation mark
                0x83 => '\u{0192}', // Latin small letter f with hook
                0x84 => '\u{201E}', // Double low-9 quotation mark
                0x85 => '\u{2026}', // Horizontal ellipsis
                0x86 => '\u{2020}', // Dagger
                0x87 => '\u{2021}', // Double dagger
                0x88 => '\u{02C6}', // Modifier letter circumflex accent
                0x89 => '\u{2030}', // Per mille sign
                0x8A => '\u{0160}', // Latin capital letter S with caron
                0x8B => '\u{2039}', // Single left-pointing angle quotation mark
                0x8C => '\u{0152}', // Latin capital ligature OE
                0x8E => '\u{017D}', // Latin capital letter Z with caron
                0x91 => '\u{2018}', // Left single quotation mark
                0x92 => '\u{2019}', // Right single quotation mark
                0x93 => '\u{201C}', // Left double quotation mark
                0x94 => '\u{201D}', // Right double quotation mark
                0x95 => '\u{2022}', // Bullet
                0x96 => '\u{2013}', // En dash
                0x97 => '\u{2014}', // Em dash
                0x98 => '\u{02DC}', // Small tilde
                0x99 => '\u{2122}', // Trade mark sign
                0x9A => '\u{0161}', // Latin small letter s with caron
                0x9B => '\u{203A}', // Single right-pointing angle quotation mark
                0x9C => '\u{0153}', // Latin small ligature oe
                0x9E => '\u{017E}', // Latin small letter z with caron
                0x9F => '\u{0178}', // Latin capital letter Y with diaeresis
                // Everything else is undefined.
                _ => '\0',
            };
        }
        table
    })
}
621
/// Lazily built MacRomanEncoding table.
///
/// Indexed by byte code; `'\0'` marks a code without a mapping.
fn mac_roman_encoding() -> &'static [char; 256] {
    static TABLE: OnceLock<[char; 256]> = OnceLock::new();
    TABLE.get_or_init(|| {
        // Unicode values for codes 0x80-0xFF, eight codes per row.
        const UPPER_HALF: [char; 128] = [
            // 0x80
            '\u{00C4}', '\u{00C5}', '\u{00C7}', '\u{00C9}', '\u{00D1}', '\u{00D6}', '\u{00DC}', '\u{00E1}',
            '\u{00E0}', '\u{00E2}', '\u{00E4}', '\u{00E3}', '\u{00E5}', '\u{00E7}', '\u{00E9}', '\u{00E8}',
            // 0x90
            '\u{00EA}', '\u{00EB}', '\u{00ED}', '\u{00EC}', '\u{00EE}', '\u{00EF}', '\u{00F1}', '\u{00F3}',
            '\u{00F2}', '\u{00F4}', '\u{00F6}', '\u{00F5}', '\u{00FA}', '\u{00F9}', '\u{00FB}', '\u{00FC}',
            // 0xA0
            '\u{2020}', '\u{00B0}', '\u{00A2}', '\u{00A3}', '\u{00A7}', '\u{2022}', '\u{00B6}', '\u{00DF}',
            '\u{00AE}', '\u{00A9}', '\u{2122}', '\u{00B4}', '\u{00A8}', '\u{2260}', '\u{00C6}', '\u{00D8}',
            // 0xB0
            '\u{221E}', '\u{00B1}', '\u{2264}', '\u{2265}', '\u{00A5}', '\u{00B5}', '\u{2202}', '\u{2211}',
            '\u{220F}', '\u{03C0}', '\u{222B}', '\u{00AA}', '\u{00BA}', '\u{03A9}', '\u{00E6}', '\u{00F8}',
            // 0xC0
            '\u{00BF}', '\u{00A1}', '\u{00AC}', '\u{221A}', '\u{0192}', '\u{2248}', '\u{2206}', '\u{00AB}',
            '\u{00BB}', '\u{2026}', '\u{00A0}', '\u{00C0}', '\u{00C3}', '\u{00D5}', '\u{0152}', '\u{0153}',
            // 0xD0
            '\u{2013}', '\u{2014}', '\u{201C}', '\u{201D}', '\u{2018}', '\u{2019}', '\u{00F7}', '\u{25CA}',
            '\u{00FF}', '\u{0178}', '\u{2044}', '\u{20AC}', '\u{2039}', '\u{203A}', '\u{FB01}', '\u{FB02}',
            // 0xE0
            '\u{2021}', '\u{00B7}', '\u{201A}', '\u{201E}', '\u{2030}', '\u{00C2}', '\u{00CA}', '\u{00C1}',
            '\u{00CB}', '\u{00C8}', '\u{00CD}', '\u{00CE}', '\u{00CF}', '\u{00CC}', '\u{00D3}', '\u{00D4}',
            // 0xF0
            '\u{F8FF}', '\u{00D2}', '\u{00DA}', '\u{00DB}', '\u{00D9}', '\u{0131}', '\u{02C6}', '\u{02DC}',
            '\u{00AF}', '\u{02D8}', '\u{02D9}', '\u{02DA}', '\u{00B8}', '\u{02DD}', '\u{02DB}', '\u{02C7}',
        ];

        let mut table = ['\0'; 256];
        // Tab, line feed and carriage return map to themselves.
        for &code in b"\t\n\r" {
            table[usize::from(code)] = char::from(code);
        }
        // Printable ASCII is identity.
        for code in 0x20..=0x7Eu8 {
            table[usize::from(code)] = char::from(code);
        }
        table[0x80..].copy_from_slice(&UPPER_HALF);
        table
    })
}
662
663/// Map an Adobe Glyph List (AGL) name to a Unicode character.
664///
665/// Handles:
666/// - `uniXXXX` names (4+ hex digits after "uni")
667/// - `uXXXX` / `uXXXXX` names
668/// - Common AGL names (top ~250 entries covering >99% of real-world PDFs)
669///
670/// The `ct` ligature has no precomposed Unicode codepoint and is handled
671/// separately via the `ct_codes` side-table on `FontInfo` — see
672/// [`apply_glyph_name`] — rather than through any PUA sentinel here.
673fn glyph_name_to_unicode(name: &str) -> Option<char> {
674    // Handle uniXXXX / uXXXXX patterns.
675    if name.starts_with("uni") && name.len() >= 7 {
676        return u32::from_str_radix(&name[3..7], 16)
677            .ok()
678            .and_then(char::from_u32);
679    }
680    if name.starts_with('u') && name.len() >= 5 && name[1..].chars().all(|c| c.is_ascii_hexdigit())
681    {
682        return u32::from_str_radix(&name[1..], 16)
683            .ok()
684            .and_then(char::from_u32);
685    }
686
687    if let Some(c) = agl_table().get(name).copied() {
688        return Some(c);
689    }
690
691    match name {
692        "st" => Some('\u{FB06}'),
693        "longst" => Some('\u{FB05}'),
694        _ => None,
695    }
696}
697
698/// Built-in Adobe Glyph List table (most common ~250 entries).
699fn agl_table() -> &'static HashMap<&'static str, char> {
700    static TABLE: OnceLock<HashMap<&'static str, char>> = OnceLock::new();
701    TABLE.get_or_init(|| {
702        let entries: &[(&str, char)] = &[
703            ("space", ' '),
704            ("exclam", '!'),
705            ("quotedbl", '"'),
706            ("numbersign", '#'),
707            ("dollar", '$'),
708            ("percent", '%'),
709            ("ampersand", '&'),
710            ("quotesingle", '\''),
711            ("parenleft", '('),
712            ("parenright", ')'),
713            ("asterisk", '*'),
714            ("plus", '+'),
715            ("comma", ','),
716            ("hyphen", '-'),
717            ("period", '.'),
718            ("slash", '/'),
719            ("zero", '0'),
720            ("one", '1'),
721            ("two", '2'),
722            ("three", '3'),
723            ("four", '4'),
724            ("five", '5'),
725            ("six", '6'),
726            ("seven", '7'),
727            ("eight", '8'),
728            ("nine", '9'),
729            ("colon", ':'),
730            ("semicolon", ';'),
731            ("less", '<'),
732            ("equal", '='),
733            ("greater", '>'),
734            ("question", '?'),
735            ("at", '@'),
736            ("A", 'A'),
737            ("B", 'B'),
738            ("C", 'C'),
739            ("D", 'D'),
740            ("E", 'E'),
741            ("F", 'F'),
742            ("G", 'G'),
743            ("H", 'H'),
744            ("I", 'I'),
745            ("J", 'J'),
746            ("K", 'K'),
747            ("L", 'L'),
748            ("M", 'M'),
749            ("N", 'N'),
750            ("O", 'O'),
751            ("P", 'P'),
752            ("Q", 'Q'),
753            ("R", 'R'),
754            ("S", 'S'),
755            ("T", 'T'),
756            ("U", 'U'),
757            ("V", 'V'),
758            ("W", 'W'),
759            ("X", 'X'),
760            ("Y", 'Y'),
761            ("Z", 'Z'),
762            ("bracketleft", '['),
763            ("backslash", '\\'),
764            ("bracketright", ']'),
765            ("asciicircum", '^'),
766            ("underscore", '_'),
767            ("grave", '`'),
768            ("a", 'a'),
769            ("b", 'b'),
770            ("c", 'c'),
771            ("d", 'd'),
772            ("e", 'e'),
773            ("f", 'f'),
774            ("g", 'g'),
775            ("h", 'h'),
776            ("i", 'i'),
777            ("j", 'j'),
778            ("k", 'k'),
779            ("l", 'l'),
780            ("m", 'm'),
781            ("n", 'n'),
782            ("o", 'o'),
783            ("p", 'p'),
784            ("q", 'q'),
785            ("r", 'r'),
786            ("s", 's'),
787            ("t", 't'),
788            ("u", 'u'),
789            ("v", 'v'),
790            ("w", 'w'),
791            ("x", 'x'),
792            ("y", 'y'),
793            ("z", 'z'),
794            ("braceleft", '{'),
795            ("bar", '|'),
796            ("braceright", '}'),
797            ("asciitilde", '~'),
798            // Latin extended
799            ("Agrave", '\u{00C0}'),
800            ("Aacute", '\u{00C1}'),
801            ("Acircumflex", '\u{00C2}'),
802            ("Atilde", '\u{00C3}'),
803            ("Adieresis", '\u{00C4}'),
804            ("Aring", '\u{00C5}'),
805            ("AE", '\u{00C6}'),
806            ("Ccedilla", '\u{00C7}'),
807            ("Egrave", '\u{00C8}'),
808            ("Eacute", '\u{00C9}'),
809            ("Ecircumflex", '\u{00CA}'),
810            ("Edieresis", '\u{00CB}'),
811            ("Igrave", '\u{00CC}'),
812            ("Iacute", '\u{00CD}'),
813            ("Icircumflex", '\u{00CE}'),
814            ("Idieresis", '\u{00CF}'),
815            ("Eth", '\u{00D0}'),
816            ("Ntilde", '\u{00D1}'),
817            ("Ograve", '\u{00D2}'),
818            ("Oacute", '\u{00D3}'),
819            ("Ocircumflex", '\u{00D4}'),
820            ("Otilde", '\u{00D5}'),
821            ("Odieresis", '\u{00D6}'),
822            ("Ugrave", '\u{00D9}'),
823            ("Uacute", '\u{00DA}'),
824            ("Ucircumflex", '\u{00DB}'),
825            ("Udieresis", '\u{00DC}'),
826            ("Yacute", '\u{00DD}'),
827            ("Thorn", '\u{00DE}'),
828            ("germandbls", '\u{00DF}'),
829            ("agrave", '\u{00E0}'),
830            ("aacute", '\u{00E1}'),
831            ("acircumflex", '\u{00E2}'),
832            ("atilde", '\u{00E3}'),
833            ("adieresis", '\u{00E4}'),
834            ("aring", '\u{00E5}'),
835            ("ae", '\u{00E6}'),
836            ("ccedilla", '\u{00E7}'),
837            ("egrave", '\u{00E8}'),
838            ("eacute", '\u{00E9}'),
839            ("ecircumflex", '\u{00EA}'),
840            ("edieresis", '\u{00EB}'),
841            ("igrave", '\u{00EC}'),
842            ("iacute", '\u{00ED}'),
843            ("icircumflex", '\u{00EE}'),
844            ("idieresis", '\u{00EF}'),
845            ("eth", '\u{00F0}'),
846            ("ntilde", '\u{00F1}'),
847            ("ograve", '\u{00F2}'),
848            ("oacute", '\u{00F3}'),
849            ("ocircumflex", '\u{00F4}'),
850            ("otilde", '\u{00F5}'),
851            ("odieresis", '\u{00F6}'),
852            ("ugrave", '\u{00F9}'),
853            ("uacute", '\u{00FA}'),
854            ("ucircumflex", '\u{00FB}'),
855            ("udieresis", '\u{00FC}'),
856            ("yacute", '\u{00FD}'),
857            ("thorn", '\u{00FE}'),
858            ("ydieresis", '\u{00FF}'),
859            // Ligatures and special
860            ("fi", '\u{FB01}'),
861            ("fl", '\u{FB02}'),
862            ("ff", '\u{FB00}'),
863            ("ffi", '\u{FB03}'),
864            ("ffl", '\u{FB04}'),
865            // Punctuation and symbols
866            ("endash", '\u{2013}'),
867            ("emdash", '\u{2014}'),
868            ("bullet", '\u{2022}'),
869            ("ellipsis", '\u{2026}'),
870            ("quoteleft", '\u{2018}'),
871            ("quoteright", '\u{2019}'),
872            ("quotedblleft", '\u{201C}'),
873            ("quotedblright", '\u{201D}'),
874            ("quotesinglebase", '\u{201A}'),
875            ("quotesinglbase", '\u{201A}'),
876            ("quotedblbase", '\u{201E}'),
877            ("dagger", '\u{2020}'),
878            ("daggerdbl", '\u{2021}'),
879            ("perthousand", '\u{2030}'),
880            ("guilsinglleft", '\u{2039}'),
881            ("guilsinglright", '\u{203A}'),
882            ("guillemotleft", '\u{00AB}'),
883            ("guillemotright", '\u{00BB}'),
884            ("trademark", '\u{2122}'),
885            ("copyright", '\u{00A9}'),
886            ("registered", '\u{00AE}'),
887            ("degree", '\u{00B0}'),
888            ("plusminus", '\u{00B1}'),
889            ("multiply", '\u{00D7}'),
890            ("divide", '\u{00F7}'),
891            ("fraction", '\u{2044}'),
892            ("Euro", '\u{20AC}'),
893            ("sterling", '\u{00A3}'),
894            ("yen", '\u{00A5}'),
895            ("cent", '\u{00A2}'),
896            ("currency", '\u{00A4}'),
897            ("section", '\u{00A7}'),
898            ("paragraph", '\u{00B6}'),
899            ("brokenbar", '\u{00A6}'),
900            ("ordfeminine", '\u{00AA}'),
901            ("ordmasculine", '\u{00BA}'),
902            ("exclamdown", '\u{00A1}'),
903            ("questiondown", '\u{00BF}'),
904            ("logicalnot", '\u{00AC}'),
905            ("mu", '\u{00B5}'),
906            ("macron", '\u{00AF}'),
907            ("acute", '\u{00B4}'),
908            ("cedilla", '\u{00B8}'),
909            ("dieresis", '\u{00A8}'),
910            ("circumflex", '\u{02C6}'),
911            ("tilde", '\u{02DC}'),
912            ("caron", '\u{02C7}'),
913            ("ring", '\u{02DA}'),
914            ("breve", '\u{02D8}'),
915            ("dotaccent", '\u{02D9}'),
916            ("hungarumlaut", '\u{02DD}'),
917            ("ogonek", '\u{02DB}'),
918            ("nbspace", '\u{00A0}'),
919            ("nonbreakingspace", '\u{00A0}'),
920            ("softhyphen", '\u{00AD}'),
921            ("periodcentered", '\u{00B7}'),
922            ("middot", '\u{00B7}'),
923            ("florin", '\u{0192}'),
924            ("OE", '\u{0152}'),
925            ("oe", '\u{0153}'),
926            ("Scaron", '\u{0160}'),
927            ("scaron", '\u{0161}'),
928            ("Zcaron", '\u{017D}'),
929            ("zcaron", '\u{017E}'),
930            ("Ydieresis", '\u{0178}'),
931            ("Lslash", '\u{0141}'),
932            ("lslash", '\u{0142}'),
933            ("Oslash", '\u{00D8}'),
934            ("oslash", '\u{00F8}'),
935            ("dotlessi", '\u{0131}'),
936            // Superscripts / subscripts
937            ("onesuperior", '\u{00B9}'),
938            ("twosuperior", '\u{00B2}'),
939            ("threesuperior", '\u{00B3}'),
940            ("onequarter", '\u{00BC}'),
941            ("onehalf", '\u{00BD}'),
942            ("threequarters", '\u{00BE}'),
943            // Math
944            ("minus", '\u{2212}'),
945            ("notequal", '\u{2260}'),
946            ("lessequal", '\u{2264}'),
947            ("greaterequal", '\u{2265}'),
948            ("infinity", '\u{221E}'),
949            ("partialdiff", '\u{2202}'),
950            ("summation", '\u{2211}'),
951            ("product", '\u{220F}'),
952            ("integral", '\u{222B}'),
953            ("radical", '\u{221A}'),
954            ("approxequal", '\u{2248}'),
955            ("Delta", '\u{0394}'),
956            ("lozenge", '\u{25CA}'),
957            ("pi", '\u{03C0}'),
958            ("Omega", '\u{03A9}'),
959        ];
960        entries.iter().cloned().collect()
961    })
962}
963
/// Internal stand-in character for the `ct` ligature, which has no
/// precomposed Unicode code point. U+E007 lies in the Private Use Area, so
/// a decoded character only counts as this marker when its `ct_origins`
/// flag is also set (see `decompose_ligature_char_with_origin`); the same
/// code point arriving from a ToUnicode CMap is left untouched.
const CT_LIGATURE_MARKER: char = '\u{E007}';
965
/// Decoded text of a PDF string operand together with per-character origin
/// flags for the internal `ct` ligature marker.
///
/// `ct_origins` holds exactly one entry per `char` of `text`; every method
/// below keeps the two in lock-step.
#[derive(Clone, Default)]
struct DecodedPdfString {
    /// Decoded characters in source order.
    text: String,
    /// `true` at index `i` when the `i`-th character of `text` was emitted
    /// as the `ct` ligature marker rather than decoded from the font.
    ct_origins: Vec<bool>,
}

impl DecodedPdfString {
    /// Wrap already-decoded text; none of its characters is a `ct` marker.
    fn from_text(text: String) -> Self {
        let ct_origins = std::iter::repeat(false)
            .take(text.chars().count())
            .collect();
        Self { text, ct_origins }
    }

    /// Append one character along with its origin flag.
    fn push_char(&mut self, ch: char, is_ct_origin: bool) {
        self.ct_origins.push(is_ct_origin);
        self.text.push(ch);
    }

    /// Append every character of `s`, all flagged as ordinary text.
    fn push_str(&mut self, s: &str) {
        self.text.push_str(s);
        self.ct_origins.extend(s.chars().map(|_| false));
    }

    /// Append another decoded string, consuming it.
    fn extend(&mut self, other: DecodedPdfString) {
        let DecodedPdfString { text, ct_origins } = other;
        self.text += &text;
        self.ct_origins.extend(ct_origins);
    }

    /// `true` when no text has been decoded.
    fn is_empty(&self) -> bool {
        self.text.is_empty()
    }

    /// Number of decoded characters (one origin flag is stored per character).
    fn glyph_count(&self) -> usize {
        self.ct_origins.len()
    }

    /// Iterate over `(character, is_ct_origin)` pairs in order.
    fn iter(&self) -> impl Iterator<Item = (char, bool)> + '_ {
        self.ct_origins
            .iter()
            .zip(self.text.chars())
            .map(|(&from_ct, ch)| (ch, from_ct))
    }
}
1006
1007/// Decode a PDF string using the font's ToUnicode CMap (if available),
1008/// preserving origin metadata for internal `ct` ligature markers.
1009fn decode_pdf_string_with_font_marked(
1010    bytes: &[u8],
1011    font_info: Option<&FontInfo>,
1012) -> DecodedPdfString {
1013    // Check for UTF-16BE BOM first — always takes priority.
1014    if bytes.len() >= 2 && bytes[0] == 0xFE && bytes[1] == 0xFF {
1015        let chars: Vec<u16> = bytes[2..]
1016            .chunks(2)
1017            .filter_map(|chunk| {
1018                if chunk.len() == 2 {
1019                    Some(u16::from_be_bytes([chunk[0], chunk[1]]))
1020                } else {
1021                    None
1022                }
1023            })
1024            .collect();
1025        return DecodedPdfString::from_text(String::from_utf16_lossy(&chars));
1026    }
1027
1028    if let Some(info) = font_info {
1029        if info.is_cid && !info.to_unicode.is_empty() {
1030            // CID font: decode 2-byte codes via ToUnicode.
1031            let mut result = DecodedPdfString::default();
1032            let mut i = 0;
1033            while i + 1 < bytes.len() {
1034                let code = u16::from_be_bytes([bytes[i], bytes[i + 1]]) as u32;
1035                if let Some(s) = info.to_unicode.get(&code) {
1036                    result.push_str(s);
1037                } else {
1038                    // Fallback: try direct Unicode interpretation.
1039                    if let Some(ch) = char::from_u32(code) {
1040                        if !ch.is_control() || ch == ' ' || ch == '\t' || ch == '\n' {
1041                            result.push_char(ch, false);
1042                        }
1043                    }
1044                }
1045                i += 2;
1046            }
1047            return result;
1048        }
1049
1050        if !info.is_cid && !info.to_unicode.is_empty() {
1051            // Simple font with ToUnicode: decode 1-byte codes.
1052            let mut result = DecodedPdfString::default();
1053            for &b in bytes {
1054                if let Some(s) = info.to_unicode.get(&(b as u32)) {
1055                    result.push_str(s);
1056                } else if info.ct_codes[b as usize] {
1057                    result.push_char(CT_LIGATURE_MARKER, true);
1058                } else if let Some(ch) = info.encoding_map[b as usize] {
1059                    result.push_char(ch, false);
1060                } else {
1061                    let ch = b as char;
1062                    if is_printable_or_space(ch) {
1063                        result.push_char(ch, false);
1064                    }
1065                }
1066            }
1067            return result;
1068        }
1069
1070        // Simple font with encoding map but no ToUnicode.
1071        if !info.is_cid
1072            && (info.encoding_map.iter().any(|c| c.is_some()) || info.ct_codes.iter().any(|f| *f))
1073        {
1074            let mut result = DecodedPdfString::default();
1075            for &b in bytes {
1076                if info.ct_codes[b as usize] {
1077                    result.push_char(CT_LIGATURE_MARKER, true);
1078                } else if let Some(ch) = info.encoding_map[b as usize] {
1079                    result.push_char(ch, false);
1080                } else {
1081                    let ch = b as char;
1082                    if is_printable_or_space(ch) {
1083                        result.push_char(ch, false);
1084                    }
1085                }
1086            }
1087            return result;
1088        }
1089    }
1090
1091    // Fallback: PDFDocEncoding (ASCII + Latin-1), skipping control chars.
1092    let mut result = DecodedPdfString::default();
1093    for &b in bytes {
1094        let ch = b as char;
1095        if is_printable_or_space(ch) {
1096            result.push_char(ch, false);
1097        }
1098    }
1099    result
1100}
1101
/// Returns true for tab, line feed, carriage return and every character at
/// or above U+0020 (space); all other C0 control characters (NUL, BEL, ESC,
/// ...) are rejected.
///
/// Note that DEL (U+007F) and the C1 range (U+0080..U+009F) are accepted.
fn is_printable_or_space(ch: char) -> bool {
    matches!(ch, '\t' | '\n' | '\r') || ch >= ' '
}
1108
/// Whether to expand precomposed-ligature characters and known ligature
/// glyph names into their constituent ASCII characters during text
/// extraction. Default-on so that text-search, copy-paste and accessibility
/// flows see the human-readable string ("office") rather than an opaque
/// glyph cluster ("o\u{FB03}ce"). Internal for now — the public surface
/// stays unchanged; future configuration would surface this on a higher-
/// level type without breaking the current API.
///
/// Checked in `maybe_decompose_decoded`, which returns the decoded text
/// unchanged when this is `false`.
const LIGATURE_DECOMP: bool = true;
1117
/// Expand a precomposed Latin ligature (U+FB00..U+FB06) into the letters it
/// stands for. Any other character yields `None`, telling the caller to
/// keep the original code point.
///
/// The `ct` ligature is not handled here: it has no precomposed Unicode
/// code point and is represented by an internal marker plus origin
/// metadata so that it cannot collide with legitimate private-use values
/// coming from ToUnicode CMaps.
fn decompose_ligature_char(c: char) -> Option<&'static str> {
    // Indexed by offset from U+FB00. U+FB05 (long s + t) and U+FB06 (st)
    // both expand to "st" for search/copy-paste purposes; the historical
    // long-s spelling is not preserved.
    const EXPANSIONS: [&str; 7] = ["ff", "fi", "fl", "ffi", "ffl", "st", "st"];

    let offset = (c as u32).checked_sub(0xFB00)?;
    EXPANSIONS.get(offset as usize).copied()
}
1139
1140fn decompose_ligature_char_with_origin(c: char, is_ct_origin: bool) -> Option<&'static str> {
1141    if is_ct_origin && c == CT_LIGATURE_MARKER {
1142        Some("ct")
1143    } else {
1144        decompose_ligature_char(c)
1145    }
1146}
1147
1148/// Decompose a single precomposed-ligature character into its constituent
1149/// string. Combines the hardcoded Latin set ([`decompose_ligature_char`])
1150/// with an NFKD fallback over the rest of the Alphabetic Presentation
1151/// Forms block (U+FB00..U+FB4F) — Hebrew, Arabic, Armenian forms — so
1152/// that string-level [`decompose_ligatures`] and the per-glyph
1153/// `push_glyph_positioned` path agree on what counts as a decomposable
1154/// glyph.
1155fn decompose_glyph_to_string(c: char) -> Option<String> {
1156    use unicode_normalization::UnicodeNormalization;
1157    if let Some(s) = decompose_ligature_char(c) {
1158        return Some(s.to_string());
1159    }
1160    if matches!(c, '\u{FB00}'..='\u{FB4F}') {
1161        let nfkd: String = c.nfkd().collect();
1162        // Some codepoints in this block have no compatibility decomposition
1163        // and would round-trip back as themselves; only treat them as
1164        // decomposable when NFKD actually maps them, including legitimate
1165        // single-codepoint mappings such as Hebrew presentation forms.
1166        if nfkd != c.to_string() {
1167            return Some(nfkd);
1168        }
1169    }
1170    None
1171}
1172
1173/// Apply ligature decomposition to a string. Hardcoded mappings cover the
1174/// six common Latin ligatures (fi, fl, ffi, ffl, ff, st). For any other
1175/// character in the Alphabetic Presentation Forms block (U+FB00..U+FB4F)
1176/// not covered above — Hebrew, Arabic, Armenian forms — apply NFKD locally
1177/// to that character only. NFKD is *not* applied globally to avoid breaking
1178/// legitimate composed accented characters elsewhere in the string.
1179fn decompose_ligatures(s: &str) -> String {
1180    let mut out = String::with_capacity(s.len());
1181    for c in s.chars() {
1182        if let Some(replacement) = decompose_glyph_to_string(c) {
1183            out.push_str(&replacement);
1184        } else {
1185            out.push(c);
1186        }
1187    }
1188    out
1189}
1190
1191fn decompose_decoded_ligatures(s: &DecodedPdfString) -> String {
1192    let mut out = String::with_capacity(s.text.len());
1193    for (c, is_ct_origin) in s.iter() {
1194        if let Some(replacement) = decompose_ligature_char_with_origin(c, is_ct_origin) {
1195            out.push_str(replacement);
1196        } else if let Some(replacement) = decompose_glyph_to_string(c) {
1197            out.push_str(&replacement);
1198        } else {
1199            out.push(c);
1200        }
1201    }
1202    out
1203}
1204
1205/// Apply [`decompose_ligatures`] when the feature is enabled; otherwise
1206/// return the input unchanged. Centralizes the gate so callers don't
1207/// repeat the `if LIGATURE_DECOMP` check at every emission site.
1208fn maybe_decompose_decoded(s: &DecodedPdfString) -> String {
1209    if LIGATURE_DECOMP {
1210        if s.ct_origins.iter().any(|origin| *origin) {
1211            decompose_decoded_ligatures(s)
1212        } else {
1213            decompose_ligatures(&s.text)
1214        }
1215    } else {
1216        s.text.clone()
1217    }
1218}
1219
1220/// Extract text blocks from a specific page.
1221pub fn extract_page_blocks(doc: &Document, page_num: u32) -> Vec<TextBlock> {
1222    let pages = doc.get_pages();
1223    let Some(&page_id) = pages.get(&page_num) else {
1224        return Vec::new();
1225    };
1226
1227    let font_map = build_font_map(doc, page_id);
1228    let resources = get_page_resources(doc, page_id).unwrap_or_default();
1229    if let Ok(content_bytes) = get_page_content_bytes(doc, page_id) {
1230        if let Ok(content) = Content::decode(&content_bytes) {
1231            return extract_blocks_from_ops_inner(
1232                &content.operations,
1233                page_num,
1234                &font_map,
1235                Some((doc, &resources)),
1236                0,
1237                None,
1238            );
1239        }
1240    }
1241
1242    Vec::new()
1243}
1244
1245/// Extract text blocks from a page identified by its object ID.
1246///
1247/// Unlike [`extract_page_blocks`], this function does not call `get_pages()`
1248/// internally. Use this when the caller already holds a page map from a
1249/// prior `doc.get_pages()` call to avoid redundant page-tree traversals.
1250pub fn extract_blocks_from_page_id(
1251    doc: &Document,
1252    page_id: lopdf::ObjectId,
1253    page_num: u32,
1254) -> Vec<TextBlock> {
1255    let font_map = build_font_map(doc, page_id);
1256    let resources = get_page_resources(doc, page_id).unwrap_or_default();
1257    if let Ok(content_bytes) = get_page_content_bytes(doc, page_id) {
1258        if let Ok(content) = Content::decode(&content_bytes) {
1259            return extract_blocks_from_ops_inner(
1260                &content.operations,
1261                page_num,
1262                &font_map,
1263                Some((doc, &resources)),
1264                0,
1265                None,
1266            );
1267        }
1268    }
1269    Vec::new()
1270}
1271
1272/// Extract text blocks from all pages of a document.
1273pub fn extract_text(doc: &Document) -> Vec<TextBlock> {
1274    let pages = doc.get_pages();
1275    let mut blocks = Vec::new();
1276
1277    for (&page_num, &page_id) in &pages {
1278        let font_map = build_font_map(doc, page_id);
1279        let resources = get_page_resources(doc, page_id).unwrap_or_default();
1280        if let Ok(content_bytes) = get_page_content_bytes(doc, page_id) {
1281            if let Ok(content) = Content::decode(&content_bytes) {
1282                let page_blocks = extract_blocks_from_ops_inner(
1283                    &content.operations,
1284                    page_num,
1285                    &font_map,
1286                    Some((doc, &resources)),
1287                    0,
1288                    None,
1289                );
1290                blocks.extend(page_blocks);
1291            }
1292        }
1293    }
1294
1295    blocks
1296}
1297
1298/// Extract text from a specific page as a plain string.
1299pub fn extract_page_text(doc: &Document, page_num: u32) -> Result<String> {
1300    let pages = doc.get_pages();
1301    let total = pages.len() as u32;
1302
1303    if page_num == 0 || page_num > total {
1304        return Err(ExtractError::PageOutOfRange(page_num, total));
1305    }
1306
1307    let page_id = *pages
1308        .get(&page_num)
1309        .ok_or(ExtractError::PageOutOfRange(page_num, total))?;
1310
1311    let font_map = build_font_map(doc, page_id);
1312    let resources = get_page_resources(doc, page_id).unwrap_or_default();
1313    let content_bytes = get_page_content_bytes(doc, page_id).unwrap_or_default();
1314    let content = match Content::decode(&content_bytes) {
1315        Ok(c) => c,
1316        Err(_) => return Ok(String::new()),
1317    };
1318
1319    let blocks = extract_blocks_from_ops_inner(
1320        &content.operations,
1321        page_num,
1322        &font_map,
1323        Some((doc, &resources)),
1324        0,
1325        None,
1326    );
1327    let text = blocks
1328        .iter()
1329        .map(|b| b.text.as_str())
1330        .collect::<Vec<_>>()
1331        .join("");
1332
1333    Ok(text)
1334}
1335
1336/// Extract positioned characters from a specific page.
1337pub fn extract_positioned_chars(doc: &Document, page_num: u32) -> Result<Vec<PositionedChar>> {
1338    let pages = doc.get_pages();
1339    let total = pages.len() as u32;
1340
1341    if page_num == 0 || page_num > total {
1342        return Err(ExtractError::PageOutOfRange(page_num, total));
1343    }
1344
1345    let page_id = *pages
1346        .get(&page_num)
1347        .ok_or(ExtractError::PageOutOfRange(page_num, total))?;
1348
1349    let font_map = build_font_map(doc, page_id);
1350    let content_bytes = get_page_content_bytes(doc, page_id).unwrap_or_default();
1351    let content = match Content::decode(&content_bytes) {
1352        Ok(c) => c,
1353        Err(_) => return Ok(Vec::new()),
1354    };
1355
1356    let chars = extract_chars_from_ops(&content.operations, page_num, &font_map);
1357    Ok(chars)
1358}
1359
1360/// Get content stream bytes for a page.
1361fn get_page_content_bytes(doc: &Document, page_id: ObjectId) -> std::result::Result<Vec<u8>, ()> {
1362    doc.get_page_content(page_id).map_err(|_| ())
1363}
1364
/// One entry on the marked-content stack. PDF marked-content sequences nest
/// (e.g. `BDC … BDC … EMC … EMC`); each `BDC`/`BMC` pushes, `EMC` pops. The
/// innermost entry with `actual_text = Some(_)` wins for any glyph emitted
/// inside that range (see `current_actual_text`, which searches the stack
/// from the top down).
#[derive(Debug, Clone, Default)]
struct MarkedContentEntry {
    /// `/ActualText` value from the property dict, decoded as a PDF text
    /// string (UTF-16BE if BOM-prefixed, otherwise PDFDocEncoding/Latin-1).
    /// `None` when the sequence carries no `/ActualText`, in which case
    /// lookups continue with the enclosing entries.
    actual_text: Option<String>,
}
1375
1376/// Resolve the property dict for a `BDC` operator. The second operand is
1377/// either an inline dictionary or a `/Name` referring to an entry in
1378/// `Resources/Properties`.
1379fn resolve_bdc_properties(
1380    doc_and_resources: Option<(&Document, &lopdf::Dictionary)>,
1381    operand: &Object,
1382) -> Option<lopdf::Dictionary> {
1383    match operand {
1384        Object::Dictionary(d) => Some(d.clone()),
1385        Object::Reference(r) => doc_and_resources.and_then(|(doc, _)| {
1386            doc.get_object(*r).ok().and_then(|o| match o {
1387                Object::Dictionary(d) => Some(d.clone()),
1388                _ => None,
1389            })
1390        }),
1391        Object::Name(n) => {
1392            let (doc, resources) = doc_and_resources?;
1393            let props = resolve_dict(doc, resources, b"Properties")?;
1394            match props.get(n.as_slice()).ok()? {
1395                Object::Dictionary(d) => Some(d.clone()),
1396                Object::Reference(r) => match doc.get_object(*r).ok()? {
1397                    Object::Dictionary(d) => Some(d.clone()),
1398                    _ => None,
1399                },
1400                _ => None,
1401            }
1402        }
1403        _ => None,
1404    }
1405}
1406
/// Decode a PDF text string, as used for the `/ActualText` value (a regular
/// text string, not a glyph-coded one).
///
/// * A leading `FE FF` byte-order mark selects UTF-16BE. A dangling odd
///   trailing byte is ignored and invalid sequences become U+FFFD.
/// * A leading `EF BB BF` mark selects UTF-8 (lossy).
/// * Anything else is decoded as PDFDocEncoding. That encoding matches
///   Latin-1 for most codes but assigns accents to 0x18..=0x1F, typographic
///   symbols and extra Latin letters to 0x80..=0x9E, and the Euro sign to
///   0xA0; reading those bytes as Latin-1 would yield C1 control characters
///   (or drop the accents) instead of the intended text. Control bytes
///   below 0x18 other than tab, line feed and carriage return are skipped.
///   Codes that are not remapped pass through as Latin-1.
fn decode_pdf_text_string(bytes: &[u8]) -> String {
    /// Map one PDFDocEncoding byte to its character; `None` drops the byte.
    fn pdf_doc_char(byte: u8) -> Option<char> {
        // Codes 0x18..=0x1F: spacing accents.
        const ACCENTS: [char; 8] = [
            '\u{02D8}', // 0x18 breve
            '\u{02C7}', // 0x19 caron
            '\u{02C6}', // 0x1A circumflex
            '\u{02D9}', // 0x1B dotaccent
            '\u{02DD}', // 0x1C hungarumlaut
            '\u{02DB}', // 0x1D ogonek
            '\u{02DA}', // 0x1E ring
            '\u{02DC}', // 0x1F tilde
        ];
        // Codes 0x80..=0x9E: symbols and letters that differ from Latin-1.
        const UPPER: [char; 31] = [
            '\u{2022}', // 0x80 bullet
            '\u{2020}', // 0x81 dagger
            '\u{2021}', // 0x82 daggerdbl
            '\u{2026}', // 0x83 ellipsis
            '\u{2014}', // 0x84 emdash
            '\u{2013}', // 0x85 endash
            '\u{0192}', // 0x86 florin
            '\u{2044}', // 0x87 fraction
            '\u{2039}', // 0x88 guilsinglleft
            '\u{203A}', // 0x89 guilsinglright
            '\u{2212}', // 0x8A minus
            '\u{2030}', // 0x8B perthousand
            '\u{201E}', // 0x8C quotedblbase
            '\u{201C}', // 0x8D quotedblleft
            '\u{201D}', // 0x8E quotedblright
            '\u{2018}', // 0x8F quoteleft
            '\u{2019}', // 0x90 quoteright
            '\u{201A}', // 0x91 quotesinglbase
            '\u{2122}', // 0x92 trademark
            '\u{FB01}', // 0x93 fi
            '\u{FB02}', // 0x94 fl
            '\u{0141}', // 0x95 Lslash
            '\u{0152}', // 0x96 OE
            '\u{0160}', // 0x97 Scaron
            '\u{0178}', // 0x98 Ydieresis
            '\u{017D}', // 0x99 Zcaron
            '\u{0131}', // 0x9A dotlessi
            '\u{0142}', // 0x9B lslash
            '\u{0153}', // 0x9C oe
            '\u{0161}', // 0x9D scaron
            '\u{017E}', // 0x9E zcaron
        ];

        match byte {
            // Common whitespace is kept verbatim.
            0x09 | 0x0A | 0x0D => Some(char::from(byte)),
            // Remaining low control bytes (NUL, BEL, ...) are dropped.
            0x00..=0x17 => None,
            0x18..=0x1F => Some(ACCENTS[usize::from(byte - 0x18)]),
            0x80..=0x9E => Some(UPPER[usize::from(byte - 0x80)]),
            0xA0 => Some('\u{20AC}'), // Euro
            // ASCII and the Latin-1 range coincide with PDFDocEncoding.
            _ => Some(char::from(byte)),
        }
    }

    if let [0xFE, 0xFF, payload @ ..] = bytes {
        let units: Vec<u16> = payload
            .chunks_exact(2)
            .map(|pair| u16::from_be_bytes([pair[0], pair[1]]))
            .collect();
        return String::from_utf16_lossy(&units);
    }
    if let [0xEF, 0xBB, 0xBF, payload @ ..] = bytes {
        return String::from_utf8_lossy(payload).into_owned();
    }

    bytes.iter().copied().filter_map(pdf_doc_char).collect()
}
1433
1434/// Extract `/ActualText` from a BDC property dict, if present.
1435fn extract_actual_text(props: &lopdf::Dictionary) -> Option<String> {
1436    let obj = props.get(b"ActualText").ok()?;
1437    match obj {
1438        Object::String(bytes, _) => Some(decode_pdf_text_string(bytes)),
1439        _ => None,
1440    }
1441}
1442
1443/// Innermost `actual_text` from the stack, falling back to an inherited
1444/// value supplied by the caller. The inherited value carries the parent
1445/// invocation's top-of-stack `/ActualText` across `Do` recursion into a
1446/// Form XObject, so text emitted inside the XObject's `BT`...`ET` blocks
1447/// is tagged with the surrounding `BDC`/`EMC` pair's `/ActualText`.
1448fn current_actual_text(stack: &[MarkedContentEntry], inherited: Option<&str>) -> Option<String> {
1449    stack
1450        .iter()
1451        .rev()
1452        .find_map(|e| e.actual_text.clone())
1453        .or_else(|| inherited.map(|s| s.to_string()))
1454}
1455
1456/// Extract text blocks from a list of operations, handling Form XObject
1457/// recursion via `Do`. `inherited_actual_text` carries the parent's active
1458/// `/ActualText` binding into the recursion so a `BDC` ... `Do` ... `EMC`
1459/// sequence keeps tagging text emitted inside the invoked Form XObject.
1460/// Top-level callers pass `None`.
1461fn extract_blocks_from_ops_inner(
1462    ops: &[Operation],
1463    page: u32,
1464    font_map: &HashMap<String, FontInfo>,
1465    doc_and_resources: Option<(&Document, &lopdf::Dictionary)>,
1466    depth: u32,
1467    inherited_actual_text: Option<&str>,
1468) -> Vec<TextBlock> {
1469    let mut state = TextState::default();
1470    let mut blocks = Vec::new();
1471    let mut mc_stack: Vec<MarkedContentEntry> = Vec::new();
1472
1473    for op in ops {
1474        match op.operator.as_str() {
1475            "q" => {
1476                state.gs_stack.push(GraphicsState { ctm: state.ctm });
1477            }
1478            "Q" => {
1479                if let Some(gs) = state.gs_stack.pop() {
1480                    state.ctm = gs.ctm;
1481                }
1482            }
1483            "cm" => {
1484                if let Some(m) = extract_matrix(&op.operands) {
1485                    state.ctm = multiply_matrix(&state.ctm, &m);
1486                }
1487            }
1488            "BT" => {
1489                state.tm = [1.0, 0.0, 0.0, 1.0, 0.0, 0.0];
1490                state.tlm = [1.0, 0.0, 0.0, 1.0, 0.0, 0.0];
1491            }
1492            "Tf" => {
1493                if op.operands.len() >= 2 {
1494                    if let Object::Name(ref name) = op.operands[0] {
1495                        state.font_name = String::from_utf8_lossy(name).to_string();
1496                    }
1497                    if let Some(size) = as_number(&op.operands[1]) {
1498                        state.font_size = size;
1499                    }
1500                }
1501            }
1502            "Tc" => {
1503                if let Some(v) = op.operands.first().and_then(as_number) {
1504                    state.tc = v;
1505                }
1506            }
1507            "Tw" => {
1508                if let Some(v) = op.operands.first().and_then(as_number) {
1509                    state.tw = v;
1510                }
1511            }
1512            "Tz" => {
1513                if let Some(v) = op.operands.first().and_then(as_number) {
1514                    state.th = v;
1515                }
1516            }
1517            "TL" => {
1518                if let Some(v) = op.operands.first().and_then(as_number) {
1519                    state.tl = v;
1520                }
1521            }
1522            "Ts" => {
1523                if let Some(v) = op.operands.first().and_then(as_number) {
1524                    state.ts = v;
1525                }
1526            }
1527            "Td" => {
1528                if op.operands.len() >= 2 {
1529                    let tx = as_number(&op.operands[0]).unwrap_or(0.0);
1530                    let ty = as_number(&op.operands[1]).unwrap_or(0.0);
1531                    let new_tlm = multiply_matrix(&state.tlm, &[1.0, 0.0, 0.0, 1.0, tx, ty]);
1532                    state.tlm = new_tlm;
1533                    state.tm = new_tlm;
1534                }
1535            }
1536            "TD" => {
1537                if op.operands.len() >= 2 {
1538                    let tx = as_number(&op.operands[0]).unwrap_or(0.0);
1539                    let ty = as_number(&op.operands[1]).unwrap_or(0.0);
1540                    state.tl = -ty;
1541                    let new_tlm = multiply_matrix(&state.tlm, &[1.0, 0.0, 0.0, 1.0, tx, ty]);
1542                    state.tlm = new_tlm;
1543                    state.tm = new_tlm;
1544                }
1545            }
1546            "Tm" => {
1547                if let Some(m) = extract_matrix(&op.operands) {
1548                    state.tm = m;
1549                    state.tlm = m;
1550                }
1551            }
1552            "T*" => {
1553                let new_tlm = multiply_matrix(&state.tlm, &[1.0, 0.0, 0.0, 1.0, 0.0, -state.tl]);
1554                state.tlm = new_tlm;
1555                state.tm = new_tlm;
1556            }
1557            "Tj" => {
1558                let fi = font_map.get(&state.font_name);
1559                if let Some(text) = extract_decoded_string_operand_with_font(&op.operands, fi) {
1560                    let char_w = state.font_size * APPROX_CHAR_WIDTH * (state.th / 100.0);
1561
1562                    // Skip empty text blocks (consistent with TJ handler) — empty
1563                    // blocks carry position but no data, causing phantom table
1564                    // detection with all-empty cells (#651).
1565                    if !text.is_empty() {
1566                        let x = state.tm[4];
1567                        let y = state.tm[5];
1568                        // Bbox width tracks the *rendered* glyph footprint
1569                        // (one char_w per source glyph), not the byte- or
1570                        // char-length of the decomposed display string.
1571                        let text_width = text.glyph_count() as f64 * char_w;
1572                        let display_text = maybe_decompose_decoded(&text);
1573                        blocks.push(TextBlock {
1574                            text: display_text,
1575                            page,
1576                            bbox: [x, y, x + text_width, y + state.font_size],
1577                            font_name: state.font_name.clone(),
1578                            font_size: state.font_size,
1579                            actual_text: current_actual_text(&mc_stack, inherited_actual_text),
1580                        });
1581                    }
1582
1583                    // Advance text position.
1584                    for _ in text.iter() {
1585                        state.tm[4] += char_w + state.tc;
1586                    }
1587                }
1588            }
1589            "TJ" => {
1590                if let Some(Object::Array(ref arr)) = op.operands.first() {
1591                    let x_start = state.tm[4];
1592                    let y = state.tm[5];
1593                    let char_w = state.font_size * APPROX_CHAR_WIDTH * (state.th / 100.0);
1594                    let mut combined_text = DecodedPdfString::default();
1595                    let fi = font_map.get(&state.font_name);
1596
1597                    for item in arr {
1598                        match item {
1599                            Object::String(bytes, _) => {
1600                                let text = decode_pdf_string_with_font_marked(bytes, fi);
1601                                for _ in text.iter() {
1602                                    state.tm[4] += char_w + state.tc;
1603                                }
1604                                combined_text.extend(text);
1605                            }
1606                            _ => {
1607                                if let Some(adj) = as_number(item) {
1608                                    // Negative values move right, positive move left.
1609                                    state.tm[4] -= adj / 1000.0 * state.font_size;
1610                                }
1611                            }
1612                        }
1613                    }
1614
1615                    if !combined_text.is_empty() {
1616                        let x_end = state.tm[4];
1617                        blocks.push(TextBlock {
1618                            text: maybe_decompose_decoded(&combined_text),
1619                            page,
1620                            bbox: [x_start, y, x_end, y + state.font_size],
1621                            font_name: state.font_name.clone(),
1622                            font_size: state.font_size,
1623                            actual_text: current_actual_text(&mc_stack, inherited_actual_text),
1624                        });
1625                    }
1626                }
1627            }
1628            "'" => {
1629                // Move to next line and show text.
1630                let new_tlm = multiply_matrix(&state.tlm, &[1.0, 0.0, 0.0, 1.0, 0.0, -state.tl]);
1631                state.tlm = new_tlm;
1632                state.tm = new_tlm;
1633
1634                let fi = font_map.get(&state.font_name);
1635                if let Some(text) = extract_decoded_string_operand_with_font(&op.operands, fi) {
1636                    let char_w = state.font_size * APPROX_CHAR_WIDTH * (state.th / 100.0);
1637
1638                    if !text.is_empty() {
1639                        let x = state.tm[4];
1640                        let y = state.tm[5];
1641                        // Bbox width tracks rendered glyph footprint (one
1642                        // char_w per source glyph) — match the Tj path.
1643                        let text_width = text.glyph_count() as f64 * char_w;
1644                        let display_text = maybe_decompose_decoded(&text);
1645                        blocks.push(TextBlock {
1646                            text: display_text,
1647                            page,
1648                            bbox: [x, y, x + text_width, y + state.font_size],
1649                            font_name: state.font_name.clone(),
1650                            font_size: state.font_size,
1651                            actual_text: current_actual_text(&mc_stack, inherited_actual_text),
1652                        });
1653                    }
1654
1655                    for _ in text.iter() {
1656                        state.tm[4] += char_w + state.tc;
1657                    }
1658                }
1659            }
1660            "\"" => {
1661                // Set word/char spacing, move to next line, show text.
1662                if op.operands.len() >= 3 {
1663                    if let Some(tw) = as_number(&op.operands[0]) {
1664                        state.tw = tw;
1665                    }
1666                    if let Some(tc) = as_number(&op.operands[1]) {
1667                        state.tc = tc;
1668                    }
1669
1670                    let new_tlm =
1671                        multiply_matrix(&state.tlm, &[1.0, 0.0, 0.0, 1.0, 0.0, -state.tl]);
1672                    state.tlm = new_tlm;
1673                    state.tm = new_tlm;
1674
1675                    let fi = font_map.get(&state.font_name);
1676                    if let Some(text) =
1677                        extract_decoded_string_operand_with_font(&op.operands[2..], fi)
1678                    {
1679                        let char_w = state.font_size * APPROX_CHAR_WIDTH * (state.th / 100.0);
1680
1681                        if !text.is_empty() {
1682                            let x = state.tm[4];
1683                            let y = state.tm[5];
1684                            // Bbox width tracks rendered glyph footprint
1685                            // (one char_w per source glyph) — match Tj path.
1686                            let text_width = text.glyph_count() as f64 * char_w;
1687                            let display_text = maybe_decompose_decoded(&text);
1688                            blocks.push(TextBlock {
1689                                text: display_text,
1690                                page,
1691                                bbox: [x, y, x + text_width, y + state.font_size],
1692                                font_name: state.font_name.clone(),
1693                                font_size: state.font_size,
1694                                actual_text: current_actual_text(&mc_stack, inherited_actual_text),
1695                            });
1696                        }
1697
1698                        for _ in text.iter() {
1699                            state.tm[4] += char_w + state.tc;
1700                        }
1701                    }
1702                }
1703            }
1704            "BMC" => {
1705                // Begin marked-content (no property list). Push an empty entry
1706                // so EMC pops the right level — without this, nested BMC inside
1707                // a BDC/ActualText range would pop the BDC entry early.
1708                mc_stack.push(MarkedContentEntry::default());
1709            }
1710            "BDC" => {
1711                // Begin marked-content with property list. Operand 1 is the
1712                // tag (Name); operand 2 is either an inline dict or a Name
1713                // resolved via Resources/Properties.
1714                let actual_text = op
1715                    .operands
1716                    .get(1)
1717                    .and_then(|o| resolve_bdc_properties(doc_and_resources, o))
1718                    .as_ref()
1719                    .and_then(extract_actual_text);
1720                mc_stack.push(MarkedContentEntry { actual_text });
1721            }
1722            "EMC" => {
1723                // End marked-content. Use pop (not unwrap) so a malformed
1724                // stream with an unmatched EMC doesn't panic.
1725                mc_stack.pop();
1726            }
1727            "Do" => {
1728                // Invoke Form XObject — recurse into its content stream.
1729                // Propagate the active /ActualText binding (top-of-stack, or
1730                // whatever this invocation itself inherited) so text inside
1731                // the XObject is tagged with the surrounding BDC/EMC pair.
1732                if depth < 5 {
1733                    if let Some((doc, resources)) = doc_and_resources {
1734                        if let Some(Object::Name(ref xobj_name)) = op.operands.first() {
1735                            let xobj_name_str = String::from_utf8_lossy(xobj_name);
1736                            let inherited_for_child =
1737                                current_actual_text(&mc_stack, inherited_actual_text);
1738                            if let Some(xobj_blocks) = extract_form_xobject_text(
1739                                doc,
1740                                resources,
1741                                &xobj_name_str,
1742                                page,
1743                                font_map,
1744                                depth,
1745                                inherited_for_child.as_deref(),
1746                            ) {
1747                                blocks.extend(xobj_blocks);
1748                            }
1749                        }
1750                    }
1751                }
1752            }
1753            _ => {}
1754        }
1755    }
1756
1757    blocks
1758}
1759
1760/// Extract text from a Form XObject referenced by name in the page's
1761/// Resources/XObject dict. `inherited_actual_text` is the active
1762/// `/ActualText` binding from the calling context (parent stack +
1763/// whatever the parent itself inherited), so text emitted inside this
1764/// XObject's `BT`...`ET` blocks is tagged correctly when the `Do`
1765/// operator sits between a `BDC` and matching `EMC`.
1766fn extract_form_xobject_text(
1767    doc: &Document,
1768    resources: &lopdf::Dictionary,
1769    name: &str,
1770    page: u32,
1771    font_map: &HashMap<String, FontInfo>,
1772    depth: u32,
1773    inherited_actual_text: Option<&str>,
1774) -> Option<Vec<TextBlock>> {
1775    // Look up the XObject in the Resources dictionary.
1776    let xobj_dict = match resources.get(b"XObject").ok()? {
1777        Object::Dictionary(d) => d.clone(),
1778        Object::Reference(r) => match doc.get_object(*r).ok()? {
1779            Object::Dictionary(d) => d.clone(),
1780            _ => return None,
1781        },
1782        _ => return None,
1783    };
1784
1785    let xobj_ref = match xobj_dict.get(name.as_bytes()).ok()? {
1786        Object::Reference(r) => *r,
1787        _ => return None,
1788    };
1789
1790    let stream = match doc.get_object(xobj_ref).ok()? {
1791        Object::Stream(s) => s.clone(),
1792        _ => return None,
1793    };
1794
1795    // Verify it's a Form XObject (not Image).
1796    let subtype = stream
1797        .dict
1798        .get(b"Subtype")
1799        .ok()
1800        .and_then(|o| match o {
1801            Object::Name(n) => Some(String::from_utf8_lossy(n).to_string()),
1802            _ => None,
1803        })
1804        .unwrap_or_default();
1805    if subtype != "Form" {
1806        return None;
1807    }
1808
1809    // Decode the content stream.
1810    let content_bytes = stream
1811        .decompressed_content()
1812        .ok()
1813        .unwrap_or_else(|| stream.content.clone());
1814    let content = Content::decode(&content_bytes).ok()?;
1815
1816    // Build font map: XObject's own fonts take priority over inherited page fonts.
1817    // A Form XObject may reuse font names (e.g. /F1) with different encodings.
1818    let mut xobj_font_map = font_map.clone();
1819    if let Some(xobj_resources) = resolve_dict(doc, &stream.dict, b"Resources") {
1820        if let Some(xobj_fonts) = resolve_dict(doc, &xobj_resources, b"Font") {
1821            for (name_bytes, value) in xobj_fonts.iter() {
1822                let fname = String::from_utf8_lossy(name_bytes).to_string();
1823                if let Some(fi) = build_font_info_from_value(doc, value) {
1824                    xobj_font_map.insert(fname, fi);
1825                }
1826            }
1827        }
1828    }
1829
1830    // Use the XObject's own Resources dict for recursive Do lookups.
1831    let xobj_resources =
1832        resolve_dict(doc, &stream.dict, b"Resources").unwrap_or_else(|| resources.clone());
1833
1834    Some(extract_blocks_from_ops_inner(
1835        &content.operations,
1836        page,
1837        &xobj_font_map,
1838        Some((doc, &xobj_resources)),
1839        depth + 1,
1840        inherited_actual_text,
1841    ))
1842}
1843
1844/// Build a FontInfo from a font dictionary value (used for XObject font resolution).
1845fn build_font_info_from_value(doc: &Document, value: &Object) -> Option<FontInfo> {
1846    let font = match value {
1847        Object::Reference(r) => match doc.get_object(*r).ok()? {
1848            Object::Dictionary(d) => d.clone(),
1849            _ => return None,
1850        },
1851        Object::Dictionary(d) => d.clone(),
1852        _ => return None,
1853    };
1854
1855    let subtype = font
1856        .get(b"Subtype")
1857        .ok()
1858        .and_then(|o| match o {
1859            Object::Name(n) => Some(String::from_utf8_lossy(n).to_string()),
1860            _ => None,
1861        })
1862        .unwrap_or_default();
1863
1864    let is_cid = subtype == "Type0";
1865    let mut to_unicode = parse_to_unicode_from_font(doc, &font);
1866
1867    if to_unicode.is_empty() && is_cid {
1868        if let Ok(Object::Array(descendants)) = font.get(b"DescendantFonts") {
1869            for d in descendants {
1870                let desc_dict = match d {
1871                    Object::Reference(r) => {
1872                        // Don't use `?` here — a bad reference in one descendant
1873                        // should not abort the entire font info construction.
1874                        let Some(Object::Dictionary(d)) = doc.get_object(*r).ok() else {
1875                            continue;
1876                        };
1877                        d
1878                    }
1879                    Object::Dictionary(d) => d,
1880                    _ => continue,
1881                };
1882                let tu = parse_to_unicode_from_font(doc, desc_dict);
1883                if !tu.is_empty() {
1884                    to_unicode = tu;
1885                    break;
1886                }
1887            }
1888        }
1889    }
1890
1891    let (encoding_map, ct_codes) = if !is_cid {
1892        build_encoding_map(doc, &font)
1893    } else {
1894        ([None; 256], [false; 256])
1895    };
1896
1897    Some(FontInfo {
1898        is_cid,
1899        to_unicode,
1900        encoding_map,
1901        ct_codes,
1902    })
1903}
1904
1905/// Emit one or more `PositionedChar` for a single glyph, decomposing
1906/// ligatures by splitting the glyph's `char_w` advance evenly across the
1907/// constituent characters. The text matrix advances exactly once (per
1908/// glyph), preserving the original layout — only the per-character bbox is
1909/// subdivided.
1910///
1911/// Uses the unified [`decompose_glyph_to_string`] helper so that the
1912/// per-glyph decomposition matches the string-level
1913/// [`decompose_ligatures`] used by `extract_page_text` / `TextBlock`.
1914/// Without this alignment, presentation-form codepoints outside the
1915/// hardcoded Latin set (e.g. Armenian U+FB13) would expand in extracted
1916/// text but stay as a single PositionedChar, leaving consumers that pair
1917/// text offsets with bounding boxes misaligned.
1918fn push_glyph_positioned(
1919    chars: &mut Vec<PositionedChar>,
1920    state: &mut TextState,
1921    page: u32,
1922    glyph: char,
1923    is_ct_origin: bool,
1924    char_w: f64,
1925) {
1926    let (gx, gy) = apply_ctm(state);
1927    let constituents: Option<String> = if LIGATURE_DECOMP {
1928        // Try ct-origin marker first (preserves U+E007 → "ct" only when it
1929        // came from a /ct glyph-name path), then fall back to the unified
1930        // decomposition helper which covers the Latin set + FB00–FB4F NFKD.
1931        if let Some(s) = decompose_ligature_char_with_origin(glyph, is_ct_origin) {
1932            Some(s.to_string())
1933        } else {
1934            decompose_glyph_to_string(glyph)
1935        }
1936    } else {
1937        None
1938    };
1939    match constituents {
1940        None => {
1941            chars.push(PositionedChar {
1942                ch: glyph,
1943                page,
1944                bbox: [gx, gy, gx + char_w, gy + state.font_size],
1945            });
1946        }
1947        Some(s) => {
1948            let n = s.chars().count() as f64;
1949            let part_w = char_w / n;
1950            for (i, c) in s.chars().enumerate() {
1951                let x = gx + part_w * i as f64;
1952                chars.push(PositionedChar {
1953                    ch: c,
1954                    page,
1955                    bbox: [x, gy, x + part_w, gy + state.font_size],
1956                });
1957            }
1958        }
1959    }
1960    state.tm[4] += char_w + state.tc;
1961}
1962
1963/// Extract positioned characters from operations.
1964fn extract_chars_from_ops(
1965    ops: &[Operation],
1966    page: u32,
1967    font_map: &HashMap<String, FontInfo>,
1968) -> Vec<PositionedChar> {
1969    let mut state = TextState::default();
1970    let mut chars = Vec::new();
1971
1972    for op in ops {
1973        match op.operator.as_str() {
1974            "q" => {
1975                state.gs_stack.push(GraphicsState { ctm: state.ctm });
1976            }
1977            "Q" => {
1978                if let Some(gs) = state.gs_stack.pop() {
1979                    state.ctm = gs.ctm;
1980                }
1981            }
1982            "cm" => {
1983                if let Some(m) = extract_matrix(&op.operands) {
1984                    state.ctm = multiply_matrix(&state.ctm, &m);
1985                }
1986            }
1987            "BT" => {
1988                state.tm = [1.0, 0.0, 0.0, 1.0, 0.0, 0.0];
1989                state.tlm = [1.0, 0.0, 0.0, 1.0, 0.0, 0.0];
1990            }
1991            "Tf" => {
1992                if op.operands.len() >= 2 {
1993                    if let Object::Name(ref name) = op.operands[0] {
1994                        state.font_name = String::from_utf8_lossy(name).to_string();
1995                    }
1996                    if let Some(size) = as_number(&op.operands[1]) {
1997                        state.font_size = size;
1998                    }
1999                }
2000            }
2001            "Tc" => {
2002                if let Some(v) = op.operands.first().and_then(as_number) {
2003                    state.tc = v;
2004                }
2005            }
2006            "Tw" => {
2007                if let Some(v) = op.operands.first().and_then(as_number) {
2008                    state.tw = v;
2009                }
2010            }
2011            "Tz" => {
2012                if let Some(v) = op.operands.first().and_then(as_number) {
2013                    state.th = v;
2014                }
2015            }
2016            "TL" => {
2017                if let Some(v) = op.operands.first().and_then(as_number) {
2018                    state.tl = v;
2019                }
2020            }
2021            "Ts" => {
2022                if let Some(v) = op.operands.first().and_then(as_number) {
2023                    state.ts = v;
2024                }
2025            }
2026            "Td" => {
2027                if op.operands.len() >= 2 {
2028                    let tx = as_number(&op.operands[0]).unwrap_or(0.0);
2029                    let ty = as_number(&op.operands[1]).unwrap_or(0.0);
2030                    let new_tlm = multiply_matrix(&state.tlm, &[1.0, 0.0, 0.0, 1.0, tx, ty]);
2031                    state.tlm = new_tlm;
2032                    state.tm = new_tlm;
2033                }
2034            }
2035            "TD" => {
2036                if op.operands.len() >= 2 {
2037                    let tx = as_number(&op.operands[0]).unwrap_or(0.0);
2038                    let ty = as_number(&op.operands[1]).unwrap_or(0.0);
2039                    state.tl = -ty;
2040                    let new_tlm = multiply_matrix(&state.tlm, &[1.0, 0.0, 0.0, 1.0, tx, ty]);
2041                    state.tlm = new_tlm;
2042                    state.tm = new_tlm;
2043                }
2044            }
2045            "Tm" => {
2046                if let Some(m) = extract_matrix(&op.operands) {
2047                    state.tm = m;
2048                    state.tlm = m;
2049                }
2050            }
2051            "T*" => {
2052                let new_tlm = multiply_matrix(&state.tlm, &[1.0, 0.0, 0.0, 1.0, 0.0, -state.tl]);
2053                state.tlm = new_tlm;
2054                state.tm = new_tlm;
2055            }
2056            "Tj" => {
2057                let fi = font_map.get(&state.font_name);
2058                if let Some(text) = extract_decoded_string_operand_with_font(&op.operands, fi) {
2059                    let char_w = state.font_size * APPROX_CHAR_WIDTH * (state.th / 100.0);
2060                    for (ch, is_ct_origin) in text.iter() {
2061                        push_glyph_positioned(
2062                            &mut chars,
2063                            &mut state,
2064                            page,
2065                            ch,
2066                            is_ct_origin,
2067                            char_w,
2068                        );
2069                    }
2070                }
2071            }
2072            "TJ" => {
2073                if let Some(Object::Array(ref arr)) = op.operands.first() {
2074                    let char_w = state.font_size * APPROX_CHAR_WIDTH * (state.th / 100.0);
2075                    let fi = font_map.get(&state.font_name);
2076                    for item in arr {
2077                        match item {
2078                            Object::String(bytes, _) => {
2079                                let text = decode_pdf_string_with_font_marked(bytes, fi);
2080                                for (ch, is_ct_origin) in text.iter() {
2081                                    push_glyph_positioned(
2082                                        &mut chars,
2083                                        &mut state,
2084                                        page,
2085                                        ch,
2086                                        is_ct_origin,
2087                                        char_w,
2088                                    );
2089                                }
2090                            }
2091                            _ => {
2092                                if let Some(adj) = as_number(item) {
2093                                    state.tm[4] -= adj / 1000.0 * state.font_size;
2094                                }
2095                            }
2096                        }
2097                    }
2098                }
2099            }
2100            "'" => {
2101                let new_tlm = multiply_matrix(&state.tlm, &[1.0, 0.0, 0.0, 1.0, 0.0, -state.tl]);
2102                state.tlm = new_tlm;
2103                state.tm = new_tlm;
2104
2105                let fi = font_map.get(&state.font_name);
2106                if let Some(text) = extract_decoded_string_operand_with_font(&op.operands, fi) {
2107                    let char_w = state.font_size * APPROX_CHAR_WIDTH * (state.th / 100.0);
2108                    for (ch, is_ct_origin) in text.iter() {
2109                        push_glyph_positioned(
2110                            &mut chars,
2111                            &mut state,
2112                            page,
2113                            ch,
2114                            is_ct_origin,
2115                            char_w,
2116                        );
2117                    }
2118                }
2119            }
2120            "\"" => {
2121                if op.operands.len() >= 3 {
2122                    if let Some(tw) = as_number(&op.operands[0]) {
2123                        state.tw = tw;
2124                    }
2125                    if let Some(tc) = as_number(&op.operands[1]) {
2126                        state.tc = tc;
2127                    }
2128
2129                    let new_tlm =
2130                        multiply_matrix(&state.tlm, &[1.0, 0.0, 0.0, 1.0, 0.0, -state.tl]);
2131                    state.tlm = new_tlm;
2132                    state.tm = new_tlm;
2133
2134                    let fi = font_map.get(&state.font_name);
2135                    if let Some(text) =
2136                        extract_decoded_string_operand_with_font(&op.operands[2..], fi)
2137                    {
2138                        let char_w = state.font_size * APPROX_CHAR_WIDTH * (state.th / 100.0);
2139                        for (ch, is_ct_origin) in text.iter() {
2140                            push_glyph_positioned(
2141                                &mut chars,
2142                                &mut state,
2143                                page,
2144                                ch,
2145                                is_ct_origin,
2146                                char_w,
2147                            );
2148                        }
2149                    }
2150                }
2151            }
2152            _ => {}
2153        }
2154    }
2155
2156    chars
2157}
2158
2159/// Apply the current transformation matrix (CTM) to the text matrix position,
2160/// returning (x, y) in page/user space.
2161///
2162/// Mirrors the `compute_x` / `compute_y` logic in pdf-manip's `text_run.rs` so
2163/// that character positions emitted by `extract_chars_from_ops` use the same
2164/// coordinate space as `TextRun.x` / `TextRun.y` from `extract_text_runs`.
2165/// Without this, PDFs whose content streams set a non-identity CTM via `cm`
2166/// produce mismatched coordinates, breaking the spatial redaction fallback.
2167#[inline]
2168fn apply_ctm(state: &TextState) -> (f64, f64) {
2169    let x = state.ctm[0] * state.tm[4] + state.ctm[2] * state.tm[5] + state.ctm[4];
2170    let y = state.ctm[1] * state.tm[4] + state.ctm[3] * state.tm[5] + state.ctm[5];
2171    (x, y)
2172}
2173
2174/// Extract the first string operand, decoding via font's ToUnicode CMap if available.
2175fn extract_decoded_string_operand_with_font(
2176    operands: &[Object],
2177    font_info: Option<&FontInfo>,
2178) -> Option<DecodedPdfString> {
2179    for op in operands {
2180        if let Object::String(bytes, _) = op {
2181            return Some(decode_pdf_string_with_font_marked(bytes, font_info));
2182        }
2183    }
2184    None
2185}
2186
2187/// Convert a PDF object to a number (f64).
2188fn as_number(obj: &Object) -> Option<f64> {
2189    match obj {
2190        Object::Integer(i) => Some(*i as f64),
2191        Object::Real(f) => Some(*f as f64),
2192        _ => None,
2193    }
2194}
2195
2196/// Extract a 6-element transformation matrix from operands.
2197fn extract_matrix(operands: &[Object]) -> Option<[f64; 6]> {
2198    if operands.len() < 6 {
2199        return None;
2200    }
2201    let a = as_number(&operands[0])?;
2202    let b = as_number(&operands[1])?;
2203    let c = as_number(&operands[2])?;
2204    let d = as_number(&operands[3])?;
2205    let e = as_number(&operands[4])?;
2206    let f = as_number(&operands[5])?;
2207    Some([a, b, c, d, e, f])
2208}
2209
/// Multiply two affine transformation matrices stored as
/// `[a, b, c, d, e, f]`, i.e. the 3x3 matrix with rows `[a b 0]`,
/// `[c d 0]`, `[e f 1]`.
///
/// Returns the product `m1 × m2` in the same six-element form.
fn multiply_matrix(m1: &[f64; 6], m2: &[f64; 6]) -> [f64; 6] {
    let [a1, b1, c1, d1, e1, f1] = *m1;
    let [a2, b2, c2, d2, e2, f2] = *m2;
    [
        a1 * a2 + b1 * c2,
        a1 * b2 + b1 * d2,
        c1 * a2 + d1 * c2,
        c1 * b2 + d1 * d2,
        e1 * a2 + f1 * c2 + e2,
        e1 * b2 + f1 * d2 + f2,
    ]
}
2221
2222#[cfg(test)]
2223mod tests {
2224    use super::*;
2225    use lopdf::{dictionary, Document, Object, Stream};
2226
2227    /// Helper: create a minimal doc with text content.
2228    fn make_doc_with_text(content: &[u8]) -> Document {
2229        let mut doc = Document::with_version("1.7");
2230
2231        let content_stream = Stream::new(dictionary! {}, content.to_vec());
2232        let content_id = doc.add_object(Object::Stream(content_stream));
2233
2234        let page_dict = dictionary! {
2235            "Type" => "Page",
2236            "MediaBox" => vec![0.into(), 0.into(), 612.into(), 792.into()],
2237            "Contents" => Object::Reference(content_id),
2238        };
2239        let page_id = doc.add_object(Object::Dictionary(page_dict));
2240
2241        let pages_dict = dictionary! {
2242            "Type" => "Pages",
2243            "Kids" => vec![Object::Reference(page_id)],
2244            "Count" => 1_i64,
2245        };
2246        let pages_id = doc.add_object(Object::Dictionary(pages_dict));
2247
2248        if let Ok(Object::Dictionary(ref mut d)) = doc.get_object_mut(page_id) {
2249            d.set("Parent", Object::Reference(pages_id));
2250        }
2251
2252        let catalog = dictionary! {
2253            "Type" => "Catalog",
2254            "Pages" => Object::Reference(pages_id),
2255        };
2256        let catalog_id = doc.add_object(Object::Dictionary(catalog));
2257        doc.trailer.set("Root", Object::Reference(catalog_id));
2258
2259        doc
2260    }
2261
2262    #[test]
2263    fn extract_simple_text() {
2264        let doc = make_doc_with_text(b"BT /F1 12 Tf (Hello World) Tj ET");
2265        let blocks = extract_text(&doc);
2266        assert_eq!(blocks.len(), 1);
2267        assert_eq!(blocks[0].text, "Hello World");
2268        assert_eq!(blocks[0].page, 1);
2269        assert_eq!(blocks[0].font_size, 12.0);
2270    }
2271
2272    #[test]
2273    fn extract_page_text_single() {
2274        let doc = make_doc_with_text(b"BT /F1 12 Tf (Hello) Tj ET");
2275        let text = extract_page_text(&doc, 1).unwrap();
2276        assert_eq!(text, "Hello");
2277    }
2278
2279    #[test]
2280    fn extract_page_text_out_of_range() {
2281        let doc = make_doc_with_text(b"BT /F1 12 Tf (Hello) Tj ET");
2282        let result = extract_page_text(&doc, 5);
2283        assert!(result.is_err());
2284    }
2285
2286    #[test]
2287    fn extract_positioned_chars_basic() {
2288        let doc = make_doc_with_text(b"BT /F1 12 Tf (AB) Tj ET");
2289        let chars = extract_positioned_chars(&doc, 1).unwrap();
2290        assert_eq!(chars.len(), 2);
2291        assert_eq!(chars[0].ch, 'A');
2292        assert_eq!(chars[1].ch, 'B');
2293        assert_eq!(chars[0].page, 1);
2294        // Second char should be positioned after the first.
2295        assert!(chars[1].bbox[0] > chars[0].bbox[0]);
2296    }
2297
2298    #[test]
2299    fn extract_tj_array() {
2300        let doc = make_doc_with_text(b"BT /F1 12 Tf [(He) -100 (llo)] TJ ET");
2301        let blocks = extract_text(&doc);
2302        assert_eq!(blocks.len(), 1);
2303        assert_eq!(blocks[0].text, "Hello");
2304    }
2305
2306    #[test]
2307    fn empty_page_extracts_no_text() {
2308        let doc = make_doc_with_text(b"q Q");
2309        let blocks = extract_text(&doc);
2310        assert!(blocks.is_empty());
2311    }
2312
2313    #[test]
2314    fn multiline_text_extraction() {
2315        let doc = make_doc_with_text(b"BT /F1 12 Tf 12 TL (Line1) Tj T* (Line2) Tj ET");
2316        let blocks = extract_text(&doc);
2317        assert_eq!(blocks.len(), 2);
2318        assert_eq!(blocks[0].text, "Line1");
2319        assert_eq!(blocks[1].text, "Line2");
2320    }
2321
2322    /// `/ActualText` on a BDC marked-content sequence overrides the glyph
2323    /// reading of the inner Tj. Glyph text is preserved on `text`; the
2324    /// canonical reading lives on `actual_text`.
2325    #[test]
2326    fn actual_text_basic_bdc_override() {
2327        let doc = make_doc_with_text(b"BT /F1 12 Tf /Span <</ActualText (fi)>> BDC (X) Tj EMC ET");
2328        let blocks = extract_text(&doc);
2329        assert_eq!(blocks.len(), 1);
2330        assert_eq!(blocks[0].text, "X");
2331        assert_eq!(blocks[0].actual_text.as_deref(), Some("fi"));
2332    }
2333
2334    /// Nested BDCs: the innermost `/ActualText` wins inside its range, and
2335    /// the outer override is restored after the inner EMC. A flat flag would
2336    /// leak the inner value past its scope; the stack must pop one level.
2337    #[test]
2338    fn actual_text_nested_bdc_inner_overrides_outer() {
2339        let doc = make_doc_with_text(
2340            b"BT /F1 12 Tf \
2341              /Span <</ActualText (outer)>> BDC \
2342                (A) Tj \
2343                /Span <</ActualText (inner)>> BDC \
2344                  (B) Tj \
2345                EMC \
2346                (C) Tj \
2347              EMC ET",
2348        );
2349        let blocks = extract_text(&doc);
2350        assert_eq!(blocks.len(), 3);
2351        assert_eq!(blocks[0].text, "A");
2352        assert_eq!(blocks[0].actual_text.as_deref(), Some("outer"));
2353        assert_eq!(blocks[1].text, "B");
2354        assert_eq!(blocks[1].actual_text.as_deref(), Some("inner"));
2355        assert_eq!(blocks[2].text, "C");
2356        assert_eq!(blocks[2].actual_text.as_deref(), Some("outer"));
2357    }
2358
2359    /// BMC (no property list) still pushes a stack entry so the matching EMC
2360    /// pops the right level — otherwise an EMC inside a BDC range would leak.
2361    #[test]
2362    fn actual_text_bmc_does_not_leak_emc() {
2363        let doc = make_doc_with_text(
2364            b"BT /F1 12 Tf \
2365              /Span <</ActualText (X)>> BDC \
2366                /Artifact BMC (in) Tj EMC \
2367                (out) Tj \
2368              EMC \
2369              (after) Tj ET",
2370        );
2371        let blocks = extract_text(&doc);
2372        assert_eq!(blocks.len(), 3);
2373        assert_eq!(blocks[0].actual_text.as_deref(), Some("X"));
2374        assert_eq!(blocks[1].actual_text.as_deref(), Some("X"));
2375        assert_eq!(blocks[2].actual_text, None);
2376    }
2377
2378    /// Build a one-page Document whose page invokes a Form XObject (via
2379    /// `/Fm0 Do`) wrapped in a `BDC` / `EMC` pair. `xobj_content` is the
2380    /// content stream placed inside the Form XObject; `page_pre` runs
2381    /// before the `Do`, and `page_post` runs after it (still inside the
2382    /// surrounding `BDC` / `EMC`). Used to drive issue #1358 regression.
2383    fn make_doc_with_xobj_inside_bdc(
2384        page_pre: &[u8],
2385        xobj_content: &[u8],
2386        page_post: &[u8],
2387        actual_text: &str,
2388    ) -> Document {
2389        let mut doc = Document::with_version("1.7");
2390
2391        let xobj_dict = dictionary! {
2392            "Type" => "XObject",
2393            "Subtype" => "Form",
2394            "BBox" => vec![0.into(), 0.into(), 100.into(), 100.into()],
2395        };
2396        let xobj_stream = Stream::new(xobj_dict, xobj_content.to_vec());
2397        let xobj_id = doc.add_object(Object::Stream(xobj_stream));
2398
2399        // Page content: BT /F1 12 Tf /Span <</ActualText(...)>> BDC <pre> /Fm0 Do <post> EMC ET
2400        let mut page_content = Vec::<u8>::new();
2401        page_content.extend_from_slice(b"BT /F1 12 Tf /Span <</ActualText (");
2402        page_content.extend_from_slice(actual_text.as_bytes());
2403        page_content.extend_from_slice(b")>> BDC ");
2404        page_content.extend_from_slice(page_pre);
2405        page_content.extend_from_slice(b" /Fm0 Do ");
2406        page_content.extend_from_slice(page_post);
2407        page_content.extend_from_slice(b" EMC ET");
2408
2409        let content_stream = Stream::new(dictionary! {}, page_content);
2410        let content_id = doc.add_object(Object::Stream(content_stream));
2411
2412        let resources = dictionary! {
2413            "XObject" => dictionary! { "Fm0" => Object::Reference(xobj_id) },
2414        };
2415        let resources_id = doc.add_object(Object::Dictionary(resources));
2416
2417        let page_dict = dictionary! {
2418            "Type" => "Page",
2419            "MediaBox" => vec![0.into(), 0.into(), 612.into(), 792.into()],
2420            "Contents" => Object::Reference(content_id),
2421            "Resources" => Object::Reference(resources_id),
2422        };
2423        let page_id = doc.add_object(Object::Dictionary(page_dict));
2424
2425        let pages_dict = dictionary! {
2426            "Type" => "Pages",
2427            "Kids" => vec![Object::Reference(page_id)],
2428            "Count" => 1_i64,
2429        };
2430        let pages_id = doc.add_object(Object::Dictionary(pages_dict));
2431
2432        if let Ok(Object::Dictionary(ref mut d)) = doc.get_object_mut(page_id) {
2433            d.set("Parent", Object::Reference(pages_id));
2434        }
2435
2436        let catalog = dictionary! {
2437            "Type" => "Catalog",
2438            "Pages" => Object::Reference(pages_id),
2439        };
2440        let catalog_id = doc.add_object(Object::Dictionary(catalog));
2441        doc.trailer.set("Root", Object::Reference(catalog_id));
2442
2443        doc
2444    }
2445
2446    /// Regression guard for issue #1358: when a `Do` operator invoking a
2447    /// Form XObject sits inside a `BDC` / `EMC` pair carrying
2448    /// `/ActualText`, text emitted by the XObject's own `Tj`/`TJ`
2449    /// operators must inherit that `/ActualText` value. Previously the
2450    /// recursive `extract_blocks_from_ops_inner` reset the marked-content
2451    /// stack and lost the surrounding context.
2452    #[test]
2453    fn actual_text_propagates_into_form_xobject_recursion() {
2454        let doc = make_doc_with_xobj_inside_bdc(
2455            b"(pre) Tj",
2456            b"BT /F1 12 Tf (inside) Tj ET",
2457            b"(post) Tj",
2458            "wrapped",
2459        );
2460        let blocks = extract_text(&doc);
2461        assert_eq!(blocks.len(), 3);
2462        assert_eq!(blocks[0].text, "pre");
2463        assert_eq!(blocks[0].actual_text.as_deref(), Some("wrapped"));
2464        // The block emitted inside the Form XObject's BT/ET must inherit
2465        // the surrounding /ActualText — not silently become None.
2466        assert_eq!(blocks[1].text, "inside");
2467        assert_eq!(
2468            blocks[1].actual_text.as_deref(),
2469            Some("wrapped"),
2470            "Form XObject text lost surrounding /ActualText (issue #1358)"
2471        );
2472        assert_eq!(blocks[2].text, "post");
2473        assert_eq!(blocks[2].actual_text.as_deref(), Some("wrapped"));
2474    }
2475
2476    /// Inverse regression: when a `Do` invokes a Form XObject whose own
2477    /// content has its own `BDC` ... `EMC` pair with a different
2478    /// `/ActualText`, the inner pair takes precedence inside the XObject
2479    /// and the outer pair resumes after the inner `EMC`. This guards
2480    /// against the inherited binding clobbering legitimate inner
2481    /// marked-content state.
2482    #[test]
2483    fn actual_text_inner_xobj_bdc_overrides_inherited() {
2484        let doc = make_doc_with_xobj_inside_bdc(
2485            b"",
2486            b"BT /F1 12 Tf \
2487              /Span <</ActualText (inner)>> BDC (B) Tj EMC \
2488              (after) Tj ET",
2489            b"",
2490            "outer",
2491        );
2492        let blocks = extract_text(&doc);
2493        assert_eq!(blocks.len(), 2);
2494        assert_eq!(blocks[0].text, "B");
2495        assert_eq!(blocks[0].actual_text.as_deref(), Some("inner"));
2496        assert_eq!(blocks[1].text, "after");
2497        assert_eq!(blocks[1].actual_text.as_deref(), Some("outer"));
2498    }
2499
2500    /// `/ActualText` encoded as UTF-16BE (with BOM) decodes to the proper
2501    /// Unicode string. Real PDFs almost always use this form.
2502    #[test]
2503    fn actual_text_utf16be_bom_decodes() {
2504        // <FEFF00660069> = UTF-16BE BOM + "fi"
2505        let doc = make_doc_with_text(
2506            b"BT /F1 12 Tf /Span <</ActualText <FEFF00660069> >> BDC (X) Tj EMC ET",
2507        );
2508        let blocks = extract_text(&doc);
2509        assert_eq!(blocks.len(), 1);
2510        assert_eq!(blocks[0].actual_text.as_deref(), Some("fi"));
2511    }
2512
2513    // -- M4-LIG-01: ligature decomposition -------------------------------
2514
2515    #[test]
2516    fn ligature_ff_decomposes() {
2517        assert_eq!(decompose_ligature_char('\u{FB00}'), Some("ff"));
2518        assert_eq!(decompose_ligatures("\u{FB00}"), "ff");
2519        assert_eq!(decompose_ligatures("o\u{FB00}ice"), "office");
2520    }
2521
2522    #[test]
2523    fn ligature_fi_decomposes() {
2524        assert_eq!(decompose_ligature_char('\u{FB01}'), Some("fi"));
2525        assert_eq!(decompose_ligatures("\u{FB01}"), "fi");
2526        assert_eq!(decompose_ligatures("of\u{FB01}ce"), "office");
2527    }
2528
2529    #[test]
2530    fn ligature_fl_decomposes() {
2531        assert_eq!(decompose_ligature_char('\u{FB02}'), Some("fl"));
2532        assert_eq!(decompose_ligatures("\u{FB02}ame"), "flame");
2533    }
2534
2535    #[test]
2536    fn ligature_ffi_decomposes() {
2537        assert_eq!(decompose_ligature_char('\u{FB03}'), Some("ffi"));
2538        assert_eq!(decompose_ligatures("o\u{FB03}ce"), "office");
2539    }
2540
2541    #[test]
2542    fn ligature_ffl_decomposes() {
2543        assert_eq!(decompose_ligature_char('\u{FB04}'), Some("ffl"));
2544        assert_eq!(decompose_ligatures("ba\u{FB04}e"), "baffle");
2545    }
2546
2547    /// `st` has both U+FB05 (long-s + t, archaic) and U+FB06 (regular st);
2548    /// both decompose to "st" — historical spelling is not preserved since
2549    /// the goal is text-search/copy-paste, not typographic round-trip.
2550    /// Glyph name `st` also routes through this codepoint.
2551    #[test]
2552    fn ligature_st_decomposes() {
2553        assert_eq!(decompose_ligature_char('\u{FB06}'), Some("st"));
2554        assert_eq!(decompose_ligature_char('\u{FB05}'), Some("st"));
2555        assert_eq!(decompose_ligatures("fa\u{FB06}"), "fast");
2556        // Glyph name path: `st` → U+FB06 → "st"
2557        assert_eq!(glyph_name_to_unicode("st"), Some('\u{FB06}'));
2558    }
2559
2560    /// `ct` has no precomposed Unicode codepoint, so `glyph_name_to_unicode`
2561    /// reports `None`; the encoding-map builder records the slot in the
2562    /// `ct_codes` side-table instead. Decoding emits one internal marker
2563    /// scalar for that code, then the ligature layer expands only markers
2564    /// whose origin came from `ct_codes`. This avoids clobbering legitimate
2565    /// U+E007 values arriving through ToUnicode CMaps.
2566    #[test]
2567    fn ligature_ct_via_glyph_name_emits_string() {
2568        assert_eq!(glyph_name_to_unicode("ct"), None);
2569        // End-to-end: byte 1 → /ct → "ct" in extracted text.
2570        let doc = make_doc_with_ligature_font(b"BT /F1 12 Tf <6101> Tj ET", "ct");
2571        let blocks = extract_text(&doc);
2572        assert_eq!(blocks.len(), 1);
2573        assert_eq!(blocks[0].text, "act");
2574    }
2575
2576    /// Regression guard for issue #1357: a ToUnicode CMap that legitimately
2577    /// maps a code to U+E007 must NOT be silently rewritten to "ct". The
2578    /// previous PUA-sentinel implementation clobbered every U+E007 it saw,
2579    /// regardless of origin.
2580    #[test]
2581    fn tounicode_pua_e007_is_preserved() {
2582        let doc = make_doc_with_tounicode_cmap(b"BT /F1 12 Tf <01> Tj ET", 0x01, 0xE007);
2583        let blocks = extract_text(&doc);
2584        assert_eq!(blocks.len(), 1);
2585        assert_eq!(blocks[0].text, "\u{E007}");
2586        assert_ne!(blocks[0].text, "ct");
2587    }
2588
2589    /// A `/ct` glyph must advance the text matrix as one glyph even though
2590    /// it expands to two extracted characters. Otherwise following glyphs in
2591    /// the same show operator drift right by one extra glyph width.
2592    #[test]
2593    fn ligature_ct_positioned_chars_advance_as_single_glyph() {
2594        // <610162>: a (0x61), ct (0x01 via /Differences), b (0x62)
2595        let doc = make_doc_with_ligature_font(b"BT /F1 12 Tf <610162> Tj ET", "ct");
2596        let chars = extract_positioned_chars(&doc, 1).unwrap();
2597        let extracted: String = chars.iter().map(|c| c.ch).collect();
2598        assert_eq!(extracted, "actb");
2599        assert_eq!(chars.len(), 4);
2600
2601        let char_w = 12.0 * APPROX_CHAR_WIDTH;
2602        assert!((chars[1].bbox[0] - char_w).abs() < 1e-6);
2603        assert!((chars[3].bbox[0] - (2.0 * char_w)).abs() < 1e-6);
2604    }
2605
2606    /// Build a one-page Document with a font whose /Encoding /Differences
2607    /// remaps byte 1 to the given ligature glyph name. Used by the golden
2608    /// "office" fixture below to drive ligature decomposition end-to-end
2609    /// through the text extractor.
2610    fn make_doc_with_ligature_font(content: &[u8], glyph_name: &str) -> Document {
2611        let mut doc = Document::with_version("1.7");
2612
2613        let content_stream = Stream::new(dictionary! {}, content.to_vec());
2614        let content_id = doc.add_object(Object::Stream(content_stream));
2615
2616        let encoding = dictionary! {
2617            "Type" => "Encoding",
2618            "BaseEncoding" => "WinAnsiEncoding",
2619            "Differences" => vec![
2620                1_i64.into(),
2621                Object::Name(glyph_name.as_bytes().to_vec()),
2622            ],
2623        };
2624        let encoding_id = doc.add_object(Object::Dictionary(encoding));
2625
2626        let font = dictionary! {
2627            "Type" => "Font",
2628            "Subtype" => "Type1",
2629            "BaseFont" => "Helvetica",
2630            "Encoding" => Object::Reference(encoding_id),
2631        };
2632        let font_id = doc.add_object(Object::Dictionary(font));
2633
2634        let resources = dictionary! {
2635            "Font" => dictionary! { "F1" => Object::Reference(font_id) },
2636        };
2637        let resources_id = doc.add_object(Object::Dictionary(resources));
2638
2639        let page_dict = dictionary! {
2640            "Type" => "Page",
2641            "MediaBox" => vec![0.into(), 0.into(), 612.into(), 792.into()],
2642            "Contents" => Object::Reference(content_id),
2643            "Resources" => Object::Reference(resources_id),
2644        };
2645        let page_id = doc.add_object(Object::Dictionary(page_dict));
2646
2647        let pages_dict = dictionary! {
2648            "Type" => "Pages",
2649            "Kids" => vec![Object::Reference(page_id)],
2650            "Count" => 1_i64,
2651        };
2652        let pages_id = doc.add_object(Object::Dictionary(pages_dict));
2653
2654        if let Ok(Object::Dictionary(ref mut d)) = doc.get_object_mut(page_id) {
2655            d.set("Parent", Object::Reference(pages_id));
2656        }
2657
2658        let catalog = dictionary! {
2659            "Type" => "Catalog",
2660            "Pages" => Object::Reference(pages_id),
2661        };
2662        let catalog_id = doc.add_object(Object::Dictionary(catalog));
2663        doc.trailer.set("Root", Object::Reference(catalog_id));
2664
2665        doc
2666    }
2667
2668    /// Golden fixture: byte sequence `o + ffi + c + e` rendered through a
2669    /// font whose /Differences remaps byte 1 → /ffi → U+FB03. End-to-end
2670    /// the extractor should produce "office", not "o\u{FB03}ce". This is
2671    /// the single test that exercises the full pipeline: glyph→unicode →
2672    /// encoding map → decode → ligature decomposition.
2673    #[test]
2674    fn ligature_office_golden_ffi() {
2675        // Hex string <6F016365>: o (0x6F), ffi (0x01), c (0x63), e (0x65)
2676        let doc = make_doc_with_ligature_font(b"BT /F1 12 Tf <6F016365> Tj ET", "ffi");
2677        let blocks = extract_text(&doc);
2678        assert_eq!(blocks.len(), 1);
2679        assert_eq!(blocks[0].text, "office");
2680    }
2681
2682    /// Build a one-page Document whose font has a ToUnicode CMap mapping a
2683    /// single 1-byte source code to an explicit Unicode scalar. Used to
2684    /// drive the regression guard for issue #1357: a ToUnicode-supplied
2685    /// codepoint must round-trip through the extractor unchanged.
2686    fn make_doc_with_tounicode_cmap(content: &[u8], src: u8, dst: u32) -> Document {
2687        let mut doc = Document::with_version("1.7");
2688
2689        let content_stream = Stream::new(dictionary! {}, content.to_vec());
2690        let content_id = doc.add_object(Object::Stream(content_stream));
2691
2692        // Minimal ToUnicode CMap with a single bfchar mapping.
2693        let cmap_text = format!(
2694            "/CIDInit /ProcSet findresource begin\n\
2695             12 dict begin\n\
2696             begincmap\n\
2697             /CMapType 2 def\n\
2698             1 beginbfchar\n\
2699             <{:02X}> <{:04X}>\n\
2700             endbfchar\n\
2701             endcmap CMapName currentdict /CMap defineresource pop end end",
2702            src, dst
2703        );
2704        let cmap_stream = Stream::new(dictionary! {}, cmap_text.into_bytes());
2705        let cmap_id = doc.add_object(Object::Stream(cmap_stream));
2706
2707        let font = dictionary! {
2708            "Type" => "Font",
2709            "Subtype" => "Type1",
2710            "BaseFont" => "Helvetica",
2711            "Encoding" => "WinAnsiEncoding",
2712            "ToUnicode" => Object::Reference(cmap_id),
2713        };
2714        let font_id = doc.add_object(Object::Dictionary(font));
2715
2716        let resources = dictionary! {
2717            "Font" => dictionary! { "F1" => Object::Reference(font_id) },
2718        };
2719        let resources_id = doc.add_object(Object::Dictionary(resources));
2720
2721        let page_dict = dictionary! {
2722            "Type" => "Page",
2723            "MediaBox" => vec![0.into(), 0.into(), 612.into(), 792.into()],
2724            "Contents" => Object::Reference(content_id),
2725            "Resources" => Object::Reference(resources_id),
2726        };
2727        let page_id = doc.add_object(Object::Dictionary(page_dict));
2728
2729        let pages_dict = dictionary! {
2730            "Type" => "Pages",
2731            "Kids" => vec![Object::Reference(page_id)],
2732            "Count" => 1_i64,
2733        };
2734        let pages_id = doc.add_object(Object::Dictionary(pages_dict));
2735
2736        if let Ok(Object::Dictionary(ref mut d)) = doc.get_object_mut(page_id) {
2737            d.set("Parent", Object::Reference(pages_id));
2738        }
2739
2740        let catalog = dictionary! {
2741            "Type" => "Catalog",
2742            "Pages" => Object::Reference(pages_id),
2743        };
2744        let catalog_id = doc.add_object(Object::Dictionary(catalog));
2745        doc.trailer.set("Root", Object::Reference(catalog_id));
2746
2747        doc
2748    }
2749
2750    /// PositionedChar: a ligature glyph splits its bbox evenly across the
2751    /// constituent characters so character offsets stay within the glyph's
2752    /// horizontal footprint. The matrix advances exactly once per glyph,
2753    /// not once per constituent — so layout is preserved.
2754    #[test]
2755    fn ligature_positioned_chars_split_bbox() {
2756        let doc = make_doc_with_ligature_font(b"BT /F1 12 Tf <01> Tj ET", "ffi");
2757        let chars = extract_positioned_chars(&doc, 1).unwrap();
2758        assert_eq!(chars.len(), 3, "ffi → 3 chars");
2759        assert_eq!(chars[0].ch, 'f');
2760        assert_eq!(chars[1].ch, 'f');
2761        assert_eq!(chars[2].ch, 'i');
2762        // Bboxes should tile horizontally.
2763        assert!(chars[0].bbox[2] <= chars[1].bbox[0] + 1e-9);
2764        assert!(chars[1].bbox[2] <= chars[2].bbox[0] + 1e-9);
2765        // And total width should equal one glyph's char_w.
2766        let total = chars[2].bbox[2] - chars[0].bbox[0];
2767        let expected = 12.0 * APPROX_CHAR_WIDTH;
2768        assert!((total - expected).abs() < 1e-6);
2769    }
2770
2771    // -- M4-FOLLOWUP-03: PositionedChar decomposition + Tj bbox geometry ---
2772
2773    /// Regression guard for issue #1359 (problem 1): ligatures outside the
2774    /// hardcoded Latin set must decompose in PositionedChar output the same
2775    /// way they decompose in extracted text. Previously
2776    /// `push_glyph_positioned` only ran the hardcoded Latin map while
2777    /// extracted text additionally applied an NFKD fallback over
2778    /// FB00..FB4F, leaving counts misaligned for Armenian/Hebrew/Arabic
2779    /// presentation forms.
2780    #[test]
2781    fn positioned_chars_match_extracted_text_for_armenian_ligature() {
2782        // U+FB13 (Armenian small ligature men now) NFKDs to U+0574 U+0576.
2783        // The font's /Differences slot 1 maps to glyph name uniFB13.
2784        let doc = make_doc_with_ligature_font(b"BT /F1 12 Tf <01> Tj ET", "uniFB13");
2785        let blocks = extract_text(&doc);
2786        let chars = extract_positioned_chars(&doc, 1).unwrap();
2787        assert_eq!(blocks.len(), 1);
2788        // Both paths must agree on the constituent count.
2789        assert_eq!(
2790            blocks[0].text.chars().count(),
2791            chars.len(),
2792            "PositionedChar count drifted from extract_text() char count \
2793             (issue #1359 problem 1): blocks[0].text = {:?}, chars = {:?}",
2794            blocks[0].text,
2795            chars.iter().map(|c| c.ch).collect::<String>(),
2796        );
2797        // And on the actual characters.
2798        let chars_str: String = chars.iter().map(|c| c.ch).collect();
2799        assert_eq!(blocks[0].text, chars_str);
2800        // Total bbox span equals one glyph's char_w (the matrix advanced once).
2801        let total = chars.last().unwrap().bbox[2] - chars.first().unwrap().bbox[0];
2802        let expected = 12.0 * APPROX_CHAR_WIDTH;
2803        assert!((total - expected).abs() < 1e-6);
2804    }
2805
2806    /// Hebrew presentation forms may NFKD-map to a single base Hebrew
2807    /// codepoint. Those mappings are still semantic decompositions and must
2808    /// not be dropped just because the output has one scalar.
2809    #[test]
2810    fn hebrew_presentation_form_extracts_single_codepoint_nfkd_mapping() {
2811        // U+FB21 (Hebrew letter wide alef) NFKDs to U+05D0.
2812        let doc = make_doc_with_ligature_font(b"BT /F1 12 Tf <01> Tj ET", "uniFB21");
2813        let blocks = extract_text(&doc);
2814        assert_eq!(blocks.len(), 1);
2815        assert_eq!(blocks[0].text, "\u{05D0}");
2816        assert_eq!(
2817            decompose_glyph_to_string('\u{FB21}').as_deref(),
2818            Some("\u{05D0}")
2819        );
2820    }
2821
2822    /// The positioned-character stream must expose the same text characters
2823    /// as extract_text for single-codepoint Hebrew presentation-form
2824    /// decompositions.
2825    #[test]
2826    fn positioned_chars_match_extracted_text_for_hebrew_presentation_form() {
2827        let doc = make_doc_with_ligature_font(b"BT /F1 12 Tf <01> Tj ET", "uniFB21");
2828        let blocks = extract_text(&doc);
2829        let chars = extract_positioned_chars(&doc, 1).unwrap();
2830        let chars_str: String = chars.iter().map(|c| c.ch).collect();
2831
2832        assert_eq!(blocks.len(), 1);
2833        assert_eq!(blocks[0].text, "\u{05D0}");
2834        assert_eq!(chars_str, blocks[0].text);
2835        assert_eq!(blocks[0].text.chars().count(), chars.len());
2836    }
2837
2838    /// Regression guard for issue #1359 (problem 2): a Tj operator that
2839    /// emits a single ligature glyph must produce a TextBlock whose bbox
2840    /// width equals one rendered glyph's footprint, not the byte length
2841    /// of the decomposed display string. Previously `display_text.len()`
2842    /// over-counted because ligature expansion grows the string while
2843    /// the text matrix advances exactly once per source glyph.
2844    #[test]
2845    fn tj_block_width_matches_rendered_glyph_count() {
2846        let doc = make_doc_with_ligature_font(b"BT /F1 12 Tf <01> Tj ET", "ffi");
2847        let blocks = extract_text(&doc);
2848        assert_eq!(blocks.len(), 1);
2849        assert_eq!(blocks[0].text, "ffi");
2850        let width = blocks[0].bbox[2] - blocks[0].bbox[0];
2851        let expected = 12.0 * APPROX_CHAR_WIDTH;
2852        assert!(
2853            (width - expected).abs() < 1e-6,
2854            "Tj bbox width {} != one glyph's char_w {} (issue #1359 problem 2)",
2855            width,
2856            expected,
2857        );
2858    }
2859
2860    /// Cross-operator consistency: a `Tj` block and a `TJ` block emitting
2861    /// the same single-glyph ligature must produce TextBlocks with equal
2862    /// bbox widths. Pre-fix, Tj used `display_text.len()` (3 for "ffi")
2863    /// and TJ used `x_end - x_start` (1 glyph advance) — so the same input
2864    /// rendered as ~3× wider in Tj than in TJ.
2865    #[test]
2866    fn tj_and_tj_array_bbox_widths_agree_for_ligature() {
2867        let tj_doc = make_doc_with_ligature_font(b"BT /F1 12 Tf <01> Tj ET", "ffi");
2868        let tj_blocks = extract_text(&tj_doc);
2869        let tj_doc_arr = make_doc_with_ligature_font(b"BT /F1 12 Tf [<01>] TJ ET", "ffi");
2870        let tj_arr_blocks = extract_text(&tj_doc_arr);
2871        assert_eq!(tj_blocks.len(), 1);
2872        assert_eq!(tj_arr_blocks.len(), 1);
2873        let tj_w = tj_blocks[0].bbox[2] - tj_blocks[0].bbox[0];
2874        let tj_arr_w = tj_arr_blocks[0].bbox[2] - tj_arr_blocks[0].bbox[0];
2875        assert!(
2876            (tj_w - tj_arr_w).abs() < 1e-6,
2877            "Tj bbox width {} disagrees with TJ bbox width {} for the same \
2878             single-glyph ligature input (issue #1359 problem 2)",
2879            tj_w,
2880            tj_arr_w,
2881        );
2882    }
2883}
pdfluent_extract/text.rs

pdfluent_extract/
text.rs