fleischwolf_pdf/
textparse.rs

1//! Pure-Rust PDF text extraction (replacing pdfium's glyph layer).
2//!
3//! pdfium reports *rendered* glyph boxes, which diverge from docling's
4//! `docling-parse` C++ parser at exactly the points that drive conformance:
5//! generated spaces get a zero-width box, combining diacritics get a real-width
6//! box, and ligature/fraction glyphs land at different x. This module instead
7//! reconstructs each glyph's box from the **font's own advance widths** and the
8//! PDF text/graphics matrices — the same information docling-parse uses — so a
9//! space is as wide as the font says and a combining mark has zero advance.
10//!
11//! The output is the same [`Glyph`] stream pdfium produces (native PDF
12//! coordinates, y-up), fed straight into the existing docling-parse line
13//! sanitizer ([`crate::dp_lines`]). Only the digital text layer is handled here;
14//! pages without one still fall back to OCR upstream.
15
16use std::collections::HashMap;
17
18use lopdf::{Dictionary, Document, Object};
19
20use crate::pdfium_backend::Glyph;
21
/// Affine transform held as the six PDF matrix operands `[a b c d e f]`.
///
/// A point `(x, y)` is sent to `(a·x + c·y + e, b·x + d·y + f)`.
#[derive(Clone, Copy)]
struct Mat {
    a: f64,
    b: f64,
    c: f64,
    d: f64,
    e: f64,
    f: f64,
}

impl Mat {
    /// The identity transform (leaves every point where it is).
    const ID: Self = Self {
        a: 1.0,
        b: 0.0,
        c: 0.0,
        d: 1.0,
        e: 0.0,
        f: 0.0,
    };

    /// Concatenate two transforms: the result applies `self` first and `m`
    /// second.
    fn then(self, m: Mat) -> Mat {
        let Mat { a, b, c, d, e, f } = self;
        Mat {
            a: a * m.a + b * m.c,
            b: a * m.b + b * m.d,
            c: c * m.a + d * m.c,
            d: c * m.b + d * m.d,
            e: e * m.a + f * m.c + m.e,
            f: e * m.b + f * m.d + m.f,
        }
    }

    /// Transform the point `(x, y)`.
    fn apply(self, x: f64, y: f64) -> (f64, f64) {
        let Mat { a, b, c, d, e, f } = self;
        (a * x + c * y + e, b * x + d * y + f)
    }
}
62
/// A parsed font: how to turn raw string bytes into (unicode, advance) pairs.
///
/// Built by `parse_font` from one entry of a resources `/Font` dictionary.
struct Font {
    /// 2-byte codes (Type0 / Identity-H) vs 1-byte (simple fonts).
    two_byte: bool,
    /// code → Unicode string (from ToUnicode; may be multi-char, e.g. ligatures).
    to_unicode: HashMap<u32, String>,
    /// code → glyph advance, in 1000-unit glyph space.
    widths: HashMap<u32, f64>,
    /// Advance used for any code that has no entry in `widths`.
    default_width: f64,
    /// 1-byte fallback decoding when ToUnicode lacks a code (WinAnsi-ish).
    /// `None` for two-byte fonts.
    simple_encoding: Option<HashMap<u8, char>>,
    /// code → raw `/Differences` glyph name, for GID-style names (`g115`) that
    /// have no Unicode mapping. docling-parse emits these verbatim as `/g115`
    /// (see the redp5110 bulleted list); matching it keeps that text from
    /// being skipped.
    fallback_names: HashMap<u8, String>,
    /// code → char from the embedded Type1 font program's own `/Encoding` vector,
    /// used only as a last resort for glyphs the base encoding leaves unmapped
    /// (standard TeX math fonts: `λ`, `≤`, …).
    program_encoding: HashMap<u8, char>,
    /// FontDescriptor `/Ascent` (see `font_ascent_descent` for the defaults).
    /// NOTE(review): presumably in the same 1000-unit glyph space as `widths` — confirm.
    ascent: f64,
    /// FontDescriptor `/Descent`; same caveats as `ascent`.
    descent: f64,
    /// Hash of the font's resource name, as computed by `hash_name`.
    hash: u64,
}
86
87impl Font {
88    fn decode_code(&self, code: u32) -> (Option<String>, f64) {
89        let w = self
90            .widths
91            .get(&code)
92            .copied()
93            .unwrap_or(self.default_width);
94        if let Some(s) = self.to_unicode.get(&code) {
95            return (Some(decompose_ligatures(s)), w);
96        }
97        if !self.two_byte {
98            // A GID-style `/Differences` name (no Unicode) overrides the base
99            // encoding, matching docling's verbatim `/g115` fallback.
100            if let Some(name) = self.fallback_names.get(&(code as u8)) {
101                return (Some(format!("/{name}")), w);
102            }
103            if let Some(enc) = &self.simple_encoding {
104                if let Some(&ch) = enc.get(&(code as u8)) {
105                    return (Some(decompose_ligatures(&ch.to_string())), w);
106                }
107            }
108            // Last resort: the embedded Type1 font program's own `/Encoding`
109            // vector (`dup N /glyphname put`). Standard TeX math fonts (CMMI, CMSY,
110            // …) ship no PDF `/Encoding` and no ToUnicode, so a glyph like `λ`
111            // (CMMI code 21 → `/lambda`) or `≤` (CMSY code 20 → `/lessequal`) has
112            // no other mapping and would otherwise be silently dropped. docling
113            // recovers these from the same font program. This only fills codes the
114            // base encoding left unmapped, so it never changes an existing decode.
115            if let Some(&ch) = self.program_encoding.get(&(code as u8)) {
116                return (Some(decompose_ligatures(&ch.to_string())), w);
117            }
118        }
119        (None, w)
120    }
121}
122
/// Expand Latin presentation-form ligatures (U+FB00..=U+FB06: `ﬁ`→`fi`,
/// `ﬃ`→`ffi`, …) into their component letters, the way docling does, so
/// `configuration`/`difficult` don't keep the ligature glyph. Every other
/// character is copied through unchanged.
/// The chars share the ligature's box, so the line sanitizer recomposes them.
fn decompose_ligatures(s: &str) -> String {
    let mut expanded = String::with_capacity(s.len());
    for c in s.chars() {
        match c {
            '\u{FB00}' => expanded.push_str("ff"),
            '\u{FB01}' => expanded.push_str("fi"),
            '\u{FB02}' => expanded.push_str("fl"),
            '\u{FB03}' => expanded.push_str("ffi"),
            '\u{FB04}' => expanded.push_str("ffl"),
            '\u{FB05}' => expanded.push_str("ft"),
            '\u{FB06}' => expanded.push_str("st"),
            other => expanded.push(other),
        }
    }
    expanded
}
146
/// Hash a byte-string name with the standard library's `DefaultHasher`.
fn hash_name(name: &[u8]) -> u64 {
    use std::collections::hash_map::DefaultHasher;
    use std::hash::{Hash, Hasher};

    let mut hasher = DefaultHasher::new();
    Hash::hash(name, &mut hasher);
    hasher.finish()
}
153
154/// Resolve a possibly-indirect object to a dictionary.
155fn as_dict<'a>(doc: &'a Document, obj: &'a Object) -> Option<&'a Dictionary> {
156    match obj {
157        Object::Dictionary(d) => Some(d),
158        Object::Reference(id) => doc.get_object(*id).ok().and_then(|o| o.as_dict().ok()),
159        _ => None,
160    }
161}
162
163fn deref<'a>(doc: &'a Document, obj: &'a Object) -> Option<&'a Object> {
164    match obj {
165        Object::Reference(id) => doc.get_object(*id).ok(),
166        other => Some(other),
167    }
168}
169
170/// Parse one font dictionary into a [`Font`].
171fn parse_font(doc: &Document, name: &[u8], fdict: &Dictionary) -> Font {
172    let subtype: &[u8] = fdict
173        .get(b"Subtype")
174        .ok()
175        .and_then(|o| o.as_name().ok())
176        .unwrap_or(&[]);
177    let two_byte = subtype == b"Type0".as_slice();
178
179    let to_unicode = fdict
180        .get(b"ToUnicode")
181        .ok()
182        .and_then(|o| deref(doc, o))
183        .and_then(|o| o.as_stream().ok())
184        .and_then(|s| s.decompressed_content().ok())
185        .map(|data| parse_tounicode(&data))
186        .unwrap_or_default();
187
188    let (widths, default_width) = if two_byte {
189        cid_widths(doc, fdict)
190    } else {
191        simple_widths(doc, fdict)
192    };
193
194    let simple_encoding = if two_byte {
195        None
196    } else {
197        Some(simple_encoding_table(doc, fdict))
198    };
199    let fallback_names = if two_byte {
200        HashMap::new()
201    } else {
202        differences_gid_names(doc, fdict)
203    };
204    let program_encoding = if two_byte {
205        HashMap::new()
206    } else {
207        type1_program_encoding(doc, fdict)
208    };
209
210    let (ascent, descent) = font_ascent_descent(doc, fdict, two_byte);
211
212    Font {
213        two_byte,
214        to_unicode,
215        widths,
216        default_width,
217        simple_encoding,
218        fallback_names,
219        program_encoding,
220        ascent,
221        descent,
222        hash: hash_name(name),
223    }
224}
225
226/// Collect `/Differences` entries whose glyph name is a GID placeholder
227/// (`g115`, `cid42`, `glyph7`, `index9`) with no Unicode mapping. docling-parse
228/// emits such glyphs as the literal name `/g115`; mapping them here keeps the
229/// text from being silently dropped (subsetted fonts with no ToUnicode). The
230/// GID-name restriction keeps real Adobe glyph names on the normal path so this
231/// never invents garbage on the clean files.
232fn differences_gid_names(doc: &Document, fdict: &Dictionary) -> HashMap<u8, String> {
233    let mut map = HashMap::new();
234    let Some(Object::Dictionary(enc)) = fdict.get(b"Encoding").ok().and_then(|o| deref(doc, o))
235    else {
236        return map;
237    };
238    let Some(Object::Array(diffs)) = enc.get(b"Differences").ok().and_then(|o| deref(doc, o))
239    else {
240        return map;
241    };
242    let mut code = 0u8;
243    for el in diffs {
244        match el {
245            Object::Integer(i) => code = *i as u8,
246            Object::Name(name) => {
247                if glyph_name_to_char(name).is_none() && is_gid_name(name) {
248                    map.insert(code, String::from_utf8_lossy(name).into_owned());
249                }
250                code = code.wrapping_add(1);
251            }
252            _ => {}
253        }
254    }
255    map
256}
257
258/// Parse the embedded Type1 font program's built-in `/Encoding` vector
259/// (`dup <code> /<glyphname> put` entries in the clear-text header before
260/// `eexec`) into `code → char`. This is how docling recovers glyphs from
261/// standard TeX math fonts (CMMI/CMSY/…) that carry no PDF `/Encoding` and no
262/// ToUnicode — e.g. CMMI's `dup 21 /lambda` or CMSY's `dup 20 /lessequal`.
263/// Only `FontFile` (Type1) is parsed; CFF (`FontFile3`) and TrueType
264/// (`FontFile2`) store their encoding in a binary table and are left alone.
265fn type1_program_encoding(doc: &Document, fdict: &Dictionary) -> HashMap<u8, char> {
266    let mut map = HashMap::new();
267    let Some(desc) = fdict
268        .get(b"FontDescriptor")
269        .ok()
270        .and_then(|o| deref(doc, o))
271        .and_then(|o| o.as_dict().ok())
272    else {
273        return map;
274    };
275    let Some(data) = desc
276        .get(b"FontFile")
277        .ok()
278        .and_then(|o| deref(doc, o))
279        .and_then(|o| o.as_stream().ok())
280        .and_then(|s| s.decompressed_content().ok())
281    else {
282        return map;
283    };
284    // The clear-text header (PostScript) ends at `eexec`; the rest is encrypted.
285    let head_end = data
286        .windows(5)
287        .position(|w| w == b"eexec")
288        .unwrap_or(data.len());
289    let head = String::from_utf8_lossy(&data[..head_end]);
290    // Scan for `dup <code> /<name> put` tokens.
291    let toks: Vec<&str> = head.split_whitespace().collect();
292    for w in toks.windows(4) {
293        if w[0] == "dup" && w[3] == "put" {
294            if let (Ok(code), Some(name)) = (w[1].parse::<u32>(), w[2].strip_prefix('/')) {
295                if code <= 255 {
296                    if let Some(ch) = glyph_name_to_char(name.as_bytes()) {
297                        map.insert(code as u8, ch);
298                    }
299                }
300            }
301        }
302    }
303    map
304}
305
/// Whether `name` is a synthetic placeholder rather than a real Adobe glyph
/// name: `g115`, `cid42`, `glyph7`, `index9`, `G12`, or a short-prefix code name
/// like `SM590000` (IBM BookMaster). Such names carry no Unicode meaning, and
/// docling-parse emits them verbatim (`/SM590000`). `afii####` / `uni####` are
/// real Adobe names and are rejected, as is anything that is not valid UTF-8.
fn is_gid_name(name: &[u8]) -> bool {
    let Ok(s) = std::str::from_utf8(name) else {
        return false;
    };
    if ["afii", "uni"].iter().any(|real| s.starts_with(*real)) {
        return false;
    }

    let all_digits = |t: &str| t.bytes().all(|b| b.is_ascii_digit());

    // Known placeholder prefix followed by nothing but (at least one) digit.
    let known_placeholder = ["g", "G", "cid", "CID", "glyph", "index"]
        .iter()
        .filter_map(|prefix| s.strip_prefix(*prefix))
        .any(|rest| !rest.is_empty() && all_digits(rest));
    if known_placeholder {
        return true;
    }

    // Short alpha prefix (≤3 letters) followed by a run of ≥3 digits — synthetic
    // code names like `SM590000`, distinct from real Adobe names (whole words or
    // letter+`.suffix` variants).
    let split = s
        .bytes()
        .position(|b| !b.is_ascii_alphabetic())
        .unwrap_or(s.len());
    let (letters, tail) = s.split_at(split);
    (1..=3).contains(&letters.len()) && tail.len() >= 3 && all_digits(tail)
}
334
335fn font_ascent_descent(doc: &Document, fdict: &Dictionary, two_byte: bool) -> (f64, f64) {
336    // For Type0, the descriptor lives on the descendant CIDFont.
337    let descr_owner = if two_byte {
338        fdict
339            .get(b"DescendantFonts")
340            .ok()
341            .and_then(|o| deref(doc, o))
342            .and_then(|o| match o {
343                Object::Array(a) => a.first(),
344                _ => None,
345            })
346            .and_then(|o| as_dict(doc, o))
347    } else {
348        Some(fdict)
349    };
350    let fd = descr_owner
351        .and_then(|d| d.get(b"FontDescriptor").ok())
352        .and_then(|o| as_dict(doc, o));
353    let asc = fd
354        .and_then(|d| d.get(b"Ascent").ok())
355        .and_then(|o| {
356            o.as_float()
357                .ok()
358                .or_else(|| o.as_i64().ok().map(|i| i as f32))
359        })
360        .unwrap_or(750.0) as f64;
361    let desc = fd
362        .and_then(|d| d.get(b"Descent").ok())
363        .and_then(|o| {
364            o.as_float()
365                .ok()
366                .or_else(|| o.as_i64().ok().map(|i| i as f32))
367        })
368        .unwrap_or(-250.0) as f64;
369    // Some subsetted fonts carry a degenerate FontDescriptor (`/Ascent 0
370    // /Descent 0`) — the real metrics live in the font program. That collapses
371    // the loose box to zero height, so the line cells get zero area and the
372    // layout's region/text assignment drops them (2305's References list lost
373    // every prose line, keeping only the URLs). Fall back to typical text metrics
374    // so the box has height.
375    if asc - desc <= 1.0 {
376        return (750.0, -250.0);
377    }
378    (asc, desc)
379}
380
381/// Simple-font widths: `/FirstChar` + `/Widths` array, `/MissingWidth` default.
382fn simple_widths(doc: &Document, fdict: &Dictionary) -> (HashMap<u32, f64>, f64) {
383    let mut map = HashMap::new();
384    let first = fdict
385        .get(b"FirstChar")
386        .ok()
387        .and_then(|o| o.as_i64().ok())
388        .unwrap_or(0) as u32;
389    if let Some(Object::Array(arr)) = fdict.get(b"Widths").ok().and_then(|o| deref(doc, o)) {
390        for (i, w) in arr.iter().enumerate() {
391            if let Some(w) = num(w) {
392                map.insert(first + i as u32, w);
393            }
394        }
395    }
396    let dw = fdict
397        .get(b"FontDescriptor")
398        .ok()
399        .and_then(|o| as_dict(doc, o))
400        .and_then(|d| d.get(b"MissingWidth").ok())
401        .and_then(num)
402        .unwrap_or(0.0);
403    (map, dw)
404}
405
406/// CIDFont widths: the `/W` array on the descendant font (`/DW` default = 1000).
407fn cid_widths(doc: &Document, fdict: &Dictionary) -> (HashMap<u32, f64>, f64) {
408    let mut map = HashMap::new();
409    let Some(desc) = fdict
410        .get(b"DescendantFonts")
411        .ok()
412        .and_then(|o| deref(doc, o))
413        .and_then(|o| match o {
414            Object::Array(a) => a.first(),
415            _ => None,
416        })
417        .and_then(|o| as_dict(doc, o))
418    else {
419        return (map, 1000.0);
420    };
421    let dw = desc.get(b"DW").ok().and_then(num).unwrap_or(1000.0);
422    if let Some(Object::Array(w)) = desc.get(b"W").ok().and_then(|o| deref(doc, o)) {
423        let mut i = 0;
424        while i < w.len() {
425            let c = w.get(i).and_then(num);
426            match (c, w.get(i + 1)) {
427                // `c [w1 w2 ...]`: consecutive CIDs starting at c.
428                (Some(c), Some(Object::Array(list))) => {
429                    for (k, wv) in list.iter().enumerate() {
430                        if let Some(wv) = num(wv) {
431                            map.insert(c as u32 + k as u32, wv);
432                        }
433                    }
434                    i += 2;
435                }
436                // `c_first c_last w`: a run all of width w.
437                (Some(c1), Some(o2)) => {
438                    if let (Some(c2), Some(wv)) = (num(o2), w.get(i + 2).and_then(num)) {
439                        for cid in c1 as u32..=c2 as u32 {
440                            map.insert(cid, wv);
441                        }
442                    }
443                    i += 3;
444                }
445                _ => break,
446            }
447        }
448    }
449    (map, dw)
450}
451
452fn num(o: &Object) -> Option<f64> {
453    match o {
454        Object::Integer(i) => Some(*i as f64),
455        Object::Real(r) => Some(*r as f64),
456        _ => None,
457    }
458}
459
/// Parse a ToUnicode CMap's `bfchar` / `bfrange` sections into code→string.
///
/// Supported forms:
/// - `beginbfchar … endbfchar`: `<src> <dst>` pairs.
/// - `beginbfrange … endbfrange`: `<lo> <hi> [<d0> <d1> …]` (one destination
///   per code) and `<lo> <hi> <dst>` (consecutive destinations from a base).
///
/// Destinations are UTF-16BE. For `<lo> <hi> <dst>` with a destination of two
/// or more code units (a surrogate pair such as `<D835DC4E>`, or a
/// multi-character string such as `<00660069>`), the last unit is incremented
/// once per code, as the PDF specification prescribes. Previously such ranges
/// were folded into a single integer and silently dropped.
///
/// A single range expands to at most `MAX_RANGE_LEN` codes, so a malformed
/// `<0000> <FFFFFFFF> …` cannot stall the parser. Everything outside the two
/// section kinds is ignored.
fn parse_tounicode(data: &[u8]) -> HashMap<u32, String> {
    /// Upper bound on the codes one `bfrange` may expand to (the full 2-byte
    /// code space).
    const MAX_RANGE_LEN: u32 = 0x1_0000;

    let text = String::from_utf8_lossy(data);
    let mut map = HashMap::new();

    // `<hex>` token → big-endian 16-bit units. A lone trailing byte becomes
    // its own unit; anything that is not a `<…>` token yields `None`.
    let hex = |s: &str| -> Option<Vec<u16>> {
        let s = s.trim();
        if !s.starts_with('<') || !s.ends_with('>') {
            return None;
        }
        let h = &s[1..s.len() - 1];
        let bytes: Vec<u8> = (0..h.len())
            .step_by(2)
            .filter_map(|i| u8::from_str_radix(h.get(i..i + 2)?, 16).ok())
            .collect();
        Some(
            bytes
                .chunks(2)
                .map(|c| {
                    if c.len() == 2 {
                        u16::from_be_bytes([c[0], c[1]])
                    } else {
                        c[0] as u16
                    }
                })
                .collect(),
        )
    };
    let u16s_to_string = |u: &[u16]| String::from_utf16_lossy(u);
    // Source codes are folded into one integer, 16 bits per unit.
    let code_of = |u: &[u16]| u.iter().fold(0u32, |acc, &x| (acc << 16) | x as u32);

    // Tokenize by structure, not whitespace: CMap hex groups are often written
    // back-to-back with no separators (`<21><21><0054>`), so scan for `<…>`
    // groups, `[`/`]` brackets, and bareword keywords.
    let tokens: Vec<String> = {
        let bytes = text.as_bytes();
        let mut toks = Vec::new();
        let mut i = 0;
        while i < bytes.len() {
            let c = bytes[i];
            if c.is_ascii_whitespace() {
                i += 1;
            } else if c == b'<' {
                let start = i;
                while i < bytes.len() && bytes[i] != b'>' {
                    i += 1;
                }
                i += 1; // include '>'
                toks.push(String::from_utf8_lossy(&bytes[start..i.min(bytes.len())]).into_owned());
            } else if c == b'[' || c == b']' {
                toks.push((c as char).to_string());
                i += 1;
            } else {
                let start = i;
                while i < bytes.len()
                    && !bytes[i].is_ascii_whitespace()
                    && bytes[i] != b'<'
                    && bytes[i] != b'['
                    && bytes[i] != b']'
                {
                    i += 1;
                }
                toks.push(String::from_utf8_lossy(&bytes[start..i]).into_owned());
            }
        }
        toks
    };
    let tokens: Vec<&str> = tokens.iter().map(|s| s.as_str()).collect();
    let mut i = 0;
    while i < tokens.len() {
        match tokens[i] {
            "beginbfchar" => {
                i += 1;
                while i + 1 < tokens.len() && tokens[i] != "endbfchar" {
                    if let (Some(src), Some(dst)) = (hex(tokens[i]), hex(tokens[i + 1])) {
                        map.insert(code_of(&src), u16s_to_string(&dst));
                    }
                    i += 2;
                }
            }
            "beginbfrange" => {
                i += 1;
                while i + 2 < tokens.len() && tokens[i] != "endbfrange" {
                    let (Some(lo), Some(hi)) = (hex(tokens[i]), hex(tokens[i + 1])) else {
                        i += 1;
                        continue;
                    };
                    let lo = code_of(&lo);
                    // Bound the expansion so a bogus `hi` cannot loop for ages.
                    let hi = code_of(&hi).min(lo.saturating_add(MAX_RANGE_LEN - 1));
                    if tokens[i + 2] == "[" {
                        // `<lo> <hi> [ <d0> <d1> ... ]`: one dst per code in the range.
                        let mut j = i + 3;
                        let mut code = lo;
                        while j < tokens.len() && tokens[j] != "]" {
                            if let Some(dst) = hex(tokens[j]) {
                                map.insert(code, u16s_to_string(&dst));
                            }
                            code = code.wrapping_add(1);
                            j += 1;
                        }
                        i = j + 1;
                    } else if let Some(dst) = hex(tokens[i + 2]) {
                        if dst.len() >= 2 {
                            // Multi-unit destination (surrogate pair or several
                            // characters): only the last unit advances.
                            let mut units = dst;
                            let tail = units.len() - 1;
                            let first_tail = units[tail];
                            for (k, code) in (lo..=hi).enumerate() {
                                units[tail] = first_tail.wrapping_add(k as u16);
                                map.insert(code, u16s_to_string(&units));
                            }
                        } else {
                            // `<lo> <hi> <dst>`: consecutive Unicode from a base.
                            // `base` ≤ 0xFFFF and `k` < MAX_RANGE_LEN, so the
                            // sum cannot overflow.
                            let base = code_of(&dst);
                            for (k, code) in (lo..=hi).enumerate() {
                                if let Some(ch) = char::from_u32(base + k as u32) {
                                    map.insert(code, ch.to_string());
                                }
                            }
                        }
                        i += 3;
                    } else {
                        i += 1;
                    }
                }
            }
            _ => i += 1,
        }
    }
    map
}
579
580/// Decode a PDF string literal in a Tj/TJ operand into raw code units.
581fn codes(font: &Font, bytes: &[u8]) -> Vec<u32> {
582    if font.two_byte {
583        bytes
584            .chunks(2)
585            .map(|c| {
586                if c.len() == 2 {
587                    ((c[0] as u32) << 8) | c[1] as u32
588                } else {
589                    c[0] as u32
590                }
591            })
592            .collect()
593    } else {
594        bytes.iter().map(|&b| b as u32).collect()
595    }
596}
597
598/// Page size (width, height) in PDF points from the MediaBox.
599fn page_size(doc: &Document, page_id: lopdf::ObjectId) -> (f32, f32) {
600    let mb = doc
601        .get_object(page_id)
602        .ok()
603        .and_then(|o| o.as_dict().ok())
604        .and_then(|d| {
605            // MediaBox may be inherited; lopdf resolves via get_page... fall back to a guess.
606            d.get(b"MediaBox").ok().cloned()
607        })
608        .or_else(|| {
609            doc.get_dictionary(page_id)
610                .ok()
611                .and_then(|d| d.get(b"MediaBox").ok().cloned())
612        });
613    if let Some(Object::Array(a)) = mb {
614        let v: Vec<f32> = a.iter().filter_map(|o| num(o).map(|x| x as f32)).collect();
615        if v.len() == 4 {
616            return ((v[2] - v[0]).abs(), (v[3] - v[1]).abs());
617        }
618    }
619    (612.0, 792.0)
620}
621
622/// Debug: raw glyph stream `(ch, ll, lr, lb, lt)` (native coords) for page
623/// `index`, before the sanitizer. For comparing char cells to docling-parse.
624pub fn debug_glyphs(bytes: &[u8], index: usize) -> Vec<(char, f32, f32, f32, f32)> {
625    let Ok(doc) = Document::load_mem(bytes) else {
626        return Vec::new();
627    };
628    let mut pages: Vec<_> = doc.get_pages().into_iter().collect();
629    pages.sort_by_key(|(n, _)| *n);
630    let Some((_, pid)) = pages.get(index) else {
631        return Vec::new();
632    };
633    page_glyphs(&doc, *pid)
634        .into_iter()
635        .map(|g| (g.ch, g.ll, g.lr, g.lb, g.lt))
636        .collect()
637}
638
639/// Public entry: per-page (width, height, line cells) for a PDF, via the Rust
640/// text parser + the docling-parse line sanitizer. Used by the pipeline and the
641/// `textparse_dump` example.
642pub fn pdf_textlines(bytes: &[u8]) -> Vec<(f32, f32, Vec<crate::pdfium_backend::TextCell>)> {
643    let Ok(doc) = Document::load_mem(bytes) else {
644        return Vec::new();
645    };
646    let mut pages: Vec<_> = doc.get_pages().into_iter().collect();
647    pages.sort_by_key(|(n, _)| *n);
648    pages
649        .into_iter()
650        .map(|(_, pid)| {
651            let (w, h) = page_size(&doc, pid);
652            let glyphs = page_glyphs(&doc, pid);
653            let cells = crate::dp_lines::line_cells(&glyphs, h, true);
654            (w, h, cells)
655        })
656        .collect()
657}
658
659/// Debug/diagnostic entry: per-page (width, height, word cells) for a PDF, via
660/// the Rust parser glyphs run through the docling-parse word grouping. Used to
661/// compare parser word cells against docling-parse's `word_cells` oracle (roadmap
662/// item 6).
663pub fn pdf_words(bytes: &[u8]) -> Vec<(f32, f32, Vec<crate::pdfium_backend::TextCell>)> {
664    let Ok(doc) = Document::load_mem(bytes) else {
665        return Vec::new();
666    };
667    let mut pages: Vec<_> = doc.get_pages().into_iter().collect();
668    pages.sort_by_key(|(n, _)| *n);
669    pages
670        .into_iter()
671        .map(|(_, pid)| {
672            let (w, h) = page_size(&doc, pid);
673            let glyphs = page_glyphs(&doc, pid);
674            let cells = crate::dp_lines::word_cells(&glyphs, h, true);
675            (w, h, cells)
676        })
677        .collect()
678}
679
/// One page's text cells from the pure-Rust parser: prose line cells, per-word
/// cells, and code line cells — all from a single glyph parse. Replaces the
/// pdfium text path (roadmap item 6) when the parser drop is enabled.
///
/// Produced by `pdf_all_cells`; the `Default` value has all three lists empty.
#[derive(Default)]
pub struct PageParserCells {
    /// Line cells, as returned by `crate::dp_lines::line_and_word_cells`.
    pub prose: Vec<crate::pdfium_backend::TextCell>,
    /// Word cells, from the same `line_and_word_cells` call as `prose`.
    pub words: Vec<crate::pdfium_backend::TextCell>,
    /// Code cells, as returned by `crate::pdfium_backend::code_cells_from_glyphs`.
    pub code: Vec<crate::pdfium_backend::TextCell>,
}
689
690/// Full parser text layer: prose + word + code cells per page, glyphs parsed once.
691/// `prose`/`words` come from the docling-parse contraction ([`crate::dp_lines`]);
692/// `code` splits only at the parser's own space glyphs (monospace keeps its
693/// source spacing). Used by the pipeline to retire pdfium's text path.
694pub fn pdf_all_cells(bytes: &[u8]) -> Vec<PageParserCells> {
695    let Ok(doc) = Document::load_mem(bytes) else {
696        return Vec::new();
697    };
698    let mut pages: Vec<_> = doc.get_pages().into_iter().collect();
699    pages.sort_by_key(|(n, _)| *n);
700    pages
701        .into_iter()
702        .map(|(_, pid)| {
703            let (_w, h) = page_size(&doc, pid);
704            let glyphs = page_glyphs(&doc, pid);
705            let (prose, words) = crate::dp_lines::line_and_word_cells(&glyphs, h, true);
706            PageParserCells {
707                prose,
708                words,
709                code: crate::pdfium_backend::code_cells_from_glyphs(&glyphs, h),
710            }
711        })
712        .collect()
713}
714
/// Text-state scalars handed to a Form XObject when it is invoked via `Do`.
/// The PDF graphics state includes the text parameters, but not the text
/// matrices, which a form re-establishes inside its own `BT`/`ET`.
#[derive(Clone, Copy)]
struct TextState {
    /// Character spacing.
    tc: f64,
    /// Word spacing.
    tw: f64,
    /// Horizontal scale (Tz/100).
    th: f64,
    /// Leading.
    tl: f64,
    /// Text rise.
    trise: f64,
    /// Font size.
    fsize: f64,
}

impl TextState {
    /// Starting state for a page: everything zero except a horizontal scale
    /// of 1.
    const INIT: Self = Self {
        tc: 0.0,
        tw: 0.0,
        th: 1.0,
        tl: 0.0,
        trise: 0.0,
        fsize: 0.0,
    };
}
738
739/// The effective `/Resources` dictionary for a page (inline or via reference,
740/// falling back to an inherited one from a `/Parent`).
741fn page_res(doc: &Document, page_id: lopdf::ObjectId) -> Option<&Dictionary> {
742    let (inline, ids) = doc.get_page_resources(page_id).ok()?;
743    if let Some(d) = inline {
744        return Some(d);
745    }
746    ids.into_iter().find_map(|id| doc.get_dictionary(id).ok())
747}
748
749/// Build the code→[`Font`] map for a resources dictionary's `/Font` sub-dict.
750fn fonts_from_res(doc: &Document, res: &Dictionary) -> HashMap<Vec<u8>, Font> {
751    let mut map = HashMap::new();
752    let font_dict = res
753        .get(b"Font")
754        .ok()
755        .and_then(|o| deref(doc, o))
756        .and_then(|o| o.as_dict().ok());
757    if let Some(fd) = font_dict {
758        for (name, value) in fd.iter() {
759            if let Some(fdict) = deref(doc, value).and_then(|o| o.as_dict().ok()) {
760                map.insert(name.clone(), parse_font(doc, name, fdict));
761            }
762        }
763    }
764    map
765}
766
767/// Extract every glyph on a page as a native-coordinate [`Glyph`].
768pub(crate) fn page_glyphs(doc: &Document, page_id: lopdf::ObjectId) -> Vec<Glyph> {
769    let mut out = Vec::new();
770    let Ok(content_bytes) = doc.get_page_content(page_id) else {
771        return out;
772    };
773    let Ok(content) = lopdf::content::Content::decode(&content_bytes) else {
774        return out;
775    };
776    if let Some(res) = page_res(doc, page_id) {
777        run_content(doc, res, &content, Mat::ID, TextState::INIT, 0, &mut out);
778    }
779    out
780}
781
782/// Run a content stream's operators, emitting glyphs into `out`. Recurses into
783/// Form XObjects on `Do` (bulk body text in heavy PDFs lives inside a form, not
784/// the page content stream). `res` is the resources dict in scope (the page's,
785/// or the form's own); `base_ctm` is the CTM at the point of invocation.
786fn run_content(
787    doc: &Document,
788    res: &Dictionary,
789    content: &lopdf::content::Content,
790    base_ctm: Mat,
791    init: TextState,
792    depth: u32,
793    out: &mut Vec<Glyph>,
794) {
795    let fonts = fonts_from_res(doc, res);
796    let xobjects = res
797        .get(b"XObject")
798        .ok()
799        .and_then(|o| deref(doc, o))
800        .and_then(|o| o.as_dict().ok());
801
802    // Graphics + text state. `q`/`Q` save and restore the whole graphics state,
803    // which includes the text parameters (Tc, Tw, Tz, TL, Tfs, Trise, font) —
804    // *not* the text matrix (that is reset by BT). Saving only the CTM let a Tc
805    // set inside a `q…Q` block leak out and drift every later glyph.
806    #[allow(clippy::type_complexity)]
807    let mut gstate_stack: Vec<(Mat, f64, f64, f64, f64, f64, f64, Option<&Font>)> = Vec::new();
808    let mut ctm = base_ctm;
809    let mut tm = Mat::ID;
810    let mut tlm = Mat::ID;
811    let mut font: Option<&Font> = None;
812    let mut fsize = init.fsize;
813    let mut tc = init.tc; // char spacing
814    let mut tw = init.tw; // word spacing
815    let mut th = init.th; // horizontal scale (Tz/100)
816    let mut tl = init.tl; // leading
817    let mut trise = init.trise;
818
819    let op_f = |operands: &[Object], i: usize| operands.get(i).and_then(num).unwrap_or(0.0);
820
821    for op in &content.operations {
822        let operands = &op.operands;
823        match op.operator.as_str() {
824            "q" => gstate_stack.push((ctm, tc, tw, th, tl, trise, fsize, font)),
825            "Q" => {
826                if let Some((c, a, b, h, l, r, fs, f)) = gstate_stack.pop() {
827                    ctm = c;
828                    tc = a;
829                    tw = b;
830                    th = h;
831                    tl = l;
832                    trise = r;
833                    fsize = fs;
834                    font = f;
835                }
836            }
837            "cm" => {
838                let m = Mat {
839                    a: op_f(operands, 0),
840                    b: op_f(operands, 1),
841                    c: op_f(operands, 2),
842                    d: op_f(operands, 3),
843                    e: op_f(operands, 4),
844                    f: op_f(operands, 5),
845                };
846                ctm = m.then(ctm);
847            }
848            "BT" => {
849                tm = Mat::ID;
850                tlm = Mat::ID;
851            }
852            "ET" => {}
853            "Tf" => {
854                if let Some(Object::Name(n)) = operands.first() {
855                    font = fonts.get(n.as_slice());
856                }
857                fsize = op_f(operands, 1);
858            }
859            "Td" => {
860                tlm = Mat {
861                    a: 1.0,
862                    b: 0.0,
863                    c: 0.0,
864                    d: 1.0,
865                    e: op_f(operands, 0),
866                    f: op_f(operands, 1),
867                }
868                .then(tlm);
869                tm = tlm;
870            }
871            "TD" => {
872                tl = -op_f(operands, 1);
873                tlm = Mat {
874                    a: 1.0,
875                    b: 0.0,
876                    c: 0.0,
877                    d: 1.0,
878                    e: op_f(operands, 0),
879                    f: op_f(operands, 1),
880                }
881                .then(tlm);
882                tm = tlm;
883            }
884            "Tm" => {
885                tlm = Mat {
886                    a: op_f(operands, 0),
887                    b: op_f(operands, 1),
888                    c: op_f(operands, 2),
889                    d: op_f(operands, 3),
890                    e: op_f(operands, 4),
891                    f: op_f(operands, 5),
892                };
893                tm = tlm;
894            }
895            "T*" => {
896                tlm = Mat {
897                    a: 1.0,
898                    b: 0.0,
899                    c: 0.0,
900                    d: 1.0,
901                    e: 0.0,
902                    f: -tl,
903                }
904                .then(tlm);
905                tm = tlm;
906            }
907            "Tc" => tc = op_f(operands, 0),
908            "Tw" => tw = op_f(operands, 0),
909            "Tz" => th = op_f(operands, 0) / 100.0,
910            "TL" => tl = op_f(operands, 0),
911            "Ts" => trise = op_f(operands, 0),
912            "Tj" | "'" | "\"" => {
913                if op.operator == "'" || op.operator == "\"" {
914                    // move to next line first
915                    tlm = Mat {
916                        a: 1.0,
917                        b: 0.0,
918                        c: 0.0,
919                        d: 1.0,
920                        e: 0.0,
921                        f: -tl,
922                    }
923                    .then(tlm);
924                    tm = tlm;
925                }
926                if op.operator == "\"" {
927                    // `aw ac string "` sets word- and char-spacing before
928                    // showing the string (PDF 32000-1 §9.4.3), persisting after.
929                    tw = op_f(operands, 0);
930                    tc = op_f(operands, 1);
931                }
932                if let (Some(f), Some(Object::String(s, _))) = (font, operands.last()) {
933                    show_text(f, s, fsize, tc, tw, th, trise, &mut tm, ctm, out);
934                }
935            }
936            "TJ" => {
937                if let (Some(f), Some(Object::Array(arr))) = (font, operands.first()) {
938                    for el in arr {
939                        match el {
940                            Object::String(s, _) => {
941                                show_text(f, s, fsize, tc, tw, th, trise, &mut tm, ctm, out)
942                            }
943                            other => {
944                                if let Some(adj) = num(other) {
945                                    // negative number moves text right (PDF: subtract)
946                                    let tx = -adj / 1000.0 * fsize * th;
947                                    tm = Mat {
948                                        a: 1.0,
949                                        b: 0.0,
950                                        c: 0.0,
951                                        d: 1.0,
952                                        e: tx,
953                                        f: 0.0,
954                                    }
955                                    .then(tm);
956                                }
957                            }
958                        }
959                    }
960                }
961            }
962            "Do" => {
963                // Invoke a Form XObject: bulk body text in many PDFs lives inside
964                // a form, reached only here. Image XObjects are skipped (no text).
965                if depth >= 8 {
966                    continue;
967                }
968                let Some(Object::Name(n)) = operands.first() else {
969                    continue;
970                };
971                let stream = xobjects
972                    .and_then(|d| d.get(n.as_slice()).ok())
973                    .and_then(|o| deref(doc, o))
974                    .and_then(|o| o.as_stream().ok());
975                let Some(stream) = stream else { continue };
976                let is_form = stream
977                    .dict
978                    .get(b"Subtype")
979                    .ok()
980                    .and_then(|o| o.as_name().ok())
981                    == Some(b"Form".as_slice());
982                if !is_form {
983                    continue;
984                }
985                let Ok(data) = stream.decompressed_content() else {
986                    continue;
987                };
988                let Ok(form_content) = lopdf::content::Content::decode(&data) else {
989                    continue;
990                };
991                // The form's /Matrix maps form space into the CTM at invocation.
992                let form_mat = match stream.dict.get(b"Matrix").ok() {
993                    Some(Object::Array(a)) if a.len() == 6 => {
994                        let v: Vec<f64> = a.iter().filter_map(num).collect();
995                        if v.len() == 6 {
996                            Mat {
997                                a: v[0],
998                                b: v[1],
999                                c: v[2],
1000                                d: v[3],
1001                                e: v[4],
1002                                f: v[5],
1003                            }
1004                        } else {
1005                            Mat::ID
1006                        }
1007                    }
1008                    _ => Mat::ID,
1009                };
1010                // The form's own /Resources, falling back to the inherited ones.
1011                let form_res = stream
1012                    .dict
1013                    .get(b"Resources")
1014                    .ok()
1015                    .and_then(|o| deref(doc, o))
1016                    .and_then(|o| o.as_dict().ok())
1017                    .unwrap_or(res);
1018                let state = TextState {
1019                    tc,
1020                    tw,
1021                    th,
1022                    tl,
1023                    trise,
1024                    fsize,
1025                };
1026                run_content(
1027                    doc,
1028                    form_res,
1029                    &form_content,
1030                    form_mat.then(ctm),
1031                    state,
1032                    depth + 1,
1033                    out,
1034                );
1035            }
1036            _ => {}
1037        }
1038    }
1039}
1040
1041#[allow(clippy::too_many_arguments)]
1042fn show_text(
1043    font: &Font,
1044    bytes: &[u8],
1045    fsize: f64,
1046    tc: f64,
1047    tw: f64,
1048    th: f64,
1049    trise: f64,
1050    tm: &mut Mat,
1051    ctm: Mat,
1052    out: &mut Vec<Glyph>,
1053) {
1054    for code in codes(font, bytes) {
1055        let (text, w) = font.decode_code(code);
1056        let w0 = w / 1000.0; // advance in text-space (em) units
1057                             // The glyph→user transform: scale glyph space by font size, then Tm, CTM.
1058        let scale = Mat {
1059            a: fsize * th,
1060            b: 0.0,
1061            c: 0.0,
1062            d: fsize,
1063            e: 0.0,
1064            f: trise,
1065        };
1066        let trm = scale.then(*tm).then(ctm);
1067        // Box in glyph space (1000-unit em): x 0..w, y descent..ascent.
1068        let (x0, y0) = trm.apply(0.0, font.descent / 1000.0);
1069        let (x1, _y1) = trm.apply(w0, font.descent / 1000.0);
1070        let (_x2, y2) = trm.apply(0.0, font.ascent / 1000.0);
1071        let (left, right) = (x0.min(x1), x0.max(x1));
1072        let (bot, top) = (y0.min(y2), y0.max(y2));
1073        if let Some(s) = text {
1074            // A run may map one code to multiple chars (ligature/fraction); share box.
1075            for ch in s.chars() {
1076                if ch != '\u{0}' {
1077                    out.push(Glyph {
1078                        ch,
1079                        l: left as f32,
1080                        b: bot as f32,
1081                        r: right as f32,
1082                        t: top as f32,
1083                        ll: left as f32,
1084                        lb: bot as f32,
1085                        lr: right as f32,
1086                        lt: top as f32,
1087                        font: font.hash,
1088                    });
1089                }
1090            }
1091        }
1092        // Advance the text matrix. Word spacing applies to single-byte code 32.
1093        let is_space = !font.two_byte && code == 32;
1094        let tx = (w0 * fsize + tc + if is_space { tw } else { 0.0 }) * th;
1095        *tm = Mat {
1096            a: 1.0,
1097            b: 0.0,
1098            c: 0.0,
1099            d: 1.0,
1100            e: tx,
1101            f: 0.0,
1102        }
1103        .then(*tm);
1104    }
1105}
1106
1107/// Build a simple font's code→char table from its `/Encoding`: the base
1108/// encoding (WinAnsi / MacRoman) plus any `/Differences` overrides (glyph names
1109/// resolved through a small Adobe-glyph-name subset).
1110fn simple_encoding_table(doc: &Document, fdict: &Dictionary) -> HashMap<u8, char> {
1111    let enc = fdict.get(b"Encoding").ok().and_then(|o| deref(doc, o));
1112    let base_name = match enc {
1113        Some(Object::Name(n)) => n.clone(),
1114        Some(Object::Dictionary(d)) => d
1115            .get(b"BaseEncoding")
1116            .ok()
1117            .and_then(|o| o.as_name().ok())
1118            .map(|n| n.to_vec())
1119            .unwrap_or_default(),
1120        _ => Vec::new(),
1121    };
1122    let mut m = if base_name == b"MacRomanEncoding" {
1123        macroman_table()
1124    } else {
1125        winansi_table()
1126    };
1127    // Apply /Differences: `code /glyphname /glyphname ... code ...`.
1128    if let Some(Object::Dictionary(d)) = enc {
1129        if let Some(Object::Array(diffs)) = d.get(b"Differences").ok().and_then(|o| deref(doc, o)) {
1130            let mut code = 0u8;
1131            for el in diffs {
1132                match el {
1133                    Object::Integer(i) => code = *i as u8,
1134                    Object::Name(name) => {
1135                        if let Some(ch) = glyph_name_to_char(name) {
1136                            m.insert(code, ch);
1137                        }
1138                        code = code.wrapping_add(1);
1139                    }
1140                    _ => {}
1141                }
1142            }
1143        }
1144    }
1145    m
1146}
1147
/// Resolve an Adobe glyph name to Unicode: `uniXXXX`, single ASCII letters, the
/// digit/punctuation names from the Adobe Glyph List, and common typographic
/// names. A `.suffix` (`one.taboldstyle`, `a.sc`) is stripped and the base name
/// retried — docling renders these as the base character.
///
/// Returns `None` when `name` is not UTF-8, is not in the table, or is a
/// `uniXXXX` name whose value is not a Unicode scalar (a surrogate).
fn glyph_name_to_char(name: &[u8]) -> Option<char> {
    let s = std::str::from_utf8(name).ok()?;
    // `uniXXXX`: the first four characters after the prefix, read as hex. A
    // name that merely *starts* with "uni" (`union`) must fall through to the
    // table below, so a short or non-hex tail is not an error here.
    let uni_code_point = s
        .strip_prefix("uni")
        .and_then(|hex| hex.get(0..4))
        .and_then(|hex| u32::from_str_radix(hex, 16).ok());
    if let Some(cp) = uni_code_point {
        return char::from_u32(cp);
    }
    // Single ASCII letter names (`A`, `m`) map to themselves.
    if let [b] = s.as_bytes() {
        if b.is_ascii_alphabetic() {
            return Some(char::from(*b));
        }
    }
    let resolved = match s {
        "space" => ' ',
        "exclam" => '!',
        "quotedbl" => '"',
        "numbersign" => '#',
        "dollar" => '$',
        "percent" => '%',
        "ampersand" => '&',
        "quotesingle" => '\'',
        "parenleft" => '(',
        "parenright" => ')',
        "asterisk" => '*',
        "plus" => '+',
        "comma" => ',',
        "hyphen" => '-',
        "period" => '.',
        "slash" => '/',
        "zero" => '0',
        "one" => '1',
        "two" => '2',
        "three" => '3',
        "four" => '4',
        "five" => '5',
        "six" => '6',
        "seven" => '7',
        "eight" => '8',
        "nine" => '9',
        "colon" => ':',
        "semicolon" => ';',
        "less" => '<',
        "equal" => '=',
        "greater" => '>',
        "question" => '?',
        "at" => '@',
        "bracketleft" => '[',
        "backslash" => '\\',
        "bracketright" => ']',
        "asciicircum" => '^',
        "underscore" => '_',
        "grave" => '`',
        "braceleft" => '{',
        "bar" => '|',
        "braceright" => '}',
        "asciitilde" => '~',
        "bullet" => '\u{2022}',
        "periodcentered" => '\u{00B7}',
        "endash" => '\u{2013}',
        "emdash" => '\u{2014}',
        "quoteright" => '\u{2019}',
        "quoteleft" => '\u{2018}',
        "quotedblleft" => '\u{201C}',
        "quotedblright" => '\u{201D}',
        "quotedblbase" => '\u{201E}',
        "quotesinglbase" => '\u{201A}',
        // Latin f-ligatures named in `/Differences` (e.g. 2305's body font). These
        // map to the presentation-form code points, which `decompose_ligatures`
        // then spells back out (`ff`→"ff") — without them the glyph decodes to
        // nothing and the sanitizer fills the gap with a space (`di erences`).
        "ff" => '\u{FB00}',
        "fi" => '\u{FB01}',
        "fl" => '\u{FB02}',
        "ffi" => '\u{FB03}',
        "ffl" => '\u{FB04}',
        "ft" => '\u{FB05}',
        "st" => '\u{FB06}',
        "degree" => '\u{00B0}',
        "trademark" => '\u{2122}',
        "registered" => '\u{00AE}',
        "copyright" => '\u{00A9}',
        "ellipsis" => '\u{2026}',
        "minus" => '\u{2212}',
        "fraction" => '\u{2044}',
        "nbspace" => '\u{00A0}',
        // Greek + math glyph names (standard Adobe Glyph List). Standard TeX math
        // fonts (CMMI/CMSY/…) name their glyphs this way in the embedded font
        // program's `/Encoding`; without these a `λ`/`≤` decodes to nothing and is
        // dropped from body text (`and λ set to 0.5` → `and set to 0.5`).
        "alpha" => '\u{03B1}',
        "beta" => '\u{03B2}',
        "gamma" => '\u{03B3}',
        "delta" => '\u{03B4}',
        "epsilon" | "epsilon1" => '\u{03B5}',
        "zeta" => '\u{03B6}',
        "eta" => '\u{03B7}',
        "theta" | "theta1" => '\u{03B8}',
        "iota" => '\u{03B9}',
        "kappa" => '\u{03BA}',
        "lambda" => '\u{03BB}',
        "mu" => '\u{03BC}',
        "nu" => '\u{03BD}',
        "xi" => '\u{03BE}',
        "omicron" => '\u{03BF}',
        "pi" | "pi1" => '\u{03C0}',
        "rho" | "rho1" => '\u{03C1}',
        "sigma" => '\u{03C3}',
        "sigma1" => '\u{03C2}',
        "tau" => '\u{03C4}',
        "upsilon" => '\u{03C5}',
        "phi" | "phi1" => '\u{03C6}',
        "chi" => '\u{03C7}',
        "psi" => '\u{03C8}',
        "omega" | "omega1" => '\u{03C9}',
        "Gamma" => '\u{0393}',
        "Delta" => '\u{0394}',
        "Theta" => '\u{0398}',
        "Lambda" => '\u{039B}',
        "Xi" => '\u{039E}',
        "Pi" => '\u{03A0}',
        "Sigma" => '\u{03A3}',
        "Upsilon" => '\u{03A5}',
        "Phi" => '\u{03A6}',
        "Psi" => '\u{03A8}',
        "Omega" => '\u{03A9}',
        "lessequal" => '\u{2264}',
        "greaterequal" => '\u{2265}',
        "notequal" => '\u{2260}',
        "approxequal" => '\u{2248}',
        "equivalence" => '\u{2261}',
        "element" => '\u{2208}',
        "plusminus" => '\u{00B1}',
        "multiply" => '\u{00D7}',
        "divide" => '\u{00F7}',
        "infinity" => '\u{221E}',
        "partialdiff" => '\u{2202}',
        "gradient" => '\u{2207}',
        "summation" => '\u{2211}',
        "product" => '\u{220F}',
        "integral" => '\u{222B}',
        "radical" => '\u{221A}',
        "proportional" => '\u{221D}',
        "arrowright" => '\u{2192}',
        "arrowleft" => '\u{2190}',
        "arrowup" => '\u{2191}',
        "arrowdown" => '\u{2193}',
        "arrowboth" => '\u{2194}',
        "arrowdblright" => '\u{21D2}',
        "logicaland" => '\u{2227}',
        "logicalor" => '\u{2228}',
        "intersection" => '\u{2229}',
        "union" => '\u{222A}',
        "similar" => '\u{223C}',
        "congruent" => '\u{2245}',
        "dotmath" => '\u{22C5}',
        "asteriskmath" => '\u{2217}',
        _ => {
            // Strip an AGL `.suffix` (oldstyle/small-cap variant) and retry.
            // An empty base (`.notdef`) has nothing to retry with.
            return s
                .split_once('.')
                .filter(|(base, _)| !base.is_empty())
                .and_then(|(base, _)| glyph_name_to_char(base.as_bytes()));
        }
    };
    Some(resolved)
}
1322
/// Minimal WinAnsiEncoding table for simple fonts lacking ToUnicode.
///
/// Covers printable ASCII (0x20–0x7E), the Latin-1 range 0xA1–0xFF mapped to
/// the code point of the same value, and a handful of Windows-1252 points that
/// differ from Latin-1. Every other code is absent from the table.
fn winansi_table() -> HashMap<u8, char> {
    // Windows-1252 printable points that differ from Latin-1, plus nbsp.
    const CP1252_POINTS: [(u8, char); 9] = [
        (0x85, '\u{2026}'),
        (0x91, '\u{2018}'),
        (0x92, '\u{2019}'),
        (0x93, '\u{201C}'),
        (0x94, '\u{201D}'),
        (0x95, '\u{2022}'),
        (0x96, '\u{2013}'),
        (0x97, '\u{2014}'),
        (0xA0, '\u{00A0}'),
    ];

    // The two identity-mapped ranges first, then the explicit points on top.
    let mut table: HashMap<u8, char> = (0x20u8..=0x7e)
        .chain(0xA1u8..=0xFF)
        .map(|code| (code, char::from(code)))
        .collect();
    table.extend(CP1252_POINTS.iter().copied());
    table
}
1349
/// Minimal MacRomanEncoding: printable ASCII (0x20–0x7E) plus the high-range
/// points our corpus hits (notably 0xA5 = bullet, used as a list marker).
/// Every other code is absent from the table.
fn macroman_table() -> HashMap<u8, char> {
    const HIGH_RANGE: [(u8, char); 11] = [
        (0xA5, '\u{2022}'), // bullet
        (0xC9, '\u{2026}'), // ellipsis
        (0xCA, '\u{00A0}'), // non-breaking space
        (0xD0, '\u{2013}'), // endash
        (0xD1, '\u{2014}'), // emdash
        (0xD2, '\u{201C}'), // left double quote
        (0xD3, '\u{201D}'), // right double quote
        (0xD4, '\u{2018}'), // left single quote
        (0xD5, '\u{2019}'), // right single quote
        (0xDE, '\u{FB01}'), // fi ligature
        (0xDF, '\u{FB02}'), // fl ligature
    ];

    (0x20u8..=0x7e)
        .map(|code| (code, char::from(code)))
        .chain(HIGH_RANGE.iter().copied())
        .collect()
}
fleischwolf_pdf/textparse.rs

fleischwolf_pdf/
textparse.rs