fleischwolf_pdf/
textparse.rs

1//! Pure-Rust PDF text extraction (replacing pdfium's glyph layer).
2//!
3//! pdfium reports *rendered* glyph boxes, which diverge from docling's
4//! `docling-parse` C++ parser at exactly the points that drive conformance:
5//! generated spaces get a zero-width box, combining diacritics get a real-width
6//! box, and ligature/fraction glyphs land at different x. This module instead
7//! reconstructs each glyph's box from the **font's own advance widths** and the
8//! PDF text/graphics matrices — the same information docling-parse uses — so a
9//! space is as wide as the font says and a combining mark has zero advance.
10//!
11//! The output is the same [`Glyph`] stream pdfium produces (native PDF
12//! coordinates, y-up), fed straight into the existing docling-parse line
13//! sanitizer ([`crate::dp_lines`]). Only the digital text layer is handled here;
14//! pages without one still fall back to OCR upstream.
15
16use std::collections::HashMap;
17
18use lopdf::{Dictionary, Document, Object};
19
20use crate::pdfium_backend::Glyph;
21
/// A 2×3 affine matrix `[a b c d e f]` in row-vector form: a point `(x, y)`
/// is sent to `(a·x + c·y + e, b·x + d·y + f)`.
#[derive(Clone, Copy)]
struct Mat {
    a: f64,
    b: f64,
    c: f64,
    d: f64,
    e: f64,
    f: f64,
}

impl Mat {
    /// The identity transform (leaves every point where it is).
    const ID: Mat = Mat {
        a: 1.0,
        b: 0.0,
        c: 0.0,
        d: 1.0,
        e: 0.0,
        f: 0.0,
    };

    /// Compose two transforms: the result applies `self` first, then `m`
    /// (the matrix product `self × m` in row-vector convention).
    fn then(self, m: Mat) -> Mat {
        let Mat { a, b, c, d, e, f } = self;
        Mat {
            a: a * m.a + b * m.c,
            b: a * m.b + b * m.d,
            c: c * m.a + d * m.c,
            d: c * m.b + d * m.d,
            e: e * m.a + f * m.c + m.e,
            f: e * m.b + f * m.d + m.f,
        }
    }

    /// Transform the point `(x, y)` by this matrix.
    fn apply(self, x: f64, y: f64) -> (f64, f64) {
        let Mat { a, b, c, d, e, f } = self;
        let mapped_x = a * x + c * y + e;
        let mapped_y = b * x + d * y + f;
        (mapped_x, mapped_y)
    }
}
62
63/// A parsed font: how to turn raw string bytes into (unicode, advance) pairs.
64struct Font {
65    /// 2-byte codes (Type0 / Identity-H) vs 1-byte (simple fonts).
66    two_byte: bool,
67    /// code → Unicode string (from ToUnicode; may be multi-char, e.g. ligatures).
68    to_unicode: HashMap<u32, String>,
69    /// code → glyph advance, in 1000-unit glyph space.
70    widths: HashMap<u32, f64>,
71    default_width: f64,
72    /// 1-byte fallback decoding when ToUnicode lacks a code (WinAnsi-ish).
73    simple_encoding: Option<HashMap<u8, char>>,
74    /// code → raw `/Differences` glyph name, for GID-style names (`g115`) that
75    /// have no Unicode mapping. docling-parse emits these verbatim as `/g115`
76    /// (see the redp5110 bulleted list); matching it keeps no text skipped.
77    fallback_names: HashMap<u8, String>,
78    /// code → char from the embedded Type1 font program's own `/Encoding` vector,
79    /// used only as a last resort for glyphs the base encoding leaves unmapped
80    /// (standard TeX math fonts: `λ`, `≤`, …).
81    program_encoding: HashMap<u8, char>,
82    ascent: f64,
83    descent: f64,
84    hash: u64,
85}
86
87impl Font {
88    fn decode_code(&self, code: u32) -> (Option<String>, f64) {
89        let w = self
90            .widths
91            .get(&code)
92            .copied()
93            .unwrap_or(self.default_width);
94        if let Some(s) = self.to_unicode.get(&code) {
95            return (Some(decompose_ligatures(s)), w);
96        }
97        if !self.two_byte {
98            // A GID-style `/Differences` name (no Unicode) overrides the base
99            // encoding, matching docling's verbatim `/g115` fallback.
100            if let Some(name) = self.fallback_names.get(&(code as u8)) {
101                return (Some(format!("/{name}")), w);
102            }
103            if let Some(enc) = &self.simple_encoding {
104                if let Some(&ch) = enc.get(&(code as u8)) {
105                    return (Some(decompose_ligatures(&ch.to_string())), w);
106                }
107            }
108            // Last resort: the embedded Type1 font program's own `/Encoding`
109            // vector (`dup N /glyphname put`). Standard TeX math fonts (CMMI, CMSY,
110            // …) ship no PDF `/Encoding` and no ToUnicode, so a glyph like `λ`
111            // (CMMI code 21 → `/lambda`) or `≤` (CMSY code 20 → `/lessequal`) has
112            // no other mapping and would otherwise be silently dropped. docling
113            // recovers these from the same font program. This only fills codes the
114            // base encoding left unmapped, so it never changes an existing decode.
115            if let Some(&ch) = self.program_encoding.get(&(code as u8)) {
116                return (Some(decompose_ligatures(&ch.to_string())), w);
117            }
118        }
119        (None, w)
120    }
121}
122
/// Expand Latin presentation-form ligatures (U+FB00–U+FB06: `ﬁ`→`fi`,
/// `ﬃ`→`ffi`, …) into their component letters, the way docling does, so
/// words like `configuration`/`difficult` don't keep the ligature glyph.
/// Every other character is copied through unchanged. The expanded chars
/// share the ligature's box, so the line sanitizer recomposes them.
fn decompose_ligatures(s: &str) -> String {
    let mut spelled = String::with_capacity(s.len());
    for ch in s.chars() {
        match ch {
            '\u{FB00}' => spelled.push_str("ff"),
            '\u{FB01}' => spelled.push_str("fi"),
            '\u{FB02}' => spelled.push_str("fl"),
            '\u{FB03}' => spelled.push_str("ffi"),
            '\u{FB04}' => spelled.push_str("ffl"),
            '\u{FB05}' => spelled.push_str("ft"),
            '\u{FB06}' => spelled.push_str("st"),
            other => spelled.push(other),
        }
    }
    spelled
}
146
/// Hash a byte-string name with the standard library's `DefaultHasher`.
fn hash_name(name: &[u8]) -> u64 {
    let mut hasher = std::collections::hash_map::DefaultHasher::new();
    std::hash::Hash::hash(name, &mut hasher);
    std::hash::Hasher::finish(&hasher)
}
153
154/// Resolve a possibly-indirect object to a dictionary.
155fn as_dict<'a>(doc: &'a Document, obj: &'a Object) -> Option<&'a Dictionary> {
156    match obj {
157        Object::Dictionary(d) => Some(d),
158        Object::Reference(id) => doc.get_object(*id).ok().and_then(|o| o.as_dict().ok()),
159        _ => None,
160    }
161}
162
163fn deref<'a>(doc: &'a Document, obj: &'a Object) -> Option<&'a Object> {
164    match obj {
165        Object::Reference(id) => doc.get_object(*id).ok(),
166        other => Some(other),
167    }
168}
169
170/// Parse one font dictionary into a [`Font`].
171fn parse_font(doc: &Document, name: &[u8], fdict: &Dictionary) -> Font {
172    let subtype: &[u8] = fdict
173        .get(b"Subtype")
174        .ok()
175        .and_then(|o| o.as_name().ok())
176        .unwrap_or(&[]);
177    let two_byte = subtype == b"Type0".as_slice();
178
179    let to_unicode = fdict
180        .get(b"ToUnicode")
181        .ok()
182        .and_then(|o| deref(doc, o))
183        .and_then(|o| o.as_stream().ok())
184        .and_then(|s| s.decompressed_content().ok())
185        .map(|data| parse_tounicode(&data))
186        .unwrap_or_default();
187
188    let (widths, default_width) = if two_byte {
189        cid_widths(doc, fdict)
190    } else {
191        simple_widths(doc, fdict)
192    };
193
194    let simple_encoding = if two_byte {
195        None
196    } else {
197        Some(simple_encoding_table(doc, fdict))
198    };
199    let fallback_names = if two_byte {
200        HashMap::new()
201    } else {
202        differences_gid_names(doc, fdict)
203    };
204    let program_encoding = if two_byte {
205        HashMap::new()
206    } else {
207        type1_program_encoding(doc, fdict)
208    };
209
210    let (ascent, descent) = font_ascent_descent(doc, fdict, two_byte);
211
212    Font {
213        two_byte,
214        to_unicode,
215        widths,
216        default_width,
217        simple_encoding,
218        fallback_names,
219        program_encoding,
220        ascent,
221        descent,
222        hash: hash_name(name),
223    }
224}
225
226/// Collect `/Differences` entries whose glyph name is a GID placeholder
227/// (`g115`, `cid42`, `glyph7`, `index9`) with no Unicode mapping. docling-parse
228/// emits such glyphs as the literal name `/g115`; mapping them here keeps the
229/// text from being silently dropped (subsetted fonts with no ToUnicode). The
230/// GID-name restriction keeps real Adobe glyph names on the normal path so this
231/// never invents garbage on the clean files.
232fn differences_gid_names(doc: &Document, fdict: &Dictionary) -> HashMap<u8, String> {
233    let mut map = HashMap::new();
234    let Some(Object::Dictionary(enc)) = fdict.get(b"Encoding").ok().and_then(|o| deref(doc, o))
235    else {
236        return map;
237    };
238    let Some(Object::Array(diffs)) = enc.get(b"Differences").ok().and_then(|o| deref(doc, o))
239    else {
240        return map;
241    };
242    let mut code = 0u8;
243    for el in diffs {
244        match el {
245            Object::Integer(i) => code = *i as u8,
246            Object::Name(name) => {
247                if glyph_name_to_char(name).is_none() && is_gid_name(name) {
248                    map.insert(code, String::from_utf8_lossy(name).into_owned());
249                }
250                code = code.wrapping_add(1);
251            }
252            _ => {}
253        }
254    }
255    map
256}
257
258/// Parse the embedded Type1 font program's built-in `/Encoding` vector
259/// (`dup <code> /<glyphname> put` entries in the clear-text header before
260/// `eexec`) into `code → char`. This is how docling recovers glyphs from
261/// standard TeX math fonts (CMMI/CMSY/…) that carry no PDF `/Encoding` and no
262/// ToUnicode — e.g. CMMI's `dup 21 /lambda` or CMSY's `dup 20 /lessequal`.
263/// Only `FontFile` (Type1) is parsed; CFF (`FontFile3`) and TrueType
264/// (`FontFile2`) store their encoding in a binary table and are left alone.
265fn type1_program_encoding(doc: &Document, fdict: &Dictionary) -> HashMap<u8, char> {
266    let mut map = HashMap::new();
267    let Some(desc) = fdict
268        .get(b"FontDescriptor")
269        .ok()
270        .and_then(|o| deref(doc, o))
271        .and_then(|o| o.as_dict().ok())
272    else {
273        return map;
274    };
275    let Some(data) = desc
276        .get(b"FontFile")
277        .ok()
278        .and_then(|o| deref(doc, o))
279        .and_then(|o| o.as_stream().ok())
280        .and_then(|s| s.decompressed_content().ok())
281    else {
282        return map;
283    };
284    // The clear-text header (PostScript) ends at `eexec`; the rest is encrypted.
285    let head_end = data
286        .windows(5)
287        .position(|w| w == b"eexec")
288        .unwrap_or(data.len());
289    let head = String::from_utf8_lossy(&data[..head_end]);
290    // Scan for `dup <code> /<name> put` tokens.
291    let toks: Vec<&str> = head.split_whitespace().collect();
292    for w in toks.windows(4) {
293        if w[0] == "dup" && w[3] == "put" {
294            if let (Ok(code), Some(name)) = (w[1].parse::<u32>(), w[2].strip_prefix('/')) {
295                if code <= 255 {
296                    if let Some(ch) = glyph_name_to_char(name.as_bytes()) {
297                        map.insert(code as u8, ch);
298                    }
299                }
300            }
301        }
302    }
303    map
304}
305
/// Whether a glyph name is a synthetic placeholder rather than a real Adobe
/// name: `g115`, `cid42`, `glyph7`, `index9`, `G12`, or a short-prefix code
/// name like `SM590000` (IBM BookMaster). Such names carry no Unicode meaning,
/// and docling-parse emits them verbatim (`/SM590000`). `afii####` /
/// `uni####` are real Adobe names and are excluded, as is anything that is
/// not valid UTF-8. The restriction keeps genuine glyph names on the Unicode
/// path.
fn is_gid_name(name: &[u8]) -> bool {
    let Ok(text) = std::str::from_utf8(name) else {
        return false;
    };
    if ["afii", "uni"].iter().any(|p| text.starts_with(*p)) {
        return false;
    }

    let all_digits = |t: &str| t.bytes().all(|b| b.is_ascii_digit());

    // A well-known placeholder prefix followed by nothing but digits.
    let has_known_prefix = ["g", "G", "cid", "CID", "glyph", "index"]
        .iter()
        .any(|p| match text.strip_prefix(*p) {
            Some(rest) => !rest.is_empty() && all_digits(rest),
            None => false,
        });
    if has_known_prefix {
        return true;
    }

    // Short alpha prefix (≤3 letters) followed by a run of ≥3 digits — synthetic
    // code names like `SM590000`, distinct from real Adobe names (whole words or
    // letter+`.suffix` variants). The split point follows only ASCII letters,
    // so it is always a character boundary.
    let split = text
        .bytes()
        .position(|b| !b.is_ascii_alphabetic())
        .unwrap_or(text.len());
    let (letters, tail) = text.split_at(split);
    matches!(letters.len(), 1..=3) && tail.len() >= 3 && all_digits(tail)
}
334
335fn font_ascent_descent(doc: &Document, fdict: &Dictionary, two_byte: bool) -> (f64, f64) {
336    // For Type0, the descriptor lives on the descendant CIDFont.
337    let descr_owner = if two_byte {
338        fdict
339            .get(b"DescendantFonts")
340            .ok()
341            .and_then(|o| deref(doc, o))
342            .and_then(|o| match o {
343                Object::Array(a) => a.first(),
344                _ => None,
345            })
346            .and_then(|o| as_dict(doc, o))
347    } else {
348        Some(fdict)
349    };
350    let fd = descr_owner
351        .and_then(|d| d.get(b"FontDescriptor").ok())
352        .and_then(|o| as_dict(doc, o));
353    let asc = fd
354        .and_then(|d| d.get(b"Ascent").ok())
355        .and_then(|o| {
356            o.as_float()
357                .ok()
358                .or_else(|| o.as_i64().ok().map(|i| i as f32))
359        })
360        .unwrap_or(750.0) as f64;
361    let desc = fd
362        .and_then(|d| d.get(b"Descent").ok())
363        .and_then(|o| {
364            o.as_float()
365                .ok()
366                .or_else(|| o.as_i64().ok().map(|i| i as f32))
367        })
368        .unwrap_or(-250.0) as f64;
369    // Some subsetted fonts carry a degenerate FontDescriptor (`/Ascent 0
370    // /Descent 0`) — the real metrics live in the font program. That collapses
371    // the loose box to zero height, so the line cells get zero area and the
372    // layout's region/text assignment drops them (2305's References list lost
373    // every prose line, keeping only the URLs). Fall back to typical text metrics
374    // so the box has height.
375    if asc - desc <= 1.0 {
376        return (750.0, -250.0);
377    }
378    (asc, desc)
379}
380
381/// Simple-font widths: `/FirstChar` + `/Widths` array, `/MissingWidth` default.
382fn simple_widths(doc: &Document, fdict: &Dictionary) -> (HashMap<u32, f64>, f64) {
383    let mut map = HashMap::new();
384    let first = fdict
385        .get(b"FirstChar")
386        .ok()
387        .and_then(|o| o.as_i64().ok())
388        .unwrap_or(0) as u32;
389    if let Some(Object::Array(arr)) = fdict.get(b"Widths").ok().and_then(|o| deref(doc, o)) {
390        for (i, w) in arr.iter().enumerate() {
391            if let Some(w) = num(w) {
392                map.insert(first + i as u32, w);
393            }
394        }
395    }
396    let dw = fdict
397        .get(b"FontDescriptor")
398        .ok()
399        .and_then(|o| as_dict(doc, o))
400        .and_then(|d| d.get(b"MissingWidth").ok())
401        .and_then(num)
402        .unwrap_or(0.0);
403    (map, dw)
404}
405
406/// CIDFont widths: the `/W` array on the descendant font (`/DW` default = 1000).
407fn cid_widths(doc: &Document, fdict: &Dictionary) -> (HashMap<u32, f64>, f64) {
408    let mut map = HashMap::new();
409    let Some(desc) = fdict
410        .get(b"DescendantFonts")
411        .ok()
412        .and_then(|o| deref(doc, o))
413        .and_then(|o| match o {
414            Object::Array(a) => a.first(),
415            _ => None,
416        })
417        .and_then(|o| as_dict(doc, o))
418    else {
419        return (map, 1000.0);
420    };
421    let dw = desc.get(b"DW").ok().and_then(num).unwrap_or(1000.0);
422    if let Some(Object::Array(w)) = desc.get(b"W").ok().and_then(|o| deref(doc, o)) {
423        let mut i = 0;
424        while i < w.len() {
425            let c = w.get(i).and_then(num);
426            match (c, w.get(i + 1)) {
427                // `c [w1 w2 ...]`: consecutive CIDs starting at c.
428                (Some(c), Some(Object::Array(list))) => {
429                    for (k, wv) in list.iter().enumerate() {
430                        if let Some(wv) = num(wv) {
431                            map.insert(c as u32 + k as u32, wv);
432                        }
433                    }
434                    i += 2;
435                }
436                // `c_first c_last w`: a run all of width w.
437                (Some(c1), Some(o2)) => {
438                    if let (Some(c2), Some(wv)) = (num(o2), w.get(i + 2).and_then(num)) {
439                        for cid in c1 as u32..=c2 as u32 {
440                            map.insert(cid, wv);
441                        }
442                    }
443                    i += 3;
444                }
445                _ => break,
446            }
447        }
448    }
449    (map, dw)
450}
451
452fn num(o: &Object) -> Option<f64> {
453    match o {
454        Object::Integer(i) => Some(*i as f64),
455        Object::Real(r) => Some(*r as f64),
456        _ => None,
457    }
458}
459
/// Parse a ToUnicode CMap's `bfchar` / `bfrange` sections into code→string.
///
/// Supported entries:
/// - `beginbfchar`: `<src> <dst>` pairs;
/// - `beginbfrange`: `<lo> <hi> [<d0> <d1> ...]` (one destination per code)
///   and `<lo> <hi> <dst>` (consecutive destinations from a base).
///
/// Destinations are big-endian UTF-16. For `<lo> <hi> <dst>` with a
/// multi-unit destination (a surrogate pair, or a multi-character string such
/// as a ligature), successive codes increment the *last* UTF-16 unit.
/// Folding all units into one scalar would drop these mappings, because the
/// folded value is not a valid code point.
///
/// Malformed input is tolerated rather than reported: unparsable tokens are
/// skipped, and one `<lo> <hi> <dst>` line expands to at most `0x10000` codes
/// so a hostile range cannot exhaust memory.
fn parse_tounicode(data: &[u8]) -> HashMap<u32, String> {
    // Largest `hi - lo` expanded for one `<lo> <hi> <dst>` line. Character
    // codes decoded by this module are at most two bytes wide, so a wider
    // span can only come from a malformed (or hostile) CMap.
    const MAX_RANGE_SPAN: u32 = 0xFFFF;

    // Decode a `<hex>` token into big-endian 16-bit units; a trailing odd
    // byte becomes a unit of its own. `None` if the token is not `<...>`.
    fn hex(s: &str) -> Option<Vec<u16>> {
        let s = s.trim();
        if !s.starts_with('<') || !s.ends_with('>') {
            return None;
        }
        let h = &s[1..s.len() - 1];
        let bytes: Vec<u8> = (0..h.len())
            .step_by(2)
            .filter_map(|i| u8::from_str_radix(h.get(i..i + 2)?, 16).ok())
            .collect();
        Some(
            bytes
                .chunks(2)
                .map(|c| {
                    if c.len() == 2 {
                        u16::from_be_bytes([c[0], c[1]])
                    } else {
                        c[0] as u16
                    }
                })
                .collect(),
        )
    }

    // Fold 16-bit units into a single code value.
    fn code_of(u: &[u16]) -> u32 {
        u.iter().fold(0u32, |acc, &x| (acc << 16) | x as u32)
    }

    // Tokenize by structure, not whitespace: CMap hex groups are often written
    // back-to-back with no separators (`<21><21><0054>`), so scan for `<…>`
    // groups, `[`/`]` brackets, and bareword keywords. Every cut falls on an
    // ASCII delimiter (or the end of the text), so slicing the `str` is safe.
    fn tokenize(text: &str) -> Vec<&str> {
        let bytes = text.as_bytes();
        let mut toks = Vec::new();
        let mut i = 0;
        while i < bytes.len() {
            let c = bytes[i];
            if c.is_ascii_whitespace() {
                i += 1;
            } else if c == b'<' {
                let start = i;
                while i < bytes.len() && bytes[i] != b'>' {
                    i += 1;
                }
                i += 1; // include '>'
                toks.push(&text[start..i.min(bytes.len())]);
            } else if c == b'[' || c == b']' {
                toks.push(&text[i..i + 1]);
                i += 1;
            } else {
                let start = i;
                while i < bytes.len()
                    && !bytes[i].is_ascii_whitespace()
                    && bytes[i] != b'<'
                    && bytes[i] != b'['
                    && bytes[i] != b']'
                {
                    i += 1;
                }
                toks.push(&text[start..i]);
            }
        }
        toks
    }

    let text = String::from_utf8_lossy(data);
    let tokens = tokenize(&text);
    let mut map = HashMap::new();

    let mut i = 0;
    while i < tokens.len() {
        match tokens[i] {
            "beginbfchar" => {
                i += 1;
                while i + 1 < tokens.len() && tokens[i] != "endbfchar" {
                    if let (Some(src), Some(dst)) = (hex(tokens[i]), hex(tokens[i + 1])) {
                        map.insert(code_of(&src), String::from_utf16_lossy(&dst));
                    }
                    i += 2;
                }
            }
            "beginbfrange" => {
                i += 1;
                while i + 2 < tokens.len() && tokens[i] != "endbfrange" {
                    let (Some(lo), Some(hi)) = (hex(tokens[i]), hex(tokens[i + 1])) else {
                        i += 1;
                        continue;
                    };
                    let lo = code_of(&lo);
                    let hi = code_of(&hi);
                    if tokens[i + 2] == "[" {
                        // `<lo> <hi> [ <d0> <d1> ... ]`: one dst per code in the range.
                        let mut j = i + 3;
                        let mut code = lo;
                        while j < tokens.len() && tokens[j] != "]" {
                            if let Some(dst) = hex(tokens[j]) {
                                map.insert(code, String::from_utf16_lossy(&dst));
                            }
                            // Wrapping: a source code at `u32::MAX` must not panic.
                            code = code.wrapping_add(1);
                            j += 1;
                        }
                        i = j + 1;
                    } else if let Some(dst) = hex(tokens[i + 2]) {
                        // `<lo> <hi> <dst>`: consecutive Unicode from a base.
                        let last_code = hi.min(lo.saturating_add(MAX_RANGE_SPAN));
                        for (k, code) in (lo..=last_code).enumerate() {
                            let mapped = if dst.len() > 1 {
                                // Multi-unit destination: only the final UTF-16
                                // unit advances with the code.
                                let mut units = dst.clone();
                                if let Some(last) = units.last_mut() {
                                    *last = last.wrapping_add(k as u16);
                                }
                                Some(String::from_utf16_lossy(&units))
                            } else {
                                // Single BMP unit: plain scalar arithmetic; values
                                // that are not valid chars (surrogates) are skipped.
                                char::from_u32(code_of(&dst) + k as u32).map(|ch| ch.to_string())
                            };
                            if let Some(mapped) = mapped {
                                map.insert(code, mapped);
                            }
                        }
                        i += 3;
                    } else {
                        i += 1;
                    }
                }
            }
            _ => i += 1,
        }
    }
    map
}
579
580/// Decode a PDF string literal in a Tj/TJ operand into raw code units.
581fn codes(font: &Font, bytes: &[u8]) -> Vec<u32> {
582    if font.two_byte {
583        bytes
584            .chunks(2)
585            .map(|c| {
586                if c.len() == 2 {
587                    ((c[0] as u32) << 8) | c[1] as u32
588                } else {
589                    c[0] as u32
590                }
591            })
592            .collect()
593    } else {
594        bytes.iter().map(|&b| b as u32).collect()
595    }
596}
597
598/// Page size (width, height) in PDF points from the MediaBox.
599fn page_size(doc: &Document, page_id: lopdf::ObjectId) -> (f32, f32) {
600    let mb = doc
601        .get_object(page_id)
602        .ok()
603        .and_then(|o| o.as_dict().ok())
604        .and_then(|d| {
605            // MediaBox may be inherited; lopdf resolves via get_page... fall back to a guess.
606            d.get(b"MediaBox").ok().cloned()
607        })
608        .or_else(|| {
609            doc.get_dictionary(page_id)
610                .ok()
611                .and_then(|d| d.get(b"MediaBox").ok().cloned())
612        });
613    if let Some(Object::Array(a)) = mb {
614        let v: Vec<f32> = a.iter().filter_map(|o| num(o).map(|x| x as f32)).collect();
615        if v.len() == 4 {
616            return ((v[2] - v[0]).abs(), (v[3] - v[1]).abs());
617        }
618    }
619    (612.0, 792.0)
620}
621
622/// Debug: raw glyph stream `(ch, ll, lr, lb, lt)` (native coords) for page
623/// `index`, before the sanitizer. For comparing char cells to docling-parse.
624pub fn debug_glyphs(bytes: &[u8], index: usize) -> Vec<(char, f32, f32, f32, f32)> {
625    let Ok(doc) = Document::load_mem(bytes) else {
626        return Vec::new();
627    };
628    let mut pages: Vec<_> = doc.get_pages().into_iter().collect();
629    pages.sort_by_key(|(n, _)| *n);
630    let Some((_, pid)) = pages.get(index) else {
631        return Vec::new();
632    };
633    page_glyphs(&doc, *pid)
634        .into_iter()
635        .map(|g| (g.ch, g.ll, g.lr, g.lb, g.lt))
636        .collect()
637}
638
639/// Public entry: per-page (width, height, line cells) for a PDF, via the Rust
640/// text parser + the docling-parse line sanitizer. Used by the pipeline and the
641/// `textparse_dump` example.
642pub fn pdf_textlines(bytes: &[u8]) -> Vec<(f32, f32, Vec<crate::pdfium_backend::TextCell>)> {
643    let Ok(doc) = Document::load_mem(bytes) else {
644        return Vec::new();
645    };
646    let mut pages: Vec<_> = doc.get_pages().into_iter().collect();
647    pages.sort_by_key(|(n, _)| *n);
648    pages
649        .into_iter()
650        .map(|(_, pid)| {
651            let (w, h) = page_size(&doc, pid);
652            let glyphs = page_glyphs(&doc, pid);
653            let cells = crate::dp_lines::line_cells(&glyphs, h, true);
654            (w, h, cells)
655        })
656        .collect()
657}
658
659/// Debug/diagnostic entry: per-page (width, height, word cells) for a PDF, via
660/// the Rust parser glyphs run through the docling-parse word grouping. Used to
661/// compare parser word cells against docling-parse's `word_cells` oracle (roadmap
662/// item 6).
663pub fn pdf_words(bytes: &[u8]) -> Vec<(f32, f32, Vec<crate::pdfium_backend::TextCell>)> {
664    let Ok(doc) = Document::load_mem(bytes) else {
665        return Vec::new();
666    };
667    let mut pages: Vec<_> = doc.get_pages().into_iter().collect();
668    pages.sort_by_key(|(n, _)| *n);
669    pages
670        .into_iter()
671        .map(|(_, pid)| {
672            let (w, h) = page_size(&doc, pid);
673            let glyphs = page_glyphs(&doc, pid);
674            let cells = crate::dp_lines::word_cells(&glyphs, h, true);
675            (w, h, cells)
676        })
677        .collect()
678}
679
/// One page's text cells from the pure-Rust parser: prose line cells, per-word
/// cells, and code line cells — all from a single glyph parse. Replaces the
/// pdfium text path (roadmap item 6) when the parser drop is enabled.
#[derive(Default)]
pub struct PageParserCells {
    /// Line cells, as produced by `crate::dp_lines::line_cells`.
    pub prose: Vec<crate::pdfium_backend::TextCell>,
    /// Word cells, as produced by `crate::dp_lines::word_cells`.
    pub words: Vec<crate::pdfium_backend::TextCell>,
    /// Code line cells, as produced by
    /// `crate::pdfium_backend::code_cells_from_glyphs`.
    pub code: Vec<crate::pdfium_backend::TextCell>,
}
689
690/// Full parser text layer: prose + word + code cells per page, glyphs parsed once.
691/// `prose`/`words` come from the docling-parse contraction ([`crate::dp_lines`]);
692/// `code` splits only at the parser's own space glyphs (monospace keeps its
693/// source spacing). Used by the pipeline to retire pdfium's text path.
694pub fn pdf_all_cells(bytes: &[u8]) -> Vec<PageParserCells> {
695    let Ok(doc) = Document::load_mem(bytes) else {
696        return Vec::new();
697    };
698    let mut pages: Vec<_> = doc.get_pages().into_iter().collect();
699    pages.sort_by_key(|(n, _)| *n);
700    pages
701        .into_iter()
702        .map(|(_, pid)| {
703            let (_w, h) = page_size(&doc, pid);
704            let glyphs = page_glyphs(&doc, pid);
705            PageParserCells {
706                prose: crate::dp_lines::line_cells(&glyphs, h, true),
707                words: crate::dp_lines::word_cells(&glyphs, h, true),
708                code: crate::pdfium_backend::code_cells_from_glyphs(&glyphs, h),
709            }
710        })
711        .collect()
712}
713
/// The text-state scalars a Form XObject inherits when it is invoked via `Do`.
/// The PDF graphics state includes these text parameters but not the text
/// matrices, which a form re-establishes inside its own `BT`/`ET`.
#[derive(Clone, Copy)]
struct TextState {
    /// Character spacing (`Tc`).
    tc: f64,
    /// Word spacing (`Tw`).
    tw: f64,
    /// Horizontal scale as a factor (`Tz` / 100).
    th: f64,
    /// Leading (`TL`).
    tl: f64,
    /// Text rise.
    trise: f64,
    /// Font size.
    fsize: f64,
}

impl TextState {
    /// Starting values: every scalar zero except the horizontal scale,
    /// which is `1.0`.
    const INIT: TextState = TextState {
        fsize: 0.0,
        trise: 0.0,
        tl: 0.0,
        th: 1.0,
        tw: 0.0,
        tc: 0.0,
    };
}
737
738/// The effective `/Resources` dictionary for a page (inline or via reference,
739/// falling back to an inherited one from a `/Parent`).
740fn page_res(doc: &Document, page_id: lopdf::ObjectId) -> Option<&Dictionary> {
741    let (inline, ids) = doc.get_page_resources(page_id).ok()?;
742    if let Some(d) = inline {
743        return Some(d);
744    }
745    ids.into_iter().find_map(|id| doc.get_dictionary(id).ok())
746}
747
748/// Build the code→[`Font`] map for a resources dictionary's `/Font` sub-dict.
749fn fonts_from_res(doc: &Document, res: &Dictionary) -> HashMap<Vec<u8>, Font> {
750    let mut map = HashMap::new();
751    let font_dict = res
752        .get(b"Font")
753        .ok()
754        .and_then(|o| deref(doc, o))
755        .and_then(|o| o.as_dict().ok());
756    if let Some(fd) = font_dict {
757        for (name, value) in fd.iter() {
758            if let Some(fdict) = deref(doc, value).and_then(|o| o.as_dict().ok()) {
759                map.insert(name.clone(), parse_font(doc, name, fdict));
760            }
761        }
762    }
763    map
764}
765
766/// Extract every glyph on a page as a native-coordinate [`Glyph`].
767pub(crate) fn page_glyphs(doc: &Document, page_id: lopdf::ObjectId) -> Vec<Glyph> {
768    let mut out = Vec::new();
769    let Ok(content_bytes) = doc.get_page_content(page_id) else {
770        return out;
771    };
772    let Ok(content) = lopdf::content::Content::decode(&content_bytes) else {
773        return out;
774    };
775    if let Some(res) = page_res(doc, page_id) {
776        run_content(doc, res, &content, Mat::ID, TextState::INIT, 0, &mut out);
777    }
778    out
779}
780
781/// Run a content stream's operators, emitting glyphs into `out`. Recurses into
782/// Form XObjects on `Do` (bulk body text in heavy PDFs lives inside a form, not
783/// the page content stream). `res` is the resources dict in scope (the page's,
784/// or the form's own); `base_ctm` is the CTM at the point of invocation.
785fn run_content(
786    doc: &Document,
787    res: &Dictionary,
788    content: &lopdf::content::Content,
789    base_ctm: Mat,
790    init: TextState,
791    depth: u32,
792    out: &mut Vec<Glyph>,
793) {
794    let fonts = fonts_from_res(doc, res);
795    let xobjects = res
796        .get(b"XObject")
797        .ok()
798        .and_then(|o| deref(doc, o))
799        .and_then(|o| o.as_dict().ok());
800
801    // Graphics + text state. `q`/`Q` save and restore the whole graphics state,
802    // which includes the text parameters (Tc, Tw, Tz, TL, Tfs, Trise, font) —
803    // *not* the text matrix (that is reset by BT). Saving only the CTM let a Tc
804    // set inside a `q…Q` block leak out and drift every later glyph.
805    #[allow(clippy::type_complexity)]
806    let mut gstate_stack: Vec<(Mat, f64, f64, f64, f64, f64, f64, Option<&Font>)> = Vec::new();
807    let mut ctm = base_ctm;
808    let mut tm = Mat::ID;
809    let mut tlm = Mat::ID;
810    let mut font: Option<&Font> = None;
811    let mut fsize = init.fsize;
812    let mut tc = init.tc; // char spacing
813    let mut tw = init.tw; // word spacing
814    let mut th = init.th; // horizontal scale (Tz/100)
815    let mut tl = init.tl; // leading
816    let mut trise = init.trise;
817
818    let op_f = |operands: &[Object], i: usize| operands.get(i).and_then(num).unwrap_or(0.0);
819
820    for op in &content.operations {
821        let operands = &op.operands;
822        match op.operator.as_str() {
823            "q" => gstate_stack.push((ctm, tc, tw, th, tl, trise, fsize, font)),
824            "Q" => {
825                if let Some((c, a, b, h, l, r, fs, f)) = gstate_stack.pop() {
826                    ctm = c;
827                    tc = a;
828                    tw = b;
829                    th = h;
830                    tl = l;
831                    trise = r;
832                    fsize = fs;
833                    font = f;
834                }
835            }
836            "cm" => {
837                let m = Mat {
838                    a: op_f(operands, 0),
839                    b: op_f(operands, 1),
840                    c: op_f(operands, 2),
841                    d: op_f(operands, 3),
842                    e: op_f(operands, 4),
843                    f: op_f(operands, 5),
844                };
845                ctm = m.then(ctm);
846            }
847            "BT" => {
848                tm = Mat::ID;
849                tlm = Mat::ID;
850            }
851            "ET" => {}
852            "Tf" => {
853                if let Some(Object::Name(n)) = operands.first() {
854                    font = fonts.get(n.as_slice());
855                }
856                fsize = op_f(operands, 1);
857            }
858            "Td" => {
859                tlm = Mat {
860                    a: 1.0,
861                    b: 0.0,
862                    c: 0.0,
863                    d: 1.0,
864                    e: op_f(operands, 0),
865                    f: op_f(operands, 1),
866                }
867                .then(tlm);
868                tm = tlm;
869            }
870            "TD" => {
871                tl = -op_f(operands, 1);
872                tlm = Mat {
873                    a: 1.0,
874                    b: 0.0,
875                    c: 0.0,
876                    d: 1.0,
877                    e: op_f(operands, 0),
878                    f: op_f(operands, 1),
879                }
880                .then(tlm);
881                tm = tlm;
882            }
883            "Tm" => {
884                tlm = Mat {
885                    a: op_f(operands, 0),
886                    b: op_f(operands, 1),
887                    c: op_f(operands, 2),
888                    d: op_f(operands, 3),
889                    e: op_f(operands, 4),
890                    f: op_f(operands, 5),
891                };
892                tm = tlm;
893            }
894            "T*" => {
895                tlm = Mat {
896                    a: 1.0,
897                    b: 0.0,
898                    c: 0.0,
899                    d: 1.0,
900                    e: 0.0,
901                    f: -tl,
902                }
903                .then(tlm);
904                tm = tlm;
905            }
906            "Tc" => tc = op_f(operands, 0),
907            "Tw" => tw = op_f(operands, 0),
908            "Tz" => th = op_f(operands, 0) / 100.0,
909            "TL" => tl = op_f(operands, 0),
910            "Ts" => trise = op_f(operands, 0),
911            "Tj" | "'" | "\"" => {
912                if op.operator == "'" || op.operator == "\"" {
913                    // move to next line first
914                    tlm = Mat {
915                        a: 1.0,
916                        b: 0.0,
917                        c: 0.0,
918                        d: 1.0,
919                        e: 0.0,
920                        f: -tl,
921                    }
922                    .then(tlm);
923                    tm = tlm;
924                }
925                if let (Some(f), Some(Object::String(s, _))) = (font, operands.last()) {
926                    show_text(f, s, fsize, tc, tw, th, trise, &mut tm, ctm, out);
927                }
928            }
929            "TJ" => {
930                if let (Some(f), Some(Object::Array(arr))) = (font, operands.first()) {
931                    for el in arr {
932                        match el {
933                            Object::String(s, _) => {
934                                show_text(f, s, fsize, tc, tw, th, trise, &mut tm, ctm, out)
935                            }
936                            other => {
937                                if let Some(adj) = num(other) {
938                                    // negative number moves text right (PDF: subtract)
939                                    let tx = -adj / 1000.0 * fsize * th;
940                                    tm = Mat {
941                                        a: 1.0,
942                                        b: 0.0,
943                                        c: 0.0,
944                                        d: 1.0,
945                                        e: tx,
946                                        f: 0.0,
947                                    }
948                                    .then(tm);
949                                }
950                            }
951                        }
952                    }
953                }
954            }
955            "Do" => {
956                // Invoke a Form XObject: bulk body text in many PDFs lives inside
957                // a form, reached only here. Image XObjects are skipped (no text).
958                if depth >= 8 {
959                    continue;
960                }
961                let Some(Object::Name(n)) = operands.first() else {
962                    continue;
963                };
964                let stream = xobjects
965                    .and_then(|d| d.get(n.as_slice()).ok())
966                    .and_then(|o| deref(doc, o))
967                    .and_then(|o| o.as_stream().ok());
968                let Some(stream) = stream else { continue };
969                let is_form = stream
970                    .dict
971                    .get(b"Subtype")
972                    .ok()
973                    .and_then(|o| o.as_name().ok())
974                    == Some(b"Form".as_slice());
975                if !is_form {
976                    continue;
977                }
978                let Ok(data) = stream.decompressed_content() else {
979                    continue;
980                };
981                let Ok(form_content) = lopdf::content::Content::decode(&data) else {
982                    continue;
983                };
984                // The form's /Matrix maps form space into the CTM at invocation.
985                let form_mat = match stream.dict.get(b"Matrix").ok() {
986                    Some(Object::Array(a)) if a.len() == 6 => {
987                        let v: Vec<f64> = a.iter().filter_map(num).collect();
988                        if v.len() == 6 {
989                            Mat {
990                                a: v[0],
991                                b: v[1],
992                                c: v[2],
993                                d: v[3],
994                                e: v[4],
995                                f: v[5],
996                            }
997                        } else {
998                            Mat::ID
999                        }
1000                    }
1001                    _ => Mat::ID,
1002                };
1003                // The form's own /Resources, falling back to the inherited ones.
1004                let form_res = stream
1005                    .dict
1006                    .get(b"Resources")
1007                    .ok()
1008                    .and_then(|o| deref(doc, o))
1009                    .and_then(|o| o.as_dict().ok())
1010                    .unwrap_or(res);
1011                let state = TextState {
1012                    tc,
1013                    tw,
1014                    th,
1015                    tl,
1016                    trise,
1017                    fsize,
1018                };
1019                run_content(
1020                    doc,
1021                    form_res,
1022                    &form_content,
1023                    form_mat.then(ctm),
1024                    state,
1025                    depth + 1,
1026                    out,
1027                );
1028            }
1029            _ => {}
1030        }
1031    }
1032}
1033
1034#[allow(clippy::too_many_arguments)]
1035fn show_text(
1036    font: &Font,
1037    bytes: &[u8],
1038    fsize: f64,
1039    tc: f64,
1040    tw: f64,
1041    th: f64,
1042    trise: f64,
1043    tm: &mut Mat,
1044    ctm: Mat,
1045    out: &mut Vec<Glyph>,
1046) {
1047    for code in codes(font, bytes) {
1048        let (text, w) = font.decode_code(code);
1049        let w0 = w / 1000.0; // advance in text-space (em) units
1050                             // The glyph→user transform: scale glyph space by font size, then Tm, CTM.
1051        let scale = Mat {
1052            a: fsize * th,
1053            b: 0.0,
1054            c: 0.0,
1055            d: fsize,
1056            e: 0.0,
1057            f: trise,
1058        };
1059        let trm = scale.then(*tm).then(ctm);
1060        // Box in glyph space (1000-unit em): x 0..w, y descent..ascent.
1061        let (x0, y0) = trm.apply(0.0, font.descent / 1000.0);
1062        let (x1, _y1) = trm.apply(w0, font.descent / 1000.0);
1063        let (_x2, y2) = trm.apply(0.0, font.ascent / 1000.0);
1064        let (left, right) = (x0.min(x1), x0.max(x1));
1065        let (bot, top) = (y0.min(y2), y0.max(y2));
1066        if let Some(s) = text {
1067            // A run may map one code to multiple chars (ligature/fraction); share box.
1068            for ch in s.chars() {
1069                if ch != '\u{0}' {
1070                    out.push(Glyph {
1071                        ch,
1072                        l: left as f32,
1073                        b: bot as f32,
1074                        r: right as f32,
1075                        t: top as f32,
1076                        ll: left as f32,
1077                        lb: bot as f32,
1078                        lr: right as f32,
1079                        lt: top as f32,
1080                        font: font.hash,
1081                    });
1082                }
1083            }
1084        }
1085        // Advance the text matrix. Word spacing applies to single-byte code 32.
1086        let is_space = !font.two_byte && code == 32;
1087        let tx = (w0 * fsize + tc + if is_space { tw } else { 0.0 }) * th;
1088        *tm = Mat {
1089            a: 1.0,
1090            b: 0.0,
1091            c: 0.0,
1092            d: 1.0,
1093            e: tx,
1094            f: 0.0,
1095        }
1096        .then(*tm);
1097    }
1098}
1099
1100/// Build a simple font's code→char table from its `/Encoding`: the base
1101/// encoding (WinAnsi / MacRoman) plus any `/Differences` overrides (glyph names
1102/// resolved through a small Adobe-glyph-name subset).
1103fn simple_encoding_table(doc: &Document, fdict: &Dictionary) -> HashMap<u8, char> {
1104    let enc = fdict.get(b"Encoding").ok().and_then(|o| deref(doc, o));
1105    let base_name = match enc {
1106        Some(Object::Name(n)) => n.clone(),
1107        Some(Object::Dictionary(d)) => d
1108            .get(b"BaseEncoding")
1109            .ok()
1110            .and_then(|o| o.as_name().ok())
1111            .map(|n| n.to_vec())
1112            .unwrap_or_default(),
1113        _ => Vec::new(),
1114    };
1115    let mut m = if base_name == b"MacRomanEncoding" {
1116        macroman_table()
1117    } else {
1118        winansi_table()
1119    };
1120    // Apply /Differences: `code /glyphname /glyphname ... code ...`.
1121    if let Some(Object::Dictionary(d)) = enc {
1122        if let Some(Object::Array(diffs)) = d.get(b"Differences").ok().and_then(|o| deref(doc, o)) {
1123            let mut code = 0u8;
1124            for el in diffs {
1125                match el {
1126                    Object::Integer(i) => code = *i as u8,
1127                    Object::Name(name) => {
1128                        if let Some(ch) = glyph_name_to_char(name) {
1129                            m.insert(code, ch);
1130                        }
1131                        code = code.wrapping_add(1);
1132                    }
1133                    _ => {}
1134                }
1135            }
1136        }
1137    }
1138    m
1139}
1140
/// Resolve an Adobe glyph name to Unicode: `uniXXXX`, single ASCII letters, the
/// digit/punctuation names from the Adobe Glyph List, and common typographic
/// names. A `.suffix` (`one.taboldstyle`, `a.sc`) is stripped and the base name
/// retried — docling renders these as the base character.
///
/// Returns `None` when `name` is not UTF-8, is not in the table, or is a
/// `uniXXXX` name whose value is not a valid `char` (e.g. a surrogate).
///
/// The `uni` form is taken only when the prefix is followed by at least four
/// hex digits; only the first four are used. Any other `uni…` name falls
/// through to the table, so `union` resolves to U+222A rather than being
/// rejected as a malformed `uniXXXX`.
fn glyph_name_to_char(name: &[u8]) -> Option<char> {
    let s = std::str::from_utf8(name).ok()?;
    // `uniXXXX`: the hex-digit check also rejects the leading `+` that
    // `from_str_radix` would otherwise accept.
    let uni_code_point = s
        .strip_prefix("uni")
        .and_then(|rest| rest.get(..4))
        .filter(|hex| hex.bytes().all(|b| b.is_ascii_hexdigit()))
        .and_then(|hex| u32::from_str_radix(hex, 16).ok());
    if let Some(cp) = uni_code_point {
        return char::from_u32(cp);
    }
    // Single ASCII letter names (`A`, `m`) map to themselves.
    if s.len() == 1 {
        let b = s.as_bytes()[0];
        if b.is_ascii_alphabetic() {
            return Some(b as char);
        }
    }
    let resolved = match s {
        "space" => ' ',
        "exclam" => '!',
        "quotedbl" => '"',
        "numbersign" => '#',
        "dollar" => '$',
        "percent" => '%',
        "ampersand" => '&',
        "quotesingle" => '\'',
        "parenleft" => '(',
        "parenright" => ')',
        "asterisk" => '*',
        "plus" => '+',
        "comma" => ',',
        "hyphen" => '-',
        "period" => '.',
        "slash" => '/',
        "zero" => '0',
        "one" => '1',
        "two" => '2',
        "three" => '3',
        "four" => '4',
        "five" => '5',
        "six" => '6',
        "seven" => '7',
        "eight" => '8',
        "nine" => '9',
        "colon" => ':',
        "semicolon" => ';',
        "less" => '<',
        "equal" => '=',
        "greater" => '>',
        "question" => '?',
        "at" => '@',
        "bracketleft" => '[',
        "backslash" => '\\',
        "bracketright" => ']',
        "asciicircum" => '^',
        "underscore" => '_',
        "grave" => '`',
        "braceleft" => '{',
        "bar" => '|',
        "braceright" => '}',
        "asciitilde" => '~',
        "bullet" => '\u{2022}',
        "periodcentered" => '\u{00B7}',
        "endash" => '\u{2013}',
        "emdash" => '\u{2014}',
        "quoteright" => '\u{2019}',
        "quoteleft" => '\u{2018}',
        "quotedblleft" => '\u{201C}',
        "quotedblright" => '\u{201D}',
        "quotedblbase" => '\u{201E}',
        "quotesinglbase" => '\u{201A}',
        // Latin f-ligatures named in `/Differences` (e.g. 2305's body font). These
        // map to the presentation-form code points, which `decompose_ligatures`
        // then spells back out (`ff`→"ff") — without them the glyph decodes to
        // nothing and the sanitizer fills the gap with a space (`di erences`).
        "ff" => '\u{FB00}',
        "fi" => '\u{FB01}',
        "fl" => '\u{FB02}',
        "ffi" => '\u{FB03}',
        "ffl" => '\u{FB04}',
        "ft" => '\u{FB05}',
        "st" => '\u{FB06}',
        "degree" => '\u{00B0}',
        "trademark" => '\u{2122}',
        "registered" => '\u{00AE}',
        "copyright" => '\u{00A9}',
        "ellipsis" => '\u{2026}',
        "minus" => '\u{2212}',
        "fraction" => '\u{2044}',
        "nbspace" => '\u{00A0}',
        // Greek + math glyph names (standard Adobe Glyph List). Standard TeX math
        // fonts (CMMI/CMSY/…) name their glyphs this way in the embedded font
        // program's `/Encoding`; without these a `λ`/`≤` decodes to nothing and is
        // dropped from body text (`and λ set to 0.5` → `and set to 0.5`).
        "alpha" => '\u{03B1}',
        "beta" => '\u{03B2}',
        "gamma" => '\u{03B3}',
        "delta" => '\u{03B4}',
        "epsilon" | "epsilon1" => '\u{03B5}',
        "zeta" => '\u{03B6}',
        "eta" => '\u{03B7}',
        "theta" | "theta1" => '\u{03B8}',
        "iota" => '\u{03B9}',
        "kappa" => '\u{03BA}',
        "lambda" => '\u{03BB}',
        "mu" => '\u{03BC}',
        "nu" => '\u{03BD}',
        "xi" => '\u{03BE}',
        "omicron" => '\u{03BF}',
        "pi" | "pi1" => '\u{03C0}',
        "rho" | "rho1" => '\u{03C1}',
        "sigma" => '\u{03C3}',
        "sigma1" => '\u{03C2}',
        "tau" => '\u{03C4}',
        "upsilon" => '\u{03C5}',
        "phi" | "phi1" => '\u{03C6}',
        "chi" => '\u{03C7}',
        "psi" => '\u{03C8}',
        "omega" | "omega1" => '\u{03C9}',
        "Gamma" => '\u{0393}',
        "Delta" => '\u{0394}',
        "Theta" => '\u{0398}',
        "Lambda" => '\u{039B}',
        "Xi" => '\u{039E}',
        "Pi" => '\u{03A0}',
        "Sigma" => '\u{03A3}',
        "Upsilon" => '\u{03A5}',
        "Phi" => '\u{03A6}',
        "Psi" => '\u{03A8}',
        "Omega" => '\u{03A9}',
        "lessequal" => '\u{2264}',
        "greaterequal" => '\u{2265}',
        "notequal" => '\u{2260}',
        "approxequal" => '\u{2248}',
        "equivalence" => '\u{2261}',
        "element" => '\u{2208}',
        "plusminus" => '\u{00B1}',
        "multiply" => '\u{00D7}',
        "divide" => '\u{00F7}',
        "infinity" => '\u{221E}',
        "partialdiff" => '\u{2202}',
        "gradient" => '\u{2207}',
        "summation" => '\u{2211}',
        "product" => '\u{220F}',
        "integral" => '\u{222B}',
        "radical" => '\u{221A}',
        "proportional" => '\u{221D}',
        "arrowright" => '\u{2192}',
        "arrowleft" => '\u{2190}',
        "arrowup" => '\u{2191}',
        "arrowdown" => '\u{2193}',
        "arrowboth" => '\u{2194}',
        "arrowdblright" => '\u{21D2}',
        "logicaland" => '\u{2227}',
        "logicalor" => '\u{2228}',
        "intersection" => '\u{2229}',
        "union" => '\u{222A}',
        "similar" => '\u{223C}',
        "congruent" => '\u{2245}',
        "dotmath" => '\u{22C5}',
        "asteriskmath" => '\u{2217}',
        _ => {
            // Strip an AGL `.suffix` (oldstyle/small-cap variant) and retry.
            // Everything from the first `.` is dropped; an empty base is
            // rejected so the retry always works on a shorter, non-empty name.
            return match s.split_once('.') {
                Some((base, _)) if !base.is_empty() => glyph_name_to_char(base.as_bytes()),
                _ => None,
            };
        }
    };
    Some(resolved)
}
1315
/// WinAnsiEncoding (Windows-1252) for simple fonts lacking ToUnicode.
///
/// Covers printable ASCII (0x20–0x7E), every defined code in 0x80–0x9F (the
/// Windows-1252 points that differ from Latin-1: Euro, typographic quotes,
/// dashes, daggers, trademark, OE/oe, …), the non-breaking space at 0xA0, and
/// 0xA1–0xFF mapped to the Latin-1 code point of the same value. Codes below
/// 0x20, 0x7F, and the codes Windows-1252 leaves undefined (0x81, 0x8D, 0x8F,
/// 0x90, 0x9D) have no entry.
fn winansi_table() -> HashMap<u8, char> {
    let mut m = HashMap::new();
    for b in 0x20u8..=0x7e {
        m.insert(b, b as char);
    }
    // High range: Windows-1252 printable points that differ from Latin-1.
    let extra: &[(u8, char)] = &[
        (0x80, '\u{20AC}'), // Euro
        (0x82, '\u{201A}'), // quotesinglbase
        (0x83, '\u{0192}'), // florin
        (0x84, '\u{201E}'), // quotedblbase
        (0x85, '\u{2026}'), // ellipsis
        (0x86, '\u{2020}'), // dagger
        (0x87, '\u{2021}'), // daggerdbl
        (0x88, '\u{02C6}'), // circumflex
        (0x89, '\u{2030}'), // perthousand
        (0x8A, '\u{0160}'), // Scaron
        (0x8B, '\u{2039}'), // guilsinglleft
        (0x8C, '\u{0152}'), // OE
        (0x8E, '\u{017D}'), // Zcaron
        (0x91, '\u{2018}'), // quoteleft
        (0x92, '\u{2019}'), // quoteright
        (0x93, '\u{201C}'), // quotedblleft
        (0x94, '\u{201D}'), // quotedblright
        (0x95, '\u{2022}'), // bullet
        (0x96, '\u{2013}'), // endash
        (0x97, '\u{2014}'), // emdash
        (0x98, '\u{02DC}'), // tilde
        (0x99, '\u{2122}'), // trademark
        (0x9A, '\u{0161}'), // scaron
        (0x9B, '\u{203A}'), // guilsinglright
        (0x9C, '\u{0153}'), // oe
        (0x9E, '\u{017E}'), // zcaron
        (0x9F, '\u{0178}'), // Ydieresis
        (0xA0, '\u{00A0}'), // non-breaking space
    ];
    for &(b, c) in extra {
        m.insert(b, c);
    }
    // 0xA1..=0xFF coincide with Latin-1, where `u8 as char` is the code point.
    for b in 0xA1u8..=0xFF {
        m.entry(b).or_insert(b as char);
    }
    m
}
1342
/// Partial MacRomanEncoding table: printable ASCII (0x20–0x7E) maps to itself,
/// plus a handful of high-range codes — bullet (0xA5, used as a list marker),
/// dashes, curly quotes, ellipsis, non-breaking space and the fi/fl ligatures.
/// Every other code has no entry.
fn macroman_table() -> HashMap<u8, char> {
    const HIGH_RANGE: [(u8, char); 11] = [
        (0xA5, '\u{2022}'), // bullet
        (0xC9, '\u{2026}'), // ellipsis
        (0xCA, '\u{00A0}'), // non-breaking space
        (0xD0, '\u{2013}'), // endash
        (0xD1, '\u{2014}'), // emdash
        (0xD2, '\u{201C}'), // quotedblleft
        (0xD3, '\u{201D}'), // quotedblright
        (0xD4, '\u{2018}'), // quoteleft
        (0xD5, '\u{2019}'), // quoteright
        (0xDE, '\u{FB01}'), // fi
        (0xDF, '\u{FB02}'), // fl
    ];

    let ascii = (0x20u8..=0x7e).map(|code| (code, char::from(code)));
    ascii.chain(HIGH_RANGE.iter().copied()).collect()
}
fleischwolf_pdf/textparse.rs

fleischwolf_pdf/
textparse.rs