fleischwolf_pdf/
textparse.rs

//! Pure-Rust PDF text extraction (replacing pdfium's glyph layer).
//!
//! pdfium reports *rendered* glyph boxes, which diverge from docling's
//! `docling-parse` C++ parser at exactly the points that drive conformance:
//! generated spaces get a zero-width box, combining diacritics get a real-width
//! box, and ligature/fraction glyphs land at different x. This module instead
//! reconstructs each glyph's box from the **font's own advance widths** and the
//! PDF text/graphics matrices — the same information docling-parse uses — so a
//! space is as wide as the font says and a combining mark has zero advance.
//!
//! The output is the same [`Glyph`] stream pdfium produces (native PDF
//! coordinates, y-up), fed straight into the existing docling-parse line
//! sanitizer ([`crate::dp_lines`]). Only the digital text layer is handled here;
//! pages without one still fall back to OCR upstream.
15
16use std::collections::HashMap;
17use std::rc::Rc;
18
19use lopdf::{Dictionary, Document, Object};
20
21use crate::pdfium_backend::Glyph;
22
23/// Per-document caches for the content-stream interpreter. Fonts are indirect
24/// objects shared by many pages, but were fully re-parsed — ToUnicode CMap
25/// decompression + tokenization, embedded Type1 program scan, width tables —
26/// for **every page and every Form XObject invocation**; decoded form content
27/// streams were likewise re-inflated on every `Do`. Cached per document,
28/// keyed by the referenced object id (fonts also by resource name, which
29/// feeds the docling-parse font hash). Inline (non-reference) dicts are rare
30/// and stay uncached.
31#[derive(Default)]
32struct DocCaches {
33    fonts: HashMap<(lopdf::ObjectId, Vec<u8>), Rc<Font>>,
34    forms: HashMap<lopdf::ObjectId, Rc<lopdf::content::Content>>,
35}
36
/// A 2×3 affine transform stored as `[a b c d e f]`.
///
/// A point `(x, y)` is mapped to `(a·x + c·y + e, b·x + d·y + f)`.
#[derive(Clone, Copy)]
struct Mat {
    a: f64,
    b: f64,
    c: f64,
    d: f64,
    e: f64,
    f: f64,
}

impl Mat {
    /// The identity transform.
    const ID: Mat = Mat {
        a: 1.0,
        b: 0.0,
        c: 0.0,
        d: 1.0,
        e: 0.0,
        f: 0.0,
    };

    /// Compose two transforms: the result applies `self` first, then `m`.
    fn then(self, m: Mat) -> Mat {
        let Mat { a, b, c, d, e, f } = self;
        Mat {
            a: a * m.a + b * m.c,
            b: a * m.b + b * m.d,
            c: c * m.a + d * m.c,
            d: c * m.b + d * m.d,
            e: e * m.a + f * m.c + m.e,
            f: e * m.b + f * m.d + m.f,
        }
    }

    /// Transform the point `(x, y)`.
    fn apply(self, x: f64, y: f64) -> (f64, f64) {
        let mapped_x = self.a * x + self.c * y + self.e;
        let mapped_y = self.b * x + self.d * y + self.f;
        (mapped_x, mapped_y)
    }
}
77
/// Everything needed to turn the raw bytes of a text-showing operand into
/// `(unicode, advance)` pairs for one font resource.
struct Font {
    /// `true` for Type0 fonts, whose strings are read as 2-byte codes;
    /// `false` for simple fonts (1-byte codes).
    two_byte: bool,
    /// ToUnicode mapping: code → Unicode string. A single code may map to
    /// several characters (e.g. a ligature).
    to_unicode: HashMap<u32, String>,
    /// Code → glyph advance, expressed in 1000-unit glyph space.
    widths: HashMap<u32, f64>,
    /// Advance used for any code absent from `widths`.
    default_width: f64,
    /// Simple fonts only: byte → char decoding consulted when ToUnicode has no
    /// entry for the code (WinAnsi-ish). `None` for two-byte fonts.
    simple_encoding: Option<HashMap<u8, char>>,
    /// Code → raw `/Differences` glyph name for GID-style names (`g115`) that
    /// have no Unicode mapping. docling-parse emits these verbatim as `/g115`
    /// (see the redp5110 bulleted list), so they are kept rather than dropped.
    fallback_names: HashMap<u8, String>,
    /// Code → char taken from the embedded Type1 font program's own
    /// `/Encoding` vector. Last-resort lookup for glyphs nothing else maps
    /// (standard TeX math fonts: `λ`, `≤`, …).
    program_encoding: HashMap<u8, char>,
    /// Font ascent (same glyph-space units as the font descriptor).
    ascent: f64,
    /// Font descent (same glyph-space units as the font descriptor).
    descent: f64,
    /// Hash of the resource name this font was referenced under.
    hash: u64,
}
101
102impl Font {
103    fn decode_code(&self, code: u32) -> (Option<String>, f64) {
104        let w = self
105            .widths
106            .get(&code)
107            .copied()
108            .unwrap_or(self.default_width);
109        if let Some(s) = self.to_unicode.get(&code) {
110            return (Some(decompose_ligatures(s)), w);
111        }
112        if !self.two_byte {
113            // A GID-style `/Differences` name (no Unicode) overrides the base
114            // encoding, matching docling's verbatim `/g115` fallback.
115            if let Some(name) = self.fallback_names.get(&(code as u8)) {
116                return (Some(format!("/{name}")), w);
117            }
118            if let Some(enc) = &self.simple_encoding {
119                if let Some(&ch) = enc.get(&(code as u8)) {
120                    return (Some(decompose_ligatures(&ch.to_string())), w);
121                }
122            }
123            // Last resort: the embedded Type1 font program's own `/Encoding`
124            // vector (`dup N /glyphname put`). Standard TeX math fonts (CMMI, CMSY,
125            // …) ship no PDF `/Encoding` and no ToUnicode, so a glyph like `λ`
126            // (CMMI code 21 → `/lambda`) or `≤` (CMSY code 20 → `/lessequal`) has
127            // no other mapping and would otherwise be silently dropped. docling
128            // recovers these from the same font program. This only fills codes the
129            // base encoding left unmapped, so it never changes an existing decode.
130            if let Some(&ch) = self.program_encoding.get(&(code as u8)) {
131                return (Some(decompose_ligatures(&ch.to_string())), w);
132            }
133        }
134        (None, w)
135    }
136}
137
/// Expand Latin presentation-form ligatures (U+FB00..=U+FB06) into their
/// component letters (`ﬁ`→`fi`, `ﬃ`→`ffi`, …), the way docling does, so words
/// such as `configuration`/`difficult` don't keep the ligature glyph. Every
/// other character is copied through unchanged. The expanded chars share the
/// ligature's box, so the line sanitizer recomposes them.
fn decompose_ligatures(s: &str) -> String {
    let mut spelled = String::with_capacity(s.len());
    for ch in s.chars() {
        let expansion = match ch {
            '\u{FB00}' => Some("ff"),
            '\u{FB01}' => Some("fi"),
            '\u{FB02}' => Some("fl"),
            '\u{FB03}' => Some("ffi"),
            '\u{FB04}' => Some("ffl"),
            '\u{FB05}' => Some("ft"),
            '\u{FB06}' => Some("st"),
            _ => None,
        };
        match expansion {
            Some(letters) => spelled.push_str(letters),
            None => spelled.push(ch),
        }
    }
    spelled
}
161
/// Hash a font resource name with the standard library's `DefaultHasher`.
fn hash_name(name: &[u8]) -> u64 {
    let mut state = std::collections::hash_map::DefaultHasher::new();
    std::hash::Hash::hash(name, &mut state);
    std::hash::Hasher::finish(&state)
}
168
169/// Resolve a possibly-indirect object to a dictionary.
170fn as_dict<'a>(doc: &'a Document, obj: &'a Object) -> Option<&'a Dictionary> {
171    match obj {
172        Object::Dictionary(d) => Some(d),
173        Object::Reference(id) => doc.get_object(*id).ok().and_then(|o| o.as_dict().ok()),
174        _ => None,
175    }
176}
177
178fn deref<'a>(doc: &'a Document, obj: &'a Object) -> Option<&'a Object> {
179    match obj {
180        Object::Reference(id) => doc.get_object(*id).ok(),
181        other => Some(other),
182    }
183}
184
185/// Parse one font dictionary into a [`Font`].
186fn parse_font(doc: &Document, name: &[u8], fdict: &Dictionary) -> Font {
187    let subtype: &[u8] = fdict
188        .get(b"Subtype")
189        .ok()
190        .and_then(|o| o.as_name().ok())
191        .unwrap_or(&[]);
192    let two_byte = subtype == b"Type0".as_slice();
193
194    let to_unicode = fdict
195        .get(b"ToUnicode")
196        .ok()
197        .and_then(|o| deref(doc, o))
198        .and_then(|o| o.as_stream().ok())
199        .and_then(|s| s.decompressed_content().ok())
200        .map(|data| parse_tounicode(&data))
201        .unwrap_or_default();
202
203    let (widths, default_width) = if two_byte {
204        cid_widths(doc, fdict)
205    } else {
206        simple_widths(doc, fdict)
207    };
208
209    let simple_encoding = if two_byte {
210        None
211    } else {
212        Some(simple_encoding_table(doc, fdict))
213    };
214    let fallback_names = if two_byte {
215        HashMap::new()
216    } else {
217        differences_gid_names(doc, fdict)
218    };
219    let program_encoding = if two_byte {
220        HashMap::new()
221    } else {
222        type1_program_encoding(doc, fdict)
223    };
224
225    let (ascent, descent) = font_ascent_descent(doc, fdict, two_byte);
226
227    Font {
228        two_byte,
229        to_unicode,
230        widths,
231        default_width,
232        simple_encoding,
233        fallback_names,
234        program_encoding,
235        ascent,
236        descent,
237        hash: hash_name(name),
238    }
239}
240
241/// Collect `/Differences` entries whose glyph name is a GID placeholder
242/// (`g115`, `cid42`, `glyph7`, `index9`) with no Unicode mapping. docling-parse
243/// emits such glyphs as the literal name `/g115`; mapping them here keeps the
244/// text from being silently dropped (subsetted fonts with no ToUnicode). The
245/// GID-name restriction keeps real Adobe glyph names on the normal path so this
246/// never invents garbage on the clean files.
247fn differences_gid_names(doc: &Document, fdict: &Dictionary) -> HashMap<u8, String> {
248    let mut map = HashMap::new();
249    let Some(Object::Dictionary(enc)) = fdict.get(b"Encoding").ok().and_then(|o| deref(doc, o))
250    else {
251        return map;
252    };
253    let Some(Object::Array(diffs)) = enc.get(b"Differences").ok().and_then(|o| deref(doc, o))
254    else {
255        return map;
256    };
257    let mut code = 0u8;
258    for el in diffs {
259        match el {
260            Object::Integer(i) => code = *i as u8,
261            Object::Name(name) => {
262                if glyph_name_to_char(name).is_none() && is_gid_name(name) {
263                    map.insert(code, String::from_utf8_lossy(name).into_owned());
264                }
265                code = code.wrapping_add(1);
266            }
267            _ => {}
268        }
269    }
270    map
271}
272
273/// Parse the embedded Type1 font program's built-in `/Encoding` vector
274/// (`dup <code> /<glyphname> put` entries in the clear-text header before
275/// `eexec`) into `code → char`. This is how docling recovers glyphs from
276/// standard TeX math fonts (CMMI/CMSY/…) that carry no PDF `/Encoding` and no
277/// ToUnicode — e.g. CMMI's `dup 21 /lambda` or CMSY's `dup 20 /lessequal`.
278/// Only `FontFile` (Type1) is parsed; CFF (`FontFile3`) and TrueType
279/// (`FontFile2`) store their encoding in a binary table and are left alone.
280fn type1_program_encoding(doc: &Document, fdict: &Dictionary) -> HashMap<u8, char> {
281    let mut map = HashMap::new();
282    let Some(desc) = fdict
283        .get(b"FontDescriptor")
284        .ok()
285        .and_then(|o| deref(doc, o))
286        .and_then(|o| o.as_dict().ok())
287    else {
288        return map;
289    };
290    let Some(data) = desc
291        .get(b"FontFile")
292        .ok()
293        .and_then(|o| deref(doc, o))
294        .and_then(|o| o.as_stream().ok())
295        .and_then(|s| s.decompressed_content().ok())
296    else {
297        return map;
298    };
299    // The clear-text header (PostScript) ends at `eexec`; the rest is encrypted.
300    let head_end = data
301        .windows(5)
302        .position(|w| w == b"eexec")
303        .unwrap_or(data.len());
304    let head = String::from_utf8_lossy(&data[..head_end]);
305    // Scan for `dup <code> /<name> put` tokens.
306    let toks: Vec<&str> = head.split_whitespace().collect();
307    for w in toks.windows(4) {
308        if w[0] == "dup" && w[3] == "put" {
309            if let (Ok(code), Some(name)) = (w[1].parse::<u32>(), w[2].strip_prefix('/')) {
310                if code <= 255 {
311                    if let Some(ch) = glyph_name_to_char(name.as_bytes()) {
312                        map.insert(code as u8, ch);
313                    }
314                }
315            }
316        }
317    }
318    map
319}
320
/// Whether `name` is a synthetic placeholder rather than a real Adobe glyph
/// name: `g115`, `cid42`, `glyph7`, `index9`, `G12`, or a short-prefix code
/// name such as `SM590000` (IBM BookMaster). Placeholders carry no Unicode
/// meaning and docling-parse emits them verbatim (`/SM590000`). `afii####` and
/// `uni####` are real Adobe names and always rejected, as is anything that is
/// not valid UTF-8.
fn is_gid_name(name: &[u8]) -> bool {
    let text = match std::str::from_utf8(name) {
        Ok(text) => text,
        Err(_) => return false,
    };
    if ["afii", "uni"].iter().any(|real| text.starts_with(*real)) {
        return false;
    }

    let all_digits = |tail: &str| tail.bytes().all(|b| b.is_ascii_digit());

    // Known counter prefixes followed by at least one digit and nothing else.
    let counter_prefixes = ["g", "G", "cid", "CID", "glyph", "index"];
    let is_counter = counter_prefixes.iter().any(|prefix| {
        text.strip_prefix(*prefix)
            .map_or(false, |tail| !tail.is_empty() && all_digits(tail))
    });
    if is_counter {
        return true;
    }

    // One to three leading letters followed by three or more digits — synthetic
    // code names like `SM590000`, unlike real Adobe names (whole words or
    // letter+`.suffix` variants).
    let letters = text
        .bytes()
        .take_while(|b| b.is_ascii_alphabetic())
        .count();
    let tail = &text[letters..];
    (1..=3).contains(&letters) && tail.len() >= 3 && all_digits(tail)
}
349
350fn font_ascent_descent(doc: &Document, fdict: &Dictionary, two_byte: bool) -> (f64, f64) {
351    // For Type0, the descriptor lives on the descendant CIDFont.
352    let descr_owner = if two_byte {
353        fdict
354            .get(b"DescendantFonts")
355            .ok()
356            .and_then(|o| deref(doc, o))
357            .and_then(|o| match o {
358                Object::Array(a) => a.first(),
359                _ => None,
360            })
361            .and_then(|o| as_dict(doc, o))
362    } else {
363        Some(fdict)
364    };
365    let fd = descr_owner
366        .and_then(|d| d.get(b"FontDescriptor").ok())
367        .and_then(|o| as_dict(doc, o));
368    let asc = fd
369        .and_then(|d| d.get(b"Ascent").ok())
370        .and_then(|o| {
371            o.as_float()
372                .ok()
373                .or_else(|| o.as_i64().ok().map(|i| i as f32))
374        })
375        .unwrap_or(750.0) as f64;
376    let desc = fd
377        .and_then(|d| d.get(b"Descent").ok())
378        .and_then(|o| {
379            o.as_float()
380                .ok()
381                .or_else(|| o.as_i64().ok().map(|i| i as f32))
382        })
383        .unwrap_or(-250.0) as f64;
384    // Some subsetted fonts carry a degenerate FontDescriptor (`/Ascent 0
385    // /Descent 0`) — the real metrics live in the font program. That collapses
386    // the loose box to zero height, so the line cells get zero area and the
387    // layout's region/text assignment drops them (2305's References list lost
388    // every prose line, keeping only the URLs). Fall back to typical text metrics
389    // so the box has height.
390    if asc - desc <= 1.0 {
391        return (750.0, -250.0);
392    }
393    (asc, desc)
394}
395
396/// Simple-font widths: `/FirstChar` + `/Widths` array, `/MissingWidth` default.
397fn simple_widths(doc: &Document, fdict: &Dictionary) -> (HashMap<u32, f64>, f64) {
398    let mut map = HashMap::new();
399    let first = fdict
400        .get(b"FirstChar")
401        .ok()
402        .and_then(|o| o.as_i64().ok())
403        .unwrap_or(0) as u32;
404    if let Some(Object::Array(arr)) = fdict.get(b"Widths").ok().and_then(|o| deref(doc, o)) {
405        for (i, w) in arr.iter().enumerate() {
406            if let Some(w) = num(w) {
407                map.insert(first + i as u32, w);
408            }
409        }
410    }
411    let dw = fdict
412        .get(b"FontDescriptor")
413        .ok()
414        .and_then(|o| as_dict(doc, o))
415        .and_then(|d| d.get(b"MissingWidth").ok())
416        .and_then(num)
417        .unwrap_or(0.0);
418    (map, dw)
419}
420
421/// CIDFont widths: the `/W` array on the descendant font (`/DW` default = 1000).
422fn cid_widths(doc: &Document, fdict: &Dictionary) -> (HashMap<u32, f64>, f64) {
423    let mut map = HashMap::new();
424    let Some(desc) = fdict
425        .get(b"DescendantFonts")
426        .ok()
427        .and_then(|o| deref(doc, o))
428        .and_then(|o| match o {
429            Object::Array(a) => a.first(),
430            _ => None,
431        })
432        .and_then(|o| as_dict(doc, o))
433    else {
434        return (map, 1000.0);
435    };
436    let dw = desc.get(b"DW").ok().and_then(num).unwrap_or(1000.0);
437    if let Some(Object::Array(w)) = desc.get(b"W").ok().and_then(|o| deref(doc, o)) {
438        let mut i = 0;
439        while i < w.len() {
440            let c = w.get(i).and_then(num);
441            match (c, w.get(i + 1)) {
442                // `c [w1 w2 ...]`: consecutive CIDs starting at c.
443                (Some(c), Some(Object::Array(list))) => {
444                    for (k, wv) in list.iter().enumerate() {
445                        if let Some(wv) = num(wv) {
446                            map.insert(c as u32 + k as u32, wv);
447                        }
448                    }
449                    i += 2;
450                }
451                // `c_first c_last w`: a run all of width w.
452                (Some(c1), Some(o2)) => {
453                    if let (Some(c2), Some(wv)) = (num(o2), w.get(i + 2).and_then(num)) {
454                        for cid in c1 as u32..=c2 as u32 {
455                            map.insert(cid, wv);
456                        }
457                    }
458                    i += 3;
459                }
460                _ => break,
461            }
462        }
463    }
464    (map, dw)
465}
466
467fn num(o: &Object) -> Option<f64> {
468    match o {
469        Object::Integer(i) => Some(*i as f64),
470        Object::Real(r) => Some(*r as f64),
471        _ => None,
472    }
473}
474
/// Parse a ToUnicode CMap's `bfchar` / `bfrange` sections into code→string.
///
/// Supported forms:
/// * `beginbfchar … endbfchar`: `<src> <dst>` pairs;
/// * `beginbfrange … endbfrange`: `<lo> <hi> <dst>` (consecutive destinations
///   starting at `dst`) and `<lo> <hi> [<d0> <d1> …]` (one destination each).
///
/// In the `<lo> <hi> <dst>` form the destination is decoded as UTF-16 and its
/// **last character** is advanced once per code. That keeps single BMP
/// destinations behaving as before while also handling surrogate pairs (e.g.
/// `<D835DC00>`) and multi-character destinations (e.g. `<00660069>` = "fi"),
/// which used to be folded into one out-of-range number and dropped. Entries
/// whose destination is empty, is not valid UTF-16, or would step onto a
/// non-scalar value are skipped. All code arithmetic is overflow-free and the
/// range loop stops at U+10FFFF, so a hostile range cannot panic or spin.
///
/// Anything outside those sections is ignored; malformed entries are skipped
/// rather than reported.
fn parse_tounicode(data: &[u8]) -> HashMap<u32, String> {
    let text = String::from_utf8_lossy(data);
    let tokens = tounicode_tokens(&text);
    let mut map = HashMap::new();

    let mut i = 0;
    while i < tokens.len() {
        match tokens[i] {
            "beginbfchar" => {
                i += 1;
                while i + 1 < tokens.len() && tokens[i] != "endbfchar" {
                    if let (Some(src), Some(dst)) = (
                        tounicode_hex_units(tokens[i]),
                        tounicode_hex_units(tokens[i + 1]),
                    ) {
                        map.insert(tounicode_code(&src), String::from_utf16_lossy(&dst));
                    }
                    i += 2;
                }
            }
            "beginbfrange" => {
                i += 1;
                while i + 2 < tokens.len() && tokens[i] != "endbfrange" {
                    let (Some(lo), Some(hi)) = (
                        tounicode_hex_units(tokens[i]),
                        tounicode_hex_units(tokens[i + 1]),
                    ) else {
                        // Not a range start: resynchronize one token later.
                        i += 1;
                        continue;
                    };
                    let lo = tounicode_code(&lo);
                    let hi = tounicode_code(&hi);
                    if tokens[i + 2] == "[" {
                        // `<lo> <hi> [ <d0> <d1> ... ]`: one dst per code in the range.
                        let mut j = i + 3;
                        // `None` once the code would pass `u32::MAX`.
                        let mut code = Some(lo);
                        while j < tokens.len() && tokens[j] != "]" {
                            if let (Some(c), Some(dst)) = (code, tounicode_hex_units(tokens[j])) {
                                map.insert(c, String::from_utf16_lossy(&dst));
                            }
                            code = code.and_then(|c| c.checked_add(1));
                            j += 1;
                        }
                        i = j + 1;
                    } else if let Some(dst) = tounicode_hex_units(tokens[i + 2]) {
                        // `<lo> <hi> <dst>`: consecutive Unicode from a base.
                        if let Some((prefix, last)) = tounicode_split_dst(&dst) {
                            // Zipping with the scalar range bounds the loop
                            // and rules out overflow.
                            for (code, scalar) in (lo..=hi).zip(last..=0x10_FFFF_u32) {
                                if let Some(ch) = char::from_u32(scalar) {
                                    let mut mapped = prefix.clone();
                                    mapped.push(ch);
                                    map.insert(code, mapped);
                                }
                            }
                        }
                        i += 3;
                    } else {
                        i += 1;
                    }
                }
            }
            _ => i += 1,
        }
    }
    map
}

/// Split CMap text into tokens by structure rather than whitespace: `<…>` hex
/// groups (often written back-to-back, `<21><21><0054>`), single `[` / `]`
/// brackets, and barewords. An unterminated `<…` runs to the end of the text.
fn tounicode_tokens(text: &str) -> Vec<&str> {
    let bytes = text.as_bytes();
    let mut toks = Vec::new();
    let mut i = 0;
    while i < bytes.len() {
        let c = bytes[i];
        if c.is_ascii_whitespace() {
            i += 1;
            continue;
        }
        let start = i;
        if c == b'<' {
            while i < bytes.len() && bytes[i] != b'>' {
                i += 1;
            }
            // Include the closing '>' when there is one.
            i = (i + 1).min(bytes.len());
        } else if c == b'[' || c == b']' {
            i += 1;
        } else {
            while i < bytes.len()
                && !bytes[i].is_ascii_whitespace()
                && bytes[i] != b'<'
                && bytes[i] != b'['
                && bytes[i] != b']'
            {
                i += 1;
            }
        }
        // Every boundary sits next to an ASCII byte or at the end of the text,
        // so the slice never splits a multi-byte character.
        toks.push(&text[start..i]);
    }
    toks
}

/// Decode a `<hex>` token into big-endian UTF-16 code units (a lone trailing
/// byte becomes its own unit). `None` if the token is not wrapped in `<` `>`.
fn tounicode_hex_units(token: &str) -> Option<Vec<u16>> {
    let digits = token.trim().strip_prefix('<')?.strip_suffix('>')?;
    let bytes: Vec<u8> = (0..digits.len())
        .step_by(2)
        .filter_map(|at| u8::from_str_radix(digits.get(at..at + 2)?, 16).ok())
        .collect();
    Some(
        bytes
            .chunks(2)
            .map(|pair| match *pair {
                [hi, lo] => u16::from_be_bytes([hi, lo]),
                _ => u16::from(pair[0]),
            })
            .collect(),
    )
}

/// Fold a source code's 16-bit units into one integer code.
fn tounicode_code(units: &[u16]) -> u32 {
    units
        .iter()
        .fold(0u32, |acc, &unit| (acc << 16) | u32::from(unit))
}

/// Split a `bfrange` base destination into the text before its last character
/// and that character's scalar value. `None` for an empty destination or one
/// that is not valid UTF-16.
fn tounicode_split_dst(units: &[u16]) -> Option<(String, u32)> {
    let mut chars = std::char::decode_utf16(units.iter().copied())
        .collect::<Result<Vec<char>, _>>()
        .ok()?;
    let last = chars.pop()?;
    Some((chars.into_iter().collect(), last as u32))
}
594
595/// Decode a PDF string literal in a Tj/TJ operand into raw code units.
596fn codes(font: &Font, bytes: &[u8]) -> Vec<u32> {
597    if font.two_byte {
598        bytes
599            .chunks(2)
600            .map(|c| {
601                if c.len() == 2 {
602                    ((c[0] as u32) << 8) | c[1] as u32
603                } else {
604                    c[0] as u32
605                }
606            })
607            .collect()
608    } else {
609        bytes.iter().map(|&b| b as u32).collect()
610    }
611}
612
613/// Page size (width, height) in PDF points from the MediaBox.
614fn page_size(doc: &Document, page_id: lopdf::ObjectId) -> (f32, f32) {
615    let mb = doc
616        .get_object(page_id)
617        .ok()
618        .and_then(|o| o.as_dict().ok())
619        .and_then(|d| {
620            // MediaBox may be inherited; lopdf resolves via get_page... fall back to a guess.
621            d.get(b"MediaBox").ok().cloned()
622        })
623        .or_else(|| {
624            doc.get_dictionary(page_id)
625                .ok()
626                .and_then(|d| d.get(b"MediaBox").ok().cloned())
627        });
628    if let Some(Object::Array(a)) = mb {
629        let v: Vec<f32> = a.iter().filter_map(|o| num(o).map(|x| x as f32)).collect();
630        if v.len() == 4 {
631            return ((v[2] - v[0]).abs(), (v[3] - v[1]).abs());
632        }
633    }
634    (612.0, 792.0)
635}
636
637/// Debug: raw glyph stream `(ch, ll, lr, lb, lt)` (native coords) for page
638/// `index`, before the sanitizer. For comparing char cells to docling-parse.
639pub fn debug_glyphs(bytes: &[u8], index: usize) -> Vec<(char, f32, f32, f32, f32)> {
640    let Ok(doc) = Document::load_mem(bytes) else {
641        return Vec::new();
642    };
643    let mut pages: Vec<_> = doc.get_pages().into_iter().collect();
644    pages.sort_by_key(|(n, _)| *n);
645    let Some((_, pid)) = pages.get(index) else {
646        return Vec::new();
647    };
648    page_glyphs(&doc, *pid)
649        .into_iter()
650        .map(|g| (g.ch, g.ll, g.lr, g.lb, g.lt))
651        .collect()
652}
653
654/// Public entry: per-page (width, height, line cells) for a PDF, via the Rust
655/// text parser + the docling-parse line sanitizer. Used by the pipeline and the
656/// `textparse_dump` example.
657pub fn pdf_textlines(bytes: &[u8]) -> Vec<(f32, f32, Vec<crate::pdfium_backend::TextCell>)> {
658    let Ok(doc) = Document::load_mem(bytes) else {
659        return Vec::new();
660    };
661    let mut caches = DocCaches::default();
662    let mut pages: Vec<_> = doc.get_pages().into_iter().collect();
663    pages.sort_by_key(|(n, _)| *n);
664    pages
665        .into_iter()
666        .map(|(_, pid)| {
667            let (w, h) = page_size(&doc, pid);
668            let glyphs = page_glyphs_cached(&doc, pid, &mut caches);
669            let cells = crate::dp_lines::line_cells(&glyphs, h, true);
670            (w, h, cells)
671        })
672        .collect()
673}
674
675/// Debug/diagnostic entry: per-page (width, height, word cells) for a PDF, via
676/// the Rust parser glyphs run through the docling-parse word grouping. Used to
677/// compare parser word cells against docling-parse's `word_cells` oracle (roadmap
678/// item 6).
679pub fn pdf_words(bytes: &[u8]) -> Vec<(f32, f32, Vec<crate::pdfium_backend::TextCell>)> {
680    let Ok(doc) = Document::load_mem(bytes) else {
681        return Vec::new();
682    };
683    let mut caches = DocCaches::default();
684    let mut pages: Vec<_> = doc.get_pages().into_iter().collect();
685    pages.sort_by_key(|(n, _)| *n);
686    pages
687        .into_iter()
688        .map(|(_, pid)| {
689            let (w, h) = page_size(&doc, pid);
690            let glyphs = page_glyphs_cached(&doc, pid, &mut caches);
691            let cells = crate::dp_lines::word_cells(&glyphs, h, true);
692            (w, h, cells)
693        })
694        .collect()
695}
696
697/// One page's text cells from the pure-Rust parser: prose line cells, per-word
698/// cells, and code line cells — all from a single glyph parse. Replaces the
699/// pdfium text path (roadmap item 6) when the parser drop is enabled.
700#[derive(Default)]
701pub struct PageParserCells {
702    pub prose: Vec<crate::pdfium_backend::TextCell>,
703    pub words: Vec<crate::pdfium_backend::TextCell>,
704    pub code: Vec<crate::pdfium_backend::TextCell>,
705}
706
707/// Full parser text layer: prose + word + code cells per page, glyphs parsed once.
708/// `prose`/`words` come from the docling-parse contraction ([`crate::dp_lines`]);
709/// `code` splits only at the parser's own space glyphs (monospace keeps its
710/// source spacing). Used by the pipeline to retire pdfium's text path.
711pub fn pdf_all_cells(bytes: &[u8]) -> Vec<PageParserCells> {
712    let Ok(doc) = Document::load_mem(bytes) else {
713        return Vec::new();
714    };
715    let mut caches = DocCaches::default();
716    let mut pages: Vec<_> = doc.get_pages().into_iter().collect();
717    pages.sort_by_key(|(n, _)| *n);
718    pages
719        .into_iter()
720        .map(|(_, pid)| {
721            let (_w, h) = page_size(&doc, pid);
722            let glyphs = page_glyphs_cached(&doc, pid, &mut caches);
723            let (prose, words) = crate::dp_lines::line_and_word_cells(&glyphs, h, true);
724            PageParserCells {
725                prose,
726                words,
727                code: crate::pdfium_backend::code_cells_from_glyphs(&glyphs, h),
728            }
729        })
730        .collect()
731}
732
/// The scalar text parameters a Form XObject inherits when it is invoked via
/// `Do`. The PDF graphics state carries these text parameters but not the text
/// matrices, which a form re-establishes inside its own `BT`/`ET`.
#[derive(Clone, Copy)]
struct TextState {
    /// Character spacing (`Tc`).
    tc: f64,
    /// Word spacing (`Tw`).
    tw: f64,
    /// Horizontal scaling as a factor (1.0 = unscaled).
    th: f64,
    /// Leading (`TL`).
    tl: f64,
    /// Text rise.
    trise: f64,
    /// Font size.
    fsize: f64,
}

impl TextState {
    /// State at the start of a page: everything zero except the horizontal
    /// scale, which starts at 1.
    const INIT: Self = Self {
        tc: 0.0,
        tw: 0.0,
        th: 1.0,
        tl: 0.0,
        trise: 0.0,
        fsize: 0.0,
    };
}
756
757/// The effective `/Resources` dictionary for a page (inline or via reference,
758/// falling back to an inherited one from a `/Parent`).
759fn page_res(doc: &Document, page_id: lopdf::ObjectId) -> Option<&Dictionary> {
760    let (inline, ids) = doc.get_page_resources(page_id).ok()?;
761    if let Some(d) = inline {
762        return Some(d);
763    }
764    ids.into_iter().find_map(|id| doc.get_dictionary(id).ok())
765}
766
767/// Build the code→[`Font`] map for a resources dictionary's `/Font` sub-dict,
768/// reusing the per-document cache for fonts referenced indirectly (the common
769/// case — the same font objects recur on every page).
770fn fonts_from_res(
771    doc: &Document,
772    res: &Dictionary,
773    caches: &mut DocCaches,
774) -> HashMap<Vec<u8>, Rc<Font>> {
775    let mut map = HashMap::new();
776    let font_dict = res
777        .get(b"Font")
778        .ok()
779        .and_then(|o| deref(doc, o))
780        .and_then(|o| o.as_dict().ok());
781    if let Some(fd) = font_dict {
782        for (name, value) in fd.iter() {
783            let font = match value {
784                Object::Reference(id) => {
785                    let key = (*id, name.clone());
786                    if let Some(f) = caches.fonts.get(&key) {
787                        Rc::clone(f)
788                    } else if let Some(fdict) = deref(doc, value).and_then(|o| o.as_dict().ok()) {
789                        let f = Rc::new(parse_font(doc, name, fdict));
790                        caches.fonts.insert(key, Rc::clone(&f));
791                        f
792                    } else {
793                        continue;
794                    }
795                }
796                _ => {
797                    if let Some(fdict) = deref(doc, value).and_then(|o| o.as_dict().ok()) {
798                        Rc::new(parse_font(doc, name, fdict))
799                    } else {
800                        continue;
801                    }
802                }
803            };
804            map.insert(name.clone(), font);
805        }
806    }
807    map
808}
809
810/// Extract every glyph on a page as a native-coordinate [`Glyph`].
811pub(crate) fn page_glyphs(doc: &Document, page_id: lopdf::ObjectId) -> Vec<Glyph> {
812    page_glyphs_cached(doc, page_id, &mut DocCaches::default())
813}
814
815/// [`page_glyphs`] with an explicit per-document cache, so a multi-page walk
816/// parses each font / decodes each form once instead of once per page.
817fn page_glyphs_cached(
818    doc: &Document,
819    page_id: lopdf::ObjectId,
820    caches: &mut DocCaches,
821) -> Vec<Glyph> {
822    let mut out = Vec::new();
823    let Ok(content_bytes) = doc.get_page_content(page_id) else {
824        return out;
825    };
826    let Ok(content) = lopdf::content::Content::decode(&content_bytes) else {
827        return out;
828    };
829    if let Some(res) = page_res(doc, page_id) {
830        run_content(
831            doc,
832            res,
833            &content,
834            Mat::ID,
835            TextState::INIT,
836            0,
837            caches,
838            &mut out,
839        );
840    }
841    out
842}
843
844/// Run a content stream's operators, emitting glyphs into `out`. Recurses into
845/// Form XObjects on `Do` (bulk body text in heavy PDFs lives inside a form, not
846/// the page content stream). `res` is the resources dict in scope (the page's,
847/// or the form's own); `base_ctm` is the CTM at the point of invocation.
848#[allow(clippy::too_many_arguments)]
849fn run_content(
850    doc: &Document,
851    res: &Dictionary,
852    content: &lopdf::content::Content,
853    base_ctm: Mat,
854    init: TextState,
855    depth: u32,
856    caches: &mut DocCaches,
857    out: &mut Vec<Glyph>,
858) {
859    let fonts = fonts_from_res(doc, res, caches);
860    let xobjects = res
861        .get(b"XObject")
862        .ok()
863        .and_then(|o| deref(doc, o))
864        .and_then(|o| o.as_dict().ok());
865
866    // Graphics + text state. `q`/`Q` save and restore the whole graphics state,
867    // which includes the text parameters (Tc, Tw, Tz, TL, Tfs, Trise, font) —
868    // *not* the text matrix (that is reset by BT). Saving only the CTM let a Tc
869    // set inside a `q…Q` block leak out and drift every later glyph.
870    #[allow(clippy::type_complexity)]
871    let mut gstate_stack: Vec<(Mat, f64, f64, f64, f64, f64, f64, Option<&Rc<Font>>)> = Vec::new();
872    let mut ctm = base_ctm;
873    let mut tm = Mat::ID;
874    let mut tlm = Mat::ID;
875    let mut font: Option<&Rc<Font>> = None;
876    let mut fsize = init.fsize;
877    let mut tc = init.tc; // char spacing
878    let mut tw = init.tw; // word spacing
879    let mut th = init.th; // horizontal scale (Tz/100)
880    let mut tl = init.tl; // leading
881    let mut trise = init.trise;
882
883    let op_f = |operands: &[Object], i: usize| operands.get(i).and_then(num).unwrap_or(0.0);
884
885    for op in &content.operations {
886        let operands = &op.operands;
887        match op.operator.as_str() {
888            "q" => gstate_stack.push((ctm, tc, tw, th, tl, trise, fsize, font)),
889            "Q" => {
890                if let Some((c, a, b, h, l, r, fs, f)) = gstate_stack.pop() {
891                    ctm = c;
892                    tc = a;
893                    tw = b;
894                    th = h;
895                    tl = l;
896                    trise = r;
897                    fsize = fs;
898                    font = f;
899                }
900            }
901            "cm" => {
902                let m = Mat {
903                    a: op_f(operands, 0),
904                    b: op_f(operands, 1),
905                    c: op_f(operands, 2),
906                    d: op_f(operands, 3),
907                    e: op_f(operands, 4),
908                    f: op_f(operands, 5),
909                };
910                ctm = m.then(ctm);
911            }
912            "BT" => {
913                tm = Mat::ID;
914                tlm = Mat::ID;
915            }
916            "ET" => {}
917            "Tf" => {
918                if let Some(Object::Name(n)) = operands.first() {
919                    font = fonts.get(n.as_slice());
920                }
921                fsize = op_f(operands, 1);
922            }
923            "Td" => {
924                tlm = Mat {
925                    a: 1.0,
926                    b: 0.0,
927                    c: 0.0,
928                    d: 1.0,
929                    e: op_f(operands, 0),
930                    f: op_f(operands, 1),
931                }
932                .then(tlm);
933                tm = tlm;
934            }
935            "TD" => {
936                tl = -op_f(operands, 1);
937                tlm = Mat {
938                    a: 1.0,
939                    b: 0.0,
940                    c: 0.0,
941                    d: 1.0,
942                    e: op_f(operands, 0),
943                    f: op_f(operands, 1),
944                }
945                .then(tlm);
946                tm = tlm;
947            }
948            "Tm" => {
949                tlm = Mat {
950                    a: op_f(operands, 0),
951                    b: op_f(operands, 1),
952                    c: op_f(operands, 2),
953                    d: op_f(operands, 3),
954                    e: op_f(operands, 4),
955                    f: op_f(operands, 5),
956                };
957                tm = tlm;
958            }
959            "T*" => {
960                tlm = Mat {
961                    a: 1.0,
962                    b: 0.0,
963                    c: 0.0,
964                    d: 1.0,
965                    e: 0.0,
966                    f: -tl,
967                }
968                .then(tlm);
969                tm = tlm;
970            }
971            "Tc" => tc = op_f(operands, 0),
972            "Tw" => tw = op_f(operands, 0),
973            "Tz" => th = op_f(operands, 0) / 100.0,
974            "TL" => tl = op_f(operands, 0),
975            "Ts" => trise = op_f(operands, 0),
976            "Tj" | "'" | "\"" => {
977                if op.operator == "'" || op.operator == "\"" {
978                    // move to next line first
979                    tlm = Mat {
980                        a: 1.0,
981                        b: 0.0,
982                        c: 0.0,
983                        d: 1.0,
984                        e: 0.0,
985                        f: -tl,
986                    }
987                    .then(tlm);
988                    tm = tlm;
989                }
990                if op.operator == "\"" {
991                    // `aw ac string "` sets word- and char-spacing before
992                    // showing the string (PDF 32000-1 §9.4.3), persisting after.
993                    tw = op_f(operands, 0);
994                    tc = op_f(operands, 1);
995                }
996                if let (Some(f), Some(Object::String(s, _))) = (font, operands.last()) {
997                    show_text(f, s, fsize, tc, tw, th, trise, &mut tm, ctm, out);
998                }
999            }
1000            "TJ" => {
1001                if let (Some(f), Some(Object::Array(arr))) = (font, operands.first()) {
1002                    for el in arr {
1003                        match el {
1004                            Object::String(s, _) => {
1005                                show_text(f, s, fsize, tc, tw, th, trise, &mut tm, ctm, out)
1006                            }
1007                            other => {
1008                                if let Some(adj) = num(other) {
1009                                    // negative number moves text right (PDF: subtract)
1010                                    let tx = -adj / 1000.0 * fsize * th;
1011                                    tm = Mat {
1012                                        a: 1.0,
1013                                        b: 0.0,
1014                                        c: 0.0,
1015                                        d: 1.0,
1016                                        e: tx,
1017                                        f: 0.0,
1018                                    }
1019                                    .then(tm);
1020                                }
1021                            }
1022                        }
1023                    }
1024                }
1025            }
1026            "Do" => {
1027                // Invoke a Form XObject: bulk body text in many PDFs lives inside
1028                // a form, reached only here. Image XObjects are skipped (no text).
1029                if depth >= 8 {
1030                    continue;
1031                }
1032                let Some(Object::Name(n)) = operands.first() else {
1033                    continue;
1034                };
1035                let obj = xobjects.and_then(|d| d.get(n.as_slice()).ok());
1036                let form_id = match obj {
1037                    Some(Object::Reference(id)) => Some(*id),
1038                    _ => None,
1039                };
1040                let stream = obj
1041                    .and_then(|o| deref(doc, o))
1042                    .and_then(|o| o.as_stream().ok());
1043                let Some(stream) = stream else { continue };
1044                let is_form = stream
1045                    .dict
1046                    .get(b"Subtype")
1047                    .ok()
1048                    .and_then(|o| o.as_name().ok())
1049                    == Some(b"Form".as_slice());
1050                if !is_form {
1051                    continue;
1052                }
1053                // Decode the form's content once per document (headers/footers
1054                // and bulk body text invoke the same form on every page).
1055                let cached = form_id.and_then(|id| caches.forms.get(&id).cloned());
1056                let form_content = match cached {
1057                    Some(c) => c,
1058                    None => {
1059                        let Ok(data) = stream.decompressed_content() else {
1060                            continue;
1061                        };
1062                        let Ok(c) = lopdf::content::Content::decode(&data) else {
1063                            continue;
1064                        };
1065                        let c = Rc::new(c);
1066                        if let Some(id) = form_id {
1067                            caches.forms.insert(id, Rc::clone(&c));
1068                        }
1069                        c
1070                    }
1071                };
1072                // The form's /Matrix maps form space into the CTM at invocation.
1073                let form_mat = match stream.dict.get(b"Matrix").ok() {
1074                    Some(Object::Array(a)) if a.len() == 6 => {
1075                        let v: Vec<f64> = a.iter().filter_map(num).collect();
1076                        if v.len() == 6 {
1077                            Mat {
1078                                a: v[0],
1079                                b: v[1],
1080                                c: v[2],
1081                                d: v[3],
1082                                e: v[4],
1083                                f: v[5],
1084                            }
1085                        } else {
1086                            Mat::ID
1087                        }
1088                    }
1089                    _ => Mat::ID,
1090                };
1091                // The form's own /Resources, falling back to the inherited ones.
1092                let form_res = stream
1093                    .dict
1094                    .get(b"Resources")
1095                    .ok()
1096                    .and_then(|o| deref(doc, o))
1097                    .and_then(|o| o.as_dict().ok())
1098                    .unwrap_or(res);
1099                let state = TextState {
1100                    tc,
1101                    tw,
1102                    th,
1103                    tl,
1104                    trise,
1105                    fsize,
1106                };
1107                run_content(
1108                    doc,
1109                    form_res,
1110                    &form_content,
1111                    form_mat.then(ctm),
1112                    state,
1113                    depth + 1,
1114                    caches,
1115                    out,
1116                );
1117            }
1118            _ => {}
1119        }
1120    }
1121}
1122
1123#[allow(clippy::too_many_arguments)]
1124fn show_text(
1125    font: &Font,
1126    bytes: &[u8],
1127    fsize: f64,
1128    tc: f64,
1129    tw: f64,
1130    th: f64,
1131    trise: f64,
1132    tm: &mut Mat,
1133    ctm: Mat,
1134    out: &mut Vec<Glyph>,
1135) {
1136    for code in codes(font, bytes) {
1137        let (text, w) = font.decode_code(code);
1138        let w0 = w / 1000.0; // advance in text-space (em) units
1139                             // The glyph→user transform: scale glyph space by font size, then Tm, CTM.
1140        let scale = Mat {
1141            a: fsize * th,
1142            b: 0.0,
1143            c: 0.0,
1144            d: fsize,
1145            e: 0.0,
1146            f: trise,
1147        };
1148        let trm = scale.then(*tm).then(ctm);
1149        // Box in glyph space (1000-unit em): x 0..w, y descent..ascent.
1150        let (x0, y0) = trm.apply(0.0, font.descent / 1000.0);
1151        let (x1, _y1) = trm.apply(w0, font.descent / 1000.0);
1152        let (_x2, y2) = trm.apply(0.0, font.ascent / 1000.0);
1153        let (left, right) = (x0.min(x1), x0.max(x1));
1154        let (bot, top) = (y0.min(y2), y0.max(y2));
1155        if let Some(s) = text {
1156            // A run may map one code to multiple chars (ligature/fraction); share box.
1157            for ch in s.chars() {
1158                if ch != '\u{0}' {
1159                    out.push(Glyph {
1160                        ch,
1161                        l: left as f32,
1162                        b: bot as f32,
1163                        r: right as f32,
1164                        t: top as f32,
1165                        ll: left as f32,
1166                        lb: bot as f32,
1167                        lr: right as f32,
1168                        lt: top as f32,
1169                        font: font.hash,
1170                    });
1171                }
1172            }
1173        }
1174        // Advance the text matrix. Word spacing applies to single-byte code 32.
1175        let is_space = !font.two_byte && code == 32;
1176        let tx = (w0 * fsize + tc + if is_space { tw } else { 0.0 }) * th;
1177        *tm = Mat {
1178            a: 1.0,
1179            b: 0.0,
1180            c: 0.0,
1181            d: 1.0,
1182            e: tx,
1183            f: 0.0,
1184        }
1185        .then(*tm);
1186    }
1187}
1188
1189/// Build a simple font's code→char table from its `/Encoding`: the base
1190/// encoding (WinAnsi / MacRoman) plus any `/Differences` overrides (glyph names
1191/// resolved through a small Adobe-glyph-name subset).
1192fn simple_encoding_table(doc: &Document, fdict: &Dictionary) -> HashMap<u8, char> {
1193    let enc = fdict.get(b"Encoding").ok().and_then(|o| deref(doc, o));
1194    let base_name = match enc {
1195        Some(Object::Name(n)) => n.clone(),
1196        Some(Object::Dictionary(d)) => d
1197            .get(b"BaseEncoding")
1198            .ok()
1199            .and_then(|o| o.as_name().ok())
1200            .map(|n| n.to_vec())
1201            .unwrap_or_default(),
1202        _ => Vec::new(),
1203    };
1204    let mut m = if base_name == b"MacRomanEncoding" {
1205        macroman_table()
1206    } else {
1207        winansi_table()
1208    };
1209    // Apply /Differences: `code /glyphname /glyphname ... code ...`.
1210    if let Some(Object::Dictionary(d)) = enc {
1211        if let Some(Object::Array(diffs)) = d.get(b"Differences").ok().and_then(|o| deref(doc, o)) {
1212            let mut code = 0u8;
1213            for el in diffs {
1214                match el {
1215                    Object::Integer(i) => code = *i as u8,
1216                    Object::Name(name) => {
1217                        if let Some(ch) = glyph_name_to_char(name) {
1218                            m.insert(code, ch);
1219                        }
1220                        code = code.wrapping_add(1);
1221                    }
1222                    _ => {}
1223                }
1224            }
1225        }
1226    }
1227    m
1228}
1229
/// Resolve an Adobe glyph name to Unicode: `uniXXXX`, single ASCII letters, the
/// digit/punctuation names from the Adobe Glyph List, and common typographic
/// names. A `.suffix` (`one.taboldstyle`, `a.sc`) is stripped and the base name
/// retried — docling renders these as the base character.
///
/// Returns `None` for names that are not valid UTF-8 or are not covered here.
///
/// The `uni` prefix is only treated as a code-point escape when four hex
/// digits actually follow it; other names that merely start with `uni`
/// (notably `union`) fall through to the name table.
fn glyph_name_to_char(name: &[u8]) -> Option<char> {
    let s = std::str::from_utf8(name).ok()?;
    if let Some(hex) = s.strip_prefix("uni") {
        // `get(0..4)` is `None` for a short tail (or a non-char-boundary cut);
        // that must not abort the lookup, only skip this branch.
        let code_point = hex
            .get(0..4)
            .and_then(|digits| u32::from_str_radix(digits, 16).ok());
        if let Some(cp) = code_point {
            return char::from_u32(cp);
        }
    }
    // Single ASCII letter names (`A`, `m`) map to themselves.
    if let &[b] = s.as_bytes() {
        if b.is_ascii_alphabetic() {
            return Some(b as char);
        }
    }
    let resolved = match s {
        "space" => ' ',
        "exclam" => '!',
        "quotedbl" => '"',
        "numbersign" => '#',
        "dollar" => '$',
        "percent" => '%',
        "ampersand" => '&',
        "quotesingle" => '\'',
        "parenleft" => '(',
        "parenright" => ')',
        "asterisk" => '*',
        "plus" => '+',
        "comma" => ',',
        "hyphen" => '-',
        "period" => '.',
        "slash" => '/',
        "zero" => '0',
        "one" => '1',
        "two" => '2',
        "three" => '3',
        "four" => '4',
        "five" => '5',
        "six" => '6',
        "seven" => '7',
        "eight" => '8',
        "nine" => '9',
        "colon" => ':',
        "semicolon" => ';',
        "less" => '<',
        "equal" => '=',
        "greater" => '>',
        "question" => '?',
        "at" => '@',
        "bracketleft" => '[',
        "backslash" => '\\',
        "bracketright" => ']',
        "asciicircum" => '^',
        "underscore" => '_',
        "grave" => '`',
        "braceleft" => '{',
        "bar" => '|',
        "braceright" => '}',
        "asciitilde" => '~',
        "bullet" => '\u{2022}',
        "periodcentered" => '\u{00B7}',
        "endash" => '\u{2013}',
        "emdash" => '\u{2014}',
        "quoteright" => '\u{2019}',
        "quoteleft" => '\u{2018}',
        "quotedblleft" => '\u{201C}',
        "quotedblright" => '\u{201D}',
        "quotedblbase" => '\u{201E}',
        "quotesinglbase" => '\u{201A}',
        // Latin f-ligatures named in `/Differences` (e.g. 2305's body font). These
        // map to the presentation-form code points, which `decompose_ligatures`
        // then spells back out (`ff`→"ff") — without them the glyph decodes to
        // nothing and the sanitizer fills the gap with a space (`di erences`).
        "ff" => '\u{FB00}',
        "fi" => '\u{FB01}',
        "fl" => '\u{FB02}',
        "ffi" => '\u{FB03}',
        "ffl" => '\u{FB04}',
        "ft" => '\u{FB05}',
        "st" => '\u{FB06}',
        "degree" => '\u{00B0}',
        "trademark" => '\u{2122}',
        "registered" => '\u{00AE}',
        "copyright" => '\u{00A9}',
        "ellipsis" => '\u{2026}',
        "minus" => '\u{2212}',
        "fraction" => '\u{2044}',
        "nbspace" => '\u{00A0}',
        // Greek + math glyph names (standard Adobe Glyph List). Standard TeX math
        // fonts (CMMI/CMSY/…) name their glyphs this way in the embedded font
        // program's `/Encoding`; without these a `λ`/`≤` decodes to nothing and is
        // dropped from body text (`and λ set to 0.5` → `and set to 0.5`).
        "alpha" => '\u{03B1}',
        "beta" => '\u{03B2}',
        "gamma" => '\u{03B3}',
        "delta" => '\u{03B4}',
        "epsilon" | "epsilon1" => '\u{03B5}',
        "zeta" => '\u{03B6}',
        "eta" => '\u{03B7}',
        "theta" | "theta1" => '\u{03B8}',
        "iota" => '\u{03B9}',
        "kappa" => '\u{03BA}',
        "lambda" => '\u{03BB}',
        "mu" => '\u{03BC}',
        "nu" => '\u{03BD}',
        "xi" => '\u{03BE}',
        "omicron" => '\u{03BF}',
        "pi" | "pi1" => '\u{03C0}',
        "rho" | "rho1" => '\u{03C1}',
        "sigma" => '\u{03C3}',
        "sigma1" => '\u{03C2}',
        "tau" => '\u{03C4}',
        "upsilon" => '\u{03C5}',
        "phi" | "phi1" => '\u{03C6}',
        "chi" => '\u{03C7}',
        "psi" => '\u{03C8}',
        "omega" | "omega1" => '\u{03C9}',
        "Gamma" => '\u{0393}',
        "Delta" => '\u{0394}',
        "Theta" => '\u{0398}',
        "Lambda" => '\u{039B}',
        "Xi" => '\u{039E}',
        "Pi" => '\u{03A0}',
        "Sigma" => '\u{03A3}',
        "Upsilon" => '\u{03A5}',
        "Phi" => '\u{03A6}',
        "Psi" => '\u{03A8}',
        "Omega" => '\u{03A9}',
        "lessequal" => '\u{2264}',
        "greaterequal" => '\u{2265}',
        "notequal" => '\u{2260}',
        "approxequal" => '\u{2248}',
        "equivalence" => '\u{2261}',
        "element" => '\u{2208}',
        "plusminus" => '\u{00B1}',
        "multiply" => '\u{00D7}',
        "divide" => '\u{00F7}',
        "infinity" => '\u{221E}',
        "partialdiff" => '\u{2202}',
        "gradient" => '\u{2207}',
        "summation" => '\u{2211}',
        "product" => '\u{220F}',
        "integral" => '\u{222B}',
        "radical" => '\u{221A}',
        "proportional" => '\u{221D}',
        "arrowright" => '\u{2192}',
        "arrowleft" => '\u{2190}',
        "arrowup" => '\u{2191}',
        "arrowdown" => '\u{2193}',
        "arrowboth" => '\u{2194}',
        "arrowdblright" => '\u{21D2}',
        "logicaland" => '\u{2227}',
        "logicalor" => '\u{2228}',
        "intersection" => '\u{2229}',
        "union" => '\u{222A}',
        "similar" => '\u{223C}',
        "congruent" => '\u{2245}',
        "dotmath" => '\u{22C5}',
        "asteriskmath" => '\u{2217}',
        _ => {
            // Strip an AGL `.suffix` (oldstyle/small-cap variant) and retry.
            // A name with nothing before the dot (`.notdef`) stays unresolved.
            let (base, _) = s.split_once('.')?;
            if base.is_empty() {
                return None;
            }
            return glyph_name_to_char(base.as_bytes());
        }
    };
    Some(resolved)
}
1404
/// Minimal WinAnsiEncoding (Latin-1-ish) for simple fonts lacking ToUnicode.
///
/// Covers printable ASCII (0x20–0x7E), a handful of Windows-1252 punctuation
/// codes, 0xA0, and 0xA1–0xFF mapped to the code point of the same value.
/// Every other byte is left unmapped.
fn winansi_table() -> HashMap<u8, char> {
    // Windows-1252 printable points that differ from Latin-1, plus nbspace.
    const CP1252_POINTS: [(u8, char); 9] = [
        (0x91, '\u{2018}'),
        (0x92, '\u{2019}'),
        (0x93, '\u{201C}'),
        (0x94, '\u{201D}'),
        (0x95, '\u{2022}'),
        (0x96, '\u{2013}'),
        (0x97, '\u{2014}'),
        (0x85, '\u{2026}'),
        (0xA0, '\u{00A0}'),
    ];

    let ascii = (0x20u8..=0x7e).map(|byte| (byte, char::from(byte)));
    let latin1_high = (0xA1u8..=0xFF).map(|byte| (byte, char::from(byte)));

    // The three ranges are disjoint, so insertion order does not matter.
    ascii
        .chain(CP1252_POINTS.iter().copied())
        .chain(latin1_high)
        .collect()
}
1431
/// Minimal MacRomanEncoding: printable ASCII (0x20–0x7E) plus the high-range
/// points our corpus hits (notably 0xA5 = bullet, used as a list marker).
/// Every other byte is left unmapped.
fn macroman_table() -> HashMap<u8, char> {
    const HIGH_POINTS: [(u8, char); 11] = [
        (0xA5, '\u{2022}'), // bullet
        (0xD0, '\u{2013}'), // endash
        (0xD1, '\u{2014}'), // emdash
        (0xD2, '\u{201C}'), // quotedblleft
        (0xD3, '\u{201D}'), // quotedblright
        (0xD4, '\u{2018}'), // quoteleft
        (0xD5, '\u{2019}'), // quoteright
        (0xCA, '\u{00A0}'), // no-break space
        (0xC9, '\u{2026}'), // ellipsis
        (0xDE, '\u{FB01}'), // fi ligature
        (0xDF, '\u{FB02}'), // fl ligature
    ];

    (0x20u8..=0x7e)
        .map(|byte| (byte, char::from(byte)))
        .chain(HIGH_POINTS.iter().copied())
        .collect()
}
fleischwolf_pdf/textparse.rs

fleischwolf_pdf/
textparse.rs