fleischwolf_pdf/
textparse.rs

1//! Pure-Rust PDF text extraction (replacing pdfium's glyph layer).
2//!
3//! pdfium reports *rendered* glyph boxes, which diverge from docling's
4//! `docling-parse` C++ parser at exactly the points that drive conformance:
5//! generated spaces get a zero-width box, combining diacritics get a real-width
6//! box, and ligature/fraction glyphs land at different x. This module instead
7//! reconstructs each glyph's box from the **font's own advance widths** and the
8//! PDF text/graphics matrices — the same information docling-parse uses — so a
9//! space is as wide as the font says and a combining mark has zero advance.
10//!
11//! The output is the same [`Glyph`] stream pdfium produces (native PDF
12//! coordinates, y-up), fed straight into the existing docling-parse line
13//! sanitizer ([`crate::dp_lines`]). Only the digital text layer is handled here;
14//! pages without one still fall back to OCR upstream.
15
16use std::collections::HashMap;
17
18use lopdf::{Dictionary, Document, Object};
19
20use crate::pdfium_backend::Glyph;
21
/// A 2×3 affine transform holding the six PDF matrix operands `[a b c d e f]`.
///
/// A point is mapped as `(x, y)` → `(a·x + c·y + e, b·x + d·y + f)`.
#[derive(Clone, Copy)]
struct Mat {
    a: f64,
    b: f64,
    c: f64,
    d: f64,
    e: f64,
    f: f64,
}

impl Mat {
    /// The identity transform (maps every point to itself).
    const ID: Mat = Mat {
        a: 1.0,
        b: 0.0,
        c: 0.0,
        d: 1.0,
        e: 0.0,
        f: 0.0,
    };

    /// Compose two transforms: the result sends a point through `self`
    /// first and through `m` second.
    fn then(self, m: Mat) -> Mat {
        let Mat { a, b, c, d, e, f } = self;
        Mat {
            a: a * m.a + b * m.c,
            b: a * m.b + b * m.d,
            c: c * m.a + d * m.c,
            d: c * m.b + d * m.d,
            e: e * m.a + f * m.c + m.e,
            f: e * m.b + f * m.d + m.f,
        }
    }

    /// Map the point `(x, y)` through this transform.
    fn apply(self, x: f64, y: f64) -> (f64, f64) {
        let mapped_x = self.a * x + self.c * y + self.e;
        let mapped_y = self.b * x + self.d * y + self.f;
        (mapped_x, mapped_y)
    }
}
62
63/// A parsed font: how to turn raw string bytes into (unicode, advance) pairs.
64struct Font {
65    /// 2-byte codes (Type0 / Identity-H) vs 1-byte (simple fonts).
66    two_byte: bool,
67    /// code → Unicode string (from ToUnicode; may be multi-char, e.g. ligatures).
68    to_unicode: HashMap<u32, String>,
69    /// code → glyph advance, in 1000-unit glyph space.
70    widths: HashMap<u32, f64>,
71    default_width: f64,
72    /// 1-byte fallback decoding when ToUnicode lacks a code (WinAnsi-ish).
73    simple_encoding: Option<HashMap<u8, char>>,
74    /// code → raw `/Differences` glyph name, for GID-style names (`g115`) that
75    /// have no Unicode mapping. docling-parse emits these verbatim as `/g115`
76    /// (see the redp5110 bulleted list); matching it keeps no text skipped.
77    fallback_names: HashMap<u8, String>,
78    /// code → char from the embedded Type1 font program's own `/Encoding` vector,
79    /// used only as a last resort for glyphs the base encoding leaves unmapped
80    /// (standard TeX math fonts: `λ`, `≤`, …).
81    program_encoding: HashMap<u8, char>,
82    ascent: f64,
83    descent: f64,
84    hash: u64,
85}
86
87impl Font {
88    fn decode_code(&self, code: u32) -> (Option<String>, f64) {
89        let w = self
90            .widths
91            .get(&code)
92            .copied()
93            .unwrap_or(self.default_width);
94        if let Some(s) = self.to_unicode.get(&code) {
95            return (Some(decompose_ligatures(s)), w);
96        }
97        if !self.two_byte {
98            // A GID-style `/Differences` name (no Unicode) overrides the base
99            // encoding, matching docling's verbatim `/g115` fallback.
100            if let Some(name) = self.fallback_names.get(&(code as u8)) {
101                return (Some(format!("/{name}")), w);
102            }
103            if let Some(enc) = &self.simple_encoding {
104                if let Some(&ch) = enc.get(&(code as u8)) {
105                    return (Some(decompose_ligatures(&ch.to_string())), w);
106                }
107            }
108            // Last resort: the embedded Type1 font program's own `/Encoding`
109            // vector (`dup N /glyphname put`). Standard TeX math fonts (CMMI, CMSY,
110            // …) ship no PDF `/Encoding` and no ToUnicode, so a glyph like `λ`
111            // (CMMI code 21 → `/lambda`) or `≤` (CMSY code 20 → `/lessequal`) has
112            // no other mapping and would otherwise be silently dropped. docling
113            // recovers these from the same font program. This only fills codes the
114            // base encoding left unmapped, so it never changes an existing decode.
115            if let Some(&ch) = self.program_encoding.get(&(code as u8)) {
116                return (Some(decompose_ligatures(&ch.to_string())), w);
117            }
118        }
119        (None, w)
120    }
121}
122
/// Expand Latin presentation-form ligatures (U+FB00–U+FB06: `ﬁ`→`fi`,
/// `ﬃ`→`ffi`, …) into plain letters, the way docling does, so words such as
/// `configuration`/`difficult` don't keep the ligature glyph. All other chars
/// pass through unchanged. The expanded chars share the ligature's box, so the
/// line sanitizer recomposes them.
fn decompose_ligatures(s: &str) -> String {
    let mut expanded = String::with_capacity(s.len());
    for c in s.chars() {
        match c {
            '\u{FB00}' => expanded.push_str("ff"),
            '\u{FB01}' => expanded.push_str("fi"),
            '\u{FB02}' => expanded.push_str("fl"),
            '\u{FB03}' => expanded.push_str("ffi"),
            '\u{FB04}' => expanded.push_str("ffl"),
            '\u{FB05}' => expanded.push_str("ft"),
            '\u{FB06}' => expanded.push_str("st"),
            other => expanded.push(other),
        }
    }
    expanded
}
146
/// Hash a font resource name with the standard library's default hasher.
fn hash_name(name: &[u8]) -> u64 {
    use std::collections::hash_map::DefaultHasher;
    use std::hash::{Hash, Hasher};

    let mut hasher = DefaultHasher::new();
    Hash::hash(name, &mut hasher);
    hasher.finish()
}
153
154/// Resolve a possibly-indirect object to a dictionary.
155fn as_dict<'a>(doc: &'a Document, obj: &'a Object) -> Option<&'a Dictionary> {
156    match obj {
157        Object::Dictionary(d) => Some(d),
158        Object::Reference(id) => doc.get_object(*id).ok().and_then(|o| o.as_dict().ok()),
159        _ => None,
160    }
161}
162
163fn deref<'a>(doc: &'a Document, obj: &'a Object) -> Option<&'a Object> {
164    match obj {
165        Object::Reference(id) => doc.get_object(*id).ok(),
166        other => Some(other),
167    }
168}
169
170/// Parse one font dictionary into a [`Font`].
171fn parse_font(doc: &Document, name: &[u8], fdict: &Dictionary) -> Font {
172    let subtype: &[u8] = fdict
173        .get(b"Subtype")
174        .ok()
175        .and_then(|o| o.as_name().ok())
176        .unwrap_or(&[]);
177    let two_byte = subtype == b"Type0".as_slice();
178
179    let to_unicode = fdict
180        .get(b"ToUnicode")
181        .ok()
182        .and_then(|o| deref(doc, o))
183        .and_then(|o| o.as_stream().ok())
184        .and_then(|s| s.decompressed_content().ok())
185        .map(|data| parse_tounicode(&data))
186        .unwrap_or_default();
187
188    let (widths, default_width) = if two_byte {
189        cid_widths(doc, fdict)
190    } else {
191        simple_widths(doc, fdict)
192    };
193
194    let simple_encoding = if two_byte {
195        None
196    } else {
197        Some(simple_encoding_table(doc, fdict))
198    };
199    let fallback_names = if two_byte {
200        HashMap::new()
201    } else {
202        differences_gid_names(doc, fdict)
203    };
204    let program_encoding = if two_byte {
205        HashMap::new()
206    } else {
207        type1_program_encoding(doc, fdict)
208    };
209
210    let (ascent, descent) = font_ascent_descent(doc, fdict, two_byte);
211
212    Font {
213        two_byte,
214        to_unicode,
215        widths,
216        default_width,
217        simple_encoding,
218        fallback_names,
219        program_encoding,
220        ascent,
221        descent,
222        hash: hash_name(name),
223    }
224}
225
226/// Collect `/Differences` entries whose glyph name is a GID placeholder
227/// (`g115`, `cid42`, `glyph7`, `index9`) with no Unicode mapping. docling-parse
228/// emits such glyphs as the literal name `/g115`; mapping them here keeps the
229/// text from being silently dropped (subsetted fonts with no ToUnicode). The
230/// GID-name restriction keeps real Adobe glyph names on the normal path so this
231/// never invents garbage on the clean files.
232fn differences_gid_names(doc: &Document, fdict: &Dictionary) -> HashMap<u8, String> {
233    let mut map = HashMap::new();
234    let Some(Object::Dictionary(enc)) = fdict.get(b"Encoding").ok().and_then(|o| deref(doc, o))
235    else {
236        return map;
237    };
238    let Some(Object::Array(diffs)) = enc.get(b"Differences").ok().and_then(|o| deref(doc, o))
239    else {
240        return map;
241    };
242    let mut code = 0u8;
243    for el in diffs {
244        match el {
245            Object::Integer(i) => code = *i as u8,
246            Object::Name(name) => {
247                if glyph_name_to_char(name).is_none() && is_gid_name(name) {
248                    map.insert(code, String::from_utf8_lossy(name).into_owned());
249                }
250                code = code.wrapping_add(1);
251            }
252            _ => {}
253        }
254    }
255    map
256}
257
258/// Parse the embedded Type1 font program's built-in `/Encoding` vector
259/// (`dup <code> /<glyphname> put` entries in the clear-text header before
260/// `eexec`) into `code → char`. This is how docling recovers glyphs from
261/// standard TeX math fonts (CMMI/CMSY/…) that carry no PDF `/Encoding` and no
262/// ToUnicode — e.g. CMMI's `dup 21 /lambda` or CMSY's `dup 20 /lessequal`.
263/// Only `FontFile` (Type1) is parsed; CFF (`FontFile3`) and TrueType
264/// (`FontFile2`) store their encoding in a binary table and are left alone.
265fn type1_program_encoding(doc: &Document, fdict: &Dictionary) -> HashMap<u8, char> {
266    let mut map = HashMap::new();
267    let Some(desc) = fdict
268        .get(b"FontDescriptor")
269        .ok()
270        .and_then(|o| deref(doc, o))
271        .and_then(|o| o.as_dict().ok())
272    else {
273        return map;
274    };
275    let Some(data) = desc
276        .get(b"FontFile")
277        .ok()
278        .and_then(|o| deref(doc, o))
279        .and_then(|o| o.as_stream().ok())
280        .and_then(|s| s.decompressed_content().ok())
281    else {
282        return map;
283    };
284    // The clear-text header (PostScript) ends at `eexec`; the rest is encrypted.
285    let head_end = data
286        .windows(5)
287        .position(|w| w == b"eexec")
288        .unwrap_or(data.len());
289    let head = String::from_utf8_lossy(&data[..head_end]);
290    // Scan for `dup <code> /<name> put` tokens.
291    let toks: Vec<&str> = head.split_whitespace().collect();
292    for w in toks.windows(4) {
293        if w[0] == "dup" && w[3] == "put" {
294            if let (Ok(code), Some(name)) = (w[1].parse::<u32>(), w[2].strip_prefix('/')) {
295                if code <= 255 {
296                    if let Some(ch) = glyph_name_to_char(name.as_bytes()) {
297                        map.insert(code as u8, ch);
298                    }
299                }
300            }
301        }
302    }
303    map
304}
305
/// Whether `name` is a synthetic placeholder rather than a real Adobe glyph
/// name: `g115`, `cid42`, `glyph7`, `index9`, `G12`, or a short-prefix code
/// name like `SM590000` (IBM BookMaster). Such names carry no Unicode meaning
/// and docling-parse emits them verbatim (`/SM590000`).
///
/// Names starting with `afii` or `uni` are real Adobe names and always
/// rejected, as is anything that is not valid UTF-8.
fn is_gid_name(name: &[u8]) -> bool {
    let text = match std::str::from_utf8(name) {
        Ok(text) => text,
        Err(_) => return false,
    };
    if ["afii", "uni"].iter().any(|real| text.starts_with(*real)) {
        return false;
    }

    let all_digits = |tail: &str| tail.bytes().all(|b| b.is_ascii_digit());

    // A known placeholder prefix followed by nothing but (at least one) digit.
    let has_known_prefix = ["g", "G", "cid", "CID", "glyph", "index"]
        .iter()
        .any(|prefix| {
            text.strip_prefix(*prefix)
                .map_or(false, |rest| !rest.is_empty() && all_digits(rest))
        });
    if has_known_prefix {
        return true;
    }

    // Otherwise: a short alpha prefix (1–3 letters) followed by a run of ≥3
    // digits — synthetic code names like `SM590000`, as opposed to real Adobe
    // names (whole words or letter+`.suffix` variants).
    let letters = text.bytes().take_while(u8::is_ascii_alphabetic).count();
    let tail = &text[letters..];
    (1..=3).contains(&letters) && tail.len() >= 3 && all_digits(tail)
}
334
335fn font_ascent_descent(doc: &Document, fdict: &Dictionary, two_byte: bool) -> (f64, f64) {
336    // For Type0, the descriptor lives on the descendant CIDFont.
337    let descr_owner = if two_byte {
338        fdict
339            .get(b"DescendantFonts")
340            .ok()
341            .and_then(|o| deref(doc, o))
342            .and_then(|o| match o {
343                Object::Array(a) => a.first(),
344                _ => None,
345            })
346            .and_then(|o| as_dict(doc, o))
347    } else {
348        Some(fdict)
349    };
350    let fd = descr_owner
351        .and_then(|d| d.get(b"FontDescriptor").ok())
352        .and_then(|o| as_dict(doc, o));
353    let asc = fd
354        .and_then(|d| d.get(b"Ascent").ok())
355        .and_then(|o| {
356            o.as_float()
357                .ok()
358                .or_else(|| o.as_i64().ok().map(|i| i as f32))
359        })
360        .unwrap_or(750.0) as f64;
361    let desc = fd
362        .and_then(|d| d.get(b"Descent").ok())
363        .and_then(|o| {
364            o.as_float()
365                .ok()
366                .or_else(|| o.as_i64().ok().map(|i| i as f32))
367        })
368        .unwrap_or(-250.0) as f64;
369    // Some subsetted fonts carry a degenerate FontDescriptor (`/Ascent 0
370    // /Descent 0`) — the real metrics live in the font program. That collapses
371    // the loose box to zero height, so the line cells get zero area and the
372    // layout's region/text assignment drops them (2305's References list lost
373    // every prose line, keeping only the URLs). Fall back to typical text metrics
374    // so the box has height.
375    if asc - desc <= 1.0 {
376        return (750.0, -250.0);
377    }
378    (asc, desc)
379}
380
381/// Simple-font widths: `/FirstChar` + `/Widths` array, `/MissingWidth` default.
382fn simple_widths(doc: &Document, fdict: &Dictionary) -> (HashMap<u32, f64>, f64) {
383    let mut map = HashMap::new();
384    let first = fdict
385        .get(b"FirstChar")
386        .ok()
387        .and_then(|o| o.as_i64().ok())
388        .unwrap_or(0) as u32;
389    if let Some(Object::Array(arr)) = fdict.get(b"Widths").ok().and_then(|o| deref(doc, o)) {
390        for (i, w) in arr.iter().enumerate() {
391            if let Some(w) = num(w) {
392                map.insert(first + i as u32, w);
393            }
394        }
395    }
396    let dw = fdict
397        .get(b"FontDescriptor")
398        .ok()
399        .and_then(|o| as_dict(doc, o))
400        .and_then(|d| d.get(b"MissingWidth").ok())
401        .and_then(num)
402        .unwrap_or(0.0);
403    (map, dw)
404}
405
406/// CIDFont widths: the `/W` array on the descendant font (`/DW` default = 1000).
407fn cid_widths(doc: &Document, fdict: &Dictionary) -> (HashMap<u32, f64>, f64) {
408    let mut map = HashMap::new();
409    let Some(desc) = fdict
410        .get(b"DescendantFonts")
411        .ok()
412        .and_then(|o| deref(doc, o))
413        .and_then(|o| match o {
414            Object::Array(a) => a.first(),
415            _ => None,
416        })
417        .and_then(|o| as_dict(doc, o))
418    else {
419        return (map, 1000.0);
420    };
421    let dw = desc.get(b"DW").ok().and_then(num).unwrap_or(1000.0);
422    if let Some(Object::Array(w)) = desc.get(b"W").ok().and_then(|o| deref(doc, o)) {
423        let mut i = 0;
424        while i < w.len() {
425            let c = w.get(i).and_then(num);
426            match (c, w.get(i + 1)) {
427                // `c [w1 w2 ...]`: consecutive CIDs starting at c.
428                (Some(c), Some(Object::Array(list))) => {
429                    for (k, wv) in list.iter().enumerate() {
430                        if let Some(wv) = num(wv) {
431                            map.insert(c as u32 + k as u32, wv);
432                        }
433                    }
434                    i += 2;
435                }
436                // `c_first c_last w`: a run all of width w.
437                (Some(c1), Some(o2)) => {
438                    if let (Some(c2), Some(wv)) = (num(o2), w.get(i + 2).and_then(num)) {
439                        for cid in c1 as u32..=c2 as u32 {
440                            map.insert(cid, wv);
441                        }
442                    }
443                    i += 3;
444                }
445                _ => break,
446            }
447        }
448    }
449    (map, dw)
450}
451
452fn num(o: &Object) -> Option<f64> {
453    match o {
454        Object::Integer(i) => Some(*i as f64),
455        Object::Real(r) => Some(*r as f64),
456        _ => None,
457    }
458}
459
/// Parse a ToUnicode CMap's `bfchar` / `bfrange` sections into code→string.
///
/// Supported forms:
/// - `beginbfchar … <src> <dst> … endbfchar` — one mapping per pair; `dst` is
///   decoded as (lossy) UTF-16BE;
/// - `beginbfrange … <lo> <hi> [<d0> <d1> …] … endbfrange` — one destination
///   per code, starting at `lo`;
/// - `beginbfrange … <lo> <hi> <dst> … endbfrange` — consecutive destinations
///   starting from `dst`.
///
/// For the last form a single-unit `dst` counts up as a Unicode scalar. A
/// multi-unit `dst` (a surrogate pair for a non-BMP character, or a
/// multi-character string such as a ligature expansion) is decoded as UTF-16
/// with its final unit advanced once per code; earlier this form was folded
/// into one integer, rejected as an invalid scalar, and the whole range was
/// dropped. Codes whose destination is not valid text are skipped.
///
/// All arithmetic is overflow-safe and a range stops as soon as no later code
/// can produce a mapping, so a hostile `<0000> <FFFFFFFF> …` range cannot
/// stall the parser.
fn parse_tounicode(data: &[u8]) -> HashMap<u32, String> {
    let text = String::from_utf8_lossy(data);
    let mut map = HashMap::new();
    // `<hex digits>` → big-endian 16-bit units (a trailing odd byte becomes
    // its own unit); `None` for anything that is not a `<…>` group.
    let hex = |s: &str| -> Option<Vec<u16>> {
        let s = s.trim();
        if !s.starts_with('<') || !s.ends_with('>') {
            return None;
        }
        let h = &s[1..s.len() - 1];
        let bytes: Vec<u8> = (0..h.len())
            .step_by(2)
            .filter_map(|i| u8::from_str_radix(h.get(i..i + 2)?, 16).ok())
            .collect();
        Some(
            bytes
                .chunks(2)
                .map(|c| {
                    if c.len() == 2 {
                        u16::from_be_bytes([c[0], c[1]])
                    } else {
                        c[0] as u16
                    }
                })
                .collect(),
        )
    };
    let u16s_to_string = |u: &[u16]| String::from_utf16_lossy(u);
    // Fold the units into one integer (source codes are 1–2 units in practice).
    let code_of = |u: &[u16]| u.iter().fold(0u32, |acc, &x| (acc << 16) | x as u32);

    // Tokenize by structure, not whitespace: CMap hex groups are often written
    // back-to-back with no separators (`<21><21><0054>`), so scan for `<…>`
    // groups, `[`/`]` brackets, and bareword keywords.
    let tokens: Vec<String> = {
        let bytes = text.as_bytes();
        let mut toks = Vec::new();
        let mut i = 0;
        while i < bytes.len() {
            let c = bytes[i];
            if c.is_ascii_whitespace() {
                i += 1;
            } else if c == b'<' {
                let start = i;
                while i < bytes.len() && bytes[i] != b'>' {
                    i += 1;
                }
                i += 1; // include '>'
                toks.push(String::from_utf8_lossy(&bytes[start..i.min(bytes.len())]).into_owned());
            } else if c == b'[' || c == b']' {
                toks.push((c as char).to_string());
                i += 1;
            } else {
                let start = i;
                while i < bytes.len()
                    && !bytes[i].is_ascii_whitespace()
                    && bytes[i] != b'<'
                    && bytes[i] != b'['
                    && bytes[i] != b']'
                {
                    i += 1;
                }
                toks.push(String::from_utf8_lossy(&bytes[start..i]).into_owned());
            }
        }
        toks
    };
    let tokens: Vec<&str> = tokens.iter().map(|s| s.as_str()).collect();
    let mut i = 0;
    while i < tokens.len() {
        match tokens[i] {
            "beginbfchar" => {
                i += 1;
                while i + 1 < tokens.len() && tokens[i] != "endbfchar" {
                    if let (Some(src), Some(dst)) = (hex(tokens[i]), hex(tokens[i + 1])) {
                        map.insert(code_of(&src), u16s_to_string(&dst));
                    }
                    i += 2;
                }
            }
            "beginbfrange" => {
                i += 1;
                while i + 2 < tokens.len() && tokens[i] != "endbfrange" {
                    let (Some(lo), Some(hi)) = (hex(tokens[i]), hex(tokens[i + 1])) else {
                        i += 1;
                        continue;
                    };
                    let lo = code_of(&lo);
                    let hi = code_of(&hi);
                    if tokens[i + 2] == "[" {
                        // `<lo> <hi> [ <d0> <d1> ... ]`: one dst per code in the range.
                        let mut j = i + 3;
                        let mut code = lo;
                        while j < tokens.len() && tokens[j] != "]" {
                            if let Some(dst) = hex(tokens[j]) {
                                map.insert(code, u16s_to_string(&dst));
                            }
                            // Wrap instead of overflowing on a bogus `lo`.
                            code = code.wrapping_add(1);
                            j += 1;
                        }
                        i = j + 1;
                    } else if let Some(dst) = hex(tokens[i + 2]) {
                        // `<lo> <hi> <dst>`: consecutive Unicode from a base.
                        let base = code_of(&dst);
                        let mut units = dst.clone();
                        // Only a multi-unit destination has a UTF-16 reading
                        // distinct from the scalar one.
                        let tail = if dst.len() > 1 {
                            dst.last().copied()
                        } else {
                            None
                        };
                        for code in lo..=hi {
                            let k = code - lo;
                            // Scalar reading first, so every mapping produced
                            // before this fallback existed is unchanged.
                            let scalar = base.checked_add(k);
                            if let Some(ch) = scalar.and_then(char::from_u32) {
                                map.insert(code, ch.to_string());
                                continue;
                            }
                            // UTF-16 reading: advance the final unit by k.
                            let advanced = tail.and_then(|t| {
                                if k <= u32::from(u16::MAX) {
                                    t.checked_add(k as u16)
                                } else {
                                    None
                                }
                            });
                            if let Some(unit) = advanced {
                                if let Some(slot) = units.last_mut() {
                                    *slot = unit;
                                }
                                if let Ok(s) = String::from_utf16(&units) {
                                    map.insert(code, s);
                                }
                                continue;
                            }
                            // Both readings only grow with k: once the scalar
                            // is past the Unicode range and the UTF-16 tail is
                            // exhausted, no later code can map to anything.
                            if scalar.map_or(true, |v| v > char::MAX as u32) {
                                break;
                            }
                        }
                        i += 3;
                    } else {
                        i += 1;
                    }
                }
            }
            _ => i += 1,
        }
    }
    map
}
579
580/// Decode a PDF string literal in a Tj/TJ operand into raw code units.
581fn codes(font: &Font, bytes: &[u8]) -> Vec<u32> {
582    if font.two_byte {
583        bytes
584            .chunks(2)
585            .map(|c| {
586                if c.len() == 2 {
587                    ((c[0] as u32) << 8) | c[1] as u32
588                } else {
589                    c[0] as u32
590                }
591            })
592            .collect()
593    } else {
594        bytes.iter().map(|&b| b as u32).collect()
595    }
596}
597
598/// Page size (width, height) in PDF points from the MediaBox.
599fn page_size(doc: &Document, page_id: lopdf::ObjectId) -> (f32, f32) {
600    let mb = doc
601        .get_object(page_id)
602        .ok()
603        .and_then(|o| o.as_dict().ok())
604        .and_then(|d| {
605            // MediaBox may be inherited; lopdf resolves via get_page... fall back to a guess.
606            d.get(b"MediaBox").ok().cloned()
607        })
608        .or_else(|| {
609            doc.get_dictionary(page_id)
610                .ok()
611                .and_then(|d| d.get(b"MediaBox").ok().cloned())
612        });
613    if let Some(Object::Array(a)) = mb {
614        let v: Vec<f32> = a.iter().filter_map(|o| num(o).map(|x| x as f32)).collect();
615        if v.len() == 4 {
616            return ((v[2] - v[0]).abs(), (v[3] - v[1]).abs());
617        }
618    }
619    (612.0, 792.0)
620}
621
622/// Debug: raw glyph stream `(ch, ll, lr, lb, lt)` (native coords) for page
623/// `index`, before the sanitizer. For comparing char cells to docling-parse.
624pub fn debug_glyphs(bytes: &[u8], index: usize) -> Vec<(char, f32, f32, f32, f32)> {
625    let Ok(doc) = Document::load_mem(bytes) else {
626        return Vec::new();
627    };
628    let mut pages: Vec<_> = doc.get_pages().into_iter().collect();
629    pages.sort_by_key(|(n, _)| *n);
630    let Some((_, pid)) = pages.get(index) else {
631        return Vec::new();
632    };
633    page_glyphs(&doc, *pid)
634        .into_iter()
635        .map(|g| (g.ch, g.ll, g.lr, g.lb, g.lt))
636        .collect()
637}
638
639/// Public entry: per-page (width, height, line cells) for a PDF, via the Rust
640/// text parser + the docling-parse line sanitizer. Used by the pipeline and the
641/// `textparse_dump` example.
642pub fn pdf_textlines(bytes: &[u8]) -> Vec<(f32, f32, Vec<crate::pdfium_backend::TextCell>)> {
643    let Ok(doc) = Document::load_mem(bytes) else {
644        return Vec::new();
645    };
646    let mut pages: Vec<_> = doc.get_pages().into_iter().collect();
647    pages.sort_by_key(|(n, _)| *n);
648    pages
649        .into_iter()
650        .map(|(_, pid)| {
651            let (w, h) = page_size(&doc, pid);
652            let glyphs = page_glyphs(&doc, pid);
653            let cells = crate::dp_lines::line_cells(&glyphs, h, true);
654            (w, h, cells)
655        })
656        .collect()
657}
658
/// The scalar text parameters handed to a content stream when it starts
/// running. A Form XObject invoked via `Do` inherits them because the PDF
/// graphics state includes the text parameters; the text matrices are not
/// part of it — a form re-establishes those inside its own `BT`/`ET`.
#[derive(Clone, Copy)]
struct TextState {
    /// Character spacing (`Tc`).
    tc: f64,
    /// Word spacing (`Tw`).
    tw: f64,
    /// Horizontal scale as a factor (`Tz` / 100).
    th: f64,
    /// Leading (`TL`).
    tl: f64,
    /// Text rise (`Ts`).
    trise: f64,
    /// Font size (second `Tf` operand).
    fsize: f64,
}

impl TextState {
    /// Starting values for a page's content stream: everything zero except
    /// the horizontal scale, which is 1 (i.e. 100%).
    const INIT: TextState = TextState {
        th: 1.0,
        tc: 0.0,
        tw: 0.0,
        tl: 0.0,
        trise: 0.0,
        fsize: 0.0,
    };
}
682
683/// The effective `/Resources` dictionary for a page (inline or via reference,
684/// falling back to an inherited one from a `/Parent`).
685fn page_res(doc: &Document, page_id: lopdf::ObjectId) -> Option<&Dictionary> {
686    let (inline, ids) = doc.get_page_resources(page_id).ok()?;
687    if let Some(d) = inline {
688        return Some(d);
689    }
690    ids.into_iter().find_map(|id| doc.get_dictionary(id).ok())
691}
692
693/// Build the code→[`Font`] map for a resources dictionary's `/Font` sub-dict.
694fn fonts_from_res(doc: &Document, res: &Dictionary) -> HashMap<Vec<u8>, Font> {
695    let mut map = HashMap::new();
696    let font_dict = res
697        .get(b"Font")
698        .ok()
699        .and_then(|o| deref(doc, o))
700        .and_then(|o| o.as_dict().ok());
701    if let Some(fd) = font_dict {
702        for (name, value) in fd.iter() {
703            if let Some(fdict) = deref(doc, value).and_then(|o| o.as_dict().ok()) {
704                map.insert(name.clone(), parse_font(doc, name, fdict));
705            }
706        }
707    }
708    map
709}
710
711/// Extract every glyph on a page as a native-coordinate [`Glyph`].
712pub(crate) fn page_glyphs(doc: &Document, page_id: lopdf::ObjectId) -> Vec<Glyph> {
713    let mut out = Vec::new();
714    let Ok(content_bytes) = doc.get_page_content(page_id) else {
715        return out;
716    };
717    let Ok(content) = lopdf::content::Content::decode(&content_bytes) else {
718        return out;
719    };
720    if let Some(res) = page_res(doc, page_id) {
721        run_content(doc, res, &content, Mat::ID, TextState::INIT, 0, &mut out);
722    }
723    out
724}
725
726/// Run a content stream's operators, emitting glyphs into `out`. Recurses into
727/// Form XObjects on `Do` (bulk body text in heavy PDFs lives inside a form, not
728/// the page content stream). `res` is the resources dict in scope (the page's,
729/// or the form's own); `base_ctm` is the CTM at the point of invocation.
730fn run_content(
731    doc: &Document,
732    res: &Dictionary,
733    content: &lopdf::content::Content,
734    base_ctm: Mat,
735    init: TextState,
736    depth: u32,
737    out: &mut Vec<Glyph>,
738) {
739    let fonts = fonts_from_res(doc, res);
740    let xobjects = res
741        .get(b"XObject")
742        .ok()
743        .and_then(|o| deref(doc, o))
744        .and_then(|o| o.as_dict().ok());
745
746    // Graphics + text state. `q`/`Q` save and restore the whole graphics state,
747    // which includes the text parameters (Tc, Tw, Tz, TL, Tfs, Trise, font) —
748    // *not* the text matrix (that is reset by BT). Saving only the CTM let a Tc
749    // set inside a `q…Q` block leak out and drift every later glyph.
750    #[allow(clippy::type_complexity)]
751    let mut gstate_stack: Vec<(Mat, f64, f64, f64, f64, f64, f64, Option<&Font>)> = Vec::new();
752    let mut ctm = base_ctm;
753    let mut tm = Mat::ID;
754    let mut tlm = Mat::ID;
755    let mut font: Option<&Font> = None;
756    let mut fsize = init.fsize;
757    let mut tc = init.tc; // char spacing
758    let mut tw = init.tw; // word spacing
759    let mut th = init.th; // horizontal scale (Tz/100)
760    let mut tl = init.tl; // leading
761    let mut trise = init.trise;
762
763    let op_f = |operands: &[Object], i: usize| operands.get(i).and_then(num).unwrap_or(0.0);
764
765    for op in &content.operations {
766        let operands = &op.operands;
767        match op.operator.as_str() {
768            "q" => gstate_stack.push((ctm, tc, tw, th, tl, trise, fsize, font)),
769            "Q" => {
770                if let Some((c, a, b, h, l, r, fs, f)) = gstate_stack.pop() {
771                    ctm = c;
772                    tc = a;
773                    tw = b;
774                    th = h;
775                    tl = l;
776                    trise = r;
777                    fsize = fs;
778                    font = f;
779                }
780            }
781            "cm" => {
782                let m = Mat {
783                    a: op_f(operands, 0),
784                    b: op_f(operands, 1),
785                    c: op_f(operands, 2),
786                    d: op_f(operands, 3),
787                    e: op_f(operands, 4),
788                    f: op_f(operands, 5),
789                };
790                ctm = m.then(ctm);
791            }
792            "BT" => {
793                tm = Mat::ID;
794                tlm = Mat::ID;
795            }
796            "ET" => {}
797            "Tf" => {
798                if let Some(Object::Name(n)) = operands.first() {
799                    font = fonts.get(n.as_slice());
800                }
801                fsize = op_f(operands, 1);
802            }
803            "Td" => {
804                tlm = Mat {
805                    a: 1.0,
806                    b: 0.0,
807                    c: 0.0,
808                    d: 1.0,
809                    e: op_f(operands, 0),
810                    f: op_f(operands, 1),
811                }
812                .then(tlm);
813                tm = tlm;
814            }
815            "TD" => {
816                tl = -op_f(operands, 1);
817                tlm = Mat {
818                    a: 1.0,
819                    b: 0.0,
820                    c: 0.0,
821                    d: 1.0,
822                    e: op_f(operands, 0),
823                    f: op_f(operands, 1),
824                }
825                .then(tlm);
826                tm = tlm;
827            }
828            "Tm" => {
829                tlm = Mat {
830                    a: op_f(operands, 0),
831                    b: op_f(operands, 1),
832                    c: op_f(operands, 2),
833                    d: op_f(operands, 3),
834                    e: op_f(operands, 4),
835                    f: op_f(operands, 5),
836                };
837                tm = tlm;
838            }
839            "T*" => {
840                tlm = Mat {
841                    a: 1.0,
842                    b: 0.0,
843                    c: 0.0,
844                    d: 1.0,
845                    e: 0.0,
846                    f: -tl,
847                }
848                .then(tlm);
849                tm = tlm;
850            }
851            "Tc" => tc = op_f(operands, 0),
852            "Tw" => tw = op_f(operands, 0),
853            "Tz" => th = op_f(operands, 0) / 100.0,
854            "TL" => tl = op_f(operands, 0),
855            "Ts" => trise = op_f(operands, 0),
856            "Tj" | "'" | "\"" => {
857                if op.operator == "'" || op.operator == "\"" {
858                    // move to next line first
859                    tlm = Mat {
860                        a: 1.0,
861                        b: 0.0,
862                        c: 0.0,
863                        d: 1.0,
864                        e: 0.0,
865                        f: -tl,
866                    }
867                    .then(tlm);
868                    tm = tlm;
869                }
870                if let (Some(f), Some(Object::String(s, _))) = (font, operands.last()) {
871                    show_text(f, s, fsize, tc, tw, th, trise, &mut tm, ctm, out);
872                }
873            }
874            "TJ" => {
875                if let (Some(f), Some(Object::Array(arr))) = (font, operands.first()) {
876                    for el in arr {
877                        match el {
878                            Object::String(s, _) => {
879                                show_text(f, s, fsize, tc, tw, th, trise, &mut tm, ctm, out)
880                            }
881                            other => {
882                                if let Some(adj) = num(other) {
883                                    // negative number moves text right (PDF: subtract)
884                                    let tx = -adj / 1000.0 * fsize * th;
885                                    tm = Mat {
886                                        a: 1.0,
887                                        b: 0.0,
888                                        c: 0.0,
889                                        d: 1.0,
890                                        e: tx,
891                                        f: 0.0,
892                                    }
893                                    .then(tm);
894                                }
895                            }
896                        }
897                    }
898                }
899            }
900            "Do" => {
901                // Invoke a Form XObject: bulk body text in many PDFs lives inside
902                // a form, reached only here. Image XObjects are skipped (no text).
903                if depth >= 8 {
904                    continue;
905                }
906                let Some(Object::Name(n)) = operands.first() else {
907                    continue;
908                };
909                let stream = xobjects
910                    .and_then(|d| d.get(n.as_slice()).ok())
911                    .and_then(|o| deref(doc, o))
912                    .and_then(|o| o.as_stream().ok());
913                let Some(stream) = stream else { continue };
914                let is_form = stream
915                    .dict
916                    .get(b"Subtype")
917                    .ok()
918                    .and_then(|o| o.as_name().ok())
919                    == Some(b"Form".as_slice());
920                if !is_form {
921                    continue;
922                }
923                let Ok(data) = stream.decompressed_content() else {
924                    continue;
925                };
926                let Ok(form_content) = lopdf::content::Content::decode(&data) else {
927                    continue;
928                };
929                // The form's /Matrix maps form space into the CTM at invocation.
930                let form_mat = match stream.dict.get(b"Matrix").ok() {
931                    Some(Object::Array(a)) if a.len() == 6 => {
932                        let v: Vec<f64> = a.iter().filter_map(num).collect();
933                        if v.len() == 6 {
934                            Mat {
935                                a: v[0],
936                                b: v[1],
937                                c: v[2],
938                                d: v[3],
939                                e: v[4],
940                                f: v[5],
941                            }
942                        } else {
943                            Mat::ID
944                        }
945                    }
946                    _ => Mat::ID,
947                };
948                // The form's own /Resources, falling back to the inherited ones.
949                let form_res = stream
950                    .dict
951                    .get(b"Resources")
952                    .ok()
953                    .and_then(|o| deref(doc, o))
954                    .and_then(|o| o.as_dict().ok())
955                    .unwrap_or(res);
956                let state = TextState {
957                    tc,
958                    tw,
959                    th,
960                    tl,
961                    trise,
962                    fsize,
963                };
964                run_content(
965                    doc,
966                    form_res,
967                    &form_content,
968                    form_mat.then(ctm),
969                    state,
970                    depth + 1,
971                    out,
972                );
973            }
974            _ => {}
975        }
976    }
977}
978
979#[allow(clippy::too_many_arguments)]
980fn show_text(
981    font: &Font,
982    bytes: &[u8],
983    fsize: f64,
984    tc: f64,
985    tw: f64,
986    th: f64,
987    trise: f64,
988    tm: &mut Mat,
989    ctm: Mat,
990    out: &mut Vec<Glyph>,
991) {
992    for code in codes(font, bytes) {
993        let (text, w) = font.decode_code(code);
994        let w0 = w / 1000.0; // advance in text-space (em) units
995                             // The glyph→user transform: scale glyph space by font size, then Tm, CTM.
996        let scale = Mat {
997            a: fsize * th,
998            b: 0.0,
999            c: 0.0,
1000            d: fsize,
1001            e: 0.0,
1002            f: trise,
1003        };
1004        let trm = scale.then(*tm).then(ctm);
1005        // Box in glyph space (1000-unit em): x 0..w, y descent..ascent.
1006        let (x0, y0) = trm.apply(0.0, font.descent / 1000.0);
1007        let (x1, _y1) = trm.apply(w0, font.descent / 1000.0);
1008        let (_x2, y2) = trm.apply(0.0, font.ascent / 1000.0);
1009        let (left, right) = (x0.min(x1), x0.max(x1));
1010        let (bot, top) = (y0.min(y2), y0.max(y2));
1011        if let Some(s) = text {
1012            // A run may map one code to multiple chars (ligature/fraction); share box.
1013            for ch in s.chars() {
1014                if ch != '\u{0}' {
1015                    out.push(Glyph {
1016                        ch,
1017                        l: left as f32,
1018                        b: bot as f32,
1019                        r: right as f32,
1020                        t: top as f32,
1021                        ll: left as f32,
1022                        lb: bot as f32,
1023                        lr: right as f32,
1024                        lt: top as f32,
1025                        font: font.hash,
1026                    });
1027                }
1028            }
1029        }
1030        // Advance the text matrix. Word spacing applies to single-byte code 32.
1031        let is_space = !font.two_byte && code == 32;
1032        let tx = (w0 * fsize + tc + if is_space { tw } else { 0.0 }) * th;
1033        *tm = Mat {
1034            a: 1.0,
1035            b: 0.0,
1036            c: 0.0,
1037            d: 1.0,
1038            e: tx,
1039            f: 0.0,
1040        }
1041        .then(*tm);
1042    }
1043}
1044
1045/// Build a simple font's code→char table from its `/Encoding`: the base
1046/// encoding (WinAnsi / MacRoman) plus any `/Differences` overrides (glyph names
1047/// resolved through a small Adobe-glyph-name subset).
1048fn simple_encoding_table(doc: &Document, fdict: &Dictionary) -> HashMap<u8, char> {
1049    let enc = fdict.get(b"Encoding").ok().and_then(|o| deref(doc, o));
1050    let base_name = match enc {
1051        Some(Object::Name(n)) => n.clone(),
1052        Some(Object::Dictionary(d)) => d
1053            .get(b"BaseEncoding")
1054            .ok()
1055            .and_then(|o| o.as_name().ok())
1056            .map(|n| n.to_vec())
1057            .unwrap_or_default(),
1058        _ => Vec::new(),
1059    };
1060    let mut m = if base_name == b"MacRomanEncoding" {
1061        macroman_table()
1062    } else {
1063        winansi_table()
1064    };
1065    // Apply /Differences: `code /glyphname /glyphname ... code ...`.
1066    if let Some(Object::Dictionary(d)) = enc {
1067        if let Some(Object::Array(diffs)) = d.get(b"Differences").ok().and_then(|o| deref(doc, o)) {
1068            let mut code = 0u8;
1069            for el in diffs {
1070                match el {
1071                    Object::Integer(i) => code = *i as u8,
1072                    Object::Name(name) => {
1073                        if let Some(ch) = glyph_name_to_char(name) {
1074                            m.insert(code, ch);
1075                        }
1076                        code = code.wrapping_add(1);
1077                    }
1078                    _ => {}
1079                }
1080            }
1081        }
1082    }
1083    m
1084}
1085
/// Resolve an Adobe glyph name to Unicode: `uniXXXX`, single ASCII letters, the
/// digit/punctuation names from the Adobe Glyph List, and common typographic
/// names. A `.suffix` (`one.taboldstyle`, `a.sc`) is stripped and the base name
/// retried — docling renders these as the base character.
///
/// Returns `None` when the name is not valid UTF-8, is not in the table, or is
/// a `uniXXXX` form whose value is not a Unicode scalar (e.g. a surrogate).
///
/// The `uni` prefix is only treated as a code point when it is followed by four
/// hex digits; other names that merely start with `uni` (`union`) go through
/// the regular name table.
fn glyph_name_to_char(name: &[u8]) -> Option<char> {
    let s = std::str::from_utf8(name).ok()?;
    // `uniXXXX`: decode the first four hex digits after the prefix. Anything
    // shorter or non-hex must fall through rather than abort the lookup.
    if let Some(digits) = s.strip_prefix("uni").and_then(|rest| rest.get(0..4)) {
        // Check the digits explicitly: `from_str_radix` would accept a sign.
        if digits.bytes().all(|b| b.is_ascii_hexdigit()) {
            if let Ok(cp) = u32::from_str_radix(digits, 16) {
                return char::from_u32(cp);
            }
        }
    }
    // Single ASCII letter names (`A`, `m`) map to themselves.
    if s.len() == 1 {
        let b = s.as_bytes()[0];
        if b.is_ascii_alphabetic() {
            return Some(b as char);
        }
    }
    let resolved = match s {
        "space" => ' ',
        "exclam" => '!',
        "quotedbl" => '"',
        "numbersign" => '#',
        "dollar" => '$',
        "percent" => '%',
        "ampersand" => '&',
        "quotesingle" => '\'',
        "parenleft" => '(',
        "parenright" => ')',
        "asterisk" => '*',
        "plus" => '+',
        "comma" => ',',
        "hyphen" => '-',
        "period" => '.',
        "slash" => '/',
        "zero" => '0',
        "one" => '1',
        "two" => '2',
        "three" => '3',
        "four" => '4',
        "five" => '5',
        "six" => '6',
        "seven" => '7',
        "eight" => '8',
        "nine" => '9',
        "colon" => ':',
        "semicolon" => ';',
        "less" => '<',
        "equal" => '=',
        "greater" => '>',
        "question" => '?',
        "at" => '@',
        "bracketleft" => '[',
        "backslash" => '\\',
        "bracketright" => ']',
        "asciicircum" => '^',
        "underscore" => '_',
        "grave" => '`',
        "braceleft" => '{',
        "bar" => '|',
        "braceright" => '}',
        "asciitilde" => '~',
        "bullet" => '\u{2022}',
        "periodcentered" => '\u{00B7}',
        "endash" => '\u{2013}',
        "emdash" => '\u{2014}',
        "quoteright" => '\u{2019}',
        "quoteleft" => '\u{2018}',
        "quotedblleft" => '\u{201C}',
        "quotedblright" => '\u{201D}',
        "quotedblbase" => '\u{201E}',
        "quotesinglbase" => '\u{201A}',
        // Latin f-ligatures named in `/Differences` (e.g. 2305's body font). These
        // map to the presentation-form code points, which `decompose_ligatures`
        // then spells back out (`ff`→"ff") — without them the glyph decodes to
        // nothing and the sanitizer fills the gap with a space (`di erences`).
        "ff" => '\u{FB00}',
        "fi" => '\u{FB01}',
        "fl" => '\u{FB02}',
        "ffi" => '\u{FB03}',
        "ffl" => '\u{FB04}',
        "ft" => '\u{FB05}',
        "st" => '\u{FB06}',
        "degree" => '\u{00B0}',
        "trademark" => '\u{2122}',
        "registered" => '\u{00AE}',
        "copyright" => '\u{00A9}',
        "ellipsis" => '\u{2026}',
        "minus" => '\u{2212}',
        "fraction" => '\u{2044}',
        "nbspace" => '\u{00A0}',
        // Greek + math glyph names (standard Adobe Glyph List). Standard TeX math
        // fonts (CMMI/CMSY/…) name their glyphs this way in the embedded font
        // program's `/Encoding`; without these a `λ`/`≤` decodes to nothing and is
        // dropped from body text (`and λ set to 0.5` → `and set to 0.5`).
        "alpha" => '\u{03B1}',
        "beta" => '\u{03B2}',
        "gamma" => '\u{03B3}',
        "delta" => '\u{03B4}',
        "epsilon" | "epsilon1" => '\u{03B5}',
        "zeta" => '\u{03B6}',
        "eta" => '\u{03B7}',
        "theta" | "theta1" => '\u{03B8}',
        "iota" => '\u{03B9}',
        "kappa" => '\u{03BA}',
        "lambda" => '\u{03BB}',
        "mu" => '\u{03BC}',
        "nu" => '\u{03BD}',
        "xi" => '\u{03BE}',
        "omicron" => '\u{03BF}',
        "pi" | "pi1" => '\u{03C0}',
        "rho" | "rho1" => '\u{03C1}',
        "sigma" => '\u{03C3}',
        "sigma1" => '\u{03C2}',
        "tau" => '\u{03C4}',
        "upsilon" => '\u{03C5}',
        "phi" | "phi1" => '\u{03C6}',
        "chi" => '\u{03C7}',
        "psi" => '\u{03C8}',
        "omega" | "omega1" => '\u{03C9}',
        "Gamma" => '\u{0393}',
        "Delta" => '\u{0394}',
        "Theta" => '\u{0398}',
        "Lambda" => '\u{039B}',
        "Xi" => '\u{039E}',
        "Pi" => '\u{03A0}',
        "Sigma" => '\u{03A3}',
        "Upsilon" => '\u{03A5}',
        "Phi" => '\u{03A6}',
        "Psi" => '\u{03A8}',
        "Omega" => '\u{03A9}',
        "lessequal" => '\u{2264}',
        "greaterequal" => '\u{2265}',
        "notequal" => '\u{2260}',
        "approxequal" => '\u{2248}',
        "equivalence" => '\u{2261}',
        "element" => '\u{2208}',
        "plusminus" => '\u{00B1}',
        "multiply" => '\u{00D7}',
        "divide" => '\u{00F7}',
        "infinity" => '\u{221E}',
        "partialdiff" => '\u{2202}',
        "gradient" => '\u{2207}',
        "summation" => '\u{2211}',
        "product" => '\u{220F}',
        "integral" => '\u{222B}',
        "radical" => '\u{221A}',
        "proportional" => '\u{221D}',
        "arrowright" => '\u{2192}',
        "arrowleft" => '\u{2190}',
        "arrowup" => '\u{2191}',
        "arrowdown" => '\u{2193}',
        "arrowboth" => '\u{2194}',
        "arrowdblright" => '\u{21D2}',
        "logicaland" => '\u{2227}',
        "logicalor" => '\u{2228}',
        "intersection" => '\u{2229}',
        "union" => '\u{222A}',
        "similar" => '\u{223C}',
        "congruent" => '\u{2245}',
        "dotmath" => '\u{22C5}',
        "asteriskmath" => '\u{2217}',
        _ => {
            // Strip an AGL `.suffix` (oldstyle/small-cap variant) and retry.
            // An empty base (`.notdef`) has nothing left to resolve.
            if let Some((base, _)) = s.split_once('.') {
                if !base.is_empty() {
                    return glyph_name_to_char(base.as_bytes());
                }
            }
            return None;
        }
    };
    Some(resolved)
}
1260
/// WinAnsiEncoding (Windows-1252) for simple fonts lacking ToUnicode.
///
/// Covers printable ASCII (`0x20..=0x7E`), every defined Windows-1252 code in
/// `0x80..=0x9F` (Euro, curly quotes, dashes, dagger, trademark, Œ/œ, Š/š, …)
/// and the Latin-1 range `0xA0..=0xFF`. Control codes, `0x7F` and the five
/// codes Windows-1252 leaves undefined (`0x81`, `0x8D`, `0x8F`, `0x90`,
/// `0x9D`) have no entry.
fn winansi_table() -> HashMap<u8, char> {
    // 0x80..=0x9F is where Windows-1252 departs from Latin-1 (C1 controls).
    const CP1252_SPECIALS: &[(u8, char)] = &[
        (0x80, '\u{20AC}'), // Euro
        (0x82, '\u{201A}'), // quotesinglbase
        (0x83, '\u{0192}'), // florin
        (0x84, '\u{201E}'), // quotedblbase
        (0x85, '\u{2026}'), // ellipsis
        (0x86, '\u{2020}'), // dagger
        (0x87, '\u{2021}'), // daggerdbl
        (0x88, '\u{02C6}'), // circumflex
        (0x89, '\u{2030}'), // perthousand
        (0x8A, '\u{0160}'), // Scaron
        (0x8B, '\u{2039}'), // guilsinglleft
        (0x8C, '\u{0152}'), // OE
        (0x8E, '\u{017D}'), // Zcaron
        (0x91, '\u{2018}'), // quoteleft
        (0x92, '\u{2019}'), // quoteright
        (0x93, '\u{201C}'), // quotedblleft
        (0x94, '\u{201D}'), // quotedblright
        (0x95, '\u{2022}'), // bullet
        (0x96, '\u{2013}'), // endash
        (0x97, '\u{2014}'), // emdash
        (0x98, '\u{02DC}'), // tilde
        (0x99, '\u{2122}'), // trademark
        (0x9A, '\u{0161}'), // scaron
        (0x9B, '\u{203A}'), // guilsinglright
        (0x9C, '\u{0153}'), // oe
        (0x9E, '\u{017E}'), // zcaron
        (0x9F, '\u{0178}'), // Ydieresis
    ];

    // Printable ASCII maps to itself.
    let mut m: HashMap<u8, char> = (0x20u8..=0x7e).map(|b| (b, b as char)).collect();
    m.extend(CP1252_SPECIALS.iter().copied());
    // 0xA0..=0xFF coincide with Latin-1, i.e. U+00A0..=U+00FF.
    m.extend((0xA0u8..=0xFF).map(|b| (b, b as char)));
    m
}
1287
/// Minimal MacRomanEncoding table: printable ASCII (`0x20..=0x7E`) maps to
/// itself, plus a handful of high-range codes seen in our corpus — most
/// importantly `0xA5` (bullet), which shows up as a list marker. All other
/// codes have no entry.
fn macroman_table() -> HashMap<u8, char> {
    const HIGH_RANGE: [(u8, char); 11] = [
        (0xA5, '\u{2022}'), // bullet
        (0xD0, '\u{2013}'), // endash
        (0xD1, '\u{2014}'), // emdash
        (0xD2, '\u{201C}'), // quotedblleft
        (0xD3, '\u{201D}'), // quotedblright
        (0xD4, '\u{2018}'), // quoteleft
        (0xD5, '\u{2019}'), // quoteright
        (0xCA, '\u{00A0}'), // no-break space
        (0xC9, '\u{2026}'), // ellipsis
        (0xDE, '\u{FB01}'), // fi ligature
        (0xDF, '\u{FB02}'), // fl ligature
    ];

    (0x20u8..=0x7e)
        .map(|byte| (byte, char::from(byte)))
        .chain(HIGH_RANGE.iter().copied())
        .collect()
}
fleischwolf_pdf/textparse.rs

fleischwolf_pdf/
textparse.rs