fleischwolf_pdf/
textparse.rs

1//! Pure-Rust PDF text extraction (replacing pdfium's glyph layer).
2//!
3//! pdfium reports *rendered* glyph boxes, which diverge from docling's
4//! `docling-parse` C++ parser at exactly the points that drive conformance:
5//! generated spaces get a zero-width box, combining diacritics get a real-width
6//! box, and ligature/fraction glyphs land at different x. This module instead
7//! reconstructs each glyph's box from the **font's own advance widths** and the
8//! PDF text/graphics matrices — the same information docling-parse uses — so a
9//! space is as wide as the font says and a combining mark has zero advance.
10//!
11//! The output is the same [`Glyph`] stream pdfium produces (native PDF
12//! coordinates, y-up), fed straight into the existing docling-parse line
13//! sanitizer ([`crate::dp_lines`]). Only the digital text layer is handled here;
14//! pages without one still fall back to OCR upstream.
15
16use std::collections::HashMap;
17
18use lopdf::{Dictionary, Document, Object};
19
20use crate::pdfium_backend::Glyph;
21
/// A 2×3 affine transform stored as `[a b c d e f]`. A point `(x, y)` is sent
/// to `(a·x + c·y + e, b·x + d·y + f)`.
#[derive(Clone, Copy)]
struct Mat {
    a: f64,
    b: f64,
    c: f64,
    d: f64,
    e: f64,
    f: f64,
}

impl Mat {
    /// The identity transform.
    const ID: Self = Self {
        a: 1.0,
        b: 0.0,
        c: 0.0,
        d: 1.0,
        e: 0.0,
        f: 0.0,
    };

    /// Compose two transforms: the result maps a point through `self` first
    /// and through `m` second.
    fn then(self, m: Mat) -> Mat {
        let Mat { a, b, c, d, e, f } = self;
        Mat {
            a: a * m.a + b * m.c,
            b: a * m.b + b * m.d,
            c: c * m.a + d * m.c,
            d: c * m.b + d * m.d,
            e: e * m.a + f * m.c + m.e,
            f: e * m.b + f * m.d + m.f,
        }
    }

    /// Transform the point `(x, y)`.
    fn apply(self, x: f64, y: f64) -> (f64, f64) {
        let Mat { a, b, c, d, e, f } = self;
        (a * x + c * y + e, b * x + d * y + f)
    }
}
62
63/// A parsed font: how to turn raw string bytes into (unicode, advance) pairs.
64struct Font {
65    /// 2-byte codes (Type0 / Identity-H) vs 1-byte (simple fonts).
66    two_byte: bool,
67    /// code → Unicode string (from ToUnicode; may be multi-char, e.g. ligatures).
68    to_unicode: HashMap<u32, String>,
69    /// code → glyph advance, in 1000-unit glyph space.
70    widths: HashMap<u32, f64>,
71    default_width: f64,
72    /// 1-byte fallback decoding when ToUnicode lacks a code (WinAnsi-ish).
73    simple_encoding: Option<HashMap<u8, char>>,
74    /// code → raw `/Differences` glyph name, for GID-style names (`g115`) that
75    /// have no Unicode mapping. docling-parse emits these verbatim as `/g115`
76    /// (see the redp5110 bulleted list); matching it keeps no text skipped.
77    fallback_names: HashMap<u8, String>,
78    ascent: f64,
79    descent: f64,
80    hash: u64,
81}
82
83impl Font {
84    fn decode_code(&self, code: u32) -> (Option<String>, f64) {
85        let w = self
86            .widths
87            .get(&code)
88            .copied()
89            .unwrap_or(self.default_width);
90        if let Some(s) = self.to_unicode.get(&code) {
91            return (Some(decompose_ligatures(s)), w);
92        }
93        if !self.two_byte {
94            // A GID-style `/Differences` name (no Unicode) overrides the base
95            // encoding, matching docling's verbatim `/g115` fallback.
96            if let Some(name) = self.fallback_names.get(&(code as u8)) {
97                return (Some(format!("/{name}")), w);
98            }
99            if let Some(enc) = &self.simple_encoding {
100                if let Some(&ch) = enc.get(&(code as u8)) {
101                    return (Some(decompose_ligatures(&ch.to_string())), w);
102                }
103            }
104        }
105        (None, w)
106    }
107}
108
/// Replace every Latin presentation-form ligature (U+FB00–U+FB06) in `s` with
/// its spelled-out letters (`ﬁ`→`fi`, `ﬃ`→`ffi`, …), the way docling does, so
/// words such as `configuration`/`difficult` do not keep the ligature glyph.
/// All other characters are copied through unchanged.
fn decompose_ligatures(s: &str) -> String {
    let mut expanded = String::with_capacity(s.len());
    for c in s.chars() {
        match c {
            '\u{FB00}' => expanded.push_str("ff"),
            '\u{FB01}' => expanded.push_str("fi"),
            '\u{FB02}' => expanded.push_str("fl"),
            '\u{FB03}' => expanded.push_str("ffi"),
            '\u{FB04}' => expanded.push_str("ffl"),
            '\u{FB05}' => expanded.push_str("ft"),
            '\u{FB06}' => expanded.push_str("st"),
            other => expanded.push(other),
        }
    }
    expanded
}
132
/// Hash a font resource name with the standard library's default hasher.
fn hash_name(name: &[u8]) -> u64 {
    use std::collections::hash_map::DefaultHasher;
    use std::hash::{Hash, Hasher};

    let mut hasher = DefaultHasher::new();
    Hash::hash(name, &mut hasher);
    hasher.finish()
}
139
140/// Resolve a possibly-indirect object to a dictionary.
141fn as_dict<'a>(doc: &'a Document, obj: &'a Object) -> Option<&'a Dictionary> {
142    match obj {
143        Object::Dictionary(d) => Some(d),
144        Object::Reference(id) => doc.get_object(*id).ok().and_then(|o| o.as_dict().ok()),
145        _ => None,
146    }
147}
148
149fn deref<'a>(doc: &'a Document, obj: &'a Object) -> Option<&'a Object> {
150    match obj {
151        Object::Reference(id) => doc.get_object(*id).ok(),
152        other => Some(other),
153    }
154}
155
156/// Parse one font dictionary into a [`Font`].
157fn parse_font(doc: &Document, name: &[u8], fdict: &Dictionary) -> Font {
158    let subtype: &[u8] = fdict
159        .get(b"Subtype")
160        .ok()
161        .and_then(|o| o.as_name().ok())
162        .unwrap_or(&[]);
163    let two_byte = subtype == b"Type0".as_slice();
164
165    let to_unicode = fdict
166        .get(b"ToUnicode")
167        .ok()
168        .and_then(|o| deref(doc, o))
169        .and_then(|o| o.as_stream().ok())
170        .and_then(|s| s.decompressed_content().ok())
171        .map(|data| parse_tounicode(&data))
172        .unwrap_or_default();
173
174    let (widths, default_width) = if two_byte {
175        cid_widths(doc, fdict)
176    } else {
177        simple_widths(doc, fdict)
178    };
179
180    let simple_encoding = if two_byte {
181        None
182    } else {
183        Some(simple_encoding_table(doc, fdict))
184    };
185    let fallback_names = if two_byte {
186        HashMap::new()
187    } else {
188        differences_gid_names(doc, fdict)
189    };
190
191    let (ascent, descent) = font_ascent_descent(doc, fdict, two_byte);
192
193    Font {
194        two_byte,
195        to_unicode,
196        widths,
197        default_width,
198        simple_encoding,
199        fallback_names,
200        ascent,
201        descent,
202        hash: hash_name(name),
203    }
204}
205
206/// Collect `/Differences` entries whose glyph name is a GID placeholder
207/// (`g115`, `cid42`, `glyph7`, `index9`) with no Unicode mapping. docling-parse
208/// emits such glyphs as the literal name `/g115`; mapping them here keeps the
209/// text from being silently dropped (subsetted fonts with no ToUnicode). The
210/// GID-name restriction keeps real Adobe glyph names on the normal path so this
211/// never invents garbage on the clean files.
212fn differences_gid_names(doc: &Document, fdict: &Dictionary) -> HashMap<u8, String> {
213    let mut map = HashMap::new();
214    let Some(Object::Dictionary(enc)) = fdict.get(b"Encoding").ok().and_then(|o| deref(doc, o))
215    else {
216        return map;
217    };
218    let Some(Object::Array(diffs)) = enc.get(b"Differences").ok().and_then(|o| deref(doc, o))
219    else {
220        return map;
221    };
222    let mut code = 0u8;
223    for el in diffs {
224        match el {
225            Object::Integer(i) => code = *i as u8,
226            Object::Name(name) => {
227                if glyph_name_to_char(name).is_none() && is_gid_name(name) {
228                    map.insert(code, String::from_utf8_lossy(name).into_owned());
229                }
230                code = code.wrapping_add(1);
231            }
232            _ => {}
233        }
234    }
235    map
236}
237
/// Whether `name` is a synthetic placeholder rather than a real Adobe glyph
/// name: `g115`, `cid42`, `glyph7`, `index9`, `G12`, or a short-prefix code
/// name like `SM590000` (IBM BookMaster). Such names carry no Unicode meaning,
/// and docling-parse emits them verbatim (`/SM590000`). Names starting with
/// `afii` or `uni` are real Adobe names and always rejected, as is anything
/// that is not valid UTF-8.
fn is_gid_name(name: &[u8]) -> bool {
    const REAL_NAME_PREFIXES: [&str; 2] = ["afii", "uni"];
    const PLACEHOLDER_PREFIXES: [&str; 6] = ["g", "G", "cid", "CID", "glyph", "index"];

    let s = match std::str::from_utf8(name) {
        Ok(s) => s,
        Err(_) => return false,
    };
    if REAL_NAME_PREFIXES.iter().any(|p| s.starts_with(*p)) {
        return false;
    }

    // Known placeholder prefix followed by at least one digit and nothing else.
    let is_prefixed_number = PLACEHOLDER_PREFIXES.iter().any(|p| {
        s.strip_prefix(*p)
            .map_or(false, |rest| !rest.is_empty() && rest.bytes().all(|b| b.is_ascii_digit()))
    });
    if is_prefixed_number {
        return true;
    }

    // Otherwise: one to three leading letters, then three or more digits and
    // nothing else. Real Adobe names are whole words or letter+`.suffix`
    // variants, so they do not fit this shape.
    let bytes = s.as_bytes();
    let split = bytes
        .iter()
        .position(|b| !b.is_ascii_alphabetic())
        .unwrap_or(bytes.len());
    let (letters, tail) = bytes.split_at(split);
    (1..=3).contains(&letters.len())
        && tail.len() >= 3
        && tail.iter().all(|b| b.is_ascii_digit())
}
266
267fn font_ascent_descent(doc: &Document, fdict: &Dictionary, two_byte: bool) -> (f64, f64) {
268    // For Type0, the descriptor lives on the descendant CIDFont.
269    let descr_owner = if two_byte {
270        fdict
271            .get(b"DescendantFonts")
272            .ok()
273            .and_then(|o| deref(doc, o))
274            .and_then(|o| match o {
275                Object::Array(a) => a.first(),
276                _ => None,
277            })
278            .and_then(|o| as_dict(doc, o))
279    } else {
280        Some(fdict)
281    };
282    let fd = descr_owner
283        .and_then(|d| d.get(b"FontDescriptor").ok())
284        .and_then(|o| as_dict(doc, o));
285    let asc = fd
286        .and_then(|d| d.get(b"Ascent").ok())
287        .and_then(|o| {
288            o.as_float()
289                .ok()
290                .or_else(|| o.as_i64().ok().map(|i| i as f32))
291        })
292        .unwrap_or(750.0) as f64;
293    let desc = fd
294        .and_then(|d| d.get(b"Descent").ok())
295        .and_then(|o| {
296            o.as_float()
297                .ok()
298                .or_else(|| o.as_i64().ok().map(|i| i as f32))
299        })
300        .unwrap_or(-250.0) as f64;
301    (asc, desc)
302}
303
304/// Simple-font widths: `/FirstChar` + `/Widths` array, `/MissingWidth` default.
305fn simple_widths(doc: &Document, fdict: &Dictionary) -> (HashMap<u32, f64>, f64) {
306    let mut map = HashMap::new();
307    let first = fdict
308        .get(b"FirstChar")
309        .ok()
310        .and_then(|o| o.as_i64().ok())
311        .unwrap_or(0) as u32;
312    if let Some(Object::Array(arr)) = fdict.get(b"Widths").ok().and_then(|o| deref(doc, o)) {
313        for (i, w) in arr.iter().enumerate() {
314            if let Some(w) = num(w) {
315                map.insert(first + i as u32, w);
316            }
317        }
318    }
319    let dw = fdict
320        .get(b"FontDescriptor")
321        .ok()
322        .and_then(|o| as_dict(doc, o))
323        .and_then(|d| d.get(b"MissingWidth").ok())
324        .and_then(num)
325        .unwrap_or(0.0);
326    (map, dw)
327}
328
329/// CIDFont widths: the `/W` array on the descendant font (`/DW` default = 1000).
330fn cid_widths(doc: &Document, fdict: &Dictionary) -> (HashMap<u32, f64>, f64) {
331    let mut map = HashMap::new();
332    let Some(desc) = fdict
333        .get(b"DescendantFonts")
334        .ok()
335        .and_then(|o| deref(doc, o))
336        .and_then(|o| match o {
337            Object::Array(a) => a.first(),
338            _ => None,
339        })
340        .and_then(|o| as_dict(doc, o))
341    else {
342        return (map, 1000.0);
343    };
344    let dw = desc.get(b"DW").ok().and_then(num).unwrap_or(1000.0);
345    if let Some(Object::Array(w)) = desc.get(b"W").ok().and_then(|o| deref(doc, o)) {
346        let mut i = 0;
347        while i < w.len() {
348            let c = w.get(i).and_then(num);
349            match (c, w.get(i + 1)) {
350                // `c [w1 w2 ...]`: consecutive CIDs starting at c.
351                (Some(c), Some(Object::Array(list))) => {
352                    for (k, wv) in list.iter().enumerate() {
353                        if let Some(wv) = num(wv) {
354                            map.insert(c as u32 + k as u32, wv);
355                        }
356                    }
357                    i += 2;
358                }
359                // `c_first c_last w`: a run all of width w.
360                (Some(c1), Some(o2)) => {
361                    if let (Some(c2), Some(wv)) = (num(o2), w.get(i + 2).and_then(num)) {
362                        for cid in c1 as u32..=c2 as u32 {
363                            map.insert(cid, wv);
364                        }
365                    }
366                    i += 3;
367                }
368                _ => break,
369            }
370        }
371    }
372    (map, dw)
373}
374
375fn num(o: &Object) -> Option<f64> {
376    match o {
377        Object::Integer(i) => Some(*i as f64),
378        Object::Real(r) => Some(*r as f64),
379        _ => None,
380    }
381}
382
/// Parse a ToUnicode CMap's `bfchar` / `bfrange` sections into code→string.
///
/// Handles the three mapping forms:
/// * `beginbfchar`: `<src> <dst>` pairs; `dst` is UTF-16BE and may hold
///   several characters.
/// * `beginbfrange` with an array: `<lo> <hi> [<d0> <d1> ...]`, one
///   destination per code starting at `lo`.
/// * `beginbfrange` with a base: `<lo> <hi> <dst>`, where code `lo + k` maps
///   to `dst` with its last character advanced by `k`. A multi-unit `dst`
///   (a ligature expansion such as `<00660069>`, or a surrogate pair such as
///   `<D835DC00>`) is decoded as UTF-16 first, so these ranges are kept
///   instead of being dropped.
///
/// Everything outside those sections is ignored, and malformed entries are
/// skipped. A single base-form range expands to at most 0x10000 codes, so a
/// hostile `<00000000> <FFFFFFFF> <…>` entry cannot exhaust memory; codes
/// looked up in this module never exceed two bytes.
fn parse_tounicode(data: &[u8]) -> HashMap<u32, String> {
    // Largest offset from `lo` that one base-form `bfrange` entry may expand to.
    const MAX_RANGE_SPAN: u32 = 0xFFFF;

    let text = String::from_utf8_lossy(data);
    let mut map = HashMap::new();
    // `<hhhh…>` → big-endian 16-bit units; a trailing single byte becomes its
    // own unit. Tokens that are not a `<…>` group yield `None`.
    let hex = |s: &str| -> Option<Vec<u16>> {
        let s = s.trim();
        if !s.starts_with('<') || !s.ends_with('>') {
            return None;
        }
        let h = &s[1..s.len() - 1];
        let bytes: Vec<u8> = (0..h.len())
            .step_by(2)
            .filter_map(|i| u8::from_str_radix(h.get(i..i + 2)?, 16).ok())
            .collect();
        Some(
            bytes
                .chunks(2)
                .map(|c| {
                    if c.len() == 2 {
                        u16::from_be_bytes([c[0], c[1]])
                    } else {
                        c[0] as u16
                    }
                })
                .collect(),
        )
    };
    let u16s_to_string = |u: &[u16]| String::from_utf16_lossy(u);
    let code_of = |u: &[u16]| u.iter().fold(0u32, |acc, &x| (acc << 16) | x as u32);

    // Tokenize by structure, not whitespace: CMap hex groups are often written
    // back-to-back with no separators (`<21><21><0054>`), so scan for `<…>`
    // groups, `[`/`]` brackets, and bareword keywords.
    let tokens: Vec<String> = {
        let bytes = text.as_bytes();
        let mut toks = Vec::new();
        let mut i = 0;
        while i < bytes.len() {
            let c = bytes[i];
            if c.is_ascii_whitespace() {
                i += 1;
            } else if c == b'<' {
                let start = i;
                while i < bytes.len() && bytes[i] != b'>' {
                    i += 1;
                }
                i += 1; // include '>'
                toks.push(String::from_utf8_lossy(&bytes[start..i.min(bytes.len())]).into_owned());
            } else if c == b'[' || c == b']' {
                toks.push((c as char).to_string());
                i += 1;
            } else {
                let start = i;
                while i < bytes.len()
                    && !bytes[i].is_ascii_whitespace()
                    && bytes[i] != b'<'
                    && bytes[i] != b'['
                    && bytes[i] != b']'
                {
                    i += 1;
                }
                toks.push(String::from_utf8_lossy(&bytes[start..i]).into_owned());
            }
        }
        toks
    };
    let tokens: Vec<&str> = tokens.iter().map(|s| s.as_str()).collect();
    let mut i = 0;
    while i < tokens.len() {
        match tokens[i] {
            "beginbfchar" => {
                i += 1;
                while i + 1 < tokens.len() && tokens[i] != "endbfchar" {
                    if let (Some(src), Some(dst)) = (hex(tokens[i]), hex(tokens[i + 1])) {
                        map.insert(code_of(&src), u16s_to_string(&dst));
                    }
                    i += 2;
                }
            }
            "beginbfrange" => {
                i += 1;
                while i + 2 < tokens.len() && tokens[i] != "endbfrange" {
                    let (Some(lo), Some(hi)) = (hex(tokens[i]), hex(tokens[i + 1])) else {
                        i += 1;
                        continue;
                    };
                    let lo = code_of(&lo);
                    let hi = code_of(&hi);
                    if tokens[i + 2] == "[" {
                        // `<lo> <hi> [ <d0> <d1> ... ]`: one dst per code in the range.
                        let mut j = i + 3;
                        let mut code = lo;
                        while j < tokens.len() && tokens[j] != "]" {
                            if let Some(dst) = hex(tokens[j]) {
                                map.insert(code, u16s_to_string(&dst));
                            }
                            // Wrapping: a 4-byte `lo` near u32::MAX must not panic.
                            code = code.wrapping_add(1);
                            j += 1;
                        }
                        i = j + 1;
                    } else if let Some(dst) = hex(tokens[i + 2]) {
                        // `<lo> <hi> <dst>`: consecutive Unicode from a base.
                        let last = hi.min(lo.saturating_add(MAX_RANGE_SPAN));
                        if dst.len() <= 1 {
                            // Single unit: the base is a plain scalar value.
                            let base = code_of(&dst);
                            for (k, code) in (lo..=last).enumerate() {
                                if let Some(ch) = char::from_u32(base + k as u32) {
                                    map.insert(code, ch.to_string());
                                }
                            }
                        } else {
                            // Several units: decode as UTF-16, then advance only
                            // the final character across the range.
                            let mut head: Vec<char> = u16s_to_string(&dst).chars().collect();
                            if let Some(tail) = head.pop() {
                                let head: String = head.into_iter().collect();
                                for (k, code) in (lo..=last).enumerate() {
                                    if let Some(ch) = char::from_u32(tail as u32 + k as u32) {
                                        let mut s = String::with_capacity(head.len() + ch.len_utf8());
                                        s.push_str(&head);
                                        s.push(ch);
                                        map.insert(code, s);
                                    }
                                }
                            }
                        }
                        i += 3;
                    } else {
                        i += 1;
                    }
                }
            }
            _ => i += 1,
        }
    }
    map
}
502
503/// Decode a PDF string literal in a Tj/TJ operand into raw code units.
504fn codes(font: &Font, bytes: &[u8]) -> Vec<u32> {
505    if font.two_byte {
506        bytes
507            .chunks(2)
508            .map(|c| {
509                if c.len() == 2 {
510                    ((c[0] as u32) << 8) | c[1] as u32
511                } else {
512                    c[0] as u32
513                }
514            })
515            .collect()
516    } else {
517        bytes.iter().map(|&b| b as u32).collect()
518    }
519}
520
521/// Page size (width, height) in PDF points from the MediaBox.
522fn page_size(doc: &Document, page_id: lopdf::ObjectId) -> (f32, f32) {
523    let mb = doc
524        .get_object(page_id)
525        .ok()
526        .and_then(|o| o.as_dict().ok())
527        .and_then(|d| {
528            // MediaBox may be inherited; lopdf resolves via get_page... fall back to a guess.
529            d.get(b"MediaBox").ok().cloned()
530        })
531        .or_else(|| {
532            doc.get_dictionary(page_id)
533                .ok()
534                .and_then(|d| d.get(b"MediaBox").ok().cloned())
535        });
536    if let Some(Object::Array(a)) = mb {
537        let v: Vec<f32> = a.iter().filter_map(|o| num(o).map(|x| x as f32)).collect();
538        if v.len() == 4 {
539            return ((v[2] - v[0]).abs(), (v[3] - v[1]).abs());
540        }
541    }
542    (612.0, 792.0)
543}
544
545/// Debug: raw glyph stream `(ch, ll, lr, lb, lt)` (native coords) for page
546/// `index`, before the sanitizer. For comparing char cells to docling-parse.
547pub fn debug_glyphs(bytes: &[u8], index: usize) -> Vec<(char, f32, f32, f32, f32)> {
548    let Ok(doc) = Document::load_mem(bytes) else {
549        return Vec::new();
550    };
551    let mut pages: Vec<_> = doc.get_pages().into_iter().collect();
552    pages.sort_by_key(|(n, _)| *n);
553    let Some((_, pid)) = pages.get(index) else {
554        return Vec::new();
555    };
556    page_glyphs(&doc, *pid)
557        .into_iter()
558        .map(|g| (g.ch, g.ll, g.lr, g.lb, g.lt))
559        .collect()
560}
561
562/// Public entry: per-page (width, height, line cells) for a PDF, via the Rust
563/// text parser + the docling-parse line sanitizer. Used by the pipeline and the
564/// `textparse_dump` example.
565pub fn pdf_textlines(bytes: &[u8]) -> Vec<(f32, f32, Vec<crate::pdfium_backend::TextCell>)> {
566    let Ok(doc) = Document::load_mem(bytes) else {
567        return Vec::new();
568    };
569    let mut pages: Vec<_> = doc.get_pages().into_iter().collect();
570    pages.sort_by_key(|(n, _)| *n);
571    pages
572        .into_iter()
573        .map(|(_, pid)| {
574            let (w, h) = page_size(&doc, pid);
575            let glyphs = page_glyphs(&doc, pid);
576            let cells = crate::dp_lines::line_cells(&glyphs, h, true);
577            (w, h, cells)
578        })
579        .collect()
580}
581
/// The scalar text parameters a Form XObject inherits when it is invoked with
/// `Do`. The PDF graphics state carries these text parameters, but not the
/// text matrices, which a form re-establishes inside its own `BT`/`ET`.
#[derive(Clone, Copy)]
struct TextState {
    /// Character spacing (`Tc`).
    tc: f64,
    /// Word spacing (`Tw`).
    tw: f64,
    /// Horizontal scale as a factor (`Tz` / 100).
    th: f64,
    /// Leading (`TL`).
    tl: f64,
    /// Text rise (`Ts`).
    trise: f64,
    /// Font size (second operand of `Tf`).
    fsize: f64,
}

impl TextState {
    /// State at the start of a page: no extra spacing, unscaled text, zero
    /// leading and rise, and no font size selected yet.
    const INIT: Self = Self {
        tc: 0.0,
        tw: 0.0,
        th: 1.0,
        tl: 0.0,
        trise: 0.0,
        fsize: 0.0,
    };
}
605
606/// The effective `/Resources` dictionary for a page (inline or via reference,
607/// falling back to an inherited one from a `/Parent`).
608fn page_res(doc: &Document, page_id: lopdf::ObjectId) -> Option<&Dictionary> {
609    let (inline, ids) = doc.get_page_resources(page_id).ok()?;
610    if let Some(d) = inline {
611        return Some(d);
612    }
613    ids.into_iter().find_map(|id| doc.get_dictionary(id).ok())
614}
615
616/// Build the code→[`Font`] map for a resources dictionary's `/Font` sub-dict.
617fn fonts_from_res(doc: &Document, res: &Dictionary) -> HashMap<Vec<u8>, Font> {
618    let mut map = HashMap::new();
619    let font_dict = res
620        .get(b"Font")
621        .ok()
622        .and_then(|o| deref(doc, o))
623        .and_then(|o| o.as_dict().ok());
624    if let Some(fd) = font_dict {
625        for (name, value) in fd.iter() {
626            if let Some(fdict) = deref(doc, value).and_then(|o| o.as_dict().ok()) {
627                map.insert(name.clone(), parse_font(doc, name, fdict));
628            }
629        }
630    }
631    map
632}
633
634/// Extract every glyph on a page as a native-coordinate [`Glyph`].
635pub(crate) fn page_glyphs(doc: &Document, page_id: lopdf::ObjectId) -> Vec<Glyph> {
636    let mut out = Vec::new();
637    let Ok(content_bytes) = doc.get_page_content(page_id) else {
638        return out;
639    };
640    let Ok(content) = lopdf::content::Content::decode(&content_bytes) else {
641        return out;
642    };
643    if let Some(res) = page_res(doc, page_id) {
644        run_content(doc, res, &content, Mat::ID, TextState::INIT, 0, &mut out);
645    }
646    out
647}
648
649/// Run a content stream's operators, emitting glyphs into `out`. Recurses into
650/// Form XObjects on `Do` (bulk body text in heavy PDFs lives inside a form, not
651/// the page content stream). `res` is the resources dict in scope (the page's,
652/// or the form's own); `base_ctm` is the CTM at the point of invocation.
653fn run_content(
654    doc: &Document,
655    res: &Dictionary,
656    content: &lopdf::content::Content,
657    base_ctm: Mat,
658    init: TextState,
659    depth: u32,
660    out: &mut Vec<Glyph>,
661) {
662    let fonts = fonts_from_res(doc, res);
663    let xobjects = res
664        .get(b"XObject")
665        .ok()
666        .and_then(|o| deref(doc, o))
667        .and_then(|o| o.as_dict().ok());
668
669    // Graphics + text state. `q`/`Q` save and restore the whole graphics state,
670    // which includes the text parameters (Tc, Tw, Tz, TL, Tfs, Trise, font) —
671    // *not* the text matrix (that is reset by BT). Saving only the CTM let a Tc
672    // set inside a `q…Q` block leak out and drift every later glyph.
673    #[allow(clippy::type_complexity)]
674    let mut gstate_stack: Vec<(Mat, f64, f64, f64, f64, f64, f64, Option<&Font>)> = Vec::new();
675    let mut ctm = base_ctm;
676    let mut tm = Mat::ID;
677    let mut tlm = Mat::ID;
678    let mut font: Option<&Font> = None;
679    let mut fsize = init.fsize;
680    let mut tc = init.tc; // char spacing
681    let mut tw = init.tw; // word spacing
682    let mut th = init.th; // horizontal scale (Tz/100)
683    let mut tl = init.tl; // leading
684    let mut trise = init.trise;
685
686    let op_f = |operands: &[Object], i: usize| operands.get(i).and_then(num).unwrap_or(0.0);
687
688    for op in &content.operations {
689        let operands = &op.operands;
690        match op.operator.as_str() {
691            "q" => gstate_stack.push((ctm, tc, tw, th, tl, trise, fsize, font)),
692            "Q" => {
693                if let Some((c, a, b, h, l, r, fs, f)) = gstate_stack.pop() {
694                    ctm = c;
695                    tc = a;
696                    tw = b;
697                    th = h;
698                    tl = l;
699                    trise = r;
700                    fsize = fs;
701                    font = f;
702                }
703            }
704            "cm" => {
705                let m = Mat {
706                    a: op_f(operands, 0),
707                    b: op_f(operands, 1),
708                    c: op_f(operands, 2),
709                    d: op_f(operands, 3),
710                    e: op_f(operands, 4),
711                    f: op_f(operands, 5),
712                };
713                ctm = m.then(ctm);
714            }
715            "BT" => {
716                tm = Mat::ID;
717                tlm = Mat::ID;
718            }
719            "ET" => {}
720            "Tf" => {
721                if let Some(Object::Name(n)) = operands.first() {
722                    font = fonts.get(n.as_slice());
723                }
724                fsize = op_f(operands, 1);
725            }
726            "Td" => {
727                tlm = Mat {
728                    a: 1.0,
729                    b: 0.0,
730                    c: 0.0,
731                    d: 1.0,
732                    e: op_f(operands, 0),
733                    f: op_f(operands, 1),
734                }
735                .then(tlm);
736                tm = tlm;
737            }
738            "TD" => {
739                tl = -op_f(operands, 1);
740                tlm = Mat {
741                    a: 1.0,
742                    b: 0.0,
743                    c: 0.0,
744                    d: 1.0,
745                    e: op_f(operands, 0),
746                    f: op_f(operands, 1),
747                }
748                .then(tlm);
749                tm = tlm;
750            }
751            "Tm" => {
752                tlm = Mat {
753                    a: op_f(operands, 0),
754                    b: op_f(operands, 1),
755                    c: op_f(operands, 2),
756                    d: op_f(operands, 3),
757                    e: op_f(operands, 4),
758                    f: op_f(operands, 5),
759                };
760                tm = tlm;
761            }
762            "T*" => {
763                tlm = Mat {
764                    a: 1.0,
765                    b: 0.0,
766                    c: 0.0,
767                    d: 1.0,
768                    e: 0.0,
769                    f: -tl,
770                }
771                .then(tlm);
772                tm = tlm;
773            }
774            "Tc" => tc = op_f(operands, 0),
775            "Tw" => tw = op_f(operands, 0),
776            "Tz" => th = op_f(operands, 0) / 100.0,
777            "TL" => tl = op_f(operands, 0),
778            "Ts" => trise = op_f(operands, 0),
779            "Tj" | "'" | "\"" => {
780                if op.operator == "'" || op.operator == "\"" {
781                    // move to next line first
782                    tlm = Mat {
783                        a: 1.0,
784                        b: 0.0,
785                        c: 0.0,
786                        d: 1.0,
787                        e: 0.0,
788                        f: -tl,
789                    }
790                    .then(tlm);
791                    tm = tlm;
792                }
793                if let (Some(f), Some(Object::String(s, _))) = (font, operands.last()) {
794                    show_text(f, s, fsize, tc, tw, th, trise, &mut tm, ctm, out);
795                }
796            }
797            "TJ" => {
798                if let (Some(f), Some(Object::Array(arr))) = (font, operands.first()) {
799                    for el in arr {
800                        match el {
801                            Object::String(s, _) => {
802                                show_text(f, s, fsize, tc, tw, th, trise, &mut tm, ctm, out)
803                            }
804                            other => {
805                                if let Some(adj) = num(other) {
806                                    // negative number moves text right (PDF: subtract)
807                                    let tx = -adj / 1000.0 * fsize * th;
808                                    tm = Mat {
809                                        a: 1.0,
810                                        b: 0.0,
811                                        c: 0.0,
812                                        d: 1.0,
813                                        e: tx,
814                                        f: 0.0,
815                                    }
816                                    .then(tm);
817                                }
818                            }
819                        }
820                    }
821                }
822            }
823            "Do" => {
824                // Invoke a Form XObject: bulk body text in many PDFs lives inside
825                // a form, reached only here. Image XObjects are skipped (no text).
826                if depth >= 8 {
827                    continue;
828                }
829                let Some(Object::Name(n)) = operands.first() else {
830                    continue;
831                };
832                let stream = xobjects
833                    .and_then(|d| d.get(n.as_slice()).ok())
834                    .and_then(|o| deref(doc, o))
835                    .and_then(|o| o.as_stream().ok());
836                let Some(stream) = stream else { continue };
837                let is_form = stream
838                    .dict
839                    .get(b"Subtype")
840                    .ok()
841                    .and_then(|o| o.as_name().ok())
842                    == Some(b"Form".as_slice());
843                if !is_form {
844                    continue;
845                }
846                let Ok(data) = stream.decompressed_content() else {
847                    continue;
848                };
849                let Ok(form_content) = lopdf::content::Content::decode(&data) else {
850                    continue;
851                };
852                // The form's /Matrix maps form space into the CTM at invocation.
853                let form_mat = match stream.dict.get(b"Matrix").ok() {
854                    Some(Object::Array(a)) if a.len() == 6 => {
855                        let v: Vec<f64> = a.iter().filter_map(num).collect();
856                        if v.len() == 6 {
857                            Mat {
858                                a: v[0],
859                                b: v[1],
860                                c: v[2],
861                                d: v[3],
862                                e: v[4],
863                                f: v[5],
864                            }
865                        } else {
866                            Mat::ID
867                        }
868                    }
869                    _ => Mat::ID,
870                };
871                // The form's own /Resources, falling back to the inherited ones.
872                let form_res = stream
873                    .dict
874                    .get(b"Resources")
875                    .ok()
876                    .and_then(|o| deref(doc, o))
877                    .and_then(|o| o.as_dict().ok())
878                    .unwrap_or(res);
879                let state = TextState {
880                    tc,
881                    tw,
882                    th,
883                    tl,
884                    trise,
885                    fsize,
886                };
887                run_content(
888                    doc,
889                    form_res,
890                    &form_content,
891                    form_mat.then(ctm),
892                    state,
893                    depth + 1,
894                    out,
895                );
896            }
897            _ => {}
898        }
899    }
900}
901
902#[allow(clippy::too_many_arguments)]
903fn show_text(
904    font: &Font,
905    bytes: &[u8],
906    fsize: f64,
907    tc: f64,
908    tw: f64,
909    th: f64,
910    trise: f64,
911    tm: &mut Mat,
912    ctm: Mat,
913    out: &mut Vec<Glyph>,
914) {
915    for code in codes(font, bytes) {
916        let (text, w) = font.decode_code(code);
917        let w0 = w / 1000.0; // advance in text-space (em) units
918                             // The glyph→user transform: scale glyph space by font size, then Tm, CTM.
919        let scale = Mat {
920            a: fsize * th,
921            b: 0.0,
922            c: 0.0,
923            d: fsize,
924            e: 0.0,
925            f: trise,
926        };
927        let trm = scale.then(*tm).then(ctm);
928        // Box in glyph space (1000-unit em): x 0..w, y descent..ascent.
929        let (x0, y0) = trm.apply(0.0, font.descent / 1000.0);
930        let (x1, _y1) = trm.apply(w0, font.descent / 1000.0);
931        let (_x2, y2) = trm.apply(0.0, font.ascent / 1000.0);
932        let (left, right) = (x0.min(x1), x0.max(x1));
933        let (bot, top) = (y0.min(y2), y0.max(y2));
934        if let Some(s) = text {
935            // A run may map one code to multiple chars (ligature/fraction); share box.
936            for ch in s.chars() {
937                if ch != '\u{0}' {
938                    out.push(Glyph {
939                        ch,
940                        l: left as f32,
941                        b: bot as f32,
942                        r: right as f32,
943                        t: top as f32,
944                        ll: left as f32,
945                        lb: bot as f32,
946                        lr: right as f32,
947                        lt: top as f32,
948                        font: font.hash,
949                    });
950                }
951            }
952        }
953        // Advance the text matrix. Word spacing applies to single-byte code 32.
954        let is_space = !font.two_byte && code == 32;
955        let tx = (w0 * fsize + tc + if is_space { tw } else { 0.0 }) * th;
956        *tm = Mat {
957            a: 1.0,
958            b: 0.0,
959            c: 0.0,
960            d: 1.0,
961            e: tx,
962            f: 0.0,
963        }
964        .then(*tm);
965    }
966}
967
968/// Build a simple font's code→char table from its `/Encoding`: the base
969/// encoding (WinAnsi / MacRoman) plus any `/Differences` overrides (glyph names
970/// resolved through a small Adobe-glyph-name subset).
971fn simple_encoding_table(doc: &Document, fdict: &Dictionary) -> HashMap<u8, char> {
972    let enc = fdict.get(b"Encoding").ok().and_then(|o| deref(doc, o));
973    let base_name = match enc {
974        Some(Object::Name(n)) => n.clone(),
975        Some(Object::Dictionary(d)) => d
976            .get(b"BaseEncoding")
977            .ok()
978            .and_then(|o| o.as_name().ok())
979            .map(|n| n.to_vec())
980            .unwrap_or_default(),
981        _ => Vec::new(),
982    };
983    let mut m = if base_name == b"MacRomanEncoding" {
984        macroman_table()
985    } else {
986        winansi_table()
987    };
988    // Apply /Differences: `code /glyphname /glyphname ... code ...`.
989    if let Some(Object::Dictionary(d)) = enc {
990        if let Some(Object::Array(diffs)) = d.get(b"Differences").ok().and_then(|o| deref(doc, o)) {
991            let mut code = 0u8;
992            for el in diffs {
993                match el {
994                    Object::Integer(i) => code = *i as u8,
995                    Object::Name(name) => {
996                        if let Some(ch) = glyph_name_to_char(name) {
997                            m.insert(code, ch);
998                        }
999                        code = code.wrapping_add(1);
1000                    }
1001                    _ => {}
1002                }
1003            }
1004        }
1005    }
1006    m
1007}
1008
/// Map an Adobe glyph name onto a Unicode character.
///
/// Forms tried, in order: `uniXXXX` (the first four hex digits after the
/// prefix), a single ASCII letter standing for itself, and a fixed subset of
/// Adobe Glyph List names (ASCII digits/punctuation plus common typographic
/// marks). When none match, everything from the first `.` onward
/// (`one.taboldstyle`, `a.sc`) is dropped and the bare base name is tried once
/// more — docling renders these as the base character.
///
/// Returns `None` for non-UTF-8 input, for a `uni` name with fewer than four
/// characters after the prefix, and for names outside the subset.
fn glyph_name_to_char(name: &[u8]) -> Option<char> {
    const NAMED_GLYPHS: &[(&str, char)] = &[
        ("space", ' '),
        ("exclam", '!'),
        ("quotedbl", '"'),
        ("numbersign", '#'),
        ("dollar", '$'),
        ("percent", '%'),
        ("ampersand", '&'),
        ("quotesingle", '\''),
        ("parenleft", '('),
        ("parenright", ')'),
        ("asterisk", '*'),
        ("plus", '+'),
        ("comma", ','),
        ("hyphen", '-'),
        ("period", '.'),
        ("slash", '/'),
        ("zero", '0'),
        ("one", '1'),
        ("two", '2'),
        ("three", '3'),
        ("four", '4'),
        ("five", '5'),
        ("six", '6'),
        ("seven", '7'),
        ("eight", '8'),
        ("nine", '9'),
        ("colon", ':'),
        ("semicolon", ';'),
        ("less", '<'),
        ("equal", '='),
        ("greater", '>'),
        ("question", '?'),
        ("at", '@'),
        ("bracketleft", '['),
        ("backslash", '\\'),
        ("bracketright", ']'),
        ("asciicircum", '^'),
        ("underscore", '_'),
        ("grave", '`'),
        ("braceleft", '{'),
        ("bar", '|'),
        ("braceright", '}'),
        ("asciitilde", '~'),
        ("bullet", '\u{2022}'),
        ("periodcentered", '\u{00B7}'),
        ("endash", '\u{2013}'),
        ("emdash", '\u{2014}'),
        ("quoteright", '\u{2019}'),
        ("quoteleft", '\u{2018}'),
        ("quotedblleft", '\u{201C}'),
        ("quotedblright", '\u{201D}'),
        ("quotedblbase", '\u{201E}'),
        ("quotesinglbase", '\u{201A}'),
        ("fi", '\u{FB01}'),
        ("fl", '\u{FB02}'),
        ("degree", '\u{00B0}'),
        ("trademark", '\u{2122}'),
        ("registered", '\u{00AE}'),
        ("copyright", '\u{00A9}'),
        ("ellipsis", '\u{2026}'),
        ("minus", '\u{2212}'),
        ("fraction", '\u{2044}'),
        ("nbspace", '\u{00A0}'),
    ];

    let mut candidate = std::str::from_utf8(name).ok()?;
    loop {
        if let Some(rest) = candidate.strip_prefix("uni") {
            // Too short to carry four hex digits: the name resolves to nothing.
            let digits = rest.get(0..4)?;
            if let Ok(code_point) = u32::from_str_radix(digits, 16) {
                return char::from_u32(code_point);
            }
        }

        // One-letter names (`A`, `m`) denote the letter itself.
        if let [only] = candidate.as_bytes() {
            if only.is_ascii_alphabetic() {
                return Some(char::from(*only));
            }
        }

        if let Some(entry) = NAMED_GLYPHS.iter().find(|entry| entry.0 == candidate) {
            return Some(entry.1);
        }

        // Drop an AGL `.suffix` (oldstyle/small-cap variant) and go round once
        // more; the base contains no further dot, so this cannot repeat.
        match candidate.split_once('.') {
            Some((base, _)) if !base.is_empty() => candidate = base,
            _ => return None,
        }
    }
}
1103
/// WinAnsiEncoding (Windows-1252) for simple fonts lacking ToUnicode.
///
/// Covers printable ASCII (0x20–0x7E), every assigned Windows-1252 code in
/// 0x80–0x9F, and 0xA0–0xFF, which coincide with Latin-1. Codes that
/// Windows-1252 leaves unassigned (0x81, 0x8D, 0x8F, 0x90, 0x9D) and the
/// control codes (below 0x20, and 0x7F) stay unmapped.
fn winansi_table() -> HashMap<u8, char> {
    // 0x80–0x9F: the block where Windows-1252 departs from Latin-1.
    const CP1252_HIGH: &[(u8, char)] = &[
        (0x80, '\u{20AC}'), // Euro
        (0x82, '\u{201A}'), // quotesinglbase
        (0x83, '\u{0192}'), // florin
        (0x84, '\u{201E}'), // quotedblbase
        (0x85, '\u{2026}'), // ellipsis
        (0x86, '\u{2020}'), // dagger
        (0x87, '\u{2021}'), // daggerdbl
        (0x88, '\u{02C6}'), // circumflex
        (0x89, '\u{2030}'), // perthousand
        (0x8A, '\u{0160}'), // Scaron
        (0x8B, '\u{2039}'), // guilsinglleft
        (0x8C, '\u{0152}'), // OE
        (0x8E, '\u{017D}'), // Zcaron
        (0x91, '\u{2018}'), // quoteleft
        (0x92, '\u{2019}'), // quoteright
        (0x93, '\u{201C}'), // quotedblleft
        (0x94, '\u{201D}'), // quotedblright
        (0x95, '\u{2022}'), // bullet
        (0x96, '\u{2013}'), // endash
        (0x97, '\u{2014}'), // emdash
        (0x98, '\u{02DC}'), // tilde
        (0x99, '\u{2122}'), // trademark
        (0x9A, '\u{0161}'), // scaron
        (0x9B, '\u{203A}'), // guilsinglright
        (0x9C, '\u{0153}'), // oe
        (0x9E, '\u{017E}'), // zcaron
        (0x9F, '\u{0178}'), // Ydieresis
    ];

    // 95 printable ASCII codes + the 0x80–0x9F block + 96 Latin-1 codes.
    let mut m = HashMap::with_capacity(95 + CP1252_HIGH.len() + 96);
    m.extend((0x20u8..=0x7e).map(|b| (b, char::from(b))));
    m.extend(CP1252_HIGH.iter().copied());
    // From 0xA0 upward the byte value is the Unicode code point.
    m.extend((0xA0u8..=0xFF).map(|b| (b, char::from(b))));
    m
}
1130
/// Partial MacRomanEncoding table: printable ASCII (0x20–0x7E) plus the
/// handful of high-range codes our corpus hits (notably 0xA5 = bullet, used
/// as a list marker). Every other code is left unmapped.
fn macroman_table() -> HashMap<u8, char> {
    const HIGH_RANGE: [(u8, char); 11] = [
        (0xA5, '\u{2022}'), // bullet
        (0xD0, '\u{2013}'), // endash
        (0xD1, '\u{2014}'), // emdash
        (0xD2, '\u{201C}'), // left double quote
        (0xD3, '\u{201D}'), // right double quote
        (0xD4, '\u{2018}'), // left single quote
        (0xD5, '\u{2019}'), // right single quote
        (0xCA, '\u{00A0}'), // no-break space
        (0xC9, '\u{2026}'), // ellipsis
        (0xDE, '\u{FB01}'), // fi ligature
        (0xDF, '\u{FB02}'), // fl ligature
    ];

    (0x20u8..=0x7e)
        .map(|code| (code, char::from(code)))
        .chain(HIGH_RANGE.iter().copied())
        .collect()
}
fleischwolf_pdf/textparse.rs

fleischwolf_pdf/
textparse.rs