fleischwolf_pdf/
pdfium_backend.rs

1//! pdfium-based text extraction and page rendering.
2//!
3//! Text is reconstructed the way docling's `docling-parse` does it, so the
4//! output spacing matches the groundtruth: the page's **character** stream is
5//! grouped into **words** (split at a horizontal gap wider than a fraction of
6//! the font height — font-relative, so letter-tracking in display titles does
7//! not split a word) and words into **lines** (by baseline). pdfium-render's
8//! safe API only exposes whole style runs / `GetBoundedText`, so the character
9//! loop is driven through the raw `PdfiumLibraryBindings` FFI on a second handle
10//! to the same bytes (no fork; stays publishable).
11
12use image::RgbImage;
13use pdfium_render::prelude::*;
14
/// A run of text with its bounding box, in PDF points with a **top-left** origin
/// (pdfium's native origin is bottom-left; we flip it to match docling's
/// `BoundingBox(..., origin=TOPLEFT)`).
#[derive(Debug, Clone)]
pub struct TextCell {
    /// The text of the run.
    pub text: String,
    /// Left edge of the bounding box.
    pub l: f32,
    /// Top edge of the bounding box, measured down from the top of the page.
    pub t: f32,
    /// Right edge of the bounding box.
    pub r: f32,
    /// Bottom edge of the bounding box, measured down from the top of the page.
    pub b: f32,
}
26
/// Pixels-per-point used to render page images. Layout is scale-invariant (it
/// scales normalized boxes by the page point size), but OCR benefits from the
/// extra resolution. The final page bitmap is `page points × RENDER_SCALE`
/// pixels on each axis.
pub const RENDER_SCALE: f32 = 2.0;
31
/// One page's geometry, extracted text cells, and a rendered RGB image. The
/// image is rendered at [`RENDER_SCALE`] pixels per PDF point; `image px =
/// page point × scale`.
#[derive(Clone)]
pub struct PdfPage {
    /// Page width in PDF points.
    pub width: f32,
    /// Page height in PDF points.
    pub height: f32,
    /// Pixels per PDF point of `image` (set to [`RENDER_SCALE`]).
    pub scale: f32,
    /// Prose line cells (one per reconstructed text line).
    pub cells: Vec<TextCell>,
    /// Same text grouped for code regions: split only at pdfium space glyphs, so
    /// monospace runs keep their source spacing instead of the prose heuristic's.
    pub code_cells: Vec<TextCell>,
    /// Per-word cells (one per word, not joined into lines) for TableFormer cell
    /// matching.
    pub word_cells: Vec<TextCell>,
    /// The rendered page bitmap.
    pub image: RgbImage,
    /// Hyperlink annotations on the page (rect in top-left page coords + target
    /// URI), restricted to web/mail/tel schemes. Used only by strict Markdown.
    pub links: Vec<LinkAnnot>,
}
52
/// A PDF link annotation: its rectangle (top-left page coordinates, matching
/// [`TextCell`]) and target URI.
#[derive(Debug, Clone)]
pub struct LinkAnnot {
    /// Left edge of the link rectangle.
    pub l: f32,
    /// Top edge of the link rectangle, measured down from the top of the page.
    pub t: f32,
    /// Right edge of the link rectangle.
    pub r: f32,
    /// Bottom edge of the link rectangle, measured down from the top of the page.
    pub b: f32,
    /// Target URI of the link action.
    pub uri: String,
}
63
/// A parsed PDF: per-page text cells and page images.
pub struct PdfDocument {
    /// The extracted pages, in document order.
    pub pages: Vec<PdfPage>,
}
68
/// Selects the prose line reconstruction: `true` (the default) picks the
/// docling-parse line sanitizer ([`crate::dp_lines`]); `false` picks the older
/// gap-heuristic `lines_from_glyphs`. The legacy path is chosen by setting the
/// `DOCLING_LEGACY_LINES` environment variable (any value, even empty).
pub(crate) fn use_dp_lines() -> bool {
    match std::env::var("DOCLING_LEGACY_LINES") {
        Ok(_) => false,
        Err(_) => true,
    }
}
75
/// Whether **word** cells come from the pure-Rust parser (roadmap item 6) rather
/// than pdfium — the default. The parser's `word_cells` reproduce docling-parse's
/// word grouping byte-for-byte; these are the per-word tokens TableFormer matches
/// table-grid cells against, so using them moves table extraction closer to
/// docling on the heavy multi-column fixtures.
///
/// Returns `false` when either `DOCLING_PDFIUM_WORDS` (keep pdfium's word cells)
/// or `DOCLING_PDFIUM_TEXT` (pdfium for all text) is set.
pub(crate) fn use_parser_words() -> bool {
    const PDFIUM_OVERRIDES: [&str; 2] = ["DOCLING_PDFIUM_WORDS", "DOCLING_PDFIUM_TEXT"];
    PDFIUM_OVERRIDES
        .iter()
        .all(|key| std::env::var(key).is_err())
}
85
/// Whether **code** cells come from the parser as well (the default) — the last
/// text layer to leave pdfium, fully retiring its text path. The parser's
/// gap-based code grouping ([`code_cells_from_glyphs`]) rebuilds monospace
/// spacing from positioning gaps (`function add(a, b) { … }`), so the inter-token
/// spaces the old space-glyph-only grouping lost (`functionadd`) are kept.
///
/// Returns `false` — i.e. code cells stay on pdfium — when `DOCLING_PDFIUM_WORDS`
/// (which reverts word cells too) or `DOCLING_PDFIUM_TEXT` (all text) is set.
pub(crate) fn use_parser_code() -> bool {
    let keep_pdfium = std::env::var("DOCLING_PDFIUM_WORDS").is_ok()
        || std::env::var("DOCLING_PDFIUM_TEXT").is_ok();
    !keep_pdfium
}
96
97/// Try binding pdfium from a directory (or a literal library file path):
98/// `<dir>/<platform library name>` first, else `<dir>` itself as the file.
99fn try_bind_dir(path: &str) -> Option<Box<dyn pdfium_render::prelude::PdfiumLibraryBindings>> {
100    let name = Pdfium::pdfium_platform_library_name_at_path(path);
101    if let Ok(b) = Pdfium::bind_to_library(&name) {
102        return Some(b);
103    }
104    Pdfium::bind_to_library(path).ok()
105}
106
107/// Bind to the pdfium dynamic library. Honors `PDFIUM_DYNAMIC_LIB_PATH` (a
108/// directory or file) first; else falls back to `.pdfium/lib` relative to the
109/// current directory (the layout `scripts/download_dependencies.sh` and
110/// `scripts/pdf_setup.sh` both produce); else the system library.
111fn bind() -> Result<Pdfium, PdfiumError> {
112    if let Ok(path) = std::env::var("PDFIUM_DYNAMIC_LIB_PATH") {
113        if let Some(b) = try_bind_dir(&path) {
114            return Ok(Pdfium::new(b));
115        }
116    }
117    // No env var (or it didn't resolve): fall back to `.pdfium/lib` relative to
118    // the current directory — mirroring `layout.rs`/`ocr.rs`'s `models/…`
119    // defaults — the layout `scripts/download_dependencies.sh` (and
120    // `scripts/pdf_setup.sh`) produce, so a checkout with the dependencies
121    // downloaded next to it needs no env var at all.
122    if let Some(b) = try_bind_dir(".pdfium/lib") {
123        return Ok(Pdfium::new(b));
124    }
125    Pdfium::bind_to_system_library().map(Pdfium::new)
126}
127
128impl PdfDocument {
129    /// Parse a PDF from bytes, optionally decrypting with `password`.
130    ///
131    /// Note: this materialises **every** page's rendered bitmap in memory at
132    /// once. For large documents prefer [`for_each_page`], which streams.
133    pub fn open(bytes: &[u8], password: Option<&str>) -> Result<Self, PdfiumError> {
134        let pdfium = bind()?;
135        let ffi = FfiText::load(pdfium.bindings(), bytes, password);
136        let doc = pdfium.load_pdf_from_byte_slice(bytes, password)?;
137        let mut rust = rust_parser_cells(bytes);
138        let mut pages = Vec::new();
139        for (i, page) in doc.pages().iter().enumerate() {
140            let rc = rust.as_mut().and_then(|v| v.get_mut(i).map(std::mem::take));
141            pages.push(extract_page(&page, &ffi, i as i32, rc)?);
142        }
143        Ok(PdfDocument { pages })
144    }
145}
146
147/// Per-page prose line cells from the pure-Rust text parser. This is the
148/// **default** text layer (it matches docling-parse's char geometry and is a
149/// strict improvement on byte-conformance — e.g. it recovers the Arabic
150/// sentence-period attachment in `right_to_left_01`). Set `DOCLING_PDFIUM_TEXT`
151/// to fall back to pdfium's text layer. The parser returns an empty page when a
152/// PDF (or a page) has no parseable text layer; the caller keeps pdfium's cells
153/// in that case, so scanned/edge-case pages are unaffected.
154fn rust_parser_cells(bytes: &[u8]) -> Option<Vec<crate::textparse::PageParserCells>> {
155    if std::env::var("DOCLING_PDFIUM_TEXT").is_ok() {
156        return None;
157    }
158    Some(crate::timing::timed("textparse", || {
159        crate::textparse::pdf_all_cells(bytes)
160    }))
161}
162
163/// Number of pages in a PDF, without rendering any of them — used to decide
164/// whether a document is worth spinning up the parallel worker pool.
165pub fn page_count(bytes: &[u8], password: Option<&str>) -> Result<usize, PdfiumError> {
166    let pdfium = bind()?;
167    let doc = pdfium.load_pdf_from_byte_slice(bytes, password)?;
168    Ok(doc.pages().len() as usize)
169}
170
171/// Render + extract pages one at a time, handing each (owned) [`PdfPage`] to `f`.
172/// Only one page bitmap is resident at a time — a rendered page is ~5 MB, so a
173/// large PDF would otherwise hold gigabytes of bitmaps at once. `f` receives the
174/// zero-based page index and the total page count.
175///
176/// `E` is the caller's error type; pdfium errors convert into it via `From`.
177pub fn for_each_page<E, F>(bytes: &[u8], password: Option<&str>, mut f: F) -> Result<(), E>
178where
179    E: From<PdfiumError>,
180    F: FnMut(usize, usize, PdfPage) -> Result<(), E>,
181{
182    let pdfium = bind()?;
183    let ffi = FfiText::load(pdfium.bindings(), bytes, password);
184    let doc = pdfium.load_pdf_from_byte_slice(bytes, password)?;
185    let mut rust = rust_parser_cells(bytes);
186    let pages = doc.pages();
187    let total = pages.len() as usize;
188    for (i, page) in pages.iter().enumerate() {
189        let rc = rust.as_mut().and_then(|v| v.get_mut(i).map(std::mem::take));
190        let extracted = extract_page(&page, &ffi, i as i32, rc)?;
191        f(i, total, extracted)?;
192    }
193    Ok(())
194}
195
196fn extract_page(
197    page: &pdfium_render::prelude::PdfPage<'_>,
198    ffi: &FfiText<'_>,
199    index: i32,
200    rust_cells: Option<crate::textparse::PageParserCells>,
201) -> Result<PdfPage, PdfiumError> {
202    let width = page.width().value;
203    let height = page.height().value;
204
205    let (mut cells, mut code_cells, mut word_cells) =
206        crate::timing::timed("ffi.page_cells", || ffi.page_cells(index, height));
207    if cells.is_empty() {
208        cells = segment_cells(&page.text()?, height);
209    }
210    // Default: use the pure-Rust text parser instead of pdfium's text layer
211    // (override with `DOCLING_PDFIUM_TEXT`). Prose line cells always come from the
212    // parser; word and code cells do too unless `DOCLING_PDFIUM_WORDS` keeps them
213    // on pdfium (the parser's word grouping reproduces docling-parse's, which
214    // TableFormer matches against — roadmap item 6). A page the parser couldn't
215    // read (no text layer) keeps pdfium's cells.
216    if let Some(rc) = rust_cells {
217        if !rc.prose.is_empty() {
218            cells = rc.prose;
219        }
220        if use_parser_words() && !rc.words.is_empty() {
221            word_cells = rc.words;
222        }
223        if use_parser_code() && !rc.code.is_empty() {
224            code_cells = rc.code;
225        }
226    }
227
228    // docling renders at 1.5× the target scale and downsamples "to make it
229    // sharper" (pypdfium2 → PIL BICUBIC). Replicate exactly: the TableFormer
230    // model is pixel-sensitive, so the page bitmap must match byte-for-byte.
231    // `CatmullRom` is the same a=-0.5 cubic kernel as PIL's BICUBIC.
232    const SUPERSAMPLE: f32 = 1.5;
233    let tw = (width * RENDER_SCALE * SUPERSAMPLE).round().max(1.0) as i32;
234    let th = (height * RENDER_SCALE * SUPERSAMPLE).round().max(1.0) as i32;
235    let cfg = PdfRenderConfig::new()
236        .set_target_width(tw)
237        .set_target_height(th);
238    let big = crate::timing::timed("pdfium.render", || {
239        page.render_with_config(&cfg)
240            .map(|b| b.as_image().into_rgb8())
241    })?;
242    let dw = (width * RENDER_SCALE).round().max(1.0) as u32;
243    let dh = (height * RENDER_SCALE).round().max(1.0) as u32;
244    let image = crate::timing::timed("image.resize", || {
245        image::imageops::resize(&big, dw, dh, image::imageops::FilterType::CatmullRom)
246    });
247
248    Ok(PdfPage {
249        width,
250        height,
251        scale: RENDER_SCALE,
252        cells,
253        code_cells,
254        word_cells,
255        image,
256        links: extract_links(page, height),
257    })
258}
259
260/// Collect web/mail/tel hyperlink annotations on a page, mapping each link's
261/// rectangle into top-left page coordinates (like [`TextCell`]). `file://` and
262/// in-document destinations are skipped — only externally meaningful targets are
263/// rendered. pdfium occasionally lists a link twice; rects are kept as-is and the
264/// caller dedupes by resolved anchor text.
265fn extract_links(page: &pdfium_render::prelude::PdfPage<'_>, page_h: f32) -> Vec<LinkAnnot> {
266    let mut out = Vec::new();
267    for link in page.links().iter() {
268        let Some(uri) = link
269            .action()
270            .and_then(|a| a.as_uri_action().and_then(|u| u.uri().ok()))
271        else {
272            continue;
273        };
274        let scheme_ok = ["http://", "https://", "mailto:", "tel:"]
275            .iter()
276            .any(|s| uri.starts_with(s));
277        if !scheme_ok {
278            continue;
279        }
280        if let Ok(rect) = link.rect() {
281            out.push(LinkAnnot {
282                l: rect.left().value,
283                t: page_h - rect.top().value,
284                r: rect.right().value,
285                b: page_h - rect.bottom().value,
286                uri,
287            });
288        }
289    }
290    out
291}
292
293/// Fallback line cells from pdfium-render's style segments (one cell per
294/// segment). Used only when the raw-FFI text page can't be loaded.
295fn segment_cells(text: &PdfPageText, page_h: f32) -> Vec<TextCell> {
296    text.segments()
297        .iter()
298        .filter_map(|seg| {
299            let s = seg.text();
300            if s.trim().is_empty() {
301                return None;
302            }
303            let r = seg.bounds();
304            Some(TextCell {
305                text: s,
306                l: r.left().value,
307                t: page_h - r.top().value,
308                r: r.right().value,
309                b: page_h - r.bottom().value,
310            })
311        })
312        .collect()
313}
314
/// A second, raw-FFI handle on the same PDF used to drive the character loop
/// (`FPDFText_GetUnicode`/`GetCharBox`) that pdfium-render's safe API doesn't
/// expose. Closes the document on drop.
struct FfiText<'a> {
    /// The pdfium bindings every raw call goes through.
    bindings: &'a dyn PdfiumLibraryBindings,
    /// Raw document handle from `FPDF_LoadMemDocument`. Users check it for null
    /// before use (a null handle yields empty results rather than an error).
    doc: FPDF_DOCUMENT,
}
322
/// One glyph: codepoint + native (y-up) box edges. `l/b/r/t` is pdfium's *tight*
/// ink box (used by the legacy `lines_from_glyphs`); `ll/lb/lr/lt` is the *loose*
/// box (font ascent/descent + advance — uniform per font/size), which the
/// docling-parse-style sanitizer needs so adjacent glyphs share a top edge.
pub(crate) struct Glyph {
    /// The character; every whitespace glyph is stored as `' '`.
    pub(crate) ch: char,
    /// Tight box, left edge (NaN for a space glyph pdfium reports no box for).
    pub(crate) l: f32,
    /// Tight box, bottom edge.
    pub(crate) b: f32,
    /// Tight box, right edge.
    pub(crate) r: f32,
    /// Tight box, top edge.
    pub(crate) t: f32,
    /// Loose box, left edge (NaN when neither a loose nor a tight box exists).
    pub(crate) ll: f32,
    /// Loose box, bottom edge.
    pub(crate) lb: f32,
    /// Loose box, right edge.
    pub(crate) lr: f32,
    /// Loose box, top edge.
    pub(crate) lt: f32,
    /// Hash of the PDF font name + flags (0 when not fetched). The sanitizer uses
    /// it for docling-parse's `enforce_same_font` (keeps a bold label and regular
    /// value as separate line cells, e.g. `LABEL : value`).
    pub(crate) font: u64,
}
342
343impl<'a> FfiText<'a> {
344    fn load(bindings: &'a dyn PdfiumLibraryBindings, bytes: &[u8], password: Option<&str>) -> Self {
345        let doc = bindings.FPDF_LoadMemDocument(bytes, password);
346        FfiText { bindings, doc }
347    }
348
349    /// Reconstruct line cells for page `index` (zero-based) via the
350    /// chars→words→lines grouping. Returns `(prose_cells, code_cells)` — the same
351    /// glyphs grouped two ways (gap-heuristic for prose, space-glyph-only for
352    /// code). Both empty on any failure (caller falls back).
353    fn page_cells(&self, index: i32, page_h: f32) -> (Vec<TextCell>, Vec<TextCell>, Vec<TextCell>) {
354        let empty = || (Vec::new(), Vec::new(), Vec::new());
355        if self.doc.is_null() {
356            return empty();
357        }
358        let b = self.bindings;
359        let page = b.FPDF_LoadPage(self.doc, index);
360        if page.is_null() {
361            return empty();
362        }
363        let tp = b.FPDFText_LoadPage(page);
364        let out = if tp.is_null() {
365            empty()
366        } else {
367            let dp = use_dp_lines();
368            let g = glyphs(b, tp, dp);
369            b.FPDFText_ClosePage(tp);
370            // Prose line cells: the docling-parse-style sanitizer (behind a flag
371            // while it's validated) or the legacy gap-heuristic reconstruction.
372            let prose = if dp {
373                crate::dp_lines::line_cells(&g, page_h, false)
374            } else {
375                lines_from_glyphs(&g, page_h, Grouping::Prose)
376            };
377            (
378                prose,
379                lines_from_glyphs(&g, page_h, Grouping::CodeSpaceOnly),
380                words_from_glyphs(&g, page_h),
381            )
382        };
383        b.FPDF_ClosePage(page);
384        out
385    }
386}
387
388impl Drop for FfiText<'_> {
389    fn drop(&mut self) {
390        if !self.doc.is_null() {
391            self.bindings.FPDF_CloseDocument(self.doc);
392        }
393    }
394}
395
396/// Read every glyph (codepoint + native box) from the text page, in document
397/// order. A space glyph is kept as a word-boundary marker (NaN box, char `' '`);
398/// pdfium emits these on most lines and they pin word splits exactly. Hard line
399/// breaks are dropped (line structure comes from geometry); the gap heuristic in
400/// [`lines_from_glyphs`] is the fallback for the lines pdfium leaves space-less.
401/// Debug helper: the raw pdfium glyph stream (codepoint + native bottom-left
402/// box) for a page, in pdfium's character order. For comparing against
403/// docling-parse's char cells.
404pub fn debug_glyphs(bytes: &[u8], index: i32) -> Vec<(char, f32, f32)> {
405    let Ok(pdfium) = bind() else {
406        return Vec::new();
407    };
408    let ffi = FfiText::load(pdfium.bindings(), bytes, None);
409    if ffi.doc.is_null() {
410        return Vec::new();
411    }
412    let b = ffi.bindings;
413    let page = b.FPDF_LoadPage(ffi.doc, index);
414    if page.is_null() {
415        return Vec::new();
416    }
417    let tp = b.FPDFText_LoadPage(page);
418    let mut out = Vec::new();
419    if !tp.is_null() {
420        for g in glyphs(b, tp, true) {
421            out.push((g.ch, g.ll, g.lr));
422        }
423        b.FPDFText_ClosePage(tp);
424    }
425    b.FPDF_ClosePage(page);
426    out
427}
428
/// One text object on a page, for the hidden-layer diagnostic.
#[derive(Debug, Clone)]
pub struct DebugTextObject {
    /// True when the object is drawn invisibly (text render mode 3) — the marker of
    /// a hidden duplicate text layer.
    pub invisible: bool,
    /// Bounding box in native PDF points (bottom-left origin): left edge.
    pub l: f32,
    /// Bounding box: bottom edge.
    pub b: f32,
    /// Bounding box: right edge.
    pub r: f32,
    /// Bounding box: top edge.
    pub t: f32,
    /// The object's text (best-effort; empty if it could not be read).
    pub text: String,
}
443
444/// Diagnostic: every text object on page `index`, each tagged visible/invisible
445/// (via the object-level [`FPDFTextObj_GetTextRenderMode`], which — unlike the
446/// per-character render-mode API — is available on the default pdfium binding).
447/// A hidden duplicate text layer shows up as invisible objects repeating the
448/// visible text. Used by the `dump_render_modes` example.
449///
450/// [`FPDFTextObj_GetTextRenderMode`]: pdfium_render::prelude::PdfiumLibraryBindings::FPDFTextObj_GetTextRenderMode
451pub fn debug_text_objects(bytes: &[u8], index: i32) -> Vec<DebugTextObject> {
452    let Ok(pdfium) = bind() else {
453        return Vec::new();
454    };
455    let ffi = FfiText::load(pdfium.bindings(), bytes, None);
456    if ffi.doc.is_null() {
457        return Vec::new();
458    }
459    let b = ffi.bindings;
460    let page = b.FPDF_LoadPage(ffi.doc, index);
461    if page.is_null() {
462        return Vec::new();
463    }
464    let tp = b.FPDFText_LoadPage(page);
465    let mut out = Vec::new();
466    let n = b.FPDFPage_CountObjects(page);
467    for i in 0..n {
468        let obj = b.FPDFPage_GetObject(page, i);
469        if obj.is_null() || b.FPDFPageObj_GetType(obj) != FPDF_PAGEOBJ_TEXT as i32 {
470            continue;
471        }
472        let (mut l, mut bot, mut r, mut top) = (0f32, 0f32, 0f32, 0f32);
473        if b.FPDFPageObj_GetBounds(obj, &mut l, &mut bot, &mut r, &mut top) == 0 {
474            continue;
475        }
476        let invisible = b.FPDFTextObj_GetTextRenderMode(obj) == INVISIBLE_RENDER_MODE;
477        let text = if tp.is_null() {
478            String::new()
479        } else {
480            // FPDFTextObj_GetText returns the count of UTF-16 code units, including
481            // the trailing NUL; call once for the size, once to fill.
482            let need = b.FPDFTextObj_GetText(obj, tp, std::ptr::null_mut(), 0);
483            if need <= 1 {
484                String::new()
485            } else {
486                let mut buf = vec![0u16; need as usize];
487                b.FPDFTextObj_GetText(obj, tp, buf.as_mut_ptr(), need);
488                if let Some(&0) = buf.last() {
489                    buf.pop();
490                }
491                String::from_utf16_lossy(&buf)
492            }
493        };
494        out.push(DebugTextObject {
495            invisible,
496            l,
497            b: bot,
498            r,
499            t: top,
500            text,
501        });
502    }
503    if !tp.is_null() {
504        b.FPDFText_ClosePage(tp);
505    }
506    b.FPDF_ClosePage(page);
507    out
508}
509
510/// Hash a glyph's PDF font name + flags, for `enforce_same_font`. 0 if unavailable.
511fn font_hash(b: &dyn PdfiumLibraryBindings, tp: FPDF_TEXTPAGE, i: i32) -> u64 {
512    use std::hash::{Hash, Hasher};
513    let mut flags: std::os::raw::c_int = 0;
514    let len = b.FPDFText_GetFontInfo(tp, i, std::ptr::null_mut(), 0, &mut flags);
515    if len == 0 {
516        return 0;
517    }
518    let mut buf = vec![0u8; len as usize];
519    b.FPDFText_GetFontInfo(
520        tp,
521        i,
522        buf.as_mut_ptr() as *mut std::os::raw::c_void,
523        len,
524        &mut flags,
525    );
526    let mut h = std::collections::hash_map::DefaultHasher::new();
527    buf.hash(&mut h);
528    flags.hash(&mut h);
529    h.finish()
530}
531
/// pdfium text render mode 3: the glyph is drawn with neither fill nor stroke —
/// an invisible glyph. Web-to-PDF exporters put a hidden plain-text copy of
/// syntax-highlighted code (and other "copy"/accessibility layers) in this mode,
/// which the char-level text API then extracts as a duplicate of the visible text.
/// Compared against the value `FPDFTextObj_GetTextRenderMode` returns.
const INVISIBLE_RENDER_MODE: i32 = 3;
537
/// Read every glyph (codepoint + native tight and loose boxes) from text page
/// `tp`, in pdfium's character order, then apply two fix-ups to the stream.
///
/// * Characters that are not valid Unicode scalars, and hard line breaks
///   (`\r`, `\n`), are dropped.
/// * Every whitespace character is kept as a `' '` glyph, even without a box
///   (its tight `l` is then NaN).
/// * Any other character pdfium reports no tight box for is dropped.
/// * `fetch_font` controls whether the font hash is computed for non-space
///   glyphs; otherwise (and for all spaces) `font` is 0.
///
/// Fix-ups: the Arabic lam-alef ligature is put back into logical order, and
/// zero-width loose space boxes are stretched over the gap between their
/// neighbours on the same line.
fn glyphs(b: &dyn PdfiumLibraryBindings, tp: FPDF_TEXTPAGE, fetch_font: bool) -> Vec<Glyph> {
    let n = b.FPDFText_CountChars(tp);
    // `n` may be negative on failure; clamp for the capacity (the loop below is
    // then simply empty).
    let mut out = Vec::with_capacity(n.max(0) as usize);
    for i in 0..n {
        let ch = match char::from_u32(b.FPDFText_GetUnicode(tp, i)) {
            Some(c) => c,
            None => continue,
        };
        if ch == '\r' || ch == '\n' {
            continue;
        }
        // Spaces are font-neutral (0): pdfium's generated spaces carry a default
        // font that would otherwise block every word↔space merge under
        // enforce_same_font; docling-parse's spaces inherit the run's font.
        let font = if fetch_font && !ch.is_whitespace() {
            font_hash(b, tp, i)
        } else {
            0
        };
        // Tight ink box (note pdfium's argument order: left, right, bottom, top).
        let (mut l, mut r, mut bot, mut top) = (0f64, 0f64, 0f64, 0f64);
        let has_box = b.FPDFText_GetCharBox(tp, i, &mut l, &mut r, &mut bot, &mut top) != 0;
        // Loose box: font ascent/descent + glyph advance, uniform per font/size.
        let mut lr = FS_RECTF {
            left: 0.0,
            top: 0.0,
            right: 0.0,
            bottom: 0.0,
        };
        // Prefer the loose box; fall back to the tight box, then to a NaN marker.
        let (ll, lb, lrt, ltop) = if b.FPDFText_GetLooseCharBox(tp, i, &mut lr) != 0 {
            (lr.left, lr.bottom, lr.right, lr.top)
        } else if has_box {
            (l as f32, bot as f32, r as f32, top as f32)
        } else {
            (f32::NAN, 0.0, 0.0, 0.0)
        };
        if ch.is_whitespace() {
            // Keep the space *with its box* (the docling-parse-style line sanitizer
            // needs literal space glyphs); NaN `l` if pdfium reports no box (the
            // legacy `lines_from_glyphs` ignores the box and only flags a space).
            out.push(Glyph {
                ch: ' ',
                l: if has_box { l as f32 } else { f32::NAN },
                b: if has_box { bot as f32 } else { 0.0 },
                r: if has_box { r as f32 } else { 0.0 },
                t: if has_box { top as f32 } else { 0.0 },
                ll,
                lb,
                lr: lrt,
                lt: ltop,
                font,
            });
            continue;
        }
        // A non-space glyph without a tight box cannot be placed; drop it.
        if !has_box {
            continue;
        }
        out.push(Glyph {
            ch,
            l: l as f32,
            b: bot as f32,
            r: r as f32,
            t: top as f32,
            ll,
            lb,
            lr: lrt,
            lt: ltop,
            font,
        });
    }
    // pdfium splits the Arabic lam-alef ligature into two chars at the *same* x
    // (it's one glyph) in visual order — `alef-variant, lam`. docling-parse and
    // logical order are `lam, alef-variant`. Detect the ligature by the shared x
    // and swap. The shared-x test reliably distinguishes a true ligature from a
    // genuine `alef + lam` sequence (the article `ال`, or `فعالة`), whose two
    // glyphs sit at different x and must NOT be reordered.
    for i in 0..out.len().saturating_sub(1) {
        let same_x = out[i].l.is_finite()
            && out[i + 1].l.is_finite()
            && (out[i].l - out[i + 1].l).abs() < 1.0;
        if same_x
            && matches!(out[i].ch, '\u{0622}' | '\u{0623}' | '\u{0625}' | '\u{0627}')
            && out[i + 1].ch == '\u{0644}'
        {
            out.swap(i, i + 1);
        }
    }
    // Reconstruct degenerate (zero-width) loose space boxes by spanning the gap to
    // the next glyph on the same line, so the sanitizer keeps them as word
    // separators rather than dropping them (which would merge `Information systems`
    // → `Informationsystems`). pdfium gives generated spaces a zero-width box at a
    // wrong baseline; a wrap (different baseline) or a touching gap is left alone.
    for i in 0..out.len() {
        // Only spaces whose loose box is narrower than 0.5pt are rebuilt.
        if out[i].ch != ' ' || (out[i].lr - out[i].ll).abs() >= 0.5 {
            continue;
        }
        // Nearest non-space glyph with a usable loose box on each side.
        let prev = out[..i]
            .iter()
            .rev()
            .find(|g| g.ch != ' ' && g.ll.is_finite())
            .map(|g| (g.lr, g.lb, g.lt));
        let next = out[i + 1..]
            .iter()
            .find(|g| g.ch != ' ' && g.ll.is_finite())
            .map(|g| (g.ll, g.lb));
        if let (Some((plr, plb, plt)), Some((nll, nlb))) = (prev, next) {
            let line_h = (plt - plb).abs().max(1.0);
            // Same baseline (within half a line height) and a real gap (> 0.5pt).
            if (plb - nlb).abs() < line_h * 0.5 && nll > plr + 0.5 {
                out[i].ll = plr;
                out[i].lr = nll;
                out[i].lb = plb;
                out[i].lt = plt;
            }
        }
    }
    out
}
654
/// How [`lines_from_glyphs`] splits a line into words. Line breaks are detected
/// the same way in every mode; only the word-boundary rule differs.
#[derive(Clone, Copy, PartialEq)]
enum Grouping {
    /// Gap heuristic + punctuation glue (`engines,`, `[37`, `98.5`) — prose.
    Prose,
    /// Split only at literal space glyphs, never glue — pdfium code cells.
    /// pdfium's monospace listings carry a real space glyph at every source space,
    /// and its overhanging loose boxes would make the gap heuristic over-split
    /// (`f un c t i o n`), so honouring just the spaces reproduces the spacing.
    CodeSpaceOnly,
    /// Split on the inter-glyph **gap** (or a space glyph), but never glue — for
    /// the parser's code cells: the parser emits no space glyphs (a source space
    /// is a positioning gap), and its clean advance boxes make the gap reliable.
    /// Unlike [`Grouping::Prose`] there is no punctuation glue, so a real gap
    /// always splits (`et al. 2000`, not `et al.2000`) while genuinely touching
    /// tokens stay joined (`add(a,` / `b)`).
    CodeGap,
}
673
/// Group glyphs (document order) into words then lines, the way docling-parse
/// does: a new **word** starts where the horizontal gap to the previous glyph
/// exceeds 0.25 × the height of the tallest glyph seen on the line (a real
/// space is ~0.3 × height; letter tracking is smaller, so titles don't
/// shatter); a new **line** starts where the baseline drops by more than half
/// the glyph height together with an x-reset, or by more than 1.5 × the line
/// height regardless of x (a superscript rises without dropping, so it stays on
/// its line). Coordinates are flipped to top-left using `page_h`.
/// See [`Grouping`] for how each mode decides word boundaries.
///
/// Returns one [`TextCell`] per reconstructed line, in document order.
fn lines_from_glyphs(gs: &[Glyph], page_h: f32, mode: Grouping) -> Vec<TextCell> {
    let mut cells: Vec<TextCell> = Vec::new();
    let mut words: Vec<String> = Vec::new(); // words on the current line
    let mut word = String::new();
    // current line bounding box, native
    let (mut ll, mut lb, mut lr, mut lt) = (
        f32::INFINITY,
        f32::INFINITY,
        f32::NEG_INFINITY,
        f32::NEG_INFINITY,
    );
    // Tallest glyph seen on the current line: the word-gap threshold is relative
    // to it, so a small-font run on the line (a superscript citation) isn't split
    // at its tight digit gaps, while a big display title isn't split at its wider
    // letter tracking. A real inter-word space is ~0.3× the font height.
    let mut line_h: f32 = 0.0;
    let mut prev: Option<&Glyph> = None;
    // A space glyph between non-space glyphs pins a word split the gap heuristic
    // can miss (tight justified spacing); its own box is not used here.
    let mut pending_space = false;

    for g in gs {
        if g.ch == ' ' {
            pending_space = true;
            continue;
        }
        // Glyph height from the tight box, floored at 1pt so thresholds never
        // collapse to zero.
        let h = (g.t - g.b).abs().max(1.0);
        let (mut new_word, mut new_line) = (false, false);
        if let Some(p) = prev {
            // A new line drops the baseline *and* resets x; requiring the x-reset
            // avoids a descending comma/semicolon faking a line break. LTR wraps
            // reset x leftward (`g.l < p.r`); RTL (Arabic) wraps reset rightward
            // (the new line begins at the far right). A *large* drop (≥1.5× the
            // line height — a skipped line, e.g. a centered page-number footer
            // below a short last word) is always a new line, even without the
            // x-reset.
            let x_reset = if is_arabic(g.ch) || is_arabic(p.ch) {
                g.l > p.r
            } else {
                g.l < p.r
            };
            new_line = (p.b - g.b > h * 0.5 && x_reset) || (p.b - g.b > line_h.max(h) * 1.5);
            // Don't split before closing punctuation, after opening punctuation, or
            // after a period that runs into a digit/lowercase letter — docling
            // keeps `engines,` / `[37` / `i.e.` / `98.5` together even across a
            // space or gap.
            let glued = is_close_punct(g.ch)
                || is_open_punct(p.ch)
                || (p.ch.is_ascii_digit() && g.ch.is_ascii_digit())
                || (p.ch == '.'
                    && !pending_space
                    && (g.ch.is_ascii_digit() || g.ch.is_ascii_lowercase()));
            // Word-gap threshold: a quarter of the tallest glyph on the line.
            let word_gap = line_h.max(h) * 0.25;
            new_word = if mode == Grouping::CodeSpaceOnly {
                new_line || pending_space
            } else if mode == Grouping::CodeGap {
                // Gap-based, no glue: a real gap always splits, touching tokens join.
                new_line || pending_space || g.l - p.r > word_gap
            } else if is_arabic(g.ch) || is_arabic(p.ch) {
                // RTL runs right-to-left, so the inter-word gap is `p.l - g.r`. A
                // real word space has a gap; pdfium also emits spurious zero-gap
                // space glyphs inside words (`التي`), so require the gap rather
                // than trusting a bare space glyph.
                new_line || (p.l - g.r > word_gap && !glued)
            } else {
                new_line || ((pending_space || g.l - p.r > word_gap) && !glued)
            };
        }
        pending_space = false;
        if new_line {
            // Flush the finished line and reset the per-line accumulators.
            push_word(&mut word, &mut words);
            push_line(&mut words, (ll, lb, lr, lt), page_h, &mut cells);
            (ll, lb, lr, lt) = (
                f32::INFINITY,
                f32::INFINITY,
                f32::NEG_INFINITY,
                f32::NEG_INFINITY,
            );
            line_h = 0.0;
        } else if new_word {
            push_word(&mut word, &mut words);
        }
        // Append the glyph and grow the current line's native bounding box.
        word.push(g.ch);
        ll = ll.min(g.l);
        lb = lb.min(g.b);
        lr = lr.max(g.r);
        lt = lt.max(g.t);
        line_h = line_h.max(h);
        prev = Some(g);
    }
    // Flush whatever is left as the final line.
    push_word(&mut word, &mut words);
    push_line(&mut words, (ll, lb, lr, lt), page_h, &mut cells);
    cells
}
776
777/// Code line cells from the **parser**'s glyph stream. Unlike pdfium — whose
778/// monospace listings carry explicit space glyphs (so [`Grouping::CodeSpaceOnly`]
779/// keeps their spacing) — the parser emits no space glyphs: a source space is a
780/// positioning gap. So code cells use [`Grouping::CodeGap`], which splits on the
781/// inter-glyph gap (a space wherever it exceeds ~0.25× the line height) but never
782/// glues punctuation, so `et al. 2000` keeps its space while `add(a,` / `b)` stay
783/// joined. The parser's clean advance boxes make the gap heuristic reliable here,
784/// where pdfium's overhanging loose boxes would over-split (`f un c t i o n`).
785pub(crate) fn code_cells_from_glyphs(gs: &[Glyph], page_h: f32) -> Vec<TextCell> {
786    lines_from_glyphs(gs, page_h, Grouping::CodeGap)
787}
788
789/// Per-word cells (each word's text + top-left bbox), using the same word/line
790/// splitting as [`lines_from_glyphs`] but emitting one cell per word instead of
791/// joining into lines — the legacy gap-heuristic word grouping, kept for the
792/// pdfium word path (`DOCLING_PDFIUM_WORDS`). The default parser path uses
793/// [`crate::dp_lines::word_cells`] instead.
794pub(crate) fn words_from_glyphs(gs: &[Glyph], page_h: f32) -> Vec<TextCell> {
795    let mut cells = Vec::new();
796    let mut word = String::new();
797    let inf = (
798        f32::INFINITY,
799        f32::INFINITY,
800        f32::NEG_INFINITY,
801        f32::NEG_INFINITY,
802    );
803    let (mut wl, mut wb, mut wr, mut wt) = inf;
804    let mut line_h: f32 = 0.0;
805    let mut prev: Option<&Glyph> = None;
806    let mut pending_space = false;
807    for g in gs {
808        if g.ch == ' ' {
809            pending_space = true;
810            continue;
811        }
812        let h = (g.t - g.b).abs().max(1.0);
813        let mut new_line = false;
814        let mut new_word = false;
815        if let Some(p) = prev {
816            // LTR wraps reset x leftward (`g.l < p.r`); RTL (Arabic) wraps reset
817            // rightward (the new line begins at the far right). A large drop
818            // (≥1.5× line height) is a new line regardless of x.
819            let x_reset = if is_arabic(g.ch) || is_arabic(p.ch) {
820                g.l > p.r
821            } else {
822                g.l < p.r
823            };
824            new_line = (p.b - g.b > h * 0.5 && x_reset) || (p.b - g.b > line_h.max(h) * 1.5);
825            // No digit-digit glue here (unlike the prose grouping): table cells in
826            // adjacent columns are numeric and a column gap must still split them
827            // (`0.965` `0.934`, not `0.9650.934`). Intra-number digits have no gap
828            // so they stay together regardless.
829            let glued = is_close_punct(g.ch)
830                || is_open_punct(p.ch)
831                || (p.ch == '.'
832                    && !pending_space
833                    && (g.ch.is_ascii_digit() || g.ch.is_ascii_lowercase()));
834            let word_gap = line_h.max(h) * 0.25;
835            new_word = new_line || ((pending_space || g.l - p.r > word_gap) && !glued);
836        }
837        pending_space = false;
838        if new_word && !word.is_empty() {
839            cells.push(TextCell {
840                text: std::mem::take(&mut word),
841                l: wl,
842                t: page_h - wt,
843                r: wr,
844                b: page_h - wb,
845            });
846            (wl, wb, wr, wt) = inf;
847        }
848        if new_line {
849            line_h = 0.0;
850        }
851        word.push(g.ch);
852        wl = wl.min(g.l);
853        wb = wb.min(g.b);
854        wr = wr.max(g.r);
855        wt = wt.max(g.t);
856        line_h = line_h.max(h);
857        prev = Some(g);
858    }
859    if !word.is_empty() {
860        cells.push(TextCell {
861            text: word,
862            l: wl,
863            t: page_h - wt,
864            r: wr,
865            b: page_h - wb,
866        });
867    }
868    cells
869}
870
/// Returns `true` when `c` belongs to one of the Unicode Arabic-script blocks.
///
/// Covers the base Arabic block plus Arabic Supplement, Arabic Extended-A/B and
/// both Presentation Forms blocks. PDF fonts often map glyphs straight to
/// presentation-form code points (isolated/initial/medial/final shapes and
/// ligatures such as U+FEFB), so checking only U+0600–U+06FF would let such text
/// fall through to the left-to-right grouping rules.
///
/// U+FEFF, the last code point of the Presentation Forms-B block, is the
/// byte-order mark / zero-width no-break space rather than a letter, so the range
/// stops at U+FEFC (the last assigned form).
fn is_arabic(c: char) -> bool {
    matches!(
        c,
        '\u{0600}'..='\u{06FF}' // Arabic
            | '\u{0750}'..='\u{077F}' // Arabic Supplement
            | '\u{0870}'..='\u{089F}' // Arabic Extended-B
            | '\u{08A0}'..='\u{08FF}' // Arabic Extended-A
            | '\u{FB50}'..='\u{FDFF}' // Arabic Presentation Forms-A
            | '\u{FE70}'..='\u{FEFC}' // Arabic Presentation Forms-B (without U+FEFF)
    )
}
874
/// Returns `true` for characters the word grouping treats as closing
/// punctuation, i.e. ones that must not be split from the glyph before them:
/// `,` `.` `;` `!` `?` `)` `]` `}` `%`, the ASCII apostrophe, and the curly single
/// quotes U+2019 and U+2018.
fn is_close_punct(c: char) -> bool {
    const CLOSERS: &[char] = &[
        ',', '.', ';', '!', '?', ')', ']', '}', '%', '\'', '\u{2019}', '\u{2018}',
    ];
    CLOSERS.contains(&c)
}
881
/// Returns `true` for characters the word grouping treats as opening
/// punctuation, i.e. ones that must not be split from the glyph after them:
/// `(` `[` `{` and `@`.
fn is_open_punct(c: char) -> bool {
    // `@` glues to what follows (`mAP @0.5`, `bpf@zurich`, `@decorator`).
    "([{@".contains(c)
}
886
/// Moves the word under construction onto `words` and leaves `word` empty, ready
/// for the next one. Does nothing when `word` is already empty, so no empty
/// strings ever reach `words`.
fn push_word(word: &mut String, words: &mut Vec<String>) {
    if word.is_empty() {
        return;
    }
    let finished = std::mem::take(word);
    words.push(finished);
}
892
893fn push_line(
894    words: &mut Vec<String>,
895    bbox: (f32, f32, f32, f32),
896    page_h: f32,
897    cells: &mut Vec<TextCell>,
898) {
899    if words.is_empty() {
900        return;
901    }
902    let text = std::mem::take(words).join(" ");
903    let (l, b, r, t) = bbox;
904    cells.push(TextCell {
905        text,
906        l,
907        t: page_h - t,
908        r,
909        b: page_h - b,
910    });
911}
fleischwolf_pdf/pdfium_backend.rs

fleischwolf_pdf/
pdfium_backend.rs