fleischwolf_pdf/pdfium_backend.rs
1//! pdfium-based text extraction and page rendering.
2//!
3//! Text is reconstructed the way docling's `docling-parse` does it, so the
4//! output spacing matches the groundtruth: the page's **character** stream is
5//! grouped into **words** (split at a horizontal gap wider than a fraction of
6//! the font height — font-relative, so letter-tracking in display titles does
7//! not split a word) and words into **lines** (by baseline). pdfium-render's
8//! safe API only exposes whole style runs / `GetBoundedText`, so the character
9//! loop is driven through the raw `PdfiumLibraryBindings` FFI on a second handle
10//! to the same bytes (no fork; stays publishable).
11
12use image::RgbImage;
13use pdfium_render::prelude::*;
14
/// A run of text with its bounding box, in PDF points with a **top-left** origin
/// (pdfium's native origin is bottom-left; we flip it to match docling's
/// `BoundingBox(..., origin=TOPLEFT)`).
#[derive(Debug, Clone)]
pub struct TextCell {
    /// The text of the run.
    pub text: String,
    /// Left edge (x), in points.
    pub l: f32,
    /// Top edge, in points measured downward from the top of the page.
    pub t: f32,
    /// Right edge (x), in points.
    pub r: f32,
    /// Bottom edge, in points measured downward from the top of the page.
    pub b: f32,
}
26
/// Pixels-per-point used to render page images. Layout is scale-invariant (it
/// scales normalized boxes by the page point size), but OCR benefits from the
/// extra resolution. `extract_page` rasterizes at 1.5× this scale and then
/// downsamples to it, so the final bitmap is exactly `RENDER_SCALE` px/pt.
pub const RENDER_SCALE: f32 = 2.0;
31
/// One page's geometry, extracted text cells, and a rendered RGB image. The
/// image is rendered at [`RENDER_SCALE`] pixels per PDF point; `image px =
/// page point × scale`.
#[derive(Clone)]
pub struct PdfPage {
    /// Page width in PDF points.
    pub width: f32,
    /// Page height in PDF points.
    pub height: f32,
    /// Pixels per point of `image`; `extract_page` sets it to [`RENDER_SCALE`].
    pub scale: f32,
    /// Prose line cells (top-left origin): the parser's when it produced any,
    /// otherwise pdfium's.
    pub cells: Vec<TextCell>,
    /// Same text grouped for code regions: split only at pdfium space glyphs, so
    /// monospace runs keep their source spacing instead of the prose heuristic's.
    pub code_cells: Vec<TextCell>,
    /// Per-word cells (one per word, not joined into lines) for TableFormer cell
    /// matching.
    pub word_cells: Vec<TextCell>,
    /// Rendered page bitmap; a 1×1 placeholder when rendering was skipped.
    pub image: RgbImage,
    /// Hyperlink annotations on the page (rect in top-left page coords + target
    /// URI), restricted to web/mail/tel schemes. Used only by strict Markdown.
    pub links: Vec<LinkAnnot>,
}
52
/// A PDF link annotation: its rectangle (top-left page coordinates, matching
/// [`TextCell`]) and target URI.
#[derive(Debug, Clone)]
pub struct LinkAnnot {
    /// Left edge (x), in points.
    pub l: f32,
    /// Top edge, in points measured downward from the top of the page.
    pub t: f32,
    /// Right edge (x), in points.
    pub r: f32,
    /// Bottom edge, in points measured downward from the top of the page.
    pub b: f32,
    /// Link target, exactly as reported by pdfium for the URI action.
    pub uri: String,
}
63
/// A parsed PDF: per-page text cells and page images.
pub struct PdfDocument {
    /// Extracted pages, in document order (index 0 is the first page).
    pub pages: Vec<PdfPage>,
}
68
/// Whether to use the docling-parse line sanitizer ([`crate::dp_lines`]) for prose
/// reconstruction — the default. Set `DOCLING_LEGACY_LINES` to fall back to the
/// older gap-heuristic `lines_from_glyphs`.
///
/// Only the variable's *presence* matters: any value, including an empty or
/// non-Unicode one, selects the legacy path.
pub(crate) fn use_dp_lines() -> bool {
    // `var_os` rather than `var`: `var` reports a non-Unicode value as an error,
    // which would make a variable that *is* set look unset.
    std::env::var_os("DOCLING_LEGACY_LINES").is_none()
}
75
/// Whether to source **word** cells from the pure-Rust parser (roadmap item 6),
/// the default. The parser's `word_cells` reproduce docling-parse's word grouping
/// byte-for-byte — the per-word tokens TableFormer matches table-grid cells
/// against — which moves table extraction closer to docling on the heavy
/// multi-column fixtures. Set `DOCLING_PDFIUM_WORDS` to keep pdfium's word cells,
/// or `DOCLING_PDFIUM_TEXT` to fall back to pdfium for all text.
///
/// Only the variables' *presence* matters: any value, including an empty or
/// non-Unicode one, counts as set.
pub(crate) fn use_parser_words() -> bool {
    // `var_os` rather than `var`: `var` reports a non-Unicode value as an error,
    // which would make a variable that *is* set look unset.
    std::env::var_os("DOCLING_PDFIUM_WORDS").is_none()
        && std::env::var_os("DOCLING_PDFIUM_TEXT").is_none()
}
85
/// Whether to source **code** cells from the parser too (the default) — the last
/// text layer to leave pdfium, fully retiring its text path. The parser's
/// gap-based code grouping ([`code_cells_from_glyphs`]) reconstructs monospace
/// spacing from positioning gaps (`function add(a, b) { … }`), so it no longer
/// drops the inter-token spaces the old space-glyph-only grouping lost
/// (`functionadd`). Reverts to pdfium with `DOCLING_PDFIUM_WORDS` (alongside word
/// cells) or `DOCLING_PDFIUM_TEXT` (all text).
///
/// Only the variables' *presence* matters: any value, including an empty or
/// non-Unicode one, counts as set.
pub(crate) fn use_parser_code() -> bool {
    // `var_os` rather than `var`: `var` reports a non-Unicode value as an error,
    // which would make a variable that *is* set look unset.
    std::env::var_os("DOCLING_PDFIUM_WORDS").is_none()
        && std::env::var_os("DOCLING_PDFIUM_TEXT").is_none()
}
96
97/// Try binding pdfium from a directory (or a literal library file path):
98/// `<dir>/<platform library name>` first, else `<dir>` itself as the file.
99fn try_bind_dir(path: &str) -> Option<Box<dyn pdfium_render::prelude::PdfiumLibraryBindings>> {
100 let name = Pdfium::pdfium_platform_library_name_at_path(path);
101 if let Ok(b) = Pdfium::bind_to_library(&name) {
102 return Some(b);
103 }
104 Pdfium::bind_to_library(path).ok()
105}
106
107/// Bind to the pdfium dynamic library. Honors `PDFIUM_DYNAMIC_LIB_PATH` (a
108/// directory or file) first; else falls back to `.pdfium/lib` relative to the
109/// current directory (the layout `scripts/download_dependencies.sh` and
110/// `scripts/pdf_setup.sh` both produce); else the system library.
111fn bind() -> Result<Pdfium, PdfiumError> {
112 if let Ok(path) = std::env::var("PDFIUM_DYNAMIC_LIB_PATH") {
113 if let Some(b) = try_bind_dir(&path) {
114 return Ok(Pdfium::new(b));
115 }
116 }
117 // No env var (or it didn't resolve): fall back to `.pdfium/lib` relative to
118 // the current directory — mirroring `layout.rs`/`ocr.rs`'s `models/…`
119 // defaults — the layout `scripts/download_dependencies.sh` (and
120 // `scripts/pdf_setup.sh`) produce, so a checkout with the dependencies
121 // downloaded next to it needs no env var at all.
122 if let Some(b) = try_bind_dir(&crate::resolve_asset(".pdfium/lib")) {
123 return Ok(Pdfium::new(b));
124 }
125 Pdfium::bind_to_system_library().map(Pdfium::new)
126}
127
128impl PdfDocument {
129 /// Parse a PDF from bytes, optionally decrypting with `password`.
130 ///
131 /// Note: this materialises **every** page's rendered bitmap in memory at
132 /// once. For large documents prefer [`for_each_page`], which streams.
133 pub fn open(bytes: &[u8], password: Option<&str>) -> Result<Self, PdfiumError> {
134 let pdfium = bind()?;
135 let ffi = FfiText::load(pdfium.bindings(), bytes, password);
136 let doc = pdfium.load_pdf_from_byte_slice(bytes, password)?;
137 let mut rust = rust_parser_cells(bytes);
138 let mut pages = Vec::new();
139 for (i, page) in doc.pages().iter().enumerate() {
140 let rc = rust.as_mut().and_then(|v| v.get_mut(i).map(std::mem::take));
141 pages.push(extract_page(&page, &ffi, i as i32, rc, true)?);
142 }
143 Ok(PdfDocument { pages })
144 }
145}
146
147/// Per-page prose line cells from the pure-Rust text parser. This is the
148/// **default** text layer (it matches docling-parse's char geometry and is a
149/// strict improvement on byte-conformance — e.g. it recovers the Arabic
150/// sentence-period attachment in `right_to_left_01`). Set `DOCLING_PDFIUM_TEXT`
151/// to fall back to pdfium's text layer. The parser returns an empty page when a
152/// PDF (or a page) has no parseable text layer; the caller keeps pdfium's cells
153/// in that case, so scanned/edge-case pages are unaffected.
154fn rust_parser_cells(bytes: &[u8]) -> Option<Vec<crate::textparse::PageParserCells>> {
155 if std::env::var("DOCLING_PDFIUM_TEXT").is_ok() {
156 return None;
157 }
158 Some(crate::timing::timed("textparse", || {
159 crate::textparse::pdf_all_cells(bytes)
160 }))
161}
162
163/// Number of pages in a PDF, without rendering any of them — used to decide
164/// whether a document is worth spinning up the parallel worker pool.
165pub fn page_count(bytes: &[u8], password: Option<&str>) -> Result<usize, PdfiumError> {
166 let pdfium = bind()?;
167 let doc = pdfium.load_pdf_from_byte_slice(bytes, password)?;
168 Ok(doc.pages().len() as usize)
169}
170
171/// Render + extract pages one at a time, handing each (owned) [`PdfPage`] to `f`.
172/// Only one page bitmap is resident at a time — a rendered page is ~5 MB, so a
173/// large PDF would otherwise hold gigabytes of bitmaps at once. `f` receives the
174/// zero-based page index and the total page count.
175///
176/// `render_image` controls whether the page bitmap is rasterized at all: layout,
177/// OCR, TableFormer, and picture cropping all need it, but a caller that skips
178/// every one of those (the `no_ocr` fast path) doesn't, and rasterizing +
179/// downsampling a page is by far the most expensive step per page — skipping it
180/// is most of `no_ocr`'s speedup. `PdfPage::image` is a 1×1 placeholder when
181/// `false`; do not read it.
182///
183/// `E` is the caller's error type; pdfium errors convert into it via `From`.
184pub fn for_each_page<E, F>(
185 bytes: &[u8],
186 password: Option<&str>,
187 render_image: bool,
188 mut f: F,
189) -> Result<(), E>
190where
191 E: From<PdfiumError>,
192 F: FnMut(usize, usize, PdfPage) -> Result<(), E>,
193{
194 let pdfium = bind()?;
195 let ffi = FfiText::load(pdfium.bindings(), bytes, password);
196 let doc = pdfium.load_pdf_from_byte_slice(bytes, password)?;
197 let mut rust = rust_parser_cells(bytes);
198 let pages = doc.pages();
199 let total = pages.len() as usize;
200 for (i, page) in pages.iter().enumerate() {
201 let rc = rust.as_mut().and_then(|v| v.get_mut(i).map(std::mem::take));
202 let extracted = extract_page(&page, &ffi, i as i32, rc, render_image)?;
203 f(i, total, extracted)?;
204 }
205 Ok(())
206}
207
208fn extract_page(
209 page: &pdfium_render::prelude::PdfPage<'_>,
210 ffi: &FfiText<'_>,
211 index: i32,
212 rust_cells: Option<crate::textparse::PageParserCells>,
213 render_image: bool,
214) -> Result<PdfPage, PdfiumError> {
215 let width = page.width().value;
216 let height = page.height().value;
217
218 // Default: use the pure-Rust text parser instead of pdfium's text layer
219 // (override with `DOCLING_PDFIUM_TEXT`). Prose line cells always come from the
220 // parser; word and code cells do too unless `DOCLING_PDFIUM_WORDS` keeps them
221 // on pdfium (the parser's word grouping reproduces docling-parse's, which
222 // TableFormer matches against — roadmap item 6). A page the parser couldn't
223 // read (no text layer) keeps pdfium's cells.
224 let rc = rust_cells.unwrap_or_default();
225 let need_pdfium_prose = rc.prose.is_empty();
226 let need_pdfium_words = !use_parser_words() || rc.words.is_empty();
227 let need_pdfium_code = !use_parser_code() || rc.code.is_empty();
228
229 // The parser covers prose/words/code from one shared glyph pass, so on the
230 // common (parser-succeeded) page all three are already satisfied and this
231 // pdfium FFI call — otherwise fully discarded below — is skipped outright.
232 let (mut cells, mut code_cells, mut word_cells) =
233 if need_pdfium_prose || need_pdfium_words || need_pdfium_code {
234 let (mut cells, code_cells, word_cells) =
235 crate::timing::timed("ffi.page_cells", || ffi.page_cells(index, height));
236 if cells.is_empty() {
237 cells = segment_cells(&page.text()?, height);
238 }
239 (cells, code_cells, word_cells)
240 } else {
241 (Vec::new(), Vec::new(), Vec::new())
242 };
243 if !rc.prose.is_empty() {
244 cells = rc.prose;
245 }
246 if use_parser_words() && !rc.words.is_empty() {
247 word_cells = rc.words;
248 }
249 if use_parser_code() && !rc.code.is_empty() {
250 code_cells = rc.code;
251 }
252
253 let image = if render_image {
254 // docling renders at 1.5× the target scale and downsamples "to make it
255 // sharper" (pypdfium2 → PIL BICUBIC). Replicate exactly: the TableFormer
256 // model is pixel-sensitive, so the page bitmap must match byte-for-byte.
257 // `CatmullRom` is the same a=-0.5 cubic kernel as PIL's BICUBIC.
258 const SUPERSAMPLE: f32 = 1.5;
259 let tw = (width * RENDER_SCALE * SUPERSAMPLE).round().max(1.0) as i32;
260 let th = (height * RENDER_SCALE * SUPERSAMPLE).round().max(1.0) as i32;
261 let cfg = PdfRenderConfig::new()
262 .set_target_width(tw)
263 .set_target_height(th);
264 let big = crate::timing::timed("pdfium.render", || {
265 page.render_with_config(&cfg)
266 .map(|b| b.as_image().into_rgb8())
267 })?;
268 let dw = (width * RENDER_SCALE).round().max(1.0) as u32;
269 let dh = (height * RENDER_SCALE).round().max(1.0) as u32;
270 crate::timing::timed("image.resize", || fast_downscale(&big, dw, dh))
271 } else {
272 RgbImage::new(1, 1)
273 };
274
275 Ok(PdfPage {
276 width,
277 height,
278 scale: RENDER_SCALE,
279 cells,
280 code_cells,
281 word_cells,
282 image,
283 links: extract_links(page, height),
284 })
285}
286
287/// The supersample→target downscale via `fast_image_resize` (SIMD convolution;
288/// the same a=-0.5 Catmull-Rom kernel as `image::imageops::resize(...,
289/// CatmullRom)` and PIL BICUBIC — see the render comment above). Set
290/// `FLEISCHWOLF_SLOW_RESIZE=1` to fall back to the `image`-crate scalar resize
291/// (byte-parity with the pre-SIMD pipeline, several times slower).
292fn fast_downscale(big: &RgbImage, dw: u32, dh: u32) -> RgbImage {
293 use fast_image_resize as fir;
294 static SLOW: std::sync::OnceLock<bool> = std::sync::OnceLock::new();
295 let slow = *SLOW.get_or_init(|| {
296 std::env::var("FLEISCHWOLF_SLOW_RESIZE")
297 .map(|v| v != "0")
298 .unwrap_or(false)
299 });
300 if !slow {
301 if let Some(out) = (|| {
302 let src = fir::images::ImageRef::new(
303 big.width(),
304 big.height(),
305 big.as_raw(),
306 fir::PixelType::U8x3,
307 )
308 .ok()?;
309 let mut dst = fir::images::Image::new(dw, dh, fir::PixelType::U8x3);
310 fir::Resizer::new()
311 .resize(
312 &src,
313 &mut dst,
314 &fir::ResizeOptions::new()
315 .resize_alg(fir::ResizeAlg::Convolution(fir::FilterType::CatmullRom)),
316 )
317 .ok()?;
318 RgbImage::from_raw(dw, dh, dst.into_vec())
319 })() {
320 return out;
321 }
322 // Unreachable in practice; fall through to the scalar path on any error.
323 }
324 image::imageops::resize(big, dw, dh, image::imageops::FilterType::CatmullRom)
325}
326
327/// Collect web/mail/tel hyperlink annotations on a page, mapping each link's
328/// rectangle into top-left page coordinates (like [`TextCell`]). `file://` and
329/// in-document destinations are skipped — only externally meaningful targets are
330/// rendered. pdfium occasionally lists a link twice; rects are kept as-is and the
331/// caller dedupes by resolved anchor text.
332fn extract_links(page: &pdfium_render::prelude::PdfPage<'_>, page_h: f32) -> Vec<LinkAnnot> {
333 let mut out = Vec::new();
334 for link in page.links().iter() {
335 let Some(uri) = link
336 .action()
337 .and_then(|a| a.as_uri_action().and_then(|u| u.uri().ok()))
338 else {
339 continue;
340 };
341 let scheme_ok = ["http://", "https://", "mailto:", "tel:"]
342 .iter()
343 .any(|s| uri.starts_with(s));
344 if !scheme_ok {
345 continue;
346 }
347 if let Ok(rect) = link.rect() {
348 out.push(LinkAnnot {
349 l: rect.left().value,
350 t: page_h - rect.top().value,
351 r: rect.right().value,
352 b: page_h - rect.bottom().value,
353 uri,
354 });
355 }
356 }
357 out
358}
359
360/// Fallback line cells from pdfium-render's style segments (one cell per
361/// segment). Used only when the raw-FFI text page can't be loaded.
362fn segment_cells(text: &PdfPageText, page_h: f32) -> Vec<TextCell> {
363 text.segments()
364 .iter()
365 .filter_map(|seg| {
366 let s = seg.text();
367 if s.trim().is_empty() {
368 return None;
369 }
370 let r = seg.bounds();
371 Some(TextCell {
372 text: s,
373 l: r.left().value,
374 t: page_h - r.top().value,
375 r: r.right().value,
376 b: page_h - r.bottom().value,
377 })
378 })
379 .collect()
380}
381
/// A second, raw-FFI handle on the same PDF used to drive the character loop
/// (`FPDFText_GetUnicode`/`GetCharBox`) that pdfium-render's safe API doesn't
/// expose. Closes the document on drop.
struct FfiText<'a> {
    /// The pdfium bindings every raw call goes through.
    bindings: &'a dyn PdfiumLibraryBindings,
    /// Raw document handle from `FPDF_LoadMemDocument`; null when loading failed
    /// (every user checks for null before touching it).
    doc: FPDF_DOCUMENT,
}
389
/// One glyph: codepoint + native (y-up) box edges. `l/b/r/t` is pdfium's *tight*
/// ink box (used by the legacy `lines_from_glyphs`); `ll/lb/lr/lt` is the *loose*
/// box (font ascent/descent + advance — uniform per font/size), which the
/// docling-parse-style sanitizer needs so adjacent glyphs share a top edge.
pub(crate) struct Glyph {
    /// The character; every whitespace character is normalized to `' '`.
    pub(crate) ch: char,
    /// Tight box: left edge (NaN for a space glyph pdfium reported no box for).
    pub(crate) l: f32,
    /// Tight box: bottom edge.
    pub(crate) b: f32,
    /// Tight box: right edge.
    pub(crate) r: f32,
    /// Tight box: top edge.
    pub(crate) t: f32,
    /// Loose box: left edge (NaN when neither a loose nor a tight box exists).
    pub(crate) ll: f32,
    /// Loose box: bottom edge.
    pub(crate) lb: f32,
    /// Loose box: right edge.
    pub(crate) lr: f32,
    /// Loose box: top edge.
    pub(crate) lt: f32,
    /// Hash of the PDF font name + flags (0 when not fetched). The sanitizer uses
    /// it for docling-parse's `enforce_same_font` (keeps a bold label and regular
    /// value as separate line cells, e.g. `LABEL : value`).
    pub(crate) font: u64,
}
409
410impl<'a> FfiText<'a> {
411 fn load(bindings: &'a dyn PdfiumLibraryBindings, bytes: &[u8], password: Option<&str>) -> Self {
412 let doc = bindings.FPDF_LoadMemDocument(bytes, password);
413 FfiText { bindings, doc }
414 }
415
416 /// Reconstruct line cells for page `index` (zero-based) via the
417 /// chars→words→lines grouping. Returns `(prose_cells, code_cells)` — the same
418 /// glyphs grouped two ways (gap-heuristic for prose, space-glyph-only for
419 /// code). Both empty on any failure (caller falls back).
420 fn page_cells(&self, index: i32, page_h: f32) -> (Vec<TextCell>, Vec<TextCell>, Vec<TextCell>) {
421 let empty = || (Vec::new(), Vec::new(), Vec::new());
422 if self.doc.is_null() {
423 return empty();
424 }
425 let b = self.bindings;
426 let page = b.FPDF_LoadPage(self.doc, index);
427 if page.is_null() {
428 return empty();
429 }
430 let tp = b.FPDFText_LoadPage(page);
431 let out = if tp.is_null() {
432 empty()
433 } else {
434 let dp = use_dp_lines();
435 let g = glyphs(b, tp, dp);
436 b.FPDFText_ClosePage(tp);
437 // Prose line cells: the docling-parse-style sanitizer (behind a flag
438 // while it's validated) or the legacy gap-heuristic reconstruction.
439 let prose = if dp {
440 crate::dp_lines::line_cells(&g, page_h, false)
441 } else {
442 lines_from_glyphs(&g, page_h, Grouping::Prose)
443 };
444 (
445 prose,
446 lines_from_glyphs(&g, page_h, Grouping::CodeSpaceOnly),
447 words_from_glyphs(&g, page_h),
448 )
449 };
450 b.FPDF_ClosePage(page);
451 out
452 }
453}
454
455impl Drop for FfiText<'_> {
456 fn drop(&mut self) {
457 if !self.doc.is_null() {
458 self.bindings.FPDF_CloseDocument(self.doc);
459 }
460 }
461}
462
463/// Read every glyph (codepoint + native box) from the text page, in document
464/// order. A space glyph is kept as a word-boundary marker (NaN box, char `' '`);
465/// pdfium emits these on most lines and they pin word splits exactly. Hard line
466/// breaks are dropped (line structure comes from geometry); the gap heuristic in
467/// [`lines_from_glyphs`] is the fallback for the lines pdfium leaves space-less.
468/// Debug helper: the raw pdfium glyph stream (codepoint + native bottom-left
469/// box) for a page, in pdfium's character order. For comparing against
470/// docling-parse's char cells.
471pub fn debug_glyphs(bytes: &[u8], index: i32) -> Vec<(char, f32, f32)> {
472 let Ok(pdfium) = bind() else {
473 return Vec::new();
474 };
475 let ffi = FfiText::load(pdfium.bindings(), bytes, None);
476 if ffi.doc.is_null() {
477 return Vec::new();
478 }
479 let b = ffi.bindings;
480 let page = b.FPDF_LoadPage(ffi.doc, index);
481 if page.is_null() {
482 return Vec::new();
483 }
484 let tp = b.FPDFText_LoadPage(page);
485 let mut out = Vec::new();
486 if !tp.is_null() {
487 for g in glyphs(b, tp, true) {
488 out.push((g.ch, g.ll, g.lr));
489 }
490 b.FPDFText_ClosePage(tp);
491 }
492 b.FPDF_ClosePage(page);
493 out
494}
495
/// One text object on a page, for the hidden-layer diagnostic (the element type
/// returned by `debug_text_objects`).
#[derive(Debug, Clone)]
pub struct DebugTextObject {
    /// True when the object is drawn invisibly (text render mode 3) — the marker of
    /// a hidden duplicate text layer.
    pub invisible: bool,
    /// Bounding box in native PDF points (bottom-left origin): left edge.
    pub l: f32,
    /// Bounding box: bottom edge.
    pub b: f32,
    /// Bounding box: right edge.
    pub r: f32,
    /// Bounding box: top edge.
    pub t: f32,
    /// The object's text (best-effort; empty if it could not be read).
    pub text: String,
}
510
511/// Diagnostic: every text object on page `index`, each tagged visible/invisible
512/// (via the object-level [`FPDFTextObj_GetTextRenderMode`], which — unlike the
513/// per-character render-mode API — is available on the default pdfium binding).
514/// A hidden duplicate text layer shows up as invisible objects repeating the
515/// visible text. Used by the `dump_render_modes` example.
516///
517/// [`FPDFTextObj_GetTextRenderMode`]: pdfium_render::prelude::PdfiumLibraryBindings::FPDFTextObj_GetTextRenderMode
518pub fn debug_text_objects(bytes: &[u8], index: i32) -> Vec<DebugTextObject> {
519 let Ok(pdfium) = bind() else {
520 return Vec::new();
521 };
522 let ffi = FfiText::load(pdfium.bindings(), bytes, None);
523 if ffi.doc.is_null() {
524 return Vec::new();
525 }
526 let b = ffi.bindings;
527 let page = b.FPDF_LoadPage(ffi.doc, index);
528 if page.is_null() {
529 return Vec::new();
530 }
531 let tp = b.FPDFText_LoadPage(page);
532 let mut out = Vec::new();
533 let n = b.FPDFPage_CountObjects(page);
534 for i in 0..n {
535 let obj = b.FPDFPage_GetObject(page, i);
536 if obj.is_null() || b.FPDFPageObj_GetType(obj) != FPDF_PAGEOBJ_TEXT as i32 {
537 continue;
538 }
539 let (mut l, mut bot, mut r, mut top) = (0f32, 0f32, 0f32, 0f32);
540 if b.FPDFPageObj_GetBounds(obj, &mut l, &mut bot, &mut r, &mut top) == 0 {
541 continue;
542 }
543 let invisible = b.FPDFTextObj_GetTextRenderMode(obj) == INVISIBLE_RENDER_MODE;
544 let text = if tp.is_null() {
545 String::new()
546 } else {
547 // FPDFTextObj_GetText returns the count of UTF-16 code units, including
548 // the trailing NUL; call once for the size, once to fill.
549 let need = b.FPDFTextObj_GetText(obj, tp, std::ptr::null_mut(), 0);
550 if need <= 1 {
551 String::new()
552 } else {
553 let mut buf = vec![0u16; need as usize];
554 b.FPDFTextObj_GetText(obj, tp, buf.as_mut_ptr(), need);
555 if let Some(&0) = buf.last() {
556 buf.pop();
557 }
558 String::from_utf16_lossy(&buf)
559 }
560 };
561 out.push(DebugTextObject {
562 invisible,
563 l,
564 b: bot,
565 r,
566 t: top,
567 text,
568 });
569 }
570 if !tp.is_null() {
571 b.FPDFText_ClosePage(tp);
572 }
573 b.FPDF_ClosePage(page);
574 out
575}
576
577/// Hash a glyph's PDF font name + flags, for `enforce_same_font`. 0 if unavailable.
578fn font_hash(b: &dyn PdfiumLibraryBindings, tp: FPDF_TEXTPAGE, i: i32) -> u64 {
579 use std::hash::{Hash, Hasher};
580 let mut flags: std::os::raw::c_int = 0;
581 let len = b.FPDFText_GetFontInfo(tp, i, std::ptr::null_mut(), 0, &mut flags);
582 if len == 0 {
583 return 0;
584 }
585 let mut buf = vec![0u8; len as usize];
586 b.FPDFText_GetFontInfo(
587 tp,
588 i,
589 buf.as_mut_ptr() as *mut std::os::raw::c_void,
590 len,
591 &mut flags,
592 );
593 let mut h = std::collections::hash_map::DefaultHasher::new();
594 buf.hash(&mut h);
595 flags.hash(&mut h);
596 h.finish()
597}
598
/// pdfium text render mode 3: the glyph is drawn with neither fill nor stroke —
/// an invisible glyph. Web-to-PDF exporters put a hidden plain-text copy of
/// syntax-highlighted code (and other "copy"/accessibility layers) in this mode,
/// which the char-level text API then extracts as a duplicate of the visible text.
/// `debug_text_objects` compares each text object's render mode against it.
const INVISIBLE_RENDER_MODE: i32 = 3;
604
/// Read every glyph (codepoint + native tight and loose boxes) from text page
/// `tp`, in pdfium's character order.
///
/// * Code units that are not valid `char`s, and hard line breaks (`\r`/`\n`),
///   are dropped — line structure comes from geometry.
/// * Every whitespace character is kept as a `' '` word-boundary marker, with
///   its box when pdfium reports one and a NaN `l` otherwise.
/// * Non-space characters without a tight box are dropped.
/// * When `fetch_font` is true, non-space glyphs carry a font hash
///   (see `font_hash`); otherwise, and for all spaces, `font` is 0.
///
/// Two post-passes then run over the collected glyphs: the Arabic lam-alef
/// ligature reorder, and the reconstruction of zero-width loose space boxes.
fn glyphs(b: &dyn PdfiumLibraryBindings, tp: FPDF_TEXTPAGE, fetch_font: bool) -> Vec<Glyph> {
    let n = b.FPDFText_CountChars(tp);
    // `n.max(0)`: guard the capacity cast against a negative count.
    let mut out = Vec::with_capacity(n.max(0) as usize);
    for i in 0..n {
        let ch = match char::from_u32(b.FPDFText_GetUnicode(tp, i)) {
            Some(c) => c,
            None => continue,
        };
        if ch == '\r' || ch == '\n' {
            continue;
        }
        // Spaces are font-neutral (0): pdfium's generated spaces carry a default
        // font that would otherwise block every word↔space merge under
        // enforce_same_font; docling-parse's spaces inherit the run's font.
        let font = if fetch_font && !ch.is_whitespace() {
            font_hash(b, tp, i)
        } else {
            0
        };
        // Tight ink box. Note the out-parameter order: left, right, bottom, top.
        let (mut l, mut r, mut bot, mut top) = (0f64, 0f64, 0f64, 0f64);
        let has_box = b.FPDFText_GetCharBox(tp, i, &mut l, &mut r, &mut bot, &mut top) != 0;
        // Loose box: font ascent/descent + glyph advance, uniform per font/size.
        let mut lr = FS_RECTF {
            left: 0.0,
            top: 0.0,
            right: 0.0,
            bottom: 0.0,
        };
        // Prefer the loose box; fall back to the tight box, then to a NaN marker.
        let (ll, lb, lrt, ltop) = if b.FPDFText_GetLooseCharBox(tp, i, &mut lr) != 0 {
            (lr.left, lr.bottom, lr.right, lr.top)
        } else if has_box {
            (l as f32, bot as f32, r as f32, top as f32)
        } else {
            (f32::NAN, 0.0, 0.0, 0.0)
        };
        if ch.is_whitespace() {
            // Keep the space *with its box* (the docling-parse-style line sanitizer
            // needs literal space glyphs); NaN `l` if pdfium reports no box (the
            // legacy `lines_from_glyphs` ignores the box and only flags a space).
            out.push(Glyph {
                ch: ' ',
                l: if has_box { l as f32 } else { f32::NAN },
                b: if has_box { bot as f32 } else { 0.0 },
                r: if has_box { r as f32 } else { 0.0 },
                t: if has_box { top as f32 } else { 0.0 },
                ll,
                lb,
                lr: lrt,
                lt: ltop,
                font,
            });
            continue;
        }
        // A non-space glyph without a tight box cannot be placed; drop it.
        if !has_box {
            continue;
        }
        out.push(Glyph {
            ch,
            l: l as f32,
            b: bot as f32,
            r: r as f32,
            t: top as f32,
            ll,
            lb,
            lr: lrt,
            lt: ltop,
            font,
        });
    }
    // pdfium splits the Arabic lam-alef ligature into two chars at the *same* x
    // (it's one glyph) in visual order — `alef-variant, lam`. docling-parse and
    // logical order are `lam, alef-variant`. Detect the ligature by the shared x
    // and swap. The shared-x test reliably distinguishes a true ligature from a
    // genuine `alef + lam` sequence (the article `ال`, or `فعالة`), whose two
    // glyphs sit at different x and must NOT be reordered.
    for i in 0..out.len().saturating_sub(1) {
        let same_x = out[i].l.is_finite()
            && out[i + 1].l.is_finite()
            && (out[i].l - out[i + 1].l).abs() < 1.0;
        if same_x
            && matches!(out[i].ch, '\u{0622}' | '\u{0623}' | '\u{0625}' | '\u{0627}')
            && out[i + 1].ch == '\u{0644}'
        {
            out.swap(i, i + 1);
        }
    }
    // Reconstruct degenerate (zero-width) loose space boxes by spanning the gap to
    // the next glyph on the same line, so the sanitizer keeps them as word
    // separators rather than dropping them (which would merge `Information systems`
    // → `Informationsystems`). pdfium gives generated spaces a zero-width box at a
    // wrong baseline; a wrap (different baseline) or a touching gap is left alone.
    for i in 0..out.len() {
        // Only spaces whose loose box is narrower than 0.5pt are candidates.
        if out[i].ch != ' ' || (out[i].lr - out[i].ll).abs() >= 0.5 {
            continue;
        }
        // Nearest non-space neighbours with a usable loose box, on either side.
        let prev = out[..i]
            .iter()
            .rev()
            .find(|g| g.ch != ' ' && g.ll.is_finite())
            .map(|g| (g.lr, g.lb, g.lt));
        let next = out[i + 1..]
            .iter()
            .find(|g| g.ch != ' ' && g.ll.is_finite())
            .map(|g| (g.ll, g.lb));
        if let (Some((plr, plb, plt)), Some((nll, nlb))) = (prev, next) {
            let line_h = (plt - plb).abs().max(1.0);
            // Same baseline (within half a line height) and a real gap (> 0.5pt).
            if (plb - nlb).abs() < line_h * 0.5 && nll > plr + 0.5 {
                out[i].ll = plr;
                out[i].lr = nll;
                out[i].lb = plb;
                out[i].lt = plt;
            }
        }
    }
    out
}
721
/// How [`lines_from_glyphs`] splits a line into words. Line breaks are decided
/// the same way in every mode; only the word-boundary rule differs.
#[derive(Clone, Copy, PartialEq)]
enum Grouping {
    /// Gap heuristic + punctuation glue (`engines,`, `[37`, `98.5`) — prose.
    Prose,
    /// Split only at literal space glyphs, never glue — pdfium code cells.
    /// pdfium's monospace listings carry a real space glyph at every source space,
    /// and its overhanging loose boxes would make the gap heuristic over-split
    /// (`f un c t i o n`), so honouring just the spaces reproduces the spacing.
    CodeSpaceOnly,
    /// Split on the inter-glyph **gap** (or a space glyph), but never glue — for
    /// the parser's code cells: the parser emits no space glyphs (a source space
    /// is a positioning gap), and its clean advance boxes make the gap reliable.
    /// Unlike [`Grouping::Prose`] there is no punctuation glue, so a real gap
    /// always splits (`et al. 2000`, not `et al.2000`) while genuinely touching
    /// tokens stay joined (`add(a,` / `b)`).
    CodeGap,
}
740
/// Group glyphs (document order) into words then lines, the way docling-parse
/// does: a new **word** starts where the horizontal gap to the previous glyph
/// exceeds 0.25 × the tallest glyph height seen on the line (a real space is
/// ~0.3 × height; letter tracking is smaller, so titles don't shatter); a new
/// **line** starts where the baseline drops by more than half the glyph height
/// together with an x reset, or by more than 1.5 × the line height regardless
/// of x (a superscript rises without dropping, so it stays on its line).
/// Coordinates are flipped to top-left using `page_h`.
/// See [`Grouping`] for how each mode decides word boundaries.
///
/// Space glyphs contribute no geometry here; they only flag a pending word
/// split for the next non-space glyph. Returns one cell per line.
fn lines_from_glyphs(gs: &[Glyph], page_h: f32, mode: Grouping) -> Vec<TextCell> {
    let mut cells: Vec<TextCell> = Vec::new();
    let mut words: Vec<String> = Vec::new(); // words on the current line
    let mut word = String::new();
    // current line bounding box, native
    let (mut ll, mut lb, mut lr, mut lt) = (
        f32::INFINITY,
        f32::INFINITY,
        f32::NEG_INFINITY,
        f32::NEG_INFINITY,
    );
    // Tallest glyph seen on the current line: the word-gap threshold is relative
    // to it, so a small-font run on the line (a superscript citation) isn't split
    // at its tight digit gaps, while a big display title isn't split at its wider
    // letter tracking. A real inter-word space is ~0.3× the font height.
    let mut line_h: f32 = 0.0;
    let mut prev: Option<&Glyph> = None;
    // A space glyph between non-space glyphs pins a word split the gap heuristic
    // can miss (tight justified spacing); it carries no geometry.
    let mut pending_space = false;

    for g in gs {
        if g.ch == ' ' {
            pending_space = true;
            continue;
        }
        // Glyph height, floored at 1pt so degenerate boxes don't zero the thresholds.
        let h = (g.t - g.b).abs().max(1.0);
        let (mut new_word, mut new_line) = (false, false);
        if let Some(p) = prev {
            // A new line drops the baseline *and* resets x; requiring the x-reset
            // avoids a descending comma/semicolon faking a line break. LTR wraps
            // reset x leftward (`g.l < p.r`); RTL (Arabic) wraps reset rightward
            // (the new line begins at the far right). A *large* drop (more than
            // 1.5× the line height — a skipped line, e.g. a centered page-number
            // footer below a short last word) is always a new line, even without
            // the x-reset.
            let x_reset = if is_arabic(g.ch) || is_arabic(p.ch) {
                g.l > p.r
            } else {
                g.l < p.r
            };
            new_line = (p.b - g.b > h * 0.5 && x_reset) || (p.b - g.b > line_h.max(h) * 1.5);
            // Don't split before closing punctuation, after opening punctuation, or
            // after a period that runs into a digit/lowercase letter — docling
            // keeps `engines,` / `[37` / `i.e.` / `98.5` together even across a
            // space or gap.
            let glued = is_close_punct(g.ch)
                || is_open_punct(p.ch)
                || (p.ch.is_ascii_digit() && g.ch.is_ascii_digit())
                || (p.ch == '.'
                    && !pending_space
                    && (g.ch.is_ascii_digit() || g.ch.is_ascii_lowercase()));
            let word_gap = line_h.max(h) * 0.25;
            new_word = if mode == Grouping::CodeSpaceOnly {
                new_line || pending_space
            } else if mode == Grouping::CodeGap {
                // Gap-based, no glue: a real gap always splits, touching tokens join.
                new_line || pending_space || g.l - p.r > word_gap
            } else if is_arabic(g.ch) || is_arabic(p.ch) {
                // RTL runs right-to-left, so the inter-word gap is `p.l - g.r`. A
                // real word space has a gap; pdfium also emits spurious zero-gap
                // space glyphs inside words (`التي`), so require the gap rather
                // than trusting a bare space glyph.
                new_line || (p.l - g.r > word_gap && !glued)
            } else {
                new_line || ((pending_space || g.l - p.r > word_gap) && !glued)
            };
        }
        // The flag only applies to the glyph right after the space(s).
        pending_space = false;
        if new_line {
            // Flush the finished line and start a fresh bounding box / height.
            push_word(&mut word, &mut words);
            push_line(&mut words, (ll, lb, lr, lt), page_h, &mut cells);
            (ll, lb, lr, lt) = (
                f32::INFINITY,
                f32::INFINITY,
                f32::NEG_INFINITY,
                f32::NEG_INFINITY,
            );
            line_h = 0.0;
        } else if new_word {
            push_word(&mut word, &mut words);
        }
        word.push(g.ch);
        // Grow the line's native bounding box by this glyph's tight box.
        ll = ll.min(g.l);
        lb = lb.min(g.b);
        lr = lr.max(g.r);
        lt = lt.max(g.t);
        line_h = line_h.max(h);
        prev = Some(g);
    }
    // Flush whatever is left as the final line.
    push_word(&mut word, &mut words);
    push_line(&mut words, (ll, lb, lr, lt), page_h, &mut cells);
    cells
}
843
844/// Code line cells from the **parser**'s glyph stream. Unlike pdfium — whose
845/// monospace listings carry explicit space glyphs (so [`Grouping::CodeSpaceOnly`]
846/// keeps their spacing) — the parser emits no space glyphs: a source space is a
847/// positioning gap. So code cells use [`Grouping::CodeGap`], which splits on the
848/// inter-glyph gap (a space wherever it exceeds ~0.25× the line height) but never
849/// glues punctuation, so `et al. 2000` keeps its space while `add(a,` / `b)` stay
850/// joined. The parser's clean advance boxes make the gap heuristic reliable here,
851/// where pdfium's overhanging loose boxes would over-split (`f un c t i o n`).
852pub(crate) fn code_cells_from_glyphs(gs: &[Glyph], page_h: f32) -> Vec<TextCell> {
853 lines_from_glyphs(gs, page_h, Grouping::CodeGap)
854}
855
856/// Per-word cells (each word's text + top-left bbox), using the same word/line
857/// splitting as [`lines_from_glyphs`] but emitting one cell per word instead of
858/// joining into lines — the legacy gap-heuristic word grouping, kept for the
859/// pdfium word path (`DOCLING_PDFIUM_WORDS`). The default parser path uses
860/// [`crate::dp_lines::word_cells`] instead.
861pub(crate) fn words_from_glyphs(gs: &[Glyph], page_h: f32) -> Vec<TextCell> {
862 let mut cells = Vec::new();
863 let mut word = String::new();
864 let inf = (
865 f32::INFINITY,
866 f32::INFINITY,
867 f32::NEG_INFINITY,
868 f32::NEG_INFINITY,
869 );
870 let (mut wl, mut wb, mut wr, mut wt) = inf;
871 let mut line_h: f32 = 0.0;
872 let mut prev: Option<&Glyph> = None;
873 let mut pending_space = false;
874 for g in gs {
875 if g.ch == ' ' {
876 pending_space = true;
877 continue;
878 }
879 let h = (g.t - g.b).abs().max(1.0);
880 let mut new_line = false;
881 let mut new_word = false;
882 if let Some(p) = prev {
883 // LTR wraps reset x leftward (`g.l < p.r`); RTL (Arabic) wraps reset
884 // rightward (the new line begins at the far right). A large drop
885 // (≥1.5× line height) is a new line regardless of x.
886 let x_reset = if is_arabic(g.ch) || is_arabic(p.ch) {
887 g.l > p.r
888 } else {
889 g.l < p.r
890 };
891 new_line = (p.b - g.b > h * 0.5 && x_reset) || (p.b - g.b > line_h.max(h) * 1.5);
892 // No digit-digit glue here (unlike the prose grouping): table cells in
893 // adjacent columns are numeric and a column gap must still split them
894 // (`0.965` `0.934`, not `0.9650.934`). Intra-number digits have no gap
895 // so they stay together regardless.
896 let glued = is_close_punct(g.ch)
897 || is_open_punct(p.ch)
898 || (p.ch == '.'
899 && !pending_space
900 && (g.ch.is_ascii_digit() || g.ch.is_ascii_lowercase()));
901 let word_gap = line_h.max(h) * 0.25;
902 new_word = new_line || ((pending_space || g.l - p.r > word_gap) && !glued);
903 }
904 pending_space = false;
905 if new_word && !word.is_empty() {
906 cells.push(TextCell {
907 text: std::mem::take(&mut word),
908 l: wl,
909 t: page_h - wt,
910 r: wr,
911 b: page_h - wb,
912 });
913 (wl, wb, wr, wt) = inf;
914 }
915 if new_line {
916 line_h = 0.0;
917 }
918 word.push(g.ch);
919 wl = wl.min(g.l);
920 wb = wb.min(g.b);
921 wr = wr.max(g.r);
922 wt = wt.max(g.t);
923 line_h = line_h.max(h);
924 prev = Some(g);
925 }
926 if !word.is_empty() {
927 cells.push(TextCell {
928 text: word,
929 l: wl,
930 t: page_h - wt,
931 r: wr,
932 b: page_h - wb,
933 });
934 }
935 cells
936}
937
/// True when `c` lies in the basic Arabic Unicode block (U+0600–U+06FF,
/// inclusive). Other Arabic-script blocks are not covered.
fn is_arabic(c: char) -> bool {
    matches!(c, '\u{0600}'..='\u{06FF}')
}
941
/// True for punctuation that stays attached to the glyph before it: the word
/// grouping never starts a new word on one of these characters.
fn is_close_punct(c: char) -> bool {
    // NOTE(review): U+2018 is the *left* (opening) single quotation mark, yet it
    // is listed with the closers — confirm this is intentional.
    const CLOSERS: [char; 12] = [
        ',', '.', ';', '!', '?', ')', ']', '}', '%', '\'', '\u{2019}', '\u{2018}',
    ];
    CLOSERS.contains(&c)
}
948
/// True for punctuation that the *following* glyph stays attached to: the word
/// grouping never starts a new word right after one of these characters.
fn is_open_punct(c: char) -> bool {
    // `@` glues to what follows (`mAP @0.5`, `bpf@zurich`, `@decorator`).
    const OPENERS: [char; 4] = ['(', '[', '{', '@'];
    OPENERS.contains(&c)
}
953
/// Moves the word being built onto the end of `words`, leaving `word` empty for
/// the next one. Does nothing when `word` is empty, so no empty entries are
/// ever appended.
fn push_word(word: &mut String, words: &mut Vec<String>) {
    if word.is_empty() {
        return;
    }
    let finished = std::mem::take(word);
    words.push(finished);
}
959
960fn push_line(
961 words: &mut Vec<String>,
962 bbox: (f32, f32, f32, f32),
963 page_h: f32,
964 cells: &mut Vec<TextCell>,
965) {
966 if words.is_empty() {
967 return;
968 }
969 let text = std::mem::take(words).join(" ");
970 let (l, b, r, t) = bbox;
971 cells.push(TextCell {
972 text,
973 l,
974 t: page_h - t,
975 r,
976 b: page_h - b,
977 });
978}