fleischwolf_pdf/pdfium_backend.rs
1//! pdfium-based text extraction and page rendering.
2//!
3//! Text is reconstructed the way docling's `docling-parse` does it, so the
4//! output spacing matches the groundtruth: the page's **character** stream is
5//! grouped into **words** (split at a horizontal gap wider than a fraction of
6//! the font height — font-relative, so letter-tracking in display titles does
7//! not split a word) and words into **lines** (by baseline). pdfium-render's
8//! safe API only exposes whole style runs / `GetBoundedText`, so the character
9//! loop is driven through the raw `PdfiumLibraryBindings` FFI on a second handle
10//! to the same bytes (no fork; stays publishable).
11
12use image::RgbImage;
13use pdfium_render::prelude::*;
14
/// A piece of extracted text together with its bounding box.
///
/// Coordinates are PDF points measured from the page's **top-left** corner.
/// pdfium reports boxes with a bottom-left origin; producers in this module
/// flip the vertical axis (`page_height - y`) so the box lines up with
/// docling's `BoundingBox(..., origin=TOPLEFT)`.
#[derive(Clone, Debug)]
pub struct TextCell {
    /// The cell's text content.
    pub text: String,
    /// Left edge (x of the left side).
    pub l: f32,
    /// Top edge, measured downward from the top of the page.
    pub t: f32,
    /// Right edge (x of the right side).
    pub r: f32,
    /// Bottom edge, measured downward from the top of the page.
    pub b: f32,
}
26
/// Rendering resolution for page images, in pixels per PDF point.
///
/// Layout does not depend on this value (it scales normalized boxes by the
/// page's point size); the extra resolution is there for OCR's benefit.
pub const RENDER_SCALE: f32 = 2.0_f32;
31
32/// One page's geometry, extracted text cells, and a rendered RGB image. The
33/// image is rendered at [`RENDER_SCALE`] pixels per PDF point; `image px =
34/// page point × scale`.
35#[derive(Clone)]
36pub struct PdfPage {
37 pub width: f32,
38 pub height: f32,
39 pub scale: f32,
40 pub cells: Vec<TextCell>,
41 /// Same text grouped for code regions: split only at pdfium space glyphs, so
42 /// monospace runs keep their source spacing instead of the prose heuristic's.
43 pub code_cells: Vec<TextCell>,
44 /// Per-word cells (one per word, not joined into lines) for TableFormer cell
45 /// matching.
46 pub word_cells: Vec<TextCell>,
47 pub image: RgbImage,
48 /// Hyperlink annotations on the page (rect in top-left page coords + target
49 /// URI), restricted to web/mail/tel schemes. Used only by strict Markdown.
50 pub links: Vec<LinkAnnot>,
51}
52
/// A link annotation taken from a PDF page.
///
/// The rectangle uses the same top-left page coordinates as [`TextCell`].
#[derive(Clone, Debug)]
pub struct LinkAnnot {
    /// Left edge of the link rectangle.
    pub l: f32,
    /// Top edge, measured downward from the top of the page.
    pub t: f32,
    /// Right edge of the link rectangle.
    pub r: f32,
    /// Bottom edge, measured downward from the top of the page.
    pub b: f32,
    /// Target URI of the link.
    pub uri: String,
}
63
64/// A parsed PDF: per-page text cells and page images.
65pub struct PdfDocument {
66 pub pages: Vec<PdfPage>,
67}
68
/// Whether to use the docling-parse line sanitizer ([`crate::dp_lines`]) for prose
/// reconstruction — the default. Set `DOCLING_LEGACY_LINES` (to any value) to
/// fall back to the older gap-heuristic `lines_from_glyphs`.
///
/// The variable is checked for *presence* only, via `var_os`, so a value that
/// is not valid Unicode still counts as "set". (`std::env::var` reports such a
/// value as an error, which would silently ignore the override.)
pub(crate) fn use_dp_lines() -> bool {
    std::env::var_os("DOCLING_LEGACY_LINES").is_none()
}
78
/// Whether to source **word** cells from the pure-Rust parser (roadmap item 6),
/// the default. The parser's `word_cells` reproduce docling-parse's word grouping
/// byte-for-byte — the per-word tokens TableFormer matches table-grid cells
/// against — which moves table extraction closer to docling on the heavy
/// multi-column fixtures. Set `DOCLING_PDFIUM_WORDS` to keep pdfium's word cells,
/// or `DOCLING_PDFIUM_TEXT` to fall back to pdfium for all text.
///
/// Both variables are checked for *presence* only, via `var_os`, so a value
/// that is not valid Unicode still counts as "set".
pub(crate) fn use_parser_words() -> bool {
    ["DOCLING_PDFIUM_WORDS", "DOCLING_PDFIUM_TEXT"]
        .iter()
        .all(|name| std::env::var_os(name).is_none())
}
88
/// Whether to source **code** cells from the parser too (the default) — the last
/// text layer to leave pdfium, fully retiring its text path. The parser's
/// gap-based code grouping ([`code_cells_from_glyphs`]) reconstructs monospace
/// spacing from positioning gaps (`function add(a, b) { … }`), so it no longer
/// drops the inter-token spaces the old space-glyph-only grouping lost
/// (`functionadd`). Reverts to pdfium with `DOCLING_PDFIUM_WORDS` (alongside word
/// cells) or `DOCLING_PDFIUM_TEXT` (all text).
///
/// Both variables are checked for *presence* only, via `var_os`, so a value
/// that is not valid Unicode still counts as "set".
pub(crate) fn use_parser_code() -> bool {
    ["DOCLING_PDFIUM_WORDS", "DOCLING_PDFIUM_TEXT"]
        .iter()
        .all(|name| std::env::var_os(name).is_none())
}
99
100fn bind() -> Result<Pdfium, PdfiumError> {
101 if let Ok(path) = std::env::var("PDFIUM_DYNAMIC_LIB_PATH") {
102 let name = Pdfium::pdfium_platform_library_name_at_path(&path);
103 if let Ok(b) = Pdfium::bind_to_library(&name) {
104 return Ok(Pdfium::new(b));
105 }
106 if let Ok(b) = Pdfium::bind_to_library(&path) {
107 return Ok(Pdfium::new(b));
108 }
109 }
110 Pdfium::bind_to_system_library().map(Pdfium::new)
111}
112
113impl PdfDocument {
114 /// Parse a PDF from bytes, optionally decrypting with `password`.
115 ///
116 /// Note: this materialises **every** page's rendered bitmap in memory at
117 /// once. For large documents prefer [`for_each_page`], which streams.
118 pub fn open(bytes: &[u8], password: Option<&str>) -> Result<Self, PdfiumError> {
119 let pdfium = bind()?;
120 let ffi = FfiText::load(pdfium.bindings(), bytes, password);
121 let doc = pdfium.load_pdf_from_byte_slice(bytes, password)?;
122 let mut rust = rust_parser_cells(bytes);
123 let mut pages = Vec::new();
124 for (i, page) in doc.pages().iter().enumerate() {
125 let rc = rust.as_mut().and_then(|v| v.get_mut(i).map(std::mem::take));
126 pages.push(extract_page(&page, &ffi, i as i32, rc)?);
127 }
128 Ok(PdfDocument { pages })
129 }
130}
131
132/// Per-page prose line cells from the pure-Rust text parser. This is the
133/// **default** text layer (it matches docling-parse's char geometry and is a
134/// strict improvement on byte-conformance — e.g. it recovers the Arabic
135/// sentence-period attachment in `right_to_left_01`). Set `DOCLING_PDFIUM_TEXT`
136/// to fall back to pdfium's text layer. The parser returns an empty page when a
137/// PDF (or a page) has no parseable text layer; the caller keeps pdfium's cells
138/// in that case, so scanned/edge-case pages are unaffected.
139fn rust_parser_cells(bytes: &[u8]) -> Option<Vec<crate::textparse::PageParserCells>> {
140 if std::env::var("DOCLING_PDFIUM_TEXT").is_ok() {
141 return None;
142 }
143 Some(crate::timing::timed("textparse", || {
144 crate::textparse::pdf_all_cells(bytes)
145 }))
146}
147
148/// Number of pages in a PDF, without rendering any of them — used to decide
149/// whether a document is worth spinning up the parallel worker pool.
150pub fn page_count(bytes: &[u8], password: Option<&str>) -> Result<usize, PdfiumError> {
151 let pdfium = bind()?;
152 let doc = pdfium.load_pdf_from_byte_slice(bytes, password)?;
153 Ok(doc.pages().len() as usize)
154}
155
156/// Render + extract pages one at a time, handing each (owned) [`PdfPage`] to `f`.
157/// Only one page bitmap is resident at a time — a rendered page is ~5 MB, so a
158/// large PDF would otherwise hold gigabytes of bitmaps at once. `f` receives the
159/// zero-based page index and the total page count.
160///
161/// `E` is the caller's error type; pdfium errors convert into it via `From`.
162pub fn for_each_page<E, F>(bytes: &[u8], password: Option<&str>, mut f: F) -> Result<(), E>
163where
164 E: From<PdfiumError>,
165 F: FnMut(usize, usize, PdfPage) -> Result<(), E>,
166{
167 let pdfium = bind()?;
168 let ffi = FfiText::load(pdfium.bindings(), bytes, password);
169 let doc = pdfium.load_pdf_from_byte_slice(bytes, password)?;
170 let mut rust = rust_parser_cells(bytes);
171 let pages = doc.pages();
172 let total = pages.len() as usize;
173 for (i, page) in pages.iter().enumerate() {
174 let rc = rust.as_mut().and_then(|v| v.get_mut(i).map(std::mem::take));
175 let extracted = extract_page(&page, &ffi, i as i32, rc)?;
176 f(i, total, extracted)?;
177 }
178 Ok(())
179}
180
181fn extract_page(
182 page: &pdfium_render::prelude::PdfPage<'_>,
183 ffi: &FfiText<'_>,
184 index: i32,
185 rust_cells: Option<crate::textparse::PageParserCells>,
186) -> Result<PdfPage, PdfiumError> {
187 let width = page.width().value;
188 let height = page.height().value;
189
190 let (mut cells, mut code_cells, mut word_cells) =
191 crate::timing::timed("ffi.page_cells", || ffi.page_cells(index, height));
192 if cells.is_empty() {
193 cells = segment_cells(&page.text()?, height);
194 }
195 // Default: use the pure-Rust text parser instead of pdfium's text layer
196 // (override with `DOCLING_PDFIUM_TEXT`). Prose line cells always come from the
197 // parser; word and code cells do too unless `DOCLING_PDFIUM_WORDS` keeps them
198 // on pdfium (the parser's word grouping reproduces docling-parse's, which
199 // TableFormer matches against — roadmap item 6). A page the parser couldn't
200 // read (no text layer) keeps pdfium's cells.
201 if let Some(rc) = rust_cells {
202 if !rc.prose.is_empty() {
203 cells = rc.prose;
204 }
205 if use_parser_words() && !rc.words.is_empty() {
206 word_cells = rc.words;
207 }
208 if use_parser_code() && !rc.code.is_empty() {
209 code_cells = rc.code;
210 }
211 }
212
213 // docling renders at 1.5× the target scale and downsamples "to make it
214 // sharper" (pypdfium2 → PIL BICUBIC). Replicate exactly: the TableFormer
215 // model is pixel-sensitive, so the page bitmap must match byte-for-byte.
216 // `CatmullRom` is the same a=-0.5 cubic kernel as PIL's BICUBIC.
217 const SUPERSAMPLE: f32 = 1.5;
218 let tw = (width * RENDER_SCALE * SUPERSAMPLE).round().max(1.0) as i32;
219 let th = (height * RENDER_SCALE * SUPERSAMPLE).round().max(1.0) as i32;
220 let cfg = PdfRenderConfig::new()
221 .set_target_width(tw)
222 .set_target_height(th);
223 let big = crate::timing::timed("pdfium.render", || {
224 page.render_with_config(&cfg)
225 .map(|b| b.as_image().into_rgb8())
226 })?;
227 let dw = (width * RENDER_SCALE).round().max(1.0) as u32;
228 let dh = (height * RENDER_SCALE).round().max(1.0) as u32;
229 let image = crate::timing::timed("image.resize", || {
230 image::imageops::resize(&big, dw, dh, image::imageops::FilterType::CatmullRom)
231 });
232
233 Ok(PdfPage {
234 width,
235 height,
236 scale: RENDER_SCALE,
237 cells,
238 code_cells,
239 word_cells,
240 image,
241 links: extract_links(page, height),
242 })
243}
244
245/// Collect web/mail/tel hyperlink annotations on a page, mapping each link's
246/// rectangle into top-left page coordinates (like [`TextCell`]). `file://` and
247/// in-document destinations are skipped — only externally meaningful targets are
248/// rendered. pdfium occasionally lists a link twice; rects are kept as-is and the
249/// caller dedupes by resolved anchor text.
250fn extract_links(page: &pdfium_render::prelude::PdfPage<'_>, page_h: f32) -> Vec<LinkAnnot> {
251 let mut out = Vec::new();
252 for link in page.links().iter() {
253 let Some(uri) = link
254 .action()
255 .and_then(|a| a.as_uri_action().and_then(|u| u.uri().ok()))
256 else {
257 continue;
258 };
259 let scheme_ok = ["http://", "https://", "mailto:", "tel:"]
260 .iter()
261 .any(|s| uri.starts_with(s));
262 if !scheme_ok {
263 continue;
264 }
265 if let Ok(rect) = link.rect() {
266 out.push(LinkAnnot {
267 l: rect.left().value,
268 t: page_h - rect.top().value,
269 r: rect.right().value,
270 b: page_h - rect.bottom().value,
271 uri,
272 });
273 }
274 }
275 out
276}
277
278/// Fallback line cells from pdfium-render's style segments (one cell per
279/// segment). Used only when the raw-FFI text page can't be loaded.
280fn segment_cells(text: &PdfPageText, page_h: f32) -> Vec<TextCell> {
281 text.segments()
282 .iter()
283 .filter_map(|seg| {
284 let s = seg.text();
285 if s.trim().is_empty() {
286 return None;
287 }
288 let r = seg.bounds();
289 Some(TextCell {
290 text: s,
291 l: r.left().value,
292 t: page_h - r.top().value,
293 r: r.right().value,
294 b: page_h - r.bottom().value,
295 })
296 })
297 .collect()
298}
299
300/// A second, raw-FFI handle on the same PDF used to drive the character loop
301/// (`FPDFText_GetUnicode`/`GetCharBox`) that pdfium-render's safe API doesn't
302/// expose. Closes the document on drop.
303struct FfiText<'a> {
304 bindings: &'a dyn PdfiumLibraryBindings,
305 doc: FPDF_DOCUMENT,
306}
307
/// A single glyph: its code point plus two boxes in pdfium's native (y-up)
/// coordinates.
///
/// `l/b/r/t` is pdfium's *tight* ink box, used by the legacy
/// `lines_from_glyphs`. `ll/lb/lr/lt` is the *loose* box (font ascent/descent
/// + advance — uniform per font/size), which the docling-parse-style
/// sanitizer needs so that adjacent glyphs share a top edge.
pub(crate) struct Glyph {
    /// The glyph's character.
    pub(crate) ch: char,
    /// Tight box: left edge.
    pub(crate) l: f32,
    /// Tight box: bottom edge.
    pub(crate) b: f32,
    /// Tight box: right edge.
    pub(crate) r: f32,
    /// Tight box: top edge.
    pub(crate) t: f32,
    /// Loose box: left edge.
    pub(crate) ll: f32,
    /// Loose box: bottom edge.
    pub(crate) lb: f32,
    /// Loose box: right edge.
    pub(crate) lr: f32,
    /// Loose box: top edge.
    pub(crate) lt: f32,
    /// Hash of the PDF font name + flags (0 when not fetched). The sanitizer uses
    /// it for docling-parse's `enforce_same_font` (keeps a bold label and regular
    /// value as separate line cells, e.g. `LABEL : value`).
    pub(crate) font: u64,
}
327
328impl<'a> FfiText<'a> {
329 fn load(bindings: &'a dyn PdfiumLibraryBindings, bytes: &[u8], password: Option<&str>) -> Self {
330 let doc = bindings.FPDF_LoadMemDocument(bytes, password);
331 FfiText { bindings, doc }
332 }
333
334 /// Reconstruct line cells for page `index` (zero-based) via the
335 /// chars→words→lines grouping. Returns `(prose_cells, code_cells)` — the same
336 /// glyphs grouped two ways (gap-heuristic for prose, space-glyph-only for
337 /// code). Both empty on any failure (caller falls back).
338 fn page_cells(&self, index: i32, page_h: f32) -> (Vec<TextCell>, Vec<TextCell>, Vec<TextCell>) {
339 let empty = || (Vec::new(), Vec::new(), Vec::new());
340 if self.doc.is_null() {
341 return empty();
342 }
343 let b = self.bindings;
344 let page = b.FPDF_LoadPage(self.doc, index);
345 if page.is_null() {
346 return empty();
347 }
348 let tp = b.FPDFText_LoadPage(page);
349 let out = if tp.is_null() {
350 empty()
351 } else {
352 let dp = use_dp_lines();
353 let g = glyphs(b, tp, dp);
354 b.FPDFText_ClosePage(tp);
355 // Prose line cells: the docling-parse-style sanitizer (behind a flag
356 // while it's validated) or the legacy gap-heuristic reconstruction.
357 let prose = if dp {
358 crate::dp_lines::line_cells(&g, page_h, false)
359 } else {
360 lines_from_glyphs(&g, page_h, Grouping::Prose)
361 };
362 (
363 prose,
364 lines_from_glyphs(&g, page_h, Grouping::CodeSpaceOnly),
365 words_from_glyphs(&g, page_h),
366 )
367 };
368 b.FPDF_ClosePage(page);
369 out
370 }
371}
372
373impl Drop for FfiText<'_> {
374 fn drop(&mut self) {
375 if !self.doc.is_null() {
376 self.bindings.FPDF_CloseDocument(self.doc);
377 }
378 }
379}
380
381/// Read every glyph (codepoint + native box) from the text page, in document
382/// order. A space glyph is kept as a word-boundary marker (NaN box, char `' '`);
383/// pdfium emits these on most lines and they pin word splits exactly. Hard line
384/// breaks are dropped (line structure comes from geometry); the gap heuristic in
385/// [`lines_from_glyphs`] is the fallback for the lines pdfium leaves space-less.
386/// Debug helper: the raw pdfium glyph stream (codepoint + native bottom-left
387/// box) for a page, in pdfium's character order. For comparing against
388/// docling-parse's char cells.
389pub fn debug_glyphs(bytes: &[u8], index: i32) -> Vec<(char, f32, f32)> {
390 let Ok(pdfium) = bind() else {
391 return Vec::new();
392 };
393 let ffi = FfiText::load(pdfium.bindings(), bytes, None);
394 if ffi.doc.is_null() {
395 return Vec::new();
396 }
397 let b = ffi.bindings;
398 let page = b.FPDF_LoadPage(ffi.doc, index);
399 if page.is_null() {
400 return Vec::new();
401 }
402 let tp = b.FPDFText_LoadPage(page);
403 let mut out = Vec::new();
404 if !tp.is_null() {
405 for g in glyphs(b, tp, true) {
406 out.push((g.ch, g.ll, g.lr));
407 }
408 b.FPDFText_ClosePage(tp);
409 }
410 b.FPDF_ClosePage(page);
411 out
412}
413
414/// Hash a glyph's PDF font name + flags, for `enforce_same_font`. 0 if unavailable.
415fn font_hash(b: &dyn PdfiumLibraryBindings, tp: FPDF_TEXTPAGE, i: i32) -> u64 {
416 use std::hash::{Hash, Hasher};
417 let mut flags: std::os::raw::c_int = 0;
418 let len = b.FPDFText_GetFontInfo(tp, i, std::ptr::null_mut(), 0, &mut flags);
419 if len == 0 {
420 return 0;
421 }
422 let mut buf = vec![0u8; len as usize];
423 b.FPDFText_GetFontInfo(
424 tp,
425 i,
426 buf.as_mut_ptr() as *mut std::os::raw::c_void,
427 len,
428 &mut flags,
429 );
430 let mut h = std::collections::hash_map::DefaultHasher::new();
431 buf.hash(&mut h);
432 flags.hash(&mut h);
433 h.finish()
434}
435
436fn glyphs(b: &dyn PdfiumLibraryBindings, tp: FPDF_TEXTPAGE, fetch_font: bool) -> Vec<Glyph> {
437 let n = b.FPDFText_CountChars(tp);
438 let mut out = Vec::with_capacity(n.max(0) as usize);
439 for i in 0..n {
440 let ch = match char::from_u32(b.FPDFText_GetUnicode(tp, i)) {
441 Some(c) => c,
442 None => continue,
443 };
444 if ch == '\r' || ch == '\n' {
445 continue;
446 }
447 // Spaces are font-neutral (0): pdfium's generated spaces carry a default
448 // font that would otherwise block every word↔space merge under
449 // enforce_same_font; docling-parse's spaces inherit the run's font.
450 let font = if fetch_font && !ch.is_whitespace() {
451 font_hash(b, tp, i)
452 } else {
453 0
454 };
455 let (mut l, mut r, mut bot, mut top) = (0f64, 0f64, 0f64, 0f64);
456 let has_box = b.FPDFText_GetCharBox(tp, i, &mut l, &mut r, &mut bot, &mut top) != 0;
457 // Loose box: font ascent/descent + glyph advance, uniform per font/size.
458 let mut lr = FS_RECTF {
459 left: 0.0,
460 top: 0.0,
461 right: 0.0,
462 bottom: 0.0,
463 };
464 let (ll, lb, lrt, ltop) = if b.FPDFText_GetLooseCharBox(tp, i, &mut lr) != 0 {
465 (lr.left, lr.bottom, lr.right, lr.top)
466 } else if has_box {
467 (l as f32, bot as f32, r as f32, top as f32)
468 } else {
469 (f32::NAN, 0.0, 0.0, 0.0)
470 };
471 if ch.is_whitespace() {
472 // Keep the space *with its box* (the docling-parse-style line sanitizer
473 // needs literal space glyphs); NaN `l` if pdfium reports no box (the
474 // legacy `lines_from_glyphs` ignores the box and only flags a space).
475 out.push(Glyph {
476 ch: ' ',
477 l: if has_box { l as f32 } else { f32::NAN },
478 b: if has_box { bot as f32 } else { 0.0 },
479 r: if has_box { r as f32 } else { 0.0 },
480 t: if has_box { top as f32 } else { 0.0 },
481 ll,
482 lb,
483 lr: lrt,
484 lt: ltop,
485 font,
486 });
487 continue;
488 }
489 if !has_box {
490 continue;
491 }
492 out.push(Glyph {
493 ch,
494 l: l as f32,
495 b: bot as f32,
496 r: r as f32,
497 t: top as f32,
498 ll,
499 lb,
500 lr: lrt,
501 lt: ltop,
502 font,
503 });
504 }
505 // pdfium splits the Arabic lam-alef ligature into two chars at the *same* x
506 // (it's one glyph) in visual order — `alef-variant, lam`. docling-parse and
507 // logical order are `lam, alef-variant`. Detect the ligature by the shared x
508 // and swap. The shared-x test reliably distinguishes a true ligature from a
509 // genuine `alef + lam` sequence (the article `ال`, or `فعالة`), whose two
510 // glyphs sit at different x and must NOT be reordered.
511 for i in 0..out.len().saturating_sub(1) {
512 let same_x = out[i].l.is_finite()
513 && out[i + 1].l.is_finite()
514 && (out[i].l - out[i + 1].l).abs() < 1.0;
515 if same_x
516 && matches!(out[i].ch, '\u{0622}' | '\u{0623}' | '\u{0625}' | '\u{0627}')
517 && out[i + 1].ch == '\u{0644}'
518 {
519 out.swap(i, i + 1);
520 }
521 }
522 // Reconstruct degenerate (zero-width) loose space boxes by spanning the gap to
523 // the next glyph on the same line, so the sanitizer keeps them as word
524 // separators rather than dropping them (which would merge `Information systems`
525 // → `Informationsystems`). pdfium gives generated spaces a zero-width box at a
526 // wrong baseline; a wrap (different baseline) or a touching gap is left alone.
527 for i in 0..out.len() {
528 if out[i].ch != ' ' || (out[i].lr - out[i].ll).abs() >= 0.5 {
529 continue;
530 }
531 let prev = out[..i]
532 .iter()
533 .rev()
534 .find(|g| g.ch != ' ' && g.ll.is_finite())
535 .map(|g| (g.lr, g.lb, g.lt));
536 let next = out[i + 1..]
537 .iter()
538 .find(|g| g.ch != ' ' && g.ll.is_finite())
539 .map(|g| (g.ll, g.lb));
540 if let (Some((plr, plb, plt)), Some((nll, nlb))) = (prev, next) {
541 let line_h = (plt - plb).abs().max(1.0);
542 if (plb - nlb).abs() < line_h * 0.5 && nll > plr + 0.5 {
543 out[i].ll = plr;
544 out[i].lr = nll;
545 out[i].lb = plb;
546 out[i].lt = plt;
547 }
548 }
549 }
550 out
551}
552
/// The word-splitting policy [`lines_from_glyphs`] applies within a line.
#[derive(Clone, Copy, PartialEq)]
enum Grouping {
    /// Prose: split on a gap or a space glyph, but keep punctuation glued
    /// (`engines,`, `[37`, `98.5`).
    Prose,
    /// pdfium code cells: split only at literal space glyphs, never glue.
    /// pdfium's monospace listings carry a real space glyph at every source
    /// space, and its overhanging loose boxes would make the gap heuristic
    /// over-split (`f un c t i o n`), so honouring just the spaces reproduces
    /// the spacing.
    CodeSpaceOnly,
    /// Parser code cells: split on the inter-glyph **gap** (or a space glyph),
    /// never glue. The parser emits no space glyphs (a source space is a
    /// positioning gap), and its clean advance boxes make the gap reliable.
    /// Unlike [`Grouping::Prose`] there is no punctuation glue, so a real gap
    /// always splits (`et al. 2000`, not `et al.2000`) while genuinely touching
    /// tokens stay joined (`add(a,` / `b)`).
    CodeGap,
}
571
572/// Group glyphs (document order) into words then lines, the way docling-parse
573/// does: a new **word** starts where the horizontal gap to the previous glyph
574/// exceeds ~0.2 × the font height (a real space is ~0.3 × height; letter
575/// tracking is smaller, so titles don't shatter); a new **line** starts where
576/// the baseline drops by ~half the font height (a superscript rises without
577/// dropping, so it stays on its line). Coordinates are flipped to top-left.
578/// See [`Grouping`] for how each mode decides word boundaries.
579fn lines_from_glyphs(gs: &[Glyph], page_h: f32, mode: Grouping) -> Vec<TextCell> {
580 let mut cells: Vec<TextCell> = Vec::new();
581 let mut words: Vec<String> = Vec::new(); // words on the current line
582 let mut word = String::new();
583 // current line bounding box, native
584 let (mut ll, mut lb, mut lr, mut lt) = (
585 f32::INFINITY,
586 f32::INFINITY,
587 f32::NEG_INFINITY,
588 f32::NEG_INFINITY,
589 );
590 // Tallest glyph seen on the current line: the word-gap threshold is relative
591 // to it, so a small-font run on the line (a superscript citation) isn't split
592 // at its tight digit gaps, while a big display title isn't split at its wider
593 // letter tracking. A real inter-word space is ~0.3× the font height.
594 let mut line_h: f32 = 0.0;
595 let mut prev: Option<&Glyph> = None;
596 // A space glyph between non-space glyphs pins a word split the gap heuristic
597 // can miss (tight justified spacing); it carries no geometry.
598 let mut pending_space = false;
599
600 for g in gs {
601 if g.ch == ' ' {
602 pending_space = true;
603 continue;
604 }
605 let h = (g.t - g.b).abs().max(1.0);
606 let (mut new_word, mut new_line) = (false, false);
607 if let Some(p) = prev {
608 // A new line drops the baseline *and* resets x leftward; requiring the
609 // x-reset avoids a descending comma/semicolon faking a line break. A
610 // *large* drop (≥1.5× the line height — a skipped line, e.g. a centered
611 // page-number footer below a short last word) is always a new line,
612 // even without the x-reset.
613 // LTR wraps reset x leftward (`g.l < p.r`); RTL (Arabic) wraps reset
614 // rightward (the new line begins at the far right). A large drop
615 // (≥1.5× line height) is a new line regardless of x.
616 let x_reset = if is_arabic(g.ch) || is_arabic(p.ch) {
617 g.l > p.r
618 } else {
619 g.l < p.r
620 };
621 new_line = (p.b - g.b > h * 0.5 && x_reset) || (p.b - g.b > line_h.max(h) * 1.5);
622 // Don't split before closing punctuation, after opening punctuation, or
623 // after a period that runs into a digit/lowercase letter — docling
624 // keeps `engines,` / `[37` / `i.e.` / `98.5` together even across a
625 // space or gap.
626 let glued = is_close_punct(g.ch)
627 || is_open_punct(p.ch)
628 || (p.ch.is_ascii_digit() && g.ch.is_ascii_digit())
629 || (p.ch == '.'
630 && !pending_space
631 && (g.ch.is_ascii_digit() || g.ch.is_ascii_lowercase()));
632 let word_gap = line_h.max(h) * 0.25;
633 new_word = if mode == Grouping::CodeSpaceOnly {
634 new_line || pending_space
635 } else if mode == Grouping::CodeGap {
636 // Gap-based, no glue: a real gap always splits, touching tokens join.
637 new_line || pending_space || g.l - p.r > word_gap
638 } else if is_arabic(g.ch) || is_arabic(p.ch) {
639 // RTL runs right-to-left, so the inter-word gap is `p.l - g.r`. A
640 // real word space has a gap; pdfium also emits spurious zero-gap
641 // space glyphs inside words (`التي`), so require the gap rather
642 // than trusting a bare space glyph.
643 new_line || (p.l - g.r > word_gap && !glued)
644 } else {
645 new_line || ((pending_space || g.l - p.r > word_gap) && !glued)
646 };
647 }
648 pending_space = false;
649 if new_line {
650 push_word(&mut word, &mut words);
651 push_line(&mut words, (ll, lb, lr, lt), page_h, &mut cells);
652 (ll, lb, lr, lt) = (
653 f32::INFINITY,
654 f32::INFINITY,
655 f32::NEG_INFINITY,
656 f32::NEG_INFINITY,
657 );
658 line_h = 0.0;
659 } else if new_word {
660 push_word(&mut word, &mut words);
661 }
662 word.push(g.ch);
663 ll = ll.min(g.l);
664 lb = lb.min(g.b);
665 lr = lr.max(g.r);
666 lt = lt.max(g.t);
667 line_h = line_h.max(h);
668 prev = Some(g);
669 }
670 push_word(&mut word, &mut words);
671 push_line(&mut words, (ll, lb, lr, lt), page_h, &mut cells);
672 cells
673}
674
675/// Code line cells from the **parser**'s glyph stream. Unlike pdfium — whose
676/// monospace listings carry explicit space glyphs (so [`Grouping::CodeSpaceOnly`]
677/// keeps their spacing) — the parser emits no space glyphs: a source space is a
678/// positioning gap. So code cells use [`Grouping::CodeGap`], which splits on the
679/// inter-glyph gap (a space wherever it exceeds ~0.25× the line height) but never
680/// glues punctuation, so `et al. 2000` keeps its space while `add(a,` / `b)` stay
681/// joined. The parser's clean advance boxes make the gap heuristic reliable here,
682/// where pdfium's overhanging loose boxes would over-split (`f un c t i o n`).
683pub(crate) fn code_cells_from_glyphs(gs: &[Glyph], page_h: f32) -> Vec<TextCell> {
684 lines_from_glyphs(gs, page_h, Grouping::CodeGap)
685}
686
687/// Per-word cells (each word's text + top-left bbox), using the same word/line
688/// splitting as [`lines_from_glyphs`] but emitting one cell per word instead of
689/// joining into lines — the legacy gap-heuristic word grouping, kept for the
690/// pdfium word path (`DOCLING_PDFIUM_WORDS`). The default parser path uses
691/// [`crate::dp_lines::word_cells`] instead.
692pub(crate) fn words_from_glyphs(gs: &[Glyph], page_h: f32) -> Vec<TextCell> {
693 let mut cells = Vec::new();
694 let mut word = String::new();
695 let inf = (
696 f32::INFINITY,
697 f32::INFINITY,
698 f32::NEG_INFINITY,
699 f32::NEG_INFINITY,
700 );
701 let (mut wl, mut wb, mut wr, mut wt) = inf;
702 let mut line_h: f32 = 0.0;
703 let mut prev: Option<&Glyph> = None;
704 let mut pending_space = false;
705 for g in gs {
706 if g.ch == ' ' {
707 pending_space = true;
708 continue;
709 }
710 let h = (g.t - g.b).abs().max(1.0);
711 let mut new_line = false;
712 let mut new_word = false;
713 if let Some(p) = prev {
714 // LTR wraps reset x leftward (`g.l < p.r`); RTL (Arabic) wraps reset
715 // rightward (the new line begins at the far right). A large drop
716 // (≥1.5× line height) is a new line regardless of x.
717 let x_reset = if is_arabic(g.ch) || is_arabic(p.ch) {
718 g.l > p.r
719 } else {
720 g.l < p.r
721 };
722 new_line = (p.b - g.b > h * 0.5 && x_reset) || (p.b - g.b > line_h.max(h) * 1.5);
723 // No digit-digit glue here (unlike the prose grouping): table cells in
724 // adjacent columns are numeric and a column gap must still split them
725 // (`0.965` `0.934`, not `0.9650.934`). Intra-number digits have no gap
726 // so they stay together regardless.
727 let glued = is_close_punct(g.ch)
728 || is_open_punct(p.ch)
729 || (p.ch == '.'
730 && !pending_space
731 && (g.ch.is_ascii_digit() || g.ch.is_ascii_lowercase()));
732 let word_gap = line_h.max(h) * 0.25;
733 new_word = new_line || ((pending_space || g.l - p.r > word_gap) && !glued);
734 }
735 pending_space = false;
736 if new_word && !word.is_empty() {
737 cells.push(TextCell {
738 text: std::mem::take(&mut word),
739 l: wl,
740 t: page_h - wt,
741 r: wr,
742 b: page_h - wb,
743 });
744 (wl, wb, wr, wt) = inf;
745 }
746 if new_line {
747 line_h = 0.0;
748 }
749 word.push(g.ch);
750 wl = wl.min(g.l);
751 wb = wb.min(g.b);
752 wr = wr.max(g.r);
753 wt = wt.max(g.t);
754 line_h = line_h.max(h);
755 prev = Some(g);
756 }
757 if !word.is_empty() {
758 cells.push(TextCell {
759 text: word,
760 l: wl,
761 t: page_h - wt,
762 r: wr,
763 b: page_h - wb,
764 });
765 }
766 cells
767}
768
/// True when `c` lies in the Unicode Arabic block (U+0600–U+06FF). Other
/// Arabic-script blocks (supplements, presentation forms) are not covered.
fn is_arabic(c: char) -> bool {
    matches!(c, '\u{0600}'..='\u{06FF}')
}
772
/// True for punctuation that attaches to the text *before* it, so no word
/// split is made in front of it: `, . ; ! ? ) ] } %`, the ASCII apostrophe,
/// and the curly single quotes U+2019 / U+2018.
fn is_close_punct(c: char) -> bool {
    const CLOSERS: [char; 12] = [
        ',', '.', ';', '!', '?', ')', ']', '}', '%', '\'', '\u{2019}', '\u{2018}',
    ];
    CLOSERS.contains(&c)
}
779
/// True for punctuation that attaches to the text *after* it, so no word split
/// is made behind it: the opening brackets `( [ {` and `@`, which glues to
/// what follows (`mAP @0.5`, `bpf@zurich`, `@decorator`).
fn is_open_punct(c: char) -> bool {
    match c {
        '(' | '[' | '{' | '@' => true,
        _ => false,
    }
}
784
/// Move the word being built onto the current line's word list and leave
/// `word` empty for the next one. An empty `word` is ignored, so no empty
/// entries are ever added.
fn push_word(word: &mut String, words: &mut Vec<String>) {
    if word.is_empty() {
        return;
    }
    let finished = std::mem::take(word);
    words.push(finished);
}
790
791fn push_line(
792 words: &mut Vec<String>,
793 bbox: (f32, f32, f32, f32),
794 page_h: f32,
795 cells: &mut Vec<TextCell>,
796) {
797 if words.is_empty() {
798 return;
799 }
800 let text = std::mem::take(words).join(" ");
801 let (l, b, r, t) = bbox;
802 cells.push(TextCell {
803 text,
804 l,
805 t: page_h - t,
806 r,
807 b: page_h - b,
808 });
809}