fleischwolf_pdf/pdfium_backend.rs
1//! pdfium-based text extraction and page rendering.
2//!
3//! Text is reconstructed the way docling's `docling-parse` does it, so the
4//! output spacing matches the groundtruth: the page's **character** stream is
5//! grouped into **words** (split at a horizontal gap wider than a fraction of
6//! the font height — font-relative, so letter-tracking in display titles does
7//! not split a word) and words into **lines** (by baseline). pdfium-render's
8//! safe API only exposes whole style runs / `GetBoundedText`, so the character
9//! loop is driven through the raw `PdfiumLibraryBindings` FFI on a second handle
10//! to the same bytes (no fork; stays publishable).
11
12use image::RgbImage;
13use pdfium_render::prelude::*;
14
/// A piece of extracted text together with its bounding box.
///
/// Coordinates are PDF points measured from a **top-left** origin. pdfium
/// natively reports boxes from a bottom-left origin; they are flipped here so
/// a cell matches docling's `BoundingBox(..., origin=TOPLEFT)`.
#[derive(Clone, Debug)]
pub struct TextCell {
    /// The text content of the cell.
    pub text: String,
    /// Left edge (x), in points.
    pub l: f32,
    /// Top edge, measured down from the top of the page.
    pub t: f32,
    /// Right edge (x), in points.
    pub r: f32,
    /// Bottom edge, measured down from the top of the page.
    pub b: f32,
}
26
/// Rendering resolution for page images, in pixels per PDF point.
///
/// Layout does not depend on this value (it scales normalized boxes by the page
/// point size); the extra resolution is there for OCR.
pub const RENDER_SCALE: f32 = 2.0_f32;
31
32/// One page's geometry, extracted text cells, and a rendered RGB image. The
33/// image is rendered at [`RENDER_SCALE`] pixels per PDF point; `image px =
34/// page point × scale`.
35#[derive(Clone)]
36pub struct PdfPage {
37 pub width: f32,
38 pub height: f32,
39 pub scale: f32,
40 pub cells: Vec<TextCell>,
41 /// Same text grouped for code regions: split only at pdfium space glyphs, so
42 /// monospace runs keep their source spacing instead of the prose heuristic's.
43 pub code_cells: Vec<TextCell>,
44 /// Per-word cells (one per word, not joined into lines) for TableFormer cell
45 /// matching.
46 pub word_cells: Vec<TextCell>,
47 pub image: RgbImage,
48 /// Hyperlink annotations on the page (rect in top-left page coords + target
49 /// URI), restricted to web/mail/tel schemes. Used only by strict Markdown.
50 pub links: Vec<LinkAnnot>,
51}
52
/// A hyperlink annotation taken from a PDF page.
///
/// The rectangle uses the same top-left page coordinates as [`TextCell`].
#[derive(Clone, Debug)]
pub struct LinkAnnot {
    /// Left edge (x), in points.
    pub l: f32,
    /// Top edge, measured down from the top of the page.
    pub t: f32,
    /// Right edge (x), in points.
    pub r: f32,
    /// Bottom edge, measured down from the top of the page.
    pub b: f32,
    /// Target URI of the link.
    pub uri: String,
}
63
64/// A parsed PDF: per-page text cells and page images.
65pub struct PdfDocument {
66 pub pages: Vec<PdfPage>,
67}
68
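// NOTE: the next three lines describe `bind` (defined after `use_dp_lines`),
// not `use_dp_lines`; they are plain comments so rustdoc does not attach them
// to the wrong item.
// Bind to the pdfium dynamic library. Honors `PDFIUM_DYNAMIC_LIB_PATH` (a
// directory or file), else the directory of the current exe, else the system
// library — mirroring how a deployment ships `libpdfium` alongside the binary.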
/// Whether prose lines are reconstructed with the docling-parse line sanitizer
/// ([`crate::dp_lines`]) — the default.
///
/// Setting the `DOCLING_LEGACY_LINES` environment variable to any value (even
/// an empty or non-Unicode one) selects the older gap-heuristic
/// `lines_from_glyphs` instead.
pub(crate) fn use_dp_lines() -> bool {
    // `var_os` rather than `var`: `var` also returns `Err` for a value that is
    // set but not valid Unicode, which would silently ignore the opt-out.
    std::env::var_os("DOCLING_LEGACY_LINES").is_none()
}
78
79fn bind() -> Result<Pdfium, PdfiumError> {
80 if let Ok(path) = std::env::var("PDFIUM_DYNAMIC_LIB_PATH") {
81 let name = Pdfium::pdfium_platform_library_name_at_path(&path);
82 if let Ok(b) = Pdfium::bind_to_library(&name) {
83 return Ok(Pdfium::new(b));
84 }
85 if let Ok(b) = Pdfium::bind_to_library(&path) {
86 return Ok(Pdfium::new(b));
87 }
88 }
89 Pdfium::bind_to_system_library().map(Pdfium::new)
90}
91
92impl PdfDocument {
93 /// Parse a PDF from bytes, optionally decrypting with `password`.
94 ///
95 /// Note: this materialises **every** page's rendered bitmap in memory at
96 /// once. For large documents prefer [`for_each_page`], which streams.
97 pub fn open(bytes: &[u8], password: Option<&str>) -> Result<Self, PdfiumError> {
98 let pdfium = bind()?;
99 let ffi = FfiText::load(pdfium.bindings(), bytes, password);
100 let doc = pdfium.load_pdf_from_byte_slice(bytes, password)?;
101 let mut rust = rust_parser_cells(bytes);
102 let mut pages = Vec::new();
103 for (i, page) in doc.pages().iter().enumerate() {
104 let rc = rust.as_mut().and_then(|v| v.get_mut(i).map(std::mem::take));
105 pages.push(extract_page(&page, &ffi, i as i32, rc)?);
106 }
107 Ok(PdfDocument { pages })
108 }
109}
110
111/// Per-page prose line cells from the pure-Rust text parser. This is the
112/// **default** text layer (it matches docling-parse's char geometry and is a
113/// strict improvement on byte-conformance — e.g. it recovers the Arabic
114/// sentence-period attachment in `right_to_left_01`). Set `DOCLING_PDFIUM_TEXT`
115/// to fall back to pdfium's text layer. The parser returns an empty page when a
116/// PDF (or a page) has no parseable text layer; the caller keeps pdfium's cells
117/// in that case, so scanned/edge-case pages are unaffected.
118fn rust_parser_cells(bytes: &[u8]) -> Option<Vec<Vec<TextCell>>> {
119 if std::env::var("DOCLING_PDFIUM_TEXT").is_ok() {
120 return None;
121 }
122 Some(
123 crate::textparse::pdf_textlines(bytes)
124 .into_iter()
125 .map(|(_, _, cells)| cells)
126 .collect(),
127 )
128}
129
130/// Render + extract pages one at a time, handing each (owned) [`PdfPage`] to `f`.
131/// Only one page bitmap is resident at a time — a rendered page is ~5 MB, so a
132/// large PDF would otherwise hold gigabytes of bitmaps at once. `f` receives the
133/// zero-based page index and the total page count.
134///
135/// `E` is the caller's error type; pdfium errors convert into it via `From`.
136pub fn for_each_page<E, F>(bytes: &[u8], password: Option<&str>, mut f: F) -> Result<(), E>
137where
138 E: From<PdfiumError>,
139 F: FnMut(usize, usize, PdfPage) -> Result<(), E>,
140{
141 let pdfium = bind()?;
142 let ffi = FfiText::load(pdfium.bindings(), bytes, password);
143 let doc = pdfium.load_pdf_from_byte_slice(bytes, password)?;
144 let mut rust = rust_parser_cells(bytes);
145 let pages = doc.pages();
146 let total = pages.len() as usize;
147 for (i, page) in pages.iter().enumerate() {
148 let rc = rust.as_mut().and_then(|v| v.get_mut(i).map(std::mem::take));
149 let extracted = extract_page(&page, &ffi, i as i32, rc)?;
150 f(i, total, extracted)?;
151 }
152 Ok(())
153}
154
155fn extract_page(
156 page: &pdfium_render::prelude::PdfPage<'_>,
157 ffi: &FfiText<'_>,
158 index: i32,
159 rust_cells: Option<Vec<TextCell>>,
160) -> Result<PdfPage, PdfiumError> {
161 let width = page.width().value;
162 let height = page.height().value;
163
164 let (mut cells, code_cells, word_cells) = ffi.page_cells(index, height);
165 if cells.is_empty() {
166 cells = segment_cells(&page.text()?, height);
167 }
168 // Default: use the pure-Rust text parser's prose line cells instead of
169 // pdfium's (override with `DOCLING_PDFIUM_TEXT`). Word/code cells stay on
170 // pdfium so TableFormer cell-matching is unaffected.
171 if let Some(rc) = rust_cells {
172 if !rc.is_empty() {
173 cells = rc;
174 }
175 }
176
177 // docling renders at 1.5× the target scale and downsamples "to make it
178 // sharper" (pypdfium2 → PIL BICUBIC). Replicate exactly: the TableFormer
179 // model is pixel-sensitive, so the page bitmap must match byte-for-byte.
180 // `CatmullRom` is the same a=-0.5 cubic kernel as PIL's BICUBIC.
181 const SUPERSAMPLE: f32 = 1.5;
182 let tw = (width * RENDER_SCALE * SUPERSAMPLE).round().max(1.0) as i32;
183 let th = (height * RENDER_SCALE * SUPERSAMPLE).round().max(1.0) as i32;
184 let cfg = PdfRenderConfig::new()
185 .set_target_width(tw)
186 .set_target_height(th);
187 let bitmap = page.render_with_config(&cfg)?;
188 let big = bitmap.as_image().into_rgb8();
189 let dw = (width * RENDER_SCALE).round().max(1.0) as u32;
190 let dh = (height * RENDER_SCALE).round().max(1.0) as u32;
191 let image = image::imageops::resize(&big, dw, dh, image::imageops::FilterType::CatmullRom);
192
193 Ok(PdfPage {
194 width,
195 height,
196 scale: RENDER_SCALE,
197 cells,
198 code_cells,
199 word_cells,
200 image,
201 links: extract_links(page, height),
202 })
203}
204
205/// Collect web/mail/tel hyperlink annotations on a page, mapping each link's
206/// rectangle into top-left page coordinates (like [`TextCell`]). `file://` and
207/// in-document destinations are skipped — only externally meaningful targets are
208/// rendered. pdfium occasionally lists a link twice; rects are kept as-is and the
209/// caller dedupes by resolved anchor text.
210fn extract_links(page: &pdfium_render::prelude::PdfPage<'_>, page_h: f32) -> Vec<LinkAnnot> {
211 let mut out = Vec::new();
212 for link in page.links().iter() {
213 let Some(uri) = link
214 .action()
215 .and_then(|a| a.as_uri_action().and_then(|u| u.uri().ok()))
216 else {
217 continue;
218 };
219 let scheme_ok = ["http://", "https://", "mailto:", "tel:"]
220 .iter()
221 .any(|s| uri.starts_with(s));
222 if !scheme_ok {
223 continue;
224 }
225 if let Ok(rect) = link.rect() {
226 out.push(LinkAnnot {
227 l: rect.left().value,
228 t: page_h - rect.top().value,
229 r: rect.right().value,
230 b: page_h - rect.bottom().value,
231 uri,
232 });
233 }
234 }
235 out
236}
237
238/// Fallback line cells from pdfium-render's style segments (one cell per
239/// segment). Used only when the raw-FFI text page can't be loaded.
240fn segment_cells(text: &PdfPageText, page_h: f32) -> Vec<TextCell> {
241 text.segments()
242 .iter()
243 .filter_map(|seg| {
244 let s = seg.text();
245 if s.trim().is_empty() {
246 return None;
247 }
248 let r = seg.bounds();
249 Some(TextCell {
250 text: s,
251 l: r.left().value,
252 t: page_h - r.top().value,
253 r: r.right().value,
254 b: page_h - r.bottom().value,
255 })
256 })
257 .collect()
258}
259
260/// A second, raw-FFI handle on the same PDF used to drive the character loop
261/// (`FPDFText_GetUnicode`/`GetCharBox`) that pdfium-render's safe API doesn't
262/// expose. Closes the document on drop.
263struct FfiText<'a> {
264 bindings: &'a dyn PdfiumLibraryBindings,
265 doc: FPDF_DOCUMENT,
266}
267
/// A single glyph: its codepoint plus two boxes in pdfium's native (y-up)
/// coordinates.
///
/// `l/b/r/t` is pdfium's *tight* ink box (used by the legacy
/// `lines_from_glyphs`). `ll/lb/lr/lt` is the *loose* box (font ascent/descent
/// + advance — uniform per font/size), which the docling-parse-style sanitizer
/// needs so adjacent glyphs share a top edge.
pub(crate) struct Glyph {
    /// The glyph's character.
    pub(crate) ch: char,
    /// Tight box: left edge.
    pub(crate) l: f32,
    /// Tight box: bottom edge.
    pub(crate) b: f32,
    /// Tight box: right edge.
    pub(crate) r: f32,
    /// Tight box: top edge.
    pub(crate) t: f32,
    /// Loose box: left edge.
    pub(crate) ll: f32,
    /// Loose box: bottom edge.
    pub(crate) lb: f32,
    /// Loose box: right edge.
    pub(crate) lr: f32,
    /// Loose box: top edge.
    pub(crate) lt: f32,
    /// Hash of the PDF font name + flags (0 when not fetched). The sanitizer
    /// uses it for docling-parse's `enforce_same_font`, which keeps a bold label
    /// and a regular value as separate line cells (e.g. `LABEL : value`).
    pub(crate) font: u64,
}
287
288impl<'a> FfiText<'a> {
289 fn load(bindings: &'a dyn PdfiumLibraryBindings, bytes: &[u8], password: Option<&str>) -> Self {
290 let doc = bindings.FPDF_LoadMemDocument(bytes, password);
291 FfiText { bindings, doc }
292 }
293
294 /// Reconstruct line cells for page `index` (zero-based) via the
295 /// chars→words→lines grouping. Returns `(prose_cells, code_cells)` — the same
296 /// glyphs grouped two ways (gap-heuristic for prose, space-glyph-only for
297 /// code). Both empty on any failure (caller falls back).
298 fn page_cells(&self, index: i32, page_h: f32) -> (Vec<TextCell>, Vec<TextCell>, Vec<TextCell>) {
299 let empty = || (Vec::new(), Vec::new(), Vec::new());
300 if self.doc.is_null() {
301 return empty();
302 }
303 let b = self.bindings;
304 let page = b.FPDF_LoadPage(self.doc, index);
305 if page.is_null() {
306 return empty();
307 }
308 let tp = b.FPDFText_LoadPage(page);
309 let out = if tp.is_null() {
310 empty()
311 } else {
312 let dp = use_dp_lines();
313 let g = glyphs(b, tp, dp);
314 b.FPDFText_ClosePage(tp);
315 // Prose line cells: the docling-parse-style sanitizer (behind a flag
316 // while it's validated) or the legacy gap-heuristic reconstruction.
317 let prose = if dp {
318 crate::dp_lines::line_cells(&g, page_h, false)
319 } else {
320 lines_from_glyphs(&g, page_h, false)
321 };
322 (
323 prose,
324 lines_from_glyphs(&g, page_h, true),
325 words_from_glyphs(&g, page_h),
326 )
327 };
328 b.FPDF_ClosePage(page);
329 out
330 }
331}
332
333impl Drop for FfiText<'_> {
334 fn drop(&mut self) {
335 if !self.doc.is_null() {
336 self.bindings.FPDF_CloseDocument(self.doc);
337 }
338 }
339}
340
// NOTE: the next five lines describe `glyphs` (defined further down), not
// `debug_glyphs`; they are plain comments so rustdoc does not attach them to
// the wrong item.
// Read every glyph (codepoint + native box) from the text page, in document
// order. A space glyph is kept as a word-boundary marker (NaN box, char `' '`);
// pdfium emits these on most lines and they pin word splits exactly. Hard line
// breaks are dropped (line structure comes from geometry); the gap heuristic in
// `lines_from_glyphs` is the fallback for the lines pdfium leaves space-less.
346/// Debug helper: the raw pdfium glyph stream (codepoint + native bottom-left
347/// box) for a page, in pdfium's character order. For comparing against
348/// docling-parse's char cells.
349pub fn debug_glyphs(bytes: &[u8], index: i32) -> Vec<(char, f32, f32)> {
350 let Ok(pdfium) = bind() else {
351 return Vec::new();
352 };
353 let ffi = FfiText::load(pdfium.bindings(), bytes, None);
354 if ffi.doc.is_null() {
355 return Vec::new();
356 }
357 let b = ffi.bindings;
358 let page = b.FPDF_LoadPage(ffi.doc, index);
359 if page.is_null() {
360 return Vec::new();
361 }
362 let tp = b.FPDFText_LoadPage(page);
363 let mut out = Vec::new();
364 if !tp.is_null() {
365 for g in glyphs(b, tp, true) {
366 out.push((g.ch, g.ll, g.lr));
367 }
368 b.FPDFText_ClosePage(tp);
369 }
370 b.FPDF_ClosePage(page);
371 out
372}
373
374/// Hash a glyph's PDF font name + flags, for `enforce_same_font`. 0 if unavailable.
375fn font_hash(b: &dyn PdfiumLibraryBindings, tp: FPDF_TEXTPAGE, i: i32) -> u64 {
376 use std::hash::{Hash, Hasher};
377 let mut flags: std::os::raw::c_int = 0;
378 let len = b.FPDFText_GetFontInfo(tp, i, std::ptr::null_mut(), 0, &mut flags);
379 if len == 0 {
380 return 0;
381 }
382 let mut buf = vec![0u8; len as usize];
383 b.FPDFText_GetFontInfo(
384 tp,
385 i,
386 buf.as_mut_ptr() as *mut std::os::raw::c_void,
387 len,
388 &mut flags,
389 );
390 let mut h = std::collections::hash_map::DefaultHasher::new();
391 buf.hash(&mut h);
392 flags.hash(&mut h);
393 h.finish()
394}
395
396fn glyphs(b: &dyn PdfiumLibraryBindings, tp: FPDF_TEXTPAGE, fetch_font: bool) -> Vec<Glyph> {
397 let n = b.FPDFText_CountChars(tp);
398 let mut out = Vec::with_capacity(n.max(0) as usize);
399 for i in 0..n {
400 let ch = match char::from_u32(b.FPDFText_GetUnicode(tp, i)) {
401 Some(c) => c,
402 None => continue,
403 };
404 if ch == '\r' || ch == '\n' {
405 continue;
406 }
407 // Spaces are font-neutral (0): pdfium's generated spaces carry a default
408 // font that would otherwise block every word↔space merge under
409 // enforce_same_font; docling-parse's spaces inherit the run's font.
410 let font = if fetch_font && !ch.is_whitespace() {
411 font_hash(b, tp, i)
412 } else {
413 0
414 };
415 let (mut l, mut r, mut bot, mut top) = (0f64, 0f64, 0f64, 0f64);
416 let has_box = b.FPDFText_GetCharBox(tp, i, &mut l, &mut r, &mut bot, &mut top) != 0;
417 // Loose box: font ascent/descent + glyph advance, uniform per font/size.
418 let mut lr = FS_RECTF {
419 left: 0.0,
420 top: 0.0,
421 right: 0.0,
422 bottom: 0.0,
423 };
424 let (ll, lb, lrt, ltop) = if b.FPDFText_GetLooseCharBox(tp, i, &mut lr) != 0 {
425 (lr.left, lr.bottom, lr.right, lr.top)
426 } else if has_box {
427 (l as f32, bot as f32, r as f32, top as f32)
428 } else {
429 (f32::NAN, 0.0, 0.0, 0.0)
430 };
431 if ch.is_whitespace() {
432 // Keep the space *with its box* (the docling-parse-style line sanitizer
433 // needs literal space glyphs); NaN `l` if pdfium reports no box (the
434 // legacy `lines_from_glyphs` ignores the box and only flags a space).
435 out.push(Glyph {
436 ch: ' ',
437 l: if has_box { l as f32 } else { f32::NAN },
438 b: if has_box { bot as f32 } else { 0.0 },
439 r: if has_box { r as f32 } else { 0.0 },
440 t: if has_box { top as f32 } else { 0.0 },
441 ll,
442 lb,
443 lr: lrt,
444 lt: ltop,
445 font,
446 });
447 continue;
448 }
449 if !has_box {
450 continue;
451 }
452 out.push(Glyph {
453 ch,
454 l: l as f32,
455 b: bot as f32,
456 r: r as f32,
457 t: top as f32,
458 ll,
459 lb,
460 lr: lrt,
461 lt: ltop,
462 font,
463 });
464 }
465 // pdfium splits the Arabic lam-alef ligature into two chars at the *same* x
466 // (it's one glyph) in visual order — `alef-variant, lam`. docling-parse and
467 // logical order are `lam, alef-variant`. Detect the ligature by the shared x
468 // and swap. The shared-x test reliably distinguishes a true ligature from a
469 // genuine `alef + lam` sequence (the article `ال`, or `فعالة`), whose two
470 // glyphs sit at different x and must NOT be reordered.
471 for i in 0..out.len().saturating_sub(1) {
472 let same_x = out[i].l.is_finite()
473 && out[i + 1].l.is_finite()
474 && (out[i].l - out[i + 1].l).abs() < 1.0;
475 if same_x
476 && matches!(out[i].ch, '\u{0622}' | '\u{0623}' | '\u{0625}' | '\u{0627}')
477 && out[i + 1].ch == '\u{0644}'
478 {
479 out.swap(i, i + 1);
480 }
481 }
482 // Reconstruct degenerate (zero-width) loose space boxes by spanning the gap to
483 // the next glyph on the same line, so the sanitizer keeps them as word
484 // separators rather than dropping them (which would merge `Information systems`
485 // → `Informationsystems`). pdfium gives generated spaces a zero-width box at a
486 // wrong baseline; a wrap (different baseline) or a touching gap is left alone.
487 for i in 0..out.len() {
488 if out[i].ch != ' ' || (out[i].lr - out[i].ll).abs() >= 0.5 {
489 continue;
490 }
491 let prev = out[..i]
492 .iter()
493 .rev()
494 .find(|g| g.ch != ' ' && g.ll.is_finite())
495 .map(|g| (g.lr, g.lb, g.lt));
496 let next = out[i + 1..]
497 .iter()
498 .find(|g| g.ch != ' ' && g.ll.is_finite())
499 .map(|g| (g.ll, g.lb));
500 if let (Some((plr, plb, plt)), Some((nll, nlb))) = (prev, next) {
501 let line_h = (plt - plb).abs().max(1.0);
502 if (plb - nlb).abs() < line_h * 0.5 && nll > plr + 0.5 {
503 out[i].ll = plr;
504 out[i].lr = nll;
505 out[i].lb = plb;
506 out[i].lt = plt;
507 }
508 }
509 }
510 out
511}
512
513/// Group glyphs (document order) into words then lines, the way docling-parse
514/// does: a new **word** starts where the horizontal gap to the previous glyph
515/// exceeds ~0.2 × the font height (a real space is ~0.3 × height; letter
516/// tracking is smaller, so titles don't shatter); a new **line** starts where
517/// the baseline drops by ~half the font height (a superscript rises without
518/// dropping, so it stays on its line). Coordinates are flipped to top-left.
519/// `code` mode splits words **only** at pdfium's own space glyphs and never glues
520/// punctuation — monospace code has wide inter-glyph advances that the prose
521/// gap heuristic mistakes for spaces (`f un c t i o n`), but pdfium emits a real
522/// space glyph at every true gap, so honoring just those reproduces the source
523/// spacing (`function add(a, b)`).
524fn lines_from_glyphs(gs: &[Glyph], page_h: f32, code: bool) -> Vec<TextCell> {
525 let mut cells: Vec<TextCell> = Vec::new();
526 let mut words: Vec<String> = Vec::new(); // words on the current line
527 let mut word = String::new();
528 // current line bounding box, native
529 let (mut ll, mut lb, mut lr, mut lt) = (
530 f32::INFINITY,
531 f32::INFINITY,
532 f32::NEG_INFINITY,
533 f32::NEG_INFINITY,
534 );
535 // Tallest glyph seen on the current line: the word-gap threshold is relative
536 // to it, so a small-font run on the line (a superscript citation) isn't split
537 // at its tight digit gaps, while a big display title isn't split at its wider
538 // letter tracking. A real inter-word space is ~0.3× the font height.
539 let mut line_h: f32 = 0.0;
540 let mut prev: Option<&Glyph> = None;
541 // A space glyph between non-space glyphs pins a word split the gap heuristic
542 // can miss (tight justified spacing); it carries no geometry.
543 let mut pending_space = false;
544
545 for g in gs {
546 if g.ch == ' ' {
547 pending_space = true;
548 continue;
549 }
550 let h = (g.t - g.b).abs().max(1.0);
551 let (mut new_word, mut new_line) = (false, false);
552 if let Some(p) = prev {
553 // A new line drops the baseline *and* resets x leftward; requiring the
554 // x-reset avoids a descending comma/semicolon faking a line break. A
555 // *large* drop (≥1.5× the line height — a skipped line, e.g. a centered
556 // page-number footer below a short last word) is always a new line,
557 // even without the x-reset.
558 // LTR wraps reset x leftward (`g.l < p.r`); RTL (Arabic) wraps reset
559 // rightward (the new line begins at the far right). A large drop
560 // (≥1.5× line height) is a new line regardless of x.
561 let x_reset = if is_arabic(g.ch) || is_arabic(p.ch) {
562 g.l > p.r
563 } else {
564 g.l < p.r
565 };
566 new_line = (p.b - g.b > h * 0.5 && x_reset) || (p.b - g.b > line_h.max(h) * 1.5);
567 // Don't split before closing punctuation, after opening punctuation, or
568 // after a period that runs into a digit/lowercase letter — docling
569 // keeps `engines,` / `[37` / `i.e.` / `98.5` together even across a
570 // space or gap.
571 let glued = is_close_punct(g.ch)
572 || is_open_punct(p.ch)
573 || (p.ch.is_ascii_digit() && g.ch.is_ascii_digit())
574 || (p.ch == '.'
575 && !pending_space
576 && (g.ch.is_ascii_digit() || g.ch.is_ascii_lowercase()));
577 let word_gap = line_h.max(h) * 0.25;
578 new_word = if code {
579 new_line || pending_space
580 } else if is_arabic(g.ch) || is_arabic(p.ch) {
581 // RTL runs right-to-left, so the inter-word gap is `p.l - g.r`. A
582 // real word space has a gap; pdfium also emits spurious zero-gap
583 // space glyphs inside words (`التي`), so require the gap rather
584 // than trusting a bare space glyph.
585 new_line || (p.l - g.r > word_gap && !glued)
586 } else {
587 new_line || ((pending_space || g.l - p.r > word_gap) && !glued)
588 };
589 }
590 pending_space = false;
591 if new_line {
592 push_word(&mut word, &mut words);
593 push_line(&mut words, (ll, lb, lr, lt), page_h, &mut cells);
594 (ll, lb, lr, lt) = (
595 f32::INFINITY,
596 f32::INFINITY,
597 f32::NEG_INFINITY,
598 f32::NEG_INFINITY,
599 );
600 line_h = 0.0;
601 } else if new_word {
602 push_word(&mut word, &mut words);
603 }
604 word.push(g.ch);
605 ll = ll.min(g.l);
606 lb = lb.min(g.b);
607 lr = lr.max(g.r);
608 lt = lt.max(g.t);
609 line_h = line_h.max(h);
610 prev = Some(g);
611 }
612 push_word(&mut word, &mut words);
613 push_line(&mut words, (ll, lb, lr, lt), page_h, &mut cells);
614 cells
615}
616
617/// Per-word cells (each word's text + top-left bbox), using the same word/line
618/// splitting as [`lines_from_glyphs`] but emitting one cell per word instead of
619/// joining into lines — the TableFormer matcher places individual words into
620/// grid cells (a table line spans many cells, so line cells can't be matched).
621fn words_from_glyphs(gs: &[Glyph], page_h: f32) -> Vec<TextCell> {
622 let mut cells = Vec::new();
623 let mut word = String::new();
624 let inf = (
625 f32::INFINITY,
626 f32::INFINITY,
627 f32::NEG_INFINITY,
628 f32::NEG_INFINITY,
629 );
630 let (mut wl, mut wb, mut wr, mut wt) = inf;
631 let mut line_h: f32 = 0.0;
632 let mut prev: Option<&Glyph> = None;
633 let mut pending_space = false;
634 for g in gs {
635 if g.ch == ' ' {
636 pending_space = true;
637 continue;
638 }
639 let h = (g.t - g.b).abs().max(1.0);
640 let mut new_line = false;
641 let mut new_word = false;
642 if let Some(p) = prev {
643 // LTR wraps reset x leftward (`g.l < p.r`); RTL (Arabic) wraps reset
644 // rightward (the new line begins at the far right). A large drop
645 // (≥1.5× line height) is a new line regardless of x.
646 let x_reset = if is_arabic(g.ch) || is_arabic(p.ch) {
647 g.l > p.r
648 } else {
649 g.l < p.r
650 };
651 new_line = (p.b - g.b > h * 0.5 && x_reset) || (p.b - g.b > line_h.max(h) * 1.5);
652 // No digit-digit glue here (unlike the prose grouping): table cells in
653 // adjacent columns are numeric and a column gap must still split them
654 // (`0.965` `0.934`, not `0.9650.934`). Intra-number digits have no gap
655 // so they stay together regardless.
656 let glued = is_close_punct(g.ch)
657 || is_open_punct(p.ch)
658 || (p.ch == '.'
659 && !pending_space
660 && (g.ch.is_ascii_digit() || g.ch.is_ascii_lowercase()));
661 let word_gap = line_h.max(h) * 0.25;
662 new_word = new_line || ((pending_space || g.l - p.r > word_gap) && !glued);
663 }
664 pending_space = false;
665 if new_word && !word.is_empty() {
666 cells.push(TextCell {
667 text: std::mem::take(&mut word),
668 l: wl,
669 t: page_h - wt,
670 r: wr,
671 b: page_h - wb,
672 });
673 (wl, wb, wr, wt) = inf;
674 }
675 if new_line {
676 line_h = 0.0;
677 }
678 word.push(g.ch);
679 wl = wl.min(g.l);
680 wb = wb.min(g.b);
681 wr = wr.max(g.r);
682 wt = wt.max(g.t);
683 line_h = line_h.max(h);
684 prev = Some(g);
685 }
686 if !word.is_empty() {
687 cells.push(TextCell {
688 text: word,
689 l: wl,
690 t: page_h - wt,
691 r: wr,
692 b: page_h - wb,
693 });
694 }
695 cells
696}
697
/// Whether `c` lies in the Unicode Arabic block (U+0600 through U+06FF).
fn is_arabic(c: char) -> bool {
    matches!(c, '\u{0600}'..='\u{06FF}')
}
701
/// Whether `c` is punctuation that attaches to the text before it: sentence
/// punctuation, closing brackets, `%`, the ASCII apostrophe, and the two curly
/// single quotes.
fn is_close_punct(c: char) -> bool {
    const CLOSERS: [char; 12] = [
        ',', '.', ';', '!', '?', ')', ']', '}', '%', '\'', '\u{2019}', '\u{2018}',
    ];
    CLOSERS.contains(&c)
}
708
/// Whether `c` attaches to the text after it: an opening bracket, or `@`
/// (`mAP @0.5`, `bpf@zurich`, `@decorator`).
fn is_open_punct(c: char) -> bool {
    ['(', '[', '{', '@'].contains(&c)
}
713
/// Move the pending `word` onto `words`, leaving `word` empty. Does nothing
/// when `word` is already empty.
fn push_word(word: &mut String, words: &mut Vec<String>) {
    if word.is_empty() {
        return;
    }
    let finished = std::mem::replace(word, String::new());
    words.push(finished);
}
719
720fn push_line(
721 words: &mut Vec<String>,
722 bbox: (f32, f32, f32, f32),
723 page_h: f32,
724 cells: &mut Vec<TextCell>,
725) {
726 if words.is_empty() {
727 return;
728 }
729 let text = std::mem::take(words).join(" ");
730 let (l, b, r, t) = bbox;
731 cells.push(TextCell {
732 text,
733 l,
734 t: page_h - t,
735 r,
736 b: page_h - b,
737 });
738}