fleischwolf_pdf/pdfium_backend.rs
1//! pdfium-based text extraction and page rendering.
2//!
3//! Text is reconstructed the way docling's `docling-parse` does it, so the
4//! output spacing matches the groundtruth: the page's **character** stream is
5//! grouped into **words** (split at a horizontal gap wider than a fraction of
6//! the font height — font-relative, so letter-tracking in display titles does
7//! not split a word) and words into **lines** (by baseline). pdfium-render's
8//! safe API only exposes whole style runs / `GetBoundedText`, so the character
9//! loop is driven through the raw `PdfiumLibraryBindings` FFI on a second handle
10//! to the same bytes (no fork; stays publishable).
11
12use image::RgbImage;
13use pdfium_render::prelude::*;
14
/// A run of text with its bounding box, in PDF points with a **top-left** origin
/// (pdfium's native origin is bottom-left; we flip it to match docling's
/// `BoundingBox(..., origin=TOPLEFT)`).
///
/// Cells compare by value (`PartialEq`): the text and all four edges must be
/// equal. There is no `Eq` because the edges are floats.
#[derive(Debug, Clone, PartialEq)]
pub struct TextCell {
    /// The text covered by this cell.
    pub text: String,
    /// Left edge (x), in points.
    pub l: f32,
    /// Top edge, in points measured down from the top of the page.
    pub t: f32,
    /// Right edge (x), in points.
    pub r: f32,
    /// Bottom edge, in points measured down from the top of the page.
    pub b: f32,
}
26
/// Pixels-per-point used to render page images. Layout is scale-invariant (it
/// scales normalized boxes by the page point size), but OCR benefits from the
/// extra resolution. Every [`PdfPage`] built by this module reports this value
/// in its `scale` field.
pub const RENDER_SCALE: f32 = 2.0;
31
/// One page's geometry, extracted text cells, and a rendered RGB image. The
/// image is rendered at [`RENDER_SCALE`] pixels per PDF point; `image px =
/// page point × scale`.
#[derive(Clone)]
pub struct PdfPage {
    /// Page width in PDF points.
    pub width: f32,
    /// Page height in PDF points.
    pub height: f32,
    /// Pixels per PDF point of `image` (set to [`RENDER_SCALE`]).
    pub scale: f32,
    /// Prose line cells. Taken from the raw-FFI glyph grouping; when that yields
    /// nothing, from pdfium-render's style segments instead.
    pub cells: Vec<TextCell>,
    /// Same text grouped for code regions: split only at pdfium space glyphs, so
    /// monospace runs keep their source spacing instead of the prose heuristic's.
    pub code_cells: Vec<TextCell>,
    /// Per-word cells (one per word, not joined into lines) for TableFormer cell
    /// matching.
    pub word_cells: Vec<TextCell>,
    /// The rendered page bitmap (RGB, 8 bits per channel).
    pub image: RgbImage,
}
49
/// A parsed PDF: per-page text cells and page images.
pub struct PdfDocument {
    /// Extracted pages in document order (index 0 is the first page).
    pub pages: Vec<PdfPage>,
}
54
55/// Bind to the pdfium dynamic library. Honors `PDFIUM_DYNAMIC_LIB_PATH` (a
56/// directory or file), else the directory of the current exe, else the system
57/// library — mirroring how a deployment ships `libpdfium` alongside the binary.
58/// Whether to use the docling-parse line sanitizer ([`crate::dp_lines`]) for prose
59/// reconstruction — the default. Set `DOCLING_LEGACY_LINES` to fall back to the
60/// older gap-heuristic `lines_from_glyphs`.
61pub(crate) fn use_dp_lines() -> bool {
62 std::env::var("DOCLING_LEGACY_LINES").is_err()
63}
64
65fn bind() -> Result<Pdfium, PdfiumError> {
66 if let Ok(path) = std::env::var("PDFIUM_DYNAMIC_LIB_PATH") {
67 let name = Pdfium::pdfium_platform_library_name_at_path(&path);
68 if let Ok(b) = Pdfium::bind_to_library(&name) {
69 return Ok(Pdfium::new(b));
70 }
71 if let Ok(b) = Pdfium::bind_to_library(&path) {
72 return Ok(Pdfium::new(b));
73 }
74 }
75 Pdfium::bind_to_system_library().map(Pdfium::new)
76}
77
78impl PdfDocument {
79 /// Parse a PDF from bytes, optionally decrypting with `password`.
80 ///
81 /// Note: this materialises **every** page's rendered bitmap in memory at
82 /// once. For large documents prefer [`for_each_page`], which streams.
83 pub fn open(bytes: &[u8], password: Option<&str>) -> Result<Self, PdfiumError> {
84 let pdfium = bind()?;
85 let ffi = FfiText::load(pdfium.bindings(), bytes, password);
86 let doc = pdfium.load_pdf_from_byte_slice(bytes, password)?;
87 let mut pages = Vec::new();
88 for (i, page) in doc.pages().iter().enumerate() {
89 pages.push(extract_page(&page, &ffi, i as i32)?);
90 }
91 Ok(PdfDocument { pages })
92 }
93}
94
95/// Render + extract pages one at a time, handing each (owned) [`PdfPage`] to `f`.
96/// Only one page bitmap is resident at a time — a rendered page is ~5 MB, so a
97/// large PDF would otherwise hold gigabytes of bitmaps at once. `f` receives the
98/// zero-based page index and the total page count.
99///
100/// `E` is the caller's error type; pdfium errors convert into it via `From`.
101pub fn for_each_page<E, F>(bytes: &[u8], password: Option<&str>, mut f: F) -> Result<(), E>
102where
103 E: From<PdfiumError>,
104 F: FnMut(usize, usize, PdfPage) -> Result<(), E>,
105{
106 let pdfium = bind()?;
107 let ffi = FfiText::load(pdfium.bindings(), bytes, password);
108 let doc = pdfium.load_pdf_from_byte_slice(bytes, password)?;
109 let pages = doc.pages();
110 let total = pages.len() as usize;
111 for (i, page) in pages.iter().enumerate() {
112 let extracted = extract_page(&page, &ffi, i as i32)?;
113 f(i, total, extracted)?;
114 }
115 Ok(())
116}
117
118fn extract_page(
119 page: &pdfium_render::prelude::PdfPage<'_>,
120 ffi: &FfiText<'_>,
121 index: i32,
122) -> Result<PdfPage, PdfiumError> {
123 let width = page.width().value;
124 let height = page.height().value;
125
126 let (mut cells, code_cells, word_cells) = ffi.page_cells(index, height);
127 if cells.is_empty() {
128 cells = segment_cells(&page.text()?, height);
129 }
130
131 // docling renders at 1.5× the target scale and downsamples "to make it
132 // sharper" (pypdfium2 → PIL BICUBIC). Replicate exactly: the TableFormer
133 // model is pixel-sensitive, so the page bitmap must match byte-for-byte.
134 // `CatmullRom` is the same a=-0.5 cubic kernel as PIL's BICUBIC.
135 const SUPERSAMPLE: f32 = 1.5;
136 let tw = (width * RENDER_SCALE * SUPERSAMPLE).round().max(1.0) as i32;
137 let th = (height * RENDER_SCALE * SUPERSAMPLE).round().max(1.0) as i32;
138 let cfg = PdfRenderConfig::new()
139 .set_target_width(tw)
140 .set_target_height(th);
141 let bitmap = page.render_with_config(&cfg)?;
142 let big = bitmap.as_image().into_rgb8();
143 let dw = (width * RENDER_SCALE).round().max(1.0) as u32;
144 let dh = (height * RENDER_SCALE).round().max(1.0) as u32;
145 let image = image::imageops::resize(&big, dw, dh, image::imageops::FilterType::CatmullRom);
146
147 Ok(PdfPage {
148 width,
149 height,
150 scale: RENDER_SCALE,
151 cells,
152 code_cells,
153 word_cells,
154 image,
155 })
156}
157
158/// Fallback line cells from pdfium-render's style segments (one cell per
159/// segment). Used only when the raw-FFI text page can't be loaded.
160fn segment_cells(text: &PdfPageText, page_h: f32) -> Vec<TextCell> {
161 text.segments()
162 .iter()
163 .filter_map(|seg| {
164 let s = seg.text();
165 if s.trim().is_empty() {
166 return None;
167 }
168 let r = seg.bounds();
169 Some(TextCell {
170 text: s,
171 l: r.left().value,
172 t: page_h - r.top().value,
173 r: r.right().value,
174 b: page_h - r.bottom().value,
175 })
176 })
177 .collect()
178}
179
/// A second, raw-FFI handle on the same PDF used to drive the character loop
/// (`FPDFText_GetUnicode`/`GetCharBox`) that pdfium-render's safe API doesn't
/// expose. Closes the document on drop.
struct FfiText<'a> {
    /// Raw pdfium bindings that every FFI call on this handle goes through.
    bindings: &'a dyn PdfiumLibraryBindings,
    /// Document handle returned by `FPDF_LoadMemDocument`. It can be null; every
    /// user of the handle checks for that before touching it.
    doc: FPDF_DOCUMENT,
}
187
/// One glyph: codepoint + native (y-up) box edges. `l/b/r/t` is pdfium's *tight*
/// ink box (used by the legacy `lines_from_glyphs`); `ll/lb/lr/lt` is the *loose*
/// box (font ascent/descent + advance — uniform per font/size), which the
/// docling-parse-style sanitizer needs so adjacent glyphs share a top edge.
pub(crate) struct Glyph {
    /// The character; every whitespace character is stored as `' '`.
    pub(crate) ch: char,
    /// Tight box, left edge. NaN for a space glyph pdfium reported no box for.
    pub(crate) l: f32,
    /// Tight box, bottom edge.
    pub(crate) b: f32,
    /// Tight box, right edge.
    pub(crate) r: f32,
    /// Tight box, top edge.
    pub(crate) t: f32,
    /// Loose box, left edge. NaN when neither a loose nor a tight box exists.
    pub(crate) ll: f32,
    /// Loose box, bottom edge.
    pub(crate) lb: f32,
    /// Loose box, right edge.
    pub(crate) lr: f32,
    /// Loose box, top edge.
    pub(crate) lt: f32,
    /// Hash of the PDF font name + flags (0 when not fetched). The sanitizer uses
    /// it for docling-parse's `enforce_same_font` (keeps a bold label and regular
    /// value as separate line cells, e.g. `LABEL : value`).
    pub(crate) font: u64,
}
207
208impl<'a> FfiText<'a> {
209 fn load(bindings: &'a dyn PdfiumLibraryBindings, bytes: &[u8], password: Option<&str>) -> Self {
210 let doc = bindings.FPDF_LoadMemDocument(bytes, password);
211 FfiText { bindings, doc }
212 }
213
214 /// Reconstruct line cells for page `index` (zero-based) via the
215 /// chars→words→lines grouping. Returns `(prose_cells, code_cells)` — the same
216 /// glyphs grouped two ways (gap-heuristic for prose, space-glyph-only for
217 /// code). Both empty on any failure (caller falls back).
218 fn page_cells(&self, index: i32, page_h: f32) -> (Vec<TextCell>, Vec<TextCell>, Vec<TextCell>) {
219 let empty = || (Vec::new(), Vec::new(), Vec::new());
220 if self.doc.is_null() {
221 return empty();
222 }
223 let b = self.bindings;
224 let page = b.FPDF_LoadPage(self.doc, index);
225 if page.is_null() {
226 return empty();
227 }
228 let tp = b.FPDFText_LoadPage(page);
229 let out = if tp.is_null() {
230 empty()
231 } else {
232 let dp = use_dp_lines();
233 let g = glyphs(b, tp, dp);
234 b.FPDFText_ClosePage(tp);
235 // Prose line cells: the docling-parse-style sanitizer (behind a flag
236 // while it's validated) or the legacy gap-heuristic reconstruction.
237 let prose = if dp {
238 crate::dp_lines::line_cells(&g, page_h)
239 } else {
240 lines_from_glyphs(&g, page_h, false)
241 };
242 (
243 prose,
244 lines_from_glyphs(&g, page_h, true),
245 words_from_glyphs(&g, page_h),
246 )
247 };
248 b.FPDF_ClosePage(page);
249 out
250 }
251}
252
253impl Drop for FfiText<'_> {
254 fn drop(&mut self) {
255 if !self.doc.is_null() {
256 self.bindings.FPDF_CloseDocument(self.doc);
257 }
258 }
259}
260
261/// Read every glyph (codepoint + native box) from the text page, in document
262/// order. A space glyph is kept as a word-boundary marker (NaN box, char `' '`);
263/// pdfium emits these on most lines and they pin word splits exactly. Hard line
264/// breaks are dropped (line structure comes from geometry); the gap heuristic in
265/// [`lines_from_glyphs`] is the fallback for the lines pdfium leaves space-less.
266/// Debug helper: the raw pdfium glyph stream (codepoint + native bottom-left
267/// box) for a page, in pdfium's character order. For comparing against
268/// docling-parse's char cells.
269pub fn debug_glyphs(bytes: &[u8], index: i32) -> Vec<(char, f32, f32)> {
270 let Ok(pdfium) = bind() else {
271 return Vec::new();
272 };
273 let ffi = FfiText::load(pdfium.bindings(), bytes, None);
274 if ffi.doc.is_null() {
275 return Vec::new();
276 }
277 let b = ffi.bindings;
278 let page = b.FPDF_LoadPage(ffi.doc, index);
279 if page.is_null() {
280 return Vec::new();
281 }
282 let tp = b.FPDFText_LoadPage(page);
283 let mut out = Vec::new();
284 if !tp.is_null() {
285 for g in glyphs(b, tp, true) {
286 out.push((g.ch, g.ll, g.lr));
287 }
288 b.FPDFText_ClosePage(tp);
289 }
290 b.FPDF_ClosePage(page);
291 out
292}
293
294/// Hash a glyph's PDF font name + flags, for `enforce_same_font`. 0 if unavailable.
295fn font_hash(b: &dyn PdfiumLibraryBindings, tp: FPDF_TEXTPAGE, i: i32) -> u64 {
296 use std::hash::{Hash, Hasher};
297 let mut flags: std::os::raw::c_int = 0;
298 let len = b.FPDFText_GetFontInfo(tp, i, std::ptr::null_mut(), 0, &mut flags);
299 if len == 0 {
300 return 0;
301 }
302 let mut buf = vec![0u8; len as usize];
303 b.FPDFText_GetFontInfo(
304 tp,
305 i,
306 buf.as_mut_ptr() as *mut std::os::raw::c_void,
307 len,
308 &mut flags,
309 );
310 let mut h = std::collections::hash_map::DefaultHasher::new();
311 buf.hash(&mut h);
312 flags.hash(&mut h);
313 h.finish()
314}
315
/// Read every glyph (codepoint + native, y-up boxes) from the text page `tp`, in
/// pdfium's character order.
///
/// * Code units that are not valid `char`s and hard line breaks (`\r`, `\n`) are
///   dropped; line structure comes from geometry.
/// * Any whitespace character is kept as a `' '` glyph even when pdfium reports
///   no tight box for it (its `l` is then NaN). A non-whitespace character
///   without a tight box is dropped.
/// * `fetch_font` controls whether non-whitespace glyphs get a font hash
///   ([`font_hash`]); otherwise, and for all whitespace, `font` is 0.
///
/// Two fix-up passes run on the collected glyphs before returning: Arabic
/// lam-alef pairs are put into logical order, and zero-width loose space boxes
/// are widened to span the gap between their neighbours.
fn glyphs(b: &dyn PdfiumLibraryBindings, tp: FPDF_TEXTPAGE, fetch_font: bool) -> Vec<Glyph> {
    // A negative count yields an empty range below; clamp it for the capacity.
    let n = b.FPDFText_CountChars(tp);
    let mut out = Vec::with_capacity(n.max(0) as usize);
    for i in 0..n {
        let ch = match char::from_u32(b.FPDFText_GetUnicode(tp, i)) {
            Some(c) => c,
            None => continue,
        };
        if ch == '\r' || ch == '\n' {
            continue;
        }
        // Spaces are font-neutral (0): pdfium's generated spaces carry a default
        // font that would otherwise block every word↔space merge under
        // enforce_same_font; docling-parse's spaces inherit the run's font.
        let font = if fetch_font && !ch.is_whitespace() {
            font_hash(b, tp, i)
        } else {
            0
        };
        // Tight (ink) box; `has_box` is false when pdfium reports none.
        let (mut l, mut r, mut bot, mut top) = (0f64, 0f64, 0f64, 0f64);
        let has_box = b.FPDFText_GetCharBox(tp, i, &mut l, &mut r, &mut bot, &mut top) != 0;
        // Loose box: font ascent/descent + glyph advance, uniform per font/size.
        let mut lr = FS_RECTF {
            left: 0.0,
            top: 0.0,
            right: 0.0,
            bottom: 0.0,
        };
        // Fall back to the tight box, then to a NaN-left marker box.
        let (ll, lb, lrt, ltop) = if b.FPDFText_GetLooseCharBox(tp, i, &mut lr) != 0 {
            (lr.left, lr.bottom, lr.right, lr.top)
        } else if has_box {
            (l as f32, bot as f32, r as f32, top as f32)
        } else {
            (f32::NAN, 0.0, 0.0, 0.0)
        };
        if ch.is_whitespace() {
            // Keep the space *with its box* (the docling-parse-style line sanitizer
            // needs literal space glyphs); NaN `l` if pdfium reports no box (the
            // legacy `lines_from_glyphs` ignores the box and only flags a space).
            out.push(Glyph {
                ch: ' ',
                l: if has_box { l as f32 } else { f32::NAN },
                b: if has_box { bot as f32 } else { 0.0 },
                r: if has_box { r as f32 } else { 0.0 },
                t: if has_box { top as f32 } else { 0.0 },
                ll,
                lb,
                lr: lrt,
                lt: ltop,
                font,
            });
            continue;
        }
        // A visible glyph without a tight box cannot be placed; skip it.
        if !has_box {
            continue;
        }
        out.push(Glyph {
            ch,
            l: l as f32,
            b: bot as f32,
            r: r as f32,
            t: top as f32,
            ll,
            lb,
            lr: lrt,
            lt: ltop,
            font,
        });
    }
    // pdfium splits the Arabic lam-alef ligature into two chars at the *same* x
    // (it's one glyph) in visual order — `alef-variant, lam`. docling-parse and
    // logical order are `lam, alef-variant`. Detect the ligature by the shared x
    // and swap. The shared-x test reliably distinguishes a true ligature from a
    // genuine `alef + lam` sequence (the article `ال`, or `فعالة`), whose two
    // glyphs sit at different x and must NOT be reordered.
    for i in 0..out.len().saturating_sub(1) {
        // "Same x" means both tight left edges are finite and under 1pt apart.
        let same_x = out[i].l.is_finite()
            && out[i + 1].l.is_finite()
            && (out[i].l - out[i + 1].l).abs() < 1.0;
        if same_x
            && matches!(out[i].ch, '\u{0622}' | '\u{0623}' | '\u{0625}' | '\u{0627}')
            && out[i + 1].ch == '\u{0644}'
        {
            out.swap(i, i + 1);
        }
    }
    // Reconstruct degenerate (zero-width) loose space boxes by spanning the gap to
    // the next glyph on the same line, so the sanitizer keeps them as word
    // separators rather than dropping them (which would merge `Information systems`
    // → `Informationsystems`). pdfium gives generated spaces a zero-width box at a
    // wrong baseline; a wrap (different baseline) or a touching gap is left alone.
    for i in 0..out.len() {
        // Only spaces whose loose box is under 0.5pt wide are rebuilt. A NaN
        // `ll` fails the `>=` test, so box-less spaces are rebuilt as well.
        if out[i].ch != ' ' || (out[i].lr - out[i].ll).abs() >= 0.5 {
            continue;
        }
        // Nearest non-space glyph with a finite loose box on each side.
        let prev = out[..i]
            .iter()
            .rev()
            .find(|g| g.ch != ' ' && g.ll.is_finite())
            .map(|g| (g.lr, g.lb, g.lt));
        let next = out[i + 1..]
            .iter()
            .find(|g| g.ch != ' ' && g.ll.is_finite())
            .map(|g| (g.ll, g.lb));
        if let (Some((plr, plb, plt)), Some((nll, nlb))) = (prev, next) {
            let line_h = (plt - plb).abs().max(1.0);
            // Same line: the bottoms differ by under half the previous glyph's
            // height. Real gap: the next glyph starts over 0.5pt to the right.
            if (plb - nlb).abs() < line_h * 0.5 && nll > plr + 0.5 {
                out[i].ll = plr;
                out[i].lr = nll;
                out[i].lb = plb;
                out[i].lt = plt;
            }
        }
    }
    out
}
432
/// Group glyphs (document order) into words then lines, the way docling-parse
/// does: a new **word** starts where the horizontal gap to the previous glyph
/// exceeds 0.25 × the tallest glyph on the line (a real space is ~0.3 × height;
/// letter tracking is smaller, so titles don't shatter); a new **line** starts
/// where the baseline drops by more than half the glyph height together with an
/// x reset, or by more than 1.5 × the line height regardless of x (a superscript
/// rises without dropping, so it stays on its line). Coordinates are flipped to
/// top-left.
///
/// `code` mode splits words **only** at pdfium's own space glyphs and never glues
/// punctuation — monospace code has wide inter-glyph advances that the prose
/// gap heuristic mistakes for spaces (`f un c t i o n`), but pdfium emits a real
/// space glyph at every true gap, so honoring just those reproduces the source
/// spacing (`function add(a, b)`).
///
/// * `gs` — glyphs with native (y-up) tight boxes; `' '` glyphs only mark a word
///   boundary and their boxes are ignored.
/// * `page_h` — page height in points, used to flip y.
///
/// Returns one cell per line: its words joined by single spaces, with the union
/// of the glyphs' tight boxes as the bounding box.
fn lines_from_glyphs(gs: &[Glyph], page_h: f32, code: bool) -> Vec<TextCell> {
    let mut cells: Vec<TextCell> = Vec::new();
    let mut words: Vec<String> = Vec::new(); // words on the current line
    let mut word = String::new();
    // current line bounding box, native
    let (mut ll, mut lb, mut lr, mut lt) = (
        f32::INFINITY,
        f32::INFINITY,
        f32::NEG_INFINITY,
        f32::NEG_INFINITY,
    );
    // Tallest glyph seen on the current line: the word-gap threshold is relative
    // to it, so a small-font run on the line (a superscript citation) isn't split
    // at its tight digit gaps, while a big display title isn't split at its wider
    // letter tracking. A real inter-word space is ~0.3× the font height.
    let mut line_h: f32 = 0.0;
    let mut prev: Option<&Glyph> = None;
    // A space glyph between non-space glyphs pins a word split the gap heuristic
    // can miss (tight justified spacing); its box is not used here.
    let mut pending_space = false;

    for g in gs {
        if g.ch == ' ' {
            pending_space = true;
            continue;
        }
        // Glyph height, floored at 1pt so flat glyphs don't zero the thresholds.
        let h = (g.t - g.b).abs().max(1.0);
        let (mut new_word, mut new_line) = (false, false);
        if let Some(p) = prev {
            // A new line drops the baseline *and* resets x; requiring the x-reset
            // avoids a descending comma/semicolon faking a line break. LTR wraps
            // reset x leftward (`g.l < p.r`); RTL (Arabic) wraps reset rightward
            // (the new line begins at the far right). A *large* drop (more than
            // 1.5× the line height — a skipped line, e.g. a centered page-number
            // footer below a short last word) is always a new line, even without
            // the x-reset.
            let x_reset = if is_arabic(g.ch) || is_arabic(p.ch) {
                g.l > p.r
            } else {
                g.l < p.r
            };
            new_line = (p.b - g.b > h * 0.5 && x_reset) || (p.b - g.b > line_h.max(h) * 1.5);
            // Don't split before closing punctuation, after opening punctuation,
            // between two digits, or after a period that runs into a
            // digit/lowercase letter — docling keeps `engines,` / `[37` / `i.e.` /
            // `98.5` together even across a space or gap.
            let glued = is_close_punct(g.ch)
                || is_open_punct(p.ch)
                || (p.ch.is_ascii_digit() && g.ch.is_ascii_digit())
                || (p.ch == '.'
                    && !pending_space
                    && (g.ch.is_ascii_digit() || g.ch.is_ascii_lowercase()));
            let word_gap = line_h.max(h) * 0.25;
            new_word = if code {
                // Code: space glyphs are the only word boundaries; `glued` is ignored.
                new_line || pending_space
            } else if is_arabic(g.ch) || is_arabic(p.ch) {
                // RTL runs right-to-left, so the inter-word gap is `p.l - g.r`. A
                // real word space has a gap; pdfium also emits spurious zero-gap
                // space glyphs inside words (`التي`), so require the gap rather
                // than trusting a bare space glyph.
                new_line || (p.l - g.r > word_gap && !glued)
            } else {
                new_line || ((pending_space || g.l - p.r > word_gap) && !glued)
            };
        }
        pending_space = false;
        if new_line {
            // Flush the finished line and start a fresh bounding box.
            push_word(&mut word, &mut words);
            push_line(&mut words, (ll, lb, lr, lt), page_h, &mut cells);
            (ll, lb, lr, lt) = (
                f32::INFINITY,
                f32::INFINITY,
                f32::NEG_INFINITY,
                f32::NEG_INFINITY,
            );
            line_h = 0.0;
        } else if new_word {
            push_word(&mut word, &mut words);
        }
        // Add the glyph to the current word and grow the line box around it.
        word.push(g.ch);
        ll = ll.min(g.l);
        lb = lb.min(g.b);
        lr = lr.max(g.r);
        lt = lt.max(g.t);
        line_h = line_h.max(h);
        prev = Some(g);
    }
    // Flush whatever is left as the last line.
    push_word(&mut word, &mut words);
    push_line(&mut words, (ll, lb, lr, lt), page_h, &mut cells);
    cells
}
536
/// Per-word cells (each word's text + top-left bbox), using the same line-break
/// test as [`lines_from_glyphs`] but emitting one cell per word instead of
/// joining into lines — the TableFormer matcher places individual words into
/// grid cells (a table line spans many cells, so line cells can't be matched).
///
/// Word splitting differs from the prose grouping in two ways: two adjacent
/// digits are not glued, and there is no separate right-to-left gap rule (the
/// gap is always measured as `g.l - p.r`).
///
/// * `gs` — glyphs with native (y-up) tight boxes; `' '` glyphs only mark a word
///   boundary.
/// * `page_h` — page height in points, used to flip y.
fn words_from_glyphs(gs: &[Glyph], page_h: f32) -> Vec<TextCell> {
    let mut cells = Vec::new();
    let mut word = String::new();
    // Empty bounding box: any real glyph shrinks/grows it on first contact.
    let inf = (
        f32::INFINITY,
        f32::INFINITY,
        f32::NEG_INFINITY,
        f32::NEG_INFINITY,
    );
    // Current word's box (native, y-up).
    let (mut wl, mut wb, mut wr, mut wt) = inf;
    // Tallest glyph on the current line; the word-gap threshold scales with it.
    let mut line_h: f32 = 0.0;
    let mut prev: Option<&Glyph> = None;
    let mut pending_space = false;
    for g in gs {
        if g.ch == ' ' {
            pending_space = true;
            continue;
        }
        // Glyph height, floored at 1pt so flat glyphs don't zero the thresholds.
        let h = (g.t - g.b).abs().max(1.0);
        let mut new_line = false;
        let mut new_word = false;
        if let Some(p) = prev {
            // LTR wraps reset x leftward (`g.l < p.r`); RTL (Arabic) wraps reset
            // rightward (the new line begins at the far right). A large drop
            // (more than 1.5× line height) is a new line regardless of x.
            let x_reset = if is_arabic(g.ch) || is_arabic(p.ch) {
                g.l > p.r
            } else {
                g.l < p.r
            };
            new_line = (p.b - g.b > h * 0.5 && x_reset) || (p.b - g.b > line_h.max(h) * 1.5);
            // No digit-digit glue here (unlike the prose grouping): table cells in
            // adjacent columns are numeric and a column gap must still split them
            // (`0.965` `0.934`, not `0.9650.934`). Intra-number digits have no gap
            // so they stay together regardless.
            let glued = is_close_punct(g.ch)
                || is_open_punct(p.ch)
                || (p.ch == '.'
                    && !pending_space
                    && (g.ch.is_ascii_digit() || g.ch.is_ascii_lowercase()));
            let word_gap = line_h.max(h) * 0.25;
            new_word = new_line || ((pending_space || g.l - p.r > word_gap) && !glued);
        }
        pending_space = false;
        // A line break always implies a word break, so this also ends lines.
        if new_word && !word.is_empty() {
            cells.push(TextCell {
                text: std::mem::take(&mut word),
                l: wl,
                t: page_h - wt,
                r: wr,
                b: page_h - wb,
            });
            (wl, wb, wr, wt) = inf;
        }
        if new_line {
            line_h = 0.0;
        }
        // Add the glyph to the current word and grow the word box around it.
        word.push(g.ch);
        wl = wl.min(g.l);
        wb = wb.min(g.b);
        wr = wr.max(g.r);
        wt = wt.max(g.t);
        line_h = line_h.max(h);
        prev = Some(g);
    }
    // Flush the trailing word, if any.
    if !word.is_empty() {
        cells.push(TextCell {
            text: word,
            l: wl,
            t: page_h - wt,
            r: wr,
            b: page_h - wb,
        });
    }
    cells
}
617
/// True for characters in the Unicode Arabic block, U+0600 through U+06FF
/// inclusive.
fn is_arabic(c: char) -> bool {
    matches!(c, '\u{0600}'..='\u{06FF}')
}
621
/// True for punctuation that attaches to the text before it: `, . ; ! ?`, the
/// closing brackets `) ] }`, `%`, the ASCII apostrophe, and the curly single
/// quotes U+2019 and U+2018.
fn is_close_punct(c: char) -> bool {
    const CLOSERS: &str = ",.;!?)]}%'\u{2019}\u{2018}";
    CLOSERS.contains(c)
}
628
/// True for punctuation that attaches to the text after it: the opening
/// brackets `( [ {` and `@`.
fn is_open_punct(c: char) -> bool {
    // `@` glues to what follows (`mAP @0.5`, `bpf@zurich`, `@decorator`).
    ['(', '[', '{', '@'].contains(&c)
}
633
/// Move the word being built onto the end of `words`, leaving `word` empty.
/// Does nothing when `word` is already empty.
fn push_word(word: &mut String, words: &mut Vec<String>) {
    if word.is_empty() {
        return;
    }
    let finished = std::mem::take(word);
    words.push(finished);
}
639
640fn push_line(
641 words: &mut Vec<String>,
642 bbox: (f32, f32, f32, f32),
643 page_h: f32,
644 cells: &mut Vec<TextCell>,
645) {
646 if words.is_empty() {
647 return;
648 }
649 let text = std::mem::take(words).join(" ");
650 let (l, b, r, t) = bbox;
651 cells.push(TextCell {
652 text,
653 l,
654 t: page_h - t,
655 r,
656 b: page_h - b,
657 });
658}