pub fn pdf_textlines(bytes: &[u8]) -> Vec<(f32, f32, Vec<TextCell>)>
Public entry point: returns one `(width, height, line cells)` tuple per page
of a PDF, produced by the Rust text parser followed by the docling-parse line
sanitizer. Used by the pipeline and by the `textparse_dump` example.
Examples found in repository?
examples/textparse_dump.rs (line 10)
4fn main() {
5 let path = std::env::args()
6 .nth(1)
7 .expect("usage: textparse_dump <pdf> [needle]");
8 let needle = std::env::args().nth(2);
9 let bytes = std::fs::read(&path).expect("read");
10 let pages = fleischwolf_pdf::textparse::pdf_textlines(&bytes);
11 // TSV mode: emit `pageidx\tl\tt\tr\tb\ttext` for the injection harness.
12 if std::env::var("TSV_OUT").is_ok() {
13 for (pi, (_w, _h, cells)) in pages.iter().enumerate() {
14 for c in cells {
15 let t = c.text.replace(['\t', '\n'], " ");
16 println!(
17 "{}\t{:.3}\t{:.3}\t{:.3}\t{:.3}\t{}",
18 pi, c.l, c.t, c.r, c.b, t
19 );
20 }
21 }
22 return;
23 }
24 for (pi, (w, h, cells)) in pages.iter().enumerate() {
25 println!(
26 "page {} ({:.0}x{:.0}) {} line cells",
27 pi + 1,
28 w,
29 h,
30 cells.len()
31 );
32 for c in cells {
33 if let Some(n) = &needle {
34 if !c.text.contains(n.as_str()) {
35 continue;
36 }
37 }
38 println!(" l={:6.1} t={:6.1} r={:6.1} | {}", c.l, c.t, c.r, c.text);
39 }
40 if pi == 0 {
41 break;
42 }
43 }
44}