Skip to main content

pdf_textlines

Function pdf_textlines 

Source
pub fn pdf_textlines(bytes: &[u8]) -> Vec<(f32, f32, Vec<TextCell>)>
Expand description

Public entry point: returns, for each page of a PDF, a `(width, height, line cells)` tuple, using the Rust text parser together with the docling-parse line sanitizer. Used by the pipeline and by the `textparse_dump` example.

Examples found in repository?
examples/textparse_dump.rs (line 10)
4fn main() {
5    let path = std::env::args()
6        .nth(1)
7        .expect("usage: textparse_dump <pdf> [needle]");
8    let needle = std::env::args().nth(2);
9    let bytes = std::fs::read(&path).expect("read");
10    let pages = fleischwolf_pdf::textparse::pdf_textlines(&bytes);
11    // TSV mode: emit `pageidx\tl\tt\tr\tb\ttext` for the injection harness.
12    if std::env::var("TSV_OUT").is_ok() {
13        for (pi, (_w, _h, cells)) in pages.iter().enumerate() {
14            for c in cells {
15                let t = c.text.replace(['\t', '\n'], " ");
16                println!(
17                    "{}\t{:.3}\t{:.3}\t{:.3}\t{:.3}\t{}",
18                    pi, c.l, c.t, c.r, c.b, t
19                );
20            }
21        }
22        return;
23    }
24    for (pi, (w, h, cells)) in pages.iter().enumerate() {
25        println!(
26            "page {} ({:.0}x{:.0}) {} line cells",
27            pi + 1,
28            w,
29            h,
30            cells.len()
31        );
32        for c in cells {
33            if let Some(n) = &needle {
34                if !c.text.contains(n.as_str()) {
35                    continue;
36                }
37            }
38            println!("  l={:6.1} t={:6.1} r={:6.1} | {}", c.l, c.t, c.r, c.text);
39        }
40        if pi == 0 {
41            break;
42        }
43    }
44}