fleischwolf-pdf 0.5.0

PDF/image backend for Fleischwolf: pdfium text extraction + ONNX layout/table/OCR pipeline.
Documentation
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
//! Dump the Rust parser's word cells for a page: `<pdf> <page_index_0based>`.
//! TSV: l<tab>t<tab>r<tab>b<tab>text  (top-left page-point coords).
/// Entry point: writes one TSV row (`l`, `t`, `r`, `b`, `text`) per word cell of the
/// requested page to stdout.
///
/// Command-line arguments:
/// 1. path to the PDF file (required);
/// 2. 0-based page index (optional, defaults to `0` when omitted).
///
/// All diagnostics go to stderr so stdout stays pure TSV. The process exits with
/// status 2 on a usage error (missing path, non-numeric page index) and status 1
/// when the file cannot be read, the page index is out of range, or stdout fails
/// for a reason other than a closed pipe.
fn main() {
    use std::io::Write;

    let mut args = std::env::args().skip(1);

    let path = match args.next() {
        Some(p) => p,
        None => {
            eprintln!("usage: <pdf> [page_index_0based]");
            std::process::exit(2);
        }
    };

    // An omitted index means "first page"; a malformed one is rejected rather than
    // silently falling back to page 0, which would dump the wrong page unnoticed.
    let pi: usize = match args.next() {
        None => 0,
        Some(raw) => match raw.parse() {
            Ok(n) => n,
            Err(e) => {
                eprintln!("invalid page index {:?}: {}", raw, e);
                std::process::exit(2);
            }
        },
    };

    let bytes = match std::fs::read(&path) {
        Ok(b) => b,
        Err(e) => {
            eprintln!("cannot read {}: {}", path, e);
            std::process::exit(1);
        }
    };

    let pages = fleischwolf_pdf::textparse::pdf_words(&bytes);
    let cells = match pages.get(pi) {
        Some((_, _, cells)) => cells,
        None => {
            eprintln!("page index {} is out of range for {}", pi, path);
            std::process::exit(1);
        }
    };

    // Lock stdout once and buffer the rows instead of re-locking per `println!`.
    let mut out = std::io::BufWriter::new(std::io::stdout().lock());
    for c in cells {
        // Tabs and newlines inside a word would break the TSV framing.
        let t = c.text.replace(['\t', '\n'], " ");
        if let Err(e) = writeln!(out, "{:.2}\t{:.2}\t{:.2}\t{:.2}\t{}", c.l, c.t, c.r, c.b, t) {
            // A closed pipe (e.g. `| head`) is a normal way for the consumer to stop.
            if e.kind() == std::io::ErrorKind::BrokenPipe {
                return;
            }
            eprintln!("write error: {}", e);
            std::process::exit(1);
        }
    }
    if let Err(e) = out.flush() {
        if e.kind() != std::io::ErrorKind::BrokenPipe {
            eprintln!("write error: {}", e);
            std::process::exit(1);
        }
    }
}