pub fn pdf_words(bytes: &[u8]) -> Vec<(f32, f32, Vec<TextCell>)>
Debug/diagnostic entry point: for each page of a PDF, returns a
(width, height, word cells) tuple, where the word cells are produced by
running the Rust parser’s glyphs through the docling-parse word grouping.
Used to compare the parser’s word cells against docling-parse’s word_cells
oracle (roadmap item 6).
Examples found in repository?
examples/word_cells.rs (line 10)
 3  fn main() {
 4      let path = std::env::args().nth(1).expect("pdf");
 5      let pi: usize = std::env::args()
 6          .nth(2)
 7          .and_then(|s| s.parse().ok())
 8          .unwrap_or(0);
 9      let bytes = std::fs::read(&path).unwrap();
10      let pages = fleischwolf_pdf::textparse::pdf_words(&bytes);
11      if let Some((_, _, cells)) = pages.get(pi) {
12          for c in cells {
13              let t = c.text.replace(['\t', '\n'], " ");
14              println!("{:.2}\t{:.2}\t{:.2}\t{:.2}\t{}", c.l, c.t, c.r, c.b, t);
15          }
16      }
17  }