Skip to main content

dump_regions/
dump_regions.rs

1//! Dump layout regions (label, bbox, text) for debugging reading order.
2use fleischwolf_pdf::layout::LayoutModel;
3use fleischwolf_pdf::PdfDocument;
4
5fn main() {
6    let path = std::env::args().nth(1).expect("pdf");
7    let bytes = std::fs::read(&path).expect("read");
8    let doc = PdfDocument::open(&bytes, None).expect("open");
9    let mut layout = LayoutModel::load().expect("layout");
10    for (pi, page) in doc.pages.iter().enumerate() {
11        let regions = layout
12            .predict(&page.image, page.width, page.height)
13            .expect("layout");
14        for r in &regions {
15            // crude text: cells whose center is inside the region
16            let txt: String = page
17                .cells
18                .iter()
19                .filter(|c| {
20                    let (cx, cy) = ((c.l + c.r) / 2.0, (c.t + c.b) / 2.0);
21                    cx >= r.l && cx <= r.r && cy >= r.t && cy <= r.b
22                })
23                .map(|c| c.text.trim())
24                .collect::<Vec<_>>()
25                .join(" ");
26            let tail: String = txt
27                .chars()
28                .rev()
29                .take(40)
30                .collect::<Vec<_>>()
31                .into_iter()
32                .rev()
33                .collect();
34            println!(
35                "p{} {:>14} t={:6.1} b={:6.1} | …{}",
36                pi + 1,
37                r.label,
38                r.t,
39                r.b,
40                tail
41            );
42        }
43        // raw line cells in extraction order (to inspect RTL ordering)
44        if std::env::var("DUMP_CELLS").is_ok() {
45            for (ci, c) in page.cells.iter().enumerate() {
46                let snip: String = c.text.chars().take(300).collect();
47                println!(
48                    "   CELL[{ci}] t={:6.1} l={:6.1} r={:6.1} | {}",
49                    c.t, c.l, c.r, snip
50                );
51            }
52        }
53    }
54}