pub struct PdfDocument {
pub pages: Vec<PdfPage>,
}
Expand description
A parsed PDF: per-page text cells and page images.
Fields§
§pages: Vec<PdfPage>
Implementations§
Source§impl PdfDocument
impl PdfDocument
Source§pub fn open(bytes: &[u8], password: Option<&str>) -> Result<Self, PdfiumError>
pub fn open(bytes: &[u8], password: Option<&str>) -> Result<Self, PdfiumError>
Parse a PDF from bytes, optionally decrypting with `password`.
Note: this materialises every page’s rendered bitmap in memory at
once. For large documents prefer `for_each_page`, which streams.
Examples found in repository?
examples/dump_stages.rs (line 10)
6fn main() {
7 let path = std::env::args().nth(1).expect("pdf");
8 let out = std::env::args().nth(2).expect("out_dir");
9 let bytes = std::fs::read(&path).expect("read");
10 let doc = PdfDocument::open(&bytes, None).expect("open");
11 let page = &doc.pages[0];
12 page.image.save(format!("{out}/my_page.png")).unwrap();
13 println!("my_page: {}x{}", page.image.width(), page.image.height());
14
15 let sf = 1024.0 / page.image.height() as f32;
16 let pw = (page.image.width() as f32 * sf).round() as u32;
17 let p1024 = imageops::thumbnail(&page.image, pw, 1024);
18 p1024.save(format!("{out}/my_p1024.png")).unwrap();
19 println!("my_p1024: {}x{}", p1024.width(), p1024.height());
20}
More examples
examples/extract.rs (line 6)
3fn main() {
4 let path = std::env::args().nth(1).expect("usage: extract <file.pdf>");
5 let bytes = std::fs::read(&path).expect("read pdf");
6 match fleischwolf_pdf::PdfDocument::open(&bytes, None) {
7 Ok(doc) => {
8 for (i, page) in doc.pages.iter().enumerate() {
9 println!(
10 "--- page {} ({:.0}x{:.0}, {} cells) ---",
11 i + 1,
12 page.width,
13 page.height,
14 page.cells.len()
15 );
16 for c in page.cells.iter().take(8) {
17 println!(
18 " [{:.0},{:.0},{:.0},{:.0}] {:?}",
19 c.l, c.t, c.r, c.b, c.text
20 );
21 }
22 }
23 }
24 Err(e) => eprintln!("ERROR: {e}"),
25 }
26}
examples/layout.rs (line 10)
7fn main() {
8 let path = std::env::args().nth(1).expect("usage: layout <file.pdf>");
9 let bytes = std::fs::read(&path).expect("read pdf");
10 let doc = PdfDocument::open(&bytes, None).expect("open pdf");
11 let mut model = LayoutModel::load().expect("load layout model");
12 for (i, page) in doc.pages.iter().enumerate().take(1) {
13 let regions = model
14 .predict(&page.image, page.width, page.height)
15 .expect("predict");
16 println!(
17 "page {} ({:.0}x{:.0}): {} regions",
18 i + 1,
19 page.width,
20 page.height,
21 regions.len()
22 );
23 let mut rs = regions.clone();
24 rs.sort_by(|a, b| a.t.total_cmp(&b.t));
25 for r in &rs {
26 println!(
27 " {:<16} {:.2} [{:.0},{:.0},{:.0},{:.0}]",
28 r.label, r.score, r.l, r.t, r.r, r.b
29 );
30 }
31 }
32}
examples/dump_regions.rs (line 8)
5fn main() {
6 let path = std::env::args().nth(1).expect("pdf");
7 let bytes = std::fs::read(&path).expect("read");
8 let doc = PdfDocument::open(&bytes, None).expect("open");
9 let mut layout = LayoutModel::load().expect("layout");
10 for (pi, page) in doc.pages.iter().enumerate() {
11 let regions = layout
12 .predict(&page.image, page.width, page.height)
13 .expect("layout");
14 for r in &regions {
15 // crude text: cells whose center is inside the region
16 let txt: String = page
17 .cells
18 .iter()
19 .filter(|c| {
20 let (cx, cy) = ((c.l + c.r) / 2.0, (c.t + c.b) / 2.0);
21 cx >= r.l && cx <= r.r && cy >= r.t && cy <= r.b
22 })
23 .map(|c| c.text.trim())
24 .collect::<Vec<_>>()
25 .join(" ");
26 let tail: String = txt
27 .chars()
28 .rev()
29 .take(40)
30 .collect::<Vec<_>>()
31 .into_iter()
32 .rev()
33 .collect();
34 println!(
35 "p{} {:>14} t={:6.1} b={:6.1} | …{}",
36 pi + 1,
37 r.label,
38 r.t,
39 r.b,
40 tail
41 );
42 }
43 // raw line cells in extraction order (to inspect RTL ordering)
44 if std::env::var("DUMP_CELLS").is_ok() {
45 for (ci, c) in page.cells.iter().enumerate() {
46 let snip: String = c.text.chars().take(300).collect();
47 println!(
48 " CELL[{ci}] t={:6.1} l={:6.1} r={:6.1} | {}",
49 c.t, c.l, c.r, snip
50 );
51 }
52 }
53 }
54}
examples/tf_otsl.rs (line 27)
24fn main() {
25 let path = std::env::args().nth(1).expect("usage: tf_otsl <pdf>");
26 let bytes = std::fs::read(&path).expect("read");
27 let doc = PdfDocument::open(&bytes, None).expect("open");
28 let mut layout = LayoutModel::load().expect("layout");
29 let mut tf = TableFormer::load().expect("tableformer models missing");
30 for (pi, page) in doc.pages.iter().enumerate() {
31 let regions = layout
32 .predict(&page.image, page.width, page.height)
33 .expect("layout");
34 // docling resizes the whole page to 1024px height (cv2.INTER_AREA), then
35 // crops the table bbox out of *that*. Replicate exactly.
36 let sf = 1024.0 / page.image.height() as f32;
37 let pw1024 = (page.image.width() as f32 * sf) as u32; // docling: int(w*r)
38 let page1024 = fleischwolf_pdf::resample::inter_area(&page.image, pw1024, 1024);
39 for r in regions.iter().filter(|r| r.label == "table") {
40 // bbox (points) → 1024px-page coords: scale*sf = 1024/page_h_pt;
41 // docling rounds the crop edges.
42 let k = 1024.0 / page.height;
43 let x = (r.l * k).round().max(0.0) as u32;
44 let y = (r.t * k).round().max(0.0) as u32;
45 let x2 = (r.r * k).round() as u32;
46 let y2 = (r.b * k).round() as u32;
47 let (w, h) = (x2 - x, y2 - y);
48 let crop = imageops::crop_imm(&page1024, x, y, w, h).to_image();
49 let cells = tf.predict_table_structure(&crop).expect("predict");
50 println!(
51 "page {} table {}x{}px -> {} cells",
52 pi + 1,
53 w,
54 h,
55 cells.len()
56 );
57 for c in &cells {
58 println!(
59 " r{} c{} {}x{} {} | cxcywh {:.4} {:.4} {:.4} {:.4}",
60 c.row,
61 c.col,
62 c.colspan,
63 c.rowspan,
64 name(c.tag),
65 c.cx,
66 c.cy,
67 c.w,
68 c.h
69 );
70 }
71 }
72 }
73}
Auto Trait Implementations§
impl Freeze for PdfDocument
impl RefUnwindSafe for PdfDocument
impl Send for PdfDocument
impl Sync for PdfDocument
impl Unpin for PdfDocument
impl UnsafeUnpin for PdfDocument
impl UnwindSafe for PdfDocument
Blanket Implementations§
Source§impl<T> BorrowMut<T> for T
where
T: ?Sized,
impl<T> BorrowMut<T> for T
where
T: ?Sized,
Source§fn borrow_mut(&mut self) -> &mut T
fn borrow_mut(&mut self) -> &mut T
Mutably borrows from an owned value. Read more
impl<ST, DT> CastableFrom<ST, Initialized, Initialized> for DT
impl<ST, DT> CastableFrom<ST, Uninit, Uninit> for DT
Source§impl<T> IntoEither for T
impl<T> IntoEither for T
Source§fn into_either(self, into_left: bool) -> Either<Self, Self>
fn into_either(self, into_left: bool) -> Either<Self, Self>
Converts `self` into a `Left` variant of `Either<Self, Self>` if `into_left` is `true`.
Converts `self` into a `Right` variant of `Either<Self, Self>` otherwise. Read more
Source§fn into_either_with<F>(self, into_left: F) -> Either<Self, Self>
fn into_either_with<F>(self, into_left: F) -> Either<Self, Self>
Converts `self` into a `Left` variant of `Either<Self, Self>` if `into_left(&self)` returns `true`.
Converts `self` into a `Right` variant of `Either<Self, Self>` otherwise. Read more