1mod assemble;
13pub mod layout;
14mod mets;
15mod ocr;
16mod pdfium_backend;
17
18use std::fmt;
19
20use fleischwolf_core::DoclingDocument;
21
22pub use mets::convert_mets_gbs;
23pub use pdfium_backend::{PdfDocument, PdfPage, TextCell};
24
/// Errors produced while converting a PDF or a page image into a document.
///
/// Every variant wraps a human-readable message; the `Display` implementation
/// adds the `pdf:` prefix when the error is rendered.
#[derive(Debug)]
pub enum PdfError {
    /// The PDF could not be opened, or an input image could not be decoded.
    Pdfium(String),
    /// The layout model failed to load, or failed to predict regions for a page.
    Layout(String),
    /// The OCR model failed to load, or failed to recognise text on a page.
    Ocr(String),
}
35
36impl fmt::Display for PdfError {
37 fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
38 match self {
39 PdfError::Pdfium(m) => write!(f, "pdf: pdfium error: {m}"),
40 PdfError::Layout(m) => write!(f, "pdf: {m}"),
41 PdfError::Ocr(m) => write!(f, "pdf: {m}"),
42 }
43 }
44}
45
46impl std::error::Error for PdfError {}
47
48pub struct Pipeline {
51 layout: layout::LayoutModel,
52 ocr: Option<ocr::OcrModel>,
53}
54
55impl Pipeline {
56 pub fn new() -> Result<Self, PdfError> {
58 Ok(Self {
59 layout: layout::LayoutModel::load().map_err(PdfError::Layout)?,
60 ocr: None,
61 })
62 }
63
64 pub fn convert(
68 &mut self,
69 bytes: &[u8],
70 password: Option<&str>,
71 name: &str,
72 ) -> Result<DoclingDocument, PdfError> {
73 let parsed =
74 PdfDocument::open(bytes, password).map_err(|e| PdfError::Pdfium(e.to_string()))?;
75 self.process_pages(parsed.pages, name)
76 }
77
78 pub fn convert_image(&mut self, bytes: &[u8], name: &str) -> Result<DoclingDocument, PdfError> {
81 let image = image::load_from_memory(bytes)
82 .map_err(|e| PdfError::Pdfium(format!("image: {e}")))?
83 .into_rgb8();
84 let (w, h) = image.dimensions();
85 let page = PdfPage {
88 width: w as f32,
89 height: h as f32,
90 scale: 1.0,
91 cells: Vec::new(),
92 image,
93 };
94 self.process_pages(vec![page], name)
95 }
96
97 fn process_pages(
99 &mut self,
100 mut pages: Vec<PdfPage>,
101 name: &str,
102 ) -> Result<DoclingDocument, PdfError> {
103 let mut doc = DoclingDocument::new(name);
104 for (n, page) in pages.iter_mut().enumerate() {
105 let regions = self
106 .layout
107 .predict(&page.image, page.width, page.height)
108 .map_err(|e| PdfError::Layout(format!("page {}: {e}", n + 1)))?;
109 let regions = assemble::resolve(regions);
111 if page.cells.is_empty() {
113 if self.ocr.is_none() {
114 self.ocr = Some(ocr::OcrModel::load().map_err(PdfError::Ocr)?);
115 }
116 let cells = self
117 .ocr
118 .as_mut()
119 .unwrap()
120 .ocr_page(&page.image, ®ions, page.scale)
121 .map_err(|e| PdfError::Ocr(format!("page {}: {e}", n + 1)))?;
122 page.cells = cells;
123 }
124 assemble::assemble_page(page, regions, &mut doc);
125 }
126 Ok(doc)
127 }
128}
129
130pub fn convert(
133 bytes: &[u8],
134 password: Option<&str>,
135 name: &str,
136) -> Result<DoclingDocument, PdfError> {
137 Pipeline::new()?.convert(bytes, password, name)
138}
139
140pub fn convert_image(bytes: &[u8], name: &str) -> Result<DoclingDocument, PdfError> {
142 Pipeline::new()?.convert_image(bytes, name)
143}
144
145pub fn convert_pages(pages: Vec<PdfPage>, name: &str) -> Result<DoclingDocument, PdfError> {
148 Pipeline::new()?.process_pages(pages, name)
149}