1mod assemble;
13pub mod layout;
14mod mets;
15mod ocr;
16mod pdfium_backend;
17
18use std::fmt;
19
20use fleischwolf_core::DoclingDocument;
21
22pub use mets::convert_mets_gbs;
23pub use pdfium_backend::{PdfDocument, PdfPage, TextCell};
24
/// Errors produced by this crate while turning a PDF or image into a document.
///
/// Every variant carries a pre-formatted, human-readable message; the
/// `Display` impl adds a `pdf: ` prefix.
#[derive(Debug)]
pub enum PdfError {
    /// A failure reported by pdfium (see the `From<PdfiumError>` conversion).
    /// `Pipeline::convert_image` also uses this variant for image decoding
    /// failures, with an `image: ` prefix in the message.
    Pdfium(String),
    /// The layout model failed to load or failed to predict regions for a page.
    Layout(String),
    /// The OCR model failed to load or failed to recognise text on a page.
    Ocr(String),
}
35
36impl fmt::Display for PdfError {
37 fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
38 match self {
39 PdfError::Pdfium(m) => write!(f, "pdf: pdfium error: {m}"),
40 PdfError::Layout(m) => write!(f, "pdf: {m}"),
41 PdfError::Ocr(m) => write!(f, "pdf: {m}"),
42 }
43 }
44}
45
// The default `Error` methods are sufficient here: every variant carries only
// a message string, so there is no underlying `source` to expose.
impl std::error::Error for PdfError {}
47
48impl From<pdfium_render::prelude::PdfiumError> for PdfError {
49 fn from(e: pdfium_render::prelude::PdfiumError) -> Self {
50 PdfError::Pdfium(e.to_string())
51 }
52}
53
/// Returns the thread count this crate should use.
///
/// The `FLEISCHWOLF_PDF_THREADS` environment variable takes precedence when it
/// holds a positive integer. Surrounding whitespace (for example a trailing
/// newline from a shell script or env file) is ignored. An unset, non-numeric,
/// or zero value falls back to [`std::thread::available_parallelism`], and to
/// `1` if that cannot be determined.
///
/// NOTE(review): the name suggests this sizes intra-op threading for model
/// inference — confirm at the call sites.
pub(crate) fn intra_threads() -> usize {
    std::env::var("FLEISCHWOLF_PDF_THREADS")
        .ok()
        // Trim first: `usize::from_str` rejects any surrounding whitespace,
        // which would otherwise silently discard the override.
        .and_then(|raw| raw.trim().parse::<usize>().ok())
        .filter(|&count| count > 0)
        .unwrap_or_else(|| std::thread::available_parallelism().map_or(1, |count| count.get()))
}
68
/// Reusable conversion pipeline that owns the models used for each page.
///
/// Building one via `Pipeline::new` loads the layout model up front; the OCR
/// model starts out as `None` and is filled in the first time a page without
/// text cells is processed.
pub struct Pipeline {
    /// Layout model used to predict the regions of every page.
    layout: layout::LayoutModel,
    /// OCR model; `None` until a page with no text cells needs it.
    ocr: Option<ocr::OcrModel>,
}
75
76impl Pipeline {
77 pub fn new() -> Result<Self, PdfError> {
79 Ok(Self {
80 layout: layout::LayoutModel::load().map_err(PdfError::Layout)?,
81 ocr: None,
82 })
83 }
84
85 pub fn convert(
89 &mut self,
90 bytes: &[u8],
91 password: Option<&str>,
92 name: &str,
93 ) -> Result<DoclingDocument, PdfError> {
94 let mut doc = DoclingDocument::new(name);
99 pdfium_backend::for_each_page(bytes, password, |n, _total, mut page| {
100 self.process_one_page(n, &mut page, &mut doc)
101 })?;
102 Ok(doc)
103 }
104
105 pub fn convert_image(&mut self, bytes: &[u8], name: &str) -> Result<DoclingDocument, PdfError> {
108 let image = image::load_from_memory(bytes)
109 .map_err(|e| PdfError::Pdfium(format!("image: {e}")))?
110 .into_rgb8();
111 let (w, h) = image.dimensions();
112 let page = PdfPage {
115 width: w as f32,
116 height: h as f32,
117 scale: 1.0,
118 cells: Vec::new(),
119 image,
120 };
121 self.process_pages(vec![page], name)
122 }
123
124 fn process_one_page(
126 &mut self,
127 n: usize,
128 page: &mut PdfPage,
129 doc: &mut DoclingDocument,
130 ) -> Result<(), PdfError> {
131 let regions = self
132 .layout
133 .predict(&page.image, page.width, page.height)
134 .map_err(|e| PdfError::Layout(format!("page {}: {e}", n + 1)))?;
135 let regions = assemble::resolve(regions);
137 if page.cells.is_empty() {
139 if self.ocr.is_none() {
140 self.ocr = Some(ocr::OcrModel::load().map_err(PdfError::Ocr)?);
141 }
142 let cells = self
143 .ocr
144 .as_mut()
145 .unwrap()
146 .ocr_page(&page.image, ®ions, page.scale)
147 .map_err(|e| PdfError::Ocr(format!("page {}: {e}", n + 1)))?;
148 page.cells = cells;
149 }
150 assemble::assemble_page(page, regions, doc);
151 Ok(())
152 }
153
154 fn process_pages(
157 &mut self,
158 mut pages: Vec<PdfPage>,
159 name: &str,
160 ) -> Result<DoclingDocument, PdfError> {
161 let mut doc = DoclingDocument::new(name);
162 for (n, page) in pages.iter_mut().enumerate() {
163 self.process_one_page(n, page, &mut doc)?;
164 }
165 Ok(doc)
166 }
167}
168
169pub fn convert(
172 bytes: &[u8],
173 password: Option<&str>,
174 name: &str,
175) -> Result<DoclingDocument, PdfError> {
176 Pipeline::new()?.convert(bytes, password, name)
177}
178
179pub fn convert_image(bytes: &[u8], name: &str) -> Result<DoclingDocument, PdfError> {
181 Pipeline::new()?.convert_image(bytes, name)
182}
183
184pub fn convert_pages(pages: Vec<PdfPage>, name: &str) -> Result<DoclingDocument, PdfError> {
187 Pipeline::new()?.process_pages(pages, name)
188}