1mod assemble;
13mod dp_lines;
14pub mod layout;
15mod mets;
16mod ocr;
17pub mod pdfium_backend;
18pub mod resample;
19pub mod tableformer;
20
21use std::fmt;
22
23use fleischwolf_core::DoclingDocument;
24
25pub use mets::convert_mets_gbs;
26pub use pdfium_backend::{PdfDocument, PdfPage, TextCell};
27
/// Errors produced by this module's PDF / image conversion entry points.
///
/// Every variant carries a ready-to-print message; the `Display` impl only
/// adds a `pdf:` prefix in front of it.
#[derive(Debug)]
pub enum PdfError {
    /// A pdfium failure, stored as the stringified `PdfiumError`.
    /// `Pipeline::convert_image` also uses this variant for image decoding
    /// failures, with an `image: ` prefix in the message.
    Pdfium(String),
    /// The layout model failed to load or failed to predict regions for a page.
    Layout(String),
    /// The OCR model failed to load or failed while recognising a page.
    Ocr(String),
}
38
39impl fmt::Display for PdfError {
40 fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
41 match self {
42 PdfError::Pdfium(m) => write!(f, "pdf: pdfium error: {m}"),
43 PdfError::Layout(m) => write!(f, "pdf: {m}"),
44 PdfError::Ocr(m) => write!(f, "pdf: {m}"),
45 }
46 }
47}
48
// Marker impl: the derived `Debug` and the `Display` impl above are all the
// trait requires. No `source()` override is provided, so the default applies.
impl std::error::Error for PdfError {}
50
51impl From<pdfium_render::prelude::PdfiumError> for PdfError {
52 fn from(e: pdfium_render::prelude::PdfiumError) -> Self {
53 PdfError::Pdfium(e.to_string())
54 }
55}
56
/// Returns the thread count to use, honouring a user override.
///
/// Resolution order:
/// 1. `FLEISCHWOLF_PDF_THREADS`, when it is set and parses as a positive
///    integer. Surrounding whitespace is ignored, so values such as `" 4"`
///    or `"4\n"` are accepted.
/// 2. [`std::thread::available_parallelism`].
/// 3. `1`, when the available parallelism cannot be determined.
///
/// An unset, non-unicode, non-numeric or zero override is ignored rather
/// than reported.
pub(crate) fn intra_threads() -> usize {
    let requested = std::env::var("FLEISCHWOLF_PDF_THREADS")
        .ok()
        // Trim first: `usize::from_str` rejects any leading/trailing whitespace.
        .and_then(|raw| raw.trim().parse::<usize>().ok())
        .filter(|&n| n > 0);

    match requested {
        Some(n) => n,
        None => std::thread::available_parallelism().map_or(1, |n| n.get()),
    }
}
71
/// Reusable conversion pipeline that owns the models used for each page.
pub struct Pipeline {
    /// Layout model; loaded in `Pipeline::new` and used on every page.
    layout: layout::LayoutModel,
    /// OCR model; starts as `None` and is loaded the first time a page with no
    /// text cells is processed.
    ocr: Option<ocr::OcrModel>,
    /// Table model; `None` when `TableFormer::load` returns nothing, in which
    /// case no table rows are predicted.
    tables: Option<tableformer::TableFormer>,
}
81
82impl Pipeline {
83 pub fn new() -> Result<Self, PdfError> {
87 Ok(Self {
88 layout: layout::LayoutModel::load().map_err(PdfError::Layout)?,
89 ocr: None,
90 tables: tableformer::TableFormer::load(),
91 })
92 }
93
94 pub fn convert(
98 &mut self,
99 bytes: &[u8],
100 password: Option<&str>,
101 name: &str,
102 ) -> Result<DoclingDocument, PdfError> {
103 let mut doc = DoclingDocument::new(name);
108 pdfium_backend::for_each_page(bytes, password, |n, _total, mut page| {
109 self.process_one_page(n, &mut page, &mut doc)
110 })?;
111 assemble::merge_continuations(&mut doc.nodes);
112 Ok(doc)
113 }
114
115 pub fn convert_image(&mut self, bytes: &[u8], name: &str) -> Result<DoclingDocument, PdfError> {
118 let image = image::load_from_memory(bytes)
119 .map_err(|e| PdfError::Pdfium(format!("image: {e}")))?
120 .into_rgb8();
121 let (w, h) = image.dimensions();
122 let page = PdfPage {
125 width: w as f32,
126 height: h as f32,
127 scale: 1.0,
128 cells: Vec::new(),
129 code_cells: Vec::new(),
130 word_cells: Vec::new(),
131 image,
132 };
133 self.process_pages(vec![page], name)
134 }
135
136 fn process_one_page(
138 &mut self,
139 n: usize,
140 page: &mut PdfPage,
141 doc: &mut DoclingDocument,
142 ) -> Result<(), PdfError> {
143 let regions = self
144 .layout
145 .predict(&page.image, page.width, page.height)
146 .map_err(|e| PdfError::Layout(format!("page {}: {e}", n + 1)))?;
147 let regions = assemble::resolve(regions);
149 if page.cells.is_empty() {
151 if self.ocr.is_none() {
152 self.ocr = Some(ocr::OcrModel::load().map_err(PdfError::Ocr)?);
153 }
154 let cells = self
155 .ocr
156 .as_mut()
157 .unwrap()
158 .ocr_page(&page.image, ®ions, page.scale)
159 .map_err(|e| PdfError::Ocr(format!("page {}: {e}", n + 1)))?;
160 page.cells = cells;
161 }
162 let mut table_rows: Vec<Option<Vec<Vec<String>>>> = vec![None; regions.len()];
164 if let Some(tf) = self.tables.as_mut() {
165 for (i, r) in regions.iter().enumerate() {
166 if r.label == "table" {
167 table_rows[i] = tf.predict_table_rows(
168 &page.image,
169 page.height,
170 [r.l, r.t, r.r, r.b],
171 &page.word_cells,
172 );
173 }
174 }
175 }
176 assemble::assemble_page(page, regions, &table_rows, doc);
177 Ok(())
178 }
179
180 fn process_pages(
183 &mut self,
184 mut pages: Vec<PdfPage>,
185 name: &str,
186 ) -> Result<DoclingDocument, PdfError> {
187 let mut doc = DoclingDocument::new(name);
188 for (n, page) in pages.iter_mut().enumerate() {
189 self.process_one_page(n, page, &mut doc)?;
190 }
191 assemble::merge_continuations(&mut doc.nodes);
192 Ok(doc)
193 }
194}
195
196pub fn convert(
199 bytes: &[u8],
200 password: Option<&str>,
201 name: &str,
202) -> Result<DoclingDocument, PdfError> {
203 Pipeline::new()?.convert(bytes, password, name)
204}
205
206pub fn convert_image(bytes: &[u8], name: &str) -> Result<DoclingDocument, PdfError> {
208 Pipeline::new()?.convert_image(bytes, name)
209}
210
211pub fn convert_pages(pages: Vec<PdfPage>, name: &str) -> Result<DoclingDocument, PdfError> {
214 Pipeline::new()?.process_pages(pages, name)
215}