1mod assemble;
13mod dp_lines;
14pub mod layout;
15mod mets;
16mod ocr;
17pub mod pdfium_backend;
18pub mod resample;
19pub mod tableformer;
20pub mod textparse;
21
22use std::fmt;
23
24use fleischwolf_core::DoclingDocument;
25
26pub use mets::convert_mets_gbs;
27pub use pdfium_backend::{PdfDocument, PdfPage, TextCell};
28
/// Errors produced while converting a PDF or page image into a document.
///
/// Every variant carries a human-readable message; the `Display` impl adds
/// the `pdf: ` prefix when the error is rendered.
#[derive(Debug)]
pub enum PdfError {
    /// Failure reported by pdfium (see the `From<PdfiumError>` impl). Also
    /// used by `Pipeline::convert_image` when the input image cannot be
    /// decoded, in which case the message starts with `image: `.
    Pdfium(String),
    /// The layout model failed to load or to predict regions for a page.
    Layout(String),
    /// The OCR model failed to load or to recognise text on a page.
    Ocr(String),
}
39
40impl fmt::Display for PdfError {
41 fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
42 match self {
43 PdfError::Pdfium(m) => write!(f, "pdf: pdfium error: {m}"),
44 PdfError::Layout(m) => write!(f, "pdf: {m}"),
45 PdfError::Ocr(m) => write!(f, "pdf: {m}"),
46 }
47 }
48}
49
// Marker impl: the default `std::error::Error` methods are used as-is.
// No `source()` override is provided because every variant stores only a
// message string, not an underlying error value.
impl std::error::Error for PdfError {}
51
52impl From<pdfium_render::prelude::PdfiumError> for PdfError {
53 fn from(e: pdfium_render::prelude::PdfiumError) -> Self {
54 PdfError::Pdfium(e.to_string())
55 }
56}
57
/// Returns the thread count to use, honouring an environment override.
///
/// If `FLEISCHWOLF_PDF_THREADS` is set and parses as a `usize` greater than
/// zero, that value wins. Otherwise the result is
/// `std::thread::available_parallelism()`, or `1` when that query fails.
pub(crate) fn intra_threads() -> usize {
    // An unset, non-Unicode or unparsable value counts as "no override".
    let requested = match std::env::var("FLEISCHWOLF_PDF_THREADS") {
        Ok(raw) => raw.parse::<usize>().ok(),
        Err(_) => None,
    };

    match requested {
        // Zero is rejected so a result of 0 threads is never returned.
        Some(count) if count > 0 => count,
        _ => match std::thread::available_parallelism() {
            Ok(cores) => cores.get(),
            Err(_) => 1,
        },
    }
}
72
/// Reusable conversion pipeline that owns the models used on each page.
pub struct Pipeline {
    /// Layout model; loaded in `Pipeline::new` and run on every page.
    layout: layout::LayoutModel,
    /// OCR model; starts as `None` and is loaded the first time a page
    /// arrives with no text cells.
    ocr: Option<ocr::OcrModel>,
    /// Table model as returned by `tableformer::TableFormer::load()`; when it
    /// is `None`, no table rows are predicted.
    tables: Option<tableformer::TableFormer>,
}
82
83impl Pipeline {
84 pub fn new() -> Result<Self, PdfError> {
88 Ok(Self {
89 layout: layout::LayoutModel::load().map_err(PdfError::Layout)?,
90 ocr: None,
91 tables: tableformer::TableFormer::load(),
92 })
93 }
94
95 pub fn convert(
99 &mut self,
100 bytes: &[u8],
101 password: Option<&str>,
102 name: &str,
103 ) -> Result<DoclingDocument, PdfError> {
104 let mut doc = DoclingDocument::new(name);
109 pdfium_backend::for_each_page(bytes, password, |n, _total, mut page| {
110 self.process_one_page(n, &mut page, &mut doc)
111 })?;
112 assemble::merge_continuations(&mut doc.nodes);
113 Ok(doc)
114 }
115
116 pub fn convert_image(&mut self, bytes: &[u8], name: &str) -> Result<DoclingDocument, PdfError> {
119 let image = image::load_from_memory(bytes)
120 .map_err(|e| PdfError::Pdfium(format!("image: {e}")))?
121 .into_rgb8();
122 let (w, h) = image.dimensions();
123 let page = PdfPage {
126 width: w as f32,
127 height: h as f32,
128 scale: 1.0,
129 cells: Vec::new(),
130 code_cells: Vec::new(),
131 word_cells: Vec::new(),
132 image,
133 links: Vec::new(),
134 };
135 self.process_pages(vec![page], name)
136 }
137
138 fn process_one_page(
140 &mut self,
141 n: usize,
142 page: &mut PdfPage,
143 doc: &mut DoclingDocument,
144 ) -> Result<(), PdfError> {
145 let regions = self
146 .layout
147 .predict(&page.image, page.width, page.height)
148 .map_err(|e| PdfError::Layout(format!("page {}: {e}", n + 1)))?;
149 let mut regions = assemble::resolve(regions);
151 assemble::add_orphan_regions(&mut regions, &page.cells);
153 assemble::drop_false_pictures(&mut regions, &page.cells, page.width, page.height);
155 if page.cells.is_empty() {
157 if self.ocr.is_none() {
158 self.ocr = Some(ocr::OcrModel::load().map_err(PdfError::Ocr)?);
159 }
160 let cells = self
161 .ocr
162 .as_mut()
163 .unwrap()
164 .ocr_page(&page.image, ®ions, page.scale)
165 .map_err(|e| PdfError::Ocr(format!("page {}: {e}", n + 1)))?;
166 page.cells = cells;
167 }
168 let mut table_rows: Vec<Option<Vec<Vec<String>>>> = vec![None; regions.len()];
170 if let Some(tf) = self.tables.as_mut() {
171 for (i, r) in regions.iter().enumerate() {
172 if r.label == "table" {
173 table_rows[i] = tf.predict_table_rows(
174 &page.image,
175 page.height,
176 [r.l, r.t, r.r, r.b],
177 &page.word_cells,
178 );
179 }
180 }
181 }
182 assemble::assemble_page(page, regions, &table_rows, doc);
183 Ok(())
184 }
185
186 fn process_pages(
189 &mut self,
190 mut pages: Vec<PdfPage>,
191 name: &str,
192 ) -> Result<DoclingDocument, PdfError> {
193 let mut doc = DoclingDocument::new(name);
194 for (n, page) in pages.iter_mut().enumerate() {
195 self.process_one_page(n, page, &mut doc)?;
196 }
197 assemble::merge_continuations(&mut doc.nodes);
198 Ok(doc)
199 }
200}
201
202pub fn convert(
205 bytes: &[u8],
206 password: Option<&str>,
207 name: &str,
208) -> Result<DoclingDocument, PdfError> {
209 Pipeline::new()?.convert(bytes, password, name)
210}
211
212pub fn convert_image(bytes: &[u8], name: &str) -> Result<DoclingDocument, PdfError> {
214 Pipeline::new()?.convert_image(bytes, name)
215}
216
217pub fn convert_pages(pages: Vec<PdfPage>, name: &str) -> Result<DoclingDocument, PdfError> {
220 Pipeline::new()?.process_pages(pages, name)
221}