mod assemble;
mod dp_lines;
pub mod layout;
mod mets;
mod ocr;
pub mod pdfium_backend;
pub mod resample;
pub mod tableformer;
pub mod textparse;
use std::fmt;
use fleischwolf_core::DoclingDocument;
pub use mets::convert_mets_gbs;
pub use pdfium_backend::{PdfDocument, PdfPage, TextCell};
/// Errors produced while converting a PDF or image into a document.
///
/// Every variant carries a human-readable message; the variant identifies
/// which stage of the pipeline reported the failure.
#[derive(Debug)]
pub enum PdfError {
    /// Failure reported by the pdfium backend (also used for image decoding
    /// failures in `Pipeline::convert_image`).
    Pdfium(String),
    /// Failure while loading or running the layout model.
    Layout(String),
    /// Failure while loading or running the OCR model.
    Ocr(String),
}
impl fmt::Display for PdfError {
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
match self {
PdfError::Pdfium(m) => write!(f, "pdf: pdfium error: {m}"),
PdfError::Layout(m) => write!(f, "pdf: {m}"),
PdfError::Ocr(m) => write!(f, "pdf: {m}"),
}
}
}
// Marks `PdfError` as a standard error type; the empty body relies on the
// trait's default methods, so no underlying source error is exposed.
impl std::error::Error for PdfError {}
/// Lets `?` turn a pdfium error into a [`PdfError::Pdfium`].
///
/// Only the error's display text is kept; the original error value is dropped.
impl From<pdfium_render::prelude::PdfiumError> for PdfError {
    fn from(source: pdfium_render::prelude::PdfiumError) -> Self {
        let message = source.to_string();
        Self::Pdfium(message)
    }
}
/// Returns the number of threads to use, as a value that is always at least 1.
///
/// The `FLEISCHWOLF_PDF_THREADS` environment variable takes precedence when it
/// holds a positive integer. Surrounding whitespace is ignored, so a value
/// such as `" 4 "` is honoured rather than silently discarded. When the
/// variable is unset, not valid Unicode, not a number, or zero, the result
/// falls back to [`std::thread::available_parallelism`], and to `1` if that
/// query fails.
pub(crate) fn intra_threads() -> usize {
    let configured = std::env::var("FLEISCHWOLF_PDF_THREADS")
        .ok()
        // Trim first: `usize::from_str` rejects leading/trailing whitespace.
        .and_then(|raw| raw.trim().parse::<usize>().ok())
        // Zero threads is meaningless; treat it as "not configured".
        .filter(|&n| n > 0);

    configured.unwrap_or_else(|| {
        std::thread::available_parallelism()
            .map(|n| n.get())
            .unwrap_or(1)
    })
}
/// Reusable conversion pipeline that owns the models needed to turn pages
/// into a `DoclingDocument`.
///
/// Constructing it once and reusing it across documents avoids reloading the
/// models for every conversion.
pub struct Pipeline {
    /// Layout model used to predict regions on every page.
    layout: layout::LayoutModel,
    /// OCR model; starts out as `None` and is loaded the first time a page
    /// without text cells is processed.
    ocr: Option<ocr::OcrModel>,
    /// Table-structure model; `None` when `TableFormer::load` yields nothing,
    /// in which case table regions get no row data.
    tables: Option<tableformer::TableFormer>,
}
impl Pipeline {
    /// Creates a pipeline, loading the layout model and, if available, the
    /// table-structure model.
    ///
    /// The OCR model is not loaded here; it is loaded on demand the first time
    /// a page without text cells is processed.
    ///
    /// # Errors
    ///
    /// Returns [`PdfError::Layout`] when the layout model cannot be loaded.
    pub fn new() -> Result<Self, PdfError> {
        Ok(Self {
            layout: layout::LayoutModel::load().map_err(PdfError::Layout)?,
            ocr: None,
            tables: tableformer::TableFormer::load(),
        })
    }

    /// Converts a PDF held in memory into a [`DoclingDocument`] named `name`.
    ///
    /// Pages are handed to the pipeline one at a time through
    /// `pdfium_backend::for_each_page`. After the last page, continuations are
    /// merged across the accumulated document nodes.
    ///
    /// # Errors
    ///
    /// Propagates any error from the pdfium backend, and any layout or OCR
    /// error raised while processing a page.
    pub fn convert(
        &mut self,
        bytes: &[u8],
        password: Option<&str>,
        name: &str,
    ) -> Result<DoclingDocument, PdfError> {
        let mut doc = DoclingDocument::new(name);
        pdfium_backend::for_each_page(bytes, password, |n, _total, mut page| {
            self.process_one_page(n, &mut page, &mut doc)
        })?;
        assemble::merge_continuations(&mut doc.nodes);
        Ok(doc)
    }

    /// Converts a single encoded image into a [`DoclingDocument`] named `name`.
    ///
    /// The image is decoded to RGB8 and wrapped in a synthetic page whose size
    /// equals the pixel dimensions, with scale `1.0`. The page has no text
    /// cells, so page processing takes the OCR path.
    ///
    /// # Errors
    ///
    /// A decoding failure is reported as [`PdfError::Pdfium`] with an
    /// `image: ` prefix. Layout and OCR errors are propagated unchanged.
    pub fn convert_image(&mut self, bytes: &[u8], name: &str) -> Result<DoclingDocument, PdfError> {
        let image = image::load_from_memory(bytes)
            .map_err(|e| PdfError::Pdfium(format!("image: {e}")))?
            .into_rgb8();
        let (w, h) = image.dimensions();
        let page = PdfPage {
            width: w as f32,
            height: h as f32,
            scale: 1.0,
            cells: Vec::new(),
            code_cells: Vec::new(),
            word_cells: Vec::new(),
            image,
            links: Vec::new(),
        };
        self.process_pages(vec![page], name)
    }

    /// Runs layout, optional OCR and table prediction on one page, then
    /// appends the assembled result to `doc`.
    ///
    /// `n` is the zero-based page index; error messages report it one-based.
    fn process_one_page(
        &mut self,
        n: usize,
        page: &mut PdfPage,
        doc: &mut DoclingDocument,
    ) -> Result<(), PdfError> {
        let regions = self
            .layout
            .predict(&page.image, page.width, page.height)
            .map_err(|e| PdfError::Layout(format!("page {}: {e}", n + 1)))?;
        let mut regions = assemble::resolve(regions);
        assemble::add_orphan_regions(&mut regions, &page.cells);
        assemble::drop_false_pictures(&mut regions, &page.cells, page.width, page.height);

        // A page without any text cells is OCRed. The OCR model is loaded
        // lazily so documents that never need it do not pay for the load.
        if page.cells.is_empty() {
            if self.ocr.is_none() {
                self.ocr = Some(ocr::OcrModel::load().map_err(PdfError::Ocr)?);
            }
            let ocr_model = self
                .ocr
                .as_mut()
                .expect("OCR model is populated by the branch directly above");
            page.cells = ocr_model
                .ocr_page(&page.image, &regions, page.scale)
                .map_err(|e| PdfError::Ocr(format!("page {}: {e}", n + 1)))?;
        }

        // One slot per region; only regions labelled "table" are filled, and
        // only when a table model is available.
        let mut table_rows: Vec<Option<Vec<Vec<String>>>> = vec![None; regions.len()];
        if let Some(tf) = self.tables.as_mut() {
            for (slot, region) in table_rows.iter_mut().zip(regions.iter()) {
                if region.label == "table" {
                    *slot = tf.predict_table_rows(
                        &page.image,
                        page.height,
                        [region.l, region.t, region.r, region.b],
                        &page.word_cells,
                    );
                }
            }
        }

        assemble::assemble_page(page, regions, &table_rows, doc);
        Ok(())
    }

    /// Processes already-extracted pages in order and returns the resulting
    /// document, merging continuations once all pages are assembled.
    ///
    /// # Errors
    ///
    /// Stops at, and returns, the first page-processing error.
    fn process_pages(
        &mut self,
        mut pages: Vec<PdfPage>,
        name: &str,
    ) -> Result<DoclingDocument, PdfError> {
        let mut doc = DoclingDocument::new(name);
        for (n, page) in pages.iter_mut().enumerate() {
            self.process_one_page(n, page, &mut doc)?;
        }
        assemble::merge_continuations(&mut doc.nodes);
        Ok(doc)
    }
}
/// One-shot PDF conversion: builds a fresh [`Pipeline`] and converts `bytes`
/// into a document named `name`, using `password` if one is given.
///
/// A new pipeline, and therefore a fresh model load, is created on every
/// call; hold on to a [`Pipeline`] to convert several documents.
///
/// # Errors
///
/// Returns any error from pipeline construction or from the conversion.
pub fn convert(
    bytes: &[u8],
    password: Option<&str>,
    name: &str,
) -> Result<DoclingDocument, PdfError> {
    let mut pipeline = Pipeline::new()?;
    pipeline.convert(bytes, password, name)
}
/// One-shot image conversion: builds a fresh [`Pipeline`] and converts the
/// encoded image in `bytes` into a document named `name`.
///
/// # Errors
///
/// Returns any error from pipeline construction or from the conversion.
pub fn convert_image(bytes: &[u8], name: &str) -> Result<DoclingDocument, PdfError> {
    let mut pipeline = Pipeline::new()?;
    pipeline.convert_image(bytes, name)
}
/// One-shot conversion of already-extracted pages: builds a fresh
/// [`Pipeline`] and processes `pages` in order into a document named `name`.
///
/// # Errors
///
/// Returns any error from pipeline construction or from page processing.
pub fn convert_pages(pages: Vec<PdfPage>, name: &str) -> Result<DoclingDocument, PdfError> {
    let mut pipeline = Pipeline::new()?;
    pipeline.process_pages(pages, name)
}