mod backend;
mod config;
mod error;
mod postprocess;
mod router;
pub use backend::{
Backend, BackendCapabilities, BackendInfo, ExtractedDocument, ExtractedPage, MarkerBackend,
NougatBackend, PdfBackend,
};
pub use config::{PdfConfig, PdfConfigBuilder};
pub use error::{PdfError, PdfResult};
pub use postprocess::{PostProcessor, PostProcessorConfig};
pub use router::{PdfRouter, RouterConfig, RouterDecision};
use std::path::Path;
/// High-level PDF extraction front end.
///
/// Pairs a [`PdfRouter`], which decides which backend handles a given file,
/// with a [`PostProcessor`] that is applied to every extracted document.
pub struct PdfExtractor {
    /// Picks the backend used for each input path.
    router: PdfRouter,
    /// Runs over each document after a backend has produced it.
    postprocessor: PostProcessor,
}
impl PdfExtractor {
    /// Builds an extractor from `config`.
    ///
    /// The router is created from `config.router` and the post-processor from
    /// `config.postprocess`. `config` is owned by this function, so both
    /// sub-configurations are moved out of it instead of being cloned.
    ///
    /// # Errors
    ///
    /// Propagates any error returned by [`PdfRouter::new`].
    pub fn new(config: PdfConfig) -> PdfResult<Self> {
        let router = PdfRouter::new(config.router)?;
        let postprocessor = PostProcessor::new(config.postprocess);
        Ok(Self {
            router,
            postprocessor,
        })
    }

    /// Extracts a single PDF located at `path`.
    ///
    /// The router is asked which backend to use, a fresh instance of that
    /// backend performs the extraction (receiving `progress` by reference),
    /// and the resulting document is then run through the post-processor.
    ///
    /// # Errors
    ///
    /// Returns an error if routing fails, if the selected backend cannot be
    /// constructed or fails to extract, or if post-processing fails.
    /// [`PdfError::Configuration`] is returned when the router's decision is
    /// still [`Backend::Auto`], which it is expected to have resolved.
    pub fn extract<P, F>(&self, path: P, progress: F) -> PdfResult<ExtractedDocument>
    where
        P: AsRef<Path>,
        F: Fn(ExtractionProgress) + Send + Sync,
    {
        let path = path.as_ref();
        let decision = self.router.route(path)?;
        let doc = match decision.backend {
            Backend::Marker => {
                let backend = MarkerBackend::new()?;
                backend.extract(path, &progress)?
            }
            Backend::Nougat => {
                let backend = NougatBackend::new()?;
                backend.extract(path, &progress)?
            }
            Backend::Auto => {
                return Err(PdfError::Configuration(
                    "Auto backend should have been resolved by router".to_string(),
                ));
            }
        };
        let doc = self.postprocessor.process(doc)?;
        Ok(doc)
    }

    /// Extracts every path in `paths` in parallel using rayon.
    ///
    /// The returned vector holds one result per input, in the same order as
    /// `paths`; a failure for one file does not stop the others.
    ///
    /// `progress` is invoked once per file after its extraction finishes.
    /// `BatchProgress::current` is the number of files completed so far
    /// (1-based), not the file's position in `paths`: files finish out of
    /// order under parallel execution, so a position-based value would jump
    /// around instead of counting up. Callbacks run on worker threads, so two
    /// of them may still be observed slightly out of order relative to each
    /// other. Per-page progress of the individual extractions is discarded.
    pub fn extract_batch<P, F>(
        &self,
        paths: &[P],
        progress: F,
    ) -> Vec<PdfResult<ExtractedDocument>>
    where
        P: AsRef<Path> + Sync,
        F: Fn(BatchProgress) + Send + Sync,
    {
        use rayon::prelude::*;
        use std::sync::atomic::{AtomicUsize, Ordering};

        let total = paths.len();
        let completed = AtomicUsize::new(0);
        paths
            .par_iter()
            .map(|path| {
                let result = self.extract(path, |_| {});
                // Relaxed is sufficient: only the counter value itself is
                // consumed, no other memory is published through it.
                let current = completed.fetch_add(1, Ordering::Relaxed) + 1;
                progress(BatchProgress {
                    current,
                    total,
                    path: path.as_ref().to_string_lossy().into_owned(),
                    success: result.is_ok(),
                });
                result
            })
            .collect()
    }
}
/// Progress report for a single document; this is the argument type of the
/// callback accepted by [`PdfExtractor::extract`].
///
/// NOTE(review): the values are filled in by code outside this file, so the
/// exact conventions (0- vs 1-based pages, percent scale) should be confirmed
/// against the backends.
#[derive(Clone, Debug)]
pub struct ExtractionProgress {
    /// Page the extraction is currently on.
    pub current_page: usize,
    /// Number of pages in the document.
    pub total_pages: usize,
    /// Overall completion figure (presumably 0-100; verify against the producers).
    pub percent: f32,
    /// Stage of the extraction this report belongs to.
    pub stage: ExtractionStage,
}
/// Stage of an extraction run, carried in [`ExtractionProgress::stage`].
///
/// NOTE(review): the code that emits these values is not in this file; the
/// precise meaning and ordering of the stages should be confirmed there.
#[derive(Clone, Copy, Debug, Eq, PartialEq)]
pub enum ExtractionStage {
    Analyzing,
    Extracting,
    MathOcr,
    Postprocessing,
    Validating,
}
/// Per-file progress report passed to the callback of
/// [`PdfExtractor::extract_batch`] after each file has been attempted.
#[derive(Clone, Debug)]
pub struct BatchProgress {
    /// Progress counter for this update; `extract_batch` reports values
    /// between 1 and `total` inclusive.
    pub current: usize,
    /// Number of paths in the batch.
    pub total: usize,
    /// Lossy UTF-8 rendering of the path that was just processed.
    pub path: String,
    /// `true` when extraction of `path` returned `Ok`.
    pub success: bool,
}
#[cfg(test)]
mod tests {
    use super::*;

    /// The builder must carry the chosen backend and math-density threshold
    /// through to the router section of the built configuration.
    #[test]
    fn test_config_builder() {
        // Kept as one chained expression so it works regardless of whether
        // the builder methods take `self` or `&mut self`.
        let built = PdfConfigBuilder::new()
            .backend(Backend::Auto)
            .math_density_threshold(0.5)
            .parallel_pages(true)
            .build();

        let router_cfg = &built.router;
        assert_eq!(router_cfg.default_backend, Backend::Auto);
        assert_eq!(router_cfg.math_density_threshold, 0.5);
    }
}