use crate::docx::DocxExtractor;
use crate::html::HtmlExtractor;
use crate::markdown::MarkdownExtractor;
use crate::pdf::PdfExtractor;
use crate::text::PlainTextExtractor;
use crate::types::{DocumentExtractor, ExtractContext, ExtractOutput};
use orbok_core::{ErrorCategory, OrbokError, OrbokResult};
use orbok_fs::ValidatedPath;
/// Ordered collection of document extractors, looked up by file extension.
///
/// The registry owns its extractors as boxed trait objects so that
/// heterogeneous extractor types can live in one list.
pub struct ExtractorRegistry {
    /// Registered extractors, in lookup-priority order (earlier entries are
    /// consulted first when resolving an extension).
    extractors: Vec<Box<dyn DocumentExtractor>>,
}
impl Default for ExtractorRegistry {
fn default() -> Self {
Self {
extractors: vec![
Box::new(MarkdownExtractor),
Box::new(DocxExtractor),
Box::new(HtmlExtractor),
Box::new(PlainTextExtractor),
Box::new(PdfExtractor),
],
}
}
}
impl ExtractorRegistry {
pub fn new_with(extractors: Vec<Box<dyn DocumentExtractor>>) -> Self {
Self { extractors }
}
pub fn select(&self, extension: &str) -> Option<&dyn DocumentExtractor> {
let ext = extension.to_ascii_lowercase();
self.extractors
.iter()
.find(|e| e.supported_extensions().contains(&ext.as_str()))
.map(|e| e.as_ref())
}
pub fn extract_with_context(
&self,
path: &ValidatedPath,
context: &ExtractContext,
) -> OrbokResult<ExtractOutput> {
let extension = path
.canonical
.extension()
.and_then(|e| e.to_str())
.unwrap_or_default();
match self.select(extension) {
Some(extractor) => {
tracing::debug!(extractor = extractor.name(), "extracting");
extractor.extract_with_context(path, context)
}
None => Err(OrbokError::Extraction {
category: ErrorCategory::UnsupportedType,
message: format!("no extractor for extension '{extension}'"),
}),
}
}
pub fn extract_safely(
&self,
path: &ValidatedPath,
context: &ExtractContext,
) -> OrbokResult<ExtractOutput> {
let extension = path
.canonical
.extension()
.and_then(|e| e.to_str())
.unwrap_or_default()
.to_ascii_lowercase();
let extractor = match self.select(&extension) {
Some(e) => e,
None => {
return Err(OrbokError::Extraction {
category: ErrorCategory::UnsupportedType,
message: format!("no extractor for extension '{extension}'"),
});
}
};
let path_clone = path.clone();
let context_clone = context.clone();
let result = std::panic::catch_unwind(std::panic::AssertUnwindSafe(|| {
extractor.extract_with_context(&path_clone, &context_clone)
}));
match result {
Ok(inner) => inner,
Err(_payload) => {
tracing::error!(
path = %path.canonical.display(),
extractor = extractor.name(),
"extractor panicked — recovered safely"
);
Err(OrbokError::Extraction {
category: ErrorCategory::ParserPanic,
message: "extractor panicked while reading this file".into(),
})
}
}
}
pub fn extract(&self, path: &ValidatedPath) -> OrbokResult<ExtractOutput> {
self.extract_safely(path, &ExtractContext::default())
}
}