use crate::docx::DocxExtractor;
use crate::html::HtmlExtractor;
use crate::markdown::MarkdownExtractor;
use crate::pdf::PdfExtractor;
use crate::text::PlainTextExtractor;
use crate::types::{DocumentExtractor, ExtractOutput};
use orbok_core::{ErrorCategory, OrbokError, OrbokResult};
use orbok_fs::ValidatedPath;
/// Ordered collection of document extractors, looked up by file extension.
///
/// Lookups walk the list front to back and stop at the first extractor that
/// claims the requested extension, so earlier entries take precedence over
/// later ones.
pub struct ExtractorRegistry {
    /// Registered extractors, in lookup-priority order.
    extractors: Vec<Box<dyn DocumentExtractor>>,
}
impl Default for ExtractorRegistry {
fn default() -> Self {
Self {
extractors: vec![
Box::new(MarkdownExtractor),
Box::new(DocxExtractor),
Box::new(HtmlExtractor),
Box::new(PlainTextExtractor),
Box::new(PdfExtractor),
],
}
}
}
impl ExtractorRegistry {
    /// Finds the extractor responsible for `extension`.
    ///
    /// The extension is ASCII-lowercased before comparison, so `"PDF"` and
    /// `"pdf"` are treated the same. Extractors are checked in registration
    /// order and the first one whose `supported_extensions()` contains the
    /// lowercased value is returned.
    ///
    /// Returns `None` when no registered extractor lists the extension.
    ///
    /// NOTE(review): `extract` passes the result of `path.canonical.extension()`
    /// here, which presumably carries no leading dot — confirm before calling
    /// with values like `".md"`.
    pub fn select(&self, extension: &str) -> Option<&dyn DocumentExtractor> {
        let wanted = extension.to_ascii_lowercase();
        for candidate in &self.extractors {
            if candidate.supported_extensions().contains(&wanted.as_str()) {
                return Some(candidate.as_ref());
            }
        }
        None
    }

    /// Extracts the document at `path` with the extractor matching its extension.
    ///
    /// The extension is read from `path.canonical`; if the path has no
    /// extension, or the extension cannot be viewed as a `&str`, the empty
    /// string is used for the lookup instead.
    ///
    /// # Errors
    ///
    /// Returns `OrbokError::Extraction` with `ErrorCategory::UnsupportedType`
    /// when no extractor is registered for the extension. Otherwise the result
    /// of the chosen extractor's `extract` call is returned unchanged.
    pub fn extract(&self, path: &ValidatedPath) -> OrbokResult<ExtractOutput> {
        let extension = path
            .canonical
            .extension()
            .and_then(|e| e.to_str())
            .unwrap_or("");

        let extractor = self
            .select(extension)
            .ok_or_else(|| OrbokError::Extraction {
                category: ErrorCategory::UnsupportedType,
                message: format!("no extractor for extension '{extension}'"),
            })?;

        tracing::debug!(extractor = extractor.name(), "extracting");
        extractor.extract(path)
    }
}