Skip to main content

orbok_extract/
registry.rs

//! Extractor registry (RFC-005 §6): selects an extractor by file type and
//! reports unsupported file types as typed results.
3
4use crate::docx::DocxExtractor;
5use crate::html::HtmlExtractor;
6use crate::markdown::MarkdownExtractor;
7use crate::pdf::PdfExtractor;
8use crate::text::PlainTextExtractor;
9use crate::types::{DocumentExtractor, ExtractOutput};
10use orbok_core::{ErrorCategory, OrbokError, OrbokResult};
11use orbok_fs::ValidatedPath;
12
13/// Registry of the available extractors. Markdown takes precedence over
14/// plain text for `.md`; everything claims by extension.
15pub struct ExtractorRegistry {
16    extractors: Vec<Box<dyn DocumentExtractor>>,
17}
18
19impl Default for ExtractorRegistry {
20    fn default() -> Self {
21        Self {
22            extractors: vec![
23                Box::new(MarkdownExtractor),
24                Box::new(DocxExtractor),
25                Box::new(HtmlExtractor),
26                Box::new(PlainTextExtractor),
27                Box::new(PdfExtractor),
28            ],
29        }
30    }
31}
32
33impl ExtractorRegistry {
34    /// The extractor claiming `extension`, if any.
35    pub fn select(&self, extension: &str) -> Option<&dyn DocumentExtractor> {
36        let ext = extension.to_ascii_lowercase();
37        self.extractors
38            .iter()
39            .find(|e| e.supported_extensions().contains(&ext.as_str()))
40            .map(|e| e.as_ref())
41    }
42
43    /// Extract a validated file. Unknown types are a typed
44    /// `UnsupportedType` failure that workers record on the extraction
45    /// record (RFC-005 §13) — never a panic, never a silent skip.
46    pub fn extract(&self, path: &ValidatedPath) -> OrbokResult<ExtractOutput> {
47        let extension = path
48            .canonical
49            .extension()
50            .and_then(|e| e.to_str())
51            .unwrap_or_default();
52        match self.select(extension) {
53            Some(extractor) => {
54                tracing::debug!(extractor = extractor.name(), "extracting");
55                extractor.extract(path)
56            }
57            None => Err(OrbokError::Extraction {
58                category: ErrorCategory::UnsupportedType,
59                message: format!("no extractor for extension '{extension}'"),
60            }),
61        }
62    }
63}