Skip to main content

orbok_extract/
registry.rs

1//! Extractor registry (RFC-005 §6: selection by file type, typed
2//! unsupported results).
3
4use crate::docx::DocxExtractor;
5use crate::html::HtmlExtractor;
6use crate::markdown::MarkdownExtractor;
7use crate::pdf::PdfExtractor;
8use crate::text::PlainTextExtractor;
9use crate::types::{DocumentExtractor, ExtractOutput};
10use orbok_core::{ErrorCategory, OrbokError, OrbokResult};
11use orbok_fs::ValidatedPath;
12
/// Registry of the available extractors. Markdown takes precedence over
/// plain text for `.md`; everything claims by extension.
///
/// Lookup is first-match: [`ExtractorRegistry::select`] walks the
/// extractors in stored order and returns the first one whose
/// `supported_extensions()` contains the requested extension, so an
/// extractor placed earlier wins over a later one that claims the same
/// extension.
pub struct ExtractorRegistry {
    // Candidate extractors in priority order (earliest entry wins on overlap).
    extractors: Vec<Box<dyn DocumentExtractor>>,
}
18
19impl Default for ExtractorRegistry {
20    fn default() -> Self {
21        Self {
22            extractors: vec![Box::new(MarkdownExtractor), Box::new(DocxExtractor), Box::new(HtmlExtractor), Box::new(PlainTextExtractor), Box::new(PdfExtractor)],
23        }
24    }
25}
26
27impl ExtractorRegistry {
28    /// The extractor claiming `extension`, if any.
29    pub fn select(&self, extension: &str) -> Option<&dyn DocumentExtractor> {
30        let ext = extension.to_ascii_lowercase();
31        self.extractors
32            .iter()
33            .find(|e| e.supported_extensions().contains(&ext.as_str()))
34            .map(|e| e.as_ref())
35    }
36
37    /// Extract a validated file. Unknown types are a typed
38    /// `UnsupportedType` failure that workers record on the extraction
39    /// record (RFC-005 §13) — never a panic, never a silent skip.
40    pub fn extract(&self, path: &ValidatedPath) -> OrbokResult<ExtractOutput> {
41        let extension = path
42            .canonical
43            .extension()
44            .and_then(|e| e.to_str())
45            .unwrap_or_default();
46        match self.select(extension) {
47            Some(extractor) => {
48                tracing::debug!(extractor = extractor.name(), "extracting");
49                extractor.extract(path)
50            }
51            None => Err(OrbokError::Extraction {
52                category: ErrorCategory::UnsupportedType,
53                message: format!("no extractor for extension '{extension}'"),
54            }),
55        }
56    }
57}