Skip to main content

orbok_extract/
registry.rs

1//! Extractor registry (RFC-005 §6: selection by file type, typed
2//! unsupported results).
3
4use crate::markdown::MarkdownExtractor;
5use crate::text::PlainTextExtractor;
6use crate::types::{DocumentExtractor, ExtractOutput};
7use orbok_core::{ErrorCategory, OrbokError, OrbokResult};
8use orbok_fs::ValidatedPath;
9
10/// Registry of the available extractors. Markdown takes precedence over
11/// plain text for `.md`; everything claims by extension.
12pub struct ExtractorRegistry {
13    extractors: Vec<Box<dyn DocumentExtractor>>,
14}
15
16impl Default for ExtractorRegistry {
17    fn default() -> Self {
18        Self {
19            extractors: vec![Box::new(MarkdownExtractor), Box::new(PlainTextExtractor)],
20        }
21    }
22}
23
24impl ExtractorRegistry {
25    /// The extractor claiming `extension`, if any.
26    pub fn select(&self, extension: &str) -> Option<&dyn DocumentExtractor> {
27        let ext = extension.to_ascii_lowercase();
28        self.extractors
29            .iter()
30            .find(|e| e.supported_extensions().contains(&ext.as_str()))
31            .map(|e| e.as_ref())
32    }
33
34    /// Extract a validated file. Unknown types are a typed
35    /// `UnsupportedType` failure that workers record on the extraction
36    /// record (RFC-005 §13) — never a panic, never a silent skip.
37    pub fn extract(&self, path: &ValidatedPath) -> OrbokResult<ExtractOutput> {
38        let extension = path
39            .canonical
40            .extension()
41            .and_then(|e| e.to_str())
42            .unwrap_or_default();
43        match self.select(extension) {
44            Some(extractor) => {
45                tracing::debug!(extractor = extractor.name(), "extracting");
46                extractor.extract(path)
47            }
48            None => Err(OrbokError::Extraction {
49                category: ErrorCategory::UnsupportedType,
50                message: format!("no extractor for extension '{extension}'"),
51            }),
52        }
53    }
54}