Skip to main content

orbok_extract/
registry.rs

1//! Extractor registry (RFC-005 §6: selection by file type, typed
2//! unsupported results).
3
4use crate::markdown::MarkdownExtractor;
5use crate::pdf::PdfExtractor;
6use crate::text::PlainTextExtractor;
7use crate::types::{DocumentExtractor, ExtractOutput};
8use orbok_core::{ErrorCategory, OrbokError, OrbokResult};
9use orbok_fs::ValidatedPath;
10
11/// Registry of the available extractors. Markdown takes precedence over
12/// plain text for `.md`; everything claims by extension.
13pub struct ExtractorRegistry {
14    extractors: Vec<Box<dyn DocumentExtractor>>,
15}
16
17impl Default for ExtractorRegistry {
18    fn default() -> Self {
19        Self {
20            extractors: vec![Box::new(MarkdownExtractor), Box::new(PlainTextExtractor), Box::new(PdfExtractor)],
21        }
22    }
23}
24
25impl ExtractorRegistry {
26    /// The extractor claiming `extension`, if any.
27    pub fn select(&self, extension: &str) -> Option<&dyn DocumentExtractor> {
28        let ext = extension.to_ascii_lowercase();
29        self.extractors
30            .iter()
31            .find(|e| e.supported_extensions().contains(&ext.as_str()))
32            .map(|e| e.as_ref())
33    }
34
35    /// Extract a validated file. Unknown types are a typed
36    /// `UnsupportedType` failure that workers record on the extraction
37    /// record (RFC-005 §13) — never a panic, never a silent skip.
38    pub fn extract(&self, path: &ValidatedPath) -> OrbokResult<ExtractOutput> {
39        let extension = path
40            .canonical
41            .extension()
42            .and_then(|e| e.to_str())
43            .unwrap_or_default();
44        match self.select(extension) {
45            Some(extractor) => {
46                tracing::debug!(extractor = extractor.name(), "extracting");
47                extractor.extract(path)
48            }
49            None => Err(OrbokError::Extraction {
50                category: ErrorCategory::UnsupportedType,
51                message: format!("no extractor for extension '{extension}'"),
52            }),
53        }
54    }
55}