Skip to main content

orbok_extract/
registry.rs

1//! Extractor registry (RFC-005 §6; RFC-044 §11 panic isolation).
2//!
3//! `ExtractorRegistry` is the single entry point for extraction.
4//! `extract_safely` wraps every extractor call in `catch_unwind` so
5//! a panic in a parser crate cannot crash the orbok process.
6
7use crate::docx::DocxExtractor;
8use crate::html::HtmlExtractor;
9use crate::markdown::MarkdownExtractor;
10use crate::pdf::PdfExtractor;
11use crate::text::PlainTextExtractor;
12use crate::types::{DocumentExtractor, ExtractContext, ExtractOutput};
13use orbok_core::{ErrorCategory, OrbokError, OrbokResult};
14use orbok_fs::ValidatedPath;
15
16/// Registry of the available extractors. Markdown takes precedence over
17/// plain text for `.md`; everything claims by extension.
18pub struct ExtractorRegistry {
19    extractors: Vec<Box<dyn DocumentExtractor>>,
20}
21
22impl Default for ExtractorRegistry {
23    fn default() -> Self {
24        Self {
25            extractors: vec![
26                Box::new(MarkdownExtractor),
27                Box::new(DocxExtractor),
28                Box::new(HtmlExtractor),
29                Box::new(PlainTextExtractor),
30                Box::new(PdfExtractor),
31            ],
32        }
33    }
34}
35
36impl ExtractorRegistry {
37    /// Build a registry with a custom set of extractors (useful in tests).
38    pub fn new_with(extractors: Vec<Box<dyn DocumentExtractor>>) -> Self {
39        Self { extractors }
40    }
41
42    /// The extractor claiming `extension`, if any.
43    pub fn select(&self, extension: &str) -> Option<&dyn DocumentExtractor> {
44        let ext = extension.to_ascii_lowercase();
45        self.extractors
46            .iter()
47            .find(|e| e.supported_extensions().contains(&ext.as_str()))
48            .map(|e| e.as_ref())
49    }
50
51    /// Extract using resource limits. Unknown types are a typed
52    /// `UnsupportedType` failure — never a panic, never a silent skip.
53    ///
54    /// Prefer this over [`extract`] for production code paths.
55    pub fn extract_with_context(
56        &self,
57        path: &ValidatedPath,
58        context: &ExtractContext,
59    ) -> OrbokResult<ExtractOutput> {
60        let extension = path
61            .canonical
62            .extension()
63            .and_then(|e| e.to_str())
64            .unwrap_or_default();
65        match self.select(extension) {
66            Some(extractor) => {
67                tracing::debug!(extractor = extractor.name(), "extracting");
68                extractor.extract_with_context(path, context)
69            }
70            None => Err(OrbokError::Extraction {
71                category: ErrorCategory::UnsupportedType,
72                message: format!("no extractor for extension '{extension}'"),
73            }),
74        }
75    }
76
77    /// Extract with panic isolation (RFC-044 §11).
78    ///
79    /// Wraps the extractor call in `catch_unwind`. A parser panic is
80    /// caught and returned as `ErrorCategory::ParserPanic` instead of
81    /// crashing the worker thread.
82    ///
83    /// The user-facing layer must translate `ParserPanic` to a plain
84    /// message like "This file could not be prepared."
85    pub fn extract_safely(
86        &self,
87        path: &ValidatedPath,
88        context: &ExtractContext,
89    ) -> OrbokResult<ExtractOutput> {
90        let extension = path
91            .canonical
92            .extension()
93            .and_then(|e| e.to_str())
94            .unwrap_or_default()
95            .to_ascii_lowercase();
96
97        let extractor = match self.select(&extension) {
98            Some(e) => e,
99            None => {
100                return Err(OrbokError::Extraction {
101                    category: ErrorCategory::UnsupportedType,
102                    message: format!("no extractor for extension '{extension}'"),
103                });
104            }
105        };
106
107        // Clone what we need to move into the closure.
108        let path_clone = path.clone();
109        let context_clone = context.clone();
110        // SAFETY: AssertUnwindSafe is appropriate here — extraction is
111        // read-only on the path and context; no shared mutable state is
112        // accessed inside the closure that could be left inconsistent by
113        // a panic.
114        let result = std::panic::catch_unwind(std::panic::AssertUnwindSafe(|| {
115            extractor.extract_with_context(&path_clone, &context_clone)
116        }));
117
118        match result {
119            Ok(inner) => inner,
120            Err(_payload) => {
121                tracing::error!(
122                    path = %path.canonical.display(),
123                    extractor = extractor.name(),
124                    "extractor panicked — recovered safely"
125                );
126                Err(OrbokError::Extraction {
127                    category: ErrorCategory::ParserPanic,
128                    message: "extractor panicked while reading this file".into(),
129                })
130            }
131        }
132    }
133
134    /// Legacy entry point (no limits, no panic isolation).
135    ///
136    /// Kept for compatibility during the migration period. New code
137    /// should call [`extract_safely`] instead.
138    pub fn extract(&self, path: &ValidatedPath) -> OrbokResult<ExtractOutput> {
139        self.extract_safely(path, &ExtractContext::default())
140    }
141}