orbok_extract/
registry.rs1use crate::docx::DocxExtractor;
8use crate::html::HtmlExtractor;
9use crate::markdown::MarkdownExtractor;
10use crate::pdf::PdfExtractor;
11use crate::text::PlainTextExtractor;
12use crate::types::{DocumentExtractor, ExtractContext, ExtractOutput};
13use orbok_core::{ErrorCategory, OrbokError, OrbokResult};
14use orbok_fs::ValidatedPath;
15
16pub struct ExtractorRegistry {
19 extractors: Vec<Box<dyn DocumentExtractor>>,
20}
21
22impl Default for ExtractorRegistry {
23 fn default() -> Self {
24 Self {
25 extractors: vec![
26 Box::new(MarkdownExtractor),
27 Box::new(DocxExtractor),
28 Box::new(HtmlExtractor),
29 Box::new(PlainTextExtractor),
30 Box::new(PdfExtractor),
31 ],
32 }
33 }
34}
35
36impl ExtractorRegistry {
37 pub fn new_with(extractors: Vec<Box<dyn DocumentExtractor>>) -> Self {
39 Self { extractors }
40 }
41
42 pub fn select(&self, extension: &str) -> Option<&dyn DocumentExtractor> {
44 let ext = extension.to_ascii_lowercase();
45 self.extractors
46 .iter()
47 .find(|e| e.supported_extensions().contains(&ext.as_str()))
48 .map(|e| e.as_ref())
49 }
50
51 pub fn extract_with_context(
56 &self,
57 path: &ValidatedPath,
58 context: &ExtractContext,
59 ) -> OrbokResult<ExtractOutput> {
60 let extension = path
61 .canonical
62 .extension()
63 .and_then(|e| e.to_str())
64 .unwrap_or_default();
65 match self.select(extension) {
66 Some(extractor) => {
67 tracing::debug!(extractor = extractor.name(), "extracting");
68 extractor.extract_with_context(path, context)
69 }
70 None => Err(OrbokError::Extraction {
71 category: ErrorCategory::UnsupportedType,
72 message: format!("no extractor for extension '{extension}'"),
73 }),
74 }
75 }
76
77 pub fn extract_safely(
86 &self,
87 path: &ValidatedPath,
88 context: &ExtractContext,
89 ) -> OrbokResult<ExtractOutput> {
90 let extension = path
91 .canonical
92 .extension()
93 .and_then(|e| e.to_str())
94 .unwrap_or_default()
95 .to_ascii_lowercase();
96
97 let extractor = match self.select(&extension) {
98 Some(e) => e,
99 None => {
100 return Err(OrbokError::Extraction {
101 category: ErrorCategory::UnsupportedType,
102 message: format!("no extractor for extension '{extension}'"),
103 });
104 }
105 };
106
107 let path_clone = path.clone();
109 let context_clone = context.clone();
110 let result = std::panic::catch_unwind(std::panic::AssertUnwindSafe(|| {
115 extractor.extract_with_context(&path_clone, &context_clone)
116 }));
117
118 match result {
119 Ok(inner) => inner,
120 Err(_payload) => {
121 tracing::error!(
122 path = %path.canonical.display(),
123 extractor = extractor.name(),
124 "extractor panicked — recovered safely"
125 );
126 Err(OrbokError::Extraction {
127 category: ErrorCategory::ParserPanic,
128 message: "extractor panicked while reading this file".into(),
129 })
130 }
131 }
132 }
133
134 pub fn extract(&self, path: &ValidatedPath) -> OrbokResult<ExtractOutput> {
139 self.extract_safely(path, &ExtractContext::default())
140 }
141}