orbok_extract/
registry.rs1use crate::docx::DocxExtractor;
5use crate::html::HtmlExtractor;
6use crate::markdown::MarkdownExtractor;
7use crate::pdf::PdfExtractor;
8use crate::text::PlainTextExtractor;
9use crate::types::{DocumentExtractor, ExtractOutput};
10use orbok_core::{ErrorCategory, OrbokError, OrbokResult};
11use orbok_fs::ValidatedPath;
12
13pub struct ExtractorRegistry {
16 extractors: Vec<Box<dyn DocumentExtractor>>,
17}
18
19impl Default for ExtractorRegistry {
20 fn default() -> Self {
21 Self {
22 extractors: vec![
23 Box::new(MarkdownExtractor),
24 Box::new(DocxExtractor),
25 Box::new(HtmlExtractor),
26 Box::new(PlainTextExtractor),
27 Box::new(PdfExtractor),
28 ],
29 }
30 }
31}
32
33impl ExtractorRegistry {
34 pub fn select(&self, extension: &str) -> Option<&dyn DocumentExtractor> {
36 let ext = extension.to_ascii_lowercase();
37 self.extractors
38 .iter()
39 .find(|e| e.supported_extensions().contains(&ext.as_str()))
40 .map(|e| e.as_ref())
41 }
42
43 pub fn extract(&self, path: &ValidatedPath) -> OrbokResult<ExtractOutput> {
47 let extension = path
48 .canonical
49 .extension()
50 .and_then(|e| e.to_str())
51 .unwrap_or_default();
52 match self.select(extension) {
53 Some(extractor) => {
54 tracing::debug!(extractor = extractor.name(), "extracting");
55 extractor.extract(path)
56 }
57 None => Err(OrbokError::Extraction {
58 category: ErrorCategory::UnsupportedType,
59 message: format!("no extractor for extension '{extension}'"),
60 }),
61 }
62 }
63}