orbok_extract/
registry.rs1use crate::docx::DocxExtractor;
5use crate::html::HtmlExtractor;
6use crate::markdown::MarkdownExtractor;
7use crate::pdf::PdfExtractor;
8use crate::text::PlainTextExtractor;
9use crate::types::{DocumentExtractor, ExtractOutput};
10use orbok_core::{ErrorCategory, OrbokError, OrbokResult};
11use orbok_fs::ValidatedPath;
12
13pub struct ExtractorRegistry {
16 extractors: Vec<Box<dyn DocumentExtractor>>,
17}
18
19impl Default for ExtractorRegistry {
20 fn default() -> Self {
21 Self {
22 extractors: vec![Box::new(MarkdownExtractor), Box::new(DocxExtractor), Box::new(HtmlExtractor), Box::new(PlainTextExtractor), Box::new(PdfExtractor)],
23 }
24 }
25}
26
27impl ExtractorRegistry {
28 pub fn select(&self, extension: &str) -> Option<&dyn DocumentExtractor> {
30 let ext = extension.to_ascii_lowercase();
31 self.extractors
32 .iter()
33 .find(|e| e.supported_extensions().contains(&ext.as_str()))
34 .map(|e| e.as_ref())
35 }
36
37 pub fn extract(&self, path: &ValidatedPath) -> OrbokResult<ExtractOutput> {
41 let extension = path
42 .canonical
43 .extension()
44 .and_then(|e| e.to_str())
45 .unwrap_or_default();
46 match self.select(extension) {
47 Some(extractor) => {
48 tracing::debug!(extractor = extractor.name(), "extracting");
49 extractor.extract(path)
50 }
51 None => Err(OrbokError::Extraction {
52 category: ErrorCategory::UnsupportedType,
53 message: format!("no extractor for extension '{extension}'"),
54 }),
55 }
56 }
57}