orbok_extract/
registry.rs1use crate::markdown::MarkdownExtractor;
5use crate::pdf::PdfExtractor;
6use crate::text::PlainTextExtractor;
7use crate::types::{DocumentExtractor, ExtractOutput};
8use orbok_core::{ErrorCategory, OrbokError, OrbokResult};
9use orbok_fs::ValidatedPath;
10
11pub struct ExtractorRegistry {
14 extractors: Vec<Box<dyn DocumentExtractor>>,
15}
16
17impl Default for ExtractorRegistry {
18 fn default() -> Self {
19 Self {
20 extractors: vec![Box::new(MarkdownExtractor), Box::new(PlainTextExtractor), Box::new(PdfExtractor)],
21 }
22 }
23}
24
25impl ExtractorRegistry {
26 pub fn select(&self, extension: &str) -> Option<&dyn DocumentExtractor> {
28 let ext = extension.to_ascii_lowercase();
29 self.extractors
30 .iter()
31 .find(|e| e.supported_extensions().contains(&ext.as_str()))
32 .map(|e| e.as_ref())
33 }
34
35 pub fn extract(&self, path: &ValidatedPath) -> OrbokResult<ExtractOutput> {
39 let extension = path
40 .canonical
41 .extension()
42 .and_then(|e| e.to_str())
43 .unwrap_or_default();
44 match self.select(extension) {
45 Some(extractor) => {
46 tracing::debug!(extractor = extractor.name(), "extracting");
47 extractor.extract(path)
48 }
49 None => Err(OrbokError::Extraction {
50 category: ErrorCategory::UnsupportedType,
51 message: format!("no extractor for extension '{extension}'"),
52 }),
53 }
54 }
55}