Skip to main content

kbolt_core/ingest/
extract.rs

1use std::collections::HashMap;
2use std::path::Path;
3use std::sync::Arc;
4
5use crate::Result;
6
7#[derive(Debug, Clone, PartialEq, Eq)]
8pub struct ExtractedDocument {
9    pub blocks: Vec<ExtractedBlock>,
10    pub metadata: HashMap<String, String>,
11    pub title: Option<String>,
12}
13
14#[derive(Debug, Clone, PartialEq, Eq)]
15pub struct ExtractedBlock {
16    pub text: String,
17    pub offset: usize,
18    pub length: usize,
19    pub kind: BlockKind,
20    pub heading_path: Vec<String>,
21    pub attrs: HashMap<String, String>,
22}
23
/// Structural category assigned to an [`ExtractedBlock`].
///
/// The enum is fieldless, so it derives `Copy` (values can be passed around
/// without `.clone()`) and `Hash` (values can key a `HashMap`/`HashSet`, e.g.
/// when grouping or counting blocks by kind).
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
pub enum BlockKind {
    /// A heading block.
    Heading,
    /// A paragraph block.
    Paragraph,
    /// A single list item.
    ListItem,
    /// A block quote.
    BlockQuote,
    /// A fenced code block.
    CodeFence,
    /// The header row of a table.
    TableHeader,
    /// A non-header row of a table.
    TableRow,
    /// A block of HTML markup.
    HtmlBlock,
}
35
36pub trait Extractor: Send + Sync {
37    fn supports(&self) -> &[&str];
38
39    fn profile_key(&self) -> &'static str {
40        "txt"
41    }
42
43    fn version(&self) -> u32 {
44        1
45    }
46
47    fn supports_path(&self, _path: &Path) -> bool {
48        false
49    }
50
51    fn extract(&self, path: &Path, bytes: &[u8]) -> Result<ExtractedDocument>;
52}
53
54#[derive(Default)]
55pub struct ExtractorRegistry {
56    by_extension: HashMap<String, Arc<dyn Extractor>>,
57    fallback_extractors: Vec<Arc<dyn Extractor>>,
58}
59
60impl ExtractorRegistry {
61    pub fn new() -> Self {
62        Self::default()
63    }
64
65    pub fn register(&mut self, extractor: Arc<dyn Extractor>) {
66        for extension in extractor.supports() {
67            self.by_extension
68                .insert(normalize_extension_key(extension), Arc::clone(&extractor));
69        }
70        self.fallback_extractors.push(extractor);
71    }
72
73    pub fn resolve_for_path(&self, path: &Path) -> Option<Arc<dyn Extractor>> {
74        if let Some(extension) = path.extension().and_then(|value| value.to_str()) {
75            let key = normalize_extension_key(extension);
76            if let Some(extractor) = self.by_extension.get(&key) {
77                return Some(Arc::clone(extractor));
78            }
79        }
80
81        for extractor in &self.fallback_extractors {
82            if extractor.supports_path(path) {
83                return Some(Arc::clone(extractor));
84            }
85        }
86
87        None
88    }
89}
90
91pub fn default_registry() -> ExtractorRegistry {
92    let mut registry = ExtractorRegistry::new();
93    registry.register(Arc::new(crate::ingest::html::HtmlExtractor));
94    registry.register(Arc::new(crate::ingest::markdown::MarkdownExtractor));
95    registry.register(Arc::new(crate::ingest::pdf::PdfExtractor));
96    registry.register(Arc::new(crate::ingest::code::CodeExtractor));
97    registry.register(Arc::new(crate::ingest::plaintext::PlaintextExtractor));
98    registry
99}
100
/// Canonicalizes a file extension for use as a registry key.
///
/// Surrounding whitespace is trimmed, then every leading `.` is removed, then
/// the remainder is ASCII-lowercased. Whitespace that follows a leading dot is
/// left in place because trimming happens before the dots are stripped.
fn normalize_extension_key(raw: &str) -> String {
    let unpadded = raw.trim();
    let undotted = unpadded.trim_start_matches('.');
    undotted.to_ascii_lowercase()
}
104
#[cfg(test)]
mod tests {
    use std::collections::HashMap;
    use std::path::Path;
    use std::sync::Arc;

    use super::{
        default_registry, normalize_extension_key, BlockKind, ExtractedBlock, ExtractedDocument,
        Extractor, ExtractorRegistry,
    };
    use crate::Result;

    /// Test double that claims the `txt` extension and wraps the whole input
    /// in a single paragraph block. All other trait methods keep their defaults.
    struct DummyExtractor;

    impl Extractor for DummyExtractor {
        fn supports(&self) -> &[&str] {
            &["txt"]
        }

        fn extract(&self, _path: &Path, bytes: &[u8]) -> Result<ExtractedDocument> {
            let whole_input = ExtractedBlock {
                text: String::from_utf8_lossy(bytes).into_owned(),
                offset: 0,
                length: bytes.len(),
                kind: BlockKind::Paragraph,
                heading_path: Vec::new(),
                attrs: HashMap::new(),
            };

            Ok(ExtractedDocument {
                blocks: vec![whole_input],
                metadata: HashMap::new(),
                title: None,
            })
        }
    }

    /// Test double with no extensions that claims only files named `LICENSE`,
    /// so it is reachable solely through the `supports_path` fallback.
    struct PathFallbackExtractor;

    impl Extractor for PathFallbackExtractor {
        fn supports(&self) -> &[&str] {
            &[]
        }

        fn supports_path(&self, path: &Path) -> bool {
            let file_name = path.file_name().and_then(|name| name.to_str());
            matches!(file_name, Some("LICENSE"))
        }

        fn extract(&self, _path: &Path, _bytes: &[u8]) -> Result<ExtractedDocument> {
            Ok(ExtractedDocument {
                blocks: Vec::new(),
                metadata: HashMap::new(),
                title: None,
            })
        }
    }

    #[test]
    fn extractor_default_supports_path_is_false() {
        let probe = Path::new("notes/readme.txt");
        assert!(!DummyExtractor.supports_path(probe));
    }

    #[test]
    fn extractor_default_profile_key_is_txt() {
        assert_eq!(DummyExtractor.profile_key(), "txt");
    }

    #[test]
    fn extracted_document_tracks_blocks_and_spans() {
        let document = DummyExtractor
            .extract(Path::new("notes/readme.txt"), b"hello world")
            .expect("extract document");

        assert_eq!(document.blocks.len(), 1);
        let only_block = &document.blocks[0];
        assert_eq!(only_block.offset, 0);
        assert_eq!(only_block.length, 11);
        assert_eq!(only_block.kind, BlockKind::Paragraph);
    }

    #[test]
    fn extension_key_normalization_trims_prefix_and_case() {
        let cases = [("MD", "md"), (".Markdown", "markdown"), (" rs ", "rs")];
        for (raw, expected) in cases {
            assert_eq!(normalize_extension_key(raw), expected);
        }
    }

    #[test]
    fn registry_resolves_by_extension_before_fallbacks() {
        let mut registry = ExtractorRegistry::new();
        registry.register(Arc::new(DummyExtractor));
        registry.register(Arc::new(PathFallbackExtractor));

        // Upper-case extension: the lookup must be case-insensitive.
        let winner = registry
            .resolve_for_path(Path::new("notes/readme.TXT"))
            .expect("resolve txt extractor");
        assert_eq!(winner.supports(), ["txt"]);
    }

    #[test]
    fn registry_uses_supports_path_as_fallback() {
        let mut registry = ExtractorRegistry::new();
        registry.register(Arc::new(PathFallbackExtractor));

        // `LICENSE` has no extension, so only the path-based fallback can match.
        let winner = registry
            .resolve_for_path(Path::new("docs/LICENSE"))
            .expect("resolve fallback extractor");
        assert!(winner.supports().is_empty());
    }

    #[test]
    fn default_registry_resolves_plaintext_extensions() {
        let registry = default_registry();
        let resolve = |raw: &str| registry.resolve_for_path(Path::new(raw));
        let profile_of = |raw: &str| resolve(raw).map(|extractor| extractor.profile_key());

        assert!(resolve("notes/readme.txt").is_some());
        assert_eq!(profile_of("docs/page.html"), Some("html"));
        assert_eq!(profile_of("docs/page.htm"), Some("html"));
        assert!(resolve("notes/readme.md")
            .is_some_and(|extractor| extractor.supports().contains(&"md")));
        assert_eq!(profile_of("papers/guide.pdf"), Some("pdf"));
        assert_eq!(profile_of("src/lib.rs"), Some("code"));
        assert!(resolve("notes/readme.rst").is_none());
    }
}