Skip to main content

kbolt_core/ingest/
extract.rs

1use std::collections::HashMap;
2use std::path::Path;
3use std::sync::Arc;
4
5use crate::Result;
6
7#[derive(Debug, Clone, PartialEq, Eq)]
8pub struct ExtractedDocument {
9    pub blocks: Vec<ExtractedBlock>,
10    pub metadata: HashMap<String, String>,
11    pub title: Option<String>,
12}
13
14#[derive(Debug, Clone, PartialEq, Eq)]
15pub struct ExtractedBlock {
16    pub text: String,
17    pub offset: usize,
18    pub length: usize,
19    pub kind: BlockKind,
20    pub heading_path: Vec<String>,
21    pub attrs: HashMap<String, String>,
22}
23
/// Structural category assigned to an [`ExtractedBlock`].
///
/// The enum carries no data, so it is `Copy` (no `.clone()` needed to pass it
/// around) and `Hash` (usable as a `HashMap`/`HashSet` key).
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
pub enum BlockKind {
    /// A heading.
    Heading,
    /// A paragraph.
    Paragraph,
    /// A list item.
    ListItem,
    /// A block quote.
    BlockQuote,
    /// A code fence.
    CodeFence,
    /// A table header.
    TableHeader,
    /// A table row.
    TableRow,
    /// An HTML block.
    HtmlBlock,
}
35
36pub trait Extractor: Send + Sync {
37    fn supports(&self) -> &[&str];
38
39    fn profile_key(&self) -> &'static str {
40        "txt"
41    }
42
43    fn supports_path(&self, _path: &Path) -> bool {
44        false
45    }
46
47    fn extract(&self, path: &Path, bytes: &[u8]) -> Result<ExtractedDocument>;
48}
49
50#[derive(Default)]
51pub struct ExtractorRegistry {
52    by_extension: HashMap<String, Arc<dyn Extractor>>,
53    fallback_extractors: Vec<Arc<dyn Extractor>>,
54}
55
56impl ExtractorRegistry {
57    pub fn new() -> Self {
58        Self::default()
59    }
60
61    pub fn register(&mut self, extractor: Arc<dyn Extractor>) {
62        for extension in extractor.supports() {
63            self.by_extension
64                .insert(normalize_extension_key(extension), Arc::clone(&extractor));
65        }
66        self.fallback_extractors.push(extractor);
67    }
68
69    pub fn resolve_for_path(&self, path: &Path) -> Option<Arc<dyn Extractor>> {
70        if let Some(extension) = path.extension().and_then(|value| value.to_str()) {
71            let key = normalize_extension_key(extension);
72            if let Some(extractor) = self.by_extension.get(&key) {
73                return Some(Arc::clone(extractor));
74            }
75        }
76
77        for extractor in &self.fallback_extractors {
78            if extractor.supports_path(path) {
79                return Some(Arc::clone(extractor));
80            }
81        }
82
83        None
84    }
85}
86
87pub fn default_registry() -> ExtractorRegistry {
88    let mut registry = ExtractorRegistry::new();
89    registry.register(Arc::new(crate::ingest::markdown::MarkdownExtractor));
90    registry.register(Arc::new(crate::ingest::code::CodeExtractor));
91    registry.register(Arc::new(crate::ingest::plaintext::PlaintextExtractor));
92    registry
93}
94
/// Canonical lookup key for a file extension.
///
/// Strips surrounding whitespace, then every leading `.`, then lowercases
/// ASCII letters: `" .MD"` becomes `"md"`.
fn normalize_extension_key(raw: &str) -> String {
    let unpadded = raw.trim();
    let undotted = unpadded.trim_start_matches('.');
    undotted.to_ascii_lowercase()
}
98
#[cfg(test)]
mod tests {
    use std::collections::HashMap;
    use std::path::Path;
    use std::sync::Arc;

    use super::{
        default_registry, normalize_extension_key, BlockKind, ExtractedBlock, ExtractedDocument,
        Extractor, ExtractorRegistry,
    };
    use crate::Result;

    /// Extension-only test double: claims `txt` and wraps the whole input in a
    /// single paragraph block. Keeps the trait's default `profile_key` and
    /// `supports_path`.
    struct DummyExtractor;

    impl Extractor for DummyExtractor {
        fn supports(&self) -> &[&str] {
            &["txt"]
        }

        fn extract(&self, _path: &Path, bytes: &[u8]) -> Result<ExtractedDocument> {
            let whole_input = ExtractedBlock {
                text: String::from_utf8_lossy(bytes).into_owned(),
                offset: 0,
                length: bytes.len(),
                kind: BlockKind::Paragraph,
                heading_path: Vec::new(),
                attrs: HashMap::new(),
            };

            Ok(ExtractedDocument {
                blocks: vec![whole_input],
                metadata: HashMap::new(),
                title: None,
            })
        }
    }

    /// Path-only test double: declares no extensions and accepts only files
    /// named exactly `LICENSE`.
    struct PathFallbackExtractor;

    impl Extractor for PathFallbackExtractor {
        fn supports(&self) -> &[&str] {
            &[]
        }

        fn supports_path(&self, path: &Path) -> bool {
            let file_name = path.file_name().and_then(|name| name.to_str());
            matches!(file_name, Some("LICENSE"))
        }

        fn extract(&self, _path: &Path, _bytes: &[u8]) -> Result<ExtractedDocument> {
            Ok(ExtractedDocument {
                blocks: Vec::new(),
                metadata: HashMap::new(),
                title: None,
            })
        }
    }

    #[test]
    fn extractor_default_supports_path_is_false() {
        let accepted = DummyExtractor.supports_path(Path::new("notes/readme.txt"));
        assert!(!accepted);
    }

    #[test]
    fn extractor_default_profile_key_is_txt() {
        assert_eq!(DummyExtractor.profile_key(), "txt");
    }

    #[test]
    fn extracted_document_tracks_blocks_and_spans() {
        let document = DummyExtractor
            .extract(Path::new("notes/readme.txt"), b"hello world")
            .expect("extract document");

        assert_eq!(document.blocks.len(), 1);
        let block = &document.blocks[0];
        assert_eq!(block.offset, 0);
        assert_eq!(block.length, 11);
        assert_eq!(block.kind, BlockKind::Paragraph);
    }

    #[test]
    fn extension_key_normalization_trims_prefix_and_case() {
        let cases = [("MD", "md"), (".Markdown", "markdown"), (" rs ", "rs")];
        for (raw, expected) in cases {
            assert_eq!(normalize_extension_key(raw), expected);
        }
    }

    #[test]
    fn registry_resolves_by_extension_before_fallbacks() {
        let mut registry = ExtractorRegistry::new();
        registry.register(Arc::new(DummyExtractor));
        registry.register(Arc::new(PathFallbackExtractor));

        // Upper-case extension: the lookup must normalize before matching.
        let by_extension = registry
            .resolve_for_path(Path::new("notes/readme.TXT"))
            .expect("resolve txt extractor");
        assert_eq!(by_extension.supports(), ["txt"]);
    }

    #[test]
    fn registry_uses_supports_path_as_fallback() {
        let mut registry = ExtractorRegistry::new();
        registry.register(Arc::new(PathFallbackExtractor));

        // `LICENSE` has no extension, so only the path-based fallback can match.
        let by_path = registry
            .resolve_for_path(Path::new("docs/LICENSE"))
            .expect("resolve fallback extractor");
        assert!(by_path.supports().is_empty());
    }

    #[test]
    fn default_registry_resolves_plaintext_extensions() {
        let registry = default_registry();
        let resolve = |raw: &str| registry.resolve_for_path(Path::new(raw));

        let txt = resolve("notes/readme.txt");
        let md = resolve("notes/readme.md");
        let code = resolve("src/lib.rs");
        let unknown = resolve("notes/readme.rst");

        assert!(txt.is_some());
        assert!(matches!(&md, Some(extractor) if extractor.supports().contains(&"md")));
        assert!(matches!(&code, Some(extractor) if extractor.profile_key() == "code"));
        assert!(unknown.is_some());
    }
}