kbolt_core/ingest/
extract.rs1use std::collections::HashMap;
2use std::path::Path;
3use std::sync::Arc;
4
5use crate::Result;
6
7#[derive(Debug, Clone, PartialEq, Eq)]
8pub struct ExtractedDocument {
9 pub blocks: Vec<ExtractedBlock>,
10 pub metadata: HashMap<String, String>,
11 pub title: Option<String>,
12}
13
14#[derive(Debug, Clone, PartialEq, Eq)]
15pub struct ExtractedBlock {
16 pub text: String,
17 pub offset: usize,
18 pub length: usize,
19 pub kind: BlockKind,
20 pub heading_path: Vec<String>,
21 pub attrs: HashMap<String, String>,
22}
23
/// Structural category assigned to an extracted block.
///
/// The enum carries no payload, so it is `Copy` (callers can pass it by value
/// without cloning) and `Hash` (it can be used as a `HashMap`/`HashSet` key,
/// e.g. for per-kind counters).
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
pub enum BlockKind {
    /// A heading line.
    Heading,
    /// A paragraph of running text.
    Paragraph,
    /// A single item of a list.
    ListItem,
    /// A block quotation.
    BlockQuote,
    /// A fenced code block.
    CodeFence,
    /// The header row of a table.
    TableHeader,
    /// A non-header row of a table.
    TableRow,
    /// A block of HTML content.
    HtmlBlock,
}
35
36pub trait Extractor: Send + Sync {
37 fn supports(&self) -> &[&str];
38
39 fn profile_key(&self) -> &'static str {
40 "txt"
41 }
42
43 fn version(&self) -> u32 {
44 1
45 }
46
47 fn supports_path(&self, _path: &Path) -> bool {
48 false
49 }
50
51 fn extract(&self, path: &Path, bytes: &[u8]) -> Result<ExtractedDocument>;
52}
53
54#[derive(Default)]
55pub struct ExtractorRegistry {
56 by_extension: HashMap<String, Arc<dyn Extractor>>,
57 fallback_extractors: Vec<Arc<dyn Extractor>>,
58}
59
60impl ExtractorRegistry {
61 pub fn new() -> Self {
62 Self::default()
63 }
64
65 pub fn register(&mut self, extractor: Arc<dyn Extractor>) {
66 for extension in extractor.supports() {
67 self.by_extension
68 .insert(normalize_extension_key(extension), Arc::clone(&extractor));
69 }
70 self.fallback_extractors.push(extractor);
71 }
72
73 pub fn resolve_for_path(&self, path: &Path) -> Option<Arc<dyn Extractor>> {
74 if let Some(extension) = path.extension().and_then(|value| value.to_str()) {
75 let key = normalize_extension_key(extension);
76 if let Some(extractor) = self.by_extension.get(&key) {
77 return Some(Arc::clone(extractor));
78 }
79 }
80
81 for extractor in &self.fallback_extractors {
82 if extractor.supports_path(path) {
83 return Some(Arc::clone(extractor));
84 }
85 }
86
87 None
88 }
89}
90
91pub fn default_registry() -> ExtractorRegistry {
92 let mut registry = ExtractorRegistry::new();
93 registry.register(Arc::new(crate::ingest::html::HtmlExtractor));
94 registry.register(Arc::new(crate::ingest::markdown::MarkdownExtractor));
95 registry.register(Arc::new(crate::ingest::pdf::PdfExtractor));
96 registry.register(Arc::new(crate::ingest::code::CodeExtractor));
97 registry.register(Arc::new(crate::ingest::plaintext::PlaintextExtractor));
98 registry
99}
100
/// Canonicalizes an extension for use as a registry key.
///
/// Surrounding whitespace is trimmed first, then every leading `.` is
/// removed, and finally ASCII letters are lowercased (non-ASCII characters
/// are left untouched).
fn normalize_extension_key(raw: &str) -> String {
    let mut key = raw.trim().trim_start_matches('.').to_owned();
    key.make_ascii_lowercase();
    key
}
104
#[cfg(test)]
mod tests {
    use std::collections::HashMap;
    use std::path::Path;
    use std::sync::Arc;

    use super::{
        default_registry, normalize_extension_key, BlockKind, ExtractedBlock, ExtractedDocument,
        Extractor, ExtractorRegistry,
    };
    use crate::Result;

    /// Extractor that claims only `txt` and wraps the whole input in a single
    /// paragraph block. It relies on every trait default.
    struct DummyExtractor;

    impl Extractor for DummyExtractor {
        fn supports(&self) -> &[&str] {
            &["txt"]
        }

        fn extract(&self, _path: &Path, bytes: &[u8]) -> Result<ExtractedDocument> {
            let paragraph = ExtractedBlock {
                text: String::from_utf8_lossy(bytes).into_owned(),
                offset: 0,
                length: bytes.len(),
                kind: BlockKind::Paragraph,
                heading_path: Vec::new(),
                attrs: HashMap::new(),
            };

            Ok(ExtractedDocument {
                blocks: vec![paragraph],
                metadata: HashMap::new(),
                title: None,
            })
        }
    }

    /// Extractor with no extensions that opts in only for files named
    /// exactly `LICENSE`, and always produces an empty document.
    struct PathFallbackExtractor;

    impl Extractor for PathFallbackExtractor {
        fn supports(&self) -> &[&str] {
            &[]
        }

        fn supports_path(&self, path: &Path) -> bool {
            let file_name = path.file_name().and_then(|name| name.to_str());
            file_name == Some("LICENSE")
        }

        fn extract(&self, _path: &Path, _bytes: &[u8]) -> Result<ExtractedDocument> {
            Ok(ExtractedDocument {
                blocks: Vec::new(),
                metadata: HashMap::new(),
                title: None,
            })
        }
    }

    #[test]
    fn extractor_default_supports_path_is_false() {
        let readme = Path::new("notes/readme.txt");
        assert!(!DummyExtractor.supports_path(readme));
    }

    #[test]
    fn extractor_default_profile_key_is_txt() {
        assert_eq!(DummyExtractor.profile_key(), "txt");
    }

    #[test]
    fn extracted_document_tracks_blocks_and_spans() {
        let document = DummyExtractor
            .extract(Path::new("notes/readme.txt"), b"hello world")
            .expect("extract document");

        assert_eq!(document.blocks.len(), 1);

        let only_block = &document.blocks[0];
        assert_eq!(only_block.offset, 0);
        assert_eq!(only_block.length, 11);
        assert_eq!(only_block.kind, BlockKind::Paragraph);
    }

    #[test]
    fn extension_key_normalization_trims_prefix_and_case() {
        assert_eq!(normalize_extension_key("MD"), "md");
        assert_eq!(normalize_extension_key(".Markdown"), "markdown");
        assert_eq!(normalize_extension_key(" rs "), "rs");
    }

    #[test]
    fn registry_resolves_by_extension_before_fallbacks() {
        let mut registry = ExtractorRegistry::new();
        registry.register(Arc::new(DummyExtractor));
        registry.register(Arc::new(PathFallbackExtractor));

        // Upper-case extension: the lookup must be case-insensitive.
        let resolved = registry
            .resolve_for_path(Path::new("notes/readme.TXT"))
            .expect("resolve txt extractor");
        assert_eq!(resolved.supports(), ["txt"]);
    }

    #[test]
    fn registry_uses_supports_path_as_fallback() {
        let mut registry = ExtractorRegistry::new();
        registry.register(Arc::new(PathFallbackExtractor));

        // `LICENSE` has no extension, so only the path-based hook can match.
        let resolved = registry
            .resolve_for_path(Path::new("docs/LICENSE"))
            .expect("resolve fallback extractor");
        assert!(resolved.supports().is_empty());
    }

    #[test]
    fn default_registry_resolves_plaintext_extensions() {
        let registry = default_registry();
        let resolve = |raw: &str| registry.resolve_for_path(Path::new(raw));
        let has_profile = |found: &Option<Arc<dyn Extractor>>, expected: &str| {
            found
                .as_ref()
                .is_some_and(|extractor| extractor.profile_key() == expected)
        };

        let txt = resolve("notes/readme.txt");
        let html = resolve("docs/page.html");
        let htm = resolve("docs/page.htm");
        let md = resolve("notes/readme.md");
        let pdf = resolve("papers/guide.pdf");
        let code = resolve("src/lib.rs");
        let unknown = resolve("notes/readme.rst");

        assert!(txt.is_some());
        assert!(has_profile(&html, "html"));
        assert!(has_profile(&htm, "html"));
        assert!(md
            .as_ref()
            .is_some_and(|extractor| extractor.supports().contains(&"md")));
        assert!(has_profile(&pdf, "pdf"));
        assert!(has_profile(&code, "code"));
        assert!(unknown.is_none());
    }
}