kbolt_core/ingest/
extract.rs1use std::collections::HashMap;
2use std::path::Path;
3use std::sync::Arc;
4
5use crate::Result;
6
7#[derive(Debug, Clone, PartialEq, Eq)]
8pub struct ExtractedDocument {
9 pub blocks: Vec<ExtractedBlock>,
10 pub metadata: HashMap<String, String>,
11 pub title: Option<String>,
12}
13
14#[derive(Debug, Clone, PartialEq, Eq)]
15pub struct ExtractedBlock {
16 pub text: String,
17 pub offset: usize,
18 pub length: usize,
19 pub kind: BlockKind,
20 pub heading_path: Vec<String>,
21 pub attrs: HashMap<String, String>,
22}
23
/// Structural category assigned to an [`ExtractedBlock`].
///
/// The enum carries no payload, so it is `Copy` (cheap to pass by value) and
/// `Hash` (usable as a `HashMap`/`HashSet` key, e.g. for per-kind counters).
///
/// The rules for choosing a variant live in the concrete extractors, not in
/// this file; the variant names presumably describe the source construct the
/// block came from.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
pub enum BlockKind {
    Heading,
    Paragraph,
    ListItem,
    BlockQuote,
    CodeFence,
    TableHeader,
    TableRow,
    HtmlBlock,
}
35
36pub trait Extractor: Send + Sync {
37 fn supports(&self) -> &[&str];
38
39 fn profile_key(&self) -> &'static str {
40 "txt"
41 }
42
43 fn supports_path(&self, _path: &Path) -> bool {
44 false
45 }
46
47 fn extract(&self, path: &Path, bytes: &[u8]) -> Result<ExtractedDocument>;
48}
49
50#[derive(Default)]
51pub struct ExtractorRegistry {
52 by_extension: HashMap<String, Arc<dyn Extractor>>,
53 fallback_extractors: Vec<Arc<dyn Extractor>>,
54}
55
56impl ExtractorRegistry {
57 pub fn new() -> Self {
58 Self::default()
59 }
60
61 pub fn register(&mut self, extractor: Arc<dyn Extractor>) {
62 for extension in extractor.supports() {
63 self.by_extension
64 .insert(normalize_extension_key(extension), Arc::clone(&extractor));
65 }
66 self.fallback_extractors.push(extractor);
67 }
68
69 pub fn resolve_for_path(&self, path: &Path) -> Option<Arc<dyn Extractor>> {
70 if let Some(extension) = path.extension().and_then(|value| value.to_str()) {
71 let key = normalize_extension_key(extension);
72 if let Some(extractor) = self.by_extension.get(&key) {
73 return Some(Arc::clone(extractor));
74 }
75 }
76
77 for extractor in &self.fallback_extractors {
78 if extractor.supports_path(path) {
79 return Some(Arc::clone(extractor));
80 }
81 }
82
83 None
84 }
85}
86
87pub fn default_registry() -> ExtractorRegistry {
88 let mut registry = ExtractorRegistry::new();
89 registry.register(Arc::new(crate::ingest::markdown::MarkdownExtractor));
90 registry.register(Arc::new(crate::ingest::code::CodeExtractor));
91 registry.register(Arc::new(crate::ingest::plaintext::PlaintextExtractor));
92 registry
93}
94
/// Canonical form of a file extension used as a registry key.
///
/// Steps, in order: trim surrounding whitespace, remove every leading `.`,
/// then lowercase ASCII letters (non-ASCII characters are left untouched).
fn normalize_extension_key(raw: &str) -> String {
    let trimmed = raw.trim();
    let without_dots = trimmed.trim_start_matches('.');
    without_dots.to_ascii_lowercase()
}
98
#[cfg(test)]
mod tests {
    use std::collections::HashMap;
    use std::path::Path;
    use std::sync::Arc;

    use super::{
        default_registry, normalize_extension_key, BlockKind, ExtractedBlock, ExtractedDocument,
        Extractor, ExtractorRegistry,
    };
    use crate::Result;

    /// Extension-only test double: claims `txt`, keeps every trait default,
    /// and returns the whole input as a single paragraph block.
    struct DummyExtractor;

    impl Extractor for DummyExtractor {
        fn supports(&self) -> &[&str] {
            &["txt"]
        }

        fn extract(&self, _path: &Path, bytes: &[u8]) -> Result<ExtractedDocument> {
            let whole_input = ExtractedBlock {
                text: String::from_utf8_lossy(bytes).to_string(),
                offset: 0,
                length: bytes.len(),
                kind: BlockKind::Paragraph,
                heading_path: Vec::new(),
                attrs: HashMap::new(),
            };

            Ok(ExtractedDocument {
                blocks: vec![whole_input],
                metadata: HashMap::new(),
                title: None,
            })
        }
    }

    /// Path-only test double: claims no extensions and accepts exactly the
    /// file name `LICENSE`.
    struct PathFallbackExtractor;

    impl Extractor for PathFallbackExtractor {
        fn supports(&self) -> &[&str] {
            &[]
        }

        fn supports_path(&self, path: &Path) -> bool {
            matches!(
                path.file_name().and_then(|name| name.to_str()),
                Some("LICENSE")
            )
        }

        fn extract(&self, _path: &Path, _bytes: &[u8]) -> Result<ExtractedDocument> {
            Ok(ExtractedDocument {
                blocks: Vec::new(),
                metadata: HashMap::new(),
                title: None,
            })
        }
    }

    #[test]
    fn extractor_default_supports_path_is_false() {
        let accepted = DummyExtractor.supports_path(Path::new("notes/readme.txt"));
        assert!(!accepted);
    }

    #[test]
    fn extractor_default_profile_key_is_txt() {
        assert_eq!(DummyExtractor.profile_key(), "txt");
    }

    #[test]
    fn extracted_document_tracks_blocks_and_spans() {
        let document = DummyExtractor
            .extract(Path::new("notes/readme.txt"), b"hello world")
            .expect("extract document");

        assert_eq!(document.blocks.len(), 1);

        let only_block = &document.blocks[0];
        assert_eq!(only_block.offset, 0);
        assert_eq!(only_block.length, 11);
        assert_eq!(only_block.kind, BlockKind::Paragraph);
    }

    #[test]
    fn extension_key_normalization_trims_prefix_and_case() {
        let cases = [("MD", "md"), (".Markdown", "markdown"), (" rs ", "rs")];
        for (raw, expected) in cases {
            assert_eq!(normalize_extension_key(raw), expected);
        }
    }

    #[test]
    fn registry_resolves_by_extension_before_fallbacks() {
        let mut registry = ExtractorRegistry::new();
        registry.register(Arc::new(DummyExtractor));
        registry.register(Arc::new(PathFallbackExtractor));

        // Upper-case extension: the lookup key is normalized before matching.
        let upper_case_path = Path::new("notes/readme.TXT");
        let found = registry
            .resolve_for_path(upper_case_path)
            .expect("resolve txt extractor");
        assert_eq!(found.supports(), ["txt"]);
    }

    #[test]
    fn registry_uses_supports_path_as_fallback() {
        let mut registry = ExtractorRegistry::new();
        registry.register(Arc::new(PathFallbackExtractor));

        // No extension at all, so only the path fallback can match.
        let found = registry
            .resolve_for_path(Path::new("docs/LICENSE"))
            .expect("resolve fallback extractor");
        assert!(found.supports().is_empty());
    }

    #[test]
    fn default_registry_resolves_plaintext_extensions() {
        let registry = default_registry();
        let resolve = |raw: &str| registry.resolve_for_path(Path::new(raw));

        let plain = resolve("notes/readme.txt");
        let markdown = resolve("notes/readme.md");
        let source = resolve("src/lib.rs");
        let unrecognized = resolve("notes/readme.rst");

        assert!(plain.is_some());
        assert!(markdown
            .as_ref()
            .is_some_and(|found| found.supports().contains(&"md")));
        assert!(source
            .as_ref()
            .is_some_and(|found| found.profile_key() == "code"));
        // `.rst` is expected to be picked up by some built-in extractor.
        assert!(unrecognized.is_some());
    }
}
226}