a3s_code_core/doc/
registry.rs1use anyhow::Result;
2use std::collections::HashMap;
3use std::path::Path;
4use std::sync::Arc;
5
6use super::{DocumentExtractionMetadata, DocumentParser, ExtractedDocument};
7
8#[derive(Clone)]
9pub struct DocumentParserRegistry {
10 parsers: Vec<Arc<dyn DocumentParser>>,
11 extension_map: HashMap<String, Arc<dyn DocumentParser>>,
12}
13
14impl DocumentParserRegistry {
15 pub fn empty() -> Self {
16 Self {
17 parsers: Vec::new(),
18 extension_map: HashMap::new(),
19 }
20 }
21
22 pub fn register(&mut self, parser: Arc<dyn DocumentParser>) {
23 for ext in parser.supported_extensions() {
24 self.extension_map
25 .insert(ext.to_lowercase(), Arc::clone(&parser));
26 }
27 self.parsers.push(parser);
28 }
29
30 pub fn find_parser(&self, path: &Path) -> Option<Arc<dyn DocumentParser>> {
31 if let Some(ext) = path.extension().and_then(|e| e.to_str()) {
32 if let Some(parser) = self.extension_map.get(&ext.to_lowercase()) {
33 return Some(Arc::clone(parser));
34 }
35 }
36
37 self.parsers
38 .iter()
39 .find(|parser| parser.can_parse(path))
40 .cloned()
41 }
42
43 pub fn parse_file_extracted(&self, path: &Path) -> Result<Option<ExtractedDocument>> {
44 let parser = match self.find_parser(path) {
45 Some(parser) => parser,
46 None => return Ok(None),
47 };
48
49 if let Ok(meta) = std::fs::metadata(path) {
50 if meta.len() > parser.max_file_size() {
51 tracing::debug!(
52 "Skipping {} ({}): exceeds parser '{}' limit of {} bytes",
53 path.display(),
54 meta.len(),
55 parser.name(),
56 parser.max_file_size()
57 );
58 return Ok(None);
59 }
60 }
61
62 match parser.parse_extracted(path) {
63 Ok(document) => Ok(Some(annotate_extracted_document(document, parser.as_ref()))),
64 Err(error) => {
65 tracing::warn!(
66 "Parser '{}' failed on {}: {}",
67 parser.name(),
68 path.display(),
69 error
70 );
71 Ok(None)
72 }
73 }
74 }
75
76 pub fn parse_file(&self, path: &Path) -> Result<Option<String>> {
77 Ok(self
78 .parse_file_extracted(path)?
79 .map(ExtractedDocument::into_parsed_document)
80 .map(|document| document.to_text()))
81 }
82
83 pub fn parsers(&self) -> &[Arc<dyn DocumentParser>] {
84 &self.parsers
85 }
86
87 pub fn len(&self) -> usize {
88 self.parsers.len()
89 }
90
91 pub fn is_empty(&self) -> bool {
92 self.parsers.is_empty()
93 }
94}
95
96impl Default for DocumentParserRegistry {
97 fn default() -> Self {
98 Self::empty()
99 }
100}
101
102fn annotate_extracted_document(
103 mut extracted: ExtractedDocument,
104 parser: &dyn DocumentParser,
105) -> ExtractedDocument {
106 let metadata = extracted
107 .extraction_metadata
108 .get_or_insert_with(DocumentExtractionMetadata::default);
109 if metadata.parser_name.is_none() {
110 metadata.parser_name = Some(parser.name().to_string());
111 }
112 if metadata.parser_signature.is_none() {
113 metadata.parser_signature = Some(parser.signature());
114 }
115 if metadata.extractor.is_none() {
116 metadata.extractor = extracted
117 .document
118 .metadata
119 .as_ref()
120 .and_then(|metadata| metadata.provenance.as_ref())
121 .and_then(|provenance| provenance.extractor.clone());
122 }
123 if metadata.detected_file_type.is_none() {
124 metadata.detected_file_type = extracted
125 .document
126 .metadata
127 .as_ref()
128 .and_then(|metadata| metadata.detected_file_type.clone());
129 }
130 extracted
131}
132
#[cfg(test)]
mod tests {
    use super::*;
    use std::io::Write;
    use tempfile::TempDir;

    /// Creates `name` inside `dir` containing `content` and returns its path.
    fn write_temp(dir: &TempDir, name: &str, content: &str) -> std::path::PathBuf {
        let target = dir.path().join(name);
        let mut file = std::fs::File::create(&target).unwrap();
        file.write_all(content.as_bytes()).unwrap();
        target
    }

    /// An empty registry reports itself empty and resolves no parser.
    #[test]
    fn registry_empty_has_no_parsers() {
        let registry = DocumentParserRegistry::empty();
        assert!(registry.is_empty());
        assert!(registry.find_parser(Path::new("main.rs")).is_none());
    }

    /// Two parsers claiming the same extension: the later one is returned.
    #[test]
    fn registry_later_registration_wins() {
        struct ParserA;
        impl DocumentParser for ParserA {
            fn name(&self) -> &str {
                "a"
            }
            fn supported_extensions(&self) -> &[&str] {
                &["txt"]
            }
            fn parse(&self, _: &Path) -> Result<String> {
                Ok("A".into())
            }
        }

        struct ParserB;
        impl DocumentParser for ParserB {
            fn name(&self) -> &str {
                "b"
            }
            fn supported_extensions(&self) -> &[&str] {
                &["txt"]
            }
            fn parse(&self, _: &Path) -> Result<String> {
                Ok("B".into())
            }
        }

        let mut registry = DocumentParserRegistry::empty();
        registry.register(Arc::new(ParserA));
        registry.register(Arc::new(ParserB));

        let winner = registry.find_parser(Path::new("file.txt")).unwrap();
        assert_eq!(winner.name(), "b");
    }

    /// A registered parser yields a document whose text contains the file body.
    #[test]
    fn parse_file_extracted_returns_structured_output() {
        struct TextParser;
        impl DocumentParser for TextParser {
            fn name(&self) -> &str {
                "text"
            }
            fn supported_extensions(&self) -> &[&str] {
                &["rs"]
            }
            fn parse(&self, path: &Path) -> Result<String> {
                Ok(std::fs::read_to_string(path)?)
            }
        }

        let dir = TempDir::new().unwrap();
        let source = write_temp(&dir, "hello.rs", "fn main() {}");

        let mut registry = DocumentParserRegistry::empty();
        registry.register(Arc::new(TextParser));

        let text = registry
            .parse_file_extracted(&source)
            .unwrap()
            .map(|extracted| extracted.into_parsed_document().to_text());
        assert!(text.is_some());
        assert!(text.unwrap().contains("fn main"));
    }

    /// Parser name and signature are recorded in the extraction metadata.
    #[test]
    fn parse_file_extracted_annotates_parser_metadata() {
        struct TextParser;
        impl DocumentParser for TextParser {
            fn name(&self) -> &str {
                "text"
            }
            fn signature(&self) -> String {
                "text@v1".to_string()
            }
            fn supported_extensions(&self) -> &[&str] {
                &["rs"]
            }
            fn parse(&self, path: &Path) -> Result<String> {
                Ok(std::fs::read_to_string(path)?)
            }
        }

        let dir = TempDir::new().unwrap();
        let source = write_temp(&dir, "hello.rs", "fn main() {}");

        let mut registry = DocumentParserRegistry::empty();
        registry.register(Arc::new(TextParser));
        let extracted = registry.parse_file_extracted(&source).unwrap().unwrap();

        let annotations = extracted.extraction_metadata.as_ref();
        assert_eq!(
            annotations.and_then(|meta| meta.parser_name.as_deref()),
            Some("text")
        );
        assert_eq!(
            annotations.and_then(|meta| meta.parser_signature.as_deref()),
            Some("text@v1")
        );
    }

    /// A file larger than the parser's size limit is skipped, not parsed.
    #[test]
    fn parse_file_skips_oversized_file() {
        struct TinyMaxParser;
        impl DocumentParser for TinyMaxParser {
            fn name(&self) -> &str {
                "tiny"
            }
            fn supported_extensions(&self) -> &[&str] {
                &["dat"]
            }
            fn parse(&self, path: &Path) -> Result<String> {
                Ok(std::fs::read_to_string(path)?)
            }
            fn max_file_size(&self) -> u64 {
                3
            }
        }

        let dir = TempDir::new().unwrap();
        let oversized = write_temp(&dir, "big.dat", "more than 3 bytes");

        let mut registry = DocumentParserRegistry::empty();
        registry.register(Arc::new(TinyMaxParser));

        let parsed = registry.parse_file(&oversized).unwrap();
        assert_eq!(parsed, None);
    }
}