// a3s_code_core/doc/registry.rs

1use anyhow::Result;
2use std::collections::HashMap;
3use std::path::Path;
4use std::sync::Arc;
5
6use super::{DocumentExtractionMetadata, DocumentParser, ExtractedDocument};
7
8#[derive(Clone)]
9pub struct DocumentParserRegistry {
10    parsers: Vec<Arc<dyn DocumentParser>>,
11    extension_map: HashMap<String, Arc<dyn DocumentParser>>,
12}
13
14impl DocumentParserRegistry {
15    pub fn empty() -> Self {
16        Self {
17            parsers: Vec::new(),
18            extension_map: HashMap::new(),
19        }
20    }
21
22    pub fn register(&mut self, parser: Arc<dyn DocumentParser>) {
23        for ext in parser.supported_extensions() {
24            self.extension_map
25                .insert(ext.to_lowercase(), Arc::clone(&parser));
26        }
27        self.parsers.push(parser);
28    }
29
30    pub fn find_parser(&self, path: &Path) -> Option<Arc<dyn DocumentParser>> {
31        if let Some(ext) = path.extension().and_then(|e| e.to_str()) {
32            if let Some(parser) = self.extension_map.get(&ext.to_lowercase()) {
33                return Some(Arc::clone(parser));
34            }
35        }
36
37        self.parsers
38            .iter()
39            .find(|parser| parser.can_parse(path))
40            .cloned()
41    }
42
43    pub fn parse_file_extracted(&self, path: &Path) -> Result<Option<ExtractedDocument>> {
44        let parser = match self.find_parser(path) {
45            Some(parser) => parser,
46            None => return Ok(None),
47        };
48
49        if let Ok(meta) = std::fs::metadata(path) {
50            if meta.len() > parser.max_file_size() {
51                tracing::debug!(
52                    "Skipping {} ({}): exceeds parser '{}' limit of {} bytes",
53                    path.display(),
54                    meta.len(),
55                    parser.name(),
56                    parser.max_file_size()
57                );
58                return Ok(None);
59            }
60        }
61
62        match parser.parse_extracted(path) {
63            Ok(document) => Ok(Some(annotate_extracted_document(document, parser.as_ref()))),
64            Err(error) => {
65                tracing::warn!(
66                    "Parser '{}' failed on {}: {}",
67                    parser.name(),
68                    path.display(),
69                    error
70                );
71                Ok(None)
72            }
73        }
74    }
75
76    pub fn parse_file(&self, path: &Path) -> Result<Option<String>> {
77        Ok(self
78            .parse_file_extracted(path)?
79            .map(ExtractedDocument::into_parsed_document)
80            .map(|document| document.to_text()))
81    }
82
83    pub fn parsers(&self) -> &[Arc<dyn DocumentParser>] {
84        &self.parsers
85    }
86
87    pub fn len(&self) -> usize {
88        self.parsers.len()
89    }
90
91    pub fn is_empty(&self) -> bool {
92        self.parsers.is_empty()
93    }
94}
95
96impl Default for DocumentParserRegistry {
97    fn default() -> Self {
98        Self::empty()
99    }
100}
101
102fn annotate_extracted_document(
103    mut extracted: ExtractedDocument,
104    parser: &dyn DocumentParser,
105) -> ExtractedDocument {
106    let metadata = extracted
107        .extraction_metadata
108        .get_or_insert_with(DocumentExtractionMetadata::default);
109    if metadata.parser_name.is_none() {
110        metadata.parser_name = Some(parser.name().to_string());
111    }
112    if metadata.parser_signature.is_none() {
113        metadata.parser_signature = Some(parser.signature());
114    }
115    if metadata.extractor.is_none() {
116        metadata.extractor = extracted
117            .document
118            .metadata
119            .as_ref()
120            .and_then(|metadata| metadata.provenance.as_ref())
121            .and_then(|provenance| provenance.extractor.clone());
122    }
123    if metadata.detected_file_type.is_none() {
124        metadata.detected_file_type = extracted
125            .document
126            .metadata
127            .as_ref()
128            .and_then(|metadata| metadata.detected_file_type.clone());
129    }
130    extracted
131}
132
#[cfg(test)]
mod tests {
    use super::*;
    use tempfile::TempDir;

    /// Creates `name` inside `dir` holding `content` and returns its path.
    fn write_temp(dir: &TempDir, name: &str, content: &str) -> std::path::PathBuf {
        let target = dir.path().join(name);
        std::fs::write(&target, content).unwrap();
        target
    }

    /// An empty registry reports itself empty and matches no path.
    #[test]
    fn registry_empty_has_no_parsers() {
        let registry = DocumentParserRegistry::empty();
        assert!(registry.is_empty());

        let lookup = registry.find_parser(Path::new("main.rs"));
        assert!(lookup.is_none());
    }

    /// Two parsers claim the same extension; the one registered last is used.
    #[test]
    fn registry_later_registration_wins() {
        struct FirstTxtParser;
        impl DocumentParser for FirstTxtParser {
            fn name(&self) -> &str {
                "a"
            }
            fn supported_extensions(&self) -> &[&str] {
                &["txt"]
            }
            fn parse(&self, _: &Path) -> Result<String> {
                Ok("A".into())
            }
        }

        struct SecondTxtParser;
        impl DocumentParser for SecondTxtParser {
            fn name(&self) -> &str {
                "b"
            }
            fn supported_extensions(&self) -> &[&str] {
                &["txt"]
            }
            fn parse(&self, _: &Path) -> Result<String> {
                Ok("B".into())
            }
        }

        let mut registry = DocumentParserRegistry::empty();
        registry.register(Arc::new(FirstTxtParser));
        registry.register(Arc::new(SecondTxtParser));

        let chosen = registry.find_parser(Path::new("file.txt")).unwrap();
        assert_eq!(chosen.name(), "b");
    }

    /// A matching parser yields a document whose text contains the file body.
    #[test]
    fn parse_file_extracted_returns_structured_output() {
        struct SourceTextParser;
        impl DocumentParser for SourceTextParser {
            fn name(&self) -> &str {
                "text"
            }
            fn supported_extensions(&self) -> &[&str] {
                &["rs"]
            }
            fn parse(&self, path: &Path) -> Result<String> {
                Ok(std::fs::read_to_string(path)?)
            }
        }

        let dir = TempDir::new().unwrap();
        let source = write_temp(&dir, "hello.rs", "fn main() {}");

        let mut registry = DocumentParserRegistry::empty();
        registry.register(Arc::new(SourceTextParser));

        let outcome = registry.parse_file_extracted(&source).unwrap();
        assert!(outcome.is_some());

        let text = outcome.unwrap().into_parsed_document().to_text();
        assert!(text.contains("fn main"));
    }

    /// The registry stamps the parser's name and signature onto the result.
    #[test]
    fn parse_file_extracted_annotates_parser_metadata() {
        struct SignedTextParser;
        impl DocumentParser for SignedTextParser {
            fn name(&self) -> &str {
                "text"
            }
            fn signature(&self) -> String {
                "text@v1".to_string()
            }
            fn supported_extensions(&self) -> &[&str] {
                &["rs"]
            }
            fn parse(&self, path: &Path) -> Result<String> {
                Ok(std::fs::read_to_string(path)?)
            }
        }

        let dir = TempDir::new().unwrap();
        let source = write_temp(&dir, "hello.rs", "fn main() {}");

        let mut registry = DocumentParserRegistry::empty();
        registry.register(Arc::new(SignedTextParser));
        let extracted = registry.parse_file_extracted(&source).unwrap().unwrap();

        let annotations = extracted.extraction_metadata.as_ref();
        assert_eq!(
            annotations.and_then(|metadata| metadata.parser_name.as_deref()),
            Some("text")
        );
        assert_eq!(
            annotations.and_then(|metadata| metadata.parser_signature.as_deref()),
            Some("text@v1")
        );
    }

    /// Files larger than the parser's declared limit are skipped.
    #[test]
    fn parse_file_skips_oversized_file() {
        struct ThreeByteLimitParser;
        impl DocumentParser for ThreeByteLimitParser {
            fn name(&self) -> &str {
                "tiny"
            }
            fn supported_extensions(&self) -> &[&str] {
                &["dat"]
            }
            fn parse(&self, path: &Path) -> Result<String> {
                Ok(std::fs::read_to_string(path)?)
            }
            fn max_file_size(&self) -> u64 {
                3
            }
        }

        let dir = TempDir::new().unwrap();
        let oversized = write_temp(&dir, "big.dat", "more than 3 bytes");

        let mut registry = DocumentParserRegistry::empty();
        registry.register(Arc::new(ThreeByteLimitParser));

        let parsed = registry.parse_file(&oversized).unwrap();
        assert!(parsed.is_none());
    }
}