1use anyhow::Result;
37use std::collections::HashMap;
38use std::path::Path;
39use std::sync::Arc;
40
/// A pluggable component that turns a file on disk into text.
///
/// Implementors must be `Send + Sync`. Only [`name`](DocumentParser::name),
/// [`supported_extensions`](DocumentParser::supported_extensions) and
/// [`parse`](DocumentParser::parse) are required; the remaining methods have
/// defaults that can be overridden.
pub trait DocumentParser: Send + Sync {
    /// Short identifier for this parser.
    fn name(&self) -> &str;

    /// Extensions this parser claims, written without the leading dot.
    ///
    /// The default [`can_parse`](DocumentParser::can_parse) compares these
    /// against `Path::extension`, ignoring ASCII case.
    fn supported_extensions(&self) -> &[&str];

    /// Produces the textual content of the file at `path`.
    ///
    /// # Errors
    ///
    /// Returns an error when the implementation cannot produce text for
    /// `path`.
    fn parse(&self, path: &Path) -> Result<String>;

    /// Reports whether this parser claims `path`.
    ///
    /// The default answer is `true` only when `path` has a UTF-8 extension
    /// equal (ASCII case-insensitively) to one of
    /// [`supported_extensions`](DocumentParser::supported_extensions). Paths
    /// without an extension are never claimed by the default implementation.
    fn can_parse(&self, path: &Path) -> bool {
        match path.extension().and_then(|raw| raw.to_str()) {
            Some(extension) => self
                .supported_extensions()
                .iter()
                .any(|candidate| candidate.eq_ignore_ascii_case(extension)),
            None => false,
        }
    }

    /// Upper bound, in bytes, on the size of files this parser accepts.
    ///
    /// Defaults to 10 MiB.
    fn max_file_size(&self) -> u64 {
        const MIB: u64 = 1024 * 1024;
        10 * MIB
    }
}
85
86pub struct PlainTextParser;
94
95impl DocumentParser for PlainTextParser {
96 fn name(&self) -> &str {
97 "plain-text"
98 }
99
100 fn supported_extensions(&self) -> &[&str] {
101 &[
102 "rs",
104 "py",
105 "ts",
106 "tsx",
107 "js",
108 "jsx",
109 "go",
110 "java",
111 "c",
112 "cpp",
113 "h",
114 "hpp",
115 "cs",
116 "rb",
117 "php",
118 "swift",
119 "kt",
120 "scala",
121 "sh",
122 "bash",
123 "zsh",
124 "fish",
125 "toml",
127 "yaml",
128 "yml",
129 "json",
130 "jsonc",
131 "ini",
132 "conf",
133 "cfg",
134 "env",
135 "xml",
136 "md",
138 "mdx",
139 "txt",
140 "rst",
141 "adoc",
142 "org",
143 "html",
145 "htm",
146 "css",
147 "scss",
148 "sass",
149 "less",
150 "csv",
152 "tsv",
153 "log",
154 "makefile",
156 "dockerfile",
157 "gradlew",
158 ]
159 }
160
161 fn parse(&self, path: &Path) -> Result<String> {
162 std::fs::read_to_string(path).map_err(|e| {
163 anyhow::anyhow!(
164 "plain-text parser: failed to read {}: {}",
165 path.display(),
166 e
167 )
168 })
169 }
170
171 fn max_file_size(&self) -> u64 {
172 1024 * 1024 }
174}
175
176#[derive(Clone)]
186pub struct DocumentParserRegistry {
187 parsers: Vec<Arc<dyn DocumentParser>>,
189 extension_map: HashMap<String, Arc<dyn DocumentParser>>,
191}
192
193impl DocumentParserRegistry {
194 pub fn new() -> Self {
196 let mut r = Self::empty();
197 r.register(Arc::new(PlainTextParser));
198 r
199 }
200
201 pub fn empty() -> Self {
203 Self {
204 parsers: Vec::new(),
205 extension_map: HashMap::new(),
206 }
207 }
208
209 pub fn register(&mut self, parser: Arc<dyn DocumentParser>) {
211 for ext in parser.supported_extensions() {
212 self.extension_map
213 .insert(ext.to_lowercase(), Arc::clone(&parser));
214 }
215 self.parsers.push(parser);
216 }
217
218 pub fn find_parser(&self, path: &Path) -> Option<Arc<dyn DocumentParser>> {
221 if let Some(ext) = path.extension().and_then(|e| e.to_str()) {
223 if let Some(p) = self.extension_map.get(&ext.to_lowercase()) {
224 return Some(Arc::clone(p));
225 }
226 }
227 self.parsers.iter().find(|p| p.can_parse(path)).cloned()
229 }
230
231 pub fn parse_file(&self, path: &Path) -> Result<Option<String>> {
238 let parser = match self.find_parser(path) {
239 Some(p) => p,
240 None => return Ok(None),
241 };
242
243 if let Ok(meta) = std::fs::metadata(path) {
244 if meta.len() > parser.max_file_size() {
245 tracing::debug!(
246 "Skipping {} ({}): exceeds parser '{}' limit of {} bytes",
247 path.display(),
248 meta.len(),
249 parser.name(),
250 parser.max_file_size()
251 );
252 return Ok(None);
253 }
254 }
255
256 match parser.parse(path) {
257 Ok(content) => Ok(Some(content)),
258 Err(e) => {
259 tracing::warn!(
260 "Parser '{}' failed on {}: {}",
261 parser.name(),
262 path.display(),
263 e
264 );
265 Ok(None)
266 }
267 }
268 }
269
270 pub fn parsers(&self) -> &[Arc<dyn DocumentParser>] {
272 &self.parsers
273 }
274
275 pub fn len(&self) -> usize {
277 self.parsers.len()
278 }
279
280 pub fn is_empty(&self) -> bool {
282 self.parsers.is_empty()
283 }
284}
285
286impl Default for DocumentParserRegistry {
287 fn default() -> Self {
288 Self::new()
289 }
290}
291
#[cfg(test)]
mod tests {
    use super::*;
    use tempfile::TempDir;

    /// Creates `name` inside `dir` holding `content` and returns its path.
    fn write_temp(dir: &TempDir, name: &str, content: &str) -> std::path::PathBuf {
        let path = dir.path().join(name);
        std::fs::write(&path, content).unwrap();
        path
    }

    /// Parser for `.dat` files that only accepts files of at most 3 bytes.
    struct TinyMaxParser;

    impl DocumentParser for TinyMaxParser {
        fn name(&self) -> &str {
            "tiny"
        }
        fn supported_extensions(&self) -> &[&str] {
            &["dat"]
        }
        fn parse(&self, path: &Path) -> Result<String> {
            std::fs::read_to_string(path).map_err(Into::into)
        }
        fn max_file_size(&self) -> u64 {
            3
        }
    }

    #[test]
    fn plain_text_parser_basic() {
        let parser = PlainTextParser;
        assert_eq!(parser.name(), "plain-text");
        assert!(parser.supported_extensions().contains(&"rs"));
        assert!(parser.supported_extensions().contains(&"md"));
        assert!(parser.supported_extensions().contains(&"json"));
    }

    #[test]
    fn registry_default_has_plain_text() {
        let r = DocumentParserRegistry::new();
        assert_eq!(r.len(), 1);
        assert!(r.find_parser(Path::new("main.rs")).is_some());
    }

    #[test]
    fn registry_empty_has_no_parsers() {
        let r = DocumentParserRegistry::empty();
        assert!(r.is_empty());
        assert!(r.find_parser(Path::new("main.rs")).is_none());
    }

    #[test]
    fn registry_finds_parser_by_extension() {
        let r = DocumentParserRegistry::new();
        assert!(r.find_parser(Path::new("main.rs")).is_some());
        assert!(r.find_parser(Path::new("config.toml")).is_some());
        assert!(r.find_parser(Path::new("README.md")).is_some());
    }

    #[test]
    fn registry_extension_lookup_ignores_case() {
        let r = DocumentParserRegistry::new();
        assert!(r.find_parser(Path::new("MAIN.RS")).is_some());
        assert!(r.find_parser(Path::new("Notes.Md")).is_some());
    }

    #[test]
    fn registry_no_parser_for_binary() {
        let r = DocumentParserRegistry::new();
        assert!(r.find_parser(Path::new("binary.exe")).is_none());
        assert!(r.find_parser(Path::new("document.pdf")).is_none());
    }

    #[test]
    fn registry_later_registration_wins() {
        struct ParserA;
        impl DocumentParser for ParserA {
            fn name(&self) -> &str {
                "a"
            }
            fn supported_extensions(&self) -> &[&str] {
                &["txt"]
            }
            fn parse(&self, _: &Path) -> Result<String> {
                Ok("A".into())
            }
        }

        struct ParserB;
        impl DocumentParser for ParserB {
            fn name(&self) -> &str {
                "b"
            }
            fn supported_extensions(&self) -> &[&str] {
                &["txt"]
            }
            fn parse(&self, _: &Path) -> Result<String> {
                Ok("B".into())
            }
        }

        let mut r = DocumentParserRegistry::empty();
        r.register(Arc::new(ParserA));
        r.register(Arc::new(ParserB));

        let p = r.find_parser(Path::new("file.txt")).unwrap();
        assert_eq!(p.name(), "b");
        // Both parsers stay listed even though only the later one is mapped.
        assert_eq!(r.len(), 2);
    }

    #[test]
    fn parse_file_reads_text() {
        let dir = TempDir::new().unwrap();
        let path = write_temp(&dir, "hello.rs", "fn main() {}");

        let r = DocumentParserRegistry::new();
        let result = r.parse_file(&path).unwrap();
        assert!(result.is_some());
        assert!(result.unwrap().contains("fn main"));
    }

    #[test]
    fn parse_file_returns_none_for_unknown_extension() {
        let dir = TempDir::new().unwrap();
        let path = write_temp(&dir, "file.xyz", "data");

        let r = DocumentParserRegistry::new();
        assert!(r.parse_file(&path).unwrap().is_none());
    }

    #[test]
    fn parse_file_returns_none_when_read_fails() {
        // The extension is known but the file does not exist: the parser's
        // error must be swallowed and reported as `Ok(None)`.
        let dir = TempDir::new().unwrap();
        let missing = dir.path().join("missing.rs");

        let r = DocumentParserRegistry::new();
        assert!(r.parse_file(&missing).unwrap().is_none());
    }

    #[test]
    fn parse_file_skips_oversized_file() {
        let dir = TempDir::new().unwrap();
        let path = write_temp(&dir, "big.dat", "more than 3 bytes");

        let mut r = DocumentParserRegistry::empty();
        r.register(Arc::new(TinyMaxParser));

        assert!(r.parse_file(&path).unwrap().is_none());
    }

    #[test]
    fn parse_file_accepts_file_at_exact_size_limit() {
        // The limit is inclusive: a file of exactly `max_file_size` bytes
        // is still parsed.
        let dir = TempDir::new().unwrap();
        let path = write_temp(&dir, "edge.dat", "abc");

        let mut r = DocumentParserRegistry::empty();
        r.register(Arc::new(TinyMaxParser));

        assert_eq!(r.parse_file(&path).unwrap().as_deref(), Some("abc"));
    }
}