Skip to main content

a3s_code_core/
document_parser.rs

1//! Document Parser Extension Point
2//!
3//! `DocumentParser` is a core extension point that allows users to extend
4//! agentic tools (agentic_search, agentic_parse, etc.) with custom file format
5//! support for binary and structured formats such as PDF, Excel, Word, etc.
6//!
7//! # Architecture
8//!
9//! - **Core**: `DocumentParser` trait + `DocumentParserRegistry` live here
10//! - **Default**: `PlainTextParser` covers all common text/code formats
11//! - **Plugins**: agentic-search and agentic-parse use this registry via `ToolContext`
12//! - **Custom**: Users register additional parsers via `SessionOptions`
13//!
14//! # Example
15//!
16//! ```rust,no_run
17//! use a3s_code_core::document_parser::{DocumentParser, DocumentParserRegistry};
18//! use std::path::Path;
19//! use anyhow::Result;
20//!
21//! struct PdfParser;
22//!
23//! impl DocumentParser for PdfParser {
24//!     fn name(&self) -> &str { "pdf" }
25//!     fn supported_extensions(&self) -> &[&str] { &["pdf"] }
26//!     fn parse(&self, path: &Path) -> Result<String> {
27//!         // e.g. pdf_extract::extract_text(path)
28//!         todo!()
29//!     }
30//! }
31//!
32//! let mut registry = DocumentParserRegistry::new();
33//! registry.register(std::sync::Arc::new(PdfParser));
34//! ```
35
36use anyhow::Result;
37use std::collections::HashMap;
38use std::path::Path;
39use std::sync::Arc;
40
41// ============================================================================
42// DocumentParser trait
43// ============================================================================
44
45/// Extension point for custom file format parsing.
46///
47/// Implement this trait to add support for formats that cannot be read as plain
48/// text (PDF, Excel, Word, images with OCR, etc.) without modifying any core
49/// tool logic.
50pub trait DocumentParser: Send + Sync {
51    /// Unique parser identifier (used for logging and debugging).
52    fn name(&self) -> &str;
53
54    /// File extensions this parser handles (case-insensitive, no leading dot).
55    ///
56    /// Example: `&["pdf", "PDF"]`
57    fn supported_extensions(&self) -> &[&str];
58
59    /// Extract plain-text content from `path`.
60    ///
61    /// Return `Err` if the file cannot be read or parsed; the registry will
62    /// log a warning and skip the file rather than propagating the error.
63    fn parse(&self, path: &Path) -> Result<String>;
64
65    /// Override to control whether this parser will attempt a file before the
66    /// extension lookup.  The default checks extension against
67    /// `supported_extensions()`.
68    fn can_parse(&self, path: &Path) -> bool {
69        path.extension()
70            .and_then(|e| e.to_str())
71            .map(|ext| {
72                self.supported_extensions()
73                    .iter()
74                    .any(|s| s.eq_ignore_ascii_case(ext))
75            })
76            .unwrap_or(false)
77    }
78
79    /// Maximum file size (bytes) this parser accepts.  Files larger than this
80    /// limit are silently skipped.  Default: 10 MiB.
81    fn max_file_size(&self) -> u64 {
82        10 * 1024 * 1024
83    }
84}
85
86// ============================================================================
87// PlainTextParser — built-in default
88// ============================================================================
89
90/// Built-in parser for all common text, code, and config formats.
91///
92/// Handles UTF-8 files up to 1 MiB.  Binary or oversized files are skipped.
93pub struct PlainTextParser;
94
95impl DocumentParser for PlainTextParser {
96    fn name(&self) -> &str {
97        "plain-text"
98    }
99
100    fn supported_extensions(&self) -> &[&str] {
101        &[
102            // Source code
103            "rs",
104            "py",
105            "ts",
106            "tsx",
107            "js",
108            "jsx",
109            "go",
110            "java",
111            "c",
112            "cpp",
113            "h",
114            "hpp",
115            "cs",
116            "rb",
117            "php",
118            "swift",
119            "kt",
120            "scala",
121            "sh",
122            "bash",
123            "zsh",
124            "fish",
125            // Config / data
126            "toml",
127            "yaml",
128            "yml",
129            "json",
130            "jsonc",
131            "ini",
132            "conf",
133            "cfg",
134            "env",
135            "xml",
136            // Documentation
137            "md",
138            "mdx",
139            "txt",
140            "rst",
141            "adoc",
142            "org",
143            // Web
144            "html",
145            "htm",
146            "css",
147            "scss",
148            "sass",
149            "less",
150            // Data
151            "csv",
152            "tsv",
153            "log",
154            // Build
155            "makefile",
156            "dockerfile",
157            "gradlew",
158        ]
159    }
160
161    fn parse(&self, path: &Path) -> Result<String> {
162        std::fs::read_to_string(path).map_err(|e| {
163            anyhow::anyhow!(
164                "plain-text parser: failed to read {}: {}",
165                path.display(),
166                e
167            )
168        })
169    }
170
171    fn max_file_size(&self) -> u64 {
172        1024 * 1024 // 1 MiB
173    }
174}
175
176// ============================================================================
177// DocumentParserRegistry
178// ============================================================================
179
180/// Registry that maps file extensions to `DocumentParser` implementations.
181///
182/// - Created with `new()`: includes `PlainTextParser` for all common formats.
183/// - Created with `empty()`: no parsers at all (useful for overriding defaults).
184/// - Later registrations override earlier ones for the same extension.
185#[derive(Clone)]
186pub struct DocumentParserRegistry {
187    /// Registration order is preserved for fallback `can_parse` checks.
188    parsers: Vec<Arc<dyn DocumentParser>>,
189    /// Fast O(1) lookup by lowercase extension.
190    extension_map: HashMap<String, Arc<dyn DocumentParser>>,
191}
192
193impl DocumentParserRegistry {
194    /// Create a registry pre-loaded with `PlainTextParser`.
195    pub fn new() -> Self {
196        let mut r = Self::empty();
197        r.register(Arc::new(PlainTextParser));
198        r
199    }
200
201    /// Create an empty registry (no parsers registered).
202    pub fn empty() -> Self {
203        Self {
204            parsers: Vec::new(),
205            extension_map: HashMap::new(),
206        }
207    }
208
209    /// Register a parser.  Later registrations win for the same extension.
210    pub fn register(&mut self, parser: Arc<dyn DocumentParser>) {
211        for ext in parser.supported_extensions() {
212            self.extension_map
213                .insert(ext.to_lowercase(), Arc::clone(&parser));
214        }
215        self.parsers.push(parser);
216    }
217
218    /// Find the parser responsible for `path` (extension lookup, then linear
219    /// `can_parse` scan).
220    pub fn find_parser(&self, path: &Path) -> Option<Arc<dyn DocumentParser>> {
221        // Fast path: extension map
222        if let Some(ext) = path.extension().and_then(|e| e.to_str()) {
223            if let Some(p) = self.extension_map.get(&ext.to_lowercase()) {
224                return Some(Arc::clone(p));
225            }
226        }
227        // Slow path: can_parse scan (for parsers without extensions, e.g. Dockerfile)
228        self.parsers.iter().find(|p| p.can_parse(path)).cloned()
229    }
230
231    /// Parse `path` and return the extracted text.
232    ///
233    /// Returns:
234    /// - `Ok(Some(text))` — parsed successfully
235    /// - `Ok(None)` — no parser available, or file too large
236    /// - `Err(_)` — I/O or metadata failure (not a parse failure)
237    pub fn parse_file(&self, path: &Path) -> Result<Option<String>> {
238        let parser = match self.find_parser(path) {
239            Some(p) => p,
240            None => return Ok(None),
241        };
242
243        if let Ok(meta) = std::fs::metadata(path) {
244            if meta.len() > parser.max_file_size() {
245                tracing::debug!(
246                    "Skipping {} ({}): exceeds parser '{}' limit of {} bytes",
247                    path.display(),
248                    meta.len(),
249                    parser.name(),
250                    parser.max_file_size()
251                );
252                return Ok(None);
253            }
254        }
255
256        match parser.parse(path) {
257            Ok(content) => Ok(Some(content)),
258            Err(e) => {
259                tracing::warn!(
260                    "Parser '{}' failed on {}: {}",
261                    parser.name(),
262                    path.display(),
263                    e
264                );
265                Ok(None)
266            }
267        }
268    }
269
270    /// All registered parsers (in registration order).
271    pub fn parsers(&self) -> &[Arc<dyn DocumentParser>] {
272        &self.parsers
273    }
274
275    /// Number of registered parsers.
276    pub fn len(&self) -> usize {
277        self.parsers.len()
278    }
279
280    /// Returns `true` if no parsers are registered.
281    pub fn is_empty(&self) -> bool {
282        self.parsers.is_empty()
283    }
284}
285
286impl Default for DocumentParserRegistry {
287    fn default() -> Self {
288        Self::new()
289    }
290}
291
292// ============================================================================
293// Tests
294// ============================================================================
295
#[cfg(test)]
mod tests {
    use super::*;
    use std::io::Write;
    use tempfile::TempDir;

    /// Create `name` inside `dir` containing `content` and return its path.
    fn write_temp(dir: &TempDir, name: &str, content: &str) -> std::path::PathBuf {
        let target = dir.path().join(name);
        let mut file = std::fs::File::create(&target).unwrap();
        file.write_all(content.as_bytes()).unwrap();
        target
    }

    #[test]
    fn plain_text_parser_basic() {
        let parser = PlainTextParser;
        assert_eq!(parser.name(), "plain-text");

        let extensions = parser.supported_extensions();
        for expected in ["rs", "md", "json"].iter() {
            assert!(extensions.contains(expected));
        }
    }

    #[test]
    fn registry_default_has_plain_text() {
        let registry = DocumentParserRegistry::new();
        assert_eq!(registry.len(), 1);
        assert!(registry.find_parser(Path::new("main.rs")).is_some());
    }

    #[test]
    fn registry_empty_has_no_parsers() {
        let registry = DocumentParserRegistry::empty();
        assert!(registry.is_empty());
        assert!(registry.find_parser(Path::new("main.rs")).is_none());
    }

    #[test]
    fn registry_finds_parser_by_extension() {
        let registry = DocumentParserRegistry::new();
        for file in ["main.rs", "config.toml", "README.md"].iter() {
            assert!(registry.find_parser(Path::new(file)).is_some());
        }
    }

    #[test]
    fn registry_no_parser_for_binary() {
        let registry = DocumentParserRegistry::new();
        for file in ["binary.exe", "document.pdf"].iter() {
            assert!(registry.find_parser(Path::new(file)).is_none());
        }
    }

    #[test]
    fn registry_later_registration_wins() {
        // Two parsers claiming the same extension; only their names and output differ.
        struct ParserA;
        impl DocumentParser for ParserA {
            fn name(&self) -> &str {
                "a"
            }
            fn supported_extensions(&self) -> &[&str] {
                &["txt"]
            }
            fn parse(&self, _: &Path) -> Result<String> {
                Ok(String::from("A"))
            }
        }

        struct ParserB;
        impl DocumentParser for ParserB {
            fn name(&self) -> &str {
                "b"
            }
            fn supported_extensions(&self) -> &[&str] {
                &["txt"]
            }
            fn parse(&self, _: &Path) -> Result<String> {
                Ok(String::from("B"))
            }
        }

        let mut registry = DocumentParserRegistry::empty();
        registry.register(Arc::new(ParserA));
        registry.register(Arc::new(ParserB));

        let winner = registry.find_parser(Path::new("file.txt")).unwrap();
        assert_eq!(winner.name(), "b");
    }

    #[test]
    fn parse_file_reads_text() {
        let dir = TempDir::new().unwrap();
        let source = write_temp(&dir, "hello.rs", "fn main() {}");

        let registry = DocumentParserRegistry::new();
        let parsed = registry.parse_file(&source).unwrap();
        match parsed {
            Some(text) => assert!(text.contains("fn main")),
            None => panic!("expected hello.rs to be parsed"),
        }
    }

    #[test]
    fn parse_file_returns_none_for_unknown_extension() {
        let dir = TempDir::new().unwrap();
        let unknown = write_temp(&dir, "file.xyz", "data");

        let registry = DocumentParserRegistry::new();
        let parsed = registry.parse_file(&unknown).unwrap();
        assert!(parsed.is_none());
    }

    #[test]
    fn parse_file_skips_oversized_file() {
        // Parser whose size limit (3 bytes) is smaller than the fixture below.
        struct TinyMaxParser;
        impl DocumentParser for TinyMaxParser {
            fn name(&self) -> &str {
                "tiny"
            }
            fn supported_extensions(&self) -> &[&str] {
                &["dat"]
            }
            fn parse(&self, path: &Path) -> Result<String> {
                std::fs::read_to_string(path).map_err(Into::into)
            }
            fn max_file_size(&self) -> u64 {
                3
            }
        }

        let dir = TempDir::new().unwrap();
        let oversized = write_temp(&dir, "big.dat", "more than 3 bytes");

        let mut registry = DocumentParserRegistry::empty();
        registry.register(Arc::new(TinyMaxParser));

        let parsed = registry.parse_file(&oversized).unwrap();
        assert!(parsed.is_none());
    }
}