Skip to main content

a3s_code_core/
document_parser.rs

1//! Document Parser Extension Point
2//!
3//! `DocumentParser` is a core extension point that allows users to extend
4//! agentic tools (agentic_search, agentic_parse, etc.) with custom file format
5//! support for binary and structured formats such as PDF, Excel, Word, etc.
6//!
7//! # Architecture
8//!
9//! - **Core**: `DocumentParser` trait + `DocumentParserRegistry` live here
10//! - **Default**: `PlainTextParser` covers all common text/code formats
11//! - **Built-in tools**: agentic-search and agentic-parse use this registry via `ToolContext`
12//! - **Custom**: Users register additional parsers via `SessionOptions`
13//!
14//! # Example
15//!
16//! ```rust,no_run
17//! use a3s_code_core::document_parser::{DocumentParser, DocumentParserRegistry};
18//! use std::path::Path;
19//! use anyhow::Result;
20//!
21//! struct PdfParser;
22//!
23//! impl DocumentParser for PdfParser {
24//!     fn name(&self) -> &str { "pdf" }
25//!     fn supported_extensions(&self) -> &[&str] { &["pdf"] }
26//!     fn parse(&self, path: &Path) -> Result<String> {
27//!         // e.g. pdf_extract::extract_text(path)
28//!         todo!()
29//!     }
30//! }
31//!
32//! let mut registry = DocumentParserRegistry::new();
33//! registry.register(std::sync::Arc::new(PdfParser));
34//! ```
35
36use anyhow::Result;
37use std::collections::HashMap;
38use std::path::Path;
39use std::sync::Arc;
40
41// ============================================================================
42// Structured document model
43// ============================================================================
44
/// Coarse classification of a [`DocumentBlock`].
///
/// The variants carry no data; a parser picks whichever label best matches
/// the content it extracted. `Raw` is what [`ParsedDocument::from_text`]
/// uses when wrapping unstructured text.
#[derive(Debug, Clone, PartialEq, Eq)]
pub enum DocumentBlockKind {
    /// Paragraph of body text.
    Paragraph,
    /// Heading or title line.
    Heading,
    /// Tabular content.
    Table,
    /// Section of a larger document.
    Section,
    /// Metadata about the document rather than its body.
    Metadata,
    /// Slide of a presentation.
    Slide,
    /// Header portion of an email message.
    EmailHeader,
    /// Source code or other preformatted code text.
    Code,
    /// Unclassified text, e.g. the whole output of a plain-text parse.
    Raw,
}
57
/// Optional provenance information attached to a [`DocumentBlock`].
///
/// Every field is independent and defaults to `None`; parsers fill in only
/// what they know.
#[derive(Debug, Clone, Default, PartialEq, Eq)]
pub struct DocumentBlockLocation {
    /// Free-form identifier of where the block came from (parser-defined).
    pub source: Option<String>,
    /// Page number, if the format has pages. The numbering base is whatever
    /// the producing parser uses; this type does not define it.
    pub page: Option<usize>,
    /// Position of the block within its source, as assigned by the parser.
    pub ordinal: Option<usize>,
}
64
65#[derive(Debug, Clone, PartialEq, Eq)]
66pub struct DocumentBlock {
67    pub kind: DocumentBlockKind,
68    pub label: Option<String>,
69    pub content: String,
70    pub location: Option<DocumentBlockLocation>,
71}
72
73impl DocumentBlock {
74    pub fn new(
75        kind: DocumentBlockKind,
76        label: Option<impl Into<String>>,
77        content: impl Into<String>,
78    ) -> Self {
79        Self {
80            kind,
81            label: label.map(Into::into),
82            content: content.into(),
83            location: None,
84        }
85    }
86
87    pub fn with_source(mut self, source: impl Into<String>) -> Self {
88        self.location
89            .get_or_insert_with(DocumentBlockLocation::default)
90            .source = Some(source.into());
91        self
92    }
93
94    pub fn with_page(mut self, page: usize) -> Self {
95        self.location
96            .get_or_insert_with(DocumentBlockLocation::default)
97            .page = Some(page);
98        self
99    }
100
101    pub fn with_ordinal(mut self, ordinal: usize) -> Self {
102        self.location
103            .get_or_insert_with(DocumentBlockLocation::default)
104            .ordinal = Some(ordinal);
105        self
106    }
107}
108
109#[derive(Debug, Clone, Default, PartialEq, Eq)]
110pub struct ParsedDocument {
111    pub title: Option<String>,
112    pub blocks: Vec<DocumentBlock>,
113}
114
115impl ParsedDocument {
116    pub fn new() -> Self {
117        Self::default()
118    }
119
120    pub fn from_text(text: impl Into<String>) -> Self {
121        Self {
122            title: None,
123            blocks: vec![DocumentBlock::new(
124                DocumentBlockKind::Raw,
125                None::<String>,
126                text,
127            )],
128        }
129    }
130
131    pub fn with_title(mut self, title: impl Into<String>) -> Self {
132        self.title = Some(title.into());
133        self
134    }
135
136    pub fn push(&mut self, block: DocumentBlock) {
137        self.blocks.push(block);
138    }
139
140    pub fn block_count(&self) -> usize {
141        self.blocks.len()
142    }
143
144    pub fn non_empty_block_count(&self) -> usize {
145        self.blocks
146            .iter()
147            .filter(|block| !block.content.trim().is_empty())
148            .count()
149    }
150
151    pub fn char_count(&self) -> usize {
152        self.to_text().chars().count()
153    }
154
155    pub fn is_empty(&self) -> bool {
156        self.blocks.iter().all(|b| b.content.trim().is_empty())
157    }
158
159    pub fn to_text(&self) -> String {
160        let mut parts = Vec::new();
161        if let Some(title) = &self.title {
162            if !title.trim().is_empty() {
163                parts.push(title.trim().to_string());
164            }
165        }
166        for block in &self.blocks {
167            let mut chunk = String::new();
168            if let Some(label) = &block.label {
169                if !label.trim().is_empty() {
170                    chunk.push_str(label.trim());
171                    chunk.push('\n');
172                }
173            }
174            chunk.push_str(block.content.trim());
175            if !chunk.trim().is_empty() {
176                parts.push(chunk.trim().to_string());
177            }
178        }
179        parts.join("\n\n")
180    }
181}
182
183// ============================================================================
184// DocumentParser trait
185// ============================================================================
186
187/// Extension point for custom file format parsing.
188///
189/// Implement this trait to add support for formats that cannot be read as plain
190/// text (PDF, Excel, Word, images with OCR, etc.) without modifying any core
191/// tool logic.
192pub trait DocumentParser: Send + Sync {
193    /// Unique parser identifier (used for logging and debugging).
194    fn name(&self) -> &str;
195
196    /// File extensions this parser handles (case-insensitive, no leading dot).
197    ///
198    /// Example: `&["pdf", "PDF"]`
199    fn supported_extensions(&self) -> &[&str];
200
201    /// Extract plain-text content from `path`.
202    ///
203    /// Return `Err` if the file cannot be read or parsed; the registry will
204    /// log a warning and skip the file rather than propagating the error.
205    fn parse(&self, path: &Path) -> Result<String>;
206
207    /// Extract a structured document from `path`.
208    ///
209    /// The default implementation wraps [`DocumentParser::parse`] into a
210    /// single raw-text block so existing parsers remain source-compatible.
211    fn parse_document(&self, path: &Path) -> Result<ParsedDocument> {
212        Ok(ParsedDocument::from_text(self.parse(path)?))
213    }
214
215    /// Override to control whether this parser will attempt a file before the
216    /// extension lookup.  The default checks extension against
217    /// `supported_extensions()`.
218    fn can_parse(&self, path: &Path) -> bool {
219        path.extension()
220            .and_then(|e| e.to_str())
221            .map(|ext| {
222                self.supported_extensions()
223                    .iter()
224                    .any(|s| s.eq_ignore_ascii_case(ext))
225            })
226            .unwrap_or(false)
227    }
228
229    /// Maximum file size (bytes) this parser accepts.  Files larger than this
230    /// limit are silently skipped.  Default: 10 MiB.
231    fn max_file_size(&self) -> u64 {
232        10 * 1024 * 1024
233    }
234}
235
236// ============================================================================
237// PlainTextParser — built-in default
238// ============================================================================
239
240/// Built-in parser for all common text, code, and config formats.
241///
242/// Handles UTF-8 files up to 1 MiB.  Binary or oversized files are skipped.
243pub struct PlainTextParser;
244
245impl DocumentParser for PlainTextParser {
246    fn name(&self) -> &str {
247        "plain-text"
248    }
249
250    fn supported_extensions(&self) -> &[&str] {
251        &[
252            // Source code
253            "rs",
254            "py",
255            "ts",
256            "tsx",
257            "js",
258            "jsx",
259            "go",
260            "java",
261            "c",
262            "cpp",
263            "h",
264            "hpp",
265            "cs",
266            "rb",
267            "php",
268            "swift",
269            "kt",
270            "scala",
271            "sh",
272            "bash",
273            "zsh",
274            "fish",
275            // Config / data
276            "toml",
277            "yaml",
278            "yml",
279            "json",
280            "jsonc",
281            "ini",
282            "conf",
283            "cfg",
284            "env",
285            "xml",
286            // Documentation
287            "md",
288            "mdx",
289            "txt",
290            "rst",
291            "adoc",
292            "org",
293            // Web
294            "html",
295            "htm",
296            "css",
297            "scss",
298            "sass",
299            "less",
300            // Data
301            "csv",
302            "tsv",
303            "log",
304            // Build
305            "makefile",
306            "dockerfile",
307            "gradlew",
308        ]
309    }
310
311    fn parse(&self, path: &Path) -> Result<String> {
312        std::fs::read_to_string(path).map_err(|e| {
313            anyhow::anyhow!(
314                "plain-text parser: failed to read {}: {}",
315                path.display(),
316                e
317            )
318        })
319    }
320
321    fn max_file_size(&self) -> u64 {
322        1024 * 1024 // 1 MiB
323    }
324}
325
326// ============================================================================
327// DocumentParserRegistry
328// ============================================================================
329
330/// Registry that maps file extensions to `DocumentParser` implementations.
331///
332/// - Created with `new()`: includes `PlainTextParser` for all common formats.
333/// - Created with `empty()`: no parsers at all (useful for overriding defaults).
334/// - Later registrations override earlier ones for the same extension.
335#[derive(Clone)]
336pub struct DocumentParserRegistry {
337    /// Registration order is preserved for fallback `can_parse` checks.
338    parsers: Vec<Arc<dyn DocumentParser>>,
339    /// Fast O(1) lookup by lowercase extension.
340    extension_map: HashMap<String, Arc<dyn DocumentParser>>,
341}
342
343impl DocumentParserRegistry {
344    /// Create a registry pre-loaded with `PlainTextParser`.
345    pub fn new() -> Self {
346        Self::new_with_default_parser(crate::config::DefaultParserConfig::default(), None)
347    }
348
349    /// Create a registry pre-loaded with `PlainTextParser` and a configured
350    /// `DefaultParser`.
351    pub fn new_with_default_parser_config(config: crate::config::DefaultParserConfig) -> Self {
352        Self::new_with_default_parser(config, None)
353    }
354
355    /// Create a registry pre-loaded with `PlainTextParser` and a configured
356    /// `DefaultParser`, optionally wiring an OCR provider into that parser.
357    pub fn new_with_default_parser(
358        config: crate::config::DefaultParserConfig,
359        ocr_provider: Option<Arc<dyn crate::default_parser::DefaultParserOcrProvider>>,
360    ) -> Self {
361        let mut r = Self::empty();
362        r.register(Arc::new(PlainTextParser));
363        if config.enabled {
364            let parser = match ocr_provider {
365                Some(provider) => {
366                    crate::default_parser::DefaultParser::with_config_and_ocr(config, provider)
367                }
368                None => crate::default_parser::DefaultParser::with_config(config),
369            };
370            r.register(Arc::new(parser));
371        }
372        r
373    }
374
375    /// Create an empty registry (no parsers registered).
376    pub fn empty() -> Self {
377        Self {
378            parsers: Vec::new(),
379            extension_map: HashMap::new(),
380        }
381    }
382
383    /// Register a parser.  Later registrations win for the same extension.
384    pub fn register(&mut self, parser: Arc<dyn DocumentParser>) {
385        for ext in parser.supported_extensions() {
386            self.extension_map
387                .insert(ext.to_lowercase(), Arc::clone(&parser));
388        }
389        self.parsers.push(parser);
390    }
391
392    /// Find the parser responsible for `path` (extension lookup, then linear
393    /// `can_parse` scan).
394    pub fn find_parser(&self, path: &Path) -> Option<Arc<dyn DocumentParser>> {
395        // Fast path: extension map
396        if let Some(ext) = path.extension().and_then(|e| e.to_str()) {
397            if let Some(p) = self.extension_map.get(&ext.to_lowercase()) {
398                return Some(Arc::clone(p));
399            }
400        }
401        // Slow path: can_parse scan (for parsers without extensions, e.g. Dockerfile)
402        self.parsers.iter().find(|p| p.can_parse(path)).cloned()
403    }
404
405    /// Parse `path` and return a structured document.
406    ///
407    /// Returns:
408    /// - `Ok(Some(document))` — parsed successfully
409    /// - `Ok(None)` — no parser available, or file too large
410    /// - `Err(_)` — I/O or metadata failure (not a parse failure)
411    pub fn parse_file_document(&self, path: &Path) -> Result<Option<ParsedDocument>> {
412        let parser = match self.find_parser(path) {
413            Some(p) => p,
414            None => return Ok(None),
415        };
416
417        if let Ok(meta) = std::fs::metadata(path) {
418            if meta.len() > parser.max_file_size() {
419                tracing::debug!(
420                    "Skipping {} ({}): exceeds parser '{}' limit of {} bytes",
421                    path.display(),
422                    meta.len(),
423                    parser.name(),
424                    parser.max_file_size()
425                );
426                return Ok(None);
427            }
428        }
429
430        match parser.parse_document(path) {
431            Ok(document) => Ok(Some(document)),
432            Err(e) => {
433                tracing::warn!(
434                    "Parser '{}' failed on {}: {}",
435                    parser.name(),
436                    path.display(),
437                    e
438                );
439                Ok(None)
440            }
441        }
442    }
443
444    /// Parse `path` and return extracted text for compatibility with older
445    /// call sites.
446    pub fn parse_file(&self, path: &Path) -> Result<Option<String>> {
447        Ok(self
448            .parse_file_document(path)?
449            .map(|document| document.to_text()))
450    }
451
452    /// All registered parsers (in registration order).
453    pub fn parsers(&self) -> &[Arc<dyn DocumentParser>] {
454        &self.parsers
455    }
456
457    /// Number of registered parsers.
458    pub fn len(&self) -> usize {
459        self.parsers.len()
460    }
461
462    /// Returns `true` if no parsers are registered.
463    pub fn is_empty(&self) -> bool {
464        self.parsers.is_empty()
465    }
466}
467
468impl Default for DocumentParserRegistry {
469    fn default() -> Self {
470        Self::new()
471    }
472}
473
474// ============================================================================
475// Tests
476// ============================================================================
477
#[cfg(test)]
mod tests {
    use super::*;
    use tempfile::TempDir;

    /// Creates `name` inside `dir` with the given `content` and returns the
    /// full path of the new file.
    fn write_temp(dir: &TempDir, name: &str, content: &str) -> std::path::PathBuf {
        let target = dir.path().join(name);
        std::fs::write(&target, content).unwrap();
        target
    }

    #[test]
    fn plain_text_parser_basic() {
        let parser = PlainTextParser;
        assert_eq!(parser.name(), "plain-text");

        let extensions = parser.supported_extensions();
        for expected in ["rs", "md", "json"] {
            assert!(extensions.contains(&expected));
        }
    }

    #[test]
    fn registry_default_has_plain_text() {
        let registry = DocumentParserRegistry::new();
        // Plain-text parser plus the default parser.
        assert!(registry.len() >= 2);
        assert!(registry.find_parser(Path::new("main.rs")).is_some());
    }

    #[test]
    fn registry_empty_has_no_parsers() {
        let registry = DocumentParserRegistry::empty();
        assert!(registry.is_empty());
        assert!(registry.find_parser(Path::new("main.rs")).is_none());
    }

    #[test]
    fn registry_finds_parser_by_extension() {
        let registry = DocumentParserRegistry::new();
        for file_name in ["main.rs", "config.toml", "README.md"] {
            assert!(registry.find_parser(Path::new(file_name)).is_some());
        }
    }

    #[test]
    fn registry_no_parser_for_binary() {
        let registry = DocumentParserRegistry::new();
        let unknown = registry.find_parser(Path::new("binary.exe"));
        let pdf = registry.find_parser(Path::new("document.pdf"));
        assert!(unknown.is_none());
        assert!(pdf.is_some());
    }

    #[test]
    fn registry_later_registration_wins() {
        struct ParserA;
        impl DocumentParser for ParserA {
            fn name(&self) -> &str {
                "a"
            }
            fn supported_extensions(&self) -> &[&str] {
                &["txt"]
            }
            fn parse(&self, _: &Path) -> Result<String> {
                Ok(String::from("A"))
            }
        }

        struct ParserB;
        impl DocumentParser for ParserB {
            fn name(&self) -> &str {
                "b"
            }
            fn supported_extensions(&self) -> &[&str] {
                &["txt"]
            }
            fn parse(&self, _: &Path) -> Result<String> {
                Ok(String::from("B"))
            }
        }

        let mut registry = DocumentParserRegistry::empty();
        registry.register(Arc::new(ParserA));
        registry.register(Arc::new(ParserB));

        let winner = registry.find_parser(Path::new("file.txt")).unwrap();
        assert_eq!(winner.name(), "b");
    }

    #[test]
    fn parse_file_reads_text() {
        let dir = TempDir::new().unwrap();
        let source = write_temp(&dir, "hello.rs", "fn main() {}");

        let registry = DocumentParserRegistry::new();
        let text = registry.parse_file(&source).unwrap();
        assert!(text.is_some());
        assert!(text.unwrap().contains("fn main"));
    }

    #[test]
    fn parse_file_document_returns_structured_output() {
        let dir = TempDir::new().unwrap();
        let source = write_temp(&dir, "hello.rs", "fn main() {}");

        let registry = DocumentParserRegistry::new();
        let parsed = registry.parse_file_document(&source).unwrap();
        assert!(parsed.is_some());

        let document = parsed.unwrap();
        assert!(!document.blocks.is_empty());
        assert!(document.to_text().contains("fn main"));
    }

    #[test]
    fn parsed_document_stats_helpers() {
        let intro = DocumentBlock::new(DocumentBlockKind::Paragraph, Some("intro"), "hello world");
        let blank = DocumentBlock::new(DocumentBlockKind::Raw, None::<String>, "   ");
        let document = ParsedDocument {
            title: Some("hello".to_string()),
            blocks: vec![intro, blank],
        };

        assert_eq!(document.block_count(), 2);
        assert_eq!(document.non_empty_block_count(), 1);
        assert!(document.char_count() >= "hello".len());
    }

    #[test]
    fn document_block_location_builders() {
        let block = DocumentBlock::new(DocumentBlockKind::Paragraph, Some("intro"), "hello")
            .with_source("chapter1")
            .with_page(3)
            .with_ordinal(7);

        let DocumentBlockLocation {
            source,
            page,
            ordinal,
        } = block.location.expect("location should exist");
        assert_eq!(source.as_deref(), Some("chapter1"));
        assert_eq!(page, Some(3));
        assert_eq!(ordinal, Some(7));
    }

    #[test]
    fn parse_file_returns_none_for_unknown_extension() {
        let dir = TempDir::new().unwrap();
        let unknown = write_temp(&dir, "file.xyz", "data");

        let registry = DocumentParserRegistry::new();
        let outcome = registry.parse_file(&unknown).unwrap();
        assert!(outcome.is_none());
    }

    #[test]
    fn parse_file_skips_oversized_file() {
        /// Parser whose size limit is far below the size of the test file.
        struct TinyMaxParser;
        impl DocumentParser for TinyMaxParser {
            fn name(&self) -> &str {
                "tiny"
            }
            fn supported_extensions(&self) -> &[&str] {
                &["dat"]
            }
            fn parse(&self, path: &Path) -> Result<String> {
                std::fs::read_to_string(path).map_err(Into::into)
            }
            fn max_file_size(&self) -> u64 {
                3 // bytes
            }
        }

        let dir = TempDir::new().unwrap();
        let oversized = write_temp(&dir, "big.dat", "more than 3 bytes");

        let mut registry = DocumentParserRegistry::empty();
        registry.register(Arc::new(TinyMaxParser));

        let outcome = registry.parse_file(&oversized).unwrap();
        assert!(outcome.is_none());
    }
}