Skip to main content

a3s_code_core/document/
parser.rs

//! Document parser types used by A3S Code's context acquisition pipeline.
//!
//! These types exist so `agentic_search`, `agentic_parse`, and session wiring
//! can register a small set of document parsers when better context
//! extraction is needed.
//!
//! They are not intended to turn `a3s-code-core` into a general-purpose
//! document processing framework.
//!
//! # Architecture
//!
//! - **Contracts**: parser trait and registry live in `crate::doc`
//! - **Core defaults**: `PlainTextParser` plus the internal composite parser factory live here
//! - **Built-in tools**: `agentic_search` and `agentic_parse` consume this registry via `ToolContext`
//! - **Goal**: recover better model context from non-plaintext project files
//!
//! # Example
//!
//! ```rust,no_run
//! use a3s_code_core::document_parser::{DocumentParser, DocumentParserRegistry};
//! use std::path::Path;
//! use anyhow::Result;
//!
//! struct PdfParser;
//!
//! impl DocumentParser for PdfParser {
//!     fn name(&self) -> &str { "pdf" }
//!     fn supported_extensions(&self) -> &[&str] { &["pdf"] }
//!     fn parse(&self, path: &Path) -> Result<String> {
//!         todo!()
//!     }
//! }
//!
//! let mut registry = DocumentParserRegistry::empty();
//! registry.register(std::sync::Arc::new(PdfParser));
//! ```
37
38use anyhow::Result;
39use std::path::Path;
40use std::sync::Arc;
41
42pub use crate::doc::{
43    DocumentBlock, DocumentBlockKind, DocumentBlockLocation, DocumentConfidence, DocumentMetadata,
44    DocumentParser, DocumentParserRegistry, DocumentProvenance, ParsedDocument,
45};
46
47/// Built-in parser for all common text, code, and config formats.
48///
49/// Handles UTF-8 files up to 1 MiB. Binary or oversized files are skipped.
50pub struct PlainTextParser;
51
52impl DocumentParser for PlainTextParser {
53    fn name(&self) -> &str {
54        "plain-text"
55    }
56
57    fn supported_extensions(&self) -> &[&str] {
58        &[
59            "rs",
60            "py",
61            "ts",
62            "tsx",
63            "js",
64            "jsx",
65            "go",
66            "java",
67            "c",
68            "cpp",
69            "h",
70            "hpp",
71            "cs",
72            "rb",
73            "php",
74            "swift",
75            "kt",
76            "scala",
77            "sh",
78            "bash",
79            "zsh",
80            "fish",
81            "toml",
82            "yaml",
83            "yml",
84            "json",
85            "jsonc",
86            "ini",
87            "conf",
88            "cfg",
89            "env",
90            "xml",
91            "md",
92            "mdx",
93            "txt",
94            "rst",
95            "adoc",
96            "org",
97            "tex",
98            "latex",
99            "typ",
100            "typst",
101            "html",
102            "htm",
103            "css",
104            "scss",
105            "sass",
106            "less",
107            "csv",
108            "tsv",
109            "log",
110            "makefile",
111            "dockerfile",
112            "gradlew",
113        ]
114    }
115
116    fn parse(&self, path: &Path) -> Result<String> {
117        std::fs::read_to_string(path).map_err(|e| {
118            anyhow::anyhow!(
119                "plain-text parser: failed to read {}: {}",
120                path.display(),
121                e
122            )
123        })
124    }
125
126    fn parse_extracted(&self, path: &Path) -> Result<crate::document_pipeline::ExtractedDocument> {
127        Ok(crate::document_pipeline::ExtractedDocument::new(
128            crate::document_parser::ParsedDocument::from_text(self.parse(path)?),
129        ))
130    }
131
132    fn max_file_size(&self) -> u64 {
133        1024 * 1024
134    }
135}
136
137/// Build the default document parser registry using the default parser config.
138pub fn default_document_parser_registry() -> DocumentParserRegistry {
139    crate::document_registry_factory::build_document_parser_registry(
140        crate::config::DocumentParserConfig::default(),
141        None,
142    )
143}
144
145/// Build the default document parser registry using an explicit parser config.
146pub fn document_parser_registry_with_config(
147    config: crate::config::DocumentParserConfig,
148) -> DocumentParserRegistry {
149    crate::document_registry_factory::build_document_parser_registry(config, None)
150}
151
152/// Build the default document parser registry using an explicit parser config
153/// and OCR provider.
154pub fn document_parser_registry_with_config_and_ocr(
155    config: crate::config::DocumentParserConfig,
156    ocr_provider: Arc<dyn crate::document_ocr::DocumentOcrProvider>,
157) -> DocumentParserRegistry {
158    crate::document_registry_factory::build_document_parser_registry(config, Some(ocr_provider))
159}
160
#[cfg(test)]
mod tests {
    use super::*;
    use tempfile::TempDir;

    /// Creates `name` inside `dir` holding `content` and returns its path.
    fn write_temp(dir: &TempDir, name: &str, content: &str) -> std::path::PathBuf {
        let target = dir.path().join(name);
        std::fs::write(&target, content).unwrap();
        target
    }

    /// Registry built from the default config with no OCR provider.
    fn build_registry() -> DocumentParserRegistry {
        default_document_parser_registry()
    }

    #[test]
    fn plain_text_parser_basic() {
        let parser = PlainTextParser;
        assert_eq!(parser.name(), "plain-text");

        let extensions = parser.supported_extensions();
        for expected in ["rs", "md", "tex", "typst", "json"] {
            assert!(
                extensions.contains(&expected),
                "missing extension: {expected}"
            );
        }
    }

    #[test]
    fn registry_default_has_plain_text() {
        let registry = build_registry();
        // The default registry is expected to hold more than just the
        // plain-text parser.
        assert!(registry.len() >= 2);
        assert!(registry.find_parser(Path::new("main.rs")).is_some());
    }

    #[test]
    fn registry_finds_parser_by_extension() {
        let registry = build_registry();
        for file_name in ["main.rs", "config.toml", "README.md"] {
            assert!(
                registry.find_parser(Path::new(file_name)).is_some(),
                "no parser found for {file_name}"
            );
        }
    }

    #[test]
    fn registry_no_parser_for_binary() {
        let registry = build_registry();
        // Executables have no parser ...
        assert!(registry.find_parser(Path::new("binary.exe")).is_none());
        // ... whereas PDFs are expected to resolve in the default registry.
        assert!(registry.find_parser(Path::new("document.pdf")).is_some());
    }

    #[test]
    fn parse_file_reads_text() {
        let dir = TempDir::new().unwrap();
        let source_path = write_temp(&dir, "hello.rs", "fn main() {}");

        let text = build_registry()
            .parse_file(&source_path)
            .unwrap()
            .expect("a parser should handle .rs files");
        assert!(text.contains("fn main"));
    }

    #[test]
    fn parse_file_extracted_returns_structured_output() {
        let dir = TempDir::new().unwrap();
        let source_path = write_temp(&dir, "hello.rs", "fn main() {}");

        let extracted = build_registry()
            .parse_file_extracted(&source_path)
            .unwrap()
            .expect("a parser should handle .rs files");
        let text = extracted.into_parsed_document().to_text();
        assert!(text.contains("fn main"));
    }

    #[test]
    fn parsed_document_stats_helpers() {
        // One block with real content and one that is whitespace only.
        let filled = DocumentBlock::new(DocumentBlockKind::Paragraph, Some("intro"), "hello world");
        let blank = DocumentBlock::new(DocumentBlockKind::Raw, None::<String>, "   ");

        let document = ParsedDocument {
            title: Some("hello".to_string()),
            blocks: vec![filled, blank],
            metadata: None,
            ..Default::default()
        };

        assert_eq!(document.block_count(), 2);
        assert_eq!(document.non_empty_block_count(), 1);
        assert!(document.char_count() >= "hello".len());
    }

    #[test]
    fn document_block_location_builders() {
        let located = DocumentBlock::new(DocumentBlockKind::Paragraph, Some("intro"), "hello")
            .with_source("chapter1")
            .with_page(3)
            .with_ordinal(7);

        let Some(location) = located.location else {
            panic!("location should exist");
        };
        assert_eq!(location.source.as_deref(), Some("chapter1"));
        assert_eq!(location.page, Some(3));
        assert_eq!(location.ordinal, Some(7));
    }

    #[test]
    fn parse_file_returns_none_for_unknown_extension() {
        let dir = TempDir::new().unwrap();
        let unknown_path = write_temp(&dir, "file.xyz", "data");

        let outcome = build_registry().parse_file(&unknown_path).unwrap();
        assert!(outcome.is_none());
    }
}