1use anyhow::Result;
39use std::path::Path;
40use std::sync::Arc;
41
42pub use crate::doc::{
43 DocumentBlock, DocumentBlockKind, DocumentBlockLocation, DocumentConfidence, DocumentMetadata,
44 DocumentParser, DocumentParserRegistry, DocumentProvenance, ParsedDocument,
45};
46
/// Stateless parser that returns a file's contents as text.
///
/// The type carries no data, so it is cheap to copy and can be created with
/// either the `PlainTextParser` literal or `PlainTextParser::default()`.
#[derive(Debug, Clone, Copy, Default)]
pub struct PlainTextParser;
51
52impl DocumentParser for PlainTextParser {
53 fn name(&self) -> &str {
54 "plain-text"
55 }
56
57 fn supported_extensions(&self) -> &[&str] {
58 &[
59 "rs",
60 "py",
61 "ts",
62 "tsx",
63 "js",
64 "jsx",
65 "go",
66 "java",
67 "c",
68 "cpp",
69 "h",
70 "hpp",
71 "cs",
72 "rb",
73 "php",
74 "swift",
75 "kt",
76 "scala",
77 "sh",
78 "bash",
79 "zsh",
80 "fish",
81 "toml",
82 "yaml",
83 "yml",
84 "json",
85 "jsonc",
86 "ini",
87 "conf",
88 "cfg",
89 "env",
90 "xml",
91 "md",
92 "mdx",
93 "txt",
94 "rst",
95 "adoc",
96 "org",
97 "tex",
98 "latex",
99 "typ",
100 "typst",
101 "html",
102 "htm",
103 "css",
104 "scss",
105 "sass",
106 "less",
107 "csv",
108 "tsv",
109 "log",
110 "makefile",
111 "dockerfile",
112 "gradlew",
113 ]
114 }
115
116 fn parse(&self, path: &Path) -> Result<String> {
117 std::fs::read_to_string(path).map_err(|e| {
118 anyhow::anyhow!(
119 "plain-text parser: failed to read {}: {}",
120 path.display(),
121 e
122 )
123 })
124 }
125
126 fn parse_extracted(&self, path: &Path) -> Result<crate::document_pipeline::ExtractedDocument> {
127 Ok(crate::document_pipeline::ExtractedDocument::new(
128 crate::document_parser::ParsedDocument::from_text(self.parse(path)?),
129 ))
130 }
131
132 fn max_file_size(&self) -> u64 {
133 1024 * 1024
134 }
135}
136
137pub fn default_document_parser_registry() -> DocumentParserRegistry {
139 crate::document_registry_factory::build_document_parser_registry(
140 crate::config::DocumentParserConfig::default(),
141 None,
142 )
143}
144
145pub fn document_parser_registry_with_config(
147 config: crate::config::DocumentParserConfig,
148) -> DocumentParserRegistry {
149 crate::document_registry_factory::build_document_parser_registry(config, None)
150}
151
152pub fn document_parser_registry_with_config_and_ocr(
155 config: crate::config::DocumentParserConfig,
156 ocr_provider: Arc<dyn crate::document_ocr::DocumentOcrProvider>,
157) -> DocumentParserRegistry {
158 crate::document_registry_factory::build_document_parser_registry(config, Some(ocr_provider))
159}
160
#[cfg(test)]
mod tests {
    use super::*;
    use tempfile::TempDir;

    /// Creates `name` inside `dir`, fills it with `content`, and returns its path.
    fn write_temp(dir: &TempDir, name: &str, content: &str) -> std::path::PathBuf {
        let target = dir.path().join(name);
        std::fs::write(&target, content).unwrap();
        target
    }

    /// Registry built from the default configuration, with no OCR provider.
    fn build_registry() -> DocumentParserRegistry {
        let config = crate::config::DocumentParserConfig::default();
        crate::document_registry_factory::build_document_parser_registry(config, None)
    }

    #[test]
    fn plain_text_parser_basic() {
        let parser = PlainTextParser;
        assert_eq!(parser.name(), "plain-text");

        let extensions = parser.supported_extensions();
        for expected in &["rs", "md", "tex", "typst", "json"] {
            assert!(extensions.contains(expected));
        }
    }

    #[test]
    fn registry_default_has_plain_text() {
        let registry = build_registry();
        assert!(registry.len() >= 2);
        assert!(registry.find_parser(Path::new("main.rs")).is_some());
    }

    #[test]
    fn registry_finds_parser_by_extension() {
        let registry = build_registry();
        for file_name in &["main.rs", "config.toml", "README.md"] {
            assert!(registry.find_parser(Path::new(file_name)).is_some());
        }
    }

    #[test]
    fn registry_no_parser_for_binary() {
        let registry = build_registry();
        let exe_parser = registry.find_parser(Path::new("binary.exe"));
        assert!(exe_parser.is_none());
        // The default registry is expected to resolve a parser for PDFs.
        let pdf_parser = registry.find_parser(Path::new("document.pdf"));
        assert!(pdf_parser.is_some());
    }

    #[test]
    fn parse_file_reads_text() {
        let tmp = TempDir::new().unwrap();
        let source_path = write_temp(&tmp, "hello.rs", "fn main() {}");

        let registry = build_registry();
        // First unwrap: parsing succeeded; second unwrap: a parser was found.
        let text = registry.parse_file(&source_path).unwrap().unwrap();
        assert!(text.contains("fn main"));
    }

    #[test]
    fn parse_file_extracted_returns_structured_output() {
        let tmp = TempDir::new().unwrap();
        let source_path = write_temp(&tmp, "hello.rs", "fn main() {}");

        let registry = build_registry();
        // First unwrap: parsing succeeded; second unwrap: a parser was found.
        let extracted = registry.parse_file_extracted(&source_path).unwrap().unwrap();
        let text = extracted.into_parsed_document().to_text();
        assert!(text.contains("fn main"));
    }

    #[test]
    fn parsed_document_stats_helpers() {
        let paragraph =
            DocumentBlock::new(DocumentBlockKind::Paragraph, Some("intro"), "hello world");
        let whitespace_only = DocumentBlock::new(DocumentBlockKind::Raw, None::<String>, " ");

        let document = ParsedDocument {
            title: Some("hello".to_string()),
            blocks: vec![paragraph, whitespace_only],
            metadata: None,
            ..Default::default()
        };

        assert_eq!(document.block_count(), 2);
        assert_eq!(document.non_empty_block_count(), 1);
        assert!(document.char_count() >= "hello".len());
    }

    #[test]
    fn document_block_location_builders() {
        let located = DocumentBlock::new(DocumentBlockKind::Paragraph, Some("intro"), "hello")
            .with_source("chapter1")
            .with_page(3)
            .with_ordinal(7);

        let loc = located.location.expect("location should exist");
        assert_eq!(loc.source.as_deref(), Some("chapter1"));
        assert_eq!(loc.page, Some(3));
        assert_eq!(loc.ordinal, Some(7));
    }

    #[test]
    fn parse_file_returns_none_for_unknown_extension() {
        let tmp = TempDir::new().unwrap();
        let unknown_path = write_temp(&tmp, "file.xyz", "data");

        let registry = build_registry();
        let outcome = registry.parse_file(&unknown_path).unwrap();
        assert!(outcome.is_none());
    }
}