1use anyhow::Result;
37use std::collections::HashMap;
38use std::path::Path;
39use std::sync::Arc;
40
/// Classifies a [`DocumentBlock`] by the role it plays in the parsed document.
///
/// The enum is fieldless, so it also derives `Hash`; this lets callers use a
/// kind as a `HashMap`/`HashSet` key (for example to count blocks per kind).
#[derive(Debug, Clone, PartialEq, Eq, Hash)]
pub enum DocumentBlockKind {
    Paragraph,
    Heading,
    Table,
    Section,
    Metadata,
    Slide,
    EmailHeader,
    Code,
    /// Unstructured text; `ParsedDocument::from_text` wraps its input in a
    /// single block of this kind.
    Raw,
}
57
/// Optional positional metadata attached to a block.
///
/// Every field is independent and defaults to `None`.
#[derive(Clone, Debug, Default, Eq, PartialEq)]
pub struct DocumentBlockLocation {
    /// Free-form identifier of where the block came from.
    pub source: Option<String>,
    /// Page number recorded for the block, if any.
    pub page: Option<usize>,
    /// Ordinal recorded for the block, if any.
    pub ordinal: Option<usize>,
}
64
65#[derive(Debug, Clone, PartialEq, Eq)]
66pub struct DocumentBlock {
67 pub kind: DocumentBlockKind,
68 pub label: Option<String>,
69 pub content: String,
70 pub location: Option<DocumentBlockLocation>,
71}
72
73impl DocumentBlock {
74 pub fn new(
75 kind: DocumentBlockKind,
76 label: Option<impl Into<String>>,
77 content: impl Into<String>,
78 ) -> Self {
79 Self {
80 kind,
81 label: label.map(Into::into),
82 content: content.into(),
83 location: None,
84 }
85 }
86
87 pub fn with_source(mut self, source: impl Into<String>) -> Self {
88 self.location
89 .get_or_insert_with(DocumentBlockLocation::default)
90 .source = Some(source.into());
91 self
92 }
93
94 pub fn with_page(mut self, page: usize) -> Self {
95 self.location
96 .get_or_insert_with(DocumentBlockLocation::default)
97 .page = Some(page);
98 self
99 }
100
101 pub fn with_ordinal(mut self, ordinal: usize) -> Self {
102 self.location
103 .get_or_insert_with(DocumentBlockLocation::default)
104 .ordinal = Some(ordinal);
105 self
106 }
107}
108
109#[derive(Debug, Clone, Default, PartialEq, Eq)]
110pub struct ParsedDocument {
111 pub title: Option<String>,
112 pub blocks: Vec<DocumentBlock>,
113}
114
115impl ParsedDocument {
116 pub fn new() -> Self {
117 Self::default()
118 }
119
120 pub fn from_text(text: impl Into<String>) -> Self {
121 Self {
122 title: None,
123 blocks: vec![DocumentBlock::new(
124 DocumentBlockKind::Raw,
125 None::<String>,
126 text,
127 )],
128 }
129 }
130
131 pub fn with_title(mut self, title: impl Into<String>) -> Self {
132 self.title = Some(title.into());
133 self
134 }
135
136 pub fn push(&mut self, block: DocumentBlock) {
137 self.blocks.push(block);
138 }
139
140 pub fn block_count(&self) -> usize {
141 self.blocks.len()
142 }
143
144 pub fn non_empty_block_count(&self) -> usize {
145 self.blocks
146 .iter()
147 .filter(|block| !block.content.trim().is_empty())
148 .count()
149 }
150
151 pub fn char_count(&self) -> usize {
152 self.to_text().chars().count()
153 }
154
155 pub fn is_empty(&self) -> bool {
156 self.blocks.iter().all(|b| b.content.trim().is_empty())
157 }
158
159 pub fn to_text(&self) -> String {
160 let mut parts = Vec::new();
161 if let Some(title) = &self.title {
162 if !title.trim().is_empty() {
163 parts.push(title.trim().to_string());
164 }
165 }
166 for block in &self.blocks {
167 let mut chunk = String::new();
168 if let Some(label) = &block.label {
169 if !label.trim().is_empty() {
170 chunk.push_str(label.trim());
171 chunk.push('\n');
172 }
173 }
174 chunk.push_str(block.content.trim());
175 if !chunk.trim().is_empty() {
176 parts.push(chunk.trim().to_string());
177 }
178 }
179 parts.join("\n\n")
180 }
181}
182
183pub trait DocumentParser: Send + Sync {
193 fn name(&self) -> &str;
195
196 fn supported_extensions(&self) -> &[&str];
200
201 fn parse(&self, path: &Path) -> Result<String>;
206
207 fn parse_document(&self, path: &Path) -> Result<ParsedDocument> {
212 Ok(ParsedDocument::from_text(self.parse(path)?))
213 }
214
215 fn can_parse(&self, path: &Path) -> bool {
219 path.extension()
220 .and_then(|e| e.to_str())
221 .map(|ext| {
222 self.supported_extensions()
223 .iter()
224 .any(|s| s.eq_ignore_ascii_case(ext))
225 })
226 .unwrap_or(false)
227 }
228
229 fn max_file_size(&self) -> u64 {
232 10 * 1024 * 1024
233 }
234}
235
/// Stateless parser that returns a file's content verbatim as UTF-8 text.
pub struct PlainTextParser;
244
245impl DocumentParser for PlainTextParser {
246 fn name(&self) -> &str {
247 "plain-text"
248 }
249
250 fn supported_extensions(&self) -> &[&str] {
251 &[
252 "rs",
254 "py",
255 "ts",
256 "tsx",
257 "js",
258 "jsx",
259 "go",
260 "java",
261 "c",
262 "cpp",
263 "h",
264 "hpp",
265 "cs",
266 "rb",
267 "php",
268 "swift",
269 "kt",
270 "scala",
271 "sh",
272 "bash",
273 "zsh",
274 "fish",
275 "toml",
277 "yaml",
278 "yml",
279 "json",
280 "jsonc",
281 "ini",
282 "conf",
283 "cfg",
284 "env",
285 "xml",
286 "md",
288 "mdx",
289 "txt",
290 "rst",
291 "adoc",
292 "org",
293 "html",
295 "htm",
296 "css",
297 "scss",
298 "sass",
299 "less",
300 "csv",
302 "tsv",
303 "log",
304 "makefile",
306 "dockerfile",
307 "gradlew",
308 ]
309 }
310
311 fn parse(&self, path: &Path) -> Result<String> {
312 std::fs::read_to_string(path).map_err(|e| {
313 anyhow::anyhow!(
314 "plain-text parser: failed to read {}: {}",
315 path.display(),
316 e
317 )
318 })
319 }
320
321 fn max_file_size(&self) -> u64 {
322 1024 * 1024 }
324}
325
326#[derive(Clone)]
336pub struct DocumentParserRegistry {
337 parsers: Vec<Arc<dyn DocumentParser>>,
339 extension_map: HashMap<String, Arc<dyn DocumentParser>>,
341}
342
343impl DocumentParserRegistry {
344 pub fn new() -> Self {
346 Self::new_with_default_parser(crate::config::DefaultParserConfig::default(), None)
347 }
348
349 pub fn new_with_default_parser_config(config: crate::config::DefaultParserConfig) -> Self {
352 Self::new_with_default_parser(config, None)
353 }
354
355 pub fn new_with_default_parser(
358 config: crate::config::DefaultParserConfig,
359 ocr_provider: Option<Arc<dyn crate::default_parser::DefaultParserOcrProvider>>,
360 ) -> Self {
361 let mut r = Self::empty();
362 r.register(Arc::new(PlainTextParser));
363 if config.enabled {
364 let parser = match ocr_provider {
365 Some(provider) => {
366 crate::default_parser::DefaultParser::with_config_and_ocr(config, provider)
367 }
368 None => crate::default_parser::DefaultParser::with_config(config),
369 };
370 r.register(Arc::new(parser));
371 }
372 r
373 }
374
375 pub fn empty() -> Self {
377 Self {
378 parsers: Vec::new(),
379 extension_map: HashMap::new(),
380 }
381 }
382
383 pub fn register(&mut self, parser: Arc<dyn DocumentParser>) {
385 for ext in parser.supported_extensions() {
386 self.extension_map
387 .insert(ext.to_lowercase(), Arc::clone(&parser));
388 }
389 self.parsers.push(parser);
390 }
391
392 pub fn find_parser(&self, path: &Path) -> Option<Arc<dyn DocumentParser>> {
395 if let Some(ext) = path.extension().and_then(|e| e.to_str()) {
397 if let Some(p) = self.extension_map.get(&ext.to_lowercase()) {
398 return Some(Arc::clone(p));
399 }
400 }
401 self.parsers.iter().find(|p| p.can_parse(path)).cloned()
403 }
404
405 pub fn parse_file_document(&self, path: &Path) -> Result<Option<ParsedDocument>> {
412 let parser = match self.find_parser(path) {
413 Some(p) => p,
414 None => return Ok(None),
415 };
416
417 if let Ok(meta) = std::fs::metadata(path) {
418 if meta.len() > parser.max_file_size() {
419 tracing::debug!(
420 "Skipping {} ({}): exceeds parser '{}' limit of {} bytes",
421 path.display(),
422 meta.len(),
423 parser.name(),
424 parser.max_file_size()
425 );
426 return Ok(None);
427 }
428 }
429
430 match parser.parse_document(path) {
431 Ok(document) => Ok(Some(document)),
432 Err(e) => {
433 tracing::warn!(
434 "Parser '{}' failed on {}: {}",
435 parser.name(),
436 path.display(),
437 e
438 );
439 Ok(None)
440 }
441 }
442 }
443
444 pub fn parse_file(&self, path: &Path) -> Result<Option<String>> {
447 Ok(self
448 .parse_file_document(path)?
449 .map(|document| document.to_text()))
450 }
451
452 pub fn parsers(&self) -> &[Arc<dyn DocumentParser>] {
454 &self.parsers
455 }
456
457 pub fn len(&self) -> usize {
459 self.parsers.len()
460 }
461
462 pub fn is_empty(&self) -> bool {
464 self.parsers.is_empty()
465 }
466}
467
468impl Default for DocumentParserRegistry {
469 fn default() -> Self {
470 Self::new()
471 }
472}
473
#[cfg(test)]
mod tests {
    use super::*;
    use tempfile::TempDir;

    /// Creates `name` inside `dir` holding `content` and returns its path.
    fn write_temp(dir: &TempDir, name: &str, content: &str) -> std::path::PathBuf {
        let path = dir.path().join(name);
        std::fs::write(&path, content).unwrap();
        path
    }

    #[test]
    fn plain_text_parser_basic() {
        let parser = PlainTextParser;
        assert_eq!(parser.name(), "plain-text");
        assert!(parser.supported_extensions().contains(&"rs"));
        assert!(parser.supported_extensions().contains(&"md"));
        assert!(parser.supported_extensions().contains(&"json"));
    }

    #[test]
    fn registry_default_has_plain_text() {
        let r = DocumentParserRegistry::new();
        assert!(r.len() >= 2);
        assert!(r.find_parser(Path::new("main.rs")).is_some());
    }

    #[test]
    fn registry_empty_has_no_parsers() {
        let r = DocumentParserRegistry::empty();
        assert!(r.is_empty());
        assert!(r.find_parser(Path::new("main.rs")).is_none());
    }

    #[test]
    fn registry_finds_parser_by_extension() {
        let r = DocumentParserRegistry::new();
        assert!(r.find_parser(Path::new("main.rs")).is_some());
        assert!(r.find_parser(Path::new("config.toml")).is_some());
        assert!(r.find_parser(Path::new("README.md")).is_some());
    }

    #[test]
    fn registry_no_parser_for_binary() {
        let r = DocumentParserRegistry::new();
        assert!(r.find_parser(Path::new("binary.exe")).is_none());
        // The default registry is expected to have a parser for PDFs.
        assert!(r.find_parser(Path::new("document.pdf")).is_some());
    }

    #[test]
    fn registry_later_registration_wins() {
        struct ParserA;
        impl DocumentParser for ParserA {
            fn name(&self) -> &str {
                "a"
            }
            fn supported_extensions(&self) -> &[&str] {
                &["txt"]
            }
            fn parse(&self, _: &Path) -> Result<String> {
                Ok("A".into())
            }
        }

        struct ParserB;
        impl DocumentParser for ParserB {
            fn name(&self) -> &str {
                "b"
            }
            fn supported_extensions(&self) -> &[&str] {
                &["txt"]
            }
            fn parse(&self, _: &Path) -> Result<String> {
                Ok("B".into())
            }
        }

        let mut r = DocumentParserRegistry::empty();
        r.register(Arc::new(ParserA));
        r.register(Arc::new(ParserB));

        let p = r.find_parser(Path::new("file.txt")).unwrap();
        assert_eq!(p.name(), "b");
    }

    #[test]
    fn parse_file_reads_text() {
        let dir = TempDir::new().unwrap();
        let path = write_temp(&dir, "hello.rs", "fn main() {}");

        let r = DocumentParserRegistry::new();
        let result = r.parse_file(&path).unwrap();
        assert!(result.is_some());
        assert!(result.unwrap().contains("fn main"));
    }

    #[test]
    fn parse_file_document_returns_structured_output() {
        let dir = TempDir::new().unwrap();
        let path = write_temp(&dir, "hello.rs", "fn main() {}");

        let r = DocumentParserRegistry::new();
        let result = r.parse_file_document(&path).unwrap();
        assert!(result.is_some());
        let document = result.unwrap();
        assert!(!document.blocks.is_empty());
        assert!(document.to_text().contains("fn main"));
    }

    #[test]
    fn parsed_document_stats_helpers() {
        let document = ParsedDocument {
            title: Some("hello".to_string()),
            blocks: vec![
                DocumentBlock::new(DocumentBlockKind::Paragraph, Some("intro"), "hello world"),
                DocumentBlock::new(DocumentBlockKind::Raw, None::<String>, " "),
            ],
        };

        assert_eq!(document.block_count(), 2);
        assert_eq!(document.non_empty_block_count(), 1);
        assert!(!document.is_empty());

        // Pin the exact rendering: title, blank line, then label above
        // content. The whitespace-only block contributes nothing.
        let expected = "hello\n\nintro\nhello world";
        assert_eq!(document.to_text(), expected);
        assert_eq!(document.char_count(), expected.chars().count());
    }

    #[test]
    fn document_block_location_builders() {
        let block = DocumentBlock::new(DocumentBlockKind::Paragraph, Some("intro"), "hello")
            .with_source("chapter1")
            .with_page(3)
            .with_ordinal(7);

        let location = block.location.expect("location should exist");
        assert_eq!(location.source.as_deref(), Some("chapter1"));
        assert_eq!(location.page, Some(3));
        assert_eq!(location.ordinal, Some(7));
    }

    #[test]
    fn parse_file_returns_none_for_unknown_extension() {
        let dir = TempDir::new().unwrap();
        let path = write_temp(&dir, "file.xyz", "data");

        let r = DocumentParserRegistry::new();
        assert!(r.parse_file(&path).unwrap().is_none());
    }

    #[test]
    fn parse_file_skips_oversized_file() {
        struct TinyMaxParser;
        impl DocumentParser for TinyMaxParser {
            fn name(&self) -> &str {
                "tiny"
            }
            fn supported_extensions(&self) -> &[&str] {
                &["dat"]
            }
            fn parse(&self, path: &Path) -> Result<String> {
                std::fs::read_to_string(path).map_err(Into::into)
            }
            fn max_file_size(&self) -> u64 {
                3
            }
        }

        let dir = TempDir::new().unwrap();
        let path = write_temp(&dir, "big.dat", "more than 3 bytes");

        let mut r = DocumentParserRegistry::empty();
        r.register(Arc::new(TinyMaxParser));

        assert!(r.parse_file(&path).unwrap().is_none());
    }
}