1use super::{
6 AnalyzedChart, ChartProcessor, ColumnLayout, DocumentLayout, DocumentMetadata, DocumentSection,
7 DocumentType, EmbeddingWeights, ExtractedTable, ImageProcessor, MultiModalDocument,
8 MultiModalEmbeddings, ProcessedImage, SectionType, TableProcessor,
9};
10use crate::{RragError, RragResult};
11use serde::{Deserialize, Serialize};
12use std::path::Path;
13
/// Parses source files into [`MultiModalDocument`]s, delegating image,
/// table, and chart handling to pluggable processors.
pub struct DocumentParser {
    /// Behaviour switches and size limits for parsing.
    config: DocumentParserConfig,

    /// Processes images referenced by extracted content.
    image_processor: Box<dyn ImageProcessor>,

    /// Extracts structured tables from raw table text.
    table_processor: Box<dyn TableProcessor>,

    /// Analyzes charts referenced by extracted content.
    chart_processor: Box<dyn ChartProcessor>,

    /// Format-aware text extraction (PDF, Word, PowerPoint, HTML, ...).
    text_extractor: TextExtractor,

    /// Splits extracted text into logical document sections.
    section_analyzer: SectionAnalyzer,

    /// Infers page layout (column count, reading order).
    layout_detector: LayoutDetector,
}
37
/// Configuration for [`DocumentParser`].
#[derive(Debug, Clone)]
pub struct DocumentParserConfig {
    /// Document formats this parser is expected to handle.
    pub supported_types: Vec<DocumentType>,

    /// Whether to extract plain text.
    // NOTE(review): not consulted by `parse_document` in this module — confirm intent.
    pub extract_text: bool,

    /// Whether to run the image processor over image references.
    pub extract_images: bool,

    /// Whether to run the table processor over table content.
    pub extract_tables: bool,

    /// Whether to run the chart processor over chart references.
    pub extract_charts: bool,

    /// Whether to analyze sections, reading order, and columns.
    pub analyze_structure: bool,

    /// Maximum accepted input file size in bytes; larger files are rejected.
    pub max_file_size: usize,

    /// Optional cap on page count.
    // NOTE(review): currently unused by the parser — confirm whether enforcement is planned.
    pub max_pages: Option<usize>,
}
65
/// Dispatches text extraction to one backend per supported document format.
pub struct TextExtractor {
    /// Extraction options shared by the backends.
    // NOTE(review): stored but not consulted by the visible extraction paths — confirm.
    config: TextExtractionConfig,

    /// Backend for PDF files.
    pdf_extractor: PDFTextExtractor,

    /// Backend for Word documents.
    word_extractor: WordTextExtractor,

    /// Backend for PowerPoint presentations.
    ppt_extractor: PowerPointTextExtractor,

    /// Backend for HTML files.
    html_extractor: HTMLTextExtractor,
}
83
/// Options controlling how text is extracted from documents.
// NOTE(review): these flags are stored but not yet consulted by the visible
// extractor implementations — confirm which are wired up.
#[derive(Debug, Clone)]
pub struct TextExtractionConfig {
    /// Keep original formatting (line breaks, spacing) where possible.
    pub preserve_formatting: bool,

    /// Include footnote text in the extraction.
    pub extract_footnotes: bool,

    /// Include page headers and footers in the extraction.
    pub extract_headers_footers: bool,

    /// Minimum size (in characters, presumably) of a text block to keep.
    pub min_block_size: usize,
}
99
/// Splits extracted text into typed document sections.
pub struct SectionAnalyzer {
    /// Regex patterns that mark the start of well-known sections.
    patterns: Vec<SectionPattern>,

    /// Fallback detector for generic headings not covered by `patterns`.
    heading_detector: HeadingDetector,
}
108
/// Infers page-layout properties (columns, reading order) from text.
pub struct LayoutDetector {
    /// Threshold for column detection.
    // NOTE(review): not consulted by `detect_columns` (which uses a fixed
    // 120-char average-line-length heuristic) — confirm intent.
    column_threshold: f32,

    /// Strategy holder for reading-order analysis.
    reading_order_analyzer: ReadingOrderAnalyzer,
}
117
/// Text extraction backend for PDF files.
pub struct PDFTextExtractor {
    /// Also pull document metadata.
    // NOTE(review): not consulted by the placeholder `extract` — confirm.
    extract_metadata: bool,

    /// Also pull bookmarks / outline entries.
    // NOTE(review): not consulted by the placeholder `extract` — confirm.
    extract_bookmarks: bool,
}
126
/// Text extraction backend for Word documents.
pub struct WordTextExtractor {
    /// Also extract style information.
    // NOTE(review): not consulted by the placeholder `extract` — confirm.
    extract_styles: bool,

    /// Also extract review comments.
    // NOTE(review): not consulted by the placeholder `extract` — confirm.
    extract_comments: bool,
}
135
/// Text extraction backend for PowerPoint presentations.
pub struct PowerPointTextExtractor {
    /// Also extract speaker notes.
    // NOTE(review): not consulted by the placeholder `extract` — confirm.
    extract_notes: bool,

    /// Also extract animation data.
    // NOTE(review): not consulted by the placeholder `extract` — confirm.
    extract_animations: bool,
}
144
/// Text extraction backend for HTML files.
pub struct HTMLTextExtractor {
    /// Remove `<script>` elements before text extraction.
    remove_scripts: bool,

    /// Remove `<style>` elements before text extraction.
    remove_styles: bool,
}
153
/// A regex pattern that marks the start of a typed document section.
#[derive(Debug, Clone)]
pub struct SectionPattern {
    /// Regex source matched against each trimmed line.
    pub pattern: String,

    /// Section type assigned when the pattern matches.
    pub section_type: SectionType,

    /// Relative priority.
    // NOTE(review): not used when matching — patterns are tried in
    // declaration order; confirm whether priority ordering is intended.
    pub priority: u32,
}
166
/// Detects generic headings (Markdown `#` and ALL-CAPS lines).
pub struct HeadingDetector {
    /// Patterns tried in order; first match wins.
    patterns: Vec<HeadingPattern>,
}
172
/// A regex pattern describing one heading style.
#[derive(Debug, Clone)]
pub struct HeadingPattern {
    /// Regex source matched against the line.
    pub pattern: String,

    /// Heading level reported when the pattern matches (Markdown patterns
    /// override this with the counted number of leading '#').
    pub level: usize,

    /// Confidence score reported for a match.
    pub confidence: f32,
}
185
/// Holds the strategy used to determine reading order.
pub struct ReadingOrderAnalyzer {
    /// Selected strategy.
    // NOTE(review): stored but not consulted anywhere in this module — confirm.
    strategy: ReadingOrderStrategy,
}
191
/// Strategies for ordering content during layout analysis.
#[derive(Debug, Clone, Copy)]
pub enum ReadingOrderStrategy {
    /// Read columns left to right.
    LeftToRight,
    /// Read top to bottom.
    TopToBottom,
    /// Z-shaped scan pattern.
    ZPattern,
    /// F-shaped scan pattern.
    FPattern,
    /// Pick a strategy automatically.
    Auto,
}
201
/// Outcome of parsing one document.
#[derive(Debug, Clone)]
pub struct DocumentParseResult {
    /// The fully assembled multi-modal document.
    pub document: MultiModalDocument,

    /// Overall parse confidence in [0, 1].
    pub confidence: f32,

    /// Wall-clock parsing time in milliseconds.
    pub processing_time_ms: u64,

    /// Non-fatal issues encountered while parsing.
    pub warnings: Vec<String>,

    /// Counts summarizing the parsed content.
    pub statistics: ParseStatistics,
}
220
/// Summary counts for a parsed document.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct ParseStatistics {
    /// Length of the extracted text in bytes.
    pub text_length: usize,

    /// Number of processed images.
    pub image_count: usize,

    /// Number of extracted tables.
    pub table_count: usize,

    /// Number of analyzed charts.
    pub chart_count: usize,

    /// Number of detected sections.
    pub section_count: usize,

    /// Number of pages reported by the extractor.
    pub page_count: usize,
}
242
243impl DocumentParser {
244 pub fn new(
246 config: DocumentParserConfig,
247 image_processor: Box<dyn ImageProcessor>,
248 table_processor: Box<dyn TableProcessor>,
249 chart_processor: Box<dyn ChartProcessor>,
250 ) -> RragResult<Self> {
251 let text_extractor = TextExtractor::new(TextExtractionConfig::default())?;
252 let section_analyzer = SectionAnalyzer::new()?;
253 let layout_detector = LayoutDetector::new();
254
255 Ok(Self {
256 config,
257 image_processor,
258 table_processor,
259 chart_processor,
260 text_extractor,
261 section_analyzer,
262 layout_detector,
263 })
264 }
265
266 pub async fn parse_document(&self, file_path: &Path) -> RragResult<DocumentParseResult> {
268 let start_time = std::time::Instant::now();
269
270 let doc_type = self.detect_document_type(file_path)?;
272
273 self.validate_file_size(file_path)?;
275
276 let content = self.extract_content(file_path, doc_type).await?;
278
279 let images = if self.config.extract_images {
281 self.extract_images(&content).await?
282 } else {
283 vec![]
284 };
285
286 let tables = if self.config.extract_tables {
287 self.extract_tables(&content).await?
288 } else {
289 vec![]
290 };
291
292 let charts = if self.config.extract_charts {
293 self.extract_charts(&content).await?
294 } else {
295 vec![]
296 };
297
298 let layout = if self.config.analyze_structure {
300 self.analyze_layout(&content).await?
301 } else {
302 DocumentLayout {
303 pages: 1,
304 sections: vec![],
305 reading_order: vec![],
306 columns: None,
307 document_type: doc_type,
308 }
309 };
310
311 let metadata = self.extract_metadata(file_path, &content)?;
313
314 let document_id = format!(
316 "doc_{}",
317 uuid::Uuid::new_v4().to_string().split('-').next().unwrap()
318 );
319 let document = MultiModalDocument {
320 id: document_id,
321 text_content: content.text,
322 images,
323 tables,
324 charts,
325 layout,
326 embeddings: MultiModalEmbeddings {
327 text_embeddings: vec![],
328 visual_embeddings: None,
329 table_embeddings: None,
330 fused_embedding: vec![],
331 weights: EmbeddingWeights {
332 text_weight: 0.6,
333 visual_weight: 0.2,
334 table_weight: 0.1,
335 chart_weight: 0.1,
336 },
337 },
338 metadata,
339 };
340
341 let processing_time = start_time.elapsed().as_millis() as u64;
342
343 Ok(DocumentParseResult {
344 confidence: 0.85,
345 processing_time_ms: processing_time,
346 warnings: vec![],
347 statistics: ParseStatistics {
348 text_length: document.text_content.len(),
349 image_count: document.images.len(),
350 table_count: document.tables.len(),
351 chart_count: document.charts.len(),
352 section_count: document.layout.sections.len(),
353 page_count: document.layout.pages,
354 },
355 document,
356 })
357 }
358
359 fn detect_document_type(&self, file_path: &Path) -> RragResult<DocumentType> {
361 let extension = file_path
362 .extension()
363 .and_then(|ext| ext.to_str())
364 .unwrap_or("")
365 .to_lowercase();
366
367 match extension.as_str() {
368 "pdf" => Ok(DocumentType::PDF),
369 "doc" | "docx" => Ok(DocumentType::Word),
370 "ppt" | "pptx" => Ok(DocumentType::PowerPoint),
371 "html" | "htm" => Ok(DocumentType::HTML),
372 "md" => Ok(DocumentType::Markdown),
373 "txt" => Ok(DocumentType::PlainText),
374 _ => Ok(DocumentType::Mixed),
375 }
376 }
377
378 fn validate_file_size(&self, file_path: &Path) -> RragResult<()> {
380 let metadata =
381 std::fs::metadata(file_path).map_err(|e| RragError::io_error(e.to_string()))?;
382
383 if metadata.len() as usize > self.config.max_file_size {
384 return Err(RragError::validation(
385 "file_size",
386 format!("maximum {} bytes", self.config.max_file_size),
387 format!("{} bytes", metadata.len()),
388 ));
389 }
390
391 Ok(())
392 }
393
394 async fn extract_content(
396 &self,
397 file_path: &Path,
398 doc_type: DocumentType,
399 ) -> RragResult<ExtractedContent> {
400 match doc_type {
401 DocumentType::PDF => self.text_extractor.extract_from_pdf(file_path).await,
402 DocumentType::Word => self.text_extractor.extract_from_word(file_path).await,
403 DocumentType::PowerPoint => self.text_extractor.extract_from_ppt(file_path).await,
404 DocumentType::HTML => self.text_extractor.extract_from_html(file_path).await,
405 DocumentType::Markdown => self.text_extractor.extract_from_markdown(file_path).await,
406 DocumentType::PlainText => self.text_extractor.extract_from_text(file_path).await,
407 DocumentType::Mixed => {
408 self.text_extractor.extract_auto_detect(file_path).await
410 }
411 }
412 }
413
414 async fn extract_images(&self, content: &ExtractedContent) -> RragResult<Vec<ProcessedImage>> {
416 let mut images = Vec::new();
417
418 for image_ref in &content.image_references {
419 if let Ok(processed) = self.image_processor.process_image(&image_ref.path) {
420 images.push(processed);
421 }
422 }
423
424 Ok(images)
425 }
426
427 async fn extract_tables(&self, content: &ExtractedContent) -> RragResult<Vec<ExtractedTable>> {
429 let mut tables = Vec::new();
430
431 for table_content in &content.table_content {
432 if let Ok(extracted) = self.table_processor.extract_table(table_content) {
433 tables.extend(extracted);
434 }
435 }
436
437 Ok(tables)
438 }
439
440 async fn extract_charts(&self, content: &ExtractedContent) -> RragResult<Vec<AnalyzedChart>> {
442 let mut charts = Vec::new();
443
444 for chart_ref in &content.chart_references {
445 if let Ok(analyzed) = self.chart_processor.analyze_chart(&chart_ref.path) {
446 charts.push(analyzed);
447 }
448 }
449
450 Ok(charts)
451 }
452
453 async fn analyze_layout(&self, content: &ExtractedContent) -> RragResult<DocumentLayout> {
455 let sections = self.section_analyzer.analyze_sections(&content.text)?;
456 let reading_order = self.layout_detector.determine_reading_order(§ions)?;
457 let columns = self.layout_detector.detect_columns(&content.text)?;
458
459 Ok(DocumentLayout {
460 pages: content.page_count,
461 sections,
462 reading_order,
463 columns,
464 document_type: content.document_type,
465 })
466 }
467
468 fn extract_metadata(
470 &self,
471 file_path: &Path,
472 content: &ExtractedContent,
473 ) -> RragResult<DocumentMetadata> {
474 let file_metadata =
475 std::fs::metadata(file_path).map_err(|e| RragError::io_error(e.to_string()))?;
476
477 Ok(DocumentMetadata {
478 title: content.title.clone(),
479 author: content.author.clone(),
480 creation_date: content.creation_date.clone(),
481 modification_date: file_metadata
482 .modified()
483 .ok()
484 .and_then(|t| t.duration_since(std::time::UNIX_EPOCH).ok())
485 .map(|d| d.as_secs().to_string()),
486 page_count: content.page_count,
487 word_count: content.text.split_whitespace().count(),
488 language: content.language.clone().unwrap_or_else(|| "en".to_string()),
489 format: content.document_type,
490 })
491 }
492}
493
/// Raw content pulled from a document before modality processing.
#[derive(Debug, Clone)]
pub struct ExtractedContent {
    /// The extracted plain text.
    pub text: String,

    /// Format the content was extracted from.
    pub document_type: DocumentType,

    /// Number of pages reported by the extractor.
    pub page_count: usize,

    /// References to images found in the document.
    pub image_references: Vec<ImageReference>,

    /// Raw table blobs awaiting table extraction.
    pub table_content: Vec<String>,

    /// References to charts found in the document.
    pub chart_references: Vec<ChartReference>,

    /// Document title, if the format provides one.
    pub title: Option<String>,

    /// Document author, if the format provides one.
    pub author: Option<String>,

    /// Creation date string, if the format provides one.
    pub creation_date: Option<String>,

    /// Detected or assumed language code (e.g. "en").
    pub language: Option<String>,
}
527
/// Pointer to an image discovered during extraction.
#[derive(Debug, Clone)]
pub struct ImageReference {
    /// Filesystem path to the image.
    pub path: std::path::PathBuf,
    /// Caption text, if present.
    pub caption: Option<String>,
    /// Alt text, if present.
    pub alt_text: Option<String>,
}
535
/// Pointer to a chart discovered during extraction.
#[derive(Debug, Clone)]
pub struct ChartReference {
    /// Filesystem path to the chart image/data.
    pub path: std::path::PathBuf,
    /// Chart title, if present.
    pub title: Option<String>,
    /// Chart description, if present.
    pub description: Option<String>,
}
543
544impl TextExtractor {
545 pub fn new(config: TextExtractionConfig) -> RragResult<Self> {
547 Ok(Self {
548 config,
549 pdf_extractor: PDFTextExtractor::new(),
550 word_extractor: WordTextExtractor::new(),
551 ppt_extractor: PowerPointTextExtractor::new(),
552 html_extractor: HTMLTextExtractor::new(),
553 })
554 }
555
556 pub async fn extract_from_pdf(&self, file_path: &Path) -> RragResult<ExtractedContent> {
558 self.pdf_extractor.extract(file_path).await
559 }
560
561 pub async fn extract_from_word(&self, file_path: &Path) -> RragResult<ExtractedContent> {
563 self.word_extractor.extract(file_path).await
564 }
565
566 pub async fn extract_from_ppt(&self, file_path: &Path) -> RragResult<ExtractedContent> {
568 self.ppt_extractor.extract(file_path).await
569 }
570
571 pub async fn extract_from_html(&self, file_path: &Path) -> RragResult<ExtractedContent> {
573 self.html_extractor.extract(file_path).await
574 }
575
576 pub async fn extract_from_markdown(&self, file_path: &Path) -> RragResult<ExtractedContent> {
578 let content =
579 std::fs::read_to_string(file_path).map_err(|e| RragError::io_error(e.to_string()))?;
580
581 Ok(ExtractedContent {
582 text: content,
583 document_type: DocumentType::Markdown,
584 page_count: 1,
585 image_references: vec![],
586 table_content: vec![],
587 chart_references: vec![],
588 title: None,
589 author: None,
590 creation_date: None,
591 language: Some("en".to_string()),
592 })
593 }
594
595 pub async fn extract_from_text(&self, file_path: &Path) -> RragResult<ExtractedContent> {
597 let content =
598 std::fs::read_to_string(file_path).map_err(|e| RragError::io_error(e.to_string()))?;
599
600 Ok(ExtractedContent {
601 text: content,
602 document_type: DocumentType::PlainText,
603 page_count: 1,
604 image_references: vec![],
605 table_content: vec![],
606 chart_references: vec![],
607 title: None,
608 author: None,
609 creation_date: None,
610 language: Some("en".to_string()),
611 })
612 }
613
614 pub async fn extract_auto_detect(&self, file_path: &Path) -> RragResult<ExtractedContent> {
616 self.extract_from_text(file_path).await
618 }
619}
620
621impl SectionAnalyzer {
622 pub fn new() -> RragResult<Self> {
624 let patterns = vec![
625 SectionPattern {
626 pattern: r"^Abstract\s*$".to_string(),
627 section_type: SectionType::Abstract,
628 priority: 100,
629 },
630 SectionPattern {
631 pattern: r"^Introduction\s*$".to_string(),
632 section_type: SectionType::Introduction,
633 priority: 90,
634 },
635 SectionPattern {
636 pattern: r"^Conclusion\s*$".to_string(),
637 section_type: SectionType::Conclusion,
638 priority: 80,
639 },
640 SectionPattern {
641 pattern: r"^References\s*$".to_string(),
642 section_type: SectionType::References,
643 priority: 70,
644 },
645 ];
646
647 let heading_detector = HeadingDetector::new();
648
649 Ok(Self {
650 patterns,
651 heading_detector,
652 })
653 }
654
655 pub fn analyze_sections(&self, text: &str) -> RragResult<Vec<DocumentSection>> {
657 let mut sections = Vec::new();
658 let lines: Vec<&str> = text.lines().collect();
659
660 let mut current_section: Option<DocumentSection> = None;
661 let mut content_buffer = String::new();
662
663 for (_line_idx, line) in lines.iter().enumerate() {
664 let trimmed = line.trim();
665
666 if let Some((section_type, level)) = self.detect_section_start(trimmed) {
668 if let Some(mut section) = current_section.take() {
670 section.content = content_buffer.trim().to_string();
671 sections.push(section);
672 content_buffer.clear();
673 }
674
675 current_section = Some(DocumentSection {
677 id: format!("section_{}", sections.len()),
678 title: Some(trimmed.to_string()),
679 content: String::new(),
680 section_type,
681 level,
682 page_range: (1, 1), });
684 } else {
685 content_buffer.push_str(line);
687 content_buffer.push('\n');
688 }
689 }
690
691 if let Some(mut section) = current_section {
693 section.content = content_buffer.trim().to_string();
694 sections.push(section);
695 }
696
697 if sections.is_empty() {
699 sections.push(DocumentSection {
700 id: "section_0".to_string(),
701 title: None,
702 content: text.to_string(),
703 section_type: SectionType::Body,
704 level: 1,
705 page_range: (1, 1),
706 });
707 }
708
709 Ok(sections)
710 }
711
712 fn detect_section_start(&self, line: &str) -> Option<(SectionType, usize)> {
714 for pattern in &self.patterns {
716 if let Ok(regex) = regex::Regex::new(&pattern.pattern) {
717 if regex.is_match(line) {
718 return Some((pattern.section_type, 1));
719 }
720 }
721 }
722
723 if let Some((level, _)) = self.heading_detector.detect_heading(line) {
725 return Some((SectionType::Body, level));
726 }
727
728 None
729 }
730}
731
732impl HeadingDetector {
733 pub fn new() -> Self {
735 let patterns = vec![
736 HeadingPattern {
737 pattern: r"^#+\s+".to_string(), level: 1,
739 confidence: 0.9,
740 },
741 HeadingPattern {
742 pattern: r"^[A-Z][A-Z\s]{5,}\s*$".to_string(), level: 1,
744 confidence: 0.7,
745 },
746 ];
747
748 Self { patterns }
749 }
750
751 pub fn detect_heading(&self, line: &str) -> Option<(usize, f32)> {
753 for pattern in &self.patterns {
754 if let Ok(regex) = regex::Regex::new(&pattern.pattern) {
755 if regex.is_match(line) {
756 let level = if pattern.pattern.starts_with("^#+") {
758 line.chars().take_while(|&c| c == '#').count()
759 } else {
760 pattern.level
761 };
762
763 return Some((level, pattern.confidence));
764 }
765 }
766 }
767
768 None
769 }
770}
771
772impl LayoutDetector {
773 pub fn new() -> Self {
775 Self {
776 column_threshold: 0.3,
777 reading_order_analyzer: ReadingOrderAnalyzer::new(),
778 }
779 }
780
781 pub fn determine_reading_order(&self, sections: &[DocumentSection]) -> RragResult<Vec<String>> {
783 Ok(sections.iter().map(|s| s.id.clone()).collect())
784 }
785
786 pub fn detect_columns(&self, text: &str) -> RragResult<Option<ColumnLayout>> {
788 let lines: Vec<&str> = text.lines().collect();
790 let avg_line_length =
791 lines.iter().map(|line| line.len()).sum::<usize>() as f32 / lines.len() as f32;
792
793 if avg_line_length > 120.0 {
794 Ok(Some(ColumnLayout {
796 column_count: 2,
797 column_widths: vec![0.5, 0.5],
798 gutter_width: 0.05,
799 }))
800 } else {
801 Ok(None)
802 }
803 }
804}
805
impl ReadingOrderAnalyzer {
    /// Creates an analyzer using the `Auto` strategy.
    // NOTE(review): the stored strategy is not consulted anywhere in this
    // module yet — confirm whether strategy-specific ordering is planned.
    pub fn new() -> Self {
        Self {
            strategy: ReadingOrderStrategy::Auto,
        }
    }
}
814
impl PDFTextExtractor {
    /// Creates an extractor configured to also pull metadata and bookmarks.
    pub fn new() -> Self {
        Self {
            extract_metadata: true,
            extract_bookmarks: true,
        }
    }

    /// Extracts content from a PDF file.
    ///
    /// NOTE(review): placeholder implementation — returns fixed sample
    /// content and never reads `_file_path`.
    pub async fn extract(&self, _file_path: &Path) -> RragResult<ExtractedContent> {
        Ok(ExtractedContent {
            text: "Extracted PDF content".to_string(),
            document_type: DocumentType::PDF,
            page_count: 5,
            image_references: vec![],
            table_content: vec![],
            chart_references: vec![],
            title: Some("Sample PDF Document".to_string()),
            author: Some("PDF Author".to_string()),
            creation_date: Some("2024-01-01".to_string()),
            language: Some("en".to_string()),
        })
    }
}
840
impl WordTextExtractor {
    /// Creates an extractor configured to pull styles but not comments.
    pub fn new() -> Self {
        Self {
            extract_styles: true,
            extract_comments: false,
        }
    }

    /// Extracts content from a Word document.
    ///
    /// NOTE(review): placeholder implementation — returns fixed sample
    /// content and never reads `_file_path`.
    pub async fn extract(&self, _file_path: &Path) -> RragResult<ExtractedContent> {
        Ok(ExtractedContent {
            text: "Extracted Word content".to_string(),
            document_type: DocumentType::Word,
            page_count: 3,
            image_references: vec![],
            table_content: vec![],
            chart_references: vec![],
            title: Some("Sample Word Document".to_string()),
            author: Some("Word Author".to_string()),
            creation_date: Some("2024-01-01".to_string()),
            language: Some("en".to_string()),
        })
    }
}
865
impl PowerPointTextExtractor {
    /// Creates an extractor configured to pull notes but not animations.
    pub fn new() -> Self {
        Self {
            extract_notes: true,
            extract_animations: false,
        }
    }

    /// Extracts content from a PowerPoint file.
    ///
    /// NOTE(review): placeholder implementation — returns fixed sample
    /// content and never reads `_file_path`.
    pub async fn extract(&self, _file_path: &Path) -> RragResult<ExtractedContent> {
        Ok(ExtractedContent {
            text: "Extracted PowerPoint content".to_string(),
            document_type: DocumentType::PowerPoint,
            page_count: 10,
            image_references: vec![],
            table_content: vec![],
            chart_references: vec![],
            title: Some("Sample PowerPoint Presentation".to_string()),
            author: Some("PPT Author".to_string()),
            creation_date: Some("2024-01-01".to_string()),
            language: Some("en".to_string()),
        })
    }
}
890
891impl HTMLTextExtractor {
892 pub fn new() -> Self {
893 Self {
894 remove_scripts: true,
895 remove_styles: true,
896 }
897 }
898
899 pub async fn extract(&self, file_path: &Path) -> RragResult<ExtractedContent> {
900 let html_content =
901 std::fs::read_to_string(file_path).map_err(|e| RragError::io_error(e.to_string()))?;
902
903 let text = html_content
905 .split('<')
906 .enumerate()
907 .filter_map(|(i, part)| {
908 if i == 0 {
909 Some(part)
910 } else if let Some(end_pos) = part.find('>') {
911 Some(&part[end_pos + 1..])
912 } else {
913 None
914 }
915 })
916 .collect::<Vec<_>>()
917 .join("");
918
919 Ok(ExtractedContent {
920 text,
921 document_type: DocumentType::HTML,
922 page_count: 1,
923 image_references: vec![],
924 table_content: vec![],
925 chart_references: vec![],
926 title: None,
927 author: None,
928 creation_date: None,
929 language: Some("en".to_string()),
930 })
931 }
932}
933
934impl Default for DocumentParserConfig {
935 fn default() -> Self {
936 Self {
937 supported_types: vec![
938 DocumentType::PDF,
939 DocumentType::Word,
940 DocumentType::HTML,
941 DocumentType::Markdown,
942 DocumentType::PlainText,
943 ],
944 extract_text: true,
945 extract_images: true,
946 extract_tables: true,
947 extract_charts: true,
948 analyze_structure: true,
949 max_file_size: 100 * 1024 * 1024, max_pages: Some(1000),
951 }
952 }
953}
954
impl Default for TextExtractionConfig {
    /// Defaults: keep formatting and footnotes, drop headers/footers,
    /// and ignore text blocks smaller than 10.
    fn default() -> Self {
        Self {
            preserve_formatting: true,
            extract_footnotes: true,
            extract_headers_footers: false,
            min_block_size: 10,
        }
    }
}
965
#[cfg(test)]
mod tests {
    use super::*;
    // Removed unused `use tempfile::NamedTempFile;` — nothing in this
    // module creates temp files.

    /// Extension-based detection maps known extensions to their types.
    #[test]
    fn test_document_type_detection() {
        let parser = create_test_parser();

        let pdf_path = std::path::Path::new("test.pdf");
        assert_eq!(
            parser.detect_document_type(pdf_path).unwrap(),
            DocumentType::PDF
        );

        let word_path = std::path::Path::new("test.docx");
        assert_eq!(
            parser.detect_document_type(word_path).unwrap(),
            DocumentType::Word
        );
    }

    /// Known heading lines split the text into typed sections.
    #[test]
    fn test_section_detection() {
        let analyzer = SectionAnalyzer::new().unwrap();
        let text = "Abstract\n\nThis is the abstract.\n\nIntroduction\n\nThis is the introduction.";

        let sections = analyzer.analyze_sections(text).unwrap();
        assert_eq!(sections.len(), 2);
        assert_eq!(sections[0].section_type, SectionType::Abstract);
        assert_eq!(sections[1].section_type, SectionType::Introduction);
    }

    /// Markdown and ALL-CAPS headings are detected; body text is not.
    #[test]
    fn test_heading_detection() {
        let detector = HeadingDetector::new();

        assert!(detector.detect_heading("# Main Heading").is_some());
        assert!(detector.detect_heading("## Sub Heading").is_some());

        assert!(detector.detect_heading("MAIN SECTION").is_some());

        assert!(detector.detect_heading("This is regular text").is_none());
    }

    /// Builds a parser wired with the default processors for testing.
    fn create_test_parser() -> DocumentParser {
        use super::super::{chart_processor, image_processor, table_processor};

        DocumentParser::new(
            DocumentParserConfig::default(),
            Box::new(
                image_processor::DefaultImageProcessor::new(
                    super::super::ImageProcessingConfig::default(),
                )
                .unwrap(),
            ),
            Box::new(
                table_processor::DefaultTableProcessor::new(
                    super::super::TableExtractionConfig::default(),
                )
                .unwrap(),
            ),
            Box::new(
                chart_processor::DefaultChartProcessor::new(
                    super::super::ChartAnalysisConfig::default(),
                )
                .unwrap(),
            ),
        )
        .unwrap()
    }
}