rexis_rag/multimodal/
document_parser.rs

//! # Document Parser
//!
//! Comprehensive document parsing with multi-modal content extraction.
4
5use super::{
6    AnalyzedChart, ChartProcessor, ColumnLayout, DocumentLayout, DocumentMetadata, DocumentSection,
7    DocumentType, EmbeddingWeights, ExtractedTable, ImageProcessor, MultiModalDocument,
8    MultiModalEmbeddings, ProcessedImage, SectionType, TableProcessor,
9};
10use crate::{RragError, RragResult};
11use serde::{Deserialize, Serialize};
12use std::path::Path;
13
14/// Document parser for multi-modal content
15pub struct DocumentParser {
16    /// Configuration
17    config: DocumentParserConfig,
18
19    /// Image processor
20    image_processor: Box<dyn ImageProcessor>,
21
22    /// Table processor
23    table_processor: Box<dyn TableProcessor>,
24
25    /// Chart processor
26    chart_processor: Box<dyn ChartProcessor>,
27
28    /// Text extractor
29    text_extractor: TextExtractor,
30
31    /// Section analyzer
32    section_analyzer: SectionAnalyzer,
33
34    /// Layout detector
35    layout_detector: LayoutDetector,
36}
37
38/// Document parser configuration
39#[derive(Debug, Clone)]
40pub struct DocumentParserConfig {
41    /// Supported document types
42    pub supported_types: Vec<DocumentType>,
43
44    /// Extract text content
45    pub extract_text: bool,
46
47    /// Extract images
48    pub extract_images: bool,
49
50    /// Extract tables
51    pub extract_tables: bool,
52
53    /// Extract charts
54    pub extract_charts: bool,
55
56    /// Analyze document structure
57    pub analyze_structure: bool,
58
59    /// Maximum file size (bytes)
60    pub max_file_size: usize,
61
62    /// Page processing limit
63    pub max_pages: Option<usize>,
64}
65
66/// Text extraction component
67pub struct TextExtractor {
68    /// Configuration
69    config: TextExtractionConfig,
70
71    /// PDF extractor
72    pdf_extractor: PDFTextExtractor,
73
74    /// Word extractor
75    word_extractor: WordTextExtractor,
76
77    /// PowerPoint extractor
78    ppt_extractor: PowerPointTextExtractor,
79
80    /// HTML extractor
81    html_extractor: HTMLTextExtractor,
82}
83
/// Options for text extraction.
#[derive(Debug, Clone)]
pub struct TextExtractionConfig {
    /// Preserve formatting.
    pub preserve_formatting: bool,

    /// Extract footnotes.
    pub extract_footnotes: bool,

    /// Extract headers and footers.
    pub extract_headers_footers: bool,

    /// Minimum text block size.
    pub min_block_size: usize,
}
99
100/// Section analysis component
101pub struct SectionAnalyzer {
102    /// Section detection patterns
103    patterns: Vec<SectionPattern>,
104
105    /// Heading detection
106    heading_detector: HeadingDetector,
107}
108
109/// Layout detection component
110pub struct LayoutDetector {
111    /// Column detection threshold
112    column_threshold: f32,
113
114    /// Reading order analysis
115    reading_order_analyzer: ReadingOrderAnalyzer,
116}
117
/// Text extractor for PDF files.
#[derive(Debug, Clone)]
pub struct PDFTextExtractor {
    /// Whether document metadata should be extracted.
    extract_metadata: bool,

    /// Whether bookmarks should be extracted.
    extract_bookmarks: bool,
}
126
/// Text extractor for Word documents.
#[derive(Debug, Clone)]
pub struct WordTextExtractor {
    /// Whether styles should be extracted.
    extract_styles: bool,

    /// Whether comments should be extracted.
    extract_comments: bool,
}
135
/// Text extractor for PowerPoint presentations.
#[derive(Debug, Clone)]
pub struct PowerPointTextExtractor {
    /// Whether slide notes should be extracted.
    extract_notes: bool,

    /// Whether animations should be extracted.
    extract_animations: bool,
}
144
/// Text extractor for HTML files.
#[derive(Debug, Clone)]
pub struct HTMLTextExtractor {
    /// Whether scripts should be removed.
    remove_scripts: bool,

    /// Whether styles should be removed.
    remove_styles: bool,
}
153
154/// Section detection pattern
155#[derive(Debug, Clone)]
156pub struct SectionPattern {
157    /// Pattern regex
158    pub pattern: String,
159
160    /// Section type
161    pub section_type: SectionType,
162
163    /// Priority (higher = more specific)
164    pub priority: u32,
165}
166
167/// Heading detection component
168pub struct HeadingDetector {
169    /// Heading patterns
170    patterns: Vec<HeadingPattern>,
171}
172
/// A regex describing one style of heading line.
#[derive(Debug, Clone)]
pub struct HeadingPattern {
    /// Regex source, matched against a single line.
    pub pattern: String,

    /// Heading level reported for a match.
    pub level: usize,

    /// Confidence score reported for a match.
    pub confidence: f32,
}
185
186/// Reading order analyzer
187pub struct ReadingOrderAnalyzer {
188    /// Analysis strategy
189    strategy: ReadingOrderStrategy,
190}
191
/// Reading order strategies.
///
/// NOTE(review): no code in this file branches on the strategy; the variant
/// descriptions below are taken from the names only.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum ReadingOrderStrategy {
    /// Left-to-right ordering.
    LeftToRight,
    /// Top-to-bottom ordering.
    TopToBottom,
    /// Z-pattern ordering.
    ZPattern,
    /// F-pattern ordering.
    FPattern,
    /// Strategy chosen automatically.
    Auto,
}
201
202/// Document parsing result
203#[derive(Debug, Clone)]
204pub struct DocumentParseResult {
205    /// Parsed document
206    pub document: MultiModalDocument,
207
208    /// Parsing confidence
209    pub confidence: f32,
210
211    /// Processing time
212    pub processing_time_ms: u64,
213
214    /// Warnings
215    pub warnings: Vec<String>,
216
217    /// Parsing statistics
218    pub statistics: ParseStatistics,
219}
220
221/// Parsing statistics
222#[derive(Debug, Clone, Serialize, Deserialize)]
223pub struct ParseStatistics {
224    /// Total text length
225    pub text_length: usize,
226
227    /// Image count
228    pub image_count: usize,
229
230    /// Table count
231    pub table_count: usize,
232
233    /// Chart count
234    pub chart_count: usize,
235
236    /// Section count
237    pub section_count: usize,
238
239    /// Page count
240    pub page_count: usize,
241}
242
243impl DocumentParser {
244    /// Create new document parser
245    pub fn new(
246        config: DocumentParserConfig,
247        image_processor: Box<dyn ImageProcessor>,
248        table_processor: Box<dyn TableProcessor>,
249        chart_processor: Box<dyn ChartProcessor>,
250    ) -> RragResult<Self> {
251        let text_extractor = TextExtractor::new(TextExtractionConfig::default())?;
252        let section_analyzer = SectionAnalyzer::new()?;
253        let layout_detector = LayoutDetector::new();
254
255        Ok(Self {
256            config,
257            image_processor,
258            table_processor,
259            chart_processor,
260            text_extractor,
261            section_analyzer,
262            layout_detector,
263        })
264    }
265
266    /// Parse document from file
267    pub async fn parse_document(&self, file_path: &Path) -> RragResult<DocumentParseResult> {
268        let start_time = std::time::Instant::now();
269
270        // Detect document type
271        let doc_type = self.detect_document_type(file_path)?;
272
273        // Validate file size
274        self.validate_file_size(file_path)?;
275
276        // Extract content based on type
277        let content = self.extract_content(file_path, doc_type).await?;
278
279        // Parse multi-modal elements
280        let images = if self.config.extract_images {
281            self.extract_images(&content).await?
282        } else {
283            vec![]
284        };
285
286        let tables = if self.config.extract_tables {
287            self.extract_tables(&content).await?
288        } else {
289            vec![]
290        };
291
292        let charts = if self.config.extract_charts {
293            self.extract_charts(&content).await?
294        } else {
295            vec![]
296        };
297
298        // Analyze document structure
299        let layout = if self.config.analyze_structure {
300            self.analyze_layout(&content).await?
301        } else {
302            DocumentLayout {
303                pages: 1,
304                sections: vec![],
305                reading_order: vec![],
306                columns: None,
307                document_type: doc_type,
308            }
309        };
310
311        // Extract metadata
312        let metadata = self.extract_metadata(file_path, &content)?;
313
314        // Create document
315        let document_id = format!(
316            "doc_{}",
317            uuid::Uuid::new_v4().to_string().split('-').next().unwrap()
318        );
319        let document = MultiModalDocument {
320            id: document_id,
321            text_content: content.text,
322            images,
323            tables,
324            charts,
325            layout,
326            embeddings: MultiModalEmbeddings {
327                text_embeddings: vec![],
328                visual_embeddings: None,
329                table_embeddings: None,
330                fused_embedding: vec![],
331                weights: EmbeddingWeights {
332                    text_weight: 0.6,
333                    visual_weight: 0.2,
334                    table_weight: 0.1,
335                    chart_weight: 0.1,
336                },
337            },
338            metadata,
339        };
340
341        let processing_time = start_time.elapsed().as_millis() as u64;
342
343        Ok(DocumentParseResult {
344            confidence: 0.85,
345            processing_time_ms: processing_time,
346            warnings: vec![],
347            statistics: ParseStatistics {
348                text_length: document.text_content.len(),
349                image_count: document.images.len(),
350                table_count: document.tables.len(),
351                chart_count: document.charts.len(),
352                section_count: document.layout.sections.len(),
353                page_count: document.layout.pages,
354            },
355            document,
356        })
357    }
358
359    /// Detect document type from file
360    fn detect_document_type(&self, file_path: &Path) -> RragResult<DocumentType> {
361        let extension = file_path
362            .extension()
363            .and_then(|ext| ext.to_str())
364            .unwrap_or("")
365            .to_lowercase();
366
367        match extension.as_str() {
368            "pdf" => Ok(DocumentType::PDF),
369            "doc" | "docx" => Ok(DocumentType::Word),
370            "ppt" | "pptx" => Ok(DocumentType::PowerPoint),
371            "html" | "htm" => Ok(DocumentType::HTML),
372            "md" => Ok(DocumentType::Markdown),
373            "txt" => Ok(DocumentType::PlainText),
374            _ => Ok(DocumentType::Mixed),
375        }
376    }
377
378    /// Validate file size
379    fn validate_file_size(&self, file_path: &Path) -> RragResult<()> {
380        let metadata =
381            std::fs::metadata(file_path).map_err(|e| RragError::io_error(e.to_string()))?;
382
383        if metadata.len() as usize > self.config.max_file_size {
384            return Err(RragError::validation(
385                "file_size",
386                format!("maximum {} bytes", self.config.max_file_size),
387                format!("{} bytes", metadata.len()),
388            ));
389        }
390
391        Ok(())
392    }
393
394    /// Extract content from document
395    async fn extract_content(
396        &self,
397        file_path: &Path,
398        doc_type: DocumentType,
399    ) -> RragResult<ExtractedContent> {
400        match doc_type {
401            DocumentType::PDF => self.text_extractor.extract_from_pdf(file_path).await,
402            DocumentType::Word => self.text_extractor.extract_from_word(file_path).await,
403            DocumentType::PowerPoint => self.text_extractor.extract_from_ppt(file_path).await,
404            DocumentType::HTML => self.text_extractor.extract_from_html(file_path).await,
405            DocumentType::Markdown => self.text_extractor.extract_from_markdown(file_path).await,
406            DocumentType::PlainText => self.text_extractor.extract_from_text(file_path).await,
407            DocumentType::Mixed => {
408                // Try to auto-detect based on content
409                self.text_extractor.extract_auto_detect(file_path).await
410            }
411        }
412    }
413
414    /// Extract images from content
415    async fn extract_images(&self, content: &ExtractedContent) -> RragResult<Vec<ProcessedImage>> {
416        let mut images = Vec::new();
417
418        for image_ref in &content.image_references {
419            if let Ok(processed) = self.image_processor.process_image(&image_ref.path) {
420                images.push(processed);
421            }
422        }
423
424        Ok(images)
425    }
426
427    /// Extract tables from content
428    async fn extract_tables(&self, content: &ExtractedContent) -> RragResult<Vec<ExtractedTable>> {
429        let mut tables = Vec::new();
430
431        for table_content in &content.table_content {
432            if let Ok(extracted) = self.table_processor.extract_table(table_content) {
433                tables.extend(extracted);
434            }
435        }
436
437        Ok(tables)
438    }
439
440    /// Extract charts from content
441    async fn extract_charts(&self, content: &ExtractedContent) -> RragResult<Vec<AnalyzedChart>> {
442        let mut charts = Vec::new();
443
444        for chart_ref in &content.chart_references {
445            if let Ok(analyzed) = self.chart_processor.analyze_chart(&chart_ref.path) {
446                charts.push(analyzed);
447            }
448        }
449
450        Ok(charts)
451    }
452
453    /// Analyze document layout
454    async fn analyze_layout(&self, content: &ExtractedContent) -> RragResult<DocumentLayout> {
455        let sections = self.section_analyzer.analyze_sections(&content.text)?;
456        let reading_order = self.layout_detector.determine_reading_order(&sections)?;
457        let columns = self.layout_detector.detect_columns(&content.text)?;
458
459        Ok(DocumentLayout {
460            pages: content.page_count,
461            sections,
462            reading_order,
463            columns,
464            document_type: content.document_type,
465        })
466    }
467
468    /// Extract document metadata
469    fn extract_metadata(
470        &self,
471        file_path: &Path,
472        content: &ExtractedContent,
473    ) -> RragResult<DocumentMetadata> {
474        let file_metadata =
475            std::fs::metadata(file_path).map_err(|e| RragError::io_error(e.to_string()))?;
476
477        Ok(DocumentMetadata {
478            title: content.title.clone(),
479            author: content.author.clone(),
480            creation_date: content.creation_date.clone(),
481            modification_date: file_metadata
482                .modified()
483                .ok()
484                .and_then(|t| t.duration_since(std::time::UNIX_EPOCH).ok())
485                .map(|d| d.as_secs().to_string()),
486            page_count: content.page_count,
487            word_count: content.text.split_whitespace().count(),
488            language: content.language.clone().unwrap_or_else(|| "en".to_string()),
489            format: content.document_type,
490        })
491    }
492}
493
494/// Extracted content from document
495#[derive(Debug, Clone)]
496pub struct ExtractedContent {
497    /// Text content
498    pub text: String,
499
500    /// Document type
501    pub document_type: DocumentType,
502
503    /// Page count
504    pub page_count: usize,
505
506    /// Image references
507    pub image_references: Vec<ImageReference>,
508
509    /// Table content
510    pub table_content: Vec<String>,
511
512    /// Chart references
513    pub chart_references: Vec<ChartReference>,
514
515    /// Document title
516    pub title: Option<String>,
517
518    /// Document author
519    pub author: Option<String>,
520
521    /// Creation date
522    pub creation_date: Option<String>,
523
524    /// Language
525    pub language: Option<String>,
526}
527
/// Reference to an image belonging to a document.
#[derive(Debug, Clone)]
pub struct ImageReference {
    /// Path of the image.
    pub path: std::path::PathBuf,
    /// Caption, if any.
    pub caption: Option<String>,
    /// Alternative text, if any.
    pub alt_text: Option<String>,
}
535
/// Reference to a chart belonging to a document.
#[derive(Debug, Clone)]
pub struct ChartReference {
    /// Path of the chart.
    pub path: std::path::PathBuf,
    /// Title, if any.
    pub title: Option<String>,
    /// Description, if any.
    pub description: Option<String>,
}
543
544impl TextExtractor {
545    /// Create new text extractor
546    pub fn new(config: TextExtractionConfig) -> RragResult<Self> {
547        Ok(Self {
548            config,
549            pdf_extractor: PDFTextExtractor::new(),
550            word_extractor: WordTextExtractor::new(),
551            ppt_extractor: PowerPointTextExtractor::new(),
552            html_extractor: HTMLTextExtractor::new(),
553        })
554    }
555
556    /// Extract from PDF
557    pub async fn extract_from_pdf(&self, file_path: &Path) -> RragResult<ExtractedContent> {
558        self.pdf_extractor.extract(file_path).await
559    }
560
561    /// Extract from Word document
562    pub async fn extract_from_word(&self, file_path: &Path) -> RragResult<ExtractedContent> {
563        self.word_extractor.extract(file_path).await
564    }
565
566    /// Extract from PowerPoint
567    pub async fn extract_from_ppt(&self, file_path: &Path) -> RragResult<ExtractedContent> {
568        self.ppt_extractor.extract(file_path).await
569    }
570
571    /// Extract from HTML
572    pub async fn extract_from_html(&self, file_path: &Path) -> RragResult<ExtractedContent> {
573        self.html_extractor.extract(file_path).await
574    }
575
576    /// Extract from Markdown
577    pub async fn extract_from_markdown(&self, file_path: &Path) -> RragResult<ExtractedContent> {
578        let content =
579            std::fs::read_to_string(file_path).map_err(|e| RragError::io_error(e.to_string()))?;
580
581        Ok(ExtractedContent {
582            text: content,
583            document_type: DocumentType::Markdown,
584            page_count: 1,
585            image_references: vec![],
586            table_content: vec![],
587            chart_references: vec![],
588            title: None,
589            author: None,
590            creation_date: None,
591            language: Some("en".to_string()),
592        })
593    }
594
595    /// Extract from plain text
596    pub async fn extract_from_text(&self, file_path: &Path) -> RragResult<ExtractedContent> {
597        let content =
598            std::fs::read_to_string(file_path).map_err(|e| RragError::io_error(e.to_string()))?;
599
600        Ok(ExtractedContent {
601            text: content,
602            document_type: DocumentType::PlainText,
603            page_count: 1,
604            image_references: vec![],
605            table_content: vec![],
606            chart_references: vec![],
607            title: None,
608            author: None,
609            creation_date: None,
610            language: Some("en".to_string()),
611        })
612    }
613
614    /// Auto-detect and extract
615    pub async fn extract_auto_detect(&self, file_path: &Path) -> RragResult<ExtractedContent> {
616        // For simplicity, treat as plain text
617        self.extract_from_text(file_path).await
618    }
619}
620
621impl SectionAnalyzer {
622    /// Create new section analyzer
623    pub fn new() -> RragResult<Self> {
624        let patterns = vec![
625            SectionPattern {
626                pattern: r"^Abstract\s*$".to_string(),
627                section_type: SectionType::Abstract,
628                priority: 100,
629            },
630            SectionPattern {
631                pattern: r"^Introduction\s*$".to_string(),
632                section_type: SectionType::Introduction,
633                priority: 90,
634            },
635            SectionPattern {
636                pattern: r"^Conclusion\s*$".to_string(),
637                section_type: SectionType::Conclusion,
638                priority: 80,
639            },
640            SectionPattern {
641                pattern: r"^References\s*$".to_string(),
642                section_type: SectionType::References,
643                priority: 70,
644            },
645        ];
646
647        let heading_detector = HeadingDetector::new();
648
649        Ok(Self {
650            patterns,
651            heading_detector,
652        })
653    }
654
655    /// Analyze document sections
656    pub fn analyze_sections(&self, text: &str) -> RragResult<Vec<DocumentSection>> {
657        let mut sections = Vec::new();
658        let lines: Vec<&str> = text.lines().collect();
659
660        let mut current_section: Option<DocumentSection> = None;
661        let mut content_buffer = String::new();
662
663        for (_line_idx, line) in lines.iter().enumerate() {
664            let trimmed = line.trim();
665
666            // Check if this line matches a section pattern
667            if let Some((section_type, level)) = self.detect_section_start(trimmed) {
668                // Save previous section
669                if let Some(mut section) = current_section.take() {
670                    section.content = content_buffer.trim().to_string();
671                    sections.push(section);
672                    content_buffer.clear();
673                }
674
675                // Start new section
676                current_section = Some(DocumentSection {
677                    id: format!("section_{}", sections.len()),
678                    title: Some(trimmed.to_string()),
679                    content: String::new(),
680                    section_type,
681                    level,
682                    page_range: (1, 1), // Simplified
683                });
684            } else {
685                // Add to current content
686                content_buffer.push_str(line);
687                content_buffer.push('\n');
688            }
689        }
690
691        // Save final section
692        if let Some(mut section) = current_section {
693            section.content = content_buffer.trim().to_string();
694            sections.push(section);
695        }
696
697        // If no sections detected, create a default body section
698        if sections.is_empty() {
699            sections.push(DocumentSection {
700                id: "section_0".to_string(),
701                title: None,
702                content: text.to_string(),
703                section_type: SectionType::Body,
704                level: 1,
705                page_range: (1, 1),
706            });
707        }
708
709        Ok(sections)
710    }
711
712    /// Detect section start
713    fn detect_section_start(&self, line: &str) -> Option<(SectionType, usize)> {
714        // Check patterns first
715        for pattern in &self.patterns {
716            if let Ok(regex) = regex::Regex::new(&pattern.pattern) {
717                if regex.is_match(line) {
718                    return Some((pattern.section_type, 1));
719                }
720            }
721        }
722
723        // Check heading patterns
724        if let Some((level, _)) = self.heading_detector.detect_heading(line) {
725            return Some((SectionType::Body, level));
726        }
727
728        None
729    }
730}
731
732impl HeadingDetector {
733    /// Create new heading detector
734    pub fn new() -> Self {
735        let patterns = vec![
736            HeadingPattern {
737                pattern: r"^#+\s+".to_string(), // Markdown headers
738                level: 1,
739                confidence: 0.9,
740            },
741            HeadingPattern {
742                pattern: r"^[A-Z][A-Z\s]{5,}\s*$".to_string(), // ALL CAPS
743                level: 1,
744                confidence: 0.7,
745            },
746        ];
747
748        Self { patterns }
749    }
750
751    /// Detect if line is a heading
752    pub fn detect_heading(&self, line: &str) -> Option<(usize, f32)> {
753        for pattern in &self.patterns {
754            if let Ok(regex) = regex::Regex::new(&pattern.pattern) {
755                if regex.is_match(line) {
756                    // Calculate level for markdown headers
757                    let level = if pattern.pattern.starts_with("^#+") {
758                        line.chars().take_while(|&c| c == '#').count()
759                    } else {
760                        pattern.level
761                    };
762
763                    return Some((level, pattern.confidence));
764                }
765            }
766        }
767
768        None
769    }
770}
771
772impl LayoutDetector {
773    /// Create new layout detector
774    pub fn new() -> Self {
775        Self {
776            column_threshold: 0.3,
777            reading_order_analyzer: ReadingOrderAnalyzer::new(),
778        }
779    }
780
781    /// Determine reading order
782    pub fn determine_reading_order(&self, sections: &[DocumentSection]) -> RragResult<Vec<String>> {
783        Ok(sections.iter().map(|s| s.id.clone()).collect())
784    }
785
786    /// Detect column layout
787    pub fn detect_columns(&self, text: &str) -> RragResult<Option<ColumnLayout>> {
788        // Simplified column detection
789        let lines: Vec<&str> = text.lines().collect();
790        let avg_line_length =
791            lines.iter().map(|line| line.len()).sum::<usize>() as f32 / lines.len() as f32;
792
793        if avg_line_length > 120.0 {
794            // Likely multi-column layout
795            Ok(Some(ColumnLayout {
796                column_count: 2,
797                column_widths: vec![0.5, 0.5],
798                gutter_width: 0.05,
799            }))
800        } else {
801            Ok(None)
802        }
803    }
804}
805
806impl ReadingOrderAnalyzer {
807    /// Create new reading order analyzer
808    pub fn new() -> Self {
809        Self {
810            strategy: ReadingOrderStrategy::Auto,
811        }
812    }
813}
814
815// PDF, Word, PowerPoint, HTML extractors (simplified implementations)
816impl PDFTextExtractor {
817    pub fn new() -> Self {
818        Self {
819            extract_metadata: true,
820            extract_bookmarks: true,
821        }
822    }
823
824    pub async fn extract(&self, _file_path: &Path) -> RragResult<ExtractedContent> {
825        // Simplified PDF extraction
826        Ok(ExtractedContent {
827            text: "Extracted PDF content".to_string(),
828            document_type: DocumentType::PDF,
829            page_count: 5,
830            image_references: vec![],
831            table_content: vec![],
832            chart_references: vec![],
833            title: Some("Sample PDF Document".to_string()),
834            author: Some("PDF Author".to_string()),
835            creation_date: Some("2024-01-01".to_string()),
836            language: Some("en".to_string()),
837        })
838    }
839}
840
841impl WordTextExtractor {
842    pub fn new() -> Self {
843        Self {
844            extract_styles: true,
845            extract_comments: false,
846        }
847    }
848
849    pub async fn extract(&self, _file_path: &Path) -> RragResult<ExtractedContent> {
850        // Simplified Word extraction
851        Ok(ExtractedContent {
852            text: "Extracted Word content".to_string(),
853            document_type: DocumentType::Word,
854            page_count: 3,
855            image_references: vec![],
856            table_content: vec![],
857            chart_references: vec![],
858            title: Some("Sample Word Document".to_string()),
859            author: Some("Word Author".to_string()),
860            creation_date: Some("2024-01-01".to_string()),
861            language: Some("en".to_string()),
862        })
863    }
864}
865
866impl PowerPointTextExtractor {
867    pub fn new() -> Self {
868        Self {
869            extract_notes: true,
870            extract_animations: false,
871        }
872    }
873
874    pub async fn extract(&self, _file_path: &Path) -> RragResult<ExtractedContent> {
875        // Simplified PowerPoint extraction
876        Ok(ExtractedContent {
877            text: "Extracted PowerPoint content".to_string(),
878            document_type: DocumentType::PowerPoint,
879            page_count: 10,
880            image_references: vec![],
881            table_content: vec![],
882            chart_references: vec![],
883            title: Some("Sample PowerPoint Presentation".to_string()),
884            author: Some("PPT Author".to_string()),
885            creation_date: Some("2024-01-01".to_string()),
886            language: Some("en".to_string()),
887        })
888    }
889}
890
891impl HTMLTextExtractor {
892    pub fn new() -> Self {
893        Self {
894            remove_scripts: true,
895            remove_styles: true,
896        }
897    }
898
899    pub async fn extract(&self, file_path: &Path) -> RragResult<ExtractedContent> {
900        let html_content =
901            std::fs::read_to_string(file_path).map_err(|e| RragError::io_error(e.to_string()))?;
902
903        // Simplified HTML text extraction (remove tags)
904        let text = html_content
905            .split('<')
906            .enumerate()
907            .filter_map(|(i, part)| {
908                if i == 0 {
909                    Some(part)
910                } else if let Some(end_pos) = part.find('>') {
911                    Some(&part[end_pos + 1..])
912                } else {
913                    None
914                }
915            })
916            .collect::<Vec<_>>()
917            .join("");
918
919        Ok(ExtractedContent {
920            text,
921            document_type: DocumentType::HTML,
922            page_count: 1,
923            image_references: vec![],
924            table_content: vec![],
925            chart_references: vec![],
926            title: None,
927            author: None,
928            creation_date: None,
929            language: Some("en".to_string()),
930        })
931    }
932}
933
934impl Default for DocumentParserConfig {
935    fn default() -> Self {
936        Self {
937            supported_types: vec![
938                DocumentType::PDF,
939                DocumentType::Word,
940                DocumentType::HTML,
941                DocumentType::Markdown,
942                DocumentType::PlainText,
943            ],
944            extract_text: true,
945            extract_images: true,
946            extract_tables: true,
947            extract_charts: true,
948            analyze_structure: true,
949            max_file_size: 100 * 1024 * 1024, // 100MB
950            max_pages: Some(1000),
951        }
952    }
953}
954
955impl Default for TextExtractionConfig {
956    fn default() -> Self {
957        Self {
958            preserve_formatting: true,
959            extract_footnotes: true,
960            extract_headers_footers: false,
961            min_block_size: 10,
962        }
963    }
964}
965
#[cfg(test)]
mod tests {
    use super::*;

    /// Extension-based detection, including case-insensitivity and the
    /// `Mixed` fallback for unknown extensions.
    #[test]
    fn test_document_type_detection() {
        let parser = create_test_parser();

        let cases = [
            ("test.pdf", DocumentType::PDF),
            ("test.PDF", DocumentType::PDF),
            ("test.docx", DocumentType::Word),
            ("test.pptx", DocumentType::PowerPoint),
            ("test.htm", DocumentType::HTML),
            ("test.md", DocumentType::Markdown),
            ("test.txt", DocumentType::PlainText),
            ("test.unknown", DocumentType::Mixed),
            ("no_extension", DocumentType::Mixed),
        ];

        for (name, expected) in cases {
            let path = std::path::Path::new(name);
            assert_eq!(parser.detect_document_type(path).unwrap(), expected);
        }
    }

    /// Known section titles start sections of the matching type.
    #[test]
    fn test_section_detection() {
        let analyzer = SectionAnalyzer::new().unwrap();
        let text = "Abstract\n\nThis is the abstract.\n\nIntroduction\n\nThis is the introduction.";

        let sections = analyzer.analyze_sections(text).unwrap();
        assert_eq!(sections.len(), 2);
        assert_eq!(sections[0].section_type, SectionType::Abstract);
        assert_eq!(sections[1].section_type, SectionType::Introduction);
    }

    /// Markdown and ALL-CAPS lines are headings; ordinary prose is not.
    #[test]
    fn test_heading_detection() {
        let detector = HeadingDetector::new();

        // Markdown heading
        assert!(detector.detect_heading("# Main Heading").is_some());
        assert!(detector.detect_heading("## Sub Heading").is_some());

        // All caps heading
        assert!(detector.detect_heading("MAIN SECTION").is_some());

        // Regular text
        assert!(detector.detect_heading("This is regular text").is_none());
    }

    /// Builds a parser wired to the default image, table and chart processors.
    fn create_test_parser() -> DocumentParser {
        use super::super::{chart_processor, image_processor, table_processor};

        let images = image_processor::DefaultImageProcessor::new(
            super::super::ImageProcessingConfig::default(),
        )
        .unwrap();
        let tables = table_processor::DefaultTableProcessor::new(
            super::super::TableExtractionConfig::default(),
        )
        .unwrap();
        let charts = chart_processor::DefaultChartProcessor::new(
            super::super::ChartAnalysisConfig::default(),
        )
        .unwrap();

        DocumentParser::new(
            DocumentParserConfig::default(),
            Box::new(images),
            Box::new(tables),
            Box::new(charts),
        )
        .unwrap()
    }
}
rexis_rag/multimodal/document_parser.rs

rexis_rag/multimodal/
document_parser.rs