rexis_rag/multimodal/
mod.rs

1//! # Multi-Modal RAG Processing
2//!
3//! Advanced multi-modal processing capabilities for handling diverse content types
4//! in RAG systems including images, tables, charts, PDFs, and structured documents.
5//!
6//! This module enables RAG systems to process and understand content beyond plain text,
7//! making it possible to build applications that can reason over visual content,
8//! extract information from tables, analyze charts, and process complex document layouts.
9//!
10//! ## Features
11//!
12//! - **Image Processing**: Extract features, generate captions, detect objects
13//! - **Table Processing**: Extract structured data from HTML, CSV, and PDF tables
14//! - **Chart Analysis**: Understand charts, graphs, and visualizations
15//! - **OCR Integration**: Extract text from images and scanned documents
16//! - **Layout Analysis**: Understand document structure and reading order
17//! - **Embedding Fusion**: Combine embeddings from different modalities
18//! - **Multi-Modal Retrieval**: Search across text, images, and structured data
19//!
20//! ## Supported Formats
21//!
22//! - **Images**: PNG, JPEG, GIF, WebP, SVG
23//! - **Documents**: PDF, Word, PowerPoint, HTML
24//! - **Tables**: HTML tables, CSV, TSV, Excel
25//! - **Charts**: PNG/JPEG charts, SVG graphics
26//! - **Mixed Content**: Documents with embedded images and tables
27//!
28//! ## Examples
29//!
30//! ### Basic Multi-Modal Document Processing
//! ```rust,ignore
//! use rrag::multimodal::{MultiModalConfig, MultiModalService};
//! use std::path::Path;
//!
//! # async fn example() -> rrag::RragResult<()> {
//! // `MultiModalService::new` is synchronous; modalities are toggled through
//! // the public fields of `MultiModalConfig`.
//! let config = MultiModalConfig {
//!     process_images: true,
//!     process_tables: true,
//!     process_charts: true,
//!     ..MultiModalConfig::default()
//! };
//! let service = MultiModalService::new(config)?;
//!
//! // Process a document with mixed content
//! let processed = service
//!     .process_document(Path::new("reports/q4_2024_revenue.pdf"))
//!     .await?;
//! tracing::debug!("Extracted {} bytes of text, {} images, {} tables",
//!          processed.text_content.len(),
//!          processed.images.len(),
//!          processed.tables.len());
//! # Ok(())
//! # }
//! ```
56//!
57//! ### Image Analysis and Captioning
58//! ```rust
59//! use rrag::multimodal::image_processor::{ImageProcessor, ProcessingConfig};
60//!
61//! # async fn example() -> rrag::RragResult<()> {
62//! let processor = ImageProcessor::new(
63//!     ProcessingConfig::default()
64//!         .enable_object_detection(true)
65//!         .enable_captioning(true)
66//!         .enable_ocr(true)
67//! );
68//!
69//! let image_path = "images/product_diagram.png";
70//! let analysis = processor.analyze_image(image_path).await?;
71//!
72//! tracing::debug!("Caption: {}", analysis.caption);
73//! tracing::debug!("Detected {} objects", analysis.objects.len());
74//! tracing::debug!("Extracted text: {}", analysis.text);
75//! # Ok(())
76//! # }
77//! ```
78//!
79//! ### Table Extraction and Analysis
80//! ```rust
81//! use rrag::multimodal::table_processor::{TableProcessor, TableConfig};
82//!
83//! # async fn example() -> rrag::RragResult<()> {
84//! let processor = TableProcessor::new(TableConfig::default());
85//!
86//! // Extract tables from HTML
87//! let html = r#"<table><tr><th>Product</th><th>Revenue</th></tr>..."#;
88//! let tables = processor.extract_from_html(html).await?;
89//!
90//! for table in tables {
91//!     tracing::debug!("Table: {} rows, {} columns",
92//!              table.rows.len(),
93//!              table.headers.len());
94//! }
95//! # Ok(())
96//! # }
97//! ```
98//!
99//! ### Chart Analysis
100//! ```rust
101//! use rrag::multimodal::chart_processor::{ChartProcessor, ChartConfig};
102//!
103//! # async fn example() -> rrag::RragResult<()> {
104//! let processor = ChartProcessor::new(ChartConfig::default());
105//!
106//! let chart_path = "charts/sales_trend.png";
107//! let analysis = processor.analyze_chart(chart_path).await?;
108//!
109//! tracing::debug!("Chart type: {:?}", analysis.chart_type);
110//! tracing::debug!("Description: {}", analysis.description);
111//! tracing::debug!("Key insights: {:?}", analysis.insights);
112//! # Ok(())
113//! # }
114//! ```
115//!
116//! ### Multi-Modal Search
//! ```rust,ignore
118//! use rrag::multimodal::retrieval::{MultiModalRetriever, SearchOptions};
119//!
120//! # async fn example() -> rrag::RragResult<()> {
121//! let retriever = MultiModalRetriever::new().await?;
122//!
123//! // Search across text, images, and tables
124//! let results = retriever.search_multi_modal(
125//!     "revenue trends Q4 2024",
126//!     SearchOptions::new()
127//!         .include_text(true)
128//!         .include_images(true)
129//!         .include_tables(true)
130//! ).await?;
131//!
132//! for result in results {
133//!     match result.content_type {
134//!         ContentType::Text => tracing::debug!("Text: {}", result.content),
135//!         ContentType::Image => tracing::debug!("Image: {}", result.path),
136//!         ContentType::Table => tracing::debug!("Table: {} rows", result.metadata["rows"]),
137//!     }
138//! }
139//! # Ok(())
140//! # }
141//! ```
142//!
143//! ## Performance Considerations
144//!
145//! - **Batch Processing**: Process multiple items together for efficiency
146//! - **Caching**: Cache embeddings and analysis results
147//! - **Parallel Processing**: Use multiple threads for CPU-intensive tasks
148//! - **GPU Acceleration**: Leverage CUDA for deep learning models (when available)
149//! - **Memory Management**: Stream large files to avoid memory issues
150//!
151//! ## Model Integration
152//!
153//! The module supports integration with various pre-trained models:
154//!
155//! - **Vision Models**: CLIP, BLIP, ViT for image understanding
156//! - **OCR Models**: Tesseract, EasyOCR, TrOCR
157//! - **Layout Models**: LayoutLM, DiT for document layout
158//! - **Table Models**: TableNet, TableTransformer
159//! - **Chart Models**: ChartQA, PlotQA for chart understanding
160
161pub mod chart_processor;
162pub mod document_parser;
163pub mod embedding_fusion;
164pub mod image_processor;
165pub mod layout_analysis;
166pub mod ocr;
167pub mod retrieval;
168pub mod table_processor;
169
170use crate::RragResult;
171use serde::{Deserialize, Serialize};
172use std::path::Path;
173
174/// Multi-modal processing service
175pub struct MultiModalService {
176    /// Configuration
177    config: MultiModalConfig,
178
179    /// Image processor
180    image_processor: Box<dyn ImageProcessor>,
181
182    /// Table processor
183    table_processor: Box<dyn TableProcessor>,
184
185    /// Chart processor
186    chart_processor: Box<dyn ChartProcessor>,
187
188    /// OCR engine
189    ocr_engine: Box<dyn OCREngine>,
190
191    /// Layout analyzer
192    layout_analyzer: Box<dyn LayoutAnalyzer>,
193
194    /// Embedding fusion strategy
195    fusion_strategy: Box<dyn EmbeddingFusionStrategy>,
196}
197
198/// Multi-modal configuration
199#[derive(Debug, Clone)]
200pub struct MultiModalConfig {
201    /// Enable image processing
202    pub process_images: bool,
203
204    /// Enable table extraction
205    pub process_tables: bool,
206
207    /// Enable chart analysis
208    pub process_charts: bool,
209
210    /// Image processing config
211    pub image_config: ImageProcessingConfig,
212
213    /// Table extraction config
214    pub table_config: TableExtractionConfig,
215
216    /// Chart analysis config
217    pub chart_config: ChartAnalysisConfig,
218
219    /// OCR configuration
220    pub ocr_config: OCRConfig,
221
222    /// Layout analysis config
223    pub layout_config: LayoutAnalysisConfig,
224
225    /// Fusion strategy
226    pub fusion_strategy: FusionStrategy,
227}
228
229/// Image processing configuration
230#[derive(Debug, Clone)]
231pub struct ImageProcessingConfig {
232    /// Maximum image dimensions
233    pub max_width: u32,
234    pub max_height: u32,
235
236    /// Image formats to process
237    pub supported_formats: Vec<ImageFormat>,
238
239    /// Enable CLIP embeddings
240    pub use_clip: bool,
241
242    /// Enable image captioning
243    pub generate_captions: bool,
244
245    /// Extract visual features
246    pub extract_features: bool,
247
248    /// Compression quality (0-100)
249    pub compression_quality: u8,
250}
251
252/// Table extraction configuration
253#[derive(Debug, Clone)]
254pub struct TableExtractionConfig {
255    /// Minimum rows for valid table
256    pub min_rows: usize,
257
258    /// Minimum columns for valid table
259    pub min_cols: usize,
260
261    /// Extract headers
262    pub extract_headers: bool,
263
264    /// Infer data types
265    pub infer_types: bool,
266
267    /// Generate summaries
268    pub generate_summaries: bool,
269
270    /// Output format
271    pub output_format: TableOutputFormat,
272}
273
274/// Chart analysis configuration
275#[derive(Debug, Clone)]
276pub struct ChartAnalysisConfig {
277    /// Chart types to recognize
278    pub chart_types: Vec<ChartType>,
279
280    /// Extract data points
281    pub extract_data: bool,
282
283    /// Generate descriptions
284    pub generate_descriptions: bool,
285
286    /// Analyze trends
287    pub analyze_trends: bool,
288}
289
290/// OCR configuration
291#[derive(Debug, Clone)]
292pub struct OCRConfig {
293    /// OCR engine to use
294    pub engine: OCREngineType,
295
296    /// Languages to recognize
297    pub languages: Vec<String>,
298
299    /// Confidence threshold
300    pub confidence_threshold: f32,
301
302    /// Enable spell correction
303    pub spell_correction: bool,
304
305    /// Preserve formatting
306    pub preserve_formatting: bool,
307}
308
/// Settings for the layout analysis component.
#[derive(Debug, Clone)]
pub struct LayoutAnalysisConfig {
    /// Whether document structure is detected.
    pub detect_structure: bool,

    /// Whether sections are identified.
    pub identify_sections: bool,

    /// Whether reading order is extracted.
    pub extract_reading_order: bool,

    /// Whether columns are detected.
    pub detect_columns: bool,
}
324
325/// Multi-modal document representation
326#[derive(Debug, Clone, Serialize, Deserialize)]
327pub struct MultiModalDocument {
328    /// Document ID
329    pub id: String,
330
331    /// Text content
332    pub text_content: String,
333
334    /// Images in document
335    pub images: Vec<ProcessedImage>,
336
337    /// Tables in document
338    pub tables: Vec<ExtractedTable>,
339
340    /// Charts in document
341    pub charts: Vec<AnalyzedChart>,
342
343    /// Document layout
344    pub layout: DocumentLayout,
345
346    /// Combined embeddings
347    pub embeddings: MultiModalEmbeddings,
348
349    /// Metadata
350    pub metadata: DocumentMetadata,
351}
352
353/// Processed image data
354#[derive(Debug, Clone, Serialize, Deserialize)]
355pub struct ProcessedImage {
356    /// Image ID
357    pub id: String,
358
359    /// Original path or URL
360    pub source: String,
361
362    /// Image caption
363    pub caption: Option<String>,
364
365    /// OCR text if applicable
366    pub ocr_text: Option<String>,
367
368    /// Visual features
369    pub features: Option<VisualFeatures>,
370
371    /// CLIP embedding
372    pub clip_embedding: Option<Vec<f32>>,
373
374    /// Image metadata
375    pub metadata: ImageMetadata,
376}
377
378/// Extracted table data
379#[derive(Debug, Clone, Serialize, Deserialize)]
380pub struct ExtractedTable {
381    /// Table ID
382    pub id: String,
383
384    /// Table headers
385    pub headers: Vec<String>,
386
387    /// Table data rows
388    pub rows: Vec<Vec<TableCell>>,
389
390    /// Table summary
391    pub summary: Option<String>,
392
393    /// Column types
394    pub column_types: Vec<DataType>,
395
396    /// Table embedding
397    pub embedding: Option<Vec<f32>>,
398
399    /// Statistics
400    pub statistics: Option<TableStatistics>,
401}
402
403/// Analyzed chart data
404#[derive(Debug, Clone, Serialize, Deserialize)]
405pub struct AnalyzedChart {
406    /// Chart ID
407    pub id: String,
408
409    /// Chart type
410    pub chart_type: ChartType,
411
412    /// Chart title
413    pub title: Option<String>,
414
415    /// Axis labels
416    pub axes: ChartAxes,
417
418    /// Data points
419    pub data_points: Vec<DataPoint>,
420
421    /// Trend analysis
422    pub trends: Option<TrendAnalysis>,
423
424    /// Description
425    pub description: Option<String>,
426
427    /// Chart embedding
428    pub embedding: Option<Vec<f32>>,
429}
430
431/// Document layout information
432#[derive(Debug, Clone, Serialize, Deserialize)]
433pub struct DocumentLayout {
434    /// Page count
435    pub pages: usize,
436
437    /// Document sections
438    pub sections: Vec<DocumentSection>,
439
440    /// Reading order
441    pub reading_order: Vec<String>,
442
443    /// Column layout
444    pub columns: Option<ColumnLayout>,
445
446    /// Document type
447    pub document_type: DocumentType,
448}
449
450/// Multi-modal embeddings
451#[derive(Debug, Clone, Serialize, Deserialize)]
452pub struct MultiModalEmbeddings {
453    /// Text embeddings
454    pub text_embeddings: Vec<f32>,
455
456    /// Visual embeddings (averaged)
457    pub visual_embeddings: Option<Vec<f32>>,
458
459    /// Table embeddings (averaged)
460    pub table_embeddings: Option<Vec<f32>>,
461
462    /// Fused embedding
463    pub fused_embedding: Vec<f32>,
464
465    /// Embedding weights
466    pub weights: EmbeddingWeights,
467}
468
469/// Image processor trait
470pub trait ImageProcessor: Send + Sync {
471    /// Process image
472    fn process_image(&self, image_path: &Path) -> RragResult<ProcessedImage>;
473
474    /// Extract features
475    fn extract_features(&self, image_path: &Path) -> RragResult<VisualFeatures>;
476
477    /// Generate caption
478    fn generate_caption(&self, image_path: &Path) -> RragResult<String>;
479
480    /// Generate CLIP embedding
481    fn generate_clip_embedding(&self, image_path: &Path) -> RragResult<Vec<f32>>;
482}
483
484/// Table processor trait
485pub trait TableProcessor: Send + Sync {
486    /// Extract table from document
487    fn extract_table(&self, content: &str) -> RragResult<Vec<ExtractedTable>>;
488
489    /// Parse table structure
490    fn parse_structure(&self, table_html: &str) -> RragResult<ExtractedTable>;
491
492    /// Generate table summary
493    fn generate_summary(&self, table: &ExtractedTable) -> RragResult<String>;
494
495    /// Calculate statistics
496    fn calculate_statistics(&self, table: &ExtractedTable) -> RragResult<TableStatistics>;
497}
498
499/// Chart processor trait
500pub trait ChartProcessor: Send + Sync {
501    /// Analyze chart
502    fn analyze_chart(&self, image_path: &Path) -> RragResult<AnalyzedChart>;
503
504    /// Extract data points
505    fn extract_data_points(&self, chart_image: &Path) -> RragResult<Vec<DataPoint>>;
506
507    /// Identify chart type
508    fn identify_type(&self, chart_image: &Path) -> RragResult<ChartType>;
509
510    /// Analyze trends
511    fn analyze_trends(&self, data_points: &[DataPoint]) -> RragResult<TrendAnalysis>;
512}
513
514/// OCR engine trait
515pub trait OCREngine: Send + Sync {
516    /// Perform OCR on image
517    fn ocr(&self, image_path: &Path) -> RragResult<OCRResult>;
518
519    /// Get text with confidence
520    fn get_text_with_confidence(&self, image_path: &Path) -> RragResult<Vec<(String, f32)>>;
521
522    /// Get text layout
523    fn get_layout(&self, image_path: &Path) -> RragResult<TextLayout>;
524}
525
526/// Layout analyzer trait
527pub trait LayoutAnalyzer: Send + Sync {
528    /// Analyze document layout
529    fn analyze_layout(&self, document_path: &Path) -> RragResult<DocumentLayout>;
530
531    /// Detect sections
532    fn detect_sections(&self, content: &str) -> RragResult<Vec<DocumentSection>>;
533
534    /// Extract reading order
535    fn extract_reading_order(&self, layout: &DocumentLayout) -> RragResult<Vec<String>>;
536}
537
538/// Embedding fusion strategy trait
539pub trait EmbeddingFusionStrategy: Send + Sync {
540    /// Fuse multi-modal embeddings
541    fn fuse_embeddings(&self, embeddings: &MultiModalEmbeddings) -> RragResult<Vec<f32>>;
542
543    /// Calculate optimal weights
544    fn calculate_weights(&self, document: &MultiModalDocument) -> RragResult<EmbeddingWeights>;
545}
546
547// Supporting types
548
/// Image file formats known to the module.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum ImageFormat {
    /// JPEG image.
    JPEG,
    /// PNG image.
    PNG,
    /// GIF image.
    GIF,
    /// BMP image.
    BMP,
    /// WebP image.
    WEBP,
    /// SVG image.
    SVG,
    /// TIFF image.
    TIFF,
}
560
/// Output formats for extracted tables.
///
/// Derives `PartialEq`/`Eq` so a configured format can be compared with `==`.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum TableOutputFormat {
    /// CSV output.
    CSV,
    /// JSON output.
    JSON,
    /// Markdown output.
    Markdown,
    /// HTML output.
    HTML,
}
569
570/// Chart types
571#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, Serialize, Deserialize)]
572pub enum ChartType {
573    Line,
574    Bar,
575    Pie,
576    Scatter,
577    Area,
578    Histogram,
579    HeatMap,
580    Box,
581    Unknown,
582}
583
/// Selectable OCR engines.
///
/// Derives `Eq` and `Hash` in addition to `PartialEq`, matching the other
/// fieldless enums in this module, so values can be used as set or map keys.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
pub enum OCREngineType {
    /// Tesseract.
    Tesseract,
    /// EasyOCR.
    EasyOCR,
    /// PaddleOCR.
    PaddleOCR,
    /// Cloud Vision.
    CloudVision,
}
592
593/// Document types
594#[derive(Debug, Clone, Copy, Serialize, Deserialize, PartialEq)]
595pub enum DocumentType {
596    PDF,
597    Word,
598    PowerPoint,
599    HTML,
600    Markdown,
601    PlainText,
602    Mixed,
603}
604
605/// Data types for table columns
606#[derive(Debug, Clone, Copy, Serialize, Deserialize, PartialEq, Eq, Hash)]
607pub enum DataType {
608    String,
609    Number,
610    Date,
611    Boolean,
612    Mixed,
613}
614
/// Available ways of fusing embeddings.
///
/// Derives `PartialEq`/`Eq` so a configured strategy can be compared with `==`.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum FusionStrategy {
    /// Simple averaging.
    Average,

    /// Weighted average based on content.
    Weighted,

    /// Concatenation.
    Concatenate,

    /// Attention-based fusion.
    Attention,

    /// Learned fusion.
    Learned,
}
633
634/// Visual features extracted from images
635#[derive(Debug, Clone, Serialize, Deserialize)]
636pub struct VisualFeatures {
637    /// Dominant colors
638    pub colors: Vec<Color>,
639
640    /// Detected objects
641    pub objects: Vec<DetectedObject>,
642
643    /// Scene classification
644    pub scene: Option<String>,
645
646    /// Image quality metrics
647    pub quality: ImageQuality,
648
649    /// Spatial layout
650    pub layout: SpatialLayout,
651}
652
653/// Table cell
654#[derive(Debug, Clone, Serialize, Deserialize)]
655pub struct TableCell {
656    /// Cell value
657    pub value: String,
658
659    /// Cell type
660    pub data_type: DataType,
661
662    /// Cell formatting
663    pub formatting: Option<CellFormatting>,
664}
665
666/// Table statistics
667#[derive(Debug, Clone, Serialize, Deserialize)]
668pub struct TableStatistics {
669    /// Row count
670    pub row_count: usize,
671
672    /// Column count
673    pub column_count: usize,
674
675    /// Null percentage per column
676    pub null_percentages: Vec<f32>,
677
678    /// Column statistics
679    pub column_stats: Vec<ColumnStatistics>,
680}
681
682/// Column statistics
683#[derive(Debug, Clone, Serialize, Deserialize)]
684pub struct ColumnStatistics {
685    /// Column name
686    pub name: String,
687
688    /// For numeric columns
689    pub numeric_stats: Option<NumericStatistics>,
690
691    /// For text columns
692    pub text_stats: Option<TextStatistics>,
693
694    /// Unique values count
695    pub unique_count: usize,
696}
697
698/// Numeric statistics
699#[derive(Debug, Clone, Serialize, Deserialize)]
700pub struct NumericStatistics {
701    pub min: f64,
702    pub max: f64,
703    pub mean: f64,
704    pub median: f64,
705    pub std_dev: f64,
706}
707
708/// Text statistics
709#[derive(Debug, Clone, Serialize, Deserialize)]
710pub struct TextStatistics {
711    pub min_length: usize,
712    pub max_length: usize,
713    pub avg_length: f32,
714    pub most_common: Vec<(String, usize)>,
715}
716
717/// Chart axes information
718#[derive(Debug, Clone, Serialize, Deserialize)]
719pub struct ChartAxes {
720    pub x_label: Option<String>,
721    pub y_label: Option<String>,
722    pub x_range: Option<(f64, f64)>,
723    pub y_range: Option<(f64, f64)>,
724}
725
726/// Data point in chart
727#[derive(Debug, Clone, Serialize, Deserialize)]
728pub struct DataPoint {
729    pub x: f64,
730    pub y: f64,
731    pub label: Option<String>,
732    pub series: Option<String>,
733}
734
735/// Trend analysis results
736#[derive(Debug, Clone, Serialize, Deserialize)]
737pub struct TrendAnalysis {
738    /// Trend direction
739    pub direction: TrendDirection,
740
741    /// Trend strength (0-1)
742    pub strength: f32,
743
744    /// Seasonality detected
745    pub seasonality: Option<Seasonality>,
746
747    /// Outliers
748    pub outliers: Vec<DataPoint>,
749
750    /// Forecast
751    pub forecast: Option<Vec<DataPoint>>,
752}
753
754/// Trend directions
755#[derive(Debug, Clone, Copy, Serialize, Deserialize, PartialEq)]
756pub enum TrendDirection {
757    Increasing,
758    Decreasing,
759    Stable,
760    Volatile,
761}
762
763/// Seasonality patterns
764#[derive(Debug, Clone, Serialize, Deserialize)]
765pub struct Seasonality {
766    pub period: f64,
767    pub amplitude: f64,
768    pub phase: f64,
769}
770
771/// OCR result
772#[derive(Debug, Clone)]
773pub struct OCRResult {
774    /// Extracted text
775    pub text: String,
776
777    /// Overall confidence
778    pub confidence: f32,
779
780    /// Word-level results
781    pub words: Vec<OCRWord>,
782
783    /// Detected languages
784    pub languages: Vec<String>,
785}
786
787/// OCR word result
788#[derive(Debug, Clone)]
789pub struct OCRWord {
790    pub text: String,
791    pub confidence: f32,
792    pub bounding_box: BoundingBox,
793}
794
/// A box given by an `x`/`y` position plus a width and height.
///
/// Derives `PartialEq`/`Eq` so boxes can be compared directly; all fields are
/// plain `u32` values.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct BoundingBox {
    /// X position.
    pub x: u32,
    /// Y position.
    pub y: u32,
    /// Width of the box.
    pub width: u32,
    /// Height of the box.
    pub height: u32,
}
803
804/// Text layout from OCR
805#[derive(Debug, Clone)]
806pub struct TextLayout {
807    /// Text blocks
808    pub blocks: Vec<TextBlock>,
809
810    /// Reading order
811    pub reading_order: Vec<usize>,
812
813    /// Detected columns
814    pub columns: Option<Vec<Column>>,
815}
816
817/// Text block
818#[derive(Debug, Clone)]
819pub struct TextBlock {
820    pub id: usize,
821    pub text: String,
822    pub bounding_box: BoundingBox,
823    pub block_type: BlockType,
824}
825
/// Kinds of text block.
///
/// Derives `PartialEq`/`Eq` so a block's kind can be compared with `==`.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum BlockType {
    /// Title block.
    Title,
    /// Heading block.
    Heading,
    /// Paragraph block.
    Paragraph,
    /// Caption block.
    Caption,
    /// Footer block.
    Footer,
    /// Header block.
    Header,
}
836
/// A column within a text layout.
#[derive(Debug, Clone)]
pub struct Column {
    /// Index of the column.
    pub index: usize,
    /// Blocks in the column, as a list of `usize`.
    pub blocks: Vec<usize>,
    /// Width of the column.
    pub width: u32,
}
844
845/// Document section
846#[derive(Debug, Clone, Serialize, Deserialize)]
847pub struct DocumentSection {
848    pub id: String,
849    pub title: Option<String>,
850    pub content: String,
851    pub section_type: SectionType,
852    pub level: usize,
853    pub page_range: (usize, usize),
854}
855
856/// Section types
857#[derive(Debug, Clone, Copy, Serialize, Deserialize, PartialEq, Eq, Hash)]
858pub enum SectionType {
859    Title,
860    Abstract,
861    Introduction,
862    Body,
863    Conclusion,
864    References,
865    Appendix,
866}
867
868/// Column layout
869#[derive(Debug, Clone, Serialize, Deserialize)]
870pub struct ColumnLayout {
871    pub column_count: usize,
872    pub column_widths: Vec<f32>,
873    pub gutter_width: f32,
874}
875
876/// Document metadata
877#[derive(Debug, Clone, Serialize, Deserialize)]
878pub struct DocumentMetadata {
879    pub title: Option<String>,
880    pub author: Option<String>,
881    pub creation_date: Option<String>,
882    pub modification_date: Option<String>,
883    pub page_count: usize,
884    pub word_count: usize,
885    pub language: String,
886    pub format: DocumentType,
887}
888
889/// Image metadata
890#[derive(Debug, Clone, Serialize, Deserialize)]
891pub struct ImageMetadata {
892    pub width: u32,
893    pub height: u32,
894    pub format: String,
895    pub size_bytes: usize,
896    pub dpi: Option<u32>,
897    pub color_space: Option<String>,
898}
899
900/// Color information
901#[derive(Debug, Clone, Serialize, Deserialize)]
902pub struct Color {
903    pub rgb: (u8, u8, u8),
904    pub percentage: f32,
905    pub name: Option<String>,
906}
907
908/// Detected object in image
909#[derive(Debug, Clone, Serialize, Deserialize)]
910pub struct DetectedObject {
911    pub class: String,
912    pub confidence: f32,
913    pub bounding_box: (f32, f32, f32, f32),
914}
915
916/// Image quality metrics
917#[derive(Debug, Clone, Serialize, Deserialize)]
918pub struct ImageQuality {
919    pub sharpness: f32,
920    pub contrast: f32,
921    pub brightness: f32,
922    pub noise_level: f32,
923}
924
925/// Spatial layout of image
926#[derive(Debug, Clone, Serialize, Deserialize)]
927pub struct SpatialLayout {
928    pub composition_type: CompositionType,
929    pub focal_points: Vec<(f32, f32)>,
930    pub balance: f32,
931}
932
933/// Composition types
934#[derive(Debug, Clone, Copy, Serialize, Deserialize)]
935pub enum CompositionType {
936    RuleOfThirds,
937    Centered,
938    Diagonal,
939    Symmetrical,
940    Asymmetrical,
941}
942
943/// Cell formatting
944#[derive(Debug, Clone, Serialize, Deserialize)]
945pub struct CellFormatting {
946    pub bold: bool,
947    pub italic: bool,
948    pub color: Option<String>,
949    pub background: Option<String>,
950}
951
952/// Embedding weights for fusion
953#[derive(Debug, Clone, Serialize, Deserialize)]
954pub struct EmbeddingWeights {
955    pub text_weight: f32,
956    pub visual_weight: f32,
957    pub table_weight: f32,
958    pub chart_weight: f32,
959}
960
961impl MultiModalService {
962    /// Create new multi-modal service
963    pub fn new(config: MultiModalConfig) -> RragResult<Self> {
964        Ok(Self {
965            config: config.clone(),
966            image_processor: Box::new(image_processor::DefaultImageProcessor::new(
967                config.image_config,
968            )?),
969            table_processor: Box::new(table_processor::DefaultTableProcessor::new(
970                config.table_config,
971            )?),
972            chart_processor: Box::new(chart_processor::DefaultChartProcessor::new(
973                config.chart_config,
974            )?),
975            ocr_engine: Box::new(ocr::DefaultOCREngine::new(config.ocr_config)?),
976            layout_analyzer: Box::new(layout_analysis::DefaultLayoutAnalyzer::new(
977                config.layout_config,
978            )?),
979            fusion_strategy: Box::new(embedding_fusion::DefaultFusionStrategy::new(
980                config.fusion_strategy,
981            )?),
982        })
983    }
984
985    /// Process multi-modal document
986    pub async fn process_document(&self, _document_path: &Path) -> RragResult<MultiModalDocument> {
987        // Implementation would process all modalities
988        todo!("Implement multi-modal document processing")
989    }
990
991    /// Extract all modalities
992    pub async fn extract_modalities(&self, _content: &[u8]) -> RragResult<MultiModalDocument> {
993        // Implementation would extract different modalities
994        todo!("Implement modality extraction")
995    }
996}
997
998impl Default for MultiModalConfig {
999    fn default() -> Self {
1000        Self {
1001            process_images: true,
1002            process_tables: true,
1003            process_charts: true,
1004            image_config: ImageProcessingConfig::default(),
1005            table_config: TableExtractionConfig::default(),
1006            chart_config: ChartAnalysisConfig::default(),
1007            ocr_config: OCRConfig::default(),
1008            layout_config: LayoutAnalysisConfig::default(),
1009            fusion_strategy: FusionStrategy::Weighted,
1010        }
1011    }
1012}
1013
1014impl Default for ImageProcessingConfig {
1015    fn default() -> Self {
1016        Self {
1017            max_width: 1920,
1018            max_height: 1080,
1019            supported_formats: vec![ImageFormat::JPEG, ImageFormat::PNG, ImageFormat::WEBP],
1020            use_clip: true,
1021            generate_captions: true,
1022            extract_features: true,
1023            compression_quality: 85,
1024        }
1025    }
1026}
1027
1028impl Default for TableExtractionConfig {
1029    fn default() -> Self {
1030        Self {
1031            min_rows: 2,
1032            min_cols: 2,
1033            extract_headers: true,
1034            infer_types: true,
1035            generate_summaries: true,
1036            output_format: TableOutputFormat::JSON,
1037        }
1038    }
1039}
1040
1041impl Default for ChartAnalysisConfig {
1042    fn default() -> Self {
1043        Self {
1044            chart_types: vec![
1045                ChartType::Line,
1046                ChartType::Bar,
1047                ChartType::Pie,
1048                ChartType::Scatter,
1049            ],
1050            extract_data: true,
1051            generate_descriptions: true,
1052            analyze_trends: true,
1053        }
1054    }
1055}
1056
1057impl Default for OCRConfig {
1058    fn default() -> Self {
1059        Self {
1060            engine: OCREngineType::Tesseract,
1061            languages: vec!["eng".to_string()],
1062            confidence_threshold: 0.7,
1063            spell_correction: true,
1064            preserve_formatting: true,
1065        }
1066    }
1067}
1068
1069impl Default for LayoutAnalysisConfig {
1070    fn default() -> Self {
1071        Self {
1072            detect_structure: true,
1073            identify_sections: true,
1074            extract_reading_order: true,
1075            detect_columns: true,
1076        }
1077    }
1078}
1079
#[cfg(test)]
mod tests {
    use super::*;

    /// The default top-level config enables every modality.
    #[test]
    fn test_multimodal_config() {
        let MultiModalConfig {
            process_images,
            process_tables,
            process_charts,
            ..
        } = MultiModalConfig::default();

        assert!(process_images);
        assert!(process_tables);
        assert!(process_charts);
    }

    /// The default image config limits images to 1920x1080 and enables CLIP.
    #[test]
    fn test_image_config() {
        let defaults = ImageProcessingConfig::default();

        assert_eq!(defaults.max_width, 1920);
        assert_eq!(defaults.max_height, 1080);
        assert!(defaults.use_clip);
    }
}