rexis_rag/multimodal/
table_processor.rs

//! # Table Processing
//!
//! Advanced table extraction, analysis, and embedding generation.
4
5use super::{
6    ColumnStatistics, DataType, ExtractedTable, NumericStatistics, TableCell,
7    TableExtractionConfig, TableProcessor, TableStatistics, TextStatistics,
8};
9use crate::{RragError, RragResult};
10use serde::{Deserialize, Serialize};
11use std::collections::HashMap;
12
13/// Default table processor implementation
14pub struct DefaultTableProcessor {
15    /// Configuration
16    config: TableExtractionConfig,
17
18    /// HTML parser
19    html_parser: HtmlTableParser,
20
21    /// CSV parser
22    csv_parser: CsvTableParser,
23
24    /// Markdown parser
25    markdown_parser: MarkdownTableParser,
26
27    /// Statistics calculator
28    stats_calculator: StatisticsCalculator,
29
30    /// Type inferrer
31    type_inferrer: TypeInferrer,
32
33    /// Summary generator
34    summary_generator: TableSummaryGenerator,
35}
36
37/// HTML table parser
38pub struct HtmlTableParser {
39    /// Configuration
40    config: HtmlParserConfig,
41}
42
/// Settings for [`HtmlTableParser`].
#[derive(Debug, Clone)]
pub struct HtmlParserConfig {
    /// Whether table headers should be extracted
    pub extract_headers: bool,

    /// Whether cell formatting should be preserved
    pub preserve_formatting: bool,

    /// Whether merged cells should be handled
    pub handle_merges: bool,

    /// Upper bound on table size
    // NOTE(review): presumably counted in cells, going by the field name — confirm.
    pub max_cells: usize,
}
58
59/// CSV table parser
60pub struct CsvTableParser {
61    /// Delimiter detection
62    delimiter_detector: DelimiterDetector,
63
64    /// Quote handling
65    quote_char: char,
66
67    /// Escape handling
68    escape_char: Option<char>,
69}
70
/// Stateless helper that guesses which delimiter a CSV-like text uses.
pub struct DelimiterDetector;
73
/// Stateless parser for pipe-delimited Markdown tables.
pub struct MarkdownTableParser;
76
/// Stateless helper that computes table and column statistics.
pub struct StatisticsCalculator;
79
/// Stateless helper that checks cell values against data types.
pub struct TypeInferrer;
82
83/// Table summary generator
84pub struct TableSummaryGenerator {
85    /// Summary templates
86    templates: HashMap<SummaryType, String>,
87
88    /// Generation strategy
89    strategy: SummaryStrategy,
90}
91
/// Kinds of summary that can be requested for a table.
///
/// The derived `Hash`/`Eq` let a variant act as a map key.
#[derive(Debug, Clone, Copy, Hash, PartialEq, Eq)]
pub enum SummaryType {
    /// Brief summary
    Brief,
    /// Detailed summary
    Detailed,
    /// Statistical summary
    Statistical,
    /// Narrative summary
    Narrative,
}
100
/// Ways in which a table summary can be produced.
#[derive(Debug, Clone, Copy)]
pub enum SummaryStrategy {
    /// Template-based generation
    TemplateBase,
    /// ML-based generation
    MLGenerated,
    /// Hybrid generation
    Hybrid,
}
108
109/// Table parsing result
110#[derive(Debug, Clone)]
111pub struct TableParseResult {
112    /// Extracted tables
113    pub tables: Vec<ExtractedTable>,
114
115    /// Parsing confidence
116    pub confidence: f32,
117
118    /// Parsing metadata
119    pub metadata: ParseMetadata,
120
121    /// Warnings and issues
122    pub warnings: Vec<String>,
123}
124
125/// Parse metadata
126#[derive(Debug, Clone)]
127pub struct ParseMetadata {
128    /// Parser used
129    pub parser_type: ParserType,
130
131    /// Processing time
132    pub processing_time_ms: u64,
133
134    /// Source format detected
135    pub detected_format: SourceFormat,
136
137    /// Table structure confidence
138    pub structure_confidence: f32,
139}
140
/// Identifies the parser that handled a piece of content.
#[derive(Debug, Clone, Copy)]
pub enum ParserType {
    /// HTML parser
    Html,
    /// CSV parser
    Csv,
    /// Markdown parser
    Markdown,
    /// Excel parser
    Excel,
    /// Automatic parser selection
    Auto,
}
150
/// Format a piece of table content was recognised as.
#[derive(Debug, Clone, Copy)]
pub enum SourceFormat {
    /// HTML markup
    Html,
    /// Comma-separated values
    Csv,
    /// Tab-separated values
    Tsv,
    /// Markdown text
    Markdown,
    /// Excel content
    Excel,
    /// No known format was recognised
    Unknown,
}
161
162/// Table quality assessment
163#[derive(Debug, Clone, Serialize, Deserialize)]
164pub struct TableQuality {
165    /// Completeness score (0-1)
166    pub completeness: f32,
167
168    /// Consistency score (0-1)
169    pub consistency: f32,
170
171    /// Structure quality (0-1)
172    pub structure_quality: f32,
173
174    /// Data quality (0-1)
175    pub data_quality: f32,
176
177    /// Overall quality (0-1)
178    pub overall_quality: f32,
179
180    /// Quality issues
181    pub issues: Vec<QualityIssue>,
182}
183
184/// Quality issues
185#[derive(Debug, Clone, Serialize, Deserialize)]
186pub struct QualityIssue {
187    /// Issue type
188    pub issue_type: QualityIssueType,
189
190    /// Issue description
191    pub description: String,
192
193    /// Severity level
194    pub severity: IssueSeverity,
195
196    /// Location in table
197    pub location: Option<CellLocation>,
198}
199
200/// Quality issue types
201#[derive(Debug, Clone, Copy, Serialize, Deserialize)]
202pub enum QualityIssueType {
203    MissingValues,
204    InconsistentTypes,
205    DuplicateRows,
206    InvalidData,
207    StructuralIssues,
208    EncodingIssues,
209}
210
211/// Issue severity levels
212#[derive(Debug, Clone, Copy, Serialize, Deserialize)]
213pub enum IssueSeverity {
214    Low,
215    Medium,
216    High,
217    Critical,
218}
219
220/// Cell location
221#[derive(Debug, Clone, Copy, Serialize, Deserialize)]
222pub struct CellLocation {
223    pub row: usize,
224    pub column: usize,
225}
226
227impl DefaultTableProcessor {
228    /// Create new table processor
229    pub fn new(config: TableExtractionConfig) -> RragResult<Self> {
230        let html_parser = HtmlTableParser::new(HtmlParserConfig::default());
231        let csv_parser = CsvTableParser::new();
232        let markdown_parser = MarkdownTableParser::new();
233        let stats_calculator = StatisticsCalculator::new();
234        let type_inferrer = TypeInferrer::new();
235        let summary_generator = TableSummaryGenerator::new();
236
237        Ok(Self {
238            config,
239            html_parser,
240            csv_parser,
241            markdown_parser,
242            stats_calculator,
243            type_inferrer,
244            summary_generator,
245        })
246    }
247
248    /// Auto-detect table format and parse
249    pub fn auto_parse(&self, content: &str) -> RragResult<TableParseResult> {
250        let detected_format = self.detect_format(content)?;
251
252        match detected_format {
253            SourceFormat::Html => self.parse_html_tables(content),
254            SourceFormat::Csv => self.parse_csv_table(content),
255            SourceFormat::Markdown => self.parse_markdown_tables(content),
256            _ => Err(RragError::document_processing("Unsupported table format")),
257        }
258    }
259
260    /// Detect table format from content
261    fn detect_format(&self, content: &str) -> RragResult<SourceFormat> {
262        // HTML detection
263        if content.contains("<table") || content.contains("<tr") {
264            return Ok(SourceFormat::Html);
265        }
266
267        // Markdown detection
268        if content.contains('|')
269            && content
270                .lines()
271                .any(|line| line.chars().filter(|&c| c == '|').count() >= 2)
272        {
273            return Ok(SourceFormat::Markdown);
274        }
275
276        // CSV/TSV detection
277        let comma_count = content.chars().filter(|&c| c == ',').count();
278        let tab_count = content.chars().filter(|&c| c == '\t').count();
279        let semicolon_count = content.chars().filter(|&c| c == ';').count();
280
281        if comma_count > tab_count && comma_count > semicolon_count {
282            Ok(SourceFormat::Csv)
283        } else if tab_count > comma_count && tab_count > semicolon_count {
284            Ok(SourceFormat::Tsv)
285        } else if semicolon_count > 0 {
286            Ok(SourceFormat::Csv) // European CSV format
287        } else {
288            Ok(SourceFormat::Unknown)
289        }
290    }
291
292    /// Parse HTML tables
293    fn parse_html_tables(&self, html: &str) -> RragResult<TableParseResult> {
294        let tables = self.html_parser.parse(html)?;
295
296        Ok(TableParseResult {
297            tables,
298            confidence: 0.9,
299            metadata: ParseMetadata {
300                parser_type: ParserType::Html,
301                processing_time_ms: 10,
302                detected_format: SourceFormat::Html,
303                structure_confidence: 0.9,
304            },
305            warnings: vec![],
306        })
307    }
308
309    /// Parse CSV table
310    fn parse_csv_table(&self, csv: &str) -> RragResult<TableParseResult> {
311        let table = self.csv_parser.parse(csv)?;
312
313        Ok(TableParseResult {
314            tables: vec![table],
315            confidence: 0.85,
316            metadata: ParseMetadata {
317                parser_type: ParserType::Csv,
318                processing_time_ms: 5,
319                detected_format: SourceFormat::Csv,
320                structure_confidence: 0.85,
321            },
322            warnings: vec![],
323        })
324    }
325
326    /// Parse Markdown tables
327    fn parse_markdown_tables(&self, markdown: &str) -> RragResult<TableParseResult> {
328        let tables = self.markdown_parser.parse(markdown)?;
329
330        Ok(TableParseResult {
331            tables,
332            confidence: 0.8,
333            metadata: ParseMetadata {
334                parser_type: ParserType::Markdown,
335                processing_time_ms: 8,
336                detected_format: SourceFormat::Markdown,
337                structure_confidence: 0.8,
338            },
339            warnings: vec![],
340        })
341    }
342
343    /// Assess table quality
344    pub fn assess_quality(&self, table: &ExtractedTable) -> RragResult<TableQuality> {
345        let mut issues = Vec::new();
346
347        // Check completeness
348        let total_cells = table.rows.len() * table.headers.len();
349        let empty_cells = table
350            .rows
351            .iter()
352            .flatten()
353            .filter(|cell| cell.value.trim().is_empty())
354            .count();
355
356        let completeness = 1.0 - (empty_cells as f32 / total_cells as f32);
357
358        if completeness < 0.8 {
359            issues.push(QualityIssue {
360                issue_type: QualityIssueType::MissingValues,
361                description: format!(
362                    "High missing value rate: {:.1}%",
363                    (1.0 - completeness) * 100.0
364                ),
365                severity: if completeness < 0.5 {
366                    IssueSeverity::High
367                } else {
368                    IssueSeverity::Medium
369                },
370                location: None,
371            });
372        }
373
374        // Check type consistency
375        let mut consistency_score = 1.0;
376        for (col_idx, col_type) in table.column_types.iter().enumerate() {
377            let inconsistent_count = table
378                .rows
379                .iter()
380                .filter(|row| {
381                    if let Some(cell) = row.get(col_idx) {
382                        !self.type_inferrer.matches_type(&cell.value, *col_type)
383                    } else {
384                        false
385                    }
386                })
387                .count();
388
389            if inconsistent_count > 0 {
390                consistency_score *= 1.0 - (inconsistent_count as f32 / table.rows.len() as f32);
391
392                if inconsistent_count as f32 / table.rows.len() as f32 > 0.1 {
393                    issues.push(QualityIssue {
394                        issue_type: QualityIssueType::InconsistentTypes,
395                        description: format!("Column {} has inconsistent data types", col_idx),
396                        severity: IssueSeverity::Medium,
397                        location: None,
398                    });
399                }
400            }
401        }
402
403        // Structure quality
404        let structure_quality = if table.headers.is_empty() { 0.5 } else { 1.0 };
405
406        // Data quality (simplified)
407        let data_quality = (completeness + consistency_score) / 2.0;
408
409        // Overall quality
410        let overall_quality =
411            (completeness + consistency_score + structure_quality + data_quality) / 4.0;
412
413        Ok(TableQuality {
414            completeness,
415            consistency: consistency_score,
416            structure_quality,
417            data_quality,
418            overall_quality,
419            issues,
420        })
421    }
422}
423
424impl TableProcessor for DefaultTableProcessor {
425    fn extract_table(&self, content: &str) -> RragResult<Vec<ExtractedTable>> {
426        let parse_result = self.auto_parse(content)?;
427        Ok(parse_result.tables)
428    }
429
430    fn parse_structure(&self, table_html: &str) -> RragResult<ExtractedTable> {
431        let parse_result = self.html_parser.parse(table_html)?;
432        parse_result
433            .into_iter()
434            .next()
435            .ok_or_else(|| RragError::document_processing("No table found in HTML"))
436    }
437
438    fn generate_summary(&self, table: &ExtractedTable) -> RragResult<String> {
439        self.summary_generator.generate(table, SummaryType::Brief)
440    }
441
442    fn calculate_statistics(&self, table: &ExtractedTable) -> RragResult<TableStatistics> {
443        self.stats_calculator.calculate(table)
444    }
445}
446
447impl HtmlTableParser {
448    /// Create new HTML parser
449    pub fn new(config: HtmlParserConfig) -> Self {
450        Self { config }
451    }
452
453    /// Parse HTML content for tables
454    pub fn parse(&self, _html: &str) -> RragResult<Vec<ExtractedTable>> {
455        // Simulate HTML parsing
456        let table_id = format!(
457            "table_{}",
458            uuid::Uuid::new_v4().to_string().split('-').next().unwrap()
459        );
460
461        let headers = vec!["Name".to_string(), "Age".to_string(), "City".to_string()];
462
463        let rows = vec![
464            vec![
465                TableCell {
466                    value: "John".to_string(),
467                    data_type: DataType::String,
468                    formatting: None,
469                },
470                TableCell {
471                    value: "25".to_string(),
472                    data_type: DataType::Number,
473                    formatting: None,
474                },
475                TableCell {
476                    value: "New York".to_string(),
477                    data_type: DataType::String,
478                    formatting: None,
479                },
480            ],
481            vec![
482                TableCell {
483                    value: "Alice".to_string(),
484                    data_type: DataType::String,
485                    formatting: None,
486                },
487                TableCell {
488                    value: "30".to_string(),
489                    data_type: DataType::Number,
490                    formatting: None,
491                },
492                TableCell {
493                    value: "London".to_string(),
494                    data_type: DataType::String,
495                    formatting: None,
496                },
497            ],
498        ];
499
500        let column_types = vec![DataType::String, DataType::Number, DataType::String];
501
502        Ok(vec![ExtractedTable {
503            id: table_id,
504            headers,
505            rows,
506            summary: None,
507            column_types,
508            embedding: None,
509            statistics: None,
510        }])
511    }
512
513    /// Extract table attributes
514    pub fn extract_attributes(&self, _table_element: &str) -> HashMap<String, String> {
515        // Simulate attribute extraction
516        let mut attributes = HashMap::new();
517        attributes.insert("border".to_string(), "1".to_string());
518        attributes.insert("cellpadding".to_string(), "2".to_string());
519        attributes
520    }
521}
522
523impl CsvTableParser {
524    /// Create new CSV parser
525    pub fn new() -> Self {
526        Self {
527            delimiter_detector: DelimiterDetector,
528            quote_char: '"',
529            escape_char: Some('\\'),
530        }
531    }
532
533    /// Parse CSV content
534    pub fn parse(&self, csv: &str) -> RragResult<ExtractedTable> {
535        let delimiter = self.delimiter_detector.detect(csv);
536        let lines: Vec<&str> = csv.lines().collect();
537
538        if lines.is_empty() {
539            return Err(RragError::document_processing("Empty CSV content"));
540        }
541
542        // Parse header
543        let headers: Vec<String> = lines[0]
544            .split(delimiter)
545            .map(|s| s.trim().trim_matches(self.quote_char).to_string())
546            .collect();
547
548        // Parse rows
549        let mut rows = Vec::new();
550        for line in lines.iter().skip(1) {
551            let values: Vec<String> = line
552                .split(delimiter)
553                .map(|s| s.trim().trim_matches(self.quote_char).to_string())
554                .collect();
555
556            if values.len() == headers.len() {
557                let row: Vec<TableCell> = values
558                    .into_iter()
559                    .map(|value| {
560                        let data_type = self.infer_type(&value);
561                        TableCell {
562                            value,
563                            data_type,
564                            formatting: None,
565                        }
566                    })
567                    .collect();
568
569                rows.push(row);
570            }
571        }
572
573        // Infer column types
574        let column_types = self.infer_column_types(&rows, headers.len());
575
576        let table_id = format!(
577            "csv_table_{}",
578            uuid::Uuid::new_v4().to_string().split('-').next().unwrap()
579        );
580
581        Ok(ExtractedTable {
582            id: table_id,
583            headers,
584            rows,
585            summary: None,
586            column_types,
587            embedding: None,
588            statistics: None,
589        })
590    }
591
592    /// Infer data type from value
593    fn infer_type(&self, value: &str) -> DataType {
594        if value.trim().is_empty() {
595            return DataType::String;
596        }
597
598        // Try parsing as number
599        if value.parse::<f64>().is_ok() {
600            return DataType::Number;
601        }
602
603        // Try parsing as date
604        if self.is_date_like(value) {
605            return DataType::Date;
606        }
607
608        // Try parsing as boolean
609        if matches!(
610            value.to_lowercase().as_str(),
611            "true" | "false" | "yes" | "no" | "1" | "0"
612        ) {
613            return DataType::Boolean;
614        }
615
616        DataType::String
617    }
618
619    /// Check if value looks like a date
620    fn is_date_like(&self, value: &str) -> bool {
621        // Simple date pattern matching
622        let date_patterns = [
623            r"\d{4}-\d{2}-\d{2}", // YYYY-MM-DD
624            r"\d{2}/\d{2}/\d{4}", // MM/DD/YYYY
625            r"\d{2}-\d{2}-\d{4}", // MM-DD-YYYY
626        ];
627
628        date_patterns.iter().any(|pattern| {
629            regex::Regex::new(pattern)
630                .map(|re| re.is_match(value))
631                .unwrap_or(false)
632        })
633    }
634
635    /// Infer column types from all rows
636    fn infer_column_types(&self, rows: &[Vec<TableCell>], num_cols: usize) -> Vec<DataType> {
637        let mut column_types = vec![DataType::String; num_cols];
638
639        for col_idx in 0..num_cols {
640            let mut type_counts = HashMap::new();
641
642            for row in rows {
643                if let Some(cell) = row.get(col_idx) {
644                    *type_counts.entry(cell.data_type).or_insert(0) += 1;
645                }
646            }
647
648            // Choose most common type
649            if let Some((&most_common_type, _)) = type_counts.iter().max_by_key(|(_, &count)| count)
650            {
651                column_types[col_idx] = most_common_type;
652            }
653        }
654
655        column_types
656    }
657}
658
659impl DelimiterDetector {
660    /// Detect CSV delimiter
661    pub fn detect(&self, csv: &str) -> char {
662        let first_line = csv.lines().next().unwrap_or("");
663
664        let comma_count = first_line.chars().filter(|&c| c == ',').count();
665        let semicolon_count = first_line.chars().filter(|&c| c == ';').count();
666        let tab_count = first_line.chars().filter(|&c| c == '\t').count();
667        let pipe_count = first_line.chars().filter(|&c| c == '|').count();
668
669        if comma_count >= semicolon_count && comma_count >= tab_count && comma_count >= pipe_count {
670            ','
671        } else if semicolon_count >= tab_count && semicolon_count >= pipe_count {
672            ';'
673        } else if tab_count >= pipe_count {
674            '\t'
675        } else {
676            '|'
677        }
678    }
679}
680
681impl MarkdownTableParser {
682    /// Create new Markdown parser
683    pub fn new() -> Self {
684        Self
685    }
686
687    /// Parse Markdown tables
688    pub fn parse(&self, markdown: &str) -> RragResult<Vec<ExtractedTable>> {
689        let mut tables = Vec::new();
690        let lines: Vec<&str> = markdown.lines().collect();
691
692        let mut i = 0;
693        while i < lines.len() {
694            if self.is_table_start(&lines[i..]) {
695                let table = self.parse_single_table(&lines[i..])?;
696                tables.push(table.0);
697                i += table.1; // Skip processed lines
698            } else {
699                i += 1;
700            }
701        }
702
703        Ok(tables)
704    }
705
706    /// Check if lines start a table
707    fn is_table_start(&self, lines: &[&str]) -> bool {
708        if lines.len() < 2 {
709            return false;
710        }
711
712        // Check for table header separator
713        lines[1]
714            .chars()
715            .all(|c| c.is_whitespace() || c == '|' || c == '-' || c == ':')
716    }
717
718    /// Parse single Markdown table
719    fn parse_single_table(&self, lines: &[&str]) -> RragResult<(ExtractedTable, usize)> {
720        let mut table_lines = Vec::new();
721        let mut line_count = 0;
722
723        // Collect table lines
724        for &line in lines {
725            if line.contains('|') {
726                table_lines.push(line);
727                line_count += 1;
728            } else if !table_lines.is_empty() {
729                break;
730            }
731        }
732
733        if table_lines.len() < 2 {
734            return Err(RragError::document_processing("Invalid Markdown table"));
735        }
736
737        // Parse headers
738        let headers: Vec<String> = table_lines[0]
739            .split('|')
740            .map(|s| s.trim().to_string())
741            .filter(|s| !s.is_empty())
742            .collect();
743
744        // Skip separator line (index 1)
745
746        // Parse data rows
747        let mut rows = Vec::new();
748        for &line in table_lines.iter().skip(2) {
749            let values: Vec<String> = line
750                .split('|')
751                .map(|s| s.trim().to_string())
752                .filter(|s| !s.is_empty())
753                .collect();
754
755            if values.len() == headers.len() {
756                let row: Vec<TableCell> = values
757                    .into_iter()
758                    .map(|value| {
759                        let data_type = self.infer_type(&value);
760                        TableCell {
761                            value,
762                            data_type,
763                            formatting: None,
764                        }
765                    })
766                    .collect();
767
768                rows.push(row);
769            }
770        }
771
772        let column_types = vec![DataType::String; headers.len()]; // Simplified
773        let table_id = format!(
774            "md_table_{}",
775            uuid::Uuid::new_v4().to_string().split('-').next().unwrap()
776        );
777
778        let table = ExtractedTable {
779            id: table_id,
780            headers,
781            rows,
782            summary: None,
783            column_types,
784            embedding: None,
785            statistics: None,
786        };
787
788        Ok((table, line_count))
789    }
790
791    /// Infer data type from Markdown cell
792    fn infer_type(&self, value: &str) -> DataType {
793        // Simplified type inference
794        if value.parse::<f64>().is_ok() {
795            DataType::Number
796        } else {
797            DataType::String
798        }
799    }
800}
801
802impl StatisticsCalculator {
803    /// Create new statistics calculator
804    pub fn new() -> Self {
805        Self
806    }
807
808    /// Calculate table statistics
809    pub fn calculate(&self, table: &ExtractedTable) -> RragResult<TableStatistics> {
810        let row_count = table.rows.len();
811        let column_count = table.headers.len();
812
813        // Calculate null percentages
814        let mut null_percentages = Vec::new();
815        for col_idx in 0..column_count {
816            let null_count = table
817                .rows
818                .iter()
819                .filter(|row| {
820                    row.get(col_idx)
821                        .map(|cell| cell.value.trim().is_empty())
822                        .unwrap_or(true)
823                })
824                .count();
825
826            let null_percentage = if row_count > 0 {
827                null_count as f32 / row_count as f32
828            } else {
829                0.0
830            };
831
832            null_percentages.push(null_percentage);
833        }
834
835        // Calculate column statistics
836        let mut column_stats = Vec::new();
837        for (col_idx, header) in table.headers.iter().enumerate() {
838            let values: Vec<String> = table
839                .rows
840                .iter()
841                .filter_map(|row| row.get(col_idx))
842                .map(|cell| cell.value.clone())
843                .collect();
844
845            let unique_count = values
846                .iter()
847                .collect::<std::collections::HashSet<_>>()
848                .len();
849
850            let numeric_stats = if table.column_types.get(col_idx) == Some(&DataType::Number) {
851                self.calculate_numeric_stats(&values)
852            } else {
853                None
854            };
855
856            let text_stats = if table.column_types.get(col_idx) == Some(&DataType::String) {
857                Some(self.calculate_text_stats(&values))
858            } else {
859                None
860            };
861
862            column_stats.push(ColumnStatistics {
863                name: header.clone(),
864                numeric_stats,
865                text_stats,
866                unique_count,
867            });
868        }
869
870        Ok(TableStatistics {
871            row_count,
872            column_count,
873            null_percentages,
874            column_stats,
875        })
876    }
877
878    /// Calculate numeric statistics
879    fn calculate_numeric_stats(&self, values: &[String]) -> Option<NumericStatistics> {
880        let numbers: Vec<f64> = values.iter().filter_map(|s| s.parse().ok()).collect();
881
882        if numbers.is_empty() {
883            return None;
884        }
885
886        let min = numbers.iter().fold(f64::INFINITY, |a, &b| a.min(b));
887        let max = numbers.iter().fold(f64::NEG_INFINITY, |a, &b| a.max(b));
888        let mean = numbers.iter().sum::<f64>() / numbers.len() as f64;
889
890        let mut sorted_numbers = numbers.clone();
891        sorted_numbers.sort_by(|a, b| a.partial_cmp(b).unwrap());
892        let median = if sorted_numbers.len() % 2 == 0 {
893            (sorted_numbers[sorted_numbers.len() / 2 - 1]
894                + sorted_numbers[sorted_numbers.len() / 2])
895                / 2.0
896        } else {
897            sorted_numbers[sorted_numbers.len() / 2]
898        };
899
900        let variance =
901            numbers.iter().map(|&x| (x - mean).powi(2)).sum::<f64>() / numbers.len() as f64;
902        let std_dev = variance.sqrt();
903
904        Some(NumericStatistics {
905            min,
906            max,
907            mean,
908            median,
909            std_dev,
910        })
911    }
912
913    /// Calculate text statistics
914    fn calculate_text_stats(&self, values: &[String]) -> TextStatistics {
915        let lengths: Vec<usize> = values.iter().map(|s| s.len()).collect();
916
917        let min_length = lengths.iter().min().copied().unwrap_or(0);
918        let max_length = lengths.iter().max().copied().unwrap_or(0);
919        let avg_length = if !lengths.is_empty() {
920            lengths.iter().sum::<usize>() as f32 / lengths.len() as f32
921        } else {
922            0.0
923        };
924
925        // Count occurrences
926        let mut counts = HashMap::new();
927        for value in values {
928            *counts.entry(value.clone()).or_insert(0) += 1;
929        }
930
931        let mut most_common: Vec<(String, usize)> = counts.into_iter().collect();
932        most_common.sort_by(|a, b| b.1.cmp(&a.1));
933        most_common.truncate(5); // Top 5
934
935        TextStatistics {
936            min_length,
937            max_length,
938            avg_length,
939            most_common,
940        }
941    }
942}
943
944impl TypeInferrer {
945    /// Create new type inferrer
946    pub fn new() -> Self {
947        Self
948    }
949
950    /// Check if value matches expected type
951    pub fn matches_type(&self, value: &str, expected_type: DataType) -> bool {
952        match expected_type {
953            DataType::String => true, // Any value can be a string
954            DataType::Number => value.parse::<f64>().is_ok(),
955            DataType::Date => self.is_date_like(value),
956            DataType::Boolean => matches!(
957                value.to_lowercase().as_str(),
958                "true" | "false" | "yes" | "no" | "1" | "0"
959            ),
960            DataType::Mixed => true, // Mixed type accepts anything
961        }
962    }
963
964    /// Check if value looks like a date
965    fn is_date_like(&self, value: &str) -> bool {
966        // Basic date pattern matching
967        let patterns = [
968            r"^\d{4}-\d{2}-\d{2}$",
969            r"^\d{2}/\d{2}/\d{4}$",
970            r"^\d{2}-\d{2}-\d{4}$",
971        ];
972
973        patterns.iter().any(|pattern| {
974            regex::Regex::new(pattern)
975                .map(|re| re.is_match(value))
976                .unwrap_or(false)
977        })
978    }
979}
980
981impl TableSummaryGenerator {
982    /// Create new summary generator
983    pub fn new() -> Self {
984        let mut templates = HashMap::new();
985        templates.insert(
986            SummaryType::Brief,
987            "Table with {row_count} rows and {col_count} columns. Columns: {headers}.".to_string(),
988        );
989        templates.insert(
990            SummaryType::Detailed,
991            "This table contains {row_count} rows and {col_count} columns. The columns are: {headers}. {additional_info}".to_string()
992        );
993
994        Self {
995            templates,
996            strategy: SummaryStrategy::TemplateBase,
997        }
998    }
999
1000    /// Generate table summary
1001    pub fn generate(
1002        &self,
1003        table: &ExtractedTable,
1004        summary_type: SummaryType,
1005    ) -> RragResult<String> {
1006        match self.strategy {
1007            SummaryStrategy::TemplateBase => self.generate_template_based(table, summary_type),
1008            SummaryStrategy::MLGenerated => self.generate_ml_based(table),
1009            SummaryStrategy::Hybrid => self.generate_hybrid(table, summary_type),
1010        }
1011    }
1012
1013    /// Generate template-based summary
1014    fn generate_template_based(
1015        &self,
1016        table: &ExtractedTable,
1017        summary_type: SummaryType,
1018    ) -> RragResult<String> {
1019        let template = self
1020            .templates
1021            .get(&summary_type)
1022            .ok_or_else(|| RragError::configuration("Summary template not found"))?;
1023
1024        let summary = template
1025            .replace("{row_count}", &table.rows.len().to_string())
1026            .replace("{col_count}", &table.headers.len().to_string())
1027            .replace("{headers}", &table.headers.join(", "));
1028
1029        Ok(summary)
1030    }
1031
1032    /// Generate ML-based summary (placeholder)
1033    fn generate_ml_based(&self, _table: &ExtractedTable) -> RragResult<String> {
1034        // Placeholder for ML-generated summary
1035        Ok("ML-generated summary would go here".to_string())
1036    }
1037
1038    /// Generate hybrid summary
1039    fn generate_hybrid(
1040        &self,
1041        table: &ExtractedTable,
1042        summary_type: SummaryType,
1043    ) -> RragResult<String> {
1044        let base_summary = self.generate_template_based(table, summary_type)?;
1045        // Could enhance with ML-generated insights
1046        Ok(base_summary)
1047    }
1048}
1049
1050impl Default for HtmlParserConfig {
1051    fn default() -> Self {
1052        Self {
1053            extract_headers: true,
1054            preserve_formatting: true,
1055            handle_merges: true,
1056            max_cells: 10000,
1057        }
1058    }
1059}
1060
#[cfg(test)]
mod tests {
    use super::*;

    /// Builds a processor from the default extraction configuration.
    fn default_processor() -> DefaultTableProcessor {
        DefaultTableProcessor::new(TableExtractionConfig::default()).unwrap()
    }

    /// A processor built from the default config keeps the default minimums.
    #[test]
    fn test_table_processor_creation() {
        let processor = default_processor();

        assert_eq!(processor.config.min_rows, 2);
        assert_eq!(processor.config.min_cols, 2);
    }

    /// Each supported input syntax is recognised as its own source format.
    #[test]
    fn test_format_detection() {
        let processor = default_processor();

        let from_html = processor
            .detect_format("<table><tr><td>test</td></tr></table>")
            .unwrap();
        assert!(matches!(from_html, SourceFormat::Html));

        let from_csv = processor
            .detect_format("name,age,city\nJohn,25,NYC")
            .unwrap();
        assert!(matches!(from_csv, SourceFormat::Csv));

        let from_markdown = processor
            .detect_format("| Name | Age |\n|------|-----|\n| John | 25 |")
            .unwrap();
        assert!(matches!(from_markdown, SourceFormat::Markdown));
    }

    /// The detector picks the separator character actually used in the line.
    #[test]
    fn test_delimiter_detection() {
        let detector = DelimiterDetector;
        let cases = [("a,b,c", ','), ("a;b;c", ';'), ("a\tb\tc", '\t'), ("a|b|c", '|')];

        for &(line, expected) in cases.iter() {
            assert_eq!(detector.detect(line), expected);
        }
    }

    /// Representative values are accepted for their matching data type.
    #[test]
    fn test_type_inference() {
        let inferrer = TypeInferrer::new();
        let cases = vec![
            ("123", DataType::Number),
            ("hello", DataType::String),
            ("true", DataType::Boolean),
            ("2023-01-01", DataType::Date),
        ];

        for (value, expected_type) in cases {
            assert!(inferrer.matches_type(value, expected_type));
        }
    }

    /// Min, max and mean are computed from string-encoded numbers.
    #[test]
    fn test_statistics_calculation() {
        let calculator = StatisticsCalculator::new();
        let values: Vec<String> = ["1", "2", "3"].iter().map(|v| v.to_string()).collect();

        let stats = calculator.calculate_numeric_stats(&values).unwrap();

        assert_eq!(stats.min, 1.0);
        assert_eq!(stats.max, 3.0);
        assert_eq!(stats.mean, 2.0);
    }
}
rexis_rag/multimodal/table_processor.rs

rexis_rag/multimodal/
table_processor.rs