1use super::{
6 ColumnStatistics, DataType, ExtractedTable, NumericStatistics, TableCell,
7 TableExtractionConfig, TableProcessor, TableStatistics, TextStatistics,
8};
9use crate::{RragError, RragResult};
10use serde::{Deserialize, Serialize};
11use std::collections::HashMap;
12
13pub struct DefaultTableProcessor {
15 config: TableExtractionConfig,
17
18 html_parser: HtmlTableParser,
20
21 csv_parser: CsvTableParser,
23
24 markdown_parser: MarkdownTableParser,
26
27 stats_calculator: StatisticsCalculator,
29
30 type_inferrer: TypeInferrer,
32
33 summary_generator: TableSummaryGenerator,
35}
36
37pub struct HtmlTableParser {
39 config: HtmlParserConfig,
41}
42
/// Options for [`HtmlTableParser`].
///
/// The `Default` implementation in this file turns every flag on and sets
/// `max_cells` to 10000.
#[derive(Debug, Clone)]
pub struct HtmlParserConfig {
    /// Header-extraction flag (presumably: treat header cells as column names).
    pub extract_headers: bool,

    /// Formatting-preservation flag (presumably: keep cell formatting details).
    pub preserve_formatting: bool,

    /// Merged-cell handling flag (presumably: account for spanning cells).
    pub handle_merges: bool,

    /// Cell-count limit (presumably: the most cells a table may contain).
    pub max_cells: usize,
}
58
59pub struct CsvTableParser {
61 delimiter_detector: DelimiterDetector,
63
64 quote_char: char,
66
67 escape_char: Option<char>,
69}
70
/// Stateless helper that guesses which character separates the fields of
/// delimiter-separated text.
///
/// Being a unit type it is trivially `Debug`, `Clone`, `Copy` and `Default`.
#[derive(Debug, Clone, Copy, Default)]
pub struct DelimiterDetector;
73
/// Stateless parser for Markdown pipe tables.
///
/// Being a unit type it is trivially `Debug`, `Clone`, `Copy` and `Default`;
/// `Default` complements the argument-less `new()` constructor.
#[derive(Debug, Clone, Copy, Default)]
pub struct MarkdownTableParser;
76
/// Stateless helper that computes table- and column-level statistics.
///
/// Being a unit type it is trivially `Debug`, `Clone`, `Copy` and `Default`;
/// `Default` complements the argument-less `new()` constructor.
#[derive(Debug, Clone, Copy, Default)]
pub struct StatisticsCalculator;
79
/// Stateless helper that checks whether a textual cell value is compatible
/// with a given data type.
///
/// Being a unit type it is trivially `Debug`, `Clone`, `Copy` and `Default`;
/// `Default` complements the argument-less `new()` constructor.
#[derive(Debug, Clone, Copy, Default)]
pub struct TypeInferrer;
82
83pub struct TableSummaryGenerator {
85 templates: HashMap<SummaryType, String>,
87
88 strategy: SummaryStrategy,
90}
91
/// Kind of summary a caller can request; also the key of the template map in
/// [`TableSummaryGenerator`].
#[derive(Debug, Clone, Copy, Hash, PartialEq, Eq)]
pub enum SummaryType {
    /// Short summary; the default generator registers a template for it.
    Brief,
    /// Longer summary; the default generator registers a template for it.
    Detailed,
    /// Statistics-oriented summary (no template is registered in this file).
    Statistical,
    /// Narrative summary (no template is registered in this file).
    Narrative,
}
100
/// Strategy [`TableSummaryGenerator`] uses to produce a summary.
///
/// `PartialEq`/`Eq` are derived so strategies can be compared with `==`.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum SummaryStrategy {
    /// Fill in a stored text template.
    TemplateBase,
    /// Produce the summary with an ML model.
    MLGenerated,
    /// Combine the template and ML approaches.
    Hybrid,
}
108
109#[derive(Debug, Clone)]
111pub struct TableParseResult {
112 pub tables: Vec<ExtractedTable>,
114
115 pub confidence: f32,
117
118 pub metadata: ParseMetadata,
120
121 pub warnings: Vec<String>,
123}
124
125#[derive(Debug, Clone)]
127pub struct ParseMetadata {
128 pub parser_type: ParserType,
130
131 pub processing_time_ms: u64,
133
134 pub detected_format: SourceFormat,
136
137 pub structure_confidence: f32,
139}
140
/// Identifies the parser that produced a [`TableParseResult`].
///
/// `PartialEq`/`Eq` are derived so values can be compared with `==` instead
/// of requiring `matches!`.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum ParserType {
    /// HTML table parser.
    Html,
    /// Delimiter-separated text parser.
    Csv,
    /// Markdown pipe-table parser.
    Markdown,
    /// Excel parser (no such parser is defined in this file).
    Excel,
    /// Parser chosen automatically.
    Auto,
}
150
/// Input format recognised by format detection.
///
/// `PartialEq`/`Eq` are derived so values can be compared with `==` instead
/// of requiring `matches!`.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum SourceFormat {
    /// HTML markup.
    Html,
    /// Comma- (or semicolon-) separated values.
    Csv,
    /// Tab-separated values.
    Tsv,
    /// Markdown pipe table.
    Markdown,
    /// Excel workbook (never produced by the detection code in this file).
    Excel,
    /// No known format was recognised.
    Unknown,
}
161
162#[derive(Debug, Clone, Serialize, Deserialize)]
164pub struct TableQuality {
165 pub completeness: f32,
167
168 pub consistency: f32,
170
171 pub structure_quality: f32,
173
174 pub data_quality: f32,
176
177 pub overall_quality: f32,
179
180 pub issues: Vec<QualityIssue>,
182}
183
184#[derive(Debug, Clone, Serialize, Deserialize)]
186pub struct QualityIssue {
187 pub issue_type: QualityIssueType,
189
190 pub description: String,
192
193 pub severity: IssueSeverity,
195
196 pub location: Option<CellLocation>,
198}
199
200#[derive(Debug, Clone, Copy, Serialize, Deserialize)]
202pub enum QualityIssueType {
203 MissingValues,
204 InconsistentTypes,
205 DuplicateRows,
206 InvalidData,
207 StructuralIssues,
208 EncodingIssues,
209}
210
211#[derive(Debug, Clone, Copy, Serialize, Deserialize)]
213pub enum IssueSeverity {
214 Low,
215 Medium,
216 High,
217 Critical,
218}
219
220#[derive(Debug, Clone, Copy, Serialize, Deserialize)]
222pub struct CellLocation {
223 pub row: usize,
224 pub column: usize,
225}
226
227impl DefaultTableProcessor {
228 pub fn new(config: TableExtractionConfig) -> RragResult<Self> {
230 let html_parser = HtmlTableParser::new(HtmlParserConfig::default());
231 let csv_parser = CsvTableParser::new();
232 let markdown_parser = MarkdownTableParser::new();
233 let stats_calculator = StatisticsCalculator::new();
234 let type_inferrer = TypeInferrer::new();
235 let summary_generator = TableSummaryGenerator::new();
236
237 Ok(Self {
238 config,
239 html_parser,
240 csv_parser,
241 markdown_parser,
242 stats_calculator,
243 type_inferrer,
244 summary_generator,
245 })
246 }
247
248 pub fn auto_parse(&self, content: &str) -> RragResult<TableParseResult> {
250 let detected_format = self.detect_format(content)?;
251
252 match detected_format {
253 SourceFormat::Html => self.parse_html_tables(content),
254 SourceFormat::Csv => self.parse_csv_table(content),
255 SourceFormat::Markdown => self.parse_markdown_tables(content),
256 _ => Err(RragError::document_processing("Unsupported table format")),
257 }
258 }
259
260 fn detect_format(&self, content: &str) -> RragResult<SourceFormat> {
262 if content.contains("<table") || content.contains("<tr") {
264 return Ok(SourceFormat::Html);
265 }
266
267 if content.contains('|')
269 && content
270 .lines()
271 .any(|line| line.chars().filter(|&c| c == '|').count() >= 2)
272 {
273 return Ok(SourceFormat::Markdown);
274 }
275
276 let comma_count = content.chars().filter(|&c| c == ',').count();
278 let tab_count = content.chars().filter(|&c| c == '\t').count();
279 let semicolon_count = content.chars().filter(|&c| c == ';').count();
280
281 if comma_count > tab_count && comma_count > semicolon_count {
282 Ok(SourceFormat::Csv)
283 } else if tab_count > comma_count && tab_count > semicolon_count {
284 Ok(SourceFormat::Tsv)
285 } else if semicolon_count > 0 {
286 Ok(SourceFormat::Csv) } else {
288 Ok(SourceFormat::Unknown)
289 }
290 }
291
292 fn parse_html_tables(&self, html: &str) -> RragResult<TableParseResult> {
294 let tables = self.html_parser.parse(html)?;
295
296 Ok(TableParseResult {
297 tables,
298 confidence: 0.9,
299 metadata: ParseMetadata {
300 parser_type: ParserType::Html,
301 processing_time_ms: 10,
302 detected_format: SourceFormat::Html,
303 structure_confidence: 0.9,
304 },
305 warnings: vec![],
306 })
307 }
308
309 fn parse_csv_table(&self, csv: &str) -> RragResult<TableParseResult> {
311 let table = self.csv_parser.parse(csv)?;
312
313 Ok(TableParseResult {
314 tables: vec![table],
315 confidence: 0.85,
316 metadata: ParseMetadata {
317 parser_type: ParserType::Csv,
318 processing_time_ms: 5,
319 detected_format: SourceFormat::Csv,
320 structure_confidence: 0.85,
321 },
322 warnings: vec![],
323 })
324 }
325
326 fn parse_markdown_tables(&self, markdown: &str) -> RragResult<TableParseResult> {
328 let tables = self.markdown_parser.parse(markdown)?;
329
330 Ok(TableParseResult {
331 tables,
332 confidence: 0.8,
333 metadata: ParseMetadata {
334 parser_type: ParserType::Markdown,
335 processing_time_ms: 8,
336 detected_format: SourceFormat::Markdown,
337 structure_confidence: 0.8,
338 },
339 warnings: vec![],
340 })
341 }
342
343 pub fn assess_quality(&self, table: &ExtractedTable) -> RragResult<TableQuality> {
345 let mut issues = Vec::new();
346
347 let total_cells = table.rows.len() * table.headers.len();
349 let empty_cells = table
350 .rows
351 .iter()
352 .flatten()
353 .filter(|cell| cell.value.trim().is_empty())
354 .count();
355
356 let completeness = 1.0 - (empty_cells as f32 / total_cells as f32);
357
358 if completeness < 0.8 {
359 issues.push(QualityIssue {
360 issue_type: QualityIssueType::MissingValues,
361 description: format!(
362 "High missing value rate: {:.1}%",
363 (1.0 - completeness) * 100.0
364 ),
365 severity: if completeness < 0.5 {
366 IssueSeverity::High
367 } else {
368 IssueSeverity::Medium
369 },
370 location: None,
371 });
372 }
373
374 let mut consistency_score = 1.0;
376 for (col_idx, col_type) in table.column_types.iter().enumerate() {
377 let inconsistent_count = table
378 .rows
379 .iter()
380 .filter(|row| {
381 if let Some(cell) = row.get(col_idx) {
382 !self.type_inferrer.matches_type(&cell.value, *col_type)
383 } else {
384 false
385 }
386 })
387 .count();
388
389 if inconsistent_count > 0 {
390 consistency_score *= 1.0 - (inconsistent_count as f32 / table.rows.len() as f32);
391
392 if inconsistent_count as f32 / table.rows.len() as f32 > 0.1 {
393 issues.push(QualityIssue {
394 issue_type: QualityIssueType::InconsistentTypes,
395 description: format!("Column {} has inconsistent data types", col_idx),
396 severity: IssueSeverity::Medium,
397 location: None,
398 });
399 }
400 }
401 }
402
403 let structure_quality = if table.headers.is_empty() { 0.5 } else { 1.0 };
405
406 let data_quality = (completeness + consistency_score) / 2.0;
408
409 let overall_quality =
411 (completeness + consistency_score + structure_quality + data_quality) / 4.0;
412
413 Ok(TableQuality {
414 completeness,
415 consistency: consistency_score,
416 structure_quality,
417 data_quality,
418 overall_quality,
419 issues,
420 })
421 }
422}
423
424impl TableProcessor for DefaultTableProcessor {
425 fn extract_table(&self, content: &str) -> RragResult<Vec<ExtractedTable>> {
426 let parse_result = self.auto_parse(content)?;
427 Ok(parse_result.tables)
428 }
429
430 fn parse_structure(&self, table_html: &str) -> RragResult<ExtractedTable> {
431 let parse_result = self.html_parser.parse(table_html)?;
432 parse_result
433 .into_iter()
434 .next()
435 .ok_or_else(|| RragError::document_processing("No table found in HTML"))
436 }
437
438 fn generate_summary(&self, table: &ExtractedTable) -> RragResult<String> {
439 self.summary_generator.generate(table, SummaryType::Brief)
440 }
441
442 fn calculate_statistics(&self, table: &ExtractedTable) -> RragResult<TableStatistics> {
443 self.stats_calculator.calculate(table)
444 }
445}
446
447impl HtmlTableParser {
448 pub fn new(config: HtmlParserConfig) -> Self {
450 Self { config }
451 }
452
453 pub fn parse(&self, _html: &str) -> RragResult<Vec<ExtractedTable>> {
455 let table_id = format!(
457 "table_{}",
458 uuid::Uuid::new_v4().to_string().split('-').next().unwrap()
459 );
460
461 let headers = vec!["Name".to_string(), "Age".to_string(), "City".to_string()];
462
463 let rows = vec![
464 vec![
465 TableCell {
466 value: "John".to_string(),
467 data_type: DataType::String,
468 formatting: None,
469 },
470 TableCell {
471 value: "25".to_string(),
472 data_type: DataType::Number,
473 formatting: None,
474 },
475 TableCell {
476 value: "New York".to_string(),
477 data_type: DataType::String,
478 formatting: None,
479 },
480 ],
481 vec![
482 TableCell {
483 value: "Alice".to_string(),
484 data_type: DataType::String,
485 formatting: None,
486 },
487 TableCell {
488 value: "30".to_string(),
489 data_type: DataType::Number,
490 formatting: None,
491 },
492 TableCell {
493 value: "London".to_string(),
494 data_type: DataType::String,
495 formatting: None,
496 },
497 ],
498 ];
499
500 let column_types = vec![DataType::String, DataType::Number, DataType::String];
501
502 Ok(vec![ExtractedTable {
503 id: table_id,
504 headers,
505 rows,
506 summary: None,
507 column_types,
508 embedding: None,
509 statistics: None,
510 }])
511 }
512
513 pub fn extract_attributes(&self, _table_element: &str) -> HashMap<String, String> {
515 let mut attributes = HashMap::new();
517 attributes.insert("border".to_string(), "1".to_string());
518 attributes.insert("cellpadding".to_string(), "2".to_string());
519 attributes
520 }
521}
522
523impl CsvTableParser {
524 pub fn new() -> Self {
526 Self {
527 delimiter_detector: DelimiterDetector,
528 quote_char: '"',
529 escape_char: Some('\\'),
530 }
531 }
532
533 pub fn parse(&self, csv: &str) -> RragResult<ExtractedTable> {
535 let delimiter = self.delimiter_detector.detect(csv);
536 let lines: Vec<&str> = csv.lines().collect();
537
538 if lines.is_empty() {
539 return Err(RragError::document_processing("Empty CSV content"));
540 }
541
542 let headers: Vec<String> = lines[0]
544 .split(delimiter)
545 .map(|s| s.trim().trim_matches(self.quote_char).to_string())
546 .collect();
547
548 let mut rows = Vec::new();
550 for line in lines.iter().skip(1) {
551 let values: Vec<String> = line
552 .split(delimiter)
553 .map(|s| s.trim().trim_matches(self.quote_char).to_string())
554 .collect();
555
556 if values.len() == headers.len() {
557 let row: Vec<TableCell> = values
558 .into_iter()
559 .map(|value| {
560 let data_type = self.infer_type(&value);
561 TableCell {
562 value,
563 data_type,
564 formatting: None,
565 }
566 })
567 .collect();
568
569 rows.push(row);
570 }
571 }
572
573 let column_types = self.infer_column_types(&rows, headers.len());
575
576 let table_id = format!(
577 "csv_table_{}",
578 uuid::Uuid::new_v4().to_string().split('-').next().unwrap()
579 );
580
581 Ok(ExtractedTable {
582 id: table_id,
583 headers,
584 rows,
585 summary: None,
586 column_types,
587 embedding: None,
588 statistics: None,
589 })
590 }
591
592 fn infer_type(&self, value: &str) -> DataType {
594 if value.trim().is_empty() {
595 return DataType::String;
596 }
597
598 if value.parse::<f64>().is_ok() {
600 return DataType::Number;
601 }
602
603 if self.is_date_like(value) {
605 return DataType::Date;
606 }
607
608 if matches!(
610 value.to_lowercase().as_str(),
611 "true" | "false" | "yes" | "no" | "1" | "0"
612 ) {
613 return DataType::Boolean;
614 }
615
616 DataType::String
617 }
618
619 fn is_date_like(&self, value: &str) -> bool {
621 let date_patterns = [
623 r"\d{4}-\d{2}-\d{2}", r"\d{2}/\d{2}/\d{4}", r"\d{2}-\d{2}-\d{4}", ];
627
628 date_patterns.iter().any(|pattern| {
629 regex::Regex::new(pattern)
630 .map(|re| re.is_match(value))
631 .unwrap_or(false)
632 })
633 }
634
635 fn infer_column_types(&self, rows: &[Vec<TableCell>], num_cols: usize) -> Vec<DataType> {
637 let mut column_types = vec![DataType::String; num_cols];
638
639 for col_idx in 0..num_cols {
640 let mut type_counts = HashMap::new();
641
642 for row in rows {
643 if let Some(cell) = row.get(col_idx) {
644 *type_counts.entry(cell.data_type).or_insert(0) += 1;
645 }
646 }
647
648 if let Some((&most_common_type, _)) = type_counts.iter().max_by_key(|(_, &count)| count)
650 {
651 column_types[col_idx] = most_common_type;
652 }
653 }
654
655 column_types
656 }
657}
658
659impl DelimiterDetector {
660 pub fn detect(&self, csv: &str) -> char {
662 let first_line = csv.lines().next().unwrap_or("");
663
664 let comma_count = first_line.chars().filter(|&c| c == ',').count();
665 let semicolon_count = first_line.chars().filter(|&c| c == ';').count();
666 let tab_count = first_line.chars().filter(|&c| c == '\t').count();
667 let pipe_count = first_line.chars().filter(|&c| c == '|').count();
668
669 if comma_count >= semicolon_count && comma_count >= tab_count && comma_count >= pipe_count {
670 ','
671 } else if semicolon_count >= tab_count && semicolon_count >= pipe_count {
672 ';'
673 } else if tab_count >= pipe_count {
674 '\t'
675 } else {
676 '|'
677 }
678 }
679}
680
681impl MarkdownTableParser {
682 pub fn new() -> Self {
684 Self
685 }
686
687 pub fn parse(&self, markdown: &str) -> RragResult<Vec<ExtractedTable>> {
689 let mut tables = Vec::new();
690 let lines: Vec<&str> = markdown.lines().collect();
691
692 let mut i = 0;
693 while i < lines.len() {
694 if self.is_table_start(&lines[i..]) {
695 let table = self.parse_single_table(&lines[i..])?;
696 tables.push(table.0);
697 i += table.1; } else {
699 i += 1;
700 }
701 }
702
703 Ok(tables)
704 }
705
706 fn is_table_start(&self, lines: &[&str]) -> bool {
708 if lines.len() < 2 {
709 return false;
710 }
711
712 lines[1]
714 .chars()
715 .all(|c| c.is_whitespace() || c == '|' || c == '-' || c == ':')
716 }
717
718 fn parse_single_table(&self, lines: &[&str]) -> RragResult<(ExtractedTable, usize)> {
720 let mut table_lines = Vec::new();
721 let mut line_count = 0;
722
723 for &line in lines {
725 if line.contains('|') {
726 table_lines.push(line);
727 line_count += 1;
728 } else if !table_lines.is_empty() {
729 break;
730 }
731 }
732
733 if table_lines.len() < 2 {
734 return Err(RragError::document_processing("Invalid Markdown table"));
735 }
736
737 let headers: Vec<String> = table_lines[0]
739 .split('|')
740 .map(|s| s.trim().to_string())
741 .filter(|s| !s.is_empty())
742 .collect();
743
744 let mut rows = Vec::new();
748 for &line in table_lines.iter().skip(2) {
749 let values: Vec<String> = line
750 .split('|')
751 .map(|s| s.trim().to_string())
752 .filter(|s| !s.is_empty())
753 .collect();
754
755 if values.len() == headers.len() {
756 let row: Vec<TableCell> = values
757 .into_iter()
758 .map(|value| {
759 let data_type = self.infer_type(&value);
760 TableCell {
761 value,
762 data_type,
763 formatting: None,
764 }
765 })
766 .collect();
767
768 rows.push(row);
769 }
770 }
771
772 let column_types = vec![DataType::String; headers.len()]; let table_id = format!(
774 "md_table_{}",
775 uuid::Uuid::new_v4().to_string().split('-').next().unwrap()
776 );
777
778 let table = ExtractedTable {
779 id: table_id,
780 headers,
781 rows,
782 summary: None,
783 column_types,
784 embedding: None,
785 statistics: None,
786 };
787
788 Ok((table, line_count))
789 }
790
791 fn infer_type(&self, value: &str) -> DataType {
793 if value.parse::<f64>().is_ok() {
795 DataType::Number
796 } else {
797 DataType::String
798 }
799 }
800}
801
802impl StatisticsCalculator {
803 pub fn new() -> Self {
805 Self
806 }
807
808 pub fn calculate(&self, table: &ExtractedTable) -> RragResult<TableStatistics> {
810 let row_count = table.rows.len();
811 let column_count = table.headers.len();
812
813 let mut null_percentages = Vec::new();
815 for col_idx in 0..column_count {
816 let null_count = table
817 .rows
818 .iter()
819 .filter(|row| {
820 row.get(col_idx)
821 .map(|cell| cell.value.trim().is_empty())
822 .unwrap_or(true)
823 })
824 .count();
825
826 let null_percentage = if row_count > 0 {
827 null_count as f32 / row_count as f32
828 } else {
829 0.0
830 };
831
832 null_percentages.push(null_percentage);
833 }
834
835 let mut column_stats = Vec::new();
837 for (col_idx, header) in table.headers.iter().enumerate() {
838 let values: Vec<String> = table
839 .rows
840 .iter()
841 .filter_map(|row| row.get(col_idx))
842 .map(|cell| cell.value.clone())
843 .collect();
844
845 let unique_count = values
846 .iter()
847 .collect::<std::collections::HashSet<_>>()
848 .len();
849
850 let numeric_stats = if table.column_types.get(col_idx) == Some(&DataType::Number) {
851 self.calculate_numeric_stats(&values)
852 } else {
853 None
854 };
855
856 let text_stats = if table.column_types.get(col_idx) == Some(&DataType::String) {
857 Some(self.calculate_text_stats(&values))
858 } else {
859 None
860 };
861
862 column_stats.push(ColumnStatistics {
863 name: header.clone(),
864 numeric_stats,
865 text_stats,
866 unique_count,
867 });
868 }
869
870 Ok(TableStatistics {
871 row_count,
872 column_count,
873 null_percentages,
874 column_stats,
875 })
876 }
877
878 fn calculate_numeric_stats(&self, values: &[String]) -> Option<NumericStatistics> {
880 let numbers: Vec<f64> = values.iter().filter_map(|s| s.parse().ok()).collect();
881
882 if numbers.is_empty() {
883 return None;
884 }
885
886 let min = numbers.iter().fold(f64::INFINITY, |a, &b| a.min(b));
887 let max = numbers.iter().fold(f64::NEG_INFINITY, |a, &b| a.max(b));
888 let mean = numbers.iter().sum::<f64>() / numbers.len() as f64;
889
890 let mut sorted_numbers = numbers.clone();
891 sorted_numbers.sort_by(|a, b| a.partial_cmp(b).unwrap());
892 let median = if sorted_numbers.len() % 2 == 0 {
893 (sorted_numbers[sorted_numbers.len() / 2 - 1]
894 + sorted_numbers[sorted_numbers.len() / 2])
895 / 2.0
896 } else {
897 sorted_numbers[sorted_numbers.len() / 2]
898 };
899
900 let variance =
901 numbers.iter().map(|&x| (x - mean).powi(2)).sum::<f64>() / numbers.len() as f64;
902 let std_dev = variance.sqrt();
903
904 Some(NumericStatistics {
905 min,
906 max,
907 mean,
908 median,
909 std_dev,
910 })
911 }
912
913 fn calculate_text_stats(&self, values: &[String]) -> TextStatistics {
915 let lengths: Vec<usize> = values.iter().map(|s| s.len()).collect();
916
917 let min_length = lengths.iter().min().copied().unwrap_or(0);
918 let max_length = lengths.iter().max().copied().unwrap_or(0);
919 let avg_length = if !lengths.is_empty() {
920 lengths.iter().sum::<usize>() as f32 / lengths.len() as f32
921 } else {
922 0.0
923 };
924
925 let mut counts = HashMap::new();
927 for value in values {
928 *counts.entry(value.clone()).or_insert(0) += 1;
929 }
930
931 let mut most_common: Vec<(String, usize)> = counts.into_iter().collect();
932 most_common.sort_by(|a, b| b.1.cmp(&a.1));
933 most_common.truncate(5); TextStatistics {
936 min_length,
937 max_length,
938 avg_length,
939 most_common,
940 }
941 }
942}
943
944impl TypeInferrer {
945 pub fn new() -> Self {
947 Self
948 }
949
950 pub fn matches_type(&self, value: &str, expected_type: DataType) -> bool {
952 match expected_type {
953 DataType::String => true, DataType::Number => value.parse::<f64>().is_ok(),
955 DataType::Date => self.is_date_like(value),
956 DataType::Boolean => matches!(
957 value.to_lowercase().as_str(),
958 "true" | "false" | "yes" | "no" | "1" | "0"
959 ),
960 DataType::Mixed => true, }
962 }
963
964 fn is_date_like(&self, value: &str) -> bool {
966 let patterns = [
968 r"^\d{4}-\d{2}-\d{2}$",
969 r"^\d{2}/\d{2}/\d{4}$",
970 r"^\d{2}-\d{2}-\d{4}$",
971 ];
972
973 patterns.iter().any(|pattern| {
974 regex::Regex::new(pattern)
975 .map(|re| re.is_match(value))
976 .unwrap_or(false)
977 })
978 }
979}
980
981impl TableSummaryGenerator {
982 pub fn new() -> Self {
984 let mut templates = HashMap::new();
985 templates.insert(
986 SummaryType::Brief,
987 "Table with {row_count} rows and {col_count} columns. Columns: {headers}.".to_string(),
988 );
989 templates.insert(
990 SummaryType::Detailed,
991 "This table contains {row_count} rows and {col_count} columns. The columns are: {headers}. {additional_info}".to_string()
992 );
993
994 Self {
995 templates,
996 strategy: SummaryStrategy::TemplateBase,
997 }
998 }
999
1000 pub fn generate(
1002 &self,
1003 table: &ExtractedTable,
1004 summary_type: SummaryType,
1005 ) -> RragResult<String> {
1006 match self.strategy {
1007 SummaryStrategy::TemplateBase => self.generate_template_based(table, summary_type),
1008 SummaryStrategy::MLGenerated => self.generate_ml_based(table),
1009 SummaryStrategy::Hybrid => self.generate_hybrid(table, summary_type),
1010 }
1011 }
1012
1013 fn generate_template_based(
1015 &self,
1016 table: &ExtractedTable,
1017 summary_type: SummaryType,
1018 ) -> RragResult<String> {
1019 let template = self
1020 .templates
1021 .get(&summary_type)
1022 .ok_or_else(|| RragError::configuration("Summary template not found"))?;
1023
1024 let summary = template
1025 .replace("{row_count}", &table.rows.len().to_string())
1026 .replace("{col_count}", &table.headers.len().to_string())
1027 .replace("{headers}", &table.headers.join(", "));
1028
1029 Ok(summary)
1030 }
1031
1032 fn generate_ml_based(&self, _table: &ExtractedTable) -> RragResult<String> {
1034 Ok("ML-generated summary would go here".to_string())
1036 }
1037
1038 fn generate_hybrid(
1040 &self,
1041 table: &ExtractedTable,
1042 summary_type: SummaryType,
1043 ) -> RragResult<String> {
1044 let base_summary = self.generate_template_based(table, summary_type)?;
1045 Ok(base_summary)
1047 }
1048}
1049
1050impl Default for HtmlParserConfig {
1051 fn default() -> Self {
1052 Self {
1053 extract_headers: true,
1054 preserve_formatting: true,
1055 handle_merges: true,
1056 max_cells: 10000,
1057 }
1058 }
1059}
1060
#[cfg(test)]
mod tests {
    use super::*;

    /// Builds a processor from the default extraction settings.
    fn default_processor() -> DefaultTableProcessor {
        DefaultTableProcessor::new(TableExtractionConfig::default()).unwrap()
    }

    /// The processor keeps the configuration it was constructed with.
    #[test]
    fn test_table_processor_creation() {
        let processor = default_processor();
        let config = &processor.config;

        assert_eq!(config.min_rows, 2);
        assert_eq!(config.min_cols, 2);
    }

    /// HTML, CSV and Markdown samples are each recognised as such.
    #[test]
    fn test_format_detection() {
        let processor = default_processor();
        let detect = |input: &str| processor.detect_format(input).unwrap();

        assert!(matches!(
            detect("<table><tr><td>test</td></tr></table>"),
            SourceFormat::Html
        ));
        assert!(matches!(
            detect("name,age,city\nJohn,25,NYC"),
            SourceFormat::Csv
        ));
        assert!(matches!(
            detect("| Name | Age |\n|------|-----|\n| John | 25 |"),
            SourceFormat::Markdown
        ));
    }

    /// Each supported delimiter is detected from a single line.
    #[test]
    fn test_delimiter_detection() {
        let detector = DelimiterDetector;
        let cases = [
            ("a,b,c", ','),
            ("a;b;c", ';'),
            ("a\tb\tc", '\t'),
            ("a|b|c", '|'),
        ];

        for &(input, expected) in cases.iter() {
            assert_eq!(detector.detect(input), expected);
        }
    }

    /// Representative values match their expected data types.
    #[test]
    fn test_type_inference() {
        let inferrer = TypeInferrer::new();
        let cases = [
            ("123", DataType::Number),
            ("hello", DataType::String),
            ("true", DataType::Boolean),
            ("2023-01-01", DataType::Date),
        ];

        for &(value, data_type) in cases.iter() {
            assert!(inferrer.matches_type(value, data_type));
        }
    }

    /// Min, max and mean of a small numeric column.
    #[test]
    fn test_statistics_calculation() {
        let calculator = StatisticsCalculator::new();
        let values: Vec<String> = ["1", "2", "3"].iter().map(|v| v.to_string()).collect();

        let stats = calculator.calculate_numeric_stats(&values).unwrap();
        assert_eq!(stats.min, 1.0);
        assert_eq!(stats.max, 3.0);
        assert_eq!(stats.mean, 2.0);
    }
}