use super::{
ColumnStatistics, DataType, ExtractedTable, NumericStatistics, TableCell,
TableExtractionConfig, TableProcessor, TableStatistics, TextStatistics,
};
use crate::{RragError, RragResult};
use serde::{Deserialize, Serialize};
use std::collections::HashMap;
/// Table processor that bundles one parser per supported text format together with
/// the statistics, type-checking and summary helpers defined in this file.
///
/// Built with `DefaultTableProcessor::new`; implements the `TableProcessor` trait.
pub struct DefaultTableProcessor {
/// Extraction settings passed to `new` and stored unchanged.
config: TableExtractionConfig,
/// Parser used for HTML input (`parse_html_tables`, `parse_structure`).
html_parser: HtmlTableParser,
/// Parser used for delimiter-separated input (`parse_csv_table`).
csv_parser: CsvTableParser,
/// Parser used for Markdown input (`parse_markdown_tables`).
markdown_parser: MarkdownTableParser,
/// Computes `TableStatistics` for `calculate_statistics`.
stats_calculator: StatisticsCalculator,
/// Checks cell values against declared column types in `assess_quality`.
type_inferrer: TypeInferrer,
/// Produces the text returned by `generate_summary`.
summary_generator: TableSummaryGenerator,
}
/// Parser for HTML tables.
///
/// NOTE(review): the `parse` implementation in this file does not read `config`.
pub struct HtmlTableParser {
/// Parser options supplied to `HtmlTableParser::new`.
config: HtmlParserConfig,
}
/// Options for [`HtmlTableParser`].
///
/// NOTE(review): no code in this file reads these fields; the per-field descriptions
/// below are inferred from the names and should be confirmed once the parser uses them.
#[derive(Debug, Clone)]
pub struct HtmlParserConfig {
/// Presumably: treat header cells as table headers. `true` in the `Default` impl.
pub extract_headers: bool,
/// Presumably: keep cell formatting information. `true` in the `Default` impl.
pub preserve_formatting: bool,
/// Presumably: resolve merged (row/col-span) cells. `true` in the `Default` impl.
pub handle_merges: bool,
/// Presumably: upper bound on the number of cells per table. `10000` in the `Default` impl.
pub max_cells: usize,
}
/// Parser for delimiter-separated text (comma, semicolon, tab or pipe).
pub struct CsvTableParser {
/// Chooses the field delimiter from the first line of the input.
delimiter_detector: DelimiterDetector,
/// Quote character stripped from around field values; `'"'` when built with `new`.
quote_char: char,
/// Escape character; `Some('\\')` when built with `new`.
/// NOTE(review): not read by `parse` in this file.
escape_char: Option<char>,
}
/// Stateless helper that picks a field delimiter (`,`, `;`, tab or `|`) by counting
/// how often each candidate occurs on the first line of the input.
pub struct DelimiterDetector;
/// Stateless parser for pipe-delimited Markdown tables.
pub struct MarkdownTableParser;
/// Stateless helper that computes per-table and per-column statistics
/// (`TableStatistics`) for an `ExtractedTable`.
pub struct StatisticsCalculator;
/// Stateless helper that checks whether a cell's text is compatible with a `DataType`.
pub struct TypeInferrer;
/// Generates human-readable summaries of extracted tables.
pub struct TableSummaryGenerator {
/// Template text per summary type, with `{row_count}`, `{col_count}` and `{headers}`
/// placeholders that are substituted when a summary is generated.
templates: HashMap<SummaryType, String>,
/// Which generation path `generate` dispatches to.
strategy: SummaryStrategy,
}
/// Kind of summary requested from [`TableSummaryGenerator`]; used as the template key.
///
/// `TableSummaryGenerator::new` registers templates for `Brief` and `Detailed` only, so
/// template-based generation fails for `Statistical` and `Narrative`.
#[derive(Debug, Clone, Copy, Hash, PartialEq, Eq)]
pub enum SummaryType {
/// One-sentence summary: row count, column count and header names.
Brief,
/// Longer summary built from the "Detailed" template.
Detailed,
/// No template is registered for this variant in this file.
Statistical,
/// No template is registered for this variant in this file.
Narrative,
}
/// Generation strategy selected by [`TableSummaryGenerator`].
#[derive(Debug, Clone, Copy)]
pub enum SummaryStrategy {
/// Fill a stored template with values taken from the table.
TemplateBase,
/// Dispatches to `generate_ml_based`, which currently returns a fixed placeholder string.
MLGenerated,
/// Dispatches to `generate_hybrid`, which currently returns the template-based summary.
Hybrid,
}
/// Outcome of parsing a document for tables.
#[derive(Debug, Clone)]
pub struct TableParseResult {
/// Tables found in the input.
pub tables: Vec<ExtractedTable>,
/// Confidence value attached by the parse helper that produced this result.
pub confidence: f32,
/// Details about how the input was parsed.
pub metadata: ParseMetadata,
/// Non-fatal warnings; the helpers in this file always leave this empty.
pub warnings: Vec<String>,
}
/// Bookkeeping attached to a [`TableParseResult`].
#[derive(Debug, Clone)]
pub struct ParseMetadata {
/// Parser that produced the tables.
pub parser_type: ParserType,
/// Parse duration in milliseconds, as reported by the producing helper.
pub processing_time_ms: u64,
/// Source format the input was treated as.
pub detected_format: SourceFormat,
/// Confidence value for the recognised table structure, set by the producing helper.
pub structure_confidence: f32,
}
/// Identifies which parser produced a result.
///
/// Only `Html`, `Csv` and `Markdown` are constructed in this file.
#[derive(Debug, Clone, Copy)]
pub enum ParserType {
/// Result came from `HtmlTableParser`.
Html,
/// Result came from `CsvTableParser`.
Csv,
/// Result came from `MarkdownTableParser`.
Markdown,
/// Not produced by any code in this file.
Excel,
/// Not produced by any code in this file.
Auto,
}
/// Source format of a document, as classified by `DefaultTableProcessor::detect_format`.
#[derive(Debug, Clone, Copy)]
pub enum SourceFormat {
/// Input contains an HTML table or row tag.
Html,
/// Comma- or semicolon-separated text.
Csv,
/// Tab-separated text.
Tsv,
/// Input has at least one line with two or more `|` characters.
Markdown,
/// Never returned by `detect_format` in this file.
Excel,
/// None of the detection heuristics matched.
Unknown,
}
/// Quality scores computed by `DefaultTableProcessor::assess_quality`.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct TableQuality {
/// One minus the ratio of blank cells to `rows * headers`.
pub completeness: f32,
/// Product, over columns, of the share of rows whose cell matches the column type.
pub consistency: f32,
/// `0.5` when the table has no headers, `1.0` otherwise.
pub structure_quality: f32,
/// Mean of `completeness` and `consistency`.
pub data_quality: f32,
/// Mean of the four scores above.
pub overall_quality: f32,
/// Problems detected while scoring.
pub issues: Vec<QualityIssue>,
}
/// A single problem found while assessing a table.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct QualityIssue {
/// Category of the problem.
pub issue_type: QualityIssueType,
/// Human-readable description.
pub description: String,
/// How serious the problem is.
pub severity: IssueSeverity,
/// Affected cell, if any; `assess_quality` in this file always sets `None`.
pub location: Option<CellLocation>,
}
/// Category of a [`QualityIssue`].
///
/// `assess_quality` in this file only emits `MissingValues` and `InconsistentTypes`.
#[derive(Debug, Clone, Copy, Serialize, Deserialize)]
pub enum QualityIssueType {
/// Too many blank cells.
MissingValues,
/// A column holds values that do not match its declared type.
InconsistentTypes,
/// Not emitted by any code in this file.
DuplicateRows,
/// Not emitted by any code in this file.
InvalidData,
/// Not emitted by any code in this file.
StructuralIssues,
/// Not emitted by any code in this file.
EncodingIssues,
}
/// Severity of a [`QualityIssue`], listed from least to most severe.
///
/// `assess_quality` in this file only assigns `Medium` and `High`.
#[derive(Debug, Clone, Copy, Serialize, Deserialize)]
pub enum IssueSeverity {
Low,
Medium,
High,
Critical,
}
/// Position of a cell inside a table.
///
/// NOTE(review): no code in this file constructs this type, so whether the indices
/// are zero-based and whether `row` counts the header is not established here.
#[derive(Debug, Clone, Copy, Serialize, Deserialize)]
pub struct CellLocation {
/// Row index of the cell.
pub row: usize,
/// Column index of the cell.
pub column: usize,
}
impl DefaultTableProcessor {
/// Builds a processor with default-configured parsers and helpers.
///
/// `config` is stored as given. The HTML parser is created with
/// `HtmlParserConfig::default()`; every other component uses its own `new()`.
///
/// # Errors
/// Never fails today; the `RragResult` return type is kept for callers.
pub fn new(config: TableExtractionConfig) -> RragResult<Self> {
    let processor = Self {
        config,
        html_parser: HtmlTableParser::new(HtmlParserConfig::default()),
        csv_parser: CsvTableParser::new(),
        markdown_parser: MarkdownTableParser::new(),
        stats_calculator: StatisticsCalculator::new(),
        type_inferrer: TypeInferrer::new(),
        summary_generator: TableSummaryGenerator::new(),
    };
    Ok(processor)
}
/// Detects the format of `content` and parses it with the matching parser.
///
/// HTML, CSV and Markdown go to their dedicated parsers. Tab-separated input is
/// handed to the CSV parser, whose delimiter detection already recognises tabs;
/// the returned metadata is then tagged as `SourceFormat::Tsv`.
///
/// # Errors
/// Returns a document-processing error ("Unsupported table format") for
/// `SourceFormat::Excel` and `SourceFormat::Unknown`, and propagates any error
/// from format detection or from the selected parser.
pub fn auto_parse(&self, content: &str) -> RragResult<TableParseResult> {
    // The match is exhaustive on purpose: a new `SourceFormat` variant must be
    // routed explicitly instead of silently falling into an "unsupported" arm.
    match self.detect_format(content)? {
        SourceFormat::Html => self.parse_html_tables(content),
        SourceFormat::Csv => self.parse_csv_table(content),
        SourceFormat::Tsv => self.parse_csv_table(content).map(|mut result| {
            result.metadata.detected_format = SourceFormat::Tsv;
            result
        }),
        SourceFormat::Markdown => self.parse_markdown_tables(content),
        SourceFormat::Excel | SourceFormat::Unknown => {
            Err(RragError::document_processing("Unsupported table format"))
        }
    }
}
/// Classifies `content` using cheap textual heuristics, checked in this order:
///
/// 1. HTML: contains `<table` or `<tr`, compared ASCII-case-insensitively because
///    HTML tag names are case-insensitive (`<TABLE>` is as valid as `<table>`).
/// 2. Markdown: some line holds at least two `|` characters.
/// 3. Delimited text: whichever of comma / tab / semicolon is strictly the most
///    frequent decides between `Csv` and `Tsv`; if neither comma nor tab wins and
///    at least one semicolon is present the result is `Csv`.
///
/// Anything else is `SourceFormat::Unknown`. The function never returns `Err`; the
/// `RragResult` wrapper is kept for interface stability.
fn detect_format(&self, content: &str) -> RragResult<SourceFormat> {
    let haystack = content.as_bytes();
    let has_tag = |tag: &str| {
        haystack
            .windows(tag.len())
            .any(|window| window.eq_ignore_ascii_case(tag.as_bytes()))
    };
    if has_tag("<table") || has_tag("<tr") {
        return Ok(SourceFormat::Html);
    }

    if content
        .lines()
        .any(|line| line.matches('|').count() >= 2)
    {
        return Ok(SourceFormat::Markdown);
    }

    // Count all three candidate delimiters in a single pass over the content.
    let (mut comma_count, mut tab_count, mut semicolon_count) = (0usize, 0usize, 0usize);
    for c in content.chars() {
        match c {
            ',' => comma_count += 1,
            '\t' => tab_count += 1,
            ';' => semicolon_count += 1,
            _ => {}
        }
    }

    let format = if comma_count > tab_count && comma_count > semicolon_count {
        SourceFormat::Csv
    } else if tab_count > comma_count && tab_count > semicolon_count {
        SourceFormat::Tsv
    } else if semicolon_count > 0 {
        // Semicolon-separated files are parsed by the CSV parser as well.
        SourceFormat::Csv
    } else {
        SourceFormat::Unknown
    };
    Ok(format)
}
/// Parses `html` with the HTML parser and wraps the tables in a [`TableParseResult`].
///
/// `processing_time_ms` is the measured wall-clock duration of the parser call
/// (previously a hard-coded constant). Both confidence values are the fixed
/// heuristic `0.9`.
///
/// # Errors
/// Propagates any error returned by the HTML parser.
fn parse_html_tables(&self, html: &str) -> RragResult<TableParseResult> {
    let started = std::time::Instant::now();
    let tables = self.html_parser.parse(html)?;
    // Saturate instead of truncating if the u128 millisecond count ever exceeds u64.
    let processing_time_ms = started.elapsed().as_millis().min(u128::from(u64::MAX)) as u64;
    Ok(TableParseResult {
        tables,
        confidence: 0.9,
        metadata: ParseMetadata {
            parser_type: ParserType::Html,
            processing_time_ms,
            detected_format: SourceFormat::Html,
            structure_confidence: 0.9,
        },
        warnings: vec![],
    })
}
/// Parses `csv` with the CSV parser and wraps the single resulting table in a
/// [`TableParseResult`].
///
/// `processing_time_ms` is the measured wall-clock duration of the parser call
/// (previously a hard-coded constant). Both confidence values are the fixed
/// heuristic `0.85`.
///
/// # Errors
/// Propagates any error returned by the CSV parser (e.g. empty input).
fn parse_csv_table(&self, csv: &str) -> RragResult<TableParseResult> {
    let started = std::time::Instant::now();
    let table = self.csv_parser.parse(csv)?;
    // Saturate instead of truncating if the u128 millisecond count ever exceeds u64.
    let processing_time_ms = started.elapsed().as_millis().min(u128::from(u64::MAX)) as u64;
    Ok(TableParseResult {
        tables: vec![table],
        confidence: 0.85,
        metadata: ParseMetadata {
            parser_type: ParserType::Csv,
            processing_time_ms,
            detected_format: SourceFormat::Csv,
            structure_confidence: 0.85,
        },
        warnings: vec![],
    })
}
/// Parses `markdown` with the Markdown parser and wraps the tables in a
/// [`TableParseResult`].
///
/// `processing_time_ms` is the measured wall-clock duration of the parser call
/// (previously a hard-coded constant). Both confidence values are the fixed
/// heuristic `0.8`.
///
/// # Errors
/// Propagates any error returned by the Markdown parser.
fn parse_markdown_tables(&self, markdown: &str) -> RragResult<TableParseResult> {
    let started = std::time::Instant::now();
    let tables = self.markdown_parser.parse(markdown)?;
    // Saturate instead of truncating if the u128 millisecond count ever exceeds u64.
    let processing_time_ms = started.elapsed().as_millis().min(u128::from(u64::MAX)) as u64;
    Ok(TableParseResult {
        tables,
        confidence: 0.8,
        metadata: ParseMetadata {
            parser_type: ParserType::Markdown,
            processing_time_ms,
            detected_format: SourceFormat::Markdown,
            structure_confidence: 0.8,
        },
        warnings: vec![],
    })
}
pub fn assess_quality(&self, table: &ExtractedTable) -> RragResult<TableQuality> {
let mut issues = Vec::new();
let total_cells = table.rows.len() * table.headers.len();
let empty_cells = table
.rows
.iter()
.flatten()
.filter(|cell| cell.value.trim().is_empty())
.count();
let completeness = 1.0 - (empty_cells as f32 / total_cells as f32);
if completeness < 0.8 {
issues.push(QualityIssue {
issue_type: QualityIssueType::MissingValues,
description: format!(
"High missing value rate: {:.1}%",
(1.0 - completeness) * 100.0
),
severity: if completeness < 0.5 {
IssueSeverity::High
} else {
IssueSeverity::Medium
},
location: None,
});
}
let mut consistency_score = 1.0;
for (col_idx, col_type) in table.column_types.iter().enumerate() {
let inconsistent_count = table
.rows
.iter()
.filter(|row| {
if let Some(cell) = row.get(col_idx) {
!self.type_inferrer.matches_type(&cell.value, *col_type)
} else {
false
}
})
.count();
if inconsistent_count > 0 {
consistency_score *= 1.0 - (inconsistent_count as f32 / table.rows.len() as f32);
if inconsistent_count as f32 / table.rows.len() as f32 > 0.1 {
issues.push(QualityIssue {
issue_type: QualityIssueType::InconsistentTypes,
description: format!("Column {} has inconsistent data types", col_idx),
severity: IssueSeverity::Medium,
location: None,
});
}
}
}
let structure_quality = if table.headers.is_empty() { 0.5 } else { 1.0 };
let data_quality = (completeness + consistency_score) / 2.0;
let overall_quality =
(completeness + consistency_score + structure_quality + data_quality) / 4.0;
Ok(TableQuality {
completeness,
consistency: consistency_score,
structure_quality,
data_quality,
overall_quality,
issues,
})
}
}
/// `TableProcessor` implementation that delegates to the bundled parsers and helpers.
impl TableProcessor for DefaultTableProcessor {
    /// Auto-detects the format of `content` and returns every table found in it.
    fn extract_table(&self, content: &str) -> RragResult<Vec<ExtractedTable>> {
        self.auto_parse(content).map(|parsed| parsed.tables)
    }

    /// Parses `table_html` with the HTML parser and returns the first table.
    ///
    /// Fails with a document-processing error when the parser yields no table.
    fn parse_structure(&self, table_html: &str) -> RragResult<ExtractedTable> {
        let mut tables = self.html_parser.parse(table_html)?.into_iter();
        match tables.next() {
            Some(first) => Ok(first),
            None => Err(RragError::document_processing("No table found in HTML")),
        }
    }

    /// Produces a `SummaryType::Brief` summary of `table`.
    fn generate_summary(&self, table: &ExtractedTable) -> RragResult<String> {
        let summary_type = SummaryType::Brief;
        self.summary_generator.generate(table, summary_type)
    }

    /// Computes table and column statistics for `table`.
    fn calculate_statistics(&self, table: &ExtractedTable) -> RragResult<TableStatistics> {
        let calculator = &self.stats_calculator;
        calculator.calculate(table)
    }
}
impl HtmlTableParser {
pub fn new(config: HtmlParserConfig) -> Self {
Self { config }
}
pub fn parse(&self, _html: &str) -> RragResult<Vec<ExtractedTable>> {
let table_id = format!(
"table_{}",
uuid::Uuid::new_v4().to_string().split('-').next().unwrap()
);
let headers = vec!["Name".to_string(), "Age".to_string(), "City".to_string()];
let rows = vec![
vec![
TableCell {
value: "John".to_string(),
data_type: DataType::String,
formatting: None,
},
TableCell {
value: "25".to_string(),
data_type: DataType::Number,
formatting: None,
},
TableCell {
value: "New York".to_string(),
data_type: DataType::String,
formatting: None,
},
],
vec![
TableCell {
value: "Alice".to_string(),
data_type: DataType::String,
formatting: None,
},
TableCell {
value: "30".to_string(),
data_type: DataType::Number,
formatting: None,
},
TableCell {
value: "London".to_string(),
data_type: DataType::String,
formatting: None,
},
],
];
let column_types = vec![DataType::String, DataType::Number, DataType::String];
Ok(vec![ExtractedTable {
id: table_id,
headers,
rows,
summary: None,
column_types,
embedding: None,
statistics: None,
}])
}
pub fn extract_attributes(&self, _table_element: &str) -> HashMap<String, String> {
let mut attributes = HashMap::new();
attributes.insert("border".to_string(), "1".to_string());
attributes.insert("cellpadding".to_string(), "2".to_string());
attributes
}
}
impl CsvTableParser {
/// Creates a parser that uses `"` as the quote character and `\` as the escape
/// character, with delimiter auto-detection.
pub fn new() -> Self {
    let quote_char = '"';
    let escape_char = Some('\\');
    Self {
        delimiter_detector: DelimiterDetector,
        quote_char,
        escape_char,
    }
}
/// Parses delimiter-separated text into a single [`ExtractedTable`].
///
/// The delimiter is auto-detected from the first line, which also supplies the
/// headers (a leading UTF-8 byte-order mark is ignored). Every following line
/// becomes a row when it has exactly as many fields as there are headers; lines
/// with a different field count are skipped. Cell types are inferred per cell
/// and column types by majority vote.
///
/// Fields are split quote-aware (see `split_fields`), so a quoted field may
/// contain the delimiter. Quoted fields spanning several lines are not
/// supported because the input is processed line by line.
///
/// # Errors
/// Returns a document-processing error when `csv` contains no lines.
pub fn parse(&self, csv: &str) -> RragResult<ExtractedTable> {
    let delimiter = self.delimiter_detector.detect(csv);
    let mut lines = csv.lines();
    let header_line = lines
        .next()
        .ok_or_else(|| RragError::document_processing("Empty CSV content"))?;
    // Files exported by spreadsheet tools often start with a BOM, which would
    // otherwise become part of the first header name.
    let headers = self.split_fields(header_line.trim_start_matches('\u{feff}'), delimiter);

    let rows: Vec<Vec<TableCell>> = lines
        .map(|line| self.split_fields(line, delimiter))
        .filter(|values| values.len() == headers.len())
        .map(|values| {
            values
                .into_iter()
                .map(|value| {
                    let data_type = self.infer_type(&value);
                    TableCell {
                        value,
                        data_type,
                        formatting: None,
                    }
                })
                .collect()
        })
        .collect();

    let column_types = self.infer_column_types(&rows, headers.len());
    let table_id = format!(
        "csv_table_{}",
        uuid::Uuid::new_v4().to_string().split('-').next().unwrap()
    );
    Ok(ExtractedTable {
        id: table_id,
        headers,
        rows,
        summary: None,
        column_types,
        embedding: None,
        statistics: None,
    })
}

/// Splits one line into trimmed field values.
///
/// Lines without any quote character are split on `delimiter` and trimmed,
/// exactly as before. Otherwise a field whose first non-blank character is the
/// quote character is read as a quoted field: delimiters inside it are literal, a
/// doubled quote stands for one quote, and its content is kept verbatim. A quote
/// in the middle of an unquoted field is kept as a literal character. If a quoted
/// field is never closed, the line falls back to the plain split.
fn split_fields(&self, line: &str, delimiter: char) -> Vec<String> {
    let quote = self.quote_char;
    let plain_split = || -> Vec<String> {
        line.split(delimiter)
            .map(|s| s.trim().trim_matches(quote).to_string())
            .collect()
    };
    if !line.contains(quote) {
        return plain_split();
    }

    let finish = |field: String, was_quoted: bool| {
        if was_quoted {
            field
        } else {
            field.trim().to_string()
        }
    };

    let mut fields = Vec::new();
    let mut field = String::new();
    let mut in_quotes = false;
    let mut was_quoted = false;
    let mut chars = line.chars().peekable();
    while let Some(c) = chars.next() {
        if in_quotes {
            if c != quote {
                field.push(c);
            } else if chars.peek() == Some(&quote) {
                // Doubled quote inside a quoted field is an escaped quote.
                field.push(quote);
                chars.next();
            } else {
                in_quotes = false;
            }
        } else if c == delimiter {
            fields.push(finish(std::mem::take(&mut field), was_quoted));
            was_quoted = false;
        } else if c == quote && !was_quoted && field.trim().is_empty() {
            // Opening quote: discard the padding collected before it.
            field.clear();
            in_quotes = true;
            was_quoted = true;
        } else if was_quoted && c.is_whitespace() {
            // Padding between the closing quote and the next delimiter.
        } else {
            field.push(c);
        }
    }
    if in_quotes {
        // Unterminated quote: treat the line as unquoted rather than merging fields.
        return plain_split();
    }
    fields.push(finish(field, was_quoted));
    fields
}
/// Infers the type of a single cell value.
///
/// Checks are applied in this order: blank → `String`; parses as `f64` → `Number`;
/// contains a date-shaped substring → `Date`; one of `true` / `false` / `yes` /
/// `no` (ASCII-case-insensitive) → `Boolean`; anything else → `String`.
///
/// `"1"` and `"0"` are reported as `Number`: the numeric check runs first, so the
/// former `"1" | "0"` boolean arms could never be reached and have been removed.
fn infer_type(&self, value: &str) -> DataType {
    if value.trim().is_empty() {
        return DataType::String;
    }
    if value.parse::<f64>().is_ok() {
        return DataType::Number;
    }
    if self.is_date_like(value) {
        return DataType::Date;
    }
    // `eq_ignore_ascii_case` avoids allocating a lowercase copy for every cell.
    let is_boolean_word = ["true", "false", "yes", "no"]
        .iter()
        .any(|word| value.eq_ignore_ascii_case(word));
    if is_boolean_word {
        DataType::Boolean
    } else {
        DataType::String
    }
}
fn is_date_like(&self, value: &str) -> bool {
let date_patterns = [
r"\d{4}-\d{2}-\d{2}", r"\d{2}/\d{2}/\d{4}", r"\d{2}-\d{2}-\d{4}", ];
date_patterns.iter().any(|pattern| {
regex::Regex::new(pattern)
.map(|re| re.is_match(value))
.unwrap_or(false)
})
}
/// Picks, for each of the `num_cols` columns, the cell type that occurs most often.
///
/// Ties are resolved in favour of the type that appears first in the column, so
/// the result is deterministic (the previous `HashMap`-based vote depended on
/// hash iteration order). Columns without any cell default to `DataType::String`.
fn infer_column_types(&self, rows: &[Vec<TableCell>], num_cols: usize) -> Vec<DataType> {
    (0..num_cols)
        .map(|col_idx| {
            // (type, occurrences) in order of first appearance; at most a handful
            // of distinct types exist, so a linear scan is sufficient.
            let mut tallies: Vec<(DataType, usize)> = Vec::new();
            for cell in rows.iter().filter_map(|row| row.get(col_idx)) {
                match tallies
                    .iter()
                    .position(|&(seen, _)| seen == cell.data_type)
                {
                    Some(idx) => tallies[idx].1 += 1,
                    None => tallies.push((cell.data_type, 1)),
                }
            }

            let mut winner = (DataType::String, 0usize);
            for tally in tallies {
                // Strictly greater: the earliest type keeps the lead on ties.
                if tally.1 > winner.1 {
                    winner = tally;
                }
            }
            winner.0
        })
        .collect()
}
}
impl DelimiterDetector {
    /// Returns the most frequent candidate delimiter on the first line of `csv`.
    ///
    /// Candidates are `,`, `;`, tab and `|`. On a tie the earlier candidate in that
    /// list wins, so empty input (or a first line without any candidate) yields `,`.
    pub fn detect(&self, csv: &str) -> char {
        // Listed in priority order; only a strictly higher count replaces the leader.
        const CANDIDATES: [char; 4] = [',', ';', '\t', '|'];

        let header = csv.lines().next().unwrap_or("");
        let occurrences = |needle: char| header.matches(needle).count();

        let mut best = CANDIDATES[0];
        let mut best_count = occurrences(best);
        for &candidate in &CANDIDATES[1..] {
            let count = occurrences(candidate);
            if count > best_count {
                best = candidate;
                best_count = count;
            }
        }
        best
    }
}
impl MarkdownTableParser {
    /// Creates the (stateless) parser.
    pub fn new() -> Self {
        Self
    }

    /// Extracts every pipe table from `markdown`.
    ///
    /// The text is scanned line by line; whenever a header row followed by a
    /// delimiter row is found, the table is parsed and scanning resumes after its
    /// last line. Text that is not part of a table is ignored.
    ///
    /// # Errors
    /// Propagates errors from `parse_single_table`.
    pub fn parse(&self, markdown: &str) -> RragResult<Vec<ExtractedTable>> {
        let lines: Vec<&str> = markdown.lines().collect();
        let mut tables = Vec::new();
        let mut i = 0;
        while i < lines.len() {
            if self.is_table_start(&lines[i..]) {
                let (table, consumed) = self.parse_single_table(&lines[i..])?;
                tables.push(table);
                // Always make progress, even if a table reports zero consumed lines.
                i += consumed.max(1);
            } else {
                i += 1;
            }
        }
        Ok(tables)
    }

    /// Returns `true` when `lines` begins with a table header followed by a
    /// delimiter row.
    ///
    /// The first line must contain a `|`, and the second line must contain at
    /// least one `-` and consist only of whitespace, `|`, `-` and `:`. Requiring
    /// the `-` keeps blank lines from being mistaken for a delimiter row, which
    /// previously made ordinary paragraphs abort the whole parse.
    fn is_table_start(&self, lines: &[&str]) -> bool {
        match lines {
            [header, delimiter, ..] => {
                header.contains('|')
                    && delimiter.contains('-')
                    && delimiter
                        .chars()
                        .all(|c| c.is_whitespace() || c == '|' || c == '-' || c == ':')
            }
            _ => false,
        }
    }

    /// Parses the table at the start of `lines`.
    ///
    /// Returns the table together with the number of lines consumed, counting any
    /// leading lines without a `|` that were skipped. The first table line supplies
    /// the headers, the second is taken to be the delimiter row, and the rest are
    /// data rows. Following the GitHub Flavored Markdown table rules, rows with
    /// fewer cells than the header are padded with empty cells and excess cells
    /// are dropped; empty cells are preserved.
    ///
    /// A column is typed `Number` when more than half of its non-blank cells parse
    /// as numbers, and `String` otherwise.
    ///
    /// # Errors
    /// Returns a document-processing error when fewer than two consecutive lines
    /// containing `|` are found.
    fn parse_single_table(&self, lines: &[&str]) -> RragResult<(ExtractedTable, usize)> {
        let skipped = lines
            .iter()
            .take_while(|line| !line.contains('|'))
            .count();
        let table_lines: Vec<&str> = lines[skipped..]
            .iter()
            .copied()
            .take_while(|line| line.contains('|'))
            .collect();
        if table_lines.len() < 2 {
            return Err(RragError::document_processing("Invalid Markdown table"));
        }

        let headers = Self::split_row(table_lines[0]);

        let rows: Vec<Vec<TableCell>> = table_lines
            .iter()
            .skip(2)
            .map(|line| {
                let mut values = Self::split_row(line);
                values.resize(headers.len(), String::new());
                values
                    .into_iter()
                    .map(|value| {
                        let data_type = self.infer_type(&value);
                        TableCell {
                            value,
                            data_type,
                            formatting: None,
                        }
                    })
                    .collect()
            })
            .collect();

        let column_types: Vec<DataType> = (0..headers.len())
            .map(|col_idx| {
                let cells = || rows.iter().filter_map(|row| row.get(col_idx));
                let filled = cells()
                    .filter(|cell| !cell.value.trim().is_empty())
                    .count();
                let numeric = cells()
                    .filter(|cell| cell.data_type == DataType::Number)
                    .count();
                if numeric > 0 && numeric * 2 > filled {
                    DataType::Number
                } else {
                    DataType::String
                }
            })
            .collect();

        let table_id = format!(
            "md_table_{}",
            uuid::Uuid::new_v4().to_string().split('-').next().unwrap()
        );
        let table = ExtractedTable {
            id: table_id,
            headers,
            rows,
            summary: None,
            column_types,
            embedding: None,
            statistics: None,
        };
        Ok((table, skipped + table_lines.len()))
    }

    /// Splits one table line into trimmed cell texts.
    ///
    /// One optional leading and one optional trailing `|` are removed first, so
    /// empty cells in the middle of a row are preserved.
    fn split_row(line: &str) -> Vec<String> {
        let trimmed = line.trim();
        let inner = trimmed.strip_prefix('|').unwrap_or(trimmed);
        let inner = inner.strip_suffix('|').unwrap_or(inner);
        inner
            .split('|')
            .map(|cell| cell.trim().to_string())
            .collect()
    }

    /// Classifies a cell as `Number` when it parses as `f64`, otherwise `String`.
    fn infer_type(&self, value: &str) -> DataType {
        if value.parse::<f64>().is_ok() {
            DataType::Number
        } else {
            DataType::String
        }
    }
}
impl StatisticsCalculator {
    /// Creates the (stateless) calculator.
    pub fn new() -> Self {
        Self
    }

    /// Computes row/column counts, per-column blank ratios and per-column statistics.
    ///
    /// * `null_percentages[i]` is the fraction (0.0–1.0) of rows whose cell in
    ///   column `i` is blank or missing; `0.0` for a table without rows.
    /// * Numeric statistics are produced for columns typed `DataType::Number`, text
    ///   statistics for columns typed `DataType::String`.
    /// * `unique_count` is the number of distinct cell texts in the column.
    ///
    /// # Errors
    /// Never fails today; the `RragResult` return type is kept for callers.
    pub fn calculate(&self, table: &ExtractedTable) -> RragResult<TableStatistics> {
        let row_count = table.rows.len();
        let column_count = table.headers.len();

        let null_percentages: Vec<f32> = (0..column_count)
            .map(|col_idx| {
                if row_count == 0 {
                    return 0.0;
                }
                let null_count = table
                    .rows
                    .iter()
                    .filter(|row| {
                        // A row too short to have this column counts as blank.
                        row.get(col_idx)
                            .map_or(true, |cell| cell.value.trim().is_empty())
                    })
                    .count();
                null_count as f32 / row_count as f32
            })
            .collect();

        let mut column_stats = Vec::with_capacity(column_count);
        for (col_idx, header) in table.headers.iter().enumerate() {
            let values: Vec<String> = table
                .rows
                .iter()
                .filter_map(|row| row.get(col_idx))
                .map(|cell| cell.value.clone())
                .collect();
            let unique_count = values
                .iter()
                .collect::<std::collections::HashSet<_>>()
                .len();

            let column_type = table.column_types.get(col_idx);
            let numeric_stats = if column_type == Some(&DataType::Number) {
                self.calculate_numeric_stats(&values)
            } else {
                None
            };
            let text_stats = if column_type == Some(&DataType::String) {
                Some(self.calculate_text_stats(&values))
            } else {
                None
            };

            column_stats.push(ColumnStatistics {
                name: header.clone(),
                numeric_stats,
                text_stats,
                unique_count,
            });
        }

        Ok(TableStatistics {
            row_count,
            column_count,
            null_percentages,
            column_stats,
        })
    }

    /// Computes min, max, mean, median and population standard deviation of the
    /// values that parse as finite numbers.
    ///
    /// Unparseable values are skipped, and so are `NaN` and ±infinity: strings such
    /// as "NaN" or "inf" parse successfully as `f64`, and a `NaN` used to panic in
    /// the sort while an infinity made the mean meaningless.
    ///
    /// Returns `None` when no finite number is present.
    fn calculate_numeric_stats(&self, values: &[String]) -> Option<NumericStatistics> {
        let mut numbers: Vec<f64> = values
            .iter()
            .filter_map(|s| s.parse::<f64>().ok())
            .filter(|n| n.is_finite())
            .collect();
        if numbers.is_empty() {
            return None;
        }

        let count = numbers.len() as f64;
        let min = numbers.iter().fold(f64::INFINITY, |a, &b| a.min(b));
        let max = numbers.iter().fold(f64::NEG_INFINITY, |a, &b| a.max(b));
        let mean = numbers.iter().sum::<f64>() / count;
        let variance = numbers.iter().map(|&x| (x - mean).powi(2)).sum::<f64>() / count;
        let std_dev = variance.sqrt();

        // Only finite values remain, so `partial_cmp` always succeeds; the fallback
        // merely avoids an `unwrap`.
        numbers.sort_by(|a, b| a.partial_cmp(b).unwrap_or(std::cmp::Ordering::Equal));
        let mid = numbers.len() / 2;
        let median = if numbers.len() % 2 == 0 {
            (numbers[mid - 1] + numbers[mid]) / 2.0
        } else {
            numbers[mid]
        };

        Some(NumericStatistics {
            min,
            max,
            mean,
            median,
            std_dev,
        })
    }

    /// Computes length statistics and the five most frequent values.
    ///
    /// Lengths are counted in characters (Unicode scalar values), not bytes, so
    /// non-ASCII text is not over-counted. `most_common` is ordered by descending
    /// frequency; equally frequent values are ordered alphabetically, which keeps
    /// the result deterministic.
    fn calculate_text_stats(&self, values: &[String]) -> TextStatistics {
        let lengths: Vec<usize> = values.iter().map(|s| s.chars().count()).collect();
        let min_length = lengths.iter().min().copied().unwrap_or(0);
        let max_length = lengths.iter().max().copied().unwrap_or(0);
        let avg_length = if lengths.is_empty() {
            0.0
        } else {
            lengths.iter().sum::<usize>() as f32 / lengths.len() as f32
        };

        let mut counts: HashMap<&str, usize> = HashMap::new();
        for value in values {
            *counts.entry(value.as_str()).or_insert(0) += 1;
        }
        let mut most_common: Vec<(String, usize)> = counts
            .into_iter()
            .map(|(value, count)| (value.to_string(), count))
            .collect();
        most_common.sort_by(|a, b| b.1.cmp(&a.1).then_with(|| a.0.cmp(&b.0)));
        most_common.truncate(5);

        TextStatistics {
            min_length,
            max_length,
            avg_length,
            most_common,
        }
    }
}
impl TypeInferrer {
    /// Creates the (stateless) inferrer.
    pub fn new() -> Self {
        Self
    }

    /// Returns `true` when `value` is acceptable for a column of `expected_type`.
    ///
    /// * `String` and `Mixed` accept everything.
    /// * `Number` requires the text to parse as `f64`.
    /// * `Date` requires the whole text to be shaped like a supported date.
    /// * `Boolean` accepts `true`, `false`, `yes`, `no`, `1` and `0`, ignoring
    ///   ASCII case.
    pub fn matches_type(&self, value: &str, expected_type: DataType) -> bool {
        match expected_type {
            DataType::String | DataType::Mixed => true,
            DataType::Number => value.parse::<f64>().is_ok(),
            DataType::Date => self.is_date_like(value),
            // Comparing without allocating a lowercase copy of every cell.
            DataType::Boolean => ["true", "false", "yes", "no", "1", "0"]
                .iter()
                .any(|literal| value.eq_ignore_ascii_case(literal)),
        }
    }

    /// Returns `true` when the whole of `value` is shaped like `YYYY-MM-DD`,
    /// `DD/MM/YYYY` or `DD-MM-YYYY` (digits and separators only; the calendar
    /// validity of the date is not checked).
    ///
    /// Implemented as a direct byte comparison: the previous version compiled three
    /// regular expressions on every call. Digits are ASCII digits only.
    fn is_date_like(&self, value: &str) -> bool {
        // `0` stands for "any ASCII digit"; every other byte must match literally.
        const SHAPES: [&[u8]; 3] = [b"0000-00-00", b"00/00/0000", b"00-00-0000"];

        let bytes = value.as_bytes();
        SHAPES.iter().any(|shape| {
            bytes.len() == shape.len()
                && bytes.iter().zip(shape.iter()).all(|(&byte, &expected)| {
                    if expected == b'0' {
                        byte.is_ascii_digit()
                    } else {
                        byte == expected
                    }
                })
        })
    }
}
impl TableSummaryGenerator {
    /// Creates a template-based generator with templates for `SummaryType::Brief`
    /// and `SummaryType::Detailed`.
    pub fn new() -> Self {
        let mut templates = HashMap::new();
        templates.insert(
            SummaryType::Brief,
            "Table with {row_count} rows and {col_count} columns. Columns: {headers}.".to_string(),
        );
        templates.insert(
            SummaryType::Detailed,
            "This table contains {row_count} rows and {col_count} columns. The columns are: {headers}. {additional_info}".to_string()
        );
        Self {
            templates,
            strategy: SummaryStrategy::TemplateBase,
        }
    }

    /// Generates a summary of `table` using the configured strategy.
    ///
    /// # Errors
    /// Returns a configuration error when the strategy needs a template and none is
    /// registered for `summary_type`.
    pub fn generate(
        &self,
        table: &ExtractedTable,
        summary_type: SummaryType,
    ) -> RragResult<String> {
        match self.strategy {
            SummaryStrategy::TemplateBase => self.generate_template_based(table, summary_type),
            SummaryStrategy::MLGenerated => self.generate_ml_based(table),
            SummaryStrategy::Hybrid => self.generate_hybrid(table, summary_type),
        }
    }

    /// Fills the template registered for `summary_type` with the table's row count,
    /// column count and comma-separated header names.
    ///
    /// `{additional_info}` is replaced with an empty string (no extra information is
    /// produced yet) and trailing whitespace is trimmed, so the raw placeholder no
    /// longer leaks into the summary. Header text is substituted last so that
    /// placeholder-like text inside a header name is never expanded.
    ///
    /// # Errors
    /// Returns a configuration error when no template exists for `summary_type`.
    fn generate_template_based(
        &self,
        table: &ExtractedTable,
        summary_type: SummaryType,
    ) -> RragResult<String> {
        let template = self
            .templates
            .get(&summary_type)
            .ok_or_else(|| RragError::configuration("Summary template not found"))?;
        let summary = template
            .replace("{row_count}", &table.rows.len().to_string())
            .replace("{col_count}", &table.headers.len().to_string())
            .replace("{additional_info}", "")
            .replace("{headers}", &table.headers.join(", "));
        Ok(summary.trim_end().to_string())
    }

    /// Placeholder: returns a fixed string and does not inspect the table.
    fn generate_ml_based(&self, _table: &ExtractedTable) -> RragResult<String> {
        Ok("ML-generated summary would go here".to_string())
    }

    /// Currently identical to the template-based summary.
    fn generate_hybrid(
        &self,
        table: &ExtractedTable,
        summary_type: SummaryType,
    ) -> RragResult<String> {
        self.generate_template_based(table, summary_type)
    }
}
impl Default for HtmlParserConfig {
    /// All feature flags enabled and a limit of 10000 cells.
    fn default() -> Self {
        const DEFAULT_MAX_CELLS: usize = 10000;
        let enabled = true;
        Self {
            extract_headers: enabled,
            preserve_formatting: enabled,
            handle_merges: enabled,
            max_cells: DEFAULT_MAX_CELLS,
        }
    }
}
// Unit tests for the processor's construction, format detection, delimiter
// detection, type matching and numeric statistics.
#[cfg(test)]
mod tests {
use super::*;
// The processor stores the configuration it was given; the expected values are
// those of `TableExtractionConfig::default()`, which is defined outside this file.
#[test]
fn test_table_processor_creation() {
let config = TableExtractionConfig::default();
let processor = DefaultTableProcessor::new(config).unwrap();
assert_eq!(processor.config.min_rows, 2);
assert_eq!(processor.config.min_cols, 2);
}
// One representative input per format that `detect_format` distinguishes.
#[test]
fn test_format_detection() {
let processor = DefaultTableProcessor::new(TableExtractionConfig::default()).unwrap();
let html = "<table><tr><td>test</td></tr></table>";
assert!(matches!(
processor.detect_format(html).unwrap(),
SourceFormat::Html
));
let csv = "name,age,city\nJohn,25,NYC";
assert!(matches!(
processor.detect_format(csv).unwrap(),
SourceFormat::Csv
));
let markdown = "| Name | Age |\n|------|-----|\n| John | 25 |";
assert!(matches!(
processor.detect_format(markdown).unwrap(),
SourceFormat::Markdown
));
}
// Each candidate delimiter is picked when it is the only one present.
#[test]
fn test_delimiter_detection() {
let detector = DelimiterDetector;
assert_eq!(detector.detect("a,b,c"), ',');
assert_eq!(detector.detect("a;b;c"), ';');
assert_eq!(detector.detect("a\tb\tc"), '\t');
assert_eq!(detector.detect("a|b|c"), '|');
}
// Positive cases only: one matching value per checked `DataType`.
#[test]
fn test_type_inference() {
let inferrer = TypeInferrer::new();
assert!(inferrer.matches_type("123", DataType::Number));
assert!(inferrer.matches_type("hello", DataType::String));
assert!(inferrer.matches_type("true", DataType::Boolean));
assert!(inferrer.matches_type("2023-01-01", DataType::Date));
}
// Min, max and mean of the values 1, 2, 3.
#[test]
fn test_statistics_calculation() {
let calculator = StatisticsCalculator::new();
let values = vec!["1".to_string(), "2".to_string(), "3".to_string()];
let stats = calculator.calculate_numeric_stats(&values).unwrap();
assert_eq!(stats.min, 1.0);
assert_eq!(stats.max, 3.0);
assert_eq!(stats.mean, 2.0);
}
}