spreadsheet_mcp/analysis/
stats.rs1use crate::model::{CellValue, ColumnSummary};
2use crate::utils::column_number_to_name;
3use crate::workbook::cell_to_value;
4use std::collections::{BTreeMap, HashSet};
5use umya_spreadsheet::Worksheet;
6
7pub struct SheetStats {
8 pub numeric_columns: Vec<ColumnSummary>,
9 pub text_columns: Vec<ColumnSummary>,
10 pub null_counts: BTreeMap<String, u32>,
11 pub duplicate_warnings: Vec<String>,
12 pub density: f32,
13}
14
15pub fn compute_sheet_statistics(sheet: &Worksheet, _sample_rows: usize) -> SheetStats {
16 let (max_col, max_row) = sheet.get_highest_column_and_row();
17 if max_col == 0 || max_row == 0 {
18 return SheetStats {
19 numeric_columns: Vec::new(),
20 text_columns: Vec::new(),
21 null_counts: BTreeMap::new(),
22 duplicate_warnings: Vec::new(),
23 density: 0.0,
24 };
25 }
26
27 let mut numeric = Vec::new();
28 let mut text = Vec::new();
29 let mut null_counts = BTreeMap::new();
30 let mut duplicate_warnings = Vec::new();
31 let mut filled_cells = 0u32;
32
33 for col in 1..=max_col {
34 let column_name = column_number_to_name(col);
35 let header = sheet.get_cell((1u32, col)).and_then(cell_to_value);
36 let mut numeric_values = Vec::new();
37 let mut text_values = Vec::new();
38 let mut samples = Vec::new();
39 let mut unique_values: HashSet<String> = HashSet::new();
40 let mut duplicate_flag = false;
41
42 for row in 1..=max_row {
43 if let Some(cell) = sheet.get_cell((row, col))
44 && let Some(value) = cell_to_value(cell)
45 {
46 filled_cells += 1;
47 match value.clone() {
48 CellValue::Number(n) => numeric_values.push(n),
49 CellValue::Bool(_)
50 | CellValue::Text(_)
51 | CellValue::Date(_)
52 | CellValue::Error(_) => {
53 if let CellValue::Text(ref s) = value
54 && !unique_values.insert(s.clone())
55 {
56 duplicate_flag = true;
57 }
58 text_values.push(value.clone());
59 }
60 }
61 if samples.len() < 5 {
62 samples.push(value);
63 }
64 }
65 }
66
67 let nulls = max_row - (numeric_values.len() as u32 + text_values.len() as u32);
68 if nulls > 0 {
69 null_counts.insert(column_name.clone(), nulls);
70 }
71
72 if duplicate_flag {
73 duplicate_warnings.push(format!("Column {column_name} contains duplicate values"));
74 }
75
76 let summary = ColumnSummary {
77 header: header.map(cell_value_to_string),
78 column: column_name.clone(),
79 samples,
80 min: if numeric_values.is_empty() {
81 None
82 } else {
83 numeric_values.iter().cloned().reduce(f64::min)
84 },
85 max: if numeric_values.is_empty() {
86 None
87 } else {
88 numeric_values.iter().cloned().reduce(f64::max)
89 },
90 mean: if numeric_values.is_empty() {
91 None
92 } else {
93 Some(numeric_values.iter().sum::<f64>() / numeric_values.len() as f64)
94 },
95 };
96
97 if numeric_values.len() >= text_values.len() {
98 numeric.push(summary);
99 } else if !text_values.is_empty() {
100 text.push(summary);
101 }
102 }
103
104 let total_cells = (max_col * max_row) as f32;
105 let density = if total_cells == 0.0 {
106 0.0
107 } else {
108 filled_cells as f32 / total_cells
109 };
110
111 SheetStats {
112 numeric_columns: numeric,
113 text_columns: text,
114 null_counts,
115 duplicate_warnings,
116 density,
117 }
118}
119fn cell_value_to_string(value: CellValue) -> String {
120 match value {
121 CellValue::Text(s) => s,
122 CellValue::Number(n) => format!("{n}"),
123 CellValue::Bool(b) => b.to_string(),
124 CellValue::Date(d) => d,
125 CellValue::Error(e) => e,
126 }
127}