spreadsheet_mcp/analysis/
stats.rs

1use crate::model::{CellValue, ColumnSummary};
2use crate::utils::column_number_to_name;
3use crate::workbook::cell_to_value;
4use std::collections::{BTreeMap, HashSet};
5use umya_spreadsheet::Worksheet;
6
7pub struct SheetStats {
8    pub numeric_columns: Vec<ColumnSummary>,
9    pub text_columns: Vec<ColumnSummary>,
10    pub null_counts: BTreeMap<String, u32>,
11    pub duplicate_warnings: Vec<String>,
12    pub density: f32,
13}
14
15pub fn compute_sheet_statistics(sheet: &Worksheet, _sample_rows: usize) -> SheetStats {
16    let (max_col, max_row) = sheet.get_highest_column_and_row();
17    if max_col == 0 || max_row == 0 {
18        return SheetStats {
19            numeric_columns: Vec::new(),
20            text_columns: Vec::new(),
21            null_counts: BTreeMap::new(),
22            duplicate_warnings: Vec::new(),
23            density: 0.0,
24        };
25    }
26
27    let mut numeric = Vec::new();
28    let mut text = Vec::new();
29    let mut null_counts = BTreeMap::new();
30    let mut duplicate_warnings = Vec::new();
31    let mut filled_cells = 0u32;
32
33    for col in 1..=max_col {
34        let column_name = column_number_to_name(col);
35        let header = sheet.get_cell((1u32, col)).and_then(cell_to_value);
36        let mut numeric_values = Vec::new();
37        let mut text_values = Vec::new();
38        let mut samples = Vec::new();
39        let mut unique_values: HashSet<String> = HashSet::new();
40        let mut duplicate_flag = false;
41
42        for row in 1..=max_row {
43            if let Some(cell) = sheet.get_cell((row, col))
44                && let Some(value) = cell_to_value(cell)
45            {
46                filled_cells += 1;
47                match value.clone() {
48                    CellValue::Number(n) => numeric_values.push(n),
49                    CellValue::Bool(_)
50                    | CellValue::Text(_)
51                    | CellValue::Date(_)
52                    | CellValue::Error(_) => {
53                        if let CellValue::Text(ref s) = value
54                            && !unique_values.insert(s.clone())
55                        {
56                            duplicate_flag = true;
57                        }
58                        text_values.push(value.clone());
59                    }
60                }
61                if samples.len() < 5 {
62                    samples.push(value);
63                }
64            }
65        }
66
67        let nulls = max_row - (numeric_values.len() as u32 + text_values.len() as u32);
68        if nulls > 0 {
69            null_counts.insert(column_name.clone(), nulls);
70        }
71
72        if duplicate_flag {
73            duplicate_warnings.push(format!("Column {column_name} contains duplicate values"));
74        }
75
76        let summary = ColumnSummary {
77            header: header.map(cell_value_to_string),
78            column: column_name.clone(),
79            samples,
80            min: if numeric_values.is_empty() {
81                None
82            } else {
83                numeric_values.iter().cloned().reduce(f64::min)
84            },
85            max: if numeric_values.is_empty() {
86                None
87            } else {
88                numeric_values.iter().cloned().reduce(f64::max)
89            },
90            mean: if numeric_values.is_empty() {
91                None
92            } else {
93                Some(numeric_values.iter().sum::<f64>() / numeric_values.len() as f64)
94            },
95        };
96
97        if numeric_values.len() >= text_values.len() {
98            numeric.push(summary);
99        } else if !text_values.is_empty() {
100            text.push(summary);
101        }
102    }
103
104    let total_cells = (max_col * max_row) as f32;
105    let density = if total_cells == 0.0 {
106        0.0
107    } else {
108        filled_cells as f32 / total_cells
109    };
110
111    SheetStats {
112        numeric_columns: numeric,
113        text_columns: text,
114        null_counts,
115        duplicate_warnings,
116        density,
117    }
118}
119fn cell_value_to_string(value: CellValue) -> String {
120    match value {
121        CellValue::Text(s) => s,
122        CellValue::Number(n) => format!("{n}"),
123        CellValue::Bool(b) => b.to_string(),
124        CellValue::Date(d) => d,
125        CellValue::Error(e) => e,
126    }
127}