Skip to main content

xls_rs/operations/
stats.rs

1//! Statistical operations
2
3use super::core::DataOperations;
4use super::types::AggFunc;
5use anyhow::Result;
6
7impl DataOperations {
8    /// Describe/summary statistics for all numeric columns
9    pub fn describe(&self, data: &[Vec<String>]) -> Result<Vec<Vec<String>>> {
10        if data.is_empty() {
11            return Ok(Vec::new());
12        }
13
14        let header = &data[0];
15        let num_cols = header.len();
16
17        let mut columns: Vec<Vec<f64>> = vec![Vec::new(); num_cols];
18        for row in data.iter().skip(1) {
19            for (idx, val) in row.iter().enumerate() {
20                if let Ok(num) = val.parse::<f64>() {
21                    columns[idx].push(num);
22                }
23            }
24        }
25
26        let mut result = Vec::new();
27
28        let mut stat_header = vec!["stat".to_string()];
29        stat_header.extend(header.iter().cloned());
30        result.push(stat_header);
31
32        let stats = ["count", "mean", "std", "min", "25%", "50%", "75%", "max"];
33        for stat in stats {
34            let mut row = vec![stat.to_string()];
35            for col_values in &columns {
36                let value = if col_values.is_empty() {
37                    "NaN".to_string()
38                } else {
39                    match stat {
40                        "count" => col_values.len().to_string(),
41                        "mean" => format!(
42                            "{:.2}",
43                            col_values.iter().sum::<f64>() / col_values.len() as f64
44                        ),
45                        "std" => {
46                            let mean = col_values.iter().sum::<f64>() / col_values.len() as f64;
47                            let variance =
48                                col_values.iter().map(|x| (x - mean).powi(2)).sum::<f64>()
49                                    / col_values.len() as f64;
50                            format!("{:.2}", variance.sqrt())
51                        }
52                        "min" => format!(
53                            "{:.2}",
54                            col_values.iter().cloned().fold(f64::INFINITY, f64::min)
55                        ),
56                        "max" => format!(
57                            "{:.2}",
58                            col_values.iter().cloned().fold(f64::NEG_INFINITY, f64::max)
59                        ),
60                        "25%" | "50%" | "75%" => {
61                            let mut sorted = col_values.clone();
62                            sorted.sort_by(|a, b| a.partial_cmp(b).unwrap_or(std::cmp::Ordering::Equal));
63                            let p = match stat {
64                                "25%" => 0.25,
65                                "50%" => 0.50,
66                                "75%" => 0.75,
67                                _ => 0.5,
68                            };
69                            let idx = ((sorted.len() - 1) as f64 * p) as usize;
70                            format!("{:.2}", sorted[idx])
71                        }
72                        _ => "".to_string(),
73                    }
74                };
75                row.push(value);
76            }
77            result.push(row);
78        }
79
80        Ok(result)
81    }
82
83    /// Count unique values in a column
84    pub fn value_counts(&self, data: &[Vec<String>], column: usize) -> Vec<Vec<String>> {
85        use std::collections::HashMap;
86
87        let mut counts: HashMap<String, usize> = HashMap::new();
88        for row in data.iter().skip(1) {
89            if let Some(val) = row.get(column) {
90                *counts.entry(val.clone()).or_insert(0) += 1;
91            }
92        }
93
94        let mut result: Vec<(String, usize)> = counts.into_iter().collect();
95        result.sort_by(|a, b| b.1.cmp(&a.1));
96
97        let mut output = vec![vec!["value".to_string(), "count".to_string()]];
98        for (val, count) in result {
99            output.push(vec![val, count.to_string()]);
100        }
101        output
102    }
103
104    /// Pivot table
105    pub fn pivot(
106        &self,
107        data: &[Vec<String>],
108        index_col: usize,
109        columns_col: usize,
110        values_col: usize,
111        agg: AggFunc,
112    ) -> Result<Vec<Vec<String>>> {
113        use std::collections::{BTreeSet, HashMap};
114
115        if data.is_empty() {
116            return Ok(Vec::new());
117        }
118
119        let mut col_values: BTreeSet<String> = BTreeSet::new();
120        let mut index_values: BTreeSet<String> = BTreeSet::new();
121        let mut groups: HashMap<(String, String), Vec<f64>> = HashMap::new();
122
123        for row in data.iter().skip(1) {
124            let idx = row.get(index_col).cloned().unwrap_or_default();
125            let col = row.get(columns_col).cloned().unwrap_or_default();
126            let val = row
127                .get(values_col)
128                .and_then(|v| v.parse::<f64>().ok())
129                .unwrap_or(0.0);
130
131            index_values.insert(idx.clone());
132            col_values.insert(col.clone());
133            groups.entry((idx, col)).or_default().push(val);
134        }
135
136        let col_values: Vec<String> = col_values.into_iter().collect();
137        let index_values: Vec<String> = index_values.into_iter().collect();
138
139        let mut result = Vec::new();
140
141        let index_name = data[0]
142            .get(index_col)
143            .cloned()
144            .unwrap_or_else(|| "index".to_string());
145        let mut header = vec![index_name];
146        header.extend(col_values.iter().cloned());
147        result.push(header);
148
149        for idx in &index_values {
150            let mut row = vec![idx.clone()];
151            for col in &col_values {
152                let values = groups.get(&(idx.clone(), col.clone()));
153                let agg_val = match values {
154                    Some(vals) => agg.apply(vals),
155                    None => 0.0,
156                };
157                row.push(format!("{:.2}", agg_val));
158            }
159            result.push(row);
160        }
161
162        Ok(result)
163    }
164
165    /// Frequency crosstab: counts of `(row_col, col_col)` pairs (two categorical columns).
166    ///
167    /// First row is the header: row dimension name, then distinct values from `col_col`.
168    /// First column lists distinct values from `row_col`; cell `(r, c)` is the count.
169    pub fn crosstab(
170        &self,
171        data: &[Vec<String>],
172        row_col: usize,
173        col_col: usize,
174    ) -> Result<Vec<Vec<String>>> {
175        use std::collections::{BTreeSet, HashMap};
176
177        if data.is_empty() {
178            return Ok(Vec::new());
179        }
180
181        let mut row_vals: BTreeSet<String> = BTreeSet::new();
182        let mut col_vals: BTreeSet<String> = BTreeSet::new();
183        let mut counts: HashMap<(String, String), usize> = HashMap::new();
184
185        for row in data.iter().skip(1) {
186            let r = row.get(row_col).cloned().unwrap_or_default();
187            let c = row.get(col_col).cloned().unwrap_or_default();
188            row_vals.insert(r.clone());
189            col_vals.insert(c.clone());
190            *counts.entry((r, c)).or_insert(0) += 1;
191        }
192
193        let row_vals: Vec<String> = row_vals.into_iter().collect();
194        let col_vals: Vec<String> = col_vals.into_iter().collect();
195
196        let row_name = data[0]
197            .get(row_col)
198            .cloned()
199            .unwrap_or_else(|| "row".to_string());
200
201        let mut header = vec![row_name];
202        header.extend(col_vals.iter().cloned());
203
204        let mut result = vec![header];
205
206        for rv in &row_vals {
207            let mut out_row = vec![rv.clone()];
208            for cv in &col_vals {
209                let n = counts
210                    .get(&(rv.clone(), cv.clone()))
211                    .copied()
212                    .unwrap_or(0);
213                out_row.push(n.to_string());
214            }
215            result.push(out_row);
216        }
217
218        Ok(result)
219    }
220
221    /// Correlation matrix
222    pub fn correlation(&self, data: &[Vec<String>], columns: &[usize]) -> Result<Vec<Vec<String>>> {
223        if data.is_empty() || columns.is_empty() {
224            return Ok(Vec::new());
225        }
226
227        let header = &data[0];
228
229        let mut col_data: Vec<Vec<f64>> = vec![Vec::new(); columns.len()];
230        for row in data.iter().skip(1) {
231            for (i, &col_idx) in columns.iter().enumerate() {
232                if let Some(val) = row.get(col_idx).and_then(|v| v.parse::<f64>().ok()) {
233                    col_data[i].push(val);
234                }
235            }
236        }
237
238        let mut result = Vec::new();
239
240        let mut corr_header = vec!["".to_string()];
241        for &col_idx in columns {
242            corr_header.push(
243                header
244                    .get(col_idx)
245                    .cloned()
246                    .unwrap_or_else(|| format!("col_{}", col_idx)),
247            );
248        }
249        result.push(corr_header);
250
251        for (i, &col_i) in columns.iter().enumerate() {
252            let col_name = header
253                .get(col_i)
254                .cloned()
255                .unwrap_or_else(|| format!("col_{}", col_i));
256            let mut row = vec![col_name];
257
258            for (j, _) in columns.iter().enumerate() {
259                let corr = self.pearson_correlation(&col_data[i], &col_data[j]);
260                row.push(format!("{:.4}", corr));
261            }
262            result.push(row);
263        }
264
265        Ok(result)
266    }
267
268    pub(crate) fn pearson_correlation(&self, x: &[f64], y: &[f64]) -> f64 {
269        let n = x.len().min(y.len());
270        if n == 0 {
271            return 0.0;
272        }
273
274        let mean_x = x.iter().take(n).sum::<f64>() / n as f64;
275        let mean_y = y.iter().take(n).sum::<f64>() / n as f64;
276
277        let mut cov = 0.0;
278        let mut var_x = 0.0;
279        let mut var_y = 0.0;
280
281        for i in 0..n {
282            let dx = x[i] - mean_x;
283            let dy = y[i] - mean_y;
284            cov += dx * dy;
285            var_x += dx * dx;
286            var_y += dy * dy;
287        }
288
289        if var_x == 0.0 || var_y == 0.0 {
290            return 0.0;
291        }
292
293        cov / (var_x.sqrt() * var_y.sqrt())
294    }
295
296    /// Infer column types
297    pub fn dtypes(&self, data: &[Vec<String>]) -> Vec<Vec<String>> {
298        if data.is_empty() {
299            return Vec::new();
300        }
301
302        let header = &data[0];
303        let mut result = vec![vec![
304            "column".to_string(),
305            "dtype".to_string(),
306            "non_null".to_string(),
307        ]];
308
309        for (col_idx, col_name) in header.iter().enumerate() {
310            let mut int_count = 0;
311            let mut float_count = 0;
312            let mut bool_count = 0;
313            let mut non_null = 0;
314            let total = data.len() - 1;
315
316            for row in data.iter().skip(1) {
317                if let Some(val) = row.get(col_idx) {
318                    if val.is_empty() {
319                        continue;
320                    }
321                    non_null += 1;
322
323                    if val.parse::<i64>().is_ok() {
324                        int_count += 1;
325                    } else if val.parse::<f64>().is_ok() {
326                        float_count += 1;
327                    } else if val.eq_ignore_ascii_case("true") || val.eq_ignore_ascii_case("false")
328                    {
329                        bool_count += 1;
330                    }
331                }
332            }
333
334            let dtype = if non_null == 0 {
335                "empty"
336            } else if int_count == non_null {
337                "int"
338            } else if int_count + float_count == non_null {
339                "float"
340            } else if bool_count == non_null {
341                "bool"
342            } else {
343                "string"
344            };
345
346            result.push(vec![
347                col_name.clone(),
348                dtype.to_string(),
349                format!("{}/{}", non_null, total),
350            ]);
351        }
352
353        result
354    }
355
356    /// Get unique values in a column
357    pub fn unique(&self, data: &[Vec<String>], column: usize) -> Vec<Vec<String>> {
358        use std::collections::HashSet;
359
360        let mut seen: HashSet<String> = HashSet::new();
361        let mut result = vec![vec!["value".to_string()]];
362
363        for row in data.iter().skip(1) {
364            if let Some(val) = row.get(column) {
365                if seen.insert(val.clone()) {
366                    result.push(vec![val.clone()]);
367                }
368            }
369        }
370
371        result
372    }
373
374    /// Count unique values in a column
375    pub fn nunique(&self, data: &[Vec<String>], column: usize) -> usize {
376        use std::collections::HashSet;
377
378        let unique: HashSet<&String> = data
379            .iter()
380            .skip(1)
381            .filter_map(|row| row.get(column))
382            .collect();
383
384        unique.len()
385    }
386
387    /// Get info about the dataset
388    pub fn info(&self, data: &[Vec<String>]) -> Vec<Vec<String>> {
389        if data.is_empty() {
390            return Vec::new();
391        }
392
393        let header = &data[0];
394        let num_rows = data.len() - 1;
395        let num_cols = header.len();
396
397        let mut result = vec![
398            vec!["metric".to_string(), "value".to_string()],
399            vec!["rows".to_string(), num_rows.to_string()],
400            vec!["columns".to_string(), num_cols.to_string()],
401        ];
402
403        let total_chars: usize = data
404            .iter()
405            .flat_map(|row| row.iter())
406            .map(|s| s.len())
407            .sum();
408        result.push(vec!["memory_bytes".to_string(), total_chars.to_string()]);
409
410        for (idx, col_name) in header.iter().enumerate() {
411            let non_null: usize = data
412                .iter()
413                .skip(1)
414                .filter(|row| row.get(idx).map(|s| !s.is_empty()).unwrap_or(false))
415                .count();
416            let null_count = num_rows - non_null;
417            let unique_count = self.nunique(data, idx);
418
419            result.push(vec![
420                format!("col_{}", col_name),
421                format!(
422                    "non_null={}, null={}, unique={}",
423                    non_null, null_count, unique_count
424                ),
425            ]);
426        }
427
428        result
429    }
430}