1use super::core::DataOperations;
4use super::types::AggFunc;
5use anyhow::Result;
6
7impl DataOperations {
8 pub fn describe(&self, data: &[Vec<String>]) -> Result<Vec<Vec<String>>> {
10 if data.is_empty() {
11 return Ok(Vec::new());
12 }
13
14 let header = &data[0];
15 let num_cols = header.len();
16
17 let mut columns: Vec<Vec<f64>> = vec![Vec::new(); num_cols];
18 for row in data.iter().skip(1) {
19 for (idx, val) in row.iter().enumerate() {
20 if let Ok(num) = val.parse::<f64>() {
21 columns[idx].push(num);
22 }
23 }
24 }
25
26 let mut result = Vec::new();
27
28 let mut stat_header = vec!["stat".to_string()];
29 stat_header.extend(header.iter().cloned());
30 result.push(stat_header);
31
32 let stats = ["count", "mean", "std", "min", "25%", "50%", "75%", "max"];
33 for stat in stats {
34 let mut row = vec![stat.to_string()];
35 for col_values in &columns {
36 let value = if col_values.is_empty() {
37 "NaN".to_string()
38 } else {
39 match stat {
40 "count" => col_values.len().to_string(),
41 "mean" => format!(
42 "{:.2}",
43 col_values.iter().sum::<f64>() / col_values.len() as f64
44 ),
45 "std" => {
46 let mean = col_values.iter().sum::<f64>() / col_values.len() as f64;
47 let variance =
48 col_values.iter().map(|x| (x - mean).powi(2)).sum::<f64>()
49 / col_values.len() as f64;
50 format!("{:.2}", variance.sqrt())
51 }
52 "min" => format!(
53 "{:.2}",
54 col_values.iter().cloned().fold(f64::INFINITY, f64::min)
55 ),
56 "max" => format!(
57 "{:.2}",
58 col_values.iter().cloned().fold(f64::NEG_INFINITY, f64::max)
59 ),
60 "25%" | "50%" | "75%" => {
61 let mut sorted = col_values.clone();
62 sorted.sort_by(|a, b| a.partial_cmp(b).unwrap_or(std::cmp::Ordering::Equal));
63 let p = match stat {
64 "25%" => 0.25,
65 "50%" => 0.50,
66 "75%" => 0.75,
67 _ => 0.5,
68 };
69 let idx = ((sorted.len() - 1) as f64 * p) as usize;
70 format!("{:.2}", sorted[idx])
71 }
72 _ => "".to_string(),
73 }
74 };
75 row.push(value);
76 }
77 result.push(row);
78 }
79
80 Ok(result)
81 }
82
83 pub fn value_counts(&self, data: &[Vec<String>], column: usize) -> Vec<Vec<String>> {
85 use std::collections::HashMap;
86
87 let mut counts: HashMap<String, usize> = HashMap::new();
88 for row in data.iter().skip(1) {
89 if let Some(val) = row.get(column) {
90 *counts.entry(val.clone()).or_insert(0) += 1;
91 }
92 }
93
94 let mut result: Vec<(String, usize)> = counts.into_iter().collect();
95 result.sort_by(|a, b| b.1.cmp(&a.1));
96
97 let mut output = vec![vec!["value".to_string(), "count".to_string()]];
98 for (val, count) in result {
99 output.push(vec![val, count.to_string()]);
100 }
101 output
102 }
103
104 pub fn pivot(
106 &self,
107 data: &[Vec<String>],
108 index_col: usize,
109 columns_col: usize,
110 values_col: usize,
111 agg: AggFunc,
112 ) -> Result<Vec<Vec<String>>> {
113 use std::collections::{BTreeSet, HashMap};
114
115 if data.is_empty() {
116 return Ok(Vec::new());
117 }
118
119 let mut col_values: BTreeSet<String> = BTreeSet::new();
120 let mut index_values: BTreeSet<String> = BTreeSet::new();
121 let mut groups: HashMap<(String, String), Vec<f64>> = HashMap::new();
122
123 for row in data.iter().skip(1) {
124 let idx = row.get(index_col).cloned().unwrap_or_default();
125 let col = row.get(columns_col).cloned().unwrap_or_default();
126 let val = row
127 .get(values_col)
128 .and_then(|v| v.parse::<f64>().ok())
129 .unwrap_or(0.0);
130
131 index_values.insert(idx.clone());
132 col_values.insert(col.clone());
133 groups.entry((idx, col)).or_default().push(val);
134 }
135
136 let col_values: Vec<String> = col_values.into_iter().collect();
137 let index_values: Vec<String> = index_values.into_iter().collect();
138
139 let mut result = Vec::new();
140
141 let index_name = data[0]
142 .get(index_col)
143 .cloned()
144 .unwrap_or_else(|| "index".to_string());
145 let mut header = vec![index_name];
146 header.extend(col_values.iter().cloned());
147 result.push(header);
148
149 for idx in &index_values {
150 let mut row = vec![idx.clone()];
151 for col in &col_values {
152 let values = groups.get(&(idx.clone(), col.clone()));
153 let agg_val = match values {
154 Some(vals) => agg.apply(vals),
155 None => 0.0,
156 };
157 row.push(format!("{:.2}", agg_val));
158 }
159 result.push(row);
160 }
161
162 Ok(result)
163 }
164
165 pub fn crosstab(
170 &self,
171 data: &[Vec<String>],
172 row_col: usize,
173 col_col: usize,
174 ) -> Result<Vec<Vec<String>>> {
175 use std::collections::{BTreeSet, HashMap};
176
177 if data.is_empty() {
178 return Ok(Vec::new());
179 }
180
181 let mut row_vals: BTreeSet<String> = BTreeSet::new();
182 let mut col_vals: BTreeSet<String> = BTreeSet::new();
183 let mut counts: HashMap<(String, String), usize> = HashMap::new();
184
185 for row in data.iter().skip(1) {
186 let r = row.get(row_col).cloned().unwrap_or_default();
187 let c = row.get(col_col).cloned().unwrap_or_default();
188 row_vals.insert(r.clone());
189 col_vals.insert(c.clone());
190 *counts.entry((r, c)).or_insert(0) += 1;
191 }
192
193 let row_vals: Vec<String> = row_vals.into_iter().collect();
194 let col_vals: Vec<String> = col_vals.into_iter().collect();
195
196 let row_name = data[0]
197 .get(row_col)
198 .cloned()
199 .unwrap_or_else(|| "row".to_string());
200
201 let mut header = vec![row_name];
202 header.extend(col_vals.iter().cloned());
203
204 let mut result = vec![header];
205
206 for rv in &row_vals {
207 let mut out_row = vec![rv.clone()];
208 for cv in &col_vals {
209 let n = counts
210 .get(&(rv.clone(), cv.clone()))
211 .copied()
212 .unwrap_or(0);
213 out_row.push(n.to_string());
214 }
215 result.push(out_row);
216 }
217
218 Ok(result)
219 }
220
221 pub fn correlation(&self, data: &[Vec<String>], columns: &[usize]) -> Result<Vec<Vec<String>>> {
223 if data.is_empty() || columns.is_empty() {
224 return Ok(Vec::new());
225 }
226
227 let header = &data[0];
228
229 let mut col_data: Vec<Vec<f64>> = vec![Vec::new(); columns.len()];
230 for row in data.iter().skip(1) {
231 for (i, &col_idx) in columns.iter().enumerate() {
232 if let Some(val) = row.get(col_idx).and_then(|v| v.parse::<f64>().ok()) {
233 col_data[i].push(val);
234 }
235 }
236 }
237
238 let mut result = Vec::new();
239
240 let mut corr_header = vec!["".to_string()];
241 for &col_idx in columns {
242 corr_header.push(
243 header
244 .get(col_idx)
245 .cloned()
246 .unwrap_or_else(|| format!("col_{}", col_idx)),
247 );
248 }
249 result.push(corr_header);
250
251 for (i, &col_i) in columns.iter().enumerate() {
252 let col_name = header
253 .get(col_i)
254 .cloned()
255 .unwrap_or_else(|| format!("col_{}", col_i));
256 let mut row = vec![col_name];
257
258 for (j, _) in columns.iter().enumerate() {
259 let corr = self.pearson_correlation(&col_data[i], &col_data[j]);
260 row.push(format!("{:.4}", corr));
261 }
262 result.push(row);
263 }
264
265 Ok(result)
266 }
267
268 pub(crate) fn pearson_correlation(&self, x: &[f64], y: &[f64]) -> f64 {
269 let n = x.len().min(y.len());
270 if n == 0 {
271 return 0.0;
272 }
273
274 let mean_x = x.iter().take(n).sum::<f64>() / n as f64;
275 let mean_y = y.iter().take(n).sum::<f64>() / n as f64;
276
277 let mut cov = 0.0;
278 let mut var_x = 0.0;
279 let mut var_y = 0.0;
280
281 for i in 0..n {
282 let dx = x[i] - mean_x;
283 let dy = y[i] - mean_y;
284 cov += dx * dy;
285 var_x += dx * dx;
286 var_y += dy * dy;
287 }
288
289 if var_x == 0.0 || var_y == 0.0 {
290 return 0.0;
291 }
292
293 cov / (var_x.sqrt() * var_y.sqrt())
294 }
295
296 pub fn dtypes(&self, data: &[Vec<String>]) -> Vec<Vec<String>> {
298 if data.is_empty() {
299 return Vec::new();
300 }
301
302 let header = &data[0];
303 let mut result = vec![vec![
304 "column".to_string(),
305 "dtype".to_string(),
306 "non_null".to_string(),
307 ]];
308
309 for (col_idx, col_name) in header.iter().enumerate() {
310 let mut int_count = 0;
311 let mut float_count = 0;
312 let mut bool_count = 0;
313 let mut non_null = 0;
314 let total = data.len() - 1;
315
316 for row in data.iter().skip(1) {
317 if let Some(val) = row.get(col_idx) {
318 if val.is_empty() {
319 continue;
320 }
321 non_null += 1;
322
323 if val.parse::<i64>().is_ok() {
324 int_count += 1;
325 } else if val.parse::<f64>().is_ok() {
326 float_count += 1;
327 } else if val.eq_ignore_ascii_case("true") || val.eq_ignore_ascii_case("false")
328 {
329 bool_count += 1;
330 }
331 }
332 }
333
334 let dtype = if non_null == 0 {
335 "empty"
336 } else if int_count == non_null {
337 "int"
338 } else if int_count + float_count == non_null {
339 "float"
340 } else if bool_count == non_null {
341 "bool"
342 } else {
343 "string"
344 };
345
346 result.push(vec![
347 col_name.clone(),
348 dtype.to_string(),
349 format!("{}/{}", non_null, total),
350 ]);
351 }
352
353 result
354 }
355
356 pub fn unique(&self, data: &[Vec<String>], column: usize) -> Vec<Vec<String>> {
358 use std::collections::HashSet;
359
360 let mut seen: HashSet<String> = HashSet::new();
361 let mut result = vec![vec!["value".to_string()]];
362
363 for row in data.iter().skip(1) {
364 if let Some(val) = row.get(column) {
365 if seen.insert(val.clone()) {
366 result.push(vec![val.clone()]);
367 }
368 }
369 }
370
371 result
372 }
373
374 pub fn nunique(&self, data: &[Vec<String>], column: usize) -> usize {
376 use std::collections::HashSet;
377
378 let unique: HashSet<&String> = data
379 .iter()
380 .skip(1)
381 .filter_map(|row| row.get(column))
382 .collect();
383
384 unique.len()
385 }
386
387 pub fn info(&self, data: &[Vec<String>]) -> Vec<Vec<String>> {
389 if data.is_empty() {
390 return Vec::new();
391 }
392
393 let header = &data[0];
394 let num_rows = data.len() - 1;
395 let num_cols = header.len();
396
397 let mut result = vec![
398 vec!["metric".to_string(), "value".to_string()],
399 vec!["rows".to_string(), num_rows.to_string()],
400 vec!["columns".to_string(), num_cols.to_string()],
401 ];
402
403 let total_chars: usize = data
404 .iter()
405 .flat_map(|row| row.iter())
406 .map(|s| s.len())
407 .sum();
408 result.push(vec!["memory_bytes".to_string(), total_chars.to_string()]);
409
410 for (idx, col_name) in header.iter().enumerate() {
411 let non_null: usize = data
412 .iter()
413 .skip(1)
414 .filter(|row| row.get(idx).map(|s| !s.is_empty()).unwrap_or(false))
415 .count();
416 let null_count = num_rows - non_null;
417 let unique_count = self.nunique(data, idx);
418
419 result.push(vec![
420 format!("col_{}", col_name),
421 format!(
422 "non_null={}, null={}, unique={}",
423 non_null, null_count, unique_count
424 ),
425 ]);
426 }
427
428 result
429 }
430}