Skip to main content

xls_rs/operations/
pandas.rs

1//! Pandas-inspired data operations
2
3use super::core::DataOperations;
4use super::types::{AggFunc, JoinType};
5use anyhow::Result;
6
7impl DataOperations {
8    /// Select specific columns by index
9    pub fn select_columns(&self, data: &[Vec<String>], columns: &[usize]) -> Vec<Vec<String>> {
10        data.iter()
11            .map(|row| {
12                columns
13                    .iter()
14                    .map(|&idx| row.get(idx).cloned().unwrap_or_default())
15                    .collect()
16            })
17            .collect()
18    }
19
20    /// Select columns by name (first row is header)
21    pub fn select_columns_by_name(
22        &self,
23        data: &[Vec<String>],
24        names: &[&str],
25    ) -> Result<Vec<Vec<String>>> {
26        if data.is_empty() {
27            return Ok(Vec::new());
28        }
29
30        let header = &data[0];
31        let indices: Vec<usize> = names
32            .iter()
33            .map(|name| {
34                header
35                    .iter()
36                    .position(|h| h == *name)
37                    .ok_or_else(|| anyhow::anyhow!("Column '{}' not found", name))
38            })
39            .collect::<Result<Vec<_>>>()?;
40
41        Ok(self.select_columns(data, &indices))
42    }
43
44    /// Get first n rows (head)
45    pub fn head(&self, data: &[Vec<String>], n: usize) -> Vec<Vec<String>> {
46        data.iter().take(n).cloned().collect()
47    }
48
49    /// Get last n rows (tail)
50    pub fn tail(&self, data: &[Vec<String>], n: usize) -> Vec<Vec<String>> {
51        let len = data.len();
52        if n >= len {
53            data.to_vec()
54        } else {
55            data[len - n..].to_vec()
56        }
57    }
58
59    /// Sample random rows
60    pub fn sample(&self, data: &[Vec<String>], n: usize, seed: Option<u64>) -> Vec<Vec<String>> {
61        use std::collections::HashSet;
62
63        if n >= data.len() {
64            return data.to_vec();
65        }
66
67        let mut rng_state = seed.unwrap_or(42);
68        let mut next_rand = || {
69            rng_state = rng_state.wrapping_mul(6364136223846793005).wrapping_add(1);
70            rng_state
71        };
72
73        let mut indices = HashSet::new();
74        while indices.len() < n {
75            let idx = (next_rand() as usize) % data.len();
76            indices.insert(idx);
77        }
78
79        let mut result: Vec<Vec<String>> = indices.iter().map(|&idx| data[idx].clone()).collect();
80        result.sort_by_key(|_| next_rand());
81        result
82    }
83
84    /// Drop columns by index
85    pub fn drop_columns(&self, data: &[Vec<String>], columns: &[usize]) -> Vec<Vec<String>> {
86        let drop_set: std::collections::HashSet<usize> = columns.iter().copied().collect();
87        data.iter()
88            .map(|row| {
89                row.iter()
90                    .enumerate()
91                    .filter(|(idx, _)| !drop_set.contains(idx))
92                    .map(|(_, val)| val.clone())
93                    .collect()
94            })
95            .collect()
96    }
97
98    /// Rename columns (first row is header)
99    pub fn rename_columns(
100        &self,
101        data: &mut Vec<Vec<String>>,
102        renames: &[(&str, &str)],
103    ) -> Result<()> {
104        if data.is_empty() {
105            return Ok(());
106        }
107
108        let header = &mut data[0];
109        for (old_name, new_name) in renames {
110            if let Some(pos) = header.iter().position(|h| h == *old_name) {
111                header[pos] = new_name.to_string();
112            }
113        }
114        Ok(())
115    }
116
117    /// Fill missing/empty values
118    pub fn fillna(&self, data: &mut Vec<Vec<String>>, value: &str) {
119        for row in data.iter_mut() {
120            for cell in row.iter_mut() {
121                if cell.is_empty() {
122                    *cell = value.to_string();
123                }
124            }
125        }
126    }
127
128    /// Drop rows with any empty values
129    pub fn dropna(&self, data: &[Vec<String>]) -> Vec<Vec<String>> {
130        data.iter()
131            .filter(|row| !row.iter().any(|cell| cell.is_empty()))
132            .cloned()
133            .collect()
134    }
135
136    /// Concatenate multiple datasets vertically
137    pub fn concat(&self, datasets: &[Vec<Vec<String>>]) -> Vec<Vec<String>> {
138        let mut result = Vec::new();
139        for dataset in datasets {
140            result.extend(dataset.iter().cloned());
141        }
142        result
143    }
144
145    /// Join two datasets on a column
146    pub fn join(
147        &self,
148        left: &[Vec<String>],
149        right: &[Vec<String>],
150        left_col: usize,
151        right_col: usize,
152        how: JoinType,
153    ) -> Result<Vec<Vec<String>>> {
154        use std::collections::HashMap;
155
156        if left.is_empty() || right.is_empty() {
157            return Ok(Vec::new());
158        }
159
160        let mut right_index: HashMap<String, Vec<usize>> = HashMap::new();
161        for (idx, row) in right.iter().enumerate() {
162            if let Some(key) = row.get(right_col) {
163                right_index.entry(key.clone()).or_default().push(idx);
164            }
165        }
166
167        let right_width = right.iter().map(|r| r.len()).max().unwrap_or(0);
168        let empty_right: Vec<String> = vec![String::new(); right_width];
169
170        let mut result = Vec::new();
171        let mut matched_right: std::collections::HashSet<usize> = std::collections::HashSet::new();
172
173        for left_row in left {
174            let key = left_row.get(left_col).cloned().unwrap_or_default();
175
176            if let Some(right_indices) = right_index.get(&key) {
177                for &right_idx in right_indices {
178                    matched_right.insert(right_idx);
179                    let mut new_row = left_row.clone();
180                    for (idx, val) in right[right_idx].iter().enumerate() {
181                        if idx != right_col {
182                            new_row.push(val.clone());
183                        }
184                    }
185                    result.push(new_row);
186                }
187            } else if matches!(how, JoinType::Left | JoinType::Outer) {
188                let mut new_row = left_row.clone();
189                for (idx, val) in empty_right.iter().enumerate() {
190                    if idx != right_col {
191                        new_row.push(val.clone());
192                    }
193                }
194                result.push(new_row);
195            }
196        }
197
198        if matches!(how, JoinType::Right | JoinType::Outer) {
199            let left_width = left.iter().map(|r| r.len()).max().unwrap_or(0);
200            let empty_left: Vec<String> = vec![String::new(); left_width];
201
202            for (idx, right_row) in right.iter().enumerate() {
203                if !matched_right.contains(&idx) {
204                    let mut new_row = empty_left.clone();
205                    if let Some(key) = right_row.get(right_col) {
206                        if left_col < new_row.len() {
207                            new_row[left_col] = key.clone();
208                        }
209                    }
210                    for (i, val) in right_row.iter().enumerate() {
211                        if i != right_col {
212                            new_row.push(val.clone());
213                        }
214                    }
215                    result.push(new_row);
216                }
217            }
218        }
219
220        Ok(result)
221    }
222
223    /// Group by column with aggregations
224    pub fn groupby(
225        &self,
226        data: &[Vec<String>],
227        group_col: usize,
228        aggregations: &[(usize, AggFunc)],
229    ) -> Result<Vec<Vec<String>>> {
230        use std::collections::HashMap;
231
232        if data.is_empty() {
233            return Ok(Vec::new());
234        }
235
236        let header = &data[0];
237        let mut groups: HashMap<String, Vec<Vec<f64>>> = HashMap::new();
238
239        for row in data.iter().skip(1) {
240            let key = row.get(group_col).cloned().unwrap_or_default();
241            let entry = groups
242                .entry(key)
243                .or_insert_with(|| vec![Vec::new(); aggregations.len()]);
244
245            for (i, (col, _)) in aggregations.iter().enumerate() {
246                if let Some(val) = row.get(*col).and_then(|v| v.parse::<f64>().ok()) {
247                    entry[i].push(val);
248                }
249            }
250        }
251
252        let mut result = Vec::new();
253
254        // Header
255        let mut result_header = vec![
256            header
257                .get(group_col)
258                .cloned()
259                .unwrap_or_else(|| "group".to_string()),
260        ];
261        for (col, agg) in aggregations {
262            let col_name = header
263                .get(*col)
264                .cloned()
265                .unwrap_or_else(|| format!("col_{}", col));
266            result_header.push(format!("{}_{}", agg.name(), col_name));
267        }
268        result.push(result_header);
269
270        // Data
271        let mut keys: Vec<_> = groups.keys().cloned().collect();
272        keys.sort();
273
274        for key in keys {
275            let values = &groups[&key];
276            let mut row = vec![key];
277            for (i, (_, agg)) in aggregations.iter().enumerate() {
278                let agg_val = agg.apply(&values[i]);
279                row.push(format!("{:.2}", agg_val));
280            }
281            result.push(row);
282        }
283
284        Ok(result)
285    }
286
287    /// Unpivot wide → long: repeat `id_vars` for each `value_vars` column; add `variable` and `value`.
288    ///
289    /// If `value_vars` is empty, uses every column index not listed in `id_vars`.
290    pub fn melt(
291        &self,
292        data: &[Vec<String>],
293        id_vars: &[usize],
294        value_vars: &[usize],
295    ) -> Result<Vec<Vec<String>>> {
296        use std::collections::HashSet;
297
298        if data.is_empty() {
299            return Ok(Vec::new());
300        }
301
302        let header = &data[0];
303        let max_len = data.iter().map(|r| r.len()).max().unwrap_or(0);
304
305        for &i in id_vars {
306            if i >= max_len {
307                anyhow::bail!("id column index {} out of range", i);
308            }
309        }
310
311        let id_set: HashSet<usize> = id_vars.iter().copied().collect();
312        let value_indices: Vec<usize> = if value_vars.is_empty() {
313            (0..header.len()).filter(|i| !id_set.contains(i)).collect()
314        } else {
315            for &i in value_vars {
316                if i >= max_len {
317                    anyhow::bail!("value column index {} out of range", i);
318                }
319                if id_set.contains(&i) {
320                    anyhow::bail!("value column {} cannot also be an id column", i);
321                }
322            }
323            value_vars.to_vec()
324        };
325
326        if value_indices.is_empty() {
327            anyhow::bail!("melt: no value columns (add id_vars or pass value_vars)");
328        }
329
330        let mut out_header: Vec<String> = id_vars
331            .iter()
332            .map(|&i| {
333                header
334                    .get(i)
335                    .cloned()
336                    .unwrap_or_else(|| format!("col_{}", i))
337            })
338            .collect();
339        out_header.push("variable".to_string());
340        out_header.push("value".to_string());
341
342        let mut result = vec![out_header];
343
344        for row in data.iter().skip(1) {
345            for &v in &value_indices {
346                let mut new_row: Vec<String> = id_vars
347                    .iter()
348                    .map(|&i| row.get(i).cloned().unwrap_or_default())
349                    .collect();
350                let var_name = header
351                    .get(v)
352                    .cloned()
353                    .unwrap_or_else(|| format!("col_{}", v));
354                let val = row.get(v).cloned().unwrap_or_default();
355                new_row.push(var_name);
356                new_row.push(val);
357                result.push(new_row);
358            }
359        }
360
361        Ok(result)
362    }
363}