xls_rs/operations/
pandas.rs1use super::core::DataOperations;
4use super::types::{AggFunc, JoinType};
5use anyhow::Result;
6
7impl DataOperations {
8 pub fn select_columns(&self, data: &[Vec<String>], columns: &[usize]) -> Vec<Vec<String>> {
10 data.iter()
11 .map(|row| {
12 columns
13 .iter()
14 .map(|&idx| row.get(idx).cloned().unwrap_or_default())
15 .collect()
16 })
17 .collect()
18 }
19
20 pub fn select_columns_by_name(
22 &self,
23 data: &[Vec<String>],
24 names: &[&str],
25 ) -> Result<Vec<Vec<String>>> {
26 if data.is_empty() {
27 return Ok(Vec::new());
28 }
29
30 let header = &data[0];
31 let indices: Vec<usize> = names
32 .iter()
33 .map(|name| {
34 header
35 .iter()
36 .position(|h| h == *name)
37 .ok_or_else(|| anyhow::anyhow!("Column '{}' not found", name))
38 })
39 .collect::<Result<Vec<_>>>()?;
40
41 Ok(self.select_columns(data, &indices))
42 }
43
44 pub fn head(&self, data: &[Vec<String>], n: usize) -> Vec<Vec<String>> {
46 data.iter().take(n).cloned().collect()
47 }
48
49 pub fn tail(&self, data: &[Vec<String>], n: usize) -> Vec<Vec<String>> {
51 let len = data.len();
52 if n >= len {
53 data.to_vec()
54 } else {
55 data[len - n..].to_vec()
56 }
57 }
58
59 pub fn sample(&self, data: &[Vec<String>], n: usize, seed: Option<u64>) -> Vec<Vec<String>> {
61 use std::collections::HashSet;
62
63 if n >= data.len() {
64 return data.to_vec();
65 }
66
67 let mut rng_state = seed.unwrap_or(42);
68 let mut next_rand = || {
69 rng_state = rng_state.wrapping_mul(6364136223846793005).wrapping_add(1);
70 rng_state
71 };
72
73 let mut indices = HashSet::new();
74 while indices.len() < n {
75 let idx = (next_rand() as usize) % data.len();
76 indices.insert(idx);
77 }
78
79 let mut result: Vec<Vec<String>> = indices.iter().map(|&idx| data[idx].clone()).collect();
80 result.sort_by_key(|_| next_rand());
81 result
82 }
83
84 pub fn drop_columns(&self, data: &[Vec<String>], columns: &[usize]) -> Vec<Vec<String>> {
86 let drop_set: std::collections::HashSet<usize> = columns.iter().copied().collect();
87 data.iter()
88 .map(|row| {
89 row.iter()
90 .enumerate()
91 .filter(|(idx, _)| !drop_set.contains(idx))
92 .map(|(_, val)| val.clone())
93 .collect()
94 })
95 .collect()
96 }
97
98 pub fn rename_columns(
100 &self,
101 data: &mut Vec<Vec<String>>,
102 renames: &[(&str, &str)],
103 ) -> Result<()> {
104 if data.is_empty() {
105 return Ok(());
106 }
107
108 let header = &mut data[0];
109 for (old_name, new_name) in renames {
110 if let Some(pos) = header.iter().position(|h| h == *old_name) {
111 header[pos] = new_name.to_string();
112 }
113 }
114 Ok(())
115 }
116
117 pub fn fillna(&self, data: &mut Vec<Vec<String>>, value: &str) {
119 for row in data.iter_mut() {
120 for cell in row.iter_mut() {
121 if cell.is_empty() {
122 *cell = value.to_string();
123 }
124 }
125 }
126 }
127
128 pub fn dropna(&self, data: &[Vec<String>]) -> Vec<Vec<String>> {
130 data.iter()
131 .filter(|row| !row.iter().any(|cell| cell.is_empty()))
132 .cloned()
133 .collect()
134 }
135
136 pub fn concat(&self, datasets: &[Vec<Vec<String>>]) -> Vec<Vec<String>> {
138 let mut result = Vec::new();
139 for dataset in datasets {
140 result.extend(dataset.iter().cloned());
141 }
142 result
143 }
144
145 pub fn join(
147 &self,
148 left: &[Vec<String>],
149 right: &[Vec<String>],
150 left_col: usize,
151 right_col: usize,
152 how: JoinType,
153 ) -> Result<Vec<Vec<String>>> {
154 use std::collections::HashMap;
155
156 if left.is_empty() || right.is_empty() {
157 return Ok(Vec::new());
158 }
159
160 let mut right_index: HashMap<String, Vec<usize>> = HashMap::new();
161 for (idx, row) in right.iter().enumerate() {
162 if let Some(key) = row.get(right_col) {
163 right_index.entry(key.clone()).or_default().push(idx);
164 }
165 }
166
167 let right_width = right.iter().map(|r| r.len()).max().unwrap_or(0);
168 let empty_right: Vec<String> = vec![String::new(); right_width];
169
170 let mut result = Vec::new();
171 let mut matched_right: std::collections::HashSet<usize> = std::collections::HashSet::new();
172
173 for left_row in left {
174 let key = left_row.get(left_col).cloned().unwrap_or_default();
175
176 if let Some(right_indices) = right_index.get(&key) {
177 for &right_idx in right_indices {
178 matched_right.insert(right_idx);
179 let mut new_row = left_row.clone();
180 for (idx, val) in right[right_idx].iter().enumerate() {
181 if idx != right_col {
182 new_row.push(val.clone());
183 }
184 }
185 result.push(new_row);
186 }
187 } else if matches!(how, JoinType::Left | JoinType::Outer) {
188 let mut new_row = left_row.clone();
189 for (idx, val) in empty_right.iter().enumerate() {
190 if idx != right_col {
191 new_row.push(val.clone());
192 }
193 }
194 result.push(new_row);
195 }
196 }
197
198 if matches!(how, JoinType::Right | JoinType::Outer) {
199 let left_width = left.iter().map(|r| r.len()).max().unwrap_or(0);
200 let empty_left: Vec<String> = vec![String::new(); left_width];
201
202 for (idx, right_row) in right.iter().enumerate() {
203 if !matched_right.contains(&idx) {
204 let mut new_row = empty_left.clone();
205 if let Some(key) = right_row.get(right_col) {
206 if left_col < new_row.len() {
207 new_row[left_col] = key.clone();
208 }
209 }
210 for (i, val) in right_row.iter().enumerate() {
211 if i != right_col {
212 new_row.push(val.clone());
213 }
214 }
215 result.push(new_row);
216 }
217 }
218 }
219
220 Ok(result)
221 }
222
223 pub fn groupby(
225 &self,
226 data: &[Vec<String>],
227 group_col: usize,
228 aggregations: &[(usize, AggFunc)],
229 ) -> Result<Vec<Vec<String>>> {
230 use std::collections::HashMap;
231
232 if data.is_empty() {
233 return Ok(Vec::new());
234 }
235
236 let header = &data[0];
237 let mut groups: HashMap<String, Vec<Vec<f64>>> = HashMap::new();
238
239 for row in data.iter().skip(1) {
240 let key = row.get(group_col).cloned().unwrap_or_default();
241 let entry = groups
242 .entry(key)
243 .or_insert_with(|| vec![Vec::new(); aggregations.len()]);
244
245 for (i, (col, _)) in aggregations.iter().enumerate() {
246 if let Some(val) = row.get(*col).and_then(|v| v.parse::<f64>().ok()) {
247 entry[i].push(val);
248 }
249 }
250 }
251
252 let mut result = Vec::new();
253
254 let mut result_header = vec![
256 header
257 .get(group_col)
258 .cloned()
259 .unwrap_or_else(|| "group".to_string()),
260 ];
261 for (col, agg) in aggregations {
262 let col_name = header
263 .get(*col)
264 .cloned()
265 .unwrap_or_else(|| format!("col_{}", col));
266 result_header.push(format!("{}_{}", agg.name(), col_name));
267 }
268 result.push(result_header);
269
270 let mut keys: Vec<_> = groups.keys().cloned().collect();
272 keys.sort();
273
274 for key in keys {
275 let values = &groups[&key];
276 let mut row = vec![key];
277 for (i, (_, agg)) in aggregations.iter().enumerate() {
278 let agg_val = agg.apply(&values[i]);
279 row.push(format!("{:.2}", agg_val));
280 }
281 result.push(row);
282 }
283
284 Ok(result)
285 }
286
287 pub fn melt(
291 &self,
292 data: &[Vec<String>],
293 id_vars: &[usize],
294 value_vars: &[usize],
295 ) -> Result<Vec<Vec<String>>> {
296 use std::collections::HashSet;
297
298 if data.is_empty() {
299 return Ok(Vec::new());
300 }
301
302 let header = &data[0];
303 let max_len = data.iter().map(|r| r.len()).max().unwrap_or(0);
304
305 for &i in id_vars {
306 if i >= max_len {
307 anyhow::bail!("id column index {} out of range", i);
308 }
309 }
310
311 let id_set: HashSet<usize> = id_vars.iter().copied().collect();
312 let value_indices: Vec<usize> = if value_vars.is_empty() {
313 (0..header.len()).filter(|i| !id_set.contains(i)).collect()
314 } else {
315 for &i in value_vars {
316 if i >= max_len {
317 anyhow::bail!("value column index {} out of range", i);
318 }
319 if id_set.contains(&i) {
320 anyhow::bail!("value column {} cannot also be an id column", i);
321 }
322 }
323 value_vars.to_vec()
324 };
325
326 if value_indices.is_empty() {
327 anyhow::bail!("melt: no value columns (add id_vars or pass value_vars)");
328 }
329
330 let mut out_header: Vec<String> = id_vars
331 .iter()
332 .map(|&i| {
333 header
334 .get(i)
335 .cloned()
336 .unwrap_or_else(|| format!("col_{}", i))
337 })
338 .collect();
339 out_header.push("variable".to_string());
340 out_header.push("value".to_string());
341
342 let mut result = vec![out_header];
343
344 for row in data.iter().skip(1) {
345 for &v in &value_indices {
346 let mut new_row: Vec<String> = id_vars
347 .iter()
348 .map(|&i| row.get(i).cloned().unwrap_or_default())
349 .collect();
350 let var_name = header
351 .get(v)
352 .cloned()
353 .unwrap_or_else(|| format!("col_{}", v));
354 let val = row.get(v).cloned().unwrap_or_default();
355 new_row.push(var_name);
356 new_row.push(val);
357 result.push(new_row);
358 }
359 }
360
361 Ok(result)
362 }
363}