1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
//! Selection functionality for OptimizedDataFrame
use std::collections::HashSet;
use crate::column::Column;
use crate::core::error::OptionExt;
use crate::error::Result;
use crate::optimized::split_dataframe::core::OptimizedDataFrame;
impl OptimizedDataFrame {
/// Select columns to create a new DataFrame
///
/// # Arguments
/// * `columns` - Array of column names to select
///
/// # Returns
/// * `Result<Self>` - New DataFrame with selected columns
pub fn select_columns(&self, columns: &[&str]) -> Result<Self> {
let mut df = Self::new();
// Create a set of column names (for existence check)
let column_set: HashSet<&str> = self.column_names.iter().map(|s| s.as_str()).collect();
// Add specified columns to the new DataFrame
for &col_name in columns {
if !column_set.contains(col_name) {
// Return error if column doesn't exist
return Err(crate::error::Error::ColumnNotFound(col_name.to_string()));
}
let col_idx = self
.column_indices
.get(col_name)
.ok_or_column_error(col_name)?;
let column = &self.columns[*col_idx];
df.add_column(col_name.to_string(), column.clone())?;
}
// Copy index
if let Some(ref index) = self.index {
df.index = Some(index.clone());
}
Ok(df)
}
/// Select rows by index
///
/// # Arguments
/// * `indices` - Array of row indices to select
///
/// # Returns
/// * `Result<Self>` - New DataFrame with selected rows
///
/// Note: A method with the same name exists in sort.rs but that one is private
pub fn select_rows_by_indices(&self, indices: &[usize]) -> Result<Self> {
let mut df = Self::new();
// Process each column
for (col_idx, col_name) in self.column_names.iter().enumerate() {
let column = &self.columns[col_idx];
// Extract data from selected rows
let new_column = match column {
Column::Int64(col) => {
let values: Vec<i64> = indices
.iter()
.filter_map(|&idx| {
if idx < self.row_count {
col.get(idx).ok().flatten()
} else {
None
}
})
.collect();
Column::Int64(crate::column::Int64Column::new(values))
}
Column::Float64(col) => {
let values: Vec<f64> = indices
.iter()
.filter_map(|&idx| {
if idx < self.row_count {
col.get(idx).ok().flatten()
} else {
None
}
})
.collect();
Column::Float64(crate::column::Float64Column::new(values))
}
Column::String(col) => {
let values: Vec<String> = indices
.iter()
.filter_map(|&idx| {
if idx < self.row_count {
col.get(idx).ok().flatten().map(|s| s.to_string())
} else {
None
}
})
.collect();
Column::String(crate::column::StringColumn::new(values))
}
Column::Boolean(col) => {
let values: Vec<bool> = indices
.iter()
.filter_map(|&idx| {
if idx < self.row_count {
col.get(idx).ok().flatten()
} else {
None
}
})
.collect();
Column::Boolean(crate::column::BooleanColumn::new(values))
}
};
df.add_column(col_name.clone(), new_column)?;
}
// Create new index
// NOTE: We could extract corresponding values from the existing index,
// but for simplicity, we create a new sequential index here
df.set_default_index()?;
Ok(df)
}
/// Select both rows and columns
///
/// # Arguments
/// * `row_indices` - Array of row indices to select
/// * `columns` - Array of column names to select
///
/// # Returns
/// * `Result<Self>` - New DataFrame with selected rows and columns
pub fn select_rows_columns(&self, row_indices: &[usize], columns: &[&str]) -> Result<Self> {
// First select columns
let cols_selected = self.select_columns(columns)?;
// Then select rows
cols_selected.select_rows_by_indices(row_indices)
}
/// Select rows using a mask
///
/// # Arguments
/// * `mask` - Boolean vector representing selection condition (True rows are selected)
///
/// # Returns
/// * `Result<Self>` - New DataFrame with rows matching the condition
pub fn select_by_mask(&self, mask: &[bool]) -> Result<Self> {
if mask.len() != self.row_count {
return Err(crate::error::Error::Format(format!(
"Mask length ({}) does not match DataFrame row count ({})",
mask.len(),
self.row_count
)));
}
// Create a list of indices from the mask
let indices: Vec<usize> = mask
.iter()
.enumerate()
.filter_map(|(i, &keep)| if keep { Some(i) } else { None })
.collect();
// Execute selection by indices
self.select_rows_by_indices(&indices)
}
}
/// Implementation for selecting rows based on row indices (used by other modules)
pub(crate) fn select_rows_by_indices_impl(
df: &OptimizedDataFrame,
indices: &[usize],
) -> Result<OptimizedDataFrame> {
// Return an empty DataFrame if there are no rows
if indices.is_empty() {
return Ok(OptimizedDataFrame::new());
}
let mut result = OptimizedDataFrame::new();
// Process each column
for (name, &column_idx) in &df.column_indices {
let column = &df.columns[column_idx];
// Get data from row indices based on column type
let selected_col = match column {
Column::Int64(col) => {
let selected_data: Vec<i64> = indices
.iter()
.map(|&idx| col.get(idx).ok().flatten().unwrap_or_default())
.collect();
Column::Int64(crate::column::Int64Column::new(selected_data))
}
Column::Float64(col) => {
let selected_data: Vec<f64> = indices
.iter()
.map(|&idx| col.get(idx).ok().flatten().unwrap_or_default())
.collect();
Column::Float64(crate::column::Float64Column::new(selected_data))
}
Column::String(col) => {
let selected_data: Vec<String> = indices
.iter()
.map(|&idx| {
col.get(idx)
.ok()
.flatten()
.map(|s| s.to_string())
.unwrap_or_default()
})
.collect();
Column::String(crate::column::StringColumn::new(selected_data))
}
Column::Boolean(col) => {
let selected_data: Vec<bool> = indices
.iter()
.map(|&idx| col.get(idx).ok().flatten().unwrap_or_default())
.collect();
Column::Boolean(crate::column::BooleanColumn::new(selected_data))
}
};
result.add_column(name.clone(), selected_col)?;
}
// Get index and select appropriate index values for selected rows
if let Some(ref idx) = df.get_index() {
// Process index selection based on the selected row indices
match idx {
crate::index::DataFrameIndex::Simple(simple_idx) => {
// Create new index with values corresponding to selected rows
let selected_index_values: Vec<String> = indices
.iter()
.filter_map(|&row_idx| {
if row_idx < simple_idx.len() {
simple_idx.get_value(row_idx).cloned()
} else {
None
}
})
.collect();
// Create new simple index from selected values
let new_index = crate::index::Index::new(selected_index_values)?;
result.set_index(crate::index::DataFrameIndex::Simple(new_index))?;
}
crate::index::DataFrameIndex::Multi(multi_idx) => {
// For multi-index, extract the corresponding rows
let selected_multi_values: Vec<Vec<String>> = indices
.iter()
.filter_map(|&row_idx| {
if row_idx < multi_idx.len() {
multi_idx.get_tuple(row_idx)
} else {
None
}
})
.collect();
// Create new multi-index from selected rows
if !selected_multi_values.is_empty() {
let level_names: Vec<Option<String>> =
multi_idx.names().iter().cloned().collect();
let new_multi_index = crate::index::MultiIndex::from_tuples(
selected_multi_values,
Some(level_names),
)?;
result.set_index(crate::index::DataFrameIndex::Multi(new_multi_index))?;
} else {
// Fallback to default index if no valid rows selected
result.set_default_index()?;
}
}
}
}
Ok(result)
}