1use crate::error::DataFrameError;
4use crate::scalar::Scalar;
5
/// Logical element type of a column.
///
/// Each [`ColumnData`] variant reports exactly one of these through
/// [`ColumnData::dtype`]; the `ColumnData::String` variant reports
/// [`DataType::Utf8`].
///
/// The enum is `#[non_exhaustive]`, so matches written outside this crate
/// need a wildcard arm. It derives `Hash` alongside `Eq`, which lets a
/// `DataType` be used directly as a key in `HashMap`/`HashSet`.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
#[non_exhaustive]
pub enum DataType {
    /// Boolean cells (`bool`).
    Bool,
    /// Signed 64-bit integer cells (`i64`).
    Int64,
    /// Unsigned 64-bit integer cells (`u64`).
    UInt64,
    /// 64-bit floating point cells (`f64`).
    Float64,
    /// String cells (stored as Rust `String`).
    Utf8,
}
16
17#[derive(Debug, Clone)]
19#[non_exhaustive]
20pub enum ColumnData {
21 Bool(Vec<Option<bool>>),
22 Int64(Vec<Option<i64>>),
23 UInt64(Vec<Option<u64>>),
24 Float64(Vec<Option<f64>>),
25 String(Vec<Option<String>>),
26}
27
28impl ColumnData {
29 #[must_use]
31 pub fn len(&self) -> usize {
32 match self {
33 Self::Bool(v) => v.len(),
34 Self::Int64(v) => v.len(),
35 Self::UInt64(v) => v.len(),
36 Self::Float64(v) => v.len(),
37 Self::String(v) => v.len(),
38 }
39 }
40
41 #[must_use]
43 pub fn is_empty(&self) -> bool {
44 self.len() == 0
45 }
46
47 #[must_use]
49 pub fn dtype(&self) -> DataType {
50 match self {
51 Self::Bool(_) => DataType::Bool,
52 Self::Int64(_) => DataType::Int64,
53 Self::UInt64(_) => DataType::UInt64,
54 Self::Float64(_) => DataType::Float64,
55 Self::String(_) => DataType::Utf8,
56 }
57 }
58
59 #[must_use]
61 pub fn get(&self, index: usize) -> Option<Scalar> {
62 match self {
63 Self::Bool(v) => v.get(index).map(|o| match o {
64 Some(b) => Scalar::Bool(*b),
65 None => Scalar::Null,
66 }),
67 Self::Int64(v) => v.get(index).map(|o| match o {
68 Some(n) => Scalar::Int64(*n),
69 None => Scalar::Null,
70 }),
71 Self::UInt64(v) => v.get(index).map(|o| match o {
72 Some(n) => Scalar::UInt64(*n),
73 None => Scalar::Null,
74 }),
75 Self::Float64(v) => v.get(index).map(|o| match o {
76 Some(n) => Scalar::Float64(*n),
77 None => Scalar::Null,
78 }),
79 Self::String(v) => v.get(index).map(|o| match o {
80 Some(s) => Scalar::String(s.clone()),
81 None => Scalar::Null,
82 }),
83 }
84 }
85
86 #[must_use]
88 pub fn non_null_count(&self) -> usize {
89 match self {
90 Self::Bool(v) => v.iter().filter(|o| o.is_some()).count(),
91 Self::Int64(v) => v.iter().filter(|o| o.is_some()).count(),
92 Self::UInt64(v) => v.iter().filter(|o| o.is_some()).count(),
93 Self::Float64(v) => v.iter().filter(|o| o.is_some()).count(),
94 Self::String(v) => v.iter().filter(|o| o.is_some()).count(),
95 }
96 }
97
98 pub fn take(&self, indices: &[usize]) -> Self {
106 #[allow(
107 clippy::indexing_slicing,
108 reason = "indices are always derived from 0..len() in sort/filter/group_by — bounds are structurally guaranteed by callers"
109 )]
110 match self {
111 Self::Bool(v) => Self::Bool(indices.iter().map(|&i| v[i]).collect()),
112 Self::Int64(v) => Self::Int64(indices.iter().map(|&i| v[i]).collect()),
113 Self::UInt64(v) => Self::UInt64(indices.iter().map(|&i| v[i]).collect()),
114 Self::Float64(v) => Self::Float64(indices.iter().map(|&i| v[i]).collect()),
115 Self::String(v) => Self::String(indices.iter().map(|&i| v[i].clone()).collect()),
116 }
117 }
118
119 pub fn take_optional(&self, indices: &[Option<usize>]) -> Self {
124 #[allow(
125 clippy::indexing_slicing,
126 reason = "Some(i) indices are derived from 0..len() in join probe — bounds are structurally guaranteed by callers"
127 )]
128 match self {
129 Self::Bool(v) => Self::Bool(indices.iter().map(|opt| opt.and_then(|i| v[i])).collect()),
130 Self::Int64(v) => {
131 Self::Int64(indices.iter().map(|opt| opt.and_then(|i| v[i])).collect())
132 }
133 Self::UInt64(v) => {
134 Self::UInt64(indices.iter().map(|opt| opt.and_then(|i| v[i])).collect())
135 }
136 Self::Float64(v) => {
137 Self::Float64(indices.iter().map(|opt| opt.and_then(|i| v[i])).collect())
138 }
139 Self::String(v) => Self::String(
140 indices
141 .iter()
142 .map(|opt| opt.and_then(|i| v[i].clone()))
143 .collect(),
144 ),
145 }
146 }
147}
148
149#[derive(Debug, Clone)]
151pub struct Column {
152 name: String,
153 data: ColumnData,
154}
155
156impl Column {
157 pub fn new_bool(name: impl Into<String>, data: Vec<Option<bool>>) -> Self {
163 Self {
164 name: name.into(),
165 data: ColumnData::Bool(data),
166 }
167 }
168
169 pub fn new_i64(name: impl Into<String>, data: Vec<Option<i64>>) -> Self {
171 Self {
172 name: name.into(),
173 data: ColumnData::Int64(data),
174 }
175 }
176
177 pub fn new_u64(name: impl Into<String>, data: Vec<Option<u64>>) -> Self {
179 Self {
180 name: name.into(),
181 data: ColumnData::UInt64(data),
182 }
183 }
184
185 pub fn new_f64(name: impl Into<String>, data: Vec<Option<f64>>) -> Self {
187 Self {
188 name: name.into(),
189 data: ColumnData::Float64(data),
190 }
191 }
192
193 pub fn new_string(name: impl Into<String>, data: Vec<Option<String>>) -> Self {
195 Self {
196 name: name.into(),
197 data: ColumnData::String(data),
198 }
199 }
200
201 pub fn from_bools(name: impl Into<String>, data: Vec<bool>) -> Self {
207 Self::new_bool(name, data.into_iter().map(Some).collect())
208 }
209
210 pub fn from_i64s(name: impl Into<String>, data: Vec<i64>) -> Self {
212 Self::new_i64(name, data.into_iter().map(Some).collect())
213 }
214
215 pub fn from_u64s(name: impl Into<String>, data: Vec<u64>) -> Self {
217 Self::new_u64(name, data.into_iter().map(Some).collect())
218 }
219
220 pub fn from_f64s(name: impl Into<String>, data: Vec<f64>) -> Self {
222 Self::new_f64(name, data.into_iter().map(Some).collect())
223 }
224
225 pub fn from_strings(name: impl Into<String>, data: Vec<String>) -> Self {
227 Self::new_string(name, data.into_iter().map(Some).collect())
228 }
229
230 pub fn from_strs(name: impl Into<String>, data: &[&str]) -> Self {
232 Self::new_string(name, data.iter().map(|s| Some((*s).to_string())).collect())
233 }
234
235 #[must_use]
241 pub fn name(&self) -> &str {
242 &self.name
243 }
244
245 #[must_use]
247 pub fn dtype(&self) -> DataType {
248 self.data.dtype()
249 }
250
251 #[must_use]
253 pub fn len(&self) -> usize {
254 self.data.len()
255 }
256
257 #[must_use]
259 pub fn is_empty(&self) -> bool {
260 self.data.is_empty()
261 }
262
263 #[must_use]
265 pub fn non_null_count(&self) -> usize {
266 self.data.non_null_count()
267 }
268
269 #[must_use]
271 pub fn null_count(&self) -> usize {
272 #[allow(
274 clippy::arithmetic_side_effects,
275 reason = "non_null_count() is always <= len() by construction — both count the same Vec elements"
276 )]
277 {
278 self.len() - self.non_null_count()
279 }
280 }
281
282 #[must_use]
284 pub fn data(&self) -> &ColumnData {
285 &self.data
286 }
287
288 #[must_use]
290 pub fn get(&self, index: usize) -> Option<Scalar> {
291 self.data.get(index)
292 }
293
294 #[must_use]
296 pub fn rename(&self, name: impl Into<String>) -> Self {
297 Self {
298 name: name.into(),
299 data: self.data.clone(),
300 }
301 }
302
303 pub fn take(&self, indices: &[usize]) -> Self {
305 Self {
306 name: self.name.clone(),
307 data: self.data.take(indices),
308 }
309 }
310
311 pub fn take_optional(&self, indices: &[Option<usize>]) -> Self {
314 Self {
315 name: self.name.clone(),
316 data: self.data.take_optional(indices),
317 }
318 }
319
320 pub fn as_str_iter(&self) -> Result<impl Iterator<Item = Option<&str>>, DataFrameError> {
326 match &self.data {
327 ColumnData::String(v) => Ok(v.iter().map(|o| o.as_deref())),
328 ColumnData::Bool(_)
329 | ColumnData::Int64(_)
330 | ColumnData::UInt64(_)
331 | ColumnData::Float64(_) => Err(DataFrameError::TypeMismatch {
332 column: self.name.clone(),
333 expected: DataType::Utf8,
334 actual: self.dtype(),
335 }),
336 }
337 }
338
339 pub fn as_i64_iter(&self) -> Result<impl Iterator<Item = Option<i64>> + '_, DataFrameError> {
341 match &self.data {
342 ColumnData::Int64(v) => Ok(v.iter().copied()),
343 ColumnData::Bool(_)
344 | ColumnData::UInt64(_)
345 | ColumnData::Float64(_)
346 | ColumnData::String(_) => Err(DataFrameError::TypeMismatch {
347 column: self.name.clone(),
348 expected: DataType::Int64,
349 actual: self.dtype(),
350 }),
351 }
352 }
353
354 pub fn as_u64_iter(&self) -> Result<impl Iterator<Item = Option<u64>> + '_, DataFrameError> {
356 match &self.data {
357 ColumnData::UInt64(v) => Ok(v.iter().copied()),
358 ColumnData::Bool(_)
359 | ColumnData::Int64(_)
360 | ColumnData::Float64(_)
361 | ColumnData::String(_) => Err(DataFrameError::TypeMismatch {
362 column: self.name.clone(),
363 expected: DataType::UInt64,
364 actual: self.dtype(),
365 }),
366 }
367 }
368
369 pub fn as_f64_iter(&self) -> Result<impl Iterator<Item = Option<f64>> + '_, DataFrameError> {
371 match &self.data {
372 ColumnData::Float64(v) => Ok(v.iter().copied()),
373 ColumnData::Bool(_)
374 | ColumnData::Int64(_)
375 | ColumnData::UInt64(_)
376 | ColumnData::String(_) => Err(DataFrameError::TypeMismatch {
377 column: self.name.clone(),
378 expected: DataType::Float64,
379 actual: self.dtype(),
380 }),
381 }
382 }
383
384 pub fn as_bool_iter(&self) -> Result<impl Iterator<Item = Option<bool>> + '_, DataFrameError> {
386 match &self.data {
387 ColumnData::Bool(v) => Ok(v.iter().copied()),
388 ColumnData::Int64(_)
389 | ColumnData::UInt64(_)
390 | ColumnData::Float64(_)
391 | ColumnData::String(_) => Err(DataFrameError::TypeMismatch {
392 column: self.name.clone(),
393 expected: DataType::Bool,
394 actual: self.dtype(),
395 }),
396 }
397 }
398
399 pub fn get_str(&self, index: usize) -> Result<Option<&str>, DataFrameError> {
401 match &self.data {
402 ColumnData::String(v) => match v.get(index) {
403 Some(o) => Ok(o.as_deref()),
404 None => Err(DataFrameError::IndexOutOfBounds {
405 index,
406 length: v.len(),
407 }),
408 },
409 ColumnData::Bool(_)
410 | ColumnData::Int64(_)
411 | ColumnData::UInt64(_)
412 | ColumnData::Float64(_) => Err(DataFrameError::TypeMismatch {
413 column: self.name.clone(),
414 expected: DataType::Utf8,
415 actual: self.dtype(),
416 }),
417 }
418 }
419}
420
#[cfg(test)]
mod tests {
    use super::*;

    /// `from_strs` builds a fully populated Utf8 column with the given name.
    #[test]
    fn from_strs_construction() {
        let names = Column::from_strs("names", &["alice", "bob", "carol"]);
        assert_eq!(names.name(), "names");
        assert_eq!(names.dtype(), DataType::Utf8);
        // (total, non-null, null)
        assert_eq!(
            (names.len(), names.non_null_count(), names.null_count()),
            (3, 3, 0)
        );
    }

    /// Null cells are counted separately and surface as `Scalar::Null`;
    /// an out-of-range row yields `None`.
    #[test]
    fn nullable_column() {
        let col = Column::new_i64("x", vec![Some(1), None, Some(3)]);
        assert_eq!(col.len(), 3);
        assert_eq!(col.non_null_count(), 2);
        assert_eq!(col.null_count(), 1);

        let probes = [
            (0, Some(Scalar::Int64(1))),
            (1, Some(Scalar::Null)),
            (3, None),
        ];
        for (row, want) in probes {
            assert_eq!(col.get(row), want);
        }
    }

    /// The matching typed iterator succeeds; a mismatched one is an error.
    #[test]
    fn typed_iterators() {
        let nums = Column::from_i64s("nums", vec![10, 20, 30]);
        let Ok(cells) = nums.as_i64_iter() else {
            unreachable!()
        };
        assert_eq!(cells.collect::<Vec<_>>(), vec![Some(10), Some(20), Some(30)]);

        assert!(nums.as_str_iter().is_err());
    }

    /// `take` selects rows in the order the indices are given.
    #[test]
    fn take_indices() {
        let source = Column::from_strs("x", &["a", "b", "c", "d"]);
        let picked = source.take(&[0, 2, 3]);
        assert_eq!(picked.len(), 3);
        for (row, want) in (0..).zip([Some("a"), Some("c"), Some("d")]) {
            assert_eq!(picked.get_str(row).unwrap_or(None), want);
        }
    }

    /// `rename` changes the name and keeps the data.
    #[test]
    fn rename_column() {
        let original = Column::from_i64s("old", vec![1, 2]);
        let renamed = original.rename("new");
        assert_eq!(renamed.name(), "new");
        assert_eq!(renamed.len(), 2);
    }

    /// `take_optional` turns each `None` index into a null cell.
    #[test]
    fn take_optional_indices() {
        let words = Column::from_strs("x", &["a", "b", "c"]);
        let picked = words.take_optional(&[Some(0), None, Some(2)]);
        assert_eq!(picked.len(), 3);
        for (row, want) in (0..).zip([Some("a"), None, Some("c")]) {
            assert_eq!(picked.get_str(row).unwrap_or(None), want);
        }

        let nums = Column::from_i64s("n", vec![10, 20, 30]);
        let picked = nums.take_optional(&[None, Some(1), Some(2)]);
        let expected = [Scalar::Null, Scalar::Int64(20), Scalar::Int64(30)];
        for (row, want) in (0..).zip(expected) {
            assert_eq!(picked.get(row), Some(want));
        }
    }

    /// Each non-null constructor reports the expected `DataType`.
    #[test]
    fn all_data_types_construct() {
        assert_eq!(
            Column::from_bools("b", vec![true, false]).dtype(),
            DataType::Bool
        );
        assert_eq!(Column::from_i64s("i", vec![1, 2]).dtype(), DataType::Int64);
        assert_eq!(Column::from_u64s("u", vec![1, 2]).dtype(), DataType::UInt64);
        assert_eq!(
            Column::from_f64s("f", vec![1.0, 2.0]).dtype(),
            DataType::Float64
        );
        assert_eq!(
            Column::from_strings("s", vec!["a".into(), "b".into()]).dtype(),
            DataType::Utf8
        );
    }
}