1use crate::error::InsightError;
30
/// Bit-packed validity mask: one bit per row, `1` = value present, `0` = null.
///
/// Row `i` is stored in word `i / 64` at bit position `i % 64`. As long as
/// callers respect the index preconditions of the mutators, the unused high
/// bits of the last word stay cleared; the counting methods and
/// [`ValidIndicesIter`] rely on that.
#[derive(Debug, Clone, PartialEq)]
pub struct ValidityBitmap {
    bits: Vec<u64>,
    len: usize,
}

impl ValidityBitmap {
    /// Number of rows tracked by one storage word.
    const WORD_BITS: usize = u64::BITS as usize;

    /// Creates a bitmap of `len` rows with every row marked valid.
    pub fn all_valid(len: usize) -> Self {
        let mut bits = vec![u64::MAX; len.div_ceil(Self::WORD_BITS)];
        let trailing = len % Self::WORD_BITS;
        if trailing != 0 {
            // Only the low `trailing` bits of the last word map to real rows;
            // keep the padding bits at zero so popcounts stay exact.
            if let Some(last) = bits.last_mut() {
                *last = (1u64 << trailing) - 1;
            }
        }
        Self { bits, len }
    }

    /// Creates a bitmap of `len` rows with every row marked null.
    pub fn all_invalid(len: usize) -> Self {
        Self {
            bits: vec![0u64; len.div_ceil(Self::WORD_BITS)],
            len,
        }
    }

    /// Creates a bitmap with zero rows; grow it with [`push`](Self::push).
    pub fn empty() -> Self {
        Self {
            bits: Vec::new(),
            len: 0,
        }
    }

    /// Returns `true` when row `idx` is marked valid.
    ///
    /// # Panics
    ///
    /// Debug builds assert `idx < len`. In every build, an `idx` past the
    /// allocated words panics on the slice index.
    #[inline]
    pub fn is_valid(&self, idx: usize) -> bool {
        debug_assert!(
            idx < self.len,
            "index {idx} out of bounds (len={})",
            self.len
        );
        let (word, bit) = (idx / Self::WORD_BITS, idx % Self::WORD_BITS);
        (self.bits[word] >> bit) & 1 == 1
    }

    /// Marks row `idx` as valid.
    ///
    /// # Panics
    ///
    /// Debug builds assert `idx < len`. In every build, an `idx` past the
    /// allocated words panics on the slice index.
    #[inline]
    pub fn set_valid(&mut self, idx: usize) {
        debug_assert!(
            idx < self.len,
            "index {idx} out of bounds (len={})",
            self.len
        );
        let (word, bit) = (idx / Self::WORD_BITS, idx % Self::WORD_BITS);
        self.bits[word] |= 1u64 << bit;
    }

    /// Marks row `idx` as null.
    ///
    /// # Panics
    ///
    /// Debug builds assert `idx < len`. In every build, an `idx` past the
    /// allocated words panics on the slice index.
    #[inline]
    pub fn set_invalid(&mut self, idx: usize) {
        debug_assert!(
            idx < self.len,
            "index {idx} out of bounds (len={})",
            self.len
        );
        let (word, bit) = (idx / Self::WORD_BITS, idx % Self::WORD_BITS);
        self.bits[word] &= !(1u64 << bit);
    }

    /// Appends one row with the given validity, allocating a new word when
    /// the current one is full.
    pub fn push(&mut self, valid: bool) {
        let idx = self.len;
        self.len += 1;
        let (word, bit) = (idx / Self::WORD_BITS, idx % Self::WORD_BITS);
        if word >= self.bits.len() {
            self.bits.push(0);
        }
        if valid {
            self.bits[word] |= 1u64 << bit;
        }
    }

    /// Number of rows tracked by the bitmap.
    #[inline]
    pub fn len(&self) -> usize {
        self.len
    }

    /// Returns `true` when the bitmap tracks zero rows.
    #[inline]
    pub fn is_empty(&self) -> bool {
        self.len == 0
    }

    /// Number of rows marked null (`len` minus the number of set bits).
    pub fn null_count(&self) -> usize {
        self.len - self.valid_count()
    }

    /// Number of rows marked valid, i.e. the number of set bits.
    pub fn valid_count(&self) -> usize {
        self.bits.iter().map(|w| w.count_ones() as usize).sum()
    }

    /// Returns `true` when at least one row is null.
    pub fn has_nulls(&self) -> bool {
        self.null_count() > 0
    }

    /// Iterates over the indices of the valid rows in ascending order.
    pub fn valid_indices(&self) -> ValidIndicesIter<'_> {
        ValidIndicesIter {
            bitmap: self,
            word_idx: 0,
            pending: self.bits.first().copied().unwrap_or(0),
        }
    }
}

/// Ascending iterator over the valid row indices of a [`ValidityBitmap`].
///
/// Works a word at a time: all-null words are skipped with a single
/// comparison and set bits are located with `trailing_zeros`.
pub struct ValidIndicesIter<'a> {
    bitmap: &'a ValidityBitmap,
    /// Index of the word currently being drained.
    word_idx: usize,
    /// Set bits of `bitmap.bits[word_idx]` that have not been yielded yet.
    pending: u64,
}

impl<'a> Iterator for ValidIndicesIter<'a> {
    type Item = usize;

    fn next(&mut self) -> Option<usize> {
        // Move on to the next word that still has an unvisited set bit;
        // `?` ends the iteration once the words are exhausted.
        while self.pending == 0 {
            let following = self.word_idx + 1;
            self.pending = *self.bitmap.bits.get(following)?;
            self.word_idx = following;
        }

        let bit = self.pending.trailing_zeros() as usize;
        // Clear the lowest set bit so the next call sees the following one.
        self.pending &= self.pending - 1;

        let idx = self.word_idx * ValidityBitmap::WORD_BITS + bit;
        if idx < self.bitmap.len {
            Some(idx)
        } else {
            // A set padding bit: indices only grow from here, so nothing
            // further can be in range. Park the cursor past the last word.
            self.pending = 0;
            self.word_idx = self.bitmap.bits.len();
            None
        }
    }

    /// Exact number of remaining indices, assuming padding bits are zero.
    fn size_hint(&self) -> (usize, Option<usize>) {
        let later: usize = self
            .bitmap
            .bits
            .iter()
            .skip(self.word_idx + 1)
            .map(|w| w.count_ones() as usize)
            .sum();
        let remaining = self.pending.count_ones() as usize + later;
        (remaining, Some(remaining))
    }
}

impl std::iter::FusedIterator for ValidIndicesIter<'_> {}
186
/// Logical type tag for a column, one variant per supported storage kind.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum DataType {
    Numeric,
    Boolean,
    Categorical,
    Text,
}

impl std::fmt::Display for DataType {
    /// Writes the variant name (`Numeric`, `Boolean`, `Categorical`, `Text`).
    ///
    /// Goes through [`std::fmt::Formatter::pad`] so width, fill, alignment
    /// and precision flags (e.g. `{:<12}`) are honoured; plain `{}` output is
    /// the bare variant name.
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        let name = match self {
            Self::Numeric => "Numeric",
            Self::Boolean => "Boolean",
            Self::Categorical => "Categorical",
            Self::Text => "Text",
        };
        f.pad(name)
    }
}
212
213#[derive(Debug, Clone, PartialEq)]
221pub enum Column {
222 Numeric {
224 values: Vec<f64>,
225 validity: ValidityBitmap,
226 },
227 Boolean {
229 values: Vec<bool>,
230 validity: ValidityBitmap,
231 },
232 Categorical {
238 dictionary: Vec<String>,
239 indices: Vec<u32>,
240 validity: ValidityBitmap,
241 },
242 Text {
244 values: Vec<String>,
245 validity: ValidityBitmap,
246 },
247}
248
249impl Column {
250 pub fn numeric(values: Vec<f64>, validity: ValidityBitmap) -> Self {
252 Self::Numeric { values, validity }
253 }
254
255 pub fn boolean(values: Vec<bool>, validity: ValidityBitmap) -> Self {
257 Self::Boolean { values, validity }
258 }
259
260 pub fn categorical(
262 dictionary: Vec<String>,
263 indices: Vec<u32>,
264 validity: ValidityBitmap,
265 ) -> Self {
266 Self::Categorical {
267 dictionary,
268 indices,
269 validity,
270 }
271 }
272
273 pub fn text(values: Vec<String>, validity: ValidityBitmap) -> Self {
275 Self::Text { values, validity }
276 }
277
278 pub fn data_type(&self) -> DataType {
280 match self {
281 Self::Numeric { .. } => DataType::Numeric,
282 Self::Boolean { .. } => DataType::Boolean,
283 Self::Categorical { .. } => DataType::Categorical,
284 Self::Text { .. } => DataType::Text,
285 }
286 }
287
288 pub fn len(&self) -> usize {
290 self.validity().len()
291 }
292
293 pub fn is_empty(&self) -> bool {
295 self.len() == 0
296 }
297
298 pub fn validity(&self) -> &ValidityBitmap {
300 match self {
301 Self::Numeric { validity, .. }
302 | Self::Boolean { validity, .. }
303 | Self::Categorical { validity, .. }
304 | Self::Text { validity, .. } => validity,
305 }
306 }
307
308 pub fn null_count(&self) -> usize {
310 self.validity().null_count()
311 }
312
313 pub fn valid_count(&self) -> usize {
315 self.validity().valid_count()
316 }
317
318 pub fn is_valid(&self, idx: usize) -> bool {
320 self.validity().is_valid(idx)
321 }
322
323 pub fn as_numeric(&self) -> Option<&[f64]> {
325 match self {
326 Self::Numeric { values, .. } => Some(values),
327 _ => None,
328 }
329 }
330
331 pub fn as_boolean(&self) -> Option<&[bool]> {
333 match self {
334 Self::Boolean { values, .. } => Some(values),
335 _ => None,
336 }
337 }
338
339 pub fn valid_numeric_values(&self) -> Option<Vec<f64>> {
341 match self {
342 Self::Numeric { values, validity } => {
343 let result: Vec<f64> = validity.valid_indices().map(|i| values[i]).collect();
344 Some(result)
345 }
346 _ => None,
347 }
348 }
349
350 pub fn category_at(&self, idx: usize) -> Option<&str> {
352 match self {
353 Self::Categorical {
354 dictionary,
355 indices,
356 validity,
357 } => {
358 if validity.is_valid(idx) {
359 dictionary.get(indices[idx] as usize).map(|s| s.as_str())
360 } else {
361 None
362 }
363 }
364 _ => None,
365 }
366 }
367
368 pub fn text_at(&self, idx: usize) -> Option<&str> {
370 match self {
371 Self::Text { values, validity } => {
372 if validity.is_valid(idx) {
373 Some(&values[idx])
374 } else {
375 None
376 }
377 }
378 _ => None,
379 }
380 }
381}
382
383#[derive(Debug, Clone)]
412pub struct DataFrame {
413 names: Vec<String>,
414 columns: Vec<Column>,
415 row_count: usize,
416}
417
418impl DataFrame {
419 pub fn new() -> Self {
421 Self {
422 names: Vec::new(),
423 columns: Vec::new(),
424 row_count: 0,
425 }
426 }
427
428 pub fn add_column(&mut self, name: String, column: Column) -> Result<(), InsightError> {
433 let col_len = column.len();
434 if self.columns.is_empty() {
435 self.row_count = col_len;
436 } else if col_len != self.row_count {
437 return Err(InsightError::DimensionMismatch {
438 expected: self.row_count,
439 actual: col_len,
440 });
441 }
442 self.names.push(name);
443 self.columns.push(column);
444 Ok(())
445 }
446
447 #[inline]
449 pub fn row_count(&self) -> usize {
450 self.row_count
451 }
452
453 #[inline]
455 pub fn column_count(&self) -> usize {
456 self.columns.len()
457 }
458
459 pub fn is_empty(&self) -> bool {
461 self.columns.is_empty()
462 }
463
464 pub fn column_names(&self) -> &[String] {
466 &self.names
467 }
468
469 pub fn column(&self, index: usize) -> Option<&Column> {
471 self.columns.get(index)
472 }
473
474 pub fn column_by_name(&self, name: &str) -> Option<&Column> {
476 self.column_index(name).map(|i| &self.columns[i])
477 }
478
479 pub fn column_index(&self, name: &str) -> Option<usize> {
481 self.names.iter().position(|n| n == name)
482 }
483
484 pub fn iter(&self) -> impl Iterator<Item = (&str, &Column)> {
486 self.names
487 .iter()
488 .map(|s| s.as_str())
489 .zip(self.columns.iter())
490 }
491
492 pub fn schema(&self) -> Vec<(&str, DataType)> {
494 self.names
495 .iter()
496 .zip(self.columns.iter())
497 .map(|(name, col)| (name.as_str(), col.data_type()))
498 .collect()
499 }
500
501 pub fn total_null_count(&self) -> usize {
503 self.columns.iter().map(|c| c.null_count()).sum()
504 }
505}
506
507impl Default for DataFrame {
508 fn default() -> Self {
509 Self::new()
510 }
511}
512
#[cfg(test)]
mod tests {
    use super::*;

    /// Builds a numeric column in which every row is valid.
    fn dense_numeric(values: Vec<f64>) -> Column {
        let validity = ValidityBitmap::all_valid(values.len());
        Column::numeric(values, validity)
    }

    /// Builds an all-valid bitmap of `len` rows and then nulls out `nulls`.
    fn bitmap_with_nulls(len: usize, nulls: &[usize]) -> ValidityBitmap {
        let mut bitmap = ValidityBitmap::all_valid(len);
        for &row in nulls {
            bitmap.set_invalid(row);
        }
        bitmap
    }

    // ── ValidityBitmap ──────────────────────────────────────────────

    #[test]
    fn bitmap_all_valid() {
        let bitmap = ValidityBitmap::all_valid(100);
        assert_eq!(bitmap.len(), 100);
        assert_eq!(bitmap.null_count(), 0);
        assert_eq!(bitmap.valid_count(), 100);
        assert!((0..100).all(|row| bitmap.is_valid(row)));
    }

    #[test]
    fn bitmap_all_invalid() {
        let bitmap = ValidityBitmap::all_invalid(100);
        assert_eq!(bitmap.null_count(), 100);
        assert_eq!(bitmap.valid_count(), 0);
        assert!(!(0..100).any(|row| bitmap.is_valid(row)));
    }

    #[test]
    fn bitmap_set_operations() {
        let mut bitmap = bitmap_with_nulls(10, &[3, 7]);
        assert_eq!(bitmap.null_count(), 2);
        for row in [3, 7] {
            assert!(!bitmap.is_valid(row));
        }
        for row in [0, 9] {
            assert!(bitmap.is_valid(row));
        }

        bitmap.set_valid(3);
        assert!(bitmap.is_valid(3));
        assert_eq!(bitmap.null_count(), 1);
    }

    #[test]
    fn bitmap_push() {
        let mut bitmap = ValidityBitmap::empty();
        for flag in [true, false, true] {
            bitmap.push(flag);
        }
        assert_eq!(bitmap.len(), 3);
        assert!(bitmap.is_valid(0));
        assert!(!bitmap.is_valid(1));
        assert!(bitmap.is_valid(2));
        assert_eq!(bitmap.null_count(), 1);
    }

    #[test]
    fn bitmap_boundary_64() {
        let exactly_one_word = ValidityBitmap::all_valid(64);
        assert_eq!(exactly_one_word.bits.len(), 1);
        assert_eq!(exactly_one_word.null_count(), 0);

        let spills_over = ValidityBitmap::all_valid(65);
        assert_eq!(spills_over.bits.len(), 2);
        assert_eq!(spills_over.null_count(), 0);
        assert!(spills_over.is_valid(64));
    }

    #[test]
    fn bitmap_push_across_word_boundary() {
        let mut bitmap = ValidityBitmap::empty();
        // Every third row (0, 3, 6, …) is null.
        (0..128).for_each(|i| bitmap.push(i % 3 != 0));
        assert_eq!(bitmap.len(), 128);
        let expected_nulls = (0..128).step_by(3).count();
        assert_eq!(bitmap.null_count(), expected_nulls);
    }

    #[test]
    fn bitmap_valid_indices() {
        let bitmap = bitmap_with_nulls(5, &[1, 3]);
        let indices = bitmap.valid_indices().collect::<Vec<usize>>();
        assert_eq!(indices, vec![0, 2, 4]);
    }

    // ── Column ──────────────────────────────────────────────────────

    #[test]
    fn numeric_column_basics() {
        let col = dense_numeric(vec![1.0, 2.0, 3.0]);
        assert_eq!(col.data_type(), DataType::Numeric);
        assert_eq!(col.len(), 3);
        assert_eq!(col.null_count(), 0);
        assert_eq!(col.as_numeric(), Some(&[1.0, 2.0, 3.0][..]));
    }

    #[test]
    fn numeric_column_with_nulls() {
        let col = Column::numeric(vec![1.0, 0.0, 3.0, 0.0], bitmap_with_nulls(4, &[1, 3]));
        assert_eq!(col.null_count(), 2);
        assert_eq!(col.valid_count(), 2);
        assert!(col.is_valid(0));
        assert!(!col.is_valid(1));
        let valid = col.valid_numeric_values().expect("numeric column");
        assert_eq!(valid, vec![1.0, 3.0]);
    }

    #[test]
    fn boolean_column() {
        let flags = vec![true, false, true];
        let col = Column::boolean(flags, ValidityBitmap::all_valid(3));
        assert_eq!(col.data_type(), DataType::Boolean);
        assert_eq!(col.as_boolean(), Some(&[true, false, true][..]));
    }

    #[test]
    fn categorical_column() {
        let dictionary: Vec<String> = ["low", "med", "high"]
            .iter()
            .map(|label| label.to_string())
            .collect();
        let col = Column::categorical(
            dictionary,
            vec![0, 1, 2, 1, 0],
            ValidityBitmap::all_valid(5),
        );
        assert_eq!(col.data_type(), DataType::Categorical);
        let expected = ["low", "med", "high", "med"];
        for (row, label) in expected.iter().enumerate() {
            assert_eq!(col.category_at(row), Some(*label));
        }
    }

    #[test]
    fn categorical_column_with_null() {
        let dictionary = vec![String::from("a"), String::from("b")];
        let col = Column::categorical(dictionary, vec![0, 0, 1], bitmap_with_nulls(3, &[1]));
        assert_eq!(col.category_at(0), Some("a"));
        assert_eq!(col.category_at(1), None);
        assert_eq!(col.category_at(2), Some("b"));
    }

    #[test]
    fn text_column() {
        let words = vec![String::from("hello"), String::from("world")];
        let col = Column::text(words, ValidityBitmap::all_valid(2));
        assert_eq!(col.data_type(), DataType::Text);
        assert_eq!(col.text_at(0), Some("hello"));
        assert_eq!(col.text_at(1), Some("world"));
    }

    #[test]
    fn text_column_with_null() {
        let col = Column::text(
            vec![String::new(), String::from("world")],
            bitmap_with_nulls(2, &[0]),
        );
        assert_eq!(col.text_at(0), None);
        assert_eq!(col.text_at(1), Some("world"));
    }

    // ── DataFrame ───────────────────────────────────────────────────

    #[test]
    fn empty_dataframe() {
        let frame = DataFrame::new();
        assert_eq!(frame.row_count(), 0);
        assert_eq!(frame.column_count(), 0);
        assert!(frame.is_empty());
    }

    #[test]
    fn add_columns() {
        let mut frame = DataFrame::new();
        frame
            .add_column(String::from("x"), dense_numeric(vec![1.0, 2.0, 3.0]))
            .expect("first column");
        frame
            .add_column(String::from("y"), dense_numeric(vec![4.0, 5.0, 6.0]))
            .expect("second column");

        assert_eq!(frame.row_count(), 3);
        assert_eq!(frame.column_count(), 2);
        assert_eq!(frame.column_names(), &["x", "y"]);
    }

    #[test]
    fn column_length_mismatch() {
        let mut frame = DataFrame::new();
        frame
            .add_column(String::from("x"), dense_numeric(vec![1.0, 2.0]))
            .unwrap();

        let outcome = frame.add_column(String::from("y"), dense_numeric(vec![1.0, 2.0, 3.0]));
        assert!(outcome.is_err());
    }

    #[test]
    fn column_by_name_lookup() {
        let mut frame = DataFrame::new();
        frame
            .add_column(String::from("temp"), dense_numeric(vec![20.5, 21.3]))
            .unwrap();

        let found = frame.column_by_name("temp").expect("found");
        assert_eq!(found.data_type(), DataType::Numeric);

        assert!(frame.column_by_name("missing").is_none());
    }

    #[test]
    fn dataframe_schema() {
        let mut frame = DataFrame::new();
        frame
            .add_column(String::from("x"), dense_numeric(vec![1.0]))
            .unwrap();
        frame
            .add_column(
                String::from("ok"),
                Column::boolean(vec![true], ValidityBitmap::all_valid(1)),
            )
            .unwrap();
        frame
            .add_column(
                String::from("label"),
                Column::text(vec![String::from("a")], ValidityBitmap::all_valid(1)),
            )
            .unwrap();

        let schema = frame.schema();
        assert_eq!(schema[0], ("x", DataType::Numeric));
        assert_eq!(schema[1], ("ok", DataType::Boolean));
        assert_eq!(schema[2], ("label", DataType::Text));
    }

    #[test]
    fn total_null_count() {
        let mut frame = DataFrame::new();
        let first = Column::numeric(vec![1.0, 0.0, 3.0], bitmap_with_nulls(3, &[1]));
        let second = Column::numeric(vec![0.0, 5.0, 0.0], bitmap_with_nulls(3, &[0, 2]));
        frame.add_column(String::from("a"), first).unwrap();
        frame.add_column(String::from("b"), second).unwrap();
        assert_eq!(frame.total_null_count(), 3);
    }

    #[test]
    fn dataframe_iter() {
        let mut frame = DataFrame::new();
        frame
            .add_column(String::from("x"), dense_numeric(vec![1.0]))
            .unwrap();
        frame
            .add_column(String::from("y"), dense_numeric(vec![2.0]))
            .unwrap();

        let mut pairs: Vec<(&str, DataType)> = Vec::new();
        for (name, column) in frame.iter() {
            pairs.push((name, column.data_type()));
        }
        assert_eq!(
            pairs,
            vec![("x", DataType::Numeric), ("y", DataType::Numeric)]
        );
    }
}