1use crate::Column;
7use std::collections::BTreeSet;
8
9#[derive(Debug, Clone, Copy, PartialEq, Eq)]
11pub enum CompOp {
12 Gt,
13 Lt,
14 Ge,
15 Le,
16 Eq,
17}
18
19#[derive(Debug, Clone)]
21pub struct ColumnStats {
22 pub min_f64: Option<f64>,
24 pub max_f64: Option<f64>,
26 pub min_i64: Option<i64>,
28 pub max_i64: Option<i64>,
30 pub sorted_asc: bool,
32 pub sorted_desc: bool,
34 pub n_distinct: Option<usize>,
36 pub n_null: usize,
38 pub nrows: usize,
40}
41
42impl ColumnStats {
43 pub fn compute(col: &Column) -> Self {
45 match col {
46 Column::Float(v) => Self::compute_float(v),
47 Column::Int(v) => Self::compute_int(v),
48 Column::Str(v) => Self::compute_str(v),
49 Column::Bool(v) => Self::compute_bool(v),
50 Column::Categorical { codes, levels } => Self::compute_categorical(codes, levels),
51 Column::DateTime(v) => Self::compute_datetime(v),
52 }
53 }
54
55 fn compute_float(v: &[f64]) -> Self {
56 if v.is_empty() {
57 return Self::empty();
58 }
59
60 let mut min = f64::INFINITY;
61 let mut max = f64::NEG_INFINITY;
62 let mut n_null = 0usize;
63 let mut sorted_asc = true;
64 let mut sorted_desc = true;
65 let mut prev: Option<f64> = None;
66
67 for &val in v {
68 if val.is_nan() {
69 n_null += 1;
70 if prev.is_some() {
72 sorted_asc = false;
73 sorted_desc = false;
74 }
75 } else {
76 if val < min {
77 min = val;
78 }
79 if val > max {
80 max = val;
81 }
82 if let Some(p) = prev {
83 if !p.is_nan() {
84 if val < p {
85 sorted_asc = false;
86 }
87 if val > p {
88 sorted_desc = false;
89 }
90 }
91 }
92 }
93 prev = Some(val);
94 }
95
96 ColumnStats {
97 min_f64: if min.is_finite() { Some(min) } else { None },
98 max_f64: if max.is_finite() { Some(max) } else { None },
99 min_i64: None,
100 max_i64: None,
101 sorted_asc,
102 sorted_desc,
103 n_distinct: None,
104 n_null,
105 nrows: v.len(),
106 }
107 }
108
109 fn compute_int(v: &[i64]) -> Self {
110 if v.is_empty() {
111 return Self::empty();
112 }
113
114 let mut min = i64::MAX;
115 let mut max = i64::MIN;
116 let mut sorted_asc = true;
117 let mut sorted_desc = true;
118
119 for i in 0..v.len() {
120 let val = v[i];
121 if val < min {
122 min = val;
123 }
124 if val > max {
125 max = val;
126 }
127 if i > 0 {
128 if val < v[i - 1] {
129 sorted_asc = false;
130 }
131 if val > v[i - 1] {
132 sorted_desc = false;
133 }
134 }
135 }
136
137 ColumnStats {
138 min_f64: None,
139 max_f64: None,
140 min_i64: Some(min),
141 max_i64: Some(max),
142 sorted_asc,
143 sorted_desc,
144 n_distinct: None,
145 n_null: 0,
146 nrows: v.len(),
147 }
148 }
149
150 fn compute_str(v: &[String]) -> Self {
151 if v.is_empty() {
152 return Self::empty();
153 }
154
155 let mut sorted_asc = true;
156 let mut sorted_desc = true;
157 let mut distinct = BTreeSet::new();
159
160 for i in 0..v.len() {
161 distinct.insert(v[i].as_str());
162 if i > 0 {
163 if v[i] < v[i - 1] {
164 sorted_asc = false;
165 }
166 if v[i] > v[i - 1] {
167 sorted_desc = false;
168 }
169 }
170 }
171
172 ColumnStats {
173 min_f64: None,
174 max_f64: None,
175 min_i64: None,
176 max_i64: None,
177 sorted_asc,
178 sorted_desc,
179 n_distinct: Some(distinct.len()),
180 n_null: 0,
181 nrows: v.len(),
182 }
183 }
184
185 fn compute_bool(v: &[bool]) -> Self {
186 if v.is_empty() {
187 return Self::empty();
188 }
189
190 let mut sorted_asc = true;
191 let mut sorted_desc = true;
192 let mut has_true = false;
193 let mut has_false = false;
194
195 for i in 0..v.len() {
196 if v[i] {
197 has_true = true;
198 } else {
199 has_false = true;
200 }
201 if i > 0 {
202 if v[i] < v[i - 1] {
204 sorted_asc = false;
205 }
206 if v[i] > v[i - 1] {
207 sorted_desc = false;
208 }
209 }
210 }
211
212 let n_distinct = match (has_true, has_false) {
213 (true, true) => 2,
214 (true, false) | (false, true) => 1,
215 _ => 0,
216 };
217
218 ColumnStats {
219 min_f64: None,
220 max_f64: None,
221 min_i64: None,
222 max_i64: None,
223 sorted_asc,
224 sorted_desc,
225 n_distinct: Some(n_distinct),
226 n_null: 0,
227 nrows: v.len(),
228 }
229 }
230
231 fn compute_categorical(codes: &[u32], levels: &[String]) -> Self {
232 if codes.is_empty() {
233 return Self::empty();
234 }
235
236 let mut min = u32::MAX;
237 let mut max = u32::MIN;
238 let mut sorted_asc = true;
239 let mut sorted_desc = true;
240 let mut distinct = BTreeSet::new();
241
242 for i in 0..codes.len() {
243 let c = codes[i];
244 if c < min {
245 min = c;
246 }
247 if c > max {
248 max = c;
249 }
250 distinct.insert(c);
251 if i > 0 {
252 if c < codes[i - 1] {
253 sorted_asc = false;
254 }
255 if c > codes[i - 1] {
256 sorted_desc = false;
257 }
258 }
259 }
260
261 ColumnStats {
262 min_f64: None,
263 max_f64: None,
264 min_i64: Some(min as i64),
265 max_i64: Some(max as i64),
266 sorted_asc,
267 sorted_desc,
268 n_distinct: Some(distinct.len().min(levels.len())),
269 n_null: 0,
270 nrows: codes.len(),
271 }
272 }
273
274 fn compute_datetime(v: &[i64]) -> Self {
275 Self::compute_int(v)
277 }
278
279 fn empty() -> Self {
280 ColumnStats {
281 min_f64: None,
282 max_f64: None,
283 min_i64: None,
284 max_i64: None,
285 sorted_asc: true, sorted_desc: true, n_distinct: Some(0),
288 n_null: 0,
289 nrows: 0,
290 }
291 }
292
293 pub fn can_skip_gt(&self, threshold: f64) -> bool {
297 self.max_f64.map_or(false, |max| max <= threshold)
298 }
299
300 pub fn can_skip_lt(&self, threshold: f64) -> bool {
302 self.min_f64.map_or(false, |min| min >= threshold)
303 }
304
305 pub fn can_skip_ge(&self, threshold: f64) -> bool {
307 self.max_f64.map_or(false, |max| max < threshold)
308 }
309
310 pub fn can_skip_le(&self, threshold: f64) -> bool {
312 self.min_f64.map_or(false, |min| min > threshold)
313 }
314
315 pub fn can_skip_eq_f64(&self, value: f64) -> bool {
317 match (self.min_f64, self.max_f64) {
318 (Some(min), Some(max)) => value < min || value > max,
319 _ => false,
320 }
321 }
322
323 pub fn can_skip_gt_i64(&self, threshold: i64) -> bool {
325 self.max_i64.map_or(false, |max| max <= threshold)
326 }
327
328 pub fn can_skip_lt_i64(&self, threshold: i64) -> bool {
330 self.min_i64.map_or(false, |min| min >= threshold)
331 }
332
333 pub fn can_skip_eq_i64(&self, value: i64) -> bool {
335 match (self.min_i64, self.max_i64) {
336 (Some(min), Some(max)) => value < min || value > max,
337 _ => false,
338 }
339 }
340
341 pub fn is_sorted(&self) -> bool {
345 self.sorted_asc || self.sorted_desc
346 }
347
348 pub fn binary_search_range_f64(col: &[f64], op: CompOp, threshold: f64) -> (usize, usize) {
354 match op {
355 CompOp::Gt => {
356 let start = col.partition_point(|&v| v <= threshold);
357 (start, col.len())
358 }
359 CompOp::Ge => {
360 let start = col.partition_point(|&v| v < threshold);
361 (start, col.len())
362 }
363 CompOp::Lt => {
364 let end = col.partition_point(|&v| v < threshold);
365 (0, end)
366 }
367 CompOp::Le => {
368 let end = col.partition_point(|&v| v <= threshold);
369 (0, end)
370 }
371 CompOp::Eq => {
372 let start = col.partition_point(|&v| v < threshold);
373 let end = col.partition_point(|&v| v <= threshold);
374 (start, end)
375 }
376 }
377 }
378
379 pub fn binary_search_range_i64(col: &[i64], op: CompOp, threshold: i64) -> (usize, usize) {
382 match op {
383 CompOp::Gt => {
384 let start = col.partition_point(|&v| v <= threshold);
385 (start, col.len())
386 }
387 CompOp::Ge => {
388 let start = col.partition_point(|&v| v < threshold);
389 (start, col.len())
390 }
391 CompOp::Lt => {
392 let end = col.partition_point(|&v| v < threshold);
393 (0, end)
394 }
395 CompOp::Le => {
396 let end = col.partition_point(|&v| v <= threshold);
397 (0, end)
398 }
399 CompOp::Eq => {
400 let start = col.partition_point(|&v| v < threshold);
401 let end = col.partition_point(|&v| v <= threshold);
402 (start, end)
403 }
404 }
405 }
406}
407
408#[derive(Debug, Clone)]
410pub struct DataFrameStats {
411 pub column_stats: Vec<(String, ColumnStats)>,
413}
414
415impl DataFrameStats {
416 pub fn compute(df: &crate::DataFrame) -> Self {
418 let column_stats = df
419 .columns
420 .iter()
421 .map(|(name, col)| (name.clone(), ColumnStats::compute(col)))
422 .collect();
423 DataFrameStats { column_stats }
424 }
425
426 pub fn get(&self, name: &str) -> Option<&ColumnStats> {
428 self.column_stats
429 .iter()
430 .find(|(n, _)| n == name)
431 .map(|(_, s)| s)
432 }
433}
434
435#[cfg(test)]
438mod tests {
439 use super::*;
440 use crate::{Column, DataFrame};
441
442 #[test]
444 fn test_float_min_max() {
445 let col = Column::Float(vec![3.0, 1.0, 4.0, 1.5, 9.2]);
446 let stats = ColumnStats::compute(&col);
447 assert_eq!(stats.min_f64, Some(1.0));
448 assert_eq!(stats.max_f64, Some(9.2));
449 assert_eq!(stats.nrows, 5);
450 assert_eq!(stats.n_null, 0);
451 }
452
453 #[test]
455 fn test_int_min_max() {
456 let col = Column::Int(vec![10, -3, 42, 0, 7]);
457 let stats = ColumnStats::compute(&col);
458 assert_eq!(stats.min_i64, Some(-3));
459 assert_eq!(stats.max_i64, Some(42));
460 assert_eq!(stats.nrows, 5);
461 }
462
463 #[test]
465 fn test_sorted_ascending() {
466 let col = Column::Float(vec![1.0, 2.0, 3.0, 4.0, 5.0]);
467 let stats = ColumnStats::compute(&col);
468 assert!(stats.sorted_asc);
469 assert!(!stats.sorted_desc);
470 assert!(stats.is_sorted());
471 }
472
473 #[test]
475 fn test_sorted_descending() {
476 let col = Column::Int(vec![50, 40, 30, 20, 10]);
477 let stats = ColumnStats::compute(&col);
478 assert!(!stats.sorted_asc);
479 assert!(stats.sorted_desc);
480 assert!(stats.is_sorted());
481 }
482
483 #[test]
485 fn test_unsorted() {
486 let col = Column::Float(vec![3.0, 1.0, 4.0, 1.5, 9.2]);
487 let stats = ColumnStats::compute(&col);
488 assert!(!stats.sorted_asc);
489 assert!(!stats.sorted_desc);
490 assert!(!stats.is_sorted());
491 }
492
493 #[test]
495 fn test_can_skip_gt() {
496 let col = Column::Float(vec![1.0, 2.0, 3.0]);
497 let stats = ColumnStats::compute(&col);
498 assert!(stats.can_skip_gt(5.0));
500 assert!(stats.can_skip_gt(3.0));
502 assert!(!stats.can_skip_gt(2.0));
504 }
505
506 #[test]
508 fn test_can_skip_lt() {
509 let col = Column::Float(vec![5.0, 6.0, 7.0]);
510 let stats = ColumnStats::compute(&col);
511 assert!(stats.can_skip_lt(3.0));
513 assert!(stats.can_skip_lt(5.0));
515 assert!(!stats.can_skip_lt(6.0));
517 }
518
519 #[test]
521 fn test_can_skip_ge() {
522 let col = Column::Float(vec![1.0, 2.0, 3.0]);
523 let stats = ColumnStats::compute(&col);
524 assert!(stats.can_skip_ge(4.0));
526 assert!(!stats.can_skip_ge(3.0));
528 }
529
530 #[test]
532 fn test_can_skip_le() {
533 let col = Column::Float(vec![5.0, 6.0, 7.0]);
534 let stats = ColumnStats::compute(&col);
535 assert!(stats.can_skip_le(4.0));
537 assert!(!stats.can_skip_le(5.0));
539 }
540
541 #[test]
543 fn test_can_skip_eq() {
544 let col = Column::Float(vec![10.0, 20.0, 30.0]);
545 let stats = ColumnStats::compute(&col);
546 assert!(stats.can_skip_eq_f64(5.0));
548 assert!(stats.can_skip_eq_f64(35.0));
549 assert!(!stats.can_skip_eq_f64(15.0));
551 assert!(!stats.can_skip_eq_f64(10.0));
552 assert!(!stats.can_skip_eq_f64(30.0));
553 }
554
555 #[test]
557 fn test_binary_search_range_f64() {
558 let data = vec![1.0, 2.0, 3.0, 4.0, 5.0];
559
560 assert_eq!(
562 ColumnStats::binary_search_range_f64(&data, CompOp::Gt, 3.0),
563 (3, 5)
564 );
565 assert_eq!(
567 ColumnStats::binary_search_range_f64(&data, CompOp::Ge, 3.0),
568 (2, 5)
569 );
570 assert_eq!(
572 ColumnStats::binary_search_range_f64(&data, CompOp::Lt, 3.0),
573 (0, 2)
574 );
575 assert_eq!(
577 ColumnStats::binary_search_range_f64(&data, CompOp::Le, 3.0),
578 (0, 3)
579 );
580 assert_eq!(
582 ColumnStats::binary_search_range_f64(&data, CompOp::Eq, 3.0),
583 (2, 3)
584 );
585 assert_eq!(
587 ColumnStats::binary_search_range_f64(&data, CompOp::Eq, 2.5),
588 (2, 2)
589 );
590 }
591
592 #[test]
594 fn test_binary_search_range_i64() {
595 let data = vec![10, 20, 30, 40, 50];
596
597 assert_eq!(
598 ColumnStats::binary_search_range_i64(&data, CompOp::Gt, 30),
599 (3, 5)
600 );
601 assert_eq!(
602 ColumnStats::binary_search_range_i64(&data, CompOp::Eq, 30),
603 (2, 3)
604 );
605 assert_eq!(
606 ColumnStats::binary_search_range_i64(&data, CompOp::Lt, 10),
607 (0, 0)
608 );
609 }
610
611 #[test]
613 fn test_nan_handling() {
614 let col = Column::Float(vec![f64::NAN, 2.0, f64::NAN, 5.0, 1.0]);
615 let stats = ColumnStats::compute(&col);
616 assert_eq!(stats.min_f64, Some(1.0));
617 assert_eq!(stats.max_f64, Some(5.0));
618 assert_eq!(stats.n_null, 2);
619 assert_eq!(stats.nrows, 5);
620 }
621
622 #[test]
624 fn test_all_nan_column() {
625 let col = Column::Float(vec![f64::NAN, f64::NAN]);
626 let stats = ColumnStats::compute(&col);
627 assert_eq!(stats.min_f64, None);
628 assert_eq!(stats.max_f64, None);
629 assert_eq!(stats.n_null, 2);
630 assert!(!stats.can_skip_gt(0.0));
632 }
633
634 #[test]
636 fn test_empty_column() {
637 let col = Column::Float(vec![]);
638 let stats = ColumnStats::compute(&col);
639 assert_eq!(stats.min_f64, None);
640 assert_eq!(stats.max_f64, None);
641 assert_eq!(stats.nrows, 0);
642 assert!(stats.sorted_asc);
643 assert!(stats.sorted_desc);
644 assert!(stats.is_sorted());
645 assert_eq!(stats.n_distinct, Some(0));
646 }
647
648 #[test]
650 fn test_empty_int_column() {
651 let col = Column::Int(vec![]);
652 let stats = ColumnStats::compute(&col);
653 assert_eq!(stats.min_i64, None);
654 assert_eq!(stats.max_i64, None);
655 assert_eq!(stats.nrows, 0);
656 assert!(stats.is_sorted());
657 }
658
659 #[test]
661 fn test_str_n_distinct() {
662 let col = Column::Str(vec![
663 "apple".into(),
664 "banana".into(),
665 "apple".into(),
666 "cherry".into(),
667 ]);
668 let stats = ColumnStats::compute(&col);
669 assert_eq!(stats.n_distinct, Some(3));
670 assert_eq!(stats.nrows, 4);
671 }
672
673 #[test]
675 fn test_sorted_str() {
676 let col = Column::Str(vec!["a".into(), "b".into(), "c".into()]);
677 let stats = ColumnStats::compute(&col);
678 assert!(stats.sorted_asc);
679 assert!(!stats.sorted_desc);
680 }
681
682 #[test]
684 fn test_dataframe_stats() {
685 let mut df = DataFrame::new();
686 df.columns
687 .push(("age".into(), Column::Int(vec![25, 30, 45])));
688 df.columns
689 .push(("score".into(), Column::Float(vec![88.5, 92.0, 76.3])));
690
691 let stats = DataFrameStats::compute(&df);
692 assert_eq!(stats.column_stats.len(), 2);
693
694 let age_stats = stats.get("age").unwrap();
695 assert_eq!(age_stats.min_i64, Some(25));
696 assert_eq!(age_stats.max_i64, Some(45));
697
698 let score_stats = stats.get("score").unwrap();
699 assert_eq!(score_stats.min_f64, Some(76.3));
700 assert_eq!(score_stats.max_f64, Some(92.0));
701
702 assert!(stats.get("nonexistent").is_none());
704 }
705
706 #[test]
708 fn test_zone_map_can_skip() {
709 let col = Column::Float(vec![100.0, 200.0, 300.0]);
711 let stats = ColumnStats::compute(&col);
712
713 assert!(stats.can_skip_gt(500.0));
715 assert!(stats.can_skip_lt(50.0));
717 assert!(stats.can_skip_eq_f64(400.0));
719 assert!(stats.can_skip_eq_f64(50.0));
721 }
722
723 #[test]
725 fn test_zone_map_cannot_skip() {
726 let col = Column::Float(vec![100.0, 200.0, 300.0]);
727 let stats = ColumnStats::compute(&col);
728
729 assert!(!stats.can_skip_gt(150.0));
731 assert!(!stats.can_skip_lt(250.0));
733 assert!(!stats.can_skip_eq_f64(200.0));
735 }
736
737 #[test]
739 fn test_determinism() {
740 let data = vec![3.14, 2.71, 1.41, 1.73, 0.577];
741 for _ in 0..3 {
742 let col = Column::Float(data.clone());
743 let stats = ColumnStats::compute(&col);
744 assert_eq!(stats.min_f64, Some(0.577));
745 assert_eq!(stats.max_f64, Some(3.14));
746 assert!(!stats.sorted_asc);
747 assert!(!stats.sorted_desc);
748 assert_eq!(stats.n_null, 0);
749 assert_eq!(stats.nrows, 5);
750 }
751 }
752
753 #[test]
755 fn test_determinism_int_str() {
756 for _ in 0..3 {
757 let icol = Column::Int(vec![9, 1, 5, 3, 7]);
758 let is = ColumnStats::compute(&icol);
759 assert_eq!(is.min_i64, Some(1));
760 assert_eq!(is.max_i64, Some(9));
761
762 let scol = Column::Str(vec!["x".into(), "a".into(), "m".into()]);
763 let ss = ColumnStats::compute(&scol);
764 assert_eq!(ss.n_distinct, Some(3));
765 }
766 }
767
768 #[test]
770 fn test_bool_column() {
771 let col = Column::Bool(vec![true, false, true, true]);
772 let stats = ColumnStats::compute(&col);
773 assert_eq!(stats.n_distinct, Some(2));
774 assert_eq!(stats.nrows, 4);
775 assert!(!stats.sorted_asc);
776 assert!(!stats.sorted_desc);
777 }
778
779 #[test]
781 fn test_categorical_column() {
782 let col = Column::Categorical {
783 levels: vec!["low".into(), "med".into(), "high".into()],
784 codes: vec![0, 1, 2, 1, 0],
785 };
786 let stats = ColumnStats::compute(&col);
787 assert_eq!(stats.min_i64, Some(0));
788 assert_eq!(stats.max_i64, Some(2));
789 assert_eq!(stats.n_distinct, Some(3));
790 assert!(!stats.is_sorted());
791 }
792
793 #[test]
795 fn test_single_element() {
796 let col = Column::Float(vec![42.0]);
797 let stats = ColumnStats::compute(&col);
798 assert_eq!(stats.min_f64, Some(42.0));
799 assert_eq!(stats.max_f64, Some(42.0));
800 assert!(stats.sorted_asc);
801 assert!(stats.sorted_desc);
802 assert!(stats.is_sorted());
803 }
804
805 #[test]
807 fn test_int_zone_map_skip() {
808 let col = Column::Int(vec![10, 20, 30]);
809 let stats = ColumnStats::compute(&col);
810 assert!(stats.can_skip_gt_i64(30));
811 assert!(stats.can_skip_gt_i64(50));
812 assert!(!stats.can_skip_gt_i64(15));
813 assert!(stats.can_skip_lt_i64(5));
814 assert!(stats.can_skip_lt_i64(10));
815 assert!(!stats.can_skip_lt_i64(20));
816 assert!(stats.can_skip_eq_i64(5));
817 assert!(stats.can_skip_eq_i64(35));
818 assert!(!stats.can_skip_eq_i64(20));
819 }
820
821 #[test]
823 fn test_constant_column() {
824 let col = Column::Float(vec![7.0, 7.0, 7.0]);
825 let stats = ColumnStats::compute(&col);
826 assert_eq!(stats.min_f64, Some(7.0));
827 assert_eq!(stats.max_f64, Some(7.0));
828 assert!(stats.sorted_asc);
830 assert!(stats.sorted_desc);
831 assert!(stats.can_skip_eq_f64(8.0));
832 assert!(!stats.can_skip_eq_f64(7.0));
833 }
834}