1use crate::Column;
7use std::collections::BTreeSet;
8
9#[derive(Debug, Clone, Copy, PartialEq, Eq)]
11pub enum CompOp {
12 Gt,
13 Lt,
14 Ge,
15 Le,
16 Eq,
17}
18
19#[derive(Debug, Clone)]
21pub struct ColumnStats {
22 pub min_f64: Option<f64>,
24 pub max_f64: Option<f64>,
26 pub min_i64: Option<i64>,
28 pub max_i64: Option<i64>,
30 pub sorted_asc: bool,
32 pub sorted_desc: bool,
34 pub n_distinct: Option<usize>,
36 pub n_null: usize,
38 pub nrows: usize,
40}
41
42impl ColumnStats {
43 pub fn compute(col: &Column) -> Self {
45 if matches!(col, Column::CategoricalAdaptive(_)) {
47 return Self::compute(&col.to_legacy_categorical());
48 }
49 match col {
50 Column::Float(v) => Self::compute_float(v),
51 Column::Int(v) => Self::compute_int(v),
52 Column::Str(v) => Self::compute_str(v),
53 Column::Bool(v) => Self::compute_bool(v),
54 Column::Categorical { codes, levels } => Self::compute_categorical(codes, levels),
55 Column::DateTime(v) => Self::compute_datetime(v),
56 Column::CategoricalAdaptive(_) => unreachable!("handled by early return"),
57 }
58 }
59
60 fn compute_float(v: &[f64]) -> Self {
61 if v.is_empty() {
62 return Self::empty();
63 }
64
65 let mut min = f64::INFINITY;
66 let mut max = f64::NEG_INFINITY;
67 let mut n_null = 0usize;
68 let mut sorted_asc = true;
69 let mut sorted_desc = true;
70 let mut prev: Option<f64> = None;
71
72 for &val in v {
73 if val.is_nan() {
74 n_null += 1;
75 if prev.is_some() {
77 sorted_asc = false;
78 sorted_desc = false;
79 }
80 } else {
81 if val < min {
82 min = val;
83 }
84 if val > max {
85 max = val;
86 }
87 if let Some(p) = prev {
88 if !p.is_nan() {
89 if val < p {
90 sorted_asc = false;
91 }
92 if val > p {
93 sorted_desc = false;
94 }
95 }
96 }
97 }
98 prev = Some(val);
99 }
100
101 ColumnStats {
102 min_f64: if min.is_finite() { Some(min) } else { None },
103 max_f64: if max.is_finite() { Some(max) } else { None },
104 min_i64: None,
105 max_i64: None,
106 sorted_asc,
107 sorted_desc,
108 n_distinct: None,
109 n_null,
110 nrows: v.len(),
111 }
112 }
113
114 fn compute_int(v: &[i64]) -> Self {
115 if v.is_empty() {
116 return Self::empty();
117 }
118
119 let mut min = i64::MAX;
120 let mut max = i64::MIN;
121 let mut sorted_asc = true;
122 let mut sorted_desc = true;
123
124 for i in 0..v.len() {
125 let val = v[i];
126 if val < min {
127 min = val;
128 }
129 if val > max {
130 max = val;
131 }
132 if i > 0 {
133 if val < v[i - 1] {
134 sorted_asc = false;
135 }
136 if val > v[i - 1] {
137 sorted_desc = false;
138 }
139 }
140 }
141
142 ColumnStats {
143 min_f64: None,
144 max_f64: None,
145 min_i64: Some(min),
146 max_i64: Some(max),
147 sorted_asc,
148 sorted_desc,
149 n_distinct: None,
150 n_null: 0,
151 nrows: v.len(),
152 }
153 }
154
155 fn compute_str(v: &[String]) -> Self {
156 if v.is_empty() {
157 return Self::empty();
158 }
159
160 let mut sorted_asc = true;
161 let mut sorted_desc = true;
162 let mut distinct = BTreeSet::new();
164
165 for i in 0..v.len() {
166 distinct.insert(v[i].as_str());
167 if i > 0 {
168 if v[i] < v[i - 1] {
169 sorted_asc = false;
170 }
171 if v[i] > v[i - 1] {
172 sorted_desc = false;
173 }
174 }
175 }
176
177 ColumnStats {
178 min_f64: None,
179 max_f64: None,
180 min_i64: None,
181 max_i64: None,
182 sorted_asc,
183 sorted_desc,
184 n_distinct: Some(distinct.len()),
185 n_null: 0,
186 nrows: v.len(),
187 }
188 }
189
190 fn compute_bool(v: &[bool]) -> Self {
191 if v.is_empty() {
192 return Self::empty();
193 }
194
195 let mut sorted_asc = true;
196 let mut sorted_desc = true;
197 let mut has_true = false;
198 let mut has_false = false;
199
200 for i in 0..v.len() {
201 if v[i] {
202 has_true = true;
203 } else {
204 has_false = true;
205 }
206 if i > 0 {
207 if v[i] < v[i - 1] {
209 sorted_asc = false;
210 }
211 if v[i] > v[i - 1] {
212 sorted_desc = false;
213 }
214 }
215 }
216
217 let n_distinct = match (has_true, has_false) {
218 (true, true) => 2,
219 (true, false) | (false, true) => 1,
220 _ => 0,
221 };
222
223 ColumnStats {
224 min_f64: None,
225 max_f64: None,
226 min_i64: None,
227 max_i64: None,
228 sorted_asc,
229 sorted_desc,
230 n_distinct: Some(n_distinct),
231 n_null: 0,
232 nrows: v.len(),
233 }
234 }
235
236 fn compute_categorical(codes: &[u32], levels: &[String]) -> Self {
237 if codes.is_empty() {
238 return Self::empty();
239 }
240
241 let mut min = u32::MAX;
242 let mut max = u32::MIN;
243 let mut sorted_asc = true;
244 let mut sorted_desc = true;
245 let mut distinct = BTreeSet::new();
246
247 for i in 0..codes.len() {
248 let c = codes[i];
249 if c < min {
250 min = c;
251 }
252 if c > max {
253 max = c;
254 }
255 distinct.insert(c);
256 if i > 0 {
257 if c < codes[i - 1] {
258 sorted_asc = false;
259 }
260 if c > codes[i - 1] {
261 sorted_desc = false;
262 }
263 }
264 }
265
266 ColumnStats {
267 min_f64: None,
268 max_f64: None,
269 min_i64: Some(min as i64),
270 max_i64: Some(max as i64),
271 sorted_asc,
272 sorted_desc,
273 n_distinct: Some(distinct.len().min(levels.len())),
274 n_null: 0,
275 nrows: codes.len(),
276 }
277 }
278
279 fn compute_datetime(v: &[i64]) -> Self {
280 Self::compute_int(v)
282 }
283
284 fn empty() -> Self {
285 ColumnStats {
286 min_f64: None,
287 max_f64: None,
288 min_i64: None,
289 max_i64: None,
290 sorted_asc: true, sorted_desc: true, n_distinct: Some(0),
293 n_null: 0,
294 nrows: 0,
295 }
296 }
297
298 pub fn can_skip_gt(&self, threshold: f64) -> bool {
302 self.max_f64.map_or(false, |max| max <= threshold)
303 }
304
305 pub fn can_skip_lt(&self, threshold: f64) -> bool {
307 self.min_f64.map_or(false, |min| min >= threshold)
308 }
309
310 pub fn can_skip_ge(&self, threshold: f64) -> bool {
312 self.max_f64.map_or(false, |max| max < threshold)
313 }
314
315 pub fn can_skip_le(&self, threshold: f64) -> bool {
317 self.min_f64.map_or(false, |min| min > threshold)
318 }
319
320 pub fn can_skip_eq_f64(&self, value: f64) -> bool {
322 match (self.min_f64, self.max_f64) {
323 (Some(min), Some(max)) => value < min || value > max,
324 _ => false,
325 }
326 }
327
328 pub fn can_skip_gt_i64(&self, threshold: i64) -> bool {
330 self.max_i64.map_or(false, |max| max <= threshold)
331 }
332
333 pub fn can_skip_lt_i64(&self, threshold: i64) -> bool {
335 self.min_i64.map_or(false, |min| min >= threshold)
336 }
337
338 pub fn can_skip_eq_i64(&self, value: i64) -> bool {
340 match (self.min_i64, self.max_i64) {
341 (Some(min), Some(max)) => value < min || value > max,
342 _ => false,
343 }
344 }
345
346 pub fn is_sorted(&self) -> bool {
350 self.sorted_asc || self.sorted_desc
351 }
352
353 pub fn binary_search_range_f64(col: &[f64], op: CompOp, threshold: f64) -> (usize, usize) {
359 match op {
360 CompOp::Gt => {
361 let start = col.partition_point(|&v| v <= threshold);
362 (start, col.len())
363 }
364 CompOp::Ge => {
365 let start = col.partition_point(|&v| v < threshold);
366 (start, col.len())
367 }
368 CompOp::Lt => {
369 let end = col.partition_point(|&v| v < threshold);
370 (0, end)
371 }
372 CompOp::Le => {
373 let end = col.partition_point(|&v| v <= threshold);
374 (0, end)
375 }
376 CompOp::Eq => {
377 let start = col.partition_point(|&v| v < threshold);
378 let end = col.partition_point(|&v| v <= threshold);
379 (start, end)
380 }
381 }
382 }
383
384 pub fn binary_search_range_i64(col: &[i64], op: CompOp, threshold: i64) -> (usize, usize) {
387 match op {
388 CompOp::Gt => {
389 let start = col.partition_point(|&v| v <= threshold);
390 (start, col.len())
391 }
392 CompOp::Ge => {
393 let start = col.partition_point(|&v| v < threshold);
394 (start, col.len())
395 }
396 CompOp::Lt => {
397 let end = col.partition_point(|&v| v < threshold);
398 (0, end)
399 }
400 CompOp::Le => {
401 let end = col.partition_point(|&v| v <= threshold);
402 (0, end)
403 }
404 CompOp::Eq => {
405 let start = col.partition_point(|&v| v < threshold);
406 let end = col.partition_point(|&v| v <= threshold);
407 (start, end)
408 }
409 }
410 }
411}
412
413#[derive(Debug, Clone)]
415pub struct DataFrameStats {
416 pub column_stats: Vec<(String, ColumnStats)>,
418}
419
420impl DataFrameStats {
421 pub fn compute(df: &crate::DataFrame) -> Self {
423 let column_stats = df
424 .columns
425 .iter()
426 .map(|(name, col)| (name.clone(), ColumnStats::compute(col)))
427 .collect();
428 DataFrameStats { column_stats }
429 }
430
431 pub fn get(&self, name: &str) -> Option<&ColumnStats> {
433 self.column_stats
434 .iter()
435 .find(|(n, _)| n == name)
436 .map(|(_, s)| s)
437 }
438}
439
440#[cfg(test)]
443mod tests {
444 use super::*;
445 use crate::{Column, DataFrame};
446
447 #[test]
449 fn test_float_min_max() {
450 let col = Column::Float(vec![3.0, 1.0, 4.0, 1.5, 9.2]);
451 let stats = ColumnStats::compute(&col);
452 assert_eq!(stats.min_f64, Some(1.0));
453 assert_eq!(stats.max_f64, Some(9.2));
454 assert_eq!(stats.nrows, 5);
455 assert_eq!(stats.n_null, 0);
456 }
457
458 #[test]
460 fn test_int_min_max() {
461 let col = Column::Int(vec![10, -3, 42, 0, 7]);
462 let stats = ColumnStats::compute(&col);
463 assert_eq!(stats.min_i64, Some(-3));
464 assert_eq!(stats.max_i64, Some(42));
465 assert_eq!(stats.nrows, 5);
466 }
467
468 #[test]
470 fn test_sorted_ascending() {
471 let col = Column::Float(vec![1.0, 2.0, 3.0, 4.0, 5.0]);
472 let stats = ColumnStats::compute(&col);
473 assert!(stats.sorted_asc);
474 assert!(!stats.sorted_desc);
475 assert!(stats.is_sorted());
476 }
477
478 #[test]
480 fn test_sorted_descending() {
481 let col = Column::Int(vec![50, 40, 30, 20, 10]);
482 let stats = ColumnStats::compute(&col);
483 assert!(!stats.sorted_asc);
484 assert!(stats.sorted_desc);
485 assert!(stats.is_sorted());
486 }
487
488 #[test]
490 fn test_unsorted() {
491 let col = Column::Float(vec![3.0, 1.0, 4.0, 1.5, 9.2]);
492 let stats = ColumnStats::compute(&col);
493 assert!(!stats.sorted_asc);
494 assert!(!stats.sorted_desc);
495 assert!(!stats.is_sorted());
496 }
497
498 #[test]
500 fn test_can_skip_gt() {
501 let col = Column::Float(vec![1.0, 2.0, 3.0]);
502 let stats = ColumnStats::compute(&col);
503 assert!(stats.can_skip_gt(5.0));
505 assert!(stats.can_skip_gt(3.0));
507 assert!(!stats.can_skip_gt(2.0));
509 }
510
511 #[test]
513 fn test_can_skip_lt() {
514 let col = Column::Float(vec![5.0, 6.0, 7.0]);
515 let stats = ColumnStats::compute(&col);
516 assert!(stats.can_skip_lt(3.0));
518 assert!(stats.can_skip_lt(5.0));
520 assert!(!stats.can_skip_lt(6.0));
522 }
523
524 #[test]
526 fn test_can_skip_ge() {
527 let col = Column::Float(vec![1.0, 2.0, 3.0]);
528 let stats = ColumnStats::compute(&col);
529 assert!(stats.can_skip_ge(4.0));
531 assert!(!stats.can_skip_ge(3.0));
533 }
534
535 #[test]
537 fn test_can_skip_le() {
538 let col = Column::Float(vec![5.0, 6.0, 7.0]);
539 let stats = ColumnStats::compute(&col);
540 assert!(stats.can_skip_le(4.0));
542 assert!(!stats.can_skip_le(5.0));
544 }
545
546 #[test]
548 fn test_can_skip_eq() {
549 let col = Column::Float(vec![10.0, 20.0, 30.0]);
550 let stats = ColumnStats::compute(&col);
551 assert!(stats.can_skip_eq_f64(5.0));
553 assert!(stats.can_skip_eq_f64(35.0));
554 assert!(!stats.can_skip_eq_f64(15.0));
556 assert!(!stats.can_skip_eq_f64(10.0));
557 assert!(!stats.can_skip_eq_f64(30.0));
558 }
559
560 #[test]
562 fn test_binary_search_range_f64() {
563 let data = vec![1.0, 2.0, 3.0, 4.0, 5.0];
564
565 assert_eq!(
567 ColumnStats::binary_search_range_f64(&data, CompOp::Gt, 3.0),
568 (3, 5)
569 );
570 assert_eq!(
572 ColumnStats::binary_search_range_f64(&data, CompOp::Ge, 3.0),
573 (2, 5)
574 );
575 assert_eq!(
577 ColumnStats::binary_search_range_f64(&data, CompOp::Lt, 3.0),
578 (0, 2)
579 );
580 assert_eq!(
582 ColumnStats::binary_search_range_f64(&data, CompOp::Le, 3.0),
583 (0, 3)
584 );
585 assert_eq!(
587 ColumnStats::binary_search_range_f64(&data, CompOp::Eq, 3.0),
588 (2, 3)
589 );
590 assert_eq!(
592 ColumnStats::binary_search_range_f64(&data, CompOp::Eq, 2.5),
593 (2, 2)
594 );
595 }
596
597 #[test]
599 fn test_binary_search_range_i64() {
600 let data = vec![10, 20, 30, 40, 50];
601
602 assert_eq!(
603 ColumnStats::binary_search_range_i64(&data, CompOp::Gt, 30),
604 (3, 5)
605 );
606 assert_eq!(
607 ColumnStats::binary_search_range_i64(&data, CompOp::Eq, 30),
608 (2, 3)
609 );
610 assert_eq!(
611 ColumnStats::binary_search_range_i64(&data, CompOp::Lt, 10),
612 (0, 0)
613 );
614 }
615
616 #[test]
618 fn test_nan_handling() {
619 let col = Column::Float(vec![f64::NAN, 2.0, f64::NAN, 5.0, 1.0]);
620 let stats = ColumnStats::compute(&col);
621 assert_eq!(stats.min_f64, Some(1.0));
622 assert_eq!(stats.max_f64, Some(5.0));
623 assert_eq!(stats.n_null, 2);
624 assert_eq!(stats.nrows, 5);
625 }
626
627 #[test]
629 fn test_all_nan_column() {
630 let col = Column::Float(vec![f64::NAN, f64::NAN]);
631 let stats = ColumnStats::compute(&col);
632 assert_eq!(stats.min_f64, None);
633 assert_eq!(stats.max_f64, None);
634 assert_eq!(stats.n_null, 2);
635 assert!(!stats.can_skip_gt(0.0));
637 }
638
639 #[test]
641 fn test_empty_column() {
642 let col = Column::Float(vec![]);
643 let stats = ColumnStats::compute(&col);
644 assert_eq!(stats.min_f64, None);
645 assert_eq!(stats.max_f64, None);
646 assert_eq!(stats.nrows, 0);
647 assert!(stats.sorted_asc);
648 assert!(stats.sorted_desc);
649 assert!(stats.is_sorted());
650 assert_eq!(stats.n_distinct, Some(0));
651 }
652
653 #[test]
655 fn test_empty_int_column() {
656 let col = Column::Int(vec![]);
657 let stats = ColumnStats::compute(&col);
658 assert_eq!(stats.min_i64, None);
659 assert_eq!(stats.max_i64, None);
660 assert_eq!(stats.nrows, 0);
661 assert!(stats.is_sorted());
662 }
663
664 #[test]
666 fn test_str_n_distinct() {
667 let col = Column::Str(vec![
668 "apple".into(),
669 "banana".into(),
670 "apple".into(),
671 "cherry".into(),
672 ]);
673 let stats = ColumnStats::compute(&col);
674 assert_eq!(stats.n_distinct, Some(3));
675 assert_eq!(stats.nrows, 4);
676 }
677
678 #[test]
680 fn test_sorted_str() {
681 let col = Column::Str(vec!["a".into(), "b".into(), "c".into()]);
682 let stats = ColumnStats::compute(&col);
683 assert!(stats.sorted_asc);
684 assert!(!stats.sorted_desc);
685 }
686
687 #[test]
689 fn test_dataframe_stats() {
690 let mut df = DataFrame::new();
691 df.columns
692 .push(("age".into(), Column::Int(vec![25, 30, 45])));
693 df.columns
694 .push(("score".into(), Column::Float(vec![88.5, 92.0, 76.3])));
695
696 let stats = DataFrameStats::compute(&df);
697 assert_eq!(stats.column_stats.len(), 2);
698
699 let age_stats = stats.get("age").unwrap();
700 assert_eq!(age_stats.min_i64, Some(25));
701 assert_eq!(age_stats.max_i64, Some(45));
702
703 let score_stats = stats.get("score").unwrap();
704 assert_eq!(score_stats.min_f64, Some(76.3));
705 assert_eq!(score_stats.max_f64, Some(92.0));
706
707 assert!(stats.get("nonexistent").is_none());
709 }
710
711 #[test]
713 fn test_zone_map_can_skip() {
714 let col = Column::Float(vec![100.0, 200.0, 300.0]);
716 let stats = ColumnStats::compute(&col);
717
718 assert!(stats.can_skip_gt(500.0));
720 assert!(stats.can_skip_lt(50.0));
722 assert!(stats.can_skip_eq_f64(400.0));
724 assert!(stats.can_skip_eq_f64(50.0));
726 }
727
728 #[test]
730 fn test_zone_map_cannot_skip() {
731 let col = Column::Float(vec![100.0, 200.0, 300.0]);
732 let stats = ColumnStats::compute(&col);
733
734 assert!(!stats.can_skip_gt(150.0));
736 assert!(!stats.can_skip_lt(250.0));
738 assert!(!stats.can_skip_eq_f64(200.0));
740 }
741
742 #[test]
744 fn test_determinism() {
745 let data = vec![3.14, 2.71, 1.41, 1.73, 0.577];
746 for _ in 0..3 {
747 let col = Column::Float(data.clone());
748 let stats = ColumnStats::compute(&col);
749 assert_eq!(stats.min_f64, Some(0.577));
750 assert_eq!(stats.max_f64, Some(3.14));
751 assert!(!stats.sorted_asc);
752 assert!(!stats.sorted_desc);
753 assert_eq!(stats.n_null, 0);
754 assert_eq!(stats.nrows, 5);
755 }
756 }
757
758 #[test]
760 fn test_determinism_int_str() {
761 for _ in 0..3 {
762 let icol = Column::Int(vec![9, 1, 5, 3, 7]);
763 let is = ColumnStats::compute(&icol);
764 assert_eq!(is.min_i64, Some(1));
765 assert_eq!(is.max_i64, Some(9));
766
767 let scol = Column::Str(vec!["x".into(), "a".into(), "m".into()]);
768 let ss = ColumnStats::compute(&scol);
769 assert_eq!(ss.n_distinct, Some(3));
770 }
771 }
772
773 #[test]
775 fn test_bool_column() {
776 let col = Column::Bool(vec![true, false, true, true]);
777 let stats = ColumnStats::compute(&col);
778 assert_eq!(stats.n_distinct, Some(2));
779 assert_eq!(stats.nrows, 4);
780 assert!(!stats.sorted_asc);
781 assert!(!stats.sorted_desc);
782 }
783
784 #[test]
786 fn test_categorical_column() {
787 let col = Column::Categorical {
788 levels: vec!["low".into(), "med".into(), "high".into()],
789 codes: vec![0, 1, 2, 1, 0],
790 };
791 let stats = ColumnStats::compute(&col);
792 assert_eq!(stats.min_i64, Some(0));
793 assert_eq!(stats.max_i64, Some(2));
794 assert_eq!(stats.n_distinct, Some(3));
795 assert!(!stats.is_sorted());
796 }
797
798 #[test]
800 fn test_single_element() {
801 let col = Column::Float(vec![42.0]);
802 let stats = ColumnStats::compute(&col);
803 assert_eq!(stats.min_f64, Some(42.0));
804 assert_eq!(stats.max_f64, Some(42.0));
805 assert!(stats.sorted_asc);
806 assert!(stats.sorted_desc);
807 assert!(stats.is_sorted());
808 }
809
810 #[test]
812 fn test_int_zone_map_skip() {
813 let col = Column::Int(vec![10, 20, 30]);
814 let stats = ColumnStats::compute(&col);
815 assert!(stats.can_skip_gt_i64(30));
816 assert!(stats.can_skip_gt_i64(50));
817 assert!(!stats.can_skip_gt_i64(15));
818 assert!(stats.can_skip_lt_i64(5));
819 assert!(stats.can_skip_lt_i64(10));
820 assert!(!stats.can_skip_lt_i64(20));
821 assert!(stats.can_skip_eq_i64(5));
822 assert!(stats.can_skip_eq_i64(35));
823 assert!(!stats.can_skip_eq_i64(20));
824 }
825
826 #[test]
828 fn test_constant_column() {
829 let col = Column::Float(vec![7.0, 7.0, 7.0]);
830 let stats = ColumnStats::compute(&col);
831 assert_eq!(stats.min_f64, Some(7.0));
832 assert_eq!(stats.max_f64, Some(7.0));
833 assert!(stats.sorted_asc);
835 assert!(stats.sorted_desc);
836 assert!(stats.can_skip_eq_f64(8.0));
837 assert!(!stats.can_skip_eq_f64(7.0));
838 }
839}