Skip to main content

cjc_data/
column_meta.rs

1//! Column-level metadata for query optimization.
2//!
3//! Zone maps (min/max per column) and sorted flags enable
4//! the TidyView optimizer to skip unnecessary work.
5
6use crate::Column;
7use std::collections::BTreeSet;
8
9/// Comparison operator for filter predicates.
10#[derive(Debug, Clone, Copy, PartialEq, Eq)]
11pub enum CompOp {
12    Gt,
13    Lt,
14    Ge,
15    Le,
16    Eq,
17}
18
19/// Statistical metadata for a single column.
20#[derive(Debug, Clone)]
21pub struct ColumnStats {
22    /// Minimum value (for numeric columns)
23    pub min_f64: Option<f64>,
24    /// Maximum value (for numeric columns)
25    pub max_f64: Option<f64>,
26    /// Minimum value (for int columns)
27    pub min_i64: Option<i64>,
28    /// Maximum value (for int columns)
29    pub max_i64: Option<i64>,
30    /// Whether the column is sorted ascending
31    pub sorted_asc: bool,
32    /// Whether the column is sorted descending
33    pub sorted_desc: bool,
34    /// Number of distinct values (exact for small columns, estimated for large)
35    pub n_distinct: Option<usize>,
36    /// Number of null/NaN values
37    pub n_null: usize,
38    /// Total number of rows
39    pub nrows: usize,
40}
41
42impl ColumnStats {
43    /// Compute stats for a column.
44    pub fn compute(col: &Column) -> Self {
45        // CategoricalAdaptive: convert to legacy Categorical and recurse.
46        if matches!(col, Column::CategoricalAdaptive(_)) {
47            return Self::compute(&col.to_legacy_categorical());
48        }
49        match col {
50            Column::Float(v) => Self::compute_float(v),
51            Column::Int(v) => Self::compute_int(v),
52            Column::Str(v) => Self::compute_str(v),
53            Column::Bool(v) => Self::compute_bool(v),
54            Column::Categorical { codes, levels } => Self::compute_categorical(codes, levels),
55            Column::DateTime(v) => Self::compute_datetime(v),
56            Column::CategoricalAdaptive(_) => unreachable!("handled by early return"),
57        }
58    }
59
60    fn compute_float(v: &[f64]) -> Self {
61        if v.is_empty() {
62            return Self::empty();
63        }
64
65        let mut min = f64::INFINITY;
66        let mut max = f64::NEG_INFINITY;
67        let mut n_null = 0usize;
68        let mut sorted_asc = true;
69        let mut sorted_desc = true;
70        let mut prev: Option<f64> = None;
71
72        for &val in v {
73            if val.is_nan() {
74                n_null += 1;
75                // NaN breaks sort order
76                if prev.is_some() {
77                    sorted_asc = false;
78                    sorted_desc = false;
79                }
80            } else {
81                if val < min {
82                    min = val;
83                }
84                if val > max {
85                    max = val;
86                }
87                if let Some(p) = prev {
88                    if !p.is_nan() {
89                        if val < p {
90                            sorted_asc = false;
91                        }
92                        if val > p {
93                            sorted_desc = false;
94                        }
95                    }
96                }
97            }
98            prev = Some(val);
99        }
100
101        ColumnStats {
102            min_f64: if min.is_finite() { Some(min) } else { None },
103            max_f64: if max.is_finite() { Some(max) } else { None },
104            min_i64: None,
105            max_i64: None,
106            sorted_asc,
107            sorted_desc,
108            n_distinct: None,
109            n_null,
110            nrows: v.len(),
111        }
112    }
113
114    fn compute_int(v: &[i64]) -> Self {
115        if v.is_empty() {
116            return Self::empty();
117        }
118
119        let mut min = i64::MAX;
120        let mut max = i64::MIN;
121        let mut sorted_asc = true;
122        let mut sorted_desc = true;
123
124        for i in 0..v.len() {
125            let val = v[i];
126            if val < min {
127                min = val;
128            }
129            if val > max {
130                max = val;
131            }
132            if i > 0 {
133                if val < v[i - 1] {
134                    sorted_asc = false;
135                }
136                if val > v[i - 1] {
137                    sorted_desc = false;
138                }
139            }
140        }
141
142        ColumnStats {
143            min_f64: None,
144            max_f64: None,
145            min_i64: Some(min),
146            max_i64: Some(max),
147            sorted_asc,
148            sorted_desc,
149            n_distinct: None,
150            n_null: 0,
151            nrows: v.len(),
152        }
153    }
154
155    fn compute_str(v: &[String]) -> Self {
156        if v.is_empty() {
157            return Self::empty();
158        }
159
160        let mut sorted_asc = true;
161        let mut sorted_desc = true;
162        // Use BTreeSet for deterministic distinct counting
163        let mut distinct = BTreeSet::new();
164
165        for i in 0..v.len() {
166            distinct.insert(v[i].as_str());
167            if i > 0 {
168                if v[i] < v[i - 1] {
169                    sorted_asc = false;
170                }
171                if v[i] > v[i - 1] {
172                    sorted_desc = false;
173                }
174            }
175        }
176
177        ColumnStats {
178            min_f64: None,
179            max_f64: None,
180            min_i64: None,
181            max_i64: None,
182            sorted_asc,
183            sorted_desc,
184            n_distinct: Some(distinct.len()),
185            n_null: 0,
186            nrows: v.len(),
187        }
188    }
189
190    fn compute_bool(v: &[bool]) -> Self {
191        if v.is_empty() {
192            return Self::empty();
193        }
194
195        let mut sorted_asc = true;
196        let mut sorted_desc = true;
197        let mut has_true = false;
198        let mut has_false = false;
199
200        for i in 0..v.len() {
201            if v[i] {
202                has_true = true;
203            } else {
204                has_false = true;
205            }
206            if i > 0 {
207                // false < true in Rust's Ord
208                if v[i] < v[i - 1] {
209                    sorted_asc = false;
210                }
211                if v[i] > v[i - 1] {
212                    sorted_desc = false;
213                }
214            }
215        }
216
217        let n_distinct = match (has_true, has_false) {
218            (true, true) => 2,
219            (true, false) | (false, true) => 1,
220            _ => 0,
221        };
222
223        ColumnStats {
224            min_f64: None,
225            max_f64: None,
226            min_i64: None,
227            max_i64: None,
228            sorted_asc,
229            sorted_desc,
230            n_distinct: Some(n_distinct),
231            n_null: 0,
232            nrows: v.len(),
233        }
234    }
235
236    fn compute_categorical(codes: &[u32], levels: &[String]) -> Self {
237        if codes.is_empty() {
238            return Self::empty();
239        }
240
241        let mut min = u32::MAX;
242        let mut max = u32::MIN;
243        let mut sorted_asc = true;
244        let mut sorted_desc = true;
245        let mut distinct = BTreeSet::new();
246
247        for i in 0..codes.len() {
248            let c = codes[i];
249            if c < min {
250                min = c;
251            }
252            if c > max {
253                max = c;
254            }
255            distinct.insert(c);
256            if i > 0 {
257                if c < codes[i - 1] {
258                    sorted_asc = false;
259                }
260                if c > codes[i - 1] {
261                    sorted_desc = false;
262                }
263            }
264        }
265
266        ColumnStats {
267            min_f64: None,
268            max_f64: None,
269            min_i64: Some(min as i64),
270            max_i64: Some(max as i64),
271            sorted_asc,
272            sorted_desc,
273            n_distinct: Some(distinct.len().min(levels.len())),
274            n_null: 0,
275            nrows: codes.len(),
276        }
277    }
278
279    fn compute_datetime(v: &[i64]) -> Self {
280        // DateTime columns store epoch millis as i64; reuse int logic.
281        Self::compute_int(v)
282    }
283
284    fn empty() -> Self {
285        ColumnStats {
286            min_f64: None,
287            max_f64: None,
288            min_i64: None,
289            max_i64: None,
290            sorted_asc: true,  // vacuously sorted
291            sorted_desc: true, // vacuously sorted
292            n_distinct: Some(0),
293            n_null: 0,
294            nrows: 0,
295        }
296    }
297
298    // ── Zone-map skip predicates ──────────────────────────────────────
299
300    /// Check if a filter `col > threshold` can be ruled out (all values <= threshold).
301    pub fn can_skip_gt(&self, threshold: f64) -> bool {
302        self.max_f64.map_or(false, |max| max <= threshold)
303    }
304
305    /// Check if a filter `col < threshold` can be ruled out (all values >= threshold).
306    pub fn can_skip_lt(&self, threshold: f64) -> bool {
307        self.min_f64.map_or(false, |min| min >= threshold)
308    }
309
310    /// Check if a filter `col >= threshold` can be ruled out (all values < threshold).
311    pub fn can_skip_ge(&self, threshold: f64) -> bool {
312        self.max_f64.map_or(false, |max| max < threshold)
313    }
314
315    /// Check if a filter `col <= threshold` can be ruled out (all values > threshold).
316    pub fn can_skip_le(&self, threshold: f64) -> bool {
317        self.min_f64.map_or(false, |min| min > threshold)
318    }
319
320    /// Check if a filter `col == value` can be ruled out (value outside [min, max]).
321    pub fn can_skip_eq_f64(&self, value: f64) -> bool {
322        match (self.min_f64, self.max_f64) {
323            (Some(min), Some(max)) => value < min || value > max,
324            _ => false,
325        }
326    }
327
328    /// Check if a filter `col > threshold` can be ruled out for int columns.
329    pub fn can_skip_gt_i64(&self, threshold: i64) -> bool {
330        self.max_i64.map_or(false, |max| max <= threshold)
331    }
332
333    /// Check if a filter `col < threshold` can be ruled out for int columns.
334    pub fn can_skip_lt_i64(&self, threshold: i64) -> bool {
335        self.min_i64.map_or(false, |min| min >= threshold)
336    }
337
338    /// Check if a filter `col == value` can be ruled out for int columns.
339    pub fn can_skip_eq_i64(&self, value: i64) -> bool {
340        match (self.min_i64, self.max_i64) {
341            (Some(min), Some(max)) => value < min || value > max,
342            _ => false,
343        }
344    }
345
346    // ── Sorted-column helpers ─────────────────────────────────────────
347
348    /// Check if column is sorted (ascending or descending).
349    pub fn is_sorted(&self) -> bool {
350        self.sorted_asc || self.sorted_desc
351    }
352
353    /// For a sorted ascending float column, find the row range satisfying a
354    /// comparison predicate via binary search. Returns `(start, end)` where
355    /// matching rows are `start..end`.
356    ///
357    /// Caller must ensure the slice is actually sorted ascending.
358    pub fn binary_search_range_f64(col: &[f64], op: CompOp, threshold: f64) -> (usize, usize) {
359        match op {
360            CompOp::Gt => {
361                let start = col.partition_point(|&v| v <= threshold);
362                (start, col.len())
363            }
364            CompOp::Ge => {
365                let start = col.partition_point(|&v| v < threshold);
366                (start, col.len())
367            }
368            CompOp::Lt => {
369                let end = col.partition_point(|&v| v < threshold);
370                (0, end)
371            }
372            CompOp::Le => {
373                let end = col.partition_point(|&v| v <= threshold);
374                (0, end)
375            }
376            CompOp::Eq => {
377                let start = col.partition_point(|&v| v < threshold);
378                let end = col.partition_point(|&v| v <= threshold);
379                (start, end)
380            }
381        }
382    }
383
384    /// For a sorted ascending int column, find the row range satisfying a
385    /// comparison predicate via binary search. Returns `(start, end)`.
386    pub fn binary_search_range_i64(col: &[i64], op: CompOp, threshold: i64) -> (usize, usize) {
387        match op {
388            CompOp::Gt => {
389                let start = col.partition_point(|&v| v <= threshold);
390                (start, col.len())
391            }
392            CompOp::Ge => {
393                let start = col.partition_point(|&v| v < threshold);
394                (start, col.len())
395            }
396            CompOp::Lt => {
397                let end = col.partition_point(|&v| v < threshold);
398                (0, end)
399            }
400            CompOp::Le => {
401                let end = col.partition_point(|&v| v <= threshold);
402                (0, end)
403            }
404            CompOp::Eq => {
405                let start = col.partition_point(|&v| v < threshold);
406                let end = col.partition_point(|&v| v <= threshold);
407                (start, end)
408            }
409        }
410    }
411}
412
413/// Metadata for an entire DataFrame.
414#[derive(Debug, Clone)]
415pub struct DataFrameStats {
416    /// Per-column stats, in column order.
417    pub column_stats: Vec<(String, ColumnStats)>,
418}
419
420impl DataFrameStats {
421    /// Compute stats for all columns in a DataFrame.
422    pub fn compute(df: &crate::DataFrame) -> Self {
423        let column_stats = df
424            .columns
425            .iter()
426            .map(|(name, col)| (name.clone(), ColumnStats::compute(col)))
427            .collect();
428        DataFrameStats { column_stats }
429    }
430
431    /// Get stats for a named column.
432    pub fn get(&self, name: &str) -> Option<&ColumnStats> {
433        self.column_stats
434            .iter()
435            .find(|(n, _)| n == name)
436            .map(|(_, s)| s)
437    }
438}
439
440// ── Tests ─────────────────────────────────────────────────────────────────
441
442#[cfg(test)]
443mod tests {
444    use super::*;
445    use crate::{Column, DataFrame};
446
447    // 1. Float column min/max computation
448    #[test]
449    fn test_float_min_max() {
450        let col = Column::Float(vec![3.0, 1.0, 4.0, 1.5, 9.2]);
451        let stats = ColumnStats::compute(&col);
452        assert_eq!(stats.min_f64, Some(1.0));
453        assert_eq!(stats.max_f64, Some(9.2));
454        assert_eq!(stats.nrows, 5);
455        assert_eq!(stats.n_null, 0);
456    }
457
458    // 2. Int column min/max computation
459    #[test]
460    fn test_int_min_max() {
461        let col = Column::Int(vec![10, -3, 42, 0, 7]);
462        let stats = ColumnStats::compute(&col);
463        assert_eq!(stats.min_i64, Some(-3));
464        assert_eq!(stats.max_i64, Some(42));
465        assert_eq!(stats.nrows, 5);
466    }
467
468    // 3. Sorted ascending detection
469    #[test]
470    fn test_sorted_ascending() {
471        let col = Column::Float(vec![1.0, 2.0, 3.0, 4.0, 5.0]);
472        let stats = ColumnStats::compute(&col);
473        assert!(stats.sorted_asc);
474        assert!(!stats.sorted_desc);
475        assert!(stats.is_sorted());
476    }
477
478    // 4. Sorted descending detection
479    #[test]
480    fn test_sorted_descending() {
481        let col = Column::Int(vec![50, 40, 30, 20, 10]);
482        let stats = ColumnStats::compute(&col);
483        assert!(!stats.sorted_asc);
484        assert!(stats.sorted_desc);
485        assert!(stats.is_sorted());
486    }
487
488    // 5. Unsorted detection
489    #[test]
490    fn test_unsorted() {
491        let col = Column::Float(vec![3.0, 1.0, 4.0, 1.5, 9.2]);
492        let stats = ColumnStats::compute(&col);
493        assert!(!stats.sorted_asc);
494        assert!(!stats.sorted_desc);
495        assert!(!stats.is_sorted());
496    }
497
498    // 6a. can_skip_gt
499    #[test]
500    fn test_can_skip_gt() {
501        let col = Column::Float(vec![1.0, 2.0, 3.0]);
502        let stats = ColumnStats::compute(&col);
503        // max is 3.0, threshold 5.0 => all values <= 5.0, can skip
504        assert!(stats.can_skip_gt(5.0));
505        // max is 3.0, threshold 3.0 => all values <= 3.0, can skip
506        assert!(stats.can_skip_gt(3.0));
507        // max is 3.0, threshold 2.0 => some values > 2.0, cannot skip
508        assert!(!stats.can_skip_gt(2.0));
509    }
510
511    // 6b. can_skip_lt
512    #[test]
513    fn test_can_skip_lt() {
514        let col = Column::Float(vec![5.0, 6.0, 7.0]);
515        let stats = ColumnStats::compute(&col);
516        // min is 5.0, threshold 3.0 => all values >= 3.0, can skip
517        assert!(stats.can_skip_lt(3.0));
518        // min is 5.0, threshold 5.0 => all values >= 5.0, can skip
519        assert!(stats.can_skip_lt(5.0));
520        // min is 5.0, threshold 6.0 => some values < 6.0, cannot skip
521        assert!(!stats.can_skip_lt(6.0));
522    }
523
524    // 6c. can_skip_ge
525    #[test]
526    fn test_can_skip_ge() {
527        let col = Column::Float(vec![1.0, 2.0, 3.0]);
528        let stats = ColumnStats::compute(&col);
529        // max is 3.0, threshold 4.0 => all values < 4.0, can skip col >= 4.0
530        assert!(stats.can_skip_ge(4.0));
531        // max is 3.0, threshold 3.0 => 3.0 >= 3.0 is true, cannot skip
532        assert!(!stats.can_skip_ge(3.0));
533    }
534
535    // 6d. can_skip_le
536    #[test]
537    fn test_can_skip_le() {
538        let col = Column::Float(vec![5.0, 6.0, 7.0]);
539        let stats = ColumnStats::compute(&col);
540        // min is 5.0, threshold 4.0 => all values > 4.0, can skip col <= 4.0
541        assert!(stats.can_skip_le(4.0));
542        // min is 5.0, threshold 5.0 => 5.0 <= 5.0 is true, cannot skip
543        assert!(!stats.can_skip_le(5.0));
544    }
545
546    // 6e. can_skip_eq
547    #[test]
548    fn test_can_skip_eq() {
549        let col = Column::Float(vec![10.0, 20.0, 30.0]);
550        let stats = ColumnStats::compute(&col);
551        // value outside range
552        assert!(stats.can_skip_eq_f64(5.0));
553        assert!(stats.can_skip_eq_f64(35.0));
554        // value inside range
555        assert!(!stats.can_skip_eq_f64(15.0));
556        assert!(!stats.can_skip_eq_f64(10.0));
557        assert!(!stats.can_skip_eq_f64(30.0));
558    }
559
560    // 7. Binary search range on sorted float column
561    #[test]
562    fn test_binary_search_range_f64() {
563        let data = vec![1.0, 2.0, 3.0, 4.0, 5.0];
564
565        // col > 3.0 => rows [3, 4] (values 4.0, 5.0)
566        assert_eq!(
567            ColumnStats::binary_search_range_f64(&data, CompOp::Gt, 3.0),
568            (3, 5)
569        );
570        // col >= 3.0 => rows [2, 3, 4] (values 3.0, 4.0, 5.0)
571        assert_eq!(
572            ColumnStats::binary_search_range_f64(&data, CompOp::Ge, 3.0),
573            (2, 5)
574        );
575        // col < 3.0 => rows [0, 1] (values 1.0, 2.0)
576        assert_eq!(
577            ColumnStats::binary_search_range_f64(&data, CompOp::Lt, 3.0),
578            (0, 2)
579        );
580        // col <= 3.0 => rows [0, 1, 2] (values 1.0, 2.0, 3.0)
581        assert_eq!(
582            ColumnStats::binary_search_range_f64(&data, CompOp::Le, 3.0),
583            (0, 3)
584        );
585        // col == 3.0 => row [2] (value 3.0)
586        assert_eq!(
587            ColumnStats::binary_search_range_f64(&data, CompOp::Eq, 3.0),
588            (2, 3)
589        );
590        // col == 2.5 => no match
591        assert_eq!(
592            ColumnStats::binary_search_range_f64(&data, CompOp::Eq, 2.5),
593            (2, 2)
594        );
595    }
596
597    // 7b. Binary search range on sorted int column
598    #[test]
599    fn test_binary_search_range_i64() {
600        let data = vec![10, 20, 30, 40, 50];
601
602        assert_eq!(
603            ColumnStats::binary_search_range_i64(&data, CompOp::Gt, 30),
604            (3, 5)
605        );
606        assert_eq!(
607            ColumnStats::binary_search_range_i64(&data, CompOp::Eq, 30),
608            (2, 3)
609        );
610        assert_eq!(
611            ColumnStats::binary_search_range_i64(&data, CompOp::Lt, 10),
612            (0, 0)
613        );
614    }
615
616    // 8. NaN handling: NaN does not affect min/max
617    #[test]
618    fn test_nan_handling() {
619        let col = Column::Float(vec![f64::NAN, 2.0, f64::NAN, 5.0, 1.0]);
620        let stats = ColumnStats::compute(&col);
621        assert_eq!(stats.min_f64, Some(1.0));
622        assert_eq!(stats.max_f64, Some(5.0));
623        assert_eq!(stats.n_null, 2);
624        assert_eq!(stats.nrows, 5);
625    }
626
627    // 8b. All-NaN column
628    #[test]
629    fn test_all_nan_column() {
630        let col = Column::Float(vec![f64::NAN, f64::NAN]);
631        let stats = ColumnStats::compute(&col);
632        assert_eq!(stats.min_f64, None);
633        assert_eq!(stats.max_f64, None);
634        assert_eq!(stats.n_null, 2);
635        // can_skip should return false (no stats available)
636        assert!(!stats.can_skip_gt(0.0));
637    }
638
639    // 9. Empty column stats
640    #[test]
641    fn test_empty_column() {
642        let col = Column::Float(vec![]);
643        let stats = ColumnStats::compute(&col);
644        assert_eq!(stats.min_f64, None);
645        assert_eq!(stats.max_f64, None);
646        assert_eq!(stats.nrows, 0);
647        assert!(stats.sorted_asc);
648        assert!(stats.sorted_desc);
649        assert!(stats.is_sorted());
650        assert_eq!(stats.n_distinct, Some(0));
651    }
652
653    // 9b. Empty int column
654    #[test]
655    fn test_empty_int_column() {
656        let col = Column::Int(vec![]);
657        let stats = ColumnStats::compute(&col);
658        assert_eq!(stats.min_i64, None);
659        assert_eq!(stats.max_i64, None);
660        assert_eq!(stats.nrows, 0);
661        assert!(stats.is_sorted());
662    }
663
664    // 10. String column n_distinct
665    #[test]
666    fn test_str_n_distinct() {
667        let col = Column::Str(vec![
668            "apple".into(),
669            "banana".into(),
670            "apple".into(),
671            "cherry".into(),
672        ]);
673        let stats = ColumnStats::compute(&col);
674        assert_eq!(stats.n_distinct, Some(3));
675        assert_eq!(stats.nrows, 4);
676    }
677
678    // 10b. Sorted string column
679    #[test]
680    fn test_sorted_str() {
681        let col = Column::Str(vec!["a".into(), "b".into(), "c".into()]);
682        let stats = ColumnStats::compute(&col);
683        assert!(stats.sorted_asc);
684        assert!(!stats.sorted_desc);
685    }
686
687    // 11. DataFrameStats compute + get
688    #[test]
689    fn test_dataframe_stats() {
690        let mut df = DataFrame::new();
691        df.columns
692            .push(("age".into(), Column::Int(vec![25, 30, 45])));
693        df.columns
694            .push(("score".into(), Column::Float(vec![88.5, 92.0, 76.3])));
695
696        let stats = DataFrameStats::compute(&df);
697        assert_eq!(stats.column_stats.len(), 2);
698
699        let age_stats = stats.get("age").unwrap();
700        assert_eq!(age_stats.min_i64, Some(25));
701        assert_eq!(age_stats.max_i64, Some(45));
702
703        let score_stats = stats.get("score").unwrap();
704        assert_eq!(score_stats.min_f64, Some(76.3));
705        assert_eq!(score_stats.max_f64, Some(92.0));
706
707        // Non-existent column
708        assert!(stats.get("nonexistent").is_none());
709    }
710
711    // 12. Zone map correctly identifies "can skip entire column"
712    #[test]
713    fn test_zone_map_can_skip() {
714        // Column with values [100, 200, 300]
715        let col = Column::Float(vec![100.0, 200.0, 300.0]);
716        let stats = ColumnStats::compute(&col);
717
718        // Filter: col > 500 => max=300 <= 500, can skip
719        assert!(stats.can_skip_gt(500.0));
720        // Filter: col < 50 => min=100 >= 50, can skip
721        assert!(stats.can_skip_lt(50.0));
722        // Filter: col == 400 => 400 > max, can skip
723        assert!(stats.can_skip_eq_f64(400.0));
724        // Filter: col == 50 => 50 < min, can skip
725        assert!(stats.can_skip_eq_f64(50.0));
726    }
727
728    // 13. Zone map correctly identifies "cannot skip"
729    #[test]
730    fn test_zone_map_cannot_skip() {
731        let col = Column::Float(vec![100.0, 200.0, 300.0]);
732        let stats = ColumnStats::compute(&col);
733
734        // Filter: col > 150 => max=300 > 150, cannot skip
735        assert!(!stats.can_skip_gt(150.0));
736        // Filter: col < 250 => min=100 < 250, cannot skip
737        assert!(!stats.can_skip_lt(250.0));
738        // Filter: col == 200 => in [100, 300], cannot skip
739        assert!(!stats.can_skip_eq_f64(200.0));
740    }
741
742    // 14. Determinism test: 3 runs produce identical stats
743    #[test]
744    fn test_determinism() {
745        let data = vec![3.14, 2.71, 1.41, 1.73, 0.577];
746        for _ in 0..3 {
747            let col = Column::Float(data.clone());
748            let stats = ColumnStats::compute(&col);
749            assert_eq!(stats.min_f64, Some(0.577));
750            assert_eq!(stats.max_f64, Some(3.14));
751            assert!(!stats.sorted_asc);
752            assert!(!stats.sorted_desc);
753            assert_eq!(stats.n_null, 0);
754            assert_eq!(stats.nrows, 5);
755        }
756    }
757
758    // 14b. Determinism for int + str columns
759    #[test]
760    fn test_determinism_int_str() {
761        for _ in 0..3 {
762            let icol = Column::Int(vec![9, 1, 5, 3, 7]);
763            let is = ColumnStats::compute(&icol);
764            assert_eq!(is.min_i64, Some(1));
765            assert_eq!(is.max_i64, Some(9));
766
767            let scol = Column::Str(vec!["x".into(), "a".into(), "m".into()]);
768            let ss = ColumnStats::compute(&scol);
769            assert_eq!(ss.n_distinct, Some(3));
770        }
771    }
772
773    // 15. Bool column stats
774    #[test]
775    fn test_bool_column() {
776        let col = Column::Bool(vec![true, false, true, true]);
777        let stats = ColumnStats::compute(&col);
778        assert_eq!(stats.n_distinct, Some(2));
779        assert_eq!(stats.nrows, 4);
780        assert!(!stats.sorted_asc);
781        assert!(!stats.sorted_desc);
782    }
783
784    // 16. Categorical column
785    #[test]
786    fn test_categorical_column() {
787        let col = Column::Categorical {
788            levels: vec!["low".into(), "med".into(), "high".into()],
789            codes: vec![0, 1, 2, 1, 0],
790        };
791        let stats = ColumnStats::compute(&col);
792        assert_eq!(stats.min_i64, Some(0));
793        assert_eq!(stats.max_i64, Some(2));
794        assert_eq!(stats.n_distinct, Some(3));
795        assert!(!stats.is_sorted());
796    }
797
798    // 17. Single-element column
799    #[test]
800    fn test_single_element() {
801        let col = Column::Float(vec![42.0]);
802        let stats = ColumnStats::compute(&col);
803        assert_eq!(stats.min_f64, Some(42.0));
804        assert_eq!(stats.max_f64, Some(42.0));
805        assert!(stats.sorted_asc);
806        assert!(stats.sorted_desc);
807        assert!(stats.is_sorted());
808    }
809
810    // 18. Int zone-map skip predicates
811    #[test]
812    fn test_int_zone_map_skip() {
813        let col = Column::Int(vec![10, 20, 30]);
814        let stats = ColumnStats::compute(&col);
815        assert!(stats.can_skip_gt_i64(30));
816        assert!(stats.can_skip_gt_i64(50));
817        assert!(!stats.can_skip_gt_i64(15));
818        assert!(stats.can_skip_lt_i64(5));
819        assert!(stats.can_skip_lt_i64(10));
820        assert!(!stats.can_skip_lt_i64(20));
821        assert!(stats.can_skip_eq_i64(5));
822        assert!(stats.can_skip_eq_i64(35));
823        assert!(!stats.can_skip_eq_i64(20));
824    }
825
826    // 19. Constant column (all same value)
827    #[test]
828    fn test_constant_column() {
829        let col = Column::Float(vec![7.0, 7.0, 7.0]);
830        let stats = ColumnStats::compute(&col);
831        assert_eq!(stats.min_f64, Some(7.0));
832        assert_eq!(stats.max_f64, Some(7.0));
833        // A constant column is both ascending and descending
834        assert!(stats.sorted_asc);
835        assert!(stats.sorted_desc);
836        assert!(stats.can_skip_eq_f64(8.0));
837        assert!(!stats.can_skip_eq_f64(7.0));
838    }
839}