Skip to main content

quill_sql/catalog/
stats.rs

1use std::cmp::Ordering;
2use std::collections::{HashMap, HashSet};
3
4use crate::catalog::Schema;
5use crate::expression::BinaryOp;
6use crate::storage::tuple::Tuple;
7use crate::utils::scalar::ScalarValue;
8
9const DISTINCT_SAMPLE_LIMIT: usize = 1024;
10const MIN_SELECTIVITY: f64 = 0.01;
11
12/// Basic per-table statistics derived from a table scan.
13#[derive(Debug, Clone)]
14pub struct TableStatistics {
15    /// Number of visible (non-deleted) tuples discovered.
16    pub row_count: u64,
17    /// Per-column stats keyed by column name.
18    pub column_stats: HashMap<String, ColumnStatistics>,
19}
20
21impl TableStatistics {
22    pub fn empty(schema: &Schema) -> Self {
23        let column_stats = schema
24            .columns
25            .iter()
26            .map(|col| (col.name.clone(), ColumnStatistics::default()))
27            .collect();
28        Self {
29            row_count: 0,
30            column_stats,
31        }
32    }
33
34    pub fn record_tuple(&mut self, tuple: &Tuple) {
35        self.row_count += 1;
36        for (column, value) in tuple.schema.columns.iter().zip(tuple.data.iter()) {
37            if let Some(stats) = self.column_stats.get_mut(&column.name) {
38                stats.record_value(value);
39            }
40        }
41    }
42}
43
44#[derive(Debug, Clone, Default)]
45pub struct ColumnStatistics {
46    pub null_count: u64,
47    pub non_null_count: u64,
48    pub min: Option<ScalarValue>,
49    pub max: Option<ScalarValue>,
50    distinct_sample: HashSet<ScalarValue>,
51    sampled_non_null_count: u64,
52    distinct_estimate: f64,
53}
54
55impl ColumnStatistics {
56    pub fn record_value(&mut self, value: &ScalarValue) {
57        if value.is_null() {
58            self.null_count += 1;
59            return;
60        }
61
62        self.non_null_count += 1;
63        let is_new_min = self
64            .min
65            .as_ref()
66            .is_none_or(|min| matches!(value.partial_cmp(min), Some(Ordering::Less)));
67        if is_new_min {
68            self.min = Some(value.clone());
69        }
70
71        let is_new_max = self
72            .max
73            .as_ref()
74            .is_none_or(|max| matches!(value.partial_cmp(max), Some(Ordering::Greater)));
75        if is_new_max {
76            self.max = Some(value.clone());
77        }
78
79        if self.sampled_non_null_count < DISTINCT_SAMPLE_LIMIT as u64 {
80            self.sampled_non_null_count += 1;
81            self.distinct_sample.insert(value.clone());
82            self.distinct_estimate = self.distinct_sample.len() as f64;
83        } else if self.sampled_non_null_count > 0 {
84            let sample_unique = self.distinct_sample.len() as f64;
85            let sample_size = self.sampled_non_null_count as f64;
86            self.distinct_estimate = (sample_unique / sample_size) * self.non_null_count as f64;
87        } else {
88            self.distinct_estimate = self.non_null_count as f64;
89        }
90    }
91
92    pub fn total_count(&self) -> u64 {
93        self.null_count + self.non_null_count
94    }
95
96    pub fn estimated_distinct_count(&self) -> Option<f64> {
97        if self.non_null_count == 0 {
98            return None;
99        }
100        let estimate = if self.distinct_estimate > 0.0 {
101            self.distinct_estimate
102        } else {
103            self.non_null_count as f64
104        };
105        Some(estimate.max(1.0))
106    }
107
108    pub fn selectivity_for_op(&self, op: BinaryOp, literal: &ScalarValue) -> Option<f64> {
109        match op {
110            BinaryOp::Eq => self
111                .estimated_distinct_count()
112                .map(|distinct| (1.0 / distinct).max(MIN_SELECTIVITY)),
113            BinaryOp::NotEq => self
114                .estimated_distinct_count()
115                .map(|distinct| (1.0 - 1.0 / distinct).clamp(MIN_SELECTIVITY, 1.0)),
116            BinaryOp::Gt | BinaryOp::GtEq | BinaryOp::Lt | BinaryOp::LtEq => {
117                self.range_selectivity(op, literal)
118            }
119            _ => None,
120        }
121    }
122
123    fn range_selectivity(&self, op: BinaryOp, literal: &ScalarValue) -> Option<f64> {
124        let min = self.min.as_ref()?;
125        let max = self.max.as_ref()?;
126        let literal_cast = literal.cast_to(&min.data_type()).ok()?;
127
128        let min_f = scalar_to_f64(min)?;
129        let max_f = scalar_to_f64(max)?;
130        let literal_f = scalar_to_f64(&literal_cast)?;
131
132        if max_f <= min_f {
133            return Some(0.5);
134        }
135        let fraction = ((literal_f - min_f) / (max_f - min_f)).clamp(0.0, 1.0);
136        let sel = match op {
137            BinaryOp::Gt => 1.0 - fraction,
138            BinaryOp::GtEq => 1.0 - fraction,
139            BinaryOp::Lt => fraction,
140            BinaryOp::LtEq => fraction,
141            _ => return None,
142        };
143        Some(sel.clamp(MIN_SELECTIVITY, 1.0))
144    }
145}
146
147fn scalar_to_f64(value: &ScalarValue) -> Option<f64> {
148    match value {
149        ScalarValue::Int8(Some(v)) => Some(*v as f64),
150        ScalarValue::Int16(Some(v)) => Some(*v as f64),
151        ScalarValue::Int32(Some(v)) => Some(*v as f64),
152        ScalarValue::Int64(Some(v)) => Some(*v as f64),
153        ScalarValue::UInt8(Some(v)) => Some(*v as f64),
154        ScalarValue::UInt16(Some(v)) => Some(*v as f64),
155        ScalarValue::UInt32(Some(v)) => Some(*v as f64),
156        ScalarValue::UInt64(Some(v)) => Some(*v as f64),
157        ScalarValue::Float32(Some(v)) => Some(*v as f64),
158        ScalarValue::Float64(Some(v)) => Some(*v),
159        _ => None,
160    }
161}
162
163#[cfg(test)]
164mod tests {
165    use super::*;
166
167    #[test]
168    fn eq_selectivity_reflects_distinct_count() {
169        let mut stats = ColumnStatistics::default();
170        stats.record_value(&ScalarValue::Int32(Some(1)));
171        stats.record_value(&ScalarValue::Int32(Some(2)));
172        stats.record_value(&ScalarValue::Int32(Some(1)));
173
174        let sel = stats
175            .selectivity_for_op(BinaryOp::Eq, &ScalarValue::Int32(Some(1)))
176            .unwrap();
177        assert!(sel > 0.4 && sel < 0.6);
178    }
179
180    #[test]
181    fn range_selectivity_uses_min_max() {
182        let mut stats = ColumnStatistics::default();
183        for v in 0..=10 {
184            stats.record_value(&ScalarValue::Int32(Some(v)));
185        }
186        let gt_sel = stats
187            .selectivity_for_op(BinaryOp::Gt, &ScalarValue::Int32(Some(5)))
188            .unwrap();
189        assert!(gt_sel > 0.4 && gt_sel < 0.6);
190    }
191}