quill_sql/catalog/
stats.rs

1use std::cmp::Ordering;
2use std::collections::{HashMap, HashSet};
3use std::sync::Arc;
4
5use crate::catalog::Schema;
6use crate::error::QuillSQLResult;
7use crate::expression::BinaryOp;
8use crate::storage::table_heap::{TableHeap, TableIterator};
9use crate::storage::tuple::Tuple;
10use crate::utils::scalar::ScalarValue;
11
12const DISTINCT_SAMPLE_LIMIT: usize = 1024;
13const MIN_SELECTIVITY: f64 = 0.01;
14
15/// Basic per-table statistics derived from a table scan.
16#[derive(Debug, Clone)]
17pub struct TableStatistics {
18    /// Number of visible (non-deleted) tuples discovered.
19    pub row_count: u64,
20    /// Per-column stats keyed by column name.
21    pub column_stats: HashMap<String, ColumnStatistics>,
22}
23
24impl TableStatistics {
25    pub fn empty(schema: &Schema) -> Self {
26        let column_stats = schema
27            .columns
28            .iter()
29            .map(|col| (col.name.clone(), ColumnStatistics::default()))
30            .collect();
31        Self {
32            row_count: 0,
33            column_stats,
34        }
35    }
36
37    pub fn record_tuple(&mut self, tuple: &Tuple) {
38        self.row_count += 1;
39        for (column, value) in tuple.schema.columns.iter().zip(tuple.data.iter()) {
40            if let Some(stats) = self.column_stats.get_mut(&column.name) {
41                stats.record_value(value);
42            }
43        }
44    }
45
46    pub fn analyze(table_heap: Arc<TableHeap>) -> QuillSQLResult<Self> {
47        let mut stats = TableStatistics::empty(table_heap.schema.as_ref());
48        let mut iterator = TableIterator::new(table_heap.clone(), ..);
49        while let Some((_rid, meta, tuple)) = iterator.next()? {
50            if meta.is_deleted {
51                continue;
52            }
53            stats.record_tuple(&tuple);
54        }
55        Ok(stats)
56    }
57}
58
59#[derive(Debug, Clone, Default)]
60pub struct ColumnStatistics {
61    pub null_count: u64,
62    pub non_null_count: u64,
63    pub min: Option<ScalarValue>,
64    pub max: Option<ScalarValue>,
65    distinct_sample: HashSet<ScalarValue>,
66    sampled_non_null_count: u64,
67    distinct_estimate: f64,
68}
69
70impl ColumnStatistics {
71    pub fn record_value(&mut self, value: &ScalarValue) {
72        if value.is_null() {
73            self.null_count += 1;
74            return;
75        }
76
77        self.non_null_count += 1;
78        let is_new_min = self.min.as_ref().map_or(true, |min| {
79            matches!(value.partial_cmp(min), Some(Ordering::Less))
80        });
81        if is_new_min {
82            self.min = Some(value.clone());
83        }
84
85        let is_new_max = self.max.as_ref().map_or(true, |max| {
86            matches!(value.partial_cmp(max), Some(Ordering::Greater))
87        });
88        if is_new_max {
89            self.max = Some(value.clone());
90        }
91
92        if self.sampled_non_null_count < DISTINCT_SAMPLE_LIMIT as u64 {
93            self.sampled_non_null_count += 1;
94            self.distinct_sample.insert(value.clone());
95            self.distinct_estimate = self.distinct_sample.len() as f64;
96        } else if self.sampled_non_null_count > 0 {
97            let sample_unique = self.distinct_sample.len() as f64;
98            let sample_size = self.sampled_non_null_count as f64;
99            self.distinct_estimate = (sample_unique / sample_size) * self.non_null_count as f64;
100        } else {
101            self.distinct_estimate = self.non_null_count as f64;
102        }
103    }
104
105    pub fn total_count(&self) -> u64 {
106        self.null_count + self.non_null_count
107    }
108
109    pub fn estimated_distinct_count(&self) -> Option<f64> {
110        if self.non_null_count == 0 {
111            return None;
112        }
113        let estimate = if self.distinct_estimate > 0.0 {
114            self.distinct_estimate
115        } else {
116            self.non_null_count as f64
117        };
118        Some(estimate.max(1.0))
119    }
120
121    pub fn selectivity_for_op(&self, op: BinaryOp, literal: &ScalarValue) -> Option<f64> {
122        match op {
123            BinaryOp::Eq => self
124                .estimated_distinct_count()
125                .map(|distinct| (1.0 / distinct).max(MIN_SELECTIVITY)),
126            BinaryOp::NotEq => self
127                .estimated_distinct_count()
128                .map(|distinct| (1.0 - 1.0 / distinct).clamp(MIN_SELECTIVITY, 1.0)),
129            BinaryOp::Gt | BinaryOp::GtEq | BinaryOp::Lt | BinaryOp::LtEq => {
130                self.range_selectivity(op, literal)
131            }
132            _ => None,
133        }
134    }
135
136    fn range_selectivity(&self, op: BinaryOp, literal: &ScalarValue) -> Option<f64> {
137        let min = self.min.as_ref()?;
138        let max = self.max.as_ref()?;
139        let literal_cast = literal.cast_to(&min.data_type()).ok()?;
140
141        let min_f = scalar_to_f64(min)?;
142        let max_f = scalar_to_f64(max)?;
143        let literal_f = scalar_to_f64(&literal_cast)?;
144
145        if max_f <= min_f {
146            return Some(0.5);
147        }
148        let fraction = ((literal_f - min_f) / (max_f - min_f)).clamp(0.0, 1.0);
149        let sel = match op {
150            BinaryOp::Gt => 1.0 - fraction,
151            BinaryOp::GtEq => 1.0 - fraction,
152            BinaryOp::Lt => fraction,
153            BinaryOp::LtEq => fraction,
154            _ => return None,
155        };
156        Some(sel.clamp(MIN_SELECTIVITY, 1.0))
157    }
158}
159
160fn scalar_to_f64(value: &ScalarValue) -> Option<f64> {
161    match value {
162        ScalarValue::Int8(Some(v)) => Some(*v as f64),
163        ScalarValue::Int16(Some(v)) => Some(*v as f64),
164        ScalarValue::Int32(Some(v)) => Some(*v as f64),
165        ScalarValue::Int64(Some(v)) => Some(*v as f64),
166        ScalarValue::UInt8(Some(v)) => Some(*v as f64),
167        ScalarValue::UInt16(Some(v)) => Some(*v as f64),
168        ScalarValue::UInt32(Some(v)) => Some(*v as f64),
169        ScalarValue::UInt64(Some(v)) => Some(*v as f64),
170        ScalarValue::Float32(Some(v)) => Some(*v as f64),
171        ScalarValue::Float64(Some(v)) => Some(*v),
172        _ => None,
173    }
174}
175
176#[cfg(test)]
177mod tests {
178    use super::*;
179
180    #[test]
181    fn eq_selectivity_reflects_distinct_count() {
182        let mut stats = ColumnStatistics::default();
183        stats.record_value(&ScalarValue::Int32(Some(1)));
184        stats.record_value(&ScalarValue::Int32(Some(2)));
185        stats.record_value(&ScalarValue::Int32(Some(1)));
186
187        let sel = stats
188            .selectivity_for_op(BinaryOp::Eq, &ScalarValue::Int32(Some(1)))
189            .unwrap();
190        assert!(sel > 0.4 && sel < 0.6);
191    }
192
193    #[test]
194    fn range_selectivity_uses_min_max() {
195        let mut stats = ColumnStatistics::default();
196        for v in 0..=10 {
197            stats.record_value(&ScalarValue::Int32(Some(v)));
198        }
199        let gt_sel = stats
200            .selectivity_for_op(BinaryOp::Gt, &ScalarValue::Int32(Some(5)))
201            .unwrap();
202        assert!(gt_sel > 0.4 && gt_sel < 0.6);
203    }
204}