quill_sql/catalog/
stats.rs1use std::cmp::Ordering;
2use std::collections::{HashMap, HashSet};
3use std::sync::Arc;
4
5use crate::catalog::Schema;
6use crate::error::QuillSQLResult;
7use crate::expression::BinaryOp;
8use crate::storage::table_heap::{TableHeap, TableIterator};
9use crate::storage::tuple::Tuple;
10use crate::utils::scalar::ScalarValue;
11
12const DISTINCT_SAMPLE_LIMIT: usize = 1024;
13const MIN_SELECTIVITY: f64 = 0.01;
14
15#[derive(Debug, Clone)]
17pub struct TableStatistics {
18 pub row_count: u64,
20 pub column_stats: HashMap<String, ColumnStatistics>,
22}
23
24impl TableStatistics {
25 pub fn empty(schema: &Schema) -> Self {
26 let column_stats = schema
27 .columns
28 .iter()
29 .map(|col| (col.name.clone(), ColumnStatistics::default()))
30 .collect();
31 Self {
32 row_count: 0,
33 column_stats,
34 }
35 }
36
37 pub fn record_tuple(&mut self, tuple: &Tuple) {
38 self.row_count += 1;
39 for (column, value) in tuple.schema.columns.iter().zip(tuple.data.iter()) {
40 if let Some(stats) = self.column_stats.get_mut(&column.name) {
41 stats.record_value(value);
42 }
43 }
44 }
45
46 pub fn analyze(table_heap: Arc<TableHeap>) -> QuillSQLResult<Self> {
47 let mut stats = TableStatistics::empty(table_heap.schema.as_ref());
48 let mut iterator = TableIterator::new(table_heap.clone(), ..);
49 while let Some((_rid, meta, tuple)) = iterator.next()? {
50 if meta.is_deleted {
51 continue;
52 }
53 stats.record_tuple(&tuple);
54 }
55 Ok(stats)
56 }
57}
58
59#[derive(Debug, Clone, Default)]
60pub struct ColumnStatistics {
61 pub null_count: u64,
62 pub non_null_count: u64,
63 pub min: Option<ScalarValue>,
64 pub max: Option<ScalarValue>,
65 distinct_sample: HashSet<ScalarValue>,
66 sampled_non_null_count: u64,
67 distinct_estimate: f64,
68}
69
70impl ColumnStatistics {
71 pub fn record_value(&mut self, value: &ScalarValue) {
72 if value.is_null() {
73 self.null_count += 1;
74 return;
75 }
76
77 self.non_null_count += 1;
78 let is_new_min = self.min.as_ref().map_or(true, |min| {
79 matches!(value.partial_cmp(min), Some(Ordering::Less))
80 });
81 if is_new_min {
82 self.min = Some(value.clone());
83 }
84
85 let is_new_max = self.max.as_ref().map_or(true, |max| {
86 matches!(value.partial_cmp(max), Some(Ordering::Greater))
87 });
88 if is_new_max {
89 self.max = Some(value.clone());
90 }
91
92 if self.sampled_non_null_count < DISTINCT_SAMPLE_LIMIT as u64 {
93 self.sampled_non_null_count += 1;
94 self.distinct_sample.insert(value.clone());
95 self.distinct_estimate = self.distinct_sample.len() as f64;
96 } else if self.sampled_non_null_count > 0 {
97 let sample_unique = self.distinct_sample.len() as f64;
98 let sample_size = self.sampled_non_null_count as f64;
99 self.distinct_estimate = (sample_unique / sample_size) * self.non_null_count as f64;
100 } else {
101 self.distinct_estimate = self.non_null_count as f64;
102 }
103 }
104
105 pub fn total_count(&self) -> u64 {
106 self.null_count + self.non_null_count
107 }
108
109 pub fn estimated_distinct_count(&self) -> Option<f64> {
110 if self.non_null_count == 0 {
111 return None;
112 }
113 let estimate = if self.distinct_estimate > 0.0 {
114 self.distinct_estimate
115 } else {
116 self.non_null_count as f64
117 };
118 Some(estimate.max(1.0))
119 }
120
121 pub fn selectivity_for_op(&self, op: BinaryOp, literal: &ScalarValue) -> Option<f64> {
122 match op {
123 BinaryOp::Eq => self
124 .estimated_distinct_count()
125 .map(|distinct| (1.0 / distinct).max(MIN_SELECTIVITY)),
126 BinaryOp::NotEq => self
127 .estimated_distinct_count()
128 .map(|distinct| (1.0 - 1.0 / distinct).clamp(MIN_SELECTIVITY, 1.0)),
129 BinaryOp::Gt | BinaryOp::GtEq | BinaryOp::Lt | BinaryOp::LtEq => {
130 self.range_selectivity(op, literal)
131 }
132 _ => None,
133 }
134 }
135
136 fn range_selectivity(&self, op: BinaryOp, literal: &ScalarValue) -> Option<f64> {
137 let min = self.min.as_ref()?;
138 let max = self.max.as_ref()?;
139 let literal_cast = literal.cast_to(&min.data_type()).ok()?;
140
141 let min_f = scalar_to_f64(min)?;
142 let max_f = scalar_to_f64(max)?;
143 let literal_f = scalar_to_f64(&literal_cast)?;
144
145 if max_f <= min_f {
146 return Some(0.5);
147 }
148 let fraction = ((literal_f - min_f) / (max_f - min_f)).clamp(0.0, 1.0);
149 let sel = match op {
150 BinaryOp::Gt => 1.0 - fraction,
151 BinaryOp::GtEq => 1.0 - fraction,
152 BinaryOp::Lt => fraction,
153 BinaryOp::LtEq => fraction,
154 _ => return None,
155 };
156 Some(sel.clamp(MIN_SELECTIVITY, 1.0))
157 }
158}
159
160fn scalar_to_f64(value: &ScalarValue) -> Option<f64> {
161 match value {
162 ScalarValue::Int8(Some(v)) => Some(*v as f64),
163 ScalarValue::Int16(Some(v)) => Some(*v as f64),
164 ScalarValue::Int32(Some(v)) => Some(*v as f64),
165 ScalarValue::Int64(Some(v)) => Some(*v as f64),
166 ScalarValue::UInt8(Some(v)) => Some(*v as f64),
167 ScalarValue::UInt16(Some(v)) => Some(*v as f64),
168 ScalarValue::UInt32(Some(v)) => Some(*v as f64),
169 ScalarValue::UInt64(Some(v)) => Some(*v as f64),
170 ScalarValue::Float32(Some(v)) => Some(*v as f64),
171 ScalarValue::Float64(Some(v)) => Some(*v),
172 _ => None,
173 }
174}
175
176#[cfg(test)]
177mod tests {
178 use super::*;
179
180 #[test]
181 fn eq_selectivity_reflects_distinct_count() {
182 let mut stats = ColumnStatistics::default();
183 stats.record_value(&ScalarValue::Int32(Some(1)));
184 stats.record_value(&ScalarValue::Int32(Some(2)));
185 stats.record_value(&ScalarValue::Int32(Some(1)));
186
187 let sel = stats
188 .selectivity_for_op(BinaryOp::Eq, &ScalarValue::Int32(Some(1)))
189 .unwrap();
190 assert!(sel > 0.4 && sel < 0.6);
191 }
192
193 #[test]
194 fn range_selectivity_uses_min_max() {
195 let mut stats = ColumnStatistics::default();
196 for v in 0..=10 {
197 stats.record_value(&ScalarValue::Int32(Some(v)));
198 }
199 let gt_sel = stats
200 .selectivity_for_op(BinaryOp::Gt, &ScalarValue::Int32(Some(5)))
201 .unwrap();
202 assert!(gt_sel > 0.4 && gt_sel < 0.6);
203 }
204}