quill_sql/catalog/
stats.rs1use std::cmp::Ordering;
2use std::collections::{HashMap, HashSet};
3
4use crate::catalog::Schema;
5use crate::expression::BinaryOp;
6use crate::storage::tuple::Tuple;
7use crate::utils::scalar::ScalarValue;
8
9const DISTINCT_SAMPLE_LIMIT: usize = 1024;
10const MIN_SELECTIVITY: f64 = 0.01;
11
12#[derive(Debug, Clone)]
14pub struct TableStatistics {
15 pub row_count: u64,
17 pub column_stats: HashMap<String, ColumnStatistics>,
19}
20
21impl TableStatistics {
22 pub fn empty(schema: &Schema) -> Self {
23 let column_stats = schema
24 .columns
25 .iter()
26 .map(|col| (col.name.clone(), ColumnStatistics::default()))
27 .collect();
28 Self {
29 row_count: 0,
30 column_stats,
31 }
32 }
33
34 pub fn record_tuple(&mut self, tuple: &Tuple) {
35 self.row_count += 1;
36 for (column, value) in tuple.schema.columns.iter().zip(tuple.data.iter()) {
37 if let Some(stats) = self.column_stats.get_mut(&column.name) {
38 stats.record_value(value);
39 }
40 }
41 }
42}
43
44#[derive(Debug, Clone, Default)]
45pub struct ColumnStatistics {
46 pub null_count: u64,
47 pub non_null_count: u64,
48 pub min: Option<ScalarValue>,
49 pub max: Option<ScalarValue>,
50 distinct_sample: HashSet<ScalarValue>,
51 sampled_non_null_count: u64,
52 distinct_estimate: f64,
53}
54
55impl ColumnStatistics {
56 pub fn record_value(&mut self, value: &ScalarValue) {
57 if value.is_null() {
58 self.null_count += 1;
59 return;
60 }
61
62 self.non_null_count += 1;
63 let is_new_min = self
64 .min
65 .as_ref()
66 .is_none_or(|min| matches!(value.partial_cmp(min), Some(Ordering::Less)));
67 if is_new_min {
68 self.min = Some(value.clone());
69 }
70
71 let is_new_max = self
72 .max
73 .as_ref()
74 .is_none_or(|max| matches!(value.partial_cmp(max), Some(Ordering::Greater)));
75 if is_new_max {
76 self.max = Some(value.clone());
77 }
78
79 if self.sampled_non_null_count < DISTINCT_SAMPLE_LIMIT as u64 {
80 self.sampled_non_null_count += 1;
81 self.distinct_sample.insert(value.clone());
82 self.distinct_estimate = self.distinct_sample.len() as f64;
83 } else if self.sampled_non_null_count > 0 {
84 let sample_unique = self.distinct_sample.len() as f64;
85 let sample_size = self.sampled_non_null_count as f64;
86 self.distinct_estimate = (sample_unique / sample_size) * self.non_null_count as f64;
87 } else {
88 self.distinct_estimate = self.non_null_count as f64;
89 }
90 }
91
92 pub fn total_count(&self) -> u64 {
93 self.null_count + self.non_null_count
94 }
95
96 pub fn estimated_distinct_count(&self) -> Option<f64> {
97 if self.non_null_count == 0 {
98 return None;
99 }
100 let estimate = if self.distinct_estimate > 0.0 {
101 self.distinct_estimate
102 } else {
103 self.non_null_count as f64
104 };
105 Some(estimate.max(1.0))
106 }
107
108 pub fn selectivity_for_op(&self, op: BinaryOp, literal: &ScalarValue) -> Option<f64> {
109 match op {
110 BinaryOp::Eq => self
111 .estimated_distinct_count()
112 .map(|distinct| (1.0 / distinct).max(MIN_SELECTIVITY)),
113 BinaryOp::NotEq => self
114 .estimated_distinct_count()
115 .map(|distinct| (1.0 - 1.0 / distinct).clamp(MIN_SELECTIVITY, 1.0)),
116 BinaryOp::Gt | BinaryOp::GtEq | BinaryOp::Lt | BinaryOp::LtEq => {
117 self.range_selectivity(op, literal)
118 }
119 _ => None,
120 }
121 }
122
123 fn range_selectivity(&self, op: BinaryOp, literal: &ScalarValue) -> Option<f64> {
124 let min = self.min.as_ref()?;
125 let max = self.max.as_ref()?;
126 let literal_cast = literal.cast_to(&min.data_type()).ok()?;
127
128 let min_f = scalar_to_f64(min)?;
129 let max_f = scalar_to_f64(max)?;
130 let literal_f = scalar_to_f64(&literal_cast)?;
131
132 if max_f <= min_f {
133 return Some(0.5);
134 }
135 let fraction = ((literal_f - min_f) / (max_f - min_f)).clamp(0.0, 1.0);
136 let sel = match op {
137 BinaryOp::Gt => 1.0 - fraction,
138 BinaryOp::GtEq => 1.0 - fraction,
139 BinaryOp::Lt => fraction,
140 BinaryOp::LtEq => fraction,
141 _ => return None,
142 };
143 Some(sel.clamp(MIN_SELECTIVITY, 1.0))
144 }
145}
146
147fn scalar_to_f64(value: &ScalarValue) -> Option<f64> {
148 match value {
149 ScalarValue::Int8(Some(v)) => Some(*v as f64),
150 ScalarValue::Int16(Some(v)) => Some(*v as f64),
151 ScalarValue::Int32(Some(v)) => Some(*v as f64),
152 ScalarValue::Int64(Some(v)) => Some(*v as f64),
153 ScalarValue::UInt8(Some(v)) => Some(*v as f64),
154 ScalarValue::UInt16(Some(v)) => Some(*v as f64),
155 ScalarValue::UInt32(Some(v)) => Some(*v as f64),
156 ScalarValue::UInt64(Some(v)) => Some(*v as f64),
157 ScalarValue::Float32(Some(v)) => Some(*v as f64),
158 ScalarValue::Float64(Some(v)) => Some(*v),
159 _ => None,
160 }
161}
162
163#[cfg(test)]
164mod tests {
165 use super::*;
166
167 #[test]
168 fn eq_selectivity_reflects_distinct_count() {
169 let mut stats = ColumnStatistics::default();
170 stats.record_value(&ScalarValue::Int32(Some(1)));
171 stats.record_value(&ScalarValue::Int32(Some(2)));
172 stats.record_value(&ScalarValue::Int32(Some(1)));
173
174 let sel = stats
175 .selectivity_for_op(BinaryOp::Eq, &ScalarValue::Int32(Some(1)))
176 .unwrap();
177 assert!(sel > 0.4 && sel < 0.6);
178 }
179
180 #[test]
181 fn range_selectivity_uses_min_max() {
182 let mut stats = ColumnStatistics::default();
183 for v in 0..=10 {
184 stats.record_value(&ScalarValue::Int32(Some(v)));
185 }
186 let gt_sel = stats
187 .selectivity_for_op(BinaryOp::Gt, &ScalarValue::Int32(Some(5)))
188 .unwrap();
189 assert!(gt_sel > 0.4 && gt_sel < 0.6);
190 }
191}