vibesql_storage/statistics/
histogram.rs1use vibesql_types::SqlValue;
7
8#[derive(Debug, Clone)]
10pub struct Histogram {
11 pub buckets: Vec<HistogramBucket>,
13
14 pub bucket_strategy: BucketStrategy,
16
17 pub total_rows: usize,
19}
20
21#[derive(Debug, Clone)]
23pub struct HistogramBucket {
24 pub lower_bound: SqlValue,
26
27 pub upper_bound: SqlValue,
29
30 pub frequency: f64,
32
33 pub distinct_count: usize,
35}
36
37#[derive(Debug, Clone, PartialEq)]
39pub enum BucketStrategy {
40 EqualWidth,
43
44 EqualDepth,
47
48 Hybrid,
51}
52
53impl Histogram {
54 pub fn create(values: &[SqlValue], num_buckets: usize, strategy: BucketStrategy) -> Self {
61 if values.is_empty() {
62 return Self { buckets: Vec::new(), bucket_strategy: strategy, total_rows: 0 };
63 }
64
65 let buckets = match strategy {
66 BucketStrategy::EqualWidth => Self::create_equal_width_buckets(values, num_buckets),
67 BucketStrategy::EqualDepth => Self::create_equal_depth_buckets(values, num_buckets),
68 BucketStrategy::Hybrid => Self::create_hybrid_buckets(values, num_buckets),
69 };
70
71 Self { buckets, bucket_strategy: strategy, total_rows: values.len() }
72 }
73
74 fn create_equal_width_buckets(values: &[SqlValue], num_buckets: usize) -> Vec<HistogramBucket> {
76 if values.is_empty() || num_buckets == 0 {
77 return Vec::new();
78 }
79
80 let mut sorted_values = values.to_vec();
81 sorted_values.sort();
82
83 let min_val = &sorted_values[0];
84 let max_val = &sorted_values[sorted_values.len() - 1];
85
86 if min_val == max_val {
88 return vec![HistogramBucket {
89 lower_bound: min_val.clone(),
90 upper_bound: max_val.clone(),
91 frequency: 1.0,
92 distinct_count: 1,
93 }];
94 }
95
96 Self::create_equal_depth_buckets(values, num_buckets)
99 }
100
101 fn create_equal_depth_buckets(values: &[SqlValue], num_buckets: usize) -> Vec<HistogramBucket> {
103 if values.is_empty() || num_buckets == 0 {
104 return Vec::new();
105 }
106
107 let mut sorted_values = values.to_vec();
108 sorted_values.sort();
109
110 if sorted_values[0] == sorted_values[sorted_values.len() - 1] {
112 return vec![HistogramBucket {
113 lower_bound: sorted_values[0].clone(),
114 upper_bound: sorted_values[0].clone(),
115 frequency: 1.0,
116 distinct_count: 1,
117 }];
118 }
119
120 let target_bucket_size = (values.len() as f64 / num_buckets as f64).ceil() as usize;
121 let mut buckets = Vec::new();
122
123 let mut i = 0;
124 while i < sorted_values.len() {
125 let start_idx = i;
126 let end_idx = (i + target_bucket_size).min(sorted_values.len());
127
128 let lower_bound = sorted_values[start_idx].clone();
129 let upper_bound = sorted_values[end_idx - 1].clone();
130
131 let mut distinct_values = std::collections::HashSet::new();
133 for value in &sorted_values[start_idx..end_idx] {
134 distinct_values.insert(value.clone());
135 }
136
137 let bucket_rows = end_idx - start_idx;
138 let frequency = bucket_rows as f64 / values.len() as f64;
139
140 buckets.push(HistogramBucket {
141 lower_bound,
142 upper_bound,
143 frequency,
144 distinct_count: distinct_values.len(),
145 });
146
147 i = end_idx;
148 }
149
150 buckets
151 }
152
153 fn create_hybrid_buckets(values: &[SqlValue], num_buckets: usize) -> Vec<HistogramBucket> {
155 Self::create_equal_depth_buckets(values, num_buckets)
158 }
159
160 pub fn estimate_equality_selectivity(&self, value: &SqlValue) -> f64 {
162 if self.buckets.is_empty() {
163 return 0.0;
164 }
165
166 for bucket in &self.buckets {
168 if value >= &bucket.lower_bound && value <= &bucket.upper_bound {
169 if bucket.distinct_count > 0 {
171 return bucket.frequency / bucket.distinct_count as f64;
172 }
173 }
174 }
175
176 0.0
178 }
179
180 pub fn estimate_range_selectivity(&self, operator: &str, value: &SqlValue) -> f64 {
183 if self.buckets.is_empty() {
184 return 0.0;
185 }
186
187 let mut selectivity = 0.0;
188
189 match operator {
190 ">" | ">=" => {
191 for bucket in &self.buckets {
193 if &bucket.lower_bound > value {
194 selectivity += bucket.frequency;
196 } else if &bucket.upper_bound >= value {
197 let partial = if operator == ">" {
200 bucket.frequency * 0.5 } else {
203 bucket.frequency * 0.5 + self.estimate_equality_selectivity(value)
205 };
206 selectivity += partial;
207 }
208 }
209 }
210 "<" | "<=" => {
211 for bucket in &self.buckets {
213 if &bucket.upper_bound < value {
214 selectivity += bucket.frequency;
216 } else if &bucket.lower_bound <= value {
217 let partial = if operator == "<" {
219 bucket.frequency * 0.5
221 } else {
222 bucket.frequency * 0.5 + self.estimate_equality_selectivity(value)
224 };
225 selectivity += partial;
226 }
227 }
228 }
229 _ => return 0.33, }
231
232 selectivity.clamp(0.0, 1.0)
233 }
234
235 pub fn estimate_between_selectivity(&self, lower: &SqlValue, upper: &SqlValue) -> f64 {
237 if self.buckets.is_empty() {
238 return 0.0;
239 }
240
241 let mut selectivity = 0.0;
242
243 for bucket in &self.buckets {
244 if &bucket.upper_bound < lower || &bucket.lower_bound > upper {
246 continue;
248 }
249
250 if &bucket.lower_bound >= lower && &bucket.upper_bound <= upper {
251 selectivity += bucket.frequency;
253 } else {
254 selectivity += bucket.frequency * 0.5;
258 }
259 }
260
261 selectivity.clamp(0.0, 1.0)
262 }
263
264 pub fn bucket_count(&self) -> usize {
266 self.buckets.len()
267 }
268
269 pub fn is_empty(&self) -> bool {
271 self.buckets.is_empty()
272 }
273}
274
275#[cfg(test)]
276mod tests {
277 use vibesql_types::SqlValue;
278
279 use super::*;
280
281 #[test]
282 fn test_histogram_equal_depth() {
283 let mut values = Vec::new();
285 for _ in 0..70 {
286 values.push(SqlValue::Integer(1));
287 }
288 for _ in 0..20 {
289 values.push(SqlValue::Integer(2));
290 }
291 for _ in 0..10 {
292 values.push(SqlValue::Integer(3));
293 }
294
295 let histogram = Histogram::create(&values, 3, BucketStrategy::EqualDepth);
296
297 assert_eq!(histogram.bucket_count(), 3);
298 assert_eq!(histogram.total_rows, 100);
299
300 for bucket in &histogram.buckets {
302 assert!((bucket.frequency - 0.33).abs() < 0.1);
303 }
304 }
305
306 #[test]
307 fn test_histogram_equality_selectivity() {
308 let values = vec![
309 SqlValue::Integer(1),
310 SqlValue::Integer(1),
311 SqlValue::Integer(1),
312 SqlValue::Integer(2),
313 SqlValue::Integer(3),
314 ];
315
316 let histogram = Histogram::create(&values, 2, BucketStrategy::EqualDepth);
317
318 let sel = histogram.estimate_equality_selectivity(&SqlValue::Integer(1));
320
321 assert!(sel > 0.0);
323 assert!(sel <= 1.0);
324 }
325
326 #[test]
327 fn test_histogram_range_selectivity() {
328 let values: Vec<SqlValue> = (1..=100).map(SqlValue::Integer).collect();
329
330 let histogram = Histogram::create(&values, 10, BucketStrategy::EqualDepth);
331
332 let sel = histogram.estimate_range_selectivity(">", &SqlValue::Integer(50));
334
335 assert!((sel - 0.5).abs() < 0.2);
337 }
338
339 #[test]
340 fn test_histogram_between_selectivity() {
341 let values: Vec<SqlValue> = (1..=100).map(SqlValue::Integer).collect();
342
343 let histogram = Histogram::create(&values, 10, BucketStrategy::EqualDepth);
344
345 let sel =
347 histogram.estimate_between_selectivity(&SqlValue::Integer(25), &SqlValue::Integer(75));
348
349 assert!((sel - 0.5).abs() < 0.3);
351 }
352
353 #[test]
354 fn test_empty_histogram() {
355 let histogram = Histogram::create(&[], 10, BucketStrategy::EqualDepth);
356
357 assert!(histogram.is_empty());
358 assert_eq!(histogram.bucket_count(), 0);
359 assert_eq!(histogram.estimate_equality_selectivity(&SqlValue::Integer(1)), 0.0);
360 }
361
362 #[test]
363 fn test_single_value_histogram() {
364 let values = vec![SqlValue::Integer(42); 100];
365
366 let histogram = Histogram::create(&values, 10, BucketStrategy::EqualDepth);
367
368 assert_eq!(histogram.bucket_count(), 1);
369 assert_eq!(histogram.buckets[0].distinct_count, 1);
370 assert_eq!(histogram.buckets[0].frequency, 1.0);
371 }
372}