vibesql_storage/statistics/
histogram.rs

1//! Histogram-based selectivity estimation for non-uniform distributions
2//!
3//! Histograms provide more accurate selectivity estimates than assuming
4//! uniform distribution, especially for skewed data.
5
6use vibesql_types::SqlValue;
7
8/// Histogram for tracking value distribution
9#[derive(Debug, Clone)]
10pub struct Histogram {
11    /// Histogram buckets
12    pub buckets: Vec<HistogramBucket>,
13
14    /// Strategy used to create buckets
15    pub bucket_strategy: BucketStrategy,
16
17    /// Total number of rows represented
18    pub total_rows: usize,
19}
20
21/// A single bucket in the histogram
22#[derive(Debug, Clone)]
23pub struct HistogramBucket {
24    /// Lower bound of this bucket (inclusive)
25    pub lower_bound: SqlValue,
26
27    /// Upper bound of this bucket (inclusive)
28    pub upper_bound: SqlValue,
29
30    /// Fraction of rows in this bucket (0.0 to 1.0)
31    pub frequency: f64,
32
33    /// Number of distinct values within this bucket
34    pub distinct_count: usize,
35}
36
37/// Strategy for dividing data into histogram buckets
38#[derive(Debug, Clone, PartialEq)]
39pub enum BucketStrategy {
40    /// Equal-width buckets: each bucket covers same value range
41    /// Good for uniformly distributed data
42    EqualWidth,
43
44    /// Equal-depth buckets: each bucket contains same number of rows
45    /// Good for skewed data
46    EqualDepth,
47
48    /// Hybrid: mix of equal-width and equal-depth
49    /// Balances accuracy for both uniform and skewed data
50    Hybrid,
51}
52
53impl Histogram {
54    /// Create a histogram from sample data
55    ///
56    /// # Arguments
57    /// * `values` - Sample values (should not include NULLs)
58    /// * `num_buckets` - Number of buckets to create (default: 100)
59    /// * `strategy` - Bucketing strategy to use
60    pub fn create(values: &[SqlValue], num_buckets: usize, strategy: BucketStrategy) -> Self {
61        if values.is_empty() {
62            return Self { buckets: Vec::new(), bucket_strategy: strategy, total_rows: 0 };
63        }
64
65        let buckets = match strategy {
66            BucketStrategy::EqualWidth => Self::create_equal_width_buckets(values, num_buckets),
67            BucketStrategy::EqualDepth => Self::create_equal_depth_buckets(values, num_buckets),
68            BucketStrategy::Hybrid => Self::create_hybrid_buckets(values, num_buckets),
69        };
70
71        Self { buckets, bucket_strategy: strategy, total_rows: values.len() }
72    }
73
74    /// Create equal-width buckets
75    fn create_equal_width_buckets(values: &[SqlValue], num_buckets: usize) -> Vec<HistogramBucket> {
76        if values.is_empty() || num_buckets == 0 {
77            return Vec::new();
78        }
79
80        let mut sorted_values = values.to_vec();
81        sorted_values.sort();
82
83        let min_val = &sorted_values[0];
84        let max_val = &sorted_values[sorted_values.len() - 1];
85
86        // Can't create equal-width buckets if all values are the same
87        if min_val == max_val {
88            return vec![HistogramBucket {
89                lower_bound: min_val.clone(),
90                upper_bound: max_val.clone(),
91                frequency: 1.0,
92                distinct_count: 1,
93            }];
94        }
95
96        // For numeric types, we can calculate bucket boundaries
97        // For now, we'll use a simplified approach with equal-depth as fallback
98        Self::create_equal_depth_buckets(values, num_buckets)
99    }
100
101    /// Create equal-depth buckets (each bucket has roughly equal number of rows)
102    fn create_equal_depth_buckets(values: &[SqlValue], num_buckets: usize) -> Vec<HistogramBucket> {
103        if values.is_empty() || num_buckets == 0 {
104            return Vec::new();
105        }
106
107        let mut sorted_values = values.to_vec();
108        sorted_values.sort();
109
110        // Special case: if all values are the same, create a single bucket
111        if sorted_values[0] == sorted_values[sorted_values.len() - 1] {
112            return vec![HistogramBucket {
113                lower_bound: sorted_values[0].clone(),
114                upper_bound: sorted_values[0].clone(),
115                frequency: 1.0,
116                distinct_count: 1,
117            }];
118        }
119
120        let target_bucket_size = (values.len() as f64 / num_buckets as f64).ceil() as usize;
121        let mut buckets = Vec::new();
122
123        let mut i = 0;
124        while i < sorted_values.len() {
125            let start_idx = i;
126            let end_idx = (i + target_bucket_size).min(sorted_values.len());
127
128            let lower_bound = sorted_values[start_idx].clone();
129            let upper_bound = sorted_values[end_idx - 1].clone();
130
131            // Count distinct values in this range
132            let mut distinct_values = std::collections::HashSet::new();
133            for value in &sorted_values[start_idx..end_idx] {
134                distinct_values.insert(value.clone());
135            }
136
137            let bucket_rows = end_idx - start_idx;
138            let frequency = bucket_rows as f64 / values.len() as f64;
139
140            buckets.push(HistogramBucket {
141                lower_bound,
142                upper_bound,
143                frequency,
144                distinct_count: distinct_values.len(),
145            });
146
147            i = end_idx;
148        }
149
150        buckets
151    }
152
153    /// Create hybrid buckets (combination of equal-width and equal-depth)
154    fn create_hybrid_buckets(values: &[SqlValue], num_buckets: usize) -> Vec<HistogramBucket> {
155        // For now, use equal-depth as it works better for skewed data
156        // In the future, we could detect skew and choose strategy accordingly
157        Self::create_equal_depth_buckets(values, num_buckets)
158    }
159
160    /// Estimate selectivity for equality predicate: col = value
161    pub fn estimate_equality_selectivity(&self, value: &SqlValue) -> f64 {
162        if self.buckets.is_empty() {
163            return 0.0;
164        }
165
166        // Find the bucket containing this value
167        for bucket in &self.buckets {
168            if value >= &bucket.lower_bound && value <= &bucket.upper_bound {
169                // Assume uniform distribution within the bucket
170                if bucket.distinct_count > 0 {
171                    return bucket.frequency / bucket.distinct_count as f64;
172                }
173            }
174        }
175
176        // Value not in any bucket
177        0.0
178    }
179
180    /// Estimate selectivity for range predicate: col > value, col >= value, col < value, col <=
181    /// value
182    pub fn estimate_range_selectivity(&self, operator: &str, value: &SqlValue) -> f64 {
183        if self.buckets.is_empty() {
184            return 0.0;
185        }
186
187        let mut selectivity = 0.0;
188
189        match operator {
190            ">" | ">=" => {
191                // Sum frequency of all buckets above the value
192                for bucket in &self.buckets {
193                    if &bucket.lower_bound > value {
194                        // Entire bucket is above the value
195                        selectivity += bucket.frequency;
196                    } else if &bucket.upper_bound >= value {
197                        // Value is within this bucket, estimate partial selectivity
198                        // Assume uniform distribution within bucket
199                        let partial = if operator == ">" {
200                            // Exclude the equality case
201                            bucket.frequency * 0.5 // Rough estimate
202                        } else {
203                            // Include the equality case
204                            bucket.frequency * 0.5 + self.estimate_equality_selectivity(value)
205                        };
206                        selectivity += partial;
207                    }
208                }
209            }
210            "<" | "<=" => {
211                // Sum frequency of all buckets below the value
212                for bucket in &self.buckets {
213                    if &bucket.upper_bound < value {
214                        // Entire bucket is below the value
215                        selectivity += bucket.frequency;
216                    } else if &bucket.lower_bound <= value {
217                        // Value is within this bucket, estimate partial selectivity
218                        let partial = if operator == "<" {
219                            // Exclude the equality case
220                            bucket.frequency * 0.5
221                        } else {
222                            // Include the equality case
223                            bucket.frequency * 0.5 + self.estimate_equality_selectivity(value)
224                        };
225                        selectivity += partial;
226                    }
227                }
228            }
229            _ => return 0.33, // Default fallback
230        }
231
232        selectivity.clamp(0.0, 1.0)
233    }
234
235    /// Estimate selectivity for BETWEEN predicate: col BETWEEN lower AND upper
236    pub fn estimate_between_selectivity(&self, lower: &SqlValue, upper: &SqlValue) -> f64 {
237        if self.buckets.is_empty() {
238            return 0.0;
239        }
240
241        let mut selectivity = 0.0;
242
243        for bucket in &self.buckets {
244            // Check if bucket overlaps with [lower, upper] range
245            if &bucket.upper_bound < lower || &bucket.lower_bound > upper {
246                // No overlap
247                continue;
248            }
249
250            if &bucket.lower_bound >= lower && &bucket.upper_bound <= upper {
251                // Bucket is fully contained in range
252                selectivity += bucket.frequency;
253            } else {
254                // Partial overlap - estimate fraction
255                // This is a simplified estimate; more sophisticated logic
256                // would calculate exact overlap based on value distribution
257                selectivity += bucket.frequency * 0.5;
258            }
259        }
260
261        selectivity.clamp(0.0, 1.0)
262    }
263
264    /// Get the number of buckets
265    pub fn bucket_count(&self) -> usize {
266        self.buckets.len()
267    }
268
269    /// Check if histogram is empty
270    pub fn is_empty(&self) -> bool {
271        self.buckets.is_empty()
272    }
273}
274
275#[cfg(test)]
276mod tests {
277    use vibesql_types::SqlValue;
278
279    use super::*;
280
281    #[test]
282    fn test_histogram_equal_depth() {
283        // Create skewed data: many 1s, some 2s, few 3s
284        let mut values = Vec::new();
285        for _ in 0..70 {
286            values.push(SqlValue::Integer(1));
287        }
288        for _ in 0..20 {
289            values.push(SqlValue::Integer(2));
290        }
291        for _ in 0..10 {
292            values.push(SqlValue::Integer(3));
293        }
294
295        let histogram = Histogram::create(&values, 3, BucketStrategy::EqualDepth);
296
297        assert_eq!(histogram.bucket_count(), 3);
298        assert_eq!(histogram.total_rows, 100);
299
300        // Each bucket should have roughly equal frequency (33.3%)
301        for bucket in &histogram.buckets {
302            assert!((bucket.frequency - 0.33).abs() < 0.1);
303        }
304    }
305
306    #[test]
307    fn test_histogram_equality_selectivity() {
308        let values = vec![
309            SqlValue::Integer(1),
310            SqlValue::Integer(1),
311            SqlValue::Integer(1),
312            SqlValue::Integer(2),
313            SqlValue::Integer(3),
314        ];
315
316        let histogram = Histogram::create(&values, 2, BucketStrategy::EqualDepth);
317
318        // Estimate selectivity for value 1 (appears 3 times out of 5)
319        let sel = histogram.estimate_equality_selectivity(&SqlValue::Integer(1));
320
321        // Should be roughly 60% (3/5), but depends on bucketing
322        assert!(sel > 0.0);
323        assert!(sel <= 1.0);
324    }
325
326    #[test]
327    fn test_histogram_range_selectivity() {
328        let values: Vec<SqlValue> = (1..=100).map(SqlValue::Integer).collect();
329
330        let histogram = Histogram::create(&values, 10, BucketStrategy::EqualDepth);
331
332        // Estimate selectivity for values > 50
333        let sel = histogram.estimate_range_selectivity(">", &SqlValue::Integer(50));
334
335        // Should be roughly 50%
336        assert!((sel - 0.5).abs() < 0.2);
337    }
338
339    #[test]
340    fn test_histogram_between_selectivity() {
341        let values: Vec<SqlValue> = (1..=100).map(SqlValue::Integer).collect();
342
343        let histogram = Histogram::create(&values, 10, BucketStrategy::EqualDepth);
344
345        // Estimate selectivity for BETWEEN 25 AND 75
346        let sel =
347            histogram.estimate_between_selectivity(&SqlValue::Integer(25), &SqlValue::Integer(75));
348
349        // Should be roughly 50% (50 values out of 100)
350        assert!((sel - 0.5).abs() < 0.3);
351    }
352
353    #[test]
354    fn test_empty_histogram() {
355        let histogram = Histogram::create(&[], 10, BucketStrategy::EqualDepth);
356
357        assert!(histogram.is_empty());
358        assert_eq!(histogram.bucket_count(), 0);
359        assert_eq!(histogram.estimate_equality_selectivity(&SqlValue::Integer(1)), 0.0);
360    }
361
362    #[test]
363    fn test_single_value_histogram() {
364        let values = vec![SqlValue::Integer(42); 100];
365
366        let histogram = Histogram::create(&values, 10, BucketStrategy::EqualDepth);
367
368        assert_eq!(histogram.bucket_count(), 1);
369        assert_eq!(histogram.buckets[0].distinct_count, 1);
370        assert_eq!(histogram.buckets[0].frequency, 1.0);
371    }
372}