Skip to main content

grafeo_core/statistics/
histogram.rs

1//! Equi-depth histograms for understanding value distributions.
2//!
3//! When the optimizer sees `WHERE age > 30`, it needs to know what fraction of
4//! rows match. Histograms split the value range into buckets of roughly equal
5//! row counts, letting us estimate selectivity without scanning the data.
6
7use grafeo_common::types::Value;
8use std::cmp::Ordering;
9
10/// One slice of the value distribution - a range with its row count.
11#[derive(Debug, Clone)]
12pub struct HistogramBucket {
13    /// Lower bound (inclusive).
14    pub lower: Value,
15    /// Upper bound (inclusive).
16    pub upper: Value,
17    /// Number of distinct values in this bucket.
18    pub distinct_count: u64,
19    /// Number of values in this bucket.
20    pub row_count: u64,
21}
22
23impl HistogramBucket {
24    /// Creates a new histogram bucket.
25    pub fn new(lower: Value, upper: Value, distinct_count: u64, row_count: u64) -> Self {
26        Self {
27            lower,
28            upper,
29            distinct_count,
30            row_count,
31        }
32    }
33
34    /// Checks if a value falls within this bucket.
35    pub fn contains(&self, value: &Value) -> bool {
36        compare_values(value, &self.lower) != Some(Ordering::Less)
37            && compare_values(value, &self.upper) != Some(Ordering::Greater)
38    }
39}
40
41/// Divides a column's value range into buckets of roughly equal row counts.
42///
43/// Build one with [`build()`](Self::build) from sorted values, then use
44/// [`estimate_equality_selectivity()`](Self::estimate_equality_selectivity)
45/// or [`estimate_range_selectivity()`](Self::estimate_range_selectivity)
46/// to predict how many rows will match a predicate.
47#[derive(Debug, Clone)]
48pub struct Histogram {
49    /// Histogram buckets, sorted by lower bound.
50    buckets: Vec<HistogramBucket>,
51    /// Total number of rows represented.
52    total_rows: u64,
53    /// Total number of distinct values.
54    total_distinct: u64,
55}
56
57impl Histogram {
58    /// Creates a new histogram with the given buckets.
59    pub fn new(buckets: Vec<HistogramBucket>) -> Self {
60        let total_rows = buckets.iter().map(|b| b.row_count).sum();
61        let total_distinct = buckets.iter().map(|b| b.distinct_count).sum();
62
63        Self {
64            buckets,
65            total_rows,
66            total_distinct,
67        }
68    }
69
70    /// Creates an equi-depth histogram from sorted values.
71    ///
72    /// # Arguments
73    /// * `sorted_values` - Values sorted in ascending order.
74    /// * `num_buckets` - Target number of buckets.
75    pub fn build(sorted_values: &[Value], num_buckets: usize) -> Self {
76        if sorted_values.is_empty() {
77            return Self::new(Vec::new());
78        }
79
80        let num_buckets = num_buckets.max(1).min(sorted_values.len());
81        let rows_per_bucket = sorted_values.len() / num_buckets;
82
83        let mut buckets = Vec::with_capacity(num_buckets);
84        let mut current_start = 0;
85
86        for i in 0..num_buckets {
87            let end = if i == num_buckets - 1 {
88                sorted_values.len()
89            } else {
90                current_start + rows_per_bucket
91            };
92
93            if current_start >= sorted_values.len() {
94                break;
95            }
96
97            let bucket_values = &sorted_values[current_start..end];
98            // Invariant: slice is non-empty because:
99            // - current_start < sorted_values.len() (guard on line 89)
100            // - end > current_start (rows_per_bucket >= 1 or end = len > current_start)
101            let lower = bucket_values
102                .first()
103                .expect("bucket_values is non-empty: current_start < end")
104                .clone();
105            let upper = bucket_values
106                .last()
107                .expect("bucket_values is non-empty: current_start < end")
108                .clone();
109
110            // Count distinct values in bucket
111            let mut distinct = 1u64;
112            for j in 1..bucket_values.len() {
113                if bucket_values[j] != bucket_values[j - 1] {
114                    distinct += 1;
115                }
116            }
117
118            buckets.push(HistogramBucket::new(
119                lower,
120                upper,
121                distinct,
122                bucket_values.len() as u64,
123            ));
124
125            current_start = end;
126        }
127
128        Self::new(buckets)
129    }
130
131    /// Returns the number of buckets.
132    pub fn bucket_count(&self) -> usize {
133        self.buckets.len()
134    }
135
136    /// Returns the buckets.
137    pub fn buckets(&self) -> &[HistogramBucket] {
138        &self.buckets
139    }
140
141    /// Returns the total row count.
142    pub fn total_rows(&self) -> u64 {
143        self.total_rows
144    }
145
146    /// Returns the total distinct count.
147    pub fn total_distinct(&self) -> u64 {
148        self.total_distinct
149    }
150
151    /// Estimates the selectivity of an equality predicate.
152    ///
153    /// Returns the estimated fraction of rows that match the value.
154    pub fn estimate_equality_selectivity(&self, value: &Value) -> f64 {
155        if self.total_rows == 0 {
156            return 0.0;
157        }
158
159        // Find the bucket containing the value
160        for bucket in &self.buckets {
161            if bucket.contains(value) {
162                // Assume uniform distribution within bucket
163                if bucket.distinct_count == 0 {
164                    return 0.0;
165                }
166                return (bucket.row_count as f64 / bucket.distinct_count as f64)
167                    / self.total_rows as f64;
168            }
169        }
170
171        // Value not in histogram - assume very low selectivity
172        1.0 / self.total_rows as f64
173    }
174
175    /// Estimates the selectivity of a range predicate.
176    ///
177    /// # Arguments
178    /// * `lower` - Lower bound (None for unbounded).
179    /// * `upper` - Upper bound (None for unbounded).
180    /// * `lower_inclusive` - Whether lower bound is inclusive.
181    /// * `upper_inclusive` - Whether upper bound is inclusive.
182    pub fn estimate_range_selectivity(
183        &self,
184        lower: Option<&Value>,
185        upper: Option<&Value>,
186        lower_inclusive: bool,
187        upper_inclusive: bool,
188    ) -> f64 {
189        if self.total_rows == 0 {
190            return 0.0;
191        }
192
193        let mut matching_rows = 0.0;
194
195        for bucket in &self.buckets {
196            // Check if bucket overlaps with range
197            let bucket_in_range = match (lower, upper) {
198                (None, None) => true,
199                (Some(l), None) => compare_values(&bucket.upper, l) != Some(Ordering::Less),
200                (None, Some(u)) => compare_values(&bucket.lower, u) != Some(Ordering::Greater),
201                (Some(l), Some(u)) => {
202                    compare_values(&bucket.upper, l) != Some(Ordering::Less)
203                        && compare_values(&bucket.lower, u) != Some(Ordering::Greater)
204                }
205            };
206
207            if !bucket_in_range {
208                continue;
209            }
210
211            // Estimate the fraction of the bucket that's in range
212            let bucket_fraction = estimate_bucket_overlap(
213                &bucket.lower,
214                &bucket.upper,
215                lower,
216                upper,
217                lower_inclusive,
218                upper_inclusive,
219            );
220
221            matching_rows += bucket.row_count as f64 * bucket_fraction;
222        }
223
224        matching_rows / self.total_rows as f64
225    }
226
227    /// Estimates the selectivity of a less-than predicate.
228    pub fn estimate_less_than_selectivity(&self, value: &Value, inclusive: bool) -> f64 {
229        self.estimate_range_selectivity(None, Some(value), true, inclusive)
230    }
231
232    /// Estimates the selectivity of a greater-than predicate.
233    pub fn estimate_greater_than_selectivity(&self, value: &Value, inclusive: bool) -> f64 {
234        self.estimate_range_selectivity(Some(value), None, inclusive, true)
235    }
236}
237
238/// Estimates the fraction of a bucket that overlaps with a range.
239fn estimate_bucket_overlap(
240    bucket_lower: &Value,
241    bucket_upper: &Value,
242    range_lower: Option<&Value>,
243    range_upper: Option<&Value>,
244    _lower_inclusive: bool,
245    _upper_inclusive: bool,
246) -> f64 {
247    // For simplicity, use linear interpolation based on value positions
248    // This is a rough estimate but works well for numeric values
249
250    // If range fully contains bucket, return 1.0
251    let range_contains_lower = range_lower
252        .map(|l| compare_values(bucket_lower, l) != Some(Ordering::Less))
253        .unwrap_or(true);
254    let range_contains_upper = range_upper
255        .map(|u| compare_values(bucket_upper, u) != Some(Ordering::Greater))
256        .unwrap_or(true);
257
258    if range_contains_lower && range_contains_upper {
259        return 1.0;
260    }
261
262    // For partial overlap, estimate fraction
263    // This is simplified - a real implementation would use numeric interpolation
264    match (bucket_lower, bucket_upper) {
265        (Value::Int64(bl), Value::Int64(bu)) => {
266            let bucket_range = (bu - bl) as f64;
267            if bucket_range <= 0.0 {
268                return 1.0;
269            }
270
271            let effective_lower = range_lower
272                .and_then(|l| {
273                    if let Value::Int64(li) = l {
274                        Some(*li)
275                    } else {
276                        None
277                    }
278                })
279                .unwrap_or(*bl);
280
281            let effective_upper = range_upper
282                .and_then(|u| {
283                    if let Value::Int64(ui) = u {
284                        Some(*ui)
285                    } else {
286                        None
287                    }
288                })
289                .unwrap_or(*bu);
290
291            let overlap_lower = effective_lower.max(*bl);
292            let overlap_upper = effective_upper.min(*bu);
293
294            if overlap_upper < overlap_lower {
295                return 0.0;
296            }
297
298            (overlap_upper - overlap_lower) as f64 / bucket_range
299        }
300        (Value::Float64(bl), Value::Float64(bu)) => {
301            let bucket_range = bu - bl;
302            if bucket_range <= 0.0 {
303                return 1.0;
304            }
305
306            let effective_lower = range_lower
307                .and_then(|l| {
308                    if let Value::Float64(li) = l {
309                        Some(*li)
310                    } else {
311                        None
312                    }
313                })
314                .unwrap_or(*bl);
315
316            let effective_upper = range_upper
317                .and_then(|u| {
318                    if let Value::Float64(ui) = u {
319                        Some(*ui)
320                    } else {
321                        None
322                    }
323                })
324                .unwrap_or(*bu);
325
326            let overlap_lower = effective_lower.max(*bl);
327            let overlap_upper = effective_upper.min(*bu);
328
329            if overlap_upper < overlap_lower {
330                return 0.0;
331            }
332
333            (overlap_upper - overlap_lower) / bucket_range
334        }
335        _ => {
336            // For non-numeric types, assume 0.5 for partial overlap
337            0.5
338        }
339    }
340}
341
342/// Compares two values.
343fn compare_values(a: &Value, b: &Value) -> Option<Ordering> {
344    match (a, b) {
345        (Value::Int64(a), Value::Int64(b)) => Some(a.cmp(b)),
346        (Value::Float64(a), Value::Float64(b)) => a.partial_cmp(b),
347        (Value::String(a), Value::String(b)) => Some(a.cmp(b)),
348        (Value::Bool(a), Value::Bool(b)) => Some(a.cmp(b)),
349        (Value::Int64(a), Value::Float64(b)) => (*a as f64).partial_cmp(b),
350        (Value::Float64(a), Value::Int64(b)) => a.partial_cmp(&(*b as f64)),
351        _ => None,
352    }
353}
354
355#[cfg(test)]
356mod tests {
357    use super::*;
358
359    fn create_int_values(values: &[i64]) -> Vec<Value> {
360        values.iter().map(|&v| Value::Int64(v)).collect()
361    }
362
363    #[test]
364    fn test_histogram_build() {
365        let values = create_int_values(&[1, 2, 3, 4, 5, 6, 7, 8, 9, 10]);
366        let hist = Histogram::build(&values, 2);
367
368        assert_eq!(hist.bucket_count(), 2);
369        assert_eq!(hist.total_rows(), 10);
370    }
371
372    #[test]
373    fn test_equality_selectivity() {
374        let values = create_int_values(&[1, 1, 2, 2, 2, 3, 3, 3, 3, 4]);
375        let hist = Histogram::build(&values, 4);
376
377        // Value 3 appears 4 times out of 10
378        let sel = hist.estimate_equality_selectivity(&Value::Int64(3));
379        assert!(sel > 0.0 && sel < 1.0);
380    }
381
382    #[test]
383    fn test_range_selectivity() {
384        let values = create_int_values(&[1, 2, 3, 4, 5, 6, 7, 8, 9, 10]);
385        let hist = Histogram::build(&values, 5);
386
387        // Range 1-5 should be about 50%
388        let sel = hist.estimate_range_selectivity(
389            Some(&Value::Int64(1)),
390            Some(&Value::Int64(5)),
391            true,
392            true,
393        );
394        assert!(sel >= 0.3 && sel <= 0.7);
395
396        // Range 1-10 should be 100%
397        let sel_full = hist.estimate_range_selectivity(
398            Some(&Value::Int64(1)),
399            Some(&Value::Int64(10)),
400            true,
401            true,
402        );
403        assert!((sel_full - 1.0).abs() < 0.1);
404    }
405
406    #[test]
407    fn test_less_than_selectivity() {
408        let values = create_int_values(&[1, 2, 3, 4, 5, 6, 7, 8, 9, 10]);
409        let hist = Histogram::build(&values, 5);
410
411        // Less than 5 should be about 40%
412        let sel = hist.estimate_less_than_selectivity(&Value::Int64(5), false);
413        assert!(sel > 0.0 && sel < 1.0);
414    }
415
416    #[test]
417    fn test_empty_histogram() {
418        let hist = Histogram::build(&[], 5);
419
420        assert_eq!(hist.bucket_count(), 0);
421        assert_eq!(hist.total_rows(), 0);
422        assert_eq!(hist.estimate_equality_selectivity(&Value::Int64(5)), 0.0);
423    }
424}