grafeo_core/statistics/
histogram.rs

1//! Equi-depth histograms for understanding value distributions.
2//!
3//! When the optimizer sees `WHERE age > 30`, it needs to know what fraction of
4//! rows match. Histograms split the value range into buckets of roughly equal
5//! row counts, letting us estimate selectivity without scanning the data.
6
7use grafeo_common::types::Value;
8use std::cmp::Ordering;
9
10/// One slice of the value distribution - a range with its row count.
11#[derive(Debug, Clone)]
12pub struct HistogramBucket {
13    /// Lower bound (inclusive).
14    pub lower: Value,
15    /// Upper bound (inclusive).
16    pub upper: Value,
17    /// Number of distinct values in this bucket.
18    pub distinct_count: u64,
19    /// Number of values in this bucket.
20    pub row_count: u64,
21}
22
23impl HistogramBucket {
24    /// Creates a new histogram bucket.
25    pub fn new(lower: Value, upper: Value, distinct_count: u64, row_count: u64) -> Self {
26        Self {
27            lower,
28            upper,
29            distinct_count,
30            row_count,
31        }
32    }
33
34    /// Checks if a value falls within this bucket.
35    pub fn contains(&self, value: &Value) -> bool {
36        compare_values(value, &self.lower) != Some(Ordering::Less)
37            && compare_values(value, &self.upper) != Some(Ordering::Greater)
38    }
39}
40
41/// Divides a column's value range into buckets of roughly equal row counts.
42///
43/// Build one with [`build()`](Self::build) from sorted values, then use
44/// [`estimate_equality_selectivity()`](Self::estimate_equality_selectivity)
45/// or [`estimate_range_selectivity()`](Self::estimate_range_selectivity)
46/// to predict how many rows will match a predicate.
47#[derive(Debug, Clone)]
48pub struct Histogram {
49    /// Histogram buckets, sorted by lower bound.
50    buckets: Vec<HistogramBucket>,
51    /// Total number of rows represented.
52    total_rows: u64,
53    /// Total number of distinct values.
54    total_distinct: u64,
55}
56
57impl Histogram {
58    /// Creates a new histogram with the given buckets.
59    pub fn new(buckets: Vec<HistogramBucket>) -> Self {
60        let total_rows = buckets.iter().map(|b| b.row_count).sum();
61        let total_distinct = buckets.iter().map(|b| b.distinct_count).sum();
62
63        Self {
64            buckets,
65            total_rows,
66            total_distinct,
67        }
68    }
69
70    /// Creates an equi-depth histogram from sorted values.
71    ///
72    /// # Arguments
73    /// * `sorted_values` - Values sorted in ascending order.
74    /// * `num_buckets` - Target number of buckets.
75    ///
76    /// # Panics
77    ///
78    /// Panics if a non-empty bucket slice has no first or last element (invariant violation).
79    pub fn build(sorted_values: &[Value], num_buckets: usize) -> Self {
80        if sorted_values.is_empty() {
81            return Self::new(Vec::new());
82        }
83
84        let num_buckets = num_buckets.max(1).min(sorted_values.len());
85        let rows_per_bucket = sorted_values.len() / num_buckets;
86
87        let mut buckets = Vec::with_capacity(num_buckets);
88        let mut current_start = 0;
89
90        for i in 0..num_buckets {
91            let end = if i == num_buckets - 1 {
92                sorted_values.len()
93            } else {
94                current_start + rows_per_bucket
95            };
96
97            if current_start >= sorted_values.len() {
98                break;
99            }
100
101            let bucket_values = &sorted_values[current_start..end];
102            // Invariant: slice is non-empty because:
103            // - current_start < sorted_values.len() (guard on line 89)
104            // - end > current_start (rows_per_bucket >= 1 or end = len > current_start)
105            let lower = bucket_values
106                .first()
107                .expect("bucket_values is non-empty: current_start < end")
108                .clone();
109            let upper = bucket_values
110                .last()
111                .expect("bucket_values is non-empty: current_start < end")
112                .clone();
113
114            // Count distinct values in bucket
115            let mut distinct = 1u64;
116            for j in 1..bucket_values.len() {
117                if bucket_values[j] != bucket_values[j - 1] {
118                    distinct += 1;
119                }
120            }
121
122            buckets.push(HistogramBucket::new(
123                lower,
124                upper,
125                distinct,
126                bucket_values.len() as u64,
127            ));
128
129            current_start = end;
130        }
131
132        Self::new(buckets)
133    }
134
135    /// Returns the number of buckets.
136    pub fn bucket_count(&self) -> usize {
137        self.buckets.len()
138    }
139
140    /// Returns the buckets.
141    pub fn buckets(&self) -> &[HistogramBucket] {
142        &self.buckets
143    }
144
145    /// Returns the total row count.
146    pub fn total_rows(&self) -> u64 {
147        self.total_rows
148    }
149
150    /// Returns the total distinct count.
151    pub fn total_distinct(&self) -> u64 {
152        self.total_distinct
153    }
154
155    /// Estimates the selectivity of an equality predicate.
156    ///
157    /// Returns the estimated fraction of rows that match the value.
158    pub fn estimate_equality_selectivity(&self, value: &Value) -> f64 {
159        if self.total_rows == 0 {
160            return 0.0;
161        }
162
163        // Find the bucket containing the value
164        for bucket in &self.buckets {
165            if bucket.contains(value) {
166                // Assume uniform distribution within bucket
167                if bucket.distinct_count == 0 {
168                    return 0.0;
169                }
170                return (bucket.row_count as f64 / bucket.distinct_count as f64)
171                    / self.total_rows as f64;
172            }
173        }
174
175        // Value not in histogram - assume very low selectivity
176        1.0 / self.total_rows as f64
177    }
178
179    /// Estimates the selectivity of a range predicate.
180    ///
181    /// # Arguments
182    /// * `lower` - Lower bound (None for unbounded).
183    /// * `upper` - Upper bound (None for unbounded).
184    /// * `lower_inclusive` - Whether lower bound is inclusive.
185    /// * `upper_inclusive` - Whether upper bound is inclusive.
186    pub fn estimate_range_selectivity(
187        &self,
188        lower: Option<&Value>,
189        upper: Option<&Value>,
190        lower_inclusive: bool,
191        upper_inclusive: bool,
192    ) -> f64 {
193        if self.total_rows == 0 {
194            return 0.0;
195        }
196
197        let mut matching_rows = 0.0;
198
199        for bucket in &self.buckets {
200            // Check if bucket overlaps with range
201            let bucket_in_range = match (lower, upper) {
202                (None, None) => true,
203                (Some(l), None) => compare_values(&bucket.upper, l) != Some(Ordering::Less),
204                (None, Some(u)) => compare_values(&bucket.lower, u) != Some(Ordering::Greater),
205                (Some(l), Some(u)) => {
206                    compare_values(&bucket.upper, l) != Some(Ordering::Less)
207                        && compare_values(&bucket.lower, u) != Some(Ordering::Greater)
208                }
209            };
210
211            if !bucket_in_range {
212                continue;
213            }
214
215            // Estimate the fraction of the bucket that's in range
216            let bucket_fraction = estimate_bucket_overlap(
217                &bucket.lower,
218                &bucket.upper,
219                lower,
220                upper,
221                lower_inclusive,
222                upper_inclusive,
223            );
224
225            matching_rows += bucket.row_count as f64 * bucket_fraction;
226        }
227
228        matching_rows / self.total_rows as f64
229    }
230
231    /// Estimates the selectivity of a less-than predicate.
232    pub fn estimate_less_than_selectivity(&self, value: &Value, inclusive: bool) -> f64 {
233        self.estimate_range_selectivity(None, Some(value), true, inclusive)
234    }
235
236    /// Estimates the selectivity of a greater-than predicate.
237    pub fn estimate_greater_than_selectivity(&self, value: &Value, inclusive: bool) -> f64 {
238        self.estimate_range_selectivity(Some(value), None, inclusive, true)
239    }
240}
241
242/// Estimates the fraction of a bucket that overlaps with a range.
243fn estimate_bucket_overlap(
244    bucket_lower: &Value,
245    bucket_upper: &Value,
246    range_lower: Option<&Value>,
247    range_upper: Option<&Value>,
248    _lower_inclusive: bool,
249    _upper_inclusive: bool,
250) -> f64 {
251    // For simplicity, use linear interpolation based on value positions
252    // This is a rough estimate but works well for numeric values
253
254    // If range fully contains bucket, return 1.0
255    let range_contains_lower = range_lower.map_or(true, |l| {
256        compare_values(bucket_lower, l) != Some(Ordering::Less)
257    });
258    let range_contains_upper = range_upper.map_or(true, |u| {
259        compare_values(bucket_upper, u) != Some(Ordering::Greater)
260    });
261
262    if range_contains_lower && range_contains_upper {
263        return 1.0;
264    }
265
266    // For partial overlap, estimate fraction
267    // This is simplified - a real implementation would use numeric interpolation
268    match (bucket_lower, bucket_upper) {
269        (Value::Int64(bl), Value::Int64(bu)) => {
270            let bucket_range = (bu - bl) as f64;
271            if bucket_range <= 0.0 {
272                return 1.0;
273            }
274
275            let effective_lower = range_lower
276                .and_then(|l| {
277                    if let Value::Int64(li) = l {
278                        Some(*li)
279                    } else {
280                        None
281                    }
282                })
283                .unwrap_or(*bl);
284
285            let effective_upper = range_upper
286                .and_then(|u| {
287                    if let Value::Int64(ui) = u {
288                        Some(*ui)
289                    } else {
290                        None
291                    }
292                })
293                .unwrap_or(*bu);
294
295            let overlap_lower = effective_lower.max(*bl);
296            let overlap_upper = effective_upper.min(*bu);
297
298            if overlap_upper < overlap_lower {
299                return 0.0;
300            }
301
302            (overlap_upper - overlap_lower) as f64 / bucket_range
303        }
304        (Value::Float64(bl), Value::Float64(bu)) => {
305            let bucket_range = bu - bl;
306            if bucket_range <= 0.0 {
307                return 1.0;
308            }
309
310            let effective_lower = range_lower
311                .and_then(|l| {
312                    if let Value::Float64(li) = l {
313                        Some(*li)
314                    } else {
315                        None
316                    }
317                })
318                .unwrap_or(*bl);
319
320            let effective_upper = range_upper
321                .and_then(|u| {
322                    if let Value::Float64(ui) = u {
323                        Some(*ui)
324                    } else {
325                        None
326                    }
327                })
328                .unwrap_or(*bu);
329
330            let overlap_lower = effective_lower.max(*bl);
331            let overlap_upper = effective_upper.min(*bu);
332
333            if overlap_upper < overlap_lower {
334                return 0.0;
335            }
336
337            (overlap_upper - overlap_lower) / bucket_range
338        }
339        _ => {
340            // For non-numeric types, assume 0.5 for partial overlap
341            0.5
342        }
343    }
344}
345
346/// Compares two values.
347fn compare_values(a: &Value, b: &Value) -> Option<Ordering> {
348    match (a, b) {
349        (Value::Int64(a), Value::Int64(b)) => Some(a.cmp(b)),
350        (Value::Float64(a), Value::Float64(b)) => a.partial_cmp(b),
351        (Value::String(a), Value::String(b)) => Some(a.cmp(b)),
352        (Value::Bool(a), Value::Bool(b)) => Some(a.cmp(b)),
353        (Value::Int64(a), Value::Float64(b)) => (*a as f64).partial_cmp(b),
354        (Value::Float64(a), Value::Int64(b)) => a.partial_cmp(&(*b as f64)),
355        (Value::Timestamp(a), Value::Timestamp(b)) => Some(a.cmp(b)),
356        (Value::Date(a), Value::Date(b)) => Some(a.cmp(b)),
357        (Value::Time(a), Value::Time(b)) => Some(a.cmp(b)),
358        _ => None,
359    }
360}
361
362#[cfg(test)]
363mod tests {
364    use super::*;
365
366    fn create_int_values(values: &[i64]) -> Vec<Value> {
367        values.iter().map(|&v| Value::Int64(v)).collect()
368    }
369
370    #[test]
371    fn test_histogram_build() {
372        let values = create_int_values(&[1, 2, 3, 4, 5, 6, 7, 8, 9, 10]);
373        let hist = Histogram::build(&values, 2);
374
375        assert_eq!(hist.bucket_count(), 2);
376        assert_eq!(hist.total_rows(), 10);
377    }
378
379    #[test]
380    fn test_equality_selectivity() {
381        let values = create_int_values(&[1, 1, 2, 2, 2, 3, 3, 3, 3, 4]);
382        let hist = Histogram::build(&values, 4);
383
384        // Value 3 appears 4 times out of 10
385        let sel = hist.estimate_equality_selectivity(&Value::Int64(3));
386        assert!(sel > 0.0 && sel < 1.0);
387    }
388
389    #[test]
390    fn test_range_selectivity() {
391        let values = create_int_values(&[1, 2, 3, 4, 5, 6, 7, 8, 9, 10]);
392        let hist = Histogram::build(&values, 5);
393
394        // Range 1-5 should be about 50%
395        let sel = hist.estimate_range_selectivity(
396            Some(&Value::Int64(1)),
397            Some(&Value::Int64(5)),
398            true,
399            true,
400        );
401        assert!((0.3..=0.7).contains(&sel));
402
403        // Range 1-10 should be 100%
404        let sel_full = hist.estimate_range_selectivity(
405            Some(&Value::Int64(1)),
406            Some(&Value::Int64(10)),
407            true,
408            true,
409        );
410        assert!((sel_full - 1.0).abs() < 0.1);
411    }
412
413    #[test]
414    fn test_less_than_selectivity() {
415        let values = create_int_values(&[1, 2, 3, 4, 5, 6, 7, 8, 9, 10]);
416        let hist = Histogram::build(&values, 5);
417
418        // Less than 5 should be about 40%
419        let sel = hist.estimate_less_than_selectivity(&Value::Int64(5), false);
420        assert!(sel > 0.0 && sel < 1.0);
421    }
422
423    #[test]
424    fn test_empty_histogram() {
425        let hist = Histogram::build(&[], 5);
426
427        assert_eq!(hist.bucket_count(), 0);
428        assert_eq!(hist.total_rows(), 0);
429        assert_eq!(hist.estimate_equality_selectivity(&Value::Int64(5)), 0.0);
430    }
431}
grafeo_core/statistics/histogram.rs

grafeo_core/statistics/
histogram.rs