Skip to main content

grafeo_core/statistics/
histogram.rs

1//! Equi-depth histograms for selectivity estimation.
2//!
3//! Histograms are used to estimate the selectivity of range predicates
4//! and equality predicates on high-cardinality columns.
5
6use grafeo_common::types::Value;
7use std::cmp::Ordering;
8
9/// An equi-depth histogram bucket.
10#[derive(Debug, Clone)]
11pub struct HistogramBucket {
12    /// Lower bound (inclusive).
13    pub lower: Value,
14    /// Upper bound (inclusive).
15    pub upper: Value,
16    /// Number of distinct values in this bucket.
17    pub distinct_count: u64,
18    /// Number of values in this bucket.
19    pub row_count: u64,
20}
21
22impl HistogramBucket {
23    /// Creates a new histogram bucket.
24    pub fn new(lower: Value, upper: Value, distinct_count: u64, row_count: u64) -> Self {
25        Self {
26            lower,
27            upper,
28            distinct_count,
29            row_count,
30        }
31    }
32
33    /// Checks if a value falls within this bucket.
34    pub fn contains(&self, value: &Value) -> bool {
35        compare_values(value, &self.lower) != Some(Ordering::Less)
36            && compare_values(value, &self.upper) != Some(Ordering::Greater)
37    }
38}
39
40/// An equi-depth histogram.
41///
42/// Each bucket contains approximately the same number of rows.
43#[derive(Debug, Clone)]
44pub struct Histogram {
45    /// Histogram buckets, sorted by lower bound.
46    buckets: Vec<HistogramBucket>,
47    /// Total number of rows represented.
48    total_rows: u64,
49    /// Total number of distinct values.
50    total_distinct: u64,
51}
52
53impl Histogram {
54    /// Creates a new histogram with the given buckets.
55    pub fn new(buckets: Vec<HistogramBucket>) -> Self {
56        let total_rows = buckets.iter().map(|b| b.row_count).sum();
57        let total_distinct = buckets.iter().map(|b| b.distinct_count).sum();
58
59        Self {
60            buckets,
61            total_rows,
62            total_distinct,
63        }
64    }
65
66    /// Creates an equi-depth histogram from sorted values.
67    ///
68    /// # Arguments
69    /// * `sorted_values` - Values sorted in ascending order.
70    /// * `num_buckets` - Target number of buckets.
71    pub fn build(sorted_values: &[Value], num_buckets: usize) -> Self {
72        if sorted_values.is_empty() {
73            return Self::new(Vec::new());
74        }
75
76        let num_buckets = num_buckets.max(1).min(sorted_values.len());
77        let rows_per_bucket = sorted_values.len() / num_buckets;
78
79        let mut buckets = Vec::with_capacity(num_buckets);
80        let mut current_start = 0;
81
82        for i in 0..num_buckets {
83            let end = if i == num_buckets - 1 {
84                sorted_values.len()
85            } else {
86                current_start + rows_per_bucket
87            };
88
89            if current_start >= sorted_values.len() {
90                break;
91            }
92
93            let bucket_values = &sorted_values[current_start..end];
94            // Invariant: slice is non-empty because:
95            // - current_start < sorted_values.len() (guard on line 89)
96            // - end > current_start (rows_per_bucket >= 1 or end = len > current_start)
97            let lower = bucket_values
98                .first()
99                .expect("bucket_values is non-empty: current_start < end")
100                .clone();
101            let upper = bucket_values
102                .last()
103                .expect("bucket_values is non-empty: current_start < end")
104                .clone();
105
106            // Count distinct values in bucket
107            let mut distinct = 1u64;
108            for j in 1..bucket_values.len() {
109                if bucket_values[j] != bucket_values[j - 1] {
110                    distinct += 1;
111                }
112            }
113
114            buckets.push(HistogramBucket::new(
115                lower,
116                upper,
117                distinct,
118                bucket_values.len() as u64,
119            ));
120
121            current_start = end;
122        }
123
124        Self::new(buckets)
125    }
126
127    /// Returns the number of buckets.
128    pub fn bucket_count(&self) -> usize {
129        self.buckets.len()
130    }
131
132    /// Returns the buckets.
133    pub fn buckets(&self) -> &[HistogramBucket] {
134        &self.buckets
135    }
136
137    /// Returns the total row count.
138    pub fn total_rows(&self) -> u64 {
139        self.total_rows
140    }
141
142    /// Returns the total distinct count.
143    pub fn total_distinct(&self) -> u64 {
144        self.total_distinct
145    }
146
147    /// Estimates the selectivity of an equality predicate.
148    ///
149    /// Returns the estimated fraction of rows that match the value.
150    pub fn estimate_equality_selectivity(&self, value: &Value) -> f64 {
151        if self.total_rows == 0 {
152            return 0.0;
153        }
154
155        // Find the bucket containing the value
156        for bucket in &self.buckets {
157            if bucket.contains(value) {
158                // Assume uniform distribution within bucket
159                if bucket.distinct_count == 0 {
160                    return 0.0;
161                }
162                return (bucket.row_count as f64 / bucket.distinct_count as f64)
163                    / self.total_rows as f64;
164            }
165        }
166
167        // Value not in histogram - assume very low selectivity
168        1.0 / self.total_rows as f64
169    }
170
171    /// Estimates the selectivity of a range predicate.
172    ///
173    /// # Arguments
174    /// * `lower` - Lower bound (None for unbounded).
175    /// * `upper` - Upper bound (None for unbounded).
176    /// * `lower_inclusive` - Whether lower bound is inclusive.
177    /// * `upper_inclusive` - Whether upper bound is inclusive.
178    pub fn estimate_range_selectivity(
179        &self,
180        lower: Option<&Value>,
181        upper: Option<&Value>,
182        lower_inclusive: bool,
183        upper_inclusive: bool,
184    ) -> f64 {
185        if self.total_rows == 0 {
186            return 0.0;
187        }
188
189        let mut matching_rows = 0.0;
190
191        for bucket in &self.buckets {
192            // Check if bucket overlaps with range
193            let bucket_in_range = match (lower, upper) {
194                (None, None) => true,
195                (Some(l), None) => compare_values(&bucket.upper, l) != Some(Ordering::Less),
196                (None, Some(u)) => compare_values(&bucket.lower, u) != Some(Ordering::Greater),
197                (Some(l), Some(u)) => {
198                    compare_values(&bucket.upper, l) != Some(Ordering::Less)
199                        && compare_values(&bucket.lower, u) != Some(Ordering::Greater)
200                }
201            };
202
203            if !bucket_in_range {
204                continue;
205            }
206
207            // Estimate the fraction of the bucket that's in range
208            let bucket_fraction = estimate_bucket_overlap(
209                &bucket.lower,
210                &bucket.upper,
211                lower,
212                upper,
213                lower_inclusive,
214                upper_inclusive,
215            );
216
217            matching_rows += bucket.row_count as f64 * bucket_fraction;
218        }
219
220        matching_rows / self.total_rows as f64
221    }
222
223    /// Estimates the selectivity of a less-than predicate.
224    pub fn estimate_less_than_selectivity(&self, value: &Value, inclusive: bool) -> f64 {
225        self.estimate_range_selectivity(None, Some(value), true, inclusive)
226    }
227
228    /// Estimates the selectivity of a greater-than predicate.
229    pub fn estimate_greater_than_selectivity(&self, value: &Value, inclusive: bool) -> f64 {
230        self.estimate_range_selectivity(Some(value), None, inclusive, true)
231    }
232}
233
234/// Estimates the fraction of a bucket that overlaps with a range.
235fn estimate_bucket_overlap(
236    bucket_lower: &Value,
237    bucket_upper: &Value,
238    range_lower: Option<&Value>,
239    range_upper: Option<&Value>,
240    _lower_inclusive: bool,
241    _upper_inclusive: bool,
242) -> f64 {
243    // For simplicity, use linear interpolation based on value positions
244    // This is a rough estimate but works well for numeric values
245
246    // If range fully contains bucket, return 1.0
247    let range_contains_lower = range_lower
248        .map(|l| compare_values(bucket_lower, l) != Some(Ordering::Less))
249        .unwrap_or(true);
250    let range_contains_upper = range_upper
251        .map(|u| compare_values(bucket_upper, u) != Some(Ordering::Greater))
252        .unwrap_or(true);
253
254    if range_contains_lower && range_contains_upper {
255        return 1.0;
256    }
257
258    // For partial overlap, estimate fraction
259    // This is simplified - a real implementation would use numeric interpolation
260    match (bucket_lower, bucket_upper) {
261        (Value::Int64(bl), Value::Int64(bu)) => {
262            let bucket_range = (bu - bl) as f64;
263            if bucket_range <= 0.0 {
264                return 1.0;
265            }
266
267            let effective_lower = range_lower
268                .and_then(|l| {
269                    if let Value::Int64(li) = l {
270                        Some(*li)
271                    } else {
272                        None
273                    }
274                })
275                .unwrap_or(*bl);
276
277            let effective_upper = range_upper
278                .and_then(|u| {
279                    if let Value::Int64(ui) = u {
280                        Some(*ui)
281                    } else {
282                        None
283                    }
284                })
285                .unwrap_or(*bu);
286
287            let overlap_lower = effective_lower.max(*bl);
288            let overlap_upper = effective_upper.min(*bu);
289
290            if overlap_upper < overlap_lower {
291                return 0.0;
292            }
293
294            (overlap_upper - overlap_lower) as f64 / bucket_range
295        }
296        (Value::Float64(bl), Value::Float64(bu)) => {
297            let bucket_range = bu - bl;
298            if bucket_range <= 0.0 {
299                return 1.0;
300            }
301
302            let effective_lower = range_lower
303                .and_then(|l| {
304                    if let Value::Float64(li) = l {
305                        Some(*li)
306                    } else {
307                        None
308                    }
309                })
310                .unwrap_or(*bl);
311
312            let effective_upper = range_upper
313                .and_then(|u| {
314                    if let Value::Float64(ui) = u {
315                        Some(*ui)
316                    } else {
317                        None
318                    }
319                })
320                .unwrap_or(*bu);
321
322            let overlap_lower = effective_lower.max(*bl);
323            let overlap_upper = effective_upper.min(*bu);
324
325            if overlap_upper < overlap_lower {
326                return 0.0;
327            }
328
329            (overlap_upper - overlap_lower) / bucket_range
330        }
331        _ => {
332            // For non-numeric types, assume 0.5 for partial overlap
333            0.5
334        }
335    }
336}
337
338/// Compares two values.
339fn compare_values(a: &Value, b: &Value) -> Option<Ordering> {
340    match (a, b) {
341        (Value::Int64(a), Value::Int64(b)) => Some(a.cmp(b)),
342        (Value::Float64(a), Value::Float64(b)) => a.partial_cmp(b),
343        (Value::String(a), Value::String(b)) => Some(a.cmp(b)),
344        (Value::Bool(a), Value::Bool(b)) => Some(a.cmp(b)),
345        (Value::Int64(a), Value::Float64(b)) => (*a as f64).partial_cmp(b),
346        (Value::Float64(a), Value::Int64(b)) => a.partial_cmp(&(*b as f64)),
347        _ => None,
348    }
349}
350
351#[cfg(test)]
352mod tests {
353    use super::*;
354
355    fn create_int_values(values: &[i64]) -> Vec<Value> {
356        values.iter().map(|&v| Value::Int64(v)).collect()
357    }
358
359    #[test]
360    fn test_histogram_build() {
361        let values = create_int_values(&[1, 2, 3, 4, 5, 6, 7, 8, 9, 10]);
362        let hist = Histogram::build(&values, 2);
363
364        assert_eq!(hist.bucket_count(), 2);
365        assert_eq!(hist.total_rows(), 10);
366    }
367
368    #[test]
369    fn test_equality_selectivity() {
370        let values = create_int_values(&[1, 1, 2, 2, 2, 3, 3, 3, 3, 4]);
371        let hist = Histogram::build(&values, 4);
372
373        // Value 3 appears 4 times out of 10
374        let sel = hist.estimate_equality_selectivity(&Value::Int64(3));
375        assert!(sel > 0.0 && sel < 1.0);
376    }
377
378    #[test]
379    fn test_range_selectivity() {
380        let values = create_int_values(&[1, 2, 3, 4, 5, 6, 7, 8, 9, 10]);
381        let hist = Histogram::build(&values, 5);
382
383        // Range 1-5 should be about 50%
384        let sel = hist.estimate_range_selectivity(
385            Some(&Value::Int64(1)),
386            Some(&Value::Int64(5)),
387            true,
388            true,
389        );
390        assert!(sel >= 0.3 && sel <= 0.7);
391
392        // Range 1-10 should be 100%
393        let sel_full = hist.estimate_range_selectivity(
394            Some(&Value::Int64(1)),
395            Some(&Value::Int64(10)),
396            true,
397            true,
398        );
399        assert!((sel_full - 1.0).abs() < 0.1);
400    }
401
402    #[test]
403    fn test_less_than_selectivity() {
404        let values = create_int_values(&[1, 2, 3, 4, 5, 6, 7, 8, 9, 10]);
405        let hist = Histogram::build(&values, 5);
406
407        // Less than 5 should be about 40%
408        let sel = hist.estimate_less_than_selectivity(&Value::Int64(5), false);
409        assert!(sel > 0.0 && sel < 1.0);
410    }
411
412    #[test]
413    fn test_empty_histogram() {
414        let hist = Histogram::build(&[], 5);
415
416        assert_eq!(hist.bucket_count(), 0);
417        assert_eq!(hist.total_rows(), 0);
418        assert_eq!(hist.estimate_equality_selectivity(&Value::Int64(5)), 0.0);
419    }
420}