1use grafeo_common::types::Value;
8use std::cmp::Ordering;
9
10#[derive(Debug, Clone)]
12pub struct HistogramBucket {
13 pub lower: Value,
15 pub upper: Value,
17 pub distinct_count: u64,
19 pub row_count: u64,
21}
22
23impl HistogramBucket {
24 pub fn new(lower: Value, upper: Value, distinct_count: u64, row_count: u64) -> Self {
26 Self {
27 lower,
28 upper,
29 distinct_count,
30 row_count,
31 }
32 }
33
34 pub fn contains(&self, value: &Value) -> bool {
36 compare_values(value, &self.lower) != Some(Ordering::Less)
37 && compare_values(value, &self.upper) != Some(Ordering::Greater)
38 }
39}
40
41#[derive(Debug, Clone)]
48pub struct Histogram {
49 buckets: Vec<HistogramBucket>,
51 total_rows: u64,
53 total_distinct: u64,
55}
56
57impl Histogram {
58 pub fn new(buckets: Vec<HistogramBucket>) -> Self {
60 let total_rows = buckets.iter().map(|b| b.row_count).sum();
61 let total_distinct = buckets.iter().map(|b| b.distinct_count).sum();
62
63 Self {
64 buckets,
65 total_rows,
66 total_distinct,
67 }
68 }
69
70 pub fn build(sorted_values: &[Value], num_buckets: usize) -> Self {
76 if sorted_values.is_empty() {
77 return Self::new(Vec::new());
78 }
79
80 let num_buckets = num_buckets.max(1).min(sorted_values.len());
81 let rows_per_bucket = sorted_values.len() / num_buckets;
82
83 let mut buckets = Vec::with_capacity(num_buckets);
84 let mut current_start = 0;
85
86 for i in 0..num_buckets {
87 let end = if i == num_buckets - 1 {
88 sorted_values.len()
89 } else {
90 current_start + rows_per_bucket
91 };
92
93 if current_start >= sorted_values.len() {
94 break;
95 }
96
97 let bucket_values = &sorted_values[current_start..end];
98 let lower = bucket_values
102 .first()
103 .expect("bucket_values is non-empty: current_start < end")
104 .clone();
105 let upper = bucket_values
106 .last()
107 .expect("bucket_values is non-empty: current_start < end")
108 .clone();
109
110 let mut distinct = 1u64;
112 for j in 1..bucket_values.len() {
113 if bucket_values[j] != bucket_values[j - 1] {
114 distinct += 1;
115 }
116 }
117
118 buckets.push(HistogramBucket::new(
119 lower,
120 upper,
121 distinct,
122 bucket_values.len() as u64,
123 ));
124
125 current_start = end;
126 }
127
128 Self::new(buckets)
129 }
130
131 pub fn bucket_count(&self) -> usize {
133 self.buckets.len()
134 }
135
136 pub fn buckets(&self) -> &[HistogramBucket] {
138 &self.buckets
139 }
140
141 pub fn total_rows(&self) -> u64 {
143 self.total_rows
144 }
145
146 pub fn total_distinct(&self) -> u64 {
148 self.total_distinct
149 }
150
151 pub fn estimate_equality_selectivity(&self, value: &Value) -> f64 {
155 if self.total_rows == 0 {
156 return 0.0;
157 }
158
159 for bucket in &self.buckets {
161 if bucket.contains(value) {
162 if bucket.distinct_count == 0 {
164 return 0.0;
165 }
166 return (bucket.row_count as f64 / bucket.distinct_count as f64)
167 / self.total_rows as f64;
168 }
169 }
170
171 1.0 / self.total_rows as f64
173 }
174
175 pub fn estimate_range_selectivity(
183 &self,
184 lower: Option<&Value>,
185 upper: Option<&Value>,
186 lower_inclusive: bool,
187 upper_inclusive: bool,
188 ) -> f64 {
189 if self.total_rows == 0 {
190 return 0.0;
191 }
192
193 let mut matching_rows = 0.0;
194
195 for bucket in &self.buckets {
196 let bucket_in_range = match (lower, upper) {
198 (None, None) => true,
199 (Some(l), None) => compare_values(&bucket.upper, l) != Some(Ordering::Less),
200 (None, Some(u)) => compare_values(&bucket.lower, u) != Some(Ordering::Greater),
201 (Some(l), Some(u)) => {
202 compare_values(&bucket.upper, l) != Some(Ordering::Less)
203 && compare_values(&bucket.lower, u) != Some(Ordering::Greater)
204 }
205 };
206
207 if !bucket_in_range {
208 continue;
209 }
210
211 let bucket_fraction = estimate_bucket_overlap(
213 &bucket.lower,
214 &bucket.upper,
215 lower,
216 upper,
217 lower_inclusive,
218 upper_inclusive,
219 );
220
221 matching_rows += bucket.row_count as f64 * bucket_fraction;
222 }
223
224 matching_rows / self.total_rows as f64
225 }
226
227 pub fn estimate_less_than_selectivity(&self, value: &Value, inclusive: bool) -> f64 {
229 self.estimate_range_selectivity(None, Some(value), true, inclusive)
230 }
231
232 pub fn estimate_greater_than_selectivity(&self, value: &Value, inclusive: bool) -> f64 {
234 self.estimate_range_selectivity(Some(value), None, inclusive, true)
235 }
236}
237
238fn estimate_bucket_overlap(
240 bucket_lower: &Value,
241 bucket_upper: &Value,
242 range_lower: Option<&Value>,
243 range_upper: Option<&Value>,
244 _lower_inclusive: bool,
245 _upper_inclusive: bool,
246) -> f64 {
247 let range_contains_lower = range_lower
252 .map(|l| compare_values(bucket_lower, l) != Some(Ordering::Less))
253 .unwrap_or(true);
254 let range_contains_upper = range_upper
255 .map(|u| compare_values(bucket_upper, u) != Some(Ordering::Greater))
256 .unwrap_or(true);
257
258 if range_contains_lower && range_contains_upper {
259 return 1.0;
260 }
261
262 match (bucket_lower, bucket_upper) {
265 (Value::Int64(bl), Value::Int64(bu)) => {
266 let bucket_range = (bu - bl) as f64;
267 if bucket_range <= 0.0 {
268 return 1.0;
269 }
270
271 let effective_lower = range_lower
272 .and_then(|l| {
273 if let Value::Int64(li) = l {
274 Some(*li)
275 } else {
276 None
277 }
278 })
279 .unwrap_or(*bl);
280
281 let effective_upper = range_upper
282 .and_then(|u| {
283 if let Value::Int64(ui) = u {
284 Some(*ui)
285 } else {
286 None
287 }
288 })
289 .unwrap_or(*bu);
290
291 let overlap_lower = effective_lower.max(*bl);
292 let overlap_upper = effective_upper.min(*bu);
293
294 if overlap_upper < overlap_lower {
295 return 0.0;
296 }
297
298 (overlap_upper - overlap_lower) as f64 / bucket_range
299 }
300 (Value::Float64(bl), Value::Float64(bu)) => {
301 let bucket_range = bu - bl;
302 if bucket_range <= 0.0 {
303 return 1.0;
304 }
305
306 let effective_lower = range_lower
307 .and_then(|l| {
308 if let Value::Float64(li) = l {
309 Some(*li)
310 } else {
311 None
312 }
313 })
314 .unwrap_or(*bl);
315
316 let effective_upper = range_upper
317 .and_then(|u| {
318 if let Value::Float64(ui) = u {
319 Some(*ui)
320 } else {
321 None
322 }
323 })
324 .unwrap_or(*bu);
325
326 let overlap_lower = effective_lower.max(*bl);
327 let overlap_upper = effective_upper.min(*bu);
328
329 if overlap_upper < overlap_lower {
330 return 0.0;
331 }
332
333 (overlap_upper - overlap_lower) / bucket_range
334 }
335 _ => {
336 0.5
338 }
339 }
340}
341
342fn compare_values(a: &Value, b: &Value) -> Option<Ordering> {
344 match (a, b) {
345 (Value::Int64(a), Value::Int64(b)) => Some(a.cmp(b)),
346 (Value::Float64(a), Value::Float64(b)) => a.partial_cmp(b),
347 (Value::String(a), Value::String(b)) => Some(a.cmp(b)),
348 (Value::Bool(a), Value::Bool(b)) => Some(a.cmp(b)),
349 (Value::Int64(a), Value::Float64(b)) => (*a as f64).partial_cmp(b),
350 (Value::Float64(a), Value::Int64(b)) => a.partial_cmp(&(*b as f64)),
351 _ => None,
352 }
353}
354
355#[cfg(test)]
356mod tests {
357 use super::*;
358
359 fn create_int_values(values: &[i64]) -> Vec<Value> {
360 values.iter().map(|&v| Value::Int64(v)).collect()
361 }
362
363 #[test]
364 fn test_histogram_build() {
365 let values = create_int_values(&[1, 2, 3, 4, 5, 6, 7, 8, 9, 10]);
366 let hist = Histogram::build(&values, 2);
367
368 assert_eq!(hist.bucket_count(), 2);
369 assert_eq!(hist.total_rows(), 10);
370 }
371
372 #[test]
373 fn test_equality_selectivity() {
374 let values = create_int_values(&[1, 1, 2, 2, 2, 3, 3, 3, 3, 4]);
375 let hist = Histogram::build(&values, 4);
376
377 let sel = hist.estimate_equality_selectivity(&Value::Int64(3));
379 assert!(sel > 0.0 && sel < 1.0);
380 }
381
382 #[test]
383 fn test_range_selectivity() {
384 let values = create_int_values(&[1, 2, 3, 4, 5, 6, 7, 8, 9, 10]);
385 let hist = Histogram::build(&values, 5);
386
387 let sel = hist.estimate_range_selectivity(
389 Some(&Value::Int64(1)),
390 Some(&Value::Int64(5)),
391 true,
392 true,
393 );
394 assert!(sel >= 0.3 && sel <= 0.7);
395
396 let sel_full = hist.estimate_range_selectivity(
398 Some(&Value::Int64(1)),
399 Some(&Value::Int64(10)),
400 true,
401 true,
402 );
403 assert!((sel_full - 1.0).abs() < 0.1);
404 }
405
406 #[test]
407 fn test_less_than_selectivity() {
408 let values = create_int_values(&[1, 2, 3, 4, 5, 6, 7, 8, 9, 10]);
409 let hist = Histogram::build(&values, 5);
410
411 let sel = hist.estimate_less_than_selectivity(&Value::Int64(5), false);
413 assert!(sel > 0.0 && sel < 1.0);
414 }
415
416 #[test]
417 fn test_empty_histogram() {
418 let hist = Histogram::build(&[], 5);
419
420 assert_eq!(hist.bucket_count(), 0);
421 assert_eq!(hist.total_rows(), 0);
422 assert_eq!(hist.estimate_equality_selectivity(&Value::Int64(5)), 0.0);
423 }
424}