1use grafeo_common::types::Value;
7use std::cmp::Ordering;
8
9#[derive(Debug, Clone)]
11pub struct HistogramBucket {
12 pub lower: Value,
14 pub upper: Value,
16 pub distinct_count: u64,
18 pub row_count: u64,
20}
21
22impl HistogramBucket {
23 pub fn new(lower: Value, upper: Value, distinct_count: u64, row_count: u64) -> Self {
25 Self {
26 lower,
27 upper,
28 distinct_count,
29 row_count,
30 }
31 }
32
33 pub fn contains(&self, value: &Value) -> bool {
35 compare_values(value, &self.lower) != Some(Ordering::Less)
36 && compare_values(value, &self.upper) != Some(Ordering::Greater)
37 }
38}
39
40#[derive(Debug, Clone)]
44pub struct Histogram {
45 buckets: Vec<HistogramBucket>,
47 total_rows: u64,
49 total_distinct: u64,
51}
52
53impl Histogram {
54 pub fn new(buckets: Vec<HistogramBucket>) -> Self {
56 let total_rows = buckets.iter().map(|b| b.row_count).sum();
57 let total_distinct = buckets.iter().map(|b| b.distinct_count).sum();
58
59 Self {
60 buckets,
61 total_rows,
62 total_distinct,
63 }
64 }
65
66 pub fn build(sorted_values: &[Value], num_buckets: usize) -> Self {
72 if sorted_values.is_empty() {
73 return Self::new(Vec::new());
74 }
75
76 let num_buckets = num_buckets.max(1).min(sorted_values.len());
77 let rows_per_bucket = sorted_values.len() / num_buckets;
78
79 let mut buckets = Vec::with_capacity(num_buckets);
80 let mut current_start = 0;
81
82 for i in 0..num_buckets {
83 let end = if i == num_buckets - 1 {
84 sorted_values.len()
85 } else {
86 current_start + rows_per_bucket
87 };
88
89 if current_start >= sorted_values.len() {
90 break;
91 }
92
93 let bucket_values = &sorted_values[current_start..end];
94 let lower = bucket_values
98 .first()
99 .expect("bucket_values is non-empty: current_start < end")
100 .clone();
101 let upper = bucket_values
102 .last()
103 .expect("bucket_values is non-empty: current_start < end")
104 .clone();
105
106 let mut distinct = 1u64;
108 for j in 1..bucket_values.len() {
109 if bucket_values[j] != bucket_values[j - 1] {
110 distinct += 1;
111 }
112 }
113
114 buckets.push(HistogramBucket::new(
115 lower,
116 upper,
117 distinct,
118 bucket_values.len() as u64,
119 ));
120
121 current_start = end;
122 }
123
124 Self::new(buckets)
125 }
126
127 pub fn bucket_count(&self) -> usize {
129 self.buckets.len()
130 }
131
132 pub fn buckets(&self) -> &[HistogramBucket] {
134 &self.buckets
135 }
136
137 pub fn total_rows(&self) -> u64 {
139 self.total_rows
140 }
141
142 pub fn total_distinct(&self) -> u64 {
144 self.total_distinct
145 }
146
147 pub fn estimate_equality_selectivity(&self, value: &Value) -> f64 {
151 if self.total_rows == 0 {
152 return 0.0;
153 }
154
155 for bucket in &self.buckets {
157 if bucket.contains(value) {
158 if bucket.distinct_count == 0 {
160 return 0.0;
161 }
162 return (bucket.row_count as f64 / bucket.distinct_count as f64)
163 / self.total_rows as f64;
164 }
165 }
166
167 1.0 / self.total_rows as f64
169 }
170
171 pub fn estimate_range_selectivity(
179 &self,
180 lower: Option<&Value>,
181 upper: Option<&Value>,
182 lower_inclusive: bool,
183 upper_inclusive: bool,
184 ) -> f64 {
185 if self.total_rows == 0 {
186 return 0.0;
187 }
188
189 let mut matching_rows = 0.0;
190
191 for bucket in &self.buckets {
192 let bucket_in_range = match (lower, upper) {
194 (None, None) => true,
195 (Some(l), None) => compare_values(&bucket.upper, l) != Some(Ordering::Less),
196 (None, Some(u)) => compare_values(&bucket.lower, u) != Some(Ordering::Greater),
197 (Some(l), Some(u)) => {
198 compare_values(&bucket.upper, l) != Some(Ordering::Less)
199 && compare_values(&bucket.lower, u) != Some(Ordering::Greater)
200 }
201 };
202
203 if !bucket_in_range {
204 continue;
205 }
206
207 let bucket_fraction = estimate_bucket_overlap(
209 &bucket.lower,
210 &bucket.upper,
211 lower,
212 upper,
213 lower_inclusive,
214 upper_inclusive,
215 );
216
217 matching_rows += bucket.row_count as f64 * bucket_fraction;
218 }
219
220 matching_rows / self.total_rows as f64
221 }
222
223 pub fn estimate_less_than_selectivity(&self, value: &Value, inclusive: bool) -> f64 {
225 self.estimate_range_selectivity(None, Some(value), true, inclusive)
226 }
227
228 pub fn estimate_greater_than_selectivity(&self, value: &Value, inclusive: bool) -> f64 {
230 self.estimate_range_selectivity(Some(value), None, inclusive, true)
231 }
232}
233
234fn estimate_bucket_overlap(
236 bucket_lower: &Value,
237 bucket_upper: &Value,
238 range_lower: Option<&Value>,
239 range_upper: Option<&Value>,
240 _lower_inclusive: bool,
241 _upper_inclusive: bool,
242) -> f64 {
243 let range_contains_lower = range_lower
248 .map(|l| compare_values(bucket_lower, l) != Some(Ordering::Less))
249 .unwrap_or(true);
250 let range_contains_upper = range_upper
251 .map(|u| compare_values(bucket_upper, u) != Some(Ordering::Greater))
252 .unwrap_or(true);
253
254 if range_contains_lower && range_contains_upper {
255 return 1.0;
256 }
257
258 match (bucket_lower, bucket_upper) {
261 (Value::Int64(bl), Value::Int64(bu)) => {
262 let bucket_range = (bu - bl) as f64;
263 if bucket_range <= 0.0 {
264 return 1.0;
265 }
266
267 let effective_lower = range_lower
268 .and_then(|l| {
269 if let Value::Int64(li) = l {
270 Some(*li)
271 } else {
272 None
273 }
274 })
275 .unwrap_or(*bl);
276
277 let effective_upper = range_upper
278 .and_then(|u| {
279 if let Value::Int64(ui) = u {
280 Some(*ui)
281 } else {
282 None
283 }
284 })
285 .unwrap_or(*bu);
286
287 let overlap_lower = effective_lower.max(*bl);
288 let overlap_upper = effective_upper.min(*bu);
289
290 if overlap_upper < overlap_lower {
291 return 0.0;
292 }
293
294 (overlap_upper - overlap_lower) as f64 / bucket_range
295 }
296 (Value::Float64(bl), Value::Float64(bu)) => {
297 let bucket_range = bu - bl;
298 if bucket_range <= 0.0 {
299 return 1.0;
300 }
301
302 let effective_lower = range_lower
303 .and_then(|l| {
304 if let Value::Float64(li) = l {
305 Some(*li)
306 } else {
307 None
308 }
309 })
310 .unwrap_or(*bl);
311
312 let effective_upper = range_upper
313 .and_then(|u| {
314 if let Value::Float64(ui) = u {
315 Some(*ui)
316 } else {
317 None
318 }
319 })
320 .unwrap_or(*bu);
321
322 let overlap_lower = effective_lower.max(*bl);
323 let overlap_upper = effective_upper.min(*bu);
324
325 if overlap_upper < overlap_lower {
326 return 0.0;
327 }
328
329 (overlap_upper - overlap_lower) / bucket_range
330 }
331 _ => {
332 0.5
334 }
335 }
336}
337
338fn compare_values(a: &Value, b: &Value) -> Option<Ordering> {
340 match (a, b) {
341 (Value::Int64(a), Value::Int64(b)) => Some(a.cmp(b)),
342 (Value::Float64(a), Value::Float64(b)) => a.partial_cmp(b),
343 (Value::String(a), Value::String(b)) => Some(a.cmp(b)),
344 (Value::Bool(a), Value::Bool(b)) => Some(a.cmp(b)),
345 (Value::Int64(a), Value::Float64(b)) => (*a as f64).partial_cmp(b),
346 (Value::Float64(a), Value::Int64(b)) => a.partial_cmp(&(*b as f64)),
347 _ => None,
348 }
349}
350
351#[cfg(test)]
352mod tests {
353 use super::*;
354
355 fn create_int_values(values: &[i64]) -> Vec<Value> {
356 values.iter().map(|&v| Value::Int64(v)).collect()
357 }
358
359 #[test]
360 fn test_histogram_build() {
361 let values = create_int_values(&[1, 2, 3, 4, 5, 6, 7, 8, 9, 10]);
362 let hist = Histogram::build(&values, 2);
363
364 assert_eq!(hist.bucket_count(), 2);
365 assert_eq!(hist.total_rows(), 10);
366 }
367
368 #[test]
369 fn test_equality_selectivity() {
370 let values = create_int_values(&[1, 1, 2, 2, 2, 3, 3, 3, 3, 4]);
371 let hist = Histogram::build(&values, 4);
372
373 let sel = hist.estimate_equality_selectivity(&Value::Int64(3));
375 assert!(sel > 0.0 && sel < 1.0);
376 }
377
378 #[test]
379 fn test_range_selectivity() {
380 let values = create_int_values(&[1, 2, 3, 4, 5, 6, 7, 8, 9, 10]);
381 let hist = Histogram::build(&values, 5);
382
383 let sel = hist.estimate_range_selectivity(
385 Some(&Value::Int64(1)),
386 Some(&Value::Int64(5)),
387 true,
388 true,
389 );
390 assert!(sel >= 0.3 && sel <= 0.7);
391
392 let sel_full = hist.estimate_range_selectivity(
394 Some(&Value::Int64(1)),
395 Some(&Value::Int64(10)),
396 true,
397 true,
398 );
399 assert!((sel_full - 1.0).abs() < 0.1);
400 }
401
402 #[test]
403 fn test_less_than_selectivity() {
404 let values = create_int_values(&[1, 2, 3, 4, 5, 6, 7, 8, 9, 10]);
405 let hist = Histogram::build(&values, 5);
406
407 let sel = hist.estimate_less_than_selectivity(&Value::Int64(5), false);
409 assert!(sel > 0.0 && sel < 1.0);
410 }
411
412 #[test]
413 fn test_empty_histogram() {
414 let hist = Histogram::build(&[], 5);
415
416 assert_eq!(hist.bucket_count(), 0);
417 assert_eq!(hist.total_rows(), 0);
418 assert_eq!(hist.estimate_equality_selectivity(&Value::Int64(5)), 0.0);
419 }
420}