1use grafeo_common::types::Value;
8use std::cmp::Ordering;
9
10#[derive(Debug, Clone)]
12pub struct HistogramBucket {
13 pub lower: Value,
15 pub upper: Value,
17 pub distinct_count: u64,
19 pub row_count: u64,
21}
22
23impl HistogramBucket {
24 pub fn new(lower: Value, upper: Value, distinct_count: u64, row_count: u64) -> Self {
26 Self {
27 lower,
28 upper,
29 distinct_count,
30 row_count,
31 }
32 }
33
34 pub fn contains(&self, value: &Value) -> bool {
36 compare_values(value, &self.lower) != Some(Ordering::Less)
37 && compare_values(value, &self.upper) != Some(Ordering::Greater)
38 }
39}
40
41#[derive(Debug, Clone)]
48pub struct Histogram {
49 buckets: Vec<HistogramBucket>,
51 total_rows: u64,
53 total_distinct: u64,
55}
56
57impl Histogram {
58 pub fn new(buckets: Vec<HistogramBucket>) -> Self {
60 let total_rows = buckets.iter().map(|b| b.row_count).sum();
61 let total_distinct = buckets.iter().map(|b| b.distinct_count).sum();
62
63 Self {
64 buckets,
65 total_rows,
66 total_distinct,
67 }
68 }
69
70 pub fn build(sorted_values: &[Value], num_buckets: usize) -> Self {
80 if sorted_values.is_empty() {
81 return Self::new(Vec::new());
82 }
83
84 let num_buckets = num_buckets.max(1).min(sorted_values.len());
85 let rows_per_bucket = sorted_values.len() / num_buckets;
86
87 let mut buckets = Vec::with_capacity(num_buckets);
88 let mut current_start = 0;
89
90 for i in 0..num_buckets {
91 let end = if i == num_buckets - 1 {
92 sorted_values.len()
93 } else {
94 current_start + rows_per_bucket
95 };
96
97 if current_start >= sorted_values.len() {
98 break;
99 }
100
101 let bucket_values = &sorted_values[current_start..end];
102 let lower = bucket_values
106 .first()
107 .expect("bucket_values is non-empty: current_start < end")
108 .clone();
109 let upper = bucket_values
110 .last()
111 .expect("bucket_values is non-empty: current_start < end")
112 .clone();
113
114 let mut distinct = 1u64;
116 for j in 1..bucket_values.len() {
117 if bucket_values[j] != bucket_values[j - 1] {
118 distinct += 1;
119 }
120 }
121
122 buckets.push(HistogramBucket::new(
123 lower,
124 upper,
125 distinct,
126 bucket_values.len() as u64,
127 ));
128
129 current_start = end;
130 }
131
132 Self::new(buckets)
133 }
134
135 pub fn bucket_count(&self) -> usize {
137 self.buckets.len()
138 }
139
140 pub fn buckets(&self) -> &[HistogramBucket] {
142 &self.buckets
143 }
144
145 pub fn total_rows(&self) -> u64 {
147 self.total_rows
148 }
149
150 pub fn total_distinct(&self) -> u64 {
152 self.total_distinct
153 }
154
155 pub fn estimate_equality_selectivity(&self, value: &Value) -> f64 {
159 if self.total_rows == 0 {
160 return 0.0;
161 }
162
163 for bucket in &self.buckets {
165 if bucket.contains(value) {
166 if bucket.distinct_count == 0 {
168 return 0.0;
169 }
170 return (bucket.row_count as f64 / bucket.distinct_count as f64)
171 / self.total_rows as f64;
172 }
173 }
174
175 1.0 / self.total_rows as f64
177 }
178
179 pub fn estimate_range_selectivity(
187 &self,
188 lower: Option<&Value>,
189 upper: Option<&Value>,
190 lower_inclusive: bool,
191 upper_inclusive: bool,
192 ) -> f64 {
193 if self.total_rows == 0 {
194 return 0.0;
195 }
196
197 let mut matching_rows = 0.0;
198
199 for bucket in &self.buckets {
200 let bucket_in_range = match (lower, upper) {
202 (None, None) => true,
203 (Some(l), None) => compare_values(&bucket.upper, l) != Some(Ordering::Less),
204 (None, Some(u)) => compare_values(&bucket.lower, u) != Some(Ordering::Greater),
205 (Some(l), Some(u)) => {
206 compare_values(&bucket.upper, l) != Some(Ordering::Less)
207 && compare_values(&bucket.lower, u) != Some(Ordering::Greater)
208 }
209 };
210
211 if !bucket_in_range {
212 continue;
213 }
214
215 let bucket_fraction = estimate_bucket_overlap(
217 &bucket.lower,
218 &bucket.upper,
219 lower,
220 upper,
221 lower_inclusive,
222 upper_inclusive,
223 );
224
225 matching_rows += bucket.row_count as f64 * bucket_fraction;
226 }
227
228 matching_rows / self.total_rows as f64
229 }
230
231 pub fn estimate_less_than_selectivity(&self, value: &Value, inclusive: bool) -> f64 {
233 self.estimate_range_selectivity(None, Some(value), true, inclusive)
234 }
235
236 pub fn estimate_greater_than_selectivity(&self, value: &Value, inclusive: bool) -> f64 {
238 self.estimate_range_selectivity(Some(value), None, inclusive, true)
239 }
240}
241
242fn estimate_bucket_overlap(
244 bucket_lower: &Value,
245 bucket_upper: &Value,
246 range_lower: Option<&Value>,
247 range_upper: Option<&Value>,
248 _lower_inclusive: bool,
249 _upper_inclusive: bool,
250) -> f64 {
251 let range_contains_lower = range_lower.map_or(true, |l| {
256 compare_values(bucket_lower, l) != Some(Ordering::Less)
257 });
258 let range_contains_upper = range_upper.map_or(true, |u| {
259 compare_values(bucket_upper, u) != Some(Ordering::Greater)
260 });
261
262 if range_contains_lower && range_contains_upper {
263 return 1.0;
264 }
265
266 match (bucket_lower, bucket_upper) {
269 (Value::Int64(bl), Value::Int64(bu)) => {
270 let bucket_range = (bu - bl) as f64;
271 if bucket_range <= 0.0 {
272 return 1.0;
273 }
274
275 let effective_lower = range_lower
276 .and_then(|l| {
277 if let Value::Int64(li) = l {
278 Some(*li)
279 } else {
280 None
281 }
282 })
283 .unwrap_or(*bl);
284
285 let effective_upper = range_upper
286 .and_then(|u| {
287 if let Value::Int64(ui) = u {
288 Some(*ui)
289 } else {
290 None
291 }
292 })
293 .unwrap_or(*bu);
294
295 let overlap_lower = effective_lower.max(*bl);
296 let overlap_upper = effective_upper.min(*bu);
297
298 if overlap_upper < overlap_lower {
299 return 0.0;
300 }
301
302 (overlap_upper - overlap_lower) as f64 / bucket_range
303 }
304 (Value::Float64(bl), Value::Float64(bu)) => {
305 let bucket_range = bu - bl;
306 if bucket_range <= 0.0 {
307 return 1.0;
308 }
309
310 let effective_lower = range_lower
311 .and_then(|l| {
312 if let Value::Float64(li) = l {
313 Some(*li)
314 } else {
315 None
316 }
317 })
318 .unwrap_or(*bl);
319
320 let effective_upper = range_upper
321 .and_then(|u| {
322 if let Value::Float64(ui) = u {
323 Some(*ui)
324 } else {
325 None
326 }
327 })
328 .unwrap_or(*bu);
329
330 let overlap_lower = effective_lower.max(*bl);
331 let overlap_upper = effective_upper.min(*bu);
332
333 if overlap_upper < overlap_lower {
334 return 0.0;
335 }
336
337 (overlap_upper - overlap_lower) / bucket_range
338 }
339 _ => {
340 0.5
342 }
343 }
344}
345
346fn compare_values(a: &Value, b: &Value) -> Option<Ordering> {
348 match (a, b) {
349 (Value::Int64(a), Value::Int64(b)) => Some(a.cmp(b)),
350 (Value::Float64(a), Value::Float64(b)) => a.partial_cmp(b),
351 (Value::String(a), Value::String(b)) => Some(a.cmp(b)),
352 (Value::Bool(a), Value::Bool(b)) => Some(a.cmp(b)),
353 (Value::Int64(a), Value::Float64(b)) => (*a as f64).partial_cmp(b),
354 (Value::Float64(a), Value::Int64(b)) => a.partial_cmp(&(*b as f64)),
355 (Value::Timestamp(a), Value::Timestamp(b)) => Some(a.cmp(b)),
356 (Value::Date(a), Value::Date(b)) => Some(a.cmp(b)),
357 (Value::Time(a), Value::Time(b)) => Some(a.cmp(b)),
358 _ => None,
359 }
360}
361
362#[cfg(test)]
363mod tests {
364 use super::*;
365
366 fn create_int_values(values: &[i64]) -> Vec<Value> {
367 values.iter().map(|&v| Value::Int64(v)).collect()
368 }
369
370 #[test]
371 fn test_histogram_build() {
372 let values = create_int_values(&[1, 2, 3, 4, 5, 6, 7, 8, 9, 10]);
373 let hist = Histogram::build(&values, 2);
374
375 assert_eq!(hist.bucket_count(), 2);
376 assert_eq!(hist.total_rows(), 10);
377 }
378
379 #[test]
380 fn test_equality_selectivity() {
381 let values = create_int_values(&[1, 1, 2, 2, 2, 3, 3, 3, 3, 4]);
382 let hist = Histogram::build(&values, 4);
383
384 let sel = hist.estimate_equality_selectivity(&Value::Int64(3));
386 assert!(sel > 0.0 && sel < 1.0);
387 }
388
389 #[test]
390 fn test_range_selectivity() {
391 let values = create_int_values(&[1, 2, 3, 4, 5, 6, 7, 8, 9, 10]);
392 let hist = Histogram::build(&values, 5);
393
394 let sel = hist.estimate_range_selectivity(
396 Some(&Value::Int64(1)),
397 Some(&Value::Int64(5)),
398 true,
399 true,
400 );
401 assert!((0.3..=0.7).contains(&sel));
402
403 let sel_full = hist.estimate_range_selectivity(
405 Some(&Value::Int64(1)),
406 Some(&Value::Int64(10)),
407 true,
408 true,
409 );
410 assert!((sel_full - 1.0).abs() < 0.1);
411 }
412
413 #[test]
414 fn test_less_than_selectivity() {
415 let values = create_int_values(&[1, 2, 3, 4, 5, 6, 7, 8, 9, 10]);
416 let hist = Histogram::build(&values, 5);
417
418 let sel = hist.estimate_less_than_selectivity(&Value::Int64(5), false);
420 assert!(sel > 0.0 && sel < 1.0);
421 }
422
423 #[test]
424 fn test_empty_histogram() {
425 let hist = Histogram::build(&[], 5);
426
427 assert_eq!(hist.bucket_count(), 0);
428 assert_eq!(hist.total_rows(), 0);
429 assert_eq!(hist.estimate_equality_selectivity(&Value::Int64(5)), 0.0);
430 }
431}