reddb_server/storage/query/planner/
histogram.rs1use std::cmp::Ordering;
24
25#[derive(Debug, Clone)]
33pub enum ColumnValue {
34 Int(i64),
35 Float(f64),
36 Text(String),
37}
38
39impl ColumnValue {
40 fn cmp_inner(&self, other: &ColumnValue) -> Ordering {
41 match (self, other) {
42 (ColumnValue::Int(a), ColumnValue::Int(b)) => a.cmp(b),
43 (ColumnValue::Float(a), ColumnValue::Float(b)) => {
44 a.partial_cmp(b).unwrap_or(Ordering::Equal)
45 }
46 (ColumnValue::Text(a), ColumnValue::Text(b)) => a.cmp(b),
47 (ColumnValue::Int(_), _) => Ordering::Less,
52 (ColumnValue::Float(_), ColumnValue::Int(_)) => Ordering::Greater,
53 (ColumnValue::Float(_), _) => Ordering::Less,
54 (ColumnValue::Text(_), _) => Ordering::Greater,
55 }
56 }
57}
58
59impl PartialEq for ColumnValue {
60 fn eq(&self, other: &Self) -> bool {
61 self.cmp_inner(other) == Ordering::Equal
62 }
63}
64
65impl Eq for ColumnValue {}
66
67impl std::hash::Hash for ColumnValue {
68 fn hash<H: std::hash::Hasher>(&self, state: &mut H) {
69 std::mem::discriminant(self).hash(state);
70 match self {
71 ColumnValue::Int(v) => v.hash(state),
72 ColumnValue::Float(v) => v.to_bits().hash(state),
73 ColumnValue::Text(v) => v.hash(state),
74 }
75 }
76}
77
78impl PartialOrd for ColumnValue {
79 fn partial_cmp(&self, other: &Self) -> Option<Ordering> {
80 Some(std::cmp::Ord::cmp(self, other))
81 }
82}
83
84impl Ord for ColumnValue {
85 fn cmp(&self, other: &Self) -> Ordering {
86 self.cmp_inner(other)
87 }
88}
89
90#[derive(Debug, Clone)]
94pub struct Bucket {
95 pub min: ColumnValue,
96 pub max: ColumnValue,
97 pub count: u64,
98}
99
100#[derive(Debug, Clone)]
107pub struct Histogram {
108 pub buckets: Vec<Bucket>,
109 pub total_count: u64,
110}
111
112impl Histogram {
113 pub fn equi_depth_from_sample(mut values: Vec<ColumnValue>, bucket_count: usize) -> Self {
119 if values.is_empty() {
120 return Self {
121 buckets: Vec::new(),
122 total_count: 0,
123 };
124 }
125 let bucket_count = bucket_count.clamp(1, values.len());
126 values.sort();
127 let total = values.len();
128 let per_bucket = total / bucket_count;
129 let mut remainder = total % bucket_count;
130 let mut buckets = Vec::with_capacity(bucket_count);
131 let mut idx = 0;
132 for _ in 0..bucket_count {
133 let take = if remainder > 0 {
134 remainder -= 1;
135 per_bucket + 1
136 } else {
137 per_bucket
138 };
139 if take == 0 {
140 break;
141 }
142 let end = (idx + take).min(values.len());
143 let min = values[idx].clone();
144 let max = values[end - 1].clone();
145 buckets.push(Bucket {
146 min,
147 max,
148 count: take as u64,
149 });
150 idx = end;
151 }
152 Self {
153 buckets,
154 total_count: total as u64,
155 }
156 }
157
158 pub fn range_selectivity(&self, lo: Option<&ColumnValue>, hi: Option<&ColumnValue>) -> f64 {
167 if self.total_count == 0 || self.buckets.is_empty() {
168 return 1.0;
169 }
170 let mut matched: f64 = 0.0;
171 for bucket in &self.buckets {
172 let bucket_size = bucket.count as f64;
173 let bucket_below_query = hi.is_some() && bucket.min > *hi.unwrap();
176 let bucket_above_query = lo.is_some() && bucket.max < *lo.unwrap();
177 if bucket_below_query || bucket_above_query {
178 continue;
179 }
180 let fully_inside_low = lo.is_none() || bucket.min >= *lo.unwrap();
182 let fully_inside_high = hi.is_none() || bucket.max <= *hi.unwrap();
183 if fully_inside_low && fully_inside_high {
184 matched += bucket_size;
185 continue;
186 }
187 matched += bucket_size * 0.5;
194 }
195 let result = matched / self.total_count as f64;
196 result.clamp(0.0, 1.0)
197 }
198
199 pub fn bucket_count(&self) -> usize {
201 self.buckets.len()
202 }
203}
204
205#[derive(Debug, Clone, Default)]
212pub struct MostCommonValues {
213 pub values: Vec<(ColumnValue, f64)>,
214}
215
216impl MostCommonValues {
217 pub fn new(mut entries: Vec<(ColumnValue, f64)>) -> Self {
222 entries.sort_by(|a, b| b.1.partial_cmp(&a.1).unwrap_or(Ordering::Equal));
223 Self { values: entries }
224 }
225
226 pub fn frequency_of(&self, value: &ColumnValue) -> Option<f64> {
229 self.values
230 .iter()
231 .find(|(v, _)| v == value)
232 .map(|(_, f)| *f)
233 }
234
235 pub fn total_frequency(&self) -> f64 {
238 self.values.iter().map(|(_, f)| *f).sum()
239 }
240
241 pub fn len(&self) -> usize {
243 self.values.len()
244 }
245
246 pub fn is_empty(&self) -> bool {
248 self.values.is_empty()
249 }
250}
251
252#[cfg(test)]
253mod tests {
254 use super::*;
255
256 fn ints(vals: &[i64]) -> Vec<ColumnValue> {
257 vals.iter().map(|&i| ColumnValue::Int(i)).collect()
258 }
259
260 #[test]
261 fn empty_sample_produces_empty_histogram() {
262 let h = Histogram::equi_depth_from_sample(vec![], 4);
263 assert_eq!(h.bucket_count(), 0);
264 assert_eq!(h.total_count, 0);
265 assert_eq!(h.range_selectivity(None, None), 1.0);
267 }
268
269 #[test]
270 fn equi_depth_buckets_are_roughly_equal_count() {
271 let sample: Vec<ColumnValue> = (0..100i64).map(ColumnValue::Int).collect();
272 let h = Histogram::equi_depth_from_sample(sample, 10);
273 assert_eq!(h.bucket_count(), 10);
274 assert_eq!(h.total_count, 100);
275 for bucket in &h.buckets {
276 assert_eq!(bucket.count, 10);
277 }
278 }
279
280 #[test]
281 fn equi_depth_distributes_remainder() {
282 let sample: Vec<ColumnValue> = (0..13i64).map(ColumnValue::Int).collect();
284 let h = Histogram::equi_depth_from_sample(sample, 4);
285 assert_eq!(h.bucket_count(), 4);
286 let total: u64 = h.buckets.iter().map(|b| b.count).sum();
287 assert_eq!(total, 13);
288 let counts: Vec<u64> = h.buckets.iter().map(|b| b.count).collect();
291 assert_eq!(counts.iter().sum::<u64>(), 13);
292 assert!(counts.iter().all(|&c| c >= 3 && c <= 4));
293 }
294
295 #[test]
296 fn range_selectivity_full_table_is_one() {
297 let h = Histogram::equi_depth_from_sample(ints(&[1, 2, 3, 4, 5]), 5);
298 assert_eq!(h.range_selectivity(None, None), 1.0);
300 }
301
302 #[test]
303 fn range_selectivity_below_min_is_zero() {
304 let h = Histogram::equi_depth_from_sample(ints(&[10, 20, 30, 40]), 4);
305 let zero = ColumnValue::Int(0);
306 let five = ColumnValue::Int(5);
307 let s = h.range_selectivity(Some(&zero), Some(&five));
308 assert_eq!(s, 0.0);
309 }
310
311 #[test]
312 fn range_selectivity_above_max_is_zero() {
313 let h = Histogram::equi_depth_from_sample(ints(&[10, 20, 30, 40]), 4);
314 let hi = ColumnValue::Int(100);
315 let lo = ColumnValue::Int(50);
316 let s = h.range_selectivity(Some(&lo), Some(&hi));
317 assert_eq!(s, 0.0);
318 }
319
320 #[test]
321 fn histogram_beats_uniform_on_skewed_range() {
322 let mut sample: Vec<ColumnValue> = Vec::new();
324 for i in 0..80 {
325 sample.push(ColumnValue::Int(i % 10));
326 }
327 for i in 0..20 {
328 sample.push(ColumnValue::Int(10 + i * 50));
329 }
330 let h = Histogram::equi_depth_from_sample(sample, 10);
331 let nine = ColumnValue::Int(9);
334 let s = h.range_selectivity(None, Some(&nine));
335 assert!(s > 0.5, "histogram selectivity {} should exceed 0.5", s);
336 assert!(s <= 1.0);
337 }
338
339 #[test]
340 fn range_selectivity_clamped_to_unit_interval() {
341 let h = Histogram::equi_depth_from_sample(ints(&[1, 2, 3]), 3);
342 let lo = ColumnValue::Int(99);
344 let hi = ColumnValue::Int(100);
345 let s = h.range_selectivity(Some(&lo), Some(&hi));
346 assert!((0.0..=1.0).contains(&s));
347 assert_eq!(s, 0.0);
348 }
349
350 #[test]
353 fn mcv_frequency_lookup() {
354 let mcv = MostCommonValues::new(vec![
355 (ColumnValue::Int(7), 0.5),
356 (ColumnValue::Int(42), 0.3),
357 (ColumnValue::Int(99), 0.05),
358 ]);
359 assert_eq!(mcv.frequency_of(&ColumnValue::Int(7)), Some(0.5));
360 assert_eq!(mcv.frequency_of(&ColumnValue::Int(42)), Some(0.3));
361 assert!(mcv.frequency_of(&ColumnValue::Int(0)).is_none());
362 }
363
364 #[test]
365 fn mcv_total_frequency_sums_correctly() {
366 let mcv = MostCommonValues::new(vec![
367 (ColumnValue::Int(1), 0.4),
368 (ColumnValue::Int(2), 0.3),
369 (ColumnValue::Int(3), 0.2),
370 ]);
371 let total = mcv.total_frequency();
372 assert!((total - 0.9).abs() < 1e-9);
373 }
374
375 #[test]
376 fn mcv_sorts_by_frequency_descending() {
377 let mcv = MostCommonValues::new(vec![
378 (ColumnValue::Int(1), 0.1),
379 (ColumnValue::Int(2), 0.5),
380 (ColumnValue::Int(3), 0.2),
381 ]);
382 assert_eq!(mcv.values[0].1, 0.5);
383 assert_eq!(mcv.values[1].1, 0.2);
384 assert_eq!(mcv.values[2].1, 0.1);
385 }
386
387 #[test]
388 fn mcv_beats_uniform_on_skewed_eq() {
389 let mcv = MostCommonValues::new(vec![
391 (ColumnValue::Text("boss".to_string()), 0.5),
392 (ColumnValue::Text("alice".to_string()), 0.05),
393 (ColumnValue::Text("bob".to_string()), 0.05),
394 ]);
395 let boss = ColumnValue::Text("boss".to_string());
396 let freq = mcv.frequency_of(&boss).unwrap();
397 assert_eq!(freq, 0.5);
399 assert!(freq > 0.01);
400 }
401
402 #[test]
403 fn mcv_empty_when_no_values() {
404 let mcv = MostCommonValues::new(vec![]);
405 assert!(mcv.is_empty());
406 assert_eq!(mcv.len(), 0);
407 assert_eq!(mcv.total_frequency(), 0.0);
408 assert!(mcv.frequency_of(&ColumnValue::Int(1)).is_none());
409 }
410}