1use grafeo_common::types::Value;
11use std::cmp::Ordering;
12use std::collections::HashMap;
13
14#[derive(Debug, Clone)]
19pub struct ZoneMapEntry {
20 pub min: Option<Value>,
22 pub max: Option<Value>,
24 pub null_count: u64,
26 pub row_count: u64,
28 pub bloom_filter: Option<BloomFilter>,
30}
31
32impl ZoneMapEntry {
33 pub fn new() -> Self {
35 Self {
36 min: None,
37 max: None,
38 null_count: 0,
39 row_count: 0,
40 bloom_filter: None,
41 }
42 }
43
44 pub fn with_min_max(min: Value, max: Value, null_count: u64, row_count: u64) -> Self {
46 Self {
47 min: Some(min),
48 max: Some(max),
49 null_count,
50 row_count,
51 bloom_filter: None,
52 }
53 }
54
55 pub fn with_bloom_filter(mut self, filter: BloomFilter) -> Self {
57 self.bloom_filter = Some(filter);
58 self
59 }
60
61 pub fn might_contain_equal(&self, value: &Value) -> bool {
65 if matches!(value, Value::Null) {
67 return self.null_count > 0;
68 }
69
70 if self.is_all_null() {
72 return false;
73 }
74
75 if let Some(ref bloom) = self.bloom_filter
77 && !bloom.might_contain(value)
78 {
79 return false;
80 }
81
82 match (&self.min, &self.max) {
84 (Some(min), Some(max)) => {
85 let cmp_min = compare_values(value, min);
86 let cmp_max = compare_values(value, max);
87
88 match (cmp_min, cmp_max) {
90 (Some(Ordering::Less), _) => false, (_, Some(Ordering::Greater)) => false, _ => true,
93 }
94 }
95 _ => self.might_contain_non_null(),
97 }
98 }
99
100 pub fn might_contain_less_than(&self, value: &Value, inclusive: bool) -> bool {
104 match &self.min {
105 Some(min) => {
106 let cmp = compare_values(min, value);
107 match cmp {
108 Some(Ordering::Less) => true,
109 Some(Ordering::Equal) => inclusive,
110 Some(Ordering::Greater) => false,
111 None => true,
112 }
113 }
114 None => self.null_count > 0, }
116 }
117
118 pub fn might_contain_greater_than(&self, value: &Value, inclusive: bool) -> bool {
122 match &self.max {
123 Some(max) => {
124 let cmp = compare_values(max, value);
125 match cmp {
126 Some(Ordering::Greater) => true,
127 Some(Ordering::Equal) => inclusive,
128 Some(Ordering::Less) => false,
129 None => true,
130 }
131 }
132 None => self.null_count > 0,
133 }
134 }
135
136 pub fn might_contain_range(
138 &self,
139 lower: Option<&Value>,
140 upper: Option<&Value>,
141 lower_inclusive: bool,
142 upper_inclusive: bool,
143 ) -> bool {
144 if let Some(lower_val) = lower
146 && !self.might_contain_greater_than(lower_val, lower_inclusive)
147 {
148 return false;
149 }
150
151 if let Some(upper_val) = upper
153 && !self.might_contain_less_than(upper_val, upper_inclusive)
154 {
155 return false;
156 }
157
158 true
159 }
160
161 pub fn might_contain_non_null(&self) -> bool {
163 self.row_count > self.null_count
164 }
165
166 pub fn is_all_null(&self) -> bool {
168 self.row_count > 0 && self.null_count == self.row_count
169 }
170
171 pub fn null_fraction(&self) -> f64 {
173 if self.row_count == 0 {
174 0.0
175 } else {
176 self.null_count as f64 / self.row_count as f64
177 }
178 }
179}
180
181impl Default for ZoneMapEntry {
182 fn default() -> Self {
183 Self::new()
184 }
185}
186
187pub struct ZoneMapBuilder {
194 min: Option<Value>,
195 max: Option<Value>,
196 null_count: u64,
197 row_count: u64,
198 bloom_builder: Option<BloomFilterBuilder>,
199}
200
201const DEFAULT_BLOOM_EXPECTED_ITEMS: usize = 2048;
203
204const DEFAULT_BLOOM_FALSE_POSITIVE_RATE: f64 = 0.01;
206
207impl ZoneMapBuilder {
208 pub fn new() -> Self {
214 Self {
215 min: None,
216 max: None,
217 null_count: 0,
218 row_count: 0,
219 bloom_builder: Some(BloomFilterBuilder::new(
220 DEFAULT_BLOOM_EXPECTED_ITEMS,
221 DEFAULT_BLOOM_FALSE_POSITIVE_RATE,
222 )),
223 }
224 }
225
226 pub fn without_bloom_filter() -> Self {
231 Self {
232 min: None,
233 max: None,
234 null_count: 0,
235 row_count: 0,
236 bloom_builder: None,
237 }
238 }
239
240 pub fn with_bloom_filter(expected_items: usize, false_positive_rate: f64) -> Self {
247 Self {
248 min: None,
249 max: None,
250 null_count: 0,
251 row_count: 0,
252 bloom_builder: Some(BloomFilterBuilder::new(expected_items, false_positive_rate)),
253 }
254 }
255
256 pub fn add(&mut self, value: &Value) {
258 self.row_count += 1;
259
260 if matches!(value, Value::Null) {
261 self.null_count += 1;
262 return;
263 }
264
265 self.min = match &self.min {
267 None => Some(value.clone()),
268 Some(current) => {
269 if compare_values(value, current) == Some(Ordering::Less) {
270 Some(value.clone())
271 } else {
272 self.min.clone()
273 }
274 }
275 };
276
277 self.max = match &self.max {
279 None => Some(value.clone()),
280 Some(current) => {
281 if compare_values(value, current) == Some(Ordering::Greater) {
282 Some(value.clone())
283 } else {
284 self.max.clone()
285 }
286 }
287 };
288
289 if let Some(ref mut bloom) = self.bloom_builder {
291 bloom.add(value);
292 }
293 }
294
295 pub fn build(self) -> ZoneMapEntry {
297 let bloom_filter = self.bloom_builder.map(|b| b.build());
298
299 ZoneMapEntry {
300 min: self.min,
301 max: self.max,
302 null_count: self.null_count,
303 row_count: self.row_count,
304 bloom_filter,
305 }
306 }
307}
308
309impl Default for ZoneMapBuilder {
310 fn default() -> Self {
311 Self::new()
312 }
313}
314
315pub struct ZoneMapIndex {
320 entries: HashMap<u64, ZoneMapEntry>,
322 property: String,
324}
325
326impl ZoneMapIndex {
327 pub fn new(property: impl Into<String>) -> Self {
329 Self {
330 entries: HashMap::new(),
331 property: property.into(),
332 }
333 }
334
335 pub fn property(&self) -> &str {
337 &self.property
338 }
339
340 pub fn insert(&mut self, chunk_id: u64, entry: ZoneMapEntry) {
342 self.entries.insert(chunk_id, entry);
343 }
344
345 pub fn get(&self, chunk_id: u64) -> Option<&ZoneMapEntry> {
347 self.entries.get(&chunk_id)
348 }
349
350 pub fn remove(&mut self, chunk_id: u64) -> Option<ZoneMapEntry> {
352 self.entries.remove(&chunk_id)
353 }
354
355 pub fn len(&self) -> usize {
357 self.entries.len()
358 }
359
360 pub fn is_empty(&self) -> bool {
362 self.entries.is_empty()
363 }
364
365 pub fn filter_equal<'a>(
367 &'a self,
368 value: &'a Value,
369 chunk_ids: impl Iterator<Item = u64> + 'a,
370 ) -> impl Iterator<Item = u64> + 'a {
371 chunk_ids.filter(move |&id| {
372 self.entries
373 .get(&id)
374 .map_or(true, |e| e.might_contain_equal(value)) })
376 }
377
378 pub fn filter_range<'a>(
380 &'a self,
381 lower: Option<&'a Value>,
382 upper: Option<&'a Value>,
383 lower_inclusive: bool,
384 upper_inclusive: bool,
385 chunk_ids: impl Iterator<Item = u64> + 'a,
386 ) -> impl Iterator<Item = u64> + 'a {
387 chunk_ids.filter(move |&id| {
388 self.entries.get(&id).map_or(true, |e| {
389 e.might_contain_range(lower, upper, lower_inclusive, upper_inclusive)
390 })
391 })
392 }
393
394 pub fn chunks_ordered_by_min(&self) -> Vec<u64> {
396 let mut chunks: Vec<_> = self.entries.iter().collect();
397 chunks.sort_by(|(_, a), (_, b)| match (&a.min, &b.min) {
398 (Some(a_min), Some(b_min)) => compare_values(a_min, b_min).unwrap_or(Ordering::Equal),
399 (Some(_), None) => Ordering::Less,
400 (None, Some(_)) => Ordering::Greater,
401 (None, None) => Ordering::Equal,
402 });
403 chunks.into_iter().map(|(&id, _)| id).collect()
404 }
405
406 pub fn overall_stats(&self) -> (Option<Value>, Option<Value>, u64, u64) {
408 let mut min: Option<Value> = None;
409 let mut max: Option<Value> = None;
410 let mut null_count = 0u64;
411 let mut row_count = 0u64;
412
413 for entry in self.entries.values() {
414 null_count += entry.null_count;
415 row_count += entry.row_count;
416
417 if let Some(ref entry_min) = entry.min {
418 min = match min {
419 None => Some(entry_min.clone()),
420 Some(ref current) => {
421 if compare_values(entry_min, current) == Some(Ordering::Less) {
422 Some(entry_min.clone())
423 } else {
424 min
425 }
426 }
427 };
428 }
429
430 if let Some(ref entry_max) = entry.max {
431 max = match max {
432 None => Some(entry_max.clone()),
433 Some(ref current) => {
434 if compare_values(entry_max, current) == Some(Ordering::Greater) {
435 Some(entry_max.clone())
436 } else {
437 max
438 }
439 }
440 };
441 }
442 }
443
444 (min, max, null_count, row_count)
445 }
446}
447
448#[derive(Debug, Clone)]
454pub struct BloomFilter {
455 bits: Vec<u64>,
457 num_hashes: usize,
459 num_bits: usize,
461}
462
463impl BloomFilter {
464 pub fn new(num_bits: usize, num_hashes: usize) -> Self {
466 let num_words = (num_bits + 63) / 64;
467 Self {
468 bits: vec![0; num_words],
469 num_hashes,
470 num_bits,
471 }
472 }
473
474 pub fn add(&mut self, value: &Value) {
476 let hashes = self.compute_hashes(value);
477 for h in hashes {
478 let bit_idx = h % self.num_bits;
479 let word_idx = bit_idx / 64;
480 let bit_pos = bit_idx % 64;
481 self.bits[word_idx] |= 1 << bit_pos;
482 }
483 }
484
485 pub fn might_contain(&self, value: &Value) -> bool {
487 let hashes = self.compute_hashes(value);
488 for h in hashes {
489 let bit_idx = h % self.num_bits;
490 let word_idx = bit_idx / 64;
491 let bit_pos = bit_idx % 64;
492 if (self.bits[word_idx] & (1 << bit_pos)) == 0 {
493 return false;
494 }
495 }
496 true
497 }
498
499 fn compute_hashes(&self, value: &Value) -> Vec<usize> {
500 let base_hash = value_hash(value);
502 let mut hashes = Vec::with_capacity(self.num_hashes);
503
504 for i in 0..self.num_hashes {
505 let h1 = base_hash;
507 let h2 = base_hash.rotate_left(17);
508 hashes.push((h1.wrapping_add((i as u64).wrapping_mul(h2))) as usize);
509 }
510
511 hashes
512 }
513}
514
515pub struct BloomFilterBuilder {
517 filter: BloomFilter,
518}
519
520impl BloomFilterBuilder {
521 pub fn new(expected_items: usize, false_positive_rate: f64) -> Self {
523 let num_bits = optimal_num_bits(expected_items, false_positive_rate);
525 let num_hashes = optimal_num_hashes(num_bits, expected_items);
526
527 Self {
528 filter: BloomFilter::new(num_bits, num_hashes),
529 }
530 }
531
532 pub fn add(&mut self, value: &Value) {
534 self.filter.add(value);
535 }
536
537 pub fn build(self) -> BloomFilter {
539 self.filter
540 }
541}
542
543fn optimal_num_bits(n: usize, p: f64) -> usize {
545 let ln2_squared = std::f64::consts::LN_2 * std::f64::consts::LN_2;
546 ((-(n as f64) * p.ln()) / ln2_squared).ceil() as usize
547}
548
549fn optimal_num_hashes(m: usize, n: usize) -> usize {
551 ((m as f64 / n as f64) * std::f64::consts::LN_2).ceil() as usize
552}
553
554fn value_hash(value: &Value) -> u64 {
556 use std::collections::hash_map::DefaultHasher;
557 use std::hash::{Hash, Hasher};
558
559 let mut hasher = DefaultHasher::new();
560
561 match value {
562 Value::Null => 0u64.hash(&mut hasher),
563 Value::Bool(b) => b.hash(&mut hasher),
564 Value::Int64(i) => i.hash(&mut hasher),
565 Value::Float64(f) => f.to_bits().hash(&mut hasher),
566 Value::String(s) => s.hash(&mut hasher),
567 Value::Bytes(b) => b.hash(&mut hasher),
568 _ => format!("{value:?}").hash(&mut hasher),
569 }
570
571 hasher.finish()
572}
573
574fn compare_values(a: &Value, b: &Value) -> Option<Ordering> {
576 match (a, b) {
577 (Value::Int64(a), Value::Int64(b)) => Some(a.cmp(b)),
578 (Value::Float64(a), Value::Float64(b)) => a.partial_cmp(b),
579 (Value::String(a), Value::String(b)) => Some(a.cmp(b)),
580 (Value::Bool(a), Value::Bool(b)) => Some(a.cmp(b)),
581 (Value::Int64(a), Value::Float64(b)) => (*a as f64).partial_cmp(b),
582 (Value::Float64(a), Value::Int64(b)) => a.partial_cmp(&(*b as f64)),
583 _ => None,
584 }
585}
586
587#[cfg(test)]
588mod tests {
589 use super::*;
590
591 #[test]
592 fn test_zone_map_entry_equal() {
593 let entry = ZoneMapEntry::with_min_max(Value::Int64(10), Value::Int64(100), 0, 100);
594
595 assert!(entry.might_contain_equal(&Value::Int64(50)));
596 assert!(entry.might_contain_equal(&Value::Int64(10)));
597 assert!(entry.might_contain_equal(&Value::Int64(100)));
598 assert!(!entry.might_contain_equal(&Value::Int64(5)));
599 assert!(!entry.might_contain_equal(&Value::Int64(105)));
600 }
601
602 #[test]
603 fn test_zone_map_entry_range() {
604 let entry = ZoneMapEntry::with_min_max(Value::Int64(10), Value::Int64(100), 0, 100);
605
606 assert!(entry.might_contain_range(
608 Some(&Value::Int64(0)),
609 Some(&Value::Int64(200)),
610 true,
611 true
612 ));
613
614 assert!(entry.might_contain_range(
616 Some(&Value::Int64(50)),
617 Some(&Value::Int64(150)),
618 true,
619 true
620 ));
621
622 assert!(!entry.might_contain_range(
624 Some(&Value::Int64(101)),
625 Some(&Value::Int64(200)),
626 true,
627 true
628 ));
629 }
630
631 #[test]
632 fn test_zone_map_builder() {
633 let mut builder = ZoneMapBuilder::new();
634
635 for i in 1..=100 {
636 builder.add(&Value::Int64(i));
637 }
638 builder.add(&Value::Null);
639 builder.add(&Value::Null);
640
641 let entry = builder.build();
642
643 assert_eq!(entry.min, Some(Value::Int64(1)));
644 assert_eq!(entry.max, Some(Value::Int64(100)));
645 assert_eq!(entry.null_count, 2);
646 assert_eq!(entry.row_count, 102);
647 }
648
649 #[test]
650 fn test_zone_map_with_bloom() {
651 let mut builder = ZoneMapBuilder::with_bloom_filter(100, 0.01);
652
653 for i in 1..=100 {
654 builder.add(&Value::Int64(i));
655 }
656
657 let entry = builder.build();
658
659 assert!(entry.bloom_filter.is_some());
660 assert!(entry.might_contain_equal(&Value::Int64(50)));
661 }
663
664 #[test]
665 fn test_zone_map_index() {
666 let mut index = ZoneMapIndex::new("age");
667
668 index.insert(
669 0,
670 ZoneMapEntry::with_min_max(Value::Int64(0), Value::Int64(30), 0, 100),
671 );
672 index.insert(
673 1,
674 ZoneMapEntry::with_min_max(Value::Int64(31), Value::Int64(60), 0, 100),
675 );
676 index.insert(
677 2,
678 ZoneMapEntry::with_min_max(Value::Int64(61), Value::Int64(100), 0, 100),
679 );
680
681 let matching: Vec<_> = index.filter_equal(&Value::Int64(25), 0..3).collect();
683 assert_eq!(matching, vec![0]);
684
685 let matching: Vec<_> = index.filter_equal(&Value::Int64(75), 0..3).collect();
687 assert_eq!(matching, vec![2]);
688
689 let matching: Vec<_> = index
691 .filter_range(
692 Some(&Value::Int64(25)),
693 Some(&Value::Int64(65)),
694 true,
695 true,
696 0..3,
697 )
698 .collect();
699 assert_eq!(matching, vec![0, 1, 2]);
700 }
701
702 #[test]
703 fn test_bloom_filter() {
704 let mut filter = BloomFilter::new(1000, 7);
705
706 for i in 0..100 {
707 filter.add(&Value::Int64(i));
708 }
709
710 for i in 0..100 {
712 assert!(filter.might_contain(&Value::Int64(i)));
713 }
714
715 let _ = filter.might_contain(&Value::Int64(1000));
718 }
719
720 #[test]
721 fn test_zone_map_nulls() {
722 let entry = ZoneMapEntry {
723 min: None,
724 max: None,
725 null_count: 10,
726 row_count: 10,
727 bloom_filter: None,
728 };
729
730 assert!(entry.is_all_null());
731 assert!(!entry.might_contain_non_null());
732 assert!(entry.might_contain_equal(&Value::Null));
733 assert!(!entry.might_contain_equal(&Value::Int64(5)));
734 }
735}