supertable_core/
statistics.rs1use anyhow::Result;
6use arrow::array::RecordBatch;
7use std::collections::HashMap;
8
9#[derive(Debug, Clone, serde::Serialize, serde::Deserialize)]
11pub struct ColumnStats {
12 pub null_count: i64,
13 pub nan_count: i64,
14 pub min_value: Option<serde_json::Value>,
15 pub max_value: Option<serde_json::Value>,
16 pub bloom_filter: Option<BloomFilter>,
18}
19
20#[derive(Debug, Clone, serde::Serialize, serde::Deserialize)]
22pub struct BloomFilter {
23 pub bitset: Vec<u64>,
24 pub num_hashes: u32,
25 pub size_bits: u64,
26}
27
28impl BloomFilter {
29 pub fn new(expected_elements: usize, false_positive_rate: f64) -> Self {
30 let size_bits = (-(expected_elements as f64) * false_positive_rate.ln()
31 / (2_f64.ln().powi(2)))
32 .ceil() as u64;
33 let num_hashes =
34 ((size_bits as f64 / expected_elements as f64) * 2_f64.ln()).round() as u32;
35 let num_u64s = (size_bits as f64 / 64.0).ceil() as usize;
36
37 Self {
38 bitset: vec![0; num_u64s],
39 num_hashes,
40 size_bits,
41 }
42 }
43
44 fn hash(&self, value: &[u8], seed: u32) -> u64 {
45 use std::collections::hash_map::DefaultHasher;
46 use std::hash::{Hash, Hasher};
47 let mut hasher = DefaultHasher::new();
48 value.hash(&mut hasher);
49 seed.hash(&mut hasher);
50 hasher.finish() % self.size_bits
51 }
52
53 pub fn insert(&mut self, value: &[u8]) {
54 for i in 0..self.num_hashes {
55 let bit = self.hash(value, i);
56 let idx = (bit / 64) as usize;
57 let offset = (bit % 64) as u32;
58 self.bitset[idx] |= 1 << offset;
59 }
60 }
61
62 pub fn contains(&self, value: &[u8]) -> bool {
63 for i in 0..self.num_hashes {
64 let bit = self.hash(value, i);
65 let idx = (bit / 64) as usize;
66 let offset = (bit % 64) as u32;
67 if (self.bitset[idx] & (1 << offset)) == 0 {
68 return false;
69 }
70 }
71 true
72 }
73}
74
75pub fn calculate_stats(batch: &RecordBatch) -> Result<HashMap<i32, ColumnStats>> {
77 let mut stats = HashMap::new();
78 let schema = batch.schema();
79
80 for (i, field) in schema.fields().iter().enumerate() {
81 let column = batch.column(i);
82 let null_count = column.null_count() as i64;
83
84 let (min_val, max_val) = match field.data_type() {
87 arrow::datatypes::DataType::Int32 | arrow::datatypes::DataType::Int64 => {
88 (None, None)
90 }
91 _ => (None, None),
92 };
93
94 stats.insert(
95 (i as i32) + 1, ColumnStats {
97 null_count,
98 nan_count: 0, min_value: min_val,
100 max_value: max_val,
101 bloom_filter: None,
102 },
103 );
104 }
105
106 Ok(stats)
107}