use anyhow::Result;
use arrow::array::RecordBatch;
use std::collections::HashMap;
#[derive(Debug, Clone, serde::Serialize, serde::Deserialize)]
pub struct ColumnStats {
pub null_count: i64,
pub nan_count: i64,
pub min_value: Option<serde_json::Value>,
pub max_value: Option<serde_json::Value>,
pub bloom_filter: Option<BloomFilter>,
}
#[derive(Debug, Clone, serde::Serialize, serde::Deserialize)]
pub struct BloomFilter {
pub bitset: Vec<u64>,
pub num_hashes: u32,
pub size_bits: u64,
}
impl BloomFilter {
pub fn new(expected_elements: usize, false_positive_rate: f64) -> Self {
let size_bits = (-(expected_elements as f64) * false_positive_rate.ln()
/ (2_f64.ln().powi(2)))
.ceil() as u64;
let num_hashes =
((size_bits as f64 / expected_elements as f64) * 2_f64.ln()).round() as u32;
let num_u64s = (size_bits as f64 / 64.0).ceil() as usize;
Self {
bitset: vec![0; num_u64s],
num_hashes,
size_bits,
}
}
fn hash(&self, value: &[u8], seed: u32) -> u64 {
use std::collections::hash_map::DefaultHasher;
use std::hash::{Hash, Hasher};
let mut hasher = DefaultHasher::new();
value.hash(&mut hasher);
seed.hash(&mut hasher);
hasher.finish() % self.size_bits
}
pub fn insert(&mut self, value: &[u8]) {
for i in 0..self.num_hashes {
let bit = self.hash(value, i);
let idx = (bit / 64) as usize;
let offset = (bit % 64) as u32;
self.bitset[idx] |= 1 << offset;
}
}
pub fn contains(&self, value: &[u8]) -> bool {
for i in 0..self.num_hashes {
let bit = self.hash(value, i);
let idx = (bit / 64) as usize;
let offset = (bit % 64) as u32;
if (self.bitset[idx] & (1 << offset)) == 0 {
return false;
}
}
true
}
}
pub fn calculate_stats(batch: &RecordBatch) -> Result<HashMap<i32, ColumnStats>> {
let mut stats = HashMap::new();
let schema = batch.schema();
for (i, field) in schema.fields().iter().enumerate() {
let column = batch.column(i);
let null_count = column.null_count() as i64;
let (min_val, max_val) = match field.data_type() {
arrow::datatypes::DataType::Int32 | arrow::datatypes::DataType::Int64 => {
(None, None)
}
_ => (None, None),
};
stats.insert(
(i as i32) + 1, ColumnStats {
null_count,
nan_count: 0, min_value: min_val,
max_value: max_val,
bloom_filter: None,
},
);
}
Ok(stats)
}