Skip to main content

supertable_core/
statistics.rs

1//! # SuperTable Statistics
2//!
3//! This module provides tools for collecting and managing table statistics.
4
5use anyhow::Result;
6use arrow::array::RecordBatch;
7use std::collections::HashMap;
8
9/// Statistics for a single column.
10#[derive(Debug, Clone, serde::Serialize, serde::Deserialize)]
11pub struct ColumnStats {
12    pub null_count: i64,
13    pub nan_count: i64,
14    pub min_value: Option<serde_json::Value>,
15    pub max_value: Option<serde_json::Value>,
16    /// Bloom filter for equality predicate pushdown
17    pub bloom_filter: Option<BloomFilter>,
18}
19
20/// A simple bitset-based Bloom Filter.
21#[derive(Debug, Clone, serde::Serialize, serde::Deserialize)]
22pub struct BloomFilter {
23    pub bitset: Vec<u64>,
24    pub num_hashes: u32,
25    pub size_bits: u64,
26}
27
28impl BloomFilter {
29    pub fn new(expected_elements: usize, false_positive_rate: f64) -> Self {
30        let size_bits = (-(expected_elements as f64) * false_positive_rate.ln()
31            / (2_f64.ln().powi(2)))
32        .ceil() as u64;
33        let num_hashes =
34            ((size_bits as f64 / expected_elements as f64) * 2_f64.ln()).round() as u32;
35        let num_u64s = (size_bits as f64 / 64.0).ceil() as usize;
36
37        Self {
38            bitset: vec![0; num_u64s],
39            num_hashes,
40            size_bits,
41        }
42    }
43
44    fn hash(&self, value: &[u8], seed: u32) -> u64 {
45        use std::collections::hash_map::DefaultHasher;
46        use std::hash::{Hash, Hasher};
47        let mut hasher = DefaultHasher::new();
48        value.hash(&mut hasher);
49        seed.hash(&mut hasher);
50        hasher.finish() % self.size_bits
51    }
52
53    pub fn insert(&mut self, value: &[u8]) {
54        for i in 0..self.num_hashes {
55            let bit = self.hash(value, i);
56            let idx = (bit / 64) as usize;
57            let offset = (bit % 64) as u32;
58            self.bitset[idx] |= 1 << offset;
59        }
60    }
61
62    pub fn contains(&self, value: &[u8]) -> bool {
63        for i in 0..self.num_hashes {
64            let bit = self.hash(value, i);
65            let idx = (bit / 64) as usize;
66            let offset = (bit % 64) as u32;
67            if (self.bitset[idx] & (1 << offset)) == 0 {
68                return false;
69            }
70        }
71        true
72    }
73}
74
75/// Helper to calculate statistics from a RecordBatch.
76pub fn calculate_stats(batch: &RecordBatch) -> Result<HashMap<i32, ColumnStats>> {
77    let mut stats = HashMap::new();
78    let schema = batch.schema();
79
80    for (i, field) in schema.fields().iter().enumerate() {
81        let column = batch.column(i);
82        let null_count = column.null_count() as i64;
83
84        // Simplified min/max calculation for the prototype.
85        // In a real implementation, we'd handle all types and serialize properly.
86        let (min_val, max_val) = match field.data_type() {
87            arrow::datatypes::DataType::Int32 | arrow::datatypes::DataType::Int64 => {
88                // ... logic to get min/max ...
89                (None, None)
90            }
91            _ => (None, None),
92        };
93
94        stats.insert(
95            (i as i32) + 1, // Placeholder for field ID
96            ColumnStats {
97                null_count,
98                nan_count: 0, // TODO: Implement nan_count
99                min_value: min_val,
100                max_value: max_val,
101                bloom_filter: None,
102            },
103        );
104    }
105
106    Ok(stats)
107}