supertable-core 0.1.0

Core library for SuperTable, a next-generation open table format
Documentation
//! # SuperTable Statistics
//!
//! This module provides tools for collecting and managing table statistics.

use anyhow::Result;
use arrow::array::RecordBatch;
use std::collections::HashMap;

/// Statistics for a single column.
#[derive(Debug, Clone, serde::Serialize, serde::Deserialize)]
pub struct ColumnStats {
    pub null_count: i64,
    pub nan_count: i64,
    pub min_value: Option<serde_json::Value>,
    pub max_value: Option<serde_json::Value>,
    /// Bloom filter for equality predicate pushdown
    pub bloom_filter: Option<BloomFilter>,
}

/// A simple bitset-based Bloom Filter.
#[derive(Debug, Clone, serde::Serialize, serde::Deserialize)]
pub struct BloomFilter {
    pub bitset: Vec<u64>,
    pub num_hashes: u32,
    pub size_bits: u64,
}

impl BloomFilter {
    pub fn new(expected_elements: usize, false_positive_rate: f64) -> Self {
        let size_bits = (-(expected_elements as f64) * false_positive_rate.ln()
            / (2_f64.ln().powi(2)))
        .ceil() as u64;
        let num_hashes =
            ((size_bits as f64 / expected_elements as f64) * 2_f64.ln()).round() as u32;
        let num_u64s = (size_bits as f64 / 64.0).ceil() as usize;

        Self {
            bitset: vec![0; num_u64s],
            num_hashes,
            size_bits,
        }
    }

    fn hash(&self, value: &[u8], seed: u32) -> u64 {
        use std::collections::hash_map::DefaultHasher;
        use std::hash::{Hash, Hasher};
        let mut hasher = DefaultHasher::new();
        value.hash(&mut hasher);
        seed.hash(&mut hasher);
        hasher.finish() % self.size_bits
    }

    pub fn insert(&mut self, value: &[u8]) {
        for i in 0..self.num_hashes {
            let bit = self.hash(value, i);
            let idx = (bit / 64) as usize;
            let offset = (bit % 64) as u32;
            self.bitset[idx] |= 1 << offset;
        }
    }

    pub fn contains(&self, value: &[u8]) -> bool {
        for i in 0..self.num_hashes {
            let bit = self.hash(value, i);
            let idx = (bit / 64) as usize;
            let offset = (bit % 64) as u32;
            if (self.bitset[idx] & (1 << offset)) == 0 {
                return false;
            }
        }
        true
    }
}

/// Helper to calculate statistics from a RecordBatch.
pub fn calculate_stats(batch: &RecordBatch) -> Result<HashMap<i32, ColumnStats>> {
    let mut stats = HashMap::new();
    let schema = batch.schema();

    for (i, field) in schema.fields().iter().enumerate() {
        let column = batch.column(i);
        let null_count = column.null_count() as i64;

        // Simplified min/max calculation for the prototype.
        // In a real implementation, we'd handle all types and serialize properly.
        let (min_val, max_val) = match field.data_type() {
            arrow::datatypes::DataType::Int32 | arrow::datatypes::DataType::Int64 => {
                // ... logic to get min/max ...
                (None, None)
            }
            _ => (None, None),
        };

        stats.insert(
            (i as i32) + 1, // Placeholder for field ID
            ColumnStats {
                null_count,
                nan_count: 0, // TODO: Implement nan_count
                min_value: min_val,
                max_value: max_val,
                bloom_filter: None,
            },
        );
    }

    Ok(stats)
}