genomicframe-core 0.2.0

//! Shared statistics utilities for genomic data
//!
//! This module provides composable, streaming-friendly statistics
//! that can be computed over large genomic datasets with minimal memory usage.

use crate::parallel::Mergeable;
use std::collections::HashMap;

/// Single-pass accumulator for summary statistics (Welford's online algorithm)
///
/// Tracks count, mean, sample variance / standard deviation and the observed
/// minimum and maximum without storing the individual values. Memory: O(1)
#[derive(Debug, Clone, Default)]
pub struct RunningStats {
    count: usize,
    mean: f64,
    m2: f64, // Running sum of squared deviations from the current mean
    min: Option<f64>,
    max: Option<f64>,
}

impl RunningStats {
    /// Build an empty accumulator (same as `RunningStats::default()`)
    pub fn new() -> Self {
        Self::default()
    }

    /// Fold one observation into the running statistics
    pub fn push(&mut self, value: f64) {
        self.count += 1;

        // Welford update: the deviation is taken once against the old mean and
        // once against the new mean; their product is the increment of `m2`.
        let before = value - self.mean;
        self.mean += before / self.count as f64;
        let after = value - self.mean;
        self.m2 += before * after;

        self.min = Some(match self.min {
            Some(lowest) => lowest.min(value),
            None => value,
        });
        self.max = Some(match self.max {
            Some(highest) => highest.max(value),
            None => value,
        });
    }

    /// Number of values pushed so far
    pub fn count(&self) -> usize {
        self.count
    }

    /// Arithmetic mean, or `None` while the accumulator is empty
    pub fn mean(&self) -> Option<f64> {
        match self.count {
            0 => None,
            _ => Some(self.mean),
        }
    }

    /// Sample variance (divisor `count - 1`), or `None` with fewer than two values
    pub fn variance(&self) -> Option<f64> {
        if self.count < 2 {
            return None;
        }
        let degrees_of_freedom = (self.count - 1) as f64;
        Some(self.m2 / degrees_of_freedom)
    }

    /// Sample standard deviation: the square root of [`variance`](Self::variance)
    pub fn std_dev(&self) -> Option<f64> {
        self.variance().map(f64::sqrt)
    }

    /// Smallest value pushed so far, or `None` while empty
    pub fn min(&self) -> Option<f64> {
        self.min
    }

    /// Largest value pushed so far, or `None` while empty
    pub fn max(&self) -> Option<f64> {
        self.max
    }
}

/// Implementation of Mergeable for parallel statistics computation
///
/// Uses Chan's parallel variance algorithm to correctly merge
/// running statistics from multiple threads.
///
/// Reference: https://en.wikipedia.org/wiki/Algorithms_for_calculating_variance#Parallel_algorithm
impl Mergeable for RunningStats {
    /// Fold `other` into `self` so that `self` describes the union of both samples.
    ///
    /// Merging an empty accumulator is a no-op; merging into an empty
    /// accumulator simply takes over `other`.
    fn merge(&mut self, other: Self) {
        if other.count == 0 {
            return; // Nothing to merge
        }
        if self.count == 0 {
            *self = other; // Replace with other
            return;
        }

        let total_count = self.count + other.count;

        // Convert to f64 *before* multiplying: the integer product
        // `self.count * other.count` overflows `usize` once both partitions are
        // large (panic in debug builds, silent wrap-around in release builds).
        let n_self = self.count as f64;
        let n_other = other.count as f64;
        let n_total = total_count as f64;
        let delta = other.mean - self.mean;

        // Merge min/max; `Option::or` covers the cases where only one side has a value
        self.min = match (self.min, other.min) {
            (Some(a), Some(b)) => Some(a.min(b)),
            (a, b) => a.or(b),
        };
        self.max = match (self.max, other.max) {
            (Some(a), Some(b)) => Some(a.max(b)),
            (a, b) => a.or(b),
        };

        // Merge m2 and mean using Chan's algorithm. m2 must be updated first
        // because `delta` was computed from the pre-merge means.
        self.m2 += other.m2 + delta * delta * (n_self * n_other) / n_total;
        self.mean = (self.mean * n_self + other.mean * n_other) / n_total;
        self.count = total_count;
    }
}

/// Accumulator for categorical data (counts by category)
///
/// Keeps one counter per distinct category plus a running grand total.
///
/// Memory: O(k) where k is the number of unique categories
#[derive(Debug, Clone)]
pub struct CategoryCounter<T: std::hash::Hash + Eq> {
    counts: HashMap<T, usize>,
    total: usize,
}

/// Manual `Default` so an empty counter can be built for any category type
///
/// `#[derive(Default)]` would add a `T: Default` bound, even though an empty
/// counter never needs a default `T`.
impl<T: std::hash::Hash + Eq> Default for CategoryCounter<T> {
    fn default() -> Self {
        Self {
            counts: HashMap::new(),
            total: 0,
        }
    }
}

impl<T: std::hash::Hash + Eq> CategoryCounter<T> {
    /// Create a new, empty category counter
    pub fn new() -> Self {
        Self::default()
    }

    /// Increment count for a category by one
    pub fn increment(&mut self, category: T) {
        self.increment_by(category, 1);
    }

    /// Increment count for a category by a specific amount
    ///
    /// The category is inserted if it has not been seen before, even when
    /// `amount` is zero.
    pub fn increment_by(&mut self, category: T, amount: usize) {
        *self.counts.entry(category).or_insert(0) += amount;
        self.total += amount;
    }

    /// Get count for a specific category (0 if it was never seen)
    pub fn get(&self, category: &T) -> usize {
        self.counts.get(category).copied().unwrap_or(0)
    }

    /// Get total count across all categories
    pub fn total(&self) -> usize {
        self.total
    }

    /// Get the number of unique categories
    pub fn num_categories(&self) -> usize {
        self.counts.len()
    }

    /// Get frequency (proportion of the total) for a category
    ///
    /// Returns 0.0 for an empty counter instead of dividing by zero.
    pub fn frequency(&self, category: &T) -> f64 {
        if self.total == 0 {
            0.0
        } else {
            self.get(category) as f64 / self.total as f64
        }
    }

    /// Get all categories and their counts
    pub fn categories(&self) -> &HashMap<T, usize> {
        &self.counts
    }

    /// Iterate over (category, count) pairs in arbitrary order
    pub fn iter(&self) -> impl Iterator<Item = (&T, &usize)> {
        self.counts.iter()
    }
}

/// Specialized implementation for String to allow efficient &str increments
impl CategoryCounter<String> {
    /// Optimized increment for &str - only allocates when category is new
    ///
    /// This is much more efficient than increment(category.to_string()) because
    /// it only allocates a String when inserting a new category, not on every call.
    ///
    /// For existing categories, this does a HashMap lookup with &str (no allocation),
    /// and only allocates a new String when the category is seen for the first time.
    pub fn increment_str(&mut self, category: &str) {
        match self.counts.get_mut(category) {
            // Known category: bump in place, no allocation
            Some(count) => *count += 1,
            // First sighting: this is the only path that allocates a String
            None => {
                self.counts.insert(category.to_string(), 1);
            }
        }
        self.total += 1;
    }
}

/// Implementation of Mergeable for parallel statistics computation
///
/// Combines two counters by adding every per-category count of `other`
/// onto the matching category of `self`.
impl<T: std::hash::Hash + Eq + Send> Mergeable for CategoryCounter<T> {
    fn merge(&mut self, other: Self) {
        // Only the per-category map is consumed; routing each entry through
        // `increment_by` keeps `self.total` in step with the counts.
        let Self { counts, .. } = other;
        counts
            .into_iter()
            .for_each(|(category, count)| self.increment_by(category, count));
    }
}

/// Percentile calculator using reservoir sampling for memory efficiency
///
/// Stores up to `capacity` samples, then switches to reservoir sampling
/// for approximate percentile calculation.
#[derive(Debug, Clone)]
pub struct PercentileEstimator {
    samples: Vec<f64>,
    capacity: usize,
    total_seen: usize,
}

impl PercentileEstimator {
    /// Create a new percentile estimator with given capacity
    ///
    /// For exact percentiles, set capacity >= expected number of values.
    /// For approximate percentiles on large datasets, use smaller capacity (e.g., 10,000).
    ///
    /// The sample buffer is allocated up front with room for `capacity` values.
    pub fn new(capacity: usize) -> Self {
        Self {
            samples: Vec::with_capacity(capacity),
            capacity,
            total_seen: 0,
        }
    }

    /// Add a value to the estimator
    ///
    /// While fewer than `capacity` values are stored the value is simply kept.
    /// Afterwards the value replaces a random stored sample with probability
    /// `capacity / total_seen`, so which values are retained is not
    /// reproducible across runs.
    pub fn push(&mut self, value: f64) {
        use std::collections::hash_map::RandomState;
        use std::hash::{BuildHasher, Hash, Hasher};

        self.total_seen += 1;

        if self.samples.len() < self.capacity {
            // Still filling buffer
            self.samples.push(value);
            return;
        }

        // Reservoir sampling: a freshly seeded hasher serves as a
        // dependency-free source of pseudo-random numbers.
        let mut hasher = RandomState::new().build_hasher();
        self.total_seen.hash(&mut hasher);
        let random_index = (hasher.finish() as usize) % self.total_seen;

        // Only indices that land inside the reservoir cause a replacement.
        if let Some(slot) = self.samples.get_mut(random_index) {
            *slot = value;
        }
    }

    /// Calculate percentile (0.0 to 1.0)
    ///
    /// `p` outside `[0.0, 1.0]` is clamped to the nearest bound, and a NaN `p`
    /// is treated as 0.0. The result is the stored sample at rank
    /// `floor(p * (n - 1))`; no interpolation is performed.
    ///
    /// Stored samples are sorted in place. NaN samples are ordered by
    /// `f64::total_cmp` rather than causing a panic.
    ///
    /// Returns None if no samples have been added.
    pub fn percentile(&mut self, p: f64) -> Option<f64> {
        if self.samples.is_empty() {
            return None;
        }

        // `total_cmp` is a total order on f64, so NaN samples cannot make the
        // comparator fail the way `partial_cmp(..).unwrap()` does.
        self.samples.sort_by(|a, b| a.total_cmp(b));

        let last = self.samples.len() - 1;
        // Clamp so an out-of-range `p` can never index past the final sample.
        // A NaN `p` stays NaN through `clamp` and casts to index 0.
        let fraction = p.clamp(0.0, 1.0);
        let index = ((fraction * last as f64) as usize).min(last);
        self.samples.get(index).copied()
    }

    /// Get the median (50th percentile)
    pub fn median(&mut self) -> Option<f64> {
        self.percentile(0.5)
    }

    /// Get the total number of values seen (not just stored)
    pub fn total_seen(&self) -> usize {
        self.total_seen
    }
}

#[cfg(test)]
mod tests {
    use super::*;

    /// Absolute tolerance used for floating-point comparisons
    const EPS: f64 = 1e-10;

    /// Build an accumulator pre-loaded with `values`, pushed in order
    fn stats_from(values: &[f64]) -> RunningStats {
        let mut acc = RunningStats::new();
        for &v in values {
            acc.push(v);
        }
        acc
    }

    /// Build a `&str` counter by incrementing each label once, in order
    fn counter_from(labels: &[&'static str]) -> CategoryCounter<&'static str> {
        let mut counter = CategoryCounter::new();
        for &label in labels {
            counter.increment(label);
        }
        counter
    }

    #[test]
    fn test_running_stats() {
        let stats = stats_from(&[10.0, 20.0, 30.0]);

        assert_eq!(stats.count(), 3);
        assert_eq!(stats.mean(), Some(20.0));
        assert_eq!(stats.min(), Some(10.0));
        assert_eq!(stats.max(), Some(30.0));

        // Sample variance of [10, 20, 30] is 100
        let variance = stats.variance().unwrap();
        assert!((variance - 100.0).abs() < EPS);

        // ...so the sample standard deviation is 10
        let std_dev = stats.std_dev().unwrap();
        assert!((std_dev - 10.0).abs() < EPS);
    }

    #[test]
    fn test_category_counter() {
        let counter = counter_from(&["A", "B", "A", "C"]);

        assert_eq!(counter.total(), 4);
        assert_eq!(counter.num_categories(), 3);
        assert_eq!(counter.get(&"A"), 2);
        assert_eq!(counter.get(&"B"), 1);
        assert_eq!(counter.frequency(&"A"), 0.5);
    }

    #[test]
    fn test_percentile_estimator() {
        // Capacity equals the number of values, so nothing is sampled away
        let mut estimator = PercentileEstimator::new(100);
        (1..=100).for_each(|i| estimator.push(i as f64));

        assert_eq!(estimator.total_seen(), 100);

        // The estimator picks a stored sample, so allow one unit of slack
        let median = estimator.median().unwrap();
        assert!((median - 50.5).abs() < 1.0);

        let p95 = estimator.percentile(0.95).unwrap();
        assert!(p95 > 90.0 && p95 <= 100.0);
    }

    #[test]
    fn test_running_stats_merge() {
        // Two partitions of [10, 20, 30, 40, 50]
        let mut merged = stats_from(&[10.0, 20.0, 30.0]);
        let tail = stats_from(&[40.0, 50.0]);

        merged.merge(tail);

        // The merged accumulator must describe the whole data set
        assert_eq!(merged.count(), 5);
        assert_eq!(merged.mean(), Some(30.0)); // (10+20+30+40+50)/5 = 30
        assert_eq!(merged.min(), Some(10.0));
        assert_eq!(merged.max(), Some(50.0));

        // Sample variance of [10, 20, 30, 40, 50] is 250
        let variance = merged.variance().unwrap();
        assert!((variance - 250.0).abs() < EPS);
    }

    #[test]
    fn test_running_stats_merge_empty() {
        let mut populated = stats_from(&[10.0]);
        let empty = RunningStats::new();

        // Merging an empty accumulator must leave the target untouched
        populated.merge(empty);
        assert_eq!(populated.count(), 1);
        assert_eq!(populated.mean(), Some(10.0));
    }

    #[test]
    fn test_category_counter_merge() {
        let mut merged = counter_from(&["A", "B", "A"]);
        let other = counter_from(&["B", "C", "C"]);

        merged.merge(other);

        assert_eq!(merged.total(), 6);
        assert_eq!(merged.get(&"A"), 2);
        assert_eq!(merged.get(&"B"), 2);
        assert_eq!(merged.get(&"C"), 2);
        assert_eq!(merged.num_categories(), 3);
    }

    #[test]
    fn test_mergeable_merge_all() {
        // Three single-value partitions: [10], [20], [30]
        let partials: Vec<RunningStats> = vec![
            stats_from(&[10.0]),
            stats_from(&[20.0]),
            stats_from(&[30.0]),
        ];

        let merged = RunningStats::merge_all(partials).unwrap();

        assert_eq!(merged.count(), 3);
        assert_eq!(merged.mean(), Some(20.0));
    }
}