genomicframe-core 0.2.0

High-performance genomics I/O and interoperability layer
Documentation
//! BED statistics computation
//!
//! Streaming statistics collection for BED interval files.
//! Memory-efficient: processes millions of intervals with O(1) memory for basic stats.

use crate::core::GenomicRecordIterator;
use crate::error::Result;
use crate::parallel::Mergeable;
use crate::stats::{CategoryCounter, RunningStats};

use super::reader::{BedReader, BedRecord};

/// Comprehensive statistics for BED files
#[derive(Debug, Clone)]
pub struct BedStats {
    /// Total number of intervals
    pub total_intervals: usize,

    /// Number of BED3 records (minimal fields only)
    pub bed3_count: usize,

    /// Number of BED6 records (includes name, score, strand)
    pub bed6_count: usize,

    /// Number of BED12 records (includes blocks/exons)
    pub bed12_count: usize,

    /// Interval length statistics
    pub length_stats: RunningStats,

    /// Distribution of interval lengths
    pub length_distribution: CategoryCounter<u64>,

    /// Intervals per chromosome
    pub intervals_per_chrom: CategoryCounter<String>,

    /// Score statistics (for BED6+ records)
    pub score_stats: RunningStats,

    /// Strand distribution (for BED6+ records)
    pub plus_strand: usize,
    pub minus_strand: usize,
    pub unstranded: usize,

    /// Block/exon statistics (for BED12 records)
    pub block_count_stats: RunningStats,
    pub total_blocks: usize,

    /// Total genomic span covered (sum of all interval lengths)
    pub total_span: u64,

    /// Min interval length
    pub min_length: u64,

    /// Max interval length
    pub max_length: u64,
}

impl Default for BedStats {
    fn default() -> Self {
        Self::new()
    }
}

impl BedStats {
    /// Create a new statistics collector
    pub fn new() -> Self {
        Self {
            total_intervals: 0,
            bed3_count: 0,
            bed6_count: 0,
            bed12_count: 0,
            length_stats: RunningStats::new(),
            length_distribution: CategoryCounter::new(),
            intervals_per_chrom: CategoryCounter::new(),
            score_stats: RunningStats::new(),
            plus_strand: 0,
            minus_strand: 0,
            unstranded: 0,
            block_count_stats: RunningStats::new(),
            total_blocks: 0,
            total_span: 0,
            min_length: u64::MAX,
            max_length: 0,
        }
    }

    /// Update statistics with a single record (streaming)
    pub fn update(&mut self, record: &BedRecord) {
        self.total_intervals += 1;

        // Classify record type
        if record.is_bed12() {
            self.bed12_count += 1;
        } else if record.is_bed6() {
            self.bed6_count += 1;
        } else {
            self.bed3_count += 1;
        }

        // Interval length
        let length = record.len();
        self.length_stats.push(length as f64);
        self.length_distribution.increment(length);
        self.total_span += length;
        self.min_length = self.min_length.min(length);
        self.max_length = self.max_length.max(length);

        // Chromosome distribution
        self.intervals_per_chrom.increment_str(&record.chrom);

        // Score statistics (BED6+)
        if let Some(score) = record.score {
            self.score_stats.push(score);
        }

        // Strand distribution (BED6+)
        match record.strand {
            Some('+') => self.plus_strand += 1,
            Some('-') => self.minus_strand += 1,
            _ => self.unstranded += 1,
        }

        // Block/exon statistics (BED12)
        if let Some(block_count) = record.block_count {
            self.block_count_stats.push(block_count as f64);
            self.total_blocks += block_count as usize;
        }
    }

    /// Compute statistics from a BED reader (sequential processing)
    ///
    /// # Example
    /// ```ignore
    /// use genomicframe_core::formats::bed::{BedReader, BedStats};
    ///
    /// let mut reader = BedReader::from_path("regions.bed")?;
    /// let stats = BedStats::compute(&mut reader)?;
    /// stats.print_summary();
    /// ```
    pub fn compute(reader: &mut BedReader) -> Result<Self> {
        let mut stats = Self::new();

        while let Some(record) = reader.next_record()? {
            stats.update(&record);
        }

        Ok(stats)
    }

    /// Mean interval length
    pub fn mean_length(&self) -> Option<f64> {
        self.length_stats.mean()
    }

    /// Standard deviation of interval length
    pub fn std_length(&self) -> Option<f64> {
        self.length_stats.std_dev()
    }

    /// Mean score (for BED6+ records)
    pub fn mean_score(&self) -> Option<f64> {
        self.score_stats.mean()
    }

    /// Standard deviation of score
    pub fn std_score(&self) -> Option<f64> {
        self.score_stats.std_dev()
    }

    /// Mean number of blocks per interval (for BED12 records)
    pub fn mean_blocks(&self) -> Option<f64> {
        self.block_count_stats.mean()
    }

    /// Percentage of intervals on plus strand
    pub fn plus_strand_percent(&self) -> f64 {
        if self.total_intervals == 0 {
            0.0
        } else {
            (self.plus_strand as f64 / self.total_intervals as f64) * 100.0
        }
    }

    /// Percentage of intervals on minus strand
    pub fn minus_strand_percent(&self) -> f64 {
        if self.total_intervals == 0 {
            0.0
        } else {
            (self.minus_strand as f64 / self.total_intervals as f64) * 100.0
        }
    }

    /// Print a human-readable summary of statistics
    pub fn print_summary(&self) {
        println!("=== BED Statistics ===\n");

        println!("Basic Counts:");
        println!("  Total intervals:   {:>12}", self.total_intervals);
        println!("  BED3 (minimal):    {:>12}", self.bed3_count);
        println!("  BED6 (standard):   {:>12}", self.bed6_count);
        println!("  BED12 (full):      {:>12}", self.bed12_count);
        println!();

        println!("Interval Lengths:");
        println!(
            "  Mean:              {:>12.2}",
            self.mean_length().unwrap_or(0.0)
        );
        println!(
            "  Std Dev:           {:>12.2}",
            self.std_length().unwrap_or(0.0)
        );
        println!("  Min:               {:>12}", self.min_length);
        println!("  Max:               {:>12}", self.max_length);
        println!("  Total span:        {:>12}", self.total_span);
        println!();

        if self.bed6_count > 0 || self.bed12_count > 0 {
            println!("Strand Distribution:");
            println!(
                "  Plus (+):          {:>11.1}%",
                self.plus_strand_percent()
            );
            println!(
                "  Minus (-):         {:>11.1}%",
                self.minus_strand_percent()
            );
            println!("  Unstranded (.):    {:>12}", self.unstranded);
            println!();

            if self.score_stats.count() > 0 {
                println!("Score Statistics:");
                println!(
                    "  Mean:              {:>12.2}",
                    self.mean_score().unwrap_or(0.0)
                );
                println!(
                    "  Std Dev:           {:>12.2}",
                    self.std_score().unwrap_or(0.0)
                );
                println!();
            }
        }

        if self.bed12_count > 0 {
            println!("Block/Exon Structure (BED12):");
            println!("  Total blocks:      {:>12}", self.total_blocks);
            println!(
                "  Mean blocks/int:   {:>12.2}",
                self.mean_blocks().unwrap_or(0.0)
            );
            println!();
        }

        // Top chromosomes (sorted by count descending)
        println!("Top Chromosomes:");
        let mut chrom_vec: Vec<_> = self.intervals_per_chrom.iter().collect();
        chrom_vec.sort_by(|a, b| b.1.cmp(a.1)); // Sort by count descending
        for (chrom, count) in chrom_vec.iter().take(10) {
            let percent = (**count as f64 / self.total_intervals as f64) * 100.0;
            println!("  {:>12}: {:>10} ({:>5.1}%)", chrom, count, percent);
        }
        println!();

        // Length distribution (show quantiles)
        if self.total_intervals > 0 {
            println!("Length Distribution (most common):");
            let mut length_vec: Vec<_> = self.length_distribution.iter().collect();
            length_vec.sort_by(|a, b| b.1.cmp(a.1)); // Sort by count descending
            for (length, count) in length_vec.iter().take(10) {
                let percent = (**count as f64 / self.total_intervals as f64) * 100.0;
                println!("  {:>12} bp: {:>8} ({:>5.1}%)", length, count, percent);
            }
        }
    }
}

/// Implementation of Mergeable for parallel statistics computation
///
/// Merges BED statistics from multiple threads/chunks.
/// All statistics are correctly combined using the underlying
/// Mergeable implementations for RunningStats and CategoryCounter.
impl Mergeable for BedStats {
    fn merge(&mut self, other: Self) {
        // Merge simple counters
        self.total_intervals += other.total_intervals;
        self.bed3_count += other.bed3_count;
        self.bed6_count += other.bed6_count;
        self.bed12_count += other.bed12_count;
        self.plus_strand += other.plus_strand;
        self.minus_strand += other.minus_strand;
        self.unstranded += other.unstranded;
        self.total_blocks += other.total_blocks;
        self.total_span += other.total_span;
        self.min_length = self.min_length.min(other.min_length);
        self.max_length = self.max_length.max(other.max_length);

        // Merge running statistics
        self.length_stats.merge(other.length_stats);
        self.score_stats.merge(other.score_stats);
        self.block_count_stats.merge(other.block_count_stats);

        // Merge categorical data
        self.length_distribution.merge(other.length_distribution);
        self.intervals_per_chrom.merge(other.intervals_per_chrom);
    }
}

#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn test_bed_stats_new() {
        let stats = BedStats::new();
        assert_eq!(stats.total_intervals, 0);
        assert_eq!(stats.bed3_count, 0);
        assert_eq!(stats.bed6_count, 0);
    }

    #[test]
    fn test_bed_stats_merge() {
        let mut stats1 = BedStats::new();
        stats1.total_intervals = 100;
        stats1.bed3_count = 50;
        stats1.bed6_count = 30;
        stats1.total_span = 10000;

        let mut stats2 = BedStats::new();
        stats2.total_intervals = 50;
        stats2.bed3_count = 25;
        stats2.bed6_count = 15;
        stats2.total_span = 5000;

        stats1.merge(stats2);

        assert_eq!(stats1.total_intervals, 150);
        assert_eq!(stats1.bed3_count, 75);
        assert_eq!(stats1.bed6_count, 45);
        assert_eq!(stats1.total_span, 15000);
    }

    #[test]
    fn test_bed_stats_percentages() {
        let mut stats = BedStats::new();
        stats.total_intervals = 100;
        stats.plus_strand = 60;
        stats.minus_strand = 40;

        assert!((stats.plus_strand_percent() - 60.0).abs() < 0.1);
        assert!((stats.minus_strand_percent() - 40.0).abs() < 0.1);
    }

    #[test]
    fn test_bed_stats_empty() {
        let stats = BedStats::new();
        assert_eq!(stats.plus_strand_percent(), 0.0);
        assert_eq!(stats.minus_strand_percent(), 0.0);
    }
}