genomicframe-core 0.2.0

High-performance genomics I/O and interoperability layer
Documentation
//! GFF/GTF statistics computation
//!
//! Streaming statistics collection for gene annotation files.
//! Memory-efficient: processes millions of features with O(1) memory for basic stats.

use crate::core::GenomicRecordIterator;
use crate::error::Result;
use crate::stats::{CategoryCounter, RunningStats};
use std::collections::HashMap;

use super::reader::{GffReader, GffRecord};

/// Comprehensive statistics for GFF/GTF files
#[derive(Debug, Clone)]
pub struct GffStats {
    /// Total number of features
    pub total_features: usize,

    /// Feature type distribution (gene, exon, CDS, etc.)
    pub feature_types: CategoryCounter<String>,

    /// Chromosome/sequence distribution
    pub sequences: CategoryCounter<String>,

    /// Strand distribution
    pub strand_distribution: HashMap<String, usize>,

    /// Source distribution (e.g., ENSEMBL, RefSeq)
    pub sources: CategoryCounter<String>,

    /// Feature length statistics
    pub length_stats: RunningStats,

    /// Score statistics (if present)
    pub score_stats: RunningStats,

    /// Features with scores
    pub features_with_scores: usize,

    /// Features with phase information (CDS features)
    pub features_with_phase: usize,

    /// Phase distribution for CDS features
    pub phase_distribution: HashMap<u8, usize>,
}

impl Default for GffStats {
    fn default() -> Self {
        Self::new()
    }
}

impl GffStats {
    /// Create a new statistics collector
    pub fn new() -> Self {
        Self {
            total_features: 0,
            feature_types: CategoryCounter::new(),
            sequences: CategoryCounter::new(),
            strand_distribution: HashMap::new(),
            sources: CategoryCounter::new(),
            length_stats: RunningStats::new(),
            score_stats: RunningStats::new(),
            features_with_scores: 0,
            features_with_phase: 0,
            phase_distribution: HashMap::new(),
        }
    }

    /// Update statistics with a single record (streaming)
    pub fn update(&mut self, record: &GffRecord) {
        self.total_features += 1;

        // Feature type
        self.feature_types.increment(record.feature_type.clone());

        // Sequence/chromosome
        self.sequences.increment(record.seqid.clone());

        // Strand
        let strand_str = match record.strand {
            crate::core::Strand::Forward => "+",
            crate::core::Strand::Reverse => "-",
            crate::core::Strand::Unknown => ".",
        };
        *self
            .strand_distribution
            .entry(strand_str.to_string())
            .or_insert(0) += 1;

        // Source
        self.sources.increment(record.source.clone());

        // Length
        let length = record.len() as f64;
        self.length_stats.push(length);

        // Score
        if let Some(score) = record.score {
            self.features_with_scores += 1;
            self.score_stats.push(score);
        }

        // Phase (for CDS features)
        if let Some(phase) = record.phase {
            self.features_with_phase += 1;
            *self.phase_distribution.entry(phase).or_insert(0) += 1;
        }
    }

    /// Compute statistics from a complete reader (consumes the reader)
    ///
    /// This is streaming and memory-efficient: O(1) memory regardless of file size.
    ///
    /// # Example
    ///
    /// ```no_run
    /// use genomicframe_core::formats::gff::{GffReader, GffStats};
    ///
    /// let mut reader = GffReader::from_path("annotations.gff.gz")?;
    /// let stats = GffStats::compute(&mut reader)?;
    /// stats.print_summary();
    /// # Ok::<(), genomicframe_core::error::Error>(())
    /// ```
    pub fn compute(reader: &mut GffReader) -> Result<Self> {
        let mut stats = Self::new();

        while let Some(record) = reader.next_record()? {
            stats.update(&record);
        }

        Ok(stats)
    }

    /// Print a human-readable summary of statistics
    pub fn print_summary(&self) {
        println!("╔═══════════════════════════════════════════════╗");
        println!("║         GFF/GTF Statistics Summary            ║");
        println!("╚═══════════════════════════════════════════════╝");
        println!();

        println!("📊 Overall Statistics");
        println!("  Total features: {}", self.total_features);
        println!();

        println!("🧬 Feature Types");
        let mut feature_vec: Vec<_> = self.feature_types.iter().collect();
        feature_vec.sort_by(|a, b| b.1.cmp(a.1));
        for (feature_type, count) in feature_vec.iter().take(10) {
            let percentage = (**count as f64 / self.total_features as f64) * 100.0;
            println!("  {:<20} {:>10} ({:>6.2}%)", feature_type, count, percentage);
        }
        if self.feature_types.num_categories() > 10 {
            println!(
                "  ... and {} more types",
                self.feature_types.num_categories() - 10
            );
        }
        println!();

        println!("🧭 Sequences/Chromosomes (top 10)");
        let mut seq_vec: Vec<_> = self.sequences.iter().collect();
        seq_vec.sort_by(|a, b| b.1.cmp(a.1));
        for (seq, count) in seq_vec.iter().take(10) {
            let percentage = (**count as f64 / self.total_features as f64) * 100.0;
            println!("  {:<20} {:>10} ({:>6.2}%)", seq, count, percentage);
        }
        if self.sequences.num_categories() > 10 {
            println!(
                "  ... and {} more sequences",
                self.sequences.num_categories() - 10
            );
        }
        println!();

        println!("🧵 Strand Distribution");
        for (strand, count) in &self.strand_distribution {
            let percentage = (*count as f64 / self.total_features as f64) * 100.0;
            let label = match strand.as_str() {
                "+" => "Forward (+)",
                "-" => "Reverse (-)",
                "." => "Unknown (.)",
                _ => strand,
            };
            println!("  {:<20} {:>10} ({:>6.2}%)", label, count, percentage);
        }
        println!();

        println!("📚 Sources (top 10)");
        let mut source_vec: Vec<_> = self.sources.iter().collect();
        source_vec.sort_by(|a, b| b.1.cmp(a.1));
        for (source, count) in source_vec.iter().take(10) {
            let percentage = (**count as f64 / self.total_features as f64) * 100.0;
            println!("  {:<20} {:>10} ({:>6.2}%)", source, count, percentage);
        }
        println!();

        println!("📏 Feature Length Statistics");
        if let Some(mean) = self.length_stats.mean() {
            println!("  Mean:   {:>10.2} bp", mean);
        }
        if let Some(std_dev) = self.length_stats.std_dev() {
            println!("  StdDev: {:>10.2} bp", std_dev);
        }
        if let Some(min) = self.length_stats.min() {
            println!("  Min:    {:>10.0} bp", min);
        }
        if let Some(max) = self.length_stats.max() {
            println!("  Max:    {:>10.0} bp", max);
        }
        println!();

        if self.features_with_scores > 0 {
            println!("⭐ Score Statistics");
            println!("  Features with scores: {}", self.features_with_scores);
            if let Some(mean) = self.score_stats.mean() {
                println!("  Mean:   {:>10.2}", mean);
            }
            if let Some(std_dev) = self.score_stats.std_dev() {
                println!("  StdDev: {:>10.2}", std_dev);
            }
            if let Some(min) = self.score_stats.min() {
                println!("  Min:    {:>10.2}", min);
            }
            if let Some(max) = self.score_stats.max() {
                println!("  Max:    {:>10.2}", max);
            }
            println!();
        }

        if self.features_with_phase > 0 {
            println!("🔢 Phase Distribution (CDS features)");
            for phase in 0..=2 {
                if let Some(count) = self.phase_distribution.get(&phase) {
                    let percentage = (*count as f64 / self.features_with_phase as f64) * 100.0;
                    println!("  Phase {}: {:>10} ({:>6.2}%)", phase, count, percentage);
                }
            }
            println!();
        }
    }

    /// Get the number of unique feature types
    pub fn unique_feature_types(&self) -> usize {
        self.feature_types.num_categories()
    }

    /// Get the number of unique sequences/chromosomes
    pub fn unique_sequences(&self) -> usize {
        self.sequences.num_categories()
    }

    /// Get the number of unique sources
    pub fn unique_sources(&self) -> usize {
        self.sources.num_categories()
    }

    /// Get mean feature length
    pub fn mean_length(&self) -> Option<f64> {
        self.length_stats.mean()
    }

    /// Get total genomic span (sum of all feature lengths)
    /// This computes: count * mean
    pub fn total_span(&self) -> f64 {
        self.length_stats.count() as f64 * self.length_stats.mean().unwrap_or(0.0)
    }
}

#[cfg(test)]
mod tests {
    use super::*;
    use crate::core::Strand;
    use std::io::Write;
    use tempfile::NamedTempFile;

    #[test]
    fn test_gff_stats_basic() -> Result<()> {
        let gff_data = "##gff-version 3\n\
                        chr1\tENSEMBL\tgene\t1000\t2000\t100\t+\t.\tID=gene1\n\
                        chr1\tENSEMBL\texon\t1000\t1200\t.\t+\t.\tID=exon1;Parent=gene1\n\
                        chr1\tENSEMBL\texon\t1800\t2000\t.\t+\t.\tID=exon2;Parent=gene1\n\
                        chr2\tRefSeq\tgene\t5000\t6000\t200\t-\t.\tID=gene2\n";

        let mut temp_file = NamedTempFile::new()?;
        temp_file.write_all(gff_data.as_bytes())?;
        temp_file.flush()?;

        let mut reader = GffReader::from_path(temp_file.path())?;
        let stats = GffStats::compute(&mut reader)?;

        assert_eq!(stats.total_features, 4);
        assert_eq!(stats.unique_feature_types(), 2); // gene, exon
        assert_eq!(stats.unique_sequences(), 2); // chr1, chr2
        assert_eq!(stats.features_with_scores, 2); // 2 genes have scores

        // Check feature type counts
        assert_eq!(stats.feature_types.get(&"gene".to_string()), 2);
        assert_eq!(stats.feature_types.get(&"exon".to_string()), 2);

        Ok(())
    }

    #[test]
    fn test_gff_stats_streaming() -> Result<()> {
        // Verify that we can compute stats record-by-record
        let mut stats = GffStats::new();

        let record = GffRecord {
            seqid: "chr1".to_string(),
            source: "test".to_string(),
            feature_type: "gene".to_string(),
            start: 1000,
            end: 2000,
            score: Some(50.0),
            strand: Strand::Forward,
            phase: None,
            attributes: "ID=gene1".to_string(),
        };

        stats.update(&record);

        assert_eq!(stats.total_features, 1);
        assert_eq!(stats.features_with_scores, 1);
        assert_eq!(stats.mean_length(), Some(1001.0));

        Ok(())
    }

    #[test]
    fn test_gff_stats_phase_distribution() -> Result<()> {
        let mut stats = GffStats::new();

        // Add CDS features with different phases
        for phase in 0..=2 {
            let record = GffRecord {
                seqid: "chr1".to_string(),
                source: "test".to_string(),
                feature_type: "CDS".to_string(),
                start: 1000,
                end: 2000,
                score: None,
                strand: Strand::Forward,
                phase: Some(phase),
                attributes: format!("ID=cds{}", phase),
            };
            stats.update(&record);
        }

        assert_eq!(stats.features_with_phase, 3);
        assert_eq!(stats.phase_distribution.get(&0), Some(&1));
        assert_eq!(stats.phase_distribution.get(&1), Some(&1));
        assert_eq!(stats.phase_distribution.get(&2), Some(&1));

        Ok(())
    }
}