use crate::core::GenomicRecordIterator;
use crate::error::Result;
use crate::stats::{CategoryCounter, RunningStats};
use std::collections::HashMap;
use super::reader::{GffReader, GffRecord};
#[derive(Debug, Clone)]
pub struct GffStats {
pub total_features: usize,
pub feature_types: CategoryCounter<String>,
pub sequences: CategoryCounter<String>,
pub strand_distribution: HashMap<String, usize>,
pub sources: CategoryCounter<String>,
pub length_stats: RunningStats,
pub score_stats: RunningStats,
pub features_with_scores: usize,
pub features_with_phase: usize,
pub phase_distribution: HashMap<u8, usize>,
}
impl Default for GffStats {
fn default() -> Self {
Self::new()
}
}
impl GffStats {
pub fn new() -> Self {
Self {
total_features: 0,
feature_types: CategoryCounter::new(),
sequences: CategoryCounter::new(),
strand_distribution: HashMap::new(),
sources: CategoryCounter::new(),
length_stats: RunningStats::new(),
score_stats: RunningStats::new(),
features_with_scores: 0,
features_with_phase: 0,
phase_distribution: HashMap::new(),
}
}
pub fn update(&mut self, record: &GffRecord) {
self.total_features += 1;
self.feature_types.increment(record.feature_type.clone());
self.sequences.increment(record.seqid.clone());
let strand_str = match record.strand {
crate::core::Strand::Forward => "+",
crate::core::Strand::Reverse => "-",
crate::core::Strand::Unknown => ".",
};
*self
.strand_distribution
.entry(strand_str.to_string())
.or_insert(0) += 1;
self.sources.increment(record.source.clone());
let length = record.len() as f64;
self.length_stats.push(length);
if let Some(score) = record.score {
self.features_with_scores += 1;
self.score_stats.push(score);
}
if let Some(phase) = record.phase {
self.features_with_phase += 1;
*self.phase_distribution.entry(phase).or_insert(0) += 1;
}
}
pub fn compute(reader: &mut GffReader) -> Result<Self> {
let mut stats = Self::new();
while let Some(record) = reader.next_record()? {
stats.update(&record);
}
Ok(stats)
}
pub fn print_summary(&self) {
println!("╔═══════════════════════════════════════════════╗");
println!("║ GFF/GTF Statistics Summary ║");
println!("╚═══════════════════════════════════════════════╝");
println!();
println!("📊 Overall Statistics");
println!(" Total features: {}", self.total_features);
println!();
println!("🧬 Feature Types");
let mut feature_vec: Vec<_> = self.feature_types.iter().collect();
feature_vec.sort_by(|a, b| b.1.cmp(a.1));
for (feature_type, count) in feature_vec.iter().take(10) {
let percentage = (**count as f64 / self.total_features as f64) * 100.0;
println!(" {:<20} {:>10} ({:>6.2}%)", feature_type, count, percentage);
}
if self.feature_types.num_categories() > 10 {
println!(
" ... and {} more types",
self.feature_types.num_categories() - 10
);
}
println!();
println!("🧭 Sequences/Chromosomes (top 10)");
let mut seq_vec: Vec<_> = self.sequences.iter().collect();
seq_vec.sort_by(|a, b| b.1.cmp(a.1));
for (seq, count) in seq_vec.iter().take(10) {
let percentage = (**count as f64 / self.total_features as f64) * 100.0;
println!(" {:<20} {:>10} ({:>6.2}%)", seq, count, percentage);
}
if self.sequences.num_categories() > 10 {
println!(
" ... and {} more sequences",
self.sequences.num_categories() - 10
);
}
println!();
println!("🧵 Strand Distribution");
for (strand, count) in &self.strand_distribution {
let percentage = (*count as f64 / self.total_features as f64) * 100.0;
let label = match strand.as_str() {
"+" => "Forward (+)",
"-" => "Reverse (-)",
"." => "Unknown (.)",
_ => strand,
};
println!(" {:<20} {:>10} ({:>6.2}%)", label, count, percentage);
}
println!();
println!("📚 Sources (top 10)");
let mut source_vec: Vec<_> = self.sources.iter().collect();
source_vec.sort_by(|a, b| b.1.cmp(a.1));
for (source, count) in source_vec.iter().take(10) {
let percentage = (**count as f64 / self.total_features as f64) * 100.0;
println!(" {:<20} {:>10} ({:>6.2}%)", source, count, percentage);
}
println!();
println!("📏 Feature Length Statistics");
if let Some(mean) = self.length_stats.mean() {
println!(" Mean: {:>10.2} bp", mean);
}
if let Some(std_dev) = self.length_stats.std_dev() {
println!(" StdDev: {:>10.2} bp", std_dev);
}
if let Some(min) = self.length_stats.min() {
println!(" Min: {:>10.0} bp", min);
}
if let Some(max) = self.length_stats.max() {
println!(" Max: {:>10.0} bp", max);
}
println!();
if self.features_with_scores > 0 {
println!("⭐ Score Statistics");
println!(" Features with scores: {}", self.features_with_scores);
if let Some(mean) = self.score_stats.mean() {
println!(" Mean: {:>10.2}", mean);
}
if let Some(std_dev) = self.score_stats.std_dev() {
println!(" StdDev: {:>10.2}", std_dev);
}
if let Some(min) = self.score_stats.min() {
println!(" Min: {:>10.2}", min);
}
if let Some(max) = self.score_stats.max() {
println!(" Max: {:>10.2}", max);
}
println!();
}
if self.features_with_phase > 0 {
println!("🔢 Phase Distribution (CDS features)");
for phase in 0..=2 {
if let Some(count) = self.phase_distribution.get(&phase) {
let percentage = (*count as f64 / self.features_with_phase as f64) * 100.0;
println!(" Phase {}: {:>10} ({:>6.2}%)", phase, count, percentage);
}
}
println!();
}
}
pub fn unique_feature_types(&self) -> usize {
self.feature_types.num_categories()
}
pub fn unique_sequences(&self) -> usize {
self.sequences.num_categories()
}
pub fn unique_sources(&self) -> usize {
self.sources.num_categories()
}
pub fn mean_length(&self) -> Option<f64> {
self.length_stats.mean()
}
pub fn total_span(&self) -> f64 {
self.length_stats.count() as f64 * self.length_stats.mean().unwrap_or(0.0)
}
}
#[cfg(test)]
mod tests {
use super::*;
use crate::core::Strand;
use std::io::Write;
use tempfile::NamedTempFile;
#[test]
fn test_gff_stats_basic() -> Result<()> {
let gff_data = "##gff-version 3\n\
chr1\tENSEMBL\tgene\t1000\t2000\t100\t+\t.\tID=gene1\n\
chr1\tENSEMBL\texon\t1000\t1200\t.\t+\t.\tID=exon1;Parent=gene1\n\
chr1\tENSEMBL\texon\t1800\t2000\t.\t+\t.\tID=exon2;Parent=gene1\n\
chr2\tRefSeq\tgene\t5000\t6000\t200\t-\t.\tID=gene2\n";
let mut temp_file = NamedTempFile::new()?;
temp_file.write_all(gff_data.as_bytes())?;
temp_file.flush()?;
let mut reader = GffReader::from_path(temp_file.path())?;
let stats = GffStats::compute(&mut reader)?;
assert_eq!(stats.total_features, 4);
assert_eq!(stats.unique_feature_types(), 2); assert_eq!(stats.unique_sequences(), 2); assert_eq!(stats.features_with_scores, 2);
assert_eq!(stats.feature_types.get(&"gene".to_string()), 2);
assert_eq!(stats.feature_types.get(&"exon".to_string()), 2);
Ok(())
}
#[test]
fn test_gff_stats_streaming() -> Result<()> {
let mut stats = GffStats::new();
let record = GffRecord {
seqid: "chr1".to_string(),
source: "test".to_string(),
feature_type: "gene".to_string(),
start: 1000,
end: 2000,
score: Some(50.0),
strand: Strand::Forward,
phase: None,
attributes: "ID=gene1".to_string(),
};
stats.update(&record);
assert_eq!(stats.total_features, 1);
assert_eq!(stats.features_with_scores, 1);
assert_eq!(stats.mean_length(), Some(1001.0));
Ok(())
}
#[test]
fn test_gff_stats_phase_distribution() -> Result<()> {
let mut stats = GffStats::new();
for phase in 0..=2 {
let record = GffRecord {
seqid: "chr1".to_string(),
source: "test".to_string(),
feature_type: "CDS".to_string(),
start: 1000,
end: 2000,
score: None,
strand: Strand::Forward,
phase: Some(phase),
attributes: format!("ID=cds{}", phase),
};
stats.update(&record);
}
assert_eq!(stats.features_with_phase, 3);
assert_eq!(stats.phase_distribution.get(&0), Some(&1));
assert_eq!(stats.phase_distribution.get(&1), Some(&1));
assert_eq!(stats.phase_distribution.get(&2), Some(&1));
Ok(())
}
}