use serde::{Deserialize, Serialize};
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct GcStats {
histogram: Vec<u64>,
total_gc: u64,
total_bases: u64,
total_reads: u64,
}
impl Default for GcStats {
fn default() -> Self {
Self::new()
}
}
impl GcStats {
pub fn new() -> Self {
Self {
histogram: vec![0u64; 101],
total_gc: 0,
total_bases: 0,
total_reads: 0,
}
}
#[inline]
pub fn update(&mut self, seq: &[u8]) {
if seq.is_empty() {
return;
}
let mut gc_count: u64 = 0;
for &base in seq {
match base {
b'G' | b'g' | b'C' | b'c' => gc_count += 1,
_ => {}
}
}
let len = seq.len() as u64;
let gc_percent = ((gc_count * 100) / len) as usize;
let gc_percent = gc_percent.min(100);
self.histogram[gc_percent] += 1;
self.total_gc += gc_count;
self.total_bases += len;
self.total_reads += 1;
}
pub fn mean_gc(&self) -> f64 {
if self.total_bases == 0 {
0.0
} else {
(self.total_gc as f64 / self.total_bases as f64) * 100.0
}
}
pub fn histogram(&self) -> &[u64] {
&self.histogram
}
pub fn total_reads(&self) -> u64 {
self.total_reads
}
pub fn total_bases(&self) -> u64 {
self.total_bases
}
pub fn total_gc(&self) -> u64 {
self.total_gc
}
pub fn median_gc(&self) -> u8 {
if self.total_reads == 0 {
return 0;
}
let half = self.total_reads / 2;
let mut cumulative = 0u64;
for (gc_percent, &count) in self.histogram.iter().enumerate() {
cumulative += count;
if cumulative > half {
return gc_percent as u8;
}
}
50 }
pub fn mode_gc(&self) -> u8 {
let (mode, _) = self
.histogram
.iter()
.enumerate()
.max_by_key(|&(_, count)| count)
.unwrap_or((50, &0));
mode as u8
}
pub fn percent_in_range(&self, min_gc: u8, max_gc: u8) -> f64 {
if self.total_reads == 0 {
return 0.0;
}
let min = min_gc as usize;
let max = (max_gc as usize).min(100);
let count: u64 = self.histogram[min..=max].iter().sum();
(count as f64 / self.total_reads as f64) * 100.0
}
pub fn merge(&mut self, other: &GcStats) {
for (i, &count) in other.histogram.iter().enumerate() {
self.histogram[i] += count;
}
self.total_gc += other.total_gc;
self.total_bases += other.total_bases;
self.total_reads += other.total_reads;
}
pub fn from_raw(histogram: &[u64; 101], total_gc: u64, total_bases: u64, total_reads: u64) -> Self {
Self {
histogram: histogram.to_vec(),
total_gc,
total_bases,
total_reads,
}
}
}
#[cfg(test)]
mod tests {
use super::*;
#[test]
fn test_gc_stats_new() {
let gc = GcStats::new();
assert_eq!(gc.total_reads(), 0);
assert_eq!(gc.total_bases(), 0);
assert_eq!(gc.total_gc(), 0);
}
#[test]
fn test_gc_stats_update_all_gc() {
let mut gc = GcStats::new();
gc.update(b"GCGCGCGCGC");
assert_eq!(gc.total_reads(), 1);
assert_eq!(gc.histogram()[100], 1);
assert!((gc.mean_gc() - 100.0).abs() < 0.001);
}
#[test]
fn test_gc_stats_update_no_gc() {
let mut gc = GcStats::new();
gc.update(b"ATATATATAT");
assert_eq!(gc.total_reads(), 1);
assert_eq!(gc.histogram()[0], 1);
assert!((gc.mean_gc() - 0.0).abs() < 0.001);
}
#[test]
fn test_gc_stats_update_50_percent() {
let mut gc = GcStats::new();
gc.update(b"ATGC");
assert_eq!(gc.histogram()[50], 1);
assert!((gc.mean_gc() - 50.0).abs() < 0.001);
}
#[test]
fn test_gc_stats_update_lowercase() {
let mut gc = GcStats::new();
gc.update(b"atgc");
assert_eq!(gc.histogram()[50], 1);
assert!((gc.mean_gc() - 50.0).abs() < 0.001);
}
#[test]
fn test_gc_stats_update_with_n() {
let mut gc = GcStats::new();
gc.update(b"GCNNAT");
assert_eq!(gc.histogram()[33], 1);
}
#[test]
fn test_gc_stats_update_empty() {
let mut gc = GcStats::new();
gc.update(b"");
assert_eq!(gc.total_reads(), 0);
assert_eq!(gc.total_bases(), 0);
}
#[test]
fn test_gc_stats_mean_gc_multiple_reads() {
let mut gc = GcStats::new();
gc.update(b"GGCC"); gc.update(b"AATT");
assert!((gc.mean_gc() - 50.0).abs() < 0.001);
}
#[test]
fn test_gc_stats_median_gc() {
let mut gc = GcStats::new();
gc.update(b"GGGGGGGGGG"); gc.update(b"GCGCGCGCGC"); gc.update(b"AAAAAAAAAA");
assert_eq!(gc.median_gc(), 100);
}
#[test]
fn test_gc_stats_mode_gc() {
let mut gc = GcStats::new();
gc.update(b"ATGC"); gc.update(b"ATGC"); gc.update(b"GGGG");
assert_eq!(gc.mode_gc(), 50);
}
#[test]
fn test_gc_stats_percent_in_range() {
let mut gc = GcStats::new();
gc.update(b"ATGC"); gc.update(b"ATGC"); gc.update(b"GGGG"); gc.update(b"AAAA");
assert!((gc.percent_in_range(40, 60) - 50.0).abs() < 0.1);
}
#[test]
fn test_gc_stats_merge() {
let mut gc1 = GcStats::new();
gc1.update(b"GGGG");
let mut gc2 = GcStats::new();
gc2.update(b"AAAA");
gc1.merge(&gc2);
assert_eq!(gc1.total_reads(), 2);
assert_eq!(gc1.histogram()[100], 1);
assert_eq!(gc1.histogram()[0], 1);
assert!((gc1.mean_gc() - 50.0).abs() < 0.001);
}
#[test]
fn test_gc_stats_serialize() {
let mut gc = GcStats::new();
gc.update(b"ATGC");
let json = serde_json::to_string(&gc).unwrap();
let gc2: GcStats = serde_json::from_str(&json).unwrap();
assert_eq!(gc.total_reads(), gc2.total_reads());
assert!((gc.mean_gc() - gc2.mean_gc()).abs() < 0.001);
}
#[test]
fn test_gc_stats_empty_mean() {
let gc = GcStats::new();
assert_eq!(gc.mean_gc(), 0.0);
assert_eq!(gc.median_gc(), 0);
}
}