use serde::{Deserialize, Serialize};
const A_IDX: usize = 0;
const T_IDX: usize = 1;
const G_IDX: usize = 2;
const C_IDX: usize = 3;
const N_IDX: usize = 4;
const BASE_TO_IDX: [usize; 256] = {
let mut table = [N_IDX; 256];
table[b'A' as usize] = A_IDX;
table[b'a' as usize] = A_IDX;
table[b'T' as usize] = T_IDX;
table[b't' as usize] = T_IDX;
table[b'G' as usize] = G_IDX;
table[b'g' as usize] = G_IDX;
table[b'C' as usize] = C_IDX;
table[b'c' as usize] = C_IDX;
table
};
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct BaseContent {
counts: Vec<[u64; 5]>,
}
impl Default for BaseContent {
fn default() -> Self {
Self::new()
}
}
impl BaseContent {
pub fn new() -> Self {
Self::with_capacity(300)
}
pub fn with_capacity(capacity: usize) -> Self {
Self {
counts: Vec::with_capacity(capacity),
}
}
#[inline]
pub fn update(&mut self, seq: &[u8]) {
if seq.len() > self.counts.len() {
self.counts.resize(seq.len(), [0u64; 5]);
}
for (pos, &base) in seq.iter().enumerate() {
let idx = BASE_TO_IDX[base as usize];
self.counts[pos][idx] += 1;
}
}
pub fn get_ratios(&self, position: usize) -> [f64; 5] {
if position >= self.counts.len() {
return [0.0; 5];
}
let counts = &self.counts[position];
let total: u64 = counts.iter().sum();
if total == 0 {
return [0.0; 5];
}
let total_f = total as f64;
[
counts[A_IDX] as f64 / total_f,
counts[T_IDX] as f64 / total_f,
counts[G_IDX] as f64 / total_f,
counts[C_IDX] as f64 / total_f,
counts[N_IDX] as f64 / total_f,
]
}
pub fn get_counts(&self, position: usize) -> Option<[u64; 5]> {
self.counts.get(position).copied()
}
pub fn len(&self) -> usize {
self.counts.len()
}
pub fn is_empty(&self) -> bool {
self.counts.is_empty()
}
pub fn merge(&mut self, other: &BaseContent) {
if other.counts.len() > self.counts.len() {
self.counts.resize(other.counts.len(), [0u64; 5]);
}
for (pos, other_counts) in other.counts.iter().enumerate() {
for (i, &count) in other_counts.iter().enumerate() {
self.counts[pos][i] += count;
}
}
}
pub fn gc_ratio(&self, position: usize) -> f64 {
if position >= self.counts.len() {
return 0.0;
}
let counts = &self.counts[position];
let gc = counts[G_IDX] + counts[C_IDX];
let total: u64 = counts.iter().sum();
if total == 0 {
0.0
} else {
gc as f64 / total as f64
}
}
pub fn all_counts(&self) -> &[[u64; 5]] {
&self.counts
}
pub fn from_raw_counts(counts: Vec<[u64; 5]>) -> Self {
Self { counts }
}
}
#[cfg(test)]
mod tests {
use super::*;
#[test]
fn test_base_content_new() {
let bc = BaseContent::new();
assert!(bc.is_empty());
assert_eq!(bc.len(), 0);
}
#[test]
fn test_base_content_with_capacity() {
let bc = BaseContent::with_capacity(500);
assert!(bc.is_empty());
}
#[test]
fn test_base_content_update_simple() {
let mut bc = BaseContent::new();
bc.update(b"ATGC");
assert_eq!(bc.len(), 4);
assert_eq!(bc.get_counts(0), Some([1, 0, 0, 0, 0])); assert_eq!(bc.get_counts(1), Some([0, 1, 0, 0, 0])); assert_eq!(bc.get_counts(2), Some([0, 0, 1, 0, 0])); assert_eq!(bc.get_counts(3), Some([0, 0, 0, 1, 0])); }
#[test]
fn test_base_content_update_with_n() {
let mut bc = BaseContent::new();
bc.update(b"ATGCN");
assert_eq!(bc.len(), 5);
assert_eq!(bc.get_counts(4), Some([0, 0, 0, 0, 1])); }
#[test]
fn test_base_content_update_lowercase() {
let mut bc = BaseContent::new();
bc.update(b"atgc");
assert_eq!(bc.get_counts(0), Some([1, 0, 0, 0, 0])); assert_eq!(bc.get_counts(1), Some([0, 1, 0, 0, 0])); assert_eq!(bc.get_counts(2), Some([0, 0, 1, 0, 0])); assert_eq!(bc.get_counts(3), Some([0, 0, 0, 1, 0])); }
#[test]
fn test_base_content_update_multiple_reads() {
let mut bc = BaseContent::new();
bc.update(b"AAAA");
bc.update(b"TTTT");
bc.update(b"GGGG");
bc.update(b"CCCC");
for pos in 0..4 {
assert_eq!(bc.get_counts(pos), Some([1, 1, 1, 1, 0]));
}
}
#[test]
fn test_base_content_get_ratios() {
let mut bc = BaseContent::new();
bc.update(b"AAAA");
bc.update(b"TTTT");
let ratios = bc.get_ratios(0);
assert!((ratios[0] - 0.5).abs() < 0.001); assert!((ratios[1] - 0.5).abs() < 0.001); assert!((ratios[2] - 0.0).abs() < 0.001); assert!((ratios[3] - 0.0).abs() < 0.001); assert!((ratios[4] - 0.0).abs() < 0.001); }
#[test]
fn test_base_content_get_ratios_invalid_position() {
let bc = BaseContent::new();
let ratios = bc.get_ratios(100);
assert_eq!(ratios, [0.0; 5]);
}
#[test]
fn test_base_content_merge() {
let mut bc1 = BaseContent::new();
bc1.update(b"AAAA");
let mut bc2 = BaseContent::new();
bc2.update(b"TTTT");
bc1.merge(&bc2);
for pos in 0..4 {
let counts = bc1.get_counts(pos).unwrap();
assert_eq!(counts[A_IDX], 1);
assert_eq!(counts[T_IDX], 1);
}
}
#[test]
fn test_base_content_merge_different_lengths() {
let mut bc1 = BaseContent::new();
bc1.update(b"AA");
let mut bc2 = BaseContent::new();
bc2.update(b"TTTT");
bc1.merge(&bc2);
assert_eq!(bc1.len(), 4);
assert_eq!(bc1.get_counts(0).unwrap()[A_IDX], 1);
assert_eq!(bc1.get_counts(0).unwrap()[T_IDX], 1);
assert_eq!(bc1.get_counts(2).unwrap()[A_IDX], 0);
assert_eq!(bc1.get_counts(2).unwrap()[T_IDX], 1);
}
#[test]
fn test_base_content_gc_ratio() {
let mut bc = BaseContent::new();
bc.update(b"GGCC");
bc.update(b"AATT");
assert!((bc.gc_ratio(0) - 0.5).abs() < 0.001);
}
#[test]
fn test_base_content_dynamic_extension() {
let mut bc = BaseContent::with_capacity(10);
bc.update(b"A");
assert_eq!(bc.len(), 1);
bc.update(b"ATGCATGCATGCATGC"); assert_eq!(bc.len(), 16);
}
#[test]
fn test_base_content_serialize() {
let mut bc = BaseContent::new();
bc.update(b"ATGC");
let json = serde_json::to_string(&bc).unwrap();
let bc2: BaseContent = serde_json::from_str(&json).unwrap();
assert_eq!(bc.len(), bc2.len());
for pos in 0..bc.len() {
assert_eq!(bc.get_counts(pos), bc2.get_counts(pos));
}
}
}