use std::collections::HashMap;
use crate::error::{CoreError, CoreResult};
#[must_use]
pub fn nucleotide_frequencies(seq: &[u8]) -> [f64; 4] {
let mut counts = [0u64; 4]; for &b in seq {
match b.to_ascii_uppercase() {
b'A' => counts[0] += 1,
b'C' => counts[1] += 1,
b'G' => counts[2] += 1,
b'T' => counts[3] += 1,
_ => {}
}
}
let total: u64 = counts.iter().sum();
if total == 0 {
return [0.0; 4];
}
let t = total as f64;
[
counts[0] as f64 / t,
counts[1] as f64 / t,
counts[2] as f64 / t,
counts[3] as f64 / t,
]
}
#[must_use]
pub fn dinucleotide_frequencies(seq: &[u8]) -> HashMap<[u8; 2], f64> {
let mut counts: HashMap<[u8; 2], u64> = HashMap::new();
if seq.len() < 2 {
return HashMap::new();
}
let upper: Vec<u8> = seq.iter().map(|&b| b.to_ascii_uppercase()).collect();
let mut total = 0u64;
for window in upper.windows(2) {
let b0 = window[0];
let b1 = window[1];
if is_acgt(b0) && is_acgt(b1) {
*counts.entry([b0, b1]).or_insert(0) += 1;
total += 1;
}
}
if total == 0 {
return HashMap::new();
}
let t = total as f64;
counts.into_iter().map(|(k, v)| (k, v as f64 / t)).collect()
}
pub fn codon_usage_table(dna: &[u8]) -> CoreResult<HashMap<[u8; 3], f64>> {
let upper: Vec<u8> = dna.iter().map(|&b| b.to_ascii_uppercase()).collect();
let mut counts: HashMap<[u8; 3], u64> = HashMap::new();
let mut total = 0u64;
let mut i = 0;
while i + 3 <= upper.len() {
let b0 = upper[i];
let b1 = upper[i + 1];
let b2 = upper[i + 2];
if is_acgt(b0) && is_acgt(b1) && is_acgt(b2) {
*counts.entry([b0, b1, b2]).or_insert(0) += 1;
total += 1;
} else {
for &b in &[b0, b1, b2] {
if !b.is_ascii() {
return Err(CoreError::ValueError(crate::error_context!(format!(
"Non-ASCII byte 0x{b:02X} found in DNA sequence"
))));
}
}
}
i += 3;
}
if total == 0 {
return Ok(HashMap::new());
}
let t = total as f64;
Ok(counts.into_iter().map(|(k, v)| (k, v as f64 / t)).collect())
}
#[inline]
fn is_acgt(b: u8) -> bool {
matches!(b, b'A' | b'C' | b'G' | b'T')
}
#[cfg(test)]
mod tests {
use super::*;
#[test]
fn test_nuc_freq_uniform() {
let freqs = nucleotide_frequencies(b"ACGT");
for &f in &freqs {
assert!((f - 0.25).abs() < 1e-10, "expected 0.25, got {f}");
}
}
#[test]
fn test_nuc_freq_only_a() {
let freqs = nucleotide_frequencies(b"AAAA");
assert!((freqs[0] - 1.0).abs() < 1e-10);
assert_eq!(freqs[1], 0.0);
assert_eq!(freqs[2], 0.0);
assert_eq!(freqs[3], 0.0);
}
#[test]
fn test_nuc_freq_empty() {
let freqs = nucleotide_frequencies(b"");
assert_eq!(freqs, [0.0; 4]);
}
#[test]
fn test_nuc_freq_ignores_n() {
let freqs = nucleotide_frequencies(b"ACGTN");
let sum: f64 = freqs.iter().sum();
assert!((sum - 1.0).abs() < 1e-10, "sum should be 1.0, got {sum}");
}
#[test]
fn test_nuc_freq_order_acgt() {
let freqs = nucleotide_frequencies(b"AACGT");
assert!((freqs[0] - 2.0 / 5.0).abs() < 1e-10); assert!((freqs[1] - 1.0 / 5.0).abs() < 1e-10); assert!((freqs[2] - 1.0 / 5.0).abs() < 1e-10); assert!((freqs[3] - 1.0 / 5.0).abs() < 1e-10); }
#[test]
fn test_dinuc_freq_basic() {
let freqs = dinucleotide_frequencies(b"ATCG");
assert_eq!(freqs.len(), 3);
for &v in freqs.values() {
assert!((v - 1.0 / 3.0).abs() < 1e-10, "expected 1/3, got {v}");
}
}
#[test]
fn test_dinuc_freq_repeated() {
let freqs = dinucleotide_frequencies(b"ATATAT");
let at_freq = freqs[b"AT"];
let ta_freq = freqs[b"TA"];
assert!((at_freq - 3.0 / 5.0).abs() < 1e-10);
assert!((ta_freq - 2.0 / 5.0).abs() < 1e-10);
}
#[test]
fn test_dinuc_freq_empty_sequence() {
let freqs = dinucleotide_frequencies(b"");
assert!(freqs.is_empty());
}
#[test]
fn test_dinuc_freq_single_base() {
let freqs = dinucleotide_frequencies(b"A");
assert!(freqs.is_empty());
}
#[test]
fn test_dinuc_freq_sums_to_one() {
let freqs = dinucleotide_frequencies(b"ATGCATGCNN");
if !freqs.is_empty() {
let sum: f64 = freqs.values().sum();
assert!((sum - 1.0).abs() < 1e-10, "sum should be 1.0, got {sum}");
}
}
#[test]
fn test_codon_usage_two_atg_one_taa() {
let table = codon_usage_table(b"ATGATGTAA").expect("codon_usage_table failed");
let atg = table[b"ATG"];
let taa = table[b"TAA"];
assert!(
(atg - 2.0 / 3.0).abs() < 1e-10,
"ATG expected 2/3, got {atg}"
);
assert!(
(taa - 1.0 / 3.0).abs() < 1e-10,
"TAA expected 1/3, got {taa}"
);
}
#[test]
fn test_codon_usage_empty() {
let table = codon_usage_table(b"").expect("codon_usage_table failed");
assert!(table.is_empty());
}
#[test]
fn test_codon_usage_sums_to_one() {
let table = codon_usage_table(b"ATGAAACCCGGG").expect("codon_usage_table failed");
let sum: f64 = table.values().sum();
assert!((sum - 1.0).abs() < 1e-10, "sum should be 1.0, got {sum}");
}
#[test]
fn test_codon_usage_skips_ambiguous() {
let table = codon_usage_table(b"ATGNNN").expect("codon_usage_table failed");
assert_eq!(table.len(), 1);
let atg = table[b"ATG"];
assert!((atg - 1.0).abs() < 1e-10);
}
}