use serde::Serialize;
use std::collections::HashMap;
include!(concat!(env!("OUT_DIR"), "/canonical_kmers.rs"));
pub fn average_quality(quality: &[u8], quality_encoding: &str) -> f32 {
let offset = match quality_encoding {
"Phred+33" => 33,
"Phred+64" => 64,
_ => 33, };
let mut sum = 0;
let count = quality.len() as i32;
for &q in quality {
sum += q as i32 - offset;
}
if count > 0 {
sum as f32 / count as f32
} else {
0.0
}
}
#[inline(always)]
pub fn gc_content(sequence: &[u8]) -> f32 {
if sequence.is_empty() {
return 0.0;
}
let mut gc_count = 0;
let mut valid_base_count = 0;
for &base in sequence {
match base {
b'G' | b'C' => {
gc_count += 1;
valid_base_count += 1;
}
b'A' | b'T' => {
valid_base_count += 1;
}
_ => {} }
}
if valid_base_count == 0 {
0.0
} else {
(gc_count as f32 / valid_base_count as f32) * 100.0
}
}
fn round_to_4_sig_figs(value: f32) -> f32 {
if value == 0.0 {
return 0.0;
}
let scale = 10.0_f32.powf(4.0 - value.abs().log10().floor());
(value * scale).round() / scale
}
#[derive(Serialize)]
struct TetraFrequency {
tetra: String,
percentage: f32,
}
pub fn tetranucleotide_frequencies(sequence: &[u8], limit: Option<usize>) -> (String, usize) {
let mut tetra_counts: HashMap<String, usize> = HashMap::new();
if sequence.len() < 4 {
return ("[]".to_string(), 0);
}
for window in sequence.windows(4) {
let is_unambiguous = window
.iter()
.all(|&base| matches!(base, b'A' | b'C' | b'T' | b'G'));
if is_unambiguous {
if let Ok(tetra) = std::str::from_utf8(window) {
*tetra_counts.entry(tetra.to_string()).or_insert(0) += 1;
}
}
}
let unique_count = tetra_counts.len();
if unique_count == 0 {
return ("[]".to_string(), 0);
}
let total_count: f32 = tetra_counts.values().sum::<usize>() as f32;
let mut frequencies: Vec<TetraFrequency> = tetra_counts
.into_iter()
.map(|(tetra, count)| {
let percentage = (count as f32 / total_count) * 100.0;
TetraFrequency {
tetra,
percentage: round_to_4_sig_figs(percentage),
}
})
.collect();
frequencies.sort_by(|a, b| b.percentage.partial_cmp(&a.percentage).unwrap());
if let Some(limit) = limit {
frequencies.truncate(limit);
}
(
serde_json::to_string(&frequencies).unwrap_or_else(|_| "[]".to_string()),
unique_count,
)
}
#[derive(Serialize)]
struct CTetraFrequency {
tetra: String,
percentage: f32,
}
pub fn canonical_tetranucleotide_frequencies(
sequence: &[u8],
limit: Option<usize>,
) -> (String, usize) {
let mut ctetra_counts: HashMap<String, usize> = HashMap::new();
if sequence.len() < 4 {
return ("[]".to_string(), 0);
}
for window in sequence.windows(4) {
let is_unambiguous = window
.iter()
.all(|&base| matches!(base, b'A' | b'C' | b'T' | b'G'));
if is_unambiguous {
if let Ok(tetra) = std::str::from_utf8(window) {
if let Some(&canonical) = CANONICAL_K_MERS.get(tetra) {
*ctetra_counts.entry(canonical.to_string()).or_insert(0) += 1;
}
}
}
}
let unique_count = ctetra_counts.len();
if unique_count == 0 {
return ("[]".to_string(), 0);
}
let total_count: f32 = ctetra_counts.values().sum::<usize>() as f32;
let mut frequencies: Vec<CTetraFrequency> = ctetra_counts
.into_iter()
.map(|(tetra, count)| {
let percentage = (count as f32 / total_count) * 100.0;
CTetraFrequency {
tetra,
percentage: round_to_4_sig_figs(percentage),
}
})
.collect();
frequencies.sort_by(|a, b| b.percentage.partial_cmp(&a.percentage).unwrap());
if let Some(limit) = limit {
frequencies.truncate(limit);
}
(
serde_json::to_string(&frequencies).unwrap_or_else(|_| "[]".to_string()),
unique_count,
)
}