use serde::{Deserialize, Serialize};
#[derive(Debug, Clone, Serialize, Deserialize, PartialEq)]
#[serde(rename_all = "snake_case")]
#[derive(Default)]
pub enum GcBiasModelKind {
#[default]
Default,
Flat,
Custom,
}
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct GcBiasConfig {
#[serde(default = "default_true")]
pub enabled: bool,
#[serde(default)]
pub model: GcBiasModelKind,
#[serde(default = "default_severity")]
pub severity: f64,
}
fn default_true() -> bool {
true
}
fn default_severity() -> f64 {
1.0
}
impl Default for GcBiasConfig {
fn default() -> Self {
Self {
enabled: true,
model: GcBiasModelKind::Default,
severity: 1.0,
}
}
}
#[derive(Debug, Clone)]
pub struct GcBiasModel {
enabled: bool,
kind: GcBiasModelKind,
sigma: f64,
severity: f64,
}
impl Default for GcBiasModel {
fn default() -> Self {
Self::from_config(&GcBiasConfig::default())
}
}
impl GcBiasModel {
pub fn from_config(cfg: &GcBiasConfig) -> Self {
Self {
enabled: cfg.enabled,
kind: cfg.model.clone(),
sigma: 0.23,
severity: cfg.severity.max(0.0),
}
}
pub fn coverage_multiplier(&self, gc_fraction: f64) -> f64 {
if !self.enabled || self.kind == GcBiasModelKind::Flat {
return 1.0;
}
if self.severity == 0.0 {
return 1.0;
}
let raw = gaussian(gc_fraction, 0.5, self.sigma);
let blended = 1.0 + self.severity * (raw - 1.0);
blended.clamp(0.0, 1.0)
}
pub fn gc_fraction(sequence: &[u8]) -> f64 {
let mut gc = 0usize;
let mut total = 0usize;
for &b in sequence {
match b.to_ascii_uppercase() {
b'G' | b'C' => {
gc += 1;
total += 1;
}
b'A' | b'T' => {
total += 1;
}
_ => {}
}
}
if total == 0 {
return 0.5; }
gc as f64 / total as f64
}
}
fn gaussian(x: f64, mu: f64, sigma: f64) -> f64 {
let z = (x - mu) / sigma;
(-0.5 * z * z).exp()
}
#[cfg(test)]
mod tests {
use super::*;
#[test]
fn test_gc_fraction() {
assert!((GcBiasModel::gc_fraction(b"GGCC") - 1.0).abs() < 1e-9);
assert!((GcBiasModel::gc_fraction(b"AATT") - 0.0).abs() < 1e-9);
assert!((GcBiasModel::gc_fraction(b"ACGT") - 0.5).abs() < 1e-9);
assert!((GcBiasModel::gc_fraction(b"GCGAAT") - 0.5).abs() < 1e-9);
}
#[test]
fn test_multiplier_at_50_percent() {
let model = GcBiasModel::default();
let m = model.coverage_multiplier(0.5);
assert!((m - 1.0).abs() < 1e-6, "expected ~1.0 at 50% GC, got {m}");
}
#[test]
fn test_multiplier_at_extremes() {
let model = GcBiasModel::default();
let m_10 = model.coverage_multiplier(0.1);
let m_90 = model.coverage_multiplier(0.9);
assert!(m_10 < 0.5, "10% GC: expected < 0.5, got {m_10}");
assert!(m_90 < 0.5, "90% GC: expected < 0.5, got {m_90}");
assert!(
(m_10 - m_90).abs() < 1e-9,
"bias should be symmetric: m_10={m_10}, m_90={m_90}"
);
}
#[test]
fn test_flat_model() {
let cfg = GcBiasConfig {
enabled: true,
model: GcBiasModelKind::Flat,
severity: 1.0,
};
let model = GcBiasModel::from_config(&cfg);
for gc in [0.0, 0.1, 0.2, 0.5, 0.8, 0.9, 1.0] {
let m = model.coverage_multiplier(gc);
assert!(
(m - 1.0).abs() < 1e-9,
"flat model: expected 1.0 at GC={gc}, got {m}"
);
}
}
#[test]
fn test_severity_scaling() {
let cfg_low = GcBiasConfig {
enabled: true,
model: GcBiasModelKind::Default,
severity: 0.5,
};
let cfg_high = GcBiasConfig {
enabled: true,
model: GcBiasModelKind::Default,
severity: 2.0,
};
let model_low = GcBiasModel::from_config(&cfg_low);
let model_high = GcBiasModel::from_config(&cfg_high);
let m_low = model_low.coverage_multiplier(0.1);
let m_high = model_high.coverage_multiplier(0.1);
assert!(
m_high < m_low,
"higher severity should reduce multiplier further: low={m_low}, high={m_high}"
);
let cfg_zero = GcBiasConfig {
enabled: true,
model: GcBiasModelKind::Default,
severity: 0.0,
};
let model_zero = GcBiasModel::from_config(&cfg_zero);
assert!(
(model_zero.coverage_multiplier(0.1) - 1.0).abs() < 1e-9,
"severity=0 should give multiplier 1.0"
);
}
#[test]
fn test_coverage_distribution() {
use rand::rngs::StdRng;
use rand::Rng;
use rand::SeedableRng;
let model = GcBiasModel::default();
let mut rng = StdRng::seed_from_u64(42);
let n = 10_000usize;
let mut kept_at_rich = 0usize;
let mut kept_neutral = 0usize;
for _ in 0..n {
let m = model.coverage_multiplier(0.1);
if rng.random::<f64>() < m {
kept_at_rich += 1;
}
let m = model.coverage_multiplier(0.5);
if rng.random::<f64>() < m {
kept_neutral += 1;
}
}
assert!(
kept_neutral > n * 99 / 100,
"expected >99% retention at 50% GC, got {kept_neutral}/{n}"
);
assert!(
kept_at_rich < n / 2,
"expected <50% retention at 10% GC, got {kept_at_rich}/{n}"
);
assert!(
kept_at_rich < kept_neutral,
"AT-rich should have fewer reads than GC-neutral"
);
}
#[test]
fn test_gc_fraction_n_bases() {
assert!(
(GcBiasModel::gc_fraction(b"NNACGT") - 0.5).abs() < 1e-9,
"N bases should be excluded"
);
assert!(
(GcBiasModel::gc_fraction(b"NNNN") - 0.5).abs() < 1e-9,
"all-N sequence should return 0.5"
);
assert!(
(GcBiasModel::gc_fraction(b"NGCNN") - 1.0).abs() < 1e-9,
"N bases should not count toward denominator"
);
}
}