use std::collections::HashMap;
use std::path::Path;
use anyhow::{Context, Result};
use rand::Rng;
use serde::{Deserialize, Serialize};
use crate::core::quality::{ParametricQualityModel, QualityModel};
#[derive(Debug, Deserialize, Serialize)]
pub struct ProfileJson {
pub platform: Option<String>,
pub read_length: usize,
pub quality_distribution: QualityDistributionJson,
#[serde(default)]
pub substitution_matrix: HashMap<String, f64>,
#[serde(default)]
pub context_effects: HashMap<String, ContextEffectJson>,
}
#[derive(Debug, Deserialize, Serialize)]
pub struct QualityDistributionJson {
pub read1: Vec<Vec<[f64; 2]>>,
#[serde(default)]
pub read2: Option<Vec<Vec<[f64; 2]>>>,
}
#[derive(Debug, Deserialize, Serialize)]
pub struct ContextEffectJson {
pub quality_penalty: u8,
}
#[derive(Debug, Clone)]
pub struct ContextEffect {
pub quality_penalty: u8,
}
pub struct EmpiricalQualityModel {
quality_distributions: Vec<Vec<(u8, f64)>>,
substitution_matrix: HashMap<(u8, u8), f64>,
context_effects: HashMap<Vec<u8>, ContextEffect>,
#[allow(dead_code)]
pub platform: Option<String>,
}
impl EmpiricalQualityModel {
pub fn from_file(path: &Path) -> Result<Self> {
let contents = std::fs::read_to_string(path)
.with_context(|| format!("cannot read profile: {}", path.display()))?;
Self::from_json_str(&contents)
}
pub fn from_json_str(json: &str) -> Result<Self> {
let profile: ProfileJson =
serde_json::from_str(json).context("failed to parse error profile JSON")?;
Self::from_profile_json(profile)
}
fn from_profile_json(profile: ProfileJson) -> Result<Self> {
let raw_read1 = &profile.quality_distribution.read1;
anyhow::ensure!(!raw_read1.is_empty(), "quality_distribution.read1 is empty");
let quality_distributions = Self::build_distributions(raw_read1)?;
let mut substitution_matrix: HashMap<(u8, u8), f64> =
HashMap::with_capacity(profile.substitution_matrix.len());
for (key, prob) in &profile.substitution_matrix {
let (from, to) = Self::parse_substitution_key(key)?;
substitution_matrix.insert((from, to), *prob);
}
let mut context_effects: HashMap<Vec<u8>, ContextEffect> =
HashMap::with_capacity(profile.context_effects.len());
for (ctx_str, effect_json) in profile.context_effects {
context_effects.insert(
ctx_str.into_bytes(),
ContextEffect {
quality_penalty: effect_json.quality_penalty,
},
);
}
Ok(Self {
quality_distributions,
substitution_matrix,
context_effects,
platform: profile.platform,
})
}
fn build_distributions(raw: &[Vec<[f64; 2]>]) -> Result<Vec<Vec<(u8, f64)>>> {
raw.iter()
.enumerate()
.map(|(pos, pairs)| {
anyhow::ensure!(
!pairs.is_empty(),
"quality_distribution entry at position {} is empty",
pos
);
let mut entries: Vec<(u8, f64)> =
pairs.iter().map(|pair| (pair[0] as u8, pair[1])).collect();
let total: f64 = entries.iter().map(|(_, w)| w).sum();
anyhow::ensure!(
total > 0.0,
"quality distribution weights must sum to > 0 at position {}",
pos
);
let mut cumulative = 0.0_f64;
for (_, w) in &mut entries {
cumulative += *w / total;
*w = cumulative;
}
if let Some(last) = entries.last_mut() {
last.1 = 1.0;
}
Ok(entries)
})
.collect()
}
fn parse_substitution_key(key: &str) -> Result<(u8, u8)> {
let bytes = key.as_bytes();
anyhow::ensure!(
bytes.len() == 3 && bytes[1] == b'>',
"substitution key must be in 'X>Y' format, got: {}",
key
);
Ok((bytes[0], bytes[2]))
}
fn sample_quality_at<R: Rng>(&self, pos: usize, rng: &mut R) -> u8 {
let table_pos = pos.min(self.quality_distributions.len() - 1);
let cdf = &self.quality_distributions[table_pos];
let r: f64 = rng.random();
for &(q, cum) in cdf {
if r <= cum {
return q;
}
}
cdf.last().map(|&(q, _)| q).unwrap_or(30)
}
fn total_substitution_prob(&self, from: u8) -> f64 {
const BASES: [u8; 4] = [b'A', b'C', b'G', b'T'];
BASES
.iter()
.filter(|&&b| b != from)
.map(|&to| {
self.substitution_matrix
.get(&(from, to))
.copied()
.unwrap_or(0.0)
})
.sum()
}
fn sample_substitution<R: Rng>(&self, from: u8, rng: &mut R) -> u8 {
const BASES: [u8; 4] = [b'A', b'C', b'G', b'T'];
let total = self.total_substitution_prob(from);
if total <= 0.0 {
return from;
}
let r: f64 = rng.random::<f64>() * total;
let mut cumulative = 0.0;
for &to in BASES.iter().filter(|&&b| b != from) {
cumulative += self
.substitution_matrix
.get(&(from, to))
.copied()
.unwrap_or(0.0);
if r <= cumulative {
return to;
}
}
*BASES.iter().find(|&&b| b != from).unwrap_or(&b'A')
}
pub fn apply_context_effects(&self, seq: &[u8], qualities: &mut [u8]) {
for (ctx_bytes, effect) in &self.context_effects {
let ctx_len = ctx_bytes.len();
for end in ctx_len..=seq.len() {
let window = &seq[end - ctx_len..end];
if window == ctx_bytes.as_slice() && end < qualities.len() {
qualities[end] = qualities[end].saturating_sub(effect.quality_penalty).max(2);
}
}
}
}
}
impl QualityModel for EmpiricalQualityModel {
fn generate_qualities<R: Rng>(&self, read_length: usize, rng: &mut R) -> Vec<u8> {
(0..read_length)
.map(|pos| self.sample_quality_at(pos, rng))
.collect()
}
fn inject_errors<R: Rng>(&self, sequence: &mut [u8], qualities: &[u8], rng: &mut R) {
let mut effective = qualities.to_vec();
self.apply_context_effects(sequence, &mut effective);
for i in 0..sequence.len() {
let from = sequence[i];
let error_prob = ParametricQualityModel::error_probability(effective[i]);
if rng.random::<f64>() < error_prob {
let total_sub = self.total_substitution_prob(from);
if total_sub > 0.0 {
sequence[i] = self.sample_substitution(from, rng);
} else {
const BASES: [u8; 4] = [b'A', b'C', b'G', b'T'];
loop {
let new_base = BASES[rng.random_range(0..4)];
if new_base != from {
sequence[i] = new_base;
break;
}
}
}
}
}
}
}
#[cfg(test)]
pub fn load_from_config(profile_path: Option<&Path>) -> Result<Option<EmpiricalQualityModel>> {
match profile_path {
Some(path) => Ok(Some(EmpiricalQualityModel::from_file(path)?)),
None => Ok(None),
}
}
#[cfg(test)]
mod tests {
use super::*;
use rand::rngs::StdRng;
use rand::SeedableRng;
use crate::core::quality::{ParametricQualityModel, QualityModel};
fn minimal_profile_json() -> &'static str {
r#"{
"platform": "NovaSeq 6000",
"read_length": 150,
"quality_distribution": {
"read1": [
[[37, 0.8], [36, 0.1], [35, 0.1]],
[[36, 0.7], [35, 0.2], [34, 0.1]],
[[35, 0.6], [34, 0.3], [33, 0.1]]
]
},
"substitution_matrix": {
"A>C": 0.001,
"A>G": 0.003,
"A>T": 0.0005,
"C>A": 0.001,
"C>G": 0.0005,
"C>T": 0.004,
"G>A": 0.003,
"G>C": 0.0005,
"G>T": 0.001,
"T>A": 0.0005,
"T>C": 0.004,
"T>G": 0.001
},
"context_effects": {
"GGG": { "quality_penalty": 5 }
}
}"#
}
#[test]
fn test_load_profile() {
let model = EmpiricalQualityModel::from_json_str(minimal_profile_json())
.expect("should parse valid profile");
assert_eq!(model.platform.as_deref(), Some("NovaSeq 6000"));
assert_eq!(model.quality_distributions.len(), 3);
assert!(!model.substitution_matrix.is_empty());
assert!(model.context_effects.contains_key(b"GGG".as_ref()));
}
#[test]
fn test_invalid_profile() {
let bad_json = r#"{ "platform": "test", "quality_distribution": { "read1": [] } }"#;
assert!(
EmpiricalQualityModel::from_json_str(bad_json).is_err(),
"should fail on missing read_length"
);
let empty_read1 = r#"{
"platform": "test",
"read_length": 100,
"quality_distribution": { "read1": [] }
}"#;
assert!(
EmpiricalQualityModel::from_json_str(empty_read1).is_err(),
"should fail on empty read1"
);
assert!(EmpiricalQualityModel::from_json_str("not json").is_err());
}
#[test]
fn test_empirical_quality_distribution() {
let json = r#"{
"platform": "test",
"read_length": 1,
"quality_distribution": {
"read1": [
[[30, 0.5], [20, 0.3], [10, 0.2]]
]
}
}"#;
let model = EmpiricalQualityModel::from_json_str(json).unwrap();
let mut rng = StdRng::seed_from_u64(42);
let n = 10_000usize;
let mut counts: HashMap<u8, usize> = HashMap::new();
for _ in 0..n {
let quals = model.generate_qualities(1, &mut rng);
*counts.entry(quals[0]).or_insert(0) += 1;
}
let freq_30 = *counts.get(&30).unwrap_or(&0) as f64 / n as f64;
let freq_20 = *counts.get(&20).unwrap_or(&0) as f64 / n as f64;
let freq_10 = *counts.get(&10).unwrap_or(&0) as f64 / n as f64;
assert!(
(freq_30 - 0.50).abs() < 0.04,
"Q30 frequency {freq_30} not close to 0.50"
);
assert!(
(freq_20 - 0.30).abs() < 0.04,
"Q20 frequency {freq_20} not close to 0.30"
);
assert!(
(freq_10 - 0.20).abs() < 0.04,
"Q10 frequency {freq_10} not close to 0.20"
);
}
#[test]
fn test_substitution_matrix() {
let json = r#"{
"platform": "test",
"read_length": 1,
"quality_distribution": {
"read1": [
[[10, 1.0]]
]
},
"substitution_matrix": {
"C>T": 0.08,
"C>A": 0.02
}
}"#;
let model = EmpiricalQualityModel::from_json_str(json).unwrap();
let mut rng = StdRng::seed_from_u64(99);
let n = 100_000usize;
let mut ct_count = 0usize;
let mut ca_count = 0usize;
for _ in 0..n {
let mut seq = vec![b'C'];
let quals = vec![10u8];
model.inject_errors(&mut seq, &quals, &mut rng);
match seq[0] {
b'T' => ct_count += 1,
b'A' => ca_count += 1,
_ => {}
}
}
let total_errors = ct_count + ca_count;
assert!(total_errors > 0, "expected some errors at Q10");
let ct_fraction = ct_count as f64 / total_errors as f64;
assert!(
(ct_fraction - 0.80).abs() < 0.05,
"C>T fraction {ct_fraction} not close to 0.80"
);
}
#[test]
fn test_context_effects() {
let json = r#"{
"platform": "test",
"read_length": 4,
"quality_distribution": {
"read1": [
[[30, 1.0]],
[[30, 1.0]],
[[30, 1.0]],
[[30, 1.0]]
]
},
"context_effects": {
"GGG": { "quality_penalty": 5 }
}
}"#;
let model = EmpiricalQualityModel::from_json_str(json).unwrap();
let mut rng = StdRng::seed_from_u64(7);
let seq = b"GGGA";
let mut qualities = model.generate_qualities(4, &mut rng);
assert!(
qualities.iter().all(|&q| q == 30),
"all should start at Q30"
);
model.apply_context_effects(seq, &mut qualities);
assert_eq!(
qualities[3], 25,
"expected Q25 after GGG context, got {}",
qualities[3]
);
assert_eq!(qualities[0], 30);
assert_eq!(qualities[1], 30);
assert_eq!(qualities[2], 30);
}
#[test]
fn test_trait_compatibility() {
fn use_model<M: QualityModel>(model: &M, rng: &mut StdRng) {
let quals = model.generate_qualities(50, rng);
assert_eq!(quals.len(), 50);
let mut seq = vec![b'A'; 50];
model.inject_errors(&mut seq, &quals, rng);
for &b in &seq {
assert!(matches!(b, b'A' | b'C' | b'G' | b'T'));
}
}
let mut rng = StdRng::seed_from_u64(1);
let parametric = ParametricQualityModel::new(36, 0.003);
use_model(¶metric, &mut rng);
let empirical = EmpiricalQualityModel::from_json_str(minimal_profile_json()).unwrap();
use_model(&empirical, &mut rng);
}
#[test]
fn test_fallback_to_parametric() {
let result = load_from_config(None).expect("should not error with no path");
assert!(
result.is_none(),
"expected None when no profile path is set"
);
let model = ParametricQualityModel::new(36, 0.003);
let mut rng = StdRng::seed_from_u64(42);
let quals = model.generate_qualities(100, &mut rng);
assert_eq!(quals.len(), 100);
let mut seq = vec![b'A'; 100];
model.inject_errors(&mut seq, &quals, &mut rng);
for &b in &seq {
assert!(matches!(b, b'A' | b'C' | b'G' | b'T'));
}
}
}