holodeck_lib/error_model/mod.rs
1//! Sequencing error models for realistic quality score and error simulation.
2//!
3//! The [`ErrorModel`] trait defines the interface for position-dependent error
4//! models. The [`apply_errors`] free function applies any error model to a
5//! read's bases, introducing substitution errors and generating quality scores.
6
7pub mod illumina;
8
9use rand::Rng;
10
11/// Which read in a pair this is.
12#[derive(Debug, Clone, Copy, PartialEq, Eq)]
13pub enum ReadEnd {
14 /// First read in a pair (or the only read for single-end).
15 Read1,
16 /// Second read in a pair.
17 Read2,
18}
19
20/// Trait for position-dependent sequencing error models.
21///
22/// Implementors produce an error probability for each cycle position,
23/// allowing realistic simulation of position-dependent error rates.
24/// The trait is the extensibility point for future platform support
25/// (Element, Ultima, etc.).
26pub trait ErrorModel {
27 /// Return the probability of a sequencing error at the given cycle
28 /// position (0-based) for the given read end.
29 fn error_probability(&self, cycle: usize, read_end: ReadEnd) -> f64;
30
31 /// Return the Phred+33 base quality score for the given cycle and read
32 /// end (before per-base noise).
33 ///
34 /// The default implementation computes `Q = -10*log10(p_err)` clamped to
35 /// [2, 41] and adds the Phred+33 offset. Implementors may override this
36 /// with a precomputed lookup for speed.
37 fn base_quality_phred33(&self, cycle: usize, read_end: ReadEnd) -> u8 {
38 let p_err = self.error_probability(cycle, read_end);
39 #[expect(clippy::cast_possible_truncation, reason = "clamped to [2, 41]")]
40 #[expect(clippy::cast_sign_loss, reason = "clamped to [2, 41]")]
41 if p_err > 0.0 {
42 (-10.0 * p_err.log10()).round().clamp(2.0, 41.0) as u8 + 33
43 } else {
44 41 + 33
45 }
46 }
47}
48
49/// Apply sequencing errors to a read's bases in place using the given error
50/// model, returning the number of errors and quality scores (Phred+33).
51///
52/// For each base position:
53/// 1. Look up error probability and base quality from the model
54/// 2. If a random draw < error probability, substitute a random different base
55/// 3. Assign a quality score by adding noise to the base quality
56///
57/// Returns `(n_errors, quality_scores)`.
58pub fn apply_errors(
59 model: &impl ErrorModel,
60 bases: &mut [u8],
61 read_end: ReadEnd,
62 rng: &mut impl Rng,
63) -> (usize, Vec<u8>) {
64 let mut qualities = Vec::with_capacity(bases.len());
65 let mut n_errors = 0;
66
67 for (cycle, base) in bases.iter_mut().enumerate() {
68 let p_err = model.error_probability(cycle, read_end);
69
70 // Possibly introduce an error.
71 if rng.random::<f64>() < p_err {
72 *base = random_different_base(*base, rng);
73 n_errors += 1;
74 }
75
76 // Look up precomputed base quality and add per-base noise.
77 let base_qual = model.base_quality_phred33(cycle, read_end);
78 let noise: f64 = rng.random_range(-2.0..2.0);
79 #[expect(clippy::cast_possible_truncation, reason = "clamped to [35, 74]")]
80 #[expect(clippy::cast_sign_loss, reason = "clamped to [35, 74]")]
81 let q_noisy = (f64::from(base_qual) + noise).round().clamp(35.0, 74.0) as u8;
82
83 qualities.push(q_noisy);
84 }
85
86 (n_errors, qualities)
87}
88
89/// Substitute a base with a uniformly random different base.
90///
91/// Given a DNA base (A, C, G, T), returns one of the other three bases
92/// chosen uniformly at random using a single RNG draw and a lookup table.
93fn random_different_base(base: u8, rng: &mut impl Rng) -> u8 {
94 // For each input base, the three possible substitute bases.
95 const ALTS: [u8; 12] = [
96 b'C', b'G', b'T', // A -> C, G, T
97 b'A', b'G', b'T', // C -> A, G, T
98 b'A', b'C', b'T', // G -> A, C, T
99 b'A', b'C', b'G', // T -> A, C, G
100 ];
101
102 // Map base to row offset: A=0, C=3, G=6, T=9.
103 let row = match base {
104 b'A' => 0,
105 b'C' => 3,
106 b'G' => 6,
107 _ => 9, // T or any other base
108 };
109
110 ALTS[row + rng.random_range(0..3usize)]
111}
112
113#[cfg(test)]
114mod tests {
115 use super::*;
116
117 #[test]
118 fn test_random_different_base_never_same() {
119 let mut rng = rand::rng();
120 for _ in 0..1000 {
121 let original = b'A';
122 let result = random_different_base(original, &mut rng);
123 assert_ne!(result, original);
124 assert!(matches!(result, b'C' | b'G' | b'T'));
125 }
126 }
127
128 #[test]
129 fn test_random_different_base_all_bases() {
130 let mut rng = rand::rng();
131 for base in [b'A', b'C', b'G', b'T'] {
132 let result = random_different_base(base, &mut rng);
133 assert_ne!(result, base);
134 }
135 }
136}