Skip to main content

holodeck_lib/error_model/
mod.rs

1//! Sequencing error models for realistic quality score and error simulation.
2//!
3//! The [`ErrorModel`] trait defines the interface for position-dependent error
4//! models. The [`apply_errors`] free function applies any error model to a
5//! read's bases, introducing substitution errors and generating quality scores.
6
7pub mod illumina;
8
9use rand::Rng;
10
11/// Which read in a pair this is.
12#[derive(Debug, Clone, Copy, PartialEq, Eq)]
13pub enum ReadEnd {
14    /// First read in a pair (or the only read for single-end).
15    Read1,
16    /// Second read in a pair.
17    Read2,
18}
19
20/// Trait for position-dependent sequencing error models.
21///
22/// Implementors produce an error probability for each cycle position,
23/// allowing realistic simulation of position-dependent error rates.
24/// The trait is the extensibility point for future platform support
25/// (Element, Ultima, etc.).
26pub trait ErrorModel {
27    /// Return the probability of a sequencing error at the given cycle
28    /// position (0-based) for the given read end.
29    fn error_probability(&self, cycle: usize, read_end: ReadEnd) -> f64;
30
31    /// Return the Phred+33 base quality score for the given cycle and read
32    /// end (before per-base noise).
33    ///
34    /// The default implementation computes `Q = -10*log10(p_err)` clamped to
35    /// [2, 41] and adds the Phred+33 offset.  Implementors may override this
36    /// with a precomputed lookup for speed.
37    fn base_quality_phred33(&self, cycle: usize, read_end: ReadEnd) -> u8 {
38        let p_err = self.error_probability(cycle, read_end);
39        #[expect(clippy::cast_possible_truncation, reason = "clamped to [2, 41]")]
40        #[expect(clippy::cast_sign_loss, reason = "clamped to [2, 41]")]
41        if p_err > 0.0 {
42            (-10.0 * p_err.log10()).round().clamp(2.0, 41.0) as u8 + 33
43        } else {
44            41 + 33
45        }
46    }
47}
48
49/// Apply sequencing errors to a read's bases in place using the given error
50/// model, returning the number of errors and quality scores (Phred+33).
51///
52/// For each base position:
53/// 1. Look up error probability and base quality from the model
54/// 2. If a random draw < error probability, substitute a random different base
55/// 3. Assign a quality score by adding noise to the base quality
56///
57/// Returns `(n_errors, quality_scores)`.
58pub fn apply_errors(
59    model: &impl ErrorModel,
60    bases: &mut [u8],
61    read_end: ReadEnd,
62    rng: &mut impl Rng,
63) -> (usize, Vec<u8>) {
64    let mut qualities = Vec::with_capacity(bases.len());
65    let mut n_errors = 0;
66
67    for (cycle, base) in bases.iter_mut().enumerate() {
68        let p_err = model.error_probability(cycle, read_end);
69
70        // Possibly introduce an error.
71        if rng.random::<f64>() < p_err {
72            *base = random_different_base(*base, rng);
73            n_errors += 1;
74        }
75
76        // Look up precomputed base quality and add per-base noise.
77        let base_qual = model.base_quality_phred33(cycle, read_end);
78        let noise: f64 = rng.random_range(-2.0..2.0);
79        #[expect(clippy::cast_possible_truncation, reason = "clamped to [35, 74]")]
80        #[expect(clippy::cast_sign_loss, reason = "clamped to [35, 74]")]
81        let q_noisy = (f64::from(base_qual) + noise).round().clamp(35.0, 74.0) as u8;
82
83        qualities.push(q_noisy);
84    }
85
86    (n_errors, qualities)
87}
88
89/// Substitute a base with a uniformly random different base.
90///
91/// Given a DNA base (A, C, G, T), returns one of the other three bases
92/// chosen uniformly at random using a single RNG draw and a lookup table.
93fn random_different_base(base: u8, rng: &mut impl Rng) -> u8 {
94    // For each input base, the three possible substitute bases.
95    const ALTS: [u8; 12] = [
96        b'C', b'G', b'T', // A -> C, G, T
97        b'A', b'G', b'T', // C -> A, G, T
98        b'A', b'C', b'T', // G -> A, C, T
99        b'A', b'C', b'G', // T -> A, C, G
100    ];
101
102    // Map base to row offset: A=0, C=3, G=6, T=9.
103    let row = match base {
104        b'A' => 0,
105        b'C' => 3,
106        b'G' => 6,
107        _ => 9, // T or any other base
108    };
109
110    ALTS[row + rng.random_range(0..3usize)]
111}
112
113#[cfg(test)]
114mod tests {
115    use super::*;
116
117    #[test]
118    fn test_random_different_base_never_same() {
119        let mut rng = rand::rng();
120        for _ in 0..1000 {
121            let original = b'A';
122            let result = random_different_base(original, &mut rng);
123            assert_ne!(result, original);
124            assert!(matches!(result, b'C' | b'G' | b'T'));
125        }
126    }
127
128    #[test]
129    fn test_random_different_base_all_bases() {
130        let mut rng = rand::rng();
131        for base in [b'A', b'C', b'G', b'T'] {
132            let result = random_different_base(base, &mut rng);
133            assert_ne!(result, base);
134        }
135    }
136}