Skip to main content

holodeck_lib/error_model/
illumina.rs

1use super::{ErrorModel, ReadEnd};
2
3/// Fraction of read length at which the quality decay ramp begins.
4const DECAY_START_FRACTION: f64 = 0.7;
5
6/// Default R2 error rate multiplier relative to R1.
7const DEFAULT_R2_MULTIPLIER: f64 = 1.5;
8
9/// Precomputed per-cycle error probability and base quality score.
10#[derive(Debug, Clone, Copy)]
11struct CycleParams {
12    /// Error probability at this cycle.
13    p_err: f64,
14    /// Base quality score (before noise): Q = -10*log10(p_err), Phred+33.
15    base_qual: u8,
16}
17
18/// Illumina-style position-dependent error model.
19///
20/// Error rate starts at `min_error_rate` for early cycles and linearly ramps
21/// to `max_error_rate` by the last cycle. The ramp begins at approximately
22/// 70% of the read length, mimicking the quality decay caused by phasing and
23/// dephasing in Illumina sequencing.
24///
25/// Read 2 has a higher error rate than read 1 (controlled by
26/// `r2_multiplier`), reflecting the typically worse quality of the second
27/// read in a pair.
28///
29/// Error probabilities and base quality scores are precomputed at
30/// construction time for each cycle and read end, so the per-base hot path
31/// is a table lookup rather than repeated floating-point math.
32pub struct IlluminaErrorModel {
33    /// Read length this model is configured for.
34    read_length: usize,
35    /// Error rate at the start of reads (e.g. 0.001).
36    min_error_rate: f64,
37    /// Maximum error rate at the end of reads (e.g. 0.01).
38    max_error_rate: f64,
39    /// Multiplier applied to the entire R2 error curve (e.g. 1.5).
40    r2_multiplier: f64,
41    /// Cycle position where the error ramp begins (computed from read_length).
42    decay_start: usize,
43    /// Precomputed per-cycle parameters for R1.
44    r1_params: Vec<CycleParams>,
45    /// Precomputed per-cycle parameters for R2.
46    r2_params: Vec<CycleParams>,
47}
48
49impl IlluminaErrorModel {
50    /// Create a new Illumina error model with default R2 multiplier (1.5x).
51    ///
52    /// # Arguments
53    /// * `read_length` — Number of cycles (bases) per read.
54    /// * `min_error_rate` — Per-base error rate at the start of reads.
55    /// * `max_error_rate` — Per-base error rate at the end of reads.
56    ///
57    /// # Panics
58    /// Panics if `read_length` is 0 or if `min_error_rate > max_error_rate`.
59    #[must_use]
60    pub fn new(read_length: usize, min_error_rate: f64, max_error_rate: f64) -> Self {
61        Self::with_r2_multiplier(read_length, min_error_rate, max_error_rate, DEFAULT_R2_MULTIPLIER)
62    }
63
64    /// Create a new Illumina error model with a custom R2 multiplier.
65    ///
66    /// The `r2_multiplier` scales the entire error curve for read 2 relative
67    /// to read 1. A value of 1.0 means R1 and R2 have identical error rates.
68    ///
69    /// # Panics
70    /// Panics if `read_length` is 0 or if `min_error_rate > max_error_rate`.
71    #[must_use]
72    pub fn with_r2_multiplier(
73        read_length: usize,
74        min_error_rate: f64,
75        max_error_rate: f64,
76        r2_multiplier: f64,
77    ) -> Self {
78        assert!(read_length > 0, "read_length must be > 0");
79        assert!(
80            min_error_rate <= max_error_rate,
81            "min_error_rate ({min_error_rate}) must be <= max_error_rate ({max_error_rate})"
82        );
83
84        #[expect(clippy::cast_possible_truncation, reason = "product is bounded by read_length")]
85        #[expect(clippy::cast_sign_loss, reason = "product is non-negative")]
86        let decay_start = (read_length as f64 * DECAY_START_FRACTION).round() as usize;
87
88        let mut model = Self {
89            read_length,
90            min_error_rate,
91            max_error_rate,
92            r2_multiplier,
93            decay_start,
94            r1_params: Vec::new(),
95            r2_params: Vec::new(),
96        };
97
98        // Precompute per-cycle error probabilities and base quality scores.
99        model.r1_params =
100            (0..read_length).map(|c| model.compute_cycle_params(c, ReadEnd::Read1)).collect();
101        model.r2_params =
102            (0..read_length).map(|c| model.compute_cycle_params(c, ReadEnd::Read2)).collect();
103
104        model
105    }
106}
107
108impl IlluminaErrorModel {
109    /// Compute error probability for a given cycle using the ramp model.
110    fn compute_error_probability(&self, cycle: usize, read_end: ReadEnd) -> f64 {
111        let base_rate = if cycle < self.decay_start {
112            self.min_error_rate
113        } else {
114            let ramp_len = self.read_length.saturating_sub(self.decay_start).max(1);
115            let progress = (cycle - self.decay_start) as f64 / ramp_len as f64;
116            self.min_error_rate + (self.max_error_rate - self.min_error_rate) * progress.min(1.0)
117        };
118
119        match read_end {
120            ReadEnd::Read1 => base_rate,
121            ReadEnd::Read2 => (base_rate * self.r2_multiplier).min(1.0),
122        }
123    }
124
125    /// Precompute the error probability and base quality for a cycle.
126    fn compute_cycle_params(&self, cycle: usize, read_end: ReadEnd) -> CycleParams {
127        let p_err = self.compute_error_probability(cycle, read_end);
128
129        #[expect(clippy::cast_possible_truncation, reason = "clamped to [2, 41]")]
130        #[expect(clippy::cast_sign_loss, reason = "clamped to [2, 41]")]
131        let base_qual = if p_err > 0.0 {
132            (-10.0 * p_err.log10()).round().clamp(2.0, 41.0) as u8 + 33
133        } else {
134            41 + 33
135        };
136
137        CycleParams { p_err, base_qual }
138    }
139
140    /// Return the precomputed cycle parameters for the given cycle and read end.
141    #[inline]
142    fn cycle_params(&self, cycle: usize, read_end: ReadEnd) -> CycleParams {
143        match read_end {
144            ReadEnd::Read1 => self.r1_params[cycle],
145            ReadEnd::Read2 => self.r2_params[cycle],
146        }
147    }
148}
149
150impl ErrorModel for IlluminaErrorModel {
151    fn error_probability(&self, cycle: usize, read_end: ReadEnd) -> f64 {
152        self.cycle_params(cycle, read_end).p_err
153    }
154
155    fn base_quality_phred33(&self, cycle: usize, read_end: ReadEnd) -> u8 {
156        self.cycle_params(cycle, read_end).base_qual
157    }
158}
159
160#[cfg(test)]
161mod tests {
162    use super::*;
163
164    #[test]
165    fn test_error_rate_at_start() {
166        let model = IlluminaErrorModel::new(150, 0.001, 0.01);
167        let p = model.error_probability(0, ReadEnd::Read1);
168        assert!((p - 0.001).abs() < 1e-10);
169    }
170
171    #[test]
172    fn test_error_rate_flat_before_decay() {
173        let model = IlluminaErrorModel::new(150, 0.001, 0.01);
174        // decay_start ≈ 105 for read_length 150
175        for cycle in 0..100 {
176            let p = model.error_probability(cycle, ReadEnd::Read1);
177            assert!((p - 0.001).abs() < 1e-10, "cycle {cycle}: expected 0.001, got {p}");
178        }
179    }
180
181    #[test]
182    fn test_error_rate_ramps_to_max() {
183        let model = IlluminaErrorModel::new(150, 0.001, 0.01);
184        let p = model.error_probability(149, ReadEnd::Read1);
185        assert!((p - 0.01).abs() < 0.001, "expected ~0.01 at last cycle, got {p}");
186    }
187
188    #[test]
189    fn test_error_rate_increases_monotonically() {
190        let model = IlluminaErrorModel::new(150, 0.001, 0.01);
191        let mut prev = 0.0;
192        for cycle in 0..150 {
193            let p = model.error_probability(cycle, ReadEnd::Read1);
194            assert!(p >= prev, "cycle {cycle}: {p} < {prev}");
195            prev = p;
196        }
197    }
198
199    #[test]
200    fn test_r2_higher_than_r1() {
201        let model = IlluminaErrorModel::new(150, 0.001, 0.01);
202        for cycle in [0, 50, 100, 149] {
203            let p1 = model.error_probability(cycle, ReadEnd::Read1);
204            let p2 = model.error_probability(cycle, ReadEnd::Read2);
205            assert!(p2 > p1, "cycle {cycle}: R2 ({p2}) should be > R1 ({p1})");
206        }
207    }
208
209    #[test]
210    fn test_r2_multiplier() {
211        let model = IlluminaErrorModel::new(150, 0.001, 0.01);
212        let p1 = model.error_probability(0, ReadEnd::Read1);
213        let p2 = model.error_probability(0, ReadEnd::Read2);
214        assert!((p2 - p1 * 1.5).abs() < 1e-10, "R2 should be 1.5x R1 at start");
215    }
216
217    #[test]
218    fn test_apply_errors_reproducible() {
219        use rand::SeedableRng;
220        use rand::rngs::SmallRng;
221
222        let model = IlluminaErrorModel::new(10, 0.1, 0.3);
223        let mut bases1 = b"ACGTACGTAC".to_vec();
224        let mut bases2 = b"ACGTACGTAC".to_vec();
225
226        let mut rng1 = SmallRng::seed_from_u64(42);
227        let mut rng2 = SmallRng::seed_from_u64(42);
228
229        let (n1, q1) =
230            crate::error_model::apply_errors(&model, &mut bases1, ReadEnd::Read1, &mut rng1);
231        let (n2, q2) =
232            crate::error_model::apply_errors(&model, &mut bases2, ReadEnd::Read1, &mut rng2);
233
234        assert_eq!(bases1, bases2);
235        assert_eq!(q1, q2);
236        assert_eq!(n1, n2);
237    }
238
239    #[test]
240    fn test_apply_errors_introduces_errors() {
241        use rand::SeedableRng;
242        use rand::rngs::SmallRng;
243
244        let model = IlluminaErrorModel::new(100, 0.5, 0.5); // High error rate
245        let original = vec![b'A'; 100];
246        let mut bases = original.clone();
247        let mut rng = SmallRng::seed_from_u64(123);
248
249        let (n_errors, qualities) =
250            crate::error_model::apply_errors(&model, &mut bases, ReadEnd::Read1, &mut rng);
251
252        assert!(n_errors > 10, "expected many errors with 50% rate, got {n_errors}");
253        assert_eq!(qualities.len(), 100);
254        // Some bases should have changed.
255        assert_ne!(bases, original);
256    }
257
258    #[test]
259    fn test_apply_errors_quality_encoding() {
260        use rand::SeedableRng;
261        use rand::rngs::SmallRng;
262
263        let model = IlluminaErrorModel::new(10, 0.001, 0.01);
264        let mut bases = b"ACGTACGTAC".to_vec();
265        let mut rng = SmallRng::seed_from_u64(99);
266
267        let (_, qualities) =
268            crate::error_model::apply_errors(&model, &mut bases, ReadEnd::Read1, &mut rng);
269
270        for &q in &qualities {
271            // Phred+33: quality 2..=41 maps to ASCII 35..=74
272            assert!(q >= 35, "quality {q} below minimum Phred+33 (35)");
273            assert!(q <= 74, "quality {q} above maximum Phred+33 (74)");
274        }
275    }
276
277    #[test]
278    fn test_apply_errors_zero_rate() {
279        use rand::SeedableRng;
280        use rand::rngs::SmallRng;
281
282        let model = IlluminaErrorModel::new(10, 0.0, 0.0);
283        let original = b"ACGTACGTAC".to_vec();
284        let mut bases = original.clone();
285        let mut rng = SmallRng::seed_from_u64(42);
286
287        let (n_errors, qualities) =
288            crate::error_model::apply_errors(&model, &mut bases, ReadEnd::Read1, &mut rng);
289
290        assert_eq!(n_errors, 0, "zero error rate should produce no errors");
291        assert_eq!(bases, original, "bases should be unchanged");
292        assert_eq!(qualities.len(), 10);
293    }
294
295    #[test]
296    #[should_panic(expected = "read_length must be > 0")]
297    fn test_zero_read_length_panics() {
298        let _ = IlluminaErrorModel::new(0, 0.001, 0.01);
299    }
300
301    #[test]
302    #[should_panic(expected = "min_error_rate")]
303    fn test_min_gt_max_panics() {
304        let _ = IlluminaErrorModel::new(150, 0.1, 0.01);
305    }
306}