fgumi 0.2.0

High-performance tools for UMI-tagged sequencing data: extraction, grouping, and consensus calling
Documentation
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
//! Strand bias models for duplex sequencing simulation.
//!
//! Models the distribution of reads between A and B strands in duplex
//! sequencing, accounting for PCR amplification bias where one strand
//! may start amplifying in earlier rounds.

use rand::Rng;
use rand_distr::{Beta, Distribution};

/// Beta-distributed A/B strand ratio model for duplex sequencing.
///
/// In duplex sequencing, molecules are sequenced from both strands (A and B).
/// Due to PCR amplification dynamics, the ratio of A to B strand reads is
/// rarely exactly 50/50. This model uses a Beta distribution to simulate
/// realistic strand ratios.
///
/// # Parameters
///
/// - `alpha` and `beta` control the shape of the distribution
/// - When alpha == beta, the distribution is symmetric around 0.5
/// - Higher values concentrate the distribution closer to 0.5
/// - Lower values allow more extreme ratios
///
/// # Examples
///
/// ```
/// use fgumi_lib::simulate::StrandBiasModel;
/// use fgumi_lib::simulate::create_rng;
///
/// // Symmetric distribution centered at 0.5
/// let model = StrandBiasModel::default();
/// let mut rng = create_rng(Some(42));
///
/// // Split 10 reads between A and B strands
/// let (a_count, b_count) = model.split_reads(10, &mut rng);
/// assert_eq!(a_count + b_count, 10);
/// ```
#[derive(Debug, Clone)]
pub struct StrandBiasModel {
    /// Beta distribution alpha parameter
    pub alpha: f64,
    /// Beta distribution beta parameter
    pub beta: f64,
}

impl Default for StrandBiasModel {
    fn default() -> Self {
        // alpha=5, beta=5 gives a symmetric distribution
        // concentrated around 0.5 but with realistic variation
        Self { alpha: 5.0, beta: 5.0 }
    }
}

#[allow(clippy::cast_precision_loss, clippy::cast_possible_truncation)]
impl StrandBiasModel {
    /// Create a new strand bias model with custom parameters.
    ///
    /// # Arguments
    ///
    /// * `alpha` - Beta distribution alpha parameter (must be > 0)
    /// * `beta` - Beta distribution beta parameter (must be > 0)
    #[must_use]
    pub fn new(alpha: f64, beta: f64) -> Self {
        Self { alpha, beta }
    }

    /// Create a model with no strand bias (always 50/50 split).
    ///
    /// Uses very high alpha/beta values to concentrate near 0.5.
    #[must_use]
    pub fn no_bias() -> Self {
        Self { alpha: 1000.0, beta: 1000.0 }
    }

    /// Sample an A-strand fraction from the distribution.
    ///
    /// # Arguments
    ///
    /// * `rng` - Random number generator
    ///
    /// # Returns
    ///
    /// A value in [0, 1] representing the fraction of reads on the A strand.
    ///
    /// # Panics
    ///
    /// Panics if `alpha` or `beta` are invalid Beta distribution parameters.
    pub fn sample_a_fraction(&self, rng: &mut impl Rng) -> f64 {
        let dist = Beta::new(self.alpha, self.beta).expect("Invalid beta parameters");
        dist.sample(rng)
    }

    /// Split a total read count between A and B strands.
    ///
    /// # Arguments
    ///
    /// * `total` - Total number of reads to split
    /// * `rng` - Random number generator
    ///
    /// # Returns
    ///
    /// A tuple of (`a_count`, `b_count`) where `a_count` + `b_count` == total.
    #[allow(clippy::cast_sign_loss)]
    pub fn split_reads(&self, total: usize, rng: &mut impl Rng) -> (usize, usize) {
        if total == 0 {
            return (0, 0);
        }

        let a_fraction = self.sample_a_fraction(rng);
        let a_count = (total as f64 * a_fraction).round() as usize;
        let b_count = total.saturating_sub(a_count);

        (a_count, b_count)
    }

    /// Split a total read count, ensuring both strands have at least `min_per_strand` reads.
    ///
    /// If total < 2 * `min_per_strand`, the constraint cannot be satisfied and
    /// the split is done without the constraint.
    ///
    /// # Arguments
    ///
    /// * `total` - Total number of reads to split
    /// * `min_per_strand` - Minimum reads required per strand
    /// * `rng` - Random number generator
    ///
    /// # Returns
    ///
    /// A tuple of (`a_count`, `b_count`) where `a_count` + `b_count` == total.
    #[allow(clippy::cast_sign_loss)]
    pub fn split_reads_with_minimum(
        &self,
        total: usize,
        min_per_strand: usize,
        rng: &mut impl Rng,
    ) -> (usize, usize) {
        if total < 2 * min_per_strand {
            // Cannot satisfy constraint, fall back to unconstrained split
            return self.split_reads(total, rng);
        }

        // Reserve minimum for each strand, then distribute remainder
        let remainder = total - 2 * min_per_strand;
        let a_fraction = self.sample_a_fraction(rng);
        let a_extra = (remainder as f64 * a_fraction).round() as usize;

        let a_count = min_per_strand + a_extra;
        let b_count = total - a_count;

        (a_count, b_count)
    }
}

#[cfg(test)]
#[allow(clippy::cast_precision_loss)]
mod tests {
    use super::*;
    use crate::simulate::create_rng;

    #[test]
    fn test_default_parameters() {
        let model = StrandBiasModel::default();
        assert!((model.alpha - 5.0).abs() < f64::EPSILON);
        assert!((model.beta - 5.0).abs() < f64::EPSILON);
    }

    #[test]
    fn test_a_fraction_in_range() {
        let model = StrandBiasModel::default();
        let mut rng = create_rng(Some(42));

        for _ in 0..100 {
            let frac = model.sample_a_fraction(&mut rng);
            assert!((0.0..=1.0).contains(&frac));
        }
    }

    #[test]
    fn test_symmetric_distribution() {
        let model = StrandBiasModel::new(5.0, 5.0);
        let mut rng = create_rng(Some(42));

        let fractions: Vec<f64> = (0..1000).map(|_| model.sample_a_fraction(&mut rng)).collect();
        let mean: f64 = fractions.iter().sum::<f64>() / fractions.len() as f64;

        // With symmetric alpha=beta, mean should be close to 0.5
        assert!((mean - 0.5).abs() < 0.05);
    }

    #[test]
    fn test_split_reads_sum() {
        let model = StrandBiasModel::default();
        let mut rng = create_rng(Some(42));

        for total in [0, 1, 5, 10, 100] {
            let (a, b) = model.split_reads(total, &mut rng);
            assert_eq!(a + b, total);
        }
    }

    #[test]
    fn test_split_reads_zero() {
        let model = StrandBiasModel::default();
        let mut rng = create_rng(Some(42));

        let (a, b) = model.split_reads(0, &mut rng);
        assert_eq!(a, 0);
        assert_eq!(b, 0);
    }

    #[test]
    fn test_no_bias() {
        let model = StrandBiasModel::no_bias();
        let mut rng = create_rng(Some(42));

        // With very high alpha/beta, fraction should be very close to 0.5
        let fractions: Vec<f64> = (0..100).map(|_| model.sample_a_fraction(&mut rng)).collect();
        let mean: f64 = fractions.iter().sum::<f64>() / fractions.len() as f64;

        assert!((mean - 0.5).abs() < 0.01);
    }

    #[test]
    fn test_split_with_minimum() {
        let model = StrandBiasModel::default();
        let mut rng = create_rng(Some(42));

        // With 10 reads and min 2, both strands should have at least 2
        for _ in 0..100 {
            let (a, b) = model.split_reads_with_minimum(10, 2, &mut rng);
            assert!(a >= 2);
            assert!(b >= 2);
            assert_eq!(a + b, 10);
        }
    }

    #[test]
    fn test_split_with_minimum_unsatisfiable() {
        let model = StrandBiasModel::default();
        let mut rng = create_rng(Some(42));

        // With 3 reads and min 2, constraint cannot be satisfied
        // Should fall back to unconstrained split
        let (a, b) = model.split_reads_with_minimum(3, 2, &mut rng);
        assert_eq!(a + b, 3);
    }

    #[test]
    fn test_reproducibility() {
        let model = StrandBiasModel::default();

        let mut rng1 = create_rng(Some(42));
        let mut rng2 = create_rng(Some(42));

        let fractions1: Vec<f64> = (0..10).map(|_| model.sample_a_fraction(&mut rng1)).collect();
        let fractions2: Vec<f64> = (0..10).map(|_| model.sample_a_fraction(&mut rng2)).collect();

        assert_eq!(fractions1, fractions2);
    }

    #[test]
    fn test_split_reads_one() {
        let model = StrandBiasModel::default();
        let mut rng = create_rng(Some(42));

        // With 1 read, should go to either A or B
        for _ in 0..100 {
            let (a, b) = model.split_reads(1, &mut rng);
            assert_eq!(a + b, 1);
            assert!(a == 0 || a == 1);
        }
    }

    #[test]
    fn test_split_reads_large() {
        let model = StrandBiasModel::new(5.0, 5.0);
        let mut rng = create_rng(Some(42));

        let (a, b) = model.split_reads(10000, &mut rng);
        assert_eq!(a + b, 10000);

        // With symmetric distribution, should be roughly equal
        let frac_a = a as f64 / 10000.0;
        assert!(frac_a > 0.3 && frac_a < 0.7, "Expected roughly balanced split, got {frac_a}");
    }

    #[test]
    fn test_asymmetric_a_heavy() {
        let model = StrandBiasModel::new(10.0, 2.0);
        let mut rng = create_rng(Some(42));

        let splits: Vec<(usize, usize)> =
            (0..100).map(|_| model.split_reads(100, &mut rng)).collect();

        let avg_a: f64 = splits.iter().map(|(a, _)| *a as f64).sum::<f64>() / splits.len() as f64;
        let avg_b: f64 = splits.iter().map(|(_, b)| *b as f64).sum::<f64>() / splits.len() as f64;

        // A-biased, so avg_a should be higher
        assert!(avg_a > avg_b, "Expected A > B, got A={avg_a}, B={avg_b}");
        assert!(avg_a > 70.0, "Expected strong A bias");
    }

    #[test]
    fn test_asymmetric_b_heavy() {
        let model = StrandBiasModel::new(2.0, 10.0);
        let mut rng = create_rng(Some(42));

        let splits: Vec<(usize, usize)> =
            (0..100).map(|_| model.split_reads(100, &mut rng)).collect();

        let avg_a: f64 = splits.iter().map(|(a, _)| *a as f64).sum::<f64>() / splits.len() as f64;
        let avg_b: f64 = splits.iter().map(|(_, b)| *b as f64).sum::<f64>() / splits.len() as f64;

        // B-biased, so avg_b should be higher
        assert!(avg_b > avg_a, "Expected B > A, got A={avg_a}, B={avg_b}");
        assert!(avg_b > 70.0, "Expected strong B bias");
    }

    #[test]
    fn test_split_with_minimum_zero() {
        let model = StrandBiasModel::default();
        let mut rng = create_rng(Some(42));

        // min_per_strand = 0 is same as unconstrained
        for _ in 0..100 {
            let (a, b) = model.split_reads_with_minimum(10, 0, &mut rng);
            assert_eq!(a + b, 10);
        }
    }

    #[test]
    fn test_split_with_minimum_half() {
        let model = StrandBiasModel::default();
        let mut rng = create_rng(Some(42));

        // min_per_strand = 5 with total = 10 means exactly 5/5
        for _ in 0..100 {
            let (a, b) = model.split_reads_with_minimum(10, 5, &mut rng);
            assert_eq!(a, 5);
            assert_eq!(b, 5);
        }
    }

    #[test]
    fn test_split_with_minimum_respects_bounds() {
        let model = StrandBiasModel::new(8.0, 2.0); // Heavily A-biased
        let mut rng = create_rng(Some(42));

        // Even with heavy A bias, should respect minimum of 3 per strand
        for _ in 0..100 {
            let (a, b) = model.split_reads_with_minimum(20, 3, &mut rng);
            assert!(a >= 3, "A={a} should be >= 3");
            assert!(b >= 3, "B={b} should be >= 3");
            assert_eq!(a + b, 20);
        }
    }

    #[test]
    fn test_high_concentration() {
        // Very high alpha/beta should concentrate around 0.5
        let model = StrandBiasModel::new(100.0, 100.0);
        let mut rng = create_rng(Some(42));

        let fractions: Vec<f64> = (0..100).map(|_| model.sample_a_fraction(&mut rng)).collect();

        // Mean should be very close to 0.5
        let mean: f64 = fractions.iter().sum::<f64>() / fractions.len() as f64;
        assert!((mean - 0.5).abs() < 0.05, "Mean {mean} too far from 0.5 with high concentration");

        // Most fractions should be within 0.15 of 0.5
        let close_count = fractions.iter().filter(|&&f| (f - 0.5).abs() < 0.15).count();
        assert!(close_count > 90, "Only {close_count}/100 fractions within 0.15 of 0.5");
    }

    #[test]
    fn test_low_concentration() {
        // Very low alpha/beta should give extreme values
        let model = StrandBiasModel::new(0.5, 0.5);
        let mut rng = create_rng(Some(42));

        let fractions: Vec<f64> = (0..1000).map(|_| model.sample_a_fraction(&mut rng)).collect();

        // Should have some extreme values
        let extreme_low = fractions.iter().filter(|&&f| f < 0.2).count();
        let extreme_high = fractions.iter().filter(|&&f| f > 0.8).count();

        assert!(extreme_low > 100, "Expected more extreme low values, got {extreme_low}");
        assert!(extreme_high > 100, "Expected more extreme high values, got {extreme_high}");
    }

    #[test]
    fn test_split_reads_consistency() {
        let model = StrandBiasModel::default();
        let mut rng = create_rng(Some(42));

        // Verify that splits always sum to total
        for total in 0..50 {
            let (a, b) = model.split_reads(total, &mut rng);
            assert_eq!(a + b, total, "Split ({a}, {b}) doesn't sum to {total}");
        }
    }

    #[test]
    fn test_a_fraction_distribution_shape() {
        let model = StrandBiasModel::new(2.0, 5.0);
        let mut rng = create_rng(Some(42));

        let fractions: Vec<f64> = (0..10000).map(|_| model.sample_a_fraction(&mut rng)).collect();

        // Expected mean of Beta(2, 5) is 2/(2+5) = 0.286
        let mean: f64 = fractions.iter().sum::<f64>() / fractions.len() as f64;
        let expected_mean = 2.0 / 7.0;

        assert!(
            (mean - expected_mean).abs() < 0.02,
            "Mean {mean} not close to expected {expected_mean}"
        );
    }
}