Skip to main content

cyanea_seq/
read_sim.rs

1//! Illumina-style short-read simulator.
2//!
3//! Generates synthetic paired-end or single-end reads from a reference
4//! sequence, with configurable coverage, fragment sizes, and a
5//! position-dependent error profile that mimics Illumina 3' quality
6//! degradation.
7
8use std::f64::consts::PI;
9
10/// Configuration for the read simulator.
11#[derive(Debug, Clone)]
12#[cfg_attr(feature = "serde", derive(serde::Serialize, serde::Deserialize))]
13pub struct ReadSimConfig {
14    /// Read length in bases (default 150).
15    pub read_length: usize,
16    /// Desired sequencing coverage (default 30.0).
17    pub coverage: f64,
18    /// Mean insert/fragment size in bases (default 300.0).
19    pub fragment_mean: f64,
20    /// Standard deviation of fragment size (default 50.0).
21    pub fragment_std: f64,
22    /// Per-base error rate at position 0 (default 0.001).
23    pub error_rate: f64,
24    /// Whether to produce paired-end reads (default true).
25    pub paired: bool,
26    /// PRNG seed for reproducibility.
27    pub seed: u64,
28}
29
30impl Default for ReadSimConfig {
31    fn default() -> Self {
32        Self {
33            read_length: 150,
34            coverage: 30.0,
35            fragment_mean: 300.0,
36            fragment_std: 50.0,
37            error_rate: 0.001,
38            paired: true,
39            seed: 42,
40        }
41    }
42}
43
44/// A single simulated read with its true genomic origin.
45#[derive(Debug, Clone)]
46pub struct SimulatedRead {
47    /// Read name (e.g. "sim_read_0/1").
48    pub name: String,
49    /// Nucleotide sequence (A, C, G, T) with introduced errors.
50    pub sequence: Vec<u8>,
51    /// Phred+33 encoded quality scores.
52    pub quality: Vec<u8>,
53    /// True 0-based start position on the reference.
54    pub true_position: u64,
55    /// Chromosome / contig name.
56    pub true_chrom: String,
57    /// Whether this is read 1 of a pair (false for read 2 or single-end).
58    pub is_read1: bool,
59}
60
61// ---------------------------------------------------------------------------
62// Private Xorshift64 PRNG
63// ---------------------------------------------------------------------------
64
65struct Xorshift64 {
66    state: u64,
67}
68
69impl Xorshift64 {
70    fn new(seed: u64) -> Self {
71        // Avoid the all-zero fixed point.
72        Self {
73            state: if seed == 0 { 1 } else { seed },
74        }
75    }
76
77    fn next_u64(&mut self) -> u64 {
78        let mut x = self.state;
79        x ^= x << 13;
80        x ^= x >> 7;
81        x ^= x << 17;
82        self.state = x;
83        x
84    }
85
86    fn next_f64(&mut self) -> f64 {
87        self.next_u64() as f64 / u64::MAX as f64
88    }
89}
90
91// ---------------------------------------------------------------------------
92// Helpers
93// ---------------------------------------------------------------------------
94
95/// Box-Muller transform: produce a standard-normal sample from two uniforms.
96fn box_muller(rng: &mut Xorshift64) -> f64 {
97    let u1 = rng.next_f64().max(f64::MIN_POSITIVE); // avoid ln(0)
98    let u2 = rng.next_f64();
99    (-2.0 * u1.ln()).sqrt() * (2.0 * PI * u2).cos()
100}
101
102/// Reverse complement of a byte slice (A<->T, C<->G).
103fn reverse_complement(seq: &[u8]) -> Vec<u8> {
104    seq.iter()
105        .rev()
106        .map(|&b| match b {
107            b'A' => b'T',
108            b'T' => b'A',
109            b'C' => b'G',
110            b'G' => b'C',
111            _ => b'N',
112        })
113        .collect()
114}
115
116/// Pick a random base that differs from `original`.
117fn substitute_base(rng: &mut Xorshift64, original: u8) -> u8 {
118    const BASES: [u8; 4] = [b'A', b'C', b'G', b'T'];
119    loop {
120        let b = BASES[(rng.next_u64() % 4) as usize];
121        if b != original {
122            return b;
123        }
124    }
125}
126
127/// Compute Phred+33 quality byte for a given error probability.
128fn phred_quality(error_prob: f64) -> u8 {
129    if error_prob <= 0.0 {
130        // Perfect quality — cap at Q40.
131        return 73; // 40 + 33
132    }
133    let q = (-10.0 * error_prob.log10()).round() as i32;
134    let clamped = q.clamp(2, 40) as u8; // Q2..Q40
135    clamped + 33 // Phred+33 encoding: 35..=73
136}
137
138// ---------------------------------------------------------------------------
139// Public API
140// ---------------------------------------------------------------------------
141
142/// Simulate Illumina-style reads from a reference sequence.
143///
144/// Returns an empty `Vec` when the reference is shorter than `read_length`.
145///
146/// # Arguments
147///
148/// * `reference` - The reference sequence as uppercase ACGT bytes.
149/// * `chrom_name` - Name to tag on every read's `true_chrom` field.
150/// * `config` - Simulation parameters (coverage, error model, etc.).
151pub fn simulate_reads(
152    reference: &[u8],
153    chrom_name: &str,
154    config: &ReadSimConfig,
155) -> Vec<SimulatedRead> {
156    let ref_len = reference.len();
157    if ref_len < config.read_length {
158        return Vec::new();
159    }
160
161    let mut rng = Xorshift64::new(config.seed);
162
163    // Total reads needed to achieve the requested coverage.
164    let n_reads =
165        ((ref_len as f64 * config.coverage) / config.read_length as f64).round() as usize;
166
167    // Pre-compute per-position quality scores.
168    let qualities: Vec<u8> = (0..config.read_length)
169        .map(|i| {
170            let ep = config.error_rate * (1.0 + 2.0 * i as f64 / config.read_length as f64);
171            phred_quality(ep)
172        })
173        .collect();
174
175    let mut reads = Vec::new();
176
177    if config.paired {
178        let n_fragments = n_reads / 2;
179        for frag_idx in 0..n_fragments {
180            // Sample fragment size from N(mean, std), clamped.
181            let frag_size = {
182                let z = box_muller(&mut rng);
183                let raw = config.fragment_mean + config.fragment_std * z;
184                raw.round().clamp(config.read_length as f64, ref_len as f64) as usize
185            };
186
187            // Random start position.
188            let max_start = if ref_len > frag_size {
189                ref_len - frag_size
190            } else {
191                0
192            };
193            let start = if max_start > 0 {
194                (rng.next_u64() % max_start as u64) as usize
195            } else {
196                0
197            };
198
199            let fragment = &reference[start..start + frag_size];
200
201            // Read 1: first read_length bases of fragment.
202            let r1_bases: Vec<u8> = fragment[..config.read_length].to_vec();
203
204            // Read 2: last read_length bases of fragment, reverse complemented.
205            let r2_fwd = &fragment[frag_size - config.read_length..];
206            let r2_bases = reverse_complement(r2_fwd);
207
208            let r1_pos = start as u64;
209            let r2_pos = (start + frag_size - config.read_length) as u64;
210
211            // Introduce errors and build reads.
212            let r1 = build_read(
213                &mut rng,
214                &r1_bases,
215                &qualities,
216                config,
217                frag_idx,
218                true,
219                r1_pos,
220                chrom_name,
221            );
222            let r2 = build_read(
223                &mut rng,
224                &r2_bases,
225                &qualities,
226                config,
227                frag_idx,
228                false,
229                r2_pos,
230                chrom_name,
231            );
232            reads.push(r1);
233            reads.push(r2);
234        }
235    } else {
236        for frag_idx in 0..n_reads {
237            let max_start = ref_len - config.read_length;
238            let start = if max_start > 0 {
239                (rng.next_u64() % max_start as u64) as usize
240            } else {
241                0
242            };
243            let bases: Vec<u8> = reference[start..start + config.read_length].to_vec();
244            let r = build_read(
245                &mut rng,
246                &bases,
247                &qualities,
248                config,
249                frag_idx,
250                true,
251                start as u64,
252                chrom_name,
253            );
254            reads.push(r);
255        }
256    }
257
258    reads
259}
260
261/// Apply position-dependent errors and package into a [`SimulatedRead`].
262fn build_read(
263    rng: &mut Xorshift64,
264    bases: &[u8],
265    qualities: &[u8],
266    config: &ReadSimConfig,
267    frag_idx: usize,
268    is_read1: bool,
269    true_position: u64,
270    chrom_name: &str,
271) -> SimulatedRead {
272    let read_len = config.read_length;
273    let mut seq = bases.to_vec();
274
275    for i in 0..read_len {
276        let error_prob =
277            config.error_rate * (1.0 + 2.0 * i as f64 / read_len as f64);
278        if rng.next_f64() < error_prob {
279            seq[i] = substitute_base(rng, seq[i]);
280        }
281    }
282
283    let suffix = if is_read1 { "/1" } else { "/2" };
284    SimulatedRead {
285        name: format!("sim_read_{}{}", frag_idx, suffix),
286        sequence: seq,
287        quality: qualities.to_vec(),
288        true_position,
289        true_chrom: chrom_name.to_string(),
290        is_read1,
291    }
292}
293
294// ---------------------------------------------------------------------------
295// Tests
296// ---------------------------------------------------------------------------
297
298#[cfg(test)]
299mod tests {
300    use super::*;
301
302    /// Helper: a 1 kb reference of repeating ACGT.
303    fn reference_1kb() -> Vec<u8> {
304        let unit = b"ACGTACGTACGTACGT"; // 16 bp
305        unit.iter().copied().cycle().take(1000).collect()
306    }
307
308    #[test]
309    fn read_count_proportional_to_coverage() {
310        let reference = reference_1kb();
311        let config = ReadSimConfig {
312            read_length: 100,
313            coverage: 10.0,
314            paired: false,
315            seed: 123,
316            ..Default::default()
317        };
318        let reads = simulate_reads(&reference, "chr1", &config);
319        // Expected: 1000 * 10.0 / 100 = 100 reads.
320        let expected = ((reference.len() as f64 * config.coverage) / config.read_length as f64)
321            .round() as usize;
322        assert_eq!(reads.len(), expected);
323    }
324
325    #[test]
326    fn read_length_matches_config() {
327        let reference = reference_1kb();
328        let config = ReadSimConfig {
329            read_length: 75,
330            coverage: 5.0,
331            paired: true,
332            seed: 456,
333            ..Default::default()
334        };
335        let reads = simulate_reads(&reference, "chr1", &config);
336        assert!(!reads.is_empty());
337        for r in &reads {
338            assert_eq!(r.sequence.len(), 75);
339            assert_eq!(r.quality.len(), 75);
340        }
341    }
342
343    #[test]
344    fn paired_produces_both_mates() {
345        let reference = reference_1kb();
346        let config = ReadSimConfig {
347            read_length: 100,
348            coverage: 10.0,
349            paired: true,
350            seed: 789,
351            ..Default::default()
352        };
353        let reads = simulate_reads(&reference, "chrX", &config);
354        assert!(!reads.is_empty());
355        // Paired reads come in pairs (read1, read2, read1, read2, ...).
356        assert_eq!(reads.len() % 2, 0);
357        let n_read1 = reads.iter().filter(|r| r.is_read1).count();
358        let n_read2 = reads.iter().filter(|r| !r.is_read1).count();
359        assert_eq!(n_read1, n_read2);
360        assert!(n_read1 > 0);
361        // Check naming convention.
362        assert!(reads[0].name.ends_with("/1"));
363        assert!(reads[1].name.ends_with("/2"));
364    }
365
366    #[test]
367    fn reads_are_valid_dna() {
368        let reference = reference_1kb();
369        let config = ReadSimConfig {
370            coverage: 5.0,
371            seed: 101,
372            ..Default::default()
373        };
374        let reads = simulate_reads(&reference, "chr1", &config);
375        for r in &reads {
376            for &b in &r.sequence {
377                assert!(
378                    b == b'A' || b == b'C' || b == b'G' || b == b'T',
379                    "unexpected base: {}",
380                    b as char
381                );
382            }
383        }
384    }
385
386    #[test]
387    fn quality_scores_valid_phred33() {
388        let reference = reference_1kb();
389        let config = ReadSimConfig {
390            coverage: 5.0,
391            seed: 202,
392            ..Default::default()
393        };
394        let reads = simulate_reads(&reference, "chr1", &config);
395        for r in &reads {
396            for &q in &r.quality {
397                assert!(
398                    (35..=73).contains(&q),
399                    "quality byte {} out of Phred+33 range 35..=73",
400                    q
401                );
402            }
403        }
404    }
405
406    #[test]
407    fn zero_error_rate_reads_match_reference() {
408        let reference = reference_1kb();
409        let config = ReadSimConfig {
410            read_length: 100,
411            coverage: 10.0,
412            error_rate: 0.0,
413            paired: false,
414            seed: 303,
415            ..Default::default()
416        };
417        let reads = simulate_reads(&reference, "chr1", &config);
418        for r in &reads {
419            let pos = r.true_position as usize;
420            let expected = &reference[pos..pos + 100];
421            assert_eq!(
422                r.sequence, expected,
423                "read at position {} does not match reference",
424                pos
425            );
426        }
427    }
428}