1use std::f64::consts::PI;
9
10#[derive(Debug, Clone)]
12#[cfg_attr(feature = "serde", derive(serde::Serialize, serde::Deserialize))]
13pub struct ReadSimConfig {
14 pub read_length: usize,
16 pub coverage: f64,
18 pub fragment_mean: f64,
20 pub fragment_std: f64,
22 pub error_rate: f64,
24 pub paired: bool,
26 pub seed: u64,
28}
29
30impl Default for ReadSimConfig {
31 fn default() -> Self {
32 Self {
33 read_length: 150,
34 coverage: 30.0,
35 fragment_mean: 300.0,
36 fragment_std: 50.0,
37 error_rate: 0.001,
38 paired: true,
39 seed: 42,
40 }
41 }
42}
43
44#[derive(Debug, Clone)]
46pub struct SimulatedRead {
47 pub name: String,
49 pub sequence: Vec<u8>,
51 pub quality: Vec<u8>,
53 pub true_position: u64,
55 pub true_chrom: String,
57 pub is_read1: bool,
59}
60
61struct Xorshift64 {
66 state: u64,
67}
68
69impl Xorshift64 {
70 fn new(seed: u64) -> Self {
71 Self {
73 state: if seed == 0 { 1 } else { seed },
74 }
75 }
76
77 fn next_u64(&mut self) -> u64 {
78 let mut x = self.state;
79 x ^= x << 13;
80 x ^= x >> 7;
81 x ^= x << 17;
82 self.state = x;
83 x
84 }
85
86 fn next_f64(&mut self) -> f64 {
87 self.next_u64() as f64 / u64::MAX as f64
88 }
89}
90
91fn box_muller(rng: &mut Xorshift64) -> f64 {
97 let u1 = rng.next_f64().max(f64::MIN_POSITIVE); let u2 = rng.next_f64();
99 (-2.0 * u1.ln()).sqrt() * (2.0 * PI * u2).cos()
100}
101
102fn reverse_complement(seq: &[u8]) -> Vec<u8> {
104 seq.iter()
105 .rev()
106 .map(|&b| match b {
107 b'A' => b'T',
108 b'T' => b'A',
109 b'C' => b'G',
110 b'G' => b'C',
111 _ => b'N',
112 })
113 .collect()
114}
115
116fn substitute_base(rng: &mut Xorshift64, original: u8) -> u8 {
118 const BASES: [u8; 4] = [b'A', b'C', b'G', b'T'];
119 loop {
120 let b = BASES[(rng.next_u64() % 4) as usize];
121 if b != original {
122 return b;
123 }
124 }
125}
126
127fn phred_quality(error_prob: f64) -> u8 {
129 if error_prob <= 0.0 {
130 return 73; }
133 let q = (-10.0 * error_prob.log10()).round() as i32;
134 let clamped = q.clamp(2, 40) as u8; clamped + 33 }
137
138pub fn simulate_reads(
152 reference: &[u8],
153 chrom_name: &str,
154 config: &ReadSimConfig,
155) -> Vec<SimulatedRead> {
156 let ref_len = reference.len();
157 if ref_len < config.read_length {
158 return Vec::new();
159 }
160
161 let mut rng = Xorshift64::new(config.seed);
162
163 let n_reads =
165 ((ref_len as f64 * config.coverage) / config.read_length as f64).round() as usize;
166
167 let qualities: Vec<u8> = (0..config.read_length)
169 .map(|i| {
170 let ep = config.error_rate * (1.0 + 2.0 * i as f64 / config.read_length as f64);
171 phred_quality(ep)
172 })
173 .collect();
174
175 let mut reads = Vec::new();
176
177 if config.paired {
178 let n_fragments = n_reads / 2;
179 for frag_idx in 0..n_fragments {
180 let frag_size = {
182 let z = box_muller(&mut rng);
183 let raw = config.fragment_mean + config.fragment_std * z;
184 raw.round().clamp(config.read_length as f64, ref_len as f64) as usize
185 };
186
187 let max_start = if ref_len > frag_size {
189 ref_len - frag_size
190 } else {
191 0
192 };
193 let start = if max_start > 0 {
194 (rng.next_u64() % max_start as u64) as usize
195 } else {
196 0
197 };
198
199 let fragment = &reference[start..start + frag_size];
200
201 let r1_bases: Vec<u8> = fragment[..config.read_length].to_vec();
203
204 let r2_fwd = &fragment[frag_size - config.read_length..];
206 let r2_bases = reverse_complement(r2_fwd);
207
208 let r1_pos = start as u64;
209 let r2_pos = (start + frag_size - config.read_length) as u64;
210
211 let r1 = build_read(
213 &mut rng,
214 &r1_bases,
215 &qualities,
216 config,
217 frag_idx,
218 true,
219 r1_pos,
220 chrom_name,
221 );
222 let r2 = build_read(
223 &mut rng,
224 &r2_bases,
225 &qualities,
226 config,
227 frag_idx,
228 false,
229 r2_pos,
230 chrom_name,
231 );
232 reads.push(r1);
233 reads.push(r2);
234 }
235 } else {
236 for frag_idx in 0..n_reads {
237 let max_start = ref_len - config.read_length;
238 let start = if max_start > 0 {
239 (rng.next_u64() % max_start as u64) as usize
240 } else {
241 0
242 };
243 let bases: Vec<u8> = reference[start..start + config.read_length].to_vec();
244 let r = build_read(
245 &mut rng,
246 &bases,
247 &qualities,
248 config,
249 frag_idx,
250 true,
251 start as u64,
252 chrom_name,
253 );
254 reads.push(r);
255 }
256 }
257
258 reads
259}
260
261fn build_read(
263 rng: &mut Xorshift64,
264 bases: &[u8],
265 qualities: &[u8],
266 config: &ReadSimConfig,
267 frag_idx: usize,
268 is_read1: bool,
269 true_position: u64,
270 chrom_name: &str,
271) -> SimulatedRead {
272 let read_len = config.read_length;
273 let mut seq = bases.to_vec();
274
275 for i in 0..read_len {
276 let error_prob =
277 config.error_rate * (1.0 + 2.0 * i as f64 / read_len as f64);
278 if rng.next_f64() < error_prob {
279 seq[i] = substitute_base(rng, seq[i]);
280 }
281 }
282
283 let suffix = if is_read1 { "/1" } else { "/2" };
284 SimulatedRead {
285 name: format!("sim_read_{}{}", frag_idx, suffix),
286 sequence: seq,
287 quality: qualities.to_vec(),
288 true_position,
289 true_chrom: chrom_name.to_string(),
290 is_read1,
291 }
292}
293
294#[cfg(test)]
299mod tests {
300 use super::*;
301
302 fn reference_1kb() -> Vec<u8> {
304 let unit = b"ACGTACGTACGTACGT"; unit.iter().copied().cycle().take(1000).collect()
306 }
307
308 #[test]
309 fn read_count_proportional_to_coverage() {
310 let reference = reference_1kb();
311 let config = ReadSimConfig {
312 read_length: 100,
313 coverage: 10.0,
314 paired: false,
315 seed: 123,
316 ..Default::default()
317 };
318 let reads = simulate_reads(&reference, "chr1", &config);
319 let expected = ((reference.len() as f64 * config.coverage) / config.read_length as f64)
321 .round() as usize;
322 assert_eq!(reads.len(), expected);
323 }
324
325 #[test]
326 fn read_length_matches_config() {
327 let reference = reference_1kb();
328 let config = ReadSimConfig {
329 read_length: 75,
330 coverage: 5.0,
331 paired: true,
332 seed: 456,
333 ..Default::default()
334 };
335 let reads = simulate_reads(&reference, "chr1", &config);
336 assert!(!reads.is_empty());
337 for r in &reads {
338 assert_eq!(r.sequence.len(), 75);
339 assert_eq!(r.quality.len(), 75);
340 }
341 }
342
343 #[test]
344 fn paired_produces_both_mates() {
345 let reference = reference_1kb();
346 let config = ReadSimConfig {
347 read_length: 100,
348 coverage: 10.0,
349 paired: true,
350 seed: 789,
351 ..Default::default()
352 };
353 let reads = simulate_reads(&reference, "chrX", &config);
354 assert!(!reads.is_empty());
355 assert_eq!(reads.len() % 2, 0);
357 let n_read1 = reads.iter().filter(|r| r.is_read1).count();
358 let n_read2 = reads.iter().filter(|r| !r.is_read1).count();
359 assert_eq!(n_read1, n_read2);
360 assert!(n_read1 > 0);
361 assert!(reads[0].name.ends_with("/1"));
363 assert!(reads[1].name.ends_with("/2"));
364 }
365
366 #[test]
367 fn reads_are_valid_dna() {
368 let reference = reference_1kb();
369 let config = ReadSimConfig {
370 coverage: 5.0,
371 seed: 101,
372 ..Default::default()
373 };
374 let reads = simulate_reads(&reference, "chr1", &config);
375 for r in &reads {
376 for &b in &r.sequence {
377 assert!(
378 b == b'A' || b == b'C' || b == b'G' || b == b'T',
379 "unexpected base: {}",
380 b as char
381 );
382 }
383 }
384 }
385
386 #[test]
387 fn quality_scores_valid_phred33() {
388 let reference = reference_1kb();
389 let config = ReadSimConfig {
390 coverage: 5.0,
391 seed: 202,
392 ..Default::default()
393 };
394 let reads = simulate_reads(&reference, "chr1", &config);
395 for r in &reads {
396 for &q in &r.quality {
397 assert!(
398 (35..=73).contains(&q),
399 "quality byte {} out of Phred+33 range 35..=73",
400 q
401 );
402 }
403 }
404 }
405
406 #[test]
407 fn zero_error_rate_reads_match_reference() {
408 let reference = reference_1kb();
409 let config = ReadSimConfig {
410 read_length: 100,
411 coverage: 10.0,
412 error_rate: 0.0,
413 paired: false,
414 seed: 303,
415 ..Default::default()
416 };
417 let reads = simulate_reads(&reference, "chr1", &config);
418 for r in &reads {
419 let pos = r.true_position as usize;
420 let expected = &reference[pos..pos + 100];
421 assert_eq!(
422 r.sequence, expected,
423 "read at position {} does not match reference",
424 pos
425 );
426 }
427 }
428}