1use noodles::sam::alignment::record::cigar::op::{Kind, Op};
9use noodles::sam::alignment::record_buf::Cigar;
10use rand::Rng;
11
12use crate::error_model::{self, ErrorModel, ReadEnd};
13use crate::fragment::{Fragment, extract_read_bases, lowercase_fraction, uppercase_in_place};
14use crate::read_naming::{TruthAlignment, encoded_pe_name, encoded_se_name, simple_name};
15
16#[derive(Debug, Clone)]
18pub struct SimulatedRead {
19 pub name: String,
21 pub bases: Vec<u8>,
23 pub qualities: Vec<u8>,
25}
26
27#[derive(Debug)]
29pub struct ReadPair {
30 pub read1: SimulatedRead,
32 pub read2: Option<SimulatedRead>,
34 pub r1_truth: TruthAlignment,
36 pub r2_truth: Option<TruthAlignment>,
38 pub r1_cigar: Cigar,
41 pub r2_cigar: Option<Cigar>,
43}
44
45#[allow(clippy::too_many_arguments, clippy::too_many_lines)] pub fn generate_read_pair(
73 fragment: &Fragment,
74 contig_name: &str,
75 read_num: u64,
76 read_length: usize,
77 paired: bool,
78 adapter_r1: &[u8],
79 adapter_r2: &[u8],
80 max_n_frac: f64,
81 model: &impl ErrorModel,
82 simple_names: bool,
83 rng: &mut impl Rng,
84) -> Option<ReadPair> {
85 let frag_len = fragment.bases.len();
86 let genomic = frag_len.min(read_length);
87 let adapter_bases = read_length.saturating_sub(genomic);
88 let right_start = frag_len.saturating_sub(genomic);
89
90 let r1_negative_strand = !fragment.is_forward;
97
98 let mut r1_bases =
103 extract_read_bases(&fragment.bases, read_length, adapter_r1, r1_negative_strand);
104 if lowercase_fraction(&r1_bases) > max_n_frac {
105 return None;
106 }
107
108 let r2_negative_strand = fragment.is_forward;
110 let r2_bases_pre = if paired {
111 let bases =
112 extract_read_bases(&fragment.bases, read_length, adapter_r2, r2_negative_strand);
113 if lowercase_fraction(&bases) > max_n_frac {
114 return None;
115 }
116 Some(bases)
117 } else {
118 None
119 };
120
121 uppercase_in_place(&mut r1_bases);
123 let (r1_errors, r1_quals) =
124 error_model::apply_errors(model, &mut r1_bases, ReadEnd::Read1, rng);
125
126 let r1_positions = if r1_negative_strand {
130 &fragment.ref_positions[right_start..frag_len]
131 } else {
132 &fragment.ref_positions[..genomic]
133 };
134 let r1_cigar = cigar_from_ref_positions(r1_positions, adapter_bases, r1_negative_strand);
135
136 let r1_ref_pos = if fragment.ref_positions.is_empty() {
138 0
139 } else if r1_negative_strand {
140 fragment.ref_positions[right_start] + 1
141 } else {
142 fragment.ref_positions[0] + 1
143 };
144
145 #[expect(clippy::cast_possible_truncation, reason = "fragment length fits in u32")]
149 let fragment_length = frag_len as u32;
150
151 let r1_truth = TruthAlignment {
152 contig: contig_name.to_string(),
153 position: r1_ref_pos,
154 is_forward: fragment.is_forward,
155 haplotype: fragment.haplotype_index,
156 fragment_length,
157 n_errors: r1_errors,
158 };
159
160 match r2_bases_pre {
163 None => {
164 let name = if simple_names {
165 simple_name(read_num)
166 } else {
167 encoded_se_name(read_num, &r1_truth)
168 };
169
170 Some(ReadPair {
171 read1: SimulatedRead { name, bases: r1_bases, qualities: r1_quals },
172 read2: None,
173 r1_truth,
174 r2_truth: None,
175 r1_cigar,
176 r2_cigar: None,
177 })
178 }
179 Some(mut r2_bases) => {
180 uppercase_in_place(&mut r2_bases);
181 let (r2_errors, r2_quals) =
182 error_model::apply_errors(model, &mut r2_bases, ReadEnd::Read2, rng);
183
184 let r2_positions = if r2_negative_strand {
186 &fragment.ref_positions[right_start..frag_len]
187 } else {
188 &fragment.ref_positions[..genomic]
189 };
190 let r2_cigar =
191 cigar_from_ref_positions(r2_positions, adapter_bases, r2_negative_strand);
192
193 let r2_ref_pos = if fragment.ref_positions.is_empty() {
195 0
196 } else if r2_negative_strand {
197 fragment.ref_positions[right_start] + 1
198 } else {
199 fragment.ref_positions[0] + 1
200 };
201
202 let r2_truth = TruthAlignment {
203 contig: contig_name.to_string(),
204 position: r2_ref_pos,
205 is_forward: !fragment.is_forward,
206 haplotype: fragment.haplotype_index,
207 fragment_length,
208 n_errors: r2_errors,
209 };
210
211 let name = if simple_names {
212 simple_name(read_num)
213 } else {
214 encoded_pe_name(read_num, &r1_truth, &r2_truth)
215 };
216
217 Some(ReadPair {
218 read1: SimulatedRead { name: name.clone(), bases: r1_bases, qualities: r1_quals },
219 read2: Some(SimulatedRead { name, bases: r2_bases, qualities: r2_quals }),
220 r1_truth,
221 r2_truth: Some(r2_truth),
222 r1_cigar,
223 r2_cigar: Some(r2_cigar),
224 })
225 }
226 }
227}
228
229#[must_use]
246pub fn cigar_from_ref_positions(
247 positions: &[u32],
248 adapter_bases: usize,
249 negative_strand: bool,
250) -> Cigar {
251 let mut ops: Vec<Op> = Vec::new();
252
253 if positions.is_empty() {
254 if adapter_bases > 0 {
255 ops.push(Op::new(Kind::SoftClip, adapter_bases));
256 }
257 return Cigar::from(ops);
258 }
259
260 let mut match_run: usize = 1; let mut ins_run: usize = 0;
262
263 for i in 1..positions.len() {
264 let prev = positions[i - 1];
265 let curr = positions[i];
266
267 if curr == prev + 1 {
268 if ins_run > 0 {
270 ops.push(Op::new(Kind::Insertion, ins_run));
271 ins_run = 0;
272 }
273 match_run += 1;
274 } else if curr == prev {
275 if match_run > 0 {
277 ops.push(Op::new(Kind::Match, match_run));
278 match_run = 0;
279 }
280 ins_run += 1;
281 } else {
282 if ins_run > 0 {
284 ops.push(Op::new(Kind::Insertion, ins_run));
285 ins_run = 0;
286 }
287 if match_run > 0 {
288 ops.push(Op::new(Kind::Match, match_run));
289 }
290 let gap = curr.saturating_sub(prev).saturating_sub(1);
291 if gap > 0 {
292 ops.push(Op::new(Kind::Deletion, gap as usize));
293 }
294 match_run = 1; }
296 }
297
298 if ins_run > 0 {
300 ops.push(Op::new(Kind::Insertion, ins_run));
301 }
302 if match_run > 0 {
303 ops.push(Op::new(Kind::Match, match_run));
304 }
305
306 if adapter_bases > 0 {
310 if negative_strand {
311 ops.insert(0, Op::new(Kind::SoftClip, adapter_bases));
312 } else {
313 ops.push(Op::new(Kind::SoftClip, adapter_bases));
314 }
315 }
316
317 Cigar::from(ops)
318}
319
320#[must_use]
322pub fn cigar_to_string(cigar: &Cigar) -> String {
323 use std::fmt::Write;
324 cigar.as_ref().iter().fold(String::new(), |mut s, op| {
325 let kind_char = match op.kind() {
326 Kind::Match => 'M',
327 Kind::Insertion => 'I',
328 Kind::Deletion => 'D',
329 Kind::SoftClip => 'S',
330 Kind::HardClip => 'H',
331 Kind::Skip => 'N',
332 Kind::Pad => 'P',
333 Kind::SequenceMatch => '=',
334 Kind::SequenceMismatch => 'X',
335 };
336 let _ = write!(s, "{}{kind_char}", op.len());
337 s
338 })
339}
340
341#[cfg(test)]
342mod tests {
343 use rand::SeedableRng;
344 use rand::rngs::SmallRng;
345
346 use super::*;
347 use crate::error_model::illumina::IlluminaErrorModel;
348
349 fn test_fragment(bases: &[u8], ref_start: u32) -> Fragment {
351 #[expect(clippy::cast_possible_truncation, reason = "test data is small")]
352 let ref_positions: Vec<u32> = (ref_start..ref_start + bases.len() as u32).collect();
353 Fragment {
354 bases: bases.to_vec(),
355 ref_positions,
356 ref_start,
357 is_forward: true,
358 haplotype_index: 0,
359 }
360 }
361
362 #[test]
365 fn test_cigar_all_match() {
366 let cigar = cigar_from_ref_positions(&[0, 1, 2, 3, 4], 0, false);
367 assert_eq!(cigar_to_string(&cigar), "5M");
368 }
369
370 #[test]
371 fn test_cigar_with_insertion() {
372 let cigar = cigar_from_ref_positions(&[0, 1, 2, 2, 2, 3, 4], 0, false);
374 assert_eq!(cigar_to_string(&cigar), "3M2I2M");
375 }
376
377 #[test]
378 fn test_cigar_with_deletion() {
379 let cigar = cigar_from_ref_positions(&[0, 1, 2, 5, 6], 0, false);
381 assert_eq!(cigar_to_string(&cigar), "3M2D2M");
382 }
383
384 #[test]
385 fn test_cigar_with_adapter_softclip_forward() {
386 let cigar = cigar_from_ref_positions(&[0, 1, 2], 2, false);
388 assert_eq!(cigar_to_string(&cigar), "3M2S");
389 }
390
391 #[test]
392 fn test_cigar_with_adapter_softclip_negative_strand() {
393 let cigar = cigar_from_ref_positions(&[0, 1, 2], 2, true);
395 assert_eq!(cigar_to_string(&cigar), "2S3M");
396 }
397
398 #[test]
399 fn test_cigar_all_adapter() {
400 let cigar = cigar_from_ref_positions(&[], 5, false);
402 assert_eq!(cigar_to_string(&cigar), "5S");
403 }
404
405 #[test]
406 fn test_cigar_with_insertion_and_deletion() {
407 let cigar = cigar_from_ref_positions(&[0, 1, 2, 2, 5, 6], 0, false);
409 assert_eq!(cigar_to_string(&cigar), "3M1I2D2M");
410 }
411
412 #[test]
413 fn test_cigar_with_adapter_and_deletion() {
414 let cigar = cigar_from_ref_positions(&[0, 1, 4, 5], 3, false);
415 assert_eq!(cigar_to_string(&cigar), "2M2D2M3S");
416 }
417
418 #[test]
419 fn test_cigar_single_base() {
420 let cigar = cigar_from_ref_positions(&[42], 0, false);
421 assert_eq!(cigar_to_string(&cigar), "1M");
422 }
423
424 #[test]
425 fn test_cigar_high_positions() {
426 let cigar = cigar_from_ref_positions(&[100, 101, 102, 103, 104], 0, true);
429 assert_eq!(cigar_to_string(&cigar), "5M");
430 }
431
432 #[test]
435 fn test_generate_pe_read_pair() {
436 let fragment = test_fragment(b"ACGTACGTACGTACGTACGT", 100);
437 let model = IlluminaErrorModel::new(10, 0.0, 0.0);
438 let mut rng = SmallRng::seed_from_u64(42);
439
440 let pair = generate_read_pair(
441 &fragment, "chr1", 1, 10, true, b"ADAPTER", b"ADAPTER", 1.0, &model, false, &mut rng,
442 )
443 .expect("no ambiguous bases — should not reject");
444
445 assert_eq!(pair.read1.bases, b"ACGTACGTAC");
446 assert!(pair.read2.is_some());
447 assert_eq!(pair.r1_truth.position, 101);
448 assert!(pair.r2_truth.is_some());
449 assert_eq!(cigar_to_string(&pair.r1_cigar), "10M");
450 assert_eq!(cigar_to_string(pair.r2_cigar.as_ref().unwrap()), "10M");
451 }
452
453 #[test]
454 fn test_generate_se_read() {
455 let fragment = test_fragment(b"ACGTACGTAC", 100);
456 let model = IlluminaErrorModel::new(10, 0.0, 0.0);
457 let mut rng = SmallRng::seed_from_u64(42);
458
459 let pair = generate_read_pair(
460 &fragment, "chr1", 5, 10, false, b"ADAPTER", b"ADAPTER", 1.0, &model, false, &mut rng,
461 )
462 .unwrap();
463
464 assert!(pair.read2.is_none());
465 assert!(pair.r2_cigar.is_none());
466 assert_eq!(cigar_to_string(&pair.r1_cigar), "10M");
467 }
468
469 #[test]
470 fn test_adapter_cigar_softclip() {
471 let fragment = test_fragment(b"AC", 0);
474 let model = IlluminaErrorModel::new(5, 0.0, 0.0);
475 let mut rng = SmallRng::seed_from_u64(42);
476
477 let pair = generate_read_pair(
478 &fragment, "chr1", 1, 5, true, b"TTTTT", b"GGGGG", 1.0, &model, false, &mut rng,
479 )
480 .unwrap();
481
482 assert_eq!(cigar_to_string(&pair.r1_cigar), "2M3S");
483 assert_eq!(cigar_to_string(pair.r2_cigar.as_ref().unwrap()), "3S2M");
484 }
485
486 #[test]
487 fn test_simple_name_mode() {
488 let fragment = test_fragment(b"ACGT", 0);
489 let model = IlluminaErrorModel::new(4, 0.0, 0.0);
490 let mut rng = SmallRng::seed_from_u64(42);
491
492 let pair = generate_read_pair(
493 &fragment, "chr1", 42, 4, true, b"A", b"A", 1.0, &model, true, &mut rng,
494 )
495 .unwrap();
496
497 assert_eq!(pair.read1.name, "holodeck::42");
498 }
499
500 #[test]
501 fn test_quality_scores_correct_length() {
502 let fragment = test_fragment(b"ACGTACGTAC", 0);
503 let model = IlluminaErrorModel::new(10, 0.001, 0.01);
504 let mut rng = SmallRng::seed_from_u64(42);
505
506 let pair = generate_read_pair(
507 &fragment, "chr1", 1, 10, true, b"A", b"A", 1.0, &model, false, &mut rng,
508 )
509 .unwrap();
510
511 assert_eq!(pair.read1.qualities.len(), 10);
512 assert_eq!(pair.read2.as_ref().unwrap().qualities.len(), 10);
513 }
514
515 #[test]
516 fn test_rejects_when_r1_exceeds_max_n_frac() {
517 let mut fragment = test_fragment(b"acgtacgtac", 100);
520 fragment.is_forward = true;
522 let model = IlluminaErrorModel::new(10, 0.0, 0.0);
523 let mut rng = SmallRng::seed_from_u64(42);
524
525 let pair = generate_read_pair(
526 &fragment, "chr1", 1, 10, true, b"ADAPTER", b"ADAPTER", 0.5, &model, false, &mut rng,
527 );
528
529 assert!(pair.is_none(), "all-lowercase fragment should be rejected at threshold 0.5");
530 }
531
532 #[test]
533 fn test_accepts_when_lowercase_below_threshold() {
534 let fragment = test_fragment(b"ACaGcTAtCA", 0);
537 let model = IlluminaErrorModel::new(10, 0.0, 0.0);
538 let mut rng = SmallRng::seed_from_u64(42);
539
540 let pair = generate_read_pair(
541 &fragment, "chr1", 1, 10, true, b"ADAPTER", b"ADAPTER", 0.5, &model, false, &mut rng,
542 )
543 .expect("0.3 < 0.5 — should accept");
544
545 for &b in &pair.read1.bases {
547 assert!(matches!(b, b'A' | b'C' | b'G' | b'T' | b'N'), "r1 got {b:?}");
548 }
549 for &b in &pair.read2.as_ref().unwrap().bases {
550 assert!(matches!(b, b'A' | b'C' | b'G' | b'T' | b'N'), "r2 got {b:?}");
551 }
552 }
553
554 #[test]
555 fn test_rejects_before_applying_errors_to_r1() {
556 let mut fragment_half_bad = test_fragment(&[b'A'; 20], 0);
562 for b in &mut fragment_half_bad.bases[10..] {
564 *b = b'a';
565 }
566 fragment_half_bad.is_forward = true;
567
568 let clean_fragment = test_fragment(&[b'A'; 20], 0);
569 let model = IlluminaErrorModel::new(10, 0.5, 0.5); let mut rng_a = SmallRng::seed_from_u64(123);
573 let rejected = generate_read_pair(
574 &fragment_half_bad,
575 "chr1",
576 1,
577 10,
578 true,
579 b"TTTTTTTTTT",
580 b"TTTTTTTTTT",
581 0.5,
582 &model,
583 false,
584 &mut rng_a,
585 );
586 assert!(rejected.is_none(), "expected rejection when R2 is all-lowercase");
587 let after_reject = generate_read_pair(
588 &clean_fragment,
589 "chr1",
590 2,
591 10,
592 true,
593 b"TTTTTTTTTT",
594 b"TTTTTTTTTT",
595 0.5,
596 &model,
597 false,
598 &mut rng_a,
599 )
600 .unwrap();
601
602 let mut rng_b = SmallRng::seed_from_u64(123);
604 let direct = generate_read_pair(
605 &clean_fragment,
606 "chr1",
607 2,
608 10,
609 true,
610 b"TTTTTTTTTT",
611 b"TTTTTTTTTT",
612 0.5,
613 &model,
614 false,
615 &mut rng_b,
616 )
617 .unwrap();
618
619 assert_eq!(after_reject.read1.bases, direct.read1.bases);
622 assert_eq!(after_reject.read1.qualities, direct.read1.qualities);
623 assert_eq!(
624 after_reject.read2.as_ref().unwrap().bases,
625 direct.read2.as_ref().unwrap().bases
626 );
627 }
628}