1use noodles::sam::alignment::record::cigar::op::{Kind, Op};
9use noodles::sam::alignment::record_buf::Cigar;
10use rand::Rng;
11
12use crate::error_model::{self, ErrorModel, ReadEnd};
13use crate::fragment::{Fragment, extract_read_bases};
14use crate::read_naming::{TruthAlignment, encoded_pe_name, encoded_se_name, simple_name};
15
16#[derive(Debug, Clone)]
18pub struct SimulatedRead {
19 pub name: String,
21 pub bases: Vec<u8>,
23 pub qualities: Vec<u8>,
25}
26
27#[derive(Debug)]
29pub struct ReadPair {
30 pub read1: SimulatedRead,
32 pub read2: Option<SimulatedRead>,
34 pub r1_truth: TruthAlignment,
36 pub r2_truth: Option<TruthAlignment>,
38 pub r1_cigar: Cigar,
41 pub r2_cigar: Option<Cigar>,
43}
44
45#[allow(clippy::too_many_arguments)] pub fn generate_read_pair(
64 fragment: &Fragment,
65 contig_name: &str,
66 read_num: u64,
67 read_length: usize,
68 paired: bool,
69 adapter_r1: &[u8],
70 adapter_r2: &[u8],
71 model: &impl ErrorModel,
72 simple_names: bool,
73 rng: &mut impl Rng,
74) -> ReadPair {
75 let frag_len = fragment.bases.len();
76 let genomic = frag_len.min(read_length);
77 let adapter_bases = read_length.saturating_sub(genomic);
78 let right_start = frag_len.saturating_sub(genomic);
79
80 let r1_negative_strand = !fragment.is_forward;
87
88 let mut r1_bases =
90 extract_read_bases(&fragment.bases, read_length, adapter_r1, r1_negative_strand);
91 let (r1_errors, r1_quals) =
92 error_model::apply_errors(model, &mut r1_bases, ReadEnd::Read1, rng);
93
94 let r1_positions = if r1_negative_strand {
98 &fragment.ref_positions[right_start..frag_len]
99 } else {
100 &fragment.ref_positions[..genomic]
101 };
102 let r1_cigar = cigar_from_ref_positions(r1_positions, adapter_bases, r1_negative_strand);
103
104 let r1_ref_pos = if fragment.ref_positions.is_empty() {
106 0
107 } else if r1_negative_strand {
108 fragment.ref_positions[right_start] + 1
109 } else {
110 fragment.ref_positions[0] + 1
111 };
112
113 #[expect(clippy::cast_possible_truncation, reason = "fragment length fits in u32")]
117 let fragment_length = frag_len as u32;
118
119 let r1_truth = TruthAlignment {
120 contig: contig_name.to_string(),
121 position: r1_ref_pos,
122 is_forward: fragment.is_forward,
123 haplotype: fragment.haplotype_index,
124 fragment_length,
125 n_errors: r1_errors,
126 };
127
128 if !paired {
129 let name =
130 if simple_names { simple_name(read_num) } else { encoded_se_name(read_num, &r1_truth) };
131
132 return ReadPair {
133 read1: SimulatedRead { name, bases: r1_bases, qualities: r1_quals },
134 read2: None,
135 r1_truth,
136 r2_truth: None,
137 r1_cigar,
138 r2_cigar: None,
139 };
140 }
141
142 let r2_negative_strand = fragment.is_forward;
145 let mut r2_bases =
146 extract_read_bases(&fragment.bases, read_length, adapter_r2, r2_negative_strand);
147 let (r2_errors, r2_quals) =
148 error_model::apply_errors(model, &mut r2_bases, ReadEnd::Read2, rng);
149
150 let r2_positions = if r2_negative_strand {
152 &fragment.ref_positions[right_start..frag_len]
153 } else {
154 &fragment.ref_positions[..genomic]
155 };
156 let r2_cigar = cigar_from_ref_positions(r2_positions, adapter_bases, r2_negative_strand);
157
158 let r2_ref_pos = if fragment.ref_positions.is_empty() {
160 0
161 } else if r2_negative_strand {
162 fragment.ref_positions[right_start] + 1
163 } else {
164 fragment.ref_positions[0] + 1
165 };
166
167 let r2_truth = TruthAlignment {
168 contig: contig_name.to_string(),
169 position: r2_ref_pos,
170 is_forward: !fragment.is_forward,
171 haplotype: fragment.haplotype_index,
172 fragment_length,
173 n_errors: r2_errors,
174 };
175
176 let name = if simple_names {
177 simple_name(read_num)
178 } else {
179 encoded_pe_name(read_num, &r1_truth, &r2_truth)
180 };
181
182 ReadPair {
183 read1: SimulatedRead { name: name.clone(), bases: r1_bases, qualities: r1_quals },
184 read2: Some(SimulatedRead { name, bases: r2_bases, qualities: r2_quals }),
185 r1_truth,
186 r2_truth: Some(r2_truth),
187 r1_cigar,
188 r2_cigar: Some(r2_cigar),
189 }
190}
191
192#[must_use]
209pub fn cigar_from_ref_positions(
210 positions: &[u32],
211 adapter_bases: usize,
212 negative_strand: bool,
213) -> Cigar {
214 let mut ops: Vec<Op> = Vec::new();
215
216 if positions.is_empty() {
217 if adapter_bases > 0 {
218 ops.push(Op::new(Kind::SoftClip, adapter_bases));
219 }
220 return Cigar::from(ops);
221 }
222
223 let mut match_run: usize = 1; let mut ins_run: usize = 0;
225
226 for i in 1..positions.len() {
227 let prev = positions[i - 1];
228 let curr = positions[i];
229
230 if curr == prev + 1 {
231 if ins_run > 0 {
233 ops.push(Op::new(Kind::Insertion, ins_run));
234 ins_run = 0;
235 }
236 match_run += 1;
237 } else if curr == prev {
238 if match_run > 0 {
240 ops.push(Op::new(Kind::Match, match_run));
241 match_run = 0;
242 }
243 ins_run += 1;
244 } else {
245 if ins_run > 0 {
247 ops.push(Op::new(Kind::Insertion, ins_run));
248 ins_run = 0;
249 }
250 if match_run > 0 {
251 ops.push(Op::new(Kind::Match, match_run));
252 }
253 let gap = curr.saturating_sub(prev).saturating_sub(1);
254 if gap > 0 {
255 ops.push(Op::new(Kind::Deletion, gap as usize));
256 }
257 match_run = 1; }
259 }
260
261 if ins_run > 0 {
263 ops.push(Op::new(Kind::Insertion, ins_run));
264 }
265 if match_run > 0 {
266 ops.push(Op::new(Kind::Match, match_run));
267 }
268
269 if adapter_bases > 0 {
273 if negative_strand {
274 ops.insert(0, Op::new(Kind::SoftClip, adapter_bases));
275 } else {
276 ops.push(Op::new(Kind::SoftClip, adapter_bases));
277 }
278 }
279
280 Cigar::from(ops)
281}
282
283#[must_use]
285pub fn cigar_to_string(cigar: &Cigar) -> String {
286 use std::fmt::Write;
287 cigar.as_ref().iter().fold(String::new(), |mut s, op| {
288 let kind_char = match op.kind() {
289 Kind::Match => 'M',
290 Kind::Insertion => 'I',
291 Kind::Deletion => 'D',
292 Kind::SoftClip => 'S',
293 Kind::HardClip => 'H',
294 Kind::Skip => 'N',
295 Kind::Pad => 'P',
296 Kind::SequenceMatch => '=',
297 Kind::SequenceMismatch => 'X',
298 };
299 let _ = write!(s, "{}{kind_char}", op.len());
300 s
301 })
302}
303
304#[cfg(test)]
305mod tests {
306 use rand::SeedableRng;
307 use rand::rngs::SmallRng;
308
309 use super::*;
310 use crate::error_model::illumina::IlluminaErrorModel;
311
312 fn test_fragment(bases: &[u8], ref_start: u32) -> Fragment {
314 #[expect(clippy::cast_possible_truncation, reason = "test data is small")]
315 let ref_positions: Vec<u32> = (ref_start..ref_start + bases.len() as u32).collect();
316 Fragment {
317 bases: bases.to_vec(),
318 ref_positions,
319 ref_start,
320 is_forward: true,
321 haplotype_index: 0,
322 }
323 }
324
325 #[test]
328 fn test_cigar_all_match() {
329 let cigar = cigar_from_ref_positions(&[0, 1, 2, 3, 4], 0, false);
330 assert_eq!(cigar_to_string(&cigar), "5M");
331 }
332
333 #[test]
334 fn test_cigar_with_insertion() {
335 let cigar = cigar_from_ref_positions(&[0, 1, 2, 2, 2, 3, 4], 0, false);
337 assert_eq!(cigar_to_string(&cigar), "3M2I2M");
338 }
339
340 #[test]
341 fn test_cigar_with_deletion() {
342 let cigar = cigar_from_ref_positions(&[0, 1, 2, 5, 6], 0, false);
344 assert_eq!(cigar_to_string(&cigar), "3M2D2M");
345 }
346
347 #[test]
348 fn test_cigar_with_adapter_softclip_forward() {
349 let cigar = cigar_from_ref_positions(&[0, 1, 2], 2, false);
351 assert_eq!(cigar_to_string(&cigar), "3M2S");
352 }
353
354 #[test]
355 fn test_cigar_with_adapter_softclip_negative_strand() {
356 let cigar = cigar_from_ref_positions(&[0, 1, 2], 2, true);
358 assert_eq!(cigar_to_string(&cigar), "2S3M");
359 }
360
361 #[test]
362 fn test_cigar_all_adapter() {
363 let cigar = cigar_from_ref_positions(&[], 5, false);
365 assert_eq!(cigar_to_string(&cigar), "5S");
366 }
367
368 #[test]
369 fn test_cigar_with_insertion_and_deletion() {
370 let cigar = cigar_from_ref_positions(&[0, 1, 2, 2, 5, 6], 0, false);
372 assert_eq!(cigar_to_string(&cigar), "3M1I2D2M");
373 }
374
375 #[test]
376 fn test_cigar_with_adapter_and_deletion() {
377 let cigar = cigar_from_ref_positions(&[0, 1, 4, 5], 3, false);
378 assert_eq!(cigar_to_string(&cigar), "2M2D2M3S");
379 }
380
381 #[test]
382 fn test_cigar_single_base() {
383 let cigar = cigar_from_ref_positions(&[42], 0, false);
384 assert_eq!(cigar_to_string(&cigar), "1M");
385 }
386
387 #[test]
388 fn test_cigar_high_positions() {
389 let cigar = cigar_from_ref_positions(&[100, 101, 102, 103, 104], 0, true);
392 assert_eq!(cigar_to_string(&cigar), "5M");
393 }
394
395 #[test]
398 fn test_generate_pe_read_pair() {
399 let fragment = test_fragment(b"ACGTACGTACGTACGTACGT", 100);
400 let model = IlluminaErrorModel::new(10, 0.0, 0.0);
401 let mut rng = SmallRng::seed_from_u64(42);
402
403 let pair = generate_read_pair(
404 &fragment, "chr1", 1, 10, true, b"ADAPTER", b"ADAPTER", &model, false, &mut rng,
405 );
406
407 assert_eq!(pair.read1.bases, b"ACGTACGTAC");
408 assert!(pair.read2.is_some());
409 assert_eq!(pair.r1_truth.position, 101);
410 assert!(pair.r2_truth.is_some());
411 assert_eq!(cigar_to_string(&pair.r1_cigar), "10M");
412 assert_eq!(cigar_to_string(pair.r2_cigar.as_ref().unwrap()), "10M");
413 }
414
415 #[test]
416 fn test_generate_se_read() {
417 let fragment = test_fragment(b"ACGTACGTAC", 100);
418 let model = IlluminaErrorModel::new(10, 0.0, 0.0);
419 let mut rng = SmallRng::seed_from_u64(42);
420
421 let pair = generate_read_pair(
422 &fragment, "chr1", 5, 10, false, b"ADAPTER", b"ADAPTER", &model, false, &mut rng,
423 );
424
425 assert!(pair.read2.is_none());
426 assert!(pair.r2_cigar.is_none());
427 assert_eq!(cigar_to_string(&pair.r1_cigar), "10M");
428 }
429
430 #[test]
431 fn test_adapter_cigar_softclip() {
432 let fragment = test_fragment(b"AC", 0);
435 let model = IlluminaErrorModel::new(5, 0.0, 0.0);
436 let mut rng = SmallRng::seed_from_u64(42);
437
438 let pair = generate_read_pair(
439 &fragment, "chr1", 1, 5, true, b"TTTTT", b"GGGGG", &model, false, &mut rng,
440 );
441
442 assert_eq!(cigar_to_string(&pair.r1_cigar), "2M3S");
443 assert_eq!(cigar_to_string(pair.r2_cigar.as_ref().unwrap()), "3S2M");
444 }
445
446 #[test]
447 fn test_simple_name_mode() {
448 let fragment = test_fragment(b"ACGT", 0);
449 let model = IlluminaErrorModel::new(4, 0.0, 0.0);
450 let mut rng = SmallRng::seed_from_u64(42);
451
452 let pair =
453 generate_read_pair(&fragment, "chr1", 42, 4, true, b"A", b"A", &model, true, &mut rng);
454
455 assert_eq!(pair.read1.name, "holodeck::42");
456 }
457
458 #[test]
459 fn test_quality_scores_correct_length() {
460 let fragment = test_fragment(b"ACGTACGTAC", 0);
461 let model = IlluminaErrorModel::new(10, 0.001, 0.01);
462 let mut rng = SmallRng::seed_from_u64(42);
463
464 let pair =
465 generate_read_pair(&fragment, "chr1", 1, 10, true, b"A", b"A", &model, false, &mut rng);
466
467 assert_eq!(pair.read1.qualities.len(), 10);
468 assert_eq!(pair.read2.as_ref().unwrap().qualities.len(), 10);
469 }
470}