1use noodles::sam::alignment::record::cigar::op::{Kind, Op};
9use noodles::sam::alignment::record_buf::Cigar;
10use rand::Rng;
11
12use crate::error_model::{self, ErrorModel, ReadEnd};
13use crate::fragment::{Fragment, extract_read_bases};
14use crate::read_naming::{TruthAlignment, encoded_pe_name, encoded_se_name, simple_name};
15
16#[derive(Debug, Clone)]
18pub struct SimulatedRead {
19 pub name: String,
21 pub bases: Vec<u8>,
23 pub qualities: Vec<u8>,
25}
26
27#[derive(Debug)]
29pub struct ReadPair {
30 pub read1: SimulatedRead,
32 pub read2: Option<SimulatedRead>,
34 pub r1_truth: TruthAlignment,
36 pub r2_truth: Option<TruthAlignment>,
38 pub r1_cigar: Cigar,
41 pub r2_cigar: Option<Cigar>,
43}
44
45#[allow(clippy::too_many_arguments)] pub fn generate_read_pair(
64 fragment: &Fragment,
65 contig_name: &str,
66 read_num: u64,
67 read_length: usize,
68 paired: bool,
69 adapter_r1: &[u8],
70 adapter_r2: &[u8],
71 model: &impl ErrorModel,
72 simple_names: bool,
73 rng: &mut impl Rng,
74) -> ReadPair {
75 let frag_len = fragment.bases.len();
76 let genomic = frag_len.min(read_length);
77 let adapter_bases = read_length.saturating_sub(genomic);
78 let right_start = frag_len.saturating_sub(genomic);
79
80 let r1_negative_strand = !fragment.is_forward;
87
88 let mut r1_bases =
90 extract_read_bases(&fragment.bases, read_length, adapter_r1, r1_negative_strand);
91 let (r1_errors, r1_quals) =
92 error_model::apply_errors(model, &mut r1_bases, ReadEnd::Read1, rng);
93
94 let r1_positions = if r1_negative_strand {
98 &fragment.ref_positions[right_start..frag_len]
99 } else {
100 &fragment.ref_positions[..genomic]
101 };
102 let r1_cigar = cigar_from_ref_positions(r1_positions, adapter_bases, r1_negative_strand);
103
104 let r1_ref_pos = if fragment.ref_positions.is_empty() {
106 0
107 } else if r1_negative_strand {
108 fragment.ref_positions[right_start] + 1
109 } else {
110 fragment.ref_positions[0] + 1
111 };
112
113 let r1_truth = TruthAlignment {
114 contig: contig_name.to_string(),
115 position: r1_ref_pos,
116 is_forward: fragment.is_forward,
117 haplotype: fragment.haplotype_index,
118 n_errors: r1_errors,
119 };
120
121 if !paired {
122 let name =
123 if simple_names { simple_name(read_num) } else { encoded_se_name(read_num, &r1_truth) };
124
125 return ReadPair {
126 read1: SimulatedRead { name, bases: r1_bases, qualities: r1_quals },
127 read2: None,
128 r1_truth,
129 r2_truth: None,
130 r1_cigar,
131 r2_cigar: None,
132 };
133 }
134
135 let r2_negative_strand = fragment.is_forward;
138 let mut r2_bases =
139 extract_read_bases(&fragment.bases, read_length, adapter_r2, r2_negative_strand);
140 let (r2_errors, r2_quals) =
141 error_model::apply_errors(model, &mut r2_bases, ReadEnd::Read2, rng);
142
143 let r2_positions = if r2_negative_strand {
145 &fragment.ref_positions[right_start..frag_len]
146 } else {
147 &fragment.ref_positions[..genomic]
148 };
149 let r2_cigar = cigar_from_ref_positions(r2_positions, adapter_bases, r2_negative_strand);
150
151 let r2_ref_pos = if fragment.ref_positions.is_empty() {
153 0
154 } else if r2_negative_strand {
155 fragment.ref_positions[right_start] + 1
156 } else {
157 fragment.ref_positions[0] + 1
158 };
159
160 let r2_truth = TruthAlignment {
161 contig: contig_name.to_string(),
162 position: r2_ref_pos,
163 is_forward: !fragment.is_forward,
164 haplotype: fragment.haplotype_index,
165 n_errors: r2_errors,
166 };
167
168 let name = if simple_names {
169 simple_name(read_num)
170 } else {
171 encoded_pe_name(read_num, &r1_truth, &r2_truth)
172 };
173
174 ReadPair {
175 read1: SimulatedRead { name: name.clone(), bases: r1_bases, qualities: r1_quals },
176 read2: Some(SimulatedRead { name, bases: r2_bases, qualities: r2_quals }),
177 r1_truth,
178 r2_truth: Some(r2_truth),
179 r1_cigar,
180 r2_cigar: Some(r2_cigar),
181 }
182}
183
184#[must_use]
201pub fn cigar_from_ref_positions(
202 positions: &[u32],
203 adapter_bases: usize,
204 negative_strand: bool,
205) -> Cigar {
206 let mut ops: Vec<Op> = Vec::new();
207
208 if positions.is_empty() {
209 if adapter_bases > 0 {
210 ops.push(Op::new(Kind::SoftClip, adapter_bases));
211 }
212 return Cigar::from(ops);
213 }
214
215 let mut match_run: usize = 1; let mut ins_run: usize = 0;
217
218 for i in 1..positions.len() {
219 let prev = positions[i - 1];
220 let curr = positions[i];
221
222 if curr == prev + 1 {
223 if ins_run > 0 {
225 ops.push(Op::new(Kind::Insertion, ins_run));
226 ins_run = 0;
227 }
228 match_run += 1;
229 } else if curr == prev {
230 if match_run > 0 {
232 ops.push(Op::new(Kind::Match, match_run));
233 match_run = 0;
234 }
235 ins_run += 1;
236 } else {
237 if ins_run > 0 {
239 ops.push(Op::new(Kind::Insertion, ins_run));
240 ins_run = 0;
241 }
242 if match_run > 0 {
243 ops.push(Op::new(Kind::Match, match_run));
244 }
245 let gap = curr.saturating_sub(prev).saturating_sub(1);
246 if gap > 0 {
247 ops.push(Op::new(Kind::Deletion, gap as usize));
248 }
249 match_run = 1; }
251 }
252
253 if ins_run > 0 {
255 ops.push(Op::new(Kind::Insertion, ins_run));
256 }
257 if match_run > 0 {
258 ops.push(Op::new(Kind::Match, match_run));
259 }
260
261 if adapter_bases > 0 {
265 if negative_strand {
266 ops.insert(0, Op::new(Kind::SoftClip, adapter_bases));
267 } else {
268 ops.push(Op::new(Kind::SoftClip, adapter_bases));
269 }
270 }
271
272 Cigar::from(ops)
273}
274
275#[must_use]
277pub fn cigar_to_string(cigar: &Cigar) -> String {
278 use std::fmt::Write;
279 cigar.as_ref().iter().fold(String::new(), |mut s, op| {
280 let kind_char = match op.kind() {
281 Kind::Match => 'M',
282 Kind::Insertion => 'I',
283 Kind::Deletion => 'D',
284 Kind::SoftClip => 'S',
285 Kind::HardClip => 'H',
286 Kind::Skip => 'N',
287 Kind::Pad => 'P',
288 Kind::SequenceMatch => '=',
289 Kind::SequenceMismatch => 'X',
290 };
291 let _ = write!(s, "{}{kind_char}", op.len());
292 s
293 })
294}
295
296#[cfg(test)]
297mod tests {
298 use rand::SeedableRng;
299 use rand::rngs::SmallRng;
300
301 use super::*;
302 use crate::error_model::illumina::IlluminaErrorModel;
303
304 fn test_fragment(bases: &[u8], ref_start: u32) -> Fragment {
306 #[expect(clippy::cast_possible_truncation, reason = "test data is small")]
307 let ref_positions: Vec<u32> = (ref_start..ref_start + bases.len() as u32).collect();
308 Fragment {
309 bases: bases.to_vec(),
310 ref_positions,
311 ref_start,
312 is_forward: true,
313 haplotype_index: 0,
314 }
315 }
316
317 #[test]
320 fn test_cigar_all_match() {
321 let cigar = cigar_from_ref_positions(&[0, 1, 2, 3, 4], 0, false);
322 assert_eq!(cigar_to_string(&cigar), "5M");
323 }
324
325 #[test]
326 fn test_cigar_with_insertion() {
327 let cigar = cigar_from_ref_positions(&[0, 1, 2, 2, 2, 3, 4], 0, false);
329 assert_eq!(cigar_to_string(&cigar), "3M2I2M");
330 }
331
332 #[test]
333 fn test_cigar_with_deletion() {
334 let cigar = cigar_from_ref_positions(&[0, 1, 2, 5, 6], 0, false);
336 assert_eq!(cigar_to_string(&cigar), "3M2D2M");
337 }
338
339 #[test]
340 fn test_cigar_with_adapter_softclip_forward() {
341 let cigar = cigar_from_ref_positions(&[0, 1, 2], 2, false);
343 assert_eq!(cigar_to_string(&cigar), "3M2S");
344 }
345
346 #[test]
347 fn test_cigar_with_adapter_softclip_negative_strand() {
348 let cigar = cigar_from_ref_positions(&[0, 1, 2], 2, true);
350 assert_eq!(cigar_to_string(&cigar), "2S3M");
351 }
352
353 #[test]
354 fn test_cigar_all_adapter() {
355 let cigar = cigar_from_ref_positions(&[], 5, false);
357 assert_eq!(cigar_to_string(&cigar), "5S");
358 }
359
360 #[test]
361 fn test_cigar_with_insertion_and_deletion() {
362 let cigar = cigar_from_ref_positions(&[0, 1, 2, 2, 5, 6], 0, false);
364 assert_eq!(cigar_to_string(&cigar), "3M1I2D2M");
365 }
366
367 #[test]
368 fn test_cigar_with_adapter_and_deletion() {
369 let cigar = cigar_from_ref_positions(&[0, 1, 4, 5], 3, false);
370 assert_eq!(cigar_to_string(&cigar), "2M2D2M3S");
371 }
372
373 #[test]
374 fn test_cigar_single_base() {
375 let cigar = cigar_from_ref_positions(&[42], 0, false);
376 assert_eq!(cigar_to_string(&cigar), "1M");
377 }
378
379 #[test]
380 fn test_cigar_high_positions() {
381 let cigar = cigar_from_ref_positions(&[100, 101, 102, 103, 104], 0, true);
384 assert_eq!(cigar_to_string(&cigar), "5M");
385 }
386
387 #[test]
390 fn test_generate_pe_read_pair() {
391 let fragment = test_fragment(b"ACGTACGTACGTACGTACGT", 100);
392 let model = IlluminaErrorModel::new(10, 0.0, 0.0);
393 let mut rng = SmallRng::seed_from_u64(42);
394
395 let pair = generate_read_pair(
396 &fragment, "chr1", 1, 10, true, b"ADAPTER", b"ADAPTER", &model, false, &mut rng,
397 );
398
399 assert_eq!(pair.read1.bases, b"ACGTACGTAC");
400 assert!(pair.read2.is_some());
401 assert_eq!(pair.r1_truth.position, 101);
402 assert!(pair.r2_truth.is_some());
403 assert_eq!(cigar_to_string(&pair.r1_cigar), "10M");
404 assert_eq!(cigar_to_string(pair.r2_cigar.as_ref().unwrap()), "10M");
405 }
406
407 #[test]
408 fn test_generate_se_read() {
409 let fragment = test_fragment(b"ACGTACGTAC", 100);
410 let model = IlluminaErrorModel::new(10, 0.0, 0.0);
411 let mut rng = SmallRng::seed_from_u64(42);
412
413 let pair = generate_read_pair(
414 &fragment, "chr1", 5, 10, false, b"ADAPTER", b"ADAPTER", &model, false, &mut rng,
415 );
416
417 assert!(pair.read2.is_none());
418 assert!(pair.r2_cigar.is_none());
419 assert_eq!(cigar_to_string(&pair.r1_cigar), "10M");
420 }
421
422 #[test]
423 fn test_adapter_cigar_softclip() {
424 let fragment = test_fragment(b"AC", 0);
427 let model = IlluminaErrorModel::new(5, 0.0, 0.0);
428 let mut rng = SmallRng::seed_from_u64(42);
429
430 let pair = generate_read_pair(
431 &fragment, "chr1", 1, 5, true, b"TTTTT", b"GGGGG", &model, false, &mut rng,
432 );
433
434 assert_eq!(cigar_to_string(&pair.r1_cigar), "2M3S");
435 assert_eq!(cigar_to_string(pair.r2_cigar.as_ref().unwrap()), "3S2M");
436 }
437
438 #[test]
439 fn test_simple_name_mode() {
440 let fragment = test_fragment(b"ACGT", 0);
441 let model = IlluminaErrorModel::new(4, 0.0, 0.0);
442 let mut rng = SmallRng::seed_from_u64(42);
443
444 let pair =
445 generate_read_pair(&fragment, "chr1", 42, 4, true, b"A", b"A", &model, true, &mut rng);
446
447 assert_eq!(pair.read1.name, "holodeck:42");
448 }
449
450 #[test]
451 fn test_quality_scores_correct_length() {
452 let fragment = test_fragment(b"ACGTACGTAC", 0);
453 let model = IlluminaErrorModel::new(10, 0.001, 0.01);
454 let mut rng = SmallRng::seed_from_u64(42);
455
456 let pair =
457 generate_read_pair(&fragment, "chr1", 1, 10, true, b"A", b"A", &model, false, &mut rng);
458
459 assert_eq!(pair.read1.qualities.len(), 10);
460 assert_eq!(pair.read2.as_ref().unwrap().qualities.len(), 10);
461 }
462}