Skip to main content

holodeck_lib/
read.rs

1//! Simulated read pair generation.
2//!
3//! Combines fragment extraction, error model application, and read naming
4//! into complete [`ReadPair`] objects ready for FASTQ output. Computes
5//! proper CIGARs from the haplotype-to-reference coordinate mapping for
6//! golden BAM output.
7
8use noodles::sam::alignment::record::cigar::op::{Kind, Op};
9use noodles::sam::alignment::record_buf::Cigar;
10use rand::Rng;
11
12use crate::error_model::{self, ErrorModel, ReadEnd};
13use crate::fragment::{Fragment, extract_read_bases};
14use crate::read_naming::{TruthAlignment, encoded_pe_name, encoded_se_name, simple_name};
15
16/// A single simulated read with bases, quality scores, and metadata.
17#[derive(Debug, Clone)]
18pub struct SimulatedRead {
19    /// Read name (shared between R1 and R2 of a pair).
20    pub name: String,
21    /// Base sequence (possibly with errors applied).
22    pub bases: Vec<u8>,
23    /// Quality scores (Phred+33 encoded).
24    pub qualities: Vec<u8>,
25}
26
27/// A simulated read pair (or single read for SE mode).
28#[derive(Debug)]
29pub struct ReadPair {
30    /// First read (always present).
31    pub read1: SimulatedRead,
32    /// Second read (present only for paired-end mode).
33    pub read2: Option<SimulatedRead>,
34    /// Truth alignment for R1.
35    pub r1_truth: TruthAlignment,
36    /// Truth alignment for R2 (present only for paired-end mode).
37    pub r2_truth: Option<TruthAlignment>,
38    /// Truth CIGAR for R1, reflecting haplotype variants and adapter
39    /// soft-clipping.
40    pub r1_cigar: Cigar,
41    /// Truth CIGAR for R2. `None` for single-end reads.
42    pub r2_cigar: Option<Cigar>,
43}
44
45/// Generate a simulated read pair from a fragment.
46///
47/// Extracts R1 and optionally R2 bases from the fragment, applies the error
48/// model, constructs read names with truth information, and computes CIGARs
49/// from the fragment's reference coordinate mapping.
50///
51/// # Arguments
52/// * `fragment` — The source fragment with bases and reference positions.
53/// * `contig_name` — Contig name for read naming.
54/// * `read_num` — 1-based read pair number.
55/// * `read_length` — Desired read length.
56/// * `paired` — Whether to generate both R1 and R2.
57/// * `adapter_r1` — Adapter sequence for R1.
58/// * `adapter_r2` — Adapter sequence for R2.
59/// * `error_model` — Error model to apply.
60/// * `simple_names` — Use simple names instead of encoded truth names.
61/// * `rng` — Random number generator.
62#[allow(clippy::too_many_arguments)] // Simulation parameters are naturally numerous
63pub fn generate_read_pair(
64    fragment: &Fragment,
65    contig_name: &str,
66    read_num: u64,
67    read_length: usize,
68    paired: bool,
69    adapter_r1: &[u8],
70    adapter_r2: &[u8],
71    model: &impl ErrorModel,
72    simple_names: bool,
73    rng: &mut impl Rng,
74) -> ReadPair {
75    let frag_len = fragment.bases.len();
76    let genomic = frag_len.min(read_length);
77    let adapter_bases = read_length.saturating_sub(genomic);
78    let right_start = frag_len.saturating_sub(genomic);
79
80    // For a forward (top-strand) fragment, R1 reads from the left end
81    // (forward strand) and R2 reads from the right end (reverse strand).
82    // For a reverse (bottom-strand) fragment, R1 reads from the right end
83    // (reverse strand) and R2 reads from the left end (forward strand).
84    // The left read is always forward-strand and the right read is always
85    // reverse-strand; what changes is which becomes R1 vs R2.
86    let r1_negative_strand = !fragment.is_forward;
87
88    // Extract R1 bases.
89    let mut r1_bases =
90        extract_read_bases(&fragment.bases, read_length, adapter_r1, r1_negative_strand);
91    let (r1_errors, r1_quals) =
92        error_model::apply_errors(model, &mut r1_bases, ReadEnd::Read1, rng);
93
94    // R1 ref_positions and CIGAR.  Fragment ref_positions are always in
95    // ascending (forward) reference order; negative-strand reads take from
96    // the right end of the fragment but the positions are still ascending.
97    let r1_positions = if r1_negative_strand {
98        &fragment.ref_positions[right_start..frag_len]
99    } else {
100        &fragment.ref_positions[..genomic]
101    };
102    let r1_cigar = cigar_from_ref_positions(r1_positions, adapter_bases, r1_negative_strand);
103
104    // R1 truth position: leftmost reference coordinate (1-based).
105    let r1_ref_pos = if fragment.ref_positions.is_empty() {
106        0
107    } else if r1_negative_strand {
108        fragment.ref_positions[right_start] + 1
109    } else {
110        fragment.ref_positions[0] + 1
111    };
112
113    // Fragment length in bases. When it is shorter than the read length,
114    // bases `[fragment_length..read_length)` of each emitted read are adapter
115    // (optionally N-padded).
116    #[expect(clippy::cast_possible_truncation, reason = "fragment length fits in u32")]
117    let fragment_length = frag_len as u32;
118
119    let r1_truth = TruthAlignment {
120        contig: contig_name.to_string(),
121        position: r1_ref_pos,
122        is_forward: fragment.is_forward,
123        haplotype: fragment.haplotype_index,
124        fragment_length,
125        n_errors: r1_errors,
126    };
127
128    if !paired {
129        let name =
130            if simple_names { simple_name(read_num) } else { encoded_se_name(read_num, &r1_truth) };
131
132        return ReadPair {
133            read1: SimulatedRead { name, bases: r1_bases, qualities: r1_quals },
134            read2: None,
135            r1_truth,
136            r2_truth: None,
137            r1_cigar,
138            r2_cigar: None,
139        };
140    }
141
142    // Paired-end: extract R2 bases from the opposite end of the fragment.
143    // R2 is on the negative strand whenever the fragment is forward-strand.
144    let r2_negative_strand = fragment.is_forward;
145    let mut r2_bases =
146        extract_read_bases(&fragment.bases, read_length, adapter_r2, r2_negative_strand);
147    let (r2_errors, r2_quals) =
148        error_model::apply_errors(model, &mut r2_bases, ReadEnd::Read2, rng);
149
150    // R2 ref_positions and CIGAR.
151    let r2_positions = if r2_negative_strand {
152        &fragment.ref_positions[right_start..frag_len]
153    } else {
154        &fragment.ref_positions[..genomic]
155    };
156    let r2_cigar = cigar_from_ref_positions(r2_positions, adapter_bases, r2_negative_strand);
157
158    // R2 truth position: leftmost reference coordinate (1-based).
159    let r2_ref_pos = if fragment.ref_positions.is_empty() {
160        0
161    } else if r2_negative_strand {
162        fragment.ref_positions[right_start] + 1
163    } else {
164        fragment.ref_positions[0] + 1
165    };
166
167    let r2_truth = TruthAlignment {
168        contig: contig_name.to_string(),
169        position: r2_ref_pos,
170        is_forward: !fragment.is_forward,
171        haplotype: fragment.haplotype_index,
172        fragment_length,
173        n_errors: r2_errors,
174    };
175
176    let name = if simple_names {
177        simple_name(read_num)
178    } else {
179        encoded_pe_name(read_num, &r1_truth, &r2_truth)
180    };
181
182    ReadPair {
183        read1: SimulatedRead { name: name.clone(), bases: r1_bases, qualities: r1_quals },
184        read2: Some(SimulatedRead { name, bases: r2_bases, qualities: r2_quals }),
185        r1_truth,
186        r2_truth: Some(r2_truth),
187        r1_cigar,
188        r2_cigar: Some(r2_cigar),
189    }
190}
191
192/// Compute a CIGAR from a slice of ascending reference positions, plus
193/// optional adapter soft-clipping.
194///
195/// Positions must be in ascending order (forward strand). Consecutive
196/// positions incrementing by 1 produce M ops, same position produces I ops,
197/// and gaps produce D ops.
198///
199/// This works for both R1 and R2: BAM CIGARs are always expressed in forward
200/// reference order from the leftmost aligned position, so even negative-strand
201/// reads use ascending positions.
202///
203/// Adapter bases are appended as a soft-clip (S) operation. For
204/// negative-strand reads (`negative_strand = true`) the adapter is sequenced
205/// at the 3' end of the read but sits at the left (5') end of the stored BAM
206/// record after reverse-complementing, so the S op is placed at the *start*
207/// of the CIGAR. For forward-strand reads the S op is placed at the *end*.
208#[must_use]
209pub fn cigar_from_ref_positions(
210    positions: &[u32],
211    adapter_bases: usize,
212    negative_strand: bool,
213) -> Cigar {
214    let mut ops: Vec<Op> = Vec::new();
215
216    if positions.is_empty() {
217        if adapter_bases > 0 {
218            ops.push(Op::new(Kind::SoftClip, adapter_bases));
219        }
220        return Cigar::from(ops);
221    }
222
223    let mut match_run: usize = 1; // First base is always a match.
224    let mut ins_run: usize = 0;
225
226    for i in 1..positions.len() {
227        let prev = positions[i - 1];
228        let curr = positions[i];
229
230        if curr == prev + 1 {
231            // Sequential ascending position: flush any insertion, extend match.
232            if ins_run > 0 {
233                ops.push(Op::new(Kind::Insertion, ins_run));
234                ins_run = 0;
235            }
236            match_run += 1;
237        } else if curr == prev {
238            // Same position: insertion base. Flush match run, extend insertion.
239            if match_run > 0 {
240                ops.push(Op::new(Kind::Match, match_run));
241                match_run = 0;
242            }
243            ins_run += 1;
244        } else {
245            // Gap in positions: deletion. Flush current runs.
246            if ins_run > 0 {
247                ops.push(Op::new(Kind::Insertion, ins_run));
248                ins_run = 0;
249            }
250            if match_run > 0 {
251                ops.push(Op::new(Kind::Match, match_run));
252            }
253            let gap = curr.saturating_sub(prev).saturating_sub(1);
254            if gap > 0 {
255                ops.push(Op::new(Kind::Deletion, gap as usize));
256            }
257            match_run = 1; // Current base starts a new match.
258        }
259    }
260
261    // Flush remaining runs.
262    if ins_run > 0 {
263        ops.push(Op::new(Kind::Insertion, ins_run));
264    }
265    if match_run > 0 {
266        ops.push(Op::new(Kind::Match, match_run));
267    }
268
269    // Adapter bases as soft-clip. Negative-strand reads are stored
270    // reverse-complemented in BAM, so the adapter (at the 3' end in read
271    // order) moves to the left (5') end of the record — a leading S op.
272    if adapter_bases > 0 {
273        if negative_strand {
274            ops.insert(0, Op::new(Kind::SoftClip, adapter_bases));
275        } else {
276            ops.push(Op::new(Kind::SoftClip, adapter_bases));
277        }
278    }
279
280    Cigar::from(ops)
281}
282
283/// Format a CIGAR as a human-readable string (e.g. "50M3I20M2S").
284#[must_use]
285pub fn cigar_to_string(cigar: &Cigar) -> String {
286    use std::fmt::Write;
287    cigar.as_ref().iter().fold(String::new(), |mut s, op| {
288        let kind_char = match op.kind() {
289            Kind::Match => 'M',
290            Kind::Insertion => 'I',
291            Kind::Deletion => 'D',
292            Kind::SoftClip => 'S',
293            Kind::HardClip => 'H',
294            Kind::Skip => 'N',
295            Kind::Pad => 'P',
296            Kind::SequenceMatch => '=',
297            Kind::SequenceMismatch => 'X',
298        };
299        let _ = write!(s, "{}{kind_char}", op.len());
300        s
301    })
302}
303
304#[cfg(test)]
305mod tests {
306    use rand::SeedableRng;
307    use rand::rngs::SmallRng;
308
309    use super::*;
310    use crate::error_model::illumina::IlluminaErrorModel;
311
312    /// Build a simple fragment for testing.
313    fn test_fragment(bases: &[u8], ref_start: u32) -> Fragment {
314        #[expect(clippy::cast_possible_truncation, reason = "test data is small")]
315        let ref_positions: Vec<u32> = (ref_start..ref_start + bases.len() as u32).collect();
316        Fragment {
317            bases: bases.to_vec(),
318            ref_positions,
319            ref_start,
320            is_forward: true,
321            haplotype_index: 0,
322        }
323    }
324
325    // --- CIGAR generation tests ---
326
327    #[test]
328    fn test_cigar_all_match() {
329        let cigar = cigar_from_ref_positions(&[0, 1, 2, 3, 4], 0, false);
330        assert_eq!(cigar_to_string(&cigar), "5M");
331    }
332
333    #[test]
334    fn test_cigar_with_insertion() {
335        // Positions 0,1,2,2,2,3,4: two inserted bases at ref pos 2.
336        let cigar = cigar_from_ref_positions(&[0, 1, 2, 2, 2, 3, 4], 0, false);
337        assert_eq!(cigar_to_string(&cigar), "3M2I2M");
338    }
339
340    #[test]
341    fn test_cigar_with_deletion() {
342        // Gap from 2 to 5: 2 deleted ref bases.
343        let cigar = cigar_from_ref_positions(&[0, 1, 2, 5, 6], 0, false);
344        assert_eq!(cigar_to_string(&cigar), "3M2D2M");
345    }
346
347    #[test]
348    fn test_cigar_with_adapter_softclip_forward() {
349        // Forward-strand: adapter is a trailing soft-clip.
350        let cigar = cigar_from_ref_positions(&[0, 1, 2], 2, false);
351        assert_eq!(cigar_to_string(&cigar), "3M2S");
352    }
353
354    #[test]
355    fn test_cigar_with_adapter_softclip_negative_strand() {
356        // Negative-strand: adapter moves to a leading soft-clip after RC.
357        let cigar = cigar_from_ref_positions(&[0, 1, 2], 2, true);
358        assert_eq!(cigar_to_string(&cigar), "2S3M");
359    }
360
361    #[test]
362    fn test_cigar_all_adapter() {
363        // All-adapter read: placement doesn't matter, but it still round-trips.
364        let cigar = cigar_from_ref_positions(&[], 5, false);
365        assert_eq!(cigar_to_string(&cigar), "5S");
366    }
367
368    #[test]
369    fn test_cigar_with_insertion_and_deletion() {
370        // Insertion at pos 2 (two extra bases), then deletion of 2 ref bases.
371        let cigar = cigar_from_ref_positions(&[0, 1, 2, 2, 5, 6], 0, false);
372        assert_eq!(cigar_to_string(&cigar), "3M1I2D2M");
373    }
374
375    #[test]
376    fn test_cigar_with_adapter_and_deletion() {
377        let cigar = cigar_from_ref_positions(&[0, 1, 4, 5], 3, false);
378        assert_eq!(cigar_to_string(&cigar), "2M2D2M3S");
379    }
380
381    #[test]
382    fn test_cigar_single_base() {
383        let cigar = cigar_from_ref_positions(&[42], 0, false);
384        assert_eq!(cigar_to_string(&cigar), "1M");
385    }
386
387    #[test]
388    fn test_cigar_high_positions() {
389        // Negative-strand R2: positions start from a high offset (ascending),
390        // no adapter — CIGAR is identical to forward strand.
391        let cigar = cigar_from_ref_positions(&[100, 101, 102, 103, 104], 0, true);
392        assert_eq!(cigar_to_string(&cigar), "5M");
393    }
394
395    // --- Read pair generation tests ---
396
397    #[test]
398    fn test_generate_pe_read_pair() {
399        let fragment = test_fragment(b"ACGTACGTACGTACGTACGT", 100);
400        let model = IlluminaErrorModel::new(10, 0.0, 0.0);
401        let mut rng = SmallRng::seed_from_u64(42);
402
403        let pair = generate_read_pair(
404            &fragment, "chr1", 1, 10, true, b"ADAPTER", b"ADAPTER", &model, false, &mut rng,
405        );
406
407        assert_eq!(pair.read1.bases, b"ACGTACGTAC");
408        assert!(pair.read2.is_some());
409        assert_eq!(pair.r1_truth.position, 101);
410        assert!(pair.r2_truth.is_some());
411        assert_eq!(cigar_to_string(&pair.r1_cigar), "10M");
412        assert_eq!(cigar_to_string(pair.r2_cigar.as_ref().unwrap()), "10M");
413    }
414
415    #[test]
416    fn test_generate_se_read() {
417        let fragment = test_fragment(b"ACGTACGTAC", 100);
418        let model = IlluminaErrorModel::new(10, 0.0, 0.0);
419        let mut rng = SmallRng::seed_from_u64(42);
420
421        let pair = generate_read_pair(
422            &fragment, "chr1", 5, 10, false, b"ADAPTER", b"ADAPTER", &model, false, &mut rng,
423        );
424
425        assert!(pair.read2.is_none());
426        assert!(pair.r2_cigar.is_none());
427        assert_eq!(cigar_to_string(&pair.r1_cigar), "10M");
428    }
429
430    #[test]
431    fn test_adapter_cigar_softclip() {
432        // Forward-strand fragment: R1 is positive-strand (trailing S), R2 is
433        // negative-strand (leading S after BAM reverse-complement).
434        let fragment = test_fragment(b"AC", 0);
435        let model = IlluminaErrorModel::new(5, 0.0, 0.0);
436        let mut rng = SmallRng::seed_from_u64(42);
437
438        let pair = generate_read_pair(
439            &fragment, "chr1", 1, 5, true, b"TTTTT", b"GGGGG", &model, false, &mut rng,
440        );
441
442        assert_eq!(cigar_to_string(&pair.r1_cigar), "2M3S");
443        assert_eq!(cigar_to_string(pair.r2_cigar.as_ref().unwrap()), "3S2M");
444    }
445
446    #[test]
447    fn test_simple_name_mode() {
448        let fragment = test_fragment(b"ACGT", 0);
449        let model = IlluminaErrorModel::new(4, 0.0, 0.0);
450        let mut rng = SmallRng::seed_from_u64(42);
451
452        let pair =
453            generate_read_pair(&fragment, "chr1", 42, 4, true, b"A", b"A", &model, true, &mut rng);
454
455        assert_eq!(pair.read1.name, "holodeck::42");
456    }
457
458    #[test]
459    fn test_quality_scores_correct_length() {
460        let fragment = test_fragment(b"ACGTACGTAC", 0);
461        let model = IlluminaErrorModel::new(10, 0.001, 0.01);
462        let mut rng = SmallRng::seed_from_u64(42);
463
464        let pair =
465            generate_read_pair(&fragment, "chr1", 1, 10, true, b"A", b"A", &model, false, &mut rng);
466
467        assert_eq!(pair.read1.qualities.len(), 10);
468        assert_eq!(pair.read2.as_ref().unwrap().qualities.len(), 10);
469    }
470}