Skip to main content

holodeck_lib/
read.rs

1//! Simulated read pair generation.
2//!
3//! Combines fragment extraction, error model application, and read naming
4//! into complete [`ReadPair`] objects ready for FASTQ output. Computes
5//! proper CIGARs from the haplotype-to-reference coordinate mapping for
6//! golden BAM output.
7
8use noodles::sam::alignment::record::cigar::op::{Kind, Op};
9use noodles::sam::alignment::record_buf::Cigar;
10use rand::Rng;
11
12use crate::error_model::{self, ErrorModel, ReadEnd};
13use crate::fragment::{Fragment, extract_read_bases};
14use crate::read_naming::{TruthAlignment, encoded_pe_name, encoded_se_name, simple_name};
15
16/// A single simulated read with bases, quality scores, and metadata.
17#[derive(Debug, Clone)]
18pub struct SimulatedRead {
19    /// Read name (shared between R1 and R2 of a pair).
20    pub name: String,
21    /// Base sequence (possibly with errors applied).
22    pub bases: Vec<u8>,
23    /// Quality scores (Phred+33 encoded).
24    pub qualities: Vec<u8>,
25}
26
27/// A simulated read pair (or single read for SE mode).
28#[derive(Debug)]
29pub struct ReadPair {
30    /// First read (always present).
31    pub read1: SimulatedRead,
32    /// Second read (present only for paired-end mode).
33    pub read2: Option<SimulatedRead>,
34    /// Truth alignment for R1.
35    pub r1_truth: TruthAlignment,
36    /// Truth alignment for R2 (present only for paired-end mode).
37    pub r2_truth: Option<TruthAlignment>,
38    /// Truth CIGAR for R1, reflecting haplotype variants and adapter
39    /// soft-clipping.
40    pub r1_cigar: Cigar,
41    /// Truth CIGAR for R2. `None` for single-end reads.
42    pub r2_cigar: Option<Cigar>,
43}
44
45/// Generate a simulated read pair from a fragment.
46///
47/// Extracts R1 and optionally R2 bases from the fragment, applies the error
48/// model, constructs read names with truth information, and computes CIGARs
49/// from the fragment's reference coordinate mapping.
50///
51/// # Arguments
52/// * `fragment` — The source fragment with bases and reference positions.
53/// * `contig_name` — Contig name for read naming.
54/// * `read_num` — 1-based read pair number.
55/// * `read_length` — Desired read length.
56/// * `paired` — Whether to generate both R1 and R2.
57/// * `adapter_r1` — Adapter sequence for R1.
58/// * `adapter_r2` — Adapter sequence for R2.
59/// * `error_model` — Error model to apply.
60/// * `simple_names` — Use simple names instead of encoded truth names.
61/// * `rng` — Random number generator.
62#[allow(clippy::too_many_arguments)] // Simulation parameters are naturally numerous
63pub fn generate_read_pair(
64    fragment: &Fragment,
65    contig_name: &str,
66    read_num: u64,
67    read_length: usize,
68    paired: bool,
69    adapter_r1: &[u8],
70    adapter_r2: &[u8],
71    model: &impl ErrorModel,
72    simple_names: bool,
73    rng: &mut impl Rng,
74) -> ReadPair {
75    let frag_len = fragment.bases.len();
76    let genomic = frag_len.min(read_length);
77    let adapter_bases = read_length.saturating_sub(genomic);
78    let right_start = frag_len.saturating_sub(genomic);
79
80    // For a forward (top-strand) fragment, R1 reads from the left end
81    // (forward strand) and R2 reads from the right end (reverse strand).
82    // For a reverse (bottom-strand) fragment, R1 reads from the right end
83    // (reverse strand) and R2 reads from the left end (forward strand).
84    // The left read is always forward-strand and the right read is always
85    // reverse-strand; what changes is which becomes R1 vs R2.
86    let r1_negative_strand = !fragment.is_forward;
87
88    // Extract R1 bases.
89    let mut r1_bases =
90        extract_read_bases(&fragment.bases, read_length, adapter_r1, r1_negative_strand);
91    let (r1_errors, r1_quals) =
92        error_model::apply_errors(model, &mut r1_bases, ReadEnd::Read1, rng);
93
94    // R1 ref_positions and CIGAR.  Fragment ref_positions are always in
95    // ascending (forward) reference order; negative-strand reads take from
96    // the right end of the fragment but the positions are still ascending.
97    let r1_positions = if r1_negative_strand {
98        &fragment.ref_positions[right_start..frag_len]
99    } else {
100        &fragment.ref_positions[..genomic]
101    };
102    let r1_cigar = cigar_from_ref_positions(r1_positions, adapter_bases, r1_negative_strand);
103
104    // R1 truth position: leftmost reference coordinate (1-based).
105    let r1_ref_pos = if fragment.ref_positions.is_empty() {
106        0
107    } else if r1_negative_strand {
108        fragment.ref_positions[right_start] + 1
109    } else {
110        fragment.ref_positions[0] + 1
111    };
112
113    let r1_truth = TruthAlignment {
114        contig: contig_name.to_string(),
115        position: r1_ref_pos,
116        is_forward: fragment.is_forward,
117        haplotype: fragment.haplotype_index,
118        n_errors: r1_errors,
119    };
120
121    if !paired {
122        let name =
123            if simple_names { simple_name(read_num) } else { encoded_se_name(read_num, &r1_truth) };
124
125        return ReadPair {
126            read1: SimulatedRead { name, bases: r1_bases, qualities: r1_quals },
127            read2: None,
128            r1_truth,
129            r2_truth: None,
130            r1_cigar,
131            r2_cigar: None,
132        };
133    }
134
135    // Paired-end: extract R2 bases from the opposite end of the fragment.
136    // R2 is on the negative strand whenever the fragment is forward-strand.
137    let r2_negative_strand = fragment.is_forward;
138    let mut r2_bases =
139        extract_read_bases(&fragment.bases, read_length, adapter_r2, r2_negative_strand);
140    let (r2_errors, r2_quals) =
141        error_model::apply_errors(model, &mut r2_bases, ReadEnd::Read2, rng);
142
143    // R2 ref_positions and CIGAR.
144    let r2_positions = if r2_negative_strand {
145        &fragment.ref_positions[right_start..frag_len]
146    } else {
147        &fragment.ref_positions[..genomic]
148    };
149    let r2_cigar = cigar_from_ref_positions(r2_positions, adapter_bases, r2_negative_strand);
150
151    // R2 truth position: leftmost reference coordinate (1-based).
152    let r2_ref_pos = if fragment.ref_positions.is_empty() {
153        0
154    } else if r2_negative_strand {
155        fragment.ref_positions[right_start] + 1
156    } else {
157        fragment.ref_positions[0] + 1
158    };
159
160    let r2_truth = TruthAlignment {
161        contig: contig_name.to_string(),
162        position: r2_ref_pos,
163        is_forward: !fragment.is_forward,
164        haplotype: fragment.haplotype_index,
165        n_errors: r2_errors,
166    };
167
168    let name = if simple_names {
169        simple_name(read_num)
170    } else {
171        encoded_pe_name(read_num, &r1_truth, &r2_truth)
172    };
173
174    ReadPair {
175        read1: SimulatedRead { name: name.clone(), bases: r1_bases, qualities: r1_quals },
176        read2: Some(SimulatedRead { name, bases: r2_bases, qualities: r2_quals }),
177        r1_truth,
178        r2_truth: Some(r2_truth),
179        r1_cigar,
180        r2_cigar: Some(r2_cigar),
181    }
182}
183
184/// Compute a CIGAR from a slice of ascending reference positions, plus
185/// optional adapter soft-clipping.
186///
187/// Positions must be in ascending order (forward strand). Consecutive
188/// positions incrementing by 1 produce M ops, same position produces I ops,
189/// and gaps produce D ops.
190///
191/// This works for both R1 and R2: BAM CIGARs are always expressed in forward
192/// reference order from the leftmost aligned position, so even negative-strand
193/// reads use ascending positions.
194///
195/// Adapter bases are appended as a soft-clip (S) operation. For
196/// negative-strand reads (`negative_strand = true`) the adapter is sequenced
197/// at the 3' end of the read but sits at the left (5') end of the stored BAM
198/// record after reverse-complementing, so the S op is placed at the *start*
199/// of the CIGAR. For forward-strand reads the S op is placed at the *end*.
200#[must_use]
201pub fn cigar_from_ref_positions(
202    positions: &[u32],
203    adapter_bases: usize,
204    negative_strand: bool,
205) -> Cigar {
206    let mut ops: Vec<Op> = Vec::new();
207
208    if positions.is_empty() {
209        if adapter_bases > 0 {
210            ops.push(Op::new(Kind::SoftClip, adapter_bases));
211        }
212        return Cigar::from(ops);
213    }
214
215    let mut match_run: usize = 1; // First base is always a match.
216    let mut ins_run: usize = 0;
217
218    for i in 1..positions.len() {
219        let prev = positions[i - 1];
220        let curr = positions[i];
221
222        if curr == prev + 1 {
223            // Sequential ascending position: flush any insertion, extend match.
224            if ins_run > 0 {
225                ops.push(Op::new(Kind::Insertion, ins_run));
226                ins_run = 0;
227            }
228            match_run += 1;
229        } else if curr == prev {
230            // Same position: insertion base. Flush match run, extend insertion.
231            if match_run > 0 {
232                ops.push(Op::new(Kind::Match, match_run));
233                match_run = 0;
234            }
235            ins_run += 1;
236        } else {
237            // Gap in positions: deletion. Flush current runs.
238            if ins_run > 0 {
239                ops.push(Op::new(Kind::Insertion, ins_run));
240                ins_run = 0;
241            }
242            if match_run > 0 {
243                ops.push(Op::new(Kind::Match, match_run));
244            }
245            let gap = curr.saturating_sub(prev).saturating_sub(1);
246            if gap > 0 {
247                ops.push(Op::new(Kind::Deletion, gap as usize));
248            }
249            match_run = 1; // Current base starts a new match.
250        }
251    }
252
253    // Flush remaining runs.
254    if ins_run > 0 {
255        ops.push(Op::new(Kind::Insertion, ins_run));
256    }
257    if match_run > 0 {
258        ops.push(Op::new(Kind::Match, match_run));
259    }
260
261    // Adapter bases as soft-clip. Negative-strand reads are stored
262    // reverse-complemented in BAM, so the adapter (at the 3' end in read
263    // order) moves to the left (5') end of the record — a leading S op.
264    if adapter_bases > 0 {
265        if negative_strand {
266            ops.insert(0, Op::new(Kind::SoftClip, adapter_bases));
267        } else {
268            ops.push(Op::new(Kind::SoftClip, adapter_bases));
269        }
270    }
271
272    Cigar::from(ops)
273}
274
275/// Format a CIGAR as a human-readable string (e.g. "50M3I20M2S").
276#[must_use]
277pub fn cigar_to_string(cigar: &Cigar) -> String {
278    use std::fmt::Write;
279    cigar.as_ref().iter().fold(String::new(), |mut s, op| {
280        let kind_char = match op.kind() {
281            Kind::Match => 'M',
282            Kind::Insertion => 'I',
283            Kind::Deletion => 'D',
284            Kind::SoftClip => 'S',
285            Kind::HardClip => 'H',
286            Kind::Skip => 'N',
287            Kind::Pad => 'P',
288            Kind::SequenceMatch => '=',
289            Kind::SequenceMismatch => 'X',
290        };
291        let _ = write!(s, "{}{kind_char}", op.len());
292        s
293    })
294}
295
296#[cfg(test)]
297mod tests {
298    use rand::SeedableRng;
299    use rand::rngs::SmallRng;
300
301    use super::*;
302    use crate::error_model::illumina::IlluminaErrorModel;
303
304    /// Build a simple fragment for testing.
305    fn test_fragment(bases: &[u8], ref_start: u32) -> Fragment {
306        #[expect(clippy::cast_possible_truncation, reason = "test data is small")]
307        let ref_positions: Vec<u32> = (ref_start..ref_start + bases.len() as u32).collect();
308        Fragment {
309            bases: bases.to_vec(),
310            ref_positions,
311            ref_start,
312            is_forward: true,
313            haplotype_index: 0,
314        }
315    }
316
317    // --- CIGAR generation tests ---
318
319    #[test]
320    fn test_cigar_all_match() {
321        let cigar = cigar_from_ref_positions(&[0, 1, 2, 3, 4], 0, false);
322        assert_eq!(cigar_to_string(&cigar), "5M");
323    }
324
325    #[test]
326    fn test_cigar_with_insertion() {
327        // Positions 0,1,2,2,2,3,4: two inserted bases at ref pos 2.
328        let cigar = cigar_from_ref_positions(&[0, 1, 2, 2, 2, 3, 4], 0, false);
329        assert_eq!(cigar_to_string(&cigar), "3M2I2M");
330    }
331
332    #[test]
333    fn test_cigar_with_deletion() {
334        // Gap from 2 to 5: 2 deleted ref bases.
335        let cigar = cigar_from_ref_positions(&[0, 1, 2, 5, 6], 0, false);
336        assert_eq!(cigar_to_string(&cigar), "3M2D2M");
337    }
338
339    #[test]
340    fn test_cigar_with_adapter_softclip_forward() {
341        // Forward-strand: adapter is a trailing soft-clip.
342        let cigar = cigar_from_ref_positions(&[0, 1, 2], 2, false);
343        assert_eq!(cigar_to_string(&cigar), "3M2S");
344    }
345
346    #[test]
347    fn test_cigar_with_adapter_softclip_negative_strand() {
348        // Negative-strand: adapter moves to a leading soft-clip after RC.
349        let cigar = cigar_from_ref_positions(&[0, 1, 2], 2, true);
350        assert_eq!(cigar_to_string(&cigar), "2S3M");
351    }
352
353    #[test]
354    fn test_cigar_all_adapter() {
355        // All-adapter read: placement doesn't matter, but it still round-trips.
356        let cigar = cigar_from_ref_positions(&[], 5, false);
357        assert_eq!(cigar_to_string(&cigar), "5S");
358    }
359
360    #[test]
361    fn test_cigar_with_insertion_and_deletion() {
362        // Insertion at pos 2 (two extra bases), then deletion of 2 ref bases.
363        let cigar = cigar_from_ref_positions(&[0, 1, 2, 2, 5, 6], 0, false);
364        assert_eq!(cigar_to_string(&cigar), "3M1I2D2M");
365    }
366
367    #[test]
368    fn test_cigar_with_adapter_and_deletion() {
369        let cigar = cigar_from_ref_positions(&[0, 1, 4, 5], 3, false);
370        assert_eq!(cigar_to_string(&cigar), "2M2D2M3S");
371    }
372
373    #[test]
374    fn test_cigar_single_base() {
375        let cigar = cigar_from_ref_positions(&[42], 0, false);
376        assert_eq!(cigar_to_string(&cigar), "1M");
377    }
378
379    #[test]
380    fn test_cigar_high_positions() {
381        // Negative-strand R2: positions start from a high offset (ascending),
382        // no adapter — CIGAR is identical to forward strand.
383        let cigar = cigar_from_ref_positions(&[100, 101, 102, 103, 104], 0, true);
384        assert_eq!(cigar_to_string(&cigar), "5M");
385    }
386
387    // --- Read pair generation tests ---
388
389    #[test]
390    fn test_generate_pe_read_pair() {
391        let fragment = test_fragment(b"ACGTACGTACGTACGTACGT", 100);
392        let model = IlluminaErrorModel::new(10, 0.0, 0.0);
393        let mut rng = SmallRng::seed_from_u64(42);
394
395        let pair = generate_read_pair(
396            &fragment, "chr1", 1, 10, true, b"ADAPTER", b"ADAPTER", &model, false, &mut rng,
397        );
398
399        assert_eq!(pair.read1.bases, b"ACGTACGTAC");
400        assert!(pair.read2.is_some());
401        assert_eq!(pair.r1_truth.position, 101);
402        assert!(pair.r2_truth.is_some());
403        assert_eq!(cigar_to_string(&pair.r1_cigar), "10M");
404        assert_eq!(cigar_to_string(pair.r2_cigar.as_ref().unwrap()), "10M");
405    }
406
407    #[test]
408    fn test_generate_se_read() {
409        let fragment = test_fragment(b"ACGTACGTAC", 100);
410        let model = IlluminaErrorModel::new(10, 0.0, 0.0);
411        let mut rng = SmallRng::seed_from_u64(42);
412
413        let pair = generate_read_pair(
414            &fragment, "chr1", 5, 10, false, b"ADAPTER", b"ADAPTER", &model, false, &mut rng,
415        );
416
417        assert!(pair.read2.is_none());
418        assert!(pair.r2_cigar.is_none());
419        assert_eq!(cigar_to_string(&pair.r1_cigar), "10M");
420    }
421
422    #[test]
423    fn test_adapter_cigar_softclip() {
424        // Forward-strand fragment: R1 is positive-strand (trailing S), R2 is
425        // negative-strand (leading S after BAM reverse-complement).
426        let fragment = test_fragment(b"AC", 0);
427        let model = IlluminaErrorModel::new(5, 0.0, 0.0);
428        let mut rng = SmallRng::seed_from_u64(42);
429
430        let pair = generate_read_pair(
431            &fragment, "chr1", 1, 5, true, b"TTTTT", b"GGGGG", &model, false, &mut rng,
432        );
433
434        assert_eq!(cigar_to_string(&pair.r1_cigar), "2M3S");
435        assert_eq!(cigar_to_string(pair.r2_cigar.as_ref().unwrap()), "3S2M");
436    }
437
438    #[test]
439    fn test_simple_name_mode() {
440        let fragment = test_fragment(b"ACGT", 0);
441        let model = IlluminaErrorModel::new(4, 0.0, 0.0);
442        let mut rng = SmallRng::seed_from_u64(42);
443
444        let pair =
445            generate_read_pair(&fragment, "chr1", 42, 4, true, b"A", b"A", &model, true, &mut rng);
446
447        assert_eq!(pair.read1.name, "holodeck:42");
448    }
449
450    #[test]
451    fn test_quality_scores_correct_length() {
452        let fragment = test_fragment(b"ACGTACGTAC", 0);
453        let model = IlluminaErrorModel::new(10, 0.001, 0.01);
454        let mut rng = SmallRng::seed_from_u64(42);
455
456        let pair =
457            generate_read_pair(&fragment, "chr1", 1, 10, true, b"A", b"A", &model, false, &mut rng);
458
459        assert_eq!(pair.read1.qualities.len(), 10);
460        assert_eq!(pair.read2.as_ref().unwrap().qualities.len(), 10);
461    }
462}