holodeck 0.2.0

Modern NGS read simulator
Documentation
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
use std::io::Write;
use std::path::PathBuf;

use anyhow::{Result, bail};
use clap::Parser;
use rand::rngs::SmallRng;
use rand::{Rng, SeedableRng};
use rand_distr::{Distribution, Geometric};

use super::command::Command;
use super::common::{BedOptions, ReferenceOptions, SeedOptions};
use crate::bed::TargetRegions;
use crate::fasta::Fasta;
use crate::ploidy::PloidyMap;
use crate::seed::resolve_seed;

/// DNA bases for random mutation generation.
const BASES: [u8; 4] = [b'A', b'C', b'G', b'T'];

/// Generate a VCF of random mutations from a reference genome.
///
/// Produces a standard VCF with proper GT fields that can be fed directly to
/// `holodeck simulate`.  Supports independent control of SNP, indel, and MNP
/// rates, with configurable ploidy including per-contig and per-region
/// overrides for handling sex chromosomes and pseudo-autosomal regions.
#[derive(Parser, Debug)]
#[command(after_long_help = "EXAMPLES:\n  \
    holodeck mutate -r ref.fa -o muts.vcf --snp-rate 0.001\n  \
    holodeck mutate -r ref.fa -o muts.vcf --snp-rate 0.001 --ploidy 2 \
    --ploidy-override chrX=1 --ploidy-override chrX:10001-2781479=2")]
pub struct Mutate {
    #[command(flatten)]
    pub reference: ReferenceOptions,

    #[command(flatten)]
    pub bed: BedOptions,

    #[command(flatten)]
    pub seed: SeedOptions,

    /// Output VCF file path.
    #[arg(short = 'o', long, value_name = "VCF")]
    pub output: PathBuf,

    /// Rate of SNP mutations per base.
    #[arg(long, default_value_t = 0.001, value_name = "FLOAT")]
    pub snp_rate: f64,

    /// Rate of indel mutations per base.
    #[arg(long, default_value_t = 0.0001, value_name = "FLOAT")]
    pub indel_rate: f64,

    /// Rate of MNP (multi-nucleotide polymorphism) mutations per base.
    #[arg(long, default_value_t = 0.00005, value_name = "FLOAT")]
    pub mnp_rate: f64,

    /// Parameter for the geometric distribution of indel lengths. Larger values
    /// produce shorter indels on average.
    #[arg(long, default_value_t = 0.7, value_name = "FLOAT")]
    pub indel_length_param: f64,

    /// Ratio of heterozygous to homozygous variants. A value of 2.0 means
    /// twice as many het as hom variants.
    #[arg(long, default_value_t = 2.0, value_name = "FLOAT")]
    pub het_hom_ratio: f64,

    /// Default ploidy for all contigs.
    #[arg(long, default_value_t = 2, value_name = "INT")]
    pub ploidy: u8,

    /// Per-contig or per-region ploidy override. Format: `CONTIG=PLOIDY` or
    /// `CONTIG:START-END=PLOIDY` (1-based coordinates). Later values override
    /// earlier ones for the same positions (last-writer-wins).
    ///
    /// Example: `--ploidy-override chrX=1 --ploidy-override chrX:10001-2781479=2`
    #[arg(long, value_name = "SPEC")]
    pub ploidy_override: Vec<String>,
}

impl Command for Mutate {
    fn execute(&self) -> Result<()> {
        self.validate()?;
        self.run_mutation()
    }
}

impl Mutate {
    /// Validate command-line arguments.
    fn validate(&self) -> Result<()> {
        if self.snp_rate < 0.0 || self.indel_rate < 0.0 || self.mnp_rate < 0.0 {
            bail!("Individual mutation rates must be >= 0");
        }
        let total_rate = self.snp_rate + self.indel_rate + self.mnp_rate;
        if !(0.0..=1.0).contains(&total_rate) {
            bail!("Combined mutation rate must be in [0, 1], got {total_rate}");
        }
        if self.indel_length_param <= 0.0 || self.indel_length_param >= 1.0 {
            bail!("--indel-length-param must be in (0, 1)");
        }
        if self.het_hom_ratio < 0.0 {
            bail!("--het-hom-ratio must be >= 0");
        }
        if self.ploidy == 0 {
            bail!("--ploidy must be >= 1");
        }
        Ok(())
    }

    /// Run the mutation generation pipeline.
    fn run_mutation(&self) -> Result<()> {
        let seed_desc = format!(
            "mutate:{}:{}:{}:{}:{}",
            self.reference.reference.display(),
            self.snp_rate,
            self.indel_rate,
            self.mnp_rate,
            self.ploidy,
        );
        let seed = resolve_seed(self.seed.seed, &seed_desc);
        let mut rng = SmallRng::seed_from_u64(seed);
        log::info!("Using random seed: {seed}");

        let mut fasta = Fasta::from_path(&self.reference.reference)?;
        let dict = fasta.dict().clone();
        log::info!(
            "Loaded reference with {} contigs, total {} bp",
            dict.len(),
            dict.total_length()
        );

        let targets = match &self.bed.targets {
            Some(bed_path) => {
                let t = TargetRegions::from_path(bed_path, &dict)?;
                log::info!(
                    "Restricting mutations to {} bp of target territory",
                    t.total_territory()
                );
                Some(t)
            }
            None => None,
        };

        let ploidy_map = PloidyMap::new(self.ploidy, &self.ploidy_override)?;
        let indel_dist = Geometric::new(self.indel_length_param)
            .map_err(|e| anyhow::anyhow!("Invalid indel length distribution: {e}"))?;

        // Open output VCF.
        let mut vcf_out = std::fs::File::create(&self.output)?;
        Self::write_vcf_header(&mut vcf_out, &dict)?;

        let total_rate = self.snp_rate + self.indel_rate + self.mnp_rate;
        let mut total_variants = 0u64;
        let contig_names: Vec<String> = dict.names().into_iter().map(String::from).collect();

        for contig_name in &contig_names {
            let reference = fasta.load_contig(contig_name)?;
            let contig_idx = dict.get_by_name(contig_name).unwrap().index();

            let mut pos = 0u32;
            #[expect(clippy::cast_possible_truncation, reason = "contig length fits u32")]
            let contig_len = reference.len() as u32;

            while pos < contig_len {
                // Skip non-ACGT bases (Ns).
                if !BASES.contains(&reference[pos as usize]) {
                    pos += 1;
                    continue;
                }

                // Check if this position is within targets (if specified).
                if let Some(tgt) = &targets
                    && !tgt.overlaps(contig_idx, pos, pos + 1)
                {
                    pos += 1;
                    continue;
                }

                // Decide if a mutation occurs at this position.
                if rng.random::<f64>() >= total_rate {
                    pos += 1;
                    continue;
                }

                // Determine mutation type by drawing a second random value
                // and partitioning [0, total_rate) into SNP/indel/MNP ranges.
                let ploidy = ploidy_map.ploidy_at(contig_name, pos);
                let type_roll: f64 = rng.random::<f64>() * total_rate;
                let (ref_allele, alt_allele, advance) = if type_roll < self.snp_rate {
                    generate_snp(reference[pos as usize], &mut rng)
                } else if type_roll < self.snp_rate + self.indel_rate {
                    generate_indel(&reference, pos, contig_len, &indel_dist, &mut rng)
                } else {
                    // MNP requires at least 2 bases remaining.
                    if pos + 2 > contig_len {
                        pos += 1;
                        continue;
                    }
                    generate_mnp(&reference, pos, contig_len, &mut rng)
                };

                // Assign genotype (het vs hom).
                let gt = generate_genotype(ploidy, self.het_hom_ratio, &mut rng);

                // Write VCF record (1-based position).
                writeln!(
                    vcf_out,
                    "{contig_name}\t{}\t.\t{}\t{}\t100\tPASS\t.\tGT\t{gt}",
                    pos + 1,
                    String::from_utf8_lossy(&ref_allele),
                    String::from_utf8_lossy(&alt_allele),
                )?;

                total_variants += 1;
                pos += advance;
            }
        }

        log::info!("Generated {total_variants} variants");
        Ok(())
    }

    /// Write the VCF header to the output file.
    fn write_vcf_header(
        out: &mut std::fs::File,
        dict: &crate::sequence_dict::SequenceDictionary,
    ) -> Result<()> {
        writeln!(out, "##fileformat=VCFv4.3")?;
        writeln!(out, "##source=holodeck-mutate")?;
        for meta in dict.iter() {
            writeln!(out, "##contig=<ID={},length={}>", meta.name(), meta.length())?;
        }
        writeln!(out, "##FORMAT=<ID=GT,Number=1,Type=String,Description=\"Genotype\">")?;
        writeln!(out, "#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT\tSAMPLE")?;
        Ok(())
    }
}

/// Generate a SNP: single base substitution.
/// Returns `(ref_allele, alt_allele, advance)`.
fn generate_snp(ref_base: u8, rng: &mut impl Rng) -> (Vec<u8>, Vec<u8>, u32) {
    let alt = loop {
        let candidate = BASES[rng.random_range(0..4)];
        if candidate != ref_base {
            break candidate;
        }
    };
    (vec![ref_base], vec![alt], 1)
}

/// Generate an indel (insertion or deletion).
/// Returns `(ref_allele, alt_allele, advance)`.
fn generate_indel(
    reference: &[u8],
    pos: u32,
    contig_len: u32,
    indel_dist: &Geometric,
    rng: &mut impl Rng,
) -> (Vec<u8>, Vec<u8>, u32) {
    // Draw indel length from geometric distribution (minimum 1).
    #[expect(
        clippy::cast_possible_truncation,
        reason = "geometric distribution produces small values"
    )]
    let length = (indel_dist.sample(rng) as u32).max(1);
    let is_insertion: bool = rng.random();
    let anchor = reference[pos as usize];

    if is_insertion {
        // Insertion: REF = anchor base, ALT = anchor + random bases.
        let mut alt = vec![anchor];
        for _ in 0..length {
            alt.push(BASES[rng.random_range(0..4)]);
        }
        (vec![anchor], alt, 1)
    } else {
        // Deletion: REF = anchor + deleted bases, ALT = anchor.
        // Need at least 1 base after the anchor to delete.
        let available = contig_len.saturating_sub(pos + 1);
        if available == 0 {
            // Fall back to insertion at this position.
            let mut alt = vec![anchor];
            for _ in 0..length {
                alt.push(BASES[rng.random_range(0..4)]);
            }
            return (vec![anchor], alt, 1);
        }
        let del_len = length.min(available);
        let del_end = pos + 1 + del_len;
        let ref_allele: Vec<u8> = reference[pos as usize..del_end as usize].to_vec();
        #[expect(clippy::cast_possible_truncation, reason = "allele length fits u32")]
        let advance = ref_allele.len() as u32;
        (ref_allele, vec![anchor], advance)
    }
}

/// Generate an MNP (multi-nucleotide polymorphism, 2-3 bases).
/// Returns `(ref_allele, alt_allele, advance)`.
fn generate_mnp(
    reference: &[u8],
    pos: u32,
    contig_len: u32,
    rng: &mut impl Rng,
) -> (Vec<u8>, Vec<u8>, u32) {
    let length = rng.random_range(2..=3u32).min(contig_len - pos);
    let ref_allele: Vec<u8> = reference[pos as usize..(pos + length) as usize].to_vec();

    // Generate alt allele that differs in at least one position.
    let mut alt_allele = ref_allele.clone();
    for base in &mut alt_allele {
        if rng.random::<f64>() < 0.8 {
            // Mutate this position.
            let original = *base;
            *base = loop {
                let candidate = BASES[rng.random_range(0..4)];
                if candidate != original {
                    break candidate;
                }
            };
        }
    }
    // Ensure at least one base differs.
    if alt_allele == ref_allele {
        let idx = rng.random_range(0..alt_allele.len());
        let original = alt_allele[idx];
        alt_allele[idx] = loop {
            let candidate = BASES[rng.random_range(0..4)];
            if candidate != original {
                break candidate;
            }
        };
    }

    (ref_allele, alt_allele, length)
}

/// Generate a genotype string for the given ploidy and het/hom ratio.
///
/// The het/hom ratio controls the probability of heterozygous vs homozygous
/// genotypes. For a ratio of 2.0, ~67% of variants will be het and ~33% hom.
fn generate_genotype(ploidy: u8, het_hom_ratio: f64, rng: &mut impl Rng) -> String {
    let p_het = het_hom_ratio / (1.0 + het_hom_ratio);
    let is_het = ploidy > 1 && rng.random::<f64>() < p_het;

    if ploidy == 1 {
        // Haploid: always the alt allele.
        "1".to_string()
    } else if is_het {
        // Heterozygous: randomly choose which haplotype(s) get the alt.
        let mut alleles: Vec<&str> = vec!["0"; ploidy as usize];
        let alt_hap = rng.random_range(0..ploidy as usize);
        alleles[alt_hap] = "1";
        alleles.join("/")
    } else {
        // Homozygous alt: all haplotypes get the alt.
        vec!["1"; ploidy as usize].join("/")
    }
}

#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn test_generate_snp() {
        let mut rng = rand::rng();
        for _ in 0..100 {
            let (ref_a, alt_a, advance) = generate_snp(b'A', &mut rng);
            assert_eq!(ref_a, vec![b'A']);
            assert_eq!(alt_a.len(), 1);
            assert_ne!(alt_a[0], b'A');
            assert_eq!(advance, 1);
        }
    }

    #[test]
    fn test_generate_indel_insertion() {
        let reference = b"ACGTACGT";
        let indel_dist = Geometric::new(0.7).unwrap();
        let mut rng = SmallRng::seed_from_u64(42);

        let mut found_insertion = false;
        for _ in 0..100 {
            let (ref_a, alt_a, advance) = generate_indel(reference, 2, 8, &indel_dist, &mut rng);
            if alt_a.len() > ref_a.len() {
                found_insertion = true;
                assert_eq!(ref_a.len(), 1); // Anchor base only
                assert!(alt_a.len() >= 2); // Anchor + at least 1 inserted base
                assert_eq!(ref_a[0], alt_a[0]); // Same anchor
                assert_eq!(advance, 1);
            }
        }
        assert!(found_insertion, "Should have generated at least one insertion");
    }

    #[test]
    fn test_generate_indel_deletion() {
        let reference = b"ACGTACGT";
        let indel_dist = Geometric::new(0.7).unwrap();
        let mut rng = SmallRng::seed_from_u64(99);

        let mut found_deletion = false;
        for _ in 0..100 {
            let (ref_a, alt_a, _advance) = generate_indel(reference, 2, 8, &indel_dist, &mut rng);
            if ref_a.len() > alt_a.len() {
                found_deletion = true;
                assert_eq!(alt_a.len(), 1); // Anchor base only
                assert!(ref_a.len() >= 2); // Anchor + at least 1 deleted base
                assert_eq!(ref_a[0], alt_a[0]); // Same anchor
            }
        }
        assert!(found_deletion, "Should have generated at least one deletion");
    }

    #[test]
    fn test_generate_mnp() {
        let reference = b"ACGTACGT";
        let mut rng = rand::rng();
        for _ in 0..100 {
            let (ref_a, alt_a, advance) = generate_mnp(reference, 1, 8, &mut rng);
            assert!(ref_a.len() >= 2 && ref_a.len() <= 3);
            assert_eq!(ref_a.len(), alt_a.len());
            assert_ne!(ref_a, alt_a);
            assert_eq!(advance as usize, ref_a.len());
        }
    }

    #[test]
    fn test_generate_genotype_haploid() {
        let mut rng = rand::rng();
        let gt = generate_genotype(1, 2.0, &mut rng);
        assert_eq!(gt, "1");
    }

    #[test]
    fn test_generate_genotype_diploid_het() {
        let mut rng = rand::rng();
        let mut het_count = 0;
        let mut hom_count = 0;
        for _ in 0..1000 {
            let gt = generate_genotype(2, 2.0, &mut rng);
            if gt == "0/1" || gt == "1/0" {
                het_count += 1;
            } else if gt == "1/1" {
                hom_count += 1;
            }
        }
        // With het_hom_ratio=2.0, expect ~67% het, ~33% hom.
        assert!(het_count > 500, "expected majority het, got {het_count}");
        assert!(hom_count > 200, "expected some hom, got {hom_count}");
    }

    #[test]
    fn test_generate_genotype_triploid() {
        let mut rng = rand::rng();
        let gt = generate_genotype(3, 0.0, &mut rng);
        // hom-alt for triploid.
        assert_eq!(gt, "1/1/1");
    }
}