Skip to main content

holodeck_lib/commands/
mutate.rs

1use std::io::Write;
2use std::path::PathBuf;
3
4use anyhow::{Result, bail};
5use clap::Parser;
6use rand::rngs::SmallRng;
7use rand::{Rng, SeedableRng};
8use rand_distr::{Distribution, Geometric};
9
10use super::command::Command;
11use super::common::{BedOptions, ReferenceOptions, SeedOptions};
12use crate::bed::TargetRegions;
13use crate::fasta::Fasta;
14use crate::ploidy::PloidyMap;
15use crate::seed::{derive_seed, resolve_seed};
16
17/// DNA bases for random mutation generation.
18const BASES: [u8; 4] = [b'A', b'C', b'G', b'T'];
19
20/// Generate a VCF of random mutations from a reference genome.
21///
22/// Produces a standard VCF with proper GT fields that can be fed directly to
23/// `holodeck simulate`.  Supports independent control of SNP, indel, and MNP
24/// rates, with configurable ploidy including per-contig and per-region
25/// overrides for handling sex chromosomes and pseudo-autosomal regions.
26#[derive(Parser, Debug)]
27#[command(after_long_help = "EXAMPLES:\n  \
28    holodeck mutate -r ref.fa -o muts.vcf --snp-rate 0.001\n  \
29    holodeck mutate -r ref.fa -o muts.vcf --snp-rate 0.001 --ploidy 2 \
30    --ploidy-override chrX=1 --ploidy-override chrX:10001-2781479=2")]
31pub struct Mutate {
32    #[command(flatten)]
33    pub reference: ReferenceOptions,
34
35    #[command(flatten)]
36    pub bed: BedOptions,
37
38    #[command(flatten)]
39    pub seed: SeedOptions,
40
41    /// Output VCF file path.
42    #[arg(short = 'o', long, value_name = "VCF")]
43    pub output: PathBuf,
44
45    /// Rate of SNP mutations per base.
46    #[arg(long, default_value_t = 0.001, value_name = "FLOAT")]
47    pub snp_rate: f64,
48
49    /// Rate of indel mutations per base.
50    #[arg(long, default_value_t = 0.0001, value_name = "FLOAT")]
51    pub indel_rate: f64,
52
53    /// Rate of MNP (multi-nucleotide polymorphism) mutations per base.
54    #[arg(long, default_value_t = 0.00005, value_name = "FLOAT")]
55    pub mnp_rate: f64,
56
57    /// Parameter for the geometric distribution of indel lengths. Larger values
58    /// produce shorter indels on average.
59    #[arg(long, default_value_t = 0.7, value_name = "FLOAT")]
60    pub indel_length_param: f64,
61
62    /// Ratio of heterozygous to homozygous variants. A value of 2.0 means
63    /// twice as many het as hom variants.
64    #[arg(long, default_value_t = 2.0, value_name = "FLOAT")]
65    pub het_hom_ratio: f64,
66
67    /// Default ploidy for all contigs.
68    #[arg(long, default_value_t = 2, value_name = "INT")]
69    pub ploidy: u8,
70
71    /// Per-contig or per-region ploidy override. Format: `CONTIG=PLOIDY` or
72    /// `CONTIG:START-END=PLOIDY` (1-based coordinates). Later values override
73    /// earlier ones for the same positions (last-writer-wins).
74    ///
75    /// Example: `--ploidy-override chrX=1 --ploidy-override chrX:10001-2781479=2`
76    #[arg(long, value_name = "SPEC")]
77    pub ploidy_override: Vec<String>,
78}
79
80impl Command for Mutate {
81    fn execute(&self) -> Result<()> {
82        self.validate()?;
83        self.run_mutation()
84    }
85}
86
87impl Mutate {
88    /// Validate command-line arguments.
89    fn validate(&self) -> Result<()> {
90        if self.snp_rate < 0.0 || self.indel_rate < 0.0 || self.mnp_rate < 0.0 {
91            bail!("Individual mutation rates must be >= 0");
92        }
93        let total_rate = self.snp_rate + self.indel_rate + self.mnp_rate;
94        if !(0.0..=1.0).contains(&total_rate) {
95            bail!("Combined mutation rate must be in [0, 1], got {total_rate}");
96        }
97        if self.indel_length_param <= 0.0 || self.indel_length_param >= 1.0 {
98            bail!("--indel-length-param must be in (0, 1)");
99        }
100        if self.het_hom_ratio < 0.0 {
101            bail!("--het-hom-ratio must be >= 0");
102        }
103        if self.ploidy == 0 {
104            bail!("--ploidy must be >= 1");
105        }
106        Ok(())
107    }
108
109    /// Run the mutation generation pipeline.
110    fn run_mutation(&self) -> Result<()> {
111        let seed_desc = format!(
112            "mutate:{}:{}:{}:{}:{}",
113            self.reference.reference.display(),
114            self.snp_rate,
115            self.indel_rate,
116            self.mnp_rate,
117            self.ploidy,
118        );
119        let seed = resolve_seed(self.seed.seed, &seed_desc);
120        let mut rng = SmallRng::seed_from_u64(seed);
121        log::info!("Using random seed: {seed}");
122
123        let mut fasta = Fasta::from_path(&self.reference.reference)?;
124        let dict = fasta.dict().clone();
125        log::info!(
126            "Loaded reference with {} contigs, total {} bp",
127            dict.len(),
128            dict.total_length()
129        );
130
131        let targets = match &self.bed.targets {
132            Some(bed_path) => {
133                let t = TargetRegions::from_path(bed_path, &dict)?;
134                log::info!(
135                    "Restricting mutations to {} bp of target territory",
136                    t.total_territory()
137                );
138                Some(t)
139            }
140            None => None,
141        };
142
143        let ploidy_map = PloidyMap::new(self.ploidy, &self.ploidy_override)?;
144        let indel_dist = Geometric::new(self.indel_length_param)
145            .map_err(|e| anyhow::anyhow!("Invalid indel length distribution: {e}"))?;
146
147        // Open output VCF.
148        let mut vcf_out = std::fs::File::create(&self.output)?;
149        Self::write_vcf_header(&mut vcf_out, &dict)?;
150
151        let total_rate = self.snp_rate + self.indel_rate + self.mnp_rate;
152        let mut total_variants = 0u64;
153        let contig_names: Vec<String> = dict.names().into_iter().map(String::from).collect();
154
155        for contig_name in &contig_names {
156            // Use a per-contig RNG for reference normalization so ambiguity
157            // resolution is reproducible and independent of the mutation RNG.
158            let contig_seed = derive_seed(seed, contig_name);
159            let mut ref_rng = SmallRng::seed_from_u64(contig_seed);
160            let reference = fasta.load_contig(contig_name, &mut ref_rng)?;
161            let contig_idx = dict.get_by_name(contig_name).unwrap().index();
162
163            let mut pos = 0u32;
164            #[expect(clippy::cast_possible_truncation, reason = "contig length fits u32")]
165            let contig_len = reference.len() as u32;
166
167            while pos < contig_len {
168                // Skip non-ACGT bases (Ns).
169                if !BASES.contains(&reference[pos as usize]) {
170                    pos += 1;
171                    continue;
172                }
173
174                // Check if this position is within targets (if specified).
175                if let Some(tgt) = &targets
176                    && !tgt.overlaps(contig_idx, pos, pos + 1)
177                {
178                    pos += 1;
179                    continue;
180                }
181
182                // Decide if a mutation occurs at this position.
183                if rng.random::<f64>() >= total_rate {
184                    pos += 1;
185                    continue;
186                }
187
188                // Determine mutation type by drawing a second random value
189                // and partitioning [0, total_rate) into SNP/indel/MNP ranges.
190                let ploidy = ploidy_map.ploidy_at(contig_name, pos);
191                let type_roll: f64 = rng.random::<f64>() * total_rate;
192                let (ref_allele, alt_allele, advance) = if type_roll < self.snp_rate {
193                    generate_snp(reference[pos as usize], &mut rng)
194                } else if type_roll < self.snp_rate + self.indel_rate {
195                    generate_indel(&reference, pos, contig_len, &indel_dist, &mut rng)
196                } else {
197                    // MNP requires at least 2 bases remaining.
198                    if pos + 2 > contig_len {
199                        pos += 1;
200                        continue;
201                    }
202                    generate_mnp(&reference, pos, contig_len, &mut rng)
203                };
204
205                // The anchor-position check above only covers position `pos`.
206                // Multi-base REF alleles (MNPs and deletions) may still span
207                // an ambiguity-resolved lowercase byte at `pos+1` or later,
208                // which would emit a noncanonical REF into the VCF — skip
209                // those variants.
210                if !ref_allele.iter().all(|b| BASES.contains(b)) {
211                    pos += 1;
212                    continue;
213                }
214
215                // Assign genotype (het vs hom).
216                let gt = generate_genotype(ploidy, self.het_hom_ratio, &mut rng);
217
218                // Write VCF record (1-based position).
219                writeln!(
220                    vcf_out,
221                    "{contig_name}\t{}\t.\t{}\t{}\t100\tPASS\t.\tGT\t{gt}",
222                    pos + 1,
223                    String::from_utf8_lossy(&ref_allele),
224                    String::from_utf8_lossy(&alt_allele),
225                )?;
226
227                total_variants += 1;
228                pos += advance;
229            }
230        }
231
232        log::info!("Generated {total_variants} variants");
233        Ok(())
234    }
235
236    /// Write the VCF header to the output file.
237    fn write_vcf_header(
238        out: &mut std::fs::File,
239        dict: &crate::sequence_dict::SequenceDictionary,
240    ) -> Result<()> {
241        writeln!(out, "##fileformat=VCFv4.3")?;
242        writeln!(out, "##source=holodeck-mutate")?;
243        for meta in dict.iter() {
244            writeln!(out, "##contig=<ID={},length={}>", meta.name(), meta.length())?;
245        }
246        writeln!(out, "##FORMAT=<ID=GT,Number=1,Type=String,Description=\"Genotype\">")?;
247        writeln!(out, "#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT\tSAMPLE")?;
248        Ok(())
249    }
250}
251
252/// Generate a SNP: single base substitution.
253/// Returns `(ref_allele, alt_allele, advance)`.
254fn generate_snp(ref_base: u8, rng: &mut impl Rng) -> (Vec<u8>, Vec<u8>, u32) {
255    let alt = loop {
256        let candidate = BASES[rng.random_range(0..4)];
257        if candidate != ref_base {
258            break candidate;
259        }
260    };
261    (vec![ref_base], vec![alt], 1)
262}
263
264/// Generate an indel (insertion or deletion).
265/// Returns `(ref_allele, alt_allele, advance)`.
266fn generate_indel(
267    reference: &[u8],
268    pos: u32,
269    contig_len: u32,
270    indel_dist: &Geometric,
271    rng: &mut impl Rng,
272) -> (Vec<u8>, Vec<u8>, u32) {
273    // Draw indel length from geometric distribution (minimum 1).
274    #[expect(
275        clippy::cast_possible_truncation,
276        reason = "geometric distribution produces small values"
277    )]
278    let length = (indel_dist.sample(rng) as u32).max(1);
279    let is_insertion: bool = rng.random();
280    let anchor = reference[pos as usize];
281
282    if is_insertion {
283        // Insertion: REF = anchor base, ALT = anchor + random bases.
284        let mut alt = vec![anchor];
285        for _ in 0..length {
286            alt.push(BASES[rng.random_range(0..4)]);
287        }
288        (vec![anchor], alt, 1)
289    } else {
290        // Deletion: REF = anchor + deleted bases, ALT = anchor.
291        // Need at least 1 base after the anchor to delete.
292        let available = contig_len.saturating_sub(pos + 1);
293        if available == 0 {
294            // Fall back to insertion at this position.
295            let mut alt = vec![anchor];
296            for _ in 0..length {
297                alt.push(BASES[rng.random_range(0..4)]);
298            }
299            return (vec![anchor], alt, 1);
300        }
301        let del_len = length.min(available);
302        let del_end = pos + 1 + del_len;
303        let ref_allele: Vec<u8> = reference[pos as usize..del_end as usize].to_vec();
304        #[expect(clippy::cast_possible_truncation, reason = "allele length fits u32")]
305        let advance = ref_allele.len() as u32;
306        (ref_allele, vec![anchor], advance)
307    }
308}
309
310/// Generate an MNP (multi-nucleotide polymorphism, 2-3 bases).
311/// Returns `(ref_allele, alt_allele, advance)`.
312fn generate_mnp(
313    reference: &[u8],
314    pos: u32,
315    contig_len: u32,
316    rng: &mut impl Rng,
317) -> (Vec<u8>, Vec<u8>, u32) {
318    let length = rng.random_range(2..=3u32).min(contig_len - pos);
319    let ref_allele: Vec<u8> = reference[pos as usize..(pos + length) as usize].to_vec();
320
321    // Generate alt allele that differs in at least one position.
322    let mut alt_allele = ref_allele.clone();
323    for base in &mut alt_allele {
324        if rng.random::<f64>() < 0.8 {
325            // Mutate this position.
326            let original = *base;
327            *base = loop {
328                let candidate = BASES[rng.random_range(0..4)];
329                if candidate != original {
330                    break candidate;
331                }
332            };
333        }
334    }
335    // Ensure at least one base differs.
336    if alt_allele == ref_allele {
337        let idx = rng.random_range(0..alt_allele.len());
338        let original = alt_allele[idx];
339        alt_allele[idx] = loop {
340            let candidate = BASES[rng.random_range(0..4)];
341            if candidate != original {
342                break candidate;
343            }
344        };
345    }
346
347    (ref_allele, alt_allele, length)
348}
349
350/// Generate a genotype string for the given ploidy and het/hom ratio.
351///
352/// The het/hom ratio controls the probability of heterozygous vs homozygous
353/// genotypes. For a ratio of 2.0, ~67% of variants will be het and ~33% hom.
354fn generate_genotype(ploidy: u8, het_hom_ratio: f64, rng: &mut impl Rng) -> String {
355    let p_het = het_hom_ratio / (1.0 + het_hom_ratio);
356    let is_het = ploidy > 1 && rng.random::<f64>() < p_het;
357
358    if ploidy == 1 {
359        // Haploid: always the alt allele.
360        "1".to_string()
361    } else if is_het {
362        // Heterozygous: randomly choose which haplotype(s) get the alt.
363        let mut alleles: Vec<&str> = vec!["0"; ploidy as usize];
364        let alt_hap = rng.random_range(0..ploidy as usize);
365        alleles[alt_hap] = "1";
366        alleles.join("/")
367    } else {
368        // Homozygous alt: all haplotypes get the alt.
369        vec!["1"; ploidy as usize].join("/")
370    }
371}
372
373#[cfg(test)]
374mod tests {
375    use super::*;
376
377    #[test]
378    fn test_generate_snp() {
379        let mut rng = rand::rng();
380        for _ in 0..100 {
381            let (ref_a, alt_a, advance) = generate_snp(b'A', &mut rng);
382            assert_eq!(ref_a, vec![b'A']);
383            assert_eq!(alt_a.len(), 1);
384            assert_ne!(alt_a[0], b'A');
385            assert_eq!(advance, 1);
386        }
387    }
388
389    #[test]
390    fn test_generate_indel_insertion() {
391        let reference = b"ACGTACGT";
392        let indel_dist = Geometric::new(0.7).unwrap();
393        let mut rng = SmallRng::seed_from_u64(42);
394
395        let mut found_insertion = false;
396        for _ in 0..100 {
397            let (ref_a, alt_a, advance) = generate_indel(reference, 2, 8, &indel_dist, &mut rng);
398            if alt_a.len() > ref_a.len() {
399                found_insertion = true;
400                assert_eq!(ref_a.len(), 1); // Anchor base only
401                assert!(alt_a.len() >= 2); // Anchor + at least 1 inserted base
402                assert_eq!(ref_a[0], alt_a[0]); // Same anchor
403                assert_eq!(advance, 1);
404            }
405        }
406        assert!(found_insertion, "Should have generated at least one insertion");
407    }
408
409    #[test]
410    fn test_generate_indel_deletion() {
411        let reference = b"ACGTACGT";
412        let indel_dist = Geometric::new(0.7).unwrap();
413        let mut rng = SmallRng::seed_from_u64(99);
414
415        let mut found_deletion = false;
416        for _ in 0..100 {
417            let (ref_a, alt_a, _advance) = generate_indel(reference, 2, 8, &indel_dist, &mut rng);
418            if ref_a.len() > alt_a.len() {
419                found_deletion = true;
420                assert_eq!(alt_a.len(), 1); // Anchor base only
421                assert!(ref_a.len() >= 2); // Anchor + at least 1 deleted base
422                assert_eq!(ref_a[0], alt_a[0]); // Same anchor
423            }
424        }
425        assert!(found_deletion, "Should have generated at least one deletion");
426    }
427
428    #[test]
429    fn test_generate_mnp() {
430        let reference = b"ACGTACGT";
431        let mut rng = rand::rng();
432        for _ in 0..100 {
433            let (ref_a, alt_a, advance) = generate_mnp(reference, 1, 8, &mut rng);
434            assert!(ref_a.len() >= 2 && ref_a.len() <= 3);
435            assert_eq!(ref_a.len(), alt_a.len());
436            assert_ne!(ref_a, alt_a);
437            assert_eq!(advance as usize, ref_a.len());
438        }
439    }
440
441    #[test]
442    fn test_generate_genotype_haploid() {
443        let mut rng = rand::rng();
444        let gt = generate_genotype(1, 2.0, &mut rng);
445        assert_eq!(gt, "1");
446    }
447
448    #[test]
449    fn test_generate_genotype_diploid_het() {
450        let mut rng = rand::rng();
451        let mut het_count = 0;
452        let mut hom_count = 0;
453        for _ in 0..1000 {
454            let gt = generate_genotype(2, 2.0, &mut rng);
455            if gt == "0/1" || gt == "1/0" {
456                het_count += 1;
457            } else if gt == "1/1" {
458                hom_count += 1;
459            }
460        }
461        // With het_hom_ratio=2.0, expect ~67% het, ~33% hom.
462        assert!(het_count > 500, "expected majority het, got {het_count}");
463        assert!(hom_count > 200, "expected some hom, got {hom_count}");
464    }
465
466    #[test]
467    fn test_generate_genotype_triploid() {
468        let mut rng = rand::rng();
469        let gt = generate_genotype(3, 0.0, &mut rng);
470        // hom-alt for triploid.
471        assert_eq!(gt, "1/1/1");
472    }
473}