Skip to main content

holodeck_lib/commands/
mutate.rs

1use std::io::Write;
2use std::path::PathBuf;
3
4use anyhow::{Result, bail};
5use clap::Parser;
6use rand::rngs::SmallRng;
7use rand::{Rng, SeedableRng};
8use rand_distr::{Distribution, Geometric};
9
10use super::command::Command;
11use super::common::{BedOptions, ReferenceOptions, SeedOptions};
12use crate::bed::TargetRegions;
13use crate::fasta::Fasta;
14use crate::ploidy::PloidyMap;
15use crate::seed::resolve_seed;
16
17/// DNA bases for random mutation generation.
18const BASES: [u8; 4] = [b'A', b'C', b'G', b'T'];
19
20/// Generate a VCF of random mutations from a reference genome.
21///
22/// Produces a standard VCF with proper GT fields that can be fed directly to
23/// `holodeck simulate`.  Supports independent control of SNP, indel, and MNP
24/// rates, with configurable ploidy including per-contig and per-region
25/// overrides for handling sex chromosomes and pseudo-autosomal regions.
26#[derive(Parser, Debug)]
27#[command(after_long_help = "EXAMPLES:\n  \
28    holodeck mutate -r ref.fa -o muts.vcf --snp-rate 0.001\n  \
29    holodeck mutate -r ref.fa -o muts.vcf --snp-rate 0.001 --ploidy 2 \
30    --ploidy-override chrX=1 --ploidy-override chrX:10001-2781479=2")]
31pub struct Mutate {
32    #[command(flatten)]
33    pub reference: ReferenceOptions,
34
35    #[command(flatten)]
36    pub bed: BedOptions,
37
38    #[command(flatten)]
39    pub seed: SeedOptions,
40
41    /// Output VCF file path.
42    #[arg(short = 'o', long, value_name = "VCF")]
43    pub output: PathBuf,
44
45    /// Rate of SNP mutations per base.
46    #[arg(long, default_value_t = 0.001, value_name = "FLOAT")]
47    pub snp_rate: f64,
48
49    /// Rate of indel mutations per base.
50    #[arg(long, default_value_t = 0.0001, value_name = "FLOAT")]
51    pub indel_rate: f64,
52
53    /// Rate of MNP (multi-nucleotide polymorphism) mutations per base.
54    #[arg(long, default_value_t = 0.00005, value_name = "FLOAT")]
55    pub mnp_rate: f64,
56
57    /// Parameter for the geometric distribution of indel lengths. Larger values
58    /// produce shorter indels on average.
59    #[arg(long, default_value_t = 0.7, value_name = "FLOAT")]
60    pub indel_length_param: f64,
61
62    /// Ratio of heterozygous to homozygous variants. A value of 2.0 means
63    /// twice as many het as hom variants.
64    #[arg(long, default_value_t = 2.0, value_name = "FLOAT")]
65    pub het_hom_ratio: f64,
66
67    /// Default ploidy for all contigs.
68    #[arg(long, default_value_t = 2, value_name = "INT")]
69    pub ploidy: u8,
70
71    /// Per-contig or per-region ploidy override. Format: `CONTIG=PLOIDY` or
72    /// `CONTIG:START-END=PLOIDY` (1-based coordinates). Later values override
73    /// earlier ones for the same positions (last-writer-wins).
74    ///
75    /// Example: `--ploidy-override chrX=1 --ploidy-override chrX:10001-2781479=2`
76    #[arg(long, value_name = "SPEC")]
77    pub ploidy_override: Vec<String>,
78}
79
80impl Command for Mutate {
81    fn execute(&self) -> Result<()> {
82        self.validate()?;
83        self.run_mutation()
84    }
85}
86
87impl Mutate {
88    /// Validate command-line arguments.
89    fn validate(&self) -> Result<()> {
90        if self.snp_rate < 0.0 || self.indel_rate < 0.0 || self.mnp_rate < 0.0 {
91            bail!("Individual mutation rates must be >= 0");
92        }
93        let total_rate = self.snp_rate + self.indel_rate + self.mnp_rate;
94        if !(0.0..=1.0).contains(&total_rate) {
95            bail!("Combined mutation rate must be in [0, 1], got {total_rate}");
96        }
97        if self.indel_length_param <= 0.0 || self.indel_length_param >= 1.0 {
98            bail!("--indel-length-param must be in (0, 1)");
99        }
100        if self.het_hom_ratio < 0.0 {
101            bail!("--het-hom-ratio must be >= 0");
102        }
103        if self.ploidy == 0 {
104            bail!("--ploidy must be >= 1");
105        }
106        Ok(())
107    }
108
109    /// Run the mutation generation pipeline.
110    fn run_mutation(&self) -> Result<()> {
111        let seed_desc = format!(
112            "mutate:{}:{}:{}:{}:{}",
113            self.reference.reference.display(),
114            self.snp_rate,
115            self.indel_rate,
116            self.mnp_rate,
117            self.ploidy,
118        );
119        let seed = resolve_seed(self.seed.seed, &seed_desc);
120        let mut rng = SmallRng::seed_from_u64(seed);
121        log::info!("Using random seed: {seed}");
122
123        let mut fasta = Fasta::from_path(&self.reference.reference)?;
124        let dict = fasta.dict().clone();
125        log::info!(
126            "Loaded reference with {} contigs, total {} bp",
127            dict.len(),
128            dict.total_length()
129        );
130
131        let targets = match &self.bed.targets {
132            Some(bed_path) => {
133                let t = TargetRegions::from_path(bed_path, &dict)?;
134                log::info!(
135                    "Restricting mutations to {} bp of target territory",
136                    t.total_territory()
137                );
138                Some(t)
139            }
140            None => None,
141        };
142
143        let ploidy_map = PloidyMap::new(self.ploidy, &self.ploidy_override)?;
144        let indel_dist = Geometric::new(self.indel_length_param)
145            .map_err(|e| anyhow::anyhow!("Invalid indel length distribution: {e}"))?;
146
147        // Open output VCF.
148        let mut vcf_out = std::fs::File::create(&self.output)?;
149        Self::write_vcf_header(&mut vcf_out, &dict)?;
150
151        let total_rate = self.snp_rate + self.indel_rate + self.mnp_rate;
152        let mut total_variants = 0u64;
153        let contig_names: Vec<String> = dict.names().into_iter().map(String::from).collect();
154
155        for contig_name in &contig_names {
156            let reference = fasta.load_contig(contig_name)?;
157            let contig_idx = dict.get_by_name(contig_name).unwrap().index();
158
159            let mut pos = 0u32;
160            #[expect(clippy::cast_possible_truncation, reason = "contig length fits u32")]
161            let contig_len = reference.len() as u32;
162
163            while pos < contig_len {
164                // Skip non-ACGT bases (Ns).
165                if !BASES.contains(&reference[pos as usize]) {
166                    pos += 1;
167                    continue;
168                }
169
170                // Check if this position is within targets (if specified).
171                if let Some(tgt) = &targets
172                    && !tgt.overlaps(contig_idx, pos, pos + 1)
173                {
174                    pos += 1;
175                    continue;
176                }
177
178                // Decide if a mutation occurs at this position.
179                if rng.random::<f64>() >= total_rate {
180                    pos += 1;
181                    continue;
182                }
183
184                // Determine mutation type by drawing a second random value
185                // and partitioning [0, total_rate) into SNP/indel/MNP ranges.
186                let ploidy = ploidy_map.ploidy_at(contig_name, pos);
187                let type_roll: f64 = rng.random::<f64>() * total_rate;
188                let (ref_allele, alt_allele, advance) = if type_roll < self.snp_rate {
189                    generate_snp(reference[pos as usize], &mut rng)
190                } else if type_roll < self.snp_rate + self.indel_rate {
191                    generate_indel(&reference, pos, contig_len, &indel_dist, &mut rng)
192                } else {
193                    // MNP requires at least 2 bases remaining.
194                    if pos + 2 > contig_len {
195                        pos += 1;
196                        continue;
197                    }
198                    generate_mnp(&reference, pos, contig_len, &mut rng)
199                };
200
201                // Assign genotype (het vs hom).
202                let gt = generate_genotype(ploidy, self.het_hom_ratio, &mut rng);
203
204                // Write VCF record (1-based position).
205                writeln!(
206                    vcf_out,
207                    "{contig_name}\t{}\t.\t{}\t{}\t100\tPASS\t.\tGT\t{gt}",
208                    pos + 1,
209                    String::from_utf8_lossy(&ref_allele),
210                    String::from_utf8_lossy(&alt_allele),
211                )?;
212
213                total_variants += 1;
214                pos += advance;
215            }
216        }
217
218        log::info!("Generated {total_variants} variants");
219        Ok(())
220    }
221
222    /// Write the VCF header to the output file.
223    fn write_vcf_header(
224        out: &mut std::fs::File,
225        dict: &crate::sequence_dict::SequenceDictionary,
226    ) -> Result<()> {
227        writeln!(out, "##fileformat=VCFv4.3")?;
228        writeln!(out, "##source=holodeck-mutate")?;
229        for meta in dict.iter() {
230            writeln!(out, "##contig=<ID={},length={}>", meta.name(), meta.length())?;
231        }
232        writeln!(out, "##FORMAT=<ID=GT,Number=1,Type=String,Description=\"Genotype\">")?;
233        writeln!(out, "#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT\tSAMPLE")?;
234        Ok(())
235    }
236}
237
238/// Generate a SNP: single base substitution.
239/// Returns `(ref_allele, alt_allele, advance)`.
240fn generate_snp(ref_base: u8, rng: &mut impl Rng) -> (Vec<u8>, Vec<u8>, u32) {
241    let alt = loop {
242        let candidate = BASES[rng.random_range(0..4)];
243        if candidate != ref_base {
244            break candidate;
245        }
246    };
247    (vec![ref_base], vec![alt], 1)
248}
249
250/// Generate an indel (insertion or deletion).
251/// Returns `(ref_allele, alt_allele, advance)`.
252fn generate_indel(
253    reference: &[u8],
254    pos: u32,
255    contig_len: u32,
256    indel_dist: &Geometric,
257    rng: &mut impl Rng,
258) -> (Vec<u8>, Vec<u8>, u32) {
259    // Draw indel length from geometric distribution (minimum 1).
260    #[expect(
261        clippy::cast_possible_truncation,
262        reason = "geometric distribution produces small values"
263    )]
264    let length = (indel_dist.sample(rng) as u32).max(1);
265    let is_insertion: bool = rng.random();
266    let anchor = reference[pos as usize];
267
268    if is_insertion {
269        // Insertion: REF = anchor base, ALT = anchor + random bases.
270        let mut alt = vec![anchor];
271        for _ in 0..length {
272            alt.push(BASES[rng.random_range(0..4)]);
273        }
274        (vec![anchor], alt, 1)
275    } else {
276        // Deletion: REF = anchor + deleted bases, ALT = anchor.
277        // Need at least 1 base after the anchor to delete.
278        let available = contig_len.saturating_sub(pos + 1);
279        if available == 0 {
280            // Fall back to insertion at this position.
281            let mut alt = vec![anchor];
282            for _ in 0..length {
283                alt.push(BASES[rng.random_range(0..4)]);
284            }
285            return (vec![anchor], alt, 1);
286        }
287        let del_len = length.min(available);
288        let del_end = pos + 1 + del_len;
289        let ref_allele: Vec<u8> = reference[pos as usize..del_end as usize].to_vec();
290        #[expect(clippy::cast_possible_truncation, reason = "allele length fits u32")]
291        let advance = ref_allele.len() as u32;
292        (ref_allele, vec![anchor], advance)
293    }
294}
295
296/// Generate an MNP (multi-nucleotide polymorphism, 2-3 bases).
297/// Returns `(ref_allele, alt_allele, advance)`.
298fn generate_mnp(
299    reference: &[u8],
300    pos: u32,
301    contig_len: u32,
302    rng: &mut impl Rng,
303) -> (Vec<u8>, Vec<u8>, u32) {
304    let length = rng.random_range(2..=3u32).min(contig_len - pos);
305    let ref_allele: Vec<u8> = reference[pos as usize..(pos + length) as usize].to_vec();
306
307    // Generate alt allele that differs in at least one position.
308    let mut alt_allele = ref_allele.clone();
309    for base in &mut alt_allele {
310        if rng.random::<f64>() < 0.8 {
311            // Mutate this position.
312            let original = *base;
313            *base = loop {
314                let candidate = BASES[rng.random_range(0..4)];
315                if candidate != original {
316                    break candidate;
317                }
318            };
319        }
320    }
321    // Ensure at least one base differs.
322    if alt_allele == ref_allele {
323        let idx = rng.random_range(0..alt_allele.len());
324        let original = alt_allele[idx];
325        alt_allele[idx] = loop {
326            let candidate = BASES[rng.random_range(0..4)];
327            if candidate != original {
328                break candidate;
329            }
330        };
331    }
332
333    (ref_allele, alt_allele, length)
334}
335
336/// Generate a genotype string for the given ploidy and het/hom ratio.
337///
338/// The het/hom ratio controls the probability of heterozygous vs homozygous
339/// genotypes. For a ratio of 2.0, ~67% of variants will be het and ~33% hom.
340fn generate_genotype(ploidy: u8, het_hom_ratio: f64, rng: &mut impl Rng) -> String {
341    let p_het = het_hom_ratio / (1.0 + het_hom_ratio);
342    let is_het = ploidy > 1 && rng.random::<f64>() < p_het;
343
344    if ploidy == 1 {
345        // Haploid: always the alt allele.
346        "1".to_string()
347    } else if is_het {
348        // Heterozygous: randomly choose which haplotype(s) get the alt.
349        let mut alleles: Vec<&str> = vec!["0"; ploidy as usize];
350        let alt_hap = rng.random_range(0..ploidy as usize);
351        alleles[alt_hap] = "1";
352        alleles.join("/")
353    } else {
354        // Homozygous alt: all haplotypes get the alt.
355        vec!["1"; ploidy as usize].join("/")
356    }
357}
358
359#[cfg(test)]
360mod tests {
361    use super::*;
362
363    #[test]
364    fn test_generate_snp() {
365        let mut rng = rand::rng();
366        for _ in 0..100 {
367            let (ref_a, alt_a, advance) = generate_snp(b'A', &mut rng);
368            assert_eq!(ref_a, vec![b'A']);
369            assert_eq!(alt_a.len(), 1);
370            assert_ne!(alt_a[0], b'A');
371            assert_eq!(advance, 1);
372        }
373    }
374
375    #[test]
376    fn test_generate_indel_insertion() {
377        let reference = b"ACGTACGT";
378        let indel_dist = Geometric::new(0.7).unwrap();
379        let mut rng = SmallRng::seed_from_u64(42);
380
381        let mut found_insertion = false;
382        for _ in 0..100 {
383            let (ref_a, alt_a, advance) = generate_indel(reference, 2, 8, &indel_dist, &mut rng);
384            if alt_a.len() > ref_a.len() {
385                found_insertion = true;
386                assert_eq!(ref_a.len(), 1); // Anchor base only
387                assert!(alt_a.len() >= 2); // Anchor + at least 1 inserted base
388                assert_eq!(ref_a[0], alt_a[0]); // Same anchor
389                assert_eq!(advance, 1);
390            }
391        }
392        assert!(found_insertion, "Should have generated at least one insertion");
393    }
394
395    #[test]
396    fn test_generate_indel_deletion() {
397        let reference = b"ACGTACGT";
398        let indel_dist = Geometric::new(0.7).unwrap();
399        let mut rng = SmallRng::seed_from_u64(99);
400
401        let mut found_deletion = false;
402        for _ in 0..100 {
403            let (ref_a, alt_a, _advance) = generate_indel(reference, 2, 8, &indel_dist, &mut rng);
404            if ref_a.len() > alt_a.len() {
405                found_deletion = true;
406                assert_eq!(alt_a.len(), 1); // Anchor base only
407                assert!(ref_a.len() >= 2); // Anchor + at least 1 deleted base
408                assert_eq!(ref_a[0], alt_a[0]); // Same anchor
409            }
410        }
411        assert!(found_deletion, "Should have generated at least one deletion");
412    }
413
414    #[test]
415    fn test_generate_mnp() {
416        let reference = b"ACGTACGT";
417        let mut rng = rand::rng();
418        for _ in 0..100 {
419            let (ref_a, alt_a, advance) = generate_mnp(reference, 1, 8, &mut rng);
420            assert!(ref_a.len() >= 2 && ref_a.len() <= 3);
421            assert_eq!(ref_a.len(), alt_a.len());
422            assert_ne!(ref_a, alt_a);
423            assert_eq!(advance as usize, ref_a.len());
424        }
425    }
426
427    #[test]
428    fn test_generate_genotype_haploid() {
429        let mut rng = rand::rng();
430        let gt = generate_genotype(1, 2.0, &mut rng);
431        assert_eq!(gt, "1");
432    }
433
434    #[test]
435    fn test_generate_genotype_diploid_het() {
436        let mut rng = rand::rng();
437        let mut het_count = 0;
438        let mut hom_count = 0;
439        for _ in 0..1000 {
440            let gt = generate_genotype(2, 2.0, &mut rng);
441            if gt == "0/1" || gt == "1/0" {
442                het_count += 1;
443            } else if gt == "1/1" {
444                hom_count += 1;
445            }
446        }
447        // With het_hom_ratio=2.0, expect ~67% het, ~33% hom.
448        assert!(het_count > 500, "expected majority het, got {het_count}");
449        assert!(hom_count > 200, "expected some hom, got {hom_count}");
450    }
451
452    #[test]
453    fn test_generate_genotype_triploid() {
454        let mut rng = rand::rng();
455        let gt = generate_genotype(3, 0.0, &mut rng);
456        // hom-alt for triploid.
457        assert_eq!(gt, "1/1/1");
458    }
459}