1use std::io::Write;
2use std::path::PathBuf;
3
4use anyhow::{Result, bail};
5use clap::Parser;
6use rand::rngs::SmallRng;
7use rand::{Rng, SeedableRng};
8use rand_distr::{Distribution, Geometric};
9
10use super::command::Command;
11use super::common::{BedOptions, ReferenceOptions, SeedOptions};
12use crate::bed::TargetRegions;
13use crate::fasta::Fasta;
14use crate::ploidy::PloidyMap;
15use crate::seed::resolve_seed;
16
17const BASES: [u8; 4] = [b'A', b'C', b'G', b'T'];
19
20#[derive(Parser, Debug)]
27#[command(after_long_help = "EXAMPLES:\n \
28 holodeck mutate -r ref.fa -o muts.vcf --snp-rate 0.001\n \
29 holodeck mutate -r ref.fa -o muts.vcf --snp-rate 0.001 --ploidy 2 \
30 --ploidy-override chrX=1 --ploidy-override chrX:10001-2781479=2")]
31pub struct Mutate {
32 #[command(flatten)]
33 pub reference: ReferenceOptions,
34
35 #[command(flatten)]
36 pub bed: BedOptions,
37
38 #[command(flatten)]
39 pub seed: SeedOptions,
40
41 #[arg(short = 'o', long, value_name = "VCF")]
43 pub output: PathBuf,
44
45 #[arg(long, default_value_t = 0.001, value_name = "FLOAT")]
47 pub snp_rate: f64,
48
49 #[arg(long, default_value_t = 0.0001, value_name = "FLOAT")]
51 pub indel_rate: f64,
52
53 #[arg(long, default_value_t = 0.00005, value_name = "FLOAT")]
55 pub mnp_rate: f64,
56
57 #[arg(long, default_value_t = 0.7, value_name = "FLOAT")]
60 pub indel_length_param: f64,
61
62 #[arg(long, default_value_t = 2.0, value_name = "FLOAT")]
65 pub het_hom_ratio: f64,
66
67 #[arg(long, default_value_t = 2, value_name = "INT")]
69 pub ploidy: u8,
70
71 #[arg(long, value_name = "SPEC")]
77 pub ploidy_override: Vec<String>,
78}
79
80impl Command for Mutate {
81 fn execute(&self) -> Result<()> {
82 self.validate()?;
83 self.run_mutation()
84 }
85}
86
87impl Mutate {
88 fn validate(&self) -> Result<()> {
90 if self.snp_rate < 0.0 || self.indel_rate < 0.0 || self.mnp_rate < 0.0 {
91 bail!("Individual mutation rates must be >= 0");
92 }
93 let total_rate = self.snp_rate + self.indel_rate + self.mnp_rate;
94 if !(0.0..=1.0).contains(&total_rate) {
95 bail!("Combined mutation rate must be in [0, 1], got {total_rate}");
96 }
97 if self.indel_length_param <= 0.0 || self.indel_length_param >= 1.0 {
98 bail!("--indel-length-param must be in (0, 1)");
99 }
100 if self.het_hom_ratio < 0.0 {
101 bail!("--het-hom-ratio must be >= 0");
102 }
103 if self.ploidy == 0 {
104 bail!("--ploidy must be >= 1");
105 }
106 Ok(())
107 }
108
109 fn run_mutation(&self) -> Result<()> {
111 let seed_desc = format!(
112 "mutate:{}:{}:{}:{}:{}",
113 self.reference.reference.display(),
114 self.snp_rate,
115 self.indel_rate,
116 self.mnp_rate,
117 self.ploidy,
118 );
119 let seed = resolve_seed(self.seed.seed, &seed_desc);
120 let mut rng = SmallRng::seed_from_u64(seed);
121 log::info!("Using random seed: {seed}");
122
123 let mut fasta = Fasta::from_path(&self.reference.reference)?;
124 let dict = fasta.dict().clone();
125 log::info!(
126 "Loaded reference with {} contigs, total {} bp",
127 dict.len(),
128 dict.total_length()
129 );
130
131 let targets = match &self.bed.targets {
132 Some(bed_path) => {
133 let t = TargetRegions::from_path(bed_path, &dict)?;
134 log::info!(
135 "Restricting mutations to {} bp of target territory",
136 t.total_territory()
137 );
138 Some(t)
139 }
140 None => None,
141 };
142
143 let ploidy_map = PloidyMap::new(self.ploidy, &self.ploidy_override)?;
144 let indel_dist = Geometric::new(self.indel_length_param)
145 .map_err(|e| anyhow::anyhow!("Invalid indel length distribution: {e}"))?;
146
147 let mut vcf_out = std::fs::File::create(&self.output)?;
149 Self::write_vcf_header(&mut vcf_out, &dict)?;
150
151 let total_rate = self.snp_rate + self.indel_rate + self.mnp_rate;
152 let mut total_variants = 0u64;
153 let contig_names: Vec<String> = dict.names().into_iter().map(String::from).collect();
154
155 for contig_name in &contig_names {
156 let reference = fasta.load_contig(contig_name)?;
157 let contig_idx = dict.get_by_name(contig_name).unwrap().index();
158
159 let mut pos = 0u32;
160 #[expect(clippy::cast_possible_truncation, reason = "contig length fits u32")]
161 let contig_len = reference.len() as u32;
162
163 while pos < contig_len {
164 if !BASES.contains(&reference[pos as usize]) {
166 pos += 1;
167 continue;
168 }
169
170 if let Some(tgt) = &targets
172 && !tgt.overlaps(contig_idx, pos, pos + 1)
173 {
174 pos += 1;
175 continue;
176 }
177
178 if rng.random::<f64>() >= total_rate {
180 pos += 1;
181 continue;
182 }
183
184 let ploidy = ploidy_map.ploidy_at(contig_name, pos);
187 let type_roll: f64 = rng.random::<f64>() * total_rate;
188 let (ref_allele, alt_allele, advance) = if type_roll < self.snp_rate {
189 generate_snp(reference[pos as usize], &mut rng)
190 } else if type_roll < self.snp_rate + self.indel_rate {
191 generate_indel(&reference, pos, contig_len, &indel_dist, &mut rng)
192 } else {
193 if pos + 2 > contig_len {
195 pos += 1;
196 continue;
197 }
198 generate_mnp(&reference, pos, contig_len, &mut rng)
199 };
200
201 let gt = generate_genotype(ploidy, self.het_hom_ratio, &mut rng);
203
204 writeln!(
206 vcf_out,
207 "{contig_name}\t{}\t.\t{}\t{}\t100\tPASS\t.\tGT\t{gt}",
208 pos + 1,
209 String::from_utf8_lossy(&ref_allele),
210 String::from_utf8_lossy(&alt_allele),
211 )?;
212
213 total_variants += 1;
214 pos += advance;
215 }
216 }
217
218 log::info!("Generated {total_variants} variants");
219 Ok(())
220 }
221
222 fn write_vcf_header(
224 out: &mut std::fs::File,
225 dict: &crate::sequence_dict::SequenceDictionary,
226 ) -> Result<()> {
227 writeln!(out, "##fileformat=VCFv4.3")?;
228 writeln!(out, "##source=holodeck-mutate")?;
229 for meta in dict.iter() {
230 writeln!(out, "##contig=<ID={},length={}>", meta.name(), meta.length())?;
231 }
232 writeln!(out, "##FORMAT=<ID=GT,Number=1,Type=String,Description=\"Genotype\">")?;
233 writeln!(out, "#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT\tSAMPLE")?;
234 Ok(())
235 }
236}
237
238fn generate_snp(ref_base: u8, rng: &mut impl Rng) -> (Vec<u8>, Vec<u8>, u32) {
241 let alt = loop {
242 let candidate = BASES[rng.random_range(0..4)];
243 if candidate != ref_base {
244 break candidate;
245 }
246 };
247 (vec![ref_base], vec![alt], 1)
248}
249
250fn generate_indel(
253 reference: &[u8],
254 pos: u32,
255 contig_len: u32,
256 indel_dist: &Geometric,
257 rng: &mut impl Rng,
258) -> (Vec<u8>, Vec<u8>, u32) {
259 #[expect(
261 clippy::cast_possible_truncation,
262 reason = "geometric distribution produces small values"
263 )]
264 let length = (indel_dist.sample(rng) as u32).max(1);
265 let is_insertion: bool = rng.random();
266 let anchor = reference[pos as usize];
267
268 if is_insertion {
269 let mut alt = vec![anchor];
271 for _ in 0..length {
272 alt.push(BASES[rng.random_range(0..4)]);
273 }
274 (vec![anchor], alt, 1)
275 } else {
276 let available = contig_len.saturating_sub(pos + 1);
279 if available == 0 {
280 let mut alt = vec![anchor];
282 for _ in 0..length {
283 alt.push(BASES[rng.random_range(0..4)]);
284 }
285 return (vec![anchor], alt, 1);
286 }
287 let del_len = length.min(available);
288 let del_end = pos + 1 + del_len;
289 let ref_allele: Vec<u8> = reference[pos as usize..del_end as usize].to_vec();
290 #[expect(clippy::cast_possible_truncation, reason = "allele length fits u32")]
291 let advance = ref_allele.len() as u32;
292 (ref_allele, vec![anchor], advance)
293 }
294}
295
296fn generate_mnp(
299 reference: &[u8],
300 pos: u32,
301 contig_len: u32,
302 rng: &mut impl Rng,
303) -> (Vec<u8>, Vec<u8>, u32) {
304 let length = rng.random_range(2..=3u32).min(contig_len - pos);
305 let ref_allele: Vec<u8> = reference[pos as usize..(pos + length) as usize].to_vec();
306
307 let mut alt_allele = ref_allele.clone();
309 for base in &mut alt_allele {
310 if rng.random::<f64>() < 0.8 {
311 let original = *base;
313 *base = loop {
314 let candidate = BASES[rng.random_range(0..4)];
315 if candidate != original {
316 break candidate;
317 }
318 };
319 }
320 }
321 if alt_allele == ref_allele {
323 let idx = rng.random_range(0..alt_allele.len());
324 let original = alt_allele[idx];
325 alt_allele[idx] = loop {
326 let candidate = BASES[rng.random_range(0..4)];
327 if candidate != original {
328 break candidate;
329 }
330 };
331 }
332
333 (ref_allele, alt_allele, length)
334}
335
336fn generate_genotype(ploidy: u8, het_hom_ratio: f64, rng: &mut impl Rng) -> String {
341 let p_het = het_hom_ratio / (1.0 + het_hom_ratio);
342 let is_het = ploidy > 1 && rng.random::<f64>() < p_het;
343
344 if ploidy == 1 {
345 "1".to_string()
347 } else if is_het {
348 let mut alleles: Vec<&str> = vec!["0"; ploidy as usize];
350 let alt_hap = rng.random_range(0..ploidy as usize);
351 alleles[alt_hap] = "1";
352 alleles.join("/")
353 } else {
354 vec!["1"; ploidy as usize].join("/")
356 }
357}
358
359#[cfg(test)]
360mod tests {
361 use super::*;
362
363 #[test]
364 fn test_generate_snp() {
365 let mut rng = rand::rng();
366 for _ in 0..100 {
367 let (ref_a, alt_a, advance) = generate_snp(b'A', &mut rng);
368 assert_eq!(ref_a, vec![b'A']);
369 assert_eq!(alt_a.len(), 1);
370 assert_ne!(alt_a[0], b'A');
371 assert_eq!(advance, 1);
372 }
373 }
374
375 #[test]
376 fn test_generate_indel_insertion() {
377 let reference = b"ACGTACGT";
378 let indel_dist = Geometric::new(0.7).unwrap();
379 let mut rng = SmallRng::seed_from_u64(42);
380
381 let mut found_insertion = false;
382 for _ in 0..100 {
383 let (ref_a, alt_a, advance) = generate_indel(reference, 2, 8, &indel_dist, &mut rng);
384 if alt_a.len() > ref_a.len() {
385 found_insertion = true;
386 assert_eq!(ref_a.len(), 1); assert!(alt_a.len() >= 2); assert_eq!(ref_a[0], alt_a[0]); assert_eq!(advance, 1);
390 }
391 }
392 assert!(found_insertion, "Should have generated at least one insertion");
393 }
394
395 #[test]
396 fn test_generate_indel_deletion() {
397 let reference = b"ACGTACGT";
398 let indel_dist = Geometric::new(0.7).unwrap();
399 let mut rng = SmallRng::seed_from_u64(99);
400
401 let mut found_deletion = false;
402 for _ in 0..100 {
403 let (ref_a, alt_a, _advance) = generate_indel(reference, 2, 8, &indel_dist, &mut rng);
404 if ref_a.len() > alt_a.len() {
405 found_deletion = true;
406 assert_eq!(alt_a.len(), 1); assert!(ref_a.len() >= 2); assert_eq!(ref_a[0], alt_a[0]); }
410 }
411 assert!(found_deletion, "Should have generated at least one deletion");
412 }
413
414 #[test]
415 fn test_generate_mnp() {
416 let reference = b"ACGTACGT";
417 let mut rng = rand::rng();
418 for _ in 0..100 {
419 let (ref_a, alt_a, advance) = generate_mnp(reference, 1, 8, &mut rng);
420 assert!(ref_a.len() >= 2 && ref_a.len() <= 3);
421 assert_eq!(ref_a.len(), alt_a.len());
422 assert_ne!(ref_a, alt_a);
423 assert_eq!(advance as usize, ref_a.len());
424 }
425 }
426
427 #[test]
428 fn test_generate_genotype_haploid() {
429 let mut rng = rand::rng();
430 let gt = generate_genotype(1, 2.0, &mut rng);
431 assert_eq!(gt, "1");
432 }
433
434 #[test]
435 fn test_generate_genotype_diploid_het() {
436 let mut rng = rand::rng();
437 let mut het_count = 0;
438 let mut hom_count = 0;
439 for _ in 0..1000 {
440 let gt = generate_genotype(2, 2.0, &mut rng);
441 if gt == "0/1" || gt == "1/0" {
442 het_count += 1;
443 } else if gt == "1/1" {
444 hom_count += 1;
445 }
446 }
447 assert!(het_count > 500, "expected majority het, got {het_count}");
449 assert!(hom_count > 200, "expected some hom, got {hom_count}");
450 }
451
452 #[test]
453 fn test_generate_genotype_triploid() {
454 let mut rng = rand::rng();
455 let gt = generate_genotype(3, 0.0, &mut rng);
456 assert_eq!(gt, "1/1/1");
458 }
459}