1use std::io::Write;
2use std::path::PathBuf;
3
4use anyhow::{Result, bail};
5use clap::Parser;
6use rand::rngs::SmallRng;
7use rand::{Rng, SeedableRng};
8use rand_distr::{Distribution, Geometric};
9
10use super::command::Command;
11use super::common::{BedOptions, ReferenceOptions, SeedOptions};
12use crate::bed::TargetRegions;
13use crate::fasta::Fasta;
14use crate::ploidy::PloidyMap;
15use crate::seed::{derive_seed, resolve_seed};
16
17const BASES: [u8; 4] = [b'A', b'C', b'G', b'T'];
19
20#[derive(Parser, Debug)]
27#[command(after_long_help = "EXAMPLES:\n \
28 holodeck mutate -r ref.fa -o muts.vcf --snp-rate 0.001\n \
29 holodeck mutate -r ref.fa -o muts.vcf --snp-rate 0.001 --ploidy 2 \
30 --ploidy-override chrX=1 --ploidy-override chrX:10001-2781479=2")]
31pub struct Mutate {
32 #[command(flatten)]
33 pub reference: ReferenceOptions,
34
35 #[command(flatten)]
36 pub bed: BedOptions,
37
38 #[command(flatten)]
39 pub seed: SeedOptions,
40
41 #[arg(short = 'o', long, value_name = "VCF")]
43 pub output: PathBuf,
44
45 #[arg(long, default_value_t = 0.001, value_name = "FLOAT")]
47 pub snp_rate: f64,
48
49 #[arg(long, default_value_t = 0.0001, value_name = "FLOAT")]
51 pub indel_rate: f64,
52
53 #[arg(long, default_value_t = 0.00005, value_name = "FLOAT")]
55 pub mnp_rate: f64,
56
57 #[arg(long, default_value_t = 0.7, value_name = "FLOAT")]
60 pub indel_length_param: f64,
61
62 #[arg(long, default_value_t = 2.0, value_name = "FLOAT")]
65 pub het_hom_ratio: f64,
66
67 #[arg(long, default_value_t = 2, value_name = "INT")]
69 pub ploidy: u8,
70
71 #[arg(long, value_name = "SPEC")]
77 pub ploidy_override: Vec<String>,
78}
79
80impl Command for Mutate {
81 fn execute(&self) -> Result<()> {
82 self.validate()?;
83 self.run_mutation()
84 }
85}
86
87impl Mutate {
88 fn validate(&self) -> Result<()> {
90 if self.snp_rate < 0.0 || self.indel_rate < 0.0 || self.mnp_rate < 0.0 {
91 bail!("Individual mutation rates must be >= 0");
92 }
93 let total_rate = self.snp_rate + self.indel_rate + self.mnp_rate;
94 if !(0.0..=1.0).contains(&total_rate) {
95 bail!("Combined mutation rate must be in [0, 1], got {total_rate}");
96 }
97 if self.indel_length_param <= 0.0 || self.indel_length_param >= 1.0 {
98 bail!("--indel-length-param must be in (0, 1)");
99 }
100 if self.het_hom_ratio < 0.0 {
101 bail!("--het-hom-ratio must be >= 0");
102 }
103 if self.ploidy == 0 {
104 bail!("--ploidy must be >= 1");
105 }
106 Ok(())
107 }
108
109 fn run_mutation(&self) -> Result<()> {
111 let seed_desc = format!(
112 "mutate:{}:{}:{}:{}:{}",
113 self.reference.reference.display(),
114 self.snp_rate,
115 self.indel_rate,
116 self.mnp_rate,
117 self.ploidy,
118 );
119 let seed = resolve_seed(self.seed.seed, &seed_desc);
120 let mut rng = SmallRng::seed_from_u64(seed);
121 log::info!("Using random seed: {seed}");
122
123 let mut fasta = Fasta::from_path(&self.reference.reference)?;
124 let dict = fasta.dict().clone();
125 log::info!(
126 "Loaded reference with {} contigs, total {} bp",
127 dict.len(),
128 dict.total_length()
129 );
130
131 let targets = match &self.bed.targets {
132 Some(bed_path) => {
133 let t = TargetRegions::from_path(bed_path, &dict)?;
134 log::info!(
135 "Restricting mutations to {} bp of target territory",
136 t.total_territory()
137 );
138 Some(t)
139 }
140 None => None,
141 };
142
143 let ploidy_map = PloidyMap::new(self.ploidy, &self.ploidy_override)?;
144 let indel_dist = Geometric::new(self.indel_length_param)
145 .map_err(|e| anyhow::anyhow!("Invalid indel length distribution: {e}"))?;
146
147 let mut vcf_out = std::fs::File::create(&self.output)?;
149 Self::write_vcf_header(&mut vcf_out, &dict)?;
150
151 let total_rate = self.snp_rate + self.indel_rate + self.mnp_rate;
152 let mut total_variants = 0u64;
153 let contig_names: Vec<String> = dict.names().into_iter().map(String::from).collect();
154
155 for contig_name in &contig_names {
156 let contig_seed = derive_seed(seed, contig_name);
159 let mut ref_rng = SmallRng::seed_from_u64(contig_seed);
160 let reference = fasta.load_contig(contig_name, &mut ref_rng)?;
161 let contig_idx = dict.get_by_name(contig_name).unwrap().index();
162
163 let mut pos = 0u32;
164 #[expect(clippy::cast_possible_truncation, reason = "contig length fits u32")]
165 let contig_len = reference.len() as u32;
166
167 while pos < contig_len {
168 if !BASES.contains(&reference[pos as usize]) {
170 pos += 1;
171 continue;
172 }
173
174 if let Some(tgt) = &targets
176 && !tgt.overlaps(contig_idx, pos, pos + 1)
177 {
178 pos += 1;
179 continue;
180 }
181
182 if rng.random::<f64>() >= total_rate {
184 pos += 1;
185 continue;
186 }
187
188 let ploidy = ploidy_map.ploidy_at(contig_name, pos);
191 let type_roll: f64 = rng.random::<f64>() * total_rate;
192 let (ref_allele, alt_allele, advance) = if type_roll < self.snp_rate {
193 generate_snp(reference[pos as usize], &mut rng)
194 } else if type_roll < self.snp_rate + self.indel_rate {
195 generate_indel(&reference, pos, contig_len, &indel_dist, &mut rng)
196 } else {
197 if pos + 2 > contig_len {
199 pos += 1;
200 continue;
201 }
202 generate_mnp(&reference, pos, contig_len, &mut rng)
203 };
204
205 if !ref_allele.iter().all(|b| BASES.contains(b)) {
211 pos += 1;
212 continue;
213 }
214
215 let gt = generate_genotype(ploidy, self.het_hom_ratio, &mut rng);
217
218 writeln!(
220 vcf_out,
221 "{contig_name}\t{}\t.\t{}\t{}\t100\tPASS\t.\tGT\t{gt}",
222 pos + 1,
223 String::from_utf8_lossy(&ref_allele),
224 String::from_utf8_lossy(&alt_allele),
225 )?;
226
227 total_variants += 1;
228 pos += advance;
229 }
230 }
231
232 log::info!("Generated {total_variants} variants");
233 Ok(())
234 }
235
236 fn write_vcf_header(
238 out: &mut std::fs::File,
239 dict: &crate::sequence_dict::SequenceDictionary,
240 ) -> Result<()> {
241 writeln!(out, "##fileformat=VCFv4.3")?;
242 writeln!(out, "##source=holodeck-mutate")?;
243 for meta in dict.iter() {
244 writeln!(out, "##contig=<ID={},length={}>", meta.name(), meta.length())?;
245 }
246 writeln!(out, "##FORMAT=<ID=GT,Number=1,Type=String,Description=\"Genotype\">")?;
247 writeln!(out, "#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT\tSAMPLE")?;
248 Ok(())
249 }
250}
251
252fn generate_snp(ref_base: u8, rng: &mut impl Rng) -> (Vec<u8>, Vec<u8>, u32) {
255 let alt = loop {
256 let candidate = BASES[rng.random_range(0..4)];
257 if candidate != ref_base {
258 break candidate;
259 }
260 };
261 (vec![ref_base], vec![alt], 1)
262}
263
264fn generate_indel(
267 reference: &[u8],
268 pos: u32,
269 contig_len: u32,
270 indel_dist: &Geometric,
271 rng: &mut impl Rng,
272) -> (Vec<u8>, Vec<u8>, u32) {
273 #[expect(
275 clippy::cast_possible_truncation,
276 reason = "geometric distribution produces small values"
277 )]
278 let length = (indel_dist.sample(rng) as u32).max(1);
279 let is_insertion: bool = rng.random();
280 let anchor = reference[pos as usize];
281
282 if is_insertion {
283 let mut alt = vec![anchor];
285 for _ in 0..length {
286 alt.push(BASES[rng.random_range(0..4)]);
287 }
288 (vec![anchor], alt, 1)
289 } else {
290 let available = contig_len.saturating_sub(pos + 1);
293 if available == 0 {
294 let mut alt = vec![anchor];
296 for _ in 0..length {
297 alt.push(BASES[rng.random_range(0..4)]);
298 }
299 return (vec![anchor], alt, 1);
300 }
301 let del_len = length.min(available);
302 let del_end = pos + 1 + del_len;
303 let ref_allele: Vec<u8> = reference[pos as usize..del_end as usize].to_vec();
304 #[expect(clippy::cast_possible_truncation, reason = "allele length fits u32")]
305 let advance = ref_allele.len() as u32;
306 (ref_allele, vec![anchor], advance)
307 }
308}
309
310fn generate_mnp(
313 reference: &[u8],
314 pos: u32,
315 contig_len: u32,
316 rng: &mut impl Rng,
317) -> (Vec<u8>, Vec<u8>, u32) {
318 let length = rng.random_range(2..=3u32).min(contig_len - pos);
319 let ref_allele: Vec<u8> = reference[pos as usize..(pos + length) as usize].to_vec();
320
321 let mut alt_allele = ref_allele.clone();
323 for base in &mut alt_allele {
324 if rng.random::<f64>() < 0.8 {
325 let original = *base;
327 *base = loop {
328 let candidate = BASES[rng.random_range(0..4)];
329 if candidate != original {
330 break candidate;
331 }
332 };
333 }
334 }
335 if alt_allele == ref_allele {
337 let idx = rng.random_range(0..alt_allele.len());
338 let original = alt_allele[idx];
339 alt_allele[idx] = loop {
340 let candidate = BASES[rng.random_range(0..4)];
341 if candidate != original {
342 break candidate;
343 }
344 };
345 }
346
347 (ref_allele, alt_allele, length)
348}
349
350fn generate_genotype(ploidy: u8, het_hom_ratio: f64, rng: &mut impl Rng) -> String {
355 let p_het = het_hom_ratio / (1.0 + het_hom_ratio);
356 let is_het = ploidy > 1 && rng.random::<f64>() < p_het;
357
358 if ploidy == 1 {
359 "1".to_string()
361 } else if is_het {
362 let mut alleles: Vec<&str> = vec!["0"; ploidy as usize];
364 let alt_hap = rng.random_range(0..ploidy as usize);
365 alleles[alt_hap] = "1";
366 alleles.join("/")
367 } else {
368 vec!["1"; ploidy as usize].join("/")
370 }
371}
372
373#[cfg(test)]
374mod tests {
375 use super::*;
376
377 #[test]
378 fn test_generate_snp() {
379 let mut rng = rand::rng();
380 for _ in 0..100 {
381 let (ref_a, alt_a, advance) = generate_snp(b'A', &mut rng);
382 assert_eq!(ref_a, vec![b'A']);
383 assert_eq!(alt_a.len(), 1);
384 assert_ne!(alt_a[0], b'A');
385 assert_eq!(advance, 1);
386 }
387 }
388
389 #[test]
390 fn test_generate_indel_insertion() {
391 let reference = b"ACGTACGT";
392 let indel_dist = Geometric::new(0.7).unwrap();
393 let mut rng = SmallRng::seed_from_u64(42);
394
395 let mut found_insertion = false;
396 for _ in 0..100 {
397 let (ref_a, alt_a, advance) = generate_indel(reference, 2, 8, &indel_dist, &mut rng);
398 if alt_a.len() > ref_a.len() {
399 found_insertion = true;
400 assert_eq!(ref_a.len(), 1); assert!(alt_a.len() >= 2); assert_eq!(ref_a[0], alt_a[0]); assert_eq!(advance, 1);
404 }
405 }
406 assert!(found_insertion, "Should have generated at least one insertion");
407 }
408
409 #[test]
410 fn test_generate_indel_deletion() {
411 let reference = b"ACGTACGT";
412 let indel_dist = Geometric::new(0.7).unwrap();
413 let mut rng = SmallRng::seed_from_u64(99);
414
415 let mut found_deletion = false;
416 for _ in 0..100 {
417 let (ref_a, alt_a, _advance) = generate_indel(reference, 2, 8, &indel_dist, &mut rng);
418 if ref_a.len() > alt_a.len() {
419 found_deletion = true;
420 assert_eq!(alt_a.len(), 1); assert!(ref_a.len() >= 2); assert_eq!(ref_a[0], alt_a[0]); }
424 }
425 assert!(found_deletion, "Should have generated at least one deletion");
426 }
427
428 #[test]
429 fn test_generate_mnp() {
430 let reference = b"ACGTACGT";
431 let mut rng = rand::rng();
432 for _ in 0..100 {
433 let (ref_a, alt_a, advance) = generate_mnp(reference, 1, 8, &mut rng);
434 assert!(ref_a.len() >= 2 && ref_a.len() <= 3);
435 assert_eq!(ref_a.len(), alt_a.len());
436 assert_ne!(ref_a, alt_a);
437 assert_eq!(advance as usize, ref_a.len());
438 }
439 }
440
441 #[test]
442 fn test_generate_genotype_haploid() {
443 let mut rng = rand::rng();
444 let gt = generate_genotype(1, 2.0, &mut rng);
445 assert_eq!(gt, "1");
446 }
447
448 #[test]
449 fn test_generate_genotype_diploid_het() {
450 let mut rng = rand::rng();
451 let mut het_count = 0;
452 let mut hom_count = 0;
453 for _ in 0..1000 {
454 let gt = generate_genotype(2, 2.0, &mut rng);
455 if gt == "0/1" || gt == "1/0" {
456 het_count += 1;
457 } else if gt == "1/1" {
458 hom_count += 1;
459 }
460 }
461 assert!(het_count > 500, "expected majority het, got {het_count}");
463 assert!(hom_count > 200, "expected some hom, got {hom_count}");
464 }
465
466 #[test]
467 fn test_generate_genotype_triploid() {
468 let mut rng = rand::rng();
469 let gt = generate_genotype(3, 0.0, &mut rng);
470 assert_eq!(gt, "1/1/1");
472 }
473}