1use crate::{
103 AllowedAGCTN, DNARestrictive, Error, F32Bw0and1, GetDNARestrictive, ModChar, OrdPair, ReadState,
104};
105use crate::{write_bam_denovo, write_fasta};
106use derive_builder::Builder;
107use itertools::join;
108use rand::seq::IteratorRandom as _;
109use rand::{Rng, random};
110use rust_htslib::bam;
111use rust_htslib::bam::record::{Aux, Cigar, CigarString};
112use serde::{Deserialize, Serialize};
113use std::iter;
114use std::num::{NonZeroU32, NonZeroU64};
115use std::ops::RangeInclusive;
116use std::path::Path;
117use std::str::FromStr as _;
118use uuid::Uuid;
119
120type OF = OrdPair<F32Bw0and1>;
123
124macro_rules! ord_pair_f32_bw0and1 {
129 ($low:expr, $high:expr) => {
130 OrdPair::new(
131 F32Bw0and1::new($low).unwrap(),
132 F32Bw0and1::new($high).unwrap(),
133 )
134 .unwrap()
135 };
136}
137
138#[derive(Builder, Debug, Default, PartialEq, Clone, Serialize, Deserialize)]
186#[serde(default)]
187#[builder(default, build_fn(error = "Error"), pattern = "owned", derive(Clone))]
188#[non_exhaustive]
189pub struct SimulationConfig {
190 pub contigs: ContigConfig,
192 pub reads: Vec<ReadConfig>,
194}
195
196#[derive(Builder, Debug, PartialEq, Clone, Serialize, Deserialize)]
249#[serde(default)]
250#[builder(default, build_fn(error = "Error"), pattern = "owned", derive(Clone))]
251#[non_exhaustive]
252pub struct ContigConfig {
253 #[builder(field(ty = "u32", build = "self.number.try_into()?"))]
255 pub number: NonZeroU32,
256 #[builder(field(ty = "(u64, u64)", build = "self.len_range.try_into()?"))]
258 pub len_range: OrdPair<NonZeroU64>,
259 #[builder(field(
261 ty = "String",
262 build = "(!self.repeated_seq.is_empty()).then(|| DNARestrictive::from_str(&self.repeated_seq)).transpose()?"
263 ))]
264 pub repeated_seq: Option<DNARestrictive>,
265}
266
267#[derive(Builder, Debug, Clone, PartialEq, Serialize, Deserialize)]
315#[serde(default)]
316#[builder(default, build_fn(error = "Error"), pattern = "owned", derive(Clone))]
317#[non_exhaustive]
318pub struct ReadConfig {
319 #[builder(field(ty = "u32", build = "self.number.try_into()?"))]
321 pub number: NonZeroU32,
322 #[builder(field(ty = "(u8, u8)", build = "self.mapq_range.try_into()?"))]
324 pub mapq_range: OrdPair<u8>,
325 #[builder(field(ty = "(u8, u8)", build = "self.base_qual_range.try_into()?"))]
327 pub base_qual_range: OrdPair<u8>,
328 #[builder(field(ty = "(f32, f32)", build = "self.len_range.try_into()?"))]
330 pub len_range: OrdPair<F32Bw0and1>,
331 #[builder(field(
333 ty = "String",
334 build = "(!self.barcode.is_empty()).then(|| DNARestrictive::from_str(&self.barcode)).transpose()?"
335 ))]
336 pub barcode: Option<DNARestrictive>,
337 #[builder(field(
339 ty = "(f32, f32)",
340 build = "Some(self.delete).map(TryInto::try_into).transpose()?"
341 ))]
342 pub delete: Option<OrdPair<F32Bw0and1>>,
343 #[builder(field(
345 ty = "String",
346 build = "(!self.insert_middle.is_empty()).then(|| DNARestrictive::from_str(&self.insert_middle)).transpose()?"
347 ))]
348 pub insert_middle: Option<DNARestrictive>,
349 #[builder(field(ty = "f32", build = "Some(F32Bw0and1::try_from(self.mismatch)?)"))]
351 pub mismatch: Option<F32Bw0and1>,
352 pub mods: Vec<ModConfig>,
354}
355
356#[derive(Builder, Debug, Clone, PartialEq, Serialize, Deserialize)]
383#[serde(default)]
384#[builder(default, build_fn(error = "Error"), pattern = "owned", derive(Clone))]
385#[non_exhaustive]
386pub struct ModConfig {
387 #[builder(field(ty = "char", build = "self.base.try_into()?"))]
389 pub base: AllowedAGCTN,
390 pub is_strand_plus: bool,
392 #[builder(field(ty = "String", build = "self.mod_code.parse()?"))]
394 pub mod_code: ModChar,
395 #[builder(field(
407 ty = "Vec<u32>",
408 build = "self.win.iter().map(|&x| NonZeroU32::new(x).ok_or(Error::Zero(\"cannot use zero-\
409sized windows in builder\".to_owned()))).collect::<Result<Vec<NonZeroU32>,_>>()?"
410 ))]
411 pub win: Vec<NonZeroU32>,
412 #[builder(field(
415 ty = "Vec<(f32, f32)>",
416 build = "self.mod_range.iter().map(|&x| OF::try_from(x)).collect::<Result<Vec<OF>, _>>()?"
417 ))]
418 pub mod_range: Vec<OrdPair<F32Bw0and1>>,
419}
420
421#[derive(Builder, Debug, Clone, Serialize, Deserialize)]
437#[serde(default)]
438#[builder(default, build_fn(error = "Error"))]
439#[non_exhaustive]
440pub struct Contig {
441 #[builder(setter(into))]
443 pub name: String,
444 #[builder(field(ty = "String", build = "self.seq.parse()?"))]
446 pub seq: DNARestrictive,
447}
448
449impl GetDNARestrictive for Contig {
450 fn get_dna_restrictive(&self) -> &DNARestrictive {
452 &self.seq
453 }
454}
455
456impl Default for Contig {
457 fn default() -> Self {
458 Self {
459 name: String::new(),
460 seq: DNARestrictive::from_str("A").expect("valid default DNA"),
461 }
462 }
463}
464
465impl Default for ContigConfig {
466 fn default() -> Self {
467 Self {
468 number: NonZeroU32::new(1).unwrap(),
469 len_range: OrdPair::new(NonZeroU64::new(1).unwrap(), NonZeroU64::new(1).unwrap())
470 .unwrap(),
471 repeated_seq: None,
472 }
473 }
474}
475
476impl Default for ReadConfig {
477 fn default() -> Self {
478 Self {
479 number: NonZeroU32::new(1).unwrap(),
480 mapq_range: OrdPair::new(0, 0).unwrap(),
481 base_qual_range: OrdPair::new(0, 0).unwrap(),
482 len_range: ord_pair_f32_bw0and1!(0.0, 0.0),
483 barcode: None,
484 delete: None,
485 insert_middle: None,
486 mismatch: None,
487 mods: Vec::new(),
488 }
489 }
490}
491
492impl Default for ModConfig {
493 fn default() -> Self {
494 Self {
495 base: AllowedAGCTN::C,
496 is_strand_plus: true,
497 mod_code: ModChar::new('m'),
498 win: vec![NonZeroU32::new(1).unwrap()],
499 mod_range: vec![ord_pair_f32_bw0and1!(0.0, 1.0)],
500 }
501 }
502}
503
504#[derive(Debug, Clone, Serialize, Deserialize)]
506pub struct PerfectSeqMatchToNot {
507 seq: Vec<u8>,
509 barcode: Option<DNARestrictive>,
511 delete: Option<OrdPair<F32Bw0and1>>,
513 insert_middle: Option<DNARestrictive>,
515 mismatch: Option<F32Bw0and1>,
517}
518
519impl PerfectSeqMatchToNot {
520 pub fn seq(seq: Vec<u8>) -> Result<Self, Error> {
543 if seq.is_empty() {
544 return Err(Error::InvalidState(
545 "Sequence length is 0; cannot create PerfectSeqMatchToNot with empty sequence"
546 .into(),
547 ));
548 }
549 Ok(Self {
550 seq,
551 barcode: None,
552 delete: None,
553 insert_middle: None,
554 mismatch: None,
555 })
556 }
557
558 #[must_use]
560 pub fn barcode(mut self, barcode: DNARestrictive) -> Self {
561 self.barcode = Some(barcode);
562 self
563 }
564
565 #[must_use]
567 pub fn delete(mut self, delete: OrdPair<F32Bw0and1>) -> Self {
568 self.delete = Some(delete);
569 self
570 }
571
572 #[must_use]
574 pub fn insert_middle(mut self, insert: DNARestrictive) -> Self {
575 self.insert_middle = Some(insert);
576 self
577 }
578
579 #[must_use]
581 pub fn mismatch(mut self, mismatch: F32Bw0and1) -> Self {
582 self.mismatch = Some(mismatch);
583 self
584 }
585
586 #[expect(
763 clippy::cast_precision_loss,
764 clippy::cast_possible_truncation,
765 clippy::cast_sign_loss,
766 reason = "Casting and arithmetic needed for fractional calculations on sequence positions"
767 )]
768 #[expect(
769 clippy::too_many_lines,
770 reason = "performs many steps to alter sequence (indel, mismatch, barcode)"
771 )]
772 pub fn build<R: Rng>(
773 self,
774 read_state: ReadState,
775 rng: &mut R,
776 ) -> Result<(Vec<u8>, Option<CigarString>), Error> {
777 let seq = self.seq;
778
779 let mut bases_and_ops: Vec<(u8, u8)> = seq.iter().map(|&base| (base, b'M')).collect();
782
783 if let Some(mismatch_frac) = self.mismatch {
787 let num_mismatches =
788 ((bases_and_ops.len() as f32) * mismatch_frac.val()).round() as usize;
789
790 let indices_to_mutate: Vec<usize> =
791 (0..bases_and_ops.len()).choose_multiple(rng, num_mismatches);
792
793 for idx in indices_to_mutate {
794 let item = bases_and_ops
795 .get_mut(idx)
796 .expect("idx is within bounds from choose_multiple");
797 let current_base = item.0;
798 let new_base = match current_base {
799 v @ (b'A' | b'C' | b'G' | b'T') => {
800 loop {
802 let candidate: AllowedAGCTN = rng.random();
803 let candidate_u8: u8 = candidate.into();
804 if candidate_u8 != v && candidate_u8 != b'N' {
805 break candidate_u8;
806 }
807 }
808 }
809 other => other, };
811 item.0 = new_base;
812 }
813 }
814
815 if let Some(delete_range) = self.delete {
817 let start = ((bases_and_ops.len() as f32) * delete_range.low().val()).round() as usize;
818 let end_raw =
819 ((bases_and_ops.len() as f32) * delete_range.high().val()).round() as usize;
820 let end = end_raw.min(bases_and_ops.len());
821
822 if let Some(slice) = bases_and_ops.get_mut(start..end) {
823 for item in slice {
824 item.1 = b'D';
825 }
826 }
827 }
828
829 if let Some(insert_seq) = self.insert_middle {
831 let middle = bases_and_ops.len() / 2;
832 let insert_bases = insert_seq.get_dna_restrictive().get();
833 let insertions: Vec<(u8, u8)> = insert_bases.iter().map(|&b| (b, b'I')).collect();
834
835 drop(
836 bases_and_ops
837 .splice(middle..middle, insertions)
838 .collect::<Vec<_>>(),
839 );
840 }
841
842 if let Some(barcode) = self.barcode {
844 let barcoded_seq = add_barcode(&[], barcode.clone(), read_state);
846 let barcode_len = barcode.get_dna_restrictive().get().len();
847
848 let (start_bc, end_bc) = barcoded_seq.split_at(barcode_len);
850
851 let bc_tuples_start: Vec<(u8, u8)> = start_bc.iter().map(|&b| (b, b'S')).collect();
853 drop(
854 bases_and_ops
855 .splice(0..0, bc_tuples_start)
856 .collect::<Vec<_>>(),
857 );
858
859 let bc_tuples_end: Vec<(u8, u8)> = end_bc.iter().map(|&b| (b, b'S')).collect();
861 bases_and_ops.extend(bc_tuples_end);
862 }
863
864 let cigar_string = CigarString(
866 bases_and_ops
867 .iter()
868 .map(|&(_, op)| op)
869 .fold(Vec::<(u8, u32)>::new(), |mut acc, op| {
870 match acc.last_mut() {
871 Some(&mut (last_op, ref mut count)) if last_op == op => {
872 *count = count.saturating_add(1);
873 }
874 _ => acc.push((op, 1u32)),
875 }
876 acc
877 })
878 .into_iter()
879 .map(|(op, count)| match op {
880 b'M' => Cigar::Match(count),
881 b'I' => Cigar::Ins(count),
882 b'D' => Cigar::Del(count),
883 b'S' => Cigar::SoftClip(count),
884 _ => unreachable!("Invalid CIGAR operation: {op}"),
885 })
886 .collect::<Vec<_>>(),
887 );
888
889 let final_seq: Vec<u8> = bases_and_ops
891 .iter()
892 .copied()
893 .filter_map(|item| (item.1 != b'D').then_some(item.0))
894 .collect();
895
896 let first_ok = (cigar_string.0.as_slice().first().map(|x| x.char()) == Some('M'))
898 || (cigar_string
899 .0
900 .as_slice()
901 .first_chunk::<2>()
902 .map(|&x| [x[0].char(), x[1].char()])
903 == Some(['S', 'M']));
904 let last_ok = (cigar_string.0.as_slice().last().map(|x| x.char()) == Some('M'))
905 || (cigar_string
906 .0
907 .as_slice()
908 .last_chunk::<2>()
909 .map(|&x| [x[0].char(), x[1].char()])
910 == Some(['M', 'S']));
911 if first_ok && last_ok {
912 Ok((
913 final_seq,
914 (!read_state.is_unmapped()).then_some(cigar_string),
915 ))
916 } else {
917 Err(Error::SimulateDNASeqCIGAREndProblem(
918 "too few bp or insertion/deletion close to end".to_owned(),
919 ))
920 }
921 }
922}
923
924pub fn generate_random_dna_modification<R: Rng, S: GetDNARestrictive>(
973 mod_configs: &[ModConfig],
974 seq: &S,
975 rng: &mut R,
976) -> (String, Vec<u8>) {
977 let seq_bytes = seq.get_dna_restrictive().get();
978 let mut mm_str = String::new();
979
980 let mut ml_vec: Vec<u8> = Vec::with_capacity(seq_bytes.len());
982
983 for mod_config in mod_configs {
984 let base = u8::from(mod_config.base);
985 let strand = if mod_config.is_strand_plus { '+' } else { '-' };
986 let mod_code = mod_config.mod_code;
987 let mut count = if base == b'N' {
988 seq_bytes.len()
989 } else {
990 seq_bytes
991 .iter()
992 .zip(iter::repeat(&base))
993 .filter(|&(&a, &b)| a == b)
994 .count()
995 };
996 let mut output: Vec<u8> = Vec::with_capacity(count);
997 for k in mod_config
998 .win
999 .iter()
1000 .cycle()
1001 .zip(mod_config.mod_range.iter().cycle())
1002 {
1003 let low = u8::from(k.1.low());
1004 let high = u8::from(k.1.high());
1005 #[expect(
1006 clippy::redundant_else,
1007 reason = "so that the clippy arithmetic lint fits better with the code"
1008 )]
1009 #[expect(
1010 clippy::arithmetic_side_effects,
1011 reason = "we loop only if count > 0 and catch count == 0, thus count doesn't underflow"
1012 )]
1013 if count == 0 {
1014 break;
1015 } else {
1016 for _ in 0..k.0.get() {
1017 output.push(rng.random_range(low..=high));
1018 count -= 1;
1019 if count == 0 {
1020 break;
1021 }
1022 }
1023 }
1024 }
1025 if !output.is_empty() {
1026 let mod_len = output.len();
1027 ml_vec.append(&mut output);
1028 mm_str += format!(
1029 "{}{}{}?,{};",
1030 base as char,
1031 strand,
1032 mod_code,
1033 join(vec![0; mod_len], ",")
1034 )
1035 .as_str();
1036 }
1037 }
1038 ml_vec.shrink_to_fit();
1039 (mm_str, ml_vec)
1040}
1041
1042pub fn generate_random_dna_sequence<R: Rng>(length: NonZeroU64, rng: &mut R) -> Vec<u8> {
1058 const DNA_BASES: [u8; 4] = [b'A', b'C', b'G', b'T'];
1059 iter::repeat_with(|| {
1060 *DNA_BASES
1061 .get(rng.random_range(0..4))
1062 .expect("random_range(0..4) is always 0-3")
1063 })
1064 .take(usize::try_from(length.get()).expect("sequence length exceeds usize::MAX"))
1065 .collect()
1066}
1067
1068#[must_use]
1093#[expect(
1094 clippy::needless_pass_by_value,
1095 reason = "barcode by value allows passing `&'static str` easily (I think, may fix in future). \
1096If you don't know what this means, ignore this; this is _not_ a bug."
1097)]
1098pub fn add_barcode(read_seq: &[u8], barcode: DNARestrictive, read_state: ReadState) -> Vec<u8> {
1099 let bc_bytes = barcode.get();
1100
1101 match read_state {
1102 ReadState::PrimaryFwd
1103 | ReadState::SecondaryFwd
1104 | ReadState::SupplementaryFwd
1105 | ReadState::Unmapped => {
1106 let revcomp_bc = bio::alphabets::dna::revcomp(bc_bytes);
1108 [bc_bytes, read_seq, &revcomp_bc[..]].concat()
1109 }
1110 ReadState::PrimaryRev | ReadState::SecondaryRev | ReadState::SupplementaryRev => {
1111 let comp_bc: Vec<u8> = bc_bytes
1113 .iter()
1114 .map(|&b| bio::alphabets::dna::complement(b))
1115 .collect();
1116 let rev_bc: Vec<u8> = bc_bytes.iter().copied().rev().collect();
1117 [&comp_bc[..], read_seq, &rev_bc[..]].concat()
1118 }
1119 }
1120}
1121
1122#[expect(
1141 clippy::missing_panics_doc,
1142 reason = "no length number or DNA conversion problems expected"
1143)]
1144pub fn generate_contigs_denovo<R: Rng>(
1145 contig_number: NonZeroU32,
1146 len_range: OrdPair<NonZeroU64>,
1147 rng: &mut R,
1148) -> Vec<Contig> {
1149 (0..contig_number.get())
1150 .map(|i| {
1151 let length = rng.random_range(len_range.low().get()..=len_range.high().get());
1152 let seq_bytes =
1153 generate_random_dna_sequence(NonZeroU64::try_from(length).expect("no error"), rng);
1154 let seq_str = String::from_utf8(seq_bytes).expect("valid DNA sequence");
1155 ContigBuilder::default()
1156 .name(format!("contig_{i:05}"))
1157 .seq(seq_str)
1158 .build()
1159 .expect("no error")
1160 })
1161 .collect()
1162}
1163
1164#[expect(
1194 clippy::missing_panics_doc,
1195 reason = "no length number or DNA conversion problems expected"
1196)]
1197pub fn generate_contigs_denovo_repeated_seq<R: Rng, S: GetDNARestrictive>(
1198 contig_number: NonZeroU32,
1199 len_range: OrdPair<NonZeroU64>,
1200 seq: &S,
1201 rng: &mut R,
1202) -> Vec<Contig> {
1203 (0..contig_number.get())
1204 .map(|i| {
1205 let length = rng.random_range(len_range.low().get()..=len_range.high().get());
1206 let contig_seq: Vec<u8> = seq
1207 .get_dna_restrictive()
1208 .get()
1209 .iter()
1210 .cycle()
1211 .take(usize::try_from(length).expect("number conversion error"))
1212 .copied()
1213 .collect();
1214 let seq_str =
1215 String::from_utf8(contig_seq).expect("valid UTF-8 from repeated DNA sequence");
1216 ContigBuilder::default()
1217 .name(format!("contig_{i:05}"))
1218 .seq(seq_str)
1219 .build()
1220 .expect("valid DNA sequence from repeated pattern")
1221 })
1222 .collect()
1223}
1224
1225#[expect(
1262 clippy::missing_panics_doc,
1263 clippy::too_many_lines,
1264 reason = "number conversion errors or mis-generation of DNA bases are unlikely \
1265 as we generate DNA sequences ourselves here, and genomic data are unlikely \
1266 to exceed ~2^63 bp or have ~2^32 contigs. Function length is acceptable for read generation"
1267)]
1268#[expect(
1269 clippy::cast_possible_truncation,
1270 reason = "read length calculated as a fraction of contig length, managed with trunc()"
1271)]
1272pub fn generate_reads_denovo<R: Rng, S: GetDNARestrictive>(
1273 contigs: &[S],
1274 read_config: &ReadConfig,
1275 read_group: &str,
1276 rng: &mut R,
1277) -> Result<Vec<bam::Record>, Error> {
1278 if contigs.is_empty() {
1279 return Err(Error::UnavailableData("no contigs found".to_owned()));
1280 }
1281
1282 let mut reads = Vec::new();
1283
1284 for _ in 0..read_config.number.get() {
1285 let contig_idx = rng.random_range(0..contigs.len());
1287 let contig = contigs
1288 .get(contig_idx)
1289 .expect("contig_idx is within contigs range");
1290 let contig_len = contig.get_dna_restrictive().get().len() as u64;
1291
1292 #[expect(
1294 clippy::cast_precision_loss,
1295 reason = "u64->f32 causes precision loss but we are fine with this for now"
1296 )]
1297 #[expect(
1298 clippy::cast_sign_loss,
1299 reason = "these are positive numbers so no problem"
1300 )]
1301 let read_len = ((rng
1302 .random_range(read_config.len_range.low().val()..=read_config.len_range.high().val())
1303 * contig_len as f32)
1304 .trunc() as u64)
1305 .min(contig_len);
1306
1307 let start_pos = rng.random_range(0..=(contig_len.saturating_sub(read_len)));
1309
1310 let random_state: ReadState = random();
1312 let end_pos = usize::try_from(start_pos.checked_add(read_len).expect("u64 overflow"))
1313 .expect("number conversion error");
1314 let (read_seq, cigar) = {
1315 let start_idx = usize::try_from(start_pos).expect("number conversion error");
1316 let temp_seq = contig
1317 .get_dna_restrictive()
1318 .get()
1319 .get(start_idx..end_pos)
1320 .expect("start_idx and end_pos are within contig bounds")
1321 .to_vec();
1322 let mut builder = PerfectSeqMatchToNot::seq(temp_seq)?;
1323 if let Some(barcode) = read_config.barcode.clone() {
1324 builder = builder.barcode(barcode);
1325 }
1326 if let Some(delete) = read_config.delete {
1327 builder = builder.delete(delete);
1328 }
1329 if let Some(insert_middle) = read_config.insert_middle.clone() {
1330 builder = builder.insert_middle(insert_middle);
1331 }
1332 if let Some(mismatch) = read_config.mismatch {
1333 builder = builder.mismatch(mismatch);
1334 }
1335 builder.build(random_state, rng)?
1336 };
1337
1338 let qual: Vec<u8> = iter::repeat_with(|| {
1340 rng.random_range(RangeInclusive::from(read_config.base_qual_range))
1341 })
1342 .take(read_seq.len())
1343 .collect();
1344
1345 let seq: DNARestrictive;
1347 let (mod_prob_mm_tag, mod_pos_ml_tag) = generate_random_dna_modification(
1348 &read_config.mods,
1349 match random_state.strand() {
1350 '.' | '+' => {
1351 seq = DNARestrictive::from_str(str::from_utf8(&read_seq).expect("no error"))
1352 .expect("no error");
1353 &seq
1354 }
1355 '-' => {
1356 seq = DNARestrictive::from_str(
1357 str::from_utf8(&bio::alphabets::dna::revcomp(&read_seq)).expect("no error"),
1358 )
1359 .expect("no error");
1360 &seq
1361 }
1362 _ => unreachable!("`strand` is supposed to return one of +/-/."),
1363 },
1364 rng,
1365 );
1366
1367 let record = {
1369 let mut record = bam::Record::new();
1370 let qname = format!("{}.{}", read_group, Uuid::new_v4()).into_bytes();
1371 record.unset_flags();
1372 record.set_flags(u16::from(random_state));
1373 if random_state == ReadState::Unmapped {
1374 record.set(&qname, None, &read_seq, &qual);
1375 record.set_mapq(255);
1376 record.set_tid(-1);
1377 record.set_pos(-1);
1378 } else {
1379 let mapq = rng.random_range(RangeInclusive::from(read_config.mapq_range));
1380 record.set(&qname, cigar.as_ref(), &read_seq, &qual);
1381 record.set_mapq(mapq);
1382 record.set_tid(i32::try_from(contig_idx).expect("number conversion error"));
1383 record.set_pos(i64::try_from(start_pos).expect("number conversion error"));
1384 }
1385 record.set_mpos(-1);
1386 record.set_mtid(-1);
1387 record.push_aux(b"RG", Aux::String(read_group))?;
1388 if !mod_prob_mm_tag.is_empty() && !mod_pos_ml_tag.is_empty() {
1389 record.push_aux(b"MM", Aux::String(mod_prob_mm_tag.as_str()))?;
1390 record.push_aux(b"ML", Aux::ArrayU8((&mod_pos_ml_tag).into()))?;
1391 }
1392 record
1393 };
1394 reads.push(record);
1395 }
1396
1397 Ok(reads)
1398}
1399
1400pub fn run<F>(
1445 config: SimulationConfig,
1446 bam_output_path: &F,
1447 fasta_output_path: &F,
1448) -> Result<(), Error>
1449where
1450 F: AsRef<Path> + ?Sized,
1451{
1452 let mut rng = rand::rng();
1453
1454 let contigs = match config.contigs.repeated_seq {
1455 Some(seq) => generate_contigs_denovo_repeated_seq(
1456 config.contigs.number,
1457 config.contigs.len_range,
1458 &seq,
1459 &mut rng,
1460 ),
1461 None => generate_contigs_denovo(config.contigs.number, config.contigs.len_range, &mut rng),
1462 };
1463 let read_groups: Vec<String> = (0..config.reads.len()).map(|k| k.to_string()).collect();
1464 let reads = {
1465 let mut temp_reads = Vec::new();
1466 for k in config.reads.into_iter().zip(read_groups.clone()) {
1467 temp_reads.append(&mut generate_reads_denovo(&contigs, &k.0, &k.1, &mut rng)?);
1468 }
1469 temp_reads.sort_by_key(|k| (k.is_unmapped(), k.tid(), k.pos(), k.is_reverse()));
1470 temp_reads
1471 };
1472
1473 write_bam_denovo(
1474 reads,
1475 contigs
1476 .iter()
1477 .map(|k| (k.name.clone(), k.get_dna_restrictive().get().len())),
1478 read_groups,
1479 vec![String::from("simulated BAM file, not real data")],
1480 bam_output_path,
1481 )?;
1482 write_fasta(
1483 contigs.into_iter().map(|k| (k.name.clone(), k)),
1484 fasta_output_path,
1485 )?;
1486
1487 Ok(())
1488}
1489
1490#[derive(Debug, Serialize, Deserialize)]
1495pub struct TempBamSimulation {
1496 bam_path: String,
1498 fasta_path: String,
1500}
1501
1502impl TempBamSimulation {
1503 pub fn new(config: SimulationConfig) -> Result<Self, Error> {
1508 let temp_dir = std::env::temp_dir();
1509 let bam_path = temp_dir.join(format!("{}.bam", Uuid::new_v4()));
1510 let fasta_path = temp_dir.join(format!("{}.fa", Uuid::new_v4()));
1511
1512 run(config, &bam_path, &fasta_path)?;
1513
1514 Ok(Self {
1515 bam_path: bam_path.to_string_lossy().to_string(),
1516 fasta_path: fasta_path.to_string_lossy().to_string(),
1517 })
1518 }
1519
1520 #[must_use]
1522 pub fn bam_path(&self) -> &str {
1523 &self.bam_path
1524 }
1525
1526 #[must_use]
1528 pub fn fasta_path(&self) -> &str {
1529 &self.fasta_path
1530 }
1531}
1532
1533impl Drop for TempBamSimulation {
1534 fn drop(&mut self) {
1535 drop(std::fs::remove_file(&self.bam_path));
1537 drop(std::fs::remove_file(&self.fasta_path));
1538 drop(std::fs::remove_file(&(self.bam_path.clone() + ".bai")));
1539 }
1540}
1541
1542#[cfg(test)]
1543mod random_dna_generation_test {
1544 use super::*;
1545
1546 #[test]
1548 fn generate_random_dna_sequence_works() {
1549 let seq = generate_random_dna_sequence(NonZeroU64::new(100).unwrap(), &mut rand::rng());
1550 assert_eq!(seq.len(), 100);
1551 for base in seq {
1552 assert!([b'A', b'C', b'G', b'T'].contains(&base));
1553 }
1554 }
1555}
1556
1557#[cfg(test)]
1558mod read_generation_no_mods_tests {
1559 use super::*;
1560 use rust_htslib::bam::Read as _;
1561
1562 #[test]
1564 fn generate_reads_denovo_no_mods_works() {
1565 let mut rng = rand::rng();
1566 let contigs = vec![
1567 ContigBuilder::default()
1568 .name("contig_0")
1569 .seq("ACGTACGTACGTACGTACGT".into())
1570 .build()
1571 .unwrap(),
1572 ContigBuilder::default()
1573 .name("contig_1")
1574 .seq("TGCATGCATGCATGCATGCA".into())
1575 .build()
1576 .unwrap(),
1577 ];
1578
1579 let config = ReadConfigBuilder::default()
1580 .number(10)
1581 .mapq_range((10, 20))
1582 .base_qual_range((30, 50))
1583 .len_range((0.2, 0.8))
1584 .build()
1585 .unwrap();
1586
1587 let reads = generate_reads_denovo(&contigs, &config, "ph", &mut rng).unwrap();
1588 assert_eq!(reads.len(), 10);
1589
1590 for read in &reads {
1591 if !read.is_unmapped() {
1593 assert!((10..=20).contains(&read.mapq()));
1594 assert!((0..2).contains(&read.tid()));
1595 }
1596
1597 assert!((4..=16).contains(&read.seq_len()));
1599 assert!(read.qual().iter().all(|x| (30..=50).contains(x)));
1600 let _: rust_htslib::errors::Error = read.aux(b"MM").unwrap_err();
1601 let _: rust_htslib::errors::Error = read.aux(b"ML").unwrap_err();
1602
1603 assert_eq!(read.qname().get(0..3).unwrap(), *b"ph.");
1605 let _: Uuid =
1606 Uuid::parse_str(str::from_utf8(read.qname().get(3..).unwrap()).unwrap()).unwrap();
1607 }
1608 }
1609
1610 #[test]
1612 fn full_bam_generation() {
1613 let config_json = r#"{
1614 "contigs": {
1615 "number": 2,
1616 "len_range": [100, 200]
1617 },
1618 "reads": [{
1619 "number": 1000,
1620 "mapq_range": [10, 20],
1621 "base_qual_range": [10, 20],
1622 "len_range": [0.1, 0.8]
1623 }]
1624 }"#;
1625
1626 let temp_dir = std::env::temp_dir();
1628 let bam_path = temp_dir.join(format!("{}.bam", Uuid::new_v4()));
1629 let fasta_path = temp_dir.join(format!("{}.fa", Uuid::new_v4()));
1630
1631 let config: SimulationConfig = serde_json::from_str(config_json).unwrap();
1632 run(config, &bam_path, &fasta_path).unwrap();
1633
1634 assert!(bam_path.exists());
1635 assert!(fasta_path.exists());
1636
1637 let mut reader = bam::Reader::from_path(&bam_path).unwrap();
1639 let header = reader.header();
1640 assert_eq!(header.target_count(), 2);
1641 assert_eq!(reader.records().count(), 1000);
1642
1643 let bai_path = bam_path.with_extension("bam.bai");
1645 std::fs::remove_file(&bam_path).unwrap();
1646 std::fs::remove_file(fasta_path).unwrap();
1647 std::fs::remove_file(bai_path).unwrap();
1648 }
1649
1650 #[test]
1652 fn temp_bam_simulation_struct_no_mods() {
1653 let config_json = r#"{
1654 "contigs": {
1655 "number": 2,
1656 "len_range": [100, 200]
1657 },
1658 "reads": [{
1659 "number": 1000,
1660 "mapq_range": [10, 20],
1661 "base_qual_range": [10, 20],
1662 "len_range": [0.1, 0.8]
1663 }]
1664 }"#;
1665
1666 let config: SimulationConfig = serde_json::from_str(config_json).unwrap();
1668 let sim = TempBamSimulation::new(config).unwrap();
1669
1670 assert!(Path::new(sim.bam_path()).exists());
1672 assert!(Path::new(sim.fasta_path()).exists());
1673
1674 let mut reader = bam::Reader::from_path(sim.bam_path()).unwrap();
1676 let header = reader.header();
1677 assert_eq!(header.target_count(), 2);
1678 assert_eq!(reader.records().count(), 1000);
1679 }
1680
1681 #[test]
1683 fn temp_bam_simulation_cleanup() {
1684 let config_json = r#"{
1685 "contigs": {
1686 "number": 1,
1687 "len_range": [50, 50]
1688 },
1689 "reads": [{
1690 "number": 10,
1691 "len_range": [0.5, 0.5]
1692 }]
1693 }"#;
1694
1695 let bam_path: String;
1696 let fasta_path: String;
1697
1698 {
1699 let config: SimulationConfig = serde_json::from_str(config_json).unwrap();
1700 let sim = TempBamSimulation::new(config).unwrap();
1701 bam_path = sim.bam_path().to_string();
1702 fasta_path = sim.fasta_path().to_string();
1703
1704 assert!(Path::new(&bam_path).exists());
1706 assert!(Path::new(&fasta_path).exists());
1707 } assert!(!Path::new(&bam_path).exists());
1711 assert!(!Path::new(&fasta_path).exists());
1712 }
1713
1714 #[test]
1716 fn generate_reads_denovo_empty_contigs_error() {
1717 let contigs: Vec<Contig> = vec![];
1718 let config = ReadConfig::default();
1719 let mut rng = rand::rng();
1720
1721 let result = generate_reads_denovo(&contigs, &config, "test", &mut rng);
1722 assert!(matches!(result, Err(Error::UnavailableData(_))));
1723 }
1724
1725 #[test]
1727 fn generate_reads_denovo_zero_length_error() {
1728 let contigs = [ContigBuilder::default()
1729 .name("tiny")
1730 .seq("ACGT".into())
1731 .build()
1732 .unwrap()];
1733
1734 let config = ReadConfigBuilder::default()
1735 .number(1)
1736 .len_range((0.0, 0.0))
1737 .build()
1738 .unwrap();
1739
1740 let mut rng = rand::rng();
1741 let result = generate_reads_denovo(&contigs, &config, "test", &mut rng);
1742 assert!(matches!(result, Err(Error::InvalidState(_))));
1743 }
1744
1745 #[test]
1747 fn run_empty_reads_error() {
1748 let invalid_json = r#"{ "reads": [] }"#; let temp_dir = std::env::temp_dir();
1750 let bam_path = temp_dir.join(format!("{}.bam", Uuid::new_v4()));
1751 let fasta_path = temp_dir.join(format!("{}.fa", Uuid::new_v4()));
1752
1753 let config: SimulationConfig = serde_json::from_str(invalid_json).unwrap();
1754 let result = run(config, &bam_path, &fasta_path);
1755 drop(result);
1758 let bai_path = bam_path.with_extension("bam.bai");
1759 drop(std::fs::remove_file(&bam_path));
1760 drop(std::fs::remove_file(&fasta_path));
1761 drop(std::fs::remove_file(&bai_path));
1762 }
1763
1764 #[test]
1766 fn multiple_read_groups_work() {
1767 let config_json = r#"{
1768 "contigs": {
1769 "number": 2,
1770 "len_range": [100, 200]
1771 },
1772 "reads": [
1773 {
1774 "number": 50,
1775 "mapq_range": [10, 20],
1776 "base_qual_range": [20, 30],
1777 "len_range": [0.1, 0.5]
1778 },
1779 {
1780 "number": 75,
1781 "mapq_range": [30, 40],
1782 "base_qual_range": [25, 35],
1783 "len_range": [0.3, 0.7]
1784 },
1785 {
1786 "number": 25,
1787 "mapq_range": [5, 15],
1788 "base_qual_range": [15, 25],
1789 "len_range": [0.2, 0.6]
1790 }
1791 ]
1792 }"#;
1793
1794 let config: SimulationConfig = serde_json::from_str(config_json).unwrap();
1795 let sim = TempBamSimulation::new(config).unwrap();
1796 let mut reader = bam::Reader::from_path(sim.bam_path()).unwrap();
1797
1798 assert_eq!(reader.records().count(), 150);
1800
1801 let mut reader2 = bam::Reader::from_path(sim.bam_path()).unwrap();
1803 let header = reader2.header();
1804 let header_text = std::str::from_utf8(header.as_bytes()).unwrap();
1805 assert!(header_text.contains("@RG\tID:0"));
1806 assert!(header_text.contains("@RG\tID:1"));
1807 assert!(header_text.contains("@RG\tID:2"));
1808
1809 let mut rg_counts = [0, 0, 0];
1811 for record in reader2.records() {
1812 let read = record.unwrap();
1813 let qname = std::str::from_utf8(read.qname()).unwrap();
1814
1815 if let Ok(Aux::String(rg)) = read.aux(b"RG") {
1816 let parts: Vec<&str> = qname.split('.').collect();
1818 assert_eq!(parts.len(), 2, "Read name should be in format 'n.uuid'");
1819
1820 let read_group_prefix =
1821 parts.first().expect("parts should have at least 1 element");
1822 let uuid_part = parts.get(1).expect("parts should have 2 elements");
1823
1824 assert_eq!(
1825 *read_group_prefix, rg,
1826 "Read name prefix should match read group"
1827 );
1828
1829 assert!(
1831 Uuid::parse_str(uuid_part).is_ok(),
1832 "Read name should contain a valid UUID after the dot"
1833 );
1834
1835 match rg {
1836 "0" => *rg_counts.get_mut(0).unwrap() += 1,
1837 "1" => *rg_counts.get_mut(1).unwrap() += 1,
1838 "2" => *rg_counts.get_mut(2).unwrap() += 1,
1839 unexpected => unreachable!("Unexpected read group: {unexpected}"),
1840 }
1841 }
1842 }
1843 assert_eq!(rg_counts[0], 50);
1844 assert_eq!(rg_counts[1], 75);
1845 assert_eq!(rg_counts[2], 25);
1846 }
1847
1848 #[expect(
1850 clippy::cast_sign_loss,
1851 reason = "tid and pos are non-negative in valid BAM"
1852 )]
1853 #[expect(
1854 clippy::cast_possible_truncation,
1855 reason = "tid fits in usize for normal BAM files"
1856 )]
1857 #[test]
1858 fn read_sequences_match_contigs() {
1859 let contigs = vec![
1860 ContigBuilder::default()
1861 .name("chr1")
1862 .seq("ACGTACGTACGTACGTACGTACGTACGTACGT".into())
1863 .build()
1864 .unwrap(),
1865 ContigBuilder::default()
1866 .name("chr2")
1867 .seq("TGCATGCATGCATGCATGCATGCATGCATGCA".into())
1868 .build()
1869 .unwrap(),
1870 ];
1871
1872 let read_config = ReadConfigBuilder::default()
1873 .number(50)
1874 .len_range((0.2, 0.8))
1875 .build()
1876 .unwrap();
1877
1878 let mut rng = rand::rng();
1879 let reads = generate_reads_denovo(&contigs, &read_config, "test", &mut rng).unwrap();
1880
1881 let mut validated_count = 0;
1882 for read in &reads {
1883 if !read.is_unmapped() && !read.is_reverse() {
1885 let cigar = read.cigar();
1886
1887 let has_soft_clip = cigar.iter().any(|op| matches!(op, Cigar::SoftClip(_)));
1889 if has_soft_clip {
1890 continue;
1891 }
1892
1893 let tid = read.tid() as usize;
1894 let pos = read.pos() as usize;
1895 let contig = contigs.get(tid).expect("valid tid");
1896 let contig_seq = contig.get_dna_restrictive().get();
1897
1898 let mut aligned_len = 0usize;
1900 for op in &cigar {
1901 if let Cigar::Match(len) = *op {
1902 aligned_len = aligned_len.checked_add(len as usize).expect("overflow");
1903 }
1904 }
1905
1906 let read_seq = read.seq().as_bytes();
1907 let expected_seq = contig_seq
1908 .get(pos..pos.checked_add(aligned_len).expect("overflow"))
1909 .expect("contig subsequence exists");
1910
1911 assert_eq!(
1912 read_seq, expected_seq,
1913 "Forward read sequence should exactly match contig substring"
1914 );
1915 validated_count += 1;
1916 }
1917 }
1918
1919 assert!(
1921 validated_count > 0,
1922 "Should have validated at least some forward reads"
1923 );
1924 }
1925
1926 #[test]
1928 fn different_read_states_work() {
1929 let config_json = r#"{
1930 "contigs": {
1931 "number": 1,
1932 "len_range": [200, 200]
1933 },
1934 "reads": [{
1935 "number": 1000,
1936 "mapq_range": [10, 20],
1937 "base_qual_range": [20, 30],
1938 "len_range": [0.2, 0.8]
1939 }]
1940 }"#;
1941
1942 let config: SimulationConfig = serde_json::from_str(config_json).unwrap();
1943 let sim = TempBamSimulation::new(config).unwrap();
1944 let mut reader = bam::Reader::from_path(sim.bam_path()).unwrap();
1945
1946 let mut has_unmapped = false;
1947 let mut has_forward = false;
1948 let mut has_reverse = false;
1949 let mut has_secondary = false;
1950 let mut has_supplementary = false;
1951
1952 for record in reader.records() {
1953 let read = record.unwrap();
1954
1955 if read.is_unmapped() {
1956 has_unmapped = true;
1957 assert_eq!(read.tid(), -1);
1959 assert_eq!(read.pos(), -1);
1960 assert_eq!(read.mapq(), 255);
1961 } else {
1962 if read.is_reverse() {
1963 has_reverse = true;
1964 } else {
1965 has_forward = true;
1966 }
1967
1968 if read.is_secondary() {
1969 has_secondary = true;
1970 }
1971
1972 if read.is_supplementary() {
1973 has_supplementary = true;
1974 }
1975 }
1976 }
1977
1978 assert!(has_unmapped, "Should have unmapped reads");
1980 assert!(has_forward, "Should have forward reads");
1981 assert!(has_reverse, "Should have reverse reads");
1982 assert!(has_secondary, "Should have secondary reads");
1983 assert!(has_supplementary, "Should have supplementary reads");
1984 }
1985
1986 #[test]
1988 fn read_length_full_contig_works() {
1989 let contigs = vec![
1990 ContigBuilder::default()
1991 .name("chr1")
1992 .seq("ACGTACGTACGTACGTACGTACGTACGTACGT".into())
1993 .build()
1994 .unwrap(),
1995 ];
1996
1997 let config_full_length = ReadConfigBuilder::default()
1998 .number(20)
1999 .len_range((1.0, 1.0))
2000 .build()
2001 .unwrap();
2002
2003 let mut rng = rand::rng();
2004 let reads_full =
2005 generate_reads_denovo(&contigs, &config_full_length, "test", &mut rng).unwrap();
2006
2007 for read in &reads_full {
2009 if !read.is_unmapped() {
2010 assert_eq!(read.seq_len(), 32, "Read should be full contig length");
2011 }
2012 }
2013 }
2014
2015 #[test]
2017 fn read_length_small_fraction_works() {
2018 let contigs = [ContigBuilder::default()
2020 .name("chr1")
2021 .seq("ACGT".repeat(50))
2022 .build()
2023 .unwrap()];
2024
2025 let config_small = ReadConfigBuilder::default()
2026 .number(20)
2027 .len_range((0.01, 0.05))
2028 .build()
2029 .unwrap();
2030
2031 let mut rng = rand::rng();
2032 let reads_small = generate_reads_denovo(&contigs, &config_small, "test", &mut rng).unwrap();
2033
2034 for read in &reads_small {
2036 if !read.is_unmapped() {
2037 assert!(read.seq_len() <= 10, "Read should be very small");
2038 assert!(read.seq_len() >= 2, "Read should be at least 2bp");
2039 }
2040 }
2041 }
2042}
2043
2044#[cfg(test)]
2045mod read_generation_barcodes {
2046 use super::*;
2047 use rust_htslib::bam::Read as _;
2048
2049 #[expect(
2051 clippy::shadow_unrelated,
2052 reason = "repetition is fine; each block is clearly separated"
2053 )]
2054 #[test]
2055 fn add_barcode_works() {
2056 let read_seq = b"GGGGGGGG".to_vec();
2057 let barcode = DNARestrictive::from_str("ACGTAA").unwrap();
2058
2059 let result = add_barcode(&read_seq, barcode.clone(), ReadState::PrimaryFwd);
2061 assert_eq!(result, b"ACGTAAGGGGGGGGTTACGT".to_vec());
2062
2063 let result = add_barcode(&read_seq, barcode, ReadState::PrimaryRev);
2065 assert_eq!(result, b"TGCATTGGGGGGGGAATGCA".to_vec());
2066 }
2067
2068 #[test]
2070 fn cigar_strings_valid_with_or_without_barcodes() {
2071 let contigs = [ContigBuilder::default()
2073 .name("chr1")
2074 .seq("ACGTACGTACGTACGTACGTACGTACGTACGT".into())
2075 .build()
2076 .unwrap()];
2077
2078 let config_no_barcode = ReadConfigBuilder::default()
2079 .number(20)
2080 .len_range((0.3, 0.7))
2081 .build()
2082 .unwrap();
2083
2084 let mut rng = rand::rng();
2085 let reads_no_barcode =
2086 generate_reads_denovo(&contigs, &config_no_barcode, "test", &mut rng).unwrap();
2087
2088 for read in &reads_no_barcode {
2089 if !read.is_unmapped() {
2090 let cigar = read.cigar();
2091 assert_eq!(cigar.len(), 1);
2093 assert!(matches!(cigar.first().unwrap(), Cigar::Match(_)));
2094
2095 let cigar_len: u32 = cigar
2097 .iter()
2098 .map(|op| match *op {
2099 Cigar::Match(len) | Cigar::SoftClip(len) => len,
2100 Cigar::Ins(_)
2101 | Cigar::Del(_)
2102 | Cigar::RefSkip(_)
2103 | Cigar::HardClip(_)
2104 | Cigar::Pad(_)
2105 | Cigar::Equal(_)
2106 | Cigar::Diff(_) => unreachable!(),
2107 })
2108 .sum();
2109 assert_eq!(cigar_len as usize, read.seq_len());
2110 }
2111 }
2112
2113 let config_with_barcode = ReadConfigBuilder::default()
2115 .number(20)
2116 .len_range((0.3, 0.7))
2117 .barcode("ACGTAA".into())
2118 .build()
2119 .unwrap();
2120
2121 let reads_with_barcode =
2122 generate_reads_denovo(&contigs, &config_with_barcode, "test", &mut rng).unwrap();
2123
2124 for read in &reads_with_barcode {
2125 if !read.is_unmapped() {
2126 let cigar = read.cigar();
2127 assert_eq!(cigar.len(), 3);
2129 assert!(matches!(cigar.first().unwrap(), Cigar::SoftClip(6))); assert!(matches!(cigar.get(1).unwrap(), Cigar::Match(_)));
2131 assert!(matches!(cigar.get(2).unwrap(), Cigar::SoftClip(6))); let cigar_len: u32 = cigar
2135 .iter()
2136 .map(|op| match *op {
2137 Cigar::Match(len) | Cigar::SoftClip(len) => len,
2138 Cigar::Ins(_)
2139 | Cigar::Del(_)
2140 | Cigar::RefSkip(_)
2141 | Cigar::HardClip(_)
2142 | Cigar::Pad(_)
2143 | Cigar::Equal(_)
2144 | Cigar::Diff(_) => unreachable!(),
2145 })
2146 .sum();
2147 assert_eq!(cigar_len as usize, read.seq_len());
2148 }
2149 }
2150 }
2151
2152 #[test]
2154 fn generate_reads_denovo_with_barcode_works() {
2155 let config_json = r#"{
2156 "contigs": {
2157 "number": 1,
2158 "len_range": [20, 20]
2159 },
2160 "reads": [{
2161 "number": 10,
2162 "len_range": [0.2, 0.8],
2163 "barcode": "GTACGG"
2164 }]
2165 }"#;
2166
2167 let config: SimulationConfig = serde_json::from_str(config_json).unwrap();
2168 let sim = TempBamSimulation::new(config).unwrap();
2169 let mut reader = bam::Reader::from_path(sim.bam_path()).unwrap();
2170
2171 for record in reader.records() {
2172 let read = record.unwrap();
2173 let seq = read.seq().as_bytes();
2174 let seq_len = seq.len();
2175
2176 if read.is_reverse() {
2177 assert_eq!(seq.get(..6).expect("seq has at least 6 bases"), b"CATGCC");
2179 assert_eq!(
2180 seq.get(seq_len.saturating_sub(6)..)
2181 .expect("seq has at least 6 bases"),
2182 b"GGCATG"
2183 );
2184 } else {
2185 assert_eq!(seq.get(..6).expect("seq has at least 6 bases"), b"GTACGG");
2187 assert_eq!(
2188 seq.get(seq_len.saturating_sub(6)..)
2189 .expect("seq has at least 6 bases"),
2190 b"CCGTAC"
2191 );
2192 }
2193 }
2194 }
2195}
2196
2197#[cfg(test)]
2198mod read_generation_with_mods_tests {
2199 use super::*;
2200 use crate::{CurrRead, ThresholdState, curr_reads_to_dataframe};
2201 use rust_htslib::bam::Read as _;
2202
2203 #[test]
2205 fn generate_reads_denovo_with_mods_works() {
2206 let mut rng = rand::rng();
2207 let contigs = vec![
2208 ContigBuilder::default()
2209 .name("contig_0")
2210 .seq("ACGTACGTACGTACGTACGT".into())
2211 .build()
2212 .unwrap(),
2213 ContigBuilder::default()
2214 .name("contig_1")
2215 .seq("TGCATGCATGCATGCATGCA".into())
2216 .build()
2217 .unwrap(),
2218 ];
2219
2220 let config = ReadConfigBuilder::default()
2221 .number(10000)
2222 .mapq_range((10, 20))
2223 .base_qual_range((30, 50))
2224 .len_range((0.2, 0.8))
2225 .mods(
2226 [ModConfigBuilder::default()
2227 .base('C')
2228 .mod_code("m".into())
2229 .win([4].into())
2230 .mod_range([(0f32, 1f32)].into())
2231 .build()
2232 .unwrap()]
2233 .into(),
2234 )
2235 .build()
2236 .unwrap();
2237
2238 let reads = generate_reads_denovo(&contigs, &config, "1", &mut rng).unwrap();
2239 assert_eq!(reads.len(), 10000);
2240
2241 let mut zero_to_255_visited = vec![false; 256];
2242
2243 for read in &reads {
2244 if !read.is_unmapped() {
2246 assert!((10..=20).contains(&read.mapq()));
2247 assert!((0..2).contains(&read.tid()));
2248 }
2249
2250 assert!((4..=16).contains(&read.seq_len()));
2252 assert!(read.qual().iter().all(|x| (30..=50).contains(x)));
2253
2254 let Aux::String(mod_pos_mm_tag) = read.aux(b"MM").unwrap() else {
2256 unreachable!()
2257 };
2258 let Aux::ArrayU8(mod_prob_ml_tag) = read.aux(b"ML").unwrap() else {
2259 unreachable!()
2260 };
2261 let pos: Vec<&str> = mod_pos_mm_tag
2263 .strip_prefix("C+m?,")
2264 .unwrap()
2265 .strip_suffix(';')
2266 .unwrap()
2267 .split(',')
2268 .collect();
2269 assert!(pos.iter().all(|&x| x == "0"), "All MM values should be 0");
2270
2271 #[expect(
2273 clippy::indexing_slicing,
2274 reason = "we are not gonna bother, this should be fine"
2275 )]
2276 for k in 0..mod_prob_ml_tag.len() {
2277 zero_to_255_visited[usize::from(mod_prob_ml_tag.get(k).expect("no error"))] = true;
2278 }
2279 }
2280
2281 assert!(zero_to_255_visited.into_iter().all(|x| x));
2284 }
2285
2286 #[test]
2288 fn temp_bam_simulation_struct_with_mods() {
2289 let config_json = r#"{
2290 "contigs": {
2291 "number": 2,
2292 "len_range": [100, 200]
2293 },
2294 "reads": [{
2295 "number": 1000,
2296 "mapq_range": [10, 20],
2297 "base_qual_range": [10, 20],
2298 "len_range": [0.1, 0.8],
2299 "mods": [{
2300 "base": "T",
2301 "is_strand_plus": true,
2302 "mod_code": "T",
2303 "win": [4, 5],
2304 "mod_range": [[0.1, 0.2], [0.3, 0.4]]
2305 }]
2306 }]
2307 }"#;
2308
2309 let config: SimulationConfig = serde_json::from_str(config_json).unwrap();
2311 let sim = TempBamSimulation::new(config).unwrap();
2312
2313 assert!(Path::new(sim.bam_path()).exists());
2315 assert!(Path::new(sim.fasta_path()).exists());
2316
2317 let mut reader = bam::Reader::from_path(sim.bam_path()).unwrap();
2319 let header = reader.header();
2320 assert_eq!(header.target_count(), 2);
2321 assert_eq!(reader.records().count(), 1000);
2322 }
2323
2324 #[test]
2326 fn generate_random_dna_modification_empty_config() {
2327 let seq = DNARestrictive::from_str("ACGTACGT").unwrap();
2328 let mod_configs: Vec<ModConfig> = vec![];
2329 let mut rng = rand::rng();
2330
2331 let (mm_str, ml_vec) = generate_random_dna_modification(&mod_configs, &seq, &mut rng);
2332
2333 assert_eq!(mm_str, String::new());
2334 assert_eq!(ml_vec.len(), 0);
2335 }
2336
2337 #[test]
2339 fn generate_random_dna_modification_single_mod() {
2340 let seq = DNARestrictive::from_str("ACGTCGCGATCG").unwrap();
2341 let mod_config = ModConfigBuilder::default()
2342 .base('C')
2343 .mod_code("m".into())
2344 .win(vec![2])
2345 .mod_range(vec![(0.5, 0.5)])
2346 .build()
2347 .unwrap();
2348
2349 let mut rng = rand::rng();
2350 let (mm_str, ml_vec) = generate_random_dna_modification(&[mod_config], &seq, &mut rng);
2351
2352 assert_eq!(ml_vec.len(), 4);
2354 assert!(ml_vec.iter().all(|&x| x == 128u8));
2356 let probs: Vec<&str> = mm_str
2358 .strip_prefix("C+m?,")
2359 .unwrap()
2360 .strip_suffix(';')
2361 .unwrap()
2362 .split(',')
2363 .collect();
2364 assert_eq!(probs.len(), 4);
2365 assert!(probs.iter().all(|&x| x == "0"));
2366 }
2367
2368 #[test]
2370 fn generate_random_dna_modification_multiple_mods() {
2371 let seq = DNARestrictive::from_str("ACGTACGT").unwrap();
2372
2373 let mod_config_c = ModConfigBuilder::default()
2374 .base('C')
2375 .mod_code('m'.into())
2376 .win(vec![1])
2377 .mod_range(vec![(0.8, 0.8)])
2378 .build()
2379 .unwrap();
2380
2381 let mod_config_t = ModConfigBuilder::default()
2382 .base('T')
2383 .is_strand_plus(false)
2384 .mod_code('t'.into())
2385 .win(vec![1])
2386 .mod_range(vec![(0.4, 0.4)])
2387 .build()
2388 .unwrap();
2389
2390 let mut rng = rand::rng();
2391 let (mm_str, ml_vec) =
2392 generate_random_dna_modification(&[mod_config_c, mod_config_t], &seq, &mut rng);
2393
2394 assert!(mm_str.contains("C+m?,"));
2396 assert!(mm_str.contains("T-t?,"));
2397 assert_eq!(
2398 ml_vec,
2399 vec![204u8, 204u8, 102u8, 102u8],
2400 "ML values should be 204,204,102,102"
2401 );
2402
2403 let c_section = mm_str
2405 .split(';')
2406 .find(|s| s.starts_with("C+m?"))
2407 .expect("Should have C+m? section");
2408 let c_probs: Vec<&str> = c_section
2409 .strip_prefix("C+m?,")
2410 .unwrap()
2411 .split(',')
2412 .collect();
2413 assert_eq!(
2414 c_probs.len(),
2415 2,
2416 "Should have exactly 2 C modifications in ACGTACGT"
2417 );
2418 assert!(
2420 c_probs.iter().all(|&x| x == "0"),
2421 "All C modifications should have gap pos 0",
2422 );
2423
2424 let t_section = mm_str
2426 .split(';')
2427 .find(|s| s.starts_with("T-t?"))
2428 .expect("Should have T-t? section");
2429 let t_probs: Vec<&str> = t_section
2430 .strip_prefix("T-t?,")
2431 .unwrap()
2432 .split(',')
2433 .collect();
2434 assert_eq!(
2435 t_probs.len(),
2436 2,
2437 "Should have exactly 2 T modifications in ACGTACGT"
2438 );
2439 assert!(
2441 t_probs.iter().all(|&x| x == "0"),
2442 "All T modifications should have a gap pos of 0"
2443 );
2444 }
2445
2446 #[test]
2448 fn generate_random_dna_modification_n_base() {
2449 let seq = DNARestrictive::from_str("ACGT").unwrap();
2450
2451 let mod_config = ModConfigBuilder::default()
2452 .base('N')
2453 .is_strand_plus(true)
2454 .mod_code("n".into())
2455 .win([4].into())
2456 .mod_range([(0.5, 0.5)].into())
2457 .build()
2458 .unwrap();
2459 let mut rng = rand::rng();
2460 let (mm_str, ml_vec) = generate_random_dna_modification(&[mod_config], &seq, &mut rng);
2461
2462 assert_eq!(ml_vec.len(), 4);
2464 let pos: Vec<&str> = mm_str
2466 .strip_prefix("N+n?,")
2467 .unwrap()
2468 .strip_suffix(';')
2469 .unwrap()
2470 .split(',')
2471 .collect();
2472 assert_eq!(pos.len(), 4, "Should have exactly 4 modifications for ACGT");
2473
2474 assert!(
2476 pos.iter().all(|&x| x == "0"),
2477 "All N base modifications should have 0 as their gap position coordinate"
2478 );
2479
2480 assert_eq!(
2482 ml_vec,
2483 vec![128u8, 128u8, 128u8, 128u8],
2484 "All ML values should be 128 and number of values is 4"
2485 );
2486 }
2487
2488 #[test]
2490 fn generate_random_dna_modification_cycling_windows() {
2491 let seq = DNARestrictive::from_str("CCCCCCCCCCCCCCCC").unwrap(); let mod_config = ModConfigBuilder::default()
2494 .base('C')
2495 .mod_code("m".into())
2496 .win([3, 2].into())
2497 .mod_range([(0.8, 0.8), (0.4, 0.4)].into())
2498 .build()
2499 .unwrap();
2500
2501 let mut rng = rand::rng();
2502 let (mm_str, ml_vec) = generate_random_dna_modification(&[mod_config], &seq, &mut rng);
2503
2504 let pos: Vec<&str> = mm_str
2506 .strip_prefix("C+m?,")
2507 .unwrap()
2508 .strip_suffix(';')
2509 .unwrap()
2510 .split(',')
2511 .collect();
2512 assert_eq!(pos, vec!["0"; 16]);
2513 let expected_pattern: Vec<u8> = vec![
2515 204, 204, 204, 102, 102, 204, 204, 204, 102, 102, 204, 204, 204, 102, 102, 204,
2516 ];
2517 assert_eq!(ml_vec, expected_pattern);
2518 }
2519 #[expect(
2521 clippy::similar_names,
2522 reason = "has_c_mod, has_a_mod, has_t_mod are clear in context"
2523 )]
2524 #[test]
2525 fn multiple_simultaneous_modifications_work() {
2526 let config_json = r#"{
2527 "contigs": {
2528 "number": 1,
2529 "len_range": [100, 100],
2530 "repeated_seq": "ACGTACGTACGTACGT"
2531 },
2532 "reads": [{
2533 "number": 20,
2534 "mapq_range": [20, 30],
2535 "base_qual_range": [20, 30],
2536 "len_range": [0.8, 1.0],
2537 "mods": [
2538 {
2539 "base": "C",
2540 "is_strand_plus": true,
2541 "mod_code": "m",
2542 "win": [2],
2543 "mod_range": [[0.7, 0.9]]
2544 },
2545 {
2546 "base": "A",
2547 "is_strand_plus": true,
2548 "mod_code": "a",
2549 "win": [3],
2550 "mod_range": [[0.3, 0.5]]
2551 },
2552 {
2553 "base": "T",
2554 "is_strand_plus": false,
2555 "mod_code": "t",
2556 "win": [1],
2557 "mod_range": [[0.5, 0.6]]
2558 }
2559 ]
2560 }]
2561 }"#;
2562
2563 let config: SimulationConfig = serde_json::from_str(config_json).unwrap();
2564 let sim = TempBamSimulation::new(config).unwrap();
2565 let mut reader = bam::Reader::from_path(sim.bam_path()).unwrap();
2566
2567 let mut has_c_mod = false;
2568 let mut has_a_mod = false;
2569 let mut has_t_mod = false;
2570
2571 for record in reader.records() {
2572 let read = record.unwrap();
2573 if let Ok(Aux::String(mm_tag)) = read.aux(b"MM") {
2574 if mm_tag.contains("C+m?") {
2576 has_c_mod = true;
2577 }
2578 if mm_tag.contains("A+a?") {
2579 has_a_mod = true;
2580 }
2581 if mm_tag.contains("T-t?") {
2582 has_t_mod = true;
2583 }
2584
2585 assert!(
2587 matches!(read.aux(b"ML").unwrap(), Aux::ArrayU8(_)),
2588 "ML tag should be ArrayU8"
2589 );
2590 }
2591 }
2592
2593 assert!(has_c_mod, "Should have C+m modifications");
2594 assert!(has_a_mod, "Should have A+a modifications");
2595 assert!(has_t_mod, "Should have T-t modifications");
2596 }
2597
2598 #[expect(
2600 clippy::shadow_unrelated,
2601 reason = "clear variable reuse in sequential tests"
2602 )]
2603 #[test]
2604 fn modifications_target_correct_bases() {
2605 let seq = DNARestrictive::from_str("AAAACCCCGGGGTTTT").unwrap();
2607
2608 let mod_config_template = ModConfigBuilder::default()
2610 .base('C')
2611 .mod_code("m".into())
2612 .win([4].into())
2613 .mod_range([(1.0, 1.0)].into());
2614
2615 let mod_config_c = mod_config_template.clone().build().unwrap();
2617
2618 let mut rng = rand::rng();
2619 let (mm_str, ml_vec) = generate_random_dna_modification(&[mod_config_c], &seq, &mut rng);
2620
2621 assert_eq!(ml_vec.len(), 4);
2623 let probs: Vec<&str> = mm_str
2624 .strip_prefix("C+m?,")
2625 .unwrap()
2626 .strip_suffix(';')
2627 .unwrap()
2628 .split(',')
2629 .collect();
2630 assert_eq!(probs.len(), 4, "Should have exactly 4 C modifications");
2631
2632 let mod_config_t = mod_config_template
2634 .clone()
2635 .base('T')
2636 .is_strand_plus(false)
2637 .mod_code("t".into())
2638 .build()
2639 .unwrap();
2640
2641 let (mm_str, ml_vec) = generate_random_dna_modification(&[mod_config_t], &seq, &mut rng);
2642
2643 assert_eq!(ml_vec.len(), 4);
2645 let probs: Vec<&str> = mm_str
2646 .strip_prefix("T-t?,")
2647 .unwrap()
2648 .strip_suffix(';')
2649 .unwrap()
2650 .split(',')
2651 .collect();
2652 assert_eq!(probs.len(), 4, "Should have exactly 4 T modifications");
2653
2654 let mod_config_a = mod_config_template
2656 .clone()
2657 .base('A')
2658 .mod_code("a".into())
2659 .build()
2660 .unwrap();
2661
2662 let (mm_str, ml_vec) = generate_random_dna_modification(&[mod_config_a], &seq, &mut rng);
2663
2664 assert_eq!(ml_vec.len(), 4);
2666 let probs: Vec<&str> = mm_str
2667 .strip_prefix("A+a?,")
2668 .unwrap()
2669 .strip_suffix(';')
2670 .unwrap()
2671 .split(',')
2672 .collect();
2673 assert_eq!(probs.len(), 4, "Should have exactly 4 A modifications");
2674
2675 let mod_config_g = mod_config_template
2677 .clone()
2678 .base('G')
2679 .mod_code("g".into())
2680 .build()
2681 .unwrap();
2682
2683 let (mm_str, ml_vec) = generate_random_dna_modification(&[mod_config_g], &seq, &mut rng);
2684
2685 assert_eq!(ml_vec.len(), 4);
2687 let probs: Vec<&str> = mm_str
2688 .strip_prefix("G+g?,")
2689 .unwrap()
2690 .strip_suffix(';')
2691 .unwrap()
2692 .split(',')
2693 .collect();
2694 assert_eq!(probs.len(), 4, "Should have exactly 4 G modifications");
2695 }
2696
2697 #[test]
2699 fn edge_case_no_target_base_for_modification() {
2700 let seq = DNARestrictive::from_str("AAAAAAAAAA").unwrap(); let mod_config = ModConfigBuilder::default()
2703 .base('C')
2704 .mod_code("m".into())
2705 .win([5].into())
2706 .mod_range([(0.5, 0.5)].into())
2707 .build()
2708 .unwrap();
2709
2710 let mut rng = rand::rng();
2711 let (mm_str, ml_vec) = generate_random_dna_modification(&[mod_config], &seq, &mut rng);
2712
2713 assert_eq!(mm_str, String::new());
2715 assert_eq!(ml_vec.len(), 0);
2716 }
2717
2718 #[expect(
2720 clippy::shadow_unrelated,
2721 reason = "clear variable reuse in sequential tests"
2722 )]
2723 #[test]
2724 fn edge_case_modification_probability_extremes() {
2725 let seq = DNARestrictive::from_str("CCCCCCCC").unwrap();
2726
2727 let mod_config_template = ModConfigBuilder::default()
2729 .base('C')
2730 .mod_code("m".into())
2731 .win([8].into());
2732
2733 let mod_config_zero = mod_config_template
2734 .clone()
2735 .mod_range([(0.0, 0.0)].into())
2736 .build()
2737 .unwrap();
2738
2739 let mut rng = rand::rng();
2740 let (mm_str, ml_vec) = generate_random_dna_modification(&[mod_config_zero], &seq, &mut rng);
2741
2742 assert_eq!(ml_vec, vec![0u8; 8]);
2743 let pos: Vec<&str> = mm_str
2744 .strip_prefix("C+m?,")
2745 .unwrap()
2746 .strip_suffix(';')
2747 .unwrap()
2748 .split(',')
2749 .collect();
2750 assert!(pos.iter().all(|&x| x == "0"));
2751
2752 let mod_config_one = mod_config_template
2754 .clone()
2755 .mod_range([(1.0, 1.0)].into())
2756 .build()
2757 .unwrap();
2758
2759 let (mm_str, ml_vec) = generate_random_dna_modification(&[mod_config_one], &seq, &mut rng);
2760
2761 assert_eq!(ml_vec, vec![255u8; 8]);
2762 let pos: Vec<&str> = mm_str
2763 .strip_prefix("C+m?,")
2764 .unwrap()
2765 .strip_suffix(';')
2766 .unwrap()
2767 .split(',')
2768 .collect();
2769 assert!(pos.iter().all(|&x| x == "0"));
2770 }
2771
2772 #[expect(
2774 clippy::indexing_slicing,
2775 reason = "test validates length before indexing"
2776 )]
2777 #[test]
2778 fn config_deserialization_works_with_json_with_mods() {
2779 let json_str = r#"{
2781 "contigs": {
2782 "number": 5,
2783 "len_range": [100, 500],
2784 "repeated_seq": "ACGTACGT"
2785 },
2786 "reads": [
2787 {
2788 "number": 100,
2789 "mapq_range": [10, 30],
2790 "base_qual_range": [20, 40],
2791 "len_range": [0.1, 0.9],
2792 "barcode": "ACGTAA",
2793 "mods": [{
2794 "base": "C",
2795 "is_strand_plus": true,
2796 "mod_code": "m",
2797 "win": [5, 3],
2798 "mod_range": [[0.3, 0.7], [0.1, 0.5]]
2799 }]
2800 },
2801 {
2802 "number": 50,
2803 "mapq_range": [5, 15],
2804 "base_qual_range": [15, 25],
2805 "len_range": [0.2, 0.8]
2806 }
2807 ]
2808 }"#;
2809
2810 let config: SimulationConfig = serde_json::from_str(json_str).unwrap();
2812
2813 assert_eq!(config.contigs.number.get(), 5);
2815 assert_eq!(config.contigs.len_range.low().get(), 100);
2816 assert_eq!(config.contigs.len_range.high().get(), 500);
2817 assert!(config.contigs.repeated_seq.is_some());
2818
2819 assert_eq!(config.reads.len(), 2);
2820
2821 assert_eq!(config.reads[0].number.get(), 100);
2823 assert_eq!(config.reads[0].mapq_range.low(), 10);
2824 assert_eq!(config.reads[0].mapq_range.high(), 30);
2825 assert_eq!(config.reads[0].base_qual_range.low(), 20);
2826 assert_eq!(config.reads[0].base_qual_range.high(), 40);
2827 assert!(config.reads[0].barcode.is_some());
2828 assert_eq!(config.reads[0].mods.len(), 1);
2829 assert!(matches!(config.reads[0].mods[0].base, AllowedAGCTN::C));
2830 assert_eq!(config.reads[0].mods[0].win.len(), 2);
2831
2832 assert_eq!(config.reads[1].number.get(), 50);
2834 assert_eq!(config.reads[1].mods.len(), 0);
2835 assert!(config.reads[1].barcode.is_none());
2836 }
2837
2838 #[test]
2841 fn barcodes_with_modifications_integration_works() {
2842 let contigs = [ContigBuilder::default()
2844 .name("chr1")
2845 .seq("A".repeat(50))
2846 .build()
2847 .unwrap()];
2848
2849 let config = ReadConfigBuilder::default()
2850 .number(20)
2851 .len_range((0.3, 0.7))
2852 .barcode("ACGTAA".into()) .mods(
2854 [ModConfigBuilder::default()
2855 .base('C')
2856 .mod_code("m".into())
2857 .win([10].into())
2858 .mod_range([(0.8, 0.8)].into())
2859 .build()
2860 .unwrap()]
2861 .into(),
2862 )
2863 .build()
2864 .unwrap();
2865
2866 let mut rng = rand::rng();
2867 let reads = generate_reads_denovo(&contigs, &config, "test", &mut rng).unwrap();
2868
2869 let mut has_modifications = false;
2870 for read in &reads {
2871 if !read.is_unmapped() {
2872 let mm_pos_result = read.aux(b"MM");
2878 let ml_prob_result = read.aux(b"ML");
2879
2880 assert!(
2882 mm_pos_result.is_ok(),
2883 "MM tag should be present due to C in barcode"
2884 );
2885 assert!(
2886 ml_prob_result.is_ok(),
2887 "ML tag should be present due to C in barcode"
2888 );
2889
2890 if let Ok(Aux::String(mm_str)) = mm_pos_result {
2891 assert!(
2892 mm_str.starts_with("C+m?,"),
2893 "MM tag should contain C+m modifications"
2894 );
2895 assert!(!mm_str.is_empty(), "MM tag should not be empty");
2896 has_modifications = true;
2897 }
2898 }
2899 }
2900
2901 assert!(
2902 has_modifications,
2903 "Should have found C modifications from barcode bases"
2904 );
2905 }
2906
2907 #[test]
2909 fn edge_case_window_larger_than_sequence() {
2910 let seq = DNARestrictive::from_str("ACGTACGT").unwrap(); let mod_config = ModConfigBuilder::default()
2913 .base('C')
2914 .mod_code("m".into())
2915 .win([100].into()) .mod_range([(0.5, 0.5)].into())
2917 .build()
2918 .unwrap();
2919
2920 let mut rng = rand::rng();
2921 let (mm_str, ml_vec) = generate_random_dna_modification(&[mod_config], &seq, &mut rng);
2922
2923 assert_eq!(
2925 ml_vec,
2926 vec![128u8, 128u8],
2927 "All prob should be 128 (0.5*255)"
2928 );
2929 let pos: Vec<&str> = mm_str
2930 .strip_prefix("C+m?,")
2931 .unwrap()
2932 .strip_suffix(';')
2933 .unwrap()
2934 .split(',')
2935 .collect();
2936 assert_eq!(pos.len(), 2, "Should have exactly 2 position values");
2937 assert!(pos.iter().all(|&x| x == "0"), "All pos should be 0");
2938 }
2939
2940 #[test]
2942 fn edge_case_single_base_windows() {
2943 let seq = DNARestrictive::from_str("C".repeat(16).as_str()).unwrap(); let mod_config = ModConfigBuilder::default()
2946 .base('C')
2947 .is_strand_plus(true)
2948 .mod_code("m".into())
2949 .win([1].into())
2950 .mod_range([(0.9, 0.9), (0.1, 0.1)].into())
2951 .build()
2952 .unwrap();
2953
2954 let mut rng = rand::rng();
2955 let (mm_str, ml_vec) = generate_random_dna_modification(&[mod_config], &seq, &mut rng);
2956
2957 let expected_pattern: Vec<u8> = vec![
2960 230, 26, 230, 26, 230, 26, 230, 26, 230, 26, 230, 26, 230, 26, 230, 26,
2961 ];
2962 assert_eq!(ml_vec, expected_pattern);
2963 let pos: Vec<&str> = mm_str
2964 .strip_prefix("C+m?,")
2965 .unwrap()
2966 .strip_suffix(';')
2967 .unwrap()
2968 .split(',')
2969 .collect();
2970 assert_eq!(pos, vec!["0"; 16], "pos should have 16 entries, all 0");
2971 }
2972
2973 #[test]
2975 fn reverse_reads_modification_positions_correct() {
2976 let contigs = [ContigBuilder::default()
2978 .name("chr1")
2979 .seq("AAAAACAAAAACAAAAAC".into())
2980 .build()
2981 .unwrap()];
2982
2983 let config = ReadConfigBuilder::default()
2984 .number(100)
2985 .len_range((1.0, 1.0))
2986 .mods(
2987 [ModConfigBuilder::default()
2988 .base('C')
2989 .mod_code("m".into())
2990 .win([10].into())
2991 .mod_range([(0.8, 0.8)].into())
2992 .build()
2993 .unwrap()]
2994 .into(),
2995 )
2996 .build()
2997 .unwrap();
2998
2999 let mut rng = rand::rng();
3000 let reads = generate_reads_denovo(&contigs, &config, "test", &mut rng).unwrap();
3001
3002 let mut forward_with_mods = 0;
3003
3004 for read in &reads {
3005 if !read.is_unmapped() {
3006 let mm_result = read.aux(b"MM");
3007
3008 if read.is_reverse() {
3009 assert!(
3012 mm_result.is_err(),
3013 "Reverse reads should have no MM tag (revcomp has no C's)"
3014 );
3015 } else {
3016 assert!(mm_result.is_ok(), "Forward reads should have MM tag");
3018 if let Ok(Aux::String(mm_str)) = mm_result {
3019 let probs: Vec<&str> = mm_str
3021 .strip_prefix("C+m?,")
3022 .unwrap()
3023 .strip_suffix(';')
3024 .unwrap()
3025 .split(',')
3026 .collect();
3027 assert_eq!(probs.len(), 3, "Should have exactly 3 C modifications");
3028 forward_with_mods += 1;
3029 }
3030 }
3031 }
3032 }
3033
3034 assert!(
3035 forward_with_mods > 0,
3036 "Should have validated some forward reads with modifications"
3037 );
3038 }
3039
3040 #[expect(
3050 clippy::too_many_lines,
3051 reason = "test requires setup, iteration, and multiple assertions for thorough validation"
3052 )]
3053 #[test]
3054 fn mismatch_mod_check() {
3055 let json_str = r#"{
3056 "contigs": {
3057 "number": 1,
3058 "len_range": [100, 100],
3059 "repeated_seq": "ACGT"
3060 },
3061 "reads": [
3062 {
3063 "number": 100,
3064 "len_range": [1.0, 1.0],
3065 "mods": [{
3066 "base": "C",
3067 "is_strand_plus": true,
3068 "mod_code": "m",
3069 "win": [1],
3070 "mod_range": [[1.0, 1.0]]
3071 }]
3072 },
3073 {
3074 "number": 100,
3075 "len_range": [1.0, 1.0],
3076 "mismatch": 1.0,
3077 "mods": [{
3078 "base": "C",
3079 "is_strand_plus": true,
3080 "mod_code": "m",
3081 "win": [1],
3082 "mod_range": [[0.51, 0.51]]
3083 }]
3084 }
3085 ]
3086 }"#;
3087
3088 let config: SimulationConfig = serde_json::from_str(json_str).unwrap();
3089 let sim = TempBamSimulation::new(config).unwrap();
3090
3091 let mut bam = bam::Reader::from_path(sim.bam_path()).unwrap();
3092 let mut df_collection = Vec::new();
3093
3094 for k in bam.records() {
3095 let record = k.unwrap();
3096 let curr_read = CurrRead::default()
3097 .try_from_only_alignment(&record)
3098 .unwrap()
3099 .set_mod_data(&record, ThresholdState::GtEq(0), 0)
3100 .unwrap();
3101 df_collection.push(curr_read);
3102 }
3103
3104 let df = curr_reads_to_dataframe(df_collection.as_slice()).unwrap();
3105
3106 let read_id_col = df.column("read_id").unwrap().str().unwrap();
3107 let ref_position_col = df.column("ref_position").unwrap().i64().unwrap();
3108 let alignment_type_col = df.column("alignment_type").unwrap().str().unwrap();
3109 let mod_quality_col = df.column("mod_quality").unwrap().u32().unwrap();
3110
3111 let mut row_count_0: usize = 0;
3112 let mut row_count_1: usize = 0;
3113
3114 let mut group0_forward_position_violations: usize = 0;
3116 let mut group0_reverse_position_violations: usize = 0;
3117 let mut group0_unmapped_position_violations: usize = 0;
3118 let mut group0_quality_violations: usize = 0;
3119 let mut group1_forward_position_violations: usize = 0;
3120 let mut group1_reverse_position_violations: usize = 0;
3121 let mut group1_unmapped_position_violations: usize = 0;
3122 let mut group1_quality_violations: usize = 0;
3123 let mut read_id_violations: usize = 0;
3124
3125 for i in 0..df.height() {
3126 let read_id = read_id_col.get(i).unwrap();
3127 let ref_position = ref_position_col.get(i).unwrap();
3128 let alignment_type = alignment_type_col.get(i).unwrap();
3129 let mod_quality = mod_quality_col.get(i).unwrap();
3130
3131 if read_id.starts_with("0.") {
3132 if alignment_type.contains("forward") {
3134 if ref_position % 4 != 1 {
3135 group0_forward_position_violations += 1;
3136 }
3137 } else if alignment_type.contains("reverse") {
3138 if ref_position % 4 != 2 {
3139 group0_reverse_position_violations += 1;
3140 }
3141 } else {
3142 if ref_position != -1 {
3144 group0_unmapped_position_violations += 1;
3145 }
3146 }
3147 if mod_quality != 255 {
3148 group0_quality_violations += 1;
3149 }
3150 row_count_0 += 1;
3151 } else if read_id.starts_with("1.") {
3152 if alignment_type.contains("forward") {
3154 if ref_position % 4 == 1 {
3155 group1_forward_position_violations += 1;
3156 }
3157 } else if alignment_type.contains("reverse") {
3158 if ref_position % 4 == 2 {
3159 group1_reverse_position_violations += 1;
3160 }
3161 } else {
3162 if ref_position != -1 {
3164 group1_unmapped_position_violations += 1;
3165 }
3166 }
3167 if mod_quality != 130 {
3168 group1_quality_violations += 1;
3169 }
3170 row_count_1 += 1;
3171 } else {
3172 read_id_violations += 1;
3173 }
3174 }
3175
3176 assert_eq!(
3178 group0_forward_position_violations, 0,
3179 "Group 0 forward: expected 0 position violations"
3180 );
3181 assert_eq!(
3182 group0_reverse_position_violations, 0,
3183 "Group 0 reverse: expected 0 position violations"
3184 );
3185 assert_eq!(
3186 group0_unmapped_position_violations, 0,
3187 "Group 0 unmapped: expected 0 position violations"
3188 );
3189 assert_eq!(
3190 group0_quality_violations, 0,
3191 "Group 0: expected 0 quality violations"
3192 );
3193 assert_eq!(
3194 group1_forward_position_violations, 0,
3195 "Group 1 forward: expected 0 position violations"
3196 );
3197 assert_eq!(
3198 group1_reverse_position_violations, 0,
3199 "Group 1 reverse: expected 0 position violations"
3200 );
3201 assert_eq!(
3202 group1_unmapped_position_violations, 0,
3203 "Group 1 unmapped: expected 0 position violations"
3204 );
3205 assert_eq!(
3206 group1_quality_violations, 0,
3207 "Group 1: expected 0 quality violations"
3208 );
3209 assert_eq!(read_id_violations, 0, "expected 0 read_id violations");
3210 assert!(
3211 row_count_0 > 0,
3212 "Should have processed some rows from group 0"
3213 );
3214 assert!(
3215 row_count_1 > 0,
3216 "Should have processed some rows from group 1"
3217 );
3218 }
3219}
3220
3221#[cfg(test)]
3222mod contig_generation_tests {
3223 use super::*;
3224 use rust_htslib::bam::Read as _;
3225
3226 #[test]
3228 fn edge_case_repeated_sequence_exact_length() {
3229 let config_json = r#"{
3230 "contigs": {
3231 "number": 1,
3232 "len_range": [16, 16],
3233 "repeated_seq": "ACGT"
3234 },
3235 "reads": [{
3236 "number": 5,
3237 "len_range": [0.5, 0.5]
3238 }]
3239 }"#;
3240
3241 let config: SimulationConfig = serde_json::from_str(config_json).unwrap();
3242 let sim = TempBamSimulation::new(config).unwrap();
3243 let reader = bam::Reader::from_path(sim.bam_path()).unwrap();
3244
3245 let header = reader.header();
3247 assert_eq!(header.target_len(0).unwrap(), 16);
3248 }
3249
3250 #[test]
3252 fn edge_case_minimum_contig_size() {
3253 let config_json = r#"{
3254 "contigs": {
3255 "number": 1,
3256 "len_range": [1, 1]
3257 },
3258 "reads": [{
3259 "number": 10,
3260 "mapq_range": [10, 20],
3261 "base_qual_range": [20, 30],
3262 "len_range": [1.0, 1.0]
3263 }]
3264 }"#;
3265
3266 let config: SimulationConfig = serde_json::from_str(config_json).unwrap();
3267 let sim = TempBamSimulation::new(config).unwrap();
3268 let mut reader = bam::Reader::from_path(sim.bam_path()).unwrap();
3269
3270 let header = reader.header();
3272 assert_eq!(header.target_count(), 1);
3273 assert_eq!(header.target_len(0).unwrap(), 1);
3274
3275 assert!(reader.records().count() > 0);
3277 }
3278
3279 #[test]
3281 fn generate_contigs_denovo_works() {
3282 let config = ContigConfigBuilder::default()
3283 .number(5)
3284 .len_range((100, 200))
3285 .build()
3286 .unwrap();
3287 let contigs = generate_contigs_denovo(config.number, config.len_range, &mut rand::rng());
3288 assert_eq!(contigs.len(), 5);
3289 for (i, contig) in contigs.iter().enumerate() {
3290 assert_eq!(contig.name, format!("contig_0000{i}"));
3291 assert!((100..=200).contains(&contig.get_dna_restrictive().get().len()));
3292 for base in contig.get_dna_restrictive().get() {
3293 assert!([b'A', b'C', b'G', b'T'].contains(base));
3294 }
3295 }
3296 }
3297
3298 #[test]
3300 fn generate_contigs_denovo_repeated_seq_works() {
3301 let seq = DNARestrictive::from_str("ACGT").unwrap();
3302 let contigs = generate_contigs_denovo_repeated_seq(
3303 NonZeroU32::new(10000).unwrap(),
3304 OrdPair::new(NonZeroU64::new(10).unwrap(), NonZeroU64::new(12).unwrap()).unwrap(),
3305 &seq,
3306 &mut rand::rng(),
3307 );
3308
3309 let mut counts = [0, 0, 0];
3310
3311 assert_eq!(contigs.len(), 10000);
3312 for (i, contig) in contigs.iter().enumerate() {
3313 assert_eq!(contig.name, format!("contig_{i:05}"));
3314 let idx = match contig.get_dna_restrictive().get() {
3315 b"ACGTACGTAC" => 0,
3316 b"ACGTACGTACG" => 1,
3317 b"ACGTACGTACGT" => 2,
3318 _ => unreachable!(),
3319 };
3320 *counts
3321 .get_mut(idx)
3322 .expect("idx is 0, 1, or 2; counts has 3 elements") += 1;
3323 }
3324
3325 for count in counts {
3327 assert!(count >= 3000);
3328 }
3329 }
3330}
3331
3332#[cfg(test)]
3334mod perfect_seq_match_to_not_tests {
3335 use super::*;
3336
3337 #[test]
3338 fn seq_with_valid_sequence() -> Result<(), Error> {
3339 let _builder = PerfectSeqMatchToNot::seq(b"ACGT".to_vec())?;
3340 Ok(())
3341 }
3342
3343 #[test]
3344 fn seq_with_empty_sequence_returns_error() {
3345 let result = PerfectSeqMatchToNot::seq(b"".to_vec());
3346 assert!(matches!(result, Err(Error::InvalidState(_))));
3347 }
3348
3349 #[test]
3350 fn build_without_barcode_forward_read() -> Result<(), Error> {
3351 let mut rng = rand::rng();
3352 let (seq, cigar) = PerfectSeqMatchToNot::seq(b"GGGGGGGG".to_vec())?
3353 .build(ReadState::PrimaryFwd, &mut rng)?;
3354 assert_eq!(seq, b"GGGGGGGG");
3355 assert_eq!(cigar.expect("no error").to_string(), "8M");
3356 Ok(())
3357 }
3358
3359 #[test]
3360 fn build_without_barcode_reverse_read() -> Result<(), Error> {
3361 let mut rng = rand::rng();
3362 let (seq, cigar) = PerfectSeqMatchToNot::seq(b"GGGGGGGG".to_vec())?
3363 .build(ReadState::PrimaryRev, &mut rng)?;
3364 assert_eq!(seq, b"GGGGGGGG");
3365 assert_eq!(cigar.expect("no error").to_string(), "8M");
3366 Ok(())
3367 }
3368
3369 #[test]
3370 fn build_with_barcode_forward_read() -> Result<(), Error> {
3371 let mut rng = rand::rng();
3372 let barcode = DNARestrictive::from_str("ACGTAA").unwrap();
3373 let (seq, cigar) = PerfectSeqMatchToNot::seq(b"GGGGGGGG".to_vec())?
3374 .barcode(barcode)
3375 .build(ReadState::PrimaryFwd, &mut rng)?;
3376 assert_eq!(seq, b"ACGTAAGGGGGGGGTTACGT");
3377 assert_eq!(cigar.expect("no error").to_string(), "6S8M6S");
3378 Ok(())
3379 }
3380
3381 #[test]
3382 fn build_with_barcode_reverse_read() -> Result<(), Error> {
3383 let mut rng = rand::rng();
3384 let barcode = DNARestrictive::from_str("ACGTAA").unwrap();
3385 let (seq, cigar) = PerfectSeqMatchToNot::seq(b"GGGGGGGG".to_vec())?
3386 .barcode(barcode)
3387 .build(ReadState::PrimaryRev, &mut rng)?;
3388 assert_eq!(seq, b"TGCATTGGGGGGGGAATGCA");
3389 assert_eq!(cigar.expect("no error").to_string(), "6S8M6S");
3390 Ok(())
3391 }
3392
3393 #[test]
3394 fn build_unmapped_read_returns_none_cigar() -> Result<(), Error> {
3395 let mut rng = rand::rng();
3396 let (seq, cigar) = PerfectSeqMatchToNot::seq(b"GGGGGGGG".to_vec())?
3397 .build(ReadState::Unmapped, &mut rng)?;
3398 assert_eq!(seq, b"GGGGGGGG");
3399 assert!(cigar.is_none());
3400 Ok(())
3401 }
3402
3403 #[test]
3404 fn build_unmapped_read_with_barcode_returns_none_cigar() -> Result<(), Error> {
3405 let mut rng = rand::rng();
3406 let barcode = DNARestrictive::from_str("ACGTAA").unwrap();
3407 let (seq, cigar) = PerfectSeqMatchToNot::seq(b"GGGGGGGG".to_vec())?
3408 .barcode(barcode)
3409 .build(ReadState::Unmapped, &mut rng)?;
3410 assert_eq!(seq, b"ACGTAAGGGGGGGGTTACGT");
3411 assert!(cigar.is_none());
3412 Ok(())
3413 }
3414
3415 #[test]
3416 fn build_with_delete() -> Result<(), Error> {
3417 let mut rng = rand::rng();
3418 let delete_range = OrdPair::new(F32Bw0and1::try_from(0.5)?, F32Bw0and1::try_from(0.75)?)?;
3419 let (seq, cigar) = PerfectSeqMatchToNot::seq(b"AAAATTTTCCCCGGGG".to_vec())?
3420 .delete(delete_range)
3421 .build(ReadState::PrimaryFwd, &mut rng)?;
3422
3423 assert_eq!(seq, b"AAAATTTTGGGG");
3427 assert_eq!(cigar.expect("no error").to_string(), "8M4D4M");
3428 Ok(())
3429 }
3430
3431 #[test]
3432 fn build_with_insert_middle() -> Result<(), Error> {
3433 let mut rng = rand::rng();
3434 let insert_seq = DNARestrictive::from_str("TTTT").unwrap();
3435 let (seq, cigar) = PerfectSeqMatchToNot::seq(b"AAAAGGGG".to_vec())?
3436 .insert_middle(insert_seq)
3437 .build(ReadState::PrimaryFwd, &mut rng)?;
3438
3439 assert_eq!(seq, b"AAAATTTTGGGG");
3444 assert_eq!(cigar.expect("no error").to_string(), "4M4I4M");
3445 Ok(())
3446 }
3447
3448 #[test]
3449 fn build_with_mismatch() -> Result<(), Error> {
3450 let mut rng = rand::rng();
3451 let mismatch_frac = F32Bw0and1::try_from(0.5)?;
3452 let (seq, cigar) = PerfectSeqMatchToNot::seq(b"AAAAAAAA".to_vec())?
3453 .mismatch(mismatch_frac)
3454 .build(ReadState::PrimaryFwd, &mut rng)?;
3455
3456 let mismatch_count = seq.iter().filter(|&&b| b != b'A').count();
3460 assert_eq!(mismatch_count, 4);
3461
3462 assert_eq!(cigar.expect("no error").to_string(), "8M");
3464 Ok(())
3465 }
3466
3467 #[test]
3468 fn build_with_delete_and_insert() -> Result<(), Error> {
3469 let mut rng = rand::rng();
3470 let delete_range = OrdPair::new(F32Bw0and1::try_from(0.5)?, F32Bw0and1::try_from(0.75)?)?;
3471 let insert_seq = DNARestrictive::from_str("TT").unwrap();
3472 let (seq, cigar) = PerfectSeqMatchToNot::seq(b"AAAACCCCGGGG".to_vec())?
3473 .delete(delete_range)
3474 .insert_middle(insert_seq)
3475 .build(ReadState::PrimaryFwd, &mut rng)?;
3476
3477 assert_eq!(seq, b"AAAACCTTGGG");
3483 assert_eq!(cigar.expect("no error").to_string(), "6M2I3D3M");
3484 Ok(())
3485 }
3486
3487 #[test]
3488 fn build_with_all_features() -> Result<(), Error> {
3489 let mut rng = rand::rng();
3490 let delete_range = OrdPair::new(F32Bw0and1::try_from(0.25)?, F32Bw0and1::try_from(0.5)?)?;
3491 let insert_seq = DNARestrictive::from_str("CC").unwrap();
3492 let mismatch_frac = F32Bw0and1::try_from(0.25)?;
3493 let barcode = DNARestrictive::from_str("AAA").unwrap();
3494
3495 let (seq, cigar) = PerfectSeqMatchToNot::seq(b"GGGGGGGGGGGGGGGG".to_vec())?
3496 .delete(delete_range)
3497 .insert_middle(insert_seq)
3498 .mismatch(mismatch_frac)
3499 .barcode(barcode)
3500 .build(ReadState::PrimaryFwd, &mut rng)?;
3501
3502 assert_eq!(seq.len(), 20);
3508
3509 let cigar_str = cigar.expect("no error").to_string();
3511 assert!(cigar_str.starts_with("3S")); assert!(cigar_str.ends_with("3S")); assert!(cigar_str.contains('D')); assert!(cigar_str.contains('I')); Ok(())
3516 }
3517
3518 #[test]
3519 fn build_with_delete_zero_length() -> Result<(), Error> {
3520 let mut rng = rand::rng();
3521 let delete_range = OrdPair::new(F32Bw0and1::try_from(0.5)?, F32Bw0and1::try_from(0.5)?)?;
3523 let (seq, cigar) = PerfectSeqMatchToNot::seq(b"AAAAAAAA".to_vec())?
3524 .delete(delete_range)
3525 .build(ReadState::PrimaryFwd, &mut rng)?;
3526
3527 assert_eq!(seq, b"AAAAAAAA");
3529 assert_eq!(cigar.expect("no error").to_string(), "8M");
3530 Ok(())
3531 }
3532
3533 #[test]
3534 fn build_with_mismatch_zero_fraction() -> Result<(), Error> {
3535 let mut rng = rand::rng();
3536 let mismatch_frac = F32Bw0and1::try_from(0.0)?;
3537 let (seq, cigar) = PerfectSeqMatchToNot::seq(b"AAAAAAAA".to_vec())?
3538 .mismatch(mismatch_frac)
3539 .build(ReadState::PrimaryFwd, &mut rng)?;
3540
3541 assert_eq!(seq, b"AAAAAAAA");
3543 assert_eq!(cigar.expect("no error").to_string(), "8M");
3544 Ok(())
3545 }
3546
3547 #[test]
3548 #[should_panic(expected = "SimulateDNASeqCIGAREndProblem")]
3549 fn build_with_delete_whole_length() {
3550 let mut rng = rand::rng();
3551 let delete_range = OrdPair::new(F32Bw0and1::zero(), F32Bw0and1::one()).unwrap();
3552 let (_seq, _cigar) = PerfectSeqMatchToNot::seq(b"AAAAAAAA".to_vec())
3553 .unwrap()
3554 .delete(delete_range)
3555 .build(ReadState::PrimaryFwd, &mut rng)
3556 .unwrap();
3557 }
3558
3559 #[test]
3560 #[should_panic(expected = "SimulateDNASeqCIGAREndProblem")]
3561 fn build_with_insert_almost_whole_length() {
3562 let mut rng = rand::rng();
3563 let insert_seq = DNARestrictive::from_str("AATT").unwrap();
3564 let (_seq, _cigar) = PerfectSeqMatchToNot::seq(b"A".to_vec())
3565 .unwrap()
3566 .insert_middle(insert_seq)
3567 .build(ReadState::PrimaryFwd, &mut rng)
3568 .unwrap();
3569 }
3570
3571 #[test]
3572 #[should_panic(expected = "SimulateDNASeqCIGAREndProblem")]
3573 fn build_with_delete_whole_length_with_barcode() {
3574 let mut rng = rand::rng();
3575 let delete_range = OrdPair::new(F32Bw0and1::zero(), F32Bw0and1::one()).unwrap();
3576 let barcode = DNARestrictive::from_str("CGCG").unwrap();
3577 let (_seq, _cigar) = PerfectSeqMatchToNot::seq(b"AAAAAAAA".to_vec())
3578 .unwrap()
3579 .delete(delete_range)
3580 .barcode(barcode)
3581 .build(ReadState::PrimaryFwd, &mut rng)
3582 .unwrap();
3583 }
3584
3585 #[test]
3586 #[should_panic(expected = "SimulateDNASeqCIGAREndProblem")]
3587 fn build_with_insert_almost_whole_length_with_barcode() {
3588 let mut rng = rand::rng();
3589 let insert_seq = DNARestrictive::from_str("AATT").unwrap();
3590 let barcode = DNARestrictive::from_str("CGCG").unwrap();
3591 let (_seq, _cigar) = PerfectSeqMatchToNot::seq(b"A".to_vec())
3592 .unwrap()
3593 .insert_middle(insert_seq)
3594 .barcode(barcode)
3595 .build(ReadState::PrimaryFwd, &mut rng)
3596 .unwrap();
3597 }
3598}