use crate::{
AllowedAGCTN, DNARestrictive, Error, F32Bw0and1, GetDNARestrictive, ModChar, OrdPair, ReadState,
};
use crate::{write_bam_denovo, write_fasta};
use derive_builder::Builder;
use itertools::join;
use rand::Rng;
use rand::SeedableRng as _;
use rand::rngs::StdRng;
use rand::seq::IteratorRandom as _;
use rust_htslib::bam;
use rust_htslib::bam::record::{Aux, Cigar, CigarString};
use serde::{Deserialize, Serialize};
use std::iter;
use std::num::{NonZeroU32, NonZeroU64};
use std::ops::RangeInclusive;
use std::path::Path;
use std::str::FromStr as _;
use uuid::Uuid;
type OF = OrdPair<F32Bw0and1>;
macro_rules! ord_pair_f32_bw0and1 {
($low:expr, $high:expr) => {
OrdPair::new(
F32Bw0and1::new($low).unwrap(),
F32Bw0and1::new($high).unwrap(),
)
.unwrap()
};
}
#[derive(Builder, Debug, Default, PartialEq, Clone, Serialize, Deserialize)]
#[serde(default)]
#[builder(default, build_fn(error = "Error"), pattern = "owned", derive(Clone))]
#[non_exhaustive]
pub struct SimulationConfig {
pub contigs: ContigConfig,
pub reads: Vec<ReadConfig>,
#[builder(setter(into, strip_option))]
pub seed: Option<u64>,
}
#[derive(Builder, Debug, PartialEq, Clone, Serialize, Deserialize)]
#[serde(default)]
#[builder(default, build_fn(error = "Error"), pattern = "owned", derive(Clone))]
#[non_exhaustive]
pub struct ContigConfig {
#[builder(field(ty = "u32", build = "self.number.try_into()?"))]
pub number: NonZeroU32,
#[builder(field(ty = "(u64, u64)", build = "self.len_range.try_into()?"))]
pub len_range: OrdPair<NonZeroU64>,
#[builder(field(
ty = "String",
build = "(!self.repeated_seq.is_empty()).then(|| DNARestrictive::from_str(&self.repeated_seq)).transpose()?"
))]
pub repeated_seq: Option<DNARestrictive>,
}
#[derive(Builder, Debug, Clone, PartialEq, Serialize, Deserialize)]
#[serde(default)]
#[builder(default, build_fn(error = "Error"), pattern = "owned", derive(Clone))]
#[non_exhaustive]
pub struct ReadConfig {
#[builder(field(ty = "u32", build = "self.number.try_into()?"))]
pub number: NonZeroU32,
#[builder(field(ty = "(u8, u8)", build = "self.mapq_range.try_into()?"))]
pub mapq_range: OrdPair<u8>,
#[builder(field(ty = "(u8, u8)", build = "self.base_qual_range.try_into()?"))]
pub base_qual_range: OrdPair<u8>,
#[builder(field(ty = "(f32, f32)", build = "self.len_range.try_into()?"))]
pub len_range: OrdPair<F32Bw0and1>,
#[builder(field(
ty = "String",
build = "(!self.barcode.is_empty()).then(|| DNARestrictive::from_str(&self.barcode)).transpose()?"
))]
pub barcode: Option<DNARestrictive>,
#[builder(field(
ty = "(f32, f32)",
build = "Some(self.delete).map(TryInto::try_into).transpose()?"
))]
pub delete: Option<OrdPair<F32Bw0and1>>,
#[builder(field(
ty = "String",
build = "(!self.insert_middle.is_empty()).then(|| DNARestrictive::from_str(&self.insert_middle)).transpose()?"
))]
pub insert_middle: Option<DNARestrictive>,
#[builder(field(ty = "f32", build = "Some(F32Bw0and1::try_from(self.mismatch)?)"))]
pub mismatch: Option<F32Bw0and1>,
pub mods: Vec<ModConfig>,
}
#[derive(Builder, Debug, Clone, PartialEq, Serialize, Deserialize)]
#[serde(default)]
#[builder(default, build_fn(error = "Error"), pattern = "owned", derive(Clone))]
#[non_exhaustive]
pub struct ModConfig {
#[builder(field(ty = "char", build = "self.base.try_into()?"))]
pub base: AllowedAGCTN,
pub is_strand_plus: bool,
#[builder(field(ty = "String", build = "self.mod_code.parse()?"))]
pub mod_code: ModChar,
#[builder(field(
ty = "Vec<u32>",
build = "self.win.iter().map(|&x| NonZeroU32::new(x).ok_or(Error::Zero(\"cannot use zero-\
sized windows in builder\".to_owned()))).collect::<Result<Vec<NonZeroU32>,_>>()?"
))]
pub win: Vec<NonZeroU32>,
#[builder(field(
ty = "Vec<(f32, f32)>",
build = "self.mod_range.iter().map(|&x| OF::try_from(x)).collect::<Result<Vec<OF>, _>>()?"
))]
pub mod_range: Vec<OrdPair<F32Bw0and1>>,
}
#[derive(Builder, Debug, Clone, Serialize, Deserialize)]
#[serde(default)]
#[builder(default, build_fn(error = "Error"))]
#[non_exhaustive]
pub struct Contig {
#[builder(setter(into))]
pub name: String,
#[builder(field(ty = "String", build = "self.seq.parse()?"))]
pub seq: DNARestrictive,
}
impl GetDNARestrictive for Contig {
fn get_dna_restrictive(&self) -> &DNARestrictive {
&self.seq
}
}
impl Default for Contig {
fn default() -> Self {
Self {
name: String::new(),
seq: DNARestrictive::from_str("A").expect("valid default DNA"),
}
}
}
impl Default for ContigConfig {
fn default() -> Self {
Self {
number: NonZeroU32::new(1).unwrap(),
len_range: OrdPair::new(NonZeroU64::new(1).unwrap(), NonZeroU64::new(1).unwrap())
.unwrap(),
repeated_seq: None,
}
}
}
impl Default for ReadConfig {
fn default() -> Self {
Self {
number: NonZeroU32::new(1).unwrap(),
mapq_range: OrdPair::new(0, 0).unwrap(),
base_qual_range: OrdPair::new(0, 0).unwrap(),
len_range: ord_pair_f32_bw0and1!(0.0, 0.0),
barcode: None,
delete: None,
insert_middle: None,
mismatch: None,
mods: Vec::new(),
}
}
}
impl Default for ModConfig {
fn default() -> Self {
Self {
base: AllowedAGCTN::C,
is_strand_plus: true,
mod_code: ModChar::new('m'),
win: vec![NonZeroU32::new(1).unwrap()],
mod_range: vec![ord_pair_f32_bw0and1!(0.0, 1.0)],
}
}
}
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct PerfectSeqMatchToNot {
seq: Vec<u8>,
barcode: Option<DNARestrictive>,
delete: Option<OrdPair<F32Bw0and1>>,
insert_middle: Option<DNARestrictive>,
mismatch: Option<F32Bw0and1>,
}
impl PerfectSeqMatchToNot {
pub fn seq(seq: Vec<u8>) -> Result<Self, Error> {
if seq.is_empty() {
return Err(Error::InvalidState(
"Sequence length is 0; cannot create PerfectSeqMatchToNot with empty sequence"
.into(),
));
}
Ok(Self {
seq,
barcode: None,
delete: None,
insert_middle: None,
mismatch: None,
})
}
#[must_use]
pub fn barcode(mut self, barcode: DNARestrictive) -> Self {
self.barcode = Some(barcode);
self
}
#[must_use]
pub fn delete(mut self, delete: OrdPair<F32Bw0and1>) -> Self {
self.delete = Some(delete);
self
}
#[must_use]
pub fn insert_middle(mut self, insert: DNARestrictive) -> Self {
self.insert_middle = Some(insert);
self
}
#[must_use]
pub fn mismatch(mut self, mismatch: F32Bw0and1) -> Self {
self.mismatch = Some(mismatch);
self
}
#[expect(
clippy::cast_precision_loss,
clippy::cast_possible_truncation,
clippy::cast_sign_loss,
reason = "Casting and arithmetic needed for fractional calculations on sequence positions"
)]
#[expect(
clippy::too_many_lines,
reason = "performs many steps to alter sequence (indel, mismatch, barcode)"
)]
pub fn build<R: Rng>(
self,
read_state: ReadState,
rng: &mut R,
) -> Result<(Vec<u8>, Option<CigarString>), Error> {
let seq = self.seq;
let mut bases_and_ops: Vec<(u8, u8)> = seq.iter().map(|&base| (base, b'M')).collect();
if let Some(mismatch_frac) = self.mismatch {
let num_mismatches =
((bases_and_ops.len() as f32) * mismatch_frac.val()).round() as usize;
let indices_to_mutate: Vec<usize> =
(0..bases_and_ops.len()).choose_multiple(rng, num_mismatches);
for idx in indices_to_mutate {
let item = bases_and_ops
.get_mut(idx)
.expect("idx is within bounds from choose_multiple");
let current_base = item.0;
let new_base = match current_base {
v @ (b'A' | b'C' | b'G' | b'T') => {
loop {
let candidate: AllowedAGCTN = rng.random();
let candidate_u8: u8 = candidate.into();
if candidate_u8 != v && candidate_u8 != b'N' {
break candidate_u8;
}
}
}
other => other, };
item.0 = new_base;
}
}
if let Some(delete_range) = self.delete {
let start = ((bases_and_ops.len() as f32) * delete_range.low().val()).round() as usize;
let end_raw =
((bases_and_ops.len() as f32) * delete_range.high().val()).round() as usize;
let end = end_raw.min(bases_and_ops.len());
if let Some(slice) = bases_and_ops.get_mut(start..end) {
for item in slice {
item.1 = b'D';
}
}
}
if let Some(insert_seq) = self.insert_middle {
let middle = bases_and_ops.len() / 2;
let insert_bases = insert_seq.get_dna_restrictive().get();
let insertions: Vec<(u8, u8)> = insert_bases.iter().map(|&b| (b, b'I')).collect();
drop(
bases_and_ops
.splice(middle..middle, insertions)
.collect::<Vec<_>>(),
);
}
if let Some(barcode) = self.barcode {
let barcoded_seq = add_barcode(&[], barcode.clone(), read_state);
let barcode_len = barcode.get_dna_restrictive().get().len();
let (start_bc, end_bc) = barcoded_seq.split_at(barcode_len);
let bc_tuples_start: Vec<(u8, u8)> = start_bc.iter().map(|&b| (b, b'S')).collect();
drop(
bases_and_ops
.splice(0..0, bc_tuples_start)
.collect::<Vec<_>>(),
);
let bc_tuples_end: Vec<(u8, u8)> = end_bc.iter().map(|&b| (b, b'S')).collect();
bases_and_ops.extend(bc_tuples_end);
}
let cigar_string = CigarString(
bases_and_ops
.iter()
.map(|&(_, op)| op)
.fold(Vec::<(u8, u32)>::new(), |mut acc, op| {
match acc.last_mut() {
Some(&mut (last_op, ref mut count)) if last_op == op => {
*count = count.saturating_add(1);
}
_ => acc.push((op, 1u32)),
}
acc
})
.into_iter()
.map(|(op, count)| match op {
b'M' => Cigar::Match(count),
b'I' => Cigar::Ins(count),
b'D' => Cigar::Del(count),
b'S' => Cigar::SoftClip(count),
_ => unreachable!("Invalid CIGAR operation: {op}"),
})
.collect::<Vec<_>>(),
);
let final_seq: Vec<u8> = bases_and_ops
.iter()
.copied()
.filter_map(|item| (item.1 != b'D').then_some(item.0))
.collect();
let first_ok = (cigar_string.0.as_slice().first().map(|x| x.char()) == Some('M'))
|| (cigar_string
.0
.as_slice()
.first_chunk::<2>()
.map(|&x| [x[0].char(), x[1].char()])
== Some(['S', 'M']));
let last_ok = (cigar_string.0.as_slice().last().map(|x| x.char()) == Some('M'))
|| (cigar_string
.0
.as_slice()
.last_chunk::<2>()
.map(|&x| [x[0].char(), x[1].char()])
== Some(['M', 'S']));
if first_ok && last_ok {
Ok((
final_seq,
(!read_state.is_unmapped()).then_some(cigar_string),
))
} else {
Err(Error::SimulateDNASeqCIGAREndProblem(
"too few bp or insertion/deletion close to end".to_owned(),
))
}
}
}
pub fn generate_random_dna_modification<R: Rng, S: GetDNARestrictive>(
mod_configs: &[ModConfig],
seq: &S,
rng: &mut R,
) -> (String, Vec<u8>) {
let seq_bytes = seq.get_dna_restrictive().get();
let mut mm_str = String::new();
let mut ml_vec: Vec<u8> = Vec::with_capacity(seq_bytes.len());
for mod_config in mod_configs {
let base = u8::from(mod_config.base);
let strand = if mod_config.is_strand_plus { '+' } else { '-' };
let mod_code = mod_config.mod_code;
let mut count = if base == b'N' {
seq_bytes.len()
} else {
seq_bytes
.iter()
.zip(iter::repeat(&base))
.filter(|&(&a, &b)| a == b)
.count()
};
let mut output: Vec<u8> = Vec::with_capacity(count);
for k in mod_config
.win
.iter()
.cycle()
.zip(mod_config.mod_range.iter().cycle())
{
let low = u8::from(k.1.low());
let high = u8::from(k.1.high());
#[expect(
clippy::redundant_else,
reason = "so that the clippy arithmetic lint fits better with the code"
)]
#[expect(
clippy::arithmetic_side_effects,
reason = "we loop only if count > 0 and catch count == 0, thus count doesn't underflow"
)]
if count == 0 {
break;
} else {
for _ in 0..k.0.get() {
output.push(rng.random_range(low..=high));
count -= 1;
if count == 0 {
break;
}
}
}
}
if !output.is_empty() {
let mod_len = output.len();
ml_vec.append(&mut output);
mm_str += format!(
"{}{}{}?,{};",
base as char,
strand,
mod_code,
join(vec![0; mod_len], ",")
)
.as_str();
}
}
ml_vec.shrink_to_fit();
(mm_str, ml_vec)
}
pub fn generate_random_dna_sequence<R: Rng>(length: NonZeroU64, rng: &mut R) -> Vec<u8> {
const DNA_BASES: [u8; 4] = [b'A', b'C', b'G', b'T'];
iter::repeat_with(|| {
*DNA_BASES
.get(rng.random_range(0..4))
.expect("random_range(0..4) is always 0-3")
})
.take(usize::try_from(length.get()).expect("sequence length exceeds usize::MAX"))
.collect()
}
#[must_use]
#[expect(
clippy::needless_pass_by_value,
reason = "barcode by value allows passing `&'static str` easily (I think, may fix in future). \
If you don't know what this means, ignore this; this is _not_ a bug."
)]
pub fn add_barcode(read_seq: &[u8], barcode: DNARestrictive, read_state: ReadState) -> Vec<u8> {
let bc_bytes = barcode.get();
match read_state {
ReadState::PrimaryFwd
| ReadState::SecondaryFwd
| ReadState::SupplementaryFwd
| ReadState::Unmapped => {
let revcomp_bc = bio::alphabets::dna::revcomp(bc_bytes);
[bc_bytes, read_seq, &revcomp_bc[..]].concat()
}
ReadState::PrimaryRev | ReadState::SecondaryRev | ReadState::SupplementaryRev => {
let comp_bc: Vec<u8> = bc_bytes
.iter()
.map(|&b| bio::alphabets::dna::complement(b))
.collect();
let rev_bc: Vec<u8> = bc_bytes.iter().copied().rev().collect();
[&comp_bc[..], read_seq, &rev_bc[..]].concat()
}
}
}
#[expect(
clippy::missing_panics_doc,
reason = "no length number or DNA conversion problems expected"
)]
pub fn generate_contigs_denovo<R: Rng>(
contig_number: NonZeroU32,
len_range: OrdPair<NonZeroU64>,
rng: &mut R,
) -> Vec<Contig> {
(0..contig_number.get())
.map(|i| {
let length = rng.random_range(len_range.low().get()..=len_range.high().get());
let seq_bytes =
generate_random_dna_sequence(NonZeroU64::try_from(length).expect("no error"), rng);
let seq_str = String::from_utf8(seq_bytes).expect("valid DNA sequence");
ContigBuilder::default()
.name(format!("contig_{i:05}"))
.seq(seq_str)
.build()
.expect("no error")
})
.collect()
}
#[expect(
clippy::missing_panics_doc,
reason = "no length number or DNA conversion problems expected"
)]
pub fn generate_contigs_denovo_repeated_seq<R: Rng, S: GetDNARestrictive>(
contig_number: NonZeroU32,
len_range: OrdPair<NonZeroU64>,
seq: &S,
rng: &mut R,
) -> Vec<Contig> {
(0..contig_number.get())
.map(|i| {
let length = rng.random_range(len_range.low().get()..=len_range.high().get());
let contig_seq: Vec<u8> = seq
.get_dna_restrictive()
.get()
.iter()
.cycle()
.take(usize::try_from(length).expect("number conversion error"))
.copied()
.collect();
let seq_str =
String::from_utf8(contig_seq).expect("valid UTF-8 from repeated DNA sequence");
ContigBuilder::default()
.name(format!("contig_{i:05}"))
.seq(seq_str)
.build()
.expect("valid DNA sequence from repeated pattern")
})
.collect()
}
#[expect(
clippy::missing_panics_doc,
clippy::too_many_lines,
reason = "number conversion errors or mis-generation of DNA bases are unlikely \
as we generate DNA sequences ourselves here, and genomic data are unlikely \
to exceed ~2^63 bp or have ~2^32 contigs. Function length is acceptable for read generation"
)]
#[expect(
clippy::cast_possible_truncation,
reason = "read length calculated as a fraction of contig length, managed with trunc()"
)]
pub fn generate_reads_denovo<R: Rng, S: GetDNARestrictive>(
contigs: &[S],
read_config: &ReadConfig,
read_group: &str,
rng: &mut R,
) -> Result<Vec<bam::Record>, Error> {
if contigs.is_empty() {
return Err(Error::UnavailableData("no contigs found".to_owned()));
}
let mut reads = Vec::new();
for _ in 0..read_config.number.get() {
let contig_idx = rng.random_range(0..contigs.len());
let contig = contigs
.get(contig_idx)
.expect("contig_idx is within contigs range");
let contig_len = contig.get_dna_restrictive().get().len() as u64;
#[expect(
clippy::cast_precision_loss,
reason = "u64->f32 causes precision loss but we are fine with this for now"
)]
#[expect(
clippy::cast_sign_loss,
reason = "these are positive numbers so no problem"
)]
let read_len = ((rng
.random_range(read_config.len_range.low().val()..=read_config.len_range.high().val())
* contig_len as f32)
.trunc() as u64)
.min(contig_len);
let start_pos = rng.random_range(0..=(contig_len.saturating_sub(read_len)));
let random_state: ReadState = rng.random();
let end_pos = usize::try_from(start_pos.checked_add(read_len).expect("u64 overflow"))
.expect("number conversion error");
let (read_seq, cigar) = {
let start_idx = usize::try_from(start_pos).expect("number conversion error");
let temp_seq = contig
.get_dna_restrictive()
.get()
.get(start_idx..end_pos)
.expect("start_idx and end_pos are within contig bounds")
.to_vec();
let mut builder = PerfectSeqMatchToNot::seq(temp_seq)?;
if let Some(barcode) = read_config.barcode.clone() {
builder = builder.barcode(barcode);
}
if let Some(delete) = read_config.delete {
builder = builder.delete(delete);
}
if let Some(insert_middle) = read_config.insert_middle.clone() {
builder = builder.insert_middle(insert_middle);
}
if let Some(mismatch) = read_config.mismatch {
builder = builder.mismatch(mismatch);
}
builder.build(random_state, rng)?
};
let qual: Vec<u8> = iter::repeat_with(|| {
rng.random_range(RangeInclusive::from(read_config.base_qual_range))
})
.take(read_seq.len())
.collect();
let seq: DNARestrictive;
let (mod_prob_mm_tag, mod_pos_ml_tag) = generate_random_dna_modification(
&read_config.mods,
match random_state.strand() {
'.' | '+' => {
seq = DNARestrictive::from_str(str::from_utf8(&read_seq).expect("no error"))
.expect("no error");
&seq
}
'-' => {
seq = DNARestrictive::from_str(
str::from_utf8(&bio::alphabets::dna::revcomp(&read_seq)).expect("no error"),
)
.expect("no error");
&seq
}
_ => unreachable!("`strand` is supposed to return one of +/-/."),
},
rng,
);
let record = {
let mut record = bam::Record::new();
let uuid = {
let mut bytes = [0u8; 16];
rng.fill(&mut bytes);
uuid::Builder::from_random_bytes(bytes).into_uuid()
};
let qname = format!("{read_group}.{uuid}").into_bytes();
record.unset_flags();
record.set_flags(u16::from(random_state));
if random_state == ReadState::Unmapped {
record.set(&qname, None, &read_seq, &qual);
record.set_mapq(255);
record.set_tid(-1);
record.set_pos(-1);
} else {
let mapq = rng.random_range(RangeInclusive::from(read_config.mapq_range));
record.set(&qname, cigar.as_ref(), &read_seq, &qual);
record.set_mapq(mapq);
record.set_tid(i32::try_from(contig_idx).expect("number conversion error"));
record.set_pos(i64::try_from(start_pos).expect("number conversion error"));
}
record.set_mpos(-1);
record.set_mtid(-1);
record.push_aux(b"RG", Aux::String(read_group))?;
if !mod_prob_mm_tag.is_empty() && !mod_pos_ml_tag.is_empty() {
record.push_aux(b"MM", Aux::String(mod_prob_mm_tag.as_str()))?;
record.push_aux(b"ML", Aux::ArrayU8((&mod_pos_ml_tag).into()))?;
}
record
};
reads.push(record);
}
Ok(reads)
}
pub fn run<F>(
config: SimulationConfig,
bam_output_path: &F,
fasta_output_path: &F,
) -> Result<(), Error>
where
F: AsRef<Path> + ?Sized,
{
if let Some(s) = config.seed {
let mut rng = StdRng::seed_from_u64(s);
run_inner(config, bam_output_path, fasta_output_path, &mut rng)
} else {
let mut rng = rand::rng();
run_inner(config, bam_output_path, fasta_output_path, &mut rng)
}
}
fn run_inner<F, R: Rng>(
config: SimulationConfig,
bam_output_path: &F,
fasta_output_path: &F,
rng: &mut R,
) -> Result<(), Error>
where
F: AsRef<Path> + ?Sized,
{
let contigs = match config.contigs.repeated_seq {
Some(seq) => generate_contigs_denovo_repeated_seq(
config.contigs.number,
config.contigs.len_range,
&seq,
rng,
),
None => generate_contigs_denovo(config.contigs.number, config.contigs.len_range, rng),
};
let read_groups: Vec<String> = (0..config.reads.len()).map(|k| k.to_string()).collect();
let reads = {
let mut temp_reads = Vec::new();
for k in config.reads.into_iter().zip(read_groups.clone()) {
temp_reads.append(&mut generate_reads_denovo(&contigs, &k.0, &k.1, rng)?);
}
temp_reads.sort_by_key(|k| (k.is_unmapped(), k.tid(), k.pos(), k.is_reverse()));
temp_reads
};
write_bam_denovo(
reads,
contigs
.iter()
.map(|k| (k.name.clone(), k.get_dna_restrictive().get().len())),
read_groups,
vec![String::from("simulated BAM file, not real data")],
bam_output_path,
)?;
write_fasta(
contigs.into_iter().map(|k| (k.name.clone(), k)),
fasta_output_path,
)?;
Ok(())
}
#[derive(Debug, Serialize, Deserialize)]
pub struct TempBamSimulation {
bam_path: String,
fasta_path: String,
}
impl TempBamSimulation {
pub fn new(config: SimulationConfig) -> Result<Self, Error> {
let temp_dir = std::env::temp_dir();
let bam_path = temp_dir.join(format!("{}.bam", Uuid::new_v4()));
let fasta_path = temp_dir.join(format!("{}.fa", Uuid::new_v4()));
run(config, &bam_path, &fasta_path)?;
Ok(Self {
bam_path: bam_path.to_string_lossy().to_string(),
fasta_path: fasta_path.to_string_lossy().to_string(),
})
}
#[must_use]
pub fn bam_path(&self) -> &str {
&self.bam_path
}
#[must_use]
pub fn fasta_path(&self) -> &str {
&self.fasta_path
}
}
impl Drop for TempBamSimulation {
fn drop(&mut self) {
drop(std::fs::remove_file(&self.bam_path));
drop(std::fs::remove_file(&self.fasta_path));
drop(std::fs::remove_file(&(self.bam_path.clone() + ".bai")));
}
}
#[cfg(test)]
mod random_dna_generation_test {
use super::*;
#[test]
fn generate_random_dna_sequence_works() {
let seq = generate_random_dna_sequence(NonZeroU64::new(100).unwrap(), &mut rand::rng());
assert_eq!(seq.len(), 100);
for base in seq {
assert!([b'A', b'C', b'G', b'T'].contains(&base));
}
}
}
#[cfg(test)]
mod seeded_simulation_tests {
use super::*;
use rust_htslib::bam::Read as _;
fn run_seeded_simulation(config_json: &str) -> Result<Vec<bam::Record>, Error> {
let config: SimulationConfig = serde_json::from_str(config_json)?;
let temp = TempBamSimulation::new(config)?;
let mut reader = crate::nanalogue_bam_reader(temp.bam_path())?;
reader
.records()
.collect::<Result<Vec<_>, _>>()
.map_err(Error::from)
}
#[test]
fn seeded_simulation_is_reproducible() -> Result<(), Error> {
let config_json = r#"{
"contigs": { "number": 2, "len_range": [500, 1000],
"repeated_seq": "ATCGAATT" },
"reads": [{ "number": 50, "mapq_range": [10, 20],
"base_qual_range": [10, 20], "len_range": [0.2, 0.5],
"mods": [
{ "base": "T", "is_strand_plus": true, "mod_code": "T",
"win": [4, 5], "mod_range": [[0.1, 0.2], [0.3, 0.4]] },
{ "base": "C", "is_strand_plus": true, "mod_code": "m",
"win": [3], "mod_range": [[0.6, 0.9]] }
] }],
"seed": 12345
}"#;
let run_a = run_seeded_simulation(config_json)?;
let run_b = run_seeded_simulation(config_json)?;
assert_eq!(run_a, run_b, "same seed must produce identical BAM records");
Ok(())
}
}
#[cfg(test)]
mod read_generation_no_mods_tests {
use super::*;
use rust_htslib::bam::Read as _;
#[test]
fn generate_reads_denovo_no_mods_works() {
let mut rng = rand::rng();
let contigs = vec![
ContigBuilder::default()
.name("contig_0")
.seq("ACGTACGTACGTACGTACGT".into())
.build()
.unwrap(),
ContigBuilder::default()
.name("contig_1")
.seq("TGCATGCATGCATGCATGCA".into())
.build()
.unwrap(),
];
let config = ReadConfigBuilder::default()
.number(10)
.mapq_range((10, 20))
.base_qual_range((30, 50))
.len_range((0.2, 0.8))
.build()
.unwrap();
let reads = generate_reads_denovo(&contigs, &config, "ph", &mut rng).unwrap();
assert_eq!(reads.len(), 10);
for read in &reads {
if !read.is_unmapped() {
assert!((10..=20).contains(&read.mapq()));
assert!((0..2).contains(&read.tid()));
}
assert!((4..=16).contains(&read.seq_len()));
assert!(read.qual().iter().all(|x| (30..=50).contains(x)));
let _: rust_htslib::errors::Error = read.aux(b"MM").unwrap_err();
let _: rust_htslib::errors::Error = read.aux(b"ML").unwrap_err();
assert_eq!(read.qname().get(0..3).unwrap(), *b"ph.");
let _: Uuid =
Uuid::parse_str(str::from_utf8(read.qname().get(3..).unwrap()).unwrap()).unwrap();
}
}
#[test]
fn full_bam_generation() {
let config_json = r#"{
"contigs": {
"number": 2,
"len_range": [100, 200]
},
"reads": [{
"number": 1000,
"mapq_range": [10, 20],
"base_qual_range": [10, 20],
"len_range": [0.1, 0.8]
}]
}"#;
let temp_dir = std::env::temp_dir();
let bam_path = temp_dir.join(format!("{}.bam", Uuid::new_v4()));
let fasta_path = temp_dir.join(format!("{}.fa", Uuid::new_v4()));
let config: SimulationConfig = serde_json::from_str(config_json).unwrap();
run(config, &bam_path, &fasta_path).unwrap();
assert!(bam_path.exists());
assert!(fasta_path.exists());
let mut reader = bam::Reader::from_path(&bam_path).unwrap();
let header = reader.header();
assert_eq!(header.target_count(), 2);
assert_eq!(reader.records().count(), 1000);
let bai_path = bam_path.with_extension("bam.bai");
std::fs::remove_file(&bam_path).unwrap();
std::fs::remove_file(fasta_path).unwrap();
std::fs::remove_file(bai_path).unwrap();
}
#[test]
fn temp_bam_simulation_struct_no_mods() {
let config_json = r#"{
"contigs": {
"number": 2,
"len_range": [100, 200]
},
"reads": [{
"number": 1000,
"mapq_range": [10, 20],
"base_qual_range": [10, 20],
"len_range": [0.1, 0.8]
}]
}"#;
let config: SimulationConfig = serde_json::from_str(config_json).unwrap();
let sim = TempBamSimulation::new(config).unwrap();
assert!(Path::new(sim.bam_path()).exists());
assert!(Path::new(sim.fasta_path()).exists());
let mut reader = bam::Reader::from_path(sim.bam_path()).unwrap();
let header = reader.header();
assert_eq!(header.target_count(), 2);
assert_eq!(reader.records().count(), 1000);
}
#[test]
fn temp_bam_simulation_cleanup() {
let config_json = r#"{
"contigs": {
"number": 1,
"len_range": [50, 50]
},
"reads": [{
"number": 10,
"len_range": [0.5, 0.5]
}]
}"#;
let bam_path: String;
let fasta_path: String;
{
let config: SimulationConfig = serde_json::from_str(config_json).unwrap();
let sim = TempBamSimulation::new(config).unwrap();
bam_path = sim.bam_path().to_string();
fasta_path = sim.fasta_path().to_string();
assert!(Path::new(&bam_path).exists());
assert!(Path::new(&fasta_path).exists());
}
assert!(!Path::new(&bam_path).exists());
assert!(!Path::new(&fasta_path).exists());
}
#[test]
fn generate_reads_denovo_empty_contigs_error() {
let contigs: Vec<Contig> = vec![];
let config = ReadConfig::default();
let mut rng = rand::rng();
let result = generate_reads_denovo(&contigs, &config, "test", &mut rng);
assert!(matches!(result, Err(Error::UnavailableData(_))));
}
#[test]
fn generate_reads_denovo_zero_length_error() {
let contigs = [ContigBuilder::default()
.name("tiny")
.seq("ACGT".into())
.build()
.unwrap()];
let config = ReadConfigBuilder::default()
.number(1)
.len_range((0.0, 0.0))
.build()
.unwrap();
let mut rng = rand::rng();
let result = generate_reads_denovo(&contigs, &config, "test", &mut rng);
assert!(matches!(result, Err(Error::InvalidState(_))));
}
#[test]
fn run_empty_reads_error() {
let invalid_json = r#"{ "reads": [] }"#; let temp_dir = std::env::temp_dir();
let bam_path = temp_dir.join(format!("{}.bam", Uuid::new_v4()));
let fasta_path = temp_dir.join(format!("{}.fa", Uuid::new_v4()));
let config: SimulationConfig = serde_json::from_str(invalid_json).unwrap();
let result = run(config, &bam_path, &fasta_path);
drop(result);
let bai_path = bam_path.with_extension("bam.bai");
drop(std::fs::remove_file(&bam_path));
drop(std::fs::remove_file(&fasta_path));
drop(std::fs::remove_file(&bai_path));
}
#[test]
fn multiple_read_groups_work() {
let config_json = r#"{
"contigs": {
"number": 2,
"len_range": [100, 200]
},
"reads": [
{
"number": 50,
"mapq_range": [10, 20],
"base_qual_range": [20, 30],
"len_range": [0.1, 0.5]
},
{
"number": 75,
"mapq_range": [30, 40],
"base_qual_range": [25, 35],
"len_range": [0.3, 0.7]
},
{
"number": 25,
"mapq_range": [5, 15],
"base_qual_range": [15, 25],
"len_range": [0.2, 0.6]
}
]
}"#;
let config: SimulationConfig = serde_json::from_str(config_json).unwrap();
let sim = TempBamSimulation::new(config).unwrap();
let mut reader = bam::Reader::from_path(sim.bam_path()).unwrap();
assert_eq!(reader.records().count(), 150);
let mut reader2 = bam::Reader::from_path(sim.bam_path()).unwrap();
let header = reader2.header();
let header_text = std::str::from_utf8(header.as_bytes()).unwrap();
assert!(header_text.contains("@RG\tID:0"));
assert!(header_text.contains("@RG\tID:1"));
assert!(header_text.contains("@RG\tID:2"));
let mut rg_counts = [0, 0, 0];
for record in reader2.records() {
let read = record.unwrap();
let qname = std::str::from_utf8(read.qname()).unwrap();
if let Ok(Aux::String(rg)) = read.aux(b"RG") {
let parts: Vec<&str> = qname.split('.').collect();
assert_eq!(parts.len(), 2, "Read name should be in format 'n.uuid'");
let read_group_prefix =
parts.first().expect("parts should have at least 1 element");
let uuid_part = parts.get(1).expect("parts should have 2 elements");
assert_eq!(
*read_group_prefix, rg,
"Read name prefix should match read group"
);
assert!(
Uuid::parse_str(uuid_part).is_ok(),
"Read name should contain a valid UUID after the dot"
);
match rg {
"0" => *rg_counts.get_mut(0).unwrap() += 1,
"1" => *rg_counts.get_mut(1).unwrap() += 1,
"2" => *rg_counts.get_mut(2).unwrap() += 1,
unexpected => unreachable!("Unexpected read group: {unexpected}"),
}
}
}
assert_eq!(rg_counts[0], 50);
assert_eq!(rg_counts[1], 75);
assert_eq!(rg_counts[2], 25);
}
#[expect(
clippy::cast_sign_loss,
reason = "tid and pos are non-negative in valid BAM"
)]
#[expect(
clippy::cast_possible_truncation,
reason = "tid fits in usize for normal BAM files"
)]
#[test]
fn read_sequences_match_contigs() {
let contigs = vec![
ContigBuilder::default()
.name("chr1")
.seq("ACGTACGTACGTACGTACGTACGTACGTACGT".into())
.build()
.unwrap(),
ContigBuilder::default()
.name("chr2")
.seq("TGCATGCATGCATGCATGCATGCATGCATGCA".into())
.build()
.unwrap(),
];
let read_config = ReadConfigBuilder::default()
.number(50)
.len_range((0.2, 0.8))
.build()
.unwrap();
let mut rng = rand::rng();
let reads = generate_reads_denovo(&contigs, &read_config, "test", &mut rng).unwrap();
let mut validated_count = 0;
for read in &reads {
if !read.is_unmapped() && !read.is_reverse() {
let cigar = read.cigar();
let has_soft_clip = cigar.iter().any(|op| matches!(op, Cigar::SoftClip(_)));
if has_soft_clip {
continue;
}
let tid = read.tid() as usize;
let pos = read.pos() as usize;
let contig = contigs.get(tid).expect("valid tid");
let contig_seq = contig.get_dna_restrictive().get();
let mut aligned_len = 0usize;
for op in &cigar {
if let Cigar::Match(len) = *op {
aligned_len = aligned_len.checked_add(len as usize).expect("overflow");
}
}
let read_seq = read.seq().as_bytes();
let expected_seq = contig_seq
.get(pos..pos.checked_add(aligned_len).expect("overflow"))
.expect("contig subsequence exists");
assert_eq!(
read_seq, expected_seq,
"Forward read sequence should exactly match contig substring"
);
validated_count += 1;
}
}
assert!(
validated_count > 0,
"Should have validated at least some forward reads"
);
}
#[test]
fn different_read_states_work() {
let config_json = r#"{
"contigs": {
"number": 1,
"len_range": [200, 200]
},
"reads": [{
"number": 1000,
"mapq_range": [10, 20],
"base_qual_range": [20, 30],
"len_range": [0.2, 0.8]
}]
}"#;
let config: SimulationConfig = serde_json::from_str(config_json).unwrap();
let sim = TempBamSimulation::new(config).unwrap();
let mut reader = bam::Reader::from_path(sim.bam_path()).unwrap();
let mut has_unmapped = false;
let mut has_forward = false;
let mut has_reverse = false;
let mut has_secondary = false;
let mut has_supplementary = false;
for record in reader.records() {
let read = record.unwrap();
if read.is_unmapped() {
has_unmapped = true;
assert_eq!(read.tid(), -1);
assert_eq!(read.pos(), -1);
assert_eq!(read.mapq(), 255);
} else {
if read.is_reverse() {
has_reverse = true;
} else {
has_forward = true;
}
if read.is_secondary() {
has_secondary = true;
}
if read.is_supplementary() {
has_supplementary = true;
}
}
}
assert!(has_unmapped, "Should have unmapped reads");
assert!(has_forward, "Should have forward reads");
assert!(has_reverse, "Should have reverse reads");
assert!(has_secondary, "Should have secondary reads");
assert!(has_supplementary, "Should have supplementary reads");
}
#[test]
fn read_length_full_contig_works() {
let contigs = vec![
ContigBuilder::default()
.name("chr1")
.seq("ACGTACGTACGTACGTACGTACGTACGTACGT".into())
.build()
.unwrap(),
];
let config_full_length = ReadConfigBuilder::default()
.number(20)
.len_range((1.0, 1.0))
.build()
.unwrap();
let mut rng = rand::rng();
let reads_full =
generate_reads_denovo(&contigs, &config_full_length, "test", &mut rng).unwrap();
for read in &reads_full {
if !read.is_unmapped() {
assert_eq!(read.seq_len(), 32, "Read should be full contig length");
}
}
}
#[test]
fn read_length_small_fraction_works() {
let contigs = [ContigBuilder::default()
.name("chr1")
.seq("ACGT".repeat(50))
.build()
.unwrap()];
let config_small = ReadConfigBuilder::default()
.number(20)
.len_range((0.01, 0.05))
.build()
.unwrap();
let mut rng = rand::rng();
let reads_small = generate_reads_denovo(&contigs, &config_small, "test", &mut rng).unwrap();
for read in &reads_small {
if !read.is_unmapped() {
assert!(read.seq_len() <= 10, "Read should be very small");
assert!(read.seq_len() >= 2, "Read should be at least 2bp");
}
}
}
}
#[cfg(test)]
mod read_generation_barcodes {
use super::*;
use rust_htslib::bam::Read as _;
#[expect(
clippy::shadow_unrelated,
reason = "repetition is fine; each block is clearly separated"
)]
#[test]
fn add_barcode_works() {
let read_seq = b"GGGGGGGG".to_vec();
let barcode = DNARestrictive::from_str("ACGTAA").unwrap();
let result = add_barcode(&read_seq, barcode.clone(), ReadState::PrimaryFwd);
assert_eq!(result, b"ACGTAAGGGGGGGGTTACGT".to_vec());
let result = add_barcode(&read_seq, barcode, ReadState::PrimaryRev);
assert_eq!(result, b"TGCATTGGGGGGGGAATGCA".to_vec());
}
#[test]
fn cigar_strings_valid_with_or_without_barcodes() {
let contigs = [ContigBuilder::default()
.name("chr1")
.seq("ACGTACGTACGTACGTACGTACGTACGTACGT".into())
.build()
.unwrap()];
let config_no_barcode = ReadConfigBuilder::default()
.number(20)
.len_range((0.3, 0.7))
.build()
.unwrap();
let mut rng = rand::rng();
let reads_no_barcode =
generate_reads_denovo(&contigs, &config_no_barcode, "test", &mut rng).unwrap();
for read in &reads_no_barcode {
if !read.is_unmapped() {
let cigar = read.cigar();
assert_eq!(cigar.len(), 1);
assert!(matches!(cigar.first().unwrap(), Cigar::Match(_)));
let cigar_len: u32 = cigar
.iter()
.map(|op| match *op {
Cigar::Match(len) | Cigar::SoftClip(len) => len,
Cigar::Ins(_)
| Cigar::Del(_)
| Cigar::RefSkip(_)
| Cigar::HardClip(_)
| Cigar::Pad(_)
| Cigar::Equal(_)
| Cigar::Diff(_) => unreachable!(),
})
.sum();
assert_eq!(cigar_len as usize, read.seq_len());
}
}
let config_with_barcode = ReadConfigBuilder::default()
.number(20)
.len_range((0.3, 0.7))
.barcode("ACGTAA".into())
.build()
.unwrap();
let reads_with_barcode =
generate_reads_denovo(&contigs, &config_with_barcode, "test", &mut rng).unwrap();
for read in &reads_with_barcode {
if !read.is_unmapped() {
let cigar = read.cigar();
assert_eq!(cigar.len(), 3);
assert!(matches!(cigar.first().unwrap(), Cigar::SoftClip(6))); assert!(matches!(cigar.get(1).unwrap(), Cigar::Match(_)));
assert!(matches!(cigar.get(2).unwrap(), Cigar::SoftClip(6)));
let cigar_len: u32 = cigar
.iter()
.map(|op| match *op {
Cigar::Match(len) | Cigar::SoftClip(len) => len,
Cigar::Ins(_)
| Cigar::Del(_)
| Cigar::RefSkip(_)
| Cigar::HardClip(_)
| Cigar::Pad(_)
| Cigar::Equal(_)
| Cigar::Diff(_) => unreachable!(),
})
.sum();
assert_eq!(cigar_len as usize, read.seq_len());
}
}
}
#[test]
fn generate_reads_denovo_with_barcode_works() {
let config_json = r#"{
"contigs": {
"number": 1,
"len_range": [20, 20]
},
"reads": [{
"number": 10,
"len_range": [0.2, 0.8],
"barcode": "GTACGG"
}]
}"#;
let config: SimulationConfig = serde_json::from_str(config_json).unwrap();
let sim = TempBamSimulation::new(config).unwrap();
let mut reader = bam::Reader::from_path(sim.bam_path()).unwrap();
for record in reader.records() {
let read = record.unwrap();
let seq = read.seq().as_bytes();
let seq_len = seq.len();
if read.is_reverse() {
assert_eq!(seq.get(..6).expect("seq has at least 6 bases"), b"CATGCC");
assert_eq!(
seq.get(seq_len.saturating_sub(6)..)
.expect("seq has at least 6 bases"),
b"GGCATG"
);
} else {
assert_eq!(seq.get(..6).expect("seq has at least 6 bases"), b"GTACGG");
assert_eq!(
seq.get(seq_len.saturating_sub(6)..)
.expect("seq has at least 6 bases"),
b"CCGTAC"
);
}
}
}
}
#[cfg(test)]
mod read_generation_with_mods_tests {
use super::*;
use crate::{CurrRead, ThresholdState, curr_reads_to_dataframe};
use rust_htslib::bam::Read as _;
#[test]
fn generate_reads_denovo_with_mods_works() {
let mut rng = rand::rng();
let contigs = vec![
ContigBuilder::default()
.name("contig_0")
.seq("ACGTACGTACGTACGTACGT".into())
.build()
.unwrap(),
ContigBuilder::default()
.name("contig_1")
.seq("TGCATGCATGCATGCATGCA".into())
.build()
.unwrap(),
];
let config = ReadConfigBuilder::default()
.number(10000)
.mapq_range((10, 20))
.base_qual_range((30, 50))
.len_range((0.2, 0.8))
.mods(
[ModConfigBuilder::default()
.base('C')
.mod_code("m".into())
.win([4].into())
.mod_range([(0f32, 1f32)].into())
.build()
.unwrap()]
.into(),
)
.build()
.unwrap();
let reads = generate_reads_denovo(&contigs, &config, "1", &mut rng).unwrap();
assert_eq!(reads.len(), 10000);
let mut zero_to_255_visited = vec![false; 256];
for read in &reads {
if !read.is_unmapped() {
assert!((10..=20).contains(&read.mapq()));
assert!((0..2).contains(&read.tid()));
}
assert!((4..=16).contains(&read.seq_len()));
assert!(read.qual().iter().all(|x| (30..=50).contains(x)));
let Aux::String(mod_pos_mm_tag) = read.aux(b"MM").unwrap() else {
unreachable!()
};
let Aux::ArrayU8(mod_prob_ml_tag) = read.aux(b"ML").unwrap() else {
unreachable!()
};
let pos: Vec<&str> = mod_pos_mm_tag
.strip_prefix("C+m?,")
.unwrap()
.strip_suffix(';')
.unwrap()
.split(',')
.collect();
assert!(pos.iter().all(|&x| x == "0"), "All MM values should be 0");
#[expect(
clippy::indexing_slicing,
reason = "we are not gonna bother, this should be fine"
)]
for k in 0..mod_prob_ml_tag.len() {
zero_to_255_visited[usize::from(mod_prob_ml_tag.get(k).expect("no error"))] = true;
}
}
assert!(zero_to_255_visited.into_iter().all(|x| x));
}
#[test]
fn temp_bam_simulation_struct_with_mods() {
let config_json = r#"{
"contigs": {
"number": 2,
"len_range": [100, 200]
},
"reads": [{
"number": 1000,
"mapq_range": [10, 20],
"base_qual_range": [10, 20],
"len_range": [0.1, 0.8],
"mods": [{
"base": "T",
"is_strand_plus": true,
"mod_code": "T",
"win": [4, 5],
"mod_range": [[0.1, 0.2], [0.3, 0.4]]
}]
}]
}"#;
let config: SimulationConfig = serde_json::from_str(config_json).unwrap();
let sim = TempBamSimulation::new(config).unwrap();
assert!(Path::new(sim.bam_path()).exists());
assert!(Path::new(sim.fasta_path()).exists());
let mut reader = bam::Reader::from_path(sim.bam_path()).unwrap();
let header = reader.header();
assert_eq!(header.target_count(), 2);
assert_eq!(reader.records().count(), 1000);
}
#[test]
fn generate_random_dna_modification_empty_config() {
let seq = DNARestrictive::from_str("ACGTACGT").unwrap();
let mod_configs: Vec<ModConfig> = vec![];
let mut rng = rand::rng();
let (mm_str, ml_vec) = generate_random_dna_modification(&mod_configs, &seq, &mut rng);
assert_eq!(mm_str, String::new());
assert_eq!(ml_vec.len(), 0);
}
#[test]
fn generate_random_dna_modification_single_mod() {
let seq = DNARestrictive::from_str("ACGTCGCGATCG").unwrap();
let mod_config = ModConfigBuilder::default()
.base('C')
.mod_code("m".into())
.win(vec![2])
.mod_range(vec![(0.5, 0.5)])
.build()
.unwrap();
let mut rng = rand::rng();
let (mm_str, ml_vec) = generate_random_dna_modification(&[mod_config], &seq, &mut rng);
assert_eq!(ml_vec.len(), 4);
assert!(ml_vec.iter().all(|&x| x == 128u8));
let probs: Vec<&str> = mm_str
.strip_prefix("C+m?,")
.unwrap()
.strip_suffix(';')
.unwrap()
.split(',')
.collect();
assert_eq!(probs.len(), 4);
assert!(probs.iter().all(|&x| x == "0"));
}
#[test]
fn generate_random_dna_modification_multiple_mods() {
let seq = DNARestrictive::from_str("ACGTACGT").unwrap();
let mod_config_c = ModConfigBuilder::default()
.base('C')
.mod_code('m'.into())
.win(vec![1])
.mod_range(vec![(0.8, 0.8)])
.build()
.unwrap();
let mod_config_t = ModConfigBuilder::default()
.base('T')
.is_strand_plus(false)
.mod_code('t'.into())
.win(vec![1])
.mod_range(vec![(0.4, 0.4)])
.build()
.unwrap();
let mut rng = rand::rng();
let (mm_str, ml_vec) =
generate_random_dna_modification(&[mod_config_c, mod_config_t], &seq, &mut rng);
assert!(mm_str.contains("C+m?,"));
assert!(mm_str.contains("T-t?,"));
assert_eq!(
ml_vec,
vec![204u8, 204u8, 102u8, 102u8],
"ML values should be 204,204,102,102"
);
let c_section = mm_str
.split(';')
.find(|s| s.starts_with("C+m?"))
.expect("Should have C+m? section");
let c_probs: Vec<&str> = c_section
.strip_prefix("C+m?,")
.unwrap()
.split(',')
.collect();
assert_eq!(
c_probs.len(),
2,
"Should have exactly 2 C modifications in ACGTACGT"
);
assert!(
c_probs.iter().all(|&x| x == "0"),
"All C modifications should have gap pos 0",
);
let t_section = mm_str
.split(';')
.find(|s| s.starts_with("T-t?"))
.expect("Should have T-t? section");
let t_probs: Vec<&str> = t_section
.strip_prefix("T-t?,")
.unwrap()
.split(',')
.collect();
assert_eq!(
t_probs.len(),
2,
"Should have exactly 2 T modifications in ACGTACGT"
);
assert!(
t_probs.iter().all(|&x| x == "0"),
"All T modifications should have a gap pos of 0"
);
}
#[test]
fn generate_random_dna_modification_n_base() {
let seq = DNARestrictive::from_str("ACGT").unwrap();
let mod_config = ModConfigBuilder::default()
.base('N')
.is_strand_plus(true)
.mod_code("n".into())
.win([4].into())
.mod_range([(0.5, 0.5)].into())
.build()
.unwrap();
let mut rng = rand::rng();
let (mm_str, ml_vec) = generate_random_dna_modification(&[mod_config], &seq, &mut rng);
assert_eq!(ml_vec.len(), 4);
let pos: Vec<&str> = mm_str
.strip_prefix("N+n?,")
.unwrap()
.strip_suffix(';')
.unwrap()
.split(',')
.collect();
assert_eq!(pos.len(), 4, "Should have exactly 4 modifications for ACGT");
assert!(
pos.iter().all(|&x| x == "0"),
"All N base modifications should have 0 as their gap position coordinate"
);
assert_eq!(
ml_vec,
vec![128u8, 128u8, 128u8, 128u8],
"All ML values should be 128 and number of values is 4"
);
}
#[test]
fn generate_random_dna_modification_cycling_windows() {
let seq = DNARestrictive::from_str("CCCCCCCCCCCCCCCC").unwrap();
let mod_config = ModConfigBuilder::default()
.base('C')
.mod_code("m".into())
.win([3, 2].into())
.mod_range([(0.8, 0.8), (0.4, 0.4)].into())
.build()
.unwrap();
let mut rng = rand::rng();
let (mm_str, ml_vec) = generate_random_dna_modification(&[mod_config], &seq, &mut rng);
let pos: Vec<&str> = mm_str
.strip_prefix("C+m?,")
.unwrap()
.strip_suffix(';')
.unwrap()
.split(',')
.collect();
assert_eq!(pos, vec!["0"; 16]);
let expected_pattern: Vec<u8> = vec![
204, 204, 204, 102, 102, 204, 204, 204, 102, 102, 204, 204, 204, 102, 102, 204,
];
assert_eq!(ml_vec, expected_pattern);
}
#[expect(
clippy::similar_names,
reason = "has_c_mod, has_a_mod, has_t_mod are clear in context"
)]
#[test]
fn multiple_simultaneous_modifications_work() {
let config_json = r#"{
"contigs": {
"number": 1,
"len_range": [100, 100],
"repeated_seq": "ACGTACGTACGTACGT"
},
"reads": [{
"number": 20,
"mapq_range": [20, 30],
"base_qual_range": [20, 30],
"len_range": [0.8, 1.0],
"mods": [
{
"base": "C",
"is_strand_plus": true,
"mod_code": "m",
"win": [2],
"mod_range": [[0.7, 0.9]]
},
{
"base": "A",
"is_strand_plus": true,
"mod_code": "a",
"win": [3],
"mod_range": [[0.3, 0.5]]
},
{
"base": "T",
"is_strand_plus": false,
"mod_code": "t",
"win": [1],
"mod_range": [[0.5, 0.6]]
}
]
}]
}"#;
let config: SimulationConfig = serde_json::from_str(config_json).unwrap();
let sim = TempBamSimulation::new(config).unwrap();
let mut reader = bam::Reader::from_path(sim.bam_path()).unwrap();
let mut has_c_mod = false;
let mut has_a_mod = false;
let mut has_t_mod = false;
for record in reader.records() {
let read = record.unwrap();
if let Ok(Aux::String(mm_tag)) = read.aux(b"MM") {
if mm_tag.contains("C+m?") {
has_c_mod = true;
}
if mm_tag.contains("A+a?") {
has_a_mod = true;
}
if mm_tag.contains("T-t?") {
has_t_mod = true;
}
assert!(
matches!(read.aux(b"ML").unwrap(), Aux::ArrayU8(_)),
"ML tag should be ArrayU8"
);
}
}
assert!(has_c_mod, "Should have C+m modifications");
assert!(has_a_mod, "Should have A+a modifications");
assert!(has_t_mod, "Should have T-t modifications");
}
#[expect(
clippy::shadow_unrelated,
reason = "clear variable reuse in sequential tests"
)]
#[test]
fn modifications_target_correct_bases() {
let seq = DNARestrictive::from_str("AAAACCCCGGGGTTTT").unwrap();
let mod_config_template = ModConfigBuilder::default()
.base('C')
.mod_code("m".into())
.win([4].into())
.mod_range([(1.0, 1.0)].into());
let mod_config_c = mod_config_template.clone().build().unwrap();
let mut rng = rand::rng();
let (mm_str, ml_vec) = generate_random_dna_modification(&[mod_config_c], &seq, &mut rng);
assert_eq!(ml_vec.len(), 4);
let probs: Vec<&str> = mm_str
.strip_prefix("C+m?,")
.unwrap()
.strip_suffix(';')
.unwrap()
.split(',')
.collect();
assert_eq!(probs.len(), 4, "Should have exactly 4 C modifications");
let mod_config_t = mod_config_template
.clone()
.base('T')
.is_strand_plus(false)
.mod_code("t".into())
.build()
.unwrap();
let (mm_str, ml_vec) = generate_random_dna_modification(&[mod_config_t], &seq, &mut rng);
assert_eq!(ml_vec.len(), 4);
let probs: Vec<&str> = mm_str
.strip_prefix("T-t?,")
.unwrap()
.strip_suffix(';')
.unwrap()
.split(',')
.collect();
assert_eq!(probs.len(), 4, "Should have exactly 4 T modifications");
let mod_config_a = mod_config_template
.clone()
.base('A')
.mod_code("a".into())
.build()
.unwrap();
let (mm_str, ml_vec) = generate_random_dna_modification(&[mod_config_a], &seq, &mut rng);
assert_eq!(ml_vec.len(), 4);
let probs: Vec<&str> = mm_str
.strip_prefix("A+a?,")
.unwrap()
.strip_suffix(';')
.unwrap()
.split(',')
.collect();
assert_eq!(probs.len(), 4, "Should have exactly 4 A modifications");
let mod_config_g = mod_config_template
.clone()
.base('G')
.mod_code("g".into())
.build()
.unwrap();
let (mm_str, ml_vec) = generate_random_dna_modification(&[mod_config_g], &seq, &mut rng);
assert_eq!(ml_vec.len(), 4);
let probs: Vec<&str> = mm_str
.strip_prefix("G+g?,")
.unwrap()
.strip_suffix(';')
.unwrap()
.split(',')
.collect();
assert_eq!(probs.len(), 4, "Should have exactly 4 G modifications");
}
#[test]
fn edge_case_no_target_base_for_modification() {
let seq = DNARestrictive::from_str("AAAAAAAAAA").unwrap();
let mod_config = ModConfigBuilder::default()
.base('C')
.mod_code("m".into())
.win([5].into())
.mod_range([(0.5, 0.5)].into())
.build()
.unwrap();
let mut rng = rand::rng();
let (mm_str, ml_vec) = generate_random_dna_modification(&[mod_config], &seq, &mut rng);
assert_eq!(mm_str, String::new());
assert_eq!(ml_vec.len(), 0);
}
#[expect(
clippy::shadow_unrelated,
reason = "clear variable reuse in sequential tests"
)]
#[test]
fn edge_case_modification_probability_extremes() {
let seq = DNARestrictive::from_str("CCCCCCCC").unwrap();
let mod_config_template = ModConfigBuilder::default()
.base('C')
.mod_code("m".into())
.win([8].into());
let mod_config_zero = mod_config_template
.clone()
.mod_range([(0.0, 0.0)].into())
.build()
.unwrap();
let mut rng = rand::rng();
let (mm_str, ml_vec) = generate_random_dna_modification(&[mod_config_zero], &seq, &mut rng);
assert_eq!(ml_vec, vec![0u8; 8]);
let pos: Vec<&str> = mm_str
.strip_prefix("C+m?,")
.unwrap()
.strip_suffix(';')
.unwrap()
.split(',')
.collect();
assert!(pos.iter().all(|&x| x == "0"));
let mod_config_one = mod_config_template
.clone()
.mod_range([(1.0, 1.0)].into())
.build()
.unwrap();
let (mm_str, ml_vec) = generate_random_dna_modification(&[mod_config_one], &seq, &mut rng);
assert_eq!(ml_vec, vec![255u8; 8]);
let pos: Vec<&str> = mm_str
.strip_prefix("C+m?,")
.unwrap()
.strip_suffix(';')
.unwrap()
.split(',')
.collect();
assert!(pos.iter().all(|&x| x == "0"));
}
#[expect(
clippy::indexing_slicing,
reason = "test validates length before indexing"
)]
#[test]
fn config_deserialization_works_with_json_with_mods() {
let json_str = r#"{
"contigs": {
"number": 5,
"len_range": [100, 500],
"repeated_seq": "ACGTACGT"
},
"reads": [
{
"number": 100,
"mapq_range": [10, 30],
"base_qual_range": [20, 40],
"len_range": [0.1, 0.9],
"barcode": "ACGTAA",
"mods": [{
"base": "C",
"is_strand_plus": true,
"mod_code": "m",
"win": [5, 3],
"mod_range": [[0.3, 0.7], [0.1, 0.5]]
}]
},
{
"number": 50,
"mapq_range": [5, 15],
"base_qual_range": [15, 25],
"len_range": [0.2, 0.8]
}
]
}"#;
let config: SimulationConfig = serde_json::from_str(json_str).unwrap();
assert_eq!(config.contigs.number.get(), 5);
assert_eq!(config.contigs.len_range.low().get(), 100);
assert_eq!(config.contigs.len_range.high().get(), 500);
assert!(config.contigs.repeated_seq.is_some());
assert_eq!(config.reads.len(), 2);
assert_eq!(config.reads[0].number.get(), 100);
assert_eq!(config.reads[0].mapq_range.low(), 10);
assert_eq!(config.reads[0].mapq_range.high(), 30);
assert_eq!(config.reads[0].base_qual_range.low(), 20);
assert_eq!(config.reads[0].base_qual_range.high(), 40);
assert!(config.reads[0].barcode.is_some());
assert_eq!(config.reads[0].mods.len(), 1);
assert!(matches!(config.reads[0].mods[0].base, AllowedAGCTN::C));
assert_eq!(config.reads[0].mods[0].win.len(), 2);
assert_eq!(config.reads[1].number.get(), 50);
assert_eq!(config.reads[1].mods.len(), 0);
assert!(config.reads[1].barcode.is_none());
}
#[test]
fn barcodes_with_modifications_integration_works() {
let contigs = [ContigBuilder::default()
.name("chr1")
.seq("A".repeat(50))
.build()
.unwrap()];
let config = ReadConfigBuilder::default()
.number(20)
.len_range((0.3, 0.7))
.barcode("ACGTAA".into()) .mods(
[ModConfigBuilder::default()
.base('C')
.mod_code("m".into())
.win([10].into())
.mod_range([(0.8, 0.8)].into())
.build()
.unwrap()]
.into(),
)
.build()
.unwrap();
let mut rng = rand::rng();
let reads = generate_reads_denovo(&contigs, &config, "test", &mut rng).unwrap();
let mut has_modifications = false;
for read in &reads {
if !read.is_unmapped() {
let mm_pos_result = read.aux(b"MM");
let ml_prob_result = read.aux(b"ML");
assert!(
mm_pos_result.is_ok(),
"MM tag should be present due to C in barcode"
);
assert!(
ml_prob_result.is_ok(),
"ML tag should be present due to C in barcode"
);
if let Ok(Aux::String(mm_str)) = mm_pos_result {
assert!(
mm_str.starts_with("C+m?,"),
"MM tag should contain C+m modifications"
);
assert!(!mm_str.is_empty(), "MM tag should not be empty");
has_modifications = true;
}
}
}
assert!(
has_modifications,
"Should have found C modifications from barcode bases"
);
}
#[test]
fn edge_case_window_larger_than_sequence() {
let seq = DNARestrictive::from_str("ACGTACGT").unwrap();
let mod_config = ModConfigBuilder::default()
.base('C')
.mod_code("m".into())
.win([100].into()) .mod_range([(0.5, 0.5)].into())
.build()
.unwrap();
let mut rng = rand::rng();
let (mm_str, ml_vec) = generate_random_dna_modification(&[mod_config], &seq, &mut rng);
assert_eq!(
ml_vec,
vec![128u8, 128u8],
"All prob should be 128 (0.5*255)"
);
let pos: Vec<&str> = mm_str
.strip_prefix("C+m?,")
.unwrap()
.strip_suffix(';')
.unwrap()
.split(',')
.collect();
assert_eq!(pos.len(), 2, "Should have exactly 2 position values");
assert!(pos.iter().all(|&x| x == "0"), "All pos should be 0");
}
#[test]
fn edge_case_single_base_windows() {
let seq = DNARestrictive::from_str("C".repeat(16).as_str()).unwrap();
let mod_config = ModConfigBuilder::default()
.base('C')
.is_strand_plus(true)
.mod_code("m".into())
.win([1].into())
.mod_range([(0.9, 0.9), (0.1, 0.1)].into())
.build()
.unwrap();
let mut rng = rand::rng();
let (mm_str, ml_vec) = generate_random_dna_modification(&[mod_config], &seq, &mut rng);
let expected_pattern: Vec<u8> = vec![
230, 26, 230, 26, 230, 26, 230, 26, 230, 26, 230, 26, 230, 26, 230, 26,
];
assert_eq!(ml_vec, expected_pattern);
let pos: Vec<&str> = mm_str
.strip_prefix("C+m?,")
.unwrap()
.strip_suffix(';')
.unwrap()
.split(',')
.collect();
assert_eq!(pos, vec!["0"; 16], "pos should have 16 entries, all 0");
}
#[test]
fn reverse_reads_modification_positions_correct() {
let contigs = [ContigBuilder::default()
.name("chr1")
.seq("AAAAACAAAAACAAAAAC".into())
.build()
.unwrap()];
let config = ReadConfigBuilder::default()
.number(100)
.len_range((1.0, 1.0))
.mods(
[ModConfigBuilder::default()
.base('C')
.mod_code("m".into())
.win([10].into())
.mod_range([(0.8, 0.8)].into())
.build()
.unwrap()]
.into(),
)
.build()
.unwrap();
let mut rng = rand::rng();
let reads = generate_reads_denovo(&contigs, &config, "test", &mut rng).unwrap();
let mut forward_with_mods = 0;
for read in &reads {
if !read.is_unmapped() {
let mm_result = read.aux(b"MM");
if read.is_reverse() {
assert!(
mm_result.is_err(),
"Reverse reads should have no MM tag (revcomp has no C's)"
);
} else {
assert!(mm_result.is_ok(), "Forward reads should have MM tag");
if let Ok(Aux::String(mm_str)) = mm_result {
let probs: Vec<&str> = mm_str
.strip_prefix("C+m?,")
.unwrap()
.strip_suffix(';')
.unwrap()
.split(',')
.collect();
assert_eq!(probs.len(), 3, "Should have exactly 3 C modifications");
forward_with_mods += 1;
}
}
}
}
assert!(
forward_with_mods > 0,
"Should have validated some forward reads with modifications"
);
}
#[expect(
clippy::too_many_lines,
reason = "test requires setup, iteration, and multiple assertions for thorough validation"
)]
#[test]
fn mismatch_mod_check() {
let json_str = r#"{
"contigs": {
"number": 1,
"len_range": [100, 100],
"repeated_seq": "ACGT"
},
"reads": [
{
"number": 100,
"len_range": [1.0, 1.0],
"mods": [{
"base": "C",
"is_strand_plus": true,
"mod_code": "m",
"win": [1],
"mod_range": [[1.0, 1.0]]
}]
},
{
"number": 100,
"len_range": [1.0, 1.0],
"mismatch": 1.0,
"mods": [{
"base": "C",
"is_strand_plus": true,
"mod_code": "m",
"win": [1],
"mod_range": [[0.51, 0.51]]
}]
}
]
}"#;
let config: SimulationConfig = serde_json::from_str(json_str).unwrap();
let sim = TempBamSimulation::new(config).unwrap();
let mut bam = bam::Reader::from_path(sim.bam_path()).unwrap();
let mut df_collection = Vec::new();
for k in bam.records() {
let record = k.unwrap();
let curr_read = CurrRead::default()
.try_from_only_alignment(&record)
.unwrap()
.set_mod_data(&record, ThresholdState::GtEq(0), 0)
.unwrap();
df_collection.push(curr_read);
}
let df = curr_reads_to_dataframe(df_collection.as_slice()).unwrap();
let read_id_col = df.column("read_id").unwrap().str().unwrap();
let ref_position_col = df.column("ref_position").unwrap().i64().unwrap();
let alignment_type_col = df.column("alignment_type").unwrap().str().unwrap();
let mod_quality_col = df.column("mod_quality").unwrap().u32().unwrap();
let mut row_count_0: usize = 0;
let mut row_count_1: usize = 0;
let mut group0_forward_position_violations: usize = 0;
let mut group0_reverse_position_violations: usize = 0;
let mut group0_unmapped_position_violations: usize = 0;
let mut group0_quality_violations: usize = 0;
let mut group1_forward_position_violations: usize = 0;
let mut group1_reverse_position_violations: usize = 0;
let mut group1_unmapped_position_violations: usize = 0;
let mut group1_quality_violations: usize = 0;
let mut read_id_violations: usize = 0;
for i in 0..df.height() {
let read_id = read_id_col.get(i).unwrap();
let ref_position = ref_position_col.get(i).unwrap();
let alignment_type = alignment_type_col.get(i).unwrap();
let mod_quality = mod_quality_col.get(i).unwrap();
if read_id.starts_with("0.") {
if alignment_type.contains("forward") {
if ref_position % 4 != 1 {
group0_forward_position_violations += 1;
}
} else if alignment_type.contains("reverse") {
if ref_position % 4 != 2 {
group0_reverse_position_violations += 1;
}
} else {
if ref_position != -1 {
group0_unmapped_position_violations += 1;
}
}
if mod_quality != 255 {
group0_quality_violations += 1;
}
row_count_0 += 1;
} else if read_id.starts_with("1.") {
if alignment_type.contains("forward") {
if ref_position % 4 == 1 {
group1_forward_position_violations += 1;
}
} else if alignment_type.contains("reverse") {
if ref_position % 4 == 2 {
group1_reverse_position_violations += 1;
}
} else {
if ref_position != -1 {
group1_unmapped_position_violations += 1;
}
}
if mod_quality != 130 {
group1_quality_violations += 1;
}
row_count_1 += 1;
} else {
read_id_violations += 1;
}
}
assert_eq!(
group0_forward_position_violations, 0,
"Group 0 forward: expected 0 position violations"
);
assert_eq!(
group0_reverse_position_violations, 0,
"Group 0 reverse: expected 0 position violations"
);
assert_eq!(
group0_unmapped_position_violations, 0,
"Group 0 unmapped: expected 0 position violations"
);
assert_eq!(
group0_quality_violations, 0,
"Group 0: expected 0 quality violations"
);
assert_eq!(
group1_forward_position_violations, 0,
"Group 1 forward: expected 0 position violations"
);
assert_eq!(
group1_reverse_position_violations, 0,
"Group 1 reverse: expected 0 position violations"
);
assert_eq!(
group1_unmapped_position_violations, 0,
"Group 1 unmapped: expected 0 position violations"
);
assert_eq!(
group1_quality_violations, 0,
"Group 1: expected 0 quality violations"
);
assert_eq!(read_id_violations, 0, "expected 0 read_id violations");
assert!(
row_count_0 > 0,
"Should have processed some rows from group 0"
);
assert!(
row_count_1 > 0,
"Should have processed some rows from group 1"
);
}
}
#[cfg(test)]
mod contig_generation_tests {
use super::*;
use rust_htslib::bam::Read as _;
#[test]
fn edge_case_repeated_sequence_exact_length() {
let config_json = r#"{
"contigs": {
"number": 1,
"len_range": [16, 16],
"repeated_seq": "ACGT"
},
"reads": [{
"number": 5,
"len_range": [0.5, 0.5]
}]
}"#;
let config: SimulationConfig = serde_json::from_str(config_json).unwrap();
let sim = TempBamSimulation::new(config).unwrap();
let reader = bam::Reader::from_path(sim.bam_path()).unwrap();
let header = reader.header();
assert_eq!(header.target_len(0).unwrap(), 16);
}
#[test]
fn edge_case_minimum_contig_size() {
let config_json = r#"{
"contigs": {
"number": 1,
"len_range": [1, 1]
},
"reads": [{
"number": 10,
"mapq_range": [10, 20],
"base_qual_range": [20, 30],
"len_range": [1.0, 1.0]
}]
}"#;
let config: SimulationConfig = serde_json::from_str(config_json).unwrap();
let sim = TempBamSimulation::new(config).unwrap();
let mut reader = bam::Reader::from_path(sim.bam_path()).unwrap();
let header = reader.header();
assert_eq!(header.target_count(), 1);
assert_eq!(header.target_len(0).unwrap(), 1);
assert!(reader.records().count() > 0);
}
#[test]
fn generate_contigs_denovo_works() {
let config = ContigConfigBuilder::default()
.number(5)
.len_range((100, 200))
.build()
.unwrap();
let contigs = generate_contigs_denovo(config.number, config.len_range, &mut rand::rng());
assert_eq!(contigs.len(), 5);
for (i, contig) in contigs.iter().enumerate() {
assert_eq!(contig.name, format!("contig_0000{i}"));
assert!((100..=200).contains(&contig.get_dna_restrictive().get().len()));
for base in contig.get_dna_restrictive().get() {
assert!([b'A', b'C', b'G', b'T'].contains(base));
}
}
}
#[test]
fn generate_contigs_denovo_repeated_seq_works() {
let seq = DNARestrictive::from_str("ACGT").unwrap();
let contigs = generate_contigs_denovo_repeated_seq(
NonZeroU32::new(10000).unwrap(),
OrdPair::new(NonZeroU64::new(10).unwrap(), NonZeroU64::new(12).unwrap()).unwrap(),
&seq,
&mut rand::rng(),
);
let mut counts = [0, 0, 0];
assert_eq!(contigs.len(), 10000);
for (i, contig) in contigs.iter().enumerate() {
assert_eq!(contig.name, format!("contig_{i:05}"));
let idx = match contig.get_dna_restrictive().get() {
b"ACGTACGTAC" => 0,
b"ACGTACGTACG" => 1,
b"ACGTACGTACGT" => 2,
_ => unreachable!(),
};
*counts
.get_mut(idx)
.expect("idx is 0, 1, or 2; counts has 3 elements") += 1;
}
for count in counts {
assert!(count >= 3000);
}
}
}
#[cfg(test)]
mod perfect_seq_match_to_not_tests {
use super::*;
#[test]
fn seq_with_valid_sequence() -> Result<(), Error> {
let _builder = PerfectSeqMatchToNot::seq(b"ACGT".to_vec())?;
Ok(())
}
#[test]
fn seq_with_empty_sequence_returns_error() {
let result = PerfectSeqMatchToNot::seq(b"".to_vec());
assert!(matches!(result, Err(Error::InvalidState(_))));
}
#[test]
fn build_without_barcode_forward_read() -> Result<(), Error> {
let mut rng = rand::rng();
let (seq, cigar) = PerfectSeqMatchToNot::seq(b"GGGGGGGG".to_vec())?
.build(ReadState::PrimaryFwd, &mut rng)?;
assert_eq!(seq, b"GGGGGGGG");
assert_eq!(cigar.expect("no error").to_string(), "8M");
Ok(())
}
#[test]
fn build_without_barcode_reverse_read() -> Result<(), Error> {
let mut rng = rand::rng();
let (seq, cigar) = PerfectSeqMatchToNot::seq(b"GGGGGGGG".to_vec())?
.build(ReadState::PrimaryRev, &mut rng)?;
assert_eq!(seq, b"GGGGGGGG");
assert_eq!(cigar.expect("no error").to_string(), "8M");
Ok(())
}
#[test]
fn build_with_barcode_forward_read() -> Result<(), Error> {
let mut rng = rand::rng();
let barcode = DNARestrictive::from_str("ACGTAA").unwrap();
let (seq, cigar) = PerfectSeqMatchToNot::seq(b"GGGGGGGG".to_vec())?
.barcode(barcode)
.build(ReadState::PrimaryFwd, &mut rng)?;
assert_eq!(seq, b"ACGTAAGGGGGGGGTTACGT");
assert_eq!(cigar.expect("no error").to_string(), "6S8M6S");
Ok(())
}
#[test]
fn build_with_barcode_reverse_read() -> Result<(), Error> {
let mut rng = rand::rng();
let barcode = DNARestrictive::from_str("ACGTAA").unwrap();
let (seq, cigar) = PerfectSeqMatchToNot::seq(b"GGGGGGGG".to_vec())?
.barcode(barcode)
.build(ReadState::PrimaryRev, &mut rng)?;
assert_eq!(seq, b"TGCATTGGGGGGGGAATGCA");
assert_eq!(cigar.expect("no error").to_string(), "6S8M6S");
Ok(())
}
#[test]
fn build_unmapped_read_returns_none_cigar() -> Result<(), Error> {
let mut rng = rand::rng();
let (seq, cigar) = PerfectSeqMatchToNot::seq(b"GGGGGGGG".to_vec())?
.build(ReadState::Unmapped, &mut rng)?;
assert_eq!(seq, b"GGGGGGGG");
assert!(cigar.is_none());
Ok(())
}
#[test]
fn build_unmapped_read_with_barcode_returns_none_cigar() -> Result<(), Error> {
let mut rng = rand::rng();
let barcode = DNARestrictive::from_str("ACGTAA").unwrap();
let (seq, cigar) = PerfectSeqMatchToNot::seq(b"GGGGGGGG".to_vec())?
.barcode(barcode)
.build(ReadState::Unmapped, &mut rng)?;
assert_eq!(seq, b"ACGTAAGGGGGGGGTTACGT");
assert!(cigar.is_none());
Ok(())
}
#[test]
fn build_with_delete() -> Result<(), Error> {
let mut rng = rand::rng();
let delete_range = OrdPair::new(F32Bw0and1::try_from(0.5)?, F32Bw0and1::try_from(0.75)?)?;
let (seq, cigar) = PerfectSeqMatchToNot::seq(b"AAAATTTTCCCCGGGG".to_vec())?
.delete(delete_range)
.build(ReadState::PrimaryFwd, &mut rng)?;
assert_eq!(seq, b"AAAATTTTGGGG");
assert_eq!(cigar.expect("no error").to_string(), "8M4D4M");
Ok(())
}
#[test]
fn build_with_insert_middle() -> Result<(), Error> {
let mut rng = rand::rng();
let insert_seq = DNARestrictive::from_str("TTTT").unwrap();
let (seq, cigar) = PerfectSeqMatchToNot::seq(b"AAAAGGGG".to_vec())?
.insert_middle(insert_seq)
.build(ReadState::PrimaryFwd, &mut rng)?;
assert_eq!(seq, b"AAAATTTTGGGG");
assert_eq!(cigar.expect("no error").to_string(), "4M4I4M");
Ok(())
}
#[test]
fn build_with_mismatch() -> Result<(), Error> {
let mut rng = rand::rng();
let mismatch_frac = F32Bw0and1::try_from(0.5)?;
let (seq, cigar) = PerfectSeqMatchToNot::seq(b"AAAAAAAA".to_vec())?
.mismatch(mismatch_frac)
.build(ReadState::PrimaryFwd, &mut rng)?;
let mismatch_count = seq.iter().filter(|&&b| b != b'A').count();
assert_eq!(mismatch_count, 4);
assert_eq!(cigar.expect("no error").to_string(), "8M");
Ok(())
}
#[test]
fn build_with_delete_and_insert() -> Result<(), Error> {
let mut rng = rand::rng();
let delete_range = OrdPair::new(F32Bw0and1::try_from(0.5)?, F32Bw0and1::try_from(0.75)?)?;
let insert_seq = DNARestrictive::from_str("TT").unwrap();
let (seq, cigar) = PerfectSeqMatchToNot::seq(b"AAAACCCCGGGG".to_vec())?
.delete(delete_range)
.insert_middle(insert_seq)
.build(ReadState::PrimaryFwd, &mut rng)?;
assert_eq!(seq, b"AAAACCTTGGG");
assert_eq!(cigar.expect("no error").to_string(), "6M2I3D3M");
Ok(())
}
#[test]
fn build_with_all_features() -> Result<(), Error> {
let mut rng = rand::rng();
let delete_range = OrdPair::new(F32Bw0and1::try_from(0.25)?, F32Bw0and1::try_from(0.5)?)?;
let insert_seq = DNARestrictive::from_str("CC").unwrap();
let mismatch_frac = F32Bw0and1::try_from(0.25)?;
let barcode = DNARestrictive::from_str("AAA").unwrap();
let (seq, cigar) = PerfectSeqMatchToNot::seq(b"GGGGGGGGGGGGGGGG".to_vec())?
.delete(delete_range)
.insert_middle(insert_seq)
.mismatch(mismatch_frac)
.barcode(barcode)
.build(ReadState::PrimaryFwd, &mut rng)?;
assert_eq!(seq.len(), 20);
let cigar_str = cigar.expect("no error").to_string();
assert!(cigar_str.starts_with("3S")); assert!(cigar_str.ends_with("3S")); assert!(cigar_str.contains('D')); assert!(cigar_str.contains('I')); Ok(())
}
#[test]
fn build_with_delete_zero_length() -> Result<(), Error> {
let mut rng = rand::rng();
let delete_range = OrdPair::new(F32Bw0and1::try_from(0.5)?, F32Bw0and1::try_from(0.5)?)?;
let (seq, cigar) = PerfectSeqMatchToNot::seq(b"AAAAAAAA".to_vec())?
.delete(delete_range)
.build(ReadState::PrimaryFwd, &mut rng)?;
assert_eq!(seq, b"AAAAAAAA");
assert_eq!(cigar.expect("no error").to_string(), "8M");
Ok(())
}
#[test]
fn build_with_mismatch_zero_fraction() -> Result<(), Error> {
let mut rng = rand::rng();
let mismatch_frac = F32Bw0and1::try_from(0.0)?;
let (seq, cigar) = PerfectSeqMatchToNot::seq(b"AAAAAAAA".to_vec())?
.mismatch(mismatch_frac)
.build(ReadState::PrimaryFwd, &mut rng)?;
assert_eq!(seq, b"AAAAAAAA");
assert_eq!(cigar.expect("no error").to_string(), "8M");
Ok(())
}
#[test]
#[should_panic(expected = "SimulateDNASeqCIGAREndProblem")]
fn build_with_delete_whole_length() {
let mut rng = rand::rng();
let delete_range = OrdPair::new(F32Bw0and1::zero(), F32Bw0and1::one()).unwrap();
let (_seq, _cigar) = PerfectSeqMatchToNot::seq(b"AAAAAAAA".to_vec())
.unwrap()
.delete(delete_range)
.build(ReadState::PrimaryFwd, &mut rng)
.unwrap();
}
#[test]
#[should_panic(expected = "SimulateDNASeqCIGAREndProblem")]
fn build_with_insert_almost_whole_length() {
let mut rng = rand::rng();
let insert_seq = DNARestrictive::from_str("AATT").unwrap();
let (_seq, _cigar) = PerfectSeqMatchToNot::seq(b"A".to_vec())
.unwrap()
.insert_middle(insert_seq)
.build(ReadState::PrimaryFwd, &mut rng)
.unwrap();
}
#[test]
#[should_panic(expected = "SimulateDNASeqCIGAREndProblem")]
fn build_with_delete_whole_length_with_barcode() {
let mut rng = rand::rng();
let delete_range = OrdPair::new(F32Bw0and1::zero(), F32Bw0and1::one()).unwrap();
let barcode = DNARestrictive::from_str("CGCG").unwrap();
let (_seq, _cigar) = PerfectSeqMatchToNot::seq(b"AAAAAAAA".to_vec())
.unwrap()
.delete(delete_range)
.barcode(barcode)
.build(ReadState::PrimaryFwd, &mut rng)
.unwrap();
}
#[test]
#[should_panic(expected = "SimulateDNASeqCIGAREndProblem")]
fn build_with_insert_almost_whole_length_with_barcode() {
let mut rng = rand::rng();
let insert_seq = DNARestrictive::from_str("AATT").unwrap();
let barcode = DNARestrictive::from_str("CGCG").unwrap();
let (_seq, _cigar) = PerfectSeqMatchToNot::seq(b"A".to_vec())
.unwrap()
.insert_middle(insert_seq)
.barcode(barcode)
.build(ReadState::PrimaryFwd, &mut rng)
.unwrap();
}
}