#![allow(
clippy::similar_names,
clippy::cast_possible_truncation,
clippy::cast_sign_loss,
clippy::cast_precision_loss,
clippy::too_many_lines
)]
mod helpers;
use std::path::PathBuf;
use helpers::{
PileupColumn, TestEnv, VcfVariant, count_fastq_records, non_repetitive_seq, pileup_bases,
read_gzipped, run_simulate, simple_env,
};
use std::collections::HashMap;
use noodles::bam;
use noodles::sam::alignment::RecordBuf;
use noodles::sam::alignment::record::cigar::op::Kind as CigarKind;
fn any_bam_record_has_cigar_op(bam_path: &std::path::Path, target: CigarKind) -> bool {
let mut reader = bam::io::reader::Builder.build_from_path(bam_path).unwrap();
let header = reader.read_header().unwrap();
reader
.record_bufs(&header)
.map(|r| r.unwrap())
.any(|r| r.cigar().as_ref().iter().any(|op| op.kind() == target))
}
fn read_base_map(
record: &RecordBuf,
) -> (HashMap<u32, u8>, std::collections::HashSet<u32>, std::collections::HashSet<u32>) {
let mut base_at = HashMap::new();
let mut ins_after = std::collections::HashSet::new();
let mut del_at = std::collections::HashSet::new();
let Some(start) = record.alignment_start() else {
return (base_at, ins_after, del_at);
};
let mut ref_pos = (usize::from(start) - 1) as u32;
let seq = record.sequence().as_ref();
let mut read_pos = 0u32;
for op in record.cigar().as_ref() {
let len = op.len() as u32;
match op.kind() {
CigarKind::Match | CigarKind::SequenceMatch | CigarKind::SequenceMismatch => {
for i in 0..len {
if (read_pos + i) < seq.len() as u32 {
base_at.insert(ref_pos + i, seq[(read_pos + i) as usize]);
}
}
ref_pos += len;
read_pos += len;
}
CigarKind::Insertion => {
if ref_pos > 0 {
ins_after.insert(ref_pos - 1);
}
read_pos += len;
}
CigarKind::Deletion | CigarKind::Skip => {
for i in 0..len {
del_at.insert(ref_pos + i);
}
ref_pos += len;
}
CigarKind::SoftClip => {
read_pos += len;
}
CigarKind::HardClip | CigarKind::Pad => {}
}
}
(base_at, ins_after, del_at)
}
#[test]
fn test_simulate_basic_pe() {
let env = simple_env();
let out = env.output_prefix();
let (ok, _, stderr) = run_simulate(&[
"simulate",
"-r",
env.fasta_path.to_str().unwrap(),
"-o",
out.to_str().unwrap(),
"--coverage",
"10",
"--read-length",
"50",
"--fragment-mean",
"150",
"--fragment-stddev",
"30",
]);
assert!(ok, "simulate failed: {stderr}");
let r1_path = std::path::PathBuf::from(format!("{}.r1.fastq.gz", out.display()));
let r2_path = std::path::PathBuf::from(format!("{}.r2.fastq.gz", out.display()));
assert!(r1_path.exists(), "R1 file missing: {}", r1_path.display());
assert!(r2_path.exists(), "R2 file missing: {}", r2_path.display());
let r1_contents = read_gzipped(&r1_path);
let r2_contents = read_gzipped(&r2_path);
let r1_records = count_fastq_records(&r1_contents);
let r2_records = count_fastq_records(&r2_contents);
assert_eq!(r1_records, r2_records, "R1 and R2 should have same record count");
assert!(r1_records > 0, "Should have generated some reads");
let first_seq = r1_contents.lines().nth(1).unwrap();
assert_eq!(first_seq.len(), 50, "Read length should be 50");
let first_name = r1_contents.lines().next().unwrap();
assert!(
first_name.starts_with("@holodeck::"),
"Read name should start with @holodeck:: got {first_name}"
);
}
#[test]
fn test_simulate_single_end() {
use holodeck_lib::read_naming::parse_encoded_se_name;
let env = simple_env();
let out = env.output_prefix();
let (ok, _, stderr) = run_simulate(&[
"simulate",
"-r",
env.fasta_path.to_str().unwrap(),
"-o",
out.to_str().unwrap(),
"--coverage",
"5",
"--read-length",
"50",
"--fragment-mean",
"100",
"--fragment-stddev",
"20",
"--single-end",
]);
assert!(ok, "simulate failed: {stderr}");
let r1_path = std::path::PathBuf::from(format!("{}.r1.fastq.gz", out.display()));
let r2_path = std::path::PathBuf::from(format!("{}.r2.fastq.gz", out.display()));
assert!(r1_path.exists(), "R1 file should exist");
assert!(!r2_path.exists(), "R2 file should NOT exist for SE");
let r1_contents = read_gzipped(&r1_path);
let r1_records = count_fastq_records(&r1_contents);
assert!(r1_records > 0, "Should have generated some SE reads");
let first_name = r1_contents.lines().next().unwrap().trim_start_matches('@');
let (_read_num, truth) = parse_encoded_se_name(first_name)
.unwrap_or_else(|| panic!("SE encoded name should parse: {first_name}"));
assert!(!truth.contig.is_empty(), "contig should be non-empty: {first_name}");
assert!(truth.fragment_length > 0, "fragment_length should be positive: {first_name}");
}
#[test]
fn test_fragment_length_identifies_adapter_boundary() {
use holodeck_lib::read_naming::parse_encoded_pe_name;
let adapter_r1_prefix = b"AGATCGG";
let adapter_r2_prefix = b"AGATCGG";
let seq = non_repetitive_seq(20_000);
let env = TestEnv::new(&[("chr1", &seq)]);
let out = env.output_prefix();
let (ok, _, stderr) = run_simulate(&[
"simulate",
"-r",
env.fasta_path.to_str().unwrap(),
"-o",
out.to_str().unwrap(),
"--coverage",
"20",
"--read-length",
"150",
"--fragment-mean",
"40",
"--fragment-stddev",
"10",
"--min-error-rate",
"0",
"--max-error-rate",
"0",
"--seed",
"11",
]);
assert!(ok, "simulate failed: {stderr}");
let check = |fastq_path: &std::path::Path, adapter_prefix: &[u8], label: &str| {
let contents = read_gzipped(fastq_path);
let mut lines = contents.lines();
let mut short_reads_checked = 0usize;
while let (Some(name_line), Some(seq_line), Some(_plus), Some(_qual)) =
(lines.next(), lines.next(), lines.next(), lines.next())
{
let name = name_line.trim_start_matches('@');
let Some((_num, r1, _r2)) = parse_encoded_pe_name(name) else { continue };
let frag_len = r1.fragment_length as usize;
let bases = seq_line.as_bytes();
assert_eq!(bases.len(), 150, "read length mismatch in {label}");
if frag_len < bases.len() {
let prefix_take = adapter_prefix.len().min(bases.len() - frag_len);
assert_eq!(
&bases[frag_len..frag_len + prefix_take],
&adapter_prefix[..prefix_take],
"{label}: bases starting at FRAG_LEN={frag_len} should match the adapter; got read={name}",
);
short_reads_checked += 1;
}
}
assert!(
short_reads_checked > 0,
"{label}: expected some reads with FRAG_LEN < read_length"
);
};
check(
&std::path::PathBuf::from(format!("{}.r1.fastq.gz", out.display())),
adapter_r1_prefix,
"R1",
);
check(
&std::path::PathBuf::from(format!("{}.r2.fastq.gz", out.display())),
adapter_r2_prefix,
"R2",
);
}
#[test]
fn test_simulate_simple_names() {
let env = simple_env();
let out = env.output_prefix();
let (ok, _, stderr) = run_simulate(&[
"simulate",
"-r",
env.fasta_path.to_str().unwrap(),
"-o",
out.to_str().unwrap(),
"--coverage",
"5",
"--read-length",
"50",
"--fragment-mean",
"100",
"--fragment-stddev",
"20",
"--simple-names",
]);
assert!(ok, "simulate failed: {stderr}");
let r1_path = std::path::PathBuf::from(format!("{}.r1.fastq.gz", out.display()));
let r1_contents = read_gzipped(&r1_path);
let first_name = r1_contents.lines().next().unwrap();
assert_eq!(first_name, "@holodeck::1", "Simple name should be holodeck::N");
}
#[test]
fn test_simulate_seed_reproducibility() {
let env = simple_env();
let out1 = env.dir.path().join("run1");
let out2 = env.dir.path().join("run2");
let common_args = |out: &std::path::Path| {
vec![
"simulate".to_string(),
"-r".to_string(),
env.fasta_path.to_str().unwrap().to_string(),
"-o".to_string(),
out.to_str().unwrap().to_string(),
"--coverage".to_string(),
"5".to_string(),
"--read-length".to_string(),
"50".to_string(),
"--fragment-mean".to_string(),
"100".to_string(),
"--fragment-stddev".to_string(),
"20".to_string(),
"--seed".to_string(),
"42".to_string(),
]
};
let args1 = common_args(&out1);
let args1_str: Vec<&str> = args1.iter().map(String::as_str).collect();
let (ok1, _, _) = run_simulate(&args1_str);
assert!(ok1);
let args2 = common_args(&out2);
let args2_str: Vec<&str> = args2.iter().map(String::as_str).collect();
let (ok2, _, _) = run_simulate(&args2_str);
assert!(ok2);
let r1_a = read_gzipped(&std::path::PathBuf::from(format!("{}.r1.fastq.gz", out1.display())));
let r1_b = read_gzipped(&std::path::PathBuf::from(format!("{}.r1.fastq.gz", out2.display())));
assert_eq!(r1_a, r1_b, "Same seed should produce identical output");
}
#[test]
fn test_simulate_with_bed_targets() {
let seq = b"ACGTACGTAC".repeat(100); let env = TestEnv::new(&[("chr1", &seq)]);
let bed_path = env.write_bed(&[("chr1", 100, 200)]); let out = env.output_prefix();
let (ok, _, stderr) = run_simulate(&[
"simulate",
"-r",
env.fasta_path.to_str().unwrap(),
"-o",
out.to_str().unwrap(),
"--coverage",
"10",
"--read-length",
"50",
"--fragment-mean",
"100",
"--fragment-stddev",
"20",
"-b",
bed_path.to_str().unwrap(),
]);
assert!(ok, "simulate with BED failed: {stderr}");
let r1_path = std::path::PathBuf::from(format!("{}.r1.fastq.gz", out.display()));
let r1_contents = read_gzipped(&r1_path);
let r1_records = count_fastq_records(&r1_contents);
assert!(r1_records > 0, "Should have generated reads overlapping target");
}
#[test]
fn test_simulate_sparse_targets_produce_expected_reads() {
let seq = b"ACGTACGTAC".repeat(10_000); let env = TestEnv::new(&[("chr1", &seq)]);
let bed_path = env.write_bed(&[
("chr1", 5_000, 5_100),
("chr1", 50_000, 50_100),
("chr1", 90_000, 90_100),
]);
let out = env.output_prefix();
let (ok, _, stderr) = run_simulate(&[
"simulate",
"-r",
env.fasta_path.to_str().unwrap(),
"-o",
out.to_str().unwrap(),
"--coverage",
"30",
"--read-length",
"50",
"--fragment-mean",
"100",
"--fragment-stddev",
"20",
"-b",
bed_path.to_str().unwrap(),
"--seed",
"1",
]);
assert!(ok, "simulate with sparse targets failed: {stderr}");
let r1_path = std::path::PathBuf::from(format!("{}.r1.fastq.gz", out.display()));
let r1_contents = read_gzipped(&r1_path);
let r1_records = count_fastq_records(&r1_contents);
assert!(
r1_records >= 165,
"Expected ~179 reads, got {r1_records} (too few — coverage calculation bug?)"
);
assert!(r1_records <= 195, "Expected ~179 reads, got {r1_records} (too many)");
}
#[test]
fn test_simulate_reads_overlap_targets() {
use holodeck_lib::read_naming::parse_encoded_pe_name;
let seq = b"ACGTACGTAC".repeat(10_000); let env = TestEnv::new(&[("chr1", &seq)]);
let bed_path = env.write_bed(&[("chr1", 10_000, 10_200)]);
let out = env.output_prefix();
let read_length: u32 = 50;
let (ok, _, stderr) = run_simulate(&[
"simulate",
"-r",
env.fasta_path.to_str().unwrap(),
"-o",
out.to_str().unwrap(),
"--coverage",
"50",
"--read-length",
&read_length.to_string(),
"--fragment-mean",
"100",
"--fragment-stddev",
"20",
"-b",
bed_path.to_str().unwrap(),
"--seed",
"7",
]);
assert!(ok, "simulate failed: {stderr}");
let r1_path = std::path::PathBuf::from(format!("{}.r1.fastq.gz", out.display()));
let r1_contents = read_gzipped(&r1_path);
let target_start: u32 = 10_000;
let target_end: u32 = 10_200;
let mut checked = 0;
for line in r1_contents.lines() {
if !line.starts_with('@') {
continue;
}
let name = line.trim_start_matches('@');
let Some((_read_num, r1, r2)) = parse_encoded_pe_name(name) else {
continue;
};
let frag_start_0 = r1.position.min(r2.position) - 1;
let frag_end_0 = frag_start_0 + r1.fragment_length;
assert!(
frag_end_0 > target_start && frag_start_0 < target_end,
"Fragment [{frag_start_0}, {frag_end_0}) does not overlap target [{target_start}, {target_end})"
);
checked += 1;
}
assert!(checked > 0, "Should have checked at least one read");
}
#[test]
fn test_simulate_golden_bam() {
use noodles::sam::alignment::record::data::field::Tag as DataTag;
use noodles::sam::alignment::record::data::field::Value as DataValueRef;
use noodles::sam::header::record::value::map::program::tag as pg_tag;
use noodles::sam::header::record::value::map::read_group::tag as rg_tag;
let env = simple_env();
let out = env.output_prefix();
let (ok, _, stderr) = run_simulate(&[
"simulate",
"-r",
env.fasta_path.to_str().unwrap(),
"-o",
out.to_str().unwrap(),
"--coverage",
"5",
"--read-length",
"50",
"--fragment-mean",
"100",
"--fragment-stddev",
"20",
"--golden-bam",
"--seed",
"42",
]);
assert!(ok, "simulate with --golden-bam failed: {stderr}");
let bam_path = std::path::PathBuf::from(format!("{}.golden.bam", out.display()));
assert!(bam_path.exists(), "Golden BAM file missing: {}", bam_path.display());
let mut reader = noodles::bam::io::reader::Builder.build_from_path(&bam_path).unwrap();
let header = reader.read_header().unwrap();
assert_eq!(header.reference_sequences().len(), 1);
let (name, _) = header.reference_sequences().get_index(0).unwrap();
assert_eq!(name.as_ref() as &[u8], b"chr1");
assert_eq!(header.programs().as_ref().len(), 1);
let (pg_id, pg) = header.programs().as_ref().iter().next().unwrap();
assert_eq!(pg_id.as_ref() as &[u8], b"holodeck");
let pg_name = pg.other_fields().get(&pg_tag::NAME).expect("@PG missing PN");
assert_eq!(pg_name.as_ref() as &[u8], b"holodeck");
let pg_cl = pg.other_fields().get(&pg_tag::COMMAND_LINE).expect("@PG missing CL");
let pg_cl_str = std::str::from_utf8(pg_cl).unwrap();
assert!(pg_cl_str.contains("--golden-bam"), "@PG CL should echo argv, got: {pg_cl_str}");
let pg_vn = pg.other_fields().get(&pg_tag::VERSION).expect("@PG missing VN");
assert!(!pg_vn.is_empty(), "@PG VN should be non-empty");
assert_eq!(header.read_groups().len(), 1);
let (rg_id, rg) = header.read_groups().iter().next().unwrap();
assert_eq!(rg_id.as_ref() as &[u8], b"A");
let rg_sm = rg.other_fields().get(&rg_tag::SAMPLE).expect("@RG missing SM");
assert_eq!(rg_sm.as_ref() as &[u8], b"holodeck-simulation");
let rg_lb = rg.other_fields().get(&rg_tag::LIBRARY).expect("@RG missing LB");
assert_eq!(rg_lb.as_ref() as &[u8], b"holodeck-simulation");
let rg_pl = rg.other_fields().get(&rg_tag::PLATFORM).expect("@RG missing PL");
assert_eq!(rg_pl.as_ref() as &[u8], b"ILLUMINA");
let records: Vec<_> = reader.records().map(Result::unwrap).collect();
assert!(!records.is_empty(), "Golden BAM should have records");
for rec in &records {
let rg_field = rec
.data()
.get(&DataTag::READ_GROUP)
.expect("record missing RG tag")
.expect("failed to decode RG");
match rg_field {
DataValueRef::String(s) => assert_eq!(s as &[u8], b"A"),
other => panic!("RG tag should be a Z-string, got {other:?}"),
}
}
let r1_path = std::path::PathBuf::from(format!("{}.r1.fastq.gz", out.display()));
let r1_contents = read_gzipped(&r1_path);
let fastq_records = count_fastq_records(&r1_contents);
assert_eq!(records.len(), fastq_records * 2, "Golden BAM should have 2 records per read pair");
}
#[test]
fn test_golden_bam_read_group_uses_vcf_sample_name() {
use noodles::sam::header::record::value::map::read_group::tag as rg_tag;
let contig_len = 2_000usize;
let seq = non_repetitive_seq(contig_len);
let env = TestEnv::new(&[("chr1", &seq)]);
let vcf_path = env.write_vcf(
"NA12878",
&[("chr1", contig_len)],
&[VcfVariant {
chrom: "chr1",
pos_1based: 500,
ref_allele: "A",
alt_alleles: &["T"],
gt: "0|1",
}],
);
let out = env.output_prefix();
let (ok, _, stderr) = run_simulate(&[
"simulate",
"-r",
env.fasta_path.to_str().unwrap(),
"-v",
vcf_path.to_str().unwrap(),
"-o",
out.to_str().unwrap(),
"--coverage",
"5",
"--read-length",
"50",
"--fragment-mean",
"100",
"--fragment-stddev",
"20",
"--golden-bam",
"--seed",
"42",
]);
assert!(ok, "simulate failed: {stderr}");
let bam_path = std::path::PathBuf::from(format!("{}.golden.bam", out.display()));
let mut reader = noodles::bam::io::reader::Builder.build_from_path(&bam_path).unwrap();
let header = reader.read_header().unwrap();
let (_id, rg) = header.read_groups().iter().next().unwrap();
let sm = rg.other_fields().get(&rg_tag::SAMPLE).expect("@RG missing SM");
assert_eq!(sm.as_ref() as &[u8], b"NA12878");
let lb = rg.other_fields().get(&rg_tag::LIBRARY).expect("@RG missing LB");
assert_eq!(lb.as_ref() as &[u8], b"NA12878");
}
#[test]
fn test_targeted_coverage_matches_requested() {
use noodles::sam::alignment::Record as _;
let contig_len = 100_000usize;
let seq = b"ACGT".repeat(contig_len / 4);
let env = TestEnv::new(&[("chr1", &seq)]);
let targets: Vec<(&str, u32, u32)> = vec![
("chr1", 5_000, 5_010), ("chr1", 15_000, 15_050), ("chr1", 30_000, 30_100), ("chr1", 50_000, 50_250), ("chr1", 65_000, 66_000), ("chr1", 80_000, 85_000), ];
let bed_path = env.write_bed(&targets);
let out = env.output_prefix();
let requested_coverage = 500.0;
let (ok, _, stderr) = run_simulate(&[
"simulate",
"-r",
env.fasta_path.to_str().unwrap(),
"-o",
out.to_str().unwrap(),
"--coverage",
&requested_coverage.to_string(),
"--read-length",
"150",
"--fragment-mean",
"300",
"--fragment-stddev",
"50",
"-b",
bed_path.to_str().unwrap(),
"--golden-bam",
"--seed",
"42",
]);
assert!(ok, "simulate failed: {stderr}");
let bam_path = std::path::PathBuf::from(format!("{}.golden.bam", out.display()));
let mut reader = noodles::bam::io::reader::Builder.build_from_path(&bam_path).unwrap();
let _header = reader.read_header().unwrap();
let mut coverage = vec![0u32; contig_len];
for result in reader.records() {
let record = result.unwrap();
let Some(Ok(start)) = record.alignment_start() else { continue };
let Some(Ok(end)) = record.alignment_end() else { continue };
let start_0 = usize::from(start) - 1;
let end_0 = usize::from(end); for depth in coverage.iter_mut().take(end_0.min(contig_len)).skip(start_0) {
*depth += 1;
}
}
let tolerance = 0.08;
let mut all_ok = true;
for &(_, start, end) in &targets {
let width = (end - start) as usize;
let total: u64 = coverage[start as usize..end as usize].iter().map(|&c| u64::from(c)).sum();
#[allow(clippy::cast_precision_loss)]
let mean_cov = total as f64 / width as f64;
let lower = requested_coverage * (1.0 - tolerance);
let upper = requested_coverage * (1.0 + tolerance);
eprintln!(
" Target [{start}, {end}) width={width}bp: mean_cov={mean_cov:.1}x \
(expected {requested_coverage}x ±{:.0}%)",
tolerance * 100.0
);
if mean_cov < lower || mean_cov > upper {
eprintln!(" FAIL: outside [{lower:.1}, {upper:.1}]");
all_ok = false;
}
}
assert!(all_ok, "One or more targets had mean coverage outside ±5% tolerance");
}
#[test]
fn test_wgs_coverage_accuracy() {
let seq = non_repetitive_seq(10_000);
let env = TestEnv::new(&[("chr1", &seq)]);
let out = env.output_prefix();
let (ok, _, stderr) = run_simulate(&[
"simulate",
"-r",
env.fasta_path.to_str().unwrap(),
"-o",
out.to_str().unwrap(),
"--coverage",
"100",
"--read-length",
"150",
"--fragment-mean",
"300",
"--fragment-stddev",
"50",
"--min-error-rate",
"0",
"--max-error-rate",
"0",
"--golden-bam",
"--seed",
"42",
]);
assert!(ok, "simulate failed: {stderr}");
let bam_path = PathBuf::from(format!("{}.golden.bam", out.display()));
let pileup = pileup_bases(&bam_path);
let cols = &pileup[0];
let edge = 300usize;
let interior: Vec<&PileupColumn> = cols[edge..cols.len() - edge].iter().collect();
assert!(!interior.is_empty(), "interior region should be non-empty");
let mean_depth: f64 =
interior.iter().map(|c| f64::from(c.total)).sum::<f64>() / interior.len() as f64;
assert!(
(95.0..=105.0).contains(&mean_depth),
"interior mean depth {mean_depth:.1}x should be 100x ±5%"
);
for (i, col) in interior.iter().enumerate() {
let ref_base = seq[edge + i];
let ref_count = match ref_base {
b'A' => col.a,
b'C' => col.c,
b'G' => col.g,
b'T' => col.t,
_ => 0,
};
assert_eq!(
ref_count,
col.total,
"pos {} should be 100% reference base '{}' but got {}/{} ref",
edge + i,
char::from(ref_base),
ref_count,
col.total
);
}
}
#[test]
fn test_multi_contig_read_distribution() {
use holodeck_lib::read_naming::parse_encoded_pe_name;
let seq1 = non_repetitive_seq(10_000);
let seq2 = non_repetitive_seq(5_000);
let seq3 = non_repetitive_seq(1_000);
let env = TestEnv::new(&[("chr1", &seq1), ("chr2", &seq2), ("chr3", &seq3)]);
let out = env.output_prefix();
let (ok, _, stderr) = run_simulate(&[
"simulate",
"-r",
env.fasta_path.to_str().unwrap(),
"-o",
out.to_str().unwrap(),
"--coverage",
"30",
"--read-length",
"150",
"--fragment-mean",
"300",
"--fragment-stddev",
"50",
"--min-error-rate",
"0",
"--max-error-rate",
"0",
"--golden-bam",
"--seed",
"99",
]);
assert!(ok, "simulate failed: {stderr}");
let r1_path = PathBuf::from(format!("{}.r1.fastq.gz", out.display()));
let r1_contents = read_gzipped(&r1_path);
let mut contig_counts: std::collections::HashMap<String, usize> =
std::collections::HashMap::new();
for line in r1_contents.lines() {
if !line.starts_with('@') {
continue;
}
let name = line.trim_start_matches('@');
let Some((_read_num, r1, _r2)) = parse_encoded_pe_name(name) else { continue };
*contig_counts.entry(r1.contig.clone()).or_insert(0) += 1;
}
let total: usize = contig_counts.values().sum();
assert!(total > 0, "should have generated reads");
let expected = [("chr1", 10.0_f64 / 16.0), ("chr2", 5.0 / 16.0), ("chr3", 1.0 / 16.0)];
for (chrom, expected_frac) in &expected {
let count = *contig_counts.get(*chrom).unwrap_or(&0);
let actual_frac = count as f64 / total as f64;
let rel_err = (actual_frac - expected_frac).abs() / expected_frac;
assert!(
rel_err <= 0.15,
"{chrom}: expected {:.1}% of reads, got {:.1}% (rel error {:.1}%)",
expected_frac * 100.0,
actual_frac * 100.0,
rel_err * 100.0
);
}
let bam_path = PathBuf::from(format!("{}.golden.bam", out.display()));
let pileup = pileup_bases(&bam_path);
let cols = &pileup[0];
let edge = 300usize;
let interior: Vec<&PileupColumn> = cols[edge..cols.len() - edge].iter().collect();
let mean_depth: f64 =
interior.iter().map(|c| f64::from(c.total)).sum::<f64>() / interior.len() as f64;
assert!(
(24.0..=36.0).contains(&mean_depth),
"chr1 interior mean depth {mean_depth:.1}x should be ~30x ±20%"
);
for (i, col) in interior.iter().enumerate() {
let ref_base = seq1[edge + i];
let ref_count = match ref_base {
b'A' => col.a,
b'C' => col.c,
b'G' => col.g,
b'T' => col.t,
_ => 0,
};
assert_eq!(
ref_count,
col.total,
"chr1 pos {} should be 100% reference but got {}/{} ref",
edge + i,
ref_count,
col.total
);
}
}
#[test]
fn test_zero_error_rate_reads_match_reference() {
let seq = non_repetitive_seq(1_000);
let env = TestEnv::new(&[("chr1", &seq)]);
let out = env.output_prefix();
let (ok, _, stderr) = run_simulate(&[
"simulate",
"-r",
env.fasta_path.to_str().unwrap(),
"-o",
out.to_str().unwrap(),
"--coverage",
"500",
"--read-length",
"50",
"--fragment-mean",
"150",
"--fragment-stddev",
"20",
"--min-error-rate",
"0",
"--max-error-rate",
"0",
"--golden-bam",
"--seed",
"7",
]);
assert!(ok, "simulate failed: {stderr}");
let bam_path = PathBuf::from(format!("{}.golden.bam", out.display()));
let pileup = pileup_bases(&bam_path);
let cols = &pileup[0];
let edge = 210usize; let interior: Vec<&PileupColumn> = cols[edge..cols.len() - edge].iter().collect();
for (i, col) in interior.iter().enumerate() {
let abs_pos = edge + i;
assert!(
col.total >= 300,
"pos {abs_pos}: depth {} is below minimum 300 — simulation may have generated too few reads",
col.total
);
let ref_base = seq[abs_pos];
let ref_count = match ref_base {
b'A' => col.a,
b'C' => col.c,
b'G' => col.g,
b'T' => col.t,
_ => 0,
};
assert_eq!(
ref_count,
col.total,
"pos {abs_pos}: expected 100% '{}' but got {}/{} ref (a={} c={} g={} t={} n={})",
char::from(ref_base),
ref_count,
col.total,
col.a,
col.c,
col.g,
col.t,
col.n
);
}
}
#[test]
fn test_adapter_content_short_fragments() {
let seq = b"ACGT".repeat(250); let env = TestEnv::new(&[("chr1", &seq)]);
let out = env.output_prefix();
let (ok, _, stderr) = run_simulate(&[
"simulate",
"-r",
env.fasta_path.to_str().unwrap(),
"-o",
out.to_str().unwrap(),
"--fragment-mean",
"30",
"--fragment-stddev",
"5",
"--read-length",
"100",
"--min-error-rate",
"0",
"--max-error-rate",
"0",
"--golden-bam",
"--seed",
"42",
]);
assert!(ok, "simulate failed: {stderr}");
let r1_prefix = b"AGATCGGA"; let r1_path = PathBuf::from(format!("{}.r1.fastq.gz", out.display()));
let r2_path = PathBuf::from(format!("{}.r2.fastq.gz", out.display()));
let r1_contents = read_gzipped(&r1_path);
let r2_contents = read_gzipped(&r2_path);
let r1_lines: Vec<&str> = r1_contents.lines().collect();
let r2_lines: Vec<&str> = r2_contents.lines().collect();
let n_records = r1_lines.len() / 4;
assert!(n_records > 0, "should have generated reads");
for i in 0..n_records {
let r1_seq = r1_lines[i * 4 + 1].as_bytes();
let r2_seq = r2_lines[i * 4 + 1].as_bytes();
let find_adapter = |seq: &[u8], prefix: &[u8]| -> Option<usize> {
seq.windows(prefix.len()).position(|w| w == prefix)
};
let r1_adapter_start = find_adapter(r1_seq, r1_prefix);
let r2_adapter_start = find_adapter(r2_seq, r1_prefix);
assert!(r1_adapter_start.is_some(), "read {i}: R1 should contain adapter prefix");
assert!(r2_adapter_start.is_some(), "read {i}: R2 should contain adapter prefix");
assert_eq!(
r1_adapter_start, r2_adapter_start,
"read {i}: adapter start should be same in R1 ({r1_adapter_start:?}) and R2 ({r2_adapter_start:?})"
);
}
let bam_path = PathBuf::from(format!("{}.golden.bam", out.display()));
let mut reader = bam::io::reader::Builder.build_from_path(&bam_path).unwrap();
let header = reader.read_header().unwrap();
for result in reader.record_bufs(&header) {
let record: RecordBuf = result.unwrap();
let ops = record.cigar().as_ref();
assert!(!ops.is_empty(), "CIGAR should not be empty");
let first_is_clip = ops.first().unwrap().kind() == CigarKind::SoftClip;
let last_is_clip = ops.last().unwrap().kind() == CigarKind::SoftClip;
assert!(
first_is_clip ^ last_is_clip,
"expected exactly one terminal SoftClip; first={:?} last={:?}",
ops.first().unwrap().kind(),
ops.last().unwrap().kind()
);
let m_len: usize = ops
.iter()
.filter(|op| {
matches!(
op.kind(),
CigarKind::Match | CigarKind::SequenceMatch | CigarKind::SequenceMismatch
)
})
.map(|op| op.len())
.sum();
let s_len: usize =
ops.iter().filter(|op| op.kind() == CigarKind::SoftClip).map(|op| op.len()).sum();
assert_eq!(m_len + s_len, 100, "M+S ops should sum to read_length=100");
}
let pileup = pileup_bases(&bam_path);
let cols = &pileup[0];
for (pos, col) in cols.iter().enumerate() {
if col.total < 5 {
continue;
} let ref_base = seq[pos];
let ref_count = match ref_base {
b'A' => col.a,
b'C' => col.c,
b'G' => col.g,
b'T' => col.t,
_ => 0,
};
assert_eq!(
ref_count,
col.total,
"pos {pos}: expected 100% '{}' but got {}/{} ref",
char::from(ref_base),
ref_count,
col.total
);
}
}
#[test]
fn test_snp_hom_alt_pileup() {
let seq = non_repetitive_seq(1_000);
let ref_base = seq[500];
let alt_base = match ref_base {
b'A' => b'C',
b'C' => b'G',
b'G' => b'T',
_ => b'A',
};
let ref_str = (ref_base as char).to_string();
let alt_str = (alt_base as char).to_string();
let env = TestEnv::new(&[("chr1", &seq)]);
let vcf_path = env.write_vcf(
"SAMPLE",
&[("chr1", 1_000)],
&[VcfVariant {
chrom: "chr1",
pos_1based: 501, ref_allele: &ref_str,
alt_alleles: &[&alt_str],
gt: "1|1",
}],
);
let out = env.output_prefix();
let (ok, _, stderr) = run_simulate(&[
"simulate",
"-r",
env.fasta_path.to_str().unwrap(),
"-v",
vcf_path.to_str().unwrap(),
"-o",
out.to_str().unwrap(),
"--coverage",
"1000",
"--read-length",
"50",
"--fragment-mean",
"150",
"--fragment-stddev",
"20",
"--min-error-rate",
"0",
"--max-error-rate",
"0",
"--golden-bam",
"--seed",
"1",
]);
assert!(ok, "simulate failed: {stderr}");
let bam_path = PathBuf::from(format!("{}.golden.bam", out.display()));
let pileup = pileup_bases(&bam_path);
let cols = &pileup[0];
let snp_col = &cols[500];
let alt_count = match alt_base {
b'A' => snp_col.a,
b'C' => snp_col.c,
b'G' => snp_col.g,
b'T' => snp_col.t,
_ => 0,
};
assert!(snp_col.total > 0, "pos 500 should have coverage");
let alt_frac = f64::from(alt_count) / f64::from(snp_col.total);
assert!(
alt_frac >= 0.95,
"pos 500: expected ≥95% alt '{}' but got {}/{} ({:.1}%)",
char::from(alt_base),
alt_count,
snp_col.total,
alt_frac * 100.0
);
for pos in (490..500).chain(501..511) {
let col = &cols[pos];
if col.total == 0 {
continue;
}
let flanking_ref = seq[pos];
let ref_count = match flanking_ref {
b'A' => col.a,
b'C' => col.c,
b'G' => col.g,
b'T' => col.t,
_ => 0,
};
let ref_frac = f64::from(ref_count) / f64::from(col.total);
assert!(
ref_frac >= 0.99,
"pos {pos}: expected ≥99% ref '{}' but got {}/{} ({:.1}%)",
char::from(flanking_ref),
ref_count,
col.total,
ref_frac * 100.0
);
}
let edge = 210usize; for col in &cols[edge..cols.len() - edge] {
assert!(col.total >= 600, "pos {}: depth {} is below minimum 600", col.pos, col.total);
}
}
#[test]
fn test_snp_phased_het_pileup() {
let seq = non_repetitive_seq(1_000);
let ref_base = seq[500];
let alt_base = match ref_base {
b'A' => b'C',
b'C' => b'G',
b'G' => b'T',
_ => b'A',
};
let ref_str = (ref_base as char).to_string();
let alt_str = (alt_base as char).to_string();
let env = TestEnv::new(&[("chr1", &seq)]);
let vcf_path = env.write_vcf(
"SAMPLE",
&[("chr1", 1_000)],
&[VcfVariant {
chrom: "chr1",
pos_1based: 501, ref_allele: &ref_str,
alt_alleles: &[&alt_str],
gt: "0|1",
}],
);
let out = env.output_prefix();
let (ok, _, stderr) = run_simulate(&[
"simulate",
"-r",
env.fasta_path.to_str().unwrap(),
"-v",
vcf_path.to_str().unwrap(),
"-o",
out.to_str().unwrap(),
"--coverage",
"2000",
"--read-length",
"50",
"--fragment-mean",
"150",
"--fragment-stddev",
"20",
"--min-error-rate",
"0",
"--max-error-rate",
"0",
"--golden-bam",
"--seed",
"2",
]);
assert!(ok, "simulate failed: {stderr}");
let bam_path = PathBuf::from(format!("{}.golden.bam", out.display()));
let pileup = pileup_bases(&bam_path);
let cols = &pileup[0];
let snp_col = &cols[500];
let alt_count = match alt_base {
b'A' => snp_col.a,
b'C' => snp_col.c,
b'G' => snp_col.g,
b'T' => snp_col.t,
_ => 0,
};
assert!(snp_col.total > 0, "pos 500 should have coverage");
let alt_frac = f64::from(alt_count) / f64::from(snp_col.total);
assert!(
(0.42..=0.58).contains(&alt_frac),
"pos 500: expected alt fraction in [0.42, 0.58] but got {}/{} ({:.1}%)",
alt_count,
snp_col.total,
alt_frac * 100.0
);
for pos in (490..500).chain(501..511) {
let col = &cols[pos];
if col.total == 0 {
continue;
}
let flanking_ref = seq[pos];
let ref_count = match flanking_ref {
b'A' => col.a,
b'C' => col.c,
b'G' => col.g,
b'T' => col.t,
_ => 0,
};
let ref_frac = f64::from(ref_count) / f64::from(col.total);
assert!(
ref_frac >= 0.99,
"pos {pos}: expected ≥99% ref '{}' but got {}/{} ({:.1}%)",
char::from(flanking_ref),
ref_count,
col.total,
ref_frac * 100.0
);
}
let edge = 210usize; for col in &cols[edge..cols.len() - edge] {
assert!(col.total >= 1200, "pos {}: depth {} is below minimum 1200", col.pos, col.total);
}
}
#[test]
fn test_deletion_pileup() {
let seq = non_repetitive_seq(1_000);
let ref_4bp: String = seq[400..404].iter().map(|&b| b as char).collect();
let anchor_str: String = std::iter::once(seq[400] as char).collect();
let env = TestEnv::new(&[("chr1", &seq)]);
let vcf_path = env.write_vcf(
"SAMPLE",
&[("chr1", 1_000)],
&[VcfVariant {
chrom: "chr1",
pos_1based: 401,
ref_allele: &ref_4bp,
alt_alleles: &[&anchor_str],
gt: "1|1",
}],
);
let out = env.output_prefix();
let (ok, _, stderr) = run_simulate(&[
"simulate",
"-r",
env.fasta_path.to_str().unwrap(),
"-v",
vcf_path.to_str().unwrap(),
"-o",
out.to_str().unwrap(),
"--coverage",
"1000",
"--read-length",
"50",
"--fragment-mean",
"150",
"--fragment-stddev",
"20",
"--min-error-rate",
"0",
"--max-error-rate",
"0",
"--golden-bam",
"--seed",
"3",
]);
assert!(ok, "simulate failed: {stderr}");
let bam_path = PathBuf::from(format!("{}.golden.bam", out.display()));
let pileup = pileup_bases(&bam_path);
let cols = &pileup[0];
let flanking_totals: Vec<u32> = (380..400).chain(410..420).map(|pos| cols[pos].total).collect();
let flanking_sum: u32 = flanking_totals.iter().sum();
let flanking_mean = f64::from(flanking_sum) / flanking_totals.len() as f64;
let depth_threshold = (flanking_mean * 0.2) as u32;
let anchor_col = &cols[400];
assert!(anchor_col.total > 0, "anchor pos 400 should have coverage");
let anchor_base_count = match seq[400] {
b'A' => anchor_col.a,
b'C' => anchor_col.c,
b'G' => anchor_col.g,
b'T' => anchor_col.t,
_ => 0,
};
let anchor_frac = f64::from(anchor_base_count) / f64::from(anchor_col.total);
assert!(
anchor_frac >= 0.95,
"anchor pos 400: expected ≥95% ref base but got {}/{} ({:.1}%)",
anchor_base_count,
anchor_col.total,
anchor_frac * 100.0
);
for col in &cols[401..404] {
assert!(
col.total <= depth_threshold,
"deleted pos {}: depth {} exceeds 20% of flanking mean \
({flanking_mean:.0}); expected near-zero coverage",
col.pos,
col.total
);
}
for pos in (380..400).chain(410..420) {
let col = &cols[pos];
assert!(col.total >= 600, "flanking pos {pos}: depth {} below 600", col.total);
let ref_base = seq[pos];
let ref_count = match ref_base {
b'A' => col.a,
b'C' => col.c,
b'G' => col.g,
b'T' => col.t,
_ => 0,
};
let ref_frac = f64::from(ref_count) / f64::from(col.total);
assert!(
ref_frac >= 0.99,
"flanking pos {pos}: expected ≥99% ref '{}' but got {}/{} ({:.1}%)",
char::from(ref_base),
ref_count,
col.total,
ref_frac * 100.0
);
}
assert!(
any_bam_record_has_cigar_op(&bam_path, CigarKind::Deletion),
"expected at least one BAM record with a D CIGAR op"
);
}
#[test]
fn test_insertion_pileup() {
let seq = non_repetitive_seq(1_000);
let anchor_char = seq[500] as char;
let anchor_str: String = std::iter::once(anchor_char).collect();
let alt_str: String = format!("{anchor_char}CCC");
let env = TestEnv::new(&[("chr1", &seq)]);
let vcf_path = env.write_vcf(
"SAMPLE",
&[("chr1", 1_000)],
&[VcfVariant {
chrom: "chr1",
pos_1based: 501,
ref_allele: &anchor_str,
alt_alleles: &[&alt_str],
gt: "1|1",
}],
);
let out = env.output_prefix();
let (ok, _, stderr) = run_simulate(&[
"simulate",
"-r",
env.fasta_path.to_str().unwrap(),
"-v",
vcf_path.to_str().unwrap(),
"-o",
out.to_str().unwrap(),
"--coverage",
"1000",
"--read-length",
"50",
"--fragment-mean",
"150",
"--fragment-stddev",
"20",
"--min-error-rate",
"0",
"--max-error-rate",
"0",
"--golden-bam",
"--seed",
"4",
]);
assert!(ok, "simulate failed: {stderr}");
let bam_path = PathBuf::from(format!("{}.golden.bam", out.display()));
let pileup = pileup_bases(&bam_path);
let cols = &pileup[0];
let anchor_col = &cols[500];
assert!(anchor_col.total > 0, "anchor pos 500 should have coverage");
let ccc_count = anchor_col.ins_seqs.get(b"CCC".as_ref()).copied().unwrap_or(0);
let ccc_frac = f64::from(ccc_count) / f64::from(anchor_col.total);
assert!(
ccc_frac >= 0.90,
"anchor pos 500: expected ≥90% reads with 'CCC' insertion but got {}/{} ({:.1}%)",
ccc_count,
anchor_col.total,
ccc_frac * 100.0
);
let ins_frac = f64::from(anchor_col.ins) / f64::from(anchor_col.total);
assert!(
ins_frac >= 0.90,
"anchor pos 500: expected ≥90% reads with any insertion but got {}/{} ({:.1}%)",
anchor_col.ins,
anchor_col.total,
ins_frac * 100.0
);
for pos in (480..499).chain(502..521) {
let col = &cols[pos];
if col.total == 0 {
continue;
}
assert!(col.total >= 600, "flanking pos {pos}: depth {} below 600", col.total);
let ref_base = seq[pos];
let ref_count = match ref_base {
b'A' => col.a,
b'C' => col.c,
b'G' => col.g,
b'T' => col.t,
_ => 0,
};
let ref_frac = f64::from(ref_count) / f64::from(col.total);
assert!(
ref_frac >= 0.99,
"flanking pos {pos}: expected ≥99% ref '{}' but got {}/{} ({:.1}%)",
char::from(ref_base),
ref_count,
col.total,
ref_frac * 100.0
);
}
assert!(
any_bam_record_has_cigar_op(&bam_path, CigarKind::Insertion),
"expected at least one BAM record with an I CIGAR op"
);
}
#[test]
fn test_het_deletion_pileup() {
let seq = non_repetitive_seq(1_000);
let ref_4bp: String = seq[400..404].iter().map(|&b| b as char).collect();
let anchor_str: String = std::iter::once(seq[400] as char).collect();
let env = TestEnv::new(&[("chr1", &seq)]);
let vcf_path = env.write_vcf(
"SAMPLE",
&[("chr1", 1_000)],
&[VcfVariant {
chrom: "chr1",
pos_1based: 401,
ref_allele: &ref_4bp,
alt_alleles: &[&anchor_str],
gt: "0|1",
}],
);
let out = env.output_prefix();
let (ok, _, stderr) = run_simulate(&[
"simulate",
"-r",
env.fasta_path.to_str().unwrap(),
"-v",
vcf_path.to_str().unwrap(),
"-o",
out.to_str().unwrap(),
"--coverage",
"2000",
"--read-length",
"50",
"--fragment-mean",
"150",
"--fragment-stddev",
"20",
"--min-error-rate",
"0",
"--max-error-rate",
"0",
"--golden-bam",
"--seed",
"3",
]);
assert!(ok, "simulate failed: {stderr}");
let bam_path = PathBuf::from(format!("{}.golden.bam", out.display()));
let pileup = pileup_bases(&bam_path);
let cols = &pileup[0];
let flanking_totals: Vec<u32> = (380..400).chain(410..420).map(|pos| cols[pos].total).collect();
let flanking_sum: u32 = flanking_totals.iter().sum();
let flanking_mean = f64::from(flanking_sum) / flanking_totals.len() as f64;
for col in &cols[401..404] {
let frac_of_flanking = f64::from(col.total) / flanking_mean;
assert!(
(0.35..=0.65).contains(&frac_of_flanking),
"het-del pos {}: depth {} is {:.1}% of flanking mean {:.0}, \
expected ~50%",
col.pos,
col.total,
frac_of_flanking * 100.0,
flanking_mean
);
}
for pos in (380..400).chain(410..420) {
let col = &cols[pos];
if col.total == 0 {
continue;
}
let ref_base = seq[pos];
let ref_count = match ref_base {
b'A' => col.a,
b'C' => col.c,
b'G' => col.g,
b'T' => col.t,
_ => 0,
};
let ref_frac = f64::from(ref_count) / f64::from(col.total);
assert!(
ref_frac >= 0.99,
"flanking pos {pos}: expected ≥99% ref, got {:.1}%",
ref_frac * 100.0
);
}
}
#[test]
fn test_het_insertion_pileup() {
let seq = non_repetitive_seq(1_000);
let anchor_char = seq[500] as char;
let anchor_str = anchor_char.to_string();
let alt_str = format!("{anchor_char}CCC");
let env = TestEnv::new(&[("chr1", &seq)]);
let vcf_path = env.write_vcf(
"SAMPLE",
&[("chr1", 1_000)],
&[VcfVariant {
chrom: "chr1",
pos_1based: 501,
ref_allele: &anchor_str,
alt_alleles: &[&alt_str],
gt: "0|1",
}],
);
let out = env.output_prefix();
let (ok, _, stderr) = run_simulate(&[
"simulate",
"-r",
env.fasta_path.to_str().unwrap(),
"-v",
vcf_path.to_str().unwrap(),
"-o",
out.to_str().unwrap(),
"--coverage",
"2000",
"--read-length",
"50",
"--fragment-mean",
"150",
"--fragment-stddev",
"20",
"--min-error-rate",
"0",
"--max-error-rate",
"0",
"--golden-bam",
"--seed",
"4",
]);
assert!(ok, "simulate failed: {stderr}");
let bam_path = PathBuf::from(format!("{}.golden.bam", out.display()));
let pileup = pileup_bases(&bam_path);
let cols = &pileup[0];
let anchor_col = &cols[500];
assert!(anchor_col.total > 0, "anchor pos 500 should have coverage");
let ins_frac = f64::from(anchor_col.ins) / f64::from(anchor_col.total);
assert!(
(0.40..=0.60).contains(&ins_frac),
"anchor pos 500: expected 40-60% reads with insertion, got {}/{} ({:.1}%)",
anchor_col.ins,
anchor_col.total,
ins_frac * 100.0
);
}
#[test]
fn test_multiple_variants_same_contig() {
let seq = non_repetitive_seq(2_000);
let snp_ref_base = seq[400];
let snp_alt_base = match snp_ref_base {
b'A' => b'C',
b'C' => b'G',
b'G' => b'T',
_ => b'A',
};
let snp_ref_str = (snp_ref_base as char).to_string();
let snp_alt_str = (snp_alt_base as char).to_string();
let del_ref: String = seq[800..804].iter().map(|&b| b as char).collect();
let del_alt: String = std::iter::once(seq[800] as char).collect();
let ins_anchor = seq[1200] as char;
let ins_ref = ins_anchor.to_string();
let ins_alt = format!("{ins_anchor}GGG");
let env = TestEnv::new(&[("chr1", &seq)]);
let vcf_path = env.write_vcf(
"SAMPLE",
&[("chr1", 2_000)],
&[
VcfVariant {
chrom: "chr1",
pos_1based: 401,
ref_allele: &snp_ref_str,
alt_alleles: &[&snp_alt_str],
gt: "1|1",
},
VcfVariant {
chrom: "chr1",
pos_1based: 801,
ref_allele: &del_ref,
alt_alleles: &[&del_alt],
gt: "1|1",
},
VcfVariant {
chrom: "chr1",
pos_1based: 1201,
ref_allele: &ins_ref,
alt_alleles: &[&ins_alt],
gt: "1|1",
},
],
);
let out = env.output_prefix();
let (ok, _, stderr) = run_simulate(&[
"simulate",
"-r",
env.fasta_path.to_str().unwrap(),
"-v",
vcf_path.to_str().unwrap(),
"-o",
out.to_str().unwrap(),
"--coverage",
"500",
"--read-length",
"50",
"--fragment-mean",
"150",
"--fragment-stddev",
"20",
"--min-error-rate",
"0",
"--max-error-rate",
"0",
"--golden-bam",
"--seed",
"5",
]);
assert!(ok, "simulate failed: {stderr}");
let bam_path = PathBuf::from(format!("{}.golden.bam", out.display()));
let pileup = pileup_bases(&bam_path);
let cols = &pileup[0];
let snp_col = &cols[400];
let snp_alt_count = match snp_alt_base {
b'A' => snp_col.a,
b'C' => snp_col.c,
b'G' => snp_col.g,
b'T' => snp_col.t,
_ => 0,
};
assert!(snp_col.total > 0, "SNP position should have coverage");
let snp_frac = f64::from(snp_alt_count) / f64::from(snp_col.total);
assert!(snp_frac >= 0.95, "SNP pos 400: expected ≥95% alt, got {:.1}%", snp_frac * 100.0);
let flanking_mean: f64 =
(780..800).chain(810..820).map(|p| f64::from(cols[p].total)).sum::<f64>() / 30.0;
for col in &cols[801..804] {
let frac = f64::from(col.total) / flanking_mean;
assert!(
frac <= 0.20,
"del pos {}: depth {} is {:.1}% of flanking mean {:.0}, expected near-zero",
col.pos,
col.total,
frac * 100.0,
flanking_mean,
);
}
let ins_col = &cols[1200];
assert!(ins_col.total > 0, "Insertion anchor should have coverage");
let ins_frac = f64::from(ins_col.ins) / f64::from(ins_col.total);
assert!(
ins_frac >= 0.90,
"ins pos 1200: expected ≥90% insertion, got {:.1}%",
ins_frac * 100.0
);
}
#[test]
fn test_unphased_het_snp_pileup() {
let seq = non_repetitive_seq(1_000);
let ref_base = seq[500];
let alt_base = match ref_base {
b'A' => b'C',
b'C' => b'G',
b'G' => b'T',
_ => b'A',
};
let ref_str = (ref_base as char).to_string();
let alt_str = (alt_base as char).to_string();
let env = TestEnv::new(&[("chr1", &seq)]);
let vcf_path = env.write_vcf(
"SAMPLE",
&[("chr1", 1_000)],
&[VcfVariant {
chrom: "chr1",
pos_1based: 501,
ref_allele: &ref_str,
alt_alleles: &[&alt_str],
gt: "0/1", }],
);
let out = env.output_prefix();
let (ok, _, stderr) = run_simulate(&[
"simulate",
"-r",
env.fasta_path.to_str().unwrap(),
"-v",
vcf_path.to_str().unwrap(),
"-o",
out.to_str().unwrap(),
"--coverage",
"2000",
"--read-length",
"50",
"--fragment-mean",
"150",
"--fragment-stddev",
"20",
"--min-error-rate",
"0",
"--max-error-rate",
"0",
"--golden-bam",
"--seed",
"10",
]);
assert!(ok, "simulate failed: {stderr}");
let bam_path = PathBuf::from(format!("{}.golden.bam", out.display()));
let pileup = pileup_bases(&bam_path);
let cols = &pileup[0];
let snp_col = &cols[500];
let alt_count = match alt_base {
b'A' => snp_col.a,
b'C' => snp_col.c,
b'G' => snp_col.g,
b'T' => snp_col.t,
_ => 0,
};
assert!(snp_col.total > 0, "pos 500 should have coverage");
let alt_frac = f64::from(alt_count) / f64::from(snp_col.total);
assert!(
(0.42..=0.58).contains(&alt_frac),
"unphased het pos 500: expected ~50% alt, got {}/{} ({:.1}%)",
alt_count,
snp_col.total,
alt_frac * 100.0
);
}
#[test]
fn test_multi_allelic_site() {
let seq = non_repetitive_seq(1_000);
let ref_base = seq[500];
let alts: Vec<u8> = b"ACGT".iter().copied().filter(|&b| b != ref_base).collect();
let alt1 = alts[0];
let alt2 = alts[1];
let ref_str = (ref_base as char).to_string();
let alt1_str = (alt1 as char).to_string();
let alt2_str = (alt2 as char).to_string();
let env = TestEnv::new(&[("chr1", &seq)]);
let vcf_path = env.write_vcf(
"SAMPLE",
&[("chr1", 1_000)],
&[VcfVariant {
chrom: "chr1",
pos_1based: 501,
ref_allele: &ref_str,
alt_alleles: &[&alt1_str, &alt2_str],
gt: "1|2",
}],
);
let out = env.output_prefix();
let (ok, _, stderr) = run_simulate(&[
"simulate",
"-r",
env.fasta_path.to_str().unwrap(),
"-v",
vcf_path.to_str().unwrap(),
"-o",
out.to_str().unwrap(),
"--coverage",
"2000",
"--read-length",
"50",
"--fragment-mean",
"150",
"--fragment-stddev",
"20",
"--min-error-rate",
"0",
"--max-error-rate",
"0",
"--golden-bam",
"--seed",
"11",
]);
assert!(ok, "simulate failed: {stderr}");
let bam_path = PathBuf::from(format!("{}.golden.bam", out.display()));
let pileup = pileup_bases(&bam_path);
let cols = &pileup[0];
let col = &cols[500];
assert!(col.total > 0, "pos 500 should have coverage");
let alt1_count = match alt1 {
b'A' => col.a,
b'C' => col.c,
b'G' => col.g,
b'T' => col.t,
_ => 0,
};
let alt2_count = match alt2 {
b'A' => col.a,
b'C' => col.c,
b'G' => col.g,
b'T' => col.t,
_ => 0,
};
let ref_count = match ref_base {
b'A' => col.a,
b'C' => col.c,
b'G' => col.g,
b'T' => col.t,
_ => 0,
};
let ref_frac = f64::from(ref_count) / f64::from(col.total);
assert!(
ref_frac <= 0.05,
"pos 500: expected ≤5% ref with GT 1|2, got {:.1}%",
ref_frac * 100.0
);
let alt1_frac = f64::from(alt1_count) / f64::from(col.total);
let alt2_frac = f64::from(alt2_count) / f64::from(col.total);
assert!(
(0.40..=0.60).contains(&alt1_frac),
"pos 500: expected alt1 '{}' ~50%, got {:.1}%",
alt1 as char,
alt1_frac * 100.0
);
assert!(
(0.40..=0.60).contains(&alt2_frac),
"pos 500: expected alt2 '{}' ~50%, got {:.1}%",
alt2 as char,
alt2_frac * 100.0
);
}
#[test]
fn test_phased_variant_cluster_cis() {
let seq = non_repetitive_seq(1_000);
let snp_positions: Vec<usize> = vec![490, 500, 510, 520];
let mut snp_alts: Vec<u8> = Vec::new();
let mut variants = Vec::new();
for &pos in &snp_positions {
let ref_base = seq[pos];
let alt_base = match ref_base {
b'A' => b'C',
b'C' => b'G',
b'G' => b'T',
_ => b'A',
};
snp_alts.push(alt_base);
}
let ref_strs: Vec<String> =
snp_positions.iter().map(|&pos| (seq[pos] as char).to_string()).collect();
let alt_strs: Vec<String> = snp_alts.iter().map(|&b| (b as char).to_string()).collect();
let alt_refs: Vec<&str> = alt_strs.iter().map(String::as_str).collect();
for (i, &pos) in snp_positions.iter().enumerate() {
variants.push(VcfVariant {
chrom: "chr1",
pos_1based: (pos + 1) as u32,
ref_allele: &ref_strs[i],
alt_alleles: std::slice::from_ref(&alt_refs[i]),
gt: "0|1",
});
}
let env = TestEnv::new(&[("chr1", &seq)]);
let vcf_path = env.write_vcf("SAMPLE", &[("chr1", 1_000)], &variants);
let out = env.output_prefix();
let (ok, _, stderr) = run_simulate(&[
"simulate",
"-r",
env.fasta_path.to_str().unwrap(),
"-v",
vcf_path.to_str().unwrap(),
"-o",
out.to_str().unwrap(),
"--coverage",
"1000",
"--read-length",
"100",
"--fragment-mean",
"200",
"--fragment-stddev",
"20",
"--min-error-rate",
"0",
"--max-error-rate",
"0",
"--golden-bam",
"--seed",
"20",
]);
assert!(ok, "simulate failed: {stderr}");
let bam_path = PathBuf::from(format!("{}.golden.bam", out.display()));
let mut reader = bam::io::reader::Builder.build_from_path(&bam_path).unwrap();
let header = reader.read_header().unwrap();
let mut all_alt = 0u32;
let mut all_ref = 0u32;
let mut mixed = 0u32;
for result in reader.record_bufs(&header) {
let record = result.unwrap();
let (base_at, _, _) = read_base_map(&record);
let snp_pos_u32: Vec<u32> = snp_positions.iter().map(|&p| p as u32).collect();
if !snp_pos_u32.iter().all(|p| base_at.contains_key(p)) {
continue;
}
let mut has_alt = Vec::new();
for (i, &pos) in snp_pos_u32.iter().enumerate() {
let read_base = base_at[&pos];
has_alt.push(read_base == snp_alts[i]);
}
if has_alt.iter().all(|&a| a) {
all_alt += 1;
} else if has_alt.iter().all(|&a| !a) {
all_ref += 1;
} else {
mixed += 1;
}
}
let total_spanning = all_alt + all_ref + mixed;
assert!(total_spanning >= 100, "Need sufficient spanning reads, got {total_spanning}");
assert_eq!(
mixed, 0,
"Phased cis cluster: expected 0 mixed reads, got {mixed} \
(all_alt={all_alt}, all_ref={all_ref}, mixed={mixed})"
);
let alt_frac = f64::from(all_alt) / f64::from(total_spanning);
assert!(
(0.40..=0.60).contains(&alt_frac),
"Expected ~50% all-alt reads, got {:.1}%",
alt_frac * 100.0
);
}
#[test]
fn test_phased_variant_cluster_trans() {
let seq = non_repetitive_seq(1_000);
let snp_positions: Vec<usize> = vec![490, 500, 510, 520];
let gts = ["1|0", "0|1", "1|0", "0|1"]; let mut snp_alts: Vec<u8> = Vec::new();
for &pos in &snp_positions {
let alt = match seq[pos] {
b'A' => b'C',
b'C' => b'G',
b'G' => b'T',
_ => b'A',
};
snp_alts.push(alt);
}
let ref_strs: Vec<String> =
snp_positions.iter().map(|&pos| (seq[pos] as char).to_string()).collect();
let alt_strs: Vec<String> = snp_alts.iter().map(|&b| (b as char).to_string()).collect();
let alt_refs: Vec<&str> = alt_strs.iter().map(String::as_str).collect();
let mut variants = Vec::new();
for (i, &pos) in snp_positions.iter().enumerate() {
variants.push(VcfVariant {
chrom: "chr1",
pos_1based: (pos + 1) as u32,
ref_allele: &ref_strs[i],
alt_alleles: std::slice::from_ref(&alt_refs[i]),
gt: gts[i],
});
}
let env = TestEnv::new(&[("chr1", &seq)]);
let vcf_path = env.write_vcf("SAMPLE", &[("chr1", 1_000)], &variants);
let out = env.output_prefix();
let (ok, _, stderr) = run_simulate(&[
"simulate",
"-r",
env.fasta_path.to_str().unwrap(),
"-v",
vcf_path.to_str().unwrap(),
"-o",
out.to_str().unwrap(),
"--coverage",
"1000",
"--read-length",
"100",
"--fragment-mean",
"200",
"--fragment-stddev",
"20",
"--min-error-rate",
"0",
"--max-error-rate",
"0",
"--golden-bam",
"--seed",
"21",
]);
assert!(ok, "simulate failed: {stderr}");
let bam_path = PathBuf::from(format!("{}.golden.bam", out.display()));
let mut reader = bam::io::reader::Builder.build_from_path(&bam_path).unwrap();
let header = reader.read_header().unwrap();
let mut pattern_a = 0u32;
let mut pattern_b = 0u32;
let mut other = 0u32;
for result in reader.record_bufs(&header) {
let record = result.unwrap();
let (base_at, _, _) = read_base_map(&record);
let snp_pos_u32: Vec<u32> = snp_positions.iter().map(|&p| p as u32).collect();
if !snp_pos_u32.iter().all(|p| base_at.contains_key(p)) {
continue;
}
let has_alt: Vec<bool> =
snp_pos_u32.iter().enumerate().map(|(i, &pos)| base_at[&pos] == snp_alts[i]).collect();
if has_alt == [true, false, true, false] {
pattern_a += 1;
} else if has_alt == [false, true, false, true] {
pattern_b += 1;
} else {
other += 1;
}
}
let total_spanning = pattern_a + pattern_b + other;
assert!(total_spanning >= 100, "Need sufficient spanning reads, got {total_spanning}");
assert_eq!(
other, 0,
"Trans cluster: expected 0 unexpected patterns, got {other} \
(pattern_a={pattern_a}, pattern_b={pattern_b})"
);
let a_frac = f64::from(pattern_a) / f64::from(total_spanning);
assert!(
(0.40..=0.60).contains(&a_frac),
"Expected ~50% pattern A reads, got {:.1}%",
a_frac * 100.0
);
}
#[test]
fn test_unphased_variant_cluster() {
let seq = non_repetitive_seq(1_000);
let snp_positions: Vec<usize> = vec![495, 505, 515];
let mut snp_alts: Vec<u8> = Vec::new();
for &pos in &snp_positions {
let alt = match seq[pos] {
b'A' => b'C',
b'C' => b'G',
b'G' => b'T',
_ => b'A',
};
snp_alts.push(alt);
}
let ref_strs: Vec<String> =
snp_positions.iter().map(|&pos| (seq[pos] as char).to_string()).collect();
let alt_strs: Vec<String> = snp_alts.iter().map(|&b| (b as char).to_string()).collect();
let alt_refs: Vec<&str> = alt_strs.iter().map(String::as_str).collect();
let mut variants = Vec::new();
for (i, &pos) in snp_positions.iter().enumerate() {
variants.push(VcfVariant {
chrom: "chr1",
pos_1based: (pos + 1) as u32,
ref_allele: &ref_strs[i],
alt_alleles: std::slice::from_ref(&alt_refs[i]),
gt: "0/1", });
}
let env = TestEnv::new(&[("chr1", &seq)]);
let vcf_path = env.write_vcf("SAMPLE", &[("chr1", 1_000)], &variants);
let out = env.output_prefix();
let (ok, _, stderr) = run_simulate(&[
"simulate",
"-r",
env.fasta_path.to_str().unwrap(),
"-v",
vcf_path.to_str().unwrap(),
"-o",
out.to_str().unwrap(),
"--coverage",
"2000",
"--read-length",
"100",
"--fragment-mean",
"200",
"--fragment-stddev",
"20",
"--min-error-rate",
"0",
"--max-error-rate",
"0",
"--golden-bam",
"--seed",
"22",
]);
assert!(ok, "simulate failed: {stderr}");
let bam_path = PathBuf::from(format!("{}.golden.bam", out.display()));
let mut reader = bam::io::reader::Builder.build_from_path(&bam_path).unwrap();
let header = reader.read_header().unwrap();
let mut pattern_counts: HashMap<Vec<bool>, u32> = HashMap::new();
for result in reader.record_bufs(&header) {
let record = result.unwrap();
let (base_at, _, _) = read_base_map(&record);
let snp_pos_u32: Vec<u32> = snp_positions.iter().map(|&p| p as u32).collect();
if !snp_pos_u32.iter().all(|p| base_at.contains_key(p)) {
continue;
}
let pattern: Vec<bool> =
snp_pos_u32.iter().enumerate().map(|(i, &pos)| base_at[&pos] == snp_alts[i]).collect();
*pattern_counts.entry(pattern).or_insert(0) += 1;
}
let total: u32 = pattern_counts.values().sum();
assert!(total >= 200, "Need sufficient spanning reads, got {total}");
let distinct_patterns = pattern_counts.len();
assert_eq!(
distinct_patterns, 2,
"Expected exactly 2 read allele patterns (one per haplotype), got {distinct_patterns}"
);
for (pattern, &count) in &pattern_counts {
let frac = f64::from(count) / f64::from(total);
assert!(
(0.40..=0.60).contains(&frac),
"Pattern {pattern:?} has {count}/{total} reads ({:.1}%), expected ~50%",
frac * 100.0
);
}
}
#[test]
fn test_phased_snp_and_deletion_cluster() {
let seq = non_repetitive_seq(1_000);
let snp_ref_base = seq[500];
let snp_alt_base = match snp_ref_base {
b'A' => b'C',
b'C' => b'G',
b'G' => b'T',
_ => b'A',
};
let snp_ref_str = (snp_ref_base as char).to_string();
let snp_alt_str = (snp_alt_base as char).to_string();
let del_ref: String = seq[510..514].iter().map(|&b| b as char).collect();
let del_alt: String = std::iter::once(seq[510] as char).collect();
let env = TestEnv::new(&[("chr1", &seq)]);
let vcf_path = env.write_vcf(
"SAMPLE",
&[("chr1", 1_000)],
&[
VcfVariant {
chrom: "chr1",
pos_1based: 501,
ref_allele: &snp_ref_str,
alt_alleles: &[&snp_alt_str],
gt: "0|1",
},
VcfVariant {
chrom: "chr1",
pos_1based: 511,
ref_allele: &del_ref,
alt_alleles: &[&del_alt],
gt: "0|1",
},
],
);
let out = env.output_prefix();
let (ok, _, stderr) = run_simulate(&[
"simulate",
"-r",
env.fasta_path.to_str().unwrap(),
"-v",
vcf_path.to_str().unwrap(),
"-o",
out.to_str().unwrap(),
"--coverage",
"1000",
"--read-length",
"100",
"--fragment-mean",
"200",
"--fragment-stddev",
"20",
"--min-error-rate",
"0",
"--max-error-rate",
"0",
"--golden-bam",
"--seed",
"23",
]);
assert!(ok, "simulate failed: {stderr}");
let bam_path = PathBuf::from(format!("{}.golden.bam", out.display()));
let mut reader = bam::io::reader::Builder.build_from_path(&bam_path).unwrap();
let header = reader.read_header().unwrap();
let mut snp_alt_del_present = 0u32; let mut snp_ref_del_absent = 0u32; let mut discordant = 0u32;
for result in reader.record_bufs(&header) {
let record = result.unwrap();
let (base_at, _, del_at) = read_base_map(&record);
if !base_at.contains_key(&500) || !base_at.contains_key(&515) {
continue;
}
let has_snp_alt = base_at.get(&500) == Some(&snp_alt_base);
let has_deletion = del_at.contains(&511) || del_at.contains(&512) || del_at.contains(&513);
if has_snp_alt && has_deletion {
snp_alt_del_present += 1;
} else if !has_snp_alt && !has_deletion {
snp_ref_del_absent += 1;
} else {
discordant += 1;
}
}
let total = snp_alt_del_present + snp_ref_del_absent + discordant;
assert!(total >= 100, "Need sufficient spanning reads, got {total}");
assert_eq!(
discordant, 0,
"Phased SNP+deletion: expected 0 discordant reads, got {discordant} \
(concordant_alt={snp_alt_del_present}, concordant_ref={snp_ref_del_absent})"
);
let alt_frac = f64::from(snp_alt_del_present) / f64::from(total);
assert!(
(0.40..=0.60).contains(&alt_frac),
"Expected ~50% alt-carrying reads, got {:.1}%",
alt_frac * 100.0
);
}
#[test]
fn test_phased_snp_and_insertion_cluster() {
let seq = non_repetitive_seq(1_000);
let snp_ref_base = seq[500];
let snp_alt_base = match snp_ref_base {
b'A' => b'C',
b'C' => b'G',
b'G' => b'T',
_ => b'A',
};
let snp_ref_str = (snp_ref_base as char).to_string();
let snp_alt_str = (snp_alt_base as char).to_string();
let ins_anchor = seq[515] as char;
let ins_ref = ins_anchor.to_string();
let ins_alt = format!("{ins_anchor}TTT");
let env = TestEnv::new(&[("chr1", &seq)]);
let vcf_path = env.write_vcf(
"SAMPLE",
&[("chr1", 1_000)],
&[
VcfVariant {
chrom: "chr1",
pos_1based: 501,
ref_allele: &snp_ref_str,
alt_alleles: &[&snp_alt_str],
gt: "0|1",
},
VcfVariant {
chrom: "chr1",
pos_1based: 516,
ref_allele: &ins_ref,
alt_alleles: &[&ins_alt],
gt: "0|1",
},
],
);
let out = env.output_prefix();
let (ok, _, stderr) = run_simulate(&[
"simulate",
"-r",
env.fasta_path.to_str().unwrap(),
"-v",
vcf_path.to_str().unwrap(),
"-o",
out.to_str().unwrap(),
"--coverage",
"1000",
"--read-length",
"100",
"--fragment-mean",
"200",
"--fragment-stddev",
"20",
"--min-error-rate",
"0",
"--max-error-rate",
"0",
"--golden-bam",
"--seed",
"24",
]);
assert!(ok, "simulate failed: {stderr}");
let bam_path = PathBuf::from(format!("{}.golden.bam", out.display()));
let mut reader = bam::io::reader::Builder.build_from_path(&bam_path).unwrap();
let header = reader.read_header().unwrap();
let mut snp_alt_ins_present = 0u32;
let mut snp_ref_ins_absent = 0u32;
let mut discordant = 0u32;
for result in reader.record_bufs(&header) {
let record = result.unwrap();
let (base_at, ins_after, _) = read_base_map(&record);
if !base_at.contains_key(&500) || !base_at.contains_key(&515) {
continue;
}
let has_snp_alt = base_at.get(&500) == Some(&snp_alt_base);
let has_insertion = ins_after.contains(&515);
if has_snp_alt && has_insertion {
snp_alt_ins_present += 1;
} else if !has_snp_alt && !has_insertion {
snp_ref_ins_absent += 1;
} else {
discordant += 1;
}
}
let total = snp_alt_ins_present + snp_ref_ins_absent + discordant;
assert!(total >= 100, "Need sufficient spanning reads, got {total}");
let discordant_frac = f64::from(discordant) / f64::from(total);
assert!(
discordant_frac < 0.01,
"Phased SNP+insertion: expected <1% discordant, got {discordant}/{total} ({:.1}%) \
(concordant_alt={snp_alt_ins_present}, concordant_ref={snp_ref_ins_absent})",
discordant_frac * 100.0
);
let concordant = snp_alt_ins_present + snp_ref_ins_absent;
let alt_frac = f64::from(snp_alt_ins_present) / f64::from(concordant);
assert!(
(0.40..=0.60).contains(&alt_frac),
"Expected ~50% alt-carrying reads, got {:.1}%",
alt_frac * 100.0
);
}
#[test]
fn test_simulate_vcf_plus_bed() {
let seq = non_repetitive_seq(10_000);
let inside_ref = seq[5000];
let inside_alt = match inside_ref {
b'A' => b'C',
b'C' => b'G',
b'G' => b'T',
_ => b'A',
};
let outside_ref_str = (seq[1000] as char).to_string();
let outside_alt_str = match seq[1000] {
b'A' => "C".to_string(),
b'C' => "G".to_string(),
_ => "A".to_string(),
};
let inside_ref_str = (inside_ref as char).to_string();
let inside_alt_str = (inside_alt as char).to_string();
let env = TestEnv::new(&[("chr1", &seq)]);
let vcf_path = env.write_vcf(
"SAMPLE",
&[("chr1", 10_000)],
&[
VcfVariant {
chrom: "chr1",
pos_1based: 1001,
ref_allele: &outside_ref_str,
alt_alleles: &[&outside_alt_str],
gt: "1|1",
},
VcfVariant {
chrom: "chr1",
pos_1based: 5001,
ref_allele: &inside_ref_str,
alt_alleles: &[&inside_alt_str],
gt: "1|1",
},
],
);
let bed_path = env.write_bed(&[("chr1", 4800, 5200)]);
let out = env.output_prefix();
let (ok, _, stderr) = run_simulate(&[
"simulate",
"-r",
env.fasta_path.to_str().unwrap(),
"-v",
vcf_path.to_str().unwrap(),
"-b",
bed_path.to_str().unwrap(),
"-o",
out.to_str().unwrap(),
"--coverage",
"500",
"--read-length",
"100",
"--fragment-mean",
"200",
"--fragment-stddev",
"30",
"--min-error-rate",
"0",
"--max-error-rate",
"0",
"--golden-bam",
"--seed",
"30",
]);
assert!(ok, "simulate failed: {stderr}");
let bam_path = PathBuf::from(format!("{}.golden.bam", out.display()));
let pileup = pileup_bases(&bam_path);
let cols = &pileup[0];
let inside_col = &cols[5000];
assert!(inside_col.total > 0, "Inside-target position should have coverage");
let inside_alt_count = match inside_alt {
b'A' => inside_col.a,
b'C' => inside_col.c,
b'G' => inside_col.g,
b'T' => inside_col.t,
_ => 0,
};
let inside_frac = f64::from(inside_alt_count) / f64::from(inside_col.total);
assert!(
inside_frac >= 0.95,
"Inside-target SNP: expected ≥95% alt, got {:.1}%",
inside_frac * 100.0
);
let outside_col = &cols[1000];
assert!(
outside_col.total <= 5,
"Outside-target position 1000 should have near-zero coverage, got {}",
outside_col.total
);
}
#[test]
fn test_simulate_error_rate_introduces_mismatches() {
let seq = non_repetitive_seq(2_000);
let env = TestEnv::new(&[("chr1", &seq)]);
let out = env.output_prefix();
let (ok, _, stderr) = run_simulate(&[
"simulate",
"-r",
env.fasta_path.to_str().unwrap(),
"-o",
out.to_str().unwrap(),
"--coverage",
"200",
"--read-length",
"100",
"--fragment-mean",
"200",
"--fragment-stddev",
"30",
"--min-error-rate",
"0.01",
"--max-error-rate",
"0.02",
"--golden-bam",
"--seed",
"31",
]);
assert!(ok, "simulate failed: {stderr}");
let bam_path = PathBuf::from(format!("{}.golden.bam", out.display()));
let pileup = pileup_bases(&bam_path);
let cols = &pileup[0];
let edge = 250;
let interior = &cols[edge..cols.len() - edge];
assert!(!interior.is_empty());
let mut total_bases: u64 = 0;
let mut mismatch_bases: u64 = 0;
for (i, col) in interior.iter().enumerate() {
if col.total == 0 {
continue;
}
let ref_base = seq[edge + i];
let ref_count = match ref_base {
b'A' => col.a,
b'C' => col.c,
b'G' => col.g,
b'T' => col.t,
_ => 0,
};
total_bases += u64::from(col.total);
mismatch_bases += u64::from(col.total - ref_count);
}
assert!(total_bases > 0, "Should have some bases in interior");
let error_rate = mismatch_bases as f64 / total_bases as f64;
assert!(error_rate > 0.005, "Error rate {error_rate:.4} too low — errors should be detectable",);
assert!(error_rate < 0.05, "Error rate {error_rate:.4} too high — something is wrong",);
}
#[test]
fn test_simulate_multiple_contigs_with_variants() {
let seq1 = non_repetitive_seq(2_000);
let seq2 = non_repetitive_seq(2_000);
let snp1_ref = seq1[500];
let snp1_alt = match snp1_ref {
b'A' => b'C',
b'C' => b'G',
b'G' => b'T',
_ => b'A',
};
let snp2_ref = seq2[800];
let snp2_alt = match snp2_ref {
b'A' => b'C',
b'C' => b'G',
b'G' => b'T',
_ => b'A',
};
let snp1_ref_str = (snp1_ref as char).to_string();
let snp1_alt_str = (snp1_alt as char).to_string();
let snp2_ref_str = (snp2_ref as char).to_string();
let snp2_alt_str = (snp2_alt as char).to_string();
let env = TestEnv::new(&[("chr1", &seq1), ("chr2", &seq2)]);
let vcf_path = env.write_vcf(
"SAMPLE",
&[("chr1", 2_000), ("chr2", 2_000)],
&[
VcfVariant {
chrom: "chr1",
pos_1based: 501,
ref_allele: &snp1_ref_str,
alt_alleles: &[&snp1_alt_str],
gt: "1|1",
},
VcfVariant {
chrom: "chr2",
pos_1based: 801,
ref_allele: &snp2_ref_str,
alt_alleles: &[&snp2_alt_str],
gt: "1|1",
},
],
);
let out = env.output_prefix();
let (ok, _, stderr) = run_simulate(&[
"simulate",
"-r",
env.fasta_path.to_str().unwrap(),
"-v",
vcf_path.to_str().unwrap(),
"-o",
out.to_str().unwrap(),
"--coverage",
"500",
"--read-length",
"50",
"--fragment-mean",
"150",
"--fragment-stddev",
"20",
"--min-error-rate",
"0",
"--max-error-rate",
"0",
"--golden-bam",
"--seed",
"32",
]);
assert!(ok, "simulate failed: {stderr}");
let bam_path = PathBuf::from(format!("{}.golden.bam", out.display()));
let pileup = pileup_bases(&bam_path);
let col1 = &pileup[0][500];
assert!(col1.total > 0, "chr1 pos 500 should have coverage");
let alt1_count = match snp1_alt {
b'A' => col1.a,
b'C' => col1.c,
b'G' => col1.g,
b'T' => col1.t,
_ => 0,
};
let alt1_frac = f64::from(alt1_count) / f64::from(col1.total);
assert!(alt1_frac >= 0.95, "chr1 SNP: expected ≥95% alt, got {:.1}%", alt1_frac * 100.0);
let col2 = &pileup[1][800];
assert!(col2.total > 0, "chr2 pos 800 should have coverage");
let alt2_count = match snp2_alt {
b'A' => col2.a,
b'C' => col2.c,
b'G' => col2.g,
b'T' => col2.t,
_ => 0,
};
let alt2_frac = f64::from(alt2_count) / f64::from(col2.total);
assert!(alt2_frac >= 0.95, "chr2 SNP: expected ≥95% alt, got {:.1}%", alt2_frac * 100.0);
}
#[test]
fn test_simulate_very_short_contig() {
let seq = non_repetitive_seq(50); let env = TestEnv::new(&[("chr1", &seq)]);
let out = env.output_prefix();
let (ok, _, stderr) = run_simulate(&[
"simulate",
"-r",
env.fasta_path.to_str().unwrap(),
"-o",
out.to_str().unwrap(),
"--coverage",
"100",
"--read-length",
"50",
"--fragment-mean",
"300",
"--fragment-stddev",
"50",
"--seed",
"33",
]);
assert!(ok, "simulate failed: {stderr}");
let r1_path = PathBuf::from(format!("{}.r1.fastq.gz", out.display()));
let r1_contents = read_gzipped(&r1_path);
let r1_records = count_fastq_records(&r1_contents);
assert!(r1_records > 0, "Should have generated reads even for short contig");
}
#[test]
fn test_mutate_then_simulate() {
use helpers::run_mutate;
let seq = non_repetitive_seq(5_000);
let env = TestEnv::new(&[("chr1", &seq)]);
let vcf_path = env.dir.path().join("muts.vcf");
let (ok, _, stderr) = run_mutate(&[
"mutate",
"-r",
env.fasta_path.to_str().unwrap(),
"-o",
vcf_path.to_str().unwrap(),
"--snp-rate",
"0.005",
"--indel-rate",
"0",
"--mnp-rate",
"0",
"--seed",
"40",
]);
assert!(ok, "mutate failed: {stderr}");
let out = env.output_prefix();
let (ok, _, stderr) = run_simulate(&[
"simulate",
"-r",
env.fasta_path.to_str().unwrap(),
"-v",
vcf_path.to_str().unwrap(),
"-o",
out.to_str().unwrap(),
"--coverage",
"100",
"--read-length",
"50",
"--fragment-mean",
"150",
"--fragment-stddev",
"20",
"--min-error-rate",
"0",
"--max-error-rate",
"0",
"--golden-bam",
"--seed",
"41",
]);
assert!(ok, "simulate failed: {stderr}");
let bam_path = PathBuf::from(format!("{}.golden.bam", out.display()));
assert!(bam_path.exists(), "Golden BAM should exist");
let pileup = pileup_bases(&bam_path);
let cols = &pileup[0];
let edge = 200;
let interior = &cols[edge..cols.len() - edge];
let mut positions_with_alt = 0;
for (i, col) in interior.iter().enumerate() {
if col.total == 0 {
continue;
}
let ref_base = seq[edge + i];
let ref_count = match ref_base {
b'A' => col.a,
b'C' => col.c,
b'G' => col.g,
b'T' => col.t,
_ => 0,
};
if ref_count < col.total {
positions_with_alt += 1;
}
}
assert!(
positions_with_alt > 0,
"Expected some positions with non-reference bases from mutated VCF"
);
}
#[test]
fn test_simulate_then_eval() {
use helpers::run_eval;
let seq = non_repetitive_seq(2_000);
let env = TestEnv::new(&[("chr1", &seq)]);
let sim_out = env.output_prefix();
let (ok, _, stderr) = run_simulate(&[
"simulate",
"-r",
env.fasta_path.to_str().unwrap(),
"-o",
sim_out.to_str().unwrap(),
"--coverage",
"20",
"--read-length",
"50",
"--fragment-mean",
"150",
"--fragment-stddev",
"20",
"--golden-bam",
"--single-end",
"--seed",
"42",
]);
assert!(ok, "simulate failed: {stderr}");
let bam_path = PathBuf::from(format!("{}.golden.bam", sim_out.display()));
let eval_out = env.dir.path().join("eval");
let (ok, _, stderr) = run_eval(&[
"eval",
"--mapped",
bam_path.to_str().unwrap(),
"-o",
eval_out.to_str().unwrap(),
]);
assert!(ok, "eval failed: {stderr}");
let eval_file = PathBuf::from(format!("{}.eval.txt", eval_out.display()));
let contents = std::fs::read_to_string(&eval_file).unwrap();
for line in contents.lines() {
if line.starts_with("ALL\t") {
let fields: Vec<&str> = line.split('\t').collect();
let total: u64 = fields[1].parse().unwrap();
let correct: u64 = fields[2].parse().unwrap();
let mismapped: u64 = fields[3].parse().unwrap();
let unmapped: u64 = fields[4].parse().unwrap();
assert!(total > 0, "Should have evaluated some reads");
assert_eq!(correct, total, "All reads should be correct");
assert_eq!(mismapped, 0, "No reads should be mismapped");
assert_eq!(unmapped, 0, "No reads should be unmapped");
return;
}
}
panic!("ALL row not found in eval output");
}
#[test]
fn test_sample_without_vcf_fails() {
let env = simple_env();
let out = env.output_prefix();
let out_str = out.to_str().unwrap();
let ref_str = env.fasta_path.to_str().unwrap();
let (ok, _stdout, stderr) = run_simulate(&[
"simulate",
"-r",
ref_str,
"-o",
out_str,
"--sample",
"NA12878",
"--coverage",
"1",
]);
assert!(!ok, "should fail when --sample is given without --vcf");
assert!(
stderr.contains("--sample requires --vcf"),
"error should mention --sample requires --vcf, got: {stderr}"
);
}
#[test]
fn test_wrong_sample_name_fails() {
let env = simple_env();
let out = env.output_prefix();
let out_str = out.to_str().unwrap();
let ref_str = env.fasta_path.to_str().unwrap();
let vcf_path = env.write_vcf_header_only(&["SampleA"], &[("chr1", 1000)]);
let vcf_str = vcf_path.to_str().unwrap();
let (ok, _stdout, stderr) = run_simulate(&[
"simulate",
"-r",
ref_str,
"-v",
vcf_str,
"--sample",
"NoSuchSample",
"-o",
out_str,
"--coverage",
"1",
]);
assert!(!ok, "should fail when sample name is not in VCF");
assert!(
stderr.contains("NoSuchSample") && stderr.contains("not found"),
"error should name the missing sample, got: {stderr}"
);
assert!(stderr.contains("SampleA"), "error should list available samples, got: {stderr}");
}
#[test]
fn test_multi_sample_vcf_without_sample_flag_fails() {
let env = simple_env();
let out = env.output_prefix();
let out_str = out.to_str().unwrap();
let ref_str = env.fasta_path.to_str().unwrap();
let vcf_path = env.write_vcf_header_only(&["SampleA", "SampleB", "SampleC"], &[("chr1", 1000)]);
let vcf_str = vcf_path.to_str().unwrap();
let (ok, _stdout, stderr) =
run_simulate(&["simulate", "-r", ref_str, "-v", vcf_str, "-o", out_str, "--coverage", "1"]);
assert!(!ok, "should fail when multi-sample VCF has no --sample");
assert!(
stderr.contains("3 samples") && stderr.contains("--sample"),
"error should mention sample count and --sample flag, got: {stderr}"
);
assert!(
stderr.contains("SampleA") && stderr.contains("SampleB") && stderr.contains("SampleC"),
"error should list available samples, got: {stderr}"
);
}
#[test]
fn test_single_sample_vcf_without_sample_flag_works() {
let env = simple_env();
let out = env.output_prefix();
let out_str = out.to_str().unwrap();
let ref_str = env.fasta_path.to_str().unwrap();
let vcf_path = env.write_vcf_header_only(&["OnlySample"], &[("chr1", 1000)]);
let vcf_str = vcf_path.to_str().unwrap();
let (ok, _stdout, stderr) =
run_simulate(&["simulate", "-r", ref_str, "-v", vcf_str, "-o", out_str, "--coverage", "1"]);
assert!(ok, "single-sample VCF without --sample should succeed: {stderr}");
}
#[test]
fn test_multi_sample_vcf_with_correct_sample_works() {
let env = simple_env();
let out = env.output_prefix();
let out_str = out.to_str().unwrap();
let ref_str = env.fasta_path.to_str().unwrap();
let vcf_path = env.write_vcf_header_only(&["SampleA", "SampleB"], &[("chr1", 1000)]);
let vcf_str = vcf_path.to_str().unwrap();
let (ok, _stdout, stderr) = run_simulate(&[
"simulate",
"-r",
ref_str,
"-v",
vcf_str,
"--sample",
"SampleB",
"-o",
out_str,
"--coverage",
"1",
]);
assert!(ok, "multi-sample VCF with correct --sample should succeed: {stderr}");
}
#[test]
fn test_output_directory_does_not_exist_fails() {
let env = simple_env();
let ref_str = env.fasta_path.to_str().unwrap();
let bad_out = env.dir.path().join("no_such_dir").join("output");
let out_str = bad_out.to_str().unwrap();
let (ok, _stdout, stderr) =
run_simulate(&["simulate", "-r", ref_str, "-o", out_str, "--coverage", "1"]);
assert!(!ok, "should fail when output directory doesn't exist");
assert!(
stderr.contains("Output directory does not exist"),
"error should mention missing output directory, got: {stderr}"
);
}