use crate::commands::command::Command;
use crate::commands::common::parse_bool;
use crate::commands::simulate::common::{
FamilySizeArgs, InsertSizeArgs, QualityArgs, SimulationCommon, generate_random_sequence,
};
use crate::progress::ProgressTracker;
use crate::simulate::{FastqWriter, create_rng};
use anyhow::{Context, Result, bail};
use clap::Parser;
use crossbeam_channel::bounded;
use fgumi_dna::dna::complement_base;
use log::{info, warn};
use noodles::fasta;
use rand::{Rng, RngExt};
use rayon::prelude::*;
use std::fs::File;
use std::io::{BufRead, BufReader, BufWriter, Write};
use std::path::{Path, PathBuf};
use std::sync::Arc;
use std::thread;
#[derive(Parser, Debug)]
#[command(
name = "fastq-reads",
about = "Generate R1/R2 FASTQ.gz files for extract",
long_about = r#"
Generate synthetic paired-end FASTQ files with UMI sequences.
The output is suitable for input to `fgumi extract`. Each read pair contains:
- R1: UMI sequence + template (according to --read-structure-r1)
- R2: Template only (according to --read-structure-r2)
Quality scores follow a realistic model with warmup, peak, and decay phases.
"#
)]
pub struct FastqReads {
#[arg(short = '1', long = "r1", required = true)]
pub r1_output: PathBuf,
#[arg(short = '2', long = "r2", required = true)]
pub r2_output: PathBuf,
#[arg(long = "truth", required = true)]
pub truth_output: PathBuf,
#[arg(long = "read-structure-r1", default_value = "8M+T")]
pub read_structure_r1: String,
#[arg(long = "read-structure-r2", default_value = "+T")]
pub read_structure_r2: String,
#[arg(long = "duplex", default_value = "false", num_args = 0..=1, default_missing_value = "true", action = clap::ArgAction::Set, value_parser = parse_bool)]
pub duplex: bool,
#[arg(short = 'r', long = "reference")]
pub reference: Option<PathBuf>,
#[arg(short = 'i', long = "includelist")]
pub includelist: Option<PathBuf>,
#[arg(short = 't', long = "threads", default_value = "1")]
pub threads: usize,
#[command(flatten)]
pub common: SimulationCommon,
#[command(flatten)]
pub quality: QualityArgs,
#[command(flatten)]
pub family_size: FamilySizeArgs,
#[command(flatten)]
pub insert_size: InsertSizeArgs,
}
struct ReadRecord {
read_name: String,
r1_seq: Vec<u8>,
r1_quals: Vec<u8>,
r2_seq: Vec<u8>,
r2_quals: Vec<u8>,
umi: Vec<u8>,
mol_id: usize,
family_id: usize,
strand: &'static str,
chrom: Option<String>,
pos: Option<usize>,
}
struct ReferenceGenome {
names: Vec<String>,
sequences: Vec<Vec<u8>>,
cumulative_lengths: Vec<usize>,
total_length: usize,
}
impl ReferenceGenome {
fn load(path: &PathBuf) -> Result<Self> {
info!("Loading reference from {}", path.display());
let file = File::open(path)
.with_context(|| format!("Failed to open reference: {}", path.display()))?;
let reader = BufReader::new(file);
let mut fasta_reader = fasta::io::Reader::new(reader);
let mut names = Vec::new();
let mut sequences = Vec::new();
let mut cumulative_lengths = Vec::new();
let mut total_length = 0usize;
for result in fasta_reader.records() {
let record = result.with_context(|| "Failed to read FASTA record")?;
let name = std::str::from_utf8(record.name())
.with_context(|| "Invalid chromosome name")?
.to_string();
let seq: Vec<u8> =
record.sequence().as_ref().iter().map(|&b| b.to_ascii_uppercase()).collect();
if seq.len() >= 1000 {
total_length += seq.len();
cumulative_lengths.push(total_length);
names.push(name);
sequences.push(seq);
}
}
if sequences.is_empty() {
bail!("No valid sequences found in reference FASTA");
}
info!("Loaded {} chromosomes, total {} bp", sequences.len(), total_length);
Ok(Self { names, sequences, cumulative_lengths, total_length })
}
fn sample_sequence(
&self,
length: usize,
rng: &mut impl Rng,
) -> Option<(usize, usize, Vec<u8>)> {
for _ in 0..10 {
let genome_pos = rng.random_range(0..self.total_length.saturating_sub(length));
let chrom_idx =
self.cumulative_lengths.iter().position(|&cum| cum > genome_pos).unwrap_or(0);
let chrom_start =
if chrom_idx == 0 { 0 } else { self.cumulative_lengths[chrom_idx - 1] };
let local_pos = genome_pos - chrom_start;
let seq = &self.sequences[chrom_idx];
if local_pos + length > seq.len() {
continue;
}
let template = &seq[local_pos..local_pos + length];
if template.iter().any(|&b| b == b'N' || b == b'n') {
continue;
}
return Some((chrom_idx, local_pos, template.to_vec()));
}
None
}
}
#[derive(Clone)]
struct GenerationParams {
umi_length: usize,
read_length: usize,
duplex: bool,
min_family_size: usize,
r2_quality_offset: i8,
includelist: Option<Vec<Vec<u8>>>,
}
fn load_includelist(path: &Path) -> Result<Vec<Vec<u8>>> {
let file = File::open(path)
.with_context(|| format!("Failed to open includelist: {}", path.display()))?;
let reader = BufReader::new(file);
let mut umis: Vec<(usize, Vec<u8>)> = Vec::new();
for (line_num, line) in reader.lines().enumerate() {
let line = line.with_context(|| format!("Failed to read line {}", line_num + 1))?;
let trimmed = line.trim();
if trimmed.is_empty() {
continue;
}
let umi = trimmed.as_bytes().to_ascii_uppercase();
if !umi.iter().all(|&b| matches!(b, b'A' | b'C' | b'G' | b'T')) {
bail!("Invalid UMI at line {}: '{}' (only A, C, G, T allowed)", line_num + 1, trimmed,);
}
umis.push((line_num + 1, umi));
}
if umis.is_empty() {
bail!("Includelist is empty: {}", path.display());
}
let expected_len = umis[0].1.len();
for (file_line, umi) in &umis {
if umi.len() != expected_len {
bail!(
"UMI length mismatch at line {}: length {} but expected {} (from first UMI)",
file_line,
umi.len(),
expected_len,
);
}
}
Ok(umis.into_iter().map(|(_, umi)| umi).collect())
}
const CHANNEL_CAPACITY: usize = 1_000;
impl Command for FastqReads {
fn execute(&self, _command_line: &str) -> Result<()> {
let includelist = if let Some(ref path) = self.includelist {
let umis = load_includelist(path)?;
info!("Loaded {} UMIs from includelist (length={})", umis.len(), umis[0].len());
Some(umis)
} else {
None
};
let umi_length = if let Some(ref list) = includelist {
let list_len = list[0].len();
if list_len > self.common.read_length {
bail!(
"Includelist UMI length {} exceeds read length {}",
list_len,
self.common.read_length,
);
}
if self.common.umi_length != list_len {
warn!(
"--umi-length {} overridden by includelist UMI length {}",
self.common.umi_length, list_len,
);
}
list_len
} else {
if self.common.umi_length > self.common.read_length {
bail!(
"UMI length {} exceeds read length {}",
self.common.umi_length,
self.common.read_length,
);
}
self.common.umi_length
};
info!("Generating FASTQ reads");
info!(" Output R1: {}", self.r1_output.display());
info!(" Output R2: {}", self.r2_output.display());
info!(" Truth: {}", self.truth_output.display());
info!(" Num molecules: {}", self.common.num_molecules);
info!(" Read length: {}", self.common.read_length);
info!(" UMI length: {}", umi_length);
if let Some(ref path) = self.includelist {
info!(" Includelist: {}", path.display());
}
info!(" Duplex: {}", self.duplex);
info!(" Threads: {}", self.threads);
if let Some(ref path) = self.reference {
info!(" Reference: {}", path.display());
}
let reference = if let Some(ref path) = self.reference {
Some(Arc::new(ReferenceGenome::load(path)?))
} else {
None
};
let gen_threads = if self.threads <= 1 { 1 } else { self.threads.max(2) };
let compress_threads = self.threads;
let quality_model = Arc::new(self.quality.to_quality_model());
let quality_bias = Arc::new(self.quality.to_quality_bias());
let family_dist = Arc::new(self.family_size.to_family_size_distribution()?);
let insert_model = Arc::new(self.insert_size.to_insert_size_model());
let params = Arc::new(GenerationParams {
umi_length,
read_length: self.common.read_length,
duplex: self.duplex,
min_family_size: self.family_size.min_family_size,
r2_quality_offset: self.quality.r2_quality_offset,
includelist,
});
let mut seed_rng = create_rng(self.common.seed);
let mol_seeds: Vec<u64> =
(0..self.common.num_molecules).map(|_| seed_rng.random()).collect();
let (sender, receiver) = bounded::<Vec<ReadRecord>>(CHANNEL_CAPACITY);
let r1_path = self.r1_output.clone();
let r2_path = self.r2_output.clone();
let truth_path = self.truth_output.clone();
let has_reference = reference.is_some();
let writer_handle = thread::spawn(move || -> Result<u64> {
let mut r1_writer = FastqWriter::with_threads(&r1_path, compress_threads)?;
let mut r2_writer = FastqWriter::with_threads(&r2_path, compress_threads)?;
let truth_file = File::create(&truth_path)
.with_context(|| format!("Failed to create {}", truth_path.display()))?;
let mut truth_writer = BufWriter::new(truth_file);
if has_reference {
writeln!(
truth_writer,
"read_name\ttrue_umi\tmolecule_id\tfamily_id\tstrand\tchrom\tpos"
)?;
} else {
writeln!(truth_writer, "read_name\ttrue_umi\tmolecule_id\tfamily_id\tstrand")?;
}
let mut read_count = 0u64;
let progress = ProgressTracker::new("Generated read pairs").with_interval(100_000);
for batch in receiver {
for record in batch {
read_count += 1;
progress.log_if_needed(1);
r1_writer.write_record(&record.read_name, &record.r1_seq, &record.r1_quals)?;
r2_writer.write_record(&record.read_name, &record.r2_seq, &record.r2_quals)?;
if has_reference {
writeln!(
truth_writer,
"{}\t{}\t{}\t{}\t{}\t{}\t{}",
record.read_name,
String::from_utf8_lossy(&record.umi),
record.mol_id,
record.family_id,
record.strand,
record.chrom.as_deref().unwrap_or("."),
record.pos.map_or_else(|| ".".to_string(), |p| (p + 1).to_string())
)?;
} else {
writeln!(
truth_writer,
"{}\t{}\t{}\t{}\t{}",
record.read_name,
String::from_utf8_lossy(&record.umi),
record.mol_id,
record.family_id,
record.strand
)?;
}
}
}
progress.log_final();
r1_writer.finish()?;
r2_writer.finish()?;
truth_writer.flush()?;
Ok(read_count)
});
let pool = rayon::ThreadPoolBuilder::new()
.num_threads(gen_threads)
.build()
.with_context(|| "Failed to create thread pool")?;
let generation_result: Result<(), crossbeam_channel::SendError<Vec<ReadRecord>>> = pool
.install(|| {
mol_seeds.into_par_iter().enumerate().try_for_each(|(mol_id, seed)| {
let batch = generate_molecule_reads(
mol_id,
seed,
¶ms,
&quality_model,
&quality_bias,
&family_dist,
&insert_model,
reference.as_deref(),
);
sender.send(batch)
})
});
drop(sender);
if let Err(e) = generation_result {
return Err(anyhow::anyhow!("Failed to send record to writer: {e}"));
}
let read_count =
writer_handle.join().map_err(|_| anyhow::anyhow!("Writer thread panicked"))??;
info!("Generated {read_count} read pairs");
info!("Done");
Ok(())
}
}
#[allow(clippy::too_many_arguments)]
fn generate_molecule_reads(
mol_id: usize,
seed: u64,
params: &GenerationParams,
quality_model: &crate::simulate::PositionQualityModel,
quality_bias: &crate::simulate::ReadPairQualityBias,
family_dist: &crate::simulate::FamilySizeDistribution,
insert_model: &crate::simulate::InsertSizeModel,
reference: Option<&ReferenceGenome>,
) -> Vec<ReadRecord> {
let mut rng = create_rng(Some(seed));
let mut records = Vec::new();
let (umi_r1, umi_r2) = if let Some(ref includelist) = params.includelist {
let idx_r1 = rng.random_range(0..includelist.len());
let idx_r2 = rng.random_range(0..includelist.len());
(includelist[idx_r1].clone(), includelist[idx_r2].clone())
} else {
(
generate_random_sequence(params.umi_length, &mut rng),
generate_random_sequence(params.umi_length, &mut rng),
)
};
let family_size = family_dist.sample(&mut rng, params.min_family_size);
let insert_size = insert_model.sample(&mut rng);
let template_len = insert_size.saturating_sub(params.umi_length);
let (template, chrom, pos) = if let Some(ref_genome) = reference {
if let Some((chrom_idx, local_pos, seq)) =
ref_genome.sample_sequence(template_len, &mut rng)
{
let chrom_name = ref_genome.names[chrom_idx].clone();
(seq, Some(chrom_name), Some(local_pos))
} else {
(generate_random_sequence(template_len, &mut rng), None, None)
}
} else {
(generate_random_sequence(template_len, &mut rng), None, None)
};
let r1_template_len = params.read_length.saturating_sub(params.umi_length);
let r2_template_len = params.read_length.saturating_sub(params.umi_length);
let r2_start = template.len().saturating_sub(r2_template_len);
let r2_end = template.len();
let mut r1_seq_buf = Vec::with_capacity(params.read_length);
let mut r2_seq_buf = Vec::with_capacity(params.read_length);
let mut read_counter = 0usize;
for read_idx in 0..family_size {
let (strand, family_idx): (&'static str, usize) = if params.duplex {
if rng.random_bool(0.5) { ("A", 0) } else { ("B", 1) }
} else {
("A", 0)
};
let (r1_umi, r2_umi) = if strand == "B" {
(&umi_r2, &umi_r1) } else {
(&umi_r1, &umi_r2) };
let combined_umi: Vec<u8> = {
let mut umi = r1_umi.clone();
umi.push(b'-');
umi.extend_from_slice(r2_umi);
umi
};
read_counter += 1;
let read_name = format!("mol{mol_id:06}_read{read_counter:04}");
r1_seq_buf.clear();
r2_seq_buf.clear();
if strand == "A" {
r1_seq_buf.extend_from_slice(r1_umi);
let r1_template_end = r1_template_len.min(template.len());
r1_seq_buf.extend_from_slice(&template[..r1_template_end]);
r2_seq_buf.extend_from_slice(r2_umi);
reverse_complement_into(&template[r2_start..r2_end], &mut r2_seq_buf);
} else {
r1_seq_buf.extend_from_slice(r1_umi);
reverse_complement_into(&template[r2_start..r2_end], &mut r1_seq_buf);
r2_seq_buf.extend_from_slice(r2_umi);
let r1_template_end = r1_template_len.min(template.len());
r2_seq_buf.extend_from_slice(&template[..r1_template_end]);
}
introduce_errors_inplace(&mut r1_seq_buf[..params.umi_length], 0.01, &mut rng);
pad_sequence_inplace(&mut r1_seq_buf, params.read_length, &mut rng);
introduce_errors_inplace(&mut r2_seq_buf[..params.umi_length], 0.01, &mut rng);
pad_sequence_inplace(&mut r2_seq_buf, params.read_length, &mut rng);
let r1_quals = quality_model.generate_qualities(r1_seq_buf.len(), &mut rng);
let r2_quals_raw = quality_model.generate_qualities(r2_seq_buf.len(), &mut rng);
let r2_quals = if params.r2_quality_offset != 0 {
quality_bias.apply_to_vec(&r2_quals_raw, true)
} else {
r2_quals_raw
};
records.push(ReadRecord {
read_name,
r1_seq: r1_seq_buf.clone(),
r1_quals,
r2_seq: r2_seq_buf.clone(),
r2_quals,
umi: combined_umi.clone(),
mol_id,
family_id: family_idx * 1000 + read_idx,
strand,
chrom: chrom.clone(),
pos,
});
}
records
}
fn reverse_complement_into(seq: &[u8], output: &mut Vec<u8>) {
output.reserve(seq.len());
for &b in seq.iter().rev() {
output.push(complement_base(b));
}
}
fn introduce_errors_inplace(seq: &mut [u8], error_rate: f64, rng: &mut impl Rng) {
const ALTERNATIVES: [&[u8; 3]; 256] = {
let mut table: [&[u8; 3]; 256] = [b"ACG"; 256]; table[b'A' as usize] = b"CGT";
table[b'C' as usize] = b"AGT";
table[b'G' as usize] = b"ACT";
table[b'T' as usize] = b"ACG";
table
};
for base in seq.iter_mut() {
if rng.random::<f64>() < error_rate {
let alts = ALTERNATIVES[*base as usize];
*base = alts[rng.random_range(0..3)];
}
}
}
fn pad_sequence_inplace(seq: &mut Vec<u8>, target_len: usize, rng: &mut impl Rng) {
const BASES: &[u8] = b"ACGT";
seq.reserve(target_len.saturating_sub(seq.len()));
while seq.len() < target_len {
seq.push(BASES[rng.random_range(0..4)]);
}
seq.truncate(target_len);
}
#[cfg(test)]
#[allow(clippy::naive_bytecount)]
mod tests {
use super::*;
use crate::simulate::create_rng;
fn reverse_complement(seq: &[u8]) -> Vec<u8> {
let mut result = Vec::with_capacity(seq.len());
reverse_complement_into(seq, &mut result);
result
}
fn introduce_errors(seq: &[u8], error_rate: f64, rng: &mut impl Rng) -> Vec<u8> {
let mut result = seq.to_vec();
introduce_errors_inplace(&mut result, error_rate, rng);
result
}
fn pad_sequence(mut seq: Vec<u8>, target_len: usize, rng: &mut impl Rng) -> Vec<u8> {
pad_sequence_inplace(&mut seq, target_len, rng);
seq
}
#[test]
fn test_generate_random_sequence_length() {
let mut rng = create_rng(Some(42));
for len in [0, 1, 10, 100, 500] {
let seq = generate_random_sequence(len, &mut rng);
assert_eq!(seq.len(), len);
}
}
#[test]
fn test_generate_random_sequence_valid_bases() {
let mut rng = create_rng(Some(42));
let seq = generate_random_sequence(1000, &mut rng);
for &base in &seq {
assert!(
base == b'A' || base == b'C' || base == b'G' || base == b'T',
"Invalid base: {base}"
);
}
}
#[test]
fn test_generate_random_sequence_distribution() {
let mut rng = create_rng(Some(42));
let seq = generate_random_sequence(10000, &mut rng);
let count_a = seq.iter().filter(|&&b| b == b'A').count();
let count_c = seq.iter().filter(|&&b| b == b'C').count();
let count_g = seq.iter().filter(|&&b| b == b'G').count();
let count_t = seq.iter().filter(|&&b| b == b'T').count();
for (name, count) in [("A", count_a), ("C", count_c), ("G", count_g), ("T", count_t)] {
let frac = count as f64 / 10000.0;
assert!((0.20..0.30).contains(&frac), "{name} fraction {frac} out of expected range");
}
}
#[test]
fn test_generate_random_sequence_reproducible() {
let mut rng1 = create_rng(Some(42));
let mut rng2 = create_rng(Some(42));
let seq1 = generate_random_sequence(100, &mut rng1);
let seq2 = generate_random_sequence(100, &mut rng2);
assert_eq!(seq1, seq2);
}
#[test]
fn test_reverse_complement_basic() {
assert_eq!(reverse_complement(b"A"), b"T");
assert_eq!(reverse_complement(b"T"), b"A");
assert_eq!(reverse_complement(b"C"), b"G");
assert_eq!(reverse_complement(b"G"), b"C");
}
#[test]
fn test_reverse_complement_sequence() {
assert_eq!(reverse_complement(b"ACGT"), b"ACGT");
assert_eq!(reverse_complement(b"AAAT"), b"ATTT");
assert_eq!(reverse_complement(b"GGGC"), b"GCCC");
}
#[test]
fn test_reverse_complement_empty() {
assert_eq!(reverse_complement(b""), Vec::<u8>::new());
}
#[test]
fn test_reverse_complement_single() {
for (base, expected) in [(b'A', b'T'), (b'T', b'A'), (b'C', b'G'), (b'G', b'C')] {
assert_eq!(reverse_complement(&[base]), vec![expected]);
}
}
#[test]
fn test_reverse_complement_unknown_bases() {
assert_eq!(reverse_complement(b"N"), b"N");
assert_eq!(reverse_complement(b"X"), b"X");
assert_eq!(reverse_complement(b"ANT"), b"ANT"); }
#[test]
fn test_reverse_complement_double() {
let mut rng = create_rng(Some(42));
let seq = generate_random_sequence(100, &mut rng);
let rc = reverse_complement(&seq);
let rc_rc = reverse_complement(&rc);
assert_eq!(seq, rc_rc);
}
#[test]
fn test_introduce_errors_zero_rate() {
let mut rng = create_rng(Some(42));
let seq = b"ACGTACGTACGT";
let result = introduce_errors(seq, 0.0, &mut rng);
assert_eq!(result, seq.to_vec());
}
#[test]
fn test_introduce_errors_full_rate() {
let mut rng = create_rng(Some(42));
let seq = b"AAAAAAAAAA";
let result = introduce_errors(seq, 1.0, &mut rng);
for &base in &result {
assert_ne!(base, b'A', "Expected all bases to be mutated");
}
}
#[test]
fn test_introduce_errors_valid_bases() {
let mut rng = create_rng(Some(42));
let seq = generate_random_sequence(1000, &mut rng);
let result = introduce_errors(&seq, 0.5, &mut rng);
for &base in &result {
assert!(
base == b'A' || base == b'C' || base == b'G' || base == b'T',
"Invalid base after mutation: {base}"
);
}
}
#[test]
fn test_introduce_errors_length_preserved() {
let mut rng = create_rng(Some(42));
let seq = generate_random_sequence(100, &mut rng);
let result = introduce_errors(&seq, 0.1, &mut rng);
assert_eq!(result.len(), seq.len());
}
#[test]
fn test_introduce_errors_approximate_rate() {
let mut rng = create_rng(Some(42));
let seq = vec![b'A'; 10000];
let result = introduce_errors(&seq, 0.1, &mut rng);
let error_count = result.iter().filter(|&&b| b != b'A').count();
let error_rate = error_count as f64 / 10000.0;
assert!(
(error_rate - 0.1).abs() < 0.02,
"Error rate {error_rate} not close to expected 0.1"
);
}
#[test]
fn test_pad_sequence_already_correct_length() {
let mut rng = create_rng(Some(42));
let seq = b"ACGTACGT".to_vec();
let result = pad_sequence(seq.clone(), 8, &mut rng);
assert_eq!(result, seq);
}
#[test]
fn test_pad_sequence_needs_padding() {
let mut rng = create_rng(Some(42));
let seq = b"ACGT".to_vec();
let result = pad_sequence(seq, 10, &mut rng);
assert_eq!(result.len(), 10);
assert_eq!(&result[..4], b"ACGT");
for &base in &result[4..] {
assert!(base == b'A' || base == b'C' || base == b'G' || base == b'T');
}
}
#[test]
fn test_pad_sequence_needs_truncation() {
let mut rng = create_rng(Some(42));
let seq = b"ACGTACGTACGT".to_vec();
let result = pad_sequence(seq, 5, &mut rng);
assert_eq!(result.len(), 5);
assert_eq!(result, b"ACGTA".to_vec());
}
#[test]
fn test_pad_sequence_empty_to_length() {
let mut rng = create_rng(Some(42));
let seq = Vec::new();
let result = pad_sequence(seq, 10, &mut rng);
assert_eq!(result.len(), 10);
for &base in &result {
assert!(base == b'A' || base == b'C' || base == b'G' || base == b'T');
}
}
#[test]
fn test_pad_sequence_to_zero() {
let mut rng = create_rng(Some(42));
let seq = b"ACGT".to_vec();
let result = pad_sequence(seq, 0, &mut rng);
assert!(result.is_empty());
}
#[test]
fn test_load_includelist_valid() {
let dir = tempfile::tempdir().expect("failed to create temp dir");
let path = dir.path().join("umis.txt");
std::fs::write(&path, "AACAC\nAAGGA\nAATGC\n").expect("failed to write file");
let umis = load_includelist(&path).expect("failed to load includelist");
assert_eq!(umis.len(), 3);
assert_eq!(umis[0], b"AACAC");
assert_eq!(umis[1], b"AAGGA");
assert_eq!(umis[2], b"AATGC");
}
#[test]
fn test_load_includelist_lowercase_uppercased() {
let dir = tempfile::tempdir().expect("failed to create temp dir");
let path = dir.path().join("umis.txt");
std::fs::write(&path, "aacac\naagga\n").expect("failed to write file");
let umis = load_includelist(&path).expect("failed to load includelist");
assert_eq!(umis[0], b"AACAC");
assert_eq!(umis[1], b"AAGGA");
}
#[test]
fn test_load_includelist_skips_blank_lines() {
let dir = tempfile::tempdir().expect("failed to create temp dir");
let path = dir.path().join("umis.txt");
std::fs::write(&path, "AACAC\n\nAAGGA\n \nAATGC\n").expect("failed to write file");
let umis = load_includelist(&path).expect("failed to load includelist");
assert_eq!(umis.len(), 3);
}
#[test]
fn test_load_includelist_rejects_invalid_bases() {
let dir = tempfile::tempdir().expect("failed to create temp dir");
let path = dir.path().join("umis.txt");
std::fs::write(&path, "AACAC\nAANGA\n").expect("failed to write file");
let result = load_includelist(&path);
assert!(result.is_err());
assert!(result.unwrap_err().to_string().contains("Invalid UMI"));
}
#[test]
fn test_load_includelist_rejects_mismatched_lengths() {
let dir = tempfile::tempdir().expect("failed to create temp dir");
let path = dir.path().join("umis.txt");
std::fs::write(&path, "AACAC\nAAGG\n").expect("failed to write file");
let result = load_includelist(&path);
assert!(result.is_err());
let msg = result.unwrap_err().to_string();
assert!(msg.contains("length mismatch"));
assert!(msg.contains("line 2"), "Expected file line number in error: {msg}");
}
#[test]
fn test_load_includelist_mismatched_length_reports_correct_line_with_blanks() {
let dir = tempfile::tempdir().expect("failed to create temp dir");
let path = dir.path().join("umis.txt");
std::fs::write(&path, "AACAC\n\nAAGGA\nAAGG\n").expect("failed to write file");
let result = load_includelist(&path);
assert!(result.is_err());
let msg = result.unwrap_err().to_string();
assert!(msg.contains("line 4"), "Expected file line 4 in error: {msg}");
}
#[test]
fn test_load_includelist_rejects_empty() {
let dir = tempfile::tempdir().expect("failed to create temp dir");
let path = dir.path().join("umis.txt");
std::fs::write(&path, "\n\n").expect("failed to write file");
let result = load_includelist(&path);
assert!(result.is_err());
assert!(result.unwrap_err().to_string().contains("empty"));
}
#[test]
fn test_execute_rejects_includelist_umi_exceeding_read_length() {
let dir = tempfile::tempdir().expect("failed to create temp dir");
let includelist_path = dir.path().join("umis.txt");
std::fs::write(&includelist_path, "AAAAACCCCC\n").expect("failed to write file");
let r1 = dir.path().join("r1.fq.gz");
let r2 = dir.path().join("r2.fq.gz");
let truth = dir.path().join("truth.tsv");
let cmd = FastqReads::try_parse_from([
"fastq-reads",
"-1",
r1.to_str().expect("path should be valid UTF-8"),
"-2",
r2.to_str().expect("path should be valid UTF-8"),
"--truth",
truth.to_str().expect("path should be valid UTF-8"),
"-i",
includelist_path.to_str().expect("path should be valid UTF-8"),
"--read-length",
"5", ])
.expect("valid CLI arguments");
let result = cmd.execute("");
assert!(result.is_err());
let msg = result.unwrap_err().to_string();
assert!(
msg.contains("exceeds read length"),
"Expected 'exceeds read length' in error: {msg}",
);
}
#[test]
fn test_execute_rejects_umi_length_exceeding_read_length() {
let dir = tempfile::tempdir().expect("failed to create temp dir");
let r1 = dir.path().join("r1.fq.gz");
let r2 = dir.path().join("r2.fq.gz");
let truth = dir.path().join("truth.tsv");
let cmd = FastqReads::try_parse_from([
"fastq-reads",
"-1",
r1.to_str().expect("path should be valid UTF-8"),
"-2",
r2.to_str().expect("path should be valid UTF-8"),
"--truth",
truth.to_str().expect("path should be valid UTF-8"),
"--umi-length",
"100",
"--read-length",
"50",
])
.expect("valid CLI arguments");
let result = cmd.execute("");
assert!(result.is_err());
let msg = result.unwrap_err().to_string();
assert!(
msg.contains("exceeds read length"),
"Expected 'exceeds read length' in error: {msg}",
);
}
#[test]
fn test_generate_molecule_reads_with_includelist() {
let umis = vec![b"AAAAA".to_vec(), b"CCCCC".to_vec(), b"GGGGG".to_vec()];
let params = GenerationParams {
umi_length: 5,
read_length: 50,
duplex: false,
min_family_size: 1,
r2_quality_offset: 0,
includelist: Some(umis.clone()),
};
let quality_model =
crate::simulate::PositionQualityModel::new(10, 25, 37, 100, 0.08, 2, 2.0);
let quality_bias = crate::simulate::ReadPairQualityBias::new(0);
let family_dist = crate::simulate::FamilySizeDistribution::log_normal(3.0, 1.0);
let insert_model = crate::simulate::InsertSizeModel::new(150.0, 30.0, 50, 500);
for seed in 0..20u64 {
let records = generate_molecule_reads(
0,
seed,
¶ms,
&quality_model,
&quality_bias,
&family_dist,
&insert_model,
None,
);
for record in &records {
let umi_str = String::from_utf8_lossy(&record.umi);
let parts: Vec<&str> = umi_str.split('-').collect();
assert_eq!(parts.len(), 2);
assert!(
umis.iter().any(|u| u == parts[0].as_bytes()),
"R1 UMI '{}' not in includelist",
parts[0],
);
assert!(
umis.iter().any(|u| u == parts[1].as_bytes()),
"R2 UMI '{}' not in includelist",
parts[1],
);
}
}
}
}