use crate::commands::command::Command;
use crate::commands::common::parse_bool;
use crate::commands::simulate::common::{
FamilySizeArgs, InsertSizeArgs, MethylationArgs, MethylationConfig, QualityArgs,
ReferenceGenome, SimulationCommon, apply_methylation_conversion, generate_random_sequence,
};
use crate::progress::ProgressTracker;
use crate::simulate::{FastqWriter, create_rng};
use anyhow::{Context, Result, bail};
use clap::Parser;
use crossbeam_channel::bounded;
use fgumi_dna::dna::complement_base;
use log::{info, warn};
use rand::{Rng, RngExt};
use rayon::prelude::*;
use std::fs::File;
use std::io::{BufRead, BufReader, BufWriter, Write};
use std::path::{Path, PathBuf};
use std::sync::Arc;
use std::thread;
#[derive(Parser, Debug)]
#[command(
name = "fastq-reads",
about = "Generate R1/R2 FASTQ.gz files for extract",
long_about = r#"
Generate synthetic paired-end FASTQ files with UMI sequences.
The output is suitable for input to `fgumi extract`. Each read pair contains:
- R1: UMI sequence + template (according to --read-structure-r1)
- R2: Template only (according to --read-structure-r2)
Quality scores follow a realistic model with warmup, peak, and decay phases.
"#
)]
pub struct FastqReads {
#[arg(short = '1', long = "r1", required = true)]
pub r1_output: PathBuf,
#[arg(short = '2', long = "r2", required = true)]
pub r2_output: PathBuf,
#[arg(long = "truth", required = true)]
pub truth_output: PathBuf,
#[arg(long = "read-structure-r1", default_value = "8M+T")]
pub read_structure_r1: String,
#[arg(long = "read-structure-r2", default_value = "+T")]
pub read_structure_r2: String,
#[arg(long = "duplex", default_value = "false", num_args = 0..=1, default_missing_value = "true", action = clap::ArgAction::Set, value_parser = parse_bool)]
pub duplex: bool,
#[arg(short = 'r', long = "reference", required = true)]
pub reference: PathBuf,
#[arg(short = 'i', long = "includelist")]
pub includelist: Option<PathBuf>,
#[arg(short = 't', long = "threads", default_value = "1")]
pub threads: usize,
#[command(flatten)]
pub common: SimulationCommon,
#[command(flatten)]
pub quality: QualityArgs,
#[command(flatten)]
pub family_size: FamilySizeArgs,
#[command(flatten)]
pub insert_size: InsertSizeArgs,
#[command(flatten)]
pub methylation: MethylationArgs,
}
struct ReadRecord {
read_name: String,
r1_seq: Vec<u8>,
r1_quals: Vec<u8>,
r2_seq: Vec<u8>,
r2_quals: Vec<u8>,
umi: Vec<u8>,
mol_id: usize,
family_id: usize,
strand: &'static str,
chrom: String,
pos: usize,
}
#[derive(Clone)]
struct GenerationParams {
umi_length: usize,
read_length: usize,
min_family_size: usize,
r2_quality_offset: i8,
duplex: bool,
includelist: Option<Vec<Vec<u8>>>,
methylation: MethylationConfig,
}
fn load_includelist(path: &Path) -> Result<Vec<Vec<u8>>> {
let file = File::open(path)
.with_context(|| format!("Failed to open includelist: {}", path.display()))?;
let reader = BufReader::new(file);
let mut umis: Vec<(usize, Vec<u8>)> = Vec::new();
for (line_num, line) in reader.lines().enumerate() {
let line = line.with_context(|| format!("Failed to read line {}", line_num + 1))?;
let trimmed = line.trim();
if trimmed.is_empty() {
continue;
}
let umi = trimmed.as_bytes().to_ascii_uppercase();
if !umi.iter().all(|&b| matches!(b, b'A' | b'C' | b'G' | b'T')) {
bail!("Invalid UMI at line {}: '{}' (only A, C, G, T allowed)", line_num + 1, trimmed,);
}
umis.push((line_num + 1, umi));
}
if umis.is_empty() {
bail!("Includelist is empty: {}", path.display());
}
let expected_len = umis[0].1.len();
for (file_line, umi) in &umis {
if umi.len() != expected_len {
bail!(
"UMI length mismatch at line {}: length {} but expected {} (from first UMI)",
file_line,
umi.len(),
expected_len,
);
}
}
Ok(umis.into_iter().map(|(_, umi)| umi).collect())
}
const CHANNEL_CAPACITY: usize = 1_000;
impl Command for FastqReads {
fn execute(&self, _command_line: &str) -> Result<()> {
let includelist = if let Some(ref path) = self.includelist {
let umis = load_includelist(path)?;
info!("Loaded {} UMIs from includelist (length={})", umis.len(), umis[0].len());
Some(umis)
} else {
None
};
let umi_length = if let Some(ref list) = includelist {
let list_len = list[0].len();
if list_len > self.common.read_length {
bail!(
"Includelist UMI length {} exceeds read length {}",
list_len,
self.common.read_length,
);
}
if self.common.umi_length != list_len {
warn!(
"--umi-length {} overridden by includelist UMI length {}",
self.common.umi_length, list_len,
);
}
list_len
} else {
if self.common.umi_length > self.common.read_length {
bail!(
"UMI length {} exceeds read length {}",
self.common.umi_length,
self.common.read_length,
);
}
self.common.umi_length
};
let methylation = self.methylation.resolve();
self.methylation.validate()?;
info!("Generating FASTQ reads");
info!(" Output R1: {}", self.r1_output.display());
info!(" Output R2: {}", self.r2_output.display());
info!(" Truth: {}", self.truth_output.display());
info!(" Num molecules: {}", self.common.num_molecules);
info!(" Read length: {}", self.common.read_length);
info!(" UMI length: {}", umi_length);
if let Some(ref path) = self.includelist {
info!(" Includelist: {}", path.display());
}
info!(" Duplex: {}", self.duplex);
info!(" Threads: {}", self.threads);
info!(" Reference: {}", self.reference.display());
if methylation.mode.is_enabled() {
info!(" Methylation mode: {:?}", methylation.mode);
info!(" CpG methylation rate: {}", methylation.cpg_methylation_rate);
info!(" Conversion rate: {}", methylation.conversion_rate);
}
let reference = Arc::new(ReferenceGenome::load(&self.reference)?);
if reference.max_contig_length() < self.common.read_length {
bail!(
"No reference contig is >= read length ({} bp). \
The longest contig is {} bp. Use a larger reference or shorter --read-length.",
self.common.read_length,
reference.max_contig_length(),
);
}
let gen_threads = if self.threads <= 1 { 1 } else { self.threads.max(2) };
let compress_threads = self.threads;
let quality_model = Arc::new(self.quality.to_quality_model());
let quality_bias = Arc::new(self.quality.to_quality_bias());
let family_dist = Arc::new(self.family_size.to_family_size_distribution()?);
let insert_model = Arc::new(self.insert_size.to_insert_size_model());
let params = Arc::new(GenerationParams {
umi_length,
read_length: self.common.read_length,
min_family_size: self.family_size.min_family_size,
r2_quality_offset: self.quality.r2_quality_offset,
duplex: self.duplex,
includelist,
methylation,
});
let mut seed_rng = create_rng(self.common.seed);
let mol_seeds: Vec<u64> =
(0..self.common.num_molecules).map(|_| seed_rng.random()).collect();
let (sender, receiver) = bounded::<Vec<ReadRecord>>(CHANNEL_CAPACITY);
let r1_path = self.r1_output.clone();
let r2_path = self.r2_output.clone();
let truth_path = self.truth_output.clone();
let writer_handle = thread::spawn(move || -> Result<u64> {
let mut r1_writer = FastqWriter::with_threads(&r1_path, compress_threads)?;
let mut r2_writer = FastqWriter::with_threads(&r2_path, compress_threads)?;
let truth_file = File::create(&truth_path)
.with_context(|| format!("Failed to create {}", truth_path.display()))?;
let mut truth_writer = BufWriter::new(truth_file);
writeln!(
truth_writer,
"read_name\ttrue_umi\tmolecule_id\tfamily_id\tstrand\tchrom\tpos"
)?;
let mut read_count = 0u64;
let progress = ProgressTracker::new("Generated read pairs").with_interval(100_000);
for batch in receiver {
for record in batch {
read_count += 1;
progress.log_if_needed(1);
r1_writer.write_record(&record.read_name, &record.r1_seq, &record.r1_quals)?;
r2_writer.write_record(&record.read_name, &record.r2_seq, &record.r2_quals)?;
writeln!(
truth_writer,
"{}\t{}\t{}\t{}\t{}\t{}\t{}",
record.read_name,
String::from_utf8_lossy(&record.umi),
record.mol_id,
record.family_id,
record.strand,
record.chrom,
record.pos
)?;
}
}
progress.log_final();
r1_writer.finish()?;
r2_writer.finish()?;
truth_writer.flush()?;
Ok(read_count)
});
let pool = rayon::ThreadPoolBuilder::new()
.num_threads(gen_threads)
.build()
.with_context(|| "Failed to create thread pool")?;
let generation_result: Result<(), crossbeam_channel::SendError<Vec<ReadRecord>>> = pool
.install(|| {
mol_seeds.into_par_iter().enumerate().try_for_each(|(mol_id, seed)| {
let batch = generate_molecule_reads(
mol_id,
seed,
¶ms,
&quality_model,
&quality_bias,
&family_dist,
&insert_model,
&reference,
);
sender.send(batch)
})
});
drop(sender);
if let Err(e) = generation_result {
return Err(anyhow::anyhow!("Failed to send record to writer: {e}"));
}
let read_count =
writer_handle.join().map_err(|_| anyhow::anyhow!("Writer thread panicked"))??;
info!("Generated {read_count} read pairs");
info!("Done");
Ok(())
}
}
#[allow(clippy::too_many_arguments)]
fn generate_molecule_reads(
mol_id: usize,
seed: u64,
params: &GenerationParams,
quality_model: &crate::simulate::PositionQualityModel,
quality_bias: &crate::simulate::ReadPairQualityBias,
family_dist: &crate::simulate::FamilySizeDistribution,
insert_model: &crate::simulate::InsertSizeModel,
reference: &ReferenceGenome,
) -> Vec<ReadRecord> {
let mut rng = create_rng(Some(seed));
let mut records = Vec::new();
let (umi_r1, umi_r2) = if let Some(ref includelist) = params.includelist {
let idx_r1 = rng.random_range(0..includelist.len());
let idx_r2 = rng.random_range(0..includelist.len());
(includelist[idx_r1].clone(), includelist[idx_r2].clone())
} else {
(
generate_random_sequence(params.umi_length, &mut rng),
generate_random_sequence(params.umi_length, &mut rng),
)
};
let family_size = family_dist.sample(&mut rng, params.min_family_size);
let insert_size = insert_model.sample(&mut rng);
let template_len = insert_size.saturating_sub(params.umi_length);
let (chrom_idx, pos, template) = reference
.sample_sequence(template_len, &mut rng)
.expect("Failed to sample sequence from reference");
let chrom = reference.name(chrom_idx).to_string();
let template_len = params.read_length.saturating_sub(params.umi_length);
let r2_start = template.len().saturating_sub(template_len);
let r2_end = template.len();
let mut r1_seq_buf = Vec::with_capacity(params.read_length);
let mut r2_seq_buf = Vec::with_capacity(params.read_length);
let mut read_counter = 0usize;
for read_idx in 0..family_size {
let (strand, family_idx): (&'static str, usize) = if params.duplex {
if rng.random_bool(0.5) { ("A", 0) } else { ("B", 1) }
} else {
("A", 0)
};
let (r1_umi, r2_umi) = if strand == "B" {
(&umi_r2, &umi_r1) } else {
(&umi_r1, &umi_r2) };
let combined_umi: Vec<u8> = {
let mut umi = r1_umi.clone();
umi.push(b'-');
umi.extend_from_slice(r2_umi);
umi
};
read_counter += 1;
let read_name = format!("mol{mol_id:06}_read{read_counter:04}");
r1_seq_buf.clear();
r2_seq_buf.clear();
if strand == "A" {
r1_seq_buf.extend_from_slice(r1_umi);
let r1_template_end = template_len.min(template.len());
let mut r1_template = template[..r1_template_end].to_vec();
apply_methylation_conversion(
&mut r1_template,
&template,
0,
true, ¶ms.methylation,
&mut rng,
);
r1_seq_buf.extend_from_slice(&r1_template);
r2_seq_buf.extend_from_slice(r2_umi);
let mut r2_template = template[r2_start..r2_end].to_vec();
apply_methylation_conversion(
&mut r2_template,
&template,
r2_start,
false, ¶ms.methylation,
&mut rng,
);
reverse_complement_into(&r2_template, &mut r2_seq_buf);
} else {
r1_seq_buf.extend_from_slice(r1_umi);
let mut r1_template = template[r2_start..r2_end].to_vec();
apply_methylation_conversion(
&mut r1_template,
&template,
r2_start,
false, ¶ms.methylation,
&mut rng,
);
reverse_complement_into(&r1_template, &mut r1_seq_buf);
r2_seq_buf.extend_from_slice(r2_umi);
let r1_template_end = template_len.min(template.len());
let mut r2_template = template[..r1_template_end].to_vec();
apply_methylation_conversion(
&mut r2_template,
&template,
0,
true, ¶ms.methylation,
&mut rng,
);
r2_seq_buf.extend_from_slice(&r2_template);
}
introduce_errors_inplace(&mut r1_seq_buf[..params.umi_length], 0.01, &mut rng);
pad_sequence_inplace(&mut r1_seq_buf, params.read_length, &mut rng);
introduce_errors_inplace(&mut r2_seq_buf[..params.umi_length], 0.01, &mut rng);
pad_sequence_inplace(&mut r2_seq_buf, params.read_length, &mut rng);
let r1_quals = quality_model.generate_qualities(r1_seq_buf.len(), &mut rng);
let r2_quals_raw = quality_model.generate_qualities(r2_seq_buf.len(), &mut rng);
let r2_quals = if params.r2_quality_offset != 0 {
quality_bias.apply_to_vec(&r2_quals_raw, true)
} else {
r2_quals_raw
};
records.push(ReadRecord {
read_name,
r1_seq: r1_seq_buf.clone(),
r1_quals,
r2_seq: r2_seq_buf.clone(),
r2_quals,
umi: combined_umi.clone(),
mol_id,
family_id: family_idx * 1000 + read_idx,
strand,
chrom: chrom.clone(),
pos,
});
}
records
}
fn reverse_complement_into(seq: &[u8], output: &mut Vec<u8>) {
output.reserve(seq.len());
for &b in seq.iter().rev() {
output.push(complement_base(b));
}
}
fn introduce_errors_inplace(seq: &mut [u8], error_rate: f64, rng: &mut impl Rng) {
const ALTERNATIVES: [&[u8; 3]; 256] = {
let mut table: [&[u8; 3]; 256] = [b"ACG"; 256]; table[b'A' as usize] = b"CGT";
table[b'C' as usize] = b"AGT";
table[b'G' as usize] = b"ACT";
table[b'T' as usize] = b"ACG";
table
};
for base in seq.iter_mut() {
if rng.random::<f64>() < error_rate {
let alts = ALTERNATIVES[*base as usize];
*base = alts[rng.random_range(0..3)];
}
}
}
fn pad_sequence_inplace(seq: &mut Vec<u8>, target_len: usize, rng: &mut impl Rng) {
const BASES: &[u8] = b"ACGT";
seq.reserve(target_len.saturating_sub(seq.len()));
while seq.len() < target_len {
seq.push(BASES[rng.random_range(0..4)]);
}
seq.truncate(target_len);
}
#[cfg(test)]
#[allow(clippy::naive_bytecount)]
mod tests {
use super::*;
use crate::simulate::create_rng;
fn reverse_complement(seq: &[u8]) -> Vec<u8> {
let mut result = Vec::with_capacity(seq.len());
reverse_complement_into(seq, &mut result);
result
}
fn introduce_errors(seq: &[u8], error_rate: f64, rng: &mut impl Rng) -> Vec<u8> {
let mut result = seq.to_vec();
introduce_errors_inplace(&mut result, error_rate, rng);
result
}
fn pad_sequence(mut seq: Vec<u8>, target_len: usize, rng: &mut impl Rng) -> Vec<u8> {
pad_sequence_inplace(&mut seq, target_len, rng);
seq
}
#[test]
fn test_generate_random_sequence_length() {
let mut rng = create_rng(Some(42));
for len in [0, 1, 10, 100, 500] {
let seq = generate_random_sequence(len, &mut rng);
assert_eq!(seq.len(), len);
}
}
#[test]
fn test_generate_random_sequence_valid_bases() {
let mut rng = create_rng(Some(42));
let seq = generate_random_sequence(1000, &mut rng);
for &base in &seq {
assert!(
base == b'A' || base == b'C' || base == b'G' || base == b'T',
"Invalid base: {base}"
);
}
}
#[test]
fn test_generate_random_sequence_distribution() {
let mut rng = create_rng(Some(42));
let seq = generate_random_sequence(10000, &mut rng);
let count_a = seq.iter().filter(|&&b| b == b'A').count();
let count_c = seq.iter().filter(|&&b| b == b'C').count();
let count_g = seq.iter().filter(|&&b| b == b'G').count();
let count_t = seq.iter().filter(|&&b| b == b'T').count();
for (name, count) in [("A", count_a), ("C", count_c), ("G", count_g), ("T", count_t)] {
let frac = count as f64 / 10000.0;
assert!((0.20..0.30).contains(&frac), "{name} fraction {frac} out of expected range");
}
}
#[test]
fn test_generate_random_sequence_reproducible() {
let mut rng1 = create_rng(Some(42));
let mut rng2 = create_rng(Some(42));
let seq1 = generate_random_sequence(100, &mut rng1);
let seq2 = generate_random_sequence(100, &mut rng2);
assert_eq!(seq1, seq2);
}
#[test]
fn test_reverse_complement_basic() {
assert_eq!(reverse_complement(b"A"), b"T");
assert_eq!(reverse_complement(b"T"), b"A");
assert_eq!(reverse_complement(b"C"), b"G");
assert_eq!(reverse_complement(b"G"), b"C");
}
#[test]
fn test_reverse_complement_sequence() {
assert_eq!(reverse_complement(b"ACGT"), b"ACGT");
assert_eq!(reverse_complement(b"AAAT"), b"ATTT");
assert_eq!(reverse_complement(b"GGGC"), b"GCCC");
}
#[test]
fn test_reverse_complement_empty() {
assert_eq!(reverse_complement(b""), Vec::<u8>::new());
}
#[test]
fn test_reverse_complement_single() {
for (base, expected) in [(b'A', b'T'), (b'T', b'A'), (b'C', b'G'), (b'G', b'C')] {
assert_eq!(reverse_complement(&[base]), vec![expected]);
}
}
#[test]
fn test_reverse_complement_unknown_bases() {
assert_eq!(reverse_complement(b"N"), b"N");
assert_eq!(reverse_complement(b"X"), b"X");
assert_eq!(reverse_complement(b"ANT"), b"ANT"); }
#[test]
fn test_reverse_complement_double() {
let mut rng = create_rng(Some(42));
let seq = generate_random_sequence(100, &mut rng);
let rc = reverse_complement(&seq);
let rc_rc = reverse_complement(&rc);
assert_eq!(seq, rc_rc);
}
#[test]
fn test_introduce_errors_zero_rate() {
let mut rng = create_rng(Some(42));
let seq = b"ACGTACGTACGT";
let result = introduce_errors(seq, 0.0, &mut rng);
assert_eq!(result, seq.to_vec());
}
#[test]
fn test_introduce_errors_full_rate() {
let mut rng = create_rng(Some(42));
let seq = b"AAAAAAAAAA";
let result = introduce_errors(seq, 1.0, &mut rng);
for &base in &result {
assert_ne!(base, b'A', "Expected all bases to be mutated");
}
}
#[test]
fn test_introduce_errors_valid_bases() {
let mut rng = create_rng(Some(42));
let seq = generate_random_sequence(1000, &mut rng);
let result = introduce_errors(&seq, 0.5, &mut rng);
for &base in &result {
assert!(
base == b'A' || base == b'C' || base == b'G' || base == b'T',
"Invalid base after mutation: {base}"
);
}
}
#[test]
fn test_introduce_errors_length_preserved() {
let mut rng = create_rng(Some(42));
let seq = generate_random_sequence(100, &mut rng);
let result = introduce_errors(&seq, 0.1, &mut rng);
assert_eq!(result.len(), seq.len());
}
#[test]
fn test_introduce_errors_approximate_rate() {
let mut rng = create_rng(Some(42));
let seq = vec![b'A'; 10000];
let result = introduce_errors(&seq, 0.1, &mut rng);
let error_count = result.iter().filter(|&&b| b != b'A').count();
let error_rate = error_count as f64 / 10000.0;
assert!(
(error_rate - 0.1).abs() < 0.02,
"Error rate {error_rate} not close to expected 0.1"
);
}
#[test]
fn test_pad_sequence_already_correct_length() {
let mut rng = create_rng(Some(42));
let seq = b"ACGTACGT".to_vec();
let result = pad_sequence(seq.clone(), 8, &mut rng);
assert_eq!(result, seq);
}
#[test]
fn test_pad_sequence_needs_padding() {
let mut rng = create_rng(Some(42));
let seq = b"ACGT".to_vec();
let result = pad_sequence(seq, 10, &mut rng);
assert_eq!(result.len(), 10);
assert_eq!(&result[..4], b"ACGT");
for &base in &result[4..] {
assert!(base == b'A' || base == b'C' || base == b'G' || base == b'T');
}
}
#[test]
fn test_pad_sequence_needs_truncation() {
let mut rng = create_rng(Some(42));
let seq = b"ACGTACGTACGT".to_vec();
let result = pad_sequence(seq, 5, &mut rng);
assert_eq!(result.len(), 5);
assert_eq!(result, b"ACGTA".to_vec());
}
#[test]
fn test_pad_sequence_empty_to_length() {
let mut rng = create_rng(Some(42));
let seq = Vec::new();
let result = pad_sequence(seq, 10, &mut rng);
assert_eq!(result.len(), 10);
for &base in &result {
assert!(base == b'A' || base == b'C' || base == b'G' || base == b'T');
}
}
#[test]
fn test_pad_sequence_to_zero() {
let mut rng = create_rng(Some(42));
let seq = b"ACGT".to_vec();
let result = pad_sequence(seq, 0, &mut rng);
assert!(result.is_empty());
}
#[test]
fn test_load_includelist_valid() {
let dir = tempfile::tempdir().expect("failed to create temp dir");
let path = dir.path().join("umis.txt");
std::fs::write(&path, "AACAC\nAAGGA\nAATGC\n").expect("failed to write file");
let umis = load_includelist(&path).expect("failed to load includelist");
assert_eq!(umis.len(), 3);
assert_eq!(umis[0], b"AACAC");
assert_eq!(umis[1], b"AAGGA");
assert_eq!(umis[2], b"AATGC");
}
#[test]
fn test_load_includelist_lowercase_uppercased() {
let dir = tempfile::tempdir().expect("failed to create temp dir");
let path = dir.path().join("umis.txt");
std::fs::write(&path, "aacac\naagga\n").expect("failed to write file");
let umis = load_includelist(&path).expect("failed to load includelist");
assert_eq!(umis[0], b"AACAC");
assert_eq!(umis[1], b"AAGGA");
}
#[test]
fn test_load_includelist_skips_blank_lines() {
let dir = tempfile::tempdir().expect("failed to create temp dir");
let path = dir.path().join("umis.txt");
std::fs::write(&path, "AACAC\n\nAAGGA\n \nAATGC\n").expect("failed to write file");
let umis = load_includelist(&path).expect("failed to load includelist");
assert_eq!(umis.len(), 3);
}
#[test]
fn test_load_includelist_rejects_invalid_bases() {
let dir = tempfile::tempdir().expect("failed to create temp dir");
let path = dir.path().join("umis.txt");
std::fs::write(&path, "AACAC\nAANGA\n").expect("failed to write file");
let result = load_includelist(&path);
assert!(result.is_err());
assert!(result.unwrap_err().to_string().contains("Invalid UMI"));
}
#[test]
fn test_load_includelist_rejects_mismatched_lengths() {
let dir = tempfile::tempdir().expect("failed to create temp dir");
let path = dir.path().join("umis.txt");
std::fs::write(&path, "AACAC\nAAGG\n").expect("failed to write file");
let result = load_includelist(&path);
assert!(result.is_err());
let msg = result.unwrap_err().to_string();
assert!(msg.contains("length mismatch"));
assert!(msg.contains("line 2"), "Expected file line number in error: {msg}");
}
#[test]
fn test_load_includelist_mismatched_length_reports_correct_line_with_blanks() {
let dir = tempfile::tempdir().expect("failed to create temp dir");
let path = dir.path().join("umis.txt");
std::fs::write(&path, "AACAC\n\nAAGGA\nAAGG\n").expect("failed to write file");
let result = load_includelist(&path);
assert!(result.is_err());
let msg = result.unwrap_err().to_string();
assert!(msg.contains("line 4"), "Expected file line 4 in error: {msg}");
}
#[test]
fn test_load_includelist_rejects_empty() {
let dir = tempfile::tempdir().expect("failed to create temp dir");
let path = dir.path().join("umis.txt");
std::fs::write(&path, "\n\n").expect("failed to write file");
let result = load_includelist(&path);
assert!(result.is_err());
assert!(result.unwrap_err().to_string().contains("empty"));
}
#[test]
fn test_execute_rejects_includelist_umi_exceeding_read_length() {
let dir = tempfile::tempdir().expect("failed to create temp dir");
let includelist_path = dir.path().join("umis.txt");
std::fs::write(&includelist_path, "AAAAACCCCC\n").expect("failed to write file");
let r1 = dir.path().join("r1.fq.gz");
let r2 = dir.path().join("r2.fq.gz");
let truth = dir.path().join("truth.tsv");
let cmd = FastqReads::try_parse_from([
"fastq-reads",
"-1",
r1.to_str().expect("path should be valid UTF-8"),
"-2",
r2.to_str().expect("path should be valid UTF-8"),
"--truth",
truth.to_str().expect("path should be valid UTF-8"),
"-r",
"dummy.fa",
"-i",
includelist_path.to_str().expect("path should be valid UTF-8"),
"--read-length",
"5", ])
.expect("valid CLI arguments");
let result = cmd.execute("");
assert!(result.is_err());
let msg = result.unwrap_err().to_string();
assert!(
msg.contains("exceeds read length"),
"Expected 'exceeds read length' in error: {msg}",
);
}
#[test]
fn test_execute_rejects_umi_length_exceeding_read_length() {
let dir = tempfile::tempdir().expect("failed to create temp dir");
let r1 = dir.path().join("r1.fq.gz");
let r2 = dir.path().join("r2.fq.gz");
let truth = dir.path().join("truth.tsv");
let cmd = FastqReads::try_parse_from([
"fastq-reads",
"-1",
r1.to_str().expect("path should be valid UTF-8"),
"-2",
r2.to_str().expect("path should be valid UTF-8"),
"--truth",
truth.to_str().expect("path should be valid UTF-8"),
"-r",
"dummy.fa",
"--umi-length",
"100",
"--read-length",
"50",
])
.expect("valid CLI arguments");
let result = cmd.execute("");
assert!(result.is_err());
let msg = result.unwrap_err().to_string();
assert!(
msg.contains("exceeds read length"),
"Expected 'exceeds read length' in error: {msg}",
);
}
#[test]
fn test_generate_molecule_reads_with_includelist() {
use std::io::Write as IoWrite;
use tempfile::NamedTempFile;
let mut fasta = NamedTempFile::new().unwrap();
writeln!(fasta, ">chr1").unwrap();
fasta.write_all(&b"ACGT".repeat(500)).unwrap();
writeln!(fasta).unwrap();
fasta.flush().unwrap();
let ref_genome = ReferenceGenome::load(fasta.path()).unwrap();
let umis = vec![b"AAAAA".to_vec(), b"CCCCC".to_vec(), b"GGGGG".to_vec()];
let params = GenerationParams {
umi_length: 5,
read_length: 50,
min_family_size: 1,
r2_quality_offset: 0,
duplex: false,
includelist: Some(umis.clone()),
methylation: MethylationConfig {
mode: fgumi_consensus::MethylationMode::Disabled,
cpg_methylation_rate: 0.75,
conversion_rate: 0.98,
},
};
let quality_model =
crate::simulate::PositionQualityModel::new(10, 25, 37, 100, 0.08, 2, 2.0);
let quality_bias = crate::simulate::ReadPairQualityBias::new(0);
let family_dist = crate::simulate::FamilySizeDistribution::log_normal(3.0, 1.0);
let insert_model = crate::simulate::InsertSizeModel::new(150.0, 30.0, 50, 500);
for seed in 0..20u64 {
let records = generate_molecule_reads(
0,
seed,
¶ms,
&quality_model,
&quality_bias,
&family_dist,
&insert_model,
&ref_genome,
);
for record in &records {
let umi_str = String::from_utf8_lossy(&record.umi);
let parts: Vec<&str> = umi_str.split('-').collect();
assert_eq!(parts.len(), 2);
assert!(
umis.iter().any(|u| u == parts[0].as_bytes()),
"R1 UMI '{}' not in includelist",
parts[0],
);
assert!(
umis.iter().any(|u| u == parts[1].as_bytes()),
"R2 UMI '{}' not in includelist",
parts[1],
);
}
}
}
#[test]
fn test_emseq_fastq_reads_converts_non_cpg_c() {
use crate::commands::simulate::common::apply_methylation_conversion;
let template = b"CACACACACACACACACACAC".to_vec();
let mut r1 = template.clone();
let mut rng = create_rng(Some(42));
let config = MethylationConfig {
mode: fgumi_consensus::MethylationMode::EmSeq,
cpg_methylation_rate: 0.75,
conversion_rate: 1.0,
};
apply_methylation_conversion(&mut r1, &template, 0, true, &config, &mut rng);
for (i, &b) in r1.iter().enumerate() {
if template[i] == b'C' {
assert_eq!(b, b'T', "position {i}: non-CpG C should be converted to T in EM-Seq");
} else {
assert_eq!(b, template[i], "position {i}: A should remain unchanged");
}
}
}
#[test]
fn test_taps_fastq_reads_preserves_non_cpg_c() {
use crate::commands::simulate::common::apply_methylation_conversion;
let template = b"CACACACACACACACACACAC".to_vec();
let mut r1 = template.clone();
let mut rng = create_rng(Some(42));
let config = MethylationConfig {
mode: fgumi_consensus::MethylationMode::Taps,
cpg_methylation_rate: 0.75,
conversion_rate: 1.0,
};
apply_methylation_conversion(&mut r1, &template, 0, true, &config, &mut rng);
assert_eq!(r1, template);
}
#[test]
fn test_reads_in_same_family_differ_with_methylation() {
use crate::commands::simulate::common::apply_methylation_conversion;
let template = b"ACGTCGATCGACGTCGATCG".to_vec();
let mut different_count = 0;
let config = MethylationConfig {
mode: fgumi_consensus::MethylationMode::EmSeq,
cpg_methylation_rate: 0.5,
conversion_rate: 0.5,
};
for seed in 0..50u64 {
let mut r1_a = template.clone();
let mut r1_b = template.clone();
let mut rng_a = create_rng(Some(seed * 2));
let mut rng_b = create_rng(Some(seed * 2 + 1));
apply_methylation_conversion(&mut r1_a, &template, 0, true, &config, &mut rng_a);
apply_methylation_conversion(&mut r1_b, &template, 0, true, &config, &mut rng_b);
if r1_a != r1_b {
different_count += 1;
}
}
assert!(different_count > 10, "Expected most read pairs to differ, got {different_count}");
}
}