use super::sort::TemplateCoordKey;
use crate::bam_io::create_raw_bam_writer;
use crate::commands::command::Command;
use crate::commands::common::CompressionOptions;
use crate::commands::simulate::common::{
FamilySizeArgs, InsertSizeArgs, MethylationArgs, MethylationConfig, MoleculeInfo,
PositionDistArgs, QualityArgs, ReferenceArgs, ReferenceGenome, SimulationCommon,
apply_methylation_conversion, generate_random_sequence, pad_sequence, validate_rate,
};
use crate::commands::simulate::region_to_bin;
use crate::dna::reverse_complement;
use crate::progress::ProgressTracker;
use crate::simulate::{
FamilySizeDistribution, InsertSizeModel, PositionQualityModel, ReadPairQualityBias, create_rng,
};
use anyhow::{Context, Result};
use clap::Parser;
use fgumi_raw_bam::{RawRecord, RawRecordView, SamBuilder, flags as raw_flags};
use log::info;
use noodles::sam::header::Header;
use noodles::sam::header::record::value::map::header::{self as HeaderRecord, Tag as HeaderTag};
use rand::{Rng, RngExt};
use std::fs::File;
use std::io::{BufWriter, Write};
use std::path::PathBuf;
#[derive(Parser, Debug)]
#[command(
name = "mapped-reads",
about = "Generate template-coord sorted BAM for group",
long_about = r#"
Generate synthetic paired-end BAM files with proper alignments.
The output is template-coordinate sorted and suitable for input to `fgumi group`.
Reads contain RX tags with UMI sequences.
"#
)]
pub struct MappedReads {
#[arg(short = 'o', long = "output", required = true)]
pub output: PathBuf,
#[arg(long = "truth", required = true)]
pub truth_output: PathBuf,
#[arg(long = "mapq", default_value = "60")]
pub mapq: u8,
#[arg(long = "unmapped-fraction", default_value = "0.0")]
pub unmapped_fraction: f64,
#[arg(short = 't', long = "threads", default_value = "1")]
pub threads: usize,
#[command(flatten)]
pub compression: CompressionOptions,
#[command(flatten)]
pub common: SimulationCommon,
#[command(flatten)]
pub quality: QualityArgs,
#[command(flatten)]
pub family_size: FamilySizeArgs,
#[command(flatten)]
pub insert_size: InsertSizeArgs,
#[command(flatten)]
pub reference: ReferenceArgs,
#[command(flatten)]
pub position_dist: PositionDistArgs,
#[command(flatten)]
pub methylation: MethylationArgs,
}
struct GenerationParams {
read_length: usize,
umi_length: usize,
mapq: u8,
min_family_size: usize,
quality_model: PositionQualityModel,
quality_bias: ReadPairQualityBias,
family_dist: FamilySizeDistribution,
insert_model: InsertSizeModel,
methylation: MethylationConfig,
}
impl Command for MappedReads {
fn execute(&self, command_line: &str) -> Result<()> {
let methylation = self.methylation.resolve();
self.methylation.validate()?;
validate_rate(self.unmapped_fraction, "unmapped-fraction")?;
info!("Generating mapped reads");
info!(" Output: {}", self.output.display());
info!(" Truth: {}", self.truth_output.display());
info!(" Num molecules: {}", self.common.num_molecules);
info!(" Read length: {}", self.common.read_length);
info!(" UMI length: {}", self.common.umi_length);
info!(" Threads: {}", self.threads);
if methylation.mode.is_enabled() {
info!(" Methylation mode: {:?}", methylation.mode);
info!(" CpG methylation rate: {}", methylation.cpg_methylation_rate);
info!(" Conversion rate: {}", methylation.conversion_rate);
}
let ref_genome = ReferenceGenome::load(&self.reference.reference)?;
if ref_genome.max_contig_length() < self.common.read_length {
anyhow::bail!(
"No reference contig is >= read length ({} bp). \
The longest contig is {} bp. Use a larger reference or shorter --read-length.",
self.common.read_length,
ref_genome.max_contig_length(),
);
}
let ref_header = ref_genome.build_bam_header();
let mut header_builder = Header::builder();
let HeaderTag::Other(so_tag) = HeaderTag::from([b'S', b'O']) else { unreachable!() };
let HeaderTag::Other(go_tag) = HeaderTag::from([b'G', b'O']) else { unreachable!() };
let HeaderTag::Other(ss_tag) = HeaderTag::from([b'S', b'S']) else { unreachable!() };
let header_map =
noodles::sam::header::record::value::Map::<HeaderRecord::Header>::builder()
.insert(so_tag, "unsorted")
.insert(go_tag, "query")
.insert(ss_tag, "template-coordinate")
.build()
.expect("header map with valid SO/GO/SS tags");
header_builder = header_builder.set_header(header_map);
for (name, map) in ref_header.reference_sequences() {
header_builder = header_builder.add_reference_sequence(name.clone(), map.clone());
}
header_builder = crate::commands::common::add_pg_to_builder(header_builder, command_line)?;
let header = header_builder.build();
let params = GenerationParams {
read_length: self.common.read_length,
umi_length: self.common.umi_length,
mapq: self.mapq,
min_family_size: self.family_size.min_family_size,
quality_model: self.quality.to_quality_model(),
quality_bias: self.quality.to_quality_bias(),
family_dist: self.family_size.to_family_size_distribution()?,
insert_model: self.insert_size.to_insert_size_model(),
methylation,
};
let num_positions = self.position_dist.num_positions.unwrap_or(self.common.num_molecules);
if num_positions == 0 {
anyhow::bail!("--num-positions must be greater than 0");
}
let usable_bases = ref_genome.total_length().saturating_sub(1000);
let bases_per_position = usable_bases as f64 / num_positions as f64;
if bases_per_position < 1.0 {
anyhow::bail!(
"Too many positions ({num_positions}) for reference of size {} bp. \
Reduce --num-positions or use a larger reference.",
ref_genome.total_length()
);
} else if bases_per_position < 10.0 {
log::warn!(
"Low position spacing ({bases_per_position:.1} bp/position) may cause UMI collisions."
);
}
let mut pos_rng = create_rng(self.common.seed);
let position_table = ref_genome.sample_positions(num_positions, &mut pos_rng);
let mut seed_rng = create_rng(self.common.seed.map(|s| s.wrapping_add(1)));
let mut unmapped_rng = create_rng(self.common.seed.map(|s| s.wrapping_add(2)));
info!("Computing molecule positions and sort keys...");
let mut molecules: Vec<MoleculeInfo> = (0..self.common.num_molecules)
.map(|mol_id| {
let seed: u64 = seed_rng.random();
let is_unmapped = unmapped_rng.random::<f64>() < self.unmapped_fraction;
let sort_key = if is_unmapped {
TemplateCoordKey::for_unmapped_pair(format!("mol{mol_id:08}"))
} else {
let pos_idx = mol_id % num_positions;
let (chrom_idx, local_pos) = position_table[pos_idx];
let mut mol_rng = create_rng(Some(seed));
for _ in 0..params.umi_length {
let _: usize = mol_rng.random_range(0..4);
}
let _ = params.family_dist.sample(&mut mol_rng, params.min_family_size);
let insert_size = params.insert_model.sample(&mut mol_rng);
TemplateCoordKey::for_f1r2_pair(
chrom_idx as i32, local_pos,
insert_size,
String::new(), format!("mol{mol_id:08}"),
)
};
MoleculeInfo { mol_id, seed, sort_key, is_unmapped }
})
.collect();
info!("Sorting {} molecules by template-coordinate...", molecules.len());
molecules.sort_unstable();
let mut writer = create_raw_bam_writer(
&self.output,
&header,
self.threads,
self.compression.compression_level,
)?;
let truth_file = File::create(&self.truth_output)
.with_context(|| format!("Failed to create {}", self.truth_output.display()))?;
let mut truth_writer = BufWriter::new(truth_file);
writeln!(truth_writer, "read_name\ttrue_umi\tmolecule_id\tchrom\tposition\tstrand")?;
info!("Generating and writing records in sorted order...");
let progress = ProgressTracker::new("Processed molecules").with_interval(100_000);
let mut total_pairs = 0usize;
for mol_info in molecules {
progress.log_if_needed(1);
let pos_idx = mol_info.mol_id % num_positions;
let (pos_chrom_idx, pos_local_pos) = position_table[pos_idx];
let (pairs, chrom_idx, local_pos) = generate_molecule_reads(
mol_info.mol_id,
mol_info.seed,
pos_chrom_idx,
pos_local_pos,
mol_info.is_unmapped,
¶ms,
&ref_genome,
);
for (r1, r2, read_name, umi_str, is_top_strand) in pairs {
if mol_info.is_unmapped {
writer.write_raw_record(r1.as_ref())?;
writer.write_raw_record(r2.as_ref())?;
writeln!(
truth_writer,
"{}\t{}\t{}\t*\t*\t*",
read_name, umi_str, mol_info.mol_id
)?;
} else {
let r1_pos = RawRecordView::new(r1.as_ref()).pos();
let r2_pos = RawRecordView::new(r2.as_ref()).pos();
let r2_first = r1_pos > r2_pos;
if r2_first {
writer.write_raw_record(r2.as_ref())?;
writer.write_raw_record(r1.as_ref())?;
} else {
writer.write_raw_record(r1.as_ref())?;
writer.write_raw_record(r2.as_ref())?;
}
let strand_char = if is_top_strand { '+' } else { '-' };
writeln!(
truth_writer,
"{}\t{}\t{}\t{}\t{}\t{}",
read_name,
umi_str,
mol_info.mol_id,
ref_genome.name(chrom_idx),
local_pos,
strand_char,
)?;
}
total_pairs += 1;
}
}
progress.log_final();
truth_writer.flush()?;
writer.finish()?;
info!("Generated {total_pairs} read pairs");
info!("Done");
Ok(())
}
}
type MoleculeReadPair = (RawRecord, RawRecord, String, String, bool);
type MoleculeReadsResult = (Vec<MoleculeReadPair>, usize, usize);
#[allow(clippy::too_many_arguments)]
fn generate_molecule_reads(
mol_id: usize,
seed: u64,
chrom_idx: usize,
local_pos: usize,
is_unmapped: bool,
params: &GenerationParams,
ref_genome: &ReferenceGenome,
) -> MoleculeReadsResult {
let mut rng = create_rng(Some(seed));
let umi = generate_random_sequence(params.umi_length, &mut rng);
let umi_str = String::from_utf8_lossy(&umi).to_string();
let family_size = params.family_dist.sample(&mut rng, params.min_family_size);
let insert_size = params.insert_model.sample(&mut rng);
let is_top_strand: bool = rng.random();
if is_unmapped {
return generate_unmapped_molecule_reads(
mol_id,
chrom_idx,
local_pos,
&umi_str,
family_size,
params,
&mut rng,
);
}
let (eff_chrom_idx, eff_local_pos, shared_template) =
match ref_genome.sequence_at(chrom_idx, local_pos, insert_size) {
Some(seq) => (chrom_idx, local_pos, Some(seq)),
None => match ref_genome.sample_sequence(insert_size, &mut rng) {
Some((c, p, seq)) => (c, p, Some(seq)),
None => (chrom_idx, local_pos, None),
},
};
let chrom_idx = eff_chrom_idx;
let local_pos = eff_local_pos;
let mut pairs = Vec::with_capacity(family_size);
for read_idx in 0..family_size {
let read_name = format!("mol{mol_id:08}_read{read_idx:04}");
let random_template;
let template: &[u8] = if let Some(ref shared) = shared_template {
shared
} else {
random_template = generate_random_sequence(insert_size, &mut rng);
&random_template
};
let fwd_end = params.read_length.min(template.len());
let mut fwd_seq: Vec<u8> = template[..fwd_end].to_vec();
apply_methylation_conversion(
&mut fwd_seq,
template,
0,
true, ¶ms.methylation,
&mut rng,
);
let fwd_seq = pad_sequence(fwd_seq, params.read_length, &mut rng);
let rev_start = insert_size.saturating_sub(params.read_length);
let rev_end = params.read_length.min(template.len().saturating_sub(rev_start));
let mut rev_template: Vec<u8> = template[rev_start..rev_start + rev_end].to_vec();
apply_methylation_conversion(
&mut rev_template,
template,
rev_start,
false, ¶ms.methylation,
&mut rng,
);
let rev_seq = reverse_complement(&rev_template);
let rev_seq = pad_sequence(rev_seq, params.read_length, &mut rng);
let r1_quals = params.quality_model.generate_qualities(params.read_length, &mut rng);
let r2_quals_raw = params.quality_model.generate_qualities(params.read_length, &mut rng);
let r2_quals = params.quality_bias.apply_to_vec(&r2_quals_raw, true);
let mate_cigar = format!("{}M", params.read_length);
let rev_pos = local_pos + insert_size.saturating_sub(params.read_length);
let (r1_seq, r2_seq, r1_is_reverse, r1_pos, r2_pos, r1_tlen) = if is_top_strand {
(fwd_seq, rev_seq, false, local_pos, rev_pos, insert_size as i32)
} else {
(rev_seq, fwd_seq, true, rev_pos, local_pos, -(insert_size as i32))
};
let r1_record = build_record(
&read_name,
&r1_seq,
&r1_quals,
chrom_idx,
r1_pos,
params.mapq,
true, r1_is_reverse, r2_pos,
r1_tlen,
&umi_str,
&mate_cigar,
params.mapq,
);
let r2_record = build_record(
&read_name,
&r2_seq,
&r2_quals,
chrom_idx,
r2_pos,
params.mapq,
false, !r1_is_reverse, r1_pos,
-r1_tlen,
&umi_str,
&mate_cigar,
params.mapq,
);
pairs.push((r1_record, r2_record, read_name, umi_str.clone(), is_top_strand));
}
(pairs, chrom_idx, local_pos)
}
fn generate_unmapped_molecule_reads(
mol_id: usize,
chrom_idx: usize,
local_pos: usize,
umi_str: &str,
family_size: usize,
params: &GenerationParams,
rng: &mut impl Rng,
) -> MoleculeReadsResult {
let mut pairs = Vec::with_capacity(family_size);
for read_idx in 0..family_size {
let read_name = format!("mol{mol_id:08}_read{read_idx:04}");
let r1_seq_raw = generate_random_sequence(params.read_length, rng);
let r1_seq = pad_sequence(r1_seq_raw, params.read_length, rng);
let r2_seq_raw = generate_random_sequence(params.read_length, rng);
let r2_seq = pad_sequence(r2_seq_raw, params.read_length, rng);
let r1_quals = params.quality_model.generate_qualities(params.read_length, rng);
let r2_quals_raw = params.quality_model.generate_qualities(params.read_length, rng);
let r2_quals = params.quality_bias.apply_to_vec(&r2_quals_raw, true);
let r1_record = build_unmapped_record(&read_name, &r1_seq, &r1_quals, true, umi_str);
let r2_record = build_unmapped_record(&read_name, &r2_seq, &r2_quals, false, umi_str);
pairs.push((r1_record, r2_record, read_name, umi_str.to_string(), false));
}
(pairs, chrom_idx, local_pos)
}
fn build_unmapped_record(
name: &str,
seq: &[u8],
quals: &[u8],
is_first: bool,
umi: &str,
) -> RawRecord {
let segment_flag = if is_first { raw_flags::FIRST_SEGMENT } else { raw_flags::LAST_SEGMENT };
let flags = raw_flags::PAIRED | segment_flag | raw_flags::UNMAPPED | raw_flags::MATE_UNMAPPED;
let mut b = SamBuilder::new();
b.read_name(name.as_bytes())
.flags(flags)
.sequence(seq)
.qualities(quals)
.add_string_tag(b"RX", umi.as_bytes());
b.build()
}
#[allow(clippy::too_many_arguments)]
fn build_record(
name: &str,
seq: &[u8],
quals: &[u8],
ref_id: usize,
pos: usize,
mapq: u8,
is_first: bool,
is_reverse: bool,
mate_pos: usize,
tlen: i32,
umi: &str,
mate_cigar: &str,
mate_mapq: u8,
) -> RawRecord {
let segment_flag = if is_first { raw_flags::FIRST_SEGMENT } else { raw_flags::LAST_SEGMENT };
let reverse_flag = if is_reverse { raw_flags::REVERSE } else { 0 };
let mate_reverse_flag = if is_reverse { 0 } else { raw_flags::MATE_REVERSE };
let flags = raw_flags::PAIRED
| raw_flags::PROPER_PAIR
| segment_flag
| reverse_flag
| mate_reverse_flag;
let n = u32::try_from(seq.len()).expect("sequence length fits u32");
let cigar_op = n << 4;
let alignment_start_1based = u32::try_from(pos + 1).expect("alignment start fits u32");
let alignment_end_1based = alignment_start_1based + n.saturating_sub(1);
let bin = region_to_bin(Some(alignment_start_1based), Some(alignment_end_1based));
let ref_id_i32 = i32::try_from(ref_id).expect("ref_id fits i32");
let pos_i32 = i32::try_from(pos).expect("pos fits i32");
let mate_pos_i32 = i32::try_from(mate_pos).expect("mate_pos fits i32");
let mut b = SamBuilder::new();
b.read_name(name.as_bytes())
.flags(flags)
.ref_id(ref_id_i32)
.pos(pos_i32)
.mapq(mapq)
.bin(bin)
.mate_ref_id(ref_id_i32)
.mate_pos(mate_pos_i32)
.template_length(tlen)
.cigar_ops(&[cigar_op])
.sequence(seq)
.qualities(quals)
.add_string_tag(b"RX", umi.as_bytes())
.add_string_tag(b"MC", mate_cigar.as_bytes())
.add_int_tag(b"MQ", i32::from(mate_mapq));
b.build()
}
#[cfg(test)]
mod tests {
use super::*;
use crate::simulate::create_rng;
fn to_record_buf(raw: &RawRecord) -> noodles::sam::alignment::RecordBuf {
fgumi_raw_bam::raw_record_to_record_buf(raw, &noodles::sam::Header::default())
.expect("raw_record_to_record_buf failed in test")
}
#[test]
fn test_generate_random_sequence_length() {
let mut rng = create_rng(Some(42));
for len in [0, 1, 8, 100, 300] {
let seq = generate_random_sequence(len, &mut rng);
assert_eq!(seq.len(), len);
}
}
#[test]
fn test_generate_random_sequence_valid_bases() {
let mut rng = create_rng(Some(42));
let seq = generate_random_sequence(1000, &mut rng);
for &base in &seq {
assert!(
base == b'A' || base == b'C' || base == b'G' || base == b'T',
"Invalid base: {}",
base as char
);
}
}
#[test]
fn test_generate_random_sequence_reproducibility() {
let mut rng1 = create_rng(Some(42));
let mut rng2 = create_rng(Some(42));
let seq1 = generate_random_sequence(100, &mut rng1);
let seq2 = generate_random_sequence(100, &mut rng2);
assert_eq!(seq1, seq2);
}
#[test]
fn test_reverse_complement_basic() {
assert_eq!(reverse_complement(b"A"), b"T");
assert_eq!(reverse_complement(b"T"), b"A");
assert_eq!(reverse_complement(b"C"), b"G");
assert_eq!(reverse_complement(b"G"), b"C");
}
#[test]
fn test_reverse_complement_sequence() {
assert_eq!(reverse_complement(b"ACGT"), b"ACGT");
assert_eq!(reverse_complement(b"AAAA"), b"TTTT");
assert_eq!(reverse_complement(b"AACG"), b"CGTT");
}
#[test]
fn test_reverse_complement_empty() {
let empty: &[u8] = b"";
assert_eq!(reverse_complement(empty), Vec::<u8>::new());
}
#[test]
fn test_reverse_complement_unknown_base() {
assert_eq!(reverse_complement(b"N"), b"N");
assert_eq!(reverse_complement(b"X"), b"X");
assert_eq!(reverse_complement(b"ANCG"), b"CGNT");
}
#[test]
fn test_reverse_complement_double_rc() {
let seq = b"ACGTACGTACGT";
let rc = reverse_complement(seq);
let rc_rc = reverse_complement(&rc);
assert_eq!(rc_rc, seq.to_vec());
}
#[test]
fn test_pad_sequence_already_correct_length() {
let mut rng = create_rng(Some(42));
let seq = vec![b'A', b'C', b'G', b'T'];
let padded = pad_sequence(seq.clone(), 4, &mut rng);
assert_eq!(padded, seq);
}
#[test]
fn test_pad_sequence_needs_padding() {
let mut rng = create_rng(Some(42));
let seq = vec![b'A', b'C'];
let padded = pad_sequence(seq, 6, &mut rng);
assert_eq!(padded.len(), 6);
assert_eq!(&padded[0..2], b"AC");
for &base in &padded[2..] {
assert!(base == b'A' || base == b'C' || base == b'G' || base == b'T');
}
}
#[test]
fn test_pad_sequence_truncates() {
let mut rng = create_rng(Some(42));
let seq = vec![b'A', b'C', b'G', b'T', b'A', b'C'];
let padded = pad_sequence(seq, 3, &mut rng);
assert_eq!(padded, vec![b'A', b'C', b'G']);
}
#[test]
fn test_pad_sequence_empty_to_length() {
let mut rng = create_rng(Some(42));
let seq: Vec<u8> = vec![];
let padded = pad_sequence(seq, 5, &mut rng);
assert_eq!(padded.len(), 5);
for &base in &padded {
assert!(base == b'A' || base == b'C' || base == b'G' || base == b'T');
}
}
#[test]
fn test_pad_sequence_to_zero() {
let mut rng = create_rng(Some(42));
let seq = vec![b'A', b'C', b'G', b'T'];
let padded = pad_sequence(seq, 0, &mut rng);
assert!(padded.is_empty());
}
#[test]
fn test_build_record_r1_basic() {
let seq = b"ACGT";
let quals = vec![30, 30, 30, 30];
let raw = build_record(
"test_read",
seq,
&quals,
0,
100,
60,
true,
false,
200,
150,
"AAAAAAAA",
"4M",
60,
);
let record = to_record_buf(&raw);
assert!(record.name().is_some());
let flags = record.flags();
assert!(flags.is_segmented());
assert!(flags.is_first_segment());
assert!(!flags.is_last_segment());
assert!(!flags.is_reverse_complemented());
assert!(flags.is_mate_reverse_complemented());
}
#[test]
fn test_build_record_r2_basic() {
let seq = b"ACGT";
let quals = vec![30, 30, 30, 30];
let raw = build_record(
"test_read",
seq,
&quals,
0,
200,
60,
false,
true,
100,
-150,
"AAAAAAAA",
"4M",
60,
);
let record = to_record_buf(&raw);
let flags = record.flags();
assert!(flags.is_segmented());
assert!(!flags.is_first_segment());
assert!(flags.is_last_segment());
assert!(flags.is_reverse_complemented());
assert!(!flags.is_mate_reverse_complemented());
}
#[test]
fn test_build_record_positions() {
let seq = b"ACGTACGT";
let quals = vec![30; 8];
let raw =
build_record("read", seq, &quals, 0, 1000, 60, true, false, 1100, 200, "AAA", "8M", 60);
let record = to_record_buf(&raw);
assert_eq!(
record.alignment_start().expect("record should have alignment start").get(),
1001
);
assert_eq!(
record.mate_alignment_start().expect("record should have mate alignment start").get(),
1101
);
assert_eq!(record.template_length(), 200);
}
#[test]
fn test_build_record_mapping_quality() {
let seq = b"ACGT";
let quals = vec![30; 4];
for mapq in [0, 30, 60] {
let raw = build_record(
"read", seq, &quals, 0, 100, mapq, true, false, 200, 100, "AAA", "4M", mapq,
);
let record = to_record_buf(&raw);
assert_eq!(
record.mapping_quality().expect("record should have mapping quality").get(),
mapq
);
}
}
#[test]
fn test_build_record_quality_scores() {
let seq = b"ACGTACGT";
let quals = vec![10, 20, 30, 40, 30, 20, 10, 5];
let raw =
build_record("read", seq, &quals, 0, 100, 60, true, false, 200, 100, "AAA", "8M", 60);
let record = to_record_buf(&raw);
let record_quals: Vec<u8> = record.quality_scores().iter().collect();
assert_eq!(record_quals, quals);
}
#[test]
fn test_build_record_reference_id() {
let seq = b"ACGT";
let quals = vec![30; 4];
for ref_id in [0, 1, 5] {
let raw = build_record(
"read", seq, &quals, ref_id, 100, 60, true, false, 200, 100, "AAA", "4M", 60,
);
let record = to_record_buf(&raw);
assert_eq!(record.reference_sequence_id(), Some(ref_id));
assert_eq!(record.mate_reference_sequence_id(), Some(ref_id));
}
}
#[test]
fn test_build_record_negative_tlen() {
let seq = b"ACGT";
let quals = vec![30; 4];
let raw =
build_record("read", seq, &quals, 0, 200, 60, false, true, 100, -150, "AAA", "4M", 60);
let record = to_record_buf(&raw);
assert_eq!(record.template_length(), -150);
}
#[test]
fn test_emseq_mapped_reads_conversion() {
let template = b"CACACACACACACACAC".to_vec(); let config = MethylationConfig {
mode: fgumi_consensus::MethylationMode::EmSeq,
cpg_methylation_rate: 0.75,
conversion_rate: 1.0,
};
let mut r1 = template[..8].to_vec();
let mut rng = create_rng(Some(42));
apply_methylation_conversion(&mut r1, &template, 0, true, &config, &mut rng);
for (i, &b) in r1.iter().enumerate() {
if template[i] == b'C' {
assert_eq!(b, b'T', "position {i}: non-CpG C should convert");
}
}
}
#[test]
fn test_reads_in_family_share_template_with_reference() {
use std::io::Write;
use tempfile::NamedTempFile;
let mut fasta = NamedTempFile::new().unwrap();
writeln!(fasta, ">chr1").unwrap();
let seq = b"ACGT".repeat(500); fasta.write_all(&seq).unwrap();
writeln!(fasta).unwrap();
fasta.flush().unwrap();
let ref_genome = ReferenceGenome::load(fasta.path()).unwrap();
let params = GenerationParams {
read_length: 50,
umi_length: 8,
mapq: 60,
min_family_size: 3,
quality_model: crate::simulate::PositionQualityModel::new(
10, 25, 37, 100, 0.08, 2, 0.0,
),
quality_bias: crate::simulate::ReadPairQualityBias::new(0),
family_dist: crate::simulate::FamilySizeDistribution::log_normal(5.0, 1.0),
insert_model: crate::simulate::InsertSizeModel::new(100.0, 10.0, 80, 120),
methylation: MethylationConfig {
mode: fgumi_consensus::MethylationMode::Disabled,
cpg_methylation_rate: 0.75,
conversion_rate: 0.98,
},
};
let (pairs, _chrom, _pos) =
generate_molecule_reads(0, 42, 0, 500, false, ¶ms, &ref_genome);
assert!(pairs.len() >= 3, "Expected at least 3 reads, got {}", pairs.len());
}
#[test]
fn test_mapped_reads_uses_real_ref_coordinates() {
use std::io::Write as IoWrite;
use tempfile::NamedTempFile;
let mut fasta = NamedTempFile::new().unwrap();
writeln!(fasta, ">chr1").unwrap();
fasta.write_all(&b"ACGT".repeat(500)).unwrap(); writeln!(fasta).unwrap();
writeln!(fasta, ">chr2").unwrap();
fasta.write_all(&b"CCGG".repeat(375)).unwrap(); writeln!(fasta).unwrap();
writeln!(fasta, ">chr3").unwrap();
fasta.write_all(&b"AATT".repeat(450)).unwrap(); writeln!(fasta).unwrap();
fasta.flush().unwrap();
let ref_genome = ReferenceGenome::load(fasta.path()).unwrap();
let mut rng = create_rng(Some(42));
let positions = ref_genome.sample_positions(10, &mut rng);
let params = GenerationParams {
read_length: 50,
umi_length: 8,
mapq: 60,
min_family_size: 1,
quality_model: crate::simulate::PositionQualityModel::new(
10, 25, 37, 100, 0.08, 2, 0.0,
),
quality_bias: crate::simulate::ReadPairQualityBias::new(0),
family_dist: crate::simulate::FamilySizeDistribution::log_normal(5.0, 1.0),
insert_model: crate::simulate::InsertSizeModel::new(100.0, 10.0, 80, 120),
methylation: MethylationConfig {
mode: fgumi_consensus::MethylationMode::Disabled,
cpg_methylation_rate: 0.75,
conversion_rate: 0.98,
},
};
let (chrom_idx, local_pos) = positions[0];
let (pairs, eff_chrom, eff_pos) =
generate_molecule_reads(0, 42, chrom_idx, local_pos, false, ¶ms, &ref_genome);
assert!(!pairs.is_empty());
for (r1, r2, _, _, _) in &pairs {
let r1_view = RawRecordView::new(r1.as_ref());
let r2_view = RawRecordView::new(r2.as_ref());
assert_eq!(r1_view.ref_id() as usize, eff_chrom);
assert_eq!(r2_view.ref_id() as usize, eff_chrom);
let r1_pos = r1_view.pos();
let r2_pos = r2_view.pos();
assert!(
r1_pos == eff_pos as i32 || r2_pos == eff_pos as i32,
"Expected one read at position {} (0-based), got r1={}, r2={}",
eff_pos,
r1_pos,
r2_pos
);
}
}
#[test]
fn test_strand_coin_flip_produces_both_orientations() {
use std::io::Write as IoWrite;
use tempfile::NamedTempFile;
let mut fasta = NamedTempFile::new().unwrap();
writeln!(fasta, ">chr1").unwrap();
fasta.write_all(&b"ACGT".repeat(500)).unwrap();
writeln!(fasta).unwrap();
fasta.flush().unwrap();
let ref_genome = ReferenceGenome::load(fasta.path()).unwrap();
let params = GenerationParams {
read_length: 50,
umi_length: 8,
mapq: 60,
min_family_size: 1,
quality_model: crate::simulate::PositionQualityModel::new(
10, 25, 37, 100, 0.08, 2, 0.0,
),
quality_bias: crate::simulate::ReadPairQualityBias::new(0),
family_dist: crate::simulate::FamilySizeDistribution::log_normal(1.0, 0.1),
insert_model: crate::simulate::InsertSizeModel::new(100.0, 5.0, 80, 120),
methylation: MethylationConfig {
mode: fgumi_consensus::MethylationMode::Disabled,
cpg_methylation_rate: 0.75,
conversion_rate: 0.98,
},
};
let mut saw_f1r2 = false;
let mut saw_r1f2 = false;
for seed in 0u64..100 {
let (pairs, _chrom, _pos) =
generate_molecule_reads(seed as usize, seed, 0, 500, false, ¶ms, &ref_genome);
for (r1, _, _, _, is_top) in &pairs {
let r1_flags = RawRecordView::new(r1.as_ref()).flags();
let is_reverse = (r1_flags & raw_flags::REVERSE) != 0;
if *is_top {
saw_f1r2 = true;
assert!(!is_reverse, "F1R2: R1 should be forward");
} else {
saw_r1f2 = true;
assert!(is_reverse, "R1F2: R1 should be reverse");
}
}
}
assert!(saw_f1r2, "Should see at least one F1R2 molecule");
assert!(saw_r1f2, "Should see at least one R1F2 molecule");
}
fn basic_unmapped_test_params(min_family_size: usize) -> GenerationParams {
GenerationParams {
read_length: 50,
umi_length: 8,
mapq: 60,
min_family_size,
quality_model: crate::simulate::PositionQualityModel::new(
10, 25, 37, 100, 0.08, 2, 0.0,
),
quality_bias: crate::simulate::ReadPairQualityBias::new(0),
family_dist: crate::simulate::FamilySizeDistribution::log_normal(5.0, 1.0),
insert_model: crate::simulate::InsertSizeModel::new(100.0, 10.0, 80, 120),
methylation: MethylationConfig {
mode: fgumi_consensus::MethylationMode::Disabled,
cpg_methylation_rate: 0.75,
conversion_rate: 0.98,
},
}
}
#[test]
fn test_unmapped_molecule_produces_unmapped_records() {
use std::io::Write as IoWrite;
use tempfile::NamedTempFile;
let mut fasta = NamedTempFile::new().unwrap();
writeln!(fasta, ">chr1").unwrap();
fasta.write_all(&b"ACGT".repeat(500)).unwrap();
writeln!(fasta).unwrap();
fasta.flush().unwrap();
let ref_genome = ReferenceGenome::load(fasta.path()).unwrap();
let params = basic_unmapped_test_params(3);
let (pairs, _chrom, _pos) =
generate_molecule_reads(0, 42, 0, 500, true, ¶ms, &ref_genome);
assert!(!pairs.is_empty(), "molecule should still emit pairs when unmapped");
for (r1, r2, _, umi, _) in &pairs {
for r in [r1, r2] {
let rb = to_record_buf(r);
let flags = rb.flags();
assert!(flags.is_segmented(), "PAIRED flag should be set");
assert!(flags.is_unmapped(), "UNMAPPED flag should be set");
assert!(flags.is_mate_unmapped(), "MATE_UNMAPPED flag should be set");
assert!(rb.reference_sequence_id().is_none(), "ref_id should be unset");
assert!(rb.alignment_start().is_none(), "pos should be unset");
assert!(rb.mate_reference_sequence_id().is_none(), "mate_ref_id should be unset");
assert!(rb.mate_alignment_start().is_none(), "mate_pos should be unset");
assert_eq!(
rb.mapping_quality().map(|m| m.get()),
Some(0),
"MAPQ should be 0 on unmapped reads"
);
assert_eq!(rb.template_length(), 0, "TLEN should be 0");
assert_eq!(rb.cigar().as_ref().len(), 0, "CIGAR should be empty");
assert_eq!(rb.sequence().len(), params.read_length, "sequence length preserved");
assert_eq!(
rb.quality_scores().as_ref().len(),
params.read_length,
"qualities length preserved"
);
}
let r1_flags = to_record_buf(r1).flags();
let r2_flags = to_record_buf(r2).flags();
assert!(r1_flags.is_first_segment(), "R1 should be FIRST_SEGMENT");
assert!(r2_flags.is_last_segment(), "R2 should be LAST_SEGMENT");
assert_eq!(umi.len(), params.umi_length, "UMI length preserved");
}
}
#[test]
fn test_unmapped_molecule_has_no_mate_alignment_tags() {
use noodles::sam::alignment::record::data::field::Tag;
use std::io::Write as IoWrite;
use tempfile::NamedTempFile;
let mut fasta = NamedTempFile::new().unwrap();
writeln!(fasta, ">chr1").unwrap();
fasta.write_all(&b"ACGT".repeat(500)).unwrap();
writeln!(fasta).unwrap();
fasta.flush().unwrap();
let ref_genome = ReferenceGenome::load(fasta.path()).unwrap();
let params = basic_unmapped_test_params(2);
let (pairs, _chrom, _pos) =
generate_molecule_reads(1, 99, 0, 500, true, ¶ms, &ref_genome);
assert!(!pairs.is_empty());
let mc = Tag::from([b'M', b'C']);
let mq = Tag::from([b'M', b'Q']);
let rx = Tag::from([b'R', b'X']);
for (r1, r2, _, _, _) in &pairs {
for r in [r1, r2] {
let rb = to_record_buf(r);
assert!(rb.data().get(&mc).is_none(), "MC tag must not be present");
assert!(rb.data().get(&mq).is_none(), "MQ tag must not be present");
assert!(rb.data().get(&rx).is_some(), "RX tag must still be present");
}
}
}
#[test]
fn test_mapped_path_unchanged_when_is_unmapped_false() {
use std::io::Write as IoWrite;
use tempfile::NamedTempFile;
let mut fasta = NamedTempFile::new().unwrap();
writeln!(fasta, ">chr1").unwrap();
fasta.write_all(&b"ACGT".repeat(500)).unwrap();
writeln!(fasta).unwrap();
fasta.flush().unwrap();
let ref_genome = ReferenceGenome::load(fasta.path()).unwrap();
let params = basic_unmapped_test_params(1);
let (pairs, _chrom, _pos) =
generate_molecule_reads(0, 42, 0, 500, false, ¶ms, &ref_genome);
assert!(!pairs.is_empty());
for (r1, r2, _, _, _) in &pairs {
for r in [r1, r2] {
let rb = to_record_buf(r);
let flags = rb.flags();
assert!(!flags.is_unmapped(), "mapped path should not set UNMAPPED");
assert!(!flags.is_mate_unmapped(), "mapped path should not set MATE_UNMAPPED");
assert!(rb.reference_sequence_id().is_some());
assert!(rb.alignment_start().is_some());
assert!(rb.mapping_quality().is_some());
assert_ne!(rb.cigar().as_ref().len(), 0, "mapped path should populate CIGAR");
}
}
}
fn write_minimal_fasta(dir: &std::path::Path) -> std::path::PathBuf {
use std::io::Write as IoWrite;
let fasta = dir.join("ref.fa");
let mut f = std::fs::File::create(&fasta).unwrap();
writeln!(f, ">chr1").unwrap();
f.write_all(&b"ACGT".repeat(1500)).unwrap(); writeln!(f).unwrap();
fasta
}
#[test]
fn test_execute_unmapped_fraction_one_produces_all_unmapped() {
use clap::Parser;
use tempfile::tempdir;
let dir = tempdir().unwrap();
let fasta = write_minimal_fasta(dir.path());
let out_bam = dir.path().join("out.bam");
let truth = dir.path().join("truth.tsv");
let cmd = MappedReads::try_parse_from([
"mapped-reads",
"-o",
out_bam.to_str().unwrap(),
"--truth",
truth.to_str().unwrap(),
"-r",
fasta.to_str().unwrap(),
"--num-molecules",
"20",
"--unmapped-fraction",
"1.0",
"--seed",
"42",
])
.expect("CLI parse");
cmd.execute("test").expect("execute() succeeded");
let mut reader = noodles::bam::io::reader::Builder.build_from_path(&out_bam).unwrap();
let header = reader.read_header().unwrap();
let mut n_records = 0;
for result in reader.records() {
let record = result.unwrap();
assert!(
record.flags().is_unmapped(),
"every record must be UNMAPPED when --unmapped-fraction=1.0"
);
assert!(
record.flags().is_mate_unmapped(),
"every record must be MATE_UNMAPPED when --unmapped-fraction=1.0"
);
n_records += 1;
let _ = &header;
}
assert!(n_records > 0, "expected some records, got 0");
}
#[test]
fn test_execute_unmapped_fraction_zero_produces_no_unmapped() {
use clap::Parser;
use tempfile::tempdir;
let dir = tempdir().unwrap();
let fasta = write_minimal_fasta(dir.path());
let out_bam = dir.path().join("out.bam");
let truth = dir.path().join("truth.tsv");
let cmd = MappedReads::try_parse_from([
"mapped-reads",
"-o",
out_bam.to_str().unwrap(),
"--truth",
truth.to_str().unwrap(),
"-r",
fasta.to_str().unwrap(),
"--num-molecules",
"20",
"--unmapped-fraction",
"0.0",
"--seed",
"42",
])
.expect("CLI parse");
cmd.execute("test").expect("execute() succeeded");
let mut reader = noodles::bam::io::reader::Builder.build_from_path(&out_bam).unwrap();
let _ = reader.read_header().unwrap();
for result in reader.records() {
let record = result.unwrap();
assert!(
!record.flags().is_unmapped(),
"no record should be UNMAPPED when --unmapped-fraction=0.0"
);
}
}
#[test]
fn test_execute_unmapped_fraction_mixed_sorts_unmapped_last() {
use clap::Parser;
use tempfile::tempdir;
let dir = tempdir().unwrap();
let fasta = write_minimal_fasta(dir.path());
let out_bam = dir.path().join("out.bam");
let truth = dir.path().join("truth.tsv");
let cmd = MappedReads::try_parse_from([
"mapped-reads",
"-o",
out_bam.to_str().unwrap(),
"--truth",
truth.to_str().unwrap(),
"-r",
fasta.to_str().unwrap(),
"--num-molecules",
"50",
"--unmapped-fraction",
"0.5",
"--seed",
"42",
])
.expect("CLI parse");
cmd.execute("test").expect("execute() succeeded");
let mut reader = noodles::bam::io::reader::Builder.build_from_path(&out_bam).unwrap();
let _ = reader.read_header().unwrap();
let mut mapped_count = 0;
let mut unmapped_count = 0;
let mut first_unmapped_idx: Option<usize> = None;
let mut last_mapped_idx: Option<usize> = None;
for (idx, result) in reader.records().enumerate() {
let record = result.unwrap();
if record.flags().is_unmapped() {
unmapped_count += 1;
first_unmapped_idx.get_or_insert(idx);
} else {
mapped_count += 1;
last_mapped_idx = Some(idx);
}
}
assert!(mapped_count > 0, "expected some mapped records, got {mapped_count}");
assert!(unmapped_count > 0, "expected some unmapped records, got {unmapped_count}");
if let (Some(first_unmapped), Some(last_mapped)) = (first_unmapped_idx, last_mapped_idx) {
assert!(
last_mapped < first_unmapped,
"all mapped records must sort before any unmapped record \
(last mapped idx {last_mapped}, first unmapped idx {first_unmapped})"
);
}
}
#[rstest::rstest]
#[case("-0.1")]
#[case("1.5")]
#[case("NaN")]
fn test_execute_rejects_out_of_range_unmapped_fraction(#[case] bad: &str) {
use clap::Parser;
use tempfile::tempdir;
let dir = tempdir().unwrap();
let fasta = write_minimal_fasta(dir.path());
let out_bam = dir.path().join("out.bam");
let truth = dir.path().join("truth.tsv");
let flag_arg = format!("--unmapped-fraction={bad}");
let cmd = MappedReads::try_parse_from([
"mapped-reads",
"-o",
out_bam.to_str().unwrap(),
"--truth",
truth.to_str().unwrap(),
"-r",
fasta.to_str().unwrap(),
"--num-molecules",
"5",
&flag_arg,
"--seed",
"1",
])
.expect("CLI parse");
let result = cmd.execute("test");
assert!(result.is_err(), "value {bad} should be rejected");
let msg = result.unwrap_err().to_string();
assert!(
msg.contains("--unmapped-fraction"),
"error message should mention the flag name, got: {msg}"
);
}
}