use super::sort::TemplateCoordKey;
use crate::bam_io::create_raw_bam_writer;
use crate::commands::command::Command;
use crate::commands::common::{CompressionOptions, parse_bool};
use crate::commands::simulate::common::{
FamilySizeArgs, InsertSizeArgs, MethylationArgs, MethylationConfig, MoleculeInfo,
PositionDistArgs, QualityArgs, ReferenceArgs, ReferenceGenome, SimulationCommon,
StrandBiasArgs, apply_methylation_conversion, generate_random_sequence, pad_sequence,
};
use crate::commands::simulate::region_to_bin;
use crate::dna::reverse_complement;
use crate::progress::ProgressTracker;
use crate::simulate::{
FamilySizeDistribution, InsertSizeModel, PositionQualityModel, ReadPairQualityBias,
StrandBiasModel, create_rng,
};
use anyhow::{Context, Result};
use clap::Parser;
use fgumi_raw_bam::{RawRecord, SamBuilder, flags as raw_flags};
use log::info;
use noodles::sam::header::Header;
use noodles::sam::header::record::value::map::header::{self as HeaderRecord, Tag as HeaderTag};
use rand::{Rng, RngExt};
use std::fs::File;
use std::io::{BufWriter, Write};
use std::path::PathBuf;
#[derive(Parser, Debug)]
#[command(
name = "grouped-reads",
about = "Generate grouped BAM with MI tags for consensus calling",
long_about = r#"
Generate synthetic BAM files with MI (molecule ID) tags already assigned.
The output is template-coordinate sorted and suitable for input to
`fgumi simplex`, `fgumi duplex`, or `fgumi codec`.
"#
)]
pub struct GroupedReads {
#[arg(short = 'o', long = "output", required = true)]
pub output: PathBuf,
#[arg(long = "truth", required = true)]
pub truth_output: PathBuf,
#[arg(long = "duplex", default_value = "false", num_args = 0..=1, default_missing_value = "true", action = clap::ArgAction::Set, value_parser = parse_bool)]
pub duplex: bool,
#[arg(long = "mapq", default_value = "60")]
pub mapq: u8,
#[arg(short = 't', long = "threads", default_value = "1")]
pub threads: usize,
#[command(flatten)]
pub compression: CompressionOptions,
#[command(flatten)]
pub common: SimulationCommon,
#[command(flatten)]
pub quality: QualityArgs,
#[command(flatten)]
pub family_size: FamilySizeArgs,
#[command(flatten)]
pub insert_size: InsertSizeArgs,
#[command(flatten)]
pub reference: ReferenceArgs,
#[command(flatten)]
pub position_dist: PositionDistArgs,
#[command(flatten)]
pub strand_bias: StrandBiasArgs,
#[command(flatten)]
pub methylation: MethylationArgs,
}
struct GenerationParams {
read_length: usize,
umi_length: usize,
mapq: u8,
duplex: bool,
min_family_size: usize,
quality_model: PositionQualityModel,
quality_bias: ReadPairQualityBias,
family_dist: FamilySizeDistribution,
insert_model: InsertSizeModel,
strand_bias_model: StrandBiasModel,
methylation: MethylationConfig,
}
impl Command for GroupedReads {
fn execute(&self, command_line: &str) -> Result<()> {
let methylation = self.methylation.resolve();
self.methylation.validate()?;
info!("Generating grouped reads");
info!(" Output: {}", self.output.display());
info!(" Truth: {}", self.truth_output.display());
info!(" Duplex: {}", self.duplex);
info!(" Num molecules: {}", self.common.num_molecules);
info!(" Read length: {}", self.common.read_length);
info!(" UMI length: {}", self.common.umi_length);
info!(" Threads: {}", self.threads);
if methylation.mode.is_enabled() {
info!(" Methylation mode: {:?}", methylation.mode);
info!(" CpG methylation rate: {}", methylation.cpg_methylation_rate);
info!(" Conversion rate: {}", methylation.conversion_rate);
}
let ref_genome = ReferenceGenome::load(&self.reference.reference)?;
if ref_genome.max_contig_length() < self.common.read_length {
anyhow::bail!(
"No reference contig is >= read length ({} bp). \
The longest contig is {} bp. Use a larger reference or shorter --read-length.",
self.common.read_length,
ref_genome.max_contig_length(),
);
}
let ref_header = ref_genome.build_bam_header();
let mut header_builder = Header::builder();
let HeaderTag::Other(so_tag) = HeaderTag::from([b'S', b'O']) else { unreachable!() };
let HeaderTag::Other(go_tag) = HeaderTag::from([b'G', b'O']) else { unreachable!() };
let HeaderTag::Other(ss_tag) = HeaderTag::from([b'S', b'S']) else { unreachable!() };
let header_map =
noodles::sam::header::record::value::Map::<HeaderRecord::Header>::builder()
.insert(so_tag, "unsorted")
.insert(go_tag, "query")
.insert(ss_tag, "template-coordinate")
.build()
.expect("header map with valid SO/GO/SS tags");
header_builder = header_builder.set_header(header_map);
for (name, map) in ref_header.reference_sequences() {
header_builder = header_builder.add_reference_sequence(name.clone(), map.clone());
}
header_builder = crate::commands::common::add_pg_to_builder(header_builder, command_line)?;
let header = header_builder.build();
let params = GenerationParams {
read_length: self.common.read_length,
umi_length: self.common.umi_length,
mapq: self.mapq,
duplex: self.duplex,
min_family_size: self.family_size.min_family_size,
quality_model: self.quality.to_quality_model(),
quality_bias: self.quality.to_quality_bias(),
family_dist: self.family_size.to_family_size_distribution()?,
insert_model: self.insert_size.to_insert_size_model(),
strand_bias_model: self.strand_bias.to_strand_bias_model(),
methylation,
};
let num_positions = self.position_dist.num_positions.unwrap_or(self.common.num_molecules);
if num_positions == 0 {
anyhow::bail!("--num-positions must be greater than 0");
}
let usable_bases = ref_genome.total_length().saturating_sub(1000);
let bases_per_position = usable_bases as f64 / num_positions as f64;
if bases_per_position < 1.0 {
anyhow::bail!(
"Too many positions ({num_positions}) for reference of size {} bp. \
Reduce --num-positions or use a larger reference.",
ref_genome.total_length()
);
} else if bases_per_position < 10.0 {
log::warn!(
"Low position spacing ({bases_per_position:.1} bp/position) may cause UMI collisions."
);
}
let mut pos_rng = create_rng(self.common.seed);
let position_table = ref_genome.sample_positions(num_positions, &mut pos_rng);
let mut seed_rng = create_rng(self.common.seed.map(|s| s.wrapping_add(1)));
info!("Computing molecule positions and sort keys...");
let mut molecules: Vec<MoleculeInfo> = (0..self.common.num_molecules)
.map(|mol_id| {
let seed: u64 = seed_rng.random();
let pos_idx = mol_id % num_positions;
let (chrom_idx, local_pos) = position_table[pos_idx];
let mut mol_rng = create_rng(Some(seed));
for _ in 0..params.umi_length {
let _: usize = mol_rng.random_range(0..4);
}
let _ = params.family_dist.sample(&mut mol_rng, params.min_family_size);
let insert_size = params.insert_model.sample(&mut mol_rng);
let sort_key = TemplateCoordKey::for_f1r2_pair(
chrom_idx as i32, local_pos,
insert_size,
mol_id.to_string(), format!("mol{mol_id:08}"),
);
MoleculeInfo { mol_id, seed, sort_key, is_unmapped: false }
})
.collect();
info!("Sorting {} molecules by template-coordinate...", molecules.len());
molecules.sort_unstable();
let mut writer = create_raw_bam_writer(
&self.output,
&header,
self.threads,
self.compression.compression_level,
)?;
let truth_file = File::create(&self.truth_output)
.with_context(|| format!("Failed to create {}", self.truth_output.display()))?;
let mut truth_writer = BufWriter::new(truth_file);
writeln!(
truth_writer,
"read_name\ttrue_umi\tmolecule_id\tmi_tag\tchrom\tposition\tstrand"
)?;
info!("Generating and writing records in sorted order...");
let progress = ProgressTracker::new("Processed molecules").with_interval(100_000);
let mut total_pairs = 0usize;
for mol_info in molecules {
progress.log_if_needed(1);
let pos_idx = mol_info.mol_id % num_positions;
let (pos_chrom_idx, pos_local_pos) = position_table[pos_idx];
let (pairs, chrom_idx, local_pos) = generate_molecule_reads(
mol_info.mol_id,
mol_info.seed,
pos_chrom_idx,
pos_local_pos,
¶ms,
&ref_genome,
);
for (r1, r2, read_name, umi_str, mi_tag, is_top_strand, insert_size) in pairs {
let r2_first = !is_top_strand && insert_size < 2 * params.read_length;
if r2_first {
writer.write_raw_record(r2.as_ref())?;
writer.write_raw_record(r1.as_ref())?;
} else {
writer.write_raw_record(r1.as_ref())?;
writer.write_raw_record(r2.as_ref())?;
}
let strand_char = if is_top_strand { '+' } else { '-' };
writeln!(
truth_writer,
"{}\t{}\t{}\t{}\t{}\t{}\t{}",
read_name,
umi_str,
mol_info.mol_id,
mi_tag,
ref_genome.name(chrom_idx),
local_pos,
strand_char,
)?;
total_pairs += 1;
}
}
progress.log_final();
truth_writer.flush()?;
writer.finish()?;
info!("Generated {total_pairs} read pairs");
info!("Done");
Ok(())
}
}
type MoleculeReadPair = (RawRecord, RawRecord, String, String, String, bool, usize);
type MoleculeReadsResult = (Vec<MoleculeReadPair>, usize, usize);
#[allow(clippy::too_many_arguments)]
fn generate_molecule_reads(
mol_id: usize,
seed: u64,
chrom_idx: usize,
local_pos: usize,
params: &GenerationParams,
ref_genome: &ReferenceGenome,
) -> MoleculeReadsResult {
let mut rng = create_rng(Some(seed));
let umi = generate_random_sequence(params.umi_length, &mut rng);
let umi_str = String::from_utf8_lossy(&umi).to_string();
let family_size = params.family_dist.sample(&mut rng, params.min_family_size);
let insert_size = params.insert_model.sample(&mut rng);
let is_top_strand: bool = rng.random();
let (eff_chrom_idx, eff_local_pos, shared_template) =
match ref_genome.sequence_at(chrom_idx, local_pos, insert_size) {
Some(seq) => (chrom_idx, local_pos, Some(seq)),
None => match ref_genome.sample_sequence(insert_size, &mut rng) {
Some((c, p, seq)) => (c, p, Some(seq)),
None => (chrom_idx, local_pos, None),
},
};
let chrom_idx = eff_chrom_idx;
let local_pos = eff_local_pos;
let mut pairs = Vec::new();
if params.duplex {
let (a_count, b_count) = params.strand_bias_model.split_reads(family_size, &mut rng);
let a_is_top = is_top_strand;
for read_idx in 0..a_count {
let read_name = format!("mol{mol_id:08}_readA{read_idx:04}");
let mi_tag = format!("{mol_id}/A");
let (r1_record, r2_record) = generate_read_pair_records(
&read_name,
&umi_str,
&mi_tag,
chrom_idx,
local_pos,
insert_size,
params.read_length,
params.mapq,
a_is_top,
¶ms.quality_model,
¶ms.quality_bias,
¶ms.methylation,
shared_template.as_deref(),
&mut rng,
);
pairs.push((
r1_record,
r2_record,
read_name,
umi_str.clone(),
mi_tag,
a_is_top,
insert_size,
));
}
let b_is_top = !is_top_strand;
for read_idx in 0..b_count {
let read_name = format!("mol{mol_id:08}_readB{read_idx:04}");
let mi_tag = format!("{mol_id}/B");
let (r1_record, r2_record) = generate_read_pair_records(
&read_name,
&umi_str,
&mi_tag,
chrom_idx,
local_pos,
insert_size,
params.read_length,
params.mapq,
b_is_top,
¶ms.quality_model,
¶ms.quality_bias,
¶ms.methylation,
shared_template.as_deref(),
&mut rng,
);
pairs.push((
r1_record,
r2_record,
read_name,
umi_str.clone(),
mi_tag,
b_is_top,
insert_size,
));
}
} else {
let mi_tag = mol_id.to_string();
for read_idx in 0..family_size {
let read_name = format!("mol{mol_id:08}_read{read_idx:04}");
let (r1_record, r2_record) = generate_read_pair_records(
&read_name,
&umi_str,
&mi_tag,
chrom_idx,
local_pos,
insert_size,
params.read_length,
params.mapq,
is_top_strand,
¶ms.quality_model,
¶ms.quality_bias,
¶ms.methylation,
shared_template.as_deref(),
&mut rng,
);
pairs.push((
r1_record,
r2_record,
read_name,
umi_str.clone(),
mi_tag.clone(),
is_top_strand,
insert_size,
));
}
}
(pairs, chrom_idx, local_pos)
}
#[allow(clippy::too_many_arguments)]
fn generate_read_pair_records(
read_name: &str,
umi_str: &str,
mi_tag: &str,
chrom_idx: usize,
local_pos: usize,
insert_size: usize,
read_length: usize,
mapq: u8,
is_top_strand: bool,
quality_model: &PositionQualityModel,
quality_bias: &ReadPairQualityBias,
methylation: &MethylationConfig,
shared_template: Option<&[u8]>,
rng: &mut impl Rng,
) -> (RawRecord, RawRecord) {
let random_template;
let template: &[u8] = if let Some(shared) = shared_template {
shared
} else {
random_template = generate_random_sequence(insert_size, rng);
&random_template
};
let fwd_end = read_length.min(template.len());
let mut fwd_seq: Vec<u8> = template[..fwd_end].to_vec();
apply_methylation_conversion(
&mut fwd_seq,
template,
0,
true, methylation,
rng,
);
let fwd_seq = pad_sequence(fwd_seq, read_length, rng);
let rev_start = insert_size.saturating_sub(read_length);
let rev_end = read_length.min(template.len().saturating_sub(rev_start));
let mut rev_template: Vec<u8> = template[rev_start..rev_start + rev_end].to_vec();
apply_methylation_conversion(
&mut rev_template,
template,
rev_start,
false, methylation,
rng,
);
let rev_seq = reverse_complement(&rev_template);
let rev_seq = pad_sequence(rev_seq, read_length, rng);
let r1_quals = quality_model.generate_qualities(read_length, rng);
let r2_quals_raw = quality_model.generate_qualities(read_length, rng);
let r2_quals = quality_bias.apply_to_vec(&r2_quals_raw, true);
let mate_cigar = format!("{read_length}M");
let rev_pos = local_pos + insert_size.saturating_sub(read_length);
let (r1_seq, r2_seq, r1_is_reverse, r1_pos, r2_pos, r1_tlen) = if is_top_strand {
(fwd_seq, rev_seq, false, local_pos, rev_pos, insert_size as i32)
} else {
(rev_seq, fwd_seq, true, rev_pos, local_pos, -(insert_size as i32))
};
let r1_record = build_record(
read_name,
&r1_seq,
&r1_quals,
chrom_idx,
r1_pos,
mapq,
true, r1_is_reverse, r2_pos,
r1_tlen,
umi_str,
mi_tag,
&mate_cigar,
mapq,
);
let r2_record = build_record(
read_name,
&r2_seq,
&r2_quals,
chrom_idx,
r2_pos,
mapq,
false, !r1_is_reverse, r1_pos,
-r1_tlen,
umi_str,
mi_tag,
&mate_cigar,
mapq,
);
(r1_record, r2_record)
}
#[allow(clippy::too_many_arguments)]
fn build_record(
name: &str,
seq: &[u8],
quals: &[u8],
ref_id: usize,
pos: usize,
mapq: u8,
is_first: bool,
is_reverse: bool,
mate_pos: usize,
tlen: i32,
umi: &str,
mi_tag: &str,
mate_cigar: &str,
mate_mapq: u8,
) -> RawRecord {
let segment_flag = if is_first { raw_flags::FIRST_SEGMENT } else { raw_flags::LAST_SEGMENT };
let reverse_flag = if is_reverse { raw_flags::REVERSE } else { 0 };
let mate_reverse_flag = if is_reverse { 0 } else { raw_flags::MATE_REVERSE };
let flags = raw_flags::PAIRED
| raw_flags::PROPER_PAIR
| segment_flag
| reverse_flag
| mate_reverse_flag;
let n = u32::try_from(seq.len()).expect("sequence length fits u32");
let cigar_op = n << 4;
let alignment_start_1based = u32::try_from(pos + 1).expect("alignment start fits u32");
let alignment_end_1based = alignment_start_1based + n.saturating_sub(1);
let bin = region_to_bin(Some(alignment_start_1based), Some(alignment_end_1based));
let ref_id_i32 = i32::try_from(ref_id).expect("ref_id fits i32");
let pos_i32 = i32::try_from(pos).expect("pos fits i32");
let mate_pos_i32 = i32::try_from(mate_pos).expect("mate_pos fits i32");
let mut b = SamBuilder::new();
b.read_name(name.as_bytes())
.flags(flags)
.ref_id(ref_id_i32)
.pos(pos_i32)
.mapq(mapq)
.bin(bin)
.mate_ref_id(ref_id_i32)
.mate_pos(mate_pos_i32)
.template_length(tlen)
.cigar_ops(&[cigar_op])
.sequence(seq)
.qualities(quals)
.add_string_tag(b"RX", umi.as_bytes())
.add_string_tag(b"MI", mi_tag.as_bytes())
.add_string_tag(b"MC", mate_cigar.as_bytes())
.add_int_tag(b"MQ", i32::from(mate_mapq));
b.build()
}
#[cfg(test)]
mod tests {
use super::*;
use crate::simulate::create_rng;
use fgumi_raw_bam::RawRecordView;
const DISABLED_METHYLATION: MethylationConfig = MethylationConfig {
mode: fgumi_consensus::MethylationMode::Disabled,
cpg_methylation_rate: 0.75,
conversion_rate: 0.98,
};
fn to_record_buf(raw: &RawRecord) -> noodles::sam::alignment::RecordBuf {
fgumi_raw_bam::raw_record_to_record_buf(raw, &noodles::sam::Header::default())
.expect("raw_record_to_record_buf failed in test")
}
#[test]
fn test_generate_random_sequence_length() {
let mut rng = create_rng(Some(42));
for len in [0, 1, 8, 100, 300] {
let seq = generate_random_sequence(len, &mut rng);
assert_eq!(seq.len(), len);
}
}
#[test]
fn test_generate_random_sequence_valid_bases() {
let mut rng = create_rng(Some(42));
let seq = generate_random_sequence(1000, &mut rng);
for &base in &seq {
assert!(
base == b'A' || base == b'C' || base == b'G' || base == b'T',
"Invalid base: {}",
base as char
);
}
}
#[test]
fn test_reverse_complement_basic() {
assert_eq!(reverse_complement(b"A"), b"T");
assert_eq!(reverse_complement(b"T"), b"A");
assert_eq!(reverse_complement(b"C"), b"G");
assert_eq!(reverse_complement(b"G"), b"C");
}
#[test]
fn test_reverse_complement_sequence() {
assert_eq!(reverse_complement(b"ACGT"), b"ACGT");
assert_eq!(reverse_complement(b"AAAA"), b"TTTT");
assert_eq!(reverse_complement(b"AACG"), b"CGTT");
}
#[test]
fn test_pad_sequence_already_correct_length() {
let mut rng = create_rng(Some(42));
let seq = vec![b'A', b'C', b'G', b'T'];
let padded = pad_sequence(seq.clone(), 4, &mut rng);
assert_eq!(padded, seq);
}
#[test]
fn test_pad_sequence_needs_padding() {
let mut rng = create_rng(Some(42));
let seq = vec![b'A', b'C'];
let padded = pad_sequence(seq, 6, &mut rng);
assert_eq!(padded.len(), 6);
assert_eq!(&padded[0..2], b"AC");
}
#[test]
fn test_build_record_with_mi_tag() {
let seq = b"ACGT";
let quals = vec![30, 30, 30, 30];
let raw = build_record(
"test_read",
seq,
&quals,
0,
100,
60,
true,
false,
200,
150,
"AAAAAAAA",
"42/A",
"4M",
60,
);
let record = to_record_buf(&raw);
assert!(record.name().is_some());
let flags = record.flags();
assert!(flags.is_segmented());
assert!(flags.is_first_segment());
}
#[test]
fn test_build_record_simplex_mi_tag() {
let seq = b"ACGT";
let quals = vec![30, 30, 30, 30];
let raw = build_record(
"test_read",
seq,
&quals,
0,
100,
60,
true,
false,
200,
150,
"AAAAAAAA",
"42",
"4M",
60,
);
let record = to_record_buf(&raw);
assert!(record.name().is_some());
}
#[test]
fn test_generate_read_pair_records() {
let mut rng = create_rng(Some(42));
let quality_model = PositionQualityModel::default();
let quality_bias = ReadPairQualityBias::default();
let (r1_raw, r2_raw) = generate_read_pair_records(
"read_001",
"ACGTACGT",
"0/A",
0, 1000, 300,
150,
60,
true, &quality_model,
&quality_bias,
&DISABLED_METHYLATION,
None,
&mut rng,
);
let r1 = to_record_buf(&r1_raw);
let r2 = to_record_buf(&r2_raw);
assert!(r1.flags().is_first_segment());
assert!(!r1.flags().is_reverse_complemented());
assert!(r2.flags().is_last_segment());
assert!(r2.flags().is_reverse_complemented());
}
#[test]
fn test_generate_read_pair_records_b_strand() {
let mut rng = create_rng(Some(42));
let quality_model = PositionQualityModel::default();
let quality_bias = ReadPairQualityBias::default();
let (r1_raw, r2_raw) = generate_read_pair_records(
"read_001",
"ACGTACGT",
"5/B",
0, 1000, 300,
150,
60,
false, &quality_model,
&quality_bias,
&DISABLED_METHYLATION,
None,
&mut rng,
);
let r1 = to_record_buf(&r1_raw);
let r2 = to_record_buf(&r2_raw);
assert!(r1.flags().is_first_segment());
assert!(r1.flags().is_reverse_complemented());
assert!(r2.flags().is_last_segment());
assert!(!r2.flags().is_reverse_complemented());
}
#[test]
fn test_generate_read_pair_records_simplex() {
let mut rng = create_rng(Some(42));
let quality_model = PositionQualityModel::default();
let quality_bias = ReadPairQualityBias::default();
let (r1_raw, r2_raw) = generate_read_pair_records(
"read_001",
"ACGTACGT",
"10",
0, 1000, 300,
150,
60,
true, &quality_model,
&quality_bias,
&DISABLED_METHYLATION,
None,
&mut rng,
);
let r1 = to_record_buf(&r1_raw);
let r2 = to_record_buf(&r2_raw);
assert!(r1.flags().is_first_segment());
assert!(!r1.flags().is_reverse_complemented());
assert!(r2.flags().is_last_segment());
assert!(r2.flags().is_reverse_complemented());
}
#[test]
fn test_generate_read_pair_records_small_insert() {
let mut rng = create_rng(Some(42));
let quality_model = PositionQualityModel::default();
let quality_bias = ReadPairQualityBias::default();
let (r1_raw, r2_raw) = generate_read_pair_records(
"read_001",
"ACGTACGT",
"0/A",
0, 1000, 50, 150,
60,
true, &quality_model,
&quality_bias,
&DISABLED_METHYLATION,
None,
&mut rng,
);
let r1 = to_record_buf(&r1_raw);
let r2 = to_record_buf(&r2_raw);
assert!(r1.flags().is_first_segment());
assert!(r2.flags().is_last_segment());
}
#[test]
fn test_generate_read_pair_records_reproducibility() {
let quality_model = PositionQualityModel::default();
let quality_bias = ReadPairQualityBias::default();
let mut rng1 = create_rng(Some(42));
let mut rng2 = create_rng(Some(42));
let (r1_a_raw, r2_a_raw) = generate_read_pair_records(
"read_001",
"ACGTACGT",
"0/A",
0, 1000, 300,
150,
60,
true, &quality_model,
&quality_bias,
&DISABLED_METHYLATION,
None,
&mut rng1,
);
let (r1_b_raw, r2_b_raw) = generate_read_pair_records(
"read_001",
"ACGTACGT",
"0/A",
0, 1000, 300,
150,
60,
true, &quality_model,
&quality_bias,
&DISABLED_METHYLATION,
None,
&mut rng2,
);
assert_eq!(
RawRecordView::new(r1_a_raw.as_ref()).flags(),
RawRecordView::new(r1_b_raw.as_ref()).flags()
);
assert_eq!(
RawRecordView::new(r2_a_raw.as_ref()).flags(),
RawRecordView::new(r2_b_raw.as_ref()).flags()
);
}
#[test]
fn test_build_record_r2_flags() {
let seq = b"ACGT";
let quals = vec![30, 30, 30, 30];
let raw = build_record(
"test_read",
seq,
&quals,
0,
200,
60,
false, true, 100,
-150,
"AAAAAAAA",
"42/B",
"4M",
60,
);
let record = to_record_buf(&raw);
let flags = record.flags();
assert!(flags.is_last_segment());
assert!(!flags.is_first_segment());
assert!(flags.is_reverse_complemented());
assert!(!flags.is_mate_reverse_complemented());
}
#[test]
fn test_build_record_positions() {
let seq = b"ACGTACGT";
let quals = vec![30; 8];
let raw = build_record(
"read", seq, &quals, 0, 1000, 60, true, false, 1100, 200, "AAA", "0/A", "8M", 60,
);
let record = to_record_buf(&raw);
assert_eq!(
record.alignment_start().expect("record should have alignment start").get(),
1001
);
assert_eq!(
record.mate_alignment_start().expect("record should have mate alignment start").get(),
1101
);
assert_eq!(record.template_length(), 200);
}
#[test]
fn test_build_record_mapping_quality() {
let seq = b"ACGT";
let quals = vec![30; 4];
for mapq in [0, 30, 60] {
let raw = build_record(
"read", seq, &quals, 0, 100, mapq, true, false, 200, 100, "AAA", "0", "4M", mapq,
);
let record = to_record_buf(&raw);
assert_eq!(
record.mapping_quality().expect("record should have mapping quality").get(),
mapq
);
}
}
#[test]
fn test_build_record_quality_scores() {
let seq = b"ACGTACGT";
let quals = vec![10, 20, 30, 40, 30, 20, 10, 5];
let raw = build_record(
"read", seq, &quals, 0, 100, 60, true, false, 200, 100, "AAA", "0", "8M", 60,
);
let record = to_record_buf(&raw);
let record_quals: Vec<u8> = record.quality_scores().iter().collect();
assert_eq!(record_quals, quals);
}
#[test]
fn test_pad_sequence_truncates() {
let mut rng = create_rng(Some(42));
let seq = vec![b'A', b'C', b'G', b'T', b'A', b'C'];
let padded = pad_sequence(seq, 3, &mut rng);
assert_eq!(padded, vec![b'A', b'C', b'G']);
}
#[test]
fn test_pad_sequence_empty_to_length() {
let mut rng = create_rng(Some(42));
let seq: Vec<u8> = vec![];
let padded = pad_sequence(seq, 5, &mut rng);
assert_eq!(padded.len(), 5);
for &base in &padded {
assert!(base == b'A' || base == b'C' || base == b'G' || base == b'T');
}
}
#[test]
fn test_reverse_complement_empty() {
let empty: &[u8] = b"";
assert_eq!(reverse_complement(empty), Vec::<u8>::new());
}
#[test]
fn test_reverse_complement_unknown_base() {
assert_eq!(reverse_complement(b"N"), b"N");
assert_eq!(reverse_complement(b"ANCG"), b"CGNT");
}
fn check_collision(num_positions: usize, ref_length: usize) -> Result<(), String> {
let usable_bases = ref_length.saturating_sub(1000);
let bases_per_position = usable_bases as f64 / num_positions as f64;
if bases_per_position < 1.0 {
Err(format!(
"Position collision: {num_positions} positions cannot fit in {ref_length} bp reference ({bases_per_position:.2} bp/position)"
))
} else {
Ok(())
}
}
#[test]
fn test_collision_detection_error() {
let result = check_collision(5_000_000, 10_000_000);
assert!(result.is_ok(), "Should pass with ~1.8 bp/position");
let result = check_collision(10_000_000, 10_000_000);
assert!(result.is_err(), "Should fail with <1 bp/position");
let result = check_collision(20_000_000, 10_000_000);
assert!(result.is_err());
}
#[test]
fn test_collision_detection_success() {
let result = check_collision(5_000_000, 250_000_000);
assert!(result.is_ok());
let result = check_collision(1_000_000, 250_000_000);
assert!(result.is_ok());
}
#[test]
fn test_a_strand_orientation() {
let mut rng = create_rng(Some(42));
let quality_model = PositionQualityModel::default();
let quality_bias = ReadPairQualityBias::default();
let (r1_raw, r2_raw) = generate_read_pair_records(
"read_001",
"ACGTACGT",
"1/A",
0, 1000, 300,
150,
60,
true, &quality_model,
&quality_bias,
&DISABLED_METHYLATION,
None,
&mut rng,
);
let r1 = to_record_buf(&r1_raw);
let r2 = to_record_buf(&r2_raw);
assert!(r1.flags().is_first_segment());
assert!(!r1.flags().is_reverse_complemented());
assert!(r2.flags().is_last_segment());
assert!(r2.flags().is_reverse_complemented());
}
#[test]
fn test_emseq_grouped_reads_strand_conversion() {
let template = b"CACACACACACACACAC"; let config = MethylationConfig {
mode: fgumi_consensus::MethylationMode::EmSeq,
cpg_methylation_rate: 0.75,
conversion_rate: 1.0,
};
let quality_model = PositionQualityModel::default();
let quality_bias = ReadPairQualityBias::default();
let mut rng = create_rng(Some(42));
let (r1_raw, _r2_raw) = generate_read_pair_records(
"test_meth",
"AAAAAAAA",
"1/A",
0, 0, template.len(),
template.len(),
60,
true, &quality_model,
&quality_bias,
&config,
Some(template),
&mut rng,
);
let r1 = to_record_buf(&r1_raw);
let r1_seq: Vec<u8> = r1.sequence().as_ref().to_vec();
for (i, &b) in template.iter().enumerate() {
if i >= r1_seq.len() {
break;
}
if b == b'C' {
assert_eq!(r1_seq[i], b'T', "R1 position {i}: non-CpG C should convert to T");
}
}
}
#[test]
fn test_bottom_strand_flips_conversion_orientation() {
let template = b"CATAGATAGATAGATA"; let emseq_config = MethylationConfig {
mode: fgumi_consensus::MethylationMode::EmSeq,
cpg_methylation_rate: 0.75,
conversion_rate: 1.0,
};
let disabled_config = DISABLED_METHYLATION;
let quality_model = PositionQualityModel::default();
let quality_bias = ReadPairQualityBias::default();
let mut rng_a = create_rng(Some(42));
let (r1_a_raw, _) = generate_read_pair_records(
"test",
"AAAAAAAA",
"1/A",
0, 0, template.len(),
template.len(),
60,
true, &quality_model,
&quality_bias,
&emseq_config,
Some(template),
&mut rng_a,
);
let mut rng_b = create_rng(Some(42));
let (r1_b_no_meth_raw, _) = generate_read_pair_records(
"test",
"AAAAAAAA",
"1/B",
0, 0, template.len(),
template.len(),
60,
false, &quality_model,
&quality_bias,
&disabled_config,
Some(template),
&mut rng_b,
);
let mut rng_b2 = create_rng(Some(42));
let (r1_b_meth_raw, _) = generate_read_pair_records(
"test",
"AAAAAAAA",
"1/B",
0, 0, template.len(),
template.len(),
60,
false, &quality_model,
&quality_bias,
&emseq_config,
Some(template),
&mut rng_b2,
);
let r1_a_seq: Vec<u8> = to_record_buf(&r1_a_raw).sequence().as_ref().to_vec();
let r1_b_seq: Vec<u8> = to_record_buf(&r1_b_meth_raw).sequence().as_ref().to_vec();
assert_ne!(r1_a_seq, r1_b_seq, "Top and bottom strand R1 should differ");
let r1_b_no_meth_seq: Vec<u8> =
to_record_buf(&r1_b_no_meth_raw).sequence().as_ref().to_vec();
assert_ne!(
r1_b_seq, r1_b_no_meth_seq,
"Bottom strand with/without methylation should differ"
);
}
#[test]
fn test_grouped_reads_uses_real_ref_coordinates() {
use std::io::Write as IoWrite;
use tempfile::NamedTempFile;
let mut fasta = NamedTempFile::new().unwrap();
writeln!(fasta, ">chr1").unwrap();
fasta.write_all(&b"ACGT".repeat(500)).unwrap();
writeln!(fasta).unwrap();
writeln!(fasta, ">chr2").unwrap();
fasta.write_all(&b"CCGG".repeat(375)).unwrap();
writeln!(fasta).unwrap();
fasta.flush().unwrap();
let ref_genome = ReferenceGenome::load(fasta.path()).unwrap();
let mut rng = create_rng(Some(42));
let positions = ref_genome.sample_positions(10, &mut rng);
let params = GenerationParams {
read_length: 50,
umi_length: 8,
mapq: 60,
duplex: false,
min_family_size: 1,
quality_model: PositionQualityModel::default(),
quality_bias: ReadPairQualityBias::default(),
family_dist: FamilySizeDistribution::log_normal(1.0, 0.1),
insert_model: InsertSizeModel::new(100.0, 5.0, 80, 120),
strand_bias_model: StrandBiasModel::no_bias(),
methylation: DISABLED_METHYLATION,
};
let (chrom_idx, local_pos) = positions[0];
let (pairs, eff_chrom, _eff_pos) =
generate_molecule_reads(0, 42, chrom_idx, local_pos, ¶ms, &ref_genome);
assert!(!pairs.is_empty());
for (r1, r2, _, _, _, _, _) in &pairs {
assert_eq!(RawRecordView::new(r1.as_ref()).ref_id() as usize, eff_chrom);
assert_eq!(RawRecordView::new(r2.as_ref()).ref_id() as usize, eff_chrom);
}
}
#[test]
fn test_duplex_strand_coin_flip_produces_both_orientations_for_a_reads() {
use std::io::Write as IoWrite;
use tempfile::NamedTempFile;
let mut fasta = NamedTempFile::new().unwrap();
writeln!(fasta, ">chr1").unwrap();
fasta.write_all(&b"ACGT".repeat(500)).unwrap();
writeln!(fasta).unwrap();
fasta.flush().unwrap();
let ref_genome = ReferenceGenome::load(fasta.path()).unwrap();
let params = GenerationParams {
read_length: 50,
umi_length: 8,
mapq: 60,
duplex: true,
min_family_size: 2,
quality_model: PositionQualityModel::default(),
quality_bias: ReadPairQualityBias::default(),
family_dist: FamilySizeDistribution::log_normal(2.0, 0.5),
insert_model: InsertSizeModel::new(100.0, 5.0, 80, 120),
strand_bias_model: StrandBiasModel::no_bias(),
methylation: DISABLED_METHYLATION,
};
let mut a_saw_top = false;
let mut a_saw_bottom = false;
for seed in 0u64..100 {
let (pairs, _chrom, _pos) =
generate_molecule_reads(seed as usize, seed, 0, 500, ¶ms, &ref_genome);
for (_, _, _, _, mi_tag, is_top, _) in &pairs {
if mi_tag.ends_with("/A") {
if *is_top {
a_saw_top = true;
} else {
a_saw_bottom = true;
}
}
}
}
assert!(a_saw_top, "A reads should sometimes be F1R2 (top strand)");
assert!(a_saw_bottom, "A reads should sometimes be R1F2 (bottom strand)");
}
}