use noodles::sam::alignment::record::cigar::op::{Kind, Op};
use noodles::sam::alignment::record_buf::Cigar;
use rand::Rng;
use crate::error_model::{self, ErrorModel, ReadEnd};
use crate::fragment::{Fragment, extract_read_bases};
use crate::read_naming::{TruthAlignment, encoded_pe_name, encoded_se_name, simple_name};
#[derive(Debug, Clone)]
pub struct SimulatedRead {
pub name: String,
pub bases: Vec<u8>,
pub qualities: Vec<u8>,
}
#[derive(Debug)]
pub struct ReadPair {
pub read1: SimulatedRead,
pub read2: Option<SimulatedRead>,
pub r1_truth: TruthAlignment,
pub r2_truth: Option<TruthAlignment>,
pub r1_cigar: Cigar,
pub r2_cigar: Option<Cigar>,
}
#[allow(clippy::too_many_arguments)] pub fn generate_read_pair(
fragment: &Fragment,
contig_name: &str,
read_num: u64,
read_length: usize,
paired: bool,
adapter_r1: &[u8],
adapter_r2: &[u8],
model: &impl ErrorModel,
simple_names: bool,
rng: &mut impl Rng,
) -> ReadPair {
let frag_len = fragment.bases.len();
let genomic = frag_len.min(read_length);
let adapter_bases = read_length.saturating_sub(genomic);
let right_start = frag_len.saturating_sub(genomic);
let r1_negative_strand = !fragment.is_forward;
let mut r1_bases =
extract_read_bases(&fragment.bases, read_length, adapter_r1, r1_negative_strand);
let (r1_errors, r1_quals) =
error_model::apply_errors(model, &mut r1_bases, ReadEnd::Read1, rng);
let r1_positions = if r1_negative_strand {
&fragment.ref_positions[right_start..frag_len]
} else {
&fragment.ref_positions[..genomic]
};
let r1_cigar = cigar_from_ref_positions(r1_positions, adapter_bases, r1_negative_strand);
let r1_ref_pos = if fragment.ref_positions.is_empty() {
0
} else if r1_negative_strand {
fragment.ref_positions[right_start] + 1
} else {
fragment.ref_positions[0] + 1
};
#[expect(clippy::cast_possible_truncation, reason = "fragment length fits in u32")]
let fragment_length = frag_len as u32;
let r1_truth = TruthAlignment {
contig: contig_name.to_string(),
position: r1_ref_pos,
is_forward: fragment.is_forward,
haplotype: fragment.haplotype_index,
fragment_length,
n_errors: r1_errors,
};
if !paired {
let name =
if simple_names { simple_name(read_num) } else { encoded_se_name(read_num, &r1_truth) };
return ReadPair {
read1: SimulatedRead { name, bases: r1_bases, qualities: r1_quals },
read2: None,
r1_truth,
r2_truth: None,
r1_cigar,
r2_cigar: None,
};
}
let r2_negative_strand = fragment.is_forward;
let mut r2_bases =
extract_read_bases(&fragment.bases, read_length, adapter_r2, r2_negative_strand);
let (r2_errors, r2_quals) =
error_model::apply_errors(model, &mut r2_bases, ReadEnd::Read2, rng);
let r2_positions = if r2_negative_strand {
&fragment.ref_positions[right_start..frag_len]
} else {
&fragment.ref_positions[..genomic]
};
let r2_cigar = cigar_from_ref_positions(r2_positions, adapter_bases, r2_negative_strand);
let r2_ref_pos = if fragment.ref_positions.is_empty() {
0
} else if r2_negative_strand {
fragment.ref_positions[right_start] + 1
} else {
fragment.ref_positions[0] + 1
};
let r2_truth = TruthAlignment {
contig: contig_name.to_string(),
position: r2_ref_pos,
is_forward: !fragment.is_forward,
haplotype: fragment.haplotype_index,
fragment_length,
n_errors: r2_errors,
};
let name = if simple_names {
simple_name(read_num)
} else {
encoded_pe_name(read_num, &r1_truth, &r2_truth)
};
ReadPair {
read1: SimulatedRead { name: name.clone(), bases: r1_bases, qualities: r1_quals },
read2: Some(SimulatedRead { name, bases: r2_bases, qualities: r2_quals }),
r1_truth,
r2_truth: Some(r2_truth),
r1_cigar,
r2_cigar: Some(r2_cigar),
}
}
#[must_use]
pub fn cigar_from_ref_positions(
positions: &[u32],
adapter_bases: usize,
negative_strand: bool,
) -> Cigar {
let mut ops: Vec<Op> = Vec::new();
if positions.is_empty() {
if adapter_bases > 0 {
ops.push(Op::new(Kind::SoftClip, adapter_bases));
}
return Cigar::from(ops);
}
let mut match_run: usize = 1; let mut ins_run: usize = 0;
for i in 1..positions.len() {
let prev = positions[i - 1];
let curr = positions[i];
if curr == prev + 1 {
if ins_run > 0 {
ops.push(Op::new(Kind::Insertion, ins_run));
ins_run = 0;
}
match_run += 1;
} else if curr == prev {
if match_run > 0 {
ops.push(Op::new(Kind::Match, match_run));
match_run = 0;
}
ins_run += 1;
} else {
if ins_run > 0 {
ops.push(Op::new(Kind::Insertion, ins_run));
ins_run = 0;
}
if match_run > 0 {
ops.push(Op::new(Kind::Match, match_run));
}
let gap = curr.saturating_sub(prev).saturating_sub(1);
if gap > 0 {
ops.push(Op::new(Kind::Deletion, gap as usize));
}
match_run = 1; }
}
if ins_run > 0 {
ops.push(Op::new(Kind::Insertion, ins_run));
}
if match_run > 0 {
ops.push(Op::new(Kind::Match, match_run));
}
if adapter_bases > 0 {
if negative_strand {
ops.insert(0, Op::new(Kind::SoftClip, adapter_bases));
} else {
ops.push(Op::new(Kind::SoftClip, adapter_bases));
}
}
Cigar::from(ops)
}
#[must_use]
pub fn cigar_to_string(cigar: &Cigar) -> String {
use std::fmt::Write;
cigar.as_ref().iter().fold(String::new(), |mut s, op| {
let kind_char = match op.kind() {
Kind::Match => 'M',
Kind::Insertion => 'I',
Kind::Deletion => 'D',
Kind::SoftClip => 'S',
Kind::HardClip => 'H',
Kind::Skip => 'N',
Kind::Pad => 'P',
Kind::SequenceMatch => '=',
Kind::SequenceMismatch => 'X',
};
let _ = write!(s, "{}{kind_char}", op.len());
s
})
}
#[cfg(test)]
mod tests {
use rand::SeedableRng;
use rand::rngs::SmallRng;
use super::*;
use crate::error_model::illumina::IlluminaErrorModel;
fn test_fragment(bases: &[u8], ref_start: u32) -> Fragment {
#[expect(clippy::cast_possible_truncation, reason = "test data is small")]
let ref_positions: Vec<u32> = (ref_start..ref_start + bases.len() as u32).collect();
Fragment {
bases: bases.to_vec(),
ref_positions,
ref_start,
is_forward: true,
haplotype_index: 0,
}
}
#[test]
fn test_cigar_all_match() {
let cigar = cigar_from_ref_positions(&[0, 1, 2, 3, 4], 0, false);
assert_eq!(cigar_to_string(&cigar), "5M");
}
#[test]
fn test_cigar_with_insertion() {
let cigar = cigar_from_ref_positions(&[0, 1, 2, 2, 2, 3, 4], 0, false);
assert_eq!(cigar_to_string(&cigar), "3M2I2M");
}
#[test]
fn test_cigar_with_deletion() {
let cigar = cigar_from_ref_positions(&[0, 1, 2, 5, 6], 0, false);
assert_eq!(cigar_to_string(&cigar), "3M2D2M");
}
#[test]
fn test_cigar_with_adapter_softclip_forward() {
let cigar = cigar_from_ref_positions(&[0, 1, 2], 2, false);
assert_eq!(cigar_to_string(&cigar), "3M2S");
}
#[test]
fn test_cigar_with_adapter_softclip_negative_strand() {
let cigar = cigar_from_ref_positions(&[0, 1, 2], 2, true);
assert_eq!(cigar_to_string(&cigar), "2S3M");
}
#[test]
fn test_cigar_all_adapter() {
let cigar = cigar_from_ref_positions(&[], 5, false);
assert_eq!(cigar_to_string(&cigar), "5S");
}
#[test]
fn test_cigar_with_insertion_and_deletion() {
let cigar = cigar_from_ref_positions(&[0, 1, 2, 2, 5, 6], 0, false);
assert_eq!(cigar_to_string(&cigar), "3M1I2D2M");
}
#[test]
fn test_cigar_with_adapter_and_deletion() {
let cigar = cigar_from_ref_positions(&[0, 1, 4, 5], 3, false);
assert_eq!(cigar_to_string(&cigar), "2M2D2M3S");
}
#[test]
fn test_cigar_single_base() {
let cigar = cigar_from_ref_positions(&[42], 0, false);
assert_eq!(cigar_to_string(&cigar), "1M");
}
#[test]
fn test_cigar_high_positions() {
let cigar = cigar_from_ref_positions(&[100, 101, 102, 103, 104], 0, true);
assert_eq!(cigar_to_string(&cigar), "5M");
}
#[test]
fn test_generate_pe_read_pair() {
let fragment = test_fragment(b"ACGTACGTACGTACGTACGT", 100);
let model = IlluminaErrorModel::new(10, 0.0, 0.0);
let mut rng = SmallRng::seed_from_u64(42);
let pair = generate_read_pair(
&fragment, "chr1", 1, 10, true, b"ADAPTER", b"ADAPTER", &model, false, &mut rng,
);
assert_eq!(pair.read1.bases, b"ACGTACGTAC");
assert!(pair.read2.is_some());
assert_eq!(pair.r1_truth.position, 101);
assert!(pair.r2_truth.is_some());
assert_eq!(cigar_to_string(&pair.r1_cigar), "10M");
assert_eq!(cigar_to_string(pair.r2_cigar.as_ref().unwrap()), "10M");
}
#[test]
fn test_generate_se_read() {
let fragment = test_fragment(b"ACGTACGTAC", 100);
let model = IlluminaErrorModel::new(10, 0.0, 0.0);
let mut rng = SmallRng::seed_from_u64(42);
let pair = generate_read_pair(
&fragment, "chr1", 5, 10, false, b"ADAPTER", b"ADAPTER", &model, false, &mut rng,
);
assert!(pair.read2.is_none());
assert!(pair.r2_cigar.is_none());
assert_eq!(cigar_to_string(&pair.r1_cigar), "10M");
}
#[test]
fn test_adapter_cigar_softclip() {
let fragment = test_fragment(b"AC", 0);
let model = IlluminaErrorModel::new(5, 0.0, 0.0);
let mut rng = SmallRng::seed_from_u64(42);
let pair = generate_read_pair(
&fragment, "chr1", 1, 5, true, b"TTTTT", b"GGGGG", &model, false, &mut rng,
);
assert_eq!(cigar_to_string(&pair.r1_cigar), "2M3S");
assert_eq!(cigar_to_string(pair.r2_cigar.as_ref().unwrap()), "3S2M");
}
#[test]
fn test_simple_name_mode() {
let fragment = test_fragment(b"ACGT", 0);
let model = IlluminaErrorModel::new(4, 0.0, 0.0);
let mut rng = SmallRng::seed_from_u64(42);
let pair =
generate_read_pair(&fragment, "chr1", 42, 4, true, b"A", b"A", &model, true, &mut rng);
assert_eq!(pair.read1.name, "holodeck::42");
}
#[test]
fn test_quality_scores_correct_length() {
let fragment = test_fragment(b"ACGTACGTAC", 0);
let model = IlluminaErrorModel::new(10, 0.001, 0.01);
let mut rng = SmallRng::seed_from_u64(42);
let pair =
generate_read_pair(&fragment, "chr1", 1, 10, true, b"A", b"A", &model, false, &mut rng);
assert_eq!(pair.read1.qualities.len(), 10);
assert_eq!(pair.read2.as_ref().unwrap().qualities.len(), 10);
}
}