use std::path::PathBuf;
use anyhow::Result;
use clap::Args;
use noodles::sam::Header;
use noodles::sam::alignment::record::cigar::op::Kind;
use riker_derive::MetricDocs;
use serde::{Deserialize, Serialize};
use crate::collector::{Collector, drive_collector_single_threaded};
use crate::commands::command::Command;
use crate::commands::common::{InputOptions, OptionalReferenceOptions, OutputOptions};
use crate::counter::Counter;
use crate::fasta::Fasta;
use crate::math::{safe_div, safe_div_f};
use crate::metrics::write_tsv;
use crate::progress::ProgressLogger;
use crate::sam::alignment_reader::AlignmentReader;
use crate::sam::pair_orientation::{PairOrientation, get_pair_orientation};
use crate::sam::record_utils::{derive_sample, get_integer_tag};
use crate::sam::riker_record::{RikerRecord, RikerRecordRequirements};
use crate::simd;
pub const METRICS_SUFFIX: &str = ".alignment-metrics.txt";
const BASE_QUALITY_THRESHOLD: u8 = 20;
#[riker_derive::multi_options("aln", "Alignment Options")]
#[derive(Args, Debug, Clone)]
#[command()]
pub struct AlignmentOptions {
#[arg(long, default_value_t = AlignmentOptions::DEFAULT_MAX_INSERT_SIZE)]
pub max_insert_size: u32,
#[arg(long, default_value_t = AlignmentOptions::DEFAULT_MIN_MAPQ)]
pub min_mapq: u8,
}
impl AlignmentOptions {
const DEFAULT_MAX_INSERT_SIZE: u32 = 100_000;
const DEFAULT_MIN_MAPQ: u8 = 20;
}
impl Default for AlignmentOptions {
fn default() -> Self {
Self { max_insert_size: Self::DEFAULT_MAX_INSERT_SIZE, min_mapq: Self::DEFAULT_MIN_MAPQ }
}
}
#[derive(Args, Debug, Clone)]
#[command(
long_about,
after_long_help = "\
Examples:
riker alignment -i input.bam -o out_prefix
riker alignment -i input.bam -o out_prefix -r ref.fa"
)]
pub struct Alignment {
#[command(flatten)]
pub input: InputOptions,
#[command(flatten)]
pub output: OutputOptions,
#[command(flatten)]
pub reference: OptionalReferenceOptions,
#[command(flatten)]
pub options: AlignmentOptions,
}
impl Command for Alignment {
fn execute(&self) -> Result<()> {
let mut reader =
AlignmentReader::open(&self.input.input, self.reference.reference.as_deref())?;
let mut collector = AlignmentCollector::new(
&self.input.input,
&self.output.output,
self.reference.reference.clone(),
&self.options,
);
let mut progress = ProgressLogger::new("alignment", "reads", 5_000_000);
drive_collector_single_threaded(&mut reader, &mut collector, &mut progress)
}
}
pub struct AlignmentCollector {
input: PathBuf,
metrics_path: PathBuf,
reference_path: Option<PathBuf>,
max_insert_size: u32,
min_mapq: u8,
sample: String,
first: CategoryAccumulator,
second: CategoryAccumulator,
}
impl AlignmentCollector {
#[must_use]
pub fn new(
input: &std::path::Path,
prefix: &std::path::Path,
reference_path: Option<PathBuf>,
options: &AlignmentOptions,
) -> Self {
let metrics_path = super::command::output_path(prefix, METRICS_SUFFIX);
Self {
input: input.to_path_buf(),
metrics_path,
reference_path,
max_insert_size: options.max_insert_size,
min_mapq: options.min_mapq,
sample: String::new(),
first: CategoryAccumulator::new("read1"),
second: CategoryAccumulator::new("read2"),
}
}
}
impl Collector for AlignmentCollector {
fn initialize(&mut self, header: &Header) -> Result<()> {
if let Some(ref ref_path) = self.reference_path {
let fasta = Fasta::from_path(ref_path)?;
fasta.validate_bam_header(header)?;
}
self.sample = derive_sample(&self.input, header);
Ok(())
}
fn accept(&mut self, record: &RikerRecord, _header: &Header) -> Result<()> {
let flags = record.flags();
if flags.is_secondary() || flags.is_supplementary() {
return Ok(());
}
if flags.is_segmented() {
if flags.is_first_segment() {
self.first.process_record(record, self.min_mapq, self.max_insert_size);
} else {
self.second.process_record(record, self.min_mapq, self.max_insert_size);
}
} else {
self.first.process_record(record, self.min_mapq, self.max_insert_size);
}
Ok(())
}
fn finish(&mut self) -> Result<()> {
let mut metrics: Vec<AlignmentSummaryMetric> = Vec::new();
if self.second.total_reads > 0 {
let mut first_metric = self.first.compute_metric();
let mut second_metric = self.second.compute_metric();
let combined = self.first.merge(&self.second);
let mut pair_metric = combined.compute_metric();
pair_metric.bad_cycles = first_metric.bad_cycles + second_metric.bad_cycles;
first_metric.sample.clone_from(&self.sample);
second_metric.sample.clone_from(&self.sample);
pair_metric.sample.clone_from(&self.sample);
metrics.push(first_metric);
metrics.push(second_metric);
metrics.push(pair_metric);
} else {
let mut m = self.first.compute_metric();
m.sample.clone_from(&self.sample);
metrics.push(m);
}
write_tsv(&self.metrics_path, &metrics)?;
Ok(())
}
fn name(&self) -> &'static str {
"alignment"
}
fn field_needs(&self) -> RikerRecordRequirements {
RikerRecordRequirements::NONE
.with_sequence()
.with_aux_tag(*b"NM")
.with_aux_tag(*b"MQ")
.with_aux_tag_presence(*b"SA")
}
}
#[derive(Debug, Serialize, Deserialize, MetricDocs)]
pub struct AlignmentSummaryMetric {
pub sample: String,
pub category: String,
pub total_reads: u64,
pub aligned_reads: u64,
#[serde(serialize_with = "crate::metrics::serialize_f64_5dp")]
pub frac_aligned: f64,
pub hq_aligned_reads: u64,
pub hq_aligned_bases: u64,
pub hq_aligned_q20_bases: u64,
#[serde(serialize_with = "crate::metrics::serialize_f64_2dp")]
pub hq_median_mismatches: f64,
#[serde(serialize_with = "crate::metrics::serialize_f64_6dp")]
pub mismatch_rate: f64,
#[serde(serialize_with = "crate::metrics::serialize_f64_6dp")]
pub hq_mismatch_rate: f64,
#[serde(serialize_with = "crate::metrics::serialize_f64_6dp")]
pub indel_rate: f64,
#[serde(serialize_with = "crate::metrics::serialize_f64_2dp")]
pub mean_read_length: f64,
#[serde(serialize_with = "crate::metrics::serialize_f64_2dp")]
pub sd_read_length: f64,
#[serde(serialize_with = "crate::metrics::serialize_f64_2dp")]
pub median_read_length: f64,
#[serde(serialize_with = "crate::metrics::serialize_f64_2dp")]
pub mad_read_length: f64,
pub min_read_length: u64,
pub max_read_length: u64,
#[serde(serialize_with = "crate::metrics::serialize_f64_2dp")]
pub mean_aligned_read_length: f64,
pub aligned_reads_in_pairs: u64,
#[serde(serialize_with = "crate::metrics::serialize_f64_5dp")]
pub frac_aligned_in_pairs: f64,
pub reads_improperly_paired: u64,
#[serde(serialize_with = "crate::metrics::serialize_f64_5dp")]
pub frac_reads_improperly_paired: f64,
pub bad_cycles: u64,
#[serde(serialize_with = "crate::metrics::serialize_f64_5dp")]
pub strand_balance: f64,
#[serde(serialize_with = "crate::metrics::serialize_f64_5dp")]
pub frac_chimeras: f64,
#[serde(serialize_with = "crate::metrics::serialize_f64_5dp")]
pub frac_softclipped_reads: f64,
#[serde(serialize_with = "crate::metrics::serialize_f64_5dp")]
pub frac_hardclipped_reads: f64,
#[serde(serialize_with = "crate::metrics::serialize_f64_2dp")]
pub mean_3prime_softclipped_bases: f64,
}
struct CategoryAccumulator {
category: &'static str,
total_reads: u64,
pf_reads: u64,
aligned_reads: u64,
pf_aligned_bases: u64, hq_aligned_reads: u64,
hq_aligned_bases: u64,
hq_aligned_q20_bases: u64,
aligned_reads_in_pairs: u64,
reads_improperly_paired: u64,
num_positive_strand: u64,
chimeras: u64,
chimeras_denominator: u64,
num_soft_clipped: u64, num_hard_clipped: u64, num_3prime_soft_clipped_bases: u64,
num_reads_with_3prime_softclips: u64,
indels: u64,
total_nm: u64, hq_nm: u64, hq_nm_histogram: Counter<u64>,
read_length_histogram: Counter<u64>,
read_length_sum: u64,
read_length_sum_sq: u64,
aligned_read_length_sum: u64,
bad_cycle_nocalls: Counter<u64>,
}
impl CategoryAccumulator {
fn new(category: &'static str) -> Self {
Self {
category,
total_reads: 0,
pf_reads: 0,
aligned_reads: 0,
pf_aligned_bases: 0,
hq_aligned_reads: 0,
hq_aligned_bases: 0,
hq_aligned_q20_bases: 0,
aligned_reads_in_pairs: 0,
reads_improperly_paired: 0,
num_positive_strand: 0,
chimeras: 0,
chimeras_denominator: 0,
num_soft_clipped: 0,
num_hard_clipped: 0,
num_3prime_soft_clipped_bases: 0,
num_reads_with_3prime_softclips: 0,
indels: 0,
total_nm: 0,
hq_nm: 0,
hq_nm_histogram: Counter::new(),
read_length_histogram: Counter::new(),
read_length_sum: 0,
read_length_sum_sq: 0,
aligned_read_length_sum: 0,
bad_cycle_nocalls: Counter::new(),
}
}
fn process_record(&mut self, record: &RikerRecord, min_mapq: u8, max_insert_size: u32) {
let flags = record.flags();
self.total_reads += 1;
if flags.is_qc_fail() {
return;
}
self.pf_reads += 1;
let seq: &[u8] = record.sequence();
let read_len = seq.len() as u64;
self.read_length_histogram.count(read_len);
self.read_length_sum += read_len;
self.read_length_sum_sq += read_len * read_len;
self.num_hard_clipped += sum_cigar_op(record, Kind::HardClip);
track_bad_cycles(
&mut self.bad_cycle_nocalls,
seq,
read_len,
flags.is_reverse_complemented(),
);
if flags.is_unmapped() {
return;
}
let mapq = record.mapping_quality().map_or(255u8, u8::from);
let is_hq = mapq >= min_mapq;
let negative_strand = flags.is_reverse_complemented();
let cs = self.process_cigar(record, is_hq, negative_strand);
self.num_soft_clipped += cs.soft_clip_bases;
if cs.three_prime_soft_clip > 0 {
self.num_3prime_soft_clipped_bases += cs.three_prime_soft_clip;
self.num_reads_with_3prime_softclips += 1;
}
self.aligned_read_length_sum += cs.aligned_read_length;
self.aligned_reads += 1;
if !negative_strand {
self.num_positive_strand += 1;
}
if is_hq {
self.hq_aligned_reads += 1;
}
let nm = u64::from(get_integer_tag(record, *b"NM").unwrap_or(0));
let indel_bases = cs.insertion_bases + cs.deletion_bases;
let adjusted_nm = nm.saturating_sub(indel_bases);
self.total_nm += adjusted_nm;
if is_hq {
self.hq_nm += adjusted_nm;
self.hq_nm_histogram.count(adjusted_nm);
}
if flags.is_segmented() && !flags.is_mate_unmapped() {
self.aligned_reads_in_pairs += 1;
if !flags.is_properly_segmented() {
self.reads_improperly_paired += 1;
}
let mate_mq = get_integer_tag(record, *b"MQ");
let include_chimera_check = mate_mq.is_none_or(|mq| {
mq >= u32::from(min_mapq) && u32::from(mapq) >= u32::from(min_mapq)
});
if include_chimera_check {
self.chimeras_denominator += 1;
if is_chimeric_pair(record, max_insert_size) {
self.chimeras += 1;
}
}
} else if is_hq {
self.chimeras_denominator += 1;
if record.has_aux_tag(*b"SA") {
self.chimeras += 1;
}
}
}
fn process_cigar(
&mut self,
record: &RikerRecord,
is_hq: bool,
negative_strand: bool,
) -> CigarStats {
let qual_bytes: &[u8] = record.quality_scores();
let has_quals = !qual_bytes.is_empty();
let mut read_pos: usize = 0;
let mut soft_clip_bases = 0u64;
let mut aligned_read_length = 0u64;
let mut insertion_bases = 0u64;
let mut deletion_bases = 0u64;
let mut seen_non_clip = false;
let mut leading_sc = 0u64;
let mut trailing_sc = 0u64;
for op in record.cigar_ops() {
let len = op.len();
match op.kind() {
Kind::Match | Kind::SequenceMatch | Kind::SequenceMismatch => {
seen_non_clip = true;
aligned_read_length += len as u64;
self.pf_aligned_bases += len as u64;
if is_hq {
self.hq_aligned_bases += len as u64;
if has_quals {
let start = read_pos.min(qual_bytes.len());
let end = read_pos.saturating_add(len).min(qual_bytes.len());
let window = &qual_bytes[start..end];
self.hq_aligned_q20_bases +=
simd::count_bases_ge_q(window, BASE_QUALITY_THRESHOLD);
}
}
read_pos += len;
}
Kind::Insertion => {
seen_non_clip = true;
aligned_read_length += len as u64;
insertion_bases += len as u64;
self.indels += 1;
read_pos += len;
}
Kind::Deletion => {
seen_non_clip = true;
deletion_bases += len as u64;
self.indels += 1;
}
Kind::SoftClip => {
soft_clip_bases += len as u64;
if seen_non_clip {
trailing_sc += len as u64;
} else {
leading_sc += len as u64;
}
read_pos += len;
}
Kind::HardClip | Kind::Skip | Kind::Pad => {}
}
}
let three_prime_soft_clip = if negative_strand { leading_sc } else { trailing_sc };
CigarStats {
soft_clip_bases,
three_prime_soft_clip,
aligned_read_length,
insertion_bases,
deletion_bases,
}
}
fn merge(&self, other: &Self) -> Self {
let mut read_length_histogram = self.read_length_histogram.clone();
read_length_histogram += &other.read_length_histogram;
let mut hq_nm_histogram = self.hq_nm_histogram.clone();
hq_nm_histogram += &other.hq_nm_histogram;
let mut bad_cycle_nocalls = self.bad_cycle_nocalls.clone();
bad_cycle_nocalls += &other.bad_cycle_nocalls;
Self {
category: "pair",
total_reads: self.total_reads + other.total_reads,
pf_reads: self.pf_reads + other.pf_reads,
aligned_reads: self.aligned_reads + other.aligned_reads,
pf_aligned_bases: self.pf_aligned_bases + other.pf_aligned_bases,
hq_aligned_reads: self.hq_aligned_reads + other.hq_aligned_reads,
hq_aligned_bases: self.hq_aligned_bases + other.hq_aligned_bases,
hq_aligned_q20_bases: self.hq_aligned_q20_bases + other.hq_aligned_q20_bases,
aligned_reads_in_pairs: self.aligned_reads_in_pairs + other.aligned_reads_in_pairs,
reads_improperly_paired: self.reads_improperly_paired + other.reads_improperly_paired,
num_positive_strand: self.num_positive_strand + other.num_positive_strand,
chimeras: self.chimeras + other.chimeras,
chimeras_denominator: self.chimeras_denominator + other.chimeras_denominator,
num_soft_clipped: self.num_soft_clipped + other.num_soft_clipped,
num_hard_clipped: self.num_hard_clipped + other.num_hard_clipped,
num_3prime_soft_clipped_bases: self.num_3prime_soft_clipped_bases
+ other.num_3prime_soft_clipped_bases,
num_reads_with_3prime_softclips: self.num_reads_with_3prime_softclips
+ other.num_reads_with_3prime_softclips,
indels: self.indels + other.indels,
total_nm: self.total_nm + other.total_nm,
hq_nm: self.hq_nm + other.hq_nm,
hq_nm_histogram,
read_length_histogram,
read_length_sum: self.read_length_sum + other.read_length_sum,
read_length_sum_sq: self.read_length_sum_sq + other.read_length_sum_sq,
aligned_read_length_sum: self.aligned_read_length_sum + other.aligned_read_length_sum,
bad_cycle_nocalls,
}
}
fn compute_metric(&self) -> AlignmentSummaryMetric {
let pf_reads = self.pf_reads;
let pf_bases = self.read_length_sum;
let mean_rl = safe_div_f(self.read_length_sum as f64, pf_reads as f64);
let var_rl = if pf_reads > 0 {
(self.read_length_sum_sq as f64 / pf_reads as f64) - mean_rl * mean_rl
} else {
0.0
};
let sd_rl = var_rl.max(0.0).sqrt();
let (median_rl, mad_rl) = self.read_length_histogram.median_and_mad();
let min_rl = self.read_length_histogram.min().unwrap_or(0);
let max_read_len = self.read_length_histogram.max().unwrap_or(0);
let mean_aligned_len =
safe_div_f(self.aligned_read_length_sum as f64, self.aligned_reads as f64);
let mismatch_rate = safe_div_f(self.total_nm as f64, self.pf_aligned_bases as f64);
let hq_mismatch_rate = safe_div_f(self.hq_nm as f64, self.hq_aligned_bases as f64);
let hq_median_mismatches = self.hq_nm_histogram.median();
let indel_rate = safe_div_f(self.indels as f64, self.pf_aligned_bases as f64);
let frac_aligned = safe_div(self.aligned_reads, pf_reads);
let frac_aligned_in_pairs = safe_div(self.aligned_reads_in_pairs, self.aligned_reads);
let frac_improper = safe_div(self.reads_improperly_paired, self.aligned_reads);
let strand_balance = safe_div(self.num_positive_strand, self.aligned_reads);
let frac_chimeras = safe_div(self.chimeras, self.chimeras_denominator);
let frac_softclipped_reads = safe_div_f(self.num_soft_clipped as f64, pf_bases as f64);
let frac_hardclipped_reads = safe_div_f(self.num_hard_clipped as f64, pf_bases as f64);
let avg_3prime = safe_div_f(
self.num_3prime_soft_clipped_bases as f64,
self.num_reads_with_3prime_softclips as f64,
);
let bad_cycles = count_bad_cycles(&self.bad_cycle_nocalls, self.total_reads);
AlignmentSummaryMetric {
sample: String::new(),
category: self.category.to_string(),
total_reads: self.total_reads,
aligned_reads: self.aligned_reads,
frac_aligned,
hq_aligned_reads: self.hq_aligned_reads,
hq_aligned_bases: self.hq_aligned_bases,
hq_aligned_q20_bases: self.hq_aligned_q20_bases,
hq_median_mismatches,
mismatch_rate,
hq_mismatch_rate,
indel_rate,
mean_read_length: mean_rl,
sd_read_length: sd_rl,
median_read_length: median_rl,
mad_read_length: mad_rl,
min_read_length: min_rl,
max_read_length: max_read_len,
mean_aligned_read_length: mean_aligned_len,
aligned_reads_in_pairs: self.aligned_reads_in_pairs,
frac_aligned_in_pairs,
reads_improperly_paired: self.reads_improperly_paired,
frac_reads_improperly_paired: frac_improper,
bad_cycles,
strand_balance,
frac_chimeras,
frac_softclipped_reads,
frac_hardclipped_reads,
mean_3prime_softclipped_bases: avg_3prime,
}
}
}
struct CigarStats {
soft_clip_bases: u64,
three_prime_soft_clip: u64,
aligned_read_length: u64,
insertion_bases: u64,
deletion_bases: u64,
}
fn sum_cigar_op(record: &RikerRecord, target: Kind) -> u64 {
record.cigar_ops().filter(|op| op.kind() == target).map(|op| op.len() as u64).sum()
}
fn is_chimeric_pair(record: &RikerRecord, max_insert_size: u32) -> bool {
if record.reference_sequence_id() != record.mate_reference_sequence_id() {
return true;
}
let tlen = record.template_length().unsigned_abs();
if tlen > max_insert_size {
return true;
}
match get_pair_orientation(record) {
Some(PairOrientation::Fr) | None => {}
Some(_) => return true,
}
if record.has_aux_tag(*b"SA") {
return true;
}
false
}
fn track_bad_cycles(
bad_cycle_nocalls: &mut Counter<u64>,
bases: &[u8],
read_len: u64,
negative_strand: bool,
) {
for (i, &base) in bases.iter().enumerate() {
if base == b'N' {
let cycle = if negative_strand { read_len - i as u64 } else { i as u64 + 1 };
bad_cycle_nocalls.count(cycle);
}
}
}
fn count_bad_cycles(nocall_map: &Counter<u64>, total_reads: u64) -> u64 {
if total_reads == 0 {
return 0;
}
let total = total_reads as f64;
nocall_map.values().filter(|&&count| count as f64 / total >= 0.8).count() as u64
}
#[cfg(test)]
#[allow(clippy::float_cmp)]
mod tests {
use super::*;
use noodles::core::Position;
use noodles::sam::alignment::RecordBuf;
use noodles::sam::alignment::record::{
Flags, MappingQuality,
cigar::{Op, op::Kind as CigarKind},
};
use noodles::sam::alignment::record_buf::{Cigar, Data, QualityScores, Sequence};
#[allow(clippy::too_many_arguments)]
fn make_record(
flags: Flags,
pos: Option<usize>,
mapq: u8,
cigar: Cigar,
seq: &[u8],
quals: &[u8],
ref_id: Option<usize>,
mate_ref_id: Option<usize>,
tlen: i32,
data: noodles::sam::alignment::record_buf::Data,
) -> RikerRecord {
let mut b = RecordBuf::builder()
.set_flags(flags)
.set_mapping_quality(MappingQuality::new(mapq).expect("mapq"))
.set_cigar(cigar)
.set_sequence(Sequence::from(seq.to_vec()))
.set_quality_scores(QualityScores::from(quals.to_vec()))
.set_template_length(tlen)
.set_data(data);
if let Some(rid) = ref_id {
b = b.set_reference_sequence_id(rid);
}
if let Some(mrid) = mate_ref_id {
b = b.set_mate_reference_sequence_id(mrid);
}
if let Some(p) = pos {
b = b.set_alignment_start(Position::new(p).expect("pos"));
}
RikerRecord::from_alignment_record(&Header::default(), &b.build()).unwrap()
}
fn simple_cigar(len: usize) -> Cigar {
[Op::new(CigarKind::Match, len)].into_iter().collect()
}
#[test]
fn test_sum_cigar_op_soft_clip() {
let cigar: Cigar = [
Op::new(CigarKind::SoftClip, 5),
Op::new(CigarKind::Match, 90),
Op::new(CigarKind::SoftClip, 5),
]
.into_iter()
.collect();
let record = make_record(
Flags::empty(),
Some(1),
60,
cigar,
&[b'A'; 100],
&[30u8; 100],
Some(0),
None,
0,
Data::default(),
);
assert_eq!(sum_cigar_op(&record, Kind::SoftClip), 10);
assert_eq!(sum_cigar_op(&record, Kind::Match), 90);
}
fn cigar_stats(record: &RikerRecord, negative_strand: bool) -> CigarStats {
let mut acc = CategoryAccumulator::new("TEST");
acc.process_cigar(record, false, negative_strand)
}
#[test]
fn test_cigar_stats_match_only() {
let record = make_record(
Flags::empty(),
Some(1),
60,
simple_cigar(100),
&[b'A'; 100],
&[30u8; 100],
Some(0),
None,
0,
Data::default(),
);
let cs = cigar_stats(&record, false);
assert_eq!(cs.aligned_read_length, 100);
assert_eq!(cs.soft_clip_bases, 0);
assert_eq!(cs.three_prime_soft_clip, 0);
assert_eq!(cs.insertion_bases, 0);
assert_eq!(cs.deletion_bases, 0);
}
#[test]
fn test_cigar_stats_with_soft_clip() {
let cigar: Cigar = [
Op::new(CigarKind::SoftClip, 5),
Op::new(CigarKind::Match, 90),
Op::new(CigarKind::SoftClip, 5),
]
.into_iter()
.collect();
let record = make_record(
Flags::empty(),
Some(1),
60,
cigar,
&[b'A'; 100],
&[30u8; 100],
Some(0),
None,
0,
Data::default(),
);
let cs = cigar_stats(&record, false);
assert_eq!(cs.aligned_read_length, 90);
assert_eq!(cs.soft_clip_bases, 10);
assert_eq!(cs.three_prime_soft_clip, 5);
}
#[test]
fn test_cigar_stats_with_insertion() {
let cigar: Cigar = [
Op::new(CigarKind::Match, 80),
Op::new(CigarKind::Insertion, 5),
Op::new(CigarKind::Match, 10),
]
.into_iter()
.collect();
let record = make_record(
Flags::empty(),
Some(1),
60,
cigar,
&[b'A'; 95],
&[30u8; 95],
Some(0),
None,
0,
Data::default(),
);
let cs = cigar_stats(&record, false);
assert_eq!(cs.aligned_read_length, 95);
assert_eq!(cs.insertion_bases, 5);
assert_eq!(cs.deletion_bases, 0);
}
#[test]
fn test_3prime_soft_clip_forward_trailing() {
let cigar: Cigar = [
Op::new(CigarKind::SoftClip, 5),
Op::new(CigarKind::Match, 90),
Op::new(CigarKind::SoftClip, 5),
]
.into_iter()
.collect();
let record = make_record(
Flags::empty(),
Some(1),
60,
cigar,
&[b'A'; 100],
&[30u8; 100],
Some(0),
None,
0,
Data::default(),
);
assert_eq!(cigar_stats(&record, false).three_prime_soft_clip, 5);
}
#[test]
fn test_3prime_soft_clip_reverse_leading() {
let cigar: Cigar = [
Op::new(CigarKind::SoftClip, 5),
Op::new(CigarKind::Match, 90),
Op::new(CigarKind::SoftClip, 5),
]
.into_iter()
.collect();
let record = make_record(
Flags::REVERSE_COMPLEMENTED,
Some(1),
60,
cigar,
&[b'A'; 100],
&[30u8; 100],
Some(0),
None,
0,
Data::default(),
);
assert_eq!(cigar_stats(&record, true).three_prime_soft_clip, 5);
}
#[test]
fn test_3prime_soft_clip_none() {
let cigar: Cigar =
[Op::new(CigarKind::SoftClip, 5), Op::new(CigarKind::Match, 90)].into_iter().collect();
let record = make_record(
Flags::empty(),
Some(1),
60,
cigar,
&[b'A'; 95],
&[30u8; 95],
Some(0),
None,
0,
Data::default(),
);
assert_eq!(cigar_stats(&record, false).three_prime_soft_clip, 0);
}
#[test]
fn test_bad_cycles_forward() {
let mut map = Counter::<u64>::new();
track_bad_cycles(&mut map, b"ACNGT", 5, false);
assert_eq!(map.count_of(&3), 1);
assert_eq!(map.len(), 1);
}
#[test]
fn test_bad_cycles_reverse() {
let mut map = Counter::<u64>::new();
track_bad_cycles(&mut map, b"ACNGT", 5, true);
assert_eq!(map.count_of(&3), 1);
}
#[test]
fn test_process_cigar_indels_counted() {
let cigar: Cigar = [
Op::new(CigarKind::Match, 50),
Op::new(CigarKind::Insertion, 2),
Op::new(CigarKind::Match, 48),
Op::new(CigarKind::Deletion, 1),
]
.into_iter()
.collect();
let record = make_record(
Flags::empty(),
Some(1),
60,
cigar,
&[b'A'; 100],
&[30u8; 100],
Some(0),
None,
0,
Data::default(),
);
let mut acc = CategoryAccumulator::new("TEST");
let cs = acc.process_cigar(&record, false, false);
assert_eq!(acc.indels, 2);
assert_eq!(acc.pf_aligned_bases, 98); assert_eq!(cs.insertion_bases, 2);
assert_eq!(cs.deletion_bases, 1);
}
#[test]
fn test_process_cigar_q20_bases() {
let quals: Vec<u8> = (0..10).map(|i| if i < 5 { 30 } else { 10 }).collect();
let record = make_record(
Flags::empty(),
Some(1),
60,
simple_cigar(10),
&[b'A'; 10],
&quals,
Some(0),
None,
0,
Data::default(),
);
let mut acc = CategoryAccumulator::new("TEST");
acc.process_cigar(&record, true, false); assert_eq!(acc.hq_aligned_bases, 10);
assert_eq!(acc.hq_aligned_q20_bases, 5);
}
#[test]
fn test_process_cigar_q20_bases_malformed_short_quals() {
let cigar: Cigar =
[Op::new(CigarKind::Match, 5), Op::new(CigarKind::Match, 10)].into_iter().collect();
let record = make_record(
Flags::empty(),
Some(1),
60,
cigar,
&[b'A'; 15],
&[30u8, 30, 30],
Some(0),
None,
0,
Data::default(),
);
let mut acc = CategoryAccumulator::new("TEST");
acc.process_cigar(&record, true, false);
assert_eq!(acc.hq_aligned_bases, 15);
assert_eq!(acc.hq_aligned_q20_bases, 3);
}
#[test]
fn test_cigar_stats_with_deletion() {
let cigar: Cigar = [
Op::new(CigarKind::Match, 50),
Op::new(CigarKind::Deletion, 2),
Op::new(CigarKind::Match, 48),
]
.into_iter()
.collect();
let record = make_record(
Flags::empty(),
Some(1),
60,
cigar,
&[b'A'; 98],
&[30u8; 98],
Some(0),
None,
0,
Data::default(),
);
let mut acc = CategoryAccumulator::new("TEST");
let cs = acc.process_cigar(&record, false, false);
assert_eq!(cs.aligned_read_length, 98);
assert_eq!(cs.deletion_bases, 2);
assert_eq!(cs.insertion_bases, 0);
assert_eq!(acc.indels, 1);
}
#[test]
fn test_cigar_stats_with_hard_clip() {
let cigar: Cigar = [
Op::new(CigarKind::HardClip, 5),
Op::new(CigarKind::Match, 90),
Op::new(CigarKind::HardClip, 5),
]
.into_iter()
.collect();
let record = make_record(
Flags::empty(),
Some(1),
60,
cigar,
&[b'A'; 90],
&[30u8; 90],
Some(0),
None,
0,
Data::default(),
);
let cs = cigar_stats(&record, false);
assert_eq!(cs.aligned_read_length, 90);
assert_eq!(cs.soft_clip_bases, 0);
assert_eq!(cs.insertion_bases, 0);
assert_eq!(cs.deletion_bases, 0);
}
use noodles::sam::alignment::record_buf::data::field::Value as DataValue;
fn make_chimera_record(
ref_id: usize,
mate_ref_id: usize,
tlen: i32,
is_reverse: bool,
mate_reverse: bool,
read_len: usize,
sa_tag: bool,
) -> RikerRecord {
let mut flags = Flags::SEGMENTED | Flags::PROPERLY_SEGMENTED | Flags::FIRST_SEGMENT;
if is_reverse {
flags |= Flags::REVERSE_COMPLEMENTED;
}
if mate_reverse {
flags |= Flags::MATE_REVERSE_COMPLEMENTED;
}
let cigar: Cigar = [Op::new(CigarKind::Match, read_len)].into_iter().collect();
let mut data = Data::default();
if sa_tag {
data.insert((*b"SA").into(), DataValue::String("chr1,100,+,50M,60,0".into()));
}
let buf = RecordBuf::builder()
.set_flags(flags)
.set_reference_sequence_id(ref_id)
.set_alignment_start(Position::new(100).expect("pos"))
.set_mapping_quality(MappingQuality::new(60).expect("mapq"))
.set_cigar(cigar)
.set_sequence(Sequence::from(vec![b'A'; read_len]))
.set_quality_scores(QualityScores::from(vec![30u8; read_len]))
.set_mate_reference_sequence_id(mate_ref_id)
.set_mate_alignment_start(Position::new(200).expect("pos"))
.set_template_length(tlen)
.set_data(data)
.build();
RikerRecord::from_alignment_record(&Header::default(), &buf).unwrap()
}
#[test]
fn test_chimeric_different_contigs() {
let rec = make_chimera_record(0, 1, 200, false, true, 100, false);
assert!(is_chimeric_pair(&rec, 100_000));
}
#[test]
fn test_chimeric_large_insert() {
let rec = make_chimera_record(0, 0, 200_000, false, true, 100, false);
assert!(is_chimeric_pair(&rec, 100_000));
}
#[test]
fn test_chimeric_non_fr_orientation() {
let rec = make_chimera_record(0, 0, 200, false, false, 100, false);
assert!(is_chimeric_pair(&rec, 100_000));
}
#[test]
fn test_chimeric_sa_tag() {
let rec = make_chimera_record(0, 0, 200, false, true, 100, true);
assert!(is_chimeric_pair(&rec, 100_000));
}
#[test]
fn test_not_chimeric_normal_fr() {
let rec = make_chimera_record(0, 0, 200, false, true, 100, false);
assert!(!is_chimeric_pair(&rec, 100_000));
}
#[test]
fn test_chimeric_exact_max_insert() {
let rec = make_chimera_record(0, 0, 100_000, false, true, 100, false);
assert!(!is_chimeric_pair(&rec, 100_000));
}
#[test]
fn test_count_bad_cycles_at_threshold() {
let map: Counter<u64> = [(1u64, 80u64)].into_iter().collect();
assert_eq!(count_bad_cycles(&map, 100), 1);
}
#[test]
fn test_count_bad_cycles_below_threshold() {
let map: Counter<u64> = [(1u64, 79u64)].into_iter().collect();
assert_eq!(count_bad_cycles(&map, 100), 0);
}
#[test]
fn test_count_bad_cycles_zero_reads() {
let map: Counter<u64> = [(1u64, 10u64)].into_iter().collect();
assert_eq!(count_bad_cycles(&map, 0), 0);
}
#[test]
fn test_count_bad_cycles_multiple() {
let map: Counter<u64> = [(1u64, 90u64), (2, 50), (3, 85)].into_iter().collect();
assert_eq!(count_bad_cycles(&map, 100), 2); }
#[test]
fn test_compute_metric_basic() {
let mut acc = CategoryAccumulator::new("read1");
acc.total_reads = 100;
acc.pf_reads = 100;
acc.aligned_reads = 80;
acc.pf_aligned_bases = 8000;
acc.hq_aligned_reads = 70;
acc.hq_aligned_bases = 7000;
acc.hq_aligned_q20_bases = 6000;
acc.total_nm = 80;
acc.hq_nm = 70;
acc.num_positive_strand = 40;
acc.read_length_sum = 10000;
acc.read_length_sum_sq = 1_000_000;
acc.read_length_histogram.count_n(100, 100);
acc.aligned_read_length_sum = 8000;
let m = acc.compute_metric();
assert_eq!(m.total_reads, 100);
assert_eq!(m.aligned_reads, 80);
assert!((m.frac_aligned - 0.8).abs() < 1e-9);
assert!((m.mismatch_rate - 0.01).abs() < 1e-9); assert!((m.strand_balance - 0.5).abs() < 1e-9); }
#[test]
fn test_compute_metric_zero_reads() {
let acc = CategoryAccumulator::new("read1");
let m = acc.compute_metric();
assert_eq!(m.total_reads, 0);
assert_eq!(m.frac_aligned, 0.0);
assert_eq!(m.mismatch_rate, 0.0);
assert_eq!(m.strand_balance, 0.0);
assert_eq!(m.bad_cycles, 0);
assert_eq!(m.mean_read_length, 0.0);
}
#[test]
fn test_compute_metric_bad_cycles_threshold() {
let mut acc = CategoryAccumulator::new("read1");
acc.total_reads = 100;
acc.pf_reads = 100;
acc.bad_cycle_nocalls.count_n(1, 80);
acc.bad_cycle_nocalls.count_n(2, 79);
acc.read_length_histogram.count_n(100, 100);
acc.read_length_sum = 10000;
let m = acc.compute_metric();
assert_eq!(m.bad_cycles, 1);
}
#[test]
fn test_merge_basic() {
let mut a = CategoryAccumulator::new("read1");
a.total_reads = 50;
a.pf_reads = 45;
a.aligned_reads = 40;
a.pf_aligned_bases = 4000;
a.indels = 5;
a.total_nm = 10;
a.read_length_sum = 5000;
a.read_length_sum_sq = 500_000;
a.read_length_histogram.count_n(100, 50);
a.hq_nm_histogram.count_n(1, 10);
a.bad_cycle_nocalls.count_n(1, 20);
let mut b = CategoryAccumulator::new("read2");
b.total_reads = 60;
b.pf_reads = 55;
b.aligned_reads = 50;
b.pf_aligned_bases = 5000;
b.indels = 3;
b.total_nm = 15;
b.read_length_sum = 6000;
b.read_length_sum_sq = 600_000;
b.read_length_histogram.count_n(100, 60);
b.hq_nm_histogram.count_n(1, 15);
b.bad_cycle_nocalls.count_n(1, 25);
let merged = a.merge(&b);
assert_eq!(merged.category, "pair");
assert_eq!(merged.total_reads, 110);
assert_eq!(merged.pf_reads, 100);
assert_eq!(merged.aligned_reads, 90);
assert_eq!(merged.pf_aligned_bases, 9000);
assert_eq!(merged.indels, 8);
assert_eq!(merged.total_nm, 25);
assert_eq!(merged.read_length_sum, 11000);
assert_eq!(merged.read_length_sum_sq, 1_100_000);
assert_eq!(merged.read_length_histogram.count_of(&100), 110);
assert_eq!(merged.hq_nm_histogram.count_of(&1), 25);
assert_eq!(merged.bad_cycle_nocalls.count_of(&1), 45);
}
}