use std::fmt;
use std::path::{Path, PathBuf};
use std::str::FromStr;
use anyhow::{Context, Result, bail};
use bitvec::prelude::*;
use clap::Args;
use noodles::core::Region;
use noodles::sam::Header;
use noodles::sam::alignment::record::cigar::op::Kind;
use riker_derive::MetricDocs;
use serde::{Deserialize, Serialize};
use smallvec::SmallVec;
use strum::EnumCount as _;
use crate::collector::Collector;
use crate::commands::command::Command;
use crate::commands::common::{InputOptions, OutputOptions};
use crate::fasta::Fasta;
use crate::metrics::{serialize_f64_2dp, serialize_f64_6dp, write_tsv};
use crate::progress::ProgressLogger;
use crate::sam::alignment_reader::IndexedAlignmentReader;
use crate::sam::mate_buffer::{MateAction, MateBuffer};
use crate::sam::pair_orientation::{PairOrientation, get_pair_orientation};
use crate::sam::riker_record::{RikerRecord, RikerRecordRequirements};
use crate::sequence_dict::SequenceDictionary;
use crate::simd;
use crate::vcf::IndexedVcf;
type HashMap<K, V> = std::collections::HashMap<K, V, rustc_hash::FxBuildHasher>;
pub const MISMATCH_SUFFIX: &str = ".error-mismatch.txt";
pub const OVERLAP_SUFFIX: &str = ".error-overlap.txt";
pub const INDEL_SUFFIX: &str = ".error-indel.txt";
const INLINE_STRATIFIERS: usize = 5;
#[riker_derive::multi_options("error", "Error Metrics Options")]
#[derive(Args, Debug, Clone)]
#[command()]
pub struct ErrorOptions {
#[arg(short = 'r', long, value_name = "FASTA")]
pub reference: PathBuf,
#[arg(long, value_name = "VCF/BCF")]
pub vcf: Option<PathBuf>,
#[arg(long, value_name = "INTERVALS")]
pub intervals: Option<PathBuf>,
#[arg(long, default_value_t = ErrorOptions::DEFAULT_MIN_MAPQ)]
pub min_mapq: u8,
#[arg(long, default_value_t = ErrorOptions::DEFAULT_MIN_BQ)]
pub min_bq: u8,
#[arg(long, default_value_t = false)]
pub include_duplicates: bool,
#[arg(long, default_value_t = ErrorOptions::DEFAULT_MAX_ISIZE)]
pub max_isize: u32,
#[arg(long, default_value_t = false, hide = true)]
pub picard_compat: bool,
#[arg(long, num_args(1..), long_help = ErrorOptions::stratify_by_help())]
pub stratify_by: Vec<String>,
}
impl ErrorOptions {
const DEFAULT_MIN_MAPQ: u8 = 20;
const DEFAULT_MIN_BQ: u8 = 20;
const DEFAULT_MAX_ISIZE: u32 = 1000;
fn stratify_by_help() -> String {
use strum::IntoEnumIterator;
let available: Vec<String> = Stratifier::iter().map(|s| s.to_string()).collect();
let mut help = String::from(
"Stratification groups. Each value is a comma-separated list of stratifiers. \
The \"all\" group is always included regardless.\n\n\
Available stratifiers: ",
);
help.push_str(&available.join(", "));
help.push_str(".\n\n[default: ");
help.push_str(&Self::DEFAULT_STRATIFY_BY.join(" "));
help.push(']');
help
}
const DEFAULT_STRATIFY_BY: &[&str] = &[
"bq",
"isize",
"gc",
"strand",
"pair_orientation",
"hp_len",
"cycle",
"read_num",
"read_num,cycle",
"read_num,hp_len",
"read_num,gc",
"read_num,pre_dinuc",
"mapq",
"nm",
"context_3bp",
];
}
impl Default for ErrorOptions {
fn default() -> Self {
Self {
reference: PathBuf::new(),
vcf: None,
intervals: None,
min_mapq: Self::DEFAULT_MIN_MAPQ,
min_bq: Self::DEFAULT_MIN_BQ,
include_duplicates: false,
max_isize: Self::DEFAULT_MAX_ISIZE,
picard_compat: false,
stratify_by: Vec::new(),
}
}
}
#[derive(Args, Debug, Clone)]
#[command(
long_about,
after_long_help = "\
Examples:
riker error -i input.bam -o out -r ref.fa
riker error -i input.bam -o out -r ref.fa -V known.vcf.gz --stratify-by read_num,cycle bq
riker error -i input.bam -o out -r ref.fa --intervals targets.bed --min-bq 30"
)]
pub struct Error {
#[command(flatten)]
pub input: InputOptions,
#[command(flatten)]
pub output: OutputOptions,
#[command(flatten)]
pub options: ErrorOptions,
}
impl Error {
#[expect(clippy::cast_possible_truncation, reason = "contig lengths fit in u32")]
#[expect(clippy::type_complexity, reason = "internal helper, tuple is clear in context")]
fn build_intervals(
options: &ErrorOptions,
dict: &SequenceDictionary,
) -> Result<(Option<crate::intervals::Intervals>, Vec<(String, u32, u32)>)> {
let parsed = options
.intervals
.as_ref()
.map(|p| crate::intervals::Intervals::from_path(p, dict.clone()))
.transpose()?;
let intervals = if let Some(ref ivs) = parsed {
let mut result = Vec::new();
for ref_id in 0..dict.len() {
if let Some(meta) = dict.get_by_index(ref_id) {
for interval in ivs.get_contig(ref_id) {
result.push((meta.name().to_string(), interval.start, interval.end));
}
}
}
result
} else {
dict.iter().map(|meta| (meta.name().to_string(), 0u32, meta.length() as u32)).collect()
};
Ok((parsed, intervals))
}
}
impl Command for Error {
#[expect(clippy::cast_possible_truncation, reason = "contig lengths fit in u32")]
fn execute(&self) -> Result<()> {
let ref_path = &self.options.reference;
let fai_path = {
let mut p = ref_path.as_os_str().to_owned();
p.push(".fai");
PathBuf::from(p)
};
if !fai_path.exists() {
bail!(
"FASTA index not found: expected {}\n\
Run `samtools faidx {}` to create it.",
fai_path.display(),
ref_path.display(),
);
}
let fasta = Fasta::from_path(ref_path)?;
let mut alignment_reader = IndexedAlignmentReader::open(&self.input.input, Some(ref_path))?;
let header = alignment_reader.header().clone();
let dict = SequenceDictionary::from(&header);
let mut collector =
ErrorCollector::new(&self.input.input, &self.output.output, fasta, &self.options)?;
collector.sample = crate::sam::record_utils::derive_sample(&self.input.input, &header);
let (parsed_intervals, intervals) = Self::build_intervals(&self.options, &dict)?;
if let Some(vcf_path) = &self.options.vcf {
let mut vcf = IndexedVcf::from_path(vcf_path)?;
collector.variant_masks =
crate::vcf::load_variant_masks(&mut vcf, &dict, parsed_intervals.as_ref())?;
}
log::info!("Processing {} intervals across {} contigs", intervals.len(), dict.len());
let mut progress = ProgressLogger::new("error", "reads", 1_000_000);
let mut total_records = 0u64;
let mut contig_cache = ContigCache { name: String::new(), bases: Vec::new() };
let mut prev_region: Option<(String, RegionContext)> = None;
for (contig_name, start, end) in &intervals {
if let Some((prev_contig, region)) = prev_region.as_ref()
&& prev_contig != contig_name
{
for orphan in collector.mate_buffer.flush() {
collector.process_record(&orphan, region, None);
}
}
let region = RegionContext::new(
contig_name,
*start,
*end,
&mut collector.reference,
collector.variant_masks.get(contig_name.as_str()),
&mut contig_cache,
)?;
let region_str = if *start == 0
&& *end >= dict.get_by_name(contig_name).map_or(0, |m| m.length() as u32)
{
contig_name.clone()
} else {
format!("{contig_name}:{}-{end}", start + 1)
};
let query_region: Region = region_str
.parse()
.with_context(|| format!("Failed to parse region: {region_str}"))?;
let requirements = collector.field_needs();
alignment_reader.query_for_each(&query_region, &requirements, |record| {
if !collector.passes_filters(record) {
return Ok(());
}
total_records += 1;
progress.record_with(record, &header);
match collector.mate_buffer.accept(record) {
MateAction::Buffered => {}
MateAction::Alone => {
collector.process_record(record, ®ion, None);
}
MateAction::PairWith(mate) => {
let mate_bases = build_aligned_bases(&mate, collector.min_bq);
let record_bases = build_aligned_bases(record, collector.min_bq);
collector.process_record(&mate, ®ion, Some(&record_bases));
collector.process_record(record, ®ion, Some(&mate_bases));
}
}
Ok(())
})?;
let ref_id = dict
.get_by_name(contig_name)
.map(crate::sequence_dict::SequenceMetadata::index)
.with_context(|| {
format!("contig {contig_name} not found in sequence dictionary")
})?;
for orphan in collector.mate_buffer.flush_behind(ref_id, *end) {
collector.process_record(&orphan, ®ion, None);
}
prev_region = Some((contig_name.clone(), region));
}
if let Some((_, region)) = prev_region.as_ref() {
for orphan in collector.mate_buffer.flush() {
collector.process_record(&orphan, region, None);
}
}
progress.finish();
log::info!("Processed {total_records} records passing filters.");
collector.write_output()?;
Ok(())
}
}
pub struct ErrorCollector {
mismatch_path: PathBuf,
overlap_path: PathBuf,
indel_path: PathBuf,
input_path: PathBuf,
sample: String,
reference: Fasta,
vcf_path: Option<PathBuf>,
variant_masks: std::collections::HashMap<String, BitVec>,
min_mapq: u8,
min_bq: u8,
include_duplicates: bool,
max_isize: u32,
picard_compat: bool,
groups: Vec<StratifierGroup>,
read_level_stratifiers: Vec<Stratifier>,
base_level_stratifiers: Vec<Stratifier>,
mate_buffer: MateBuffer<RikerRecord>,
interner: StringInterner,
accumulators: Vec<GroupAccumulators>,
dict: Option<SequenceDictionary>,
intervals: Option<crate::intervals::Intervals>,
interval_path: Option<PathBuf>,
current_ref_id: Option<usize>,
current_region: Option<RegionContext>,
contig_cache: ContigCache,
}
impl ErrorCollector {
pub fn new(
input: &Path,
prefix: &Path,
reference: Fasta,
options: &ErrorOptions,
) -> Result<Self> {
let mismatch_path = super::command::output_path(prefix, MISMATCH_SUFFIX);
let overlap_path = super::command::output_path(prefix, OVERLAP_SUFFIX);
let indel_path = super::command::output_path(prefix, INDEL_SUFFIX);
let groups = Self::parse_groups(&options.stratify_by)?;
log::info!(
"Stratifier groups: {}",
groups.iter().map(|g| g.name.as_str()).collect::<Vec<_>>().join(", ")
);
let (read_level_stratifiers, base_level_stratifiers) = Stratifier::partition(&groups);
let accumulators = groups.iter().map(|g| GroupAccumulators::new(g.clone())).collect();
Ok(Self {
mismatch_path,
overlap_path,
indel_path,
input_path: input.to_path_buf(),
sample: String::new(),
reference,
vcf_path: options.vcf.clone(),
variant_masks: std::collections::HashMap::new(),
min_mapq: options.min_mapq,
min_bq: options.min_bq,
include_duplicates: options.include_duplicates,
max_isize: options.max_isize,
picard_compat: options.picard_compat,
groups,
read_level_stratifiers,
base_level_stratifiers,
mate_buffer: MateBuffer::new(),
interner: StringInterner::new(),
accumulators,
dict: None,
intervals: None,
interval_path: options.intervals.clone(),
current_ref_id: None,
current_region: None,
contig_cache: ContigCache { name: String::new(), bases: Vec::new() },
})
}
fn parse_groups(stratify_by: &[String]) -> Result<Vec<StratifierGroup>> {
let mut groups = Vec::new();
groups
.push(StratifierGroup { stratifiers: vec![Stratifier::All], name: "all".to_string() });
let specs: Vec<String> = if stratify_by.is_empty() {
ErrorOptions::DEFAULT_STRATIFY_BY.iter().map(|s| (*s).to_string()).collect()
} else {
stratify_by.to_vec()
};
for spec in &specs {
let group = StratifierGroup::parse(spec)?;
if group.stratifiers != vec![Stratifier::All] {
groups.push(group);
}
}
Ok(groups)
}
fn passes_filters(&self, record: &RikerRecord) -> bool {
let flags = record.flags();
if flags.is_unmapped()
|| flags.is_secondary()
|| flags.is_supplementary()
|| flags.is_qc_fail()
{
return false;
}
if !self.include_duplicates && flags.is_duplicate() {
return false;
}
let mapq = record.mapping_quality().map_or(0, |m| m.get());
if mapq < self.min_mapq {
return false;
}
true
}
#[expect(
clippy::cast_precision_loss,
reason = "f64 precision is sufficient for error rate calculations"
)]
fn write_output(&self) -> Result<()> {
let mut mismatch_rows = Vec::new();
let mut overlap_rows = Vec::new();
let mut indel_rows = Vec::new();
for accum in &self.accumulators {
let group_name = &accum.group.name;
let mut entries: Vec<_> = accum.accums.iter().collect();
entries.sort_by_cached_key(|(key, _)| key.format(&self.interner));
for (key, combined) in &entries {
let formatted_key = key.format(&self.interner);
let mm = &combined.mismatch;
if mm.total_bases > 0 {
let frac = mm.error_bases as f64 / mm.total_bases as f64;
mismatch_rows.push(MismatchMetric {
sample: self.sample.clone(),
stratifier: group_name.clone(),
covariate: formatted_key.clone(),
total_bases: mm.total_bases,
error_bases: mm.error_bases,
frac_error: frac,
q_score: q_score(mm.error_bases, mm.total_bases),
});
}
let ov = &combined.overlap;
if ov.overlapping_read_bases > 0 {
overlap_rows.push(OverlappingMismatchMetric {
sample: self.sample.clone(),
stratifier: group_name.clone(),
covariate: formatted_key.clone(),
overlapping_read_bases: ov.overlapping_read_bases,
bases_mismatching_ref_and_mate: ov.bases_mismatching_ref_and_mate,
bases_matching_mate_but_not_ref: ov.bases_matching_mate_but_not_ref,
bases_in_three_way_disagreement: ov.bases_in_three_way_disagreement,
q_mismatching_ref_and_mate: q_score(
ov.bases_mismatching_ref_and_mate,
ov.overlapping_read_bases,
),
q_matching_mate_but_not_ref: q_score(
ov.bases_matching_mate_but_not_ref,
ov.overlapping_read_bases,
),
q_three_way_disagreement: q_score(
ov.bases_in_three_way_disagreement,
ov.overlapping_read_bases,
),
});
}
let ind = &combined.indel;
if ind.total_bases > 0 || ind.num_insertions > 0 || ind.num_deletions > 0 {
let total_events = ind.num_insertions + ind.num_deletions;
let total_for_rate = ind.total_bases.max(1);
indel_rows.push(IndelMetric {
sample: self.sample.clone(),
stratifier: group_name.clone(),
covariate: formatted_key,
total_bases: ind.total_bases,
num_insertions: ind.num_insertions,
num_inserted_bases: ind.num_inserted_bases,
num_deletions: ind.num_deletions,
num_deleted_bases: ind.num_deleted_bases,
frac_indel_error: total_events as f64 / total_for_rate as f64,
q_score: q_score(total_events, ind.total_bases),
});
}
}
}
write_tsv(&self.mismatch_path, &mismatch_rows)?;
write_tsv(&self.overlap_path, &overlap_rows)?;
write_tsv(&self.indel_path, &indel_rows)?;
log::info!("Wrote mismatch metrics to: {}", self.mismatch_path.display());
log::info!("Wrote overlap metrics to: {}", self.overlap_path.display());
log::info!("Wrote indel metrics to: {}", self.indel_path.display());
Ok(())
}
#[expect(
clippy::cast_possible_truncation,
reason = "genomic positions and read lengths fit in u32"
)]
#[expect(clippy::cast_sign_loss, reason = "cycle is always positive after arithmetic")]
#[expect(clippy::cast_possible_wrap, reason = "cycle values fit in i32")]
#[expect(
clippy::too_many_lines,
reason = "CIGAR walk with inline processing is clearest as one function"
)]
fn process_record(
&mut self,
record: &RikerRecord,
region: &RegionContext,
mate_bases: Option<&[(u32, u8)]>,
) {
let is_reverse = record.flags().is_reverse_complemented();
let read_len = record.sequence().len();
let Some(alignment_start) = record.alignment_start() else { return };
let alignment_start = (alignment_start.get() - 1) as u32;
let read_cache =
ReadLevelCache::new(record, &self.interner, self.max_isize, self.picard_compat);
let mut base_cache = BaseCovariateCache::from_read_level(
&self.read_level_stratifiers,
&read_cache,
&self.interner,
);
let mut ref_pos = alignment_start;
let mut read_pos: usize = 0;
let mut cycle: u32 = if is_reverse { read_len as u32 } else { 1 };
let cycle_dir: i32 = if is_reverse { -1 } else { 1 };
let mut mate_scan_idx: usize = 0;
let mut last_anchor_read_offset: Option<usize> = None;
let mut last_anchor_ref_pos: Option<u32> = None;
let mut last_anchor_cycle: Option<u32> = None;
for op in record.cigar_ops() {
let kind = op.kind();
let len = op.len();
match kind {
Kind::Match | Kind::SequenceMatch | Kind::SequenceMismatch => {
for _ in 0..len {
self.process_aligned_base(
record,
read_pos,
ref_pos,
region,
cycle,
mate_bases,
&mut mate_scan_idx,
&read_cache,
&mut base_cache,
);
last_anchor_read_offset = Some(read_pos);
last_anchor_ref_pos = Some(ref_pos);
last_anchor_cycle = Some(cycle);
ref_pos += 1;
read_pos += 1;
cycle = (cycle as i32 + cycle_dir) as u32;
}
}
Kind::Insertion => {
let quals = record.quality_scores();
let first_bq = quals.get(read_pos).copied().unwrap_or(0);
if first_bq >= self.min_bq {
let bq_passing = if self.picard_compat { len as u32 } else { 0 };
if let (Some(anchor_ro), Some(anchor_rp), Some(anchor_cy)) =
(last_anchor_read_offset, last_anchor_ref_pos, last_anchor_cycle)
{
self.process_indel(
record,
anchor_ro,
anchor_rp,
region,
anchor_cy,
true,
len as u32,
bq_passing,
&read_cache,
&mut base_cache,
);
} else if bq_passing > 0 && self.picard_compat {
for (group_idx, group) in self.groups.iter().enumerate() {
if let Some(key) = group.key_from_cache(&base_cache) {
self.accumulators[group_idx]
.accums
.entry(key)
.or_default()
.mismatch
.total_bases += u64::from(bq_passing);
}
}
}
}
read_pos += len;
cycle = (cycle as i32 + cycle_dir * len as i32) as u32;
}
Kind::Deletion => {
if let (Some(anchor_ro), Some(anchor_rp), Some(anchor_cy)) =
(last_anchor_read_offset, last_anchor_ref_pos, last_anchor_cycle)
{
self.process_indel(
record,
anchor_ro,
anchor_rp,
region,
anchor_cy,
false,
len as u32,
0, &read_cache,
&mut base_cache,
);
}
ref_pos += len as u32;
}
Kind::SoftClip => {
read_pos += len;
cycle = (cycle as i32 + cycle_dir * len as i32) as u32;
}
Kind::HardClip | Kind::Pad => {}
Kind::Skip => {
ref_pos += len as u32;
}
}
}
}
#[expect(clippy::too_many_arguments)]
fn process_aligned_base(
&mut self,
record: &RikerRecord,
read_offset: usize,
ref_pos: u32,
region: &RegionContext,
cycle: u32,
mate_bases: Option<&[(u32, u8)]>,
mate_scan_idx: &mut usize,
read_cache: &ReadLevelCache,
base_cache: &mut BaseCovariateCache,
) {
if !region.contains(ref_pos) || region.is_excluded(ref_pos) {
return;
}
let quals = record.quality_scores();
if read_offset < quals.len() && quals[read_offset] < self.min_bq {
return;
}
let seq = record.sequence();
let read_base = match seq.get(read_offset) {
Some(&b) => b.to_ascii_uppercase(),
None => return,
};
let ref_base = region.ref_base(ref_pos);
if !is_valid_base(read_base) || (!self.picard_compat && !is_valid_base(ref_base)) {
return;
}
let is_error = read_base != ref_base;
let overlap_mate_base = if let Some(mate_slice) = mate_bases {
while *mate_scan_idx < mate_slice.len() && mate_slice[*mate_scan_idx].0 < ref_pos {
*mate_scan_idx += 1;
}
if *mate_scan_idx < mate_slice.len() && mate_slice[*mate_scan_idx].0 == ref_pos {
let b = mate_slice[*mate_scan_idx].1;
if is_valid_base(b) { Some(b) } else { None }
} else {
None
}
} else {
None
};
base_cache.populate_base_level(
&self.base_level_stratifiers,
record,
read_offset,
ref_pos,
region,
0, cycle,
read_cache,
&self.interner,
);
for (group_idx, group) in self.groups.iter().enumerate() {
let Some(key) = group.key_from_cache(base_cache) else {
continue;
};
let combined = self.accumulators[group_idx].accums.entry(key).or_default();
combined.mismatch.total_bases += 1;
if is_error {
combined.mismatch.error_bases += 1;
}
combined.indel.total_bases += 1;
if let Some(mate_base) = overlap_mate_base {
combined.overlap.overlapping_read_bases += 1;
if is_error {
if mate_base == ref_base {
combined.overlap.bases_mismatching_ref_and_mate += 1;
} else if mate_base == read_base {
combined.overlap.bases_matching_mate_but_not_ref += 1;
} else {
combined.overlap.bases_in_three_way_disagreement += 1;
}
}
}
}
}
#[expect(clippy::too_many_arguments)]
fn process_indel(
&mut self,
record: &RikerRecord,
anchor_read_offset: usize,
anchor_ref_pos: u32,
region: &RegionContext,
cycle: u32,
is_insertion: bool,
indel_len: u32,
bq_passing_insertion_bases: u32,
read_cache: &ReadLevelCache,
base_cache: &mut BaseCovariateCache,
) {
if !region.contains(anchor_ref_pos) {
return;
}
if is_insertion {
if region.is_excluded(anchor_ref_pos) {
return;
}
} else {
let del_start = anchor_ref_pos + 1;
let del_end = anchor_ref_pos + indel_len;
let all_excluded =
(del_start..=del_end).all(|pos| !region.contains(pos) || region.is_excluded(pos));
if all_excluded {
return;
}
}
base_cache.populate_base_level(
&self.base_level_stratifiers,
record,
anchor_read_offset,
anchor_ref_pos,
region,
indel_len,
cycle,
read_cache,
&self.interner,
);
for (group_idx, group) in self.groups.iter().enumerate() {
let Some(key) = group.key_from_cache(base_cache) else {
continue;
};
let combined = self.accumulators[group_idx].accums.entry(key).or_default();
if is_insertion {
combined.indel.num_insertions += 1;
combined.indel.num_inserted_bases += u64::from(indel_len);
if self.picard_compat {
combined.mismatch.total_bases += u64::from(bq_passing_insertion_bases);
}
} else {
combined.indel.num_deletions += 1;
combined.indel.num_deleted_bases += u64::from(indel_len);
}
}
}
}
impl Collector for ErrorCollector {
fn initialize(&mut self, header: &Header) -> Result<()> {
self.sample = crate::sam::record_utils::derive_sample(&self.input_path, header);
let dict = SequenceDictionary::from(header);
if let Some(interval_path) = &self.interval_path {
self.intervals =
Some(crate::intervals::Intervals::from_path(interval_path, dict.clone())?);
}
if let Some(vcf_path) = &self.vcf_path {
let mut vcf = IndexedVcf::from_path(vcf_path)?;
self.variant_masks =
crate::vcf::load_variant_masks(&mut vcf, &dict, self.intervals.as_ref())?;
}
self.dict = Some(dict);
Ok(())
}
#[expect(clippy::cast_possible_truncation, reason = "contig lengths fit in u32")]
fn accept(&mut self, record: &RikerRecord, _header: &Header) -> Result<()> {
if !self.passes_filters(record) {
return Ok(());
}
let Some(ref_id) = record.reference_sequence_id() else {
return Ok(());
};
if self.current_ref_id != Some(ref_id) {
if let Some(region) = self.current_region.take() {
for orphan in self.mate_buffer.flush() {
self.process_record(&orphan, ®ion, None);
}
self.current_region = Some(region);
} else {
self.mate_buffer.clear();
}
let dict = self.dict.as_ref().unwrap();
let Some(meta) = dict.get_by_index(ref_id) else {
return Ok(());
};
if let Some(intervals) = &self.intervals
&& !intervals.has_contig(ref_id)
{
self.current_ref_id = Some(ref_id);
self.current_region = None;
return Ok(());
}
let contig_name = meta.name();
let contig_len = meta.length() as u32;
self.current_region = Some(RegionContext::new_for_contig(
contig_name,
contig_len,
ref_id,
&mut self.reference,
self.variant_masks.get(contig_name),
self.intervals.as_ref(),
&mut self.contig_cache,
)?);
self.current_ref_id = Some(ref_id);
}
let Some(_region) = &self.current_region else {
return Ok(()); };
let region = self.current_region.take().unwrap();
match self.mate_buffer.accept(record) {
MateAction::Buffered => {}
MateAction::Alone => {
self.process_record(record, ®ion, None);
}
MateAction::PairWith(mate) => {
let mate_bases = build_aligned_bases(&mate, self.min_bq);
let record_bases = build_aligned_bases(record, self.min_bq);
self.process_record(&mate, ®ion, Some(&record_bases));
self.process_record(record, ®ion, Some(&mate_bases));
}
}
self.current_region = Some(region);
Ok(())
}
fn finish(&mut self) -> Result<()> {
if let Some(region) = self.current_region.take() {
for orphan in self.mate_buffer.flush() {
self.process_record(&orphan, ®ion, None);
}
self.current_region = Some(region);
} else {
self.mate_buffer.clear();
}
self.write_output()
}
fn name(&self) -> &'static str {
"error"
}
fn field_needs(&self) -> RikerRecordRequirements {
RikerRecordRequirements::NONE.with_sequence().with_aux_tag(*b"NM")
}
}
#[derive(Debug, Serialize, Deserialize, MetricDocs)]
pub struct MismatchMetric {
pub sample: String,
pub stratifier: String,
pub covariate: String,
pub total_bases: u64,
pub error_bases: u64,
#[serde(serialize_with = "serialize_f64_6dp")]
pub frac_error: f64,
#[serde(serialize_with = "serialize_f64_2dp")]
pub q_score: f64,
}
#[derive(Debug, Serialize, Deserialize, MetricDocs)]
pub struct OverlappingMismatchMetric {
pub sample: String,
pub stratifier: String,
pub covariate: String,
pub overlapping_read_bases: u64,
pub bases_mismatching_ref_and_mate: u64,
pub bases_matching_mate_but_not_ref: u64,
pub bases_in_three_way_disagreement: u64,
#[serde(serialize_with = "serialize_f64_2dp")]
pub q_mismatching_ref_and_mate: f64,
#[serde(serialize_with = "serialize_f64_2dp")]
pub q_matching_mate_but_not_ref: f64,
#[serde(serialize_with = "serialize_f64_2dp")]
pub q_three_way_disagreement: f64,
}
#[derive(Debug, Serialize, Deserialize, MetricDocs)]
pub struct IndelMetric {
pub sample: String,
pub stratifier: String,
pub covariate: String,
pub total_bases: u64,
pub num_insertions: u64,
pub num_inserted_bases: u64,
pub num_deletions: u64,
pub num_deleted_bases: u64,
#[serde(serialize_with = "serialize_f64_6dp")]
pub frac_indel_error: f64,
#[serde(serialize_with = "serialize_f64_2dp")]
pub q_score: f64,
}
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, strum::EnumCount, strum::EnumIter)]
pub enum Stratifier {
All,
Bq,
Mapq,
Cycle,
ReadNum,
Strand,
PairOrientation,
Isize,
Gc,
ReadBase,
RefBase,
HpLen,
PreDinuc,
PostDinuc,
Context3bp,
Nm,
IndelLen,
}
impl Stratifier {
fn partition(groups: &[StratifierGroup]) -> (Vec<Self>, Vec<Self>) {
let mut seen = [false; Self::COUNT];
let mut read_level = Vec::new();
let mut base_level = Vec::new();
for group in groups {
for &strat in &group.stratifiers {
if !seen[strat.index()] {
seen[strat.index()] = true;
if strat.is_read_level() {
read_level.push(strat);
} else {
base_level.push(strat);
}
}
}
}
(read_level, base_level)
}
fn is_read_level(self) -> bool {
matches!(
self,
Self::All
| Self::ReadNum
| Self::Strand
| Self::PairOrientation
| Self::Isize
| Self::Gc
| Self::Mapq
)
}
#[expect(clippy::too_many_arguments)]
fn covariate(
self,
record: &RikerRecord,
read_offset: usize,
ref_pos: u32,
region: &RegionContext,
indel_len: u32,
cycle: u32,
cache: &ReadLevelCache,
interner: &StringInterner,
) -> Option<CovariateValue> {
let is_reverse = record.flags().is_reverse_complemented();
match self {
Self::All => Some(CovariateValue::InternedStr(interner.get("all"))),
Self::Bq => {
let quals = record.quality_scores();
if read_offset < quals.len() {
Some(CovariateValue::Int(u32::from(quals[read_offset])))
} else {
None
}
}
Self::Mapq => cache.mapq,
Self::Cycle => Some(CovariateValue::Int(cycle)),
Self::ReadNum => cache.read_num,
Self::Strand => Some(cache.strand),
Self::PairOrientation => cache.pair_orientation,
Self::Isize => cache.isize_val,
Self::Gc => cache.gc,
Self::ReadBase => {
let base = read_base_at(record, read_offset, is_reverse)?;
Some(CovariateValue::Char(base))
}
Self::RefBase => {
let base = region.ref_base(ref_pos);
let oriented = if is_reverse { complement(base) } else { base };
if is_valid_base(oriented) { Some(CovariateValue::Char(oriented)) } else { None }
}
Self::HpLen => {
let hp = compute_hp_length(record, read_offset, is_reverse);
Some(CovariateValue::Int(hp))
}
Self::PreDinuc => {
let prev = read_base_in_sequencing_order(record, read_offset, is_reverse, -1)?;
let ref_b = region.ref_base(ref_pos);
let ref_oriented = if is_reverse { complement(ref_b) } else { ref_b };
if !is_valid_base(ref_oriented) {
return None;
}
Some(CovariateValue::Dinuc([prev, ref_oriented]))
}
Self::PostDinuc => {
let next = read_base_in_sequencing_order(record, read_offset, is_reverse, 1)?;
let ref_b = region.ref_base(ref_pos);
let ref_oriented = if is_reverse { complement(ref_b) } else { ref_b };
if !is_valid_base(ref_oriented) {
return None;
}
Some(CovariateValue::Dinuc([next, ref_oriented]))
}
Self::Context3bp => {
let prev = read_base_in_sequencing_order(record, read_offset, is_reverse, -1)?;
let next = read_base_in_sequencing_order(record, read_offset, is_reverse, 1)?;
let ref_b = region.ref_base(ref_pos);
let ref_oriented = if is_reverse { complement(ref_b) } else { ref_b };
if !is_valid_base(ref_oriented) {
return None;
}
Some(CovariateValue::Trinuc([prev, ref_oriented, next]))
}
Self::Nm => {
let nm = cache.nm_raw?;
let ref_b = region.ref_base(ref_pos);
let read_b = record.sequence().get(read_offset).copied()?;
let adjusted =
if read_b.to_ascii_uppercase() == ref_b { nm } else { nm.saturating_sub(1) };
Some(CovariateValue::Int(adjusted))
}
Self::IndelLen => Some(CovariateValue::Int(indel_len)),
}
}
fn index(self) -> usize {
self as usize
}
}
impl FromStr for Stratifier {
type Err = anyhow::Error;
fn from_str(s: &str) -> Result<Self> {
match s {
"all" => Ok(Self::All),
"bq" => Ok(Self::Bq),
"mapq" => Ok(Self::Mapq),
"cycle" => Ok(Self::Cycle),
"read_num" => Ok(Self::ReadNum),
"strand" => Ok(Self::Strand),
"pair_orientation" => Ok(Self::PairOrientation),
"isize" => Ok(Self::Isize),
"gc" => Ok(Self::Gc),
"read_base" => Ok(Self::ReadBase),
"ref_base" => Ok(Self::RefBase),
"hp_len" => Ok(Self::HpLen),
"pre_dinuc" => Ok(Self::PreDinuc),
"post_dinuc" => Ok(Self::PostDinuc),
"context_3bp" => Ok(Self::Context3bp),
"nm" => Ok(Self::Nm),
"indel_len" => Ok(Self::IndelLen),
_ => bail!(
"Unknown stratifier: '{s}'. Available: all, bq, mapq, cycle, read_num, \
strand, pair_orientation, isize, gc, read_base, ref_base, hp_len, \
pre_dinuc, post_dinuc, context_3bp, nm, indel_len"
),
}
}
}
impl fmt::Display for Stratifier {
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
let s = match self {
Self::All => "all",
Self::Bq => "bq",
Self::Mapq => "mapq",
Self::Cycle => "cycle",
Self::ReadNum => "read_num",
Self::Strand => "strand",
Self::PairOrientation => "pair_orientation",
Self::Isize => "isize",
Self::Gc => "gc",
Self::ReadBase => "read_base",
Self::RefBase => "ref_base",
Self::HpLen => "hp_len",
Self::PreDinuc => "pre_dinuc",
Self::PostDinuc => "post_dinuc",
Self::Context3bp => "context_3bp",
Self::Nm => "nm",
Self::IndelLen => "indel_len",
};
write!(f, "{s}")
}
}
#[derive(Debug, Clone)]
pub struct StratifierGroup {
pub stratifiers: Vec<Stratifier>,
pub name: String,
}
impl StratifierGroup {
fn parse(s: &str) -> Result<Self> {
let stratifiers: Vec<Stratifier> =
s.split(',').map(|part| part.trim().parse()).collect::<Result<_>>()?;
if stratifiers.is_empty() {
bail!("Empty stratifier group");
}
if stratifiers.len() > INLINE_STRATIFIERS {
log::warn!("*************************************************************");
log::warn!(
"Stratifier group '{}' has {} stratifiers (max inline: {}).",
s,
stratifiers.len(),
INLINE_STRATIFIERS
);
log::warn!("Groups with more than {INLINE_STRATIFIERS} stratifiers will see");
log::warn!("a sudden and large performance degradation due to heap");
log::warn!("allocation in the inner loop. Consider splitting into");
log::warn!("smaller groups.");
log::warn!("*************************************************************");
}
let name = stratifiers.iter().map(ToString::to_string).collect::<Vec<_>>().join(",");
Ok(Self { stratifiers, name })
}
fn key_from_cache(&self, base_cache: &BaseCovariateCache) -> Option<CovariateKey> {
if self.stratifiers.len() == 1 {
let val = base_cache.get(self.stratifiers[0])?;
Some(CovariateKey::Single(val))
} else {
let mut key = SmallVec::new();
for &strat in &self.stratifiers {
key.push(base_cache.get(strat)?);
}
Some(CovariateKey::Composite(key))
}
}
}
#[derive(Clone, Debug, PartialEq, Eq, Hash)]
pub enum CovariateKey {
Single(CovariateValue),
Composite(SmallVec<[CovariateValue; INLINE_STRATIFIERS]>),
}
impl CovariateKey {
fn format(&self, interner: &StringInterner) -> String {
match self {
Self::Single(v) => v.format(interner),
Self::Composite(vals) => {
let mut s = String::new();
for (i, v) in vals.iter().enumerate() {
if i > 0 {
s.push(',');
}
s.push_str(&v.format(interner));
}
s
}
}
}
}
#[derive(Clone, Copy, Debug, PartialEq, Eq, Hash)]
pub enum CovariateValue {
Int(u32),
Char(u8),
Dinuc([u8; 2]),
Trinuc([u8; 3]),
InternedStr(u16),
}
impl CovariateValue {
fn format(self, interner: &StringInterner) -> String {
match self {
Self::Int(n) => n.to_string(),
Self::Char(c) => String::from(c as char),
Self::Dinuc(d) => format!("{}{}", d[0] as char, d[1] as char),
Self::Trinuc(t) => {
format!("{}{}{}", t[0] as char, t[1] as char, t[2] as char)
}
Self::InternedStr(id) => interner.resolve(id).to_string(),
}
}
}
struct StringInterner {
strings: Vec<String>,
index: HashMap<String, u16>,
}
impl StringInterner {
fn new() -> Self {
let mut interner = Self { strings: Vec::new(), index: HashMap::default() };
for s in ["all", "R1", "R2", "+", "-", "f1r2", "f2r1", "tandem"] {
interner.intern(s);
}
interner
}
#[expect(
clippy::cast_possible_truncation,
reason = "interned string count is always << u16::MAX"
)]
fn intern(&mut self, s: &str) -> u16 {
if let Some(&id) = self.index.get(s) {
return id;
}
let id = self.strings.len() as u16;
self.strings.push(s.to_string());
self.index.insert(s.to_string(), id);
id
}
fn get(&self, s: &str) -> u16 {
*self.index.get(s).expect("string not interned")
}
fn resolve(&self, id: u16) -> &str {
&self.strings[id as usize]
}
}
pub struct RegionContext {
pub start: u32,
pub end: u32,
pub bases: Vec<u8>,
pub excluded: BitVec,
}
impl RegionContext {
fn new(
contig_name: &str,
start: u32,
end: u32,
fasta: &mut Fasta,
contig_variant_mask: Option<&BitVec>,
contig_cache: &mut ContigCache,
) -> Result<Self> {
if contig_cache.name != contig_name {
contig_cache.bases = fasta.load_contig(contig_name, true)?;
contig_cache.name = contig_name.to_string();
}
let start_usize = start as usize;
let end_usize = (end as usize).min(contig_cache.bases.len());
let bases = contig_cache.bases[start_usize..end_usize].to_vec();
let excluded = if let Some(mask) = contig_variant_mask {
mask[start_usize..end_usize].to_bitvec()
} else {
bitvec![0; bases.len()]
};
Ok(Self { start, end, bases, excluded })
}
fn new_for_contig(
contig_name: &str,
contig_len: u32,
ref_id: usize,
fasta: &mut Fasta,
contig_variant_mask: Option<&BitVec>,
intervals: Option<&crate::intervals::Intervals>,
contig_cache: &mut ContigCache,
) -> Result<Self> {
if contig_cache.name != contig_name {
contig_cache.bases = fasta.load_contig(contig_name, true)?;
contig_cache.name = contig_name.to_string();
}
let bases = contig_cache.bases[..contig_len as usize].to_vec();
let mut excluded = if let Some(ivs) = intervals {
!ivs.contig_bitvec(ref_id) } else {
bitvec![0; bases.len()] };
if let Some(mask) = contig_variant_mask {
excluded |= mask;
}
Ok(Self { start: 0, end: contig_len, bases, excluded })
}
fn ref_base(&self, ref_pos: u32) -> u8 {
let offset = (ref_pos - self.start) as usize;
self.bases.get(offset).copied().unwrap_or(b'N')
}
fn is_excluded(&self, ref_pos: u32) -> bool {
let offset = (ref_pos - self.start) as usize;
self.excluded.get(offset).is_some_and(|b| *b)
}
fn contains(&self, ref_pos: u32) -> bool {
ref_pos >= self.start && ref_pos < self.end
}
}
struct ContigCache {
name: String,
bases: Vec<u8>,
}
struct ReadLevelCache {
read_num: Option<CovariateValue>,
strand: CovariateValue,
pair_orientation: Option<CovariateValue>,
isize_val: Option<CovariateValue>,
gc: Option<CovariateValue>,
mapq: Option<CovariateValue>,
nm_raw: Option<u32>,
}
impl ReadLevelCache {
#[expect(
clippy::cast_possible_truncation,
reason = "GC percentage is always 0-100, fits in u32"
)]
fn new(
record: &RikerRecord,
interner: &StringInterner,
max_isize: u32,
picard_compat: bool,
) -> Self {
let flags = record.flags();
let is_reverse = flags.is_reverse_complemented();
let read_num = if !flags.is_segmented() {
None
} else if flags.is_first_segment() {
Some(CovariateValue::InternedStr(interner.get("R1")))
} else {
Some(CovariateValue::InternedStr(interner.get("R2")))
};
let strand = if is_reverse {
CovariateValue::InternedStr(interner.get("-"))
} else {
CovariateValue::InternedStr(interner.get("+"))
};
let pair_orientation = get_pair_orientation(record).map(|po| {
CovariateValue::InternedStr(interner.get(match po {
PairOrientation::Fr => {
if flags.is_first_segment() == is_reverse {
"f2r1"
} else {
"f1r2"
}
}
PairOrientation::Rf => {
if flags.is_first_segment() == is_reverse {
"f1r2"
} else {
"f2r1"
}
}
PairOrientation::Tandem => "tandem",
}))
});
let raw_isize = record.template_length().unsigned_abs();
let isize_val = if picard_compat {
Some(CovariateValue::Int(raw_isize.min(max_isize)))
} else if raw_isize > max_isize {
None } else {
Some(CovariateValue::Int(raw_isize))
};
let gc = {
let seq = record.sequence();
if seq.is_empty() {
None
} else {
let gc_count = simd::count_gc_case_insensitive(seq) as usize;
let pct = (gc_count * 100 + seq.len() / 2) / seq.len();
Some(CovariateValue::Int(pct as u32))
}
};
let mapq = record.mapping_quality().map(|m| CovariateValue::Int(u32::from(m.get())));
let nm_raw = get_nm_tag(record);
Self { read_num, strand, pair_orientation, isize_val, gc, mapq, nm_raw }
}
}
struct BaseCovariateCache {
values: [Option<CovariateValue>; Stratifier::COUNT],
}
impl BaseCovariateCache {
fn from_read_level(
read_level: &[Stratifier],
cache: &ReadLevelCache,
interner: &StringInterner,
) -> Self {
let mut values = [None; Stratifier::COUNT];
for &strat in read_level {
values[strat.index()] = match strat {
Stratifier::All => Some(CovariateValue::InternedStr(interner.get("all"))),
Stratifier::ReadNum => cache.read_num,
Stratifier::Strand => Some(cache.strand),
Stratifier::PairOrientation => cache.pair_orientation,
Stratifier::Isize => cache.isize_val,
Stratifier::Gc => cache.gc,
Stratifier::Mapq => cache.mapq,
_ => None, };
}
Self { values }
}
#[expect(clippy::too_many_arguments)]
fn populate_base_level(
&mut self,
base_level: &[Stratifier],
record: &RikerRecord,
read_offset: usize,
ref_pos: u32,
region: &RegionContext,
indel_len: u32,
cycle: u32,
cache: &ReadLevelCache,
interner: &StringInterner,
) {
for &strat in base_level {
self.values[strat.index()] = strat.covariate(
record,
read_offset,
ref_pos,
region,
indel_len,
cycle,
cache,
interner,
);
}
}
fn get(&self, strat: Stratifier) -> Option<CovariateValue> {
self.values[strat.index()]
}
}
struct GroupAccumulators {
group: StratifierGroup,
accums: HashMap<CovariateKey, CombinedAccum>,
}
impl GroupAccumulators {
fn new(group: StratifierGroup) -> Self {
Self { group, accums: HashMap::with_capacity_and_hasher(256, rustc_hash::FxBuildHasher) }
}
}
#[derive(Debug, Default, Clone)]
struct CombinedAccum {
mismatch: MismatchAccum,
indel: IndelAccum,
overlap: OverlapAccum,
}
#[derive(Debug, Default, Clone)]
struct MismatchAccum {
total_bases: u64,
error_bases: u64,
}
#[derive(Debug, Default, Clone)]
struct OverlapAccum {
overlapping_read_bases: u64,
bases_mismatching_ref_and_mate: u64,
bases_matching_mate_but_not_ref: u64,
bases_in_three_way_disagreement: u64,
}
#[derive(Debug, Default, Clone)]
struct IndelAccum {
total_bases: u64,
num_insertions: u64,
num_inserted_bases: u64,
num_deletions: u64,
num_deleted_bases: u64,
}
const fn complement(base: u8) -> u8 {
match base {
b'A' => b'T',
b'T' => b'A',
b'C' => b'G',
b'G' => b'C',
_ => b'N',
}
}
const fn is_valid_base(base: u8) -> bool {
matches!(base, b'A' | b'C' | b'G' | b'T')
}
fn read_base_at(record: &RikerRecord, read_offset: usize, is_reverse: bool) -> Option<u8> {
let seq = record.sequence();
let base = *seq.get(read_offset)?;
let oriented = if is_reverse { complement(base) } else { base };
if is_valid_base(oriented) { Some(oriented) } else { None }
}
#[expect(clippy::cast_sign_loss, reason = "array_delta is checked non-negative before cast")]
fn read_base_in_sequencing_order(
record: &RikerRecord,
read_offset: usize,
is_reverse: bool,
delta: i32,
) -> Option<u8> {
let array_delta = if is_reverse { -delta } else { delta };
let target = if array_delta >= 0 {
read_offset.checked_add(array_delta as usize)?
} else {
read_offset.checked_sub(array_delta.unsigned_abs() as usize)?
};
let seq = record.sequence();
let base = *seq.get(target)?;
let oriented = if is_reverse { complement(base) } else { base };
if is_valid_base(oriented) { Some(oriented) } else { None }
}
fn compute_hp_length(record: &RikerRecord, read_offset: usize, is_reverse: bool) -> u32 {
let seq = record.sequence();
let len = seq.len();
if is_reverse {
if read_offset + 1 >= len {
return 0;
}
let anchor = seq[read_offset + 1];
let mut count = 0u32;
let mut i = read_offset + 1;
while i < len && seq[i] == anchor {
count += 1;
i += 1;
}
count
} else {
if read_offset == 0 {
return 0;
}
let anchor = seq[read_offset - 1];
let mut count = 0u32;
let mut i = read_offset;
while i > 0 {
i -= 1;
if seq[i] != anchor {
break;
}
count += 1;
}
count
}
}
fn get_nm_tag(record: &RikerRecord) -> Option<u32> {
let v = record.aux_tag(*b"NM")?.as_int()?;
u32::try_from(v).ok()
}
#[expect(
clippy::cast_precision_loss,
reason = "f64 precision is sufficient for error rate calculations"
)]
fn q_score(errors: u64, total: u64) -> f64 {
if total == 0 {
return 0.0;
}
if errors == 0 {
return 99.0;
}
let rate = errors as f64 / total as f64;
(-10.0 * rate.log10()).min(99.0)
}
#[expect(
clippy::cast_possible_truncation,
reason = "genomic positions and CIGAR op lengths fit in u32"
)]
fn build_aligned_bases(record: &RikerRecord, min_bq: u8) -> Vec<(u32, u8)> {
let Some(alignment_start) = record.alignment_start() else { return Vec::new() };
let alignment_start = (alignment_start.get() - 1) as u32;
let seq = record.sequence();
let quals = record.quality_scores();
let mut bases = Vec::with_capacity(seq.len());
let mut ref_pos = alignment_start;
let mut read_pos: usize = 0;
for op in record.cigar_ops() {
match op.kind() {
Kind::Match | Kind::SequenceMatch | Kind::SequenceMismatch => {
for _ in 0..op.len() {
let bq = quals.get(read_pos).copied().unwrap_or(0);
if bq >= min_bq
&& let Some(&base) = seq.get(read_pos)
{
bases.push((ref_pos, base.to_ascii_uppercase()));
}
ref_pos += 1;
read_pos += 1;
}
}
Kind::Insertion | Kind::SoftClip => {
read_pos += op.len();
}
Kind::Deletion | Kind::Skip => {
ref_pos += op.len() as u32;
}
Kind::HardClip | Kind::Pad => {}
}
}
bases
}
#[cfg(test)]
mod tests {
use super::*;
use strum::IntoEnumIterator;
#[test]
fn test_stratifier_count_and_indices() {
let mut seen = [false; Stratifier::COUNT];
for strat in Stratifier::iter() {
let idx = strat.index();
assert!(idx < Stratifier::COUNT, "index {idx} out of range for {strat}");
assert!(!seen[idx], "duplicate index {idx} for {strat}");
seen[idx] = true;
}
assert!(seen.iter().all(|&s| s), "not all indices covered");
}
}