use std::collections::HashSet;
use std::path::{Path, PathBuf};
use anyhow::Result;
use bitvec::prelude::*;
use clap::Args;
use kuva::plot::LinePlot;
use kuva::plot::legend::LegendPosition;
use kuva::render::layout::Layout;
use kuva::render::plots::Plot;
use noodles::sam::Header;
use noodles::sam::alignment::record::cigar::op::Kind;
use riker_derive::MetricDocs;
use serde::{Deserialize, Serialize};
use crate::collector::{Collector, drive_collector_single_threaded};
use crate::commands::command::Command;
use crate::commands::common::{InputOptions, OutputOptions, ReferenceOptions};
use crate::counter::Counter;
use crate::fasta::Fasta;
use crate::intervals::Intervals;
use crate::metrics::{serialize_f64_2dp, serialize_f64_5dp, write_tsv};
use crate::plotting::{FG_BLUE, FG_TEAL, PLOT_HEIGHT, PLOT_WIDTH, write_plot_pdf};
use crate::progress::ProgressLogger;
use crate::sam::alignment_reader::AlignmentReader;
use crate::sam::mate_buffer::{MateBuffer, Peek};
use crate::sam::record_utils::derive_sample;
use crate::sam::riker_record::{RikerRecord, RikerRecordRequirements};
use crate::sequence_dict::SequenceDictionary;
pub const METRICS_SUFFIX: &str = ".wgs-metrics.txt";
pub const COVERAGE_SUFFIX: &str = ".wgs-coverage.txt";
pub const PLOT_SUFFIX: &str = ".wgs-coverage.pdf";
#[riker_derive::multi_options("wgs", "WGS Options")]
#[derive(Args, Debug, Clone)]
#[command()]
pub struct WgsOptions {
#[arg(short = 'L', long, value_name = "FILE")]
pub intervals: Option<PathBuf>,
#[arg(long, default_value_t = false)]
pub include_duplicates: bool,
#[arg(long, default_value_t = WgsOptions::DEFAULT_EXCLUDE_UNPAIRED)]
pub exclude_unpaired_reads: bool,
#[arg(long, default_value_t = WgsOptions::DEFAULT_MIN_MAPQ)]
pub min_mapq: u8,
#[arg(long, default_value_t = WgsOptions::DEFAULT_MIN_BQ)]
pub min_bq: u8,
#[arg(long, default_value_t = WgsOptions::DEFAULT_COVERAGE_CAP)]
pub coverage_cap: u16,
}
impl WgsOptions {
const DEFAULT_EXCLUDE_UNPAIRED: bool = true;
const DEFAULT_MIN_MAPQ: u8 = 20;
const DEFAULT_MIN_BQ: u8 = 20;
const DEFAULT_COVERAGE_CAP: u16 = 250;
}
impl Default for WgsOptions {
fn default() -> Self {
Self {
intervals: None,
include_duplicates: false,
exclude_unpaired_reads: Self::DEFAULT_EXCLUDE_UNPAIRED,
min_mapq: Self::DEFAULT_MIN_MAPQ,
min_bq: Self::DEFAULT_MIN_BQ,
coverage_cap: Self::DEFAULT_COVERAGE_CAP,
}
}
}
#[derive(Args, Debug, Clone)]
#[command(
long_about,
after_long_help = "\
Examples:
riker wgs -i input.bam -o out_prefix -r ref.fa
riker wgs -i input.bam -o out_prefix -r ref.fa -L intervals.bed
riker wgs -i input.bam -o out_prefix -r ref.fa --coverage-cap 500"
)]
pub struct Wgs {
#[command(flatten)]
pub input: InputOptions,
#[command(flatten)]
pub output: OutputOptions,
#[command(flatten)]
pub reference: ReferenceOptions,
#[command(flatten)]
pub options: WgsOptions,
}
impl Command for Wgs {
fn execute(&self) -> Result<()> {
let mut reader = AlignmentReader::open(&self.input.input, Some(&self.reference.reference))?;
let reference = Fasta::from_path(&self.reference.reference)?;
let mut collector =
WgsCollector::new(&self.input.input, &self.output.output, reference, &self.options)?;
let mut progress = ProgressLogger::new("wgs", "reads", 5_000_000);
drive_collector_single_threaded(&mut reader, &mut collector, &mut progress)
}
}
pub struct WgsCollector {
metrics_path: PathBuf,
coverage_path: PathBuf,
plot_path: PathBuf,
plot_title: String,
input_path: PathBuf,
depth: ContigDepth,
mate_buffer: MateBuffer<CachedMate>,
reference: Fasta,
intervals: Option<Intervals>,
include_duplicates: bool,
include_unpaired: bool,
min_mapq: u8,
min_bq: u8,
coverage_cap: u16,
dict: Option<SequenceDictionary>,
current_ref_id: Option<usize>,
current_non_n: BitVec,
processed_contigs: HashSet<usize>,
genome_territory: u64,
depth_histogram: Vec<u64>, bases_excl_mapq: u64,
bases_excl_dupe: u64,
bases_excl_unpaired: u64,
bases_excl_baseq: u64,
bases_excl_overlap: u64,
bases_excl_capped: u64,
sample: String,
}
impl WgsCollector {
pub fn new(
input: &Path,
prefix: &Path,
reference: Fasta,
options: &WgsOptions,
) -> Result<Self> {
let intervals = options
.intervals
.as_ref()
.map(|path| Intervals::from_path(path, reference.dict().clone()))
.transpose()?;
let metrics_path = super::command::output_path(prefix, METRICS_SUFFIX);
let coverage_path = super::command::output_path(prefix, COVERAGE_SUFFIX);
let plot_path = super::command::output_path(prefix, PLOT_SUFFIX);
let hist_len = options.coverage_cap as usize + 1;
Ok(Self {
metrics_path,
coverage_path,
plot_path,
plot_title: String::new(),
input_path: input.to_path_buf(),
depth: ContigDepth::new(options.coverage_cap),
mate_buffer: MateBuffer::new(),
reference,
intervals,
include_duplicates: options.include_duplicates,
include_unpaired: !options.exclude_unpaired_reads,
min_mapq: options.min_mapq,
min_bq: options.min_bq,
coverage_cap: options.coverage_cap,
dict: None,
current_ref_id: None,
current_non_n: BitVec::EMPTY,
processed_contigs: HashSet::new(),
genome_territory: 0,
depth_histogram: vec![0u64; hist_len],
bases_excl_mapq: 0,
bases_excl_dupe: 0,
bases_excl_unpaired: 0,
bases_excl_baseq: 0,
bases_excl_overlap: 0,
bases_excl_capped: 0,
sample: String::new(),
})
}
#[expect(clippy::too_many_lines, reason = "sequential finalization logic")]
fn finish_metrics(&mut self) -> Result<()> {
self.finalize_contig();
self.current_ref_id = None;
let dict = self.dict.as_ref().unwrap();
let n_contigs = dict.len();
for ref_id in 0..n_contigs {
if self.processed_contigs.contains(&ref_id) {
continue;
}
if let Some(ref intervals) = self.intervals
&& !intervals.has_contig(ref_id)
{
continue;
}
let name = dict[ref_id].name();
match self.reference.load_contig(name, false) {
Err(e) => {
log::warn!("wgs: could not load contig '{name}': {e}");
}
Ok(seq) => {
let bv = build_non_n_bitvec(&seq);
let non_n = self.count_eligible_positions(ref_id, &bv);
self.genome_territory += non_n;
self.depth_histogram[0] += non_n;
}
}
}
let total = self.genome_territory;
if total == 0 {
log::warn!("wgs: genome territory is zero; writing empty metrics");
write_tsv(&self.metrics_path, &[] as &[WgsMetrics])?;
write_tsv(&self.coverage_path, &[] as &[WgsCoverageEntry])?;
return Ok(());
}
let hist_counter: Counter<u64> = self
.depth_histogram
.iter()
.enumerate()
.filter(|&(_, &c)| c > 0)
.map(|(d, &c)| (d as u64, c))
.collect();
let (mean, sd, sum_depth) = {
let n = total as f64;
let mut sum = 0.0_f64;
let mut sum_sq = 0.0_f64;
for (depth, &count) in self.depth_histogram.iter().enumerate() {
if count > 0 {
let d = depth as f64;
let c = count as f64;
sum += d * c;
sum_sq += d * d * c;
}
}
let mean = sum / n;
let variance = if total > 1 { (sum_sq - n * mean * mean) / (n - 1.0) } else { 0.0 };
(mean, variance.max(0.0).sqrt(), sum)
};
let (median, mad) = hist_counter.median_and_mad();
let n = self.depth_histogram.len();
let mut at_or_above = vec![0u64; n];
let mut running: u64 = 0;
for d in (0..n).rev() {
running += self.depth_histogram[d];
at_or_above[d] = running;
}
let frac_at = |nx: usize| -> f64 {
if nx >= n { 0.0 } else { at_or_above[nx] as f64 / total as f64 }
};
let fold_penalty = |lower_frac: f64| -> f64 {
let pct_depth = percentile_from_hist(&self.depth_histogram, lower_frac, total);
if pct_depth == 0.0 { 0.0 } else { mean / pct_depth }
};
let total_excl: u64 = self
.bases_excl_mapq
.saturating_add(self.bases_excl_dupe)
.saturating_add(self.bases_excl_unpaired)
.saturating_add(self.bases_excl_baseq)
.saturating_add(self.bases_excl_overlap)
.saturating_add(self.bases_excl_capped);
let total_raw = total_excl as f64 + sum_depth;
let frac_excl =
|n: u64| -> f64 { if total_raw == 0.0 { 0.0 } else { n as f64 / total_raw } };
let frac_excl_total = if total_raw == 0.0 { 0.0 } else { total_excl as f64 / total_raw };
let metrics = WgsMetrics {
sample: self.sample.clone(),
genome_territory: total,
mean_coverage: mean,
sd_coverage: sd,
median_coverage: median,
mad_coverage: mad,
frac_excluded_mapq: frac_excl(self.bases_excl_mapq),
frac_excluded_dupe: frac_excl(self.bases_excl_dupe),
frac_excluded_unpaired: frac_excl(self.bases_excl_unpaired),
frac_excluded_baseq: frac_excl(self.bases_excl_baseq),
frac_excluded_overlap: frac_excl(self.bases_excl_overlap),
frac_excluded_capped: frac_excl(self.bases_excl_capped),
frac_excluded_total: frac_excl_total,
frac_bases_at_1x: frac_at(1),
frac_bases_at_5x: frac_at(5),
frac_bases_at_10x: frac_at(10),
frac_bases_at_15x: frac_at(15),
frac_bases_at_20x: frac_at(20),
frac_bases_at_25x: frac_at(25),
frac_bases_at_30x: frac_at(30),
frac_bases_at_40x: frac_at(40),
frac_bases_at_50x: frac_at(50),
frac_bases_at_60x: frac_at(60),
frac_bases_at_100x: frac_at(100),
fold_80_base_penalty: fold_penalty(0.20),
fold_90_base_penalty: fold_penalty(0.10),
fold_95_base_penalty: fold_penalty(0.05),
};
write_tsv(&self.metrics_path, &[metrics])?;
let coverage_rows: Vec<WgsCoverageEntry> = self
.depth_histogram
.iter()
.enumerate()
.map(|(d, &bases)| {
let frac_bases = bases as f64 / total as f64;
let frac_bases_at_or_above = at_or_above[d] as f64 / total as f64;
WgsCoverageEntry {
sample: self.sample.clone(),
depth: d as u64,
bases,
frac_bases,
bases_at_or_above: at_or_above[d],
frac_bases_at_or_above,
}
})
.collect();
write_tsv(&self.coverage_path, &coverage_rows)?;
#[expect(
clippy::cast_possible_truncation,
clippy::cast_sign_loss,
reason = "percentile depth ≤ coverage_cap and non-negative"
)]
let x_max = percentile_from_hist(&self.depth_histogram, 0.999, total) as u64;
self.plot_histogram(x_max, mean)?;
log::info!(
"wgs: genome_territory={total}, mean_coverage={mean:.2}, \
metrics={}, coverage={}, plot={}",
self.metrics_path.display(),
self.coverage_path.display(),
self.plot_path.display(),
);
Ok(())
}
fn finalize_contig(&mut self) {
let Some(ref_id) = self.current_ref_id else {
return;
};
self.bases_excl_capped += self.depth.take_excl_capped();
let non_n_bv = std::mem::take(&mut self.current_non_n);
let intervals_bv = self.intervals.as_ref().map(|intervals| intervals.contig_bitvec(ref_id));
let contig_len = self.depth.len();
let capped = self.coverage_cap;
let mut eligible: u64 = 0;
for pos in 0..contig_len {
let depth = self.depth.take(pos);
let non_n_ok = pos < non_n_bv.len() && non_n_bv[pos];
let interval_ok = intervals_bv.as_ref().is_none_or(|bv| pos < bv.len() && bv[pos]);
if !non_n_ok || !interval_ok {
continue;
}
eligible += 1;
let idx = depth.min(capped) as usize;
self.depth_histogram[idx] += 1;
}
self.genome_territory += eligible;
}
fn plot_histogram(&self, x_max: u64, mean_coverage: f64) -> Result<()> {
if self.genome_territory == 0 || x_max == 0 {
return Ok(());
}
let territory = self.genome_territory as f64;
let x_max_idx = usize::try_from(x_max).expect("x_max ≤ coverage_cap (u16)");
let xy: Vec<(f64, f64)> = self
.depth_histogram
.iter()
.enumerate()
.take(x_max_idx + 1)
.map(|(d, &c)| (d as f64, c as f64 / territory))
.collect();
let covered_bases = territory - self.depth_histogram[0] as f64;
let covered_frac = covered_bases / territory;
let poisson_xy: Vec<(f64, f64)> =
(0..=x_max).map(|k| (k as f64, poisson_pmf(k, mean_coverage) * covered_frac)).collect();
let observed = LinePlot::new()
.with_data(xy)
.with_color(FG_BLUE)
.with_fill()
.with_fill_opacity(0.3)
.with_legend("Empirical");
let theoretical = LinePlot::new()
.with_data(poisson_xy)
.with_color(FG_TEAL)
.with_dashed()
.with_stroke_width(1.5)
.with_legend("Theoretical");
let plots: Vec<Plot> = vec![observed.into(), theoretical.into()];
let layout = Layout::auto_from_plots(&plots)
.with_width(PLOT_WIDTH)
.with_height(PLOT_HEIGHT)
.with_title(&self.plot_title)
.with_x_label("Coverage Depth (X)")
.with_y_label("Fraction of Genome")
.with_minor_ticks(5)
.with_show_minor_grid(true)
.with_legend_position(LegendPosition::InsideTopRight);
write_plot_pdf(plots, layout, &self.plot_path)
}
fn count_eligible_positions(&self, ref_id: usize, non_n: &BitVec) -> u64 {
match &self.intervals {
None => non_n.count_ones() as u64,
Some(intervals) => {
let ivl = intervals.contig_bitvec(ref_id);
if ivl.is_empty() {
return 0;
}
let len = non_n.len().min(ivl.len());
(non_n[..len].to_bitvec() & &ivl[..len]).count_ones() as u64
}
}
}
fn begin_contig(&mut self, ref_id: usize) -> Result<()> {
let name = self.dict.as_ref().unwrap().get_by_index(ref_id).map_or("", |m| m.name());
#[allow(clippy::cast_possible_truncation)]
let contig_len = self.reference.contig_length(name).map_or(0, |n| n as usize);
let seq = self.reference.load_contig(name, false)?;
self.current_non_n = build_non_n_bitvec(&seq);
self.depth.reset_for_contig(contig_len);
self.mate_buffer.clear();
self.current_ref_id = Some(ref_id);
self.processed_contigs.insert(ref_id);
Ok(())
}
#[inline]
fn walk_depth<A: DepthAction>(&mut self, record: &RikerRecord, mut action: A) {
let Some(pos_1based) = record.alignment_start() else {
return;
};
let ref_start_u64 = pos_1based.get() as u64 - 1;
let quals: &[u8] = record.quality_scores();
let contig_len_u64 = self.depth.len() as u64;
let min_bq = self.min_bq;
for (ref_off, read_off, len) in iter_aligned_blocks(record) {
let block_ref_start = ref_start_u64 + u64::from(ref_off);
if block_ref_start >= contig_len_u64 {
continue;
}
let block_ref_end = (block_ref_start + u64::from(len)).min(contig_len_u64);
#[allow(clippy::cast_possible_truncation)]
let usable = (block_ref_end - block_ref_start) as u32;
let read_start = (read_off as usize).min(quals.len());
let read_end = (read_start + usable as usize).min(quals.len());
let available = &quals[read_start..read_end];
#[allow(clippy::cast_possible_truncation)]
let base_pos = block_ref_start as u32;
for (i, &q) in available.iter().enumerate() {
if q < min_bq {
self.bases_excl_baseq += 1;
} else {
#[allow(clippy::cast_possible_truncation)]
let ref_pos = base_pos + i as u32;
if action.is_mate_covered(ref_pos) {
self.bases_excl_overlap += 1;
} else {
self.depth.push(ref_pos);
action.on_depth_counted(ref_pos);
}
}
}
self.bases_excl_baseq += (usable as usize - available.len()) as u64;
}
}
}
impl Collector for WgsCollector {
fn initialize(&mut self, header: &Header) -> Result<()> {
self.reference.validate_bam_header(header)?;
self.dict = Some(SequenceDictionary::from(header));
self.sample = derive_sample(&self.input_path, header);
self.plot_title = format!("Coverage Depth Distribution of {}", self.sample);
Ok(())
}
fn accept(&mut self, record: &RikerRecord, _header: &Header) -> Result<()> {
let flags = record.flags();
if flags.is_unmapped() || flags.is_secondary() || flags.is_supplementary() {
return Ok(());
}
if flags.is_qc_fail() {
return Ok(());
}
let mapq = record.mapping_quality().map_or(0, |m| m.get());
if mapq < self.min_mapq {
self.bases_excl_mapq += count_aligned_bases(record);
return Ok(());
}
if !self.include_duplicates && flags.is_duplicate() {
self.bases_excl_dupe += count_aligned_bases(record);
return Ok(());
}
if !self.include_unpaired {
let is_paired_mapped = flags.is_segmented() && !flags.is_mate_unmapped();
if !is_paired_mapped {
self.bases_excl_unpaired += count_aligned_bases(record);
return Ok(());
}
}
let Some(ref_id) = record.reference_sequence_id() else {
return Ok(());
};
if Some(ref_id) != self.current_ref_id {
if self.current_ref_id.is_some() {
self.finalize_contig();
}
self.begin_contig(ref_id)?;
}
match self.mate_buffer.probe(record) {
Peek::Alone => self.walk_depth(record, AloneAction),
Peek::WouldBuffer { overlap_start, overlap_len } => {
let mut bitmap = bitvec![u64, Lsb0; 0; overlap_len as usize];
self.walk_depth(record, BufferAction { overlap_start, bitmap: &mut bitmap });
self.mate_buffer.insert(record, CachedMate { overlap_start, bitmap });
}
Peek::PairWith(cached) => {
self.walk_depth(record, PairAction { cached: &cached });
}
}
Ok(())
}
fn finish(&mut self) -> Result<()> {
self.finish_metrics()
}
fn name(&self) -> &'static str {
"wgs"
}
fn field_needs(&self) -> RikerRecordRequirements {
RikerRecordRequirements::NONE
}
}
#[derive(Debug, Serialize, Deserialize, MetricDocs)]
pub struct WgsMetrics {
pub sample: String,
pub genome_territory: u64,
#[serde(serialize_with = "serialize_f64_2dp")]
pub mean_coverage: f64,
#[serde(serialize_with = "serialize_f64_2dp")]
pub sd_coverage: f64,
#[serde(serialize_with = "serialize_f64_2dp")]
pub median_coverage: f64,
#[serde(serialize_with = "serialize_f64_2dp")]
pub mad_coverage: f64,
#[serde(serialize_with = "serialize_f64_5dp")]
pub frac_excluded_mapq: f64,
#[serde(serialize_with = "serialize_f64_5dp")]
pub frac_excluded_dupe: f64,
#[serde(serialize_with = "serialize_f64_5dp")]
pub frac_excluded_unpaired: f64,
#[serde(serialize_with = "serialize_f64_5dp")]
pub frac_excluded_baseq: f64,
#[serde(serialize_with = "serialize_f64_5dp")]
pub frac_excluded_overlap: f64,
#[serde(serialize_with = "serialize_f64_5dp")]
pub frac_excluded_capped: f64,
#[serde(serialize_with = "serialize_f64_5dp")]
pub frac_excluded_total: f64,
#[serde(serialize_with = "serialize_f64_5dp")]
pub frac_bases_at_1x: f64,
#[serde(serialize_with = "serialize_f64_5dp")]
pub frac_bases_at_5x: f64,
#[serde(serialize_with = "serialize_f64_5dp")]
pub frac_bases_at_10x: f64,
#[serde(serialize_with = "serialize_f64_5dp")]
pub frac_bases_at_15x: f64,
#[serde(serialize_with = "serialize_f64_5dp")]
pub frac_bases_at_20x: f64,
#[serde(serialize_with = "serialize_f64_5dp")]
pub frac_bases_at_25x: f64,
#[serde(serialize_with = "serialize_f64_5dp")]
pub frac_bases_at_30x: f64,
#[serde(serialize_with = "serialize_f64_5dp")]
pub frac_bases_at_40x: f64,
#[serde(serialize_with = "serialize_f64_5dp")]
pub frac_bases_at_50x: f64,
#[serde(serialize_with = "serialize_f64_5dp")]
pub frac_bases_at_60x: f64,
#[serde(serialize_with = "serialize_f64_5dp")]
pub frac_bases_at_100x: f64,
#[serde(serialize_with = "serialize_f64_2dp")]
pub fold_80_base_penalty: f64,
#[serde(serialize_with = "serialize_f64_2dp")]
pub fold_90_base_penalty: f64,
#[serde(serialize_with = "serialize_f64_2dp")]
pub fold_95_base_penalty: f64,
}
#[derive(Debug, Serialize, Deserialize, MetricDocs)]
pub struct WgsCoverageEntry {
pub sample: String,
pub depth: u64,
pub bases: u64,
#[serde(serialize_with = "serialize_f64_5dp")]
pub frac_bases: f64,
pub bases_at_or_above: u64,
#[serde(serialize_with = "serialize_f64_5dp")]
pub frac_bases_at_or_above: f64,
}
struct ContigDepth {
depth: Vec<u16>,
coverage_cap: u16,
excl_capped: u64,
}
impl ContigDepth {
fn new(coverage_cap: u16) -> Self {
Self { depth: Vec::new(), coverage_cap, excl_capped: 0 }
}
fn len(&self) -> usize {
self.depth.len()
}
fn reset_for_contig(&mut self, contig_len: usize) {
self.depth.resize(contig_len, 0);
}
fn push(&mut self, ref_pos: u32) {
let pos = ref_pos as usize;
if pos >= self.depth.len() {
return;
}
let d = &mut self.depth[pos];
if *d < self.coverage_cap {
*d += 1;
} else {
self.excl_capped += 1;
}
}
fn take(&mut self, pos: usize) -> u16 {
std::mem::take(&mut self.depth[pos])
}
fn take_excl_capped(&mut self) -> u64 {
std::mem::take(&mut self.excl_capped)
}
}
struct CachedMate {
overlap_start: u32,
bitmap: BitVec<u64, Lsb0>,
}
impl CachedMate {
fn covered_at(&self, ref_pos: u32) -> bool {
if ref_pos < self.overlap_start {
return false;
}
let idx = (ref_pos - self.overlap_start) as usize;
if idx >= self.bitmap.len() {
return false;
}
self.bitmap[idx]
}
}
trait DepthAction {
#[inline]
fn is_mate_covered(&self, _ref_pos: u32) -> bool {
false
}
#[inline]
fn on_depth_counted(&mut self, _ref_pos: u32) {}
}
struct AloneAction;
impl DepthAction for AloneAction {}
struct BufferAction<'a> {
overlap_start: u32,
bitmap: &'a mut BitVec<u64, Lsb0>,
}
impl DepthAction for BufferAction<'_> {
#[inline]
fn on_depth_counted(&mut self, ref_pos: u32) {
if ref_pos >= self.overlap_start {
let idx = (ref_pos - self.overlap_start) as usize;
if idx < self.bitmap.len() {
self.bitmap.set(idx, true);
}
}
}
}
struct PairAction<'a> {
cached: &'a CachedMate,
}
impl DepthAction for PairAction<'_> {
#[inline]
fn is_mate_covered(&self, ref_pos: u32) -> bool {
self.cached.covered_at(ref_pos)
}
}
fn iter_aligned_blocks(record: &RikerRecord) -> impl Iterator<Item = (u32, u32, u32)> + '_ {
let mut ref_off: u32 = 0;
let mut read_off: u32 = 0;
record.cigar_ops().filter_map(move |op| {
#[allow(clippy::cast_possible_truncation)]
let len = op.len() as u32;
match op.kind() {
Kind::Match | Kind::SequenceMatch | Kind::SequenceMismatch => {
let out = (ref_off, read_off, len);
ref_off += len;
read_off += len;
Some(out)
}
Kind::Insertion | Kind::SoftClip => {
read_off += len;
None
}
Kind::Deletion | Kind::Skip => {
ref_off += len;
None
}
Kind::HardClip | Kind::Pad => None,
}
})
}
fn poisson_pmf(k: u64, lambda: f64) -> f64 {
if lambda <= 0.0 {
return if k == 0 { 1.0 } else { 0.0 };
}
let log_pmf = k as f64 * lambda.ln() - lambda - ln_factorial(k);
log_pmf.exp()
}
fn ln_factorial(k: u64) -> f64 {
(2..=k).map(|i| (i as f64).ln()).sum()
}
fn count_aligned_bases(record: &RikerRecord) -> u64 {
let mut count: u64 = 0;
for op in record.cigar_ops() {
match op.kind() {
Kind::Match | Kind::SequenceMatch | Kind::SequenceMismatch => {
count += op.len() as u64;
}
_ => {}
}
}
count
}
fn build_non_n_bitvec(seq: &[u8]) -> BitVec {
const W: usize = usize::BITS as usize;
let len = seq.len();
let mut words = Vec::with_capacity(len.div_ceil(W));
for chunk in seq.chunks(W) {
let mut word: usize = 0;
for (j, &b) in chunk.iter().enumerate() {
if b != b'N' && b != b'n' {
word |= 1 << j;
}
}
words.push(word);
}
let mut bv = BitVec::from_vec(words);
bv.truncate(len);
bv
}
#[expect(
clippy::cast_possible_truncation,
clippy::cast_sign_loss,
reason = "depth indices and totals fit safely in u64 conversions"
)]
fn percentile_from_hist(hist: &[u64], lower_frac: f64, total: u64) -> f64 {
if total == 0 {
return 0.0;
}
let threshold = (total as f64 * lower_frac).ceil() as u64;
let mut running: u64 = 0;
for (d, &count) in hist.iter().enumerate() {
running += count;
if running >= threshold {
return d as f64;
}
}
(hist.len() - 1) as f64
}
#[cfg(test)]
mod tests {
use super::*;
#[test]
fn test_percentile_from_hist_basic() {
let hist = [1u64, 1, 1];
assert!((percentile_from_hist(&hist, 0.20, 3) - 0.0).abs() < f64::EPSILON);
}
#[test]
fn test_percentile_from_hist_80pct() {
let mut hist = vec![0u64; 11];
hist[10] = 10;
assert!((percentile_from_hist(&hist, 0.20, 10) - 10.0).abs() < f64::EPSILON);
}
#[test]
fn test_percentile_empty_histogram() {
assert!((percentile_from_hist(&[], 0.5, 0) - 0.0).abs() < f64::EPSILON);
}
#[test]
fn test_percentile_single_bin() {
let hist = [10u64];
assert!((percentile_from_hist(&hist, 0.5, 10) - 0.0).abs() < f64::EPSILON);
}
#[test]
fn test_percentile_frac_zero() {
let mut hist = vec![0u64; 11];
hist[0] = 5;
hist[10] = 5;
assert!((percentile_from_hist(&hist, 0.0, 10) - 0.0).abs() < f64::EPSILON);
}
#[test]
fn test_percentile_frac_one() {
let mut hist = vec![0u64; 11];
hist[5] = 5;
hist[10] = 5;
assert!((percentile_from_hist(&hist, 1.0, 10) - 10.0).abs() < f64::EPSILON);
}
#[test]
fn test_percentile_two_bins() {
let mut hist = vec![0u64; 11];
hist[5] = 5;
hist[10] = 5;
assert!((percentile_from_hist(&hist, 0.5, 10) - 5.0).abs() < f64::EPSILON);
}
#[test]
fn test_contig_depth_basic_push_and_take() {
let mut d = ContigDepth::new(250);
d.reset_for_contig(10);
d.push(3);
d.push(3);
d.push(7);
assert_eq!(d.take(0), 0);
assert_eq!(d.take(3), 2);
assert_eq!(d.take(7), 1);
assert_eq!(d.take(3), 0);
}
#[test]
fn test_contig_depth_saturates_at_cap() {
let mut d = ContigDepth::new(3);
d.reset_for_contig(5);
for _ in 0..10 {
d.push(2);
}
assert_eq!(d.take(2), 3);
assert_eq!(d.take_excl_capped(), 7);
}
#[test]
fn test_contig_depth_resize_between_contigs() {
let mut d = ContigDepth::new(250);
d.reset_for_contig(10);
d.push(5);
d.take(5);
d.reset_for_contig(100);
assert_eq!(d.len(), 100);
d.push(99);
assert_eq!(d.take(99), 1);
}
#[test]
fn test_contig_depth_ignores_out_of_range() {
let mut d = ContigDepth::new(250);
d.reset_for_contig(10);
d.push(42); assert_eq!(d.take_excl_capped(), 0);
assert_eq!(d.take(0), 0);
}
fn cached_mate(overlap_start: u32, overlap_len: u32, covered: &[u32]) -> CachedMate {
let mut bitmap = bitvec![u64, Lsb0; 0; overlap_len as usize];
for &ref_pos in covered {
assert!(ref_pos >= overlap_start);
let idx = (ref_pos - overlap_start) as usize;
assert!(idx < overlap_len as usize);
bitmap.set(idx, true);
}
CachedMate { overlap_start, bitmap }
}
#[test]
fn test_cached_mate_covered_at_contiguous_region() {
let cached = cached_mate(100, 10, &[100, 101, 102, 103, 104, 105, 106, 107, 108, 109]);
assert!(cached.covered_at(100));
assert!(cached.covered_at(105));
assert!(cached.covered_at(109));
assert!(!cached.covered_at(99));
assert!(!cached.covered_at(110));
}
#[test]
fn test_cached_mate_covered_at_with_gap() {
let cached = cached_mate(100, 12, &[100, 101, 102, 103, 104, 107, 108, 109, 110, 111]);
assert!(cached.covered_at(100));
assert!(cached.covered_at(104));
assert!(!cached.covered_at(105));
assert!(!cached.covered_at(106));
assert!(cached.covered_at(107));
assert!(cached.covered_at(111));
}
#[test]
fn test_cached_mate_covered_at_with_bq_fail_gaps() {
let cached = cached_mate(100, 10, &[100, 101, 102, 104, 105, 106, 107, 108, 109]);
assert!(cached.covered_at(102));
assert!(!cached.covered_at(103)); assert!(cached.covered_at(104));
}
#[test]
fn test_cached_mate_covered_at_out_of_range() {
let cached = cached_mate(100, 10, &[100, 109]);
assert!(!cached.covered_at(50));
assert!(!cached.covered_at(99));
assert!(!cached.covered_at(110));
assert!(!cached.covered_at(u32::MAX));
}
}