use std::path::{Path, PathBuf};
use anyhow::{Result, ensure};
use clap::Args;
use noodles::sam::Header;
use noodles::sam::alignment::RecordBuf;
use noodles::sam::alignment::record::Cigar as CigarTrait;
use noodles::sam::alignment::record::cigar::Op;
use noodles::sam::alignment::record::cigar::op::Kind;
use riker_derive::MetricDocs;
use serde::{Deserialize, Serialize};
use smallvec::SmallVec;
use crate::collector::Collector;
use crate::commands::command::Command;
use crate::commands::common::{InputOptions, OptionalReferenceOptions, OutputOptions};
use crate::fasta::Fasta;
use crate::intervals::{Interval, Intervals};
use crate::math::{safe_div, safe_div_f};
use crate::metrics::{serialize_f64_2dp, serialize_f64_5dp, serialize_f64_6dp, write_tsv};
use crate::overlapper::Overlapper;
use crate::progress::ProgressLogger;
use crate::sam::alignment_reader::AlignmentReader;
use crate::sam::record_utils::derive_sample;
use crate::sequence_dict::SequenceDictionary;
type AlignmentInfo = (u32, u32, u64, SmallVec<[(u32, u32); 8]>);
pub const METRICS_SUFFIX: &str = ".hybcap-metrics.txt";
pub const PER_TARGET_SUFFIX: &str = ".hybcap-per-target.txt";
pub const PER_BASE_SUFFIX: &str = ".hybcap-per-base.txt";
#[riker_derive::multi_options("hybcap", "Hybrid Capture Options")]
#[derive(Args, Debug, Clone)]
#[command()]
#[allow(clippy::struct_excessive_bools)]
pub struct HybCapOptions {
#[arg(long, value_name = "FILE")]
pub baits: PathBuf,
#[arg(long, value_name = "FILE")]
pub targets: PathBuf,
#[arg(long, default_value_t = false)]
pub include_duplicates: bool,
#[arg(long, value_name = "NAME")]
pub panel_name: Option<String>,
#[arg(long, default_value_t = HybCapOptions::DEFAULT_NEAR_DISTANCE)]
pub near_distance: u64,
#[arg(long, default_value_t = HybCapOptions::DEFAULT_MIN_MAPQ)]
pub min_mapping_quality: u8,
#[arg(long, default_value_t = HybCapOptions::DEFAULT_MIN_BASE_QUALITY)]
pub min_base_quality: u8,
#[arg(long, default_value_t = false)]
pub dont_clip_overlapping_reads: bool,
#[arg(long, default_value_t = HybCapOptions::DEFAULT_INCLUDE_INDELS)]
pub include_indels: bool,
#[arg(long, default_value_t = false)]
pub per_target_coverage: bool,
#[arg(long, default_value_t = false)]
pub per_base_coverage: bool,
}
impl HybCapOptions {
const DEFAULT_NEAR_DISTANCE: u64 = 250;
const DEFAULT_MIN_MAPQ: u8 = 20;
const DEFAULT_MIN_BASE_QUALITY: u8 = 20;
const DEFAULT_INCLUDE_INDELS: bool = false;
}
impl Default for HybCapOptions {
fn default() -> Self {
Self {
baits: PathBuf::new(),
targets: PathBuf::new(),
include_duplicates: false,
panel_name: None,
near_distance: Self::DEFAULT_NEAR_DISTANCE,
min_mapping_quality: Self::DEFAULT_MIN_MAPQ,
min_base_quality: Self::DEFAULT_MIN_BASE_QUALITY,
dont_clip_overlapping_reads: false,
include_indels: Self::DEFAULT_INCLUDE_INDELS,
per_target_coverage: false,
per_base_coverage: false,
}
}
}
#[derive(Args, Debug, Clone)]
#[command(
long_about,
after_long_help = "\
Examples:
riker hybcap -i input.bam -o out_prefix --baits baits.bed --targets targets.bed
riker hybcap -i input.bam -o out_prefix --baits baits.interval_list --targets targets.interval_list -R ref.fa
riker hybcap -i input.bam -o out_prefix --baits baits.bed --targets targets.bed --per-target-coverage"
)]
pub struct HybCap {
#[command(flatten)]
pub input: InputOptions,
#[command(flatten)]
pub output: OutputOptions,
#[command(flatten)]
pub reference: OptionalReferenceOptions,
#[command(flatten)]
pub options: HybCapOptions,
}
impl Command for HybCap {
fn execute(&self) -> Result<()> {
let (mut reader, header) =
AlignmentReader::new(&self.input.input, self.reference.reference.as_deref())?;
let sample = derive_sample(&self.input.input, &header);
let fasta = match &self.reference.reference {
Some(ref_path) => Some(Fasta::from_path(ref_path)?),
None => None,
};
let mut collector = HybCapCollector::new(&self.output.output, fasta, sample, &self.options);
collector.initialize(&header)?;
let mut progress = ProgressLogger::new("hybcap", "reads", 5_000_000);
for result in reader.record_bufs(&header) {
let record = result?;
collector.accept(&record, &header)?;
progress.record_with(&record, &header);
}
progress.finish();
collector.finish()?;
Ok(())
}
}
#[allow(clippy::struct_excessive_bools)]
pub struct HybCapCollector {
metrics_path: PathBuf,
per_target_path: PathBuf,
per_base_path: PathBuf,
baits_path: PathBuf,
targets_path: PathBuf,
fasta: Option<Fasta>,
sample: String,
panel_name: String,
include_duplicates: bool,
near_distance: u32,
min_mapq: u8,
min_bq: u8,
clip_overlapping: bool,
include_indels: bool,
output_per_target: bool,
output_per_base: bool,
dict: Option<SequenceDictionary>,
target_overlapper: Option<Overlapper<usize>>,
bait_overlapper: Option<Overlapper<Interval>>,
expanded_bait_overlapper: Option<Overlapper<()>>,
target_coverages: Vec<(Interval, TargetCoverage)>,
target_gc: Vec<f64>,
merged_bait_territory: u64,
merged_target_territory: u64,
raw_target_count: usize,
genome_size: u64,
total_reads: u64,
deduped_reads: u64,
deduped_reads_aligned: u64,
total_bases: u64,
bases_aligned: u64,
deduped_bases_aligned: u64,
on_bait_bases: u64,
near_bait_bases: u64,
off_bait_bases: u64,
selected_pairs: u64,
selected_unique_pairs: u64,
on_target_bases: u64,
on_target_bases_from_pairs: u64,
exc_dupe_bases: u64,
exc_mapq_bases: u64,
exc_baseq_bases: u64,
exc_overlap_bases: u64,
exc_off_target_bases: u64,
}
impl HybCapCollector {
#[allow(clippy::too_many_arguments)]
#[must_use]
pub fn new(
prefix: &Path,
fasta: Option<Fasta>,
sample: String,
options: &HybCapOptions,
) -> Self {
let panel_name = options.panel_name.clone().unwrap_or_else(|| {
options.baits.file_stem().and_then(|s| s.to_str()).unwrap_or("unknown").to_string()
});
Self {
metrics_path: super::command::output_path(prefix, METRICS_SUFFIX),
per_target_path: super::command::output_path(prefix, PER_TARGET_SUFFIX),
per_base_path: super::command::output_path(prefix, PER_BASE_SUFFIX),
baits_path: options.baits.clone(),
targets_path: options.targets.clone(),
fasta,
sample,
panel_name,
include_duplicates: options.include_duplicates,
#[expect(
clippy::cast_possible_truncation,
reason = "near_distance is a small padding value that fits in u32"
)]
near_distance: options.near_distance as u32,
min_mapq: options.min_mapping_quality,
min_bq: options.min_base_quality,
clip_overlapping: !options.dont_clip_overlapping_reads,
include_indels: options.include_indels,
output_per_target: options.per_target_coverage,
output_per_base: options.per_base_coverage,
dict: None,
target_overlapper: None,
bait_overlapper: None,
expanded_bait_overlapper: None,
target_coverages: Vec::new(),
target_gc: Vec::new(),
merged_bait_territory: 0,
merged_target_territory: 0,
raw_target_count: 0,
genome_size: 0,
total_reads: 0,
deduped_reads: 0,
deduped_reads_aligned: 0,
total_bases: 0,
bases_aligned: 0,
deduped_bases_aligned: 0,
on_bait_bases: 0,
near_bait_bases: 0,
off_bait_bases: 0,
selected_pairs: 0,
selected_unique_pairs: 0,
on_target_bases: 0,
on_target_bases_from_pairs: 0,
exc_dupe_bases: 0,
exc_mapq_bases: 0,
exc_baseq_bases: 0,
exc_overlap_bases: 0,
exc_off_target_bases: 0,
}
}
fn alignment_span_and_blocks(record: &RecordBuf) -> Option<AlignmentInfo> {
let start = record.alignment_start()?.get() - 1; let mut end = start;
let mut aligned: u64 = 0;
let mut blocks: SmallVec<[(u32, u32); 8]> = SmallVec::new();
for op in record.cigar().iter().flatten() {
match op.kind() {
Kind::Match | Kind::SequenceMatch | Kind::SequenceMismatch => {
#[expect(
clippy::cast_possible_truncation,
reason = "genomic coordinates fit in u32"
)]
let block_start = end as u32;
end += op.len();
aligned += op.len() as u64;
#[expect(
clippy::cast_possible_truncation,
reason = "genomic coordinates fit in u32"
)]
let block_end = end as u32;
blocks.push((block_start, block_end));
}
Kind::Deletion | Kind::Skip => {
end += op.len();
}
_ => {}
}
}
#[expect(clippy::cast_possible_truncation, reason = "genomic coordinates fit in u32")]
Some((start as u32, end as u32, aligned, blocks))
}
fn classify_bait_bases(
&mut self,
aligned_bases: u64,
on_expanded_bait: bool,
bait_hits: &[(u32, u32)],
blocks: &[(u32, u32)],
) {
if !on_expanded_bait {
self.off_bait_bases += aligned_bases;
return;
}
let mut on_bait: u64 = 0;
for &(block_start, block_end) in blocks {
for &(bait_start, bait_end) in bait_hits {
let overlap_start = block_start.max(bait_start);
let overlap_end = block_end.min(bait_end);
if overlap_start < overlap_end {
on_bait += u64::from(overlap_end - overlap_start);
}
}
}
self.on_bait_bases += on_bait;
self.near_bait_bases += aligned_bases.saturating_sub(on_bait);
}
#[expect(clippy::cast_possible_truncation, reason = "genomic coordinates fit in u32")]
fn compute_mate_clip_ref_pos(record: &RecordBuf) -> Option<u32> {
let flags = record.flags();
if !flags.is_segmented() || flags.is_unmapped() || flags.is_mate_unmapped() {
return None;
}
if record.reference_sequence_id() != record.mate_reference_sequence_id() {
return None;
}
let alignment_start = record.alignment_start()?.get();
let mate_start = record.mate_alignment_start()?.get();
if mate_start < alignment_start {
return None;
}
if mate_start == alignment_start && flags.is_first_segment() {
return None;
}
Some((mate_start - 1) as u32)
}
#[expect(clippy::too_many_lines, reason = "CIGAR walk is inherently detailed")]
#[expect(
clippy::cast_possible_truncation,
reason = "genomic coordinates and depths fit in u32/usize"
)]
fn walk_cigar_for_coverage(
&mut self,
record: &RecordBuf,
start: u32,
mate_clip_ref_pos: Option<u32>,
is_mapped_pair: bool,
target_idxs: &SmallVec<[usize; 4]>,
) {
let qual_bytes: &[u8] = record.quality_scores().as_ref();
let clip_ref_pos = mate_clip_ref_pos.unwrap_or(u32::MAX);
let cached_targets: SmallVec<[(u32, u32, usize); 4]> = target_idxs
.iter()
.map(|&idx| {
let (tgt, _) = &self.target_coverages[idx];
(tgt.start, tgt.end, idx)
})
.collect();
let mut ref_pos = start;
let mut read_offset: usize = 0;
let mut clipping_active = false;
let mut counted_mask: u64 = 0;
for op_result in record.cigar().iter() {
let op: Op = match op_result {
Ok(op) => op,
Err(_) => continue,
};
match op.kind() {
Kind::Match | Kind::SequenceMatch | Kind::SequenceMismatch => {
let block_end = ref_pos + op.len() as u32;
let (effective_len, clipped_count) = if block_end <= clip_ref_pos {
(op.len(), 0)
} else if ref_pos >= clip_ref_pos {
(0, op.len())
} else {
let unclipped = (clip_ref_pos - ref_pos) as usize;
(unclipped, op.len() - unclipped)
};
if clipped_count > 0 {
self.exc_overlap_bases += clipped_count as u64;
clipping_active = true;
}
if cached_targets.is_empty() {
if effective_len == op.len() && effective_len > 0 {
let quals = &qual_bytes[read_offset..read_offset + effective_len];
let low_qual =
quals.iter().filter(|&&q| q < self.min_bq).count() as u64;
self.exc_baseq_bases += low_qual;
self.exc_off_target_bases += effective_len as u64 - low_qual;
} else {
for i in 0..effective_len {
let qual = qual_bytes.get(read_offset + i).copied().unwrap_or(0);
if qual < self.min_bq {
self.exc_baseq_bases += 1;
} else {
self.exc_off_target_bases += 1;
}
}
}
} else {
let quals = &qual_bytes[read_offset..read_offset + effective_len];
for (i, &qual) in quals.iter().enumerate() {
if qual < self.min_bq {
self.exc_baseq_bases += 1;
continue;
}
let pos = ref_pos + i as u32;
let mut hit = false;
for (local_idx, &(tgt_start, tgt_end, idx)) in
cached_targets.iter().enumerate()
{
if pos >= tgt_start && pos < tgt_end {
hit = true;
self.target_coverages[idx]
.1
.add_base((pos - tgt_start) as usize);
if counted_mask & (1 << local_idx) == 0 {
self.target_coverages[idx].1.read_count += 1;
counted_mask |= 1 << local_idx;
}
}
}
if hit {
self.on_target_bases += 1;
if is_mapped_pair {
self.on_target_bases_from_pairs += 1;
}
} else {
self.exc_off_target_bases += 1;
}
}
}
ref_pos = block_end;
read_offset += op.len();
}
Kind::Insertion => {
if self.include_indels && !cached_targets.is_empty() {
let on_target = cached_targets.iter().any(|&(tgt_start, tgt_end, _)| {
ref_pos >= tgt_start && ref_pos < tgt_end
});
if on_target {
for i in 0..op.len() {
if clipping_active {
self.exc_overlap_bases += 1;
continue;
}
let qual = qual_bytes.get(read_offset + i).copied().unwrap_or(0);
if qual >= self.min_bq {
self.on_target_bases += 1;
if is_mapped_pair {
self.on_target_bases_from_pairs += 1;
}
}
}
}
}
read_offset += op.len();
}
Kind::Deletion => {
if self.include_indels && !cached_targets.is_empty() {
for i in 0..op.len() {
let pos = ref_pos + i as u32;
for &(tgt_start, tgt_end, idx) in &cached_targets {
if pos >= tgt_start && pos < tgt_end {
self.target_coverages[idx]
.1
.add_base((pos - tgt_start) as usize);
}
}
}
}
ref_pos += op.len() as u32;
}
Kind::SoftClip => {
read_offset += op.len();
}
Kind::Skip => {
ref_pos += op.len() as u32;
}
Kind::HardClip | Kind::Pad => {}
}
}
}
fn compute_all_target_gc(&mut self, dict: &SequenceDictionary) -> Vec<f64> {
let Some(fasta) = &mut self.fasta else {
return vec![0.0; self.target_coverages.len()];
};
let mut gc_fractions = Vec::with_capacity(self.target_coverages.len());
for (iv, _) in &self.target_coverages {
let contig_name = dict.get_by_index(iv.ref_id).map_or("", |m| m.name());
let gc = match fasta.fetch(contig_name, u64::from(iv.start), u64::from(iv.end)) {
Ok(bases) => Self::gc_fraction(&bases),
Err(_) => 0.0,
};
gc_fractions.push(gc);
}
gc_fractions
}
fn gc_fraction(bases: &[u8]) -> f64 {
let mut gc = 0u64;
let mut total = 0u64;
let mut n_count = 0u64;
for &b in bases {
match b {
b'G' | b'C' | b'g' | b'c' => {
gc += 1;
total += 1;
}
b'A' | b'T' | b'a' | b't' => {
total += 1;
}
_ => {
n_count += 1;
total += 1;
}
}
}
safe_div_f(gc as f64 + n_count as f64 * 0.5, total as f64)
}
fn estimate_library_size(read_pairs: u64, unique_pairs: u64) -> Option<u64> {
if read_pairs == 0 || unique_pairs == 0 || unique_pairs >= read_pairs {
return None;
}
let n = read_pairs as f64;
let c = unique_pairs as f64;
let f = |x: f64| -> f64 { c / x - 1.0 + (-n / x).exp() };
let mut lo = 1.0_f64;
let mut hi = 100.0_f64;
if f(lo * c) < 0.0 {
return None;
}
while f(hi * c) >= 0.0 {
hi *= 10.0;
if hi > 1e15 {
return None; }
}
for _ in 0..40 {
let mid = f64::midpoint(lo, hi);
let val = f(mid * c);
if val == 0.0 {
break;
}
if val > 0.0 {
lo = mid;
} else {
hi = mid;
}
}
#[expect(clippy::cast_sign_loss, reason = "result is always positive")]
#[expect(clippy::cast_possible_truncation, reason = "library size fits in u64")]
Some((c * (lo + hi) / 2.0) as u64)
}
fn calculate_hs_penalty(
library_size: Option<u64>,
mean_coverage: f64,
fold_80: f64,
on_target_pct: f64,
pairs: u64,
unique_pairs: u64,
coverage_goal: u32,
) -> f64 {
let lib_size = match library_size {
Some(ls) if ls > 0 => ls,
_ => return 0.0,
};
if mean_coverage == 0.0 || on_target_pct == 0.0 || pairs == 0 || unique_pairs == 0 {
return 0.0;
}
let unique_pair_goal_multiplier = (f64::from(coverage_goal) / mean_coverage) * fold_80;
let mut pair_multiplier = unique_pair_goal_multiplier;
let mut increment = 1.0_f64;
let mut going_up = unique_pair_goal_multiplier >= 1.0;
let mut final_pair_multiplier = pair_multiplier;
for _ in 0..10_000 {
let unique_pair_multiplier =
Self::estimate_roi(lib_size, pair_multiplier, pairs, unique_pairs);
let diff = (unique_pair_multiplier - unique_pair_goal_multiplier).abs();
if diff / unique_pair_goal_multiplier <= 0.001 {
final_pair_multiplier = pair_multiplier;
break;
}
let overshot = if going_up {
unique_pair_multiplier > unique_pair_goal_multiplier
} else {
unique_pair_multiplier < unique_pair_goal_multiplier
};
if overshot {
increment /= 2.0;
going_up = !going_up;
}
if going_up {
pair_multiplier += increment;
} else {
pair_multiplier -= increment;
}
final_pair_multiplier = pair_multiplier;
}
let unique_fraction = (unique_pairs as f64 * unique_pair_goal_multiplier)
/ (pairs as f64 * final_pair_multiplier);
if unique_fraction <= 0.0 {
return 0.0;
}
(1.0 / unique_fraction) * fold_80 * (1.0 / on_target_pct)
}
fn estimate_roi(library_size: u64, x: f64, pairs: u64, unique_pairs: u64) -> f64 {
let l = library_size as f64;
let n = pairs as f64;
let c = unique_pairs as f64;
l * (1.0 - (-(x * n) / l).exp()) / c
}
fn compute_dropout(&self) -> (f64, f64) {
if self.fasta.is_none() || self.target_coverages.is_empty() {
return (0.0, 0.0);
}
let mut target_bases_by_gc = [0u64; 101];
let mut aligned_bases_by_gc = [0u64; 101];
for (i, (iv, cov)) in self.target_coverages.iter().enumerate() {
let gc = self.target_gc.get(i).copied().unwrap_or(0.0);
#[expect(
clippy::cast_possible_truncation,
clippy::cast_sign_loss,
reason = "GC is 0.0-1.0, rounded to 0-100"
)]
let gc_bin = (gc * 100.0).round().min(100.0) as usize;
target_bases_by_gc[gc_bin] += u64::from(iv.len());
aligned_bases_by_gc[gc_bin] += cov.total_depth();
}
let total_target_bases: u64 = target_bases_by_gc.iter().sum();
let total_aligned_bases: u64 = aligned_bases_by_gc.iter().sum();
if total_target_bases == 0 || total_aligned_bases == 0 {
return (0.0, 0.0);
}
let total_target_f = total_target_bases as f64;
let total_aligned_f = total_aligned_bases as f64;
let mut at_dropout = 0.0_f64;
let mut gc_dropout = 0.0_f64;
for gc_bin in 0..=100 {
let target_pct = target_bases_by_gc[gc_bin] as f64 / total_target_f;
let aligned_pct = aligned_bases_by_gc[gc_bin] as f64 / total_aligned_f;
let dropout = (target_pct - aligned_pct).max(0.0) * 100.0;
if gc_bin <= 50 {
at_dropout += dropout;
}
if gc_bin >= 50 {
gc_dropout += dropout;
}
}
(at_dropout, gc_dropout)
}
}
impl Collector for HybCapCollector {
fn initialize(&mut self, header: &Header) -> Result<()> {
let dict = SequenceDictionary::from(header);
self.genome_size = dict.iter().map(|s| s.length() as u64).sum();
let raw_baits = Intervals::from_path(&self.baits_path, dict.clone())?;
let raw_targets = Intervals::from_path(&self.targets_path, dict.clone())?;
self.raw_target_count = raw_targets.raw_count();
ensure!(
raw_baits.count() > 0,
"No bait intervals loaded from {}",
self.baits_path.display()
);
ensure!(
raw_targets.count() > 0,
"No target intervals loaded from {}",
self.targets_path.display()
);
let merged_baits = raw_baits.merged();
let merged_targets = raw_targets.merged();
self.merged_bait_territory = merged_baits.territory();
self.merged_target_territory = merged_targets.territory();
log::info!(
"Loaded {} bait intervals ({} bp territory) and {} target intervals ({} bp territory)",
merged_baits.count(),
self.merged_bait_territory,
merged_targets.count(),
self.merged_target_territory
);
self.target_overlapper = Some(Overlapper::<usize>::new(
merged_targets.iter().enumerate().map(|(idx, iv)| (iv.ref_id, iv.start, iv.end, idx)),
));
self.bait_overlapper = Some(Overlapper::<Interval>::from_intervals(&merged_baits));
let expanded_baits = merged_baits.padded(self.near_distance).merged();
self.expanded_bait_overlapper = Some(Overlapper::<()>::new(
expanded_baits.iter().map(|iv| (iv.ref_id, iv.start, iv.end, ())),
));
for iv in merged_targets.iter() {
let len = iv.len() as usize;
self.target_coverages.push((iv.clone(), TargetCoverage::new(len)));
}
self.target_gc = self.compute_all_target_gc(&dict);
self.dict = Some(dict);
Ok(())
}
fn accept(&mut self, record: &RecordBuf, _header: &Header) -> Result<()> {
let flags = record.flags();
if flags.is_secondary() || flags.is_qc_fail() {
return Ok(());
}
let is_supplementary = flags.is_supplementary();
let is_mapped = !flags.is_unmapped();
let is_duplicate = flags.is_duplicate();
if !is_supplementary {
self.total_reads += 1;
if !is_duplicate || self.include_duplicates {
self.deduped_reads += 1;
if is_mapped {
self.deduped_reads_aligned += 1;
}
}
}
let span_info = if is_mapped { Self::alignment_span_and_blocks(record) } else { None };
let read_len = record.sequence().len() as u64;
if !is_supplementary {
self.total_bases += read_len;
}
if let Some((_, _, aligned_bases, _)) = &span_info {
self.bases_aligned += aligned_bases;
if !is_duplicate || self.include_duplicates {
self.deduped_bases_aligned += aligned_bases;
}
}
let Some(ref_id) = record.reference_sequence_id() else {
return Ok(()); };
let Some((start, end, aligned_bases, ref blocks)) = span_info else {
return Ok(());
};
let target_idxs: SmallVec<[usize; 4]> = self
.target_overlapper
.as_ref()
.map(|o| o.get_overlaps(ref_id, start, end).copied().collect())
.unwrap_or_default();
let bait_hits: SmallVec<[(u32, u32); 4]> = self
.bait_overlapper
.as_ref()
.map(|o| o.get_overlaps(ref_id, start, end).map(|iv| (iv.start, iv.end)).collect())
.unwrap_or_default();
let on_expanded_bait = if bait_hits.is_empty() {
self.expanded_bait_overlapper
.as_ref()
.is_some_and(|o| o.overlaps_any(ref_id, start, end))
} else {
true
};
self.classify_bait_bases(aligned_bases, on_expanded_bait, &bait_hits, blocks);
if !is_supplementary {
let is_first_of_pair =
flags.is_segmented() && flags.is_first_segment() && !flags.is_mate_unmapped();
if is_first_of_pair && on_expanded_bait {
self.selected_pairs += 1;
if !is_duplicate || self.include_duplicates {
self.selected_unique_pairs += 1;
}
}
}
if is_duplicate && !self.include_duplicates {
self.exc_dupe_bases += aligned_bases;
return Ok(());
}
let mapq = record.mapping_quality().map_or(0, |mq| mq.get());
if mapq < self.min_mapq {
self.exc_mapq_bases += aligned_bases;
return Ok(());
}
let mate_clip_ref_pos =
if self.clip_overlapping { Self::compute_mate_clip_ref_pos(record) } else { None };
let is_mapped_pair =
flags.is_segmented() && !flags.is_unmapped() && !flags.is_mate_unmapped();
self.walk_cigar_for_coverage(
record,
start,
mate_clip_ref_pos,
is_mapped_pair,
&target_idxs,
);
Ok(())
}
#[expect(clippy::too_many_lines, reason = "metric computation is inherently sequential")]
fn finish(&mut self) -> Result<()> {
let dict = self.dict.as_ref().unwrap();
let mut depth_histogram: Vec<u64> = Vec::new(); let mut total_target_depth: u64 = 0;
let mut min_depth = u64::MAX;
let mut max_depth: u64 = 0;
let mut zero_cvg_targets: usize = 0;
for (iv, cov) in &self.target_coverages {
let target_len = iv.len() as usize;
let target_max = cov.max_depth();
let target_min = cov.min_depth();
if target_max == 0 {
zero_cvg_targets += 1;
}
total_target_depth += cov.total_depth();
min_depth = min_depth.min(target_min);
max_depth = max_depth.max(target_max);
for &d in &cov.hq_depths {
let d_idx = d as usize;
if d_idx >= depth_histogram.len() {
depth_histogram.resize(d_idx + 1, 0);
}
depth_histogram[d_idx] += 1;
}
if target_len == 0 {
zero_cvg_targets += 1;
}
}
if min_depth == u64::MAX {
min_depth = 0;
}
let mean_target_coverage = safe_div(total_target_depth, self.merged_target_territory);
let median_target_coverage = compute_median(&depth_histogram, self.merged_target_territory);
let p20 = compute_percentile(&depth_histogram, self.merged_target_territory, 0.20);
let fold_80_base_penalty = if p20 > 0.0 { mean_target_coverage / p20 } else { 0.0 };
let bases_at_or_above = compute_bases_at_or_above(&depth_histogram);
let frac_at_threshold = |threshold: u64| -> f64 {
#[expect(clippy::cast_possible_truncation, reason = "coverage thresholds fit in usize")]
let t = threshold as usize;
if t < bases_at_or_above.len() {
safe_div(bases_at_or_above[t], self.merged_target_territory)
} else {
0.0
}
};
let (at_dropout, gc_dropout) = self.compute_dropout();
let hs_library_size =
Self::estimate_library_size(self.selected_pairs, self.selected_unique_pairs);
let on_target_pct = safe_div(self.on_target_bases, self.deduped_bases_aligned);
let mean_cov_for_penalty =
safe_div(self.on_target_bases_from_pairs, self.merged_target_territory);
let calc_penalty = |goal: u32| -> f64 {
Self::calculate_hs_penalty(
hs_library_size,
mean_cov_for_penalty,
fold_80_base_penalty,
on_target_pct,
self.selected_pairs,
self.selected_unique_pairs,
goal,
)
};
let total_bait_bases = self.on_bait_bases + self.near_bait_bases + self.off_bait_bases;
let selected_bases = self.on_bait_bases + self.near_bait_bases;
let on_bait_frac = safe_div(self.on_bait_bases, total_bait_bases);
let bait_genome_frac =
safe_div_f(self.merged_bait_territory as f64, self.genome_size as f64);
let fold_enrichment = safe_div_f(on_bait_frac, bait_genome_frac);
let metric = HybCapMetric {
sample: self.sample.clone(),
panel_name: self.panel_name.clone(),
genome_size: self.genome_size,
bait_territory: self.merged_bait_territory,
target_territory: self.merged_target_territory,
bait_design_efficiency: safe_div(
self.merged_target_territory,
self.merged_bait_territory,
),
total_reads: self.total_reads,
deduped_reads: self.deduped_reads,
deduped_reads_aligned: self.deduped_reads_aligned,
total_bases: self.total_bases,
bases_aligned: self.bases_aligned,
deduped_bases_aligned: self.deduped_bases_aligned,
on_bait_bases: self.on_bait_bases,
near_bait_bases: self.near_bait_bases,
off_bait_bases: self.off_bait_bases,
on_target_bases: self.on_target_bases,
on_target_bases_from_pairs: self.on_target_bases_from_pairs,
selected_pairs: self.selected_pairs,
selected_unique_pairs: self.selected_unique_pairs,
frac_deduped_reads: safe_div(self.deduped_reads, self.total_reads),
frac_deduped_reads_aligned: safe_div(self.deduped_reads_aligned, self.deduped_reads),
frac_selected_bases: safe_div(selected_bases, total_bait_bases),
frac_off_bait: safe_div(self.off_bait_bases, total_bait_bases),
on_bait_vs_selected: safe_div(self.on_bait_bases, selected_bases),
mean_bait_coverage: safe_div(self.on_bait_bases, self.merged_bait_territory),
mean_target_coverage,
median_target_coverage,
max_target_coverage: max_depth,
min_target_coverage: min_depth,
frac_uncovered_targets: safe_div(zero_cvg_targets as u64, self.raw_target_count as u64),
fold_enrichment,
fold_80_base_penalty,
frac_exc_dupe: safe_div(self.exc_dupe_bases, self.bases_aligned),
frac_exc_mapq: safe_div(self.exc_mapq_bases, self.bases_aligned),
frac_exc_overlap: safe_div(self.exc_overlap_bases, self.bases_aligned),
frac_exc_baseq: safe_div(self.exc_baseq_bases, self.bases_aligned),
frac_exc_off_target: safe_div(self.exc_off_target_bases, self.bases_aligned),
frac_target_bases_1x: frac_at_threshold(1),
frac_target_bases_10x: frac_at_threshold(10),
frac_target_bases_20x: frac_at_threshold(20),
frac_target_bases_30x: frac_at_threshold(30),
frac_target_bases_50x: frac_at_threshold(50),
frac_target_bases_100x: frac_at_threshold(100),
frac_target_bases_250x: frac_at_threshold(250),
frac_target_bases_500x: frac_at_threshold(500),
frac_target_bases_1000x: frac_at_threshold(1000),
at_dropout,
gc_dropout,
frac_usable_bases_on_bait: safe_div(self.on_bait_bases, self.total_bases),
frac_usable_bases_on_target: safe_div(self.on_target_bases, self.total_bases),
hs_library_size,
hs_penalty_10x: calc_penalty(10),
hs_penalty_20x: calc_penalty(20),
hs_penalty_30x: calc_penalty(30),
hs_penalty_40x: calc_penalty(40),
hs_penalty_50x: calc_penalty(50),
hs_penalty_100x: calc_penalty(100),
};
log::info!("Writing metrics to {}", self.metrics_path.display());
write_tsv(&self.metrics_path, &[metric])?;
if self.output_per_target {
let mut rows = Vec::with_capacity(self.target_coverages.len());
for (i, (iv, cov)) in self.target_coverages.iter().enumerate() {
let contig_name =
dict.get_by_index(iv.ref_id).map_or("?", |m| m.name()).to_string();
let gc_frac = self.target_gc.get(i).copied().unwrap_or(0.0);
let normalized = if mean_target_coverage > 0.0 {
cov.mean_depth() / mean_target_coverage
} else {
0.0
};
rows.push(PerTargetCoverage {
chrom: contig_name,
start: u64::from(iv.start) + 1,
end: u64::from(iv.end),
length: u64::from(iv.len()),
name: iv.name().to_string(),
gc_frac,
mean_coverage: cov.mean_depth(),
normalized_coverage: normalized,
min_coverage: cov.min_depth(),
max_coverage: cov.max_depth(),
frac_0x: cov.frac_at_0x(),
read_count: cov.read_count,
});
}
log::info!("Writing per-target coverage to {}", self.per_target_path.display());
write_tsv(&self.per_target_path, &rows)?;
}
if self.output_per_base {
let mut rows = Vec::new();
for (iv, cov) in &self.target_coverages {
let contig_name =
dict.get_by_index(iv.ref_id).map_or("?", |m| m.name()).to_string();
for (offset, &depth) in cov.hq_depths.iter().enumerate() {
rows.push(PerBaseCoverage {
chrom: contig_name.clone(),
pos: u64::from(iv.start) + offset as u64 + 1, target: iv.name().to_string(),
coverage: u64::from(depth),
});
}
}
log::info!("Writing per-base coverage to {}", self.per_base_path.display());
write_tsv(&self.per_base_path, &rows)?;
}
Ok(())
}
fn name(&self) -> &'static str {
"hybcap"
}
}
#[derive(Debug, Serialize, Deserialize, MetricDocs, Clone)]
pub struct HybCapMetric {
pub sample: String,
pub panel_name: String,
pub genome_size: u64,
pub bait_territory: u64,
pub target_territory: u64,
#[serde(serialize_with = "serialize_f64_6dp")]
pub bait_design_efficiency: f64,
pub total_reads: u64,
pub deduped_reads: u64,
pub deduped_reads_aligned: u64,
pub total_bases: u64,
pub bases_aligned: u64,
pub deduped_bases_aligned: u64,
pub on_bait_bases: u64,
pub near_bait_bases: u64,
pub off_bait_bases: u64,
pub on_target_bases: u64,
pub on_target_bases_from_pairs: u64,
pub selected_pairs: u64,
pub selected_unique_pairs: u64,
#[serde(serialize_with = "serialize_f64_6dp")]
pub frac_deduped_reads: f64,
#[serde(serialize_with = "serialize_f64_6dp")]
pub frac_deduped_reads_aligned: f64,
#[serde(serialize_with = "serialize_f64_6dp")]
pub frac_selected_bases: f64,
#[serde(serialize_with = "serialize_f64_6dp")]
pub frac_off_bait: f64,
#[serde(serialize_with = "serialize_f64_6dp")]
pub on_bait_vs_selected: f64,
#[serde(serialize_with = "serialize_f64_2dp")]
pub mean_bait_coverage: f64,
#[serde(serialize_with = "serialize_f64_2dp")]
pub mean_target_coverage: f64,
#[serde(serialize_with = "serialize_f64_2dp")]
pub median_target_coverage: f64,
pub max_target_coverage: u64,
pub min_target_coverage: u64,
#[serde(serialize_with = "serialize_f64_6dp")]
pub frac_uncovered_targets: f64,
#[serde(serialize_with = "serialize_f64_2dp")]
pub fold_enrichment: f64,
#[serde(serialize_with = "serialize_f64_2dp")]
pub fold_80_base_penalty: f64,
#[serde(serialize_with = "serialize_f64_6dp")]
pub frac_exc_dupe: f64,
#[serde(serialize_with = "serialize_f64_6dp")]
pub frac_exc_mapq: f64,
#[serde(serialize_with = "serialize_f64_6dp")]
pub frac_exc_overlap: f64,
#[serde(serialize_with = "serialize_f64_6dp")]
pub frac_exc_baseq: f64,
#[serde(serialize_with = "serialize_f64_6dp")]
pub frac_exc_off_target: f64,
#[serde(serialize_with = "serialize_f64_6dp")]
pub frac_target_bases_1x: f64,
#[serde(serialize_with = "serialize_f64_6dp")]
pub frac_target_bases_10x: f64,
#[serde(serialize_with = "serialize_f64_6dp")]
pub frac_target_bases_20x: f64,
#[serde(serialize_with = "serialize_f64_6dp")]
pub frac_target_bases_30x: f64,
#[serde(serialize_with = "serialize_f64_6dp")]
pub frac_target_bases_50x: f64,
#[serde(serialize_with = "serialize_f64_6dp")]
pub frac_target_bases_100x: f64,
#[serde(serialize_with = "serialize_f64_6dp")]
pub frac_target_bases_250x: f64,
#[serde(serialize_with = "serialize_f64_6dp")]
pub frac_target_bases_500x: f64,
#[serde(serialize_with = "serialize_f64_6dp")]
pub frac_target_bases_1000x: f64,
#[serde(serialize_with = "serialize_f64_2dp")]
pub at_dropout: f64,
#[serde(serialize_with = "serialize_f64_2dp")]
pub gc_dropout: f64,
#[serde(serialize_with = "serialize_f64_6dp")]
pub frac_usable_bases_on_bait: f64,
#[serde(serialize_with = "serialize_f64_6dp")]
pub frac_usable_bases_on_target: f64,
#[serde(serialize_with = "serialize_opt_u64")]
pub hs_library_size: Option<u64>,
#[serde(serialize_with = "serialize_f64_2dp")]
pub hs_penalty_10x: f64,
#[serde(serialize_with = "serialize_f64_2dp")]
pub hs_penalty_20x: f64,
#[serde(serialize_with = "serialize_f64_2dp")]
pub hs_penalty_30x: f64,
#[serde(serialize_with = "serialize_f64_2dp")]
pub hs_penalty_40x: f64,
#[serde(serialize_with = "serialize_f64_2dp")]
pub hs_penalty_50x: f64,
#[serde(serialize_with = "serialize_f64_2dp")]
pub hs_penalty_100x: f64,
}
#[derive(Debug, Serialize, Deserialize, MetricDocs)]
pub struct PerTargetCoverage {
pub chrom: String,
pub start: u64,
pub end: u64,
pub length: u64,
pub name: String,
#[serde(serialize_with = "serialize_f64_5dp")]
pub gc_frac: f64,
#[serde(serialize_with = "serialize_f64_2dp")]
pub mean_coverage: f64,
#[serde(serialize_with = "serialize_f64_5dp")]
pub normalized_coverage: f64,
pub min_coverage: u64,
pub max_coverage: u64,
#[serde(serialize_with = "serialize_f64_5dp")]
pub frac_0x: f64,
pub read_count: u64,
}
#[derive(Debug, Serialize, Deserialize, MetricDocs)]
pub struct PerBaseCoverage {
pub chrom: String,
pub pos: u64,
pub target: String,
pub coverage: u64,
}
struct TargetCoverage {
hq_depths: Vec<u32>,
read_count: u64,
}
impl TargetCoverage {
fn new(len: usize) -> Self {
Self { hq_depths: vec![0; len], read_count: 0 }
}
fn add_base(&mut self, offset: usize) {
if offset < self.hq_depths.len() {
self.hq_depths[offset] = self.hq_depths[offset].saturating_add(1);
}
}
fn total_depth(&self) -> u64 {
self.hq_depths.iter().map(|&d| u64::from(d)).sum()
}
fn min_depth(&self) -> u64 {
self.hq_depths.iter().copied().min().map_or(0, u64::from)
}
fn max_depth(&self) -> u64 {
self.hq_depths.iter().copied().max().map_or(0, u64::from)
}
fn mean_depth(&self) -> f64 {
if self.hq_depths.is_empty() {
0.0
} else {
self.total_depth() as f64 / self.hq_depths.len() as f64
}
}
fn frac_at_0x(&self) -> f64 {
if self.hq_depths.is_empty() {
0.0
} else {
let zeros = self.hq_depths.iter().filter(|&&d| d == 0).count();
zeros as f64 / self.hq_depths.len() as f64
}
}
}
fn compute_median(histogram: &[u64], total_bases: u64) -> f64 {
if total_bases == 0 {
return 0.0;
}
let mid = total_bases / 2;
let mut cumulative: u64 = 0;
for (depth, &count) in histogram.iter().enumerate() {
cumulative += count;
if cumulative > mid {
return depth as f64;
}
}
0.0
}
fn compute_percentile(histogram: &[u64], total_bases: u64, percentile: f64) -> f64 {
if total_bases == 0 {
return 0.0;
}
#[expect(
clippy::cast_possible_truncation,
clippy::cast_sign_loss,
reason = "percentile target is always in range"
)]
let target = (total_bases as f64 * percentile) as u64;
let mut cumulative: u64 = 0;
for (depth, &count) in histogram.iter().enumerate() {
cumulative += count;
if cumulative >= target {
return depth as f64;
}
}
0.0
}
fn compute_bases_at_or_above(histogram: &[u64]) -> Vec<u64> {
if histogram.is_empty() {
return Vec::new();
}
let mut result = vec![0u64; histogram.len()];
let mut cumulative: u64 = 0;
for i in (0..histogram.len()).rev() {
cumulative += histogram[i];
result[i] = cumulative;
}
result
}
#[allow(clippy::ref_option)]
fn serialize_opt_u64<S>(value: &Option<u64>, serializer: S) -> Result<S::Ok, S::Error>
where
S: serde::Serializer,
{
match value {
Some(v) => serializer.serialize_u64(*v),
None => serializer.serialize_str(""),
}
}
#[cfg(test)]
mod tests {
use super::*;
#[test]
fn test_compute_median_simple() {
let hist = vec![5, 10];
assert!((compute_median(&hist, 15) - 1.0).abs() < f64::EPSILON);
}
#[test]
fn test_compute_median_empty() {
assert!((compute_median(&[], 0)).abs() < f64::EPSILON);
}
#[test]
fn test_compute_percentile() {
let hist = vec![10, 10, 10];
assert!((compute_percentile(&hist, 30, 0.20)).abs() < f64::EPSILON); assert!((compute_percentile(&hist, 30, 0.50) - 1.0).abs() < f64::EPSILON); }
#[test]
fn test_compute_bases_at_or_above() {
let hist = vec![5, 10, 3, 2];
let result = compute_bases_at_or_above(&hist);
assert_eq!(result, vec![20, 15, 5, 2]);
}
#[test]
fn test_estimate_library_size() {
let result = HybCapCollector::estimate_library_size(1000, 900);
assert!(result.is_some());
let ls = result.unwrap();
assert!(ls > 900, "library size {ls} should be > unique pairs");
assert!(HybCapCollector::estimate_library_size(1000, 1000).is_none());
assert!(HybCapCollector::estimate_library_size(0, 0).is_none());
}
#[test]
fn test_estimate_roi() {
let lib_size = 10_000;
let pairs = 1000;
let unique = 900;
let roi = HybCapCollector::estimate_roi(lib_size, 1.0, pairs, unique);
assert!((roi - 1.0).abs() < 0.2, "ROI at 1x = {roi}");
let roi2 = HybCapCollector::estimate_roi(lib_size, 2.0, pairs, unique);
assert!(roi2 > roi, "ROI at 2x ({roi2}) should be > ROI at 1x ({roi})");
}
#[test]
fn test_hs_penalty_basic() {
let penalty = HybCapCollector::calculate_hs_penalty(
Some(100_000),
50.0, 1.5, 0.8, 10_000, 9_000, 30, );
assert!(penalty > 0.0, "penalty should be positive, got {penalty}");
}
#[test]
fn test_hs_penalty_no_library_size() {
let penalty =
HybCapCollector::calculate_hs_penalty(None, 50.0, 1.5, 0.8, 10_000, 9_000, 30);
assert!(penalty.abs() < f64::EPSILON);
}
}