use crate::{CurrRead, Error, ReadState};
use rust_htslib::bam;
use std::collections::BinaryHeap;
use std::rc::Rc;
#[expect(
clippy::arithmetic_side_effects,
reason = "sums can overflow but only if _huge_ amounts of data i.e. total sequence bp = (2^64-1)"
)]
fn get_stats_from_heap(
mut input: BinaryHeap<u64>,
total_length: u64,
) -> (u64, u64, u64, u64, u64, u64) {
let mut counter: u64 = 0;
let mut longest: u64 = 0;
let mut shortest: u64 = 0;
let mut median: u64 = 0;
let mut n50: u64 = 0;
let mut running_total_length: u64 = 0;
let heap_size: u64 = input.len().try_into().expect("heap cannot be this large");
while let Some(v) = input.pop() {
if counter == 0 {
longest = v;
}
running_total_length += v;
if median == 0 && counter > heap_size.div_ceil(2).saturating_sub(1) {
median = v;
}
if n50 == 0 && running_total_length > total_length.div_ceil(2) {
n50 = v;
}
shortest = v;
counter += 1;
}
assert_eq!(
running_total_length, total_length,
"running_total and pre-calculated total are expected to be equal"
);
let mean = total_length.checked_div(counter).unwrap_or(0u64);
(counter, mean, longest, shortest, median, n50)
}
#[expect(
clippy::arithmetic_side_effects,
reason = "sums can overflow but only if _huge_ amounts of data i.e. total sequence bp = (2^64-1)"
)]
pub fn run<W, D>(handle: &mut W, bam_records: D) -> Result<(), Error>
where
W: std::io::Write,
D: IntoIterator<Item = Result<Rc<bam::Record>, rust_htslib::errors::Error>>,
{
let mut primary_count: u64 = 0;
let mut secondary_count: u64 = 0;
let mut supplementary_count: u64 = 0;
let mut unmapped_count: u64 = 0;
let mut reversed_count: u64 = 0;
let mut seq_len_heap = BinaryHeap::<u64>::new();
let mut align_len_heap = BinaryHeap::<u64>::new();
let mut seq_len_total = 0;
let mut align_len_total = 0;
for r in bam_records {
let record = r?;
let curr_read = CurrRead::default()
.set_read_state_and_id(&record)?
.set_seq_len(&record)?;
if curr_read.read_state().strand() == '-' {
reversed_count += 1;
}
match curr_read.read_state() {
ReadState::PrimaryFwd | ReadState::PrimaryRev => {
primary_count += 1;
}
ReadState::SecondaryFwd | ReadState::SecondaryRev => {
secondary_count += 1;
}
ReadState::SupplementaryFwd | ReadState::SupplementaryRev => {
supplementary_count += 1;
}
ReadState::Unmapped => unmapped_count += 1,
}
if let Ok(v) = curr_read.seq_len() {
seq_len_total += v;
seq_len_heap.push(v);
}
match curr_read.set_align_len(&record) {
Ok(cr) => {
let v = cr.align_len()?;
align_len_total += v;
align_len_heap.push(v);
}
Err(Error::Unmapped(_)) => {}
Err(e) => return Err(e),
}
}
let (_, seq_len_mean, seq_len_max, seq_len_min, seq_len_median, seq_len_n50) =
get_stats_from_heap(seq_len_heap, seq_len_total);
let (_, align_len_mean, align_len_max, align_len_min, align_len_median, align_len_n50) =
get_stats_from_heap(align_len_heap, align_len_total);
writeln!(handle, "key\tvalue")?;
writeln!(handle, "n_primary_alignments\t{primary_count}")?;
writeln!(handle, "n_secondary_alignments\t{secondary_count}")?;
writeln!(handle, "n_supplementary_alignments\t{supplementary_count}")?;
writeln!(handle, "n_unmapped_reads\t{unmapped_count}")?;
writeln!(handle, "n_reversed_reads\t{reversed_count}")?;
writeln!(handle, "align_len_mean\t{align_len_mean}")?;
writeln!(handle, "align_len_max\t{align_len_max}")?;
writeln!(handle, "align_len_min\t{align_len_min}")?;
writeln!(handle, "align_len_median\t{align_len_median}")?;
writeln!(handle, "align_len_n50\t{align_len_n50}")?;
writeln!(handle, "seq_len_mean\t{seq_len_mean}")?;
writeln!(handle, "seq_len_max\t{seq_len_max}")?;
writeln!(handle, "seq_len_min\t{seq_len_min}")?;
writeln!(handle, "seq_len_median\t{seq_len_median}")?;
writeln!(handle, "seq_len_n50\t{seq_len_n50}")?;
Ok(())
}
#[cfg(test)]
mod tests {
use super::*;
use indoc::indoc;
use rust_htslib::bam::Read as _;
#[test]
fn read_stats_example_1() -> Result<(), Error> {
let mut output = Vec::new();
let mut bam_reader = bam::Reader::from_path("./examples/example_1.bam")
.expect("Failed to open example_1.bam");
let bam_records = bam_reader.rc_records();
run(&mut output, bam_records)?;
let output_str = String::from_utf8(output).expect("Invalid UTF-8 output");
let expected_output = indoc! {"key\tvalue
n_primary_alignments\t3
n_secondary_alignments\t0
n_supplementary_alignments\t0
n_unmapped_reads\t1
n_reversed_reads\t1
align_len_mean\t29
align_len_max\t48
align_len_min\t8
align_len_median\t8
align_len_n50\t48
seq_len_mean\t34
seq_len_max\t48
seq_len_min\t8
seq_len_median\t33
seq_len_n50\t48\n"};
assert_eq!(output_str, expected_output);
Ok(())
}
#[test]
fn read_stats_example_3() -> Result<(), Error> {
let mut output = Vec::new();
let mut bam_reader = bam::Reader::from_path("./examples/example_3.bam")
.expect("Failed to open example_3.bam");
let bam_records = bam_reader.rc_records();
run(&mut output, bam_records)?;
let output_str = String::from_utf8(output).expect("Invalid UTF-8 output");
let expected_output = indoc! {"key\tvalue
n_primary_alignments\t6
n_secondary_alignments\t2
n_supplementary_alignments\t2
n_unmapped_reads\t0
n_reversed_reads\t3
align_len_mean\t12
align_len_max\t20
align_len_min\t6
align_len_median\t12
align_len_n50\t15
seq_len_mean\t12
seq_len_max\t20
seq_len_min\t6
seq_len_median\t12
seq_len_n50\t15\n"};
assert_eq!(output_str, expected_output);
Ok(())
}
}