use std::path::PathBuf;
use anyhow::{Context, Result};
use clap::Args;
use serde::Serialize;
use crate::cli::RunContext;
use crate::io::reader::open_input;
use crate::output::{write, HumanDisplay, OutputFormat};
use crate::utils::quality::{count_above_threshold, mean_quality};
use crate::utils::stats_helpers::{mean, median, min_max, nx};
#[derive(Args, Debug)]
pub struct StatsArgs {
pub input: PathBuf,
#[arg(long, conflicts_with = "tsv")]
pub json: bool,
#[arg(long, conflicts_with = "json")]
pub tsv: bool,
#[arg(short = 'e', long)]
pub extended: bool,
}
#[derive(Debug, Serialize)]
pub struct Stats {
pub file: String,
pub read_count: u64,
pub total_bases: u64,
pub min_length: u64,
pub max_length: u64,
pub mean_length: f64,
pub median_length: f64,
pub n50: u64,
pub n90: u64,
pub mean_quality: f64,
pub bases_above_q10_pct: f64,
pub bases_above_q20_pct: f64,
pub bases_above_q30_pct: f64,
pub gc_content_pct: f64,
}
pub fn run(args: StatsArgs, ctx: &RunContext) -> Result<()> {
if ctx.verbose {
eprintln!("[biolic stats] Reading from {}", args.input.display());
}
let stats = compute(&args)?;
let format = if args.json {
OutputFormat::Json
} else if args.tsv {
OutputFormat::Tsv
} else {
OutputFormat::auto()
};
write(&stats, format)?;
Ok(())
}
pub fn compute(args: &StatsArgs) -> Result<Stats> {
let mut reader = open_input(&args.input).context("opening input")?;
let mut read_count: u64 = 0;
let mut total_bases: u64 = 0;
let mut lengths: Vec<u64> = Vec::new();
let mut sum_error_prob: f64 = 0.0;
let mut bases_q10: u64 = 0;
let mut bases_q20: u64 = 0;
let mut bases_q30: u64 = 0;
let mut gc_count: u64 = 0;
let mut acgt_count: u64 = 0;
while let Some(record) = reader.next_record().context("reading record")? {
read_count += 1;
total_bases += record.len() as u64;
lengths.push(record.len() as u64);
if let Some(qual) = &record.qual {
for &q in qual {
sum_error_prob += crate::utils::quality::phred_to_prob(q);
}
bases_q10 += count_above_threshold(qual, 10) as u64;
bases_q20 += count_above_threshold(qual, 20) as u64;
bases_q30 += count_above_threshold(qual, 30) as u64;
}
let (a, c, g, t, _n, _other) = record.base_counts();
acgt_count += a + c + g + t;
gc_count += c + g;
}
let (min_length, max_length) = min_max(&lengths);
let mean_length = mean(&lengths);
let median_length = median(&lengths);
let n50_val = nx(&lengths, 50);
let n90_val = nx(&lengths, 90);
let mean_qual = if total_bases > 0 && sum_error_prob > 0.0 {
crate::utils::quality::prob_to_phred(sum_error_prob / total_bases as f64)
} else {
0.0
};
let bases_above_q10_pct = pct(bases_q10, total_bases);
let bases_above_q20_pct = pct(bases_q20, total_bases);
let bases_above_q30_pct = pct(bases_q30, total_bases);
let gc_content_pct = pct(gc_count, acgt_count);
let _ = (mean_quality(&[]), args.extended);
Ok(Stats {
file: args.input.display().to_string(),
read_count,
total_bases,
min_length,
max_length,
mean_length,
median_length,
n50: n50_val,
n90: n90_val,
mean_quality: mean_qual,
bases_above_q10_pct,
bases_above_q20_pct,
bases_above_q30_pct,
gc_content_pct,
})
}
fn pct(num: u64, denom: u64) -> f64 {
if denom == 0 {
0.0
} else {
100.0 * (num as f64) / (denom as f64)
}
}
impl HumanDisplay for Stats {
fn write_human(&self, w: &mut dyn std::io::Write) -> Result<()> {
writeln!(w, "File: {}", self.file)?;
writeln!(w, "Reads: {}", fmt_int(self.read_count))?;
writeln!(w, "Total bases: {}", fmt_int(self.total_bases))?;
writeln!(w, "Min length: {}", fmt_int(self.min_length))?;
writeln!(w, "Max length: {}", fmt_int(self.max_length))?;
writeln!(w, "Mean length: {:.1}", self.mean_length)?;
writeln!(w, "Median length: {:.1}", self.median_length)?;
writeln!(w, "N50: {}", fmt_int(self.n50))?;
writeln!(w, "N90: {}", fmt_int(self.n90))?;
writeln!(w, "Mean quality: {:.2}", self.mean_quality)?;
writeln!(w, "Bases above Q10: {:.2}%", self.bases_above_q10_pct)?;
writeln!(w, "Bases above Q20: {:.2}%", self.bases_above_q20_pct)?;
writeln!(w, "Bases above Q30: {:.2}%", self.bases_above_q30_pct)?;
writeln!(w, "GC content: {:.2}%", self.gc_content_pct)?;
Ok(())
}
fn write_tsv(&self, w: &mut dyn std::io::Write) -> Result<()> {
writeln!(
w,
"file\treads\ttotal_bases\tmin_length\tmax_length\tmean_length\tmedian_length\tn50\tn90\tmean_quality\tq10_pct\tq20_pct\tq30_pct\tgc_pct"
)?;
writeln!(
w,
"{}\t{}\t{}\t{}\t{}\t{:.1}\t{:.1}\t{}\t{}\t{:.2}\t{:.2}\t{:.2}\t{:.2}\t{:.2}",
self.file,
self.read_count,
self.total_bases,
self.min_length,
self.max_length,
self.mean_length,
self.median_length,
self.n50,
self.n90,
self.mean_quality,
self.bases_above_q10_pct,
self.bases_above_q20_pct,
self.bases_above_q30_pct,
self.gc_content_pct,
)?;
Ok(())
}
}
fn fmt_int(n: u64) -> String {
let s = n.to_string();
let mut out = String::new();
for (i, c) in s.chars().rev().enumerate() {
if i > 0 && i % 3 == 0 {
out.push(',');
}
out.push(c);
}
out.chars().rev().collect()
}