use niffler::get_reader;
use num_format::{Locale, ToFormattedString};
use rust_htslib::bam::{self, Read};
use std::fs;
use std::io::{self, BufRead};
fn read_bam(file: &str, threads: usize) -> Option<(Vec<String>, Vec<usize>)> {
let mut names = Vec::new();
let mut lengths = Vec::new();
let mut bam = bam::Reader::from_path(file).ok()?;
bam.set_threads(threads).ok()?;
for record in bam.records() {
let rec = record.ok()?;
if rec.is_unmapped() || (!rec.is_secondary() && !rec.is_supplementary()) {
names.push(String::from_utf8_lossy(rec.qname()).to_string());
lengths.push(rec.seq().len());
}
}
eprintln!("SAM/BAM read: {}", file);
Some((names, lengths))
}
fn read_bed(file: &str) -> Option<(Vec<String>, Vec<usize>)> {
let mut names = Vec::new();
let mut lengths = Vec::new();
let file = fs::File::open(file).ok()?;
let (reader, _compression) = get_reader(Box::new(file)).ok()?;
let reader = io::BufReader::new(reader);
for line in reader.lines() {
let line = line.ok()?;
if line.starts_with('#') {
continue;
}
let fields: Vec<&str> = line.split_whitespace().collect();
if fields.len() >= 3 {
let start: usize = fields[1].parse().ok()?;
let end: usize = fields[2].parse().ok()?;
names.push(fields[0].to_string());
lengths.push(end - start);
}
}
Some((names, lengths))
}
fn calc_stats(
lengths: &[usize],
quantiles: &[f64],
genome_size: Option<usize>,
) -> (usize, usize, f64, Vec<f64>, usize, usize, usize, f64) {
let n = lengths.len();
let total: usize = genome_size.unwrap_or_else(|| lengths.iter().sum());
let mut sorted_lengths = lengths.to_vec();
sorted_lengths.sort_unstable_by(|a, b| b.cmp(a));
let max = *sorted_lengths.first().unwrap_or(&0);
let min = *sorted_lengths.last().unwrap_or(&0);
let mean = total as f64 / n as f64;
let au_n: f64 = sorted_lengths.iter().map(|&x| (x * x) as f64).sum::<f64>() / total as f64;
let mut quantile_values = Vec::new();
for &q in quantiles {
let idx = (q * n as f64).ceil() as usize - 1;
quantile_values.push(sorted_lengths.get(idx).cloned().unwrap_or(0) as f64);
}
let mut cumulative = 0;
let mut n50 = 0;
for &len in &sorted_lengths {
cumulative += len;
if cumulative >= total / 2 {
n50 = len;
break;
}
}
(total, n, mean, quantile_values, min, max, n50, au_n)
}
pub fn h_fmt<T>(num: T) -> String
where
T: Into<f64> + Copy,
{
let mut num: f64 = num.into();
for unit in ["", "Kbp", "Mbp"] {
if num < 1000.0 {
return format!("{:.2}{}", num, unit);
}
num /= 1000.0;
}
format!("{:.2}{}", num, "Gbp")
}
pub fn seq_stats(
infiles: &[String],
threads: usize,
human_readable: bool,
quantiles: &[f64],
genome_size: Option<usize>,
) {
let mut output = String::from("file\ttotalBp\tnSeqs\tmean\tquantiles\tmin\tmax\tN50\tauN\n");
for file in infiles {
let lengths = if file.ends_with(".bam") || file.ends_with(".sam") || file.ends_with(".cram")
{
log::info!("Reading BAM/SAM/CRAM file: {}", file);
read_bam(file, threads)
} else if file.ends_with(".bed") || file.ends_with(".bed.gz") {
log::info!("Reading BED file: {}", file);
read_bed(file)
} else {
None
};
if let Some((_, lengths)) = lengths {
let (total, n, mean, quantile_values, min, max, n50, au_n) =
calc_stats(&lengths, quantiles, genome_size);
let quantile_str = quantile_values
.iter()
.map(|q| {
if human_readable {
h_fmt(*q)
} else {
q.to_string()
}
})
.collect::<Vec<_>>()
.join("\t");
let line = if human_readable {
format!(
"{}\t{}\t{}\t{:}\t{}\t{}\t{}\t{}\t{:}\n",
file,
h_fmt(total as f64),
n.to_formatted_string(&Locale::en),
h_fmt(mean),
quantile_str,
h_fmt(min as f64),
h_fmt(max as f64),
h_fmt(n50 as f64),
h_fmt(au_n)
)
} else {
format!(
"{}\t{}\t{}\t{:.2}\t{}\t{}\t{}\t{}\t{:.2}\n",
file, total, n, mean, quantile_str, min, max, n50, au_n
)
};
output.push_str(&line);
} else {
eprintln!("Skipping file: {}", file);
}
}
print!("{}", output);
}