use std::collections::HashMap;
use std::io::Write;
use std::path::Path;
use anyhow::{Context, Result};
use log::debug;
use super::bam_stat::{BamStatResult, GcDepthBin};
/// Render `v` with exactly one digit after the decimal point,
/// matching the `%.1f` fields of `samtools stats` SN lines.
fn fmt_1dp(v: f64) -> String {
    format!("{:.1}", v)
}
/// Render `v` in C-style `%e` scientific notation: six fractional digits
/// in the mantissa and a signed, zero-padded two-digit exponent
/// (e.g. `1.500000e-02`). Rust's `{:.6e}` emits `1.500000e-2` with no
/// `+` sign and no zero padding, so the exponent is rewritten here.
fn fmt_sci(v: f64) -> String {
    let raw = format!("{v:.6e}");
    match raw.find('e') {
        // No exponent marker: return Rust's rendering unchanged.
        None => raw,
        Some(e_pos) => {
            let mantissa = &raw[..e_pos];
            let exponent = &raw[e_pos + 1..];
            // Split an optional leading sign off the exponent digits;
            // a missing sign means the exponent is non-negative.
            let (sign, magnitude) = match exponent.as_bytes().first() {
                Some(b'-') => ("-", &exponent[1..]),
                Some(b'+') => ("+", &exponent[1..]),
                _ => ("+", exponent),
            };
            let value: i32 = magnitude.parse().unwrap_or(0);
            format!("{mantissa}e{sign}{value:02}")
        }
    }
}
/// Serialize a [`BamStatResult`] to `output_path` in the `samtools stats`
/// text format (CHK, SN, FFQ/LFQ, GCF/GCL, GCC/GCT, FBC/LBC, FTC/LTC, IS,
/// RL/FRL/LRL, MAPQ, ID, IC, COV and GCD sections), so downstream tools
/// such as MultiQC and `plot-bamstats` can parse it.
///
/// # Errors
/// Returns an error if the output file cannot be created or any write fails.
pub fn write_stats(result: &BamStatResult, output_path: &Path) -> Result<()> {
    let mut out = std::fs::File::create(output_path)
        .with_context(|| format!("Failed to create stats file: {}", output_path.display()))?;
    // NOTE: the phrase "produced by samtools stats" is what MultiQC keys on
    // to recognise this report format (see the unit test below); keep intact.
    writeln!(out, "# This file was produced by samtools stats and RustQC")?;
    writeln!(
        out,
        "# The command line was: rustqc rna (samtools stats compatible output)"
    )?;
    // Primary reads only; QC-failed reads are reported separately and
    // subtracted (saturating, so an inconsistent input cannot underflow)
    // to obtain the "sequences" count.
    let raw_total = result.primary_count;
    let filtered = result.qc_failed;
    let sequences = raw_total.saturating_sub(filtered);
    // Average lengths are rounded to whole bases; all divisions guard
    // against empty input to avoid NaN.
    let avg_len = if raw_total > 0 {
        (result.total_len as f64 / raw_total as f64).round()
    } else {
        0.0
    };
    let avg_first = if result.first_fragments > 0 {
        (result.total_first_fragment_len as f64 / result.first_fragments as f64).round()
    } else {
        0.0
    };
    let avg_last = if result.last_fragments > 0 {
        (result.total_last_fragment_len as f64 / result.last_fragments as f64).round()
    } else {
        0.0
    };
    let avg_quality = if result.quality_count > 0 {
        result.quality_sum / result.quality_count as f64
    } else {
        0.0
    };
    // Insert-size mean/SD over the "bulk" of the distribution: insert sizes
    // are accumulated in ascending order until >99% of pairs are covered,
    // trimming the long tail the same way samtools stats does.
    let (avg_insert, insert_sd) = {
        let total_count: u64 = result.is_hist.values().map(|v| v[0]).sum();
        if total_count > 0 {
            let mut sorted_isizes: Vec<(u64, u64)> =
                result.is_hist.iter().map(|(&k, v)| (k, v[0])).collect();
            sorted_isizes.sort_by_key(|&(k, _)| k);
            let mut bulk_count = 0u64;
            let mut weighted_sum = 0.0f64;
            // One past the largest insert size with a non-zero count seen so
            // far; used as the SD loop's upper bound below.
            let mut ibulk: u64 = 0;
            for &(isize_val, count) in &sorted_isizes {
                if count > 0 {
                    ibulk = isize_val + 1;
                }
                bulk_count += count;
                weighted_sum += isize_val as f64 * count as f64;
                if bulk_count as f64 / total_count as f64 > 0.99 {
                    break;
                }
            }
            if bulk_count > 0 {
                let mean = weighted_sum / bulk_count as f64;
                let mut sd_sum = 0.0f64;
                for &(isize_val, count) in &sorted_isizes {
                    // NOTE(review): insert size 0 contributes to the mean
                    // above but is skipped here; presumably this mirrors
                    // samtools stats — confirm against its output.
                    if isize_val == 0 {
                        continue;
                    }
                    if isize_val >= ibulk {
                        break;
                    }
                    let diff = isize_val as f64 - mean;
                    sd_sum += count as f64 * diff * diff;
                }
                let sd = (sd_sum / bulk_count as f64).sqrt();
                (mean, sd)
            } else {
                (0.0, 0.0)
            }
        } else {
            (0.0, 0.0)
        }
    };
    // Error rate = NM-field mismatches over CIGAR-derived mapped bases.
    let error_rate = if result.bases_mapped_cigar > 0 {
        result.mismatches as f64 / result.bases_mapped_cigar as f64
    } else {
        0.0
    };
    writeln!(
        out,
        "# CHK, Pair of checksum values for read names, sequences and qualities, calculated using a CRC32 algorithm."
    )?;
    writeln!(
        out,
        "CHK\t{:08x}\t{:08x}\t{:08x}",
        result.chk[0], result.chk[1], result.chk[2]
    )?;
    // --- SN (summary numbers) section, in samtools' canonical key order ---
    sn(
        &mut out,
        "raw total sequences:",
        raw_total,
        "excluding supplementary and secondary reads",
    )?;
    sn_no_comment(&mut out, "filtered sequences:", filtered)?;
    sn_no_comment(&mut out, "sequences:", sequences)?;
    // NOTE(review): hard-coded 1 assumes the input BAM is coordinate-sorted;
    // nothing in this function verifies that — confirm upstream.
    sn(&mut out, "is sorted:", 1_u64, "sorted by coordinate")?;
    sn_no_comment(&mut out, "1st fragments:", result.first_fragments)?;
    sn_no_comment(&mut out, "last fragments:", result.last_fragments)?;
    sn_no_comment(&mut out, "reads mapped:", result.primary_mapped)?;
    sn(
        &mut out,
        "reads mapped and paired:",
        result.reads_mapped_and_paired,
        "paired-end technology bit set + both mates mapped",
    )?;
    sn_no_comment(&mut out, "reads unmapped:", result.unmapped)?;
    sn(
        &mut out,
        "reads properly paired:",
        result.properly_paired,
        "proper-pair bit set",
    )?;
    sn(
        &mut out,
        "reads paired:",
        result.paired_flagstat,
        "paired-end technology bit set",
    )?;
    sn(
        &mut out,
        "reads duplicated:",
        result.primary_duplicates,
        "PCR or optical duplicate bit set",
    )?;
    sn(&mut out, "reads MQ0:", result.reads_mq0, "mapped and MQ=0")?;
    sn_no_comment(&mut out, "reads QC failed:", filtered)?;
    sn_no_comment(
        &mut out,
        "non-primary alignments:",
        result.secondary + result.supplementary,
    )?;
    sn_no_comment(&mut out, "supplementary alignments:", result.supplementary)?;
    sn(
        &mut out,
        "total length:",
        result.total_len,
        "ignores clipping",
    )?;
    sn(
        &mut out,
        "total first fragment length:",
        result.total_first_fragment_len,
        "ignores clipping",
    )?;
    sn(
        &mut out,
        "total last fragment length:",
        result.total_last_fragment_len,
        "ignores clipping",
    )?;
    sn(
        &mut out,
        "bases mapped:",
        result.bases_mapped,
        "ignores clipping",
    )?;
    sn(
        &mut out,
        "bases mapped (cigar):",
        result.bases_mapped_cigar,
        "more accurate",
    )?;
    // No trimming is performed in this pipeline, so the counter is fixed at 0.
    sn_no_comment(&mut out, "bases trimmed:", 0_u64)?;
    sn_no_comment(&mut out, "bases duplicated:", result.bases_duplicated)?;
    sn(&mut out, "mismatches:", result.mismatches, "from NM fields")?;
    sn(
        &mut out,
        "error rate:",
        fmt_sci(error_rate),
        "mismatches / bases mapped (cigar)",
    )?;
    sn_no_comment(&mut out, "average length:", avg_len as u64)?;
    sn_no_comment(&mut out, "average first fragment length:", avg_first as u64)?;
    sn_no_comment(&mut out, "average last fragment length:", avg_last as u64)?;
    sn_no_comment(&mut out, "maximum length:", result.max_len)?;
    sn_no_comment(
        &mut out,
        "maximum first fragment length:",
        result.max_first_fragment_len,
    )?;
    sn_no_comment(
        &mut out,
        "maximum last fragment length:",
        result.max_last_fragment_len,
    )?;
    sn_no_comment(&mut out, "average quality:", fmt_1dp(avg_quality))?;
    sn_no_comment(&mut out, "insert size average:", fmt_1dp(avg_insert))?;
    sn_no_comment(
        &mut out,
        "insert size standard deviation:",
        fmt_1dp(insert_sd),
    )?;
    // Orientation counters are accumulated per read, so halve to report pairs.
    sn_no_comment(&mut out, "inward oriented pairs:", result.inward_pairs / 2)?;
    sn_no_comment(
        &mut out,
        "outward oriented pairs:",
        result.outward_pairs / 2,
    )?;
    sn_no_comment(
        &mut out,
        "pairs with other orientation:",
        result.other_orientation / 2,
    )?;
    sn_no_comment(
        &mut out,
        "pairs on different chromosomes:",
        result.mate_diff_chr_mapq5,
    )?;
    sn_no_comment(
        &mut out,
        "percentage of properly paired reads (%):",
        fmt_1dp(if raw_total > 0 {
            100.0 * result.properly_paired as f64 / raw_total as f64
        } else {
            0.0
        }),
    )?;
    // --- Per-section tables, one helper per samtools section tag ---
    write_ffq_lfq(&mut out, "FFQ", "first fragment", &result.ffq)?;
    write_ffq_lfq(&mut out, "LFQ", "last fragment", &result.lfq)?;
    write_gc_content(&mut out, "GCF", "first fragments", &result.gcf)?;
    write_gc_content(&mut out, "GCL", "last fragments", &result.gcl)?;
    // GCC combines first- and last-fragment base counts per cycle.
    write_gcc(
        &mut out,
        "GCC",
        "ACGT content per cycle",
        &result.fbc_ro,
        &result.lbc_ro,
    )?;
    write_gct(
        &mut out,
        "GCT",
        "ACGT content per cycle, read-oriented (display complement of bases of reverse reads)",
        &result.gcc_rc,
    )?;
    write_base_counts(&mut out, "FBC", "first fragments", &result.fbc_ro)?;
    write_total_base_counts(&mut out, "FTC", &result.ftc)?;
    write_base_counts(&mut out, "LBC", "last fragments", &result.lbc_ro)?;
    write_total_base_counts(&mut out, "LTC", &result.ltc)?;
    write_insert_size(&mut out, &result.is_hist)?;
    write_length_hist(&mut out, "RL", &result.rl_hist)?;
    write_length_hist(&mut out, "FRL", &result.frl_hist)?;
    write_length_hist(&mut out, "LRL", &result.lrl_hist)?;
    write_mapq_hist(&mut out, &result.mapq_hist)?;
    write_indel_dist(&mut out, &result.id_hist)?;
    write_indel_cycle(&mut out, &result.ic)?;
    write_coverage_dist(&mut out, &result.cov_hist)?;
    write_gc_depth(&mut out, &result.gcd_bins, result)?;
    debug!("Wrote samtools stats output to {}", output_path.display());
    Ok(())
}
/// Write one `SN` (summary numbers) row with a trailing `# comment` column,
/// e.g. `SN\traw total sequences:\t100\t# excluding ...`.
///
/// Returns `std::io::Result` rather than `anyhow::Result`: the only failure
/// source is the underlying writer, and this matches the error type already
/// used by `write_gct`/`write_gc_depth` in this file. Call sites using `?`
/// inside `anyhow` functions are unaffected.
fn sn<W: std::io::Write, V: std::fmt::Display>(
    out: &mut W,
    key: &str,
    value: V,
    comment: &str,
) -> std::io::Result<()> {
    writeln!(out, "SN\t{key}\t{value}\t# {comment}")
}
/// Write one `SN` row without a comment column: `SN\t<key>\t<value>`.
///
/// Returns `std::io::Result` rather than `anyhow::Result` for the same
/// reason as `sn`: the writer is the only failure source, matching
/// `write_gct`/`write_gc_depth`.
fn sn_no_comment<W: std::io::Write, V: std::fmt::Display>(
    out: &mut W,
    key: &str,
    value: V,
) -> std::io::Result<()> {
    writeln!(out, "SN\t{key}\t{value}")
}
/// Write an `FFQ`/`LFQ` section: one row per cycle, one column per quality
/// value (0..=`max_q`), preceded by three `#` header lines.
///
/// `data` is indexed `[cycle][quality]`; quality values are capped at 63.
/// Returns `std::io::Result` for consistency with the other section writers
/// (`write_gct`, `write_gc_depth`); the writer is the only failure source.
fn write_ffq_lfq<W: std::io::Write>(
    out: &mut W,
    tag: &str,
    desc: &str,
    data: &[[u64; 64]],
) -> std::io::Result<()> {
    // Highest quality with a non-zero count across all cycles.
    let max_q_observed = data
        .iter()
        .flat_map(|row| {
            row.iter()
                .enumerate()
                .rev()
                .find(|(_, &c)| c > 0)
                .map(|(i, _)| i)
        })
        .max()
        .unwrap_or(0);
    // Emit one extra (all-zero) trailing column, clamped to the 64-entry
    // histogram width.
    let max_q = if max_q_observed + 1 < 64 {
        max_q_observed + 1
    } else {
        max_q_observed
    };
    writeln!(
        out,
        "# {tag}, {desc} quality. Columns correspond to qualities and rows to cycles. First column is the cycle number."
    )?;
    writeln!(
        out,
        "# Columns correspond to qualities 0, 1, 2, ..., {max_q}"
    )?;
    writeln!(
        out,
        "# Use `plot-bamstats -p <outdir> <file>` to generate plots"
    )?;
    for (cycle, row) in data.iter().enumerate() {
        // Cycles are reported 1-based.
        write!(out, "{tag}\t{}", cycle + 1)?;
        for &count in row.iter().take(max_q + 1) {
            write!(out, "\t{count}")?;
        }
        writeln!(out)?;
    }
    Ok(())
}
/// Write a `GCF`/`GCL` section: GC-content histogram over 200 bins mapped to
/// 0..100 GC%.
///
/// Runs of bins holding the same count are collapsed: each run is emitted
/// once at the midpoint GC% of the run, mirroring samtools stats' GC output.
/// Returns `std::io::Result` for consistency with the other section writers;
/// the writer is the only failure source.
fn write_gc_content<W: std::io::Write>(
    out: &mut W,
    tag: &str,
    desc: &str,
    data: &[u64; 200],
) -> std::io::Result<()> {
    writeln!(
        out,
        "# {tag}, GC content of {desc}. Use `plot-bamstats -p <outdir> <file>` to generate plots"
    )?;
    let ngc: usize = 200;
    let mut ibase_prev: usize = 0;
    for ibase in 0..ngc {
        // Still inside a run of identical counts: keep scanning.
        if data[ibase] == data[ibase_prev] {
            continue;
        }
        // Run ended at `ibase`; report its value at the run's midpoint GC%.
        let gc_pct = (ibase + ibase_prev) as f64 * 0.5 * 100.0 / (ngc - 1) as f64;
        writeln!(out, "{tag}\t{gc_pct:.2}\t{}", data[ibase_prev])?;
        ibase_prev = ibase;
    }
    // Flush the final run, but only if it holds any reads.
    if data[ibase_prev] > 0 {
        let gc_pct = ((ngc - 1) + ibase_prev) as f64 * 0.5 * 100.0 / (ngc - 1) as f64;
        writeln!(out, "{tag}\t{gc_pct:.2}\t{}", data[ibase_prev])?;
    }
    Ok(())
}
/// Write the `GCC` section: per-cycle base-content percentages combining
/// first-fragment (`fbc`) and last-fragment (`lbc`) counters.
///
/// Each row is `[A, C, G, T, N, other]`; percentages are relative to the
/// ACGT total only (so N/other can exceed the remainder), and cycles with no
/// ACGT bases are skipped. Returns `std::io::Result` for consistency with
/// the other section writers; the writer is the only failure source.
fn write_gcc<W: std::io::Write>(
    out: &mut W,
    tag: &str,
    desc: &str,
    fbc: &[[u64; 6]],
    lbc: &[[u64; 6]],
) -> std::io::Result<()> {
    writeln!(
        out,
        "# {tag}, {desc}. Use `plot-bamstats -p <outdir> <file>` to generate plots"
    )?;
    // The two inputs may cover different numbers of cycles.
    let max_cycles = fbc.len().max(lbc.len());
    for cycle in 0..max_cycles {
        let mut combined = [0u64; 6];
        if cycle < fbc.len() {
            for j in 0..6 {
                combined[j] += fbc[cycle][j];
            }
        }
        if cycle < lbc.len() {
            for j in 0..6 {
                combined[j] += lbc[cycle][j];
            }
        }
        let acgt_sum: u64 = combined[..4].iter().sum();
        if acgt_sum > 0 {
            let pcts: Vec<f64> = combined
                .iter()
                .map(|&c| 100.0 * c as f64 / acgt_sum as f64)
                .collect();
            writeln!(
                out,
                "{tag}\t{}\t{:.2}\t{:.2}\t{:.2}\t{:.2}\t{:.2}\t{:.2}",
                cycle + 1,
                pcts[0],
                pcts[1],
                pcts[2],
                pcts[3],
                pcts[4],
                pcts[5]
            )?;
        }
    }
    Ok(())
}
/// Write the `GCT` section: per-cycle ACGT percentages from read-oriented
/// counters (`gcc_rc` is indexed `[cycle][base]` with bases A, C, G, T).
///
/// Unlike `GCC`, cycles with no bases are still emitted, as an all-zero row.
fn write_gct<W: std::io::Write>(
    out: &mut W,
    tag: &str,
    desc: &str,
    gcc_rc: &[[u64; 4]],
) -> std::io::Result<()> {
    writeln!(
        out,
        "# {desc}. Use `grep ^{tag} | cut -f 2-` to extract this part."
    )?;
    for (idx, counts) in gcc_rc.iter().enumerate() {
        let cycle = idx + 1;
        let denom: u64 = counts.iter().sum();
        if denom == 0 {
            // No bases at this cycle: keep the row rather than skipping it.
            writeln!(out, "{tag}\t{cycle}\t0.00\t0.00\t0.00\t0.00")?;
        } else {
            let pct = |c: u64| 100.0 * c as f64 / denom as f64;
            writeln!(
                out,
                "{tag}\t{cycle}\t{:.2}\t{:.2}\t{:.2}\t{:.2}",
                pct(counts[0]),
                pct(counts[1]),
                pct(counts[2]),
                pct(counts[3])
            )?;
        }
    }
    Ok(())
}
/// Write an `FBC`/`LBC` section: per-cycle base-content percentages for one
/// fragment class.
///
/// Each row is `[A, C, G, T, N, other]`; percentages are relative to the
/// ACGT total, and cycles with no ACGT bases are skipped. Returns
/// `std::io::Result` for consistency with the other section writers; the
/// writer is the only failure source.
fn write_base_counts<W: std::io::Write>(
    out: &mut W,
    tag: &str,
    desc: &str,
    data: &[[u64; 6]],
) -> std::io::Result<()> {
    writeln!(
        out,
        "# {tag}, ACGT content per cycle for {desc}. Use `plot-bamstats -p <outdir> <file>` to generate plots"
    )?;
    for (cycle, row) in data.iter().enumerate() {
        let acgt_sum: u64 = row[..4].iter().sum();
        if acgt_sum > 0 {
            let pcts: Vec<f64> = row
                .iter()
                .map(|&c| 100.0 * c as f64 / acgt_sum as f64)
                .collect();
            writeln!(
                out,
                "{tag}\t{}\t{:.2}\t{:.2}\t{:.2}\t{:.2}\t{:.2}\t{:.2}",
                cycle + 1,
                pcts[0],
                pcts[1],
                pcts[2],
                pcts[3],
                pcts[4],
                pcts[5]
            )?;
        }
    }
    Ok(())
}
/// Write an `FTC`/`LTC` section: one row of five total base counters.
///
/// Returns `std::io::Result` for consistency with the other section writers;
/// the writer is the only failure source.
fn write_total_base_counts<W: std::io::Write>(
    out: &mut W,
    tag: &str,
    data: &[u64; 5],
) -> std::io::Result<()> {
    writeln!(
        out,
        "# {tag}, total base counters. Use `plot-bamstats -p <outdir> <file>` to generate plots"
    )?;
    writeln!(
        out,
        "{tag}\t{}\t{}\t{}\t{}\t{}",
        data[0], data[1], data[2], data[3], data[4]
    )?;
    Ok(())
}
/// Write the `IS` section: insert-size histogram rows
/// `IS\t<size>\t<all>\t<inward>\t<outward>\t<other>`.
///
/// Output stops once >99% of pairs are covered (`bulk_limit`), trimming the
/// long tail, and is hard-capped at insert sizes below 8000. Counts are
/// halved on output — presumably because each pair was counted once per
/// mate upstream (matching the `/ 2` pair counters in `write_stats`).
/// Returns `std::io::Result` for consistency with the other section writers.
fn write_insert_size<W: std::io::Write>(
    out: &mut W,
    is_hist: &std::collections::HashMap<u64, [u64; 4]>,
) -> std::io::Result<()> {
    writeln!(
        out,
        "# IS, insert size. Use `plot-bamstats -p <outdir> <file>` to generate plots"
    )?;
    if is_hist.is_empty() {
        return Ok(());
    }
    const MAX_INSERT_SIZE: u64 = 8000;
    let max_key = is_hist
        .keys()
        .filter(|&&k| k < MAX_INSERT_SIZE)
        .max()
        .copied()
        .unwrap_or(0);
    // Total pair count (each pair contributes two reads, hence * 0.5).
    let nisize: f64 = is_hist.values().map(|v| v[0] as f64 * 0.5).sum();
    let mut bulk: f64 = 0.0;
    let mut bulk_limit: u64 = max_key;
    for isize_val in 0..=max_key {
        let count = is_hist.get(&isize_val).map(|v| v[0]).unwrap_or(0);
        if count > 0 {
            bulk_limit = isize_val + 1;
        }
        bulk += count as f64 * 0.5;
        // Stop once the bulk of the distribution (>99%) is covered.
        if nisize > 0.0 && bulk / nisize > 0.99 {
            bulk_limit = isize_val + 1;
            break;
        }
    }
    for isize_val in 0..bulk_limit {
        let counts = is_hist.get(&isize_val).copied().unwrap_or([0; 4]);
        writeln!(
            out,
            "IS\t{isize_val}\t{}\t{}\t{}\t{}",
            (counts[0] as f64 * 0.5) as u64,
            (counts[1] as f64 * 0.5) as u64,
            (counts[2] as f64 * 0.5) as u64,
            (counts[3] as f64 * 0.5) as u64
        )?;
    }
    Ok(())
}
/// Write an `RL`/`FRL`/`LRL` section: read-length histogram in ascending
/// length order, skipping zero-count entries.
///
/// Returns `std::io::Result` for consistency with the other section writers;
/// the writer is the only failure source.
fn write_length_hist<W: std::io::Write>(
    out: &mut W,
    tag: &str,
    hist: &std::collections::HashMap<u64, u64>,
) -> std::io::Result<()> {
    writeln!(
        out,
        "# {tag}, read length. Use `plot-bamstats -p <outdir> <file>` to generate plots"
    )?;
    if hist.is_empty() {
        return Ok(());
    }
    // HashMap iteration order is arbitrary; sort lengths for stable output.
    let mut lengths: Vec<u64> = hist.keys().copied().collect();
    lengths.sort();
    for len in lengths {
        let count = hist[&len];
        if count > 0 {
            writeln!(out, "{tag}\t{len}\t{count}")?;
        }
    }
    Ok(())
}
/// Write the `MAPQ` section: mapping-quality histogram up to the highest
/// observed quality, skipping zero-count entries.
///
/// Returns `std::io::Result` for consistency with the other section writers;
/// the writer is the only failure source.
fn write_mapq_hist<W: std::io::Write>(out: &mut W, mapq_hist: &[u64; 256]) -> std::io::Result<()> {
    writeln!(
        out,
        "# MAPQ, mapping quality. Use `plot-bamstats -p <outdir> <file>` to generate plots"
    )?;
    // Highest MAPQ with a non-zero count (0 if the histogram is empty).
    let max_mapq = mapq_hist
        .iter()
        .enumerate()
        .rev()
        .find(|(_, &c)| c > 0)
        .map(|(i, _)| i)
        .unwrap_or(0);
    for (q, &count) in mapq_hist.iter().enumerate().take(max_mapq + 1) {
        if count > 0 {
            writeln!(out, "MAPQ\t{q}\t{count}")?;
        }
    }
    Ok(())
}
/// Write the `ID` section: indel-size histogram rows
/// `ID\t<size>\t<insertions>\t<deletions>` in ascending size order.
///
/// Returns `std::io::Result` for consistency with the other section writers;
/// the writer is the only failure source.
fn write_indel_dist<W: std::io::Write>(
    out: &mut W,
    id_hist: &std::collections::HashMap<u64, [u64; 2]>,
) -> std::io::Result<()> {
    writeln!(
        out,
        "# ID, indel size. Use `plot-bamstats -p <outdir> <file>` to generate plots"
    )?;
    if id_hist.is_empty() {
        return Ok(());
    }
    // HashMap iteration order is arbitrary; sort sizes for stable output.
    let mut sizes: Vec<u64> = id_hist.keys().copied().collect();
    sizes.sort();
    for len in sizes {
        let counts = id_hist[&len];
        if counts[0] > 0 || counts[1] > 0 {
            writeln!(out, "ID\t{len}\t{}\t{}", counts[0], counts[1])?;
        }
    }
    Ok(())
}
/// Write the `IC` section: per-cycle indel counters (`ic` is indexed by
/// cycle; rows with all four counters zero are skipped).
///
/// Returns `std::io::Result` for consistency with the other section writers;
/// the writer is the only failure source.
fn write_indel_cycle<W: std::io::Write>(out: &mut W, ic: &[[u64; 4]]) -> std::io::Result<()> {
    writeln!(
        out,
        "# IC, indels per cycle. Use `plot-bamstats -p <outdir> <file>` to generate plots"
    )?;
    for (cycle, row) in ic.iter().enumerate() {
        if row[0] > 0 || row[1] > 0 || row[2] > 0 || row[3] > 0 {
            writeln!(
                out,
                "IC\t{}\t{}\t{}\t{}\t{}",
                cycle + 1,
                row[0],
                row[1],
                row[2],
                row[3]
            )?;
        }
    }
    Ok(())
}
/// Write the `COV` section: coverage distribution over depth buckets 1..=1000
/// (step 1), with catch-all rows for depths below the minimum and above the
/// maximum.
///
/// Returns `std::io::Result` for consistency with the other section writers;
/// the writer is the only failure source.
fn write_coverage_dist<W: std::io::Write>(out: &mut W, cov_hist: &HashMap<u32, u64>) -> std::io::Result<()> {
    writeln!(
        out,
        "# COV, Coverage distribution. Use `plot-bamstats -p <outdir> <file>` to generate plots"
    )?;
    // Bucket layout matches samtools' defaults: min 1, max 1000, step 1.
    let cov_min: u32 = 1;
    let cov_max: u32 = 1000;
    let cov_step: u32 = 1;
    let below_min: u64 = cov_hist
        .iter()
        .filter(|(&d, _)| d < cov_min)
        .map(|(_, &c)| c)
        .sum();
    if below_min > 0 {
        writeln!(out, "COV\t[<{cov_min}]\t{}\t{below_min}", cov_min - 1)?;
    }
    let mut depth = cov_min;
    while depth <= cov_max {
        let end = depth + cov_step - 1;
        let count: u64 = (depth..=end).filter_map(|d| cov_hist.get(&d)).sum();
        if count > 0 {
            if cov_step == 1 {
                writeln!(out, "COV\t[{depth}-{depth}]\t{depth}\t{count}")?;
            } else {
                writeln!(out, "COV\t[{depth}-{end}]\t{depth}\t{count}")?;
            }
        }
        depth += cov_step;
    }
    let above_max: u64 = cov_hist
        .iter()
        .filter(|(&d, _)| d > cov_max)
        .map(|(_, &c)| c)
        .sum();
    if above_max > 0 {
        writeln!(out, "COV\t[{cov_max}<]\t{cov_max}\t{above_max}")?;
    }
    Ok(())
}
/// Linearly interpolated `p`-th percentile of the first `n` entries of
/// `depths` (assumed sorted ascending), using the rank formula
/// `p * (n + 1) / 100` — the same convention as samtools' GCD output.
///
/// Ranks below 1 clamp to the first element and ranks at or beyond `n`
/// clamp to the last. Panics if `depths` is empty.
fn gcd_percentile(depths: &[u32], n: usize, p: u32) -> f64 {
    let rank = f64::from(p) * (n as f64 + 1.0) / 100.0;
    let whole = rank as usize;
    if whole == 0 {
        depths[0] as f64
    } else if whole >= n {
        depths[n - 1] as f64
    } else {
        // Interpolate between the two surrounding depth values.
        let frac = rank - whole as f64;
        let lo = depths[whole - 1] as f64;
        let hi = depths[whole] as f64;
        lo + frac * (hi - lo)
    }
}
/// Write the `GCD` (GC-depth) section: windows binned by GC% with, per GC
/// group, the cumulative unique-sequence percentile and the 10th/25th/50th/
/// 75th/90th depth percentiles scaled to read-depth units.
fn write_gc_depth(
    out: &mut impl Write,
    gcd_bins: &[GcDepthBin],
    result: &BamStatResult,
) -> std::io::Result<()> {
    if gcd_bins.is_empty() {
        return Ok(());
    }
    writeln!(
        out,
        "# GC-depth. Use `grep ^GCD | cut -f 2-` to extract this part. \
        The columns are: GC%, unique sequence percentiles, 10th, 25th, \
        50th, 75th and 90th depth percentile"
    )?;
    let last = gcd_bins.len().saturating_sub(1);
    // Convert each bin's accumulated GC into a rounded GC percentage.
    // NOTE(review): the final bin keeps its raw `gc` value instead of being
    // normalised by depth — presumably it already holds a percentage for the
    // partial last window; confirm against the accumulator.
    let mut bins: Vec<(f32, u32)> = gcd_bins
        .iter()
        .enumerate()
        .map(|(idx, b)| {
            let gc_pct = if idx == last {
                b.gc
            } else if b.depth > 0 {
                (100.0 * b.gc / b.depth as f32).round()
            } else {
                0.0
            };
            (gc_pct, b.depth)
        })
        .collect();
    // Sort by GC%, then depth, so equal-GC bins group together below.
    bins.sort_by(|a, b| {
        a.0.partial_cmp(&b.0)
            .unwrap_or(std::cmp::Ordering::Equal)
            .then(a.1.cmp(&b.1))
    });
    let real_bins = bins.len();
    // Synthetic zero bin prepended so the cumulative-percentile denominator
    // is real_bins + 1.
    // NOTE(review): the loop below walks indices 0..real_bins of the now
    // (real_bins + 1)-long vector, so it includes this dummy (0.0, 0) bin in
    // the first GC group but never visits the last (highest-GC) real bin —
    // verify this apparent off-by-one against samtools' GCD output.
    bins.insert(0, (0.0f32, 0u32));
    let total_for_denom = real_bins + 1;
    let output_len = real_bins;
    // Scale factor converting window depth counts into read depth, using the
    // mean primary-read length over a 20 kb window.
    let avg_read_len = if result.primary_count > 0 {
        result.total_len as f64 / result.primary_count as f64
    } else {
        0.0
    };
    let gcd_bin_size = 20_000.0_f64;
    let scale = avg_read_len / gcd_bin_size;
    let mut i = 0;
    while i < output_len {
        let gc = bins[i].0;
        let group_start = i;
        // Group consecutive bins whose GC% agrees to within 0.1.
        while i < output_len && (bins[i].0 - gc).abs() < 0.1 {
            i += 1;
        }
        let nbins = i - group_start;
        let depths: Vec<u32> = bins[group_start..i].iter().map(|b| b.1).collect();
        // Cumulative percentile of bins at or below this GC group.
        let cum_pct = (group_start + nbins + 1) as f64 * 100.0 / total_for_denom as f64;
        writeln!(
            out,
            "GCD\t{:.1}\t{:.3}\t{:.3}\t{:.3}\t{:.3}\t{:.3}\t{:.3}",
            gc,
            cum_pct,
            gcd_percentile(&depths, nbins, 10) * scale,
            gcd_percentile(&depths, nbins, 25) * scale,
            gcd_percentile(&depths, nbins, 50) * scale,
            gcd_percentile(&depths, nbins, 75) * scale,
            gcd_percentile(&depths, nbins, 90) * scale,
        )?;
    }
    Ok(())
}
#[cfg(test)]
mod tests {
    use super::*;
    use crate::rna::rseqc::accumulators::BamStatAccum;
    use rust_htslib::bam::{self, Read as BamRead};
    use std::io::Read;

    /// End-to-end check of the SN-section format: runs the accumulator over a
    /// small test BAM, writes the report to a temp file, and validates the
    /// MultiQC detection header plus the column layout of every SN row.
    ///
    /// NOTE(review): the fixed temp filename could collide if this test ever
    /// runs concurrently with another using the same path — consider a
    /// per-test unique name.
    #[test]
    fn test_stats_sn_format() {
        let mut reader =
            bam::Reader::from_path("tests/data/test.bam").expect("Failed to open test.bam");
        let mut accum = BamStatAccum::default();
        let mut record = bam::Record::new();
        while let Some(res) = reader.read(&mut record) {
            res.expect("Error reading BAM record");
            // 30 is the quality threshold passed through to the accumulator.
            accum.process_read(&record, 30);
        }
        let result = accum.into_result();
        let tmp_path = std::env::temp_dir().join("rustqc_test_stats.txt");
        write_stats(&result, &tmp_path).expect("Failed to write stats");
        let mut contents = String::new();
        std::fs::File::open(&tmp_path)
            .unwrap()
            .read_to_string(&mut contents)
            .unwrap();
        // Remove the temp file before asserting so a failure doesn't leak it.
        let _ = std::fs::remove_file(&tmp_path);
        assert!(
            contents.contains("This file was produced by samtools stats"),
            "Missing MultiQC detection header"
        );
        for line in contents.lines() {
            if line.starts_with("SN\t") {
                // splitn(4) keeps the free-text comment (field 4) unsplit.
                let cols: Vec<&str> = line.splitn(4, '\t').collect();
                assert!(
                    cols.len() >= 3,
                    "SN line should have at least 3 tab-separated fields: {line}"
                );
                assert!(
                    cols[1].ends_with(':'),
                    "SN key should end with colon: {}",
                    cols[1]
                );
                if cols.len() == 4 {
                    assert!(
                        cols[3].starts_with("# "),
                        "SN comment should start with '# ': {}",
                        cols[3]
                    );
                }
            }
        }
    }
}