use super::paf;
use bio_types::strand::ReqStrand::*;
use colored::Colorize;
use lazy_static::lazy_static;
use regex::Regex;
use rust_htslib::bam::record::Aux;
use rust_htslib::bam::record::{Cigar::*, CigarStringView};
use rust_htslib::bam::Header;
use rust_htslib::bam::HeaderView;
use rust_htslib::bam::Record;
use std::convert::TryFrom;
use std::fmt;
use std::str;
#[derive(Default)]
pub struct Stats {
pub q_nm: String,
pub q_len: i64,
pub q_st: i64,
pub q_en: i64,
pub r_nm: String,
pub r_len: i64,
pub r_st: i64,
pub r_en: i64,
pub strand: char,
pub equal: u32,
pub diff: u32,
pub ins: u32,
pub del: u32,
pub matches: u32,
pub ins_events: u32,
pub del_events: u32,
pub id_by_all: f32,
pub id_by_events: f32,
pub id_by_matches: f32,
}
pub fn parse_md_for_stats(md: &str) -> (u32, u32, u32, u32) {
lazy_static! {
static ref RE: Regex = Regex::new(r"(\d+)|([A-Z])|(\^[A-Z]+)").unwrap();
}
let matches = RE.captures_iter(md);
let mut match_count = 0;
let mut mismatch_count = 0;
let mut insertion_count = 0;
let mut insertion_bases = 0;
for caps in matches {
if let Some(m) = caps.get(1) {
match_count += m.as_str().parse::<u32>().unwrap();
} else if let Some(_m) = caps.get(2) {
mismatch_count += 1;
} else if let Some(ins) = caps.get(3) {
insertion_bases += ins.as_str().len() as u32 - 1;
insertion_count += 1;
}
}
(
match_count,
mismatch_count,
insertion_count,
insertion_bases,
)
}
impl fmt::Display for Stats {
fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
write!(
f,
"{} {} {} {} {} {} {}",
self.r_nm, self.r_st, self.r_en, self.strand, self.q_nm, self.q_st, self.q_en
)
}
}
pub fn stats_from_paf(paf: paf::PafRecord) -> Stats {
let mut stats = Stats::default();
add_stats_from_cigar(&CigarStringView::new(paf.cigar, 0), &mut stats, None);
stats.r_nm = paf.t_name;
stats.r_len = paf.t_len as i64;
stats.r_st = paf.t_st as i64;
stats.r_en = paf.t_en as i64;
stats.q_nm = paf.q_name;
stats.q_len = paf.q_len as i64;
stats.q_st = paf.q_st as i64;
stats.q_en = paf.q_en as i64;
stats.strand = paf.strand;
stats
}
pub fn add_stats_from_cigar(cigar: &CigarStringView, stats: &mut Stats, md: Option<&str>) {
for opt in cigar {
match opt {
Del(val) => {
stats.del_events += 1;
stats.del += val
}
Ins(val) => {
stats.ins_events += 1;
stats.ins += val
}
Equal(val) => stats.equal += val,
Diff(val) => stats.diff += val,
Match(val) => {
stats.diff += val;
stats.matches += val
}
_ => (),
}
}
if let Some(md) = md.filter(|_| stats.equal == 0 && stats.matches > 0) {
let (m_count, mm_count, _i_c, _i_bp) = parse_md_for_stats(md);
assert_eq!(m_count + mm_count, stats.diff);
stats.equal = m_count;
stats.diff = mm_count;
}
stats.id_by_all =
100.0 * stats.equal as f32 / (stats.equal + stats.diff + stats.del + stats.ins) as f32;
stats.id_by_events = 100.0 * stats.equal as f32
/ (stats.equal + stats.diff + stats.del_events + stats.ins_events) as f32;
stats.id_by_matches = 100.0 * stats.equal as f32 / (stats.equal + stats.diff) as f32;
if stats.matches > 0 && md.is_none() {
eprint!(
"\r{} {} {}{}",
"\u{26A0} warning:".bold().yellow(),
"cigar string contains".yellow(),
"'M'".bold().red(),
", assuming mismatch since there is no MD tag.".yellow()
);
}
}
pub fn cigar_stats(rec: Record, header: &Header) -> Stats {
let cigar = rec.cigar();
let bam_head = HeaderView::from_header(header);
let r_nm = str::from_utf8(bam_head.tid2name(rec.tid() as u32)).unwrap();
let r_len = bam_head.target_len(rec.tid() as u32).unwrap();
let mut stats = Stats {
r_nm: r_nm.to_string(),
r_len: r_len as i64,
r_st: rec.pos(),
r_en: cigar.end_pos(),
q_nm: std::str::from_utf8(rec.qname()).unwrap().to_string(),
q_len: 0,
q_st: 0,
q_en: 0,
strand: rec
.strand()
.strand_symbol()
.chars()
.next()
.expect("string is empty"),
equal: 0,
diff: 0,
ins: 0,
del: 0,
ins_events: 0,
del_events: 0,
matches: 0,
id_by_matches: 0.0,
id_by_events: 0.0,
id_by_all: 0.0,
};
stats.q_st = cigar.leading_hardclips() + cigar.leading_softclips();
stats.q_en = cigar.leading_hardclips()
+ 1
+ cigar
.read_pos(stats.r_en as u32 - 1, false, false)
.unwrap()
.unwrap() as i64;
stats.q_len = cigar.leading_hardclips()
+ i64::try_from(rec.seq_len()).unwrap()
+ cigar.trailing_hardclips();
if rec.strand() == Reverse {
let temp = stats.q_st;
stats.q_st = stats.q_len - stats.q_en;
stats.q_en = stats.q_len - temp;
}
let md = if let Ok(Aux::String(md_text)) = rec.aux(b"MD") {
Some(md_text)
} else {
None
};
add_stats_from_cigar(&cigar, &mut stats, md);
stats
}
pub fn print_cigar_stats_header(qbed: bool) {
if qbed {
print!("#query_name\tquery_start\tquery_end\tquery_length\t");
print!("strand\t");
print!("reference_name\treference_start\treference_end\treference_length\t");
} else {
print!("#reference_name\treference_start\treference_end\treference_length\t");
print!("strand\t");
print!("query_name\tquery_start\tquery_end\tquery_length\t");
}
println!("perID_by_matches\tperID_by_events\tperID_by_all\tmatches\tmismatches\tdeletion_events\tinsertion_events\tdeletions\tinsertions");
}
pub fn print_cigar_stats(stats: Stats, qbed: bool) {
if qbed {
print!(
"{}\t{}\t{}\t{}\t",
stats.q_nm, stats.q_st, stats.q_en, stats.q_len
);
print!("{}\t", stats.strand);
print!(
"{}\t{}\t{}\t{}\t",
stats.r_nm, stats.r_st, stats.r_en, stats.r_len
);
} else {
print!(
"{}\t{}\t{}\t{}\t",
stats.r_nm, stats.r_st, stats.r_en, stats.r_len
);
print!("{}\t", stats.strand);
print!(
"{}\t{}\t{}\t{}\t",
stats.q_nm, stats.q_st, stats.q_en, stats.q_len
);
}
print!(
"{}\t{}\t{}\t",
stats.id_by_matches, stats.id_by_events, stats.id_by_all
);
println!(
"{}\t{}\t{}\t{}\t{}\t{}",
stats.equal, stats.diff, stats.del_events, stats.ins_events, stats.del, stats.ins
);
}
pub struct StatsWriter {
out: std::io::BufWriter<std::io::StdoutLock<'static>>,
buf: String,
}
impl StatsWriter {
pub fn new() -> Self {
StatsWriter {
out: std::io::BufWriter::with_capacity(
64 * 1024,
std::io::stdout().lock(),
),
buf: String::with_capacity(4096),
}
}
pub fn write_header(&mut self, qbed: bool) {
use std::io::Write;
if qbed {
self.out.write_all(b"#query_name\tquery_start\tquery_end\tquery_length\tstrand\treference_name\treference_start\treference_end\treference_length\t").unwrap();
} else {
self.out.write_all(b"#reference_name\treference_start\treference_end\treference_length\tstrand\tquery_name\tquery_start\tquery_end\tquery_length\t").unwrap();
}
self.out.write_all(b"perID_by_matches\tperID_by_events\tperID_by_all\tmatches\tmismatches\tdeletion_events\tinsertion_events\tdeletions\tinsertions\n").unwrap();
}
pub fn write_stats(&mut self, stats: &Stats, qbed: bool) {
use std::fmt::Write as FmtWrite;
use std::io::Write;
#[inline(always)]
fn push_i64(buf: &mut String, v: i64) {
let mut tmp = itoa::Buffer::new();
buf.push_str(tmp.format(v));
}
#[inline(always)]
fn push_u32(buf: &mut String, v: u32) {
let mut tmp = itoa::Buffer::new();
buf.push_str(tmp.format(v));
}
self.buf.clear();
let buf = &mut self.buf;
if qbed {
buf.push_str(&stats.q_nm); buf.push('\t');
push_i64(buf, stats.q_st); buf.push('\t');
push_i64(buf, stats.q_en); buf.push('\t');
push_i64(buf, stats.q_len); buf.push('\t');
buf.push(stats.strand); buf.push('\t');
buf.push_str(&stats.r_nm); buf.push('\t');
push_i64(buf, stats.r_st); buf.push('\t');
push_i64(buf, stats.r_en); buf.push('\t');
push_i64(buf, stats.r_len); buf.push('\t');
} else {
buf.push_str(&stats.r_nm); buf.push('\t');
push_i64(buf, stats.r_st); buf.push('\t');
push_i64(buf, stats.r_en); buf.push('\t');
push_i64(buf, stats.r_len); buf.push('\t');
buf.push(stats.strand); buf.push('\t');
buf.push_str(&stats.q_nm); buf.push('\t');
push_i64(buf, stats.q_st); buf.push('\t');
push_i64(buf, stats.q_en); buf.push('\t');
push_i64(buf, stats.q_len); buf.push('\t');
}
write!(buf, "{}\t{}\t{}\t", stats.id_by_matches, stats.id_by_events, stats.id_by_all).unwrap();
push_u32(buf, stats.equal); buf.push('\t');
push_u32(buf, stats.diff); buf.push('\t');
push_u32(buf, stats.del_events); buf.push('\t');
push_u32(buf, stats.ins_events); buf.push('\t');
push_u32(buf, stats.del); buf.push('\t');
push_u32(buf, stats.ins); buf.push('\n');
self.out.write_all(self.buf.as_bytes()).unwrap();
}
}
#[cfg(test)]
mod tests {
use super::*;
#[test]
fn test_cigar_stats_from_test_file() {
use rust_htslib::{bam, bam::Read};
let mut bam = bam::Reader::from_path(".test/asm_small.bam").unwrap();
let bam_header = bam::Header::from_template(bam.header());
bam.set_threads(4).unwrap();
for fetch in bam.records() {
let stats = cigar_stats(fetch.unwrap(), &bam_header);
print_cigar_stats(stats, false);
}
}
#[test]
fn test_add_cigar_stats() {
use rust_htslib::bam::record::CigarString;
let cigar = CigarString::try_from("10=10X").unwrap();
let view = CigarStringView::new(cigar, 0);
let mut stats = Stats::default();
add_stats_from_cigar(&view, &mut stats, None);
assert!((50.0 - stats.id_by_all).abs() < 1e-10);
}
}