use std::error::Error;
use std::fs;
use std::ops::Range;
use std::str;
use bio::io::fasta;
use bio::stats::{LogProb, PHREDProb};
use itertools::Itertools;
use ordered_float::NotNan;
use rust_htslib::bcf::Read;
use rust_htslib::{bam, bcf};
use model;
use utils;
use BCFError;
use Event;
pub const NUMERICAL_EPSILON: f64 = 1e-4;
pub fn collect_variants(
record: &mut bcf::Record,
omit_snvs: bool,
omit_indels: bool,
indel_len_range: Option<Range<u32>>,
exclusive_end: bool,
) -> Result<Vec<Option<model::Variant>>, Box<Error>> {
let pos = record.pos();
let svlen = match record.info(b"SVLEN").integer() {
Ok(Some(svlen)) => Some(svlen[0].abs() as u32),
_ => None,
};
let end = match record.info(b"END").integer() {
Ok(Some(end)) => {
let mut end = end[0] as u32 - 1;
if exclusive_end {
debug!("fixing END tag");
end -= 1;
}
Some(end)
}
_ => None,
};
let svtype = match record.info(b"SVTYPE").string() {
Ok(Some(svtype)) => Some(svtype[0].to_owned()),
_ => None,
};
let is_valid_len = |svlen| {
if let Some(ref len_range) = indel_len_range {
if svlen < len_range.start || svlen >= len_range.end {
return false;
}
}
true
};
let is_valid_insertion_alleles = |ref_allele: &[u8], alt_allele: &[u8]| {
alt_allele == b"<INS>"
|| (ref_allele.len() < alt_allele.len()
&& ref_allele == &alt_allele[..ref_allele.len()])
};
let is_valid_deletion_alleles = |ref_allele: &[u8], alt_allele: &[u8]| {
alt_allele == b"<DEL>"
|| (ref_allele.len() > alt_allele.len()
&& &ref_allele[..alt_allele.len()] == alt_allele)
};
let variants = if let Some(svtype) = svtype {
vec![if omit_indels {
None
} else if svtype == b"INS" {
let alleles = record.alleles();
if alleles.len() > 2 {
return Err(Box::new(BCFError::InvalidRecord(
"SVTYPE=INS but more than one ALT allele".to_owned(),
)));
}
let ref_allele = alleles[0];
let alt_allele = alleles[1];
if alt_allele == b"<INS>" {
None
} else {
let len = alt_allele.len() - ref_allele.len();
if is_valid_insertion_alleles(ref_allele, alt_allele) && is_valid_len(len as u32) {
Some(model::Variant::Insertion(
alt_allele[ref_allele.len()..].to_owned(),
))
} else {
None
}
}
} else if svtype == b"DEL" {
let svlen = match (svlen, end) {
(Some(svlen), _) => svlen,
(None, Some(end)) => end - pos,
_ => {
return Err(Box::new(BCFError::MissingTag("SVLEN or END".to_owned())));
}
};
let alleles = record.alleles();
if alleles.len() > 2 {
return Err(Box::new(BCFError::InvalidRecord(
"SVTYPE=DEL but more than one ALT allele".to_owned(),
)));
}
let ref_allele = alleles[0];
let alt_allele = alleles[1];
if alt_allele == b"<DEL>" || is_valid_deletion_alleles(ref_allele, alt_allele) {
if is_valid_len(svlen) {
Some(model::Variant::Deletion(svlen))
} else {
None
}
} else {
None
}
} else {
None
}]
} else {
let alleles = record.alleles();
let ref_allele = alleles[0];
alleles
.iter()
.skip(1)
.map(|alt_allele| {
if alt_allele == b"<*>" {
if omit_snvs {
None
} else {
Some(model::Variant::None)
}
} else if alt_allele[0] == b'<' {
None
} else if alt_allele.len() == 1 && ref_allele.len() == 1 {
if omit_snvs {
None
} else {
Some(model::Variant::SNV(alt_allele[0]))
}
} else if alt_allele.len() == ref_allele.len() {
None
} else {
let indel_len =
(alt_allele.len() as i32 - ref_allele.len() as i32).abs() as u32;
if omit_indels {
None
} else if !is_valid_len(indel_len) {
None
} else if is_valid_deletion_alleles(ref_allele, alt_allele) {
Some(model::Variant::Deletion(
(ref_allele.len() - alt_allele.len()) as u32,
))
} else if is_valid_insertion_alleles(ref_allele, alt_allele) {
Some(model::Variant::Insertion(
alt_allele[ref_allele.len()..].to_owned(),
))
} else {
None
}
}
})
.collect_vec()
};
Ok(variants)
}
pub struct ReferenceBuffer {
reader: fasta::IndexedReader<fs::File>,
chrom: Option<Vec<u8>>,
sequence: Vec<u8>,
}
impl ReferenceBuffer {
pub fn new(fasta: fasta::IndexedReader<fs::File>) -> Self {
ReferenceBuffer {
reader: fasta,
chrom: None,
sequence: Vec::new(),
}
}
pub fn seq(&mut self, chrom: &[u8]) -> Result<&[u8], Box<Error>> {
if let Some(ref last_chrom) = self.chrom {
if last_chrom == &chrom {
return Ok(&self.sequence);
}
}
self.reader.fetch_all(str::from_utf8(chrom)?)?;
self.reader.read(&mut self.sequence)?;
self.chrom = Some(chrom.to_owned());
Ok(&self.sequence)
}
}
fn tags_prob_sum(
record: &mut bcf::Record,
tags: &[String],
vartype: &model::VariantType,
) -> Result<Vec<Option<LogProb>>, Box<Error>> {
let variants = (utils::collect_variants(record, false, false, None, false))?;
let mut tags_probs_out = vec![Vec::new(); variants.len()];
for tag in tags {
if let Some(tags_probs_in) = (record.info(tag.as_bytes()).float())? {
for (i, (variant, tag_prob)) in
variants.iter().zip(tags_probs_in.into_iter()).enumerate()
{
if let Some(ref variant) = *variant {
if !variant.is_type(vartype) || tag_prob.is_nan() {
continue;
}
tags_probs_out[i].push(LogProb::from(PHREDProb(*tag_prob as f64)));
}
}
}
}
Ok(tags_probs_out
.into_iter()
.map(|probs| {
if !probs.is_empty() {
Some(LogProb::ln_sum_exp(&probs).cap_numerical_overshoot(NUMERICAL_EPSILON))
} else {
None
}
})
.collect_vec())
}
pub fn collect_prob_dist<E: Event>(
calls: &mut bcf::Reader,
events: &[E],
vartype: &model::VariantType,
) -> Result<Vec<NotNan<f64>>, Box<Error>> {
let mut record = calls.empty_record();
let mut prob_dist = Vec::new();
let tags = events.iter().map(|e| e.tag_name("PROB")).collect_vec();
loop {
if let Err(e) = calls.read(&mut record) {
if e.is_eof() {
break;
} else {
return Err(Box::new(e));
}
}
for p in utils::tags_prob_sum(&mut record, &tags, &vartype)? {
if let Some(p) = p {
prob_dist.push(NotNan::new(*p)?);
}
}
}
prob_dist.sort();
Ok(prob_dist)
}
pub fn filter_by_threshold<E: Event>(
calls: &mut bcf::Reader,
threshold: Option<LogProb>,
out: &mut bcf::Writer,
events: &[E],
vartype: &model::VariantType,
) -> Result<(), Box<Error>> {
let mut record = calls.empty_record();
let tags = events.iter().map(|e| e.tag_name("PROB")).collect_vec();
loop {
if let Err(e) = calls.read(&mut record) {
if e.is_eof() {
return Ok(());
} else {
return Err(Box::new(e));
}
}
let probs = utils::tags_prob_sum(&mut record, &tags, vartype)?;
let mut remove = vec![false]; remove.extend(probs.into_iter().map(|p| {
match (p, threshold) {
(Some(p), Some(threshold)) if p > threshold || relative_eq!(*p, *threshold) => {
false
}
(Some(_), None) => false,
_ => true,
}
}));
if !remove[1..].iter().all(|r| *r) {
record.remove_alleles(&remove)?;
out.write(&record)?;
}
}
}
#[derive(Debug)]
pub enum Overlap {
Enclosing(u32),
Left(u32),
Right(u32),
Some(u32),
None,
}
impl Overlap {
pub fn new(
record: &bam::Record,
cigar: &bam::record::CigarStringView,
start: u32,
variant: &model::Variant,
consider_clips: bool,
) -> Result<Overlap, Box<Error>> {
let mut pos = record.pos() as u32;
let mut end_pos = cigar.end_pos()? as u32;
if consider_clips {
pos = pos.saturating_sub(model::evidence::Clips::leading(&cigar).soft());
end_pos = end_pos + model::evidence::Clips::trailing(&cigar).soft();
}
let overlap = match variant {
&model::Variant::SNV(_) | &model::Variant::None => {
if pos <= start && end_pos > start {
Overlap::Enclosing(1)
} else {
Overlap::None
}
}
&model::Variant::Deletion(l) => {
let end = start + l;
let enclosing = pos < start && end_pos > end;
if enclosing {
Overlap::Enclosing(l)
} else {
if end_pos <= end && end_pos > start {
Overlap::Right(end_pos - start)
} else if pos >= start && pos < end {
Overlap::Left(end - pos)
} else {
Overlap::None
}
}
}
&model::Variant::Insertion(ref seq) => {
let l = seq.len() as u32;
let center_pos = (end_pos - pos) / 2 + pos;
if pos < start && end_pos > start {
if start > center_pos {
let overlap = end_pos - start;
if overlap > l {
Overlap::Enclosing(l)
} else {
Overlap::Some(overlap)
}
} else {
let overlap = start - pos;
if overlap > l {
Overlap::Enclosing(l)
} else {
Overlap::Some(overlap)
}
}
} else {
Overlap::None
}
}
};
Ok(overlap)
}
pub fn is_enclosing(&self) -> bool {
if let &Overlap::Enclosing(_) = self {
true
} else {
false
}
}
pub fn is_none(&self) -> bool {
if let &Overlap::None = self {
true
} else {
false
}
}
pub fn len(&self) -> u32 {
match self {
&Overlap::Enclosing(l) => l,
&Overlap::Left(l) => l,
&Overlap::Right(l) => l,
&Overlap::Some(l) => l,
&Overlap::None => 0,
}
}
}
#[cfg(test)]
mod tests {
use super::*;
use bio::stats::{LogProb, Prob};
use model::VariantType;
use rust_htslib::bcf::{self, Read};
use ComplementEvent;
use SimpleEvent;
#[test]
fn test_tags_prob_sum() {
let test_file = "tests/resources/test_tags_prob_sum/overshoot.vcf";
let mut overshoot_calls = bcf::Reader::from_path(test_file).unwrap();
let mut record = overshoot_calls.empty_record();
if let Err(e) = overshoot_calls.read(&mut record) {
panic!("BCF reading error: {}", e);
}
let alt_tags = [
String::from("PROB_ADO_TO_REF"),
String::from("PROB_ADO_TO_ALT"),
String::from("PROB_HOM_ALT"),
String::from("PROB_HET"),
String::from("PROB_ERR_REF"),
];
let snv = VariantType::SNV;
if let Ok(prob_sum) = tags_prob_sum(&mut record, &alt_tags, &snv) {
assert_eq!(LogProb::ln_one(), prob_sum[0].unwrap());
} else {
panic!("tags_prob_sum(&overshoot_calls, &alt_events, &snv) returned Error")
}
}
#[test]
fn test_collect_prob_dist() {
let events = vec![
SimpleEvent {
name: "germline".to_owned(),
},
SimpleEvent {
name: "somatic".to_owned(),
},
];
let absent_event = vec![ComplementEvent {
name: "absent".to_owned(),
}];
let test_file = "tests/resources/test_collect_prob_dist/min.calls.vcf";
let del = VariantType::Deletion(None);
let mut del_calls_1 = bcf::Reader::from_path(test_file).unwrap();
if let Ok(prob_del) = collect_prob_dist(&mut del_calls_1, &events, &del) {
println!("prob_del[0]: {:?}", prob_del[0].into_inner());
assert_eq!(prob_del.len(), 1);
assert_relative_eq!(prob_del[0].into_inner(), Prob(0.8).ln(), epsilon = 0.000005);
} else {
panic!("collect_prob_dist(&calls, &events, &del) returned Error")
}
let mut del_calls_2 = bcf::Reader::from_path(test_file).unwrap();
if let Ok(prob_del_abs) = collect_prob_dist(&mut del_calls_2, &absent_event, &del) {
assert_eq!(prob_del_abs.len(), 1);
assert_relative_eq!(
prob_del_abs[0].into_inner(),
Prob(0.2).ln(),
epsilon = 0.000005
);
} else {
panic!("collect_prob_dist(&calls, &absent_event, &del) returned Error")
}
let ins = VariantType::Insertion(None);
let mut ins_calls_1 = bcf::Reader::from_path(test_file).unwrap();
if let Ok(prob_ins) = collect_prob_dist(&mut ins_calls_1, &events, &ins) {
assert_eq!(prob_ins.len(), 1);
assert_relative_eq!(prob_ins[0].into_inner(), Prob(0.2).ln(), epsilon = 0.000005);
} else {
panic!("collect_prob_dist(&calls, &events, &ins) returned Error")
}
let mut ins_calls_2 = bcf::Reader::from_path(test_file).unwrap();
if let Ok(prob_ins_abs) = collect_prob_dist(&mut ins_calls_2, &absent_event, &ins) {
assert_eq!(prob_ins_abs.len(), 1);
assert_relative_eq!(
prob_ins_abs[0].into_inner(),
Prob(0.8).ln(),
epsilon = 0.000005
);
} else {
panic!("collect_prob_dist(&calls, &absent_event, &ins) returned Error")
}
}
#[test]
fn test_filter_by_threshold() {
}
}