use std::cmp;
use std::error::Error;
use std::fmt::Display;
use std::fs;
use std::hash::Hash;
use std::ops::Range;
use std::str;
use bio::io::fasta;
use bio::stats::{bayesian::bayes_factors::evidence::KassRaftery, LogProb, PHREDProb};
use counter::Counter;
use itertools::join;
use itertools::Itertools;
use ordered_float::NotNan;
use rust_htslib::bcf::Read;
use rust_htslib::{bam, bcf, bcf::record::Numeric};
use crate::model;
use crate::utils;
use crate::BCFError;
use crate::Event;
pub const NUMERICAL_EPSILON: f64 = 1e-3;
pub fn select<'a, T: Clone>(idx: &'a [usize], values: &'a [T]) -> impl Iterator<Item = T> + 'a {
idx.iter().map(move |i| values[*i].clone())
}
pub fn generalized_cigar<T: Hash + Eq + Clone + Display>(
items: impl Iterator<Item = T>,
keep_order: bool,
) -> String {
if keep_order {
join(
items
.map(|item| (item, 1))
.coalesce(|(a, n), (b, m)| {
if a == b {
Ok((a, n + m))
} else {
Err(((a, n), (b, m)))
}
})
.map(|(item, count)| format!("{}{}", count, item)),
"",
)
} else {
let items: Counter<T> = items.collect();
join(
items
.most_common()
.into_iter()
.map(|(item, count)| format!("{}{}", count, item)),
"",
)
}
}
pub fn evidence_kass_raftery_to_letter(evidence: KassRaftery) -> char {
match evidence {
KassRaftery::Barely => 'B',
KassRaftery::None => 'N',
KassRaftery::Positive => 'P',
KassRaftery::Strong => 'S',
KassRaftery::VeryStrong => 'V',
}
}
pub fn collect_variants(
record: &mut bcf::Record,
omit_snvs: bool,
omit_indels: bool,
indel_len_range: Option<Range<u32>>,
) -> Result<Vec<Option<model::Variant>>, Box<Error>> {
let pos = record.pos();
let svlens = match record.info(b"SVLEN").integer() {
Ok(Some(svlens)) => Some(
svlens
.into_iter()
.map(|l| {
if !l.is_missing() {
Some(l.abs() as u32)
} else {
None
}
})
.collect_vec(),
),
_ => None,
};
let end = match record.info(b"END").integer() {
Ok(Some(end)) => {
let end = end[0] as u32 - 1;
Some(end)
}
_ => None,
};
let svtype = match record.info(b"SVTYPE").string() {
Ok(Some(svtype)) => Some(svtype[0].to_owned()),
_ => None,
};
let is_valid_len = |svlen| {
if let Some(ref len_range) = indel_len_range {
if svlen < len_range.start || svlen >= len_range.end {
return false;
}
}
true
};
let is_valid_insertion_alleles = |ref_allele: &[u8], alt_allele: &[u8]| {
alt_allele == b"<INS>"
|| (ref_allele.len() < alt_allele.len()
&& ref_allele == &alt_allele[..ref_allele.len()])
};
let is_valid_deletion_alleles = |ref_allele: &[u8], alt_allele: &[u8]| {
alt_allele == b"<DEL>"
|| (ref_allele.len() > alt_allele.len()
&& &ref_allele[..alt_allele.len()] == alt_allele)
};
let variants = if let Some(svtype) = svtype {
vec![if omit_indels {
None
} else if svtype == b"INS" {
let alleles = record.alleles();
if alleles.len() > 2 {
return Err(Box::new(BCFError::InvalidRecord(
"SVTYPE=INS but more than one ALT allele".to_owned(),
)));
}
let ref_allele = alleles[0];
let alt_allele = alleles[1];
if alt_allele == b"<INS>" {
None
} else {
let len = alt_allele.len() - ref_allele.len();
if is_valid_insertion_alleles(ref_allele, alt_allele) && is_valid_len(len as u32) {
Some(model::Variant::Insertion(
alt_allele[ref_allele.len()..].to_owned(),
))
} else {
None
}
}
} else if svtype == b"DEL" {
let svlen = match (svlens, end) {
(Some(ref svlens), _) if svlens[0].is_some() => svlens[0].unwrap(),
(None, Some(end)) => end - (pos + 1),
_ => {
return Err(Box::new(BCFError::MissingTag("SVLEN or END".to_owned())));
}
};
if svlen == 0 {
return Err(Box::new(BCFError::InvalidRecord(
"Absolute value of SVLEN or END - POS must be greater than zero.".to_owned(),
)));
}
let alleles = record.alleles();
if alleles.len() > 2 {
return Err(Box::new(BCFError::InvalidRecord(
"SVTYPE=DEL but more than one ALT allele".to_owned(),
)));
}
let ref_allele = alleles[0];
let alt_allele = alleles[1];
if alt_allele == b"<DEL>" || is_valid_deletion_alleles(ref_allele, alt_allele) {
if is_valid_len(svlen) {
Some(model::Variant::Deletion(svlen))
} else {
None
}
} else {
None
}
} else {
None
}]
} else {
let alleles = record.alleles();
let ref_allele = alleles[0];
alleles
.iter()
.skip(1)
.enumerate()
.map(|(i, alt_allele)| {
if alt_allele == b"<*>" {
if omit_snvs {
None
} else {
Some(model::Variant::None)
}
} else if alt_allele == b"<DEL>" {
if let Some(ref svlens) = svlens {
if let Some(svlen) = svlens[i] {
Some(model::Variant::Deletion(svlen))
} else {
None
}
} else {
None
}
} else if alt_allele[0] == b'<' {
None
} else if alt_allele.len() == 1 && ref_allele.len() == 1 {
if omit_snvs {
None
} else {
Some(model::Variant::SNV(alt_allele[0]))
}
} else if alt_allele.len() == ref_allele.len() {
None
} else {
let indel_len =
(alt_allele.len() as i32 - ref_allele.len() as i32).abs() as u32;
if omit_indels {
None
} else if !is_valid_len(indel_len) {
None
} else if is_valid_deletion_alleles(ref_allele, alt_allele) {
Some(model::Variant::Deletion(
(ref_allele.len() - alt_allele.len()) as u32,
))
} else if is_valid_insertion_alleles(ref_allele, alt_allele) {
Some(model::Variant::Insertion(
alt_allele[ref_allele.len()..].to_owned(),
))
} else {
None
}
}
})
.collect_vec()
};
Ok(variants)
}
pub struct ReferenceBuffer {
pub(crate) reader: fasta::IndexedReader<fs::File>,
chrom: Option<Vec<u8>>,
sequence: Vec<u8>,
}
impl ReferenceBuffer {
pub fn new(fasta: fasta::IndexedReader<fs::File>) -> Self {
ReferenceBuffer {
reader: fasta,
chrom: None,
sequence: Vec::new(),
}
}
pub fn seq(&mut self, chrom: &[u8]) -> Result<&[u8], Box<Error>> {
if let Some(ref last_chrom) = self.chrom {
if last_chrom == &chrom {
return Ok(&self.sequence);
}
}
self.reader.fetch_all(str::from_utf8(chrom)?)?;
self.reader.read(&mut self.sequence)?;
self.chrom = Some(chrom.to_owned());
Ok(&self.sequence)
}
}
pub fn tags_prob_sum(
record: &mut bcf::Record,
tags: &[String],
vartype: Option<&model::VariantType>,
) -> Result<Vec<Option<LogProb>>, Box<Error>> {
let variants = (utils::collect_variants(record, false, false, None))?;
let mut tags_probs_out = vec![Vec::new(); variants.len()];
for tag in tags {
if let Some(tags_probs_in) = (record.info(tag.as_bytes()).float())? {
for (i, (variant, tag_prob)) in
variants.iter().zip(tags_probs_in.into_iter()).enumerate()
{
if let Some(ref variant) = *variant {
if (vartype.is_some() && !variant.is_type(vartype.unwrap()))
|| tag_prob.is_nan()
{
continue;
}
tags_probs_out[i].push(LogProb::from(PHREDProb(*tag_prob as f64)));
}
}
}
}
Ok(tags_probs_out
.into_iter()
.map(|probs| {
if !probs.is_empty() {
Some(LogProb::ln_sum_exp(&probs).cap_numerical_overshoot(NUMERICAL_EPSILON))
} else {
None
}
})
.collect_vec())
}
pub fn events_to_tags<E>(events: &[E]) -> Vec<String>
where
E: Event,
{
events.iter().map(|e| e.tag_name("PROB")).collect_vec()
}
pub fn collect_prob_dist<E>(
calls: &mut bcf::Reader,
events: &[E],
vartype: &model::VariantType,
) -> Result<Vec<NotNan<f64>>, Box<Error>>
where
E: Event,
{
let mut record = calls.empty_record();
let mut prob_dist = Vec::new();
let tags = events_to_tags(events);
loop {
if let Err(e) = calls.read(&mut record) {
if e.is_eof() {
break;
} else {
return Err(Box::new(e));
}
}
for p in utils::tags_prob_sum(&mut record, &tags, Some(&vartype))? {
if let Some(p) = p {
prob_dist.push(NotNan::new(*p)?);
}
}
}
prob_dist.sort();
Ok(prob_dist)
}
pub fn filter_by_threshold<E: Event>(
calls: &mut bcf::Reader,
threshold: Option<LogProb>,
out: &mut bcf::Writer,
events: &[E],
vartype: &model::VariantType,
) -> Result<(), Box<Error>> {
let tags = events.iter().map(|e| e.tag_name("PROB")).collect_vec();
let filter = |record: &mut bcf::Record| {
let probs = utils::tags_prob_sum(record, &tags, Some(vartype))?;
Ok(probs.into_iter().map(|p| {
match (p, threshold) {
(Some(p), Some(threshold)) if p > threshold || relative_eq!(*p, *threshold) => true,
(Some(_), None) => true,
_ => false,
}
}))
};
filter_calls(calls, out, filter)
}
pub fn filter_calls<F, I, II>(
calls: &mut bcf::Reader,
out: &mut bcf::Writer,
filter: F,
) -> Result<(), Box<Error>>
where
F: Fn(&mut bcf::Record) -> Result<II, Box<Error>>,
I: Iterator<Item = bool>,
II: IntoIterator<Item = bool, IntoIter = I>,
{
let mut record = calls.empty_record();
loop {
if let Err(e) = calls.read(&mut record) {
if e.is_eof() {
return Ok(());
} else {
return Err(Box::new(e));
}
}
let mut remove = vec![false];
remove.extend(filter(&mut record)?.into_iter().map(|keep| !keep));
assert_eq!(
remove.len(),
record.allele_count() as usize,
"bug: filter passed to filter_calls has to return a bool for each alt allele."
);
if !remove[1..].iter().all(|r| *r) {
record.remove_alleles(&remove)?;
out.write(&record)?;
}
}
}
pub fn max_prob(prob_a: LogProb, prob_b: LogProb) -> LogProb {
LogProb(*cmp::max(NotNan::from(prob_a), NotNan::from(prob_b)))
}
#[derive(Debug)]
pub enum Overlap {
Enclosing(u32),
Left(u32),
Right(u32),
Some(u32),
None,
}
impl Overlap {
pub fn new(
record: &bam::Record,
cigar: &bam::record::CigarStringView,
start: u32,
variant: &model::Variant,
consider_clips: bool,
) -> Result<Overlap, Box<Error>> {
let mut pos = record.pos() as u32;
let mut end_pos = cigar.end_pos()? as u32;
if consider_clips {
pos = pos.saturating_sub(model::evidence::Clips::leading(&cigar).soft());
end_pos = end_pos + model::evidence::Clips::trailing(&cigar).soft();
}
let overlap = match variant {
&model::Variant::SNV(_) | &model::Variant::None => {
if pos <= start && end_pos > start {
Overlap::Enclosing(1)
} else {
Overlap::None
}
}
&model::Variant::Deletion(l) => {
let end = start + l;
let enclosing = pos < start && end_pos > end;
if enclosing {
Overlap::Enclosing(l)
} else {
if end_pos <= end && end_pos > start {
Overlap::Right(end_pos - start)
} else if pos >= start && pos < end {
Overlap::Left(end - pos)
} else {
Overlap::None
}
}
}
&model::Variant::Insertion(ref seq) => {
let l = seq.len() as u32;
let center_pos = (end_pos - pos) / 2 + pos;
if pos < start && end_pos > start {
if start > center_pos {
let overlap = end_pos - start;
if overlap > l {
Overlap::Enclosing(l)
} else {
Overlap::Some(overlap)
}
} else {
let overlap = start - pos;
if overlap > l {
Overlap::Enclosing(l)
} else {
Overlap::Some(overlap)
}
}
} else {
Overlap::None
}
}
};
Ok(overlap)
}
pub fn is_enclosing(&self) -> bool {
if let &Overlap::Enclosing(_) = self {
true
} else {
false
}
}
pub fn is_none(&self) -> bool {
if let &Overlap::None = self {
true
} else {
false
}
}
pub fn len(&self) -> u32 {
match self {
&Overlap::Enclosing(l) => l,
&Overlap::Left(l) => l,
&Overlap::Right(l) => l,
&Overlap::Some(l) => l,
&Overlap::None => 0,
}
}
}
pub fn is_repeat_variant(start: u32, variant: &model::Variant, chrom_seq: &[u8]) -> bool {
let end = match variant {
&model::Variant::SNV(_) | &model::Variant::None | &model::Variant::Insertion(_) => {
start + 1
}
&model::Variant::Deletion(l) => start + l,
} as usize;
for nuc in &chrom_seq[start as usize..end] {
if (*nuc as char).is_lowercase() {
return true;
}
}
false
}
#[cfg(test)]
mod tests {
use super::*;
use crate::model::VariantType;
use crate::ComplementEvent;
use crate::SimpleEvent;
use bio::stats::{LogProb, Prob};
use rust_htslib::bcf::{self, Read};
#[test]
fn test_tags_prob_sum() {
let test_file = "tests/resources/test_tags_prob_sum/overshoot.vcf";
let mut overshoot_calls = bcf::Reader::from_path(test_file).unwrap();
let mut record = overshoot_calls.empty_record();
if let Err(e) = overshoot_calls.read(&mut record) {
panic!("BCF reading error: {}", e);
}
let alt_tags = [
String::from("PROB_ADO_TO_REF"),
String::from("PROB_ADO_TO_ALT"),
String::from("PROB_HOM_ALT"),
String::from("PROB_HET"),
String::from("PROB_ERR_REF"),
];
let snv = VariantType::SNV;
if let Ok(prob_sum) = tags_prob_sum(&mut record, &alt_tags, Some(&snv)) {
assert_eq!(LogProb::ln_one(), prob_sum[0].unwrap());
} else {
panic!("tags_prob_sum(&overshoot_calls, &alt_events, &snv) returned Error")
}
}
#[test]
fn test_collect_prob_dist() {
let events = vec![
SimpleEvent {
name: "germline".to_owned(),
},
SimpleEvent {
name: "somatic".to_owned(),
},
];
let absent_event = vec![ComplementEvent {
name: "absent".to_owned(),
}];
let test_file = "tests/resources/test_collect_prob_dist/min.calls.vcf";
let del = VariantType::Deletion(None);
let mut del_calls_1 = bcf::Reader::from_path(test_file).unwrap();
let prob_del = collect_prob_dist(&mut del_calls_1, &events, &del).unwrap();
println!("prob_del[0]: {:?}", prob_del[0].into_inner());
assert_eq!(prob_del.len(), 1);
assert_relative_eq!(prob_del[0].into_inner(), Prob(0.8).ln(), epsilon = 0.000005);
let mut del_calls_2 = bcf::Reader::from_path(test_file).unwrap();
let prob_del_abs = collect_prob_dist(&mut del_calls_2, &absent_event, &del).unwrap();
assert_eq!(prob_del_abs.len(), 1);
assert_relative_eq!(
prob_del_abs[0].into_inner(),
Prob(0.2).ln(),
epsilon = 0.000005
);
let ins = VariantType::Insertion(None);
let mut ins_calls_1 = bcf::Reader::from_path(test_file).unwrap();
let prob_ins = collect_prob_dist(&mut ins_calls_1, &events, &ins).unwrap();
assert_eq!(prob_ins.len(), 1);
assert_relative_eq!(prob_ins[0].into_inner(), Prob(0.2).ln(), epsilon = 0.000005);
let mut ins_calls_2 = bcf::Reader::from_path(test_file).unwrap();
let prob_ins_abs = collect_prob_dist(&mut ins_calls_2, &absent_event, &ins).unwrap();
assert_eq!(prob_ins_abs.len(), 1);
assert_relative_eq!(
prob_ins_abs[0].into_inner(),
Prob(0.8).ln(),
epsilon = 0.000005
);
}
#[test]
fn test_filter_by_threshold() {
}
}