use std::cmp;
use std::error::Error;
use std::f64;
use std::ops::Range;
use bio::stats::LogProb;
use itertools::Itertools;
use rgsl::randist::gaussian::ugaussian_P;
use rust_htslib::bam;
use estimation::alignment_properties::AlignmentProperties;
use model::evidence;
use model::Variant;
pub fn n_fragment_positions(
insert_size: u32,
left_read_len: u32,
right_read_len: u32,
window: u32,
) -> u32 {
cmp::min(
cmp::max(
insert_size as i32 - left_read_len as i32 - right_read_len as i32,
0,
),
cmp::max(window as i32 - insert_size as i32 + 1, 0),
) as u32
}
pub fn estimate_insert_size(left: &bam::Record, right: &bam::Record) -> Result<u32, Box<Error>> {
let left_cigar = left.cigar_cached().unwrap();
let right_cigar = right.cigar_cached().unwrap();
let aln = |rec: &bam::Record, cigar| -> Result<(u32, u32), Box<Error>> {
Ok((
(rec.pos() as u32).saturating_sub(evidence::Clips::leading(cigar).both()),
cigar.end_pos()? as u32 + evidence::Clips::trailing(cigar).both(),
))
};
let (left_start, left_end) = aln(left, &left_cigar)?;
let (right_start, right_end) = aln(right, &right_cigar)?;
let inner_mate_distance = right_start as i32 - left_end as i32;
let insert_size =
inner_mate_distance + (left_end - left_start) as i32 + (right_end - right_start) as i32;
assert!(
insert_size > 0,
"bug: insert size {} is smaller than zero",
insert_size
);
Ok(insert_size as u32)
}
pub struct IndelEvidence {
alignment_properties: AlignmentProperties,
}
impl IndelEvidence {
pub fn new(alignment_properties: AlignmentProperties) -> Self {
IndelEvidence {
alignment_properties,
}
}
fn pmf_range(&self) -> Range<u32> {
let m = self.alignment_properties.insert_size().mean.round() as u32;
let s = self.alignment_properties.insert_size().sd.ceil() as u32 * 6;
m.saturating_sub(s)..m + s
}
fn pmf(&self, insert_size: u32, shift: f64) -> LogProb {
isize_pmf(
insert_size as f64,
self.alignment_properties.insert_size().mean + shift,
self.alignment_properties.insert_size().sd,
)
}
pub fn is_discriminative(&self, variant: &Variant) -> bool {
variant.len() as f64 > self.alignment_properties.insert_size().sd
}
pub fn prob(
&self,
insert_size: u32,
variant: &Variant,
) -> Result<(LogProb, LogProb), Box<Error>> {
let shift = match variant {
&Variant::Deletion(_) => variant.len() as f64,
&Variant::Insertion(_) => {
panic!(
"bug: insert-size based probability for insertions is currently unsupported"
);
}
&Variant::SNV(_) => panic!("no fragment observations for SNV"),
&Variant::None => panic!("no fragment observations for None"),
};
let p_ref = self.pmf(insert_size, 0.0);
let p_alt = self.pmf(insert_size, shift);
Ok((p_ref, p_alt))
}
pub fn prob_sample_alt(
&self,
left_read_len: u32,
right_read_len: u32,
variant: &Variant,
) -> LogProb {
let expected_prob_enclose = |left_feasible, right_feasible, delta| {
let read_offsets = left_read_len.saturating_sub(left_feasible)
+ right_read_len.saturating_sub(right_feasible);
let expected_p_alt = LogProb::ln_sum_exp(
&self
.pmf_range()
.filter_map(|x| {
if x <= delta || x <= delta + read_offsets {
None
} else {
let p = self.pmf(x, 0.0) +
LogProb(
(x.saturating_sub(delta).saturating_sub(read_offsets) as f64).ln() -
(x.saturating_sub(delta) as f64).ln()
);
assert!(p.is_valid(), "bug: invalid probability {:?}", p);
Some(p)
}
})
.collect_vec(),
)
.cap_numerical_overshoot(0.0001);
expected_p_alt
};
let expected_prob_overlap = |left_feasible, right_feasible, delta| {
let n_alt_left = cmp::min(delta, left_read_len);
let n_alt_right = cmp::min(delta, right_read_len);
let n_alt_valid =
cmp::min(n_alt_left, left_feasible) + cmp::min(n_alt_right, right_feasible);
LogProb((n_alt_valid as f64).ln() - ((n_alt_left + n_alt_right) as f64).ln())
};
let left_feasible = self
.alignment_properties
.feasible_bases(left_read_len, variant);
let right_feasible = self
.alignment_properties
.feasible_bases(right_read_len, variant);
match variant {
&Variant::Deletion(_) => {
let delta = 0;
expected_prob_enclose(left_feasible, right_feasible, delta)
}
&Variant::Insertion(ref seq) => {
let delta = seq.len() as u32;
expected_prob_overlap(left_feasible, right_feasible, delta)
}
&Variant::SNV(_) | &Variant::None => LogProb::ln_one(),
}
}
}
pub fn isize_pmf(value: f64, mean: f64, sd: f64) -> LogProb {
LogProb(
(ugaussian_P((value + 0.5 - mean) / sd) - ugaussian_P((value - 0.5 - mean) / sd)).ln(), )
}
#[cfg(test)]
mod tests {
use super::*;
fn _test_n_fragment_positions(insert_size: u32) -> u32 {
let window = 364;
let read_len = 100;
n_fragment_positions(insert_size, read_len, read_len, window)
}
#[test]
fn test_n_fragment_positions_too_small() {
let n = _test_n_fragment_positions(150);
assert_eq!(n, 0);
let n = _test_n_fragment_positions(200);
assert_eq!(n, 0);
}
#[test]
fn test_n_fragment_positions_exact() {
let n = _test_n_fragment_positions(201);
assert_eq!(n, 1);
}
#[test]
fn test_n_fragment_positions() {
let n = _test_n_fragment_positions(202);
assert_eq!(n, 2);
}
#[test]
fn test_n_fragment_positions_too_large() {
let n = _test_n_fragment_positions(800);
assert_eq!(n, 0);
}
}