use std::collections::HashMap;
use anyhow::{Context, Ok, Result};
use rand::prelude::*;
use rand_xoshiro::Xoshiro256PlusPlus;
use rust_htslib::bam::{self, Read, Record};
use serde::{Deserialize, Serialize};
use super::{normalize_distr, trim_distr};
#[derive(Clone, Debug, Serialize, Deserialize)]
pub struct ExpectedInsertSizeDistr {
distr: Vec<f64>,
}
impl ExpectedInsertSizeDistr {
pub fn from_bam(
aln_rdr: &mut bam::Reader,
subsample: f64,
percentile_limit: f64,
rng: &mut Xoshiro256PlusPlus,
) -> Result<Self> {
let mut unpaired_reads: HashMap<Vec<u8>, (i64, usize)> = HashMap::new();
let mut distr = Vec::new();
let adjusted_subsample = subsample * 2.0;
let mut record = Record::new();
while let Some(result) = aln_rdr.read(&mut record) {
result.context("Failed to read record from SAM/BAM/CRAM file while estimating insert size distribution")?;
if record.is_secondary()
|| record.is_supplementary()
|| record.is_unmapped()
|| record.pos() < 0
{
continue;
}
let qname = record.qname().to_vec();
let mut remove = false;
if unpaired_reads.contains_key(&qname) {
let pos = record.pos();
let len = record.seq().len();
let (mate_pos, mate_len) = *unpaired_reads.get(&qname).unwrap();
remove = true;
let insert_size = if pos > mate_pos {
(pos + len as i64) - mate_pos
} else {
(mate_pos + mate_len as i64) - pos
};
assert!(insert_size >= 0, "Encountered negative insert size");
if distr.len() <= insert_size as usize {
distr.resize(insert_size as usize + 1, 0.0);
}
distr[insert_size as usize] += 1.0;
} else if rng.gen_bool(adjusted_subsample) {
let pos = record.pos();
let len = record.seq().len();
unpaired_reads.insert(qname.clone(), (pos, len));
}
if remove {
unpaired_reads.remove(&qname);
}
}
trim_distr(&mut distr, percentile_limit);
normalize_distr(&mut distr);
Ok(Self { distr })
}
pub fn to_vec(&self) -> Vec<f64> {
self.distr.clone()
}
}
#[derive(Clone, Debug, Serialize, Deserialize)]
pub struct EstimatedDistr {
pub distr: Vec<f64>,
}
impl EstimatedDistr {
pub fn new() -> Self {
Self { distr: vec![] }
}
pub fn from_vec(distr: Vec<f64>) -> Self {
Self { distr }
}
#[inline]
pub fn add(&mut self, observation: u32) {
if self.distr.len() <= observation as usize {
self.distr.resize(observation as usize + 1, 1.0);
}
self.distr[observation as usize] += 1.0;
}
pub fn add_many(&mut self, observations: &[u32]) {
for &obs in observations {
self.add(obs);
}
}
pub fn norm(&mut self) {
let sum = self.distr.iter().sum::<f64>();
self.distr.iter_mut().for_each(|x| *x /= sum);
}
pub fn mean(&self) -> f64 {
self.distr
.iter()
.enumerate()
.map(|(depth, prob)| depth as f64 * prob)
.sum()
}
pub fn std_dev(&self) -> f64 {
self.variance().sqrt()
}
fn variance(&self) -> f64 {
let mean = self.mean();
self.distr
.iter()
.enumerate()
.map(|(depth, prob)| (depth as f64 - mean).powi(2) * prob)
.sum()
}
pub fn max(&self) -> (usize, f64) {
self.distr
.iter()
.cloned()
.enumerate()
.reduce(
|(i_max, x_max), (i, x)| {
if x > x_max {
(i, x)
} else {
(i_max, x_max)
}
},
)
.unwrap()
}
}
impl Default for EstimatedDistr {
fn default() -> Self {
Self::new()
}
}
impl<'a> IntoIterator for &'a EstimatedDistr {
type Item = &'a f64;
type IntoIter = std::slice::Iter<'a, f64>;
fn into_iter(self) -> Self::IntoIter {
self.distr.iter()
}
}
impl AsRef<Vec<f64>> for EstimatedDistr {
fn as_ref(&self) -> &Vec<f64> {
&self.distr
}
}
impl AsMut<Vec<f64>> for EstimatedDistr {
fn as_mut(&mut self) -> &mut Vec<f64> {
&mut self.distr
}
}
impl From<EstimatedDistr> for Vec<f64> {
fn from(distr: EstimatedDistr) -> Vec<f64> {
distr.distr
}
}