use std::collections::{BTreeMap, HashMap, HashSet};
use std::hash::{Hash, Hasher};
use anyhow::Result;
use indexmap::IndexMap;
use rust_htslib::bam;
use super::bam_stat::{BamStatResult, GcDepthBin};
const GCD_BIN_SIZE: u64 = 20_000;
use super::common::{self, KnownJunctionSet, ReferenceJunctions};
use super::infer_experiment::{GeneModel, InferExperimentResult};
use super::inner_distance::{
build_histogram, ExonBitset, InnerDistanceResult, PairRecord, TranscriptTree,
};
use super::junction_annotation::{Junction, JunctionClass, JunctionResults};
use super::junction_saturation::SaturationResult;
use super::read_distribution::{ChromIntervals, ReadDistributionResult, RegionSets};
use super::read_duplication::ReadDuplicationResult;
use super::tin::TinAccum;
use crate::rna::preseq::PreseqAccum;
use crate::rna::bam_flags::*;
/// Borrowed annotation resources shared by the RSeQC-style QC modules.
/// Each member is `None` when the corresponding module is disabled or its
/// annotation input was not provided.
pub struct RseqcAnnotations<'a> {
    /// Gene model used for strand inference (infer_experiment).
    pub gene_model: Option<&'a GeneModel>,
    /// Known splice junctions used to classify observed junctions.
    pub ref_junctions: Option<&'a ReferenceJunctions>,
    /// Feature region sets (CDS/UTR/intron/TSS/TES) for read_distribution.
    pub rd_regions: Option<&'a RegionSets>,
    /// Per-chromosome exonic-base bitset used for mRNA inner distances.
    pub exon_bitset: Option<&'a ExonBitset>,
    /// Transcript interval tree used to test same-transcript pairing.
    pub transcript_tree: Option<&'a TranscriptTree>,
    /// Transcript index backing TIN computation.
    pub tin_index: Option<&'a super::tin::TinIndex>,
}
/// Tunables and per-module enable switches for the RSeQC-style QC pass.
/// Each module reads only its own fields; a disabled module's parameters
/// are ignored.
#[derive(Debug, Clone)]
#[allow(dead_code)]
pub struct RseqcConfig {
    /// MAPQ threshold: mapped reads below it are counted as non-unique.
    pub mapq_cut: u8,
    /// Cap on reads consumed for strandness inference — presumably a
    /// sample-size limit; confirm against the infer_experiment driver.
    pub infer_experiment_sample_size: u64,
    /// Minimum CIGAR `N` gap length treated as a real intron; shorter
    /// splice events are filtered out.
    pub min_intron: u64,
    // Junction-saturation resampling grid: minimum observations for a
    // junction to count, and the sampling-fraction sweep bounds/step
    // (units not visible here — confirm against junction_saturation).
    pub junction_saturation_min_coverage: u32,
    pub junction_saturation_sample_start: u32,
    pub junction_saturation_sample_end: u32,
    pub junction_saturation_sample_step: u32,
    /// Maximum number of read pairs sampled for inner-distance estimation.
    pub inner_distance_sample_size: u64,
    // Inner-distance histogram range and bin width.
    pub inner_distance_lower_bound: i64,
    pub inner_distance_upper_bound: i64,
    pub inner_distance_step: i64,
    // Per-module enable flags.
    pub bam_stat_enabled: bool,
    pub infer_experiment_enabled: bool,
    pub read_duplication_enabled: bool,
    pub read_distribution_enabled: bool,
    pub junction_annotation_enabled: bool,
    pub junction_saturation_enabled: bool,
    pub inner_distance_enabled: bool,
    // TIN: enable flag, transcripts sampled, minimum coverage filter, and
    // optional RNG seed for reproducible transcript sampling.
    pub tin_enabled: bool,
    pub tin_sample_size: usize,
    pub tin_min_coverage: u32,
    pub tin_seed: Option<u64>,
    /// RNG seed for junction-saturation subsampling.
    pub junction_saturation_seed: u64,
    // Preseq library-complexity extrapolation settings.
    pub preseq_enabled: bool,
    pub preseq_max_segment_length: i64,
}
/// Streaming accumulator for per-record BAM statistics, combining
/// flagstat-style totals, samtools-stats-style detailed histograms, and
/// RSeQC bam_stat classification. Filled by `process_read`; shards are
/// combined with `merge`.
#[derive(Debug)]
pub struct BamStatAccum {
    // --- RSeQC bam_stat classification. After the early-return chain at
    // the end of `process_read`, every record lands in exactly one of:
    // qc_failed / duplicates / non_primary / unmapped / non_unique / unique.
    pub total_records: u64,
    pub qc_failed: u64,
    pub duplicates: u64,
    pub non_primary: u64,
    pub unmapped: u64,
    pub non_unique: u64, // mapped but MAPQ below the cutoff
    pub unique: u64,     // mapped with MAPQ >= cutoff
    // Breakdown of `unique` reads:
    pub read_1: u64,
    pub read_2: u64,
    pub forward: u64,
    pub reverse: u64,
    pub splice: u64, // CIGAR contains an N (RefSkip) operation
    pub non_splice: u64,
    pub proper_pairs: u64,
    pub proper_pair_diff_chrom: u64, // proper pairs whose mates have different tids
    // --- flagstat-style counters (counted before any filtering) ---
    pub secondary: u64,
    pub supplementary: u64,
    pub mapped: u64,
    pub paired_flagstat: u64,
    pub read1_flagstat: u64,
    pub read2_flagstat: u64,
    pub first_fragments: u64, // primary records without the READ2 flag
    pub last_fragments: u64,  // primary READ2 records
    pub properly_paired: u64,
    pub both_mapped: u64,
    pub singletons: u64,
    pub mate_diff_chr: u64,
    pub mate_diff_chr_mapq5: u64, // subset of the above with MAPQ >= 5
    // tid -> (mapped, unmapped) record counts (idxstats-like).
    pub chrom_counts: HashMap<i32, (u64, u64)>,
    pub unplaced_unmapped: u64, // unmapped records with no tid
    // --- length and base totals over primary records ---
    pub total_len: u64,
    pub total_first_fragment_len: u64,
    pub total_last_fragment_len: u64,
    pub bases_mapped: u64,       // SEQ lengths of mapped primaries
    pub bases_mapped_cigar: u64, // M/I/=/X bases summed from CIGAR
    pub bases_duplicated: u64,
    pub max_len: u64,
    pub max_first_fragment_len: u64,
    pub max_last_fragment_len: u64,
    pub quality_sum: f64, // base-quality sum over QC-pass records
    pub quality_count: u64,
    pub mismatches: u64, // sum of NM aux tags on mapped primaries
    // insert size -> [total, inward, outward, other] pair counts (size
    // capped at 8000 in `process_read`).
    pub is_hist: HashMap<u64, [u64; 4]>,
    pub inward_pairs: u64,
    pub outward_pairs: u64,
    pub other_orientation: u64,
    pub primary_count: u64,
    pub primary_mapped: u64,
    pub primary_duplicates: u64,
    pub reads_mq0: u64, // mapped primaries with MAPQ == 0
    pub reads_mapped_and_paired: u64,
    // Read-length histograms: all primaries / first fragments / last fragments.
    pub rl_hist: HashMap<u64, u64>,
    pub frl_hist: HashMap<u64, u64>,
    pub lrl_hist: HashMap<u64, u64>,
    // MAPQ histogram over mapped, QC-pass, non-duplicate records.
    pub mapq_hist: [u64; 256],
    // Per-cycle quality histograms (64 quality bins), first/last fragment.
    pub ffq: Vec<[u64; 64]>,
    pub lfq: Vec<[u64; 64]>,
    // Per-read GC-fraction histograms (200 bins), first/last fragment.
    pub gcf: [u64; 200],
    pub gcl: [u64; 200],
    // Per-cycle base composition [A, C, G, T, N, other]; `_ro` variants
    // count cycles in read-original (sequencing) orientation.
    pub fbc: Vec<[u64; 6]>,
    pub lbc: Vec<[u64; 6]>,
    pub fbc_ro: Vec<[u64; 6]>,
    pub lbc_ro: Vec<[u64; 6]>,
    // Per-cycle ACGT counts, reverse-complemented for reverse-strand reads.
    pub gcc_rc: Vec<[u64; 4]>,
    // Whole-run base totals [A, C, G, T, N], first/last fragment.
    pub ftc: [u64; 5],
    pub ltc: [u64; 5],
    // indel length -> [insertion count, deletion count].
    pub id_hist: HashMap<u64, [u64; 2]>,
    // Per-cycle indel counts [ins fragment 1, ins fragment 2, del 1, del 2].
    pub ic: Vec<[u64; 4]>,
    // Rolling CRC32 checksums: [read names, packed sequences, qualities].
    pub chk: [u32; 3],
    // depth -> number of reference positions observed at that depth.
    pub cov_hist: HashMap<u32, u64>,
    // Circular per-position depth buffer that feeds `cov_hist`.
    cov_buf: Vec<u32>,
    cov_buf_idx: usize, // slot holding reference position `cov_buf_pos`
    cov_buf_pos: i64,   // leftmost not-yet-flushed reference position
    cov_buf_tid: i32,   // chromosome currently buffered (-1 = none yet)
    // GC-vs-depth bins, one per ~GCD_BIN_SIZE bp of covered reference.
    gcd_bins: Vec<GcDepthBin>,
    gcd_pos: i64, // anchor position of the current GC-depth bin
    gcd_tid: i32,
}
impl Default for BamStatAccum {
    /// All counters start at zero/empty. The `-1` sentinels mean "no
    /// chromosome seen yet", and the coverage ring buffer starts with a
    /// fixed 1500-slot capacity (grown on demand in `process_read`).
    fn default() -> Self {
        Self {
            // RSeQC bam_stat classification counters.
            total_records: 0,
            qc_failed: 0,
            duplicates: 0,
            non_primary: 0,
            unmapped: 0,
            non_unique: 0,
            unique: 0,
            read_1: 0,
            read_2: 0,
            forward: 0,
            reverse: 0,
            splice: 0,
            non_splice: 0,
            proper_pairs: 0,
            proper_pair_diff_chrom: 0,
            // flagstat-style counters.
            secondary: 0,
            supplementary: 0,
            mapped: 0,
            paired_flagstat: 0,
            read1_flagstat: 0,
            read2_flagstat: 0,
            first_fragments: 0,
            last_fragments: 0,
            properly_paired: 0,
            both_mapped: 0,
            singletons: 0,
            mate_diff_chr: 0,
            mate_diff_chr_mapq5: 0,
            chrom_counts: HashMap::new(),
            unplaced_unmapped: 0,
            // Length / base totals.
            total_len: 0,
            total_first_fragment_len: 0,
            total_last_fragment_len: 0,
            bases_mapped: 0,
            bases_mapped_cigar: 0,
            bases_duplicated: 0,
            max_len: 0,
            max_first_fragment_len: 0,
            max_last_fragment_len: 0,
            quality_sum: 0.0,
            quality_count: 0,
            mismatches: 0,
            is_hist: HashMap::new(),
            inward_pairs: 0,
            outward_pairs: 0,
            other_orientation: 0,
            primary_count: 0,
            primary_mapped: 0,
            primary_duplicates: 0,
            reads_mq0: 0,
            reads_mapped_and_paired: 0,
            // Histograms and per-cycle accumulators.
            rl_hist: HashMap::new(),
            frl_hist: HashMap::new(),
            lrl_hist: HashMap::new(),
            mapq_hist: [0u64; 256],
            ffq: Vec::new(),
            lfq: Vec::new(),
            gcf: [0u64; 200],
            gcl: [0u64; 200],
            fbc: Vec::new(),
            lbc: Vec::new(),
            fbc_ro: Vec::new(),
            lbc_ro: Vec::new(),
            gcc_rc: Vec::new(),
            ftc: [0u64; 5],
            ltc: [0u64; 5],
            id_hist: HashMap::new(),
            ic: Vec::new(),
            chk: [0u32; 3],
            // Coverage machinery: ring buffer with an initial capacity,
            // no chromosome buffered yet.
            cov_hist: HashMap::new(),
            cov_buf: vec![0u32; 1500],
            cov_buf_idx: 0,
            cov_buf_pos: 0,
            cov_buf_tid: -1,
            // GC-depth bins: the -1 anchors force a new bin on first use.
            gcd_bins: Vec::new(),
            gcd_pos: -1,
            gcd_tid: -1,
        }
    }
}
impl BamStatAccum {
/// Fold one BAM record into every bam_stat counter.
///
/// Maintains three metric families at once:
/// * flagstat-style totals (each counter applies its own flag conditions),
/// * samtools-stats-style accumulators (checksums, per-cycle quality/base
///   composition, GC, indels, insert-size orientation, coverage, GC-depth),
/// * RSeQC bam_stat classification via the early-return chain at the end,
///   which puts each record in exactly one bucket (qc_failed, duplicates,
///   non_primary, unmapped, non_unique, unique).
///
/// `mapq_cut` separates `unique` from `non_unique` mapped reads.
pub fn process_read(&mut self, record: &bam::Record, mapq_cut: u8) {
    // Decode the FLAG field once up front.
    let flags = record.flags();
    self.total_records += 1;
    let is_secondary = flags & BAM_FSECONDARY != 0;
    let is_supplementary = flags & BAM_FSUPPLEMENTARY != 0;
    let is_unmapped = flags & BAM_FUNMAP != 0;
    let is_paired = flags & BAM_FPAIRED != 0;
    let is_dup = flags & BAM_FDUP != 0;
    let is_qcfail = flags & BAM_FQCFAIL != 0;
    let is_primary = !is_secondary && !is_supplementary;
    let is_mapped = !is_unmapped;
    let tid = record.tid();
    let mapq = record.mapq();
    // --- flagstat-style counters (no QC/dup/MAPQ filtering) ---
    if is_secondary {
        self.secondary += 1;
    }
    if is_supplementary {
        self.supplementary += 1;
    }
    if is_mapped {
        self.mapped += 1;
    }
    if is_primary {
        // READ2 => "last" fragment; everything else (including unpaired)
        // counts as "first".
        if flags & BAM_FREAD2 != 0 {
            self.last_fragments += 1;
        } else {
            self.first_fragments += 1;
        }
    }
    if is_paired && is_primary {
        self.paired_flagstat += 1;
        if flags & BAM_FREAD1 != 0 {
            self.read1_flagstat += 1;
        }
        if flags & BAM_FREAD2 != 0 {
            self.read2_flagstat += 1;
        }
        if flags & BAM_FPROPER_PAIR != 0 {
            self.properly_paired += 1;
        }
        let mate_unmapped = flags & BAM_FMUNMAP != 0;
        if is_mapped && !mate_unmapped {
            self.both_mapped += 1;
            if tid != record.mtid() {
                self.mate_diff_chr += 1;
                if mapq >= 5 {
                    self.mate_diff_chr_mapq5 += 1;
                }
            }
        }
        if is_mapped && mate_unmapped {
            self.singletons += 1;
        }
    }
    // --- per-chromosome (mapped, unmapped) tallies, idxstats-like ---
    if is_unmapped {
        if tid >= 0 {
            // Placed-but-unmapped (e.g. mate-placed) records keep their tid.
            self.chrom_counts.entry(tid).or_insert((0, 0)).1 += 1;
        } else {
            self.unplaced_unmapped += 1;
        }
    } else if tid >= 0 {
        self.chrom_counts.entry(tid).or_insert((0, 0)).0 += 1;
    }
    // --- rolling CRC32 checksums of names, sequences and qualities ---
    {
        let qname = record.qname();
        let name_crc = crc32fast::hash(qname);
        self.chk[0] = self.chk[0].wrapping_add(name_crc);
        let seq_len = record.seq_len();
        if seq_len > 0 {
            // Hash the packed 4-bit sequence bytes directly, avoiding a
            // decode pass.
            let seq_bytes = unsafe {
                // SAFETY: per the htslib bam1_t layout, the record's data
                // buffer holds qname (l_qname bytes), then n_cigar u32
                // CIGAR ops, then ceil(seq_len / 2) packed sequence bytes.
                // seq_len > 0 was checked above, so this slice lies within
                // the record's allocation.
                let inner = record.inner();
                let data = inner.data;
                let seq_offset =
                    inner.core.l_qname as isize + ((inner.core.n_cigar as isize) << 2);
                let seq_nbytes = seq_len.div_ceil(2);
                std::slice::from_raw_parts(data.offset(seq_offset), seq_nbytes)
            };
            let seq_crc = crc32fast::hash(seq_bytes);
            self.chk[1] = self.chk[1].wrapping_add(seq_crc);
            let qual = record.qual();
            let qual_crc = crc32fast::hash(qual);
            self.chk[2] = self.chk[2].wrapping_add(qual_crc);
        }
    }
    // GC count of this read's sequence, filled by the per-cycle pass below
    // and reused by the GC-depth block to avoid recounting.
    let mut primary_gc_count: u64 = 0;
    // --- primary-record statistics ---
    if is_primary {
        self.primary_count += 1;
        let seq_len = record.seq_len() as u64;
        let mate_unmapped = flags & BAM_FMUNMAP != 0;
        self.total_len += seq_len;
        // Length totals/maxima and histograms, split by fragment role.
        let is_last_fragment = is_paired && flags & BAM_FREAD2 != 0;
        if is_last_fragment {
            self.total_last_fragment_len += seq_len;
            if seq_len > self.max_last_fragment_len {
                self.max_last_fragment_len = seq_len;
            }
        } else {
            self.total_first_fragment_len += seq_len;
            if seq_len > self.max_first_fragment_len {
                self.max_first_fragment_len = seq_len;
            }
        }
        if seq_len > self.max_len {
            self.max_len = seq_len;
        }
        *self.rl_hist.entry(seq_len).or_insert(0) += 1;
        if is_last_fragment {
            *self.lrl_hist.entry(seq_len).or_insert(0) += 1;
        } else {
            *self.frl_hist.entry(seq_len).or_insert(0) += 1;
        }
        if is_dup {
            self.primary_duplicates += 1;
            self.bases_duplicated += seq_len;
        }
        if is_mapped && is_paired && !is_qcfail && !mate_unmapped {
            self.reads_mapped_and_paired += 1;
        }
        if is_mapped {
            self.primary_mapped += 1;
            self.bases_mapped += seq_len;
            if record.mapq() == 0 {
                self.reads_mq0 += 1;
            }
            // NM (edit distance) aux tag: accept any integer encoding;
            // negative signed values are ignored.
            if let Ok(rust_htslib::bam::record::Aux::U8(nm)) = record.aux(b"NM") {
                self.mismatches += u64::from(nm);
            } else if let Ok(rust_htslib::bam::record::Aux::U16(nm)) = record.aux(b"NM") {
                self.mismatches += u64::from(nm);
            } else if let Ok(rust_htslib::bam::record::Aux::U32(nm)) = record.aux(b"NM") {
                self.mismatches += u64::from(nm);
            } else if let Ok(rust_htslib::bam::record::Aux::I8(nm)) = record.aux(b"NM") {
                if nm > 0 {
                    self.mismatches += nm as u64;
                }
            } else if let Ok(rust_htslib::bam::record::Aux::I16(nm)) = record.aux(b"NM") {
                if nm > 0 {
                    self.mismatches += nm as u64;
                }
            } else if let Ok(rust_htslib::bam::record::Aux::I32(nm)) = record.aux(b"NM") {
                if nm > 0 {
                    self.mismatches += nm as u64;
                }
            }
            // Insert-size histogram with pair-orientation classification.
            if is_paired && !mate_unmapped {
                let tid = record.tid();
                let mtid = record.mtid();
                let tlen = record.insert_size();
                let abs_tlen = tlen.unsigned_abs();
                if abs_tlen > 0 || tid == mtid {
                    let pos = record.pos();
                    let mpos = record.mpos();
                    // Sign helpers: is_fst = +1 for READ1, is_fwd/is_mfwd =
                    // +1 for forward strand; pos_fst > 0 means the mate
                    // lies to the right of this read.
                    let pos_fst = mpos - pos;
                    let is_fst: i64 = if flags & BAM_FREAD1 != 0 { 1 } else { -1 };
                    let is_fwd: i64 = if flags & BAM_FREVERSE != 0 { -1 } else { 1 };
                    let is_mfwd: i64 = if flags & BAM_FMREVERSE != 0 { -1 } else { 1 };
                    // Same strand => "other"; otherwise the leftmost mate's
                    // strand decides inward (--> <--) vs outward (<-- -->).
                    // Histogram columns: 1 = inward, 2 = outward, 3 = other.
                    let orientation_idx = if is_fwd * is_mfwd > 0 {
                        self.other_orientation += 1;
                        3usize
                    } else if is_fst * pos_fst > 0 {
                        if is_fst * is_fwd > 0 {
                            self.inward_pairs += 1;
                            1usize
                        } else {
                            self.outward_pairs += 1;
                            2usize
                        }
                    } else if is_fst * pos_fst < 0 {
                        if is_fst * is_fwd > 0 {
                            self.outward_pairs += 1;
                            2usize
                        } else {
                            self.inward_pairs += 1;
                            1usize
                        }
                    } else {
                        // Mates start at the same position: counted inward.
                        self.inward_pairs += 1;
                        1usize
                    };
                    if abs_tlen > 0 {
                        // Column 0 is the per-size total; sizes cap at 8000.
                        let capped = abs_tlen.min(8000);
                        let entry = self.is_hist.entry(capped).or_insert([0; 4]);
                        entry[0] += 1;
                        entry[orientation_idx] += 1;
                    }
                }
            }
        }
        // Average base quality over QC-pass records.
        if !is_qcfail {
            let quals = record.qual();
            if !quals.is_empty() {
                let base_qual_sum: f64 = quals.iter().map(|&q| f64::from(q)).sum::<f64>();
                self.quality_sum += base_qual_sum;
                self.quality_count += quals.len() as u64;
            }
        }
        // MAPQ histogram over mapped, QC-pass, non-duplicate primaries.
        if is_mapped && !is_qcfail && !is_dup {
            self.mapq_hist[mapq as usize] += 1;
        }
        // --- per-cycle quality and base composition + GC distribution ---
        {
            let is_reverse = flags & BAM_FREVERSE != 0;
            let seq = record.seq();
            let quals = record.qual();
            let read_len = seq.len();
            // Select the first- or last-fragment accumulator set.
            let (qual_arr, base_arr, base_ro_arr, gc_arr, tc_arr) = if is_last_fragment {
                (
                    &mut self.lfq,
                    &mut self.lbc,
                    &mut self.lbc_ro,
                    &mut self.gcl,
                    &mut self.ltc,
                )
            } else {
                (
                    &mut self.ffq,
                    &mut self.fbc,
                    &mut self.fbc_ro,
                    &mut self.gcf,
                    &mut self.ftc,
                )
            };
            // Grow per-cycle vectors to cover this read length.
            if read_len > qual_arr.len() {
                qual_arr.resize(read_len, [0u64; 64]);
            }
            if read_len > base_arr.len() {
                base_arr.resize(read_len, [0u64; 6]);
            }
            if read_len > base_ro_arr.len() {
                base_ro_arr.resize(read_len, [0u64; 6]);
            }
            if read_len > self.gcc_rc.len() {
                self.gcc_rc.resize(read_len, [0u64; 4]);
            }
            let mut gc_count: u64 = 0;
            // Map the 4-bit base encoding (A=1, C=2, G=4, T=8, N=15) to
            // compact indices 0..4 = A,C,G,T,N; 5 = anything else.
            const BASE_IDX: [u8; 16] = [5, 0, 1, 5, 2, 5, 5, 5, 3, 5, 5, 5, 5, 5, 5, 4];
            // Complement for the reverse-strand gcc_rc counts (A<->T, C<->G).
            const RC_IDX: [u8; 6] = [3, 2, 1, 0, 4, 5];
            if !is_reverse {
                for i in 0..read_len {
                    let q = quals[i] as usize;
                    qual_arr[i][q.min(63)] += 1;
                    let base_idx = BASE_IDX[seq.encoded_base(i) as usize] as usize;
                    base_arr[i][base_idx] += 1;
                    base_ro_arr[i][base_idx] += 1;
                    if base_idx < 4 {
                        self.gcc_rc[i][base_idx] += 1;
                    }
                    if base_idx == 1 || base_idx == 2 {
                        gc_count += 1;
                    }
                    if base_idx < 5 {
                        tc_arr[base_idx] += 1;
                    }
                }
            } else {
                // Reverse-strand read: the `_ro` and gcc_rc accumulators
                // count cycles from the sequencing direction (reversed),
                // and gcc_rc stores the complemented base.
                for i in 0..read_len {
                    let ro_cycle = read_len - 1 - i;
                    let q = quals[i] as usize;
                    qual_arr[ro_cycle][q.min(63)] += 1;
                    let base_idx = BASE_IDX[seq.encoded_base(i) as usize] as usize;
                    base_arr[i][base_idx] += 1;
                    base_ro_arr[ro_cycle][base_idx] += 1;
                    if base_idx < 4 {
                        self.gcc_rc[ro_cycle][RC_IDX[base_idx] as usize] += 1;
                    }
                    if base_idx == 1 || base_idx == 2 {
                        gc_count += 1;
                    }
                    if base_idx < 5 {
                        tc_arr[base_idx] += 1;
                    }
                }
            }
            primary_gc_count = gc_count;
            // Spread the read across the 200-bin GC-fraction histogram:
            // every bin index in [gc*(199)/len, (gc+1)*(199)/len) gets one
            // count, clamped to the last bin.
            if read_len > 0 {
                let ngc: usize = 200;
                let gc_idx_min = gc_count as usize * (ngc - 1) / read_len;
                let mut gc_idx_max = (gc_count as usize + 1) * (ngc - 1) / read_len;
                if gc_idx_max >= ngc {
                    gc_idx_max = ngc - 1;
                }
                for item in gc_arr.iter_mut().take(gc_idx_max).skip(gc_idx_min) {
                    *item += 1;
                }
            }
        }
    }
    // --- CIGAR walk: indel-per-cycle histograms + coverage depth ---
    if is_mapped && !is_secondary {
        use rust_htslib::bam::record::Cigar as C;
        let is_reverse = flags & BAM_FREVERSE != 0;
        let read_len = record.seq_len();
        let tid = record.tid();
        let pos = record.pos();
        // Fragment role for the `ic` columns: 1 = first, 2 = last,
        // unpaired reads count as first.
        let order: u32 = if is_paired {
            (if flags & BAM_FREAD1 != 0 { 1 } else { 0 })
                + (if flags & BAM_FREAD2 != 0 { 2 } else { 0 })
        } else {
            1
        };
        let do_cov = read_len > 0;
        let buf_size = if do_cov {
            // Ensure the ring buffer spans at least 5 read lengths; when
            // growing, "unrotate" pending slots into the new buffer so the
            // logical order (starting at cov_buf_idx) is preserved.
            let need = read_len * 5;
            if need > self.cov_buf.len() {
                let old_size = self.cov_buf.len();
                let mut new_buf = vec![0u32; need];
                let head = old_size - self.cov_buf_idx;
                new_buf[..head].copy_from_slice(&self.cov_buf[self.cov_buf_idx..]);
                new_buf[head..head + self.cov_buf_idx]
                    .copy_from_slice(&self.cov_buf[..self.cov_buf_idx]);
                self.cov_buf = new_buf;
                self.cov_buf_idx = 0;
            }
            let bs = self.cov_buf.len();
            // New chromosome: drain everything, then re-anchor at `pos`.
            if tid != self.cov_buf_tid {
                self.flush_cov_buf_all();
                self.cov_buf_tid = tid;
                self.cov_buf_pos = pos;
                self.cov_buf_idx = 0;
            }
            // Positions behind this record's start can never gain depth
            // again (input assumed coordinate-sorted) — flush them.
            self.cov_buf_flush_to(pos, bs);
            bs
        } else {
            0
        };
        let cigar = record.cigar();
        let mut icycle: usize = 0; // current read cycle (query offset)
        let mut cigar_mapped: u64 = 0; // M/I/=/X base total
        let mut ref_pos = pos;
        for op in cigar.iter() {
            match op {
                C::Ins(n) => {
                    // Insertion: counts toward aligned bases and the
                    // per-cycle insertion histogram; the cycle is reported
                    // in sequencing orientation for reverse-strand reads.
                    let ncig = *n as usize;
                    let len = *n as u64;
                    cigar_mapped += len;
                    let id_entry = self.id_hist.entry(len).or_insert([0; 2]);
                    id_entry[0] += 1;
                    let idx = if is_reverse {
                        read_len.saturating_sub(icycle + ncig)
                    } else {
                        icycle
                    };
                    if idx >= self.ic.len() {
                        self.ic.resize(idx + 1, [0u64; 4]);
                    }
                    if order == 1 {
                        self.ic[idx][0] += 1;
                    }
                    if order == 2 {
                        self.ic[idx][1] += 1;
                    }
                    icycle += ncig;
                }
                C::Del(n) => {
                    // Deletion: attributed to the cycle just before the
                    // deleted block; skipped entirely (aside from the
                    // length histogram and reference advance) when it
                    // would precede the first cycle.
                    let len = *n as u64;
                    let id_entry = self.id_hist.entry(len).or_insert([0; 2]);
                    id_entry[1] += 1;
                    let idx = if is_reverse {
                        if icycle == 0 {
                            ref_pos += *n as i64;
                            continue;
                        }
                        read_len.saturating_sub(icycle + 1)
                    } else {
                        if icycle == 0 {
                            ref_pos += *n as i64;
                            continue;
                        }
                        icycle - 1
                    };
                    if idx >= self.ic.len() {
                        self.ic.resize(idx + 1, [0u64; 4]);
                    }
                    if order == 1 {
                        self.ic[idx][2] += 1;
                    }
                    if order == 2 {
                        self.ic[idx][3] += 1;
                    }
                    ref_pos += *n as i64;
                }
                C::Match(n) | C::Equal(n) | C::Diff(n) => {
                    // Aligned block: advances both read cycle and reference
                    // position, and contributes coverage depth.
                    let len = *n as u64;
                    cigar_mapped += len;
                    icycle += *n as usize;
                    if do_cov {
                        let end = ref_pos + *n as i64;
                        self.cov_buf_insert(ref_pos, end, buf_size);
                        ref_pos = end;
                    } else {
                        ref_pos += *n as i64;
                    }
                }
                C::RefSkip(n) => {
                    // Intron skip: reference only, no coverage.
                    ref_pos += *n as i64;
                }
                C::SoftClip(n) => {
                    // Soft clip: consumes read cycles only.
                    icycle += *n as usize;
                }
                C::HardClip(_) | C::Pad(_) => {}
            }
        }
        self.bases_mapped_cigar += cigar_mapped;
    }
    // --- GC-vs-depth bins ---
    if is_mapped && !is_secondary {
        let tid = record.tid();
        let pos = record.pos();
        let seq_len = record.seq_len();
        if seq_len > 0 {
            // Open a new bin when switching chromosome or moving more than
            // GCD_BIN_SIZE bp past the current bin's anchor.
            let new_bin = self.gcd_pos < 0
                || tid != self.gcd_tid
                || pos - self.gcd_pos > GCD_BIN_SIZE as i64;
            if new_bin {
                self.gcd_bins.push(GcDepthBin { gc: 0.0, depth: 0 });
                self.gcd_pos = pos;
                self.gcd_tid = tid;
            }
            if let Some(bin) = self.gcd_bins.last_mut() {
                bin.depth += 1;
                // Primaries reuse the GC count from the per-cycle pass;
                // others recount from the packed sequence (2 = C, 4 = G).
                let gc_count: u32 = if is_primary {
                    primary_gc_count as u32
                } else {
                    let seq = record.seq();
                    let mut count: u32 = 0;
                    for i in 0..seq_len {
                        let base = seq.encoded_base(i);
                        if base == 2 || base == 4 {
                            count += 1;
                        }
                    }
                    count
                };
                bin.gc += gc_count as f32 / seq_len as f32;
            }
        }
    }
    // --- RSeQC bam_stat classification: first matching bucket wins ---
    if is_qcfail {
        self.qc_failed += 1;
        return;
    }
    if is_dup {
        self.duplicates += 1;
        return;
    }
    if is_secondary {
        self.non_primary += 1;
        return;
    }
    if is_unmapped {
        self.unmapped += 1;
        return;
    }
    if mapq < mapq_cut {
        self.non_unique += 1;
        return;
    }
    // Uniquely-mapped read: break down by mate, strand, splicing, pairing.
    self.unique += 1;
    if flags & BAM_FREAD1 != 0 {
        self.read_1 += 1;
    }
    if flags & BAM_FREAD2 != 0 {
        self.read_2 += 1;
    }
    if flags & BAM_FREVERSE != 0 {
        self.reverse += 1;
    } else {
        self.forward += 1;
    }
    let has_splice = record
        .cigar()
        .iter()
        .any(|op| matches!(op, rust_htslib::bam::record::Cigar::RefSkip(_)));
    if has_splice {
        self.splice += 1;
    } else {
        self.non_splice += 1;
    }
    if is_paired && flags & BAM_FPROPER_PAIR != 0 {
        self.proper_pairs += 1;
        if tid != record.mtid() {
            self.proper_pair_diff_chrom += 1;
        }
    }
}
/// Emit buffered depths into `cov_hist` for reference positions strictly
/// before `pos`, advancing the ring-buffer window so `cov_buf_pos == pos`.
///
/// Two paths:
/// * `pos` jumped at least a full buffer length ahead: every buffered
///   position is behind `pos`, so slots are drained wholesale and the
///   window restarts at `pos`.
///   NOTE(review): this path drains `buf_size - 1` slots, leaving exactly
///   one slot unflushed (its depth will later be merged into whatever
///   position maps onto that slot) — confirm against the reference
///   implementation whether the full buffer should be drained here.
/// * otherwise: pop one position at a time until the window base reaches
///   `pos`.
///
/// Zero-depth slots are skipped: depth 0 is never recorded, and such slots
/// already hold 0 so no reset is needed.
fn cov_buf_flush_to(&mut self, pos: i64, buf_size: usize) {
    if pos - self.cov_buf_pos >= buf_size as i64 {
        let flush_count = buf_size - 1;
        for _ in 0..flush_count {
            let depth = self.cov_buf[self.cov_buf_idx];
            if depth > 0 {
                *self.cov_hist.entry(depth).or_insert(0) += 1;
                self.cov_buf[self.cov_buf_idx] = 0;
            }
            self.cov_buf_idx += 1;
            if self.cov_buf_idx >= buf_size {
                self.cov_buf_idx = 0;
            }
        }
        self.cov_buf_pos = pos;
    } else {
        while self.cov_buf_pos < pos {
            let depth = self.cov_buf[self.cov_buf_idx];
            if depth > 0 {
                *self.cov_hist.entry(depth).or_insert(0) += 1;
                self.cov_buf[self.cov_buf_idx] = 0;
            }
            self.cov_buf_idx += 1;
            if self.cov_buf_idx >= buf_size {
                self.cov_buf_idx = 0;
            }
            self.cov_buf_pos += 1;
        }
    }
}
/// Add one unit of depth for every reference position in `from..to`,
/// writing into the circular coverage buffer (`buf_size` is its current
/// length). Callers guarantee the window fits inside the buffer.
fn cov_buf_insert(&mut self, from: i64, to: i64, buf_size: usize) {
    // Translate the genomic window into ring slots relative to the
    // buffer's base position, wrapping modulo the buffer length.
    let base = self.cov_buf_idx + (from - self.cov_buf_pos) as usize;
    let span = (to - from).max(0) as usize;
    for step in 0..span {
        let slot = (base + step) % buf_size;
        self.cov_buf[slot] += 1;
    }
}
/// Drain every outstanding depth in the coverage ring buffer into
/// `cov_hist` and reset the buffer to its initial "no chromosome" state.
/// Zero-depth slots are skipped, so depth 0 is never recorded.
pub fn flush_cov_buf_all(&mut self) {
    for i in 0..self.cov_buf.len() {
        let depth = self.cov_buf[i];
        if depth != 0 {
            *self.cov_hist.entry(depth).or_insert(0) += 1;
            self.cov_buf[i] = 0;
        }
    }
    // Back to the pristine window used before the first record.
    self.cov_buf_idx = 0;
    self.cov_buf_pos = 0;
    self.cov_buf_tid = -1;
}
/// Fold another shard's accumulator into this one.
///
/// `other`'s coverage ring buffer is flushed first so its pending depths
/// land in its `cov_hist` before the histograms are combined. Scalars add,
/// maxima take the larger value, histograms merge key-wise, checksums
/// combine by wrapping addition, and GC-depth bins concatenate.
pub fn merge(&mut self, mut other: BamStatAccum) {
    other.flush_cov_buf_all();
    // Scalar counters.
    self.total_records += other.total_records;
    self.qc_failed += other.qc_failed;
    self.duplicates += other.duplicates;
    self.non_primary += other.non_primary;
    self.unmapped += other.unmapped;
    self.non_unique += other.non_unique;
    self.unique += other.unique;
    self.read_1 += other.read_1;
    self.read_2 += other.read_2;
    self.forward += other.forward;
    self.reverse += other.reverse;
    self.splice += other.splice;
    self.non_splice += other.non_splice;
    self.proper_pairs += other.proper_pairs;
    self.proper_pair_diff_chrom += other.proper_pair_diff_chrom;
    self.secondary += other.secondary;
    self.supplementary += other.supplementary;
    self.mapped += other.mapped;
    self.paired_flagstat += other.paired_flagstat;
    self.read1_flagstat += other.read1_flagstat;
    self.read2_flagstat += other.read2_flagstat;
    self.first_fragments += other.first_fragments;
    self.last_fragments += other.last_fragments;
    self.properly_paired += other.properly_paired;
    self.both_mapped += other.both_mapped;
    self.singletons += other.singletons;
    self.mate_diff_chr += other.mate_diff_chr;
    self.mate_diff_chr_mapq5 += other.mate_diff_chr_mapq5;
    // Per-chromosome (mapped, unmapped) pairs merge component-wise.
    for (tid, (m, u)) in other.chrom_counts {
        let entry = self.chrom_counts.entry(tid).or_insert((0, 0));
        entry.0 += m;
        entry.1 += u;
    }
    self.unplaced_unmapped += other.unplaced_unmapped;
    self.total_len += other.total_len;
    self.total_first_fragment_len += other.total_first_fragment_len;
    self.total_last_fragment_len += other.total_last_fragment_len;
    self.bases_mapped += other.bases_mapped;
    self.bases_mapped_cigar += other.bases_mapped_cigar;
    self.bases_duplicated += other.bases_duplicated;
    // Maxima: keep the larger of the two.
    if other.max_len > self.max_len {
        self.max_len = other.max_len;
    }
    if other.max_first_fragment_len > self.max_first_fragment_len {
        self.max_first_fragment_len = other.max_first_fragment_len;
    }
    if other.max_last_fragment_len > self.max_last_fragment_len {
        self.max_last_fragment_len = other.max_last_fragment_len;
    }
    self.quality_sum += other.quality_sum;
    self.quality_count += other.quality_count;
    self.mismatches += other.mismatches;
    // Insert-size rows merge column-wise ([total, inward, outward, other]).
    for (isize_val, counts) in other.is_hist {
        let entry = self.is_hist.entry(isize_val).or_insert([0; 4]);
        for i in 0..4 {
            entry[i] += counts[i];
        }
    }
    self.inward_pairs += other.inward_pairs;
    self.outward_pairs += other.outward_pairs;
    self.other_orientation += other.other_orientation;
    self.primary_count += other.primary_count;
    self.primary_mapped += other.primary_mapped;
    self.primary_duplicates += other.primary_duplicates;
    self.reads_mq0 += other.reads_mq0;
    self.reads_mapped_and_paired += other.reads_mapped_and_paired;
    // Read-length histograms.
    for (len, count) in other.rl_hist {
        *self.rl_hist.entry(len).or_insert(0) += count;
    }
    for (len, count) in other.frl_hist {
        *self.frl_hist.entry(len).or_insert(0) += count;
    }
    for (len, count) in other.lrl_hist {
        *self.lrl_hist.entry(len).or_insert(0) += count;
    }
    for i in 0..256 {
        self.mapq_hist[i] += other.mapq_hist[i];
    }
    // Per-cycle vectors merge element-wise, growing to the longer side.
    merge_vec_arrays(&mut self.ffq, other.ffq);
    merge_vec_arrays(&mut self.lfq, other.lfq);
    for i in 0..200 {
        self.gcf[i] += other.gcf[i];
        self.gcl[i] += other.gcl[i];
    }
    merge_vec_arrays(&mut self.fbc, other.fbc);
    merge_vec_arrays(&mut self.lbc, other.lbc);
    merge_vec_arrays(&mut self.fbc_ro, other.fbc_ro);
    merge_vec_arrays(&mut self.lbc_ro, other.lbc_ro);
    merge_vec_arrays(&mut self.gcc_rc, other.gcc_rc);
    for i in 0..5 {
        self.ftc[i] += other.ftc[i];
        self.ltc[i] += other.ltc[i];
    }
    // Indel length histogram: [insertions, deletions] per length.
    for (len, counts) in other.id_hist {
        let entry = self.id_hist.entry(len).or_insert([0; 2]);
        entry[0] += counts[0];
        entry[1] += counts[1];
    }
    merge_vec_arrays(&mut self.ic, other.ic);
    // Checksums combine commutatively via wrapping addition.
    for i in 0..3 {
        self.chk[i] = self.chk[i].wrapping_add(other.chk[i]);
    }
    for (depth, count) in other.cov_hist {
        *self.cov_hist.entry(depth).or_insert(0) += count;
    }
    // GC-depth bins are independent windows: simply concatenate.
    self.gcd_bins.append(&mut other.gcd_bins);
}
}
/// Strandness-inference tallies: counts of "read orientation vs gene
/// strand" pattern keys for paired (`p_strandness`) and single-end
/// (`s_strandness`) reads.
#[derive(Debug, Default)]
pub struct InferExpAccum {
    // Key format (built in `process_read`): for paired reads '1'/'2' for
    // the mate, then the mapping strand, then the ':'-joined gene strands,
    // e.g. "1+:-". Single-end keys omit the mate digit.
    pub p_strandness: HashMap<String, u64>,
    pub s_strandness: HashMap<String, u64>,
    // Scratch buffer reused across reads to avoid per-read allocations.
    key_buf: String,
}
impl InferExpAccum {
/// Tally one read's "mapping strand vs overlapping gene strand" pattern.
///
/// Skips QC-fail, duplicate, secondary, unmapped and low-MAPQ records, and
/// reads that overlap no gene in the model. The resulting key ('1'/'2'
/// mate digit for paired reads, mapping strand, then the ':'-joined gene
/// strands) is counted in the paired or single-end map as appropriate.
pub fn process_read(
    &mut self,
    record: &bam::Record,
    chrom: &str,
    model: &GeneModel,
    mapq_cut: u8,
) {
    use rust_htslib::bam::record::Cigar;
    let flags = record.flags();
    // All four filter flags are distinct bits, so one combined mask test
    // is equivalent to testing them individually.
    if flags & (BAM_FQCFAIL | BAM_FDUP | BAM_FSECONDARY | BAM_FUNMAP) != 0 {
        return;
    }
    if record.mapq() < mapq_cut {
        return;
    }
    let map_strand = if record.is_reverse() { '-' } else { '+' };
    let read_start = record.pos() as u64;
    // Query-consuming, reference-relevant span: M/I/=/X lengths summed.
    let mut qalen: u64 = 0;
    for op in record.cigar().iter() {
        if let Cigar::Match(l) | Cigar::Ins(l) | Cigar::Equal(l) | Cigar::Diff(l) = op {
            qalen += u64::from(*l);
        }
    }
    let read_end = read_start + qalen;
    let strands = model.find_strands(chrom, read_start, read_end);
    if strands.is_empty() {
        return;
    }
    // Build the pattern key in the reusable scratch buffer and pick the
    // destination map; paired keys start with the mate digit.
    self.key_buf.clear();
    let target = if record.is_paired() {
        self.key_buf.push(if record.is_first_in_template() {
            '1'
        } else {
            '2'
        });
        &mut self.p_strandness
    } else {
        &mut self.s_strandness
    };
    self.key_buf.push(map_strand);
    for (i, &s) in strands.iter().enumerate() {
        if i > 0 {
            self.key_buf.push(':');
        }
        self.key_buf.push(s as char);
    }
    // Allocate the owned key only on first sight of a pattern.
    if let Some(count) = target.get_mut(self.key_buf.as_str()) {
        *count += 1;
    } else {
        target.insert(self.key_buf.clone(), 1);
    }
}
/// Fold another shard's strandness tallies into this accumulator, key by
/// key; the scratch buffer is irrelevant and ignored.
pub fn merge(&mut self, other: InferExpAccum) {
    other.p_strandness.into_iter().for_each(|(key, n)| {
        *self.p_strandness.entry(key).or_default() += n;
    });
    other.s_strandness.into_iter().for_each(|(key, n)| {
        *self.s_strandness.entry(key).or_default() += n;
    });
}
}
/// Read-duplication tallies keyed two ways: by a hash of the read's
/// packed sequence (`seq_dup`) and by a hash of its mapped reference
/// footprint (`pos_dup`). Values are occurrence counts.
#[derive(Debug, Default)]
pub struct ReadDupAccum {
    pub seq_dup: HashMap<u128, u64>,
    pub pos_dup: HashMap<u64, u64>,
}
impl ReadDupAccum {
/// Count one read toward both duplication keys. Only mapped, QC-pass
/// records at or above the MAPQ threshold are considered.
pub fn process_read(&mut self, record: &bam::Record, chrom: &str, mapq_cut: u8) {
    let flags = record.flags();
    if flags & (BAM_FUNMAP | BAM_FQCFAIL) != 0 || record.mapq() < mapq_cut {
        return;
    }
    // Sequence-identity key: hash of the packed 4-bit bases.
    let seq_key = hash_sequence_encoded(&record.seq());
    *self.seq_dup.entry(seq_key).or_insert(0) += 1;
    // Position-identity key: hash of chromosome + aligned reference blocks.
    let cigar = record.cigar();
    let pos_key = hash_position_key(chrom, record.pos(), &cigar);
    *self.pos_dup.entry(pos_key).or_insert(0) += 1;
}
/// Combine another shard's duplication tallies into this one.
pub fn merge(&mut self, other: ReadDupAccum) {
    for (seq_key, n) in other.seq_dup.into_iter() {
        *self.seq_dup.entry(seq_key).or_default() += n;
    }
    for (pos_key, n) in other.pos_dup.into_iter() {
        *self.pos_dup.entry(pos_key).or_default() += n;
    }
}
}
/// Hash a read's packed (4-bit encoded) bases into a 128-bit key.
/// The high 64 bits hash the bases alone; the low 64 bits re-hash the
/// length together with the first hash, reducing the chance that two
/// distinct sequences collide.
fn hash_sequence_encoded(seq: &bam::record::Seq<'_>) -> u128 {
    let n = seq.len();
    let mut base_hasher = std::collections::hash_map::DefaultHasher::new();
    (0..n).for_each(|i| seq.encoded_base(i).hash(&mut base_hasher));
    let hi = base_hasher.finish();
    let mut mix_hasher = std::collections::hash_map::DefaultHasher::new();
    n.hash(&mut mix_hasher);
    hi.hash(&mut mix_hasher);
    ((hi as u128) << 64) | (mix_hasher.finish() as u128)
}
/// FNV-1a hash over a read's mapped location: chromosome name, alignment
/// start, and the [start, end) reference interval of every aligned
/// (M/=/X) CIGAR block. Reads with the same mapped footprint hash to the
/// same key, which `ReadDupAccum` uses for position-based duplication.
fn hash_position_key(chrom: &str, pos: i64, cigar: &bam::record::CigarStringView) -> u64 {
    use rust_htslib::bam::record::Cigar;
    let mut hash = crate::io::FNV1A_OFFSET;
    crate::io::fnv1a_update(&mut hash, chrom.as_bytes());
    crate::io::fnv1a_update(&mut hash, &pos.to_le_bytes());
    let mut cursor = pos;
    for op in cigar.iter() {
        match op {
            Cigar::Match(n) | Cigar::Equal(n) | Cigar::Diff(n) => {
                // Fold this aligned block's reference span into the hash.
                let stop = cursor + *n as i64;
                crate::io::fnv1a_update(&mut hash, &cursor.to_le_bytes());
                crate::io::fnv1a_update(&mut hash, &stop.to_le_bytes());
                cursor = stop;
            }
            // NOTE(review): SoftClip advances the reference cursor here
            // even though soft clips do not consume reference bases —
            // confirm this matches the intended duplicate-key definition.
            Cigar::Del(n) | Cigar::RefSkip(n) | Cigar::SoftClip(n) => {
                cursor += *n as i64;
            }
            Cigar::Ins(_) | Cigar::HardClip(_) | Cigar::Pad(_) => {}
        }
    }
    hash
}
/// Read-distribution tallies: how many aligned read blocks ("tags") fall
/// in each genomic feature class. Flank counters nest: a tag within 1 kb
/// of a TSS also increments the 5 kb and 10 kb counters.
#[derive(Debug, Default)]
pub struct ReadDistAccum {
    pub total_reads: u64,
    pub total_tags: u64, // one per aligned block, so >= total_reads
    pub cds_tags: u64,
    pub utr5_tags: u64,
    pub utr3_tags: u64,
    pub intron_tags: u64,
    // Upstream-of-TSS flank hits at 1/5/10 kb.
    pub tss_1k_tags: u64,
    pub tss_5k_tags: u64,
    pub tss_10k_tags: u64,
    // Downstream-of-TES flank hits at 1/5/10 kb.
    pub tes_1k_tags: u64,
    pub tes_5k_tags: u64,
    pub tes_10k_tags: u64,
    // Tags matching no region, or matching ambiguously (both UTRs, or
    // both TSS and TES flanks).
    pub unassigned: u64,
}
impl ReadDistAccum {
/// Assign each aligned block ("tag") of one read to a feature class using
/// the block midpoint, with fixed precedence: CDS exon, then 5'/3' UTR,
/// then intron, then TSS-upstream / TES-downstream flanks. A midpoint in
/// both UTRs, or in both 10 kb flanks, is ambiguous and goes to
/// `unassigned`; flank hits nest (1 kb implies 5 kb implies 10 kb).
/// QC-fail, duplicate, secondary and unmapped records are skipped.
pub fn process_read(&mut self, record: &bam::Record, chrom_upper: &str, regions: &RegionSets) {
    let flags = record.flags();
    if flags & BAM_FQCFAIL != 0
        || flags & BAM_FDUP != 0
        || flags & BAM_FSECONDARY != 0
        || flags & BAM_FUNMAP != 0
    {
        return;
    }
    self.total_reads += 1;
    // One "tag" per aligned (exonic) block of the read.
    let exon_blocks = fetch_exon_blocks_rseqc(record);
    for (block_start, block_end) in exon_blocks {
        self.total_tags += 1;
        // Classification uses only the block midpoint.
        let midpoint = block_start + (block_end - block_start) / 2;
        if point_in(&regions.cds_exon, chrom_upper, midpoint) {
            self.cds_tags += 1;
        } else {
            let in_utr5 = point_in(&regions.utr_5, chrom_upper, midpoint);
            let in_utr3 = point_in(&regions.utr_3, chrom_upper, midpoint);
            if in_utr5 && in_utr3 {
                // Ambiguous: overlapping 5' and 3' UTR annotations.
                self.unassigned += 1;
            } else if in_utr5 {
                self.utr5_tags += 1;
            } else if in_utr3 {
                self.utr3_tags += 1;
            } else if point_in(&regions.intron, chrom_upper, midpoint) {
                self.intron_tags += 1;
            } else {
                let in_tss_10k = point_in(&regions.tss_up_10kb, chrom_upper, midpoint);
                let in_tes_10k = point_in(&regions.tes_down_10kb, chrom_upper, midpoint);
                if in_tss_10k && in_tes_10k {
                    // Ambiguous: between two genes, in both flanks.
                    self.unassigned += 1;
                } else if in_tss_10k {
                    // Nested flank counters: 10 kb always, plus 5 kb/1 kb
                    // when the midpoint is that close.
                    self.tss_10k_tags += 1;
                    if point_in(&regions.tss_up_5kb, chrom_upper, midpoint) {
                        self.tss_5k_tags += 1;
                        if point_in(&regions.tss_up_1kb, chrom_upper, midpoint) {
                            self.tss_1k_tags += 1;
                        }
                    }
                } else if in_tes_10k {
                    self.tes_10k_tags += 1;
                    if point_in(&regions.tes_down_5kb, chrom_upper, midpoint) {
                        self.tes_5k_tags += 1;
                        if point_in(&regions.tes_down_1kb, chrom_upper, midpoint) {
                            self.tes_1k_tags += 1;
                        }
                    }
                } else {
                    // Matches nothing annotated at all.
                    self.unassigned += 1;
                }
            }
        }
    }
}
pub fn merge(&mut self, other: ReadDistAccum) {
self.total_reads += other.total_reads;
self.total_tags += other.total_tags;
self.cds_tags += other.cds_tags;
self.utr5_tags += other.utr5_tags;
self.utr3_tags += other.utr3_tags;
self.intron_tags += other.intron_tags;
self.tss_1k_tags += other.tss_1k_tags;
self.tss_5k_tags += other.tss_5k_tags;
self.tss_10k_tags += other.tss_10k_tags;
self.tes_1k_tags += other.tes_1k_tags;
self.tes_5k_tags += other.tes_5k_tags;
self.tes_10k_tags += other.tes_10k_tags;
self.unassigned += other.unassigned;
}
}
/// Junction-annotation tallies: per-junction observation counts with their
/// classification (insertion-ordered for stable output), plus event-level
/// totals across all splice events seen.
#[derive(Debug, Default)]
pub struct JuncAnnotAccum {
    // junction -> (times observed, class assigned at first sighting).
    pub junction_counts: IndexMap<Junction, (u64, JunctionClass)>,
    pub total_events: u64,
    pub known_events: u64,
    pub partial_novel_events: u64,
    pub complete_novel_events: u64,
    // Splice events shorter than the minimum intron size.
    pub filtered_events: u64,
}
impl JuncAnnotAccum {
/// Classify and count every splice junction (CIGAR `N` gap) in one read.
///
/// QC-fail, duplicate, secondary, unmapped and low-MAPQ records are
/// skipped. Gaps shorter than `min_intron` count only as filtered events;
/// the rest are classified against `ref_junctions` and tallied both per
/// junction and in the event totals.
pub fn process_read(
    &mut self,
    record: &bam::Record,
    chrom_upper: &str,
    ref_junctions: &ReferenceJunctions,
    min_intron: u64,
    mapq_cut: u8,
) {
    let flags = record.flags();
    if flags & (BAM_FQCFAIL | BAM_FDUP | BAM_FSECONDARY | BAM_FUNMAP) != 0
        || record.mapq() < mapq_cut
    {
        return;
    }
    let cigar = record.cigar();
    for (istart, iend) in common::fetch_introns(record.pos() as u64, cigar.as_ref()) {
        self.total_events += 1;
        // Splice gaps below the minimum intron size are noise — filter.
        if iend.saturating_sub(istart) < min_intron {
            self.filtered_events += 1;
            continue;
        }
        let class = classify_junction(chrom_upper, istart, iend, ref_junctions);
        *match class {
            JunctionClass::Annotated => &mut self.known_events,
            JunctionClass::PartialNovel => &mut self.partial_novel_events,
            JunctionClass::CompleteNovel => &mut self.complete_novel_events,
        } += 1;
        let key = Junction {
            chrom: chrom_upper.to_string(),
            intron_start: istart,
            intron_end: iend,
        };
        // First sighting records the class; later ones only bump the count.
        let slot = self.junction_counts.entry(key).or_insert((0, class));
        slot.0 += 1;
    }
}
/// Fold another shard's junction tallies into this one. Per-junction
/// counts add; the class already stored here is kept when a junction was
/// seen by both shards.
pub fn merge(&mut self, other: JuncAnnotAccum) {
    for (junction, (count, class)) in other.junction_counts {
        let slot = self.junction_counts.entry(junction).or_insert((0, class));
        slot.0 += count;
    }
    self.total_events += other.total_events;
    self.known_events += other.known_events;
    self.partial_novel_events += other.partial_novel_events;
    self.complete_novel_events += other.complete_novel_events;
    self.filtered_events += other.filtered_events;
}
}
/// Classify one intron against the reference junction set. Thin local
/// alias that delegates to `super::common::classify_junction`.
fn classify_junction(
    chrom: &str,
    intron_start: u64,
    intron_end: u64,
    reference: &ReferenceJunctions,
) -> JunctionClass {
    super::common::classify_junction(chrom, intron_start, intron_end, reference)
}
/// Junction-saturation bookkeeping: one hashed entry per observed splice
/// event (multiplicity preserved for later subsampling), plus a map from
/// each hash back to its human-readable "chrom:start-end" label.
#[derive(Debug, Default)]
pub struct JuncSatAccum {
    pub observations: Vec<u64>,
    unique_keys: HashMap<u64, String>,
}
/// Collapse a junction (chromosome, intron start, intron end) into a
/// compact `u64` key via the standard library's `DefaultHasher`, so the
/// saturation accumulator can store hashes instead of full strings.
///
/// Note: the hash is stable within a process but `DefaultHasher`'s
/// algorithm is not guaranteed stable across Rust releases, so keys must
/// not be persisted.
fn hash_junction_key(chrom: &str, start: u64, end: u64) -> u64 {
    // `Hash`/`Hasher` come from the file-level `use std::hash::{Hash,
    // Hasher}` import; the previous function-local `use std::hash::Hasher;`
    // was redundant and has been removed.
    let mut hasher = std::collections::hash_map::DefaultHasher::new();
    chrom.hash(&mut hasher);
    start.hash(&mut hasher);
    end.hash(&mut hasher);
    hasher.finish()
}
impl JuncSatAccum {
/// Record every qualifying splice event of one read for saturation
/// analysis. QC-fail, duplicate, secondary, unmapped and low-MAPQ records
/// are skipped, as are gaps shorter than `min_intron`.
pub fn process_read(
    &mut self,
    record: &bam::Record,
    chrom_upper: &str,
    min_intron: u64,
    mapq_cut: u8,
) {
    let flags = record.flags();
    if flags & (BAM_FQCFAIL | BAM_FDUP | BAM_FSECONDARY | BAM_FUNMAP) != 0
        || record.mapq() < mapq_cut
    {
        return;
    }
    let cigar = record.cigar();
    for (istart, iend) in common::fetch_introns(record.pos() as u64, cigar.as_ref()) {
        if iend - istart < min_intron {
            continue;
        }
        // Store every observation by hash (multiplicity matters for the
        // saturation curve); keep one readable label per distinct junction.
        let h = hash_junction_key(chrom_upper, istart, iend);
        self.observations.push(h);
        self.unique_keys
            .entry(h)
            .or_insert_with(|| format!("{}:{}-{}", chrom_upper, istart, iend));
    }
}
/// Combine another shard: observations concatenate (multiplicity kept),
/// and the label map keeps the first string seen for each hash.
pub fn merge(&mut self, mut other: JuncSatAccum) {
    self.observations.append(&mut other.observations);
    for (h, label) in other.unique_keys {
        self.unique_keys.entry(h).or_insert(label);
    }
}
}
/// One sampled read pair for the inner-distance report: the read name,
/// the computed inner distance (`None` when the mates map to different
/// chromosomes), and a fixed RSeQC-style classification label.
#[derive(Debug)]
pub struct InnerDistPair {
    pub name: Vec<u8>,
    pub distance: Option<i64>,
    pub classification: &'static str,
}
/// Inner-distance sampler: collects up to `sample_size` read pairs, the
/// per-pair records in `pairs`, and the numeric distances (for pairs
/// where one could be computed) in `distances`.
#[derive(Debug, Default)]
pub struct InnerDistAccum {
    pub pairs: Vec<InnerDistPair>,
    pub distances: Vec<i64>,
    // Number of pairs accepted so far; sampling stops at `sample_size`.
    pub pair_num: u64,
    pub sample_size: u64,
}
impl InnerDistAccum {
pub fn new(sample_size: u64) -> Self {
InnerDistAccum {
sample_size,
..Default::default()
}
}
/// Consider one record for inner-distance sampling.
///
/// Each properly visible pair is processed exactly once, from its leftmost
/// mate (ties broken by first-in-template). The inner distance is the gap
/// between this read's end and the mate's start; overlapping mates get a
/// negative distance (minus the overlapped exonic length). When both
/// mates fall in a common transcript and the gap is positive, the
/// distance is re-expressed in mRNA (exonic) coordinates where possible,
/// and the pair is labeled with an RSeQC-style classification string.
pub fn process_read(
    &mut self,
    record: &bam::Record,
    chrom_upper: &str,
    exon_bitset: &ExonBitset,
    transcript_tree: &TranscriptTree,
    mapq_cut: u8,
) {
    // Stop once the sampling budget is exhausted.
    if self.pair_num >= self.sample_size {
        return;
    }
    let flags = record.flags();
    if flags & BAM_FQCFAIL != 0
        || flags & BAM_FDUP != 0
        || flags & BAM_FSECONDARY != 0
        || flags & BAM_FUNMAP != 0
    {
        return;
    }
    if flags & BAM_FPAIRED == 0 || record.is_mate_unmapped() {
        return;
    }
    if record.mapq() < mapq_cut {
        return;
    }
    let read1_start = record.pos() as u64;
    let read2_start = record.mpos() as u64;
    // Mates on different chromosomes: record the pair once (from the
    // lower-tid mate) with no computable distance.
    if record.tid() != record.mtid() {
        if record.tid() > record.mtid() {
            return;
        }
        self.pair_num += 1;
        self.pairs.push(InnerDistPair {
            name: record.qname().to_vec(),
            distance: None,
            classification: "sameChrom=No",
        });
        return;
    }
    // Process each pair exactly once, from its leftmost mate; when both
    // start at the same position the second-in-template mate is used.
    if read2_start < read1_start {
        return;
    }
    if read2_start == read1_start && record.is_first_in_template() {
        return;
    }
    self.pair_num += 1;
    let read_name = record.qname().to_vec();
    // This read's reference end = start + aligned length + skipped-intron
    // span (both from the CIGAR, per the helper).
    let (qalen, splice_intron_size) = compute_qalen_and_intron_size(record);
    let read1_end = read1_start + qalen + splice_intron_size;
    let inner_dist: i64 = if read2_start >= read1_end {
        // Mates do not overlap: plain genomic gap.
        (read2_start - read1_end) as i64
    } else {
        // Overlapping mates: negative distance equal to the overlap of
        // the mate window with this read's aligned (exonic) blocks.
        let exon_blocks = fetch_exon_blocks_rseqc(record);
        let mut overlap_count: i64 = 0;
        for (ex_start, ex_end) in &exon_blocks {
            let ov_start = (*ex_start).max(read2_start);
            let ov_end = (*ex_end).min(read1_end);
            if ov_start < ov_end {
                overlap_count += (ov_end - ov_start) as i64;
            }
        }
        -overlap_count
    };
    // Do the two mates land in a common transcript? Probe the transcript
    // tree at this read's last base and at the mate's first base.
    let read1_genes =
        transcript_tree.find_overlapping(chrom_upper, read1_end.saturating_sub(1));
    let read2_genes = transcript_tree.find_overlapping(chrom_upper, read2_start);
    let common_genes: HashSet<_> = read1_genes.intersection(&read2_genes).collect();
    let classification: &'static str;
    if common_genes.is_empty() {
        classification = "sameTranscript=No,dist=genomic";
    } else if inner_dist > 0 {
        if !exon_bitset.has_chrom(chrom_upper) {
            classification = "unknownChromosome,dist=genomic";
        } else {
            // Count exonic bases in the gap: if the whole gap is exonic
            // the genomic distance already equals the mRNA distance; if
            // only part is, report the exonic portion as the mRNA
            // distance; if none is, fall back to genomic.
            let exonic_bases =
                exon_bitset.count_exonic_bases(chrom_upper, read1_end, read2_start);
            if exonic_bases as i64 == inner_dist {
                classification = "sameTranscript=Yes,sameExon=Yes,dist=mRNA";
            } else if exonic_bases > 0 {
                let mrna_dist = exonic_bases as i64;
                self.pairs.push(InnerDistPair {
                    name: read_name,
                    distance: Some(mrna_dist),
                    classification: "sameTranscript=Yes,sameExon=No,dist=mRNA",
                });
                self.distances.push(mrna_dist);
                return;
            } else {
                classification = "sameTranscript=Yes,nonExonic=Yes,dist=genomic";
            }
        }
    } else {
        classification = "readPairOverlap";
    }
    self.pairs.push(InnerDistPair {
        name: read_name,
        distance: Some(inner_dist),
        classification,
    });
    self.distances.push(inner_dist);
}
pub fn merge(&mut self, other: InnerDistAccum) {
self.pairs.extend(other.pairs);
self.distances.extend(other.distances);
self.pair_num += other.pair_num;
if self.pair_num > self.sample_size {
let limit = self.sample_size as usize;
self.pairs.truncate(limit);
self.distances.truncate(limit);
self.pair_num = self.sample_size;
}
}
}
/// Sum CIGAR ops into `(aligned query length, total spliced intron length)`.
///
/// M/=/X and I ops contribute to the query-aligned length; N (RefSkip)
/// contributes to the intron total; all other ops are ignored.
fn compute_qalen_and_intron_size(record: &bam::Record) -> (u64, u64) {
    use rust_htslib::bam::record::Cigar::*;
    record
        .cigar()
        .iter()
        .fold((0u64, 0u64), |(qalen, introns), op| match op {
            Match(n) | Equal(n) | Diff(n) | Ins(n) => (qalen + u64::from(*n), introns),
            RefSkip(n) => (qalen, introns + u64::from(*n)),
            _ => (qalen, introns),
        })
}
/// Walk the CIGAR and return the reference-coordinate exon blocks
/// (half-open `(start, end)` pairs) produced by Match ops.
///
/// NOTE(review): soft clips advance the reference cursor here even though
/// they do not consume reference per the SAM spec — presumably this mirrors
/// RSeQC's `fetch_exon` behavior (hence the `_rseqc` suffix); verify intended.
fn fetch_exon_blocks_rseqc(record: &bam::Record) -> Vec<(u64, u64)> {
    use rust_htslib::bam::record::Cigar::*;
    let mut blocks = Vec::new();
    let mut ref_pos = record.pos() as u64;
    for op in record.cigar().iter() {
        match op {
            Match(len) => {
                let block_start = ref_pos;
                ref_pos += u64::from(*len);
                blocks.push((block_start, ref_pos));
            }
            Del(len) | RefSkip(len) | SoftClip(len) => ref_pos += u64::from(*len),
            _ => {}
        }
    }
    blocks
}
/// One optional accumulator per QC module; `None` means the module is
/// disabled (or its required annotations were unavailable) for this run.
#[derive(Debug, Default)]
pub struct RseqcAccumulators {
    pub bam_stat: Option<BamStatAccum>,
    pub infer_exp: Option<InferExpAccum>,
    pub read_dup: Option<ReadDupAccum>,
    pub read_dist: Option<ReadDistAccum>,
    pub junc_annot: Option<JuncAnnotAccum>,
    pub junc_sat: Option<JuncSatAccum>,
    pub inner_dist: Option<InnerDistAccum>,
    pub tin: Option<TinAccum>,
    pub preseq: Option<PreseqAccum>,
}
impl RseqcAccumulators {
pub fn empty() -> Self {
Self {
bam_stat: None,
infer_exp: None,
read_dup: None,
read_dist: None,
junc_annot: None,
junc_sat: None,
inner_dist: None,
tin: None,
preseq: None,
}
}
pub fn new(config: &RseqcConfig, annotations: Option<&RseqcAnnotations>) -> Self {
RseqcAccumulators {
bam_stat: if config.bam_stat_enabled {
Some(BamStatAccum::default())
} else {
None
},
infer_exp: if config.infer_experiment_enabled {
Some(InferExpAccum::default())
} else {
None
},
read_dup: if config.read_duplication_enabled {
Some(ReadDupAccum::default())
} else {
None
},
read_dist: if config.read_distribution_enabled {
Some(ReadDistAccum::default())
} else {
None
},
junc_annot: if config.junction_annotation_enabled {
Some(JuncAnnotAccum::default())
} else {
None
},
junc_sat: if config.junction_saturation_enabled {
Some(JuncSatAccum::default())
} else {
None
},
inner_dist: if config.inner_distance_enabled {
Some(InnerDistAccum::new(config.inner_distance_sample_size))
} else {
None
},
tin: if config.tin_enabled {
annotations.and_then(|a| a.tin_index).map(|idx| {
TinAccum::new(
idx,
config.mapq_cut,
config.tin_min_coverage,
config.tin_seed,
)
})
} else {
None
},
preseq: if config.preseq_enabled {
Some(PreseqAccum::new(config.preseq_max_segment_length))
} else {
None
},
}
}
#[allow(clippy::too_many_arguments)]
pub fn process_read(
&mut self,
record: &bam::Record,
chrom: &str,
chrom_upper: &str,
annotations: &RseqcAnnotations,
config: &RseqcConfig,
) {
if let Some(ref mut accum) = self.bam_stat {
accum.process_read(record, config.mapq_cut);
}
if let Some(ref mut accum) = self.read_dup {
accum.process_read(record, chrom, config.mapq_cut);
}
if let (Some(ref mut accum), Some(model)) = (&mut self.infer_exp, annotations.gene_model) {
accum.process_read(record, chrom, model, config.mapq_cut);
}
if let (Some(ref mut accum), Some(regions)) = (&mut self.read_dist, annotations.rd_regions)
{
accum.process_read(record, chrom_upper, regions);
}
if let (Some(ref mut accum), Some(ref_junctions)) =
(&mut self.junc_annot, annotations.ref_junctions)
{
accum.process_read(
record,
chrom_upper,
ref_junctions,
config.min_intron,
config.mapq_cut,
);
}
if let Some(ref mut accum) = &mut self.junc_sat {
accum.process_read(record, chrom_upper, config.min_intron, config.mapq_cut);
}
if let (Some(ref mut accum), Some(exon_bitset), Some(transcript_tree)) = (
&mut self.inner_dist,
annotations.exon_bitset,
annotations.transcript_tree,
) {
accum.process_read(
record,
chrom_upper,
exon_bitset,
transcript_tree,
config.mapq_cut,
);
}
if let (Some(ref mut accum), Some(tin_index)) = (&mut self.tin, annotations.tin_index) {
accum.process_read(record, chrom_upper, tin_index);
}
if let Some(ref mut accum) = self.preseq {
accum.process_read(record);
}
}
pub fn merge(&mut self, other: RseqcAccumulators) {
if let (Some(ref mut a), Some(b)) = (&mut self.bam_stat, other.bam_stat) {
a.merge(b);
}
if let (Some(ref mut a), Some(b)) = (&mut self.infer_exp, other.infer_exp) {
a.merge(b);
}
if let (Some(ref mut a), Some(b)) = (&mut self.read_dup, other.read_dup) {
a.merge(b);
}
if let (Some(ref mut a), Some(b)) = (&mut self.read_dist, other.read_dist) {
a.merge(b);
}
if let (Some(ref mut a), Some(b)) = (&mut self.junc_annot, other.junc_annot) {
a.merge(b);
}
if let (Some(ref mut a), Some(b)) = (&mut self.junc_sat, other.junc_sat) {
a.merge(b);
}
if let (Some(ref mut a), Some(b)) = (&mut self.inner_dist, other.inner_dist) {
a.merge(b);
}
if let (Some(ref mut a), Some(b)) = (&mut self.tin, other.tin) {
a.merge(b);
}
if let (Some(ref mut a), Some(b)) = (&mut self.preseq, other.preseq) {
a.merge(b);
}
}
}
/// True when `point` falls inside any interval recorded for `chrom`;
/// chromosomes absent from the map never contain anything.
fn point_in(region_map: &HashMap<String, ChromIntervals>, chrom: &str, point: u64) -> bool {
    match region_map.get(chrom) {
        Some(intervals) => intervals.contains(point),
        None => false,
    }
}
/// Element-wise add `source`'s fixed-size arrays into `target`, growing
/// `target` with zero-filled arrays first when `source` is longer.
fn merge_vec_arrays<const N: usize>(target: &mut Vec<[u64; N]>, source: Vec<[u64; N]>) {
    if target.len() < source.len() {
        target.resize(source.len(), [0u64; N]);
    }
    for (dst, src) in target.iter_mut().zip(source) {
        for (slot, add) in dst.iter_mut().zip(src.iter()) {
            *slot += *add;
        }
    }
}
impl BamStatAccum {
    /// Consume the accumulator and freeze its counters into a `BamStatResult`.
    pub fn into_result(mut self) -> BamStatResult {
        // Flush any buffered coverage data before reading `cov_hist` below.
        self.flush_cov_buf_all();
        BamStatResult {
            // Record-level flag counts.
            total_records: self.total_records,
            qc_failed: self.qc_failed,
            duplicates: self.duplicates,
            non_primary: self.non_primary,
            unmapped: self.unmapped,
            non_unique: self.non_unique,
            unique: self.unique,
            read_1: self.read_1,
            read_2: self.read_2,
            forward: self.forward,
            reverse: self.reverse,
            splice: self.splice,
            non_splice: self.non_splice,
            proper_pairs: self.proper_pairs,
            proper_pair_diff_chrom: self.proper_pair_diff_chrom,
            // samtools-flagstat-style counts.
            secondary: self.secondary,
            supplementary: self.supplementary,
            mapped: self.mapped,
            paired_flagstat: self.paired_flagstat,
            read1_flagstat: self.read1_flagstat,
            read2_flagstat: self.read2_flagstat,
            first_fragments: self.first_fragments,
            last_fragments: self.last_fragments,
            properly_paired: self.properly_paired,
            both_mapped: self.both_mapped,
            singletons: self.singletons,
            mate_diff_chr: self.mate_diff_chr,
            mate_diff_chr_mapq5: self.mate_diff_chr_mapq5,
            chrom_counts: self.chrom_counts,
            unplaced_unmapped: self.unplaced_unmapped,
            // Length and base totals.
            total_len: self.total_len,
            total_first_fragment_len: self.total_first_fragment_len,
            total_last_fragment_len: self.total_last_fragment_len,
            bases_mapped: self.bases_mapped,
            bases_mapped_cigar: self.bases_mapped_cigar,
            bases_duplicated: self.bases_duplicated,
            max_len: self.max_len,
            max_first_fragment_len: self.max_first_fragment_len,
            max_last_fragment_len: self.max_last_fragment_len,
            // Quality / mismatch aggregates.
            quality_sum: self.quality_sum,
            quality_count: self.quality_count,
            mismatches: self.mismatches,
            is_hist: self.is_hist,
            // Pair orientation tallies.
            inward_pairs: self.inward_pairs,
            outward_pairs: self.outward_pairs,
            other_orientation: self.other_orientation,
            primary_count: self.primary_count,
            primary_mapped: self.primary_mapped,
            primary_duplicates: self.primary_duplicates,
            reads_mq0: self.reads_mq0,
            reads_mapped_and_paired: self.reads_mapped_and_paired,
            // Histograms and per-cycle tables.
            rl_hist: self.rl_hist,
            frl_hist: self.frl_hist,
            lrl_hist: self.lrl_hist,
            mapq_hist: self.mapq_hist,
            ffq: self.ffq,
            lfq: self.lfq,
            gcf: self.gcf,
            gcl: self.gcl,
            fbc: self.fbc,
            lbc: self.lbc,
            fbc_ro: self.fbc_ro,
            lbc_ro: self.lbc_ro,
            gcc_rc: self.gcc_rc,
            ftc: self.ftc,
            ltc: self.ltc,
            id_hist: self.id_hist,
            ic: self.ic,
            chk: self.chk,
            cov_hist: self.cov_hist,
            gcd_bins: self.gcd_bins,
        }
    }
}
impl InferExpAccum {
    /// Finalize strandness tallies into an `InferExperimentResult`.
    ///
    /// Decides the library type from whether paired-end and/or single-end
    /// observations are present, then reports the fraction of reads
    /// supporting each protocol and the undetermined remainder.
    pub fn into_result(self) -> InferExperimentResult {
        let paired_total: u64 = self.p_strandness.values().sum();
        let single_total: u64 = self.s_strandness.values().sum();
        let total = paired_total + single_total;
        // No sampled reads at all: nothing can be inferred.
        if total == 0 {
            return InferExperimentResult {
                total_sampled: 0,
                library_type: String::from("Undetermined"),
                frac_failed: 0.0,
                frac_protocol1: 0.0,
                frac_protocol2: 0.0,
            };
        }
        // Lookup helpers: a missing pattern key means a zero count.
        let p = |key: &str| self.p_strandness.get(key).copied().unwrap_or(0);
        let s = |key: &str| self.s_strandness.get(key).copied().unwrap_or(0);
        let pe_spec1 = p("1++") + p("1--") + p("2+-") + p("2-+");
        let pe_spec2 = p("1+-") + p("1-+") + p("2++") + p("2--");
        let se_spec1 = s("++") + s("--");
        let se_spec2 = s("+-") + s("-+");
        let (library_type, spec1, spec2) = match (paired_total > 0, single_total > 0) {
            (true, true) => (
                "Mixture".to_string(),
                pe_spec1 + se_spec1,
                pe_spec2 + se_spec2,
            ),
            (true, false) => ("PairEnd".to_string(), pe_spec1, pe_spec2),
            _ => ("SingleEnd".to_string(), se_spec1, se_spec2),
        };
        let determined = spec1 + spec2;
        let denom = total as f64;
        InferExperimentResult {
            total_sampled: total,
            library_type,
            frac_failed: (total - determined) as f64 / denom,
            frac_protocol1: spec1 as f64 / denom,
            frac_protocol2: spec2 as f64 / denom,
        }
    }
}
impl ReadDupAccum {
    /// Collapse the position- and sequence-based duplicate count maps into
    /// their respective duplication-level histograms.
    pub fn into_result(self) -> ReadDuplicationResult {
        ReadDuplicationResult {
            pos_histogram: build_dup_histogram(&self.pos_dup),
            seq_histogram: build_dup_histogram(&self.seq_dup),
        }
    }
}
/// Invert per-key occurrence counts into a histogram:
/// occurrence level -> number of keys observed at that level.
fn build_dup_histogram<K: Eq + std::hash::Hash>(counts: &HashMap<K, u64>) -> BTreeMap<u64, u64> {
    counts.values().fold(BTreeMap::new(), |mut hist, &level| {
        *hist.entry(level).or_default() += 1;
        hist
    })
}
impl ReadDistAccum {
    /// Finalize tag counts into a `ReadDistributionResult`, pairing each
    /// region class (CDS, UTRs, introns, TSS/TES flanks) with its total
    /// base span and the number of tags assigned to it.
    ///
    /// Fix: every `&regions.<field>` argument had been mojibake-garbled to
    /// `®ions.<field>` (U+00AE from a collapsed `&reg`); the `&regions`
    /// borrows are restored so the block compiles again.
    pub fn into_result(self, regions: &RegionSets) -> ReadDistributionResult {
        // Total bases covered by the intervals across all chromosomes.
        fn sum_bases(map: &HashMap<String, ChromIntervals>) -> u64 {
            map.values().map(|ci| ci.total_bases()).sum()
        }
        // (label, total region bases, tags assigned to the region).
        let region_data = vec![
            (
                "CDS_Exons".to_string(),
                sum_bases(&regions.cds_exon),
                self.cds_tags,
            ),
            (
                "5'UTR_Exons".to_string(),
                sum_bases(&regions.utr_5),
                self.utr5_tags,
            ),
            (
                "3'UTR_Exons".to_string(),
                sum_bases(&regions.utr_3),
                self.utr3_tags,
            ),
            (
                "Introns".to_string(),
                sum_bases(&regions.intron),
                self.intron_tags,
            ),
            (
                "TSS_up_1kb".to_string(),
                sum_bases(&regions.tss_up_1kb),
                self.tss_1k_tags,
            ),
            (
                "TSS_up_5kb".to_string(),
                sum_bases(&regions.tss_up_5kb),
                self.tss_5k_tags,
            ),
            (
                "TSS_up_10kb".to_string(),
                sum_bases(&regions.tss_up_10kb),
                self.tss_10k_tags,
            ),
            (
                "TES_down_1kb".to_string(),
                sum_bases(&regions.tes_down_1kb),
                self.tes_1k_tags,
            ),
            (
                "TES_down_5kb".to_string(),
                sum_bases(&regions.tes_down_5kb),
                self.tes_5k_tags,
            ),
            (
                "TES_down_10kb".to_string(),
                sum_bases(&regions.tes_down_10kb),
                self.tes_10k_tags,
            ),
        ];
        ReadDistributionResult {
            total_reads: self.total_reads,
            total_tags: self.total_tags,
            regions: region_data,
            unassigned_tags: self.unassigned,
        }
    }
}
impl JuncAnnotAccum {
pub fn into_result(self, bam_header_refs: &[(String, u64)]) -> JunctionResults {
let chrom_order: std::collections::HashMap<String, usize> = bam_header_refs
.iter()
.enumerate()
.map(|(i, (name, _))| (name.to_uppercase(), i))
.collect();
let sentinel = bam_header_refs.len();
let mut per_chrom: std::collections::BTreeMap<usize, Vec<_>> =
std::collections::BTreeMap::new();
for (junction, (count, class)) in self.junction_counts {
let idx = chrom_order
.get(&junction.chrom)
.copied()
.unwrap_or(sentinel);
per_chrom
.entry(idx)
.or_default()
.push((junction, (count, class)));
}
let junctions: IndexMap<_, _> = per_chrom
.into_values()
.flat_map(|v| v.into_iter())
.collect();
JunctionResults {
junctions,
total_events: self.total_events,
known_events: self.known_events,
partial_novel_events: self.partial_novel_events,
complete_novel_events: self.complete_novel_events,
filtered_events: self.filtered_events,
}
}
}
impl JuncSatAccum {
    /// Down-sample the junction observation stream at increasing
    /// percentages and count the distinct known/novel junctions reached at
    /// each depth (junction saturation analysis).
    pub fn into_result(
        mut self,
        known_junctions: &KnownJunctionSet,
        sample_start: u32,
        sample_end: u32,
        sample_step: u32,
        min_coverage: u32,
        seed: u64,
    ) -> SaturationResult {
        use rand::seq::SliceRandom;
        use rand::SeedableRng;
        use rand_chacha::ChaCha8Rng;
        // Hashes of observed junctions that also appear in the reference set.
        let known_hashes: HashSet<u64> = self
            .unique_keys
            .iter()
            .filter(|(_, key)| known_junctions.junctions.contains(*key))
            .map(|(&h, _)| h)
            .collect();
        // Seeded shuffle so any prefix of the stream is a reproducible
        // random subsample.
        let mut rng = ChaCha8Rng::seed_from_u64(seed);
        self.observations.shuffle(&mut rng);
        let mut percentages: Vec<u32> = (sample_start..=sample_end)
            .step_by(sample_step as usize)
            .collect();
        // Always evaluate the full data set as the final point.
        if *percentages.last().unwrap_or(&0) != 100 {
            percentages.push(100);
        }
        let total = self.observations.len();
        let mut junction_counts: HashMap<u64, u32> = HashMap::new();
        let mut prev_end = 0;
        let mut known_counts = Vec::with_capacity(percentages.len());
        let mut novel_counts = Vec::with_capacity(percentages.len());
        let mut all_counts = Vec::with_capacity(percentages.len());
        let mut running_known: usize = 0;
        let mut running_novel: usize = 0;
        for &pct in &percentages {
            let index_end = total * pct as usize / 100;
            // Only the new slice [prev_end, index_end) is processed at each
            // percentage; counts carry over incrementally.
            for &obs_hash in &self.observations[prev_end..index_end] {
                let count = junction_counts.entry(obs_hash).or_insert(0);
                *count += 1;
                let is_known = known_hashes.contains(&obs_hash);
                if *count == 1 {
                    if is_known {
                        // Known junctions count only once they reach
                        // `min_coverage` observations (immediately if the
                        // threshold is <= 1).
                        if min_coverage <= 1 {
                            running_known += 1;
                        }
                    } else {
                        // Novel junctions count on first sighting.
                        running_novel += 1;
                    }
                } else if is_known && *count == min_coverage && min_coverage > 1 {
                    running_known += 1;
                }
            }
            prev_end = index_end;
            known_counts.push(running_known);
            novel_counts.push(running_novel);
            all_counts.push(junction_counts.len());
        }
        SaturationResult {
            percentages,
            known_counts,
            novel_counts,
            all_counts,
        }
    }
}
impl InnerDistAccum {
    /// Convert the accumulated pairs and distances into the final
    /// `InnerDistanceResult`, binning distances into a histogram over
    /// `[lower_bound, upper_bound)` with the given `step`.
    pub fn into_result(
        self,
        lower_bound: i64,
        upper_bound: i64,
        step: i64,
    ) -> Result<InnerDistanceResult> {
        // Decode read names and materialize owned classification strings.
        let mut pairs = Vec::with_capacity(self.pairs.len());
        for rec in self.pairs {
            pairs.push(PairRecord {
                name: String::from_utf8_lossy(&rec.name).into_owned(),
                distance: rec.distance,
                classification: rec.classification.to_owned(),
            });
        }
        let histogram = build_histogram(&self.distances, lower_bound, upper_bound, step)?;
        Ok(InnerDistanceResult {
            pairs,
            histogram,
            total_pairs: self.pair_num,
        })
    }
}