use std::collections::{HashMap, HashSet};
use coitrees::IntervalTree;
use rust_htslib::bam;
use rust_htslib::bam::record::Cigar;
use crate::cli::Strandedness;
use super::coverage::TranscriptCoverage;
use super::index::QualimapIndex;
#[derive(Debug, Clone)]
struct CachedBlockHits {
enclosing_genes: Vec<(u32, u8)>,
has_overlap_not_enclosed: bool,
}
use crate::rna::bam_flags::*;
type MateKey = (u64, i32, i64, i32, i64);
#[derive(Debug, Clone)]
struct JunctionMotif {
donor: [u8; 2],
acceptor: [u8; 2],
}
#[derive(Debug, Clone)]
struct MateData {
m_blocks: Vec<(i32, i32)>,
enclosing_genes: HashSet<u32>,
cached_hits: Vec<CachedBlockHits>,
is_reverse: bool,
is_first_of_pair: bool,
}
#[derive(Debug, Clone, Default)]
pub struct QualimapCounters {
pub primary_alignments: u64,
pub secondary_alignments: u64,
pub not_aligned: u64,
pub alignment_not_unique: u64,
pub exonic_reads: u64,
pub no_feature: u64,
pub ambiguous: u64,
pub intronic_reads: u64,
pub intergenic_reads: u64,
pub overlapping_exon: u64,
pub read_count: u64,
pub left_proper_in_pair: u64,
pub right_proper_in_pair: u64,
pub both_proper_in_pair: u64,
pub reads_at_junctions: u64,
pub known_junction_events: u64,
pub partly_known_junction_events: u64,
pub supplementary: u64,
pub ssp_fwd: u64,
pub ssp_rev: u64,
}
impl QualimapCounters {
pub fn merge(&mut self, other: &QualimapCounters) {
self.primary_alignments += other.primary_alignments;
self.secondary_alignments += other.secondary_alignments;
self.not_aligned += other.not_aligned;
self.alignment_not_unique += other.alignment_not_unique;
self.exonic_reads += other.exonic_reads;
self.no_feature += other.no_feature;
self.ambiguous += other.ambiguous;
self.intronic_reads += other.intronic_reads;
self.intergenic_reads += other.intergenic_reads;
self.overlapping_exon += other.overlapping_exon;
self.read_count += other.read_count;
self.left_proper_in_pair += other.left_proper_in_pair;
self.right_proper_in_pair += other.right_proper_in_pair;
self.both_proper_in_pair += other.both_proper_in_pair;
self.reads_at_junctions += other.reads_at_junctions;
self.known_junction_events += other.known_junction_events;
self.partly_known_junction_events += other.partly_known_junction_events;
self.supplementary += other.supplementary;
self.ssp_fwd += other.ssp_fwd;
self.ssp_rev += other.ssp_rev;
}
}
#[derive(Debug, Clone, Default)]
pub struct JunctionMotifCounts {
pub counts: HashMap<([u8; 2], [u8; 2]), u64>,
}
impl JunctionMotifCounts {
pub fn merge(&mut self, other: &JunctionMotifCounts) {
for (key, &count) in &other.counts {
*self.counts.entry(*key).or_insert(0) += count;
}
}
}
#[derive(Debug, Clone)]
pub struct QualimapAccum {
pub counters: QualimapCounters,
pub coverage: TranscriptCoverage,
pub junction_motifs: JunctionMotifCounts,
mate_buffer: HashMap<MateKey, MateData>,
stranded: Strandedness,
}
impl QualimapAccum {
pub fn new(stranded: Strandedness) -> Self {
Self {
counters: QualimapCounters::default(),
coverage: TranscriptCoverage::new(),
junction_motifs: JunctionMotifCounts::default(),
mate_buffer: HashMap::new(),
stranded,
}
}
pub fn process_read(&mut self, record: &bam::Record, chrom: &str, index: &QualimapIndex) {
let flags = record.flags();
log::trace!(
"QM process_read chrom={} flags={} pos={}",
chrom,
flags,
record.pos()
);
if flags & BAM_FUNMAP != 0 {
self.counters.not_aligned += 1;
return;
}
let is_multi_mapped = get_nh_tag(record).is_some_and(|nh| nh > 1);
if is_multi_mapped {
self.counters.alignment_not_unique += 1;
}
let is_secondary = flags & BAM_FSECONDARY != 0;
if is_secondary {
self.counters.secondary_alignments += 1;
return;
}
self.counters.primary_alignments += 1;
if flags & BAM_FPAIRED != 0 {
if flags & BAM_FREAD1 != 0 {
self.counters.left_proper_in_pair += 1;
}
if flags & BAM_FREAD2 != 0 {
self.counters.right_proper_in_pair += 1;
}
if flags & BAM_FPROPER_PAIR != 0 {
self.counters.both_proper_in_pair += 1;
}
}
if flags & BAM_FSUPPLEMENTARY != 0 {
self.counters.supplementary += 1;
return;
}
if flags & BAM_FQCFAIL != 0 {
return;
}
self.counters.read_count += 1;
if is_multi_mapped {
return;
}
let (m_blocks, n_op_count, motifs, junction_ref_positions) =
extract_m_blocks_and_junctions(record);
if m_blocks.is_empty() {
return;
}
self.counters.reads_at_junctions += n_op_count as u64;
for motif in &motifs {
*self
.junction_motifs
.counts
.entry((motif.donor, motif.acceptor))
.or_insert(0) += 1;
}
for &(pos_ref1, pos_ref2) in &junction_ref_positions {
let j1 = index.has_junction_overlap(chrom, pos_ref1);
let j2 = index.has_junction_overlap(chrom, pos_ref2);
if j1 && j2 {
self.counters.known_junction_events += 1;
} else if j1 || j2 {
self.counters.partly_known_junction_events += 1;
}
}
let cached_hits = cache_block_hits(&m_blocks, chrom, index);
let is_reverse = flags & BAM_FREVERSE != 0;
let is_first_of_pair = if flags & BAM_FPAIRED != 0 {
flags & BAM_FREAD1 != 0
} else {
true
};
let enclosing_genes =
find_enclosing_genes_cached(&cached_hits, self.stranded, is_reverse, is_first_of_pair);
if flags & BAM_FPAIRED == 0 || flags & BAM_FPROPER_PAIR == 0 {
self.add_per_block_coverage_cached(&m_blocks, &cached_hits, index);
}
let data = MateData {
m_blocks,
enclosing_genes,
cached_hits,
is_reverse,
is_first_of_pair,
};
if flags & BAM_FPAIRED != 0 && flags & BAM_FPROPER_PAIR != 0 {
self.process_pe_read(record, data, chrom, index);
} else {
self.classify_and_count(&data, chrom, index, 1);
}
}
fn process_pe_read(
&mut self,
record: &bam::Record,
data: MateData,
chrom: &str,
index: &QualimapIndex,
) {
let key = make_mate_key(record);
if let Some(mate) = self.mate_buffer.remove(&key) {
self.reconcile_pair(&data, &mate, chrom, index);
} else {
self.mate_buffer.insert(key, data);
}
}
fn reconcile_pair(&mut self, r1: &MateData, r2: &MateData, chrom: &str, index: &QualimapIndex) {
let assigned_genes: HashSet<u32> = r1
.enclosing_genes
.intersection(&r2.enclosing_genes)
.copied()
.collect();
self.add_per_block_coverage_cached(&r1.m_blocks, &r1.cached_hits, index);
self.add_per_block_coverage_cached(&r2.m_blocks, &r2.cached_hits, index);
if !check_has_overlap_not_enclosed(&r1.cached_hits) {
self.check_overlapping_exon_cached(&r2.cached_hits);
} else {
self.counters.overlapping_exon += 1;
}
match assigned_genes.len() {
0 => {
let mut combined_blocks: Vec<(i32, i32)> =
Vec::with_capacity(r1.m_blocks.len() + r2.m_blocks.len());
combined_blocks.extend_from_slice(&r1.m_blocks);
combined_blocks.extend_from_slice(&r2.m_blocks);
self.classify_no_feature(&combined_blocks, chrom, index, 2);
self.counters.no_feature += 2;
}
1 => {
self.counters.exonic_reads += 2;
}
_ => {
self.counters.ambiguous += 2;
}
}
self.count_ssp_for_blocks_cached(&r1.cached_hits, r1.is_reverse, r1.is_first_of_pair);
let r2_is_first = !r1.is_first_of_pair;
self.count_ssp_for_blocks_cached(&r2.cached_hits, r2.is_reverse, r2_is_first);
}
fn classify_and_count(
&mut self,
data: &MateData,
chrom: &str,
index: &QualimapIndex,
num_reads: u64,
) {
self.check_overlapping_exon_cached(&data.cached_hits);
match data.enclosing_genes.len() {
0 => {
self.classify_no_feature(&data.m_blocks, chrom, index, num_reads);
self.counters.no_feature += num_reads;
}
1 => {
self.counters.exonic_reads += num_reads;
}
_ => {
self.counters.ambiguous += num_reads;
}
}
self.count_ssp_for_blocks_cached(&data.cached_hits, data.is_reverse, data.is_first_of_pair);
}
fn check_overlapping_exon_cached(&mut self, cached_hits: &[CachedBlockHits]) {
if check_has_overlap_not_enclosed(cached_hits) {
self.counters.overlapping_exon += 1;
}
}
fn add_per_block_coverage_cached(
&mut self,
m_blocks: &[(i32, i32)],
cached_hits: &[CachedBlockHits],
index: &QualimapIndex,
) {
for (i, &(block_start, block_end)) in m_blocks.iter().enumerate() {
let hit = &cached_hits[i];
let single_block = &[(block_start, block_end)];
for &(gidx, _strand) in &hit.enclosing_genes {
let (tx_start, _) = index.gene_transcript_ranges[gidx as usize];
let transcripts = index.gene_transcripts(gidx);
self.coverage
.add_coverage(single_block, transcripts, tx_start);
}
}
}
fn classify_no_feature(
&mut self,
m_blocks: &[(i32, i32)],
chrom: &str,
index: &QualimapIndex,
num_reads: u64,
) {
if let Some(intron_tree) = index.intron_tree(chrom) {
for &(start, end) in m_blocks {
let mut found_intron = false;
intron_tree.query(start, end, |_iv| {
found_intron = true;
});
if found_intron {
self.counters.intronic_reads += num_reads;
return;
}
}
}
self.counters.intergenic_reads += num_reads;
}
fn count_ssp_for_blocks_cached(
&mut self,
cached_hits: &[CachedBlockHits],
is_reverse: bool,
is_first_of_pair: bool,
) {
for hit in cached_hits {
for &(_gidx, strand) in &hit.enclosing_genes {
let same_strand = (is_reverse && strand == b'-') || (!is_reverse && strand == b'+');
if is_first_of_pair {
if same_strand {
self.counters.ssp_fwd += 1;
} else {
self.counters.ssp_rev += 1;
}
} else if same_strand {
self.counters.ssp_rev += 1;
} else {
self.counters.ssp_fwd += 1;
}
}
}
}
pub fn flush_unpaired(&mut self, _index: &QualimapIndex) {
self.mate_buffer.clear();
}
pub fn merge(&mut self, other: QualimapAccum) {
self.counters.merge(&other.counters);
self.coverage.merge(other.coverage);
self.junction_motifs.merge(&other.junction_motifs);
for (key, info) in other.mate_buffer {
if let Some(existing) = self.mate_buffer.remove(&key) {
let common_genes: HashSet<u32> = existing
.enclosing_genes
.intersection(&info.enclosing_genes)
.copied()
.collect();
let assigned = if existing.enclosing_genes.is_empty()
&& !info.enclosing_genes.is_empty()
{
info.enclosing_genes.clone()
} else if info.enclosing_genes.is_empty() && !existing.enclosing_genes.is_empty() {
existing.enclosing_genes.clone()
} else {
common_genes
};
match assigned.len() {
0 => {
self.counters.no_feature += 2;
self.counters.intergenic_reads += 2;
}
1 => {
self.counters.exonic_reads += 2;
}
_ => {
self.counters.ambiguous += 2;
}
}
} else {
self.mate_buffer.insert(key, info);
}
}
}
#[allow(dead_code)]
pub fn unmatched_count(&self) -> usize {
self.mate_buffer.len()
}
pub fn into_result(self) -> super::QualimapResult {
let mut junction_motifs_str = HashMap::new();
for ((donor, acceptor), count) in &self.junction_motifs.counts {
let motif = format!(
"{}{}{}{}",
donor[0] as char, donor[1] as char, acceptor[0] as char, acceptor[1] as char,
);
*junction_motifs_str.entry(motif).or_insert(0u64) += count;
}
super::QualimapResult {
primary_alignments: self.counters.primary_alignments,
secondary_alignments: self.counters.secondary_alignments,
not_aligned: self.counters.not_aligned,
alignment_not_unique: self.counters.alignment_not_unique,
exonic_reads: self.counters.exonic_reads,
ambiguous_reads: self.counters.ambiguous,
no_feature: self.counters.no_feature,
intronic_reads: self.counters.intronic_reads,
intergenic_reads: self.counters.intergenic_reads,
overlapping_exon_reads: self.counters.overlapping_exon,
read_count: self.counters.read_count,
left_proper_in_pair: self.counters.left_proper_in_pair,
right_proper_in_pair: self.counters.right_proper_in_pair,
both_proper_in_pair: self.counters.both_proper_in_pair,
reads_at_junctions: self.counters.reads_at_junctions,
known_junction_events: self.counters.known_junction_events,
partly_known_junction_events: self.counters.partly_known_junction_events,
junction_motifs: junction_motifs_str,
transcript_coverage_raw: self.coverage,
ssp_fwd: self.counters.ssp_fwd,
ssp_rev: self.counters.ssp_rev,
}
}
}
fn get_nh_tag(record: &bam::Record) -> Option<u32> {
crate::rna::bam_flags::get_aux_int(record, b"NH").map(|v| v as u32)
}
#[allow(
clippy::needless_range_loop,
clippy::type_complexity,
unused_assignments,
unused_variables
)]
fn extract_m_blocks_and_junctions(
record: &bam::Record,
) -> (Vec<(i32, i32)>, usize, Vec<JunctionMotif>, Vec<(i32, i32)>) {
let cigar = record.cigar();
let seq = record.seq();
let seq_len = seq.len();
let mut blocks = Vec::new();
let mut motifs = Vec::new();
let mut junction_ref_positions = Vec::new();
let mut n_op_count: usize = 0;
let mut ref_pos = record.pos() as i32; let alignment_start_1based = record.pos() as i32 + 1;
let mut seq_pos: usize = 0;
for op in cigar.iter() {
match op {
Cigar::Match(len) => {
let len = *len as i32;
blocks.push((ref_pos, ref_pos + len));
ref_pos += len;
seq_pos += len as usize;
}
Cigar::Equal(len) | Cigar::Diff(len) => {
ref_pos += *len as i32;
seq_pos += *len as usize;
}
Cigar::RefSkip(len) => {
n_op_count += 1;
let pos_ref1 = alignment_start_1based + seq_pos as i32;
let pos_ref2 = pos_ref1 + *len as i32;
junction_ref_positions.push((pos_ref1, pos_ref2));
if seq_pos > 2 && seq_pos + 1 < seq_len {
let donor = [seq.encoded_base(seq_pos - 2), seq.encoded_base(seq_pos - 1)];
let acceptor = [seq.encoded_base(seq_pos), seq.encoded_base(seq_pos + 1)];
let donor_ascii = [decode_base(donor[0]), decode_base(donor[1])];
let acceptor_ascii = [decode_base(acceptor[0]), decode_base(acceptor[1])];
motifs.push(JunctionMotif {
donor: donor_ascii,
acceptor: acceptor_ascii,
});
}
ref_pos += *len as i32;
}
Cigar::Ins(len) | Cigar::SoftClip(len) => {
ref_pos += *len as i32;
seq_pos += *len as usize;
}
Cigar::Del(len) => {
ref_pos += *len as i32;
}
Cigar::HardClip(_) | Cigar::Pad(_) => {}
}
}
(blocks, n_op_count, motifs, junction_ref_positions)
}
fn decode_base(encoded: u8) -> u8 {
match encoded {
1 => b'A',
2 => b'C',
4 => b'G',
8 => b'T',
15 => b'N',
_ => b'N',
}
}
fn check_has_overlap_not_enclosed(cached_hits: &[CachedBlockHits]) -> bool {
cached_hits.iter().any(|h| h.has_overlap_not_enclosed)
}
fn cache_block_hits(
m_blocks: &[(i32, i32)],
chrom: &str,
index: &QualimapIndex,
) -> Vec<CachedBlockHits> {
let tree = match index.merged_exon_tree(chrom) {
Some(t) => t,
None => {
return m_blocks
.iter()
.map(|_| CachedBlockHits {
enclosing_genes: Vec::new(),
has_overlap_not_enclosed: false,
})
.collect();
}
};
m_blocks
.iter()
.map(|&(block_start, block_end)| {
let mut enclosing_genes: Vec<(u32, u8)> = Vec::new();
let mut has_overlap_not_enclosed = false;
tree.query(block_start, block_end, |iv| {
let exon_start = iv.metadata.exon_start;
let exon_end = iv.metadata.exon_end;
if exon_start >= block_end || block_start >= exon_end {
return; }
if block_start >= exon_start && block_end <= exon_end {
let gene_idx = iv.metadata.gene_idx;
let strand = iv.metadata.strand;
if !enclosing_genes.iter().any(|(g, _)| *g == gene_idx) {
enclosing_genes.push((gene_idx, strand));
}
} else {
has_overlap_not_enclosed = true;
}
});
CachedBlockHits {
enclosing_genes,
has_overlap_not_enclosed,
}
})
.collect()
}
fn find_enclosing_genes_cached(
cached_hits: &[CachedBlockHits],
stranded: Strandedness,
is_reverse: bool,
is_first_of_pair: bool,
) -> HashSet<u32> {
if cached_hits.is_empty() {
return HashSet::new();
}
let read_on_plus = if stranded == Strandedness::Unstranded {
true } else {
let flip = (stranded == Strandedness::Reverse && is_first_of_pair)
|| (stranded == Strandedness::Forward && !is_first_of_pair);
if flip {
is_reverse
} else {
!is_reverse
}
};
let strand_ok = |gene_strand: u8| -> bool {
if stranded == Strandedness::Unstranded {
return true;
}
(read_on_plus && gene_strand == b'+') || (!read_on_plus && gene_strand == b'-')
};
let mut candidate_genes: HashSet<u32> = cached_hits[0]
.enclosing_genes
.iter()
.filter(|(_, strand)| strand_ok(*strand))
.map(|(gidx, _)| *gidx)
.collect();
if candidate_genes.is_empty() {
return candidate_genes;
}
for hit in &cached_hits[1..] {
candidate_genes.retain(|gidx| {
hit.enclosing_genes
.iter()
.any(|(g, strand)| g == gidx && strand_ok(*strand))
});
if candidate_genes.is_empty() {
break;
}
}
candidate_genes
}
#[cfg(test)]
fn find_enclosing_genes(
m_blocks: &[(i32, i32)],
chrom: &str,
index: &QualimapIndex,
stranded: Strandedness,
is_reverse: bool,
is_first_of_pair: bool,
) -> HashSet<u32> {
let cached_hits = cache_block_hits(m_blocks, chrom, index);
find_enclosing_genes_cached(&cached_hits, stranded, is_reverse, is_first_of_pair)
}
fn make_mate_key(record: &bam::Record) -> MateKey {
let qname_hash = hash_qname(record.qname());
let tid = record.tid();
let pos = record.pos();
let mate_tid = record.mtid();
let mate_pos = record.mpos();
if tid < mate_tid || (tid == mate_tid && pos <= mate_pos) {
(qname_hash, tid, pos, mate_tid, mate_pos)
} else {
(qname_hash, mate_tid, mate_pos, tid, pos)
}
}
fn hash_qname(qname: &[u8]) -> u64 {
crate::io::fnv1a(qname)
}
#[cfg(test)]
mod tests {
use super::*;
#[test]
fn test_hash_qname_deterministic() {
let h1 = hash_qname(b"read1");
let h2 = hash_qname(b"read1");
let h3 = hash_qname(b"read2");
assert_eq!(h1, h2);
assert_ne!(h1, h3);
}
#[test]
fn test_decode_base() {
assert_eq!(decode_base(1), b'A');
assert_eq!(decode_base(2), b'C');
assert_eq!(decode_base(4), b'G');
assert_eq!(decode_base(8), b'T');
assert_eq!(decode_base(15), b'N');
assert_eq!(decode_base(0), b'N');
}
#[test]
fn test_find_enclosing_genes_empty_chrom() {
let genes = indexmap::IndexMap::new();
let index = QualimapIndex::from_genes(&genes);
let result = find_enclosing_genes(
&[(100, 200)],
"chr1",
&index,
Strandedness::Unstranded,
false,
true,
);
assert!(result.is_empty());
}
#[test]
fn test_counters_merge() {
let mut c1 = QualimapCounters {
primary_alignments: 10,
exonic_reads: 5,
intronic_reads: 3,
..Default::default()
};
let c2 = QualimapCounters {
primary_alignments: 20,
exonic_reads: 8,
intronic_reads: 4,
..Default::default()
};
c1.merge(&c2);
assert_eq!(c1.primary_alignments, 30);
assert_eq!(c1.exonic_reads, 13);
assert_eq!(c1.intronic_reads, 7);
}
#[test]
fn test_junction_motif_counts_merge() {
let mut j1 = JunctionMotifCounts::default();
j1.counts.insert(([b'G', b'T'], [b'A', b'G']), 5);
let mut j2 = JunctionMotifCounts::default();
j2.counts.insert(([b'G', b'T'], [b'A', b'G']), 3);
j2.counts.insert(([b'G', b'C'], [b'A', b'G']), 1);
j1.merge(&j2);
assert_eq!(j1.counts[&([b'G', b'T'], [b'A', b'G'])], 8);
assert_eq!(j1.counts[&([b'G', b'C'], [b'A', b'G'])], 1);
}
}