use coitrees::{COITree, Interval, IntervalTree};
use indexmap::IndexMap;
use log::debug;
use std::collections::{HashMap, HashSet};
use crate::gtf::Gene;
#[derive(Debug, Clone, Copy)]
#[allow(dead_code)] pub struct ExonMeta {
pub gene_idx: u32,
pub exon_start: i32,
pub exon_end: i32,
pub strand: u8,
}
impl Default for ExonMeta {
fn default() -> Self {
Self {
gene_idx: 0,
exon_start: 0,
exon_end: 0,
strand: b'.',
}
}
}
pub type ExonTree = COITree<ExonMeta, u32>;
#[derive(Debug, Clone, Copy, Default)]
#[allow(dead_code)]
pub struct IntronMeta {
pub gene_idx: u32,
}
pub type IntronTree = COITree<IntronMeta, u32>;
#[derive(Debug, Clone, Copy)]
pub struct MergedExonMeta {
pub gene_idx: u32,
pub exon_start: i32,
pub exon_end: i32,
pub strand: u8,
}
impl Default for MergedExonMeta {
fn default() -> Self {
Self {
gene_idx: 0,
exon_start: 0,
exon_end: 0,
strand: b'.',
}
}
}
pub type MergedExonTree = COITree<MergedExonMeta, u32>;
#[derive(Debug, Clone)]
#[allow(dead_code)]
pub struct TranscriptInfo {
pub gene_idx: u32,
pub transcript_idx: u16,
pub gene_id: String,
pub transcript_id: String,
pub chrom: String,
pub start: i32,
pub end: i32,
pub strand: char,
pub length: u32,
pub exons: Vec<(i32, i32)>,
pub exonic_length: u32,
}
#[derive(Debug, Clone)]
pub struct MergedGeneModel {
pub strand: char,
pub exons: Vec<(i32, i32)>,
}
impl MergedGeneModel {
pub fn from_transcripts(strand: char, all_exons: &[(i32, i32)]) -> Self {
let exons = merge_intervals(all_exons);
Self { strand, exons }
}
}
fn merge_intervals(intervals: &[(i32, i32)]) -> Vec<(i32, i32)> {
if intervals.is_empty() {
return Vec::new();
}
let mut sorted: Vec<(i32, i32)> = intervals.to_vec();
sorted.sort_unstable_by_key(|(s, _)| *s);
let mut merged = Vec::with_capacity(sorted.len());
let (mut cur_start, mut cur_end) = sorted[0];
for &(start, end) in &sorted[1..] {
if start <= cur_end {
cur_end = cur_end.max(end);
} else {
merged.push((cur_start, cur_end));
cur_start = start;
cur_end = end;
}
}
merged.push((cur_start, cur_end));
merged
}
pub struct QualimapIndex {
pub exon_trees: HashMap<String, ExonTree>,
pub merged_exon_trees: HashMap<String, MergedExonTree>,
pub intron_trees: HashMap<String, IntronTree>,
pub transcripts: Vec<TranscriptInfo>,
pub gene_transcript_ranges: Vec<(u32, u32)>,
pub junction_locations: HashMap<String, HashSet<i32>>,
}
impl QualimapIndex {
pub fn from_genes(genes: &IndexMap<String, Gene>) -> Self {
let mut exon_intervals: HashMap<String, Vec<Interval<ExonMeta>>> = HashMap::new();
let mut intron_intervals: HashMap<String, Vec<Interval<IntronMeta>>> = HashMap::new();
let mut junction_positions: HashMap<String, HashSet<i32>> = HashMap::new();
let mut transcripts = Vec::new();
let mut gene_transcript_ranges = Vec::with_capacity(genes.len());
let mut merged_gene_models = Vec::with_capacity(genes.len());
for (gene_idx, gene) in genes.values().enumerate() {
let gene_idx = gene_idx as u32;
let tx_start_idx = transcripts.len() as u32;
let mut all_gene_exons: Vec<(i32, i32)> = Vec::new();
for (tx_idx, tx) in gene.transcripts.iter().enumerate() {
let tx_idx = tx_idx as u16;
let mut exons_0based: Vec<(i32, i32)> = tx
.exons
.iter()
.map(|(s, e)| ((*s as i32) - 1, *e as i32))
.collect();
exons_0based.sort_unstable_by_key(|(s, _)| *s);
let exonic_length: u32 = exons_0based.iter().map(|(s, e)| (e - s) as u32).sum();
let tx_start = exons_0based.first().map(|(s, _)| *s).unwrap_or(0);
let tx_end = exons_0based.last().map(|(_, e)| *e).unwrap_or(0);
all_gene_exons.extend_from_slice(&exons_0based);
let chrom_exons = exon_intervals.entry(tx.chrom.clone()).or_default();
for &(start, end) in &exons_0based {
chrom_exons.push(Interval::new(
start,
end,
ExonMeta {
gene_idx,
exon_start: start,
exon_end: end,
strand: tx.strand as u8,
},
));
}
let chrom_introns = intron_intervals.entry(tx.chrom.clone()).or_default();
let chrom_junctions = junction_positions.entry(tx.chrom.clone()).or_default();
for window in exons_0based.windows(2) {
let intron_start = window[0].1;
let intron_end = window[1].0;
if intron_end > intron_start {
chrom_introns.push(Interval::new(
intron_start,
intron_end,
IntronMeta { gene_idx },
));
chrom_junctions.insert(intron_start); chrom_junctions.insert(intron_end + 1); }
}
let coverage_exons = if tx.strand == '-' {
let mut desc = exons_0based.clone();
desc.sort_unstable_by(|a, b| b.0.cmp(&a.0));
desc
} else {
exons_0based
};
transcripts.push(TranscriptInfo {
gene_idx,
transcript_idx: tx_idx,
gene_id: gene.gene_id.clone(),
transcript_id: tx.transcript_id.clone(),
chrom: tx.chrom.clone(),
start: tx_start,
end: tx_end,
strand: tx.strand,
length: (tx_end - tx_start) as u32,
exons: coverage_exons,
exonic_length,
});
}
let tx_end_idx = transcripts.len() as u32;
gene_transcript_ranges.push((tx_start_idx, tx_end_idx));
merged_gene_models.push(MergedGeneModel::from_transcripts(
gene.strand,
&all_gene_exons,
));
}
let mut merged_exon_intervals: HashMap<String, Vec<Interval<MergedExonMeta>>> =
HashMap::new();
for (gene_idx_usize, gene) in genes.values().enumerate() {
let gene_idx = gene_idx_usize as u32;
let model = &merged_gene_models[gene_idx_usize];
if let Some(tx) = gene.transcripts.first() {
let chrom_merged = merged_exon_intervals.entry(tx.chrom.clone()).or_default();
for &(start, end) in &model.exons {
chrom_merged.push(Interval::new(
start,
end,
MergedExonMeta {
gene_idx,
exon_start: start,
exon_end: end,
strand: model.strand as u8,
},
));
}
}
}
let total_exon_ivs: usize = exon_intervals.values().map(|v| v.len()).sum();
let total_merged_ivs: usize = merged_exon_intervals.values().map(|v| v.len()).sum();
log::debug!(
"QM_INDEX: {} exon intervals ({} merged), {} transcripts, {} genes",
total_exon_ivs,
total_merged_ivs,
transcripts.len(),
gene_transcript_ranges.len()
);
let exon_trees: HashMap<String, ExonTree> = exon_intervals
.into_iter()
.map(|(chrom, intervals)| (chrom, COITree::new(&intervals)))
.collect();
let merged_exon_trees: HashMap<String, MergedExonTree> = merged_exon_intervals
.into_iter()
.map(|(chrom, intervals)| (chrom, COITree::new(&intervals)))
.collect();
let intron_trees: HashMap<String, IntronTree> = intron_intervals
.into_iter()
.map(|(chrom, intervals)| (chrom, COITree::new(&intervals)))
.collect();
debug!(
"Built Qualimap index: {} genes, {} transcripts, {} chromosomes with exons",
genes.len(),
transcripts.len(),
exon_trees.len(),
);
Self {
exon_trees,
merged_exon_trees,
intron_trees,
transcripts,
gene_transcript_ranges,
junction_locations: junction_positions,
}
}
#[allow(dead_code)]
pub fn exon_tree(&self, chrom: &str) -> Option<&ExonTree> {
self.exon_trees.get(chrom)
}
pub fn merged_exon_tree(&self, chrom: &str) -> Option<&MergedExonTree> {
self.merged_exon_trees.get(chrom)
}
pub fn intron_tree(&self, chrom: &str) -> Option<&IntronTree> {
self.intron_trees.get(chrom)
}
pub fn has_junction_overlap(&self, chrom: &str, position: i32) -> bool {
if let Some(positions) = self.junction_locations.get(chrom) {
positions.contains(&(position - 1))
|| positions.contains(&position)
|| positions.contains(&(position + 1))
} else {
false
}
}
pub fn gene_transcripts(&self, gene_idx: u32) -> &[TranscriptInfo] {
let (start, end) = self.gene_transcript_ranges[gene_idx as usize];
&self.transcripts[start as usize..end as usize]
}
}
#[cfg(test)]
mod tests {
use super::*;
use crate::gtf::{Exon, Gene, Transcript};
fn make_gene(gene_id: &str, chrom: &str, strand: char, transcripts: Vec<Transcript>) -> Gene {
let start = transcripts
.iter()
.flat_map(|t| t.exons.iter().map(|(s, _)| *s))
.min()
.unwrap_or(1);
let end = transcripts
.iter()
.flat_map(|t| t.exons.iter().map(|(_, e)| *e))
.max()
.unwrap_or(1);
let exons = transcripts
.iter()
.flat_map(|t| {
t.exons.iter().map(|(s, e)| Exon {
chrom: chrom.to_string(),
start: *s,
end: *e,
strand,
})
})
.collect();
Gene {
gene_id: gene_id.to_string(),
chrom: chrom.to_string(),
start,
end,
strand,
exons,
effective_length: 0,
attributes: HashMap::new(),
transcripts,
}
}
fn make_transcript(
tx_id: &str,
chrom: &str,
strand: char,
exons: Vec<(u64, u64)>,
) -> Transcript {
let start = exons.iter().map(|(s, _)| *s).min().unwrap_or(1);
let end = exons.iter().map(|(_, e)| *e).max().unwrap_or(1);
Transcript {
transcript_id: tx_id.to_string(),
chrom: chrom.to_string(),
start,
end,
strand,
exons,
cds_start: None,
cds_end: None,
}
}
#[test]
fn test_index_from_single_gene() {
let mut genes = IndexMap::new();
let tx = make_transcript("tx1", "chr1", '+', vec![(101, 200), (301, 400)]);
let gene = make_gene("gene1", "chr1", '+', vec![tx]);
genes.insert("gene1".to_string(), gene);
let index = QualimapIndex::from_genes(&genes);
assert_eq!(index.gene_transcript_ranges.len(), 1);
assert_eq!(index.transcripts.len(), 1);
assert!(index.exon_tree("chr1").is_some());
assert!(index.intron_tree("chr1").is_some());
assert!(index.exon_tree("chr2").is_none());
let ti = &index.transcripts[0];
assert_eq!(ti.gene_id, "gene1");
assert_eq!(ti.transcript_id, "tx1");
assert_eq!(ti.exons.len(), 2);
assert_eq!(ti.exons[0], (100, 200));
assert_eq!(ti.exons[1], (300, 400));
assert_eq!(ti.exonic_length, 200);
let txs = index.gene_transcripts(0);
assert_eq!(txs.len(), 1);
assert_eq!(txs[0].transcript_id, "tx1");
let tree = index.exon_tree("chr1").unwrap();
let mut hits = Vec::new();
tree.query(150, 160, |iv| hits.push(iv.metadata));
assert_eq!(hits.len(), 1);
assert_eq!(hits[0].gene_idx, 0);
let itree = index.intron_tree("chr1").unwrap();
let mut ihits = Vec::new();
itree.query(250, 260, |iv| ihits.push(iv.metadata));
assert_eq!(ihits.len(), 1);
assert_eq!(ihits[0].gene_idx, 0);
}
#[test]
fn test_index_multiple_transcripts() {
let mut genes = IndexMap::new();
let tx1 = make_transcript("tx1", "chr1", '+', vec![(101, 200), (301, 400)]);
let tx2 = make_transcript("tx2", "chr1", '+', vec![(101, 250), (351, 400)]);
let gene = make_gene("gene1", "chr1", '+', vec![tx1, tx2]);
genes.insert("gene1".to_string(), gene);
let index = QualimapIndex::from_genes(&genes);
assert_eq!(index.transcripts.len(), 2);
let txs = index.gene_transcripts(0);
assert_eq!(txs.len(), 2);
assert_eq!(txs[0].transcript_id, "tx1");
assert_eq!(txs[1].transcript_id, "tx2");
}
#[test]
fn test_enclosure_check() {
let mut genes = IndexMap::new();
let tx = make_transcript("tx1", "chr1", '+', vec![(101, 200), (301, 400)]);
let gene = make_gene("gene1", "chr1", '+', vec![tx]);
genes.insert("gene1".to_string(), gene);
let index = QualimapIndex::from_genes(&genes);
let tree = index.exon_tree("chr1").unwrap();
let mut enclosed_genes = Vec::new();
tree.query(120, 180, |iv| {
if 120 >= iv.metadata.exon_start && 180 <= iv.metadata.exon_end {
enclosed_genes.push(iv.metadata.gene_idx);
}
});
assert_eq!(enclosed_genes, vec![0]);
let mut enclosed_genes2 = Vec::new();
tree.query(190, 310, |iv| {
if 190 >= iv.metadata.exon_start && 310 <= iv.metadata.exon_end {
enclosed_genes2.push(iv.metadata.gene_idx);
}
});
assert!(enclosed_genes2.is_empty());
}
#[test]
fn test_merged_exon_enclosure() {
let mut genes = IndexMap::new();
let tx1 = make_transcript("tx1", "chr1", '+', vec![(101, 200)]);
let tx2 = make_transcript("tx2", "chr1", '+', vec![(151, 300)]);
let gene = make_gene("gene1", "chr1", '+', vec![tx1, tx2]);
genes.insert("gene1".to_string(), gene);
let index = QualimapIndex::from_genes(&genes);
let per_tx_tree = index.exon_tree("chr1").unwrap();
let mut per_tx_enclosed = Vec::new();
per_tx_tree.query(120, 250, |iv| {
if 120 >= iv.metadata.exon_start && 250 <= iv.metadata.exon_end {
per_tx_enclosed.push(iv.metadata.gene_idx);
}
});
assert!(
per_tx_enclosed.is_empty(),
"Block should NOT be enclosed by any individual transcript exon"
);
let merged_tree = index.merged_exon_tree("chr1").unwrap();
let mut merged_enclosed = Vec::new();
merged_tree.query(120, 250, |iv| {
if 120 >= iv.metadata.exon_start && 250 <= iv.metadata.exon_end {
merged_enclosed.push(iv.metadata.gene_idx);
}
});
assert_eq!(
merged_enclosed,
vec![0],
"Block should be enclosed by merged exon interval"
);
}
}