use std::collections::{HashMap, HashSet};
use indexmap::IndexMap;
use log::debug;
use crate::gtf::Gene;
pub fn fetch_introns(start_pos: u64, cigar: &[rust_htslib::bam::record::Cigar]) -> Vec<(u64, u64)> {
use rust_htslib::bam::record::Cigar::*;
let mut pos = start_pos;
let mut introns = Vec::new();
for op in cigar {
match op {
Match(len) => pos += *len as u64, Ins(_) => {} Del(len) => pos += *len as u64, RefSkip(len) => {
let intron_start = pos;
let intron_end = pos + *len as u64;
introns.push((intron_start, intron_end));
pos = intron_end;
}
SoftClip(_) => {} HardClip(_) | Pad(_) | Equal(_) | Diff(_) => {} }
}
introns
}
#[derive(Debug, Default)]
pub struct ReferenceJunctions {
pub intron_starts: HashMap<String, HashSet<u64>>,
pub intron_ends: HashMap<String, HashSet<u64>>,
}
pub fn classify_junction(
chrom: &str,
intron_start: u64,
intron_end: u64,
reference: &ReferenceJunctions,
) -> super::junction_annotation::JunctionClass {
let start_known = reference
.intron_starts
.get(chrom)
.is_some_and(|s| s.contains(&intron_start));
let end_known = reference
.intron_ends
.get(chrom)
.is_some_and(|s| s.contains(&intron_end));
use super::junction_annotation::JunctionClass;
match (start_known, end_known) {
(true, true) => JunctionClass::Annotated,
(false, false) => JunctionClass::CompleteNovel,
_ => JunctionClass::PartialNovel,
}
}
#[derive(Debug, Default)]
pub struct KnownJunctionSet {
pub junctions: HashSet<String>,
}
pub fn build_reference_junctions_from_genes(genes: &IndexMap<String, Gene>) -> ReferenceJunctions {
let mut result = ReferenceJunctions::default();
let mut transcript_count = 0u64;
for gene in genes.values() {
for tx in &gene.transcripts {
if tx.exons.len() <= 1 {
continue;
}
let chrom = tx.chrom.to_uppercase();
let starts_set = result.intron_starts.entry(chrom.clone()).or_default();
let ends_set = result.intron_ends.entry(chrom).or_default();
for i in 0..tx.exons.len() - 1 {
let intron_start = tx.exons[i].1; let intron_end = tx.exons[i + 1].0 - 1; starts_set.insert(intron_start);
ends_set.insert(intron_end);
}
transcript_count += 1;
}
}
debug!(
"Extracted reference junctions from {} multi-exon transcripts (GTF)",
transcript_count
);
result
}
pub fn build_known_junctions_from_genes(genes: &IndexMap<String, Gene>) -> KnownJunctionSet {
let mut result = KnownJunctionSet::default();
for gene in genes.values() {
for tx in &gene.transcripts {
if tx.exons.len() <= 1 {
continue;
}
let chrom = tx.chrom.to_uppercase();
for i in 0..tx.exons.len() - 1 {
let intron_start = tx.exons[i].1;
let intron_end = tx.exons[i + 1].0 - 1;
let key = format!("{chrom}:{intron_start}-{intron_end}");
result.junctions.insert(key);
}
}
}
debug!(
"Extracted {} known splice junctions from GTF",
result.junctions.len()
);
result
}
#[cfg(test)]
mod tests {
use super::*;
use crate::gtf::{Exon, Transcript};
use std::collections::HashMap;
#[test]
fn test_fetch_introns_simple() {
use rust_htslib::bam::record::Cigar::*;
let cigar = vec![Match(50), RefSkip(500), Match(50)];
let introns = fetch_introns(100, &cigar);
assert_eq!(introns.len(), 1);
assert_eq!(introns[0], (150, 650));
}
#[test]
fn test_fetch_introns_multiple() {
use rust_htslib::bam::record::Cigar::*;
let cigar = vec![Match(10), RefSkip(500), Match(20), RefSkip(300), Match(10)];
let introns = fetch_introns(100, &cigar);
assert_eq!(introns.len(), 2);
assert_eq!(introns[0], (110, 610));
assert_eq!(introns[1], (630, 930));
}
#[test]
fn test_fetch_introns_with_deletions() {
use rust_htslib::bam::record::Cigar::*;
let cigar = vec![Match(10), Del(5), Match(10), RefSkip(500), Match(10)];
let introns = fetch_introns(100, &cigar);
assert_eq!(introns.len(), 1);
assert_eq!(introns[0], (125, 625)); }
#[test]
fn test_fetch_introns_no_introns() {
use rust_htslib::bam::record::Cigar::*;
let cigar = vec![Match(100)];
let introns = fetch_introns(100, &cigar);
assert!(introns.is_empty());
}
#[test]
fn test_fetch_introns_soft_clip_no_advance() {
use rust_htslib::bam::record::Cigar::*;
let cigar = vec![SoftClip(5), Match(50), RefSkip(500), Match(50)];
let introns = fetch_introns(100, &cigar);
assert_eq!(introns.len(), 1);
assert_eq!(introns[0], (150, 650));
}
fn make_gene_with_transcripts(
gene_id: &str,
chrom: &str,
transcripts: Vec<Transcript>,
) -> Gene {
let exons: Vec<Exon> = transcripts
.iter()
.flat_map(|tx| {
tx.exons.iter().map(|&(start, end)| Exon {
chrom: chrom.to_string(),
start,
end,
strand: '+',
})
})
.collect();
let start = exons.iter().map(|e| e.start).min().unwrap_or(0);
let end = exons.iter().map(|e| e.end).max().unwrap_or(0);
Gene {
gene_id: gene_id.to_string(),
chrom: chrom.to_string(),
start,
end,
strand: '+',
exons,
effective_length: 0,
attributes: HashMap::new(),
transcripts,
}
}
#[test]
fn test_build_reference_junctions_from_genes() {
let tx = Transcript {
transcript_id: "TX1".to_string(),
chrom: "chr1".to_string(),
start: 101,
end: 600,
strand: '+',
exons: vec![(101, 200), (301, 400), (501, 600)],
cds_start: None,
cds_end: None,
};
let gene = make_gene_with_transcripts("G1", "chr1", vec![tx]);
let mut genes = IndexMap::new();
genes.insert("G1".to_string(), gene);
let ref_junctions = build_reference_junctions_from_genes(&genes);
let chr1_starts = ref_junctions.intron_starts.get("CHR1").unwrap();
let chr1_ends = ref_junctions.intron_ends.get("CHR1").unwrap();
assert_eq!(chr1_starts.len(), 2);
assert!(chr1_starts.contains(&200)); assert!(chr1_starts.contains(&400));
assert_eq!(chr1_ends.len(), 2);
assert!(chr1_ends.contains(&300)); assert!(chr1_ends.contains(&500)); }
#[test]
fn test_build_known_junctions_from_genes() {
let tx = Transcript {
transcript_id: "TX1".to_string(),
chrom: "chr1".to_string(),
start: 101,
end: 600,
strand: '+',
exons: vec![(101, 200), (301, 400), (501, 600)],
cds_start: None,
cds_end: None,
};
let gene = make_gene_with_transcripts("G1", "chr1", vec![tx]);
let mut genes = IndexMap::new();
genes.insert("G1".to_string(), gene);
let known = build_known_junctions_from_genes(&genes);
assert_eq!(known.junctions.len(), 2);
assert!(known.junctions.contains("CHR1:200-300"));
assert!(known.junctions.contains("CHR1:400-500"));
}
#[test]
fn test_build_junctions_skips_single_exon() {
let tx = Transcript {
transcript_id: "TX1".to_string(),
chrom: "chr1".to_string(),
start: 101,
end: 200,
strand: '+',
exons: vec![(101, 200)],
cds_start: None,
cds_end: None,
};
let gene = make_gene_with_transcripts("G1", "chr1", vec![tx]);
let mut genes = IndexMap::new();
genes.insert("G1".to_string(), gene);
let ref_junctions = build_reference_junctions_from_genes(&genes);
assert!(ref_junctions.intron_starts.is_empty());
let known = build_known_junctions_from_genes(&genes);
assert!(known.junctions.is_empty());
}
#[test]
fn test_build_junctions_multiple_transcripts() {
let tx1 = Transcript {
transcript_id: "TX1".to_string(),
chrom: "chr1".to_string(),
start: 101,
end: 600,
strand: '+',
exons: vec![(101, 200), (301, 600)],
cds_start: None,
cds_end: None,
};
let tx2 = Transcript {
transcript_id: "TX2".to_string(),
chrom: "chr1".to_string(),
start: 101,
end: 600,
strand: '+',
exons: vec![(101, 250), (401, 600)],
cds_start: None,
cds_end: None,
};
let gene = make_gene_with_transcripts("G1", "chr1", vec![tx1, tx2]);
let mut genes = IndexMap::new();
genes.insert("G1".to_string(), gene);
let ref_junctions = build_reference_junctions_from_genes(&genes);
let chr1_starts = ref_junctions.intron_starts.get("CHR1").unwrap();
assert!(chr1_starts.contains(&200));
assert!(chr1_starts.contains(&250));
}
}