#![allow(clippy::cast_precision_loss)]
use std::collections::{HashMap, HashSet};
use std::io::Write;
use std::num::NonZero;
use std::path::Path;
use rsomics_bamio::raw::{self, RawRecord};
use rsomics_common::{Result, RsomicsError};
use serde::Serialize;
const FLAG_QCFAIL: u16 = 0x0200;
const FLAG_DUPLICATE: u16 = 0x0400;
const FLAG_SECONDARY: u16 = 0x0100;
const FLAG_UNMAPPED: u16 = 0x0004;
pub struct KnownSites {
pub intron_starts: HashMap<String, HashSet<i64>>,
pub intron_ends: HashMap<String, HashSet<i64>>,
}
impl KnownSites {
pub fn from_bed12(path: &Path) -> Result<Self> {
let content = std::fs::read_to_string(path)
.map_err(|e| RsomicsError::Io(std::io::Error::other(format!("reading BED12: {e}"))))?;
let mut intron_starts: HashMap<String, HashSet<i64>> = HashMap::new();
let mut intron_ends: HashMap<String, HashSet<i64>> = HashMap::new();
for line in content.lines() {
if line.starts_with('#') || line.starts_with("track") || line.starts_with("browser") {
continue;
}
let fields: Vec<&str> = line.split('\t').collect();
if fields.len() < 12 {
eprintln!("[NOTE:input bed must be 12-column] skipped this line: {line}");
continue;
}
let chrom = fields[0].to_uppercase();
let Ok(tx_start) = fields[1].parse::<i64>() else {
eprintln!("[NOTE:input bed must be 12-column] skipped this line: {line}");
continue;
};
let Ok(block_count) = fields[9].parse::<usize>() else {
continue;
};
if block_count == 1 {
continue;
}
let block_sizes: Vec<i64> = fields[10]
.trim_end_matches(',')
.split(',')
.filter(|s| !s.is_empty())
.filter_map(|s| s.parse().ok())
.collect();
let block_starts: Vec<i64> = fields[11]
.trim_end_matches(',')
.split(',')
.filter(|s| !s.is_empty())
.filter_map(|s| s.parse().ok())
.collect();
if block_sizes.len() < 2 || block_starts.len() < 2 {
continue;
}
let exon_starts: Vec<i64> = block_starts.iter().map(|&s| s + tx_start).collect();
let exon_ends: Vec<i64> = exon_starts
.iter()
.zip(block_sizes.iter())
.map(|(&s, &sz)| s + sz)
.collect();
for i in 0..exon_ends.len() - 1 {
let i_st = exon_ends[i];
let i_end = exon_starts[i + 1];
intron_starts.entry(chrom.clone()).or_default().insert(i_st);
intron_ends.entry(chrom.clone()).or_default().insert(i_end);
}
}
Ok(Self {
intron_starts,
intron_ends,
})
}
#[must_use]
pub fn classify(&self, chrom: &str, intron_start: i64, intron_end: i64) -> JunctionClass {
let chrom_up = chrom.to_uppercase();
let known_start = self
.intron_starts
.get(&chrom_up)
.is_some_and(|s| s.contains(&intron_start));
let known_end = self
.intron_ends
.get(&chrom_up)
.is_some_and(|s| s.contains(&intron_end));
match (known_start, known_end) {
(true, true) => JunctionClass::Known,
(false, false) => JunctionClass::CompleteNovel,
_ => JunctionClass::PartialNovel,
}
}
}
#[derive(Clone, Copy, Debug, PartialEq, Eq, Hash)]
pub enum JunctionClass {
Known,
PartialNovel,
CompleteNovel,
}
#[derive(Debug, Clone, Default, Serialize)]
pub struct JunctionCounts {
pub total_events: u64,
pub filtered_events: u64,
pub known_events: u64,
pub partial_novel_events: u64,
pub novel_events: u64,
pub known_junctions: u64,
pub partial_novel_junctions: u64,
pub novel_junctions: u64,
}
impl JunctionCounts {
#[must_use]
pub fn total_junctions(&self) -> u64 {
self.known_junctions + self.partial_novel_junctions + self.novel_junctions
}
pub fn write_rseqc_stderr<W: Write>(&self, mut out: W) -> std::io::Result<()> {
writeln!(out)?;
writeln!(
out,
"==================================================================="
)?;
writeln!(out, "Total splicing Events:\t{}", self.total_events)?;
writeln!(out, "Known Splicing Events:\t{}", self.known_events)?;
writeln!(
out,
"Partial Novel Splicing Events:\t{}",
self.partial_novel_events
)?;
writeln!(out, "Novel Splicing Events:\t{}", self.novel_events)?;
writeln!(out, "Filtered Splicing Events:\t{}", self.filtered_events)?;
writeln!(out)?;
writeln!(
out,
"Total splicing Junctions:\t{}",
self.total_junctions()
)?;
writeln!(out, "Known Splicing Junctions:\t{}", self.known_junctions)?;
writeln!(
out,
"Partial Novel Splicing Junctions:\t{}",
self.partial_novel_junctions
)?;
writeln!(out, "Novel Splicing Junctions:\t{}", self.novel_junctions)?;
writeln!(out)?;
writeln!(
out,
"==================================================================="
)?;
Ok(())
}
pub fn write_rseqc_stdout<W: Write>(&self, mut out: W) -> std::io::Result<()> {
writeln!(out, "total = {}", self.total_events)?;
Ok(())
}
}
pub fn annotate_junctions(
bam_path: &Path,
bed_path: &Path,
min_intron: i64,
mapq_cut: u8,
workers: NonZero<usize>,
) -> Result<JunctionCounts> {
eprintln!("Reading reference bed file: {} ... ", bed_path.display());
let known = KnownSites::from_bed12(bed_path)?;
eprintln!("Done");
if bam_path
.extension()
.is_some_and(|e| e.eq_ignore_ascii_case("bam"))
{
eprintln!("Load BAM file ... ");
} else {
eprintln!("Load SAM file ... ");
}
let mut reader = rsomics_bamio::open_with_workers(bam_path, workers)?;
let header = reader.read_header().map_err(RsomicsError::Io)?;
let ref_names: Vec<String> = header
.reference_sequences()
.keys()
.map(|k| String::from_utf8_lossy(k.as_ref()).into_owned())
.collect();
let counts = scan_reads(reader.get_mut(), &ref_names, &known, min_intron, mapq_cut)?;
eprintln!("Done");
Ok(counts)
}
fn scan_reads<R: std::io::Read>(
reader: &mut R,
ref_names: &[String],
known: &KnownSites,
min_intron: i64,
mapq_cut: u8,
) -> Result<JunctionCounts> {
let mut splicing_events: HashMap<(String, i64, i64), u64> = HashMap::new();
let mut total_events: u64 = 0;
let mut filtered_events: u64 = 0;
let mut known_ev: u64 = 0;
let mut partial_ev: u64 = 0;
let mut novel_ev: u64 = 0;
let mut rec = RawRecord::default();
loop {
let bytes = raw::read_record(reader, &mut rec)?;
if bytes == 0 {
break;
}
let flags = rec.flags();
if flags & (FLAG_QCFAIL | FLAG_DUPLICATE | FLAG_SECONDARY | FLAG_UNMAPPED) != 0 {
continue;
}
if rec.mapping_quality() < mapq_cut {
continue;
}
let tid = rec.reference_sequence_id();
if tid < 0 {
continue;
}
#[allow(clippy::cast_sign_loss)]
let Some(chrom) = ref_names.get(tid as usize) else {
continue;
};
let introns = fetch_introns(chrom, i64::from(rec.alignment_start()), rec.cigar_ops());
if introns.is_empty() {
continue;
}
for (chr, i_st, i_end) in introns {
total_events += 1;
if i_end - i_st < min_intron {
filtered_events += 1;
continue;
}
*splicing_events
.entry((chr.clone(), i_st, i_end))
.or_insert(0) += 1;
match known.classify(&chr, i_st, i_end) {
JunctionClass::Known => known_ev += 1,
JunctionClass::PartialNovel => partial_ev += 1,
JunctionClass::CompleteNovel => novel_ev += 1,
}
}
}
let mut known_junc: u64 = 0;
let mut partial_junc: u64 = 0;
let mut novel_junc: u64 = 0;
for (chr, i_st, i_end) in splicing_events.keys() {
match known.classify(chr, *i_st, *i_end) {
JunctionClass::Known => known_junc += 1,
JunctionClass::PartialNovel => partial_junc += 1,
JunctionClass::CompleteNovel => novel_junc += 1,
}
}
Ok(JunctionCounts {
total_events,
filtered_events,
known_events: known_ev,
partial_novel_events: partial_ev,
novel_events: novel_ev,
known_junctions: known_junc,
partial_novel_junctions: partial_junc,
novel_junctions: novel_junc,
})
}
fn fetch_introns<I>(chrom: &str, alignment_start: i64, ops: I) -> Vec<(String, i64, i64)>
where
I: Iterator<Item = (u8, u32)>,
{
let mut result = Vec::new();
let mut ref_pos = alignment_start;
for (op_code, op_len) in ops {
let len = i64::from(op_len);
match op_code {
0 | 2 | 7 | 8 => {
ref_pos += len;
}
3 => {
let i_st = ref_pos;
let i_end = ref_pos + len;
result.push((chrom.to_string(), i_st, i_end));
ref_pos = i_end;
}
_ => {}
}
}
result
}