Skip to main content

rsomics_bam_junctions/
lib.rs

1//! Annotate splice junctions from spliced BAM reads against a BED12 gene model.
2//!
3//! Mirrors `RSeQC` `junction_annotation.py` (LGPL-2.1+):
4//!   - extracts every `N`-op intron from each read's CIGAR string;
5//!   - filters by minimum intron length and MAPQ;
6//!   - classifies each `(chrom, intron_start, intron_end)` as **known**
7//!     (both splice sites present in the BED12 intron set), **`partial_novel`**
8//!     (one site known), or **`complete_novel`** (neither site known);
9//!   - counts both per-read **events** and distinct **junctions**.
10//!
11//! ## Origin
12//!
13//! This crate is an independent Rust reimplementation based on:
14//! - `RSeQC`: `junction_annotation.py` (LGPL-2.1+), Wang et al. 2012
15//!   <https://doi.org/10.1093/bioinformatics/bts356>
16//! - The SAM/BAM format specification (MIT)
17//! - BED12 format specification
18//! - Black-box behaviour testing against `RSeQC` 5.0.4
19//!
20//! No source code from the GPL/LGPL upstream was used as reference during
21//! implementation; the algorithm is derived from the published method,
22//! the public format specs, and black-box behavioural testing.
23//!
24//! License: MIT OR Apache-2.0.
25//! Upstream credit: `RSeQC` <https://rseqc.sourceforge.net/> (LGPL-2.1+).
26
27#![allow(clippy::cast_precision_loss)]
28
29use std::collections::{HashMap, HashSet};
30use std::io::Write;
31use std::num::NonZero;
32use std::path::Path;
33
34use rsomics_bamio::raw::{self, RawRecord};
35use rsomics_common::{Result, RsomicsError};
36use serde::Serialize;
37
38// BAM flag bits (SAM spec §1.4).
39const FLAG_QCFAIL: u16 = 0x0200;
40const FLAG_DUPLICATE: u16 = 0x0400;
41const FLAG_SECONDARY: u16 = 0x0100;
42const FLAG_UNMAPPED: u16 = 0x0004;
43
44/// Known splice-site sets built from the BED12 intron boundaries.
45///
46/// `intron_starts` = set of intron start positions (= exon end positions, 0-based half-open end).
47/// `intron_ends`   = set of intron end positions (= next exon start, 0-based).
48///
49/// `RSeQC` uses chromosome names uppercased as keys. We mirror that.
50pub struct KnownSites {
51    /// Per-chromosome set of intron start coordinates (exon end, 0-based).
52    pub intron_starts: HashMap<String, HashSet<i64>>,
53    /// Per-chromosome set of intron end coordinates (next exon start, 0-based).
54    pub intron_ends: HashMap<String, HashSet<i64>>,
55}
56
57impl KnownSites {
58    /// Parse a BED12 file and build the known-intron coordinate sets.
59    ///
60    /// Skips comment/track/browser lines and genes with a single exon
61    /// (`blockCount == 1`), mirroring `RSeQC`'s `if int(fields[9]) == 1: continue`.
62    pub fn from_bed12(path: &Path) -> Result<Self> {
63        let content = std::fs::read_to_string(path)
64            .map_err(|e| RsomicsError::Io(std::io::Error::other(format!("reading BED12: {e}"))))?;
65
66        let mut intron_starts: HashMap<String, HashSet<i64>> = HashMap::new();
67        let mut intron_ends: HashMap<String, HashSet<i64>> = HashMap::new();
68
69        for line in content.lines() {
70            if line.starts_with('#') || line.starts_with("track") || line.starts_with("browser") {
71                continue;
72            }
73            let fields: Vec<&str> = line.split('\t').collect();
74            if fields.len() < 12 {
75                eprintln!("[NOTE:input bed must be 12-column] skipped this line: {line}");
76                continue;
77            }
78
79            let chrom = fields[0].to_uppercase();
80            let Ok(tx_start) = fields[1].parse::<i64>() else {
81                eprintln!("[NOTE:input bed must be 12-column] skipped this line: {line}");
82                continue;
83            };
84
85            // Skip single-exon transcripts — no introns to index.
86            let Ok(block_count) = fields[9].parse::<usize>() else {
87                continue;
88            };
89            if block_count == 1 {
90                continue;
91            }
92
93            // Parse block sizes and relative block starts.
94            let block_sizes: Vec<i64> = fields[10]
95                .trim_end_matches(',')
96                .split(',')
97                .filter(|s| !s.is_empty())
98                .filter_map(|s| s.parse().ok())
99                .collect();
100            let block_starts: Vec<i64> = fields[11]
101                .trim_end_matches(',')
102                .split(',')
103                .filter(|s| !s.is_empty())
104                .filter_map(|s| s.parse().ok())
105                .collect();
106
107            if block_sizes.len() < 2 || block_starts.len() < 2 {
108                continue;
109            }
110
111            // Absolute exon starts and ends.
112            let exon_starts: Vec<i64> = block_starts.iter().map(|&s| s + tx_start).collect();
113            let exon_ends: Vec<i64> = exon_starts
114                .iter()
115                .zip(block_sizes.iter())
116                .map(|(&s, &sz)| s + sz)
117                .collect();
118
119            // Intron start = exon end (0-based); intron end = next exon start (0-based).
120            for i in 0..exon_ends.len() - 1 {
121                let i_st = exon_ends[i];
122                let i_end = exon_starts[i + 1];
123                intron_starts.entry(chrom.clone()).or_default().insert(i_st);
124                intron_ends.entry(chrom.clone()).or_default().insert(i_end);
125            }
126        }
127
128        Ok(Self {
129            intron_starts,
130            intron_ends,
131        })
132    }
133
134    /// Classify an intron by whether its splice sites are known.
135    #[must_use]
136    pub fn classify(&self, chrom: &str, intron_start: i64, intron_end: i64) -> JunctionClass {
137        let chrom_up = chrom.to_uppercase();
138        let known_start = self
139            .intron_starts
140            .get(&chrom_up)
141            .is_some_and(|s| s.contains(&intron_start));
142        let known_end = self
143            .intron_ends
144            .get(&chrom_up)
145            .is_some_and(|s| s.contains(&intron_end));
146        match (known_start, known_end) {
147            (true, true) => JunctionClass::Known,
148            (false, false) => JunctionClass::CompleteNovel,
149            _ => JunctionClass::PartialNovel,
150        }
151    }
152}
153
154/// Classification of a splice junction relative to the gene model.
155#[derive(Clone, Copy, Debug, PartialEq, Eq, Hash)]
156pub enum JunctionClass {
157    Known,
158    PartialNovel,
159    CompleteNovel,
160}
161
162/// Summary counts produced by junction annotation.
163#[derive(Debug, Clone, Default, Serialize)]
164pub struct JunctionCounts {
165    /// Total N-op occurrences seen across all reads (before min-intron filter).
166    pub total_events: u64,
167    /// Events that were skipped for being shorter than `min_intron`.
168    pub filtered_events: u64,
169    /// Per-read-occurrence counts (events) for passing junctions.
170    pub known_events: u64,
171    pub partial_novel_events: u64,
172    pub novel_events: u64,
173    /// Distinct-junction counts.
174    pub known_junctions: u64,
175    pub partial_novel_junctions: u64,
176    pub novel_junctions: u64,
177}
178
179impl JunctionCounts {
180    /// Total distinct junctions.
181    #[must_use]
182    pub fn total_junctions(&self) -> u64 {
183        self.known_junctions + self.partial_novel_junctions + self.novel_junctions
184    }
185
186    /// Emit the exact stderr text format that `RSeQC junction_annotation.py` prints.
187    ///
188    /// `RSeQC` emits two `===` blocks to stderr: first events (total includes
189    /// filtered events), then junctions (distinct passing junctions).
190    /// The `total = N` line (total events including filtered) goes to stdout.
191    pub fn write_rseqc_stderr<W: Write>(&self, mut out: W) -> std::io::Result<()> {
192        writeln!(out)?;
193        writeln!(
194            out,
195            "==================================================================="
196        )?;
197        writeln!(out, "Total splicing  Events:\t{}", self.total_events)?;
198        writeln!(out, "Known Splicing Events:\t{}", self.known_events)?;
199        writeln!(
200            out,
201            "Partial Novel Splicing Events:\t{}",
202            self.partial_novel_events
203        )?;
204        writeln!(out, "Novel Splicing Events:\t{}", self.novel_events)?;
205        writeln!(out, "Filtered Splicing Events:\t{}", self.filtered_events)?;
206
207        writeln!(out)?;
208        writeln!(
209            out,
210            "Total splicing  Junctions:\t{}",
211            self.total_junctions()
212        )?;
213        writeln!(out, "Known Splicing Junctions:\t{}", self.known_junctions)?;
214        writeln!(
215            out,
216            "Partial Novel Splicing Junctions:\t{}",
217            self.partial_novel_junctions
218        )?;
219        writeln!(out, "Novel Splicing Junctions:\t{}", self.novel_junctions)?;
220        writeln!(out)?;
221        writeln!(
222            out,
223            "==================================================================="
224        )?;
225        Ok(())
226    }
227
228    /// Emit `total = N\n` to stdout.
229    ///
230    /// Matches `RSeQC`'s `print("total = " + str(total_junc))` where
231    /// `total_junc` is the count of all N-ops seen, including filtered ones.
232    pub fn write_rseqc_stdout<W: Write>(&self, mut out: W) -> std::io::Result<()> {
233        writeln!(out, "total = {}", self.total_events)?;
234        Ok(())
235    }
236}
237
238/// Annotate splice junctions from `bam_path` against the BED12 gene model at `bed_path`.
239pub fn annotate_junctions(
240    bam_path: &Path,
241    bed_path: &Path,
242    min_intron: i64,
243    mapq_cut: u8,
244    workers: NonZero<usize>,
245) -> Result<JunctionCounts> {
246    eprintln!("Reading reference bed file: {} ... ", bed_path.display());
247    let known = KnownSites::from_bed12(bed_path)?;
248    eprintln!("Done");
249
250    if bam_path
251        .extension()
252        .is_some_and(|e| e.eq_ignore_ascii_case("bam"))
253    {
254        eprintln!("Load BAM file ... ");
255    } else {
256        eprintln!("Load SAM file ... ");
257    }
258
259    let mut reader = rsomics_bamio::open_with_workers(bam_path, workers)?;
260    let header = reader.read_header().map_err(RsomicsError::Io)?;
261
262    let ref_names: Vec<String> = header
263        .reference_sequences()
264        .keys()
265        .map(|k| String::from_utf8_lossy(k.as_ref()).into_owned())
266        .collect();
267
268    let counts = scan_reads(reader.get_mut(), &ref_names, &known, min_intron, mapq_cut)?;
269
270    eprintln!("Done");
271    Ok(counts)
272}
273
274/// Inner BAM-scanning loop: extract N-ops, classify, accumulate counts.
275fn scan_reads<R: std::io::Read>(
276    reader: &mut R,
277    ref_names: &[String],
278    known: &KnownSites,
279    min_intron: i64,
280    mapq_cut: u8,
281) -> Result<JunctionCounts> {
282    // Map (chrom, intron_start, intron_end) → per-read occurrence count (events).
283    let mut splicing_events: HashMap<(String, i64, i64), u64> = HashMap::new();
284
285    let mut total_events: u64 = 0;
286    let mut filtered_events: u64 = 0;
287    // Per-read event counters (before dedup into junctions).
288    let mut known_ev: u64 = 0;
289    let mut partial_ev: u64 = 0;
290    let mut novel_ev: u64 = 0;
291
292    let mut rec = RawRecord::default();
293
294    loop {
295        let bytes = raw::read_record(reader, &mut rec)?;
296        if bytes == 0 {
297            break;
298        }
299
300        let flags = rec.flags();
301        if flags & (FLAG_QCFAIL | FLAG_DUPLICATE | FLAG_SECONDARY | FLAG_UNMAPPED) != 0 {
302            continue;
303        }
304        if rec.mapping_quality() < mapq_cut {
305            continue;
306        }
307
308        let tid = rec.reference_sequence_id();
309        if tid < 0 {
310            continue;
311        }
312        #[allow(clippy::cast_sign_loss)]
313        let Some(chrom) = ref_names.get(tid as usize) else {
314            continue;
315        };
316
317        // Walk the CIGAR to collect N (intron/skip) blocks.
318        let introns = fetch_introns(chrom, i64::from(rec.alignment_start()), rec.cigar_ops());
319        if introns.is_empty() {
320            continue;
321        }
322
323        for (chr, i_st, i_end) in introns {
324            total_events += 1;
325            if i_end - i_st < min_intron {
326                filtered_events += 1;
327                continue;
328            }
329            *splicing_events
330                .entry((chr.clone(), i_st, i_end))
331                .or_insert(0) += 1;
332
333            // Classify each event occurrence (not just distinct junctions).
334            match known.classify(&chr, i_st, i_end) {
335                JunctionClass::Known => known_ev += 1,
336                JunctionClass::PartialNovel => partial_ev += 1,
337                JunctionClass::CompleteNovel => novel_ev += 1,
338            }
339        }
340    }
341
342    // Count distinct junctions.
343    let mut known_junc: u64 = 0;
344    let mut partial_junc: u64 = 0;
345    let mut novel_junc: u64 = 0;
346
347    for (chr, i_st, i_end) in splicing_events.keys() {
348        match known.classify(chr, *i_st, *i_end) {
349            JunctionClass::Known => known_junc += 1,
350            JunctionClass::PartialNovel => partial_junc += 1,
351            JunctionClass::CompleteNovel => novel_junc += 1,
352        }
353    }
354
355    Ok(JunctionCounts {
356        total_events,
357        filtered_events,
358        known_events: known_ev,
359        partial_novel_events: partial_ev,
360        novel_events: novel_ev,
361        known_junctions: known_junc,
362        partial_novel_junctions: partial_junc,
363        novel_junctions: novel_junc,
364    })
365}
366
367/// Walk a CIGAR op iterator and collect N-op (intron) spans as
368/// `(chrom, intron_start_0based, intron_end_0based)`.
369///
370/// BAM CIGAR opcodes: 0=M, 1=I, 2=D, 3=N, 4=S, 5=H, 6=P, 7==, 8=X.
371/// Reference-consuming ops: M(0), D(2), N(3), =(7), X(8).
372/// N(3) = intron skip; produces one output entry.
373fn fetch_introns<I>(chrom: &str, alignment_start: i64, ops: I) -> Vec<(String, i64, i64)>
374where
375    I: Iterator<Item = (u8, u32)>,
376{
377    let mut result = Vec::new();
378    let mut ref_pos = alignment_start;
379
380    for (op_code, op_len) in ops {
381        let len = i64::from(op_len);
382        match op_code {
383            0 | 2 | 7 | 8 => {
384                // M, D, =, X — consume reference
385                ref_pos += len;
386            }
387            3 => {
388                // N — intron skip
389                let i_st = ref_pos;
390                let i_end = ref_pos + len;
391                result.push((chrom.to_string(), i_st, i_end));
392                ref_pos = i_end;
393            }
394            // I(1), S(4), H(5), P(6) — do not consume reference
395            _ => {}
396        }
397    }
398    result
399}