Skip to main content

rsomics_junction_saturation/
lib.rs

1//! Subsample-based splice-junction saturation analysis.
2//!
3//! Algorithm (reimplemented from RSeQC junction_saturation.py):
4//! 1. Parse BED12 gene models; extract all annotated splice sites (junction
5//!    donor and acceptor positions) and annotated junctions (donor, acceptor
6//!    pairs) into hash sets.
7//! 2. Load all mapped, primary reads from the BAM.
8//! 3. Shuffle all read indices once with a seeded ChaCha12 RNG.
9//! 4. For each fraction F in [lower..upper] step S:
10//!    a. Take the first ⌊F% × total_reads⌋ indices from the shuffled order.
11//!    b. For each selected read, extract introns from the CIGAR `N` operations.
12//!    c. Classify each observed junction as:
13//!       - known: both donor and acceptor in the annotated junction set
14//!       - partial novel: one of donor/acceptor is annotated
15//!       - complete novel: neither is annotated
16//! 5. Write one TSV file `<prefix>.junction_saturation.txt` with columns:
17//!    `pct\tknown\tpartial_novel\tcomplete_novel`
18//!
19//! Using a single shuffle (prefix-based sampling) guarantees monotonicity:
20//! the read set at fraction F1 is always a subset of the set at F2 > F1.
21//! RSeQC's subsampling is non-deterministic (Python `random`). We use a
22//! seedable ChaCha12 RNG so results are reproducible when `--seed` is given.
23
24use std::collections::HashSet;
25use std::io::{BufRead, Write};
26use std::num::NonZero;
27use std::path::Path;
28
29use rand::SeedableRng;
30use rand::seq::SliceRandom;
31use rand_chacha::ChaCha12Rng;
32#[allow(clippy::wildcard_imports)]
33use rayon::prelude::*;
34use rsomics_common::{Result, RsomicsError};
35
36/// A splice junction: (chrom, donor_pos, acceptor_pos) in 0-based coordinates.
37/// Donor = intron start (one past last exon base), acceptor = intron end (first
38/// base of next exon). Stored as (intron_start, intron_end) 0-based half-open.
39type Junction = (String, u64, u64);
40
41/// Per-fraction junction counts.
42#[derive(Debug, Clone, Default)]
43pub struct FractionResult {
44    pub pct: u8,
45    pub known: usize,
46    pub partial_novel: usize,
47    pub complete_novel: usize,
48}
49
50/// Options for junction saturation analysis.
51#[derive(Debug, Clone)]
52pub struct JunctionSaturationOpts {
53    /// Lower sampling fraction (percent), inclusive. Default 5.
54    pub lower: u8,
55    /// Upper sampling fraction (percent), inclusive. Default 100.
56    pub upper: u8,
57    /// Step between fractions. Default 5.
58    pub step: u8,
59    /// Minimum mapping quality. Default 0.
60    pub min_mapq: u8,
61    /// Minimum intron length to count a CIGAR N op as a real splice junction.
62    /// Matches RSeQC default of 50 bp.
63    pub min_intron: u64,
64    /// RNG seed (None = random seed).
65    pub seed: Option<u64>,
66    /// BGZF inflate threads.
67    pub threads: NonZero<usize>,
68}
69
70impl Default for JunctionSaturationOpts {
71    fn default() -> Self {
72        Self {
73            lower: 5,
74            upper: 100,
75            step: 5,
76            min_mapq: 0,
77            min_intron: 50,
78            seed: None,
79            threads: NonZero::new(1).unwrap(),
80        }
81    }
82}
83
84// ── BED12 annotated junction extraction ──────────────────────────────────────
85
86/// A splice site: (chrom, position) 0-based.
87type SpliceSite = (String, u64);
88
89/// Load all annotated splice junctions and individual splice sites from a
90/// BED12 gene model file.
91///
92/// Returns `(junctions, splice_sites)` where:
93/// - `junctions`: set of `(chrom, intron_start, intron_end)` 0-based half-open
94/// - `splice_sites`: set of `(chrom, pos)` for every donor and acceptor
95pub(crate) fn load_annotated_junctions(
96    path: &Path,
97) -> Result<(HashSet<Junction>, HashSet<SpliceSite>)> {
98    let f = std::fs::File::open(path)
99        .map_err(|e| RsomicsError::InvalidInput(format!("{}: {e}", path.display())))?;
100    let reader = std::io::BufReader::new(f);
101    let mut junctions: HashSet<Junction> = HashSet::new();
102    let mut splice_sites: HashSet<SpliceSite> = HashSet::new();
103
104    for (lineno, line) in reader.lines().enumerate() {
105        let line = line.map_err(RsomicsError::Io)?;
106        let line = line.trim();
107        if line.is_empty() || line.starts_with('#') || line.starts_with("track") {
108            continue;
109        }
110        let cols: Vec<&str> = line.split('\t').collect();
111        if cols.len() < 12 {
112            return Err(RsomicsError::InvalidInput(format!(
113                "BED12 requires 12 columns at line {}; got {}",
114                lineno + 1,
115                cols.len()
116            )));
117        }
118
119        let chrom = cols[0].to_string();
120        let tx_start: u64 = cols[1]
121            .parse()
122            .map_err(|_| RsomicsError::InvalidInput(format!("bad start at line {}", lineno + 1)))?;
123        let block_count: usize = cols[9].parse().map_err(|_| {
124            RsomicsError::InvalidInput(format!("bad blockCount at line {}", lineno + 1))
125        })?;
126        if block_count < 2 {
127            continue; // single-exon gene — no junctions
128        }
129        let block_sizes: Vec<u64> = cols[10]
130            .trim_end_matches(',')
131            .split(',')
132            .map(|s| {
133                s.trim().parse::<u64>().map_err(|_| {
134                    RsomicsError::InvalidInput(format!("bad blockSize at line {}", lineno + 1))
135                })
136            })
137            .collect::<Result<_>>()?;
138        let block_starts: Vec<u64> = cols[11]
139            .trim_end_matches(',')
140            .split(',')
141            .map(|s| {
142                s.trim().parse::<u64>().map_err(|_| {
143                    RsomicsError::InvalidInput(format!("bad blockStart at line {}", lineno + 1))
144                })
145            })
146            .collect::<Result<_>>()?;
147
148        if block_sizes.len() < block_count || block_starts.len() < block_count {
149            return Err(RsomicsError::InvalidInput(format!(
150                "blockSizes/blockStarts length mismatch at line {}",
151                lineno + 1
152            )));
153        }
154
155        // Build exon absolute coordinates.
156        let exons: Vec<(u64, u64)> = (0..block_count)
157            .map(|i| {
158                let start = tx_start + block_starts[i];
159                let end = start + block_sizes[i];
160                (start, end)
161            })
162            .collect();
163
164        // Each consecutive exon pair gives one junction:
165        // donor = exon[i].end, acceptor = exon[i+1].start.
166        for i in 0..(block_count - 1) {
167            let donor = exons[i].1; // end of exon i (0-based, exclusive)
168            let acceptor = exons[i + 1].0; // start of exon i+1 (0-based, inclusive)
169            if donor >= acceptor {
170                continue; // degenerate
171            }
172            junctions.insert((chrom.clone(), donor, acceptor));
173            splice_sites.insert((chrom.clone(), donor));
174            splice_sites.insert((chrom.clone(), acceptor));
175        }
176    }
177
178    Ok((junctions, splice_sites))
179}
180
181// ── CIGAR parsing: extract introns from BAM records ──────────────────────────
182
183/// A raw BAM record's CIGAR-derived junction: (chrom_idx, intron_start,
184/// intron_end) where intron_start is 0-based.
185#[derive(Debug, Clone, PartialEq, Eq, Hash)]
186struct RawJunction {
187    ref_id: i32,
188    start: u64,
189    end: u64,
190}
191
192/// Extract splice junctions from CIGAR operations.
193/// Only CIGAR N operations with length >= `min_intron` are returned.
194fn extract_junctions(
195    cigar: &[(u8, u32)],
196    ref_start: u64,
197    min_intron: u64,
198    ref_id: i32,
199) -> Vec<RawJunction> {
200    let mut junctions = Vec::new();
201    let mut ref_pos = ref_start;
202
203    for &(op_code, op_len) in cigar {
204        let op_len = op_len as u64;
205        match op_code {
206            // M, X, = : consume reference
207            0 | 7 | 8 => ref_pos += op_len,
208            // D: consume reference (deletion)
209            2 => ref_pos += op_len,
210            // N: intron (skip reference)
211            3 => {
212                let intron_start = ref_pos;
213                let intron_end = ref_pos + op_len;
214                ref_pos = intron_end;
215                if op_len >= min_intron {
216                    junctions.push(RawJunction {
217                        ref_id,
218                        start: intron_start,
219                        end: intron_end,
220                    });
221                }
222            }
223            // I, S, H, P: don't consume reference
224            _ => {}
225        }
226    }
227    junctions
228}
229
230// ── BAM loading ───────────────────────────────────────────────────────────────
231
232struct RawRead {
233    ref_id: i32,
234    pos: u64,
235    cigar: Vec<(u8, u32)>,
236}
237
238/// Load all usable reads from a BAM file into memory.
239/// Filters: mapped, primary (not SECONDARY/SUPPLEMENTARY), not QCFAIL/DUP,
240/// MAPQ >= `min_mapq`.
241pub(crate) fn load_reads(
242    bam: &Path,
243    min_mapq: u8,
244    threads: NonZero<usize>,
245) -> Result<(Vec<RawRead>, Vec<String>)> {
246    let mut reader = rsomics_bamio::open_with_workers(bam, threads)?;
247    let header = reader.read_header().map_err(RsomicsError::Io)?;
248
249    let ref_names: Vec<String> = header
250        .reference_sequences()
251        .keys()
252        .map(|n| n.to_string())
253        .collect();
254
255    let inner = reader.get_mut();
256    let mut reads = Vec::new();
257    let mut raw = rsomics_bamio::raw::RawRecord::default();
258
259    loop {
260        let n = rsomics_bamio::raw::read_record(inner, &mut raw)?;
261        if n == 0 {
262            break;
263        }
264
265        let flags = raw.flags();
266        // Skip unmapped, secondary, supplementary, QC-fail, duplicate.
267        const SKIP: u16 = 0x4 | 0x100 | 0x800 | 0x200 | 0x400;
268        if flags & SKIP != 0 {
269            continue;
270        }
271        if raw.mapping_quality() < min_mapq {
272            continue;
273        }
274        let ref_id = raw.reference_sequence_id();
275        if ref_id < 0 {
276            continue;
277        }
278        let pos = raw.alignment_start() as u64;
279        // Collect CIGAR ops eagerly so the record can be overwritten.
280        let cigar: Vec<(u8, u32)> = raw.cigar_ops().collect();
281
282        reads.push(RawRead { ref_id, pos, cigar });
283    }
284
285    Ok((reads, ref_names))
286}
287
288// ── Saturation computation ────────────────────────────────────────────────────
289
290/// Run junction saturation analysis.
291///
292/// Writes one output file: `<prefix>.junction_saturation.txt` with columns
293/// `pct\tknown\tpartial_novel\tcomplete_novel`.
294pub fn run(bam: &Path, bed: &Path, prefix: &str, opts: &JunctionSaturationOpts) -> Result<()> {
295    let (anno_junctions, splice_sites) = load_annotated_junctions(bed)?;
296    let (reads, ref_names) = load_reads(bam, opts.min_mapq, opts.threads)?;
297
298    let n_reads = reads.len();
299
300    // Pre-extract all junctions from all reads.
301    let all_read_junctions: Vec<Vec<RawJunction>> = reads
302        .par_iter()
303        .map(|r| extract_junctions(&r.cigar, r.pos, opts.min_intron, r.ref_id))
304        .collect();
305
306    // Build the fraction list.
307    let fractions: Vec<u8> = {
308        let mut f = Vec::new();
309        let mut pct = opts.lower;
310        while pct <= opts.upper {
311            f.push(pct);
312            pct = pct.saturating_add(opts.step);
313            if pct > opts.upper && f.last().copied() != Some(opts.upper) {
314                f.push(opts.upper);
315                break;
316            }
317        }
318        f.dedup();
319        f
320    };
321
322    let seed = opts.seed.unwrap_or_else(rand::random);
323
324    // Shuffle all read indices once with the global seed.  Each fraction then
325    // takes the first ⌊F% × N⌋ indices from this single permutation, so
326    // sample(F1) ⊆ sample(F2) for F1 ≤ F2 — monotonicity is guaranteed by
327    // construction.
328    let shuffled_indices: Vec<usize> = {
329        let mut rng = ChaCha12Rng::seed_from_u64(seed);
330        let mut idx: Vec<usize> = (0..n_reads).collect();
331        idx.shuffle(&mut rng);
332        idx
333    };
334
335    // For each fraction, scan the first N shuffled reads and count junctions.
336    let results: Vec<FractionResult> = fractions
337        .par_iter()
338        .map(|&pct| {
339            let n_sample = ((n_reads as f64 * pct as f64 / 100.0).floor() as usize).min(n_reads);
340
341            // Collect unique junctions from sampled reads.
342            let mut obs: HashSet<(i32, u64, u64)> = HashSet::new();
343            for &idx in &shuffled_indices[..n_sample] {
344                for junc in &all_read_junctions[idx] {
345                    obs.insert((junc.ref_id, junc.start, junc.end));
346                }
347            }
348
349            // Classify junctions.
350            let mut known = 0usize;
351            let mut partial_novel = 0usize;
352            let mut complete_novel = 0usize;
353
354            for (ref_id, start, end) in &obs {
355                let chrom = match ref_names.get(*ref_id as usize) {
356                    Some(n) => n,
357                    None => continue,
358                };
359                let donor_key = (chrom.clone(), *start);
360                let acceptor_key = (chrom.clone(), *end);
361                let has_donor = splice_sites.contains(&donor_key);
362                let has_acceptor = splice_sites.contains(&acceptor_key);
363
364                let junc_key = (chrom.clone(), *start, *end);
365                if anno_junctions.contains(&junc_key) {
366                    known += 1;
367                } else if has_donor || has_acceptor {
368                    partial_novel += 1;
369                } else {
370                    complete_novel += 1;
371                }
372            }
373
374            FractionResult {
375                pct,
376                known,
377                partial_novel,
378                complete_novel,
379            }
380        })
381        .collect();
382
383    // Write output.
384    let out_path = format!("{prefix}.junction_saturation.txt");
385    let mut out = std::io::BufWriter::new(
386        std::fs::File::create(&out_path)
387            .map_err(|e| RsomicsError::InvalidInput(format!("{out_path}: {e}")))?,
388    );
389
390    writeln!(out, "pct\tknown\tpartial_novel\tcomplete_novel").map_err(RsomicsError::Io)?;
391    for r in &results {
392        writeln!(
393            out,
394            "{}\t{}\t{}\t{}",
395            r.pct, r.known, r.partial_novel, r.complete_novel
396        )
397        .map_err(RsomicsError::Io)?;
398    }
399
400    Ok(())
401}