use std::collections::HashSet;
use std::io::{BufRead, Write};
use std::num::NonZero;
use std::path::Path;
use rand::SeedableRng;
use rand::seq::SliceRandom;
use rand_chacha::ChaCha12Rng;
#[allow(clippy::wildcard_imports)]
use rayon::prelude::*;
use rsomics_common::{Result, RsomicsError};
type Junction = (String, u64, u64);
#[derive(Debug, Clone, Default)]
pub struct FractionResult {
pub pct: u8,
pub known: usize,
pub partial_novel: usize,
pub complete_novel: usize,
}
#[derive(Debug, Clone)]
pub struct JunctionSaturationOpts {
pub lower: u8,
pub upper: u8,
pub step: u8,
pub min_mapq: u8,
pub min_intron: u64,
pub seed: Option<u64>,
pub threads: NonZero<usize>,
}
impl Default for JunctionSaturationOpts {
fn default() -> Self {
Self {
lower: 5,
upper: 100,
step: 5,
min_mapq: 0,
min_intron: 50,
seed: None,
threads: NonZero::new(1).unwrap(),
}
}
}
type SpliceSite = (String, u64);
pub(crate) fn load_annotated_junctions(
path: &Path,
) -> Result<(HashSet<Junction>, HashSet<SpliceSite>)> {
let f = std::fs::File::open(path)
.map_err(|e| RsomicsError::InvalidInput(format!("{}: {e}", path.display())))?;
let reader = std::io::BufReader::new(f);
let mut junctions: HashSet<Junction> = HashSet::new();
let mut splice_sites: HashSet<SpliceSite> = HashSet::new();
for (lineno, line) in reader.lines().enumerate() {
let line = line.map_err(RsomicsError::Io)?;
let line = line.trim();
if line.is_empty() || line.starts_with('#') || line.starts_with("track") {
continue;
}
let cols: Vec<&str> = line.split('\t').collect();
if cols.len() < 12 {
return Err(RsomicsError::InvalidInput(format!(
"BED12 requires 12 columns at line {}; got {}",
lineno + 1,
cols.len()
)));
}
let chrom = cols[0].to_string();
let tx_start: u64 = cols[1]
.parse()
.map_err(|_| RsomicsError::InvalidInput(format!("bad start at line {}", lineno + 1)))?;
let block_count: usize = cols[9].parse().map_err(|_| {
RsomicsError::InvalidInput(format!("bad blockCount at line {}", lineno + 1))
})?;
if block_count < 2 {
continue; }
let block_sizes: Vec<u64> = cols[10]
.trim_end_matches(',')
.split(',')
.map(|s| {
s.trim().parse::<u64>().map_err(|_| {
RsomicsError::InvalidInput(format!("bad blockSize at line {}", lineno + 1))
})
})
.collect::<Result<_>>()?;
let block_starts: Vec<u64> = cols[11]
.trim_end_matches(',')
.split(',')
.map(|s| {
s.trim().parse::<u64>().map_err(|_| {
RsomicsError::InvalidInput(format!("bad blockStart at line {}", lineno + 1))
})
})
.collect::<Result<_>>()?;
if block_sizes.len() < block_count || block_starts.len() < block_count {
return Err(RsomicsError::InvalidInput(format!(
"blockSizes/blockStarts length mismatch at line {}",
lineno + 1
)));
}
let exons: Vec<(u64, u64)> = (0..block_count)
.map(|i| {
let start = tx_start + block_starts[i];
let end = start + block_sizes[i];
(start, end)
})
.collect();
for i in 0..(block_count - 1) {
let donor = exons[i].1; let acceptor = exons[i + 1].0; if donor >= acceptor {
continue; }
junctions.insert((chrom.clone(), donor, acceptor));
splice_sites.insert((chrom.clone(), donor));
splice_sites.insert((chrom.clone(), acceptor));
}
}
Ok((junctions, splice_sites))
}
#[derive(Debug, Clone, PartialEq, Eq, Hash)]
struct RawJunction {
ref_id: i32,
start: u64,
end: u64,
}
fn extract_junctions(
cigar: &[(u8, u32)],
ref_start: u64,
min_intron: u64,
ref_id: i32,
) -> Vec<RawJunction> {
let mut junctions = Vec::new();
let mut ref_pos = ref_start;
for &(op_code, op_len) in cigar {
let op_len = op_len as u64;
match op_code {
0 | 7 | 8 => ref_pos += op_len,
2 => ref_pos += op_len,
3 => {
let intron_start = ref_pos;
let intron_end = ref_pos + op_len;
ref_pos = intron_end;
if op_len >= min_intron {
junctions.push(RawJunction {
ref_id,
start: intron_start,
end: intron_end,
});
}
}
_ => {}
}
}
junctions
}
struct RawRead {
ref_id: i32,
pos: u64,
cigar: Vec<(u8, u32)>,
}
pub(crate) fn load_reads(
bam: &Path,
min_mapq: u8,
threads: NonZero<usize>,
) -> Result<(Vec<RawRead>, Vec<String>)> {
let mut reader = rsomics_bamio::open_with_workers(bam, threads)?;
let header = reader.read_header().map_err(RsomicsError::Io)?;
let ref_names: Vec<String> = header
.reference_sequences()
.keys()
.map(|n| n.to_string())
.collect();
let inner = reader.get_mut();
let mut reads = Vec::new();
let mut raw = rsomics_bamio::raw::RawRecord::default();
loop {
let n = rsomics_bamio::raw::read_record(inner, &mut raw)?;
if n == 0 {
break;
}
let flags = raw.flags();
const SKIP: u16 = 0x4 | 0x100 | 0x800 | 0x200 | 0x400;
if flags & SKIP != 0 {
continue;
}
if raw.mapping_quality() < min_mapq {
continue;
}
let ref_id = raw.reference_sequence_id();
if ref_id < 0 {
continue;
}
let pos = raw.alignment_start() as u64;
let cigar: Vec<(u8, u32)> = raw.cigar_ops().collect();
reads.push(RawRead { ref_id, pos, cigar });
}
Ok((reads, ref_names))
}
pub fn run(bam: &Path, bed: &Path, prefix: &str, opts: &JunctionSaturationOpts) -> Result<()> {
let (anno_junctions, splice_sites) = load_annotated_junctions(bed)?;
let (reads, ref_names) = load_reads(bam, opts.min_mapq, opts.threads)?;
let n_reads = reads.len();
let all_read_junctions: Vec<Vec<RawJunction>> = reads
.par_iter()
.map(|r| extract_junctions(&r.cigar, r.pos, opts.min_intron, r.ref_id))
.collect();
let fractions: Vec<u8> = {
let mut f = Vec::new();
let mut pct = opts.lower;
while pct <= opts.upper {
f.push(pct);
pct = pct.saturating_add(opts.step);
if pct > opts.upper && f.last().copied() != Some(opts.upper) {
f.push(opts.upper);
break;
}
}
f.dedup();
f
};
let seed = opts.seed.unwrap_or_else(rand::random);
let shuffled_indices: Vec<usize> = {
let mut rng = ChaCha12Rng::seed_from_u64(seed);
let mut idx: Vec<usize> = (0..n_reads).collect();
idx.shuffle(&mut rng);
idx
};
let results: Vec<FractionResult> = fractions
.par_iter()
.map(|&pct| {
let n_sample = ((n_reads as f64 * pct as f64 / 100.0).floor() as usize).min(n_reads);
let mut obs: HashSet<(i32, u64, u64)> = HashSet::new();
for &idx in &shuffled_indices[..n_sample] {
for junc in &all_read_junctions[idx] {
obs.insert((junc.ref_id, junc.start, junc.end));
}
}
let mut known = 0usize;
let mut partial_novel = 0usize;
let mut complete_novel = 0usize;
for (ref_id, start, end) in &obs {
let chrom = match ref_names.get(*ref_id as usize) {
Some(n) => n,
None => continue,
};
let donor_key = (chrom.clone(), *start);
let acceptor_key = (chrom.clone(), *end);
let has_donor = splice_sites.contains(&donor_key);
let has_acceptor = splice_sites.contains(&acceptor_key);
let junc_key = (chrom.clone(), *start, *end);
if anno_junctions.contains(&junc_key) {
known += 1;
} else if has_donor || has_acceptor {
partial_novel += 1;
} else {
complete_novel += 1;
}
}
FractionResult {
pct,
known,
partial_novel,
complete_novel,
}
})
.collect();
let out_path = format!("{prefix}.junction_saturation.txt");
let mut out = std::io::BufWriter::new(
std::fs::File::create(&out_path)
.map_err(|e| RsomicsError::InvalidInput(format!("{out_path}: {e}")))?,
);
writeln!(out, "pct\tknown\tpartial_novel\tcomplete_novel").map_err(RsomicsError::Io)?;
for r in &results {
writeln!(
out,
"{}\t{}\t{}\t{}",
r.pct, r.known, r.partial_novel, r.complete_novel
)
.map_err(RsomicsError::Io)?;
}
Ok(())
}