rsomics_junction_saturation/
lib.rs1use std::collections::HashSet;
25use std::io::{BufRead, Write};
26use std::num::NonZero;
27use std::path::Path;
28
29use rand::SeedableRng;
30use rand::seq::SliceRandom;
31use rand_chacha::ChaCha12Rng;
32#[allow(clippy::wildcard_imports)]
33use rayon::prelude::*;
34use rsomics_common::{Result, RsomicsError};
35
36type Junction = (String, u64, u64);
40
41#[derive(Debug, Clone, Default)]
43pub struct FractionResult {
44 pub pct: u8,
45 pub known: usize,
46 pub partial_novel: usize,
47 pub complete_novel: usize,
48}
49
50#[derive(Debug, Clone)]
52pub struct JunctionSaturationOpts {
53 pub lower: u8,
55 pub upper: u8,
57 pub step: u8,
59 pub min_mapq: u8,
61 pub min_intron: u64,
64 pub seed: Option<u64>,
66 pub threads: NonZero<usize>,
68}
69
70impl Default for JunctionSaturationOpts {
71 fn default() -> Self {
72 Self {
73 lower: 5,
74 upper: 100,
75 step: 5,
76 min_mapq: 0,
77 min_intron: 50,
78 seed: None,
79 threads: NonZero::new(1).unwrap(),
80 }
81 }
82}
83
84type SpliceSite = (String, u64);
88
89pub(crate) fn load_annotated_junctions(
96 path: &Path,
97) -> Result<(HashSet<Junction>, HashSet<SpliceSite>)> {
98 let f = std::fs::File::open(path)
99 .map_err(|e| RsomicsError::InvalidInput(format!("{}: {e}", path.display())))?;
100 let reader = std::io::BufReader::new(f);
101 let mut junctions: HashSet<Junction> = HashSet::new();
102 let mut splice_sites: HashSet<SpliceSite> = HashSet::new();
103
104 for (lineno, line) in reader.lines().enumerate() {
105 let line = line.map_err(RsomicsError::Io)?;
106 let line = line.trim();
107 if line.is_empty() || line.starts_with('#') || line.starts_with("track") {
108 continue;
109 }
110 let cols: Vec<&str> = line.split('\t').collect();
111 if cols.len() < 12 {
112 return Err(RsomicsError::InvalidInput(format!(
113 "BED12 requires 12 columns at line {}; got {}",
114 lineno + 1,
115 cols.len()
116 )));
117 }
118
119 let chrom = cols[0].to_string();
120 let tx_start: u64 = cols[1]
121 .parse()
122 .map_err(|_| RsomicsError::InvalidInput(format!("bad start at line {}", lineno + 1)))?;
123 let block_count: usize = cols[9].parse().map_err(|_| {
124 RsomicsError::InvalidInput(format!("bad blockCount at line {}", lineno + 1))
125 })?;
126 if block_count < 2 {
127 continue; }
129 let block_sizes: Vec<u64> = cols[10]
130 .trim_end_matches(',')
131 .split(',')
132 .map(|s| {
133 s.trim().parse::<u64>().map_err(|_| {
134 RsomicsError::InvalidInput(format!("bad blockSize at line {}", lineno + 1))
135 })
136 })
137 .collect::<Result<_>>()?;
138 let block_starts: Vec<u64> = cols[11]
139 .trim_end_matches(',')
140 .split(',')
141 .map(|s| {
142 s.trim().parse::<u64>().map_err(|_| {
143 RsomicsError::InvalidInput(format!("bad blockStart at line {}", lineno + 1))
144 })
145 })
146 .collect::<Result<_>>()?;
147
148 if block_sizes.len() < block_count || block_starts.len() < block_count {
149 return Err(RsomicsError::InvalidInput(format!(
150 "blockSizes/blockStarts length mismatch at line {}",
151 lineno + 1
152 )));
153 }
154
155 let exons: Vec<(u64, u64)> = (0..block_count)
157 .map(|i| {
158 let start = tx_start + block_starts[i];
159 let end = start + block_sizes[i];
160 (start, end)
161 })
162 .collect();
163
164 for i in 0..(block_count - 1) {
167 let donor = exons[i].1; let acceptor = exons[i + 1].0; if donor >= acceptor {
170 continue; }
172 junctions.insert((chrom.clone(), donor, acceptor));
173 splice_sites.insert((chrom.clone(), donor));
174 splice_sites.insert((chrom.clone(), acceptor));
175 }
176 }
177
178 Ok((junctions, splice_sites))
179}
180
181#[derive(Debug, Clone, PartialEq, Eq, Hash)]
186struct RawJunction {
187 ref_id: i32,
188 start: u64,
189 end: u64,
190}
191
192fn extract_junctions(
195 cigar: &[(u8, u32)],
196 ref_start: u64,
197 min_intron: u64,
198 ref_id: i32,
199) -> Vec<RawJunction> {
200 let mut junctions = Vec::new();
201 let mut ref_pos = ref_start;
202
203 for &(op_code, op_len) in cigar {
204 let op_len = op_len as u64;
205 match op_code {
206 0 | 7 | 8 => ref_pos += op_len,
208 2 => ref_pos += op_len,
210 3 => {
212 let intron_start = ref_pos;
213 let intron_end = ref_pos + op_len;
214 ref_pos = intron_end;
215 if op_len >= min_intron {
216 junctions.push(RawJunction {
217 ref_id,
218 start: intron_start,
219 end: intron_end,
220 });
221 }
222 }
223 _ => {}
225 }
226 }
227 junctions
228}
229
230struct RawRead {
233 ref_id: i32,
234 pos: u64,
235 cigar: Vec<(u8, u32)>,
236}
237
238pub(crate) fn load_reads(
242 bam: &Path,
243 min_mapq: u8,
244 threads: NonZero<usize>,
245) -> Result<(Vec<RawRead>, Vec<String>)> {
246 let mut reader = rsomics_bamio::open_with_workers(bam, threads)?;
247 let header = reader.read_header().map_err(RsomicsError::Io)?;
248
249 let ref_names: Vec<String> = header
250 .reference_sequences()
251 .keys()
252 .map(|n| n.to_string())
253 .collect();
254
255 let inner = reader.get_mut();
256 let mut reads = Vec::new();
257 let mut raw = rsomics_bamio::raw::RawRecord::default();
258
259 loop {
260 let n = rsomics_bamio::raw::read_record(inner, &mut raw)?;
261 if n == 0 {
262 break;
263 }
264
265 let flags = raw.flags();
266 const SKIP: u16 = 0x4 | 0x100 | 0x800 | 0x200 | 0x400;
268 if flags & SKIP != 0 {
269 continue;
270 }
271 if raw.mapping_quality() < min_mapq {
272 continue;
273 }
274 let ref_id = raw.reference_sequence_id();
275 if ref_id < 0 {
276 continue;
277 }
278 let pos = raw.alignment_start() as u64;
279 let cigar: Vec<(u8, u32)> = raw.cigar_ops().collect();
281
282 reads.push(RawRead { ref_id, pos, cigar });
283 }
284
285 Ok((reads, ref_names))
286}
287
288pub fn run(bam: &Path, bed: &Path, prefix: &str, opts: &JunctionSaturationOpts) -> Result<()> {
295 let (anno_junctions, splice_sites) = load_annotated_junctions(bed)?;
296 let (reads, ref_names) = load_reads(bam, opts.min_mapq, opts.threads)?;
297
298 let n_reads = reads.len();
299
300 let all_read_junctions: Vec<Vec<RawJunction>> = reads
302 .par_iter()
303 .map(|r| extract_junctions(&r.cigar, r.pos, opts.min_intron, r.ref_id))
304 .collect();
305
306 let fractions: Vec<u8> = {
308 let mut f = Vec::new();
309 let mut pct = opts.lower;
310 while pct <= opts.upper {
311 f.push(pct);
312 pct = pct.saturating_add(opts.step);
313 if pct > opts.upper && f.last().copied() != Some(opts.upper) {
314 f.push(opts.upper);
315 break;
316 }
317 }
318 f.dedup();
319 f
320 };
321
322 let seed = opts.seed.unwrap_or_else(rand::random);
323
324 let shuffled_indices: Vec<usize> = {
329 let mut rng = ChaCha12Rng::seed_from_u64(seed);
330 let mut idx: Vec<usize> = (0..n_reads).collect();
331 idx.shuffle(&mut rng);
332 idx
333 };
334
335 let results: Vec<FractionResult> = fractions
337 .par_iter()
338 .map(|&pct| {
339 let n_sample = ((n_reads as f64 * pct as f64 / 100.0).floor() as usize).min(n_reads);
340
341 let mut obs: HashSet<(i32, u64, u64)> = HashSet::new();
343 for &idx in &shuffled_indices[..n_sample] {
344 for junc in &all_read_junctions[idx] {
345 obs.insert((junc.ref_id, junc.start, junc.end));
346 }
347 }
348
349 let mut known = 0usize;
351 let mut partial_novel = 0usize;
352 let mut complete_novel = 0usize;
353
354 for (ref_id, start, end) in &obs {
355 let chrom = match ref_names.get(*ref_id as usize) {
356 Some(n) => n,
357 None => continue,
358 };
359 let donor_key = (chrom.clone(), *start);
360 let acceptor_key = (chrom.clone(), *end);
361 let has_donor = splice_sites.contains(&donor_key);
362 let has_acceptor = splice_sites.contains(&acceptor_key);
363
364 let junc_key = (chrom.clone(), *start, *end);
365 if anno_junctions.contains(&junc_key) {
366 known += 1;
367 } else if has_donor || has_acceptor {
368 partial_novel += 1;
369 } else {
370 complete_novel += 1;
371 }
372 }
373
374 FractionResult {
375 pct,
376 known,
377 partial_novel,
378 complete_novel,
379 }
380 })
381 .collect();
382
383 let out_path = format!("{prefix}.junction_saturation.txt");
385 let mut out = std::io::BufWriter::new(
386 std::fs::File::create(&out_path)
387 .map_err(|e| RsomicsError::InvalidInput(format!("{out_path}: {e}")))?,
388 );
389
390 writeln!(out, "pct\tknown\tpartial_novel\tcomplete_novel").map_err(RsomicsError::Io)?;
391 for r in &results {
392 writeln!(
393 out,
394 "{}\t{}\t{}\t{}",
395 r.pct, r.known, r.partial_novel, r.complete_novel
396 )
397 .map_err(RsomicsError::Io)?;
398 }
399
400 Ok(())
401}