1#![allow(clippy::cast_precision_loss)]
28
29use std::collections::{HashMap, HashSet};
30use std::io::Write;
31use std::num::NonZero;
32use std::path::Path;
33
34use rsomics_bamio::raw::{self, RawRecord};
35use rsomics_common::{Result, RsomicsError};
36use serde::Serialize;
37
38const FLAG_QCFAIL: u16 = 0x0200;
40const FLAG_DUPLICATE: u16 = 0x0400;
41const FLAG_SECONDARY: u16 = 0x0100;
42const FLAG_UNMAPPED: u16 = 0x0004;
43
44pub struct KnownSites {
51 pub intron_starts: HashMap<String, HashSet<i64>>,
53 pub intron_ends: HashMap<String, HashSet<i64>>,
55}
56
57impl KnownSites {
58 pub fn from_bed12(path: &Path) -> Result<Self> {
63 let content = std::fs::read_to_string(path)
64 .map_err(|e| RsomicsError::Io(std::io::Error::other(format!("reading BED12: {e}"))))?;
65
66 let mut intron_starts: HashMap<String, HashSet<i64>> = HashMap::new();
67 let mut intron_ends: HashMap<String, HashSet<i64>> = HashMap::new();
68
69 for line in content.lines() {
70 if line.starts_with('#') || line.starts_with("track") || line.starts_with("browser") {
71 continue;
72 }
73 let fields: Vec<&str> = line.split('\t').collect();
74 if fields.len() < 12 {
75 eprintln!("[NOTE:input bed must be 12-column] skipped this line: {line}");
76 continue;
77 }
78
79 let chrom = fields[0].to_uppercase();
80 let Ok(tx_start) = fields[1].parse::<i64>() else {
81 eprintln!("[NOTE:input bed must be 12-column] skipped this line: {line}");
82 continue;
83 };
84
85 let Ok(block_count) = fields[9].parse::<usize>() else {
87 continue;
88 };
89 if block_count == 1 {
90 continue;
91 }
92
93 let block_sizes: Vec<i64> = fields[10]
95 .trim_end_matches(',')
96 .split(',')
97 .filter(|s| !s.is_empty())
98 .filter_map(|s| s.parse().ok())
99 .collect();
100 let block_starts: Vec<i64> = fields[11]
101 .trim_end_matches(',')
102 .split(',')
103 .filter(|s| !s.is_empty())
104 .filter_map(|s| s.parse().ok())
105 .collect();
106
107 if block_sizes.len() < 2 || block_starts.len() < 2 {
108 continue;
109 }
110
111 let exon_starts: Vec<i64> = block_starts.iter().map(|&s| s + tx_start).collect();
113 let exon_ends: Vec<i64> = exon_starts
114 .iter()
115 .zip(block_sizes.iter())
116 .map(|(&s, &sz)| s + sz)
117 .collect();
118
119 for i in 0..exon_ends.len() - 1 {
121 let i_st = exon_ends[i];
122 let i_end = exon_starts[i + 1];
123 intron_starts.entry(chrom.clone()).or_default().insert(i_st);
124 intron_ends.entry(chrom.clone()).or_default().insert(i_end);
125 }
126 }
127
128 Ok(Self {
129 intron_starts,
130 intron_ends,
131 })
132 }
133
134 #[must_use]
136 pub fn classify(&self, chrom: &str, intron_start: i64, intron_end: i64) -> JunctionClass {
137 let chrom_up = chrom.to_uppercase();
138 let known_start = self
139 .intron_starts
140 .get(&chrom_up)
141 .is_some_and(|s| s.contains(&intron_start));
142 let known_end = self
143 .intron_ends
144 .get(&chrom_up)
145 .is_some_and(|s| s.contains(&intron_end));
146 match (known_start, known_end) {
147 (true, true) => JunctionClass::Known,
148 (false, false) => JunctionClass::CompleteNovel,
149 _ => JunctionClass::PartialNovel,
150 }
151 }
152}
153
154#[derive(Clone, Copy, Debug, PartialEq, Eq, Hash)]
156pub enum JunctionClass {
157 Known,
158 PartialNovel,
159 CompleteNovel,
160}
161
162#[derive(Debug, Clone, Default, Serialize)]
164pub struct JunctionCounts {
165 pub total_events: u64,
167 pub filtered_events: u64,
169 pub known_events: u64,
171 pub partial_novel_events: u64,
172 pub novel_events: u64,
173 pub known_junctions: u64,
175 pub partial_novel_junctions: u64,
176 pub novel_junctions: u64,
177}
178
179impl JunctionCounts {
180 #[must_use]
182 pub fn total_junctions(&self) -> u64 {
183 self.known_junctions + self.partial_novel_junctions + self.novel_junctions
184 }
185
186 pub fn write_rseqc_stderr<W: Write>(&self, mut out: W) -> std::io::Result<()> {
192 writeln!(out)?;
193 writeln!(
194 out,
195 "==================================================================="
196 )?;
197 writeln!(out, "Total splicing Events:\t{}", self.total_events)?;
198 writeln!(out, "Known Splicing Events:\t{}", self.known_events)?;
199 writeln!(
200 out,
201 "Partial Novel Splicing Events:\t{}",
202 self.partial_novel_events
203 )?;
204 writeln!(out, "Novel Splicing Events:\t{}", self.novel_events)?;
205 writeln!(out, "Filtered Splicing Events:\t{}", self.filtered_events)?;
206
207 writeln!(out)?;
208 writeln!(
209 out,
210 "Total splicing Junctions:\t{}",
211 self.total_junctions()
212 )?;
213 writeln!(out, "Known Splicing Junctions:\t{}", self.known_junctions)?;
214 writeln!(
215 out,
216 "Partial Novel Splicing Junctions:\t{}",
217 self.partial_novel_junctions
218 )?;
219 writeln!(out, "Novel Splicing Junctions:\t{}", self.novel_junctions)?;
220 writeln!(out)?;
221 writeln!(
222 out,
223 "==================================================================="
224 )?;
225 Ok(())
226 }
227
228 pub fn write_rseqc_stdout<W: Write>(&self, mut out: W) -> std::io::Result<()> {
233 writeln!(out, "total = {}", self.total_events)?;
234 Ok(())
235 }
236}
237
238pub fn annotate_junctions(
240 bam_path: &Path,
241 bed_path: &Path,
242 min_intron: i64,
243 mapq_cut: u8,
244 workers: NonZero<usize>,
245) -> Result<JunctionCounts> {
246 eprintln!("Reading reference bed file: {} ... ", bed_path.display());
247 let known = KnownSites::from_bed12(bed_path)?;
248 eprintln!("Done");
249
250 if bam_path
251 .extension()
252 .is_some_and(|e| e.eq_ignore_ascii_case("bam"))
253 {
254 eprintln!("Load BAM file ... ");
255 } else {
256 eprintln!("Load SAM file ... ");
257 }
258
259 let mut reader = rsomics_bamio::open_with_workers(bam_path, workers)?;
260 let header = reader.read_header().map_err(RsomicsError::Io)?;
261
262 let ref_names: Vec<String> = header
263 .reference_sequences()
264 .keys()
265 .map(|k| String::from_utf8_lossy(k.as_ref()).into_owned())
266 .collect();
267
268 let counts = scan_reads(reader.get_mut(), &ref_names, &known, min_intron, mapq_cut)?;
269
270 eprintln!("Done");
271 Ok(counts)
272}
273
274fn scan_reads<R: std::io::Read>(
276 reader: &mut R,
277 ref_names: &[String],
278 known: &KnownSites,
279 min_intron: i64,
280 mapq_cut: u8,
281) -> Result<JunctionCounts> {
282 let mut splicing_events: HashMap<(String, i64, i64), u64> = HashMap::new();
284
285 let mut total_events: u64 = 0;
286 let mut filtered_events: u64 = 0;
287 let mut known_ev: u64 = 0;
289 let mut partial_ev: u64 = 0;
290 let mut novel_ev: u64 = 0;
291
292 let mut rec = RawRecord::default();
293
294 loop {
295 let bytes = raw::read_record(reader, &mut rec)?;
296 if bytes == 0 {
297 break;
298 }
299
300 let flags = rec.flags();
301 if flags & (FLAG_QCFAIL | FLAG_DUPLICATE | FLAG_SECONDARY | FLAG_UNMAPPED) != 0 {
302 continue;
303 }
304 if rec.mapping_quality() < mapq_cut {
305 continue;
306 }
307
308 let tid = rec.reference_sequence_id();
309 if tid < 0 {
310 continue;
311 }
312 #[allow(clippy::cast_sign_loss)]
313 let Some(chrom) = ref_names.get(tid as usize) else {
314 continue;
315 };
316
317 let introns = fetch_introns(chrom, i64::from(rec.alignment_start()), rec.cigar_ops());
319 if introns.is_empty() {
320 continue;
321 }
322
323 for (chr, i_st, i_end) in introns {
324 total_events += 1;
325 if i_end - i_st < min_intron {
326 filtered_events += 1;
327 continue;
328 }
329 *splicing_events
330 .entry((chr.clone(), i_st, i_end))
331 .or_insert(0) += 1;
332
333 match known.classify(&chr, i_st, i_end) {
335 JunctionClass::Known => known_ev += 1,
336 JunctionClass::PartialNovel => partial_ev += 1,
337 JunctionClass::CompleteNovel => novel_ev += 1,
338 }
339 }
340 }
341
342 let mut known_junc: u64 = 0;
344 let mut partial_junc: u64 = 0;
345 let mut novel_junc: u64 = 0;
346
347 for (chr, i_st, i_end) in splicing_events.keys() {
348 match known.classify(chr, *i_st, *i_end) {
349 JunctionClass::Known => known_junc += 1,
350 JunctionClass::PartialNovel => partial_junc += 1,
351 JunctionClass::CompleteNovel => novel_junc += 1,
352 }
353 }
354
355 Ok(JunctionCounts {
356 total_events,
357 filtered_events,
358 known_events: known_ev,
359 partial_novel_events: partial_ev,
360 novel_events: novel_ev,
361 known_junctions: known_junc,
362 partial_novel_junctions: partial_junc,
363 novel_junctions: novel_junc,
364 })
365}
366
367fn fetch_introns<I>(chrom: &str, alignment_start: i64, ops: I) -> Vec<(String, i64, i64)>
374where
375 I: Iterator<Item = (u8, u32)>,
376{
377 let mut result = Vec::new();
378 let mut ref_pos = alignment_start;
379
380 for (op_code, op_len) in ops {
381 let len = i64::from(op_len);
382 match op_code {
383 0 | 2 | 7 | 8 => {
384 ref_pos += len;
386 }
387 3 => {
388 let i_st = ref_pos;
390 let i_end = ref_pos + len;
391 result.push((chrom.to_string(), i_st, i_end));
392 ref_pos = i_end;
393 }
394 _ => {}
396 }
397 }
398 result
399}