1#![allow(clippy::cast_precision_loss, clippy::cast_possible_truncation)]
29
30use std::collections::HashSet;
31use std::fs::File;
32use std::path::Path;
33
34use noodles::bam;
35use noodles::sam::alignment::record::cigar::op::Kind;
36use rsomics_common::{Result, RsomicsError};
37use serde::Serialize;
38
39#[derive(Debug, Clone)]
41pub struct TinOutput {
42 pub gene_id: String,
43 pub chrom: String,
44 pub tx_start: i64,
45 pub tx_end: i64,
46 pub tin: f64,
48}
49
50#[derive(Debug, Serialize)]
52pub struct TinSummary {
53 pub mean: f64,
54 pub median: f64,
55 pub stdev: f64,
56}
57
58#[derive(Debug, Clone)]
60struct Transcript {
61 gene_id: String,
62 chrom: String,
63 tx_start: i64,
65 tx_end: i64,
66 exons: Vec<(i64, i64)>,
68}
69
70impl Transcript {
71 fn mrna_len(&self) -> i64 {
72 self.exons.iter().map(|(s, e)| e - s).sum()
73 }
74}
75
76fn parse_bed12(path: &Path) -> Result<Vec<Transcript>> {
78 let content = std::fs::read_to_string(path)
79 .map_err(|e| RsomicsError::Io(std::io::Error::other(format!("reading BED12: {e}"))))?;
80
81 let mut transcripts = Vec::new();
82
83 for line in content.lines() {
84 if line.starts_with('#') || line.starts_with("track") || line.starts_with("browser") {
85 continue;
86 }
87 let fields: Vec<&str> = line.split('\t').collect();
88 if fields.len() < 12 {
89 continue;
90 }
91
92 let chrom = fields[0].to_string();
93 let Ok(tx_start) = fields[1].parse::<i64>() else {
94 continue;
95 };
96 let Ok(tx_end) = fields[2].parse::<i64>() else {
97 continue;
98 };
99 let gene_id = fields[3].to_string();
100
101 let Ok(block_count) = fields[9].parse::<usize>() else {
102 continue;
103 };
104
105 let block_sizes: Vec<i64> = fields[10]
106 .trim_end_matches(',')
107 .split(',')
108 .filter(|s| !s.is_empty())
109 .filter_map(|s| s.parse().ok())
110 .collect();
111 let block_starts: Vec<i64> = fields[11]
112 .trim_end_matches(',')
113 .split(',')
114 .filter(|s| !s.is_empty())
115 .filter_map(|s| s.parse().ok())
116 .collect();
117
118 if block_sizes.len() != block_count || block_starts.len() != block_count {
119 continue;
120 }
121
122 let exons: Vec<(i64, i64)> = block_starts
123 .iter()
124 .zip(block_sizes.iter())
125 .map(|(&rel_start, &size)| {
126 let abs_start = tx_start + rel_start;
127 (abs_start, abs_start + size)
128 })
129 .collect();
130
131 if exons.is_empty() {
132 continue;
133 }
134
135 transcripts.push(Transcript {
136 gene_id,
137 chrom,
138 tx_start,
139 tx_end,
140 exons,
141 });
142 }
143
144 Ok(transcripts)
145}
146
147fn pick_positions(tx: &Transcript, sample_size: usize) -> Vec<i64> {
156 #[allow(clippy::cast_sign_loss)]
157 let mrna_len = tx.mrna_len() as usize;
158 if mrna_len == 0 {
159 return Vec::new();
160 }
161
162 let mut exon_bounds: Vec<i64> = Vec::with_capacity(tx.exons.len() * 2);
164 for &(ex_start, ex_end) in &tx.exons {
165 exon_bounds.push(ex_start + 1);
166 exon_bounds.push(ex_end);
167 }
168
169 let chose_bases: Vec<i64> = if mrna_len <= sample_size {
174 let mut v: Vec<i64> = Vec::with_capacity(mrna_len);
176 for &(ex_start, ex_end) in &tx.exons {
177 for gpos in (ex_start + 1)..=ex_end {
178 v.push(gpos);
179 }
180 }
181 v
182 } else {
183 let step = mrna_len / sample_size;
184 let mut sampled: Vec<i64> = Vec::with_capacity(mrna_len / step + 1);
187 let mut mrna_offset: usize = 0; let mut next_idx: usize = 0; for &(ex_start, ex_end) in &tx.exons {
191 #[allow(clippy::cast_sign_loss)]
192 let exon_len = (ex_end - ex_start) as usize;
193 while next_idx < mrna_offset + exon_len {
196 let local = next_idx - mrna_offset; #[allow(clippy::cast_possible_wrap)]
198 let gpos = ex_start + 1 + local as i64; sampled.push(gpos);
200 next_idx += step;
201 }
202 mrna_offset += exon_len;
203 }
204 sampled
205 };
206
207 let mut seen: HashSet<i64> = HashSet::with_capacity(exon_bounds.len() + chose_bases.len());
210 let mut merged: Vec<i64> = Vec::with_capacity(exon_bounds.len() + chose_bases.len());
211 for pos in exon_bounds.iter().chain(chose_bases.iter()) {
212 if seen.insert(*pos) {
213 merged.push(*pos);
214 }
215 }
216 merged.sort_unstable();
217 merged
218}
219
220fn shannon_entropy(vals: &[f64]) -> f64 {
224 let total: f64 = vals.iter().sum();
225 if total == 0.0 {
226 return 0.0;
227 }
228 vals.iter()
229 .filter(|&&v| v > 0.0)
230 .map(|&v| {
231 let p = v / total;
232 -p * p.ln()
233 })
234 .sum()
235}
236
237#[must_use]
243pub fn compute_tin_score(mrna_cov: &[u32], n: usize) -> f64 {
244 let mrna_len = mrna_cov.len();
245 if mrna_len == 0 || n == 0 {
246 return 0.0;
247 }
248
249 let step = mrna_len / n;
250 if step == 0 {
251 return 0.0;
252 }
253
254 let sample: Vec<f64> = (0..n).map(|i| f64::from(mrna_cov[i * step])).collect();
255 let total: f64 = sample.iter().sum();
256 if total == 0.0 {
257 return 0.0;
258 }
259 let h = shannon_entropy(&sample);
260 #[allow(clippy::cast_precision_loss)]
261 let l = n as f64;
262 100.0 * h.exp() / l
263}
264
265#[must_use]
270pub fn effective_sample_size(mut n: usize, mrna_len: usize) -> usize {
271 while n >= mrna_len {
272 n /= 2;
273 if n == 0 {
274 return 0;
275 }
276 }
277 n
278}
279
280struct TxAccum {
282 positions: Vec<i64>, coverage: Vec<f64>, unique_starts: HashSet<i64>, }
286
287#[allow(clippy::too_many_lines)]
288pub fn compute_tin(
300 bam_path: &Path,
301 bed_path: &Path,
302 min_cov: u64,
303 sample_size: usize,
304) -> Result<Vec<TinOutput>> {
305 let transcripts = parse_bed12(bed_path)?;
306
307 let mut tx_by_idx: Vec<(usize, &Transcript)> = transcripts.iter().enumerate().collect();
311 tx_by_idx.sort_unstable_by(|a, b| {
312 a.1.chrom
313 .cmp(&b.1.chrom)
314 .then(a.1.tx_start.cmp(&b.1.tx_start))
315 });
316
317 let positions_vec: Vec<Vec<i64>> = transcripts
319 .iter()
320 .map(|tx| pick_positions(tx, sample_size))
321 .collect();
322
323 let mut accums: Vec<TxAccum> = positions_vec
325 .iter()
326 .map(|pos| TxAccum {
327 coverage: vec![0.0; pos.len()],
328 positions: pos.clone(),
329 unique_starts: HashSet::new(),
330 })
331 .collect();
332
333 let file = File::open(bam_path)
335 .map_err(|e| RsomicsError::InvalidInput(format!("{}: {e}", bam_path.display())))?;
336 let mut reader = bam::io::Reader::new(file);
337 let header = reader.read_header().map_err(RsomicsError::Io)?;
338
339 let mut chrom_txs: std::collections::HashMap<String, Vec<(i64, i64, usize)>> =
342 std::collections::HashMap::new();
343 for (orig_idx, tx) in transcripts.iter().enumerate() {
344 chrom_txs
345 .entry(tx.chrom.clone())
346 .or_default()
347 .push((tx.tx_start, tx.tx_end, orig_idx));
348 }
349 for v in chrom_txs.values_mut() {
351 v.sort_unstable_by_key(|&(s, _, _)| s);
352 }
353
354 let mut record = bam::Record::default();
355 loop {
356 match reader.read_record(&mut record) {
357 Ok(0) => break,
358 Ok(_) => {}
359 Err(e) => return Err(RsomicsError::Io(e)),
360 }
361
362 let flags = record.flags();
363 if flags.is_unmapped() || flags.is_secondary() || flags.is_qc_fail() {
364 continue;
365 }
366
367 let start_1based: i64 = match record.alignment_start() {
368 Some(Ok(pos)) => {
369 #[allow(clippy::cast_possible_wrap)]
370 let v = usize::from(pos) as i64;
371 v
372 }
373 _ => continue,
374 };
375 let start_0based = start_1based - 1;
376
377 let Some(Ok(ref_id)) = record.reference_sequence_id() else {
379 continue;
380 };
381 let chrom: &str = match header.reference_sequences().get_index(ref_id) {
382 Some((name, _)) => match std::str::from_utf8(name.as_ref()) {
383 Ok(s) => s,
384 Err(_) => continue,
385 },
386 None => continue,
387 };
388
389 let Some(tx_list) = chrom_txs.get(chrom) else {
390 continue;
391 };
392
393 let cigar = record.cigar();
395 let read_ref_len: i64 = cigar
396 .iter()
397 .filter_map(std::result::Result::ok)
398 .map(|op| match op.kind() {
399 Kind::Match
400 | Kind::SequenceMatch
401 | Kind::SequenceMismatch
402 | Kind::Deletion
403 | Kind::Skip => {
404 #[allow(clippy::cast_possible_wrap)]
405 {
406 op.len() as i64
407 }
408 }
409 _ => 0,
410 })
411 .sum();
412 let search_end = start_0based + read_ref_len; let first_candidate = tx_list.partition_point(|&(s, _, _)| s < start_0based);
420 let upper = tx_list.partition_point(|&(s, _, _)| s < search_end);
424
425 let lower = if first_candidate > 0 {
433 let mut lo = first_candidate;
435 while lo > 0 && tx_list[lo - 1].1 > start_0based {
436 lo -= 1;
437 }
438 lo
439 } else {
440 0
441 };
442
443 for &(tx_start, tx_end, orig_idx) in &tx_list[lower..upper] {
444 if tx_end <= start_0based {
445 continue; }
447 let accum = &mut accums[orig_idx];
449 let positions = &accum.positions;
450
451 if positions.is_empty() {
452 continue;
453 }
454
455 if start_0based >= tx_start && start_0based < tx_end {
457 accum.unique_starts.insert(start_0based);
458 }
459
460 let last_pos = *positions.last().unwrap();
462 #[allow(clippy::cast_sign_loss)]
463 let mut ref_cursor = start_1based as usize;
464 for op_result in cigar.iter() {
465 let Ok(op) = op_result else { break };
466 let len = op.len();
467 match op.kind() {
468 Kind::Match | Kind::SequenceMatch | Kind::SequenceMismatch => {
469 #[allow(clippy::cast_possible_wrap)]
470 let block_start = ref_cursor as i64;
471 #[allow(clippy::cast_possible_wrap)]
472 let block_end = block_start + len as i64;
473 if block_start > last_pos {
474 break;
475 }
476 let lo = positions.partition_point(|&p| p < block_start);
477 let hi = positions.partition_point(|&p| p < block_end);
478 for idx in lo..hi {
479 accum.coverage[idx] += 1.0;
480 }
481 ref_cursor += len;
482 }
483 Kind::Deletion | Kind::Skip => {
484 ref_cursor += len;
485 }
486 Kind::SoftClip | Kind::HardClip | Kind::Pad | Kind::Insertion => {}
487 }
488 }
489 }
490 }
491
492 let mut results: Vec<Option<TinOutput>> = (0..transcripts.len()).map(|_| None).collect();
494 for (orig_idx, tx) in transcripts.iter().enumerate() {
495 let accum = &accums[orig_idx];
496 let positions = &accum.positions;
497
498 let tin = if positions.is_empty() || accum.unique_starts.len() as u64 <= min_cov {
499 0.0
500 } else {
501 let h = shannon_entropy(&accum.coverage);
502 if h == 0.0 {
503 0.0
504 } else {
505 #[allow(clippy::cast_precision_loss)]
506 let l_f = positions.len() as f64;
507 100.0 * h.exp() / l_f
508 }
509 };
510
511 eprint!(" {} transcripts finished", orig_idx + 1);
512 results[orig_idx] = Some(TinOutput {
513 gene_id: tx.gene_id.clone(),
514 chrom: tx.chrom.clone(),
515 tx_start: tx.tx_start,
516 tx_end: tx.tx_end,
517 tin,
518 });
519 }
520
521 eprintln!();
522 Ok(results.into_iter().map(|r| r.unwrap()).collect())
523}
524
525#[must_use]
530pub fn summarise(outputs: &[TinOutput]) -> TinSummary {
531 let nonzero: Vec<f64> = outputs
532 .iter()
533 .filter(|r| r.tin > 0.0)
534 .map(|r| r.tin)
535 .collect();
536
537 if nonzero.is_empty() {
538 return TinSummary {
539 mean: 0.0,
540 median: 0.0,
541 stdev: 0.0,
542 };
543 }
544
545 #[allow(clippy::cast_precision_loss)]
546 let n = nonzero.len() as f64;
547 let mean = nonzero.iter().sum::<f64>() / n;
548
549 let mut sorted = nonzero.clone();
550 sorted.sort_by(|a, b| a.partial_cmp(b).unwrap());
551 let median = if sorted.len() % 2 == 1 {
552 sorted[sorted.len() / 2]
553 } else {
554 let mid = sorted.len() / 2;
555 sorted[mid - 1] / 2.0 + sorted[mid] / 2.0
557 };
558
559 let variance = nonzero.iter().map(|&x| (x - mean).powi(2)).sum::<f64>() / n;
561 let stdev = variance.sqrt();
562
563 TinSummary {
564 mean,
565 median,
566 stdev,
567 }
568}
569
570#[cfg(test)]
571mod tests {
572 use super::*;
573
574 #[test]
575 fn tin_uniform_100_positions() {
576 let cov: Vec<u32> = vec![10; 100];
578 let tin = compute_tin_score(&cov, 100);
579 assert!(
580 (tin - 100.0).abs() < 1e-10,
581 "expected TIN=100 for uniform, got {tin}"
582 );
583 }
584
585 #[test]
586 fn tin_single_spike() {
587 let mut cov = vec![0u32; 100];
589 cov[50] = 20;
590 let tin = compute_tin_score(&cov, 100);
591 assert!(
592 (tin - 1.0).abs() < 1e-10,
593 "expected TIN≈1 for spike, got {tin}"
594 );
595 }
596
597 #[test]
598 fn tin_zero_coverage() {
599 let cov = vec![0u32; 100];
600 let tin = compute_tin_score(&cov, 100);
601 assert!(tin == 0.0, "expected 0.0, got {tin}");
602 }
603
604 #[test]
605 fn effective_sample_size_no_halving() {
606 assert_eq!(effective_sample_size(100, 1000), 100);
608 }
609
610 #[test]
611 fn effective_sample_size_halving_once() {
612 assert_eq!(effective_sample_size(100, 100), 50);
614 }
615
616 #[test]
617 fn effective_sample_size_halving_twice() {
618 assert_eq!(effective_sample_size(100, 50), 25);
620 }
621
622 #[test]
623 fn parse_bed12_single_exon() {
624 use std::io::Write;
625 let mut f = tempfile::NamedTempFile::new().unwrap();
626 writeln!(f, "chr1\t0\t1000\ttx1\t0\t+\t0\t1000\t0\t1\t1000,\t0,").unwrap();
627 let txs = parse_bed12(f.path()).unwrap();
628 assert_eq!(txs.len(), 1);
629 assert_eq!(txs[0].mrna_len(), 1000);
630 assert_eq!(txs[0].exons, vec![(0, 1000)]);
631 }
632
633 #[test]
634 fn parse_bed12_multi_exon() {
635 use std::io::Write;
636 let mut f = tempfile::NamedTempFile::new().unwrap();
637 writeln!(
639 f,
640 "chr1\t100\t600\ttx2\t0\t+\t100\t600\t0\t3\t100,150,200,\t0,200,300,"
641 )
642 .unwrap();
643 let txs = parse_bed12(f.path()).unwrap();
644 assert_eq!(txs[0].mrna_len(), 450);
645 assert_eq!(txs[0].exons, vec![(100, 200), (300, 450), (400, 600)]);
646 }
647
648 #[test]
649 fn pick_positions_single_exon() {
650 let tx = Transcript {
655 gene_id: "tx".into(),
656 chrom: "chr1".into(),
657 tx_start: 0,
658 tx_end: 1000,
659 exons: vec![(0, 1000)],
660 };
661 let positions = pick_positions(&tx, 100);
662 assert_eq!(positions.len(), 101, "expected 101 positions");
663 assert_eq!(positions[0], 1, "first position should be 1 (1-based)");
664 assert_eq!(
665 *positions.last().unwrap(),
666 1000,
667 "last position should be 1000"
668 );
669 }
670
671 #[test]
672 fn summarise_nonzero_only() {
673 let outputs = vec![
674 TinOutput {
675 gene_id: "a".into(),
676 chrom: "chr1".into(),
677 tx_start: 0,
678 tx_end: 100,
679 tin: 90.0,
680 },
681 TinOutput {
682 gene_id: "b".into(),
683 chrom: "chr1".into(),
684 tx_start: 0,
685 tx_end: 100,
686 tin: 0.0,
687 },
688 TinOutput {
689 gene_id: "c".into(),
690 chrom: "chr1".into(),
691 tx_start: 0,
692 tx_end: 100,
693 tin: 80.0,
694 },
695 ];
696 let s = summarise(&outputs);
697 assert!((s.mean - 85.0).abs() < 1e-10, "mean = {}", s.mean);
698 assert!((s.median - 85.0).abs() < 1e-10, "median = {}", s.median);
699 assert!((s.stdev - 5.0).abs() < 1e-10, "stdev = {}", s.stdev);
701 }
702}