#![allow(clippy::cast_precision_loss, clippy::cast_possible_truncation)]
use std::collections::HashSet;
use std::fs::File;
use std::path::Path;
use noodles::bam;
use noodles::sam::alignment::record::cigar::op::Kind;
use rsomics_common::{Result, RsomicsError};
use serde::Serialize;
#[derive(Debug, Clone)]
pub struct TinOutput {
pub gene_id: String,
pub chrom: String,
pub tx_start: i64,
pub tx_end: i64,
pub tin: f64,
}
#[derive(Debug, Serialize)]
pub struct TinSummary {
pub mean: f64,
pub median: f64,
pub stdev: f64,
}
#[derive(Debug, Clone)]
struct Transcript {
gene_id: String,
chrom: String,
tx_start: i64,
tx_end: i64,
exons: Vec<(i64, i64)>,
}
impl Transcript {
fn mrna_len(&self) -> i64 {
self.exons.iter().map(|(s, e)| e - s).sum()
}
}
fn parse_bed12(path: &Path) -> Result<Vec<Transcript>> {
let content = std::fs::read_to_string(path)
.map_err(|e| RsomicsError::Io(std::io::Error::other(format!("reading BED12: {e}"))))?;
let mut transcripts = Vec::new();
for line in content.lines() {
if line.starts_with('#') || line.starts_with("track") || line.starts_with("browser") {
continue;
}
let fields: Vec<&str> = line.split('\t').collect();
if fields.len() < 12 {
continue;
}
let chrom = fields[0].to_string();
let Ok(tx_start) = fields[1].parse::<i64>() else {
continue;
};
let Ok(tx_end) = fields[2].parse::<i64>() else {
continue;
};
let gene_id = fields[3].to_string();
let Ok(block_count) = fields[9].parse::<usize>() else {
continue;
};
let block_sizes: Vec<i64> = fields[10]
.trim_end_matches(',')
.split(',')
.filter(|s| !s.is_empty())
.filter_map(|s| s.parse().ok())
.collect();
let block_starts: Vec<i64> = fields[11]
.trim_end_matches(',')
.split(',')
.filter(|s| !s.is_empty())
.filter_map(|s| s.parse().ok())
.collect();
if block_sizes.len() != block_count || block_starts.len() != block_count {
continue;
}
let exons: Vec<(i64, i64)> = block_starts
.iter()
.zip(block_sizes.iter())
.map(|(&rel_start, &size)| {
let abs_start = tx_start + rel_start;
(abs_start, abs_start + size)
})
.collect();
if exons.is_empty() {
continue;
}
transcripts.push(Transcript {
gene_id,
chrom,
tx_start,
tx_end,
exons,
});
}
Ok(transcripts)
}
fn pick_positions(tx: &Transcript, sample_size: usize) -> Vec<i64> {
#[allow(clippy::cast_sign_loss)]
let mrna_len = tx.mrna_len() as usize;
if mrna_len == 0 {
return Vec::new();
}
let mut exon_bounds: Vec<i64> = Vec::with_capacity(tx.exons.len() * 2);
for &(ex_start, ex_end) in &tx.exons {
exon_bounds.push(ex_start + 1);
exon_bounds.push(ex_end);
}
let chose_bases: Vec<i64> = if mrna_len <= sample_size {
let mut v: Vec<i64> = Vec::with_capacity(mrna_len);
for &(ex_start, ex_end) in &tx.exons {
for gpos in (ex_start + 1)..=ex_end {
v.push(gpos);
}
}
v
} else {
let step = mrna_len / sample_size;
let mut sampled: Vec<i64> = Vec::with_capacity(mrna_len / step + 1);
let mut mrna_offset: usize = 0; let mut next_idx: usize = 0;
for &(ex_start, ex_end) in &tx.exons {
#[allow(clippy::cast_sign_loss)]
let exon_len = (ex_end - ex_start) as usize;
while next_idx < mrna_offset + exon_len {
let local = next_idx - mrna_offset; #[allow(clippy::cast_possible_wrap)]
let gpos = ex_start + 1 + local as i64; sampled.push(gpos);
next_idx += step;
}
mrna_offset += exon_len;
}
sampled
};
let mut seen: HashSet<i64> = HashSet::with_capacity(exon_bounds.len() + chose_bases.len());
let mut merged: Vec<i64> = Vec::with_capacity(exon_bounds.len() + chose_bases.len());
for pos in exon_bounds.iter().chain(chose_bases.iter()) {
if seen.insert(*pos) {
merged.push(*pos);
}
}
merged.sort_unstable();
merged
}
fn shannon_entropy(vals: &[f64]) -> f64 {
let total: f64 = vals.iter().sum();
if total == 0.0 {
return 0.0;
}
vals.iter()
.filter(|&&v| v > 0.0)
.map(|&v| {
let p = v / total;
-p * p.ln()
})
.sum()
}
#[must_use]
pub fn compute_tin_score(mrna_cov: &[u32], n: usize) -> f64 {
let mrna_len = mrna_cov.len();
if mrna_len == 0 || n == 0 {
return 0.0;
}
let step = mrna_len / n;
if step == 0 {
return 0.0;
}
let sample: Vec<f64> = (0..n).map(|i| f64::from(mrna_cov[i * step])).collect();
let total: f64 = sample.iter().sum();
if total == 0.0 {
return 0.0;
}
let h = shannon_entropy(&sample);
#[allow(clippy::cast_precision_loss)]
let l = n as f64;
100.0 * h.exp() / l
}
#[must_use]
pub fn effective_sample_size(mut n: usize, mrna_len: usize) -> usize {
while n >= mrna_len {
n /= 2;
if n == 0 {
return 0;
}
}
n
}
struct TxAccum {
positions: Vec<i64>, coverage: Vec<f64>, unique_starts: HashSet<i64>, }
#[allow(clippy::too_many_lines)]
pub fn compute_tin(
bam_path: &Path,
bed_path: &Path,
min_cov: u64,
sample_size: usize,
) -> Result<Vec<TinOutput>> {
let transcripts = parse_bed12(bed_path)?;
let mut tx_by_idx: Vec<(usize, &Transcript)> = transcripts.iter().enumerate().collect();
tx_by_idx.sort_unstable_by(|a, b| {
a.1.chrom
.cmp(&b.1.chrom)
.then(a.1.tx_start.cmp(&b.1.tx_start))
});
let positions_vec: Vec<Vec<i64>> = transcripts
.iter()
.map(|tx| pick_positions(tx, sample_size))
.collect();
let mut accums: Vec<TxAccum> = positions_vec
.iter()
.map(|pos| TxAccum {
coverage: vec![0.0; pos.len()],
positions: pos.clone(),
unique_starts: HashSet::new(),
})
.collect();
let file = File::open(bam_path)
.map_err(|e| RsomicsError::InvalidInput(format!("{}: {e}", bam_path.display())))?;
let mut reader = bam::io::Reader::new(file);
let header = reader.read_header().map_err(RsomicsError::Io)?;
let mut chrom_txs: std::collections::HashMap<String, Vec<(i64, i64, usize)>> =
std::collections::HashMap::new();
for (orig_idx, tx) in transcripts.iter().enumerate() {
chrom_txs
.entry(tx.chrom.clone())
.or_default()
.push((tx.tx_start, tx.tx_end, orig_idx));
}
for v in chrom_txs.values_mut() {
v.sort_unstable_by_key(|&(s, _, _)| s);
}
let mut record = bam::Record::default();
loop {
match reader.read_record(&mut record) {
Ok(0) => break,
Ok(_) => {}
Err(e) => return Err(RsomicsError::Io(e)),
}
let flags = record.flags();
if flags.is_unmapped() || flags.is_secondary() || flags.is_qc_fail() {
continue;
}
let start_1based: i64 = match record.alignment_start() {
Some(Ok(pos)) => {
#[allow(clippy::cast_possible_wrap)]
let v = usize::from(pos) as i64;
v
}
_ => continue,
};
let start_0based = start_1based - 1;
let Some(Ok(ref_id)) = record.reference_sequence_id() else {
continue;
};
let chrom: &str = match header.reference_sequences().get_index(ref_id) {
Some((name, _)) => match std::str::from_utf8(name.as_ref()) {
Ok(s) => s,
Err(_) => continue,
},
None => continue,
};
let Some(tx_list) = chrom_txs.get(chrom) else {
continue;
};
let cigar = record.cigar();
let read_ref_len: i64 = cigar
.iter()
.filter_map(std::result::Result::ok)
.map(|op| match op.kind() {
Kind::Match
| Kind::SequenceMatch
| Kind::SequenceMismatch
| Kind::Deletion
| Kind::Skip => {
#[allow(clippy::cast_possible_wrap)]
{
op.len() as i64
}
}
_ => 0,
})
.sum();
let search_end = start_0based + read_ref_len; let first_candidate = tx_list.partition_point(|&(s, _, _)| s < start_0based);
let upper = tx_list.partition_point(|&(s, _, _)| s < search_end);
let lower = if first_candidate > 0 {
let mut lo = first_candidate;
while lo > 0 && tx_list[lo - 1].1 > start_0based {
lo -= 1;
}
lo
} else {
0
};
for &(tx_start, tx_end, orig_idx) in &tx_list[lower..upper] {
if tx_end <= start_0based {
continue; }
let accum = &mut accums[orig_idx];
let positions = &accum.positions;
if positions.is_empty() {
continue;
}
if start_0based >= tx_start && start_0based < tx_end {
accum.unique_starts.insert(start_0based);
}
let last_pos = *positions.last().unwrap();
#[allow(clippy::cast_sign_loss)]
let mut ref_cursor = start_1based as usize;
for op_result in cigar.iter() {
let Ok(op) = op_result else { break };
let len = op.len();
match op.kind() {
Kind::Match | Kind::SequenceMatch | Kind::SequenceMismatch => {
#[allow(clippy::cast_possible_wrap)]
let block_start = ref_cursor as i64;
#[allow(clippy::cast_possible_wrap)]
let block_end = block_start + len as i64;
if block_start > last_pos {
break;
}
let lo = positions.partition_point(|&p| p < block_start);
let hi = positions.partition_point(|&p| p < block_end);
for idx in lo..hi {
accum.coverage[idx] += 1.0;
}
ref_cursor += len;
}
Kind::Deletion | Kind::Skip => {
ref_cursor += len;
}
Kind::SoftClip | Kind::HardClip | Kind::Pad | Kind::Insertion => {}
}
}
}
}
let mut results: Vec<Option<TinOutput>> = (0..transcripts.len()).map(|_| None).collect();
for (orig_idx, tx) in transcripts.iter().enumerate() {
let accum = &accums[orig_idx];
let positions = &accum.positions;
let tin = if positions.is_empty() || accum.unique_starts.len() as u64 <= min_cov {
0.0
} else {
let h = shannon_entropy(&accum.coverage);
if h == 0.0 {
0.0
} else {
#[allow(clippy::cast_precision_loss)]
let l_f = positions.len() as f64;
100.0 * h.exp() / l_f
}
};
eprint!(" {} transcripts finished", orig_idx + 1);
results[orig_idx] = Some(TinOutput {
gene_id: tx.gene_id.clone(),
chrom: tx.chrom.clone(),
tx_start: tx.tx_start,
tx_end: tx.tx_end,
tin,
});
}
eprintln!();
Ok(results.into_iter().map(|r| r.unwrap()).collect())
}
#[must_use]
pub fn summarise(outputs: &[TinOutput]) -> TinSummary {
let nonzero: Vec<f64> = outputs
.iter()
.filter(|r| r.tin > 0.0)
.map(|r| r.tin)
.collect();
if nonzero.is_empty() {
return TinSummary {
mean: 0.0,
median: 0.0,
stdev: 0.0,
};
}
#[allow(clippy::cast_precision_loss)]
let n = nonzero.len() as f64;
let mean = nonzero.iter().sum::<f64>() / n;
let mut sorted = nonzero.clone();
sorted.sort_by(|a, b| a.partial_cmp(b).unwrap());
let median = if sorted.len() % 2 == 1 {
sorted[sorted.len() / 2]
} else {
let mid = sorted.len() / 2;
sorted[mid - 1] / 2.0 + sorted[mid] / 2.0
};
let variance = nonzero.iter().map(|&x| (x - mean).powi(2)).sum::<f64>() / n;
let stdev = variance.sqrt();
TinSummary {
mean,
median,
stdev,
}
}
#[cfg(test)]
mod tests {
use super::*;
#[test]
fn tin_uniform_100_positions() {
let cov: Vec<u32> = vec![10; 100];
let tin = compute_tin_score(&cov, 100);
assert!(
(tin - 100.0).abs() < 1e-10,
"expected TIN=100 for uniform, got {tin}"
);
}
#[test]
fn tin_single_spike() {
let mut cov = vec![0u32; 100];
cov[50] = 20;
let tin = compute_tin_score(&cov, 100);
assert!(
(tin - 1.0).abs() < 1e-10,
"expected TIN≈1 for spike, got {tin}"
);
}
#[test]
fn tin_zero_coverage() {
let cov = vec![0u32; 100];
let tin = compute_tin_score(&cov, 100);
assert!(tin == 0.0, "expected 0.0, got {tin}");
}
#[test]
fn effective_sample_size_no_halving() {
assert_eq!(effective_sample_size(100, 1000), 100);
}
#[test]
fn effective_sample_size_halving_once() {
assert_eq!(effective_sample_size(100, 100), 50);
}
#[test]
fn effective_sample_size_halving_twice() {
assert_eq!(effective_sample_size(100, 50), 25);
}
#[test]
fn parse_bed12_single_exon() {
use std::io::Write;
let mut f = tempfile::NamedTempFile::new().unwrap();
writeln!(f, "chr1\t0\t1000\ttx1\t0\t+\t0\t1000\t0\t1\t1000,\t0,").unwrap();
let txs = parse_bed12(f.path()).unwrap();
assert_eq!(txs.len(), 1);
assert_eq!(txs[0].mrna_len(), 1000);
assert_eq!(txs[0].exons, vec![(0, 1000)]);
}
#[test]
fn parse_bed12_multi_exon() {
use std::io::Write;
let mut f = tempfile::NamedTempFile::new().unwrap();
writeln!(
f,
"chr1\t100\t600\ttx2\t0\t+\t100\t600\t0\t3\t100,150,200,\t0,200,300,"
)
.unwrap();
let txs = parse_bed12(f.path()).unwrap();
assert_eq!(txs[0].mrna_len(), 450);
assert_eq!(txs[0].exons, vec![(100, 200), (300, 450), (400, 600)]);
}
#[test]
fn pick_positions_single_exon() {
let tx = Transcript {
gene_id: "tx".into(),
chrom: "chr1".into(),
tx_start: 0,
tx_end: 1000,
exons: vec![(0, 1000)],
};
let positions = pick_positions(&tx, 100);
assert_eq!(positions.len(), 101, "expected 101 positions");
assert_eq!(positions[0], 1, "first position should be 1 (1-based)");
assert_eq!(
*positions.last().unwrap(),
1000,
"last position should be 1000"
);
}
#[test]
fn summarise_nonzero_only() {
let outputs = vec![
TinOutput {
gene_id: "a".into(),
chrom: "chr1".into(),
tx_start: 0,
tx_end: 100,
tin: 90.0,
},
TinOutput {
gene_id: "b".into(),
chrom: "chr1".into(),
tx_start: 0,
tx_end: 100,
tin: 0.0,
},
TinOutput {
gene_id: "c".into(),
chrom: "chr1".into(),
tx_start: 0,
tx_end: 100,
tin: 80.0,
},
];
let s = summarise(&outputs);
assert!((s.mean - 85.0).abs() < 1e-10, "mean = {}", s.mean);
assert!((s.median - 85.0).abs() < 1e-10, "median = {}", s.median);
assert!((s.stdev - 5.0).abs() < 1e-10, "stdev = {}", s.stdev);
}
}