rsomics_tin/
lib.rs

1//! Transcript Integrity Number (TIN) computation from RNA-seq BAM data.
2//!
3//! TIN measures coverage uniformity across the mRNA body using Shannon entropy.
4//! Given per-base coverage `C_i` at sampled mRNA positions:
5//! `p_i = C_i / sum(C_j)`, `H = -sum(p_i * ln(p_i))` (non-zero only), `TIN = 100 * exp(H) / l`
6//! where `l` is the total number of sampled positions (including zero-coverage positions).
7//!
8//! Transcripts with fewer than `min_cov` UNIQUE read-start positions are assigned TIN=0.
9//! The sample count `n` is halved while `n >= mRNA_len`; when `mRNA_len <= n`, all bases
10//! are used. Exon boundary positions are always added to the position set (tin.py behaviour).
11//!
12//! ## Origin
13//!
14//! This crate is an independent Rust reimplementation of `tin.py` based on:
15//! - Wang L, Wang S, Li W. "`RSeQC`: quality control of RNA-seq experiments."
16//!   Bioinformatics. 2012;28(16):2184-5. doi:10.1093/bioinformatics/bts356
17//! - Wang L, Nie J, Sicotte H, et al. "Measure transcript integrity using RNA-seq data."
18//!   BMC Bioinformatics. 2016;17:58. doi:10.1186/s12859-016-0922-z
19//! - The BED12 format specification
20//! - Black-box behaviour testing against `RSeQC` `tin.py` 5.0.4
21//!
22//! No source code from the GPL-v3 upstream was used as reference during
23//! implementation. Test fixtures are independently generated.
24//!
25//! License: MIT OR Apache-2.0.
26//! Upstream credit: `RSeQC` <https://rseqc.sourceforge.net/> (GPL-v3).
27
28#![allow(clippy::cast_precision_loss, clippy::cast_possible_truncation)]
29
30use std::collections::HashSet;
31use std::fs::File;
32use std::path::Path;
33
34use noodles::bam;
35use noodles::sam::alignment::record::cigar::op::Kind;
36use rsomics_common::{Result, RsomicsError};
37use serde::Serialize;
38
39/// One row in the per-transcript output (`.tin.xls`).
40#[derive(Debug, Clone)]
41pub struct TinOutput {
42    pub gene_id: String,
43    pub chrom: String,
44    pub tx_start: i64,
45    pub tx_end: i64,
46    /// TIN score in [0, 100]. 0.0 means below minCov or zero coverage.
47    pub tin: f64,
48}
49
50/// Sample-level summary statistics emitted to `.summary.txt`.
51#[derive(Debug, Serialize)]
52pub struct TinSummary {
53    pub mean: f64,
54    pub median: f64,
55    pub stdev: f64,
56}
57
58/// One transcript parsed from BED12.
59#[derive(Debug, Clone)]
60struct Transcript {
61    gene_id: String,
62    chrom: String,
63    /// 0-based, half-open `[tx_start, tx_end)`
64    tx_start: i64,
65    tx_end: i64,
66    /// Exon blocks as 0-based half-open `[exon_start, exon_end)` in genomic coordinates.
67    exons: Vec<(i64, i64)>,
68}
69
70impl Transcript {
71    fn mrna_len(&self) -> i64 {
72        self.exons.iter().map(|(s, e)| e - s).sum()
73    }
74}
75
76/// Parse BED12 file into a list of transcripts.
77fn parse_bed12(path: &Path) -> Result<Vec<Transcript>> {
78    let content = std::fs::read_to_string(path)
79        .map_err(|e| RsomicsError::Io(std::io::Error::other(format!("reading BED12: {e}"))))?;
80
81    let mut transcripts = Vec::new();
82
83    for line in content.lines() {
84        if line.starts_with('#') || line.starts_with("track") || line.starts_with("browser") {
85            continue;
86        }
87        let fields: Vec<&str> = line.split('\t').collect();
88        if fields.len() < 12 {
89            continue;
90        }
91
92        let chrom = fields[0].to_string();
93        let Ok(tx_start) = fields[1].parse::<i64>() else {
94            continue;
95        };
96        let Ok(tx_end) = fields[2].parse::<i64>() else {
97            continue;
98        };
99        let gene_id = fields[3].to_string();
100
101        let Ok(block_count) = fields[9].parse::<usize>() else {
102            continue;
103        };
104
105        let block_sizes: Vec<i64> = fields[10]
106            .trim_end_matches(',')
107            .split(',')
108            .filter(|s| !s.is_empty())
109            .filter_map(|s| s.parse().ok())
110            .collect();
111        let block_starts: Vec<i64> = fields[11]
112            .trim_end_matches(',')
113            .split(',')
114            .filter(|s| !s.is_empty())
115            .filter_map(|s| s.parse().ok())
116            .collect();
117
118        if block_sizes.len() != block_count || block_starts.len() != block_count {
119            continue;
120        }
121
122        let exons: Vec<(i64, i64)> = block_starts
123            .iter()
124            .zip(block_sizes.iter())
125            .map(|(&rel_start, &size)| {
126                let abs_start = tx_start + rel_start;
127                (abs_start, abs_start + size)
128            })
129            .collect();
130
131        if exons.is_empty() {
132            continue;
133        }
134
135        transcripts.push(Transcript {
136            gene_id,
137            chrom,
138            tx_start,
139            tx_end,
140            exons,
141        });
142    }
143
144    Ok(transcripts)
145}
146
147/// Build the sampled genomic positions for a transcript, matching `tin.py` exactly.
148///
149/// Positions are 1-based genomic coordinates. Exon boundary positions are always
150/// included in addition to the equally-spaced sample. When `mrna_len <= sample_size`,
151/// every mRNA base is returned (all-bases mode).
152///
153/// Returns the sorted, deduplicated position list. Avoids materialising the full
154/// per-base mRNA index list by walking exons arithmetically.
155fn pick_positions(tx: &Transcript, sample_size: usize) -> Vec<i64> {
156    #[allow(clippy::cast_sign_loss)]
157    let mrna_len = tx.mrna_len() as usize;
158    if mrna_len == 0 {
159        return Vec::new();
160    }
161
162    // Collect exon boundary positions (1-based).
163    let mut exon_bounds: Vec<i64> = Vec::with_capacity(tx.exons.len() * 2);
164    for &(ex_start, ex_end) in &tx.exons {
165        exon_bounds.push(ex_start + 1);
166        exon_bounds.push(ex_end);
167    }
168
169    // Generate the equally-spaced sample without building the full per-base list.
170    // tin.py: step = int(mRNA_size / sample_size); indx = range(0, len(gene_all_base), step)
171    // gene_all_base[i] = genomic position of mRNA base at index i.
172    // We compute gene_all_base[i] by walking exons: find which exon contains mRNA index i.
173    let chose_bases: Vec<i64> = if mrna_len <= sample_size {
174        // All-bases mode: every mRNA position in genomic order.
175        let mut v: Vec<i64> = Vec::with_capacity(mrna_len);
176        for &(ex_start, ex_end) in &tx.exons {
177            for gpos in (ex_start + 1)..=ex_end {
178                v.push(gpos);
179            }
180        }
181        v
182    } else {
183        let step = mrna_len / sample_size;
184        // Walk exons to map mRNA index i → genomic 1-based position.
185        // Collect indices 0, step, 2*step, ... < mrna_len.
186        let mut sampled: Vec<i64> = Vec::with_capacity(mrna_len / step + 1);
187        let mut mrna_offset: usize = 0; // mRNA index of the start of the current exon
188        let mut next_idx: usize = 0; // next mRNA sample index to collect
189
190        for &(ex_start, ex_end) in &tx.exons {
191            #[allow(clippy::cast_sign_loss)]
192            let exon_len = (ex_end - ex_start) as usize;
193            // Indices for this exon: [mrna_offset, mrna_offset + exon_len)
194            // Sample indices in this range: next_idx, next_idx+step, ...
195            while next_idx < mrna_offset + exon_len {
196                let local = next_idx - mrna_offset; // offset within this exon
197                #[allow(clippy::cast_possible_wrap)]
198                let gpos = ex_start + 1 + local as i64; // 1-based genomic
199                sampled.push(gpos);
200                next_idx += step;
201            }
202            mrna_offset += exon_len;
203        }
204        sampled
205    };
206
207    // Merge exon bounds + sampled bases, deduplicate (first-seen wins), then sort.
208    // Mirrors tin.py: sorted(uniqify(exon_bounds + chose_bases)).
209    let mut seen: HashSet<i64> = HashSet::with_capacity(exon_bounds.len() + chose_bases.len());
210    let mut merged: Vec<i64> = Vec::with_capacity(exon_bounds.len() + chose_bases.len());
211    for pos in exon_bounds.iter().chain(chose_bases.iter()) {
212        if seen.insert(*pos) {
213            merged.push(*pos);
214        }
215    }
216    merged.sort_unstable();
217    merged
218}
219
220/// Shannon entropy `H = -sum(p_i * ln(p_i))` over non-zero values.
221///
222/// Returns 0.0 when all values are zero or the list is empty.
223fn shannon_entropy(vals: &[f64]) -> f64 {
224    let total: f64 = vals.iter().sum();
225    if total == 0.0 {
226        return 0.0;
227    }
228    vals.iter()
229        .filter(|&&v| v > 0.0)
230        .map(|&v| {
231            let p = v / total;
232            -p * p.ln()
233        })
234        .sum()
235}
236
237/// Compute TIN from a coverage vector and the total position count `l`.
238///
239/// Only non-zero coverage positions contribute to entropy; `l` is the denominator
240/// (total sampled positions including zero-coverage ones), matching `tin.py`.
241/// Returns 0.0 when total coverage is zero.
242#[must_use]
243pub fn compute_tin_score(mrna_cov: &[u32], n: usize) -> f64 {
244    let mrna_len = mrna_cov.len();
245    if mrna_len == 0 || n == 0 {
246        return 0.0;
247    }
248
249    let step = mrna_len / n;
250    if step == 0 {
251        return 0.0;
252    }
253
254    let sample: Vec<f64> = (0..n).map(|i| f64::from(mrna_cov[i * step])).collect();
255    let total: f64 = sample.iter().sum();
256    if total == 0.0 {
257        return 0.0;
258    }
259    let h = shannon_entropy(&sample);
260    #[allow(clippy::cast_precision_loss)]
261    let l = n as f64;
262    100.0 * h.exp() / l
263}
264
265/// Choose the number of sample positions: halve `n` while `n >= mrna_len`.
266///
267/// From `tin.py` help: "if this number is larger than the length of mRNA (L),
268/// it will be halved until it's smaller than L."
269#[must_use]
270pub fn effective_sample_size(mut n: usize, mrna_len: usize) -> usize {
271    while n >= mrna_len {
272        n /= 2;
273        if n == 0 {
274            return 0;
275        }
276    }
277    n
278}
279
280/// Per-transcript accumulator used during the linear BAM sweep.
281struct TxAccum {
282    positions: Vec<i64>,         // sorted, 1-based sampled genomic positions
283    coverage: Vec<f64>,          // coverage[i] corresponds to positions[i]
284    unique_starts: HashSet<i64>, // 0-based read starts within [tx_start, tx_end)
285}
286
287#[allow(clippy::too_many_lines)]
288/// Compute TIN for all transcripts in a BED12 from a sorted, indexed BAM.
289///
290/// Uses a single sequential BAM scan rather than per-transcript indexed seeks,
291/// avoiding repeated BGZF-decompress cycles when transcripts are dense.
292///
293/// # Parameters
294///
295/// - `bam_path`: sorted + indexed BAM (`.bai` index must exist).
296/// - `bed_path`: BED12 gene model.
297/// - `min_cov`: minimum unique read-start positions for a transcript (below → TIN=0). Default 10.
298/// - `sample_size`: initial number of equally-spaced mRNA positions to sample. Default 100.
299pub fn compute_tin(
300    bam_path: &Path,
301    bed_path: &Path,
302    min_cov: u64,
303    sample_size: usize,
304) -> Result<Vec<TinOutput>> {
305    let transcripts = parse_bed12(bed_path)?;
306
307    // Group transcripts by chromosome in BAM order.
308    // Within each chromosome, sort by tx_start for sweep-line ordering.
309    // We preserve the original index so we can emit results in BED12 order.
310    let mut tx_by_idx: Vec<(usize, &Transcript)> = transcripts.iter().enumerate().collect();
311    tx_by_idx.sort_unstable_by(|a, b| {
312        a.1.chrom
313            .cmp(&b.1.chrom)
314            .then(a.1.tx_start.cmp(&b.1.tx_start))
315    });
316
317    // Pre-compute sampled positions for every transcript.
318    let positions_vec: Vec<Vec<i64>> = transcripts
319        .iter()
320        .map(|tx| pick_positions(tx, sample_size))
321        .collect();
322
323    // Accumulator: one entry per transcript (in BED12 order, indexed by original index).
324    let mut accums: Vec<TxAccum> = positions_vec
325        .iter()
326        .map(|pos| TxAccum {
327            coverage: vec![0.0; pos.len()],
328            positions: pos.clone(),
329            unique_starts: HashSet::new(),
330        })
331        .collect();
332
333    // Open BAM for sequential scan (no index needed for the linear pass).
334    let file = File::open(bam_path)
335        .map_err(|e| RsomicsError::InvalidInput(format!("{}: {e}", bam_path.display())))?;
336    let mut reader = bam::io::Reader::new(file);
337    let header = reader.read_header().map_err(RsomicsError::Io)?;
338
339    // Build a chromosome → sorted list of (tx_start, tx_end, original_tx_index) mapping.
340    // For each BAM record we'll find overlapping transcripts.
341    let mut chrom_txs: std::collections::HashMap<String, Vec<(i64, i64, usize)>> =
342        std::collections::HashMap::new();
343    for (orig_idx, tx) in transcripts.iter().enumerate() {
344        chrom_txs
345            .entry(tx.chrom.clone())
346            .or_default()
347            .push((tx.tx_start, tx.tx_end, orig_idx));
348    }
349    // Sort each chrom's list by tx_start for binary-search lookup.
350    for v in chrom_txs.values_mut() {
351        v.sort_unstable_by_key(|&(s, _, _)| s);
352    }
353
354    let mut record = bam::Record::default();
355    loop {
356        match reader.read_record(&mut record) {
357            Ok(0) => break,
358            Ok(_) => {}
359            Err(e) => return Err(RsomicsError::Io(e)),
360        }
361
362        let flags = record.flags();
363        if flags.is_unmapped() || flags.is_secondary() || flags.is_qc_fail() {
364            continue;
365        }
366
367        let start_1based: i64 = match record.alignment_start() {
368            Some(Ok(pos)) => {
369                #[allow(clippy::cast_possible_wrap)]
370                let v = usize::from(pos) as i64;
371                v
372            }
373            _ => continue,
374        };
375        let start_0based = start_1based - 1;
376
377        // Determine reference name from the record's reference sequence ID.
378        let Some(Ok(ref_id)) = record.reference_sequence_id() else {
379            continue;
380        };
381        let chrom: &str = match header.reference_sequences().get_index(ref_id) {
382            Some((name, _)) => match std::str::from_utf8(name.as_ref()) {
383                Ok(s) => s,
384                Err(_) => continue,
385            },
386            None => continue,
387        };
388
389        let Some(tx_list) = chrom_txs.get(chrom) else {
390            continue;
391        };
392
393        // Compute read end from CIGAR (reference-consuming length).
394        let cigar = record.cigar();
395        let read_ref_len: i64 = cigar
396            .iter()
397            .filter_map(std::result::Result::ok)
398            .map(|op| match op.kind() {
399                Kind::Match
400                | Kind::SequenceMatch
401                | Kind::SequenceMismatch
402                | Kind::Deletion
403                | Kind::Skip => {
404                    #[allow(clippy::cast_possible_wrap)]
405                    {
406                        op.len() as i64
407                    }
408                }
409                _ => 0,
410            })
411            .sum();
412        // Find transcripts overlapping [start_0based, start_0based + read_ref_len) using
413        // binary search into the chromosome-sorted transcript list.
414        // A transcript overlaps the read if tx_start < search_end AND tx_end > start_0based.
415        let search_end = start_0based + read_ref_len; // exclusive 0-based read end
416        // First tx whose tx_start < search_end: leftmost tx to consider.
417        // We want tx_start < search_end (tx begins before read ends).
418        // All tx with tx_end > start_0based AND tx_start < search_end overlap.
419        let first_candidate = tx_list.partition_point(|&(s, _, _)| s < start_0based);
420        // Walk back to find transcripts that started before start_0based but extend past it.
421        // In a sorted list we might miss transcripts that started before but overlap.
422        // Simpler: scan all with tx_start < search_end (upper bound), check tx_end > start_0based.
423        let upper = tx_list.partition_point(|&(s, _, _)| s < search_end);
424
425        // Determine lower: we need tx_start such that tx could still overlap.
426        // Since BAM is sorted and we process records in order, transcripts whose tx_end <=
427        // start_0based are already fully processed. However we can't prune easily without
428        // tracking "active" set. Use a scan from 0..upper with tx_end > start_0based filter.
429        // For typical gene models (not insanely long transcripts) this is fast.
430        // Use binary search: find last tx whose tx_end could overlap.
431        // Actually just scan 0..upper and skip non-overlapping:
432        let lower = if first_candidate > 0 {
433            // Look back for transcripts that start before start_0based but may overlap
434            let mut lo = first_candidate;
435            while lo > 0 && tx_list[lo - 1].1 > start_0based {
436                lo -= 1;
437            }
438            lo
439        } else {
440            0
441        };
442
443        for &(tx_start, tx_end, orig_idx) in &tx_list[lower..upper] {
444            if tx_end <= start_0based {
445                continue; // no overlap
446            }
447            // This transcript overlaps the read.
448            let accum = &mut accums[orig_idx];
449            let positions = &accum.positions;
450
451            if positions.is_empty() {
452                continue;
453            }
454
455            // min_reads: track unique read starts within [tx_start, tx_end).
456            if start_0based >= tx_start && start_0based < tx_end {
457                accum.unique_starts.insert(start_0based);
458            }
459
460            // Coverage: walk CIGAR match intervals, find sampled positions within each.
461            let last_pos = *positions.last().unwrap();
462            #[allow(clippy::cast_sign_loss)]
463            let mut ref_cursor = start_1based as usize;
464            for op_result in cigar.iter() {
465                let Ok(op) = op_result else { break };
466                let len = op.len();
467                match op.kind() {
468                    Kind::Match | Kind::SequenceMatch | Kind::SequenceMismatch => {
469                        #[allow(clippy::cast_possible_wrap)]
470                        let block_start = ref_cursor as i64;
471                        #[allow(clippy::cast_possible_wrap)]
472                        let block_end = block_start + len as i64;
473                        if block_start > last_pos {
474                            break;
475                        }
476                        let lo = positions.partition_point(|&p| p < block_start);
477                        let hi = positions.partition_point(|&p| p < block_end);
478                        for idx in lo..hi {
479                            accum.coverage[idx] += 1.0;
480                        }
481                        ref_cursor += len;
482                    }
483                    Kind::Deletion | Kind::Skip => {
484                        ref_cursor += len;
485                    }
486                    Kind::SoftClip | Kind::HardClip | Kind::Pad | Kind::Insertion => {}
487                }
488            }
489        }
490    }
491
492    // Compute TIN from accumulated coverage.
493    let mut results: Vec<Option<TinOutput>> = (0..transcripts.len()).map(|_| None).collect();
494    for (orig_idx, tx) in transcripts.iter().enumerate() {
495        let accum = &accums[orig_idx];
496        let positions = &accum.positions;
497
498        let tin = if positions.is_empty() || accum.unique_starts.len() as u64 <= min_cov {
499            0.0
500        } else {
501            let h = shannon_entropy(&accum.coverage);
502            if h == 0.0 {
503                0.0
504            } else {
505                #[allow(clippy::cast_precision_loss)]
506                let l_f = positions.len() as f64;
507                100.0 * h.exp() / l_f
508            }
509        };
510
511        eprint!(" {} transcripts finished", orig_idx + 1);
512        results[orig_idx] = Some(TinOutput {
513            gene_id: tx.gene_id.clone(),
514            chrom: tx.chrom.clone(),
515            tx_start: tx.tx_start,
516            tx_end: tx.tx_end,
517            tin,
518        });
519    }
520
521    eprintln!();
522    Ok(results.into_iter().map(|r| r.unwrap()).collect())
523}
524
525/// Compute sample-level summary statistics over non-zero TIN scores.
526///
527/// Mirrors `tin.py`: only transcripts with TIN > 0 contribute to mean/median/stdev.
528/// Standard deviation uses the population formula (divide by N, not N-1).
529#[must_use]
530pub fn summarise(outputs: &[TinOutput]) -> TinSummary {
531    let nonzero: Vec<f64> = outputs
532        .iter()
533        .filter(|r| r.tin > 0.0)
534        .map(|r| r.tin)
535        .collect();
536
537    if nonzero.is_empty() {
538        return TinSummary {
539            mean: 0.0,
540            median: 0.0,
541            stdev: 0.0,
542        };
543    }
544
545    #[allow(clippy::cast_precision_loss)]
546    let n = nonzero.len() as f64;
547    let mean = nonzero.iter().sum::<f64>() / n;
548
549    let mut sorted = nonzero.clone();
550    sorted.sort_by(|a, b| a.partial_cmp(b).unwrap());
551    let median = if sorted.len() % 2 == 1 {
552        sorted[sorted.len() / 2]
553    } else {
554        let mid = sorted.len() / 2;
555        // Average of the two middle values for even-length arrays
556        sorted[mid - 1] / 2.0 + sorted[mid] / 2.0
557    };
558
559    // Population standard deviation (divides by N, matching numpy default)
560    let variance = nonzero.iter().map(|&x| (x - mean).powi(2)).sum::<f64>() / n;
561    let stdev = variance.sqrt();
562
563    TinSummary {
564        mean,
565        median,
566        stdev,
567    }
568}
569
570#[cfg(test)]
571mod tests {
572    use super::*;
573
574    #[test]
575    fn tin_uniform_100_positions() {
576        // All 100 positions have coverage 10 → p_i = 1/100 → H = ln(100) → TIN = 100
577        let cov: Vec<u32> = vec![10; 100];
578        let tin = compute_tin_score(&cov, 100);
579        assert!(
580            (tin - 100.0).abs() < 1e-10,
581            "expected TIN=100 for uniform, got {tin}"
582        );
583    }
584
585    #[test]
586    fn tin_single_spike() {
587        // All coverage at one position → H = 0 → TIN = exp(0)/n * 100 = 1.0
588        let mut cov = vec![0u32; 100];
589        cov[50] = 20;
590        let tin = compute_tin_score(&cov, 100);
591        assert!(
592            (tin - 1.0).abs() < 1e-10,
593            "expected TIN≈1 for spike, got {tin}"
594        );
595    }
596
597    #[test]
598    fn tin_zero_coverage() {
599        let cov = vec![0u32; 100];
600        let tin = compute_tin_score(&cov, 100);
601        assert!(tin == 0.0, "expected 0.0, got {tin}");
602    }
603
604    #[test]
605    fn effective_sample_size_no_halving() {
606        // n=100, mrna_len=1000: 100 < 1000, no change
607        assert_eq!(effective_sample_size(100, 1000), 100);
608    }
609
610    #[test]
611    fn effective_sample_size_halving_once() {
612        // n=100, mrna_len=100: 100 >= 100 → halve to 50 → 50 < 100
613        assert_eq!(effective_sample_size(100, 100), 50);
614    }
615
616    #[test]
617    fn effective_sample_size_halving_twice() {
618        // n=100, mrna_len=50: 100≥50 → 50≥50 → 25 < 50
619        assert_eq!(effective_sample_size(100, 50), 25);
620    }
621
622    #[test]
623    fn parse_bed12_single_exon() {
624        use std::io::Write;
625        let mut f = tempfile::NamedTempFile::new().unwrap();
626        writeln!(f, "chr1\t0\t1000\ttx1\t0\t+\t0\t1000\t0\t1\t1000,\t0,").unwrap();
627        let txs = parse_bed12(f.path()).unwrap();
628        assert_eq!(txs.len(), 1);
629        assert_eq!(txs[0].mrna_len(), 1000);
630        assert_eq!(txs[0].exons, vec![(0, 1000)]);
631    }
632
633    #[test]
634    fn parse_bed12_multi_exon() {
635        use std::io::Write;
636        let mut f = tempfile::NamedTempFile::new().unwrap();
637        // 3 exons: offsets 0,200,300 sizes 100,150,200 → absolute: [100,200],[300,450],[400,600]
638        writeln!(
639            f,
640            "chr1\t100\t600\ttx2\t0\t+\t100\t600\t0\t3\t100,150,200,\t0,200,300,"
641        )
642        .unwrap();
643        let txs = parse_bed12(f.path()).unwrap();
644        assert_eq!(txs[0].mrna_len(), 450);
645        assert_eq!(txs[0].exons, vec![(100, 200), (300, 450), (400, 600)]);
646    }
647
648    #[test]
649    fn pick_positions_single_exon() {
650        // tx: chr1:0-1000, one exon [0,1000), mrna_len=1000, sample_size=100
651        // step=10, gene_all_base=[1..=1000], indx=[0,10,...,990]
652        // chose_bases=[1,11,...,991], exon_bounds=[1,1000]
653        // uniqify([1,1000]+[1,11,...,991])=[1,1000,11,...,991] → sorted=[1,11,...,991,1000]
654        let tx = Transcript {
655            gene_id: "tx".into(),
656            chrom: "chr1".into(),
657            tx_start: 0,
658            tx_end: 1000,
659            exons: vec![(0, 1000)],
660        };
661        let positions = pick_positions(&tx, 100);
662        assert_eq!(positions.len(), 101, "expected 101 positions");
663        assert_eq!(positions[0], 1, "first position should be 1 (1-based)");
664        assert_eq!(
665            *positions.last().unwrap(),
666            1000,
667            "last position should be 1000"
668        );
669    }
670
671    #[test]
672    fn summarise_nonzero_only() {
673        let outputs = vec![
674            TinOutput {
675                gene_id: "a".into(),
676                chrom: "chr1".into(),
677                tx_start: 0,
678                tx_end: 100,
679                tin: 90.0,
680            },
681            TinOutput {
682                gene_id: "b".into(),
683                chrom: "chr1".into(),
684                tx_start: 0,
685                tx_end: 100,
686                tin: 0.0,
687            },
688            TinOutput {
689                gene_id: "c".into(),
690                chrom: "chr1".into(),
691                tx_start: 0,
692                tx_end: 100,
693                tin: 80.0,
694            },
695        ];
696        let s = summarise(&outputs);
697        assert!((s.mean - 85.0).abs() < 1e-10, "mean = {}", s.mean);
698        assert!((s.median - 85.0).abs() < 1e-10, "median = {}", s.median);
699        // pop stdev of [90, 80]: sqrt(((90-85)^2 + (80-85)^2) / 2) = sqrt(25) = 5
700        assert!((s.stdev - 5.0).abs() < 1e-10, "stdev = {}", s.stdev);
701    }
702}
rsomics_tin/lib.rs

rsomics_tin/
lib.rs