rustqc 0.1.0

Fast RNA-seq QC in a single pass: dupRadar, featureCounts, 8 RSeQC tools, preseq, samtools stats, and Qualimap — reimplemented in Rust
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
//! TIN (Transcript Integrity Number) analysis.
//!
//! Measures transcript integrity via Shannon entropy of read coverage
//! uniformity across sampled exonic positions. Reimplementation of
//! RSeQC's `tin.py` tool.

use rust_htslib::bam;
use std::collections::{HashMap, HashSet};
use std::hash::{BuildHasher, Hasher};
use std::io::Write;
use std::path::Path;

use anyhow::{Context, Result};
use indexmap::IndexMap;
use log::debug;
use rand::Rng;

use crate::gtf::Gene;

// ===================================================================
// Deterministic hash state for reproducible TIN
// ===================================================================

/// A [`BuildHasher`] whose hashers all start from one `u64` seed.
///
/// With `--tin-seed` the seed is fixed, so the [`HashSet`] layout (and
/// therefore threshold-crossing behaviour during parallel accumulation)
/// is identical across runs. Without a seed a random value is drawn,
/// preserving the default non-deterministic behaviour.
#[derive(Clone, Debug)]
pub(crate) struct TinHashState(u64);

impl BuildHasher for TinHashState {
    type Hasher = std::collections::hash_map::DefaultHasher;

    fn build_hasher(&self) -> Self::Hasher {
        // Every hasher begins from the same seeded state, so equal keys
        // hash identically across all sets sharing this state.
        let mut hasher = std::collections::hash_map::DefaultHasher::new();
        hasher.write_u64(self.0);
        hasher
    }
}

impl TinHashState {
    /// Seeded state when `seed` is `Some`; otherwise a freshly drawn
    /// random seed (the historical, non-deterministic default).
    fn new(seed: Option<u64>) -> Self {
        match seed {
            Some(s) => TinHashState(s),
            None => TinHashState(rand::rng().next_u64()),
        }
    }
}

// ===================================================================
// Data structures
// ===================================================================

/// A single transcript's sampling metadata.
#[derive(Debug, Clone)]
pub struct TranscriptSampling {
    /// Gene identifier (gene_id from GTF).
    pub gene_id: String,
    /// Chromosome name (original case, for output).
    pub chrom: String,
    /// Chromosome name uppercased (for lookup matching).
    pub chrom_upper: String,
    /// Transcript start position (0-based).
    pub tx_start: u64,
    /// Transcript end position (0-based, exclusive).
    pub tx_end: u64,
    /// Sorted exon blocks as (start, end) in 0-based half-open coords.
    #[allow(dead_code)] // stored for API completeness; values used during construction
    pub exon_regions: Vec<(u64, u64)>,
    /// Total exonic bases across all exon blocks.
    #[allow(dead_code)] // stored for API completeness; values used during construction
    pub exon_length: u64,
    /// Unique genomic positions sampled within exonic regions (sorted, deduped).
    /// Positions are 1-based, matching pileup semantics — see
    /// `sample_exonic_positions`.
    /// Used for coverage counting — each position gets one coverage slot.
    pub sampled_positions: Vec<u64>,
    /// Total position count including duplicates from upstream's `chose_bases`.
    /// Used as the denominator `l` in the TIN formula to match upstream behavior.
    /// For the small mRNA case, upstream does NOT dedup chose_bases, so
    /// `total_position_count > len(sampled_positions)` when boundary positions
    /// overlap with exonic positions.
    pub total_position_count: usize,
}

/// Index for fast lookup of sampled positions during BAM processing.
///
/// Maps chromosome → sorted list of (genomic_pos, transcript_idx, slot_idx)
/// for binary search during per-read dispatch. Chromosome keys are the
/// uppercased names (see `TranscriptSampling::chrom_upper`).
#[derive(Debug, Default)]
pub struct TinIndex {
    /// All transcripts with their sampling metadata.
    pub transcripts: Vec<TranscriptSampling>,
    /// Per-chromosome sorted position entries for binary search.
    /// Each entry: (genomic_position, transcript_index, slot_index).
    /// Positions are 1-based (from `sample_exonic_positions`).
    pub chrom_positions: HashMap<String, Vec<(u64, u32, u32)>>,
    /// Per-chromosome transcript spans for overlap pre-filtering.
    /// Each entry: (tx_start, tx_end, transcript_index), sorted by tx_start.
    pub chrom_spans: HashMap<String, Vec<(u64, u64, u32)>>,
}

/// Result of TIN computation for a single transcript.
#[derive(Debug)]
pub struct TinResult {
    /// Gene identifier.
    pub gene_id: String,
    /// Chromosome name.
    pub chrom: String,
    /// Transcript start (0-based).
    pub tx_start: u64,
    /// Transcript end (0-based, exclusive).
    pub tx_end: u64,
    /// Computed TIN score (0-100), or 0.0 if below coverage threshold.
    /// See `compute_tin` for the entropy-based formula.
    pub tin: f64,
    /// Whether the transcript passed the minimum coverage threshold.
    /// Only transcripts with `passed_threshold == true` contribute to
    /// summary statistics, matching upstream RSeQC's `sample_TINs` list.
    pub passed_threshold: bool,
}

/// Collection of all TIN results for a BAM file.
#[derive(Debug)]
pub struct TinResults {
    /// Per-transcript TIN scores.
    pub transcripts: Vec<TinResult>,
}

impl TinResults {
    /// Number of transcripts with computed TIN scores.
    pub fn len(&self) -> usize {
        self.transcripts.len()
    }

    /// True when no transcripts were scored.
    ///
    /// Provided alongside `len()` per Rust API convention
    /// (clippy: `len_without_is_empty`).
    pub fn is_empty(&self) -> bool {
        self.transcripts.is_empty()
    }
}

// ===================================================================
// TIN index construction
// ===================================================================

impl TinIndex {
    /// Build a TIN index from GTF gene annotations.
    ///
    /// Every transcript of every gene is emitted, matching RSeQC's
    /// per-transcript behaviour.
    pub fn from_genes(genes: &IndexMap<String, Gene>, sample_size: usize) -> Self {
        let mut out = TinIndex::default();

        for gene in genes.values() {
            for tx in &gene.transcripts {
                // Convert GTF 1-based inclusive exons to 0-based half-open.
                let exon_regions: Vec<(u64, u64)> = tx
                    .exons
                    .iter()
                    .map(|&(start, end)| (start.saturating_sub(1), end))
                    .collect();
                let exon_length: u64 = exon_regions.iter().map(|&(s, e)| e - s).sum();
                if exon_length == 0 {
                    continue;
                }

                // Transcript span is the first exon's start to the last exon's end.
                let tx_start = exon_regions.first().map_or(0, |r| r.0);
                let tx_end = exon_regions.last().map_or(0, |r| r.1);

                let (sampled_positions, total_position_count) =
                    sample_exonic_positions(&exon_regions, sample_size, tx_start, tx_end);
                if sampled_positions.is_empty() {
                    continue;
                }

                let chrom = gene.chrom.clone();
                out.add_transcript(TranscriptSampling {
                    gene_id: tx.transcript_id.clone(),
                    chrom_upper: chrom.to_uppercase(),
                    chrom,
                    tx_start,
                    tx_end,
                    exon_regions,
                    exon_length,
                    sampled_positions,
                    total_position_count,
                });
            }
        }

        out.build();
        out
    }

    /// Register one transcript's sampled positions and span
    /// (must be called before `build()`).
    fn add_transcript(&mut self, tx: TranscriptSampling) {
        let tx_idx = self.transcripts.len() as u32;

        // Append every sampled position to this chromosome's position list.
        let positions = self
            .chrom_positions
            .entry(tx.chrom_upper.clone())
            .or_default();
        for (slot, &pos) in tx.sampled_positions.iter().enumerate() {
            positions.push((pos, tx_idx, slot as u32));
        }

        // Record the transcript span for overlap pre-filtering.
        self.chrom_spans
            .entry(tx.chrom_upper.clone())
            .or_default()
            .push((tx.tx_start, tx.tx_end, tx_idx));

        self.transcripts.push(tx);
    }

    /// Sort per-chromosome data so lookups can binary search.
    fn build(&mut self) {
        self.chrom_positions
            .values_mut()
            .for_each(|v| v.sort_unstable());
        self.chrom_spans.values_mut().for_each(|v| v.sort_unstable());
        debug!(
            "TIN index: {} transcripts, {} chromosomes",
            self.transcripts.len(),
            self.chrom_positions.len()
        );
    }
}

/// Sample positions within exonic regions, matching upstream RSeQC `tin.py`'s
/// `genomic_positions()` algorithm.
///
/// Upstream uses 1-based genomic coordinates internally. All positions returned
/// here are 1-based to match pileup semantics (pysam's `pileupcolumn.pos + 1`).
///
/// When `mRNA_size <= sample_size`: returns ALL exonic bases (1-based) plus
/// `tx_start+1` and `tx_end` as boundary markers. The upstream does NOT dedup
/// these, so duplicates inflate `len(chose_bases)` which becomes the TIN
/// denominator `l`. We reproduce this by counting the non-unique positions.
/// When `mRNA_size > sample_size`: returns every `step_size`-th exonic base
/// plus exon boundary positions (first and last base of each exon), deduplicated
/// and sorted. The strided positions are computed per-exon from cumulative
/// offsets, so the full O(mRNA_size) exonic-base list is never materialized.
///
/// `n == 0` is treated as `step_size = 1` (sample every base) rather than
/// panicking on division by zero.
///
/// Returns `(unique_positions, total_count)` where:
/// - `unique_positions`: sorted, deduplicated genomic positions for coverage counting
/// - `total_count`: length including duplicates, used as TIN denominator `l`
fn sample_exonic_positions(
    exon_regions: &[(u64, u64)],
    n: usize,
    tx_start: u64,
    tx_end: u64,
) -> (Vec<u64>, usize) {
    let mrna_size: u64 = exon_regions.iter().map(|(s, e)| e - s).sum();
    if mrna_size == 0 {
        return (Vec::new(), 0);
    }

    // exon_regions are 0-based half-open: (start, end) means bases start..end-1.
    // 1-based equivalents: start+1 .. end (inclusive).
    if mrna_size <= n as u64 {
        // Small transcript: chose_bases = [tx_start+1, tx_end] + all exonic bases.
        // Upstream does NOT deduplicate — duplicates inflate len(chose_bases)
        // which is used as the TIN denominator `l`.
        let mut positions: Vec<u64> = Vec::with_capacity(mrna_size as usize + 2);
        positions.push(tx_start + 1); // transcript boundary (1-based)
        positions.push(tx_end); // transcript boundary (0-based exclusive = 1-based inclusive)
        for &(start, end) in exon_regions {
            positions.extend((start + 1)..=end);
        }
        positions.sort_unstable();
        let total_count = positions.len(); // includes duplicates

        // Dedup for coverage counting (pileup only returns unique positions)
        positions.dedup();
        (positions, total_count)
    } else {
        // Large transcript: strided sampling + exon boundaries.
        // step_size = int(mRNA_size / sample_size); guard n == 0 to avoid a
        // divide-by-zero panic.
        let step_size = if n == 0 {
            1
        } else {
            ((mrna_size as usize) / n).max(1)
        };

        let mut all_positions: Vec<u64> = Vec::new();

        // Exon boundary positions (1-based): first and last base of each exon.
        for &(start, end) in exon_regions {
            all_positions.push(start + 1);
            all_positions.push(end);
        }

        // Strided sampling over the virtual concatenation of all exonic bases:
        // equivalent to upstream's gene_all_base[::step_size] without building
        // the O(mRNA_size) intermediate list.
        let mut offset: usize = 0; // linear index of this exon's first base
        for &(start, end) in exon_regions {
            let len = (end - start) as usize;
            // First multiple of step_size at or past this exon's offset.
            let mut i = ((offset + step_size - 1) / step_size) * step_size;
            while i < offset + len {
                all_positions.push(start + 1 + (i - offset) as u64);
                i += step_size;
            }
            offset += len;
        }

        // uniqify (first-occurrence order) then sort, matching upstream's
        // uniqify(exon_bounds + chose_bases) followed by a sort.
        let mut seen = HashSet::new();
        all_positions.retain(|x| seen.insert(*x));
        all_positions.sort_unstable();

        let total_count = all_positions.len(); // already deduped by uniqify
        (all_positions, total_count)
    }
}

// ===================================================================
// TIN accumulator
// ===================================================================

/// Per-read accumulator for TIN computation.
///
/// Tracks coverage at sampled positions and unique read start positions
/// per transcript during the BAM pass. Unique start tracking matches
/// RSeQC's `check_min_reads()`, which uses a `set()` of read start
/// positions.
///
/// To reduce memory pressure, each transcript's `unique_starts` set is
/// capped at `min_cov + 1` entries. Once the threshold is exceeded the
/// set is cleared and `exceeded_threshold` is set, since only the
/// pass/fail decision matters for the TIN score.
#[derive(Debug)]
pub struct TinAccum {
    /// Per-transcript per-slot coverage counts.
    /// Indexed as `coverage[tx_idx][slot_idx]`.
    /// Duplicate-flagged reads (0x400) are excluded — see `process_read`.
    pub coverage: Vec<Vec<u32>>,
    /// Per-transcript unique read start positions, capped at `min_cov + 1`.
    /// Once exceeded, the set is drained and `exceeded_threshold[tx_idx]`
    /// is set instead.
    pub unique_starts: Vec<HashSet<u64, TinHashState>>,
    /// Per-transcript flag: true once unique start count exceeded `min_cov`.
    /// Avoids further HashSet inserts for high-coverage transcripts.
    pub exceeded_threshold: Vec<bool>,
    /// Number of sampled slots per transcript.
    #[allow(dead_code)]
    pub n_samples: Vec<u32>,
    /// Minimum MAPQ threshold (kept for API compatibility but not used;
    /// upstream tin.py does not filter by MAPQ).
    #[allow(dead_code)]
    pub mapq_cut: u8,
    /// Minimum coverage threshold (unique read start positions).
    pub min_cov: u32,
    /// Hash state used for unique_starts sets (deterministic when seeded).
    hash_state: TinHashState,
    /// Reusable scratch buffer for aligned blocks, avoids per-read allocation.
    blocks_buf: Vec<(u64, u64)>,
    /// Last chromosome seen -- used to detect chromosome transitions and
    /// reset the cursors below.
    cursor_chrom: String,
    /// Cursor into `chrom_spans` for the current chromosome. Since the BAM
    /// is coordinate-sorted, each read's span_end is >= the previous read's,
    /// so we only need to search forward from here.
    cursor_span: usize,
    /// Cursor into `chrom_positions` for the current chromosome. The leftmost
    /// aligned-block start of each read is >= the previous read's, so we
    /// search forward from here.
    cursor_pos: usize,
}

impl TinAccum {
    /// Create a new TIN accumulator for the given index.
    ///
    /// When `seed` is `Some(n)`, the internal hash sets use a deterministic
    /// hash function seeded from `n`, making TIN results reproducible across
    /// runs. When `None`, the default random hash state is used.
    pub fn new(index: &TinIndex, mapq_cut: u8, min_cov: u32, seed: Option<u64>) -> Self {
        let n_transcripts = index.transcripts.len();
        let mut coverage = Vec::with_capacity(n_transcripts);
        let mut n_samples = Vec::with_capacity(n_transcripts);

        let hash_state = TinHashState::new(seed);

        // One zeroed coverage slot per sampled position of each transcript.
        for tx in &index.transcripts {
            let n = tx.sampled_positions.len();
            coverage.push(vec![0u32; n]);
            n_samples.push(n as u32);
        }

        // All sets share the same hash state so equal keys hash identically
        // across transcripts and across merged accumulators.
        let unique_starts = (0..n_transcripts)
            .map(|_| HashSet::with_hasher(hash_state.clone()))
            .collect();

        TinAccum {
            coverage,
            unique_starts,
            exceeded_threshold: vec![false; n_transcripts],
            n_samples,
            mapq_cut,
            min_cov,
            hash_state,
            blocks_buf: Vec::with_capacity(8),
            cursor_chrom: String::new(),
            cursor_span: 0,
            cursor_pos: 0,
        }
    }

    /// Process a single BAM record for TIN accumulation.
    ///
    /// Updates coverage at sampled positions that overlap the read's
    /// aligned blocks, and tracks read start positions.
    ///
    /// Requires reads to arrive in coordinate-sorted order per chromosome;
    /// the forward-only cursors below depend on it.
    ///
    /// Matches upstream RSeQC `tin.py` filtering:
    /// - `check_min_reads()`: skips qcfail, unmapped, secondary only
    /// - `genebody_coverage()` pileup: skips is_del, qcfail, secondary, unmapped
    ///   (pysam pileup default flag_filter also excludes BAM_FDUP)
    /// - Neither function filters by MAPQ or supplementary flags
    pub fn process_read(&mut self, record: &bam::Record, chrom_upper: &str, index: &TinIndex) {
        let flags = record.flags();

        // Skip unmapped, QC-fail, secondary only (matching upstream tin.py)
        // 0x4 = BAM_FUNMAP, 0x200 = BAM_FQCFAIL, 0x100 = BAM_FSECONDARY
        if flags & 0x4 != 0 || flags & 0x200 != 0 || flags & 0x100 != 0 {
            return;
        }

        // Fill the reusable scratch buffer with aligned blocks from CIGAR,
        // avoiding a per-read Vec allocation.
        fill_aligned_blocks(record, &mut self.blocks_buf);
        let blocks = &self.blocks_buf;
        if blocks.is_empty() {
            return;
        }

        let read_start = blocks[0].0;

        // Get chromosome positions for this chromosome
        let chrom_positions = match index.chrom_positions.get(chrom_upper) {
            Some(p) => p,
            None => return,
        };

        // Get transcript spans for this chromosome
        let chrom_spans = match index.chrom_spans.get(chrom_upper) {
            Some(s) => s,
            None => return,
        };

        // Reset cursors on chromosome transition.
        if chrom_upper != self.cursor_chrom {
            self.cursor_chrom.clear();
            self.cursor_chrom.push_str(chrom_upper);
            self.cursor_span = 0;
            self.cursor_pos = 0;
        }

        // Find overlapping transcripts via read start.
        // chrom_spans is sorted by tx_start.  We need all transcripts
        // where tx_start <= read_start < tx_end.
        //
        // Since reads are coordinate-sorted, read_start is monotonically
        // non-decreasing.  The partition_point (first span where
        // tx_start > read_start) can only move forward, so we search
        // from cursor_span instead of 0.
        let span_end = self.cursor_span
            + chrom_spans[self.cursor_span..].partition_point(|s| s.0 <= read_start);
        self.cursor_span = span_end;

        for &(_tx_start, tx_end, tx_idx) in &chrom_spans[..span_end] {
            // _tx_start <= read_start is guaranteed by the partition_point
            if read_start < tx_end {
                let idx = tx_idx as usize;
                // Skip HashSet insert if this transcript already exceeded the
                // min_cov threshold -- we only need the pass/fail decision.
                if !self.exceeded_threshold[idx] {
                    self.unique_starts[idx].insert(read_start);
                    if self.unique_starts[idx].len() > self.min_cov as usize {
                        self.exceeded_threshold[idx] = true;
                        // Free the HashSet memory; the flag is sufficient.
                        self.unique_starts[idx] = HashSet::with_hasher(self.hash_state.clone());
                    }
                }
            }
        }

        // Skip duplicate reads for coverage counting.
        // Upstream tin.py uses pysam pileup() whose default flag_filter
        // (BAM_FUNMAP|BAM_FSECONDARY|BAM_FQCFAIL|BAM_FDUP = 0x704) excludes
        // duplicate-flagged reads from coverage columns. However, the upstream
        // check_min_reads() uses fetch() which does NOT filter duplicates, so
        // duplicates are allowed above for the unique_starts tracking.
        if flags & 0x400 != 0 {
            return;
        }

        // For each aligned block, find sampled positions that fall within it.
        // Sampled positions are 1-based; aligned blocks are 0-based half-open
        // [block_start, block_end). A 1-based position P is covered iff
        // P-1 >= block_start && P-1 < block_end, i.e., P > block_start && P <= block_end.
        //
        // Since reads are coordinate-sorted, block_start values are
        // monotonically non-decreasing across reads.  We advance cursor_pos
        // forward rather than binary-searching from scratch each time.
        let mut first_block = true;
        for &(block_start, block_end) in blocks.iter() {
            // Advance cursor to first position > block_start.
            // cursor_pos only moves forward across reads on the same chromosome.
            let base = self.cursor_pos;
            let pos_start = base + chrom_positions[base..].partition_point(|p| p.0 <= block_start);

            // Update cursor from the first aligned block (leftmost), since
            // the next read's first block_start >= this read's first block_start.
            if first_block {
                self.cursor_pos = pos_start;
                first_block = false;
            }

            for &(pos, tx_idx, slot_idx) in &chrom_positions[pos_start..] {
                if pos > block_end {
                    break;
                }
                // This sampled position is covered by this aligned block
                self.coverage[tx_idx as usize][slot_idx as usize] += 1;
            }
        }
    }

    /// Merge another accumulator into this one.
    ///
    /// Both accumulators must have been created from the same `TinIndex`:
    /// the per-transcript vectors are combined positionally by index.
    pub fn merge(&mut self, other: TinAccum) {
        for (i, other_cov) in other.coverage.into_iter().enumerate() {
            for (j, count) in other_cov.into_iter().enumerate() {
                self.coverage[i][j] += count;
            }
            // If either side already exceeded the threshold, the merged
            // result also exceeds it -- no need to union the sets.
            if self.exceeded_threshold[i] || other.exceeded_threshold[i] {
                self.exceeded_threshold[i] = true;
                self.unique_starts[i] = HashSet::with_hasher(self.hash_state.clone());
            } else {
                self.unique_starts[i].extend(other.unique_starts[i].iter());
                if self.unique_starts[i].len() > self.min_cov as usize {
                    self.exceeded_threshold[i] = true;
                    self.unique_starts[i] = HashSet::with_hasher(self.hash_state.clone());
                }
            }
        }
    }

    /// Compute TIN scores for all transcripts, consuming the accumulator.
    pub fn into_result(self, index: &TinIndex) -> TinResults {
        let mut transcripts = Vec::with_capacity(index.transcripts.len());

        for (i, tx) in index.transcripts.iter().enumerate() {
            // A transcript passes if the exceeded_threshold flag was set
            // (unique starts > min_cov during accumulation or merge), or
            // if the remaining set still exceeds the threshold.
            let passed =
                self.exceeded_threshold[i] || self.unique_starts[i].len() as u32 > self.min_cov;
            let coverage = &self.coverage[i];

            // Upstream: `if len(read_count) > cutoff` -- strictly greater
            if !passed {
                transcripts.push(TinResult {
                    gene_id: tx.gene_id.clone(),
                    chrom: tx.chrom.clone(),
                    tx_start: tx.tx_start,
                    tx_end: tx.tx_end,
                    tin: 0.0,
                    passed_threshold: false,
                });
                continue;
            }

            // Use total_position_count (which includes duplicates from upstream's
            // chose_bases list) as the denominator, matching upstream's l = len(pick_positions)
            let n_total = tx.total_position_count;
            let tin = compute_tin(coverage, n_total);

            transcripts.push(TinResult {
                gene_id: tx.gene_id.clone(),
                chrom: tx.chrom.clone(),
                tx_start: tx.tx_start,
                tx_end: tx.tx_end,
                tin,
                passed_threshold: true,
            });
        }

        TinResults { transcripts }
    }
}

/// Compute TIN score from coverage vector using Shannon entropy.
///
/// Matches upstream RSeQC's `tin_score()`:
///   cvg_eff = [c for c in cvg if c > 0]
///   entropy = shannon_entropy(cvg_eff)
///   tin = 100 * exp(entropy) / l
///
/// where `l` is the total number of sampled positions (including zeros),
/// NOT just the nonzero count. A transcript with many uncovered positions
/// gets a lower TIN because the denominator is larger.
///
/// Returns 0.0 when there are no sampled positions or no covered positions.
fn compute_tin(coverage: &[u32], n_total_positions: usize) -> f64 {
    if n_total_positions == 0 {
        return 0.0;
    }

    // Sum of the non-zero coverage values. Each contributing value is >= 1,
    // so `total == 0.0` iff there were no covered positions at all — the
    // separate emptiness check of the old Vec-collecting version is folded
    // into this single test, and no intermediate Vec is allocated.
    let total: f64 = coverage
        .iter()
        .filter(|&&c| c > 0)
        .map(|&c| c as f64)
        .sum();
    if total == 0.0 {
        return 0.0;
    }

    // Shannon entropy over non-zero positions only (matching upstream).
    // Same filtered iteration order as the original, so the floating-point
    // summation is identical.
    let entropy: f64 = coverage
        .iter()
        .filter(|&&c| c > 0)
        .map(|&c| {
            let p = c as f64 / total;
            -p * p.ln()
        })
        .sum();

    // Upstream: tin = 100 * exp(entropy) / l
    // where l = total number of sampled positions (not just nonzero)
    100.0 * entropy.exp() / n_total_positions as f64
}

/// Fill `buf` with the aligned reference blocks of `record`'s CIGAR as
/// sorted (start, end) pairs in 0-based half-open coordinates.
///
/// Reuses `buf`'s existing capacity to avoid a per-read heap allocation.
fn fill_aligned_blocks(record: &bam::Record, buf: &mut Vec<(u64, u64)>) {
    use rust_htslib::bam::record::Cigar;

    buf.clear();
    let mut ref_pos = record.pos() as u64;

    for op in record.cigar().iter() {
        match op {
            // Ops consuming both query and reference contribute a block.
            Cigar::Match(len) | Cigar::Equal(len) | Cigar::Diff(len) => {
                let end = ref_pos + *len as u64;
                buf.push((ref_pos, end));
                ref_pos = end;
            }
            // Reference-only ops advance the cursor without a block.
            Cigar::Del(len) | Cigar::RefSkip(len) => {
                ref_pos += *len as u64;
            }
            // Query-only / clipping / padding ops do not touch the reference.
            Cigar::Ins(_) | Cigar::SoftClip(_) | Cigar::HardClip(_) | Cigar::Pad(_) => {}
        }
    }
}

// ===================================================================
// Output writers
// ===================================================================

/// Write per-transcript TIN scores in RSeQC format.
///
/// Format: TSV with header geneID\tchrom\ttx_start\ttx_end\tTIN
///
/// # Errors
/// Returns an error if the output file cannot be created or written.
pub fn write_tin(results: &TinResults, output_path: &Path) -> Result<()> {
    let file = std::fs::File::create(output_path)
        .with_context(|| format!("Failed to create TIN output: {}", output_path.display()))?;
    // Buffer the per-transcript writeln! calls; an unbuffered File would
    // pay a write syscall per line.
    let mut f = std::io::BufWriter::new(file);

    writeln!(f, "geneID\tchrom\ttx_start\ttx_end\tTIN")?;

    for r in &results.transcripts {
        // Match upstream formatting:
        // - Failed check_min_reads: explicitly writes "0.0" (Python float literal)
        // - Passed but tin_score returned int 0 (empty coverage): writes "0"
        // - Non-zero TIN: writes float value
        if !r.passed_threshold {
            // Below coverage threshold → upstream writes explicit 0.0
            writeln!(
                f,
                "{}\t{}\t{}\t{}\t0.0",
                r.gene_id, r.chrom, r.tx_start, r.tx_end
            )?;
        } else if r.tin == 0.0 {
            // Passed threshold but zero TIN → upstream tin_score returns int 0
            writeln!(
                f,
                "{}\t{}\t{}\t{}\t0",
                r.gene_id, r.chrom, r.tx_start, r.tx_end
            )?;
        } else {
            writeln!(
                f,
                "{}\t{}\t{}\t{}\t{}",
                r.gene_id, r.chrom, r.tx_start, r.tx_end, r.tin
            )?;
        }
    }

    // Flush explicitly so buffered write errors surface as Err instead of
    // being silently swallowed by Drop.
    f.flush()?;
    Ok(())
}

/// Write TIN summary statistics.
///
/// Format: Bam_file\tTIN(mean)\tTIN(median)\tTIN(stdev)
///
/// Only transcripts that passed the coverage threshold contribute,
/// matching upstream RSeQC's `sample_TINs` list. The stdev uses the
/// population formula (denominator `n`, not `n - 1`).
///
/// # Errors
/// Returns an error if the output file cannot be created or written.
pub fn write_tin_summary(results: &TinResults, bam_name: &str, output_path: &Path) -> Result<()> {
    let file = std::fs::File::create(output_path)
        .with_context(|| format!("Failed to create TIN summary: {}", output_path.display()))?;
    // Buffered writes + explicit flush so write errors surface as Err.
    let mut f = std::io::BufWriter::new(file);

    let scores: Vec<f64> = results
        .transcripts
        .iter()
        .filter(|r| r.passed_threshold)
        .map(|r| r.tin)
        .collect();

    let (mean, median, stdev) = if scores.is_empty() {
        (0.0, 0.0, 0.0)
    } else {
        let n = scores.len() as f64;
        let mean = scores.iter().sum::<f64>() / n;

        let median = crate::io::median(&scores);

        // Population variance: divide by n, not n - 1.
        let variance = scores.iter().map(|x| (x - mean).powi(2)).sum::<f64>() / n;
        let stdev = variance.sqrt();

        (mean, median, stdev)
    };

    writeln!(f, "Bam_file\tTIN(mean)\tTIN(median)\tTIN(stdev)")?;
    writeln!(f, "{}\t{}\t{}\t{}", bam_name, mean, median, stdev)?;

    f.flush()?;
    Ok(())
}

// ===================================================================
// Unit tests
// ===================================================================

#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn test_sample_exonic_positions() {
        // One 10-bp exon, 0-based half-open [100, 110) as in BED.
        // mRNA_size (10) > sample_size (5) takes the strided path:
        //   step_size = 10 / 5 = 2
        //   1-based exonic bases 101..=110, exon bounds {101, 110}
        //   stride over indices 0,2,4,6,8 -> 101,103,105,107,109
        //   merged with bounds, deduped -> 101,103,105,107,109,110
        let exons = vec![(100, 110)];
        let (pos, n_total) = sample_exonic_positions(&exons, 5, 100, 110);
        // Five strided picks plus the exon-end boundary 110; the large-mRNA
        // path dedups, so the raw count equals the unique count.
        assert_eq!(pos.len(), 6);
        assert_eq!(n_total, 6);
        // Sampled coordinates come back 1-based.
        assert_eq!(pos[0], 101);
        assert_eq!(pos[5], 110);
    }

    #[test]
    fn test_sample_multi_exon() {
        // Two 5-bp exons, [100,105) and [200,205): 10 exonic bases total.
        // mRNA_size (10) <= sample_size (10), so every exonic base is kept
        // along with the tx boundary pair, duplicates included in the raw
        // count: [101, 205] + [101..=105] + [201..=205]
        //   -> 12 raw picks, 10 unique positions.
        let exons = vec![(100, 105), (200, 205)];
        let (pos, n_total) = sample_exonic_positions(&exons, 10, 100, 205);
        assert_eq!(pos.len(), 10);
        assert_eq!(n_total, 12); // boundary bases 101 and 205 counted twice
        assert_eq!(pos[0], 101);
        assert_eq!(pos[9], 205);
    }

    #[test]
    fn test_sample_tiny_exon() {
        // A 3-bp exon [100, 103), far below sample_size = 100.
        // Small-mRNA path: boundaries [101, 103] + exonic [101, 102, 103]
        //   -> raw count 5, unique positions 3.
        let exons = vec![(100, 103)];
        let (pos, n_total) = sample_exonic_positions(&exons, 100, 100, 103);
        assert_eq!(pos.len(), 3);
        assert_eq!(n_total, 5);
        assert_eq!(pos[0], 101);
        assert_eq!(pos[2], 103);
    }

    #[test]
    fn test_compute_tin_uniform() {
        // Even coverage at every position maximizes entropy -> TIN ~ 100.
        let cov = vec![10; 10];
        let tin = compute_tin(&cov, cov.len());
        assert!(
            (tin - 100.0).abs() < 0.01,
            "Uniform coverage TIN should be ~100: {tin}"
        );
    }

    #[test]
    fn test_compute_tin_uniform_with_zeros() {
        // Only 5 of 10 positions covered, all equally: entropy over five
        // equal weights gives exp(H) = 5, so TIN = 100 * 5 / 10 = 50.
        let mut cov = vec![10; 5];
        cov.extend(vec![0; 5]);
        let tin = compute_tin(&cov, cov.len());
        assert!(
            (tin - 50.0).abs() < 0.01,
            "Half-covered uniform TIN should be ~50: {tin}"
        );
    }

    #[test]
    fn test_compute_tin_degraded() {
        // Coverage high at one end and decaying toward the other mimics a
        // degraded transcript; TIN must land strictly inside (0, 100).
        let cov = vec![100, 80, 60, 40, 20, 10, 5, 2, 1, 1];
        let tin = compute_tin(&cov, cov.len());
        assert!(
            tin > 0.0 && tin < 100.0,
            "Degraded TIN should be between 0 and 100: {tin}"
        );
    }

    #[test]
    fn test_compute_tin_all_zero() {
        // No coverage anywhere -> TIN is exactly 0.
        let cov = vec![0; 5];
        let tin = compute_tin(&cov, cov.len());
        assert_eq!(tin, 0.0);
    }

    #[test]
    fn test_compute_tin_single_nonzero() {
        // A single covered base: entropy H = 0, exp(0) = 1,
        // so TIN = 100 * 1 / 5 = 20.
        let cov = vec![0, 0, 100, 0, 0];
        let tin = compute_tin(&cov, cov.len());
        assert!(
            (tin - 20.0).abs() < 0.01,
            "Single position coverage should give TIN=20: {tin}"
        );
    }
}