rustqc 0.1.0

Fast RNA-seq QC in a single pass: dupRadar, featureCounts, 8 RSeQC tools, preseq, samtools stats, and Qualimap — reimplemented in Rust
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
//! GTF annotation file parser.
//!
//! Parses GTF/GFF2 format files to extract gene, transcript, and exon
//! information. Computes effective gene lengths as the total number of
//! non-overlapping exon bases (matching featureCounts behavior).
//! Also extracts transcript-level structure (exon blocks, CDS ranges) for
//! use by RSeQC-equivalent tools.

use anyhow::{Context, Result};
use indexmap::IndexMap;
use std::collections::HashMap;
use std::io::BufRead;

/// Represents a single exon interval.
#[derive(Debug, Clone)]
pub struct Exon {
    /// Chromosome/contig name
    pub chrom: String,
    /// Start position (1-based, inclusive, as in GTF)
    pub start: u64,
    /// End position (1-based, inclusive, as in GTF)
    pub end: u64,
    /// Strand: '+', '-', or '.'
    pub strand: char,
}

/// Represents a single transcript with its exon blocks and optional CDS range.
///
/// Transcript-level structure is needed by RSeQC-equivalent tools that require
/// per-transcript exon/intron/CDS information (e.g. read_distribution,
/// junction_annotation, inner_distance).
#[derive(Debug, Clone)]
pub struct Transcript {
    /// Transcript identifier (from transcript_id attribute)
    pub transcript_id: String,
    /// Chromosome/contig name
    pub chrom: String,
    /// Transcript start (minimum of all exon starts, 1-based inclusive)
    pub start: u64,
    /// Transcript end (maximum of all exon ends, 1-based inclusive)
    pub end: u64,
    /// Strand: '+', '-', or '.'
    pub strand: char,
    /// Sorted exon blocks as (start, end) pairs (1-based inclusive)
    pub exons: Vec<(u64, u64)>,
    /// CDS start (minimum of all CDS feature starts, 1-based inclusive).
    /// `None` for non-coding transcripts.
    pub cds_start: Option<u64>,
    /// CDS end (maximum of all CDS feature ends, 1-based inclusive).
    /// `None` for non-coding transcripts.
    pub cds_end: Option<u64>,
}

/// Represents a gene with all its exons and transcripts.
#[derive(Debug, Clone)]
pub struct Gene {
    /// Gene identifier (from gene_id attribute)
    pub gene_id: String,
    /// Chromosome/contig name (populated from GTF, used by downstream tools)
    pub chrom: String,
    /// Gene start (minimum of all exon starts, 1-based)
    pub start: u64,
    /// Gene end (maximum of all exon ends, 1-based)
    pub end: u64,
    /// Strand: '+', '-', or '.' (populated from GTF, used by downstream tools)
    pub strand: char,
    /// All exons belonging to this gene (flattened across transcripts)
    pub exons: Vec<Exon>,
    /// Effective length: total non-overlapping exon bases
    pub effective_length: u64,
    /// Additional extracted attributes (e.g. gene_biotype, gene_name).
    ///
    /// Keys are attribute names, values are the attribute values from the GTF.
    /// Only attributes requested via `extra_attributes` in [`parse_gtf`] are stored.
    pub attributes: HashMap<String, String>,
    /// Per-transcript exon/CDS structure, built from transcript_id grouping.
    ///
    /// This is populated during GTF parsing and provides the transcript-level
    /// detail needed by RSeQC-equivalent tools. May be empty if the GTF lacks
    /// transcript_id attributes.
    pub transcripts: Vec<Transcript>,
}

/// Parse a GTF attribute string to extract a specific attribute value.
///
/// GTF attributes are semicolon-separated key-value pairs like:
/// `gene_id "ENSG00000223972"; gene_name "DDX11L1";`
fn get_attribute(attributes: &str, key: &str) -> Option<String> {
    for attr in attributes.split(';') {
        let attr = attr.trim();
        if attr.is_empty() {
            continue;
        }
        // Split on first whitespace
        if let Some(pos) = attr.find(|c: char| c.is_whitespace()) {
            let (k, v) = attr.split_at(pos);
            if k.trim() == key {
                // Remove surrounding quotes and whitespace
                let v = v.trim().trim_matches('"');
                return Some(v.to_string());
            }
        }
    }
    None
}

/// Compute the total number of non-overlapping bases across a set of intervals.
///
/// This matches featureCounts behavior for computing effective gene length:
/// merge overlapping exons and sum their lengths.
fn compute_non_overlapping_length(exons: &[Exon]) -> u64 {
    if exons.is_empty() {
        return 0;
    }

    // Collect all intervals as (start, end) and sort by start
    let mut intervals: Vec<(u64, u64)> = exons.iter().map(|e| (e.start, e.end)).collect();
    intervals.sort_unstable();

    // Merge overlapping intervals
    let mut total_bases: u64 = 0;
    let mut current_start = intervals[0].0;
    let mut current_end = intervals[0].1;

    for &(start, end) in &intervals[1..] {
        if start <= current_end + 1 {
            // Overlapping or adjacent, extend
            current_end = current_end.max(end);
        } else {
            // Non-overlapping, count previous interval
            total_bases += current_end - current_start + 1;
            current_start = start;
            current_end = end;
        }
    }
    // Count the last interval
    total_bases += current_end - current_start + 1;

    total_bases
}

/// Intermediate accumulator for building a transcript from parsed GTF features.
struct TranscriptBuilder {
    /// Transcript identifier
    transcript_id: String,
    /// Chromosome (from first exon)
    chrom: String,
    /// Strand (from first exon)
    strand: char,
    /// Exon intervals: (start, end), 1-based inclusive
    exons: Vec<(u64, u64)>,
    /// CDS intervals: (start, end), 1-based inclusive
    cds: Vec<(u64, u64)>,
    /// start_codon intervals: (start, end), 1-based inclusive
    start_codons: Vec<(u64, u64)>,
    /// stop_codon intervals: (start, end), 1-based inclusive
    stop_codons: Vec<(u64, u64)>,
}

/// Parse a GTF file and return a map of gene_id -> Gene.
///
/// Extracts all exon and CDS features, groups them by gene_id, and builds
/// transcript-level structures by grouping exons/CDS by transcript_id.
/// Computes effective gene length from non-overlapping exon bases.
/// Returns genes in the order they are first encountered in the GTF.
///
/// # Arguments
/// * `path` - Path to the GTF annotation file
/// * `extra_attributes` - Additional GTF attribute names to extract (e.g. `["gene_biotype", "gene_name"]`)
///
/// # Returns
/// An IndexMap preserving insertion order of gene_id -> Gene
pub fn parse_gtf(path: &str, extra_attributes: &[String]) -> Result<IndexMap<String, Gene>> {
    let reader = crate::io::open_reader(path)
        .with_context(|| format!("Failed to open GTF file: {}", path))?;

    let mut genes: IndexMap<String, Gene> = IndexMap::new();

    // Accumulate transcript-level data: (gene_id, transcript_id) -> TranscriptBuilder
    // Use IndexMap to preserve insertion order within each gene
    let mut tx_builders: IndexMap<(String, String), TranscriptBuilder> = IndexMap::new();

    for line in reader.lines() {
        let line = line.context("Failed to read line from GTF file")?;

        // Skip comments and empty lines
        if line.starts_with('#') || line.is_empty() {
            continue;
        }

        let fields: Vec<&str> = line.split('\t').collect();
        if fields.len() < 9 {
            continue;
        }

        let feature_type = fields[2];

        // We care about exon, CDS, start_codon, and stop_codon features
        if feature_type != "exon"
            && feature_type != "CDS"
            && feature_type != "start_codon"
            && feature_type != "stop_codon"
        {
            continue;
        }

        let chrom = fields[0].to_string();
        let start: u64 = fields[3]
            .parse()
            .with_context(|| format!("Invalid start position: {}", fields[3]))?;
        let end: u64 = fields[4]
            .parse()
            .with_context(|| format!("Invalid end position: {}", fields[4]))?;
        anyhow::ensure!(
            start <= end,
            "Malformed GTF: start ({}) > end ({}) for {} on {}",
            start,
            end,
            feature_type,
            fields[0]
        );
        let strand = fields[6].chars().next().unwrap_or('.');

        let attr_str = fields[8];
        let gene_id = match get_attribute(attr_str, "gene_id") {
            Some(id) => id,
            None => continue, // Skip features without gene_id
        };

        // For exon features, update the gene-level data (unchanged from original)
        if feature_type == "exon" {
            let exon = Exon {
                chrom: chrom.clone(),
                start,
                end,
                strand,
            };

            // Extract extra attributes from the first exon encountered for this gene
            genes
                .entry(gene_id.clone())
                .and_modify(|gene| {
                    gene.start = gene.start.min(start);
                    gene.end = gene.end.max(end);
                    gene.exons.push(exon.clone());
                })
                .or_insert_with(|| {
                    let mut attrs = HashMap::new();
                    for attr_name in extra_attributes {
                        if let Some(val) = get_attribute(attr_str, attr_name) {
                            attrs.insert(attr_name.clone(), val);
                        }
                    }
                    Gene {
                        gene_id: gene_id.clone(),
                        chrom: chrom.clone(),
                        start,
                        end,
                        strand,
                        exons: vec![exon],
                        effective_length: 0, // computed later
                        attributes: attrs,
                        transcripts: Vec::new(), // populated later
                    }
                });
        }

        // Accumulate transcript-level data for both exon and CDS features.
        // Features without transcript_id are grouped under a synthetic key.
        let transcript_id = get_attribute(attr_str, "transcript_id")
            .unwrap_or_else(|| format!("{}__no_tx", gene_id));

        let key = (gene_id.clone(), transcript_id.clone());
        let builder = tx_builders.entry(key).or_insert_with(|| TranscriptBuilder {
            transcript_id,
            chrom: chrom.clone(),
            strand,
            exons: Vec::new(),
            cds: Vec::new(),
            start_codons: Vec::new(),
            stop_codons: Vec::new(),
        });

        match feature_type {
            "exon" => builder.exons.push((start, end)),
            "CDS" => builder.cds.push((start, end)),
            "start_codon" => builder.start_codons.push((start, end)),
            "stop_codon" => builder.stop_codons.push((start, end)),
            _ => unreachable!(),
        }
    }

    // Warn if no genes were extracted — likely indicates a format problem
    if genes.is_empty() {
        log::warn!(
            "No genes extracted from GTF file '{}'. Check that the file is in GTF (not GFF3) \
             format and contains exon features with gene_id attributes.",
            path
        );
    }

    // Build Transcript structs from accumulated data and attach to genes
    for ((gene_id, _), mut builder) in tx_builders {
        // Skip transcript builders that have no exons (CDS-only entries are useless)
        if builder.exons.is_empty() {
            continue;
        }

        // Sort exon blocks by start position
        builder.exons.sort_unstable();

        let tx_start = builder.exons.first().map(|e| e.0).unwrap_or(0);
        let tx_end = builder.exons.last().map(|e| e.1).unwrap_or(0);

        // Compute CDS range (thickStart/thickEnd) using start_codon/stop_codon
        // positions with strand-aware logic matching the Perl gtf2bed script
        // used by upstream nf-core/rnaseq.
        //
        // The Perl script stores the start of the last start_codon line and the
        // end of the last stop_codon line (overwriting on each occurrence).
        // For split codons spanning exon boundaries, multiple GTF lines exist
        // for the same codon; the Perl script keeps the last line's values.
        //
        // For + strand: thick_start = last_sc_start, thick_end = last_stc_end
        // For - strand: the Perl script swaps and adjusts:
        //   thick_start = last_stc_end - 2, thick_end = last_sc_start + 2
        //   (this reconstructs the codon span from a single position,
        //    correct for non-split codons and matching upstream behavior)
        // Missing codons fall back to transcript boundaries (tx_start/tx_end).
        // Transcripts with CDS features but no codon features get tx_start/tx_end.
        let (cds_start, cds_end) = if builder.cds.is_empty() {
            (None, None)
        } else {
            let has_start_codon = !builder.start_codons.is_empty();
            let has_stop_codon = !builder.stop_codons.is_empty();

            let thick_start;
            let thick_end;

            if has_start_codon || has_stop_codon {
                // Use the last codon line seen (matching Perl overwrite semantics).
                // For split codons across exon boundaries, multiple GTF lines exist;
                // the Perl script simply overwrites with each line, keeping the last.
                let last_start_codon_start = builder.start_codons.last().map(|c| c.0);
                let last_stop_codon_end = builder.stop_codons.last().map(|c| c.1);

                match builder.strand {
                    '+' => {
                        // + strand: thick_start = last start_codon start
                        //           thick_end   = last stop_codon end
                        thick_start = last_start_codon_start.unwrap_or(tx_start);
                        thick_end = last_stop_codon_end.unwrap_or(tx_end);
                    }
                    '-' => {
                        // - strand: Perl swaps then adjusts:
                        //   thick_start = last_stop_codon_end - 2
                        //   thick_end   = last_start_codon_start + 2
                        thick_start = last_stop_codon_end
                            .map(|e| e.saturating_sub(2))
                            .unwrap_or(tx_start);
                        thick_end = last_start_codon_start.map(|s| s + 2).unwrap_or(tx_end);
                    }
                    _ => {
                        // Unknown strand: fall back to transcript boundaries
                        thick_start = tx_start;
                        thick_end = tx_end;
                    }
                }
            } else {
                // CDS features but no codon features: use transcript boundaries
                thick_start = tx_start;
                thick_end = tx_end;
            }

            (Some(thick_start), Some(thick_end))
        };

        let transcript = Transcript {
            transcript_id: builder.transcript_id,
            chrom: builder.chrom,
            start: tx_start,
            end: tx_end,
            strand: builder.strand,
            exons: builder.exons,
            cds_start,
            cds_end,
        };

        // Attach to the parent gene (which must exist since we only created
        // transcript builders for features that also had exon lines)
        if let Some(gene) = genes.get_mut(&gene_id) {
            gene.transcripts.push(transcript);
        }
    }

    // Compute effective lengths for all genes
    for gene in genes.values_mut() {
        gene.effective_length = compute_non_overlapping_length(&gene.exons);
    }

    Ok(genes)
}

/// Check whether a given attribute name exists in a GTF file.
///
/// Scans up to `max_lines` data lines (non-comment, non-empty) and returns
/// `true` if the attribute is found in at least one exon feature.
pub fn attribute_exists_in_gtf(path: &str, attribute_name: &str, max_lines: usize) -> bool {
    let reader = match crate::io::open_reader(path) {
        Ok(r) => r,
        Err(_) => return false,
    };
    let mut checked = 0;
    for line in reader.lines() {
        let line = match line {
            Ok(l) => l,
            Err(_) => break,
        };
        if line.starts_with('#') || line.is_empty() {
            continue;
        }
        let fields: Vec<&str> = line.split('\t').collect();
        if fields.len() < 9 || fields[2] != "exon" {
            continue;
        }
        if get_attribute(fields[8], attribute_name).is_some() {
            return true;
        }
        checked += 1;
        if checked >= max_lines {
            break;
        }
    }
    false
}

#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn test_get_attribute() {
        let attrs =
            r#"gene_id "ENSG00000223972"; transcript_id "ENST00000456328"; gene_name "DDX11L1";"#;
        assert_eq!(
            get_attribute(attrs, "gene_id"),
            Some("ENSG00000223972".to_string())
        );
        assert_eq!(
            get_attribute(attrs, "gene_name"),
            Some("DDX11L1".to_string())
        );
        assert_eq!(get_attribute(attrs, "missing"), None);
    }

    #[test]
    fn test_non_overlapping_length() {
        // Single exon: 100-200 = 101 bases
        let exons = vec![Exon {
            chrom: "chr1".to_string(),
            start: 100,
            end: 200,
            strand: '+',
        }];
        assert_eq!(compute_non_overlapping_length(&exons), 101);

        // Two non-overlapping exons
        let exons = vec![
            Exon {
                chrom: "chr1".to_string(),
                start: 100,
                end: 200,
                strand: '+',
            },
            Exon {
                chrom: "chr1".to_string(),
                start: 300,
                end: 400,
                strand: '+',
            },
        ];
        assert_eq!(compute_non_overlapping_length(&exons), 202);

        // Two overlapping exons: 100-200, 150-250 -> merged 100-250 = 151
        let exons = vec![
            Exon {
                chrom: "chr1".to_string(),
                start: 100,
                end: 200,
                strand: '+',
            },
            Exon {
                chrom: "chr1".to_string(),
                start: 150,
                end: 250,
                strand: '+',
            },
        ];
        assert_eq!(compute_non_overlapping_length(&exons), 151);

        // Empty
        let exons: Vec<Exon> = vec![];
        assert_eq!(compute_non_overlapping_length(&exons), 0);
    }

    /// Helper to create a temporary GTF file for testing.
    fn write_temp_gtf(content: &str) -> (std::path::PathBuf, std::fs::File) {
        use std::io::Write;
        use std::sync::atomic::{AtomicU64, Ordering};
        static COUNTER: AtomicU64 = AtomicU64::new(0);
        let dir = std::env::temp_dir();
        let id = COUNTER.fetch_add(1, Ordering::Relaxed);
        let path = dir.join(format!(
            "rustqc_test_{:?}_{}.gtf",
            std::thread::current().id(),
            id
        ));
        let mut f = std::fs::File::create(&path).unwrap();
        f.write_all(content.as_bytes()).unwrap();
        f.flush().unwrap();
        (path, f)
    }

    #[test]
    fn test_parse_gtf_transcripts_basic() {
        // T1 has CDS + start_codon + stop_codon → precise CDS range
        // T2 has no CDS features → non-coding
        let (path, _f) = write_temp_gtf(
            "\
chr1\ttest\texon\t100\t200\t.\t+\t.\tgene_id \"G1\"; transcript_id \"T1\";\n\
chr1\ttest\texon\t300\t400\t.\t+\t.\tgene_id \"G1\"; transcript_id \"T1\";\n\
chr1\ttest\texon\t100\t250\t.\t+\t.\tgene_id \"G1\"; transcript_id \"T2\";\n\
chr1\ttest\tCDS\t120\t200\t.\t+\t.\tgene_id \"G1\"; transcript_id \"T1\";\n\
chr1\ttest\tCDS\t300\t380\t.\t+\t.\tgene_id \"G1\"; transcript_id \"T1\";\n\
chr1\ttest\tstart_codon\t120\t122\t.\t+\t.\tgene_id \"G1\"; transcript_id \"T1\";\n\
chr1\ttest\tstop_codon\t381\t383\t.\t+\t.\tgene_id \"G1\"; transcript_id \"T1\";\n",
        );

        let genes = parse_gtf(path.to_str().unwrap(), &[]).unwrap();
        assert_eq!(genes.len(), 1);
        let gene = &genes["G1"];

        // Gene-level: 3 exons total, spanning 100-400
        assert_eq!(gene.exons.len(), 3);
        assert_eq!(gene.start, 100);
        assert_eq!(gene.end, 400);
        assert_eq!(gene.strand, '+');

        // Should have 2 transcripts
        assert_eq!(gene.transcripts.len(), 2);

        let t1 = gene
            .transcripts
            .iter()
            .find(|t| t.transcript_id == "T1")
            .unwrap();
        assert_eq!(t1.exons, vec![(100, 200), (300, 400)]);
        assert_eq!(t1.start, 100);
        assert_eq!(t1.end, 400);
        // + strand with start_codon and stop_codon:
        // thick_start = start_codon start = 120
        // thick_end = stop_codon end = 383
        assert_eq!(t1.cds_start, Some(120));
        assert_eq!(t1.cds_end, Some(383));

        let t2 = gene
            .transcripts
            .iter()
            .find(|t| t.transcript_id == "T2")
            .unwrap();
        assert_eq!(t2.exons, vec![(100, 250)]);
        assert_eq!(t2.start, 100);
        assert_eq!(t2.end, 250);
        assert_eq!(t2.cds_start, None); // non-coding
        assert_eq!(t2.cds_end, None);
    }

    #[test]
    fn test_parse_gtf_no_transcript_id() {
        // Exons without transcript_id should be grouped under a synthetic key
        let (path, _f) = write_temp_gtf(
            "\
chr1\ttest\texon\t100\t200\t.\t+\t.\tgene_id \"G1\";\n\
chr1\ttest\texon\t300\t400\t.\t+\t.\tgene_id \"G1\";\n",
        );

        let genes = parse_gtf(path.to_str().unwrap(), &[]).unwrap();
        let gene = &genes["G1"];
        // Should have exactly 1 transcript with synthetic ID
        assert_eq!(gene.transcripts.len(), 1);
        assert_eq!(gene.transcripts[0].transcript_id, "G1__no_tx");
        assert_eq!(gene.transcripts[0].exons, vec![(100, 200), (300, 400)]);
    }

    #[test]
    fn test_parse_gtf_cds_only_no_exon() {
        // CDS features without corresponding exons should not create transcripts
        let (path, _f) = write_temp_gtf(
            "\
chr1\ttest\texon\t100\t200\t.\t+\t.\tgene_id \"G1\"; transcript_id \"T1\";\n\
chr1\ttest\tCDS\t500\t600\t.\t+\t.\tgene_id \"G1\"; transcript_id \"T_orphan\";\n",
        );

        let genes = parse_gtf(path.to_str().unwrap(), &[]).unwrap();
        let gene = &genes["G1"];
        // T_orphan should be skipped because it has no exons
        assert_eq!(gene.transcripts.len(), 1);
        assert_eq!(gene.transcripts[0].transcript_id, "T1");
    }

    #[test]
    fn test_parse_gtf_multi_gene_transcripts() {
        // G1/T1: CDS but no codon features → falls back to tx_start/tx_end
        // G2/T2: non-coding (no CDS)
        let (path, _f) = write_temp_gtf(
            "\
chr1\ttest\texon\t100\t200\t.\t+\t.\tgene_id \"G1\"; transcript_id \"T1\";\n\
chr1\ttest\tCDS\t120\t180\t.\t+\t.\tgene_id \"G1\"; transcript_id \"T1\";\n\
chr2\ttest\texon\t500\t600\t.\t-\t.\tgene_id \"G2\"; transcript_id \"T2\";\n\
chr2\ttest\texon\t700\t800\t.\t-\t.\tgene_id \"G2\"; transcript_id \"T2\";\n",
        );

        let genes = parse_gtf(path.to_str().unwrap(), &[]).unwrap();
        assert_eq!(genes.len(), 2);

        let g1 = &genes["G1"];
        assert_eq!(g1.transcripts.len(), 1);
        // CDS features but no codon features → tx_start=100, tx_end=200
        assert_eq!(g1.transcripts[0].cds_start, Some(100));
        assert_eq!(g1.transcripts[0].cds_end, Some(200));

        let g2 = &genes["G2"];
        assert_eq!(g2.transcripts.len(), 1);
        assert_eq!(g2.transcripts[0].exons, vec![(500, 600), (700, 800)]);
        assert_eq!(g2.transcripts[0].strand, '-');
        assert_eq!(g2.transcripts[0].cds_start, None);
    }

    #[test]
    fn test_stop_codon_extends_cds_range_forward_strand() {
        // On + strand with start_codon and stop_codon:
        // thick_start = start_codon start, thick_end = stop_codon end
        let (path, _f) = write_temp_gtf(
            "\
chr1\ttest\texon\t1000\t2000\t.\t+\t.\tgene_id \"G1\"; transcript_id \"T1\";\n\
chr1\ttest\texon\t3000\t4000\t.\t+\t.\tgene_id \"G1\"; transcript_id \"T1\";\n\
chr1\ttest\tCDS\t1200\t2000\t.\t+\t.\tgene_id \"G1\"; transcript_id \"T1\";\n\
chr1\ttest\tCDS\t3000\t3500\t.\t+\t.\tgene_id \"G1\"; transcript_id \"T1\";\n\
chr1\ttest\tstart_codon\t1200\t1202\t.\t+\t.\tgene_id \"G1\"; transcript_id \"T1\";\n\
chr1\ttest\tstop_codon\t3501\t3503\t.\t+\t.\tgene_id \"G1\"; transcript_id \"T1\";\n",
        );

        let genes = parse_gtf(path.to_str().unwrap(), &[]).unwrap();
        let t1 = &genes["G1"].transcripts[0];
        // + strand: thick_start = start_codon start = 1200
        //           thick_end = stop_codon end = 3503
        assert_eq!(t1.cds_start, Some(1200));
        assert_eq!(t1.cds_end, Some(3503));
    }

    #[test]
    fn test_stop_codon_extends_cds_range_reverse_strand() {
        // On - strand with start_codon and stop_codon (non-split):
        // Perl gtf2bed: swap then adjust → thick_start = stc_end - 2, thick_end = sc_start + 2
        // For a contiguous 3-base stop_codon [1497,1499]: 1499 - 2 = 1497
        // For a contiguous 3-base start_codon [3798,3800]: 3798 + 2 = 3800
        let (path, _f) = write_temp_gtf(
            "\
chr1\ttest\texon\t1000\t2000\t.\t-\t.\tgene_id \"G1\"; transcript_id \"T1\";\n\
chr1\ttest\texon\t3000\t4000\t.\t-\t.\tgene_id \"G1\"; transcript_id \"T1\";\n\
chr1\ttest\tCDS\t1500\t2000\t.\t-\t.\tgene_id \"G1\"; transcript_id \"T1\";\n\
chr1\ttest\tCDS\t3000\t3800\t.\t-\t.\tgene_id \"G1\"; transcript_id \"T1\";\n\
chr1\ttest\tstart_codon\t3798\t3800\t.\t-\t.\tgene_id \"G1\"; transcript_id \"T1\";\n\
chr1\ttest\tstop_codon\t1497\t1499\t.\t-\t.\tgene_id \"G1\"; transcript_id \"T1\";\n",
        );

        let genes = parse_gtf(path.to_str().unwrap(), &[]).unwrap();
        let t1 = &genes["G1"].transcripts[0];
        // - strand: thick_start = last_stc_end - 2 = 1499 - 2 = 1497
        //           thick_end   = last_sc_start + 2 = 3798 + 2 = 3800
        assert_eq!(t1.cds_start, Some(1497));
        assert_eq!(t1.cds_end, Some(3800));
    }

    #[test]
    fn test_split_stop_codon_reverse_strand() {
        // On - strand with a stop codon split across two exons:
        //   stop_codon line 1: [524, 524] (1 base, exon 2)
        //   stop_codon line 2: [101, 102] (2 bases, exon 3)
        //
        // The Perl gtf2bed script overwrites $sc{id}[1] with each stop_codon
        // line, keeping the last end seen (102). Then for - strand:
        //   thick_start = 102 - 2 = 100 (1-based)
        //
        // Using min/max would incorrectly give thick_start = min(524,101) - 1
        // which is wrong. We must match the Perl overwrite behavior.
        let (path, _f) = write_temp_gtf(
            "\
chr1\ttest\texon\t100\t200\t.\t-\t.\tgene_id \"G1\"; transcript_id \"T1\";\n\
chr1\ttest\texon\t400\t600\t.\t-\t.\tgene_id \"G1\"; transcript_id \"T1\";\n\
chr1\ttest\texon\t800\t1000\t.\t-\t.\tgene_id \"G1\"; transcript_id \"T1\";\n\
chr1\ttest\tCDS\t110\t200\t.\t-\t.\tgene_id \"G1\"; transcript_id \"T1\";\n\
chr1\ttest\tCDS\t400\t600\t.\t-\t.\tgene_id \"G1\"; transcript_id \"T1\";\n\
chr1\ttest\tCDS\t800\t900\t.\t-\t.\tgene_id \"G1\"; transcript_id \"T1\";\n\
chr1\ttest\tstart_codon\t898\t900\t.\t-\t.\tgene_id \"G1\"; transcript_id \"T1\";\n\
chr1\ttest\tstop_codon\t524\t524\t.\t-\t.\tgene_id \"G1\"; transcript_id \"T1\";\n\
chr1\ttest\tstop_codon\t101\t102\t.\t-\t.\tgene_id \"G1\"; transcript_id \"T1\";\n",
        );

        let genes = parse_gtf(path.to_str().unwrap(), &[]).unwrap();
        let t1 = &genes["G1"].transcripts[0];
        // - strand: last stop_codon end = 102, last start_codon start = 898
        // thick_start = 102 - 2 = 100 (Perl: stc_end - 2)
        // thick_end = 898 + 2 = 900 (Perl: sc_start + 2)
        assert_eq!(t1.cds_start, Some(100));
        assert_eq!(t1.cds_end, Some(900));
    }

    #[test]
    fn test_split_start_codon_forward_strand() {
        // On + strand with a start codon split across two exons:
        //   start_codon line 1: [198, 199] (2 bases, exon 1)
        //   start_codon line 2: [401, 401] (1 base, exon 2)
        //
        // Perl keeps the last start_codon start seen (401).
        // For + strand: thick_start = last_sc_start = 401
        let (path, _f) = write_temp_gtf(
            "\
chr1\ttest\texon\t100\t200\t.\t+\t.\tgene_id \"G1\"; transcript_id \"T1\";\n\
chr1\ttest\texon\t400\t600\t.\t+\t.\tgene_id \"G1\"; transcript_id \"T1\";\n\
chr1\ttest\tCDS\t198\t200\t.\t+\t.\tgene_id \"G1\"; transcript_id \"T1\";\n\
chr1\ttest\tCDS\t400\t550\t.\t+\t.\tgene_id \"G1\"; transcript_id \"T1\";\n\
chr1\ttest\tstart_codon\t198\t199\t.\t+\t.\tgene_id \"G1\"; transcript_id \"T1\";\n\
chr1\ttest\tstart_codon\t401\t401\t.\t+\t.\tgene_id \"G1\"; transcript_id \"T1\";\n\
chr1\ttest\tstop_codon\t551\t553\t.\t+\t.\tgene_id \"G1\"; transcript_id \"T1\";\n",
        );

        let genes = parse_gtf(path.to_str().unwrap(), &[]).unwrap();
        let t1 = &genes["G1"].transcripts[0];
        // + strand: thick_start = last start_codon start = 401
        //           thick_end   = last stop_codon end = 553
        assert_eq!(t1.cds_start, Some(401));
        assert_eq!(t1.cds_end, Some(553));
    }

    #[test]
    fn test_stop_codon_without_cds_does_not_create_coding() {
        // stop_codon without CDS features should not make a non-coding transcript coding
        let (path, _f) = write_temp_gtf(
            "\
chr1\ttest\texon\t1000\t2000\t.\t+\t.\tgene_id \"G1\"; transcript_id \"T1\";\n\
chr1\ttest\tstop_codon\t1500\t1502\t.\t+\t.\tgene_id \"G1\"; transcript_id \"T1\";\n",
        );

        let genes = parse_gtf(path.to_str().unwrap(), &[]).unwrap();
        let t1 = &genes["G1"].transcripts[0];
        assert_eq!(t1.cds_start, None);
        assert_eq!(t1.cds_end, None);
    }
}