use std::path::{Path, PathBuf};
use std::sync::OnceLock;
use super::diagnostics::{DiagnosticPayload, LoaderDiagnostic, LoaderReport, SourceLocation};
use super::feature::FeatureType;
use super::format_detect::AnnotationFormat;
use super::graph::{FeatureGraph, FeatureId};
use crate::reference::transcript::{Exon, GenomeBuild, ManeStatus, Strand, Transcript};
/// Builds a [`Transcript`] for every transcript-bearing feature in `graph`.
///
/// Candidates are gathered in two passes over the feature table:
///
/// 1. every feature whose type is transcript-like, in feature-index order;
/// 2. every gene-like feature that has children, none of which is
///    transcript-like, and at least one of which is a CDS. Such a gene is
///    treated as its own transcript and a `W-LOAD-101` warning is recorded.
///
/// Each candidate is then handed to `build_single`. A successful build bumps
/// `report.transcripts_loaded`; a rejected one bumps `report.records_dropped`.
///
/// `source_path` is only used to fill in the location of recorded diagnostics.
pub fn build_transcripts(
    graph: &FeatureGraph,
    format: AnnotationFormat,
    genome_build: GenomeBuild,
    source_path: PathBuf,
    report: &mut LoaderReport,
) -> Vec<Transcript> {
    // Pass 1: explicit transcript-like features.
    let mut candidates: Vec<FeatureId> = Vec::new();
    for raw in 0..(graph.features.len() as u32) {
        let fid = FeatureId(raw);
        if graph.feature(fid).feature_type.is_transcript_like() {
            candidates.push(fid);
        }
    }

    // Pass 2: genes that carry CDS children directly, with no transcript level.
    for idx in 0..graph.features.len() {
        let gene_fid = FeatureId(idx as u32);
        let gene = graph.feature(gene_fid);
        if !gene.feature_type.is_gene_like() {
            continue;
        }
        let kids = graph.children_of(gene_fid);
        let promote_gene = !kids.is_empty()
            && kids
                .iter()
                .all(|kid| !graph.feature(*kid).feature_type.is_transcript_like())
            && kids
                .iter()
                .any(|kid| matches!(graph.feature(*kid).feature_type, FeatureType::Cds));
        if !promote_gene {
            continue;
        }

        candidates.push(gene_fid);
        let gene_label = gene.id.as_deref();
        let message = format!(
            "GeneAsTranscript: synthesizing transcript from gene {}",
            gene_label.unwrap_or("<no-id>")
        );
        let location = SourceLocation {
            path: source_path.clone(),
            line: gene.source_line,
        };
        report.record(LoaderDiagnostic::warning(
            "W-LOAD-101",
            message,
            location,
            gene_label.map(String::from),
            DiagnosticPayload::GeneAsTranscript {
                gene_id: gene_label.unwrap_or("").to_string(),
            },
        ));
    }

    // Build every candidate, keeping the load/drop counters in step.
    candidates
        .into_iter()
        .filter_map(|tx_fid| {
            let built = build_single(graph, tx_fid, format, genome_build, &source_path, report);
            if built.is_some() {
                report.transcripts_loaded += 1;
            } else {
                report.records_dropped += 1;
            }
            built
        })
        .collect()
}
fn build_single(
graph: &FeatureGraph,
tx_fid: FeatureId,
format: AnnotationFormat,
genome_build: GenomeBuild,
source_path: &Path,
report: &mut LoaderReport,
) -> Option<Transcript> {
let tx_feat = graph.feature(tx_fid);
let tx_id_str = tx_feat.id.as_deref().unwrap_or("<no-id>").to_string();
if tx_feat.strand == Strand::Unknown {
report.record(LoaderDiagnostic::error(
"E-LOAD-103",
format!("StrandRequired: transcript {} has strand '.'", tx_id_str),
SourceLocation {
path: source_path.to_path_buf(),
line: tx_feat.source_line,
},
Some(tx_id_str.clone()),
DiagnosticPayload::StrandRequired {
feature_id: Some(tx_id_str.clone()),
},
));
return None;
}
let exon_fids = graph.descendants_of_type(tx_fid, &FeatureType::Exon);
let cds_fids = graph.descendants_of_type(tx_fid, &FeatureType::Cds);
let raw_exon_ranges: Vec<(u64, u64)> = if !exon_fids.is_empty() {
exon_fids
.iter()
.map(|fid| graph.feature(*fid).range)
.collect()
} else {
let utr5 = graph.descendants_of_type(tx_fid, &FeatureType::FivePrimeUtr);
let utr3 = graph.descendants_of_type(tx_fid, &FeatureType::ThreePrimeUtr);
let utr_generic = graph.descendants_of_type(tx_fid, &FeatureType::Utr);
let has_utr = !utr5.is_empty() || !utr3.is_empty() || !utr_generic.is_empty();
if has_utr && !cds_fids.is_empty() {
let mut all: Vec<(u64, u64)> = utr5
.iter()
.chain(utr3.iter())
.chain(utr_generic.iter())
.chain(cds_fids.iter())
.map(|f| graph.feature(*f).range)
.collect();
all.sort_by_key(|(s, _)| *s);
merge_adjacent(&all)
} else if !cds_fids.is_empty() {
if cds_fids.len() == 1 {
vec![tx_feat.range]
} else {
cds_fids
.iter()
.map(|fid| graph.feature(*fid).range)
.collect()
}
} else {
report.record(LoaderDiagnostic::warning(
"W-LOAD-100",
format!(
"TranscriptWithoutExons: {} has no exon/UTR/CDS children",
tx_id_str
),
SourceLocation {
path: source_path.to_path_buf(),
line: tx_feat.source_line,
},
Some(tx_id_str.clone()),
DiagnosticPayload::NoExonsDerivable {
transcript_id: tx_id_str.clone(),
},
));
return None;
}
};
let mut sorted_exons = raw_exon_ranges;
if tx_feat.strand == Strand::Minus {
sorted_exons.sort_by(|(a, _), (b, _)| b.cmp(a));
} else {
sorted_exons.sort_by_key(|(s, _)| *s);
}
let mut genomic_sorted_exons = sorted_exons.clone();
genomic_sorted_exons.sort_by_key(|(s, _)| *s);
let (cds_g_start, cds_g_end) = derive_cds_bounds(
graph,
tx_fid,
&cds_fids,
&genomic_sorted_exons,
tx_feat.strand,
&tx_id_str,
format,
source_path,
report,
);
let mut tx_pos = 1u64;
let exons: Vec<Exon> = sorted_exons
.iter()
.enumerate()
.map(|(i, (g_start, g_end))| {
let len = g_end - g_start + 1;
let ex = Exon::with_genomic((i + 1) as u32, tx_pos, tx_pos + len - 1, *g_start, *g_end);
tx_pos += len;
ex
})
.collect();
let (tx_cds_start, tx_cds_end) = match (cds_g_start, cds_g_end) {
(Some(gs), Some(ge)) => {
let minus = tx_feat.strand == Strand::Minus;
let mut a = None;
let mut b = None;
for ex in &exons {
let (egs, ege) = (ex.genomic_start.unwrap(), ex.genomic_end.unwrap());
if gs >= egs && gs <= ege {
a = Some(if minus {
ex.start + (ege - gs)
} else {
ex.start + (gs - egs)
});
}
if ge >= egs && ge <= ege {
b = Some(if minus {
ex.start + (ege - ge)
} else {
ex.start + (ge - egs)
});
}
}
match (a, b) {
(Some(x), Some(y)) if x > y => (Some(y), Some(x)),
_ => (a, b),
}
}
_ => (None, None),
};
let chromosome = Some(tx_feat.seqid.to_string());
let g_start = exons.iter().filter_map(|e| e.genomic_start).min();
let g_end = exons.iter().filter_map(|e| e.genomic_end).max();
let attrs = &tx_feat.attrs;
let gene_symbol = match format {
AnnotationFormat::Gtf => {
crate::reference::annotation::feature::attr_get(attrs, "gene_name")
.or_else(|| crate::reference::annotation::feature::attr_get(attrs, "gene_id"))
.map(String::from)
}
AnnotationFormat::Gff3 => {
crate::reference::annotation::feature::attr_get(attrs, "gene_name")
.or_else(|| crate::reference::annotation::feature::attr_get(attrs, "gene"))
.or_else(|| crate::reference::annotation::feature::attr_get(attrs, "Name"))
.map(String::from)
}
};
let mane_status = {
let tag = crate::reference::annotation::feature::attr_get(attrs, "tag").unwrap_or("");
let mane = crate::reference::annotation::feature::attr_get(attrs, "MANE")
.or_else(|| crate::reference::annotation::feature::attr_get(attrs, "mane_status"))
.unwrap_or("");
if tag.contains("MANE_Select")
|| tag.contains("MANE Select")
|| mane.to_lowercase().contains("select")
{
ManeStatus::Select
} else if tag.contains("MANE_Plus_Clinical")
|| tag.contains("MANE Plus Clinical")
|| mane.to_lowercase().contains("plus")
|| mane.to_lowercase().contains("clinical")
{
ManeStatus::PlusClinical
} else {
ManeStatus::None
}
};
let refseq_match = crate::reference::annotation::feature::attr_get(attrs, "RefSeq")
.or_else(|| crate::reference::annotation::feature::attr_get(attrs, "refseq_id"))
.map(String::from);
let ensembl_match = crate::reference::annotation::feature::attr_get(attrs, "Ensembl")
.or_else(|| crate::reference::annotation::feature::attr_get(attrs, "ensembl_id"))
.map(String::from);
Some(Transcript {
id: tx_id_str,
gene_symbol,
strand: tx_feat.strand,
sequence: None,
cds_start: tx_cds_start,
cds_end: tx_cds_end,
exons,
chromosome,
genomic_start: g_start,
genomic_end: g_end,
genome_build,
mane_status,
refseq_match,
ensembl_match,
exon_cigars: Vec::new(),
cached_introns: OnceLock::new(),
})
}
/// Merges inclusive `(start, end)` ranges that overlap or directly abut.
///
/// `ranges` must already be sorted by ascending start; the function only ever
/// compares a range against the one most recently emitted. Two ranges are
/// joined when the next start is at most one past the current end, so
/// `(100, 149)` and `(150, 300)` become `(100, 300)`.
///
/// Returns an empty vector for empty input.
fn merge_adjacent(ranges: &[(u64, u64)]) -> Vec<(u64, u64)> {
    let mut merged: Vec<(u64, u64)> = Vec::with_capacity(ranges.len());
    for &(start, end) in ranges {
        match merged.last_mut() {
            // `saturating_add` keeps the adjacency test well-defined when the
            // current range already ends at `u64::MAX`.
            Some(current) if start <= current.1.saturating_add(1) => {
                current.1 = current.1.max(end);
            }
            _ => merged.push((start, end)),
        }
    }
    merged
}
/// Resolves the genomic CDS boundaries of one transcript.
///
/// Returns `(None, None)` when `cds_fids` is empty. Otherwise returns
/// `(Some(low), Some(high))`: on the plus strand that is
/// `(resolved start, resolved end)`, on any other strand the pair is swapped so
/// the first element is the 3' boundary. Every strand other than
/// `Strand::Plus` is handled as minus.
///
/// Start (5' boundary), first rule that applies:
/// 1. `start_codon` descendants exist: the 5'-most base over *all* of them
///    (a codon split by an intron is annotated as several features, so the
///    result must not depend on which fragment the graph lists first);
/// 2. phase of the 5'-most CDS is `0`: the CDS edge itself;
/// 3. phase is `1` or `2`: the CDS edge moved inward by the phase, with a
///    `W-LOAD-110` warning;
/// 4. anything else: the CDS edge unchanged, with a `W-LOAD-111` warning.
///
/// End (3' boundary), first rule that applies:
/// 1. `stop_codon` descendants exist: the 3'-most base over all of them;
/// 2. `format` is GTF: the 3' CDS edge verbatim;
/// 3. otherwise: a `W-LOAD-112` warning is recorded and the edge is pushed
///    three bases outward, clipped to the exon in `sorted_exon_ranges` that
///    contains the edge; if no exon contains it, the edge is left unchanged.
///
/// `tx_id` and `source_path` are only used to label diagnostics.
// The argument list mirrors what the single caller has in hand; bundling it
// into a struct would add indirection without removing any coupling.
#[allow(clippy::too_many_arguments)]
fn derive_cds_bounds(
    graph: &FeatureGraph,
    tx_fid: FeatureId,
    cds_fids: &[FeatureId],
    sorted_exon_ranges: &[(u64, u64)],
    strand: Strand,
    tx_id: &str,
    format: AnnotationFormat,
    source_path: &Path,
    report: &mut LoaderReport,
) -> (Option<u64>, Option<u64>) {
    if cds_fids.is_empty() {
        return (None, None);
    }
    let start_codons = graph.descendants_of_type(tx_fid, &FeatureType::StartCodon);
    let stop_codons = graph.descendants_of_type(tx_fid, &FeatureType::StopCodon);
    let plus = matches!(strand, Strand::Plus);

    // 5' edge of the coding region: lowest start on plus, highest end otherwise.
    let cds_5prime_genomic: u64 = if plus {
        cds_fids
            .iter()
            .map(|f| graph.feature(*f).range.0)
            .min()
            .expect("cds_fids was checked to be non-empty")
    } else {
        cds_fids
            .iter()
            .map(|f| graph.feature(*f).range.1)
            .max()
            .expect("cds_fids was checked to be non-empty")
    };
    // The CDS segment that owns that edge; its phase and source line are used below.
    let leading_cds = cds_fids
        .iter()
        .copied()
        .find(|f| {
            let r = graph.feature(*f).range;
            if plus {
                r.0 == cds_5prime_genomic
            } else {
                r.1 == cds_5prime_genomic
            }
        })
        .expect("the 5' edge was taken from one of the CDS features");

    // 5'-most base over every start_codon fragment, independent of feature order.
    let start_codon_edge: Option<u64> = if plus {
        start_codons
            .iter()
            .map(|sc| graph.feature(*sc).range.0)
            .min()
    } else {
        start_codons
            .iter()
            .map(|sc| graph.feature(*sc).range.1)
            .max()
    };
    let resolved_start: u64 = match start_codon_edge {
        Some(edge) => edge,
        None => match graph.feature(leading_cds).phase {
            Some(0) => cds_5prime_genomic,
            Some(p @ (1 | 2)) => {
                report.record(LoaderDiagnostic::warning(
                    "W-LOAD-110",
                    format!("PhaseAppliedToCdsStart: tx {} phase={}", tx_id, p),
                    SourceLocation {
                        path: source_path.to_path_buf(),
                        line: graph.feature(leading_cds).source_line,
                    },
                    Some(tx_id.into()),
                    DiagnosticPayload::PhaseAppliedToCdsStart {
                        transcript_id: tx_id.into(),
                        phase: p,
                    },
                ));
                // Move inward (3'-ward) by the phase.
                if plus {
                    cds_5prime_genomic.saturating_add(p as u64)
                } else {
                    cds_5prime_genomic.saturating_sub(p as u64)
                }
            }
            Some(_) | None => {
                report.record(LoaderDiagnostic::warning(
                    "W-LOAD-111",
                    format!("PhaseUnavailable: tx {}", tx_id),
                    SourceLocation {
                        path: source_path.to_path_buf(),
                        line: graph.feature(leading_cds).source_line,
                    },
                    Some(tx_id.into()),
                    DiagnosticPayload::PhaseUnavailable {
                        transcript_id: tx_id.into(),
                    },
                ));
                cds_5prime_genomic
            }
        },
    };

    // 3' edge of the coding region: highest end on plus, lowest start otherwise.
    let cds_3prime_genomic: u64 = if plus {
        cds_fids
            .iter()
            .map(|f| graph.feature(*f).range.1)
            .max()
            .expect("cds_fids was checked to be non-empty")
    } else {
        cds_fids
            .iter()
            .map(|f| graph.feature(*f).range.0)
            .min()
            .expect("cds_fids was checked to be non-empty")
    };

    // 3'-most base over every stop_codon fragment, independent of feature order.
    let stop_codon_edge: Option<u64> = if plus {
        stop_codons
            .iter()
            .map(|sc| graph.feature(*sc).range.1)
            .max()
    } else {
        stop_codons
            .iter()
            .map(|sc| graph.feature(*sc).range.0)
            .min()
    };
    let resolved_end: u64 = if let Some(edge) = stop_codon_edge {
        edge
    } else if matches!(format, AnnotationFormat::Gtf) {
        cds_3prime_genomic
    } else {
        report.record(LoaderDiagnostic::warning(
            "W-LOAD-112",
            format!("StopCodonAssumed: extending CDS for tx {}", tx_id),
            SourceLocation {
                path: source_path.to_path_buf(),
                line: graph.feature(leading_cds).source_line,
            },
            Some(tx_id.into()),
            DiagnosticPayload::StopCodonAssumed {
                transcript_id: tx_id.into(),
            },
        ));
        // Extend by one codon, but never past the exon holding the CDS edge.
        let host_exon = sorted_exon_ranges
            .iter()
            .find(|(es, ee)| cds_3prime_genomic >= *es && cds_3prime_genomic <= *ee);
        if plus {
            let extended = cds_3prime_genomic.saturating_add(3);
            host_exon
                .map(|(_, ee)| extended.min(*ee))
                .unwrap_or(cds_3prime_genomic)
        } else {
            let extended = cds_3prime_genomic.saturating_sub(3);
            host_exon
                .map(|(es, _)| extended.max(*es))
                .unwrap_or(cds_3prime_genomic)
        }
    };

    let (lo, hi) = if plus {
        (resolved_start, resolved_end)
    } else {
        (resolved_end, resolved_start)
    };
    (Some(lo), Some(hi))
}
// Unit tests for transcript construction. Fixtures are tab-separated
// annotation lines; expected transcript coordinates are 1-based offsets into
// the concatenated exons, so with a single exon starting at genomic 100 a
// genomic position g maps to g - 100 + 1.
#[cfg(test)]
mod tests {
use super::*;
use crate::reference::annotation::format_detect::AnnotationFormat;
use crate::reference::annotation::graph::FeatureGraph;
use crate::reference::annotation::record::Gff3Record;
use crate::reference::transcript::GenomeBuild;
// Parses every line as a GFF3 record (line numbers start at 1), loads them
// into a fresh graph, resolves it, and runs `build_transcripts` with the given
// format, GRCh38 and the path "test.gff". Returns the transcripts together
// with the report so tests can inspect diagnostics.
fn build_db(lines: &[&str], fmt: AnnotationFormat) -> (Vec<Transcript>, LoaderReport) {
let mut graph = FeatureGraph::new();
for (i, l) in lines.iter().enumerate() {
let r = Gff3Record::parse(l, (i + 1) as u64).unwrap().unwrap();
graph.ingest(r);
}
graph.resolve();
let mut report = LoaderReport::default();
let txs = build_transcripts(
&graph,
fmt,
GenomeBuild::GRCh38,
"test.gff".into(),
&mut report,
);
(txs, report)
}
// An explicit start_codon at 150 fixes the CDS start: 150 - 100 + 1 = 51.
#[test]
fn cds_start_uses_start_codon_when_present() {
let lines = &[
"chr1\t.\tmRNA\t100\t500\t.\t+\t.\tID=tx1",
"chr1\t.\texon\t100\t500\t.\t+\t.\tParent=tx1",
"chr1\t.\tCDS\t150\t450\t.\t+\t0\tParent=tx1",
"chr1\t.\tstart_codon\t150\t152\t.\t+\t0\tParent=tx1",
];
let (txs, _) = build_db(lines, AnnotationFormat::Gff3);
let tx = &txs[0];
assert_eq!(tx.cds_start, Some(51));
}
// No start_codon and phase 2: the start moves from 150 to 152, i.e.
// transcript position 53, and W-LOAD-110 is recorded once.
#[test]
fn cds_start_applies_phase_when_no_start_codon() {
let lines = &[
"chr1\t.\tmRNA\t100\t500\t.\t+\t.\tID=tx1",
"chr1\t.\texon\t100\t500\t.\t+\t.\tParent=tx1",
"chr1\t.\tCDS\t150\t450\t.\t+\t2\tParent=tx1",
];
let (txs, report) = build_db(lines, AnnotationFormat::Gff3);
let tx = &txs[0];
assert_eq!(tx.cds_start, Some(53)); assert_eq!(report.diagnostics_by_code.get("W-LOAD-110"), Some(&1));
}
// A phase outside 0..=2 is not applied: the start stays at 150 (position 51)
// and W-LOAD-111 is recorded instead of W-LOAD-110.
#[test]
fn cds_start_rejects_out_of_range_phase_as_unavailable() {
let lines = &[
"chr1\t.\tmRNA\t100\t500\t.\t+\t.\tID=tx1",
"chr1\t.\texon\t100\t500\t.\t+\t.\tParent=tx1",
"chr1\t.\tCDS\t150\t450\t.\t+\t5\tParent=tx1",
];
let (txs, report) = build_db(lines, AnnotationFormat::Gff3);
let tx = &txs[0];
assert_eq!(tx.cds_start, Some(51));
assert_eq!(report.diagnostics_by_code.get("W-LOAD-110"), None);
assert_eq!(report.diagnostics_by_code.get("W-LOAD-111"), Some(&1));
}
// GFF3 without a stop_codon: the end is extended by three bases, 450 -> 453,
// which fits inside the exon (ends at 500); 453 - 100 + 1 = 354.
#[test]
fn gff3_cds_end_extends_to_include_stop_when_within_exon() {
let lines = &[
"chr1\t.\tmRNA\t100\t500\t.\t+\t.\tID=tx1",
"chr1\t.\texon\t100\t500\t.\t+\t.\tParent=tx1",
"chr1\t.\tCDS\t150\t450\t.\t+\t0\tParent=tx1",
];
let (txs, report) = build_db(lines, AnnotationFormat::Gff3);
let tx = &txs[0];
assert_eq!(tx.cds_end, Some(354));
assert_eq!(report.diagnostics_by_code.get("W-LOAD-112"), Some(&1));
}
// The CDS already reaches the end of the only exon (1200), so the three-base
// extension is clipped back to 1200: 1200 - 100 + 1 = 1101. The warning is
// still recorded.
#[test]
fn gff3_cds_end_clipped_when_no_room_for_stop_extension() {
let lines = &[
"seq1\t.\tgene\t100\t1200\t.\t+\t.\tID=gene01;Name=gene01",
"seq1\t.\tmRNA\t100\t1200\t.\t+\t.\tID=gene01.1;Parent=gene01",
"seq1\t.\tCDS\t100\t1200\t.\t+\t0\tParent=gene01.1",
];
let (txs, report) = build_db(lines, AnnotationFormat::Gff3);
let tx = &txs[0];
assert_eq!(tx.cds_end, Some(1101));
assert_eq!(report.diagnostics_by_code.get("W-LOAD-112"), Some(&1));
}
// An explicit stop_codon ending at 453 sets the end directly (position 354)
// and no W-LOAD-112 warning is recorded.
#[test]
fn gff3_cds_end_uses_stop_codon_record_when_present() {
let lines = &[
"chr1\t.\tmRNA\t100\t500\t.\t+\t.\tID=tx1",
"chr1\t.\texon\t100\t500\t.\t+\t.\tParent=tx1",
"chr1\t.\tCDS\t150\t450\t.\t+\t0\tParent=tx1",
"chr1\t.\tstop_codon\t451\t453\t.\t+\t0\tParent=tx1",
];
let (txs, report) = build_db(lines, AnnotationFormat::Gff3);
let tx = &txs[0];
assert_eq!(tx.cds_end, Some(354)); assert_eq!(report.diagnostics_by_code.get("W-LOAD-112"), None);
}
// GTF input without a stop_codon: the CDS end is taken as-is (450 ->
// position 351) with no extension and no warning. Records are parsed with
// GtfRecord here, so the shared GFF3 helper is not used.
#[test]
fn gtf_cds_end_uses_max_cds_verbatim() {
let mut graph = crate::reference::annotation::graph::FeatureGraph::new();
let lines = &[
"chr1\tHAVANA\ttranscript\t100\t500\t.\t+\t.\tgene_id \"g1\"; transcript_id \"tx1\";",
"chr1\tHAVANA\texon\t100\t500\t.\t+\t.\tgene_id \"g1\"; transcript_id \"tx1\";",
"chr1\tHAVANA\tCDS\t150\t450\t.\t+\t0\tgene_id \"g1\"; transcript_id \"tx1\";",
];
for (i, l) in lines.iter().enumerate() {
let r = crate::reference::annotation::record::GtfRecord::parse(l, (i + 1) as u64)
.unwrap()
.unwrap();
graph.ingest(r);
}
graph.resolve();
let mut report = LoaderReport::default();
let txs = build_transcripts(
&graph,
AnnotationFormat::Gtf,
crate::reference::transcript::GenomeBuild::GRCh38,
"t.gtf".into(),
&mut report,
);
let tx = &txs[0];
assert_eq!(tx.cds_end, Some(351));
assert_eq!(report.diagnostics_by_code.get("W-LOAD-112"), None);
}
// Ladder step 1: explicit exon features define the exon structure.
#[test]
fn ladder_step1_uses_explicit_exons() {
let lines = &[
"chr1\t.\tmRNA\t100\t500\t.\t+\t.\tID=tx1;gene=G1",
"chr1\t.\texon\t100\t200\t.\t+\t.\tParent=tx1",
"chr1\t.\texon\t300\t500\t.\t+\t.\tParent=tx1",
"chr1\t.\tCDS\t150\t450\t.\t+\t0\tParent=tx1",
];
let (txs, _) = build_db(lines, AnnotationFormat::Gff3);
assert_eq!(txs.len(), 1);
assert_eq!(txs[0].exons.len(), 2);
assert_eq!(txs[0].exons[0].genomic_start, Some(100));
assert_eq!(txs[0].exons[1].genomic_end, Some(500));
}
// Ladder step 3: with no exon or UTR features, a single CDS yields one exon
// covering the mRNA's own range.
#[test]
fn ladder_step3_cds_as_exon_for_issue_183() {
let lines = &[
"seq1\t.\tgene\t100\t1200\t.\t+\t.\tID=gene01;Name=gene01",
"seq1\t.\tmRNA\t100\t1200\t.\t+\t.\tID=gene01.1;Parent=gene01",
"seq1\t.\tCDS\t100\t1200\t.\t+\t0\tParent=gene01.1",
];
let (txs, _) = build_db(lines, AnnotationFormat::Gff3);
assert_eq!(
txs.len(),
1,
"issue #183: single-exon transcript must not be dropped"
);
assert_eq!(txs[0].exons.len(), 1);
assert_eq!(txs[0].exons[0].genomic_start, Some(100));
assert_eq!(txs[0].exons[0].genomic_end, Some(1200));
}
// Ladder step 3 with a CDS narrower than the mRNA: the exon still spans the
// whole mRNA (100-1200, transcript length 1101). CDS start 200 -> 101; CDS
// end 1000 is extended to 1003 -> 1003 - 100 + 1 = 904.
#[test]
fn ladder_step3_preserves_utr_when_cds_is_subset_of_mrna() {
let lines = &[
"seq1\t.\tgene\t100\t1200\t.\t+\t.\tID=g1;Name=g1",
"seq1\t.\tmRNA\t100\t1200\t.\t+\t.\tID=tx1;Parent=g1",
"seq1\t.\tCDS\t200\t1000\t.\t+\t0\tParent=tx1",
];
let (txs, _) = build_db(lines, AnnotationFormat::Gff3);
assert_eq!(txs.len(), 1);
let tx = &txs[0];
assert_eq!(tx.exons.len(), 1);
assert_eq!(tx.exons[0].genomic_start, Some(100));
assert_eq!(tx.exons[0].genomic_end, Some(1200));
assert_eq!(tx.exons[0].end, 1101);
assert_eq!(tx.cds_start, Some(101));
assert_eq!(tx.cds_end, Some(904));
}
// Ladder step 4: a transcript with no exon, UTR or CDS children is dropped
// with W-LOAD-100.
#[test]
fn ladder_step4_drops_with_diagnostic() {
let lines = &["chr1\t.\tmRNA\t100\t500\t.\t+\t.\tID=tx1"];
let (txs, report) = build_db(lines, AnnotationFormat::Gff3);
assert!(txs.is_empty());
assert_eq!(report.diagnostics_by_code.get("W-LOAD-100"), Some(&1));
}
// A transcript whose strand column is '.' is rejected with E-LOAD-103.
#[test]
fn unknown_strand_dropped_with_diagnostic() {
let lines = &[
"chr1\t.\tmRNA\t100\t500\t.\t.\t.\tID=tx1",
"chr1\t.\texon\t100\t500\t.\t.\t.\tParent=tx1",
];
let (txs, report) = build_db(lines, AnnotationFormat::Gff3);
assert!(txs.is_empty());
assert_eq!(report.diagnostics_by_code.get("E-LOAD-103"), Some(&1));
}
// One exon listing two parents must show up under both transcripts.
#[test]
fn multi_parent_exon_is_attached_to_both_transcripts() {
let lines = &[
"chr1\t.\tmRNA\t100\t500\t.\t+\t.\tID=tx1",
"chr1\t.\tmRNA\t100\t500\t.\t+\t.\tID=tx2",
"chr1\t.\texon\t100\t500\t.\t+\t.\tParent=tx1,tx2",
];
let (txs, _) = build_db(lines, AnnotationFormat::Gff3);
assert_eq!(txs.len(), 2);
assert!(txs.iter().all(|tx| tx.exons.len() == 1));
}
// Minus strand: exons are numbered from the highest genomic start, so exon 1
// is 300-500 (transcript 1-201) and exon 2 is 100-200 (202-302).
// The 5' CDS edge is genomic 450 -> 1 + (500 - 450) = 51. The 3' edge 150 is
// extended by three to 147 -> 202 + (200 - 147) = 255.
#[test]
fn minus_strand_exons_and_cds_use_transcript_order() {
let lines = &[
"chr1\t.\tmRNA\t100\t500\t.\t-\t.\tID=tx1;gene=G1",
"chr1\t.\texon\t100\t200\t.\t-\t.\tParent=tx1",
"chr1\t.\texon\t300\t500\t.\t-\t.\tParent=tx1",
"chr1\t.\tCDS\t150\t450\t.\t-\t0\tParent=tx1",
];
let (txs, _) = build_db(lines, AnnotationFormat::Gff3);
assert_eq!(txs.len(), 1);
let tx = &txs[0];
assert_eq!(tx.exons[0].number, 1);
assert_eq!(tx.exons[0].genomic_start, Some(300));
assert_eq!(tx.exons[0].genomic_end, Some(500));
assert_eq!(tx.exons[0].start, 1);
assert_eq!(tx.exons[0].end, 201);
assert_eq!(tx.exons[1].number, 2);
assert_eq!(tx.exons[1].genomic_start, Some(100));
assert_eq!(tx.exons[1].genomic_end, Some(200));
assert_eq!(tx.exons[1].start, 202);
assert_eq!(tx.exons[1].end, 302);
assert_eq!(tx.cds_start, Some(51));
assert_eq!(tx.cds_end, Some(255));
}
// Ladder step 2: 5' UTR (100-149), CDS (150-450) and 3' UTR (451-500) abut
// one another, so they collapse into a single exon 100-500.
#[test]
fn ladder_step2_merges_utrs_and_cds_into_one_exon_when_adjacent() {
let lines = &[
"chr1\t.\tmRNA\t100\t500\t.\t+\t.\tID=tx1",
"chr1\t.\tfive_prime_UTR\t100\t149\t.\t+\t.\tParent=tx1",
"chr1\t.\tCDS\t150\t450\t.\t+\t0\tParent=tx1",
"chr1\t.\tthree_prime_UTR\t451\t500\t.\t+\t.\tParent=tx1",
];
let (txs, _) = build_db(lines, AnnotationFormat::Gff3);
assert_eq!(txs.len(), 1);
let tx = &txs[0];
assert_eq!(
tx.exons.len(),
1,
"adjacent UTR+CDS+UTR should merge to one exon"
);
assert_eq!(tx.exons[0].genomic_start, Some(100));
assert_eq!(tx.exons[0].genomic_end, Some(500));
}
// Ladder step 2 with a gap between 300 and 500: the pieces merge into two
// exons, 100-300 and 500-800.
#[test]
fn ladder_step2_keeps_separate_exons_when_intervals_have_gaps() {
let lines = &[
"chr1\t.\tmRNA\t100\t800\t.\t+\t.\tID=tx1",
"chr1\t.\tfive_prime_UTR\t100\t149\t.\t+\t.\tParent=tx1",
"chr1\t.\tCDS\t150\t300\t.\t+\t0\tParent=tx1",
"chr1\t.\tCDS\t500\t650\t.\t+\t0\tParent=tx1",
"chr1\t.\tthree_prime_UTR\t651\t800\t.\t+\t.\tParent=tx1",
];
let (txs, _) = build_db(lines, AnnotationFormat::Gff3);
assert_eq!(txs.len(), 1);
let tx = &txs[0];
assert_eq!(tx.exons.len(), 2);
assert_eq!(tx.exons[0].genomic_start, Some(100));
assert_eq!(tx.exons[0].genomic_end, Some(300));
assert_eq!(tx.exons[1].genomic_start, Some(500));
assert_eq!(tx.exons[1].genomic_end, Some(800));
}
// UTRs alone are not enough: without any CDS the transcript is dropped with
// W-LOAD-100.
#[test]
fn ladder_step2_with_only_utrs_no_cds_falls_through_to_drop() {
let lines = &[
"chr1\t.\tmRNA\t100\t500\t.\t+\t.\tID=tx1",
"chr1\t.\tfive_prime_UTR\t100\t250\t.\t+\t.\tParent=tx1",
"chr1\t.\tthree_prime_UTR\t251\t500\t.\t+\t.\tParent=tx1",
];
let (txs, report) = build_db(lines, AnnotationFormat::Gff3);
assert!(txs.is_empty());
assert_eq!(report.diagnostics_by_code.get("W-LOAD-100"), Some(&1));
}
// GFF3 gene symbol: the `gene` attribute is used when `gene_name` is absent.
#[test]
fn gff3_gene_symbol_extracted_from_gene_attribute() {
let lines = &[
"chr1\t.\tmRNA\t100\t500\t.\t+\t.\tID=tx1;gene=GENE1",
"chr1\t.\texon\t100\t500\t.\t+\t.\tParent=tx1",
];
let (txs, _) = build_db(lines, AnnotationFormat::Gff3);
assert_eq!(txs[0].gene_symbol.as_deref(), Some("GENE1"));
}
// GFF3 gene symbol: `gene_name` wins over `Name` regardless of attribute order.
#[test]
fn gff3_gene_name_attribute_takes_precedence_over_name() {
let lines = &[
"chr1\t.\tmRNA\t100\t500\t.\t+\t.\tID=tx1;Name=fallback;gene_name=PRIMARY",
"chr1\t.\texon\t100\t500\t.\t+\t.\tParent=tx1",
];
let (txs, _) = build_db(lines, AnnotationFormat::Gff3);
assert_eq!(txs[0].gene_symbol.as_deref(), Some("PRIMARY"));
}
// GTF gene symbol: with no `gene_name`, `gene_id` is used as the symbol.
#[test]
fn gtf_gene_symbol_falls_back_to_gene_id_when_no_gene_name() {
let mut graph = crate::reference::annotation::graph::FeatureGraph::new();
let lines = &[
"chr1\tHAVANA\ttranscript\t100\t500\t.\t+\t.\tgene_id \"ENSG_NOSYMBOL\"; transcript_id \"tx1\";",
"chr1\tHAVANA\texon\t100\t500\t.\t+\t.\tgene_id \"ENSG_NOSYMBOL\"; transcript_id \"tx1\";",
];
for (i, l) in lines.iter().enumerate() {
let r = crate::reference::annotation::record::GtfRecord::parse(l, (i + 1) as u64)
.unwrap()
.unwrap();
graph.ingest(r);
}
graph.resolve();
let mut report = LoaderReport::default();
let txs = build_transcripts(
&graph,
AnnotationFormat::Gtf,
crate::reference::transcript::GenomeBuild::GRCh38,
"t.gtf".into(),
&mut report,
);
assert_eq!(txs[0].gene_symbol.as_deref(), Some("ENSG_NOSYMBOL"));
}
// A `tag` attribute containing MANE_Select maps to ManeStatus::Select.
#[test]
fn mane_select_extracted_from_tag() {
let lines = &[
"chr1\t.\tmRNA\t100\t500\t.\t+\t.\tID=tx1;tag=MANE_Select",
"chr1\t.\texon\t100\t500\t.\t+\t.\tParent=tx1",
];
let (txs, _) = build_db(lines, AnnotationFormat::Gff3);
assert!(matches!(
txs[0].mane_status,
crate::reference::transcript::ManeStatus::Select
));
}
// A `tag` attribute containing MANE_Plus_Clinical maps to
// ManeStatus::PlusClinical.
#[test]
fn mane_plus_clinical_extracted_from_tag() {
let lines = &[
"chr1\t.\tmRNA\t100\t500\t.\t+\t.\tID=tx1;tag=MANE_Plus_Clinical",
"chr1\t.\texon\t100\t500\t.\t+\t.\tParent=tx1",
];
let (txs, _) = build_db(lines, AnnotationFormat::Gff3);
assert!(matches!(
txs[0].mane_status,
crate::reference::transcript::ManeStatus::PlusClinical
));
}
// The `RefSeq` and `Ensembl` attributes are copied into the cross-reference
// fields unchanged.
#[test]
fn refseq_and_ensembl_cross_refs_extracted() {
let lines = &[
"chr1\t.\tmRNA\t100\t500\t.\t+\t.\tID=tx1;RefSeq=NM_000088.3;Ensembl=ENST00000000001",
"chr1\t.\texon\t100\t500\t.\t+\t.\tParent=tx1",
];
let (txs, _) = build_db(lines, AnnotationFormat::Gff3);
assert_eq!(txs[0].refseq_match.as_deref(), Some("NM_000088.3"));
assert_eq!(txs[0].ensembl_match.as_deref(), Some("ENST00000000001"));
}
}