use std::path::Path;
use crate::reference::provider::ReferenceProvider;
use crate::reference::transcript::{Strand, Transcript};
use super::diagnostics::{DiagnosticPayload, LoaderDiagnostic, LoaderReport, SourceLocation};
/// Records loader warnings about the coding sequence of `tx` into `report`.
///
/// Nothing is checked unless the transcript carries both a CDS start and a
/// CDS end. For coding transcripts two independent checks run, in this order:
///
/// * `W-LOAD-200` — the CDS length (`cds_end - cds_start + 1`, computed with
///   saturating arithmetic) is not a multiple of three.
/// * `W-LOAD-201` — the first three CDS bases read from `fasta` are not one
///   of `ATG`, `CTG`, `GTG` or `TTG` (compared after upper-casing).
///
/// The start-codon check is skipped silently when the transcript has no
/// chromosome or when the codon cannot be extracted. Every diagnostic is
/// attributed to `source_path` with line `0`.
pub(crate) fn validate_transcript(
    tx: &Transcript,
    fasta: &dyn ReferenceProvider,
    source_path: &Path,
    report: &mut LoaderReport,
) {
    // Transcripts without a complete CDS annotation have nothing to validate.
    let (Some(cds_start), Some(cds_end)) = (tx.cds_start, tx.cds_end) else {
        return;
    };

    // Both diagnostics point at the same place: the source file, line 0.
    let location = || SourceLocation {
        path: source_path.to_path_buf(),
        line: 0,
    };

    // Check 1: reading-frame consistency of the CDS length.
    let cds_len = cds_end.saturating_sub(cds_start).saturating_add(1);
    let in_frame = cds_len % 3 == 0;
    if !in_frame {
        let message = format!("CdsLengthNotMod3: tx {} cds_length={}", tx.id, cds_len);
        let payload = DiagnosticPayload::CdsLengthNotMod3 {
            transcript_id: tx.id.clone(),
            length: cds_len,
        };
        report.record(LoaderDiagnostic::warning(
            "W-LOAD-200",
            message,
            location(),
            Some(tx.id.clone()),
            payload,
        ));
    }

    // Check 2: start codon. Needs a chromosome to query the reference.
    let Some(chrom) = tx.chromosome.as_deref() else {
        return;
    };
    let Some(raw_codon) = extract_first_codon(tx, chrom, fasta, cds_start) else {
        return;
    };
    let codon_uc = raw_codon.to_uppercase();
    if matches!(codon_uc.as_str(), "ATG" | "CTG" | "GTG" | "TTG") {
        return;
    }

    // The message must be built before `codon_uc` is moved into the payload.
    let message = format!("NonCanonicalStartCodon: tx {} codon={}", tx.id, codon_uc);
    let payload = DiagnosticPayload::NonCanonicalStartCodon {
        transcript_id: tx.id.clone(),
        codon: codon_uc,
    };
    report.record(LoaderDiagnostic::warning(
        "W-LOAD-201",
        message,
        location(),
        Some(tx.id.clone()),
        payload,
    ));
}
/// Assembles the first three CDS bases of `tx` from the reference sequence,
/// following the codon across exon boundaries when it is split.
///
/// Exons are visited in the order stored on the transcript. For each exon
/// that overlaps the current transcript position, the needed stretch is
/// projected onto the exon's genomic coordinates (forward from
/// `genomic_start` on the plus strand, backward from `genomic_end` on the
/// minus strand, where the fetched bases are reverse-complemented).
///
/// The provider is queried with `(g_lo - 1, g_hi)`: exon genomic coordinates
/// are treated as 1-based inclusive and the provider range as 0-based
/// half-open. NOTE(review): this matches the test stub in this file; confirm
/// against the production `ReferenceProvider`.
///
/// Returns `None` when:
/// * an overlapping exon lacks genomic coordinates,
/// * the strand is `Strand::Unknown`,
/// * the genomic projection would overflow or underflow (malformed exon),
/// * the provider fails or returns a different number of bases than asked,
/// * fewer than three bases could be collected.
fn extract_first_codon(
    tx: &Transcript,
    chrom: &str,
    fasta: &dyn ReferenceProvider,
    cds_start: u64,
) -> Option<String> {
    const TARGET: u64 = 3;
    let mut codon = String::with_capacity(3);
    // Next transcript position whose base is still missing from `codon`.
    let mut tx_pos = cds_start;
    for ex in &tx.exons {
        if codon.len() as u64 >= TARGET {
            break;
        }
        // Exon lies entirely before the position we still need.
        if ex.end < tx_pos {
            continue;
        }
        let (gs, ge) = match (ex.genomic_start, ex.genomic_end) {
            (Some(gs), Some(ge)) => (gs, ge),
            _ => return None,
        };
        let local_start_tx = tx_pos.max(ex.start);
        // Only reachable for an inverted exon (`start > end`); skip it.
        if local_start_tx > ex.end {
            continue;
        }
        let remaining = TARGET - codon.len() as u64;
        let avail = ex.end - local_start_tx + 1;
        let take = remaining.min(avail);
        let local_end_tx = local_start_tx + take - 1;

        // Offsets of the wanted stretch from the exon's transcript start.
        let first_off = local_start_tx - ex.start;
        let last_off = local_end_tx - ex.start;

        // Checked arithmetic: this runs on unvalidated annotations, so an
        // exon whose genomic coordinates cannot hold its transcript span
        // must yield `None` instead of panicking or wrapping around.
        let chunk = match tx.strand {
            Strand::Plus => {
                let g_lo = gs.checked_add(first_off)?;
                let g_hi = gs.checked_add(last_off)?;
                fasta
                    .get_sequence(chrom, g_lo.saturating_sub(1), g_hi)
                    .ok()?
            }
            Strand::Minus => {
                // Transcript order runs from high to low genomic positions.
                let g_hi = ge.checked_sub(first_off)?;
                let g_lo = ge.checked_sub(last_off)?;
                let raw = fasta
                    .get_sequence(chrom, g_lo.saturating_sub(1), g_hi)
                    .ok()?;
                reverse_complement(&raw)
            }
            Strand::Unknown => return None,
        };

        // `tx_pos` advances by `take` below, so a chunk of any other length
        // would misalign the codon; treat it as "could not extract".
        if chunk.len() as u64 != take {
            return None;
        }
        codon.push_str(&chunk);
        tx_pos = local_end_tx + 1;
    }
    (codon.len() as u64 == TARGET).then_some(codon)
}
/// Returns the reverse complement of the DNA sequence `s`.
///
/// The input is read back to front and each base is replaced by its
/// complement. The full IUPAC DNA alphabet is handled: besides `A`/`T` and
/// `C`/`G`, the ambiguity codes are complemented (`R`/`Y`, `K`/`M`, `B`/`V`,
/// `D`/`H`) and the self-complementary codes `S`, `W` and `N` map to
/// themselves. Recognised codes are matched case-insensitively and always
/// emitted in upper case. Any other character (gaps, `U`, non-ASCII, ...)
/// is passed through unchanged.
fn reverse_complement(s: &str) -> String {
    s.chars()
        .rev()
        .map(|c| match c.to_ascii_uppercase() {
            'A' => 'T',
            'T' => 'A',
            'G' => 'C',
            'C' => 'G',
            // Two-base ambiguity codes swap with their complementary set.
            'R' => 'Y',
            'Y' => 'R',
            'K' => 'M',
            'M' => 'K',
            // Three-base ambiguity codes.
            'B' => 'V',
            'V' => 'B',
            'D' => 'H',
            'H' => 'D',
            // Self-complementary codes.
            'S' => 'S',
            'W' => 'W',
            'N' => 'N',
            // Not a DNA code: keep the original character, case included.
            _ => c,
        })
        .collect()
}
#[cfg(test)]
mod tests {
    use super::*;
    use crate::reference::provider::ReferenceProvider;
    use crate::reference::transcript::{Exon, GenomeBuild, ManeStatus, Transcript};
    use std::sync::OnceLock;

    /// Diagnostic code recorded when the CDS length is not a multiple of 3.
    const CDS_NOT_MOD3: &str = "W-LOAD-200";
    /// Diagnostic code recorded for a non-canonical start codon.
    const NON_CANONICAL_START: &str = "W-LOAD-201";

    /// In-memory reference: one sequence served for every chromosome name.
    struct StubFasta {
        seq: String,
    }

    impl ReferenceProvider for StubFasta {
        /// The stub knows no transcripts; every lookup fails.
        fn get_transcript(&self, id: &str) -> Result<Transcript, crate::error::FerroError> {
            Err(crate::error::FerroError::ReferenceNotFound { id: id.to_string() })
        }

        /// Slices the stored sequence as the half-open range `start..end`.
        /// Panics when the range is out of bounds.
        fn get_sequence(
            &self,
            _chrom: &str,
            start: u64,
            end: u64,
        ) -> Result<String, crate::error::FerroError> {
            let window = (start as usize)..(end as usize);
            Ok(self.seq[window].to_string())
        }
    }

    /// Shared fixture constructor; only the fields the tests vary are
    /// parameters, the rest is fixed boilerplate.
    fn build_tx(
        id: &'static str,
        strand: Strand,
        exons: Vec<Exon>,
        cds: (u64, u64),
        genomic: (u64, u64),
    ) -> Transcript {
        Transcript {
            id: id.into(),
            gene_symbol: None,
            strand,
            sequence: None,
            cds_start: Some(cds.0),
            cds_end: Some(cds.1),
            exons,
            chromosome: Some("chr1".into()),
            genomic_start: Some(genomic.0),
            genomic_end: Some(genomic.1),
            genome_build: GenomeBuild::GRCh38,
            mane_status: ManeStatus::None,
            refseq_match: None,
            ensembl_match: None,
            exon_cigars: vec![],
            cached_introns: OnceLock::new(),
        }
    }

    /// Plus-strand transcript with a single exon covering 1..=100 in both
    /// transcript and genomic coordinates.
    fn tx_with_cds(cds_start: u64, cds_end: u64) -> Transcript {
        build_tx(
            "tx1",
            Strand::Plus,
            vec![Exon::with_genomic(1, 1, 100, 1, 100)],
            (cds_start, cds_end),
            (1, 100),
        )
    }

    /// Sequence of `len` `N`s with the given bases planted at 1-based
    /// positions.
    fn seq_with(planted: &[(u64, char)], len: usize) -> String {
        let mut bases: Vec<u8> = std::iter::repeat(b'N').take(len).collect();
        for (pos1, base) in planted.iter().copied() {
            let idx = (pos1 - 1) as usize;
            bases[idx] = base as u8;
        }
        String::from_utf8(bases).unwrap()
    }

    /// Two-exon transcript; each exon tuple is
    /// `(tx_start, tx_end, genomic_start, genomic_end)`.
    fn two_exon_tx(
        strand: Strand,
        ex1: (u64, u64, u64, u64),
        ex2: (u64, u64, u64, u64),
        cds_start: u64,
        cds_end: u64,
    ) -> Transcript {
        let (a_start, a_end, a_gs, a_ge) = ex1;
        let (b_start, b_end, b_gs, b_ge) = ex2;
        build_tx(
            "tx_split",
            strand,
            vec![
                Exon::with_genomic(1, a_start, a_end, a_gs, a_ge),
                Exon::with_genomic(2, b_start, b_end, b_gs, b_ge),
            ],
            (cds_start, cds_end),
            (a_gs.min(b_gs), a_ge.max(b_ge)),
        )
    }

    /// Validates `tx` against a stub reference holding `seq` and returns the
    /// resulting report.
    fn validate_against(tx: &Transcript, seq: impl Into<String>) -> LoaderReport {
        let fasta = StubFasta { seq: seq.into() };
        let mut report = LoaderReport::default();
        validate_transcript(tx, &fasta, Path::new("t"), &mut report);
        report
    }

    #[test]
    fn warns_when_cds_length_not_mod3() {
        let report = validate_against(&tx_with_cds(1, 5), "ATGCCCAAATAGNNNNNNN");
        assert_eq!(report.diagnostics_by_code.get(CDS_NOT_MOD3), Some(&1));
    }

    #[test]
    fn does_not_warn_when_cds_length_mod3_and_atg_start() {
        let report = validate_against(&tx_with_cds(1, 9), "ATGCCCAAATAGNNNNNNN");
        assert_eq!(report.diagnostics_by_code.get(CDS_NOT_MOD3), None);
        assert_eq!(report.diagnostics_by_code.get(NON_CANONICAL_START), None);
    }

    #[test]
    fn plus_strand_start_codon_split_1_2_across_exon_junction() {
        // A | TG: one base in exon 1, two in exon 2.
        let tx = two_exon_tx(Strand::Plus, (1, 1, 10, 10), (2, 10, 50, 58), 1, 9);
        let seq = seq_with(&[(10, 'A'), (50, 'T'), (51, 'G')], 100);
        let report = validate_against(&tx, seq);
        assert_eq!(report.diagnostics_by_code.get(CDS_NOT_MOD3), None);
        assert_eq!(report.diagnostics_by_code.get(NON_CANONICAL_START), None);
    }

    #[test]
    fn plus_strand_start_codon_split_2_1_across_exon_junction() {
        // AT | G: two bases in exon 1, one in exon 2.
        let tx = two_exon_tx(Strand::Plus, (1, 2, 10, 11), (3, 9, 50, 56), 1, 9);
        let seq = seq_with(&[(10, 'A'), (11, 'T'), (50, 'G')], 100);
        let report = validate_against(&tx, seq);
        assert_eq!(report.diagnostics_by_code.get(NON_CANONICAL_START), None);
    }

    #[test]
    fn minus_strand_start_codon_split_1_2_across_exon_junction() {
        // Planted bases are the genomic (plus-strand) complement of ATG.
        let tx = two_exon_tx(Strand::Minus, (1, 1, 80, 80), (2, 10, 20, 28), 1, 9);
        let seq = seq_with(&[(80, 'T'), (28, 'A'), (27, 'C')], 100);
        let report = validate_against(&tx, seq);
        assert_eq!(report.diagnostics_by_code.get(NON_CANONICAL_START), None);
    }

    #[test]
    fn minus_strand_start_codon_split_2_1_across_exon_junction() {
        let tx = two_exon_tx(Strand::Minus, (1, 2, 80, 81), (3, 9, 20, 26), 1, 9);
        let seq = seq_with(&[(81, 'T'), (80, 'A'), (26, 'C')], 100);
        let report = validate_against(&tx, seq);
        assert_eq!(report.diagnostics_by_code.get(NON_CANONICAL_START), None);
    }

    #[test]
    fn split_start_codon_warns_when_not_canonical() {
        // G | AA assembles to GAA, which is not an accepted start codon.
        let tx = two_exon_tx(Strand::Plus, (1, 1, 10, 10), (2, 10, 50, 58), 1, 9);
        let seq = seq_with(&[(10, 'G'), (50, 'A'), (51, 'A')], 100);
        let report = validate_against(&tx, seq);
        assert_eq!(report.diagnostics_by_code.get(NON_CANONICAL_START), Some(&1));
    }

    #[test]
    fn warns_on_non_canonical_start_codon() {
        let report = validate_against(&tx_with_cds(1, 9), "AAATTTGGGNNNNNNNNNN");
        assert_eq!(report.diagnostics_by_code.get(NON_CANONICAL_START), Some(&1));
    }
}