use crate::error::FerroError;
use crate::hgvs::edit::{InsertedSequence, NaEdit};
use crate::hgvs::location::AminoAcid;
use crate::reference::transcript::Transcript;
use crate::sequence::reverse_complement;
use super::substitution::translate;
/// Returns the transcript's coding sequence, upper-cased.
///
/// `cds_start` and `cds_end` are used as 1-based, inclusive positions into
/// `transcript.sequence`.
///
/// # Errors
///
/// * `FerroError::ConversionError` when the transcript has no CDS start or
///   no CDS end.
/// * `FerroError::ProteinSequenceUnavailable` when the transcript carries no
///   sequence, or when the CDS bounds do not fit inside that sequence.
pub(crate) fn read_full_cds(transcript: &Transcript) -> Result<String, FerroError> {
    let first = match transcript.cds_start {
        Some(pos) => pos as usize,
        None => {
            return Err(FerroError::ConversionError {
                msg: format!("transcript {} has no CDS start", transcript.id),
            })
        }
    };
    let last = match transcript.cds_end {
        Some(pos) => pos as usize,
        None => {
            return Err(FerroError::ConversionError {
                msg: format!("transcript {} has no CDS end", transcript.id),
            })
        }
    };
    let bases = match transcript.sequence.as_deref() {
        Some(bases) => bases,
        None => {
            return Err(FerroError::ProteinSequenceUnavailable {
                accession: transcript.id.clone(),
            })
        }
    };

    // `first == last + 1` is tolerated and yields an empty coding sequence.
    let bounds_ok = first >= 1 && last <= bases.len() && first <= last + 1;
    if !bounds_ok {
        return Err(FerroError::ProteinSequenceUnavailable {
            accession: transcript.id.clone(),
        });
    }

    let coding = &bases[first - 1..last];
    Ok(coding.to_uppercase())
}
/// Applies a nucleotide edit to the transcript's coding sequence and returns
/// the edited CDS, upper-cased.
///
/// `cds_pos_start` and `cds_pos_end` are 1-based, inclusive positions within
/// the CDS as returned by `read_full_cds`. The bases in that range are the
/// "affected" bases, and each edit kind uses them as follows:
///
/// * `Deletion` — the affected bases are removed.
/// * `Insertion` — the literal sequence is placed directly after the affected
///   bases, which are kept.
/// * `Duplication` — the affected bases are repeated once.
/// * `Delins` — the affected bases are replaced by the literal sequence.
/// * `Inversion` — the affected bases are replaced by their reverse complement.
/// * `Substitution` — the affected bases are replaced by the alternative base.
///
/// # Errors
///
/// * `FerroError::ConversionError` when the transcript has no CDS.
/// * `FerroError::ProteinSequenceUnavailable` when the CDS cannot be read, or
///   when the requested range is not a valid range inside the CDS
///   (`cds_pos_start < 1`, `cds_pos_start > cds_pos_end + 1`, or
///   `cds_pos_end` past the end of the CDS).
/// * `FerroError::UnsupportedProjection` for any other edit kind, or when an
///   insertion/delins does not carry a literal sequence.
pub(crate) fn build_mutated_cds(
    transcript: &Transcript,
    cds_pos_start: i64,
    cds_pos_end: i64,
    edit: &NaEdit,
) -> Result<String, FerroError> {
    // Checked up front so a transcript without a CDS reports this function's
    // own message rather than the one produced inside `read_full_cds`.
    transcript
        .cds_start
        .ok_or_else(|| FerroError::ConversionError {
            msg: format!("transcript {} has no CDS", transcript.id),
        })?;
    let full_cds = read_full_cds(transcript)?;

    // Validate before casting: a start below 1 would wrap to a huge `usize`,
    // and an inverted range would make the slices below panic. An empty range
    // (`cds_pos_start == cds_pos_end + 1`) remains allowed.
    let range_ok = cds_pos_start >= 1
        && cds_pos_end >= cds_pos_start - 1
        && (cds_pos_end as u64) <= full_cds.len() as u64;
    if !range_ok {
        return Err(FerroError::ProteinSequenceUnavailable {
            accession: transcript.id.clone(),
        });
    }

    // Both values are now known to lie in `0..=full_cds.len()`.
    let idx_start = (cds_pos_start - 1) as usize;
    let idx_end = cds_pos_end as usize;

    let before = &full_cds[..idx_start];
    let affected = &full_cds[idx_start..idx_end];
    let after = &full_cds[idx_end..];

    let mutated = match edit {
        NaEdit::Deletion { .. } => {
            format!("{}{}", before, after)
        }
        NaEdit::Insertion { sequence } => {
            let inserted = extract_literal_sequence(sequence, transcript)?;
            format!("{}{}{}{}", before, affected, inserted, after)
        }
        NaEdit::Duplication { .. } => {
            format!("{}{}{}{}", before, affected, affected, after)
        }
        NaEdit::Delins { sequence, .. } => {
            let inserted = extract_literal_sequence(sequence, transcript)?;
            format!("{}{}{}", before, inserted, after)
        }
        NaEdit::Inversion { .. } => {
            let rc = reverse_complement(affected);
            format!("{}{}{}", before, rc, after)
        }
        NaEdit::Substitution { alternative, .. } => {
            let alt_char = alternative.to_u8() as char;
            format!("{}{}{}", before, alt_char, after)
        }
        _ => {
            return Err(FerroError::UnsupportedProjection {
                reason: format!("build_mutated_cds does not support edit type: {:?}", edit),
            })
        }
    };

    // Inserted sequences may arrive in lower case; normalise the result.
    Ok(mutated.to_uppercase())
}
/// Returns the inserted bases as an owned string when they are given literally.
///
/// Any other `InsertedSequence` form is rejected with
/// `FerroError::UnsupportedProjection`. `transcript` is only used to name the
/// transcript in that error message.
fn extract_literal_sequence(
    seq: &InsertedSequence,
    transcript: &Transcript,
) -> Result<String, FerroError> {
    if let InsertedSequence::Literal(bases) = seq {
        return Ok(bases.to_string());
    }

    let reason = format!(
        "protein prediction requires a literal inserted sequence; \
         got non-literal for transcript {}",
        transcript.id
    );
    Err(FerroError::UnsupportedProjection { reason })
}
/// Translates `cds` codon by codon, stopping before the first stop codon.
///
/// The returned residues never include `AminoAcid::Ter`. A trailing partial
/// codon (fewer than three bytes) is ignored, and any codon for which
/// `translate` yields `None` is skipped rather than reported.
pub(crate) fn translate_full_cds(cds: &str) -> Vec<AminoAcid> {
    let mut protein = Vec::new();
    for codon in cds.as_bytes().chunks_exact(3) {
        let residue = match std::str::from_utf8(codon).ok().and_then(translate) {
            Some(residue) => residue,
            // Codon that is not valid UTF-8 or has no translation: skip it.
            None => continue,
        };
        if residue == AminoAcid::Ter {
            break;
        }
        protein.push(residue);
    }
    protein
}
pub(crate) fn translate_full_cds_with_stop(cds: &str) -> Vec<AminoAcid> {
let mut result = Vec::new();
for chunk in cds.as_bytes().chunks_exact(3) {
if let Ok(s) = std::str::from_utf8(chunk) {
if let Some(aa) = translate(s) {
result.push(aa);
if aa == AminoAcid::Ter {
break;
}
}
}
}
result
}
/// Returns the 0-based index of the first residue at which the two proteins
/// differ.
///
/// Only the overlapping prefix is compared. When that prefix is identical,
/// the length of the shorter protein is returned.
pub(crate) fn first_diff_position(ref_prot: &[AminoAcid], alt_prot: &[AminoAcid]) -> usize {
    for (idx, (ref_aa, alt_aa)) in ref_prot.iter().zip(alt_prot).enumerate() {
        if ref_aa != alt_aa {
            return idx;
        }
    }
    ref_prot.len().min(alt_prot.len())
}
/// Returns the net change in nucleotide count caused by `edit`.
///
/// `del_len` is the length of the affected reference range: it is subtracted
/// for deletions and delins, and added for duplications. Returns `None` for
/// edit kinds not handled here, or when the inserted sequence does not report
/// a length.
pub(crate) fn net_length_change(edit: &NaEdit, del_len: usize) -> Option<i64> {
    let removed = del_len as i64;
    match edit {
        // Length-neutral edits.
        NaEdit::Substitution { .. } | NaEdit::Inversion { .. } => Some(0),
        NaEdit::Deletion { .. } => Some(-removed),
        NaEdit::Duplication { .. } => Some(removed),
        NaEdit::Insertion { sequence } => {
            let added = sequence.len()?;
            Some(added as i64)
        }
        NaEdit::Delins { sequence, .. } => {
            let added = sequence.len()?;
            Some(added as i64 - removed)
        }
        _ => None,
    }
}
#[cfg(test)]
mod tests {
    use super::*;
    use crate::hgvs::edit::{InsertedSequence, Sequence};
    use crate::reference::transcript::{Exon, ManeStatus, Strand};
    use std::sync::OnceLock;

    /// Nine-base sequence shared by most cases; the whole sequence is the CDS.
    const SIMPLE_SEQ: &str = "ATGCGCTAA";

    /// Builds a plus-strand transcript with one exon spanning all of `seq`.
    fn tx(seq: &str, cds_start: u64, cds_end: u64) -> Transcript {
        Transcript {
            id: "NM_TEST.1".to_string(),
            gene_symbol: None,
            strand: Strand::Plus,
            sequence: Some(seq.to_string()),
            cds_start: Some(cds_start),
            cds_end: Some(cds_end),
            exons: vec![Exon::new(1, 1, seq.len() as u64)],
            chromosome: None,
            genomic_start: None,
            genomic_end: None,
            genome_build: Default::default(),
            mane_status: ManeStatus::default(),
            refseq_match: None,
            ensembl_match: None,
            exon_cigars: Vec::new(),
            cached_introns: OnceLock::new(),
        }
    }

    /// Transcript over `SIMPLE_SEQ` with no UTRs.
    fn simple_tx() -> Transcript {
        tx(SIMPLE_SEQ, 1, 9)
    }

    /// Parses `bases` into a literal inserted sequence.
    fn literal(bases: &str) -> InsertedSequence {
        let parsed: Sequence = bases.parse().unwrap();
        InsertedSequence::Literal(parsed)
    }

    fn deletion() -> NaEdit {
        NaEdit::Deletion {
            sequence: None,
            length: None,
        }
    }

    fn duplication() -> NaEdit {
        NaEdit::Duplication {
            sequence: None,
            length: None,
            uncertain_extent: None,
        }
    }

    fn inversion() -> NaEdit {
        NaEdit::Inversion {
            sequence: None,
            length: None,
        }
    }

    #[test]
    fn read_full_cds_no_utr() {
        let transcript = simple_tx();
        assert_eq!(read_full_cds(&transcript).unwrap(), "ATGCGCTAA");
    }

    #[test]
    fn read_full_cds_with_5utr() {
        let transcript = tx("AAATGCCCTAG", 3, 11);
        assert_eq!(read_full_cds(&transcript).unwrap(), "ATGCCCTAG");
    }

    #[test]
    fn mutated_cds_deletion_single_base() {
        let mutated = build_mutated_cds(&simple_tx(), 4, 4, &deletion()).unwrap();
        assert_eq!(mutated, "ATGGCTAA");
    }

    #[test]
    fn mutated_cds_deletion_three_bases() {
        let mutated = build_mutated_cds(&simple_tx(), 4, 6, &deletion()).unwrap();
        assert_eq!(mutated, "ATGTAA");
    }

    #[test]
    fn mutated_cds_insertion() {
        let edit = NaEdit::Insertion {
            sequence: literal("GGG"),
        };
        let mutated = build_mutated_cds(&simple_tx(), 3, 3, &edit).unwrap();
        assert_eq!(mutated, "ATGGGGCGCTAA");
    }

    #[test]
    fn mutated_cds_duplication() {
        let mutated = build_mutated_cds(&simple_tx(), 4, 6, &duplication()).unwrap();
        assert_eq!(mutated, "ATGCGCCGCTAA");
    }

    #[test]
    fn mutated_cds_delins() {
        let edit = NaEdit::Delins {
            sequence: literal("TCC"),
            deleted: None,
            deleted_length: None,
        };
        let mutated = build_mutated_cds(&simple_tx(), 4, 6, &edit).unwrap();
        assert_eq!(mutated, "ATGTCCTAA");
    }

    #[test]
    fn mutated_cds_inversion() {
        let mutated = build_mutated_cds(&simple_tx(), 4, 6, &inversion()).unwrap();
        assert_eq!(mutated, "ATGGCGTAA");
    }

    #[test]
    fn translate_full_cds_met_arg() {
        let protein = translate_full_cds("ATGCGCTAA");
        assert_eq!(protein, vec![AminoAcid::Met, AminoAcid::Arg]);
    }

    #[test]
    fn translate_full_cds_with_stop_includes_ter() {
        let protein = translate_full_cds_with_stop("ATGCGCTAA");
        assert_eq!(
            protein,
            vec![AminoAcid::Met, AminoAcid::Arg, AminoAcid::Ter]
        );
    }

    #[test]
    fn translate_full_cds_incomplete_codon_dropped() {
        let protein = translate_full_cds("ATGCGCTA");
        assert_eq!(protein, vec![AminoAcid::Met, AminoAcid::Arg]);
    }

    #[test]
    fn first_diff_identical() {
        let reference = vec![AminoAcid::Met, AminoAcid::Arg];
        let alternate = vec![AminoAcid::Met, AminoAcid::Arg];
        assert_eq!(first_diff_position(&reference, &alternate), 2);
    }

    #[test]
    fn first_diff_at_start() {
        let reference = vec![AminoAcid::Met, AminoAcid::Arg];
        let alternate = vec![AminoAcid::Val, AminoAcid::Arg];
        assert_eq!(first_diff_position(&reference, &alternate), 0);
    }

    #[test]
    fn first_diff_second_position() {
        let reference = vec![AminoAcid::Met, AminoAcid::Arg];
        let alternate = vec![AminoAcid::Met, AminoAcid::Ser];
        assert_eq!(first_diff_position(&reference, &alternate), 1);
    }

    #[test]
    fn net_change_deletion() {
        assert_eq!(net_length_change(&deletion(), 3), Some(-3));
    }

    #[test]
    fn net_change_insertion() {
        let edit = NaEdit::Insertion {
            sequence: literal("GGG"),
        };
        assert_eq!(net_length_change(&edit, 0), Some(3));
    }

    #[test]
    fn net_change_duplication() {
        assert_eq!(net_length_change(&duplication(), 3), Some(3));
    }

    #[test]
    fn net_change_inversion() {
        assert_eq!(net_length_change(&inversion(), 6), Some(0));
    }
}