use crate::error::FerroError;
use crate::hgvs::edit::{AminoAcidSeq, ExtDirection, NaEdit, ProteinEdit};
use crate::hgvs::interval::ProtInterval;
use crate::hgvs::location::{AminoAcid, ProtPos};
use crate::hgvs::variant::{HgvsVariant, LocEdit, ProteinVariant};
use crate::project::accession::parse_accession;
use crate::reference::transcript::Transcript;
use super::helpers::{
build_mutated_cds, first_diff_position, net_length_change, read_full_cds, translate_full_cds,
translate_full_cds_with_stop,
};
pub(crate) fn predict_indel_protein(
transcript: &Transcript,
cds_pos_start: i64,
cds_pos_end: i64,
edit: &NaEdit,
protein_accession: &str,
) -> Result<HgvsVariant, FerroError> {
let ref_cds = read_full_cds(transcript)?;
let mut_cds = build_mutated_cds(transcript, cds_pos_start, cds_pos_end, edit)?;
let ref_protein = translate_full_cds(&ref_cds);
let alt_protein = translate_full_cds(&mut_cds);
if alt_protein.len() > ref_protein.len() {
let ref_protein_with_stop = translate_full_cds_with_stop(&ref_cds);
if ref_protein_with_stop.last() == Some(&AminoAcid::Ter)
&& alt_protein.len() > ref_protein.len()
{
let stop_cds_start = (ref_protein.len() * 3 + 1) as i64; if cds_pos_start >= stop_cds_start
|| (cds_pos_end >= stop_cds_start && cds_pos_start <= stop_cds_start + 2)
{
return build_extension_variant(
&ref_protein,
&alt_protein,
&mut_cds,
protein_accession,
transcript,
);
}
}
}
let del_len = (cds_pos_end - cds_pos_start + 1) as usize;
let net =
net_length_change(edit, del_len).ok_or_else(|| FerroError::UnsupportedProjection {
reason: "cannot determine net length change for protein prediction".to_string(),
})?;
let is_frameshift = net % 3 != 0;
if is_frameshift {
build_frameshift_variant(
&ref_protein,
&alt_protein,
&mut_cds,
protein_accession,
transcript,
)
} else {
build_inframe_variant(
transcript,
cds_pos_start,
cds_pos_end,
edit,
net,
&ref_protein,
&alt_protein,
protein_accession,
)
}
}
#[allow(clippy::too_many_arguments)]
fn build_inframe_variant(
transcript: &Transcript,
cds_pos_start: i64,
cds_pos_end: i64,
edit: &NaEdit,
net: i64, ref_protein: &[AminoAcid],
alt_protein: &[AminoAcid],
protein_accession: &str,
) -> Result<HgvsVariant, FerroError> {
if ref_protein == alt_protein {
return build_identity_variant(ref_protein, protein_accession, transcript);
}
let frame_at_start = (cds_pos_start - 1) % 3;
match edit {
NaEdit::Deletion { .. } => {
if frame_at_start == 0 && net % 3 == 0 {
build_inframe_deletion(ref_protein, alt_protein, protein_accession, transcript)
} else {
build_inframe_delins(
ref_protein,
alt_protein,
cds_pos_start,
protein_accession,
transcript,
)
}
}
NaEdit::Insertion { .. } => {
if frame_at_start == 2 && net % 3 == 0 {
build_inframe_insertion(
ref_protein,
alt_protein,
cds_pos_start,
protein_accession,
transcript,
)
} else {
build_inframe_delins(
ref_protein,
alt_protein,
cds_pos_start,
protein_accession,
transcript,
)
}
}
NaEdit::Duplication { .. } => {
if frame_at_start == 0 && net % 3 == 0 {
build_inframe_duplication(
ref_protein,
alt_protein,
cds_pos_start,
cds_pos_end,
protein_accession,
transcript,
)
} else {
build_inframe_delins(
ref_protein,
alt_protein,
cds_pos_start,
protein_accession,
transcript,
)
}
}
NaEdit::Delins { .. } | NaEdit::Inversion { .. } => {
build_inframe_delins(
ref_protein,
alt_protein,
cds_pos_start,
protein_accession,
transcript,
)
}
_ => Err(FerroError::UnsupportedProjection {
reason: format!(
"build_inframe_variant does not support edit type for protein prediction: {:?}",
edit
),
}),
}
}
fn build_identity_variant(
ref_protein: &[AminoAcid],
protein_accession: &str,
transcript: &Transcript,
) -> Result<HgvsVariant, FerroError> {
let accession = parse_accession(protein_accession);
let ref_aa = ref_protein.first().copied().unwrap_or(AminoAcid::Met);
let loc = ProtInterval::point(ProtPos::new(ref_aa, 1));
let edit = crate::hgvs::edit::ProteinEdit::Identity {
predicted: false,
whole_protein: true,
};
let variant = ProteinVariant {
accession,
gene_symbol: transcript.gene_symbol.clone(),
loc_edit: LocEdit::new_predicted(loc, edit),
};
Ok(HgvsVariant::Protein(variant))
}
fn build_inframe_deletion(
ref_protein: &[AminoAcid],
alt_protein: &[AminoAcid],
protein_accession: &str,
transcript: &Transcript,
) -> Result<HgvsVariant, FerroError> {
let first_diff = first_diff_position(ref_protein, alt_protein);
let n_deleted = ref_protein.len().saturating_sub(alt_protein.len());
if n_deleted == 0 {
return build_identity_variant(ref_protein, protein_accession, transcript);
}
let last_deleted = first_diff + n_deleted - 1;
let start_aa = ref_protein
.get(first_diff)
.copied()
.unwrap_or(AminoAcid::Xaa);
let start_pos = (first_diff + 1) as u64;
let protein_edit = ProteinEdit::Deletion {
sequence: None,
count: None,
};
let loc = if n_deleted == 1 {
ProtInterval::point(ProtPos::new(start_aa, start_pos))
} else {
let end_aa = ref_protein
.get(last_deleted)
.copied()
.unwrap_or(AminoAcid::Xaa);
let end_pos = (last_deleted + 1) as u64;
ProtInterval::new(
ProtPos::new(start_aa, start_pos),
ProtPos::new(end_aa, end_pos),
)
};
let accession = parse_accession(protein_accession);
let variant = ProteinVariant {
accession,
gene_symbol: transcript.gene_symbol.clone(),
loc_edit: LocEdit::new_predicted(loc, protein_edit),
};
Ok(HgvsVariant::Protein(variant))
}
fn build_inframe_insertion(
ref_protein: &[AminoAcid],
alt_protein: &[AminoAcid],
cds_pos_start: i64, protein_accession: &str,
transcript: &Transcript,
) -> Result<HgvsVariant, FerroError> {
if alt_protein.len() <= ref_protein.len() {
let mut alt_with_stop = alt_protein.to_vec();
alt_with_stop.push(AminoAcid::Ter);
return build_inframe_delins(
ref_protein,
&alt_with_stop,
cds_pos_start,
protein_accession,
transcript,
);
}
let aa_before_idx = ((cds_pos_start - 1) / 3) as usize; let aa_before = ref_protein
.get(aa_before_idx)
.copied()
.unwrap_or(AminoAcid::Xaa);
let aa_before_pos = (aa_before_idx + 1) as u64;
let aa_after = ref_protein
.get(aa_before_idx + 1)
.copied()
.unwrap_or(AminoAcid::Ter);
let aa_after_pos = aa_before_pos + 1;
let n_inserted = alt_protein.len() - ref_protein.len();
let insert_end = (aa_before_idx + 1 + n_inserted).min(alt_protein.len());
let inserted_aas: Vec<AminoAcid> = alt_protein[aa_before_idx + 1..insert_end].to_vec();
let protein_edit = ProteinEdit::Insertion {
sequence: AminoAcidSeq::new(inserted_aas),
};
let loc = ProtInterval::new(
ProtPos::new(aa_before, aa_before_pos),
ProtPos::new(aa_after, aa_after_pos),
);
let accession = parse_accession(protein_accession);
let variant = ProteinVariant {
accession,
gene_symbol: transcript.gene_symbol.clone(),
loc_edit: LocEdit::new_predicted(loc, protein_edit),
};
Ok(HgvsVariant::Protein(variant))
}
fn build_inframe_duplication(
ref_protein: &[AminoAcid],
_alt_protein: &[AminoAcid],
cds_pos_start: i64,
cds_pos_end: i64,
protein_accession: &str,
transcript: &Transcript,
) -> Result<HgvsVariant, FerroError> {
let aa_start_idx = ((cds_pos_start - 1) / 3) as usize;
let aa_end_idx = ((cds_pos_end - 1) / 3) as usize;
let start_aa = ref_protein
.get(aa_start_idx)
.copied()
.unwrap_or(AminoAcid::Xaa);
let start_pos = (aa_start_idx + 1) as u64;
let loc = if aa_start_idx == aa_end_idx {
ProtInterval::point(ProtPos::new(start_aa, start_pos))
} else {
let end_aa = ref_protein
.get(aa_end_idx)
.copied()
.unwrap_or(AminoAcid::Xaa);
let end_pos = (aa_end_idx + 1) as u64;
ProtInterval::new(
ProtPos::new(start_aa, start_pos),
ProtPos::new(end_aa, end_pos),
)
};
let accession = parse_accession(protein_accession);
let variant = ProteinVariant {
accession,
gene_symbol: transcript.gene_symbol.clone(),
loc_edit: LocEdit::new_predicted(loc, ProteinEdit::Duplication),
};
Ok(HgvsVariant::Protein(variant))
}
fn build_inframe_delins(
ref_protein: &[AminoAcid],
alt_protein: &[AminoAcid],
cds_pos_start: i64,
protein_accession: &str,
transcript: &Transcript,
) -> Result<HgvsVariant, FerroError> {
let first_diff = first_diff_position(ref_protein, alt_protein);
let last_diff_ref = find_last_diff(ref_protein, alt_protein);
let last_diff_alt = find_last_diff_alt(ref_protein, alt_protein);
let ref_len = ref_protein.len();
let alt_len = alt_protein.len();
let _ = cds_pos_start;
if first_diff >= ref_len && first_diff >= alt_len {
return build_identity_variant(ref_protein, protein_accession, transcript);
}
let start_aa = ref_protein
.get(first_diff)
.copied()
.unwrap_or(AminoAcid::Xaa);
let start_pos = (first_diff + 1) as u64;
let end_ref = last_diff_ref.min(ref_len.saturating_sub(1));
let end_aa = ref_protein.get(end_ref).copied().unwrap_or(AminoAcid::Xaa);
let end_pos = (end_ref + 1) as u64;
let end_alt = last_diff_alt.min(alt_len.saturating_sub(1));
let inserted: Vec<AminoAcid> = if first_diff <= end_alt {
alt_protein[first_diff..=end_alt].to_vec()
} else {
vec![]
};
let protein_edit = ProteinEdit::Delins {
sequence: AminoAcidSeq::new(inserted),
};
let loc = if start_pos == end_pos {
ProtInterval::point(ProtPos::new(start_aa, start_pos))
} else {
ProtInterval::new(
ProtPos::new(start_aa, start_pos),
ProtPos::new(end_aa, end_pos),
)
};
let accession = parse_accession(protein_accession);
let variant = ProteinVariant {
accession,
gene_symbol: transcript.gene_symbol.clone(),
loc_edit: LocEdit::new_predicted(loc, protein_edit),
};
Ok(HgvsVariant::Protein(variant))
}
fn find_last_diff(ref_prot: &[AminoAcid], alt_prot: &[AminoAcid]) -> usize {
let ref_len = ref_prot.len();
let alt_len = alt_prot.len();
let max_tail = ref_len.min(alt_len);
for tail in 0..max_tail {
let ri = ref_len - 1 - tail;
let ai = alt_len - 1 - tail;
if ref_prot[ri] != alt_prot[ai] {
return ri;
}
}
ref_len.saturating_sub(alt_len)
}
fn find_last_diff_alt(ref_prot: &[AminoAcid], alt_prot: &[AminoAcid]) -> usize {
let ref_len = ref_prot.len();
let alt_len = alt_prot.len();
let max_tail = ref_len.min(alt_len);
for tail in 0..max_tail {
let ri = ref_len - 1 - tail;
let ai = alt_len - 1 - tail;
if ref_prot[ri] != alt_prot[ai] {
return ai;
}
}
alt_len.saturating_sub(ref_len)
}
fn build_frameshift_variant(
ref_protein: &[AminoAcid],
alt_protein: &[AminoAcid],
mut_cds: &str,
protein_accession: &str,
transcript: &Transcript,
) -> Result<HgvsVariant, FerroError> {
let first_diff = first_diff_position(ref_protein, alt_protein);
let aa_pos = (first_diff + 1) as u64; let ref_aa = ref_protein
.get(first_diff)
.copied()
.unwrap_or(AminoAcid::Xaa);
let new_aa = alt_protein.get(first_diff).copied();
let codon_byte_offset = first_diff * 3;
let ter_pos: Option<u64> = if codon_byte_offset < mut_cds.len() {
let downstream = &mut_cds[codon_byte_offset..];
let downstream_aas = translate_full_cds_with_stop(downstream);
downstream_aas
.iter()
.position(|aa| *aa == AminoAcid::Ter)
.map(|p| (p + 1) as u64)
} else {
None
};
let protein_edit = ProteinEdit::Frameshift { new_aa, ter_pos };
let loc = ProtInterval::point(ProtPos::new(ref_aa, aa_pos));
let accession = parse_accession(protein_accession);
let variant = ProteinVariant {
accession,
gene_symbol: transcript.gene_symbol.clone(),
loc_edit: LocEdit::new_predicted(loc, protein_edit),
};
Ok(HgvsVariant::Protein(variant))
}
fn build_extension_variant(
ref_protein: &[AminoAcid],
_alt_protein: &[AminoAcid],
mut_cds: &str,
protein_accession: &str,
transcript: &Transcript,
) -> Result<HgvsVariant, FerroError> {
let stop_pos = (ref_protein.len() + 1) as u64;
let stop_offset = ref_protein.len() * 3;
let new_aa: Option<AminoAcid> = if stop_offset + 3 <= mut_cds.len() {
let stop_codon_seq = &mut_cds[stop_offset..stop_offset + 3];
super::substitution::translate(stop_codon_seq).filter(|aa| *aa != AminoAcid::Ter)
} else {
None
};
let ext_count: Option<i64> = if stop_offset < mut_cds.len() {
let downstream = &mut_cds[stop_offset..];
let downstream_aas = translate_full_cds_with_stop(downstream);
downstream_aas
.iter()
.position(|aa| *aa == AminoAcid::Ter)
.map(|p| (p + 1) as i64) } else {
None
};
let protein_edit = ProteinEdit::Extension {
new_aa,
direction: ExtDirection::CTerminal,
count: ext_count,
};
let loc = ProtInterval::point(ProtPos::new(AminoAcid::Ter, stop_pos));
let accession = parse_accession(protein_accession);
let variant = ProteinVariant {
accession,
gene_symbol: transcript.gene_symbol.clone(),
loc_edit: LocEdit::new_predicted(loc, protein_edit),
};
Ok(HgvsVariant::Protein(variant))
}
#[cfg(test)]
mod tests {
use super::*;
use crate::reference::transcript::{Exon, ManeStatus, Strand};
use std::sync::OnceLock;
fn tx(seq: &str, cds_start: u64, cds_end: u64) -> Transcript {
Transcript {
id: "NM_TEST.1".to_string(),
gene_symbol: None,
strand: Strand::Plus,
sequence: Some(seq.to_string()),
cds_start: Some(cds_start),
cds_end: Some(cds_end),
exons: vec![Exon::new(1, 1, seq.len() as u64)],
chromosome: None,
genomic_start: None,
genomic_end: None,
genome_build: Default::default(),
mane_status: ManeStatus::default(),
refseq_match: None,
ensembl_match: None,
exon_cigars: Vec::new(),
cached_introns: OnceLock::new(),
}
}
fn prot_str(v: &HgvsVariant) -> String {
match v {
HgvsVariant::Protein(p) => p.to_string(),
_ => panic!("expected Protein variant"),
}
}
#[test]
fn del_single_base_frameshift_with_ter() {
let t = tx("ATGCGCTAA", 1, 9);
let edit = NaEdit::Deletion {
sequence: None,
length: None,
};
let result = predict_indel_protein(&t, 4, 4, &edit, "NP_TEST.1").unwrap();
let s = prot_str(&result);
assert!(s.contains("Arg2"), "expected Arg2 in '{}'", s);
assert!(s.contains("fs"), "expected fs in '{}'", s);
assert!(s.contains("Ala"), "expected Ala new_aa in '{}'", s);
}
#[test]
fn del_three_bases_codon_aligned_single_aa() {
let t = tx("ATGCGCTAA", 1, 9);
let edit = NaEdit::Deletion {
sequence: None,
length: None,
};
let result = predict_indel_protein(&t, 4, 6, &edit, "NP_TEST.1").unwrap();
let s = prot_str(&result);
assert_eq!(s, "NP_TEST.1:p.(Arg2del)");
}
#[test]
fn del_six_bases_codon_aligned_range() {
let t = tx("ATGCGCAAATAA", 1, 12);
let edit = NaEdit::Deletion {
sequence: None,
length: None,
};
let result = predict_indel_protein(&t, 4, 9, &edit, "NP_TEST.1").unwrap();
let s = prot_str(&result);
assert_eq!(s, "NP_TEST.1:p.(Arg2_Lys3del)");
}
#[test]
fn del_two_bases_non_aligned_frameshift() {
let t = tx("ATGCGCAAATAA", 1, 12);
let edit = NaEdit::Deletion {
sequence: None,
length: None,
};
let result = predict_indel_protein(&t, 4, 5, &edit, "NP_TEST.1").unwrap();
let s = prot_str(&result);
assert!(s.contains("fs"), "expected frameshift in '{}'", s);
}
#[test]
fn ins_three_bases_codon_aligned() {
let t = tx("ATGCGCTAA", 1, 9);
let seq: crate::hgvs::edit::Sequence = "GGG".parse().unwrap();
let edit = NaEdit::Insertion {
sequence: crate::hgvs::edit::InsertedSequence::Literal(seq),
};
let result = predict_indel_protein(&t, 3, 3, &edit, "NP_TEST.1").unwrap();
let s = prot_str(&result);
assert_eq!(s, "NP_TEST.1:p.(Met1_Arg2insGly)");
}
#[test]
fn ins_one_base_frameshift() {
let t = tx("ATGCGCTAA", 1, 9);
let seq: crate::hgvs::edit::Sequence = "A".parse().unwrap();
let edit = NaEdit::Insertion {
sequence: crate::hgvs::edit::InsertedSequence::Literal(seq),
};
let result = predict_indel_protein(&t, 3, 3, &edit, "NP_TEST.1").unwrap();
let s = prot_str(&result);
assert!(s.contains("fs"), "expected frameshift in '{}'", s);
}
#[test]
fn dup_three_bases_codon_aligned() {
let t = tx("ATGCGCTAA", 1, 9);
let edit = NaEdit::Duplication {
sequence: None,
length: None,
uncertain_extent: None,
};
let result = predict_indel_protein(&t, 4, 6, &edit, "NP_TEST.1").unwrap();
let s = prot_str(&result);
assert_eq!(s, "NP_TEST.1:p.(Arg2dup)");
}
#[test]
fn dup_one_base_frameshift() {
let t = tx("ATGCGCTAA", 1, 9);
let edit = NaEdit::Duplication {
sequence: None,
length: None,
uncertain_extent: None,
};
let result = predict_indel_protein(&t, 4, 4, &edit, "NP_TEST.1").unwrap();
let s = prot_str(&result);
assert!(s.contains("fs"), "expected frameshift in '{}'", s);
}
#[test]
fn delins_same_length_codon_aligned() {
let t = tx("ATGCGCTAA", 1, 9);
let seq: crate::hgvs::edit::Sequence = "TCC".parse().unwrap();
let edit = NaEdit::Delins {
sequence: crate::hgvs::edit::InsertedSequence::Literal(seq),
deleted: None,
deleted_length: None,
};
let result = predict_indel_protein(&t, 4, 6, &edit, "NP_TEST.1").unwrap();
let s = prot_str(&result);
assert!(s.contains("Arg2"), "expected Arg2 in '{}'", s);
assert!(s.contains("delins"), "expected delins in '{}'", s);
assert!(s.contains("Ser"), "expected Ser in '{}'", s);
}
#[test]
fn inversion_codon_cgc_to_gcg() {
let t = tx("ATGCGCTAA", 1, 9);
let edit = NaEdit::Inversion {
sequence: None,
length: None,
};
let result = predict_indel_protein(&t, 4, 6, &edit, "NP_TEST.1").unwrap();
let s = prot_str(&result);
assert!(s.contains("Arg2"), "expected Arg2 in '{}'", s);
assert!(s.contains("delins"), "expected delins in '{}'", s);
assert!(s.contains("Ala"), "expected Ala in '{}'", s);
}
#[test]
fn inframe_deletion_with_zero_diff_returns_identity_not_panic() {
let t = tx("ATGCGCTAA", 1, 9);
let ref_protein = [AminoAcid::Met, AminoAcid::Arg];
let alt_protein = [AminoAcid::Met, AminoAcid::Arg];
let result =
build_inframe_deletion(&ref_protein, &alt_protein, "NP_TEST.1", &t).expect("no panic");
let s = prot_str(&result);
assert!(s.contains("(="), "expected identity '(=' in '{}'", s);
}
#[test]
fn ins_three_bases_premature_stop_falls_back_to_delins() {
let t = tx("ATGCGCAAATAA", 1, 12);
let seq: crate::hgvs::edit::Sequence = "TAA".parse().unwrap();
let edit = NaEdit::Insertion {
sequence: crate::hgvs::edit::InsertedSequence::Literal(seq),
};
let result = predict_indel_protein(&t, 3, 3, &edit, "NP_TEST.1").unwrap();
let s = prot_str(&result);
assert_eq!(s, "NP_TEST.1:p.(Arg2_Lys3delinsTer)");
}
#[test]
fn stop_readthrough_extension() {
let t = tx("ATGCGCTAA", 1, 9);
let seq: crate::hgvs::edit::Sequence = "TGG".parse().unwrap();
let edit = NaEdit::Delins {
sequence: crate::hgvs::edit::InsertedSequence::Literal(seq),
deleted: None,
deleted_length: None,
};
let result = predict_indel_protein(&t, 7, 9, &edit, "NP_TEST.1").unwrap();
let s = prot_str(&result);
assert!(s.contains("Ter3"), "expected Ter3 in '{}'", s);
assert!(s.contains("Trp"), "expected Trp in '{}'", s);
assert!(s.contains("ext"), "expected ext in '{}'", s);
}
}