use std::fmt;
use crate::codon::{complement, reverse_complement};
use crate::consequence::helpers::compute_cdna_position_exonic;
use crate::error::VarEffectError;
use crate::fasta::FastaReader;
use crate::locate::helpers::{
compute_cds_offset, compute_utr_offset_3prime, compute_utr_offset_5prime,
};
use crate::locate::{LocateIndex, SpliceSide, VariantLocation, locate_variant};
use crate::types::{Strand, TranscriptModel};
/// Anchor coordinate of an HGVS position, before any intronic offset is appended.
///
/// The variant decides how the number is rendered by the `Display` impl.
#[derive(Debug, Clone, PartialEq, Eq)]
enum HgvsPosition {
    /// Coding-sequence position, rendered as a bare number (`742`).
    Cds(u32),
    /// 5' UTR offset, rendered exactly as stored; the sign is part of the value (`-15`).
    FivePrimeUtr(i64),
    /// 3' UTR offset, rendered with a leading `*` (`*42`).
    ThreePrimeUtr(i64),
    /// Position on a non-coding transcript, rendered as a bare number (`76`).
    NonCoding(u32),
}

impl fmt::Display for HgvsPosition {
    /// Writes the position number only; the `c.` / `n.` prefix is added by callers.
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        match *self {
            // Both are plain unsigned numbers, so they share one arm.
            Self::Cds(number) | Self::NonCoding(number) => write!(f, "{number}"),
            Self::FivePrimeUtr(signed) => write!(f, "{signed}"),
            Self::ThreePrimeUtr(past_cds) => write!(f, "*{past_cds}"),
        }
    }
}
/// Renders an anchor position, optionally followed by a signed intronic offset.
///
/// * `None`: the anchor alone (`742`).
/// * positive offset: `anchor+offset` (`742+5`).
/// * negative offset: `anchor-offset` (`743-3`).
/// * zero offset: the anchor alone. A zero offset means "on the anchor base";
///   appending it would produce `7420`, which reads as a different coordinate.
fn format_position(anchor: &HgvsPosition, intronic_offset: Option<i64>) -> String {
    match intronic_offset {
        None | Some(0) => anchor.to_string(),
        Some(offset) if offset > 0 => format!("{anchor}+{offset}"),
        // Negative: the minus sign is supplied by the number's own formatting.
        Some(offset) => format!("{anchor}{offset}"),
    }
}
/// Builds the HGVS anchor for the exon boundary that an intronic position is counted from.
///
/// The anchor exon is `exons[intron_index]` when `is_donor` is true and
/// `exons[intron_index + 1]` otherwise. The boundary base is chosen from that exon's
/// genomic interval according to the transcript strand and then classified:
///
/// * transcript without CDS segments: `HgvsPosition::NonCoding`, using
///   `compute_cdna_position_exonic` (falling back to `1` when it yields no position);
/// * boundary before the CDS in transcript orientation: `HgvsPosition::FivePrimeUtr`;
/// * boundary after the CDS in transcript orientation: `HgvsPosition::ThreePrimeUtr`;
/// * otherwise: `HgvsPosition::Cds` with `cds_offset + 1`.
///
/// # Errors
///
/// Returns `VarEffectError::Malformed` when the anchor exon index is out of range, or
/// when a transcript with CDS segments has no `cds_genomic_start` / `cds_genomic_end`.
/// Errors from the offset helpers are propagated unchanged.
fn exon_boundary_hgvs_anchor(
    intron_index: u16,
    is_donor: bool,
    transcript: &TranscriptModel,
    index: &LocateIndex,
) -> Result<HgvsPosition, VarEffectError> {
    let exon_idx = if is_donor {
        usize::from(intron_index)
    } else {
        usize::from(intron_index) + 1
    };
    // Checked lookup: a bad intron index becomes an error instead of a panic.
    let exon = transcript.exons.get(exon_idx).ok_or_else(|| {
        VarEffectError::Malformed(format!(
            "{}: no exon at index {exon_idx} flanking intron {intron_index}",
            transcript.accession,
        ))
    })?;

    // NOTE(review): `genomic_end - 1` suggests exon intervals are half-open
    // `[genomic_start, genomic_end)` — confirm against `TranscriptModel`.
    let boundary_pos = match (is_donor, transcript.strand) {
        (true, Strand::Plus) | (false, Strand::Minus) => exon.genomic_end - 1,
        (true, Strand::Minus) | (false, Strand::Plus) => exon.genomic_start,
    };

    if transcript.cds_segments.is_empty() {
        let cdna = compute_cdna_position_exonic(boundary_pos, transcript).unwrap_or(1);
        return Ok(HgvsPosition::NonCoding(cdna));
    }

    let cds_start = transcript.cds_genomic_start.ok_or_else(|| {
        VarEffectError::Malformed(format!(
            "{}: coding transcript has no cds_genomic_start",
            transcript.accession,
        ))
    })?;
    let cds_end = transcript.cds_genomic_end.ok_or_else(|| {
        VarEffectError::Malformed(format!(
            "{}: coding transcript has no cds_genomic_end",
            transcript.accession,
        ))
    })?;

    // Genomic "before/after the CDS" maps to 5'/3' on the plus strand and is swapped
    // on the minus strand.
    let below_cds = boundary_pos < cds_start;
    let at_or_past_cds_end = boundary_pos >= cds_end;
    let (is_5utr, is_3utr) = match transcript.strand {
        Strand::Plus => (below_cds, at_or_past_cds_end),
        Strand::Minus => (at_or_past_cds_end, below_cds),
    };

    if is_5utr {
        let offset = compute_utr_offset_5prime(boundary_pos, exon_idx, transcript, index)?;
        Ok(HgvsPosition::FivePrimeUtr(offset))
    } else if is_3utr {
        let offset = compute_utr_offset_3prime(boundary_pos, exon_idx, transcript, index)?;
        Ok(HgvsPosition::ThreePrimeUtr(offset))
    } else {
        let cds = compute_cds_offset(boundary_pos, exon_idx, transcript, index)?;
        Ok(HgvsPosition::Cds(cds.cds_offset + 1))
    }
}
/// Renders the HGVS position string (without accession or `c.`/`n.` prefix) for an
/// already-located variant.
///
/// * Exonic CDS / UTR locations are rendered from the offsets stored in `location`.
/// * Splice and intronic locations are rendered as `<exon-boundary anchor><signed offset>`,
///   with the anchor from `exon_boundary_hgvs_anchor`. Donor-side offsets are positive;
///   acceptor-side offsets are negative.
/// * `NonCodingExon`, `Upstream`, `Downstream` and `Distal` yield `Ok(None)`. This
///   function receives no genomic coordinate, so callers that can number a non-coding
///   exon handle that case themselves before calling it.
///
/// # Errors
///
/// Propagates any error from `exon_boundary_hgvs_anchor`.
fn position_for_variant_location(
    location: &VariantLocation,
    transcript: &TranscriptModel,
    index: &LocateIndex,
) -> Result<Option<String>, VarEffectError> {
    let rendered = match *location {
        VariantLocation::CdsExon { cds_offset, .. } => {
            // `+ 1` converts the stored offset into the displayed position.
            format_position(&HgvsPosition::Cds(cds_offset + 1), None)
        }
        VariantLocation::FivePrimeUtr {
            offset_from_cds_start,
            ..
        } => format_position(&HgvsPosition::FivePrimeUtr(offset_from_cds_start), None),
        VariantLocation::ThreePrimeUtr {
            offset_from_cds_end,
            ..
        } => format_position(&HgvsPosition::ThreePrimeUtr(offset_from_cds_end), None),
        VariantLocation::SpliceDonor {
            intron_index,
            offset,
        } => {
            let anchor = exon_boundary_hgvs_anchor(intron_index, true, transcript, index)?;
            format_position(&anchor, Some(offset as i64))
        }
        VariantLocation::SpliceAcceptor {
            intron_index,
            offset,
        } => {
            let anchor = exon_boundary_hgvs_anchor(intron_index, false, transcript, index)?;
            // Acceptor offsets are stored unsigned and rendered as negative.
            format_position(&anchor, Some(-(offset as i64)))
        }
        VariantLocation::SpliceRegion {
            intron_index,
            side,
            distance,
        } => {
            let is_donor = matches!(side, SpliceSide::Donor);
            let anchor = exon_boundary_hgvs_anchor(intron_index, is_donor, transcript, index)?;
            format_position(&anchor, Some(distance))
        }
        VariantLocation::Intron {
            intron_index,
            distance_to_nearest_exon,
        }
        | VariantLocation::NonCodingIntron {
            intron_index,
            distance_to_nearest_exon,
        } => {
            // A positive distance means the donor-side exon is the anchor.
            let is_donor = distance_to_nearest_exon > 0;
            let anchor = exon_boundary_hgvs_anchor(intron_index, is_donor, transcript, index)?;
            format_position(&anchor, Some(distance_to_nearest_exon))
        }
        VariantLocation::NonCodingExon { .. }
        | VariantLocation::Upstream { .. }
        | VariantLocation::Downstream { .. }
        | VariantLocation::Distal => return Ok(None),
    };
    Ok(Some(rendered))
}
/// Locates the single base at genomic `pos` on `transcript` and renders its HGVS
/// position string (no accession, no `c.`/`n.` prefix).
///
/// A `NonCodingExon` hit is numbered here with `compute_cdna_position_exonic`
/// (falling back to `1` when that yields nothing). Every other location is delegated
/// to `position_for_variant_location`, which may return `Ok(None)`.
///
/// # Errors
///
/// Propagates errors from `locate_variant` and `position_for_variant_location`.
fn position_for_genomic(
    chrom: &str,
    pos: u64,
    transcript: &TranscriptModel,
    index: &LocateIndex,
) -> Result<Option<String>, VarEffectError> {
    // The variant is located as the one-base interval [pos, pos + 1).
    match locate_variant(chrom, pos, pos + 1, transcript, index)? {
        VariantLocation::NonCodingExon { .. } => {
            let cdna = compute_cdna_position_exonic(pos, transcript).unwrap_or(1);
            Ok(Some(HgvsPosition::NonCoding(cdna).to_string()))
        }
        other => position_for_variant_location(&other, transcript, index),
    }
}
/// Reports whether an insertion repeats the reference bases adjacent to it.
///
/// The window compared against `inserted_bases` has the same length as the insertion:
///
/// * plus strand: `[ins_pos - len, ins_pos)`, the bases just before the insertion point;
/// * minus strand: `[ins_pos, ins_pos + len)`, the bases just after it on the genome.
///
/// Returns `Ok(false)` without touching the FASTA when the insertion is empty, when the
/// plus-strand window would start before coordinate 0, or when the minus-strand window
/// would run past a known chromosome length. The comparison is byte-exact.
///
/// # Errors
///
/// Propagates errors from `FastaReader::fetch_sequence`.
fn is_duplication(
    ins_pos: u64,
    inserted_bases: &[u8],
    chrom: &str,
    transcript: &TranscriptModel,
    fasta: &FastaReader,
) -> Result<bool, VarEffectError> {
    let span = inserted_bases.len() as u64;
    if span == 0 {
        return Ok(false);
    }

    let (window_start, window_end) = match transcript.strand {
        Strand::Plus => match ins_pos.checked_sub(span) {
            Some(begin) => (begin, ins_pos),
            // Not enough sequence before the insertion point.
            None => return Ok(false),
        },
        Strand::Minus => {
            let stop = ins_pos + span;
            if matches!(fasta.chrom_length(chrom), Some(len) if stop > len) {
                return Ok(false);
            }
            (ins_pos, stop)
        }
    };

    let reference = fasta.fetch_sequence(chrom, window_start, window_end)?;
    Ok(reference == inserted_bases)
}
/// Converts one genomic base to the transcript's strand as an uppercase `char`.
///
/// Plus-strand bases pass through unchanged; minus-strand bases go through `complement`.
fn coding_strand_base(base: u8, strand: Strand) -> char {
    let oriented = match strand {
        Strand::Minus => complement(base),
        Strand::Plus => base,
    };
    char::from(oriented.to_ascii_uppercase())
}
/// Converts a genomic base sequence to the transcript's strand as an uppercase `String`.
///
/// Plus-strand input is only uppercased; minus-strand input goes through
/// `reverse_complement` first.
fn coding_strand_seq(bases: &[u8], strand: Strand) -> String {
    // Shared byte -> uppercase char conversion for both strands.
    let to_upper_char = |b: &u8| char::from(b.to_ascii_uppercase());
    match strand {
        Strand::Plus => bases.iter().map(to_upper_char).collect(),
        Strand::Minus => reverse_complement(bases).iter().map(to_upper_char).collect(),
    }
}
/// Formats a single-nucleotide substitution as `<accession>:<prefix><position><ref>><alt>`.
///
/// * `NonCodingExon` locations are numbered from genomic `pos` with
///   `compute_cdna_position_exonic` (falling back to `1`) and always use the `n.` prefix.
/// * Other locations take their position from `position_for_variant_location`; when
///   that yields `None`, so does this function.
/// * The prefix is `n.` when the transcript has no CDS segments or the location is a
///   `NonCodingIntron`, and `c.` otherwise.
///
/// Reference and alternate bases are converted to the transcript strand with
/// `coding_strand_base`.
///
/// # Errors
///
/// Propagates errors from `position_for_variant_location`.
pub(crate) fn format_snv_hgvs(
    pos: u64,
    ref_base: u8,
    alt_base: u8,
    location: &VariantLocation,
    transcript: &TranscriptModel,
    index: &LocateIndex,
) -> Result<Option<String>, VarEffectError> {
    let accession = &transcript.accession;
    let strand = transcript.strand;

    if matches!(location, VariantLocation::NonCodingExon { .. }) {
        let cdna = compute_cdna_position_exonic(pos, transcript).unwrap_or(1);
        let anchor = HgvsPosition::NonCoding(cdna);
        let ref_nt = coding_strand_base(ref_base, strand);
        let alt_nt = coding_strand_base(alt_base, strand);
        return Ok(Some(format!("{accession}:n.{anchor}{ref_nt}>{alt_nt}")));
    }

    let Some(position) = position_for_variant_location(location, transcript, index)? else {
        return Ok(None);
    };

    let uses_noncoding_numbering = transcript.cds_segments.is_empty()
        || matches!(location, VariantLocation::NonCodingIntron { .. });
    let prefix = if uses_noncoding_numbering { "n." } else { "c." };

    let ref_nt = coding_strand_base(ref_base, strand);
    let alt_nt = coding_strand_base(alt_base, strand);
    Ok(Some(format!(
        "{accession}:{prefix}{position}{ref_nt}>{alt_nt}"
    )))
}
/// Formats a deletion of the genomic interval `[start, end)` as
/// `<accession>:<prefix><pos>del` (one base) or `<accession>:<prefix><first>_<last>del`.
///
/// `first`/`last` are ordered in transcript orientation: on the minus strand the
/// highest genomic base comes first. The prefix comes from `deletion_prefix` evaluated
/// at `start`.
///
/// Returns `Ok(None)` when either end of the interval has no HGVS position, or when the
/// interval is empty or inverted (`end <= start`).
///
/// # Errors
///
/// Propagates errors from `position_for_genomic` and `deletion_prefix`.
pub(crate) fn format_deletion_hgvs(
    chrom: &str,
    start: u64,
    end: u64,
    transcript: &TranscriptModel,
    index: &LocateIndex,
) -> Result<Option<String>, VarEffectError> {
    // Guard before any subtraction: `end - start` / `end - 1` would underflow or
    // describe a backwards range for an empty or inverted interval.
    if end <= start {
        return Ok(None);
    }

    let Some(start_pos) = position_for_genomic(chrom, start, transcript, index)? else {
        return Ok(None);
    };
    let prefix = deletion_prefix(chrom, start, transcript, index)?;
    let accession = &transcript.accession;

    if end - start == 1 {
        return Ok(Some(format!("{accession}:{prefix}{start_pos}del")));
    }

    // `start` is already located; only the last genomic base needs a second lookup.
    let Some(end_pos) = position_for_genomic(chrom, end - 1, transcript, index)? else {
        return Ok(None);
    };
    let (first_pos, last_pos) = match transcript.strand {
        Strand::Plus => (start_pos, end_pos),
        Strand::Minus => (end_pos, start_pos),
    };
    Ok(Some(format!(
        "{accession}:{prefix}{first_pos}_{last_pos}del"
    )))
}
/// Formats an insertion at genomic `pos` as
/// `<accession>:<prefix><left>_<right>ins<sequence>`, or as a duplication when
/// `is_duplication` reports that the inserted bases repeat the adjacent reference.
///
/// The flanking bases are `pos - 1` and `pos`; on the minus strand they are swapped so
/// that `left` precedes `right` in transcript orientation. The inserted sequence is
/// converted with `coding_strand_seq`.
///
/// Returns `Ok(None)` when `pos` is 0 (no preceding base) or when either flank has no
/// HGVS position.
///
/// # Errors
///
/// Propagates errors from `is_duplication`, `format_duplication`,
/// `position_for_genomic` and `insertion_prefix`.
pub(crate) fn format_insertion_hgvs(
    pos: u64,
    inserted_bases: &[u8],
    chrom: &str,
    transcript: &TranscriptModel,
    index: &LocateIndex,
    fasta: &FastaReader,
) -> Result<Option<String>, VarEffectError> {
    if is_duplication(pos, inserted_bases, chrom, transcript, fasta)? {
        let dup_len = inserted_bases.len() as u64;
        return format_duplication(pos, dup_len, chrom, transcript, index);
    }

    let Some(preceding) = pos.checked_sub(1) else {
        return Ok(None);
    };
    let (left_genomic, right_genomic) = match transcript.strand {
        Strand::Plus => (preceding, pos),
        Strand::Minus => (pos, preceding),
    };

    let Some(left_pos) = position_for_genomic(chrom, left_genomic, transcript, index)? else {
        return Ok(None);
    };
    let Some(right_pos) = position_for_genomic(chrom, right_genomic, transcript, index)? else {
        return Ok(None);
    };

    let prefix = insertion_prefix(chrom, pos, transcript, index)?;
    let coding_seq = coding_strand_seq(inserted_bases, transcript.strand);
    let accession = &transcript.accession;
    Ok(Some(format!(
        "{accession}:{prefix}{left_pos}_{right_pos}ins{coding_seq}"
    )))
}
/// Formats a deletion-insertion replacing the genomic interval `[start, end)` with
/// `alt_bases`, as `<accession>:<prefix><pos>delins<alt>` (one reference base) or
/// `<accession>:<prefix><first>_<last>delins<alt>`.
///
/// `first`/`last` are ordered in transcript orientation: on the minus strand the
/// highest genomic base comes first. `alt_bases` is converted with `coding_strand_seq`.
/// The prefix comes from `deletion_prefix` evaluated at `start`.
///
/// Returns `Ok(None)` when either end of the interval has no HGVS position, or when the
/// interval is empty or inverted (`end <= start`).
///
/// # Errors
///
/// Propagates errors from `position_for_genomic` and `deletion_prefix`.
pub(crate) fn format_delins_hgvs(
    chrom: &str,
    start: u64,
    end: u64,
    alt_bases: &[u8],
    transcript: &TranscriptModel,
    index: &LocateIndex,
) -> Result<Option<String>, VarEffectError> {
    // Guard before any subtraction: `end - start` / `end - 1` would underflow or
    // describe a backwards range for an empty or inverted interval.
    if end <= start {
        return Ok(None);
    }

    let Some(start_pos) = position_for_genomic(chrom, start, transcript, index)? else {
        return Ok(None);
    };
    let prefix = deletion_prefix(chrom, start, transcript, index)?;
    let coding_alt = coding_strand_seq(alt_bases, transcript.strand);
    let accession = &transcript.accession;

    if end - start == 1 {
        return Ok(Some(format!(
            "{accession}:{prefix}{start_pos}delins{coding_alt}"
        )));
    }

    // `start` is already located; only the last genomic base needs a second lookup.
    let Some(end_pos) = position_for_genomic(chrom, end - 1, transcript, index)? else {
        return Ok(None);
    };
    let (first_pos, last_pos) = match transcript.strand {
        Strand::Plus => (start_pos, end_pos),
        Strand::Minus => (end_pos, start_pos),
    };
    Ok(Some(format!(
        "{accession}:{prefix}{first_pos}_{last_pos}delins{coding_alt}"
    )))
}
/// Formats a duplication of `ins_len` bases next to the insertion point `ins_pos` as
/// `<accession>:<prefix><pos>dup` (one base) or `<accession>:<prefix><first>_<last>dup`.
///
/// The duplicated genomic interval is `[ins_pos - ins_len, ins_pos)` on the plus strand
/// and `[ins_pos, ins_pos + ins_len)` on the minus strand; `first`/`last` are ordered in
/// transcript orientation. The prefix comes from `deletion_prefix` evaluated at the
/// interval start.
///
/// Callers must ensure `ins_pos >= ins_len` on the plus strand, as the subtraction is
/// unchecked. Returns `Ok(None)` when a needed position has no HGVS rendering.
///
/// # Errors
///
/// Propagates errors from `deletion_prefix` and `position_for_genomic`.
fn format_duplication(
    ins_pos: u64,
    ins_len: u64,
    chrom: &str,
    transcript: &TranscriptModel,
    index: &LocateIndex,
) -> Result<Option<String>, VarEffectError> {
    let (dup_start, dup_end) = match transcript.strand {
        Strand::Plus => (ins_pos - ins_len, ins_pos),
        Strand::Minus => (ins_pos, ins_pos + ins_len),
    };
    let prefix = deletion_prefix(chrom, dup_start, transcript, index)?;
    let accession = &transcript.accession;
    let render = |genomic: u64| position_for_genomic(chrom, genomic, transcript, index);

    if ins_len == 1 {
        let Some(single) = render(dup_start)? else {
            return Ok(None);
        };
        return Ok(Some(format!("{accession}:{prefix}{single}dup")));
    }

    let (first_genomic, last_genomic) = match transcript.strand {
        Strand::Plus => (dup_start, dup_end - 1),
        Strand::Minus => (dup_end - 1, dup_start),
    };
    let Some(first) = render(first_genomic)? else {
        return Ok(None);
    };
    let Some(last) = render(last_genomic)? else {
        return Ok(None);
    };
    Ok(Some(format!("{accession}:{prefix}{first}_{last}dup")))
}
/// Chooses the HGVS coordinate prefix for a variant whose reference base is at `pos`.
///
/// Returns `"n."` when the transcript has no CDS segments, or when the base at `pos`
/// locates to a `NonCodingExon` / `NonCodingIntron`; returns `"c."` otherwise.
///
/// # Errors
///
/// Propagates errors from `locate_variant`.
fn deletion_prefix(
    chrom: &str,
    pos: u64,
    transcript: &TranscriptModel,
    index: &LocateIndex,
) -> Result<&'static str, VarEffectError> {
    if transcript.cds_segments.is_empty() {
        return Ok("n.");
    }
    let location = locate_variant(chrom, pos, pos + 1, transcript, index)?;
    let non_coding = matches!(
        location,
        VariantLocation::NonCodingExon { .. } | VariantLocation::NonCodingIntron { .. }
    );
    Ok(if non_coding { "n." } else { "c." })
}
/// Chooses the HGVS coordinate prefix (`"n."` or `"c."`) for an insertion at `pos`.
///
/// Delegates directly to `deletion_prefix`, so the rule and the errors are identical.
fn insertion_prefix(
chrom: &str,
pos: u64,
transcript: &TranscriptModel,
index: &LocateIndex,
) -> Result<&'static str, VarEffectError> {
deletion_prefix(chrom, pos, transcript, index)
}
#[cfg(test)]
mod tests {
    use super::*;

    /// Asserts that every position in `cases` renders to its expected text.
    fn assert_renders(cases: &[(HgvsPosition, &str)]) {
        for (position, expected) in cases {
            assert_eq!(position.to_string(), *expected);
        }
    }

    /// Asserts that `format_position` yields `expected` for each (anchor, offset) pair.
    fn assert_formats(cases: &[(HgvsPosition, Option<i64>, &str)]) {
        for (anchor, offset, expected) in cases {
            assert_eq!(format_position(anchor, *offset), *expected);
        }
    }

    // --- Display ---------------------------------------------------------------

    #[test]
    fn display_cds_position() {
        assert_renders(&[(HgvsPosition::Cds(742), "742"), (HgvsPosition::Cds(1), "1")]);
    }

    #[test]
    fn display_five_prime_utr_position() {
        assert_renders(&[
            (HgvsPosition::FivePrimeUtr(-15), "-15"),
            (HgvsPosition::FivePrimeUtr(-1), "-1"),
        ]);
    }

    #[test]
    fn display_three_prime_utr_position() {
        assert_renders(&[
            (HgvsPosition::ThreePrimeUtr(42), "*42"),
            (HgvsPosition::ThreePrimeUtr(1), "*1"),
        ]);
    }

    #[test]
    fn display_non_coding_position() {
        assert_renders(&[(HgvsPosition::NonCoding(76), "76")]);
    }

    // --- format_position -------------------------------------------------------

    #[test]
    fn format_position_exonic() {
        assert_formats(&[(HgvsPosition::Cds(742), None, "742")]);
    }

    #[test]
    fn format_position_donor_intronic() {
        assert_formats(&[(HgvsPosition::Cds(742), Some(5), "742+5")]);
    }

    #[test]
    fn format_position_acceptor_intronic() {
        assert_formats(&[(HgvsPosition::Cds(743), Some(-3), "743-3")]);
    }

    #[test]
    fn format_position_utr_intronic() {
        assert_formats(&[
            (HgvsPosition::FivePrimeUtr(-15), Some(2), "-15+2"),
            (HgvsPosition::ThreePrimeUtr(37), Some(-1), "*37-1"),
        ]);
    }

    #[test]
    fn format_position_noncoding_intronic() {
        assert_formats(&[(HgvsPosition::NonCoding(76), Some(1), "76+1")]);
    }

    // --- strand helpers --------------------------------------------------------

    #[test]
    fn coding_strand_base_plus() {
        for (base, expected) in [(b'A', 'A'), (b'C', 'C')] {
            assert_eq!(coding_strand_base(base, Strand::Plus), expected);
        }
    }

    #[test]
    fn coding_strand_base_minus() {
        for (base, expected) in [(b'A', 'T'), (b'G', 'C')] {
            assert_eq!(coding_strand_base(base, Strand::Minus), expected);
        }
    }

    #[test]
    fn coding_strand_seq_plus() {
        let rendered = coding_strand_seq(b"ACG", Strand::Plus);
        assert_eq!(rendered, "ACG");
    }

    #[test]
    fn coding_strand_seq_minus() {
        let rendered = coding_strand_seq(b"ACG", Strand::Minus);
        assert_eq!(rendered, "CGT");
    }
}