pub(crate) mod helpers;
mod indel;
#[cfg(test)]
mod tests;
pub use indel::{IndelLocation, IndelRegion, SpliceOverlapDetail, locate_indel};
use crate::error::VarEffectError;
use crate::types::{Strand, TranscriptModel};
use helpers::{
ExonOrIntron, classify_intronic, compute_cds_offset, compute_utr_offset_3prime,
compute_utr_offset_5prime, find_exon_or_intron, is_exonic_splice_region, to_u16,
};
const UPSTREAM_DOWNSTREAM_LIMIT: u64 = 5_000;
const SPLICE_CANONICAL_MAX: u64 = 2;
const SPLICE_REGION_INTRON_MAX_DONOR: u64 = 8;
const SPLICE_REGION_INTRON_MAX_ACCEPTOR: u64 = 17;
const SPLICE_REGION_EXON_MAX: u64 = 3;
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum SpliceSide {
Donor,
Acceptor,
}
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum VariantLocation {
Upstream {
distance: u64,
},
Downstream {
distance: u64,
},
Distal,
FivePrimeUtr {
exon_index: u16,
offset_from_cds_start: i64,
is_splice_region: bool,
},
CdsExon {
exon_index: u16,
cds_segment_index: u16,
cds_offset: u32,
codon_number: u32,
codon_position: u8,
is_splice_region: bool,
},
ThreePrimeUtr {
exon_index: u16,
offset_from_cds_end: i64,
is_splice_region: bool,
},
SpliceDonor {
intron_index: u16,
offset: u8,
},
SpliceAcceptor {
intron_index: u16,
offset: u8,
},
SpliceRegion {
intron_index: u16,
side: SpliceSide,
distance: i64,
},
Intron {
intron_index: u16,
distance_to_nearest_exon: i64,
},
NonCodingExon {
exon_index: u16,
is_splice_region: bool,
},
NonCodingIntron {
intron_index: u16,
distance_to_nearest_exon: i64,
},
}
pub fn format_exon_number(exon_index: u16, exon_count: u16) -> String {
format!("{}/{}", exon_index + 1, exon_count)
}
pub fn format_intron_number(intron_index: u16, exon_count: u16) -> String {
debug_assert!(exon_count > 1, "no introns in a single-exon transcript");
format!("{}/{}", intron_index + 1, exon_count.saturating_sub(1))
}
#[derive(Debug, Clone)]
pub struct LocateIndex {
cumulative_cds: Vec<u32>,
exon_to_cds_seg: Vec<Option<u16>>,
cds_start_exon_idx: Option<usize>,
cds_end_exon_idx: Option<usize>,
utr5_exonic_len: u32,
}
impl LocateIndex {
pub fn build(transcript: &TranscriptModel) -> Result<Self, VarEffectError> {
let malformed =
|msg: &str| VarEffectError::Malformed(format!("{}: {}", transcript.accession, msg));
let mut cumulative_cds = Vec::with_capacity(transcript.cds_segments.len() + 1);
cumulative_cds.push(0u32);
for seg in &transcript.cds_segments {
let prev = *cumulative_cds
.last()
.expect("cumulative_cds initialized with 0");
cumulative_cds.push(prev + (seg.genomic_end - seg.genomic_start) as u32);
}
let mut exon_to_cds_seg = vec![None; transcript.exons.len()];
for (seg_idx, seg) in transcript.cds_segments.iter().enumerate() {
exon_to_cds_seg[seg.exon_index as usize] = Some(to_u16(seg_idx));
}
let (cds_start_exon_idx, cds_end_exon_idx) = if transcript.cds_segments.is_empty() {
(None, None)
} else {
let start_idx = transcript
.cds_genomic_start
.zip(transcript.cds_genomic_end)
.map(|(cds_start, cds_end)| {
let target = match transcript.strand {
Strand::Plus => cds_start,
Strand::Minus => cds_end - 1,
};
transcript
.exons
.iter()
.position(|e| e.genomic_start <= target && target < e.genomic_end)
.ok_or_else(|| malformed("CDS start base not found in any exon"))
})
.transpose()?;
let end_idx = transcript
.cds_genomic_start
.zip(transcript.cds_genomic_end)
.map(|(cds_start, cds_end)| {
let target = match transcript.strand {
Strand::Plus => cds_end - 1,
Strand::Minus => cds_start,
};
transcript
.exons
.iter()
.position(|e| e.genomic_start <= target && target < e.genomic_end)
.ok_or_else(|| malformed("CDS end base not found in any exon"))
})
.transpose()?;
(start_idx, end_idx)
};
let utr5_exonic_len = compute_utr5_exonic_length(transcript);
Ok(Self {
cumulative_cds,
exon_to_cds_seg,
cds_start_exon_idx,
cds_end_exon_idx,
utr5_exonic_len,
})
}
pub(crate) fn cumulative_cds(&self) -> &[u32] {
&self.cumulative_cds
}
pub(crate) fn total_cds_length(&self) -> u32 {
*self.cumulative_cds.last().unwrap_or(&0)
}
#[allow(dead_code)]
pub(crate) fn exon_to_cds_seg(&self) -> &[Option<u16>] {
&self.exon_to_cds_seg
}
#[allow(dead_code)]
pub(crate) fn cds_start_exon_idx(&self) -> Option<usize> {
self.cds_start_exon_idx
}
#[allow(dead_code)]
pub(crate) fn cds_end_exon_idx(&self) -> Option<usize> {
self.cds_end_exon_idx
}
pub(crate) fn utr5_exonic_len(&self) -> u32 {
self.utr5_exonic_len
}
}
fn compute_utr5_exonic_length(transcript: &TranscriptModel) -> u32 {
let cds_start = match transcript.strand {
Strand::Plus => match transcript.cds_genomic_start {
Some(s) => s,
None => return 0,
},
Strand::Minus => match transcript.cds_genomic_end {
Some(e) => e,
None => return 0,
},
};
let mut len = 0u32;
for exon in &transcript.exons {
match transcript.strand {
Strand::Plus => {
if exon.genomic_end <= cds_start {
len += (exon.genomic_end - exon.genomic_start) as u32;
} else if exon.genomic_start < cds_start {
len += (cds_start - exon.genomic_start) as u32;
}
}
Strand::Minus => {
if exon.genomic_start >= cds_start {
len += (exon.genomic_end - exon.genomic_start) as u32;
} else if exon.genomic_end > cds_start {
len += (exon.genomic_end - cds_start) as u32;
}
}
}
}
len
}
pub fn locate_variant(
chrom: &str,
start: u64,
_end: u64,
transcript: &TranscriptModel,
index: &LocateIndex,
) -> Result<VariantLocation, VarEffectError> {
debug_assert!(
_end > start,
"end ({}) must be greater than start ({})",
_end,
start,
);
if chrom != transcript.chrom {
return Err(VarEffectError::Malformed(format!(
"chromosome mismatch: caller passed '{}' but transcript {} is on '{}'",
chrom, transcript.accession, transcript.chrom,
)));
}
let is_coding = !transcript.cds_segments.is_empty();
if start < transcript.tx_start {
let distance = transcript.tx_start - start;
if distance > UPSTREAM_DOWNSTREAM_LIMIT {
return Ok(VariantLocation::Distal);
}
return Ok(match transcript.strand {
Strand::Plus => VariantLocation::Upstream { distance },
Strand::Minus => VariantLocation::Downstream { distance },
});
}
if start >= transcript.tx_end {
let distance = start - transcript.tx_end + 1;
if distance > UPSTREAM_DOWNSTREAM_LIMIT {
return Ok(VariantLocation::Distal);
}
return Ok(match transcript.strand {
Strand::Plus => VariantLocation::Downstream { distance },
Strand::Minus => VariantLocation::Upstream { distance },
});
}
match find_exon_or_intron(start, transcript)? {
ExonOrIntron::Intron {
upstream,
downstream,
} => Ok(classify_intronic(
start,
upstream,
downstream,
to_u16(upstream),
transcript,
is_coding,
)),
ExonOrIntron::Exon(exon_idx) => {
let splice_region = is_exonic_splice_region(start, exon_idx, transcript);
let exon_index = to_u16(exon_idx);
if !is_coding {
return Ok(VariantLocation::NonCodingExon {
exon_index,
is_splice_region: splice_region,
});
}
let cds_start = transcript.cds_genomic_start.ok_or_else(|| {
VarEffectError::Malformed(format!(
"{}: coding transcript has no cds_genomic_start",
transcript.accession,
))
})?;
let cds_end = transcript.cds_genomic_end.ok_or_else(|| {
VarEffectError::Malformed(format!(
"{}: coding transcript has no cds_genomic_end",
transcript.accession,
))
})?;
let is_5utr = match transcript.strand {
Strand::Plus => start < cds_start,
Strand::Minus => start >= cds_end,
};
let is_3utr = match transcript.strand {
Strand::Plus => start >= cds_end,
Strand::Minus => start < cds_start,
};
if is_5utr {
let offset = compute_utr_offset_5prime(start, exon_idx, transcript, index)?;
Ok(VariantLocation::FivePrimeUtr {
exon_index,
offset_from_cds_start: offset,
is_splice_region: splice_region,
})
} else if is_3utr {
let offset = compute_utr_offset_3prime(start, exon_idx, transcript, index)?;
Ok(VariantLocation::ThreePrimeUtr {
exon_index,
offset_from_cds_end: offset,
is_splice_region: splice_region,
})
} else {
let cds = compute_cds_offset(start, exon_idx, transcript, index)?;
Ok(VariantLocation::CdsExon {
exon_index,
cds_segment_index: cds.cds_segment_index,
cds_offset: cds.cds_offset,
codon_number: cds.codon_number,
codon_position: cds.codon_position,
is_splice_region: splice_region,
})
}
}
}
}