use super::helpers::{
ExonOrIntron, compute_cds_offset, find_exon_or_intron, is_exonic_splice_region, to_u16,
};
use super::{
LocateIndex, SPLICE_CANONICAL_MAX, SPLICE_REGION_INTRON_MAX_ACCEPTOR,
SPLICE_REGION_INTRON_MAX_DONOR,
};
use crate::error::VarEffectError;
use crate::types::{Strand, TranscriptModel};
/// Result of locating an indel relative to one transcript, as produced by
/// `locate_indel`.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct IndelLocation {
/// Region classification of the indel.
pub region: IndelRegion,
/// `true` when the indel overlaps a canonical donor or acceptor window
/// (i.e. `overlaps_donor || overlaps_acceptor` of the splice scan).
pub overlaps_splice_canonical: bool,
/// `true` when the indel overlaps an intronic splice-region window; for
/// exonic results it is additionally set when `is_exonic_splice_region`
/// reports one of the indel's endpoints.
pub overlaps_splice_region: bool,
/// `true` when the two endpoints of the range do not fall in the same exon
/// or the same intron, or when the range extends past a transcript bound.
pub crosses_exon_boundary: bool,
/// Exon index of the indel (of its start/anchor position when the range
/// spans several features); `None` when that position is intronic or the
/// indel lies entirely outside the transcript.
pub exon_index: Option<u16>,
/// Intron index, taken from the `upstream` value reported by
/// `find_exon_or_intron`; `None` when the start/anchor position is exonic or
/// the indel lies entirely outside the transcript.
pub intron_index: Option<u16>,
/// Per-site splice breakdown; populated only when
/// `overlaps_splice_canonical` is `true`.
pub splice_detail: Option<SpliceOverlapDetail>,
}
/// Breakdown of which splice windows a genomic range overlaps, accumulated
/// over the introns of a transcript. `Default` yields "no overlap at all".
#[derive(Debug, Clone, Default, PartialEq, Eq)]
pub struct SpliceOverlapDetail {
/// At least one canonical donor window is overlapped.
pub overlaps_donor: bool,
/// At least one canonical acceptor window is overlapped.
pub overlaps_acceptor: bool,
/// At least one intronic splice-region window (donor or acceptor side,
/// beyond the canonical bases) is overlapped.
pub overlaps_splice_region: bool,
/// Indices of the introns whose donor window is overlapped, in scan order.
/// Intron `i` is the one between `exons[i]` and `exons[i + 1]`.
pub donor_intron_indices: Vec<u16>,
/// Indices of the introns whose acceptor window is overlapped, in scan
/// order (same numbering as `donor_intron_indices`).
pub acceptor_intron_indices: Vec<u16>,
}
/// Coarse classification of where an indel falls relative to a transcript.
#[derive(Debug, Clone, PartialEq, Eq)]
pub enum IndelRegion {
/// The indel lies within the coding sequence of a single exon.
Cds {
/// Smallest CDS offset touched by the indel.
cds_offset_start: u32,
/// For deletions: one past the largest CDS offset touched. For
/// insertions: equal to `cds_offset_start`.
cds_offset_end: u32,
},
/// Exonic, outside the CDS, on the 5' side with respect to the strand.
FivePrimeUtr,
/// Exonic, outside the CDS, on the 3' side with respect to the strand.
ThreePrimeUtr,
/// Entirely within one intron.
Intron,
/// Within an exon of a transcript that has no CDS segments.
NonCodingExon,
/// Outside the transcript span, classified as upstream.
Upstream,
/// Outside the transcript span, classified as downstream.
Downstream,
/// The range is not confined to a single feature, e.g. it crosses an
/// exon/intron boundary, a transcript bound, or a CDS/UTR boundary.
BoundarySpanning,
}
/// Scans the introns of `transcript` and reports which splice windows the
/// half-open genomic range `[range_start, range_end)` overlaps.
///
/// For every intron (the gap between `exons[i]` and `exons[i + 1]`) four
/// windows are tested:
///
/// * canonical donor / acceptor: the `SPLICE_CANONICAL_MAX` intronic bases
///   next to the upstream / downstream exon;
/// * donor / acceptor splice region: the intronic bases beyond the canonical
///   ones, out to `SPLICE_REGION_INTRON_MAX_DONOR` /
///   `SPLICE_REGION_INTRON_MAX_ACCEPTOR`.
///
/// Which genomic side of an exon is the donor or the acceptor depends on
/// `transcript.strand`.
///
/// Returns a default (all-false, empty) [`SpliceOverlapDetail`] for
/// transcripts with fewer than two exons, since they have no introns.
fn check_splice_overlap_detailed(
    range_start: u64,
    range_end: u64,
    transcript: &TranscriptModel,
) -> SpliceOverlapDetail {
    let exons = &transcript.exons;
    let n = exons.len();
    let mut detail = SpliceOverlapDetail::default();
    if n < 2 {
        return detail;
    }

    // Half-open interval overlap against the query range.
    let overlaps = |lo: u64, hi: u64| range_start < hi && range_end > lo;

    // Query range widened by the acceptor splice-region margin; used only to
    // prune the scan. NOTE(review): this assumes the acceptor margin is the
    // largest of the three window sizes — confirm against the constants.
    let expanded_lo = range_start.saturating_sub(SPLICE_REGION_INTRON_MAX_ACCEPTOR);
    let expanded_hi = range_end.saturating_add(SPLICE_REGION_INTRON_MAX_ACCEPTOR);

    // Plus strand: binary-search for the first exon that ends after the
    // widened range starts, then step back one so the intron preceding it is
    // still examined. Minus strand: always scan from the first exon.
    let first = match transcript.strand {
        Strand::Plus => exons
            .partition_point(|e| e.genomic_end <= expanded_lo)
            .saturating_sub(1),
        Strand::Minus => 0,
    };

    for i in first..n - 1 {
        let upstream_exon = &exons[i];
        let downstream_exon = &exons[i + 1];

        // Plus strand only: once both flanking exons start past the widened
        // range, no later intron can overlap it.
        if matches!(transcript.strand, Strand::Plus)
            && upstream_exon.genomic_start > expanded_hi
            && downstream_exon.genomic_start > expanded_hi
        {
            break;
        }
        let intron_idx = to_u16(i);

        // Canonical windows. Subtractions saturate at 0 so that an exon
        // starting within the first few bases of a contig cannot underflow
        // (the splice-region windows below already behave this way).
        let (donor_start, donor_end) = match transcript.strand {
            Strand::Plus => (
                upstream_exon.genomic_end,
                upstream_exon.genomic_end + SPLICE_CANONICAL_MAX,
            ),
            Strand::Minus => (
                upstream_exon
                    .genomic_start
                    .saturating_sub(SPLICE_CANONICAL_MAX),
                upstream_exon.genomic_start,
            ),
        };
        let (acceptor_start, acceptor_end) = match transcript.strand {
            Strand::Plus => (
                downstream_exon
                    .genomic_start
                    .saturating_sub(SPLICE_CANONICAL_MAX),
                downstream_exon.genomic_start,
            ),
            Strand::Minus => (
                downstream_exon.genomic_end,
                downstream_exon.genomic_end + SPLICE_CANONICAL_MAX,
            ),
        };
        if overlaps(donor_start, donor_end) {
            detail.overlaps_donor = true;
            detail.donor_intron_indices.push(intron_idx);
        }
        if overlaps(acceptor_start, acceptor_end) {
            detail.overlaps_acceptor = true;
            detail.acceptor_intron_indices.push(intron_idx);
        }

        // Splice-region windows: the intronic stretch beyond the canonical
        // bases, out to the donor / acceptor region limit.
        let (donor_region_start, donor_region_end) = match transcript.strand {
            Strand::Plus => (
                upstream_exon.genomic_end + SPLICE_CANONICAL_MAX,
                upstream_exon.genomic_end + SPLICE_REGION_INTRON_MAX_DONOR,
            ),
            Strand::Minus => (
                upstream_exon
                    .genomic_start
                    .saturating_sub(SPLICE_REGION_INTRON_MAX_DONOR),
                upstream_exon
                    .genomic_start
                    .saturating_sub(SPLICE_CANONICAL_MAX),
            ),
        };
        let (acceptor_region_start, acceptor_region_end) = match transcript.strand {
            Strand::Plus => (
                downstream_exon
                    .genomic_start
                    .saturating_sub(SPLICE_REGION_INTRON_MAX_ACCEPTOR),
                downstream_exon
                    .genomic_start
                    .saturating_sub(SPLICE_CANONICAL_MAX),
            ),
            Strand::Minus => (
                downstream_exon.genomic_end + SPLICE_CANONICAL_MAX,
                downstream_exon.genomic_end + SPLICE_REGION_INTRON_MAX_ACCEPTOR,
            ),
        };
        if overlaps(donor_region_start, donor_region_end)
            || overlaps(acceptor_region_start, acceptor_region_end)
        {
            detail.overlaps_splice_region = true;
        }
    }
    detail
}
/// Classifies where an indel falls relative to `transcript`.
///
/// The indel is given as the half-open genomic range `[start, end)`;
/// `start == end` is treated as an insertion at `start`.
///
/// Classification order:
///
/// 1. Range entirely left of `tx_start` or right of `tx_end`:
///    `Upstream` / `Downstream`, with no splice or index information.
/// 2. Insertions inside the transcript are delegated to
///    `locate_indel_insertion`.
/// 3. Range sticking out of the transcript on either side:
///    `BoundarySpanning`.
/// 4. Both endpoints in the same intron: `Intron`.
/// 5. Endpoints in different features: `BoundarySpanning`.
/// 6. Both endpoints in the same exon: `NonCodingExon`, `Cds`,
///    `FivePrimeUtr`, `ThreePrimeUtr`, or `BoundarySpanning` when the range
///    is not confined to the CDS or to one UTR.
///
/// # Errors
///
/// * `VarEffectError::Malformed` when `chrom` differs from
///   `transcript.chrom`.
/// * `VarEffectError::Malformed` when the transcript has CDS segments but no
///   `cds_genomic_start` / `cds_genomic_end`.
/// * Any error returned by `find_exon_or_intron` or `compute_cds_offset`.
///
/// # Panics
///
/// In debug builds, when `end < start`.
pub fn locate_indel(
    chrom: &str,
    start: u64,
    end: u64,
    transcript: &TranscriptModel,
    index: &LocateIndex,
) -> Result<IndelLocation, VarEffectError> {
    debug_assert!(end >= start, "end ({end}) must be >= start ({start})");
    if chrom != transcript.chrom {
        return Err(VarEffectError::Malformed(format!(
            "chromosome mismatch: caller passed '{}' but transcript {} is on '{}'",
            chrom, transcript.accession, transcript.chrom,
        )));
    }
    let is_coding = !transcript.cds_segments.is_empty();
    let is_insertion = start == end;

    // --- Entirely on the low-coordinate side of the transcript -------------
    // NOTE(review): the distance is measured from `start` (not from the edge
    // of the range nearest the transcript), and a hit beyond the limit is
    // labelled `Upstream` regardless of strand — confirm both are intended.
    if end <= transcript.tx_start {
        let distance = transcript.tx_start - start;
        let region = if distance > super::UPSTREAM_DOWNSTREAM_LIMIT {
            IndelRegion::Upstream
        } else {
            match transcript.strand {
                Strand::Plus => IndelRegion::Upstream,
                Strand::Minus => IndelRegion::Downstream,
            }
        };
        return Ok(simple_indel_location(region));
    }

    // --- Entirely on the high-coordinate side of the transcript ------------
    // Mirror image of the branch above; beyond the limit the label is
    // `Downstream` regardless of strand.
    if start >= transcript.tx_end {
        let distance = start - transcript.tx_end + 1;
        let region = if distance > super::UPSTREAM_DOWNSTREAM_LIMIT {
            IndelRegion::Downstream
        } else {
            match transcript.strand {
                Strand::Plus => IndelRegion::Downstream,
                Strand::Minus => IndelRegion::Upstream,
            }
        };
        return Ok(simple_indel_location(region));
    }

    // --- Splice overlap ------------------------------------------------------
    // An insertion has an empty range; widen it to one base so the half-open
    // overlap tests can match.
    let (detail_start, detail_end) = if is_insertion {
        (start, start + 1)
    } else {
        (start, end)
    };
    let detail = check_splice_overlap_detailed(detail_start, detail_end, transcript);
    let splice_canonical = detail.overlaps_donor || detail.overlaps_acceptor;
    let splice_region = detail.overlaps_splice_region;
    // The per-site breakdown is only kept when a canonical site is hit.
    let splice_detail = if splice_canonical { Some(detail) } else { None };

    if is_insertion {
        return locate_indel_insertion(
            start,
            transcript,
            index,
            is_coding,
            splice_canonical,
            splice_region,
            splice_detail,
        );
    }

    // --- Deletion sticking out of the transcript ----------------------------
    if start < transcript.tx_start || end > transcript.tx_end {
        // Anchor on the first deleted base that lies inside the transcript.
        let anchor = start.max(transcript.tx_start);
        let anchor_loc = find_exon_or_intron(anchor, transcript)?;
        let (exon_index, intron_index) = match anchor_loc {
            ExonOrIntron::Exon(i) => (Some(to_u16(i)), None),
            ExonOrIntron::Intron { upstream, .. } => (None, Some(to_u16(upstream))),
        };
        return Ok(IndelLocation {
            region: IndelRegion::BoundarySpanning,
            overlaps_splice_canonical: splice_canonical,
            overlaps_splice_region: splice_region,
            crosses_exon_boundary: true,
            exon_index,
            intron_index,
            splice_detail,
        });
    }

    // --- Deletion fully inside the transcript --------------------------------
    // `end` is exclusive, so the last deleted base is `end - 1`.
    let start_loc = find_exon_or_intron(start, transcript)?;
    let end_loc = find_exon_or_intron(end - 1, transcript)?;
    match (&start_loc, &end_loc) {
        // Both endpoints in the same intron.
        (
            ExonOrIntron::Intron {
                upstream: u1,
                downstream: d1,
            },
            ExonOrIntron::Intron {
                upstream: u2,
                downstream: d2,
            },
        ) if u1 == u2 && d1 == d2 => {
            return Ok(IndelLocation {
                region: IndelRegion::Intron,
                overlaps_splice_canonical: splice_canonical,
                overlaps_splice_region: splice_region,
                crosses_exon_boundary: false,
                exon_index: None,
                intron_index: Some(to_u16(*u1)),
                splice_detail,
            });
        }
        // Both endpoints in the same exon: handled after the match.
        (ExonOrIntron::Exon(s), ExonOrIntron::Exon(e)) if s == e => {}
        // Anything else crosses at least one exon/intron boundary; report
        // the feature holding the start position.
        _ => {
            let (exon_idx, intron_idx) = match &start_loc {
                ExonOrIntron::Exon(i) => (Some(to_u16(*i)), None),
                ExonOrIntron::Intron { upstream, .. } => (None, Some(to_u16(*upstream))),
            };
            return Ok(IndelLocation {
                region: IndelRegion::BoundarySpanning,
                overlaps_splice_canonical: splice_canonical,
                overlaps_splice_region: splice_region,
                crosses_exon_boundary: true,
                exon_index: exon_idx,
                intron_index: intron_idx,
                splice_detail,
            });
        }
    }

    let start_exon = match start_loc {
        ExonOrIntron::Exon(i) => i,
        _ => unreachable!("both endpoints in same exon after match"),
    };
    let exon_index = to_u16(start_exon);
    // Exonic splice region is evaluated at both endpoints of the deletion.
    let splice_region_exonic = is_exonic_splice_region(start, start_exon, transcript)
        || is_exonic_splice_region(end - 1, start_exon, transcript);

    if !is_coding {
        return Ok(IndelLocation {
            region: IndelRegion::NonCodingExon,
            overlaps_splice_canonical: splice_canonical,
            overlaps_splice_region: splice_region || splice_region_exonic,
            crosses_exon_boundary: false,
            exon_index: Some(exon_index),
            intron_index: None,
            splice_detail,
        });
    }

    let cds_start = transcript.cds_genomic_start.ok_or_else(|| {
        VarEffectError::Malformed(format!(
            "{}: coding transcript has no cds_genomic_start",
            transcript.accession,
        ))
    })?;
    let cds_end = transcript.cds_genomic_end.ok_or_else(|| {
        VarEffectError::Malformed(format!(
            "{}: coding transcript has no cds_genomic_end",
            transcript.accession,
        ))
    })?;
    let start_in_cds = start >= cds_start && start < cds_end;
    let end_in_cds = (end - 1) >= cds_start && (end - 1) < cds_end;

    if start_in_cds && end_in_cds {
        let cds_s = compute_cds_offset(start, start_exon, transcript, index)?;
        let cds_e = compute_cds_offset(end - 1, start_exon, transcript, index)?;
        // The two offsets are ordered explicitly rather than assuming which
        // genomic endpoint maps to the smaller CDS offset.
        let offset_lo = cds_s.cds_offset.min(cds_e.cds_offset);
        let offset_hi = cds_s.cds_offset.max(cds_e.cds_offset);
        return Ok(IndelLocation {
            region: IndelRegion::Cds {
                cds_offset_start: offset_lo,
                cds_offset_end: offset_hi + 1,
            },
            overlaps_splice_canonical: splice_canonical,
            overlaps_splice_region: splice_region || splice_region_exonic,
            crosses_exon_boundary: false,
            exon_index: Some(exon_index),
            intron_index: None,
            splice_detail,
        });
    }

    if !start_in_cds && !end_in_cds {
        // Both endpoints outside the CDS does not guarantee a single UTR:
        // the range may start before the CDS and end after it, removing the
        // whole CDS. That case must not be reported as a UTR and falls
        // through to `BoundarySpanning` below.
        let spans_whole_cds = start < cds_start && end > cds_end;
        if !spans_whole_cds {
            let is_5utr = match transcript.strand {
                Strand::Plus => start < cds_start,
                Strand::Minus => start >= cds_end,
            };
            let region = if is_5utr {
                IndelRegion::FivePrimeUtr
            } else {
                IndelRegion::ThreePrimeUtr
            };
            return Ok(IndelLocation {
                region,
                overlaps_splice_canonical: splice_canonical,
                overlaps_splice_region: splice_region || splice_region_exonic,
                crosses_exon_boundary: false,
                exon_index: Some(exon_index),
                intron_index: None,
                splice_detail,
            });
        }
    }

    // Same exon, but the range crosses a CDS/UTR boundary (or swallows the
    // entire CDS).
    Ok(IndelLocation {
        region: IndelRegion::BoundarySpanning,
        overlaps_splice_canonical: splice_canonical,
        overlaps_splice_region: splice_region || splice_region_exonic,
        crosses_exon_boundary: false,
        exon_index: Some(exon_index),
        intron_index: None,
        splice_detail,
    })
}
/// Classifies an insertion at genomic position `pos`, which the caller has
/// already established to lie inside the transcript span.
///
/// The splice flags and `splice_detail` are computed by the caller and are
/// passed through; for exonic insertions `overlaps_splice_region` is
/// additionally OR-ed with `is_exonic_splice_region(pos, ..)`.
///
/// # Errors
///
/// * Any error from `find_exon_or_intron` or `compute_cds_offset`.
/// * `VarEffectError::Malformed` when `is_coding` is set but the transcript
///   lacks `cds_genomic_start` / `cds_genomic_end`.
fn locate_indel_insertion(
    pos: u64,
    transcript: &TranscriptModel,
    index: &LocateIndex,
    is_coding: bool,
    splice_canonical: bool,
    splice_region: bool,
    splice_detail: Option<SpliceOverlapDetail>,
) -> Result<IndelLocation, VarEffectError> {
    // Intronic insertions are finished immediately; everything below deals
    // with the exonic case.
    let exon_idx = match find_exon_or_intron(pos, transcript)? {
        ExonOrIntron::Exon(idx) => idx,
        ExonOrIntron::Intron { upstream, .. } => {
            return Ok(IndelLocation {
                region: IndelRegion::Intron,
                overlaps_splice_canonical: splice_canonical,
                overlaps_splice_region: splice_region,
                crosses_exon_boundary: false,
                exon_index: None,
                intron_index: Some(to_u16(upstream)),
                splice_detail,
            });
        }
    };

    let near_exon_edge = is_exonic_splice_region(pos, exon_idx, transcript);

    // Work out the region label first, then assemble the result once.
    let region = if !is_coding {
        IndelRegion::NonCodingExon
    } else {
        let cds_lo = transcript.cds_genomic_start.ok_or_else(|| {
            VarEffectError::Malformed(format!(
                "{}: coding transcript has no cds_genomic_start",
                transcript.accession,
            ))
        })?;
        let cds_hi = transcript.cds_genomic_end.ok_or_else(|| {
            VarEffectError::Malformed(format!(
                "{}: coding transcript has no cds_genomic_end",
                transcript.accession,
            ))
        })?;

        if (cds_lo..cds_hi).contains(&pos) {
            // An insertion occupies no reference bases, so its CDS interval
            // is empty: start and end carry the same offset.
            let offset = compute_cds_offset(pos, exon_idx, transcript, index)?.cds_offset;
            IndelRegion::Cds {
                cds_offset_start: offset,
                cds_offset_end: offset,
            }
        } else {
            // Which side of the CDS counts as 5' depends on the strand.
            let five_prime = match transcript.strand {
                Strand::Plus => pos < cds_lo,
                Strand::Minus => pos >= cds_hi,
            };
            if five_prime {
                IndelRegion::FivePrimeUtr
            } else {
                IndelRegion::ThreePrimeUtr
            }
        }
    };

    Ok(IndelLocation {
        region,
        overlaps_splice_canonical: splice_canonical,
        overlaps_splice_region: splice_region || near_exon_edge,
        crosses_exon_boundary: false,
        exon_index: Some(to_u16(exon_idx)),
        intron_index: None,
        splice_detail,
    })
}
/// Builds an [`IndelLocation`] that carries nothing but a region label:
/// every splice/boundary flag is `false` and no exon index, intron index or
/// splice detail is attached.
fn simple_indel_location(region: IndelRegion) -> IndelLocation {
    let unset = false;
    IndelLocation {
        splice_detail: None,
        intron_index: None,
        exon_index: None,
        crosses_exon_boundary: unset,
        overlaps_splice_region: unset,
        overlaps_splice_canonical: unset,
        region,
    }
}