use crate::error::VarEffectError;
use crate::fasta::FastaReader;
use crate::types::{Strand, TranscriptModel};
pub(crate) fn compute_3prime_shift_deletion(
chrom: &str,
del_start: u64,
del_end: u64,
transcript: &TranscriptModel,
fasta: &FastaReader,
) -> Result<u64, VarEffectError> {
let del_len = del_end - del_start;
if del_len == 0 {
return Ok(0);
}
match transcript.strand {
Strand::Plus => {
let fetch_end = if let Some((_, exon_end)) =
containing_exon_range(del_end - 1, transcript)
{
exon_end
} else if let Some((_, intron_end)) = containing_intron_range(del_end - 1, transcript) {
intron_end
} else {
return Ok(0);
};
if fetch_end <= del_end {
return Ok(0); }
let seq = fasta.fetch_sequence(chrom, del_start, fetch_end)?;
let dl = del_len as usize;
let mut shift = 0usize;
while dl + shift < seq.len() {
if seq[shift] != seq[dl + shift] {
break;
}
shift += 1;
}
Ok(shift as u64)
}
Strand::Minus => {
let fetch_start = if let Some((exon_start, _)) =
containing_exon_range(del_start, transcript)
{
exon_start
} else if let Some((intron_start, _)) = containing_intron_range(del_start, transcript) {
intron_start
} else {
return Ok(0);
};
if fetch_start >= del_start {
return Ok(0); }
let seq = fasta.fetch_sequence(chrom, fetch_start, del_end)?;
let ds = (del_start - fetch_start) as usize;
let de = (del_end - fetch_start) as usize;
let mut shift = 0usize;
while shift < ds {
if seq[ds - 1 - shift] != seq[de - 1 - shift] {
break;
}
shift += 1;
}
Ok(shift as u64)
}
}
}
pub(crate) fn compute_3prime_shift_insertion(
chrom: &str,
ins_pos: u64,
inserted_bases: &[u8],
transcript: &TranscriptModel,
fasta: &FastaReader,
) -> Result<u64, VarEffectError> {
let ins_len = inserted_bases.len();
if ins_len == 0 {
return Ok(0);
}
match transcript.strand {
Strand::Plus => {
let fetch_end = if let Some(exon_range) = find_shift_exon_range(ins_pos, transcript) {
exon_range.1
} else if let Some((_, intron_end)) = containing_intron_range(ins_pos, transcript) {
intron_end
} else if ins_pos > 0 {
if let Some((_, intron_end)) = containing_intron_range(ins_pos - 1, transcript) {
intron_end
} else {
return Ok(0);
}
} else {
return Ok(0);
};
if fetch_end <= ins_pos {
return Ok(0);
}
let seq = fasta.fetch_sequence(chrom, ins_pos, fetch_end)?;
let mut shift = 0usize;
while shift < seq.len() {
if seq[shift] != inserted_bases[shift % ins_len] {
break;
}
shift += 1;
}
Ok(shift as u64)
}
Strand::Minus => {
let fetch_start = if let Some(exon_range) =
find_shift_exon_range_minus(ins_pos, transcript)
{
exon_range.0
} else if ins_pos > 0 {
if let Some((intron_start, _)) = containing_intron_range(ins_pos - 1, transcript) {
intron_start
} else if let Some((intron_start, _)) = containing_intron_range(ins_pos, transcript)
{
intron_start
} else {
return Ok(0);
}
} else {
return Ok(0);
};
if fetch_start >= ins_pos {
return Ok(0);
}
let seq = fasta.fetch_sequence(chrom, fetch_start, ins_pos)?;
let mut shift = 0usize;
while shift < seq.len() {
let ref_base = seq[seq.len() - 1 - shift];
let ins_idx = ins_len - 1 - (shift % ins_len);
if ref_base != inserted_bases[ins_idx] {
break;
}
shift += 1;
}
Ok(shift as u64)
}
}
}
fn containing_exon_range(pos: u64, transcript: &TranscriptModel) -> Option<(u64, u64)> {
transcript
.exons
.iter()
.find(|e| pos >= e.genomic_start && pos < e.genomic_end)
.map(|e| (e.genomic_start, e.genomic_end))
}
fn containing_intron_range(pos: u64, transcript: &TranscriptModel) -> Option<(u64, u64)> {
for pair in transcript.exons.windows(2) {
let (intron_start, intron_end) = match transcript.strand {
Strand::Plus => (pair[0].genomic_end, pair[1].genomic_start),
Strand::Minus => (pair[1].genomic_end, pair[0].genomic_start),
};
if pos >= intron_start && pos < intron_end {
return Some((intron_start, intron_end));
}
}
None
}
fn find_shift_exon_range(pos: u64, transcript: &TranscriptModel) -> Option<(u64, u64)> {
containing_exon_range(pos, transcript).or_else(|| {
if pos > 0 {
containing_exon_range(pos - 1, transcript)
} else {
None
}
})
}
fn find_shift_exon_range_minus(pos: u64, transcript: &TranscriptModel) -> Option<(u64, u64)> {
if pos > 0
&& let Some(r) = containing_exon_range(pos - 1, transcript)
{
return Some(r);
}
containing_exon_range(pos, transcript)
}
#[cfg(test)]
mod tests {
use super::*;
use crate::fasta::write_genome_binary;
use crate::types::{Biotype, CdsSegment, Exon, TranscriptTier};
use tempfile::TempDir;
fn write_test_fasta() -> (TempDir, FastaReader) {
let tmp = TempDir::new().unwrap();
let chr1_seq = b"TCGAAAAAAGATATATATCCCGATACGTGATGGCATACGTGATGGCNNGCGCGCGCGCGCGC";
let chr17_seq = b"TCGATCGATCCCCCCCATGATGATGATGATNNNNNNNNNN";
assert_eq!(chr1_seq.len(), 62);
assert_eq!(chr17_seq.len(), 40);
let bin_path = tmp.path().join("test.bin");
let idx_path = tmp.path().join("test.bin.idx");
let contigs: &[(&str, &[u8])] = &[
("chr1", chr1_seq.as_slice()),
("chr17", chr17_seq.as_slice()),
];
write_genome_binary(contigs, "test", &bin_path, &idx_path).unwrap();
let reader = FastaReader::open(&bin_path).unwrap();
(tmp, reader)
}
fn chr1_plus_transcript() -> TranscriptModel {
TranscriptModel {
accession: "NM_SHIFT_PLUS.1".into(),
protein_accession: Some("NP_SHIFT_PLUS.1".into()),
gene_symbol: "SHIFTPLUS".into(),
hgnc_id: None,
ensembl_accession: None,
chrom: "chr1".into(),
strand: Strand::Plus,
tx_start: 0,
tx_end: 62,
cds_genomic_start: Some(0),
cds_genomic_end: Some(62),
exons: vec![
Exon {
exon_number: 1,
genomic_start: 0,
genomic_end: 25,
},
Exon {
exon_number: 2,
genomic_start: 30,
genomic_end: 62,
},
],
cds_segments: vec![
CdsSegment {
exon_index: 0,
genomic_start: 0,
genomic_end: 25,
phase: 0,
},
CdsSegment {
exon_index: 1,
genomic_start: 30,
genomic_end: 62,
phase: 0,
},
],
tier: TranscriptTier::ManeSelect,
biotype: Biotype::ProteinCoding,
exon_count: 2,
}
}
fn chr17_minus_transcript() -> TranscriptModel {
TranscriptModel {
accession: "NM_SHIFT_MINUS.1".into(),
protein_accession: Some("NP_SHIFT_MINUS.1".into()),
gene_symbol: "SHIFTMINUS".into(),
hgnc_id: None,
ensembl_accession: None,
chrom: "chr17".into(),
strand: Strand::Minus,
tx_start: 8,
tx_end: 30,
cds_genomic_start: Some(8),
cds_genomic_end: Some(30),
exons: vec![Exon {
exon_number: 1,
genomic_start: 8,
genomic_end: 30,
}],
cds_segments: vec![CdsSegment {
exon_index: 0,
genomic_start: 8,
genomic_end: 30,
phase: 0,
}],
tier: TranscriptTier::ManeSelect,
biotype: Biotype::ProteinCoding,
exon_count: 1,
}
}
#[test]
fn del_in_mono_repeat_plus() {
let (_tmp, fasta) = write_test_fasta();
let tx = chr1_plus_transcript();
let shift = compute_3prime_shift_deletion("chr1", 4, 5, &tx, &fasta).unwrap();
assert_eq!(shift, 4);
}
#[test]
fn del_in_di_repeat_plus() {
let (_tmp, fasta) = write_test_fasta();
let tx = chr1_plus_transcript();
let shift = compute_3prime_shift_deletion("chr1", 10, 12, &tx, &fasta).unwrap();
assert_eq!(shift, 6);
}
#[test]
fn del_no_repeat_plus() {
let (_tmp, fasta) = write_test_fasta();
let tx = chr1_plus_transcript();
let shift = compute_3prime_shift_deletion("chr1", 0, 1, &tx, &fasta).unwrap();
assert_eq!(shift, 0);
}
#[test]
fn del_at_exon_boundary_plus() {
let (_tmp, fasta) = write_test_fasta();
let tx = chr1_plus_transcript();
let shift = compute_3prime_shift_deletion("chr1", 24, 25, &tx, &fasta).unwrap();
assert_eq!(shift, 0);
}
#[test]
fn del_zero_length() {
let (_tmp, fasta) = write_test_fasta();
let tx = chr1_plus_transcript();
let shift = compute_3prime_shift_deletion("chr1", 5, 5, &tx, &fasta).unwrap();
assert_eq!(shift, 0);
}
#[test]
fn del_in_mono_repeat_minus() {
let (_tmp, fasta) = write_test_fasta();
let tx = chr17_minus_transcript();
let shift = compute_3prime_shift_deletion("chr17", 15, 16, &tx, &fasta).unwrap();
assert_eq!(shift, 6);
}
#[test]
fn ins_in_mono_repeat_plus() {
let (_tmp, fasta) = write_test_fasta();
let tx = chr1_plus_transcript();
let shift = compute_3prime_shift_insertion("chr1", 5, b"A", &tx, &fasta).unwrap();
assert_eq!(shift, 4);
}
#[test]
fn ins_in_di_repeat_plus() {
let (_tmp, fasta) = write_test_fasta();
let tx = chr1_plus_transcript();
let shift = compute_3prime_shift_insertion("chr1", 12, b"AT", &tx, &fasta).unwrap();
assert_eq!(shift, 6);
}
#[test]
fn ins_no_repeat_plus() {
let (_tmp, fasta) = write_test_fasta();
let tx = chr1_plus_transcript();
let shift = compute_3prime_shift_insertion("chr1", 1, b"G", &tx, &fasta).unwrap();
assert_eq!(shift, 0);
}
#[test]
fn ins_at_exon_boundary_plus() {
let (_tmp, fasta) = write_test_fasta();
let tx = chr1_plus_transcript();
let shift = compute_3prime_shift_insertion("chr1", 25, b"A", &tx, &fasta).unwrap();
assert_eq!(shift, 0);
}
#[test]
fn ins_zero_length() {
let (_tmp, fasta) = write_test_fasta();
let tx = chr1_plus_transcript();
let shift = compute_3prime_shift_insertion("chr1", 5, b"", &tx, &fasta).unwrap();
assert_eq!(shift, 0);
}
#[test]
fn ins_in_mono_repeat_minus() {
let (_tmp, fasta) = write_test_fasta();
let tx = chr17_minus_transcript();
let shift = compute_3prime_shift_insertion("chr17", 15, b"C", &tx, &fasta).unwrap();
assert_eq!(shift, 6);
}
#[test]
fn ins_in_tri_repeat_minus() {
let (_tmp, fasta) = write_test_fasta();
let tx = chr17_minus_transcript();
let shift = compute_3prime_shift_insertion("chr17", 25, b"ATG", &tx, &fasta).unwrap();
assert_eq!(shift, 9);
}
#[test]
fn ins_intronic_no_repeat() {
let (_tmp, fasta) = write_test_fasta();
let tx = chr1_plus_transcript();
let shift = compute_3prime_shift_insertion("chr1", 27, b"A", &tx, &fasta).unwrap();
assert_eq!(shift, 0);
}
fn chr1_plus_intron_repeat_transcript() -> TranscriptModel {
TranscriptModel {
accession: "NM_INTRON_PLUS.1".into(),
protein_accession: Some("NP_INTRON_PLUS.1".into()),
gene_symbol: "INTRONPLUS".into(),
hgnc_id: None,
ensembl_accession: None,
chrom: "chr1".into(),
strand: Strand::Plus,
tx_start: 0,
tx_end: 62,
cds_genomic_start: Some(0),
cds_genomic_end: Some(62),
exons: vec![
Exon {
exon_number: 1,
genomic_start: 0,
genomic_end: 46,
},
Exon {
exon_number: 2,
genomic_start: 56,
genomic_end: 62,
},
],
cds_segments: vec![
CdsSegment {
exon_index: 0,
genomic_start: 0,
genomic_end: 46,
phase: 0,
},
CdsSegment {
exon_index: 1,
genomic_start: 56,
genomic_end: 62,
phase: 0,
},
],
tier: TranscriptTier::ManeSelect,
biotype: Biotype::ProteinCoding,
exon_count: 2,
}
}
fn chr17_minus_intron_transcript() -> TranscriptModel {
TranscriptModel {
accession: "NM_INTRON_MINUS.1".into(),
protein_accession: Some("NP_INTRON_MINUS.1".into()),
gene_symbol: "INTRONMINUS".into(),
hgnc_id: None,
ensembl_accession: None,
chrom: "chr17".into(),
strand: Strand::Minus,
tx_start: 0,
tx_end: 40,
cds_genomic_start: Some(0),
cds_genomic_end: Some(40),
exons: vec![
Exon {
exon_number: 1,
genomic_start: 30,
genomic_end: 40,
},
Exon {
exon_number: 2,
genomic_start: 0,
genomic_end: 10,
},
],
cds_segments: vec![
CdsSegment {
exon_index: 0,
genomic_start: 30,
genomic_end: 40,
phase: 0,
},
CdsSegment {
exon_index: 1,
genomic_start: 0,
genomic_end: 10,
phase: 0,
},
],
tier: TranscriptTier::ManeSelect,
biotype: Biotype::ProteinCoding,
exon_count: 2,
}
}
#[test]
fn del_intronic_gc_repeat_plus() {
let (_tmp, fasta) = write_test_fasta();
let tx = chr1_plus_intron_repeat_transcript();
let shift = compute_3prime_shift_deletion("chr1", 48, 50, &tx, &fasta).unwrap();
assert_eq!(shift, 6);
}
#[test]
fn del_intronic_single_base_no_repeat_plus() {
let (_tmp, fasta) = write_test_fasta();
let tx = chr1_plus_intron_repeat_transcript();
let shift = compute_3prime_shift_deletion("chr1", 48, 49, &tx, &fasta).unwrap();
assert_eq!(shift, 0);
}
#[test]
fn del_intronic_stops_at_boundary_plus() {
let (_tmp, fasta) = write_test_fasta();
let tx = chr1_plus_intron_repeat_transcript();
let shift = compute_3prime_shift_deletion("chr1", 52, 54, &tx, &fasta).unwrap();
assert_eq!(shift, 2);
}
#[test]
fn ins_intronic_gc_repeat_plus() {
let (_tmp, fasta) = write_test_fasta();
let tx = chr1_plus_intron_repeat_transcript();
let shift = compute_3prime_shift_insertion("chr1", 50, b"GC", &tx, &fasta).unwrap();
assert_eq!(shift, 6);
}
#[test]
fn del_intronic_c_repeat_minus() {
let (_tmp, fasta) = write_test_fasta();
let tx = chr17_minus_intron_transcript();
let shift = compute_3prime_shift_deletion("chr17", 15, 16, &tx, &fasta).unwrap();
assert_eq!(shift, 5);
}
#[test]
fn del_intronic_stops_at_boundary_minus() {
let (_tmp, fasta) = write_test_fasta();
let tx = chr17_minus_intron_transcript();
let shift = compute_3prime_shift_deletion("chr17", 10, 11, &tx, &fasta).unwrap();
assert_eq!(shift, 0);
}
#[test]
fn ins_intronic_c_repeat_minus() {
let (_tmp, fasta) = write_test_fasta();
let tx = chr17_minus_intron_transcript();
let shift = compute_3prime_shift_insertion("chr17", 15, b"C", &tx, &fasta).unwrap();
assert_eq!(shift, 5);
}
}