use crate::annotation::{Gene, GeneType, Transcript};
use crate::genomic::Strand;
use crate::interval_tree::{Interval, IntervalTree};
use crate::variant::Variant;
fn nucleotide_index(base: u8) -> usize {
match base.to_ascii_uppercase() {
b'A' => 0,
b'C' => 1,
b'G' => 2,
b'T' | b'U' => 3,
_ => 0, }
}
fn codon_index(b1: u8, b2: u8, b3: u8) -> usize {
nucleotide_index(b1) * 16 + nucleotide_index(b2) * 4 + nucleotide_index(b3)
}
const CODON_TABLE: [u8; 64] = [
b'K', b'N', b'K', b'N', b'T', b'T', b'T', b'T', b'R', b'S', b'R', b'S', b'I', b'I', b'M', b'I',
b'Q', b'H', b'Q', b'H', b'P', b'P', b'P', b'P', b'R', b'R', b'R', b'R', b'L', b'L', b'L', b'L',
b'E', b'D', b'E', b'D', b'A', b'A', b'A', b'A', b'G', b'G', b'G', b'G', b'V', b'V', b'V', b'V',
b'*', b'Y', b'*', b'Y', b'S', b'S', b'S', b'S', b'*', b'C', b'W', b'C', b'L', b'F', b'L', b'F',
];
fn translate_codon(codon: &[u8]) -> u8 {
if codon.len() < 3 {
return b'X';
}
CODON_TABLE[codon_index(codon[0], codon[1], codon[2])]
}
fn complement(base: u8) -> u8 {
match base.to_ascii_uppercase() {
b'A' => b'T',
b'T' => b'A',
b'C' => b'G',
b'G' => b'C',
other => other,
}
}
fn reverse_complement(seq: &[u8]) -> Vec<u8> {
seq.iter().rev().map(|&b| complement(b)).collect()
}
fn aa_three_letter(aa: u8) -> &'static str {
match aa.to_ascii_uppercase() {
b'A' => "Ala",
b'R' => "Arg",
b'N' => "Asn",
b'D' => "Asp",
b'C' => "Cys",
b'E' => "Glu",
b'Q' => "Gln",
b'G' => "Gly",
b'H' => "His",
b'I' => "Ile",
b'L' => "Leu",
b'K' => "Lys",
b'M' => "Met",
b'F' => "Phe",
b'P' => "Pro",
b'S' => "Ser",
b'T' => "Thr",
b'W' => "Trp",
b'Y' => "Tyr",
b'V' => "Val",
b'*' => "Ter",
_ => "Xaa",
}
}
#[derive(Debug, Clone, PartialEq)]
#[cfg_attr(feature = "serde", derive(serde::Serialize, serde::Deserialize))]
pub enum Consequence {
Missense {
ref_aa: u8,
alt_aa: u8,
codon_pos: u64,
},
Nonsense {
ref_aa: u8,
codon_pos: u64,
},
Synonymous {
aa: u8,
codon_pos: u64,
},
Frameshift {
codon_pos: u64,
},
InFrame {
codon_pos: u64,
},
FivePrimeUtr,
ThreePrimeUtr,
SpliceSite {
donor: bool,
},
Intronic,
Upstream,
Downstream,
NonCoding,
StartLoss {
ref_aa: u8,
},
StopLoss {
ref_aa: u8,
},
}
#[derive(Debug, Clone)]
#[cfg_attr(feature = "serde", derive(serde::Serialize, serde::Deserialize))]
pub struct VariantEffect {
pub gene_id: String,
pub gene_name: String,
pub transcript_id: String,
pub consequence: Consequence,
pub hgvs_c: Option<String>,
pub hgvs_p: Option<String>,
pub codon_change: Option<(String, String)>,
pub cds_position: Option<u64>,
pub protein_position: Option<u64>,
pub exon_distance: Option<i64>,
}
#[derive(Debug, Clone)]
#[cfg_attr(feature = "serde", derive(serde::Serialize, serde::Deserialize))]
pub struct SpliceScore {
pub transcript_id: String,
pub is_canonical: bool,
pub disrupts_consensus: bool,
pub score_ref: f64,
pub score_alt: f64,
pub delta_score: f64,
}
#[derive(Debug, Clone)]
#[cfg_attr(feature = "serde", derive(serde::Serialize, serde::Deserialize))]
pub struct AnnotationConfig {
pub upstream_distance: u64,
pub downstream_distance: u64,
pub splice_window: u64,
}
impl Default for AnnotationConfig {
fn default() -> Self {
Self {
upstream_distance: 5000,
downstream_distance: 5000,
splice_window: 2,
}
}
}
const DONOR_PWM: [[f64; 4]; 9] = [
[0.33, 0.36, 0.18, 0.13], [0.60, 0.03, 0.12, 0.25], [0.09, 0.03, 0.81, 0.07], [0.00, 0.00, 1.00, 0.00], [0.00, 0.00, 0.00, 1.00], [0.54, 0.02, 0.38, 0.06], [0.71, 0.08, 0.12, 0.09], [0.06, 0.04, 0.82, 0.08], [0.15, 0.15, 0.21, 0.49], ];
const ACCEPTOR_PWM: [[f64; 4]; 23] = [
[0.25, 0.25, 0.25, 0.25], [0.25, 0.25, 0.25, 0.25], [0.25, 0.25, 0.25, 0.25], [0.25, 0.25, 0.25, 0.25], [0.25, 0.25, 0.25, 0.25], [0.25, 0.25, 0.25, 0.25], [0.25, 0.25, 0.25, 0.25], [0.25, 0.25, 0.25, 0.25], [0.25, 0.25, 0.25, 0.25], [0.25, 0.25, 0.25, 0.25], [0.24, 0.15, 0.09, 0.52], [0.20, 0.16, 0.09, 0.55], [0.16, 0.18, 0.09, 0.57], [0.14, 0.19, 0.09, 0.58], [0.12, 0.21, 0.08, 0.59], [0.10, 0.25, 0.07, 0.58], [0.09, 0.28, 0.07, 0.56], [0.08, 0.30, 0.08, 0.54], [0.08, 0.32, 0.08, 0.52], [0.08, 0.28, 0.12, 0.52], [0.00, 0.00, 0.00, 0.00], [0.00, 0.00, 1.00, 0.00], [0.25, 0.25, 0.25, 0.25], ];
fn score_pwm(seq: &[u8], pwm: &[[f64; 4]]) -> f64 {
let len = seq.len().min(pwm.len());
let mut score = 0.0;
for i in 0..len {
let idx = nucleotide_index(seq[i]);
let freq = pwm[i][idx];
if freq <= 0.0 {
score += -10.0; } else {
score += (freq / 0.25_f64).log2();
}
}
score
}
pub fn annotate_variant(
variant: &Variant,
genes: &[Gene],
config: &AnnotationConfig,
) -> Vec<VariantEffect> {
let pos0 = variant.position - 1; let mut effects = Vec::new();
for gene in genes {
if variant.chrom != gene.chrom {
continue;
}
let (upstream_ext, downstream_ext) = if gene.strand.is_reverse() {
(config.downstream_distance, config.upstream_distance)
} else {
(config.upstream_distance, config.downstream_distance)
};
let region_start = gene.start.saturating_sub(upstream_ext);
let region_end = gene.end + downstream_ext;
if pos0 < region_start || pos0 >= region_end {
continue;
}
if pos0 < gene.start {
let consequence = if gene.strand.is_reverse() {
Consequence::Downstream
} else {
Consequence::Upstream
};
for tx in &gene.transcripts {
effects.push(VariantEffect {
gene_id: gene.gene_id.clone(),
gene_name: gene.gene_name.clone(),
transcript_id: tx.transcript_id.clone(),
consequence: consequence.clone(),
hgvs_c: None,
hgvs_p: None,
codon_change: None,
cds_position: None,
protein_position: None,
exon_distance: None,
});
}
continue;
}
if pos0 >= gene.end {
let consequence = if gene.strand.is_reverse() {
Consequence::Upstream
} else {
Consequence::Downstream
};
for tx in &gene.transcripts {
effects.push(VariantEffect {
gene_id: gene.gene_id.clone(),
gene_name: gene.gene_name.clone(),
transcript_id: tx.transcript_id.clone(),
consequence: consequence.clone(),
hgvs_c: None,
hgvs_p: None,
codon_change: None,
cds_position: None,
protein_position: None,
exon_distance: None,
});
}
continue;
}
if gene.gene_type != GeneType::ProteinCoding {
for tx in &gene.transcripts {
effects.push(VariantEffect {
gene_id: gene.gene_id.clone(),
gene_name: gene.gene_name.clone(),
transcript_id: tx.transcript_id.clone(),
consequence: Consequence::NonCoding,
hgvs_c: None,
hgvs_p: None,
codon_change: None,
cds_position: None,
protein_position: None,
exon_distance: None,
});
}
continue;
}
for tx in &gene.transcripts {
let effect = annotate_transcript(variant, gene, tx, config);
effects.push(effect);
}
}
effects
}
pub fn annotate_variants(
variants: &[Variant],
genes: &[Gene],
config: &AnnotationConfig,
) -> Vec<VariantEffect> {
if genes.is_empty() || variants.is_empty() {
return Vec::new();
}
let mut chrom_genes: std::collections::HashMap<&str, Vec<(usize, &Gene)>> =
std::collections::HashMap::new();
for (idx, gene) in genes.iter().enumerate() {
chrom_genes
.entry(gene.chrom.as_str())
.or_default()
.push((idx, gene));
}
let max_ext = config.upstream_distance.max(config.downstream_distance);
let mut chrom_trees: std::collections::HashMap<&str, IntervalTree<usize>> =
std::collections::HashMap::new();
for (chrom, gene_list) in &chrom_genes {
let intervals: Vec<Interval<usize>> = gene_list
.iter()
.map(|&(idx, gene)| {
Interval::new(
gene.start.saturating_sub(max_ext),
gene.end + max_ext,
idx,
)
})
.collect();
chrom_trees.insert(chrom, IntervalTree::from_unsorted(intervals));
}
let mut all_effects = Vec::new();
for variant in variants {
let pos0 = variant.position - 1;
if let Some(tree) = chrom_trees.get(variant.chrom.as_str()) {
let hits = tree.query(pos0, pos0 + 1);
let mut hit_genes: Vec<&Gene> = hits.iter().map(|iv| &genes[iv.data]).collect();
hit_genes.sort_by_key(|g| &g.gene_id);
hit_genes.dedup_by_key(|g| &g.gene_id);
for gene in hit_genes {
let effs = annotate_variant(variant, std::slice::from_ref(gene), config);
all_effects.extend(effs);
}
}
}
all_effects
}
pub fn score_splice_disruption(
variant: &Variant,
gene: &Gene,
reference_seq: &[u8],
) -> Vec<SpliceScore> {
let pos0 = variant.position - 1;
let mut scores = Vec::new();
for tx in &gene.transcripts {
let exons_sorted = sorted_exons(tx, &gene.strand);
for (i, exon) in exons_sorted.iter().enumerate() {
if i < exons_sorted.len() - 1 {
let donor_start = exon.end.saturating_sub(3);
let donor_end = exon.end + 6;
if pos0 >= donor_start && pos0 < donor_end && (donor_end as usize) <= reference_seq.len() {
let window_ref: Vec<u8> = reference_seq[donor_start as usize..donor_end as usize].to_vec();
let mut window_alt = window_ref.clone();
let offset = (pos0 - donor_start) as usize;
if offset < window_alt.len() && variant.alt_alleles[0].len() == 1 && variant.ref_allele.len() == 1 {
window_alt[offset] = variant.alt_alleles[0][0];
}
let s_ref = score_pwm(&window_ref, &DONOR_PWM);
let s_alt = score_pwm(&window_alt, &DONOR_PWM);
let is_canonical = reference_seq.len() > (exon.end as usize + 1)
&& reference_seq[exon.end as usize] == b'G'
&& reference_seq[exon.end as usize + 1] == b'T';
let disrupts = is_canonical
&& (pos0 == exon.end || pos0 == exon.end + 1)
&& variant.ref_allele != variant.alt_alleles[0];
scores.push(SpliceScore {
transcript_id: tx.transcript_id.clone(),
is_canonical,
disrupts_consensus: disrupts,
score_ref: s_ref,
score_alt: s_alt,
delta_score: s_alt - s_ref,
});
}
}
if i > 0 {
let acc_start = exon.start.saturating_sub(20);
let acc_end = exon.start + 3;
if pos0 >= acc_start && pos0 < acc_end && (acc_end as usize) <= reference_seq.len() {
let window_ref: Vec<u8> = reference_seq[acc_start as usize..acc_end as usize].to_vec();
let mut window_alt = window_ref.clone();
let offset = (pos0 - acc_start) as usize;
if offset < window_alt.len() && variant.alt_alleles[0].len() == 1 && variant.ref_allele.len() == 1 {
window_alt[offset] = variant.alt_alleles[0][0];
}
let s_ref = score_pwm(&window_ref, &ACCEPTOR_PWM);
let s_alt = score_pwm(&window_alt, &ACCEPTOR_PWM);
let is_canonical = exon.start >= 2
&& reference_seq.len() > exon.start as usize
&& reference_seq[exon.start as usize - 2] == b'A'
&& reference_seq[exon.start as usize - 1] == b'G';
let disrupts = is_canonical
&& (pos0 == exon.start - 2 || pos0 == exon.start - 1)
&& variant.ref_allele != variant.alt_alleles[0];
scores.push(SpliceScore {
transcript_id: tx.transcript_id.clone(),
is_canonical,
disrupts_consensus: disrupts,
score_ref: s_ref,
score_alt: s_alt,
delta_score: s_alt - s_ref,
});
}
}
}
}
scores
}
fn sorted_exons<'a>(tx: &'a Transcript, strand: &Strand) -> Vec<&'a crate::annotation::Exon> {
let mut exons: Vec<&crate::annotation::Exon> = tx.exons.iter().collect();
if strand.is_reverse() {
exons.sort_by(|a, b| b.start.cmp(&a.start));
} else {
exons.sort_by_key(|e| e.start);
}
exons
}
fn annotate_transcript(
variant: &Variant,
gene: &Gene,
tx: &Transcript,
config: &AnnotationConfig,
) -> VariantEffect {
let pos0 = variant.position - 1;
let in_exon = tx.exons.iter().any(|e| pos0 >= e.start && pos0 < e.end);
if !in_exon {
for exon in &tx.exons {
if pos0 >= exon.end && pos0 < exon.end + config.splice_window {
return VariantEffect {
gene_id: gene.gene_id.clone(),
gene_name: gene.gene_name.clone(),
transcript_id: tx.transcript_id.clone(),
consequence: Consequence::SpliceSite { donor: true },
hgvs_c: None,
hgvs_p: None,
codon_change: None,
cds_position: None,
protein_position: None,
exon_distance: Some((pos0 - exon.end) as i64 + 1),
};
}
if exon.start >= config.splice_window
&& pos0 >= exon.start - config.splice_window
&& pos0 < exon.start
{
return VariantEffect {
gene_id: gene.gene_id.clone(),
gene_name: gene.gene_name.clone(),
transcript_id: tx.transcript_id.clone(),
consequence: Consequence::SpliceSite { donor: false },
hgvs_c: None,
hgvs_p: None,
codon_change: None,
cds_position: None,
protein_position: None,
exon_distance: Some(-((exon.start - pos0) as i64)),
};
}
}
return VariantEffect {
gene_id: gene.gene_id.clone(),
gene_name: gene.gene_name.clone(),
transcript_id: tx.transcript_id.clone(),
consequence: Consequence::Intronic,
hgvs_c: None,
hgvs_p: None,
codon_change: None,
cds_position: None,
protein_position: None,
exon_distance: None,
};
}
let (cds_start, cds_end) = match (tx.cds_start, tx.cds_end) {
(Some(cs), Some(ce)) => (cs, ce),
_ => {
return VariantEffect {
gene_id: gene.gene_id.clone(),
gene_name: gene.gene_name.clone(),
transcript_id: tx.transcript_id.clone(),
consequence: Consequence::NonCoding,
hgvs_c: None,
hgvs_p: None,
codon_change: None,
cds_position: None,
protein_position: None,
exon_distance: None,
};
}
};
if gene.strand.is_reverse() {
if pos0 >= cds_end {
return VariantEffect {
gene_id: gene.gene_id.clone(),
gene_name: gene.gene_name.clone(),
transcript_id: tx.transcript_id.clone(),
consequence: Consequence::FivePrimeUtr,
hgvs_c: None,
hgvs_p: None,
codon_change: None,
cds_position: None,
protein_position: None,
exon_distance: None,
};
}
if pos0 < cds_start {
return VariantEffect {
gene_id: gene.gene_id.clone(),
gene_name: gene.gene_name.clone(),
transcript_id: tx.transcript_id.clone(),
consequence: Consequence::ThreePrimeUtr,
hgvs_c: None,
hgvs_p: None,
codon_change: None,
cds_position: None,
protein_position: None,
exon_distance: None,
};
}
} else {
if pos0 < cds_start {
return VariantEffect {
gene_id: gene.gene_id.clone(),
gene_name: gene.gene_name.clone(),
transcript_id: tx.transcript_id.clone(),
consequence: Consequence::FivePrimeUtr,
hgvs_c: None,
hgvs_p: None,
codon_change: None,
cds_position: None,
protein_position: None,
exon_distance: None,
};
}
if pos0 >= cds_end {
return VariantEffect {
gene_id: gene.gene_id.clone(),
gene_name: gene.gene_name.clone(),
transcript_id: tx.transcript_id.clone(),
consequence: Consequence::ThreePrimeUtr,
hgvs_c: None,
hgvs_p: None,
codon_change: None,
cds_position: None,
protein_position: None,
exon_distance: None,
};
}
}
let exons_sorted = sorted_exons(tx, &gene.strand);
let mut cds_offset: u64 = 0;
let mut found = false;
for exon in &exons_sorted {
let exon_cds_start = exon.start.max(cds_start);
let exon_cds_end = exon.end.min(cds_end);
if exon_cds_start >= exon_cds_end {
continue; }
if pos0 >= exon_cds_start && pos0 < exon_cds_end {
if gene.strand.is_reverse() {
cds_offset += exon_cds_end - 1 - pos0;
} else {
cds_offset += pos0 - exon_cds_start;
}
found = true;
break;
} else {
cds_offset += exon_cds_end - exon_cds_start;
}
}
if !found {
return VariantEffect {
gene_id: gene.gene_id.clone(),
gene_name: gene.gene_name.clone(),
transcript_id: tx.transcript_id.clone(),
consequence: Consequence::Intronic,
hgvs_c: None,
hgvs_p: None,
codon_change: None,
cds_position: None,
protein_position: None,
exon_distance: None,
};
}
let cds_pos_1based = cds_offset + 1; let codon_pos_0based = cds_offset / 3; let codon_pos_1based = codon_pos_0based + 1; let codon_offset = (cds_offset % 3) as usize;
let ref_allele = if gene.strand.is_reverse() {
reverse_complement(&variant.ref_allele)
} else {
variant.ref_allele.clone()
};
let alt_allele = if gene.strand.is_reverse() {
reverse_complement(&variant.alt_alleles[0])
} else {
variant.alt_alleles[0].clone()
};
if ref_allele.len() != alt_allele.len() {
let indel_len = (ref_allele.len() as i64 - alt_allele.len() as i64).unsigned_abs() as usize;
let consequence = if indel_len % 3 != 0 {
Consequence::Frameshift {
codon_pos: codon_pos_1based,
}
} else {
Consequence::InFrame {
codon_pos: codon_pos_1based,
}
};
return VariantEffect {
gene_id: gene.gene_id.clone(),
gene_name: gene.gene_name.clone(),
transcript_id: tx.transcript_id.clone(),
consequence,
hgvs_c: Some(format!(
"c.{}del",
cds_pos_1based,
)),
hgvs_p: None,
codon_change: None,
cds_position: Some(cds_pos_1based),
protein_position: Some(codon_pos_1based),
exon_distance: None,
};
}
let ref_codon = build_codon_from_cds(
tx,
&gene.strand,
cds_start,
cds_end,
codon_pos_0based,
&exons_sorted,
);
if ref_codon.len() < 3 {
return VariantEffect {
gene_id: gene.gene_id.clone(),
gene_name: gene.gene_name.clone(),
transcript_id: tx.transcript_id.clone(),
consequence: Consequence::NonCoding,
hgvs_c: None,
hgvs_p: None,
codon_change: None,
cds_position: None,
protein_position: None,
exon_distance: None,
};
}
let mut alt_codon = ref_codon.clone();
if ref_allele.len() == 1 {
alt_codon[codon_offset] = alt_allele[0].to_ascii_uppercase();
}
let ref_aa = translate_codon(&ref_codon);
let alt_aa = translate_codon(&alt_codon);
let hgvs_c_ref = if gene.strand.is_reverse() {
complement(variant.ref_allele[0]).to_ascii_uppercase()
} else {
variant.ref_allele[0].to_ascii_uppercase()
};
let hgvs_c_alt = if gene.strand.is_reverse() {
complement(variant.alt_alleles[0][0]).to_ascii_uppercase()
} else {
variant.alt_alleles[0][0].to_ascii_uppercase()
};
let hgvs_c = Some(format!(
"c.{}{}>{}",
cds_pos_1based,
hgvs_c_ref as char,
hgvs_c_alt as char,
));
let consequence = if ref_aa == alt_aa {
Consequence::Synonymous {
aa: ref_aa,
codon_pos: codon_pos_1based,
}
} else if codon_pos_0based == 0 && ref_aa == b'M' {
Consequence::StartLoss { ref_aa }
} else if ref_aa == b'*' && alt_aa != b'*' {
Consequence::StopLoss { ref_aa }
} else if alt_aa == b'*' {
Consequence::Nonsense {
ref_aa,
codon_pos: codon_pos_1based,
}
} else {
Consequence::Missense {
ref_aa,
alt_aa,
codon_pos: codon_pos_1based,
}
};
let hgvs_p = match &consequence {
Consequence::Missense {
ref_aa, alt_aa, ..
} => Some(format!(
"p.{}{}{}",
aa_three_letter(*ref_aa),
codon_pos_1based,
aa_three_letter(*alt_aa),
)),
Consequence::Nonsense { ref_aa, .. } => Some(format!(
"p.{}{}{}",
aa_three_letter(*ref_aa),
codon_pos_1based,
aa_three_letter(b'*'),
)),
Consequence::Synonymous { aa, .. } => Some(format!(
"p.{}{}=",
aa_three_letter(*aa),
codon_pos_1based,
)),
Consequence::StartLoss { .. } => Some("p.Met1?".to_string()),
Consequence::StopLoss { .. } => Some(format!(
"p.Ter{}?ext",
codon_pos_1based,
)),
_ => None,
};
let ref_codon_str = String::from_utf8_lossy(&ref_codon).to_string();
let alt_codon_str = String::from_utf8_lossy(&alt_codon).to_string();
VariantEffect {
gene_id: gene.gene_id.clone(),
gene_name: gene.gene_name.clone(),
transcript_id: tx.transcript_id.clone(),
consequence,
hgvs_c,
hgvs_p,
codon_change: Some((ref_codon_str, alt_codon_str)),
cds_position: Some(cds_pos_1based),
protein_position: Some(codon_pos_1based),
exon_distance: None,
}
}
fn build_codon_from_cds(
_tx: &Transcript,
strand: &Strand,
cds_start: u64,
cds_end: u64,
codon_pos_0based: u64,
exons_sorted: &[&crate::annotation::Exon],
) -> Vec<u8> {
let codon_start_offset = codon_pos_0based * 3;
let mut cds_bases: Vec<u8> = Vec::new();
let mut collected = 0u64;
let target_end = codon_start_offset + 3;
for exon in exons_sorted {
let exon_cds_start = exon.start.max(cds_start);
let exon_cds_end = exon.end.min(cds_end);
if exon_cds_start >= exon_cds_end {
continue;
}
let exon_cds_len = exon_cds_end - exon_cds_start;
if collected + exon_cds_len <= codon_start_offset {
collected += exon_cds_len;
continue;
}
let skip = if codon_start_offset > collected {
codon_start_offset - collected
} else {
0
};
let take = (target_end - (collected + skip).max(codon_start_offset))
.min(exon_cds_len - skip);
if strand.is_reverse() {
let genomic_start = exon_cds_end - skip - take;
let genomic_end = exon_cds_end - skip;
for _pos in (genomic_start..genomic_end).rev() {
cds_bases.push(b'N');
}
} else {
let genomic_start = exon_cds_start + skip;
let genomic_end = genomic_start + take;
for _pos in genomic_start..genomic_end {
cds_bases.push(b'N');
}
}
collected += exon_cds_len;
if cds_bases.len() >= 3 {
break;
}
}
cds_bases.truncate(3);
cds_bases
}
#[cfg(test)]
mod tests {
use super::*;
use crate::annotation::{Exon, Gene, GeneType, Transcript};
use crate::genomic::Strand;
use crate::variant::Variant;
fn test_gene_forward() -> Gene {
Gene {
gene_id: "GENE001".into(),
gene_name: "TEST1".into(),
chrom: "chr1".into(),
start: 1000,
end: 2000,
strand: Strand::Forward,
gene_type: GeneType::ProteinCoding,
transcripts: vec![Transcript {
transcript_id: "TX001".into(),
start: 1000,
end: 2000,
exons: vec![
Exon {
exon_number: 1,
start: 1000,
end: 1200,
},
Exon {
exon_number: 2,
start: 1500,
end: 1800,
},
],
cds_start: Some(1050),
cds_end: Some(1750),
}],
}
}
fn test_gene_reverse() -> Gene {
Gene {
gene_id: "GENE002".into(),
gene_name: "TEST2".into(),
chrom: "chr2".into(),
start: 5000,
end: 6000,
strand: Strand::Reverse,
gene_type: GeneType::ProteinCoding,
transcripts: vec![Transcript {
transcript_id: "TX002".into(),
start: 5000,
end: 6000,
exons: vec![
Exon {
exon_number: 1,
start: 5000,
end: 5300,
},
Exon {
exon_number: 2,
start: 5600,
end: 5900,
},
],
cds_start: Some(5100),
cds_end: Some(5800),
}],
}
}
fn test_gene_simple_cds() -> Gene {
Gene {
gene_id: "GENE003".into(),
gene_name: "SIMPLE".into(),
chrom: "chr1".into(),
start: 100,
end: 400,
strand: Strand::Forward,
gene_type: GeneType::ProteinCoding,
transcripts: vec![Transcript {
transcript_id: "TX003".into(),
start: 100,
end: 400,
exons: vec![Exon {
exon_number: 1,
start: 100,
end: 400,
}],
cds_start: Some(100),
cds_end: Some(400),
}],
}
}
fn annotate_one(variant: &Variant, gene: &Gene) -> Vec<VariantEffect> {
annotate_variant(variant, &[gene.clone()], &AnnotationConfig::default())
}
#[test]
fn test_missense_snv() {
let ref_codon = [b'G', b'T', b'G']; let mut alt_codon = ref_codon;
alt_codon[0] = b'G'; alt_codon[1] = b'A'; alt_codon[2] = b'G';
let ref_aa = translate_codon(&ref_codon);
let alt_aa = translate_codon(&alt_codon);
assert_eq!(ref_aa, b'V'); assert_eq!(alt_aa, b'E'); assert_ne!(ref_aa, alt_aa);
let gene = test_gene_simple_cds();
let v = Variant::new("chr1", 105, vec![b'T'], vec![vec![b'A']]).unwrap();
let effects = annotate_one(&v, &gene);
assert_eq!(effects.len(), 1);
let eff = &effects[0];
assert_eq!(eff.cds_position, Some(5));
assert_eq!(eff.protein_position, Some(2));
}
#[test]
fn test_synonymous_snv() {
let ref_codon = [b'C', b'T', b'A'];
let alt_codon = [b'C', b'T', b'G'];
let ref_aa = translate_codon(&ref_codon);
let alt_aa = translate_codon(&alt_codon);
assert_eq!(ref_aa, b'L');
assert_eq!(alt_aa, b'L');
let codon_pos = 5u64;
let consequence = if ref_aa == alt_aa {
Consequence::Synonymous { aa: ref_aa, codon_pos }
} else {
Consequence::Missense { ref_aa, alt_aa, codon_pos }
};
assert!(matches!(consequence, Consequence::Synonymous { aa: b'L', codon_pos: 5 }));
}
#[test]
fn test_nonsense_stop_gain() {
let ref_codon = [b'C', b'A', b'G'];
let alt_codon = [b'T', b'A', b'G'];
let ref_aa = translate_codon(&ref_codon);
let alt_aa = translate_codon(&alt_codon);
assert_eq!(ref_aa, b'Q');
assert_eq!(alt_aa, b'*');
let consequence = Consequence::Nonsense {
ref_aa,
codon_pos: 10,
};
assert!(matches!(consequence, Consequence::Nonsense { ref_aa: b'Q', codon_pos: 10 }));
}
#[test]
fn test_frameshift_deletion() {
let gene = test_gene_simple_cds();
let v = Variant::new("chr1", 105, vec![b'A', b'T'], vec![vec![b'A']]).unwrap();
let effects = annotate_one(&v, &gene);
assert_eq!(effects.len(), 1);
assert!(matches!(effects[0].consequence, Consequence::Frameshift { .. }));
}
#[test]
fn test_inframe_deletion() {
let gene = test_gene_simple_cds();
let v = Variant::new("chr1", 105, vec![b'A', b'T', b'G', b'C'], vec![vec![b'A']]).unwrap();
let effects = annotate_one(&v, &gene);
assert_eq!(effects.len(), 1);
assert!(matches!(effects[0].consequence, Consequence::InFrame { .. }));
}
#[test]
fn test_start_loss() {
let ref_aa = b'M';
let alt_aa = b'T';
let codon_pos_0based = 0u64;
let consequence = if codon_pos_0based == 0 && ref_aa == b'M' {
Consequence::StartLoss { ref_aa }
} else {
Consequence::Missense { ref_aa, alt_aa, codon_pos: 1 }
};
assert!(matches!(consequence, Consequence::StartLoss { ref_aa: b'M' }));
}
#[test]
fn test_stop_loss() {
let ref_aa = b'*';
let alt_aa = b'Q';
let codon_pos_0based = 100u64;
let consequence = if ref_aa == b'*' && alt_aa != b'*' {
Consequence::StopLoss { ref_aa }
} else {
Consequence::Missense { ref_aa, alt_aa, codon_pos: 101 }
};
assert!(matches!(consequence, Consequence::StopLoss { ref_aa: b'*' }));
}
#[test]
fn test_five_prime_utr() {
let gene = test_gene_forward();
let v = Variant::new("chr1", 1021, vec![b'A'], vec![vec![b'G']]).unwrap();
let effects = annotate_one(&v, &gene);
assert_eq!(effects.len(), 1);
assert!(matches!(effects[0].consequence, Consequence::FivePrimeUtr));
}
#[test]
fn test_three_prime_utr() {
let gene = test_gene_forward();
let v = Variant::new("chr1", 1761, vec![b'A'], vec![vec![b'G']]).unwrap();
let effects = annotate_one(&v, &gene);
assert_eq!(effects.len(), 1);
assert!(matches!(effects[0].consequence, Consequence::ThreePrimeUtr));
}
#[test]
fn test_intronic_variant() {
let gene = test_gene_forward();
let v = Variant::new("chr1", 1351, vec![b'A'], vec![vec![b'G']]).unwrap();
let effects = annotate_one(&v, &gene);
assert_eq!(effects.len(), 1);
assert!(matches!(effects[0].consequence, Consequence::Intronic));
}
#[test]
fn test_splice_donor() {
let gene = test_gene_forward();
let v = Variant::new("chr1", 1201, vec![b'G'], vec![vec![b'A']]).unwrap();
let effects = annotate_one(&v, &gene);
assert_eq!(effects.len(), 1);
assert!(matches!(
effects[0].consequence,
Consequence::SpliceSite { donor: true }
));
}
#[test]
fn test_splice_acceptor() {
let gene = test_gene_forward();
let v = Variant::new("chr1", 1500, vec![b'A'], vec![vec![b'G']]).unwrap();
let effects = annotate_one(&v, &gene);
assert_eq!(effects.len(), 1);
assert!(matches!(
effects[0].consequence,
Consequence::SpliceSite { donor: false }
));
}
#[test]
fn test_upstream_variant() {
let gene = test_gene_forward();
let v = Variant::new("chr1", 501, vec![b'A'], vec![vec![b'G']]).unwrap();
let effects = annotate_one(&v, &gene);
assert_eq!(effects.len(), 1);
assert!(matches!(effects[0].consequence, Consequence::Upstream));
}
#[test]
fn test_downstream_variant() {
let gene = test_gene_forward();
let v = Variant::new("chr1", 2501, vec![b'A'], vec![vec![b'G']]).unwrap();
let effects = annotate_one(&v, &gene);
assert_eq!(effects.len(), 1);
assert!(matches!(effects[0].consequence, Consequence::Downstream));
}
#[test]
fn test_reverse_strand_gene() {
let gene = test_gene_reverse();
let v = Variant::new("chr2", 4501, vec![b'A'], vec![vec![b'G']]).unwrap();
let effects = annotate_one(&v, &gene);
assert_eq!(effects.len(), 1);
assert!(matches!(effects[0].consequence, Consequence::Downstream));
let v2 = Variant::new("chr2", 6501, vec![b'A'], vec![vec![b'G']]).unwrap();
let effects2 = annotate_one(&v2, &gene);
assert_eq!(effects2.len(), 1);
assert!(matches!(effects2[0].consequence, Consequence::Upstream));
let v3 = Variant::new("chr2", 5851, vec![b'A'], vec![vec![b'G']]).unwrap();
let effects3 = annotate_one(&v3, &gene);
assert_eq!(effects3.len(), 1);
assert!(matches!(effects3[0].consequence, Consequence::FivePrimeUtr));
let v4 = Variant::new("chr2", 5051, vec![b'A'], vec![vec![b'G']]).unwrap();
let effects4 = annotate_one(&v4, &gene);
assert_eq!(effects4.len(), 1);
assert!(matches!(effects4[0].consequence, Consequence::ThreePrimeUtr));
}
#[test]
fn test_hgvs_coding_notation() {
let gene = test_gene_simple_cds();
let v = Variant::new("chr1", 105, vec![b'T'], vec![vec![b'A']]).unwrap();
let effects = annotate_one(&v, &gene);
assert_eq!(effects.len(), 1);
let hgvs = effects[0].hgvs_c.as_ref().unwrap();
assert_eq!(hgvs, "c.5T>A");
}
#[test]
fn test_hgvs_protein_notation() {
let ref_aa = b'V';
let alt_aa = b'E';
let codon_pos = 600u64;
let hgvs_p = format!(
"p.{}{}{}",
aa_three_letter(ref_aa),
codon_pos,
aa_three_letter(alt_aa),
);
assert_eq!(hgvs_p, "p.Val600Glu");
let hgvs_nonsense = format!(
"p.{}{}{}",
aa_three_letter(b'Q'),
42,
aa_three_letter(b'*'),
);
assert_eq!(hgvs_nonsense, "p.Gln42Ter");
let hgvs_syn = format!("p.{}{}=", aa_three_letter(b'L'), 10);
assert_eq!(hgvs_syn, "p.Leu10=");
}
#[test]
fn test_batch_annotation_interval_tree() {
let gene1 = test_gene_forward(); let gene2 = test_gene_simple_cds();
let genes = vec![gene1, gene2];
let variants = vec![
Variant::new("chr1", 105, vec![b'T'], vec![vec![b'A']]).unwrap(),
Variant::new("chr1", 1021, vec![b'A'], vec![vec![b'G']]).unwrap(),
Variant::new("chr1", 50000, vec![b'A'], vec![vec![b'G']]).unwrap(),
];
let effects = annotate_variants(&variants, &genes, &AnnotationConfig::default());
let v1_effects: Vec<_> = effects
.iter()
.filter(|e| e.gene_name == "SIMPLE")
.collect();
assert!(!v1_effects.is_empty());
let v2_effects: Vec<_> = effects
.iter()
.filter(|e| e.gene_name == "TEST1")
.collect();
assert!(!v2_effects.is_empty());
assert!(
v2_effects.iter().any(|e| matches!(e.consequence, Consequence::FivePrimeUtr)),
"expected at least one FivePrimeUtr for TEST1"
);
let v3_count = effects
.iter()
.filter(|e| e.gene_name != "SIMPLE" && e.gene_name != "TEST1")
.count();
assert_eq!(v3_count, 0);
}
#[test]
fn test_non_coding_gene() {
let gene = Gene {
gene_id: "GENE_NC".into(),
gene_name: "LINC001".into(),
chrom: "chr1".into(),
start: 1000,
end: 2000,
strand: Strand::Forward,
gene_type: GeneType::LncRNA,
transcripts: vec![Transcript {
transcript_id: "TX_NC".into(),
start: 1000,
end: 2000,
exons: vec![Exon {
exon_number: 1,
start: 1000,
end: 2000,
}],
cds_start: None,
cds_end: None,
}],
};
let v = Variant::new("chr1", 1501, vec![b'A'], vec![vec![b'G']]).unwrap();
let effects = annotate_one(&v, &gene);
assert_eq!(effects.len(), 1);
assert!(matches!(effects[0].consequence, Consequence::NonCoding));
assert_eq!(effects[0].gene_name, "LINC001");
}
#[test]
fn test_codon_change_field() {
let gene = test_gene_simple_cds();
let v = Variant::new("chr1", 105, vec![b'T'], vec![vec![b'A']]).unwrap();
let effects = annotate_one(&v, &gene);
assert_eq!(effects.len(), 1);
let eff = &effects[0];
assert!(eff.codon_change.is_some());
let (ref_c, alt_c) = eff.codon_change.as_ref().unwrap();
assert_eq!(ref_c.len(), 3);
assert_eq!(alt_c.len(), 3);
let diffs: usize = ref_c
.bytes()
.zip(alt_c.bytes())
.filter(|(a, b)| a != b)
.count();
assert_eq!(diffs, 1, "SNV should change exactly one codon position");
}
#[test]
fn test_codon_table_basics() {
assert_eq!(translate_codon(b"ATG"), b'M');
assert_eq!(translate_codon(b"TAA"), b'*');
assert_eq!(translate_codon(b"TAG"), b'*');
assert_eq!(translate_codon(b"TGA"), b'*');
assert_eq!(translate_codon(b"TTT"), b'F');
assert_eq!(translate_codon(b"GGG"), b'G');
assert_eq!(translate_codon(b"AAA"), b'K');
assert_eq!(translate_codon(b"CCC"), b'P');
}
#[test]
fn test_reverse_complement() {
assert_eq!(reverse_complement(b"ATGC"), b"GCAT");
assert_eq!(reverse_complement(b"AAAA"), b"TTTT");
assert_eq!(reverse_complement(b""), b"");
assert_eq!(reverse_complement(b"A"), b"T");
}
#[test]
fn test_aa_three_letter_codes() {
assert_eq!(aa_three_letter(b'M'), "Met");
assert_eq!(aa_three_letter(b'V'), "Val");
assert_eq!(aa_three_letter(b'E'), "Glu");
assert_eq!(aa_three_letter(b'*'), "Ter");
assert_eq!(aa_three_letter(b'X'), "Xaa");
}
#[test]
fn test_annotation_config_default() {
let config = AnnotationConfig::default();
assert_eq!(config.upstream_distance, 5000);
assert_eq!(config.downstream_distance, 5000);
assert_eq!(config.splice_window, 2);
}
#[test]
fn test_splice_scoring() {
let gene = Gene {
gene_id: "SPLICE_GENE".into(),
gene_name: "SPLICE".into(),
chrom: "chr1".into(),
start: 0,
end: 100,
strand: Strand::Forward,
gene_type: GeneType::ProteinCoding,
transcripts: vec![Transcript {
transcript_id: "TX_SPLICE".into(),
start: 0,
end: 100,
exons: vec![
Exon {
exon_number: 1,
start: 10,
end: 30,
},
Exon {
exon_number: 2,
start: 50,
end: 80,
},
],
cds_start: Some(10),
cds_end: Some(80),
}],
};
let mut ref_seq = vec![b'A'; 100];
ref_seq[30] = b'G'; ref_seq[31] = b'T'; ref_seq[48] = b'A'; ref_seq[49] = b'G';
let v = Variant::new("chr1", 31, vec![b'G'], vec![vec![b'A']]).unwrap();
let scores = score_splice_disruption(&v, &gene, &ref_seq);
assert!(!scores.is_empty());
let donor_score = &scores[0];
assert!(donor_score.is_canonical);
assert!(donor_score.disrupts_consensus);
assert!(donor_score.delta_score < 0.0, "disrupting GT should lower score");
}
#[test]
fn test_variant_different_chrom_no_effect() {
let gene = test_gene_forward(); let v = Variant::new("chr2", 1500, vec![b'A'], vec![vec![b'G']]).unwrap();
let effects = annotate_one(&v, &gene);
assert!(effects.is_empty());
}
#[test]
fn test_variant_far_from_gene_no_effect() {
let gene = test_gene_forward(); let v = Variant::new("chr1", 100001, vec![b'A'], vec![vec![b'G']]).unwrap();
let effects = annotate_one(&v, &gene);
assert!(effects.is_empty());
}
#[test]
fn test_exon_distance_splice_site() {
let gene = test_gene_forward();
let v = Variant::new("chr1", 1201, vec![b'G'], vec![vec![b'A']]).unwrap();
let effects = annotate_one(&v, &gene);
assert_eq!(effects[0].exon_distance, Some(1));
let v2 = Variant::new("chr1", 1500, vec![b'A'], vec![vec![b'G']]).unwrap();
let effects2 = annotate_one(&v2, &gene);
assert_eq!(effects2[0].exon_distance, Some(-1));
}
}