1use crate::annotation::{Gene, GeneType, Transcript};
8use crate::genomic::Strand;
9use crate::interval_tree::{Interval, IntervalTree};
10use crate::variant::Variant;
11
12fn nucleotide_index(base: u8) -> usize {
18 match base.to_ascii_uppercase() {
19 b'A' => 0,
20 b'C' => 1,
21 b'G' => 2,
22 b'T' | b'U' => 3,
23 _ => 0, }
25}
26
27fn codon_index(b1: u8, b2: u8, b3: u8) -> usize {
29 nucleotide_index(b1) * 16 + nucleotide_index(b2) * 4 + nucleotide_index(b3)
30}
31
32const CODON_TABLE: [u8; 64] = [
36 b'K', b'N', b'K', b'N', b'T', b'T', b'T', b'T', b'R', b'S', b'R', b'S', b'I', b'I', b'M', b'I',
38 b'Q', b'H', b'Q', b'H', b'P', b'P', b'P', b'P', b'R', b'R', b'R', b'R', b'L', b'L', b'L', b'L',
40 b'E', b'D', b'E', b'D', b'A', b'A', b'A', b'A', b'G', b'G', b'G', b'G', b'V', b'V', b'V', b'V',
42 b'*', b'Y', b'*', b'Y', b'S', b'S', b'S', b'S', b'*', b'C', b'W', b'C', b'L', b'F', b'L', b'F',
44];
45
46fn translate_codon(codon: &[u8]) -> u8 {
48 if codon.len() < 3 {
49 return b'X';
50 }
51 CODON_TABLE[codon_index(codon[0], codon[1], codon[2])]
52}
53
54fn complement(base: u8) -> u8 {
56 match base.to_ascii_uppercase() {
57 b'A' => b'T',
58 b'T' => b'A',
59 b'C' => b'G',
60 b'G' => b'C',
61 other => other,
62 }
63}
64
65fn reverse_complement(seq: &[u8]) -> Vec<u8> {
67 seq.iter().rev().map(|&b| complement(b)).collect()
68}
69
70fn aa_three_letter(aa: u8) -> &'static str {
72 match aa.to_ascii_uppercase() {
73 b'A' => "Ala",
74 b'R' => "Arg",
75 b'N' => "Asn",
76 b'D' => "Asp",
77 b'C' => "Cys",
78 b'E' => "Glu",
79 b'Q' => "Gln",
80 b'G' => "Gly",
81 b'H' => "His",
82 b'I' => "Ile",
83 b'L' => "Leu",
84 b'K' => "Lys",
85 b'M' => "Met",
86 b'F' => "Phe",
87 b'P' => "Pro",
88 b'S' => "Ser",
89 b'T' => "Thr",
90 b'W' => "Trp",
91 b'Y' => "Tyr",
92 b'V' => "Val",
93 b'*' => "Ter",
94 _ => "Xaa",
95 }
96}
97
98#[derive(Debug, Clone, PartialEq)]
104#[cfg_attr(feature = "serde", derive(serde::Serialize, serde::Deserialize))]
105pub enum Consequence {
106 Missense {
108 ref_aa: u8,
109 alt_aa: u8,
110 codon_pos: u64,
111 },
112 Nonsense {
114 ref_aa: u8,
115 codon_pos: u64,
116 },
117 Synonymous {
119 aa: u8,
120 codon_pos: u64,
121 },
122 Frameshift {
124 codon_pos: u64,
125 },
126 InFrame {
128 codon_pos: u64,
129 },
130 FivePrimeUtr,
132 ThreePrimeUtr,
134 SpliceSite {
136 donor: bool,
137 },
138 Intronic,
140 Upstream,
142 Downstream,
144 NonCoding,
146 StartLoss {
148 ref_aa: u8,
149 },
150 StopLoss {
152 ref_aa: u8,
153 },
154}
155
156#[derive(Debug, Clone)]
158#[cfg_attr(feature = "serde", derive(serde::Serialize, serde::Deserialize))]
159pub struct VariantEffect {
160 pub gene_id: String,
161 pub gene_name: String,
162 pub transcript_id: String,
163 pub consequence: Consequence,
164 pub hgvs_c: Option<String>,
166 pub hgvs_p: Option<String>,
168 pub codon_change: Option<(String, String)>,
170 pub cds_position: Option<u64>,
172 pub protein_position: Option<u64>,
174 pub exon_distance: Option<i64>,
176}
177
178#[derive(Debug, Clone)]
180#[cfg_attr(feature = "serde", derive(serde::Serialize, serde::Deserialize))]
181pub struct SpliceScore {
182 pub transcript_id: String,
183 pub is_canonical: bool,
185 pub disrupts_consensus: bool,
187 pub score_ref: f64,
189 pub score_alt: f64,
191 pub delta_score: f64,
193}
194
195#[derive(Debug, Clone)]
197#[cfg_attr(feature = "serde", derive(serde::Serialize, serde::Deserialize))]
198pub struct AnnotationConfig {
199 pub upstream_distance: u64,
201 pub downstream_distance: u64,
203 pub splice_window: u64,
205}
206
207impl Default for AnnotationConfig {
208 fn default() -> Self {
209 Self {
210 upstream_distance: 5000,
211 downstream_distance: 5000,
212 splice_window: 2,
213 }
214 }
215}
216
217const DONOR_PWM: [[f64; 4]; 9] = [
224 [0.33, 0.36, 0.18, 0.13], [0.60, 0.03, 0.12, 0.25], [0.09, 0.03, 0.81, 0.07], [0.00, 0.00, 1.00, 0.00], [0.00, 0.00, 0.00, 1.00], [0.54, 0.02, 0.38, 0.06], [0.71, 0.08, 0.12, 0.09], [0.06, 0.04, 0.82, 0.08], [0.15, 0.15, 0.21, 0.49], ];
234
235const ACCEPTOR_PWM: [[f64; 4]; 23] = [
238 [0.25, 0.25, 0.25, 0.25], [0.25, 0.25, 0.25, 0.25], [0.25, 0.25, 0.25, 0.25], [0.25, 0.25, 0.25, 0.25], [0.25, 0.25, 0.25, 0.25], [0.25, 0.25, 0.25, 0.25], [0.25, 0.25, 0.25, 0.25], [0.25, 0.25, 0.25, 0.25], [0.25, 0.25, 0.25, 0.25], [0.25, 0.25, 0.25, 0.25], [0.24, 0.15, 0.09, 0.52], [0.20, 0.16, 0.09, 0.55], [0.16, 0.18, 0.09, 0.57], [0.14, 0.19, 0.09, 0.58], [0.12, 0.21, 0.08, 0.59], [0.10, 0.25, 0.07, 0.58], [0.09, 0.28, 0.07, 0.56], [0.08, 0.30, 0.08, 0.54], [0.08, 0.32, 0.08, 0.52], [0.08, 0.28, 0.12, 0.52], [0.00, 0.00, 0.00, 0.00], [0.00, 0.00, 1.00, 0.00], [0.25, 0.25, 0.25, 0.25], ];
262
263fn score_pwm(seq: &[u8], pwm: &[[f64; 4]]) -> f64 {
265 let len = seq.len().min(pwm.len());
266 let mut score = 0.0;
267 for i in 0..len {
268 let idx = nucleotide_index(seq[i]);
269 let freq = pwm[i][idx];
270 if freq <= 0.0 {
271 score += -10.0; } else {
273 score += (freq / 0.25_f64).log2();
274 }
275 }
276 score
277}
278
279pub fn annotate_variant(
289 variant: &Variant,
290 genes: &[Gene],
291 config: &AnnotationConfig,
292) -> Vec<VariantEffect> {
293 let pos0 = variant.position - 1; let mut effects = Vec::new();
295
296 for gene in genes {
297 if variant.chrom != gene.chrom {
298 continue;
299 }
300
301 let (upstream_ext, downstream_ext) = if gene.strand.is_reverse() {
303 (config.downstream_distance, config.upstream_distance)
304 } else {
305 (config.upstream_distance, config.downstream_distance)
306 };
307
308 let region_start = gene.start.saturating_sub(upstream_ext);
309 let region_end = gene.end + downstream_ext;
310
311 if pos0 < region_start || pos0 >= region_end {
312 continue;
313 }
314
315 if pos0 < gene.start {
317 let consequence = if gene.strand.is_reverse() {
319 Consequence::Downstream
320 } else {
321 Consequence::Upstream
322 };
323 for tx in &gene.transcripts {
324 effects.push(VariantEffect {
325 gene_id: gene.gene_id.clone(),
326 gene_name: gene.gene_name.clone(),
327 transcript_id: tx.transcript_id.clone(),
328 consequence: consequence.clone(),
329 hgvs_c: None,
330 hgvs_p: None,
331 codon_change: None,
332 cds_position: None,
333 protein_position: None,
334 exon_distance: None,
335 });
336 }
337 continue;
338 }
339
340 if pos0 >= gene.end {
341 let consequence = if gene.strand.is_reverse() {
343 Consequence::Upstream
344 } else {
345 Consequence::Downstream
346 };
347 for tx in &gene.transcripts {
348 effects.push(VariantEffect {
349 gene_id: gene.gene_id.clone(),
350 gene_name: gene.gene_name.clone(),
351 transcript_id: tx.transcript_id.clone(),
352 consequence: consequence.clone(),
353 hgvs_c: None,
354 hgvs_p: None,
355 codon_change: None,
356 cds_position: None,
357 protein_position: None,
358 exon_distance: None,
359 });
360 }
361 continue;
362 }
363
364 if gene.gene_type != GeneType::ProteinCoding {
366 for tx in &gene.transcripts {
367 effects.push(VariantEffect {
368 gene_id: gene.gene_id.clone(),
369 gene_name: gene.gene_name.clone(),
370 transcript_id: tx.transcript_id.clone(),
371 consequence: Consequence::NonCoding,
372 hgvs_c: None,
373 hgvs_p: None,
374 codon_change: None,
375 cds_position: None,
376 protein_position: None,
377 exon_distance: None,
378 });
379 }
380 continue;
381 }
382
383 for tx in &gene.transcripts {
385 let effect = annotate_transcript(variant, gene, tx, config);
386 effects.push(effect);
387 }
388 }
389
390 effects
391}
392
393pub fn annotate_variants(
398 variants: &[Variant],
399 genes: &[Gene],
400 config: &AnnotationConfig,
401) -> Vec<VariantEffect> {
402 if genes.is_empty() || variants.is_empty() {
403 return Vec::new();
404 }
405
406 let mut chrom_genes: std::collections::HashMap<&str, Vec<(usize, &Gene)>> =
408 std::collections::HashMap::new();
409 for (idx, gene) in genes.iter().enumerate() {
410 chrom_genes
411 .entry(gene.chrom.as_str())
412 .or_default()
413 .push((idx, gene));
414 }
415
416 let max_ext = config.upstream_distance.max(config.downstream_distance);
418 let mut chrom_trees: std::collections::HashMap<&str, IntervalTree<usize>> =
419 std::collections::HashMap::new();
420 for (chrom, gene_list) in &chrom_genes {
421 let intervals: Vec<Interval<usize>> = gene_list
422 .iter()
423 .map(|&(idx, gene)| {
424 Interval::new(
425 gene.start.saturating_sub(max_ext),
426 gene.end + max_ext,
427 idx,
428 )
429 })
430 .collect();
431 chrom_trees.insert(chrom, IntervalTree::from_unsorted(intervals));
432 }
433
434 let mut all_effects = Vec::new();
435
436 for variant in variants {
437 let pos0 = variant.position - 1;
438 if let Some(tree) = chrom_trees.get(variant.chrom.as_str()) {
439 let hits = tree.query(pos0, pos0 + 1);
440 let mut hit_genes: Vec<&Gene> = hits.iter().map(|iv| &genes[iv.data]).collect();
441 hit_genes.sort_by_key(|g| &g.gene_id);
443 hit_genes.dedup_by_key(|g| &g.gene_id);
444 for gene in hit_genes {
445 let effs = annotate_variant(variant, std::slice::from_ref(gene), config);
446 all_effects.extend(effs);
447 }
448 }
449 }
450
451 all_effects
452}
453
454pub fn score_splice_disruption(
460 variant: &Variant,
461 gene: &Gene,
462 reference_seq: &[u8],
463) -> Vec<SpliceScore> {
464 let pos0 = variant.position - 1;
465 let mut scores = Vec::new();
466
467 for tx in &gene.transcripts {
468 let exons_sorted = sorted_exons(tx, &gene.strand);
469 for (i, exon) in exons_sorted.iter().enumerate() {
470 if i < exons_sorted.len() - 1 {
474 let donor_start = exon.end.saturating_sub(3);
475 let donor_end = exon.end + 6;
476 if pos0 >= donor_start && pos0 < donor_end && (donor_end as usize) <= reference_seq.len() {
477 let window_ref: Vec<u8> = reference_seq[donor_start as usize..donor_end as usize].to_vec();
478 let mut window_alt = window_ref.clone();
479 let offset = (pos0 - donor_start) as usize;
480 if offset < window_alt.len() && variant.alt_alleles[0].len() == 1 && variant.ref_allele.len() == 1 {
481 window_alt[offset] = variant.alt_alleles[0][0];
482 }
483
484 let s_ref = score_pwm(&window_ref, &DONOR_PWM);
485 let s_alt = score_pwm(&window_alt, &DONOR_PWM);
486
487 let is_canonical = reference_seq.len() > (exon.end as usize + 1)
489 && reference_seq[exon.end as usize] == b'G'
490 && reference_seq[exon.end as usize + 1] == b'T';
491
492 let disrupts = is_canonical
493 && (pos0 == exon.end || pos0 == exon.end + 1)
494 && variant.ref_allele != variant.alt_alleles[0];
495
496 scores.push(SpliceScore {
497 transcript_id: tx.transcript_id.clone(),
498 is_canonical,
499 disrupts_consensus: disrupts,
500 score_ref: s_ref,
501 score_alt: s_alt,
502 delta_score: s_alt - s_ref,
503 });
504 }
505 }
506
507 if i > 0 {
510 let acc_start = exon.start.saturating_sub(20);
511 let acc_end = exon.start + 3;
512 if pos0 >= acc_start && pos0 < acc_end && (acc_end as usize) <= reference_seq.len() {
513 let window_ref: Vec<u8> = reference_seq[acc_start as usize..acc_end as usize].to_vec();
514 let mut window_alt = window_ref.clone();
515 let offset = (pos0 - acc_start) as usize;
516 if offset < window_alt.len() && variant.alt_alleles[0].len() == 1 && variant.ref_allele.len() == 1 {
517 window_alt[offset] = variant.alt_alleles[0][0];
518 }
519
520 let s_ref = score_pwm(&window_ref, &ACCEPTOR_PWM);
521 let s_alt = score_pwm(&window_alt, &ACCEPTOR_PWM);
522
523 let is_canonical = exon.start >= 2
524 && reference_seq.len() > exon.start as usize
525 && reference_seq[exon.start as usize - 2] == b'A'
526 && reference_seq[exon.start as usize - 1] == b'G';
527
528 let disrupts = is_canonical
529 && (pos0 == exon.start - 2 || pos0 == exon.start - 1)
530 && variant.ref_allele != variant.alt_alleles[0];
531
532 scores.push(SpliceScore {
533 transcript_id: tx.transcript_id.clone(),
534 is_canonical,
535 disrupts_consensus: disrupts,
536 score_ref: s_ref,
537 score_alt: s_alt,
538 delta_score: s_alt - s_ref,
539 });
540 }
541 }
542 }
543 }
544
545 scores
546}
547
548fn sorted_exons<'a>(tx: &'a Transcript, strand: &Strand) -> Vec<&'a crate::annotation::Exon> {
554 let mut exons: Vec<&crate::annotation::Exon> = tx.exons.iter().collect();
555 if strand.is_reverse() {
556 exons.sort_by(|a, b| b.start.cmp(&a.start));
557 } else {
558 exons.sort_by_key(|e| e.start);
559 }
560 exons
561}
562
563fn annotate_transcript(
565 variant: &Variant,
566 gene: &Gene,
567 tx: &Transcript,
568 config: &AnnotationConfig,
569) -> VariantEffect {
570 let pos0 = variant.position - 1;
571
572 let in_exon = tx.exons.iter().any(|e| pos0 >= e.start && pos0 < e.end);
574
575 if !in_exon {
576 for exon in &tx.exons {
578 if pos0 >= exon.end && pos0 < exon.end + config.splice_window {
580 return VariantEffect {
581 gene_id: gene.gene_id.clone(),
582 gene_name: gene.gene_name.clone(),
583 transcript_id: tx.transcript_id.clone(),
584 consequence: Consequence::SpliceSite { donor: true },
585 hgvs_c: None,
586 hgvs_p: None,
587 codon_change: None,
588 cds_position: None,
589 protein_position: None,
590 exon_distance: Some((pos0 - exon.end) as i64 + 1),
591 };
592 }
593 if exon.start >= config.splice_window
595 && pos0 >= exon.start - config.splice_window
596 && pos0 < exon.start
597 {
598 return VariantEffect {
599 gene_id: gene.gene_id.clone(),
600 gene_name: gene.gene_name.clone(),
601 transcript_id: tx.transcript_id.clone(),
602 consequence: Consequence::SpliceSite { donor: false },
603 hgvs_c: None,
604 hgvs_p: None,
605 codon_change: None,
606 cds_position: None,
607 protein_position: None,
608 exon_distance: Some(-((exon.start - pos0) as i64)),
609 };
610 }
611 }
612
613 return VariantEffect {
615 gene_id: gene.gene_id.clone(),
616 gene_name: gene.gene_name.clone(),
617 transcript_id: tx.transcript_id.clone(),
618 consequence: Consequence::Intronic,
619 hgvs_c: None,
620 hgvs_p: None,
621 codon_change: None,
622 cds_position: None,
623 protein_position: None,
624 exon_distance: None,
625 };
626 }
627
628 let (cds_start, cds_end) = match (tx.cds_start, tx.cds_end) {
630 (Some(cs), Some(ce)) => (cs, ce),
631 _ => {
632 return VariantEffect {
634 gene_id: gene.gene_id.clone(),
635 gene_name: gene.gene_name.clone(),
636 transcript_id: tx.transcript_id.clone(),
637 consequence: Consequence::NonCoding,
638 hgvs_c: None,
639 hgvs_p: None,
640 codon_change: None,
641 cds_position: None,
642 protein_position: None,
643 exon_distance: None,
644 };
645 }
646 };
647
648 if gene.strand.is_reverse() {
650 if pos0 >= cds_end {
652 return VariantEffect {
653 gene_id: gene.gene_id.clone(),
654 gene_name: gene.gene_name.clone(),
655 transcript_id: tx.transcript_id.clone(),
656 consequence: Consequence::FivePrimeUtr,
657 hgvs_c: None,
658 hgvs_p: None,
659 codon_change: None,
660 cds_position: None,
661 protein_position: None,
662 exon_distance: None,
663 };
664 }
665 if pos0 < cds_start {
666 return VariantEffect {
667 gene_id: gene.gene_id.clone(),
668 gene_name: gene.gene_name.clone(),
669 transcript_id: tx.transcript_id.clone(),
670 consequence: Consequence::ThreePrimeUtr,
671 hgvs_c: None,
672 hgvs_p: None,
673 codon_change: None,
674 cds_position: None,
675 protein_position: None,
676 exon_distance: None,
677 };
678 }
679 } else {
680 if pos0 < cds_start {
682 return VariantEffect {
683 gene_id: gene.gene_id.clone(),
684 gene_name: gene.gene_name.clone(),
685 transcript_id: tx.transcript_id.clone(),
686 consequence: Consequence::FivePrimeUtr,
687 hgvs_c: None,
688 hgvs_p: None,
689 codon_change: None,
690 cds_position: None,
691 protein_position: None,
692 exon_distance: None,
693 };
694 }
695 if pos0 >= cds_end {
696 return VariantEffect {
697 gene_id: gene.gene_id.clone(),
698 gene_name: gene.gene_name.clone(),
699 transcript_id: tx.transcript_id.clone(),
700 consequence: Consequence::ThreePrimeUtr,
701 hgvs_c: None,
702 hgvs_p: None,
703 codon_change: None,
704 cds_position: None,
705 protein_position: None,
706 exon_distance: None,
707 };
708 }
709 }
710
711 let exons_sorted = sorted_exons(tx, &gene.strand);
713
714 let mut cds_offset: u64 = 0;
716 let mut found = false;
717
718 for exon in &exons_sorted {
719 let exon_cds_start = exon.start.max(cds_start);
721 let exon_cds_end = exon.end.min(cds_end);
722 if exon_cds_start >= exon_cds_end {
723 continue; }
725
726 if pos0 >= exon_cds_start && pos0 < exon_cds_end {
727 if gene.strand.is_reverse() {
729 cds_offset += exon_cds_end - 1 - pos0;
730 } else {
731 cds_offset += pos0 - exon_cds_start;
732 }
733 found = true;
734 break;
735 } else {
736 cds_offset += exon_cds_end - exon_cds_start;
737 }
738 }
739
740 if !found {
741 return VariantEffect {
743 gene_id: gene.gene_id.clone(),
744 gene_name: gene.gene_name.clone(),
745 transcript_id: tx.transcript_id.clone(),
746 consequence: Consequence::Intronic,
747 hgvs_c: None,
748 hgvs_p: None,
749 codon_change: None,
750 cds_position: None,
751 protein_position: None,
752 exon_distance: None,
753 };
754 }
755
756 let cds_pos_1based = cds_offset + 1; let codon_pos_0based = cds_offset / 3; let codon_pos_1based = codon_pos_0based + 1; let codon_offset = (cds_offset % 3) as usize; let ref_allele = if gene.strand.is_reverse() {
763 reverse_complement(&variant.ref_allele)
764 } else {
765 variant.ref_allele.clone()
766 };
767 let alt_allele = if gene.strand.is_reverse() {
768 reverse_complement(&variant.alt_alleles[0])
769 } else {
770 variant.alt_alleles[0].clone()
771 };
772
773 if ref_allele.len() != alt_allele.len() {
775 let indel_len = (ref_allele.len() as i64 - alt_allele.len() as i64).unsigned_abs() as usize;
776 let consequence = if indel_len % 3 != 0 {
777 Consequence::Frameshift {
778 codon_pos: codon_pos_1based,
779 }
780 } else {
781 Consequence::InFrame {
782 codon_pos: codon_pos_1based,
783 }
784 };
785 return VariantEffect {
786 gene_id: gene.gene_id.clone(),
787 gene_name: gene.gene_name.clone(),
788 transcript_id: tx.transcript_id.clone(),
789 consequence,
790 hgvs_c: Some(format!(
791 "c.{}del",
792 cds_pos_1based,
793 )),
794 hgvs_p: None,
795 codon_change: None,
796 cds_position: Some(cds_pos_1based),
797 protein_position: Some(codon_pos_1based),
798 exon_distance: None,
799 };
800 }
801
802 let ref_codon = build_codon_from_cds(
807 tx,
808 &gene.strand,
809 cds_start,
810 cds_end,
811 codon_pos_0based,
812 &exons_sorted,
813 );
814
815 if ref_codon.len() < 3 {
816 return VariantEffect {
818 gene_id: gene.gene_id.clone(),
819 gene_name: gene.gene_name.clone(),
820 transcript_id: tx.transcript_id.clone(),
821 consequence: Consequence::NonCoding,
822 hgvs_c: None,
823 hgvs_p: None,
824 codon_change: None,
825 cds_position: None,
826 protein_position: None,
827 exon_distance: None,
828 };
829 }
830
831 let mut alt_codon = ref_codon.clone();
833 if ref_allele.len() == 1 {
834 alt_codon[codon_offset] = alt_allele[0].to_ascii_uppercase();
835 }
836
837 let ref_aa = translate_codon(&ref_codon);
838 let alt_aa = translate_codon(&alt_codon);
839
840 let hgvs_c_ref = if gene.strand.is_reverse() {
842 complement(variant.ref_allele[0]).to_ascii_uppercase()
843 } else {
844 variant.ref_allele[0].to_ascii_uppercase()
845 };
846 let hgvs_c_alt = if gene.strand.is_reverse() {
847 complement(variant.alt_alleles[0][0]).to_ascii_uppercase()
848 } else {
849 variant.alt_alleles[0][0].to_ascii_uppercase()
850 };
851 let hgvs_c = Some(format!(
852 "c.{}{}>{}",
853 cds_pos_1based,
854 hgvs_c_ref as char,
855 hgvs_c_alt as char,
856 ));
857
858 let consequence = if ref_aa == alt_aa {
860 Consequence::Synonymous {
861 aa: ref_aa,
862 codon_pos: codon_pos_1based,
863 }
864 } else if codon_pos_0based == 0 && ref_aa == b'M' {
865 Consequence::StartLoss { ref_aa }
866 } else if ref_aa == b'*' && alt_aa != b'*' {
867 Consequence::StopLoss { ref_aa }
868 } else if alt_aa == b'*' {
869 Consequence::Nonsense {
870 ref_aa,
871 codon_pos: codon_pos_1based,
872 }
873 } else {
874 Consequence::Missense {
875 ref_aa,
876 alt_aa,
877 codon_pos: codon_pos_1based,
878 }
879 };
880
881 let hgvs_p = match &consequence {
883 Consequence::Missense {
884 ref_aa, alt_aa, ..
885 } => Some(format!(
886 "p.{}{}{}",
887 aa_three_letter(*ref_aa),
888 codon_pos_1based,
889 aa_three_letter(*alt_aa),
890 )),
891 Consequence::Nonsense { ref_aa, .. } => Some(format!(
892 "p.{}{}{}",
893 aa_three_letter(*ref_aa),
894 codon_pos_1based,
895 aa_three_letter(b'*'),
896 )),
897 Consequence::Synonymous { aa, .. } => Some(format!(
898 "p.{}{}=",
899 aa_three_letter(*aa),
900 codon_pos_1based,
901 )),
902 Consequence::StartLoss { .. } => Some("p.Met1?".to_string()),
903 Consequence::StopLoss { .. } => Some(format!(
904 "p.Ter{}?ext",
905 codon_pos_1based,
906 )),
907 _ => None,
908 };
909
910 let ref_codon_str = String::from_utf8_lossy(&ref_codon).to_string();
911 let alt_codon_str = String::from_utf8_lossy(&alt_codon).to_string();
912
913 VariantEffect {
914 gene_id: gene.gene_id.clone(),
915 gene_name: gene.gene_name.clone(),
916 transcript_id: tx.transcript_id.clone(),
917 consequence,
918 hgvs_c,
919 hgvs_p,
920 codon_change: Some((ref_codon_str, alt_codon_str)),
921 cds_position: Some(cds_pos_1based),
922 protein_position: Some(codon_pos_1based),
923 exon_distance: None,
924 }
925}
926
927fn build_codon_from_cds(
929 _tx: &Transcript,
930 strand: &Strand,
931 cds_start: u64,
932 cds_end: u64,
933 codon_pos_0based: u64,
934 exons_sorted: &[&crate::annotation::Exon],
935) -> Vec<u8> {
936 let codon_start_offset = codon_pos_0based * 3;
938 let mut cds_bases: Vec<u8> = Vec::new();
939 let mut collected = 0u64;
940 let target_end = codon_start_offset + 3;
941
942 for exon in exons_sorted {
943 let exon_cds_start = exon.start.max(cds_start);
944 let exon_cds_end = exon.end.min(cds_end);
945 if exon_cds_start >= exon_cds_end {
946 continue;
947 }
948 let exon_cds_len = exon_cds_end - exon_cds_start;
949
950 if collected + exon_cds_len <= codon_start_offset {
951 collected += exon_cds_len;
952 continue;
953 }
954
955 let skip = if codon_start_offset > collected {
957 codon_start_offset - collected
958 } else {
959 0
960 };
961 let take = (target_end - (collected + skip).max(codon_start_offset))
962 .min(exon_cds_len - skip);
963
964 if strand.is_reverse() {
965 let genomic_start = exon_cds_end - skip - take;
967 let genomic_end = exon_cds_end - skip;
968 for _pos in (genomic_start..genomic_end).rev() {
972 cds_bases.push(b'N');
974 }
975 } else {
976 let genomic_start = exon_cds_start + skip;
977 let genomic_end = genomic_start + take;
978 for _pos in genomic_start..genomic_end {
979 cds_bases.push(b'N');
980 }
981 }
982
983 collected += exon_cds_len;
984 if cds_bases.len() >= 3 {
985 break;
986 }
987 }
988
989 cds_bases.truncate(3);
993 cds_bases
994}
995
996
997#[cfg(test)]
1002mod tests {
1003 use super::*;
1004 use crate::annotation::{Exon, Gene, GeneType, Transcript};
1005 use crate::genomic::Strand;
1006 use crate::variant::Variant;
1007
1008 fn test_gene_forward() -> Gene {
1015 Gene {
1016 gene_id: "GENE001".into(),
1017 gene_name: "TEST1".into(),
1018 chrom: "chr1".into(),
1019 start: 1000,
1020 end: 2000,
1021 strand: Strand::Forward,
1022 gene_type: GeneType::ProteinCoding,
1023 transcripts: vec![Transcript {
1024 transcript_id: "TX001".into(),
1025 start: 1000,
1026 end: 2000,
1027 exons: vec![
1028 Exon {
1029 exon_number: 1,
1030 start: 1000,
1031 end: 1200,
1032 },
1033 Exon {
1034 exon_number: 2,
1035 start: 1500,
1036 end: 1800,
1037 },
1038 ],
1039 cds_start: Some(1050),
1040 cds_end: Some(1750),
1041 }],
1042 }
1043 }
1044
1045 fn test_gene_reverse() -> Gene {
1054 Gene {
1055 gene_id: "GENE002".into(),
1056 gene_name: "TEST2".into(),
1057 chrom: "chr2".into(),
1058 start: 5000,
1059 end: 6000,
1060 strand: Strand::Reverse,
1061 gene_type: GeneType::ProteinCoding,
1062 transcripts: vec![Transcript {
1063 transcript_id: "TX002".into(),
1064 start: 5000,
1065 end: 6000,
1066 exons: vec![
1067 Exon {
1068 exon_number: 1,
1069 start: 5000,
1070 end: 5300,
1071 },
1072 Exon {
1073 exon_number: 2,
1074 start: 5600,
1075 end: 5900,
1076 },
1077 ],
1078 cds_start: Some(5100),
1079 cds_end: Some(5800),
1080 }],
1081 }
1082 }
1083
1084 fn test_gene_simple_cds() -> Gene {
1090 Gene {
1091 gene_id: "GENE003".into(),
1092 gene_name: "SIMPLE".into(),
1093 chrom: "chr1".into(),
1094 start: 100,
1095 end: 400,
1096 strand: Strand::Forward,
1097 gene_type: GeneType::ProteinCoding,
1098 transcripts: vec![Transcript {
1099 transcript_id: "TX003".into(),
1100 start: 100,
1101 end: 400,
1102 exons: vec![Exon {
1103 exon_number: 1,
1104 start: 100,
1105 end: 400,
1106 }],
1107 cds_start: Some(100),
1108 cds_end: Some(400),
1109 }],
1110 }
1111 }
1112
1113 fn annotate_one(variant: &Variant, gene: &Gene) -> Vec<VariantEffect> {
1120 annotate_variant(variant, &[gene.clone()], &AnnotationConfig::default())
1121 }
1122
1123 #[test]
1153 fn test_missense_snv() {
1154 let ref_codon = [b'G', b'T', b'G']; let mut alt_codon = ref_codon;
1158 alt_codon[0] = b'G'; alt_codon[1] = b'A'; alt_codon[2] = b'G'; let ref_aa = translate_codon(&ref_codon);
1164 let alt_aa = translate_codon(&alt_codon);
1165 assert_eq!(ref_aa, b'V'); assert_eq!(alt_aa, b'E'); assert_ne!(ref_aa, alt_aa);
1168
1169 let gene = test_gene_simple_cds();
1171 let v = Variant::new("chr1", 105, vec![b'T'], vec![vec![b'A']]).unwrap();
1178 let effects = annotate_one(&v, &gene);
1179 assert_eq!(effects.len(), 1);
1180 let eff = &effects[0];
1184 assert_eq!(eff.cds_position, Some(5));
1185 assert_eq!(eff.protein_position, Some(2));
1186 }
1187
1188 #[test]
1190 fn test_synonymous_snv() {
1191 let ref_codon = [b'C', b'T', b'A'];
1193 let alt_codon = [b'C', b'T', b'G'];
1194 let ref_aa = translate_codon(&ref_codon);
1195 let alt_aa = translate_codon(&alt_codon);
1196 assert_eq!(ref_aa, b'L');
1197 assert_eq!(alt_aa, b'L');
1198
1199 let codon_pos = 5u64;
1201 let consequence = if ref_aa == alt_aa {
1202 Consequence::Synonymous { aa: ref_aa, codon_pos }
1203 } else {
1204 Consequence::Missense { ref_aa, alt_aa, codon_pos }
1205 };
1206 assert!(matches!(consequence, Consequence::Synonymous { aa: b'L', codon_pos: 5 }));
1207 }
1208
1209 #[test]
1211 fn test_nonsense_stop_gain() {
1212 let ref_codon = [b'C', b'A', b'G'];
1214 let alt_codon = [b'T', b'A', b'G'];
1215 let ref_aa = translate_codon(&ref_codon);
1216 let alt_aa = translate_codon(&alt_codon);
1217 assert_eq!(ref_aa, b'Q');
1218 assert_eq!(alt_aa, b'*');
1219
1220 let consequence = Consequence::Nonsense {
1221 ref_aa,
1222 codon_pos: 10,
1223 };
1224 assert!(matches!(consequence, Consequence::Nonsense { ref_aa: b'Q', codon_pos: 10 }));
1225 }
1226
1227 #[test]
1229 fn test_frameshift_deletion() {
1230 let gene = test_gene_simple_cds();
1231 let v = Variant::new("chr1", 105, vec![b'A', b'T'], vec![vec![b'A']]).unwrap();
1233 let effects = annotate_one(&v, &gene);
1234 assert_eq!(effects.len(), 1);
1235 assert!(matches!(effects[0].consequence, Consequence::Frameshift { .. }));
1236 }
1237
1238 #[test]
1240 fn test_inframe_deletion() {
1241 let gene = test_gene_simple_cds();
1242 let v = Variant::new("chr1", 105, vec![b'A', b'T', b'G', b'C'], vec![vec![b'A']]).unwrap();
1244 let effects = annotate_one(&v, &gene);
1245 assert_eq!(effects.len(), 1);
1246 assert!(matches!(effects[0].consequence, Consequence::InFrame { .. }));
1247 }
1248
1249 #[test]
1251 fn test_start_loss() {
1252 let ref_aa = b'M';
1256 let alt_aa = b'T';
1257 let codon_pos_0based = 0u64;
1258
1259 let consequence = if codon_pos_0based == 0 && ref_aa == b'M' {
1260 Consequence::StartLoss { ref_aa }
1261 } else {
1262 Consequence::Missense { ref_aa, alt_aa, codon_pos: 1 }
1263 };
1264 assert!(matches!(consequence, Consequence::StartLoss { ref_aa: b'M' }));
1265 }
1266
1267 #[test]
1269 fn test_stop_loss() {
1270 let ref_aa = b'*';
1272 let alt_aa = b'Q';
1273 let codon_pos_0based = 100u64;
1274
1275 let consequence = if ref_aa == b'*' && alt_aa != b'*' {
1276 Consequence::StopLoss { ref_aa }
1277 } else {
1278 Consequence::Missense { ref_aa, alt_aa, codon_pos: 101 }
1279 };
1280 assert!(matches!(consequence, Consequence::StopLoss { ref_aa: b'*' }));
1281 }
1282
1283 #[test]
1285 fn test_five_prime_utr() {
1286 let gene = test_gene_forward();
1287 let v = Variant::new("chr1", 1021, vec![b'A'], vec![vec![b'G']]).unwrap();
1290 let effects = annotate_one(&v, &gene);
1291 assert_eq!(effects.len(), 1);
1292 assert!(matches!(effects[0].consequence, Consequence::FivePrimeUtr));
1293 }
1294
1295 #[test]
1297 fn test_three_prime_utr() {
1298 let gene = test_gene_forward();
1299 let v = Variant::new("chr1", 1761, vec![b'A'], vec![vec![b'G']]).unwrap();
1302 let effects = annotate_one(&v, &gene);
1303 assert_eq!(effects.len(), 1);
1304 assert!(matches!(effects[0].consequence, Consequence::ThreePrimeUtr));
1305 }
1306
1307 #[test]
1309 fn test_intronic_variant() {
1310 let gene = test_gene_forward();
1311 let v = Variant::new("chr1", 1351, vec![b'A'], vec![vec![b'G']]).unwrap();
1315 let effects = annotate_one(&v, &gene);
1316 assert_eq!(effects.len(), 1);
1317 assert!(matches!(effects[0].consequence, Consequence::Intronic));
1318 }
1319
1320 #[test]
1322 fn test_splice_donor() {
1323 let gene = test_gene_forward();
1324 let v = Variant::new("chr1", 1201, vec![b'G'], vec![vec![b'A']]).unwrap();
1328 let effects = annotate_one(&v, &gene);
1329 assert_eq!(effects.len(), 1);
1330 assert!(matches!(
1331 effects[0].consequence,
1332 Consequence::SpliceSite { donor: true }
1333 ));
1334 }
1335
1336 #[test]
1338 fn test_splice_acceptor() {
1339 let gene = test_gene_forward();
1340 let v = Variant::new("chr1", 1500, vec![b'A'], vec![vec![b'G']]).unwrap();
1344 let effects = annotate_one(&v, &gene);
1345 assert_eq!(effects.len(), 1);
1346 assert!(matches!(
1347 effects[0].consequence,
1348 Consequence::SpliceSite { donor: false }
1349 ));
1350 }
1351
1352 #[test]
1354 fn test_upstream_variant() {
1355 let gene = test_gene_forward();
1356 let v = Variant::new("chr1", 501, vec![b'A'], vec![vec![b'G']]).unwrap();
1359 let effects = annotate_one(&v, &gene);
1360 assert_eq!(effects.len(), 1);
1361 assert!(matches!(effects[0].consequence, Consequence::Upstream));
1362 }
1363
1364 #[test]
1366 fn test_downstream_variant() {
1367 let gene = test_gene_forward();
1368 let v = Variant::new("chr1", 2501, vec![b'A'], vec![vec![b'G']]).unwrap();
1371 let effects = annotate_one(&v, &gene);
1372 assert_eq!(effects.len(), 1);
1373 assert!(matches!(effects[0].consequence, Consequence::Downstream));
1374 }
1375
1376 #[test]
1378 fn test_reverse_strand_gene() {
1379 let gene = test_gene_reverse();
1380 let v = Variant::new("chr2", 4501, vec![b'A'], vec![vec![b'G']]).unwrap();
1384 let effects = annotate_one(&v, &gene);
1385 assert_eq!(effects.len(), 1);
1386 assert!(matches!(effects[0].consequence, Consequence::Downstream));
1387
1388 let v2 = Variant::new("chr2", 6501, vec![b'A'], vec![vec![b'G']]).unwrap();
1391 let effects2 = annotate_one(&v2, &gene);
1392 assert_eq!(effects2.len(), 1);
1393 assert!(matches!(effects2[0].consequence, Consequence::Upstream));
1394
1395 let v3 = Variant::new("chr2", 5851, vec![b'A'], vec![vec![b'G']]).unwrap();
1398 let effects3 = annotate_one(&v3, &gene);
1399 assert_eq!(effects3.len(), 1);
1400 assert!(matches!(effects3[0].consequence, Consequence::FivePrimeUtr));
1401
1402 let v4 = Variant::new("chr2", 5051, vec![b'A'], vec![vec![b'G']]).unwrap();
1404 let effects4 = annotate_one(&v4, &gene);
1405 assert_eq!(effects4.len(), 1);
1406 assert!(matches!(effects4[0].consequence, Consequence::ThreePrimeUtr));
1407 }
1408
1409 #[test]
1411 fn test_hgvs_coding_notation() {
1412 let gene = test_gene_simple_cds();
1413 let v = Variant::new("chr1", 105, vec![b'T'], vec![vec![b'A']]).unwrap();
1415 let effects = annotate_one(&v, &gene);
1416 assert_eq!(effects.len(), 1);
1417 let hgvs = effects[0].hgvs_c.as_ref().unwrap();
1418 assert_eq!(hgvs, "c.5T>A");
1419 }
1420
1421 #[test]
1423 fn test_hgvs_protein_notation() {
1424 let ref_aa = b'V';
1426 let alt_aa = b'E';
1427 let codon_pos = 600u64;
1428 let hgvs_p = format!(
1429 "p.{}{}{}",
1430 aa_three_letter(ref_aa),
1431 codon_pos,
1432 aa_three_letter(alt_aa),
1433 );
1434 assert_eq!(hgvs_p, "p.Val600Glu");
1435
1436 let hgvs_nonsense = format!(
1438 "p.{}{}{}",
1439 aa_three_letter(b'Q'),
1440 42,
1441 aa_three_letter(b'*'),
1442 );
1443 assert_eq!(hgvs_nonsense, "p.Gln42Ter");
1444
1445 let hgvs_syn = format!("p.{}{}=", aa_three_letter(b'L'), 10);
1447 assert_eq!(hgvs_syn, "p.Leu10=");
1448 }
1449
1450 #[test]
1452 fn test_batch_annotation_interval_tree() {
1453 let gene1 = test_gene_forward(); let gene2 = test_gene_simple_cds(); let genes = vec![gene1, gene2];
1457
1458 let variants = vec![
1459 Variant::new("chr1", 105, vec![b'T'], vec![vec![b'A']]).unwrap(),
1461 Variant::new("chr1", 1021, vec![b'A'], vec![vec![b'G']]).unwrap(),
1463 Variant::new("chr1", 50000, vec![b'A'], vec![vec![b'G']]).unwrap(),
1465 ];
1466
1467 let effects = annotate_variants(&variants, &genes, &AnnotationConfig::default());
1468
1469 let v1_effects: Vec<_> = effects
1471 .iter()
1472 .filter(|e| e.gene_name == "SIMPLE")
1473 .collect();
1474 assert!(!v1_effects.is_empty());
1475
1476 let v2_effects: Vec<_> = effects
1479 .iter()
1480 .filter(|e| e.gene_name == "TEST1")
1481 .collect();
1482 assert!(!v2_effects.is_empty());
1483 assert!(
1484 v2_effects.iter().any(|e| matches!(e.consequence, Consequence::FivePrimeUtr)),
1485 "expected at least one FivePrimeUtr for TEST1"
1486 );
1487
1488 let v3_count = effects
1490 .iter()
1491 .filter(|e| e.gene_name != "SIMPLE" && e.gene_name != "TEST1")
1492 .count();
1493 assert_eq!(v3_count, 0);
1496 }
1497
1498 #[test]
1500 fn test_non_coding_gene() {
1501 let gene = Gene {
1502 gene_id: "GENE_NC".into(),
1503 gene_name: "LINC001".into(),
1504 chrom: "chr1".into(),
1505 start: 1000,
1506 end: 2000,
1507 strand: Strand::Forward,
1508 gene_type: GeneType::LncRNA,
1509 transcripts: vec![Transcript {
1510 transcript_id: "TX_NC".into(),
1511 start: 1000,
1512 end: 2000,
1513 exons: vec![Exon {
1514 exon_number: 1,
1515 start: 1000,
1516 end: 2000,
1517 }],
1518 cds_start: None,
1519 cds_end: None,
1520 }],
1521 };
1522
1523 let v = Variant::new("chr1", 1501, vec![b'A'], vec![vec![b'G']]).unwrap();
1524 let effects = annotate_one(&v, &gene);
1525 assert_eq!(effects.len(), 1);
1526 assert!(matches!(effects[0].consequence, Consequence::NonCoding));
1527 assert_eq!(effects[0].gene_name, "LINC001");
1528 }
1529
1530 #[test]
1532 fn test_codon_change_field() {
1533 let gene = test_gene_simple_cds();
1534 let v = Variant::new("chr1", 105, vec![b'T'], vec![vec![b'A']]).unwrap();
1536 let effects = annotate_one(&v, &gene);
1537 assert_eq!(effects.len(), 1);
1538 let eff = &effects[0];
1539 assert!(eff.codon_change.is_some());
1540 let (ref_c, alt_c) = eff.codon_change.as_ref().unwrap();
1541 assert_eq!(ref_c.len(), 3);
1543 assert_eq!(alt_c.len(), 3);
1544 let diffs: usize = ref_c
1546 .bytes()
1547 .zip(alt_c.bytes())
1548 .filter(|(a, b)| a != b)
1549 .count();
1550 assert_eq!(diffs, 1, "SNV should change exactly one codon position");
1551 }
1552
1553 #[test]
1556 fn test_codon_table_basics() {
1557 assert_eq!(translate_codon(b"ATG"), b'M');
1559 assert_eq!(translate_codon(b"TAA"), b'*');
1561 assert_eq!(translate_codon(b"TAG"), b'*');
1562 assert_eq!(translate_codon(b"TGA"), b'*');
1563 assert_eq!(translate_codon(b"TTT"), b'F');
1565 assert_eq!(translate_codon(b"GGG"), b'G');
1567 assert_eq!(translate_codon(b"AAA"), b'K');
1569 assert_eq!(translate_codon(b"CCC"), b'P');
1571 }
1572
1573 #[test]
1574 fn test_reverse_complement() {
1575 assert_eq!(reverse_complement(b"ATGC"), b"GCAT");
1576 assert_eq!(reverse_complement(b"AAAA"), b"TTTT");
1577 assert_eq!(reverse_complement(b""), b"");
1578 assert_eq!(reverse_complement(b"A"), b"T");
1579 }
1580
1581 #[test]
1582 fn test_aa_three_letter_codes() {
1583 assert_eq!(aa_three_letter(b'M'), "Met");
1584 assert_eq!(aa_three_letter(b'V'), "Val");
1585 assert_eq!(aa_three_letter(b'E'), "Glu");
1586 assert_eq!(aa_three_letter(b'*'), "Ter");
1587 assert_eq!(aa_three_letter(b'X'), "Xaa");
1588 }
1589
1590 #[test]
1591 fn test_annotation_config_default() {
1592 let config = AnnotationConfig::default();
1593 assert_eq!(config.upstream_distance, 5000);
1594 assert_eq!(config.downstream_distance, 5000);
1595 assert_eq!(config.splice_window, 2);
1596 }
1597
1598 #[test]
1599 fn test_splice_scoring() {
1600 let gene = Gene {
1602 gene_id: "SPLICE_GENE".into(),
1603 gene_name: "SPLICE".into(),
1604 chrom: "chr1".into(),
1605 start: 0,
1606 end: 100,
1607 strand: Strand::Forward,
1608 gene_type: GeneType::ProteinCoding,
1609 transcripts: vec![Transcript {
1610 transcript_id: "TX_SPLICE".into(),
1611 start: 0,
1612 end: 100,
1613 exons: vec![
1614 Exon {
1615 exon_number: 1,
1616 start: 10,
1617 end: 30,
1618 },
1619 Exon {
1620 exon_number: 2,
1621 start: 50,
1622 end: 80,
1623 },
1624 ],
1625 cds_start: Some(10),
1626 cds_end: Some(80),
1627 }],
1628 };
1629
1630 let mut ref_seq = vec![b'A'; 100];
1633 ref_seq[30] = b'G'; ref_seq[31] = b'T'; ref_seq[48] = b'A'; ref_seq[49] = b'G'; let v = Variant::new("chr1", 31, vec![b'G'], vec![vec![b'A']]).unwrap();
1640 let scores = score_splice_disruption(&v, &gene, &ref_seq);
1641 assert!(!scores.is_empty());
1642 let donor_score = &scores[0];
1643 assert!(donor_score.is_canonical);
1644 assert!(donor_score.disrupts_consensus);
1645 assert!(donor_score.delta_score < 0.0, "disrupting GT should lower score");
1646 }
1647
1648 #[test]
1649 fn test_variant_different_chrom_no_effect() {
1650 let gene = test_gene_forward(); let v = Variant::new("chr2", 1500, vec![b'A'], vec![vec![b'G']]).unwrap();
1652 let effects = annotate_one(&v, &gene);
1653 assert!(effects.is_empty());
1654 }
1655
1656 #[test]
1657 fn test_variant_far_from_gene_no_effect() {
1658 let gene = test_gene_forward(); let v = Variant::new("chr1", 100001, vec![b'A'], vec![vec![b'G']]).unwrap();
1661 let effects = annotate_one(&v, &gene);
1662 assert!(effects.is_empty());
1663 }
1664
1665 #[test]
1666 fn test_exon_distance_splice_site() {
1667 let gene = test_gene_forward();
1668 let v = Variant::new("chr1", 1201, vec![b'G'], vec![vec![b'A']]).unwrap();
1670 let effects = annotate_one(&v, &gene);
1671 assert_eq!(effects[0].exon_distance, Some(1));
1672
1673 let v2 = Variant::new("chr1", 1500, vec![b'A'], vec![vec![b'G']]).unwrap();
1675 let effects2 = annotate_one(&v2, &gene);
1676 assert_eq!(effects2[0].exon_distance, Some(-1));
1677 }
1678}