Skip to main content

argenus/
snp.rs

1
2use rustc_hash::FxHashMap;
3use std::sync::LazyLock;
4
5#[derive(Debug, Clone, PartialEq)]
6pub enum SnpStatus {
7
8    NotApplicable,
9
10    Confirmed,
11
12    WildType,
13
14    NovelVariant(char),
15
16    NotCovered,
17
18    Unverified(String),
19}
20
21impl std::fmt::Display for SnpStatus {
22    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
23        match self {
24            SnpStatus::NotApplicable => write!(f, "Acquired"),
25            SnpStatus::Confirmed => write!(f, "Confirmed"),
26            SnpStatus::WildType => write!(f, "WildType"),
27            SnpStatus::NovelVariant(aa) => write!(f, "Novel({})", aa),
28            SnpStatus::NotCovered => write!(f, "NotCovered"),
29            SnpStatus::Unverified(reason) => write!(f, "Unverified({})", reason),
30        }
31    }
32}
33
34#[derive(Debug, Clone)]
35pub struct SnpInfo {
36
37    #[allow(dead_code)]
38    pub gene: String,
39
40    pub wildtype: char,
41
42    pub position: usize,
43
44    pub mutant: char,
45}
46
47impl SnpInfo {
48
49    pub fn nucleotide_range(&self) -> (usize, usize) {
50        let start = (self.position - 1) * 3;
51        let end = start + 3;
52        (start, end)
53    }
54}
55
56static CODON_TABLE: LazyLock<FxHashMap<&'static str, char>> = LazyLock::new(|| {
57    let mut table = FxHashMap::default();
58
59    table.insert("TTT", 'F'); table.insert("TTC", 'F');
60
61    table.insert("TTA", 'L'); table.insert("TTG", 'L');
62    table.insert("CTT", 'L'); table.insert("CTC", 'L');
63    table.insert("CTA", 'L'); table.insert("CTG", 'L');
64
65    table.insert("ATT", 'I'); table.insert("ATC", 'I'); table.insert("ATA", 'I');
66
67    table.insert("ATG", 'M');
68
69    table.insert("GTT", 'V'); table.insert("GTC", 'V');
70    table.insert("GTA", 'V'); table.insert("GTG", 'V');
71
72    table.insert("TCT", 'S'); table.insert("TCC", 'S');
73    table.insert("TCA", 'S'); table.insert("TCG", 'S');
74    table.insert("AGT", 'S'); table.insert("AGC", 'S');
75
76    table.insert("CCT", 'P'); table.insert("CCC", 'P');
77    table.insert("CCA", 'P'); table.insert("CCG", 'P');
78
79    table.insert("ACT", 'T'); table.insert("ACC", 'T');
80    table.insert("ACA", 'T'); table.insert("ACG", 'T');
81
82    table.insert("GCT", 'A'); table.insert("GCC", 'A');
83    table.insert("GCA", 'A'); table.insert("GCG", 'A');
84
85    table.insert("TAT", 'Y'); table.insert("TAC", 'Y');
86
87    table.insert("TAA", '*'); table.insert("TAG", '*'); table.insert("TGA", '*');
88
89    table.insert("CAT", 'H'); table.insert("CAC", 'H');
90
91    table.insert("CAA", 'Q'); table.insert("CAG", 'Q');
92
93    table.insert("AAT", 'N'); table.insert("AAC", 'N');
94
95    table.insert("AAA", 'K'); table.insert("AAG", 'K');
96
97    table.insert("GAT", 'D'); table.insert("GAC", 'D');
98
99    table.insert("GAA", 'E'); table.insert("GAG", 'E');
100
101    table.insert("TGT", 'C'); table.insert("TGC", 'C');
102
103    table.insert("TGG", 'W');
104
105    table.insert("CGT", 'R'); table.insert("CGC", 'R');
106    table.insert("CGA", 'R'); table.insert("CGG", 'R');
107    table.insert("AGA", 'R'); table.insert("AGG", 'R');
108
109    table.insert("GGT", 'G'); table.insert("GGC", 'G');
110    table.insert("GGA", 'G'); table.insert("GGG", 'G');
111    table
112});
113
114pub fn translate_codon(codon: &str) -> Option<char> {
115    let upper = codon.to_uppercase();
116    CODON_TABLE.get(upper.as_str()).copied()
117}
118
119#[allow(dead_code)]
120pub fn is_snp_gene(gene_name: &str) -> bool {
121    parse_snp_info(gene_name).is_some()
122}
123
124pub fn parse_snp_info(gene_name: &str) -> Option<SnpInfo> {
125
126    let name = gene_name.split('|').next().unwrap_or(gene_name);
127
128    let parts: Vec<&str> = name.rsplitn(2, '_').collect();
129    if parts.len() != 2 {
130        return None;
131    }
132
133    let snp_part = parts[0];
134    let gene = parts[1];
135
136    if snp_part.len() < 3 {
137        return None;
138    }
139
140    let chars: Vec<char> = snp_part.chars().collect();
141
142    let wildtype = chars[0].to_ascii_uppercase();
143    if !is_amino_acid(wildtype) {
144        return None;
145    }
146
147    let mutant = chars[chars.len() - 1].to_ascii_uppercase();
148    if !is_amino_acid(mutant) {
149        return None;
150    }
151
152    let pos_str: String = chars[1..chars.len()-1].iter().collect();
153    let position: usize = pos_str.parse().ok()?;
154
155    if position == 0 {
156        return None;
157    }
158
159    Some(SnpInfo {
160        gene: gene.to_string(),
161        wildtype,
162        position,
163        mutant,
164    })
165}
166
167fn is_amino_acid(c: char) -> bool {
168    matches!(c, 'A' | 'C' | 'D' | 'E' | 'F' | 'G' | 'H' | 'I' | 'K' | 'L' |
169                'M' | 'N' | 'P' | 'Q' | 'R' | 'S' | 'T' | 'V' | 'W' | 'Y')
170}
171
172pub fn verify_snp(
173    contig_seq: &str,
174    gene_name: &str,
175    ref_start: usize,
176    ref_end: usize,
177    contig_start: usize,
178    contig_end: usize,
179    strand: char,
180) -> SnpStatus {
181
182    let snp_info = match parse_snp_info(gene_name) {
183        Some(info) => info,
184        None => return SnpStatus::NotApplicable,
185    };
186
187    let (snp_nt_start, snp_nt_end) = snp_info.nucleotide_range();
188
189    if snp_nt_start < ref_start || snp_nt_end > ref_end {
190        return SnpStatus::NotCovered;
191    }
192
193    let offset_in_ref = snp_nt_start - ref_start;
194    let contig_snp_start = if strand == '+' {
195        contig_start + offset_in_ref
196    } else {
197
198        contig_end - offset_in_ref - 3
199    };
200    let contig_snp_end = contig_snp_start + 3;
201
202    if contig_snp_end > contig_seq.len() {
203        return SnpStatus::NotCovered;
204    }
205
206    let codon_seq = &contig_seq[contig_snp_start..contig_snp_end];
207
208    let codon = if strand == '-' {
209        reverse_complement(codon_seq)
210    } else {
211        codon_seq.to_uppercase()
212    };
213
214    let amino_acid = match translate_codon(&codon) {
215        Some(aa) => aa,
216        None => return SnpStatus::Unverified(format!("invalid_codon:{}", codon)),
217    };
218
219    if amino_acid == snp_info.mutant {
220        SnpStatus::Confirmed
221    } else if amino_acid == snp_info.wildtype {
222        SnpStatus::WildType
223    } else {
224        SnpStatus::NovelVariant(amino_acid)
225    }
226}
227
228fn reverse_complement(seq: &str) -> String {
229    seq.chars()
230        .rev()
231        .map(|c| match c.to_ascii_uppercase() {
232            'A' => 'T',
233            'T' => 'A',
234            'G' => 'C',
235            'C' => 'G',
236            _ => 'N',
237        })
238        .collect()
239}
240
241#[cfg(test)]
242mod tests {
243    use super::*;
244
245    #[test]
246    fn test_parse_snp_info() {
247
248        let info = parse_snp_info("rpoB_V146F").unwrap();
249        assert_eq!(info.gene, "rpoB");
250        assert_eq!(info.wildtype, 'V');
251        assert_eq!(info.position, 146);
252        assert_eq!(info.mutant, 'F');
253
254        let info = parse_snp_info("gyrA_S83L|QUINOLONE|QUINOLONE|J01M").unwrap();
255        assert_eq!(info.gene, "gyrA");
256        assert_eq!(info.wildtype, 'S');
257        assert_eq!(info.position, 83);
258        assert_eq!(info.mutant, 'L');
259
260        let info = parse_snp_info("test_A1G").unwrap();
261        assert_eq!(info.position, 1);
262
263        assert!(parse_snp_info("blaTEM-1").is_none());
264        assert!(parse_snp_info("tet(M)").is_none());
265    }
266
267    #[test]
268    fn test_is_snp_gene() {
269        assert!(is_snp_gene("rpoB_V146F"));
270        assert!(is_snp_gene("gyrA_S83L|QUINOLONE"));
271        assert!(!is_snp_gene("blaTEM-1"));
272        assert!(!is_snp_gene("tet(M)"));
273    }
274
275    #[test]
276    fn test_translate_codon() {
277        assert_eq!(translate_codon("ATG"), Some('M'));
278        assert_eq!(translate_codon("TTT"), Some('F'));
279        assert_eq!(translate_codon("TTC"), Some('F'));
280        assert_eq!(translate_codon("GTT"), Some('V'));
281        assert_eq!(translate_codon("TAA"), Some('*'));
282        assert_eq!(translate_codon("XXX"), None);
283    }
284
285    #[test]
286    fn test_nucleotide_range() {
287        let info = SnpInfo {
288            gene: "test".to_string(),
289            wildtype: 'V',
290            position: 146,
291            mutant: 'F',
292        };
293
294        assert_eq!(info.nucleotide_range(), (435, 438));
295
296        let info2 = SnpInfo {
297            gene: "test".to_string(),
298            wildtype: 'A',
299            position: 1,
300            mutant: 'B',
301        };
302        assert_eq!(info2.nucleotide_range(), (0, 3));
303    }
304
305    #[test]
306    fn test_verify_snp() {
307
308        let contig = "ATGTTTGGG";
309        let status = verify_snp(
310            contig,
311            "test_V2F",
312            0, 9,
313            0, 9,
314            '+',
315        );
316        assert_eq!(status, SnpStatus::Confirmed);
317
318        let contig = "ATGGTTGGG";
319        let status = verify_snp(
320            contig,
321            "test_V2F",
322            0, 9,
323            0, 9,
324            '+',
325        );
326        assert_eq!(status, SnpStatus::WildType);
327
328        let contig = "ATGCTTGGG";
329        let status = verify_snp(
330            contig,
331            "test_V2F",
332            0, 9,
333            0, 9,
334            '+',
335        );
336        assert_eq!(status, SnpStatus::NovelVariant('L'));
337    }
338
339    #[test]
340    fn test_snp_status_display() {
341        assert_eq!(format!("{}", SnpStatus::Confirmed), "Confirmed");
342        assert_eq!(format!("{}", SnpStatus::WildType), "WildType");
343        assert_eq!(format!("{}", SnpStatus::NovelVariant('L')), "Novel(L)");
344        assert_eq!(format!("{}", SnpStatus::NotApplicable), "Acquired");
345    }
346}