/*!
Translate a DNA or RNA sequence into protein.

The output uses the standard one-letter amino acid codes, plus two markers:

* `X` = invalid amino acid (the codon contains a byte that is not one of
  `A`, `C`, `G`, `T`, `U` in either case)
* `*` = stop codon

You can find the rest of the table
[elsewhere](https://en.wikipedia.org/wiki/DNA_codon_table).
*/

#![deny(missing_docs)]

/// Translate a DNA or RNA sequence into a peptide.
///
/// The input is read three bytes (one codon) at a time, starting at the first
/// byte. Each codon yields exactly one character in the returned `String`:
///
/// * the one-letter amino acid code from the canonical genetic code,
/// * `*` for a stop codon, or
/// * `X` if any byte of the codon is not `A`, `C`, `G`, `T` or `U`
///   (upper or lower case), including non-ASCII bytes.
///
/// `T` and `U` are interchangeable, so DNA and RNA are handled alike.
/// Translation does not stop at a stop codon. A trailing incomplete codon
/// (one or two leftover bytes) is ignored.
///
/// # Examples
///
/// ```
/// use protein_translate::translate;
///
/// # fn main() { dna_example(); rna_example(); shift_reading_frame(); }
/// fn dna_example() {
///
///     let dna = b"GCTAGTCGTATCGTAGCTAGTC";
///     let peptide = translate(dna);
///     assert_eq!(&peptide, "ASRIVAS");
///
/// }
///
/// fn rna_example() {
///
///     let rna = b"GCUAGUCGUAUCGUAGCUAGUC";
///     let peptide = translate(rna);
///     assert_eq!(&peptide, "ASRIVAS");
/// }
///
/// fn shift_reading_frame() {
///
///     // To shift the reading frame, pass in a slice
///     // skipping the first 1-2 nucleotides.
///
///     let dna = b"GCTAGTCGTATCGTAGCTAGTC";
///     let peptide_frame2 = translate(&dna[1..]);
///     assert_eq!(&peptide_frame2, "LVVS*LV");
///
///     let peptide_frame3 = translate(&dna[2..]);
///     assert_eq!(&peptide_frame3, "*SYRS*");
///
/// }
/// ```
pub fn translate(seq: &[u8]) -> String {
    // Every complete codon produces exactly one single-byte character.
    let mut peptide = String::with_capacity(seq.len() / 3);
    peptide.extend(seq.chunks_exact(3).map(translate_codon));
    peptide
}

/// Translates one codon into its one-letter code.
///
/// Returns `'X'` as soon as a byte is found that is not a recognised
/// nucleotide. Only the first three bytes of `codon` are consulted.
fn translate_codon(codon: &[u8]) -> char {
    let mut indices = [0_usize; 3];
    for (slot, &base) in indices.iter_mut().zip(codon) {
        // `get` rejects non-ASCII bytes (>= 128, outside the table); the
        // sentinel rejects ASCII bytes that are not A/C/G/T/U.
        match ASCII_TO_INDEX.get(usize::from(base)) {
            Some(&index) if index != INVALID_BASE => *slot = index,
            _ => return 'X',
        }
    }

    let [first, second, third] = indices;
    AA_TABLE_CANONICAL[first][second][third]
}

/// The canonical genetic code, see
/// <https://www.ncbi.nlm.nih.gov/Taxonomy/Utils/wprintgc.cgi>.
/// U is equivalent to T here.
///
/// Nucleotides are indexed in the order A = 0, C = 1, G = 2, T/U = 3.
///
/// * The 1st index (first base) picks the 4x4 block
/// * The 2nd index (second base) picks the row
/// * The 3rd index (third base) picks the column
static AA_TABLE_CANONICAL: [[[char; 4]; 4]; 4] = [
    [
        ['K', 'N', 'K', 'N'], // AAA, AAC, AAG, AAU/AAT
        ['T', 'T', 'T', 'T'], // ACA, ACC, ACG, ACU/ACT
        ['R', 'S', 'R', 'S'], // AGA, AGC, AGG, AGU/AGT
        ['I', 'I', 'M', 'I'], // AUA/ATA, AUC/ATC, AUG/ATG, AUU/ATT
    ],
    [
        ['Q', 'H', 'Q', 'H'], // CAA, CAC, CAG, CAU/CAT
        ['P', 'P', 'P', 'P'], // CCA, CCC, CCG, CCU/CCT
        ['R', 'R', 'R', 'R'], // CGA, CGC, CGG, CGU/CGT
        ['L', 'L', 'L', 'L'], // CUA/CTA, CUC/CTC, CUG/CTG, CUU/CTT
    ],
    [
        ['E', 'D', 'E', 'D'], // GAA, GAC, GAG, GAU/GAT
        ['A', 'A', 'A', 'A'], // GCA, GCC, GCG, GCU/GCT
        ['G', 'G', 'G', 'G'], // GGA, GGC, GGG, GGU/GGT
        ['V', 'V', 'V', 'V'], // GUA/GTA, GUC/GTC, GUG/GTG, GUU/GTT
    ],
    [
        ['*', 'Y', '*', 'Y'], // UAA/TAA, UAC/TAC, UAG/TAG, UAU/TAT
        ['S', 'S', 'S', 'S'], // UCA/TCA, UCC/TCC, UCG/TCG, UCU/TCT
        ['*', 'C', 'W', 'C'], // UGA/TGA, UGC/TGC, UGG/TGG, UGU/TGT
        ['L', 'F', 'L', 'F'], // UUA/TTA, UUC/TTC, UUG/TTG, UUU/TTT
    ],
];

/// Value stored in [`ASCII_TO_INDEX`] for every byte that is not a nucleotide.
const INVALID_BASE: usize = 4;

/// Maps an ASCII character to its index into [`AA_TABLE_CANONICAL`].
///
/// A = 65, a = 97  => 0
/// C = 67, c = 99  => 1
/// G = 71, g = 103 => 2
/// T = 84, t = 116 => 3
/// U = 85, u = 117 => 3
///
/// Every other ASCII byte maps to 4, which marks it as invalid.
static ASCII_TO_INDEX: [usize; 128] = [
    4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, // 0-15
    4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, // 16-31
    4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, // 32-47
    4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, // 48-63
    4, 0, 4, 1, 4, 4, 4, 2, 4, 4, 4, 4, 4, 4, 4, 4, // 64-79   (65 = A, 67 = C, 71 = G)
    4, 4, 4, 4, 3, 3, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, // 80-95   (84 = T, 85 = U)
    4, 0, 4, 1, 4, 4, 4, 2, 4, 4, 4, 4, 4, 4, 4, 4, // 96-111  (97 = a, 99 = c, 103 = g)
    4, 4, 4, 4, 3, 3, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, // 112-127 (116 = t, 117 = u)
];