bdb/bio/
proteins.rs

//! General-purpose protein routines.
2
/// One-letter aminoacid codes that are considered valid.
///
/// The set is every uppercase ASCII letter except `O` and `U`.
pub static AMINOACIDS: &str = "ABCDEFGHIJKLMNPQRSTVWXYZ";
5
/// Mass calculations for single aminoacids and whole protein sequences.
///
/// Biological applications differ in the isotope model they assume: some
/// work with the average isotopic composition of every element, while
/// others assume that only monoisotopic species are present.
///
/// A monoisotopic mass is built from the high-accuracy mass of the most
/// prevalent (and lowest mass) isotope of each element. An average mass
/// instead multiplies the mass of every isotope of an element by that
/// isotope's abundance and sums the results.
///
/// Implementors provide `water_mass` and `residue_mass`; every other
/// method has a default implementation derived from those two.
pub trait ProteinMass {
    /// Mass of water under this mass model.
    fn water_mass() -> f64;

    /// Mass of the aminoacid residue with the 1-letter code `residue`.
    fn residue_mass(residue: u8) -> f64;

    /// Mass of an aminoacid with its N- and C-termini.
    ///
    /// Computed as the residue mass plus the mass of one water.
    #[inline(always)]
    fn aminoacid_mass(residue: u8) -> f64 {
        let bare = Self::residue_mass(residue);
        bare + Self::water_mass()
    }

    /// Mass of a protein sequence without any terminal groups.
    ///
    /// This is the sum of the residue masses of every byte in `sequence`,
    /// accumulated front to back; an empty sequence yields `0.0`.
    #[inline]
    fn internal_sequence_mass(sequence: &[u8]) -> f64 {
        let mut total = 0.0;
        for &residue in sequence {
            total = total + Self::residue_mass(residue);
        }
        total
    }

    /// Mass of a protein sequence including its terminal groups.
    ///
    /// Computed as the internal sequence mass plus the mass of one water.
    #[inline(always)]
    fn protein_sequence_mass(sequence: &[u8]) -> f64 {
        let internal = Self::internal_sequence_mass(sequence);
        internal + Self::water_mass()
    }
}
41
42// IMPLEMENTATIONS
43// ---------------
44
45/// Calculate protein mass using only high-resolution masses from monoisotopic elements.
46pub struct MonoisotopicMass;
47
48impl ProteinMass for MonoisotopicMass {
49    #[inline(always)]
50    fn water_mass() -> f64 {
51        18.0105646942
52    }
53
54    #[inline]
55    fn residue_mass(residue: u8) -> f64 {
56        match residue {
57            // uppercase
58            b'A' => 71.0371137957,
59            b'C' => 103.0091844957,
60            b'D' => 115.0269430557,
61            b'E' => 129.0425931199,
62            b'F' => 147.0684139241,
63            b'G' => 57.0214637315,
64            b'H' => 137.0589118703,
65            b'I' => 113.0840639883,
66            b'K' => 128.0949630256,
67            b'L' => 113.0840639883,
68            b'M' => 131.0404846241,
69            b'N' => 114.042927463,
70            b'P' => 97.0527638599,
71            b'Q' => 128.0585775272,
72            b'R' => 156.101111036,
73            b'S' => 87.0320284257,
74            b'T' => 101.0476784899,
75            b'U' => 150.9536347957,
76            b'V' => 99.0684139241,
77            b'W' => 186.0793129614,
78            b'Y' => 163.0633285541,
79            // lowercase
80            b'a' => 71.0371137957,
81            b'c' => 103.0091844957,
82            b'd' => 115.0269430557,
83            b'e' => 129.0425931199,
84            b'f' => 147.0684139241,
85            b'g' => 57.0214637315,
86            b'h' => 137.0589118703,
87            b'i' => 113.0840639883,
88            b'k' => 128.0949630256,
89            b'l' => 113.0840639883,
90            b'm' => 131.0404846241,
91            b'n' => 114.042927463,
92            b'p' => 97.0527638599,
93            b'q' => 128.0585775272,
94            b'r' => 156.101111036,
95            b's' => 87.0320284257,
96            b't' => 101.0476784899,
97            b'u' => 150.9536347957,
98            b'v' => 99.0684139241,
99            b'w' => 186.0793129614,
100            b'y' => 163.0633285541,
101            // default
102            _    => 0.0,
103        }
104    }
105}
106
107
108/// Calculate protein mass using only low-resolution masses from average isotopic compositions.
109pub struct AverageMass;
110
111impl ProteinMass for AverageMass {
112    #[inline(always)]
113    fn water_mass() -> f64 {
114        18.015
115    }
116
117    #[inline]
118    fn residue_mass(residue: u8) -> f64 {
119        match residue {
120            // uppercase
121            b'A' => 71.0779,
122            b'C' => 103.1429,
123            b'D' => 115.0874,
124            b'E' => 129.114,
125            b'F' => 147.1739,
126            b'G' => 57.0513,
127            b'H' => 137.1393,
128            b'I' => 113.1576,
129            b'K' => 128.1723,
130            b'L' => 113.1576,
131            b'M' => 131.1961,
132            b'N' => 114.1026,
133            b'P' => 97.1152,
134            b'Q' => 128.1292,
135            b'R' => 156.1857,
136            b'S' => 87.0773,
137            b'T' => 101.1039,
138            b'U' => 150.0379,
139            b'V' => 99.1311,
140            b'W' => 186.2099,
141            b'Y' => 163.1733,
142            // lowercase
143            b'a' => 71.0779,
144            b'c' => 103.1429,
145            b'd' => 115.0874,
146            b'e' => 129.114,
147            b'f' => 147.1739,
148            b'g' => 57.0513,
149            b'h' => 137.1393,
150            b'i' => 113.1576,
151            b'k' => 128.1723,
152            b'l' => 113.1576,
153            b'm' => 131.1961,
154            b'n' => 114.1026,
155            b'p' => 97.1152,
156            b'q' => 128.1292,
157            b'r' => 156.1857,
158            b's' => 87.0773,
159            b't' => 101.1039,
160            b'u' => 150.0379,
161            b'v' => 99.1311,
162            b'w' => 186.2099,
163            b'y' => 163.1733,
164            // default
165            _    => 0.0,
166        }
167    }
168}
169
170// TESTS
171// -----
172
#[cfg(test)]
mod tests {
    use super::*;

    // AMINOACID

    /// Assert that `T` reports identical masses for the uppercase and
    /// lowercase form of every code in `AMINOACIDS`.
    fn assert_case_insensitive<T: ProteinMass>() {
        for upper in AMINOACIDS.bytes() {
            let lower = upper.to_ascii_lowercase();
            assert_eq!(T::residue_mass(upper), T::residue_mass(lower));
            assert_eq!(T::aminoacid_mass(upper), T::aminoacid_mass(lower));
        }
    }

    #[test]
    fn one_letter_mass() {
        // the average and monoisotopic residue masses of every code
        // are expected to agree to within 0.2
        for code in AMINOACIDS.bytes() {
            let average = AverageMass::residue_mass(code);
            let monoisotopic = MonoisotopicMass::residue_mass(code);
            assert_approx_eq!(average, monoisotopic, 0.2);
        }

        assert_case_insensitive::<MonoisotopicMass>();
        assert_case_insensitive::<AverageMass>();
    }

    // SEQUENCE

    /// Assert both sequence masses of `peptide` under mass model `T`,
    /// each to within 0.001 of the expected value.
    fn assert_sequence_mass<T: ProteinMass>(peptide: &[u8], internal: f64, with_termini: f64) {
        assert_approx_eq!(T::internal_sequence_mass(peptide), internal, 0.001);
        assert_approx_eq!(T::protein_sequence_mass(peptide), with_termini, 0.001);
    }

    #[test]
    fn sequence_mass_average() {
        // common sequences double as a check that the individual
        // aminoacid masses hold the correct values
        assert_sequence_mass::<AverageMass>(b"SAMPLER", 784.9238, 802.9388);
        assert_sequence_mass::<AverageMass>(b"TGPNLHGLFGR", 1150.2897, 1168.3047);
        assert_sequence_mass::<AverageMass>(b"ACDEFGHIKLMNPQRSTUVWY", 2527.7364, 2545.7514);
    }

    #[test]
    fn sequence_mass_monoisotopic() {
        // common sequences double as a check that the individual
        // aminoacid masses hold the correct values
        assert_sequence_mass::<MonoisotopicMass>(b"SAMPLER", 784.39016, 802.4007);
        assert_sequence_mass::<MonoisotopicMass>(b"TGPNLHGLFGR", 1149.60433, 1167.61489);
        assert_sequence_mass::<MonoisotopicMass>(
            b"ACDEFGHIKLMNPQRSTUVWY",
            2527.067977,
            2545.0785414,
        );
    }
}