microBioRust_seqmetrics/
metrics.rs

1//!  The purpose of seqmetrics is to allow calculations of protein sequence parameters
2//!  We define a set of amino acid getters to allow getting of the protein sequence
3//!  as either the three letter code such as Trp, Phe, the full name such as alanine, glycine
4//!  or the single letter code such as Y, A
5//!
6//!   Example function to calculate transmembrane statistics
7//!
8//! ```rust
9//!
10//! use clap::Parser;
11//! use std::fs::File;
12//! use microBioRust::gbk::Reader;
13//! use std::io;
14//! use std::collections::HashMap;
15//! use microBioRust_seqmetrics::metrics::hydrophobicity;
16//!
17//! pub fn suggest_transmembrane_domains() -> Result<(), anyhow::Error> {
18//!            let file_gbk = File::open("test_output.gbk")?;
19//!            let mut reader = Reader::new(file_gbk);
20//!            let mut records = reader.records();
21//!            loop {  
22//!                match records.next() {  
23//!                    Some(Ok(mut record)) => {
24//!                       //println!("next record");
25//!                       //println!("Record id: {:?}", record.id);
26//!                       for (k, v) in &record.cds.attributes {
27//!                           match record.seq_features.get_sequence_faa(&k) {
28//!                                     Some(value) => { let seq_faa = value.to_string();
29//!				                      println!("{:?}", &seq_faa);
30//!				                      let hydro_values = hydrophobicity(&seq_faa, 21);
31//!						      let mut result = String::new();
32//!						      for hydro in hydro_values {
33//!						           if hydro > 1.6 {
34//!						               println!("possible transmembrane region - score {}",&hydro);  
35//!							       }
36//!						           else {
37//!						               ()
38//!							   }
39//!						      }
40//!                                                 },
41//!                                     _ => (),
42//!                                     };
43//!                       
44//!                           }
45//!                    },
46//!                    Some(Err(e)) => { println!("theres an err {:?}", e); },
47//!                    None => {
48//!                             println!("finished iteration");
49//!                             break;
50//!                             },
51//!                    };
52//!               }
53//!            return Ok(());
54//!   }
55//!```   
56//!
57//!   Example function to calculate the molecular weight of a protein sequence
58//!
59//!```rust
60//!
61//! use clap::Parser;
62//! use std::fs::File;
63//! use microBioRust::gbk::Reader;
64//! use std::io;
65//! use std::collections::HashMap;
66//! use microBioRust_seqmetrics::metrics::molecular_weight;
67//!
68//! pub fn collect_molecular_weight() -> Result<(), anyhow::Error> {
69//!            let file_gbk = File::open("test_output.gbk")?;
70//!            let mut reader = Reader::new(file_gbk);
71//!            let mut records = reader.records();
72//!	    let mut molecular_weight_total: f64 = 0.0;
73//!            loop {  
74//!                match records.next() {  
75//!                    Some(Ok(mut record)) => {
76//!                       //println!("next record");
77//!                       //println!("Record id: {:?}", record.id);
78//!                      for (k, v) in &record.cds.attributes {
79//!                           match record.seq_features.get_sequence_faa(&k) {
80//!                                     Some(value) => {
81//!                                                     let seq_faa = value.to_string();
82//!				                        println!("id: {:?}", &k);
83//!				                        molecular_weight_total = molecular_weight(&seq_faa);
84//!                                                     println!(">{}|{}\n{}", &record.id, &k, molecular_weight_total);
85//!                                                    },
86//!                                     _ => (),
87//!                                     };
88//!                       
89//!                           }
90//!                    },
91//!                    Some(Err(e)) => { println!("theres an err {:?}", e); },
92//!                    None => {
93//!                              println!("finished iteration");
94//!                              break;
95//!                            },
96//!                    }
97//!               }
98//!            return Ok(());
99//!   }
100//!```
101//!
102//!   Example function to count amino acids of each protein as raw counts, see below to generate percentages per protein
103//!
104//!```rust
105//!
106//! use clap::Parser;
107//! use std::fs::File;
108//! use microBioRust::gbk::Reader;
109//! use std::io;
110//! use std::collections::HashMap;
111//! use microBioRust_seqmetrics::metrics::amino_counts;
112//!
113//! pub fn count_aminos() -> Result<(), anyhow::Error> {
114//!            let file_gbk = File::open("test_output.gbk")?;
115//!            let mut reader = Reader::new(file_gbk);
116//!            let mut records = reader.records();
117//!	    let mut results: HashMap<char, u64> = HashMap::new();
118//!            loop {  
119//!                match records.next() {  
120//!                   Some(Ok(mut record)) => {
121//!                       //println!("next record");
122//!                       //println!("Record id: {:?}", record.id);
123//!                       for (k, v) in &record.cds.attributes {
124//!                           match record.seq_features.get_sequence_faa(&k) {
125//!                                     Some(value) => { let seq_faa = value.to_string();
126//!				                      println!("id: {:?}", &k);
127//!				                      results = amino_counts(&seq_faa);
128//!                                                      println!(">{}|{}\n{:?}", &record.id, &k, results);
129//!                                                      },
130//!                                     _ => (),
131//!                                     };
132//!                       
133//!                           }
134//!                    },
135//!                    Some(Err(e)) => { println!("theres an err {:?}", e); },
136//!                    None => {
137//!                             println!("finished iteration");
138//!                             break;
139//!                             },
140//!                    }
141//!               }
142//!            return Ok(());
143//!   }
144//!```
145//!  Example function to calculate and print out the aromaticity of each protein
146//!
147//!```rust
148//! use clap::Parser;
149//! use std::fs::File;
150//! use microBioRust::gbk::Reader;
151//! use std::io;
152//! use std::collections::HashMap;
153//! use microBioRust_seqmetrics::metrics::amino_percentage;
154//!
155//! pub fn aromaticity() -> Result<(), anyhow::Error> {
156//!        // calculated as in biopython with aromaticity according to Lobry, 1994 as the relative freq of Phe+Trp+Tyr
157//!        let file_gbk = File::open("test_output.gbk")?;
158//!	let mut reader = Reader::new(file_gbk);
159//!	let mut records = reader.records();
160//!	let mut results: HashMap<char, f64> = HashMap::new();
161//!	loop {
162//!	   match records.next() {
163//!	      Some(Ok(mut record)) => {
164//!	          for (k, v) in &record.cds.attributes {
165//!		     match record.seq_features.get_sequence_faa(&k) {
166//!		         Some(value) => {  let seq_faa = value.to_string();
167//!			                   results = amino_percentage(&seq_faa);
168//!					   let aromatic_aas = vec!['Y','W','F'];
169//!					   let aromaticity: f64 = aromatic_aas.iter()
170//!					       .filter_map(|&amino| results.get(&amino))
171//!					       .map(|&perc| perc / 100.0)
172//!					       .sum();
173//!					   println!("aromaticity for {} {} is {}",&record.id, &k, &aromaticity);
174//!					  },
175//!			_ => (),
176//!			};
177//!		   }
178//!	         },
179//!	    Some(Err(e)) => { println!("theres an error {:?}", e); },
180//!	    None => {
181//!                   println!("finished iteration");
182//!	              break;
183//!		    },
184//!	    }
185//!       }
186//!      return Ok(());
187//!   }
188//!```
189
190
191use std::collections::HashMap;
192use std::fs::File;
193use microBioRust::gbk::Reader;
194
195// Define a macro to generate the getters for each amino acid
196macro_rules! amino_acid_getters {
197    ($struct_name:ident, $( ($field:ident, $full_name:ident, $three_letter:ident, $single_letter:ident) ),* ) => {
198        impl $struct_name {
199            $(
200	        // Capital full name getter
201		fn $field(&self) -> f64 {
202		   self.$field
203		}
204                // Full name getter
205                fn $full_name(&self) -> f64 {
206                    self.$field
207                }
208                // Three-letter code getter
209                fn $three_letter(&self) -> f64 {
210                    self.$field
211                }
212                // Single-letter code getter
213                fn $single_letter(&self) -> f64 {
214                    self.$field
215                }
216            )*
217        }
218    };
219}
220
221
222pub struct MolWeights {
223    Alanine: f64,
224    Arginine: f64,
225    Asparagine: f64,
226    Aspartate: f64,
227    Cysteine: f64,
228    Glutamate: f64,
229    Glutamine: f64,
230    Glycine: f64,
231    Histidine: f64,
232    Isoleucine: f64,
233    Leucine: f64,
234    Lysine: f64,
235    Methionine: f64,
236    Phenylalanine: f64,
237    Proline: f64,
238    Serine: f64,
239    Threonine: f64,
240    Tryptophan: f64,
241    Tyrosine: f64,
242    Valine: f64,
243}
244
245impl MolWeights {
246    fn new() -> Self {
247       Self {
248              //masses from NIST chemistry webbook US Dept of commerce
249              Alanine: 89.0932, //C3H7NO2
250              Arginine: 174.2010, //C6H14N4O2
251              Asparagine: 132.1179, //C4H8N2O3
252              Aspartate: 133.1027, //C4H7NO4
253              Cysteine: 121.158, //C3H7NO2S
254              Glutamate: 147.1293, //C5H9NO4
255              Glutamine: 146.1445, //C5H10N2O3
256              Glycine: 75.0666, //C2H5NO2
257              Histidine: 155.1546, //C6H9N3O2
258              Isoleucine: 131.1729, //C6H13NO2
259	      Leucine: 131.1729, //C6H13NO2
260              Lysine: 146.1876, //C6H14N2O2
261              Methionine: 149.211, //C5H11NO2S
262              Phenylalanine: 165.1891, //C9H11NO2
263              Proline: 115.1305, //C5H9NO2
264              Serine: 105.0926, //C3H7NO2
265              Threonine: 119.1192, //C4H9NO3
266              Tryptophan: 204.2252, //C11H12N2O2
267              Tyrosine: 181.1885, //C9H11NO3
268              Valine: 117.1463, //C5H11NO2
269             }
270      }
271}
272
273#[allow(non_snake_case)]
274pub fn molecular_weight(protein_seq: &str) -> f64 {
275    let amino_weights: MolWeights = MolWeights::new();
276    amino_acid_getters!(MolWeights,
277             (Alanine, alanine, Ala, A),
278             (Arginine, arginine, Arg, R),
279             (Asparagine, asparagine, Asn, N),
280             (Aspartate, aspartate, Asp, D),
281             (Cysteine, cysteine, Cys, C),
282             (Glutamine, glutamine, Gln, Q),
283             (Glutamate, glutamate, Glu, E),
284             (Glycine, glycine, Gly, G),
285             (Histidine, histidine, His, H),
286             (Isoleucine, isoleucine, Ile, I),
287             (Leucine, leucine, Leu, L),
288             (Lysine, lysine, Lys, K),
289             (Methionine, methionine, Met, M),
290             (Phenylalanine, phenylalanine, Phe, F),
291             (Proline, proline, Pro, P),
292             (Serine, serine, Ser, S),
293             (Threonine, threonine, Thr, T),
294             (Tryptophan, trytophan, Trp, W),
295             (Tyrosine, tyrosine, Tyr, Y),
296             (Valine, valine, Val, V)
297             );
298    let mut total_weight = 0.0;
299    for ch in protein_seq.chars() {
300       match ch {
301           'A' => total_weight += amino_weights.A(),
302	   'R' => total_weight += amino_weights.R(),
303	   'N' => total_weight += amino_weights.N(),
304	   'D' => total_weight += amino_weights.D(),
305	   'C' => total_weight += amino_weights.C(),
306	   'Q' => total_weight += amino_weights.Q(),
307	   'E' => total_weight += amino_weights.E(),
308	   'G' => total_weight += amino_weights.G(),
309	   'H' => total_weight += amino_weights.H(),
310	   'I' => total_weight += amino_weights.I(),
311	   'L' => total_weight += amino_weights.L(),
312	   'K' => total_weight += amino_weights.K(),
313	   'M' => total_weight += amino_weights.M(),
314	   'F' => total_weight += amino_weights.F(),
315	   'P' => total_weight += amino_weights.P(),
316	   'S' => total_weight += amino_weights.S(),
317	   'T' => total_weight += amino_weights.T(),
318	   'W' => total_weight += amino_weights.W(),
319	   'Y' => total_weight += amino_weights.Y(),
320	   'V' => total_weight += amino_weights.V(),
321	   _ => continue,
322	   }
323      }
324   let result_weight = total_weight - ((protein_seq.len() - 1) as f64 * 18.02);
325   result_weight
326}
327
328pub struct Hydrophobicity {
329    Alanine: f64,
330    Arginine: f64,
331    Asparagine: f64,
332    Aspartate: f64,
333    Cysteine: f64,
334    Glutamate: f64,
335    Glutamine: f64,
336    Glycine: f64,
337    Histidine: f64,
338    Isoleucine: f64,
339    Leucine: f64,
340    Lysine: f64,
341    Methionine: f64,
342    Phenylalanine: f64,
343    Proline: f64,
344    Serine: f64,
345    Threonine: f64,
346    Tryptophan: f64,
347    Tyrosine: f64,
348    Valine: f64,
349}
350
351impl Hydrophobicity {
352    fn new_KD() -> Self {
353       Self {
354              //Kyte-Doolittle values from the Qiagen resources website
355              Alanine: 1.80, 
356              Arginine: -4.50, 
357              Asparagine: -3.50, 
358              Aspartate: -3.50, 
359              Cysteine: 2.50, 
360              Glutamate: -3.50, 
361              Glutamine: -3.50, 
362              Glycine: -0.40, 
363              Histidine: -3.20, 
364              Isoleucine: 4.50, 
365	      Leucine: 3.80, 
366              Lysine: -3.90, 
367              Methionine: 1.90, 
368              Phenylalanine: 2.80, 
369              Proline: -1.60, 
370              Serine: -0.80, 
371              Threonine: -0.70,
372              Tryptophan: -0.90, 
373              Tyrosine: -1.30, 
374              Valine: 4.20, 
375             }
376      }
377}
378
379#[allow(non_snake_case)]
380pub fn hydrophobicity(protein_seq: &str, window_size: usize) -> Vec<f64> {
381    let mut hydrophobicity: Hydrophobicity = Hydrophobicity::new_KD();
382    let mut total_hydrophobicity: Vec<f64> = Vec::new();
383    let mut window_values: f64 = 0.0;
384    amino_acid_getters!(Hydrophobicity,
385             (Alanine, alanine, Ala, A),
386             (Arginine, arginine, Arg, R),
387             (Asparagine, asparagine, Asn, N),
388             (Aspartate, aspartate, Asp, D),
389             (Cysteine, cysteine, Cys, C),
390             (Glutamine, glutamine, Gln, Q),
391             (Glutamate, glutamate, Glu, E),
392             (Glycine, glycine, Gly, G),
393             (Histidine, histidine, His, H),
394             (Isoleucine, isoleucine, Ile, I),
395             (Leucine, leucine, Leu, L),
396             (Lysine, lysine, Lys, K),
397             (Methionine, methionine, Met, M),
398             (Phenylalanine, phenylalanine, Phe, F),
399             (Proline, proline, Pro, P),
400             (Serine, serine, Ser, S),
401             (Threonine, threonine, Thr, T),
402             (Tryptophan, trytophan, Trp, W),
403             (Tyrosine, tyrosine, Tyr, Y),
404             (Valine, valine, Val, V)
405             );
406    let mut windows: Vec<String> = protein_seq
407           .chars()
408	   .collect::<Vec<_>>()
409	   .windows(window_size)
410	   .map(|window| window.iter().collect())
411	   .collect();
412    for (index, window) in windows.iter().enumerate() {
413       for ch in window.chars() {
414           match ch {
415               'A' => window_values += hydrophobicity.A(),
416	       'R' => window_values += hydrophobicity.R(),
417	       'N' => window_values += hydrophobicity.N(),
418	       'D' => window_values += hydrophobicity.D(),
419	       'C' => window_values += hydrophobicity.C(),
420	       'Q' => window_values += hydrophobicity.Q(),
421	       'E' => window_values += hydrophobicity.E(),
422	       'G' => window_values += hydrophobicity.G(),
423	       'H' => window_values += hydrophobicity.H(),
424	       'I' => window_values += hydrophobicity.I(),
425	       'L' => window_values += hydrophobicity.L(),
426	       'K' => window_values += hydrophobicity.K(),
427	       'M' => window_values += hydrophobicity.M(),
428	       'F' => window_values += hydrophobicity.F(),
429	       'P' => window_values += hydrophobicity.P(),
430	       'S' => window_values += hydrophobicity.S(),
431	       'T' => window_values += hydrophobicity.T(),
432	       'W' => window_values += hydrophobicity.W(),
433	       'Y' => window_values += hydrophobicity.Y(),
434	       'V' => window_values += hydrophobicity.V(),
435	        _ => continue,
436	        }
437          }
438         total_hydrophobicity.push(window_values);
439      }
440   total_hydrophobicity
441}
442
443
444pub fn amino_counts(protein_seq: &str) -> HashMap<char, u64> {
445    let mut counts: HashMap<char, u64> = HashMap::new();
446    for c in protein_seq.chars() {
447       *counts.entry(c).or_insert(0) +=1;
448       }
449    counts
450}
451
452pub fn amino_percentage(protein_seq: &str) -> HashMap<char, f64> {
453    let mut percentages: HashMap<char, f64> = HashMap::new();
454    let counts = amino_counts(protein_seq);
455    let seq_len: f64 = (protein_seq.len() as f64) as f64;
456    percentages = counts.iter().map(|(k, &value)| {
457        let percentage = (value as f64 / seq_len) * 100.0;
458	(k.clone(), percentage)
459	}).collect();
460    percentages
461}
462       
463
464    
465#[cfg(test)]
466mod tests {
467    use super::*;
468
469   #[test]
470   pub fn suggest_transmembrane_domains() -> Result<(), anyhow::Error> {
471            let file_gbk = File::open("test_output.gbk")?;
472            let mut reader = Reader::new(file_gbk);
473            let mut records = reader.records();
474            loop {  
475                match records.next() {  
476                    Some(Ok(mut record)) => {
477                       //println!("next record");
478                       //println!("Record id: {:?}", record.id);
479                       for (k, v) in &record.cds.attributes {
480                           match record.seq_features.get_sequence_faa(&k) {
481                                     Some(value) => { let seq_faa = value.to_string();
482				                      println!("{:?}", &seq_faa);
483				                      let hydro_values = hydrophobicity(&seq_faa, 21);
484						      let mut result = String::new();
485						      for hydro in hydro_values {
486						           if hydro > 1.6 {
487						               println!("possible transmembrane region - score {}",&hydro);  
488							       }
489						           else {
490						               ()
491							   }
492						      }
493                                                  },
494                                     _ => (),
495                                     };
496                       
497                           }
498                    },
499                    Some(Err(e)) => { println!("theres an err {:?}", e); },
500                    None => {
501                       println!("finished iteration");
502                             break; },
503                    }
504               }
505            return Ok(());
506   }
507   
508   #[test]
509   pub fn collect_molecular_weight() -> Result<(), anyhow::Error> {
510            let file_gbk = File::open("test_output.gbk")?;
511            let mut reader = Reader::new(file_gbk);
512            let mut records = reader.records();
513	    let mut molecular_weight_total: f64 = 0.0;
514            loop {  
515                match records.next() {  
516                    Some(Ok(mut record)) => {
517                       //println!("next record");
518                       //println!("Record id: {:?}", record.id);
519                       for (k, v) in &record.cds.attributes {
520                           match record.seq_features.get_sequence_faa(&k) {
521                                     Some(value) => { let seq_faa = value.to_string();
522				                      println!("id: {:?}", &k);
523				                      molecular_weight_total = molecular_weight(&seq_faa);
524                                                      println!(">{}|{}\n{}", &record.id, &k, molecular_weight_total);
525                                                      },
526                                     _ => (),
527                                     };
528                       
529                           }
530                    },
531                    Some(Err(e)) => { println!("theres an err {:?}", e); },
532                    None => {
533                       println!("finished iteration");
534                             break; },
535                    }
536               }
537            return Ok(());
538   }
539
540   #[test]
541   pub fn count_aminos() -> Result<(), anyhow::Error> {
542            let file_gbk = File::open("test_output.gbk")?;
543            let mut reader = Reader::new(file_gbk);
544            let mut records = reader.records();
545	    let mut results: HashMap<char, u64> = HashMap::new();
546            loop {  
547                match records.next() {  
548                    Some(Ok(mut record)) => {
549                       //println!("next record");
550                       //println!("Record id: {:?}", record.id);
551                       for (k, v) in &record.cds.attributes {
552                           match record.seq_features.get_sequence_faa(&k) {
553                                     Some(value) => { let seq_faa = value.to_string();
554				                      println!("id: {:?}", &k);
555				                      results = amino_counts(&seq_faa);
556                                                      println!(">{}|{}\n{:?}", &record.id, &k, results);
557                                                      },
558                                     _ => (),
559                                     };
560                       
561                           }
562                    },
563                    Some(Err(e)) => { println!("theres an err {:?}", e); },
564                    None => {
565                       println!("finished iteration");
566                             break; },
567                    }
568               }
569            return Ok(());
570   }
571   
572   #[test]
573   pub fn aromaticity() -> Result<(), anyhow::Error> {
574        // calculated as in biopython with aromaticity according to Lobry, 1994 as the relative freq of Phe+Trp+Tyr
575        let file_gbk = File::open("test_output.gbk")?;
576	let mut reader = Reader::new(file_gbk);
577	let mut records = reader.records();
578	let mut results: HashMap<char, f64> = HashMap::new();
579	loop {
580	   match records.next() {
581	      Some(Ok(mut record)) => {
582	          for (k, v) in &record.cds.attributes {
583		     match record.seq_features.get_sequence_faa(&k) {
584		         Some(value) => {  let seq_faa = value.to_string();
585			                   results = amino_percentage(&seq_faa);
586					   let aromatic_aas = vec!['Y','W','F'];
587					   let aromaticity: f64 = aromatic_aas.iter()
588					       .filter_map(|&amino| results.get(&amino))
589					       .map(|&perc| perc / 100.0)
590					       .sum();
591					   println!("aromaticity for {} {} is {}",&record.id, &k, &aromaticity);
592					  },
593			_ => (),
594			};
595		   }
596	         },
597	    Some(Err(e)) => { println!("theres an error {:?}", e); },
598	    None => { println!("finished iteration");
599	              break;
600		    },
601	    }
602       }
603      return Ok(());
604   }
605}