microBioRust_seqmetrics/metrics.rs
1//! The purpose of seqmetrics is to allow calculations of protein sequence parameters
2//! We define a set of amino acid getters to allow getting of the protein sequence
3//! as either the three letter code such as Trp, Phe, the full name such as alanine, glycine
4//! or the single letter code such as Y, A
5//!
6//! Example function to calculate transmembrane statistics
7//!
8//! ```rust
9//!
10//! use clap::Parser;
11//! use std::fs::File;
12//! use microBioRust::gbk::Reader;
13//! use std::io;
14//! use std::collections::HashMap;
15//! use microBioRust_seqmetrics::metrics::hydrophobicity;
16//!
17//! pub fn suggest_transmembrane_domains() -> Result<(), anyhow::Error> {
18//! let file_gbk = File::open("test_output.gbk")?;
19//! let mut reader = Reader::new(file_gbk);
20//! let mut records = reader.records();
21//! loop {
22//! match records.next() {
23//! Some(Ok(mut record)) => {
24//! //println!("next record");
25//! //println!("Record id: {:?}", record.id);
26//! for (k, v) in &record.cds.attributes {
27//! match record.seq_features.get_sequence_faa(&k) {
28//! Some(value) => { let seq_faa = value.to_string();
29//! println!("{:?}", &seq_faa);
30//! let hydro_values = hydrophobicity(&seq_faa, 21);
31//! let mut result = String::new();
32//! for hydro in hydro_values {
33//! if hydro > 1.6 {
34//! println!("possible transmembrane region - score {}",&hydro);
35//! }
36//! else {
37//! ()
38//! }
39//! }
40//! },
41//! _ => (),
42//! };
43//!
44//! }
45//! },
46//! Some(Err(e)) => { println!("theres an err {:?}", e); },
47//! None => {
48//! println!("finished iteration");
49//! break;
50//! },
51//! };
52//! }
53//! return Ok(());
54//! }
55//!```
56//!
57//! Example function to calculate the molecular weight of a protein sequence
58//!
59//!```rust
60//!
61//! use clap::Parser;
62//! use std::fs::File;
63//! use microBioRust::gbk::Reader;
64//! use std::io;
65//! use std::collections::HashMap;
66//! use microBioRust_seqmetrics::metrics::molecular_weight;
67//!
68//! pub fn collect_molecular_weight() -> Result<(), anyhow::Error> {
69//! let file_gbk = File::open("test_output.gbk")?;
70//! let mut reader = Reader::new(file_gbk);
71//! let mut records = reader.records();
72//! let mut molecular_weight_total: f64 = 0.0;
73//! loop {
74//! match records.next() {
75//! Some(Ok(mut record)) => {
76//! //println!("next record");
77//! //println!("Record id: {:?}", record.id);
78//! for (k, v) in &record.cds.attributes {
79//! match record.seq_features.get_sequence_faa(&k) {
80//! Some(value) => {
81//! let seq_faa = value.to_string();
82//! println!("id: {:?}", &k);
83//! molecular_weight_total = molecular_weight(&seq_faa);
84//! println!(">{}|{}\n{}", &record.id, &k, molecular_weight_total);
85//! },
86//! _ => (),
87//! };
88//!
89//! }
90//! },
91//! Some(Err(e)) => { println!("theres an err {:?}", e); },
92//! None => {
93//! println!("finished iteration");
94//! break;
95//! },
96//! }
97//! }
98//! return Ok(());
99//! }
100//!```
101//!
102//! Example function to count amino acids of each protein as raw counts, see below to generate percentages per protein
103//!
104//!```rust
105//!
106//! use clap::Parser;
107//! use std::fs::File;
108//! use microBioRust::gbk::Reader;
109//! use std::io;
110//! use std::collections::HashMap;
111//! use microBioRust_seqmetrics::metrics::amino_counts;
112//!
113//! pub fn count_aminos() -> Result<(), anyhow::Error> {
114//! let file_gbk = File::open("test_output.gbk")?;
115//! let mut reader = Reader::new(file_gbk);
116//! let mut records = reader.records();
117//! let mut results: HashMap<char, u64> = HashMap::new();
118//! loop {
119//! match records.next() {
120//! Some(Ok(mut record)) => {
121//! //println!("next record");
122//! //println!("Record id: {:?}", record.id);
123//! for (k, v) in &record.cds.attributes {
124//! match record.seq_features.get_sequence_faa(&k) {
125//! Some(value) => { let seq_faa = value.to_string();
126//! println!("id: {:?}", &k);
127//! results = amino_counts(&seq_faa);
128//! println!(">{}|{}\n{:?}", &record.id, &k, results);
129//! },
130//! _ => (),
131//! };
132//!
133//! }
134//! },
135//! Some(Err(e)) => { println!("theres an err {:?}", e); },
136//! None => {
137//! println!("finished iteration");
138//! break;
139//! },
140//! }
141//! }
142//! return Ok(());
143//! }
144//!```
145//! Example function to calculate and print out the aromaticity of each protein
146//!
147//!```rust
148//! use clap::Parser;
149//! use std::fs::File;
150//! use microBioRust::gbk::Reader;
151//! use std::io;
152//! use std::collections::HashMap;
153//! use microBioRust_seqmetrics::metrics::amino_percentage;
154//!
155//! pub fn aromaticity() -> Result<(), anyhow::Error> {
156//! // calculated as in biopython with aromaticity according to Lobry, 1994 as the relative freq of Phe+Trp+Tyr
157//! let file_gbk = File::open("test_output.gbk")?;
158//! let mut reader = Reader::new(file_gbk);
159//! let mut records = reader.records();
160//! let mut results: HashMap<char, f64> = HashMap::new();
161//! loop {
162//! match records.next() {
163//! Some(Ok(mut record)) => {
164//! for (k, v) in &record.cds.attributes {
165//! match record.seq_features.get_sequence_faa(&k) {
166//! Some(value) => { let seq_faa = value.to_string();
167//! results = amino_percentage(&seq_faa);
168//! let aromatic_aas = vec!['Y','W','F'];
169//! let aromaticity: f64 = aromatic_aas.iter()
170//! .filter_map(|&amino| results.get(&amino))
171//! .map(|&perc| perc / 100.0)
172//! .sum();
173//! println!("aromaticity for {} {} is {}",&record.id, &k, &aromaticity);
174//! },
175//! _ => (),
176//! };
177//! }
178//! },
179//! Some(Err(e)) => { println!("theres an error {:?}", e); },
180//! None => {
181//! println!("finished iteration");
182//! break;
183//! },
184//! }
185//! }
186//! return Ok(());
187//! }
188//!```
189
190
191use std::collections::HashMap;
192use std::fs::File;
193use microBioRust::gbk::Reader;
194
195// Define a macro to generate the getters for each amino acid
196macro_rules! amino_acid_getters {
197 ($struct_name:ident, $( ($field:ident, $full_name:ident, $three_letter:ident, $single_letter:ident) ),* ) => {
198 impl $struct_name {
199 $(
200 // Capital full name getter
201 fn $field(&self) -> f64 {
202 self.$field
203 }
204 // Full name getter
205 fn $full_name(&self) -> f64 {
206 self.$field
207 }
208 // Three-letter code getter
209 fn $three_letter(&self) -> f64 {
210 self.$field
211 }
212 // Single-letter code getter
213 fn $single_letter(&self) -> f64 {
214 self.$field
215 }
216 )*
217 }
218 };
219}
220
221
222pub struct MolWeights {
223 Alanine: f64,
224 Arginine: f64,
225 Asparagine: f64,
226 Aspartate: f64,
227 Cysteine: f64,
228 Glutamate: f64,
229 Glutamine: f64,
230 Glycine: f64,
231 Histidine: f64,
232 Isoleucine: f64,
233 Leucine: f64,
234 Lysine: f64,
235 Methionine: f64,
236 Phenylalanine: f64,
237 Proline: f64,
238 Serine: f64,
239 Threonine: f64,
240 Tryptophan: f64,
241 Tyrosine: f64,
242 Valine: f64,
243}
244
245impl MolWeights {
246 fn new() -> Self {
247 Self {
248 //masses from NIST chemistry webbook US Dept of commerce
249 Alanine: 89.0932, //C3H7NO2
250 Arginine: 174.2010, //C6H14N4O2
251 Asparagine: 132.1179, //C4H8N2O3
252 Aspartate: 133.1027, //C4H7NO4
253 Cysteine: 121.158, //C3H7NO2S
254 Glutamate: 147.1293, //C5H9NO4
255 Glutamine: 146.1445, //C5H10N2O3
256 Glycine: 75.0666, //C2H5NO2
257 Histidine: 155.1546, //C6H9N3O2
258 Isoleucine: 131.1729, //C6H13NO2
259 Leucine: 131.1729, //C6H13NO2
260 Lysine: 146.1876, //C6H14N2O2
261 Methionine: 149.211, //C5H11NO2S
262 Phenylalanine: 165.1891, //C9H11NO2
263 Proline: 115.1305, //C5H9NO2
264 Serine: 105.0926, //C3H7NO2
265 Threonine: 119.1192, //C4H9NO3
266 Tryptophan: 204.2252, //C11H12N2O2
267 Tyrosine: 181.1885, //C9H11NO3
268 Valine: 117.1463, //C5H11NO2
269 }
270 }
271}
272
273#[allow(non_snake_case)]
274pub fn molecular_weight(protein_seq: &str) -> f64 {
275 let amino_weights: MolWeights = MolWeights::new();
276 amino_acid_getters!(MolWeights,
277 (Alanine, alanine, Ala, A),
278 (Arginine, arginine, Arg, R),
279 (Asparagine, asparagine, Asn, N),
280 (Aspartate, aspartate, Asp, D),
281 (Cysteine, cysteine, Cys, C),
282 (Glutamine, glutamine, Gln, Q),
283 (Glutamate, glutamate, Glu, E),
284 (Glycine, glycine, Gly, G),
285 (Histidine, histidine, His, H),
286 (Isoleucine, isoleucine, Ile, I),
287 (Leucine, leucine, Leu, L),
288 (Lysine, lysine, Lys, K),
289 (Methionine, methionine, Met, M),
290 (Phenylalanine, phenylalanine, Phe, F),
291 (Proline, proline, Pro, P),
292 (Serine, serine, Ser, S),
293 (Threonine, threonine, Thr, T),
294 (Tryptophan, trytophan, Trp, W),
295 (Tyrosine, tyrosine, Tyr, Y),
296 (Valine, valine, Val, V)
297 );
298 let mut total_weight = 0.0;
299 for ch in protein_seq.chars() {
300 match ch {
301 'A' => total_weight += amino_weights.A(),
302 'R' => total_weight += amino_weights.R(),
303 'N' => total_weight += amino_weights.N(),
304 'D' => total_weight += amino_weights.D(),
305 'C' => total_weight += amino_weights.C(),
306 'Q' => total_weight += amino_weights.Q(),
307 'E' => total_weight += amino_weights.E(),
308 'G' => total_weight += amino_weights.G(),
309 'H' => total_weight += amino_weights.H(),
310 'I' => total_weight += amino_weights.I(),
311 'L' => total_weight += amino_weights.L(),
312 'K' => total_weight += amino_weights.K(),
313 'M' => total_weight += amino_weights.M(),
314 'F' => total_weight += amino_weights.F(),
315 'P' => total_weight += amino_weights.P(),
316 'S' => total_weight += amino_weights.S(),
317 'T' => total_weight += amino_weights.T(),
318 'W' => total_weight += amino_weights.W(),
319 'Y' => total_weight += amino_weights.Y(),
320 'V' => total_weight += amino_weights.V(),
321 _ => continue,
322 }
323 }
324 let result_weight = total_weight - ((protein_seq.len() - 1) as f64 * 18.02);
325 result_weight
326}
327
328pub struct Hydrophobicity {
329 Alanine: f64,
330 Arginine: f64,
331 Asparagine: f64,
332 Aspartate: f64,
333 Cysteine: f64,
334 Glutamate: f64,
335 Glutamine: f64,
336 Glycine: f64,
337 Histidine: f64,
338 Isoleucine: f64,
339 Leucine: f64,
340 Lysine: f64,
341 Methionine: f64,
342 Phenylalanine: f64,
343 Proline: f64,
344 Serine: f64,
345 Threonine: f64,
346 Tryptophan: f64,
347 Tyrosine: f64,
348 Valine: f64,
349}
350
351impl Hydrophobicity {
352 fn new_KD() -> Self {
353 Self {
354 //Kyte-Doolittle values from the Qiagen resources website
355 Alanine: 1.80,
356 Arginine: -4.50,
357 Asparagine: -3.50,
358 Aspartate: -3.50,
359 Cysteine: 2.50,
360 Glutamate: -3.50,
361 Glutamine: -3.50,
362 Glycine: -0.40,
363 Histidine: -3.20,
364 Isoleucine: 4.50,
365 Leucine: 3.80,
366 Lysine: -3.90,
367 Methionine: 1.90,
368 Phenylalanine: 2.80,
369 Proline: -1.60,
370 Serine: -0.80,
371 Threonine: -0.70,
372 Tryptophan: -0.90,
373 Tyrosine: -1.30,
374 Valine: 4.20,
375 }
376 }
377}
378
379#[allow(non_snake_case)]
380pub fn hydrophobicity(protein_seq: &str, window_size: usize) -> Vec<f64> {
381 let mut hydrophobicity: Hydrophobicity = Hydrophobicity::new_KD();
382 let mut total_hydrophobicity: Vec<f64> = Vec::new();
383 let mut window_values: f64 = 0.0;
384 amino_acid_getters!(Hydrophobicity,
385 (Alanine, alanine, Ala, A),
386 (Arginine, arginine, Arg, R),
387 (Asparagine, asparagine, Asn, N),
388 (Aspartate, aspartate, Asp, D),
389 (Cysteine, cysteine, Cys, C),
390 (Glutamine, glutamine, Gln, Q),
391 (Glutamate, glutamate, Glu, E),
392 (Glycine, glycine, Gly, G),
393 (Histidine, histidine, His, H),
394 (Isoleucine, isoleucine, Ile, I),
395 (Leucine, leucine, Leu, L),
396 (Lysine, lysine, Lys, K),
397 (Methionine, methionine, Met, M),
398 (Phenylalanine, phenylalanine, Phe, F),
399 (Proline, proline, Pro, P),
400 (Serine, serine, Ser, S),
401 (Threonine, threonine, Thr, T),
402 (Tryptophan, trytophan, Trp, W),
403 (Tyrosine, tyrosine, Tyr, Y),
404 (Valine, valine, Val, V)
405 );
406 let mut windows: Vec<String> = protein_seq
407 .chars()
408 .collect::<Vec<_>>()
409 .windows(window_size)
410 .map(|window| window.iter().collect())
411 .collect();
412 for (index, window) in windows.iter().enumerate() {
413 for ch in window.chars() {
414 match ch {
415 'A' => window_values += hydrophobicity.A(),
416 'R' => window_values += hydrophobicity.R(),
417 'N' => window_values += hydrophobicity.N(),
418 'D' => window_values += hydrophobicity.D(),
419 'C' => window_values += hydrophobicity.C(),
420 'Q' => window_values += hydrophobicity.Q(),
421 'E' => window_values += hydrophobicity.E(),
422 'G' => window_values += hydrophobicity.G(),
423 'H' => window_values += hydrophobicity.H(),
424 'I' => window_values += hydrophobicity.I(),
425 'L' => window_values += hydrophobicity.L(),
426 'K' => window_values += hydrophobicity.K(),
427 'M' => window_values += hydrophobicity.M(),
428 'F' => window_values += hydrophobicity.F(),
429 'P' => window_values += hydrophobicity.P(),
430 'S' => window_values += hydrophobicity.S(),
431 'T' => window_values += hydrophobicity.T(),
432 'W' => window_values += hydrophobicity.W(),
433 'Y' => window_values += hydrophobicity.Y(),
434 'V' => window_values += hydrophobicity.V(),
435 _ => continue,
436 }
437 }
438 total_hydrophobicity.push(window_values);
439 }
440 total_hydrophobicity
441}
442
443
444pub fn amino_counts(protein_seq: &str) -> HashMap<char, u64> {
445 let mut counts: HashMap<char, u64> = HashMap::new();
446 for c in protein_seq.chars() {
447 *counts.entry(c).or_insert(0) +=1;
448 }
449 counts
450}
451
452pub fn amino_percentage(protein_seq: &str) -> HashMap<char, f64> {
453 let mut percentages: HashMap<char, f64> = HashMap::new();
454 let counts = amino_counts(protein_seq);
455 let seq_len: f64 = (protein_seq.len() as f64) as f64;
456 percentages = counts.iter().map(|(k, &value)| {
457 let percentage = (value as f64 / seq_len) * 100.0;
458 (k.clone(), percentage)
459 }).collect();
460 percentages
461}
462
463
464
465#[cfg(test)]
466mod tests {
467 use super::*;
468
469 #[test]
470 pub fn suggest_transmembrane_domains() -> Result<(), anyhow::Error> {
471 let file_gbk = File::open("test_output.gbk")?;
472 let mut reader = Reader::new(file_gbk);
473 let mut records = reader.records();
474 loop {
475 match records.next() {
476 Some(Ok(mut record)) => {
477 //println!("next record");
478 //println!("Record id: {:?}", record.id);
479 for (k, v) in &record.cds.attributes {
480 match record.seq_features.get_sequence_faa(&k) {
481 Some(value) => { let seq_faa = value.to_string();
482 println!("{:?}", &seq_faa);
483 let hydro_values = hydrophobicity(&seq_faa, 21);
484 let mut result = String::new();
485 for hydro in hydro_values {
486 if hydro > 1.6 {
487 println!("possible transmembrane region - score {}",&hydro);
488 }
489 else {
490 ()
491 }
492 }
493 },
494 _ => (),
495 };
496
497 }
498 },
499 Some(Err(e)) => { println!("theres an err {:?}", e); },
500 None => {
501 println!("finished iteration");
502 break; },
503 }
504 }
505 return Ok(());
506 }
507
508 #[test]
509 pub fn collect_molecular_weight() -> Result<(), anyhow::Error> {
510 let file_gbk = File::open("test_output.gbk")?;
511 let mut reader = Reader::new(file_gbk);
512 let mut records = reader.records();
513 let mut molecular_weight_total: f64 = 0.0;
514 loop {
515 match records.next() {
516 Some(Ok(mut record)) => {
517 //println!("next record");
518 //println!("Record id: {:?}", record.id);
519 for (k, v) in &record.cds.attributes {
520 match record.seq_features.get_sequence_faa(&k) {
521 Some(value) => { let seq_faa = value.to_string();
522 println!("id: {:?}", &k);
523 molecular_weight_total = molecular_weight(&seq_faa);
524 println!(">{}|{}\n{}", &record.id, &k, molecular_weight_total);
525 },
526 _ => (),
527 };
528
529 }
530 },
531 Some(Err(e)) => { println!("theres an err {:?}", e); },
532 None => {
533 println!("finished iteration");
534 break; },
535 }
536 }
537 return Ok(());
538 }
539
540 #[test]
541 pub fn count_aminos() -> Result<(), anyhow::Error> {
542 let file_gbk = File::open("test_output.gbk")?;
543 let mut reader = Reader::new(file_gbk);
544 let mut records = reader.records();
545 let mut results: HashMap<char, u64> = HashMap::new();
546 loop {
547 match records.next() {
548 Some(Ok(mut record)) => {
549 //println!("next record");
550 //println!("Record id: {:?}", record.id);
551 for (k, v) in &record.cds.attributes {
552 match record.seq_features.get_sequence_faa(&k) {
553 Some(value) => { let seq_faa = value.to_string();
554 println!("id: {:?}", &k);
555 results = amino_counts(&seq_faa);
556 println!(">{}|{}\n{:?}", &record.id, &k, results);
557 },
558 _ => (),
559 };
560
561 }
562 },
563 Some(Err(e)) => { println!("theres an err {:?}", e); },
564 None => {
565 println!("finished iteration");
566 break; },
567 }
568 }
569 return Ok(());
570 }
571
572 #[test]
573 pub fn aromaticity() -> Result<(), anyhow::Error> {
574 // calculated as in biopython with aromaticity according to Lobry, 1994 as the relative freq of Phe+Trp+Tyr
575 let file_gbk = File::open("test_output.gbk")?;
576 let mut reader = Reader::new(file_gbk);
577 let mut records = reader.records();
578 let mut results: HashMap<char, f64> = HashMap::new();
579 loop {
580 match records.next() {
581 Some(Ok(mut record)) => {
582 for (k, v) in &record.cds.attributes {
583 match record.seq_features.get_sequence_faa(&k) {
584 Some(value) => { let seq_faa = value.to_string();
585 results = amino_percentage(&seq_faa);
586 let aromatic_aas = vec!['Y','W','F'];
587 let aromaticity: f64 = aromatic_aas.iter()
588 .filter_map(|&amino| results.get(&amino))
589 .map(|&perc| perc / 100.0)
590 .sum();
591 println!("aromaticity for {} {} is {}",&record.id, &k, &aromaticity);
592 },
593 _ => (),
594 };
595 }
596 },
597 Some(Err(e)) => { println!("theres an error {:?}", e); },
598 None => { println!("finished iteration");
599 break;
600 },
601 }
602 }
603 return Ok(());
604 }
605}