microBioRust_seqmetrics/metrics.rs
1//! The purpose of seqmetrics is to allow calculations of protein sequence parameters
2//! We define a set of amino acid getters to allow getting of the protein sequence
3//! as either the three letter code such as Trp, Phe, the full name such as alanine, glycine
4//! or the single letter code such as Y, A
5//!
6//! Example function to calculate transmembrane statistics
7//!
8//! ```rust
9//!
10//! use clap::Parser;
11//! use std::fs::File;
12//! use microBioRust::gbk::Reader;
13//! use std::io;
14//! use std::collections::HashMap;
15//! use microBioRust_seqmetrics::metrics::hydrophobicity;
16//!
17//! pub fn suggest_transmembrane_domains() -> Result<(), anyhow::Error> {
18//! let file_gbk = File::open("test_output.gbk")?;
19//! let mut reader = Reader::new(file_gbk);
20//! let mut records = reader.records();
21//! loop {
22//! match records.next() {
23//! Some(Ok(mut record)) => {
24//! //println!("next record");
25//! //println!("Record id: {:?}", record.id);
26//! for (k, v) in &record.cds.attributes {
27//! match record.seq_features.get_sequence_faa(&k) {
28//! Some(value) => { let seq_faa = value.to_string();
29//! println!("{:?}", &seq_faa);
30//! let hydro_values = hydrophobicity(&seq_faa, 21);
31//! let mut result = String::new();
32//! for hydro in hydro_values {
33//! if hydro > 1.6 {
34//! println!("possible transmembrane region - score {}",&hydro);
35//! }
36//! else {
37//! ()
38//! }
39//! }
40//! },
41//! _ => (),
42//! };
43//!
44//! }
45//! },
46//! Some(Err(e)) => { println!("theres an err {:?}", e); },
47//! None => {
48//! println!("finished iteration");
49//! break;
50//! },
51//! };
52//! }
53//! return Ok(());
54//! }
55//!```
56//!
57//! Example function to calculate the molecular weight of a protein sequence
58//!
59//!```rust
60//!
61//! use clap::Parser;
62//! use std::fs::File;
63//! use microBioRust::gbk::Reader;
64//! use std::io;
65//! use std::collections::HashMap;
66//! use microBioRust_seqmetrics::metrics::molecular_weight;
67//!
68//! pub fn collect_molecular_weight() -> Result<(), anyhow::Error> {
69//! let file_gbk = File::open("test_output.gbk")?;
70//! let mut reader = Reader::new(file_gbk);
71//! let mut records = reader.records();
72//! let mut molecular_weight_total: f64 = 0.0;
73//! loop {
74//! match records.next() {
75//! Some(Ok(mut record)) => {
76//! //println!("next record");
77//! //println!("Record id: {:?}", record.id);
78//! for (k, v) in &record.cds.attributes {
79//! match record.seq_features.get_sequence_faa(&k) {
80//! Some(value) => {
81//! let seq_faa = value.to_string();
82//! println!("id: {:?}", &k);
83//! molecular_weight_total = molecular_weight(&seq_faa);
84//! println!(">{}|{}\n{}", &record.id, &k, molecular_weight_total);
85//! },
86//! _ => (),
87//! };
88//!
89//! }
90//! },
91//! Some(Err(e)) => { println!("theres an err {:?}", e); },
92//! None => {
93//! println!("finished iteration");
94//! break;
95//! },
96//! }
97//! }
98//! return Ok(());
99//! }
100//!```
101//!
102//! Example function to count amino acids of each protein as raw counts, see below to generate percentages per protein
103//!
104//!```rust
105//!
106//! use clap::Parser;
107//! use std::fs::File;
108//! use microBioRust::gbk::Reader;
109//! use std::io;
110//! use std::collections::HashMap;
111//! use microBioRust_seqmetrics::metrics::amino_counts;
112//!
113//! pub fn count_aminos() -> Result<(), anyhow::Error> {
114//! let file_gbk = File::open("test_output.gbk")?;
115//! let mut reader = Reader::new(file_gbk);
116//! let mut records = reader.records();
117//! let mut results: HashMap<char, u64> = HashMap::new();
118//! loop {
119//! match records.next() {
120//! Some(Ok(mut record)) => {
121//! //println!("next record");
122//! //println!("Record id: {:?}", record.id);
123//! for (k, v) in &record.cds.attributes {
124//! match record.seq_features.get_sequence_faa(&k) {
125//! Some(value) => { let seq_faa = value.to_string();
126//! println!("id: {:?}", &k);
127//! results = amino_counts(&seq_faa);
128//! println!(">{}|{}\n{:?}", &record.id, &k, results);
129//! },
130//! _ => (),
131//! };
132//!
133//! }
134//! },
135//! Some(Err(e)) => { println!("theres an err {:?}", e); },
136//! None => {
137//! println!("finished iteration");
138//! break;
139//! },
140//! }
141//! }
142//! return Ok(());
143//! }
144//!```
145//! Example function to calculate and print out the aromaticity of each protein
146//!
147//!```rust
148//! use clap::Parser;
149//! use std::fs::File;
150//! use microBioRust::gbk::Reader;
151//! use std::io;
152//! use std::collections::HashMap;
153//! use microBioRust_seqmetrics::metrics::amino_percentage;
154//!
155//! pub fn aromaticity() -> Result<(), anyhow::Error> {
156//! // calculated as in biopython with aromaticity according to Lobry, 1994 as the relative freq of Phe+Trp+Tyr
157//! let file_gbk = File::open("test_output.gbk")?;
158//! let mut reader = Reader::new(file_gbk);
159//! let mut records = reader.records();
160//! let mut results: HashMap<char, f64> = HashMap::new();
161//! loop {
162//! match records.next() {
163//! Some(Ok(mut record)) => {
164//! for (k, v) in &record.cds.attributes {
165//! match record.seq_features.get_sequence_faa(&k) {
166//! Some(value) => { let seq_faa = value.to_string();
167//! results = amino_percentage(&seq_faa);
168//! let aromatic_aas = vec!['Y','W','F'];
169//! let aromaticity: f64 = aromatic_aas.iter()
170//! .filter_map(|&amino| results.get(&amino))
171//! .map(|&perc| perc / 100.0)
172//! .sum();
173//! println!("aromaticity for {} {} is {}",&record.id, &k, &aromaticity);
174//! },
175//! _ => (),
176//! };
177//! }
178//! },
179//! Some(Err(e)) => { println!("theres an error {:?}", e); },
180//! None => {
181//! println!("finished iteration");
182//! break;
183//! },
184//! }
185//! }
186//! return Ok(());
187//! }
188//!```
189//! The purpose of hamming.rs is to allow calculations of the hamming distances between sequences
190//! The Hamming distance is the minimum number of substitutions required to change one string to another
191//! It is one of several string metrics for measuring the edit distance between two sequences.
192//! It does not encompass or take into account any biology
193//! It is named after the American Mathematician Richard Hamming (wikipedia)
194//! This is aimed essentially at protein fasta sequences
195//!
196//!
197//! ```
198//! use microBioRust_seqmetrics::hamming::hamming_matrix;
199//! use microBioRust_seqmetrics::write_dst_csv::write_distances_csv;
200//! use tokio::fs::File;
201//! use std::collections::HashMap;
202//! use bio::io::fasta;
203//! use tokio::io;
204//! use tokio::io::{AsyncWriteExt, BufWriter};
205//!
206//!
207//! #[tokio::main]
208//! async fn main() -> Result<(), anyhow::Error> {
209//! let reader = fasta::Reader::new(std::io::stdin());
210//! let records: Vec<_> = reader.records().collect::<Result<_, _>>()?;
211//! println!("gathering records");
212//! let sequences: Vec<String> = records
213//! .iter()
214//! .map(|rec| String::from_utf8_lossy(rec.seq()).to_string())
215//! .collect();
216//! let ids: Vec<String> = records
217//! .iter()
218//! .map(|rec| rec.id().to_string())
219//! .collect();
220//! println!("gathered ids");
221//! let distances = hamming_matrix(&sequences).await?;
222//! write_distances_csv(ids, distances, "hamming_dists.csv").await?;
223//!
224//! Ok(())
225//! }
226//! ```
227
228#![allow(unused_imports)]
229use microBioRust::gbk::Reader;
230use std::fs::File;
231use std::collections::HashMap;
232use crate::hamming::hamming_matrix;
233use crate::write_dst_csv::write_distances_csv;
234use bio::io::fasta;
235
236
237// Define a macro to generate the getters for each amino acid
238#[macro_export]
239macro_rules! amino_acid_getters {
240 ($struct_name:ident, $( ($field:ident, $full_name:ident, $three_letter:ident, $single_letter:ident) ),* ) => {
241 #[allow(non_snake_case)]
242 #[allow(dead_code)]
243 impl $struct_name {
244 $(
245 // Capital full name getter
246 fn $field(&self) -> f64 {
247 self.$field
248 }
249 // Full name getter
250 fn $full_name(&self) -> f64 {
251 self.$field
252 }
253 // Three-letter code getter
254 fn $three_letter(&self) -> f64 {
255 self.$field
256 }
257 // Single-letter code getter
258 fn $single_letter(&self) -> f64 {
259 self.$field
260 }
261 )*
262 }
263 };
264}
265
266#[allow(non_snake_case)]
267#[allow(dead_code)]
268pub struct MolWeights {
269 Alanine: f64,
270 Arginine: f64,
271 Asparagine: f64,
272 Aspartate: f64,
273 Cysteine: f64,
274 Glutamate: f64,
275 Glutamine: f64,
276 Glycine: f64,
277 Histidine: f64,
278 Isoleucine: f64,
279 Leucine: f64,
280 Lysine: f64,
281 Methionine: f64,
282 Phenylalanine: f64,
283 Proline: f64,
284 Serine: f64,
285 Threonine: f64,
286 Tryptophan: f64,
287 Tyrosine: f64,
288 Valine: f64,
289}
290
291#[allow(non_snake_case)]
292#[allow(dead_code)]
293impl MolWeights {
294 fn new() -> Self {
295 Self {
296 //masses from NIST chemistry webbook US Dept of commerce
297 Alanine: 89.0932, //C3H7NO2
298 Arginine: 174.2010, //C6H14N4O2
299 Asparagine: 132.1179, //C4H8N2O3
300 Aspartate: 133.1027, //C4H7NO4
301 Cysteine: 121.158, //C3H7NO2S
302 Glutamate: 147.1293, //C5H9NO4
303 Glutamine: 146.1445, //C5H10N2O3
304 Glycine: 75.0666, //C2H5NO2
305 Histidine: 155.1546, //C6H9N3O2
306 Isoleucine: 131.1729, //C6H13NO2
307 Leucine: 131.1729, //C6H13NO2
308 Lysine: 146.1876, //C6H14N2O2
309 Methionine: 149.211, //C5H11NO2S
310 Phenylalanine: 165.1891, //C9H11NO2
311 Proline: 115.1305, //C5H9NO2
312 Serine: 105.0926, //C3H7NO2
313 Threonine: 119.1192, //C4H9NO3
314 Tryptophan: 204.2252, //C11H12N2O2
315 Tyrosine: 181.1885, //C9H11NO3
316 Valine: 117.1463, //C5H11NO2
317 }
318 }
319}
320
321
322amino_acid_getters!(MolWeights,
323 (Alanine, alanine, Ala, A),
324 (Arginine, arginine, Arg, R),
325 (Asparagine, asparagine, Asn, N),
326 (Aspartate, aspartate, Asp, D),
327 (Cysteine, cysteine, Cys, C),
328 (Glutamine, glutamine, Gln, Q),
329 (Glutamate, glutamate, Glu, E),
330 (Glycine, glycine, Gly, G),
331 (Histidine, histidine, His, H),
332 (Isoleucine, isoleucine, Ile, I),
333 (Leucine, leucine, Leu, L),
334 (Lysine, lysine, Lys, K),
335 (Methionine, methionine, Met, M),
336 (Phenylalanine, phenylalanine, Phe, F),
337 (Proline, proline, Pro, P),
338 (Serine, serine, Ser, S),
339 (Threonine, threonine, Thr, T),
340 (Tryptophan, tryptophan, Trp, W),
341 (Tyrosine, tyrosine, Tyr, Y),
342 (Valine, valine, Val, V)
343 );
344
345#[allow(non_snake_case)]
346#[allow(dead_code)]
347#[allow(unused_variables)]
348pub fn molecular_weight(protein_seq: &str) -> f64 {
349 let amino_weights: MolWeights = MolWeights::new();
350 let mut total_weight = 0.0;
351 for ch in protein_seq.chars() {
352 match ch {
353 'A' => total_weight += amino_weights.A(),
354 'R' => total_weight += amino_weights.R(),
355 'N' => total_weight += amino_weights.N(),
356 'D' => total_weight += amino_weights.D(),
357 'C' => total_weight += amino_weights.C(),
358 'Q' => total_weight += amino_weights.Q(),
359 'E' => total_weight += amino_weights.E(),
360 'G' => total_weight += amino_weights.G(),
361 'H' => total_weight += amino_weights.H(),
362 'I' => total_weight += amino_weights.I(),
363 'L' => total_weight += amino_weights.L(),
364 'K' => total_weight += amino_weights.K(),
365 'M' => total_weight += amino_weights.M(),
366 'F' => total_weight += amino_weights.F(),
367 'P' => total_weight += amino_weights.P(),
368 'S' => total_weight += amino_weights.S(),
369 'T' => total_weight += amino_weights.T(),
370 'W' => total_weight += amino_weights.W(),
371 'Y' => total_weight += amino_weights.Y(),
372 'V' => total_weight += amino_weights.V(),
373 _ => continue,
374 }
375 }
376 let result_weight = total_weight - ((protein_seq.len() - 1) as f64 * 18.02);
377 result_weight
378}
379
380#[allow(non_snake_case)]
381#[allow(dead_code)]
382pub struct Hydrophobicity {
383 Alanine: f64,
384 Arginine: f64,
385 Asparagine: f64,
386 Aspartate: f64,
387 Cysteine: f64,
388 Glutamate: f64,
389 Glutamine: f64,
390 Glycine: f64,
391 Histidine: f64,
392 Isoleucine: f64,
393 Leucine: f64,
394 Lysine: f64,
395 Methionine: f64,
396 Phenylalanine: f64,
397 Proline: f64,
398 Serine: f64,
399 Threonine: f64,
400 Tryptophan: f64,
401 Tyrosine: f64,
402 Valine: f64,
403}
404
405impl Hydrophobicity {
406 #[allow(non_snake_case)]
407 fn new_KD() -> Self {
408 Self {
409 //Kyte-Doolittle values from the Qiagen resources website
410 Alanine: 1.80,
411 Arginine: -4.50,
412 Asparagine: -3.50,
413 Aspartate: -3.50,
414 Cysteine: 2.50,
415 Glutamate: -3.50,
416 Glutamine: -3.50,
417 Glycine: -0.40,
418 Histidine: -3.20,
419 Isoleucine: 4.50,
420 Leucine: 3.80,
421 Lysine: -3.90,
422 Methionine: 1.90,
423 Phenylalanine: 2.80,
424 Proline: -1.60,
425 Serine: -0.80,
426 Threonine: -0.70,
427 Tryptophan: -0.90,
428 Tyrosine: -1.30,
429 Valine: 4.20,
430 }
431 }
432}
433
434amino_acid_getters!(Hydrophobicity,
435 (Alanine, alanine, Ala, A),
436 (Arginine, arginine, Arg, R),
437 (Asparagine, asparagine, Asn, N),
438 (Aspartate, aspartate, Asp, D),
439 (Cysteine, cysteine, Cys, C),
440 (Glutamine, glutamine, Gln, Q),
441 (Glutamate, glutamate, Glu, E),
442 (Glycine, glycine, Gly, G),
443 (Histidine, histidine, His, H),
444 (Isoleucine, isoleucine, Ile, I),
445 (Leucine, leucine, Leu, L),
446 (Lysine, lysine, Lys, K),
447 (Methionine, methionine, Met, M),
448 (Phenylalanine, phenylalanine, Phe, F),
449 (Proline, proline, Pro, P),
450 (Serine, serine, Ser, S),
451 (Threonine, threonine, Thr, T),
452 (Tryptophan, trytophan, Trp, W),
453 (Tyrosine, tyrosine, Tyr, Y),
454 (Valine, valine, Val, V)
455 );
456
457#[allow(non_snake_case)]
458#[allow(dead_code)]
459#[allow(unused_mut)]
460#[allow(unused_variables)]
461pub fn hydrophobicity(protein_seq: &str, window_size: usize) -> Vec<f64> {
462 let mut hydrophobicity: Hydrophobicity = Hydrophobicity::new_KD();
463 let mut total_hydrophobicity: Vec<f64> = Vec::new();
464 let mut window_values: f64 = 0.0;
465 let mut windows: Vec<String> = protein_seq
466 .chars()
467 .collect::<Vec<_>>()
468 .windows(window_size)
469 .map(|window| window.iter().collect())
470 .collect();
471 for (index, window) in windows.iter().enumerate() {
472 for ch in window.chars() {
473 match ch {
474 'A' => window_values += hydrophobicity.A(),
475 'R' => window_values += hydrophobicity.R(),
476 'N' => window_values += hydrophobicity.N(),
477 'D' => window_values += hydrophobicity.D(),
478 'C' => window_values += hydrophobicity.C(),
479 'Q' => window_values += hydrophobicity.Q(),
480 'E' => window_values += hydrophobicity.E(),
481 'G' => window_values += hydrophobicity.G(),
482 'H' => window_values += hydrophobicity.H(),
483 'I' => window_values += hydrophobicity.I(),
484 'L' => window_values += hydrophobicity.L(),
485 'K' => window_values += hydrophobicity.K(),
486 'M' => window_values += hydrophobicity.M(),
487 'F' => window_values += hydrophobicity.F(),
488 'P' => window_values += hydrophobicity.P(),
489 'S' => window_values += hydrophobicity.S(),
490 'T' => window_values += hydrophobicity.T(),
491 'W' => window_values += hydrophobicity.W(),
492 'Y' => window_values += hydrophobicity.Y(),
493 'V' => window_values += hydrophobicity.V(),
494 _ => continue,
495 }
496 }
497 total_hydrophobicity.push(window_values);
498 }
499 total_hydrophobicity
500}
501
502
503pub fn amino_counts(protein_seq: &str) -> HashMap<char, u64> {
504 let mut counts: HashMap<char, u64> = HashMap::new();
505 for c in protein_seq.chars() {
506 *counts.entry(c).or_insert(0) +=1;
507 }
508 counts
509}
510
511#[allow(unused_mut)]
512#[allow(unused_variables)]
513#[allow(unused_assignments)]
514#[allow(dead_code)]
515pub fn amino_percentage(protein_seq: &str) -> HashMap<char, f64> {
516 let mut percentages: HashMap<char, f64> = HashMap::new();
517 let counts = amino_counts(protein_seq);
518 let seq_len: f64 = (protein_seq.len() as f64) as f64;
519 percentages = counts.iter().map(|(k, &value)| {
520 let percentage = (value as f64 / seq_len) * 100.0;
521 (k.clone(), percentage)
522 }).collect();
523 percentages
524}
525
526
527#[cfg(test)]
528mod tests {
529 use super::*;
530
531 #[test]
532 #[allow(unused_mut)]
533 #[allow(dead_code)]
534 #[allow(unused_variables)]
535 pub fn suggest_transmembrane_domains() -> Result<(), anyhow::Error> {
536 let file_gbk = File::open("test_output.gbk")?;
537 let mut reader = Reader::new(file_gbk);
538 let mut records = reader.records();
539 loop {
540 match records.next() {
541 Some(Ok(mut record)) => {
542 //println!("next record");
543 //println!("Record id: {:?}", record.id);
544 for (k, v) in &record.cds.attributes {
545 match record.seq_features.get_sequence_faa(&k) {
546 Some(value) => { let seq_faa = value.to_string();
547 println!("{:?}", &seq_faa);
548 let hydro_values = hydrophobicity(&seq_faa, 21);
549 let mut result = String::new();
550 for hydro in hydro_values {
551 if hydro > 1.6 {
552 println!("possible transmembrane region - score {}",&hydro);
553 }
554 else {
555 ()
556 }
557 }
558 },
559 _ => (),
560 };
561
562 }
563 },
564 Some(Err(e)) => { println!("theres an err {:?}", e); },
565 None => {
566 println!("finished iteration");
567 break; },
568 }
569 }
570 return Ok(());
571 }
572
573 #[test]
574 #[allow(unused_mut)]
575 #[allow(dead_code)]
576 #[allow(unused_variables)]
577 #[allow(unused_assignments)]
578 pub fn collect_molecular_weight() -> Result<(), anyhow::Error> {
579 let file_gbk = File::open("test_output.gbk")?;
580 let mut reader = Reader::new(file_gbk);
581 let mut records = reader.records();
582 let mut molecular_weight_total: f64 = 0.0;
583 loop {
584 match records.next() {
585 Some(Ok(mut record)) => {
586 //println!("next record");
587 //println!("Record id: {:?}", record.id);
588 for (k, v) in &record.cds.attributes {
589 match record.seq_features.get_sequence_faa(&k) {
590 Some(value) => { let seq_faa = value.to_string();
591 println!("id: {:?}", &k);
592 molecular_weight_total = molecular_weight(&seq_faa);
593 println!(">{}|{}\n{}", &record.id, &k, molecular_weight_total);
594 },
595 _ => (),
596 };
597
598 }
599 },
600 Some(Err(e)) => { println!("theres an err {:?}", e); },
601 None => {
602 println!("finished iteration");
603 break; },
604 }
605 }
606 return Ok(());
607 }
608
609 #[test]
610 #[allow(unused_mut)]
611 #[allow(unused_variables)]
612 #[allow(unused_assignments)]
613 pub fn count_aminos() -> Result<(), anyhow::Error> {
614 let file_gbk = File::open("test_output.gbk")?;
615 let mut reader = Reader::new(file_gbk);
616 let mut records = reader.records();
617 let mut results: HashMap<char, u64> = HashMap::new();
618 loop {
619 match records.next() {
620 Some(Ok(record)) => {
621 for (k, _v) in &record.cds.attributes {
622 match record.seq_features.get_sequence_faa(&k) {
623 Some(value) => { let seq_faa = value.to_string();
624 println!("id: {:?}", &k);
625 results = amino_counts(&seq_faa);
626 println!(">{}|{}\n{:?}", &record.id, &k, results);
627 },
628 _ => (),
629 };
630
631 }
632 },
633 Some(Err(e)) => { println!("theres an err {:?}", e); },
634 None => {
635 println!("finished iteration");
636 break; },
637 }
638 }
639 return Ok(());
640 }
641
642 #[test]
643 #[allow(dead_code)]
644 #[allow(unused_mut)]
645 #[allow(unused_variables)]
646 #[allow(unused_assignments)]
647 pub fn aromaticity() -> Result<(), anyhow::Error> {
648 // calculated as in biopython with aromaticity according to Lobry, 1994 as the relative freq of Phe+Trp+Tyr
649 let file_gbk = File::open("test_output.gbk")?;
650 let reader = Reader::new(file_gbk);
651 let mut records = reader.records();
652 let mut results: HashMap<char, f64> = HashMap::new();
653 loop {
654 match records.next() {
655 Some(Ok(record)) => {
656 for (k, _v) in &record.cds.attributes {
657 match record.seq_features.get_sequence_faa(&k) {
658 Some(value) => { let seq_faa = value.to_string();
659 results = amino_percentage(&seq_faa);
660 let aromatic_aas = vec!['Y','W','F'];
661 let aromaticity: f64 = aromatic_aas.iter()
662 .filter_map(|&amino| results.get(&amino))
663 .map(|&perc| perc / 100.0)
664 .sum();
665 println!("aromaticity for {} {} is {}",&record.id, &k, &aromaticity);
666 },
667 _ => (),
668 };
669 }
670 },
671 Some(Err(e)) => { println!("theres an error {:?}", e); },
672 None => { println!("finished iteration");
673 break;
674 },
675 }
676 }
677 return Ok(());
678 }
679 #[tokio::test]
680 pub async fn main() -> Result<(), anyhow::Error> {
681 let reader = fasta::Reader::new(File::open("test_hamming.aln")?);
682 let records: Vec<_> = reader.records().collect::<Result<_, _>>()?;
683 let sequences: Vec<String> = records
684 .iter()
685 .map(|rec| String::from_utf8_lossy(rec.seq()).to_string())
686 .collect();
687 let ids: Vec<String> = records
688 .iter()
689 .map(|rec| rec.id().to_string())
690 .collect();
691 let distances = hamming_matrix(&sequences).await?;
692 let _ = write_distances_csv(ids, distances, "hamming_dists.csv");
693 Ok(())
694 }
695}