Skip to main content

microBioRust_seqmetrics/
metrics.rs

1//!  The purpose of seqmetrics is to allow calculations of protein sequence parameters
2//!  We define a set of amino acid getters to allow getting of the protein sequence
3//!  as either the three letter code such as Trp, Phe, the full name such as alanine, glycine
4//!  or the single letter code such as Y, A
5//!
6//!   Example function to calculate transmembrane statistics
7//!
8//! ```rust
9//!
10//! use clap::Parser;
11//! use std::fs::File;
12//! use microBioRust::gbk::Reader;
13//! use std::io;
14//! use std::collections::HashMap;
15//! use microBioRust_seqmetrics::metrics::hydrophobicity;
16//!
17//! pub fn suggest_transmembrane_domains() -> Result<(), anyhow::Error> {
18//!            let file_gbk = File::open("K12_ribo.gbk")?;
19//!            let mut reader = Reader::new(file_gbk);
20//!            let mut records = reader.records();
21//!            loop {  
22//!                match records.next() {  
23//!                    Some(Ok(mut record)) => {
24//!                       //println!("next record");
25//!                       //println!("Record id: {:?}", record.id);
26//!                       for (k, v) in &record.cds.attributes {
27//!                           match record.seq_features.get_sequence_faa(&k) {
28//!                                     Some(value) => { let seq_faa = value.to_string();
29//!				                      println!("{:?}", &seq_faa);
30//!				                      let hydro_values = hydrophobicity(&seq_faa, 21);
31//!						      let mut result = String::new();
32//!						      for hydro in hydro_values {
33//!						           if hydro > 1.6 {
34//!						               println!("possible transmembrane region - score {}",&hydro);  
35//!							       }
36//!						           else {
37//!						               ()
38//!							   }
39//!						      }
40//!                                                 },
41//!                                     _ => (),
42//!                                     };
43//!                       
44//!                           }
45//!                    },
46//!                    Some(Err(e)) => { println!("theres an err {:?}", e); },
47//!                    None => {
48//!                             println!("finished iteration");
49//!                             break;
50//!                             },
51//!                    };
52//!               }
53//!            return Ok(());
54//!   }
55//!```   
56//!
57//!   Example function to calculate the molecular weight of a protein sequence
58//!
59//!```rust
60//!
61//! use clap::Parser;
62//! use std::fs::File;
63//! use microBioRust::gbk::Reader;
64//! use std::io;
65//! use std::collections::HashMap;
66//! use microBioRust_seqmetrics::metrics::molecular_weight;
67//!
68//! pub fn collect_molecular_weight() -> Result<(), anyhow::Error> {
69//!            let file_gbk = File::open("K12_ribo.gbk")?;
70//!            let mut reader = Reader::new(file_gbk);
71//!            let mut records = reader.records();
72//!	    let mut molecular_weight_total: f64 = 0.0;
73//!            loop {  
74//!                match records.next() {  
75//!                    Some(Ok(mut record)) => {
76//!                       //println!("next record");
77//!                       //println!("Record id: {:?}", record.id);
78//!                      for (k, v) in &record.cds.attributes {
79//!                           match record.seq_features.get_sequence_faa(&k) {
80//!                                     Some(value) => {
81//!                                                     let seq_faa = value.to_string();
82//!				                        println!("id: {:?}", &k);
83//!				                        molecular_weight_total = molecular_weight(&seq_faa);
84//!                                                     println!(">{}|{}\n{}", &record.id, &k, molecular_weight_total);
85//!                                                    },
86//!                                     _ => (),
87//!                                     };
88//!                       
89//!                           }
90//!                    },
91//!                    Some(Err(e)) => { println!("theres an err {:?}", e); },
92//!                    None => {
93//!                              println!("finished iteration");
94//!                              break;
95//!                            },
96//!                    }
97//!               }
98//!            return Ok(());
99//!   }
100//!```
101//!
102//!   Example function to count amino acids of each protein as raw counts, see below to generate percentages per protein
103//!
104//!```rust
105//!
106//! use clap::Parser;
107//! use std::fs::File;
108//! use microBioRust::gbk::Reader;
109//! use std::io;
110//! use std::collections::HashMap;
111//! use microBioRust_seqmetrics::metrics::amino_counts;
112//!
113//! pub fn count_aminos() -> Result<(), anyhow::Error> {
114//!            let file_gbk = File::open("K12_ribo.gbk")?;
115//!            let mut reader = Reader::new(file_gbk);
116//!            let mut records = reader.records();
117//!	    let mut results: HashMap<char, u64> = HashMap::new();
118//!            loop {  
119//!                match records.next() {  
120//!                   Some(Ok(mut record)) => {
121//!                       //println!("next record");
122//!                       //println!("Record id: {:?}", record.id);
123//!                       for (k, v) in &record.cds.attributes {
124//!                           match record.seq_features.get_sequence_faa(&k) {
125//!                                     Some(value) => { let seq_faa = value.to_string();
126//!				                      println!("id: {:?}", &k);
127//!				                      results = amino_counts(&seq_faa);
128//!                                                      println!(">{}|{}\n{:?}", &record.id, &k, results);
129//!                                                      },
130//!                                     _ => (),
131//!                                     };
132//!                       
133//!                           }
134//!                    },
135//!                    Some(Err(e)) => { println!("theres an err {:?}", e); },
136//!                    None => {
137//!                             println!("finished iteration");
138//!                             break;
139//!                             },
140//!                    }
141//!               }
142//!            return Ok(());
143//!   }
144//!```
145//!  Example function to calculate and print out the aromaticity of each protein.  You can do the equivalent but using the instability_index function for those calculations.
146//!  The instability index is from the method by Guruprasad et al., 1990. A protein with an instability index > 40 may be unstable in the test tube, whilst one < 40 is expected
147//!  to be stable.  This interpretation should be taken as a guideline only.
148//!
149//!```rust
150//! use clap::Parser;
151//! use std::fs::File;
152//! use microBioRust::gbk::Reader;
153//! use std::io;
154//! use std::collections::HashMap;
155//! use microBioRust_seqmetrics::metrics::amino_percentage;
156//!
157//! pub fn aromaticity() -> Result<(), anyhow::Error> {
158//!        // calculated as in biopython with aromaticity according to Lobry, 1994 as the relative freq of Phe+Trp+Tyr
159//!        let file_gbk = File::open("K12_ribo.gbk")?;
160//!	let mut reader = Reader::new(file_gbk);
161//!	let mut records = reader.records();
162//!	let mut results: HashMap<char, f64> = HashMap::new();
163//!	loop {
164//!	   match records.next() {
165//!	      Some(Ok(mut record)) => {
166//!	          for (k, v) in &record.cds.attributes {
167//!		     match record.seq_features.get_sequence_faa(&k) {
168//!		         Some(value) => {  let seq_faa = value.to_string();
169//!			                   results = amino_percentage(&seq_faa);
170//!					   let aromatic_aas = vec!['Y','W','F'];
171//!					   let aromaticity: f64 = aromatic_aas.iter()
172//!					       .filter_map(|&amino| results.get(&amino))
173//!					       .map(|&perc| perc / 100.0)
174//!					       .sum();
175//!					   println!("aromaticity for {} {} is {}",&record.id, &k, &aromaticity);
176//!					  },
177//!			_ => (),
178//!			};
179//!		   }
180//!	         },
181//!	    Some(Err(e)) => { println!("theres an error {:?}", e); },
182//!	    None => {
183//!                   println!("finished iteration");
184//!	              break;
185//!		    },
186//!	    }
187//!       }
188//!      return Ok(());
189//!   }
190//!```
191//!  The purpose of hamming.rs is to allow calculations of the hamming distances between sequences
192//!  The Hamming distance is the minimum number of substitutions required to change one string to another
193//!  It is one of several string metrics for measuring the edit distance between two sequences.
194//!  It does not encompass or take into account any biology
195//!  It is named after the American Mathematician Richard Hamming (wikipedia)
196//!  This is aimed essentially at protein fasta sequences
197//!  
198//!
199//!  ```
200//!  use microBioRust_seqmetrics::hamming::hamming_matrix;
201//!  use microBioRust_seqmetrics::write_dst_csv::write_distances_csv;
202//!  use tokio::fs::File;
203//!  use std::collections::HashMap;
204//!  use bio::io::fasta;
205//!  use tokio::io;
206//!  use tokio::io::{AsyncWriteExt, BufWriter};
207//!
208//!
209//!  #[tokio::main]
210//!  async fn main() -> Result<(), anyhow::Error> {
211//!            let reader = fasta::Reader::new(std::io::stdin());
212//!            let records: Vec<_> = reader.records().collect::<Result<_, _>>()?;
213//!	    println!("gathering records");
214//!            let sequences: Vec<String> = records
215//!	                          .iter()
216//!				  .map(|rec| String::from_utf8_lossy(rec.seq()).to_string())
217//!				  .collect();
218//!            let ids: Vec<String> = records
219//!	                          .iter()
220//!				  .map(|rec| rec.id().to_string())
221//!				  .collect();
222//!	    println!("gathered ids");
223//!	    let distances = hamming_matrix(&sequences).await?;
224//!	    write_distances_csv(ids, distances, "hamming_dists.csv").await?;
225//!
226//!         Ok(())
227//!  }
228//!  ```
229
230#![allow(unused_imports)]
231use crate::hamming::hamming_matrix;
232use crate::write_dst_csv::write_distances_csv;
233use bio::io::fasta;
234use microBioRust::gbk::Reader;
235use std::collections::HashMap;
236use std::fs::File;
237
238// Define a macro to generate the getters for each amino acid
239#[macro_export]
240macro_rules! amino_acid_getters {
241    ($struct_name:ident, $( ($field:ident, $full_name:ident, $three_letter:ident, $single_letter:ident) ),* ) => {
242        #[allow(non_snake_case)]
243	#[allow(dead_code)]
244        impl $struct_name {
245            $(
246	        // Capital full name getter
247		fn $field(&self) -> f64 {
248		   self.$field
249		}
250                // Full name getter
251                fn $full_name(&self) -> f64 {
252                    self.$field
253                }
254                // Three-letter code getter
255                fn $three_letter(&self) -> f64 {
256                    self.$field
257                }
258                // Single-letter code getter
259                fn $single_letter(&self) -> f64 {
260                    self.$field
261                }
262            )*
263        }
264    };
265}
266
267#[allow(non_snake_case)]
268#[allow(dead_code)]
269pub struct MolWeights {
270    Alanine: f64,
271    Arginine: f64,
272    Asparagine: f64,
273    Aspartate: f64,
274    Cysteine: f64,
275    Glutamate: f64,
276    Glutamine: f64,
277    Glycine: f64,
278    Histidine: f64,
279    Isoleucine: f64,
280    Leucine: f64,
281    Lysine: f64,
282    Methionine: f64,
283    Phenylalanine: f64,
284    Proline: f64,
285    Serine: f64,
286    Threonine: f64,
287    Tryptophan: f64,
288    Tyrosine: f64,
289    Valine: f64,
290}
291
292#[allow(non_snake_case)]
293#[allow(dead_code)]
294impl MolWeights {
295    fn new() -> Self {
296        Self {
297            //masses from NIST chemistry webbook US Dept of commerce
298            Alanine: 89.0932,        //C3H7NO2
299            Arginine: 174.2010,      //C6H14N4O2
300            Asparagine: 132.1179,    //C4H8N2O3
301            Aspartate: 133.1027,     //C4H7NO4
302            Cysteine: 121.158,       //C3H7NO2S
303            Glutamate: 147.1293,     //C5H9NO4
304            Glutamine: 146.1445,     //C5H10N2O3
305            Glycine: 75.0666,        //C2H5NO2
306            Histidine: 155.1546,     //C6H9N3O2
307            Isoleucine: 131.1729,    //C6H13NO2
308            Leucine: 131.1729,       //C6H13NO2
309            Lysine: 146.1876,        //C6H14N2O2
310            Methionine: 149.211,     //C5H11NO2S
311            Phenylalanine: 165.1891, //C9H11NO2
312            Proline: 115.1305,       //C5H9NO2
313            Serine: 105.0926,        //C3H7NO2
314            Threonine: 119.1192,     //C4H9NO3
315            Tryptophan: 204.2252,    //C11H12N2O2
316            Tyrosine: 181.1885,      //C9H11NO3
317            Valine: 117.1463,        //C5H11NO2
318        }
319    }
320}
321
322amino_acid_getters!(
323    MolWeights,
324    (Alanine, alanine, Ala, A),
325    (Arginine, arginine, Arg, R),
326    (Asparagine, asparagine, Asn, N),
327    (Aspartate, aspartate, Asp, D),
328    (Cysteine, cysteine, Cys, C),
329    (Glutamine, glutamine, Gln, Q),
330    (Glutamate, glutamate, Glu, E),
331    (Glycine, glycine, Gly, G),
332    (Histidine, histidine, His, H),
333    (Isoleucine, isoleucine, Ile, I),
334    (Leucine, leucine, Leu, L),
335    (Lysine, lysine, Lys, K),
336    (Methionine, methionine, Met, M),
337    (Phenylalanine, phenylalanine, Phe, F),
338    (Proline, proline, Pro, P),
339    (Serine, serine, Ser, S),
340    (Threonine, threonine, Thr, T),
341    (Tryptophan, tryptophan, Trp, W),
342    (Tyrosine, tyrosine, Tyr, Y),
343    (Valine, valine, Val, V)
344);
345
346#[allow(non_snake_case)]
347#[allow(dead_code)]
348#[allow(unused_variables)]
349pub fn molecular_weight(protein_seq: &str) -> f64 {
350    let amino_weights: MolWeights = MolWeights::new();
351    let mut total_weight = 0.0;
352    for ch in protein_seq.chars() {
353        match ch {
354            'A' => total_weight += amino_weights.A(),
355            'R' => total_weight += amino_weights.R(),
356            'N' => total_weight += amino_weights.N(),
357            'D' => total_weight += amino_weights.D(),
358            'C' => total_weight += amino_weights.C(),
359            'Q' => total_weight += amino_weights.Q(),
360            'E' => total_weight += amino_weights.E(),
361            'G' => total_weight += amino_weights.G(),
362            'H' => total_weight += amino_weights.H(),
363            'I' => total_weight += amino_weights.I(),
364            'L' => total_weight += amino_weights.L(),
365            'K' => total_weight += amino_weights.K(),
366            'M' => total_weight += amino_weights.M(),
367            'F' => total_weight += amino_weights.F(),
368            'P' => total_weight += amino_weights.P(),
369            'S' => total_weight += amino_weights.S(),
370            'T' => total_weight += amino_weights.T(),
371            'W' => total_weight += amino_weights.W(),
372            'Y' => total_weight += amino_weights.Y(),
373            'V' => total_weight += amino_weights.V(),
374            _ => continue,
375        }
376    }
377    let result_weight = total_weight - ((protein_seq.len() - 1) as f64 * 18.02);
378    result_weight
379}
380
381use tokio::io::AsyncBufReadExt;
382use tokio::io::BufReader;
383#[allow(non_snake_case)]
384#[allow(dead_code)]
385pub async fn load_instability(path: &str) -> Result<HashMap<String, f64>, anyhow::Error> {
386    let file = tokio::fs::File::open(path).await?;
387    let reader = BufReader::new(file);
388    let mut lines = reader.lines();
389    let mut weights = HashMap::new();
390    while let Some(line) = lines.next_line().await? {
391        let line = line.trim().to_string();
392        if line.is_empty() {
393            continue;
394        }
395        let parts: Vec<&str> = line.split(',').collect();
396        if parts.len() != 2 {
397            continue;
398        }
399        let key = parts[0].trim().to_string();
400        let val: f64 = parts[1]
401            .trim()
402            .replace('—', "-") // handle minus dash
403            .parse()
404            .unwrap();
405        weights.insert(key, val);
406    }
407    Ok(weights)
408}
409
410#[allow(non_snake_case)]
411#[allow(dead_code)]
412pub async fn instability_index(seq: String, weights: &HashMap<String, f64>) -> f64 {
413    let chars: Vec<char> = seq.chars().collect();
414    let mut total = 0.0;
415    for window in chars.windows(2) {
416        let pair = format!("{}{}", window[0], window[1]);
417        if let Some(val) = weights.get(&pair) {
418            total += val;
419        }
420    }
421    total
422}
423
424#[allow(non_snake_case)]
425#[allow(dead_code)]
426pub struct Hydrophobicity {
427    Alanine: f64,
428    Arginine: f64,
429    Asparagine: f64,
430    Aspartate: f64,
431    Cysteine: f64,
432    Glutamate: f64,
433    Glutamine: f64,
434    Glycine: f64,
435    Histidine: f64,
436    Isoleucine: f64,
437    Leucine: f64,
438    Lysine: f64,
439    Methionine: f64,
440    Phenylalanine: f64,
441    Proline: f64,
442    Serine: f64,
443    Threonine: f64,
444    Tryptophan: f64,
445    Tyrosine: f64,
446    Valine: f64,
447}
448
449impl Hydrophobicity {
450    #[allow(non_snake_case)]
451    fn new_KD() -> Self {
452        Self {
453            //Kyte-Doolittle values from the Qiagen resources website
454            Alanine: 1.80,
455            Arginine: -4.50,
456            Asparagine: -3.50,
457            Aspartate: -3.50,
458            Cysteine: 2.50,
459            Glutamate: -3.50,
460            Glutamine: -3.50,
461            Glycine: -0.40,
462            Histidine: -3.20,
463            Isoleucine: 4.50,
464            Leucine: 3.80,
465            Lysine: -3.90,
466            Methionine: 1.90,
467            Phenylalanine: 2.80,
468            Proline: -1.60,
469            Serine: -0.80,
470            Threonine: -0.70,
471            Tryptophan: -0.90,
472            Tyrosine: -1.30,
473            Valine: 4.20,
474        }
475    }
476}
477
478amino_acid_getters!(
479    Hydrophobicity,
480    (Alanine, alanine, Ala, A),
481    (Arginine, arginine, Arg, R),
482    (Asparagine, asparagine, Asn, N),
483    (Aspartate, aspartate, Asp, D),
484    (Cysteine, cysteine, Cys, C),
485    (Glutamine, glutamine, Gln, Q),
486    (Glutamate, glutamate, Glu, E),
487    (Glycine, glycine, Gly, G),
488    (Histidine, histidine, His, H),
489    (Isoleucine, isoleucine, Ile, I),
490    (Leucine, leucine, Leu, L),
491    (Lysine, lysine, Lys, K),
492    (Methionine, methionine, Met, M),
493    (Phenylalanine, phenylalanine, Phe, F),
494    (Proline, proline, Pro, P),
495    (Serine, serine, Ser, S),
496    (Threonine, threonine, Thr, T),
497    (Tryptophan, trytophan, Trp, W),
498    (Tyrosine, tyrosine, Tyr, Y),
499    (Valine, valine, Val, V)
500);
501
502#[allow(non_snake_case)]
503#[allow(dead_code)]
504#[allow(unused_mut)]
505#[allow(unused_variables)]
506pub fn hydrophobicity(protein_seq: &str, window_size: usize) -> Vec<f64> {
507    let mut hydrophobicity: Hydrophobicity = Hydrophobicity::new_KD();
508    let mut total_hydrophobicity: Vec<f64> = Vec::new();
509    let mut window_values: f64 = 0.0;
510    let mut windows: Vec<String> = protein_seq
511        .chars()
512        .collect::<Vec<_>>()
513        .windows(window_size)
514        .map(|window| window.iter().collect())
515        .collect();
516    for (index, window) in windows.iter().enumerate() {
517        for ch in window.chars() {
518            match ch {
519                'A' => window_values += hydrophobicity.A(),
520                'R' => window_values += hydrophobicity.R(),
521                'N' => window_values += hydrophobicity.N(),
522                'D' => window_values += hydrophobicity.D(),
523                'C' => window_values += hydrophobicity.C(),
524                'Q' => window_values += hydrophobicity.Q(),
525                'E' => window_values += hydrophobicity.E(),
526                'G' => window_values += hydrophobicity.G(),
527                'H' => window_values += hydrophobicity.H(),
528                'I' => window_values += hydrophobicity.I(),
529                'L' => window_values += hydrophobicity.L(),
530                'K' => window_values += hydrophobicity.K(),
531                'M' => window_values += hydrophobicity.M(),
532                'F' => window_values += hydrophobicity.F(),
533                'P' => window_values += hydrophobicity.P(),
534                'S' => window_values += hydrophobicity.S(),
535                'T' => window_values += hydrophobicity.T(),
536                'W' => window_values += hydrophobicity.W(),
537                'Y' => window_values += hydrophobicity.Y(),
538                'V' => window_values += hydrophobicity.V(),
539                _ => continue,
540            }
541        }
542        total_hydrophobicity.push(window_values);
543    }
544    total_hydrophobicity
545}
546
547pub fn amino_counts(protein_seq: &str) -> HashMap<char, u64> {
548    let mut counts: HashMap<char, u64> = HashMap::new();
549    for c in protein_seq.chars() {
550        *counts.entry(c).or_insert(0) += 1;
551    }
552    counts
553}
554
555#[allow(unused_mut)]
556#[allow(unused_variables)]
557#[allow(unused_assignments)]
558#[allow(dead_code)]
559pub fn amino_percentage(protein_seq: &str) -> HashMap<char, f64> {
560    let mut percentages: HashMap<char, f64> = HashMap::new();
561    let counts = amino_counts(protein_seq);
562    let seq_len: f64 = (protein_seq.len() as f64) as f64;
563    percentages = counts
564        .iter()
565        .map(|(k, &value)| {
566            let percentage = (value as f64 / seq_len) * 100.0;
567            (k.clone(), percentage)
568        })
569        .collect();
570    percentages
571}
572
573#[cfg(test)]
574mod tests {
575    use super::*;
576
577    #[test]
578    #[allow(unused_mut)]
579    #[allow(dead_code)]
580    #[allow(unused_variables)]
581    pub fn suggest_transmembrane_domains() -> Result<(), anyhow::Error> {
582        let file_gbk = File::open("K12_ribo.gbk")?;
583        let mut reader = Reader::new(file_gbk);
584        let mut records = reader.records();
585        loop {
586            match records.next() {
587                Some(Ok(mut record)) => {
588                    //println!("next record");
589                    //println!("Record id: {:?}", record.id);
590                    for (k, v) in &record.cds.attributes {
591                        match record.seq_features.get_sequence_faa(&k) {
592                            Some(value) => {
593                                let seq_faa = value.to_string();
594                                println!("{:?}", &seq_faa);
595                                let hydro_values = hydrophobicity(&seq_faa, 21);
596                                let mut result = String::new();
597                                for hydro in hydro_values {
598                                    if hydro > 1.6 {
599                                        println!(
600                                            "possible transmembrane region - score {}",
601                                            &hydro
602                                        );
603                                    } else {
604                                        ()
605                                    }
606                                }
607                            }
608                            _ => (),
609                        };
610                    }
611                }
612                Some(Err(e)) => {
613                    println!("theres an err {:?}", e);
614                }
615                None => {
616                    println!("finished iteration");
617                    break;
618                }
619            }
620        }
621        return Ok(());
622    }
623
624    #[test]
625    #[allow(unused_mut)]
626    #[allow(dead_code)]
627    #[allow(unused_variables)]
628    #[allow(unused_assignments)]
629    pub fn collect_molecular_weight() -> Result<(), anyhow::Error> {
630        let file_gbk = File::open("K12_ribo.gbk")?;
631        let mut reader = Reader::new(file_gbk);
632        let mut records = reader.records();
633        let mut molecular_weight_total: f64 = 0.0;
634        loop {
635            match records.next() {
636                Some(Ok(mut record)) => {
637                    //println!("next record");
638                    //println!("Record id: {:?}", record.id);
639                    for (k, v) in &record.cds.attributes {
640                        match record.seq_features.get_sequence_faa(&k) {
641                            Some(value) => {
642                                let seq_faa = value.to_string();
643                                println!("id: {:?}", &k);
644                                molecular_weight_total = molecular_weight(&seq_faa);
645                                println!(">{}|{}\n{}", &record.id, &k, molecular_weight_total);
646                            }
647                            _ => (),
648                        };
649                    }
650                }
651                Some(Err(e)) => {
652                    println!("theres an err {:?}", e);
653                }
654                None => {
655                    println!("finished iteration");
656                    break;
657                }
658            }
659        }
660        return Ok(());
661    }
662
663    #[test]
664    #[allow(unused_mut)]
665    #[allow(unused_variables)]
666    #[allow(unused_assignments)]
667    pub fn count_aminos() -> Result<(), anyhow::Error> {
668        let file_gbk = File::open("K12_ribo.gbk")?;
669        let mut reader = Reader::new(file_gbk);
670        let mut records = reader.records();
671        let mut results: HashMap<char, u64> = HashMap::new();
672        loop {
673            match records.next() {
674                Some(Ok(record)) => {
675                    for (k, _v) in &record.cds.attributes {
676                        match record.seq_features.get_sequence_faa(&k) {
677                            Some(value) => {
678                                let seq_faa = value.to_string();
679                                println!("id: {:?}", &k);
680                                results = amino_counts(&seq_faa);
681                                println!(">{}|{}\n{:?}", &record.id, &k, results);
682                            }
683                            _ => (),
684                        };
685                    }
686                }
687                Some(Err(e)) => {
688                    println!("theres an err {:?}", e);
689                }
690                None => {
691                    println!("finished iteration");
692                    break;
693                }
694            }
695        }
696        return Ok(());
697    }
698
699    #[test]
700    #[allow(dead_code)]
701    #[allow(unused_mut)]
702    #[allow(unused_variables)]
703    #[allow(unused_assignments)]
704    pub fn aromaticity() -> Result<(), anyhow::Error> {
705        // calculated as in biopython with aromaticity according to Lobry, 1994 as the relative freq of Phe+Trp+Tyr
706        let file_gbk = File::open("K12_ribo.gbk")?;
707        let reader = Reader::new(file_gbk);
708        let mut records = reader.records();
709        let mut results: HashMap<char, f64> = HashMap::new();
710        loop {
711            match records.next() {
712                Some(Ok(record)) => {
713                    for (k, _v) in &record.cds.attributes {
714                        match record.seq_features.get_sequence_faa(&k) {
715                            Some(value) => {
716                                let seq_faa = value.to_string();
717                                results = amino_percentage(&seq_faa);
718                                let aromatic_aas = vec!['Y', 'W', 'F'];
719                                let aromaticity: f64 = aromatic_aas
720                                    .iter()
721                                    .filter_map(|&amino| results.get(&amino))
722                                    .map(|&perc| perc / 100.0)
723                                    .sum();
724                                println!(
725                                    "aromaticity for {} {} is {}",
726                                    &record.id, &k, &aromaticity
727                                );
728                            }
729                            _ => (),
730                        };
731                    }
732                }
733                Some(Err(e)) => {
734                    println!("theres an error {:?}", e);
735                }
736                None => {
737                    println!("finished iteration");
738                    break;
739                }
740            }
741        }
742        return Ok(());
743    }
744    use tokio::io::BufReader;
745    #[cfg(test)]
746    #[allow(dead_code)]
747    #[allow(unused_mut)]
748    #[allow(unused_variables)]
749    #[allow(unused_assignments)]
750    #[tokio::test]
751    pub async fn instability_test() -> Result<(), anyhow::Error> {
752        let file_gbk = File::open("K12_ribo.gbk")?;
753        let reader = Reader::new(file_gbk);
754        let mut records = reader.records();
755        let weights = load_instability("dipeptide_stability_values.csv").await?;
756        loop {
757            match records.next() {
758                Some(Ok(record)) => {
759                    for (k, _v) in &record.cds.attributes {
760                        match record.seq_features.get_sequence_faa(&k) {
761                            Some(value) => {
762                                let seq_faa = value.to_string();
763                                let result = instability_index(seq_faa, &weights).await;
764                                println!(
765                                    "instability index for {} {} is {}",
766                                    &record.id, &k, &result
767                                );
768                            }
769                            _ => (),
770                        };
771                    }
772                }
773                Some(Err(e)) => {
774                    println!("theres an error {:?}", e);
775                }
776                None => {
777                    println!("finished iteration");
778                    break;
779                }
780            }
781        }
782        return Ok(());
783    }
784    #[tokio::test]
785    pub async fn main() -> Result<(), anyhow::Error> {
786        let reader = fasta::Reader::new(File::open("test_hamming.aln")?);
787        let records: Vec<_> = reader.records().collect::<Result<_, _>>()?;
788        let sequences: Vec<String> = records
789            .iter()
790            .map(|rec| String::from_utf8_lossy(rec.seq()).to_string())
791            .collect();
792        let ids: Vec<String> = records.iter().map(|rec| rec.id().to_string()).collect();
793        let distances = hamming_matrix(&sequences).await?;
794        let _ = write_distances_csv(ids, distances, "hamming_dists.csv");
795        Ok(())
796    }
797}