microBioRust_seqmetrics/metrics.rs
1//! The purpose of seqmetrics is to allow calculations of protein sequence parameters
2//! We define a set of amino acid getters to allow getting of the protein sequence
3//! as either the three letter code such as Trp, Phe, the full name such as alanine, glycine
4//! or the single letter code such as Y, A
5//!
6//! Example function to calculate transmembrane statistics
7//!
8//! ```rust
9//!
10//! use clap::Parser;
11//! use std::fs::File;
12//! use microBioRust::gbk::Reader;
13//! use std::io;
14//! use std::collections::HashMap;
15//! use microBioRust_seqmetrics::metrics::hydrophobicity;
16//!
17//! pub fn suggest_transmembrane_domains() -> Result<(), anyhow::Error> {
18//! let file_gbk = File::open("K12_ribo.gbk")?;
19//! let mut reader = Reader::new(file_gbk);
20//! let mut records = reader.records();
21//! loop {
22//! match records.next() {
23//! Some(Ok(mut record)) => {
24//! //println!("next record");
25//! //println!("Record id: {:?}", record.id);
26//! for (k, v) in &record.cds.attributes {
27//! match record.seq_features.get_sequence_faa(&k) {
28//! Some(value) => { let seq_faa = value.to_string();
29//! println!("{:?}", &seq_faa);
30//! let hydro_values = hydrophobicity(&seq_faa, 21);
31//! let mut result = String::new();
32//! for hydro in hydro_values {
33//! if hydro > 1.6 {
34//! println!("possible transmembrane region - score {}",&hydro);
35//! }
36//! else {
37//! ()
38//! }
39//! }
40//! },
41//! _ => (),
42//! };
43//!
44//! }
45//! },
46//! Some(Err(e)) => { println!("theres an err {:?}", e); },
47//! None => {
48//! println!("finished iteration");
49//! break;
50//! },
51//! };
52//! }
53//! return Ok(());
54//! }
55//!```
56//!
57//! Example function to calculate the molecular weight of a protein sequence
58//!
59//!```rust
60//!
61//! use clap::Parser;
62//! use std::fs::File;
63//! use microBioRust::gbk::Reader;
64//! use std::io;
65//! use std::collections::HashMap;
66//! use microBioRust_seqmetrics::metrics::molecular_weight;
67//!
68//! pub fn collect_molecular_weight() -> Result<(), anyhow::Error> {
69//! let file_gbk = File::open("K12_ribo.gbk")?;
70//! let mut reader = Reader::new(file_gbk);
71//! let mut records = reader.records();
72//! let mut molecular_weight_total: f64 = 0.0;
73//! loop {
74//! match records.next() {
75//! Some(Ok(mut record)) => {
76//! //println!("next record");
77//! //println!("Record id: {:?}", record.id);
78//! for (k, v) in &record.cds.attributes {
79//! match record.seq_features.get_sequence_faa(&k) {
80//! Some(value) => {
81//! let seq_faa = value.to_string();
82//! println!("id: {:?}", &k);
83//! molecular_weight_total = molecular_weight(&seq_faa);
84//! println!(">{}|{}\n{}", &record.id, &k, molecular_weight_total);
85//! },
86//! _ => (),
87//! };
88//!
89//! }
90//! },
91//! Some(Err(e)) => { println!("theres an err {:?}", e); },
92//! None => {
93//! println!("finished iteration");
94//! break;
95//! },
96//! }
97//! }
98//! return Ok(());
99//! }
100//!```
101//!
102//! Example function to count amino acids of each protein as raw counts, see below to generate percentages per protein
103//!
104//!```rust
105//!
106//! use clap::Parser;
107//! use std::fs::File;
108//! use microBioRust::gbk::Reader;
109//! use std::io;
110//! use std::collections::HashMap;
111//! use microBioRust_seqmetrics::metrics::amino_counts;
112//!
113//! pub fn count_aminos() -> Result<(), anyhow::Error> {
114//! let file_gbk = File::open("K12_ribo.gbk")?;
115//! let mut reader = Reader::new(file_gbk);
116//! let mut records = reader.records();
117//! let mut results: HashMap<char, u64> = HashMap::new();
118//! loop {
119//! match records.next() {
120//! Some(Ok(mut record)) => {
121//! //println!("next record");
122//! //println!("Record id: {:?}", record.id);
123//! for (k, v) in &record.cds.attributes {
124//! match record.seq_features.get_sequence_faa(&k) {
125//! Some(value) => { let seq_faa = value.to_string();
126//! println!("id: {:?}", &k);
127//! results = amino_counts(&seq_faa);
128//! println!(">{}|{}\n{:?}", &record.id, &k, results);
129//! },
130//! _ => (),
131//! };
132//!
133//! }
134//! },
135//! Some(Err(e)) => { println!("theres an err {:?}", e); },
136//! None => {
137//! println!("finished iteration");
138//! break;
139//! },
140//! }
141//! }
142//! return Ok(());
143//! }
144//!```
145//! Example function to calculate and print out the aromaticity of each protein. You can do the equivalent but using the instability_index function for those calculations.
146//! The instability index is from the method by Guruprasad et al., 1990. A protein with an instability index > 40 may be unstable in the test tube, whilst one < 40 is expected
147//! to be stable. This interpretation should be taken as a guideline only.
148//!
149//!```rust
150//! use clap::Parser;
151//! use std::fs::File;
152//! use microBioRust::gbk::Reader;
153//! use std::io;
154//! use std::collections::HashMap;
155//! use microBioRust_seqmetrics::metrics::amino_percentage;
156//!
157//! pub fn aromaticity() -> Result<(), anyhow::Error> {
158//! // calculated as in biopython with aromaticity according to Lobry, 1994 as the relative freq of Phe+Trp+Tyr
159//! let file_gbk = File::open("K12_ribo.gbk")?;
160//! let mut reader = Reader::new(file_gbk);
161//! let mut records = reader.records();
162//! let mut results: HashMap<char, f64> = HashMap::new();
163//! loop {
164//! match records.next() {
165//! Some(Ok(mut record)) => {
166//! for (k, v) in &record.cds.attributes {
167//! match record.seq_features.get_sequence_faa(&k) {
168//! Some(value) => { let seq_faa = value.to_string();
169//! results = amino_percentage(&seq_faa);
170//! let aromatic_aas = vec!['Y','W','F'];
171//! let aromaticity: f64 = aromatic_aas.iter()
172//! .filter_map(|&amino| results.get(&amino))
173//! .map(|&perc| perc / 100.0)
174//! .sum();
175//! println!("aromaticity for {} {} is {}",&record.id, &k, &aromaticity);
176//! },
177//! _ => (),
178//! };
179//! }
180//! },
181//! Some(Err(e)) => { println!("theres an error {:?}", e); },
182//! None => {
183//! println!("finished iteration");
184//! break;
185//! },
186//! }
187//! }
188//! return Ok(());
189//! }
190//!```
191//! The purpose of hamming.rs is to allow calculations of the hamming distances between sequences
192//! The Hamming distance is the minimum number of substitutions required to change one string to another
193//! It is one of several string metrics for measuring the edit distance between two sequences.
194//! It does not encompass or take into account any biology
195//! It is named after the American Mathematician Richard Hamming (wikipedia)
196//! This is aimed essentially at protein fasta sequences
197//!
198//!
199//! ```
200//! use microBioRust_seqmetrics::hamming::hamming_matrix;
201//! use microBioRust_seqmetrics::write_dst_csv::write_distances_csv;
202//! use tokio::fs::File;
203//! use std::collections::HashMap;
204//! use bio::io::fasta;
205//! use tokio::io;
206//! use tokio::io::{AsyncWriteExt, BufWriter};
207//!
208//!
209//! #[tokio::main]
210//! async fn main() -> Result<(), anyhow::Error> {
211//! let reader = fasta::Reader::new(std::io::stdin());
212//! let records: Vec<_> = reader.records().collect::<Result<_, _>>()?;
213//! println!("gathering records");
214//! let sequences: Vec<String> = records
215//! .iter()
216//! .map(|rec| String::from_utf8_lossy(rec.seq()).to_string())
217//! .collect();
218//! let ids: Vec<String> = records
219//! .iter()
220//! .map(|rec| rec.id().to_string())
221//! .collect();
222//! println!("gathered ids");
223//! let distances = hamming_matrix(&sequences).await?;
224//! write_distances_csv(ids, distances, "hamming_dists.csv").await?;
225//!
226//! Ok(())
227//! }
228//! ```
229
230#![allow(unused_imports)]
231use crate::hamming::hamming_matrix;
232use crate::write_dst_csv::write_distances_csv;
233use bio::io::fasta;
234use microBioRust::gbk::Reader;
235use std::collections::HashMap;
236use std::fs::File;
237
238// Define a macro to generate the getters for each amino acid
239#[macro_export]
240macro_rules! amino_acid_getters {
241 ($struct_name:ident, $( ($field:ident, $full_name:ident, $three_letter:ident, $single_letter:ident) ),* ) => {
242 #[allow(non_snake_case)]
243 #[allow(dead_code)]
244 impl $struct_name {
245 $(
246 // Capital full name getter
247 fn $field(&self) -> f64 {
248 self.$field
249 }
250 // Full name getter
251 fn $full_name(&self) -> f64 {
252 self.$field
253 }
254 // Three-letter code getter
255 fn $three_letter(&self) -> f64 {
256 self.$field
257 }
258 // Single-letter code getter
259 fn $single_letter(&self) -> f64 {
260 self.$field
261 }
262 )*
263 }
264 };
265}
266
267#[allow(non_snake_case)]
268#[allow(dead_code)]
269pub struct MolWeights {
270 Alanine: f64,
271 Arginine: f64,
272 Asparagine: f64,
273 Aspartate: f64,
274 Cysteine: f64,
275 Glutamate: f64,
276 Glutamine: f64,
277 Glycine: f64,
278 Histidine: f64,
279 Isoleucine: f64,
280 Leucine: f64,
281 Lysine: f64,
282 Methionine: f64,
283 Phenylalanine: f64,
284 Proline: f64,
285 Serine: f64,
286 Threonine: f64,
287 Tryptophan: f64,
288 Tyrosine: f64,
289 Valine: f64,
290}
291
292#[allow(non_snake_case)]
293#[allow(dead_code)]
294impl MolWeights {
295 fn new() -> Self {
296 Self {
297 //masses from NIST chemistry webbook US Dept of commerce
298 Alanine: 89.0932, //C3H7NO2
299 Arginine: 174.2010, //C6H14N4O2
300 Asparagine: 132.1179, //C4H8N2O3
301 Aspartate: 133.1027, //C4H7NO4
302 Cysteine: 121.158, //C3H7NO2S
303 Glutamate: 147.1293, //C5H9NO4
304 Glutamine: 146.1445, //C5H10N2O3
305 Glycine: 75.0666, //C2H5NO2
306 Histidine: 155.1546, //C6H9N3O2
307 Isoleucine: 131.1729, //C6H13NO2
308 Leucine: 131.1729, //C6H13NO2
309 Lysine: 146.1876, //C6H14N2O2
310 Methionine: 149.211, //C5H11NO2S
311 Phenylalanine: 165.1891, //C9H11NO2
312 Proline: 115.1305, //C5H9NO2
313 Serine: 105.0926, //C3H7NO2
314 Threonine: 119.1192, //C4H9NO3
315 Tryptophan: 204.2252, //C11H12N2O2
316 Tyrosine: 181.1885, //C9H11NO3
317 Valine: 117.1463, //C5H11NO2
318 }
319 }
320}
321
322amino_acid_getters!(
323 MolWeights,
324 (Alanine, alanine, Ala, A),
325 (Arginine, arginine, Arg, R),
326 (Asparagine, asparagine, Asn, N),
327 (Aspartate, aspartate, Asp, D),
328 (Cysteine, cysteine, Cys, C),
329 (Glutamine, glutamine, Gln, Q),
330 (Glutamate, glutamate, Glu, E),
331 (Glycine, glycine, Gly, G),
332 (Histidine, histidine, His, H),
333 (Isoleucine, isoleucine, Ile, I),
334 (Leucine, leucine, Leu, L),
335 (Lysine, lysine, Lys, K),
336 (Methionine, methionine, Met, M),
337 (Phenylalanine, phenylalanine, Phe, F),
338 (Proline, proline, Pro, P),
339 (Serine, serine, Ser, S),
340 (Threonine, threonine, Thr, T),
341 (Tryptophan, tryptophan, Trp, W),
342 (Tyrosine, tyrosine, Tyr, Y),
343 (Valine, valine, Val, V)
344);
345
346#[allow(non_snake_case)]
347#[allow(dead_code)]
348#[allow(unused_variables)]
349pub fn molecular_weight(protein_seq: &str) -> f64 {
350 let amino_weights: MolWeights = MolWeights::new();
351 let mut total_weight = 0.0;
352 for ch in protein_seq.chars() {
353 match ch {
354 'A' => total_weight += amino_weights.A(),
355 'R' => total_weight += amino_weights.R(),
356 'N' => total_weight += amino_weights.N(),
357 'D' => total_weight += amino_weights.D(),
358 'C' => total_weight += amino_weights.C(),
359 'Q' => total_weight += amino_weights.Q(),
360 'E' => total_weight += amino_weights.E(),
361 'G' => total_weight += amino_weights.G(),
362 'H' => total_weight += amino_weights.H(),
363 'I' => total_weight += amino_weights.I(),
364 'L' => total_weight += amino_weights.L(),
365 'K' => total_weight += amino_weights.K(),
366 'M' => total_weight += amino_weights.M(),
367 'F' => total_weight += amino_weights.F(),
368 'P' => total_weight += amino_weights.P(),
369 'S' => total_weight += amino_weights.S(),
370 'T' => total_weight += amino_weights.T(),
371 'W' => total_weight += amino_weights.W(),
372 'Y' => total_weight += amino_weights.Y(),
373 'V' => total_weight += amino_weights.V(),
374 _ => continue,
375 }
376 }
377 let result_weight = total_weight - ((protein_seq.len() - 1) as f64 * 18.02);
378 result_weight
379}
380
381use tokio::io::AsyncBufReadExt;
382use tokio::io::BufReader;
383#[allow(non_snake_case)]
384#[allow(dead_code)]
385pub async fn load_instability(path: &str) -> Result<HashMap<String, f64>, anyhow::Error> {
386 let file = tokio::fs::File::open(path).await?;
387 let reader = BufReader::new(file);
388 let mut lines = reader.lines();
389 let mut weights = HashMap::new();
390 while let Some(line) = lines.next_line().await? {
391 let line = line.trim().to_string();
392 if line.is_empty() {
393 continue;
394 }
395 let parts: Vec<&str> = line.split(',').collect();
396 if parts.len() != 2 {
397 continue;
398 }
399 let key = parts[0].trim().to_string();
400 let val: f64 = parts[1]
401 .trim()
402 .replace('—', "-") // handle minus dash
403 .parse()
404 .unwrap();
405 weights.insert(key, val);
406 }
407 Ok(weights)
408}
409
410#[allow(non_snake_case)]
411#[allow(dead_code)]
412pub async fn instability_index(seq: String, weights: &HashMap<String, f64>) -> f64 {
413 let chars: Vec<char> = seq.chars().collect();
414 let mut total = 0.0;
415 for window in chars.windows(2) {
416 let pair = format!("{}{}", window[0], window[1]);
417 if let Some(val) = weights.get(&pair) {
418 total += val;
419 }
420 }
421 total
422}
423
424#[allow(non_snake_case)]
425#[allow(dead_code)]
426pub struct Hydrophobicity {
427 Alanine: f64,
428 Arginine: f64,
429 Asparagine: f64,
430 Aspartate: f64,
431 Cysteine: f64,
432 Glutamate: f64,
433 Glutamine: f64,
434 Glycine: f64,
435 Histidine: f64,
436 Isoleucine: f64,
437 Leucine: f64,
438 Lysine: f64,
439 Methionine: f64,
440 Phenylalanine: f64,
441 Proline: f64,
442 Serine: f64,
443 Threonine: f64,
444 Tryptophan: f64,
445 Tyrosine: f64,
446 Valine: f64,
447}
448
449impl Hydrophobicity {
450 #[allow(non_snake_case)]
451 fn new_KD() -> Self {
452 Self {
453 //Kyte-Doolittle values from the Qiagen resources website
454 Alanine: 1.80,
455 Arginine: -4.50,
456 Asparagine: -3.50,
457 Aspartate: -3.50,
458 Cysteine: 2.50,
459 Glutamate: -3.50,
460 Glutamine: -3.50,
461 Glycine: -0.40,
462 Histidine: -3.20,
463 Isoleucine: 4.50,
464 Leucine: 3.80,
465 Lysine: -3.90,
466 Methionine: 1.90,
467 Phenylalanine: 2.80,
468 Proline: -1.60,
469 Serine: -0.80,
470 Threonine: -0.70,
471 Tryptophan: -0.90,
472 Tyrosine: -1.30,
473 Valine: 4.20,
474 }
475 }
476}
477
478amino_acid_getters!(
479 Hydrophobicity,
480 (Alanine, alanine, Ala, A),
481 (Arginine, arginine, Arg, R),
482 (Asparagine, asparagine, Asn, N),
483 (Aspartate, aspartate, Asp, D),
484 (Cysteine, cysteine, Cys, C),
485 (Glutamine, glutamine, Gln, Q),
486 (Glutamate, glutamate, Glu, E),
487 (Glycine, glycine, Gly, G),
488 (Histidine, histidine, His, H),
489 (Isoleucine, isoleucine, Ile, I),
490 (Leucine, leucine, Leu, L),
491 (Lysine, lysine, Lys, K),
492 (Methionine, methionine, Met, M),
493 (Phenylalanine, phenylalanine, Phe, F),
494 (Proline, proline, Pro, P),
495 (Serine, serine, Ser, S),
496 (Threonine, threonine, Thr, T),
497 (Tryptophan, trytophan, Trp, W),
498 (Tyrosine, tyrosine, Tyr, Y),
499 (Valine, valine, Val, V)
500);
501
502#[allow(non_snake_case)]
503#[allow(dead_code)]
504#[allow(unused_mut)]
505#[allow(unused_variables)]
506pub fn hydrophobicity(protein_seq: &str, window_size: usize) -> Vec<f64> {
507 let mut hydrophobicity: Hydrophobicity = Hydrophobicity::new_KD();
508 let mut total_hydrophobicity: Vec<f64> = Vec::new();
509 let mut window_values: f64 = 0.0;
510 let mut windows: Vec<String> = protein_seq
511 .chars()
512 .collect::<Vec<_>>()
513 .windows(window_size)
514 .map(|window| window.iter().collect())
515 .collect();
516 for (index, window) in windows.iter().enumerate() {
517 for ch in window.chars() {
518 match ch {
519 'A' => window_values += hydrophobicity.A(),
520 'R' => window_values += hydrophobicity.R(),
521 'N' => window_values += hydrophobicity.N(),
522 'D' => window_values += hydrophobicity.D(),
523 'C' => window_values += hydrophobicity.C(),
524 'Q' => window_values += hydrophobicity.Q(),
525 'E' => window_values += hydrophobicity.E(),
526 'G' => window_values += hydrophobicity.G(),
527 'H' => window_values += hydrophobicity.H(),
528 'I' => window_values += hydrophobicity.I(),
529 'L' => window_values += hydrophobicity.L(),
530 'K' => window_values += hydrophobicity.K(),
531 'M' => window_values += hydrophobicity.M(),
532 'F' => window_values += hydrophobicity.F(),
533 'P' => window_values += hydrophobicity.P(),
534 'S' => window_values += hydrophobicity.S(),
535 'T' => window_values += hydrophobicity.T(),
536 'W' => window_values += hydrophobicity.W(),
537 'Y' => window_values += hydrophobicity.Y(),
538 'V' => window_values += hydrophobicity.V(),
539 _ => continue,
540 }
541 }
542 total_hydrophobicity.push(window_values);
543 }
544 total_hydrophobicity
545}
546
547pub fn amino_counts(protein_seq: &str) -> HashMap<char, u64> {
548 let mut counts: HashMap<char, u64> = HashMap::new();
549 for c in protein_seq.chars() {
550 *counts.entry(c).or_insert(0) += 1;
551 }
552 counts
553}
554
555#[allow(unused_mut)]
556#[allow(unused_variables)]
557#[allow(unused_assignments)]
558#[allow(dead_code)]
559pub fn amino_percentage(protein_seq: &str) -> HashMap<char, f64> {
560 let mut percentages: HashMap<char, f64> = HashMap::new();
561 let counts = amino_counts(protein_seq);
562 let seq_len: f64 = (protein_seq.len() as f64) as f64;
563 percentages = counts
564 .iter()
565 .map(|(k, &value)| {
566 let percentage = (value as f64 / seq_len) * 100.0;
567 (k.clone(), percentage)
568 })
569 .collect();
570 percentages
571}
572
573#[cfg(test)]
574mod tests {
575 use super::*;
576
577 #[test]
578 #[allow(unused_mut)]
579 #[allow(dead_code)]
580 #[allow(unused_variables)]
581 pub fn suggest_transmembrane_domains() -> Result<(), anyhow::Error> {
582 let file_gbk = File::open("K12_ribo.gbk")?;
583 let mut reader = Reader::new(file_gbk);
584 let mut records = reader.records();
585 loop {
586 match records.next() {
587 Some(Ok(mut record)) => {
588 //println!("next record");
589 //println!("Record id: {:?}", record.id);
590 for (k, v) in &record.cds.attributes {
591 match record.seq_features.get_sequence_faa(&k) {
592 Some(value) => {
593 let seq_faa = value.to_string();
594 println!("{:?}", &seq_faa);
595 let hydro_values = hydrophobicity(&seq_faa, 21);
596 let mut result = String::new();
597 for hydro in hydro_values {
598 if hydro > 1.6 {
599 println!(
600 "possible transmembrane region - score {}",
601 &hydro
602 );
603 } else {
604 ()
605 }
606 }
607 }
608 _ => (),
609 };
610 }
611 }
612 Some(Err(e)) => {
613 println!("theres an err {:?}", e);
614 }
615 None => {
616 println!("finished iteration");
617 break;
618 }
619 }
620 }
621 return Ok(());
622 }
623
624 #[test]
625 #[allow(unused_mut)]
626 #[allow(dead_code)]
627 #[allow(unused_variables)]
628 #[allow(unused_assignments)]
629 pub fn collect_molecular_weight() -> Result<(), anyhow::Error> {
630 let file_gbk = File::open("K12_ribo.gbk")?;
631 let mut reader = Reader::new(file_gbk);
632 let mut records = reader.records();
633 let mut molecular_weight_total: f64 = 0.0;
634 loop {
635 match records.next() {
636 Some(Ok(mut record)) => {
637 //println!("next record");
638 //println!("Record id: {:?}", record.id);
639 for (k, v) in &record.cds.attributes {
640 match record.seq_features.get_sequence_faa(&k) {
641 Some(value) => {
642 let seq_faa = value.to_string();
643 println!("id: {:?}", &k);
644 molecular_weight_total = molecular_weight(&seq_faa);
645 println!(">{}|{}\n{}", &record.id, &k, molecular_weight_total);
646 }
647 _ => (),
648 };
649 }
650 }
651 Some(Err(e)) => {
652 println!("theres an err {:?}", e);
653 }
654 None => {
655 println!("finished iteration");
656 break;
657 }
658 }
659 }
660 return Ok(());
661 }
662
663 #[test]
664 #[allow(unused_mut)]
665 #[allow(unused_variables)]
666 #[allow(unused_assignments)]
667 pub fn count_aminos() -> Result<(), anyhow::Error> {
668 let file_gbk = File::open("K12_ribo.gbk")?;
669 let mut reader = Reader::new(file_gbk);
670 let mut records = reader.records();
671 let mut results: HashMap<char, u64> = HashMap::new();
672 loop {
673 match records.next() {
674 Some(Ok(record)) => {
675 for (k, _v) in &record.cds.attributes {
676 match record.seq_features.get_sequence_faa(&k) {
677 Some(value) => {
678 let seq_faa = value.to_string();
679 println!("id: {:?}", &k);
680 results = amino_counts(&seq_faa);
681 println!(">{}|{}\n{:?}", &record.id, &k, results);
682 }
683 _ => (),
684 };
685 }
686 }
687 Some(Err(e)) => {
688 println!("theres an err {:?}", e);
689 }
690 None => {
691 println!("finished iteration");
692 break;
693 }
694 }
695 }
696 return Ok(());
697 }
698
699 #[test]
700 #[allow(dead_code)]
701 #[allow(unused_mut)]
702 #[allow(unused_variables)]
703 #[allow(unused_assignments)]
704 pub fn aromaticity() -> Result<(), anyhow::Error> {
705 // calculated as in biopython with aromaticity according to Lobry, 1994 as the relative freq of Phe+Trp+Tyr
706 let file_gbk = File::open("K12_ribo.gbk")?;
707 let reader = Reader::new(file_gbk);
708 let mut records = reader.records();
709 let mut results: HashMap<char, f64> = HashMap::new();
710 loop {
711 match records.next() {
712 Some(Ok(record)) => {
713 for (k, _v) in &record.cds.attributes {
714 match record.seq_features.get_sequence_faa(&k) {
715 Some(value) => {
716 let seq_faa = value.to_string();
717 results = amino_percentage(&seq_faa);
718 let aromatic_aas = vec!['Y', 'W', 'F'];
719 let aromaticity: f64 = aromatic_aas
720 .iter()
721 .filter_map(|&amino| results.get(&amino))
722 .map(|&perc| perc / 100.0)
723 .sum();
724 println!(
725 "aromaticity for {} {} is {}",
726 &record.id, &k, &aromaticity
727 );
728 }
729 _ => (),
730 };
731 }
732 }
733 Some(Err(e)) => {
734 println!("theres an error {:?}", e);
735 }
736 None => {
737 println!("finished iteration");
738 break;
739 }
740 }
741 }
742 return Ok(());
743 }
744 use tokio::io::BufReader;
745 #[cfg(test)]
746 #[allow(dead_code)]
747 #[allow(unused_mut)]
748 #[allow(unused_variables)]
749 #[allow(unused_assignments)]
750 #[tokio::test]
751 pub async fn instability_test() -> Result<(), anyhow::Error> {
752 let file_gbk = File::open("K12_ribo.gbk")?;
753 let reader = Reader::new(file_gbk);
754 let mut records = reader.records();
755 let weights = load_instability("dipeptide_stability_values.csv").await?;
756 loop {
757 match records.next() {
758 Some(Ok(record)) => {
759 for (k, _v) in &record.cds.attributes {
760 match record.seq_features.get_sequence_faa(&k) {
761 Some(value) => {
762 let seq_faa = value.to_string();
763 let result = instability_index(seq_faa, &weights).await;
764 println!(
765 "instability index for {} {} is {}",
766 &record.id, &k, &result
767 );
768 }
769 _ => (),
770 };
771 }
772 }
773 Some(Err(e)) => {
774 println!("theres an error {:?}", e);
775 }
776 None => {
777 println!("finished iteration");
778 break;
779 }
780 }
781 }
782 return Ok(());
783 }
784 #[tokio::test]
785 pub async fn main() -> Result<(), anyhow::Error> {
786 let reader = fasta::Reader::new(File::open("test_hamming.aln")?);
787 let records: Vec<_> = reader.records().collect::<Result<_, _>>()?;
788 let sequences: Vec<String> = records
789 .iter()
790 .map(|rec| String::from_utf8_lossy(rec.seq()).to_string())
791 .collect();
792 let ids: Vec<String> = records.iter().map(|rec| rec.id().to_string()).collect();
793 let distances = hamming_matrix(&sequences).await?;
794 let _ = write_distances_csv(ids, distances, "hamming_dists.csv");
795 Ok(())
796 }
797}