Expand description
The purpose of seqmetrics is to allow calculations of protein sequence parameters We define a set of amino acid getters to allow getting of the protein sequence as either the three letter code such as Trp, Phe, the full name such as alanine, glycine or the single letter code such as Y, A
Example function to calculate transmembrane statistics
use clap::Parser;
use std::fs::File;
use microBioRust::gbk::Reader;
use std::io;
use std::collections::HashMap;
use microBioRust_seqmetrics::metrics::hydrophobicity;
pub fn suggest_transmembrane_domains() -> Result<(), anyhow::Error> {
let file_gbk = File::open("test_output.gbk")?;
let mut reader = Reader::new(file_gbk);
let mut records = reader.records();
loop {
match records.next() {
Some(Ok(mut record)) => {
//println!("next record");
//println!("Record id: {:?}", record.id);
for (k, v) in &record.cds.attributes {
match record.seq_features.get_sequence_faa(&k) {
Some(value) => { let seq_faa = value.to_string();
println!("{:?}", &seq_faa);
let hydro_values = hydrophobicity(&seq_faa, 21);
let mut result = String::new();
for hydro in hydro_values {
if hydro > 1.6 {
println!("possible transmembrane region - score {}",&hydro);
}
else {
()
}
}
},
_ => (),
};
}
},
Some(Err(e)) => { println!("theres an err {:?}", e); },
None => {
println!("finished iteration");
break;
},
};
}
return Ok(());
}
Example function to calculate the molecular weight of a protein sequence
use clap::Parser;
use std::fs::File;
use microBioRust::gbk::Reader;
use std::io;
use std::collections::HashMap;
use microBioRust_seqmetrics::metrics::molecular_weight;
pub fn collect_molecular_weight() -> Result<(), anyhow::Error> {
let file_gbk = File::open("test_output.gbk")?;
let mut reader = Reader::new(file_gbk);
let mut records = reader.records();
let mut molecular_weight_total: f64 = 0.0;
loop {
match records.next() {
Some(Ok(mut record)) => {
//println!("next record");
//println!("Record id: {:?}", record.id);
for (k, v) in &record.cds.attributes {
match record.seq_features.get_sequence_faa(&k) {
Some(value) => {
let seq_faa = value.to_string();
println!("id: {:?}", &k);
molecular_weight_total = molecular_weight(&seq_faa);
println!(">{}|{}\n{}", &record.id, &k, molecular_weight_total);
},
_ => (),
};
}
},
Some(Err(e)) => { println!("theres an err {:?}", e); },
None => {
println!("finished iteration");
break;
},
}
}
return Ok(());
}
Example function to count amino acids of each protein as raw counts, see below to generate percentages per protein
use clap::Parser;
use std::fs::File;
use microBioRust::gbk::Reader;
use std::io;
use std::collections::HashMap;
use microBioRust_seqmetrics::metrics::amino_counts;
pub fn count_aminos() -> Result<(), anyhow::Error> {
let file_gbk = File::open("test_output.gbk")?;
let mut reader = Reader::new(file_gbk);
let mut records = reader.records();
let mut results: HashMap<char, u64> = HashMap::new();
loop {
match records.next() {
Some(Ok(mut record)) => {
//println!("next record");
//println!("Record id: {:?}", record.id);
for (k, v) in &record.cds.attributes {
match record.seq_features.get_sequence_faa(&k) {
Some(value) => { let seq_faa = value.to_string();
println!("id: {:?}", &k);
results = amino_counts(&seq_faa);
println!(">{}|{}\n{:?}", &record.id, &k, results);
},
_ => (),
};
}
},
Some(Err(e)) => { println!("theres an err {:?}", e); },
None => {
println!("finished iteration");
break;
},
}
}
return Ok(());
}
Example function to calculate and print out the aromaticity of each protein
use clap::Parser;
use std::fs::File;
use microBioRust::gbk::Reader;
use std::io;
use std::collections::HashMap;
use microBioRust_seqmetrics::metrics::amino_percentage;
pub fn aromaticity() -> Result<(), anyhow::Error> {
// calculated as in biopython with aromaticity according to Lobry, 1994 as the relative freq of Phe+Trp+Tyr
let file_gbk = File::open("test_output.gbk")?;
let mut reader = Reader::new(file_gbk);
let mut records = reader.records();
let mut results: HashMap<char, f64> = HashMap::new();
loop {
match records.next() {
Some(Ok(mut record)) => {
for (k, v) in &record.cds.attributes {
match record.seq_features.get_sequence_faa(&k) {
Some(value) => { let seq_faa = value.to_string();
results = amino_percentage(&seq_faa);
let aromatic_aas = vec!['Y','W','F'];
let aromaticity: f64 = aromatic_aas.iter()
.filter_map(|&amino| results.get(&amino))
.map(|&perc| perc / 100.0)
.sum();
println!("aromaticity for {} {} is {}",&record.id, &k, &aromaticity);
},
_ => (),
};
}
},
Some(Err(e)) => { println!("theres an error {:?}", e); },
None => {
println!("finished iteration");
break;
},
}
}
return Ok(());
}
The purpose of hamming.rs is to allow calculations of the hamming distances between sequences The Hamming distance is the minimum number of substitutions required to change one string to another It is one of several string metrics for measuring the edit distance between two sequences. It does not encompass or take into account any biology It is named after the American Mathematician Richard Hamming (wikipedia) This is aimed essentially at protein fasta sequences
use microBioRust_seqmetrics::hamming::hamming_matrix;
use microBioRust_seqmetrics::write_dst_csv::write_distances_csv;
use tokio::fs::File;
use std::collections::HashMap;
use bio::io::fasta;
use tokio::io;
use tokio::io::{AsyncWriteExt, BufWriter};
#[tokio::main]
async fn main() -> Result<(), anyhow::Error> {
let reader = fasta::Reader::new(std::io::stdin());
let records: Vec<_> = reader.records().collect::<Result<_, _>>()?;
println!("gathering records");
let sequences: Vec<String> = records
.iter()
.map(|rec| String::from_utf8_lossy(rec.seq()).to_string())
.collect();
let ids: Vec<String> = records
.iter()
.map(|rec| rec.id().to_string())
.collect();
println!("gathered ids");
let distances = hamming_matrix(&sequences).await?;
write_distances_csv(ids, distances, "hamming_dists.csv").await?;
Ok(())
}