use log::{error, info};
use needletail::parse_fastx_file;
use std::collections::BTreeMap;
use crate::bam_loader::gather_bam_stats_with_seed;
/// Builds a histogram of sequence lengths for a single FASTX file.
///
/// This is a convenience entry point that starts from an empty histogram; it
/// forwards to [`gather_fastx_stats_with_seed`] with no initial counts.
///
/// # Errors
///
/// Propagates whatever error [`gather_fastx_stats_with_seed`] returns.
pub fn gather_fastx_stats(filename: &str) -> Result<BTreeMap<usize, u64>, Box<dyn std::error::Error>> {
    let no_seed: Option<BTreeMap<usize, u64>> = None;
    gather_fastx_stats_with_seed(filename, no_seed)
}
/// Builds a histogram of sequence lengths for a single FASTX file, optionally
/// continuing from previously accumulated counts.
///
/// # Arguments
///
/// * `filename` - path handed to needletail's `parse_fastx_file`.
/// * `initial_counts` - histogram to keep adding to; `None` starts from empty.
///
/// # Returns
///
/// A map from sequence length (as reported by `num_bases()`) to the number of
/// records with that length, including any counts supplied in `initial_counts`.
///
/// # Errors
///
/// Returns an error if the file cannot be opened by the parser or if any
/// record fails to parse. Counts gathered up to that point are discarded.
pub fn gather_fastx_stats_with_seed(filename: &str, initial_counts: Option<BTreeMap<usize, u64>>) -> Result<BTreeMap<usize, u64>, Box<dyn std::error::Error>> {
    let mut length_histogram: BTreeMap<usize, u64> = initial_counts.unwrap_or_default();
    let mut reader = parse_fastx_file(filename)?;
    let mut records_seen: usize = 0;

    info!("Loading file \"{}\"...", filename);
    loop {
        // The reader hands out records that borrow from it, so it is driven
        // manually rather than through a `for` loop.
        let record = match reader.next() {
            Some(parsed) => parsed?,
            None => break,
        };

        *length_histogram.entry(record.num_bases()).or_insert(0) += 1;
        records_seen += 1;

        // Periodic progress message for large inputs.
        if records_seen % 1000000 == 0 {
            info!("Processed {} sequences", records_seen);
        }
    }
    info!("Finished loading file with {} sequences.", records_seen);

    Ok(length_histogram)
}
pub fn gather_multifastx_stats<T: AsRef<str> + std::fmt::Debug>(filenames: &[T]) -> Result<BTreeMap<usize, u64>, Box<dyn std::error::Error>> {
let mut hash_stats: BTreeMap<usize, u64> = BTreeMap::new();
for filename in filenames.iter() {
if filename.as_ref().ends_with(".bam") || filename.as_ref().ends_with(".sam") {
hash_stats = match gather_bam_stats_with_seed(filename.as_ref(), Some(hash_stats)) {
Ok(result) => result,
Err(e) => {
error!("Error while parsing BAM file: {:?}", filename);
error!("Error: {:?}", e);
return Err(e);
}
};
}
else {
hash_stats = match gather_fastx_stats_with_seed(filename.as_ref(), Some(hash_stats)) {
Ok(result) => result,
Err(e) => {
error!("Error while parsing FASTX file: {:?}", filename);
error!("Error: {:?}", e);
return Err(e);
}
};
}
}
Ok(hash_stats)
}
#[cfg(test)]
mod tests {
    use super::*;

    /// Expected histogram paired with the `single_string` fixture.
    fn stats_basic_fasta() -> BTreeMap<usize, u64> {
        [(1usize, 1u64)].iter().copied().collect()
    }

    /// Expected histogram paired with the `five_strings` fixture:
    /// one record each of lengths 1 through 5.
    fn stats_basic_fasta2() -> BTreeMap<usize, u64> {
        (1usize..6).map(|len| (len, 1u64)).collect()
    }

    /// Expected histogram paired with the `small_strings` fixture.
    fn stats_basic_fasta3() -> BTreeMap<usize, u64> {
        [(1usize, 3u64), (2, 2), (3, 1), (4, 2)]
            .iter()
            .copied()
            .collect()
    }

    /// Expected histogram paired with the `long_strings` fixture.
    fn stats_basic_fasta4() -> BTreeMap<usize, u64> {
        [(50usize, 2u64), (100, 2), (150, 2), (1000, 1)]
            .iter()
            .copied()
            .collect()
    }

    /// Sums the four per-fixture histograms into the histogram expected when
    /// all four fixtures are processed together.
    fn merged_expected() -> BTreeMap<usize, u64> {
        let parts = [
            stats_basic_fasta(),
            stats_basic_fasta2(),
            stats_basic_fasta3(),
            stats_basic_fasta4(),
        ];
        let mut total: BTreeMap<usize, u64> = BTreeMap::new();
        for part in parts.iter() {
            for (length, occurrences) in part.iter() {
                *total.entry(*length).or_insert(0) += occurrences;
            }
        }
        total
    }

    /// Loads one FASTX fixture and checks its histogram against `expected`.
    fn assert_single_file(path: &str, expected: BTreeMap<usize, u64>) {
        let observed = gather_fastx_stats(path).unwrap();
        assert_eq!(observed, expected);
    }

    #[test]
    fn test_basic_fasta() {
        assert_single_file("./test_data/single_string.fa", stats_basic_fasta());
    }

    #[test]
    fn test_basic_fasta2() {
        assert_single_file("./test_data/five_strings.fa", stats_basic_fasta2());
    }

    #[test]
    fn test_basic_fasta3() {
        assert_single_file("./test_data/small_strings.fa", stats_basic_fasta3());
    }

    #[test]
    fn test_basic_fasta4() {
        assert_single_file("./test_data/long_strings.fa", stats_basic_fasta4());
    }

    #[test]
    #[should_panic]
    fn test_error_handling() {
        // Unwrapping the result for this fixture is expected to panic.
        let _observed = gather_fastx_stats("./test_data/panic_file.fa").unwrap();
    }

    #[test]
    fn test_multifastx() {
        let inputs = [
            "./test_data/single_string.fa",
            "./test_data/five_strings.fa",
            "./test_data/small_strings.fa",
            "./test_data/long_strings.fa",
        ];
        let observed = gather_multifastx_stats(&inputs).unwrap();
        assert_eq!(observed, merged_expected());
    }

    #[test]
    fn test_multimixed() {
        // Same fixtures as `test_multifastx`, but two are given with
        // .sam/.bam names so the BAM loader path is exercised too.
        let inputs = [
            "./test_data/single_string.fa",
            "./test_data/five_strings.sam",
            "./test_data/small_strings.fa",
            "./test_data/long_strings.bam",
        ];
        let observed = gather_multifastx_stats(&inputs).unwrap();
        assert_eq!(observed, merged_expected());
    }

    #[test]
    #[should_panic]
    fn test_multifastx_error_handling() {
        // The second input makes the combined run fail, so unwrap panics.
        let inputs = [
            "./test_data/single_string.fa",
            "./test_data/panic_file.fa",
        ];
        let _observed = gather_multifastx_stats(&inputs).unwrap();
    }
}