use std::{io::BufRead, path::Path};
use crate::{
helper::{
files,
types::{infer_contig_fmt_auto, ContigFmt},
},
stats::common::{CommonStats, NStats},
};
use crate::parser::fasta::FastaReader;
pub struct ContigSummary {
pub file_path: String,
pub contig_name: String,
pub contig_count: usize,
pub base_count: usize,
pub nucleotide: usize,
pub g_count: usize,
pub c_count: usize,
pub a_count: usize,
pub t_count: usize,
pub n_count: usize,
pub unknown: usize,
pub gc_content: f64,
pub at_content: f64,
pub nstats: NStats,
pub stats: CommonStats,
pub contig750: usize,
pub contig1000: usize,
pub contig1500: usize,
}
impl Default for ContigSummary {
fn default() -> Self {
Self::new()
}
}
impl ContigSummary {
pub fn new() -> Self {
Self {
file_path: String::new(),
contig_name: String::new(),
contig_count: 0,
base_count: 0,
nucleotide: 0,
g_count: 0,
c_count: 0,
a_count: 0,
t_count: 0,
n_count: 0,
unknown: 0,
gc_content: 0.0,
at_content: 0.0,
nstats: NStats::new(),
stats: CommonStats::new(),
contig750: 0,
contig1000: 0,
contig1500: 0,
}
}
pub fn summarize(&mut self, path: &Path, file_fmt: &ContigFmt) {
self.file_path = path.display().to_string();
self.contig_name = path
.file_stem()
.expect("No file name")
.to_str()
.expect("File name not valid UTF-8")
.to_string();
let contigs = self.parse_file(path, file_fmt);
self.summarize_contigs(&contigs);
}
fn parse_file(&mut self, path: &Path, file_fmt: &ContigFmt) -> Vec<usize> {
match file_fmt {
ContigFmt::Fasta => {
let mut buff = files::open_file(path);
self.count(&mut buff)
}
ContigFmt::Gzip => {
let mut buff = files::decode_gzip(path);
self.count(&mut buff)
}
ContigFmt::Auto => {
let file_fmt = infer_contig_fmt_auto(path);
self.parse_file(path, &file_fmt)
}
}
}
fn count<R: BufRead>(&mut self, buff: &mut R) -> Vec<usize> {
let reader = FastaReader::new(buff);
let mut contigs = Vec::new();
reader.into_iter().for_each(|r| {
contigs.push(r.seq.len());
r.seq.bytes().for_each(|s| match s {
b'G' | b'g' => self.g_count += 1,
b'C' | b'c' => self.c_count += 1,
b'A' | b'a' => self.a_count += 1,
b'T' | b't' => self.t_count += 1,
b'N' | b'n' => self.n_count += 1,
_ => self.unknown += 1,
})
});
contigs
}
fn summarize_contigs(&mut self, contigs: &[usize]) {
self.common_stats(contigs);
self.calculate_nstat(contigs);
self.count_contigs(contigs);
self.count_bases();
self.count_nucleotide();
self.calculate_gc_content();
self.calculate_at_content();
self.count_contig750(contigs);
self.count_contig1000(contigs);
self.count_contig1500(contigs);
}
fn count_contigs(&mut self, contigs: &[usize]) {
self.contig_count = contigs.len();
}
fn common_stats(&mut self, contigs: &[usize]) {
self.stats.calculate(contigs);
}
fn calculate_nstat(&mut self, contigs: &[usize]) {
self.nstats.calculate(contigs, self.stats.sum);
}
fn count_bases(&mut self) {
self.base_count = self.g_count + self.c_count + self.a_count + self.t_count + self.n_count;
}
fn count_nucleotide(&mut self) {
self.nucleotide = self.base_count - self.n_count;
}
fn calculate_gc_content(&mut self) {
self.gc_content = (self.g_count + self.c_count) as f64 / self.base_count as f64;
}
fn calculate_at_content(&mut self) {
self.at_content = (self.a_count + self.t_count) as f64 / self.base_count as f64;
}
fn count_contig750(&mut self, contigs: &[usize]) {
self.contig750 = contigs.iter().filter(|&c| *c > 750).count();
}
fn count_contig1000(&mut self, contigs: &[usize]) {
self.contig1000 = contigs.iter().filter(|&c| *c > 1000).count();
}
fn count_contig1500(&mut self, contigs: &[usize]) {
self.contig1500 = contigs.iter().filter(|&c| *c > 1500).count();
}
}
#[cfg(test)]
mod test {
use super::*;
#[test]
fn test_parse_fasta() {
let mut parser = ContigSummary::new();
let path = Path::new("tests/files/contigs/contigs1.fa");
let file_fmt = ContigFmt::Fasta;
parser.summarize(path, &file_fmt);
assert_eq!(parser.contig_name, "contigs1");
assert_eq!(parser.file_path, "tests/files/contigs/contigs1.fa");
assert_eq!(parser.g_count, 53);
assert_eq!(parser.c_count, 121);
assert_eq!(parser.a_count, 187);
assert_eq!(parser.t_count, 124);
assert_eq!(parser.n_count, 0);
assert_eq!(parser.unknown, 0);
assert_eq!(parser.base_count, 485);
assert_eq!(parser.nucleotide, 485);
assert_eq!(parser.contig_count, 2);
assert_eq!(parser.stats.sum, 485);
assert_eq!(parser.stats.min, 227);
assert_eq!(parser.stats.max, 258);
assert_eq!(parser.stats.mean, 242.5);
assert_eq!(parser.stats.median, 242.5);
assert_eq!(parser.contig1000, 0);
assert_eq!(parser.contig1500, 0);
assert_eq!(parser.contig750, 0);
}
}