1use std::collections::BTreeMap;
3use std::path::{Path, PathBuf};
4use std::sync::mpsc::channel;
5
6use colored::Colorize;
7use indexmap::IndexSet;
8use rayon::prelude::*;
9
10use crate::helper::finder::IDs;
11use crate::helper::sequence::SeqParser;
12use crate::helper::types::{DataType, InputFmt, TaxonRecords};
13use crate::helper::utils;
14use crate::stats::sequence::{CharMatrix, CharSummary, Completeness, SiteSummary, Sites, Taxa};
15use crate::writer::summary::{CsvWriter, SummaryWriter};
16
17pub struct AlignmentSummary<'a> {
18 input_fmt: &'a InputFmt,
19 output: &'a Path,
20 datatype: &'a DataType,
21 ntax: usize,
22 interval: usize,
23}
24
25impl<'a> AlignmentSummary<'a> {
26 pub fn new(
27 input_fmt: &'a InputFmt,
28 output: &'a Path,
29 interval: usize,
30 datatype: &'a DataType,
31 ) -> Self {
32 Self {
33 input_fmt,
34 output,
35 ntax: 0,
36 interval,
37 datatype,
38 }
39 }
40
41 pub fn summarize_all(&mut self, files: &[PathBuf], prefix: Option<&str>) {
42 self.check_datatype();
43 let spin = utils::set_spinner();
44 spin.set_message("Indexing alignments...");
45 let ids = self.get_id(files);
46 self.ntax = ids.len();
47 spin.set_message("Computing alignment stats...");
48 let mut stats: Vec<(Sites, CharMatrix, Taxa)> = self.par_get_stats(files);
49 stats.sort_by(|a, b| alphanumeric_sort::compare_path(&a.0.path, &b.0.path));
50 let (sites, dna, complete) = self.summarize_char_matrix(&stats);
51 let taxon_records = self.summarize_taxa(&ids, &stats);
52 spin.finish_with_message("Finished computing summary stats!\n");
53 let sum = SummaryWriter::new(&sites, &dna, &complete, self.datatype);
54 sum.write(self.output, prefix)
55 .expect("Failed writing to stdout");
56 let csv = CsvWriter::new(self.output, prefix, self.datatype);
57 csv.write_taxon_summary(&taxon_records)
58 .expect("Failed writing to csv");
59 csv.write_locus_summary(&stats)
60 .expect("Failed writing a per locus csv file");
61 }
62
63 pub fn summarize_locus(&mut self, files: &[PathBuf], prefix: Option<&str>) {
64 self.check_datatype();
65 let spin = utils::set_spinner();
66 spin.set_message("Computing per locus summary...");
67 files.iter().for_each(|file| {
68 let (matrix, _) = SeqParser::new(file, self.datatype).get_alignment(self.input_fmt);
69 let mut taxa = Taxa::new();
70 taxa.summarize_taxa(&matrix, self.datatype);
71 let csv = CsvWriter::new(self.output, prefix, self.datatype);
72 csv.write_per_locus_summary(file, &taxa)
73 .expect("Failed writing a taxon stats file");
74 });
75 spin.finish_with_message("Finished computing per locus summary!\n");
76 log::info!("{}", "Output".yellow());
77 log::info!("{:18}: {}", "Output dir", self.output.display());
78 }
79
80 fn summarize_taxa(
81 &self,
82 ids: &IndexSet<String>,
83 stats: &[(Sites, CharMatrix, Taxa)],
84 ) -> BTreeMap<String, TaxonRecords> {
85 let mut taxon_summary: BTreeMap<String, TaxonRecords> = BTreeMap::new();
86 stats.iter().for_each(|(_, _, taxa)| {
87 ids.iter().for_each(|id| {
88 if let Some(char_counts) = taxa.records.get(id) {
89 match taxon_summary.get_mut(id) {
90 Some(taxon) => {
91 char_counts.chars.iter().for_each(|(c, count)| {
92 *taxon.char_counts.entry(*c).or_insert(0) += count;
93 });
94 taxon.locus_counts += 1;
95 taxon.total_chars += char_counts.total_chars;
96 taxon.missing_data += char_counts.missing_data;
97 if DataType::Dna == *self.datatype {
98 taxon.gc_count += char_counts.gc_count;
99 taxon.at_count += char_counts.at_count;
100 taxon.nucleotides += char_counts.nucleotides;
101 }
102 }
103 None => {
104 let mut taxon = TaxonRecords::new();
105 taxon.char_counts = char_counts.chars.clone();
106 taxon.locus_counts = 1;
107 taxon.total_chars = char_counts.total_chars;
108 taxon.missing_data = char_counts.missing_data;
109 if DataType::Dna == *self.datatype {
110 taxon.gc_count = char_counts.gc_count;
111 taxon.at_count = char_counts.at_count;
112 taxon.nucleotides = char_counts.nucleotides;
113 }
114 taxon_summary.insert(id.to_string(), taxon);
115 }
116 }
117 }
118 });
119 });
120
121 taxon_summary
122 }
123
124 fn get_id(&mut self, files: &[PathBuf]) -> IndexSet<String> {
125 IDs::new(files, self.input_fmt, self.datatype).id_unique()
126 }
127
128 fn par_get_stats(&self, files: &[PathBuf]) -> Vec<(Sites, CharMatrix, Taxa)> {
129 let (send, rec) = channel();
130 files.par_iter().for_each_with(send, |s, file| {
131 s.send(self.get_stats(file)).unwrap();
132 });
133 rec.iter().collect()
134 }
135
136 fn check_datatype(&mut self) {
137 if let DataType::Ignore = self.datatype {
138 self.datatype = &DataType::Dna
139 }
140 }
141
142 fn get_stats(&self, path: &Path) -> (Sites, CharMatrix, Taxa) {
143 let aln = SeqParser::new(path, self.datatype);
144 let (matrix, header) = aln.get_alignment(self.input_fmt);
145 let mut dna = CharMatrix::new();
146 dna.count_chars(&matrix, &header, self.datatype);
147 let mut sites = Sites::new(path);
148 sites.get_stats(&matrix, self.datatype);
149 let mut taxa = Taxa::new();
150 taxa.summarize_taxa(&matrix, self.datatype);
151
152 (sites, dna, taxa)
153 }
154
155 fn summarize_char_matrix(
156 &self,
157 stats: &[(Sites, CharMatrix, Taxa)],
158 ) -> (SiteSummary, CharSummary, Completeness) {
159 let (sites, dna): (Vec<Sites>, Vec<CharMatrix>) =
160 stats.par_iter().map(|p| (p.0.clone(), p.1.clone())).unzip();
161 let mut sum_sites = SiteSummary::new();
162 sum_sites.summarize(&sites);
163 let mut sum_dna = CharSummary::new();
164 sum_dna.summarize(&dna, self.datatype);
165 let mut mat_comp = Completeness::new(&self.ntax, self.interval);
166 mat_comp.matrix_completeness(&dna);
167 (sum_sites, sum_dna, mat_comp)
168 }
169}