Skip to main content

segul/core/align/
summarize.rs

1//! Generate alignment summary statistics
2use std::collections::BTreeMap;
3use std::path::{Path, PathBuf};
4use std::sync::mpsc::channel;
5
6use colored::Colorize;
7use indexmap::IndexSet;
8use rayon::prelude::*;
9
10use crate::helper::finder::IDs;
11use crate::helper::sequence::SeqParser;
12use crate::helper::types::{DataType, InputFmt, TaxonRecords};
13use crate::helper::utils;
14use crate::stats::sequence::{CharMatrix, CharSummary, Completeness, SiteSummary, Sites, Taxa};
15use crate::writer::summary::{CsvWriter, SummaryWriter};
16
17pub struct AlignmentSummary<'a> {
18    input_fmt: &'a InputFmt,
19    output: &'a Path,
20    datatype: &'a DataType,
21    ntax: usize,
22    interval: usize,
23}
24
25impl<'a> AlignmentSummary<'a> {
26    pub fn new(
27        input_fmt: &'a InputFmt,
28        output: &'a Path,
29        interval: usize,
30        datatype: &'a DataType,
31    ) -> Self {
32        Self {
33            input_fmt,
34            output,
35            ntax: 0,
36            interval,
37            datatype,
38        }
39    }
40
41    pub fn summarize_all(&mut self, files: &[PathBuf], prefix: Option<&str>) {
42        self.check_datatype();
43        let spin = utils::set_spinner();
44        spin.set_message("Indexing alignments...");
45        let ids = self.get_id(files);
46        self.ntax = ids.len();
47        spin.set_message("Computing alignment stats...");
48        let mut stats: Vec<(Sites, CharMatrix, Taxa)> = self.par_get_stats(files);
49        stats.sort_by(|a, b| alphanumeric_sort::compare_path(&a.0.path, &b.0.path));
50        let (sites, dna, complete) = self.summarize_char_matrix(&stats);
51        let taxon_records = self.summarize_taxa(&ids, &stats);
52        spin.finish_with_message("Finished computing summary stats!\n");
53        let sum = SummaryWriter::new(&sites, &dna, &complete, self.datatype);
54        sum.write(self.output, prefix)
55            .expect("Failed writing to stdout");
56        let csv = CsvWriter::new(self.output, prefix, self.datatype);
57        csv.write_taxon_summary(&taxon_records)
58            .expect("Failed writing to csv");
59        csv.write_locus_summary(&stats)
60            .expect("Failed writing a per locus csv file");
61    }
62
63    pub fn summarize_locus(&mut self, files: &[PathBuf], prefix: Option<&str>) {
64        self.check_datatype();
65        let spin = utils::set_spinner();
66        spin.set_message("Computing per locus summary...");
67        files.iter().for_each(|file| {
68            let (matrix, _) = SeqParser::new(file, self.datatype).get_alignment(self.input_fmt);
69            let mut taxa = Taxa::new();
70            taxa.summarize_taxa(&matrix, self.datatype);
71            let csv = CsvWriter::new(self.output, prefix, self.datatype);
72            csv.write_per_locus_summary(file, &taxa)
73                .expect("Failed writing a taxon stats file");
74        });
75        spin.finish_with_message("Finished computing per locus summary!\n");
76        log::info!("{}", "Output".yellow());
77        log::info!("{:18}: {}", "Output dir", self.output.display());
78    }
79
80    fn summarize_taxa(
81        &self,
82        ids: &IndexSet<String>,
83        stats: &[(Sites, CharMatrix, Taxa)],
84    ) -> BTreeMap<String, TaxonRecords> {
85        let mut taxon_summary: BTreeMap<String, TaxonRecords> = BTreeMap::new();
86        stats.iter().for_each(|(_, _, taxa)| {
87            ids.iter().for_each(|id| {
88                if let Some(char_counts) = taxa.records.get(id) {
89                    match taxon_summary.get_mut(id) {
90                        Some(taxon) => {
91                            char_counts.chars.iter().for_each(|(c, count)| {
92                                *taxon.char_counts.entry(*c).or_insert(0) += count;
93                            });
94                            taxon.locus_counts += 1;
95                            taxon.total_chars += char_counts.total_chars;
96                            taxon.missing_data += char_counts.missing_data;
97                            if DataType::Dna == *self.datatype {
98                                taxon.gc_count += char_counts.gc_count;
99                                taxon.at_count += char_counts.at_count;
100                                taxon.nucleotides += char_counts.nucleotides;
101                            }
102                        }
103                        None => {
104                            let mut taxon = TaxonRecords::new();
105                            taxon.char_counts = char_counts.chars.clone();
106                            taxon.locus_counts = 1;
107                            taxon.total_chars = char_counts.total_chars;
108                            taxon.missing_data = char_counts.missing_data;
109                            if DataType::Dna == *self.datatype {
110                                taxon.gc_count = char_counts.gc_count;
111                                taxon.at_count = char_counts.at_count;
112                                taxon.nucleotides = char_counts.nucleotides;
113                            }
114                            taxon_summary.insert(id.to_string(), taxon);
115                        }
116                    }
117                }
118            });
119        });
120
121        taxon_summary
122    }
123
124    fn get_id(&mut self, files: &[PathBuf]) -> IndexSet<String> {
125        IDs::new(files, self.input_fmt, self.datatype).id_unique()
126    }
127
128    fn par_get_stats(&self, files: &[PathBuf]) -> Vec<(Sites, CharMatrix, Taxa)> {
129        let (send, rec) = channel();
130        files.par_iter().for_each_with(send, |s, file| {
131            s.send(self.get_stats(file)).unwrap();
132        });
133        rec.iter().collect()
134    }
135
136    fn check_datatype(&mut self) {
137        if let DataType::Ignore = self.datatype {
138            self.datatype = &DataType::Dna
139        }
140    }
141
142    fn get_stats(&self, path: &Path) -> (Sites, CharMatrix, Taxa) {
143        let aln = SeqParser::new(path, self.datatype);
144        let (matrix, header) = aln.get_alignment(self.input_fmt);
145        let mut dna = CharMatrix::new();
146        dna.count_chars(&matrix, &header, self.datatype);
147        let mut sites = Sites::new(path);
148        sites.get_stats(&matrix, self.datatype);
149        let mut taxa = Taxa::new();
150        taxa.summarize_taxa(&matrix, self.datatype);
151
152        (sites, dna, taxa)
153    }
154
155    fn summarize_char_matrix(
156        &self,
157        stats: &[(Sites, CharMatrix, Taxa)],
158    ) -> (SiteSummary, CharSummary, Completeness) {
159        let (sites, dna): (Vec<Sites>, Vec<CharMatrix>) =
160            stats.par_iter().map(|p| (p.0.clone(), p.1.clone())).unzip();
161        let mut sum_sites = SiteSummary::new();
162        sum_sites.summarize(&sites);
163        let mut sum_dna = CharSummary::new();
164        sum_dna.summarize(&dna, self.datatype);
165        let mut mat_comp = Completeness::new(&self.ntax, self.interval);
166        mat_comp.matrix_completeness(&dna);
167        (sum_sites, sum_dna, mat_comp)
168    }
169}