rosella/coverage/
coverage_table.rs

1use std::{path::Path, collections::HashSet};
2
3use anyhow::{Result, anyhow};
4use ndarray::{Array, Array2, Axis};
5
6use crate::external::coverm_engine::MappingMode;
7
8
9pub struct CoverageTable {
10    pub table: Array2<f64>, // rows are contigs, columns are coverage and variance. number of columns is twice the number of samples.
11                            // the first column is the coverage, the second is the variance, the third is the coverage, the fourth is the variance, etc.
12    pub average_depths: Vec<f64>, // the average of the coverage values in each row for a contig. Order is identical to the order of the rows of table.
13    pub contig_names: Vec<String>, // same length as the rows of table. Order is identical to the order of the rows of table.
14    pub contig_lengths: Vec<usize>, // same length as the rows of table. Order is identical to the order of the rows of table.
15    pub sample_names: Vec<String>, // half the length of the columns of table. Order is identical to the order of the columns of table.
16    pub output_path: String,
17}
18
19impl CoverageTable {
20    pub fn new(
21        table: Array2<f64>,
22        average_depths: Vec<f64>,
23        contig_names: Vec<String>,
24        contig_lengths: Vec<usize>,
25        sample_names: Vec<String>,
26        output_path: String
27    ) -> Self {
28        Self {
29            table,
30            average_depths,
31            contig_names,
32            contig_lengths,
33            sample_names,
34            output_path,
35        }
36    }
37
38    pub fn filter_by_length(&mut self, min_contig_size: usize) -> Result<HashSet<String>> {
39        // find the indices of the contigs that are too small
40        let indices_to_remove = self.contig_lengths
41            .iter()
42            .enumerate()
43            .filter_map(|(index, length)| {
44                if *length < min_contig_size {
45                    Some(index)
46                } else {
47                    None
48                }
49            }).collect::<HashSet<_>>();
50
51        self.filter_by_index(&indices_to_remove)
52    }
53
54    pub fn get_contig_names(&self, indices: &HashSet<usize>) -> HashSet<String> {
55        let filtered_contig_names = self.contig_names
56            .iter()
57            .enumerate()
58            .filter_map(|(index, name)| {
59                if indices.contains(&index) {
60                    Some(name.clone())
61                } else {
62                    None
63                }
64            }).collect::<HashSet<_>>();
65
66        filtered_contig_names
67    }
68
69    pub fn filter_by_index(&mut self, indices_to_remove: &HashSet<usize>) -> Result<HashSet<String>> {
70        // remove the contigs from the table
71        let new_table = self.table
72            .axis_iter(Axis(0))
73            .enumerate()
74            .filter_map(|(index, row)| {
75                if indices_to_remove.contains(&index) {
76                    None
77                } else {
78                    Some(row)
79                }
80            }).flat_map(|row| row.to_vec());
81        let new_n_rows = self.table.nrows() - indices_to_remove.len();
82        self.table = Array::from_iter(new_table).into_shape((new_n_rows, self.table.ncols()))?;
83        
84        // remove the contigs from the average depths
85        self.average_depths = self.average_depths
86            .iter()
87            .enumerate()
88            .filter_map(|(index, depth)| {
89                if indices_to_remove.contains(&index) {
90                    None
91                } else {
92                    Some(*depth)
93                }
94            }).collect::<Vec<_>>();
95        
96        let filtered_contig_names = self.contig_names
97            .iter()
98            .enumerate()
99            .filter_map(|(index, name)| {
100                if indices_to_remove.contains(&index) {
101                    Some(name.clone())
102                } else {
103                    None
104                }
105            }).collect::<HashSet<_>>();
106        // remove the contigs from the contig names
107        self.contig_names = self.contig_names
108            .iter()
109            .enumerate()
110            .filter_map(|(index, name)| {
111                if indices_to_remove.contains(&index) {
112                    None
113                } else {
114                    Some(name.clone())
115                }
116            }).collect::<Vec<_>>();
117        
118        // remove the contigs from the contig lengths
119        self.contig_lengths = self.contig_lengths
120            .iter()
121            .enumerate()
122            .filter_map(|(index, length)| {
123                if indices_to_remove.contains(&index) {
124                    None
125                } else {
126                    Some(*length)
127                }
128            }).collect::<Vec<_>>();
129
130        Ok(filtered_contig_names)
131    }
132
133    /// read a coverage table from a file
134    /// we specify the mode as a parameter as coverm has
135    /// different output formats for different modes depending on 
136    /// short/long read inputs
137    pub fn from_file<P: AsRef<Path>>(file_path: P, mode: MappingMode) -> Result<Self> {
138        match mode {
139            MappingMode::ShortBam | MappingMode::ShortRead => {
140                let mut reader = csv::ReaderBuilder::new()
141                    .delimiter(b'\t')
142                    .has_headers(true)
143                    .from_path(&file_path)?;
144
145                let mut table = Vec::new();
146                let mut contig_names = Vec::new();
147                let mut contig_lengths = Vec::new();
148                let mut average_depths = Vec::new();
149                let mut sample_names = Vec::new();
150
151                // get sample name from header
152                let headers = reader.headers()?;
153                for header in headers.iter().skip(3).step_by(2) {
154                    if header.contains("/") {
155                        // when performing read mapping, coverm sets the column name to
156                        // {reference}/{sample}.bam
157                        let mut sample_name = header.split("/").last().unwrap().to_string();
158                        sample_name = sample_name.replace(".bam", "");
159                        sample_names.push(sample_name);
160                    } else {
161                        // when using BAM files, coverm sets the column name to
162                        // {sample}
163                        sample_names.push(header.to_string());
164                    }
165                }
166
167                for result in reader.records() {
168                    let record = result?;
169                    let mut record_iter = record.iter();
170
171                    let contig_name = record_iter.next().unwrap().to_string();
172                    let contig_length = record_iter.next().unwrap().parse::<usize>()?;
173                    let average_depth = record_iter.next().unwrap().parse::<f64>()?;
174
175                    let mut coverage = Vec::new();
176                    let mut variance = Vec::new();
177
178                    for (i, value) in record_iter.enumerate() {
179                        if i % 2 == 0 {
180                            coverage.push(value.parse::<f64>()?);
181                        } else {
182                            variance.push(value.parse::<f64>()?);
183                        }
184                    }
185
186                    table.push(coverage);
187                    table.push(variance);
188
189                    contig_names.push(contig_name);
190                    contig_lengths.push(contig_length);
191                    average_depths.push(average_depth);
192                }
193
194                let table = Array2::from_shape_vec(
195                    (contig_names.len(), sample_names.len() * 2),
196                    table.into_iter().flatten().collect(),
197                )?;
198
199                Ok(
200                    Self {
201                        table,
202                        average_depths,
203                        contig_names,
204                        contig_lengths,
205                        sample_names,
206                        output_path: file_path.as_ref().to_string_lossy().to_string(),
207                    }
208                )
209            },
210            MappingMode::LongBam | MappingMode::LongRead => {
211                // long read/bam output is different.
212                // the first column is still the contig name
213                // the second column is the contig length
214                // the third column is the sample coverage
215                // the fourth column is the sample variance
216                // but then the fifth column is the contig length again, just
217                // reclalculated for the second sample. So we need to ignore every extra
218                // length column
219                let mut reader = csv::ReaderBuilder::new()
220                    .delimiter(b'\t')
221                    .has_headers(true)
222                    .from_path(&file_path)?;
223
224                let mut table = Vec::new();
225                let mut contig_names = Vec::new();
226                let mut contig_lengths = Vec::new();
227                // we need to calculate average depths ourselves after collecting the table
228                let mut average_depths = Vec::new();
229                let mut sample_names = Vec::new();
230
231                // get sample name from header
232                let headers = reader.headers()?;
233                // skip 2 and then step by 3 to bypase length columns
234                for header in headers.iter().skip(2).step_by(3) {
235                    // split on white space to get rid of "Mean" or "Variance" in header name
236                    let mut sample_name = header.split_whitespace().next().unwrap().to_string();
237                    if header.contains("/") {
238                        // when performing read mapping, coverm sets the column name to
239                        // {reference}/{sample}.bam
240                        sample_name = sample_name.split("/").last().unwrap().to_string();
241                        sample_name = sample_name.replace(".bam", "");
242                        sample_names.push(sample_name);
243                    } else {
244                        // when using BAM files, coverm sets the column name to
245                        // {sample}
246                        sample_names.push(sample_name.to_string());
247                    }
248                }
249
250                for result in reader.records() {
251                    let record = result?;
252                    let mut record_iter = record.iter();
253
254                    let contig_name = record_iter.next().unwrap().to_string();
255                    let contig_length = record_iter.next().unwrap().parse::<usize>()?;
256
257                    let mut coverage = Vec::new();
258                    let mut variance = Vec::new();
259
260                    for (i, value) in record_iter.enumerate() {
261                        if i % 2 == 0 {
262                            coverage.push(value.parse::<f64>()?);
263                        } else {
264                            variance.push(value.parse::<f64>()?);
265                        }
266                    }
267
268                    let average_depth = coverage.iter().sum::<f64>() / coverage.len() as f64;
269
270                    table.push(coverage);
271                    table.push(variance);
272
273                    contig_names.push(contig_name);
274                    contig_lengths.push(contig_length);
275                    average_depths.push(average_depth);
276                }
277
278                let table = Array2::from_shape_vec(
279                    (contig_names.len(), sample_names.len() * 2),
280                    table.into_iter().flatten().collect(),
281                )?;
282
283                Ok(
284                    Self {
285                        table,
286                        average_depths,
287                        contig_names,
288                        contig_lengths,
289                        sample_names,
290                        output_path: file_path.as_ref().to_string_lossy().to_string(),
291                    }
292                )
293            }
294        }
295    }
296
297    pub fn merge(&mut self, other: Self) -> Result<()> {
298        if &self.contig_names != &other.contig_names {
299            return Err(anyhow!(
300                "Cannot merge coverage tables with different contig names",
301            ));
302        }
303
304        // extend the sample names
305        self.sample_names.extend(other.sample_names);
306        // merge the tables along the columns
307        self.table.append(Axis(1), other.table.view())?;
308        
309
310        // recalculate average depths
311        self.average_depths = self
312            .table
313            .axis_iter(Axis(0))
314            .map(|row| row.iter().step_by(2).sum::<f64>() / self.sample_names.len() as f64)
315            .collect();
316
317        Ok(())
318    }
319
320    /// merge multiple coverage tables into one
321    /// Make sure that the tables have the same contig names
322    /// in the same order.
323    /// We also want to be aware of sample name order.
324    /// We will also need to recalculate average depths
325    pub fn merge_many(coverage_tables: Vec<CoverageTable>) -> Result<Self> {
326        let mut merged_table: Option<CoverageTable> = None;
327        for coverage_table in coverage_tables.into_iter() {
328            match &mut merged_table {
329                Some(table) => {
330                    table.merge(coverage_table)?;
331                },
332                None => {
333                    merged_table = Some(coverage_table);
334                }
335            }
336        }
337        
338        Ok(merged_table.unwrap())
339    }
340
341    /// Write the coverage table to a file
342    /// The file will be a tab delimited file with the following columns:
343    /// contig_name, contig_length, sample1_coverage, sample1_variance, sample2_coverage, sample2_variance, ...
344    /// The first row will be a header row with the sample names
345    pub fn write<P: AsRef<Path>>(&mut self, output_path: P) -> Result<()> {
346
347        self.set_output_path(output_path.as_ref().to_string_lossy().to_string());
348        let mut writer = csv::WriterBuilder::new()
349            .delimiter(b'\t')
350            .from_path(output_path)?;
351        
352        // write header row
353        writer.write_field("contigName")?;
354        writer.write_field("contigLen")?;
355        writer.write_field("totalAvgDepth")?;
356        for sample_name in &self.sample_names {
357            writer.write_field(format!("{}", sample_name))?;
358            writer.write_field(format!("{}-var", sample_name))?;
359        }
360        writer.write_record(None::<&[u8]>)?;
361
362        // write table
363        for (((contig_name, contig_length), average_depth), row) in 
364            self.contig_names.iter().zip(
365                self.contig_lengths.iter()
366            ).zip(
367                self.average_depths.iter()
368            ).zip(
369                self.table.axis_iter(Axis(0)))
370         {
371            let mut record = Vec::with_capacity(3 + self.sample_names.len() * 2);
372            record.push(format!("{}", contig_name));
373            record.push(format!("{}", contig_length));
374            record.push(format!("{:.3}", average_depth));
375            for value in row {
376                record.push(format!("{:.3}", value));
377            }
378            writer.write_record(record)?;
379        }
380        writer.flush()?;
381
382        Ok(())
383    }
384
385    pub fn set_output_path(&mut self, output_path: String) {
386        self.output_path = output_path;
387    }
388}