rosella/coverage/
coverage_table.rs
1use std::{path::Path, collections::HashSet};
2
3use anyhow::{Result, anyhow};
4use ndarray::{Array, Array2, Axis};
5
6use crate::external::coverm_engine::MappingMode;
7
8
9pub struct CoverageTable {
10 pub table: Array2<f64>, pub average_depths: Vec<f64>, pub contig_names: Vec<String>, pub contig_lengths: Vec<usize>, pub sample_names: Vec<String>, pub output_path: String,
17}
18
19impl CoverageTable {
20 pub fn new(
21 table: Array2<f64>,
22 average_depths: Vec<f64>,
23 contig_names: Vec<String>,
24 contig_lengths: Vec<usize>,
25 sample_names: Vec<String>,
26 output_path: String
27 ) -> Self {
28 Self {
29 table,
30 average_depths,
31 contig_names,
32 contig_lengths,
33 sample_names,
34 output_path,
35 }
36 }
37
38 pub fn filter_by_length(&mut self, min_contig_size: usize) -> Result<HashSet<String>> {
39 let indices_to_remove = self.contig_lengths
41 .iter()
42 .enumerate()
43 .filter_map(|(index, length)| {
44 if *length < min_contig_size {
45 Some(index)
46 } else {
47 None
48 }
49 }).collect::<HashSet<_>>();
50
51 self.filter_by_index(&indices_to_remove)
52 }
53
54 pub fn get_contig_names(&self, indices: &HashSet<usize>) -> HashSet<String> {
55 let filtered_contig_names = self.contig_names
56 .iter()
57 .enumerate()
58 .filter_map(|(index, name)| {
59 if indices.contains(&index) {
60 Some(name.clone())
61 } else {
62 None
63 }
64 }).collect::<HashSet<_>>();
65
66 filtered_contig_names
67 }
68
69 pub fn filter_by_index(&mut self, indices_to_remove: &HashSet<usize>) -> Result<HashSet<String>> {
70 let new_table = self.table
72 .axis_iter(Axis(0))
73 .enumerate()
74 .filter_map(|(index, row)| {
75 if indices_to_remove.contains(&index) {
76 None
77 } else {
78 Some(row)
79 }
80 }).flat_map(|row| row.to_vec());
81 let new_n_rows = self.table.nrows() - indices_to_remove.len();
82 self.table = Array::from_iter(new_table).into_shape((new_n_rows, self.table.ncols()))?;
83
84 self.average_depths = self.average_depths
86 .iter()
87 .enumerate()
88 .filter_map(|(index, depth)| {
89 if indices_to_remove.contains(&index) {
90 None
91 } else {
92 Some(*depth)
93 }
94 }).collect::<Vec<_>>();
95
96 let filtered_contig_names = self.contig_names
97 .iter()
98 .enumerate()
99 .filter_map(|(index, name)| {
100 if indices_to_remove.contains(&index) {
101 Some(name.clone())
102 } else {
103 None
104 }
105 }).collect::<HashSet<_>>();
106 self.contig_names = self.contig_names
108 .iter()
109 .enumerate()
110 .filter_map(|(index, name)| {
111 if indices_to_remove.contains(&index) {
112 None
113 } else {
114 Some(name.clone())
115 }
116 }).collect::<Vec<_>>();
117
118 self.contig_lengths = self.contig_lengths
120 .iter()
121 .enumerate()
122 .filter_map(|(index, length)| {
123 if indices_to_remove.contains(&index) {
124 None
125 } else {
126 Some(*length)
127 }
128 }).collect::<Vec<_>>();
129
130 Ok(filtered_contig_names)
131 }
132
133 pub fn from_file<P: AsRef<Path>>(file_path: P, mode: MappingMode) -> Result<Self> {
138 match mode {
139 MappingMode::ShortBam | MappingMode::ShortRead => {
140 let mut reader = csv::ReaderBuilder::new()
141 .delimiter(b'\t')
142 .has_headers(true)
143 .from_path(&file_path)?;
144
145 let mut table = Vec::new();
146 let mut contig_names = Vec::new();
147 let mut contig_lengths = Vec::new();
148 let mut average_depths = Vec::new();
149 let mut sample_names = Vec::new();
150
151 let headers = reader.headers()?;
153 for header in headers.iter().skip(3).step_by(2) {
154 if header.contains("/") {
155 let mut sample_name = header.split("/").last().unwrap().to_string();
158 sample_name = sample_name.replace(".bam", "");
159 sample_names.push(sample_name);
160 } else {
161 sample_names.push(header.to_string());
164 }
165 }
166
167 for result in reader.records() {
168 let record = result?;
169 let mut record_iter = record.iter();
170
171 let contig_name = record_iter.next().unwrap().to_string();
172 let contig_length = record_iter.next().unwrap().parse::<usize>()?;
173 let average_depth = record_iter.next().unwrap().parse::<f64>()?;
174
175 let mut coverage = Vec::new();
176 let mut variance = Vec::new();
177
178 for (i, value) in record_iter.enumerate() {
179 if i % 2 == 0 {
180 coverage.push(value.parse::<f64>()?);
181 } else {
182 variance.push(value.parse::<f64>()?);
183 }
184 }
185
186 table.push(coverage);
187 table.push(variance);
188
189 contig_names.push(contig_name);
190 contig_lengths.push(contig_length);
191 average_depths.push(average_depth);
192 }
193
194 let table = Array2::from_shape_vec(
195 (contig_names.len(), sample_names.len() * 2),
196 table.into_iter().flatten().collect(),
197 )?;
198
199 Ok(
200 Self {
201 table,
202 average_depths,
203 contig_names,
204 contig_lengths,
205 sample_names,
206 output_path: file_path.as_ref().to_string_lossy().to_string(),
207 }
208 )
209 },
210 MappingMode::LongBam | MappingMode::LongRead => {
211 let mut reader = csv::ReaderBuilder::new()
220 .delimiter(b'\t')
221 .has_headers(true)
222 .from_path(&file_path)?;
223
224 let mut table = Vec::new();
225 let mut contig_names = Vec::new();
226 let mut contig_lengths = Vec::new();
227 let mut average_depths = Vec::new();
229 let mut sample_names = Vec::new();
230
231 let headers = reader.headers()?;
233 for header in headers.iter().skip(2).step_by(3) {
235 let mut sample_name = header.split_whitespace().next().unwrap().to_string();
237 if header.contains("/") {
238 sample_name = sample_name.split("/").last().unwrap().to_string();
241 sample_name = sample_name.replace(".bam", "");
242 sample_names.push(sample_name);
243 } else {
244 sample_names.push(sample_name.to_string());
247 }
248 }
249
250 for result in reader.records() {
251 let record = result?;
252 let mut record_iter = record.iter();
253
254 let contig_name = record_iter.next().unwrap().to_string();
255 let contig_length = record_iter.next().unwrap().parse::<usize>()?;
256
257 let mut coverage = Vec::new();
258 let mut variance = Vec::new();
259
260 for (i, value) in record_iter.enumerate() {
261 if i % 2 == 0 {
262 coverage.push(value.parse::<f64>()?);
263 } else {
264 variance.push(value.parse::<f64>()?);
265 }
266 }
267
268 let average_depth = coverage.iter().sum::<f64>() / coverage.len() as f64;
269
270 table.push(coverage);
271 table.push(variance);
272
273 contig_names.push(contig_name);
274 contig_lengths.push(contig_length);
275 average_depths.push(average_depth);
276 }
277
278 let table = Array2::from_shape_vec(
279 (contig_names.len(), sample_names.len() * 2),
280 table.into_iter().flatten().collect(),
281 )?;
282
283 Ok(
284 Self {
285 table,
286 average_depths,
287 contig_names,
288 contig_lengths,
289 sample_names,
290 output_path: file_path.as_ref().to_string_lossy().to_string(),
291 }
292 )
293 }
294 }
295 }
296
297 pub fn merge(&mut self, other: Self) -> Result<()> {
298 if &self.contig_names != &other.contig_names {
299 return Err(anyhow!(
300 "Cannot merge coverage tables with different contig names",
301 ));
302 }
303
304 self.sample_names.extend(other.sample_names);
306 self.table.append(Axis(1), other.table.view())?;
308
309
310 self.average_depths = self
312 .table
313 .axis_iter(Axis(0))
314 .map(|row| row.iter().step_by(2).sum::<f64>() / self.sample_names.len() as f64)
315 .collect();
316
317 Ok(())
318 }
319
320 pub fn merge_many(coverage_tables: Vec<CoverageTable>) -> Result<Self> {
326 let mut merged_table: Option<CoverageTable> = None;
327 for coverage_table in coverage_tables.into_iter() {
328 match &mut merged_table {
329 Some(table) => {
330 table.merge(coverage_table)?;
331 },
332 None => {
333 merged_table = Some(coverage_table);
334 }
335 }
336 }
337
338 Ok(merged_table.unwrap())
339 }
340
341 pub fn write<P: AsRef<Path>>(&mut self, output_path: P) -> Result<()> {
346
347 self.set_output_path(output_path.as_ref().to_string_lossy().to_string());
348 let mut writer = csv::WriterBuilder::new()
349 .delimiter(b'\t')
350 .from_path(output_path)?;
351
352 writer.write_field("contigName")?;
354 writer.write_field("contigLen")?;
355 writer.write_field("totalAvgDepth")?;
356 for sample_name in &self.sample_names {
357 writer.write_field(format!("{}", sample_name))?;
358 writer.write_field(format!("{}-var", sample_name))?;
359 }
360 writer.write_record(None::<&[u8]>)?;
361
362 for (((contig_name, contig_length), average_depth), row) in
364 self.contig_names.iter().zip(
365 self.contig_lengths.iter()
366 ).zip(
367 self.average_depths.iter()
368 ).zip(
369 self.table.axis_iter(Axis(0)))
370 {
371 let mut record = Vec::with_capacity(3 + self.sample_names.len() * 2);
372 record.push(format!("{}", contig_name));
373 record.push(format!("{}", contig_length));
374 record.push(format!("{:.3}", average_depth));
375 for value in row {
376 record.push(format!("{:.3}", value));
377 }
378 writer.write_record(record)?;
379 }
380 writer.flush()?;
381
382 Ok(())
383 }
384
385 pub fn set_output_path(&mut self, output_path: String) {
386 self.output_path = output_path;
387 }
388}