nanocov 0.1.0

Rust Coverage Calculator and QC Plot Generation Tool
Documentation
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
// src/io/statistics.rs
// Module for generating statistics output from BAM files

use crate::utils::ReadStats;
use noodles_bam as bam;
use std::fmt::Write as FmtWrite;
use std::fs::File;
use std::io::{BufReader, Write};
use std::path::{Path, PathBuf};
use std::time::{Instant, SystemTime};

/// Format for statistics output
pub struct StatisticsOutput {
    pub file_name: String,
    pub num_alignments: u64,
    pub percent_from_total: f64,
    pub num_reads: u64,
    pub yield_gb: f64,
    pub mean_coverage: f64,
    pub yield_gb_greater_than_25kb: f64,
    pub n50: u32,
    pub n75: u32,
    pub median_length: f64,
    pub mean_length: f64,
    pub path: PathBuf,
    pub creation_time: String,
}

impl StatisticsOutput {
    /// Create a new StatisticsOutput struct with all zeros/empty values
    pub fn new_empty(path: PathBuf) -> Self {
        let file_name = path
            .file_name()
            .map(|s| s.to_string_lossy().to_string())
            .unwrap_or_default();

        // Format creation time as dd/mm/yyyy hh:mm:ss
        let creation_time = match path.metadata() {
            Ok(metadata) => {
                if let Ok(modified_time) = metadata.modified() {
                    if let Ok(duration) = modified_time.duration_since(SystemTime::UNIX_EPOCH) {
                        let secs = duration.as_secs();
                        let naive_time =
                            chrono::DateTime::<chrono::Utc>::from_timestamp(secs as i64, 0)
                                .unwrap_or_else(|| {
                                    chrono::DateTime::<chrono::Utc>::from_timestamp(0, 0).unwrap()
                                })
                                .naive_utc();
                        naive_time.format("%d/%m/%Y %H:%M:%S").to_string()
                    } else {
                        String::from("01/01/2000 00:00:00") // Fallback if time calculation fails
                    }
                } else {
                    String::from("01/01/2000 00:00:00") // Fallback if modified time can't be read
                }
            }
            Err(_) => String::from("01/01/2000 00:00:00"), // Fallback if metadata can't be read
        };

        Self {
            file_name,
            num_alignments: 0,
            percent_from_total: 0.0,
            num_reads: 0,
            yield_gb: 0.0,
            mean_coverage: 0.0,
            yield_gb_greater_than_25kb: 0.0,
            n50: 0,
            n75: 0,
            median_length: 0.0,
            mean_length: 0.0,
            path,
            creation_time,
        }
    }

    /// Create a new StatisticsOutput struct from read statistics and BAM path
    pub fn from_read_stats(path: PathBuf, read_stats: &ReadStats, mean_coverage: f64) -> Self {
        let mut result = Self::new_empty(path.clone());

        // Calculate N75
        // (We assume the ReadStats already has calculated N50 and sorted lengths)
        let n75 = if let Some(lengths) = &read_stats.lengths {
            if lengths.is_empty() {
                0
            } else {
                let total: u64 = lengths.iter().map(|&l| l as u64).sum();
                let mut acc = 0;
                let mut n75 = 0;
                for &l in lengths {
                    acc += l as u64;
                    if acc >= (total * 3) / 4 {
                        // 75% of total length
                        n75 = l;
                        break;
                    }
                }
                n75
            }
        } else {
            0
        };

        // Calculate yield_gb and yield_gb_greater_than_25kb
        let total_bases = read_stats.num_bases;
        let yield_gb = total_bases as f64 / 1_000_000_000.0;

        let total_bases_gt_25kb = if let Some(lengths) = &read_stats.lengths {
            lengths
                .iter()
                .filter(|&&l| l > 25_000)
                .map(|&l| l as u64)
                .sum::<u64>()
        } else {
            0
        };
        let yield_gb_gt_25kb = total_bases_gt_25kb as f64 / 1_000_000_000.0;

        result.num_alignments = read_stats.num_reads;
        result.percent_from_total = 100.0; // Assume all alignments are counted
        result.num_reads = read_stats.num_reads;
        result.yield_gb = yield_gb;
        result.mean_coverage = mean_coverage;
        result.yield_gb_greater_than_25kb = yield_gb_gt_25kb;
        result.n50 = read_stats.n50;
        result.n75 = n75;
        result.median_length = read_stats.median_len;
        result.mean_length = read_stats.mean_len;

        result
    }

    pub fn header() -> &'static str {
        "file_name\tnum_alignments\tpercent_from_total\tnum_reads\tyield_gb\tmean_coverage\tyield_gb_gt_25kb\tn50\tn75\tmedian_length\tmean_length\tpath\tcreation_time"
    }

    pub fn row(&self) -> String {
        format!(
            "{}\t{}\t{:.2}\t{}\t{:.2}\t{:.2}\t{:.2}\t{}\t{}\t{:.2}\t{:.2}\t{}\t{}",
            self.file_name,
            self.num_alignments,
            self.percent_from_total,
            self.num_reads,
            self.yield_gb,
            self.mean_coverage,
            self.yield_gb_greater_than_25kb,
            self.n50,
            self.n75,
            self.median_length,
            self.mean_length,
            self.path.display(),
            self.creation_time
        )
    }

    /// Format as a row-wise TSV (header + single data row)
    pub fn format(&self) -> String {
        let mut output = String::new();

        writeln!(&mut output, "{}", Self::header()).unwrap();
        writeln!(&mut output, "{}", self.row()).unwrap();

        output
    }

    /// Write to a file
    pub fn write_to_file(&self, output_path: &Path) -> Result<(), Box<dyn std::error::Error>> {
        let mut file = File::create(output_path)?;
        file.write_all(self.format().as_bytes())?;
        Ok(())
    }
}

/// Enhanced version of ReadStats to include more information needed for statistics output
pub struct EnhancedReadStats {
    pub n50: u32,
    pub n75: u32,
    pub mean_len: f64,
    pub median_len: f64,
    pub _mean_qual: f64,
    pub _median_qual: f64,
    pub num_reads: u64,
    pub num_bases: u64,
    pub lengths: Option<Vec<u32>>, // Store lengths for additional calculations
}

impl From<&ReadStats> for EnhancedReadStats {
    fn from(stats: &ReadStats) -> Self {
        // Convert basic ReadStats to EnhancedReadStats
        // Note: Some fields like num_reads, num_bases, and lengths
        // need to be populated separately
        Self {
            n50: stats.n50,
            n75: 0, // Will be calculated later
            mean_len: stats.mean_len,
            median_len: stats.median_len,
            _mean_qual: stats.mean_qual,
            _median_qual: stats.median_qual,
            num_reads: 0,  // Will be calculated later
            num_bases: 0,  // Will be calculated later
            lengths: None, // Will be populated later
        }
    }
}

/// Extract enhanced read statistics from a BAM file
#[hotpath::measure]
pub fn extract_enhanced_read_stats(
    bam_path: &Path,
) -> Result<EnhancedReadStats, Box<dyn std::error::Error>> {
    let timer = Instant::now();
    let mut reader = bam::io::Reader::new(BufReader::new(File::open(bam_path)?));
    let _header = reader.read_header()?;
    let mut lengths = Vec::new();
    let mut quals = Vec::new();
    let mut num_reads: u64 = 0;
    let mut num_bases: u64 = 0;

    for result in reader.records() {
        let record = result?;
        let len = record.sequence().len() as u32;
        lengths.push(len);
        num_reads += 1;
        num_bases += len as u64;

        // Collect mean quality per read (if available)
        let qual = record.quality_scores();
        let qual_slice = qual.as_ref();
        if !qual_slice.is_empty() {
            let q: f64 =
                qual_slice.iter().map(|&q| q as u32).sum::<u32>() as f64 / qual_slice.len() as f64;
            quals.push(q);
        }
    }
    let read_elapsed = timer.elapsed();

    // N50 calculation
    let sort_timer = Instant::now();
    lengths.sort_unstable_by(|a, b| b.cmp(a));
    let total: u64 = lengths.iter().map(|&l| l as u64).sum();
    let mut acc = 0;
    let mut n50 = 0;
    for &l in &lengths {
        acc += l as u64;
        if acc >= total / 2 {
            n50 = l;
            break;
        }
    }

    // N75 calculation
    let mut acc = 0;
    let mut n75 = 0;
    for &l in &lengths {
        acc += l as u64;
        if acc >= (total * 3) / 4 {
            n75 = l;
            break;
        }
    }

    // Mean/median length
    let mean_len = if lengths.is_empty() {
        0.0
    } else {
        lengths.iter().sum::<u32>() as f64 / lengths.len() as f64
    };
    let median_len = if lengths.is_empty() {
        0.0
    } else {
        let mid = lengths.len() / 2;
        if lengths.len() % 2 == 0 {
            (lengths[mid - 1] + lengths[mid]) as f64 / 2.0
        } else {
            lengths[mid] as f64
        }
    };

    // Mean/median quality
    let mean_qual = if quals.is_empty() {
        0.0
    } else {
        quals.iter().sum::<f64>() / quals.len() as f64
    };
    let median_qual = if quals.is_empty() {
        0.0
    } else {
        let mut sorted = quals.clone();
        sorted.sort_by(|a, b| a.partial_cmp(b).unwrap());
        let mid = sorted.len() / 2;
        if sorted.len() % 2 == 0 {
            (sorted[mid - 1] + sorted[mid]) / 2.0
        } else {
            sorted[mid]
        }
    };
    eprintln!(
        "[hotpath] statistics: read {:.2?} ({} reads), stats {:.2?}",
        read_elapsed,
        num_reads,
        sort_timer.elapsed()
    );

    Ok(EnhancedReadStats {
        n50,
        n75,
        mean_len,
        median_len,
        _mean_qual: mean_qual,
        _median_qual: median_qual,
        num_reads,
        num_bases,
        lengths: Some(lengths), // Store lengths for additional calculations
    })
}

/// Generate statistics output for a BAM file
#[hotpath::measure]
pub fn generate_statistics_output(
    bam_path: &Path,
    output_path: &Path,
    read_stats: Option<&ReadStats>,
    mean_coverage: f64,
) -> Result<StatisticsOutput, Box<dyn std::error::Error>> {
    let timer = Instant::now();
    let statistics_output = build_statistics_output(bam_path, read_stats, mean_coverage)?;

    statistics_output.write_to_file(output_path)?;
    eprintln!(
        "[hotpath] statistics output total: {:.2?} (alignments={})",
        timer.elapsed(),
        statistics_output.num_alignments
    );
    Ok(statistics_output)
}

fn build_statistics_output(
    bam_path: &Path,
    read_stats: Option<&ReadStats>,
    mean_coverage: f64,
) -> Result<StatisticsOutput, Box<dyn std::error::Error>> {
    if let Some(stats) = read_stats {
        // Use provided stats if available
        let _enhanced_stats = EnhancedReadStats::from(stats);
        return Ok(StatisticsOutput::from_read_stats(
            bam_path.to_path_buf(),
            stats,
            mean_coverage,
        ));
    }

    // Otherwise extract stats from BAM
    let enhanced_stats = extract_enhanced_read_stats(bam_path)?;
    let path_buf = bam_path.to_path_buf();
    if enhanced_stats.num_reads == 0 {
        // Empty BAM file
        return Ok(StatisticsOutput::new_empty(path_buf));
    }

    let creation_time = match bam_path.metadata() {
        Ok(metadata) => {
            if let Ok(modified_time) = metadata.modified() {
                if let Ok(duration) = modified_time.duration_since(SystemTime::UNIX_EPOCH) {
                    let secs = duration.as_secs();
                    let naive_time =
                        chrono::DateTime::<chrono::Utc>::from_timestamp(secs as i64, 0)
                            .unwrap_or_else(|| {
                                chrono::DateTime::<chrono::Utc>::from_timestamp(0, 0).unwrap()
                            })
                            .naive_utc();
                    naive_time.format("%d/%m/%Y %H:%M:%S").to_string()
                } else {
                    String::from("01/01/2000 00:00:00")
                }
            } else {
                String::from("01/01/2000 00:00:00")
            }
        }
        Err(_) => String::from("01/01/2000 00:00:00"),
    };

    Ok(StatisticsOutput {
        file_name: path_buf
            .file_name()
            .map(|s| s.to_string_lossy().to_string())
            .unwrap_or_default(),
        num_alignments: enhanced_stats.num_reads,
        percent_from_total: 100.0,
        num_reads: enhanced_stats.num_reads,
        yield_gb: enhanced_stats.num_bases as f64 / 1_000_000_000.0,
        mean_coverage,
        yield_gb_greater_than_25kb: if let Some(ref lengths) = enhanced_stats.lengths {
            let total_gt_25kb: u64 = lengths
                .iter()
                .filter(|&&l| l > 25_000)
                .map(|&l| l as u64)
                .sum();
            total_gt_25kb as f64 / 1_000_000_000.0
        } else {
            0.0
        },
        n50: enhanced_stats.n50,
        n75: enhanced_stats.n75,
        median_length: enhanced_stats.median_len,
        mean_length: enhanced_stats.mean_len,
        path: path_buf,
        creation_time,
    })
}

pub fn write_batch_statistics(
    outputs: &[(StatisticsOutput, Option<std::path::PathBuf>, String)],
    output_path: &Path,
) -> Result<(), Box<dyn std::error::Error>> {
    if let Some(parent) = output_path.parent()
        && !parent.as_os_str().is_empty()
    {
        std::fs::create_dir_all(parent)?;
    }

    let mut file = File::create(output_path)?;
    writeln!(file, "prefix\tbed_path\t{}", StatisticsOutput::header())?;

    for (entry, bed_path, prefix) in outputs {
        let bed_cell = bed_path
            .as_ref()
            .map(|p| p.display().to_string())
            .unwrap_or_else(|| "-".to_string());
        writeln!(file, "{prefix}\t{bed_cell}\t{}", entry.row())?;
    }

    Ok(())
}