Skip to main content

holodeck_lib/commands/
eval.rs

1//! Alignment accuracy evaluation command.
2
3use std::collections::BTreeMap;
4use std::io::Write;
5use std::path::PathBuf;
6
7use anyhow::{Context, Result};
8use bstr::ByteSlice;
9use clap::Parser;
10use noodles::bam;
11
12use super::command::{Command, output_path};
13use super::common::OutputPrefixOptions;
14use crate::read_naming::{parse_encoded_pe_name, parse_encoded_se_name};
15
16/// Evaluate alignment accuracy of simulated reads.
17///
18/// Compares the true (simulated) positions of reads against their mapped
19/// positions in a BAM file.  Reports mapping accuracy, mismapping rate, and
20/// unmapped rate stratified by MAPQ bin.  Truth positions are parsed from
21/// encoded read names (default holodeck format).
22#[derive(Parser, Debug)]
23#[command(after_long_help = "EXAMPLES:\n  \
24    holodeck eval --mapped aligned.bam -o eval_results\n  \
25    holodeck eval --mapped aligned.bam --truth golden.bam -o eval_results")]
26pub struct Eval {
27    /// BAM file of mapped reads to evaluate.
28    #[arg(short = 'm', long, value_name = "BAM")]
29    pub mapped: PathBuf,
30
31    /// Optional golden BAM file with truth alignments. If omitted, truth
32    /// positions are parsed from encoded read names.
33    #[arg(long, value_name = "BAM")]
34    pub truth: Option<PathBuf>,
35
36    #[command(flatten)]
37    pub output: OutputPrefixOptions,
38
39    /// Maximum distance (in bases) between the true and mapped start positions
40    /// of a read for it to be considered correctly mapped. Uses
41    /// `|mapped_start - true_start| <= wiggle` on the same contig.
42    #[arg(long, default_value_t = 5, value_name = "INT")]
43    pub wiggle: u32,
44}
45
46/// Accuracy counts for a single MAPQ bin.
47#[derive(Debug, Default, Clone)]
48struct BinCounts {
49    /// Reads mapped to the correct position (within wiggle).
50    correct: u64,
51    /// Reads mapped to a wrong position or wrong contig.
52    mismapped: u64,
53    /// Reads that are unmapped.
54    unmapped: u64,
55    /// Total reads in this bin.
56    total: u64,
57}
58
59impl Command for Eval {
60    fn execute(&self) -> Result<()> {
61        if self.truth.is_some() {
62            log::warn!("--truth (golden BAM) is not yet implemented; using read names");
63        }
64
65        let mut reader = bam::io::reader::Builder
66            .build_from_path(&self.mapped)
67            .with_context(|| format!("Failed to open BAM: {}", self.mapped.display()))?;
68
69        let header = reader.read_header()?;
70
71        // MAPQ bins: 0, 1-9, 10-19, 20-29, 30-39, 40-49, 50-59, 60+
72        let mut bins: BTreeMap<u8, BinCounts> = BTreeMap::new();
73        let mut total_reads: u64 = 0;
74        let mut parse_failures: u64 = 0;
75
76        for result in reader.records() {
77            let record = result.with_context(|| "Failed to read BAM record")?;
78
79            // Skip secondary and supplementary alignments before counting.
80            let flags = record.flags();
81            if flags.is_secondary() || flags.is_supplementary() {
82                continue;
83            }
84            total_reads += 1;
85
86            // Get read name.
87            let name_bytes = record.name().map_or(&b""[..], |n| n.as_bytes());
88            let name = name_bytes.to_str().unwrap_or("");
89
90            // Parse truth from encoded read name.
91            let truth = parse_encoded_pe_name(name)
92                .map(|(_, r1, _)| r1)
93                .or_else(|| parse_encoded_se_name(name).map(|(_, r1)| r1));
94
95            let Some(truth) = truth else {
96                parse_failures += 1;
97                continue;
98            };
99
100            let mapq = record.mapping_quality().map_or(0, u8::from);
101            let bin_key = mapq_bin(mapq);
102            let counts = bins.entry(bin_key).or_default();
103            counts.total += 1;
104
105            if flags.is_unmapped() {
106                counts.unmapped += 1;
107                continue;
108            }
109
110            // Get mapped position.
111            let mapped_contig_idx = record.reference_sequence_id().and_then(Result::ok);
112            let mapped_pos = record.alignment_start().and_then(Result::ok).map(usize::from);
113
114            let (Some(mapped_contig_idx), Some(mapped_pos_1based)) =
115                (mapped_contig_idx, mapped_pos)
116            else {
117                counts.mismapped += 1;
118                continue;
119            };
120
121            // Resolve mapped contig name.
122            let Some((contig_name, _)) = header.reference_sequences().get_index(mapped_contig_idx)
123            else {
124                counts.mismapped += 1;
125                continue;
126            };
127            let mapped_contig = String::from_utf8_lossy(contig_name.as_ref());
128
129            // Compare truth vs mapped.
130            #[expect(clippy::cast_possible_truncation, reason = "mapped positions fit u32")]
131            let mapped_pos_u32 = mapped_pos_1based as u32;
132            let is_correct = mapped_contig == truth.contig
133                && mapped_pos_u32.abs_diff(truth.position) <= self.wiggle;
134
135            if is_correct {
136                counts.correct += 1;
137            } else {
138                counts.mismapped += 1;
139            }
140        }
141
142        if parse_failures > 0 {
143            log::warn!("{parse_failures} reads had unparseable names; skipped");
144        }
145
146        // Write results.
147        let output_file = output_path(&self.output.output, ".eval.txt");
148        let mut out = std::fs::File::create(&output_file)
149            .with_context(|| format!("Failed to create {}", output_file.display()))?;
150
151        writeln!(
152            out,
153            "mapq_bin\ttotal\tcorrect\tmismapped\tunmapped\tpct_correct\tpct_mismapped\tpct_unmapped"
154        )?;
155
156        let mut grand_total = BinCounts::default();
157        for (&bin, counts) in &bins {
158            write_bin_row(&mut out, &format_bin_label(bin), counts)?;
159            grand_total.correct += counts.correct;
160            grand_total.mismapped += counts.mismapped;
161            grand_total.unmapped += counts.unmapped;
162            grand_total.total += counts.total;
163        }
164        write_bin_row(&mut out, "ALL", &grand_total)?;
165
166        log::info!(
167            "Evaluated {total_reads} reads: {} correct, {} mismapped, {} unmapped",
168            grand_total.correct,
169            grand_total.mismapped,
170            grand_total.unmapped
171        );
172        log::info!("Results written to: {}", output_file.display());
173
174        Ok(())
175    }
176}
177
178/// Map a MAPQ value to a bin key.
179fn mapq_bin(mapq: u8) -> u8 {
180    match mapq {
181        0 => 0,
182        1..=9 => 1,
183        10..=19 => 10,
184        20..=29 => 20,
185        30..=39 => 30,
186        40..=49 => 40,
187        50..=59 => 50,
188        _ => 60,
189    }
190}
191
192/// Format a bin key as a label string.
193fn format_bin_label(bin: u8) -> String {
194    match bin {
195        0 => "0".to_string(),
196        1 => "1-9".to_string(),
197        60 => "60+".to_string(),
198        _ => format!("{}-{}", bin, bin + 9),
199    }
200}
201
202/// Write one row of the evaluation results table.
203fn write_bin_row(out: &mut impl Write, label: &str, counts: &BinCounts) -> Result<()> {
204    let total = counts.total.max(1) as f64;
205    writeln!(
206        out,
207        "{label}\t{}\t{}\t{}\t{}\t{:.2}\t{:.2}\t{:.2}",
208        counts.total,
209        counts.correct,
210        counts.mismapped,
211        counts.unmapped,
212        counts.correct as f64 / total * 100.0,
213        counts.mismapped as f64 / total * 100.0,
214        counts.unmapped as f64 / total * 100.0,
215    )?;
216    Ok(())
217}
218
219#[cfg(test)]
220mod tests {
221    use super::*;
222
223    #[test]
224    fn test_mapq_bin() {
225        assert_eq!(mapq_bin(0), 0);
226        assert_eq!(mapq_bin(5), 1);
227        assert_eq!(mapq_bin(10), 10);
228        assert_eq!(mapq_bin(15), 10);
229        assert_eq!(mapq_bin(30), 30);
230        assert_eq!(mapq_bin(60), 60);
231        assert_eq!(mapq_bin(255), 60);
232    }
233
234    #[test]
235    fn test_format_bin_label() {
236        assert_eq!(format_bin_label(0), "0");
237        assert_eq!(format_bin_label(1), "1-9");
238        assert_eq!(format_bin_label(10), "10-19");
239        assert_eq!(format_bin_label(60), "60+");
240    }
241
242    #[test]
243    fn test_write_bin_row() {
244        let counts = BinCounts { correct: 90, mismapped: 8, unmapped: 2, total: 100 };
245        let mut buf = Vec::new();
246        write_bin_row(&mut buf, "30-39", &counts).unwrap();
247        let line = String::from_utf8(buf).unwrap();
248        assert!(line.starts_with("30-39\t100\t90\t8\t2\t"));
249        assert!(line.contains("90.00"));
250    }
251}