Skip to main content

rsomics_coverage_core/
lib.rs

1//! Genome-binned BAM read-coverage primitive — the per-bin read-counting core
2//! shared by `rsomics-bam-signal` (deeptools `bamCoverage`) and
3//! `rsomics-bam-compare` (deeptools `bamCompare`).
4//!
5//! Both deeptools tools tile each chromosome into fixed-width bins and count,
6//! per bin, the reads whose reference span overlaps it (deeptools
7//! `countReadsPerBin`). bamCoverage then normalises one BAM's bins; bamCompare
8//! combines two BAMs' bins per operation. The tiling + counting is identical
9//! between them, so it lives here once (Layer A; B never depends on B). The
10//! normalisation, bedGraph run-length emit, and two-BAM combination stay in the
11//! respective tool crates — those differ.
12//!
13//! ## Counting semantics (deeptools source parity)
14//!
15//! A read is accepted into the count when it is mapped (FLAG 0x4 clear, refID
16//! ≥ 0) and passes the [`BinFilter`] (`skip_flags` / `min_mapq`). With
17//! deeptools' defaults (`samFlag_exclude=None`, `ignoreDuplicates=False`,
18//! `minMappingQuality=None`) the filter is empty, so secondary and supplementary
19//! reads are **not** excluded — pass `skip_flags = 0x900` to match samtools-style
20//! filtering, `0x400` for duplicate-only.
21//!
22//! Fragment extent: with `extendReads=False` / `centerReads=False` (deeptools
23//! defaults) the reference span is the alignment start plus every
24//! reference-consuming CIGAR op (M/=/X/D/N), matching pysam `get_blocks()`.
25//!
26//! Bin counting: a read contributes +1 to every bin it overlaps —
27//! `s_idx = floor(fragStart / binSize)`, `e_idx = ceil(fragEnd / binSize)`. The
28//! partial last bin per chromosome is retained.
29
30// Genome coordinates and bin indices fit u64; every first-class target is
31// 64-bit (the workspace supports no 32-bit target), so u64→usize never
32// truncates. refID/pos are validated non-negative before the i32→unsigned casts.
33#![allow(clippy::cast_possible_truncation)]
34#![allow(clippy::cast_sign_loss)]
35
36use std::num::NonZero;
37use std::path::Path;
38
39use rsomics_bamio::raw::{self, RawRecord};
40use rsomics_common::{Result, RsomicsError};
41
42// CIGAR op codes (BAM packed encoding, low nibble): the reference-consuming ops.
43const CIGAR_MATCH: u8 = 0;
44const CIGAR_DELETION: u8 = 2;
45const CIGAR_SKIP: u8 = 3;
46const CIGAR_SEQ_MATCH: u8 = 7;
47const CIGAR_SEQ_MISMATCH: u8 = 8;
48
49/// Read-acceptance predicate. A read counts when none of `skip_flags` are set in
50/// its FLAG and its MAPQ is at least `min_mapq`. Both default to "accept all
51/// mapped reads", matching deeptools' empty default filter.
52#[derive(Debug, Clone, Copy, Default)]
53pub struct BinFilter {
54    /// Skip reads whose FLAG has any of these bits set. deeptools default
55    /// (`samFlag_exclude=None`, `ignoreDuplicates=False`) → 0 (no skip).
56    pub skip_flags: u16,
57    /// Minimum mapping quality. deeptools default (`minMappingQuality=None`) → 0
58    /// (no filter).
59    pub min_mapq: u8,
60}
61
62/// One chromosome's bin counts. `bins[i]` holds the read count for the half-open
63/// reference window `[i*bin_size, min((i+1)*bin_size, chrom_len))`.
64#[derive(Debug, Clone)]
65pub struct ChromBins {
66    pub name: String,
67    pub chrom_len: u64,
68    pub bins: Vec<u32>,
69}
70
71/// The result of binning one BAM: per-chromosome bin counts plus two read
72/// totals that downstream normalisation needs.
73#[derive(Debug, Clone)]
74pub struct BinnedCoverage {
75    /// Per-chromosome bin counts, in header reference order.
76    pub chroms: Vec<ChromBins>,
77    /// Reads that contributed at least one bin increment — the denominator
78    /// deeptools `bamCoverage` uses for CPM/RPKM/RPGC (reads actually placed on
79    /// the genome after the filter and the `ref_len > 0` / in-bounds checks).
80    pub total_binned: u64,
81    /// All mapped reads passing the filter, regardless of whether they landed in
82    /// a bin — the count deeptools `bamCompare` uses for its readCount scale
83    /// factor (`min(m1, m2) / mi`). With no filter this equals the BAM's mapped
84    /// read count (`bam.mapped`).
85    pub total_mapped: u64,
86}
87
88impl BinnedCoverage {
89    /// Total signal across every bin — the BPM denominator
90    /// (`sum of all bin counts`).
91    #[must_use]
92    pub fn total_signal(&self) -> u64 {
93        self.chroms
94            .iter()
95            .map(|c| c.bins.iter().map(|&b| u64::from(b)).sum::<u64>())
96            .sum()
97    }
98}
99
100/// Scan `input`, tiling each reference at `bin_size` and counting reads per bin.
101///
102/// `bin_size` is in bases (deeptools default 50). `workers` controls BGZF
103/// inflate parallelism (see [`rsomics_bamio::open_with_workers`]); it never
104/// changes the result, only the read throughput.
105pub fn compute_coverage(
106    input: &Path,
107    bin_size: u32,
108    filter: BinFilter,
109    workers: NonZero<usize>,
110) -> Result<BinnedCoverage> {
111    let mut reader = rsomics_bamio::open_with_workers(input, workers)?;
112    let header = reader.read_header().map_err(RsomicsError::Io)?;
113
114    let bin_size = u64::from(bin_size);
115    let mut chroms: Vec<ChromBins> = header
116        .reference_sequences()
117        .iter()
118        .map(|(name, seq)| {
119            let len = usize::from(seq.length()) as u64;
120            let n_bins = len.div_ceil(bin_size) as usize;
121            ChromBins {
122                name: name.to_string(),
123                chrom_len: len,
124                bins: vec![0u32; n_bins],
125            }
126        })
127        .collect();
128
129    let mut total_binned: u64 = 0;
130    let mut total_mapped: u64 = 0;
131    let mut record = RawRecord::default();
132
133    while raw::read_record(reader.get_mut(), &mut record)? != 0 {
134        let flags = record.flags();
135        // Skip unmapped (FLAG 0x4).
136        if flags & 0x4 != 0 {
137            continue;
138        }
139        if record.reference_sequence_id() < 0 {
140            continue;
141        }
142        if filter.skip_flags != 0 && (flags & filter.skip_flags) != 0 {
143            continue;
144        }
145        if filter.min_mapq > 0 && record.mapping_quality() < filter.min_mapq {
146            continue;
147        }
148
149        // Every mapped read that clears the filter is a "kept read" for the
150        // bamCompare scale factor, even if its span falls outside all bins.
151        total_mapped += 1;
152
153        let tid = record.reference_sequence_id() as usize;
154        let Some(chrom) = chroms.get_mut(tid) else {
155            continue;
156        };
157
158        // 0-based alignment start (BAM raw pos field is 0-based).
159        let start0 = record.alignment_start() as u64;
160        let ref_len: u64 = record
161            .cigar_ops()
162            .filter_map(|(kind, len)| match kind {
163                CIGAR_MATCH | CIGAR_DELETION | CIGAR_SKIP | CIGAR_SEQ_MATCH
164                | CIGAR_SEQ_MISMATCH => Some(u64::from(len)),
165                _ => None,
166            })
167            .sum();
168        if ref_len == 0 {
169            continue;
170        }
171        let frag_end = start0 + ref_len;
172
173        let s_idx = (start0 / bin_size) as usize;
174        let e_idx = (frag_end.div_ceil(bin_size) as usize).min(chrom.bins.len());
175        if s_idx >= chrom.bins.len() {
176            continue;
177        }
178        for b in &mut chrom.bins[s_idx..e_idx] {
179            *b = b.saturating_add(1);
180        }
181        total_binned += 1;
182    }
183
184    Ok(BinnedCoverage {
185        chroms,
186        total_binned,
187        total_mapped,
188    })
189}