Skip to main content

rsomics_compute_gc_bias/
lib.rs

1//! GC-bias estimation for a BAM over a 2bit genome — deeptools computeGCBias.
2//!
3//! For each genomic position sampled at `stepSize`, the fragment-length window
4//! contributes its G+C count to `n_gc` (expected, from the genome) and the
5//! number of forward reads starting at that position to `f_gc` (observed). The
6//! per-GC ratio `r_gc = (f/n) * (sum n / sum f)` is the bias estimate
7//! (Benjamini & Speed 2012, doi:10.1093/nar/gks001).
8
9#![allow(clippy::cast_possible_truncation)]
10#![allow(clippy::cast_precision_loss)]
11#![allow(clippy::cast_sign_loss)]
12
13mod twobit;
14
15use std::io::Write;
16use std::path::Path;
17
18use rsomics_bamio::raw::{self, RawRecord};
19use rsomics_common::{Result, RsomicsError};
20
21pub use twobit::TwoBit;
22
23const FLAG_UNMAPPED: u16 = 0x4;
24const FLAG_REVERSE: u16 = 0x10;
25
26pub struct GcBiasOpts {
27    pub fragment_length: u32,
28    pub effective_genome_size: u64,
29    /// deeptools default 5e7; sets `stepSize = max(genome_size / sample_size, 1)`.
30    pub sample_size: u64,
31}
32
33/// One row per GC count 0..=fragment_length.
34pub struct GcBiasTable {
35    pub f_gc: Vec<u64>,
36    pub n_gc: Vec<u64>,
37    pub r_gc: Vec<f64>,
38}
39
40/// Per-chromosome forward-read start histogram plus the mapped-read total.
41struct ReadStarts {
42    starts: Vec<Vec<u32>>,
43    total_mapped: u64,
44}
45
46pub fn compute(bam: &Path, genome: &Path, opts: &GcBiasOpts) -> Result<GcBiasTable> {
47    let tbit = TwoBit::open(genome)?;
48    let genome_size = tbit.genome_size();
49    let step = (genome_size / opts.sample_size).max(1);
50
51    let mut reader = rsomics_bamio::open_with_workers(bam, std::num::NonZero::<usize>::MIN)?;
52    let header = reader.read_header().map_err(RsomicsError::Io)?;
53    let refs: Vec<(String, u32)> = header
54        .reference_sequences()
55        .iter()
56        .map(|(name, seq)| (name.to_string(), usize::from(seq.length()) as u32))
57        .collect();
58
59    let reads = collect_read_starts(reader.get_mut(), &refs)?;
60
61    let reads_per_bp = reads.total_mapped as f64 / opts.effective_genome_size as f64;
62    let max_reads = poisson_isf(
63        4.0 * reads_per_bp * f64::from(opts.fragment_length),
64        1.0 / opts.sample_size as f64,
65    );
66
67    let flen = opts.fragment_length;
68    let mut f_gc = vec![0u64; flen as usize + 1];
69    let mut n_gc = vec![0u64; flen as usize + 1];
70
71    for (tid, (name, _len)) in refs.iter().enumerate() {
72        let Some(chrom_len) = tbit.chrom_len(name) else {
73            continue;
74        };
75        let starts = &reads.starts[tid];
76        let mut i = 0u32;
77        while i < chrom_len {
78            if i + flen > chrom_len {
79                break;
80            }
81            let Some(w) = tbit.window_bases(name, i, i + flen) else {
82                i = i.saturating_add(step as u32);
83                continue;
84            };
85            // deeptools skips windows with > 5% Ns
86            if (w.non_n as f64) < 0.95 * f64::from(flen) {
87                i = i.saturating_add(step as u32);
88                continue;
89            }
90            let gc = w.gc as usize;
91            n_gc[gc] += 1;
92            let num_reads = starts.get(i as usize).copied().unwrap_or(0);
93            if u64::from(num_reads) < max_reads {
94                f_gc[gc] += u64::from(num_reads);
95            }
96            i = i.saturating_add(step as u32);
97        }
98    }
99
100    let sum_n: u64 = n_gc.iter().sum();
101    let sum_f: u64 = f_gc.iter().sum();
102    let scaling = sum_n as f64 / sum_f as f64;
103    let r_gc: Vec<f64> = (0..f_gc.len())
104        .map(|x| {
105            if n_gc[x] > 0 && f_gc[x] > 0 {
106                (f_gc[x] as f64 / n_gc[x] as f64) * scaling
107            } else {
108                1.0
109            }
110        })
111        .collect();
112
113    Ok(GcBiasTable { f_gc, n_gc, r_gc })
114}
115
116/// Build, per reference, the count of forward (non-reverse) reads starting at
117/// each position. deeptools counts a read at GC position `i` iff its `pos == i`
118/// and it is forward and mapped.
119fn collect_read_starts(
120    reader: &mut impl std::io::Read,
121    refs: &[(String, u32)],
122) -> Result<ReadStarts> {
123    let mut starts: Vec<Vec<u32>> = refs
124        .iter()
125        .map(|(_, len)| vec![0u32; *len as usize])
126        .collect();
127    let mut total_mapped = 0u64;
128    let mut record = RawRecord::default();
129
130    while raw::read_record(reader, &mut record)? != 0 {
131        let flags = record.flags();
132        if flags & FLAG_UNMAPPED != 0 {
133            continue;
134        }
135        let tid = record.reference_sequence_id();
136        if tid < 0 {
137            continue;
138        }
139        total_mapped += 1;
140        if flags & FLAG_REVERSE != 0 {
141            continue;
142        }
143        let pos = record.alignment_start();
144        if pos < 0 {
145            continue;
146        }
147        if let Some(chrom) = starts.get_mut(tid as usize)
148            && let Some(slot) = chrom.get_mut(pos as usize)
149        {
150            *slot += 1;
151        }
152    }
153
154    Ok(ReadStarts {
155        starts,
156        total_mapped,
157    })
158}
159
160/// scipy `poisson(mu).isf(q)`: the smallest integer k with P(X > k) <= q.
161/// Computed from the exact PMF recurrence; mu here is small (a few tens).
162fn poisson_isf(mu: f64, q: f64) -> u64 {
163    if mu <= 0.0 {
164        return 0;
165    }
166    let mut pmf = (-mu).exp();
167    let mut cdf = pmf;
168    let mut k = 0u64;
169    // sf(k) = 1 - cdf(k); stop at the first k whose tail probability <= q
170    while 1.0 - cdf > q {
171        k += 1;
172        pmf *= mu / k as f64;
173        cdf += pmf;
174        if pmf == 0.0 && 1.0 - cdf <= q {
175            break;
176        }
177    }
178    k
179}
180
181/// deeptools writes the table with numpy `savetxt` defaults: three
182/// space-separated columns in `%.18e`, one row per GC count.
183pub fn write_freq(table: &GcBiasTable, out: &mut impl Write) -> Result<()> {
184    for i in 0..table.f_gc.len() {
185        writeln!(
186            out,
187            "{} {} {}",
188            fmt_e(table.f_gc[i] as f64),
189            fmt_e(table.n_gc[i] as f64),
190            fmt_e(table.r_gc[i]),
191        )
192        .map_err(RsomicsError::Io)?;
193    }
194    Ok(())
195}
196
197/// numpy savetxt default `%.18e`: 18 fractional digits, signed exponent of at
198/// least two digits — Rust's `{:e}` drops the sign and zero-padding, so rebuild.
199fn fmt_e(v: f64) -> String {
200    let raw = format!("{v:.18e}");
201    let (mantissa, exp) = raw.split_once('e').expect("scientific format has e");
202    let exp: i32 = exp.parse().expect("exponent is an integer");
203    let sign = if exp < 0 { '-' } else { '+' };
204    format!("{mantissa}e{sign}{:02}", exp.abs())
205}