rsomics_compute_gc_bias/
lib.rs1#![allow(clippy::cast_possible_truncation)]
10#![allow(clippy::cast_precision_loss)]
11#![allow(clippy::cast_sign_loss)]
12
13mod twobit;
14
15use std::io::Write;
16use std::path::Path;
17
18use rsomics_bamio::raw::{self, RawRecord};
19use rsomics_common::{Result, RsomicsError};
20
21pub use twobit::TwoBit;
22
23const FLAG_UNMAPPED: u16 = 0x4;
24const FLAG_REVERSE: u16 = 0x10;
25
26pub struct GcBiasOpts {
27 pub fragment_length: u32,
28 pub effective_genome_size: u64,
29 pub sample_size: u64,
31}
32
33pub struct GcBiasTable {
35 pub f_gc: Vec<u64>,
36 pub n_gc: Vec<u64>,
37 pub r_gc: Vec<f64>,
38}
39
40struct ReadStarts {
42 starts: Vec<Vec<u32>>,
43 total_mapped: u64,
44}
45
46pub fn compute(bam: &Path, genome: &Path, opts: &GcBiasOpts) -> Result<GcBiasTable> {
47 let tbit = TwoBit::open(genome)?;
48 let genome_size = tbit.genome_size();
49 let step = (genome_size / opts.sample_size).max(1);
50
51 let mut reader = rsomics_bamio::open_with_workers(bam, std::num::NonZero::<usize>::MIN)?;
52 let header = reader.read_header().map_err(RsomicsError::Io)?;
53 let refs: Vec<(String, u32)> = header
54 .reference_sequences()
55 .iter()
56 .map(|(name, seq)| (name.to_string(), usize::from(seq.length()) as u32))
57 .collect();
58
59 let reads = collect_read_starts(reader.get_mut(), &refs)?;
60
61 let reads_per_bp = reads.total_mapped as f64 / opts.effective_genome_size as f64;
62 let max_reads = poisson_isf(
63 4.0 * reads_per_bp * f64::from(opts.fragment_length),
64 1.0 / opts.sample_size as f64,
65 );
66
67 let flen = opts.fragment_length;
68 let mut f_gc = vec![0u64; flen as usize + 1];
69 let mut n_gc = vec![0u64; flen as usize + 1];
70
71 for (tid, (name, _len)) in refs.iter().enumerate() {
72 let Some(chrom_len) = tbit.chrom_len(name) else {
73 continue;
74 };
75 let starts = &reads.starts[tid];
76 let mut i = 0u32;
77 while i < chrom_len {
78 if i + flen > chrom_len {
79 break;
80 }
81 let Some(w) = tbit.window_bases(name, i, i + flen) else {
82 i = i.saturating_add(step as u32);
83 continue;
84 };
85 if (w.non_n as f64) < 0.95 * f64::from(flen) {
87 i = i.saturating_add(step as u32);
88 continue;
89 }
90 let gc = w.gc as usize;
91 n_gc[gc] += 1;
92 let num_reads = starts.get(i as usize).copied().unwrap_or(0);
93 if u64::from(num_reads) < max_reads {
94 f_gc[gc] += u64::from(num_reads);
95 }
96 i = i.saturating_add(step as u32);
97 }
98 }
99
100 let sum_n: u64 = n_gc.iter().sum();
101 let sum_f: u64 = f_gc.iter().sum();
102 let scaling = sum_n as f64 / sum_f as f64;
103 let r_gc: Vec<f64> = (0..f_gc.len())
104 .map(|x| {
105 if n_gc[x] > 0 && f_gc[x] > 0 {
106 (f_gc[x] as f64 / n_gc[x] as f64) * scaling
107 } else {
108 1.0
109 }
110 })
111 .collect();
112
113 Ok(GcBiasTable { f_gc, n_gc, r_gc })
114}
115
116fn collect_read_starts(
120 reader: &mut impl std::io::Read,
121 refs: &[(String, u32)],
122) -> Result<ReadStarts> {
123 let mut starts: Vec<Vec<u32>> = refs
124 .iter()
125 .map(|(_, len)| vec![0u32; *len as usize])
126 .collect();
127 let mut total_mapped = 0u64;
128 let mut record = RawRecord::default();
129
130 while raw::read_record(reader, &mut record)? != 0 {
131 let flags = record.flags();
132 if flags & FLAG_UNMAPPED != 0 {
133 continue;
134 }
135 let tid = record.reference_sequence_id();
136 if tid < 0 {
137 continue;
138 }
139 total_mapped += 1;
140 if flags & FLAG_REVERSE != 0 {
141 continue;
142 }
143 let pos = record.alignment_start();
144 if pos < 0 {
145 continue;
146 }
147 if let Some(chrom) = starts.get_mut(tid as usize)
148 && let Some(slot) = chrom.get_mut(pos as usize)
149 {
150 *slot += 1;
151 }
152 }
153
154 Ok(ReadStarts {
155 starts,
156 total_mapped,
157 })
158}
159
160fn poisson_isf(mu: f64, q: f64) -> u64 {
163 if mu <= 0.0 {
164 return 0;
165 }
166 let mut pmf = (-mu).exp();
167 let mut cdf = pmf;
168 let mut k = 0u64;
169 while 1.0 - cdf > q {
171 k += 1;
172 pmf *= mu / k as f64;
173 cdf += pmf;
174 if pmf == 0.0 && 1.0 - cdf <= q {
175 break;
176 }
177 }
178 k
179}
180
181pub fn write_freq(table: &GcBiasTable, out: &mut impl Write) -> Result<()> {
184 for i in 0..table.f_gc.len() {
185 writeln!(
186 out,
187 "{} {} {}",
188 fmt_e(table.f_gc[i] as f64),
189 fmt_e(table.n_gc[i] as f64),
190 fmt_e(table.r_gc[i]),
191 )
192 .map_err(RsomicsError::Io)?;
193 }
194 Ok(())
195}
196
197fn fmt_e(v: f64) -> String {
200 let raw = format!("{v:.18e}");
201 let (mantissa, exp) = raw.split_once('e').expect("scientific format has e");
202 let exp: i32 = exp.parse().expect("exponent is an integer");
203 let sign = if exp < 0 { '-' } else { '+' };
204 format!("{mantissa}e{sign}{:02}", exp.abs())
205}