rsomics_read_nvc/lib.rs
1//! Per-cycle nucleotide composition (NVC) from a BAM file.
2//!
3//! For each read cycle position (0-based, 0..read_length-1), counts how many
4//! reads have each nucleotide (A, C, G, T, N, X) at that position, where X is
5//! any base not in {A, C, G, T, N}.
6//!
7//! Reverse-strand reads (FLAG 0x10) are reverse-complemented before counting,
8//! so position 0 always corresponds to the first sequenced base.
9//!
10//! ## Filters applied
11//!
12//! - Skip unmapped reads (FLAG 0x0004).
13//! - Skip QC-fail reads (FLAG 0x0200).
14//! - Skip reads with MAPQ < `mapq_cut` (default 30).
15//! - Secondary and supplementary reads are not filtered.
16//!
17//! ## Output format
18//!
19//! `<prefix>.NVC.xls`: tab-separated, columns `Position`, `A`, `C`, `G`, `T`,
20//! `N`, `X`. Each count value has a leading space (e.g. `" 10991"`), matching
21//! the exact byte format of `RSeQC` `read_NVC.py`. Each data row ends with a
22//! trailing tab before the newline. The header row has no leading spaces.
23//!
24//! ## Origin
25//!
26//! This crate is an independent Rust reimplementation of `RSeQC`
27//! `read_NVC.py` based on:
28//! - The published method: Wang et al. 2012 <https://doi.org/10.1093/bioinformatics/bts356>
29//! - The public SAM/BAM format specification
30//! - Black-box behaviour testing against `RSeQC` 5.0.4
31//! (`read_NVC.py` — GPL-v3; source not read; clean-room implementation)
32//!
33//! No source code from the GPL upstream was used as reference during
34//! implementation. Test fixtures are independently generated.
35//!
36//! License: MIT OR Apache-2.0.
37//! Upstream credit: `RSeQC` <https://rseqc.sourceforge.net/> (GPL-v3).
38
39use std::fs::File;
40use std::io::{BufWriter, Write};
41use std::num::NonZero;
42use std::path::Path;
43
44use rsomics_bamio::raw::{self, RawRecord};
45use rsomics_common::{Result, RsomicsError};
46
47// SAM flag bits.
48const FLAG_UNMAPPED: u16 = 0x0004;
49const FLAG_REVERSE: u16 = 0x0010;
50const FLAG_QCFAIL: u16 = 0x0200;
51
52/// BAM 4-bit nibble codes for each nucleotide.
53/// Nibble encoding: 1=A, 2=C, 4=G, 8=T, 15=N, others=ambiguous.
54const NIBBLE_A: u8 = 1;
55const NIBBLE_C: u8 = 2;
56const NIBBLE_G: u8 = 4;
57const NIBBLE_T: u8 = 8;
58const NIBBLE_N: u8 = 15;
59
60/// Complement of a BAM nibble-encoded base.
61///
62/// Nibble encoding: A=1, C=2, G=4, T=8, N=15.
63/// Complement: A↔T, C↔G, N↔N, others↔N.
64fn complement_nibble(nib: u8) -> u8 {
65 match nib {
66 NIBBLE_A => NIBBLE_T,
67 NIBBLE_T => NIBBLE_A,
68 NIBBLE_C => NIBBLE_G,
69 NIBBLE_G => NIBBLE_C,
70 // N and all ambiguity codes complement to N.
71 _ => NIBBLE_N,
72 }
73}
74
75/// Per-cycle NVC table: counts[pos][base] where base index is A=0, C=1, G=2, T=3, N=4, X=5.
76pub struct NvcTable {
77 /// `counts[pos][6]` — A, C, G, T, N, X counts per cycle position.
78 pub counts: Vec<[u64; 6]>,
79 /// Read cycle length (from first passing read's sequence length).
80 pub read_length: usize,
81}
82
83impl NvcTable {
84 fn new(read_length: usize) -> Self {
85 Self {
86 counts: vec![[0u64; 6]; read_length],
87 read_length,
88 }
89 }
90}
91
92/// Map a BAM nibble to the column index (A=0, C=1, G=2, T=3, N=4, X=5).
93fn nibble_to_col(nib: u8) -> usize {
94 match nib {
95 NIBBLE_A => 0,
96 NIBBLE_C => 1,
97 NIBBLE_G => 2,
98 NIBBLE_T => 3,
99 NIBBLE_N => 4,
100 _ => 5,
101 }
102}
103
104/// Accumulate one read's base calls into the NVC table.
105///
106/// For reverse-strand reads the sequence is iterated in reverse and each base
107/// is complemented, so that position 0 in the table corresponds to the first
108/// sequenced base regardless of strand.
109fn accumulate(table: &mut NvcTable, rec: &RawRecord, is_reverse: bool) {
110 let n = rec.sequence_len().min(table.read_length);
111 if is_reverse {
112 for i in 0..n {
113 // Read in reverse: nibble at position (n-1-i) gives the base at
114 // sequencing cycle i after complementing.
115 let src = (n - 1) - i;
116 let nib = complement_nibble(rec.seq_nibble(src));
117 table.counts[i][nibble_to_col(nib)] += 1;
118 }
119 } else {
120 for i in 0..n {
121 let nib = rec.seq_nibble(i);
122 table.counts[i][nibble_to_col(nib)] += 1;
123 }
124 }
125}
126
127/// Scan `bam_path` and compute the NVC table.
128pub fn compute_nvc(bam_path: &Path, mapq_cut: u8, workers: NonZero<usize>) -> Result<NvcTable> {
129 let mut reader = rsomics_bamio::open_with_workers(bam_path, workers)?;
130 reader.read_header().map_err(RsomicsError::Io)?;
131
132 let inner = reader.get_mut();
133 let mut rec = RawRecord::default();
134 let mut table: Option<NvcTable> = None;
135
136 loop {
137 let n = raw::read_record(inner, &mut rec)?;
138 if n == 0 {
139 break;
140 }
141
142 let flags = rec.flags();
143 if flags & (FLAG_UNMAPPED | FLAG_QCFAIL) != 0 {
144 continue;
145 }
146 if rec.mapping_quality() < mapq_cut {
147 continue;
148 }
149
150 let seq_len = rec.sequence_len();
151 if seq_len == 0 {
152 continue;
153 }
154
155 let t = table.get_or_insert_with(|| NvcTable::new(seq_len));
156 let is_reverse = flags & FLAG_REVERSE != 0;
157 accumulate(t, &rec, is_reverse);
158 }
159
160 Ok(table.unwrap_or_else(|| NvcTable::new(0)))
161}
162
163/// Write `<prefix>.NVC.xls` matching the exact byte format of `read_NVC.py`.
164///
165/// Header: `Position\tA\tC\tG\tT\tN\tX\n` (no leading spaces).
166/// Data rows: `{pos}\t {A}\t {C}\t {G}\t {T}\t {N}\t {X}\t\n` — each count
167/// has one leading space; each row ends with a trailing tab before the newline.
168pub fn write_nvc_xls(table: &NvcTable, out_prefix: &Path) -> Result<()> {
169 let prefix_str = out_prefix
170 .file_name()
171 .unwrap_or_default()
172 .to_string_lossy()
173 .into_owned();
174 let dir = out_prefix.parent().unwrap_or(Path::new("."));
175 let xls_path = dir.join(format!("{prefix_str}.NVC.xls"));
176
177 let f = File::create(&xls_path).map_err(RsomicsError::Io)?;
178 let mut w = BufWriter::new(f);
179
180 writeln!(w, "Position\tA\tC\tG\tT\tN\tX").map_err(RsomicsError::Io)?;
181
182 for (pos, row) in table.counts.iter().enumerate() {
183 // Each value is written with a leading space; row ends with trailing tab.
184 write!(w, "{pos}").map_err(RsomicsError::Io)?;
185 for &count in row {
186 write!(w, "\t {count}").map_err(RsomicsError::Io)?;
187 }
188 writeln!(w, "\t").map_err(RsomicsError::Io)?;
189 }
190
191 Ok(())
192}
193
194/// Run the full NVC analysis and write the `.NVC.xls` output file.
195pub fn run_nvc(
196 bam_path: &Path,
197 out_prefix: &Path,
198 mapq_cut: u8,
199 workers: NonZero<usize>,
200) -> Result<()> {
201 eprintln!("Read BAM file ...");
202 let table = compute_nvc(bam_path, mapq_cut, workers)?;
203 eprintln!(" Done");
204 eprintln!("generating data matrix ...");
205 write_nvc_xls(&table, out_prefix)?;
206 eprintln!("generating R script ...");
207 Ok(())
208}