rsomics-read-nvc 0.1.0

Per-cycle nucleotide composition (NVC) from a BAM — Rust port of RSeQC read_NVC.py
Documentation
//! Per-cycle nucleotide composition (NVC) from a BAM file.
//!
//! For each read cycle position (0-based, 0..read_length-1), counts how many
//! reads have each nucleotide (A, C, G, T, N, X) at that position, where X is
//! any base not in {A, C, G, T, N}.
//!
//! Reverse-strand reads (FLAG 0x10) are reverse-complemented before counting,
//! so position 0 always corresponds to the first sequenced base.
//!
//! ## Filters applied
//!
//! - Skip unmapped reads (FLAG 0x0004).
//! - Skip QC-fail reads (FLAG 0x0200).
//! - Skip reads with MAPQ < `mapq_cut` (default 30).
//! - Secondary and supplementary reads are not filtered.
//!
//! ## Output format
//!
//! `<prefix>.NVC.xls`: tab-separated, columns `Position`, `A`, `C`, `G`, `T`,
//! `N`, `X`. Each count value has a leading space (e.g. `" 10991"`), matching
//! the exact byte format of `RSeQC` `read_NVC.py`. Each data row ends with a
//! trailing tab before the newline. The header row has no leading spaces.
//!
//! ## Origin
//!
//! This crate is an independent Rust reimplementation of `RSeQC`
//! `read_NVC.py` based on:
//! - The published method: Wang et al. 2012 <https://doi.org/10.1093/bioinformatics/bts356>
//! - The public SAM/BAM format specification
//! - Black-box behaviour testing against `RSeQC` 5.0.4
//!   (`read_NVC.py` — GPL-v3; source not read; clean-room implementation)
//!
//! No source code from the GPL upstream was used as reference during
//! implementation. Test fixtures are independently generated.
//!
//! License: MIT OR Apache-2.0.
//! Upstream credit: `RSeQC` <https://rseqc.sourceforge.net/> (GPL-v3).

use std::fs::File;
use std::io::{BufWriter, Write};
use std::num::NonZero;
use std::path::Path;

use rsomics_bamio::raw::{self, RawRecord};
use rsomics_common::{Result, RsomicsError};

// SAM flag bits.
const FLAG_UNMAPPED: u16 = 0x0004;
const FLAG_REVERSE: u16 = 0x0010;
const FLAG_QCFAIL: u16 = 0x0200;

/// BAM 4-bit nibble codes for each nucleotide.
/// Nibble encoding: 1=A, 2=C, 4=G, 8=T, 15=N, others=ambiguous.
const NIBBLE_A: u8 = 1;
const NIBBLE_C: u8 = 2;
const NIBBLE_G: u8 = 4;
const NIBBLE_T: u8 = 8;
const NIBBLE_N: u8 = 15;

/// Complement of a BAM nibble-encoded base.
///
/// Nibble encoding: A=1, C=2, G=4, T=8, N=15.
/// Complement: A↔T, C↔G, N↔N, others↔N.
fn complement_nibble(nib: u8) -> u8 {
    match nib {
        NIBBLE_A => NIBBLE_T,
        NIBBLE_T => NIBBLE_A,
        NIBBLE_C => NIBBLE_G,
        NIBBLE_G => NIBBLE_C,
        // N and all ambiguity codes complement to N.
        _ => NIBBLE_N,
    }
}

/// Per-cycle NVC table: counts[pos][base] where base index is A=0, C=1, G=2, T=3, N=4, X=5.
pub struct NvcTable {
    /// `counts[pos][6]` — A, C, G, T, N, X counts per cycle position.
    pub counts: Vec<[u64; 6]>,
    /// Read cycle length (from first passing read's sequence length).
    pub read_length: usize,
}

impl NvcTable {
    fn new(read_length: usize) -> Self {
        Self {
            counts: vec![[0u64; 6]; read_length],
            read_length,
        }
    }
}

/// Map a BAM nibble to the column index (A=0, C=1, G=2, T=3, N=4, X=5).
fn nibble_to_col(nib: u8) -> usize {
    match nib {
        NIBBLE_A => 0,
        NIBBLE_C => 1,
        NIBBLE_G => 2,
        NIBBLE_T => 3,
        NIBBLE_N => 4,
        _ => 5,
    }
}

/// Accumulate one read's base calls into the NVC table.
///
/// For reverse-strand reads the sequence is iterated in reverse and each base
/// is complemented, so that position 0 in the table corresponds to the first
/// sequenced base regardless of strand.
fn accumulate(table: &mut NvcTable, rec: &RawRecord, is_reverse: bool) {
    let n = rec.sequence_len().min(table.read_length);
    if is_reverse {
        for i in 0..n {
            // Read in reverse: nibble at position (n-1-i) gives the base at
            // sequencing cycle i after complementing.
            let src = (n - 1) - i;
            let nib = complement_nibble(rec.seq_nibble(src));
            table.counts[i][nibble_to_col(nib)] += 1;
        }
    } else {
        for i in 0..n {
            let nib = rec.seq_nibble(i);
            table.counts[i][nibble_to_col(nib)] += 1;
        }
    }
}

/// Scan `bam_path` and compute the NVC table.
pub fn compute_nvc(bam_path: &Path, mapq_cut: u8, workers: NonZero<usize>) -> Result<NvcTable> {
    let mut reader = rsomics_bamio::open_with_workers(bam_path, workers)?;
    reader.read_header().map_err(RsomicsError::Io)?;

    let inner = reader.get_mut();
    let mut rec = RawRecord::default();
    let mut table: Option<NvcTable> = None;

    loop {
        let n = raw::read_record(inner, &mut rec)?;
        if n == 0 {
            break;
        }

        let flags = rec.flags();
        if flags & (FLAG_UNMAPPED | FLAG_QCFAIL) != 0 {
            continue;
        }
        if rec.mapping_quality() < mapq_cut {
            continue;
        }

        let seq_len = rec.sequence_len();
        if seq_len == 0 {
            continue;
        }

        let t = table.get_or_insert_with(|| NvcTable::new(seq_len));
        let is_reverse = flags & FLAG_REVERSE != 0;
        accumulate(t, &rec, is_reverse);
    }

    Ok(table.unwrap_or_else(|| NvcTable::new(0)))
}

/// Write `<prefix>.NVC.xls` matching the exact byte format of `read_NVC.py`.
///
/// Header: `Position\tA\tC\tG\tT\tN\tX\n` (no leading spaces).
/// Data rows: `{pos}\t {A}\t {C}\t {G}\t {T}\t {N}\t {X}\t\n` — each count
/// has one leading space; each row ends with a trailing tab before the newline.
pub fn write_nvc_xls(table: &NvcTable, out_prefix: &Path) -> Result<()> {
    let prefix_str = out_prefix
        .file_name()
        .unwrap_or_default()
        .to_string_lossy()
        .into_owned();
    let dir = out_prefix.parent().unwrap_or(Path::new("."));
    let xls_path = dir.join(format!("{prefix_str}.NVC.xls"));

    let f = File::create(&xls_path).map_err(RsomicsError::Io)?;
    let mut w = BufWriter::new(f);

    writeln!(w, "Position\tA\tC\tG\tT\tN\tX").map_err(RsomicsError::Io)?;

    for (pos, row) in table.counts.iter().enumerate() {
        // Each value is written with a leading space; row ends with trailing tab.
        write!(w, "{pos}").map_err(RsomicsError::Io)?;
        for &count in row {
            write!(w, "\t {count}").map_err(RsomicsError::Io)?;
        }
        writeln!(w, "\t").map_err(RsomicsError::Io)?;
    }

    Ok(())
}

/// Run the full NVC analysis and write the `.NVC.xls` output file.
pub fn run_nvc(
    bam_path: &Path,
    out_prefix: &Path,
    mapq_cut: u8,
    workers: NonZero<usize>,
) -> Result<()> {
    eprintln!("Read BAM file ...");
    let table = compute_nvc(bam_path, mapq_cut, workers)?;
    eprintln!("  Done");
    eprintln!("generating data matrix ...");
    write_nvc_xls(&table, out_prefix)?;
    eprintln!("generating R script  ...");
    Ok(())
}