gsva-rust 0.1.0

//! Shared gene / gene-set preprocessing, faithful to GSVA 2.6.2's
//! `.filterGenes`, `.mapGeneSetsToFeatures`, and `filterGeneSets`.

use crate::geneset::GeneSets;
use crate::matrix::ExprMatrix;
use std::collections::HashSet;

/// Row-subset an expression matrix, keeping `rows` in the given order.
fn subset_rows(expr: &ExprMatrix, rows: &[usize]) -> ExprMatrix {
    let col_names = expr.col_names().to_vec();
    let mut row_names = Vec::with_capacity(rows.len());
    let mut data = Vec::with_capacity(rows.len() * expr.ncol());
    for &r in rows {
        row_names.push(expr.row_names()[r].clone());
        data.extend_from_slice(expr.row(r));
    }
    ExprMatrix::new(row_names, col_names, data)
}

/// Drop rows whose values are constant across all columns (GSVA's
/// `removeConstant = TRUE`): a constant row has zero variance and would scale to
/// `0/0`. The min/max scan ignores `NaN`, matching R's `na.rm` range. Aborts if
/// fewer than two rows remain, as R does.
pub(crate) fn filter_constant_rows(expr: &ExprMatrix) -> ExprMatrix {
    let mut keep = Vec::with_capacity(expr.nrow());
    for i in 0..expr.nrow() {
        let mut mn = f64::INFINITY;
        let mut mx = f64::NEG_INFINITY;
        for &v in expr.row(i) {
            if v < mn {
                mn = v;
            }
            if v > mx {
                mx = v;
            }
        }
        // A constant row (or an all-NaN row, where mn/mx stay infinite) is
        // dropped; R coerces the NA range case to "constant" as well.
        if mn != mx && mn.is_finite() && mx.is_finite() {
            keep.push(i);
        }
    }
    assert!(
        keep.len() >= 2,
        "Less than two rows left in the input matrix after dropping constant rows"
    );
    subset_rows(expr, &keep)
}

/// Map gene sets onto the rows of `features` and size-filter them, reproducing
/// GSVA's `.mapGeneSetsToFeatures` + `filterGeneSets`.
///
/// Each gene is matched to its first-occurrence row (R `match`); unmatched genes
/// are dropped while order and multiplicity of matched genes are preserved. Sets
/// whose matched size falls outside `[min_size, max_size]` are removed.
///
/// Panics (as R `stop()`s) when nothing matches, when no set survives filtering,
/// or when surviving set names collide.
pub(crate) fn map_and_filter_sets(
    features: &ExprMatrix,
    gene_sets: &GeneSets,
    min_size: usize,
    max_size: usize,
) -> (Vec<String>, Vec<Vec<usize>>) {
    let mut total_matched = 0usize;
    let mut mapped: Vec<(String, Vec<usize>)> = Vec::with_capacity(gene_sets.len());
    for set in gene_sets.iter() {
        let mut idx = Vec::with_capacity(set.genes.len());
        for gene in &set.genes {
            if let Some(i) = features.row_of(gene) {
                idx.push(i);
            }
        }
        total_matched += idx.len();
        mapped.push((set.name.clone(), idx));
    }
    assert!(
        total_matched > 0,
        "No identifiers in the gene sets could be matched to the identifiers in the expression data"
    );

    let mut names = Vec::new();
    let mut indices = Vec::new();
    for (name, idx) in mapped {
        if idx.len() >= min_size && idx.len() <= max_size {
            names.push(name);
            indices.push(idx);
        }
    }
    assert!(
        !names.is_empty(),
        "No gene set left after mapping and filtering"
    );

    let mut seen = HashSet::with_capacity(names.len());
    for name in &names {
        assert!(
            seen.insert(name.as_str()),
            "The gene set list contains duplicated gene set names: '{name}'"
        );
    }

    (names, indices)
}

/// Center and scale each row to mean 0 and unit sample standard deviation
/// (denominator `n - 1`), reproducing GSVA's `.scale_rows`:
/// `(X - rowMeans(X)) / rowSds(X)`. Returns the scaled matrix row-major.
pub(crate) fn scale_rows(m: &ExprMatrix) -> Vec<f64> {
    let n = m.ncol();
    let nf = n as f64;
    let mut z = vec![0.0f64; m.nrow() * n];
    for i in 0..m.nrow() {
        let row = m.row(i);
        // Two-pass mean: a naive sum/n then a correction pass. R accumulates
        // `rowMeans` in 80-bit long double; the correction recovers the
        // f64-rounded true mean, tracking R to ~1 ULP instead of drifting with
        // the naive sum over many samples.
        let mut mean = row.iter().sum::<f64>() / nf;
        let mut corr = 0.0f64;
        for &v in row {
            corr += v - mean;
        }
        mean += corr / nf;
        let mut ss = 0.0f64;
        for &v in row {
            let d = v - mean;
            ss += d * d;
        }
        let sd = (ss / (nf - 1.0)).sqrt();
        let out = &mut z[i * n..(i + 1) * n];
        for (o, &v) in out.iter_mut().zip(row) {
            *o = (v - mean) / sd;
        }
    }
    z
}