rsomics-bioenv 0.1.0

BIO-ENV / BEST — subset of environmental variables maximally rank-correlated with a community distance matrix (Clarke & Ainsworth 1993), scikit-bio compatible
Documentation
use std::io::{BufRead, Write};

use rayon::prelude::*;
use rsomics_common::{Result, RsomicsError};

mod dm;
mod env;

pub use dm::DistanceMatrix;
pub use env::{EnvTable, Standardized};

pub struct SizeBest {
    pub size: usize,
    pub correlation: f64,
    pub vars: Vec<String>,
}

/// BIO-ENV: for each subset size 1..=p, the variable subset whose standardized
/// Euclidean distances are maximally Spearman-correlated with the community
/// distances. `dm_flat` is the community matrix's condensed upper triangle.
pub fn bioenv(dm_flat: &[f64], env: &Standardized) -> Vec<SizeBest> {
    let p = env.vars.len();
    let n = env.n;
    let m = dm_flat.len();

    // The community ranks are constant across all subsets, so center+normalize
    // them once; each subset then costs one rank + one centered dot product
    // instead of scipy's full re-rank of both vectors per call.
    let dm_ranks = rankdata(dm_flat);
    let dm_centered = centered_unit(&dm_ranks);

    (1..=p)
        .map(|size| {
            let best = (0..count_combinations(p, size))
                .into_par_iter()
                .map(|combo_idx| {
                    let subset = nth_combination(p, size, combo_idx);
                    let dists = euclidean_condensed(&env.cols, n, m, &subset);
                    let var_ranks = rankdata(&dists);
                    let rho = spearman_against(&var_ranks, &dm_centered);
                    Candidate { rho, combo_idx }
                })
                .reduce(
                    || Candidate {
                        rho: f64::NEG_INFINITY,
                        combo_idx: usize::MAX,
                    },
                    Candidate::better,
                );
            let subset = nth_combination(p, size, best.combo_idx);
            SizeBest {
                size,
                correlation: best.rho,
                vars: subset.iter().map(|&i| env.vars[i].clone()).collect(),
            }
        })
        .collect()
}

#[derive(Clone, Copy)]
struct Candidate {
    rho: f64,
    combo_idx: usize,
}

impl Candidate {
    /// Strictly-greater rho wins; on a tie keep the earlier combination, which
    /// is skbio's "first one in `combinations` order" tie rule.
    fn better(a: Candidate, b: Candidate) -> Candidate {
        if b.rho > a.rho || (b.rho == a.rho && b.combo_idx < a.combo_idx) {
            b
        } else {
            a
        }
    }
}

/// Condensed Euclidean distances over the chosen standardized columns, upper
/// triangle in the same row-major order as the community matrix.
fn euclidean_condensed(cols: &[f64], n: usize, m: usize, subset: &[usize]) -> Vec<f64> {
    let mut out = Vec::with_capacity(m);
    for i in 0..n {
        for j in (i + 1)..n {
            let mut s = 0.0;
            for &c in subset {
                let base = c * n;
                let d = cols[base + i] - cols[base + j];
                s += d * d;
            }
            out.push(s.sqrt());
        }
    }
    out
}

/// Spearman = Pearson on ranks. `dm_centered` is the community ranks already
/// centered and scaled to unit norm, so this is one centered dot of the
/// variable ranks divided by their own norm.
fn spearman_against(var_ranks: &[f64], dm_centered: &[f64]) -> f64 {
    let vmean = var_ranks.iter().sum::<f64>() / var_ranks.len() as f64;
    let mut vnorm_sq = 0.0;
    let mut dot = 0.0;
    for (&vr, &dc) in var_ranks.iter().zip(dm_centered) {
        let cv = vr - vmean;
        vnorm_sq += cv * cv;
        dot += cv * dc;
    }
    if vnorm_sq == 0.0 {
        return f64::NAN;
    }
    dot / vnorm_sq.sqrt()
}

/// Center to zero mean then scale to unit norm; the constant-vector case is
/// not reachable here (community distances always vary across pairs).
fn centered_unit(v: &[f64]) -> Vec<f64> {
    let mean = v.iter().sum::<f64>() / v.len() as f64;
    let mut out: Vec<f64> = v.iter().map(|&x| x - mean).collect();
    let norm = out.iter().map(|&x| x * x).sum::<f64>().sqrt();
    for x in &mut out {
        *x /= norm;
    }
    out
}

/// Average-rank of each element, scipy `rankdata` default (ties averaged).
fn rankdata(v: &[f64]) -> Vec<f64> {
    let mut order: Vec<usize> = (0..v.len()).collect();
    order.sort_by(|&a, &b| v[a].partial_cmp(&v[b]).unwrap());
    let mut ranks = vec![0.0f64; v.len()];
    let mut i = 0;
    while i < order.len() {
        let mut j = i + 1;
        while j < order.len() && v[order[j]] == v[order[i]] {
            j += 1;
        }
        let avg = ((i + 1 + j) as f64) / 2.0;
        for &idx in &order[i..j] {
            ranks[idx] = avg;
        }
        i = j;
    }
    ranks
}

fn count_combinations(n: usize, k: usize) -> usize {
    if k > n {
        return 0;
    }
    let k = k.min(n - k);
    let mut c = 1usize;
    for i in 0..k {
        c = c * (n - i) / (i + 1);
    }
    c
}

/// The `idx`-th k-subset of `0..n` in lexicographic order (unranking).
fn nth_combination(n: usize, k: usize, mut idx: usize) -> Vec<usize> {
    let mut out = Vec::with_capacity(k);
    let mut c = 0usize;
    let mut remaining = k;
    while remaining > 0 {
        let block = count_combinations(n - c - 1, remaining - 1);
        if idx < block {
            out.push(c);
            remaining -= 1;
        } else {
            idx -= block;
        }
        c += 1;
    }
    out
}

pub fn write_result<W: Write>(out: &mut W, best: &[SizeBest]) -> Result<()> {
    writeln!(out, "size\tcorrelation\tvars").map_err(RsomicsError::Io)?;
    for b in best {
        writeln!(
            out,
            "{}\t{:.12}\t{}",
            b.size,
            b.correlation,
            b.vars.join(", ")
        )
        .map_err(RsomicsError::Io)?;
    }
    Ok(())
}

pub fn read_matrix<R: BufRead>(reader: R, source: &str) -> Result<DistanceMatrix> {
    DistanceMatrix::read(reader, source)
}

pub fn read_env<R: BufRead>(reader: R, source: &str) -> Result<EnvTable> {
    EnvTable::read(reader, source)
}

/// Select `columns` (or all), standardize onto the matrix ids, and run BIO-ENV.
pub fn run_bioenv(
    dm: &DistanceMatrix,
    env: &EnvTable,
    columns: Option<&[String]>,
    env_source: &str,
) -> Result<Vec<SizeBest>> {
    let selected = env.select(columns, env_source)?;
    let std = selected.standardized_for(&dm.ids, env_source)?;
    Ok(bioenv(&dm.condensed(), &std))
}

#[cfg(test)]
mod tests {
    use super::*;
    use std::io::Cursor;

    const DM: &str = "\tA\tB\tC\tD\n\
        A\t0.0\t0.5\t0.25\t0.75\n\
        B\t0.5\t0.0\t0.1\t0.42\n\
        C\t0.25\t0.1\t0.0\t0.33\n\
        D\t0.75\t0.42\t0.33\t0.0\n";

    const ENV: &str = "id\tpH\tElevation\n\
        A\t7.0\t400\n\
        B\t8.0\t530\n\
        C\t7.5\t450\n\
        D\t8.5\t810\n";

    #[test]
    fn skbio_doc_example() {
        let dm = read_matrix(Cursor::new(DM), "dm").unwrap();
        let env = read_env(Cursor::new(ENV), "env").unwrap();
        let best = run_bioenv(&dm, &env, None, "env").unwrap();
        assert_eq!(best.len(), 2);
        assert_eq!(best[0].vars, vec!["pH"]);
        assert!(
            (best[0].correlation - 0.771517).abs() < 1e-6,
            "{}",
            best[0].correlation
        );
        assert_eq!(best[1].vars, vec!["pH", "Elevation"]);
        assert!(
            (best[1].correlation - 0.714286).abs() < 1e-6,
            "{}",
            best[1].correlation
        );
    }

    #[test]
    fn rankdata_ties_averaged() {
        assert_eq!(rankdata(&[1.0, 2.0, 2.0, 3.0]), vec![1.0, 2.5, 2.5, 4.0]);
    }

    #[test]
    fn nth_combination_is_lexicographic() {
        let all: Vec<_> = (0..count_combinations(4, 2))
            .map(|i| nth_combination(4, 2, i))
            .collect();
        assert_eq!(
            all,
            vec![
                vec![0, 1],
                vec![0, 2],
                vec![0, 3],
                vec![1, 2],
                vec![1, 3],
                vec![2, 3],
            ]
        );
    }

    #[test]
    fn condensed_upper_triangle() {
        let dm = read_matrix(Cursor::new(DM), "dm").unwrap();
        assert_eq!(dm.condensed(), vec![0.5, 0.25, 0.75, 0.1, 0.42, 0.33]);
    }
}