rsomics-bioenv 0.1.0

BIO-ENV / BEST — subset of environmental variables maximally rank-correlated with a community distance matrix (Clarke & Ainsworth 1993), scikit-bio compatible
Documentation
use std::collections::HashMap;
use std::io::BufRead;

use rsomics_common::{Result, RsomicsError};

/// Environmental variables table: samples (rows) × numeric variables (columns).
pub struct EnvTable {
    pub vars: Vec<String>,
    /// Row-major, `n_samples × vars.len()`.
    pub values: Vec<f64>,
    pub ids: Vec<String>,
}

impl EnvTable {
    /// Parse a TSV: header row is `<id-col-name>\tvar1\tvar2…`, then one
    /// `sampleid\tv1\tv2…` row per sample. The first header cell (the sample
    /// id column label) is ignored.
    pub fn read<R: BufRead>(reader: R, source: &str) -> Result<Self> {
        let mut lines = reader.lines();
        let header = lines
            .next()
            .ok_or_else(|| RsomicsError::InvalidInput(format!("{source}: empty env table")))?
            .map_err(RsomicsError::Io)?;

        let vars: Vec<String> = header
            .split('\t')
            .skip(1)
            .map(str::trim)
            .map(str::to_owned)
            .collect();
        if vars.is_empty() {
            return Err(RsomicsError::InvalidInput(format!(
                "{source}: no variable columns in header"
            )));
        }
        let p = vars.len();

        let mut ids = Vec::new();
        let mut values = Vec::new();
        for line in lines {
            let line = line.map_err(RsomicsError::Io)?;
            if line.trim().is_empty() {
                continue;
            }
            let mut fields = line.split('\t');
            let id = fields
                .next()
                .ok_or_else(|| RsomicsError::InvalidInput(format!("{source}: blank row")))?
                .trim()
                .to_owned();
            let mut row = Vec::with_capacity(p);
            for f in fields {
                row.push(f.trim().parse::<f64>().map_err(|_| {
                    RsomicsError::InvalidInput(format!("{source}: '{id}': not numeric: '{f}'"))
                })?);
            }
            if row.len() != p {
                return Err(RsomicsError::InvalidInput(format!(
                    "{source}: '{id}' has {} values, expected {p}",
                    row.len()
                )));
            }
            ids.push(id);
            values.extend_from_slice(&row);
        }
        if ids.is_empty() {
            return Err(RsomicsError::InvalidInput(format!(
                "{source}: no data rows"
            )));
        }

        Ok(EnvTable { vars, values, ids })
    }

    /// Keep only `columns` (in the given order), or all if `None`. Errors on a
    /// missing or duplicate name.
    pub fn select(&self, columns: Option<&[String]>, source: &str) -> Result<EnvTable> {
        let Some(cols) = columns else {
            return Ok(EnvTable {
                vars: self.vars.clone(),
                values: self.values.clone(),
                ids: self.ids.clone(),
            });
        };
        let mut seen = std::collections::HashSet::new();
        let idx: Vec<usize> =
            cols.iter()
                .map(|c| {
                    if !seen.insert(c.as_str()) {
                        return Err(RsomicsError::InvalidInput(format!(
                            "{source}: duplicate column '{c}'"
                        )));
                    }
                    self.vars.iter().position(|v| v == c).ok_or_else(|| {
                        RsomicsError::InvalidInput(format!("{source}: no column '{c}'"))
                    })
                })
                .collect::<Result<_>>()?;
        let p = self.vars.len();
        let n = self.ids.len();
        let mut values = Vec::with_capacity(n * idx.len());
        for r in 0..n {
            for &c in &idx {
                values.push(self.values[r * p + c]);
            }
        }
        Ok(EnvTable {
            vars: idx.iter().map(|&c| self.vars[c].clone()).collect(),
            values,
            ids: self.ids.clone(),
        })
    }

    /// Reorder rows onto `target` ids and standardize each column (center by
    /// mean, scale by sample std, ddof=1). skbio reindexes the env frame onto
    /// the distance-matrix ids, then `_scale` centers and divides by `df.std()`.
    pub fn standardized_for(&self, target: &[String], source: &str) -> Result<Standardized> {
        let p = self.vars.len();
        let pos: HashMap<&str, usize> = self
            .ids
            .iter()
            .enumerate()
            .map(|(i, s)| (s.as_str(), i))
            .collect();
        let rows: Vec<usize> = target
            .iter()
            .map(|id| {
                pos.get(id.as_str()).copied().ok_or_else(|| {
                    RsomicsError::InvalidInput(format!(
                        "{source}: distance-matrix id '{id}' not in env table"
                    ))
                })
            })
            .collect::<Result<_>>()?;
        let n = rows.len();

        // Column-major after standardization: each variable is a contiguous
        // length-n column, so a subset's per-sample Euclidean distance reads
        // sequentially down each chosen column.
        let mut cols = vec![0.0f64; n * p];
        for (c, col) in self.vars.iter().enumerate() {
            let mut mean = 0.0;
            for (r, &src) in rows.iter().enumerate() {
                mean += self.values[src * p + c];
                cols[c * n + r] = self.values[src * p + c];
            }
            mean /= n as f64;
            let mut ss = 0.0;
            for r in 0..n {
                let d = cols[c * n + r] - mean;
                ss += d * d;
            }
            let std = (ss / (n as f64 - 1.0)).sqrt();
            if std == 0.0 {
                return Err(RsomicsError::InvalidInput(format!(
                    "{source}: column '{col}' has zero variance and cannot be scaled"
                )));
            }
            for r in 0..n {
                cols[c * n + r] = (cols[c * n + r] - mean) / std;
            }
        }

        Ok(Standardized {
            cols,
            n,
            vars: self.vars.clone(),
        })
    }
}

/// Standardized env variables stored column-major (`vars.len()` columns of `n`).
pub struct Standardized {
    pub cols: Vec<f64>,
    pub n: usize,
    pub vars: Vec<String>,
}