rsomics-bioenv 0.1.0

BIO-ENV / BEST — subset of environmental variables maximally rank-correlated with a community distance matrix (Clarke & Ainsworth 1993), scikit-bio compatible
Documentation
use std::io::BufRead;

use rsomics_common::{Result, RsomicsError};

/// A square, symmetric distance matrix with sample ids, in row-major order.
pub struct DistanceMatrix {
    pub ids: Vec<String>,
    pub data: Vec<f64>,
    pub n: usize,
}

impl DistanceMatrix {
    pub fn read<R: BufRead>(reader: R, source: &str) -> Result<Self> {
        let mut lines = reader.lines();

        let header = lines
            .next()
            .ok_or_else(|| RsomicsError::InvalidInput(format!("{source}: empty matrix")))?
            .map_err(RsomicsError::Io)?;

        // lsmat header: a leading empty cell, then the column ids.
        let ids: Vec<String> = header
            .split('\t')
            .skip(1)
            .map(str::trim)
            .map(str::to_owned)
            .collect();
        let n = ids.len();
        if n < 3 {
            return Err(RsomicsError::InvalidInput(format!(
                "{source}: need at least 3 ids, found {n}"
            )));
        }

        let mut data = vec![0.0f64; n * n];
        let mut row = 0;
        for line in lines {
            let line = line.map_err(RsomicsError::Io)?;
            if line.is_empty() {
                continue;
            }
            if row >= n {
                return Err(RsomicsError::InvalidInput(format!(
                    "{source}: more rows than the {n} ids in the header"
                )));
            }
            let mut fields = line.split('\t');
            let rid = fields.next().unwrap_or("").trim();
            if rid != ids[row] {
                return Err(RsomicsError::InvalidInput(format!(
                    "{source}: row {row} id '{rid}' != header id '{}'",
                    ids[row]
                )));
            }
            let mut col = 0;
            for f in fields {
                if col >= n {
                    return Err(RsomicsError::InvalidInput(format!(
                        "{source}: row {row} has more than {n} values"
                    )));
                }
                data[row * n + col] = f.trim().parse::<f64>().map_err(|_| {
                    RsomicsError::InvalidInput(format!(
                        "{source}: row {row} col {col}: not a number: '{f}'"
                    ))
                })?;
                col += 1;
            }
            if col != n {
                return Err(RsomicsError::InvalidInput(format!(
                    "{source}: row {row} has {col} values, expected {n}"
                )));
            }
            row += 1;
        }
        if row != n {
            return Err(RsomicsError::InvalidInput(format!(
                "{source}: {row} data rows, expected {n}"
            )));
        }

        Ok(DistanceMatrix { ids, data, n })
    }

    /// Upper-triangle (i<j) entries in row-major order — skbio's condensed form.
    pub fn condensed(&self) -> Vec<f64> {
        let n = self.n;
        let mut v = Vec::with_capacity(n * (n - 1) / 2);
        for i in 0..n {
            for j in (i + 1)..n {
                v.push(self.data[i * n + j]);
            }
        }
        v
    }
}