Skip to main content

rsomics_bioenv/
env.rs

1use std::collections::HashMap;
2use std::io::BufRead;
3
4use rsomics_common::{Result, RsomicsError};
5
6/// Environmental variables table: samples (rows) × numeric variables (columns).
7pub struct EnvTable {
8    pub vars: Vec<String>,
9    /// Row-major, `n_samples × vars.len()`.
10    pub values: Vec<f64>,
11    pub ids: Vec<String>,
12}
13
14impl EnvTable {
15    /// Parse a TSV: header row is `<id-col-name>\tvar1\tvar2…`, then one
16    /// `sampleid\tv1\tv2…` row per sample. The first header cell (the sample
17    /// id column label) is ignored.
18    pub fn read<R: BufRead>(reader: R, source: &str) -> Result<Self> {
19        let mut lines = reader.lines();
20        let header = lines
21            .next()
22            .ok_or_else(|| RsomicsError::InvalidInput(format!("{source}: empty env table")))?
23            .map_err(RsomicsError::Io)?;
24
25        let vars: Vec<String> = header
26            .split('\t')
27            .skip(1)
28            .map(str::trim)
29            .map(str::to_owned)
30            .collect();
31        if vars.is_empty() {
32            return Err(RsomicsError::InvalidInput(format!(
33                "{source}: no variable columns in header"
34            )));
35        }
36        let p = vars.len();
37
38        let mut ids = Vec::new();
39        let mut values = Vec::new();
40        for line in lines {
41            let line = line.map_err(RsomicsError::Io)?;
42            if line.trim().is_empty() {
43                continue;
44            }
45            let mut fields = line.split('\t');
46            let id = fields
47                .next()
48                .ok_or_else(|| RsomicsError::InvalidInput(format!("{source}: blank row")))?
49                .trim()
50                .to_owned();
51            let mut row = Vec::with_capacity(p);
52            for f in fields {
53                row.push(f.trim().parse::<f64>().map_err(|_| {
54                    RsomicsError::InvalidInput(format!("{source}: '{id}': not numeric: '{f}'"))
55                })?);
56            }
57            if row.len() != p {
58                return Err(RsomicsError::InvalidInput(format!(
59                    "{source}: '{id}' has {} values, expected {p}",
60                    row.len()
61                )));
62            }
63            ids.push(id);
64            values.extend_from_slice(&row);
65        }
66        if ids.is_empty() {
67            return Err(RsomicsError::InvalidInput(format!(
68                "{source}: no data rows"
69            )));
70        }
71
72        Ok(EnvTable { vars, values, ids })
73    }
74
75    /// Keep only `columns` (in the given order), or all if `None`. Errors on a
76    /// missing or duplicate name.
77    pub fn select(&self, columns: Option<&[String]>, source: &str) -> Result<EnvTable> {
78        let Some(cols) = columns else {
79            return Ok(EnvTable {
80                vars: self.vars.clone(),
81                values: self.values.clone(),
82                ids: self.ids.clone(),
83            });
84        };
85        let mut seen = std::collections::HashSet::new();
86        let idx: Vec<usize> =
87            cols.iter()
88                .map(|c| {
89                    if !seen.insert(c.as_str()) {
90                        return Err(RsomicsError::InvalidInput(format!(
91                            "{source}: duplicate column '{c}'"
92                        )));
93                    }
94                    self.vars.iter().position(|v| v == c).ok_or_else(|| {
95                        RsomicsError::InvalidInput(format!("{source}: no column '{c}'"))
96                    })
97                })
98                .collect::<Result<_>>()?;
99        let p = self.vars.len();
100        let n = self.ids.len();
101        let mut values = Vec::with_capacity(n * idx.len());
102        for r in 0..n {
103            for &c in &idx {
104                values.push(self.values[r * p + c]);
105            }
106        }
107        Ok(EnvTable {
108            vars: idx.iter().map(|&c| self.vars[c].clone()).collect(),
109            values,
110            ids: self.ids.clone(),
111        })
112    }
113
114    /// Reorder rows onto `target` ids and standardize each column (center by
115    /// mean, scale by sample std, ddof=1). skbio reindexes the env frame onto
116    /// the distance-matrix ids, then `_scale` centers and divides by `df.std()`.
117    pub fn standardized_for(&self, target: &[String], source: &str) -> Result<Standardized> {
118        let p = self.vars.len();
119        let pos: HashMap<&str, usize> = self
120            .ids
121            .iter()
122            .enumerate()
123            .map(|(i, s)| (s.as_str(), i))
124            .collect();
125        let rows: Vec<usize> = target
126            .iter()
127            .map(|id| {
128                pos.get(id.as_str()).copied().ok_or_else(|| {
129                    RsomicsError::InvalidInput(format!(
130                        "{source}: distance-matrix id '{id}' not in env table"
131                    ))
132                })
133            })
134            .collect::<Result<_>>()?;
135        let n = rows.len();
136
137        // Column-major after standardization: each variable is a contiguous
138        // length-n column, so a subset's per-sample Euclidean distance reads
139        // sequentially down each chosen column.
140        let mut cols = vec![0.0f64; n * p];
141        for (c, col) in self.vars.iter().enumerate() {
142            let mut mean = 0.0;
143            for (r, &src) in rows.iter().enumerate() {
144                mean += self.values[src * p + c];
145                cols[c * n + r] = self.values[src * p + c];
146            }
147            mean /= n as f64;
148            let mut ss = 0.0;
149            for r in 0..n {
150                let d = cols[c * n + r] - mean;
151                ss += d * d;
152            }
153            let std = (ss / (n as f64 - 1.0)).sqrt();
154            if std == 0.0 {
155                return Err(RsomicsError::InvalidInput(format!(
156                    "{source}: column '{col}' has zero variance and cannot be scaled"
157                )));
158            }
159            for r in 0..n {
160                cols[c * n + r] = (cols[c * n + r] - mean) / std;
161            }
162        }
163
164        Ok(Standardized {
165            cols,
166            n,
167            vars: self.vars.clone(),
168        })
169    }
170}
171
172/// Standardized env variables stored column-major (`vars.len()` columns of `n`).
173pub struct Standardized {
174    pub cols: Vec<f64>,
175    pub n: usize,
176    pub vars: Vec<String>,
177}