use std::collections::HashMap;
use std::io::BufRead;
use rsomics_common::{Result, RsomicsError};
pub struct EnvTable {
pub vars: Vec<String>,
pub values: Vec<f64>,
pub ids: Vec<String>,
}
impl EnvTable {
pub fn read<R: BufRead>(reader: R, source: &str) -> Result<Self> {
let mut lines = reader.lines();
let header = lines
.next()
.ok_or_else(|| RsomicsError::InvalidInput(format!("{source}: empty env table")))?
.map_err(RsomicsError::Io)?;
let vars: Vec<String> = header
.split('\t')
.skip(1)
.map(str::trim)
.map(str::to_owned)
.collect();
if vars.is_empty() {
return Err(RsomicsError::InvalidInput(format!(
"{source}: no variable columns in header"
)));
}
let p = vars.len();
let mut ids = Vec::new();
let mut values = Vec::new();
for line in lines {
let line = line.map_err(RsomicsError::Io)?;
if line.trim().is_empty() {
continue;
}
let mut fields = line.split('\t');
let id = fields
.next()
.ok_or_else(|| RsomicsError::InvalidInput(format!("{source}: blank row")))?
.trim()
.to_owned();
let mut row = Vec::with_capacity(p);
for f in fields {
row.push(f.trim().parse::<f64>().map_err(|_| {
RsomicsError::InvalidInput(format!("{source}: '{id}': not numeric: '{f}'"))
})?);
}
if row.len() != p {
return Err(RsomicsError::InvalidInput(format!(
"{source}: '{id}' has {} values, expected {p}",
row.len()
)));
}
ids.push(id);
values.extend_from_slice(&row);
}
if ids.is_empty() {
return Err(RsomicsError::InvalidInput(format!(
"{source}: no data rows"
)));
}
Ok(EnvTable { vars, values, ids })
}
pub fn select(&self, columns: Option<&[String]>, source: &str) -> Result<EnvTable> {
let Some(cols) = columns else {
return Ok(EnvTable {
vars: self.vars.clone(),
values: self.values.clone(),
ids: self.ids.clone(),
});
};
let mut seen = std::collections::HashSet::new();
let idx: Vec<usize> =
cols.iter()
.map(|c| {
if !seen.insert(c.as_str()) {
return Err(RsomicsError::InvalidInput(format!(
"{source}: duplicate column '{c}'"
)));
}
self.vars.iter().position(|v| v == c).ok_or_else(|| {
RsomicsError::InvalidInput(format!("{source}: no column '{c}'"))
})
})
.collect::<Result<_>>()?;
let p = self.vars.len();
let n = self.ids.len();
let mut values = Vec::with_capacity(n * idx.len());
for r in 0..n {
for &c in &idx {
values.push(self.values[r * p + c]);
}
}
Ok(EnvTable {
vars: idx.iter().map(|&c| self.vars[c].clone()).collect(),
values,
ids: self.ids.clone(),
})
}
pub fn standardized_for(&self, target: &[String], source: &str) -> Result<Standardized> {
let p = self.vars.len();
let pos: HashMap<&str, usize> = self
.ids
.iter()
.enumerate()
.map(|(i, s)| (s.as_str(), i))
.collect();
let rows: Vec<usize> = target
.iter()
.map(|id| {
pos.get(id.as_str()).copied().ok_or_else(|| {
RsomicsError::InvalidInput(format!(
"{source}: distance-matrix id '{id}' not in env table"
))
})
})
.collect::<Result<_>>()?;
let n = rows.len();
let mut cols = vec![0.0f64; n * p];
for (c, col) in self.vars.iter().enumerate() {
let mut mean = 0.0;
for (r, &src) in rows.iter().enumerate() {
mean += self.values[src * p + c];
cols[c * n + r] = self.values[src * p + c];
}
mean /= n as f64;
let mut ss = 0.0;
for r in 0..n {
let d = cols[c * n + r] - mean;
ss += d * d;
}
let std = (ss / (n as f64 - 1.0)).sqrt();
if std == 0.0 {
return Err(RsomicsError::InvalidInput(format!(
"{source}: column '{col}' has zero variance and cannot be scaled"
)));
}
for r in 0..n {
cols[c * n + r] = (cols[c * n + r] - mean) / std;
}
}
Ok(Standardized {
cols,
n,
vars: self.vars.clone(),
})
}
}
pub struct Standardized {
pub cols: Vec<f64>,
pub n: usize,
pub vars: Vec<String>,
}