1use std::collections::HashMap;
2use std::io::BufRead;
3
4use rsomics_common::{Result, RsomicsError};
5
6pub struct EnvTable {
8 pub vars: Vec<String>,
9 pub values: Vec<f64>,
11 pub ids: Vec<String>,
12}
13
14impl EnvTable {
15 pub fn read<R: BufRead>(reader: R, source: &str) -> Result<Self> {
19 let mut lines = reader.lines();
20 let header = lines
21 .next()
22 .ok_or_else(|| RsomicsError::InvalidInput(format!("{source}: empty env table")))?
23 .map_err(RsomicsError::Io)?;
24
25 let vars: Vec<String> = header
26 .split('\t')
27 .skip(1)
28 .map(str::trim)
29 .map(str::to_owned)
30 .collect();
31 if vars.is_empty() {
32 return Err(RsomicsError::InvalidInput(format!(
33 "{source}: no variable columns in header"
34 )));
35 }
36 let p = vars.len();
37
38 let mut ids = Vec::new();
39 let mut values = Vec::new();
40 for line in lines {
41 let line = line.map_err(RsomicsError::Io)?;
42 if line.trim().is_empty() {
43 continue;
44 }
45 let mut fields = line.split('\t');
46 let id = fields
47 .next()
48 .ok_or_else(|| RsomicsError::InvalidInput(format!("{source}: blank row")))?
49 .trim()
50 .to_owned();
51 let mut row = Vec::with_capacity(p);
52 for f in fields {
53 row.push(f.trim().parse::<f64>().map_err(|_| {
54 RsomicsError::InvalidInput(format!("{source}: '{id}': not numeric: '{f}'"))
55 })?);
56 }
57 if row.len() != p {
58 return Err(RsomicsError::InvalidInput(format!(
59 "{source}: '{id}' has {} values, expected {p}",
60 row.len()
61 )));
62 }
63 ids.push(id);
64 values.extend_from_slice(&row);
65 }
66 if ids.is_empty() {
67 return Err(RsomicsError::InvalidInput(format!(
68 "{source}: no data rows"
69 )));
70 }
71
72 Ok(EnvTable { vars, values, ids })
73 }
74
75 pub fn select(&self, columns: Option<&[String]>, source: &str) -> Result<EnvTable> {
78 let Some(cols) = columns else {
79 return Ok(EnvTable {
80 vars: self.vars.clone(),
81 values: self.values.clone(),
82 ids: self.ids.clone(),
83 });
84 };
85 let mut seen = std::collections::HashSet::new();
86 let idx: Vec<usize> =
87 cols.iter()
88 .map(|c| {
89 if !seen.insert(c.as_str()) {
90 return Err(RsomicsError::InvalidInput(format!(
91 "{source}: duplicate column '{c}'"
92 )));
93 }
94 self.vars.iter().position(|v| v == c).ok_or_else(|| {
95 RsomicsError::InvalidInput(format!("{source}: no column '{c}'"))
96 })
97 })
98 .collect::<Result<_>>()?;
99 let p = self.vars.len();
100 let n = self.ids.len();
101 let mut values = Vec::with_capacity(n * idx.len());
102 for r in 0..n {
103 for &c in &idx {
104 values.push(self.values[r * p + c]);
105 }
106 }
107 Ok(EnvTable {
108 vars: idx.iter().map(|&c| self.vars[c].clone()).collect(),
109 values,
110 ids: self.ids.clone(),
111 })
112 }
113
114 pub fn standardized_for(&self, target: &[String], source: &str) -> Result<Standardized> {
118 let p = self.vars.len();
119 let pos: HashMap<&str, usize> = self
120 .ids
121 .iter()
122 .enumerate()
123 .map(|(i, s)| (s.as_str(), i))
124 .collect();
125 let rows: Vec<usize> = target
126 .iter()
127 .map(|id| {
128 pos.get(id.as_str()).copied().ok_or_else(|| {
129 RsomicsError::InvalidInput(format!(
130 "{source}: distance-matrix id '{id}' not in env table"
131 ))
132 })
133 })
134 .collect::<Result<_>>()?;
135 let n = rows.len();
136
137 let mut cols = vec![0.0f64; n * p];
141 for (c, col) in self.vars.iter().enumerate() {
142 let mut mean = 0.0;
143 for (r, &src) in rows.iter().enumerate() {
144 mean += self.values[src * p + c];
145 cols[c * n + r] = self.values[src * p + c];
146 }
147 mean /= n as f64;
148 let mut ss = 0.0;
149 for r in 0..n {
150 let d = cols[c * n + r] - mean;
151 ss += d * d;
152 }
153 let std = (ss / (n as f64 - 1.0)).sqrt();
154 if std == 0.0 {
155 return Err(RsomicsError::InvalidInput(format!(
156 "{source}: column '{col}' has zero variance and cannot be scaled"
157 )));
158 }
159 for r in 0..n {
160 cols[c * n + r] = (cols[c * n + r] - mean) / std;
161 }
162 }
163
164 Ok(Standardized {
165 cols,
166 n,
167 vars: self.vars.clone(),
168 })
169 }
170}
171
172pub struct Standardized {
174 pub cols: Vec<f64>,
175 pub n: usize,
176 pub vars: Vec<String>,
177}