Skip to main content

rsomics_gradient_trajectory/
io.rs

1use std::collections::HashMap;
2use std::io::BufRead;
3
4use rsomics_common::{Result, RsomicsError};
5
/// Ordination coordinates: a sample-id-indexed matrix of PC axis values.
///
/// The matrix comes either from a plain TSV/CSV table (`#id` header, then one
/// row per sample) or from the `Site` block of a scikit-bio
/// `OrdinationResults` file. The per-axis proportion explained that
/// accompanies such a matrix is not stored in this struct; it has no field
/// for it.
///
/// Sample `ids[i]` owns the slice `data[i * naxes..(i + 1) * naxes]`.
#[derive(Debug, Clone, PartialEq)]
pub struct Coords {
    /// Sample identifiers, one per matrix row, in row order.
    pub ids: Vec<String>,
    /// Number of axis values stored for every sample.
    pub naxes: usize,
    /// Row-major `ids.len() x naxes`.
    pub data: Vec<f64>,
}

impl Coords {
    /// Returns the axis values of the `i`-th sample as a slice of length
    /// `naxes`.
    ///
    /// # Panics
    ///
    /// Panics if `i * naxes..(i + 1) * naxes` does not lie within `data`.
    pub fn row(&self, i: usize) -> &[f64] {
        let start = i * self.naxes;
        &self.data[start..start + self.naxes]
    }
}
23
24pub fn parse_coords<R: BufRead>(reader: R, delim: char) -> Result<Coords> {
25    let mut ids = Vec::new();
26    let mut data = Vec::new();
27    let mut naxes = 0usize;
28    for line in reader.lines() {
29        let line = line.map_err(RsomicsError::Io)?;
30        let t = line.trim_end();
31        if t.is_empty() || t.starts_with('#') {
32            continue;
33        }
34        let mut fields = t.split(delim);
35        let id = fields.next().unwrap().trim().to_string();
36        let vals: Vec<f64> = fields
37            .map(|f| {
38                f.trim().parse().map_err(|_| {
39                    RsomicsError::InvalidInput(format!("sample '{id}': '{f}' is not numeric"))
40                })
41            })
42            .collect::<Result<_>>()?;
43        if naxes == 0 {
44            naxes = vals.len();
45        } else if vals.len() != naxes {
46            return Err(RsomicsError::InvalidInput(format!(
47                "sample '{id}' has {} axes, expected {naxes}",
48                vals.len()
49            )));
50        }
51        ids.push(id);
52        data.extend(vals);
53    }
54    if ids.is_empty() {
55        return Err(RsomicsError::InvalidInput("no coordinate rows".into()));
56    }
57    Ok(Coords { ids, naxes, data })
58}
59
60/// Proportion explained: one value per axis, whitespace- or newline-separated.
61pub fn parse_prop<R: BufRead>(reader: R) -> Result<Vec<f64>> {
62    let mut prop = Vec::new();
63    for line in reader.lines() {
64        let line = line.map_err(RsomicsError::Io)?;
65        for tok in line.split([',', '\t', ' ']) {
66            let t = tok.trim();
67            if t.is_empty() {
68                continue;
69            }
70            prop.push(t.parse().map_err(|_| {
71                RsomicsError::InvalidInput(format!("proportion '{t}' is not numeric"))
72            })?);
73        }
74    }
75    if prop.is_empty() {
76        return Err(RsomicsError::InvalidInput(
77            "empty proportion-explained vector".into(),
78        ));
79    }
80    Ok(prop)
81}
82
83/// Sample metadata: a `#SampleID` header naming the columns, then one row per
84/// sample. Values are strings; numeric coercion happens at use sites (sort,
85/// weighting).
86pub struct Metadata {
87    pub columns: Vec<String>,
88    pub rows: HashMap<String, Vec<String>>,
89}
90
91impl Metadata {
92    pub fn col_index(&self, name: &str) -> Result<usize> {
93        self.columns
94            .iter()
95            .position(|c| c == name)
96            .ok_or_else(|| RsomicsError::InvalidInput(format!("category '{name}' not in metadata")))
97    }
98
99    pub fn value<'a>(&'a self, sid: &str, col: usize) -> &'a str {
100        &self.rows[sid][col]
101    }
102}
103
104pub fn parse_metadata<R: BufRead>(reader: R, delim: char) -> Result<Metadata> {
105    let mut lines = reader.lines();
106    let header = loop {
107        match lines.next() {
108            Some(l) => {
109                let l = l.map_err(RsomicsError::Io)?;
110                if l.trim().is_empty() {
111                    continue;
112                }
113                break l;
114            }
115            None => return Err(RsomicsError::InvalidInput("empty metadata".into())),
116        }
117    };
118    let columns: Vec<String> = header
119        .trim_start_matches('#')
120        .split(delim)
121        .skip(1)
122        .map(|s| s.trim().to_string())
123        .collect();
124    let mut rows = HashMap::new();
125    for line in lines {
126        let line = line.map_err(RsomicsError::Io)?;
127        let t = line.trim_end();
128        if t.is_empty() || t.starts_with('#') {
129            continue;
130        }
131        let mut fields = t.split(delim);
132        let sid = fields.next().unwrap().trim().to_string();
133        let vals: Vec<String> = fields.map(|f| f.trim().to_string()).collect();
134        if vals.len() != columns.len() {
135            return Err(RsomicsError::InvalidInput(format!(
136                "metadata row '{sid}' has {} values, expected {}",
137                vals.len(),
138                columns.len()
139            )));
140        }
141        rows.insert(sid, vals);
142    }
143    Ok(Metadata { columns, rows })
144}