Skip to main content

rsomics_ilr/
lib.rs

1use std::io::{BufRead, Write};
2
3use rayon::prelude::*;
4use rsomics_common::{Result, RsomicsError};
5
6mod table;
7
8pub use table::Table;
9
10pub struct IlrMatrix {
11    pub samples: Vec<String>,
12    pub coords: Vec<String>,
13    pub data: Vec<f64>,
14}
15
16/// Per-balance scale factor of skbio's default Egozcue/Gram-Schmidt basis:
17/// row `k` (i = k+1) carries the weight `sqrt(i/(i+1))`, the only data the
18/// triangular basis needs since its nonzero pattern is fixed.
19fn basis_scales(d: usize) -> Vec<f64> {
20    (1..d)
21        .map(|i| (i as f64 / (i as f64 + 1.0)).sqrt())
22        .collect()
23}
24
25/// Isometric log-ratio transform (Egozcue et al. 2003), matching
26/// `skbio.stats.composition.ilr` with its default basis: `ilr(x) = clr(x) · Vᵀ`
27/// where `V` is the (D-1)×D Gram-Schmidt orthonormal basis.
28///
29/// The default basis is lower-triangular, so the dense `clr·Vᵀ` matmul collapses
30/// to one prefix-sum pass per sample: with clr row `c` and `P[i]=Σ_{t<i} c[t]`,
31/// the k-th coordinate (i=k+1) is `sqrt(i/(i+1))·(P[i]/i − c[i])`. That is O(D)
32/// per sample rather than the O(D²) numpy tensordot.
33///
34/// `pseudocount` is added to every value before the log; with the default 0 the
35/// input must be strictly positive, as skbio requires (`nozero=True`).
36///
37/// # Errors
38/// A non-positive value after the pseudocount (log undefined), or a non-finite
39/// input — rejected as in skbio. Fewer than two features, where ilr is undefined.
40pub fn ilr(table: &Table, pseudocount: f64) -> Result<IlrMatrix> {
41    let d = table.n_features();
42    if d < 2 {
43        return Err(RsomicsError::InvalidInput(format!(
44            "ilr needs at least 2 features, got {d}"
45        )));
46    }
47    let inv_d = 1.0 / d as f64;
48    let scales = basis_scales(d);
49
50    let mut data = vec![0.0_f64; table.n_samples() * (d - 1)];
51    let bad: Option<f64> = data
52        .par_chunks_mut(d - 1)
53        .zip(table.data.par_chunks(d))
54        .map(|(out, row)| {
55            let mut clr = Vec::with_capacity(d);
56            let mut mean = 0.0;
57            for &x in row {
58                let v = x + pseudocount;
59                if v <= 0.0 || !v.is_finite() {
60                    return Some(x);
61                }
62                let l = v.ln();
63                clr.push(l);
64                mean += l;
65            }
66            mean *= inv_d;
67
68            let mut prefix = 0.0;
69            for (k, o) in out.iter_mut().enumerate() {
70                let i = k + 1;
71                prefix += clr[k] - mean;
72                *o = scales[k] * (prefix / i as f64 - (clr[i] - mean));
73            }
74            None
75        })
76        .reduce(|| None, |a, b| a.or(b));
77
78    if let Some(v) = bad {
79        return Err(RsomicsError::InvalidInput(format!(
80            "value {v} is non-positive after pseudocount {pseudocount} — ilr's log is undefined (skbio requires strictly positive data)"
81        )));
82    }
83
84    let coords = (0..d - 1).map(|k| format!("ilr{k}")).collect();
85    Ok(IlrMatrix {
86        samples: table.samples.clone(),
87        coords,
88        data,
89    })
90}
91
92impl IlrMatrix {
93    /// Write the transformed matrix as a TSV: header of ilr coordinate IDs then
94    /// one `sample_id<delim>value...` line per sample.
95    ///
96    /// # Errors
97    /// Propagates write errors.
98    pub fn write_tsv<W: Write>(&self, mut out: W, delim: char) -> Result<()> {
99        let k = self.coords.len();
100        let mut line = String::with_capacity(16 * (k + 1));
101        line.push_str("sample");
102        for c in &self.coords {
103            line.push(delim);
104            line.push_str(c);
105        }
106        line.push('\n');
107        out.write_all(line.as_bytes()).map_err(RsomicsError::Io)?;
108
109        let mut buf = ryu_like::Buffer::new();
110        for (i, sample) in self.samples.iter().enumerate() {
111            line.clear();
112            line.push_str(sample);
113            for &v in &self.data[i * k..(i + 1) * k] {
114                line.push(delim);
115                line.push_str(buf.format(v));
116            }
117            line.push('\n');
118            out.write_all(line.as_bytes()).map_err(RsomicsError::Io)?;
119        }
120        Ok(())
121    }
122}
123
124/// std's `{}` already emits the shortest decimal that round-trips; reuse one
125/// growable buffer to avoid a per-value allocation.
126mod ryu_like {
127    use std::fmt::Write;
128
129    pub struct Buffer {
130        s: String,
131    }
132
133    impl Buffer {
134        pub fn new() -> Self {
135            Self {
136                s: String::with_capacity(24),
137            }
138        }
139
140        pub fn format(&mut self, v: f64) -> &str {
141            self.s.clear();
142            let _ = write!(self.s, "{v}");
143            &self.s
144        }
145    }
146}
147
148/// # Errors
149/// Propagates parse, compute, and write errors.
150pub fn run<W: Write>(
151    table_reader: impl BufRead,
152    out: W,
153    delim: char,
154    pseudocount: f64,
155) -> Result<()> {
156    let table = Table::parse(table_reader, delim)?;
157    let res = ilr(&table, pseudocount)?;
158    res.write_tsv(out, delim)
159}
160
161#[cfg(test)]
162mod tests {
163    use super::*;
164
165    fn parse(txt: &str) -> Table {
166        Table::parse(txt.as_bytes(), '\t').unwrap()
167    }
168
169    #[test]
170    fn docstring_example() {
171        // skbio: ilr([.1,.3,.4,.2]) == [-0.7768362, -0.68339802, 0.11704769]
172        let t = parse("\tf1\tf2\tf3\tf4\ns1\t0.1\t0.3\t0.4\t0.2\n");
173        let r = ilr(&t, 0.0).unwrap();
174        let want = [-0.7768362, -0.68339802, 0.11704769];
175        assert_eq!(r.data.len(), 3);
176        for (g, w) in r.data.iter().zip(want) {
177            assert!((g - w).abs() < 1e-7, "{g} vs {w}");
178        }
179    }
180
181    #[test]
182    fn output_width_is_d_minus_one() {
183        let r = ilr(&parse("\tf1\tf2\tf3\ns1\t5\t1\t9\ns2\t2\t8\t4\n"), 0.0).unwrap();
184        assert_eq!(r.coords.len(), 2);
185        assert_eq!(r.data.len(), 4);
186    }
187
188    #[test]
189    fn scale_invariant() {
190        let a = ilr(&parse("\tf1\tf2\tf3\tf4\ns1\t0.1\t0.3\t0.4\t0.2\n"), 0.0).unwrap();
191        let b = ilr(&parse("\tf1\tf2\tf3\tf4\ns1\t1\t3\t4\t2\n"), 0.0).unwrap();
192        for (x, y) in a.data.iter().zip(&b.data) {
193            assert!((x - y).abs() < 1e-12);
194        }
195    }
196
197    #[test]
198    fn zero_without_pseudocount_errors() {
199        assert!(ilr(&parse("\tf1\tf2\tf3\ns1\t0\t5\t1\n"), 0.0).is_err());
200    }
201
202    #[test]
203    fn pseudocount_admits_zero() {
204        let r = ilr(&parse("\tf1\tf2\tf3\ns1\t0\t5\t1\n"), 0.5).unwrap();
205        assert!(r.data.iter().all(|v| v.is_finite()));
206    }
207
208    #[test]
209    fn single_feature_errors() {
210        assert!(ilr(&parse("\tf1\ns1\t5\n"), 0.0).is_err());
211    }
212}