rsomics-ilr 0.1.0

Isometric log-ratio (ILR) transform of a composition table — scikit-bio skbio.stats.composition.ilr equivalent using the default Egozcue/Gram-Schmidt orthonormal basis
Documentation
use std::io::{BufRead, Write};

use rayon::prelude::*;
use rsomics_common::{Result, RsomicsError};

mod table;

pub use table::Table;

pub struct IlrMatrix {
    pub samples: Vec<String>,
    pub coords: Vec<String>,
    pub data: Vec<f64>,
}

/// Per-balance scale factor of skbio's default Egozcue/Gram-Schmidt basis:
/// row `k` (i = k+1) carries the weight `sqrt(i/(i+1))`, the only data the
/// triangular basis needs since its nonzero pattern is fixed.
fn basis_scales(d: usize) -> Vec<f64> {
    (1..d)
        .map(|i| (i as f64 / (i as f64 + 1.0)).sqrt())
        .collect()
}

/// Isometric log-ratio transform (Egozcue et al. 2003), matching
/// `skbio.stats.composition.ilr` with its default basis: `ilr(x) = clr(x) · Vᵀ`
/// where `V` is the (D-1)×D Gram-Schmidt orthonormal basis.
///
/// The default basis is lower-triangular, so the dense `clr·Vᵀ` matmul collapses
/// to one prefix-sum pass per sample: with clr row `c` and `P[i]=Σ_{t<i} c[t]`,
/// the k-th coordinate (i=k+1) is `sqrt(i/(i+1))·(P[i]/i − c[i])`. That is O(D)
/// per sample rather than the O(D²) numpy tensordot.
///
/// `pseudocount` is added to every value before the log; with the default 0 the
/// input must be strictly positive, as skbio requires (`nozero=True`).
///
/// # Errors
/// A non-positive value after the pseudocount (log undefined), or a non-finite
/// input — rejected as in skbio. Fewer than two features, where ilr is undefined.
pub fn ilr(table: &Table, pseudocount: f64) -> Result<IlrMatrix> {
    let d = table.n_features();
    if d < 2 {
        return Err(RsomicsError::InvalidInput(format!(
            "ilr needs at least 2 features, got {d}"
        )));
    }
    let inv_d = 1.0 / d as f64;
    let scales = basis_scales(d);

    let mut data = vec![0.0_f64; table.n_samples() * (d - 1)];
    let bad: Option<f64> = data
        .par_chunks_mut(d - 1)
        .zip(table.data.par_chunks(d))
        .map(|(out, row)| {
            let mut clr = Vec::with_capacity(d);
            let mut mean = 0.0;
            for &x in row {
                let v = x + pseudocount;
                if v <= 0.0 || !v.is_finite() {
                    return Some(x);
                }
                let l = v.ln();
                clr.push(l);
                mean += l;
            }
            mean *= inv_d;

            let mut prefix = 0.0;
            for (k, o) in out.iter_mut().enumerate() {
                let i = k + 1;
                prefix += clr[k] - mean;
                *o = scales[k] * (prefix / i as f64 - (clr[i] - mean));
            }
            None
        })
        .reduce(|| None, |a, b| a.or(b));

    if let Some(v) = bad {
        return Err(RsomicsError::InvalidInput(format!(
            "value {v} is non-positive after pseudocount {pseudocount} — ilr's log is undefined (skbio requires strictly positive data)"
        )));
    }

    let coords = (0..d - 1).map(|k| format!("ilr{k}")).collect();
    Ok(IlrMatrix {
        samples: table.samples.clone(),
        coords,
        data,
    })
}

impl IlrMatrix {
    /// Write the transformed matrix as a TSV: header of ilr coordinate IDs then
    /// one `sample_id<delim>value...` line per sample.
    ///
    /// # Errors
    /// Propagates write errors.
    pub fn write_tsv<W: Write>(&self, mut out: W, delim: char) -> Result<()> {
        let k = self.coords.len();
        let mut line = String::with_capacity(16 * (k + 1));
        line.push_str("sample");
        for c in &self.coords {
            line.push(delim);
            line.push_str(c);
        }
        line.push('\n');
        out.write_all(line.as_bytes()).map_err(RsomicsError::Io)?;

        let mut buf = ryu_like::Buffer::new();
        for (i, sample) in self.samples.iter().enumerate() {
            line.clear();
            line.push_str(sample);
            for &v in &self.data[i * k..(i + 1) * k] {
                line.push(delim);
                line.push_str(buf.format(v));
            }
            line.push('\n');
            out.write_all(line.as_bytes()).map_err(RsomicsError::Io)?;
        }
        Ok(())
    }
}

/// std's `{}` already emits the shortest decimal that round-trips; reuse one
/// growable buffer to avoid a per-value allocation.
mod ryu_like {
    use std::fmt::Write;

    pub struct Buffer {
        s: String,
    }

    impl Buffer {
        pub fn new() -> Self {
            Self {
                s: String::with_capacity(24),
            }
        }

        pub fn format(&mut self, v: f64) -> &str {
            self.s.clear();
            let _ = write!(self.s, "{v}");
            &self.s
        }
    }
}

/// # Errors
/// Propagates parse, compute, and write errors.
pub fn run<W: Write>(
    table_reader: impl BufRead,
    out: W,
    delim: char,
    pseudocount: f64,
) -> Result<()> {
    let table = Table::parse(table_reader, delim)?;
    let res = ilr(&table, pseudocount)?;
    res.write_tsv(out, delim)
}

#[cfg(test)]
mod tests {
    use super::*;

    fn parse(txt: &str) -> Table {
        Table::parse(txt.as_bytes(), '\t').unwrap()
    }

    #[test]
    fn docstring_example() {
        // skbio: ilr([.1,.3,.4,.2]) == [-0.7768362, -0.68339802, 0.11704769]
        let t = parse("\tf1\tf2\tf3\tf4\ns1\t0.1\t0.3\t0.4\t0.2\n");
        let r = ilr(&t, 0.0).unwrap();
        let want = [-0.7768362, -0.68339802, 0.11704769];
        assert_eq!(r.data.len(), 3);
        for (g, w) in r.data.iter().zip(want) {
            assert!((g - w).abs() < 1e-7, "{g} vs {w}");
        }
    }

    #[test]
    fn output_width_is_d_minus_one() {
        let r = ilr(&parse("\tf1\tf2\tf3\ns1\t5\t1\t9\ns2\t2\t8\t4\n"), 0.0).unwrap();
        assert_eq!(r.coords.len(), 2);
        assert_eq!(r.data.len(), 4);
    }

    #[test]
    fn scale_invariant() {
        let a = ilr(&parse("\tf1\tf2\tf3\tf4\ns1\t0.1\t0.3\t0.4\t0.2\n"), 0.0).unwrap();
        let b = ilr(&parse("\tf1\tf2\tf3\tf4\ns1\t1\t3\t4\t2\n"), 0.0).unwrap();
        for (x, y) in a.data.iter().zip(&b.data) {
            assert!((x - y).abs() < 1e-12);
        }
    }

    #[test]
    fn zero_without_pseudocount_errors() {
        assert!(ilr(&parse("\tf1\tf2\tf3\ns1\t0\t5\t1\n"), 0.0).is_err());
    }

    #[test]
    fn pseudocount_admits_zero() {
        let r = ilr(&parse("\tf1\tf2\tf3\ns1\t0\t5\t1\n"), 0.5).unwrap();
        assert!(r.data.iter().all(|v| v.is_finite()));
    }

    #[test]
    fn single_feature_errors() {
        assert!(ilr(&parse("\tf1\ns1\t5\n"), 0.0).is_err());
    }
}