limma-rust 0.1.0

Pure-Rust port of the Bioconductor limma differential-expression package
Documentation
//! Natural cubic spline basis, a port of `splines::ns(x, df, intercept=TRUE)`
//! (R's `splines` package) and the B-spline machinery (`splineDesign` / de Boor)
//! it needs. Used by the trended variance moderation in `eBayes` (`fitFDist`
//! with a covariate).
//!
//! The downstream consumer only uses the basis through a least-squares fit, so
//! the *exact* basis vectors do not matter — only the column space does. We
//! therefore reproduce R's knot placement exactly (boundary knots at the data
//! range, interior knots at type-7 quantiles) and span the same natural-cubic-
//! spline subspace, but parameterise it with our own orthonormal null-space
//! basis rather than R's. Projection onto a subspace is basis-independent, so
//! the fitted values and residual sum of squares match R to floating-point
//! precision regardless.
//!
//! [`NsBasis`] keeps that null-space transform so the fitted spline can be
//! re-evaluated at new covariate points (R's `predict.ns`). A natural spline is
//! a *unique* function — linear beyond the boundary knots — so its value at a
//! new point is likewise basis-independent: evaluating our basis there and
//! contracting with our fit coefficients reproduces R's prediction exactly. This
//! drives the `notallok` branch of trended `fitFDist`, which extrapolates the
//! trend to genes excluded from the fit.

use ndarray::{s, Array2};

use crate::ebayes::quantile_type7;
use crate::linalg::qr_full_q;

/// One row of the order-`ord` B-spline basis (or its `deriv`-th derivative,
/// `deriv` in `{0,1,2}`) evaluated at `xv` on knot vector `t`. The returned
/// vector has length `t.len() - ord` (the number of B-splines).
///
/// Cox–de Boor value recurrence, with the standard derivative recurrence applied
/// once or twice. At the global right boundary (`xv == max knot`) the order-1
/// indicator is taken as the left limit so the clamped basis evaluates to the
/// endpoint value, matching `splines::splineDesign`.
fn bspline_row(t: &[f64], xv: f64, ord: usize, deriv: usize) -> Vec<f64> {
    let m = t.len();

    // Order-1 (piecewise-constant) basis, right-continuous on [t_i, t_{i+1}).
    let mut tables: Vec<Vec<f64>> = Vec::with_capacity(ord);
    let mut n1 = vec![0.0_f64; m - 1];
    for i in 0..(m - 1) {
        if t[i] <= xv && xv < t[i + 1] {
            n1[i] = 1.0;
        }
    }
    // Left limit at the global right boundary knot.
    if xv >= t[m - 1] {
        for i in (0..(m - 1)).rev() {
            if t[i] < t[i + 1] {
                n1[i] = 1.0;
                break;
            }
        }
    }
    tables.push(n1);

    // Raise to order `ord` by the Cox–de Boor recurrence.
    for o in 2..=ord {
        let prev = &tables[o - 2];
        let len = m - o;
        let mut no = vec![0.0_f64; len];
        for j in 0..len {
            let d1 = t[j + o - 1] - t[j];
            let a1 = if d1 != 0.0 { (xv - t[j]) / d1 } else { 0.0 };
            let d2 = t[j + o] - t[j + 1];
            let a2 = if d2 != 0.0 { (t[j + o] - xv) / d2 } else { 0.0 };
            no[j] = a1 * prev[j] + a2 * prev[j + 1];
        }
        tables.push(no);
    }

    match deriv {
        0 => tables[ord - 1].clone(),
        1 => {
            // d/dx B_{j,ord} = (ord-1) [ B_{j,ord-1}/(t_{j+ord-1}-t_j)
            //                          - B_{j+1,ord-1}/(t_{j+ord}-t_{j+1}) ]
            let prev = &tables[ord - 2];
            deriv_combine(t, prev, ord)
        }
        2 => {
            // First derivative of the order-(ord-1) basis, then differentiate
            // once more with the order-`ord` weights.
            let lower = &tables[ord - 3];
            let d1 = deriv_combine(t, lower, ord - 1);
            deriv_combine(t, &d1, ord)
        }
        _ => panic!("bspline_row: deriv must be 0, 1 or 2"),
    }
}

/// Apply one step of the B-spline derivative recurrence at order `ord` to a
/// vector `prev` of order-(ord-1) basis values (or derivatives). Returns a
/// vector of length `t.len() - ord`.
fn deriv_combine(t: &[f64], prev: &[f64], ord: usize) -> Vec<f64> {
    let m = t.len();
    let len = m - ord;
    let scale = (ord - 1) as f64;
    let mut out = vec![0.0_f64; len];
    for j in 0..len {
        let d1 = t[j + ord - 1] - t[j];
        let term1 = if d1 != 0.0 { prev[j] / d1 } else { 0.0 };
        let d2 = t[j + ord] - t[j + 1];
        let term2 = if d2 != 0.0 { prev[j + 1] / d2 } else { 0.0 };
        out[j] = scale * (term1 - term2);
    }
    out
}

/// `splines::splineDesign(knots, x, ord, derivs=deriv)`: the order-`ord` B-spline
/// basis (or `deriv`-th derivative) at each point of `x`. Shape `x.len() x
/// (knots.len() - ord)`.
fn spline_design(knots: &[f64], x: &[f64], ord: usize, deriv: usize) -> Array2<f64> {
    let nb = knots.len() - ord;
    let mut out = Array2::<f64>::zeros((x.len(), nb));
    for (r, &xv) in x.iter().enumerate() {
        let row = bspline_row(knots, xv, ord, deriv);
        for (c, val) in row.into_iter().enumerate() {
            out[[r, c]] = val;
        }
    }
    out
}

/// A fitted natural-cubic-spline basis: the clamped knot vector, boundary knots,
/// and the null-space transform mapping the order-4 B-spline pre-basis to the
/// natural-spline basis. Spans the same column space as `splines::ns(x, df=df,
/// intercept=TRUE)` and can be re-evaluated at new covariate points (cf. R's
/// `predict.ns`), including linear extrapolation beyond the boundary knots.
pub(crate) struct NsBasis {
    /// Augmented (clamped) knot vector: ord-fold boundary knots plus interior.
    aknots: Vec<f64>,
    /// Boundary knots = the fitting data range.
    xmin: f64,
    xmax: f64,
    /// `(df+2) x df` transform from the order-4 B-spline pre-basis to the
    /// natural-spline basis (trailing columns of the constraint null space).
    nbasis: Array2<f64>,
}

impl NsBasis {
    /// Fit the basis to `x` with `df` degrees of freedom (`df >= 2`, giving
    /// `df - 2` interior knots). Boundary knots are the data range; interior
    /// knots are type-7 quantiles at the evenly spaced probabilities
    /// `i/(df-1)` for `i = 1..df-2`.
    pub(crate) fn fit(x: &[f64], df: usize) -> NsBasis {
        assert!(df >= 2, "NsBasis requires df >= 2");
        let xmin = x.iter().cloned().fold(f64::INFINITY, f64::min);
        let xmax = x.iter().cloned().fold(f64::NEG_INFINITY, f64::max);

        let n_iknots = df - 2;
        let iknots: Vec<f64> = if n_iknots > 0 {
            let probs: Vec<f64> = (1..=n_iknots)
                .map(|i| i as f64 / (n_iknots as f64 + 1.0))
                .collect();
            let mut xs = x.to_vec();
            xs.sort_by(|a, b| a.partial_cmp(b).unwrap());
            quantile_type7(&xs, &probs)
        } else {
            Vec::new()
        };

        let ord = 4usize;
        let mut aknots: Vec<f64> = Vec::with_capacity(2 * ord + n_iknots);
        aknots.extend(std::iter::repeat_n(xmin, ord));
        aknots.extend_from_slice(&iknots);
        aknots.extend(std::iter::repeat_n(xmax, ord));
        aknots.sort_by(|a, b| a.partial_cmp(b).unwrap());

        // Natural boundary constraint: second derivative zero at the boundary
        // knots. Its null space (trailing columns of Q from the QR of its
        // transpose) spans the natural cubic spline subspace.
        let constraint = spline_design(&aknots, &[xmin, xmax], ord, 2); // 2 x (df+2)
        let ct = constraint.t().to_owned(); // (df+2) x 2
        let q = qr_full_q(&ct); // (df+2) x (df+2)
        let nb = ct.nrows();
        let nbasis = q.slice(s![.., 2..nb]).to_owned(); // (df+2) x df

        NsBasis {
            aknots,
            xmin,
            xmax,
            nbasis,
        }
    }

    /// Evaluate the natural-spline basis at each point of `newx`, returning a
    /// `newx.len() x df` matrix. Points inside `[xmin, xmax]` use the clamped
    /// B-spline basis directly; points outside are extrapolated linearly from
    /// the value and first derivative at the nearest boundary knot, matching R's
    /// `predict.ns`. Because the same null-space transform is reused, evaluating
    /// at new points gives the same fitted function as R regardless of the
    /// (basis-dependent) coefficients.
    pub(crate) fn eval(&self, newx: &[f64]) -> Array2<f64> {
        let ord = 4usize;
        let npre = self.aknots.len() - ord; // df + 2
        let mut pre = Array2::<f64>::zeros((newx.len(), npre));
        for (r, &xv) in newx.iter().enumerate() {
            let row = if xv < self.xmin {
                self.extrapolate_row(self.xmin, xv)
            } else if xv > self.xmax {
                self.extrapolate_row(self.xmax, xv)
            } else {
                bspline_row(&self.aknots, xv, ord, 0)
            };
            for (c, val) in row.into_iter().enumerate() {
                pre[[r, c]] = val;
            }
        }
        pre.dot(&self.nbasis) // newx.len() x df
    }

    /// Linear extrapolation of the order-4 pre-basis from boundary knot `pivot`
    /// out to `xv`: `value(pivot) + (xv - pivot) * value'(pivot)`, component-wise.
    fn extrapolate_row(&self, pivot: f64, xv: f64) -> Vec<f64> {
        let ord = 4usize;
        let val = bspline_row(&self.aknots, pivot, ord, 0);
        let der = bspline_row(&self.aknots, pivot, ord, 1);
        let dx = xv - pivot;
        val.iter()
            .zip(der.iter())
            .map(|(&v, &d)| v + dx * d)
            .collect()
    }
}

#[cfg(test)]
mod tests {
    use super::*;
    use crate::linalg::{qr_econ, solve_upper};
    use ndarray::Array1;

    /// Natural cubic spline design at the fitting points: `NsBasis::fit` then
    /// `eval` at `x`. Spans the same column space as `splines::ns(x, df,
    /// intercept=TRUE)`.
    fn ns_design(x: &[f64], df: usize) -> Array2<f64> {
        NsBasis::fit(x, df).eval(x)
    }

    /// Project `y` onto `col(design)` and return (fitted, residual sum of
    /// squares). Basis-independent, so it equals R's `lm.fit` up to f.p. error.
    fn project(design: &Array2<f64>, y: &[f64]) -> (Vec<f64>, f64) {
        let yv = Array1::from(y.to_vec());
        let (q, r) = qr_econ(design);
        let qty = q.t().dot(&yv);
        let coef = solve_upper(&r, &qty);
        let fitted = design.dot(&coef);
        let rss: f64 = yv
            .iter()
            .zip(fitted.iter())
            .map(|(a, b)| (a - b) * (a - b))
            .sum();
        (fitted.to_vec(), rss)
    }

    /// At the boundary knots a clamped natural-spline basis interpolates the
    /// endpoints: the constant function is in the span, and the basis sums to a
    /// finite, well-conditioned set (no NaNs/Infs anywhere).
    #[test]
    fn ns_basis_is_finite_and_spans_constants() {
        let x: Vec<f64> = (0..50).map(|i| (i as f64) * 0.2 - 3.0).collect();
        let d = ns_design(&x, 4);
        assert_eq!(d.nrows(), 50);
        assert_eq!(d.ncols(), 4);
        assert!(d.iter().all(|v| v.is_finite()));
        // The constant vector lies in the column space, so regressing 1 on the
        // basis reproduces it with ~zero residual.
        let ones = vec![1.0_f64; 50];
        let (_fit, rss) = project(&d, &ones);
        assert!(rss < 1e-18, "constant not in span, rss={rss}");
    }

    /// A cubic polynomial restricted between the boundary knots lies in the
    /// natural-spline space only approximately, but a *linear* function lies in
    /// it exactly (natural splines are linear past the boundary and a line is
    /// its own natural extension). Regressing a line reproduces it exactly.
    #[test]
    fn ns_reproduces_linear_functions() {
        let x: Vec<f64> = (0..40).map(|i| (i as f64).powf(1.3) * 0.1).collect();
        let d = ns_design(&x, 5);
        assert_eq!(d.ncols(), 5);
        let y: Vec<f64> = x.iter().map(|&v| 2.5 * v - 0.7).collect();
        let (fit, rss) = project(&d, &y);
        assert!(rss < 1e-16, "line not reproduced, rss={rss}");
        for (a, b) in fit.iter().zip(y.iter()) {
            assert!((a - b).abs() < 1e-8);
        }
    }

    /// Reference values from R:
    /// `splineDesign(c(0,0,0,0,1,2,3,3,3,3), c(0,1.5,3), 4)` — the order-4
    /// B-spline basis on a clamped knot vector with two interior knots,
    /// evaluated at the left boundary, an interior point, and the right
    /// boundary. Confirms the de Boor recurrence and the right-boundary limit.
    #[test]
    fn spline_design_matches_r_reference() {
        let knots = [0.0, 0.0, 0.0, 0.0, 1.0, 2.0, 3.0, 3.0, 3.0, 3.0];
        let b = spline_design(&knots, &[0.0, 1.5, 3.0], 4, 0);
        // R: row 1 = (1,0,0,0,0,0); row 3 = (0,0,0,0,0,1).
        assert!((b[[0, 0]] - 1.0).abs() < 1e-12);
        assert!(b.row(0).iter().skip(1).all(|&v| v.abs() < 1e-12));
        assert!((b[[2, 5]] - 1.0).abs() < 1e-12);
        assert!(b.row(2).iter().take(5).all(|&v| v.abs() < 1e-12));
        // Each row of a B-spline basis sums to 1 (partition of unity).
        for r in 0..3 {
            let s: f64 = b.row(r).sum();
            assert!((s - 1.0).abs() < 1e-12, "row {r} sums to {s}");
        }
        // R: splineDesign(...,1.5,...) = (0, 0.03125, 0.46875, 0.46875, 0.03125,
        // 0) — symmetric about the midpoint 1.5 of the [0,3] knot layout.
        assert!((b[[1, 1]] - 0.031_25).abs() < 1e-12, "{}", b[[1, 1]]);
        assert!((b[[1, 2]] - 0.468_75).abs() < 1e-12, "{}", b[[1, 2]]);
        assert!((b[[1, 3]] - 0.468_75).abs() < 1e-12, "{}", b[[1, 3]]);
        assert!((b[[1, 4]] - 0.031_25).abs() < 1e-12, "{}", b[[1, 4]]);
    }
}