limma/
glsseries.rs

1//! `gls.series` (lmfit.R): gene-wise generalized least squares allowing for a
2//! known correlation between duplicate spots (`ndups`) or between samples that
3//! share a `block`. This is the fitting engine `lmFit` dispatches to whenever
4//! `ndups > 1` or `block` is supplied together with a `correlation` (typically
5//! the `consensus.correlation` from [`crate::duplicate_correlation`]).
6//!
7//! Two cormatrix constructions are reproduced:
8//!   * **duplicates** — `cormatrix = diag(correlation, narrays) ⊗ J(ndups)`
9//!     with unit diagonal, after `unwrapdups` folds the `ndups` spots of each
10//!     gene into extra columns and the design is replicated row-wise;
11//!   * **blocks** — `cormatrix[i,j] = correlation` when `block[i] == block[j]`,
12//!     unit diagonal.
13//!
14//! And both of limma's code paths:
15//!   * the fast multi-response `lm.fit` when every value is finite and no probe
16//!     weights are supplied (one shared `chol(V)` and design QR);
17//!   * the slow per-gene iteration when probe weights or missing values are
18//!     present (each gene re-derives `V` over its observed arrays).
19//!
20//! `cov.coefficients` is always the unweighted `(Xᵀ V⁻¹ X)⁻¹` of the full
21//! design (limma computes it from `chol(cormatrix)` even on the weighted path).
22//!
23//! Not reproduced (rare, documented): the array-weight fast path that scales `V`
24//! by `1/sqrt(arrayweights)` — array weights are uncommon with `gls.series`, and
25//! a full probe-weight matrix flows correctly through the slow path instead; and
26//! per-gene rank deficiency (an observed-array subset smaller than the number of
27//! coefficients), which limma resolves by pivoted column dropping.
28
29use anyhow::{bail, Result};
30use ndarray::{Array1, Array2};
31
32use crate::dups::{kron_rows, unwrapdups};
33use crate::fit::{new_fit, MArrayLM};
34use crate::linalg::{cholesky_upper, qr_econ, solve_upper, solve_upper_transpose, xtx_inv_from_r};
35
36/// NaN-skipping mean of an array view; `NaN` when nothing is finite.
37fn nanmean<'a, I: IntoIterator<Item = &'a f64>>(it: I) -> f64 {
38    let mut sum = 0.0;
39    let mut cnt = 0usize;
40    for &v in it {
41        if v.is_finite() {
42            sum += v;
43            cnt += 1;
44        }
45    }
46    if cnt > 0 {
47        sum / cnt as f64
48    } else {
49        f64::NAN
50    }
51}
52
53/// `Amean` exactly as `lmFit` would populate it: row means of the original
54/// expression matrix, reduced through `unwrapdups` when `ndups > 1`.
55fn gls_amean(exprs: &Array2<f64>, ndups: usize, spacing: usize) -> Array1<f64> {
56    let nspots = exprs.nrows();
57    let spotmean: Vec<f64> = (0..nspots).map(|i| nanmean(exprs.row(i))).collect();
58    if ndups <= 1 {
59        return Array1::from(spotmean);
60    }
61    let col = Array2::from_shape_vec((nspots, 1), spotmean).expect("column shape");
62    let uw = unwrapdups(&col, ndups, spacing.max(1));
63    (0..uw.nrows()).map(|g| nanmean(uw.row(g))).collect()
64}
65
66/// Apply `backsolve(u, b, transpose = TRUE)` column by column: solve `Uᵀ X = B`.
67fn backsolve_t_cols(u: &Array2<f64>, b: &Array2<f64>) -> Array2<f64> {
68    let (n, k) = b.dim();
69    let mut out = Array2::<f64>::zeros((n, k));
70    for j in 0..k {
71        let sol = solve_upper_transpose(u, &b.column(j).to_owned());
72        out.column_mut(j).assign(&sol);
73    }
74    out
75}
76
77/// Fit gene-wise linear models by generalized least squares. Port of limma's
78/// `gls.series`. `correlation` must be supplied (limma would otherwise call
79/// `duplicateCorrelation`); pass [`crate::duplicate_correlation`]'s consensus.
80///
81/// Exactly one of `ndups > 1` (with `block == None`) or `block == Some(..)` is
82/// the intended use; when `block` is given, `ndups`/`spacing` are forced to 1.
83/// `weights`, when supplied, is a full probe-weight matrix matching `exprs`.
84#[allow(clippy::too_many_arguments)]
85pub fn gls_series(
86    exprs: &Array2<f64>,
87    design: &Array2<f64>,
88    ndups: usize,
89    spacing: usize,
90    block: Option<&[i64]>,
91    correlation: f64,
92    weights: Option<&Array2<f64>>,
93    gene_names: Vec<String>,
94    coef_names: Vec<String>,
95) -> Result<MArrayLM> {
96    let narrays0 = exprs.ncols();
97    if design.nrows() != narrays0 {
98        bail!(
99            "design rows ({}) must match number of arrays ({})",
100            design.nrows(),
101            narrays0
102        );
103    }
104    if correlation.abs() >= 1.0 {
105        bail!("correlation is 1 or -1, so the model is degenerate");
106    }
107    let nbeta = design.ncols();
108
109    // Amean reflects the original data, before weight-driven NA masking below.
110    let amean = gls_amean(exprs, ndups, spacing);
111
112    // Clean weights: NA -> 0, then mask M and weights where weight < 1e-15.
113    let mut m = exprs.clone();
114    let mut wmat: Option<Array2<f64>> = None;
115    if let Some(w) = weights {
116        if w.dim() != exprs.dim() {
117            bail!("weights dimensions must match exprs");
118        }
119        let mut wc = w.clone();
120        for i in 0..wc.nrows() {
121            for j in 0..wc.ncols() {
122                if wc[[i, j]].is_nan() {
123                    wc[[i, j]] = 0.0;
124                }
125                if wc[[i, j]] < 1e-15 {
126                    m[[i, j]] = f64::NAN;
127                    wc[[i, j]] = f64::NAN;
128                }
129            }
130        }
131        wmat = Some(wc);
132    }
133
134    // Build cormatrix and, for the duplicates path, unwrap M/weights and expand
135    // the design so rows align with the Kronecker correlation blocks.
136    let (cormatrix, mwork, wwork, dwork) = match block {
137        None => {
138            let nd = if ndups < 2 { 1 } else { ndups };
139            let corr = if ndups < 2 { 0.0 } else { correlation };
140            let sp = spacing.max(1);
141            let mu = unwrapdups(&m, nd, sp);
142            let wu = wmat.as_ref().map(|w| unwrapdups(w, nd, sp));
143            let du = kron_rows(design, nd);
144            let n = du.nrows();
145            let mut cor = Array2::<f64>::zeros((n, n));
146            for i in 0..n {
147                for j in 0..n {
148                    if i / nd == j / nd {
149                        cor[[i, j]] = corr;
150                    }
151                }
152                cor[[i, i]] = 1.0;
153            }
154            (cor, mu, wu, du)
155        }
156        Some(b) => {
157            if b.len() != narrays0 {
158                bail!("length of block does not match number of arrays");
159            }
160            let n = narrays0;
161            let mut cor = Array2::<f64>::zeros((n, n));
162            for i in 0..n {
163                for j in 0..n {
164                    if b[i] == b[j] {
165                        cor[[i, j]] = correlation;
166                    }
167                }
168                cor[[i, i]] = 1.0;
169            }
170            (cor, m, wmat, design.clone())
171        }
172    };
173
174    let ngenes = mwork.nrows();
175    let n = mwork.ncols();
176
177    let all_finite = mwork.iter().all(|v| v.is_finite());
178    let no_probe_wts = all_finite && wwork.is_none();
179
180    let mut coefficients = Array2::<f64>::from_elem((ngenes, nbeta), f64::NAN);
181    let mut stdev_unscaled = Array2::<f64>::from_elem((ngenes, nbeta), f64::NAN);
182    let mut sigma = Array1::<f64>::from_elem(ngenes, f64::NAN);
183    let mut df_residual = Array1::<f64>::zeros(ngenes);
184
185    // cov.coefficients = (Xstar' Xstar)^-1 with Xstar = chol(cormatrix)^-T design
186    // (limma's unweighted full-design covariance, shared by both paths).
187    let cholv_full = cholesky_upper(&cormatrix);
188    let xstar_full = backsolve_t_cols(&cholv_full, &dwork);
189    let (qf, rf) = qr_econ(&xstar_full);
190    let cov_coefficients = xtx_inv_from_r(&rf);
191
192    if no_probe_wts {
193        // Fast path: one shared whitening transform, multi-response lm.fit.
194        let df = (n - nbeta) as f64;
195        let stdev_row: Vec<f64> = (0..nbeta)
196            .map(|j| cov_coefficients[[j, j]].sqrt())
197            .collect();
198        for g in 0..ngenes {
199            let zstar = solve_upper_transpose(&cholv_full, &mwork.row(g).to_owned());
200            let qty = qf.t().dot(&zstar);
201            let beta = solve_upper(&rf, &qty);
202            for j in 0..nbeta {
203                coefficients[[g, j]] = beta[j];
204                stdev_unscaled[[g, j]] = stdev_row[j];
205            }
206            df_residual[g] = df;
207            if df > 0.0 {
208                let ssy: f64 = zstar.iter().map(|&v| v * v).sum();
209                let ssfit: f64 = qty.iter().map(|&v| v * v).sum();
210                sigma[g] = ((ssy - ssfit).max(0.0) / df).sqrt();
211            }
212        }
213    } else {
214        // Slow path: re-derive V over each gene's observed arrays.
215        for g in 0..ngenes {
216            let yfull = mwork.row(g);
217            let obs: Vec<usize> = (0..n).filter(|&k| yfull[k].is_finite()).collect();
218            let nobs = obs.len();
219            if nobs == 0 {
220                continue;
221            }
222            let mut xsub = Array2::<f64>::zeros((nobs, nbeta));
223            let mut vsub = Array2::<f64>::zeros((nobs, nobs));
224            for (ri, &ki) in obs.iter().enumerate() {
225                for j in 0..nbeta {
226                    xsub[[ri, j]] = dwork[[ki, j]];
227                }
228                for (cj, &kj) in obs.iter().enumerate() {
229                    vsub[[ri, cj]] = cormatrix[[ki, kj]];
230                }
231            }
232            if let Some(w) = &wwork {
233                let wrs: Vec<f64> = obs.iter().map(|&k| 1.0 / w[[g, k]].sqrt()).collect();
234                for ri in 0..nobs {
235                    for cj in 0..nobs {
236                        vsub[[ri, cj]] *= wrs[ri] * wrs[cj];
237                    }
238                }
239            }
240            let cholv = cholesky_upper(&vsub);
241            let yobs: Array1<f64> = obs.iter().map(|&k| yfull[k]).collect();
242            let ystar = solve_upper_transpose(&cholv, &yobs);
243
244            if xsub.iter().all(|&v| v == 0.0) {
245                df_residual[g] = nobs as f64;
246                let ss: f64 = ystar.iter().map(|&v| v * v).sum();
247                sigma[g] = (ss / nobs as f64).sqrt();
248                continue;
249            }
250            if nobs < nbeta {
251                // Rank-deficient gene (fewer observed arrays than coefficients);
252                // limma drops columns by pivoting — left as non-estimable here.
253                continue;
254            }
255            let xstar = backsolve_t_cols(&cholv, &xsub);
256            let (q, r) = qr_econ(&xstar);
257            let qty = q.t().dot(&ystar);
258            let beta = solve_upper(&r, &qty);
259            let xtx_inv = xtx_inv_from_r(&r);
260            for j in 0..nbeta {
261                coefficients[[g, j]] = beta[j];
262                stdev_unscaled[[g, j]] = xtx_inv[[j, j]].sqrt();
263            }
264            let df = (nobs - nbeta) as f64;
265            df_residual[g] = df;
266            if df > 0.0 {
267                let ssy: f64 = ystar.iter().map(|&v| v * v).sum();
268                let ssfit: f64 = qty.iter().map(|&v| v * v).sum();
269                sigma[g] = ((ssy - ssfit).max(0.0) / df).sqrt();
270            }
271        }
272    }
273
274    let gnames: Vec<String> = if gene_names.len() == ngenes {
275        gene_names
276    } else {
277        (0..ngenes).map(|i| i.to_string()).collect()
278    };
279    let cnames: Vec<String> = if coef_names.len() == nbeta {
280        coef_names
281    } else {
282        (0..nbeta).map(|j| format!("coef{j}")).collect()
283    };
284
285    let mut fit = new_fit(
286        coefficients,
287        stdev_unscaled,
288        sigma,
289        df_residual,
290        cov_coefficients,
291        &gnames,
292        &cnames,
293    );
294    fit.amean = amean;
295    fit.design = Some(dwork);
296    Ok(fit)
297}
298
299#[cfg(test)]
300#[allow(clippy::excessive_precision)]
301mod tests {
302    use super::*;
303
304    fn rclose(a: f64, b: f64) -> bool {
305        (a - b).abs() <= 1e-7 * (1.0 + b.abs())
306    }
307
308    fn names(n: usize, p: usize) -> (Vec<String>, Vec<String>) {
309        (
310            (0..n).map(|i| format!("g{i}")).collect(),
311            (0..p).map(|j| format!("c{j}")).collect(),
312        )
313    }
314
315    /// Scenario 1 fixture: 8 spots x 6 arrays, ndups=2 (-> 4 genes).
316    fn dup_fixture() -> Array2<f64> {
317        let group = [0.0, 0.0, 0.0, 1.0, 1.0, 1.0];
318        let mut m = Array2::<f64>::zeros((8, 6));
319        for g0 in 0..8 {
320            let base = ((g0 as i64) % 7) - 3;
321            let extra = ((g0 as i64 * 5) % 11) - 5;
322            let lvl = 5.0 + (g0 % 10) as f64 * 0.2;
323            let eff = base as f64 * 0.25;
324            for k0 in 0..6 {
325                let noise = (((g0 as i64 * 13 + k0 as i64 * 17) % 19) - 9) as f64 * 0.04;
326                m[[g0, k0]] = lvl + eff * group[k0] + extra as f64 * 0.05 + noise;
327            }
328        }
329        m
330    }
331
332    fn dup_design() -> Array2<f64> {
333        let group = [0.0, 0.0, 0.0, 1.0, 1.0, 1.0];
334        let mut d = Array2::<f64>::zeros((6, 2));
335        for k0 in 0..6 {
336            d[[k0, 0]] = 1.0;
337            d[[k0, 1]] = group[k0];
338        }
339        d
340    }
341
342    /// Scenario 2/3 fixture: 6 genes x 8 arrays, block path.
343    fn block_fixture() -> Array2<f64> {
344        let mut m = Array2::<f64>::zeros((6, 8));
345        for g0 in 0..6 {
346            let lvl2 = 4.0 + (g0 % 5) as f64 * 0.3;
347            let slope = (((g0 as i64 * 2) % 7) - 3) as f64 * 0.15;
348            for k0 in 0..8 {
349                let x = (k0 % 3) as f64;
350                let noise2 = (((g0 as i64 * 11 + k0 as i64 * 5) % 17) - 8) as f64 * 0.05;
351                m[[g0, k0]] = lvl2 + slope * x + noise2;
352            }
353        }
354        m
355    }
356
357    fn block_design() -> Array2<f64> {
358        let mut d = Array2::<f64>::zeros((8, 2));
359        for k0 in 0..8 {
360            d[[k0, 0]] = 1.0;
361            d[[k0, 1]] = (k0 % 3) as f64;
362        }
363        d
364    }
365
366    #[test]
367    fn gls_series_ndups_fast_matches_r() {
368        let m = dup_fixture();
369        let (gn, cn) = names(4, 2);
370        let fit = gls_series(&m, &dup_design(), 2, 1, None, 0.4, None, gn, cn).unwrap();
371
372        let coef = [
373            [5.048333333333332, -0.73833333333333273],
374            [5.5733333333333341, 0.015000000000000385],
375            [5.9500000000000011, 0.2616666666666671],
376            [6.3266666666666627, 0.013333333333335673],
377        ];
378        let sig = [
379            0.38390381980014643,
380            0.30262541674013937,
381            0.27637319489621803,
382            0.94559882765216252,
383        ];
384        let amean = [
385            4.6791666666666671,
386            5.5808333333333335,
387            6.0808333333333335,
388            6.3333333333333339,
389        ];
390        for g in 0..4 {
391            assert!(rclose(fit.coefficients[[g, 0]], coef[g][0]), "coef0 g{g}");
392            assert!(rclose(fit.coefficients[[g, 1]], coef[g][1]), "coef1 g{g}");
393            assert!(rclose(fit.sigma[g], sig[g]), "sigma g{g}");
394            assert!(rclose(fit.amean[g], amean[g]), "amean g{g}");
395            assert!(rclose(fit.stdev_unscaled[[g, 0]], 0.48304589153964794));
396            assert!(rclose(fit.stdev_unscaled[[g, 1]], 0.68313005106397318));
397            assert_eq!(fit.df_residual[g], 10.0);
398        }
399        assert!(rclose(fit.cov_coefficients[[0, 0]], 0.23333333333333331));
400        assert!(rclose(fit.cov_coefficients[[0, 1]], -0.23333333333333331));
401        assert!(rclose(fit.cov_coefficients[[1, 1]], 0.46666666666666662));
402    }
403
404    #[test]
405    fn gls_series_block_fast_matches_r() {
406        let m = block_fixture();
407        let block = [0i64, 0, 1, 1, 2, 2, 3, 3];
408        let (gn, cn) = names(6, 2);
409        let fit = gls_series(&m, &block_design(), 1, 1, Some(&block), 0.3, None, gn, cn).unwrap();
410
411        let coef = [
412            [3.9896825396825411, -0.5024943310657598],
413            [4.461904761904762, -0.25646258503401381],
414            [4.340476190476191, 0.42517006802721086],
415            [5.0690476190476206, 0.25680272108843544],
416            [5.3119047619047643, -0.40646258503401367],
417            [3.6904761904761902, 0.27517006802721089],
418        ];
419        let sig = [
420            0.31011362178297863,
421            0.26676429774418003,
422            0.078484562609406242,
423            0.28938639706404889,
424            0.26676429774418015,
425            0.078484562609406339,
426        ];
427        let amean = [
428            3.5499999999999998,
429            4.2374999999999998,
430            4.7124999999999995,
431            5.2937500000000002,
432            4.9562500000000007,
433            3.9312499999999999,
434        ];
435        for g in 0..6 {
436            assert!(rclose(fit.coefficients[[g, 0]], coef[g][0]), "coef0 g{g}");
437            assert!(rclose(fit.coefficients[[g, 1]], coef[g][1]), "coef1 g{g}");
438            assert!(rclose(fit.sigma[g], sig[g]), "sigma g{g}");
439            assert!(rclose(fit.amean[g], amean[g]), "amean g{g}");
440            assert!(rclose(fit.stdev_unscaled[[g, 0]], 0.53748384988657005));
441            assert!(rclose(fit.stdev_unscaled[[g, 1]], 0.40629960014669614));
442            assert_eq!(fit.df_residual[g], 6.0);
443        }
444        assert!(rclose(fit.cov_coefficients[[0, 0]], 0.28888888888888903));
445        assert!(rclose(fit.cov_coefficients[[0, 1]], -0.14444444444444454));
446        assert!(rclose(fit.cov_coefficients[[1, 1]], 0.16507936507936516));
447    }
448
449    #[test]
450    fn gls_series_block_slow_one_na_matches_r() {
451        let mut m = block_fixture();
452        m[[2, 5]] = f64::NAN; // 0-indexed gene 2, array 5 -> triggers slow path
453        let block = [0i64, 0, 1, 1, 2, 2, 3, 3];
454        let (gn, cn) = names(6, 2);
455        let fit = gls_series(&m, &block_design(), 1, 1, Some(&block), 0.3, None, gn, cn).unwrap();
456
457        // Affected gene 2: one observation dropped (df 6 -> 5).
458        assert_eq!(fit.df_residual[2], 5.0);
459        assert!(rclose(fit.coefficients[[2, 0]], 4.3380116959064319));
460        assert!(rclose(fit.coefficients[[2, 1]], 0.43538011695906442));
461        assert!(rclose(fit.sigma[2], 0.083551863073296567));
462        assert!(rclose(fit.stdev_unscaled[[2, 0]], 0.54022713197881478));
463        assert!(rclose(fit.stdev_unscaled[[2, 1]], 0.46456642400542719));
464        assert!(rclose(fit.amean[2], 4.6499999999999995));
465
466        // Untouched genes match the fast-path result exactly.
467        assert_eq!(fit.df_residual[0], 6.0);
468        assert!(rclose(fit.coefficients[[0, 0]], 3.9896825396825411));
469        assert!(rclose(fit.sigma[0], 0.31011362178297869));
470        assert!(rclose(fit.coefficients[[5, 1]], 0.27517006802721089));
471        assert!(rclose(fit.sigma[5], 0.078484562609406339));
472
473        // cov.coefficients still the unweighted full-design covariance.
474        assert!(rclose(fit.cov_coefficients[[0, 0]], 0.28888888888888903));
475        assert!(rclose(fit.cov_coefficients[[1, 1]], 0.16507936507936516));
476    }
477}
limma/glsseries.rs

limma/
glsseries.rs