Skip to main content

limma/
genas.rs

1//! Genuine association of gene expression profiles. Port of limma's `genas`
2//! (`genas.R`).
3//!
4//! Given a linear-model fit with at least two coefficients, `genas` separates
5//! the *technical* correlation between two coefficient estimates (induced by a
6//! shared design, read straight off `cov.coefficients`) from their *biological*
7//! correlation across genes. The biological covariance `V0` is estimated by
8//! maximizing a bivariate moderated-t likelihood: a null fit with `V0` diagonal
9//! and a full fit with an off-diagonal term, both optimized by Nelder–Mead
10//! ([`crate::optim::nelder_mead`], a faithful port of R's `optim`). A
11//! likelihood-ratio statistic and its `chi^2_1` p-value test for non-zero
12//! biological correlation.
13//!
14//! All of limma's gene-subset selectors are supported ([`GenasSubset`]): the
15//! default `all`, the differential-expression filters `Fpval`/`p.union`/`p.int`,
16//! and the fold-change filters `logFC`/`predFC`. (R's fold-change branches read
17//! as if they soft-threshold the coefficients, but that write is inert — see
18//! [`which_genes`] — so all selectors refit on the raw coefficients.) As in R,
19//! the fit must come from an
20//! unweighted, no-missing least-squares fit so that a single common
21//! `cov.coefficients` applies to all genes, and it must already have been run
22//! through `eBayes`.
23
24use anyhow::{anyhow, bail, Result};
25use ndarray::{Array1, Array2};
26use statrs::distribution::{ContinuousCDF, Normal};
27
28use crate::classifytestsf::classify_tests_fstat;
29use crate::ebayes::ebayes;
30use crate::fit::MArrayLM;
31use crate::fitgamma::fit_gamma_intercept;
32use crate::linalg::cov2cor;
33use crate::optim::nelder_mead;
34use crate::predfcm::pred_fcm;
35use crate::proptruenull::{prop_true_null, PropTrueNullMethod};
36use crate::special::{chi2_sf, f_sf};
37
38/// Gene subset used by [`genas`] (`genas`'s `subset` argument).
39#[derive(Clone, Copy, Debug, PartialEq, Eq)]
40pub enum GenasSubset {
41    /// `"all"`: every gene contributes (the default).
42    All,
43    /// `"Fpval"`: genes ranked into the estimated DE proportion by `F.p.value`.
44    Fpval,
45    /// `"p.union"`: union of the per-coefficient DE-ranked genes.
46    PUnion,
47    /// `"p.int"`: intersection of the per-coefficient DE-ranked genes.
48    PInt,
49    /// `"logFC"`: genes whose `|coef|` exceeds its 90th percentile (either
50    /// coefficient).
51    LogFC,
52    /// `"predFC"`: as `logFC` but on predictive fold changes ([`pred_fcm`]).
53    PredFC,
54}
55
56/// Result of [`genas`], mirroring R's `genas` return list.
57#[derive(Debug, Clone)]
58pub struct Genas {
59    /// Correlation of the two coefficient *estimates* from the design alone.
60    pub technical_correlation: f64,
61    /// Estimated biological covariance matrix `V0` (the `L D Lᵀ` fit).
62    pub covariance_matrix: [[f64; 2]; 2],
63    /// Biological correlation implied by `covariance_matrix`.
64    pub biological_correlation: f64,
65    /// Likelihood-ratio deviance `|2(ℓ_null − ℓ_full)|`.
66    pub deviance: f64,
67    /// `chi^2_1` upper-tail p-value of `deviance`.
68    pub p_value: f64,
69    /// Number of genes used.
70    pub n: usize,
71}
72
73/// Compute genuine (biological) association between two coefficients of `fit`.
74///
75/// `coef` gives the two 0-based coefficient columns to correlate (R's default
76/// `coef = c(1, 2)` is `(0, 1)` here). `subset` selects which genes feed the
77/// estimate ([`GenasSubset::All`] reproduces the original behaviour); the
78/// non-`all` selectors refit `eBayes` on the chosen genes exactly as limma
79/// does. The fit must already have been run through `eBayes` (so `s2_post`,
80/// `df_total`, `p_value` and `f_p_value` are present).
81pub fn genas(fit: &MArrayLM, coef: (usize, usize), subset: GenasSubset) -> Result<Genas> {
82    let ngenes = fit.coefficients.nrows();
83    if ngenes < 1 {
84        return Ok(null_genas());
85    }
86    let (c1, c2) = coef;
87    let ncoef = fit.coefficients.ncols();
88    if c1 >= ncoef || c2 >= ncoef {
89        bail!("coef out of range for a {ncoef}-coefficient fit");
90    }
91    let s2post = fit
92        .s2_post
93        .as_ref()
94        .ok_or_else(|| anyhow!("genas requires an eBayes fit (s2_post missing)"))?;
95    let dft = fit
96        .df_total
97        .as_ref()
98        .ok_or_else(|| anyhow!("genas requires an eBayes fit (df_total missing)"))?;
99
100    // The Nelder–Mead start is fixed from the full two-coefficient fit (all
101    // genes), before any subset refit, exactly as limma computes it once at the
102    // top of `genas` and reuses it for the subset `optim` calls.
103    let b0_full = fit.coefficients.column(c1).to_vec();
104    let b1_full = fit.coefficients.column(c2).to_vec();
105    let v = cov_block(fit, c1, c2);
106    let s2post_slice = s2post.as_slice().expect("contiguous s2_post");
107    let x_start = gamma_start(&b0_full, &b1_full, v, s2post_slice);
108
109    // Default path: correlate the two coefficient columns over all genes.
110    if subset == GenasSubset::All {
111        return Ok(genas_core(
112            &b0_full,
113            &b1_full,
114            v,
115            s2post_slice,
116            dft.as_slice().expect("contiguous df_total"),
117            x_start,
118        ));
119    }
120
121    // Subset path: pick differentially expressed genes, refit eBayes on them,
122    // then correlate. trend/robust are inherited from the incoming fit's priors.
123    let mask = which_genes(fit, subset, c1, c2)?;
124    let selected: Vec<usize> = (0..ngenes).filter(|&g| mask[g]).collect();
125    if selected.is_empty() {
126        return Ok(null_genas());
127    }
128    let trend = is_varying(fit.s2_prior.as_ref());
129    let robust = is_varying(fit.df_prior.as_ref());
130    let mut sub = build_subfit(fit, &selected, c1, c2);
131    ebayes(&mut sub, 0.01, (0.1, 4.0), trend, robust)?;
132    let s2post2 = sub.s2_post.as_ref().expect("eBayes fills s2_post");
133    let dft2 = sub.df_total.as_ref().expect("eBayes fills df_total");
134    let b0 = sub.coefficients.column(0).to_vec();
135    let b1 = sub.coefficients.column(1).to_vec();
136    Ok(genas_core(
137        &b0,
138        &b1,
139        v,
140        s2post2.as_slice().expect("contiguous s2_post"),
141        dft2.as_slice().expect("contiguous df_total"),
142        x_start,
143    ))
144}
145
146/// Nelder–Mead starting point: per-coefficient gamma-intercept variances mapped
147/// through `log(diag(chol(diag(x1, x2)))) = (½ln x1, ½ln x2)`. limma computes
148/// this once from the *full* two-coefficient fit and reuses it even when the
149/// subset selectors refit `eBayes` on fewer genes, so the start must be passed
150/// into [`genas_core`] rather than recomputed from the (possibly subsetted) data.
151fn gamma_start(b0: &[f64], b1: &[f64], v: [[f64; 2]; 2], s2post: &[f64]) -> [f64; 2] {
152    let ngenes = b0.len();
153    let ya: Vec<f64> = (0..ngenes).map(|g| b0[g] * b0[g] / s2post[g]).collect();
154    let yb: Vec<f64> = (0..ngenes).map(|g| b1[g] * b1[g] / s2post[g]).collect();
155    let x1 = fit_gamma_intercept(&ya, &[v[0][0]], 1000);
156    let x2 = fit_gamma_intercept(&yb, &[v[1][1]], 1000);
157    if x1 > 0.0 && x2 > 0.0 {
158        [0.5 * x1.ln(), 0.5 * x2.ln()]
159    } else {
160        [0.0, 0.0]
161    }
162}
163
164/// The two-coefficient biological-correlation estimate shared by every subset:
165/// null then full moderated-t optimisation from the shared `x_start`, deviance.
166fn genas_core(
167    b0: &[f64],
168    b1: &[f64],
169    v: [[f64; 2]; 2],
170    s2post: &[f64],
171    dft: &[f64],
172    x_start: [f64; 2],
173) -> Genas {
174    let ngenes = b0.len();
175    let x_start = x_start.to_vec();
176    let m = 2.0;
177
178    // Null fit (V0 diagonal) then full fit (with off-diagonal term).
179    let q2 = nelder_mead(&x_start, |x| loglik_null(x, v, b0, b1, s2post, dft, m));
180    let q1_start = [q2.par[0], q2.par[1], 0.0];
181    let q1 = nelder_mead(&q1_start, |x| loglik_full(x, v, b0, b1, s2post, dft, m));
182
183    // Biological covariance V0 = L D Lᵀ at the full optimum.
184    let d1 = q1.par[0].exp();
185    let d2 = q1.par[1].exp();
186    let bb = q1.par[2];
187    let v0 = [[d1, bb * d1], [bb * d1, bb * bb * d1 + d2]];
188    let rhobiol = v0[1][0] / (v0[0][0] * v0[1][1]).sqrt();
189    let rhotech = v[1][0] / (v[0][0] * v[1][1]).sqrt();
190
191    let deviance = (2.0 * (q2.value - q1.value)).abs();
192    // pchisq(deviance, df=1, lower.tail=FALSE) = 2*Phi(-sqrt(deviance)).
193    let normal = Normal::new(0.0, 1.0).expect("standard normal");
194    let p_value = 2.0 * normal.cdf(-deviance.sqrt());
195
196    Genas {
197        technical_correlation: rhotech,
198        covariance_matrix: v0,
199        biological_correlation: rhobiol,
200        deviance,
201        p_value,
202        n: ngenes,
203    }
204}
205
206/// The 2x2 technical covariance block for coefficient columns `c1`, `c2`.
207fn cov_block(fit: &MArrayLM, c1: usize, c2: usize) -> [[f64; 2]; 2] {
208    [
209        [
210            fit.cov_coefficients[[c1, c1]],
211            fit.cov_coefficients[[c1, c2]],
212        ],
213        [
214            fit.cov_coefficients[[c2, c1]],
215            fit.cov_coefficients[[c2, c2]],
216        ],
217    ]
218}
219
220/// The F-test p-value recomputed over *only* coefficients `c1`, `c2`. limma's
221/// `genas` calls `.whichGenes` on `fit[,coef]`, and `[.MArrayLM` regenerates
222/// `F`/`F.p.value` after a column subset via `classifyTestsF(fstat.only=TRUE)`.
223/// The `"Fpval"` selector therefore ranks on this two-coefficient F-test, not on
224/// the original full-model `F.p.value`.
225fn fpval_two_coef(fit: &MArrayLM, c1: usize, c2: usize) -> Result<Vec<f64>> {
226    let t = fit
227        .t
228        .as_ref()
229        .ok_or_else(|| anyhow!("genas subset=\"Fpval\" needs moderated t (run eBayes)"))?;
230    let dfp = fit
231        .df_prior
232        .as_ref()
233        .ok_or_else(|| anyhow!("genas subset=\"Fpval\" needs df.prior (run eBayes)"))?;
234    let ng = t.nrows();
235    let mut t2 = Array2::<f64>::zeros((ng, 2));
236    for g in 0..ng {
237        t2[[g, 0]] = t[[g, c1]];
238        t2[[g, 1]] = t[[g, c2]];
239    }
240    // cov2cor of the 2x2 technical block, lifting any exactly-zero variance to 1
241    // exactly as `classifyTestsF` does before whitening.
242    let blk = cov_block(fit, c1, c2);
243    let mut cov = Array2::<f64>::zeros((2, 2));
244    for i in 0..2 {
245        for j in 0..2 {
246            cov[[i, j]] = blk[i][j];
247        }
248        if cov[[i, i]] == 0.0 {
249            cov[[i, i]] = 1.0;
250        }
251    }
252    let cor = cov2cor(&cov);
253    let (fstat, r) = classify_tests_fstat(&t2, Some(&cor));
254    let df1 = r as f64;
255    let mut fp = vec![0.0; ng];
256    for g in 0..ng {
257        let df2 = dfp[g] + fit.df_residual[g];
258        fp[g] = if df2.is_finite() {
259            f_sf(fstat[g], df1, df2)
260        } else {
261            chi2_sf(df1 * fstat[g], df1)
262        };
263    }
264    Ok(fp)
265}
266
267/// Port of limma's `.whichGenes`: the boolean gene-inclusion mask for a subset
268/// selector.
269///
270/// limma's `logFC`/`predFC` branches *appear* to soft-threshold the coefficients
271/// (`fit$coeff[,j] <- sign(.)*(abs(.)-q)`), but that assignment — via R's partial
272/// name matching — writes a throwaway `coeff` element while the downstream
273/// likelihood reads the untouched `coefficients`. The thresholding is therefore
274/// inert; only the gene *selection* differs between selectors, and the refit
275/// always uses the raw coefficients. (This is also why `logFC` and `predFC`
276/// yield identical estimates: they differ only in the discarded write.) We
277/// replicate that exactly by returning just the mask.
278fn which_genes(fit: &MArrayLM, subset: GenasSubset, c1: usize, c2: usize) -> Result<Vec<bool>> {
279    let ng = fit.coefficients.nrows();
280    let mask = match subset {
281        GenasSubset::All => vec![true; ng],
282        GenasSubset::Fpval => {
283            let fp = fpval_two_coef(fit, c1, c2)?;
284            let p = 1.0 - prop_true_null(&fp, PropTrueNullMethod::Lfdr, 20);
285            let r = rank_average(&fp);
286            let cut = p * ng as f64;
287            r.iter().map(|&ri| ri <= cut).collect()
288        }
289        GenasSubset::PUnion | GenasSubset::PInt => {
290            let pv = fit
291                .p_value
292                .as_ref()
293                .ok_or_else(|| anyhow!("genas subset needs p.value (run eBayes)"))?;
294            let p1col = pv.column(c1).to_vec();
295            let p2col = pv.column(c2).to_vec();
296            let p1 = 1.0 - prop_true_null(&p1col, PropTrueNullMethod::Lfdr, 20);
297            let p2 = 1.0 - prop_true_null(&p2col, PropTrueNullMethod::Lfdr, 20);
298            let cut1 = p1 * ng as f64;
299            let cut2 = p2 * ng as f64;
300            if subset == GenasSubset::PUnion && p1 == 0.0 && p2 == 0.0 {
301                vec![false; ng]
302            } else {
303                let r1 = rank_average(&p1col);
304                let r2 = rank_average(&p2col);
305                (0..ng)
306                    .map(|g| {
307                        let a = r1[g] <= cut1;
308                        let b = r2[g] <= cut2;
309                        if subset == GenasSubset::PInt {
310                            a && b
311                        } else {
312                            a || b
313                        }
314                    })
315                    .collect()
316            }
317        }
318        GenasSubset::LogFC => {
319            let a1: Vec<f64> = fit
320                .coefficients
321                .column(c1)
322                .iter()
323                .map(|v| v.abs())
324                .collect();
325            let a2: Vec<f64> = fit
326                .coefficients
327                .column(c2)
328                .iter()
329                .map(|v| v.abs())
330                .collect();
331            let q1 = quantile_type7(&a1, 0.9);
332            let q2 = quantile_type7(&a2, 0.9);
333            (0..ng).map(|g| a1[g] > q1 || a2[g] > q2).collect()
334        }
335        GenasSubset::PredFC => {
336            let pfc1 = pred_fcm(fit, c1, true, true, PropTrueNullMethod::Lfdr)?;
337            let pfc2 = pred_fcm(fit, c2, true, true, PropTrueNullMethod::Lfdr)?;
338            let a1: Vec<f64> = pfc1.iter().map(|v| v.abs()).collect();
339            let a2: Vec<f64> = pfc2.iter().map(|v| v.abs()).collect();
340            let q1 = quantile_type7(&a1, 0.9);
341            let q2 = quantile_type7(&a2, 0.9);
342            (0..ng).map(|g| a1[g] > q1 || a2[g] > q2).collect()
343        }
344    };
345    Ok(mask)
346}
347
348/// Assemble a two-coefficient fit over the selected genes (`fit[genes, coef]`),
349/// reading the raw coefficient columns `c1`/`c2`. eBayes output fields are
350/// cleared so the caller can refit.
351fn build_subfit(fit: &MArrayLM, sel: &[usize], c1: usize, c2: usize) -> MArrayLM {
352    let m = sel.len();
353    let mut coefficients = Array2::<f64>::zeros((m, 2));
354    let mut stdev_unscaled = Array2::<f64>::zeros((m, 2));
355    let mut sigma = Array1::<f64>::zeros(m);
356    let mut df_residual = Array1::<f64>::zeros(m);
357    let mut amean = Array1::<f64>::zeros(m);
358    let mut gene_names = Vec::with_capacity(m);
359    for (i, &g) in sel.iter().enumerate() {
360        coefficients[[i, 0]] = fit.coefficients[[g, c1]];
361        coefficients[[i, 1]] = fit.coefficients[[g, c2]];
362        stdev_unscaled[[i, 0]] = fit.stdev_unscaled[[g, c1]];
363        stdev_unscaled[[i, 1]] = fit.stdev_unscaled[[g, c2]];
364        sigma[i] = fit.sigma[g];
365        df_residual[i] = fit.df_residual[g];
366        amean[i] = if g < fit.amean.len() {
367            fit.amean[g]
368        } else {
369            0.0
370        };
371        gene_names.push(fit.gene_names.get(g).cloned().unwrap_or_default());
372    }
373    let b = cov_block(fit, c1, c2);
374    let mut cov_coefficients = Array2::<f64>::zeros((2, 2));
375    cov_coefficients[[0, 0]] = b[0][0];
376    cov_coefficients[[0, 1]] = b[0][1];
377    cov_coefficients[[1, 0]] = b[1][0];
378    cov_coefficients[[1, 1]] = b[1][1];
379    let coef_names = vec![
380        fit.coef_names.get(c1).cloned().unwrap_or_default(),
381        fit.coef_names.get(c2).cloned().unwrap_or_default(),
382    ];
383
384    MArrayLM {
385        coefficients,
386        stdev_unscaled,
387        sigma,
388        df_residual,
389        cov_coefficients,
390        gene_names,
391        coef_names,
392        amean,
393        design: None,
394        contrasts: None,
395        df_prior: None,
396        s2_prior: None,
397        var_prior: None,
398        proportion: None,
399        s2_post: None,
400        t: None,
401        df_total: None,
402        p_value: None,
403        lods: None,
404        f_stat: None,
405        f_p_value: None,
406    }
407}
408
409/// The NA-filled result limma returns when no genes survive the subset filter.
410fn null_genas() -> Genas {
411    Genas {
412        technical_correlation: f64::NAN,
413        covariance_matrix: [[f64::NAN; 2]; 2],
414        biological_correlation: f64::NAN,
415        deviance: 0.0,
416        p_value: 1.0,
417        n: 0,
418    }
419}
420
421/// `TRUE` when a prior vector actually varies by gene (limma's trend/robust flag
422/// `length(prior) > 1`; our `MArrayLM` always stores a full-length vector).
423fn is_varying(v: Option<&Array1<f64>>) -> bool {
424    match v {
425        Some(a) => a.len() > 1 && a.iter().any(|&x| x != a[0]),
426        None => false,
427    }
428}
429
430/// `quantile(x, prob)` with R's default type-7 (linear interpolation).
431fn quantile_type7(x: &[f64], prob: f64) -> f64 {
432    let n = x.len();
433    if n == 0 {
434        return f64::NAN;
435    }
436    let mut s = x.to_vec();
437    s.sort_by(|a, b| a.partial_cmp(b).unwrap());
438    if n == 1 {
439        return s[0];
440    }
441    let h = (n as f64 - 1.0) * prob;
442    let lo = h.floor() as usize;
443    let frac = h - lo as f64;
444    if lo + 1 < n {
445        s[lo] + frac * (s[lo + 1] - s[lo])
446    } else {
447        s[lo]
448    }
449}
450
451/// Average ranks (R's `rank(x, ties.method="average")`), 1-based, ascending.
452fn rank_average(x: &[f64]) -> Vec<f64> {
453    let n = x.len();
454    let mut idx: Vec<usize> = (0..n).collect();
455    idx.sort_by(|&a, &b| x[a].partial_cmp(&x[b]).unwrap());
456    let mut ranks = vec![0.0; n];
457    let mut i = 0;
458    while i < n {
459        let mut j = i;
460        while j + 1 < n && x[idx[j + 1]] == x[idx[i]] {
461            j += 1;
462        }
463        let avg = ((i + 1 + j + 1) as f64) / 2.0;
464        for &k in &idx[i..=j] {
465            ranks[k] = avg;
466        }
467        i = j + 1;
468    }
469    ranks
470}
471
472/// Shared bivariate moderated-t negative log-likelihood given a biological
473/// covariance `v0` and technical covariance `v`. Returns `Σ_g (Second + Third)`
474/// exactly as limma's `.multTLogLik*` helpers.
475fn loglik_core(
476    v0: [[f64; 2]; 2],
477    v: [[f64; 2]; 2],
478    b0: &[f64],
479    b1: &[f64],
480    s2post: &[f64],
481    dft: &[f64],
482    m: f64,
483) -> f64 {
484    // Upper Cholesky R of M = V0 + V (a 2x2 SPD matrix).
485    let m00 = v0[0][0] + v[0][0];
486    let m01 = v0[0][1] + v[0][1];
487    let m11 = v0[1][1] + v[1][1];
488    let r11 = m00.sqrt();
489    let r12 = m01 / r11;
490    let r22 = (m11 - r12 * r12).sqrt();
491    let second = r11.ln() + r22.ln();
492
493    let mut total = 0.0;
494    for g in 0..b0.len() {
495        // Solve Rᵀ w = (b0,b1) by forward substitution.
496        let w0 = b0[g] / r11;
497        let w1 = (b1[g] - r12 * w0) / r22;
498        let q = w0 * w0 + w1 * w1;
499        let third = 0.5 * (m + dft[g]) * (1.0 + q / (s2post[g] * dft[g])).ln();
500        total += second + third;
501    }
502    total
503}
504
505/// Null model: `V0 = diag(exp(2a1), exp(2a2))` (limma's `.multTLogLikNull`).
506fn loglik_null(
507    x: &[f64],
508    v: [[f64; 2]; 2],
509    b0: &[f64],
510    b1: &[f64],
511    s2post: &[f64],
512    dft: &[f64],
513    m: f64,
514) -> f64 {
515    let v0 = [[(2.0 * x[0]).exp(), 0.0], [0.0, (2.0 * x[1]).exp()]];
516    loglik_core(v0, v, b0, b1, s2post, dft, m)
517}
518
519/// Full model: `V0 = L D Lᵀ`, `L = [[1,0],[b,1]]`, `D = diag(exp(a1), exp(a2))`
520/// (limma's `.multTLogLik`).
521fn loglik_full(
522    x: &[f64],
523    v: [[f64; 2]; 2],
524    b0: &[f64],
525    b1: &[f64],
526    s2post: &[f64],
527    dft: &[f64],
528    m: f64,
529) -> f64 {
530    let d1 = x[0].exp();
531    let d2 = x[1].exp();
532    let bb = x[2];
533    let v0 = [[d1, bb * d1], [bb * d1, bb * bb * d1 + d2]];
534    loglik_core(v0, v, b0, b1, s2post, dft, m)
535}
536
537#[cfg(test)]
538#[allow(clippy::excessive_precision, clippy::approx_constant)]
539mod tests {
540    use super::*;
541    use crate::fit::lmfit;
542    use ndarray::Array2;
543
544    fn rclose(a: f64, b: f64) -> bool {
545        (a - b).abs() <= 1e-6 * (1.0 + b.abs())
546    }
547
548    /// Rebuild the bit-identical fixture from scratch/genas_ref.R: a 60x9
549    /// expression matrix from a purely rational, 0-indexed formula, plus a
550    /// common-intercept + two-treatment design.
551    fn fixture() -> (Array2<f64>, Array2<f64>) {
552        let ngenes = 60usize;
553        let narrays = 9usize;
554        let g_a = [0.0, 0.0, 0.0, 1.0, 1.0, 1.0, 0.0, 0.0, 0.0];
555        let g_b = [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 1.0, 1.0];
556        let mut y = Array2::<f64>::zeros((ngenes, narrays));
557        for g in 0..ngenes {
558            let gi = g as i64;
559            let base = ((gi % 7) - 3) as f64;
560            let extra = (((gi * 5) % 11) - 5) as f64;
561            let own_b = (((gi * 3) % 13) - 6) as f64;
562            let eff_a = base * 0.4 + extra * 0.1;
563            let eff_b = base * 0.32 + own_b * 0.1;
564            let mu = 8.0 + (gi % 5) as f64 * 0.25;
565            for k in 0..narrays {
566                let ki = k as i64;
567                let noise = (((gi * 13 + ki * 17) % 19) - 9) as f64 * 0.05;
568                y[[g, k]] = mu + eff_a * g_a[k] + eff_b * g_b[k] + noise;
569            }
570        }
571        let mut design = Array2::<f64>::zeros((narrays, 3));
572        for k in 0..narrays {
573            design[[k, 0]] = 1.0;
574            design[[k, 1]] = g_a[k];
575            design[[k, 2]] = g_b[k];
576        }
577        (y, design)
578    }
579
580    fn ebayes_fit() -> MArrayLM {
581        let (y, design) = fixture();
582        let names: Vec<String> = (1..=60).map(|i| format!("g{i}")).collect();
583        let coefs = vec!["Intercept".to_string(), "gA".to_string(), "gB".to_string()];
584        let mut fit = lmfit(&y, &design, names, coefs).unwrap();
585        // eBayes defaults: proportion=0.01, stdev.coef.lim=c(0.1,4).
586        ebayes(&mut fit, 0.01, (0.1, 4.0), false, false).unwrap();
587        fit
588    }
589
590    #[test]
591    fn genas_matches_r() {
592        let fit = ebayes_fit();
593        // R: genas(fit, coef=c(2,3), subset="all") -> 0-based (1,2).
594        let g = genas(&fit, (1, 2), GenasSubset::All).unwrap();
595
596        assert_eq!(g.n, 60);
597        assert!(
598            rclose(g.technical_correlation, 0.49999999999999994),
599            "tech {}",
600            g.technical_correlation
601        );
602        assert!(
603            rclose(g.biological_correlation, 0.72110133367492135),
604            "biol {}",
605            g.biological_correlation
606        );
607        assert!(
608            rclose(g.covariance_matrix[0][0], 20.311293729600376),
609            "v00 {}",
610            g.covariance_matrix[0][0]
611        );
612        assert!(
613            rclose(g.covariance_matrix[0][1], 12.393027523641022),
614            "v01 {}",
615            g.covariance_matrix[0][1]
616        );
617        assert!(
618            rclose(g.covariance_matrix[1][0], 12.393027523641022),
619            "v10 {}",
620            g.covariance_matrix[1][0]
621        );
622        assert!(
623            rclose(g.covariance_matrix[1][1], 14.542016870927945),
624            "v11 {}",
625            g.covariance_matrix[1][1]
626        );
627        assert!(rclose(g.deviance, 35.500819277411154), "dev {}", g.deviance);
628        assert!(rclose(g.p_value, 2.5494331700086711e-09), "p {}", g.p_value);
629    }
630
631    #[test]
632    fn genas_subset_matches_r() {
633        // Reference values from scratch/genas_subset_ref.R (same fixture).
634        let fit = ebayes_fit();
635        let check = |subset: GenasSubset,
636                     n: usize,
637                     biol: f64,
638                     cov: [f64; 4],
639                     dev: f64,
640                     pval: f64,
641                     tag: &str| {
642            let g = genas(&fit, (1, 2), subset).unwrap();
643            assert_eq!(g.n, n, "{tag} n");
644            assert!(rclose(g.technical_correlation, 0.5), "{tag} tech");
645            assert!(rclose(g.biological_correlation, biol), "{tag} biol");
646            assert!(rclose(g.covariance_matrix[0][0], cov[0]), "{tag} v00");
647            assert!(rclose(g.covariance_matrix[0][1], cov[1]), "{tag} v01");
648            assert!(rclose(g.covariance_matrix[1][0], cov[2]), "{tag} v10");
649            assert!(rclose(g.covariance_matrix[1][1], cov[3]), "{tag} v11");
650            assert!(rclose(g.deviance, dev), "{tag} dev");
651            assert!(rclose(g.p_value, pval), "{tag} pval");
652        };
653
654        check(
655            GenasSubset::Fpval,
656            55,
657            0.71848111283183225,
658            [
659                23.231392431559968,
660                14.120763825027172,
661                14.120763825027172,
662                16.626867102316595,
663            ],
664            32.698047599684912,
665            1.0764529339097856e-08,
666            "Fpval",
667        );
668        check(
669            GenasSubset::PUnion,
670            55,
671            0.72141688090435385,
672            [
673                23.20355805632785,
674                14.181934574327077,
675                14.181934574327077,
676                16.654966711355684,
677            ],
678            33.078191943303352,
679            8.8526093918565966e-09,
680            "p.union",
681        );
682        check(
683            GenasSubset::PInt,
684            38,
685            0.83237776620740411,
686            [
687                31.358733693853345,
688                22.383537180965163,
689                22.383537180965163,
690                23.059929516708259,
691            ],
692            37.494823730220958,
693            9.1655906123216993e-10,
694            "p.int",
695        );
696        check(
697            GenasSubset::LogFC,
698            10,
699            0.9036479641020686,
700            [
701                74.055616589393026,
702                62.941872383594607,
703                62.941872383594607,
704                65.512287646621971,
705            ],
706            15.829494073613475,
707            6.9313604519058013e-05,
708            "logFC",
709        );
710        check(
711            GenasSubset::PredFC,
712            10,
713            0.9036479641020686,
714            [
715                74.055616589393026,
716                62.941872383594607,
717                62.941872383594607,
718                65.512287646621971,
719            ],
720            15.829494073613475,
721            6.9313604519058013e-05,
722            "predFC",
723        );
724    }
725}