limma-rust 0.1.0

Pure-Rust port of the Bioconductor limma differential-expression package
Documentation
// The CLI binary only exists under the `cli` feature; without it there is no
// `CARGO_BIN_EXE_limma` to drive, so the whole test crate compiles to nothing.
#![cfg(feature = "cli")]

//! End-to-end integration test for the `limma` CLI binary.
//!
//! Drives the compiled binary on a tiny synthetic dataset written to a temp
//! directory (no external deps, nothing committed) and cross-checks its output
//! against the same pipeline run through the public library API. This guards
//! the CLI wiring -- CSV reading, design/sample alignment, coefficient
//! resolution, and the top-table / dump writers -- which the library unit
//! tests do not exercise.

use std::fs;
use std::path::PathBuf;
use std::process::Command;

use limma::{ebayes, lmfit, top_table, SortBy};
use ndarray::Array2;

struct Fixture {
    dir: PathBuf,
    exprs: PathBuf,
    design: PathBuf,
    data: Array2<f64>,
    design_mat: Array2<f64>,
    genes: Vec<String>,
    coefs: Vec<String>,
}

impl Drop for Fixture {
    fn drop(&mut self) {
        let _ = fs::remove_dir_all(&self.dir);
    }
}

fn unique_dir(tag: &str) -> PathBuf {
    let nanos = std::time::SystemTime::now()
        .duration_since(std::time::UNIX_EPOCH)
        .unwrap()
        .as_nanos();
    let mut d = std::env::temp_dir();
    d.push(format!("limma_it_{}_{}_{}", tag, std::process::id(), nanos));
    fs::create_dir_all(&d).unwrap();
    d
}

/// Write a "first column is id" delimited table from a matrix, matching the
/// reader the CLI uses (header is id-col name then the column names; one row
/// per matrix row). `sep` selects the field delimiter (comma, tab, ...).
fn write_idmat(
    path: &PathBuf,
    id_header: &str,
    ids: &[String],
    cols: &[String],
    m: &Array2<f64>,
    sep: char,
) {
    let mut s = String::from(id_header);
    for c in cols {
        s.push(sep);
        s.push_str(c);
    }
    s.push('\n');
    for (i, id) in ids.iter().enumerate() {
        s.push_str(id);
        for j in 0..cols.len() {
            s.push(sep);
            s.push_str(&format!("{}", m[[i, j]]));
        }
        s.push('\n');
    }
    fs::write(path, s).unwrap();
}

fn setup(tag: &str) -> Fixture {
    setup_fmt(tag, "csv", ',')
}

/// Like [`setup`], but writes the fixture files with the given extension and
/// field delimiter so the CLI's delimiter handling can be exercised end-to-end.
fn setup_fmt(tag: &str, ext: &str, sep: char) -> Fixture {
    let genes: Vec<String> = (1..=6).map(|i| format!("g{i}")).collect();
    let samples: Vec<String> = (1..=6).map(|i| format!("s{i}")).collect();
    let coefs: Vec<String> = vec!["Intercept".into(), "grpB".into()];

    // 6 genes x 6 samples; samples s1-s3 = group A, s4-s6 = group B.
    // g1, g6 up in B; g3 down in B; the rest roughly flat.
    #[rustfmt::skip]
    let data = Array2::from_shape_vec((6, 6), vec![
        5.1, 4.9, 5.0, 7.0, 7.2, 6.9,
        3.0, 3.1, 2.9, 3.0, 2.8, 3.1,
        8.0, 8.2, 7.9, 6.0, 5.9, 6.1,
        1.0, 1.2, 0.9, 1.1, 1.0, 1.3,
        4.0, 4.1, 3.9, 4.2, 4.0, 4.1,
        2.0, 2.1, 1.9, 5.0, 5.1, 4.9,
    ]).unwrap();

    // samples x coefs: intercept plus the group-B indicator.
    #[rustfmt::skip]
    let design_mat = Array2::from_shape_vec((6, 2), vec![
        1.0, 0.0,
        1.0, 0.0,
        1.0, 0.0,
        1.0, 1.0,
        1.0, 1.0,
        1.0, 1.0,
    ]).unwrap();

    let dir = unique_dir(tag);
    let exprs = dir.join(format!("exprs.{ext}"));
    let design = dir.join(format!("design.{ext}"));
    write_idmat(&exprs, "gene", &genes, &samples, &data, sep);
    write_idmat(&design, "sample", &samples, &coefs, &design_mat, sep);

    Fixture {
        dir,
        exprs,
        design,
        data,
        design_mat,
        genes,
        coefs,
    }
}

fn run_cli(args: &[&str]) -> std::process::Output {
    Command::new(env!("CARGO_BIN_EXE_limma"))
        .args(args)
        .output()
        .expect("failed to spawn limma binary")
}

#[test]
fn cli_matches_library_end_to_end() {
    let fx = setup("e2e");
    let out = fx.dir.join("out.csv");
    let dump = fx.dir.join("dump.csv");

    let res = run_cli(&[
        "--exprs",
        fx.exprs.to_str().unwrap(),
        "--design",
        fx.design.to_str().unwrap(),
        "--coef",
        "grpB",
        "--out",
        out.to_str().unwrap(),
        "--dump",
        dump.to_str().unwrap(),
    ]);
    assert!(
        res.status.success(),
        "CLI exited with failure: {}",
        String::from_utf8_lossy(&res.stderr)
    );

    // --- top table ---------------------------------------------------------
    let out_txt = fs::read_to_string(&out).unwrap();
    let mut lines = out_txt.lines();
    assert_eq!(
        lines.next().unwrap(),
        "id,log2FoldChange,lfcSE,AveExpr,t,P.Value,adj.P.Val,B"
    );
    let cli_rows: Vec<(String, f64, f64, f64)> = lines
        .map(|l| {
            let f: Vec<&str> = l.split(',').collect();
            (
                f[0].to_string(),
                f[1].parse().unwrap(), // log2FoldChange
                f[4].parse().unwrap(), // t
                f[7].parse().unwrap(), // B
            )
        })
        .collect();
    assert_eq!(cli_rows.len(), 6, "expected one row per gene");

    // Reproduce the identical pipeline through the public API.
    let mut fit = lmfit(&fx.data, &fx.design_mat, fx.genes.clone(), fx.coefs.clone()).unwrap();
    ebayes(&mut fit, 0.01, (0.1, 4.0), false, false).unwrap();
    let coef = fx.coefs.iter().position(|c| c == "grpB").unwrap();
    let lib_rows = top_table(&fit, coef, SortBy::B, None).unwrap();
    assert_eq!(lib_rows.len(), 6);

    let close = |a: f64, b: f64| (a - b).abs() <= 1e-6 * (1.0 + b.abs());
    for (cli, lib) in cli_rows.iter().zip(lib_rows.iter()) {
        assert_eq!(
            cli.0, lib.id,
            "gene ordering must match the library's B sort"
        );
        assert!(
            close(cli.1, lib.log2_fold_change),
            "logFC mismatch for {}",
            cli.0
        );
        assert!(close(cli.2, lib.t), "t mismatch for {}", cli.0);
        assert!(close(cli.3, lib.b), "B mismatch for {}", cli.0);
    }

    // The most-up and most-down genes are known a priori.
    assert_eq!(
        cli_rows[0].0, "g6",
        "g6 has the largest effect, sorts first by B"
    );
    assert!(
        cli_rows.iter().any(|r| r.0 == "g3" && r.1 < 0.0),
        "g3 is down in B"
    );

    // --- full dump ---------------------------------------------------------
    let dump_txt = fs::read_to_string(&dump).unwrap();
    let mut dl = dump_txt.lines();
    assert_eq!(
        dl.next().unwrap(),
        "id,coef,coefficient,stdev_unscaled,t,p_value,lods,sigma,s2_post,df_total,F,F_p_value"
    );
    assert_eq!(dl.count(), 12, "6 genes x 2 coefficients");
}

#[test]
fn cli_rejects_unknown_coefficient() {
    let fx = setup("badcoef");
    let res = run_cli(&[
        "--exprs",
        fx.exprs.to_str().unwrap(),
        "--design",
        fx.design.to_str().unwrap(),
        "--coef",
        "does_not_exist",
    ]);
    assert!(
        !res.status.success(),
        "CLI should fail on an unknown coefficient name"
    );
}

/// `.tsv` inputs are read tab-separated by auto-detection (no flag needed) and
/// drive the same pipeline to the same answer as the `.csv` path.
#[test]
fn cli_reads_tsv_inputs() {
    let fx = setup_fmt("tsv", "tsv", '\t');
    let out = fx.dir.join("out.csv");
    let res = run_cli(&[
        "--exprs",
        fx.exprs.to_str().unwrap(),
        "--design",
        fx.design.to_str().unwrap(),
        "--coef",
        "grpB",
        "--out",
        out.to_str().unwrap(),
    ]);
    assert!(
        res.status.success(),
        "CLI should read .tsv inputs: {}",
        String::from_utf8_lossy(&res.stderr)
    );
    let out_txt = fs::read_to_string(&out).unwrap();
    let first_data = out_txt.lines().nth(1).unwrap();
    let top_gene = first_data.split(',').next().unwrap();
    assert_eq!(
        top_gene, "g6",
        "TSV must parse to the same matrix: g6 still sorts first by B"
    );
}

/// `--delimiter` overrides extension-based detection: a semicolon-delimited
/// file named `.csv` parses only with the explicit flag, and fails without it.
#[test]
fn cli_delimiter_override() {
    let fx = setup_fmt("semi", "csv", ';');
    let out = fx.dir.join("out.csv");

    // With the override the semicolon file reads cleanly.
    let ok = run_cli(&[
        "--exprs",
        fx.exprs.to_str().unwrap(),
        "--design",
        fx.design.to_str().unwrap(),
        "--coef",
        "grpB",
        "--delimiter",
        ";",
        "--out",
        out.to_str().unwrap(),
    ]);
    assert!(
        ok.status.success(),
        "CLI should honour --delimiter ';': {}",
        String::from_utf8_lossy(&ok.stderr)
    );
    let out_txt = fs::read_to_string(&out).unwrap();
    let top_gene = out_txt.lines().nth(1).unwrap().split(',').next().unwrap();
    assert_eq!(top_gene, "g6");

    // Without it, comma auto-detection mis-reads the single-field rows and fails.
    let bad = run_cli(&[
        "--exprs",
        fx.exprs.to_str().unwrap(),
        "--design",
        fx.design.to_str().unwrap(),
        "--coef",
        "grpB",
    ]);
    assert!(
        !bad.status.success(),
        "a semicolon file read as comma should fail without --delimiter"
    );
}