limma-rust 0.1.0

//! `limma` command-line interface: run the limma differential-expression
//! pipeline (lmFit -> [contrasts.fit] -> eBayes -> topTable) on CSV or TSV
//! inputs (delimiter auto-detected by extension, or set with `--delimiter`).

use std::path::PathBuf;
use std::time::Instant;

use anyhow::{bail, Context, Result};
use clap::Parser;

use limma::io::{
    align_contrasts, align_design, read_matrix, read_matrix_with_delimiter, write_fit,
    write_fit_dump, write_test_results, write_top_table, write_top_table_f, WriteFitOptions,
};
use limma::{
    contrasts_fit, decide_tests, ebayes, lmfit, top_table, top_table_f, top_treat, treat, Adjust,
    DecideMethod, SortBy,
};

#[derive(Parser, Debug)]
#[command(
    name = "limma",
    about = "Pure-Rust port of Bioconductor limma (lmFit -> eBayes -> topTable)",
    version
)]
struct Args {
    /// Expression matrix (CSV/TSV): genes x samples, header = sample names, first column = gene id.
    #[arg(long)]
    exprs: PathBuf,

    /// Design matrix (CSV/TSV): samples x coefficients, header = coef names, first column = sample id.
    #[arg(long)]
    design: PathBuf,

    /// Optional contrast matrix (CSV/TSV): coefficients x contrasts.
    #[arg(long)]
    contrasts: Option<PathBuf>,

    /// Field delimiter for all inputs. Accepts a single character or a name
    /// (comma, tab, semicolon, space). Default: auto-detect (tab for
    /// .tsv/.tab, comma otherwise).
    #[arg(long)]
    delimiter: Option<String>,

    /// Coefficient/contrast to tabulate (name or 0-based index). Selects the moderated-t table.
    #[arg(long)]
    coef: Option<String>,

    /// Tabulate the overall moderated F-statistic across all coefficients/contrasts instead.
    #[arg(long, default_value_t = false)]
    f_test: bool,

    /// Assumed proportion of differentially expressed genes (eBayes).
    #[arg(long, default_value_t = 0.01)]
    proportion: f64,

    /// Robust empirical Bayes hyperparameter estimation (eBayes robust=TRUE).
    /// Handles unequal residual degrees of freedom (e.g. data with NAs).
    #[arg(long, default_value_t = false)]
    robust: bool,

    /// Fit an intensity-dependent prior-variance trend on the average
    /// log-expression (eBayes trend=TRUE). Combinable with --robust.
    #[arg(long, default_value_t = false)]
    trend: bool,

    /// Run TREAT instead of eBayes: moderated t relative to an absolute log2
    /// fold-change threshold (R's treat(lfc=)). Disables the F-table; the
    /// t-table defaults to sorting by P-value.
    #[arg(long)]
    treat: Option<f64>,

    /// Sort criterion for the t-table: logFC | AveExpr | P | t | B | none.
    #[arg(long, default_value = "B")]
    sort: String,

    /// Maximum number of rows to emit (default: all).
    #[arg(long)]
    number: Option<usize>,

    /// Write the ranked top table here (CSV).
    #[arg(long)]
    out: Option<PathBuf>,

    /// Also dump full per-gene statistics (for parity checks) here.
    #[arg(long)]
    dump: Option<PathBuf>,

    /// Write a `write.fit`-style results table (all coefficients) here (TSV).
    #[arg(long)]
    write_fit: Option<PathBuf>,

    /// write.fit p-value adjustment: none | bonferroni | holm | BH | fdr | BY.
    #[arg(long, default_value = "none")]
    write_fit_adjust: String,

    /// write.fit adjustment scope: separate | global.
    #[arg(long, default_value = "separate")]
    write_fit_method: String,

    /// Write a decideTests outcome matrix (-1/0/1 per gene per contrast) here.
    #[arg(long)]
    decide: Option<PathBuf>,

    /// decideTests method: separate | global.
    #[arg(long, default_value = "separate")]
    decide_method: String,

    /// decideTests p-value adjustment: none | bonferroni | holm | BH | fdr | BY.
    #[arg(long, default_value = "BH")]
    decide_adjust: String,

    /// decideTests adjusted-p significance cutoff.
    #[arg(long, default_value_t = 0.05)]
    decide_p: f64,

    /// decideTests minimum absolute log-fold-change.
    #[arg(long, default_value_t = 0.0)]
    lfc: f64,

    /// Print timing of the fit/eBayes stages to stderr.
    #[arg(long, default_value_t = false)]
    timing: bool,

    /// Write fit/eBayes wall-clock timing as JSON here (for benchmark harnesses).
    #[arg(long)]
    timing_json: Option<PathBuf>,
}

/// Parse a `--delimiter` value into a single byte. Accepts the common named
/// delimiters (`comma`, `tab`, `semicolon`, `space`), the literal escape
/// `\t`, or any single character. Multi-character values (other than the
/// recognised names) are rejected.
fn parse_delimiter(s: &str) -> Result<u8> {
    let b = match s {
        "," | "comma" => b',',
        "\t" | "\\t" | "tab" => b'\t',
        ";" | "semicolon" => b';',
        " " | "space" => b' ',
        other => {
            let mut chars = other.chars();
            match (chars.next(), chars.next()) {
                (Some(c), None) if c.is_ascii() => c as u8,
                _ => bail!(
                    "invalid --delimiter {other:?}: expected a single ASCII character or one of \
                     comma, tab, semicolon, space"
                ),
            }
        }
    };
    Ok(b)
}

fn main() -> Result<()> {
    let args = Args::parse();

    // Resolve the field delimiter once: an explicit `--delimiter` forces it for
    // every input, otherwise each file auto-detects from its own extension.
    let delimiter = args.delimiter.as_deref().map(parse_delimiter).transpose()?;
    let read = |path: &std::path::Path| match delimiter {
        Some(d) => read_matrix_with_delimiter(path, d),
        None => read_matrix(path),
    };

    let expr = read(&args.exprs).context("reading expression matrix")?;
    let design_raw = read(&args.design).context("reading design matrix")?;
    let design = align_design(&design_raw, &expr.col_names)?;
    let coef_names = design_raw.col_names.clone();

    let t0 = Instant::now();
    let mut fit = lmfit(
        &expr.data,
        &design,
        expr.row_names.clone(),
        coef_names.clone(),
    )?;
    let t_fit = t0.elapsed();

    // Optional reorientation onto contrasts.
    if let Some(cpath) = &args.contrasts {
        let craw = read(cpath).context("reading contrast matrix")?;
        let cmat = align_contrasts(&craw, &coef_names)?;
        fit = contrasts_fit(&fit, &cmat, craw.col_names.clone())?;
    }

    let treat_active = args.treat.is_some();
    let t1 = Instant::now();
    if let Some(lfc) = args.treat {
        treat(&mut fit, lfc)?;
    } else {
        ebayes(
            &mut fit,
            args.proportion,
            (0.1, 4.0),
            args.trend,
            args.robust,
        )?;
    }
    let t_eb = t1.elapsed();

    if args.timing {
        eprintln!(
            "lmFit: {:.3} ms   {}: {:.3} ms   genes={} coef={}",
            t_fit.as_secs_f64() * 1e3,
            if treat_active { "treat" } else { "eBayes" },
            t_eb.as_secs_f64() * 1e3,
            fit.n_genes(),
            fit.n_coef()
        );
    }

    if let Some(tj) = &args.timing_json {
        let json = format!(
            "{{\n  \"impl\": \"limma-rust\",\n  \"n_genes\": {},\n  \"n_samples\": {},\n  \"n_coef\": {},\n  \"lmfit_s\": {},\n  \"ebayes_s\": {},\n  \"total_s\": {}\n}}\n",
            fit.n_genes(),
            expr.data.ncols(),
            fit.n_coef(),
            t_fit.as_secs_f64(),
            t_eb.as_secs_f64(),
            t_fit.as_secs_f64() + t_eb.as_secs_f64(),
        );
        std::fs::write(tj, json).context("writing timing json")?;
    }

    if let Some(dpath) = &args.dump {
        write_fit_dump(dpath, &fit)?;
    }

    if let Some(wpath) = &args.write_fit {
        let opts = WriteFitOptions {
            adjust: Adjust::parse(&args.write_fit_adjust)?,
            method: DecideMethod::parse(&args.write_fit_method)?,
            ..Default::default()
        };
        write_fit(wpath, &fit, None, &opts)?;
    }

    if let Some(dpath) = &args.decide {
        let method = DecideMethod::parse(&args.decide_method)?;
        let adjust = Adjust::parse(&args.decide_adjust)?;
        let res = decide_tests(&fit, method, adjust, args.decide_p, args.lfc)?;
        let counts = res.counts();
        eprintln!(
            "decideTests ({}, {}, p<{}):",
            args.decide_method, args.decide_adjust, args.decide_p
        );
        for (j, (down, notsig, up)) in counts.iter().enumerate() {
            eprintln!(
                "  {:<16} Down {:>6}  NotSig {:>6}  Up {:>6}",
                res.coef_names[j], down, notsig, up
            );
        }
        write_test_results(dpath, &res)?;
    }

    // Decide which table to produce. TREAT fits carry no F-statistic.
    let want_f = !treat_active && (args.f_test || (args.coef.is_none() && fit.n_coef() > 1));
    if want_f {
        let rows = top_table_f(&fit, args.number)?;
        match &args.out {
            Some(p) => write_top_table_f(p, &rows)?,
            None => print_f(&rows),
        }
    } else {
        let coef = resolve_coef(args.coef.as_deref(), &fit.coef_names)?;
        let sort_by = SortBy::parse(&args.sort)?;
        let rows = if treat_active {
            // topTreat defaults to sorting by P-value and forbids B-sorting;
            // map the CLI's default "B" onto P rather than erroring.
            let sb = if sort_by == SortBy::B {
                SortBy::PValue
            } else {
                sort_by
            };
            top_treat(&fit, coef, sb, args.number)?
        } else {
            top_table(&fit, coef, sort_by, args.number)?
        };
        match &args.out {
            Some(p) => write_top_table(p, &rows)?,
            None => print_t(&rows),
        }
    }
    Ok(())
}

fn resolve_coef(coef: Option<&str>, names: &[String]) -> Result<usize> {
    match coef {
        None => Ok(names.len() - 1),
        Some(s) => {
            if let Ok(idx) = s.parse::<usize>() {
                if idx < names.len() {
                    return Ok(idx);
                }
                bail!("coef index {} out of range (have {})", idx, names.len());
            }
            names
                .iter()
                .position(|n| n == s)
                .with_context(|| format!("coefficient '{}' not found in {:?}", s, names))
        }
    }
}

fn print_t(rows: &[limma::TopRow]) {
    println!("id,log2FoldChange,lfcSE,AveExpr,t,P.Value,adj.P.Val,B");
    for r in rows.iter().take(20) {
        println!(
            "{},{:.6},{:.6},{:.6},{:.6},{:.3e},{:.3e},{:.4}",
            r.id, r.log2_fold_change, r.lfc_se, r.ave_expr, r.t, r.p_value, r.adj_p_value, r.b
        );
    }
}

fn print_f(rows: &[limma::TopRowF]) {
    println!("id,AveExpr,F,P.Value,adj.P.Val");
    for r in rows.iter().take(20) {
        println!(
            "{},{:.6},{:.6},{:.3e},{:.3e}",
            r.id, r.ave_expr, r.f, r.p_value, r.adj_p_value
        );
    }
}