limma/
io.rs

1//! Input/output for expression matrices, design and contrast matrices, and
2//! result tables. The delimited-text *reader* (`read_matrix`) lives behind the
3//! `cli` feature -- it is the only consumer of the `csv` crate; everything else
4//! here (the design/contrast aligners and every table writer, including
5//! `write_fit`) is std-only and always compiled.
6
7use std::collections::HashMap;
8use std::fs::File;
9use std::io::{BufWriter, Write};
10use std::path::Path;
11
12use anyhow::{bail, Context, Result};
13use ndarray::Array2;
14
15use crate::decidetests::{p_adjust, Adjust, DecideMethod, TestResults};
16use crate::fit::MArrayLM;
17use crate::toptable::{TopRow, TopRowF};
18
19/// A labelled numeric matrix: `data` is `rows x cols`, with `row_names` and
20/// `col_names` taken from the first column and header row respectively.
21pub struct LabeledMatrix {
22    pub data: Array2<f64>,
23    pub row_names: Vec<String>,
24    pub col_names: Vec<String>,
25}
26
27/// Read a labelled matrix from a delimited text file, choosing the delimiter
28/// from the file extension: `.tsv` / `.tab` are read tab-separated, everything
29/// else (`.csv`, `.txt`, no extension) comma-separated. The first row is a
30/// header whose first cell is an ignored corner label; the first column of each
31/// subsequent row is the row name. Empty / `NA` / `NaN` cells parse to
32/// `f64::NAN`. Use [`read_matrix_with_delimiter`] to force a delimiter.
33#[cfg(feature = "cli")]
34pub fn read_matrix(path: &Path) -> Result<LabeledMatrix> {
35    read_matrix_with_delimiter(path, delimiter_for_path(path))
36}
37
38/// Pick the field delimiter for `path` from its extension: tab for `.tsv` /
39/// `.tab` (case-insensitive), comma otherwise.
40#[cfg(feature = "cli")]
41fn delimiter_for_path(path: &Path) -> u8 {
42    match path
43        .extension()
44        .and_then(|e| e.to_str())
45        .map(str::to_ascii_lowercase)
46        .as_deref()
47    {
48        Some("tsv") | Some("tab") => b'\t',
49        _ => b',',
50    }
51}
52
53/// Read a labelled matrix from a delimited text file using an explicit
54/// `delimiter` byte (e.g. `b','` or `b'\t'`). The first row is a header whose
55/// first cell is an ignored corner label; the first column of each subsequent
56/// row is the row name. Empty / `NA` / `NaN` cells parse to `f64::NAN`.
57#[cfg(feature = "cli")]
58pub fn read_matrix_with_delimiter(path: &Path, delimiter: u8) -> Result<LabeledMatrix> {
59    let mut rdr = csv::ReaderBuilder::new()
60        .has_headers(true)
61        .flexible(false)
62        .delimiter(delimiter)
63        .from_path(path)
64        .with_context(|| format!("opening {}", path.display()))?;
65
66    let header = rdr.headers()?.clone();
67    if header.len() < 2 {
68        bail!("{}: expected at least one data column", path.display());
69    }
70    let col_names: Vec<String> = header.iter().skip(1).map(|s| s.to_string()).collect();
71    let ncol = col_names.len();
72
73    let mut row_names = Vec::new();
74    let mut values: Vec<f64> = Vec::new();
75    // Read into a single reused ByteRecord rather than allocating a fresh
76    // StringRecord per row: one buffer for the whole file instead of one per
77    // line. Numeric cells still go through `parse_cell` on a UTF-8 view, so the
78    // parsed values are byte-for-byte identical to the previous path.
79    let mut rec = csv::ByteRecord::new();
80    let mut i = 0usize;
81    while rdr
82        .read_byte_record(&mut rec)
83        .with_context(|| format!("{}: reading row {}", path.display(), i + 1))?
84    {
85        if rec.len() != ncol + 1 {
86            bail!(
87                "{}: row {} has {} fields, expected {}",
88                path.display(),
89                i + 1,
90                rec.len(),
91                ncol + 1
92            );
93        }
94        row_names.push(String::from_utf8_lossy(&rec[0]).into_owned());
95        values.reserve(ncol);
96        for j in 0..ncol {
97            let cell = std::str::from_utf8(&rec[j + 1]).map_or(f64::NAN, parse_cell);
98            values.push(cell);
99        }
100        i += 1;
101    }
102    let nrow = row_names.len();
103    let data = Array2::from_shape_vec((nrow, ncol), values)
104        .with_context(|| format!("{}: assembling matrix", path.display()))?;
105    Ok(LabeledMatrix {
106        data,
107        row_names,
108        col_names,
109    })
110}
111
112#[cfg(feature = "cli")]
113fn parse_cell(s: &str) -> f64 {
114    let t = s.trim();
115    if t.is_empty() || t.eq_ignore_ascii_case("na") || t.eq_ignore_ascii_case("nan") {
116        return f64::NAN;
117    }
118    t.parse::<f64>().unwrap_or(f64::NAN)
119}
120
121/// Reorder the rows of `design` so they line up with `sample_order` (the
122/// expression-matrix column names), matching on row name. Returns the aligned
123/// `n_samples x n_coef` design matrix.
124pub fn align_design(design: &LabeledMatrix, sample_order: &[String]) -> Result<Array2<f64>> {
125    let index: HashMap<&str, usize> = design
126        .row_names
127        .iter()
128        .enumerate()
129        .map(|(i, n)| (n.as_str(), i))
130        .collect();
131
132    // If every expression sample names a design row, align by name; otherwise
133    // require an exact positional match.
134    let by_name = sample_order.iter().all(|s| index.contains_key(s.as_str()));
135    let ncoef = design.col_names.len();
136    let mut out = Array2::<f64>::zeros((sample_order.len(), ncoef));
137    if by_name {
138        for (new_i, s) in sample_order.iter().enumerate() {
139            let old_i = index[s.as_str()];
140            for j in 0..ncoef {
141                out[[new_i, j]] = design.data[[old_i, j]];
142            }
143        }
144    } else {
145        if design.row_names.len() != sample_order.len() {
146            bail!(
147                "design has {} rows but expression has {} samples, and sample names do not match",
148                design.row_names.len(),
149                sample_order.len()
150            );
151        }
152        out = design.data.clone();
153    }
154    Ok(out)
155}
156
157/// Reorder the rows of a contrast matrix to match `coef_order` (the design's
158/// coefficient names). Returns the `n_coef x n_contrasts` matrix.
159pub fn align_contrasts(contrasts: &LabeledMatrix, coef_order: &[String]) -> Result<Array2<f64>> {
160    let index: HashMap<&str, usize> = contrasts
161        .row_names
162        .iter()
163        .enumerate()
164        .map(|(i, n)| (n.as_str(), i))
165        .collect();
166    let by_name = coef_order.iter().all(|c| index.contains_key(c.as_str()));
167    let ncont = contrasts.col_names.len();
168    let mut out = Array2::<f64>::zeros((coef_order.len(), ncont));
169    if by_name {
170        for (new_i, c) in coef_order.iter().enumerate() {
171            let old_i = index[c.as_str()];
172            for j in 0..ncont {
173                out[[new_i, j]] = contrasts.data[[old_i, j]];
174            }
175        }
176    } else {
177        if contrasts.row_names.len() != coef_order.len() {
178            bail!("contrast rows do not match design coefficients by name or position");
179        }
180        out = contrasts.data.clone();
181    }
182    Ok(out)
183}
184
185fn fmt(v: f64) -> String {
186    if v.is_nan() {
187        "NA".to_string()
188    } else {
189        format!("{:.10e}", v)
190    }
191}
192
193/// High-precision formatter for the parity dump: 17 significant digits is
194/// enough to round-trip an f64 exactly, so the comparison against R is
195/// limited by the computation, not by CSV rounding.
196fn fmt_hp(v: f64) -> String {
197    if v.is_nan() {
198        "NA".to_string()
199    } else {
200        format!("{:.17e}", v)
201    }
202}
203
204/// Quote `field` for `sep`-delimited output following RFC 4180: when it
205/// contains the `sep` byte, a double quote, a carriage return, or a line feed,
206/// wrap the whole field in double quotes and double any embedded quotes;
207/// otherwise return it unchanged. This lets ids and labels that contain the
208/// delimiter — e.g. metabolite names like `1,2-Di(4Z,7Z,10Z)-PE` — survive a
209/// round-trip through any RFC-4180 reader (`pandas.read_csv`, R `read.csv`, the
210/// `csv` crate, ...). `sep` must be ASCII.
211fn csv_quote(field: &str, sep: u8) -> std::borrow::Cow<'_, str> {
212    let needs_quote = field
213        .bytes()
214        .any(|b| b == sep || b == b'"' || b == b'\n' || b == b'\r');
215    if !needs_quote {
216        return std::borrow::Cow::Borrowed(field);
217    }
218    let mut quoted = String::with_capacity(field.len() + 2);
219    quoted.push('"');
220    for ch in field.chars() {
221        if ch == '"' {
222            quoted.push('"');
223        }
224        quoted.push(ch);
225    }
226    quoted.push('"');
227    std::borrow::Cow::Owned(quoted)
228}
229
230/// Write a single-contrast top table.
231pub fn write_top_table(path: &Path, rows: &[TopRow]) -> Result<()> {
232    let mut w = BufWriter::new(File::create(path)?);
233    writeln!(w, "id,log2FoldChange,lfcSE,AveExpr,t,P.Value,adj.P.Val,B")?;
234    for r in rows {
235        writeln!(
236            w,
237            "{},{},{},{},{},{},{},{}",
238            csv_quote(&r.id, b','),
239            fmt(r.log2_fold_change),
240            fmt(r.lfc_se),
241            fmt(r.ave_expr),
242            fmt(r.t),
243            fmt(r.p_value),
244            fmt(r.adj_p_value),
245            fmt(r.b)
246        )?;
247    }
248    Ok(())
249}
250
251/// Write a multi-contrast (F-statistic) top table.
252pub fn write_top_table_f(path: &Path, rows: &[TopRowF]) -> Result<()> {
253    let mut w = BufWriter::new(File::create(path)?);
254    writeln!(w, "id,AveExpr,F,P.Value,adj.P.Val")?;
255    for r in rows {
256        writeln!(
257            w,
258            "{},{},{},{},{}",
259            csv_quote(&r.id, b','),
260            fmt(r.ave_expr),
261            fmt(r.f),
262            fmt(r.p_value),
263            fmt(r.adj_p_value)
264        )?;
265    }
266    Ok(())
267}
268
269/// Write a `decideTests` outcome matrix (genes x contrasts) of -1/0/1, in
270/// original gene order. Header is `id` followed by the contrast names.
271pub fn write_test_results(path: &Path, res: &TestResults) -> Result<()> {
272    let mut w = BufWriter::new(File::create(path)?);
273    write!(w, "id")?;
274    for name in &res.coef_names {
275        write!(w, ",{}", csv_quote(name, b','))?;
276    }
277    writeln!(w)?;
278    for g in 0..res.data.nrows() {
279        write!(w, "{}", csv_quote(&res.gene_names[g], b','))?;
280        for j in 0..res.data.ncols() {
281            write!(w, ",{}", res.data[[g, j]])?;
282        }
283        writeln!(w)?;
284    }
285    Ok(())
286}
287
288/// Dump the full per-gene fit statistics in original gene order, one CSV row
289/// per gene per coefficient. Used for numerical parity checks against R limma.
290pub fn write_fit_dump(path: &Path, fit: &MArrayLM) -> Result<()> {
291    let mut w = BufWriter::new(File::create(path)?);
292    writeln!(
293        w,
294        "id,coef,coefficient,stdev_unscaled,t,p_value,lods,sigma,s2_post,df_total,F,F_p_value"
295    )?;
296    let t = fit.t.as_ref();
297    let p = fit.p_value.as_ref();
298    let lods = fit.lods.as_ref();
299    let s2_post = fit.s2_post.as_ref();
300    let df_total = fit.df_total.as_ref();
301    let f = fit.f_stat.as_ref();
302    let fp = fit.f_p_value.as_ref();
303    for g in 0..fit.n_genes() {
304        for j in 0..fit.n_coef() {
305            writeln!(
306                w,
307                "{},{},{},{},{},{},{},{},{},{},{},{}",
308                csv_quote(&fit.gene_names[g], b','),
309                csv_quote(&fit.coef_names[j], b','),
310                fmt_hp(fit.coefficients[[g, j]]),
311                fmt_hp(fit.stdev_unscaled[[g, j]]),
312                fmt_hp(t.map(|m| m[[g, j]]).unwrap_or(f64::NAN)),
313                fmt_hp(p.map(|m| m[[g, j]]).unwrap_or(f64::NAN)),
314                fmt_hp(lods.map(|m| m[[g, j]]).unwrap_or(f64::NAN)),
315                fmt_hp(fit.sigma[g]),
316                fmt_hp(s2_post.map(|m| m[g]).unwrap_or(f64::NAN)),
317                fmt_hp(df_total.map(|m| m[g]).unwrap_or(f64::NAN)),
318                fmt_hp(f.map(|m| m[g]).unwrap_or(f64::NAN)),
319                fmt_hp(fp.map(|m| m[g]).unwrap_or(f64::NAN)),
320            )?;
321        }
322    }
323    Ok(())
324}
325
326/// Format a finite `f64` the way R's `as.character`/`write.table` do: up to 15
327/// significant digits, trailing zeros dropped, choosing fixed vs scientific
328/// notation by whichever is shorter (R's `scipen = 0`, ties → fixed).
329/// `NaN` → `"NA"`, infinities → `"Inf"`/`"-Inf"`.
330fn fmt_r(v: f64) -> String {
331    if v.is_nan() {
332        return "NA".to_string();
333    }
334    if v.is_infinite() {
335        return if v > 0.0 { "Inf" } else { "-Inf" }.to_string();
336    }
337    if v == 0.0 {
338        return "0".to_string();
339    }
340    let neg = v < 0.0;
341    let a = v.abs();
342
343    // Canonical 15-significant-digit form, then the fewest significant digits
344    // (1..=15) that reproduce it — mirroring R's minimal round-trip width.
345    let s15 = format!("{:.14e}", a);
346    let mut mant = s15.clone();
347    for d in 1..=15usize {
348        let cand = format!("{:.*e}", d - 1, a);
349        if format!("{:.14e}", cand.parse::<f64>().unwrap()) == s15 {
350            mant = cand;
351            break;
352        }
353    }
354
355    let (mant_part, exp_part) = mant.split_once('e').unwrap();
356    let exp: i32 = exp_part.parse().unwrap();
357    let digits: String = mant_part.chars().filter(|&c| c != '.').collect();
358
359    let fixed = fmt_fixed(&digits, exp);
360    let sci = fmt_sci(&digits, exp);
361    let body = if fixed.len() <= sci.len() { fixed } else { sci };
362    if neg {
363        format!("-{body}")
364    } else {
365        body
366    }
367}
368
369/// Render significant `digits` (no leading zeros, first digit has place value
370/// `10^exp`) in plain fixed-point notation, trailing zeros trimmed.
371fn fmt_fixed(digits: &str, exp: i32) -> String {
372    let ndig = digits.len() as i32;
373    if exp >= 0 {
374        let ip_len = exp + 1;
375        if ip_len >= ndig {
376            format!("{}{}", digits, "0".repeat((ip_len - ndig) as usize))
377        } else {
378            let (ip, fp) = digits.split_at(ip_len as usize);
379            let fp = fp.trim_end_matches('0');
380            if fp.is_empty() {
381                ip.to_string()
382            } else {
383                format!("{ip}.{fp}")
384            }
385        }
386    } else {
387        let zeros = (-exp - 1) as usize;
388        let frac = format!("{}{}", "0".repeat(zeros), digits);
389        let frac = frac.trim_end_matches('0');
390        format!("0.{frac}")
391    }
392}
393
394/// Render significant `digits` in R's scientific notation: one leading digit,
395/// then a signed two-(or-more)-digit exponent (`1.92170677729296e-05`).
396fn fmt_sci(digits: &str, exp: i32) -> String {
397    let (first, rest) = digits.split_at(1);
398    let mant = if rest.is_empty() {
399        first.to_string()
400    } else {
401        format!("{first}.{rest}")
402    };
403    let sign = if exp < 0 { '-' } else { '+' };
404    format!("{mant}e{sign}{:02}", exp.abs())
405}
406
407/// Round `v` to `decimals` digits after the decimal point (negative `decimals`
408/// rounds to powers of ten), matching R's `round`.
409fn round_dec(v: f64, decimals: i32) -> f64 {
410    if !v.is_finite() {
411        return v;
412    }
413    if decimals >= 0 {
414        format!("{:.*}", decimals as usize, v).parse().unwrap_or(v)
415    } else {
416        let f = 10f64.powi(-decimals);
417        (v / f).round() * f
418    }
419}
420
421/// Options for [`write_fit`], mirroring the arguments of limma's `write.fit`.
422pub struct WriteFitOptions {
423    /// Significant-digit budget; `None` writes full precision (limma default).
424    pub digits: Option<i32>,
425    /// P-value adjustment method applied to the coefficient p-values.
426    pub adjust: Adjust,
427    /// Whether adjustment is done per coefficient (`Separate`) or pooled across
428    /// the whole matrix (`Global`).
429    pub method: DecideMethod,
430    /// Adjustment method for the moderated-F p-values.
431    pub f_adjust: Adjust,
432    /// Field separator (limma default `'\t'`).
433    pub sep: char,
434    /// Emit the gene names as a leading row-name column.
435    pub row_names: bool,
436}
437
438impl Default for WriteFitOptions {
439    fn default() -> Self {
440        Self {
441            digits: None,
442            adjust: Adjust::None,
443            method: DecideMethod::Separate,
444            f_adjust: Adjust::None,
445            sep: '\t',
446            row_names: true,
447        }
448    }
449}
450
451/// Adjust a `genes x coefs` p-value matrix either per column (`Separate`) or
452/// over all entries pooled column-major (`Global`), matching write.fit.
453fn adjust_pvalue_matrix(
454    p: &ndarray::Array2<f64>,
455    method: Adjust,
456    decide: DecideMethod,
457) -> ndarray::Array2<f64> {
458    let (ng, nc) = p.dim();
459    let mut out = ndarray::Array2::<f64>::zeros((ng, nc));
460    // write.fit only distinguishes "global" from "separate" (its default);
461    // anything that is not Global is treated as per-coefficient separate.
462    if let DecideMethod::Global = decide {
463        // R `as.vector(matrix)` is column-major: column 0 first.
464        let flat: Vec<f64> = (0..nc)
465            .flat_map(|j| (0..ng).map(move |g| p[[g, j]]))
466            .collect();
467        let adj = p_adjust(&flat, method);
468        let mut k = 0;
469        for j in 0..nc {
470            for g in 0..ng {
471                out[[g, j]] = adj[k];
472                k += 1;
473            }
474        }
475    } else {
476        for j in 0..nc {
477            let col: Vec<f64> = (0..ng).map(|g| p[[g, j]]).collect();
478            let adj = p_adjust(&col, method);
479            for g in 0..ng {
480                out[[g, j]] = adj[g];
481            }
482        }
483    }
484    out
485}
486
487/// Write an eBayes-processed [`MArrayLM`] fit to a delimited file, port of
488/// limma's `write.fit`. The fit must already carry moderated `t` and p-values
489/// (run [`crate::ebayes`] or [`crate::treat`] first). Columns are emitted in
490/// limma's order — `AveExpr`, then per-coefficient `Coef`, `t`, `P.value`,
491/// optional `P.value.adj`, then `F`/`F.p.value` (when present) and optional
492/// `F.p.value.adj`, then optional `Results` from a `decideTests` matrix. With a
493/// single coefficient the per-coefficient columns carry no `.<name>` suffix,
494/// matching R's `drop()`.
495pub fn write_fit(
496    path: &Path,
497    fit: &MArrayLM,
498    results: Option<&TestResults>,
499    opts: &WriteFitOptions,
500) -> Result<()> {
501    let t = fit
502        .t
503        .as_ref()
504        .context("write_fit needs moderated t-statistics; run eBayes/treat first")?;
505    let p = fit
506        .p_value
507        .as_ref()
508        .context("write_fit needs p-values; run eBayes/treat first")?;
509    let ng = fit.n_genes();
510    let nc = fit.n_coef();
511
512    let padj = match opts.adjust {
513        Adjust::None => None,
514        m => Some(adjust_pvalue_matrix(p, m, opts.method)),
515    };
516    let f = fit.f_stat.as_ref();
517    let fp = fit.f_p_value.as_ref();
518    let fpadj = match (opts.f_adjust, fp) {
519        (Adjust::None, _) | (_, None) => None,
520        (m, Some(fpv)) => Some(p_adjust(fpv.as_slice().unwrap(), m)),
521    };
522
523    // Per-column rounding offset relative to `digits`, as in write.fit.
524    let cell = |v: f64, delta: i32| -> String {
525        match opts.digits {
526            Some(d) => fmt_r(round_dec(v, d + delta)),
527            None => fmt_r(v),
528        }
529    };
530    let cname = |base: &str, j: usize| -> String {
531        if nc == 1 {
532            base.to_string()
533        } else {
534            format!("{base}.{}", fit.coef_names[j])
535        }
536    };
537
538    let mut columns: Vec<(String, Vec<String>)> = Vec::new();
539    columns.push((
540        "AveExpr".to_string(),
541        (0..ng).map(|g| cell(fit.amean[g], -1)).collect(),
542    ));
543    for j in 0..nc {
544        columns.push((
545            cname("Coef", j),
546            (0..ng).map(|g| cell(fit.coefficients[[g, j]], 0)).collect(),
547        ));
548    }
549    for j in 0..nc {
550        columns.push((
551            cname("t", j),
552            (0..ng).map(|g| cell(t[[g, j]], -1)).collect(),
553        ));
554    }
555    for j in 0..nc {
556        columns.push((
557            cname("P.value", j),
558            (0..ng).map(|g| cell(p[[g, j]], 2)).collect(),
559        ));
560    }
561    if let Some(pa) = &padj {
562        for j in 0..nc {
563            columns.push((
564                cname("P.value.adj", j),
565                (0..ng).map(|g| cell(pa[[g, j]], 3)).collect(),
566            ));
567        }
568    }
569    if let Some(fv) = f {
570        columns.push(("F".to_string(), (0..ng).map(|g| cell(fv[g], -1)).collect()));
571    }
572    if let Some(fpv) = fp {
573        columns.push((
574            "F.p.value".to_string(),
575            (0..ng).map(|g| cell(fpv[g], 2)).collect(),
576        ));
577    }
578    if let Some(fpa) = &fpadj {
579        columns.push((
580            "F.p.value.adj".to_string(),
581            (0..ng).map(|g| cell(fpa[g], 3)).collect(),
582        ));
583    }
584    if let Some(res) = results {
585        let rc = res.data.ncols();
586        for j in 0..rc {
587            let nm = if rc == 1 {
588                "Results".to_string()
589            } else {
590                format!("Results.{}", res.coef_names[j])
591            };
592            columns.push((nm, (0..ng).map(|g| res.data[[g, j]].to_string()).collect()));
593        }
594    }
595
596    let sep = opts.sep.to_string();
597    let mut w = BufWriter::new(File::create(path)?);
598
599    // Header: with row names, R's col.names=NA leaves a blank corner cell.
600    let header: Vec<&str> = columns.iter().map(|(h, _)| h.as_str()).collect();
601    if opts.row_names {
602        write!(w, "{sep}")?;
603    }
604    writeln!(w, "{}", header.join(&sep))?;
605
606    for g in 0..ng {
607        if opts.row_names {
608            write!(w, "{}{sep}", fit.gene_names[g])?;
609        }
610        let row: Vec<&str> = columns.iter().map(|(_, c)| c[g].as_str()).collect();
611        writeln!(w, "{}", row.join(&sep))?;
612    }
613    Ok(())
614}
615
616#[cfg(test)]
617mod write_fit_tests {
618    use super::*;
619    use crate::ebayes::ebayes;
620    use crate::fit::lmfit;
621    use ndarray::Array2;
622
623    // Reference strings from R `as.character(<double>)` (limma 3.68.3 / R 4.6.0),
624    // exactly what write.table emits for the full-precision (no-digits) path.
625    #[test]
626    fn fmt_r_matches_as_character() {
627        let cases: &[(f64, &str)] = &[
628            (0.0, "0"),
629            (0.502, "0.502"),
630            (-1.069, "-1.069"),
631            (6.29433333333333, "6.29433333333333"),
632            (0.744216015086904, "0.744216015086904"),
633            (0.000595501060937467, "0.000595501060937467"),
634            (1.92170677729296e-05, "1.92170677729296e-05"),
635            (7.00632124072959e-06, "7.00632124072959e-06"),
636            (36.1843587501443, "36.1843587501443"),
637            (0.0001, "1e-04"),
638            (100000.0, "1e+05"),
639            (123456.0, "123456"),
640        ];
641        for &(v, want) in cases {
642            assert_eq!(fmt_r(v), want, "fmt_r({v})");
643        }
644        assert_eq!(fmt_r(f64::NAN), "NA");
645        assert_eq!(fmt_r(f64::INFINITY), "Inf");
646        assert_eq!(fmt_r(f64::NEG_INFINITY), "-Inf");
647    }
648
649    fn build_fit() -> MArrayLM {
650        // set.seed(1); round(rnorm(30,6,2),3) laid out 5 genes x 6 samples.
651        let e = Array2::from_shape_vec(
652            (5, 6),
653            vec![
654                4.747, 4.359, 9.024, 5.91, 7.838, 5.888, 6.367, 6.975, 6.78, 5.968, 7.564, 5.688,
655                4.329, 7.477, 4.758, 7.888, 6.149, 3.058, 9.191, 7.152, 1.571, 7.642, 2.021, 5.044,
656                6.659, 5.389, 8.25, 7.188, 7.24, 6.836,
657            ],
658        )
659        .unwrap();
660        let design = Array2::from_shape_vec(
661            (6, 2),
662            vec![1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0],
663        )
664        .unwrap();
665        let genes = (1..=5).map(|i| format!("g{i}")).collect();
666        let coefs = vec!["Intercept".to_string(), "grpB".to_string()];
667        let mut fit = lmfit(&e, &design, genes, coefs).unwrap();
668        ebayes(&mut fit, 0.01, (0.1, 4.0), false, false).unwrap();
669        fit
670    }
671
672    fn tmp(tag: &str) -> std::path::PathBuf {
673        let nanos = std::time::SystemTime::now()
674            .duration_since(std::time::UNIX_EPOCH)
675            .unwrap()
676            .as_nanos();
677        std::env::temp_dir().join(format!(
678            "limma_wf_{}_{}_{}.tsv",
679            tag,
680            std::process::id(),
681            nanos
682        ))
683    }
684
685    fn close(a: f64, b: f64) -> bool {
686        (a - b).abs() <= 1e-6 * (1.0 + b.abs())
687    }
688
689    #[test]
690    fn write_fit_matches_r_separate() {
691        let fit = build_fit();
692        let path = tmp("sep");
693        let opts = WriteFitOptions {
694            adjust: Adjust::BH,
695            ..Default::default()
696        };
697        write_fit(&path, &fit, None, &opts).unwrap();
698        let txt = std::fs::read_to_string(&path).unwrap();
699        let _ = std::fs::remove_file(&path);
700
701        let mut lines = txt.lines();
702        assert_eq!(
703            lines.next().unwrap(),
704            "\tAveExpr\tCoef.Intercept\tCoef.grpB\tt.Intercept\tt.grpB\tP.value.Intercept\t\
705             P.value.grpB\tP.value.adj.Intercept\tP.value.adj.grpB\tF\tF.p.value"
706        );
707
708        // Each row of write.fit(adjust="BH", method="separate") from R.
709        let expected: [[f64; 11]; 5] = [
710            [
711                6.29433333333333,
712                6.04333333333333,
713                0.502,
714                5.77088235610168,
715                0.338964636041287,
716                0.000595501060937467,
717                0.744216015086904,
718                0.000992501768229112,
719                0.911512695521572,
720                36.1843587501443,
721                0.000166248928492668,
722            ],
723            [
724                6.557,
725                6.70733333333334,
726                -0.300666666666668,
727                9.78282382638691,
728                -0.310087762759466,
729                1.92170677729296e-05,
730                0.765193889323088,
731                8.20450194802642e-05,
732                0.911512695521572,
733                91.5097315793615,
734                7.00632124072959e-06,
735            ],
736            [
737                5.60983333333333,
738                5.52133333333334,
739                0.176999999999999,
740                5.07576159719018,
741                0.115057654632127,
742                0.00128191400310164,
743                0.911512695521572,
744                0.00160239250387704,
745                0.911512695521572,
746                26.6025021648719,
747                0.000452243348899521,
748            ],
749            [
750                5.43683333333333,
751                5.97133333333334,
752                -1.069,
753                3.76176082911824,
754                -0.476192523101982,
755                0.00658339642469066,
756                0.647914700333665,
757                0.00658339642469066,
758                0.911512695521572,
759                11.844291449427,
760                0.00515864337996187,
761            ],
762            [
763                6.927,
764                6.766,
765                0.322000000000001,
766                9.0399993371551,
767                0.304212656857555,
768                3.28180077921057e-05,
769                0.769488416182624,
770                8.20450194802642e-05,
771                0.911512695521572,
772                85.7033369243552,
773                8.80865266401849e-06,
774            ],
775        ];
776        let mut g = 0;
777        for line in lines {
778            let f: Vec<&str> = line.split('\t').collect();
779            assert_eq!(f.len(), 12, "row {g} field count");
780            assert_eq!(f[0], format!("g{}", g + 1));
781            for (c, &want) in expected[g].iter().enumerate() {
782                let got: f64 = f[c + 1].parse().unwrap();
783                assert!(close(got, want), "cell [{g}][{c}] got {got} want {want}");
784            }
785            g += 1;
786        }
787        assert_eq!(g, 5);
788    }
789
790    #[test]
791    fn write_fit_global_and_none_headers() {
792        let fit = build_fit();
793
794        // method="global" pools the adjustment across all coefficients.
795        let pg = tmp("glob");
796        write_fit(
797            &pg,
798            &fit,
799            None,
800            &WriteFitOptions {
801                adjust: Adjust::BH,
802                method: DecideMethod::Global,
803                ..Default::default()
804            },
805        )
806        .unwrap();
807        let gtxt = std::fs::read_to_string(&pg).unwrap();
808        let _ = std::fs::remove_file(&pg);
809        let g1: Vec<&str> = gtxt.lines().nth(1).unwrap().split('\t').collect();
810        // P.value.adj.Intercept / .grpB for gene 1 under global BH.
811        assert!(close(g1[8].parse().unwrap(), 0.00198500353645822));
812        assert!(close(g1[9].parse().unwrap(), 0.854987129091805));
813
814        // adjust="none" drops the two P.value.adj columns from the header.
815        let pn = tmp("none");
816        write_fit(&pn, &fit, None, &WriteFitOptions::default()).unwrap();
817        let ntxt = std::fs::read_to_string(&pn).unwrap();
818        let _ = std::fs::remove_file(&pn);
819        assert_eq!(
820            ntxt.lines().next().unwrap(),
821            "\tAveExpr\tCoef.Intercept\tCoef.grpB\tt.Intercept\tt.grpB\t\
822             P.value.Intercept\tP.value.grpB\tF\tF.p.value"
823        );
824    }
825
826    #[test]
827    fn csv_quote_rfc4180() {
828        use std::borrow::Cow;
829        // Plain ids are returned untouched (and without allocating).
830        assert!(matches!(csv_quote("NAMPT", b','), Cow::Borrowed(_)));
831        assert_eq!(&*csv_quote("NAMPT", b','), "NAMPT");
832        // A comma in the field forces quoting of the whole field.
833        assert_eq!(
834            &*csv_quote("1,2-Di(4Z,7Z,10Z)-PE", b','),
835            "\"1,2-Di(4Z,7Z,10Z)-PE\""
836        );
837        // Embedded double quotes are doubled, per RFC 4180.
838        assert_eq!(&*csv_quote("a\"b", b','), "\"a\"\"b\"");
839        // Newlines also trigger quoting.
840        assert_eq!(&*csv_quote("a\nb", b','), "\"a\nb\"");
841        // Quoting keys off the active separator: a comma is harmless under TAB,
842        // but a TAB is not.
843        assert!(matches!(csv_quote("a,b", b'\t'), Cow::Borrowed(_)));
844        assert_eq!(&*csv_quote("a\tb", b'\t'), "\"a\tb\"");
845    }
846
847    #[test]
848    fn write_top_table_quotes_comma_ids() {
849        let row = |id: &str| TopRow {
850            id: id.to_string(),
851            log2_fold_change: 1.23,
852            lfc_se: 0.45,
853            ave_expr: 8.1,
854            t: 2.0,
855            p_value: 0.01,
856            adj_p_value: 0.02,
857            b: 0.5,
858        };
859        let rows = vec![
860            row("1,2-Di(4Z,7Z,10Z)-PE"),
861            row("NAMPT"),
862            row("weird\"name"),
863        ];
864        let path = tmp("quote");
865        write_top_table(&path, &rows).unwrap();
866        let txt = std::fs::read_to_string(&path).unwrap();
867        let _ = std::fs::remove_file(&path);
868
869        let mut lines = txt.lines();
870        assert_eq!(
871            lines.next().unwrap(),
872            "id,log2FoldChange,lfcSE,AveExpr,t,P.Value,adj.P.Val,B"
873        );
874        // The comma-laden id is wrapped in quotes, so exactly 7 numeric fields
875        // (8 columns) follow it -- the row no longer shatters under a CSV reader.
876        let l1 = lines.next().unwrap();
877        let q = "\"1,2-Di(4Z,7Z,10Z)-PE\"";
878        assert!(l1.starts_with(q), "comma id not quoted: {l1}");
879        assert_eq!(
880            l1[q.len()..].matches(',').count(),
881            7,
882            "expected 7 trailing numeric fields: {l1}"
883        );
884        // A comma-free id stays bare.
885        assert!(lines.next().unwrap().starts_with("NAMPT,"));
886        // An embedded quote is doubled and the field wrapped.
887        assert!(lines.next().unwrap().starts_with("\"weird\"\"name\","));
888    }
889}
890
891#[cfg(all(test, feature = "cli"))]
892mod read_matrix_tests {
893    use super::*;
894
895    #[test]
896    fn delimiter_from_extension() {
897        assert_eq!(delimiter_for_path(Path::new("x.tsv")), b'\t');
898        assert_eq!(delimiter_for_path(Path::new("x.tab")), b'\t');
899        // Case-insensitive on the extension.
900        assert_eq!(delimiter_for_path(Path::new("X.TSV")), b'\t');
901        assert_eq!(delimiter_for_path(Path::new("dir/y.Tab")), b'\t');
902        // Everything else falls back to comma.
903        assert_eq!(delimiter_for_path(Path::new("x.csv")), b',');
904        assert_eq!(delimiter_for_path(Path::new("x.txt")), b',');
905        assert_eq!(delimiter_for_path(Path::new("noext")), b',');
906    }
907
908    fn tmp(tag: &str, ext: &str) -> std::path::PathBuf {
909        let nanos = std::time::SystemTime::now()
910            .duration_since(std::time::UNIX_EPOCH)
911            .unwrap()
912            .as_nanos();
913        std::env::temp_dir().join(format!(
914            "limma_rm_{}_{}_{}.{}",
915            tag,
916            std::process::id(),
917            nanos,
918            ext
919        ))
920    }
921
922    // Same 2x3 table, one delimiter each, with a blank cell that must read NaN.
923    fn body(sep: char) -> String {
924        format!(
925            "gene{sep}s1{sep}s2{sep}s3\n\
926             g1{sep}1.5{sep}{sep}3.5\n\
927             g2{sep}4{sep}NA{sep}6\n"
928        )
929    }
930
931    fn check(m: &LabeledMatrix) {
932        assert_eq!(m.col_names, ["s1", "s2", "s3"]);
933        assert_eq!(m.row_names, ["g1", "g2"]);
934        assert_eq!(m.data.shape(), [2, 3]);
935        assert_eq!(m.data[[0, 0]], 1.5);
936        assert!(m.data[[0, 1]].is_nan()); // blank cell
937        assert_eq!(m.data[[0, 2]], 3.5);
938        assert_eq!(m.data[[1, 0]], 4.0);
939        assert!(m.data[[1, 1]].is_nan()); // NA cell
940        assert_eq!(m.data[[1, 2]], 6.0);
941    }
942
943    #[test]
944    fn auto_detects_tsv_and_csv() {
945        // `.tsv` extension -> tab-delimited via read_matrix's auto-detect.
946        let pt = tmp("auto", "tsv");
947        std::fs::write(&pt, body('\t')).unwrap();
948        let mt = read_matrix(&pt).unwrap();
949        let _ = std::fs::remove_file(&pt);
950        check(&mt);
951
952        // `.csv` extension -> comma-delimited.
953        let pc = tmp("auto", "csv");
954        std::fs::write(&pc, body(',')).unwrap();
955        let mc = read_matrix(&pc).unwrap();
956        let _ = std::fs::remove_file(&pc);
957        check(&mc);
958    }
959
960    #[test]
961    fn explicit_delimiter_overrides_extension() {
962        // A semicolon-delimited file named `.csv`: auto-detect would pick comma
963        // and mis-parse, but the explicit override reads it correctly.
964        let p = tmp("override", "csv");
965        std::fs::write(&p, body(';')).unwrap();
966        let m = read_matrix_with_delimiter(&p, b';').unwrap();
967        let _ = std::fs::remove_file(&p);
968        check(&m);
969    }
970}
limma/io.rs

limma/
io.rs