Skip to main content

salmon_core/
quantmerge.rs

1//! `salmon quantmerge`: merge one column across several samples' `quant.sf`
2//! (or `quant.genes.sf`) into a single target × sample matrix TSV.
3
4use std::collections::HashMap;
5use std::io::{self, BufRead, Write};
6use std::path::{Path, PathBuf};
7
8/// Which `quant.sf` column to merge.
9#[derive(Debug, Clone, Copy, PartialEq, Eq)]
10pub enum MergeColumn {
11    Len,
12    Elen,
13    Tpm,
14    NumReads,
15}
16
17impl MergeColumn {
18    /// Parse a column name (case-insensitive), accepting salmon's aliases.
19    pub fn parse(s: &str) -> Option<Self> {
20        match s.to_ascii_uppercase().as_str() {
21            "LEN" | "LENGTH" => Some(Self::Len),
22            "ELEN" | "ELENGTH" | "EFFLEN" | "EFFLENGTH" | "EFFECTIVELEN" | "EFFECTIVELENGTH" => {
23                Some(Self::Elen)
24            }
25            "TPM" => Some(Self::Tpm),
26            "NUMREADS" | "NUM_READS" | "NREADS" => Some(Self::NumReads),
27            _ => None,
28        }
29    }
30
31    /// Column index in a `quant.sf` row (Name=0, Length=1, EffectiveLength=2,
32    /// TPM=3, NumReads=4).
33    fn index(self) -> usize {
34        match self {
35            Self::Len => 1,
36            Self::Elen => 2,
37            Self::Tpm => 3,
38            Self::NumReads => 4,
39        }
40    }
41}
42
43/// Merge the chosen `column` of each sample's quant file into `out_path`. The
44/// output has a `Name` header plus one column per sample (`names`), then one row
45/// per target (in first-seen order across the samples), with `missing` filling
46/// targets absent from a sample. The per-sample value strings are taken verbatim
47/// from each quant file. Returns the number of missing entries written.
48pub fn merge_quants(
49    quant_files: &[PathBuf],
50    names: &[String],
51    column: MergeColumn,
52    missing: &str,
53    out_path: &Path,
54) -> io::Result<usize> {
55    assert_eq!(quant_files.len(), names.len());
56    let nsamp = quant_files.len();
57    let col = column.index();
58
59    let mut order: Vec<String> = Vec::new();
60    let mut values: HashMap<String, Vec<Option<String>>> = HashMap::new();
61
62    for (s, qf) in quant_files.iter().enumerate() {
63        let reader = io::BufReader::new(
64            std::fs::File::open(qf)
65                .map_err(|e| io::Error::new(e.kind(), format!("opening {}: {e}", qf.display())))?,
66        );
67        for (lineno, line) in reader.lines().enumerate() {
68            let line = line?;
69            if lineno == 0 {
70                continue; // header
71            }
72            if line.trim().is_empty() {
73                continue;
74            }
75            let toks: Vec<&str> = line.split('\t').collect();
76            if toks.len() <= col {
77                continue;
78            }
79            let name = toks[0];
80            let entry = values.entry(name.to_string()).or_insert_with(|| {
81                order.push(name.to_string());
82                vec![None; nsamp]
83            });
84            entry[s] = Some(toks[col].to_string());
85        }
86    }
87
88    if let Some(parent) = out_path.parent() {
89        if !parent.as_os_str().is_empty() {
90            std::fs::create_dir_all(parent)?;
91        }
92    }
93    let mut w = io::BufWriter::new(std::fs::File::create(out_path)?);
94    write!(w, "Name")?;
95    for n in names {
96        write!(w, "\t{n}")?;
97    }
98    writeln!(w)?;
99
100    let mut n_missing = 0usize;
101    for name in &order {
102        write!(w, "{name}")?;
103        for v in &values[name] {
104            match v {
105                Some(s) => write!(w, "\t{s}")?,
106                None => {
107                    n_missing += 1;
108                    write!(w, "\t{missing}")?;
109                }
110            }
111        }
112        writeln!(w)?;
113    }
114    w.flush()?;
115    Ok(n_missing)
116}
117
118#[cfg(test)]
119mod tests {
120    use super::*;
121
122    #[test]
123    fn column_aliases() {
124        assert_eq!(MergeColumn::parse("tpm"), Some(MergeColumn::Tpm));
125        assert_eq!(MergeColumn::parse("NumReads"), Some(MergeColumn::NumReads));
126        assert_eq!(
127            MergeColumn::parse("effectiveLength"),
128            Some(MergeColumn::Elen)
129        );
130        assert_eq!(MergeColumn::parse("len"), Some(MergeColumn::Len));
131        assert_eq!(MergeColumn::parse("bogus"), None);
132    }
133
134    #[test]
135    fn merges_with_missing() {
136        let dir = std::env::temp_dir().join(format!("qm_{}", std::process::id()));
137        std::fs::create_dir_all(&dir).unwrap();
138        let s1 = dir.join("s1.sf");
139        let s2 = dir.join("s2.sf");
140        std::fs::write(
141            &s1,
142            "Name\tLength\tEffectiveLength\tTPM\tNumReads\nt1\t100\t80.0\t10.5\t5\nt2\t200\t180.0\t20.0\t9\n",
143        )
144        .unwrap();
145        // s2 is missing t1, has a new t3
146        std::fs::write(
147            &s2,
148            "Name\tLength\tEffectiveLength\tTPM\tNumReads\nt2\t200\t180.0\t30.0\t12\nt3\t50\t30.0\t1.0\t1\n",
149        )
150        .unwrap();
151        let out = dir.join("merged.tsv");
152        let n = merge_quants(
153            &[s1, s2],
154            &["A".into(), "B".into()],
155            MergeColumn::Tpm,
156            "NA",
157            &out,
158        )
159        .unwrap();
160        let body = std::fs::read_to_string(&out).unwrap();
161        let lines: Vec<&str> = body.lines().collect();
162        assert_eq!(lines[0], "Name\tA\tB");
163        // first-seen order: t1, t2, t3
164        assert_eq!(lines[1], "t1\t10.5\tNA");
165        assert_eq!(lines[2], "t2\t20.0\t30.0");
166        assert_eq!(lines[3], "t3\tNA\t1.0");
167        assert_eq!(n, 2);
168    }
169}