salmon_core/
quantmerge.rs1use std::collections::HashMap;
5use std::io::{self, BufRead, Write};
6use std::path::{Path, PathBuf};
7
8#[derive(Debug, Clone, Copy, PartialEq, Eq)]
10pub enum MergeColumn {
11 Len,
12 Elen,
13 Tpm,
14 NumReads,
15}
16
17impl MergeColumn {
18 pub fn parse(s: &str) -> Option<Self> {
20 match s.to_ascii_uppercase().as_str() {
21 "LEN" | "LENGTH" => Some(Self::Len),
22 "ELEN" | "ELENGTH" | "EFFLEN" | "EFFLENGTH" | "EFFECTIVELEN" | "EFFECTIVELENGTH" => {
23 Some(Self::Elen)
24 }
25 "TPM" => Some(Self::Tpm),
26 "NUMREADS" | "NUM_READS" | "NREADS" => Some(Self::NumReads),
27 _ => None,
28 }
29 }
30
31 fn index(self) -> usize {
34 match self {
35 Self::Len => 1,
36 Self::Elen => 2,
37 Self::Tpm => 3,
38 Self::NumReads => 4,
39 }
40 }
41}
42
43pub fn merge_quants(
49 quant_files: &[PathBuf],
50 names: &[String],
51 column: MergeColumn,
52 missing: &str,
53 out_path: &Path,
54) -> io::Result<usize> {
55 assert_eq!(quant_files.len(), names.len());
56 let nsamp = quant_files.len();
57 let col = column.index();
58
59 let mut order: Vec<String> = Vec::new();
60 let mut values: HashMap<String, Vec<Option<String>>> = HashMap::new();
61
62 for (s, qf) in quant_files.iter().enumerate() {
63 let reader = io::BufReader::new(
64 std::fs::File::open(qf)
65 .map_err(|e| io::Error::new(e.kind(), format!("opening {}: {e}", qf.display())))?,
66 );
67 for (lineno, line) in reader.lines().enumerate() {
68 let line = line?;
69 if lineno == 0 {
70 continue; }
72 if line.trim().is_empty() {
73 continue;
74 }
75 let toks: Vec<&str> = line.split('\t').collect();
76 if toks.len() <= col {
77 continue;
78 }
79 let name = toks[0];
80 let entry = values.entry(name.to_string()).or_insert_with(|| {
81 order.push(name.to_string());
82 vec![None; nsamp]
83 });
84 entry[s] = Some(toks[col].to_string());
85 }
86 }
87
88 if let Some(parent) = out_path.parent() {
89 if !parent.as_os_str().is_empty() {
90 std::fs::create_dir_all(parent)?;
91 }
92 }
93 let mut w = io::BufWriter::new(std::fs::File::create(out_path)?);
94 write!(w, "Name")?;
95 for n in names {
96 write!(w, "\t{n}")?;
97 }
98 writeln!(w)?;
99
100 let mut n_missing = 0usize;
101 for name in &order {
102 write!(w, "{name}")?;
103 for v in &values[name] {
104 match v {
105 Some(s) => write!(w, "\t{s}")?,
106 None => {
107 n_missing += 1;
108 write!(w, "\t{missing}")?;
109 }
110 }
111 }
112 writeln!(w)?;
113 }
114 w.flush()?;
115 Ok(n_missing)
116}
117
118#[cfg(test)]
119mod tests {
120 use super::*;
121
122 #[test]
123 fn column_aliases() {
124 assert_eq!(MergeColumn::parse("tpm"), Some(MergeColumn::Tpm));
125 assert_eq!(MergeColumn::parse("NumReads"), Some(MergeColumn::NumReads));
126 assert_eq!(
127 MergeColumn::parse("effectiveLength"),
128 Some(MergeColumn::Elen)
129 );
130 assert_eq!(MergeColumn::parse("len"), Some(MergeColumn::Len));
131 assert_eq!(MergeColumn::parse("bogus"), None);
132 }
133
134 #[test]
135 fn merges_with_missing() {
136 let dir = std::env::temp_dir().join(format!("qm_{}", std::process::id()));
137 std::fs::create_dir_all(&dir).unwrap();
138 let s1 = dir.join("s1.sf");
139 let s2 = dir.join("s2.sf");
140 std::fs::write(
141 &s1,
142 "Name\tLength\tEffectiveLength\tTPM\tNumReads\nt1\t100\t80.0\t10.5\t5\nt2\t200\t180.0\t20.0\t9\n",
143 )
144 .unwrap();
145 std::fs::write(
147 &s2,
148 "Name\tLength\tEffectiveLength\tTPM\tNumReads\nt2\t200\t180.0\t30.0\t12\nt3\t50\t30.0\t1.0\t1\n",
149 )
150 .unwrap();
151 let out = dir.join("merged.tsv");
152 let n = merge_quants(
153 &[s1, s2],
154 &["A".into(), "B".into()],
155 MergeColumn::Tpm,
156 "NA",
157 &out,
158 )
159 .unwrap();
160 let body = std::fs::read_to_string(&out).unwrap();
161 let lines: Vec<&str> = body.lines().collect();
162 assert_eq!(lines[0], "Name\tA\tB");
163 assert_eq!(lines[1], "t1\t10.5\tNA");
165 assert_eq!(lines[2], "t2\t20.0\t30.0");
166 assert_eq!(lines[3], "t3\tNA\t1.0");
167 assert_eq!(n, 2);
168 }
169}