Skip to main content

rsomics_tsv_join/
lib.rs

1use rsomics_common::{Result, RsomicsError};
2use std::collections::HashMap;
3use std::fs::File;
4use std::io::{BufRead, BufReader, BufWriter, Write};
5use std::path::Path;
6
7pub fn inner_join(left: &Path, right: &Path, key_col: &str, output: &mut dyn Write) -> Result<u64> {
8    let (rh, rdata) = load_tsv(right, key_col)?;
9    let src_file = File::open(left)
10        .map_err(|e| RsomicsError::InvalidInput(format!("{}: {e}", left.display())))?;
11    let buf_reader = BufReader::new(src_file);
12    let mut out = BufWriter::new(output);
13    let mut lines = buf_reader.lines();
14
15    let hdr_line = lines
16        .next()
17        .ok_or_else(|| RsomicsError::InvalidInput("empty left".into()))?
18        .map_err(RsomicsError::Io)?;
19    let lcols: Vec<&str> = hdr_line.split('\t').collect();
20    let lki = lcols
21        .iter()
22        .position(|c| *c == key_col)
23        .ok_or_else(|| RsomicsError::InvalidInput(format!("'{key_col}' not in left")))?;
24
25    write!(out, "{hdr_line}").map_err(RsomicsError::Io)?;
26    for h in &rh {
27        if *h != key_col {
28            write!(out, "\t{h}").map_err(RsomicsError::Io)?;
29        }
30    }
31    writeln!(out).map_err(RsomicsError::Io)?;
32
33    let mut count = 0u64;
34    for line in lines {
35        let line = line.map_err(RsomicsError::Io)?;
36        let fields: Vec<&str> = line.split('\t').collect();
37        let key = fields.get(lki).copied().unwrap_or("");
38        if let Some(rrow) = rdata.get(key) {
39            write!(out, "{line}").map_err(RsomicsError::Io)?;
40            for (i, val) in rrow.iter().enumerate() {
41                if rh[i] != key_col {
42                    write!(out, "\t{val}").map_err(RsomicsError::Io)?;
43                }
44            }
45            writeln!(out).map_err(RsomicsError::Io)?;
46            count += 1;
47        }
48    }
49    out.flush().map_err(RsomicsError::Io)?;
50    Ok(count)
51}
52
53type RowMap = HashMap<String, Vec<String>>;
54
55fn load_tsv(path: &Path, key_col_name: &str) -> Result<(Vec<String>, RowMap)> {
56    let file = File::open(path)
57        .map_err(|e| RsomicsError::InvalidInput(format!("{}: {e}", path.display())))?;
58    let reader = BufReader::new(file);
59    let mut lines = reader.lines();
60
61    let header = lines
62        .next()
63        .ok_or_else(|| RsomicsError::InvalidInput("empty".into()))?
64        .map_err(RsomicsError::Io)?;
65    let cols: Vec<String> = header.split('\t').map(String::from).collect();
66    let ki = cols
67        .iter()
68        .position(|c| c == key_col_name)
69        .ok_or_else(|| RsomicsError::InvalidInput(format!("'{key_col_name}' not in right")))?;
70
71    let mut data = HashMap::new();
72    for line in lines {
73        let line = line.map_err(RsomicsError::Io)?;
74        let fields: Vec<String> = line.split('\t').map(String::from).collect();
75        let key = fields.get(ki).cloned().unwrap_or_default();
76        data.insert(key, fields);
77    }
78    Ok((cols, data))
79}