//! rsomics-tsv-join 0.1.0
//!
//! Join two TSV files by a shared key column — inner/left/outer join.

use rsomics_common::{Result, RsomicsError};
use std::collections::HashMap;
use std::fs::File;
use std::io::{BufRead, BufReader, BufWriter, Write};
use std::path::Path;

/// Inner-joins two tab-separated files on a shared key column and writes the result to `output`.
///
/// The right file is loaded fully into memory with `load_tsv`; the left file is then streamed
/// line by line. The output header is the left header line followed by every right header
/// column whose name differs from `key_col`. Each left data row whose key has an entry in the
/// right table is written unchanged, followed by the matching right row's fields with the key
/// column(s) left out.
///
/// A left row that is too short to contain the key column is looked up under the empty key.
/// Right-hand fields that lie beyond the end of the right header have no column name, so they
/// can never be the key column and are passed through unchanged.
///
/// Returns the number of joined data rows written (the header line is not counted).
///
/// # Errors
///
/// * `RsomicsError::InvalidInput` if the left file cannot be opened, is empty, or does not have
///   `key_col` in its header, or if `load_tsv` rejects the right file.
/// * `RsomicsError::Io` if reading a line from the left file or writing to `output` fails.
pub fn inner_join(left: &Path, right: &Path, key_col: &str, output: &mut dyn Write) -> Result<u64> {
    let (rh, rdata) = load_tsv(right, key_col)?;
    // Decide once, per right-hand column, whether it is echoed to the output. Every column
    // named like the key is dropped because the left row already carries the key value.
    let keep_right: Vec<bool> = rh.iter().map(|h| h.as_str() != key_col).collect();

    let src_file = File::open(left)
        .map_err(|e| RsomicsError::InvalidInput(format!("{}: {e}", left.display())))?;
    let mut lines = BufReader::new(src_file).lines();
    let mut out = BufWriter::new(output);

    let hdr_line = lines
        .next()
        .ok_or_else(|| RsomicsError::InvalidInput("empty left".into()))?
        .map_err(RsomicsError::Io)?;
    let lki = hdr_line
        .split('\t')
        .position(|c| c == key_col)
        .ok_or_else(|| RsomicsError::InvalidInput(format!("'{key_col}' not in left")))?;

    write!(out, "{hdr_line}").map_err(RsomicsError::Io)?;
    for (h, _) in rh.iter().zip(&keep_right).filter(|(_, keep)| **keep) {
        write!(out, "\t{h}").map_err(RsomicsError::Io)?;
    }
    writeln!(out).map_err(RsomicsError::Io)?;

    let mut count = 0u64;
    for line in lines {
        let line = line.map_err(RsomicsError::Io)?;
        // Only the key field is needed, so walk the split lazily instead of collecting it.
        let key = line.split('\t').nth(lki).unwrap_or("");
        if let Some(rrow) = rdata.get(key) {
            write!(out, "{line}").map_err(RsomicsError::Io)?;
            for (i, val) in rrow.iter().enumerate() {
                // Checked lookup: a right row may have more fields than the header, and
                // indexing the header directly with `i` would panic on such a row.
                if keep_right.get(i).copied().unwrap_or(true) {
                    write!(out, "\t{val}").map_err(RsomicsError::Io)?;
                }
            }
            writeln!(out).map_err(RsomicsError::Io)?;
            count += 1;
        }
    }
    // Flush explicitly so a write failure surfaces as an error instead of being lost on drop.
    out.flush().map_err(RsomicsError::Io)?;
    Ok(count)
}

/// Rows of a TSV file indexed by the value of their key column; each entry holds the row's
/// complete list of tab-separated fields (key field included).
type RowMap = HashMap<String, Vec<String>>;

/// Reads the TSV file at `path` into memory, indexed by the column named `key_col_name`.
///
/// The first line is taken as the header and split on tabs. Every following line is split on
/// tabs and stored under the value found at the key column's position. A row too short to
/// contain the key column is stored under the empty string. When several rows share a key,
/// the row read last replaces the earlier ones.
///
/// Returns the header columns together with the key-to-row table.
///
/// # Errors
///
/// * `RsomicsError::InvalidInput` if the file cannot be opened, has no lines at all, or its
///   header does not contain `key_col_name`.
/// * `RsomicsError::Io` if reading any line fails.
fn load_tsv(path: &Path, key_col_name: &str) -> Result<(Vec<String>, RowMap)> {
    let handle = match File::open(path) {
        Ok(f) => f,
        Err(e) => {
            return Err(RsomicsError::InvalidInput(format!(
                "{}: {e}",
                path.display()
            )))
        }
    };
    let mut rows = BufReader::new(handle).lines();

    // The header is mandatory: a file with no lines at all is rejected.
    let header_line = match rows.next() {
        Some(first) => first.map_err(RsomicsError::Io)?,
        None => return Err(RsomicsError::InvalidInput("empty".into())),
    };
    let columns: Vec<String> = header_line.split('\t').map(str::to_owned).collect();
    let key_idx = match columns.iter().position(|name| name == key_col_name) {
        Some(idx) => idx,
        None => {
            return Err(RsomicsError::InvalidInput(format!(
                "'{key_col_name}' not in right"
            )))
        }
    };

    let mut table = HashMap::new();
    for row in rows {
        let text = row.map_err(RsomicsError::Io)?;
        let cells: Vec<String> = text.split('\t').map(str::to_owned).collect();
        // Short rows without a key field fall back to the empty key.
        let row_key = match cells.get(key_idx) {
            Some(cell) => cell.clone(),
            None => String::new(),
        };
        table.insert(row_key, cells);
    }
    Ok((columns, table))
}