Skip to main content

gapsmith_align/
tsv.rs

//! TSV parser shared by every aligner backend.
//!
//! Input: lines of tab-separated values with the 8 columns documented on
//! the crate root (`qseqid pident evalue bitscore qcov stitle sstart send`).
//! Blank lines and `#`-prefixed comments are skipped.
//!
//! `coverage_is_fraction = true` indicates the source tool (e.g. mmseqs2)
//! reports `qcov` as a 0–1 fraction; we rescale to 0–100 for consistency.
9
10use crate::error::AlignError;
11use crate::hit::Hit;
12use std::io::BufRead;
13
14pub fn parse_tsv<R: BufRead>(
15    rdr: R,
16    coverage_is_fraction: bool,
17) -> Result<Vec<Hit>, AlignError> {
18    let mut out = Vec::new();
19    for (i, line) in rdr.lines().enumerate() {
20        let line = line.map_err(|e| AlignError::TsvParse {
21            line: (i + 1) as u64,
22            msg: format!("read error: {e}"),
23        })?;
24        let l = line.trim_end_matches('\r');
25        if l.is_empty() || l.starts_with('#') {
26            continue;
27        }
28        let hit = parse_hit_line(l, coverage_is_fraction, (i + 1) as u64)?;
29        out.push(hit);
30    }
31    Ok(out)
32}
33
34fn parse_hit_line(
35    line: &str,
36    coverage_is_fraction: bool,
37    line_no: u64,
38) -> Result<Hit, AlignError> {
39    // 8 fields — but stitle can contain tabs(?) it shouldn't; BLAST's -outfmt 6
40    // preserves spaces but replaces tabs in stitle. We split on tab directly.
41    let cols: Vec<&str> = line.splitn(8, '\t').collect();
42    if cols.len() < 8 {
43        return Err(AlignError::TsvParse {
44            line: line_no,
45            msg: format!("expected 8 tab-separated fields, got {}", cols.len()),
46        });
47    }
48    let parse_f32 = |s: &str, field: &str| -> Result<f32, AlignError> {
49        s.trim().parse().map_err(|_| AlignError::TsvParse {
50            line: line_no,
51            msg: format!("{field} `{s}` is not a float"),
52        })
53    };
54    let parse_f64 = |s: &str, field: &str| -> Result<f64, AlignError> {
55        s.trim().parse().map_err(|_| AlignError::TsvParse {
56            line: line_no,
57            msg: format!("{field} `{s}` is not a float"),
58        })
59    };
60    let parse_i = |s: &str, field: &str| -> Result<i32, AlignError> {
61        s.trim().parse().map_err(|_| AlignError::TsvParse {
62            line: line_no,
63            msg: format!("{field} `{s}` is not an integer"),
64        })
65    };
66
67    let mut qcov = parse_f32(cols[4], "qcov")?;
68    if coverage_is_fraction {
69        qcov *= 100.0;
70    }
71    Ok(Hit {
72        qseqid: cols[0].to_string(),
73        pident: parse_f32(cols[1], "pident")?,
74        evalue: parse_f64(cols[2], "evalue")?,
75        bitscore: parse_f32(cols[3], "bitscore")?,
76        qcov,
77        stitle: cols[5].to_string(),
78        sstart: parse_i(cols[6], "sstart")?,
79        send: parse_i(cols[7], "send")?,
80    })
81}
82
#[cfg(test)]
mod tests {
    use super::*;
    use std::io::Cursor;

    /// Runs `parse_tsv` over an in-memory string and unwraps the hits.
    fn hits_from(data: &str, coverage_is_fraction: bool) -> Vec<Hit> {
        parse_tsv(Cursor::new(data), coverage_is_fraction).unwrap()
    }

    /// A well-formed BLAST-style row populates every field of the hit.
    #[test]
    fn parses_blast_row() {
        let hits = hits_from(
            "q1\t95.5\t1e-100\t312.5\t90\ttarget1 some desc\t1\t200\n",
            false,
        );
        assert_eq!(hits.len(), 1);
        let hit = &hits[0];
        assert_eq!(hit.qseqid, "q1");
        assert_eq!(hit.pident, 95.5);
        assert_eq!(hit.evalue, 1e-100);
        assert_eq!(hit.bitscore, 312.5);
        assert_eq!(hit.qcov, 90.0);
        assert_eq!(hit.stitle, "target1 some desc");
        assert_eq!(hit.sstart, 1);
        assert_eq!(hit.send, 200);
    }

    /// Fractional coverage is rescaled to a percentage when requested.
    #[test]
    fn rescales_mmseqs_fraction_coverage() {
        let hits = hits_from("q1\t95.5\t1e-100\t312.5\t0.9\ttarget1\t1\t200\n", true);
        assert_eq!(hits[0].qcov, 90.0_f32);
    }

    /// Empty lines and `#` comments do not produce hits.
    #[test]
    fn skips_blank_and_comment() {
        let hits = hits_from(
            "\n#header\nq1\t95.5\t1e-100\t312.5\t90\tt1\t1\t200\n\n",
            false,
        );
        assert_eq!(hits.len(), 1);
    }

    /// A malformed second line is reported with line number 2.
    #[test]
    fn error_reports_line_number() {
        let input = Cursor::new("q1\t95.5\t1e-100\t312.5\t90\tt1\t1\t200\nBAD_LINE\n");
        match parse_tsv(input, false).unwrap_err() {
            AlignError::TsvParse { line, .. } => assert_eq!(line, 2),
            other => panic!("unexpected: {other}"),
        }
    }

    /// Spaces inside `stitle` survive parsing untouched.
    #[test]
    fn stitle_with_internal_spaces_preserved() {
        let hits = hits_from("q1\t95.5\t1e-100\t312.5\t90\ta b c d\t1\t200\n", false);
        assert_eq!(hits[0].stitle, "a b c d");
    }
}
133}