Skip to main content

gapsmith_align/
precomputed.rs

1//! Pre-computed alignment-TSV aligner.
2//!
3//! The user provides a TSV produced by an earlier diamond / blastp / mmseqs2
4//! run (or a concatenation thereof — e.g. from the M4.5 batch-cluster mode).
5//! The `align` method ignores the `query_fasta`, `target_fasta`, and
6//! `opts` arguments and simply returns every parsed hit. Downstream layers
7//! (find/transport) are responsible for grouping hits by `qseqid` and
8//! applying bitscore / identity / coverage cutoffs.
9//!
10//! Motivation: batch annotation of many genomes can amortize a single
11//! alignment run over a clustered reference set instead of re-running
12//! blast/diamond per genome — exactly the use case gapsmith is being
13//! built for.
14
15use crate::error::{io_err, AlignError};
16use crate::hit::Hit;
17use crate::tsv::parse_tsv;
18use crate::{AlignOpts, Aligner};
19use std::fs::File;
20use std::io::BufReader;
21use std::path::{Path, PathBuf};
22use std::sync::OnceLock;
23
24pub struct PrecomputedTsvAligner {
25    path: PathBuf,
26    coverage_is_fraction: bool,
27    cache: OnceLock<Vec<Hit>>,
28}
29
30impl PrecomputedTsvAligner {
31    /// TSV where coverage is reported 0–100 (blast / diamond convention).
32    pub fn new_percentage(path: impl Into<PathBuf>) -> Self {
33        Self { path: path.into(), coverage_is_fraction: false, cache: OnceLock::new() }
34    }
35
36    /// TSV where coverage is a 0–1 fraction (mmseqs2 native output).
37    pub fn new_fraction(path: impl Into<PathBuf>) -> Self {
38        Self { path: path.into(), coverage_is_fraction: true, cache: OnceLock::new() }
39    }
40
41    fn load(&self) -> Result<&Vec<Hit>, AlignError> {
42        if let Some(hits) = self.cache.get() {
43            return Ok(hits);
44        }
45        let f = File::open(&self.path).map_err(|e| io_err(&self.path, e))?;
46        let rdr = BufReader::new(f);
47        let hits = parse_tsv(rdr, self.coverage_is_fraction)?;
48        let _ = self.cache.set(hits);
49        Ok(self.cache.get().unwrap())
50    }
51}
52
53impl Aligner for PrecomputedTsvAligner {
54    fn name(&self) -> &'static str {
55        "precomputed"
56    }
57
58    fn align(
59        &self,
60        _query_fasta: &Path,
61        _target_fasta: &Path,
62        _opts: &AlignOpts,
63    ) -> Result<Vec<Hit>, AlignError> {
64        let hits = self.load()?;
65        Ok(hits.clone())
66    }
67}
68
#[cfg(test)]
mod tests {
    use super::*;
    use std::io::Write;

    /// Parses a two-row TSV and checks that a second `align` call returns
    /// the same hits as the first.
    #[test]
    fn loads_and_caches() {
        let tmp = tempfile::tempdir().unwrap();
        let tsv_path = tmp.path().join("a.tsv");

        let rows = [
            "q1\t95.5\t1e-100\t312.5\t90\tt1\t1\t200",
            "q2\t80.0\t1e-20\t120.0\t70\tt2 descr\t10\t150",
        ];
        {
            // Scoped so the handle is closed before the aligner reads the file.
            let mut out = std::fs::File::create(&tsv_path).unwrap();
            for row in rows {
                writeln!(out, "{row}").unwrap();
            }
        }

        let aligner = PrecomputedTsvAligner::new_percentage(&tsv_path);
        let opts = AlignOpts::default();

        let first = aligner.align(Path::new("unused"), Path::new("unused"), &opts).unwrap();
        assert_eq!(first.len(), 2);
        assert_eq!(first[0].qseqid, "q1");
        assert_eq!(first[1].stitle, "t2 descr");

        // The second call is expected to be served from the cache. Timing is
        // not observable here, so only equality of the results is checked.
        let second = aligner.align(Path::new("x"), Path::new("y"), &opts).unwrap();
        assert_eq!(first, second);
    }
}
92}