Skip to main content

gapsmith_align/
mmseqs2.rs

1//! MMseqs2 aligner (protein-protein).
2//!
3//! Mirrors gapseq's explicit 4-command pipeline from
4//! `src/gapseq_find.sh:603–615`:
5//!
6//! ```text
7//! mmseqs createdb TARGET targetDB
8//! mmseqs createdb QUERY queryDB
9//! mmseqs search queryDB targetDB resultDB tmp --threads N -c 0.C
10//! mmseqs convertalis queryDB targetDB resultDB out.tsv
11//!     --format-output "qheader,pident,evalue,bits,qcov,theader,tstart,tend"
12//! ```
13//!
14//! Rationale: `mmseqs easy-search` would collapse the four steps into one,
15//! but it defaults to `align --alignment-mode 3` (full-alignment identity)
16//! whereas `search` reports the k-mer prefilter identity. Identity cutoffs
17//! downstream (`analyse_alignments.R`) were calibrated against the latter —
18//! so we match that semantics exactly. Native coverage is a 0–1 fraction;
19//! we rescale to 0–100 via [`crate::tsv::parse_tsv`].
20
21use crate::blast::which;
22use crate::error::{io_err, AlignError};
23use crate::hit::Hit;
24use crate::tsv::parse_tsv;
25use crate::{AlignOpts, Aligner};
26use std::fs::File;
27use std::io::BufReader;
28use std::path::Path;
29use std::process::Command;
30
31/// Columns in the mmseqs2 native format. Note `bits` (not `bitscore`),
32/// `qcov` (fraction), `theader` instead of `stitle`. We use `qheader`
33/// (rather than `query`) so the emitted first column matches what diamond
34/// and blastp produce from the same FASTA — both of those record the
35/// entire first whitespace-delimited token of the header. Post-processing
36/// in [`normalize_qheader`] replicates gapseq's sed fixup from
37/// `src/gapseq_find.sh`.
38const FORMAT: &str = "qheader,pident,evalue,bits,qcov,theader,tstart,tend";
39
40pub struct Mmseqs2Aligner {
41    /// Sensitivity (`-s`). Default 5.7 — matches mmseqs's default.
42    pub sensitivity: f32,
43}
44
45impl Mmseqs2Aligner {
46    pub fn new() -> Self {
47        Self { sensitivity: 5.7 }
48    }
49}
50impl Default for Mmseqs2Aligner {
51    fn default() -> Self {
52        Self::new()
53    }
54}
55
56fn require(tool: &'static str) -> Result<(), AlignError> {
57    if which(tool).is_some() {
58        Ok(())
59    } else {
60        Err(AlignError::ToolMissing { tool })
61    }
62}
63
64impl Aligner for Mmseqs2Aligner {
65    fn name(&self) -> &'static str {
66        "mmseqs2"
67    }
68
69    fn align(
70        &self,
71        query_fasta: &Path,
72        target_fasta: &Path,
73        opts: &AlignOpts,
74    ) -> Result<Vec<Hit>, AlignError> {
75        require("mmseqs")?;
76        let tmp = tempfile::tempdir().map_err(|e| io_err(Path::new("/tmp"), e))?;
77        let target_db = tmp.path().join("targetDB");
78        let query_db = tmp.path().join("queryDB");
79        let result_db = tmp.path().join("resultDB");
80        let out_tsv = tmp.path().join("aln.tsv");
81        let scratch = tmp.path().join("scratch");
82        std::fs::create_dir_all(&scratch).map_err(|e| io_err(&scratch, e))?;
83
84        let coverage = opts.coverage_pct as f64 / 100.0;
85        let verbosity = if opts.quiet { "1" } else { "3" };
86
87        // 1) createdb for target
88        run_mmseqs(&[
89            "createdb",
90            target_fasta.to_str().unwrap_or(""),
91            target_db.to_str().unwrap_or(""),
92            "-v",
93            verbosity,
94        ])?;
95
96        // 2) createdb for query
97        run_mmseqs(&[
98            "createdb",
99            query_fasta.to_str().unwrap_or(""),
100            query_db.to_str().unwrap_or(""),
101            "-v",
102            verbosity,
103        ])?;
104
105        // 3) search
106        let mut search = vec![
107            "search".to_string(),
108            query_db.display().to_string(),
109            target_db.display().to_string(),
110            result_db.display().to_string(),
111            scratch.display().to_string(),
112            "--threads".to_string(),
113            opts.threads.to_string(),
114            "-c".to_string(),
115            format!("{coverage:.2}"),
116            "-s".to_string(),
117            self.sensitivity.to_string(),
118            "-v".to_string(),
119            verbosity.to_string(),
120        ];
121        if let Some(e) = opts.evalue {
122            search.push("-e".into());
123            search.push(e.to_string());
124        }
125        for a in &opts.extra_args {
126            search.push(a.clone());
127        }
128        run_mmseqs(&search.iter().map(|s| s.as_str()).collect::<Vec<_>>())?;
129
130        // 4) convertalis → TSV
131        run_mmseqs(&[
132            "convertalis",
133            query_db.to_str().unwrap_or(""),
134            target_db.to_str().unwrap_or(""),
135            result_db.to_str().unwrap_or(""),
136            out_tsv.to_str().unwrap_or(""),
137            "--format-mode",
138            "0",
139            "--format-output",
140            FORMAT,
141            "-v",
142            verbosity,
143        ])?;
144
145        let f = File::open(&out_tsv).map_err(|e| io_err(&out_tsv, e))?;
146        let mut hits = parse_tsv(BufReader::new(f), true)?;
147        for h in &mut hits {
148            h.qseqid = normalize_qheader(&h.qseqid);
149        }
150        Ok(hits)
151    }
152}
153
154fn run_mmseqs(args: &[&str]) -> Result<(), AlignError> {
155    let mut cmd = Command::new("mmseqs");
156    for a in args {
157        cmd.arg(a);
158    }
159    tracing::debug!(?cmd, "running mmseqs");
160    let out = cmd
161        .output()
162        .map_err(|e| io_err(Path::new("mmseqs"), e))?;
163    if !out.status.success() {
164        return Err(AlignError::ToolFailed {
165            tool: "mmseqs2",
166            status: out.status,
167            stderr: String::from_utf8_lossy(&out.stderr).to_string(),
168        });
169    }
170    Ok(())
171}
172
173/// Reduce a mmseqs2 `qheader` field to its first whitespace-delimited token —
174/// matching diamond and blastp's `qseqid` output. Mirrors the sed fixup in
175/// `src/gapseq_find.sh` after a `convertalis` call.
176fn normalize_qheader(h: &str) -> String {
177    match h.split_once(|c: char| c.is_whitespace()) {
178        Some((first, _)) => first.to_string(),
179        None => h.to_string(),
180    }
181}
182
183#[cfg(test)]
184mod tests {
185    use super::*;
186
187    #[test]
188    fn strip_qheader_first_token() {
189        assert_eq!(
190            normalize_qheader("sp|P77580|ACDH_ECOLI Acetaldehyde dehydrogenase"),
191            "sp|P77580|ACDH_ECOLI"
192        );
193        assert_eq!(normalize_qheader("noSpaces"), "noSpaces");
194        assert_eq!(normalize_qheader("tab\tdelimited"), "tab");
195    }
196}