gapsmith_align/
mmseqs2.rs1use crate::blast::which;
22use crate::error::{io_err, AlignError};
23use crate::hit::Hit;
24use crate::tsv::parse_tsv;
25use crate::{AlignOpts, Aligner};
26use std::fs::File;
27use std::io::BufReader;
28use std::path::Path;
29use std::process::Command;
30
31const FORMAT: &str = "qheader,pident,evalue,bits,qcov,theader,tstart,tend";
39
40pub struct Mmseqs2Aligner {
41 pub sensitivity: f32,
43}
44
45impl Mmseqs2Aligner {
46 pub fn new() -> Self {
47 Self { sensitivity: 5.7 }
48 }
49}
50impl Default for Mmseqs2Aligner {
51 fn default() -> Self {
52 Self::new()
53 }
54}
55
56fn require(tool: &'static str) -> Result<(), AlignError> {
57 if which(tool).is_some() {
58 Ok(())
59 } else {
60 Err(AlignError::ToolMissing { tool })
61 }
62}
63
64impl Aligner for Mmseqs2Aligner {
65 fn name(&self) -> &'static str {
66 "mmseqs2"
67 }
68
69 fn align(
70 &self,
71 query_fasta: &Path,
72 target_fasta: &Path,
73 opts: &AlignOpts,
74 ) -> Result<Vec<Hit>, AlignError> {
75 require("mmseqs")?;
76 let tmp = tempfile::tempdir().map_err(|e| io_err(Path::new("/tmp"), e))?;
77 let target_db = tmp.path().join("targetDB");
78 let query_db = tmp.path().join("queryDB");
79 let result_db = tmp.path().join("resultDB");
80 let out_tsv = tmp.path().join("aln.tsv");
81 let scratch = tmp.path().join("scratch");
82 std::fs::create_dir_all(&scratch).map_err(|e| io_err(&scratch, e))?;
83
84 let coverage = opts.coverage_pct as f64 / 100.0;
85 let verbosity = if opts.quiet { "1" } else { "3" };
86
87 run_mmseqs(&[
89 "createdb",
90 target_fasta.to_str().unwrap_or(""),
91 target_db.to_str().unwrap_or(""),
92 "-v",
93 verbosity,
94 ])?;
95
96 run_mmseqs(&[
98 "createdb",
99 query_fasta.to_str().unwrap_or(""),
100 query_db.to_str().unwrap_or(""),
101 "-v",
102 verbosity,
103 ])?;
104
105 let mut search = vec![
107 "search".to_string(),
108 query_db.display().to_string(),
109 target_db.display().to_string(),
110 result_db.display().to_string(),
111 scratch.display().to_string(),
112 "--threads".to_string(),
113 opts.threads.to_string(),
114 "-c".to_string(),
115 format!("{coverage:.2}"),
116 "-s".to_string(),
117 self.sensitivity.to_string(),
118 "-v".to_string(),
119 verbosity.to_string(),
120 ];
121 if let Some(e) = opts.evalue {
122 search.push("-e".into());
123 search.push(e.to_string());
124 }
125 for a in &opts.extra_args {
126 search.push(a.clone());
127 }
128 run_mmseqs(&search.iter().map(|s| s.as_str()).collect::<Vec<_>>())?;
129
130 run_mmseqs(&[
132 "convertalis",
133 query_db.to_str().unwrap_or(""),
134 target_db.to_str().unwrap_or(""),
135 result_db.to_str().unwrap_or(""),
136 out_tsv.to_str().unwrap_or(""),
137 "--format-mode",
138 "0",
139 "--format-output",
140 FORMAT,
141 "-v",
142 verbosity,
143 ])?;
144
145 let f = File::open(&out_tsv).map_err(|e| io_err(&out_tsv, e))?;
146 let mut hits = parse_tsv(BufReader::new(f), true)?;
147 for h in &mut hits {
148 h.qseqid = normalize_qheader(&h.qseqid);
149 }
150 Ok(hits)
151 }
152}
153
154fn run_mmseqs(args: &[&str]) -> Result<(), AlignError> {
155 let mut cmd = Command::new("mmseqs");
156 for a in args {
157 cmd.arg(a);
158 }
159 tracing::debug!(?cmd, "running mmseqs");
160 let out = cmd
161 .output()
162 .map_err(|e| io_err(Path::new("mmseqs"), e))?;
163 if !out.status.success() {
164 return Err(AlignError::ToolFailed {
165 tool: "mmseqs2",
166 status: out.status,
167 stderr: String::from_utf8_lossy(&out.stderr).to_string(),
168 });
169 }
170 Ok(())
171}
172
173fn normalize_qheader(h: &str) -> String {
177 match h.split_once(|c: char| c.is_whitespace()) {
178 Some((first, _)) => first.to_string(),
179 None => h.to_string(),
180 }
181}
182
183#[cfg(test)]
184mod tests {
185 use super::*;
186
187 #[test]
188 fn strip_qheader_first_token() {
189 assert_eq!(
190 normalize_qheader("sp|P77580|ACDH_ECOLI Acetaldehyde dehydrogenase"),
191 "sp|P77580|ACDH_ECOLI"
192 );
193 assert_eq!(normalize_qheader("noSpaces"), "noSpaces");
194 assert_eq!(normalize_qheader("tab\tdelimited"), "tab");
195 }
196}