use crate::blast::which;
use crate::error::{io_err, AlignError};
use crate::hit::Hit;
use crate::tsv::parse_tsv;
use crate::{AlignOpts, Aligner};
use std::fs::File;
use std::io::BufReader;
use std::path::Path;
use std::process::Command;
const FORMAT: &str = "qheader,pident,evalue,bits,qcov,theader,tstart,tend";
pub struct Mmseqs2Aligner {
pub sensitivity: f32,
}
impl Mmseqs2Aligner {
pub fn new() -> Self {
Self { sensitivity: 5.7 }
}
}
impl Default for Mmseqs2Aligner {
fn default() -> Self {
Self::new()
}
}
fn require(tool: &'static str) -> Result<(), AlignError> {
if which(tool).is_some() {
Ok(())
} else {
Err(AlignError::ToolMissing { tool })
}
}
impl Aligner for Mmseqs2Aligner {
fn name(&self) -> &'static str {
"mmseqs2"
}
fn align(
&self,
query_fasta: &Path,
target_fasta: &Path,
opts: &AlignOpts,
) -> Result<Vec<Hit>, AlignError> {
require("mmseqs")?;
let tmp = tempfile::tempdir().map_err(|e| io_err(Path::new("/tmp"), e))?;
let target_db = tmp.path().join("targetDB");
let query_db = tmp.path().join("queryDB");
let result_db = tmp.path().join("resultDB");
let out_tsv = tmp.path().join("aln.tsv");
let scratch = tmp.path().join("scratch");
std::fs::create_dir_all(&scratch).map_err(|e| io_err(&scratch, e))?;
let coverage = opts.coverage_pct as f64 / 100.0;
let verbosity = if opts.quiet { "1" } else { "3" };
run_mmseqs(&[
"createdb",
target_fasta.to_str().unwrap_or(""),
target_db.to_str().unwrap_or(""),
"-v",
verbosity,
])?;
run_mmseqs(&[
"createdb",
query_fasta.to_str().unwrap_or(""),
query_db.to_str().unwrap_or(""),
"-v",
verbosity,
])?;
let mut search = vec![
"search".to_string(),
query_db.display().to_string(),
target_db.display().to_string(),
result_db.display().to_string(),
scratch.display().to_string(),
"--threads".to_string(),
opts.threads.to_string(),
"-c".to_string(),
format!("{coverage:.2}"),
"-s".to_string(),
self.sensitivity.to_string(),
"-v".to_string(),
verbosity.to_string(),
];
if let Some(e) = opts.evalue {
search.push("-e".into());
search.push(e.to_string());
}
for a in &opts.extra_args {
search.push(a.clone());
}
run_mmseqs(&search.iter().map(|s| s.as_str()).collect::<Vec<_>>())?;
run_mmseqs(&[
"convertalis",
query_db.to_str().unwrap_or(""),
target_db.to_str().unwrap_or(""),
result_db.to_str().unwrap_or(""),
out_tsv.to_str().unwrap_or(""),
"--format-mode",
"0",
"--format-output",
FORMAT,
"-v",
verbosity,
])?;
let f = File::open(&out_tsv).map_err(|e| io_err(&out_tsv, e))?;
let mut hits = parse_tsv(BufReader::new(f), true)?;
for h in &mut hits {
h.qseqid = normalize_qheader(&h.qseqid);
}
Ok(hits)
}
}
fn run_mmseqs(args: &[&str]) -> Result<(), AlignError> {
let mut cmd = Command::new("mmseqs");
for a in args {
cmd.arg(a);
}
tracing::debug!(?cmd, "running mmseqs");
let out = cmd
.output()
.map_err(|e| io_err(Path::new("mmseqs"), e))?;
if !out.status.success() {
return Err(AlignError::ToolFailed {
tool: "mmseqs2",
status: out.status,
stderr: String::from_utf8_lossy(&out.stderr).to_string(),
});
}
Ok(())
}
fn normalize_qheader(h: &str) -> String {
match h.split_once(|c: char| c.is_whitespace()) {
Some((first, _)) => first.to_string(),
None => h.to_string(),
}
}
#[cfg(test)]
mod tests {
use super::*;
#[test]
fn strip_qheader_first_token() {
assert_eq!(
normalize_qheader("sp|P77580|ACDH_ECOLI Acetaldehyde dehydrogenase"),
"sp|P77580|ACDH_ECOLI"
);
assert_eq!(normalize_qheader("noSpaces"), "noSpaces");
assert_eq!(normalize_qheader("tab\tdelimited"), "tab");
}
}