Skip to main content

gapsmith_align/
lib.rs

1//! Sequence alignment abstraction for gapsmith.
2//!
3//! Every aligner (blast, diamond, mmseqs2, precomputed TSV) exposes a common
4//! [`Aligner`] trait that takes a query FASTA and a target FASTA and returns
5//! a vector of [`Hit`]. Internally the shell-out implementations manage
6//! their own temp work-directories so callers just see FASTA-in, hits-out.
7//!
8//! # Example
9//!
10//! ```ignore
11//! use gapsmith_align::{AlignOpts, Aligner, DiamondAligner};
12//! use std::path::Path;
13//!
14//! let aligner = DiamondAligner;
15//! let hits = aligner.run(
16//!     Path::new("query.faa"),
17//!     &[Path::new("reference.faa")],
18//!     &AlignOpts::default(),
19//! ).unwrap();
20//! for h in hits.iter().take(5) {
21//!     println!("{}\t{}\t{}", h.qseqid, h.pident, h.bitscore);
22//! }
23//! ```
24//!
25//! # Backend selection
26//!
27//! - [`BlastpAligner`] — protein-vs-protein; always available if NCBI BLAST+
28//!   is on `PATH`. Slow on large genomes but the gapseq reference.
29//! - [`TblastnAligner`] — protein query vs nucleotide subject (rare; used
30//!   for nucleotide-based reference FASTAs).
31//! - [`DiamondAligner`] — 5-20× faster than BLASTp on large proteomes;
32//!   comparable sensitivity at `--more-sensitive` (which we default on).
33//! - [`Mmseqs2Aligner`] — fast k-mer-based alternative; we replicate
34//!   gapseq's 4-command pipeline (createdb → search → convertalis) rather
35//!   than `easy-search`, because the latter reports full-alignment
36//!   identities instead of the k-mer prefilter identities gapseq
37//!   calibrates against.
38//! - [`PrecomputedTsvAligner`] — skips the aligner entirely; reads a TSV
39//!   the caller produced with their own tool. Used by `gapsmith`'s
40//!   `--aligner precomputed` mode and by [`BatchClusterAligner`].
41//! - [`BatchClusterAligner`] — new in gapsmith. mmseqs2-clusters N
42//!   genomes, runs one alignment against the reference, then expands the
43//!   cluster membership to per-genome TSVs. Amortises aligner cost over
44//!   many genomes.
45//!
46//! Columns always emitted by our wrappers (matching gapseq's convention):
47//!
48//! | column  | meaning                                             |
49//! |---------|-----------------------------------------------------|
50//! | qseqid  | query identifier (full FASTA header, up to a space) |
51//! | pident  | percent identity (0–100)                            |
52//! | evalue  | BLAST-style e-value                                  |
53//! | bitscore| bit score                                           |
54//! | qcov    | query coverage (0–100)                              |
55//! | stitle  | subject title (may contain spaces)                  |
56//! | sstart  | subject start                                       |
57//! | send    | subject end                                         |
58//!
59//! This keeps parity with `src/gapseq_find.sh` lines 249–255.
60
61pub mod batch;
62pub mod blast;
63pub mod diamond;
64pub mod error;
65pub mod hit;
66pub mod mmseqs2;
67pub mod precomputed;
68pub mod tsv;
69
70pub use batch::{BatchClusterAligner, ClusterResult, GenomeHitSet, GenomeInput};
71
72// Re-exported for external parity tests that need to reuse the TSV parser.
73// Not part of the stable public API; prefer the per-backend aligners.
74
75pub use blast::{BlastpAligner, TblastnAligner};
76pub use diamond::DiamondAligner;
77pub use error::AlignError;
78pub use hit::Hit;
79pub use mmseqs2::Mmseqs2Aligner;
80pub use precomputed::PrecomputedTsvAligner;
81
82use std::path::Path;
83
84/// Options tuning an alignment run. Sensible gapseq defaults: coverage 75%,
85/// use all detected cores, no extra user args.
86#[derive(Debug, Clone)]
87pub struct AlignOpts {
88    /// Worker threads to pass to the underlying binary.
89    pub threads: usize,
90    /// Minimum query coverage, 0–100 (not enforced by PrecomputedTsvAligner).
91    pub coverage_pct: u32,
92    /// Optional e-value cutoff. None = leave to the tool's default.
93    pub evalue: Option<f64>,
94    /// Additional free-form CLI flags appended verbatim. Unsafe if fed
95    /// untrusted input — matches gapseq's `-R` flag in `src/gapseq_find.sh`.
96    pub extra_args: Vec<String>,
97    /// Suppress the tool's stderr when true. Useful in batch pipelines.
98    pub quiet: bool,
99}
100
101impl Default for AlignOpts {
102    fn default() -> Self {
103        Self {
104            threads: std::thread::available_parallelism()
105                .map(|n| n.get())
106                .unwrap_or(1),
107            coverage_pct: 75,
108            evalue: None,
109            extra_args: Vec::new(),
110            quiet: false,
111        }
112    }
113}
114
115/// Common trait implemented by every aligner backend.
116pub trait Aligner: Send + Sync {
117    /// Short label printed in logs (`"blast"`, `"diamond"`, `"mmseqs2"`,
118    /// `"tblastn"`, `"precomputed"`).
119    fn name(&self) -> &'static str;
120
121    /// Align `query_fasta` against `target_fasta`. The `target_fasta` may be
122    /// a protein or nucleotide file depending on the backend — check the
123    /// impl's docs.
124    fn align(
125        &self,
126        query_fasta: &Path,
127        target_fasta: &Path,
128        opts: &AlignOpts,
129    ) -> Result<Vec<Hit>, AlignError>;
130}