gapsmith_align/lib.rs
1//! Sequence alignment abstraction for gapsmith.
2//!
3//! Every aligner (blast, diamond, mmseqs2, precomputed TSV) exposes a common
4//! [`Aligner`] trait that takes a query FASTA and a target FASTA and returns
5//! a vector of [`Hit`]. Internally the shell-out implementations manage
6//! their own temp work-directories so callers just see FASTA-in, hits-out.
7//!
8//! # Example
9//!
10//! ```ignore
11//! use gapsmith_align::{AlignOpts, Aligner, DiamondAligner};
12//! use std::path::Path;
13//!
14//! let aligner = DiamondAligner;
15//! let hits = aligner.run(
16//! Path::new("query.faa"),
17//! &[Path::new("reference.faa")],
18//! &AlignOpts::default(),
19//! ).unwrap();
20//! for h in hits.iter().take(5) {
21//! println!("{}\t{}\t{}", h.qseqid, h.pident, h.bitscore);
22//! }
23//! ```
24//!
25//! # Backend selection
26//!
27//! - [`BlastpAligner`] — protein-vs-protein; always available if NCBI BLAST+
28//! is on `PATH`. Slow on large genomes but the gapseq reference.
29//! - [`TblastnAligner`] — protein query vs nucleotide subject (rare; used
30//! for nucleotide-based reference FASTAs).
31//! - [`DiamondAligner`] — 5-20× faster than BLASTp on large proteomes;
32//! comparable sensitivity at `--more-sensitive` (which we default on).
33//! - [`Mmseqs2Aligner`] — fast k-mer-based alternative; we replicate
34//! gapseq's 4-command pipeline (createdb → search → convertalis) rather
35//! than `easy-search`, because the latter reports full-alignment
36//! identities instead of the k-mer prefilter identities gapseq
37//! calibrates against.
38//! - [`PrecomputedTsvAligner`] — skips the aligner entirely; reads a TSV
39//! the caller produced with their own tool. Used by `gapsmith`'s
40//! `--aligner precomputed` mode and by [`BatchClusterAligner`].
41//! - [`BatchClusterAligner`] — new in gapsmith. mmseqs2-clusters N
42//! genomes, runs one alignment against the reference, then expands the
43//! cluster membership to per-genome TSVs. Amortises aligner cost over
44//! many genomes.
45//!
46//! Columns always emitted by our wrappers (matching gapseq's convention):
47//!
48//! | column | meaning |
49//! |---------|-----------------------------------------------------|
50//! | qseqid | query identifier (full FASTA header, up to a space) |
51//! | pident | percent identity (0–100) |
52//! | evalue | BLAST-style e-value |
53//! | bitscore| bit score |
54//! | qcov | query coverage (0–100) |
55//! | stitle | subject title (may contain spaces) |
56//! | sstart | subject start |
57//! | send | subject end |
58//!
59//! This keeps parity with `src/gapseq_find.sh` lines 249–255.
60
61pub mod batch;
62pub mod blast;
63pub mod diamond;
64pub mod error;
65pub mod hit;
66pub mod mmseqs2;
67pub mod precomputed;
68pub mod tsv;
69
70pub use batch::{BatchClusterAligner, ClusterResult, GenomeHitSet, GenomeInput};
71
72// Re-exported for external parity tests that need to reuse the TSV parser.
73// Not part of the stable public API; prefer the per-backend aligners.
74
75pub use blast::{BlastpAligner, TblastnAligner};
76pub use diamond::DiamondAligner;
77pub use error::AlignError;
78pub use hit::Hit;
79pub use mmseqs2::Mmseqs2Aligner;
80pub use precomputed::PrecomputedTsvAligner;
81
82use std::path::Path;
83
84/// Options tuning an alignment run. Sensible gapseq defaults: coverage 75%,
85/// use all detected cores, no extra user args.
86#[derive(Debug, Clone)]
87pub struct AlignOpts {
88 /// Worker threads to pass to the underlying binary.
89 pub threads: usize,
90 /// Minimum query coverage, 0–100 (not enforced by PrecomputedTsvAligner).
91 pub coverage_pct: u32,
92 /// Optional e-value cutoff. None = leave to the tool's default.
93 pub evalue: Option<f64>,
94 /// Additional free-form CLI flags appended verbatim. Unsafe if fed
95 /// untrusted input — matches gapseq's `-R` flag in `src/gapseq_find.sh`.
96 pub extra_args: Vec<String>,
97 /// Suppress the tool's stderr when true. Useful in batch pipelines.
98 pub quiet: bool,
99}
100
101impl Default for AlignOpts {
102 fn default() -> Self {
103 Self {
104 threads: std::thread::available_parallelism()
105 .map(|n| n.get())
106 .unwrap_or(1),
107 coverage_pct: 75,
108 evalue: None,
109 extra_args: Vec::new(),
110 quiet: false,
111 }
112 }
113}
114
115/// Common trait implemented by every aligner backend.
116pub trait Aligner: Send + Sync {
117 /// Short label printed in logs (`"blast"`, `"diamond"`, `"mmseqs2"`,
118 /// `"tblastn"`, `"precomputed"`).
119 fn name(&self) -> &'static str;
120
121 /// Align `query_fasta` against `target_fasta`. The `target_fasta` may be
122 /// a protein or nucleotide file depending on the backend — check the
123 /// impl's docs.
124 fn align(
125 &self,
126 query_fasta: &Path,
127 target_fasta: &Path,
128 opts: &AlignOpts,
129 ) -> Result<Vec<Hit>, AlignError>;
130}