Skip to main content

oxicuda_seq/alignment/
blast.rs

1//! BLAST-style seed-and-extend local alignment.
2//!
3//! Classic heuristic local aligner: index every exact `k`-mer ("seed") shared
4//! between the query and the subject, then extend each seed in both directions
5//! with an ungapped *X-drop* extension to grow high-scoring segment pairs
6//! (HSPs). This trades the optimality of Smith–Waterman for near-linear speed
7//! on long sequences, finding only the alignments anchored by an exact seed.
8
9use crate::error::{SeqError, SeqResult};
10use std::collections::{HashMap, HashSet};
11use std::hash::Hash;
12
13/// Configuration for the seed-and-extend aligner.
14#[derive(Debug, Clone, Copy)]
15pub struct BlastConfig {
16    /// Length of the exact-match seed (`k`-mer). Must be `> 0`.
17    pub kmer_len: usize,
18    /// Score awarded for each matching position.
19    pub match_score: i32,
20    /// Score (penalty, typically negative) for each mismatching position.
21    pub mismatch_score: i32,
22    /// X-drop threshold: extension stops once the running score falls more than
23    /// this far below the best score seen so far. Must be `> 0`.
24    pub x_drop: i32,
25}
26
27impl Default for BlastConfig {
28    fn default() -> Self {
29        Self {
30            kmer_len: 3,
31            match_score: 2,
32            mismatch_score: -1,
33            x_drop: 5,
34        }
35    }
36}
37
38/// A high-scoring segment pair (ungapped local alignment) found by extension.
39#[derive(Debug, Clone, Copy, PartialEq, Eq)]
40pub struct Hsp {
41    /// Start offset of the segment in the query.
42    pub query_start: usize,
43    /// Start offset of the segment in the subject.
44    pub subject_start: usize,
45    /// Length of the aligned (ungapped) segment.
46    pub length: usize,
47    /// Alignment score of the segment.
48    pub score: i32,
49}
50
51/// Seed-and-extend aligner with a validated configuration.
52#[derive(Debug, Clone)]
53pub struct BlastAligner {
54    config: BlastConfig,
55}
56
57impl BlastAligner {
58    /// Construct a new aligner, validating the configuration.
59    ///
60    /// # Errors
61    /// * [`SeqError::InvalidConfiguration`] if `kmer_len == 0`.
62    /// * [`SeqError::InvalidParameter`] if `x_drop <= 0`.
63    pub fn new(config: BlastConfig) -> SeqResult<Self> {
64        if config.kmer_len == 0 {
65            return Err(SeqError::InvalidConfiguration(
66                "kmer_len must be > 0".to_string(),
67            ));
68        }
69        if config.x_drop <= 0 {
70            return Err(SeqError::InvalidParameter {
71                name: "x_drop".to_string(),
72                value: f64::from(config.x_drop),
73            });
74        }
75        Ok(Self { config })
76    }
77
78    /// Borrow the validated configuration.
79    pub fn config(&self) -> &BlastConfig {
80        &self.config
81    }
82
83    /// Search `subject` for high-scoring segment pairs anchored by exact
84    /// `k`-mer seeds shared with `query`.
85    ///
86    /// Returned HSPs are deduplicated (segments subsumed by a longer HSP on the
87    /// same diagonal are dropped) and sorted by descending score, then by
88    /// position for determinism.
89    ///
90    /// # Errors
91    /// [`SeqError::EmptyInput`] if either sequence is empty. Non-empty
92    /// sequences shorter than one seed simply yield an empty result.
93    pub fn search<T: Eq + Hash>(&self, query: &[T], subject: &[T]) -> SeqResult<Vec<Hsp>> {
94        if query.is_empty() || subject.is_empty() {
95            return Err(SeqError::EmptyInput);
96        }
97        let k = self.config.kmer_len;
98        // Sequences shorter than one seed cannot share a k-mer.
99        if query.len() < k || subject.len() < k {
100            return Ok(Vec::new());
101        }
102
103        // Index every k-mer of the subject → list of start positions.
104        let mut index: HashMap<&[T], Vec<usize>> = HashMap::new();
105        for sp in 0..=(subject.len() - k) {
106            index.entry(&subject[sp..sp + k]).or_default().push(sp);
107        }
108
109        // Every (query k-mer, subject hit) pair is a seed; extend each one.
110        let mut seen: HashSet<(usize, usize, usize)> = HashSet::new();
111        let mut hsps: Vec<Hsp> = Vec::new();
112        for qp in 0..=(query.len() - k) {
113            let Some(hits) = index.get(&query[qp..qp + k]) else {
114                continue;
115            };
116            for &sp in hits {
117                let hsp = self.extend_seed(query, subject, qp, sp);
118                if seen.insert((hsp.query_start, hsp.subject_start, hsp.length)) {
119                    hsps.push(hsp);
120                }
121            }
122        }
123
124        // Drop HSPs whose span is contained in a longer HSP on the same diagonal.
125        let mut kept = dedup_subsumed(hsps);
126        // Deterministic ordering: best score first, then by position.
127        kept.sort_by(|x, y| {
128            y.score
129                .cmp(&x.score)
130                .then(x.query_start.cmp(&y.query_start))
131                .then(x.subject_start.cmp(&y.subject_start))
132        });
133        Ok(kept)
134    }
135
136    /// Ungapped X-drop extension of an exact `k`-mer seed anchored at
137    /// query offset `qp` and subject offset `sp`.
138    fn extend_seed<T: Eq>(&self, query: &[T], subject: &[T], qp: usize, sp: usize) -> Hsp {
139        let k = self.config.kmer_len;
140        let mtch = self.config.match_score;
141        let mis = self.config.mismatch_score;
142        let x_drop = self.config.x_drop;
143
144        // Right extension beyond the seed.
145        let mut right_gain = 0i32;
146        let mut right_len = 0usize;
147        {
148            let mut run = 0i32;
149            let mut ext = 0usize;
150            while qp + k + ext < query.len() && sp + k + ext < subject.len() {
151                let step = if query[qp + k + ext] == subject[sp + k + ext] {
152                    mtch
153                } else {
154                    mis
155                };
156                run = run.saturating_add(step);
157                ext += 1;
158                if run > right_gain {
159                    right_gain = run;
160                    right_len = ext;
161                }
162                if right_gain - run > x_drop {
163                    break;
164                }
165            }
166        }
167
168        // Left extension before the seed.
169        let mut left_gain = 0i32;
170        let mut left_len = 0usize;
171        {
172            let mut run = 0i32;
173            let mut ext = 0usize;
174            while qp > ext && sp > ext {
175                let step = if query[qp - 1 - ext] == subject[sp - 1 - ext] {
176                    mtch
177                } else {
178                    mis
179                };
180                run = run.saturating_add(step);
181                ext += 1;
182                if run > left_gain {
183                    left_gain = run;
184                    left_len = ext;
185                }
186                if left_gain - run > x_drop {
187                    break;
188                }
189            }
190        }
191
192        let seed_score = (k as i32).saturating_mul(mtch);
193        Hsp {
194            query_start: qp - left_len,
195            subject_start: sp - left_len,
196            length: left_len + k + right_len,
197            score: seed_score
198                .saturating_add(left_gain)
199                .saturating_add(right_gain),
200        }
201    }
202}
203
204/// Remove HSPs whose span is fully contained within a strictly longer HSP on
205/// the same diagonal (`subject_start − query_start`).
206fn dedup_subsumed(hsps: Vec<Hsp>) -> Vec<Hsp> {
207    let n = hsps.len();
208    let mut keep = vec![true; n];
209    for i in 0..n {
210        for j in 0..n {
211            if i == j || !keep[j] {
212                continue;
213            }
214            let di = hsps[i].subject_start as isize - hsps[i].query_start as isize;
215            let dj = hsps[j].subject_start as isize - hsps[j].query_start as isize;
216            if di != dj {
217                continue;
218            }
219            let i_start = hsps[i].query_start;
220            let i_end = i_start + hsps[i].length;
221            let j_start = hsps[j].query_start;
222            let j_end = j_start + hsps[j].length;
223            if j_start <= i_start && i_end <= j_end && hsps[j].length > hsps[i].length {
224                keep[i] = false;
225                break;
226            }
227        }
228    }
229    hsps.into_iter()
230        .zip(keep)
231        .filter_map(|(h, k)| k.then_some(h))
232        .collect()
233}
234
235#[cfg(test)]
236mod tests {
237    use super::*;
238
239    #[test]
240    fn exact_substring_one_full_hsp() {
241        let cfg = BlastConfig {
242            kmer_len: 3,
243            match_score: 2,
244            mismatch_score: -1,
245            x_drop: 5,
246        };
247        let aligner = BlastAligner::new(cfg).expect("cfg");
248        let query = b"GATTACA";
249        let subject = b"AAGATTACAGG";
250        let hsps = aligner.search(query, subject).expect("search");
251        assert_eq!(hsps.len(), 1, "expected exactly one HSP, got {hsps:?}");
252        let h = hsps[0];
253        assert_eq!(h.length, query.len());
254        assert_eq!(h.score, query.len() as i32 * cfg.match_score);
255        assert_eq!(h.query_start, 0);
256        assert_eq!(h.subject_start, 2);
257    }
258
259    #[test]
260    fn no_shared_kmer_yields_empty() {
261        let aligner = BlastAligner::new(BlastConfig::default()).expect("cfg");
262        let query = b"AAAAAA";
263        let subject = b"CCCCCC";
264        let hsps = aligner.search(query, subject).expect("search");
265        assert!(hsps.is_empty());
266    }
267
268    #[test]
269    fn longer_kmer_reduces_sensitivity() {
270        // A divergent pair: short seeds still hit, a long seed finds nothing.
271        let query = b"ACGTACGTAC";
272        let subject = b"TGCATGCATG";
273        let short = BlastAligner::new(BlastConfig {
274            kmer_len: 1,
275            ..BlastConfig::default()
276        })
277        .expect("cfg");
278        let long = BlastAligner::new(BlastConfig {
279            kmer_len: 6,
280            ..BlastConfig::default()
281        })
282        .expect("cfg");
283        let short_hits = short.search(query, subject).expect("short");
284        let long_hits = long.search(query, subject).expect("long");
285        assert!(long_hits.len() <= short_hits.len());
286        assert!(long_hits.is_empty());
287    }
288
289    #[test]
290    fn xdrop_stops_at_mismatch_run() {
291        let cfg = BlastConfig {
292            kmer_len: 5,
293            match_score: 1,
294            mismatch_score: -2,
295            x_drop: 1,
296        };
297        let aligner = BlastAligner::new(cfg).expect("cfg");
298        let query = b"TTTTTGGGGG";
299        let subject = b"TTTTTAAAAA";
300        let hsps = aligner.search(query, subject).expect("search");
301        assert_eq!(hsps.len(), 1);
302        // Only the "TTTTT" seed survives; the mismatched tail is X-dropped.
303        assert_eq!(hsps[0].length, 5);
304        assert_eq!(hsps[0].score, 5);
305        assert_eq!(hsps[0].query_start, 0);
306        assert_eq!(hsps[0].subject_start, 0);
307    }
308
309    #[test]
310    fn invalid_config_and_empty_input_error() {
311        // k = 0 rejected at construction.
312        assert!(
313            BlastAligner::new(BlastConfig {
314                kmer_len: 0,
315                ..BlastConfig::default()
316            })
317            .is_err()
318        );
319        // Non-positive x_drop rejected at construction.
320        assert!(
321            BlastAligner::new(BlastConfig {
322                x_drop: 0,
323                ..BlastConfig::default()
324            })
325            .is_err()
326        );
327        // Empty input rejected at search time.
328        let aligner = BlastAligner::new(BlastConfig::default()).expect("cfg");
329        let empty: &[u8] = b"";
330        assert!(aligner.search(empty, b"ACGT").is_err());
331        assert!(aligner.search(b"ACGT", empty).is_err());
332    }
333}