oxicuda_seq/alignment/
blast.rs1use crate::error::{SeqError, SeqResult};
10use std::collections::{HashMap, HashSet};
11use std::hash::Hash;
12
13#[derive(Debug, Clone, Copy)]
15pub struct BlastConfig {
16 pub kmer_len: usize,
18 pub match_score: i32,
20 pub mismatch_score: i32,
22 pub x_drop: i32,
25}
26
27impl Default for BlastConfig {
28 fn default() -> Self {
29 Self {
30 kmer_len: 3,
31 match_score: 2,
32 mismatch_score: -1,
33 x_drop: 5,
34 }
35 }
36}
37
38#[derive(Debug, Clone, Copy, PartialEq, Eq)]
40pub struct Hsp {
41 pub query_start: usize,
43 pub subject_start: usize,
45 pub length: usize,
47 pub score: i32,
49}
50
51#[derive(Debug, Clone)]
53pub struct BlastAligner {
54 config: BlastConfig,
55}
56
57impl BlastAligner {
58 pub fn new(config: BlastConfig) -> SeqResult<Self> {
64 if config.kmer_len == 0 {
65 return Err(SeqError::InvalidConfiguration(
66 "kmer_len must be > 0".to_string(),
67 ));
68 }
69 if config.x_drop <= 0 {
70 return Err(SeqError::InvalidParameter {
71 name: "x_drop".to_string(),
72 value: f64::from(config.x_drop),
73 });
74 }
75 Ok(Self { config })
76 }
77
78 pub fn config(&self) -> &BlastConfig {
80 &self.config
81 }
82
83 pub fn search<T: Eq + Hash>(&self, query: &[T], subject: &[T]) -> SeqResult<Vec<Hsp>> {
94 if query.is_empty() || subject.is_empty() {
95 return Err(SeqError::EmptyInput);
96 }
97 let k = self.config.kmer_len;
98 if query.len() < k || subject.len() < k {
100 return Ok(Vec::new());
101 }
102
103 let mut index: HashMap<&[T], Vec<usize>> = HashMap::new();
105 for sp in 0..=(subject.len() - k) {
106 index.entry(&subject[sp..sp + k]).or_default().push(sp);
107 }
108
109 let mut seen: HashSet<(usize, usize, usize)> = HashSet::new();
111 let mut hsps: Vec<Hsp> = Vec::new();
112 for qp in 0..=(query.len() - k) {
113 let Some(hits) = index.get(&query[qp..qp + k]) else {
114 continue;
115 };
116 for &sp in hits {
117 let hsp = self.extend_seed(query, subject, qp, sp);
118 if seen.insert((hsp.query_start, hsp.subject_start, hsp.length)) {
119 hsps.push(hsp);
120 }
121 }
122 }
123
124 let mut kept = dedup_subsumed(hsps);
126 kept.sort_by(|x, y| {
128 y.score
129 .cmp(&x.score)
130 .then(x.query_start.cmp(&y.query_start))
131 .then(x.subject_start.cmp(&y.subject_start))
132 });
133 Ok(kept)
134 }
135
136 fn extend_seed<T: Eq>(&self, query: &[T], subject: &[T], qp: usize, sp: usize) -> Hsp {
139 let k = self.config.kmer_len;
140 let mtch = self.config.match_score;
141 let mis = self.config.mismatch_score;
142 let x_drop = self.config.x_drop;
143
144 let mut right_gain = 0i32;
146 let mut right_len = 0usize;
147 {
148 let mut run = 0i32;
149 let mut ext = 0usize;
150 while qp + k + ext < query.len() && sp + k + ext < subject.len() {
151 let step = if query[qp + k + ext] == subject[sp + k + ext] {
152 mtch
153 } else {
154 mis
155 };
156 run = run.saturating_add(step);
157 ext += 1;
158 if run > right_gain {
159 right_gain = run;
160 right_len = ext;
161 }
162 if right_gain - run > x_drop {
163 break;
164 }
165 }
166 }
167
168 let mut left_gain = 0i32;
170 let mut left_len = 0usize;
171 {
172 let mut run = 0i32;
173 let mut ext = 0usize;
174 while qp > ext && sp > ext {
175 let step = if query[qp - 1 - ext] == subject[sp - 1 - ext] {
176 mtch
177 } else {
178 mis
179 };
180 run = run.saturating_add(step);
181 ext += 1;
182 if run > left_gain {
183 left_gain = run;
184 left_len = ext;
185 }
186 if left_gain - run > x_drop {
187 break;
188 }
189 }
190 }
191
192 let seed_score = (k as i32).saturating_mul(mtch);
193 Hsp {
194 query_start: qp - left_len,
195 subject_start: sp - left_len,
196 length: left_len + k + right_len,
197 score: seed_score
198 .saturating_add(left_gain)
199 .saturating_add(right_gain),
200 }
201 }
202}
203
204fn dedup_subsumed(hsps: Vec<Hsp>) -> Vec<Hsp> {
207 let n = hsps.len();
208 let mut keep = vec![true; n];
209 for i in 0..n {
210 for j in 0..n {
211 if i == j || !keep[j] {
212 continue;
213 }
214 let di = hsps[i].subject_start as isize - hsps[i].query_start as isize;
215 let dj = hsps[j].subject_start as isize - hsps[j].query_start as isize;
216 if di != dj {
217 continue;
218 }
219 let i_start = hsps[i].query_start;
220 let i_end = i_start + hsps[i].length;
221 let j_start = hsps[j].query_start;
222 let j_end = j_start + hsps[j].length;
223 if j_start <= i_start && i_end <= j_end && hsps[j].length > hsps[i].length {
224 keep[i] = false;
225 break;
226 }
227 }
228 }
229 hsps.into_iter()
230 .zip(keep)
231 .filter_map(|(h, k)| k.then_some(h))
232 .collect()
233}
234
235#[cfg(test)]
236mod tests {
237 use super::*;
238
239 #[test]
240 fn exact_substring_one_full_hsp() {
241 let cfg = BlastConfig {
242 kmer_len: 3,
243 match_score: 2,
244 mismatch_score: -1,
245 x_drop: 5,
246 };
247 let aligner = BlastAligner::new(cfg).expect("cfg");
248 let query = b"GATTACA";
249 let subject = b"AAGATTACAGG";
250 let hsps = aligner.search(query, subject).expect("search");
251 assert_eq!(hsps.len(), 1, "expected exactly one HSP, got {hsps:?}");
252 let h = hsps[0];
253 assert_eq!(h.length, query.len());
254 assert_eq!(h.score, query.len() as i32 * cfg.match_score);
255 assert_eq!(h.query_start, 0);
256 assert_eq!(h.subject_start, 2);
257 }
258
259 #[test]
260 fn no_shared_kmer_yields_empty() {
261 let aligner = BlastAligner::new(BlastConfig::default()).expect("cfg");
262 let query = b"AAAAAA";
263 let subject = b"CCCCCC";
264 let hsps = aligner.search(query, subject).expect("search");
265 assert!(hsps.is_empty());
266 }
267
268 #[test]
269 fn longer_kmer_reduces_sensitivity() {
270 let query = b"ACGTACGTAC";
272 let subject = b"TGCATGCATG";
273 let short = BlastAligner::new(BlastConfig {
274 kmer_len: 1,
275 ..BlastConfig::default()
276 })
277 .expect("cfg");
278 let long = BlastAligner::new(BlastConfig {
279 kmer_len: 6,
280 ..BlastConfig::default()
281 })
282 .expect("cfg");
283 let short_hits = short.search(query, subject).expect("short");
284 let long_hits = long.search(query, subject).expect("long");
285 assert!(long_hits.len() <= short_hits.len());
286 assert!(long_hits.is_empty());
287 }
288
289 #[test]
290 fn xdrop_stops_at_mismatch_run() {
291 let cfg = BlastConfig {
292 kmer_len: 5,
293 match_score: 1,
294 mismatch_score: -2,
295 x_drop: 1,
296 };
297 let aligner = BlastAligner::new(cfg).expect("cfg");
298 let query = b"TTTTTGGGGG";
299 let subject = b"TTTTTAAAAA";
300 let hsps = aligner.search(query, subject).expect("search");
301 assert_eq!(hsps.len(), 1);
302 assert_eq!(hsps[0].length, 5);
304 assert_eq!(hsps[0].score, 5);
305 assert_eq!(hsps[0].query_start, 0);
306 assert_eq!(hsps[0].subject_start, 0);
307 }
308
309 #[test]
310 fn invalid_config_and_empty_input_error() {
311 assert!(
313 BlastAligner::new(BlastConfig {
314 kmer_len: 0,
315 ..BlastConfig::default()
316 })
317 .is_err()
318 );
319 assert!(
321 BlastAligner::new(BlastConfig {
322 x_drop: 0,
323 ..BlastConfig::default()
324 })
325 .is_err()
326 );
327 let aligner = BlastAligner::new(BlastConfig::default()).expect("cfg");
329 let empty: &[u8] = b"";
330 assert!(aligner.search(empty, b"ACGT").is_err());
331 assert!(aligner.search(b"ACGT", empty).is_err());
332 }
333}