1use crate::kmer::{Kmer, KmerBits};
13use crate::minimizer::{MinimizerInfo, MinimizerIterator};
14use crate::encoding::encode_base;
15
16#[derive(Clone, Debug, PartialEq, Eq)]
18pub struct LookupResult {
19 pub kmer_id: u64,
21 pub kmer_id_in_string: u64,
23 pub kmer_offset: u64,
25 pub kmer_orientation: i8,
27
28 pub string_id: u64,
30 pub string_begin: u64,
32 pub string_end: u64,
34
35 pub minimizer_found: bool,
37}
38
39impl LookupResult {
40 pub fn not_found() -> Self {
42 Self {
43 kmer_id: u64::MAX,
44 kmer_id_in_string: u64::MAX,
45 kmer_offset: u64::MAX,
46 kmer_orientation: 1, string_id: u64::MAX,
48 string_begin: u64::MAX,
49 string_end: u64::MAX,
50 minimizer_found: true,
51 }
52 }
53
54 #[inline]
56 pub fn is_found(&self) -> bool {
57 self.kmer_id != u64::MAX
58 }
59
60 #[inline]
62 pub fn string_length(&self) -> u64 {
63 if self.is_found() {
64 self.string_end - self.string_begin
65 } else {
66 0
67 }
68 }
69}
70
71impl Default for LookupResult {
72 fn default() -> Self {
73 Self::not_found()
74 }
75}
76
77pub struct StreamingQuery<const K: usize>
93where
94 Kmer<K>: KmerBits,
95{
96 k: usize,
97 _m: usize, _canonical: bool, start: bool,
102 kmer: Option<Kmer<K>>,
103 kmer_rc: Option<Kmer<K>>,
104
105 minimizer_it: MinimizerIterator,
107 minimizer_it_rc: MinimizerIterator,
108 curr_mini_info: MinimizerInfo,
109 prev_mini_info: MinimizerInfo,
110 curr_mini_info_rc: MinimizerInfo,
111 prev_mini_info_rc: MinimizerInfo,
112
113 remaining_string_bases: u64,
115
116 result: LookupResult,
118
119 num_searches: u64,
121 num_extensions: u64,
122 num_invalid: u64,
123 num_negative: u64,
124}
125
126impl<const K: usize> StreamingQuery<K>
127where
128 Kmer<K>: KmerBits,
129{
130 pub fn new(k: usize, m: usize, canonical: bool) -> Self {
137 assert_eq!(k, K, "k parameter must match const generic K");
138
139 let dummy_mini = MinimizerInfo::new(u64::MAX, 0, 0);
140
141 Self {
142 k,
143 _m: m,
144 _canonical: canonical,
145 start: true,
146 kmer: None,
147 kmer_rc: None,
148 minimizer_it: MinimizerIterator::with_seed(k, m, 1),
149 minimizer_it_rc: MinimizerIterator::with_seed(k, m, 1),
150 curr_mini_info: dummy_mini,
151 prev_mini_info: dummy_mini,
152 curr_mini_info_rc: dummy_mini,
153 prev_mini_info_rc: dummy_mini,
154 remaining_string_bases: 0,
155 result: LookupResult::not_found(),
156 num_searches: 0,
157 num_extensions: 0,
158 num_invalid: 0,
159 num_negative: 0,
160 }
161 }
162
163 pub fn reset(&mut self) {
165 self.start = true;
166 self.remaining_string_bases = 0;
167 self.result = LookupResult::not_found();
168 self.minimizer_it.set_position(0);
169 self.minimizer_it_rc.set_position(0);
170 }
171
172 pub fn lookup(&mut self, kmer_bytes: &[u8]) -> LookupResult {
183 self.lookup_internal(kmer_bytes, None)
185 }
186
187 pub(crate) fn lookup_with_dict(&mut self, kmer_bytes: &[u8], dict: &crate::dictionary::Dictionary) -> LookupResult {
191 self.lookup_internal(kmer_bytes, Some(dict))
192 }
193
194 fn lookup_internal(&mut self, kmer_bytes: &[u8], dict_opt: Option<&crate::dictionary::Dictionary>) -> LookupResult {
195 let is_valid = if self.start {
197 self.is_valid_kmer_bytes(kmer_bytes)
198 } else {
199 self.is_valid_base(kmer_bytes[self.k - 1])
200 };
201
202 if !is_valid {
203 self.num_invalid += 1;
204 self.reset();
205 return self.result.clone();
206 }
207
208 if self.start {
210 let km = Kmer::<K>::from_ascii_unchecked(kmer_bytes);
212 self.kmer = Some(km);
213 let rc = km.reverse_complement();
214 self.kmer_rc = Some(rc);
215
216 self.curr_mini_info = self.minimizer_it.next(km);
217 self.curr_mini_info_rc = self.minimizer_it_rc.next(rc);
218 } else {
219 if let Some(mut km) = self.kmer {
221 for i in 0..(self.k - 1) {
223 let base = km.get_base(i + 1);
224 km.set_base(i, base);
225 }
226
227 let new_base = kmer_bytes[self.k - 1];
229 if let Ok(encoded) = encode_base(new_base) {
230 km.set_base(self.k - 1, encoded);
231
232 self.kmer = Some(km);
233
234 if let Some(mut km_rc) = self.kmer_rc {
236 for i in (1..self.k).rev() {
237 let base = km_rc.get_base(i - 1);
238 km_rc.set_base(i, base);
239 }
240
241 let complement = crate::encoding::complement_base(encoded);
243 km_rc.set_base(0, complement);
244
245 self.kmer_rc = Some(km_rc);
246
247 self.curr_mini_info = self.minimizer_it.next(km);
248 self.curr_mini_info_rc = self.minimizer_it_rc.next(km_rc);
249 }
250 }
251 }
252 }
253
254 if self.remaining_string_bases == 0 {
256 self.seed(dict_opt);
257 } else {
258 if let Some(dict) = dict_opt {
260 self.try_extend(dict);
261 } else {
262 self.seed(dict_opt);
264 }
265 }
266
267 self.prev_mini_info = self.curr_mini_info;
269 self.prev_mini_info_rc = self.curr_mini_info_rc;
270 self.start = false;
271
272 self.result.clone()
273 }
274
275 fn is_valid_kmer_bytes(&self, bytes: &[u8]) -> bool {
277 if bytes.len() != self.k {
278 return false;
279 }
280 for &b in bytes {
281 if !matches!(b, b'A' | b'C' | b'G' | b'T' | b'a' | b'c' | b'g' | b't') {
282 return false;
283 }
284 }
285 true
286 }
287
288 fn is_valid_base(&self, b: u8) -> bool {
290 matches!(b, b'A' | b'C' | b'G' | b'T' | b'a' | b'c' | b'g' | b't')
291 }
292
293 fn seed(&mut self, dict_opt: Option<&crate::dictionary::Dictionary>) {
297 self.remaining_string_bases = 0;
298
299 if !self.start
301 && self.curr_mini_info.value == self.prev_mini_info.value
302 && self.curr_mini_info_rc.value == self.prev_mini_info_rc.value
303 && !self.result.minimizer_found
304 {
305 assert_eq!(self.result.kmer_id, u64::MAX);
306 self.num_negative += 1;
307 return;
308 }
309
310 if let (Some(dict), Some(kmer)) = (dict_opt, self.kmer) {
311 if self._canonical {
312 let kmer_rc = kmer.reverse_complement();
321 let mini_fwd = dict.extract_minimizer::<K>(&kmer);
322 let mini_rc = dict.extract_minimizer::<K>(&kmer_rc);
323
324 if mini_fwd.value < mini_rc.value {
325 self.result = dict.lookup_canonical_streaming::<K>(&kmer, &kmer_rc, mini_fwd);
326 } else if mini_rc.value < mini_fwd.value {
327 self.result = dict.lookup_canonical_streaming::<K>(&kmer, &kmer_rc, mini_rc);
328 } else {
329 self.result = dict.lookup_canonical_streaming::<K>(&kmer, &kmer_rc, mini_fwd);
330 if self.result.kmer_id == u64::MAX {
331 self.result = dict.lookup_canonical_streaming::<K>(&kmer, &kmer_rc, mini_rc);
332 }
333 }
334 } else {
335 let mini_fwd = dict.extract_minimizer::<K>(&kmer);
338 self.result = dict.lookup_regular_streaming::<K>(&kmer, mini_fwd);
339 let minimizer_found = self.result.minimizer_found;
340 if self.result.kmer_id == u64::MAX {
341 assert_eq!(self.result.kmer_orientation, 1); let kmer_rc = kmer.reverse_complement();
343 let mini_rc = dict.extract_minimizer::<K>(&kmer_rc);
344 self.result = dict.lookup_regular_streaming::<K>(&kmer_rc, mini_rc);
345 self.result.kmer_orientation = -1; let minimizer_rc_found = self.result.minimizer_found;
347 self.result.minimizer_found = minimizer_rc_found || minimizer_found;
348 }
349 }
350
351 if self.result.kmer_id == u64::MAX {
352 self.num_negative += 1;
353 return;
354 }
355
356 assert!(self.result.minimizer_found);
357 self.num_searches += 1;
358
359 let string_size = self.result.string_end - self.result.string_begin;
363 if self.result.kmer_orientation > 0 {
364 self.remaining_string_bases =
365 (string_size - self.k as u64) - self.result.kmer_id_in_string;
366 } else {
367 self.remaining_string_bases = self.result.kmer_id_in_string;
368 }
369 } else {
370 self.result = LookupResult::not_found();
372 self.num_negative += 1;
373 }
374 }
375
376 fn try_extend(&mut self, dict: &crate::dictionary::Dictionary) {
382 if let (Some(kmer), Some(kmer_rc)) = (self.kmer, self.kmer_rc) {
383 let abs_pos = self.result.kmer_id_in_string as usize
387 + self.result.string_begin as usize;
388
389 let next_abs_pos = if self.result.kmer_orientation > 0 {
390 abs_pos + 1
391 } else {
392 abs_pos.wrapping_sub(1)
393 };
394
395 let expected_kmer: Kmer<K> = dict.spss().decode_kmer_at(next_abs_pos);
397
398 if expected_kmer.bits() == kmer.bits()
399 || expected_kmer.bits() == kmer_rc.bits()
400 {
401 self.num_extensions += 1;
403 let delta = self.result.kmer_orientation as i64;
404 self.result.kmer_id = (self.result.kmer_id as i64 + delta) as u64;
405 self.result.kmer_id_in_string =
406 (self.result.kmer_id_in_string as i64 + delta) as u64;
407 self.result.kmer_offset =
408 (self.result.kmer_offset as i64 + delta) as u64;
409 self.remaining_string_bases -= 1;
410 return;
411 }
412 }
413
414 self.seed(Some(dict));
416 }
417
418 pub fn num_searches(&self) -> u64 {
420 self.num_searches
421 }
422
423 pub fn num_extensions(&self) -> u64 {
425 self.num_extensions
426 }
427
428 pub fn num_positive_lookups(&self) -> u64 {
430 self.num_searches + self.num_extensions
431 }
432
433 pub fn num_negative_lookups(&self) -> u64 {
435 self.num_negative
436 }
437
438 pub fn num_invalid_lookups(&self) -> u64 {
440 self.num_invalid
441 }
442}
443
444#[cfg(test)]
445mod tests {
446 use super::*;
447
448 #[test]
449 fn test_lookup_result_creation() {
450 let result = LookupResult::not_found();
451 assert!(!result.is_found());
452 assert_eq!(result.kmer_id, u64::MAX);
453 }
454
455 #[test]
456 fn test_lookup_result_string_length() {
457 let mut result = LookupResult::not_found();
458 result.string_begin = 100;
459 result.string_end = 200;
460 result.kmer_id = 42; assert_eq!(result.string_length(), 100);
463 }
464
465 #[test]
466 fn test_streaming_query_creation() {
467 let query: StreamingQuery<31> = StreamingQuery::new(31, 13, true);
468 assert_eq!(query.k, 31);
469 assert_eq!(query._m, 13);
470 assert!(query._canonical);
471 assert_eq!(query.num_searches(), 0);
472 }
473
474 #[test]
475 fn test_streaming_query_reset() {
476 let mut query: StreamingQuery<31> = StreamingQuery::new(31, 13, false);
477 query.num_searches = 10;
478 query.num_extensions = 5;
479
480 query.reset();
481
482 assert!(query.start);
483 assert_eq!(query.remaining_string_bases, 0);
484 }
485
486 #[test]
487 fn test_streaming_query_validation() {
488 let query: StreamingQuery<31> = StreamingQuery::new(31, 13, true);
489
490 assert!(query.is_valid_kmer_bytes(b"ACGTACGTACGTACGTACGTACGTACGTACG")); assert!(!query.is_valid_kmer_bytes(b"ACGT")); assert!(!query.is_valid_kmer_bytes(b"ACGTACGTACGTACGTACGTACGTACGTACGN")); assert!(query.is_valid_base(b'A'));
495 assert!(query.is_valid_base(b'a'));
496 assert!(!query.is_valid_base(b'N'));
497 }
498
499 #[test]
500 fn test_streaming_query_lookup_invalid() {
501 let mut query: StreamingQuery<15> = StreamingQuery::new(15, 7, true);
502
503 let result = query.lookup(b"ACGT");
505 assert!(!result.is_found());
506 assert_eq!(query.num_invalid_lookups(), 1);
507
508 query.reset();
510 let result = query.lookup(b"ACGTACGTACGTACN");
511 assert!(!result.is_found());
512 assert_eq!(query.num_invalid_lookups(), 2);
513 }
514
515 #[test]
516 fn test_streaming_query_incremental_update() {
517 let mut query: StreamingQuery<9> = StreamingQuery::new(9, 5, false);
518
519 let _result1 = query.lookup(b"ACGTACGTA");
521 assert!(!query.start); let _result2 = query.lookup(b"CGTACGTAC");
525
526 assert!(!query.start);
528 }
529}
530
531pub struct StreamingQueryEngine<'a, const K: usize>
536where
537 Kmer<K>: KmerBits,
538{
539 dict: &'a crate::dictionary::Dictionary,
540 query: StreamingQuery<K>,
541}
542
543impl<'a, const K: usize> StreamingQueryEngine<'a, K>
544where
545 Kmer<K>: KmerBits,
546{
547 pub fn new(dict: &'a crate::dictionary::Dictionary) -> Self {
549 let canonical = dict.canonical();
550 Self {
551 dict,
552 query: StreamingQuery::new(dict.k(), dict.m(), canonical),
553 }
554 }
555
556 pub fn reset(&mut self) {
558 self.query.reset();
559 }
560
561 pub fn lookup(&mut self, kmer_bytes: &[u8]) -> LookupResult {
563 self.query.lookup_with_dict(kmer_bytes, self.dict)
565 }
566
567 pub fn num_searches(&self) -> u64 {
569 self.query.num_searches()
570 }
571
572 pub fn num_extensions(&self) -> u64 {
574 self.query.num_extensions()
575 }
576
577 pub fn stats(&self) -> StreamingQueryStats {
579 StreamingQueryStats {
580 num_searches: self.query.num_searches(),
581 num_extensions: self.query.num_extensions(),
582 num_invalid: self.query.num_invalid_lookups(),
583 num_negative: self.query.num_negative_lookups(),
584 }
585 }
586}
587
588#[derive(Debug, Clone)]
590pub struct StreamingQueryStats {
591 pub num_searches: u64,
593 pub num_extensions: u64,
595 pub num_invalid: u64,
597 pub num_negative: u64,
599}