1use crate::{
9 kmer::{Kmer, KmerBits},
10 minimizer::MinimizerInfo,
11 minimizers_control_map::MinimizersControlMap,
12 sparse_and_skew_index::SparseAndSkewIndex,
13 spectrum_preserving_string_set::SpectrumPreservingStringSet,
14 constants::{ceil_log2, INVALID_UINT64, MIN_L},
15};
16use value_traits::slices::SliceByValue;
17use tracing::info;
18
19pub struct Dictionary {
24 spss: SpectrumPreservingStringSet,
26
27 control_map: MinimizersControlMap,
29
30 index: SparseAndSkewIndex,
32
33 k: usize,
35
36 m: usize,
38
39 canonical: bool,
41
42 hasher: crate::hasher::DeterministicHasher,
44}
45
46#[derive(Clone, Copy)]
49struct LocatedHit {
50 kmer_offset: u64,
51 orientation: i8,
52 string_id: u64,
53 string_begin: u64,
54 string_end: u64,
55}
56
57impl LocatedHit {
58 #[inline]
59 fn into_lookup_result(self, k: usize) -> crate::streaming_query::LookupResult {
60 let kmer_id_in_string = self.kmer_offset - self.string_begin;
61 let kmer_id = self.kmer_offset - self.string_id * (k as u64 - 1);
62 crate::streaming_query::LookupResult {
63 kmer_id,
64 kmer_id_in_string,
65 kmer_offset: self.kmer_offset,
66 kmer_orientation: self.orientation,
67 string_id: self.string_id,
68 string_begin: self.string_begin,
69 string_end: self.string_end,
70 minimizer_found: true,
71 }
72 }
73}
74
75impl Dictionary {
76 pub fn new(
78 spss: SpectrumPreservingStringSet,
79 control_map: MinimizersControlMap,
80 index: SparseAndSkewIndex,
81 k: usize,
82 m: usize,
83 canonical: bool,
84 ) -> Self {
85 Self {
86 spss,
87 control_map,
88 index,
89 k,
90 m,
91 canonical,
92 hasher: crate::hasher::DeterministicHasher::new(1),
93 }
94 }
95
96 #[inline]
98 pub fn lookup<const K: usize>(&self, kmer: &Kmer<K>) -> u64
99 where
100 Kmer<K>: KmerBits,
101 {
102 let (pos, _) = self.lookup_with_orientation(kmer);
103 pos
104 }
105
106 #[inline]
108 pub fn query<const K: usize>(&self, kmer: &Kmer<K>) -> crate::streaming_query::LookupResult
109 where
110 Kmer<K>: KmerBits,
111 {
112 if self.canonical {
113 return self.query_canonical(kmer);
114 }
115
116 if let Some(res) = self.query_regular(kmer, 1) {
118 return res;
119 }
120 let kmer_rc = kmer.reverse_complement();
121 if let Some(res) = self.query_regular(&kmer_rc, -1) {
122 return res;
123 }
124 crate::streaming_query::LookupResult::not_found()
125 }
126
127 #[inline]
129 fn query_regular<const K: usize>(
130 &self,
131 kmer: &Kmer<K>,
132 orientation: i8,
133 ) -> Option<crate::streaming_query::LookupResult>
134 where
135 Kmer<K>: KmerBits,
136 {
137 let minimizer_info = self.extract_minimizer(kmer);
138 let bucket_id = self.control_map.lookup(minimizer_info.value)?;
139 if bucket_id >= self.index.num_buckets() {
140 return None;
141 }
142
143 let mut hit = self.lookup_in_bucket::<K>(kmer, minimizer_info, bucket_id)?;
144 hit.orientation = orientation;
145 Some(hit.into_lookup_result(self.k))
146 }
147
148 #[inline]
150 fn query_canonical<const K: usize>(
151 &self,
152 kmer: &Kmer<K>,
153 ) -> crate::streaming_query::LookupResult
154 where
155 Kmer<K>: KmerBits,
156 {
157 match self.lookup_canonical_located(kmer) {
158 Some(hit) => hit.into_lookup_result(self.k),
159 None => crate::streaming_query::LookupResult::not_found(),
160 }
161 }
162
163 pub fn query_from_str<const K: usize>(&self, kmer_str: &str) -> crate::streaming_query::LookupResult
165 where
166 Kmer<K>: KmerBits,
167 {
168 assert_eq!(kmer_str.len(), self.k, "k-mer string length must equal k={}", self.k);
169 match Kmer::<K>::from_str(kmer_str) {
170 Ok(kmer) => self.query(&kmer),
171 Err(_) => crate::streaming_query::LookupResult::not_found(),
172 }
173 }
174
175 pub fn debug_has_control_map_mphf(&self) -> bool {
177 self.control_map.mphf_ref().is_some()
178 }
179
180 pub fn debug_control_map_lookup(&self, minimizer: u64) -> bool {
182 self.control_map.lookup(minimizer).is_some()
183 }
184
185 pub fn debug_extract_minimizer<const K: usize>(&self, kmer: &Kmer<K>) -> MinimizerInfo
187 where
188 Kmer<K>: KmerBits,
189 {
190 self.extract_minimizer(kmer)
191 }
192
193 pub fn debug_bucket_info<const K: usize>(&self, kmer: &Kmer<K>) -> Option<(u64, u64, u64)>
195 where
196 Kmer<K>: KmerBits,
197 {
198 let minimizer_info = self.extract_minimizer(kmer);
199 let bucket_id = self.control_map.lookup(minimizer_info.value)?;
200 if bucket_id >= self.index.num_buckets() {
201 return None;
202 }
203 let (begin, end) = self.index.locate_bucket(bucket_id);
204 Some((minimizer_info.value, bucket_id as u64, (end - begin) as u64))
205 }
206
207 #[inline]
209 pub fn lookup_with_orientation<const K: usize>(&self, kmer: &Kmer<K>) -> (u64, i8)
210 where
211 Kmer<K>: KmerBits,
212 {
213 if self.canonical {
214 self.lookup_canonical_with_orientation(kmer)
215 } else {
216 let (pos, ori) = self.lookup_regular_with_orientation(kmer);
217 if pos != INVALID_UINT64 {
218 return (pos, ori);
219 }
220 let kmer_rc = kmer.reverse_complement();
221 let (pos_rc, _) = self.lookup_regular_with_orientation(&kmer_rc);
222 if pos_rc != INVALID_UINT64 {
223 return (pos_rc, -1);
224 }
225 (INVALID_UINT64, 1)
226 }
227 }
228
229 #[inline]
230 fn lookup_regular_with_orientation<const K: usize>(&self, kmer: &Kmer<K>) -> (u64, i8)
231 where
232 Kmer<K>: KmerBits,
233 {
234 let minimizer_info = self.extract_minimizer(kmer);
235 let bucket_id = match self.control_map.lookup(minimizer_info.value) {
236 Some(id) => id,
237 None => return (INVALID_UINT64, 1),
238 };
239 if bucket_id >= self.index.num_buckets() {
240 return (INVALID_UINT64, 1);
241 }
242
243 match self.lookup_in_bucket::<K>(kmer, minimizer_info, bucket_id) {
244 Some(hit) => (hit.kmer_offset, 1),
245 None => (INVALID_UINT64, 1),
246 }
247 }
248
249 #[inline]
250 fn lookup_canonical_with_orientation<const K: usize>(&self, kmer: &Kmer<K>) -> (u64, i8)
251 where
252 Kmer<K>: KmerBits,
253 {
254 match self.lookup_canonical_located(kmer) {
255 Some(hit) => (hit.kmer_offset, hit.orientation),
256 None => (INVALID_UINT64, 1),
257 }
258 }
259
260 #[inline]
261 fn lookup_canonical_located<const K: usize>(&self, kmer: &Kmer<K>) -> Option<LocatedHit>
262 where
263 Kmer<K>: KmerBits,
264 {
265 let kmer_rc = kmer.reverse_complement();
266 let mini_fwd = self.extract_minimizer(kmer);
267 let mini_rc = self.extract_minimizer(&kmer_rc);
268
269 if mini_fwd.value < mini_rc.value {
270 self.lookup_canonical_with_minimizer::<K>(kmer, &kmer_rc, mini_fwd)
271 } else if mini_rc.value < mini_fwd.value {
272 self.lookup_canonical_with_minimizer::<K>(kmer, &kmer_rc, mini_rc)
273 } else {
274 if let Some(hit) = self.lookup_canonical_with_minimizer::<K>(kmer, &kmer_rc, mini_fwd) {
275 return Some(hit);
276 }
277 self.lookup_canonical_with_minimizer::<K>(kmer, &kmer_rc, mini_rc)
278 }
279 }
280
281 #[inline]
282 fn lookup_canonical_with_minimizer<const K: usize>(
283 &self,
284 kmer: &Kmer<K>,
285 kmer_rc: &Kmer<K>,
286 minimizer_info: MinimizerInfo,
287 ) -> Option<LocatedHit>
288 where
289 Kmer<K>: KmerBits,
290 {
291 let bucket_id = self.control_map.lookup(minimizer_info.value)?;
292 if bucket_id >= self.index.num_buckets() {
293 return None;
294 }
295
296 self.lookup_in_bucket_canonical::<K>(kmer, kmer_rc, minimizer_info, bucket_id)
297 }
298
299 #[inline]
308 fn lookup_in_bucket<const K: usize>(
309 &self,
310 kmer: &Kmer<K>,
311 minimizer_info: MinimizerInfo,
312 bucket_id: usize,
313 ) -> Option<LocatedHit>
314 where
315 Kmer<K>: KmerBits,
316 {
317 let (begin, end) = self.index.locate_bucket(bucket_id);
318 let n = end - begin;
319 if n == 0 {
320 return None;
321 }
322
323 let log2_size = ceil_log2(n as u64);
324 if log2_size > MIN_L {
325 let within_pos = self.index.skew_index.lookup(&kmer.bits(), log2_size);
327 if within_pos == INVALID_UINT64 || within_pos as usize >= n {
328 return None;
329 }
330 let minimizer_pos = self.index.offsets.index_value(begin + within_pos as usize) as u64;
331 self.lookup_from_minimizer_pos::<K>(kmer, minimizer_pos, minimizer_info)
332 } else {
333 self.lookup_bucket_linear::<K>(kmer, minimizer_info, begin, end)
335 }
336 }
337
338 #[inline]
340 fn lookup_in_bucket_canonical<const K: usize>(
341 &self,
342 kmer: &Kmer<K>,
343 kmer_rc: &Kmer<K>,
344 minimizer_info: MinimizerInfo,
345 bucket_id: usize,
346 ) -> Option<LocatedHit>
347 where
348 Kmer<K>: KmerBits,
349 {
350 let (begin, end) = self.index.locate_bucket(bucket_id);
351 let n = end - begin;
352 if n == 0 {
353 return None;
354 }
355
356 let log2_size = ceil_log2(n as u64);
357 if log2_size > MIN_L {
358 let kmer_canon_value = std::cmp::min(kmer.bits(), kmer_rc.bits());
360 let within_pos = self.index.skew_index.lookup(&kmer_canon_value, log2_size);
361 if within_pos == INVALID_UINT64 || within_pos as usize >= n {
362 return None;
363 }
364 let minimizer_pos = self.index.offsets.index_value(begin + within_pos as usize) as u64;
365 self.lookup_from_minimizer_pos_canonical::<K>(kmer, kmer_rc, minimizer_pos, minimizer_info)
366 } else {
367 self.lookup_bucket_linear_canonical::<K>(kmer, kmer_rc, minimizer_info, begin, end)
369 }
370 }
371
372 #[inline]
374 fn lookup_bucket_linear<const K: usize>(
375 &self,
376 query_kmer: &Kmer<K>,
377 minimizer_info: MinimizerInfo,
378 begin: usize,
379 end: usize,
380 ) -> Option<LocatedHit>
381 where
382 Kmer<K>: KmerBits,
383 {
384 for i in begin..end {
385 let minimizer_pos = self.index.offsets.index_value(i) as u64;
386 if let Some(hit) = self.lookup_from_minimizer_pos::<K>(query_kmer, minimizer_pos, minimizer_info) {
387 return Some(hit);
388 }
389 }
390 None
391 }
392
393 #[inline]
395 fn lookup_bucket_linear_canonical<const K: usize>(
396 &self,
397 query_kmer: &Kmer<K>,
398 kmer_rc: &Kmer<K>,
399 minimizer_info: MinimizerInfo,
400 begin: usize,
401 end: usize,
402 ) -> Option<LocatedHit>
403 where
404 Kmer<K>: KmerBits,
405 {
406 for i in begin..end {
407 let minimizer_pos = self.index.offsets.index_value(i) as u64;
408 if let Some(hit) = self.lookup_from_minimizer_pos_canonical::<K>(
409 query_kmer, kmer_rc, minimizer_pos, minimizer_info,
410 ) {
411 return Some(hit);
412 }
413 }
414 None
415 }
416
417 #[inline]
422 fn lookup_from_minimizer_pos<const K: usize>(
423 &self,
424 query_kmer: &Kmer<K>,
425 minimizer_pos: u64,
426 minimizer_info: MinimizerInfo,
427 ) -> Option<LocatedHit>
428 where
429 Kmer<K>: KmerBits,
430 {
431 let kmer_pos = minimizer_pos.checked_sub(minimizer_info.pos_in_kmer as u64)?;
432
433 let stored_kmer = self.spss.decode_kmer_at::<K>(kmer_pos as usize);
434
435 if stored_kmer.bits() == query_kmer.bits() {
436 let (string_id, string_begin, string_end) = self.spss.locate_with_end(kmer_pos)?;
437 if kmer_pos >= string_begin && kmer_pos < string_end - self.k as u64 + 1 {
438 return Some(LocatedHit { kmer_offset: kmer_pos, orientation: 1, string_id, string_begin, string_end });
439 }
440 }
441
442 None
443 }
444
445 #[inline]
447 fn lookup_from_minimizer_pos_canonical<const K: usize>(
448 &self,
449 query_kmer: &Kmer<K>,
450 kmer_rc: &Kmer<K>,
451 minimizer_pos: u64,
452 minimizer_info: MinimizerInfo,
453 ) -> Option<LocatedHit>
454 where
455 Kmer<K>: KmerBits,
456 {
457 let pos_in_kmer_fwd = minimizer_info.pos_in_kmer as u64;
459 if let Some(hit) = self.try_canonical_lookup_at_pos::<K>(
460 query_kmer, kmer_rc, minimizer_pos, pos_in_kmer_fwd
461 ) {
462 return Some(hit);
463 }
464
465 let pos_in_kmer_rc = K as u64 - self.m() as u64 - minimizer_info.pos_in_kmer as u64;
467 self.try_canonical_lookup_at_pos::<K>(
468 query_kmer, kmer_rc, minimizer_pos, pos_in_kmer_rc
469 )
470 }
471
472 #[inline]
474 fn try_canonical_lookup_at_pos<const K: usize>(
475 &self,
476 query_kmer: &Kmer<K>,
477 kmer_rc: &Kmer<K>,
478 minimizer_pos: u64,
479 pos_in_kmer: u64,
480 ) -> Option<LocatedHit>
481 where
482 Kmer<K>: KmerBits,
483 {
484 let kmer_pos = minimizer_pos.checked_sub(pos_in_kmer)?;
485
486 let stored_kmer = self.spss.decode_kmer_at::<K>(kmer_pos as usize);
487
488 let orientation = if stored_kmer.bits() == query_kmer.bits() {
489 1i8
490 } else if stored_kmer.bits() == kmer_rc.bits() {
491 -1i8
492 } else {
493 return None;
494 };
495
496 let (string_id, string_begin, string_end) = self.spss.locate_with_end(kmer_pos)?;
497 if kmer_pos >= string_begin && kmer_pos < string_end - self.k as u64 + 1 {
498 Some(LocatedHit { kmer_offset: kmer_pos, orientation, string_id, string_begin, string_end })
499 } else {
500 None
501 }
502 }
503
504 pub fn access<const K: usize>(&self, kmer: &Kmer<K>) -> bool
506 where
507 Kmer<K>: KmerBits,
508 {
509 self.lookup(kmer) != INVALID_UINT64
510 }
511
512 pub fn create_streaming_query<const K: usize>(&self) -> crate::streaming_query::StreamingQueryEngine<'_, K>
514 where
515 Kmer<K>: KmerBits,
516 {
517 crate::streaming_query::StreamingQueryEngine::new(self)
518 }
519
520 pub fn k(&self) -> usize {
522 self.k
523 }
524
525 pub fn m(&self) -> usize {
527 self.m
528 }
529
530 pub fn canonical(&self) -> bool {
532 self.canonical
533 }
534
535 pub fn spss(&self) -> &SpectrumPreservingStringSet {
537 &self.spss
538 }
539
540 pub fn control_map_ref(&self) -> &MinimizersControlMap {
542 &self.control_map
543 }
544
545 pub fn index_ref(&self) -> &SparseAndSkewIndex {
547 &self.index
548 }
549
550 #[cfg(test)]
552 pub fn debug_spss(&self) -> &SpectrumPreservingStringSet {
553 &self.spss
554 }
555
556 pub fn num_strings(&self) -> u64 {
558 self.spss.num_strings()
559 }
560
561 pub fn string_length(&self, string_id: u64) -> usize {
563 self.spss.string_length(string_id)
564 }
565
566 #[inline]
568 pub fn locate_string(&self, absolute_pos: u64) -> Option<(u64, u64)> {
569 self.spss.locate(absolute_pos)
570 }
571
572 pub fn access_kmer<const K: usize>(&self, string_id: u64, pos: usize) -> Kmer<K>
574 where
575 Kmer<K>: KmerBits,
576 {
577 self.spss.decode_kmer::<K>(string_id, pos)
578 }
579
580 #[inline]
582 pub fn kmer_at_pos<const K: usize>(&self, absolute_base_pos: usize) -> Kmer<K>
583 where
584 Kmer<K>: KmerBits,
585 {
586 self.spss.decode_kmer_at(absolute_base_pos)
587 }
588
589 pub fn num_minimizers(&self) -> u64 {
591 self.control_map.num_minimizers()
592 }
593
594 pub fn num_bits(&self) -> u64 {
596 self.spss.num_bits()
597 + self.control_map.num_bits()
598 + self.index.num_bits()
599 }
600
601 pub fn print_space_breakdown(&self) {
603 let num_kmers = self.spss.total_bases().saturating_sub(
604 (self.k as u64 - 1) * self.spss.num_strings()
605 ) as f64;
606
607 let strings_bytes = self.spss.strings_bytes() as f64;
608 let offsets_bytes = self.spss.offsets_bytes() as f64;
609 let ef_bytes = self.index.ef_bytes() as f64;
610 let index_offsets_bytes = self.index.offsets_bytes() as f64;
611 let skew_bytes = self.index.skew_index_bytes() as f64;
612 let mphf_bytes = self.control_map.mphf_serialized_bytes() as f64;
613 let skew_mphf_bytes = self.index.skew_mphf_bytes() as f64;
614
615 let total = strings_bytes + offsets_bytes + ef_bytes
616 + index_offsets_bytes + skew_bytes
617 + mphf_bytes + skew_mphf_bytes;
618
619 let perc = |x: f64| -> f64 { x * 100.0 / total };
620
621 info!("total index size: {} [B] -- {:.5} [MB] ({:.5} [bits/kmer])",
622 total as u64, total / 1_000_000.0, total * 8.0 / num_kmers);
623 info!("SPACE BREAKDOWN:");
624 info!(" mphf: {:.6} [bits/kmer] ({:.5} [bits/key]) -- {:.4}%",
625 mphf_bytes * 8.0 / num_kmers,
626 mphf_bytes * 8.0 / self.num_minimizers() as f64,
627 perc(mphf_bytes));
628 info!(" strings_offsets: {:.6} [bits/kmer] -- {:.5}%",
629 offsets_bytes * 8.0 / num_kmers, perc(offsets_bytes));
630 info!(" num_super_kmers_before_bucket (EF): {:.5} [bits/kmer] -- {:.4}%",
631 ef_bytes * 8.0 / num_kmers, perc(ef_bytes));
632 info!(" offsets: {:.6} [bits/kmer] -- {:.5}%",
633 index_offsets_bytes * 8.0 / num_kmers, perc(index_offsets_bytes));
634 info!(" strings: {:.5} [bits/kmer] -- {:.4}%",
635 strings_bytes * 8.0 / num_kmers, perc(strings_bytes));
636 info!(" skew_index: {:.6} [bits/kmer] -- {:.5}%",
637 skew_bytes * 8.0 / num_kmers, perc(skew_bytes));
638 info!(" skew_mphfs: {:.6} [bits/kmer] -- {:.5}%",
639 skew_mphf_bytes * 8.0 / num_kmers, perc(skew_mphf_bytes));
640 info!(" --------------");
641 info!(" total: {:.5} [bits/kmer]", total * 8.0 / num_kmers);
642 }
643
644 pub fn save<P: AsRef<std::path::Path>>(&self, path: P) -> crate::serialization::SerializationResult<()> {
646 use crate::serialization::*;
647 use std::io::BufWriter;
648
649 let base_path = path.as_ref();
650
651 let index_path = index_file_path(base_path);
653 let index_file = std::fs::File::create(&index_path)?;
654 let mut index_writer = BufWriter::new(index_file);
655
656 let header = DictionarySerializationHeader::new(
658 self.k,
659 self.m,
660 self.canonical,
661 (self.index.skew_index.num_partitions() + 1) as u32,
662 );
663 header.write(&mut index_writer)?;
664
665 self.spss.serialize_to(&mut index_writer)?;
667
668 self.control_map.serialize_without_mphf(&mut index_writer)?;
670
671 self.index.serialize_without_mphf(&mut index_writer)?;
673
674 let mphf_path = mphf_container_path(base_path);
676 let mphf_file = std::fs::File::create(&mphf_path)?;
677 let mut mphf_writer = BufWriter::new(mphf_file);
678
679 let mut mphfs: Vec<Option<&crate::partitioned_mphf::PartitionedMphf>> = Vec::with_capacity(
681 self.index.skew_index.num_partitions() + 1,
682 );
683 mphfs.push(self.control_map.mphf_ref());
684 mphfs.extend(self.index.skew_index.mphfs_ref().iter().map(|o| o.as_ref()));
685 write_mphf_container(&mut mphf_writer, &mphfs)?;
686
687 Ok(())
688 }
689
690 pub fn load<P: AsRef<std::path::Path>>(path: P) -> crate::serialization::SerializationResult<Self> {
692 use crate::serialization::*;
693 use std::io::BufReader;
694
695 let base_path = path.as_ref();
696
697 let index_path = index_file_path(base_path);
699 let index_file = std::fs::File::open(&index_path)?;
700 let mut index_reader = BufReader::new(index_file);
701
702 let header = DictionarySerializationHeader::read(&mut index_reader)?;
704
705 let spss = SpectrumPreservingStringSet::deserialize_from(&mut index_reader)?;
707
708 let mut control_map = MinimizersControlMap::deserialize_without_mphf(&mut index_reader)?;
710
711 let mut index = SparseAndSkewIndex::deserialize_without_mphf(&mut index_reader)?;
713
714 let mphf_path = mphf_container_path(base_path);
716 let mphf_file = std::fs::File::open(&mphf_path)?;
717 let mut mphf_reader = BufReader::new(mphf_file);
718 let mut mphfs = read_mphf_container(&mut mphf_reader)?;
719
720 let control_mphf = if !mphfs.is_empty() { mphfs.remove(0) } else { None };
722 control_map.set_mphf(control_mphf);
723 index.skew_index.set_mphfs(mphfs);
724
725 Ok(Dictionary {
726 spss,
727 control_map,
728 index,
729 k: header.k,
730 m: header.m,
731 canonical: header.canonical,
732 hasher: crate::hasher::DeterministicHasher::new(1),
733 })
734 }
735
736 #[inline]
738 pub(crate) fn extract_minimizer<const K: usize>(&self, kmer: &Kmer<K>) -> MinimizerInfo
739 where
740 Kmer<K>: KmerBits,
741 {
742 self.extract_minimizer_inline::<K>(*kmer)
743 }
744
745 #[inline]
747 fn extract_minimizer_inline<const K: usize>(&self, kmer: Kmer<K>) -> MinimizerInfo
748 where
749 Kmer<K>: KmerBits,
750 {
751 let num_windows = self.k - self.m + 1;
752 let mask = (1u64 << (self.m * 2)) - 1;
753 let bits = <Kmer<K> as KmerBits>::to_u128(kmer.bits());
754
755 let mut min_hash = u64::MAX;
756 let mut min_value = 0u64;
757 let mut min_pos = 0usize;
758
759 for i in 0..num_windows {
760 let mmer_value = ((bits >> (i * 2)) as u64) & mask;
761 let hash = self.hasher.hash_u64(mmer_value);
762 if hash < min_hash {
763 min_hash = hash;
764 min_value = mmer_value;
765 min_pos = i;
766 }
767 }
768
769 MinimizerInfo::new(min_value, min_pos as u64, min_pos)
770 }
771
772 pub(crate) fn lookup_regular_streaming<const K: usize>(
776 &self,
777 kmer: &Kmer<K>,
778 mini_info: MinimizerInfo,
779 ) -> crate::streaming_query::LookupResult
780 where
781 Kmer<K>: KmerBits,
782 {
783 let bucket_id = match self.control_map.lookup(mini_info.value) {
784 Some(id) => id,
785 None => {
786 let mut res = crate::streaming_query::LookupResult::not_found();
787 res.minimizer_found = false;
788 return res;
789 }
790 };
791
792 if bucket_id >= self.index.num_buckets() {
793 let mut res = crate::streaming_query::LookupResult::not_found();
794 res.minimizer_found = false;
795 return res;
796 }
797
798 match self.lookup_in_bucket::<K>(kmer, mini_info, bucket_id) {
799 Some(hit) => hit.into_lookup_result(self.k),
800 None => crate::streaming_query::LookupResult::not_found(),
801 }
802 }
803
804 pub(crate) fn lookup_canonical_streaming<const K: usize>(
806 &self,
807 kmer: &Kmer<K>,
808 kmer_rc: &Kmer<K>,
809 mini_info: MinimizerInfo,
810 ) -> crate::streaming_query::LookupResult
811 where
812 Kmer<K>: KmerBits,
813 {
814 let bucket_id = match self.control_map.lookup(mini_info.value) {
815 Some(id) => id,
816 None => {
817 let mut res = crate::streaming_query::LookupResult::not_found();
818 res.minimizer_found = false;
819 return res;
820 }
821 };
822
823 if bucket_id >= self.index.num_buckets() {
824 let mut res = crate::streaming_query::LookupResult::not_found();
825 res.minimizer_found = false;
826 return res;
827 }
828
829 match self.lookup_in_bucket_canonical::<K>(kmer, kmer_rc, mini_info, bucket_id) {
830 Some(hit) => hit.into_lookup_result(self.k),
831 None => crate::streaming_query::LookupResult::not_found(),
832 }
833 }
834
835}
836
837#[cfg(test)]
838mod tests {
839 use super::*;
840
841 use crate::builder::{BuildConfiguration, DictionaryBuilder};
842
843 #[test]
844 fn test_dictionary_creation() {
845 let spss = SpectrumPreservingStringSet::new(31, 13);
846 let control_map = MinimizersControlMap::from_parts(
847 Vec::new(),
848 Vec::new(),
849 0,
850 );
851 let index = SparseAndSkewIndex::new();
852
853 let dict = Dictionary::new(spss, control_map, index, 31, 13, false);
854
855 assert_eq!(dict.k(), 31);
856 assert_eq!(dict.m(), 13);
857 assert!(!dict.canonical());
858 }
859
860 #[test]
861 fn test_dictionary_build_and_lookup() {
862 let mut config = BuildConfiguration::new(31, 21).unwrap();
863 config.verbose = true;
864 let builder = DictionaryBuilder::new(config).unwrap();
865
866 let sequences = vec![
867 "ACGTACGTACGTACGTACGTACGTACGTACGTACGT".to_string(),
868 ];
869
870 eprintln!("\n=== Building dictionary ===");
871 let dict = builder.build_from_sequences(sequences.clone()).unwrap();
872
873 assert_eq!(dict.k(), 31);
874 assert_eq!(dict.m(), 21);
875
876 eprintln!("\n=== Dictionary info ===");
877 eprintln!("Num minimizers: {}", dict.num_minimizers());
878 eprintln!("Num buckets: {}", dict.index.num_buckets());
879 eprintln!("Num offsets: {}", dict.index.num_offsets());
880 eprintln!("SPSS num strings: {}", dict.spss.num_strings());
881 eprintln!("SPSS total bases: {}", dict.spss.total_bases());
882
883 let test_kmer_str = &sequences[0][0..31];
885 eprintln!("\n=== Testing lookup for k-mer: {} ===", test_kmer_str);
886 let kmer = crate::kmer::Kmer::<31>::from_str(test_kmer_str).unwrap();
887
888 let mut mini_iter = crate::minimizer::MinimizerIterator::with_seed(31, 21, 1);
890 let mini_info = mini_iter.next(kmer);
891 eprintln!("K-mer minimizer: value={}, pos_in_kmer={}", mini_info.value, mini_info.pos_in_kmer);
892
893 if let Some(bucket_id) = dict.control_map.lookup(mini_info.value) {
895 eprintln!("Control map lookup: bucket_id={}", bucket_id);
896
897 if bucket_id < dict.index.num_buckets() {
898 let (begin, end) = dict.index.locate_bucket(bucket_id);
899 eprintln!("Bucket range: [{}, {}), size={}", begin, end, end - begin);
900 }
901 } else {
902 eprintln!("Minimizer NOT found in control map!");
903 }
904
905 let result = dict.lookup(&kmer);
907
908 eprintln!("\nLookup result: {} (INVALID={})", result, crate::constants::INVALID_UINT64);
909 }
910}