Skip to main content

sshash_lib/
dictionary.rs

1//! Dictionary - the main SSHash data structure
2//!
3//! Provides efficient storage and querying of k-mer sets using:
4//! - Spectrum-preserving string encoding
5//! - Minimizer-based indexing
6//! - Sparse and skew index for O(1) lookup
7
8use crate::{
9    kmer::{Kmer, KmerBits},
10    minimizer::MinimizerInfo,
11    minimizers_control_map::MinimizersControlMap,
12    sparse_and_skew_index::SparseAndSkewIndex,
13    spectrum_preserving_string_set::SpectrumPreservingStringSet,
14    constants::{ceil_log2, INVALID_UINT64, MIN_L},
15};
16use value_traits::slices::SliceByValue;
17use tracing::info;
18
19/// The main dictionary structure
20///
21/// Note: Serialization support is limited due to underlying types.
22/// Use custom save/load methods instead.
23pub struct Dictionary {
24    /// Encoded string storage
25    spss: SpectrumPreservingStringSet,
26
27    /// Maps minimizers to control information
28    control_map: MinimizersControlMap,
29
30    /// Sparse and skew index for lookups
31    index: SparseAndSkewIndex,
32
33    /// K-mer size
34    k: usize,
35
36    /// Minimizer size
37    m: usize,
38
39    /// Whether to use canonical mode (k-mer or RC, whichever is smaller)
40    canonical: bool,
41
42    /// Cached hasher for minimizer extraction (avoids re-creating RandomState per query)
43    hasher: crate::hasher::DeterministicHasher,
44}
45
46/// Successful k-mer lookup with cached locate info to avoid
47/// a redundant second Elias-Fano successor query.
48#[derive(Clone, Copy)]
49struct LocatedHit {
50    kmer_offset: u64,
51    orientation: i8,
52    string_id: u64,
53    string_begin: u64,
54    string_end: u64,
55}
56
57impl LocatedHit {
58    #[inline]
59    fn into_lookup_result(self, k: usize) -> crate::streaming_query::LookupResult {
60        let kmer_id_in_string = self.kmer_offset - self.string_begin;
61        let kmer_id = self.kmer_offset - self.string_id * (k as u64 - 1);
62        crate::streaming_query::LookupResult {
63            kmer_id,
64            kmer_id_in_string,
65            kmer_offset: self.kmer_offset,
66            kmer_orientation: self.orientation,
67            string_id: self.string_id,
68            string_begin: self.string_begin,
69            string_end: self.string_end,
70            minimizer_found: true,
71        }
72    }
73}
74
75impl Dictionary {
76    /// Create a new dictionary from components
77    pub fn new(
78        spss: SpectrumPreservingStringSet,
79        control_map: MinimizersControlMap,
80        index: SparseAndSkewIndex,
81        k: usize,
82        m: usize,
83        canonical: bool,
84    ) -> Self {
85        Self {
86            spss,
87            control_map,
88            index,
89            k,
90            m,
91            canonical,
92            hasher: crate::hasher::DeterministicHasher::new(1),
93        }
94    }
95
96    /// Look up a k-mer's position in the dictionary.
97    #[inline]
98    pub fn lookup<const K: usize>(&self, kmer: &Kmer<K>) -> u64
99    where
100        Kmer<K>: KmerBits,
101    {
102        let (pos, _) = self.lookup_with_orientation(kmer);
103        pos
104    }
105
106    /// Query a k-mer and return a full [`LookupResult`](crate::streaming_query::LookupResult).
107    #[inline]
108    pub fn query<const K: usize>(&self, kmer: &Kmer<K>) -> crate::streaming_query::LookupResult
109    where
110        Kmer<K>: KmerBits,
111    {
112        if self.canonical {
113            return self.query_canonical(kmer);
114        }
115
116        // Non-canonical mode: try forward k-mer first, then RC
117        if let Some(res) = self.query_regular(kmer, 1) {
118            return res;
119        }
120        let kmer_rc = kmer.reverse_complement();
121        if let Some(res) = self.query_regular(&kmer_rc, -1) {
122            return res;
123        }
124        crate::streaming_query::LookupResult::not_found()
125    }
126
127    /// Internal: regular (non-canonical) lookup returning full LookupResult.
128    #[inline]
129    fn query_regular<const K: usize>(
130        &self,
131        kmer: &Kmer<K>,
132        orientation: i8,
133    ) -> Option<crate::streaming_query::LookupResult>
134    where
135        Kmer<K>: KmerBits,
136    {
137        let minimizer_info = self.extract_minimizer(kmer);
138        let bucket_id = self.control_map.lookup(minimizer_info.value)?;
139        if bucket_id >= self.index.num_buckets() {
140            return None;
141        }
142
143        let mut hit = self.lookup_in_bucket::<K>(kmer, minimizer_info, bucket_id)?;
144        hit.orientation = orientation;
145        Some(hit.into_lookup_result(self.k))
146    }
147
148    /// Internal: canonical lookup returning full LookupResult.
149    #[inline]
150    fn query_canonical<const K: usize>(
151        &self,
152        kmer: &Kmer<K>,
153    ) -> crate::streaming_query::LookupResult
154    where
155        Kmer<K>: KmerBits,
156    {
157        match self.lookup_canonical_located(kmer) {
158            Some(hit) => hit.into_lookup_result(self.k),
159            None => crate::streaming_query::LookupResult::not_found(),
160        }
161    }
162
163    /// Query a k-mer given as a DNA string.
164    pub fn query_from_str<const K: usize>(&self, kmer_str: &str) -> crate::streaming_query::LookupResult
165    where
166        Kmer<K>: KmerBits,
167    {
168        assert_eq!(kmer_str.len(), self.k, "k-mer string length must equal k={}", self.k);
169        match Kmer::<K>::from_str(kmer_str) {
170            Ok(kmer) => self.query(&kmer),
171            Err(_) => crate::streaming_query::LookupResult::not_found(),
172        }
173    }
174
175    /// Debug: check if control map MPHF is present
176    pub fn debug_has_control_map_mphf(&self) -> bool {
177        self.control_map.mphf_ref().is_some()
178    }
179
180    /// Debug: check if a minimizer exists in the control map
181    pub fn debug_control_map_lookup(&self, minimizer: u64) -> bool {
182        self.control_map.lookup(minimizer).is_some()
183    }
184
185    /// Debug: extract minimizer info for a k-mer
186    pub fn debug_extract_minimizer<const K: usize>(&self, kmer: &Kmer<K>) -> MinimizerInfo
187    where
188        Kmer<K>: KmerBits,
189    {
190        self.extract_minimizer(kmer)
191    }
192
193    /// Debug: get bucket and locate info for a k-mer
194    pub fn debug_bucket_info<const K: usize>(&self, kmer: &Kmer<K>) -> Option<(u64, u64, u64)>
195    where
196        Kmer<K>: KmerBits,
197    {
198        let minimizer_info = self.extract_minimizer(kmer);
199        let bucket_id = self.control_map.lookup(minimizer_info.value)?;
200        if bucket_id >= self.index.num_buckets() {
201            return None;
202        }
203        let (begin, end) = self.index.locate_bucket(bucket_id);
204        Some((minimizer_info.value, bucket_id as u64, (end - begin) as u64))
205    }
206
207    /// Look up a k-mer and return position + orientation
208    #[inline]
209    pub fn lookup_with_orientation<const K: usize>(&self, kmer: &Kmer<K>) -> (u64, i8)
210    where
211        Kmer<K>: KmerBits,
212    {
213        if self.canonical {
214            self.lookup_canonical_with_orientation(kmer)
215        } else {
216            let (pos, ori) = self.lookup_regular_with_orientation(kmer);
217            if pos != INVALID_UINT64 {
218                return (pos, ori);
219            }
220            let kmer_rc = kmer.reverse_complement();
221            let (pos_rc, _) = self.lookup_regular_with_orientation(&kmer_rc);
222            if pos_rc != INVALID_UINT64 {
223                return (pos_rc, -1);
224            }
225            (INVALID_UINT64, 1)
226        }
227    }
228
229    #[inline]
230    fn lookup_regular_with_orientation<const K: usize>(&self, kmer: &Kmer<K>) -> (u64, i8)
231    where
232        Kmer<K>: KmerBits,
233    {
234        let minimizer_info = self.extract_minimizer(kmer);
235        let bucket_id = match self.control_map.lookup(minimizer_info.value) {
236            Some(id) => id,
237            None => return (INVALID_UINT64, 1),
238        };
239        if bucket_id >= self.index.num_buckets() {
240            return (INVALID_UINT64, 1);
241        }
242
243        match self.lookup_in_bucket::<K>(kmer, minimizer_info, bucket_id) {
244            Some(hit) => (hit.kmer_offset, 1),
245            None => (INVALID_UINT64, 1),
246        }
247    }
248
249    #[inline]
250    fn lookup_canonical_with_orientation<const K: usize>(&self, kmer: &Kmer<K>) -> (u64, i8)
251    where
252        Kmer<K>: KmerBits,
253    {
254        match self.lookup_canonical_located(kmer) {
255            Some(hit) => (hit.kmer_offset, hit.orientation),
256            None => (INVALID_UINT64, 1),
257        }
258    }
259
260    #[inline]
261    fn lookup_canonical_located<const K: usize>(&self, kmer: &Kmer<K>) -> Option<LocatedHit>
262    where
263        Kmer<K>: KmerBits,
264    {
265        let kmer_rc = kmer.reverse_complement();
266        let mini_fwd = self.extract_minimizer(kmer);
267        let mini_rc = self.extract_minimizer(&kmer_rc);
268
269        if mini_fwd.value < mini_rc.value {
270            self.lookup_canonical_with_minimizer::<K>(kmer, &kmer_rc, mini_fwd)
271        } else if mini_rc.value < mini_fwd.value {
272            self.lookup_canonical_with_minimizer::<K>(kmer, &kmer_rc, mini_rc)
273        } else {
274            if let Some(hit) = self.lookup_canonical_with_minimizer::<K>(kmer, &kmer_rc, mini_fwd) {
275                return Some(hit);
276            }
277            self.lookup_canonical_with_minimizer::<K>(kmer, &kmer_rc, mini_rc)
278        }
279    }
280
281    #[inline]
282    fn lookup_canonical_with_minimizer<const K: usize>(
283        &self,
284        kmer: &Kmer<K>,
285        kmer_rc: &Kmer<K>,
286        minimizer_info: MinimizerInfo,
287    ) -> Option<LocatedHit>
288    where
289        Kmer<K>: KmerBits,
290    {
291        let bucket_id = self.control_map.lookup(minimizer_info.value)?;
292        if bucket_id >= self.index.num_buckets() {
293            return None;
294        }
295
296        self.lookup_in_bucket_canonical::<K>(kmer, kmer_rc, minimizer_info, bucket_id)
297    }
298
299    // -----------------------------------------------------------------------
300    // Core bucket lookup helpers (replaces old 3-way LSB dispatch)
301    // -----------------------------------------------------------------------
302
303    /// Look up a k-mer in a bucket using the locate_bucket + size-based dispatch.
304    ///
305    /// For small buckets (ceil_log2(size) <= MIN_L): linear scan of offsets.
306    /// For large buckets: skew index MPHF lookup.
307    #[inline]
308    fn lookup_in_bucket<const K: usize>(
309        &self,
310        kmer: &Kmer<K>,
311        minimizer_info: MinimizerInfo,
312        bucket_id: usize,
313    ) -> Option<LocatedHit>
314    where
315        Kmer<K>: KmerBits,
316    {
317        let (begin, end) = self.index.locate_bucket(bucket_id);
318        let n = end - begin;
319        if n == 0 {
320            return None;
321        }
322
323        let log2_size = ceil_log2(n as u64);
324        if log2_size > MIN_L {
325            // Heavy bucket: use skew index
326            let within_pos = self.index.skew_index.lookup(&kmer.bits(), log2_size);
327            if within_pos == INVALID_UINT64 || within_pos as usize >= n {
328                return None;
329            }
330            let minimizer_pos = self.index.offsets.index_value(begin + within_pos as usize) as u64;
331            self.lookup_from_minimizer_pos::<K>(kmer, minimizer_pos, minimizer_info)
332        } else {
333            // Singleton or light bucket: linear scan
334            self.lookup_bucket_linear::<K>(kmer, minimizer_info, begin, end)
335        }
336    }
337
338    /// Canonical lookup in a bucket using locate_bucket + size-based dispatch.
339    #[inline]
340    fn lookup_in_bucket_canonical<const K: usize>(
341        &self,
342        kmer: &Kmer<K>,
343        kmer_rc: &Kmer<K>,
344        minimizer_info: MinimizerInfo,
345        bucket_id: usize,
346    ) -> Option<LocatedHit>
347    where
348        Kmer<K>: KmerBits,
349    {
350        let (begin, end) = self.index.locate_bucket(bucket_id);
351        let n = end - begin;
352        if n == 0 {
353            return None;
354        }
355
356        let log2_size = ceil_log2(n as u64);
357        if log2_size > MIN_L {
358            // Heavy bucket: use skew index with canonical k-mer
359            let kmer_canon_value = std::cmp::min(kmer.bits(), kmer_rc.bits());
360            let within_pos = self.index.skew_index.lookup(&kmer_canon_value, log2_size);
361            if within_pos == INVALID_UINT64 || within_pos as usize >= n {
362                return None;
363            }
364            let minimizer_pos = self.index.offsets.index_value(begin + within_pos as usize) as u64;
365            self.lookup_from_minimizer_pos_canonical::<K>(kmer, kmer_rc, minimizer_pos, minimizer_info)
366        } else {
367            // Singleton or light bucket: linear scan
368            self.lookup_bucket_linear_canonical::<K>(kmer, kmer_rc, minimizer_info, begin, end)
369        }
370    }
371
372    /// Linear scan through offsets[begin..end] for regular lookup.
373    #[inline]
374    fn lookup_bucket_linear<const K: usize>(
375        &self,
376        query_kmer: &Kmer<K>,
377        minimizer_info: MinimizerInfo,
378        begin: usize,
379        end: usize,
380    ) -> Option<LocatedHit>
381    where
382        Kmer<K>: KmerBits,
383    {
384        for i in begin..end {
385            let minimizer_pos = self.index.offsets.index_value(i) as u64;
386            if let Some(hit) = self.lookup_from_minimizer_pos::<K>(query_kmer, minimizer_pos, minimizer_info) {
387                return Some(hit);
388            }
389        }
390        None
391    }
392
393    /// Linear scan for canonical lookup (tries both fwd and RC pos_in_kmer).
394    #[inline]
395    fn lookup_bucket_linear_canonical<const K: usize>(
396        &self,
397        query_kmer: &Kmer<K>,
398        kmer_rc: &Kmer<K>,
399        minimizer_info: MinimizerInfo,
400        begin: usize,
401        end: usize,
402    ) -> Option<LocatedHit>
403    where
404        Kmer<K>: KmerBits,
405    {
406        for i in begin..end {
407            let minimizer_pos = self.index.offsets.index_value(i) as u64;
408            if let Some(hit) = self.lookup_from_minimizer_pos_canonical::<K>(
409                query_kmer, kmer_rc, minimizer_pos, minimizer_info,
410            ) {
411                return Some(hit);
412            }
413        }
414        None
415    }
416
417    // -----------------------------------------------------------------------
418    // Low-level position verification helpers (unchanged)
419    // -----------------------------------------------------------------------
420
421    #[inline]
422    fn lookup_from_minimizer_pos<const K: usize>(
423        &self,
424        query_kmer: &Kmer<K>,
425        minimizer_pos: u64,
426        minimizer_info: MinimizerInfo,
427    ) -> Option<LocatedHit>
428    where
429        Kmer<K>: KmerBits,
430    {
431        let kmer_pos = minimizer_pos.checked_sub(minimizer_info.pos_in_kmer as u64)?;
432
433        let stored_kmer = self.spss.decode_kmer_at::<K>(kmer_pos as usize);
434
435        if stored_kmer.bits() == query_kmer.bits() {
436            let (string_id, string_begin, string_end) = self.spss.locate_with_end(kmer_pos)?;
437            if kmer_pos >= string_begin && kmer_pos < string_end - self.k as u64 + 1 {
438                return Some(LocatedHit { kmer_offset: kmer_pos, orientation: 1, string_id, string_begin, string_end });
439            }
440        }
441
442        None
443    }
444
445    /// Canonical lookup from minimizer position
446    #[inline]
447    fn lookup_from_minimizer_pos_canonical<const K: usize>(
448        &self,
449        query_kmer: &Kmer<K>,
450        kmer_rc: &Kmer<K>,
451        minimizer_pos: u64,
452        minimizer_info: MinimizerInfo,
453    ) -> Option<LocatedHit>
454    where
455        Kmer<K>: KmerBits,
456    {
457        // Try forward position first
458        let pos_in_kmer_fwd = minimizer_info.pos_in_kmer as u64;
459        if let Some(hit) = self.try_canonical_lookup_at_pos::<K>(
460            query_kmer, kmer_rc, minimizer_pos, pos_in_kmer_fwd
461        ) {
462            return Some(hit);
463        }
464
465        // Try RC position (k - m - pos_in_kmer)
466        let pos_in_kmer_rc = K as u64 - self.m() as u64 - minimizer_info.pos_in_kmer as u64;
467        self.try_canonical_lookup_at_pos::<K>(
468            query_kmer, kmer_rc, minimizer_pos, pos_in_kmer_rc
469        )
470    }
471
472    /// Try canonical lookup at a specific position with a given pos_in_kmer
473    #[inline]
474    fn try_canonical_lookup_at_pos<const K: usize>(
475        &self,
476        query_kmer: &Kmer<K>,
477        kmer_rc: &Kmer<K>,
478        minimizer_pos: u64,
479        pos_in_kmer: u64,
480    ) -> Option<LocatedHit>
481    where
482        Kmer<K>: KmerBits,
483    {
484        let kmer_pos = minimizer_pos.checked_sub(pos_in_kmer)?;
485
486        let stored_kmer = self.spss.decode_kmer_at::<K>(kmer_pos as usize);
487
488        let orientation = if stored_kmer.bits() == query_kmer.bits() {
489            1i8
490        } else if stored_kmer.bits() == kmer_rc.bits() {
491            -1i8
492        } else {
493            return None;
494        };
495
496        let (string_id, string_begin, string_end) = self.spss.locate_with_end(kmer_pos)?;
497        if kmer_pos >= string_begin && kmer_pos < string_end - self.k as u64 + 1 {
498            Some(LocatedHit { kmer_offset: kmer_pos, orientation, string_id, string_begin, string_end })
499        } else {
500            None
501        }
502    }
503
504    /// Check if a k-mer exists in the dictionary
505    pub fn access<const K: usize>(&self, kmer: &Kmer<K>) -> bool
506    where
507        Kmer<K>: KmerBits,
508    {
509        self.lookup(kmer) != INVALID_UINT64
510    }
511
512    /// Create a streaming query engine for this dictionary
513    pub fn create_streaming_query<const K: usize>(&self) -> crate::streaming_query::StreamingQueryEngine<'_, K>
514    where
515        Kmer<K>: KmerBits,
516    {
517        crate::streaming_query::StreamingQueryEngine::new(self)
518    }
519
520    /// Get the k-mer size
521    pub fn k(&self) -> usize {
522        self.k
523    }
524
525    /// Get the minimizer size
526    pub fn m(&self) -> usize {
527        self.m
528    }
529
530    /// Check if canonical mode is enabled
531    pub fn canonical(&self) -> bool {
532        self.canonical
533    }
534
535    /// Get a reference to the underlying SPSS
536    pub fn spss(&self) -> &SpectrumPreservingStringSet {
537        &self.spss
538    }
539
540    /// Get a reference to the control map
541    pub fn control_map_ref(&self) -> &MinimizersControlMap {
542        &self.control_map
543    }
544
545    /// Get a reference to the sparse and skew index
546    pub fn index_ref(&self) -> &SparseAndSkewIndex {
547        &self.index
548    }
549
550    /// Debug: get reference to SPSS for testing
551    #[cfg(test)]
552    pub fn debug_spss(&self) -> &SpectrumPreservingStringSet {
553        &self.spss
554    }
555
556    /// Get the number of strings in the SPSS
557    pub fn num_strings(&self) -> u64 {
558        self.spss.num_strings()
559    }
560
561    /// Get the length of a specific string in bases
562    pub fn string_length(&self, string_id: u64) -> usize {
563        self.spss.string_length(string_id)
564    }
565
566    /// Locate which string contains a given absolute position.
567    #[inline]
568    pub fn locate_string(&self, absolute_pos: u64) -> Option<(u64, u64)> {
569        self.spss.locate(absolute_pos)
570    }
571
572    /// Access a k-mer at a given position within a string
573    pub fn access_kmer<const K: usize>(&self, string_id: u64, pos: usize) -> Kmer<K>
574    where
575        Kmer<K>: KmerBits,
576    {
577        self.spss.decode_kmer::<K>(string_id, pos)
578    }
579
580    /// Decode the k-mer at an absolute base position in the SPSS.
581    #[inline]
582    pub fn kmer_at_pos<const K: usize>(&self, absolute_base_pos: usize) -> Kmer<K>
583    where
584        Kmer<K>: KmerBits,
585    {
586        self.spss.decode_kmer_at(absolute_base_pos)
587    }
588
589    /// Get the number of unique minimizers
590    pub fn num_minimizers(&self) -> u64 {
591        self.control_map.num_minimizers()
592    }
593
594    /// Get total memory usage in bits
595    pub fn num_bits(&self) -> u64 {
596        self.spss.num_bits()
597            + self.control_map.num_bits()
598            + self.index.num_bits()
599    }
600
601    /// Print a detailed space breakdown of the index
602    pub fn print_space_breakdown(&self) {
603        let num_kmers = self.spss.total_bases().saturating_sub(
604            (self.k as u64 - 1) * self.spss.num_strings()
605        ) as f64;
606
607        let strings_bytes = self.spss.strings_bytes() as f64;
608        let offsets_bytes = self.spss.offsets_bytes() as f64;
609        let ef_bytes = self.index.ef_bytes() as f64;
610        let index_offsets_bytes = self.index.offsets_bytes() as f64;
611        let skew_bytes = self.index.skew_index_bytes() as f64;
612        let mphf_bytes = self.control_map.mphf_serialized_bytes() as f64;
613        let skew_mphf_bytes = self.index.skew_mphf_bytes() as f64;
614
615        let total = strings_bytes + offsets_bytes + ef_bytes
616            + index_offsets_bytes + skew_bytes
617            + mphf_bytes + skew_mphf_bytes;
618
619        let perc = |x: f64| -> f64 { x * 100.0 / total };
620
621        info!("total index size: {} [B] -- {:.5} [MB] ({:.5} [bits/kmer])",
622            total as u64, total / 1_000_000.0, total * 8.0 / num_kmers);
623        info!("SPACE BREAKDOWN:");
624        info!("  mphf: {:.6} [bits/kmer] ({:.5} [bits/key]) -- {:.4}%",
625            mphf_bytes * 8.0 / num_kmers,
626            mphf_bytes * 8.0 / self.num_minimizers() as f64,
627            perc(mphf_bytes));
628        info!("  strings_offsets: {:.6} [bits/kmer] -- {:.5}%",
629            offsets_bytes * 8.0 / num_kmers, perc(offsets_bytes));
630        info!("  num_super_kmers_before_bucket (EF): {:.5} [bits/kmer] -- {:.4}%",
631            ef_bytes * 8.0 / num_kmers, perc(ef_bytes));
632        info!("  offsets: {:.6} [bits/kmer] -- {:.5}%",
633            index_offsets_bytes * 8.0 / num_kmers, perc(index_offsets_bytes));
634        info!("  strings: {:.5} [bits/kmer] -- {:.4}%",
635            strings_bytes * 8.0 / num_kmers, perc(strings_bytes));
636        info!("  skew_index: {:.6} [bits/kmer] -- {:.5}%",
637            skew_bytes * 8.0 / num_kmers, perc(skew_bytes));
638        info!("  skew_mphfs: {:.6} [bits/kmer] -- {:.5}%",
639            skew_mphf_bytes * 8.0 / num_kmers, perc(skew_mphf_bytes));
640        info!("  --------------");
641        info!("  total: {:.5} [bits/kmer]", total * 8.0 / num_kmers);
642    }
643
644    /// Serialize the dictionary to files
645    pub fn save<P: AsRef<std::path::Path>>(&self, path: P) -> crate::serialization::SerializationResult<()> {
646        use crate::serialization::*;
647        use std::io::BufWriter;
648
649        let base_path = path.as_ref();
650
651        // Create main index file
652        let index_path = index_file_path(base_path);
653        let index_file = std::fs::File::create(&index_path)?;
654        let mut index_writer = BufWriter::new(index_file);
655
656        // Write header
657        let header = DictionarySerializationHeader::new(
658            self.k,
659            self.m,
660            self.canonical,
661            (self.index.skew_index.num_partitions() + 1) as u32,
662        );
663        header.write(&mut index_writer)?;
664
665        // Write SPSS
666        self.spss.serialize_to(&mut index_writer)?;
667
668        // Write control map without MPHF
669        self.control_map.serialize_without_mphf(&mut index_writer)?;
670
671        // Write index metadata excluding MPHF
672        self.index.serialize_without_mphf(&mut index_writer)?;
673
674        // Create MPHF container file
675        let mphf_path = mphf_container_path(base_path);
676        let mphf_file = std::fs::File::create(&mphf_path)?;
677        let mut mphf_writer = BufWriter::new(mphf_file);
678
679        // Write MPHF container (control_map first, then skew_index partitions)
680        let mut mphfs: Vec<Option<&crate::partitioned_mphf::PartitionedMphf>> = Vec::with_capacity(
681            self.index.skew_index.num_partitions() + 1,
682        );
683        mphfs.push(self.control_map.mphf_ref());
684        mphfs.extend(self.index.skew_index.mphfs_ref().iter().map(|o| o.as_ref()));
685        write_mphf_container(&mut mphf_writer, &mphfs)?;
686
687        Ok(())
688    }
689
690    /// Deserialize a dictionary from files
691    pub fn load<P: AsRef<std::path::Path>>(path: P) -> crate::serialization::SerializationResult<Self> {
692        use crate::serialization::*;
693        use std::io::BufReader;
694
695        let base_path = path.as_ref();
696
697        // Load main index file
698        let index_path = index_file_path(base_path);
699        let index_file = std::fs::File::open(&index_path)?;
700        let mut index_reader = BufReader::new(index_file);
701
702        // Read header
703        let header = DictionarySerializationHeader::read(&mut index_reader)?;
704
705        // Read SPSS
706        let spss = SpectrumPreservingStringSet::deserialize_from(&mut index_reader)?;
707
708        // Read control map without MPHF
709        let mut control_map = MinimizersControlMap::deserialize_without_mphf(&mut index_reader)?;
710
711        // Read index metadata without MPHF
712        let mut index = SparseAndSkewIndex::deserialize_without_mphf(&mut index_reader)?;
713
714        // Load MPHF container
715        let mphf_path = mphf_container_path(base_path);
716        let mphf_file = std::fs::File::open(&mphf_path)?;
717        let mut mphf_reader = BufReader::new(mphf_file);
718        let mut mphfs = read_mphf_container(&mut mphf_reader)?;
719
720        // First MPHF is the control_map's; remaining are skew_index partitions
721        let control_mphf = if !mphfs.is_empty() { mphfs.remove(0) } else { None };
722        control_map.set_mphf(control_mphf);
723        index.skew_index.set_mphfs(mphfs);
724
725        Ok(Dictionary {
726            spss,
727            control_map,
728            index,
729            k: header.k,
730            m: header.m,
731            canonical: header.canonical,
732            hasher: crate::hasher::DeterministicHasher::new(1),
733        })
734    }
735
736    /// Extract the minimizer from a k-mer using the cached hasher.
737    #[inline]
738    pub(crate) fn extract_minimizer<const K: usize>(&self, kmer: &Kmer<K>) -> MinimizerInfo
739    where
740        Kmer<K>: KmerBits,
741    {
742        self.extract_minimizer_inline::<K>(*kmer)
743    }
744
745    /// Inline minimizer extraction without creating a MinimizerIterator.
746    #[inline]
747    fn extract_minimizer_inline<const K: usize>(&self, kmer: Kmer<K>) -> MinimizerInfo
748    where
749        Kmer<K>: KmerBits,
750    {
751        let num_windows = self.k - self.m + 1;
752        let mask = (1u64 << (self.m * 2)) - 1;
753        let bits = <Kmer<K> as KmerBits>::to_u128(kmer.bits());
754
755        let mut min_hash = u64::MAX;
756        let mut min_value = 0u64;
757        let mut min_pos = 0usize;
758
759        for i in 0..num_windows {
760            let mmer_value = ((bits >> (i * 2)) as u64) & mask;
761            let hash = self.hasher.hash_u64(mmer_value);
762            if hash < min_hash {
763                min_hash = hash;
764                min_value = mmer_value;
765                min_pos = i;
766            }
767        }
768
769        MinimizerInfo::new(min_value, min_pos as u64, min_pos)
770    }
771
772    // --- Streaming query helpers ---
773
774    /// Regular (non-canonical) lookup with pre-computed minimizer, returning LookupResult.
775    pub(crate) fn lookup_regular_streaming<const K: usize>(
776        &self,
777        kmer: &Kmer<K>,
778        mini_info: MinimizerInfo,
779    ) -> crate::streaming_query::LookupResult
780    where
781        Kmer<K>: KmerBits,
782    {
783        let bucket_id = match self.control_map.lookup(mini_info.value) {
784            Some(id) => id,
785            None => {
786                let mut res = crate::streaming_query::LookupResult::not_found();
787                res.minimizer_found = false;
788                return res;
789            }
790        };
791
792        if bucket_id >= self.index.num_buckets() {
793            let mut res = crate::streaming_query::LookupResult::not_found();
794            res.minimizer_found = false;
795            return res;
796        }
797
798        match self.lookup_in_bucket::<K>(kmer, mini_info, bucket_id) {
799            Some(hit) => hit.into_lookup_result(self.k),
800            None => crate::streaming_query::LookupResult::not_found(),
801        }
802    }
803
804    /// Canonical lookup with pre-computed minimizer, returning LookupResult.
805    pub(crate) fn lookup_canonical_streaming<const K: usize>(
806        &self,
807        kmer: &Kmer<K>,
808        kmer_rc: &Kmer<K>,
809        mini_info: MinimizerInfo,
810    ) -> crate::streaming_query::LookupResult
811    where
812        Kmer<K>: KmerBits,
813    {
814        let bucket_id = match self.control_map.lookup(mini_info.value) {
815            Some(id) => id,
816            None => {
817                let mut res = crate::streaming_query::LookupResult::not_found();
818                res.minimizer_found = false;
819                return res;
820            }
821        };
822
823        if bucket_id >= self.index.num_buckets() {
824            let mut res = crate::streaming_query::LookupResult::not_found();
825            res.minimizer_found = false;
826            return res;
827        }
828
829        match self.lookup_in_bucket_canonical::<K>(kmer, kmer_rc, mini_info, bucket_id) {
830            Some(hit) => hit.into_lookup_result(self.k),
831            None => crate::streaming_query::LookupResult::not_found(),
832        }
833    }
834
835}
836
837#[cfg(test)]
838mod tests {
839    use super::*;
840
841    use crate::builder::{BuildConfiguration, DictionaryBuilder};
842
843    #[test]
844    fn test_dictionary_creation() {
845        let spss = SpectrumPreservingStringSet::new(31, 13);
846        let control_map = MinimizersControlMap::from_parts(
847            Vec::new(),
848            Vec::new(),
849            0,
850        );
851        let index = SparseAndSkewIndex::new();
852
853        let dict = Dictionary::new(spss, control_map, index, 31, 13, false);
854
855        assert_eq!(dict.k(), 31);
856        assert_eq!(dict.m(), 13);
857        assert!(!dict.canonical());
858    }
859
860    #[test]
861    fn test_dictionary_build_and_lookup() {
862        let mut config = BuildConfiguration::new(31, 21).unwrap();
863        config.verbose = true;
864        let builder = DictionaryBuilder::new(config).unwrap();
865
866        let sequences = vec![
867            "ACGTACGTACGTACGTACGTACGTACGTACGTACGT".to_string(),
868        ];
869
870        eprintln!("\n=== Building dictionary ===");
871        let dict = builder.build_from_sequences(sequences.clone()).unwrap();
872
873        assert_eq!(dict.k(), 31);
874        assert_eq!(dict.m(), 21);
875
876        eprintln!("\n=== Dictionary info ===");
877        eprintln!("Num minimizers: {}", dict.num_minimizers());
878        eprintln!("Num buckets: {}", dict.index.num_buckets());
879        eprintln!("Num offsets: {}", dict.index.num_offsets());
880        eprintln!("SPSS num strings: {}", dict.spss.num_strings());
881        eprintln!("SPSS total bases: {}", dict.spss.total_bases());
882
883        // Create a k-mer from the first sequence
884        let test_kmer_str = &sequences[0][0..31];
885        eprintln!("\n=== Testing lookup for k-mer: {} ===", test_kmer_str);
886        let kmer = crate::kmer::Kmer::<31>::from_str(test_kmer_str).unwrap();
887
888        // Extract minimizer
889        let mut mini_iter = crate::minimizer::MinimizerIterator::with_seed(31, 21, 1);
890        let mini_info = mini_iter.next(kmer);
891        eprintln!("K-mer minimizer: value={}, pos_in_kmer={}", mini_info.value, mini_info.pos_in_kmer);
892
893        // Look up in control map
894        if let Some(bucket_id) = dict.control_map.lookup(mini_info.value) {
895            eprintln!("Control map lookup: bucket_id={}", bucket_id);
896
897            if bucket_id < dict.index.num_buckets() {
898                let (begin, end) = dict.index.locate_bucket(bucket_id);
899                eprintln!("Bucket range: [{}, {}), size={}", begin, end, end - begin);
900            }
901        } else {
902            eprintln!("Minimizer NOT found in control map!");
903        }
904
905        // Lookup the k-mer
906        let result = dict.lookup(&kmer);
907
908        eprintln!("\nLookup result: {} (INVALID={})", result, crate::constants::INVALID_UINT64);
909    }
910}