Skip to main content

sshash_lib/
kmer_dict_trait.rs

//! Traits abstracting the k-mer dictionary interface.
//!
//! These exist so downstream callers (notably piscem-rs) can be generic over
//! the dictionary implementation. The existing [`Dictionary`] is the reference
//! implementation; a future small-reference variant will provide a second
//! implementation with a different internal representation but the same
//! observable behavior.
8
9use crate::dictionary::Dictionary;
10use crate::kmer::{Kmer, KmerBits};
11use crate::streaming_query::{LookupResult, StreamingQueryEngine};
12
13/// The streaming-query half of a k-mer dictionary.
14///
15/// Consecutive k-mer lookups share state via `reset` + `lookup`; concrete
16/// implementations are free to exploit that (e.g. minimizer reuse, unitig
17/// extension, rolling k-mer updates).
18pub trait KmerStreamingQuery {
19    /// Hint to hot-loop callers: does this engine benefit from receiving
20    /// pre-computed canonical k-mer bits? Byte-oriented engines (e.g. sshash)
21    /// set this to `false` so callers skip the canonicalization/bit conversion
22    /// work; bit-oriented engines (e.g. tiny-dict) set it to `true`.
23    const PREFERS_BITS: bool;
24
25    /// Drop any streaming state so the next lookup parses its k-mer from scratch.
26    fn reset(&mut self);
27
28    /// Look up a single k-mer (ASCII bytes, length = `k`).
29    fn lookup(&mut self, kmer_bytes: &[u8]) -> LookupResult;
30
31    /// Look up by pre-parsed canonical k-mer bits plus the original FW bytes.
32    ///
33    /// `canonical_bits` is the u64 representation of the canonical k-mer;
34    /// `fw_is_canonical` indicates whether the read's forward orientation equals
35    /// the canonical; `fw_bytes` is the forward ASCII k-mer (length = `k`).
36    ///
37    /// Implementations may use whichever representation is cheaper — the tiny
38    /// dictionary uses `canonical_bits` directly; the sshash streaming engine
39    /// uses `fw_bytes` so it can feed its existing byte-level lookup path
40    /// without an ASCII→bits→ASCII round-trip.
41    fn lookup_bits(
42        &mut self,
43        canonical_bits: u64,
44        fw_is_canonical: bool,
45        fw_bytes: &[u8],
46    ) -> LookupResult;
47
48    /// Number of lookups that required a full dictionary search (slow path).
49    fn num_searches(&self) -> u64;
50
51    /// Number of lookups resolved by extending along the current unitig (fast path).
52    fn num_extensions(&self) -> u64;
53
54    /// Offset the current anchor along its SPSS string by `read_offset` read-positions,
55    /// *without* performing a full lookup. Returns `true` when the anchor was
56    /// successfully shifted (the caller must have independently verified the
57    /// sequence agreement, e.g. via a direct SPSS compare), and subsequent
58    /// consecutive lookups may use the fast path. Returns `false` if the engine
59    /// has no anchor concept, no current anchor, or the shifted position would
60    /// leave the string.
61    ///
62    /// Default: no-op (`false`). Byte-oriented engines (sshash) keep this default
63    /// — their streaming state is a rolling minimizer that does not benefit from
64    /// the skip hint. Bit-oriented engines (tiny-dict) override it.
65    #[inline]
66    fn skip_anchor_along_string(&mut self, _read_offset: i32) -> bool {
67        false
68    }
69}
70
71/// A k-mer dictionary: maps canonical k-mers to their position in an SPSS.
72///
73/// This trait captures the call surface that the piscem-rs mapping pipeline
74/// uses today. It is deliberately narrow — everything outside the mapping
75/// hot path remains on the concrete types.
76pub trait KmerDictionary {
77    /// Streaming-query engine type. Parameterized by the same `K` the caller
78    /// dispatches on; the lifetime borrows from `&self`.
79    type Query<'a, const K: usize>: KmerStreamingQuery
80    where
81        Self: 'a,
82        Kmer<K>: KmerBits;
83
84    /// The k-mer length this dictionary was built for.
85    fn k(&self) -> usize;
86
87    /// The minimizer length this dictionary was built for.
88    fn m(&self) -> usize;
89
90    /// Number of SPSS strings (unitigs) in the dictionary.
91    fn num_strings(&self) -> u64;
92
93    /// Whether the dictionary was built in canonical mode.
94    fn canonical(&self) -> bool;
95
96    /// Decode the k-mer at the given absolute base position within the SPSS.
97    ///
98    /// Hot path in piscem's `HitSearcher`.
99    fn kmer_at_pos<const K: usize>(&self, absolute_base_pos: usize) -> Kmer<K>
100    where
101        Kmer<K>: KmerBits;
102
103    /// Construct a new streaming-query engine borrowing from this dictionary.
104    fn create_streaming_query<const K: usize>(&self) -> Self::Query<'_, K>
105    where
106        Kmer<K>: KmerBits;
107}
108
// ---------------------------------------------------------------------------
// Trait impls for the existing sshash types.
// ---------------------------------------------------------------------------
112
113impl<'a, const K: usize> KmerStreamingQuery for StreamingQueryEngine<'a, K>
114where
115    Kmer<K>: KmerBits,
116{
117    const PREFERS_BITS: bool = false;
118
119    #[inline]
120    fn reset(&mut self) {
121        StreamingQueryEngine::reset(self)
122    }
123
124    #[inline]
125    fn lookup(&mut self, kmer_bytes: &[u8]) -> LookupResult {
126        StreamingQueryEngine::lookup(self, kmer_bytes)
127    }
128
129    #[inline]
130    fn lookup_bits(
131        &mut self,
132        _canonical_bits: u64,
133        _fw_is_canonical: bool,
134        fw_bytes: &[u8],
135    ) -> LookupResult {
136        StreamingQueryEngine::lookup(self, fw_bytes)
137    }
138
139    #[inline]
140    fn num_searches(&self) -> u64 {
141        StreamingQueryEngine::num_searches(self)
142    }
143
144    #[inline]
145    fn num_extensions(&self) -> u64 {
146        StreamingQueryEngine::num_extensions(self)
147    }
148}
149
150impl KmerDictionary for Dictionary {
151    type Query<'a, const K: usize>
152        = StreamingQueryEngine<'a, K>
153    where
154        Self: 'a,
155        Kmer<K>: KmerBits;
156
157    #[inline]
158    fn k(&self) -> usize {
159        Dictionary::k(self)
160    }
161
162    #[inline]
163    fn m(&self) -> usize {
164        Dictionary::m(self)
165    }
166
167    #[inline]
168    fn num_strings(&self) -> u64 {
169        Dictionary::num_strings(self)
170    }
171
172    #[inline]
173    fn canonical(&self) -> bool {
174        Dictionary::canonical(self)
175    }
176
177    #[inline]
178    fn kmer_at_pos<const K: usize>(&self, absolute_base_pos: usize) -> Kmer<K>
179    where
180        Kmer<K>: KmerBits,
181    {
182        Dictionary::kmer_at_pos::<K>(self, absolute_base_pos)
183    }
184
185    #[inline]
186    fn create_streaming_query<const K: usize>(&self) -> Self::Query<'_, K>
187    where
188        Kmer<K>: KmerBits,
189    {
190        Dictionary::create_streaming_query::<K>(self)
191    }
192}