sshash_lib/kmer_dict_trait.rs
1//! Traits abstracting the k-mer dictionary interface.
2//!
3//! These exist so downstream callers (notably piscem-rs) can be generic over
4//! the dictionary implementation. The existing [`Dictionary`] is the reference
5//! implementation; a future small-reference variant will provide a second
6//! implementation with a different internal representation but the same
7//! observable behavior.
8
9use crate::dictionary::Dictionary;
10use crate::kmer::{Kmer, KmerBits};
11use crate::streaming_query::{LookupResult, StreamingQueryEngine};
12
13/// The streaming-query half of a k-mer dictionary.
14///
15/// Consecutive k-mer lookups share state via `reset` + `lookup`; concrete
16/// implementations are free to exploit that (e.g. minimizer reuse, unitig
17/// extension, rolling k-mer updates).
18pub trait KmerStreamingQuery {
19 /// Hint to hot-loop callers: does this engine benefit from receiving
20 /// pre-computed canonical k-mer bits? Byte-oriented engines (e.g. sshash)
21 /// set this to `false` so callers skip the canonicalization/bit conversion
22 /// work; bit-oriented engines (e.g. tiny-dict) set it to `true`.
23 const PREFERS_BITS: bool;
24
25 /// Drop any streaming state so the next lookup parses its k-mer from scratch.
26 fn reset(&mut self);
27
28 /// Look up a single k-mer (ASCII bytes, length = `k`).
29 fn lookup(&mut self, kmer_bytes: &[u8]) -> LookupResult;
30
31 /// Look up by pre-parsed canonical k-mer bits plus the original FW bytes.
32 ///
33 /// `canonical_bits` is the u64 representation of the canonical k-mer;
34 /// `fw_is_canonical` indicates whether the read's forward orientation equals
35 /// the canonical; `fw_bytes` is the forward ASCII k-mer (length = `k`).
36 ///
37 /// Implementations may use whichever representation is cheaper — the tiny
38 /// dictionary uses `canonical_bits` directly; the sshash streaming engine
39 /// uses `fw_bytes` so it can feed its existing byte-level lookup path
40 /// without an ASCII→bits→ASCII round-trip.
41 fn lookup_bits(
42 &mut self,
43 canonical_bits: u64,
44 fw_is_canonical: bool,
45 fw_bytes: &[u8],
46 ) -> LookupResult;
47
48 /// Number of lookups that required a full dictionary search (slow path).
49 fn num_searches(&self) -> u64;
50
51 /// Number of lookups resolved by extending along the current unitig (fast path).
52 fn num_extensions(&self) -> u64;
53
54 /// Offset the current anchor along its SPSS string by `read_offset` read-positions,
55 /// *without* performing a full lookup. Returns `true` when the anchor was
56 /// successfully shifted (the caller must have independently verified the
57 /// sequence agreement, e.g. via a direct SPSS compare), and subsequent
58 /// consecutive lookups may use the fast path. Returns `false` if the engine
59 /// has no anchor concept, no current anchor, or the shifted position would
60 /// leave the string.
61 ///
62 /// Default: no-op (`false`). Byte-oriented engines (sshash) keep this default
63 /// — their streaming state is a rolling minimizer that does not benefit from
64 /// the skip hint. Bit-oriented engines (tiny-dict) override it.
65 #[inline]
66 fn skip_anchor_along_string(&mut self, _read_offset: i32) -> bool {
67 false
68 }
69}
70
71/// A k-mer dictionary: maps canonical k-mers to their position in an SPSS.
72///
73/// This trait captures the call surface that the piscem-rs mapping pipeline
74/// uses today. It is deliberately narrow — everything outside the mapping
75/// hot path remains on the concrete types.
76pub trait KmerDictionary {
77 /// Streaming-query engine type. Parameterized by the same `K` the caller
78 /// dispatches on; the lifetime borrows from `&self`.
79 type Query<'a, const K: usize>: KmerStreamingQuery
80 where
81 Self: 'a,
82 Kmer<K>: KmerBits;
83
84 /// The k-mer length this dictionary was built for.
85 fn k(&self) -> usize;
86
87 /// The minimizer length this dictionary was built for.
88 fn m(&self) -> usize;
89
90 /// Number of SPSS strings (unitigs) in the dictionary.
91 fn num_strings(&self) -> u64;
92
93 /// Whether the dictionary was built in canonical mode.
94 fn canonical(&self) -> bool;
95
96 /// Decode the k-mer at the given absolute base position within the SPSS.
97 ///
98 /// Hot path in piscem's `HitSearcher`.
99 fn kmer_at_pos<const K: usize>(&self, absolute_base_pos: usize) -> Kmer<K>
100 where
101 Kmer<K>: KmerBits;
102
103 /// Construct a new streaming-query engine borrowing from this dictionary.
104 fn create_streaming_query<const K: usize>(&self) -> Self::Query<'_, K>
105 where
106 Kmer<K>: KmerBits;
107}
108
// ---------------------------------------------------------------------------
// Trait impls for the existing sshash types (thin delegating wrappers).
// ---------------------------------------------------------------------------
112
113impl<'a, const K: usize> KmerStreamingQuery for StreamingQueryEngine<'a, K>
114where
115 Kmer<K>: KmerBits,
116{
117 const PREFERS_BITS: bool = false;
118
119 #[inline]
120 fn reset(&mut self) {
121 StreamingQueryEngine::reset(self)
122 }
123
124 #[inline]
125 fn lookup(&mut self, kmer_bytes: &[u8]) -> LookupResult {
126 StreamingQueryEngine::lookup(self, kmer_bytes)
127 }
128
129 #[inline]
130 fn lookup_bits(
131 &mut self,
132 _canonical_bits: u64,
133 _fw_is_canonical: bool,
134 fw_bytes: &[u8],
135 ) -> LookupResult {
136 StreamingQueryEngine::lookup(self, fw_bytes)
137 }
138
139 #[inline]
140 fn num_searches(&self) -> u64 {
141 StreamingQueryEngine::num_searches(self)
142 }
143
144 #[inline]
145 fn num_extensions(&self) -> u64 {
146 StreamingQueryEngine::num_extensions(self)
147 }
148}
149
150impl KmerDictionary for Dictionary {
151 type Query<'a, const K: usize>
152 = StreamingQueryEngine<'a, K>
153 where
154 Self: 'a,
155 Kmer<K>: KmerBits;
156
157 #[inline]
158 fn k(&self) -> usize {
159 Dictionary::k(self)
160 }
161
162 #[inline]
163 fn m(&self) -> usize {
164 Dictionary::m(self)
165 }
166
167 #[inline]
168 fn num_strings(&self) -> u64 {
169 Dictionary::num_strings(self)
170 }
171
172 #[inline]
173 fn canonical(&self) -> bool {
174 Dictionary::canonical(self)
175 }
176
177 #[inline]
178 fn kmer_at_pos<const K: usize>(&self, absolute_base_pos: usize) -> Kmer<K>
179 where
180 Kmer<K>: KmerBits,
181 {
182 Dictionary::kmer_at_pos::<K>(self, absolute_base_pos)
183 }
184
185 #[inline]
186 fn create_streaming_query<const K: usize>(&self) -> Self::Query<'_, K>
187 where
188 Kmer<K>: KmerBits,
189 {
190 Dictionary::create_streaming_query::<K>(self)
191 }
192}