// dryice/key.rs

1//! Record-key types and traits.
2
3use crate::error::DryIceError;
4use simd_minimizers::packed_seq::{PackedSeqVec, SeqVec};
5
6/// A fixed-width accelerator key associated with each record in a block.
7///
8/// A `RecordKey` defines the full contract needed for `dryice` to store,
9/// parse, and expose accelerator-key sections:
10///
11/// - a fixed encoded width shared by all keys of this type
12/// - a stable type tag written into the block header
13/// - encoding into bytes for writing
14/// - decoding from bytes for reading
15///
16/// Keys are intended for comparison-friendly accelerator sections such as
17/// sort keys, hashes, or other workflow-specific derived record keys.
18pub trait RecordKey: Ord + Sized {
19    /// Width in bytes of the encoded key.
20    const WIDTH: u16;
21
22    /// Stable type tag written into block headers.
23    const TYPE_TAG: [u8; 16];
24
25    /// Encode this key into the provided output buffer.
26    ///
27    /// # Panics
28    ///
29    /// Panics if `out.len()` does not equal [`Self::WIDTH`].
30    fn encode_into(&self, out: &mut [u8]);
31
32    /// Decode a key from bytes.
33    ///
34    /// # Errors
35    ///
36    /// Returns an error if the bytes do not represent a valid key.
37    fn decode_from(bytes: &[u8]) -> Result<Self, DryIceError>;
38}
39
40/// A kmer-derived fixed-width record key.
41///
42/// `KmerKey` is intentionally a thin marker layer over [`RecordKey`]. It
43/// carries the compile-time kmer length while leaving storage concerns to the
44/// underlying record-key contract.
45///
46/// The built-in kmer key families in `dryice` all use packed canonical
47/// representations by default. A concrete key type therefore tells you both:
48///
49/// - how the value was selected from the sequence (prefix or minimizer)
50/// - how wide the packed representation is (currently 64 bits)
51///
52/// Kmer selection constructors return `Result<Option<Self>, DryIceError>`:
53///
54/// - `Ok(Some(key))` means a key was successfully derived
55/// - `Ok(None)` means the sequence simply cannot yield a key for this family
56///   (for example because it is too short or contains ambiguous bases)
57/// - `Err(...)` is reserved for unexpected failures
58pub trait KmerKey: RecordKey {
59    /// Kmer length used by this key family.
60    const K: u8;
61}
62
/// Zero-sized marker selecting the unkeyed flavor of readers and writers.
///
/// It carries no data; it only exists to be named as a type parameter.
#[derive(Debug, Clone, Copy, Default, PartialEq, Eq)]
pub struct NoRecordKey;
66
/// Built-in key type holding exactly eight raw bytes.
///
/// The derived ordering compares the wrapped arrays element by element, so
/// keys sort in lexicographic byte order.
#[derive(Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord, Hash)]
pub struct Bytes8Key(pub [u8; 8]);

impl From<[u8; 8]> for Bytes8Key {
    /// Wrap a raw 8-byte array as a key without altering it.
    fn from(bytes: [u8; 8]) -> Self {
        Bytes8Key(bytes)
    }
}
76
77impl RecordKey for Bytes8Key {
78    const WIDTH: u16 = 8;
79    const TYPE_TAG: [u8; 16] = *b"dryi:bytes8:key!";
80
81    fn encode_into(&self, out: &mut [u8]) {
82        debug_assert_eq!(out.len(), usize::from(Self::WIDTH));
83        out.copy_from_slice(&self.0);
84    }
85
86    fn decode_from(bytes: &[u8]) -> Result<Self, DryIceError> {
87        let arr: [u8; 8] = bytes
88            .try_into()
89            .map_err(|_| DryIceError::InvalidRecordKeyEncoding {
90                message: "invalid bytes8 key length",
91            })?;
92        Ok(Self(arr))
93    }
94}
95
/// Built-in key type holding exactly sixteen raw bytes.
///
/// The derived ordering compares the wrapped arrays element by element, so
/// keys sort in lexicographic byte order.
#[derive(Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord, Hash)]
pub struct Bytes16Key(pub [u8; 16]);

impl From<[u8; 16]> for Bytes16Key {
    /// Wrap a raw 16-byte array as a key without altering it.
    fn from(bytes: [u8; 16]) -> Self {
        Bytes16Key(bytes)
    }
}
105
106impl RecordKey for Bytes16Key {
107    const WIDTH: u16 = 16;
108    const TYPE_TAG: [u8; 16] = *b"dryi:bytes16:key";
109
110    fn encode_into(&self, out: &mut [u8]) {
111        debug_assert_eq!(out.len(), usize::from(Self::WIDTH));
112        out.copy_from_slice(&self.0);
113    }
114
115    fn decode_from(bytes: &[u8]) -> Result<Self, DryIceError> {
116        let arr: [u8; 16] =
117            bytes
118                .try_into()
119                .map_err(|_| DryIceError::InvalidRecordKeyEncoding {
120                    message: "invalid bytes16 key length",
121                })?;
122        Ok(Self(arr))
123    }
124}
125
/// Packed canonical kmer key taken from the start of a sequence, stored in 64 bits.
///
/// `PrefixKmer64<K>` holds the canonical packed form of the first `K` DNA
/// bases of a sequence. "Canonical" means the smaller of the forward kmer and
/// its reverse complement, so a sequence prefix and the reverse complement of
/// that prefix produce the same key.
///
/// It is the simplest built-in kmer-as-key selector and needs no external
/// minimizer backend.
#[derive(Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord, Hash)]
pub struct PrefixKmer64<const K: u8>(pub u64);
137
138impl<const K: u8> KmerKey for PrefixKmer64<K> {
139    const K: u8 = K;
140}
141
142impl<const K: u8> RecordKey for PrefixKmer64<K> {
143    const WIDTH: u16 = 8;
144    const TYPE_TAG: [u8; 16] = *b"dryi:kmer:pref64";
145
146    fn encode_into(&self, out: &mut [u8]) {
147        debug_assert_eq!(out.len(), usize::from(Self::WIDTH));
148        out.copy_from_slice(&self.0.to_le_bytes());
149    }
150
151    fn decode_from(bytes: &[u8]) -> Result<Self, DryIceError> {
152        let arr: [u8; 8] = bytes
153            .try_into()
154            .map_err(|_| DryIceError::InvalidRecordKeyEncoding {
155                message: "invalid prefix kmer64 key length",
156            })?;
157        Ok(Self(u64::from_le_bytes(arr)))
158    }
159}
160
161impl<const K: u8> PrefixKmer64<K> {
162    const ASSERT_VALID: () = {
163        assert!(K > 0, "PrefixKmer64 requires K > 0");
164        assert!(K <= 32, "PrefixKmer64 requires K <= 32");
165    };
166
167    /// Derive a prefix-selected canonical kmer key from a sequence.
168    ///
169    /// # Errors
170    ///
171    /// Returns an error only for unexpected internal failures. Expected no-key
172    /// outcomes such as short or ambiguous sequences return `Ok(None)`.
173    ///
174    /// The constructor rejects ambiguous bases by returning `Ok(None)` because a
175    /// packed canonical prefix key is only defined on unambiguous `A/C/G/T`
176    /// sequences.
177    pub fn try_from_sequence(seq: &[u8]) -> Result<Option<Self>, DryIceError> {
178        let () = Self::ASSERT_VALID;
179
180        if seq.len() < usize::from(K) {
181            return Ok(None);
182        }
183
184        let prefix = &seq[..usize::from(K)];
185        let mut forward = 0u64;
186        let mut revcomp = 0u64;
187
188        for &base in prefix {
189            let bits = match base {
190                b'A' | b'a' => 0u64,
191                b'C' | b'c' => 1u64,
192                b'G' | b'g' => 2u64,
193                b'T' | b't' => 3u64,
194                _ => return Ok(None),
195            };
196            forward = (forward << 2) | bits;
197        }
198
199        for &base in prefix.iter().rev() {
200            let bits = match base {
201                b'A' | b'a' => 0u64,
202                b'C' | b'c' => 1u64,
203                b'G' | b'g' => 2u64,
204                b'T' | b't' => 3u64,
205                _ => return Ok(None),
206            };
207            revcomp = (revcomp << 2) | (3 - bits);
208        }
209
210        Ok(Some(Self(forward.min(revcomp))))
211    }
212}
213
/// Packed canonical kmer key chosen by minimizer selection, stored in 64 bits.
///
/// `Minimizer64<K, W>` holds a single canonical `K`-mer picked from a longer
/// sequence by minimizer selection over windows of `W` consecutive `K`-mers.
/// Each window therefore spans `K + W - 1` bases.
///
/// A sequence can produce several minimizer candidates. `dryice` collapses
/// them into one record key by keeping the smallest canonical packed value,
/// which keeps the result deterministic and stable under reverse-complement
/// transforms.
#[derive(Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord, Hash)]
pub struct Minimizer64<const K: u8, const W: u8>(pub u64);
227
228impl<const K: u8, const W: u8> KmerKey for Minimizer64<K, W> {
229    const K: u8 = K;
230}
231
232impl<const K: u8, const W: u8> RecordKey for Minimizer64<K, W> {
233    const WIDTH: u16 = 8;
234    const TYPE_TAG: [u8; 16] = *b"dryi:kmer:mini64";
235
236    fn encode_into(&self, out: &mut [u8]) {
237        debug_assert_eq!(out.len(), usize::from(Self::WIDTH));
238        out.copy_from_slice(&self.0.to_le_bytes());
239    }
240
241    fn decode_from(bytes: &[u8]) -> Result<Self, DryIceError> {
242        let arr: [u8; 8] = bytes
243            .try_into()
244            .map_err(|_| DryIceError::InvalidRecordKeyEncoding {
245                message: "invalid minimizer64 key length",
246            })?;
247        Ok(Self(u64::from_le_bytes(arr)))
248    }
249}
250
251impl<const K: u8, const W: u8> Minimizer64<K, W> {
252    const ASSERT_VALID: () = {
253        assert!(K > 0, "Minimizer64 requires K > 0");
254        assert!(K <= 32, "Minimizer64 requires K <= 32");
255        assert!(W > 0, "Minimizer64 requires W > 0");
256    };
257
258    /// Derive a minimizer-selected canonical kmer key from a sequence.
259    ///
260    /// # Errors
261    ///
262    /// Returns an error only for unexpected internal failures. Expected no-key
263    /// outcomes such as short or ambiguous sequences return `Ok(None)`.
264    ///
265    /// This constructor currently uses `simd-minimizers` internally for
266    /// canonical minimizer discovery, but `dryice` owns the public reduction
267    /// semantics: one key per record, chosen as the minimum selected canonical
268    /// packed value.
269    pub fn try_from_sequence(seq: &[u8]) -> Result<Option<Self>, DryIceError> {
270        let () = Self::ASSERT_VALID;
271
272        let l = usize::from(K) + usize::from(W) - 1;
273        if seq.len() < l {
274            return Ok(None);
275        }
276        if !seq
277            .iter()
278            .all(|base| matches!(base, b'A' | b'a' | b'C' | b'c' | b'G' | b'g' | b'T' | b't'))
279        {
280            return Ok(None);
281        }
282
283        let packed = PackedSeqVec::from_ascii(seq);
284        let mut positions = Vec::new();
285        let values: Vec<u64> =
286            simd_minimizers::canonical_minimizers(usize::from(K), usize::from(W))
287                .run(packed.as_slice(), &mut positions)
288                .values_u64()
289                .collect();
290
291        Ok(values.into_iter().min().map(Self))
292    }
293}