Skip to main content

scry_index/
key.rs

1//! The [`Key`] trait for types usable as learned index keys.
2//!
3//! Keys must be ordered, clonable, and convertible to `f64` for linear model
4//! prediction. This conversion must be monotonic: if `a < b` then
5//! `a.to_model_input() < b.to_model_input()`.
6
7/// A key type usable in a learned index.
8///
9/// # Contract
10///
11/// - `to_model_input` must be a **monotonic** function: if `a < b` then
12///   `a.to_model_input() <= b.to_model_input()`. Note: the mapping is
13///   non-strict: distinct keys may produce the same `f64` due to precision
14///   loss (e.g., `u64` keys above 2^53). The index handles this correctly
15///   via the `to_exact_ordinal` fallback.
16/// - The returned `f64` must be finite (not NaN or infinity).
17/// - The mapping should preserve relative distances where possible, so that
18///   linear models can fit the key distribution effectively.
19/// - `to_exact_ordinal` must be a **strictly monotonic, injective** function:
20///   if `a < b` then `a.to_exact_ordinal() < b.to_exact_ordinal()`.
21pub trait Key: Clone + Ord + Send + Sync + std::fmt::Debug + 'static {
22    /// Convert this key to a `f64` value for model prediction.
23    ///
24    /// The conversion must be monotonic and return a finite value. It may
25    /// be non-injective for large integer keys (precision loss above 2^53).
26    fn to_model_input(&self) -> f64;
27
28    /// Convert this key to a lossless `i128` for exact comparison.
29    ///
30    /// This must be strictly monotonic and injective: if `a < b` then
31    /// `a.to_exact_ordinal() < b.to_exact_ordinal()`. Used as a fallback
32    /// for conflict resolution when `to_model_input` cannot distinguish keys.
33    fn to_exact_ordinal(&self) -> i128;
34}
35
36macro_rules! impl_key_unsigned {
37    ($($t:ty),*) => {
38        $(
39            impl Key for $t {
40                #[inline]
41                fn to_model_input(&self) -> f64 {
42                    *self as f64
43                }
44
45                #[inline]
46                fn to_exact_ordinal(&self) -> i128 {
47                    *self as i128
48                }
49            }
50        )*
51    };
52}
53
54macro_rules! impl_key_signed {
55    ($($t:ty),*) => {
56        $(
57            impl Key for $t {
58                #[inline]
59                fn to_model_input(&self) -> f64 {
60                    *self as f64
61                }
62
63                #[inline]
64                fn to_exact_ordinal(&self) -> i128 {
65                    *self as i128
66                }
67            }
68        )*
69    };
70}
71
72impl_key_unsigned!(u8, u16, u32, u64);
73impl_key_signed!(i8, i16, i32, i64);
74
75// u128/i128 lose precision in to_model_input beyond 2^53, but
76// to_exact_ordinal is fully injective for all values.
77impl Key for u128 {
78    #[inline]
79    fn to_model_input(&self) -> f64 {
80        *self as f64
81    }
82
83    #[inline]
84    #[allow(clippy::cast_possible_wrap)]
85    fn to_exact_ordinal(&self) -> i128 {
86        // Order-preserving bijection: flip the sign bit so that
87        // 0u128 -> i128::MIN and u128::MAX -> i128::MAX.
88        (*self as i128) ^ i128::MIN
89    }
90}
91
92impl Key for i128 {
93    #[inline]
94    fn to_model_input(&self) -> f64 {
95        *self as f64
96    }
97
98    #[inline]
99    fn to_exact_ordinal(&self) -> i128 {
100        *self
101    }
102}
103
104/// Interpret the first 8 bytes of a byte slice as a big-endian `u64` -> `f64`.
105///
106/// Shorter slices are zero-padded on the right. Provides a monotonic mapping
107/// with respect to lexicographic byte order.
108///
109/// This helper is useful for implementing [`Key::to_model_input`] on custom
110/// types that have a byte representation.
111#[inline]
112pub fn bytes_to_model_input(bytes: &[u8]) -> f64 {
113    let mut buf = [0u8; 8];
114    let len = bytes.len().min(8);
115    buf[..len].copy_from_slice(&bytes[..len]);
116    u64::from_be_bytes(buf) as f64
117}
118
119/// Interpret the first 16 bytes of a byte slice as a big-endian `u128`,
120/// then apply the sign-flip bijection to produce an order-preserving `i128`.
121///
122/// Shorter slices are zero-padded on the right. For slices longer than 16
123/// bytes, only the first 16 bytes are used. Keys that share a 16-byte
124/// prefix will produce the same ordinal (non-injective). The index handles
125/// these collisions via `Ord`-based splits in the node.
126///
127/// This helper is useful for implementing [`Key::to_exact_ordinal`] on custom
128/// types that have a byte representation.
129#[inline]
130#[allow(clippy::cast_possible_wrap)]
131pub fn bytes_to_exact_ordinal(bytes: &[u8]) -> i128 {
132    let mut buf = [0u8; 16];
133    let len = bytes.len().min(16);
134    buf[..len].copy_from_slice(&bytes[..len]);
135    // Order-preserving bijection: XOR with i128::MIN
136    // maps 0x00..00 → i128::MIN and 0xFF..FF → i128::MAX.
137    (u128::from_be_bytes(buf) as i128) ^ i128::MIN
138}
139
140/// Key impl for fixed-size byte arrays.
141///
142/// Enables byte-array keys like `[u8; 16]` (UUIDs), `[u8; 32]` (hashes), etc.
143///
144/// - `to_model_input`: interprets the first 8 bytes as a big-endian `u64` and
145///   casts to `f64`. Shorter arrays are zero-padded on the right. This provides
146///   a monotonic mapping with respect to lexicographic order.
147/// - `to_exact_ordinal`: interprets the first 16 bytes as a big-endian `u128`
148///   (with the same sign-flip trick as `u128`). This is injective for `N <= 16`.
149///   For `N > 16`, it is a best-effort prefix: distinct keys that share their
150///   first 16 bytes will collide. The index resolves these via `Ord`-based
151///   splits in the node.
152impl<const N: usize> Key for [u8; N] {
153    #[inline]
154    fn to_model_input(&self) -> f64 {
155        bytes_to_model_input(self)
156    }
157
158    #[inline]
159    fn to_exact_ordinal(&self) -> i128 {
160        bytes_to_exact_ordinal(self)
161    }
162}
163
164/// Key impl for `String`.
165///
166/// Enables heap-allocated string keys. Uses the UTF-8 byte representation
167/// for model input and ordinal computation.
168///
169/// - `to_model_input`: interprets the first 8 bytes as a big-endian `u64` →
170///   `f64`. Strings shorter than 8 bytes are zero-padded. Monotonic with
171///   respect to lexicographic (byte) order.
172/// - `to_exact_ordinal`: interprets the first 16 bytes as a big-endian `u128`
173///   with sign-flip. Non-injective for strings sharing a 16-byte prefix;
174///   the index uses `Ord`-based node splits to handle these collisions.
175impl Key for String {
176    #[inline]
177    fn to_model_input(&self) -> f64 {
178        bytes_to_model_input(self.as_bytes())
179    }
180
181    #[inline]
182    fn to_exact_ordinal(&self) -> i128 {
183        bytes_to_exact_ordinal(self.as_bytes())
184    }
185}
186
187/// Key impl for `Vec<u8>`.
188///
189/// Enables heap-allocated byte-vector keys. Semantics are identical to the
190/// fixed-size `[u8; N]` implementation but for dynamically sized vectors.
191///
192/// - `to_model_input`: first 8 bytes as big-endian `u64` → `f64`.
193/// - `to_exact_ordinal`: first 16 bytes as `i128` with sign-flip.
194///   Non-injective for vectors sharing a 16-byte prefix.
195impl Key for Vec<u8> {
196    #[inline]
197    fn to_model_input(&self) -> f64 {
198        bytes_to_model_input(self)
199    }
200
201    #[inline]
202    fn to_exact_ordinal(&self) -> i128 {
203        bytes_to_exact_ordinal(self)
204    }
205}
206
207#[cfg(test)]
208mod tests {
209    use super::*;
210
211    #[test]
212    fn u64_monotonic() {
213        let keys: Vec<u64> = vec![0, 1, 100, 1000, u64::MAX / 2];
214        for pair in keys.windows(2) {
215            assert!(
216                pair[0].to_model_input() < pair[1].to_model_input(),
217                "monotonicity violated: {} >= {}",
218                pair[0].to_model_input(),
219                pair[1].to_model_input()
220            );
221        }
222    }
223
224    #[test]
225    fn i64_monotonic() {
226        let keys: Vec<i64> = vec![i64::MIN, -1000, -1, 0, 1, 1000, i64::MAX / 2];
227        for pair in keys.windows(2) {
228            assert!(
229                pair[0].to_model_input() < pair[1].to_model_input(),
230                "monotonicity violated for i64: {} >= {}",
231                pair[0].to_model_input(),
232                pair[1].to_model_input()
233            );
234        }
235    }
236
237    #[test]
238    fn u32_finite() {
239        for &k in &[0u32, 1, u32::MAX] {
240            let v = k.to_model_input();
241            assert!(v.is_finite(), "{k} produced non-finite model input {v}");
242        }
243    }
244
245    #[test]
246    fn key_is_send_sync() {
247        fn assert_send_sync<T: Key>() {}
248        assert_send_sync::<u64>();
249        assert_send_sync::<i32>();
250    }
251
252    #[test]
253    fn exact_ordinal_injective_near_precision_boundary() {
254        // u64 keys near 2^53 where f64 loses precision
255        let base: u64 = 1 << 53;
256        #[allow(clippy::float_cmp)]
257        {
258            assert_eq!(base as f64, (base + 1) as f64, "precondition: same f64");
259        }
260        let o1 = base.to_exact_ordinal();
261        let o2 = (base + 1).to_exact_ordinal();
262        assert_ne!(o1, o2, "to_exact_ordinal must be injective");
263        assert!(o1 < o2, "to_exact_ordinal must be monotonic");
264    }
265
266    #[test]
267    fn exact_ordinal_injective_nanosecond_timestamps() {
268        let base: u64 = 1_700_000_000_000_000_000;
269        for i in 0..256u64 {
270            let o1 = (base + i).to_exact_ordinal();
271            let o2 = (base + i + 1).to_exact_ordinal();
272            assert!(o1 < o2, "monotonicity violated at offset {i}");
273        }
274    }
275
276    #[test]
277    fn u128_exact_ordinal_preserves_order() {
278        let vals: Vec<u128> = vec![0, 1, u128::MAX / 2, u128::MAX / 2 + 1, u128::MAX];
279        for pair in vals.windows(2) {
280            assert!(
281                pair[0].to_exact_ordinal() < pair[1].to_exact_ordinal(),
282                "u128 order violated: {} vs {}",
283                pair[0],
284                pair[1]
285            );
286        }
287    }
288
289    // -----------------------------------------------------------------------
290    // [u8; N] byte array keys
291    // -----------------------------------------------------------------------
292
293    #[test]
294    fn byte_array_key_is_send_sync() {
295        fn assert_key<T: Key>() {}
296        assert_key::<[u8; 4]>();
297        assert_key::<[u8; 8]>();
298        assert_key::<[u8; 16]>();
299        assert_key::<[u8; 32]>();
300    }
301
302    #[test]
303    fn byte4_model_input_monotonic() {
304        let keys: Vec<[u8; 4]> = vec![[0, 0, 0, 0], [0, 0, 0, 1], [0, 0, 1, 0], [1, 0, 0, 0]];
305        for pair in keys.windows(2) {
306            assert!(
307                pair[0].to_model_input() <= pair[1].to_model_input(),
308                "monotonicity violated for [u8; 4]: {:?} > {:?}",
309                pair[0],
310                pair[1]
311            );
312        }
313    }
314
315    #[test]
316    fn byte8_model_input_monotonic() {
317        let keys: Vec<[u8; 8]> = vec![
318            [0; 8],
319            [0, 0, 0, 0, 0, 0, 0, 1],
320            [0, 0, 0, 0, 0, 0, 1, 0],
321            [0, 0, 0, 1, 0, 0, 0, 0],
322            [1, 0, 0, 0, 0, 0, 0, 0],
323            [255; 8],
324        ];
325        for pair in keys.windows(2) {
326            assert!(
327                pair[0].to_model_input() <= pair[1].to_model_input(),
328                "monotonicity violated for [u8; 8]: {:?} > {:?}",
329                pair[0],
330                pair[1]
331            );
332        }
333    }
334
335    #[test]
336    fn byte16_exact_ordinal_injective() {
337        let keys: Vec<[u8; 16]> = vec![
338            [0; 16],
339            [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1],
340            [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0],
341            [0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0],
342            [1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
343            [255; 16],
344        ];
345        for pair in keys.windows(2) {
346            assert!(
347                pair[0].to_exact_ordinal() < pair[1].to_exact_ordinal(),
348                "exact_ordinal not injective for [u8; 16]: {:?} >= {:?}",
349                pair[0],
350                pair[1]
351            );
352        }
353    }
354
355    #[test]
356    fn byte32_model_input_finite() {
357        let keys: Vec<[u8; 32]> = vec![[0; 32], [128; 32], [255; 32]];
358        for k in &keys {
359            let v = k.to_model_input();
360            assert!(v.is_finite(), "{k:?} produced non-finite model input {v}");
361        }
362    }
363
364    #[test]
365    fn byte4_exact_ordinal_monotonic() {
366        // For N < 16, zero-padded; still strictly monotonic for lex order
367        let keys: Vec<[u8; 4]> = vec![[0, 0, 0, 0], [0, 0, 0, 1], [0, 0, 1, 0], [1, 0, 0, 0]];
368        for pair in keys.windows(2) {
369            assert!(
370                pair[0].to_exact_ordinal() < pair[1].to_exact_ordinal(),
371                "exact_ordinal not monotonic for [u8; 4]: {:?} >= {:?}",
372                pair[0],
373                pair[1]
374            );
375        }
376    }
377
378    #[test]
379    fn byte32_exact_ordinal_prefix_only() {
380        // For N > 16, only the first 16 bytes are used. Keys differing
381        // after byte 16 will have the same ordinal.
382        let mut a = [0u8; 32];
383        let mut b = [0u8; 32];
384        a[20] = 1;
385        b[20] = 2;
386        // Same first 16 bytes → same ordinal (best-effort limitation)
387        assert_eq!(a.to_exact_ordinal(), b.to_exact_ordinal());
388        // But Ord comparison still distinguishes them
389        assert!(a < b);
390    }
391
392    // -----------------------------------------------------------------------
393    // String keys
394    // -----------------------------------------------------------------------
395
396    #[test]
397    fn string_key_is_send_sync() {
398        fn assert_key<T: Key>() {}
399        assert_key::<String>();
400    }
401
402    #[test]
403    fn string_model_input_monotonic() {
404        let keys: Vec<String> = vec![
405            String::new(),
406            "a".into(),
407            "b".into(),
408            "hello".into(),
409            "world".into(),
410            "zzzzzzzz".into(),
411        ];
412        for pair in keys.windows(2) {
413            assert!(
414                pair[0].to_model_input() <= pair[1].to_model_input(),
415                "monotonicity violated for String: {:?} > {:?}",
416                pair[0],
417                pair[1]
418            );
419        }
420    }
421
422    #[test]
423    fn string_model_input_finite() {
424        let keys = vec![String::new(), "hello".into(), "\u{ffff}".into()];
425        for k in &keys {
426            let v = k.to_model_input();
427            assert!(v.is_finite(), "{k:?} produced non-finite model input {v}");
428        }
429    }
430
431    #[test]
432    fn string_exact_ordinal_monotonic() {
433        let keys: Vec<String> = vec![
434            String::new(),
435            "a".into(),
436            "aa".into(),
437            "ab".into(),
438            "b".into(),
439        ];
440        for pair in keys.windows(2) {
441            assert!(
442                pair[0].to_exact_ordinal() <= pair[1].to_exact_ordinal(),
443                "exact_ordinal not monotonic for String: {:?} >= {:?}",
444                pair[0],
445                pair[1]
446            );
447        }
448    }
449
450    #[test]
451    fn string_shared_prefix_same_ordinal() {
452        // Strings sharing a 16+ byte prefix have the same ordinal.
453        let a = "1234567890abcdefXXX".to_string();
454        let b = "1234567890abcdefYYY".to_string();
455        assert_eq!(a.to_exact_ordinal(), b.to_exact_ordinal());
456        assert!(a < b);
457    }
458
459    #[test]
460    fn string_different_in_first_16_bytes() {
461        let a = "1234567890abcdeX".to_string();
462        let b = "1234567890abcdeY".to_string();
463        assert!(a.to_exact_ordinal() < b.to_exact_ordinal());
464    }
465
466    #[test]
467    fn string_matches_byte_array() {
468        // String and [u8; N] should produce the same model input/ordinal
469        // for identical byte content.
470        let s = "hello".to_string();
471        let arr: [u8; 5] = *b"hello";
472        #[allow(clippy::float_cmp)]
473        {
474            assert_eq!(s.to_model_input(), arr.to_model_input());
475        }
476        assert_eq!(s.to_exact_ordinal(), arr.to_exact_ordinal());
477    }
478
479    // -----------------------------------------------------------------------
480    // Vec<u8> keys
481    // -----------------------------------------------------------------------
482
483    #[test]
484    fn vec_u8_key_is_send_sync() {
485        fn assert_key<T: Key>() {}
486        assert_key::<Vec<u8>>();
487    }
488
489    #[test]
490    fn vec_u8_model_input_monotonic() {
491        let keys: Vec<Vec<u8>> = vec![vec![], vec![0], vec![0, 1], vec![1], vec![1, 0], vec![255]];
492        for pair in keys.windows(2) {
493            assert!(
494                pair[0].to_model_input() <= pair[1].to_model_input(),
495                "monotonicity violated for Vec<u8>: {:?} > {:?}",
496                pair[0],
497                pair[1]
498            );
499        }
500    }
501
502    #[test]
503    fn vec_u8_exact_ordinal_monotonic() {
504        let keys: Vec<Vec<u8>> = vec![
505            vec![],
506            vec![0],
507            vec![0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1],
508            vec![0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0],
509            vec![1],
510        ];
511        for pair in keys.windows(2) {
512            assert!(
513                pair[0].to_exact_ordinal() <= pair[1].to_exact_ordinal(),
514                "exact_ordinal not monotonic for Vec<u8>: {:?} >= {:?}",
515                pair[0],
516                pair[1]
517            );
518        }
519    }
520}