pgm_extra/util/
bytes.rs

1//! Support for bytes and string keys via prefix extraction.
2//!
3//! PGM-Index internally uses numeric keys for its linear models.
4//! For bytes/strings, we extract an 8-byte prefix as a u64 for the model,
5//! while keeping the full key for exact comparisons.
6
7use core::cmp::Ordering;
8
9/// A fixed-size prefix extracted from bytes/strings for use as a PGM key.
10///
11/// This extracts the first 8 bytes (big-endian) to create a u64,
12/// that preserves lexicographic ordering for the prefix.
13/// The full original data should be kept separately for exact lookups.
14#[derive(Clone, Copy, Debug, Default, PartialEq, Eq)]
15#[cfg_attr(feature = "serde", derive(serde::Serialize, serde::Deserialize))]
16#[repr(transparent)]
17pub struct Prefix(pub u64);
18
19impl Prefix {
20    #[inline]
21    pub fn from_bytes(bytes: &[u8]) -> Self {
22        let mut buf = [0u8; 8];
23        let len = bytes.len().min(8);
24        buf[..len].copy_from_slice(&bytes[..len]);
25        Self(u64::from_be_bytes(buf))
26    }
27
28    #[inline]
29    pub fn as_u64(self) -> u64 {
30        self.0
31    }
32}
33
34#[cfg(feature = "std")]
35impl std::str::FromStr for Prefix {
36    type Err = core::convert::Infallible;
37
38    #[inline]
39    fn from_str(s: &str) -> Result<Self, Self::Err> {
40        Ok(Self::from_bytes(s.as_bytes()))
41    }
42}
43
44impl From<&[u8]> for Prefix {
45    #[inline]
46    fn from(bytes: &[u8]) -> Self {
47        Self::from_bytes(bytes)
48    }
49}
50
51impl PartialOrd for Prefix {
52    #[inline]
53    fn partial_cmp(&self, other: &Self) -> Option<Ordering> {
54        Some(self.cmp(other))
55    }
56}
57
58impl Ord for Prefix {
59    #[inline]
60    fn cmp(&self, other: &Self) -> Ordering {
61        self.0.cmp(&other.0)
62    }
63}
64
65impl crate::index::Key for Prefix {
66    type Unsigned = u64;
67
68    #[inline]
69    fn to_unsigned(self) -> Self::Unsigned {
70        self.0
71    }
72
73    #[inline]
74    fn to_f64_fast(self) -> f64 {
75        self.0 as f64
76    }
77
78    #[inline]
79    fn to_i64_fast(self) -> i64 {
80        self.0 as i64
81    }
82}
83
84impl num_traits::Zero for Prefix {
85    #[inline]
86    fn zero() -> Self {
87        Self(0)
88    }
89
90    #[inline]
91    fn is_zero(&self) -> bool {
92        self.0 == 0
93    }
94}
95
96impl num_traits::Bounded for Prefix {
97    #[inline]
98    fn min_value() -> Self {
99        Self(u64::MIN)
100    }
101
102    #[inline]
103    fn max_value() -> Self {
104        Self(u64::MAX)
105    }
106}
107
108impl num_traits::ToPrimitive for Prefix {
109    #[inline]
110    fn to_i64(&self) -> Option<i64> {
111        Some(self.0 as i64)
112    }
113
114    #[inline]
115    fn to_u64(&self) -> Option<u64> {
116        Some(self.0)
117    }
118}
119
120impl core::ops::Add for Prefix {
121    type Output = Self;
122
123    #[inline]
124    fn add(self, rhs: Self) -> Self::Output {
125        Self(self.0.wrapping_add(rhs.0))
126    }
127}
128
129impl core::ops::Sub for Prefix {
130    type Output = Self;
131
132    #[inline]
133    fn sub(self, rhs: Self) -> Self::Output {
134        Self(self.0.wrapping_sub(rhs.0))
135    }
136}
137
138#[cfg(test)]
139mod tests {
140    use super::*;
141    use crate::index::Static;
142    use alloc::vec;
143    use alloc::vec::Vec;
144
145    #[test]
146    fn test_bytes_prefix_ordering() {
147        let a = Prefix::from_bytes(b"apple");
148        let b = Prefix::from_bytes(b"banana");
149        let c = Prefix::from_bytes(b"cherry");
150
151        assert!(a < b);
152        assert!(b < c);
153    }
154
155    #[test]
156    fn test_bytes_prefix_from_bytes() {
157        let bytes = b"hello world";
158        let prefix = Prefix::from_bytes(bytes);
159
160        let expected = u64::from_be_bytes(*b"hello wo");
161        assert_eq!(prefix.as_u64(), expected);
162    }
163
164    #[test]
165    fn test_bytes_prefix_short() {
166        let short = Prefix::from_bytes(b"hi");
167        let mut expected = [0u8; 8];
168        expected[0] = b'h';
169        expected[1] = b'i';
170        assert_eq!(short.as_u64(), u64::from_be_bytes(expected));
171    }
172
173    #[test]
174    fn test_pgm_with_string_prefixes() {
175        let strings: Vec<&str> = vec![
176            "aardvark",
177            "apple",
178            "banana",
179            "cherry",
180            "date",
181            "elderberry",
182            "fig",
183            "grape",
184            "honeydew",
185            "jackfruit",
186        ];
187
188        let prefixes: Vec<Prefix> = strings
189            .iter()
190            .map(|s| Prefix::from_bytes(s.as_bytes()))
191            .collect();
192        let index = Static::new(&prefixes, 2, 2).unwrap();
193
194        let query = Prefix::from_bytes(b"cherry");
195        let approx = index.search(&query);
196
197        let found_idx = prefixes[approx.lo..approx.hi]
198            .binary_search(&query)
199            .map(|i| approx.lo + i);
200
201        assert_eq!(found_idx, Ok(3));
202    }
203
204    #[test]
205    fn test_bytes_prefix_equality() {
206        let a = Prefix::from_bytes(b"test");
207        let b = Prefix::from_bytes(b"test");
208        assert_eq!(a, b);
209    }
210
211    #[test]
212    fn test_long_strings_same_prefix() {
213        let a = Prefix::from_bytes(b"abcdefgh_suffix1");
214        let b = Prefix::from_bytes(b"abcdefgh_suffix2");
215
216        assert_eq!(a, b);
217    }
218
219    #[cfg(feature = "std")]
220    #[test]
221    fn test_from_str() {
222        let prefix: Prefix = "hello".parse().unwrap();
223        assert_eq!(prefix, Prefix::from_bytes(b"hello"));
224    }
225}