pgm_extra/util/
bytes.rs

1//! Support for bytes and string keys via prefix extraction.
2//!
3//! PGM-Index internally uses numeric keys for its linear models.
4//! For bytes/strings, we extract an 8-byte prefix as a u64 for the model,
5//! while keeping the full key for exact comparisons.
6
7use core::cmp::Ordering;
8
9/// A fixed-size prefix extracted from bytes/strings for use as a PGM key.
10///
11/// This extracts the first 8 bytes (big-endian) to create a u64,
12/// that preserves lexicographic ordering for the prefix.
13/// The full original data should be kept separately for exact lookups.
14#[derive(Clone, Copy, Debug, Default, PartialEq, Eq)]
15#[cfg_attr(
16    feature = "rkyv",
17    derive(rkyv::Archive, rkyv::Serialize, rkyv::Deserialize)
18)]
19#[cfg_attr(feature = "serde", derive(serde::Serialize, serde::Deserialize))]
20#[repr(transparent)]
21pub struct Prefix(pub u64);
22
23impl Prefix {
24    #[inline]
25    pub fn from_bytes(bytes: &[u8]) -> Self {
26        let mut buf = [0u8; 8];
27        let len = bytes.len().min(8);
28        buf[..len].copy_from_slice(&bytes[..len]);
29        Self(u64::from_be_bytes(buf))
30    }
31
32    #[inline]
33    pub fn as_u64(self) -> u64 {
34        self.0
35    }
36}
37
38#[cfg(feature = "std")]
39impl std::str::FromStr for Prefix {
40    type Err = core::convert::Infallible;
41
42    #[inline]
43    fn from_str(s: &str) -> Result<Self, Self::Err> {
44        Ok(Self::from_bytes(s.as_bytes()))
45    }
46}
47
48impl From<&[u8]> for Prefix {
49    #[inline]
50    fn from(bytes: &[u8]) -> Self {
51        Self::from_bytes(bytes)
52    }
53}
54
55impl PartialOrd for Prefix {
56    #[inline]
57    fn partial_cmp(&self, other: &Self) -> Option<Ordering> {
58        Some(self.cmp(other))
59    }
60}
61
62impl Ord for Prefix {
63    #[inline]
64    fn cmp(&self, other: &Self) -> Ordering {
65        self.0.cmp(&other.0)
66    }
67}
68
69impl crate::index::Key for Prefix {
70    type Unsigned = u64;
71
72    #[inline]
73    fn to_unsigned(self) -> Self::Unsigned {
74        self.0
75    }
76
77    #[inline]
78    fn to_f64_fast(self) -> f64 {
79        self.0 as f64
80    }
81
82    #[inline]
83    fn to_i64_fast(self) -> i64 {
84        self.0 as i64
85    }
86}
87
88impl num_traits::Zero for Prefix {
89    #[inline]
90    fn zero() -> Self {
91        Self(0)
92    }
93
94    #[inline]
95    fn is_zero(&self) -> bool {
96        self.0 == 0
97    }
98}
99
100impl num_traits::Bounded for Prefix {
101    #[inline]
102    fn min_value() -> Self {
103        Self(u64::MIN)
104    }
105
106    #[inline]
107    fn max_value() -> Self {
108        Self(u64::MAX)
109    }
110}
111
112impl num_traits::ToPrimitive for Prefix {
113    #[inline]
114    fn to_i64(&self) -> Option<i64> {
115        Some(self.0 as i64)
116    }
117
118    #[inline]
119    fn to_u64(&self) -> Option<u64> {
120        Some(self.0)
121    }
122}
123
124impl core::ops::Add for Prefix {
125    type Output = Self;
126
127    #[inline]
128    fn add(self, rhs: Self) -> Self::Output {
129        Self(self.0.wrapping_add(rhs.0))
130    }
131}
132
133impl core::ops::Sub for Prefix {
134    type Output = Self;
135
136    #[inline]
137    fn sub(self, rhs: Self) -> Self::Output {
138        Self(self.0.wrapping_sub(rhs.0))
139    }
140}
141
142#[cfg(test)]
143mod tests {
144    use super::*;
145    use crate::index::Static;
146    use alloc::vec;
147    use alloc::vec::Vec;
148
149    #[test]
150    fn test_bytes_prefix_ordering() {
151        let a = Prefix::from_bytes(b"apple");
152        let b = Prefix::from_bytes(b"banana");
153        let c = Prefix::from_bytes(b"cherry");
154
155        assert!(a < b);
156        assert!(b < c);
157    }
158
159    #[test]
160    fn test_bytes_prefix_from_bytes() {
161        let bytes = b"hello world";
162        let prefix = Prefix::from_bytes(bytes);
163
164        let expected = u64::from_be_bytes(*b"hello wo");
165        assert_eq!(prefix.as_u64(), expected);
166    }
167
168    #[test]
169    fn test_bytes_prefix_short() {
170        let short = Prefix::from_bytes(b"hi");
171        let mut expected = [0u8; 8];
172        expected[0] = b'h';
173        expected[1] = b'i';
174        assert_eq!(short.as_u64(), u64::from_be_bytes(expected));
175    }
176
177    #[test]
178    fn test_pgm_with_string_prefixes() {
179        let strings: Vec<&str> = vec![
180            "aardvark",
181            "apple",
182            "banana",
183            "cherry",
184            "date",
185            "elderberry",
186            "fig",
187            "grape",
188            "honeydew",
189            "jackfruit",
190        ];
191
192        let prefixes: Vec<Prefix> = strings
193            .iter()
194            .map(|s| Prefix::from_bytes(s.as_bytes()))
195            .collect();
196        let index = Static::new(&prefixes, 2, 2).unwrap();
197
198        let query = Prefix::from_bytes(b"cherry");
199        let approx = index.search(&query);
200
201        let found_idx = prefixes[approx.lo..approx.hi]
202            .binary_search(&query)
203            .map(|i| approx.lo + i);
204
205        assert_eq!(found_idx, Ok(3));
206    }
207
208    #[test]
209    fn test_bytes_prefix_equality() {
210        let a = Prefix::from_bytes(b"test");
211        let b = Prefix::from_bytes(b"test");
212        assert_eq!(a, b);
213    }
214
215    #[test]
216    fn test_long_strings_same_prefix() {
217        let a = Prefix::from_bytes(b"abcdefgh_suffix1");
218        let b = Prefix::from_bytes(b"abcdefgh_suffix2");
219
220        assert_eq!(a, b);
221    }
222
223    #[cfg(feature = "std")]
224    #[test]
225    fn test_from_str() {
226        let prefix: Prefix = "hello".parse().unwrap();
227        assert_eq!(prefix, Prefix::from_bytes(b"hello"));
228    }
229}