pgm_extra/index/external/
one_level.rs

1//! Single-level PGM-Index.
2//!
3//! A simpler variant with only one level of linear models.
4//! Good for smaller datasets or when minimal memory is preferred.
5
6use alloc::vec::Vec;
7use core::ops::RangeBounds;
8
9use crate::error::Error;
10
11use crate::index::key::Indexable;
12use crate::index::model::build_segments;
13use crate::index::segment::Segment;
14
15use crate::util::ApproxPos;
16use crate::util::range::range_to_indices;
17use crate::util::search::{pgm_add_eps, pgm_sub_eps};
18
19/// A single-level PGM-Index.
20///
21/// This is a simpler variant of the multi-level index that uses only
22/// one level of segments. It's suitable for smaller datasets or when
23/// you want to minimize memory usage at the cost of slightly longer
24/// segment search time.
25///
26/// # Example
27///
28/// ```
29/// use pgm_extra::index::external::OneLevel;
30///
31/// let keys: Vec<u64> = (0..1000).collect();
32/// let index = OneLevel::new(&keys, 8).unwrap();
33///
34/// assert!(index.contains(&keys, &500));
35/// ```
36#[derive(Clone, Debug)]
37#[cfg_attr(feature = "serde", derive(serde::Serialize, serde::Deserialize))]
38#[cfg_attr(
39    feature = "serde",
40    serde(bound = "T::Key: serde::Serialize + serde::de::DeserializeOwned")
41)]
42pub struct OneLevel<T: Indexable> {
43    epsilon: usize,
44    len: usize,
45    segments: Vec<Segment<T::Key>>,
46}
47
48impl<T: Indexable> OneLevel<T>
49where
50    T::Key: Ord,
51{
52    /// Build a new single-level PGM-Index from sorted data.
53    pub fn new(data: &[T], epsilon: usize) -> Result<Self, Error> {
54        if data.is_empty() {
55            return Err(Error::EmptyInput);
56        }
57        if epsilon == 0 {
58            return Err(Error::InvalidEpsilon);
59        }
60
61        debug_assert!(
62            data.windows(2)
63                .all(|w| w[0].index_key() <= w[1].index_key()),
64            "data must be sorted by index key"
65        );
66
67        let keys: Vec<T::Key> = data.iter().map(|v| v.index_key()).collect();
68        let segments = build_segments(&keys, epsilon);
69
70        Ok(Self {
71            segments,
72            epsilon,
73            len: keys.len(),
74        })
75    }
76
77    #[inline]
78    fn find_segment(&self, key: &T::Key) -> usize {
79        let pos = self
80            .segments
81            .partition_point(|s| s.key <= *key)
82            .saturating_sub(1);
83        pos.min(self.segments.len().saturating_sub(1))
84    }
85
86    /// Get an approximate position for the given value.
87    #[inline]
88    pub fn search(&self, value: &T) -> ApproxPos {
89        let key = value.index_key();
90        self.search_by_key(&key)
91    }
92
93    /// Get an approximate position for the given key.
94    #[inline]
95    pub fn search_by_key(&self, key: &T::Key) -> ApproxPos {
96        let seg_idx = self.find_segment(key);
97        let segment = &self.segments[seg_idx];
98
99        let pos = segment.predict(*key).min(self.len.saturating_sub(1));
100        let lo = pgm_sub_eps(pos, self.epsilon);
101        let hi = pgm_add_eps(pos, self.epsilon, self.len);
102
103        ApproxPos::new(pos, lo, hi)
104    }
105
106    /// Find the first position where `data[pos] >= value`.
107    #[inline]
108    pub fn lower_bound(&self, data: &[T], value: &T) -> usize
109    where
110        T: Ord,
111    {
112        let key = value.index_key();
113        let approx = self.search_by_key(&key);
114        let len = data.len();
115
116        if len == 0 {
117            return 0;
118        }
119
120        let pos = approx.pos.min(len - 1);
121        let k = &data[pos];
122
123        if *k == *value {
124            let mut i = pos;
125            while i > 0 && data[i - 1] == *value {
126                i -= 1;
127            }
128            return i;
129        }
130
131        if *k < *value {
132            if pos + 1 < len && data[pos + 1] >= *value {
133                return pos + 1;
134            }
135        } else if pos > 0 && data[pos - 1] < *value {
136            return pos;
137        }
138
139        let lo = approx.lo;
140        let hi = approx.hi.min(len);
141        let slice = &data[lo..hi];
142        lo + slice.partition_point(|x| x < value)
143    }
144
145    /// Find the first position where `data[pos] > value`.
146    #[inline]
147    pub fn upper_bound(&self, data: &[T], value: &T) -> usize
148    where
149        T: Ord,
150    {
151        let idx = self.lower_bound(data, value);
152        let mut i = idx;
153        while i < data.len() && data[i] == *value {
154            i += 1;
155        }
156        i
157    }
158
159    /// Check if the value exists in the data.
160    #[inline]
161    pub fn contains(&self, data: &[T], value: &T) -> bool
162    where
163        T: Ord,
164    {
165        let idx = self.lower_bound(data, value);
166        idx < data.len() && data[idx] == *value
167    }
168
169    #[inline]
170    pub fn len(&self) -> usize {
171        self.len
172    }
173
174    #[inline]
175    pub fn is_empty(&self) -> bool {
176        self.len == 0
177    }
178
179    #[inline]
180    pub fn segments_count(&self) -> usize {
181        self.segments.len()
182    }
183
184    #[inline]
185    pub fn epsilon(&self) -> usize {
186        self.epsilon
187    }
188
189    pub fn size_in_bytes(&self) -> usize {
190        core::mem::size_of::<Self>()
191            + self.segments.capacity() * core::mem::size_of::<Segment<T::Key>>()
192    }
193
194    /// Returns the (start, end) indices for iterating over data in the given range.
195    #[inline]
196    pub fn range_indices<R>(&self, data: &[T], range: R) -> (usize, usize)
197    where
198        T: Ord,
199        R: RangeBounds<T>,
200    {
201        range_to_indices(
202            range,
203            data.len(),
204            |v| self.lower_bound(data, v),
205            |v| self.upper_bound(data, v),
206        )
207    }
208
209    /// Returns an iterator over data in the given range.
210    #[inline]
211    pub fn range<'a, R>(&self, data: &'a [T], range: R) -> impl DoubleEndedIterator<Item = &'a T>
212    where
213        T: Ord,
214        R: RangeBounds<T>,
215    {
216        let (start, end) = self.range_indices(data, range);
217        data[start..end].iter()
218    }
219}
220
221impl<T: Indexable> crate::index::External<T> for OneLevel<T>
222where
223    T::Key: Ord,
224{
225    #[inline]
226    fn search(&self, value: &T) -> ApproxPos {
227        self.search(value)
228    }
229
230    #[inline]
231    fn lower_bound(&self, data: &[T], value: &T) -> usize
232    where
233        T: Ord,
234    {
235        self.lower_bound(data, value)
236    }
237
238    #[inline]
239    fn upper_bound(&self, data: &[T], value: &T) -> usize
240    where
241        T: Ord,
242    {
243        self.upper_bound(data, value)
244    }
245
246    #[inline]
247    fn contains(&self, data: &[T], value: &T) -> bool
248    where
249        T: Ord,
250    {
251        self.contains(data, value)
252    }
253
254    #[inline]
255    fn len(&self) -> usize {
256        self.len()
257    }
258
259    #[inline]
260    fn segments_count(&self) -> usize {
261        self.segments_count()
262    }
263
264    #[inline]
265    fn epsilon(&self) -> usize {
266        self.epsilon()
267    }
268
269    #[inline]
270    fn size_in_bytes(&self) -> usize {
271        self.size_in_bytes()
272    }
273}
274
275#[cfg(test)]
276mod tests {
277    use super::*;
278    use alloc::vec;
279    use alloc::vec::Vec;
280
281    #[test]
282    fn test_one_level_basic() {
283        let keys: Vec<u64> = (0..1000).collect();
284        let index = OneLevel::new(&keys, 8).unwrap();
285
286        assert_eq!(index.len(), 1000);
287        assert!(!index.is_empty());
288    }
289
290    #[test]
291    fn test_one_level_search() {
292        let keys: Vec<u64> = (0..1000).collect();
293        let index = OneLevel::new(&keys, 8).unwrap();
294
295        for &key in &[0u64, 100, 500, 999] {
296            let idx = index.lower_bound(&keys, &key);
297            assert_eq!(idx, key as usize);
298        }
299    }
300
301    #[test]
302    fn test_one_level_contains() {
303        let keys: Vec<u64> = (0..100).map(|i| i * 2).collect();
304        let index = OneLevel::new(&keys, 8).unwrap();
305
306        assert!(index.contains(&keys, &0));
307        assert!(index.contains(&keys, &50));
308        assert!(!index.contains(&keys, &1));
309        assert!(!index.contains(&keys, &51));
310    }
311
312    #[test]
313    fn test_one_level_empty_error() {
314        let keys: Vec<u64> = vec![];
315        let result = OneLevel::new(&keys, 8);
316        assert!(matches!(result, Err(Error::EmptyInput)));
317    }
318}