summavy_codecs/
column.rs

1use std::marker::PhantomData;
2use std::ops::{Range, RangeInclusive};
3
4use tantivy_bitpacker::minmax;
5
6use crate::monotonic_mapping::StrictlyMonotonicFn;
7
8/// `Column` provides columnar access on a field.
9pub trait Column<T: PartialOrd = u64>: Send + Sync {
10    /// Return the value associated with the given idx.
11    ///
12    /// This accessor should return as fast as possible.
13    ///
14    /// # Panics
15    ///
16    /// May panic if `idx` is greater than the column length.
17    fn get_val(&self, idx: u32) -> T;
18
19    /// Fills an output buffer with the fast field values
20    /// associated with the `DocId` going from
21    /// `start` to `start + output.len()`.
22    ///
23    /// # Panics
24    ///
25    /// Must panic if `start + output.len()` is greater than
26    /// the segment's `maxdoc`.
27    #[inline]
28    fn get_range(&self, start: u64, output: &mut [T]) {
29        for (out, idx) in output.iter_mut().zip(start..) {
30            *out = self.get_val(idx as u32);
31        }
32    }
33
34    /// Get the positions of values which are in the provided value range.
35    ///
36    /// Note that position == docid for single value fast fields
37    #[inline]
38    fn get_docids_for_value_range(
39        &self,
40        value_range: RangeInclusive<T>,
41        doc_id_range: Range<u32>,
42        positions: &mut Vec<u32>,
43    ) {
44        let doc_id_range = doc_id_range.start..doc_id_range.end.min(self.num_vals());
45
46        for idx in doc_id_range.start..doc_id_range.end {
47            let val = self.get_val(idx);
48            if value_range.contains(&val) {
49                positions.push(idx);
50            }
51        }
52    }
53
54    /// Returns the minimum value for this fast field.
55    ///
56    /// This min_value may not be exact.
57    /// For instance, the min value does not take in account of possible
58    /// deleted document. All values are however guaranteed to be higher than
59    /// `.min_value()`.
60    fn min_value(&self) -> T;
61
62    /// Returns the maximum value for this fast field.
63    ///
64    /// This max_value may not be exact.
65    /// For instance, the max value does not take in account of possible
66    /// deleted document. All values are however guaranteed to be higher than
67    /// `.max_value()`.
68    fn max_value(&self) -> T;
69
70    /// The number of values in the column.
71    fn num_vals(&self) -> u32;
72
73    /// Returns a iterator over the data
74    fn iter<'a>(&'a self) -> Box<dyn Iterator<Item = T> + 'a> {
75        Box::new((0..self.num_vals()).map(|idx| self.get_val(idx)))
76    }
77}
78
79/// VecColumn provides `Column` over a slice.
80pub struct VecColumn<'a, T = u64> {
81    values: &'a [T],
82    min_value: T,
83    max_value: T,
84}
85
86impl<'a, C: Column<T>, T: Copy + PartialOrd> Column<T> for &'a C {
87    fn get_val(&self, idx: u32) -> T {
88        (*self).get_val(idx)
89    }
90
91    fn min_value(&self) -> T {
92        (*self).min_value()
93    }
94
95    fn max_value(&self) -> T {
96        (*self).max_value()
97    }
98
99    fn num_vals(&self) -> u32 {
100        (*self).num_vals()
101    }
102
103    fn iter<'b>(&'b self) -> Box<dyn Iterator<Item = T> + 'b> {
104        (*self).iter()
105    }
106
107    fn get_range(&self, start: u64, output: &mut [T]) {
108        (*self).get_range(start, output)
109    }
110}
111
112impl<'a, T: Copy + PartialOrd + Send + Sync> Column<T> for VecColumn<'a, T> {
113    fn get_val(&self, position: u32) -> T {
114        self.values[position as usize]
115    }
116
117    fn iter(&self) -> Box<dyn Iterator<Item = T> + '_> {
118        Box::new(self.values.iter().copied())
119    }
120
121    fn min_value(&self) -> T {
122        self.min_value
123    }
124
125    fn max_value(&self) -> T {
126        self.max_value
127    }
128
129    fn num_vals(&self) -> u32 {
130        self.values.len() as u32
131    }
132
133    fn get_range(&self, start: u64, output: &mut [T]) {
134        output.copy_from_slice(&self.values[start as usize..][..output.len()])
135    }
136}
137
138impl<'a, T: Copy + Ord + Default, V> From<&'a V> for VecColumn<'a, T>
139where V: AsRef<[T]> + ?Sized
140{
141    fn from(values: &'a V) -> Self {
142        let values = values.as_ref();
143        let (min_value, max_value) = minmax(values.iter().copied()).unwrap_or_default();
144        Self {
145            values,
146            min_value,
147            max_value,
148        }
149    }
150}
151
152struct MonotonicMappingColumn<C, T, Input> {
153    from_column: C,
154    monotonic_mapping: T,
155    _phantom: PhantomData<Input>,
156}
157
158/// Creates a view of a column transformed by a strictly monotonic mapping. See
159/// [`StrictlyMonotonicFn`].
160///
161/// E.g. apply a gcd monotonic_mapping([100, 200, 300]) == [1, 2, 3]
162/// monotonic_mapping.mapping() is expected to be injective, and we should always have
163/// monotonic_mapping.inverse(monotonic_mapping.mapping(el)) == el
164///
165/// The inverse of the mapping is required for:
166/// `fn get_positions_for_value_range(&self, range: RangeInclusive<T>) -> Vec<u64> `
167/// The user provides the original value range and we need to monotonic map them in the same way the
168/// serialization does before calling the underlying column.
169///
170/// Note that when opening a codec, the monotonic_mapping should be the inverse of the mapping
171/// during serialization. And therefore the monotonic_mapping_inv when opening is the same as
172/// monotonic_mapping during serialization.
173pub fn monotonic_map_column<C, T, Input, Output>(
174    from_column: C,
175    monotonic_mapping: T,
176) -> impl Column<Output>
177where
178    C: Column<Input>,
179    T: StrictlyMonotonicFn<Input, Output> + Send + Sync,
180    Input: PartialOrd + Send + Sync + Clone,
181    Output: PartialOrd + Send + Sync + Clone,
182{
183    MonotonicMappingColumn {
184        from_column,
185        monotonic_mapping,
186        _phantom: PhantomData,
187    }
188}
189
190impl<C, T, Input, Output> Column<Output> for MonotonicMappingColumn<C, T, Input>
191where
192    C: Column<Input>,
193    T: StrictlyMonotonicFn<Input, Output> + Send + Sync,
194    Input: PartialOrd + Send + Sync + Clone,
195    Output: PartialOrd + Send + Sync + Clone,
196{
197    #[inline]
198    fn get_val(&self, idx: u32) -> Output {
199        let from_val = self.from_column.get_val(idx);
200        self.monotonic_mapping.mapping(from_val)
201    }
202
203    fn min_value(&self) -> Output {
204        let from_min_value = self.from_column.min_value();
205        self.monotonic_mapping.mapping(from_min_value)
206    }
207
208    fn max_value(&self) -> Output {
209        let from_max_value = self.from_column.max_value();
210        self.monotonic_mapping.mapping(from_max_value)
211    }
212
213    fn num_vals(&self) -> u32 {
214        self.from_column.num_vals()
215    }
216
217    fn iter(&self) -> Box<dyn Iterator<Item = Output> + '_> {
218        Box::new(
219            self.from_column
220                .iter()
221                .map(|el| self.monotonic_mapping.mapping(el)),
222        )
223    }
224
225    fn get_docids_for_value_range(
226        &self,
227        range: RangeInclusive<Output>,
228        doc_id_range: Range<u32>,
229        positions: &mut Vec<u32>,
230    ) {
231        self.from_column.get_docids_for_value_range(
232            self.monotonic_mapping.inverse(range.start().clone())
233                ..=self.monotonic_mapping.inverse(range.end().clone()),
234            doc_id_range,
235            positions,
236        )
237    }
238
239    // We voluntarily do not implement get_range as it yields a regression,
240    // and we do not have any specialized implementation anyway.
241}
242
243/// Wraps an iterator into a `Column`.
244pub struct IterColumn<T>(T);
245
246impl<T> From<T> for IterColumn<T>
247where T: Iterator + Clone + ExactSizeIterator
248{
249    fn from(iter: T) -> Self {
250        IterColumn(iter)
251    }
252}
253
254impl<T> Column<T::Item> for IterColumn<T>
255where
256    T: Iterator + Clone + ExactSizeIterator + Send + Sync,
257    T::Item: PartialOrd,
258{
259    fn get_val(&self, idx: u32) -> T::Item {
260        self.0.clone().nth(idx as usize).unwrap()
261    }
262
263    fn min_value(&self) -> T::Item {
264        self.0.clone().next().unwrap()
265    }
266
267    fn max_value(&self) -> T::Item {
268        self.0.clone().last().unwrap()
269    }
270
271    fn num_vals(&self) -> u32 {
272        self.0.len() as u32
273    }
274
275    fn iter(&self) -> Box<dyn Iterator<Item = T::Item> + '_> {
276        Box::new(self.0.clone())
277    }
278}
279
280#[cfg(test)]
281mod tests {
282    use super::*;
283    use crate::monotonic_mapping::{
284        StrictlyMonotonicMappingInverter, StrictlyMonotonicMappingToInternalBaseval,
285        StrictlyMonotonicMappingToInternalGCDBaseval,
286    };
287
288    #[test]
289    fn test_monotonic_mapping() {
290        let vals = &[3u64, 5u64][..];
291        let col = VecColumn::from(vals);
292        let mapped = monotonic_map_column(col, StrictlyMonotonicMappingToInternalBaseval::new(2));
293        assert_eq!(mapped.min_value(), 1u64);
294        assert_eq!(mapped.max_value(), 3u64);
295        assert_eq!(mapped.num_vals(), 2);
296        assert_eq!(mapped.num_vals(), 2);
297        assert_eq!(mapped.get_val(0), 1);
298        assert_eq!(mapped.get_val(1), 3);
299    }
300
301    #[test]
302    fn test_range_as_col() {
303        let col = IterColumn::from(10..100);
304        assert_eq!(col.num_vals(), 90);
305        assert_eq!(col.max_value(), 99);
306    }
307
308    #[test]
309    fn test_monotonic_mapping_iter() {
310        let vals: Vec<u64> = (10..110u64).map(|el| el * 10).collect();
311        let col = VecColumn::from(&vals);
312        let mapped = monotonic_map_column(
313            col,
314            StrictlyMonotonicMappingInverter::from(
315                StrictlyMonotonicMappingToInternalGCDBaseval::new(10, 100),
316            ),
317        );
318        let val_i64s: Vec<u64> = mapped.iter().collect();
319        for i in 0..100 {
320            assert_eq!(val_i64s[i as usize], mapped.get_val(i));
321        }
322    }
323
324    #[test]
325    fn test_monotonic_mapping_get_range() {
326        let vals: Vec<u64> = (0..100u64).map(|el| el * 10).collect();
327        let col = VecColumn::from(&vals);
328        let mapped = monotonic_map_column(
329            col,
330            StrictlyMonotonicMappingInverter::from(
331                StrictlyMonotonicMappingToInternalGCDBaseval::new(10, 0),
332            ),
333        );
334
335        assert_eq!(mapped.min_value(), 0u64);
336        assert_eq!(mapped.max_value(), 9900u64);
337        assert_eq!(mapped.num_vals(), 100);
338        let val_u64s: Vec<u64> = mapped.iter().collect();
339        assert_eq!(val_u64s.len(), 100);
340        for i in 0..100 {
341            assert_eq!(val_u64s[i as usize], mapped.get_val(i));
342            assert_eq!(val_u64s[i as usize], vals[i as usize] * 10);
343        }
344        let mut buf = [0u64; 20];
345        mapped.get_range(7, &mut buf[..]);
346        assert_eq!(&val_u64s[7..][..20], &buf);
347    }
348}