summavy_codecs/
lib.rs

1#![warn(missing_docs)]
2#![cfg_attr(all(feature = "unstable", test), feature(test))]
3
4//! # `fastfield_codecs`
5//!
6//! - Columnar storage of data for tantivy [`Column`].
7//! - Encode data in different codecs.
8//! - Monotonically map values to u64/u128
9
10#[cfg(test)]
11#[macro_use]
12extern crate more_asserts;
13
14#[cfg(all(test, feature = "unstable"))]
15extern crate test;
16
17use std::io;
18use std::io::Write;
19use std::sync::Arc;
20
21use common::BinarySerializable;
22use compact_space::CompactSpaceDecompressor;
23use format_version::read_format_version;
24use monotonic_mapping::{
25    StrictlyMonotonicMappingInverter, StrictlyMonotonicMappingToInternal,
26    StrictlyMonotonicMappingToInternalBaseval, StrictlyMonotonicMappingToInternalGCDBaseval,
27};
28use null_index_footer::read_null_index_footer;
29use ownedbytes::OwnedBytes;
30use serialize::{Header, U128Header};
31
32mod bitpacked;
33mod blockwise_linear;
34mod compact_space;
35mod format_version;
36mod line;
37mod linear;
38mod monotonic_mapping;
39mod monotonic_mapping_u128;
40mod null_index_footer;
41
42mod column;
43mod gcd;
44mod serialize;
45
46use self::bitpacked::BitpackedCodec;
47use self::blockwise_linear::BlockwiseLinearCodec;
48pub use self::column::{monotonic_map_column, Column, IterColumn, VecColumn};
49use self::linear::LinearCodec;
50pub use self::monotonic_mapping::{MonotonicallyMappableToU64, StrictlyMonotonicFn};
51pub use self::monotonic_mapping_u128::MonotonicallyMappableToU128;
52pub use self::serialize::{
53    estimate, serialize, serialize_and_load, serialize_u128, NormalizedHeader,
54};
55
56#[derive(PartialEq, Eq, PartialOrd, Ord, Debug, Clone, Copy)]
57#[repr(u8)]
58/// Available codecs to use to encode the u64 (via [`MonotonicallyMappableToU64`]) converted data.
59pub enum FastFieldCodecType {
60    /// Bitpack all values in the value range. The number of bits is defined by the amplitude
61    /// `column.max_value() - column.min_value()`
62    Bitpacked = 1,
63    /// Linear interpolation puts a line between the first and last value and then bitpacks the
64    /// values by the offset from the line. The number of bits is defined by the max deviation from
65    /// the line.
66    Linear = 2,
67    /// Same as [`FastFieldCodecType::Linear`], but encodes in blocks of 512 elements.
68    BlockwiseLinear = 3,
69}
70
71impl BinarySerializable for FastFieldCodecType {
72    fn serialize<W: Write>(&self, wrt: &mut W) -> io::Result<()> {
73        self.to_code().serialize(wrt)
74    }
75
76    fn deserialize<R: io::Read>(reader: &mut R) -> io::Result<Self> {
77        let code = u8::deserialize(reader)?;
78        let codec_type: Self = Self::from_code(code)
79            .ok_or_else(|| io::Error::new(io::ErrorKind::InvalidData, "Unknown code `{code}.`"))?;
80        Ok(codec_type)
81    }
82}
83
84impl FastFieldCodecType {
85    pub(crate) fn to_code(self) -> u8 {
86        self as u8
87    }
88
89    pub(crate) fn from_code(code: u8) -> Option<Self> {
90        match code {
91            1 => Some(Self::Bitpacked),
92            2 => Some(Self::Linear),
93            3 => Some(Self::BlockwiseLinear),
94            _ => None,
95        }
96    }
97}
98
99#[derive(PartialEq, Eq, PartialOrd, Ord, Debug, Clone, Copy)]
100#[repr(u8)]
101/// Available codecs to use to encode the u128 (via [`MonotonicallyMappableToU128`]) converted data.
102pub enum U128FastFieldCodecType {
103    /// This codec takes a large number space (u128) and reduces it to a compact number space, by
104    /// removing the holes.
105    CompactSpace = 1,
106}
107
108impl BinarySerializable for U128FastFieldCodecType {
109    fn serialize<W: Write>(&self, wrt: &mut W) -> io::Result<()> {
110        self.to_code().serialize(wrt)
111    }
112
113    fn deserialize<R: io::Read>(reader: &mut R) -> io::Result<Self> {
114        let code = u8::deserialize(reader)?;
115        let codec_type: Self = Self::from_code(code)
116            .ok_or_else(|| io::Error::new(io::ErrorKind::InvalidData, "Unknown code `{code}.`"))?;
117        Ok(codec_type)
118    }
119}
120
121impl U128FastFieldCodecType {
122    pub(crate) fn to_code(self) -> u8 {
123        self as u8
124    }
125
126    pub(crate) fn from_code(code: u8) -> Option<Self> {
127        match code {
128            1 => Some(Self::CompactSpace),
129            _ => None,
130        }
131    }
132}
133
134/// Returns the correct codec reader wrapped in the `Arc` for the data.
135pub fn open_u128<Item: MonotonicallyMappableToU128>(
136    bytes: OwnedBytes,
137) -> io::Result<Arc<dyn Column<Item>>> {
138    let (bytes, _format_version) = read_format_version(bytes)?;
139    let (mut bytes, _null_index_footer) = read_null_index_footer(bytes)?;
140    let header = U128Header::deserialize(&mut bytes)?;
141    assert_eq!(header.codec_type, U128FastFieldCodecType::CompactSpace);
142    let reader = CompactSpaceDecompressor::open(bytes)?;
143    let inverted: StrictlyMonotonicMappingInverter<StrictlyMonotonicMappingToInternal<Item>> =
144        StrictlyMonotonicMappingToInternal::<Item>::new().into();
145    Ok(Arc::new(monotonic_map_column(reader, inverted)))
146}
147
148/// Returns the correct codec reader wrapped in the `Arc` for the data.
149pub fn open<T: MonotonicallyMappableToU64>(bytes: OwnedBytes) -> io::Result<Arc<dyn Column<T>>> {
150    let (bytes, _format_version) = read_format_version(bytes)?;
151    let (mut bytes, _null_index_footer) = read_null_index_footer(bytes)?;
152    let header = Header::deserialize(&mut bytes)?;
153    match header.codec_type {
154        FastFieldCodecType::Bitpacked => open_specific_codec::<BitpackedCodec, _>(bytes, &header),
155        FastFieldCodecType::Linear => open_specific_codec::<LinearCodec, _>(bytes, &header),
156        FastFieldCodecType::BlockwiseLinear => {
157            open_specific_codec::<BlockwiseLinearCodec, _>(bytes, &header)
158        }
159    }
160}
161
162fn open_specific_codec<C: FastFieldCodec, Item: MonotonicallyMappableToU64>(
163    bytes: OwnedBytes,
164    header: &Header,
165) -> io::Result<Arc<dyn Column<Item>>> {
166    let normalized_header = header.normalized();
167    let reader = C::open_from_bytes(bytes, normalized_header)?;
168    let min_value = header.min_value;
169    if let Some(gcd) = header.gcd {
170        let mapping = StrictlyMonotonicMappingInverter::from(
171            StrictlyMonotonicMappingToInternalGCDBaseval::new(gcd.get(), min_value),
172        );
173        Ok(Arc::new(monotonic_map_column(reader, mapping)))
174    } else {
175        let mapping = StrictlyMonotonicMappingInverter::from(
176            StrictlyMonotonicMappingToInternalBaseval::new(min_value),
177        );
178        Ok(Arc::new(monotonic_map_column(reader, mapping)))
179    }
180}
181
182/// The FastFieldSerializerEstimate trait is required on all variants
183/// of fast field compressions, to decide which one to choose.
184trait FastFieldCodec: 'static {
185    /// A codex needs to provide a unique name and id, which is
186    /// used for debugging and de/serialization.
187    const CODEC_TYPE: FastFieldCodecType;
188
189    type Reader: Column<u64> + 'static;
190
191    /// Reads the metadata and returns the CodecReader
192    fn open_from_bytes(bytes: OwnedBytes, header: NormalizedHeader) -> io::Result<Self::Reader>;
193
194    /// Serializes the data using the serializer into write.
195    ///
196    /// The column iterator should be preferred over using column `get_val` method for
197    /// performance reasons.
198    fn serialize(column: &dyn Column, write: &mut impl Write) -> io::Result<()>;
199
200    /// Returns an estimate of the compression ratio.
201    /// If the codec is not applicable, returns `None`.
202    ///
203    /// The baseline is uncompressed 64bit data.
204    ///
205    /// It could make sense to also return a value representing
206    /// computational complexity.
207    fn estimate(column: &dyn Column) -> Option<f32>;
208}
209
210/// The list of all available codecs for u64 convertible data.
211pub const ALL_CODEC_TYPES: [FastFieldCodecType; 3] = [
212    FastFieldCodecType::Bitpacked,
213    FastFieldCodecType::BlockwiseLinear,
214    FastFieldCodecType::Linear,
215];
216
217#[cfg(test)]
218mod tests {
219
220    use proptest::prelude::*;
221    use proptest::strategy::Strategy;
222    use proptest::{prop_oneof, proptest};
223
224    use crate::bitpacked::BitpackedCodec;
225    use crate::blockwise_linear::BlockwiseLinearCodec;
226    use crate::linear::LinearCodec;
227    use crate::serialize::Header;
228
229    pub(crate) fn create_and_validate<Codec: FastFieldCodec>(
230        data: &[u64],
231        name: &str,
232    ) -> Option<(f32, f32)> {
233        let col = &VecColumn::from(data);
234        let header = Header::compute_header(col, &[Codec::CODEC_TYPE])?;
235        let normalized_col = header.normalize_column(col);
236        let estimation = Codec::estimate(&normalized_col)?;
237
238        let mut out = Vec::new();
239        let col = VecColumn::from(data);
240        serialize(col, &mut out, &[Codec::CODEC_TYPE]).unwrap();
241
242        let actual_compression = out.len() as f32 / (data.len() as f32 * 8.0);
243
244        let reader = crate::open::<u64>(OwnedBytes::new(out)).unwrap();
245        assert_eq!(reader.num_vals(), data.len() as u32);
246        for (doc, orig_val) in data.iter().copied().enumerate() {
247            let val = reader.get_val(doc as u32);
248            assert_eq!(
249                val, orig_val,
250                "val `{val}` does not match orig_val {orig_val:?}, in data set {name}, data \
251                 `{data:?}`",
252            );
253        }
254
255        if !data.is_empty() {
256            let test_rand_idx = rand::thread_rng().gen_range(0..=data.len() - 1);
257            let expected_positions: Vec<u32> = data
258                .iter()
259                .enumerate()
260                .filter(|(_, el)| **el == data[test_rand_idx])
261                .map(|(pos, _)| pos as u32)
262                .collect();
263            let mut positions = Vec::new();
264            reader.get_docids_for_value_range(
265                data[test_rand_idx]..=data[test_rand_idx],
266                0..data.len() as u32,
267                &mut positions,
268            );
269            assert_eq!(expected_positions, positions);
270        }
271        Some((estimation, actual_compression))
272    }
273
274    proptest! {
275        #![proptest_config(ProptestConfig::with_cases(100))]
276
277        #[test]
278        fn test_proptest_small_bitpacked(data in proptest::collection::vec(num_strategy(), 1..10)) {
279            create_and_validate::<BitpackedCodec>(&data, "proptest bitpacked");
280        }
281
282        #[test]
283        fn test_proptest_small_linear(data in proptest::collection::vec(num_strategy(), 1..10)) {
284            create_and_validate::<LinearCodec>(&data, "proptest linearinterpol");
285        }
286
287        #[test]
288        fn test_proptest_small_blockwise_linear(data in proptest::collection::vec(num_strategy(), 1..10)) {
289            create_and_validate::<BlockwiseLinearCodec>(&data, "proptest multilinearinterpol");
290        }
291    }
292
293    proptest! {
294        #![proptest_config(ProptestConfig::with_cases(10))]
295
296        #[test]
297        fn test_proptest_large_bitpacked(data in proptest::collection::vec(num_strategy(), 1..6000)) {
298            create_and_validate::<BitpackedCodec>(&data, "proptest bitpacked");
299        }
300
301        #[test]
302        fn test_proptest_large_linear(data in proptest::collection::vec(num_strategy(), 1..6000)) {
303            create_and_validate::<LinearCodec>(&data, "proptest linearinterpol");
304        }
305
306        #[test]
307        fn test_proptest_large_blockwise_linear(data in proptest::collection::vec(num_strategy(), 1..6000)) {
308            create_and_validate::<BlockwiseLinearCodec>(&data, "proptest multilinearinterpol");
309        }
310    }
311
312    fn num_strategy() -> impl Strategy<Value = u64> {
313        prop_oneof![
314            1 => prop::num::u64::ANY.prop_map(|num| u64::MAX - (num % 10) ),
315            1 => prop::num::u64::ANY.prop_map(|num| num % 10 ),
316            20 => prop::num::u64::ANY,
317        ]
318    }
319
320    pub fn get_codec_test_datasets() -> Vec<(Vec<u64>, &'static str)> {
321        let mut data_and_names = vec![];
322
323        let data = (10..=10_000_u64).collect::<Vec<_>>();
324        data_and_names.push((data, "simple monotonically increasing"));
325
326        data_and_names.push((
327            vec![5, 6, 7, 8, 9, 10, 99, 100],
328            "offset in linear interpol",
329        ));
330        data_and_names.push((vec![5, 50, 3, 13, 1, 1000, 35], "rand small"));
331        data_and_names.push((vec![10], "single value"));
332
333        data_and_names.push((
334            vec![1572656989877777, 1170935903116329, 720575940379279, 0],
335            "overflow error",
336        ));
337
338        data_and_names
339    }
340
341    fn test_codec<C: FastFieldCodec>() {
342        let codec_name = format!("{:?}", C::CODEC_TYPE);
343        for (data, dataset_name) in get_codec_test_datasets() {
344            let estimate_actual_opt: Option<(f32, f32)> =
345                crate::tests::create_and_validate::<C>(&data, dataset_name);
346            let result = if let Some((estimate, actual)) = estimate_actual_opt {
347                format!("Estimate `{estimate}` Actual `{actual}`")
348            } else {
349                "Disabled".to_string()
350            };
351            println!("Codec {codec_name}, DataSet {dataset_name}, {result}");
352        }
353    }
354    #[test]
355    fn test_codec_bitpacking() {
356        test_codec::<BitpackedCodec>();
357    }
358    #[test]
359    fn test_codec_interpolation() {
360        test_codec::<LinearCodec>();
361    }
362    #[test]
363    fn test_codec_multi_interpolation() {
364        test_codec::<BlockwiseLinearCodec>();
365    }
366
367    use super::*;
368
369    #[test]
370    fn estimation_good_interpolation_case() {
371        let data = (10..=20000_u64).collect::<Vec<_>>();
372        let data: VecColumn = data.as_slice().into();
373
374        let linear_interpol_estimation = LinearCodec::estimate(&data).unwrap();
375        assert_le!(linear_interpol_estimation, 0.01);
376
377        let multi_linear_interpol_estimation = BlockwiseLinearCodec::estimate(&data).unwrap();
378        assert_le!(multi_linear_interpol_estimation, 0.2);
379        assert_lt!(linear_interpol_estimation, multi_linear_interpol_estimation);
380
381        let bitpacked_estimation = BitpackedCodec::estimate(&data).unwrap();
382        assert_lt!(linear_interpol_estimation, bitpacked_estimation);
383    }
384    #[test]
385    fn estimation_test_bad_interpolation_case() {
386        let data: &[u64] = &[200, 10, 10, 10, 10, 1000, 20];
387
388        let data: VecColumn = data.into();
389        let linear_interpol_estimation = LinearCodec::estimate(&data).unwrap();
390        assert_le!(linear_interpol_estimation, 0.34);
391
392        let bitpacked_estimation = BitpackedCodec::estimate(&data).unwrap();
393        assert_lt!(bitpacked_estimation, linear_interpol_estimation);
394    }
395
396    #[test]
397    fn estimation_prefer_bitpacked() {
398        let data = VecColumn::from(&[10, 10, 10, 10]);
399        let linear_interpol_estimation = LinearCodec::estimate(&data).unwrap();
400        let bitpacked_estimation = BitpackedCodec::estimate(&data).unwrap();
401        assert_lt!(bitpacked_estimation, linear_interpol_estimation);
402    }
403
404    #[test]
405    fn estimation_test_bad_interpolation_case_monotonically_increasing() {
406        let mut data: Vec<u64> = (201..=20000_u64).collect();
407        data.push(1_000_000);
408        let data: VecColumn = data.as_slice().into();
409
410        // in this case the linear interpolation can't in fact not be worse than bitpacking,
411        // but the estimator adds some threshold, which leads to estimated worse behavior
412        let linear_interpol_estimation = LinearCodec::estimate(&data).unwrap();
413        assert_le!(linear_interpol_estimation, 0.35);
414
415        let bitpacked_estimation = BitpackedCodec::estimate(&data).unwrap();
416        assert_le!(bitpacked_estimation, 0.32);
417        assert_le!(bitpacked_estimation, linear_interpol_estimation);
418    }
419
420    #[test]
421    fn test_fast_field_codec_type_to_code() {
422        let mut count_codec = 0;
423        for code in 0..=255 {
424            if let Some(codec_type) = FastFieldCodecType::from_code(code) {
425                assert_eq!(codec_type.to_code(), code);
426                count_codec += 1;
427            }
428        }
429        assert_eq!(count_codec, 3);
430    }
431}
432
433#[cfg(all(test, feature = "unstable"))]
434mod bench {
435    use std::sync::Arc;
436
437    use ownedbytes::OwnedBytes;
438    use rand::rngs::StdRng;
439    use rand::{Rng, SeedableRng};
440    use test::{self, Bencher};
441
442    use super::*;
443    use crate::Column;
444
445    fn get_data() -> Vec<u64> {
446        let mut rng = StdRng::seed_from_u64(2u64);
447        let mut data: Vec<_> = (100..55000_u64)
448            .map(|num| num + rng.gen::<u8>() as u64)
449            .collect();
450        data.push(99_000);
451        data.insert(1000, 2000);
452        data.insert(2000, 100);
453        data.insert(3000, 4100);
454        data.insert(4000, 100);
455        data.insert(5000, 800);
456        data
457    }
458
459    #[inline(never)]
460    fn value_iter() -> impl Iterator<Item = u64> {
461        0..20_000
462    }
463    fn get_reader_for_bench<Codec: FastFieldCodec>(data: &[u64]) -> Codec::Reader {
464        let mut bytes = Vec::new();
465        let min_value = *data.iter().min().unwrap();
466        let data = data.iter().map(|el| *el - min_value).collect::<Vec<_>>();
467        let col = VecColumn::from(&data);
468        let normalized_header = crate::NormalizedHeader {
469            num_vals: col.num_vals(),
470            max_value: col.max_value(),
471        };
472        Codec::serialize(&VecColumn::from(&data), &mut bytes).unwrap();
473        Codec::open_from_bytes(OwnedBytes::new(bytes), normalized_header).unwrap()
474    }
475    fn bench_get<Codec: FastFieldCodec>(b: &mut Bencher, data: &[u64]) {
476        let col = get_reader_for_bench::<Codec>(data);
477        b.iter(|| {
478            let mut sum = 0u64;
479            for pos in value_iter() {
480                let val = col.get_val(pos as u32);
481                sum = sum.wrapping_add(val);
482            }
483            sum
484        });
485    }
486
487    #[inline(never)]
488    fn bench_get_dynamic_helper(b: &mut Bencher, col: Arc<dyn Column>) {
489        b.iter(|| {
490            let mut sum = 0u64;
491            for pos in value_iter() {
492                let val = col.get_val(pos as u32);
493                sum = sum.wrapping_add(val);
494            }
495            sum
496        });
497    }
498
499    fn bench_get_dynamic<Codec: FastFieldCodec>(b: &mut Bencher, data: &[u64]) {
500        let col = Arc::new(get_reader_for_bench::<Codec>(data));
501        bench_get_dynamic_helper(b, col);
502    }
503    fn bench_create<Codec: FastFieldCodec>(b: &mut Bencher, data: &[u64]) {
504        let min_value = *data.iter().min().unwrap();
505        let data = data.iter().map(|el| *el - min_value).collect::<Vec<_>>();
506
507        let mut bytes = Vec::new();
508        b.iter(|| {
509            bytes.clear();
510            Codec::serialize(&VecColumn::from(&data), &mut bytes).unwrap();
511        });
512    }
513
514    #[bench]
515    fn bench_fastfield_bitpack_create(b: &mut Bencher) {
516        let data: Vec<_> = get_data();
517        bench_create::<BitpackedCodec>(b, &data);
518    }
519    #[bench]
520    fn bench_fastfield_linearinterpol_create(b: &mut Bencher) {
521        let data: Vec<_> = get_data();
522        bench_create::<LinearCodec>(b, &data);
523    }
524    #[bench]
525    fn bench_fastfield_multilinearinterpol_create(b: &mut Bencher) {
526        let data: Vec<_> = get_data();
527        bench_create::<BlockwiseLinearCodec>(b, &data);
528    }
529    #[bench]
530    fn bench_fastfield_bitpack_get(b: &mut Bencher) {
531        let data: Vec<_> = get_data();
532        bench_get::<BitpackedCodec>(b, &data);
533    }
534    #[bench]
535    fn bench_fastfield_bitpack_get_dynamic(b: &mut Bencher) {
536        let data: Vec<_> = get_data();
537        bench_get_dynamic::<BitpackedCodec>(b, &data);
538    }
539    #[bench]
540    fn bench_fastfield_linearinterpol_get(b: &mut Bencher) {
541        let data: Vec<_> = get_data();
542        bench_get::<LinearCodec>(b, &data);
543    }
544    #[bench]
545    fn bench_fastfield_linearinterpol_get_dynamic(b: &mut Bencher) {
546        let data: Vec<_> = get_data();
547        bench_get_dynamic::<LinearCodec>(b, &data);
548    }
549    #[bench]
550    fn bench_fastfield_multilinearinterpol_get(b: &mut Bencher) {
551        let data: Vec<_> = get_data();
552        bench_get::<BlockwiseLinearCodec>(b, &data);
553    }
554    #[bench]
555    fn bench_fastfield_multilinearinterpol_get_dynamic(b: &mut Bencher) {
556        let data: Vec<_> = get_data();
557        bench_get_dynamic::<BlockwiseLinearCodec>(b, &data);
558    }
559}