1#![warn(missing_docs)]
2#![cfg_attr(all(feature = "unstable", test), feature(test))]
3
4#[cfg(test)]
11#[macro_use]
12extern crate more_asserts;
13
14#[cfg(all(test, feature = "unstable"))]
15extern crate test;
16
17use std::io;
18use std::io::Write;
19use std::sync::Arc;
20
21use common::BinarySerializable;
22use compact_space::CompactSpaceDecompressor;
23use format_version::read_format_version;
24use monotonic_mapping::{
25 StrictlyMonotonicMappingInverter, StrictlyMonotonicMappingToInternal,
26 StrictlyMonotonicMappingToInternalBaseval, StrictlyMonotonicMappingToInternalGCDBaseval,
27};
28use null_index_footer::read_null_index_footer;
29use ownedbytes::OwnedBytes;
30use serialize::{Header, U128Header};
31
32mod bitpacked;
33mod blockwise_linear;
34mod compact_space;
35mod format_version;
36mod line;
37mod linear;
38mod monotonic_mapping;
39mod monotonic_mapping_u128;
40mod null_index_footer;
41
42mod column;
43mod gcd;
44mod serialize;
45
46use self::bitpacked::BitpackedCodec;
47use self::blockwise_linear::BlockwiseLinearCodec;
48pub use self::column::{monotonic_map_column, Column, IterColumn, VecColumn};
49use self::linear::LinearCodec;
50pub use self::monotonic_mapping::{MonotonicallyMappableToU64, StrictlyMonotonicFn};
51pub use self::monotonic_mapping_u128::MonotonicallyMappableToU128;
52pub use self::serialize::{
53 estimate, serialize, serialize_and_load, serialize_u128, NormalizedHeader,
54};
55
56#[derive(PartialEq, Eq, PartialOrd, Ord, Debug, Clone, Copy)]
57#[repr(u8)]
58pub enum FastFieldCodecType {
60 Bitpacked = 1,
63 Linear = 2,
67 BlockwiseLinear = 3,
69}
70
71impl BinarySerializable for FastFieldCodecType {
72 fn serialize<W: Write>(&self, wrt: &mut W) -> io::Result<()> {
73 self.to_code().serialize(wrt)
74 }
75
76 fn deserialize<R: io::Read>(reader: &mut R) -> io::Result<Self> {
77 let code = u8::deserialize(reader)?;
78 let codec_type: Self = Self::from_code(code)
79 .ok_or_else(|| io::Error::new(io::ErrorKind::InvalidData, "Unknown code `{code}.`"))?;
80 Ok(codec_type)
81 }
82}
83
84impl FastFieldCodecType {
85 pub(crate) fn to_code(self) -> u8 {
86 self as u8
87 }
88
89 pub(crate) fn from_code(code: u8) -> Option<Self> {
90 match code {
91 1 => Some(Self::Bitpacked),
92 2 => Some(Self::Linear),
93 3 => Some(Self::BlockwiseLinear),
94 _ => None,
95 }
96 }
97}
98
99#[derive(PartialEq, Eq, PartialOrd, Ord, Debug, Clone, Copy)]
100#[repr(u8)]
101pub enum U128FastFieldCodecType {
103 CompactSpace = 1,
106}
107
108impl BinarySerializable for U128FastFieldCodecType {
109 fn serialize<W: Write>(&self, wrt: &mut W) -> io::Result<()> {
110 self.to_code().serialize(wrt)
111 }
112
113 fn deserialize<R: io::Read>(reader: &mut R) -> io::Result<Self> {
114 let code = u8::deserialize(reader)?;
115 let codec_type: Self = Self::from_code(code)
116 .ok_or_else(|| io::Error::new(io::ErrorKind::InvalidData, "Unknown code `{code}.`"))?;
117 Ok(codec_type)
118 }
119}
120
121impl U128FastFieldCodecType {
122 pub(crate) fn to_code(self) -> u8 {
123 self as u8
124 }
125
126 pub(crate) fn from_code(code: u8) -> Option<Self> {
127 match code {
128 1 => Some(Self::CompactSpace),
129 _ => None,
130 }
131 }
132}
133
134pub fn open_u128<Item: MonotonicallyMappableToU128>(
136 bytes: OwnedBytes,
137) -> io::Result<Arc<dyn Column<Item>>> {
138 let (bytes, _format_version) = read_format_version(bytes)?;
139 let (mut bytes, _null_index_footer) = read_null_index_footer(bytes)?;
140 let header = U128Header::deserialize(&mut bytes)?;
141 assert_eq!(header.codec_type, U128FastFieldCodecType::CompactSpace);
142 let reader = CompactSpaceDecompressor::open(bytes)?;
143 let inverted: StrictlyMonotonicMappingInverter<StrictlyMonotonicMappingToInternal<Item>> =
144 StrictlyMonotonicMappingToInternal::<Item>::new().into();
145 Ok(Arc::new(monotonic_map_column(reader, inverted)))
146}
147
148pub fn open<T: MonotonicallyMappableToU64>(bytes: OwnedBytes) -> io::Result<Arc<dyn Column<T>>> {
150 let (bytes, _format_version) = read_format_version(bytes)?;
151 let (mut bytes, _null_index_footer) = read_null_index_footer(bytes)?;
152 let header = Header::deserialize(&mut bytes)?;
153 match header.codec_type {
154 FastFieldCodecType::Bitpacked => open_specific_codec::<BitpackedCodec, _>(bytes, &header),
155 FastFieldCodecType::Linear => open_specific_codec::<LinearCodec, _>(bytes, &header),
156 FastFieldCodecType::BlockwiseLinear => {
157 open_specific_codec::<BlockwiseLinearCodec, _>(bytes, &header)
158 }
159 }
160}
161
162fn open_specific_codec<C: FastFieldCodec, Item: MonotonicallyMappableToU64>(
163 bytes: OwnedBytes,
164 header: &Header,
165) -> io::Result<Arc<dyn Column<Item>>> {
166 let normalized_header = header.normalized();
167 let reader = C::open_from_bytes(bytes, normalized_header)?;
168 let min_value = header.min_value;
169 if let Some(gcd) = header.gcd {
170 let mapping = StrictlyMonotonicMappingInverter::from(
171 StrictlyMonotonicMappingToInternalGCDBaseval::new(gcd.get(), min_value),
172 );
173 Ok(Arc::new(monotonic_map_column(reader, mapping)))
174 } else {
175 let mapping = StrictlyMonotonicMappingInverter::from(
176 StrictlyMonotonicMappingToInternalBaseval::new(min_value),
177 );
178 Ok(Arc::new(monotonic_map_column(reader, mapping)))
179 }
180}
181
182trait FastFieldCodec: 'static {
185 const CODEC_TYPE: FastFieldCodecType;
188
189 type Reader: Column<u64> + 'static;
190
191 fn open_from_bytes(bytes: OwnedBytes, header: NormalizedHeader) -> io::Result<Self::Reader>;
193
194 fn serialize(column: &dyn Column, write: &mut impl Write) -> io::Result<()>;
199
200 fn estimate(column: &dyn Column) -> Option<f32>;
208}
209
210pub const ALL_CODEC_TYPES: [FastFieldCodecType; 3] = [
212 FastFieldCodecType::Bitpacked,
213 FastFieldCodecType::BlockwiseLinear,
214 FastFieldCodecType::Linear,
215];
216
217#[cfg(test)]
218mod tests {
219
220 use proptest::prelude::*;
221 use proptest::strategy::Strategy;
222 use proptest::{prop_oneof, proptest};
223
224 use crate::bitpacked::BitpackedCodec;
225 use crate::blockwise_linear::BlockwiseLinearCodec;
226 use crate::linear::LinearCodec;
227 use crate::serialize::Header;
228
229 pub(crate) fn create_and_validate<Codec: FastFieldCodec>(
230 data: &[u64],
231 name: &str,
232 ) -> Option<(f32, f32)> {
233 let col = &VecColumn::from(data);
234 let header = Header::compute_header(col, &[Codec::CODEC_TYPE])?;
235 let normalized_col = header.normalize_column(col);
236 let estimation = Codec::estimate(&normalized_col)?;
237
238 let mut out = Vec::new();
239 let col = VecColumn::from(data);
240 serialize(col, &mut out, &[Codec::CODEC_TYPE]).unwrap();
241
242 let actual_compression = out.len() as f32 / (data.len() as f32 * 8.0);
243
244 let reader = crate::open::<u64>(OwnedBytes::new(out)).unwrap();
245 assert_eq!(reader.num_vals(), data.len() as u32);
246 for (doc, orig_val) in data.iter().copied().enumerate() {
247 let val = reader.get_val(doc as u32);
248 assert_eq!(
249 val, orig_val,
250 "val `{val}` does not match orig_val {orig_val:?}, in data set {name}, data \
251 `{data:?}`",
252 );
253 }
254
255 if !data.is_empty() {
256 let test_rand_idx = rand::thread_rng().gen_range(0..=data.len() - 1);
257 let expected_positions: Vec<u32> = data
258 .iter()
259 .enumerate()
260 .filter(|(_, el)| **el == data[test_rand_idx])
261 .map(|(pos, _)| pos as u32)
262 .collect();
263 let mut positions = Vec::new();
264 reader.get_docids_for_value_range(
265 data[test_rand_idx]..=data[test_rand_idx],
266 0..data.len() as u32,
267 &mut positions,
268 );
269 assert_eq!(expected_positions, positions);
270 }
271 Some((estimation, actual_compression))
272 }
273
274 proptest! {
275 #![proptest_config(ProptestConfig::with_cases(100))]
276
277 #[test]
278 fn test_proptest_small_bitpacked(data in proptest::collection::vec(num_strategy(), 1..10)) {
279 create_and_validate::<BitpackedCodec>(&data, "proptest bitpacked");
280 }
281
282 #[test]
283 fn test_proptest_small_linear(data in proptest::collection::vec(num_strategy(), 1..10)) {
284 create_and_validate::<LinearCodec>(&data, "proptest linearinterpol");
285 }
286
287 #[test]
288 fn test_proptest_small_blockwise_linear(data in proptest::collection::vec(num_strategy(), 1..10)) {
289 create_and_validate::<BlockwiseLinearCodec>(&data, "proptest multilinearinterpol");
290 }
291 }
292
293 proptest! {
294 #![proptest_config(ProptestConfig::with_cases(10))]
295
296 #[test]
297 fn test_proptest_large_bitpacked(data in proptest::collection::vec(num_strategy(), 1..6000)) {
298 create_and_validate::<BitpackedCodec>(&data, "proptest bitpacked");
299 }
300
301 #[test]
302 fn test_proptest_large_linear(data in proptest::collection::vec(num_strategy(), 1..6000)) {
303 create_and_validate::<LinearCodec>(&data, "proptest linearinterpol");
304 }
305
306 #[test]
307 fn test_proptest_large_blockwise_linear(data in proptest::collection::vec(num_strategy(), 1..6000)) {
308 create_and_validate::<BlockwiseLinearCodec>(&data, "proptest multilinearinterpol");
309 }
310 }
311
312 fn num_strategy() -> impl Strategy<Value = u64> {
313 prop_oneof![
314 1 => prop::num::u64::ANY.prop_map(|num| u64::MAX - (num % 10) ),
315 1 => prop::num::u64::ANY.prop_map(|num| num % 10 ),
316 20 => prop::num::u64::ANY,
317 ]
318 }
319
320 pub fn get_codec_test_datasets() -> Vec<(Vec<u64>, &'static str)> {
321 let mut data_and_names = vec![];
322
323 let data = (10..=10_000_u64).collect::<Vec<_>>();
324 data_and_names.push((data, "simple monotonically increasing"));
325
326 data_and_names.push((
327 vec![5, 6, 7, 8, 9, 10, 99, 100],
328 "offset in linear interpol",
329 ));
330 data_and_names.push((vec![5, 50, 3, 13, 1, 1000, 35], "rand small"));
331 data_and_names.push((vec![10], "single value"));
332
333 data_and_names.push((
334 vec![1572656989877777, 1170935903116329, 720575940379279, 0],
335 "overflow error",
336 ));
337
338 data_and_names
339 }
340
341 fn test_codec<C: FastFieldCodec>() {
342 let codec_name = format!("{:?}", C::CODEC_TYPE);
343 for (data, dataset_name) in get_codec_test_datasets() {
344 let estimate_actual_opt: Option<(f32, f32)> =
345 crate::tests::create_and_validate::<C>(&data, dataset_name);
346 let result = if let Some((estimate, actual)) = estimate_actual_opt {
347 format!("Estimate `{estimate}` Actual `{actual}`")
348 } else {
349 "Disabled".to_string()
350 };
351 println!("Codec {codec_name}, DataSet {dataset_name}, {result}");
352 }
353 }
354 #[test]
355 fn test_codec_bitpacking() {
356 test_codec::<BitpackedCodec>();
357 }
358 #[test]
359 fn test_codec_interpolation() {
360 test_codec::<LinearCodec>();
361 }
362 #[test]
363 fn test_codec_multi_interpolation() {
364 test_codec::<BlockwiseLinearCodec>();
365 }
366
367 use super::*;
368
369 #[test]
370 fn estimation_good_interpolation_case() {
371 let data = (10..=20000_u64).collect::<Vec<_>>();
372 let data: VecColumn = data.as_slice().into();
373
374 let linear_interpol_estimation = LinearCodec::estimate(&data).unwrap();
375 assert_le!(linear_interpol_estimation, 0.01);
376
377 let multi_linear_interpol_estimation = BlockwiseLinearCodec::estimate(&data).unwrap();
378 assert_le!(multi_linear_interpol_estimation, 0.2);
379 assert_lt!(linear_interpol_estimation, multi_linear_interpol_estimation);
380
381 let bitpacked_estimation = BitpackedCodec::estimate(&data).unwrap();
382 assert_lt!(linear_interpol_estimation, bitpacked_estimation);
383 }
384 #[test]
385 fn estimation_test_bad_interpolation_case() {
386 let data: &[u64] = &[200, 10, 10, 10, 10, 1000, 20];
387
388 let data: VecColumn = data.into();
389 let linear_interpol_estimation = LinearCodec::estimate(&data).unwrap();
390 assert_le!(linear_interpol_estimation, 0.34);
391
392 let bitpacked_estimation = BitpackedCodec::estimate(&data).unwrap();
393 assert_lt!(bitpacked_estimation, linear_interpol_estimation);
394 }
395
396 #[test]
397 fn estimation_prefer_bitpacked() {
398 let data = VecColumn::from(&[10, 10, 10, 10]);
399 let linear_interpol_estimation = LinearCodec::estimate(&data).unwrap();
400 let bitpacked_estimation = BitpackedCodec::estimate(&data).unwrap();
401 assert_lt!(bitpacked_estimation, linear_interpol_estimation);
402 }
403
404 #[test]
405 fn estimation_test_bad_interpolation_case_monotonically_increasing() {
406 let mut data: Vec<u64> = (201..=20000_u64).collect();
407 data.push(1_000_000);
408 let data: VecColumn = data.as_slice().into();
409
410 let linear_interpol_estimation = LinearCodec::estimate(&data).unwrap();
413 assert_le!(linear_interpol_estimation, 0.35);
414
415 let bitpacked_estimation = BitpackedCodec::estimate(&data).unwrap();
416 assert_le!(bitpacked_estimation, 0.32);
417 assert_le!(bitpacked_estimation, linear_interpol_estimation);
418 }
419
420 #[test]
421 fn test_fast_field_codec_type_to_code() {
422 let mut count_codec = 0;
423 for code in 0..=255 {
424 if let Some(codec_type) = FastFieldCodecType::from_code(code) {
425 assert_eq!(codec_type.to_code(), code);
426 count_codec += 1;
427 }
428 }
429 assert_eq!(count_codec, 3);
430 }
431}
432
433#[cfg(all(test, feature = "unstable"))]
434mod bench {
435 use std::sync::Arc;
436
437 use ownedbytes::OwnedBytes;
438 use rand::rngs::StdRng;
439 use rand::{Rng, SeedableRng};
440 use test::{self, Bencher};
441
442 use super::*;
443 use crate::Column;
444
445 fn get_data() -> Vec<u64> {
446 let mut rng = StdRng::seed_from_u64(2u64);
447 let mut data: Vec<_> = (100..55000_u64)
448 .map(|num| num + rng.gen::<u8>() as u64)
449 .collect();
450 data.push(99_000);
451 data.insert(1000, 2000);
452 data.insert(2000, 100);
453 data.insert(3000, 4100);
454 data.insert(4000, 100);
455 data.insert(5000, 800);
456 data
457 }
458
459 #[inline(never)]
460 fn value_iter() -> impl Iterator<Item = u64> {
461 0..20_000
462 }
463 fn get_reader_for_bench<Codec: FastFieldCodec>(data: &[u64]) -> Codec::Reader {
464 let mut bytes = Vec::new();
465 let min_value = *data.iter().min().unwrap();
466 let data = data.iter().map(|el| *el - min_value).collect::<Vec<_>>();
467 let col = VecColumn::from(&data);
468 let normalized_header = crate::NormalizedHeader {
469 num_vals: col.num_vals(),
470 max_value: col.max_value(),
471 };
472 Codec::serialize(&VecColumn::from(&data), &mut bytes).unwrap();
473 Codec::open_from_bytes(OwnedBytes::new(bytes), normalized_header).unwrap()
474 }
475 fn bench_get<Codec: FastFieldCodec>(b: &mut Bencher, data: &[u64]) {
476 let col = get_reader_for_bench::<Codec>(data);
477 b.iter(|| {
478 let mut sum = 0u64;
479 for pos in value_iter() {
480 let val = col.get_val(pos as u32);
481 sum = sum.wrapping_add(val);
482 }
483 sum
484 });
485 }
486
487 #[inline(never)]
488 fn bench_get_dynamic_helper(b: &mut Bencher, col: Arc<dyn Column>) {
489 b.iter(|| {
490 let mut sum = 0u64;
491 for pos in value_iter() {
492 let val = col.get_val(pos as u32);
493 sum = sum.wrapping_add(val);
494 }
495 sum
496 });
497 }
498
499 fn bench_get_dynamic<Codec: FastFieldCodec>(b: &mut Bencher, data: &[u64]) {
500 let col = Arc::new(get_reader_for_bench::<Codec>(data));
501 bench_get_dynamic_helper(b, col);
502 }
503 fn bench_create<Codec: FastFieldCodec>(b: &mut Bencher, data: &[u64]) {
504 let min_value = *data.iter().min().unwrap();
505 let data = data.iter().map(|el| *el - min_value).collect::<Vec<_>>();
506
507 let mut bytes = Vec::new();
508 b.iter(|| {
509 bytes.clear();
510 Codec::serialize(&VecColumn::from(&data), &mut bytes).unwrap();
511 });
512 }
513
514 #[bench]
515 fn bench_fastfield_bitpack_create(b: &mut Bencher) {
516 let data: Vec<_> = get_data();
517 bench_create::<BitpackedCodec>(b, &data);
518 }
519 #[bench]
520 fn bench_fastfield_linearinterpol_create(b: &mut Bencher) {
521 let data: Vec<_> = get_data();
522 bench_create::<LinearCodec>(b, &data);
523 }
524 #[bench]
525 fn bench_fastfield_multilinearinterpol_create(b: &mut Bencher) {
526 let data: Vec<_> = get_data();
527 bench_create::<BlockwiseLinearCodec>(b, &data);
528 }
529 #[bench]
530 fn bench_fastfield_bitpack_get(b: &mut Bencher) {
531 let data: Vec<_> = get_data();
532 bench_get::<BitpackedCodec>(b, &data);
533 }
534 #[bench]
535 fn bench_fastfield_bitpack_get_dynamic(b: &mut Bencher) {
536 let data: Vec<_> = get_data();
537 bench_get_dynamic::<BitpackedCodec>(b, &data);
538 }
539 #[bench]
540 fn bench_fastfield_linearinterpol_get(b: &mut Bencher) {
541 let data: Vec<_> = get_data();
542 bench_get::<LinearCodec>(b, &data);
543 }
544 #[bench]
545 fn bench_fastfield_linearinterpol_get_dynamic(b: &mut Bencher) {
546 let data: Vec<_> = get_data();
547 bench_get_dynamic::<LinearCodec>(b, &data);
548 }
549 #[bench]
550 fn bench_fastfield_multilinearinterpol_get(b: &mut Bencher) {
551 let data: Vec<_> = get_data();
552 bench_get::<BlockwiseLinearCodec>(b, &data);
553 }
554 #[bench]
555 fn bench_fastfield_multilinearinterpol_get_dynamic(b: &mut Bencher) {
556 let data: Vec<_> = get_data();
557 bench_get_dynamic::<BlockwiseLinearCodec>(b, &data);
558 }
559}