vortex_btrblocks/
integer.rs

1// SPDX-License-Identifier: Apache-2.0
2// SPDX-FileCopyrightText: Copyright the Vortex contributors
3
4pub mod dictionary;
5mod stats;
6
7use std::fmt::Debug;
8use std::hash::Hash;
9
10pub use stats::IntegerStats;
11use vortex_array::arrays::{ConstantArray, MaskedArray, PrimitiveArray, PrimitiveVTable};
12use vortex_array::vtable::ValidityHelper;
13use vortex_array::{ArrayRef, IntoArray, ToCanonical};
14use vortex_dict::DictArray;
15use vortex_error::{VortexResult, VortexUnwrap, vortex_bail, vortex_err};
16use vortex_fastlanes::{FoRArray, bit_width_histogram, bitpack_encode, find_best_bit_width};
17use vortex_runend::RunEndArray;
18use vortex_runend::compress::runend_encode;
19use vortex_scalar::Scalar;
20use vortex_sequence::sequence_encode;
21use vortex_sparse::{SparseArray, SparseVTable};
22use vortex_zigzag::{ZigZagArray, zigzag_encode};
23
24use crate::integer::dictionary::dictionary_encode;
25use crate::patches::compress_patches;
26use crate::rle::RLEScheme;
27use crate::{
28    Compressor, CompressorStats, GenerateStatsOptions, Scheme,
29    estimate_compression_ratio_with_sampling,
30};
31
32/// [`Compressor`] for signed and unsigned integers.
33pub struct IntCompressor;
34
35impl Compressor for IntCompressor {
36    type ArrayVTable = PrimitiveVTable;
37    type SchemeType = dyn IntegerScheme;
38    type StatsType = IntegerStats;
39
40    fn schemes() -> &'static [&'static dyn IntegerScheme] {
41        &[
42            &ConstantScheme,
43            &FORScheme,
44            &ZigZagScheme,
45            &BitPackingScheme,
46            &SparseScheme,
47            &DictScheme,
48            &RunEndScheme,
49            &SequenceScheme,
50            &RLE_INTEGER_SCHEME,
51        ]
52    }
53
54    fn default_scheme() -> &'static Self::SchemeType {
55        &UncompressedScheme
56    }
57
58    fn dict_scheme_code() -> IntCode {
59        DICT_SCHEME
60    }
61}
62
63impl IntCompressor {
64    pub(crate) fn compress_no_dict(
65        array: &PrimitiveArray,
66        is_sample: bool,
67        allowed_cascading: usize,
68        excludes: &[IntCode],
69    ) -> VortexResult<ArrayRef> {
70        let stats = IntegerStats::generate_opts(
71            array,
72            GenerateStatsOptions {
73                count_distinct_values: false,
74            },
75        );
76
77        let scheme = Self::choose_scheme(&stats, is_sample, allowed_cascading, excludes)?;
78        let output = scheme.compress(&stats, is_sample, allowed_cascading, excludes)?;
79
80        if output.nbytes() < array.nbytes() {
81            Ok(output)
82        } else {
83            log::debug!("resulting tree too large: {}", output.display_tree());
84            Ok(array.to_array())
85        }
86    }
87}
88
89pub trait IntegerScheme: Scheme<StatsType = IntegerStats, CodeType = IntCode> {}
90
91// Auto-impl
92impl<T> IntegerScheme for T where T: Scheme<StatsType = IntegerStats, CodeType = IntCode> {}
93
94#[derive(Debug, Copy, Clone, Eq, PartialEq, Hash)]
95pub struct IntCode(u8);
96
97const UNCOMPRESSED_SCHEME: IntCode = IntCode(0);
98const CONSTANT_SCHEME: IntCode = IntCode(1);
99const FOR_SCHEME: IntCode = IntCode(2);
100const ZIGZAG_SCHEME: IntCode = IntCode(3);
101const BITPACKING_SCHEME: IntCode = IntCode(4);
102const SPARSE_SCHEME: IntCode = IntCode(5);
103const DICT_SCHEME: IntCode = IntCode(6);
104const RUN_END_SCHEME: IntCode = IntCode(7);
105const SEQUENCE_SCHEME: IntCode = IntCode(8);
106const RUN_LENGTH_SCHEME: IntCode = IntCode(9);
107
108#[derive(Debug, Copy, Clone)]
109pub struct UncompressedScheme;
110
111#[derive(Debug, Copy, Clone)]
112pub struct ConstantScheme;
113
114#[derive(Debug, Copy, Clone)]
115pub struct FORScheme;
116
117#[derive(Debug, Copy, Clone)]
118pub struct ZigZagScheme;
119
120#[derive(Debug, Copy, Clone)]
121pub struct BitPackingScheme;
122
123#[derive(Debug, Copy, Clone)]
124pub struct SparseScheme;
125
126#[derive(Debug, Copy, Clone)]
127pub struct DictScheme;
128
129#[derive(Debug, Copy, Clone)]
130pub struct RunEndScheme;
131
132#[derive(Debug, Copy, Clone)]
133pub struct SequenceScheme;
134
135/// Threshold for the average run length in an array before we consider run-end encoding.
136const RUN_END_THRESHOLD: u32 = 4;
137
138pub const RLE_INTEGER_SCHEME: RLEScheme<IntegerStats, IntCode> = RLEScheme::new(
139    RUN_LENGTH_SCHEME,
140    |values, is_sample, allowed_cascading, excludes| {
141        IntCompressor::compress_no_dict(values, is_sample, allowed_cascading, excludes)
142    },
143);
144
145impl Scheme for UncompressedScheme {
146    type StatsType = IntegerStats;
147    type CodeType = IntCode;
148
149    fn code(&self) -> IntCode {
150        UNCOMPRESSED_SCHEME
151    }
152
153    fn expected_compression_ratio(
154        &self,
155        _stats: &IntegerStats,
156        _is_sample: bool,
157        _allowed_cascading: usize,
158        _excludes: &[IntCode],
159    ) -> VortexResult<f64> {
160        // no compression
161        Ok(1.0)
162    }
163
164    fn compress(
165        &self,
166        stats: &IntegerStats,
167        _is_sample: bool,
168        _allowed_cascading: usize,
169        _excludes: &[IntCode],
170    ) -> VortexResult<ArrayRef> {
171        Ok(stats.source().to_array())
172    }
173}
174
175impl Scheme for ConstantScheme {
176    type StatsType = IntegerStats;
177    type CodeType = IntCode;
178
179    fn code(&self) -> IntCode {
180        CONSTANT_SCHEME
181    }
182
183    fn is_constant(&self) -> bool {
184        true
185    }
186
187    fn expected_compression_ratio(
188        &self,
189        stats: &IntegerStats,
190        is_sample: bool,
191        _allowed_cascading: usize,
192        _excludes: &[IntCode],
193    ) -> VortexResult<f64> {
194        // Never yield ConstantScheme for a sample, it could be a false-positive.
195        if is_sample {
196            return Ok(0.0);
197        }
198
199        // Only arrays with one distinct values can be constant compressed.
200        if stats.distinct_values_count > 1 {
201            return Ok(0.0);
202        }
203
204        Ok(stats.value_count as f64)
205    }
206
207    fn compress(
208        &self,
209        stats: &IntegerStats,
210        _is_sample: bool,
211        _allowed_cascading: usize,
212        _excludes: &[IntCode],
213    ) -> VortexResult<ArrayRef> {
214        let scalar_idx = (0..stats.source().len()).position(|idx| stats.source().is_valid(idx));
215
216        match scalar_idx {
217            Some(idx) => {
218                let scalar = stats.source().scalar_at(idx);
219                let const_arr = ConstantArray::new(scalar, stats.src.len()).into_array();
220                if !stats.source().all_valid() {
221                    Ok(MaskedArray::try_new(const_arr, stats.src.validity().clone())?.into_array())
222                } else {
223                    Ok(const_arr)
224                }
225            }
226            None => Ok(ConstantArray::new(
227                Scalar::null(stats.src.dtype().clone()),
228                stats.src.len(),
229            )
230            .into_array()),
231        }
232    }
233}
234
235impl Scheme for FORScheme {
236    type StatsType = IntegerStats;
237    type CodeType = IntCode;
238
239    fn code(&self) -> IntCode {
240        FOR_SCHEME
241    }
242
243    fn expected_compression_ratio(
244        &self,
245        stats: &IntegerStats,
246        _is_sample: bool,
247        allowed_cascading: usize,
248        _excludes: &[IntCode],
249    ) -> VortexResult<f64> {
250        // Only apply if we are not at the leaf
251        if allowed_cascading == 0 {
252            return Ok(0.0);
253        }
254
255        // All-null cannot be FOR compressed.
256        if stats.value_count == 0 {
257            return Ok(0.0);
258        }
259
260        // Only apply when the min is not already zero.
261        if stats.typed.min_is_zero() {
262            return Ok(0.0);
263        }
264
265        // Difference between max and min
266        let full_width: u32 = stats.src.ptype().bit_width().try_into().vortex_unwrap();
267        let bw = match stats.typed.max_minus_min().checked_ilog2() {
268            Some(l) => l + 1,
269            // If max-min == 0, it we should use a different compression scheme
270            // as we don't want to bitpack down to 0 bits.
271            None => return Ok(0.0),
272        };
273
274        // If we're not saving at least 1 byte, don't bother with FOR
275        if full_width - bw < 8 {
276            return Ok(0.0);
277        }
278
279        Ok(full_width as f64 / bw as f64)
280    }
281
282    fn compress(
283        &self,
284        stats: &IntegerStats,
285        is_sample: bool,
286        _allowed_cascading: usize,
287        excludes: &[IntCode],
288    ) -> VortexResult<ArrayRef> {
289        let for_array = FoRArray::encode(stats.src.clone())?;
290        let biased = for_array.encoded().to_primitive();
291        let biased_stats = IntegerStats::generate_opts(
292            &biased,
293            GenerateStatsOptions {
294                count_distinct_values: false,
295            },
296        );
297
298        // Immediately bitpack. If any other scheme was preferable, it would be chosen instead
299        // of bitpacking.
300        // NOTE: we could delegate in the future if we had another downstream codec that performs
301        //  as well.
302        let compressed = BitPackingScheme.compress(&biased_stats, is_sample, 0, excludes)?;
303
304        Ok(FoRArray::try_new(compressed, for_array.reference_scalar().clone())?.into_array())
305    }
306}
307
308impl Scheme for ZigZagScheme {
309    type StatsType = IntegerStats;
310    type CodeType = IntCode;
311
312    fn code(&self) -> IntCode {
313        ZIGZAG_SCHEME
314    }
315
316    fn expected_compression_ratio(
317        &self,
318        stats: &IntegerStats,
319        is_sample: bool,
320        allowed_cascading: usize,
321        excludes: &[IntCode],
322    ) -> VortexResult<f64> {
323        // ZigZag is only useful when we cascade it with another encoding
324        if allowed_cascading == 0 {
325            return Ok(0.0);
326        }
327
328        // Don't try and compress all-null arrays
329        if stats.value_count == 0 {
330            return Ok(0.0);
331        }
332
333        // ZigZag is only useful when there are negative values.
334        if !stats.typed.min_is_negative() {
335            return Ok(0.0);
336        }
337
338        // Run compression on a sample to see how it performs.
339        estimate_compression_ratio_with_sampling(
340            self,
341            stats,
342            is_sample,
343            allowed_cascading,
344            excludes,
345        )
346    }
347
348    fn compress(
349        &self,
350        stats: &IntegerStats,
351        is_sample: bool,
352        allowed_cascading: usize,
353        excludes: &[IntCode],
354    ) -> VortexResult<ArrayRef> {
355        // Zigzag encode the values, then recursively compress the inner values.
356        let zag = zigzag_encode(stats.src.clone())?;
357        let encoded = zag.encoded().to_primitive();
358
359        // ZigZag should be after Dict, RunEnd or Sparse.
360        // We should only do these "container" style compressors once.
361        let mut new_excludes = vec![
362            ZigZagScheme.code(),
363            DictScheme.code(),
364            RunEndScheme.code(),
365            SparseScheme.code(),
366        ];
367        new_excludes.extend_from_slice(excludes);
368
369        let compressed =
370            IntCompressor::compress(&encoded, is_sample, allowed_cascading - 1, &new_excludes)?;
371
372        log::debug!("zigzag output: {}", compressed.display_tree());
373
374        Ok(ZigZagArray::try_new(compressed)?.into_array())
375    }
376}
377
378impl Scheme for BitPackingScheme {
379    type StatsType = IntegerStats;
380    type CodeType = IntCode;
381
382    fn code(&self) -> IntCode {
383        BITPACKING_SCHEME
384    }
385
386    #[allow(clippy::cast_possible_truncation)]
387    fn expected_compression_ratio(
388        &self,
389        stats: &IntegerStats,
390        is_sample: bool,
391        allowed_cascading: usize,
392        excludes: &[IntCode],
393    ) -> VortexResult<f64> {
394        // BitPacking only works for non-negative values
395        if stats.typed.min_is_negative() {
396            return Ok(0.0);
397        }
398
399        // Don't compress all-null arrays
400        if stats.value_count == 0 {
401            return Ok(0.0);
402        }
403
404        estimate_compression_ratio_with_sampling(
405            self,
406            stats,
407            is_sample,
408            allowed_cascading,
409            excludes,
410        )
411    }
412
413    #[allow(clippy::cast_possible_truncation)]
414    fn compress(
415        &self,
416        stats: &IntegerStats,
417        _is_sample: bool,
418        _allowed_cascading: usize,
419        _excludes: &[IntCode],
420    ) -> VortexResult<ArrayRef> {
421        let histogram = bit_width_histogram(stats.source())?;
422        let bw = find_best_bit_width(stats.source().ptype(), &histogram)?;
423        // If best bw is determined to be the current bit-width, return the original array.
424        if bw as usize == stats.source().ptype().bit_width() {
425            return Ok(stats.source().clone().into_array());
426        }
427        let mut packed = bitpack_encode(stats.source(), bw, Some(&histogram))?;
428
429        let patches = packed.patches().map(compress_patches).transpose()?;
430        packed.replace_patches(patches);
431
432        Ok(packed.into_array())
433    }
434}
435
436impl Scheme for SparseScheme {
437    type StatsType = IntegerStats;
438    type CodeType = IntCode;
439
440    fn code(&self) -> IntCode {
441        SPARSE_SCHEME
442    }
443
444    // We can avoid asserting the encoding tree instead.
445    fn expected_compression_ratio(
446        &self,
447        stats: &IntegerStats,
448        _is_sample: bool,
449        allowed_cascading: usize,
450        _excludes: &[IntCode],
451    ) -> VortexResult<f64> {
452        // Only use `SparseScheme` if we can cascade.
453        if allowed_cascading == 0 {
454            return Ok(0.0);
455        }
456
457        if stats.value_count == 0 {
458            // All nulls should use ConstantScheme
459            return Ok(0.0);
460        }
461
462        // If the majority is null, will compress well.
463        if stats.null_count as f64 / stats.src.len() as f64 > 0.9 {
464            return Ok(stats.src.len() as f64 / stats.value_count as f64);
465        }
466
467        // See if the top value accounts for >= 90% of the set values.
468        let (_, top_count) = stats.typed.top_value_and_count();
469
470        if top_count == stats.value_count {
471            // top_value is the only value, should use ConstantScheme instead
472            return Ok(0.0);
473        }
474
475        let freq = top_count as f64 / stats.value_count as f64;
476        if freq >= 0.9 {
477            // We only store the positions of the non-top values.
478            return Ok(stats.value_count as f64 / (stats.value_count - top_count) as f64);
479        }
480
481        Ok(0.0)
482    }
483
484    fn compress(
485        &self,
486        stats: &IntegerStats,
487        is_sample: bool,
488        allowed_cascading: usize,
489        excludes: &[IntCode],
490    ) -> VortexResult<ArrayRef> {
491        assert!(allowed_cascading > 0);
492        let (top_pvalue, top_count) = stats.typed.top_value_and_count();
493        if top_count as usize == stats.src.len() {
494            // top_value is the only value, use ConstantScheme
495            return Ok(ConstantArray::new(
496                Scalar::primitive_value(
497                    top_pvalue,
498                    top_pvalue.ptype(),
499                    stats.src.dtype().nullability(),
500                ),
501                stats.src.len(),
502            )
503            .into_array());
504        }
505
506        let sparse_encoded = SparseArray::encode(
507            stats.src.as_ref(),
508            Some(Scalar::primitive_value(
509                top_pvalue,
510                top_pvalue.ptype(),
511                stats.src.dtype().nullability(),
512            )),
513        )?;
514
515        if let Some(sparse) = sparse_encoded.as_opt::<SparseVTable>() {
516            // Compress the values
517            let mut new_excludes = vec![SparseScheme.code()];
518            new_excludes.extend_from_slice(excludes);
519
520            let compressed_values = IntCompressor::compress_no_dict(
521                &sparse.patches().values().to_primitive(),
522                is_sample,
523                allowed_cascading - 1,
524                &new_excludes,
525            )?;
526
527            let indices = sparse.patches().indices().to_primitive().narrow()?;
528
529            let compressed_indices = IntCompressor::compress_no_dict(
530                &indices,
531                is_sample,
532                allowed_cascading - 1,
533                &new_excludes,
534            )?;
535
536            SparseArray::try_new(
537                compressed_indices,
538                compressed_values,
539                sparse.len(),
540                sparse.fill_scalar().clone(),
541            )
542            .map(|a| a.into_array())
543        } else {
544            Ok(sparse_encoded)
545        }
546    }
547}
548
549impl Scheme for DictScheme {
550    type StatsType = IntegerStats;
551    type CodeType = IntCode;
552
553    fn code(&self) -> IntCode {
554        DICT_SCHEME
555    }
556
557    fn expected_compression_ratio(
558        &self,
559        stats: &IntegerStats,
560        _is_sample: bool,
561        allowed_cascading: usize,
562        _excludes: &[IntCode],
563    ) -> VortexResult<f64> {
564        // Dict should not be terminal.
565        if allowed_cascading == 0 {
566            return Ok(0.0);
567        }
568
569        if stats.value_count == 0 {
570            return Ok(0.0);
571        }
572
573        // If > 50% of the values are distinct, skip dict.
574        if stats.distinct_values_count > stats.value_count / 2 {
575            return Ok(0.0);
576        }
577
578        // Ignore nulls encoding for the estimate. We only focus on values.
579        let values_size = stats.source().ptype().bit_width() * stats.distinct_values_count as usize;
580
581        // Assume codes are compressed RLE + BitPacking.
582        let codes_bw = usize::BITS - stats.distinct_values_count.leading_zeros();
583
584        let n_runs = stats.value_count / stats.average_run_length;
585
586        // Assume that codes will either be BitPack or RLE-BitPack
587        let codes_size_bp = (codes_bw * stats.value_count) as usize;
588        let codes_size_rle_bp = (codes_bw + 32) * n_runs;
589
590        let codes_size = usize::min(codes_size_bp, codes_size_rle_bp as usize);
591
592        let before = stats.value_count as usize * stats.source().ptype().bit_width();
593
594        Ok(before as f64 / (values_size + codes_size) as f64)
595    }
596
597    fn compress(
598        &self,
599        stats: &IntegerStats,
600        is_sample: bool,
601        allowed_cascading: usize,
602        excludes: &[IntCode],
603    ) -> VortexResult<ArrayRef> {
604        assert!(allowed_cascading > 0);
605
606        // TODO(aduffy): we can be more prescriptive: we know that codes will EITHER be
607        //    RLE or FOR + BP. Cascading probably wastes some time here.
608
609        let dict = dictionary_encode(stats);
610
611        // Cascade the codes child
612        // Don't allow SequenceArray as the codes child as it merely adds extra indirection without actually compressing data.
613        let mut new_excludes = vec![DICT_SCHEME, SEQUENCE_SCHEME];
614        new_excludes.extend_from_slice(excludes);
615
616        let compressed_codes = IntCompressor::compress_no_dict(
617            &dict.codes().to_primitive().narrow()?,
618            is_sample,
619            allowed_cascading - 1,
620            &new_excludes,
621        )?;
622
623        // SAFETY: compressing codes does not change their values
624        unsafe {
625            Ok(DictArray::new_unchecked(compressed_codes, dict.values().clone()).into_array())
626        }
627    }
628}
629
630impl Scheme for RunEndScheme {
631    type StatsType = IntegerStats;
632    type CodeType = IntCode;
633
634    fn code(&self) -> IntCode {
635        RUN_END_SCHEME
636    }
637
638    fn expected_compression_ratio(
639        &self,
640        stats: &IntegerStats,
641        is_sample: bool,
642        allowed_cascading: usize,
643        excludes: &[IntCode],
644    ) -> VortexResult<f64> {
645        // If the run length is below the threshold, drop it.
646        if stats.average_run_length < RUN_END_THRESHOLD {
647            return Ok(0.0);
648        }
649
650        if allowed_cascading == 0 {
651            return Ok(0.0);
652        }
653
654        // Run compression on a sample, see how it performs.
655        estimate_compression_ratio_with_sampling(
656            self,
657            stats,
658            is_sample,
659            allowed_cascading,
660            excludes,
661        )
662    }
663
664    fn compress(
665        &self,
666        stats: &IntegerStats,
667        is_sample: bool,
668        allowed_cascading: usize,
669        excludes: &[IntCode],
670    ) -> VortexResult<ArrayRef> {
671        assert!(allowed_cascading > 0);
672
673        // run-end encode the ends
674        let (ends, values) = runend_encode(&stats.src);
675
676        let mut new_excludes = vec![RunEndScheme.code(), DictScheme.code()];
677        new_excludes.extend_from_slice(excludes);
678
679        let ends_stats = IntegerStats::generate_opts(
680            &ends.to_primitive(),
681            GenerateStatsOptions {
682                count_distinct_values: false,
683            },
684        );
685        let ends_scheme = IntCompressor::choose_scheme(
686            &ends_stats,
687            is_sample,
688            allowed_cascading - 1,
689            &new_excludes,
690        )?;
691        let compressed_ends =
692            ends_scheme.compress(&ends_stats, is_sample, allowed_cascading - 1, &new_excludes)?;
693
694        let compressed_values = IntCompressor::compress_no_dict(
695            &values.to_primitive(),
696            is_sample,
697            allowed_cascading - 1,
698            &new_excludes,
699        )?;
700
701        // SAFETY: compression doesn't affect invariants
702        unsafe {
703            Ok(
704                RunEndArray::new_unchecked(compressed_ends, compressed_values, 0, stats.src.len())
705                    .into_array(),
706            )
707        }
708    }
709}
710
711impl Scheme for SequenceScheme {
712    type StatsType = IntegerStats;
713    type CodeType = IntCode;
714
715    fn code(&self) -> Self::CodeType {
716        SEQUENCE_SCHEME
717    }
718
719    fn expected_compression_ratio(
720        &self,
721        stats: &Self::StatsType,
722        _is_sample: bool,
723        _allowed_cascading: usize,
724        _excludes: &[Self::CodeType],
725    ) -> VortexResult<f64> {
726        if stats.null_count > 0 {
727            return Ok(0.0);
728        }
729        // Since two values are required to store base and multiplier the
730        // compression ratio is divided by 2.
731        Ok(sequence_encode(&stats.src)?
732            .map(|_| stats.src.len() as f64 / 2.0)
733            .unwrap_or(0.0))
734    }
735
736    fn compress(
737        &self,
738        stats: &Self::StatsType,
739        _is_sample: bool,
740        _allowed_cascading: usize,
741        _excludes: &[Self::CodeType],
742    ) -> VortexResult<ArrayRef> {
743        if stats.null_count > 0 {
744            vortex_bail!("sequence encoding does not support nulls");
745        }
746        sequence_encode(&stats.src)?.ok_or_else(|| vortex_err!("cannot sequence encode array"))
747    }
748}
749
750#[cfg(test)]
751mod tests {
752    use std::iter;
753
754    use itertools::Itertools;
755    use log::LevelFilter;
756    use rand::rngs::StdRng;
757    use rand::{RngCore, SeedableRng};
758    use vortex_array::arrays::PrimitiveArray;
759    use vortex_array::validity::Validity;
760    use vortex_array::vtable::ValidityHelper;
761    use vortex_array::{Array, IntoArray, ToCanonical};
762    use vortex_buffer::{Buffer, BufferMut, buffer, buffer_mut};
763    use vortex_dict::DictEncoding;
764    use vortex_sequence::SequenceEncoding;
765    use vortex_sparse::SparseEncoding;
766    use vortex_utils::aliases::hash_set::HashSet;
767
768    use crate::integer::{
769        IntCompressor, IntegerStats, RLE_INTEGER_SCHEME, SequenceScheme, SparseScheme,
770    };
771    use crate::{Compressor, CompressorStats, Scheme};
772
773    #[test]
774    fn test_empty() {
775        // Make sure empty array compression does not fail
776        let result = IntCompressor::compress(
777            &PrimitiveArray::new(Buffer::<i32>::empty(), Validity::NonNullable),
778            false,
779            3,
780            &[],
781        )
782        .unwrap();
783
784        assert!(result.is_empty());
785    }
786
787    #[test]
788    fn test_dict_encodable() {
789        let mut codes = BufferMut::<i32>::with_capacity(65_535);
790        // Write some runs of length 3 of a handful of different values. Interrupted by some
791        // one-off values.
792
793        let numbers = [0, 10, 50, 100, 1000, 3000]
794            .into_iter()
795            .map(|i| 1234 * i)
796            .collect_vec();
797
798        let mut rng = StdRng::seed_from_u64(1u64);
799        while codes.len() < 64000 {
800            let run_length = rng.next_u32() % 5;
801            let value = numbers[rng.next_u32() as usize % numbers.len()];
802            for _ in 0..run_length {
803                codes.push(value);
804            }
805        }
806
807        let primitive = codes.freeze().into_array().to_primitive();
808        let compressed = IntCompressor::compress(&primitive, false, 3, &[]).unwrap();
809        assert_eq!(compressed.encoding_id(), DictEncoding.id());
810    }
811
812    #[test]
813    fn test_window_name() {
814        env_logger::builder()
815            .filter(None, LevelFilter::Debug)
816            .try_init()
817            .ok();
818
819        // A test that's meant to mirror the WindowName column from ClickBench.
820        let mut values = buffer_mut![-1i32; 1_000_000];
821        let mut visited = HashSet::new();
822        let mut rng = StdRng::seed_from_u64(1u64);
823        while visited.len() < 223 {
824            let random = (rng.next_u32() as usize) % 1_000_000;
825            if visited.contains(&random) {
826                continue;
827            }
828            visited.insert(random);
829            // Pick 100 random values to insert.
830            values[random] = 5 * (rng.next_u64() % 100) as i32;
831        }
832
833        let array = values.freeze().into_array().to_primitive();
834        let compressed = IntCompressor::compress(&array, false, 3, &[]).unwrap();
835        log::info!("WindowName compressed: {}", compressed.display_tree());
836    }
837
838    #[test]
839    fn sparse_with_nulls() {
840        let array = PrimitiveArray::new(
841            buffer![189u8, 189, 189, 0, 46],
842            Validity::from_iter(vec![true, true, true, true, false]),
843        );
844        let compressed = SparseScheme
845            .compress(&IntegerStats::generate(&array), false, 3, &[])
846            .unwrap();
847        assert_eq!(compressed.encoding_id(), SparseEncoding.id());
848        let decoded = compressed.to_primitive();
849        let expected = [189u8, 189, 189, 0, 0];
850        assert_eq!(decoded.as_slice::<u8>(), &expected);
851        assert_eq!(decoded.validity(), array.validity());
852    }
853
854    #[test]
855    fn sparse_mostly_nulls() {
856        let array = PrimitiveArray::new(
857            buffer![189u8, 189, 189, 189, 189, 189, 189, 189, 189, 0, 46],
858            Validity::from_iter(vec![
859                false, false, false, false, false, false, false, false, false, false, true,
860            ]),
861        );
862        let compressed = SparseScheme
863            .compress(&IntegerStats::generate(&array), false, 3, &[])
864            .unwrap();
865        assert_eq!(compressed.encoding_id(), SparseEncoding.id());
866        let decoded = compressed.to_primitive();
867        let expected = [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 46];
868        assert_eq!(decoded.as_slice::<u8>(), &expected);
869        assert_eq!(decoded.validity(), array.validity());
870    }
871
872    #[test]
873    fn nullable_sequence() {
874        let values = (0i32..20).step_by(7).collect_vec();
875        let array = PrimitiveArray::from_option_iter(values.clone().into_iter().map(Some));
876        let compressed = SequenceScheme
877            .compress(&IntegerStats::generate(&array), false, 3, &[])
878            .unwrap();
879        assert_eq!(compressed.encoding_id(), SequenceEncoding.id());
880        let decoded = compressed.to_primitive();
881        assert_eq!(decoded.as_slice::<i32>(), values.as_slice());
882    }
883
884    #[test]
885    fn test_rle_compression() {
886        let mut values = Vec::new();
887        values.extend(iter::repeat_n(42i32, 100));
888        values.extend(iter::repeat_n(123i32, 200));
889        values.extend(iter::repeat_n(987i32, 150));
890
891        let array = PrimitiveArray::new(Buffer::copy_from(&values), Validity::NonNullable);
892        let compressed = RLE_INTEGER_SCHEME
893            .compress(&IntegerStats::generate(&array), false, 3, &[])
894            .unwrap();
895
896        let decoded = compressed.to_primitive();
897        assert_eq!(decoded.as_slice::<i32>(), values.as_slice());
898    }
899}