vortex_btrblocks/
integer.rs

// SPDX-License-Identifier: Apache-2.0
// SPDX-FileCopyrightText: Copyright the Vortex contributors

pub mod dictionary;
mod stats;

use std::fmt::Debug;
use std::hash::Hash;

pub use stats::IntegerStats;
use vortex_array::arrays::{ConstantArray, PrimitiveArray, PrimitiveVTable};
use vortex_array::compress::downscale_integer_array;
use vortex_array::{ArrayRef, IntoArray, ToCanonical};
use vortex_dict::DictArray;
use vortex_error::{VortexExpect, VortexResult, VortexUnwrap, vortex_bail, vortex_err};
use vortex_fastlanes::{FoRArray, bit_width_histogram, bitpack_encode, find_best_bit_width};
use vortex_runend::RunEndArray;
use vortex_runend::compress::runend_encode;
use vortex_scalar::Scalar;
use vortex_sequence::sequence_encode;
use vortex_sparse::{SparseArray, SparseVTable};
use vortex_zigzag::{ZigZagArray, zigzag_encode};

use crate::integer::dictionary::dictionary_encode;
use crate::patches::compress_patches;
use crate::{
    Compressor, CompressorStats, GenerateStatsOptions, Scheme,
    estimate_compression_ratio_with_sampling,
};

pub struct IntCompressor;

impl Compressor for IntCompressor {
    type ArrayVTable = PrimitiveVTable;
    type SchemeType = dyn IntegerScheme;
    type StatsType = IntegerStats;

    fn schemes() -> &'static [&'static dyn IntegerScheme] {
        &[
            &ConstantScheme,
            &FORScheme,
            &ZigZagScheme,
            &BitPackingScheme,
            &SparseScheme,
            &DictScheme,
            &RunEndScheme,
            &SequenceScheme,
        ]
    }

    fn default_scheme() -> &'static Self::SchemeType {
        &UncompressedScheme
    }

    fn dict_scheme_code() -> IntCode {
        DICT_SCHEME
    }
}

impl IntCompressor {
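    /// Compress a primitive array without generating distinct-value statistics, so dictionary
    /// encoding is effectively skipped. Used when compressing the children of container
    /// encodings such as Dict, RunEnd, and Sparse.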
    pub fn compress_no_dict(
        array: &PrimitiveArray,
        is_sample: bool,
        allowed_cascading: usize,
        excludes: &[IntCode],
    ) -> VortexResult<ArrayRef> {
        let stats = IntegerStats::generate_opts(
            array,
            GenerateStatsOptions {
                count_distinct_values: false,
            },
        );

        let scheme = Self::choose_scheme(&stats, is_sample, allowed_cascading, excludes)?;
        let output = scheme.compress(&stats, is_sample, allowed_cascading, excludes)?;

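        // Keep the compressed result only if it is actually smaller than the input.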
        if output.nbytes() < array.nbytes() {
            Ok(output)
        } else {
            log::debug!("resulting tree too large: {}", output.display_tree());
            Ok(array.to_array())
        }
    }
}

pub trait IntegerScheme: Scheme<StatsType = IntegerStats, CodeType = IntCode> {}

// Auto-impl
impl<T> IntegerScheme for T where T: Scheme<StatsType = IntegerStats, CodeType = IntCode> {}

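/// Identifies an integer compression scheme. Codes are collected into `excludes` lists to
/// prevent a scheme from being applied again further down the cascade.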
#[derive(Debug, Copy, Clone, Eq, PartialEq, Hash)]
pub struct IntCode(u8);

const UNCOMPRESSED_SCHEME: IntCode = IntCode(0);
const CONSTANT_SCHEME: IntCode = IntCode(1);
const FOR_SCHEME: IntCode = IntCode(2);
const ZIGZAG_SCHEME: IntCode = IntCode(3);
const BITPACKING_SCHEME: IntCode = IntCode(4);
const SPARSE_SCHEME: IntCode = IntCode(5);
const DICT_SCHEME: IntCode = IntCode(6);
const RUNEND_SCHEME: IntCode = IntCode(7);
const SEQUENCE_SCHEME: IntCode = IntCode(8);

#[derive(Debug, Copy, Clone)]
pub struct UncompressedScheme;

#[derive(Debug, Copy, Clone)]
pub struct ConstantScheme;

#[derive(Debug, Copy, Clone)]
pub struct FORScheme;

#[derive(Debug, Copy, Clone)]
pub struct ZigZagScheme;

#[derive(Debug, Copy, Clone)]
pub struct BitPackingScheme;

#[derive(Debug, Copy, Clone)]
pub struct SparseScheme;

#[derive(Debug, Copy, Clone)]
pub struct DictScheme;

#[derive(Debug, Copy, Clone)]
pub struct RunEndScheme;

#[derive(Debug, Copy, Clone)]
pub struct SequenceScheme;

/// Threshold for the average run length in an array before we consider run-end encoding.
const RUN_END_THRESHOLD: u32 = 4;

impl Scheme for UncompressedScheme {
    type StatsType = IntegerStats;
    type CodeType = IntCode;

    fn code(&self) -> IntCode {
        UNCOMPRESSED_SCHEME
    }

    fn expected_compression_ratio(
        &self,
        _stats: &IntegerStats,
        _is_sample: bool,
        _allowed_cascading: usize,
        _excludes: &[IntCode],
    ) -> VortexResult<f64> {
        // no compression
        Ok(1.0)
    }

    fn compress(
        &self,
        stats: &IntegerStats,
        _is_sample: bool,
        _allowed_cascading: usize,
        _excludes: &[IntCode],
    ) -> VortexResult<ArrayRef> {
        Ok(stats.source().to_array())
    }
}

impl Scheme for ConstantScheme {
    type StatsType = IntegerStats;
    type CodeType = IntCode;

    fn code(&self) -> IntCode {
        CONSTANT_SCHEME
    }

    fn is_constant(&self) -> bool {
        true
    }

    fn expected_compression_ratio(
        &self,
        stats: &IntegerStats,
        is_sample: bool,
        _allowed_cascading: usize,
        _excludes: &[IntCode],
    ) -> VortexResult<f64> {
        // Never yield ConstantScheme for a sample; it could be a false positive.
        if is_sample {
            return Ok(0.0);
        }

        // Only arrays with a single distinct value can be constant compressed.
        if stats.distinct_values_count != 1 {
            return Ok(0.0);
        }

        // Cannot have mix of nulls and non-nulls
        if stats.null_count > 0 && stats.value_count > 0 {
            return Ok(0.0);
        }

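        // The whole array collapses to a single constant scalar, so use the value count as the
        // estimated ratio.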
        Ok(stats.value_count as f64)
    }

    fn compress(
        &self,
        stats: &IntegerStats,
        _is_sample: bool,
        _allowed_cascading: usize,
        _excludes: &[IntCode],
    ) -> VortexResult<ArrayRef> {
        // We only use Constant encoding if the entire array is constant, never if one of
        // the child arrays yields a constant value.
        let scalar = stats
            .source()
            .as_constant()
            .vortex_expect("constant array expected");

        Ok(ConstantArray::new(scalar, stats.src.len()).into_array())
    }
}

impl Scheme for FORScheme {
    type StatsType = IntegerStats;
    type CodeType = IntCode;

    fn code(&self) -> IntCode {
        FOR_SCHEME
    }

    fn expected_compression_ratio(
        &self,
        stats: &IntegerStats,
        _is_sample: bool,
        allowed_cascading: usize,
        _excludes: &[IntCode],
    ) -> VortexResult<f64> {
        // Only apply if we are not at the leaf
        if allowed_cascading == 0 {
            return Ok(0.0);
        }

        // All-null cannot be FOR compressed.
        if stats.value_count == 0 {
            return Ok(0.0);
        }

        // Only apply when the min is not already zero.
        if stats.typed.min_is_zero() {
            return Ok(0.0);
        }

        // Difference between max and min
        let full_width: u32 = stats.src.ptype().bit_width().try_into().vortex_unwrap();
        let bw = match stats.typed.max_minus_min().checked_ilog2() {
            Some(l) => l + 1,
            // If max - min == 0, we should use a different compression scheme,
            // as we don't want to bitpack down to 0 bits.
            None => return Ok(0.0),
        };

        // If we're not saving at least 1 byte, don't bother with FOR
        if full_width - bw < 8 {
            return Ok(0.0);
        }

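        // Example: i64 values in [1000, 1003] give full_width = 64 and bw = 2 (max - min = 3),
        // for an estimated ratio of 64 / 2 = 32.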
        Ok(full_width as f64 / bw as f64)
    }

    fn compress(
        &self,
        stats: &IntegerStats,
        is_sample: bool,
        _allowed_cascading: usize,
        excludes: &[IntCode],
    ) -> VortexResult<ArrayRef> {
        let for_array = FoRArray::encode(stats.src.clone())?;
        let biased = for_array.encoded().to_primitive()?;
        let biased_stats = IntegerStats::generate_opts(
            &biased,
            GenerateStatsOptions {
                count_distinct_values: false,
            },
        );

        // Immediately bitpack. If any other scheme was preferable, it would be chosen instead
        // of bitpacking.
        // NOTE: we could delegate in the future if we had another downstream codec that performs
        //  as well.
        let compressed = BitPackingScheme.compress(&biased_stats, is_sample, 0, excludes)?;

        Ok(FoRArray::try_new(compressed, for_array.reference_scalar().clone())?.into_array())
    }
}

impl Scheme for ZigZagScheme {
    type StatsType = IntegerStats;
    type CodeType = IntCode;

    fn code(&self) -> IntCode {
        ZIGZAG_SCHEME
    }

    fn expected_compression_ratio(
        &self,
        stats: &IntegerStats,
        is_sample: bool,
        allowed_cascading: usize,
        excludes: &[IntCode],
    ) -> VortexResult<f64> {
        // ZigZag is only useful when we cascade it with another encoding
        if allowed_cascading == 0 {
            return Ok(0.0);
        }

        // Don't try and compress all-null arrays
        if stats.value_count == 0 {
            return Ok(0.0);
        }

        // ZigZag is only useful when there are negative values.
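        // ZigZag maps 0, -1, 1, -2, 2, ... to 0, 1, 2, 3, 4, ..., so values close to zero become
        // small unsigned integers that bit-pack well.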
        if !stats.typed.min_is_negative() {
            return Ok(0.0);
        }

        // Run compression on a sample to see how it performs.
        estimate_compression_ratio_with_sampling(
            self,
            stats,
            is_sample,
            allowed_cascading,
            excludes,
        )
    }

    fn compress(
        &self,
        stats: &IntegerStats,
        is_sample: bool,
        allowed_cascading: usize,
        excludes: &[IntCode],
    ) -> VortexResult<ArrayRef> {
        // Zigzag encode the values, then recursively compress the inner values.
        let zag = zigzag_encode(stats.src.clone())?;
        let encoded = zag.encoded().to_primitive()?;

        // ZigZag should be after Dict, RunEnd or Sparse.
        // We should only do these "container" style compressors once.
        let mut new_excludes = vec![
            ZigZagScheme.code(),
            DictScheme.code(),
            RunEndScheme.code(),
            SparseScheme.code(),
        ];
        new_excludes.extend_from_slice(excludes);

        let compressed =
            IntCompressor::compress(&encoded, is_sample, allowed_cascading - 1, &new_excludes)?;

        log::debug!("zigzag output: {}", compressed.display_tree());

        Ok(ZigZagArray::try_new(compressed)?.into_array())
    }
}

impl Scheme for BitPackingScheme {
    type StatsType = IntegerStats;
    type CodeType = IntCode;

    fn code(&self) -> IntCode {
        BITPACKING_SCHEME
    }

    #[allow(clippy::cast_possible_truncation)]
    fn expected_compression_ratio(
        &self,
        stats: &IntegerStats,
        is_sample: bool,
        allowed_cascading: usize,
        excludes: &[IntCode],
    ) -> VortexResult<f64> {
        // BitPacking only works for non-negative values
        if stats.typed.min_is_negative() {
            return Ok(0.0);
        }

        // Don't compress all-null arrays
        if stats.value_count == 0 {
            return Ok(0.0);
        }

        estimate_compression_ratio_with_sampling(
            self,
            stats,
            is_sample,
            allowed_cascading,
            excludes,
        )
    }

    #[allow(clippy::cast_possible_truncation)]
    fn compress(
        &self,
        stats: &IntegerStats,
        _is_sample: bool,
        _allowed_cascading: usize,
        _excludes: &[IntCode],
    ) -> VortexResult<ArrayRef> {
        let histogram = bit_width_histogram(stats.source())?;
        let bw = find_best_bit_width(stats.source().ptype(), &histogram)?;
        // If best bw is determined to be the current bit-width, return the original array.
        if bw as usize == stats.source().ptype().bit_width() {
            return Ok(stats.source().clone().into_array());
        }
        let mut packed = bitpack_encode(stats.source(), bw, Some(&histogram))?;

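        // Values that don't fit in the chosen bit width are kept as patches;
        // compress those separately.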
        let patches = packed.patches().map(compress_patches).transpose()?;
        packed.replace_patches(patches);

        Ok(packed.into_array())
    }
}

impl Scheme for SparseScheme {
    type StatsType = IntegerStats;
    type CodeType = IntCode;

    fn code(&self) -> IntCode {
        SPARSE_SCHEME
    }

    // We can avoid asserting the encoding tree instead.
    fn expected_compression_ratio(
        &self,
        stats: &IntegerStats,
        _is_sample: bool,
        _allowed_cascading: usize,
        _excludes: &[IntCode],
    ) -> VortexResult<f64> {
        if stats.value_count == 0 {
            // All nulls should use ConstantScheme
            return Ok(0.0);
        }

        // If more than 90% of the values are null, this will compress well.
        if stats.null_count as f64 / stats.src.len() as f64 > 0.9 {
            return Ok(stats.src.len() as f64 / stats.value_count as f64);
        }

        // See if the top value accounts for >= 90% of the set values.
        let (_, top_count) = stats.typed.top_value_and_count();

        if top_count == stats.value_count {
            // top_value is the only value, should use ConstantScheme instead
            return Ok(0.0);
        }

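        // Example: 1000 set values where the top value appears 950 times gives freq = 0.95 and
        // an estimated ratio of 1000 / 50 = 20.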
        let freq = top_count as f64 / stats.value_count as f64;
        if freq >= 0.9 {
            // We only store the positions of the non-top values.
            return Ok(stats.value_count as f64 / (stats.value_count - top_count) as f64);
        }

        Ok(0.0)
    }

    fn compress(
        &self,
        stats: &IntegerStats,
        is_sample: bool,
        allowed_cascading: usize,
        excludes: &[IntCode],
    ) -> VortexResult<ArrayRef> {
        assert!(allowed_cascading > 0);
        let (top_pvalue, top_count) = stats.typed.top_value_and_count();
        if top_count as usize == stats.src.len() {
            // top_value is the only value, use ConstantScheme
            return Ok(ConstantArray::new(
                Scalar::primitive_value(
                    top_pvalue,
                    top_pvalue.ptype(),
                    stats.src.dtype().nullability(),
                ),
                stats.src.len(),
            )
            .into_array());
        }

        let sparse_encoded = SparseArray::encode(
            stats.src.as_ref(),
            Some(Scalar::primitive_value(
                top_pvalue,
                top_pvalue.ptype(),
                stats.src.dtype().nullability(),
            )),
        )?;

        if let Some(sparse) = sparse_encoded.as_opt::<SparseVTable>() {
            // Compress the values
            let mut new_excludes = vec![SparseScheme.code()];
            new_excludes.extend_from_slice(excludes);

            let compressed_values = IntCompressor::compress_no_dict(
                &sparse.patches().values().to_primitive()?,
                is_sample,
                allowed_cascading - 1,
                &new_excludes,
            )?;

            let indices =
                downscale_integer_array(sparse.patches().indices().clone())?.to_primitive()?;

            let compressed_indices = IntCompressor::compress_no_dict(
                &indices,
                is_sample,
                allowed_cascading - 1,
                &new_excludes,
            )?;

            SparseArray::try_new(
                compressed_indices,
                compressed_values,
                sparse.len(),
                sparse.fill_scalar().clone(),
            )
            .map(|a| a.into_array())
        } else {
            Ok(sparse_encoded)
        }
    }
}

impl Scheme for DictScheme {
    type StatsType = IntegerStats;
    type CodeType = IntCode;

    fn code(&self) -> IntCode {
        DICT_SCHEME
    }

    fn expected_compression_ratio(
        &self,
        stats: &IntegerStats,
        _is_sample: bool,
        allowed_cascading: usize,
        _excludes: &[IntCode],
    ) -> VortexResult<f64> {
        // Dict should not be terminal.
        if allowed_cascading == 0 {
            return Ok(0.0);
        }

        if stats.value_count == 0 {
            return Ok(0.0);
        }

        // If > 50% of the values are distinct, skip dict.
        if stats.distinct_values_count > stats.value_count / 2 {
            return Ok(0.0);
        }

        // Ignore the encoding of nulls for the estimate; we focus only on the values.
        let values_size = stats.source().ptype().bit_width() * stats.distinct_values_count as usize;

        // Assume codes are compressed RLE + BitPacking.
        let codes_bw = usize::BITS - stats.distinct_values_count.leading_zeros();

        let n_runs = stats.value_count / stats.average_run_length;

        // Assume that codes will either be BitPack or RLE-BitPack
        let codes_size_bp = (codes_bw * stats.value_count) as usize;
        let codes_size_rle_bp = (codes_bw + 32) * n_runs;

        let codes_size = usize::min(codes_size_bp, codes_size_rle_bp as usize);

        let before = stats.value_count as usize * stats.source().ptype().bit_width();

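        // Estimated ratio: uncompressed bits over the dictionary values plus the cheaper of the
        // two code layouts.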
        Ok(before as f64 / (values_size + codes_size) as f64)
    }

    fn compress(
        &self,
        stats: &IntegerStats,
        is_sample: bool,
        allowed_cascading: usize,
        excludes: &[IntCode],
    ) -> VortexResult<ArrayRef> {
        assert!(allowed_cascading > 0);

        // TODO(aduffy): we can be more prescriptive: we know that codes will EITHER be
        //    RLE or FOR + BP. Cascading probably wastes some time here.

        let dict = dictionary_encode(stats);

        // Cascade the codes child
        // Don't allow SequenceArray as the codes child as it merely adds extra indirection without actually compressing data.
        let mut new_excludes = vec![DICT_SCHEME, SEQUENCE_SCHEME];
        new_excludes.extend_from_slice(excludes);

        let compressed_codes = IntCompressor::compress_no_dict(
            &dict.codes().to_primitive()?,
            is_sample,
            allowed_cascading - 1,
            &new_excludes,
        )?;

        // SAFETY: compressing codes does not change their values
        unsafe {
            Ok(DictArray::new_unchecked(compressed_codes, dict.values().clone()).into_array())
        }
    }
}

impl Scheme for RunEndScheme {
    type StatsType = IntegerStats;
    type CodeType = IntCode;

    fn code(&self) -> IntCode {
        RUNEND_SCHEME
    }

    fn expected_compression_ratio(
        &self,
        stats: &IntegerStats,
        is_sample: bool,
        allowed_cascading: usize,
        excludes: &[IntCode],
    ) -> VortexResult<f64> {
        // If the average run length is below the threshold, skip run-end encoding.
        if stats.average_run_length < RUN_END_THRESHOLD {
            return Ok(0.0);
        }

        if allowed_cascading == 0 {
            return Ok(0.0);
        }

        // Run compression on a sample, see how it performs.
        estimate_compression_ratio_with_sampling(
            self,
            stats,
            is_sample,
            allowed_cascading,
            excludes,
        )
    }

    fn compress(
        &self,
        stats: &IntegerStats,
        is_sample: bool,
        allowed_cascading: usize,
        excludes: &[IntCode],
    ) -> VortexResult<ArrayRef> {
        assert!(allowed_cascading > 0);

        // Run-end encode the array into ends and values.
        let (ends, values) = runend_encode(&stats.src)?;

        let mut new_excludes = vec![RunEndScheme.code(), DictScheme.code()];
        new_excludes.extend_from_slice(excludes);

        let ends_stats = IntegerStats::generate_opts(
            &ends,
            GenerateStatsOptions {
                count_distinct_values: false,
            },
        );
        let ends_scheme = IntCompressor::choose_scheme(
            &ends_stats,
            is_sample,
            allowed_cascading - 1,
            &new_excludes,
        )?;
        let compressed_ends =
            ends_scheme.compress(&ends_stats, is_sample, allowed_cascading - 1, &new_excludes)?;

        let compressed_values = IntCompressor::compress_no_dict(
            &values.to_primitive()?,
            is_sample,
            allowed_cascading - 1,
            &new_excludes,
        )?;

        // SAFETY: compression doesn't affect invariants
        unsafe {
            Ok(
                RunEndArray::new_unchecked(compressed_ends, compressed_values, 0, stats.src.len())
                    .into_array(),
            )
        }
    }
}

impl Scheme for SequenceScheme {
    type StatsType = IntegerStats;
    type CodeType = IntCode;

    fn code(&self) -> Self::CodeType {
        SEQUENCE_SCHEME
    }

    fn expected_compression_ratio(
        &self,
        stats: &Self::StatsType,
        _is_sample: bool,
        _allowed_cascading: usize,
        _excludes: &[Self::CodeType],
    ) -> VortexResult<f64> {
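        // Sequence encoding represents an arithmetic progression with just a base and a multiplier.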
        if stats.null_count > 0 {
            return Ok(0.0);
        }
        // Since two values are required to store the base and the multiplier, the
        // compression ratio is the array length divided by 2.
        Ok(sequence_encode(&stats.src)?
            .map(|_| stats.src.len() as f64 / 2.0)
            .unwrap_or(0.0))
    }

    fn compress(
        &self,
        stats: &Self::StatsType,
        _is_sample: bool,
        _allowed_cascading: usize,
        _excludes: &[Self::CodeType],
    ) -> VortexResult<ArrayRef> {
        if stats.null_count > 0 {
            vortex_bail!("sequence encoding does not support nulls");
        }
        sequence_encode(&stats.src)?.ok_or_else(|| vortex_err!("cannot sequence encode array"))
    }
}

#[cfg(test)]
mod tests {
    use itertools::Itertools;
    use log::LevelFilter;
    use rand::rngs::StdRng;
    use rand::{RngCore, SeedableRng};
    use vortex_array::arrays::PrimitiveArray;
    use vortex_array::validity::Validity;
    use vortex_array::vtable::ValidityHelper;
    use vortex_array::{Array, IntoArray, ToCanonical};
    use vortex_buffer::{Buffer, BufferMut, buffer, buffer_mut};
    use vortex_dict::DictEncoding;
    use vortex_sequence::SequenceEncoding;
    use vortex_sparse::SparseEncoding;
    use vortex_utils::aliases::hash_set::HashSet;

    use crate::integer::{IntCompressor, IntegerStats, SequenceScheme, SparseScheme};
    use crate::{Compressor, CompressorStats, Scheme};

    #[test]
    fn test_empty() {
        // Make sure empty array compression does not fail
        let result = IntCompressor::compress(
            &PrimitiveArray::new(Buffer::<i32>::empty(), Validity::NonNullable),
            false,
            3,
            &[],
        )
        .unwrap();

        assert!(result.is_empty());
    }

    #[test]
    fn test_dict_encodable() {
        let mut codes = BufferMut::<i32>::with_capacity(65_535);
        // Write some short runs (0-4 elements) of a handful of different values, interrupted by
        // some one-off values.

        let numbers = [0, 10, 50, 100, 1000, 3000]
            .into_iter()
            .map(|i| 1234 * i)
            .collect_vec();

        let mut rng = StdRng::seed_from_u64(1u64);
        while codes.len() < 64000 {
            let run_length = rng.next_u32() % 5;
            let value = numbers[rng.next_u32() as usize % numbers.len()];
            for _ in 0..run_length {
                codes.push(value);
            }
        }

        let primitive = codes.freeze().into_array().to_primitive().unwrap();
        let compressed = IntCompressor::compress(&primitive, false, 3, &[]).unwrap();
        assert_eq!(compressed.encoding_id(), DictEncoding.id());
    }

    #[test]
    fn test_window_name() {
        env_logger::builder()
            .filter(None, LevelFilter::Debug)
            .try_init()
            .ok();

        // A test that's meant to mirror the WindowName column from ClickBench.
        let mut values = buffer_mut![-1i32; 1_000_000];
        let mut visited = HashSet::new();
        let mut rng = StdRng::seed_from_u64(1u64);
        while visited.len() < 223 {
            let random = (rng.next_u32() as usize) % 1_000_000;
            if visited.contains(&random) {
                continue;
            }
            visited.insert(random);
            // Insert one of 100 possible values at this position.
            values[random] = 5 * (rng.next_u64() % 100) as i32;
        }

        let array = values.freeze().into_array().to_primitive().unwrap();
        let compressed = IntCompressor::compress(&array, false, 3, &[]).unwrap();
        log::info!("WindowName compressed: {}", compressed.display_tree());
    }

    #[test]
    fn sparse_with_nulls() {
        let array = PrimitiveArray::new(
            buffer![189u8, 189, 189, 0, 46],
            Validity::from_iter(vec![true, true, true, true, false]),
        );
        let compressed = SparseScheme
            .compress(&IntegerStats::generate(&array), false, 3, &[])
            .unwrap();
        assert_eq!(compressed.encoding_id(), SparseEncoding.id());
        let decoded = compressed.to_primitive().unwrap();
        let expected = [189u8, 189, 189, 0, 0];
        assert_eq!(decoded.as_slice::<u8>(), &expected);
        assert_eq!(decoded.validity(), array.validity());
    }

    #[test]
    fn sparse_mostly_nulls() {
        let array = PrimitiveArray::new(
            buffer![189u8, 189, 189, 189, 189, 189, 189, 189, 189, 0, 46],
            Validity::from_iter(vec![
                false, false, false, false, false, false, false, false, false, false, true,
            ]),
        );
        let compressed = SparseScheme
            .compress(&IntegerStats::generate(&array), false, 3, &[])
            .unwrap();
        assert_eq!(compressed.encoding_id(), SparseEncoding.id());
        let decoded = compressed.to_primitive().unwrap();
        let expected = [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 46];
        assert_eq!(decoded.as_slice::<u8>(), &expected);
        assert_eq!(decoded.validity(), array.validity());
    }

    #[test]
    fn nullable_sequence() {
        let values = (0i32..20).step_by(7).collect_vec();
        let array = PrimitiveArray::from_option_iter(values.clone().into_iter().map(Some));
        let compressed = SequenceScheme
            .compress(&IntegerStats::generate(&array), false, 3, &[])
            .unwrap();
        assert_eq!(compressed.encoding_id(), SequenceEncoding.id());
        let decoded = compressed.to_primitive().unwrap();
        assert_eq!(decoded.as_slice::<i32>(), values.as_slice());
858    }
859}