1pub mod dictionary;
5mod stats;
6
7use std::fmt::Debug;
8use std::hash::Hash;
9
10pub use stats::IntegerStats;
11use vortex_array::arrays::{ConstantArray, PrimitiveArray, PrimitiveVTable};
12use vortex_array::compress::downscale_integer_array;
13use vortex_array::{ArrayRef, IntoArray, ToCanonical};
14use vortex_dict::DictArray;
15use vortex_error::{VortexExpect, VortexResult, VortexUnwrap, vortex_bail, vortex_err};
16use vortex_fastlanes::{FoRArray, bit_width_histogram, bitpack_encode, find_best_bit_width};
17use vortex_runend::RunEndArray;
18use vortex_runend::compress::runend_encode;
19use vortex_scalar::Scalar;
20use vortex_sequence::sequence_encode;
21use vortex_sparse::{SparseArray, SparseVTable};
22use vortex_zigzag::{ZigZagArray, zigzag_encode};
23
24use crate::integer::dictionary::dictionary_encode;
25use crate::patches::compress_patches;
26use crate::{
27 Compressor, CompressorStats, GenerateStatsOptions, Scheme,
28 estimate_compression_ratio_with_sampling,
29};
30
31pub struct IntCompressor;
32
33impl Compressor for IntCompressor {
34 type ArrayVTable = PrimitiveVTable;
35 type SchemeType = dyn IntegerScheme;
36 type StatsType = IntegerStats;
37
38 fn schemes() -> &'static [&'static dyn IntegerScheme] {
39 &[
40 &ConstantScheme,
41 &FORScheme,
42 &ZigZagScheme,
43 &BitPackingScheme,
44 &SparseScheme,
45 &DictScheme,
46 &RunEndScheme,
47 &SequenceScheme,
48 ]
49 }
50
51 fn default_scheme() -> &'static Self::SchemeType {
52 &UncompressedScheme
53 }
54
55 fn dict_scheme_code() -> IntCode {
56 DICT_SCHEME
57 }
58}
59
60impl IntCompressor {
61 pub fn compress_no_dict(
62 array: &PrimitiveArray,
63 is_sample: bool,
64 allowed_cascading: usize,
65 excludes: &[IntCode],
66 ) -> VortexResult<ArrayRef> {
67 let stats = IntegerStats::generate_opts(
68 array,
69 GenerateStatsOptions {
70 count_distinct_values: false,
71 },
72 );
73
74 let scheme = Self::choose_scheme(&stats, is_sample, allowed_cascading, excludes)?;
75 let output = scheme.compress(&stats, is_sample, allowed_cascading, excludes)?;
76
77 if output.nbytes() < array.nbytes() {
78 Ok(output)
79 } else {
80 log::debug!("resulting tree too large: {}", output.display_tree());
81 Ok(array.to_array())
82 }
83 }
84}
85
86pub trait IntegerScheme: Scheme<StatsType = IntegerStats, CodeType = IntCode> {}
87
88impl<T> IntegerScheme for T where T: Scheme<StatsType = IntegerStats, CodeType = IntCode> {}
90
91#[derive(Debug, Copy, Clone, Eq, PartialEq, Hash)]
92pub struct IntCode(u8);
93
94const UNCOMPRESSED_SCHEME: IntCode = IntCode(0);
95const CONSTANT_SCHEME: IntCode = IntCode(1);
96const FOR_SCHEME: IntCode = IntCode(2);
97const ZIGZAG_SCHEME: IntCode = IntCode(3);
98const BITPACKING_SCHEME: IntCode = IntCode(4);
99const SPARSE_SCHEME: IntCode = IntCode(5);
100const DICT_SCHEME: IntCode = IntCode(6);
101const RUNEND_SCHEME: IntCode = IntCode(7);
102const SEQUENCE_SCHEME: IntCode = IntCode(8);
103
104#[derive(Debug, Copy, Clone)]
105pub struct UncompressedScheme;
106
107#[derive(Debug, Copy, Clone)]
108pub struct ConstantScheme;
109
110#[derive(Debug, Copy, Clone)]
111pub struct FORScheme;
112
113#[derive(Debug, Copy, Clone)]
114pub struct ZigZagScheme;
115
116#[derive(Debug, Copy, Clone)]
117pub struct BitPackingScheme;
118
119#[derive(Debug, Copy, Clone)]
120pub struct SparseScheme;
121
122#[derive(Debug, Copy, Clone)]
123pub struct DictScheme;
124
125#[derive(Debug, Copy, Clone)]
126pub struct RunEndScheme;
127
128#[derive(Debug, Copy, Clone)]
129pub struct SequenceScheme;
130
131const RUN_END_THRESHOLD: u32 = 4;
133
134impl Scheme for UncompressedScheme {
135 type StatsType = IntegerStats;
136 type CodeType = IntCode;
137
138 fn code(&self) -> IntCode {
139 UNCOMPRESSED_SCHEME
140 }
141
142 fn expected_compression_ratio(
143 &self,
144 _stats: &IntegerStats,
145 _is_sample: bool,
146 _allowed_cascading: usize,
147 _excludes: &[IntCode],
148 ) -> VortexResult<f64> {
149 Ok(1.0)
151 }
152
153 fn compress(
154 &self,
155 stats: &IntegerStats,
156 _is_sample: bool,
157 _allowed_cascading: usize,
158 _excludes: &[IntCode],
159 ) -> VortexResult<ArrayRef> {
160 Ok(stats.source().to_array())
161 }
162}
163
164impl Scheme for ConstantScheme {
165 type StatsType = IntegerStats;
166 type CodeType = IntCode;
167
168 fn code(&self) -> IntCode {
169 CONSTANT_SCHEME
170 }
171
172 fn is_constant(&self) -> bool {
173 true
174 }
175
176 fn expected_compression_ratio(
177 &self,
178 stats: &IntegerStats,
179 is_sample: bool,
180 _allowed_cascading: usize,
181 _excludes: &[IntCode],
182 ) -> VortexResult<f64> {
183 if is_sample {
185 return Ok(0.0);
186 }
187
188 if stats.distinct_values_count != 1 {
190 return Ok(0.0);
191 }
192
193 if stats.null_count > 0 && stats.value_count > 0 {
195 return Ok(0.0);
196 }
197
198 Ok(stats.value_count as f64)
199 }
200
201 fn compress(
202 &self,
203 stats: &IntegerStats,
204 _is_sample: bool,
205 _allowed_cascading: usize,
206 _excludes: &[IntCode],
207 ) -> VortexResult<ArrayRef> {
208 let scalar = stats
211 .source()
212 .as_constant()
213 .vortex_expect("constant array expected");
214
215 Ok(ConstantArray::new(scalar, stats.src.len()).into_array())
216 }
217}
218
219impl Scheme for FORScheme {
220 type StatsType = IntegerStats;
221 type CodeType = IntCode;
222
223 fn code(&self) -> IntCode {
224 FOR_SCHEME
225 }
226
227 fn expected_compression_ratio(
228 &self,
229 stats: &IntegerStats,
230 _is_sample: bool,
231 allowed_cascading: usize,
232 _excludes: &[IntCode],
233 ) -> VortexResult<f64> {
234 if allowed_cascading == 0 {
236 return Ok(0.0);
237 }
238
239 if stats.value_count == 0 {
241 return Ok(0.0);
242 }
243
244 if stats.typed.min_is_zero() {
246 return Ok(0.0);
247 }
248
249 let full_width: u32 = stats.src.ptype().bit_width().try_into().vortex_unwrap();
251 let bw = match stats.typed.max_minus_min().checked_ilog2() {
252 Some(l) => l + 1,
253 None => return Ok(0.0),
256 };
257
258 if full_width - bw < 8 {
260 return Ok(0.0);
261 }
262
263 Ok(full_width as f64 / bw as f64)
264 }
265
266 fn compress(
267 &self,
268 stats: &IntegerStats,
269 is_sample: bool,
270 _allowed_cascading: usize,
271 excludes: &[IntCode],
272 ) -> VortexResult<ArrayRef> {
273 let for_array = FoRArray::encode(stats.src.clone())?;
274 let biased = for_array.encoded().to_primitive()?;
275 let biased_stats = IntegerStats::generate_opts(
276 &biased,
277 GenerateStatsOptions {
278 count_distinct_values: false,
279 },
280 );
281
282 let compressed = BitPackingScheme.compress(&biased_stats, is_sample, 0, excludes)?;
287
288 Ok(FoRArray::try_new(compressed, for_array.reference_scalar().clone())?.into_array())
289 }
290}
291
292impl Scheme for ZigZagScheme {
293 type StatsType = IntegerStats;
294 type CodeType = IntCode;
295
296 fn code(&self) -> IntCode {
297 ZIGZAG_SCHEME
298 }
299
300 fn expected_compression_ratio(
301 &self,
302 stats: &IntegerStats,
303 is_sample: bool,
304 allowed_cascading: usize,
305 excludes: &[IntCode],
306 ) -> VortexResult<f64> {
307 if allowed_cascading == 0 {
309 return Ok(0.0);
310 }
311
312 if stats.value_count == 0 {
314 return Ok(0.0);
315 }
316
317 if !stats.typed.min_is_negative() {
319 return Ok(0.0);
320 }
321
322 estimate_compression_ratio_with_sampling(
324 self,
325 stats,
326 is_sample,
327 allowed_cascading,
328 excludes,
329 )
330 }
331
332 fn compress(
333 &self,
334 stats: &IntegerStats,
335 is_sample: bool,
336 allowed_cascading: usize,
337 excludes: &[IntCode],
338 ) -> VortexResult<ArrayRef> {
339 let zag = zigzag_encode(stats.src.clone())?;
341 let encoded = zag.encoded().to_primitive()?;
342
343 let mut new_excludes = vec![
346 ZigZagScheme.code(),
347 DictScheme.code(),
348 RunEndScheme.code(),
349 SparseScheme.code(),
350 ];
351 new_excludes.extend_from_slice(excludes);
352
353 let compressed =
354 IntCompressor::compress(&encoded, is_sample, allowed_cascading - 1, &new_excludes)?;
355
356 log::debug!("zigzag output: {}", compressed.display_tree());
357
358 Ok(ZigZagArray::try_new(compressed)?.into_array())
359 }
360}
361
362impl Scheme for BitPackingScheme {
363 type StatsType = IntegerStats;
364 type CodeType = IntCode;
365
366 fn code(&self) -> IntCode {
367 BITPACKING_SCHEME
368 }
369
370 #[allow(clippy::cast_possible_truncation)]
371 fn expected_compression_ratio(
372 &self,
373 stats: &IntegerStats,
374 is_sample: bool,
375 allowed_cascading: usize,
376 excludes: &[IntCode],
377 ) -> VortexResult<f64> {
378 if stats.typed.min_is_negative() {
380 return Ok(0.0);
381 }
382
383 if stats.value_count == 0 {
385 return Ok(0.0);
386 }
387
388 estimate_compression_ratio_with_sampling(
389 self,
390 stats,
391 is_sample,
392 allowed_cascading,
393 excludes,
394 )
395 }
396
397 #[allow(clippy::cast_possible_truncation)]
398 fn compress(
399 &self,
400 stats: &IntegerStats,
401 _is_sample: bool,
402 _allowed_cascading: usize,
403 _excludes: &[IntCode],
404 ) -> VortexResult<ArrayRef> {
405 let histogram = bit_width_histogram(stats.source())?;
406 let bw = find_best_bit_width(stats.source().ptype(), &histogram)?;
407 if bw as usize == stats.source().ptype().bit_width() {
409 return Ok(stats.source().clone().into_array());
410 }
411 let mut packed = bitpack_encode(stats.source(), bw, Some(&histogram))?;
412
413 let patches = packed.patches().map(compress_patches).transpose()?;
414 packed.replace_patches(patches);
415
416 Ok(packed.into_array())
417 }
418}
419
420impl Scheme for SparseScheme {
421 type StatsType = IntegerStats;
422 type CodeType = IntCode;
423
424 fn code(&self) -> IntCode {
425 SPARSE_SCHEME
426 }
427
428 fn expected_compression_ratio(
430 &self,
431 stats: &IntegerStats,
432 _is_sample: bool,
433 _allowed_cascading: usize,
434 _excludes: &[IntCode],
435 ) -> VortexResult<f64> {
436 if stats.value_count == 0 {
437 return Ok(0.0);
439 }
440
441 if stats.null_count as f64 / stats.src.len() as f64 > 0.9 {
443 return Ok(stats.src.len() as f64 / stats.value_count as f64);
444 }
445
446 let (_, top_count) = stats.typed.top_value_and_count();
448
449 if top_count == stats.value_count {
450 return Ok(0.0);
452 }
453
454 let freq = top_count as f64 / stats.value_count as f64;
455 if freq >= 0.9 {
456 return Ok(stats.value_count as f64 / (stats.value_count - top_count) as f64);
458 }
459
460 Ok(0.0)
461 }
462
463 fn compress(
464 &self,
465 stats: &IntegerStats,
466 is_sample: bool,
467 allowed_cascading: usize,
468 excludes: &[IntCode],
469 ) -> VortexResult<ArrayRef> {
470 assert!(allowed_cascading > 0);
471 let (top_pvalue, top_count) = stats.typed.top_value_and_count();
472 if top_count as usize == stats.src.len() {
473 return Ok(ConstantArray::new(
475 Scalar::primitive_value(
476 top_pvalue,
477 top_pvalue.ptype(),
478 stats.src.dtype().nullability(),
479 ),
480 stats.src.len(),
481 )
482 .into_array());
483 }
484
485 let sparse_encoded = SparseArray::encode(
486 stats.src.as_ref(),
487 Some(Scalar::primitive_value(
488 top_pvalue,
489 top_pvalue.ptype(),
490 stats.src.dtype().nullability(),
491 )),
492 )?;
493
494 if let Some(sparse) = sparse_encoded.as_opt::<SparseVTable>() {
495 let mut new_excludes = vec![SparseScheme.code()];
497 new_excludes.extend_from_slice(excludes);
498
499 let compressed_values = IntCompressor::compress_no_dict(
500 &sparse.patches().values().to_primitive()?,
501 is_sample,
502 allowed_cascading - 1,
503 &new_excludes,
504 )?;
505
506 let indices =
507 downscale_integer_array(sparse.patches().indices().clone())?.to_primitive()?;
508
509 let compressed_indices = IntCompressor::compress_no_dict(
510 &indices,
511 is_sample,
512 allowed_cascading - 1,
513 &new_excludes,
514 )?;
515
516 SparseArray::try_new(
517 compressed_indices,
518 compressed_values,
519 sparse.len(),
520 sparse.fill_scalar().clone(),
521 )
522 .map(|a| a.into_array())
523 } else {
524 Ok(sparse_encoded)
525 }
526 }
527}
528
529impl Scheme for DictScheme {
530 type StatsType = IntegerStats;
531 type CodeType = IntCode;
532
533 fn code(&self) -> IntCode {
534 DICT_SCHEME
535 }
536
537 fn expected_compression_ratio(
538 &self,
539 stats: &IntegerStats,
540 _is_sample: bool,
541 allowed_cascading: usize,
542 _excludes: &[IntCode],
543 ) -> VortexResult<f64> {
544 if allowed_cascading == 0 {
546 return Ok(0.0);
547 }
548
549 if stats.value_count == 0 {
550 return Ok(0.0);
551 }
552
553 if stats.distinct_values_count > stats.value_count / 2 {
555 return Ok(0.0);
556 }
557
558 let values_size = stats.source().ptype().bit_width() * stats.distinct_values_count as usize;
560
561 let codes_bw = usize::BITS - stats.distinct_values_count.leading_zeros();
563
564 let n_runs = stats.value_count / stats.average_run_length;
565
566 let codes_size_bp = (codes_bw * stats.value_count) as usize;
568 let codes_size_rle_bp = (codes_bw + 32) * n_runs;
569
570 let codes_size = usize::min(codes_size_bp, codes_size_rle_bp as usize);
571
572 let before = stats.value_count as usize * stats.source().ptype().bit_width();
573
574 Ok(before as f64 / (values_size + codes_size) as f64)
575 }
576
577 fn compress(
578 &self,
579 stats: &IntegerStats,
580 is_sample: bool,
581 allowed_cascading: usize,
582 excludes: &[IntCode],
583 ) -> VortexResult<ArrayRef> {
584 assert!(allowed_cascading > 0);
585
586 let dict = dictionary_encode(stats)?;
590
591 let mut new_excludes = vec![DICT_SCHEME, SEQUENCE_SCHEME];
594 new_excludes.extend_from_slice(excludes);
595
596 let compressed_codes = IntCompressor::compress_no_dict(
597 &dict.codes().to_primitive()?,
598 is_sample,
599 allowed_cascading - 1,
600 &new_excludes,
601 )?;
602
603 Ok(DictArray::try_new(compressed_codes, dict.values().clone())?.into_array())
604 }
605}
606
607impl Scheme for RunEndScheme {
608 type StatsType = IntegerStats;
609 type CodeType = IntCode;
610
611 fn code(&self) -> IntCode {
612 RUNEND_SCHEME
613 }
614
615 fn expected_compression_ratio(
616 &self,
617 stats: &IntegerStats,
618 is_sample: bool,
619 allowed_cascading: usize,
620 excludes: &[IntCode],
621 ) -> VortexResult<f64> {
622 if stats.average_run_length < RUN_END_THRESHOLD {
624 return Ok(0.0);
625 }
626
627 if allowed_cascading == 0 {
628 return Ok(0.0);
629 }
630
631 estimate_compression_ratio_with_sampling(
633 self,
634 stats,
635 is_sample,
636 allowed_cascading,
637 excludes,
638 )
639 }
640
641 fn compress(
642 &self,
643 stats: &IntegerStats,
644 is_sample: bool,
645 allowed_cascading: usize,
646 excludes: &[IntCode],
647 ) -> VortexResult<ArrayRef> {
648 assert!(allowed_cascading > 0);
649
650 let (ends, values) = runend_encode(&stats.src)?;
652
653 let mut new_excludes = vec![RunEndScheme.code(), DictScheme.code()];
654 new_excludes.extend_from_slice(excludes);
655
656 let ends_stats = IntegerStats::generate_opts(
657 &ends,
658 GenerateStatsOptions {
659 count_distinct_values: false,
660 },
661 );
662 let ends_scheme = IntCompressor::choose_scheme(
663 &ends_stats,
664 is_sample,
665 allowed_cascading - 1,
666 &new_excludes,
667 )?;
668 let compressed_ends =
669 ends_scheme.compress(&ends_stats, is_sample, allowed_cascading - 1, &new_excludes)?;
670
671 let compressed_values = IntCompressor::compress_no_dict(
672 &values.to_primitive()?,
673 is_sample,
674 allowed_cascading - 1,
675 &new_excludes,
676 )?;
677
678 Ok(RunEndArray::try_new(compressed_ends, compressed_values)?.into_array())
679 }
680}
681
682impl Scheme for SequenceScheme {
683 type StatsType = IntegerStats;
684 type CodeType = IntCode;
685
686 fn code(&self) -> Self::CodeType {
687 SEQUENCE_SCHEME
688 }
689
690 fn expected_compression_ratio(
691 &self,
692 stats: &Self::StatsType,
693 _is_sample: bool,
694 _allowed_cascading: usize,
695 _excludes: &[Self::CodeType],
696 ) -> VortexResult<f64> {
697 if stats.null_count > 0 {
698 return Ok(0.0);
699 }
700 Ok(sequence_encode(&stats.src)?
703 .map(|_| stats.src.len() as f64 / 2.0)
704 .unwrap_or(0.0))
705 }
706
707 fn compress(
708 &self,
709 stats: &Self::StatsType,
710 _is_sample: bool,
711 _allowed_cascading: usize,
712 _excludes: &[Self::CodeType],
713 ) -> VortexResult<ArrayRef> {
714 if stats.null_count > 0 {
715 vortex_bail!("sequence encoding does not support nulls");
716 }
717 sequence_encode(&stats.src)?.ok_or_else(|| vortex_err!("cannot sequence encode array"))
718 }
719}
720
721#[cfg(test)]
722mod tests {
723 use itertools::Itertools;
724 use log::LevelFilter;
725 use rand::rngs::StdRng;
726 use rand::{RngCore, SeedableRng};
727 use vortex_array::arrays::PrimitiveArray;
728 use vortex_array::validity::Validity;
729 use vortex_array::vtable::ValidityHelper;
730 use vortex_array::{Array, IntoArray, ToCanonical};
731 use vortex_buffer::{Buffer, BufferMut, buffer, buffer_mut};
732 use vortex_dict::DictEncoding;
733 use vortex_sequence::SequenceEncoding;
734 use vortex_sparse::SparseEncoding;
735 use vortex_utils::aliases::hash_set::HashSet;
736
737 use crate::integer::{IntCompressor, IntegerStats, SequenceScheme, SparseScheme};
738 use crate::{Compressor, CompressorStats, Scheme};
739
740 #[test]
741 fn test_empty() {
742 let result = IntCompressor::compress(
744 &PrimitiveArray::new(Buffer::<i32>::empty(), Validity::NonNullable),
745 false,
746 3,
747 &[],
748 )
749 .unwrap();
750
751 assert!(result.is_empty());
752 }
753
754 #[test]
755 fn test_dict_encodable() {
756 let mut codes = BufferMut::<i32>::with_capacity(65_535);
757 let numbers = [0, 10, 50, 100, 1000, 3000]
761 .into_iter()
762 .map(|i| 1234 * i)
763 .collect_vec();
764
765 let mut rng = StdRng::seed_from_u64(1u64);
766 while codes.len() < 64000 {
767 let run_length = rng.next_u32() % 5;
768 let value = numbers[rng.next_u32() as usize % numbers.len()];
769 for _ in 0..run_length {
770 codes.push(value);
771 }
772 }
773
774 let primitive = codes.freeze().into_array().to_primitive().unwrap();
775 let compressed = IntCompressor::compress(&primitive, false, 3, &[]).unwrap();
776 assert_eq!(compressed.encoding_id(), DictEncoding.id());
777 }
778
779 #[test]
780 fn test_window_name() {
781 env_logger::builder()
782 .filter(None, LevelFilter::Debug)
783 .try_init()
784 .ok();
785
786 let mut values = buffer_mut![-1i32; 1_000_000];
788 let mut visited = HashSet::new();
789 let mut rng = StdRng::seed_from_u64(1u64);
790 while visited.len() < 223 {
791 let random = (rng.next_u32() as usize) % 1_000_000;
792 if visited.contains(&random) {
793 continue;
794 }
795 visited.insert(random);
796 values[random] = 5 * (rng.next_u64() % 100) as i32;
798 }
799
800 let array = values.freeze().into_array().to_primitive().unwrap();
801 let compressed = IntCompressor::compress(&array, false, 3, &[]).unwrap();
802 log::info!("WindowName compressed: {}", compressed.display_tree());
803 }
804
805 #[test]
806 fn sparse_with_nulls() {
807 let array = PrimitiveArray::new(
808 buffer![189u8, 189, 189, 0, 46],
809 Validity::from_iter(vec![true, true, true, true, false]),
810 );
811 let compressed = SparseScheme
812 .compress(&IntegerStats::generate(&array), false, 3, &[])
813 .unwrap();
814 assert_eq!(compressed.encoding_id(), SparseEncoding.id());
815 let decoded = compressed.to_primitive().unwrap();
816 let expected = [189u8, 189, 189, 0, 0];
817 assert_eq!(decoded.as_slice::<u8>(), &expected);
818 assert_eq!(decoded.validity(), array.validity());
819 }
820
821 #[test]
822 fn sparse_mostly_nulls() {
823 let array = PrimitiveArray::new(
824 buffer![189u8, 189, 189, 189, 189, 189, 189, 189, 189, 0, 46],
825 Validity::from_iter(vec![
826 false, false, false, false, false, false, false, false, false, false, true,
827 ]),
828 );
829 let compressed = SparseScheme
830 .compress(&IntegerStats::generate(&array), false, 3, &[])
831 .unwrap();
832 assert_eq!(compressed.encoding_id(), SparseEncoding.id());
833 let decoded = compressed.to_primitive().unwrap();
834 let expected = [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 46];
835 assert_eq!(decoded.as_slice::<u8>(), &expected);
836 assert_eq!(decoded.validity(), array.validity());
837 }
838
839 #[test]
840 fn nullable_sequence() {
841 let values = (0i32..20).step_by(7).collect_vec();
842 let array = PrimitiveArray::from_option_iter(values.clone().into_iter().map(Some));
843 let compressed = SequenceScheme
844 .compress(&IntegerStats::generate(&array), false, 3, &[])
845 .unwrap();
846 assert_eq!(compressed.encoding_id(), SequenceEncoding.id());
847 let decoded = compressed.to_primitive().unwrap();
848 assert_eq!(decoded.as_slice::<i32>(), values.as_slice());
849 }
850}