vortex_array/compress.rs

// FIXME(ngates): move this file into the compressor
use vortex_error::VortexResult;

use crate::aliases::hash_set::HashSet;
use crate::stats::PRUNING_STATS;
use crate::{Array, ArrayRef, EncodingId};

/// Extendable compression interface, allowing implementations to explore different choices.
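///
/// # Example
///
/// A minimal sketch of an implementing type, using a hypothetical `Uncompressed` strategy
/// that introduces no new encodings (illustrative only, not a real API):
///
/// ```ignore
/// struct Uncompressed;
///
/// impl CompressionStrategy for Uncompressed {
///     fn compress(&self, array: &dyn Array) -> VortexResult<ArrayRef> {
///         // Inspect `array`, pick an encoding, and return the (possibly unchanged) result.
///         todo!("return the input, or a re-encoded copy, as an ArrayRef")
///     }
///
///     fn used_encodings(&self) -> HashSet<EncodingId> {
///         // This strategy never introduces additional encodings.
///         HashSet::new()
///     }
/// }
/// ```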
pub trait CompressionStrategy {
    /// Compress the input array.
    fn compress(&self, array: &dyn Array) -> VortexResult<ArrayRef>;

    /// A set of the IDs of the encodings the compressor can choose from.
    fn used_encodings(&self) -> HashSet<EncodingId>;
}

/// Check that compression did not alter the length of the validity mask.
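///
/// A sketch of how a compressor might run the debug checks right after compressing
/// (the `strategy` and `array` bindings are assumed to exist in the caller):
///
/// ```ignore
/// let compressed = strategy.compress(array)?;
/// check_validity_unchanged(array, compressed.as_ref());
/// check_dtype_unchanged(array, compressed.as_ref());
/// check_statistics_unchanged(array, compressed.as_ref());
/// ```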
pub fn check_validity_unchanged(arr: &dyn Array, compressed: &dyn Array) {
    // Suppress unused-argument warnings when debug assertions are disabled.
    let _ = arr;
    let _ = compressed;
    #[cfg(debug_assertions)]
    {
        use vortex_error::VortexExpect;

        let old_validity = arr
            .validity_mask()
            .vortex_expect("failed to compute validity")
            .len();
        let new_validity = compressed
            .validity_mask()
            .vortex_expect("failed to compute validity")
            .len();

        debug_assert!(
            old_validity == new_validity,
            "validity mask length changed after compression: {old_validity} -> {new_validity}\nFrom tree: {}\nTo tree: {}",
            arr.tree_display(),
            compressed.tree_display()
        );
    }
}

/// Check that compression did not alter the dtype.
pub fn check_dtype_unchanged(arr: &dyn Array, compressed: &dyn Array) {
    let _ = arr;
    let _ = compressed;
    #[cfg(debug_assertions)]
    {
        debug_assert!(
            arr.dtype() == compressed.dtype(),
            "Compression changed dtype: {} -> {}\nFrom array: {}\nInto array: {}",
            arr.dtype(),
            compressed.dtype(),
            arr.tree_display(),
            compressed.tree_display(),
        );
    }
}

/// Check that compression preserved the statistics.
pub fn check_statistics_unchanged(arr: &dyn Array, compressed: &dyn Array) {
    let _ = arr;
    let _ = compressed;
    #[cfg(debug_assertions)]
    {
        use crate::stats::StatsProviderExt;

        // Run count merge_ordered assumes that the run is "broken" on each chunk, which is a
        // useful estimate but not guaranteed to be correct.
        for (stat, value) in arr.statistics().to_owned().into_iter() {
            if let Some(dtype) = stat.dtype(compressed.dtype()) {
                let compressed_scalar = compressed.statistics().get_scalar(stat, &dtype);
                debug_assert_eq!(
                    compressed_scalar,
                    Some(value.clone().into_scalar(dtype)),
                    "Compression changed {stat} from {value} to {:?}",
                    compressed_scalar.as_ref(),
                );
            }
        }
    }
}

/// Eagerly compute certain statistics (i.e., the pruning stats plus `UncompressedSizeInBytes`)
/// for an array. This function is intended to be called by compressors immediately before
/// compression occurs.
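///
/// A sketch of the intended call site, assuming a `strategy: &dyn CompressionStrategy` and an
/// `array: &dyn Array` in scope (names are illustrative):
///
/// ```ignore
/// compute_precompression_stats(array)?;
/// let compressed = strategy.compress(array)?;
/// ```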
pub fn compute_precompression_stats(arr: &dyn Array) -> VortexResult<()> {
    arr.statistics().compute_uncompressed_size_in_bytes();
    arr.statistics().compute_all(PRUNING_STATS).map(|_| ())
}