vortex_sampling_compressor/compressors/
dict.rs

1use vortex_array::aliases::hash_set::HashSet;
2use vortex_array::array::{PrimitiveEncoding, VarBinEncoding, VarBinViewEncoding};
3use vortex_array::{Array, Encoding, EncodingId, IntoArray};
4use vortex_dict::{dict_encode, DictArray, DictEncoding};
5use vortex_error::VortexResult;
6
7use crate::compressors::{CompressedArray, CompressionTree, EncodingCompressor};
8use crate::downscale::downscale_integer_array;
9use crate::{constants, SamplingCompressor};
10
/// [`EncodingCompressor`] implementation that dictionary-encodes an array and
/// emits a `DictArray` built from separately compressed codes and values.
///
/// The type carries no state; all behaviour lives in its `EncodingCompressor` impl.
#[derive(Debug)]
pub struct DictCompressor;
13
14impl EncodingCompressor for DictCompressor {
15    fn id(&self) -> &str {
16        DictEncoding::ID.as_ref()
17    }
18
19    fn cost(&self) -> u8 {
20        constants::DICT_COST
21    }
22
23    fn can_compress(&self, array: &Array) -> Option<&dyn EncodingCompressor> {
24        if !array.is_encoding(PrimitiveEncoding::ID)
25            && !array.is_encoding(VarBinEncoding::ID)
26            && !array.is_encoding(VarBinViewEncoding::ID)
27        {
28            return None;
29        };
30
31        // No point dictionary coding if the array is unique.
32        // We don't have a unique stat yet, but strict-sorted implies unique.
33        if array
34            .statistics()
35            .compute_is_strict_sorted()
36            .unwrap_or(false)
37        {
38            return None;
39        }
40
41        Some(self)
42    }
43
44    fn compress<'a>(
45        &'a self,
46        array: &Array,
47        like: Option<CompressionTree<'a>>,
48        ctx: SamplingCompressor<'a>,
49    ) -> VortexResult<CompressedArray<'a>> {
50        let dict = dict_encode(array)?;
51        let codes = dict.codes();
52        let values = dict.values();
53
54        let (codes, values) = (
55            ctx.auxiliary("codes").excluding(self).compress(
56                &downscale_integer_array(codes)?,
57                like.as_ref().and_then(|l| l.child(0)),
58            )?,
59            ctx.named("values")
60                .excluding(self)
61                .compress(&values, like.as_ref().and_then(|l| l.child(1)))?,
62        );
63
64        Ok(CompressedArray::compressed(
65            DictArray::try_new(codes.array, values.array)?.into_array(),
66            Some(CompressionTree::new(self, vec![codes.path, values.path])),
67            array,
68        ))
69    }
70
71    fn used_encodings(&self) -> HashSet<EncodingId> {
72        HashSet::from([DictEncoding::ID])
73    }
74}