vortex_sampling_compressor/compressors/
struct_.rs

1use itertools::Itertools;
2use vortex_array::aliases::hash_set::HashSet;
3use vortex_array::array::{StructArray, StructEncoding};
4use vortex_array::compress::compute_precompression_stats;
5use vortex_array::variants::StructArrayTrait;
6use vortex_array::{Array, Encoding, EncodingId, IntoArray};
7use vortex_dtype::DType;
8use vortex_error::VortexResult;
9
10use crate::compressors::{CompressedArray, CompressionTree, EncodingCompressor};
11use crate::{constants, SamplingCompressor};
12
13#[derive(Debug)]
14pub struct StructCompressor;
15
16impl EncodingCompressor for StructCompressor {
17    fn id(&self) -> &str {
18        StructEncoding::ID.as_ref()
19    }
20
21    fn cost(&self) -> u8 {
22        constants::STRUCT_COST
23    }
24
25    fn can_compress(&self, array: &Array) -> Option<&dyn EncodingCompressor> {
26        let is_struct =
27            matches!(array.dtype(), DType::Struct(..)) && array.is_encoding(StructEncoding::ID);
28        is_struct.then_some(self)
29    }
30
31    fn compress<'a>(
32        &'a self,
33        array: &Array,
34        like: Option<CompressionTree<'a>>,
35        ctx: SamplingCompressor<'a>,
36    ) -> VortexResult<CompressedArray<'a>> {
37        let array = StructArray::try_from(array.clone())?;
38        let compressed_validity = ctx.compress_validity(array.validity())?;
39
40        let children_trees = match like {
41            Some(tree) => tree.children,
42            None => vec![None; array.nfields()],
43        };
44
45        let (arrays, trees) = array
46            .children()
47            .zip_eq(children_trees)
48            .map(|(array, like)| {
49                // these are extremely valuable when reading/writing, but are potentially much more expensive
50                // to compute post-compression. That's because not all encodings implement stats, so we would
51                // potentially have to canonicalize during writes just to get stats, which would be silly.
52                // Also, we only really require them for column chunks, not for every array.
53                compute_precompression_stats(&array)?;
54                ctx.compress(&array, like.as_ref())
55            })
56            .process_results(|iter| iter.map(|x| (x.array, x.path)).unzip())?;
57
58        Ok(CompressedArray::compressed(
59            StructArray::try_new(
60                array.names().clone(),
61                arrays,
62                array.len(),
63                compressed_validity,
64            )?
65            .into_array(),
66            Some(CompressionTree::new(self, trees)),
67            array,
68        ))
69    }
70
71    fn used_encodings(&self) -> HashSet<EncodingId> {
72        HashSet::from([StructEncoding::ID])
73    }
74}