Skip to main content

vortex_btrblocks/schemes/string/
sparse.rs

1// SPDX-License-Identifier: Apache-2.0
2// SPDX-FileCopyrightText: Copyright the Vortex contributors
3
4//! Sparse encoding for null-dominated string arrays.
5
6use vortex_array::ArrayRef;
7use vortex_array::Canonical;
8use vortex_array::ExecutionCtx;
9use vortex_array::IntoArray;
10use vortex_array::arrays::PrimitiveArray;
11use vortex_array::arrays::primitive::PrimitiveArrayExt;
12use vortex_compressor::estimate::CompressionEstimate;
13use vortex_compressor::estimate::EstimateVerdict;
14use vortex_compressor::scheme::ChildSelection;
15use vortex_compressor::scheme::DescendantExclusion;
16use vortex_error::VortexResult;
17use vortex_sparse::Sparse;
18use vortex_sparse::SparseExt as _;
19
20use crate::ArrayAndStats;
21use crate::CascadingCompressor;
22use crate::CompressorContext;
23use crate::Scheme;
24use crate::SchemeExt;
25use crate::schemes::integer::IntDictScheme;
26use crate::schemes::integer::SparseScheme as IntSparseScheme;
27
28/// Sparse encoding for null-dominated arrays.
29///
30/// This is the same as the integer `SparseScheme`, but we only use this for null-dominated arrays.
31#[derive(Debug, Copy, Clone, PartialEq, Eq)]
32pub struct NullDominatedSparseScheme;
33
34impl Scheme for NullDominatedSparseScheme {
35    fn scheme_name(&self) -> &'static str {
36        "vortex.string.sparse"
37    }
38
39    fn matches(&self, canonical: &Canonical) -> bool {
40        canonical.dtype().is_utf8()
41    }
42
43    /// Children: indices=0.
44    fn num_children(&self) -> usize {
45        1
46    }
47
48    /// The indices of a null-dominated sparse array should not be sparse-encoded again.
49    fn descendant_exclusions(&self) -> Vec<DescendantExclusion> {
50        vec![
51            DescendantExclusion {
52                excluded: IntSparseScheme.id(),
53                children: ChildSelection::All,
54            },
55            DescendantExclusion {
56                excluded: IntDictScheme.id(),
57                children: ChildSelection::All,
58            },
59        ]
60    }
61
62    fn expected_compression_ratio(
63        &self,
64        data: &ArrayAndStats,
65        _compress_ctx: CompressorContext,
66        exec_ctx: &mut ExecutionCtx,
67    ) -> CompressionEstimate {
68        let len = data.array_len() as f64;
69        let stats = data.varbinview_stats(exec_ctx);
70        let value_count = stats.value_count();
71
72        // All-null arrays should be compressed as constant instead anyways.
73        if value_count == 0 {
74            return CompressionEstimate::Verdict(EstimateVerdict::Skip);
75        }
76
77        // If the majority (90%) of values is null, this will compress well.
78        if stats.null_count() as f64 / len > 0.9 {
79            return CompressionEstimate::Verdict(EstimateVerdict::Ratio(len / value_count as f64));
80        }
81
82        // Otherwise we don't go this route.
83        CompressionEstimate::Verdict(EstimateVerdict::Skip)
84    }
85
86    fn compress(
87        &self,
88        compressor: &CascadingCompressor,
89        data: &ArrayAndStats,
90        compress_ctx: CompressorContext,
91        exec_ctx: &mut ExecutionCtx,
92    ) -> VortexResult<ArrayRef> {
93        // We pass None as we only run this pathway for NULL-dominated string arrays.
94        let sparse_encoded = Sparse::encode(data.array(), None, exec_ctx)?;
95
96        if let Some(sparse) = sparse_encoded.as_opt::<Sparse>() {
97            // Compress the indices only (not the values for strings).
98            let indices = sparse
99                .patches()
100                .indices()
101                .clone()
102                .execute::<PrimitiveArray>(exec_ctx)?
103                .narrow(exec_ctx)?;
104            let compressed_indices = compressor.compress_child(
105                &indices.into_array(),
106                &compress_ctx,
107                self.id(),
108                0,
109                exec_ctx,
110            )?;
111
112            Sparse::try_new(
113                compressed_indices,
114                sparse.patches().values().clone(),
115                sparse.len(),
116                sparse.fill_scalar().clone(),
117            )
118            .map(|a| a.into_array())
119        } else {
120            Ok(sparse_encoded)
121        }
122    }
123}