Skip to main content

vortex_array/builders/dict/
mod.rs

1// SPDX-License-Identifier: Apache-2.0
2// SPDX-FileCopyrightText: Copyright the Vortex contributors
3
4use bytes::bytes_dict_builder;
5use primitive::primitive_dict_builder;
6use vortex_dtype::PType;
7use vortex_dtype::match_each_native_ptype;
8use vortex_error::VortexResult;
9use vortex_error::vortex_bail;
10use vortex_error::vortex_panic;
11
12use crate::Array;
13use crate::ArrayRef;
14use crate::IntoArray;
15use crate::ToCanonical;
16use crate::arrays::DictArray;
17use crate::arrays::PrimitiveVTable;
18use crate::arrays::VarBinVTable;
19use crate::arrays::VarBinViewVTable;
20
21mod bytes;
22mod primitive;
23
/// Upper bounds handed to the dictionary builders when creating an encoder.
///
/// [`UNCONSTRAINED`] sets both limits to `usize::MAX`.
#[derive(Clone, Debug)]
pub struct DictConstraints {
    /// Byte limit for the dictionary.
    // NOTE(review): presumably the maximum size of the dictionary values in bytes;
    // enforcement lives in the `bytes`/`primitive` builders — confirm there.
    pub max_bytes: usize,
    /// Length limit for the dictionary.
    // NOTE(review): presumably the maximum number of dictionary entries;
    // enforcement lives in the `bytes`/`primitive` builders — confirm there.
    pub max_len: usize,
}
29
30pub const UNCONSTRAINED: DictConstraints = DictConstraints {
31    max_bytes: usize::MAX,
32    max_len: usize::MAX,
33};
34
35pub trait DictEncoder: Send {
36    /// Assign dictionary codes to the given input array.
37    fn encode(&mut self, array: &dyn Array) -> ArrayRef;
38
39    /// Clear the encoder state to make it ready for a new round of decoding.
40    fn reset(&mut self) -> ArrayRef;
41
42    /// Returns the PType of the codes this encoder produces.
43    fn codes_ptype(&self) -> PType;
44}
45
46pub fn dict_encoder(array: &dyn Array, constraints: &DictConstraints) -> Box<dyn DictEncoder> {
47    let dict_builder: Box<dyn DictEncoder> = if let Some(pa) = array.as_opt::<PrimitiveVTable>() {
48        match_each_native_ptype!(pa.ptype(), |P| {
49            primitive_dict_builder::<P>(pa.dtype().nullability(), constraints)
50        })
51    } else if let Some(vbv) = array.as_opt::<VarBinViewVTable>() {
52        bytes_dict_builder(vbv.dtype().clone(), constraints)
53    } else if let Some(vb) = array.as_opt::<VarBinVTable>() {
54        bytes_dict_builder(vb.dtype().clone(), constraints)
55    } else {
56        vortex_panic!("Can only encode primitive or varbin/view arrays")
57    };
58    dict_builder
59}
60
61/// Encode an array as a `DictArray` subject to the given constraints.
62///
63/// Vortex encoders must always produce unsigned integer codes; signed codes are only accepted for external compatibility.
64pub fn dict_encode_with_constraints(
65    array: &dyn Array,
66    constraints: &DictConstraints,
67) -> VortexResult<DictArray> {
68    let mut encoder = dict_encoder(array, constraints);
69    let codes = encoder.encode(array).to_primitive().narrow()?;
70    // SAFETY: The encoding process will produce a value set of codes and values
71    // All values in the dictionary are guaranteed to be referenced by at least one code
72    // since we build the dictionary from the codes we observe during encoding
73    unsafe {
74        Ok(
75            DictArray::new_unchecked(codes.into_array(), encoder.reset())
76                .set_all_values_referenced(true),
77        )
78    }
79}
80
81pub fn dict_encode(array: &dyn Array) -> VortexResult<DictArray> {
82    let dict_array = dict_encode_with_constraints(array, &UNCONSTRAINED)?;
83    if dict_array.len() != array.len() {
84        vortex_bail!(
85            "must have encoded all {} elements, but only encoded {}",
86            array.len(),
87            dict_array.len(),
88        );
89    }
90    Ok(dict_array)
91}