vortex_array/builders/dict/
mod.rs

1// SPDX-License-Identifier: Apache-2.0
2// SPDX-FileCopyrightText: Copyright the Vortex contributors
3
4use bytes::bytes_dict_builder;
5use primitive::primitive_dict_builder;
6use vortex_dtype::match_each_native_ptype;
7use vortex_error::VortexResult;
8use vortex_error::vortex_bail;
9use vortex_error::vortex_panic;
10
11use crate::Array;
12use crate::ArrayRef;
13use crate::IntoArray;
14use crate::ToCanonical;
15use crate::arrays::DictArray;
16use crate::arrays::PrimitiveVTable;
17use crate::arrays::VarBinVTable;
18use crate::arrays::VarBinViewVTable;
19
20mod bytes;
21mod primitive;
22
/// Upper bounds handed to a dictionary encoder when it is constructed.
///
/// Use [`UNCONSTRAINED`] when no bound should apply.
#[derive(Debug, Clone)]
pub struct DictConstraints {
    /// Byte budget passed to the dictionary builder.
    pub max_bytes: usize,
    /// Length budget passed to the dictionary builder.
    pub max_len: usize,
}
28
29pub const UNCONSTRAINED: DictConstraints = DictConstraints {
30    max_bytes: usize::MAX,
31    max_len: usize::MAX,
32};
33
34pub trait DictEncoder: Send {
35    /// Assign dictionary codes to the given input array.
36    fn encode(&mut self, array: &dyn Array) -> ArrayRef;
37
38    /// Clear the encoder state to make it ready for a new round of decoding.
39    fn reset(&mut self) -> ArrayRef;
40}
41
42pub fn dict_encoder(array: &dyn Array, constraints: &DictConstraints) -> Box<dyn DictEncoder> {
43    let dict_builder: Box<dyn DictEncoder> = if let Some(pa) = array.as_opt::<PrimitiveVTable>() {
44        match_each_native_ptype!(pa.ptype(), |P| {
45            primitive_dict_builder::<P>(pa.dtype().nullability(), constraints)
46        })
47    } else if let Some(vbv) = array.as_opt::<VarBinViewVTable>() {
48        bytes_dict_builder(vbv.dtype().clone(), constraints)
49    } else if let Some(vb) = array.as_opt::<VarBinVTable>() {
50        bytes_dict_builder(vb.dtype().clone(), constraints)
51    } else {
52        vortex_panic!("Can only encode primitive or varbin/view arrays")
53    };
54    dict_builder
55}
56
57pub fn dict_encode_with_constraints(
58    array: &dyn Array,
59    constraints: &DictConstraints,
60) -> VortexResult<DictArray> {
61    let mut encoder = dict_encoder(array, constraints);
62    let codes = encoder.encode(array).to_primitive().narrow()?;
63    // SAFETY: The encoding process will produce a value set of codes and values
64    // All values in the dictionary are guaranteed to be referenced by at least one code
65    // since we build the dictionary from the codes we observe during encoding
66    unsafe {
67        Ok(
68            DictArray::new_unchecked(codes.into_array(), encoder.reset())
69                .set_all_values_referenced(true),
70        )
71    }
72}
73
74pub fn dict_encode(array: &dyn Array) -> VortexResult<DictArray> {
75    let dict_array = dict_encode_with_constraints(array, &UNCONSTRAINED)?;
76    if dict_array.len() != array.len() {
77        vortex_bail!(
78            "must have encoded all {} elements, but only encoded {}",
79            array.len(),
80            dict_array.len(),
81        );
82    }
83    Ok(dict_array)
84}