Skip to main content

vortex_array/builders/dict/
mod.rs

1// SPDX-License-Identifier: Apache-2.0
2// SPDX-FileCopyrightText: Copyright the Vortex contributors
3
4use bytes::bytes_dict_builder;
5use primitive::primitive_dict_builder;
6use vortex_error::VortexResult;
7use vortex_error::vortex_bail;
8use vortex_error::vortex_panic;
9
10use crate::ArrayRef;
11use crate::ExecutionCtx;
12use crate::IntoArray;
13use crate::arrays::DictArray;
14use crate::arrays::Primitive;
15use crate::arrays::PrimitiveArray;
16use crate::arrays::VarBin;
17use crate::arrays::VarBinView;
18use crate::arrays::primitive::PrimitiveArrayExt;
19use crate::dtype::PType;
20use crate::match_each_native_ptype;
21
22mod bytes;
23mod primitive;
24
/// Limits applied while building a dictionary.
///
/// Use [`UNCONSTRAINED`] when no limit should apply.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct DictConstraints {
    /// Byte budget for the dictionary.
    // NOTE(review): presumably an upper bound on the size of the dictionary values in bytes;
    // confirm against the builders in the `bytes` and `primitive` submodules.
    pub max_bytes: usize,
    /// Length budget for the dictionary.
    // NOTE(review): presumably an upper bound on the number of distinct dictionary values;
    // confirm against the builders in the `bytes` and `primitive` submodules.
    pub max_len: usize,
}
30
31pub const UNCONSTRAINED: DictConstraints = DictConstraints {
32    max_bytes: usize::MAX,
33    max_len: usize::MAX,
34};
35
36pub trait DictEncoder: Send {
37    /// Assign dictionary codes to the given input array.
38    fn encode(&mut self, array: &ArrayRef, ctx: &mut ExecutionCtx) -> VortexResult<PrimitiveArray>;
39
40    /// Clear the encoder state to make it ready for a new round of decoding.
41    fn reset(&mut self) -> ArrayRef;
42
43    /// Returns the PType of the codes this encoder produces.
44    fn codes_ptype(&self) -> PType;
45}
46
47pub fn dict_encoder(array: &ArrayRef, constraints: &DictConstraints) -> Box<dyn DictEncoder> {
48    let dict_builder: Box<dyn DictEncoder> = if let Some(pa) = array.as_opt::<Primitive>() {
49        match_each_native_ptype!(pa.ptype(), |P| {
50            primitive_dict_builder::<P>(pa.dtype().nullability(), constraints)
51        })
52    } else if let Some(vbv) = array.as_opt::<VarBinView>() {
53        bytes_dict_builder(vbv.dtype().clone(), constraints)
54    } else if let Some(vb) = array.as_opt::<VarBin>() {
55        bytes_dict_builder(vb.dtype().clone(), constraints)
56    } else {
57        vortex_panic!("Can only encode primitive or varbin/view arrays")
58    };
59    dict_builder
60}
61
62/// Encode an array as a `DictArray` subject to the given constraints.
63///
64/// Vortex encoders must always produce unsigned integer codes; signed codes are only accepted for external compatibility.
65pub fn dict_encode_with_constraints(
66    array: &ArrayRef,
67    constraints: &DictConstraints,
68    ctx: &mut ExecutionCtx,
69) -> VortexResult<DictArray> {
70    let mut encoder = dict_encoder(array, constraints);
71    let codes = encoder.encode(array, ctx)?.narrow(ctx)?;
72    // SAFETY: The encoding process will produce a value set of codes and values
73    // All values in the dictionary are guaranteed to be referenced by at least one code
74    // since we build the dictionary from the codes we observe during encoding
75    unsafe {
76        Ok(
77            DictArray::new_unchecked(codes.into_array(), encoder.reset())
78                .set_all_values_referenced(true),
79        )
80    }
81}
82
83pub fn dict_encode(array: &ArrayRef, ctx: &mut ExecutionCtx) -> VortexResult<DictArray> {
84    let dict_array = dict_encode_with_constraints(array, &UNCONSTRAINED, ctx)?;
85    if dict_array.len() != array.len() {
86        vortex_bail!(
87            "must have encoded all {} elements, but only encoded {}",
88            array.len(),
89            dict_array.len(),
90        );
91    }
92    Ok(dict_array)
93}