vortex_dict/builders/
mod.rs

1use bytes::bytes_dict_builder;
2use primitive::primitive_dict_builder;
3use vortex_array::arrays::{PrimitiveArray, VarBinArray, VarBinViewArray};
4use vortex_array::compress::downscale_integer_array;
5use vortex_array::variants::PrimitiveArrayTrait;
6use vortex_array::{Array, ArrayExt, ArrayRef};
7use vortex_dtype::match_each_native_ptype;
8use vortex_error::{VortexResult, vortex_bail};
9
10use crate::DictArray;
11
12mod bytes;
13mod primitive;
14
/// Limits handed to the dictionary builders created by [`dict_encoder`].
///
/// NOTE(review): the exact enforcement lives in the `bytes` / `primitive`
/// builder modules, which are not visible from here; the field descriptions
/// below are inferred from the field names and should be confirmed there.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct DictConstraints {
    /// Presumably the maximum size of the dictionary values, in bytes.
    pub max_bytes: usize,
    /// Presumably the maximum number of entries in the dictionary.
    pub max_len: usize,
}
20
21pub const UNCONSTRAINED: DictConstraints = DictConstraints {
22    max_bytes: usize::MAX,
23    max_len: usize::MAX,
24};
25
26pub trait DictEncoder: Send {
27    fn encode(&mut self, array: &dyn Array) -> VortexResult<ArrayRef>;
28
29    fn values(&mut self) -> VortexResult<ArrayRef>;
30}
31
32pub fn dict_encoder(
33    array: &dyn Array,
34    constraints: &DictConstraints,
35) -> VortexResult<Box<dyn DictEncoder>> {
36    let dict_builder: Box<dyn DictEncoder> = if let Some(pa) = array.as_opt::<PrimitiveArray>() {
37        match_each_native_ptype!(pa.ptype(), |$P| {
38            primitive_dict_builder::<$P>(pa.dtype().nullability(), &constraints)
39        })
40    } else if let Some(vbv) = array.as_opt::<VarBinViewArray>() {
41        bytes_dict_builder(vbv.dtype().clone(), constraints)
42    } else if let Some(vb) = array.as_opt::<VarBinArray>() {
43        bytes_dict_builder(vb.dtype().clone(), constraints)
44    } else {
45        vortex_bail!("Can only encode primitive or varbin/view arrays")
46    };
47    Ok(dict_builder)
48}
49
50pub fn dict_encode_with_constraints(
51    array: &dyn Array,
52    constraints: &DictConstraints,
53) -> VortexResult<DictArray> {
54    let mut encoder = dict_encoder(array, constraints)?;
55    let codes = downscale_integer_array(encoder.encode(array)?)?;
56    DictArray::try_new(codes, encoder.values()?)
57}
58
59pub fn dict_encode(array: &dyn Array) -> VortexResult<DictArray> {
60    let dict_array = dict_encode_with_constraints(array, &UNCONSTRAINED)?;
61    if dict_array.len() != array.len() {
62        vortex_bail!(
63            "must have encoded all {} elements, but only encoded {}",
64            array.len(),
65            dict_array.len(),
66        );
67    }
68    Ok(dict_array)
69}