vortex_dict/builders/
mod.rs

1// SPDX-License-Identifier: Apache-2.0
2// SPDX-FileCopyrightText: Copyright the Vortex contributors
3
4use bytes::bytes_dict_builder;
5use primitive::primitive_dict_builder;
6use vortex_array::arrays::{PrimitiveVTable, VarBinVTable, VarBinViewVTable};
7use vortex_array::compress::downscale_integer_array;
8use vortex_array::{Array, ArrayRef};
9use vortex_dtype::match_each_native_ptype;
10use vortex_error::{VortexResult, vortex_bail};
11
12use crate::DictArray;
13
14mod bytes;
15mod primitive;
16
/// Size limits applied to a dictionary while it is being built.
///
/// Both fields default to "no limit" in [`UNCONSTRAINED`], where they are set
/// to `usize::MAX`.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct DictConstraints {
    /// Upper bound expressed in bytes.
    ///
    /// NOTE(review): presumably the total byte size of the dictionary values;
    /// confirm against the builder implementations.
    pub max_bytes: usize,
    /// Upper bound expressed as a length.
    ///
    /// NOTE(review): presumably the number of distinct dictionary entries;
    /// confirm against the builder implementations.
    pub max_len: usize,
}
22
23pub const UNCONSTRAINED: DictConstraints = DictConstraints {
24    max_bytes: usize::MAX,
25    max_len: usize::MAX,
26};
27
28pub trait DictEncoder: Send {
29    /// Assign dictionary codes to the given input array.
30    fn encode(&mut self, array: &dyn Array) -> VortexResult<ArrayRef>;
31
32    fn values(&mut self) -> VortexResult<ArrayRef>;
33}
34
35pub fn dict_encoder(
36    array: &dyn Array,
37    constraints: &DictConstraints,
38) -> VortexResult<Box<dyn DictEncoder>> {
39    let dict_builder: Box<dyn DictEncoder> = if let Some(pa) = array.as_opt::<PrimitiveVTable>() {
40        match_each_native_ptype!(pa.ptype(), |P| {
41            primitive_dict_builder::<P>(pa.dtype().nullability(), constraints)
42        })
43    } else if let Some(vbv) = array.as_opt::<VarBinViewVTable>() {
44        bytes_dict_builder(vbv.dtype().clone(), constraints)
45    } else if let Some(vb) = array.as_opt::<VarBinVTable>() {
46        bytes_dict_builder(vb.dtype().clone(), constraints)
47    } else {
48        vortex_bail!("Can only encode primitive or varbin/view arrays")
49    };
50    Ok(dict_builder)
51}
52
53pub fn dict_encode_with_constraints(
54    array: &dyn Array,
55    constraints: &DictConstraints,
56) -> VortexResult<DictArray> {
57    let mut encoder = dict_encoder(array, constraints)?;
58    let codes = downscale_integer_array(encoder.encode(array)?)?;
59    // SAFETY: The encoding process will produce a value set of codes and values
60    unsafe { Ok(DictArray::new_unchecked(codes, encoder.values()?)) }
61}
62
63pub fn dict_encode(array: &dyn Array) -> VortexResult<DictArray> {
64    let dict_array = dict_encode_with_constraints(array, &UNCONSTRAINED)?;
65    if dict_array.len() != array.len() {
66        vortex_bail!(
67            "must have encoded all {} elements, but only encoded {}",
68            array.len(),
69            dict_array.len(),
70        );
71    }
72    Ok(dict_array)
73}