use vortex_array::ArrayRef;
use vortex_array::ArrayView;
use vortex_array::Canonical;
use vortex_array::ExecutionCtx;
use vortex_array::IntoArray;
use vortex_array::arrays::DictArray;
use vortex_array::arrays::Primitive;
use vortex_array::arrays::PrimitiveArray;
use vortex_array::arrays::dict::DictArrayExt;
use vortex_array::arrays::dict::DictArraySlotsExt;
use vortex_array::arrays::primitive::PrimitiveArrayExt;
use vortex_array::validity::Validity;
use vortex_buffer::Buffer;
use vortex_error::VortexExpect;
use vortex_error::VortexResult;
use crate::CascadingCompressor;
use crate::builtins::IntDictScheme;
use crate::builtins::is_integer_primitive;
use crate::ctx::CompressorContext;
use crate::estimate::CompressionEstimate;
use crate::estimate::EstimateVerdict;
use crate::scheme::Scheme;
use crate::scheme::SchemeExt;
use crate::stats::ArrayAndStats;
use crate::stats::GenerateStatsOptions;
use crate::stats::IntegerErasedStats;
use crate::stats::IntegerStats;
impl Scheme for IntDictScheme {
fn scheme_name(&self) -> &'static str {
"vortex.int.dict"
}
fn matches(&self, canonical: &Canonical) -> bool {
is_integer_primitive(canonical)
}
fn stats_options(&self) -> GenerateStatsOptions {
GenerateStatsOptions {
count_distinct_values: true,
}
}
fn num_children(&self) -> usize {
2
}
fn expected_compression_ratio(
&self,
data: &ArrayAndStats,
_compress_ctx: CompressorContext,
exec_ctx: &mut ExecutionCtx,
) -> CompressionEstimate {
let bit_width = data.array_as_primitive().ptype().bit_width();
let stats = data.integer_stats(exec_ctx);
if stats.value_count() == 0 {
return CompressionEstimate::Verdict(EstimateVerdict::Skip);
}
let distinct_values_count = stats.distinct_count().vortex_expect(
"this must be present since `DictScheme` declared that we need distinct values",
);
if distinct_values_count > stats.value_count() / 2 {
return CompressionEstimate::Verdict(EstimateVerdict::Skip);
}
let values_size = bit_width * distinct_values_count as usize;
let codes_bw = u32::BITS - distinct_values_count.leading_zeros();
let n_runs = (stats.value_count() / stats.average_run_length()) as usize;
let codes_size_bp = codes_bw as usize * stats.value_count() as usize;
let codes_size_rle_bp = usize::checked_mul(codes_bw as usize + 32, n_runs);
let codes_size = usize::min(codes_size_bp, codes_size_rle_bp.unwrap_or(usize::MAX));
let before = stats.value_count() as usize * bit_width;
CompressionEstimate::Verdict(EstimateVerdict::Ratio(
before as f64 / (values_size + codes_size) as f64,
))
}
fn compress(
&self,
compressor: &CascadingCompressor,
data: &ArrayAndStats,
compress_ctx: CompressorContext,
exec_ctx: &mut ExecutionCtx,
) -> VortexResult<ArrayRef> {
let stats = data.integer_stats(exec_ctx);
let dict = dictionary_encode(data.array_as_primitive(), &stats)?;
let compressed_values =
compressor.compress_child(dict.values(), &compress_ctx, self.id(), 0, exec_ctx)?;
let narrowed_codes = dict
.codes()
.clone()
.execute::<PrimitiveArray>(exec_ctx)?
.narrow()?
.into_array();
let compressed_codes =
compressor.compress_child(&narrowed_codes, &compress_ctx, self.id(), 1, exec_ctx)?;
unsafe {
Ok(
DictArray::new_unchecked(compressed_codes, compressed_values)
.set_all_values_referenced(dict.has_all_values_referenced())
.into_array(),
)
}
}
}
macro_rules! typed_encode {
($source_array:ident, $stats:ident, $typed:ident, $typ:ty) => {{
let distinct = $typed.distinct().vortex_expect(
"this must be present since `DictScheme` declared that we need distinct values",
);
let values_validity = match $source_array.validity()? {
Validity::NonNullable => Validity::NonNullable,
_ => Validity::AllValid,
};
let codes_validity = $source_array.validity()?;
let values: Buffer<$typ> = distinct.distinct_values().keys().map(|x| x.0).collect();
let max_code = values.len();
let codes = if max_code <= u8::MAX as usize {
let buf = <DictEncoder as Encode<$typ, u8>>::encode(
&values,
$source_array.as_slice::<$typ>(),
);
PrimitiveArray::new(buf, codes_validity).into_array()
} else if max_code <= u16::MAX as usize {
let buf = <DictEncoder as Encode<$typ, u16>>::encode(
&values,
$source_array.as_slice::<$typ>(),
);
PrimitiveArray::new(buf, codes_validity).into_array()
} else {
let buf = <DictEncoder as Encode<$typ, u32>>::encode(
&values,
$source_array.as_slice::<$typ>(),
);
PrimitiveArray::new(buf, codes_validity).into_array()
};
let values = PrimitiveArray::new(values, values_validity).into_array();
Ok(unsafe { DictArray::new_unchecked(codes, values).set_all_values_referenced(true) })
}};
}
#[expect(
clippy::cognitive_complexity,
reason = "complexity from match on all integer types"
)]
pub fn dictionary_encode(
array: ArrayView<'_, Primitive>,
stats: &IntegerStats,
) -> VortexResult<DictArray> {
match stats.erased() {
IntegerErasedStats::U8(typed) => typed_encode!(array, stats, typed, u8),
IntegerErasedStats::U16(typed) => typed_encode!(array, stats, typed, u16),
IntegerErasedStats::U32(typed) => typed_encode!(array, stats, typed, u32),
IntegerErasedStats::U64(typed) => typed_encode!(array, stats, typed, u64),
IntegerErasedStats::I8(typed) => typed_encode!(array, stats, typed, i8),
IntegerErasedStats::I16(typed) => typed_encode!(array, stats, typed, i16),
IntegerErasedStats::I32(typed) => typed_encode!(array, stats, typed, i32),
IntegerErasedStats::I64(typed) => typed_encode!(array, stats, typed, i64),
}
}
struct DictEncoder;
trait Encode<T, I> {
fn encode(distinct: &[T], values: &[T]) -> Buffer<I>;
}
macro_rules! impl_encode {
($typ:ty) => { impl_encode!($typ, u8, u16, u32); };
($typ:ty, $($ityp:ty),+) => {
$(
impl Encode<$typ, $ityp> for DictEncoder {
#[expect(clippy::cast_possible_truncation)]
fn encode(distinct: &[$typ], values: &[$typ]) -> Buffer<$ityp> {
let mut codes =
vortex_utils::aliases::hash_map::HashMap::<$typ, $ityp>::with_capacity(
distinct.len(),
);
for (code, &value) in distinct.iter().enumerate() {
codes.insert(value, code as $ityp);
}
let mut output = vortex_buffer::BufferMut::with_capacity(values.len());
for value in values {
unsafe { output.push_unchecked(codes.get(value).copied().unwrap_or_default()) };
}
output.freeze()
}
}
)*
};
}
impl_encode!(u8);
impl_encode!(u16);
impl_encode!(u32);
impl_encode!(u64);
impl_encode!(i8);
impl_encode!(i16);
impl_encode!(i32);
impl_encode!(i64);
#[cfg(test)]
mod tests {
use vortex_array::IntoArray;
use vortex_array::VortexSessionExecute;
use vortex_array::arrays::BoolArray;
use vortex_array::arrays::PrimitiveArray;
use vortex_array::arrays::dict::DictArraySlotsExt;
use vortex_array::assert_arrays_eq;
use vortex_array::session::ArraySession;
use vortex_array::validity::Validity;
use vortex_buffer::buffer;
use vortex_error::VortexResult;
use vortex_session::VortexSession;
use super::dictionary_encode;
use crate::stats::IntegerStats;
#[test]
fn test_dict_encode_integer_stats() -> VortexResult<()> {
let mut ctx = VortexSession::empty()
.with::<ArraySession>()
.create_execution_ctx();
let data = buffer![100i32, 200, 100, 0, 100];
let validity =
Validity::Array(BoolArray::from_iter([true, true, true, false, true]).into_array());
let array = PrimitiveArray::new(data, validity);
let stats = IntegerStats::generate_opts(
&array,
crate::stats::GenerateStatsOptions {
count_distinct_values: true,
},
&mut ctx,
);
let dict_array = dictionary_encode(array.as_view(), &stats)?;
assert_eq!(dict_array.values().len(), 2);
assert_eq!(dict_array.codes().len(), 5);
let expected = PrimitiveArray::new(
buffer![100i32, 200, 100, 100, 100],
Validity::Array(BoolArray::from_iter([true, true, true, false, true]).into_array()),
)
.into_array();
let undict = dict_array
.as_array()
.clone()
.execute::<PrimitiveArray>(&mut ctx)?
.into_array();
assert_arrays_eq!(undict, expected);
Ok(())
}
}