vortex_layout/layouts/dict/
mod.rs

1// SPDX-License-Identifier: Apache-2.0
2// SPDX-FileCopyrightText: Copyright the Vortex contributors
3
4mod reader;
5pub mod writer;
6
7use std::sync::Arc;
8
9use reader::DictReader;
10use vortex_array::ArrayContext;
11use vortex_array::DeserializeMetadata;
12use vortex_array::ProstMetadata;
13use vortex_dtype::DType;
14use vortex_dtype::Nullability;
15use vortex_dtype::PType;
16use vortex_error::VortexExpect;
17use vortex_error::VortexResult;
18use vortex_error::vortex_bail;
19use vortex_error::vortex_ensure;
20use vortex_error::vortex_err;
21use vortex_error::vortex_panic;
22use vortex_session::VortexSession;
23
24use crate::LayoutChildType;
25use crate::LayoutEncodingRef;
26use crate::LayoutId;
27use crate::LayoutReaderRef;
28use crate::LayoutRef;
29use crate::VTable;
30use crate::children::LayoutChildren;
31use crate::segments::SegmentId;
32use crate::segments::SegmentSource;
33use crate::vtable;
34
// Invokes the crate-local `vtable!` macro for the `Dict` layout.
// NOTE(review): presumably this generates the `DictVTable` marker type (and related glue) that
// the `impl VTable for DictVTable` block in this file relies on — confirm against the macro
// definition in `crate::vtable`.
vtable!(Dict);
36
37impl VTable for DictVTable {
38    type Layout = DictLayout;
39    type Encoding = DictLayoutEncoding;
40    type Metadata = ProstMetadata<DictLayoutMetadata>;
41
42    fn id(_encoding: &Self::Encoding) -> LayoutId {
43        LayoutId::new_ref("vortex.dict")
44    }
45
46    fn encoding(_layout: &Self::Layout) -> LayoutEncodingRef {
47        LayoutEncodingRef::new_ref(DictLayoutEncoding.as_ref())
48    }
49
50    fn row_count(layout: &Self::Layout) -> u64 {
51        layout.codes.row_count()
52    }
53
54    fn dtype(layout: &Self::Layout) -> &DType {
55        layout.values.dtype()
56    }
57
58    fn metadata(layout: &Self::Layout) -> Self::Metadata {
59        let mut metadata =
60            DictLayoutMetadata::new(PType::try_from(layout.codes.dtype()).vortex_expect("ptype"));
61        metadata.is_nullable_codes = Some(layout.codes.dtype().is_nullable());
62        metadata.all_values_referenced = Some(layout.all_values_referenced);
63        ProstMetadata(metadata)
64    }
65
66    fn segment_ids(_layout: &Self::Layout) -> Vec<SegmentId> {
67        vec![]
68    }
69
70    fn nchildren(_layout: &Self::Layout) -> usize {
71        2
72    }
73
74    fn child(layout: &Self::Layout, idx: usize) -> VortexResult<LayoutRef> {
75        match idx {
76            0 => Ok(layout.values.clone()),
77            1 => Ok(layout.codes.clone()),
78            _ => vortex_bail!("Unreachable child index: {}", idx),
79        }
80    }
81
82    fn child_type(_layout: &Self::Layout, idx: usize) -> LayoutChildType {
83        match idx {
84            0 => LayoutChildType::Auxiliary("values".into()),
85            1 => LayoutChildType::Transparent("codes".into()),
86            _ => vortex_panic!("Unreachable child index: {}", idx),
87        }
88    }
89
90    fn new_reader(
91        layout: &Self::Layout,
92        name: Arc<str>,
93        segment_source: Arc<dyn SegmentSource>,
94        session: &VortexSession,
95    ) -> VortexResult<LayoutReaderRef> {
96        Ok(Arc::new(DictReader::try_new(
97            layout.clone(),
98            name,
99            segment_source,
100            session.clone(),
101        )?))
102    }
103
104    #[cfg(gpu_unstable)]
105    fn new_gpu_reader(
106        _layout: &Self::Layout,
107        _name: Arc<str>,
108        _segment_source: Arc<dyn SegmentSource>,
109        _ctx: Arc<cudarc::driver::CudaContext>,
110    ) -> VortexResult<crate::gpu::GpuLayoutReaderRef> {
111        todo!()
112    }
113
114    fn build(
115        _encoding: &Self::Encoding,
116        dtype: &DType,
117        _row_count: u64,
118        metadata: &<Self::Metadata as DeserializeMetadata>::Output,
119        _segment_ids: Vec<SegmentId>,
120        children: &dyn LayoutChildren,
121        _ctx: ArrayContext,
122    ) -> VortexResult<Self::Layout> {
123        let values = children.child(0, dtype)?;
124        let codes_nullable = metadata
125            .is_nullable_codes
126            .map(Nullability::from)
127            // The old behaviour (without `is_nullable_codes` metadata) used the nullability
128            // of the values (and whole array).
129            // see [`SerdeVTable<DictVTable>::build`].
130            .unwrap_or_else(|| dtype.nullability());
131        let codes = children.child(1, &DType::Primitive(metadata.codes_ptype(), codes_nullable))?;
132        Ok(unsafe {
133            DictLayout::new(values, codes)
134                .set_all_values_referenced(metadata.all_values_referenced.unwrap_or(false))
135        })
136    }
137
138    fn with_children(layout: &mut Self::Layout, children: Vec<LayoutRef>) -> VortexResult<()> {
139        vortex_ensure!(
140            children.len() == 2,
141            "DictLayout expects exactly 2 children (values, codes), got {}",
142            children.len()
143        );
144        let mut children_iter = children.into_iter();
145        layout.values = children_iter
146            .next()
147            .ok_or_else(|| vortex_err!("Missing values child"))?;
148        layout.codes = children_iter
149            .next()
150            .ok_or_else(|| vortex_err!("Missing codes child"))?;
151        Ok(())
152    }
153}
154
/// Unit type naming the dictionary layout encoding.
///
/// It is the `Encoding` associated type of the `VTable` implementation for `DictVTable`.
#[derive(Debug)]
pub struct DictLayoutEncoding;
157
/// A layout composed of two child layouts: the dictionary `values` and the `codes`.
///
/// NOTE(review): presumably each code is an index into `values` — confirm against the reader.
#[derive(Clone, Debug)]
pub struct DictLayout {
    /// Child layout holding the dictionary values; the layout's dtype is read from this child.
    values: LayoutRef,
    /// Child layout holding the codes; the layout's row count is read from this child.
    codes: LayoutRef,
    /// Indicates whether all dictionary values are definitely referenced by at least one code.
    /// `true` = all values are referenced (computed during encoding).
    /// `false` = unknown/might have unreferenced values.
    all_values_referenced: bool,
}
167
168impl DictLayout {
169    pub(crate) fn new(values: LayoutRef, codes: LayoutRef) -> Self {
170        Self {
171            values,
172            codes,
173            all_values_referenced: false,
174        }
175    }
176
177    /// Set whether all dictionary values are definitely referenced.
178    ///
179    /// # Safety
180    /// The caller must ensure that when setting `all_values_referenced = true`, ALL dictionary
181    /// values are actually referenced by at least one valid code. Setting this incorrectly can
182    /// lead to incorrect query results in operations like min/max.
183    ///
184    /// This is typically only set to `true` during dictionary encoding when we know for certain
185    /// that all values are referenced.
186    /// See `DictArray::set_all_values_referenced`.
187    pub unsafe fn set_all_values_referenced(mut self, all_values_referenced: bool) -> Self {
188        self.all_values_referenced = all_values_referenced;
189        self
190    }
191
192    pub fn has_all_values_referenced(&self) -> bool {
193        self.all_values_referenced
194    }
195}
196
/// Protobuf-encoded metadata stored for a `DictLayout`.
///
/// Fields with tags 2 and 3 are optional because they were added later; metadata written
/// without them must still decode.
#[derive(prost::Message)]
pub struct DictLayoutMetadata {
    /// Primitive type of the codes child, stored as the raw enum value.
    ///
    /// i32 is required for proto, use the generated getter to read this field.
    #[prost(enumeration = "PType", tag = "1")]
    codes_ptype: i32,
    /// Whether the codes child is nullable.
    ///
    /// nullable codes are optional since they were added after stabilisation
    #[prost(optional, bool, tag = "2")]
    is_nullable_codes: Option<bool>,
    /// all_values_referenced is optional for backward compatibility
    /// true = all dictionary values are definitely referenced by at least one code
    /// false/None = unknown whether all values are referenced (conservative default)
    /// see `DictArray::all_values_referenced`
    #[prost(optional, bool, tag = "3")]
    pub(crate) all_values_referenced: Option<bool>,
}
212
213impl DictLayoutMetadata {
214    pub fn new(codes_ptype: PType) -> Self {
215        let mut metadata = Self::default();
216        metadata.set_codes_ptype(codes_ptype);
217        metadata
218    }
219}