vortex_layout/layouts/dict/
mod.rs

1// SPDX-License-Identifier: Apache-2.0
2// SPDX-FileCopyrightText: Copyright the Vortex contributors
3
4mod reader;
5pub mod writer;
6
7use std::sync::Arc;
8
9use reader::DictReader;
10use vortex_array::ArrayContext;
11use vortex_array::DeserializeMetadata;
12use vortex_array::ProstMetadata;
13use vortex_dtype::DType;
14use vortex_dtype::Nullability;
15use vortex_dtype::PType;
16use vortex_error::VortexExpect;
17use vortex_error::VortexResult;
18use vortex_error::vortex_bail;
19use vortex_error::vortex_panic;
20use vortex_session::VortexSession;
21
22use crate::LayoutChildType;
23use crate::LayoutEncodingRef;
24use crate::LayoutId;
25use crate::LayoutReaderRef;
26use crate::LayoutRef;
27use crate::VTable;
28use crate::children::LayoutChildren;
29use crate::segments::SegmentId;
30use crate::segments::SegmentSource;
31use crate::vtable;
32
// Invoke the crate's `vtable!` macro for the `Dict` layout.
// NOTE(review): presumably this is what declares `DictVTable` and the glue that lets
// `DictLayoutEncoding.as_ref()` be used below — confirm against the macro definition.
vtable!(Dict);
34
35impl VTable for DictVTable {
36    type Layout = DictLayout;
37    type Encoding = DictLayoutEncoding;
38    type Metadata = ProstMetadata<DictLayoutMetadata>;
39
40    fn id(_encoding: &Self::Encoding) -> LayoutId {
41        LayoutId::new_ref("vortex.dict")
42    }
43
44    fn encoding(_layout: &Self::Layout) -> LayoutEncodingRef {
45        LayoutEncodingRef::new_ref(DictLayoutEncoding.as_ref())
46    }
47
48    fn row_count(layout: &Self::Layout) -> u64 {
49        layout.codes.row_count()
50    }
51
52    fn dtype(layout: &Self::Layout) -> &DType {
53        layout.values.dtype()
54    }
55
56    fn metadata(layout: &Self::Layout) -> Self::Metadata {
57        let mut metadata =
58            DictLayoutMetadata::new(PType::try_from(layout.codes.dtype()).vortex_expect("ptype"));
59        metadata.is_nullable_codes = Some(layout.codes.dtype().is_nullable());
60        metadata.all_values_referenced = Some(layout.all_values_referenced);
61        ProstMetadata(metadata)
62    }
63
64    fn segment_ids(_layout: &Self::Layout) -> Vec<SegmentId> {
65        vec![]
66    }
67
68    fn nchildren(_layout: &Self::Layout) -> usize {
69        2
70    }
71
72    fn child(layout: &Self::Layout, idx: usize) -> VortexResult<LayoutRef> {
73        match idx {
74            0 => Ok(layout.values.clone()),
75            1 => Ok(layout.codes.clone()),
76            _ => vortex_bail!("Unreachable child index: {}", idx),
77        }
78    }
79
80    fn child_type(_layout: &Self::Layout, idx: usize) -> LayoutChildType {
81        match idx {
82            0 => LayoutChildType::Auxiliary("values".into()),
83            1 => LayoutChildType::Transparent("codes".into()),
84            _ => vortex_panic!("Unreachable child index: {}", idx),
85        }
86    }
87
88    fn new_reader(
89        layout: &Self::Layout,
90        name: Arc<str>,
91        segment_source: Arc<dyn SegmentSource>,
92        session: &VortexSession,
93    ) -> VortexResult<LayoutReaderRef> {
94        Ok(Arc::new(DictReader::try_new(
95            layout.clone(),
96            name,
97            segment_source,
98            session,
99        )?))
100    }
101
102    #[cfg(gpu_unstable)]
103    fn new_gpu_reader(
104        _layout: &Self::Layout,
105        _name: Arc<str>,
106        _segment_source: Arc<dyn SegmentSource>,
107        _ctx: Arc<cudarc::driver::CudaContext>,
108    ) -> VortexResult<crate::gpu::GpuLayoutReaderRef> {
109        todo!()
110    }
111
112    fn build(
113        _encoding: &Self::Encoding,
114        dtype: &DType,
115        _row_count: u64,
116        metadata: &<Self::Metadata as DeserializeMetadata>::Output,
117        _segment_ids: Vec<SegmentId>,
118        children: &dyn LayoutChildren,
119        _ctx: ArrayContext,
120    ) -> VortexResult<Self::Layout> {
121        let values = children.child(0, dtype)?;
122        let codes_nullable = metadata
123            .is_nullable_codes
124            .map(Nullability::from)
125            // The old behaviour (without `is_nullable_codes` metadata) used the nullability
126            // of the values (and whole array).
127            // see [`SerdeVTable<DictVTable>::build`].
128            .unwrap_or_else(|| dtype.nullability());
129        let codes = children.child(1, &DType::Primitive(metadata.codes_ptype(), codes_nullable))?;
130        Ok(unsafe {
131            DictLayout::new(values, codes)
132                .set_all_values_referenced(metadata.all_values_referenced.unwrap_or(false))
133        })
134    }
135}
136
/// Stateless marker type for the dictionary layout encoding.
///
/// The `VTable` implementation in this module builds its `LayoutEncodingRef` from a value of
/// this type.
#[derive(Debug)]
pub struct DictLayoutEncoding;
139
/// A dictionary layout made of a `values` child and a `codes` child, plus a flag recording
/// whether every dictionary value is known to be referenced.
#[derive(Clone, Debug)]
pub struct DictLayout {
    /// Child layout holding the dictionary values; its dtype is reported as this layout's dtype.
    values: LayoutRef,
    /// Child layout holding the codes; its row count is reported as this layout's row count.
    codes: LayoutRef,
    /// Indicates whether all dictionary values are definitely referenced by at least one code.
    /// `true` = all values are referenced (computed during encoding).
    /// `false` = unknown/might have unreferenced values.
    all_values_referenced: bool,
}
149
150impl DictLayout {
151    pub(crate) fn new(values: LayoutRef, codes: LayoutRef) -> Self {
152        Self {
153            values,
154            codes,
155            all_values_referenced: false,
156        }
157    }
158
159    /// Set whether all dictionary values are definitely referenced.
160    ///
161    /// # Safety
162    /// The caller must ensure that when setting `all_values_referenced = true`, ALL dictionary
163    /// values are actually referenced by at least one valid code. Setting this incorrectly can
164    /// lead to incorrect query results in operations like min/max.
165    ///
166    /// This is typically only set to `true` during dictionary encoding when we know for certain
167    /// that all values are referenced.
168    /// See `DictArray::set_all_values_referenced`.
169    pub unsafe fn set_all_values_referenced(mut self, all_values_referenced: bool) -> Self {
170        self.all_values_referenced = all_values_referenced;
171        self
172    }
173
174    pub fn has_all_values_referenced(&self) -> bool {
175        self.all_values_referenced
176    }
177}
178
/// Metadata stored with a dictionary layout, encoded as a `prost` message.
///
/// Fields with tags 2 and 3 are `optional`, so metadata that lacks them still decodes and
/// simply yields `None` for those fields.
#[derive(prost::Message)]
pub struct DictLayoutMetadata {
    #[prost(enumeration = "PType", tag = "1")]
    // Primitive type of the codes. Proto enumerations are stored as `i32`, so read this
    // field through the generated `codes_ptype()` getter rather than directly.
    codes_ptype: i32,
    // Whether the codes child is nullable. Optional because it was added after the format
    // was stabilised; when absent, `build` falls back to the nullability of the layout dtype.
    #[prost(optional, bool, tag = "2")]
    is_nullable_codes: Option<bool>,
    // Optional for backward compatibility.
    // true = all dictionary values are definitely referenced by at least one code
    // false/None = unknown whether all values are referenced (conservative default)
    // see `DictArray::all_values_referenced`
    #[prost(optional, bool, tag = "3")]
    pub(crate) all_values_referenced: Option<bool>,
}
194
195impl DictLayoutMetadata {
196    pub fn new(codes_ptype: PType) -> Self {
197        let mut metadata = Self::default();
198        metadata.set_codes_ptype(codes_ptype);
199        metadata
200    }
201}