Skip to main content

vortex_array/arrays/struct_/
array.rs

1// SPDX-License-Identifier: Apache-2.0
2// SPDX-FileCopyrightText: Copyright the Vortex contributors
3
4use std::iter::once;
5use std::sync::Arc;
6
7use vortex_error::VortexExpect;
8use vortex_error::VortexResult;
9use vortex_error::vortex_err;
10
11use crate::ArrayRef;
12use crate::IntoArray;
13use crate::array::Array;
14use crate::array::ArrayParts;
15use crate::array::EmptyArrayData;
16use crate::array::TypedArrayRef;
17use crate::array::child_to_validity;
18use crate::array::validity_to_child;
19use crate::arrays::Struct;
20use crate::dtype::DType;
21use crate::dtype::FieldName;
22use crate::dtype::FieldNames;
23use crate::dtype::StructFields;
24use crate::validity::Validity;
25
// Slot layout of a StructArray: [validity, field_0, ..., field_N]. The validity slot always
// occupies index 0 (its entry may be `None`); the number of field slots varies per array.
/// Index of the slot that holds the struct-level validity child (row-level nulls).
pub(super) const VALIDITY_SLOT: usize = 0;
/// The offset at which the struct field arrays begin in the slots vector.
pub(super) const FIELDS_OFFSET: usize = 1;
31
32/// A struct array that stores multiple named fields as columns, similar to a database row.
33///
34/// This mirrors the Apache Arrow Struct array encoding and provides a columnar representation
35/// of structured data where each row contains multiple named fields of potentially different types.
36///
37/// ## Data Layout
38///
39/// The struct array uses a columnar layout where:
40/// - Each field is stored as a separate child array
41/// - All fields must have the same length (number of rows)
42/// - Field names and types are defined in the struct's dtype
43/// - An optional validity mask indicates which entire rows are null
44///
45/// ## Row-level nulls
46///
/// The StructArray contains its own top-level nulls, which are superimposed on top of the
/// field-level validity values. This can be the case even if the fields themselves are
/// non-nullable: accessing a particular row can yield null even if all children are valid at
/// that position.
50///
51/// ```
52/// use vortex_array::arrays::{StructArray, BoolArray};
53/// use vortex_array::validity::Validity;
54/// use vortex_array::dtype::FieldNames;
55/// use vortex_array::{IntoArray, LEGACY_SESSION, VortexSessionExecute};
56/// use vortex_buffer::buffer;
57///
58/// // Create struct with all non-null fields but struct-level nulls
59/// let struct_array = StructArray::try_new(
60///     FieldNames::from(["a", "b", "c"]),
61///     vec![
62///         buffer![1i32, 2i32].into_array(),  // non-null field a
63///         buffer![10i32, 20i32].into_array(), // non-null field b
64///         buffer![100i32, 200i32].into_array(), // non-null field c
65///     ],
66///     2,
67///     Validity::Array(BoolArray::from_iter([true, false]).into_array()), // row 1 is null
68/// ).unwrap();
69/// let mut ctx = LEGACY_SESSION.create_execution_ctx();
70///
71/// // Row 0 is valid - returns a struct scalar with field values
72/// let row0 = struct_array.execute_scalar(0, &mut ctx).unwrap();
73/// assert!(!row0.is_null());
74///
75/// // Row 1 is null at struct level - returns null even though fields have values
76/// let row1 = struct_array.execute_scalar(1, &mut ctx).unwrap();
77/// assert!(row1.is_null());
78/// ```
79///
80/// ## Name uniqueness
81///
82/// It is valid for a StructArray to have multiple child columns that have the same name. In this
83/// case, any accessors that use column names will find the first column in sequence with the name.
84///
85/// ```
86/// use vortex_array::arrays::StructArray;
87/// use vortex_array::arrays::struct_::StructArrayExt;
88/// use vortex_array::validity::Validity;
89/// use vortex_array::dtype::FieldNames;
90/// use vortex_array::{IntoArray, LEGACY_SESSION, VortexSessionExecute};
91/// use vortex_buffer::buffer;
92///
93/// // Create struct with duplicate "data" field names
94/// let struct_array = StructArray::try_new(
95///     FieldNames::from(["data", "data"]),
96///     vec![
97///         buffer![1i32, 2i32].into_array(),   // first "data"
98///         buffer![3i32, 4i32].into_array(),   // second "data"
99///     ],
100///     2,
101///     Validity::NonNullable,
102/// ).unwrap();
103///
/// // unmasked_field_by_name returns the FIRST "data" field
105/// let first_data = struct_array.unmasked_field_by_name("data").unwrap();
106/// let mut ctx = LEGACY_SESSION.create_execution_ctx();
107/// assert_eq!(first_data.execute_scalar(0, &mut ctx).unwrap(), 1i32.into());
108/// ```
109///
110/// ## Field Operations
111///
112/// Struct arrays support efficient column operations:
113/// - **Projection**: Select/reorder fields without copying data
114/// - **Field access**: Get columns by name or index
115/// - **Column addition**: Add new fields to create extended structs
116/// - **Column removal**: Remove fields to create narrower structs
117///
118/// ## Validity Semantics
119///
120/// - Row-level nulls are tracked in the struct's validity child
121/// - Individual field nulls are tracked in each field's own validity
122/// - A null struct row means all fields in that row are conceptually null
123/// - Field-level nulls can exist independently of struct-level nulls
124///
125/// # Examples
126///
127/// ```
128/// use vortex_array::arrays::{StructArray, PrimitiveArray};
129/// use vortex_array::arrays::struct_::StructArrayExt;
130/// use vortex_array::validity::Validity;
131/// use vortex_array::dtype::FieldNames;
132/// use vortex_array::IntoArray;
133/// use vortex_buffer::buffer;
134///
135/// // Create arrays for each field
136/// let ids = PrimitiveArray::new(buffer![1i32, 2, 3], Validity::NonNullable);
137/// let names = PrimitiveArray::new(buffer![100u64, 200, 300], Validity::NonNullable);
138///
139/// // Create struct array with named fields
140/// let struct_array = StructArray::try_new(
141///     FieldNames::from(["id", "score"]),
142///     vec![ids.into_array(), names.into_array()],
143///     3,
144///     Validity::NonNullable,
145/// ).unwrap();
146///
147/// assert_eq!(struct_array.len(), 3);
148/// assert_eq!(struct_array.names().len(), 2);
149///
150/// // Access field by name
151/// let id_field = struct_array.unmasked_field_by_name("id").unwrap();
152/// assert_eq!(id_field.len(), 3);
153/// ```
///
/// # Decomposed parts
///
/// `StructDataParts` itself is the owned decomposition returned by `into_data_parts`: the
/// struct's field names and dtypes, its unmasked field arrays, and its row-level validity.
pub struct StructDataParts {
    /// Field names and field dtypes of the struct.
    pub struct_fields: StructFields,
    /// The field arrays, in slot order, without the struct-level validity applied to them.
    pub fields: Arc<[ArrayRef]>,
    /// Row-level validity of the struct.
    pub validity: Validity,
}
159
160pub(super) fn make_struct_slots(
161    fields: &[ArrayRef],
162    validity: &Validity,
163    length: usize,
164) -> Vec<Option<ArrayRef>> {
165    once(validity_to_child(validity, length))
166        .chain(fields.iter().cloned().map(Some))
167        .collect()
168}
169
170pub trait StructArrayExt: TypedArrayRef<Struct> {
171    fn nullability(&self) -> crate::dtype::Nullability {
172        match self.as_ref().dtype() {
173            DType::Struct(_, nullability) => *nullability,
174            _ => unreachable!("StructArrayExt requires a struct dtype"),
175        }
176    }
177
178    fn names(&self) -> &FieldNames {
179        self.as_ref().dtype().as_struct_fields().names()
180    }
181
182    fn struct_validity(&self) -> Validity {
183        child_to_validity(&self.as_ref().slots()[VALIDITY_SLOT], self.nullability())
184    }
185
186    fn iter_unmasked_fields(&self) -> impl Iterator<Item = &ArrayRef> + '_ {
187        self.as_ref().slots()[FIELDS_OFFSET..]
188            .iter()
189            .map(|s| s.as_ref().vortex_expect("StructArray field slot"))
190    }
191
192    fn unmasked_fields(&self) -> Arc<[ArrayRef]> {
193        self.iter_unmasked_fields().cloned().collect()
194    }
195
196    fn unmasked_field(&self, idx: usize) -> &ArrayRef {
197        self.as_ref().slots()[FIELDS_OFFSET + idx]
198            .as_ref()
199            .vortex_expect("StructArray field slot")
200    }
201
202    fn unmasked_field_by_name_opt(&self, name: impl AsRef<str>) -> Option<&ArrayRef> {
203        let name = name.as_ref();
204        self.struct_fields()
205            .find(name)
206            .map(|idx| self.unmasked_field(idx))
207    }
208
209    fn unmasked_field_by_name(&self, name: impl AsRef<str>) -> VortexResult<&ArrayRef> {
210        let name = name.as_ref();
211        self.unmasked_field_by_name_opt(name).ok_or_else(|| {
212            vortex_err!(
213                "Field {name} not found in struct array with names {:?}",
214                self.names()
215            )
216        })
217    }
218
219    fn struct_fields(&self) -> &StructFields {
220        self.as_ref().dtype().as_struct_fields()
221    }
222}
223impl<T: TypedArrayRef<Struct>> StructArrayExt for T {}
224
225impl Array<Struct> {
226    /// Creates a new `StructArray`.
227    pub fn new(
228        names: FieldNames,
229        fields: impl Into<Arc<[ArrayRef]>>,
230        length: usize,
231        validity: Validity,
232    ) -> Self {
233        Self::try_new(names, fields, length, validity)
234            .vortex_expect("StructArray construction failed")
235    }
236
237    /// Constructs a new `StructArray`.
238    pub fn try_new(
239        names: FieldNames,
240        fields: impl Into<Arc<[ArrayRef]>>,
241        length: usize,
242        validity: Validity,
243    ) -> VortexResult<Self> {
244        let fields = fields.into();
245        let field_dtypes: Vec<_> = fields.iter().map(|d| d.dtype().clone()).collect();
246        let dtype = StructFields::new(names, field_dtypes);
247        let slots = make_struct_slots(&fields, &validity, length);
248        Array::try_from_parts(
249            ArrayParts::new(
250                Struct,
251                DType::Struct(dtype, validity.nullability()),
252                length,
253                EmptyArrayData,
254            )
255            .with_slots(slots),
256        )
257    }
258
259    /// Creates a new `StructArray` without validation.
260    ///
261    /// # Safety
262    ///
263    /// Caller must ensure the field arrays match the supplied dtype, length, and validity.
264    pub unsafe fn new_unchecked(
265        fields: impl Into<Arc<[ArrayRef]>>,
266        dtype: StructFields,
267        length: usize,
268        validity: Validity,
269    ) -> Self {
270        let fields = fields.into();
271        let outer_dtype = DType::Struct(dtype, validity.nullability());
272        let slots = make_struct_slots(&fields, &validity, length);
273        unsafe {
274            Array::from_parts_unchecked(
275                ArrayParts::new(Struct, outer_dtype, length, EmptyArrayData).with_slots(slots),
276            )
277        }
278    }
279
280    /// Constructs a new `StructArray` with an explicit dtype.
281    pub fn try_new_with_dtype(
282        fields: impl Into<Arc<[ArrayRef]>>,
283        dtype: StructFields,
284        length: usize,
285        validity: Validity,
286    ) -> VortexResult<Self> {
287        let fields = fields.into();
288        let outer_dtype = DType::Struct(dtype, validity.nullability());
289        let slots = make_struct_slots(&fields, &validity, length);
290        Array::try_from_parts(
291            ArrayParts::new(Struct, outer_dtype, length, EmptyArrayData).with_slots(slots),
292        )
293    }
294
295    /// Construct a `StructArray` from named fields.
296    pub fn from_fields<N: AsRef<str>>(items: &[(N, ArrayRef)]) -> VortexResult<Self> {
297        Self::try_from_iter(items.iter().map(|(a, b)| (a, b.clone())))
298    }
299
300    /// Create a `StructArray` from an iterator of (name, array) pairs with validity.
301    pub fn try_from_iter_with_validity<
302        N: AsRef<str>,
303        A: IntoArray,
304        T: IntoIterator<Item = (N, A)>,
305    >(
306        iter: T,
307        validity: Validity,
308    ) -> VortexResult<Self> {
309        let (names, fields): (Vec<FieldName>, Vec<ArrayRef>) = iter
310            .into_iter()
311            .map(|(name, fields)| (FieldName::from(name.as_ref()), fields.into_array()))
312            .unzip();
313        let len = fields
314            .first()
315            .map(|f| f.len())
316            .ok_or_else(|| vortex_err!("StructArray cannot be constructed from an empty slice of arrays because the length is unspecified"))?;
317
318        Self::try_new(FieldNames::from_iter(names), fields, len, validity)
319    }
320
321    /// Create a `StructArray` from an iterator of (name, array) pairs.
322    pub fn try_from_iter<N: AsRef<str>, A: IntoArray, T: IntoIterator<Item = (N, A)>>(
323        iter: T,
324    ) -> VortexResult<Self> {
325        let (names, fields): (Vec<FieldName>, Vec<ArrayRef>) = iter
326            .into_iter()
327            .map(|(name, field)| (FieldName::from(name.as_ref()), field.into_array()))
328            .unzip();
329        let len = fields
330            .first()
331            .map(ArrayRef::len)
332            .ok_or_else(|| vortex_err!("StructArray cannot be constructed from an empty slice of arrays because the length is unspecified"))?;
333
334        Self::try_new(
335            FieldNames::from_iter(names),
336            fields,
337            len,
338            Validity::NonNullable,
339        )
340    }
341
342    // TODO(aduffy): Add equivalent function to support field masks for nested column access.
343    /// Return a new StructArray with the given projection applied.
344    ///
345    /// Projection does not copy data arrays. Projection is defined by an ordinal array slice
346    /// which specifies the new ordering of columns in the struct. The projection can be used to
347    /// perform column re-ordering, deletion, or duplication at a logical level, without any data
348    /// copying.
349    pub fn project(&self, projection: &[FieldName]) -> VortexResult<Self> {
350        let mut children = Vec::with_capacity(projection.len());
351        let mut names = Vec::with_capacity(projection.len());
352
353        for f_name in projection {
354            let idx = self
355                .struct_fields()
356                .find(f_name.as_ref())
357                .ok_or_else(|| vortex_err!("Unknown field {f_name}"))?;
358
359            names.push(self.names()[idx].clone());
360            children.push(self.unmasked_field(idx).clone());
361        }
362
363        Self::try_new(
364            FieldNames::from(names.as_slice()),
365            children,
366            self.len(),
367            self.validity()?,
368        )
369    }
370
371    /// Create a fieldless `StructArray` with the given length.
372    pub fn new_fieldless_with_len(len: usize) -> Self {
373        let dtype = DType::Struct(
374            StructFields::new(FieldNames::default(), Vec::new()),
375            crate::dtype::Nullability::NonNullable,
376        );
377        let slots = make_struct_slots(&[], &Validity::NonNullable, len);
378        unsafe {
379            Array::from_parts_unchecked(
380                ArrayParts::new(Struct, dtype, len, EmptyArrayData).with_slots(slots),
381            )
382        }
383    }
384
385    // TODO(ngates): remove this... it doesn't help to consume self.
386    pub fn into_data_parts(self) -> StructDataParts {
387        let fields: Arc<[ArrayRef]> = self.slots()[FIELDS_OFFSET..]
388            .iter()
389            .map(|s| s.as_ref().vortex_expect("StructArray field slot").clone())
390            .collect();
391        let validity = self.validity().vortex_expect("StructArray validity");
392        StructDataParts {
393            struct_fields: self.struct_fields().clone(),
394            fields,
395            validity,
396        }
397    }
398
399    pub fn remove_column(&self, name: impl Into<FieldName>) -> Option<(Self, ArrayRef)> {
400        let name = name.into();
401        let struct_dtype = self.struct_fields();
402        let len = self.len();
403
404        let position = struct_dtype.find(name.as_ref())?;
405
406        let slot_position = FIELDS_OFFSET + position;
407        let field = self.slots()[slot_position]
408            .as_ref()
409            .vortex_expect("StructArray field slot")
410            .clone();
411        let new_slots: Vec<Option<ArrayRef>> = self
412            .slots()
413            .iter()
414            .enumerate()
415            .filter(|(i, _)| *i != slot_position)
416            .map(|(_, s)| s.clone())
417            .collect();
418
419        let new_dtype = struct_dtype.without_field(position).ok()?;
420        let new_array = unsafe {
421            Array::from_parts_unchecked(
422                ArrayParts::new(
423                    Struct,
424                    DType::Struct(new_dtype, self.dtype().nullability()),
425                    len,
426                    EmptyArrayData,
427                )
428                .with_slots(new_slots),
429            )
430        };
431        Some((new_array, field))
432    }
433}
434
435impl Array<Struct> {
436    pub fn with_column(&self, name: impl Into<FieldName>, array: ArrayRef) -> VortexResult<Self> {
437        let name = name.into();
438        let struct_dtype = self.struct_fields();
439
440        let names = struct_dtype.names().iter().cloned().chain(once(name));
441        let types = struct_dtype.fields().chain(once(array.dtype().clone()));
442        let new_fields = StructFields::new(names.collect(), types.collect());
443
444        let children: Arc<[ArrayRef]> = self.slots()[FIELDS_OFFSET..]
445            .iter()
446            .map(|s| s.as_ref().vortex_expect("StructArray field slot").clone())
447            .chain(once(array))
448            .collect();
449
450        Self::try_new_with_dtype(children, new_fields, self.len(), self.validity()?)
451    }
452
453    pub fn remove_column_owned(&self, name: impl Into<FieldName>) -> Option<(Self, ArrayRef)> {
454        self.remove_column(name)
455    }
456}