Skip to main content

vortex_array/arrays/struct_/
array.rs

1// SPDX-License-Identifier: Apache-2.0
2// SPDX-FileCopyrightText: Copyright the Vortex contributors
3
4use std::iter::once;
5use std::sync::Arc;
6
7use vortex_error::VortexExpect;
8use vortex_error::VortexResult;
9use vortex_error::vortex_err;
10
11use crate::ArrayRef;
12use crate::IntoArray;
13use crate::array::Array;
14use crate::array::ArrayParts;
15use crate::array::EmptyArrayData;
16use crate::array::TypedArrayRef;
17use crate::array::child_to_validity;
18use crate::array::validity_to_child;
19use crate::arrays::Struct;
20use crate::dtype::DType;
21use crate::dtype::FieldName;
22use crate::dtype::FieldNames;
23use crate::dtype::StructFields;
24use crate::validity::Validity;
25
26// StructArray has a variable number of slots: [validity?, field_0, ..., field_N]
27/// The validity bitmap indicating which struct elements are non-null.
28pub(super) const VALIDITY_SLOT: usize = 0;
29/// The offset at which the struct field arrays begin in the slots vector.
30pub(super) const FIELDS_OFFSET: usize = 1;
31
32/// A struct array that stores multiple named fields as columns, similar to a database row.
33///
34/// This mirrors the Apache Arrow Struct array encoding and provides a columnar representation
35/// of structured data where each row contains multiple named fields of potentially different types.
36///
37/// ## Data Layout
38///
39/// The struct array uses a columnar layout where:
40/// - Each field is stored as a separate child array
41/// - All fields must have the same length (number of rows)
42/// - Field names and types are defined in the struct's dtype
43/// - An optional validity mask indicates which entire rows are null
44///
45/// ## Row-level nulls
46///
/// The StructArray contains its own top-level nulls, which are superimposed on top of the
/// field-level validity values. This can be the case even if the fields themselves are
/// non-nullable: accessing a particular row can yield a null even if all children are valid
/// at that position.
50///
51/// ```
52/// use vortex_array::arrays::{StructArray, BoolArray};
53/// use vortex_array::validity::Validity;
54/// use vortex_array::dtype::FieldNames;
55/// use vortex_array::IntoArray;
56/// use vortex_buffer::buffer;
57///
58/// // Create struct with all non-null fields but struct-level nulls
59/// let struct_array = StructArray::try_new(
60///     FieldNames::from(["a", "b", "c"]),
61///     vec![
62///         buffer![1i32, 2i32].into_array(),  // non-null field a
63///         buffer![10i32, 20i32].into_array(), // non-null field b
64///         buffer![100i32, 200i32].into_array(), // non-null field c
65///     ],
66///     2,
67///     Validity::Array(BoolArray::from_iter([true, false]).into_array()), // row 1 is null
68/// ).unwrap();
69///
70/// // Row 0 is valid - returns a struct scalar with field values
71/// let row0 = struct_array.scalar_at(0).unwrap();
72/// assert!(!row0.is_null());
73///
74/// // Row 1 is null at struct level - returns null even though fields have values
75/// let row1 = struct_array.scalar_at(1).unwrap();
76/// assert!(row1.is_null());
77/// ```
78///
79/// ## Name uniqueness
80///
81/// It is valid for a StructArray to have multiple child columns that have the same name. In this
82/// case, any accessors that use column names will find the first column in sequence with the name.
83///
84/// ```
85/// use vortex_array::arrays::StructArray;
86/// use vortex_array::arrays::struct_::StructArrayExt;
87/// use vortex_array::validity::Validity;
88/// use vortex_array::dtype::FieldNames;
89/// use vortex_array::IntoArray;
90/// use vortex_buffer::buffer;
91///
92/// // Create struct with duplicate "data" field names
93/// let struct_array = StructArray::try_new(
94///     FieldNames::from(["data", "data"]),
95///     vec![
96///         buffer![1i32, 2i32].into_array(),   // first "data"
97///         buffer![3i32, 4i32].into_array(),   // second "data"
98///     ],
99///     2,
100///     Validity::NonNullable,
101/// ).unwrap();
102///
/// // unmasked_field_by_name returns the FIRST "data" field
104/// let first_data = struct_array.unmasked_field_by_name("data").unwrap();
105/// assert_eq!(first_data.scalar_at(0).unwrap(), 1i32.into());
106/// ```
107///
108/// ## Field Operations
109///
110/// Struct arrays support efficient column operations:
111/// - **Projection**: Select/reorder fields without copying data
112/// - **Field access**: Get columns by name or index
113/// - **Column addition**: Add new fields to create extended structs
114/// - **Column removal**: Remove fields to create narrower structs
115///
116/// ## Validity Semantics
117///
118/// - Row-level nulls are tracked in the struct's validity child
119/// - Individual field nulls are tracked in each field's own validity
120/// - A null struct row means all fields in that row are conceptually null
121/// - Field-level nulls can exist independently of struct-level nulls
122///
123/// # Examples
124///
125/// ```
126/// use vortex_array::arrays::{StructArray, PrimitiveArray};
127/// use vortex_array::arrays::struct_::StructArrayExt;
128/// use vortex_array::validity::Validity;
129/// use vortex_array::dtype::FieldNames;
130/// use vortex_array::IntoArray;
131/// use vortex_buffer::buffer;
132///
133/// // Create arrays for each field
134/// let ids = PrimitiveArray::new(buffer![1i32, 2, 3], Validity::NonNullable);
135/// let names = PrimitiveArray::new(buffer![100u64, 200, 300], Validity::NonNullable);
136///
137/// // Create struct array with named fields
138/// let struct_array = StructArray::try_new(
139///     FieldNames::from(["id", "score"]),
140///     vec![ids.into_array(), names.into_array()],
141///     3,
142///     Validity::NonNullable,
143/// ).unwrap();
144///
145/// assert_eq!(struct_array.len(), 3);
146/// assert_eq!(struct_array.names().len(), 2);
147///
148/// // Access field by name
149/// let id_field = struct_array.unmasked_field_by_name("id").unwrap();
150/// assert_eq!(id_field.len(), 3);
151/// ```
152pub struct StructDataParts {
153    pub struct_fields: StructFields,
154    pub fields: Arc<[ArrayRef]>,
155    pub validity: Validity,
156}
157
158pub(super) fn make_struct_slots(
159    fields: &[ArrayRef],
160    validity: &Validity,
161    length: usize,
162) -> Vec<Option<ArrayRef>> {
163    once(validity_to_child(validity, length))
164        .chain(fields.iter().cloned().map(Some))
165        .collect()
166}
167
168pub trait StructArrayExt: TypedArrayRef<Struct> {
169    fn nullability(&self) -> crate::dtype::Nullability {
170        match self.as_ref().dtype() {
171            DType::Struct(_, nullability) => *nullability,
172            _ => unreachable!("StructArrayExt requires a struct dtype"),
173        }
174    }
175
176    fn names(&self) -> &FieldNames {
177        self.as_ref().dtype().as_struct_fields().names()
178    }
179
180    fn struct_validity(&self) -> Validity {
181        child_to_validity(&self.as_ref().slots()[VALIDITY_SLOT], self.nullability())
182    }
183
184    fn iter_unmasked_fields(&self) -> impl Iterator<Item = &ArrayRef> + '_ {
185        self.as_ref().slots()[FIELDS_OFFSET..]
186            .iter()
187            .map(|s| s.as_ref().vortex_expect("StructArray field slot"))
188    }
189
190    fn unmasked_fields(&self) -> Arc<[ArrayRef]> {
191        self.iter_unmasked_fields().cloned().collect()
192    }
193
194    fn unmasked_field(&self, idx: usize) -> &ArrayRef {
195        self.as_ref().slots()[FIELDS_OFFSET + idx]
196            .as_ref()
197            .vortex_expect("StructArray field slot")
198    }
199
200    fn unmasked_field_by_name_opt(&self, name: impl AsRef<str>) -> Option<&ArrayRef> {
201        let name = name.as_ref();
202        self.struct_fields()
203            .find(name)
204            .map(|idx| self.unmasked_field(idx))
205    }
206
207    fn unmasked_field_by_name(&self, name: impl AsRef<str>) -> VortexResult<&ArrayRef> {
208        let name = name.as_ref();
209        self.unmasked_field_by_name_opt(name).ok_or_else(|| {
210            vortex_err!(
211                "Field {name} not found in struct array with names {:?}",
212                self.names()
213            )
214        })
215    }
216
217    fn struct_fields(&self) -> &StructFields {
218        self.as_ref().dtype().as_struct_fields()
219    }
220}
221impl<T: TypedArrayRef<Struct>> StructArrayExt for T {}
222
223impl Array<Struct> {
224    /// Creates a new `StructArray`.
225    pub fn new(
226        names: FieldNames,
227        fields: impl Into<Arc<[ArrayRef]>>,
228        length: usize,
229        validity: Validity,
230    ) -> Self {
231        Self::try_new(names, fields, length, validity)
232            .vortex_expect("StructArray construction failed")
233    }
234
235    /// Constructs a new `StructArray`.
236    pub fn try_new(
237        names: FieldNames,
238        fields: impl Into<Arc<[ArrayRef]>>,
239        length: usize,
240        validity: Validity,
241    ) -> VortexResult<Self> {
242        let fields = fields.into();
243        let field_dtypes: Vec<_> = fields.iter().map(|d| d.dtype().clone()).collect();
244        let dtype = StructFields::new(names, field_dtypes);
245        let slots = make_struct_slots(&fields, &validity, length);
246        Array::try_from_parts(
247            ArrayParts::new(
248                Struct,
249                DType::Struct(dtype, validity.nullability()),
250                length,
251                EmptyArrayData,
252            )
253            .with_slots(slots),
254        )
255    }
256
257    /// Creates a new `StructArray` without validation.
258    ///
259    /// # Safety
260    ///
261    /// Caller must ensure the field arrays match the supplied dtype, length, and validity.
262    pub unsafe fn new_unchecked(
263        fields: impl Into<Arc<[ArrayRef]>>,
264        dtype: StructFields,
265        length: usize,
266        validity: Validity,
267    ) -> Self {
268        let fields = fields.into();
269        let outer_dtype = DType::Struct(dtype, validity.nullability());
270        let slots = make_struct_slots(&fields, &validity, length);
271        unsafe {
272            Array::from_parts_unchecked(
273                ArrayParts::new(Struct, outer_dtype, length, EmptyArrayData).with_slots(slots),
274            )
275        }
276    }
277
278    /// Constructs a new `StructArray` with an explicit dtype.
279    pub fn try_new_with_dtype(
280        fields: impl Into<Arc<[ArrayRef]>>,
281        dtype: StructFields,
282        length: usize,
283        validity: Validity,
284    ) -> VortexResult<Self> {
285        let fields = fields.into();
286        let outer_dtype = DType::Struct(dtype, validity.nullability());
287        let slots = make_struct_slots(&fields, &validity, length);
288        Array::try_from_parts(
289            ArrayParts::new(Struct, outer_dtype, length, EmptyArrayData).with_slots(slots),
290        )
291    }
292
293    /// Construct a `StructArray` from named fields.
294    pub fn from_fields<N: AsRef<str>>(items: &[(N, ArrayRef)]) -> VortexResult<Self> {
295        Self::try_from_iter(items.iter().map(|(a, b)| (a, b.clone())))
296    }
297
298    /// Create a `StructArray` from an iterator of (name, array) pairs with validity.
299    pub fn try_from_iter_with_validity<
300        N: AsRef<str>,
301        A: IntoArray,
302        T: IntoIterator<Item = (N, A)>,
303    >(
304        iter: T,
305        validity: Validity,
306    ) -> VortexResult<Self> {
307        let (names, fields): (Vec<FieldName>, Vec<ArrayRef>) = iter
308            .into_iter()
309            .map(|(name, fields)| (FieldName::from(name.as_ref()), fields.into_array()))
310            .unzip();
311        let len = fields
312            .first()
313            .map(|f| f.len())
314            .ok_or_else(|| vortex_err!("StructArray cannot be constructed from an empty slice of arrays because the length is unspecified"))?;
315
316        Self::try_new(FieldNames::from_iter(names), fields, len, validity)
317    }
318
319    /// Create a `StructArray` from an iterator of (name, array) pairs.
320    pub fn try_from_iter<N: AsRef<str>, A: IntoArray, T: IntoIterator<Item = (N, A)>>(
321        iter: T,
322    ) -> VortexResult<Self> {
323        let (names, fields): (Vec<FieldName>, Vec<ArrayRef>) = iter
324            .into_iter()
325            .map(|(name, field)| (FieldName::from(name.as_ref()), field.into_array()))
326            .unzip();
327        let len = fields
328            .first()
329            .map(ArrayRef::len)
330            .ok_or_else(|| vortex_err!("StructArray cannot be constructed from an empty slice of arrays because the length is unspecified"))?;
331
332        Self::try_new(
333            FieldNames::from_iter(names),
334            fields,
335            len,
336            Validity::NonNullable,
337        )
338    }
339
340    // TODO(aduffy): Add equivalent function to support field masks for nested column access.
341    /// Return a new StructArray with the given projection applied.
342    ///
343    /// Projection does not copy data arrays. Projection is defined by an ordinal array slice
344    /// which specifies the new ordering of columns in the struct. The projection can be used to
345    /// perform column re-ordering, deletion, or duplication at a logical level, without any data
346    /// copying.
347    pub fn project(&self, projection: &[FieldName]) -> VortexResult<Self> {
348        let mut children = Vec::with_capacity(projection.len());
349        let mut names = Vec::with_capacity(projection.len());
350
351        for f_name in projection {
352            let idx = self
353                .struct_fields()
354                .find(f_name.as_ref())
355                .ok_or_else(|| vortex_err!("Unknown field {f_name}"))?;
356
357            names.push(self.names()[idx].clone());
358            children.push(self.unmasked_field(idx).clone());
359        }
360
361        Self::try_new(
362            FieldNames::from(names.as_slice()),
363            children,
364            self.len(),
365            self.validity()?,
366        )
367    }
368
369    /// Create a fieldless `StructArray` with the given length.
370    pub fn new_fieldless_with_len(len: usize) -> Self {
371        let dtype = DType::Struct(
372            StructFields::new(FieldNames::default(), Vec::new()),
373            crate::dtype::Nullability::NonNullable,
374        );
375        let slots = make_struct_slots(&[], &Validity::NonNullable, len);
376        unsafe {
377            Array::from_parts_unchecked(
378                ArrayParts::new(Struct, dtype, len, EmptyArrayData).with_slots(slots),
379            )
380        }
381    }
382
383    // TODO(ngates): remove this... it doesn't help to consume self.
384    pub fn into_data_parts(self) -> StructDataParts {
385        let fields: Arc<[ArrayRef]> = self.slots()[FIELDS_OFFSET..]
386            .iter()
387            .map(|s| s.as_ref().vortex_expect("StructArray field slot").clone())
388            .collect();
389        let validity = self.validity().vortex_expect("StructArray validity");
390        StructDataParts {
391            struct_fields: self.struct_fields().clone(),
392            fields,
393            validity,
394        }
395    }
396
397    pub fn remove_column(&self, name: impl Into<FieldName>) -> Option<(Self, ArrayRef)> {
398        let name = name.into();
399        let struct_dtype = self.struct_fields();
400        let len = self.len();
401
402        let position = struct_dtype.find(name.as_ref())?;
403
404        let slot_position = FIELDS_OFFSET + position;
405        let field = self.slots()[slot_position]
406            .as_ref()
407            .vortex_expect("StructArray field slot")
408            .clone();
409        let new_slots: Vec<Option<ArrayRef>> = self
410            .slots()
411            .iter()
412            .enumerate()
413            .filter(|(i, _)| *i != slot_position)
414            .map(|(_, s)| s.clone())
415            .collect();
416
417        let new_dtype = struct_dtype.without_field(position).ok()?;
418        let new_array = unsafe {
419            Array::from_parts_unchecked(
420                ArrayParts::new(
421                    Struct,
422                    DType::Struct(new_dtype, self.dtype().nullability()),
423                    len,
424                    EmptyArrayData,
425                )
426                .with_slots(new_slots),
427            )
428        };
429        Some((new_array, field))
430    }
431}
432
433impl Array<Struct> {
434    pub fn with_column(&self, name: impl Into<FieldName>, array: ArrayRef) -> VortexResult<Self> {
435        let name = name.into();
436        let struct_dtype = self.struct_fields();
437
438        let names = struct_dtype.names().iter().cloned().chain(once(name));
439        let types = struct_dtype.fields().chain(once(array.dtype().clone()));
440        let new_fields = StructFields::new(names.collect(), types.collect());
441
442        let children: Arc<[ArrayRef]> = self.slots()[FIELDS_OFFSET..]
443            .iter()
444            .map(|s| s.as_ref().vortex_expect("StructArray field slot").clone())
445            .chain(once(array))
446            .collect();
447
448        Self::try_new_with_dtype(children, new_fields, self.len(), self.validity()?)
449    }
450
451    pub fn remove_column_owned(&self, name: impl Into<FieldName>) -> Option<(Self, ArrayRef)> {
452        self.remove_column(name)
453    }
454}