vortex_array/arrays/struct_/
mod.rs

1// SPDX-License-Identifier: Apache-2.0
2// SPDX-FileCopyrightText: Copyright the Vortex contributors
3
4use std::fmt::Debug;
5use std::iter::once;
6
7use itertools::Itertools;
8use vortex_dtype::{DType, FieldName, FieldNames, StructFields};
9use vortex_error::{VortexExpect, VortexResult, vortex_bail, vortex_err};
10use vortex_scalar::Scalar;
11
12use crate::stats::{ArrayStats, StatsSetRef};
13use crate::validity::Validity;
14use crate::vtable::{
15    ArrayVTable, CanonicalVTable, NotSupported, OperationsVTable, VTable, ValidityHelper,
16    ValidityVTableFromValidityHelper,
17};
18use crate::{Array, ArrayRef, Canonical, EncodingId, EncodingRef, IntoArray, vtable};
19
20mod compute;
21mod serde;
22
23vtable!(Struct);
24
25impl VTable for StructVTable {
26    type Array = StructArray;
27    type Encoding = StructEncoding;
28
29    type ArrayVTable = Self;
30    type CanonicalVTable = Self;
31    type OperationsVTable = Self;
32    type ValidityVTable = ValidityVTableFromValidityHelper;
33    type VisitorVTable = Self;
34    type ComputeVTable = NotSupported;
35    type EncodeVTable = NotSupported;
36    type SerdeVTable = Self;
37
38    fn id(_encoding: &Self::Encoding) -> EncodingId {
39        EncodingId::new_ref("vortex.struct")
40    }
41
42    fn encoding(_array: &Self::Array) -> EncodingRef {
43        EncodingRef::new_ref(StructEncoding.as_ref())
44    }
45}
46
47/// A struct array that stores multiple named fields as columns, similar to a database row.
48///
49/// This mirrors the Apache Arrow Struct array encoding and provides a columnar representation
50/// of structured data where each row contains multiple named fields of potentially different types.
51///
52/// ## Data Layout
53///
54/// The struct array uses a columnar layout where:
55/// - Each field is stored as a separate child array
56/// - All fields must have the same length (number of rows)
57/// - Field names and types are defined in the struct's dtype
58/// - An optional validity mask indicates which entire rows are null
59///
60/// ## Row-level nulls
61///
/// The StructArray contains its own top-level nulls, which are superimposed on top of the
/// field-level validity values. This holds even if the fields themselves are non-nullable:
/// accessing a particular row can yield a null even if all children are valid at that position.
65///
66/// ```
67/// use vortex_array::arrays::{StructArray, BoolArray};
68/// use vortex_array::validity::Validity;
69/// use vortex_array::IntoArray;
70/// use vortex_dtype::FieldNames;
71/// use vortex_buffer::buffer;
72///
73/// // Create struct with all non-null fields but struct-level nulls
74/// let struct_array = StructArray::try_new(
75///     FieldNames::from(["a", "b", "c"]),
76///     vec![
77///         buffer![1i32, 2i32].into_array(),  // non-null field a
78///         buffer![10i32, 20i32].into_array(), // non-null field b  
79///         buffer![100i32, 200i32].into_array(), // non-null field c
80///     ],
81///     2,
82///     Validity::Array(BoolArray::from_iter([true, false]).into_array()), // row 1 is null
83/// ).unwrap();
84///
85/// // Row 0 is valid - returns a struct scalar with field values
86/// let row0 = struct_array.scalar_at(0).unwrap();
87/// assert!(!row0.is_null());
88///
89/// // Row 1 is null at struct level - returns null even though fields have values
90/// let row1 = struct_array.scalar_at(1).unwrap();
91/// assert!(row1.is_null());
92/// ```
93///
94/// ## Name uniqueness
95///
96/// It is valid for a StructArray to have multiple child columns that have the same name. In this
97/// case, any accessors that use column names will find the first column in sequence with the name.
98///
99/// ```
100/// use vortex_array::arrays::StructArray;
101/// use vortex_array::validity::Validity;
102/// use vortex_array::IntoArray;
103/// use vortex_dtype::FieldNames;
104/// use vortex_buffer::buffer;
105///
106/// // Create struct with duplicate "data" field names
107/// let struct_array = StructArray::try_new(
108///     FieldNames::from(["data", "data"]),
109///     vec![
110///         buffer![1i32, 2i32].into_array(),   // first "data"
111///         buffer![3i32, 4i32].into_array(),   // second "data"
112///     ],
113///     2,
114///     Validity::NonNullable,
115/// ).unwrap();
116///
117/// // field_by_name returns the FIRST "data" field
118/// let first_data = struct_array.field_by_name("data").unwrap();
119/// assert_eq!(first_data.scalar_at(0).unwrap(), 1i32.into());
120/// ```
121///
122/// ## Field Operations
123///
124/// Struct arrays support efficient column operations:
125/// - **Projection**: Select/reorder fields without copying data
126/// - **Field access**: Get columns by name or index
127/// - **Column addition**: Add new fields to create extended structs
128/// - **Column removal**: Remove fields to create narrower structs
129///
130/// ## Validity Semantics
131///
132/// - Row-level nulls are tracked in the struct's validity child
133/// - Individual field nulls are tracked in each field's own validity
134/// - A null struct row means all fields in that row are conceptually null
135/// - Field-level nulls can exist independently of struct-level nulls
136///
137/// # Examples
138///
139/// ```
140/// use vortex_array::arrays::{StructArray, PrimitiveArray};
141/// use vortex_array::validity::Validity;
142/// use vortex_array::IntoArray;
143/// use vortex_dtype::FieldNames;
144/// use vortex_buffer::buffer;
145///
146/// // Create arrays for each field
147/// let ids = PrimitiveArray::new(buffer![1i32, 2, 3], Validity::NonNullable);
/// let scores = PrimitiveArray::new(buffer![100u64, 200, 300], Validity::NonNullable);
///
/// // Create struct array with named fields
/// let struct_array = StructArray::try_new(
///     FieldNames::from(["id", "score"]),
///     vec![ids.into_array(), scores.into_array()],
154///     3,
155///     Validity::NonNullable,
156/// ).unwrap();
157///
158/// assert_eq!(struct_array.len(), 3);
159/// assert_eq!(struct_array.names().len(), 2);
160///
161/// // Access field by name
162/// let id_field = struct_array.field_by_name("id").unwrap();
163/// assert_eq!(id_field.len(), 3);
164/// ```
#[derive(Clone, Debug)]
pub struct StructArray {
    // Number of rows; the constructors in this file check each field against this length.
    len: usize,
    // Always a `DType::Struct`; `struct_fields` treats any other variant as unreachable.
    dtype: DType,
    // One child array per field, in the same order as the field names in `dtype`.
    fields: Vec<ArrayRef>,
    // Row-level validity; the constructors derive the struct dtype's nullability from it.
    validity: Validity,
    // Statistics holder for this array, default-initialised by the constructors.
    stats_set: ArrayStats,
}
173
/// Unit marker type for the struct encoding; it is the `VTable::Encoding` of `StructVTable`.
#[derive(Clone, Debug)]
pub struct StructEncoding;
176
177impl StructArray {
178    pub fn fields(&self) -> &[ArrayRef] {
179        &self.fields
180    }
181
182    pub fn field_by_name(&self, name: impl AsRef<str>) -> VortexResult<&ArrayRef> {
183        let name = name.as_ref();
184        self.field_by_name_opt(name).ok_or_else(|| {
185            vortex_err!(
186                "Field {name} not found in struct array with names {:?}",
187                self.names()
188            )
189        })
190    }
191
192    pub fn field_by_name_opt(&self, name: impl AsRef<str>) -> Option<&ArrayRef> {
193        let name = name.as_ref();
194        self.names()
195            .iter()
196            .position(|field_name| field_name.as_ref() == name)
197            .map(|idx| &self.fields[idx])
198    }
199
200    pub fn names(&self) -> &FieldNames {
201        self.struct_fields().names()
202    }
203
204    pub fn struct_fields(&self) -> &StructFields {
205        let Some(struct_dtype) = &self.dtype.as_struct() else {
206            unreachable!(
207                "struct arrays must have be a DType::Struct, this is likely an internal bug."
208            )
209        };
210        struct_dtype
211    }
212
213    /// Create a new `StructArray` with the given length, but without any fields.
214    pub fn new_with_len(len: usize) -> Self {
215        Self::try_new(
216            FieldNames::default(),
217            Vec::new(),
218            len,
219            Validity::NonNullable,
220        )
221        .vortex_expect("StructArray::new_with_len should not fail")
222    }
223
224    pub fn try_new(
225        names: FieldNames,
226        fields: Vec<ArrayRef>,
227        length: usize,
228        validity: Validity,
229    ) -> VortexResult<Self> {
230        let nullability = validity.nullability();
231
232        if names.len() != fields.len() {
233            vortex_bail!("Got {} names and {} fields", names.len(), fields.len());
234        }
235
236        for field in fields.iter() {
237            if field.len() != length {
238                vortex_bail!(
239                    "Expected all struct fields to have length {length}, found {}",
240                    fields.iter().map(|f| f.len()).format(","),
241                );
242            }
243        }
244
245        let field_dtypes: Vec<_> = fields.iter().map(|d| d.dtype()).cloned().collect();
246        let dtype = DType::Struct(StructFields::new(names, field_dtypes), nullability);
247
248        if length != validity.maybe_len().unwrap_or(length) {
249            vortex_bail!(
250                "array length {} and validity length must match {}",
251                length,
252                validity
253                    .maybe_len()
254                    .vortex_expect("can only fail if maybe is some")
255            )
256        }
257
258        Ok(Self {
259            len: length,
260            dtype,
261            fields,
262            validity,
263            stats_set: Default::default(),
264        })
265    }
266
267    pub fn try_new_with_dtype(
268        fields: Vec<ArrayRef>,
269        dtype: StructFields,
270        length: usize,
271        validity: Validity,
272    ) -> VortexResult<Self> {
273        for (field, struct_dt) in fields.iter().zip(dtype.fields()) {
274            if field.len() != length {
275                vortex_bail!(
276                    "Expected all struct fields to have length {length}, found {}",
277                    field.len()
278                );
279            }
280
281            if &struct_dt != field.dtype() {
282                vortex_bail!(
283                    "Expected all struct fields to have dtype {}, found {}",
284                    struct_dt,
285                    field.dtype()
286                );
287            }
288        }
289
290        Ok(Self {
291            len: length,
292            dtype: DType::Struct(dtype, validity.nullability()),
293            fields,
294            validity,
295            stats_set: Default::default(),
296        })
297    }
298
299    pub fn from_fields<N: AsRef<str>>(items: &[(N, ArrayRef)]) -> VortexResult<Self> {
300        Self::try_from_iter(items.iter().map(|(a, b)| (a, b.to_array())))
301    }
302
303    pub fn try_from_iter_with_validity<
304        N: AsRef<str>,
305        A: IntoArray,
306        T: IntoIterator<Item = (N, A)>,
307    >(
308        iter: T,
309        validity: Validity,
310    ) -> VortexResult<Self> {
311        let (names, fields): (Vec<FieldName>, Vec<ArrayRef>) = iter
312            .into_iter()
313            .map(|(name, fields)| (FieldName::from(name.as_ref()), fields.into_array()))
314            .unzip();
315        let len = fields
316            .first()
317            .map(|f| f.len())
318            .ok_or_else(|| vortex_err!("StructArray cannot be constructed from an empty slice of arrays because the length is unspecified"))?;
319
320        Self::try_new(FieldNames::from_iter(names), fields, len, validity)
321    }
322
323    pub fn try_from_iter<N: AsRef<str>, A: IntoArray, T: IntoIterator<Item = (N, A)>>(
324        iter: T,
325    ) -> VortexResult<Self> {
326        Self::try_from_iter_with_validity(iter, Validity::NonNullable)
327    }
328
329    // TODO(aduffy): Add equivalent function to support field masks for nested column access.
330    /// Return a new StructArray with the given projection applied.
331    ///
332    /// Projection does not copy data arrays. Projection is defined by an ordinal array slice
333    /// which specifies the new ordering of columns in the struct. The projection can be used to
334    /// perform column re-ordering, deletion, or duplication at a logical level, without any data
335    /// copying.
336    #[allow(clippy::same_name_method)]
337    pub fn project(&self, projection: &[FieldName]) -> VortexResult<Self> {
338        let mut children = Vec::with_capacity(projection.len());
339        let mut names = Vec::with_capacity(projection.len());
340
341        for f_name in projection.iter() {
342            let idx = self
343                .names()
344                .iter()
345                .position(|name| name == f_name)
346                .ok_or_else(|| vortex_err!("Unknown field {f_name}"))?;
347
348            names.push(self.names()[idx].clone());
349            children.push(self.fields()[idx].clone());
350        }
351
352        StructArray::try_new(
353            FieldNames::from(names.as_slice()),
354            children,
355            self.len(),
356            self.validity().clone(),
357        )
358    }
359
360    /// Removes and returns a column from the struct array by name.
361    /// If the column does not exist, returns `None`.
362    pub fn remove_column(&mut self, name: impl Into<FieldName>) -> Option<ArrayRef> {
363        let name = name.into();
364
365        let struct_dtype = self.struct_fields().clone();
366
367        let position = struct_dtype
368            .names()
369            .iter()
370            .position(|field_name| field_name.as_ref() == name.as_ref())?;
371
372        let field = self.fields.remove(position);
373
374        let new_dtype = struct_dtype.without_field(position);
375        self.dtype = DType::Struct(new_dtype, self.dtype.nullability());
376
377        Some(field)
378    }
379
380    /// Create a new StructArray by appending a new column onto the existing array.
381    pub fn with_column(&self, name: impl Into<FieldName>, array: ArrayRef) -> VortexResult<Self> {
382        let name = name.into();
383        let struct_dtype = self.struct_fields().clone();
384
385        let names = struct_dtype.names().iter().cloned().chain(once(name));
386        let types = struct_dtype.fields().chain(once(array.dtype().clone()));
387        let new_fields = StructFields::new(names.collect(), types.collect());
388
389        let mut children = self.fields.clone();
390        children.push(array);
391
392        Self::try_new_with_dtype(children, new_fields, self.len, self.validity.clone())
393    }
394}
395
396impl ValidityHelper for StructArray {
397    fn validity(&self) -> &Validity {
398        &self.validity
399    }
400}
401
402impl ArrayVTable<StructVTable> for StructVTable {
403    fn len(array: &StructArray) -> usize {
404        array.len
405    }
406
407    fn dtype(array: &StructArray) -> &DType {
408        &array.dtype
409    }
410
411    fn stats(array: &StructArray) -> StatsSetRef<'_> {
412        array.stats_set.to_ref(array.as_ref())
413    }
414}
415
416impl CanonicalVTable<StructVTable> for StructVTable {
417    fn canonicalize(array: &StructArray) -> VortexResult<Canonical> {
418        Ok(Canonical::Struct(array.clone()))
419    }
420}
421
422impl OperationsVTable<StructVTable> for StructVTable {
423    fn slice(array: &StructArray, start: usize, stop: usize) -> VortexResult<ArrayRef> {
424        let fields = array
425            .fields()
426            .iter()
427            .map(|field| field.slice(start, stop))
428            .try_collect()?;
429        StructArray::try_new_with_dtype(
430            fields,
431            array.struct_fields().clone(),
432            stop - start,
433            array.validity().slice(start, stop)?,
434        )
435        .map(|a| a.into_array())
436    }
437
438    fn scalar_at(array: &StructArray, index: usize) -> VortexResult<Scalar> {
439        if array.is_valid(index)? {
440            Ok(Scalar::struct_(
441                array.dtype().clone(),
442                array
443                    .fields()
444                    .iter()
445                    .map(|field| field.scalar_at(index))
446                    .try_collect()?,
447            ))
448        } else {
449            Ok(Scalar::null(array.dtype().clone()))
450        }
451    }
452}
453
454#[cfg(test)]
455mod test {
456    use vortex_buffer::buffer;
457    use vortex_dtype::{DType, FieldName, FieldNames, Nullability, PType};
458
459    use crate::IntoArray;
460    use crate::arrays::primitive::PrimitiveArray;
461    use crate::arrays::struct_::StructArray;
462    use crate::arrays::varbin::VarBinArray;
463    use crate::arrays::{BoolArray, BoolVTable, PrimitiveVTable};
464    use crate::validity::Validity;
465
466    #[test]
467    fn test_project() {
468        let xs = PrimitiveArray::new(buffer![0i64, 1, 2, 3, 4], Validity::NonNullable);
469        let ys = VarBinArray::from_vec(
470            vec!["a", "b", "c", "d", "e"],
471            DType::Utf8(Nullability::NonNullable),
472        );
473        let zs = BoolArray::from_iter([true, true, true, false, false]);
474
475        let struct_a = StructArray::try_new(
476            FieldNames::from(["xs", "ys", "zs"]),
477            vec![xs.into_array(), ys.into_array(), zs.into_array()],
478            5,
479            Validity::NonNullable,
480        )
481        .unwrap();
482
483        let struct_b = struct_a
484            .project(&[FieldName::from("zs"), FieldName::from("xs")])
485            .unwrap();
486        assert_eq!(
487            struct_b.names().as_ref(),
488            [FieldName::from("zs"), FieldName::from("xs")],
489        );
490
491        assert_eq!(struct_b.len(), 5);
492
493        let bools = &struct_b.fields[0];
494        assert_eq!(
495            bools
496                .as_::<BoolVTable>()
497                .boolean_buffer()
498                .iter()
499                .collect::<Vec<_>>(),
500            vec![true, true, true, false, false]
501        );
502
503        let prims = &struct_b.fields[1];
504        assert_eq!(
505            prims.as_::<PrimitiveVTable>().as_slice::<i64>(),
506            [0i64, 1, 2, 3, 4]
507        );
508    }
509
510    #[test]
511    fn test_remove_column() {
512        let xs = PrimitiveArray::new(buffer![0i64, 1, 2, 3, 4], Validity::NonNullable);
513        let ys = PrimitiveArray::new(buffer![4u64, 5, 6, 7, 8], Validity::NonNullable);
514
515        let mut struct_a = StructArray::try_new(
516            FieldNames::from(["xs", "ys"]),
517            vec![xs.into_array(), ys.into_array()],
518            5,
519            Validity::NonNullable,
520        )
521        .unwrap();
522
523        let removed = struct_a.remove_column("xs").unwrap();
524        assert_eq!(
525            removed.dtype(),
526            &DType::Primitive(PType::I64, Nullability::NonNullable)
527        );
528        assert_eq!(
529            removed.as_::<PrimitiveVTable>().as_slice::<i64>(),
530            [0i64, 1, 2, 3, 4]
531        );
532
533        assert_eq!(struct_a.names(), &[FieldName::from("ys")].into());
534        assert_eq!(struct_a.fields.len(), 1);
535        assert_eq!(struct_a.len(), 5);
536        assert_eq!(
537            struct_a.fields[0].dtype(),
538            &DType::Primitive(PType::U64, Nullability::NonNullable)
539        );
540        assert_eq!(
541            struct_a.fields[0]
542                .as_::<PrimitiveVTable>()
543                .as_slice::<u64>(),
544            [4u64, 5, 6, 7, 8]
545        );
546
547        let empty = struct_a.remove_column("non_existent");
548        assert!(
549            empty.is_none(),
550            "Expected None when removing non-existent column"
551        );
552        assert_eq!(struct_a.names(), &[FieldName::from("ys")].into());
553    }
554
555    #[test]
556    fn test_duplicate_field_names() {
557        // Test that StructArray allows duplicate field names and returns the first match
558        let field1 = buffer![1i32, 2, 3].into_array();
559        let field2 = buffer![10i32, 20, 30].into_array();
560        let field3 = buffer![100i32, 200, 300].into_array();
561
562        // Create struct with duplicate field names - "value" appears twice
563        let struct_array = StructArray::try_new(
564            FieldNames::from(["value", "other", "value"]),
565            vec![field1, field2, field3],
566            3,
567            Validity::NonNullable,
568        )
569        .unwrap();
570
571        // field_by_name should return the first field with the matching name
572        let first_value_field = struct_array.field_by_name("value").unwrap();
573        assert_eq!(
574            first_value_field.as_::<PrimitiveVTable>().as_slice::<i32>(),
575            [1i32, 2, 3] // This is field1, not field3
576        );
577
578        // Verify field_by_name_opt also returns the first match
579        let opt_field = struct_array.field_by_name_opt("value").unwrap();
580        assert_eq!(
581            opt_field.as_::<PrimitiveVTable>().as_slice::<i32>(),
582            [1i32, 2, 3] // First "value" field
583        );
584
585        // Verify the third field (second "value") can be accessed by index
586        let third_field = &struct_array.fields()[2];
587        assert_eq!(
588            third_field.as_::<PrimitiveVTable>().as_slice::<i32>(),
589            [100i32, 200, 300]
590        );
591    }
592}