vortex_array/arrays/struct_/
mod.rs

1// SPDX-License-Identifier: Apache-2.0
2// SPDX-FileCopyrightText: Copyright the Vortex contributors
3
4use std::fmt::Debug;
5use std::iter::once;
6use std::ops::Range;
7
8use itertools::Itertools;
9use vortex_dtype::{DType, FieldName, FieldNames, StructFields};
10use vortex_error::{VortexExpect, VortexResult, vortex_bail, vortex_err};
11use vortex_scalar::Scalar;
12
13use crate::stats::{ArrayStats, StatsSetRef};
14use crate::validity::Validity;
15use crate::vtable::{
16    ArrayVTable, CanonicalVTable, NotSupported, OperationsVTable, VTable, ValidityHelper,
17    ValidityVTableFromValidityHelper,
18};
19use crate::{Array, ArrayRef, Canonical, EncodingId, EncodingRef, IntoArray, vtable};
20
// Submodules of the struct array encoding; their contents live in sibling files.
mod compute;
mod operator;
mod serde;

// NOTE(review): `StructVTable` is used below but is not defined in this file, so it is
// presumably generated by this macro invocation — confirm against the `vtable!` definition.
vtable!(Struct);
26
27impl VTable for StructVTable {
28    type Array = StructArray;
29    type Encoding = StructEncoding;
30
31    type ArrayVTable = Self;
32    type CanonicalVTable = Self;
33    type OperationsVTable = Self;
34    type ValidityVTable = ValidityVTableFromValidityHelper;
35    type VisitorVTable = Self;
36    type ComputeVTable = NotSupported;
37    type EncodeVTable = NotSupported;
38    type PipelineVTable = Self;
39    type SerdeVTable = Self;
40
41    fn id(_encoding: &Self::Encoding) -> EncodingId {
42        EncodingId::new_ref("vortex.struct")
43    }
44
45    fn encoding(_array: &Self::Array) -> EncodingRef {
46        EncodingRef::new_ref(StructEncoding.as_ref())
47    }
48}
49
50/// A struct array that stores multiple named fields as columns, similar to a database row.
51///
52/// This mirrors the Apache Arrow Struct array encoding and provides a columnar representation
53/// of structured data where each row contains multiple named fields of potentially different types.
54///
55/// ## Data Layout
56///
57/// The struct array uses a columnar layout where:
58/// - Each field is stored as a separate child array
59/// - All fields must have the same length (number of rows)
60/// - Field names and types are defined in the struct's dtype
61/// - An optional validity mask indicates which entire rows are null
62///
63/// ## Row-level nulls
64///
/// The StructArray contains its own top-level nulls, which are superimposed on top of the
/// field-level validity values. This can be the case even if the fields themselves are
/// non-nullable: accessing a particular row can yield null even if all children are valid at
/// that position.
68///
69/// ```
70/// use vortex_array::arrays::{StructArray, BoolArray};
71/// use vortex_array::validity::Validity;
72/// use vortex_array::IntoArray;
73/// use vortex_dtype::FieldNames;
74/// use vortex_buffer::buffer;
75///
76/// // Create struct with all non-null fields but struct-level nulls
77/// let struct_array = StructArray::try_new(
78///     FieldNames::from(["a", "b", "c"]),
79///     vec![
80///         buffer![1i32, 2i32].into_array(),  // non-null field a
81///         buffer![10i32, 20i32].into_array(), // non-null field b
82///         buffer![100i32, 200i32].into_array(), // non-null field c
83///     ],
84///     2,
85///     Validity::Array(BoolArray::from_iter([true, false]).into_array()), // row 1 is null
86/// ).unwrap();
87///
88/// // Row 0 is valid - returns a struct scalar with field values
89/// let row0 = struct_array.scalar_at(0);
90/// assert!(!row0.is_null());
91///
92/// // Row 1 is null at struct level - returns null even though fields have values
93/// let row1 = struct_array.scalar_at(1);
94/// assert!(row1.is_null());
95/// ```
96///
97/// ## Name uniqueness
98///
99/// It is valid for a StructArray to have multiple child columns that have the same name. In this
100/// case, any accessors that use column names will find the first column in sequence with the name.
101///
102/// ```
103/// use vortex_array::arrays::StructArray;
104/// use vortex_array::validity::Validity;
105/// use vortex_array::IntoArray;
106/// use vortex_dtype::FieldNames;
107/// use vortex_buffer::buffer;
108///
109/// // Create struct with duplicate "data" field names
110/// let struct_array = StructArray::try_new(
111///     FieldNames::from(["data", "data"]),
112///     vec![
113///         buffer![1i32, 2i32].into_array(),   // first "data"
114///         buffer![3i32, 4i32].into_array(),   // second "data"
115///     ],
116///     2,
117///     Validity::NonNullable,
118/// ).unwrap();
119///
120/// // field_by_name returns the FIRST "data" field
121/// let first_data = struct_array.field_by_name("data").unwrap();
122/// assert_eq!(first_data.scalar_at(0), 1i32.into());
123/// ```
124///
125/// ## Field Operations
126///
127/// Struct arrays support efficient column operations:
128/// - **Projection**: Select/reorder fields without copying data
129/// - **Field access**: Get columns by name or index
130/// - **Column addition**: Add new fields to create extended structs
131/// - **Column removal**: Remove fields to create narrower structs
132///
133/// ## Validity Semantics
134///
135/// - Row-level nulls are tracked in the struct's validity child
136/// - Individual field nulls are tracked in each field's own validity
137/// - A null struct row means all fields in that row are conceptually null
138/// - Field-level nulls can exist independently of struct-level nulls
139///
140/// # Examples
141///
142/// ```
143/// use vortex_array::arrays::{StructArray, PrimitiveArray};
144/// use vortex_array::validity::Validity;
145/// use vortex_array::IntoArray;
146/// use vortex_dtype::FieldNames;
147/// use vortex_buffer::buffer;
148///
149/// // Create arrays for each field
150/// let ids = PrimitiveArray::new(buffer![1i32, 2, 3], Validity::NonNullable);
/// let scores = PrimitiveArray::new(buffer![100u64, 200, 300], Validity::NonNullable);
///
/// // Create struct array with named fields
/// let struct_array = StructArray::try_new(
///     FieldNames::from(["id", "score"]),
///     vec![ids.into_array(), scores.into_array()],
157///     3,
158///     Validity::NonNullable,
159/// ).unwrap();
160///
161/// assert_eq!(struct_array.len(), 3);
162/// assert_eq!(struct_array.names().len(), 2);
163///
164/// // Access field by name
165/// let id_field = struct_array.field_by_name("id").unwrap();
166/// assert_eq!(id_field.len(), 3);
167/// ```
#[derive(Clone, Debug)]
pub struct StructArray {
    // Number of struct rows; every child in `fields` is validated to have this length.
    len: usize,
    // Always a `DType::Struct`, carrying the field names/types and the nullability
    // derived from `validity`.
    dtype: DType,
    // One child array per field, in the same order as the names in `dtype`.
    fields: Vec<ArrayRef>,
    // Row-level validity of the struct itself, independent of each child's own validity.
    validity: Validity,
    // Statistics storage for this array; starts out as the default (empty) value.
    stats_set: ArrayStats,
}
176
/// Stateless marker type for the struct encoding; it is the `Encoding` associated type of
/// `StructVTable`.
#[derive(Clone, Debug)]
pub struct StructEncoding;
179
180impl StructArray {
181    pub fn fields(&self) -> &[ArrayRef] {
182        &self.fields
183    }
184
185    pub fn field_by_name(&self, name: impl AsRef<str>) -> VortexResult<&ArrayRef> {
186        let name = name.as_ref();
187        self.field_by_name_opt(name).ok_or_else(|| {
188            vortex_err!(
189                "Field {name} not found in struct array with names {:?}",
190                self.names()
191            )
192        })
193    }
194
195    pub fn field_by_name_opt(&self, name: impl AsRef<str>) -> Option<&ArrayRef> {
196        let name = name.as_ref();
197        self.names()
198            .iter()
199            .position(|field_name| field_name.as_ref() == name)
200            .map(|idx| &self.fields[idx])
201    }
202
203    pub fn names(&self) -> &FieldNames {
204        self.struct_fields().names()
205    }
206
207    pub fn struct_fields(&self) -> &StructFields {
208        let Some(struct_dtype) = &self.dtype.as_struct_fields_opt() else {
209            unreachable!(
210                "struct arrays must have be a DType::Struct, this is likely an internal bug."
211            )
212        };
213        struct_dtype
214    }
215
216    /// Create a new `StructArray` with the given length, but without any fields.
217    pub fn new_fieldless_with_len(len: usize) -> Self {
218        Self::try_new(
219            FieldNames::default(),
220            Vec::new(),
221            len,
222            Validity::NonNullable,
223        )
224        .vortex_expect("StructArray::new_with_len should not fail")
225    }
226
227    /// Creates a new [`StructArray`].
228    ///
229    /// # Panics
230    ///
231    /// Panics if the provided components do not satisfy the invariants documented
232    /// in [`StructArray::new_unchecked`].
233    pub fn new(
234        names: FieldNames,
235        fields: Vec<ArrayRef>,
236        length: usize,
237        validity: Validity,
238    ) -> Self {
239        Self::try_new(names, fields, length, validity)
240            .vortex_expect("StructArray construction failed")
241    }
242
243    /// Constructs a new `StructArray`.
244    ///
245    /// See [`StructArray::new_unchecked`] for more information.
246    ///
247    /// # Errors
248    ///
249    /// Returns an error if the provided components do not satisfy the invariants documented in
250    /// [`StructArray::new_unchecked`].
251    pub fn try_new(
252        names: FieldNames,
253        fields: Vec<ArrayRef>,
254        length: usize,
255        validity: Validity,
256    ) -> VortexResult<Self> {
257        let field_dtypes: Vec<_> = fields.iter().map(|d| d.dtype()).cloned().collect();
258        let dtype = StructFields::new(names, field_dtypes);
259
260        Self::validate(&fields, &dtype, length, &validity)?;
261
262        // SAFETY: validate ensures all invariants are met.
263        Ok(unsafe { Self::new_unchecked(fields, dtype, length, validity) })
264    }
265
266    /// Creates a new [`StructArray`] without validation from these components:
267    ///
268    /// * `fields` is a vector of arrays, one for each field in the struct.
269    /// * `dtype` contains the field names and types.
270    /// * `length` is the number of struct rows.
271    /// * `validity` holds the null values.
272    ///
273    /// # Safety
274    ///
275    /// The caller must ensure all of the following invariants are satisfied:
276    ///
277    /// ## Field Requirements
278    ///
279    /// - `fields.len()` must exactly equal `dtype.names().len()`.
280    /// - Every field array in `fields` must have length exactly equal to `length`.
281    /// - For each index `i`, `fields[i].dtype()` must exactly match `dtype.fields()[i]`.
282    ///
283    /// ## Type Requirements
284    ///
285    /// - Field names in `dtype` may be duplicated (this is explicitly allowed).
286    /// - The nullability of `dtype` must match the nullability of `validity`.
287    ///
288    /// ## Validity Requirements
289    ///
290    /// - If `validity` is [`Validity::Array`], its length must exactly equal `length`.
291    pub unsafe fn new_unchecked(
292        fields: Vec<ArrayRef>,
293        dtype: StructFields,
294        length: usize,
295        validity: Validity,
296    ) -> Self {
297        Self {
298            len: length,
299            dtype: DType::Struct(dtype, validity.nullability()),
300            fields,
301            validity,
302            stats_set: Default::default(),
303        }
304    }
305
306    /// Validates the components that would be used to create a [`StructArray`].
307    ///
308    /// This function checks all the invariants required by [`StructArray::new_unchecked`].
309    pub(crate) fn validate(
310        fields: &[ArrayRef],
311        dtype: &StructFields,
312        length: usize,
313        validity: &Validity,
314    ) -> VortexResult<()> {
315        // Check field count matches
316        if fields.len() != dtype.names().len() {
317            vortex_bail!(
318                "Got {} fields but dtype has {} names",
319                fields.len(),
320                dtype.names().len()
321            );
322        }
323
324        // Check each field's length and dtype
325        for (i, (field, struct_dt)) in fields.iter().zip(dtype.fields()).enumerate() {
326            if field.len() != length {
327                vortex_bail!(
328                    "Field {} has length {} but expected {}",
329                    i,
330                    field.len(),
331                    length
332                );
333            }
334
335            if field.dtype() != &struct_dt {
336                vortex_bail!(
337                    "Field {} has dtype {} but expected {}",
338                    i,
339                    field.dtype(),
340                    struct_dt
341                );
342            }
343        }
344
345        // Check validity length
346        if let Some(validity_len) = validity.maybe_len()
347            && validity_len != length
348        {
349            vortex_bail!(
350                "Validity has length {} but expected {}",
351                validity_len,
352                length
353            );
354        }
355
356        Ok(())
357    }
358
359    pub fn try_new_with_dtype(
360        fields: Vec<ArrayRef>,
361        dtype: StructFields,
362        length: usize,
363        validity: Validity,
364    ) -> VortexResult<Self> {
365        Self::validate(&fields, &dtype, length, &validity)?;
366
367        // SAFETY: validate ensures all invariants are met.
368        Ok(unsafe { Self::new_unchecked(fields, dtype, length, validity) })
369    }
370
371    pub fn from_fields<N: AsRef<str>>(items: &[(N, ArrayRef)]) -> VortexResult<Self> {
372        Self::try_from_iter(items.iter().map(|(a, b)| (a, b.to_array())))
373    }
374
375    pub fn try_from_iter_with_validity<
376        N: AsRef<str>,
377        A: IntoArray,
378        T: IntoIterator<Item = (N, A)>,
379    >(
380        iter: T,
381        validity: Validity,
382    ) -> VortexResult<Self> {
383        let (names, fields): (Vec<FieldName>, Vec<ArrayRef>) = iter
384            .into_iter()
385            .map(|(name, fields)| (FieldName::from(name.as_ref()), fields.into_array()))
386            .unzip();
387        let len = fields
388            .first()
389            .map(|f| f.len())
390            .ok_or_else(|| vortex_err!("StructArray cannot be constructed from an empty slice of arrays because the length is unspecified"))?;
391
392        Self::try_new(FieldNames::from_iter(names), fields, len, validity)
393    }
394
395    pub fn try_from_iter<N: AsRef<str>, A: IntoArray, T: IntoIterator<Item = (N, A)>>(
396        iter: T,
397    ) -> VortexResult<Self> {
398        Self::try_from_iter_with_validity(iter, Validity::NonNullable)
399    }
400
401    // TODO(aduffy): Add equivalent function to support field masks for nested column access.
402    /// Return a new StructArray with the given projection applied.
403    ///
404    /// Projection does not copy data arrays. Projection is defined by an ordinal array slice
405    /// which specifies the new ordering of columns in the struct. The projection can be used to
406    /// perform column re-ordering, deletion, or duplication at a logical level, without any data
407    /// copying.
408    #[allow(clippy::same_name_method)]
409    pub fn project(&self, projection: &[FieldName]) -> VortexResult<Self> {
410        let mut children = Vec::with_capacity(projection.len());
411        let mut names = Vec::with_capacity(projection.len());
412
413        for f_name in projection.iter() {
414            let idx = self
415                .names()
416                .iter()
417                .position(|name| name == f_name)
418                .ok_or_else(|| vortex_err!("Unknown field {f_name}"))?;
419
420            names.push(self.names()[idx].clone());
421            children.push(self.fields()[idx].clone());
422        }
423
424        StructArray::try_new(
425            FieldNames::from(names.as_slice()),
426            children,
427            self.len(),
428            self.validity().clone(),
429        )
430    }
431
432    /// Removes and returns a column from the struct array by name.
433    /// If the column does not exist, returns `None`.
434    pub fn remove_column(&mut self, name: impl Into<FieldName>) -> Option<ArrayRef> {
435        let name = name.into();
436
437        let struct_dtype = self.struct_fields().clone();
438
439        let position = struct_dtype
440            .names()
441            .iter()
442            .position(|field_name| field_name.as_ref() == name.as_ref())?;
443
444        let field = self.fields.remove(position);
445
446        if let Ok(new_dtype) = struct_dtype.without_field(position) {
447            self.dtype = DType::Struct(new_dtype, self.dtype.nullability());
448            return Some(field);
449        }
450        None
451    }
452
453    /// Create a new StructArray by appending a new column onto the existing array.
454    pub fn with_column(&self, name: impl Into<FieldName>, array: ArrayRef) -> VortexResult<Self> {
455        let name = name.into();
456        let struct_dtype = self.struct_fields().clone();
457
458        let names = struct_dtype.names().iter().cloned().chain(once(name));
459        let types = struct_dtype.fields().chain(once(array.dtype().clone()));
460        let new_fields = StructFields::new(names.collect(), types.collect());
461
462        let mut children = self.fields.clone();
463        children.push(array);
464
465        Self::try_new_with_dtype(children, new_fields, self.len, self.validity.clone())
466    }
467}
468
469impl ValidityHelper for StructArray {
470    fn validity(&self) -> &Validity {
471        &self.validity
472    }
473}
474
475impl ArrayVTable<StructVTable> for StructVTable {
476    fn len(array: &StructArray) -> usize {
477        array.len
478    }
479
480    fn dtype(array: &StructArray) -> &DType {
481        &array.dtype
482    }
483
484    fn stats(array: &StructArray) -> StatsSetRef<'_> {
485        array.stats_set.to_ref(array.as_ref())
486    }
487}
488
489impl CanonicalVTable<StructVTable> for StructVTable {
490    fn canonicalize(array: &StructArray) -> Canonical {
491        Canonical::Struct(array.clone())
492    }
493}
494
495impl OperationsVTable<StructVTable> for StructVTable {
496    fn slice(array: &StructArray, range: Range<usize>) -> ArrayRef {
497        let fields = array
498            .fields()
499            .iter()
500            .map(|field| field.slice(range.clone()))
501            .collect_vec();
502        // SAFETY: All invariants are preserved:
503        // - fields.len() == dtype.names().len() (same struct fields)
504        // - Every field has length == range.len() (all sliced to same range)
505        // - Each field's dtype matches the struct dtype (unchanged from original)
506        // - Validity length matches array length (both sliced to same range)
507        unsafe {
508            StructArray::new_unchecked(
509                fields,
510                array.struct_fields().clone(),
511                range.len(),
512                array.validity().slice(range),
513            )
514        }
515        .into_array()
516    }
517
518    fn scalar_at(array: &StructArray, index: usize) -> Scalar {
519        Scalar::struct_(
520            array.dtype().clone(),
521            array
522                .fields()
523                .iter()
524                .map(|field| field.scalar_at(index))
525                .collect_vec(),
526        )
527    }
528}
529
530#[cfg(test)]
531mod test {
532    use vortex_buffer::buffer;
533    use vortex_dtype::{DType, FieldName, FieldNames, Nullability, PType};
534
535    use crate::arrays::primitive::PrimitiveArray;
536    use crate::arrays::struct_::StructArray;
537    use crate::arrays::varbin::VarBinArray;
538    use crate::arrays::{BoolArray, ConstantArray};
539    use crate::validity::Validity;
540    use crate::{Array, IntoArray, ToCanonical};
541
542    #[test]
543    fn test_project() {
544        let xs = PrimitiveArray::new(buffer![0i64, 1, 2, 3, 4], Validity::NonNullable);
545        let ys = VarBinArray::from_vec(
546            vec!["a", "b", "c", "d", "e"],
547            DType::Utf8(Nullability::NonNullable),
548        );
549        let zs = BoolArray::from_iter([true, true, true, false, false]);
550
551        let struct_a = StructArray::try_new(
552            FieldNames::from(["xs", "ys", "zs"]),
553            vec![xs.into_array(), ys.into_array(), zs.into_array()],
554            5,
555            Validity::NonNullable,
556        )
557        .unwrap();
558
559        let struct_b = struct_a
560            .project(&[FieldName::from("zs"), FieldName::from("xs")])
561            .unwrap();
562        assert_eq!(
563            struct_b.names().as_ref(),
564            [FieldName::from("zs"), FieldName::from("xs")],
565        );
566
567        assert_eq!(struct_b.len(), 5);
568
569        let bools = &struct_b.fields[0];
570        assert_eq!(
571            bools.to_bool().boolean_buffer().iter().collect::<Vec<_>>(),
572            vec![true, true, true, false, false]
573        );
574
575        let prims = &struct_b.fields[1];
576        assert_eq!(prims.to_primitive().as_slice::<i64>(), [0i64, 1, 2, 3, 4]);
577    }
578
579    #[test]
580    fn test_remove_column() {
581        let xs = PrimitiveArray::new(buffer![0i64, 1, 2, 3, 4], Validity::NonNullable);
582        let ys = PrimitiveArray::new(buffer![4u64, 5, 6, 7, 8], Validity::NonNullable);
583
584        let mut struct_a = StructArray::try_new(
585            FieldNames::from(["xs", "ys"]),
586            vec![xs.into_array(), ys.into_array()],
587            5,
588            Validity::NonNullable,
589        )
590        .unwrap();
591
592        let removed = struct_a.remove_column("xs").unwrap();
593        assert_eq!(
594            removed.dtype(),
595            &DType::Primitive(PType::I64, Nullability::NonNullable)
596        );
597        assert_eq!(removed.to_primitive().as_slice::<i64>(), [0i64, 1, 2, 3, 4]);
598
599        assert_eq!(struct_a.names(), &["ys"]);
600        assert_eq!(struct_a.fields.len(), 1);
601        assert_eq!(struct_a.len(), 5);
602        assert_eq!(
603            struct_a.fields[0].dtype(),
604            &DType::Primitive(PType::U64, Nullability::NonNullable)
605        );
606        assert_eq!(
607            struct_a.fields[0].to_primitive().as_slice::<u64>(),
608            [4u64, 5, 6, 7, 8]
609        );
610
611        let empty = struct_a.remove_column("non_existent");
612        assert!(
613            empty.is_none(),
614            "Expected None when removing non-existent column"
615        );
616        assert_eq!(struct_a.names(), &["ys"]);
617    }
618
619    #[test]
620    fn test_duplicate_field_names() {
621        // Test that StructArray allows duplicate field names and returns the first match
622        let field1 = buffer![1i32, 2, 3].into_array();
623        let field2 = buffer![10i32, 20, 30].into_array();
624        let field3 = buffer![100i32, 200, 300].into_array();
625
626        // Create struct with duplicate field names - "value" appears twice
627        let struct_array = StructArray::try_new(
628            FieldNames::from(["value", "other", "value"]),
629            vec![field1, field2, field3],
630            3,
631            Validity::NonNullable,
632        )
633        .unwrap();
634
635        // field_by_name should return the first field with the matching name
636        let first_value_field = struct_array.field_by_name("value").unwrap();
637        assert_eq!(
638            first_value_field.to_primitive().as_slice::<i32>(),
639            [1i32, 2, 3] // This is field1, not field3
640        );
641
642        // Verify field_by_name_opt also returns the first match
643        let opt_field = struct_array.field_by_name_opt("value").unwrap();
644        assert_eq!(
645            opt_field.to_primitive().as_slice::<i32>(),
646            [1i32, 2, 3] // First "value" field
647        );
648
649        // Verify the third field (second "value") can be accessed by index
650        let third_field = &struct_array.fields()[2];
651        assert_eq!(
652            third_field.to_primitive().as_slice::<i32>(),
653            [100i32, 200, 300]
654        );
655    }
656
657    #[test]
658    fn test_uncompressed_size_in_bytes() {
659        let struct_array = StructArray::new(
660            FieldNames::from(["integers"]),
661            vec![ConstantArray::new(5, 1000).into_array()],
662            1000,
663            Validity::NonNullable,
664        );
665
666        let canonical_size = struct_array.to_canonical().into_array().nbytes();
667        let uncompressed_size = struct_array
668            .statistics()
669            .compute_uncompressed_size_in_bytes();
670
671        assert_eq!(canonical_size, 2);
672        assert_eq!(uncompressed_size, Some(4000));
673    }
674}