Skip to main content

vortex_array/arrays/interleave/
mod.rs

1// SPDX-License-Identifier: Apache-2.0
2// SPDX-FileCopyrightText: Copyright the Vortex contributors
3
4//! The [`Interleave`] encoding: a lazy, random-access gather of `N` value arrays into one array,
5//! routed by a per-row `(array_index, row_index)` pair.
6//!
7//! # Specification
8//!
9//! An [`Interleave`] array has `N + 2` children: `N` *values* followed by an `array_indices`
10//! selector and a `row_indices` selector. The output has `array_indices.len()` rows, and output
11//! row `i` comes from `values[array_indices[i]][row_indices[i]]`.
12//!
13//! Unlike a `Merge`, which consumes each branch in order under a cursor, an [`Interleave`] is
14//! **random-access**: `row_indices` names an explicit position within the selected value array, so
15//! rows may be reordered, skipped, or repeated. A `Merge` is the special case where each value
16//! array is consumed front-to-back exactly once.
17//!
18//! Like a `Merge`, the value arrays are independent: each holds only its own rows, and the
19//! selectors stitch them back together. This distinguishes [`Interleave`] from an element-wise
20//! select such as `zip`, whose arguments are all full-length.
21//!
22//! ## Invariants
23//!
24//! - Both selectors are **non-nullable** and equal in length, which is the output length. They
25//!   record *where* each output row comes from, which is always a definite decision. Predicate
26//!   nullability must be resolved into definite indices by the caller *before* the interleave is
27//!   built.
28//! - `array_indices[i] < values.len()` and `row_indices[i] < values[array_indices[i]].len()` for
29//!   every `i`. These per-row bounds depend on the selector *values* and so are a runtime
30//!   precondition of the caller, checked in the execution kernels rather than at construction.
31//! - All values share a logical type up to nullability. The output type is that shared type with
32//!   the union of the values' nullabilities. This is orthogonal to the selectors: a row's *value*
33//!   may be null even though its `(array_index, row_index)` is definite.
34//! - The output length equals `array_indices.len()` (`== row_indices.len()`).
35//!
36//! ## Selector types
37//!
38//! `array_indices` encodes the value array per row as a non-nullable **unsigned integer**
39//! (`array_indices[i]` is the index into `values`). `row_indices` is likewise a non-nullable
40//! **unsigned integer** naming the position within the selected value array.
41
42mod execute;
43
44use std::fmt::Display;
45use std::fmt::Formatter;
46use std::hash::Hasher;
47
48use vortex_error::VortexExpect;
49use vortex_error::VortexResult;
50use vortex_error::vortex_bail;
51use vortex_error::vortex_ensure;
52use vortex_error::vortex_panic;
53use vortex_session::VortexSession;
54use vortex_session::registry::CachedId;
55
56use crate::ArrayEq;
57use crate::ArrayHash;
58use crate::ArrayRef;
59use crate::EqMode;
60use crate::ExecutionCtx;
61use crate::IntoArray;
62use crate::array::Array;
63use crate::array::ArrayId;
64use crate::array::ArrayParts;
65use crate::array::ArraySlots;
66use crate::array::ArrayView;
67use crate::array::OperationsVTable;
68use crate::array::TypedArrayRef;
69use crate::array::VTable;
70use crate::array::ValidityVTable;
71use crate::arrays::ConstantArray;
72use crate::buffer::BufferHandle;
73use crate::dtype::DType;
74use crate::dtype::Nullability;
75use crate::executor::ExecutionResult;
76use crate::scalar::Scalar;
77use crate::serde::ArrayChildren;
78use crate::validity::Validity;
79
80/// An [`Interleave`]-encoded Vortex array. See the [module docs](self) for the specification.
81pub type InterleaveArray = Array<Interleave>;
82
83/// The [`Interleave`] encoding. See the [module docs](self).
84#[derive(Clone, Debug)]
85pub struct Interleave;
86
87/// Per-array metadata for an [`InterleaveArray`].
88///
89/// The values and selectors live in the array's slots; only the value count is stored here so the
90/// selector slots can be located (`slots[num_values]` and `slots[num_values + 1]`).
91#[derive(Clone, Debug)]
92pub struct InterleaveData {
93    pub(crate) num_values: usize,
94}
95
96impl Display for InterleaveData {
97    fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result {
98        write!(f, "num_values: {}", self.num_values)
99    }
100}
101
102impl ArrayHash for InterleaveData {
103    fn array_hash<H: Hasher>(&self, state: &mut H, _accuracy: EqMode) {
104        state.write_usize(self.num_values);
105    }
106}
107
108impl ArrayEq for InterleaveData {
109    fn array_eq(&self, other: &Self, _accuracy: EqMode) -> bool {
110        self.num_values == other.num_values
111    }
112}
113
114/// Accessors for the values and selectors of an [`InterleaveArray`].
115pub trait InterleaveArrayExt: TypedArrayRef<Interleave> {
116    /// The number of value arrays (two fewer than the number of children).
117    fn num_values(&self) -> usize {
118        self.num_values
119    }
120
121    /// The `idx`-th value array (holding the rows that `array_indices` routes to it).
122    fn value(&self, idx: usize) -> &ArrayRef {
123        self.as_ref().slots()[idx]
124            .as_ref()
125            .vortex_expect("validated interleave value slot")
126    }
127
128    /// The selector routing each output row to a value array.
129    fn array_indices(&self) -> &ArrayRef {
130        self.as_ref().slots()[self.num_values]
131            .as_ref()
132            .vortex_expect("validated interleave array_indices slot")
133    }
134
135    /// The selector naming each output row's position within its value array.
136    fn row_indices(&self) -> &ArrayRef {
137        self.as_ref().slots()[self.num_values + 1]
138            .as_ref()
139            .vortex_expect("validated interleave row_indices slot")
140    }
141}
142impl<T: TypedArrayRef<Interleave>> InterleaveArrayExt for T {}
143
144impl Interleave {
145    /// The single source of truth for [`InterleaveArray`] invariants.
146    ///
147    /// Validates `values`, `array_indices`, and `row_indices` against the [specification](self) and
148    /// returns the output [`DType`] (the shared value type with the union of value nullabilities).
149    /// Both the public constructor and the [`VTable::validate`] hook funnel through here.
150    fn check(
151        values: &[ArrayRef],
152        array_indices: &ArrayRef,
153        row_indices: &ArrayRef,
154    ) -> VortexResult<DType> {
155        vortex_ensure!(
156            values.len() >= 2,
157            "interleave requires at least 2 values, got {}",
158            values.len()
159        );
160
161        // Both selectors are non-nullable unsigned integers: `array_indices` indexes the values and
162        // `row_indices` names a position within the selected value.
163        for (name, selector) in [
164            ("array_indices", array_indices),
165            ("row_indices", row_indices),
166        ] {
167            match selector.dtype() {
168                DType::Primitive(ptype, nullability) if ptype.is_unsigned_int() => {
169                    vortex_ensure!(
170                        !nullability.is_nullable(),
171                        "interleave {name} must be non-nullable, got {}",
172                        selector.dtype()
173                    );
174                }
175                other => vortex_bail!(
176                    "interleave {name} must be a non-nullable unsigned integer, got {other}"
177                ),
178            }
179        }
180
181        vortex_ensure!(
182            array_indices.len() == row_indices.len(),
183            "interleave selectors must have equal length, got array_indices {} and row_indices {}",
184            array_indices.len(),
185            row_indices.len()
186        );
187
188        let base_dtype = values[0].dtype();
189        let mut nullability = Nullability::NonNullable;
190        for value in values {
191            vortex_ensure!(
192                value.dtype().eq_ignore_nullability(base_dtype),
193                "interleave values must share a dtype up to nullability: {} vs {}",
194                base_dtype,
195                value.dtype()
196            );
197            nullability |= value.dtype().nullability();
198        }
199
200        Ok(base_dtype.with_nullability(nullability))
201    }
202}
203
204impl Array<Interleave> {
205    /// Constructs a new [`InterleaveArray`] from `values` and the `array_indices` / `row_indices`
206    /// selectors.
207    ///
208    /// See the [module docs](self) for the full specification and invariants. The selectors must be
209    /// non-nullable: they record a definite `(array_index, row_index)` per row, so null-predicate
210    /// handling is the caller's responsibility, resolved before the interleave is constructed. The
211    /// per-row bounds on the selector values are a runtime precondition checked during execution.
212    pub fn try_new(
213        values: Vec<ArrayRef>,
214        array_indices: ArrayRef,
215        row_indices: ArrayRef,
216    ) -> VortexResult<Self> {
217        let dtype = Interleave::check(&values, &array_indices, &row_indices)?;
218        let len = array_indices.len();
219        let num_values = values.len();
220
221        let mut slots: ArraySlots = values.into_iter().map(Some).collect();
222        slots.push(Some(array_indices));
223        slots.push(Some(row_indices));
224
225        Ok(unsafe {
226            Array::from_parts_unchecked(
227                ArrayParts::new(Interleave, dtype, len, InterleaveData { num_values })
228                    .with_slots(slots),
229            )
230        })
231    }
232}
233
234impl VTable for Interleave {
235    type TypedArrayData = InterleaveData;
236    type OperationsVTable = Self;
237    type ValidityVTable = Self;
238
239    fn id(&self) -> ArrayId {
240        static ID: CachedId = CachedId::new("vortex.interleave");
241        *ID
242    }
243
244    fn validate(
245        &self,
246        data: &Self::TypedArrayData,
247        dtype: &DType,
248        len: usize,
249        slots: &[Option<ArrayRef>],
250    ) -> VortexResult<()> {
251        vortex_ensure!(
252            slots.len() == data.num_values + 2,
253            "InterleaveArray expected {} slots (values + array_indices + row_indices), got {}",
254            data.num_values + 2,
255            slots.len()
256        );
257        vortex_ensure!(
258            slots.iter().all(|s| s.is_some()),
259            "InterleaveArray slots must all be present"
260        );
261
262        let values: Vec<ArrayRef> = slots[..data.num_values]
263            .iter()
264            .map(|s| s.clone().vortex_expect("validated value slot"))
265            .collect();
266        let array_indices = slots[data.num_values]
267            .clone()
268            .vortex_expect("validated array_indices slot");
269        let row_indices = slots[data.num_values + 1]
270            .clone()
271            .vortex_expect("validated row_indices slot");
272
273        // All semantic invariants live in `check`; here we only confirm the array's cached `dtype`
274        // and `len` agree with what the children imply.
275        let expected_dtype = Interleave::check(&values, &array_indices, &row_indices)?;
276        vortex_ensure!(
277            dtype == &expected_dtype,
278            "InterleaveArray dtype {} does not match the dtype implied by its children {}",
279            dtype,
280            expected_dtype
281        );
282        vortex_ensure!(
283            len == array_indices.len(),
284            "InterleaveArray length {} does not match array_indices length {}",
285            len,
286            array_indices.len()
287        );
288        Ok(())
289    }
290
291    fn nbuffers(_array: ArrayView<'_, Self>) -> usize {
292        0
293    }
294
295    fn buffer(_array: ArrayView<'_, Self>, _idx: usize) -> BufferHandle {
296        vortex_panic!("InterleaveArray has no buffers")
297    }
298
299    fn buffer_name(_array: ArrayView<'_, Self>, _idx: usize) -> Option<String> {
300        None
301    }
302
303    fn slot_name(array: ArrayView<'_, Self>, idx: usize) -> String {
304        if idx == array.num_values() {
305            "array_indices".to_string()
306        } else if idx == array.num_values() + 1 {
307            "row_indices".to_string()
308        } else {
309            format!("value_{idx}")
310        }
311    }
312
313    fn serialize(
314        _array: ArrayView<'_, Self>,
315        _session: &VortexSession,
316    ) -> VortexResult<Option<Vec<u8>>> {
317        vortex_bail!("Interleave array is not serializable")
318    }
319
320    fn deserialize(
321        &self,
322        _dtype: &DType,
323        _len: usize,
324        _metadata: &[u8],
325        _buffers: &[BufferHandle],
326        _children: &dyn ArrayChildren,
327        _session: &VortexSession,
328    ) -> VortexResult<ArrayParts<Self>> {
329        vortex_bail!("Interleave array is not serializable")
330    }
331
332    fn execute(array: Array<Self>, ctx: &mut ExecutionCtx) -> VortexResult<ExecutionResult> {
333        execute::execute(array, ctx)
334    }
335}
336
337impl OperationsVTable<Interleave> for Interleave {
338    fn scalar_at(
339        array: ArrayView<'_, Interleave>,
340        index: usize,
341        ctx: &mut ExecutionCtx,
342    ) -> VortexResult<Scalar> {
343        // Random-access gather: read the routing pair for `index` directly, then pull that row from
344        // the selected value array. No cursor walk is required.
345        let branch_idx = array
346            .array_indices()
347            .execute_scalar(index, ctx)?
348            .as_primitive()
349            .as_::<usize>()
350            .vortex_expect("interleave array_indices is non-nullable");
351        let row = array
352            .row_indices()
353            .execute_scalar(index, ctx)?
354            .as_primitive()
355            .as_::<usize>()
356            .vortex_expect("interleave row_indices is non-nullable");
357
358        let scalar = array.value(branch_idx).execute_scalar(row, ctx)?;
359        // The value may be non-nullable while the interleaved output is nullable; align the dtype.
360        Ok(if array.as_ref().dtype().is_nullable() {
361            scalar.into_nullable()
362        } else {
363            scalar
364        })
365    }
366}
367
368impl ValidityVTable<Interleave> for Interleave {
369    fn validity(array: ArrayView<'_, Interleave>) -> VortexResult<Validity> {
370        if !array.as_ref().dtype().is_nullable() {
371            return Ok(Validity::NonNullable);
372        }
373        // The output validity is itself an interleave — by the same selectors — of the values'
374        // validities, expressed as non-nullable boolean arrays. This bottoms out immediately
375        // because the inner interleave is non-nullable.
376        let mut value_validities: Vec<ArrayRef> = Vec::with_capacity(array.num_values());
377        for i in 0..array.num_values() {
378            value_validities.push(value_validity_array(array.value(i))?);
379        }
380        let interleaved = InterleaveArray::try_new(
381            value_validities,
382            array.array_indices().clone(),
383            array.row_indices().clone(),
384        )?;
385        Ok(Validity::Array(interleaved.into_array()))
386    }
387}
388
389/// Materializes a value's validity as a non-nullable boolean array of the value's length, where
390/// `true` marks a valid (non-null) row.
391fn value_validity_array(value: &ArrayRef) -> VortexResult<ArrayRef> {
392    Ok(match value.validity()? {
393        Validity::NonNullable | Validity::AllValid => {
394            ConstantArray::new(true, value.len()).into_array()
395        }
396        Validity::AllInvalid => ConstantArray::new(false, value.len()).into_array(),
397        Validity::Array(array) => array,
398    })
399}
400
401#[cfg(test)]
402mod tests {
403    use vortex_error::VortexResult;
404
405    use super::*;
406    use crate::Canonical;
407    use crate::LEGACY_SESSION;
408    use crate::VortexSessionExecute;
409    use crate::arrays::BoolArray;
410    use crate::arrays::PrimitiveArray;
411    use crate::assert_arrays_eq;
412
413    /// Reference (oracle) implementation of the interleave spec, used only to validate the optimized
414    /// [execute](super::execute) path. It is intentionally simple and slow: it pulls each output
415    /// element one [`Scalar`] at a time via [`ArrayRef::execute_scalar`] and never touches raw bits.
416    ///
417    /// This is deliberately *not* wired into the array execution path — it exists purely as a
418    /// trustworthy comparison point in tests.
419    fn interleave_reference(
420        values: &[ArrayRef],
421        array_indices: &ArrayRef,
422        row_indices: &ArrayRef,
423        ctx: &mut ExecutionCtx,
424    ) -> VortexResult<ArrayRef> {
425        let len = array_indices.len();
426        let nullable = values.iter().any(|v| v.dtype().is_nullable());
427        let mut out: Vec<Option<bool>> = Vec::with_capacity(len);
428
429        for i in 0..len {
430            let j = array_indices
431                .execute_scalar(i, ctx)?
432                .as_primitive()
433                .as_::<usize>()
434                .vortex_expect("array_indices is non-nullable");
435            let row = row_indices
436                .execute_scalar(i, ctx)?
437                .as_primitive()
438                .as_::<usize>()
439                .vortex_expect("row_indices is non-nullable");
440            out.push(values[j].execute_scalar(row, ctx)?.as_bool().value());
441        }
442
443        Ok(if nullable {
444            BoolArray::from_iter(out).into_array()
445        } else {
446            BoolArray::from_iter(
447                out.into_iter()
448                    .map(|v| v.vortex_expect("non-nullable value produced a null")),
449            )
450            .into_array()
451        })
452    }
453
454    /// Builds the compact value arrays and the unsigned `(array_indices, row_indices)` selectors for
455    /// a gather described by per-output `(array_index, row_index)` pairs over `branches`.
456    fn build(
457        branches: &[&[Option<bool>]],
458        indices: &[(usize, usize)],
459    ) -> (Vec<ArrayRef>, ArrayRef, ArrayRef) {
460        let nullable = branches.iter().flat_map(|b| b.iter()).any(Option::is_none);
461        let to_value = |vals: &[Option<bool>]| -> ArrayRef {
462            if nullable {
463                BoolArray::from_iter(vals.iter().copied()).into_array()
464            } else {
465                BoolArray::from_iter(
466                    vals.iter()
467                        .map(|v| v.vortex_expect("non-nullable value produced a null")),
468                )
469                .into_array()
470            }
471        };
472
473        let values = branches.iter().map(|b| to_value(b)).collect();
474        let array_indices = PrimitiveArray::from_iter(
475            indices
476                .iter()
477                .map(|&(a, _)| u32::try_from(a).vortex_expect("array index fits in u32")),
478        )
479        .into_array();
480        let row_indices = PrimitiveArray::from_iter(
481            indices
482                .iter()
483                .map(|&(_, r)| u32::try_from(r).vortex_expect("row index fits in u32")),
484        )
485        .into_array();
486        (values, array_indices, row_indices)
487    }
488
489    /// Asserts that the optimized execute path and the reference implementation agree, exercising
490    /// `InterleaveArray` construction, `execute`, `scalar_at`, and `validity` (via
491    /// `assert_arrays_eq`).
492    fn check(branches: &[&[Option<bool>]], indices: &[(usize, usize)]) -> VortexResult<()> {
493        let (values, array_indices, row_indices) = build(branches, indices);
494
495        let interleaved =
496            InterleaveArray::try_new(values.clone(), array_indices.clone(), row_indices.clone())?
497                .into_array();
498
499        let mut ctx = LEGACY_SESSION.create_execution_ctx();
500        let reference = interleave_reference(&values, &array_indices, &row_indices, &mut ctx)?;
501
502        assert_arrays_eq!(interleaved, reference);
503        Ok(())
504    }
505
506    #[test]
507    fn interleave_reorders_and_repeats() -> VortexResult<()> {
508        // Random access: rows are pulled out of order and branch 0 row 0 is repeated.
509        check(
510            &[&[Some(true), Some(false)], &[Some(false), Some(true)]],
511            &[(0, 1), (1, 0), (0, 0), (1, 1), (0, 0)],
512        )
513    }
514
515    #[test]
516    fn interleave_skips_rows() -> VortexResult<()> {
517        // Branch 0 row 1 and branch 1 row 0 are never gathered.
518        check(
519            &[
520                &[Some(true), Some(false), Some(true)],
521                &[Some(false), Some(true)],
522            ],
523            &[(0, 0), (1, 1), (0, 2)],
524        )
525    }
526
527    #[test]
528    fn interleave_three_values() -> VortexResult<()> {
529        // An unsigned `array_indices` routes among three values with full random access.
530        check(
531            &[
532                &[Some(true), Some(false)],
533                &[Some(false)],
534                &[Some(true), Some(true), Some(false)],
535            ],
536            &[(2, 1), (0, 0), (1, 0), (2, 2), (0, 1), (2, 0)],
537        )
538    }
539
540    #[test]
541    fn interleave_only_one_branch() -> VortexResult<()> {
542        check(
543            &[&[Some(true), Some(false), Some(true)], &[Some(false)]],
544            &[(0, 2), (0, 0), (0, 1)],
545        )
546    }
547
548    #[test]
549    fn interleave_nullable_with_nulls_in_values() -> VortexResult<()> {
550        check(
551            &[&[None, Some(true), None], &[Some(false), None]],
552            &[(1, 1), (0, 0), (1, 0), (0, 2), (0, 1)],
553        )
554    }
555
556    #[test]
557    fn interleave_empty() -> VortexResult<()> {
558        check(&[&[Some(true)], &[Some(false)]], &[])
559    }
560
561    #[test]
562    fn rejects_boolean_array_indices() {
563        let value = BoolArray::from_iter([true, false]).into_array();
564        let array_indices = BoolArray::from_iter([true, false]).into_array();
565        let row_indices = PrimitiveArray::from_iter([0u32, 1]).into_array();
566        let err = InterleaveArray::try_new(vec![value.clone(), value], array_indices, row_indices)
567            .err()
568            .vortex_expect("expected interleave to reject a boolean array_indices");
569        assert!(err.to_string().contains("unsigned integer"), "{err}");
570    }
571
572    #[test]
573    fn rejects_signed_integer_array_indices() {
574        let value = BoolArray::from_iter([true]).into_array();
575        let array_indices = PrimitiveArray::from_iter([0i32, 1]).into_array();
576        let row_indices = PrimitiveArray::from_iter([0u32, 0]).into_array();
577        let err = InterleaveArray::try_new(vec![value.clone(), value], array_indices, row_indices)
578            .err()
579            .vortex_expect("expected interleave to reject a signed integer array_indices");
580        assert!(err.to_string().contains("unsigned integer"), "{err}");
581    }
582
583    #[test]
584    fn rejects_nullable_row_indices() {
585        let value = BoolArray::from_iter([true, false]).into_array();
586        let array_indices = PrimitiveArray::from_iter([0u32, 1]).into_array();
587        let row_indices = PrimitiveArray::from_option_iter([Some(0u32), Some(1)]).into_array();
588        let err = InterleaveArray::try_new(vec![value.clone(), value], array_indices, row_indices)
589            .err()
590            .vortex_expect("expected interleave to reject nullable row_indices");
591        assert!(err.to_string().contains("non-nullable"), "{err}");
592    }
593
594    #[test]
595    fn rejects_mismatched_selector_lengths() {
596        let value = BoolArray::from_iter([true, false]).into_array();
597        let array_indices = PrimitiveArray::from_iter([0u32, 1]).into_array();
598        let row_indices = PrimitiveArray::from_iter([0u32]).into_array();
599        let err = InterleaveArray::try_new(vec![value.clone(), value], array_indices, row_indices)
600            .err()
601            .vortex_expect("expected interleave to reject mismatched selector lengths");
602        assert!(err.to_string().contains("equal length"), "{err}");
603    }
604
605    #[test]
606    #[should_panic(expected = "only implemented for boolean values")]
607    fn non_boolean_value_execution_panics() {
608        // Execution dispatches on the value type: primitive values have no kernel yet.
609        let v0 = PrimitiveArray::from_iter([1u32]).into_array();
610        let v1 = PrimitiveArray::from_iter([2u32]).into_array();
611        let array_indices = PrimitiveArray::from_iter([0u32, 1]).into_array();
612        let row_indices = PrimitiveArray::from_iter([0u32, 0]).into_array();
613        let interleaved = InterleaveArray::try_new(vec![v0, v1], array_indices, row_indices)
614            .vortex_expect("primitive values should construct")
615            .into_array();
616        let mut ctx = LEGACY_SESSION.create_execution_ctx();
617        interleaved.execute::<Canonical>(&mut ctx).ok();
618    }
619}