vortex_array/array/
mod.rs

1// SPDX-License-Identifier: Apache-2.0
2// SPDX-FileCopyrightText: Copyright the Vortex contributors
3
4pub mod display;
5mod visitor;
6
7use std::any::Any;
8use std::fmt::{Debug, Formatter};
9use std::ops::Range;
10use std::sync::Arc;
11
12pub use visitor::*;
13use vortex_buffer::ByteBuffer;
14use vortex_dtype::{DType, Nullability};
15use vortex_error::{VortexExpect, VortexResult, vortex_bail, vortex_err, vortex_panic};
16use vortex_mask::Mask;
17use vortex_scalar::Scalar;
18
19use crate::arrays::{
20    BoolEncoding, ConstantVTable, DecimalEncoding, ExtensionEncoding, FixedSizeListEncoding,
21    ListEncoding, NullEncoding, PrimitiveEncoding, StructEncoding, VarBinEncoding,
22    VarBinViewEncoding,
23};
24use crate::builders::ArrayBuilder;
25use crate::compute::{ComputeFn, Cost, InvocationArgs, IsConstantOpts, Output, is_constant_opts};
26use crate::operator::OperatorRef;
27use crate::serde::ArrayChildren;
28use crate::stats::{Precision, Stat, StatsProviderExt, StatsSetRef};
29use crate::vtable::{
30    ArrayVTable, CanonicalVTable, ComputeVTable, OperationsVTable, PipelineVTable, SerdeVTable,
31    VTable, ValidityVTable, VisitorVTable,
32};
33use crate::{Canonical, EncodingId, EncodingRef, SerializeMetadata};
34
35/// The public API trait for all Vortex arrays.
36pub trait Array: 'static + private::Sealed + Send + Sync + Debug + ArrayVisitor {
37    /// Returns the array as a reference to a generic [`Any`] trait object.
38    fn as_any(&self) -> &dyn Any;
39
40    /// Returns the array as an [`ArrayRef`].
41    fn to_array(&self) -> ArrayRef;
42
43    /// Returns the length of the array.
44    fn len(&self) -> usize;
45
46    /// Returns whether the array is empty (has zero rows).
47    fn is_empty(&self) -> bool {
48        self.len() == 0
49    }
50
51    /// Returns the logical Vortex [`DType`] of the array.
52    fn dtype(&self) -> &DType;
53
54    /// Returns the encoding of the array.
55    fn encoding(&self) -> EncodingRef;
56
57    /// Returns the encoding ID of the array.
58    fn encoding_id(&self) -> EncodingId;
59
60    /// Performs a constant-time slice of the array.
61    fn slice(&self, range: Range<usize>) -> ArrayRef;
62
63    /// Fetch the scalar at the given index.
64    ///
65    /// This method panics if the index is out of bounds for the array.
66    fn scalar_at(&self, index: usize) -> Scalar;
67
68    /// Returns whether the array is of the given encoding.
69    fn is_encoding(&self, encoding: EncodingId) -> bool {
70        self.encoding_id() == encoding
71    }
72
73    /// Returns whether this array is an arrow encoding.
74    // TODO(ngates): this shouldn't live here.
75    fn is_arrow(&self) -> bool {
76        self.is_encoding(NullEncoding.id())
77            || self.is_encoding(BoolEncoding.id())
78            || self.is_encoding(PrimitiveEncoding.id())
79            || self.is_encoding(VarBinEncoding.id())
80            || self.is_encoding(VarBinViewEncoding.id())
81    }
82
83    /// Whether the array is of a canonical encoding.
84    // TODO(ngates): this shouldn't live here.
85    fn is_canonical(&self) -> bool {
86        self.is_encoding(NullEncoding.id())
87            || self.is_encoding(BoolEncoding.id())
88            || self.is_encoding(PrimitiveEncoding.id())
89            || self.is_encoding(DecimalEncoding.id())
90            || self.is_encoding(StructEncoding.id())
91            || self.is_encoding(ListEncoding.id())
92            || self.is_encoding(FixedSizeListEncoding.id())
93            || self.is_encoding(VarBinViewEncoding.id())
94            || self.is_encoding(ExtensionEncoding.id())
95    }
96
97    /// Returns whether the item at `index` is valid.
98    fn is_valid(&self, index: usize) -> bool;
99
100    /// Returns whether the item at `index` is invalid.
101    fn is_invalid(&self, index: usize) -> bool;
102
103    /// Returns whether all items in the array are valid.
104    ///
105    /// This is usually cheaper than computing a precise `valid_count`.
106    fn all_valid(&self) -> bool;
107
108    /// Returns whether the array is all invalid.
109    ///
110    /// This is usually cheaper than computing a precise `invalid_count`.
111    fn all_invalid(&self) -> bool;
112
113    /// Returns the number of valid elements in the array.
114    fn valid_count(&self) -> usize;
115
116    /// Returns the number of invalid elements in the array.
117    fn invalid_count(&self) -> usize;
118
119    /// Returns the canonical validity mask for the array.
120    fn validity_mask(&self) -> Mask;
121
122    /// Returns the canonical representation of the array.
123    fn to_canonical(&self) -> Canonical;
124
125    /// Writes the array into the canonical builder.
126    ///
127    /// The [`DType`] of the builder must match that of the array.
128    fn append_to_builder(&self, builder: &mut dyn ArrayBuilder);
129
130    /// Returns the statistics of the array.
131    // TODO(ngates): change how this works. It's weird.
132    fn statistics(&self) -> StatsSetRef<'_>;
133
134    /// Replaces the children of the array with the given array references.
135    fn with_children(&self, children: &[ArrayRef]) -> VortexResult<ArrayRef>;
136
137    /// Optionally invoke a kernel for the given compute function.
138    ///
139    /// These encoding-specific kernels are independent of kernels registered directly with
140    /// compute functions using [`ComputeFn::register_kernel`], and are attempted only if none of
141    /// the function-specific kernels returns a result.
142    ///
143    /// This allows encodings the opportunity to generically implement many compute functions
144    /// that share some property, for example [`ComputeFn::is_elementwise`], without prior
145    /// knowledge of the function itself, while still allowing users to override the implementation
146    /// of compute functions for built-in encodings. For an example, see the implementation for
147    /// chunked arrays.
148    ///
149    /// The first input in the [`InvocationArgs`] is always the array itself.
150    ///
151    /// Warning: do not call `compute_fn.invoke(args)` directly, as this will result in a recursive
152    /// call.
153    fn invoke(&self, compute_fn: &ComputeFn, args: &InvocationArgs)
154    -> VortexResult<Option<Output>>;
155
156    /// Convert the array to a operator operator if supported by the encoding.
157    ///
158    /// Returns `None` if the encoding does not support operator operations.
159    fn to_operator(&self) -> VortexResult<Option<OperatorRef>>;
160}
161
162impl Array for Arc<dyn Array> {
163    #[inline]
164    fn as_any(&self) -> &dyn Any {
165        self.as_ref().as_any()
166    }
167
168    #[inline]
169    fn to_array(&self) -> ArrayRef {
170        self.clone()
171    }
172
173    #[inline]
174    fn len(&self) -> usize {
175        self.as_ref().len()
176    }
177
178    #[inline]
179    fn dtype(&self) -> &DType {
180        self.as_ref().dtype()
181    }
182
183    #[inline]
184    fn encoding(&self) -> EncodingRef {
185        self.as_ref().encoding()
186    }
187
188    #[inline]
189    fn encoding_id(&self) -> EncodingId {
190        self.as_ref().encoding_id()
191    }
192
193    #[inline]
194    fn slice(&self, range: Range<usize>) -> ArrayRef {
195        self.as_ref().slice(range)
196    }
197
198    #[inline]
199    fn scalar_at(&self, index: usize) -> Scalar {
200        self.as_ref().scalar_at(index)
201    }
202
203    #[inline]
204    fn is_valid(&self, index: usize) -> bool {
205        self.as_ref().is_valid(index)
206    }
207
208    #[inline]
209    fn is_invalid(&self, index: usize) -> bool {
210        self.as_ref().is_invalid(index)
211    }
212
213    #[inline]
214    fn all_valid(&self) -> bool {
215        self.as_ref().all_valid()
216    }
217
218    #[inline]
219    fn all_invalid(&self) -> bool {
220        self.as_ref().all_invalid()
221    }
222
223    #[inline]
224    fn valid_count(&self) -> usize {
225        self.as_ref().valid_count()
226    }
227
228    #[inline]
229    fn invalid_count(&self) -> usize {
230        self.as_ref().invalid_count()
231    }
232
233    #[inline]
234    fn validity_mask(&self) -> Mask {
235        self.as_ref().validity_mask()
236    }
237
238    fn to_canonical(&self) -> Canonical {
239        self.as_ref().to_canonical()
240    }
241
242    fn append_to_builder(&self, builder: &mut dyn ArrayBuilder) {
243        self.as_ref().append_to_builder(builder)
244    }
245
246    fn statistics(&self) -> StatsSetRef<'_> {
247        self.as_ref().statistics()
248    }
249
250    fn with_children(&self, children: &[ArrayRef]) -> VortexResult<ArrayRef> {
251        self.as_ref().with_children(children)
252    }
253
254    fn invoke(
255        &self,
256        compute_fn: &ComputeFn,
257        args: &InvocationArgs,
258    ) -> VortexResult<Option<Output>> {
259        self.as_ref().invoke(compute_fn, args)
260    }
261
262    fn to_operator(&self) -> VortexResult<Option<OperatorRef>> {
263        self.as_ref().to_operator()
264    }
265}
266
/// A reference-counted ([`Arc`]) pointer to a dynamic [`Array`] trait object.
///
/// Cloning an `ArrayRef` clones the pointer, not the array it points to.
pub type ArrayRef = Arc<dyn Array>;
269
270impl ToOwned for dyn Array {
271    type Owned = ArrayRef;
272
273    fn to_owned(&self) -> Self::Owned {
274        self.to_array()
275    }
276}
277
278impl dyn Array + '_ {
279    /// Returns the array downcast to the given `A`.
280    pub fn as_<V: VTable>(&self) -> &V::Array {
281        self.as_opt::<V>().vortex_expect("Failed to downcast")
282    }
283
284    /// Returns the array downcast to the given `A`.
285    pub fn as_opt<V: VTable>(&self) -> Option<&V::Array> {
286        self.as_any()
287            .downcast_ref::<ArrayAdapter<V>>()
288            .map(|array_adapter| &array_adapter.0)
289    }
290
291    /// Is self an array with encoding from vtable `V`.
292    pub fn is<V: VTable>(&self) -> bool {
293        self.as_opt::<V>().is_some()
294    }
295
296    pub fn is_constant(&self) -> bool {
297        let opts = IsConstantOpts {
298            cost: Cost::Specialized,
299        };
300        is_constant_opts(self, &opts)
301            .inspect_err(|e| log::warn!("Failed to compute IsConstant: {e}"))
302            .ok()
303            .flatten()
304            .unwrap_or_default()
305    }
306
307    pub fn is_constant_opts(&self, cost: Cost) -> bool {
308        let opts = IsConstantOpts { cost };
309        is_constant_opts(self, &opts)
310            .inspect_err(|e| log::warn!("Failed to compute IsConstant: {e}"))
311            .ok()
312            .flatten()
313            .unwrap_or_default()
314    }
315
316    pub fn as_constant(&self) -> Option<Scalar> {
317        self.is_constant().then(|| self.scalar_at(0))
318    }
319
320    /// Total size of the array in bytes, including all children and buffers.
321    pub fn nbytes(&self) -> u64 {
322        let mut nbytes = 0;
323        for array in self.depth_first_traversal() {
324            for buffer in array.buffers() {
325                nbytes += buffer.len() as u64;
326            }
327        }
328        nbytes
329    }
330}
331
/// Trait for converting a type into a Vortex [`ArrayRef`].
pub trait IntoArray {
    /// Consumes `self` and returns it as an [`ArrayRef`].
    fn into_array(self) -> ArrayRef;
}
336
/// An [`ArrayRef`] is already in its target form, so the conversion is the identity.
impl IntoArray for ArrayRef {
    /// Returns `self` unchanged.
    fn into_array(self) -> ArrayRef {
        self
    }
}
342
// Private module holding the `Sealed` marker that `Array` lists as a supertrait. The module
// itself is not `pub`, and the only `Sealed` implementations are the two below.
mod private {
    use super::*;

    /// Marker trait with no methods, required by the `Array` trait.
    pub trait Sealed {}

    // The two types that implement `Array` in this file.
    impl<V: VTable> Sealed for ArrayAdapter<V> {}
    impl Sealed for Arc<dyn Array> {}
}
351
/// Adapter struct used to lift the [`VTable`] trait into an object-safe [`Array`]
/// implementation.
///
/// Since this is a single-field tuple struct with `repr(transparent)`, we are able to turn
/// un-adapted array structs into [`dyn Array`] using some cheeky casting inside
/// [`std::ops::Deref`] and [`AsRef`]. See the `vtable!` macro for more details.
#[repr(transparent)]
pub struct ArrayAdapter<V: VTable>(V::Array);
360
361impl<V: VTable> ArrayAdapter<V> {
362    /// Provide a reference to the underlying array held within the adapter.
363    pub fn as_inner(&self) -> &V::Array {
364        &self.0
365    }
366
367    /// Unwrap into the inner array type, consuming the adapter.
368    pub fn into_inner(self) -> V::Array {
369        self.0
370    }
371}
372
373impl<V: VTable> Debug for ArrayAdapter<V> {
374    fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result {
375        self.0.fmt(f)
376    }
377}
378
/// Implements the object-safe [`Array`] API for any vtable `V` by forwarding each call to the
/// matching component of the vtable and checking what comes back (bounds, lengths, dtypes).
impl<V: VTable> Array for ArrayAdapter<V> {
    /// Returns the adapter itself as [`Any`]; this is the type that `as_opt` downcasts to.
    fn as_any(&self) -> &dyn Any {
        self
    }

    /// Clones the wrapped array and places the clone, re-wrapped in an adapter, behind a new
    /// `Arc`.
    fn to_array(&self) -> ArrayRef {
        Arc::new(ArrayAdapter::<V>(self.0.clone()))
    }

    /// Length as reported by the vtable's `ArrayVTable`.
    fn len(&self) -> usize {
        <V::ArrayVTable as ArrayVTable<V>>::len(&self.0)
    }

    /// DType as reported by the vtable's `ArrayVTable`.
    fn dtype(&self) -> &DType {
        <V::ArrayVTable as ArrayVTable<V>>::dtype(&self.0)
    }

    /// Encoding as reported by the vtable.
    fn encoding(&self) -> EncodingRef {
        V::encoding(&self.0)
    }

    /// ID of the encoding reported by the vtable.
    fn encoding_id(&self) -> EncodingId {
        V::encoding(&self.0).id()
    }

    /// Slices the array to `range`, validating both the request and the encoding's answer.
    ///
    /// A range that covers the whole array returns `self.to_array()`, and an empty in-bounds
    /// range returns an empty canonical array of the same dtype; neither case calls into the
    /// encoding's `slice`.
    ///
    /// # Panics
    ///
    /// Panics if `range.start` or `range.end` exceeds the array length, if
    /// `range.start > range.end`, or if the encoding returns a slice whose length is not
    /// `range.end - range.start` (in debug builds, also if its dtype differs).
    fn slice(&self, range: Range<usize>) -> ArrayRef {
        let start = range.start;
        let stop = range.end;

        // Full-range slice: hand back the array as-is without calling the encoding.
        if start == 0 && stop == self.len() {
            return self.to_array();
        }

        assert!(
            start <= self.len(),
            "OutOfBounds: start {start} > length {}",
            self.len()
        );
        assert!(
            stop <= self.len(),
            "OutOfBounds: stop {stop} > length {}",
            self.len()
        );

        assert!(start <= stop, "start ({start}) must be <= stop ({stop})");

        // Empty slice: build an empty canonical array instead of asking the encoding.
        if start == stop {
            return Canonical::empty(self.dtype()).into_array();
        }

        let sliced = <V::OperationsVTable as OperationsVTable<V>>::slice(&self.0, range);

        assert_eq!(
            sliced.len(),
            stop - start,
            "Slice length mismatch {}",
            self.encoding_id()
        );

        // Slightly more expensive, so only do this in debug builds.
        debug_assert_eq!(
            sliced.dtype(),
            self.dtype(),
            "Slice dtype mismatch {}",
            self.encoding_id()
        );

        // Propagate some stats from the original array to the sliced array.
        // Skipped when the slice is a constant array. Only `IsConstant`, `IsSorted` and
        // `IsStrictSorted` are passed on, and only when the stored value is exact and reads
        // back as `true`.
        if !sliced.is::<ConstantVTable>() {
            self.statistics().with_iter(|iter| {
                sliced.statistics().inherit(iter.filter(|(stat, value)| {
                    matches!(
                        stat,
                        Stat::IsConstant | Stat::IsSorted | Stat::IsStrictSorted
                    ) && value.as_ref().as_exact().is_some_and(|v| {
                        Scalar::new(DType::Bool(Nullability::NonNullable), v.clone())
                            .as_bool()
                            .value()
                            .unwrap_or_default()
                    })
                }));
            });
        }

        sliced
    }

    /// Returns the scalar at `index`, or a null scalar of the array's dtype when that position
    /// is invalid (in which case the encoding's `scalar_at` is not called).
    ///
    /// # Panics
    ///
    /// Panics if `index >= self.len()`, or if the encoding returns a scalar whose dtype differs
    /// from the array's dtype.
    fn scalar_at(&self, index: usize) -> Scalar {
        assert!(index < self.len(), "index {index} out of bounds");
        if self.is_invalid(index) {
            return Scalar::null(self.dtype().clone());
        }
        let scalar = <V::OperationsVTable as OperationsVTable<V>>::scalar_at(&self.0, index);
        assert_eq!(self.dtype(), scalar.dtype(), "Scalar dtype mismatch");
        scalar
    }

    /// Bounds-checks `index` (failing through `vortex_panic!` when `index >= self.len()`) and
    /// then asks the vtable's `ValidityVTable` whether that position is valid.
    fn is_valid(&self, index: usize) -> bool {
        if index >= self.len() {
            vortex_panic!(OutOfBounds: index, 0, self.len());
        }
        <V::ValidityVTable as ValidityVTable<V>>::is_valid(&self.0, index)
    }

    /// Negation of [`Array::is_valid`], including its bounds check.
    fn is_invalid(&self, index: usize) -> bool {
        !self.is_valid(index)
    }

    /// Forwards to the vtable's `ValidityVTable::all_valid`.
    fn all_valid(&self) -> bool {
        <V::ValidityVTable as ValidityVTable<V>>::all_valid(&self.0)
    }

    /// Forwards to the vtable's `ValidityVTable::all_invalid`.
    fn all_invalid(&self) -> bool {
        <V::ValidityVTable as ValidityVTable<V>>::all_invalid(&self.0)
    }

    /// Number of valid elements.
    ///
    /// Answers from an exact `NullCount` statistic when one is present. Otherwise asks the
    /// vtable, asserts that the count does not exceed the length, and records the matching
    /// exact `NullCount` in the statistics before returning.
    fn valid_count(&self) -> usize {
        if let Some(Precision::Exact(invalid_count)) =
            self.statistics().get_as::<usize>(Stat::NullCount)
        {
            return self.len() - invalid_count;
        }

        let count = <V::ValidityVTable as ValidityVTable<V>>::valid_count(&self.0);
        assert!(count <= self.len(), "Valid count exceeds array length");

        // Store the complement so that later calls can answer from the statistics.
        self.statistics()
            .set(Stat::NullCount, Precision::exact(self.len() - count));

        count
    }

    /// Number of invalid elements.
    ///
    /// Answers from an exact `NullCount` statistic when one is present. Otherwise asks the
    /// vtable, asserts that the count does not exceed the length, and records it as an exact
    /// `NullCount` in the statistics before returning.
    fn invalid_count(&self) -> usize {
        if let Some(Precision::Exact(invalid_count)) =
            self.statistics().get_as::<usize>(Stat::NullCount)
        {
            return invalid_count;
        }

        let count = <V::ValidityVTable as ValidityVTable<V>>::invalid_count(&self.0);
        assert!(count <= self.len(), "Invalid count exceeds array length");

        self.statistics()
            .set(Stat::NullCount, Precision::exact(count));

        count
    }

    /// Validity mask from the vtable; asserts that its length equals the array length.
    fn validity_mask(&self) -> Mask {
        let mask = <V::ValidityVTable as ValidityVTable<V>>::validity_mask(&self.0);
        assert_eq!(mask.len(), self.len(), "Validity mask length mismatch");
        mask
    }

    /// Canonicalizes the array through the vtable's `CanonicalVTable`.
    ///
    /// # Panics
    ///
    /// Panics if the canonical array's length or dtype differs from this array's.
    fn to_canonical(&self) -> Canonical {
        let canonical = <V::CanonicalVTable as CanonicalVTable<V>>::canonicalize(&self.0);
        assert_eq!(
            self.len(),
            canonical.as_ref().len(),
            "Canonical length mismatch {}. Expected {} but encoded into {}.",
            self.encoding_id(),
            self.len(),
            canonical.as_ref().len()
        );
        assert_eq!(
            self.dtype(),
            canonical.as_ref().dtype(),
            "Canonical dtype mismatch {}. Expected {} but encoded into {}.",
            self.encoding_id(),
            self.dtype(),
            canonical.as_ref().dtype()
        );
        // Hand this array's statistics to the canonical array's stats set via `inherit_from`.
        canonical
            .as_ref()
            .statistics()
            .inherit_from(self.statistics());
        canonical
    }

    /// Appends the array to `builder` through the vtable's `CanonicalVTable`.
    ///
    /// # Panics
    ///
    /// Fails through `vortex_panic!` if the builder's dtype differs from the array's, and
    /// panics if the builder did not grow by exactly `self.len()` elements.
    fn append_to_builder(&self, builder: &mut dyn ArrayBuilder) {
        if builder.dtype() != self.dtype() {
            vortex_panic!(
                "Builder dtype mismatch: expected {}, got {}",
                self.dtype(),
                builder.dtype(),
            );
        }
        // Remember the starting length so the growth can be verified afterwards.
        let len = builder.len();

        <V::CanonicalVTable as CanonicalVTable<V>>::append_to_builder(&self.0, builder);
        assert_eq!(
            len + self.len(),
            builder.len(),
            "Builder length mismatch after writing array for encoding {}",
            self.encoding_id(),
        );
    }

    /// Statistics as exposed by the vtable's `ArrayVTable::stats`.
    fn statistics(&self) -> StatsSetRef<'_> {
        <V::ArrayVTable as ArrayVTable<V>>::stats(&self.0)
    }

    /// Rebuilds the array through its encoding's `build`, reusing this array's dtype, length,
    /// metadata and buffers, and supplying `children` as the new children.
    ///
    /// # Errors
    ///
    /// Returns an error if the array produces no metadata (it does not support serialization),
    /// or if `build` fails. A requested child that is out of range, or whose length or dtype
    /// does not match what is asked for, also produces an error from the child lookup.
    fn with_children(&self, children: &[ArrayRef]) -> VortexResult<ArrayRef> {
        // `ArrayChildren` implementation backed by the caller-supplied slice.
        struct ReplacementChildren<'a> {
            children: &'a [ArrayRef],
        }

        impl ArrayChildren for ReplacementChildren<'_> {
            // Returns a clone of the child at `index` after checking the index, the expected
            // length and the expected dtype.
            fn get(&self, index: usize, dtype: &DType, len: usize) -> VortexResult<ArrayRef> {
                if index >= self.children.len() {
                    vortex_bail!(OutOfBounds: index, 0, self.children.len());
                }
                let child = &self.children[index];
                if child.len() != len {
                    vortex_bail!(
                        "Child length mismatch: expected {}, got {}",
                        len,
                        child.len()
                    );
                }
                if child.dtype() != dtype {
                    vortex_bail!(
                        "Child dtype mismatch: expected {}, got {}",
                        dtype,
                        child.dtype()
                    );
                }
                Ok(child.clone())
            }

            fn len(&self) -> usize {
                self.children.len()
            }
        }

        let metadata = self.metadata()?.ok_or_else(|| {
            vortex_err!("Cannot replace children for arrays that do not support serialization")
        })?;

        // Replace the children of the array by re-building the array from parts.
        self.encoding().build(
            self.dtype(),
            self.len(),
            &metadata,
            &self.buffers(),
            &ReplacementChildren { children },
        )
    }

    /// Forwards to the vtable's `ComputeVTable::invoke`.
    fn invoke(
        &self,
        compute_fn: &ComputeFn,
        args: &InvocationArgs,
    ) -> VortexResult<Option<Output>> {
        <V::ComputeVTable as ComputeVTable<V>>::invoke(&self.0, compute_fn, args)
    }

    /// Forwards to the vtable's `PipelineVTable::to_operator`.
    fn to_operator(&self) -> VortexResult<Option<OperatorRef>> {
        <V::PipelineVTable as PipelineVTable<V>>::to_operator(&self.0)
    }
}
640
641impl<V: VTable> ArrayVisitor for ArrayAdapter<V> {
642    fn children(&self) -> Vec<ArrayRef> {
643        struct ChildrenCollector {
644            children: Vec<ArrayRef>,
645        }
646
647        impl ArrayChildVisitor for ChildrenCollector {
648            fn visit_child(&mut self, _name: &str, array: &dyn Array) {
649                self.children.push(array.to_array());
650            }
651        }
652
653        let mut collector = ChildrenCollector {
654            children: Vec::new(),
655        };
656        <V::VisitorVTable as VisitorVTable<V>>::visit_children(&self.0, &mut collector);
657        collector.children
658    }
659
660    fn nchildren(&self) -> usize {
661        <V::VisitorVTable as VisitorVTable<V>>::nchildren(&self.0)
662    }
663
664    fn children_names(&self) -> Vec<String> {
665        struct ChildNameCollector {
666            names: Vec<String>,
667        }
668
669        impl ArrayChildVisitor for ChildNameCollector {
670            fn visit_child(&mut self, name: &str, _array: &dyn Array) {
671                self.names.push(name.to_string());
672            }
673        }
674
675        let mut collector = ChildNameCollector { names: Vec::new() };
676        <V::VisitorVTable as VisitorVTable<V>>::visit_children(&self.0, &mut collector);
677        collector.names
678    }
679
680    fn named_children(&self) -> Vec<(String, ArrayRef)> {
681        struct NamedChildrenCollector {
682            children: Vec<(String, ArrayRef)>,
683        }
684
685        impl ArrayChildVisitor for NamedChildrenCollector {
686            fn visit_child(&mut self, name: &str, array: &dyn Array) {
687                self.children.push((name.to_string(), array.to_array()));
688            }
689        }
690
691        let mut collector = NamedChildrenCollector {
692            children: Vec::new(),
693        };
694
695        <V::VisitorVTable as VisitorVTable<V>>::visit_children(&self.0, &mut collector);
696        collector.children
697    }
698
699    fn buffers(&self) -> Vec<ByteBuffer> {
700        struct BufferCollector {
701            buffers: Vec<ByteBuffer>,
702        }
703
704        impl ArrayBufferVisitor for BufferCollector {
705            fn visit_buffer(&mut self, buffer: &ByteBuffer) {
706                self.buffers.push(buffer.clone());
707            }
708        }
709
710        let mut collector = BufferCollector {
711            buffers: Vec::new(),
712        };
713        <V::VisitorVTable as VisitorVTable<V>>::visit_buffers(&self.0, &mut collector);
714        collector.buffers
715    }
716
717    fn nbuffers(&self) -> usize {
718        <V::VisitorVTable as VisitorVTable<V>>::nbuffers(&self.0)
719    }
720
721    fn metadata(&self) -> VortexResult<Option<Vec<u8>>> {
722        Ok(<V::SerdeVTable as SerdeVTable<V>>::metadata(&self.0)?.map(|m| m.serialize()))
723    }
724
725    fn metadata_fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result {
726        match <V::SerdeVTable as SerdeVTable<V>>::metadata(&self.0) {
727            Err(e) => write!(f, "<serde error: {e}>"),
728            Ok(None) => write!(f, "<serde not supported>"),
729            Ok(Some(metadata)) => Debug::fmt(&metadata, f),
730        }
731    }
732}