vortex_array/array/
mod.rs

1// SPDX-License-Identifier: Apache-2.0
2// SPDX-FileCopyrightText: Copyright the Vortex contributors
3
4pub mod display;
5mod visitor;
6
7use std::any::Any;
8use std::fmt::{Debug, Formatter};
9use std::ops::Range;
10use std::sync::Arc;
11
12pub use visitor::*;
13use vortex_buffer::ByteBuffer;
14use vortex_dtype::{DType, Nullability};
15use vortex_error::{VortexExpect, VortexResult, vortex_bail, vortex_err, vortex_panic};
16use vortex_mask::Mask;
17use vortex_scalar::Scalar;
18
19use crate::arrays::{
20    BoolEncoding, ConstantVTable, DecimalEncoding, ExtensionEncoding, ListEncoding, NullEncoding,
21    PrimitiveEncoding, StructEncoding, VarBinEncoding, VarBinViewEncoding,
22};
23use crate::builders::ArrayBuilder;
24use crate::compute::{ComputeFn, Cost, InvocationArgs, IsConstantOpts, Output, is_constant_opts};
25use crate::pipeline::{OperatorRef, PipelineVTable};
26use crate::serde::ArrayChildren;
27use crate::stats::{Precision, Stat, StatsProviderExt, StatsSetRef};
28use crate::vtable::{
29    ArrayVTable, CanonicalVTable, ComputeVTable, OperationsVTable, SerdeVTable, VTable,
30    ValidityVTable, VisitorVTable,
31};
32use crate::{Canonical, EncodingId, EncodingRef, SerializeMetadata};
33
34/// The public API trait for all Vortex arrays.
35pub trait Array: 'static + private::Sealed + Send + Sync + Debug + ArrayVisitor {
36    /// Returns the array as a reference to a generic [`Any`] trait object.
37    fn as_any(&self) -> &dyn Any;
38
39    /// Returns the array as an [`ArrayRef`].
40    fn to_array(&self) -> ArrayRef;
41
42    /// Returns the length of the array.
43    fn len(&self) -> usize;
44
45    /// Returns whether the array is empty (has zero rows).
46    fn is_empty(&self) -> bool {
47        self.len() == 0
48    }
49
50    /// Returns the logical Vortex [`DType`] of the array.
51    fn dtype(&self) -> &DType;
52
53    /// Returns the encoding of the array.
54    fn encoding(&self) -> EncodingRef;
55
56    /// Returns the encoding ID of the array.
57    fn encoding_id(&self) -> EncodingId;
58
59    /// Performs a constant-time slice of the array.
60    fn slice(&self, range: Range<usize>) -> ArrayRef;
61
62    /// Fetch the scalar at the given index.
63    ///
64    /// This method panics if the index is out of bounds for the array.
65    fn scalar_at(&self, index: usize) -> Scalar;
66
67    /// Returns whether the array is of the given encoding.
68    fn is_encoding(&self, encoding: EncodingId) -> bool {
69        self.encoding_id() == encoding
70    }
71
72    /// Returns whether this array is an arrow encoding.
73    // TODO(ngates): this shouldn't live here.
74    fn is_arrow(&self) -> bool {
75        self.is_encoding(NullEncoding.id())
76            || self.is_encoding(BoolEncoding.id())
77            || self.is_encoding(PrimitiveEncoding.id())
78            || self.is_encoding(VarBinEncoding.id())
79            || self.is_encoding(VarBinViewEncoding.id())
80    }
81
82    /// Whether the array is of a canonical encoding.
83    // TODO(ngates): this shouldn't live here.
84    fn is_canonical(&self) -> bool {
85        self.is_encoding(NullEncoding.id())
86            || self.is_encoding(BoolEncoding.id())
87            || self.is_encoding(PrimitiveEncoding.id())
88            || self.is_encoding(DecimalEncoding.id())
89            || self.is_encoding(StructEncoding.id())
90            || self.is_encoding(ListEncoding.id())
91            || self.is_encoding(VarBinViewEncoding.id())
92            || self.is_encoding(ExtensionEncoding.id())
93    }
94
95    /// Returns whether the item at `index` is valid.
96    fn is_valid(&self, index: usize) -> bool;
97
98    /// Returns whether the item at `index` is invalid.
99    fn is_invalid(&self, index: usize) -> bool;
100
101    /// Returns whether all items in the array are valid.
102    ///
103    /// This is usually cheaper than computing a precise `valid_count`.
104    fn all_valid(&self) -> bool;
105
106    /// Returns whether the array is all invalid.
107    ///
108    /// This is usually cheaper than computing a precise `invalid_count`.
109    fn all_invalid(&self) -> bool;
110
111    /// Returns the number of valid elements in the array.
112    fn valid_count(&self) -> usize;
113
114    /// Returns the number of invalid elements in the array.
115    fn invalid_count(&self) -> usize;
116
117    /// Returns the canonical validity mask for the array.
118    fn validity_mask(&self) -> Mask;
119
120    /// Returns the canonical representation of the array.
121    fn to_canonical(&self) -> Canonical;
122
123    /// Writes the array into the canonical builder.
124    ///
125    /// The [`DType`] of the builder must match that of the array.
126    fn append_to_builder(&self, builder: &mut dyn ArrayBuilder);
127
128    /// Returns the statistics of the array.
129    // TODO(ngates): change how this works. It's weird.
130    fn statistics(&self) -> StatsSetRef<'_>;
131
132    /// Replaces the children of the array with the given array references.
133    fn with_children(&self, children: &[ArrayRef]) -> VortexResult<ArrayRef>;
134
135    /// Optionally invoke a kernel for the given compute function.
136    ///
137    /// These encoding-specific kernels are independent of kernels registered directly with
138    /// compute functions using [`ComputeFn::register_kernel`], and are attempted only if none of
139    /// the function-specific kernels returns a result.
140    ///
141    /// This allows encodings the opportunity to generically implement many compute functions
142    /// that share some property, for example [`ComputeFn::is_elementwise`], without prior
143    /// knowledge of the function itself, while still allowing users to override the implementation
144    /// of compute functions for built-in encodings. For an example, see the implementation for
145    /// chunked arrays.
146    ///
147    /// The first input in the [`InvocationArgs`] is always the array itself.
148    ///
149    /// Warning: do not call `compute_fn.invoke(args)` directly, as this will result in a recursive
150    /// call.
151    fn invoke(&self, compute_fn: &ComputeFn, args: &InvocationArgs)
152    -> VortexResult<Option<Output>>;
153
154    /// Convert the array to a pipeline operator if supported by the encoding.
155    ///
156    /// Returns `None` if the encoding does not support pipeline operations.
157    fn to_operator(&self) -> VortexResult<Option<OperatorRef>>;
158}
159
160impl Array for Arc<dyn Array> {
161    #[inline]
162    fn as_any(&self) -> &dyn Any {
163        self.as_ref().as_any()
164    }
165
166    #[inline]
167    fn to_array(&self) -> ArrayRef {
168        self.clone()
169    }
170
171    #[inline]
172    fn len(&self) -> usize {
173        self.as_ref().len()
174    }
175
176    #[inline]
177    fn dtype(&self) -> &DType {
178        self.as_ref().dtype()
179    }
180
181    #[inline]
182    fn encoding(&self) -> EncodingRef {
183        self.as_ref().encoding()
184    }
185
186    #[inline]
187    fn encoding_id(&self) -> EncodingId {
188        self.as_ref().encoding_id()
189    }
190
191    #[inline]
192    fn slice(&self, range: Range<usize>) -> ArrayRef {
193        self.as_ref().slice(range)
194    }
195
196    #[inline]
197    fn scalar_at(&self, index: usize) -> Scalar {
198        self.as_ref().scalar_at(index)
199    }
200
201    #[inline]
202    fn is_valid(&self, index: usize) -> bool {
203        self.as_ref().is_valid(index)
204    }
205
206    #[inline]
207    fn is_invalid(&self, index: usize) -> bool {
208        self.as_ref().is_invalid(index)
209    }
210
211    #[inline]
212    fn all_valid(&self) -> bool {
213        self.as_ref().all_valid()
214    }
215
216    #[inline]
217    fn all_invalid(&self) -> bool {
218        self.as_ref().all_invalid()
219    }
220
221    #[inline]
222    fn valid_count(&self) -> usize {
223        self.as_ref().valid_count()
224    }
225
226    #[inline]
227    fn invalid_count(&self) -> usize {
228        self.as_ref().invalid_count()
229    }
230
231    #[inline]
232    fn validity_mask(&self) -> Mask {
233        self.as_ref().validity_mask()
234    }
235
236    fn to_canonical(&self) -> Canonical {
237        self.as_ref().to_canonical()
238    }
239
240    fn append_to_builder(&self, builder: &mut dyn ArrayBuilder) {
241        self.as_ref().append_to_builder(builder)
242    }
243
244    fn statistics(&self) -> StatsSetRef<'_> {
245        self.as_ref().statistics()
246    }
247
248    fn with_children(&self, children: &[ArrayRef]) -> VortexResult<ArrayRef> {
249        self.as_ref().with_children(children)
250    }
251
252    fn invoke(
253        &self,
254        compute_fn: &ComputeFn,
255        args: &InvocationArgs,
256    ) -> VortexResult<Option<Output>> {
257        self.as_ref().invoke(compute_fn, args)
258    }
259
260    fn to_operator(&self) -> VortexResult<Option<OperatorRef>> {
261        self.as_ref().to_operator()
262    }
263}
264
265/// A reference counted pointer to a dynamic [`Array`] trait object.
266pub type ArrayRef = Arc<dyn Array>;
267
268impl ToOwned for dyn Array {
269    type Owned = ArrayRef;
270
271    fn to_owned(&self) -> Self::Owned {
272        self.to_array()
273    }
274}
275
276impl dyn Array + '_ {
277    /// Returns the array downcast to the given `A`.
278    pub fn as_<V: VTable>(&self) -> &V::Array {
279        self.as_opt::<V>().vortex_expect("Failed to downcast")
280    }
281
282    /// Returns the array downcast to the given `A`.
283    pub fn as_opt<V: VTable>(&self) -> Option<&V::Array> {
284        self.as_any()
285            .downcast_ref::<ArrayAdapter<V>>()
286            .map(|array_adapter| &array_adapter.0)
287    }
288
289    /// Is self an array with encoding from vtable `V`.
290    pub fn is<V: VTable>(&self) -> bool {
291        self.as_opt::<V>().is_some()
292    }
293
294    pub fn is_constant(&self) -> bool {
295        let opts = IsConstantOpts {
296            cost: Cost::Specialized,
297        };
298        is_constant_opts(self, &opts)
299            .inspect_err(|e| log::warn!("Failed to compute IsConstant: {e}"))
300            .ok()
301            .flatten()
302            .unwrap_or_default()
303    }
304
305    pub fn is_constant_opts(&self, cost: Cost) -> bool {
306        let opts = IsConstantOpts { cost };
307        is_constant_opts(self, &opts)
308            .inspect_err(|e| log::warn!("Failed to compute IsConstant: {e}"))
309            .ok()
310            .flatten()
311            .unwrap_or_default()
312    }
313
314    pub fn as_constant(&self) -> Option<Scalar> {
315        self.is_constant().then(|| self.scalar_at(0))
316    }
317
318    /// Total size of the array in bytes, including all children and buffers.
319    pub fn nbytes(&self) -> u64 {
320        let mut nbytes = 0;
321        for array in self.depth_first_traversal() {
322            for buffer in array.buffers() {
323                nbytes += buffer.len() as u64;
324            }
325        }
326        nbytes
327    }
328}
329
330/// Trait for converting a type into a Vortex [`ArrayRef`].
331pub trait IntoArray {
332    fn into_array(self) -> ArrayRef;
333}
334
335impl IntoArray for ArrayRef {
336    fn into_array(self) -> ArrayRef {
337        self
338    }
339}
340
341mod private {
342    use super::*;
343
344    pub trait Sealed {}
345
346    impl<V: VTable> Sealed for ArrayAdapter<V> {}
347    impl Sealed for Arc<dyn Array> {}
348}
349
350/// Adapter struct used to lift the [`VTable`] trait into an object-safe [`Array`]
351/// implementation.
352///
353/// Since this is a unit struct with `repr(transparent)`, we are able to turn un-adapted array
354/// structs into [`dyn Array`] using some cheeky casting inside [`std::ops::Deref`] and
355/// [`AsRef`]. See the `vtable!` macro for more details.
356#[repr(transparent)]
357pub struct ArrayAdapter<V: VTable>(V::Array);
358
359impl<V: VTable> ArrayAdapter<V> {
360    /// Provide a reference to the underlying array held within the adapter.
361    pub fn as_inner(&self) -> &V::Array {
362        &self.0
363    }
364
365    /// Unwrap into the inner array type, consuming the adapter.
366    pub fn into_inner(self) -> V::Array {
367        self.0
368    }
369}
370
371impl<V: VTable> Debug for ArrayAdapter<V> {
372    fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result {
373        self.0.fmt(f)
374    }
375}
376
377impl<V: VTable> Array for ArrayAdapter<V> {
378    fn as_any(&self) -> &dyn Any {
379        self
380    }
381
382    fn to_array(&self) -> ArrayRef {
383        Arc::new(ArrayAdapter::<V>(self.0.clone()))
384    }
385
386    fn len(&self) -> usize {
387        <V::ArrayVTable as ArrayVTable<V>>::len(&self.0)
388    }
389
390    fn dtype(&self) -> &DType {
391        <V::ArrayVTable as ArrayVTable<V>>::dtype(&self.0)
392    }
393
394    fn encoding(&self) -> EncodingRef {
395        V::encoding(&self.0)
396    }
397
398    fn encoding_id(&self) -> EncodingId {
399        V::encoding(&self.0).id()
400    }
401
402    fn slice(&self, range: Range<usize>) -> ArrayRef {
403        let start = range.start;
404        let stop = range.end;
405
406        if start == 0 && stop == self.len() {
407            return self.to_array();
408        }
409
410        assert!(
411            start <= self.len(),
412            "OutOfBounds: start {start} > length {}",
413            self.len()
414        );
415        assert!(
416            stop <= self.len(),
417            "OutOfBounds: stop {stop} > length {}",
418            self.len()
419        );
420
421        assert!(start <= stop, "start ({start}) must be <= stop ({stop})");
422
423        if start == stop {
424            return Canonical::empty(self.dtype()).into_array();
425        }
426
427        let sliced = <V::OperationsVTable as OperationsVTable<V>>::slice(&self.0, range);
428
429        assert_eq!(
430            sliced.len(),
431            stop - start,
432            "Slice length mismatch {}",
433            self.encoding_id()
434        );
435
436        // Slightly more expensive, so only do this in debug builds.
437        debug_assert_eq!(
438            sliced.dtype(),
439            self.dtype(),
440            "Slice dtype mismatch {}",
441            self.encoding_id()
442        );
443
444        // Propagate some stats from the original array to the sliced array.
445        if !sliced.is::<ConstantVTable>() {
446            self.statistics().with_iter(|iter| {
447                sliced.statistics().inherit(iter.filter(|(stat, value)| {
448                    matches!(
449                        stat,
450                        Stat::IsConstant | Stat::IsSorted | Stat::IsStrictSorted
451                    ) && value.as_ref().as_exact().is_some_and(|v| {
452                        Scalar::new(DType::Bool(Nullability::NonNullable), v.clone())
453                            .as_bool()
454                            .value()
455                            .unwrap_or_default()
456                    })
457                }));
458            });
459        }
460
461        sliced
462    }
463
464    fn scalar_at(&self, index: usize) -> Scalar {
465        assert!(index < self.len(), "index {index} out of bounds");
466        if self.is_invalid(index) {
467            return Scalar::null(self.dtype().clone());
468        }
469        let scalar = <V::OperationsVTable as OperationsVTable<V>>::scalar_at(&self.0, index);
470        assert_eq!(self.dtype(), scalar.dtype(), "Scalar dtype mismatch");
471        scalar
472    }
473
474    fn is_valid(&self, index: usize) -> bool {
475        if index >= self.len() {
476            vortex_panic!(OutOfBounds: index, 0, self.len());
477        }
478        <V::ValidityVTable as ValidityVTable<V>>::is_valid(&self.0, index)
479    }
480
481    fn is_invalid(&self, index: usize) -> bool {
482        !self.is_valid(index)
483    }
484
485    fn all_valid(&self) -> bool {
486        <V::ValidityVTable as ValidityVTable<V>>::all_valid(&self.0)
487    }
488
489    fn all_invalid(&self) -> bool {
490        <V::ValidityVTable as ValidityVTable<V>>::all_invalid(&self.0)
491    }
492
493    fn valid_count(&self) -> usize {
494        if let Some(Precision::Exact(invalid_count)) =
495            self.statistics().get_as::<usize>(Stat::NullCount)
496        {
497            return self.len() - invalid_count;
498        }
499
500        let count = <V::ValidityVTable as ValidityVTable<V>>::valid_count(&self.0);
501        assert!(count <= self.len(), "Valid count exceeds array length");
502
503        self.statistics()
504            .set(Stat::NullCount, Precision::exact(self.len() - count));
505
506        count
507    }
508
509    fn invalid_count(&self) -> usize {
510        if let Some(Precision::Exact(invalid_count)) =
511            self.statistics().get_as::<usize>(Stat::NullCount)
512        {
513            return invalid_count;
514        }
515
516        let count = <V::ValidityVTable as ValidityVTable<V>>::invalid_count(&self.0);
517        assert!(count <= self.len(), "Invalid count exceeds array length");
518
519        self.statistics()
520            .set(Stat::NullCount, Precision::exact(count));
521
522        count
523    }
524
525    fn validity_mask(&self) -> Mask {
526        let mask = <V::ValidityVTable as ValidityVTable<V>>::validity_mask(&self.0);
527        assert_eq!(mask.len(), self.len(), "Validity mask length mismatch");
528        mask
529    }
530
531    fn to_canonical(&self) -> Canonical {
532        let canonical = <V::CanonicalVTable as CanonicalVTable<V>>::canonicalize(&self.0);
533        assert_eq!(
534            self.len(),
535            canonical.as_ref().len(),
536            "Canonical length mismatch {}. Expected {} but encoded into {}.",
537            self.encoding_id(),
538            self.len(),
539            canonical.as_ref().len()
540        );
541        assert_eq!(
542            self.dtype(),
543            canonical.as_ref().dtype(),
544            "Canonical dtype mismatch {}. Expected {} but encoded into {}.",
545            self.encoding_id(),
546            self.dtype(),
547            canonical.as_ref().dtype()
548        );
549        canonical
550            .as_ref()
551            .statistics()
552            .inherit_from(self.statistics());
553        canonical
554    }
555
556    fn append_to_builder(&self, builder: &mut dyn ArrayBuilder) {
557        if builder.dtype() != self.dtype() {
558            vortex_panic!(
559                "Builder dtype mismatch: expected {}, got {}",
560                self.dtype(),
561                builder.dtype(),
562            );
563        }
564        let len = builder.len();
565
566        <V::CanonicalVTable as CanonicalVTable<V>>::append_to_builder(&self.0, builder);
567        assert_eq!(
568            len + self.len(),
569            builder.len(),
570            "Builder length mismatch after writing array for encoding {}",
571            self.encoding_id(),
572        );
573    }
574
575    fn statistics(&self) -> StatsSetRef<'_> {
576        <V::ArrayVTable as ArrayVTable<V>>::stats(&self.0)
577    }
578
579    fn with_children(&self, children: &[ArrayRef]) -> VortexResult<ArrayRef> {
580        struct ReplacementChildren<'a> {
581            children: &'a [ArrayRef],
582        }
583
584        impl ArrayChildren for ReplacementChildren<'_> {
585            fn get(&self, index: usize, dtype: &DType, len: usize) -> VortexResult<ArrayRef> {
586                if index >= self.children.len() {
587                    vortex_bail!(OutOfBounds: index, 0, self.children.len());
588                }
589                let child = &self.children[index];
590                if child.len() != len {
591                    vortex_bail!(
592                        "Child length mismatch: expected {}, got {}",
593                        len,
594                        child.len()
595                    );
596                }
597                if child.dtype() != dtype {
598                    vortex_bail!(
599                        "Child dtype mismatch: expected {}, got {}",
600                        dtype,
601                        child.dtype()
602                    );
603                }
604                Ok(child.clone())
605            }
606
607            fn len(&self) -> usize {
608                self.children.len()
609            }
610        }
611
612        let metadata = self.metadata()?.ok_or_else(|| {
613            vortex_err!("Cannot replace children for arrays that do not support serialization")
614        })?;
615
616        // Replace the children of the array by re-building the array from parts.
617        self.encoding().build(
618            self.dtype(),
619            self.len(),
620            &metadata,
621            &self.buffers(),
622            &ReplacementChildren { children },
623        )
624    }
625
626    fn invoke(
627        &self,
628        compute_fn: &ComputeFn,
629        args: &InvocationArgs,
630    ) -> VortexResult<Option<Output>> {
631        <V::ComputeVTable as ComputeVTable<V>>::invoke(&self.0, compute_fn, args)
632    }
633
634    fn to_operator(&self) -> VortexResult<Option<OperatorRef>> {
635        <V::PipelineVTable as PipelineVTable<V>>::to_operator(&self.0)
636    }
637}
638
639impl<V: VTable> ArrayVisitor for ArrayAdapter<V> {
640    fn children(&self) -> Vec<ArrayRef> {
641        struct ChildrenCollector {
642            children: Vec<ArrayRef>,
643        }
644
645        impl ArrayChildVisitor for ChildrenCollector {
646            fn visit_child(&mut self, _name: &str, array: &dyn Array) {
647                self.children.push(array.to_array());
648            }
649        }
650
651        let mut collector = ChildrenCollector {
652            children: Vec::new(),
653        };
654        <V::VisitorVTable as VisitorVTable<V>>::visit_children(&self.0, &mut collector);
655        collector.children
656    }
657
658    fn nchildren(&self) -> usize {
659        <V::VisitorVTable as VisitorVTable<V>>::nchildren(&self.0)
660    }
661
662    fn children_names(&self) -> Vec<String> {
663        struct ChildNameCollector {
664            names: Vec<String>,
665        }
666
667        impl ArrayChildVisitor for ChildNameCollector {
668            fn visit_child(&mut self, name: &str, _array: &dyn Array) {
669                self.names.push(name.to_string());
670            }
671        }
672
673        let mut collector = ChildNameCollector { names: Vec::new() };
674        <V::VisitorVTable as VisitorVTable<V>>::visit_children(&self.0, &mut collector);
675        collector.names
676    }
677
678    fn named_children(&self) -> Vec<(String, ArrayRef)> {
679        struct NamedChildrenCollector {
680            children: Vec<(String, ArrayRef)>,
681        }
682
683        impl ArrayChildVisitor for NamedChildrenCollector {
684            fn visit_child(&mut self, name: &str, array: &dyn Array) {
685                self.children.push((name.to_string(), array.to_array()));
686            }
687        }
688
689        let mut collector = NamedChildrenCollector {
690            children: Vec::new(),
691        };
692
693        <V::VisitorVTable as VisitorVTable<V>>::visit_children(&self.0, &mut collector);
694        collector.children
695    }
696
697    fn buffers(&self) -> Vec<ByteBuffer> {
698        struct BufferCollector {
699            buffers: Vec<ByteBuffer>,
700        }
701
702        impl ArrayBufferVisitor for BufferCollector {
703            fn visit_buffer(&mut self, buffer: &ByteBuffer) {
704                self.buffers.push(buffer.clone());
705            }
706        }
707
708        let mut collector = BufferCollector {
709            buffers: Vec::new(),
710        };
711        <V::VisitorVTable as VisitorVTable<V>>::visit_buffers(&self.0, &mut collector);
712        collector.buffers
713    }
714
715    fn nbuffers(&self) -> usize {
716        <V::VisitorVTable as VisitorVTable<V>>::nbuffers(&self.0)
717    }
718
719    fn metadata(&self) -> VortexResult<Option<Vec<u8>>> {
720        Ok(<V::SerdeVTable as SerdeVTable<V>>::metadata(&self.0)?.map(|m| m.serialize()))
721    }
722
723    fn metadata_fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result {
724        match <V::SerdeVTable as SerdeVTable<V>>::metadata(&self.0) {
725            Err(e) => write!(f, "<serde error: {e}>"),
726            Ok(None) => write!(f, "<serde not supported>"),
727            Ok(Some(metadata)) => Debug::fmt(&metadata, f),
728        }
729    }
730}