vortex_array/array/
mod.rs

1// SPDX-License-Identifier: Apache-2.0
2// SPDX-FileCopyrightText: Copyright the Vortex contributors
3
4pub mod display;
5mod visitor;
6
7use std::any::Any;
8use std::fmt::{Debug, Formatter};
9use std::ops::Range;
10use std::sync::Arc;
11
12pub use visitor::*;
13use vortex_buffer::ByteBuffer;
14use vortex_dtype::{DType, Nullability};
15use vortex_error::{VortexExpect, VortexResult, vortex_bail, vortex_err, vortex_panic};
16use vortex_mask::Mask;
17use vortex_scalar::Scalar;
18
19use crate::arrays::{
20    BoolEncoding, ConstantVTable, DecimalEncoding, ExtensionEncoding, ListEncoding, NullEncoding,
21    PrimitiveEncoding, StructEncoding, VarBinEncoding, VarBinViewEncoding,
22};
23use crate::builders::ArrayBuilder;
24use crate::compute::{ComputeFn, Cost, InvocationArgs, IsConstantOpts, Output, is_constant_opts};
25use crate::pipeline::{OperatorRef, PipelineVTable};
26use crate::serde::ArrayChildren;
27use crate::stats::{Precision, Stat, StatsProviderExt, StatsSetRef};
28use crate::vtable::{
29    ArrayVTable, CanonicalVTable, ComputeVTable, OperationsVTable, SerdeVTable, VTable,
30    ValidityVTable, VisitorVTable,
31};
32use crate::{Canonical, EncodingId, EncodingRef, SerializeMetadata};
33
34/// The public API trait for all Vortex arrays.
35pub trait Array: 'static + private::Sealed + Send + Sync + Debug + ArrayVisitor {
36    /// Returns the array as a reference to a generic [`Any`] trait object.
37    fn as_any(&self) -> &dyn Any;
38
39    /// Returns the array as an [`ArrayRef`].
40    fn to_array(&self) -> ArrayRef;
41
42    /// Returns the length of the array.
43    fn len(&self) -> usize;
44
45    /// Returns whether the array is empty (has zero rows).
46    fn is_empty(&self) -> bool {
47        self.len() == 0
48    }
49
50    /// Returns the logical Vortex [`DType`] of the array.
51    fn dtype(&self) -> &DType;
52
53    /// Returns the encoding of the array.
54    fn encoding(&self) -> EncodingRef;
55
56    /// Returns the encoding ID of the array.
57    fn encoding_id(&self) -> EncodingId;
58
59    /// Performs a constant-time slice of the array.
60    fn slice(&self, range: Range<usize>) -> ArrayRef;
61
62    /// Fetch the scalar at the given index.
63    ///
64    /// This method panics if the index is out of bounds for the array.
65    fn scalar_at(&self, index: usize) -> Scalar;
66
67    /// Returns whether the array is of the given encoding.
68    fn is_encoding(&self, encoding: EncodingId) -> bool {
69        self.encoding_id() == encoding
70    }
71
72    /// Returns whether this array is an arrow encoding.
73    // TODO(ngates): this shouldn't live here.
74    fn is_arrow(&self) -> bool {
75        self.is_encoding(NullEncoding.id())
76            || self.is_encoding(BoolEncoding.id())
77            || self.is_encoding(PrimitiveEncoding.id())
78            || self.is_encoding(VarBinEncoding.id())
79            || self.is_encoding(VarBinViewEncoding.id())
80    }
81
82    /// Whether the array is of a canonical encoding.
83    // TODO(ngates): this shouldn't live here.
84    fn is_canonical(&self) -> bool {
85        self.is_encoding(NullEncoding.id())
86            || self.is_encoding(BoolEncoding.id())
87            || self.is_encoding(PrimitiveEncoding.id())
88            || self.is_encoding(DecimalEncoding.id())
89            || self.is_encoding(StructEncoding.id())
90            || self.is_encoding(ListEncoding.id())
91            || self.is_encoding(VarBinViewEncoding.id())
92            || self.is_encoding(ExtensionEncoding.id())
93    }
94
95    /// Returns whether the item at `index` is valid.
96    fn is_valid(&self, index: usize) -> bool;
97
98    /// Returns whether the item at `index` is invalid.
99    fn is_invalid(&self, index: usize) -> bool;
100
101    /// Returns whether all items in the array are valid.
102    ///
103    /// This is usually cheaper than computing a precise `valid_count`.
104    fn all_valid(&self) -> bool;
105
106    /// Returns whether the array is all invalid.
107    ///
108    /// This is usually cheaper than computing a precise `invalid_count`.
109    fn all_invalid(&self) -> bool;
110
111    /// Returns the number of valid elements in the array.
112    fn valid_count(&self) -> usize;
113
114    /// Returns the number of invalid elements in the array.
115    fn invalid_count(&self) -> usize;
116
117    /// Returns the canonical validity mask for the array.
118    fn validity_mask(&self) -> Mask;
119
120    /// Returns the canonical representation of the array.
121    fn to_canonical(&self) -> VortexResult<Canonical>;
122
123    /// Writes the array into the canonical builder.
124    ///
125    /// The [`DType`] of the builder must match that of the array.
126    fn append_to_builder(&self, builder: &mut dyn ArrayBuilder) -> VortexResult<()>;
127
128    /// Returns the statistics of the array.
129    // TODO(ngates): change how this works. It's weird.
130    fn statistics(&self) -> StatsSetRef<'_>;
131
132    /// Replaces the children of the array with the given array references.
133    fn with_children(&self, children: &[ArrayRef]) -> VortexResult<ArrayRef>;
134
135    /// Optionally invoke a kernel for the given compute function.
136    ///
137    /// These encoding-specific kernels are independent of kernels registered directly with
138    /// compute functions using [`ComputeFn::register_kernel`], and are attempted only if none of
139    /// the function-specific kernels returns a result.
140    ///
141    /// This allows encodings the opportunity to generically implement many compute functions
142    /// that share some property, for example [`ComputeFn::is_elementwise`], without prior
143    /// knowledge of the function itself, while still allowing users to override the implementation
144    /// of compute functions for built-in encodings. For an example, see the implementation for
145    /// chunked arrays.
146    ///
147    /// The first input in the [`InvocationArgs`] is always the array itself.
148    ///
149    /// Warning: do not call `compute_fn.invoke(args)` directly, as this will result in a recursive
150    /// call.
151    fn invoke(&self, compute_fn: &ComputeFn, args: &InvocationArgs)
152    -> VortexResult<Option<Output>>;
153
154    /// Convert the array to a pipeline operator if supported by the encoding.
155    ///
156    /// Returns `None` if the encoding does not support pipeline operations.
157    fn to_operator(&self) -> VortexResult<Option<OperatorRef>>;
158}
159
160impl Array for Arc<dyn Array> {
161    fn as_any(&self) -> &dyn Any {
162        self.as_ref().as_any()
163    }
164
165    fn to_array(&self) -> ArrayRef {
166        self.clone()
167    }
168
169    fn len(&self) -> usize {
170        self.as_ref().len()
171    }
172
173    fn dtype(&self) -> &DType {
174        self.as_ref().dtype()
175    }
176
177    fn encoding(&self) -> EncodingRef {
178        self.as_ref().encoding()
179    }
180
181    fn encoding_id(&self) -> EncodingId {
182        self.as_ref().encoding_id()
183    }
184
185    fn slice(&self, range: Range<usize>) -> ArrayRef {
186        self.as_ref().slice(range)
187    }
188
189    fn scalar_at(&self, index: usize) -> Scalar {
190        self.as_ref().scalar_at(index)
191    }
192
193    fn is_valid(&self, index: usize) -> bool {
194        self.as_ref().is_valid(index)
195    }
196
197    fn is_invalid(&self, index: usize) -> bool {
198        self.as_ref().is_invalid(index)
199    }
200
201    fn all_valid(&self) -> bool {
202        self.as_ref().all_valid()
203    }
204
205    fn all_invalid(&self) -> bool {
206        self.as_ref().all_invalid()
207    }
208
209    fn valid_count(&self) -> usize {
210        self.as_ref().valid_count()
211    }
212
213    fn invalid_count(&self) -> usize {
214        self.as_ref().invalid_count()
215    }
216
217    fn validity_mask(&self) -> Mask {
218        self.as_ref().validity_mask()
219    }
220
221    fn to_canonical(&self) -> VortexResult<Canonical> {
222        self.as_ref().to_canonical()
223    }
224
225    fn append_to_builder(&self, builder: &mut dyn ArrayBuilder) -> VortexResult<()> {
226        self.as_ref().append_to_builder(builder)
227    }
228
229    fn statistics(&self) -> StatsSetRef<'_> {
230        self.as_ref().statistics()
231    }
232
233    fn with_children(&self, children: &[ArrayRef]) -> VortexResult<ArrayRef> {
234        self.as_ref().with_children(children)
235    }
236
237    fn invoke(
238        &self,
239        compute_fn: &ComputeFn,
240        args: &InvocationArgs,
241    ) -> VortexResult<Option<Output>> {
242        self.as_ref().invoke(compute_fn, args)
243    }
244
245    fn to_operator(&self) -> VortexResult<Option<OperatorRef>> {
246        self.as_ref().to_operator()
247    }
248}
249
250/// A reference counted pointer to a dynamic [`Array`] trait object.
251pub type ArrayRef = Arc<dyn Array>;
252
253impl ToOwned for dyn Array {
254    type Owned = ArrayRef;
255
256    fn to_owned(&self) -> Self::Owned {
257        self.to_array()
258    }
259}
260
261impl dyn Array + '_ {
262    /// Returns the array downcast to the given `A`.
263    pub fn as_<V: VTable>(&self) -> &V::Array {
264        self.as_opt::<V>().vortex_expect("Failed to downcast")
265    }
266
267    /// Returns the array downcast to the given `A`.
268    pub fn as_opt<V: VTable>(&self) -> Option<&V::Array> {
269        self.as_any()
270            .downcast_ref::<ArrayAdapter<V>>()
271            .map(|array_adapter| &array_adapter.0)
272    }
273
274    /// Is self an array with encoding from vtable `V`.
275    pub fn is<V: VTable>(&self) -> bool {
276        self.as_opt::<V>().is_some()
277    }
278
279    pub fn is_constant(&self) -> bool {
280        let opts = IsConstantOpts {
281            cost: Cost::Specialized,
282        };
283        is_constant_opts(self, &opts)
284            .inspect_err(|e| log::warn!("Failed to compute IsConstant: {e}"))
285            .ok()
286            .flatten()
287            .unwrap_or_default()
288    }
289
290    pub fn is_constant_opts(&self, cost: Cost) -> bool {
291        let opts = IsConstantOpts { cost };
292        is_constant_opts(self, &opts)
293            .inspect_err(|e| log::warn!("Failed to compute IsConstant: {e}"))
294            .ok()
295            .flatten()
296            .unwrap_or_default()
297    }
298
299    pub fn as_constant(&self) -> Option<Scalar> {
300        self.is_constant().then(|| self.scalar_at(0))
301    }
302
303    /// Total size of the array in bytes, including all children and buffers.
304    pub fn nbytes(&self) -> u64 {
305        let mut nbytes = 0;
306        for array in self.depth_first_traversal() {
307            for buffer in array.buffers() {
308                nbytes += buffer.len() as u64;
309            }
310        }
311        nbytes
312    }
313}
314
315/// Trait for converting a type into a Vortex [`ArrayRef`].
316pub trait IntoArray {
317    fn into_array(self) -> ArrayRef;
318}
319
320impl IntoArray for ArrayRef {
321    fn into_array(self) -> ArrayRef {
322        self
323    }
324}
325
326mod private {
327    use super::*;
328
329    pub trait Sealed {}
330
331    impl<V: VTable> Sealed for ArrayAdapter<V> {}
332    impl Sealed for Arc<dyn Array> {}
333}
334
335/// Adapter struct used to lift the [`VTable`] trait into an object-safe [`Array`]
336/// implementation.
337///
338/// Since this is a unit struct with `repr(transparent)`, we are able to turn un-adapted array
339/// structs into [`dyn Array`] using some cheeky casting inside [`std::ops::Deref`] and
340/// [`AsRef`]. See the `vtable!` macro for more details.
341#[repr(transparent)]
342pub struct ArrayAdapter<V: VTable>(V::Array);
343
344impl<V: VTable> ArrayAdapter<V> {
345    /// Provide a reference to the underlying array held within the adapter.
346    pub fn as_inner(&self) -> &V::Array {
347        &self.0
348    }
349
350    /// Unwrap into the inner array type, consuming the adapter.
351    pub fn into_inner(self) -> V::Array {
352        self.0
353    }
354}
355
356impl<V: VTable> Debug for ArrayAdapter<V> {
357    fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result {
358        self.0.fmt(f)
359    }
360}
361
362impl<V: VTable> Array for ArrayAdapter<V> {
363    fn as_any(&self) -> &dyn Any {
364        self
365    }
366
367    fn to_array(&self) -> ArrayRef {
368        Arc::new(ArrayAdapter::<V>(self.0.clone()))
369    }
370
371    fn len(&self) -> usize {
372        <V::ArrayVTable as ArrayVTable<V>>::len(&self.0)
373    }
374
375    fn dtype(&self) -> &DType {
376        <V::ArrayVTable as ArrayVTable<V>>::dtype(&self.0)
377    }
378
379    fn encoding(&self) -> EncodingRef {
380        V::encoding(&self.0)
381    }
382
383    fn encoding_id(&self) -> EncodingId {
384        V::encoding(&self.0).id()
385    }
386
387    fn slice(&self, range: Range<usize>) -> ArrayRef {
388        let start = range.start;
389        let stop = range.end;
390
391        if start == 0 && stop == self.len() {
392            return self.to_array();
393        }
394
395        assert!(
396            start <= self.len(),
397            "OutOfBounds: start {start} > length {}",
398            self.len()
399        );
400        assert!(
401            stop <= self.len(),
402            "OutOfBounds: stop {stop} > length {}",
403            self.len()
404        );
405
406        assert!(start <= stop, "start ({start}) must be <= stop ({stop})");
407
408        if start == stop {
409            return Canonical::empty(self.dtype()).into_array();
410        }
411
412        let sliced = <V::OperationsVTable as OperationsVTable<V>>::slice(&self.0, range);
413
414        assert_eq!(
415            sliced.len(),
416            stop - start,
417            "Slice length mismatch {}",
418            self.encoding_id()
419        );
420
421        // Slightly more expensive, so only do this in debug builds.
422        debug_assert_eq!(
423            sliced.dtype(),
424            self.dtype(),
425            "Slice dtype mismatch {}",
426            self.encoding_id()
427        );
428
429        // Propagate some stats from the original array to the sliced array.
430        if !sliced.is::<ConstantVTable>() {
431            self.statistics().with_iter(|iter| {
432                sliced.statistics().inherit(iter.filter(|(stat, value)| {
433                    matches!(
434                        stat,
435                        Stat::IsConstant | Stat::IsSorted | Stat::IsStrictSorted
436                    ) && value.as_ref().as_exact().is_some_and(|v| {
437                        Scalar::new(DType::Bool(Nullability::NonNullable), v.clone())
438                            .as_bool()
439                            .value()
440                            .unwrap_or_default()
441                    })
442                }));
443            });
444        }
445
446        sliced
447    }
448
449    fn scalar_at(&self, index: usize) -> Scalar {
450        assert!(index < self.len(), "index {index} out of bounds");
451        if self.is_invalid(index) {
452            return Scalar::null(self.dtype().clone());
453        }
454        let scalar = <V::OperationsVTable as OperationsVTable<V>>::scalar_at(&self.0, index);
455        assert_eq!(self.dtype(), scalar.dtype(), "Scalar dtype mismatch");
456        scalar
457    }
458
459    fn is_valid(&self, index: usize) -> bool {
460        if index >= self.len() {
461            vortex_panic!(OutOfBounds: index, 0, self.len());
462        }
463        <V::ValidityVTable as ValidityVTable<V>>::is_valid(&self.0, index)
464    }
465
466    fn is_invalid(&self, index: usize) -> bool {
467        !self.is_valid(index)
468    }
469
470    fn all_valid(&self) -> bool {
471        <V::ValidityVTable as ValidityVTable<V>>::all_valid(&self.0)
472    }
473
474    fn all_invalid(&self) -> bool {
475        <V::ValidityVTable as ValidityVTable<V>>::all_invalid(&self.0)
476    }
477
478    fn valid_count(&self) -> usize {
479        if let Some(Precision::Exact(invalid_count)) =
480            self.statistics().get_as::<usize>(Stat::NullCount)
481        {
482            return self.len() - invalid_count;
483        }
484
485        let count = <V::ValidityVTable as ValidityVTable<V>>::valid_count(&self.0);
486        assert!(count <= self.len(), "Valid count exceeds array length");
487
488        self.statistics()
489            .set(Stat::NullCount, Precision::exact(self.len() - count));
490
491        count
492    }
493
494    fn invalid_count(&self) -> usize {
495        if let Some(Precision::Exact(invalid_count)) =
496            self.statistics().get_as::<usize>(Stat::NullCount)
497        {
498            return invalid_count;
499        }
500
501        let count = <V::ValidityVTable as ValidityVTable<V>>::invalid_count(&self.0);
502        assert!(count <= self.len(), "Invalid count exceeds array length");
503
504        self.statistics()
505            .set(Stat::NullCount, Precision::exact(count));
506
507        count
508    }
509
510    fn validity_mask(&self) -> Mask {
511        let mask = <V::ValidityVTable as ValidityVTable<V>>::validity_mask(&self.0);
512        assert_eq!(mask.len(), self.len(), "Validity mask length mismatch");
513        mask
514    }
515
516    fn to_canonical(&self) -> VortexResult<Canonical> {
517        let canonical = <V::CanonicalVTable as CanonicalVTable<V>>::canonicalize(&self.0)?;
518        assert_eq!(
519            self.len(),
520            canonical.as_ref().len(),
521            "Canonical length mismatch {}. Expected {} but encoded into {}.",
522            self.encoding_id(),
523            self.len(),
524            canonical.as_ref().len()
525        );
526        assert_eq!(
527            self.dtype(),
528            canonical.as_ref().dtype(),
529            "Canonical dtype mismatch {}. Expected {} but encoded into {}.",
530            self.encoding_id(),
531            self.dtype(),
532            canonical.as_ref().dtype()
533        );
534        canonical
535            .as_ref()
536            .statistics()
537            .inherit_from(self.statistics());
538        Ok(canonical)
539    }
540
541    fn append_to_builder(&self, builder: &mut dyn ArrayBuilder) -> VortexResult<()> {
542        if builder.dtype() != self.dtype() {
543            vortex_bail!(
544                "Builder dtype mismatch: expected {}, got {}",
545                self.dtype(),
546                builder.dtype(),
547            );
548        }
549        let len = builder.len();
550
551        <V::CanonicalVTable as CanonicalVTable<V>>::append_to_builder(&self.0, builder)?;
552        assert_eq!(
553            len + self.len(),
554            builder.len(),
555            "Builder length mismatch after writing array for encoding {}",
556            self.encoding_id(),
557        );
558        Ok(())
559    }
560
561    fn statistics(&self) -> StatsSetRef<'_> {
562        <V::ArrayVTable as ArrayVTable<V>>::stats(&self.0)
563    }
564
565    fn with_children(&self, children: &[ArrayRef]) -> VortexResult<ArrayRef> {
566        struct ReplacementChildren<'a> {
567            children: &'a [ArrayRef],
568        }
569
570        impl ArrayChildren for ReplacementChildren<'_> {
571            fn get(&self, index: usize, dtype: &DType, len: usize) -> VortexResult<ArrayRef> {
572                if index >= self.children.len() {
573                    vortex_bail!(OutOfBounds: index, 0, self.children.len());
574                }
575                let child = &self.children[index];
576                if child.len() != len {
577                    vortex_bail!(
578                        "Child length mismatch: expected {}, got {}",
579                        len,
580                        child.len()
581                    );
582                }
583                if child.dtype() != dtype {
584                    vortex_bail!(
585                        "Child dtype mismatch: expected {}, got {}",
586                        dtype,
587                        child.dtype()
588                    );
589                }
590                Ok(child.clone())
591            }
592
593            fn len(&self) -> usize {
594                self.children.len()
595            }
596        }
597
598        let metadata = self.metadata()?.ok_or_else(|| {
599            vortex_err!("Cannot replace children for arrays that do not support serialization")
600        })?;
601
602        // Replace the children of the array by re-building the array from parts.
603        self.encoding().build(
604            self.dtype(),
605            self.len(),
606            &metadata,
607            &self.buffers(),
608            &ReplacementChildren { children },
609        )
610    }
611
612    fn invoke(
613        &self,
614        compute_fn: &ComputeFn,
615        args: &InvocationArgs,
616    ) -> VortexResult<Option<Output>> {
617        <V::ComputeVTable as ComputeVTable<V>>::invoke(&self.0, compute_fn, args)
618    }
619
620    fn to_operator(&self) -> VortexResult<Option<OperatorRef>> {
621        <V::PipelineVTable as PipelineVTable<V>>::to_operator(&self.0)
622    }
623}
624
625impl<V: VTable> ArrayVisitor for ArrayAdapter<V> {
626    fn children(&self) -> Vec<ArrayRef> {
627        struct ChildrenCollector {
628            children: Vec<ArrayRef>,
629        }
630
631        impl ArrayChildVisitor for ChildrenCollector {
632            fn visit_child(&mut self, _name: &str, array: &dyn Array) {
633                self.children.push(array.to_array());
634            }
635        }
636
637        let mut collector = ChildrenCollector {
638            children: Vec::new(),
639        };
640        <V::VisitorVTable as VisitorVTable<V>>::visit_children(&self.0, &mut collector);
641        collector.children
642    }
643
644    fn nchildren(&self) -> usize {
645        <V::VisitorVTable as VisitorVTable<V>>::nchildren(&self.0)
646    }
647
648    fn children_names(&self) -> Vec<String> {
649        struct ChildNameCollector {
650            names: Vec<String>,
651        }
652
653        impl ArrayChildVisitor for ChildNameCollector {
654            fn visit_child(&mut self, name: &str, _array: &dyn Array) {
655                self.names.push(name.to_string());
656            }
657        }
658
659        let mut collector = ChildNameCollector { names: Vec::new() };
660        <V::VisitorVTable as VisitorVTable<V>>::visit_children(&self.0, &mut collector);
661        collector.names
662    }
663
664    fn named_children(&self) -> Vec<(String, ArrayRef)> {
665        struct NamedChildrenCollector {
666            children: Vec<(String, ArrayRef)>,
667        }
668
669        impl ArrayChildVisitor for NamedChildrenCollector {
670            fn visit_child(&mut self, name: &str, array: &dyn Array) {
671                self.children.push((name.to_string(), array.to_array()));
672            }
673        }
674
675        let mut collector = NamedChildrenCollector {
676            children: Vec::new(),
677        };
678
679        <V::VisitorVTable as VisitorVTable<V>>::visit_children(&self.0, &mut collector);
680        collector.children
681    }
682
683    fn buffers(&self) -> Vec<ByteBuffer> {
684        struct BufferCollector {
685            buffers: Vec<ByteBuffer>,
686        }
687
688        impl ArrayBufferVisitor for BufferCollector {
689            fn visit_buffer(&mut self, buffer: &ByteBuffer) {
690                self.buffers.push(buffer.clone());
691            }
692        }
693
694        let mut collector = BufferCollector {
695            buffers: Vec::new(),
696        };
697        <V::VisitorVTable as VisitorVTable<V>>::visit_buffers(&self.0, &mut collector);
698        collector.buffers
699    }
700
701    fn nbuffers(&self) -> usize {
702        <V::VisitorVTable as VisitorVTable<V>>::nbuffers(&self.0)
703    }
704
705    fn metadata(&self) -> VortexResult<Option<Vec<u8>>> {
706        Ok(<V::SerdeVTable as SerdeVTable<V>>::metadata(&self.0)?.map(|m| m.serialize()))
707    }
708
709    fn metadata_fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result {
710        match <V::SerdeVTable as SerdeVTable<V>>::metadata(&self.0) {
711            Err(e) => write!(f, "<serde error: {e}>"),
712            Ok(None) => write!(f, "<serde not supported>"),
713            Ok(Some(metadata)) => Debug::fmt(&metadata, f),
714        }
715    }
716}