vortex_array/array/
mod.rs

1// SPDX-License-Identifier: Apache-2.0
2// SPDX-FileCopyrightText: Copyright the Vortex contributors
3
4pub mod display;
5mod visitor;
6
7use std::any::Any;
8use std::fmt::{Debug, Formatter};
9use std::sync::Arc;
10
11pub use visitor::*;
12use vortex_buffer::ByteBuffer;
13use vortex_dtype::{DType, Nullability};
14use vortex_error::{VortexExpect, VortexResult, vortex_bail, vortex_err};
15use vortex_mask::Mask;
16use vortex_scalar::Scalar;
17
18use crate::arrays::{
19    BoolEncoding, ConstantVTable, DecimalEncoding, ExtensionEncoding, ListEncoding, NullEncoding,
20    PrimitiveEncoding, StructEncoding, VarBinEncoding, VarBinViewEncoding,
21};
22use crate::builders::ArrayBuilder;
23use crate::compute::{ComputeFn, Cost, InvocationArgs, IsConstantOpts, Output, is_constant_opts};
24use crate::serde::ArrayChildren;
25use crate::stats::{Precision, Stat, StatsProviderExt, StatsSetRef};
26use crate::vtable::{
27    ArrayVTable, CanonicalVTable, ComputeVTable, OperationsVTable, SerdeVTable, VTable,
28    ValidityVTable, VisitorVTable,
29};
30use crate::{Canonical, EncodingId, EncodingRef, SerializeMetadata};
31
32/// The public API trait for all Vortex arrays.
33pub trait Array: 'static + private::Sealed + Send + Sync + Debug + ArrayVisitor {
34    /// Returns the array as a reference to a generic [`Any`] trait object.
35    fn as_any(&self) -> &dyn Any;
36
37    /// Returns the array as an [`ArrayRef`].
38    fn to_array(&self) -> ArrayRef;
39
40    /// Returns the length of the array.
41    fn len(&self) -> usize;
42
43    /// Returns whether the array is empty (has zero rows).
44    fn is_empty(&self) -> bool {
45        self.len() == 0
46    }
47
48    /// Returns the logical Vortex [`DType`] of the array.
49    fn dtype(&self) -> &DType;
50
51    /// Returns the encoding of the array.
52    fn encoding(&self) -> EncodingRef;
53
54    /// Returns the encoding ID of the array.
55    fn encoding_id(&self) -> EncodingId;
56
57    /// Performs a constant-time slice of the array.
58    fn slice(&self, start: usize, end: usize) -> ArrayRef;
59
60    /// Fetch the scalar at the given index.
61    ///
62    /// This method panics if the index is out of bounds for the array.
63    fn scalar_at(&self, index: usize) -> Scalar;
64
65    /// Returns whether the array is of the given encoding.
66    fn is_encoding(&self, encoding: EncodingId) -> bool {
67        self.encoding_id() == encoding
68    }
69
70    /// Returns whether this array is an arrow encoding.
71    // TODO(ngates): this shouldn't live here.
72    fn is_arrow(&self) -> bool {
73        self.is_encoding(NullEncoding.id())
74            || self.is_encoding(BoolEncoding.id())
75            || self.is_encoding(PrimitiveEncoding.id())
76            || self.is_encoding(VarBinEncoding.id())
77            || self.is_encoding(VarBinViewEncoding.id())
78    }
79
80    /// Whether the array is of a canonical encoding.
81    // TODO(ngates): this shouldn't live here.
82    fn is_canonical(&self) -> bool {
83        self.is_encoding(NullEncoding.id())
84            || self.is_encoding(BoolEncoding.id())
85            || self.is_encoding(PrimitiveEncoding.id())
86            || self.is_encoding(DecimalEncoding.id())
87            || self.is_encoding(StructEncoding.id())
88            || self.is_encoding(ListEncoding.id())
89            || self.is_encoding(VarBinViewEncoding.id())
90            || self.is_encoding(ExtensionEncoding.id())
91    }
92
93    /// Returns whether the item at `index` is valid.
94    fn is_valid(&self, index: usize) -> VortexResult<bool>;
95
96    /// Returns whether the item at `index` is invalid.
97    fn is_invalid(&self, index: usize) -> VortexResult<bool>;
98
99    /// Returns whether all items in the array are valid.
100    ///
101    /// This is usually cheaper than computing a precise `valid_count`.
102    fn all_valid(&self) -> VortexResult<bool>;
103
104    /// Returns whether the array is all invalid.
105    ///
106    /// This is usually cheaper than computing a precise `invalid_count`.
107    fn all_invalid(&self) -> VortexResult<bool>;
108
109    /// Returns the number of valid elements in the array.
110    fn valid_count(&self) -> VortexResult<usize>;
111
112    /// Returns the number of invalid elements in the array.
113    fn invalid_count(&self) -> VortexResult<usize>;
114
115    /// Returns the canonical validity mask for the array.
116    fn validity_mask(&self) -> VortexResult<Mask>;
117
118    /// Returns the canonical representation of the array.
119    fn to_canonical(&self) -> VortexResult<Canonical>;
120
121    /// Writes the array into the canonical builder.
122    ///
123    /// The [`DType`] of the builder must match that of the array.
124    fn append_to_builder(&self, builder: &mut dyn ArrayBuilder) -> VortexResult<()>;
125
126    /// Returns the statistics of the array.
127    // TODO(ngates): change how this works. It's weird.
128    fn statistics(&self) -> StatsSetRef<'_>;
129
130    /// Replaces the children of the array with the given array references.
131    fn with_children(&self, children: &[ArrayRef]) -> VortexResult<ArrayRef>;
132
133    /// Optionally invoke a kernel for the given compute function.
134    ///
135    /// These encoding-specific kernels are independent of kernels registered directly with
136    /// compute functions using [`ComputeFn::register_kernel`], and are attempted only if none of
137    /// the function-specific kernels returns a result.
138    ///
139    /// This allows encodings the opportunity to generically implement many compute functions
140    /// that share some property, for example [`ComputeFn::is_elementwise`], without prior
141    /// knowledge of the function itself, while still allowing users to override the implementation
142    /// of compute functions for built-in encodings. For an example, see the implementation for
143    /// chunked arrays.
144    ///
145    /// The first input in the [`InvocationArgs`] is always the array itself.
146    ///
147    /// Warning: do not call `compute_fn.invoke(args)` directly, as this will result in a recursive
148    /// call.
149    fn invoke(&self, compute_fn: &ComputeFn, args: &InvocationArgs)
150    -> VortexResult<Option<Output>>;
151}
152
153impl Array for Arc<dyn Array> {
154    fn as_any(&self) -> &dyn Any {
155        self.as_ref().as_any()
156    }
157
158    fn to_array(&self) -> ArrayRef {
159        self.clone()
160    }
161
162    fn len(&self) -> usize {
163        self.as_ref().len()
164    }
165
166    fn dtype(&self) -> &DType {
167        self.as_ref().dtype()
168    }
169
170    fn encoding(&self) -> EncodingRef {
171        self.as_ref().encoding()
172    }
173
174    fn encoding_id(&self) -> EncodingId {
175        self.as_ref().encoding_id()
176    }
177
178    fn slice(&self, start: usize, end: usize) -> ArrayRef {
179        self.as_ref().slice(start, end)
180    }
181
182    fn scalar_at(&self, index: usize) -> Scalar {
183        self.as_ref().scalar_at(index)
184    }
185
186    fn is_valid(&self, index: usize) -> VortexResult<bool> {
187        self.as_ref().is_valid(index)
188    }
189
190    fn is_invalid(&self, index: usize) -> VortexResult<bool> {
191        self.as_ref().is_invalid(index)
192    }
193
194    fn all_valid(&self) -> VortexResult<bool> {
195        self.as_ref().all_valid()
196    }
197
198    fn all_invalid(&self) -> VortexResult<bool> {
199        self.as_ref().all_invalid()
200    }
201
202    fn valid_count(&self) -> VortexResult<usize> {
203        self.as_ref().valid_count()
204    }
205
206    fn invalid_count(&self) -> VortexResult<usize> {
207        self.as_ref().invalid_count()
208    }
209
210    fn validity_mask(&self) -> VortexResult<Mask> {
211        self.as_ref().validity_mask()
212    }
213
214    fn to_canonical(&self) -> VortexResult<Canonical> {
215        self.as_ref().to_canonical()
216    }
217
218    fn append_to_builder(&self, builder: &mut dyn ArrayBuilder) -> VortexResult<()> {
219        self.as_ref().append_to_builder(builder)
220    }
221
222    fn statistics(&self) -> StatsSetRef<'_> {
223        self.as_ref().statistics()
224    }
225
226    fn with_children(&self, children: &[ArrayRef]) -> VortexResult<ArrayRef> {
227        self.as_ref().with_children(children)
228    }
229
230    fn invoke(
231        &self,
232        compute_fn: &ComputeFn,
233        args: &InvocationArgs,
234    ) -> VortexResult<Option<Output>> {
235        self.as_ref().invoke(compute_fn, args)
236    }
237}
238
239/// A reference counted pointer to a dynamic [`Array`] trait object.
240pub type ArrayRef = Arc<dyn Array>;
241
242impl ToOwned for dyn Array {
243    type Owned = ArrayRef;
244
245    fn to_owned(&self) -> Self::Owned {
246        self.to_array()
247    }
248}
249
250impl dyn Array + '_ {
251    /// Returns the array downcast to the given `A`.
252    pub fn as_<V: VTable>(&self) -> &V::Array {
253        self.as_opt::<V>().vortex_expect("Failed to downcast")
254    }
255
256    /// Returns the array downcast to the given `A`.
257    pub fn as_opt<V: VTable>(&self) -> Option<&V::Array> {
258        self.as_any()
259            .downcast_ref::<ArrayAdapter<V>>()
260            .map(|array_adapter| &array_adapter.0)
261    }
262
263    /// Is self an array with encoding from vtable `V`.
264    pub fn is<V: VTable>(&self) -> bool {
265        self.as_opt::<V>().is_some()
266    }
267
268    pub fn is_constant(&self) -> bool {
269        let opts = IsConstantOpts {
270            cost: Cost::Specialized,
271        };
272        is_constant_opts(self, &opts)
273            .inspect_err(|e| log::warn!("Failed to compute IsConstant: {e}"))
274            .ok()
275            .flatten()
276            .unwrap_or_default()
277    }
278
279    pub fn is_constant_opts(&self, cost: Cost) -> bool {
280        let opts = IsConstantOpts { cost };
281        is_constant_opts(self, &opts)
282            .inspect_err(|e| log::warn!("Failed to compute IsConstant: {e}"))
283            .ok()
284            .flatten()
285            .unwrap_or_default()
286    }
287
288    pub fn as_constant(&self) -> Option<Scalar> {
289        self.is_constant().then(|| self.scalar_at(0))
290    }
291
292    /// Total size of the array in bytes, including all children and buffers.
293    pub fn nbytes(&self) -> u64 {
294        let mut nbytes = 0;
295        for array in self.depth_first_traversal() {
296            for buffer in array.buffers() {
297                nbytes += buffer.len() as u64;
298            }
299        }
300        nbytes
301    }
302}
303
304/// Trait for converting a type into a Vortex [`ArrayRef`].
305pub trait IntoArray {
306    fn into_array(self) -> ArrayRef;
307}
308
309impl IntoArray for ArrayRef {
310    fn into_array(self) -> ArrayRef {
311        self
312    }
313}
314
315mod private {
316    use super::*;
317
318    pub trait Sealed {}
319
320    impl<V: VTable> Sealed for ArrayAdapter<V> {}
321    impl Sealed for Arc<dyn Array> {}
322}
323
324/// Adapter struct used to lift the [`VTable`] trait into an object-safe [`Array`]
325/// implementation.
326///
327/// Since this is a unit struct with `repr(transparent)`, we are able to turn un-adapted array
328/// structs into [`dyn Array`] using some cheeky casting inside [`std::ops::Deref`] and
329/// [`AsRef`]. See the `vtable!` macro for more details.
330#[repr(transparent)]
331pub struct ArrayAdapter<V: VTable>(V::Array);
332
333impl<V: VTable> ArrayAdapter<V> {
334    /// Provide a reference to the underlying array held within the adapter.
335    pub fn as_inner(&self) -> &V::Array {
336        &self.0
337    }
338
339    /// Unwrap into the inner array type, consuming the adapter.
340    pub fn into_inner(self) -> V::Array {
341        self.0
342    }
343}
344
345impl<V: VTable> Debug for ArrayAdapter<V> {
346    fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result {
347        self.0.fmt(f)
348    }
349}
350
351impl<V: VTable> Array for ArrayAdapter<V> {
352    fn as_any(&self) -> &dyn Any {
353        self
354    }
355
356    fn to_array(&self) -> ArrayRef {
357        Arc::new(ArrayAdapter::<V>(self.0.clone()))
358    }
359
360    fn len(&self) -> usize {
361        <V::ArrayVTable as ArrayVTable<V>>::len(&self.0)
362    }
363
364    fn dtype(&self) -> &DType {
365        <V::ArrayVTable as ArrayVTable<V>>::dtype(&self.0)
366    }
367
368    fn encoding(&self) -> EncodingRef {
369        V::encoding(&self.0)
370    }
371
372    fn encoding_id(&self) -> EncodingId {
373        V::encoding(&self.0).id()
374    }
375
376    fn slice(&self, start: usize, stop: usize) -> ArrayRef {
377        if start == 0 && stop == self.len() {
378            return self.to_array();
379        }
380
381        assert!(
382            start <= self.len(),
383            "OutOfBounds: start {start} > length {}",
384            self.len()
385        );
386        assert!(
387            stop <= self.len(),
388            "OutOfBounds: stop {stop} > length {}",
389            self.len()
390        );
391
392        assert!(start <= stop, "start ({start}) must be <= stop ({stop})");
393
394        if start == stop {
395            return Canonical::empty(self.dtype()).into_array();
396        }
397
398        let sliced = <V::OperationsVTable as OperationsVTable<V>>::slice(&self.0, start, stop);
399
400        assert_eq!(
401            sliced.len(),
402            stop - start,
403            "Slice length mismatch {}",
404            self.encoding_id()
405        );
406
407        // Slightly more expensive, so only do this in debug builds.
408        debug_assert_eq!(
409            sliced.dtype(),
410            self.dtype(),
411            "Slice dtype mismatch {}",
412            self.encoding_id()
413        );
414
415        // Propagate some stats from the original array to the sliced array.
416        if !sliced.is::<ConstantVTable>() {
417            self.statistics().with_iter(|iter| {
418                sliced.statistics().inherit(iter.filter(|(stat, value)| {
419                    matches!(
420                        stat,
421                        Stat::IsConstant | Stat::IsSorted | Stat::IsStrictSorted
422                    ) && value.as_ref().as_exact().is_some_and(|v| {
423                        Scalar::new(DType::Bool(Nullability::NonNullable), v.clone())
424                            .as_bool()
425                            .value()
426                            .unwrap_or_default()
427                    })
428                }));
429            });
430        }
431
432        sliced
433    }
434
435    fn scalar_at(&self, index: usize) -> Scalar {
436        assert!(index < self.len(), "index {index} out of bounds");
437        if self.is_invalid(index).vortex_expect("index out of bounds") {
438            return Scalar::null(self.dtype().clone());
439        }
440        let scalar = <V::OperationsVTable as OperationsVTable<V>>::scalar_at(&self.0, index);
441        assert_eq!(self.dtype(), scalar.dtype(), "Scalar dtype mismatch");
442        scalar
443    }
444
445    fn is_valid(&self, index: usize) -> VortexResult<bool> {
446        if index >= self.len() {
447            vortex_bail!(OutOfBounds: index, 0, self.len());
448        }
449        <V::ValidityVTable as ValidityVTable<V>>::is_valid(&self.0, index)
450    }
451
452    fn is_invalid(&self, index: usize) -> VortexResult<bool> {
453        self.is_valid(index).map(|valid| !valid)
454    }
455
456    fn all_valid(&self) -> VortexResult<bool> {
457        <V::ValidityVTable as ValidityVTable<V>>::all_valid(&self.0)
458    }
459
460    fn all_invalid(&self) -> VortexResult<bool> {
461        <V::ValidityVTable as ValidityVTable<V>>::all_invalid(&self.0)
462    }
463
464    fn valid_count(&self) -> VortexResult<usize> {
465        if let Some(Precision::Exact(invalid_count)) =
466            self.statistics().get_as::<usize>(Stat::NullCount)
467        {
468            return Ok(self.len() - invalid_count);
469        }
470
471        let count = <V::ValidityVTable as ValidityVTable<V>>::valid_count(&self.0)?;
472        assert!(count <= self.len(), "Valid count exceeds array length");
473
474        self.statistics()
475            .set(Stat::NullCount, Precision::exact(self.len() - count));
476
477        Ok(count)
478    }
479
480    fn invalid_count(&self) -> VortexResult<usize> {
481        if let Some(Precision::Exact(invalid_count)) =
482            self.statistics().get_as::<usize>(Stat::NullCount)
483        {
484            return Ok(invalid_count);
485        }
486
487        let count = <V::ValidityVTable as ValidityVTable<V>>::invalid_count(&self.0)?;
488        assert!(count <= self.len(), "Invalid count exceeds array length");
489
490        self.statistics()
491            .set(Stat::NullCount, Precision::exact(count));
492
493        Ok(count)
494    }
495
496    fn validity_mask(&self) -> VortexResult<Mask> {
497        let mask = <V::ValidityVTable as ValidityVTable<V>>::validity_mask(&self.0)?;
498        assert_eq!(mask.len(), self.len(), "Validity mask length mismatch");
499        Ok(mask)
500    }
501
502    fn to_canonical(&self) -> VortexResult<Canonical> {
503        let canonical = <V::CanonicalVTable as CanonicalVTable<V>>::canonicalize(&self.0)?;
504        assert_eq!(
505            self.len(),
506            canonical.as_ref().len(),
507            "Canonical length mismatch {}. Expected {} but encoded into {}.",
508            self.encoding_id(),
509            self.len(),
510            canonical.as_ref().len()
511        );
512        assert_eq!(
513            self.dtype(),
514            canonical.as_ref().dtype(),
515            "Canonical dtype mismatch {}. Expected {} but encoded into {}.",
516            self.encoding_id(),
517            self.dtype(),
518            canonical.as_ref().dtype()
519        );
520        canonical
521            .as_ref()
522            .statistics()
523            .inherit_from(self.statistics());
524        Ok(canonical)
525    }
526
527    fn append_to_builder(&self, builder: &mut dyn ArrayBuilder) -> VortexResult<()> {
528        if builder.dtype() != self.dtype() {
529            vortex_bail!(
530                "Builder dtype mismatch: expected {}, got {}",
531                self.dtype(),
532                builder.dtype(),
533            );
534        }
535        let len = builder.len();
536
537        <V::CanonicalVTable as CanonicalVTable<V>>::append_to_builder(&self.0, builder)?;
538        assert_eq!(
539            len + self.len(),
540            builder.len(),
541            "Builder length mismatch after writing array for encoding {}",
542            self.encoding_id(),
543        );
544        Ok(())
545    }
546
547    fn statistics(&self) -> StatsSetRef<'_> {
548        <V::ArrayVTable as ArrayVTable<V>>::stats(&self.0)
549    }
550
551    fn with_children(&self, children: &[ArrayRef]) -> VortexResult<ArrayRef> {
552        struct ReplacementChildren<'a> {
553            children: &'a [ArrayRef],
554        }
555
556        impl ArrayChildren for ReplacementChildren<'_> {
557            fn get(&self, index: usize, dtype: &DType, len: usize) -> VortexResult<ArrayRef> {
558                if index >= self.children.len() {
559                    vortex_bail!(OutOfBounds: index, 0, self.children.len());
560                }
561                let child = &self.children[index];
562                if child.len() != len {
563                    vortex_bail!(
564                        "Child length mismatch: expected {}, got {}",
565                        len,
566                        child.len()
567                    );
568                }
569                if child.dtype() != dtype {
570                    vortex_bail!(
571                        "Child dtype mismatch: expected {}, got {}",
572                        dtype,
573                        child.dtype()
574                    );
575                }
576                Ok(child.clone())
577            }
578
579            fn len(&self) -> usize {
580                self.children.len()
581            }
582        }
583
584        let metadata = self.metadata()?.ok_or_else(|| {
585            vortex_err!("Cannot replace children for arrays that do not support serialization")
586        })?;
587
588        // Replace the children of the array by re-building the array from parts.
589        self.encoding().build(
590            self.dtype(),
591            self.len(),
592            &metadata,
593            &self.buffers(),
594            &ReplacementChildren { children },
595        )
596    }
597
598    fn invoke(
599        &self,
600        compute_fn: &ComputeFn,
601        args: &InvocationArgs,
602    ) -> VortexResult<Option<Output>> {
603        <V::ComputeVTable as ComputeVTable<V>>::invoke(&self.0, compute_fn, args)
604    }
605}
606
607impl<V: VTable> ArrayVisitor for ArrayAdapter<V> {
608    fn children(&self) -> Vec<ArrayRef> {
609        struct ChildrenCollector {
610            children: Vec<ArrayRef>,
611        }
612
613        impl ArrayChildVisitor for ChildrenCollector {
614            fn visit_child(&mut self, _name: &str, array: &dyn Array) {
615                self.children.push(array.to_array());
616            }
617        }
618
619        let mut collector = ChildrenCollector {
620            children: Vec::new(),
621        };
622        <V::VisitorVTable as VisitorVTable<V>>::visit_children(&self.0, &mut collector);
623        collector.children
624    }
625
626    fn nchildren(&self) -> usize {
627        <V::VisitorVTable as VisitorVTable<V>>::nchildren(&self.0)
628    }
629
630    fn children_names(&self) -> Vec<String> {
631        struct ChildNameCollector {
632            names: Vec<String>,
633        }
634
635        impl ArrayChildVisitor for ChildNameCollector {
636            fn visit_child(&mut self, name: &str, _array: &dyn Array) {
637                self.names.push(name.to_string());
638            }
639        }
640
641        let mut collector = ChildNameCollector { names: Vec::new() };
642        <V::VisitorVTable as VisitorVTable<V>>::visit_children(&self.0, &mut collector);
643        collector.names
644    }
645
646    fn named_children(&self) -> Vec<(String, ArrayRef)> {
647        struct NamedChildrenCollector {
648            children: Vec<(String, ArrayRef)>,
649        }
650
651        impl ArrayChildVisitor for NamedChildrenCollector {
652            fn visit_child(&mut self, name: &str, array: &dyn Array) {
653                self.children.push((name.to_string(), array.to_array()));
654            }
655        }
656
657        let mut collector = NamedChildrenCollector {
658            children: Vec::new(),
659        };
660
661        <V::VisitorVTable as VisitorVTable<V>>::visit_children(&self.0, &mut collector);
662        collector.children
663    }
664
665    fn buffers(&self) -> Vec<ByteBuffer> {
666        struct BufferCollector {
667            buffers: Vec<ByteBuffer>,
668        }
669
670        impl ArrayBufferVisitor for BufferCollector {
671            fn visit_buffer(&mut self, buffer: &ByteBuffer) {
672                self.buffers.push(buffer.clone());
673            }
674        }
675
676        let mut collector = BufferCollector {
677            buffers: Vec::new(),
678        };
679        <V::VisitorVTable as VisitorVTable<V>>::visit_buffers(&self.0, &mut collector);
680        collector.buffers
681    }
682
683    fn nbuffers(&self) -> usize {
684        <V::VisitorVTable as VisitorVTable<V>>::nbuffers(&self.0)
685    }
686
687    fn metadata(&self) -> VortexResult<Option<Vec<u8>>> {
688        Ok(<V::SerdeVTable as SerdeVTable<V>>::metadata(&self.0)?.map(|m| m.serialize()))
689    }
690
691    fn metadata_fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result {
692        match <V::SerdeVTable as SerdeVTable<V>>::metadata(&self.0) {
693            Err(e) => write!(f, "<serde error: {e}>"),
694            Ok(None) => write!(f, "<serde not supported>"),
695            Ok(Some(metadata)) => Debug::fmt(&metadata, f),
696        }
697    }
698}