vortex_array/array/
mod.rs

1//  SPDX-License-Identifier: Apache-2.0
2//  SPDX-FileCopyrightText: Copyright the Vortex contributors
3
4pub mod display;
5mod visitor;
6
7use std::any::Any;
8use std::fmt::{Debug, Formatter};
9use std::sync::Arc;
10
11pub use visitor::*;
12use vortex_buffer::ByteBuffer;
13use vortex_dtype::{DType, Nullability};
14use vortex_error::{VortexExpect, VortexResult, vortex_bail, vortex_err};
15use vortex_mask::Mask;
16use vortex_scalar::Scalar;
17
18use crate::arrays::{
19    BoolEncoding, ConstantVTable, DecimalEncoding, ExtensionEncoding, ListEncoding, NullEncoding,
20    PrimitiveEncoding, StructEncoding, VarBinEncoding, VarBinViewEncoding,
21};
22use crate::builders::ArrayBuilder;
23use crate::compute::{ComputeFn, Cost, InvocationArgs, IsConstantOpts, Output, is_constant_opts};
24use crate::serde::ArrayChildren;
25use crate::stats::{Precision, Stat, StatsSetRef};
26use crate::vtable::{
27    ArrayVTable, CanonicalVTable, ComputeVTable, OperationsVTable, SerdeVTable, VTable,
28    ValidityVTable, VisitorVTable,
29};
30use crate::{Canonical, EncodingId, EncodingRef, SerializeMetadata};
31
32/// The public API trait for all Vortex arrays.
33pub trait Array: 'static + private::Sealed + Send + Sync + Debug + ArrayVisitor {
34    /// Returns the array as a reference to a generic [`Any`] trait object.
35    fn as_any(&self) -> &dyn Any;
36
37    /// Returns the array as an [`ArrayRef`].
38    fn to_array(&self) -> ArrayRef;
39
40    /// Returns the length of the array.
41    fn len(&self) -> usize;
42
43    /// Returns whether the array is empty (has zero rows).
44    fn is_empty(&self) -> bool {
45        self.len() == 0
46    }
47
48    /// Returns the logical Vortex [`DType`] of the array.
49    fn dtype(&self) -> &DType;
50
51    /// Returns the encoding of the array.
52    fn encoding(&self) -> EncodingRef;
53
54    /// Returns the encoding ID of the array.
55    fn encoding_id(&self) -> EncodingId;
56
57    /// Performs a constant-time slice of the array.
58    fn slice(&self, start: usize, end: usize) -> VortexResult<ArrayRef>;
59
60    /// Fetch the scalar at the given index.
61    fn scalar_at(&self, index: usize) -> VortexResult<Scalar>;
62
63    /// Returns whether the array is of the given encoding.
64    fn is_encoding(&self, encoding: EncodingId) -> bool {
65        self.encoding_id() == encoding
66    }
67
68    /// Returns whether this array is an arrow encoding.
69    // TODO(ngates): this shouldn't live here.
70    fn is_arrow(&self) -> bool {
71        self.is_encoding(NullEncoding.id())
72            || self.is_encoding(BoolEncoding.id())
73            || self.is_encoding(PrimitiveEncoding.id())
74            || self.is_encoding(VarBinEncoding.id())
75            || self.is_encoding(VarBinViewEncoding.id())
76    }
77
78    /// Whether the array is of a canonical encoding.
79    // TODO(ngates): this shouldn't live here.
80    fn is_canonical(&self) -> bool {
81        self.is_encoding(NullEncoding.id())
82            || self.is_encoding(BoolEncoding.id())
83            || self.is_encoding(PrimitiveEncoding.id())
84            || self.is_encoding(DecimalEncoding.id())
85            || self.is_encoding(StructEncoding.id())
86            || self.is_encoding(ListEncoding.id())
87            || self.is_encoding(VarBinViewEncoding.id())
88            || self.is_encoding(ExtensionEncoding.id())
89    }
90
91    /// Returns whether the item at `index` is valid.
92    fn is_valid(&self, index: usize) -> VortexResult<bool>;
93
94    /// Returns whether the item at `index` is invalid.
95    fn is_invalid(&self, index: usize) -> VortexResult<bool>;
96
97    /// Returns whether all items in the array are valid.
98    ///
99    /// This is usually cheaper than computing a precise `valid_count`.
100    fn all_valid(&self) -> VortexResult<bool>;
101
102    /// Returns whether the array is all invalid.
103    ///
104    /// This is usually cheaper than computing a precise `invalid_count`.
105    fn all_invalid(&self) -> VortexResult<bool>;
106
107    /// Returns the number of valid elements in the array.
108    fn valid_count(&self) -> VortexResult<usize>;
109
110    /// Returns the number of invalid elements in the array.
111    fn invalid_count(&self) -> VortexResult<usize>;
112
113    /// Returns the canonical validity mask for the array.
114    fn validity_mask(&self) -> VortexResult<Mask>;
115
116    /// Returns the canonical representation of the array.
117    fn to_canonical(&self) -> VortexResult<Canonical>;
118
119    /// Writes the array into the canonical builder.
120    ///
121    /// The [`DType`] of the builder must match that of the array.
122    fn append_to_builder(&self, builder: &mut dyn ArrayBuilder) -> VortexResult<()>;
123
124    /// Returns the statistics of the array.
125    // TODO(ngates): change how this works. It's weird.
126    fn statistics(&self) -> StatsSetRef<'_>;
127
128    /// Replaces the children of the array with the given array references.
129    fn with_children(&self, children: &[ArrayRef]) -> VortexResult<ArrayRef>;
130
131    /// Optionally invoke a kernel for the given compute function.
132    ///
133    /// These encoding-specific kernels are independent of kernels registered directly with
134    /// compute functions using [`ComputeFn::register_kernel`], and are attempted only if none of
135    /// the function-specific kernels returns a result.
136    ///
137    /// This allows encodings the opportunity to generically implement many compute functions
138    /// that share some property, for example [`ComputeFn::is_elementwise`], without prior
139    /// knowledge of the function itself, while still allowing users to override the implementation
140    /// of compute functions for built-in encodings. For an example, see the implementation for
141    /// chunked arrays.
142    ///
143    /// The first input in the [`InvocationArgs`] is always the array itself.
144    ///
145    /// Warning: do not call `compute_fn.invoke(args)` directly, as this will result in a recursive
146    /// call.
147    fn invoke(&self, compute_fn: &ComputeFn, args: &InvocationArgs)
148    -> VortexResult<Option<Output>>;
149}
150
151impl Array for Arc<dyn Array> {
152    fn as_any(&self) -> &dyn Any {
153        self.as_ref().as_any()
154    }
155
156    fn to_array(&self) -> ArrayRef {
157        self.clone()
158    }
159
160    fn len(&self) -> usize {
161        self.as_ref().len()
162    }
163
164    fn dtype(&self) -> &DType {
165        self.as_ref().dtype()
166    }
167
168    fn encoding(&self) -> EncodingRef {
169        self.as_ref().encoding()
170    }
171
172    fn encoding_id(&self) -> EncodingId {
173        self.as_ref().encoding_id()
174    }
175
176    fn slice(&self, start: usize, end: usize) -> VortexResult<ArrayRef> {
177        self.as_ref().slice(start, end)
178    }
179
180    fn scalar_at(&self, index: usize) -> VortexResult<Scalar> {
181        self.as_ref().scalar_at(index)
182    }
183
184    fn is_valid(&self, index: usize) -> VortexResult<bool> {
185        self.as_ref().is_valid(index)
186    }
187
188    fn is_invalid(&self, index: usize) -> VortexResult<bool> {
189        self.as_ref().is_invalid(index)
190    }
191
192    fn all_valid(&self) -> VortexResult<bool> {
193        self.as_ref().all_valid()
194    }
195
196    fn all_invalid(&self) -> VortexResult<bool> {
197        self.as_ref().all_invalid()
198    }
199
200    fn valid_count(&self) -> VortexResult<usize> {
201        self.as_ref().valid_count()
202    }
203
204    fn invalid_count(&self) -> VortexResult<usize> {
205        self.as_ref().invalid_count()
206    }
207
208    fn validity_mask(&self) -> VortexResult<Mask> {
209        self.as_ref().validity_mask()
210    }
211
212    fn to_canonical(&self) -> VortexResult<Canonical> {
213        self.as_ref().to_canonical()
214    }
215
216    fn append_to_builder(&self, builder: &mut dyn ArrayBuilder) -> VortexResult<()> {
217        self.as_ref().append_to_builder(builder)
218    }
219
220    fn statistics(&self) -> StatsSetRef<'_> {
221        self.as_ref().statistics()
222    }
223
224    fn with_children(&self, children: &[ArrayRef]) -> VortexResult<ArrayRef> {
225        self.as_ref().with_children(children)
226    }
227
228    fn invoke(
229        &self,
230        compute_fn: &ComputeFn,
231        args: &InvocationArgs,
232    ) -> VortexResult<Option<Output>> {
233        self.as_ref().invoke(compute_fn, args)
234    }
235}
236
/// A reference counted pointer to a dynamic [`Array`] trait object.
///
/// This is an alias for `Arc<dyn Array>`, so cloning an `ArrayRef` clones the [`Arc`] handle
/// rather than the array it points to.
pub type ArrayRef = Arc<dyn Array>;
239
240impl ToOwned for dyn Array {
241    type Owned = ArrayRef;
242
243    fn to_owned(&self) -> Self::Owned {
244        self.to_array()
245    }
246}
247
248impl dyn Array + '_ {
249    /// Returns the array downcast to the given `A`.
250    pub fn as_<V: VTable>(&self) -> &V::Array {
251        self.as_opt::<V>().vortex_expect("Failed to downcast")
252    }
253
254    /// Returns the array downcast to the given `A`.
255    pub fn as_opt<V: VTable>(&self) -> Option<&V::Array> {
256        self.as_any()
257            .downcast_ref::<ArrayAdapter<V>>()
258            .map(|array_adapter| &array_adapter.0)
259    }
260
261    /// Is self an array with encoding from vtable `V`.
262    pub fn is<V: VTable>(&self) -> bool {
263        self.as_opt::<V>().is_some()
264    }
265
266    pub fn is_constant(&self) -> bool {
267        let opts = IsConstantOpts {
268            cost: Cost::Specialized,
269        };
270        is_constant_opts(self, &opts)
271            .inspect_err(|e| log::warn!("Failed to compute IsConstant: {e}"))
272            .ok()
273            .flatten()
274            .unwrap_or_default()
275    }
276
277    pub fn is_constant_opts(&self, cost: Cost) -> bool {
278        let opts = IsConstantOpts { cost };
279        is_constant_opts(self, &opts)
280            .inspect_err(|e| log::warn!("Failed to compute IsConstant: {e}"))
281            .ok()
282            .flatten()
283            .unwrap_or_default()
284    }
285
286    pub fn as_constant(&self) -> Option<Scalar> {
287        self.is_constant().then(|| self.scalar_at(0).ok()).flatten()
288    }
289
290    /// Total size of the array in bytes, including all children and buffers.
291    pub fn nbytes(&self) -> u64 {
292        let mut nbytes = 0;
293        for array in self.depth_first_traversal() {
294            for buffer in array.buffers() {
295                nbytes += buffer.len() as u64;
296            }
297        }
298        nbytes
299    }
300}
301
/// Trait for converting a type into a Vortex [`ArrayRef`].
pub trait IntoArray {
    /// Consumes `self` and returns it as an [`ArrayRef`].
    fn into_array(self) -> ArrayRef;
}
306
307impl IntoArray for ArrayRef {
308    fn into_array(self) -> ArrayRef {
309        self
310    }
311}
312
313mod private {
314    use super::*;
315
316    pub trait Sealed {}
317
318    impl<V: VTable> Sealed for ArrayAdapter<V> {}
319    impl Sealed for Arc<dyn Array> {}
320}
321
/// Adapter struct used to lift the [`VTable`] trait into an object-safe [`Array`]
/// implementation.
///
/// Since this is a single-field tuple struct with `repr(transparent)`, we are able to turn
/// un-adapted array structs into [`dyn Array`] using some cheeky casting inside
/// [`std::ops::Deref`] and [`AsRef`]. See the `vtable!` macro for more details.
#[repr(transparent)]
pub struct ArrayAdapter<V: VTable>(V::Array);
330
331impl<V: VTable> ArrayAdapter<V> {
332    /// Provide a reference to the underlying array held within the adapter.
333    pub fn as_inner(&self) -> &V::Array {
334        &self.0
335    }
336
337    /// Unwrap into the inner array type, consuming the adapter.
338    pub fn into_inner(self) -> V::Array {
339        self.0
340    }
341}
342
343impl<V: VTable> Debug for ArrayAdapter<V> {
344    fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result {
345        self.0.fmt(f)
346    }
347}
348
349impl<V: VTable> Array for ArrayAdapter<V> {
350    fn as_any(&self) -> &dyn Any {
351        self
352    }
353
354    fn to_array(&self) -> ArrayRef {
355        Arc::new(ArrayAdapter::<V>(self.0.clone()))
356    }
357
358    fn len(&self) -> usize {
359        <V::ArrayVTable as ArrayVTable<V>>::len(&self.0)
360    }
361
362    fn dtype(&self) -> &DType {
363        <V::ArrayVTable as ArrayVTable<V>>::dtype(&self.0)
364    }
365
366    fn encoding(&self) -> EncodingRef {
367        V::encoding(&self.0)
368    }
369
370    fn encoding_id(&self) -> EncodingId {
371        V::encoding(&self.0).id()
372    }
373
374    fn slice(&self, start: usize, stop: usize) -> VortexResult<ArrayRef> {
375        if start == 0 && stop == self.len() {
376            return Ok(self.to_array());
377        }
378
379        if start > self.len() {
380            vortex_bail!(OutOfBounds: start, 0, self.len());
381        }
382        if stop > self.len() {
383            vortex_bail!(OutOfBounds: stop, 0, self.len());
384        }
385        if start > stop {
386            vortex_bail!("start ({start}) must be <= stop ({stop})");
387        }
388        if start == stop {
389            return Ok(Canonical::empty(self.dtype()).into_array());
390        }
391
392        let sliced = <V::OperationsVTable as OperationsVTable<V>>::slice(&self.0, start, stop)?;
393
394        assert_eq!(
395            sliced.len(),
396            stop - start,
397            "Slice length mismatch {}",
398            self.encoding_id()
399        );
400
401        // Slightly more expensive, so only do this in debug builds.
402        debug_assert_eq!(
403            sliced.dtype(),
404            self.dtype(),
405            "Slice dtype mismatch {}",
406            self.encoding_id()
407        );
408
409        // Propagate some stats from the original array to the sliced array.
410        if !sliced.is::<ConstantVTable>() {
411            self.statistics().with_iter(|iter| {
412                sliced.statistics().inherit(iter.filter(|(stat, value)| {
413                    matches!(
414                        stat,
415                        Stat::IsConstant | Stat::IsSorted | Stat::IsStrictSorted
416                    ) && value.as_ref().as_exact().is_some_and(|v| {
417                        Scalar::new(DType::Bool(Nullability::NonNullable), v.clone())
418                            .as_bool()
419                            .value()
420                            .unwrap_or_default()
421                    })
422                }));
423            });
424        }
425
426        Ok(sliced)
427    }
428
429    fn scalar_at(&self, index: usize) -> VortexResult<Scalar> {
430        if index >= self.len() {
431            vortex_bail!(OutOfBounds: index, 0, self.len());
432        }
433        if self.is_invalid(index)? {
434            return Ok(Scalar::null(self.dtype().clone()));
435        }
436        let scalar = <V::OperationsVTable as OperationsVTable<V>>::scalar_at(&self.0, index)?;
437        assert_eq!(self.dtype(), scalar.dtype(), "Scalar dtype mismatch");
438        Ok(scalar)
439    }
440
441    fn is_valid(&self, index: usize) -> VortexResult<bool> {
442        if index >= self.len() {
443            vortex_bail!(OutOfBounds: index, 0, self.len());
444        }
445        <V::ValidityVTable as ValidityVTable<V>>::is_valid(&self.0, index)
446    }
447
448    fn is_invalid(&self, index: usize) -> VortexResult<bool> {
449        self.is_valid(index).map(|valid| !valid)
450    }
451
452    fn all_valid(&self) -> VortexResult<bool> {
453        <V::ValidityVTable as ValidityVTable<V>>::all_valid(&self.0)
454    }
455
456    fn all_invalid(&self) -> VortexResult<bool> {
457        <V::ValidityVTable as ValidityVTable<V>>::all_invalid(&self.0)
458    }
459
460    fn valid_count(&self) -> VortexResult<usize> {
461        if let Some(Precision::Exact(invalid_count)) =
462            self.statistics().get_as::<usize>(Stat::NullCount)
463        {
464            return Ok(self.len() - invalid_count);
465        }
466
467        let count = <V::ValidityVTable as ValidityVTable<V>>::valid_count(&self.0)?;
468        assert!(count <= self.len(), "Valid count exceeds array length");
469
470        self.statistics()
471            .set(Stat::NullCount, Precision::exact(self.len() - count));
472
473        Ok(count)
474    }
475
476    fn invalid_count(&self) -> VortexResult<usize> {
477        if let Some(Precision::Exact(invalid_count)) =
478            self.statistics().get_as::<usize>(Stat::NullCount)
479        {
480            return Ok(invalid_count);
481        }
482
483        let count = <V::ValidityVTable as ValidityVTable<V>>::invalid_count(&self.0)?;
484        assert!(count <= self.len(), "Invalid count exceeds array length");
485
486        self.statistics()
487            .set(Stat::NullCount, Precision::exact(count));
488
489        Ok(count)
490    }
491
492    fn validity_mask(&self) -> VortexResult<Mask> {
493        let mask = <V::ValidityVTable as ValidityVTable<V>>::validity_mask(&self.0)?;
494        assert_eq!(mask.len(), self.len(), "Validity mask length mismatch");
495        Ok(mask)
496    }
497
498    fn to_canonical(&self) -> VortexResult<Canonical> {
499        let canonical = <V::CanonicalVTable as CanonicalVTable<V>>::canonicalize(&self.0)?;
500        assert_eq!(
501            self.len(),
502            canonical.as_ref().len(),
503            "Canonical length mismatch {}. Expected {} but encoded into {}.",
504            self.encoding_id(),
505            self.len(),
506            canonical.as_ref().len()
507        );
508        assert_eq!(
509            self.dtype(),
510            canonical.as_ref().dtype(),
511            "Canonical dtype mismatch {}. Expected {} but encoded into {}.",
512            self.encoding_id(),
513            self.dtype(),
514            canonical.as_ref().dtype()
515        );
516        canonical
517            .as_ref()
518            .statistics()
519            .replace(self.statistics().to_owned());
520        Ok(canonical)
521    }
522
523    fn append_to_builder(&self, builder: &mut dyn ArrayBuilder) -> VortexResult<()> {
524        if builder.dtype() != self.dtype() {
525            vortex_bail!(
526                "Builder dtype mismatch: expected {}, got {}",
527                self.dtype(),
528                builder.dtype(),
529            );
530        }
531        let len = builder.len();
532
533        <V::CanonicalVTable as CanonicalVTable<V>>::append_to_builder(&self.0, builder)?;
534        assert_eq!(
535            len + self.len(),
536            builder.len(),
537            "Builder length mismatch after writing array for encoding {}",
538            self.encoding_id(),
539        );
540        Ok(())
541    }
542
543    fn statistics(&self) -> StatsSetRef<'_> {
544        <V::ArrayVTable as ArrayVTable<V>>::stats(&self.0)
545    }
546
547    fn with_children(&self, children: &[ArrayRef]) -> VortexResult<ArrayRef> {
548        struct ReplacementChildren<'a> {
549            children: &'a [ArrayRef],
550        }
551
552        impl ArrayChildren for ReplacementChildren<'_> {
553            fn get(&self, index: usize, dtype: &DType, len: usize) -> VortexResult<ArrayRef> {
554                if index >= self.children.len() {
555                    vortex_bail!(OutOfBounds: index, 0, self.children.len());
556                }
557                let child = &self.children[index];
558                if child.len() != len {
559                    vortex_bail!(
560                        "Child length mismatch: expected {}, got {}",
561                        len,
562                        child.len()
563                    );
564                }
565                if child.dtype() != dtype {
566                    vortex_bail!(
567                        "Child dtype mismatch: expected {}, got {}",
568                        dtype,
569                        child.dtype()
570                    );
571                }
572                Ok(child.clone())
573            }
574
575            fn len(&self) -> usize {
576                self.children.len()
577            }
578        }
579
580        let metadata = self.metadata()?.ok_or_else(|| {
581            vortex_err!("Cannot replace children for arrays that do not support serialization")
582        })?;
583
584        // Replace the children of the array by re-building the array from parts.
585        self.encoding().build(
586            self.dtype(),
587            self.len(),
588            &metadata,
589            &self.buffers(),
590            &ReplacementChildren { children },
591        )
592    }
593
594    fn invoke(
595        &self,
596        compute_fn: &ComputeFn,
597        args: &InvocationArgs,
598    ) -> VortexResult<Option<Output>> {
599        <V::ComputeVTable as ComputeVTable<V>>::invoke(&self.0, compute_fn, args)
600    }
601}
602
603impl<V: VTable> ArrayVisitor for ArrayAdapter<V> {
604    fn children(&self) -> Vec<ArrayRef> {
605        struct ChildrenCollector {
606            children: Vec<ArrayRef>,
607        }
608
609        impl ArrayChildVisitor for ChildrenCollector {
610            fn visit_child(&mut self, _name: &str, array: &dyn Array) {
611                self.children.push(array.to_array());
612            }
613        }
614
615        let mut collector = ChildrenCollector {
616            children: Vec::new(),
617        };
618        <V::VisitorVTable as VisitorVTable<V>>::visit_children(&self.0, &mut collector);
619        collector.children
620    }
621
622    fn nchildren(&self) -> usize {
623        <V::VisitorVTable as VisitorVTable<V>>::nchildren(&self.0)
624    }
625
626    fn children_names(&self) -> Vec<String> {
627        struct ChildNameCollector {
628            names: Vec<String>,
629        }
630
631        impl ArrayChildVisitor for ChildNameCollector {
632            fn visit_child(&mut self, name: &str, _array: &dyn Array) {
633                self.names.push(name.to_string());
634            }
635        }
636
637        let mut collector = ChildNameCollector { names: Vec::new() };
638        <V::VisitorVTable as VisitorVTable<V>>::visit_children(&self.0, &mut collector);
639        collector.names
640    }
641
642    fn named_children(&self) -> Vec<(String, ArrayRef)> {
643        struct NamedChildrenCollector {
644            children: Vec<(String, ArrayRef)>,
645        }
646
647        impl ArrayChildVisitor for NamedChildrenCollector {
648            fn visit_child(&mut self, name: &str, array: &dyn Array) {
649                self.children.push((name.to_string(), array.to_array()));
650            }
651        }
652
653        let mut collector = NamedChildrenCollector {
654            children: Vec::new(),
655        };
656
657        <V::VisitorVTable as VisitorVTable<V>>::visit_children(&self.0, &mut collector);
658        collector.children
659    }
660
661    fn buffers(&self) -> Vec<ByteBuffer> {
662        struct BufferCollector {
663            buffers: Vec<ByteBuffer>,
664        }
665
666        impl ArrayBufferVisitor for BufferCollector {
667            fn visit_buffer(&mut self, buffer: &ByteBuffer) {
668                self.buffers.push(buffer.clone());
669            }
670        }
671
672        let mut collector = BufferCollector {
673            buffers: Vec::new(),
674        };
675        <V::VisitorVTable as VisitorVTable<V>>::visit_buffers(&self.0, &mut collector);
676        collector.buffers
677    }
678
679    fn nbuffers(&self) -> usize {
680        <V::VisitorVTable as VisitorVTable<V>>::nbuffers(&self.0)
681    }
682
683    fn metadata(&self) -> VortexResult<Option<Vec<u8>>> {
684        Ok(<V::SerdeVTable as SerdeVTable<V>>::metadata(&self.0)?.map(|m| m.serialize()))
685    }
686
687    fn metadata_fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result {
688        match <V::SerdeVTable as SerdeVTable<V>>::metadata(&self.0) {
689            Err(e) => write!(f, "<serde error: {e}>"),
690            Ok(None) => write!(f, "<serde not supported>"),
691            Ok(Some(metadata)) => Debug::fmt(&metadata, f),
692        }
693    }
694}