vortex_array/array/
mod.rs

1// SPDX-License-Identifier: Apache-2.0
2// SPDX-FileCopyrightText: Copyright the Vortex contributors
3
4mod convert;
5pub mod display;
6mod statistics;
7mod visitor;
8
9use std::any::Any;
10use std::fmt::{Debug, Formatter};
11use std::sync::Arc;
12
13pub use convert::*;
14pub use visitor::*;
15use vortex_buffer::ByteBuffer;
16use vortex_dtype::DType;
17use vortex_error::{VortexExpect, VortexResult, vortex_bail, vortex_err};
18use vortex_mask::Mask;
19use vortex_scalar::Scalar;
20
21use crate::arrays::{
22    BoolEncoding, DecimalEncoding, ExtensionEncoding, ListEncoding, NullEncoding,
23    PrimitiveEncoding, StructEncoding, VarBinEncoding, VarBinViewEncoding,
24};
25use crate::builders::ArrayBuilder;
26use crate::compute::{ComputeFn, Cost, InvocationArgs, Output};
27use crate::serde::ArrayChildren;
28use crate::stats::{Precision, Stat, StatsProviderExt, StatsSetRef};
29use crate::vtable::{
30    ArrayVTable, CanonicalVTable, ComputeVTable, OperationsVTable, SerdeVTable, VTable,
31    ValidityVTable, VisitorVTable,
32};
33use crate::{Canonical, EncodingId, EncodingRef, SerializeMetadata};
34
35/// The public API trait for all Vortex arrays.
36pub trait Array: 'static + private::Sealed + Send + Sync + Debug + ArrayVisitor {
37    /// Returns the array as a reference to a generic [`Any`] trait object.
38    fn as_any(&self) -> &dyn Any;
39
40    /// Returns the array as an [`ArrayRef`].
41    fn to_array(&self) -> ArrayRef;
42
43    /// Returns the length of the array.
44    fn len(&self) -> usize;
45
46    /// Returns whether the array is empty (has zero rows).
47    fn is_empty(&self) -> bool {
48        self.len() == 0
49    }
50
51    /// Returns the logical Vortex [`DType`] of the array.
52    fn dtype(&self) -> &DType;
53
54    /// Returns the encoding of the array.
55    fn encoding(&self) -> EncodingRef;
56
57    /// Returns the encoding ID of the array.
58    fn encoding_id(&self) -> EncodingId;
59
60    /// Performs a constant-time slice of the array.
61    fn slice(&self, start: usize, end: usize) -> VortexResult<ArrayRef>;
62
63    /// Fetch the scalar at the given index.
64    fn scalar_at(&self, index: usize) -> VortexResult<Scalar>;
65
66    /// Returns whether the array is of the given encoding.
67    fn is_encoding(&self, encoding: EncodingId) -> bool {
68        self.encoding_id() == encoding
69    }
70
71    /// Returns whether this array is an arrow encoding.
72    // TODO(ngates): this shouldn't live here.
73    fn is_arrow(&self) -> bool {
74        self.is_encoding(NullEncoding.id())
75            || self.is_encoding(BoolEncoding.id())
76            || self.is_encoding(PrimitiveEncoding.id())
77            || self.is_encoding(VarBinEncoding.id())
78            || self.is_encoding(VarBinViewEncoding.id())
79    }
80
81    /// Whether the array is of a canonical encoding.
82    // TODO(ngates): this shouldn't live here.
83    fn is_canonical(&self) -> bool {
84        self.is_encoding(NullEncoding.id())
85            || self.is_encoding(BoolEncoding.id())
86            || self.is_encoding(PrimitiveEncoding.id())
87            || self.is_encoding(DecimalEncoding.id())
88            || self.is_encoding(StructEncoding.id())
89            || self.is_encoding(ListEncoding.id())
90            || self.is_encoding(VarBinViewEncoding.id())
91            || self.is_encoding(ExtensionEncoding.id())
92    }
93
94    /// Returns whether the item at `index` is valid.
95    fn is_valid(&self, index: usize) -> VortexResult<bool>;
96
97    /// Returns whether the item at `index` is invalid.
98    fn is_invalid(&self, index: usize) -> VortexResult<bool>;
99
100    /// Returns whether all items in the array are valid.
101    ///
102    /// This is usually cheaper than computing a precise `valid_count`.
103    fn all_valid(&self) -> VortexResult<bool>;
104
105    /// Returns whether the array is all invalid.
106    ///
107    /// This is usually cheaper than computing a precise `invalid_count`.
108    fn all_invalid(&self) -> VortexResult<bool>;
109
110    /// Returns the number of valid elements in the array.
111    fn valid_count(&self) -> VortexResult<usize>;
112
113    /// Returns the number of invalid elements in the array.
114    fn invalid_count(&self) -> VortexResult<usize>;
115
116    /// Returns the canonical validity mask for the array.
117    fn validity_mask(&self) -> VortexResult<Mask>;
118
119    /// Returns the canonical representation of the array.
120    fn to_canonical(&self) -> VortexResult<Canonical>;
121
122    /// Writes the array into the canonical builder.
123    ///
124    /// The [`DType`] of the builder must match that of the array.
125    fn append_to_builder(&self, builder: &mut dyn ArrayBuilder) -> VortexResult<()>;
126
127    /// Returns the statistics of the array.
128    // TODO(ngates): change how this works. It's weird.
129    fn statistics(&self) -> StatsSetRef<'_>;
130
131    /// Replaces the children of the array with the given array references.
132    fn with_children(&self, children: &[ArrayRef]) -> VortexResult<ArrayRef>;
133
134    /// Optionally invoke a kernel for the given compute function.
135    ///
136    /// These encoding-specific kernels are independent of kernels registered directly with
137    /// compute functions using [`ComputeFn::register_kernel`], and are attempted only if none of
138    /// the function-specific kernels returns a result.
139    ///
140    /// This allows encodings the opportunity to generically implement many compute functions
141    /// that share some property, for example [`ComputeFn::is_elementwise`], without prior
142    /// knowledge of the function itself, while still allowing users to override the implementation
143    /// of compute functions for built-in encodings. For an example, see the implementation for
144    /// chunked arrays.
145    ///
146    /// The first input in the [`InvocationArgs`] is always the array itself.
147    ///
148    /// Warning: do not call `compute_fn.invoke(args)` directly, as this will result in a recursive
149    /// call.
150    fn invoke(&self, compute_fn: &ComputeFn, args: &InvocationArgs)
151    -> VortexResult<Option<Output>>;
152}
153
154impl Array for Arc<dyn Array> {
155    fn as_any(&self) -> &dyn Any {
156        self.as_ref().as_any()
157    }
158
159    fn to_array(&self) -> ArrayRef {
160        self.clone()
161    }
162
163    fn len(&self) -> usize {
164        self.as_ref().len()
165    }
166
167    fn dtype(&self) -> &DType {
168        self.as_ref().dtype()
169    }
170
171    fn encoding(&self) -> EncodingRef {
172        self.as_ref().encoding()
173    }
174
175    fn encoding_id(&self) -> EncodingId {
176        self.as_ref().encoding_id()
177    }
178
179    fn slice(&self, start: usize, end: usize) -> VortexResult<ArrayRef> {
180        self.as_ref().slice(start, end)
181    }
182
183    fn scalar_at(&self, index: usize) -> VortexResult<Scalar> {
184        self.as_ref().scalar_at(index)
185    }
186
187    fn is_valid(&self, index: usize) -> VortexResult<bool> {
188        self.as_ref().is_valid(index)
189    }
190
191    fn is_invalid(&self, index: usize) -> VortexResult<bool> {
192        self.as_ref().is_invalid(index)
193    }
194
195    fn all_valid(&self) -> VortexResult<bool> {
196        self.as_ref().all_valid()
197    }
198
199    fn all_invalid(&self) -> VortexResult<bool> {
200        self.as_ref().all_invalid()
201    }
202
203    fn valid_count(&self) -> VortexResult<usize> {
204        self.as_ref().valid_count()
205    }
206
207    fn invalid_count(&self) -> VortexResult<usize> {
208        self.as_ref().invalid_count()
209    }
210
211    fn validity_mask(&self) -> VortexResult<Mask> {
212        self.as_ref().validity_mask()
213    }
214
215    fn to_canonical(&self) -> VortexResult<Canonical> {
216        self.as_ref().to_canonical()
217    }
218
219    fn append_to_builder(&self, builder: &mut dyn ArrayBuilder) -> VortexResult<()> {
220        self.as_ref().append_to_builder(builder)
221    }
222
223    fn statistics(&self) -> StatsSetRef<'_> {
224        self.as_ref().statistics()
225    }
226
227    fn with_children(&self, children: &[ArrayRef]) -> VortexResult<ArrayRef> {
228        self.as_ref().with_children(children)
229    }
230
231    fn invoke(
232        &self,
233        compute_fn: &ComputeFn,
234        args: &InvocationArgs,
235    ) -> VortexResult<Option<Output>> {
236        self.as_ref().invoke(compute_fn, args)
237    }
238}
239
240/// A reference counted pointer to a dynamic [`Array`] trait object.
241pub type ArrayRef = Arc<dyn Array>;
242
243impl ToOwned for dyn Array {
244    type Owned = ArrayRef;
245
246    fn to_owned(&self) -> Self::Owned {
247        self.to_array()
248    }
249}
250
251impl dyn Array + '_ {
252    /// Returns the array downcast to the given `A`.
253    pub fn as_<V: VTable>(&self) -> &V::Array {
254        self.as_opt::<V>().vortex_expect("Failed to downcast")
255    }
256
257    /// Returns the array downcast to the given `A`.
258    pub fn as_opt<V: VTable>(&self) -> Option<&V::Array> {
259        self.as_any()
260            .downcast_ref::<ArrayAdapter<V>>()
261            .map(|array_adapter| &array_adapter.0)
262    }
263
264    /// Is self an array with encoding from vtable `V`.
265    pub fn is<V: VTable>(&self) -> bool {
266        self.as_opt::<V>().is_some()
267    }
268}
269
270impl dyn Array + '_ {
271    /// Total size of the array in bytes, including all children and buffers.
272    pub fn nbytes(&self) -> u64 {
273        let mut nbytes = 0;
274        for array in self.depth_first_traversal() {
275            for buffer in array.buffers() {
276                nbytes += buffer.len() as u64;
277            }
278        }
279        nbytes
280    }
281}
282
283mod private {
284    use super::*;
285
286    pub trait Sealed {}
287
288    impl<V: VTable> Sealed for ArrayAdapter<V> {}
289    impl Sealed for Arc<dyn Array> {}
290}
291
292/// Adapter struct used to lift the [`VTable`] trait into an object-safe [`Array`]
293/// implementation.
294///
295/// Since this is a unit struct with `repr(transparent)`, we are able to turn un-adapted array
296/// structs into [`dyn Array`] using some cheeky casting inside [`std::ops::Deref`] and
297/// [`AsRef`]. See the `vtable!` macro for more details.
298#[repr(transparent)]
299pub struct ArrayAdapter<V: VTable>(V::Array);
300
301impl<V: VTable> Debug for ArrayAdapter<V> {
302    fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result {
303        self.0.fmt(f)
304    }
305}
306
307impl<V: VTable> Array for ArrayAdapter<V> {
308    fn as_any(&self) -> &dyn Any {
309        self
310    }
311
312    fn to_array(&self) -> ArrayRef {
313        Arc::new(ArrayAdapter::<V>(self.0.clone()))
314    }
315
316    fn len(&self) -> usize {
317        <V::ArrayVTable as ArrayVTable<V>>::len(&self.0)
318    }
319
320    fn dtype(&self) -> &DType {
321        <V::ArrayVTable as ArrayVTable<V>>::dtype(&self.0)
322    }
323
324    fn encoding(&self) -> EncodingRef {
325        V::encoding(&self.0)
326    }
327
328    fn encoding_id(&self) -> EncodingId {
329        V::encoding(&self.0).id()
330    }
331
332    fn slice(&self, start: usize, stop: usize) -> VortexResult<ArrayRef> {
333        if start == 0 && stop == self.len() {
334            return Ok(self.to_array());
335        }
336
337        if start > self.len() {
338            vortex_bail!(OutOfBounds: start, 0, self.len());
339        }
340        if stop > self.len() {
341            vortex_bail!(OutOfBounds: stop, 0, self.len());
342        }
343        if start > stop {
344            vortex_bail!("start ({start}) must be <= stop ({stop})");
345        }
346
347        if start == stop {
348            return Ok(Canonical::empty(self.dtype()).into_array());
349        }
350
351        // We know that constant array don't need stats propagation, so we can avoid the overhead of
352        // computing derived stats and merging them in.
353        // TODO(ngates): skip the is_constant check here, it can force an expensive compute.
354        // TODO(ngates): provide a means to slice an array _without_ propagating stats.
355        let derived_stats = (!self.0.is_constant_opts(Cost::Negligible)).then(|| {
356            let stats = self.statistics().to_owned();
357
358            // an array that is not constant can become constant after slicing
359            let is_constant = stats.get_as::<bool>(Stat::IsConstant);
360            let is_sorted = stats.get_as::<bool>(Stat::IsSorted);
361            let is_strict_sorted = stats.get_as::<bool>(Stat::IsStrictSorted);
362
363            let mut stats = stats.keep_inexact_stats(&[
364                Stat::Max,
365                Stat::Min,
366                Stat::NullCount,
367                Stat::UncompressedSizeInBytes,
368            ]);
369
370            if is_constant == Some(Precision::Exact(true)) {
371                stats.set(Stat::IsConstant, Precision::exact(true));
372            }
373            if is_sorted == Some(Precision::Exact(true)) {
374                stats.set(Stat::IsSorted, Precision::exact(true));
375            }
376            if is_strict_sorted == Some(Precision::Exact(true)) {
377                stats.set(Stat::IsStrictSorted, Precision::exact(true));
378            }
379
380            stats
381        });
382
383        let sliced = <V::OperationsVTable as OperationsVTable<V>>::slice(&self.0, start, stop)?;
384
385        assert_eq!(
386            sliced.len(),
387            stop - start,
388            "Slice length mismatch {}",
389            self.encoding_id()
390        );
391        assert_eq!(
392            sliced.dtype(),
393            self.dtype(),
394            "Slice dtype mismatch {}",
395            self.encoding_id()
396        );
397
398        if let Some(derived_stats) = derived_stats {
399            let mut stats = sliced.statistics().to_owned();
400            stats.combine_sets(&derived_stats, self.dtype())?;
401            for (stat, val) in stats.into_iter() {
402                sliced.statistics().set(stat, val)
403            }
404        }
405
406        Ok(sliced)
407    }
408
409    fn scalar_at(&self, index: usize) -> VortexResult<Scalar> {
410        if index >= self.len() {
411            vortex_bail!(OutOfBounds: index, 0, self.len());
412        }
413        if self.is_invalid(index)? {
414            return Ok(Scalar::null(self.dtype().clone()));
415        }
416        let scalar = <V::OperationsVTable as OperationsVTable<V>>::scalar_at(&self.0, index)?;
417        assert_eq!(self.dtype(), scalar.dtype(), "Scalar dtype mismatch");
418        Ok(scalar)
419    }
420
421    fn is_valid(&self, index: usize) -> VortexResult<bool> {
422        if index >= self.len() {
423            vortex_bail!(OutOfBounds: index, 0, self.len());
424        }
425        <V::ValidityVTable as ValidityVTable<V>>::is_valid(&self.0, index)
426    }
427
428    fn is_invalid(&self, index: usize) -> VortexResult<bool> {
429        self.is_valid(index).map(|valid| !valid)
430    }
431
432    fn all_valid(&self) -> VortexResult<bool> {
433        <V::ValidityVTable as ValidityVTable<V>>::all_valid(&self.0)
434    }
435
436    fn all_invalid(&self) -> VortexResult<bool> {
437        <V::ValidityVTable as ValidityVTable<V>>::all_invalid(&self.0)
438    }
439
440    fn valid_count(&self) -> VortexResult<usize> {
441        if let Some(Precision::Exact(invalid_count)) =
442            self.statistics().get_as::<usize>(Stat::NullCount)
443        {
444            return Ok(self.len() - invalid_count);
445        }
446
447        let count = <V::ValidityVTable as ValidityVTable<V>>::valid_count(&self.0)?;
448        assert!(count <= self.len(), "Valid count exceeds array length");
449
450        self.statistics()
451            .set(Stat::NullCount, Precision::exact(self.len() - count));
452
453        Ok(count)
454    }
455
456    fn invalid_count(&self) -> VortexResult<usize> {
457        if let Some(Precision::Exact(invalid_count)) =
458            self.statistics().get_as::<usize>(Stat::NullCount)
459        {
460            return Ok(invalid_count);
461        }
462
463        let count = <V::ValidityVTable as ValidityVTable<V>>::invalid_count(&self.0)?;
464        assert!(count <= self.len(), "Invalid count exceeds array length");
465
466        self.statistics()
467            .set(Stat::NullCount, Precision::exact(count));
468
469        Ok(count)
470    }
471
472    fn validity_mask(&self) -> VortexResult<Mask> {
473        let mask = <V::ValidityVTable as ValidityVTable<V>>::validity_mask(&self.0)?;
474        assert_eq!(mask.len(), self.len(), "Validity mask length mismatch");
475        Ok(mask)
476    }
477
478    fn to_canonical(&self) -> VortexResult<Canonical> {
479        let canonical = <V::CanonicalVTable as CanonicalVTable<V>>::canonicalize(&self.0)?;
480        assert_eq!(
481            self.len(),
482            canonical.as_ref().len(),
483            "Canonical length mismatch {}. Expected {} but encoded into {}.",
484            self.encoding_id(),
485            self.len(),
486            canonical.as_ref().len()
487        );
488        assert_eq!(
489            self.dtype(),
490            canonical.as_ref().dtype(),
491            "Canonical dtype mismatch {}. Expected {} but encoded into {}.",
492            self.encoding_id(),
493            self.dtype(),
494            canonical.as_ref().dtype()
495        );
496        canonical.as_ref().statistics().inherit(self.statistics());
497        Ok(canonical)
498    }
499
500    fn append_to_builder(&self, builder: &mut dyn ArrayBuilder) -> VortexResult<()> {
501        if builder.dtype() != self.dtype() {
502            vortex_bail!(
503                "Builder dtype mismatch: expected {}, got {}",
504                self.dtype(),
505                builder.dtype(),
506            );
507        }
508        let len = builder.len();
509
510        <V::CanonicalVTable as CanonicalVTable<V>>::append_to_builder(&self.0, builder)?;
511        assert_eq!(
512            len + self.len(),
513            builder.len(),
514            "Builder length mismatch after writing array for encoding {}",
515            self.encoding_id(),
516        );
517        Ok(())
518    }
519
520    fn statistics(&self) -> StatsSetRef<'_> {
521        <V::ArrayVTable as ArrayVTable<V>>::stats(&self.0)
522    }
523
524    fn with_children(&self, children: &[ArrayRef]) -> VortexResult<ArrayRef> {
525        struct ReplacementChildren<'a> {
526            children: &'a [ArrayRef],
527        }
528
529        impl ArrayChildren for ReplacementChildren<'_> {
530            fn get(&self, index: usize, dtype: &DType, len: usize) -> VortexResult<ArrayRef> {
531                if index >= self.children.len() {
532                    vortex_bail!(OutOfBounds: index, 0, self.children.len());
533                }
534                let child = &self.children[index];
535                if child.len() != len {
536                    vortex_bail!(
537                        "Child length mismatch: expected {}, got {}",
538                        len,
539                        child.len()
540                    );
541                }
542                if child.dtype() != dtype {
543                    vortex_bail!(
544                        "Child dtype mismatch: expected {}, got {}",
545                        dtype,
546                        child.dtype()
547                    );
548                }
549                Ok(child.clone())
550            }
551
552            fn len(&self) -> usize {
553                self.children.len()
554            }
555        }
556
557        let metadata = self.metadata()?.ok_or_else(|| {
558            vortex_err!("Cannot replace children for arrays that do not support serialization")
559        })?;
560
561        // Replace the children of the array by re-building the array from parts.
562        self.encoding().build(
563            self.dtype(),
564            self.len(),
565            &metadata,
566            &self.buffers(),
567            &ReplacementChildren { children },
568        )
569    }
570
571    fn invoke(
572        &self,
573        compute_fn: &ComputeFn,
574        args: &InvocationArgs,
575    ) -> VortexResult<Option<Output>> {
576        <V::ComputeVTable as ComputeVTable<V>>::invoke(&self.0, compute_fn, args)
577    }
578}
579
580impl<V: VTable> ArrayVisitor for ArrayAdapter<V> {
581    fn children(&self) -> Vec<ArrayRef> {
582        struct ChildrenCollector {
583            children: Vec<ArrayRef>,
584        }
585
586        impl ArrayChildVisitor for ChildrenCollector {
587            fn visit_child(&mut self, _name: &str, array: &dyn Array) {
588                self.children.push(array.to_array());
589            }
590        }
591
592        let mut collector = ChildrenCollector {
593            children: Vec::new(),
594        };
595        <V::VisitorVTable as VisitorVTable<V>>::visit_children(&self.0, &mut collector);
596        collector.children
597    }
598
599    fn nchildren(&self) -> usize {
600        <V::VisitorVTable as VisitorVTable<V>>::nchildren(&self.0)
601    }
602
603    fn children_names(&self) -> Vec<String> {
604        struct ChildNameCollector {
605            names: Vec<String>,
606        }
607
608        impl ArrayChildVisitor for ChildNameCollector {
609            fn visit_child(&mut self, name: &str, _array: &dyn Array) {
610                self.names.push(name.to_string());
611            }
612        }
613
614        let mut collector = ChildNameCollector { names: Vec::new() };
615        <V::VisitorVTable as VisitorVTable<V>>::visit_children(&self.0, &mut collector);
616        collector.names
617    }
618
619    fn named_children(&self) -> Vec<(String, ArrayRef)> {
620        struct NamedChildrenCollector {
621            children: Vec<(String, ArrayRef)>,
622        }
623
624        impl ArrayChildVisitor for NamedChildrenCollector {
625            fn visit_child(&mut self, name: &str, array: &dyn Array) {
626                self.children.push((name.to_string(), array.to_array()));
627            }
628        }
629
630        let mut collector = NamedChildrenCollector {
631            children: Vec::new(),
632        };
633
634        <V::VisitorVTable as VisitorVTable<V>>::visit_children(&self.0, &mut collector);
635        collector.children
636    }
637
638    fn buffers(&self) -> Vec<ByteBuffer> {
639        struct BufferCollector {
640            buffers: Vec<ByteBuffer>,
641        }
642
643        impl ArrayBufferVisitor for BufferCollector {
644            fn visit_buffer(&mut self, buffer: &ByteBuffer) {
645                self.buffers.push(buffer.clone());
646            }
647        }
648
649        let mut collector = BufferCollector {
650            buffers: Vec::new(),
651        };
652        <V::VisitorVTable as VisitorVTable<V>>::visit_buffers(&self.0, &mut collector);
653        collector.buffers
654    }
655
656    fn nbuffers(&self) -> usize {
657        <V::VisitorVTable as VisitorVTable<V>>::nbuffers(&self.0)
658    }
659
660    fn metadata(&self) -> VortexResult<Option<Vec<u8>>> {
661        Ok(<V::SerdeVTable as SerdeVTable<V>>::metadata(&self.0)?.map(|m| m.serialize()))
662    }
663
664    fn metadata_fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result {
665        match <V::SerdeVTable as SerdeVTable<V>>::metadata(&self.0) {
666            Err(e) => write!(f, "<serde error: {e}>"),
667            Ok(None) => write!(f, "<serde not supported>"),
668            Ok(Some(metadata)) => Debug::fmt(&metadata, f),
669        }
670    }
671}