vortex_array/array/
mod.rs

1// SPDX-License-Identifier: Apache-2.0
2// SPDX-FileCopyrightText: Copyright the Vortex contributors
3
4pub mod display;
5mod visitor;
6
7use std::any::Any;
8use std::fmt::{Debug, Formatter};
9use std::sync::Arc;
10
11pub use visitor::*;
12use vortex_buffer::ByteBuffer;
13use vortex_dtype::DType;
14use vortex_error::{VortexExpect, VortexResult, vortex_bail, vortex_err};
15use vortex_mask::Mask;
16use vortex_scalar::Scalar;
17
18use crate::arrays::{
19    BoolEncoding, ConstantVTable, DecimalEncoding, ExtensionEncoding, ListEncoding, NullEncoding,
20    PrimitiveEncoding, StructEncoding, VarBinEncoding, VarBinViewEncoding,
21};
22use crate::builders::ArrayBuilder;
23use crate::compute::{ComputeFn, Cost, InvocationArgs, IsConstantOpts, Output, is_constant_opts};
24use crate::serde::ArrayChildren;
25use crate::stats::{Precision, Stat, StatsSetRef};
26use crate::vtable::{
27    ArrayVTable, CanonicalVTable, ComputeVTable, OperationsVTable, SerdeVTable, VTable,
28    ValidityVTable, VisitorVTable,
29};
30use crate::{Canonical, EncodingId, EncodingRef, SerializeMetadata};
31
32/// The public API trait for all Vortex arrays.
33pub trait Array: 'static + private::Sealed + Send + Sync + Debug + ArrayVisitor {
34    /// Returns the array as a reference to a generic [`Any`] trait object.
35    fn as_any(&self) -> &dyn Any;
36
37    /// Returns the array as an [`ArrayRef`].
38    fn to_array(&self) -> ArrayRef;
39
40    /// Returns the length of the array.
41    fn len(&self) -> usize;
42
43    /// Returns whether the array is empty (has zero rows).
44    fn is_empty(&self) -> bool {
45        self.len() == 0
46    }
47
48    /// Returns the logical Vortex [`DType`] of the array.
49    fn dtype(&self) -> &DType;
50
51    /// Returns the encoding of the array.
52    fn encoding(&self) -> EncodingRef;
53
54    /// Returns the encoding ID of the array.
55    fn encoding_id(&self) -> EncodingId;
56
57    /// Performs a constant-time slice of the array.
58    fn slice(&self, start: usize, end: usize) -> VortexResult<ArrayRef>;
59
60    /// Fetch the scalar at the given index.
61    fn scalar_at(&self, index: usize) -> VortexResult<Scalar>;
62
63    /// Returns whether the array is of the given encoding.
64    fn is_encoding(&self, encoding: EncodingId) -> bool {
65        self.encoding_id() == encoding
66    }
67
68    /// Returns whether this array is an arrow encoding.
69    // TODO(ngates): this shouldn't live here.
70    fn is_arrow(&self) -> bool {
71        self.is_encoding(NullEncoding.id())
72            || self.is_encoding(BoolEncoding.id())
73            || self.is_encoding(PrimitiveEncoding.id())
74            || self.is_encoding(VarBinEncoding.id())
75            || self.is_encoding(VarBinViewEncoding.id())
76    }
77
78    /// Whether the array is of a canonical encoding.
79    // TODO(ngates): this shouldn't live here.
80    fn is_canonical(&self) -> bool {
81        self.is_encoding(NullEncoding.id())
82            || self.is_encoding(BoolEncoding.id())
83            || self.is_encoding(PrimitiveEncoding.id())
84            || self.is_encoding(DecimalEncoding.id())
85            || self.is_encoding(StructEncoding.id())
86            || self.is_encoding(ListEncoding.id())
87            || self.is_encoding(VarBinViewEncoding.id())
88            || self.is_encoding(ExtensionEncoding.id())
89    }
90
91    /// Returns whether the item at `index` is valid.
92    fn is_valid(&self, index: usize) -> VortexResult<bool>;
93
94    /// Returns whether the item at `index` is invalid.
95    fn is_invalid(&self, index: usize) -> VortexResult<bool>;
96
97    /// Returns whether all items in the array are valid.
98    ///
99    /// This is usually cheaper than computing a precise `valid_count`.
100    fn all_valid(&self) -> VortexResult<bool>;
101
102    /// Returns whether the array is all invalid.
103    ///
104    /// This is usually cheaper than computing a precise `invalid_count`.
105    fn all_invalid(&self) -> VortexResult<bool>;
106
107    /// Returns the number of valid elements in the array.
108    fn valid_count(&self) -> VortexResult<usize>;
109
110    /// Returns the number of invalid elements in the array.
111    fn invalid_count(&self) -> VortexResult<usize>;
112
113    /// Returns the canonical validity mask for the array.
114    fn validity_mask(&self) -> VortexResult<Mask>;
115
116    /// Returns the canonical representation of the array.
117    fn to_canonical(&self) -> VortexResult<Canonical>;
118
119    /// Writes the array into the canonical builder.
120    ///
121    /// The [`DType`] of the builder must match that of the array.
122    fn append_to_builder(&self, builder: &mut dyn ArrayBuilder) -> VortexResult<()>;
123
124    /// Returns the statistics of the array.
125    // TODO(ngates): change how this works. It's weird.
126    fn statistics(&self) -> StatsSetRef<'_>;
127
128    /// Replaces the children of the array with the given array references.
129    fn with_children(&self, children: &[ArrayRef]) -> VortexResult<ArrayRef>;
130
131    /// Optionally invoke a kernel for the given compute function.
132    ///
133    /// These encoding-specific kernels are independent of kernels registered directly with
134    /// compute functions using [`ComputeFn::register_kernel`], and are attempted only if none of
135    /// the function-specific kernels returns a result.
136    ///
137    /// This allows encodings the opportunity to generically implement many compute functions
138    /// that share some property, for example [`ComputeFn::is_elementwise`], without prior
139    /// knowledge of the function itself, while still allowing users to override the implementation
140    /// of compute functions for built-in encodings. For an example, see the implementation for
141    /// chunked arrays.
142    ///
143    /// The first input in the [`InvocationArgs`] is always the array itself.
144    ///
145    /// Warning: do not call `compute_fn.invoke(args)` directly, as this will result in a recursive
146    /// call.
147    fn invoke(&self, compute_fn: &ComputeFn, args: &InvocationArgs)
148    -> VortexResult<Option<Output>>;
149}
150
151impl Array for Arc<dyn Array> {
152    fn as_any(&self) -> &dyn Any {
153        self.as_ref().as_any()
154    }
155
156    fn to_array(&self) -> ArrayRef {
157        self.clone()
158    }
159
160    fn len(&self) -> usize {
161        self.as_ref().len()
162    }
163
164    fn dtype(&self) -> &DType {
165        self.as_ref().dtype()
166    }
167
168    fn encoding(&self) -> EncodingRef {
169        self.as_ref().encoding()
170    }
171
172    fn encoding_id(&self) -> EncodingId {
173        self.as_ref().encoding_id()
174    }
175
176    fn slice(&self, start: usize, end: usize) -> VortexResult<ArrayRef> {
177        self.as_ref().slice(start, end)
178    }
179
180    fn scalar_at(&self, index: usize) -> VortexResult<Scalar> {
181        self.as_ref().scalar_at(index)
182    }
183
184    fn is_valid(&self, index: usize) -> VortexResult<bool> {
185        self.as_ref().is_valid(index)
186    }
187
188    fn is_invalid(&self, index: usize) -> VortexResult<bool> {
189        self.as_ref().is_invalid(index)
190    }
191
192    fn all_valid(&self) -> VortexResult<bool> {
193        self.as_ref().all_valid()
194    }
195
196    fn all_invalid(&self) -> VortexResult<bool> {
197        self.as_ref().all_invalid()
198    }
199
200    fn valid_count(&self) -> VortexResult<usize> {
201        self.as_ref().valid_count()
202    }
203
204    fn invalid_count(&self) -> VortexResult<usize> {
205        self.as_ref().invalid_count()
206    }
207
208    fn validity_mask(&self) -> VortexResult<Mask> {
209        self.as_ref().validity_mask()
210    }
211
212    fn to_canonical(&self) -> VortexResult<Canonical> {
213        self.as_ref().to_canonical()
214    }
215
216    fn append_to_builder(&self, builder: &mut dyn ArrayBuilder) -> VortexResult<()> {
217        self.as_ref().append_to_builder(builder)
218    }
219
220    fn statistics(&self) -> StatsSetRef<'_> {
221        self.as_ref().statistics()
222    }
223
224    fn with_children(&self, children: &[ArrayRef]) -> VortexResult<ArrayRef> {
225        self.as_ref().with_children(children)
226    }
227
228    fn invoke(
229        &self,
230        compute_fn: &ComputeFn,
231        args: &InvocationArgs,
232    ) -> VortexResult<Option<Output>> {
233        self.as_ref().invoke(compute_fn, args)
234    }
235}
236
237/// A reference counted pointer to a dynamic [`Array`] trait object.
238pub type ArrayRef = Arc<dyn Array>;
239
240impl ToOwned for dyn Array {
241    type Owned = ArrayRef;
242
243    fn to_owned(&self) -> Self::Owned {
244        self.to_array()
245    }
246}
247
248impl dyn Array + '_ {
249    /// Returns the array downcast to the given `A`.
250    pub fn as_<V: VTable>(&self) -> &V::Array {
251        self.as_opt::<V>().vortex_expect("Failed to downcast")
252    }
253
254    /// Returns the array downcast to the given `A`.
255    pub fn as_opt<V: VTable>(&self) -> Option<&V::Array> {
256        self.as_any()
257            .downcast_ref::<ArrayAdapter<V>>()
258            .map(|array_adapter| &array_adapter.0)
259    }
260
261    /// Is self an array with encoding from vtable `V`.
262    pub fn is<V: VTable>(&self) -> bool {
263        self.as_opt::<V>().is_some()
264    }
265
266    pub fn is_constant(&self) -> bool {
267        let opts = IsConstantOpts {
268            cost: Cost::Specialized,
269        };
270        is_constant_opts(self, &opts)
271            .inspect_err(|e| log::warn!("Failed to compute IsConstant: {e}"))
272            .ok()
273            .flatten()
274            .unwrap_or_default()
275    }
276
277    pub fn is_constant_opts(&self, cost: Cost) -> bool {
278        let opts = IsConstantOpts { cost };
279        is_constant_opts(self, &opts)
280            .inspect_err(|e| log::warn!("Failed to compute IsConstant: {e}"))
281            .ok()
282            .flatten()
283            .unwrap_or_default()
284    }
285
286    pub fn as_constant(&self) -> Option<Scalar> {
287        self.is_constant().then(|| self.scalar_at(0).ok()).flatten()
288    }
289
290    /// Total size of the array in bytes, including all children and buffers.
291    pub fn nbytes(&self) -> u64 {
292        let mut nbytes = 0;
293        for array in self.depth_first_traversal() {
294            for buffer in array.buffers() {
295                nbytes += buffer.len() as u64;
296            }
297        }
298        nbytes
299    }
300}
301
302/// Trait for converting a type into a Vortex [`ArrayRef`].
303pub trait IntoArray {
304    fn into_array(self) -> ArrayRef;
305}
306
307impl IntoArray for ArrayRef {
308    fn into_array(self) -> ArrayRef {
309        self
310    }
311}
312
313mod private {
314    use super::*;
315
316    pub trait Sealed {}
317
318    impl<V: VTable> Sealed for ArrayAdapter<V> {}
319    impl Sealed for Arc<dyn Array> {}
320}
321
322/// Adapter struct used to lift the [`VTable`] trait into an object-safe [`Array`]
323/// implementation.
324///
325/// Since this is a unit struct with `repr(transparent)`, we are able to turn un-adapted array
326/// structs into [`dyn Array`] using some cheeky casting inside [`std::ops::Deref`] and
327/// [`AsRef`]. See the `vtable!` macro for more details.
328#[repr(transparent)]
329pub struct ArrayAdapter<V: VTable>(V::Array);
330
331impl<V: VTable> Debug for ArrayAdapter<V> {
332    fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result {
333        self.0.fmt(f)
334    }
335}
336
337impl<V: VTable> Array for ArrayAdapter<V> {
338    fn as_any(&self) -> &dyn Any {
339        self
340    }
341
342    fn to_array(&self) -> ArrayRef {
343        Arc::new(ArrayAdapter::<V>(self.0.clone()))
344    }
345
346    fn len(&self) -> usize {
347        <V::ArrayVTable as ArrayVTable<V>>::len(&self.0)
348    }
349
350    fn dtype(&self) -> &DType {
351        <V::ArrayVTable as ArrayVTable<V>>::dtype(&self.0)
352    }
353
354    fn encoding(&self) -> EncodingRef {
355        V::encoding(&self.0)
356    }
357
358    fn encoding_id(&self) -> EncodingId {
359        V::encoding(&self.0).id()
360    }
361
362    fn slice(&self, start: usize, stop: usize) -> VortexResult<ArrayRef> {
363        if start == 0 && stop == self.len() {
364            return Ok(self.to_array());
365        }
366
367        if start > self.len() {
368            vortex_bail!(OutOfBounds: start, 0, self.len());
369        }
370        if stop > self.len() {
371            vortex_bail!(OutOfBounds: stop, 0, self.len());
372        }
373        if start > stop {
374            vortex_bail!("start ({start}) must be <= stop ({stop})");
375        }
376        if start == stop {
377            return Ok(Canonical::empty(self.dtype()).into_array());
378        }
379
380        let sliced = <V::OperationsVTable as OperationsVTable<V>>::slice(&self.0, start, stop)?;
381
382        assert_eq!(
383            sliced.len(),
384            stop - start,
385            "Slice length mismatch {}",
386            self.encoding_id()
387        );
388
389        // Slightly more expensive, so only do this in debug builds.
390        debug_assert_eq!(
391            sliced.dtype(),
392            self.dtype(),
393            "Slice dtype mismatch {}",
394            self.encoding_id()
395        );
396
397        // Propagate some stats from the original array to the sliced array.
398        if !sliced.is::<ConstantVTable>() {
399            self.statistics().with_iter(|iter| {
400                sliced.statistics().inherit(iter.filter(|(stat, value)| {
401                    matches!(
402                        stat,
403                        Stat::IsConstant | Stat::IsSorted | Stat::IsStrictSorted
404                    ) && value.as_ref().as_exact().is_some_and(|v| {
405                        v.as_bool()
406                            .vortex_expect("must be a bool")
407                            .unwrap_or_default()
408                    })
409                }));
410            });
411        }
412
413        Ok(sliced)
414    }
415
416    fn scalar_at(&self, index: usize) -> VortexResult<Scalar> {
417        if index >= self.len() {
418            vortex_bail!(OutOfBounds: index, 0, self.len());
419        }
420        if self.is_invalid(index)? {
421            return Ok(Scalar::null(self.dtype().clone()));
422        }
423        let scalar = <V::OperationsVTable as OperationsVTable<V>>::scalar_at(&self.0, index)?;
424        assert_eq!(self.dtype(), scalar.dtype(), "Scalar dtype mismatch");
425        Ok(scalar)
426    }
427
428    fn is_valid(&self, index: usize) -> VortexResult<bool> {
429        if index >= self.len() {
430            vortex_bail!(OutOfBounds: index, 0, self.len());
431        }
432        <V::ValidityVTable as ValidityVTable<V>>::is_valid(&self.0, index)
433    }
434
435    fn is_invalid(&self, index: usize) -> VortexResult<bool> {
436        self.is_valid(index).map(|valid| !valid)
437    }
438
439    fn all_valid(&self) -> VortexResult<bool> {
440        <V::ValidityVTable as ValidityVTable<V>>::all_valid(&self.0)
441    }
442
443    fn all_invalid(&self) -> VortexResult<bool> {
444        <V::ValidityVTable as ValidityVTable<V>>::all_invalid(&self.0)
445    }
446
447    fn valid_count(&self) -> VortexResult<usize> {
448        if let Some(Precision::Exact(invalid_count)) =
449            self.statistics().get_as::<usize>(Stat::NullCount)
450        {
451            return Ok(self.len() - invalid_count);
452        }
453
454        let count = <V::ValidityVTable as ValidityVTable<V>>::valid_count(&self.0)?;
455        assert!(count <= self.len(), "Valid count exceeds array length");
456
457        self.statistics()
458            .set(Stat::NullCount, Precision::exact(self.len() - count));
459
460        Ok(count)
461    }
462
463    fn invalid_count(&self) -> VortexResult<usize> {
464        if let Some(Precision::Exact(invalid_count)) =
465            self.statistics().get_as::<usize>(Stat::NullCount)
466        {
467            return Ok(invalid_count);
468        }
469
470        let count = <V::ValidityVTable as ValidityVTable<V>>::invalid_count(&self.0)?;
471        assert!(count <= self.len(), "Invalid count exceeds array length");
472
473        self.statistics()
474            .set(Stat::NullCount, Precision::exact(count));
475
476        Ok(count)
477    }
478
479    fn validity_mask(&self) -> VortexResult<Mask> {
480        let mask = <V::ValidityVTable as ValidityVTable<V>>::validity_mask(&self.0)?;
481        assert_eq!(mask.len(), self.len(), "Validity mask length mismatch");
482        Ok(mask)
483    }
484
485    fn to_canonical(&self) -> VortexResult<Canonical> {
486        let canonical = <V::CanonicalVTable as CanonicalVTable<V>>::canonicalize(&self.0)?;
487        assert_eq!(
488            self.len(),
489            canonical.as_ref().len(),
490            "Canonical length mismatch {}. Expected {} but encoded into {}.",
491            self.encoding_id(),
492            self.len(),
493            canonical.as_ref().len()
494        );
495        assert_eq!(
496            self.dtype(),
497            canonical.as_ref().dtype(),
498            "Canonical dtype mismatch {}. Expected {} but encoded into {}.",
499            self.encoding_id(),
500            self.dtype(),
501            canonical.as_ref().dtype()
502        );
503        canonical
504            .as_ref()
505            .statistics()
506            .replace(self.statistics().to_owned());
507        Ok(canonical)
508    }
509
510    fn append_to_builder(&self, builder: &mut dyn ArrayBuilder) -> VortexResult<()> {
511        if builder.dtype() != self.dtype() {
512            vortex_bail!(
513                "Builder dtype mismatch: expected {}, got {}",
514                self.dtype(),
515                builder.dtype(),
516            );
517        }
518        let len = builder.len();
519
520        <V::CanonicalVTable as CanonicalVTable<V>>::append_to_builder(&self.0, builder)?;
521        assert_eq!(
522            len + self.len(),
523            builder.len(),
524            "Builder length mismatch after writing array for encoding {}",
525            self.encoding_id(),
526        );
527        Ok(())
528    }
529
530    fn statistics(&self) -> StatsSetRef<'_> {
531        <V::ArrayVTable as ArrayVTable<V>>::stats(&self.0)
532    }
533
534    fn with_children(&self, children: &[ArrayRef]) -> VortexResult<ArrayRef> {
535        struct ReplacementChildren<'a> {
536            children: &'a [ArrayRef],
537        }
538
539        impl ArrayChildren for ReplacementChildren<'_> {
540            fn get(&self, index: usize, dtype: &DType, len: usize) -> VortexResult<ArrayRef> {
541                if index >= self.children.len() {
542                    vortex_bail!(OutOfBounds: index, 0, self.children.len());
543                }
544                let child = &self.children[index];
545                if child.len() != len {
546                    vortex_bail!(
547                        "Child length mismatch: expected {}, got {}",
548                        len,
549                        child.len()
550                    );
551                }
552                if child.dtype() != dtype {
553                    vortex_bail!(
554                        "Child dtype mismatch: expected {}, got {}",
555                        dtype,
556                        child.dtype()
557                    );
558                }
559                Ok(child.clone())
560            }
561
562            fn len(&self) -> usize {
563                self.children.len()
564            }
565        }
566
567        let metadata = self.metadata()?.ok_or_else(|| {
568            vortex_err!("Cannot replace children for arrays that do not support serialization")
569        })?;
570
571        // Replace the children of the array by re-building the array from parts.
572        self.encoding().build(
573            self.dtype(),
574            self.len(),
575            &metadata,
576            &self.buffers(),
577            &ReplacementChildren { children },
578        )
579    }
580
581    fn invoke(
582        &self,
583        compute_fn: &ComputeFn,
584        args: &InvocationArgs,
585    ) -> VortexResult<Option<Output>> {
586        <V::ComputeVTable as ComputeVTable<V>>::invoke(&self.0, compute_fn, args)
587    }
588}
589
590impl<V: VTable> ArrayVisitor for ArrayAdapter<V> {
591    fn children(&self) -> Vec<ArrayRef> {
592        struct ChildrenCollector {
593            children: Vec<ArrayRef>,
594        }
595
596        impl ArrayChildVisitor for ChildrenCollector {
597            fn visit_child(&mut self, _name: &str, array: &dyn Array) {
598                self.children.push(array.to_array());
599            }
600        }
601
602        let mut collector = ChildrenCollector {
603            children: Vec::new(),
604        };
605        <V::VisitorVTable as VisitorVTable<V>>::visit_children(&self.0, &mut collector);
606        collector.children
607    }
608
609    fn nchildren(&self) -> usize {
610        <V::VisitorVTable as VisitorVTable<V>>::nchildren(&self.0)
611    }
612
613    fn children_names(&self) -> Vec<String> {
614        struct ChildNameCollector {
615            names: Vec<String>,
616        }
617
618        impl ArrayChildVisitor for ChildNameCollector {
619            fn visit_child(&mut self, name: &str, _array: &dyn Array) {
620                self.names.push(name.to_string());
621            }
622        }
623
624        let mut collector = ChildNameCollector { names: Vec::new() };
625        <V::VisitorVTable as VisitorVTable<V>>::visit_children(&self.0, &mut collector);
626        collector.names
627    }
628
629    fn named_children(&self) -> Vec<(String, ArrayRef)> {
630        struct NamedChildrenCollector {
631            children: Vec<(String, ArrayRef)>,
632        }
633
634        impl ArrayChildVisitor for NamedChildrenCollector {
635            fn visit_child(&mut self, name: &str, array: &dyn Array) {
636                self.children.push((name.to_string(), array.to_array()));
637            }
638        }
639
640        let mut collector = NamedChildrenCollector {
641            children: Vec::new(),
642        };
643
644        <V::VisitorVTable as VisitorVTable<V>>::visit_children(&self.0, &mut collector);
645        collector.children
646    }
647
648    fn buffers(&self) -> Vec<ByteBuffer> {
649        struct BufferCollector {
650            buffers: Vec<ByteBuffer>,
651        }
652
653        impl ArrayBufferVisitor for BufferCollector {
654            fn visit_buffer(&mut self, buffer: &ByteBuffer) {
655                self.buffers.push(buffer.clone());
656            }
657        }
658
659        let mut collector = BufferCollector {
660            buffers: Vec::new(),
661        };
662        <V::VisitorVTable as VisitorVTable<V>>::visit_buffers(&self.0, &mut collector);
663        collector.buffers
664    }
665
666    fn nbuffers(&self) -> usize {
667        <V::VisitorVTable as VisitorVTable<V>>::nbuffers(&self.0)
668    }
669
670    fn metadata(&self) -> VortexResult<Option<Vec<u8>>> {
671        Ok(<V::SerdeVTable as SerdeVTable<V>>::metadata(&self.0)?.map(|m| m.serialize()))
672    }
673
674    fn metadata_fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result {
675        match <V::SerdeVTable as SerdeVTable<V>>::metadata(&self.0) {
676            Err(e) => write!(f, "<serde error: {e}>"),
677            Ok(None) => write!(f, "<serde not supported>"),
678            Ok(Some(metadata)) => Debug::fmt(&metadata, f),
679        }
680    }
681}