Skip to main content

vortex_array/arrow/executor/
mod.rs

1// SPDX-License-Identifier: Apache-2.0
2// SPDX-FileCopyrightText: Copyright the Vortex contributors
3
4pub mod bool;
5mod byte;
6pub mod byte_view;
7mod decimal;
8mod dictionary;
9mod fixed_size_list;
10mod list;
11mod list_view;
12pub mod null;
13pub mod primitive;
14mod run_end;
15mod struct_;
16mod temporal;
17mod validity;
18
19use arrow_array::ArrayRef as ArrowArrayRef;
20use arrow_array::RecordBatch;
21use arrow_array::cast::AsArray;
22use arrow_array::types::*;
23use arrow_schema::DataType;
24use arrow_schema::Field;
25use arrow_schema::FieldRef;
26use arrow_schema::Schema;
27use itertools::Itertools;
28use vortex_dtype::DType;
29use vortex_dtype::PType;
30use vortex_error::VortexResult;
31use vortex_error::vortex_bail;
32use vortex_error::vortex_ensure;
33
34use crate::Array;
35use crate::ArrayRef;
36use crate::arrays::ListVTable;
37use crate::arrays::VarBinVTable;
38use crate::arrow::executor::bool::to_arrow_bool;
39use crate::arrow::executor::byte::to_arrow_byte_array;
40use crate::arrow::executor::byte_view::to_arrow_byte_view;
41use crate::arrow::executor::decimal::to_arrow_decimal;
42use crate::arrow::executor::dictionary::to_arrow_dictionary;
43use crate::arrow::executor::fixed_size_list::to_arrow_fixed_list;
44use crate::arrow::executor::list::to_arrow_list;
45use crate::arrow::executor::list_view::to_arrow_list_view;
46use crate::arrow::executor::null::to_arrow_null;
47use crate::arrow::executor::primitive::to_arrow_primitive;
48use crate::arrow::executor::run_end::to_arrow_run_end;
49use crate::arrow::executor::struct_::to_arrow_struct;
50use crate::arrow::executor::temporal::to_arrow_temporal;
51use crate::executor::ExecutionCtx;
52
53/// Trait for executing a Vortex array to produce an Arrow array.
54pub trait ArrowArrayExecutor: Sized {
55    /// Execute the array to produce an Arrow array.
56    ///
57    /// If a [`DataType`] is given, the array will be converted to the desired Arrow type.
58    /// If `None`, the array's preferred (cheapest) Arrow type will be used.
59    fn execute_arrow(
60        self,
61        data_type: Option<&DataType>,
62        ctx: &mut ExecutionCtx,
63    ) -> VortexResult<ArrowArrayRef>;
64
65    /// Execute the array to produce an Arrow `RecordBatch` with the given schema.
66    fn execute_record_batch(
67        self,
68        schema: &Schema,
69        ctx: &mut ExecutionCtx,
70    ) -> VortexResult<RecordBatch> {
71        let array = self.execute_arrow(Some(&DataType::Struct(schema.fields.clone())), ctx)?;
72        Ok(RecordBatch::from(array.as_struct()))
73    }
74
75    /// Execute the array to produce Arrow `RecordBatch`'s with the given schema.
76    fn execute_record_batches(
77        self,
78        schema: &Schema,
79        ctx: &mut ExecutionCtx,
80    ) -> VortexResult<Vec<RecordBatch>>;
81}
82
83impl ArrowArrayExecutor for ArrayRef {
84    fn execute_arrow(
85        self,
86        data_type: Option<&DataType>,
87        ctx: &mut ExecutionCtx,
88    ) -> VortexResult<ArrowArrayRef> {
89        let len = self.len();
90
91        // Resolve the DataType if it is a leaf type
92        // we should likely make this extensible.
93        let resolved_type: DataType = match data_type {
94            Some(dt) => dt.clone(),
95            None => preferred_arrow_type(&self)?,
96        };
97
98        let arrow = match &resolved_type {
99            DataType::Null => to_arrow_null(self, ctx),
100            DataType::Boolean => to_arrow_bool(self, ctx),
101            DataType::Int8 => to_arrow_primitive::<Int8Type>(self, ctx),
102            DataType::Int16 => to_arrow_primitive::<Int16Type>(self, ctx),
103            DataType::Int32 => to_arrow_primitive::<Int32Type>(self, ctx),
104            DataType::Int64 => to_arrow_primitive::<Int64Type>(self, ctx),
105            DataType::UInt8 => to_arrow_primitive::<UInt8Type>(self, ctx),
106            DataType::UInt16 => to_arrow_primitive::<UInt16Type>(self, ctx),
107            DataType::UInt32 => to_arrow_primitive::<UInt32Type>(self, ctx),
108            DataType::UInt64 => to_arrow_primitive::<UInt64Type>(self, ctx),
109            DataType::Float16 => to_arrow_primitive::<Float16Type>(self, ctx),
110            DataType::Float32 => to_arrow_primitive::<Float32Type>(self, ctx),
111            DataType::Float64 => to_arrow_primitive::<Float64Type>(self, ctx),
112            DataType::Timestamp(..)
113            | DataType::Date32
114            | DataType::Date64
115            | DataType::Time32(_)
116            | DataType::Time64(_) => to_arrow_temporal(self, &resolved_type, ctx),
117            DataType::Binary => to_arrow_byte_array::<BinaryType>(self, ctx),
118            DataType::LargeBinary => to_arrow_byte_array::<LargeBinaryType>(self, ctx),
119            DataType::Utf8 => to_arrow_byte_array::<Utf8Type>(self, ctx),
120            DataType::LargeUtf8 => to_arrow_byte_array::<LargeUtf8Type>(self, ctx),
121            DataType::BinaryView => to_arrow_byte_view::<BinaryViewType>(self, ctx),
122            DataType::Utf8View => to_arrow_byte_view::<StringViewType>(self, ctx),
123            // TODO(joe): pass down preferred
124            DataType::List(elements_field) => to_arrow_list::<i32>(self, elements_field, ctx),
125            // TODO(joe): pass down preferred
126            DataType::LargeList(elements_field) => to_arrow_list::<i64>(self, elements_field, ctx),
127            // TODO(joe): pass down preferred
128            DataType::FixedSizeList(elements_field, list_size) => {
129                to_arrow_fixed_list(self, *list_size, elements_field, ctx)
130            }
131            // TODO(joe): pass down preferred
132            DataType::ListView(elements_field) => {
133                to_arrow_list_view::<i32>(self, elements_field, ctx)
134            }
135            // TODO(joe): pass down preferred
136            DataType::LargeListView(elements_field) => {
137                to_arrow_list_view::<i64>(self, elements_field, ctx)
138            }
139            DataType::Struct(fields) => {
140                let fields = if data_type.is_none() {
141                    None
142                } else {
143                    Some(fields)
144                };
145                to_arrow_struct(self, fields, ctx)
146            }
147            // TODO(joe): pass down preferred
148            DataType::Dictionary(codes_type, values_type) => {
149                to_arrow_dictionary(self, codes_type, values_type, ctx)
150            }
151            dt @ DataType::Decimal32(..) => to_arrow_decimal(self, dt, ctx),
152            dt @ DataType::Decimal64(..) => to_arrow_decimal(self, dt, ctx),
153            dt @ DataType::Decimal128(..) => to_arrow_decimal(self, dt, ctx),
154            dt @ DataType::Decimal256(..) => to_arrow_decimal(self, dt, ctx),
155            // TODO(joe): pass down preferred
156            DataType::RunEndEncoded(ends_type, values_type) => {
157                to_arrow_run_end(self, ends_type.data_type(), values_type, ctx)
158            }
159            DataType::FixedSizeBinary(_)
160            | DataType::Map(..)
161            | DataType::Duration(_)
162            | DataType::Interval(_)
163            | DataType::Union(..) => {
164                vortex_bail!("Conversion to Arrow type {resolved_type} is not supported");
165            }
166        }?;
167
168        vortex_ensure!(
169            arrow.len() == len,
170            "Arrow array length does not match Vortex array length after conversion to {:?}",
171            arrow
172        );
173
174        Ok(arrow)
175    }
176
177    fn execute_record_batches(
178        self,
179        schema: &Schema,
180        ctx: &mut ExecutionCtx,
181    ) -> VortexResult<Vec<RecordBatch>> {
182        self.to_array_iterator()
183            .map(|a| a?.execute_record_batch(schema, ctx))
184            .try_collect()
185    }
186}
187
188/// Determine the preferred (cheapest) Arrow type for an array.
189///
190/// For most arrays, this returns the canonical Arrow type from `dtype.to_arrow_dtype()`.
191/// However, some encodings have cheaper Arrow representations:
192/// - `VarBinArray`: Uses `Utf8`/`Binary` (offset-based) instead of `Utf8View`/`BinaryView`
193/// - `ListArray`: Uses `List` instead of `ListView`
194fn preferred_arrow_type(array: &ArrayRef) -> VortexResult<DataType> {
195    // VarBinArray: use offset-based Binary/Utf8 instead of View types
196    if let Some(varbin) = array.as_opt::<VarBinVTable>() {
197        let offsets_ptype = PType::try_from(varbin.offsets().dtype())?;
198        let use_large = matches!(offsets_ptype, PType::I64 | PType::U64);
199
200        return Ok(match (varbin.dtype(), use_large) {
201            (DType::Utf8(_), false) => DataType::Utf8,
202            (DType::Utf8(_), true) => DataType::LargeUtf8,
203            (DType::Binary(_), false) => DataType::Binary,
204            (DType::Binary(_), true) => DataType::LargeBinary,
205            _ => unreachable!("VarBinArray must have Utf8 or Binary dtype"),
206        });
207    }
208
209    // ListArray: use List with appropriate offset size
210    if let Some(list) = array.as_opt::<ListVTable>() {
211        let offsets_ptype = PType::try_from(list.offsets().dtype())?;
212        let use_large = matches!(offsets_ptype, PType::I64 | PType::U64);
213        // Recursively get the preferred type for elements
214        let elem_dtype = preferred_arrow_type(list.elements())?;
215        let field = FieldRef::new(Field::new_list_field(
216            elem_dtype,
217            list.elements().dtype().is_nullable(),
218        ));
219
220        return Ok(if use_large {
221            DataType::LargeList(field)
222        } else {
223            DataType::List(field)
224        });
225    }
226
227    // Everything else: use canonical dtype conversion
228    array.dtype().to_arrow_dtype()
229}