vortex_scan/api.rs
1// SPDX-License-Identifier: Apache-2.0
2// SPDX-FileCopyrightText: Copyright the Vortex contributors
3
4#![deny(missing_docs)]
5
//! The Vortex Scan API implements an abstract table scan interface that can be used to
//! read data from various data sources.
//!
//! It supports arbitrary projection expressions, filter expressions, and limit pushdown, as well
//! as mechanisms for parallel and distributed execution via splits.
//!
//! The API is currently under development and may change in future releases; however, we hope to
//! stabilize it into a stable C ABI for use within foreign language bindings.
//!
//! ## Open Issues
//!
//! * We probably want to make the DataSource serializable as well, so that we can share
//!   source-level state with workers, separately from split serialization.
//! * We should add a way for the client to negotiate capabilities with the data source, for
//!   example which encodings it knows about.
21
22use std::any::Any;
23use std::sync::Arc;
24
25use async_trait::async_trait;
26use futures::stream::BoxStream;
27use vortex_array::expr::Expression;
28use vortex_array::stream::SendableArrayStream;
29use vortex_dtype::DType;
30use vortex_error::VortexResult;
31use vortex_session::VortexSession;
32
33/// Create a Vortex source from serialized configuration.
34///
35/// Providers can be registered with Vortex under a specific
36#[async_trait(?Send)]
37pub trait DataSourceProvider: 'static {
38 /// Attempt to initialize a new source.
39 ///
40 /// Returns `Ok(None)` if the provider cannot handle the given URI.
41 async fn initialize(
42 &self,
43 uri: String,
44 session: &VortexSession,
45 ) -> VortexResult<Option<DataSourceRef>>;
46}
47
48/// A reference-counted data source.
49pub type DataSourceRef = Arc<dyn DataSource>;
50
51/// A data source represents a streamable dataset that can be scanned with projection and filter
52/// expressions. Each scan produces splits that can be executed (potentially in parallel) to read
53/// data. Each split can be serialized for remote execution.
54///
55/// The DataSource may be used multiple times to create multiple scans, whereas each scan and each
56/// split of a scan can only be consumed once.
57#[async_trait]
58pub trait DataSource: 'static + Send + Sync {
59 /// Returns the dtype of the source.
60 fn dtype(&self) -> &DType;
61
62 /// Returns an estimate of the row count of the source.
63 fn row_count_estimate(&self) -> Estimate<u64>;
64
65 /// Returns a scan over the source.
66 async fn scan(&self, scan_request: ScanRequest) -> VortexResult<DataSourceScanRef>;
67
68 /// Serialize a split from this data source.
69 fn serialize_split(&self, split: &dyn Split) -> VortexResult<Vec<u8>>;
70
71 /// Deserialize a split that was previously serialized from a compatible data source.
72 fn deserialize_split(&self, data: &[u8]) -> VortexResult<SplitRef>;
73}
74
75/// A request to scan a data source.
76#[derive(Debug, Clone, Default)]
77pub struct ScanRequest {
78 /// Projection expression, `None` implies `root()`.
79 pub projection: Option<Expression>,
80 /// Filter expression, `None` implies no filter.
81 pub filter: Option<Expression>,
82 /// Optional limit on the number of rows to scan.
83 pub limit: Option<u64>,
84}
85
86/// A boxed data source scan.
87pub type DataSourceScanRef = Box<dyn DataSourceScan>;
88
89/// A data source scan produces splits that can be executed to read data from the source.
90#[async_trait]
91pub trait DataSourceScan: 'static + Send {
92 /// The returned dtype of the scan.
93 fn dtype(&self) -> &DType;
94
95 /// An estimate of the remaining splits.
96 fn remaining_splits_estimate(&self) -> Estimate<usize>;
97
98 /// Returns the next batch of splits to be processed.
99 ///
100 /// This should not return _more_ than `max_splits` splits, but may return fewer.
101 async fn next_splits(&mut self, max_splits: usize) -> VortexResult<Vec<SplitRef>>;
102}
103
104/// A stream of splits.
105pub type SplitStream = BoxStream<'static, VortexResult<SplitRef>>;
106
107/// A reference-counted split.
108pub type SplitRef = Box<dyn Split>;
109
110/// A split represents a unit of work that can be executed to produce a stream of arrays.
111pub trait Split: 'static + Send {
112 /// Downcast the split to a concrete type.
113 fn as_any(&self) -> &dyn Any;
114
115 /// Executes the split.
116 fn execute(self: Box<Self>) -> VortexResult<SendableArrayStream>;
117
118 /// Returns an estimate of the row count for this split.
119 fn row_count_estimate(&self) -> Estimate<u64>;
120
121 /// Returns an estimate of the byte size for this split.
122 fn byte_size_estimate(&self) -> Estimate<u64>;
123}
124
/// An estimate that can be exact, an upper bound, or unknown.
///
/// The default value is [`Estimate::Unknown`]. Because the default is a unit variant, the
/// derived `Default` implementation does not require `T: Default`.
///
/// The common value traits (`Debug`, `Clone`, `Copy`, `PartialEq`, `Eq`) are derived so that
/// estimates can be logged, copied and compared whenever `T` supports the same operations.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Default)]
pub enum Estimate<T> {
    /// The exact value.
    Exact(T),
    /// An upper bound on the value.
    UpperBound(T),
    /// The value is unknown.
    #[default]
    Unknown,
}