Skip to main content

vortex_scan/
api.rs

1// SPDX-License-Identifier: Apache-2.0
2// SPDX-FileCopyrightText: Copyright the Vortex contributors
3
4#![deny(missing_docs)]
5
6//! The Vortex Scan API implements an abstract table scan interface that can be used to
7//! read data from various data sources.
8//!
9//! It supports arbitrary projection expressions, filter expressions, and limit pushdown as well
10//! as mechanisms for parallel and distributed execution via splits.
11//!
//! The API is currently under development and may change in future releases; however, we hope to
//! stabilize it into a stable C ABI for use within foreign language bindings.
14//!
15//! ## Open Issues
16//!
17//! * We probably want to make the DataSource serializable as well, so that we can share
18//!   source-level state with workers, separately from split serialization.
19//! * We should add a way for the client to negotiate capabilities with the data source, for
20//!   example which encodings it knows about.
21
22use std::any::Any;
23use std::sync::Arc;
24
25use async_trait::async_trait;
26use futures::stream::BoxStream;
27use vortex_array::expr::Expression;
28use vortex_array::stream::SendableArrayStream;
29use vortex_dtype::DType;
30use vortex_error::VortexResult;
31use vortex_session::VortexSession;
32
33/// Create a Vortex source from serialized configuration.
34///
35/// Providers can be registered with Vortex under a specific
36#[async_trait(?Send)]
37pub trait DataSourceProvider: 'static {
38    /// Attempt to initialize a new source.
39    ///
40    /// Returns `Ok(None)` if the provider cannot handle the given URI.
41    async fn initialize(
42        &self,
43        uri: String,
44        session: &VortexSession,
45    ) -> VortexResult<Option<DataSourceRef>>;
46}
47
48/// A reference-counted data source.
49pub type DataSourceRef = Arc<dyn DataSource>;
50
51/// A data source represents a streamable dataset that can be scanned with projection and filter
52/// expressions. Each scan produces splits that can be executed (potentially in parallel) to read
53/// data. Each split can be serialized for remote execution.
54///
55/// The DataSource may be used multiple times to create multiple scans, whereas each scan and each
56/// split of a scan can only be consumed once.
57#[async_trait]
58pub trait DataSource: 'static + Send + Sync {
59    /// Returns the dtype of the source.
60    fn dtype(&self) -> &DType;
61
62    /// Returns an estimate of the row count of the source.
63    fn row_count_estimate(&self) -> Estimate<u64>;
64
65    /// Returns a scan over the source.
66    async fn scan(&self, scan_request: ScanRequest) -> VortexResult<DataSourceScanRef>;
67
68    /// Serialize a split from this data source.
69    fn serialize_split(&self, split: &dyn Split) -> VortexResult<Vec<u8>>;
70
71    /// Deserialize a split that was previously serialized from a compatible data source.
72    fn deserialize_split(&self, data: &[u8]) -> VortexResult<SplitRef>;
73}
74
75/// A request to scan a data source.
76#[derive(Debug, Clone, Default)]
77pub struct ScanRequest {
78    /// Projection expression, `None` implies `root()`.
79    pub projection: Option<Expression>,
80    /// Filter expression, `None` implies no filter.
81    pub filter: Option<Expression>,
82    /// Optional limit on the number of rows to scan.
83    pub limit: Option<u64>,
84}
85
86/// A boxed data source scan.
87pub type DataSourceScanRef = Box<dyn DataSourceScan>;
88
89/// A data source scan produces splits that can be executed to read data from the source.
90#[async_trait]
91pub trait DataSourceScan: 'static + Send {
92    /// The returned dtype of the scan.
93    fn dtype(&self) -> &DType;
94
95    /// An estimate of the remaining splits.
96    fn remaining_splits_estimate(&self) -> Estimate<usize>;
97
98    /// Returns the next batch of splits to be processed.
99    ///
100    /// This should not return _more_ than `max_splits` splits, but may return fewer.
101    async fn next_splits(&mut self, max_splits: usize) -> VortexResult<Vec<SplitRef>>;
102}
103
104/// A stream of splits.
105pub type SplitStream = BoxStream<'static, VortexResult<SplitRef>>;
106
107/// A reference-counted split.
108pub type SplitRef = Box<dyn Split>;
109
110/// A split represents a unit of work that can be executed to produce a stream of arrays.
111pub trait Split: 'static + Send {
112    /// Downcast the split to a concrete type.
113    fn as_any(&self) -> &dyn Any;
114
115    /// Executes the split.
116    fn execute(self: Box<Self>) -> VortexResult<SendableArrayStream>;
117
118    /// Returns an estimate of the row count for this split.
119    fn row_count_estimate(&self) -> Estimate<u64>;
120
121    /// Returns an estimate of the byte size for this split.
122    fn byte_size_estimate(&self) -> Estimate<u64>;
123}
124
/// An estimate that can be exact, an upper bound, or unknown.
///
/// The default value is [`Estimate::Unknown`].
///
/// The common value-type traits are derived so that estimates can be logged, copied and
/// compared (for example in `assert_eq!`) whenever `T` supports the corresponding operation.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Default)]
pub enum Estimate<T> {
    /// The exact value.
    Exact(T),
    /// An upper bound on the value.
    UpperBound(T),
    /// The value is unknown.
    #[default]
    Unknown,
}