rhei_datafusion/storage.rs
1//! Pluggable storage modes for the DataFusion OLAP engine.
2//!
3//! - `InMemory`: default, data stored as `Vec<RecordBatch>` (lost on shutdown)
4//! - `ArrowIpc`: persistent `.arrow` files (zero serde overhead, no compression)
5//! - `Parquet`: persistent `.parquet` files (compressed, predicate pushdown)
6//! - `S3Parquet` *(cloud-storage feature)*: Parquet files stored on Amazon S3
7//! - `GcsParquet` *(cloud-storage feature)*: Parquet files stored on Google Cloud Storage
8
9use std::path::PathBuf;
10
/// Storage mode for the DataFusion OLAP engine.
///
/// Defaults to [`StorageMode::InMemory`].
///
/// Two modes compare equal when they are the same variant and their
/// `path` / `url` fields compare equal, which makes it possible to check a
/// configuration with `==` or `assert_eq!`.
#[derive(Debug, Clone, Default, PartialEq, Eq)]
pub enum StorageMode {
    /// In-memory storage (default). Data is lost on shutdown.
    #[default]
    InMemory,
    /// Arrow IPC file storage. Zero-overhead native Arrow format.
    ///
    /// Data is persisted as `.arrow` files in `path` and survives restarts.
    /// No compression is applied, which makes this mode ideal when disk
    /// space is not a concern and you want the fastest possible durable
    /// writes.
    ArrowIpc {
        /// Base directory under which per-table subdirectories are created.
        path: PathBuf,
    },
    /// Parquet file storage. Compressed columnar format with predicate
    /// pushdown.
    ///
    /// Data is persisted as `.parquet` files in `path`. Writes are slower
    /// than with [`StorageMode::ArrowIpc`] (Parquet encoding overhead), but
    /// the on-disk footprint is much smaller and column/row-group pruning is
    /// available at read time.
    Parquet {
        /// Base directory under which per-table subdirectories are created.
        path: PathBuf,
    },
    /// S3-backed Parquet storage.
    ///
    /// Available on crate feature `cloud-storage` only.
    ///
    /// `url` must start with `s3://<bucket>/<prefix>`. Credentials are read
    /// from environment variables (`AWS_ACCESS_KEY_ID`,
    /// `AWS_SECRET_ACCESS_KEY`, `AWS_REGION`, etc.) following `object_store`
    /// crate conventions.
    ///
    /// # Characteristics
    ///
    /// - **Reads** are handled natively by DataFusion's `ListingTable` once
    ///   the object store is registered.
    /// - **Writes** (INSERT / `load_arrow`) append a new Parquet object under
    ///   the table prefix. For analytical workloads with infrequent bulk
    ///   loads this is efficient.
    /// - **UPDATE / DELETE** require a read-modify-write cycle: the full
    ///   table is downloaded, mutated in-memory, and re-uploaded as a single
    ///   consolidated object. This is **not** suitable for high-frequency
    ///   point updates.
    /// - Network latency is a factor for all operations — local storage
    ///   modes are faster for single-host deployments.
    #[cfg(feature = "cloud-storage")]
    S3Parquet {
        /// S3 URL of the form `s3://<bucket>/<prefix>` pointing to the base
        /// storage prefix. Per-table subdirectories are created beneath it.
        url: String,
    },
    /// GCS-backed Parquet storage.
    ///
    /// Available on crate feature `cloud-storage` only.
    ///
    /// `url` must start with `gs://<bucket>/<prefix>`. Credentials follow
    /// `object_store` defaults (application default credentials,
    /// service-account key file via `GOOGLE_APPLICATION_CREDENTIALS`, etc.).
    ///
    /// The same read/write characteristics as [`StorageMode::S3Parquet`]
    /// apply.
    #[cfg(feature = "cloud-storage")]
    GcsParquet {
        /// GCS URL of the form `gs://<bucket>/<prefix>` pointing to the base
        /// storage prefix. Per-table subdirectories are created beneath it.
        url: String,
    },
}
77
78impl StorageMode {
79 /// Returns the base path for local file-backed modes, or `None` for
80 /// in-memory and cloud modes.
81 pub fn base_path(&self) -> Option<&PathBuf> {
82 match self {
83 Self::InMemory => None,
84 Self::ArrowIpc { path } | Self::Parquet { path } => Some(path),
85 #[cfg(feature = "cloud-storage")]
86 Self::S3Parquet { .. } | Self::GcsParquet { .. } => None,
87 }
88 }
89
90 /// Returns the file extension for this storage mode.
91 pub fn file_extension(&self) -> &'static str {
92 match self {
93 Self::InMemory => "",
94 Self::ArrowIpc { .. } => "arrow",
95 Self::Parquet { .. } => "parquet",
96 #[cfg(feature = "cloud-storage")]
97 Self::S3Parquet { .. } | Self::GcsParquet { .. } => "parquet",
98 }
99 }
100
101 /// Returns the cloud base URL for cloud-backed modes, or `None` for local modes.
102 #[cfg(feature = "cloud-storage")]
103 pub fn cloud_base_url(&self) -> Option<&str> {
104 match self {
105 Self::S3Parquet { url } | Self::GcsParquet { url } => Some(url.as_str()),
106 _ => None,
107 }
108 }
109
110 /// Returns `true` if this mode uses cloud object storage.
111 pub fn is_cloud(&self) -> bool {
112 #[cfg(feature = "cloud-storage")]
113 {
114 matches!(self, Self::S3Parquet { .. } | Self::GcsParquet { .. })
115 }
116 #[cfg(not(feature = "cloud-storage"))]
117 {
118 false
119 }
120 }
121}