// rhei_datafusion/storage.rs

//! Pluggable storage modes for the DataFusion OLAP engine.
//!
//! - `InMemory`: default, data stored as `Vec<RecordBatch>` (lost on shutdown)
//! - `ArrowIpc`: persistent `.arrow` files (zero serde overhead, no compression)
//! - `Parquet`: persistent `.parquet` files (compressed, predicate pushdown)
//! - `S3Parquet` *(cloud-storage feature)*: Parquet files stored on Amazon S3
//! - `GcsParquet` *(cloud-storage feature)*: Parquet files stored on Google Cloud Storage
8
9use std::path::PathBuf;
10
/// Storage mode for the DataFusion OLAP engine.
///
/// Local modes carry the base directory they persist into; cloud modes
/// (available with the `cloud-storage` crate feature) carry the base URL of
/// the object-store prefix.
///
/// The type is cheap to compare and hash (`PartialEq`, `Eq`, `Hash`), so a
/// configured mode can be checked with `==`, used in `assert_eq!`, or stored
/// as a map/set key. Equality is structural: same variant and same
/// path / URL value.
#[derive(Debug, Clone, Default, PartialEq, Eq, Hash)]
pub enum StorageMode {
    /// In-memory storage (default). Data is lost on shutdown.
    #[default]
    InMemory,
    /// Arrow IPC file storage. Zero-overhead native Arrow format.
    ///
    /// Data is persisted as `.arrow` files in `path`. Survives restarts.
    /// No compression — ideal when disk space is not a concern and you want
    /// the fastest possible durable writes.
    ArrowIpc {
        /// Base directory under which per-table subdirectories are created.
        path: PathBuf,
    },
    /// Parquet file storage. Compressed columnar with predicate pushdown.
    ///
    /// Data is persisted as `.parquet` files in `path`. Slower writes than
    /// [`StorageMode::ArrowIpc`] (Parquet encoding overhead) but much smaller
    /// on-disk footprint and enables column/row-group pruning at read time.
    Parquet {
        /// Base directory under which per-table subdirectories are created.
        path: PathBuf,
    },
    /// S3-backed Parquet storage.
    ///
    /// Available on crate feature `cloud-storage` only.
    ///
    /// `url` must start with `s3://<bucket>/<prefix>`. Credentials are read
    /// from environment variables (`AWS_ACCESS_KEY_ID`, `AWS_SECRET_ACCESS_KEY`,
    /// `AWS_REGION`, etc.) following `object_store` crate conventions.
    ///
    /// # Characteristics
    ///
    /// - **Reads** are handled natively by DataFusion's `ListingTable` once the
    ///   object store is registered.
    /// - **Writes** (INSERT / `load_arrow`) append a new Parquet object under the
    ///   table prefix.  For analytical workloads with infrequent bulk loads this
    ///   is efficient.
    /// - **UPDATE / DELETE** require a read-modify-write cycle: the full table is
    ///   downloaded, mutated in-memory, and re-uploaded as a single consolidated
    ///   object.  This is **not** suitable for high-frequency point updates.
    /// - Network latency is a factor for all operations — local storage modes are
    ///   faster for single-host deployments.
    #[cfg(feature = "cloud-storage")]
    S3Parquet {
        /// S3 URL of the form `s3://<bucket>/<prefix>` pointing to the base
        /// storage prefix.  Per-table subdirectories are created beneath it.
        url: String,
    },
    /// GCS-backed Parquet storage.
    ///
    /// Available on crate feature `cloud-storage` only.
    ///
    /// `url` must start with `gs://<bucket>/<prefix>`. Credentials follow
    /// `object_store` defaults (application default credentials, service-account
    /// key file via `GOOGLE_APPLICATION_CREDENTIALS`, etc.).
    ///
    /// The same read/write characteristics as [`StorageMode::S3Parquet`] apply.
    #[cfg(feature = "cloud-storage")]
    GcsParquet {
        /// GCS URL of the form `gs://<bucket>/<prefix>` pointing to the base
        /// storage prefix.  Per-table subdirectories are created beneath it.
        url: String,
    },
}
77
78impl StorageMode {
79    /// Returns the base path for local file-backed modes, or `None` for
80    /// in-memory and cloud modes.
81    pub fn base_path(&self) -> Option<&PathBuf> {
82        match self {
83            Self::InMemory => None,
84            Self::ArrowIpc { path } | Self::Parquet { path } => Some(path),
85            #[cfg(feature = "cloud-storage")]
86            Self::S3Parquet { .. } | Self::GcsParquet { .. } => None,
87        }
88    }
89
90    /// Returns the file extension for this storage mode.
91    pub fn file_extension(&self) -> &'static str {
92        match self {
93            Self::InMemory => "",
94            Self::ArrowIpc { .. } => "arrow",
95            Self::Parquet { .. } => "parquet",
96            #[cfg(feature = "cloud-storage")]
97            Self::S3Parquet { .. } | Self::GcsParquet { .. } => "parquet",
98        }
99    }
100
101    /// Returns the cloud base URL for cloud-backed modes, or `None` for local modes.
102    #[cfg(feature = "cloud-storage")]
103    pub fn cloud_base_url(&self) -> Option<&str> {
104        match self {
105            Self::S3Parquet { url } | Self::GcsParquet { url } => Some(url.as_str()),
106            _ => None,
107        }
108    }
109
110    /// Returns `true` if this mode uses cloud object storage.
111    pub fn is_cloud(&self) -> bool {
112        #[cfg(feature = "cloud-storage")]
113        {
114            matches!(self, Self::S3Parquet { .. } | Self::GcsParquet { .. })
115        }
116        #[cfg(not(feature = "cloud-storage"))]
117        {
118            false
119        }
120    }
121}