rhei-datafusion 1.5.0

DataFusion OLAP backend for Rhei HTAP engine
Documentation
//! Pluggable storage modes for the DataFusion OLAP engine.
//!
//! - `InMemory`: default, data stored as `Vec<RecordBatch>` (lost on shutdown)
//! - `ArrowIpc`: persistent `.arrow` files (zero serde overhead, no compression)
//! - `Parquet`: persistent `.parquet` files (compressed, predicate pushdown)
//! - `S3Parquet` *(cloud-storage feature)*: Parquet files stored on Amazon S3
//! - `GcsParquet` *(cloud-storage feature)*: Parquet files stored on Google Cloud Storage

use std::path::PathBuf;

/// Storage mode for the DataFusion OLAP engine.
///
/// Selects where table data lives: in process memory, in local Arrow IPC or
/// Parquet files, or (with the `cloud-storage` feature) as Parquet objects in
/// a cloud object store.
///
/// The type implements [`PartialEq`] and [`Eq`], so two configurations can be
/// compared directly (for example in tests or when detecting a config change).
/// Comparison is structural: the variant and its `path` / `url` must match.
#[derive(Debug, Clone, Default, PartialEq, Eq)]
pub enum StorageMode {
    /// In-memory storage (default). Data is lost on shutdown.
    #[default]
    InMemory,
    /// Arrow IPC file storage. Zero-overhead native Arrow format.
    ///
    /// Data is persisted as `.arrow` files in `path`. Survives restarts.
    /// No compression — ideal when disk space is not a concern and you want
    /// the fastest possible durable writes.
    ArrowIpc {
        /// Base directory under which per-table subdirectories are created.
        path: PathBuf,
    },
    /// Parquet file storage. Compressed columnar with predicate pushdown.
    ///
    /// Data is persisted as `.parquet` files in `path`. Slower writes than
    /// [`StorageMode::ArrowIpc`] (Parquet encoding overhead) but much smaller
    /// on-disk footprint and enables column/row-group pruning at read time.
    Parquet {
        /// Base directory under which per-table subdirectories are created.
        path: PathBuf,
    },
    /// S3-backed Parquet storage.
    ///
    /// Available on crate feature `cloud-storage` only.
    ///
    /// `url` must start with `s3://<bucket>/<prefix>`. Credentials are read
    /// from environment variables (`AWS_ACCESS_KEY_ID`, `AWS_SECRET_ACCESS_KEY`,
    /// `AWS_REGION`, etc.) following `object_store` crate conventions.
    ///
    /// # Characteristics
    ///
    /// - **Reads** are handled natively by DataFusion's `ListingTable` once the
    ///   object store is registered.
    /// - **Writes** (INSERT / `load_arrow`) append a new Parquet object under the
    ///   table prefix.  For analytical workloads with infrequent bulk loads this
    ///   is efficient.
    /// - **UPDATE / DELETE** require a read-modify-write cycle: the full table is
    ///   downloaded, mutated in-memory, and re-uploaded as a single consolidated
    ///   object.  This is **not** suitable for high-frequency point updates.
    /// - Network latency is a factor for all operations — local storage modes are
    ///   faster for single-host deployments.
    #[cfg(feature = "cloud-storage")]
    S3Parquet {
        /// S3 URL of the form `s3://<bucket>/<prefix>` pointing to the base
        /// storage prefix.  Per-table subdirectories are created beneath it.
        url: String,
    },
    /// GCS-backed Parquet storage.
    ///
    /// Available on crate feature `cloud-storage` only.
    ///
    /// `url` must start with `gs://<bucket>/<prefix>`. Credentials follow
    /// `object_store` defaults (application default credentials, service-account
    /// key file via `GOOGLE_APPLICATION_CREDENTIALS`, etc.).
    ///
    /// The same read/write characteristics as [`StorageMode::S3Parquet`] apply.
    #[cfg(feature = "cloud-storage")]
    GcsParquet {
        /// GCS URL of the form `gs://<bucket>/<prefix>` pointing to the base
        /// storage prefix.  Per-table subdirectories are created beneath it.
        url: String,
    },
}

impl StorageMode {
    /// Base directory of a local file-backed mode.
    ///
    /// Yields `Some(path)` for [`StorageMode::ArrowIpc`] and
    /// [`StorageMode::Parquet`]; every other mode (in-memory, and the cloud
    /// modes when the `cloud-storage` feature is enabled) yields `None`.
    pub fn base_path(&self) -> Option<&PathBuf> {
        match self {
            Self::ArrowIpc { path: dir } | Self::Parquet { path: dir } => Some(dir),
            Self::InMemory => None,
            #[cfg(feature = "cloud-storage")]
            Self::S3Parquet { .. } | Self::GcsParquet { .. } => None,
        }
    }

    /// File extension (without a leading dot) used by this storage mode.
    ///
    /// The in-memory mode has no files and reports an empty string.
    pub fn file_extension(&self) -> &'static str {
        match self {
            Self::ArrowIpc { .. } => "arrow",
            Self::Parquet { .. } => "parquet",
            #[cfg(feature = "cloud-storage")]
            Self::S3Parquet { .. } | Self::GcsParquet { .. } => "parquet",
            Self::InMemory => "",
        }
    }

    /// Base URL of a cloud-backed mode, or `None` for the in-memory and
    /// local file modes.
    #[cfg(feature = "cloud-storage")]
    pub fn cloud_base_url(&self) -> Option<&str> {
        match self {
            Self::S3Parquet { url: base } | Self::GcsParquet { url: base } => Some(base.as_str()),
            // Local modes are listed explicitly so that adding a variant
            // forces a decision here instead of silently returning `None`.
            Self::InMemory | Self::ArrowIpc { .. } | Self::Parquet { .. } => None,
        }
    }

    /// Whether this mode stores data in cloud object storage.
    ///
    /// Always `false` when the crate is built without the `cloud-storage`
    /// feature, because the cloud variants do not exist in that build.
    pub fn is_cloud(&self) -> bool {
        match self {
            Self::InMemory | Self::ArrowIpc { .. } | Self::Parquet { .. } => false,
            #[cfg(feature = "cloud-storage")]
            Self::S3Parquet { .. } | Self::GcsParquet { .. } => true,
        }
    }
}