rhei-sidecar 1.5.0

Sidecar CDC consumer for Rhei — polls external databases by timestamp columns
Documentation
//! Configuration types for the [`crate::TimestampCdcConsumer`].
//!
//! [`TimestampCdcConfig`] drives the entire polling loop: which tables to watch,
//! how many rows to fetch per cycle, and how (if at all) to detect deleted rows.
//! Each table is described by a [`TimestampTableConfig`] that names the
//! timestamp and primary-key columns used for watermark ordering.

/// Strategy for detecting deleted rows in timestamp-based polling.
///
/// Hard deletes — rows that simply disappear from the source — are invisible
/// to a `WHERE updated_at > watermark` query, so deletions need their own
/// detection mechanism. Of the variants below only [`Self::SoftDelete`]
/// currently produces delete events: [`Self::Disabled`] turns detection off
/// and [`Self::FullDiff`] is reserved but not yet implemented.
///
/// Values compare with `==`, which makes it easy to assert on a parsed
/// configuration in tests or to notice that a configuration has changed.
#[derive(Debug, Clone, PartialEq, Eq)]
pub enum DeleteDetection {
    /// Disable delete detection entirely.
    ///
    /// Rows deleted in the source remain in the OLAP store unchanged.
    /// Choose this when the source never hard-deletes rows or when stale data
    /// is acceptable.
    Disabled,

    /// Detect deletes via a dedicated soft-delete column.
    ///
    /// When a row is deleted the source sets `column` to a non-`NULL`
    /// timestamp. The consumer issues a second poll query:
    ///
    /// ```text
    /// SELECT … FROM table WHERE {column} IS NOT NULL AND {column} > watermark
    /// ORDER BY {column}, pk1, … LIMIT batch_size
    /// ```
    ///
    /// (`{column}` is a placeholder for the field stored in [`Self::SoftDelete::column`].)
    ///
    /// Rows returned by this query are emitted as
    /// [`rhei_core::CdcOperation::Delete`] events. The `column` value is used
    /// as the event timestamp so that temporal validity boundaries in SCD Type 2
    /// history are correct.
    SoftDelete {
        /// Name of the column that holds the deletion timestamp
        /// (e.g., `"deleted_at"`).
        ///
        /// NOTE(review): delete detection is configured once for all tables,
        /// so presumably every polled table must expose a column with this
        /// name — confirm against the consumer.
        column: String,
    },

    /// Detect deletes by periodically comparing the full source table against
    /// the OLAP store.
    ///
    /// **Not yet implemented.** Configuring this variant logs a warning on
    /// every poll cycle and produces no delete events. Reserved for a future
    /// release.
    FullDiff {
        /// How many poll cycles to skip between full-table scans.
        /// A value of `60` runs a full diff approximately once per minute
        /// if the poll interval is one second.
        ///
        /// Has no effect today because the variant is not implemented.
        every_n_cycles: u32,
    },
}

/// Top-level configuration for [`crate::TimestampCdcConsumer`].
///
/// Describes the set of tables to replicate and the parameters that apply to
/// every table uniformly (batch size, delete detection). Anything that varies
/// from table to table — column names, primary key, projected columns — lives
/// on the individual [`TimestampTableConfig`] entries instead.
#[derive(Debug, Clone)]
pub struct TimestampCdcConfig {
    /// Tables to poll from the external source, one [`TimestampTableConfig`]
    /// per table.
    pub tables: Vec<TimestampTableConfig>,

    /// Maximum number of rows to fetch per table per poll cycle.
    ///
    /// Smaller values reduce memory pressure and latency jitter; larger values
    /// amortize query round-trips during initial bulk sync.
    ///
    /// NOTE(review): the type permits `0`; how the consumer treats a zero
    /// batch size is not visible here — confirm before relying on it.
    pub poll_batch_size: u32,

    /// Strategy for detecting rows deleted in the source.
    ///
    /// A single strategy is shared by all entries in [`Self::tables`].
    /// See [`DeleteDetection`] for the available options.
    pub delete_detection: DeleteDetection,
}

/// Per-table configuration for timestamp-based CDC polling.
///
/// Identifies the columns that the consumer reads to build the watermark
/// predicate and to classify events as inserts vs updates.
///
/// Values compare with `==` field by field, so two configurations are equal
/// only when every column name and both column lists match exactly, including
/// element order.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct TimestampTableConfig {
    /// Name of the table in the external source database.
    pub table_name: String,

    /// Column that records when the row was first created.
    ///
    /// Used together with [`Self::updated_at_column`] for the INSERT/UPDATE
    /// heuristic: if `created_at == updated_at` the row is treated as a new
    /// insert; otherwise it is treated as an update.
    pub created_at_column: String,

    /// Column that records when the row was last modified.
    ///
    /// This is the primary watermark column. The poll query filters
    /// `WHERE updated_at > watermark_ts` (with composite-PK tie-breaking).
    pub updated_at_column: String,

    /// Names of the primary key columns, in declaration order.
    ///
    /// Must contain at least one entry. The type itself does not enforce
    /// this — an empty `Vec` can still be constructed — so callers are
    /// responsible for upholding it. Used for:
    /// - Watermark tie-breaking when multiple rows share the same `updated_at`.
    /// - Setting [`rhei_core::CdcEvent::row_id`] from the first PK column.
    /// - SQL tuple comparison in the poll predicate for composite PKs.
    pub primary_key: Vec<String>,

    /// Explicit list of columns to include in `SELECT`.
    ///
    /// When empty the consumer issues `SELECT *` and all columns are included.
    /// Provide an explicit list to reduce bandwidth or exclude columns
    /// irrelevant to the OLAP workload.
    ///
    /// NOTE(review): an explicit list presumably still has to contain the
    /// timestamp and primary-key columns named above — confirm against the
    /// query builder.
    pub columns: Vec<String>,
}