Skip to main content

rhei_sidecar/
config.rs

//! Configuration types for the [`crate::TimestampCdcConsumer`].
//!
//! [`TimestampCdcConfig`] drives the entire polling loop: which tables to watch,
//! how many rows to fetch per cycle, and how (if at all) to detect deleted rows.
//! Each table is described by a [`TimestampTableConfig`] that names the
//! timestamp and primary-key columns used for watermark ordering.
7
/// How the poller learns about rows that were deleted in the source.
///
/// A hard delete removes the row outright, so a
/// `WHERE updated_at > watermark` query can never observe it. Callers pick
/// one of two detection strategies here, or switch detection off entirely.
///
/// Values compare by variant and payload (`PartialEq`/`Eq`), so two
/// configurations can be checked for equality directly.
#[derive(Debug, Clone, PartialEq, Eq)]
pub enum DeleteDetection {
    /// No delete detection.
    ///
    /// Rows deleted in the source remain in the OLAP store unchanged.
    /// Suitable when the source never hard-deletes rows or when stale data
    /// is acceptable.
    Disabled,

    /// Deletes are signalled through a dedicated soft-delete column.
    ///
    /// The source marks a row as deleted by setting `column` to a
    /// non-`NULL` timestamp. The consumer then issues a second poll query:
    ///
    /// ```text
    /// SELECT … FROM table WHERE {column} IS NOT NULL AND {column} > watermark
    /// ORDER BY {column}, pk1, … LIMIT batch_size
    /// ```
    ///
    /// (`{column}` is a placeholder for the field stored in [`Self::SoftDelete::column`].)
    ///
    /// Every row this query returns is emitted as a
    /// [`rhei_core::CdcOperation::Delete`] event. The `column` value becomes
    /// the event timestamp, which keeps temporal validity boundaries in
    /// SCD Type 2 history correct.
    SoftDelete {
        /// Name of the column that holds the deletion timestamp
        /// (e.g., `"deleted_at"`).
        column: String,
    },

    /// Deletes are found by periodically comparing the full source table
    /// against the OLAP store.
    ///
    /// **Not yet implemented.** Configuring this variant logs a warning on
    /// every poll cycle and produces no delete events. Reserved for a future
    /// release.
    FullDiff {
        /// How many poll cycles to skip between full-table scans.
        /// A value of `60` runs a full diff approximately once per minute
        /// if the poll interval is one second.
        every_n_cycles: u32,
    },
}
57
58/// Top-level configuration for [`crate::TimestampCdcConsumer`].
59///
60/// Describes the set of tables to replicate and the parameters that apply to
61/// every table uniformly (batch size, delete detection).
62#[derive(Debug, Clone)]
63pub struct TimestampCdcConfig {
64    /// Tables to poll from the external source, one [`TimestampTableConfig`]
65    /// per table.
66    pub tables: Vec<TimestampTableConfig>,
67
68    /// Maximum number of rows to fetch per table per poll cycle.
69    ///
70    /// Smaller values reduce memory pressure and latency jitter; larger values
71    /// amortize query round-trips during initial bulk sync.
72    pub poll_batch_size: u32,
73
74    /// Strategy for detecting rows deleted in the source.
75    ///
76    /// See [`DeleteDetection`] for the available options.
77    pub delete_detection: DeleteDetection,
78}
79
/// Settings for polling a single table with timestamp-based CDC.
///
/// Names the columns the consumer reads to build the watermark predicate and
/// to classify events as inserts vs updates.
///
/// Values compare field by field (`PartialEq`/`Eq`), so two table
/// configurations can be checked for equality directly.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct TimestampTableConfig {
    /// Name of the table in the external source database.
    pub table_name: String,

    /// Column that records when the row was first created.
    ///
    /// Paired with [`Self::updated_at_column`] for the INSERT/UPDATE
    /// heuristic: a row with `created_at == updated_at` is treated as a new
    /// insert; any other row is treated as an update.
    pub created_at_column: String,

    /// Column that records when the row was last modified.
    ///
    /// This is the primary watermark column. The poll query filters
    /// `WHERE updated_at > watermark_ts` (with composite-PK tie-breaking).
    pub updated_at_column: String,

    /// Names of the primary key columns, in declaration order.
    ///
    /// Must contain at least one entry. Used for:
    /// - Watermark tie-breaking when multiple rows share the same `updated_at`.
    /// - Setting [`rhei_core::CdcEvent::row_id`] from the first PK column.
    /// - SQL tuple comparison in the poll predicate for composite PKs.
    pub primary_key: Vec<String>,

    /// Explicit list of columns to include in `SELECT`.
    ///
    /// When empty the consumer issues `SELECT *` and all columns are included.
    /// Provide an explicit list to reduce bandwidth or exclude columns
    /// irrelevant to the OLAP workload.
    pub columns: Vec<String>,
}