rhei_sidecar/config.rs
1//! Configuration types for the [`crate::TimestampCdcConsumer`].
2//!
3//! [`TimestampCdcConfig`] drives the entire polling loop: which tables to watch,
4//! how many rows to fetch per cycle, and how (if at all) to detect deleted rows.
5//! Each table is described by a [`TimestampTableConfig`] that names the
6//! timestamp and primary-key columns used for watermark ordering.
7
/// How the timestamp-based poller learns about rows deleted in the source.
///
/// A hard delete removes the row outright, so a
/// `WHERE updated_at > watermark` query can never observe it. Callers either
/// pick one of the detection strategies below or switch detection off.
///
/// The type implements [`PartialEq`] and [`Eq`] so that configurations can be
/// compared directly, e.g. with `assert_eq!` in tests.
#[derive(Debug, Clone, PartialEq, Eq)]
pub enum DeleteDetection {
    /// No delete detection at all.
    ///
    /// Rows removed from the source stay in the OLAP store untouched. Pick
    /// this when the source never hard-deletes rows, or when stale data is
    /// acceptable.
    Disabled,

    /// Deletes are signalled through a dedicated soft-delete column.
    ///
    /// The source marks a row as deleted by writing a non-`NULL` timestamp
    /// into `column`. The consumer then runs a second poll query:
    ///
    /// ```text
    /// SELECT … FROM table WHERE {column} IS NOT NULL AND {column} > watermark
    /// ORDER BY {column}, pk1, … LIMIT batch_size
    /// ```
    ///
    /// (`{column}` stands for the value of this variant's `column` field.)
    ///
    /// Every row the query returns is emitted as a
    /// [`rhei_core::CdcOperation::Delete`] event. The value read from
    /// `column` becomes the event timestamp, which keeps the temporal
    /// validity boundaries of SCD Type 2 history correct.
    SoftDelete {
        /// Column holding the deletion timestamp (e.g., `"deleted_at"`).
        column: String,
    },

    /// Deletes are found by periodically comparing the whole source table
    /// with the OLAP store.
    ///
    /// **Not yet implemented.** Selecting this variant logs a warning on
    /// every poll cycle and yields no delete events. It is reserved for a
    /// future release.
    FullDiff {
        /// Number of poll cycles to skip between full-table scans.
        ///
        /// With a one-second poll interval, `60` corresponds to roughly one
        /// full diff per minute.
        every_n_cycles: u32,
    },
}
57
58/// Top-level configuration for [`crate::TimestampCdcConsumer`].
59///
60/// Describes the set of tables to replicate and the parameters that apply to
61/// every table uniformly (batch size, delete detection).
62#[derive(Debug, Clone)]
63pub struct TimestampCdcConfig {
64 /// Tables to poll from the external source, one [`TimestampTableConfig`]
65 /// per table.
66 pub tables: Vec<TimestampTableConfig>,
67
68 /// Maximum number of rows to fetch per table per poll cycle.
69 ///
70 /// Smaller values reduce memory pressure and latency jitter; larger values
71 /// amortize query round-trips during initial bulk sync.
72 pub poll_batch_size: u32,
73
74 /// Strategy for detecting rows deleted in the source.
75 ///
76 /// See [`DeleteDetection`] for the available options.
77 pub delete_detection: DeleteDetection,
78}
79
/// Settings for polling a single table with timestamp-based CDC.
///
/// Names the columns the consumer reads to build the watermark predicate and
/// to tell inserts apart from updates.
///
/// The type implements [`PartialEq`] and [`Eq`] so that table configurations
/// can be compared directly, e.g. with `assert_eq!` in tests.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct TimestampTableConfig {
    /// Table name in the external source database.
    pub table_name: String,

    /// Column recording when the row was first created.
    ///
    /// Combined with [`Self::updated_at_column`] for the INSERT/UPDATE
    /// heuristic: a row whose `created_at == updated_at` is treated as a new
    /// insert, any other row as an update.
    pub created_at_column: String,

    /// Column recording when the row was last modified.
    ///
    /// This is the primary watermark column: the poll query filters on
    /// `WHERE updated_at > watermark_ts` (with composite-PK tie-breaking).
    pub updated_at_column: String,

    /// Primary key column names, in declaration order.
    ///
    /// At least one entry is required. The list is used for:
    /// - Breaking watermark ties when several rows share one `updated_at`.
    /// - Filling [`rhei_core::CdcEvent::row_id`] from the first PK column.
    /// - The SQL tuple comparison in the poll predicate for composite PKs.
    pub primary_key: Vec<String>,

    /// Explicit column list for the `SELECT`.
    ///
    /// An empty list makes the consumer issue `SELECT *`, which includes
    /// every column. Supply a list to cut bandwidth or to leave out columns
    /// that are irrelevant to the OLAP workload.
    pub columns: Vec<String>,
}