Skip to main content

reddb_server/
physical.rs

1//! Physical storage design primitives for RedDB's deterministic on-disk layout.
2
3use std::collections::BTreeMap;
4use std::fs;
5use std::io;
6use std::path::{Path, PathBuf};
7use std::time::{SystemTime, UNIX_EPOCH};
8
9use crate::api::{CatalogSnapshot, CollectionStats, RedDBOptions, SchemaManifest, StorageMode};
10use crate::index::IndexKind;
11use crate::serde_json::{Map, Value as JsonValue};
12
/// Default value of `GridLayout::block_size`: `512 * 1024` (524_288).
/// NOTE(review): presumably a byte count (512 KiB) — unit is not stated here; confirm.
pub const DEFAULT_GRID_BLOCK_SIZE: usize = 512 * 1024;
/// Default value of `GridLayout::page_size`: 4096.
/// NOTE(review): presumably a byte count — unit is not stated here; confirm.
pub const DEFAULT_PAGE_SIZE: usize = 4096;
15pub use reddb_file::layout::PHYSICAL_METADATA_BINARY_EXTENSION;
16pub use reddb_file::{
17    fold_dwb_into_wal_enabled, fold_pager_meta_enabled, meta_json_sidecar_enabled,
18    seqn_journal_enabled, seqn_journal_retention, set_fold_dwb_into_wal_enabled,
19    set_fold_pager_meta_enabled, set_meta_json_sidecar_enabled, set_seqn_journal_enabled,
20    set_seqn_journal_retention, BlockReference, ExportDescriptor, ManifestEvent, ManifestEventKind,
21    ManifestPointers, PhysicalAnalyticsJob, PhysicalGraphProjection, PhysicalTreeDefinition,
22    SnapshotDescriptor, SuperblockHeader, DEFAULT_METADATA_JOURNAL_RETENTION,
23    DEFAULT_SUPERBLOCK_COPIES, OPT_IN_METADATA_JOURNAL_RETENTION,
24    PHYSICAL_METADATA_PROTOCOL_VERSION,
25};
/// Default manifest-event history size (256).
/// NOTE(review): presumably the maximum number of `ManifestEvent`s retained —
/// the consumer of this constant is not visible in this file section; confirm.
pub const DEFAULT_MANIFEST_EVENT_HISTORY: usize = 256;
27
/// Tag naming a physical-metadata source: binary, binary journal, or JSON.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum PhysicalMetadataSource {
    Binary,
    BinaryJournal,
    Json,
}

impl PhysicalMetadataSource {
    /// Returns the fixed lowercase label for this source
    /// (`"binary"`, `"binary_journal"` or `"json"`).
    pub fn as_str(self) -> &'static str {
        match self {
            PhysicalMetadataSource::Json => "json",
            PhysicalMetadataSource::BinaryJournal => "binary_journal",
            PhysicalMetadataSource::Binary => "binary",
        }
    }
}
/// Compaction policy carried by `PhysicalLayout::compaction`.
///
/// `PhysicalLayout::from_options` always selects [`CompactionPolicy::Incremental`].
/// NOTE(review): the runtime meaning of each variant is defined by the
/// compaction code, which is not visible in this file section.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum CompactionPolicy {
    Incremental,
    Manual,
}
49
/// Write-ahead-log settings carried by a physical layout.
#[derive(Debug, Clone)]
pub struct WalPolicy {
    pub auto_checkpoint_pages: u32,
    pub fsync_on_commit: bool,
    pub ring_buffer_bytes: u64,
}

impl Default for WalPolicy {
    /// Defaults: checkpoint threshold of 1000 pages, fsync on commit enabled,
    /// and a ring buffer of `64 * 1024 * 1024` bytes.
    fn default() -> Self {
        const MIB: u64 = 1024 * 1024;

        WalPolicy {
            ring_buffer_bytes: 64 * MIB,
            fsync_on_commit: true,
            auto_checkpoint_pages: 1000,
        }
    }
}
66
67#[derive(Debug, Clone)]
68pub struct GridLayout {
69    pub block_size: usize,
70    pub page_size: usize,
71    pub superblock_copies: u8,
72}
73
74impl Default for GridLayout {
75    fn default() -> Self {
76        Self {
77            block_size: DEFAULT_GRID_BLOCK_SIZE,
78            page_size: DEFAULT_PAGE_SIZE,
79            superblock_copies: DEFAULT_SUPERBLOCK_COPIES,
80        }
81    }
82}
83
84#[derive(Debug, Clone)]
85pub struct PhysicalLayout {
86    pub mode: StorageMode,
87    pub grid: GridLayout,
88    pub wal: WalPolicy,
89    pub compaction: CompactionPolicy,
90}
91
92impl PhysicalLayout {
93    pub fn from_options(options: &RedDBOptions) -> Self {
94        Self {
95            mode: options.mode,
96            grid: GridLayout::default(),
97            wal: WalPolicy {
98                auto_checkpoint_pages: options.auto_checkpoint_pages,
99                ..WalPolicy::default()
100            },
101            compaction: CompactionPolicy::Incremental,
102        }
103    }
104
105    pub fn is_persistent(&self) -> bool {
106        self.mode == StorageMode::Persistent
107    }
108}
109
/// Origin tag recorded on a collection contract: explicit, implicit or migrated.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum ContractOrigin {
    Explicit,
    Implicit,
    Migrated,
}

impl ContractOrigin {
    /// Returns the fixed lowercase label for this origin
    /// (`"explicit"`, `"implicit"` or `"migrated"`).
    pub fn as_str(self) -> &'static str {
        match self {
            ContractOrigin::Migrated => "migrated",
            ContractOrigin::Implicit => "implicit",
            ContractOrigin::Explicit => "explicit",
        }
    }
}
126
/// Declared contract for a single column. A collection's columns are held in
/// `CollectionContract::declared_columns`.
#[derive(Debug, Clone)]
pub struct DeclaredColumnContract {
    /// Column name.
    pub name: String,
    /// Data type as a string.
    /// NOTE(review): the accepted spellings are defined by the DDL layer, not visible here.
    pub data_type: String,
    /// Optional structured SQL type name.
    pub sql_type: Option<crate::storage::schema::SqlTypeName>,
    /// Whether the column is declared `NOT NULL`.
    pub not_null: bool,
    /// Optional default value, kept as a string.
    /// NOTE(review): presumably the unevaluated default expression — confirm.
    pub default: Option<String>,
    /// Optional compression setting.
    /// NOTE(review): meaning of the `u8` (level vs. codec id) is not visible here — confirm.
    pub compress: Option<u8>,
    /// Whether the column is declared unique.
    pub unique: bool,
    /// Whether the column is (part of) the primary key.
    pub primary_key: bool,
    /// Enum variant names; presumably empty for non-enum columns — confirm.
    pub enum_variants: Vec<String>,
    /// Element type for array columns, when declared.
    pub array_element: Option<String>,
    /// Decimal precision, when declared.
    pub decimal_precision: Option<u8>,
}
141
/// Contract for a single collection: its declared model, schema mode,
/// columns and per-feature settings. Contracts are carried in
/// `PhysicalMetadataFile::collection_contracts`.
#[derive(Debug, Clone)]
pub struct CollectionContract {
    /// Collection name.
    pub name: String,
    /// Collection model declared for this collection.
    pub declared_model: crate::catalog::CollectionModel,
    /// Schema mode of the collection.
    pub schema_mode: crate::catalog::SchemaMode,
    /// Origin tag of this contract (explicit, implicit or migrated).
    pub origin: ContractOrigin,
    /// Contract version number.
    /// NOTE(review): presumably bumped on contract changes — confirm against writers.
    pub version: u32,
    /// Creation time of the contract, Unix milliseconds.
    pub created_at_unix_ms: u128,
    /// Last update time of the contract, Unix milliseconds.
    pub updated_at_unix_ms: u128,
    /// Optional default TTL in milliseconds.
    pub default_ttl_ms: Option<u64>,
    /// Optional vector dimension.
    /// NOTE(review): presumably only set for vector collections — confirm.
    pub vector_dimension: Option<usize>,
    /// Optional vector distance metric.
    pub vector_metric: Option<crate::storage::engine::distance::DistanceMetric>,
    /// Fields named for context indexing; see `context_index_enabled`.
    pub context_index_fields: Vec<String>,
    /// Declared column contracts.
    pub declared_columns: Vec<DeclaredColumnContract>,
    /// Optional table definition.
    pub table_def: Option<crate::storage::schema::TableDef>,
    /// Enabled by `CREATE TABLE ... WITH timestamps = true`. When true,
    /// the runtime auto-populates two user-visible columns
    /// `created_at` + `updated_at` (BIGINT unix-ms) sourced from the
    /// `UnifiedEntity::created_at/updated_at` fields. `created_at` is
    /// immutable after insert; `updated_at` is bumped on every mutation.
    pub timestamps_enabled: bool,
    /// Enabled by `CREATE TABLE ... WITH context_index = true` (or by
    /// naming specific `context_index_fields`). When true, every INSERT
    /// tokenises the row's text fields and populates the global context
    /// index that backs `SEARCH CONTEXT` / `SEARCH SIMILAR TEXT` / `ASK`
    /// (RAG). When false (default), inserts skip the tokenisation +
    /// 3-way RwLock write storm entirely — ~800 ns faster per insert,
    /// and SEARCH returns empty for this collection.
    ///
    /// Opt-in by design: pure OLTP tables (accounts, orders, events)
    /// pay zero indexing tax; search-oriented tables (articles, docs)
    /// flip the switch at CREATE time.
    pub context_index_enabled: bool,
    /// Metrics collections are backed by time-series storage but carry a
    /// metrics-specific raw sample retention contract.
    pub metrics_raw_retention_ms: Option<u64>,
    /// Metrics rollup tiers declared by `CREATE METRICS ... DOWNSAMPLE`.
    pub metrics_rollup_policies: Vec<String>,
    /// Metrics tenant identity source. Defaults to current tenant context and
    /// can be declared as a stable identity path for future ingestion slices.
    pub metrics_tenant_identity: Option<String>,
    /// Metrics namespace identity. v0 starts with a default namespace so
    /// series identity is namespace-aware before Prometheus ingestion exists.
    pub metrics_namespace: Option<String>,
    /// Enabled by `CREATE TABLE ... APPEND ONLY` or `WITH
    /// (append_only = true)`. When true, the runtime rejects
    /// `UPDATE` and `DELETE` against this collection at parse time
    /// with a clear error — the operator's immutability intent
    /// becomes a first-class catalog fact rather than an RLS-shaped
    /// approximation. Default `false` so legacy DDL keeps its
    /// mutable semantics.
    pub append_only: bool,
    /// Declarative subscriptions created by `WITH EVENTS`. This is
    /// metadata only in #291; event emission is wired by the outbox slice.
    pub subscriptions: Vec<crate::catalog::SubscriptionDescriptor>,
    /// Analytics views declared by `CREATE GRAPH ... WITH ANALYTICS (...)`.
    /// Persisted as part of the contract so each enabled `<graph>.<output>`
    /// virtual view survives restarts and crash recovery (issue #800).
    pub analytics_config: Vec<crate::catalog::AnalyticsViewDescriptor>,
    /// `CREATE TIMESERIES ... WITH SESSION_KEY <col>` — the column the
    /// `SESSIONIZE` operator partitions by when no key is supplied at
    /// query-time. `None` for non-timeseries collections and for
    /// timeseries created without the clause. Issue #576 slice 1.
    pub session_key: Option<String>,
    /// `CREATE TIMESERIES ... SESSION_GAP <duration>` — the default
    /// inactivity gap (milliseconds) the `SESSIONIZE` operator uses to
    /// close a session when no gap is supplied at query-time. `None`
    /// for non-timeseries collections and for timeseries created
    /// without the clause. Issue #576 slice 1.
    pub session_gap_ms: Option<u64>,
    /// `ALTER COLLECTION ... SET RETENTION <duration>` — declarative
    /// retention policy in milliseconds. `None` means retention is
    /// not enforced. Reads filter out rows older than `now -
    /// retention_duration_ms` by the collection's timestamp column.
    /// Issue #580 — DeclarativeRetention slice 1.
    pub retention_duration_ms: Option<u64>,
    /// Analytical-storage seam (PRD #850, Phase 1). When present and
    /// `columnar = true`, sealing this collection's hypertable chunks
    /// routes to the columnar `ColumnBlock` writer; `None` (the default)
    /// keeps the row engine. Decodes to `None` on sidecars written before
    /// the feature.
    pub analytical_storage: Option<crate::catalog::AnalyticalStorageConfig>,
    /// Per-collection AI policy declared by `WITH (EMBED|MODERATE|VISION
    /// (...))` (PRD #1267, issue #1271). `None` when no AI clause is
    /// present, and decodes to `None` on sidecars written before the
    /// feature (versioned/migrated with the schema). Validated against
    /// the provider capability matrix (#1269) at DDL execution time.
    pub ai_policy: Option<crate::catalog::AiPolicy>,
}
231
/// Canonical lifecycle states of an artifact.
///
/// Transitions between states:
/// ```text
///   Declared ──► Building ──► Ready ──► Stale ──► RequiresRebuild
///       │            │          │                       │
///       │            ▼          ▼                       │
///       │         Failed    Disabled                    │
///       │            │                                  │
///       └────────────┴──────────────────────────────────┘
///                    (rebuild restarts from Building)
/// ```
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
pub enum ArtifactState {
    /// Declared, but nothing has been materialized yet.
    Declared,
    /// A build or rebuild is in progress.
    Building,
    /// Materialized and available to queries.
    Ready,
    /// Switched off explicitly by the operator.
    Disabled,
    /// Out of date because the underlying data changed.
    Stale,
    /// The build or warmup failed; manual intervention may be required.
    Failed,
    /// Cannot serve reads until it has been rebuilt.
    RequiresRebuild,
}

impl ArtifactState {
    /// Decodes the legacy build-state string kept in physical metadata.
    ///
    /// A disabled artifact is always `Disabled`, whatever `s` contains.
    /// Otherwise the known labels map to their state, several in-progress
    /// labels collapse to `Building`, and any unrecognised label yields
    /// `Declared`.
    pub fn from_build_state(s: &str, enabled: bool) -> Self {
        if enabled {
            match s {
                "requires_rebuild" | "requires-rebuild" => ArtifactState::RequiresRebuild,
                "failed" => ArtifactState::Failed,
                "stale" => ArtifactState::Stale,
                "ready" => ArtifactState::Ready,
                "building"
                | "catalog-derived"
                | "metadata-only"
                | "artifact-published"
                | "registry-loaded" => ArtifactState::Building,
                _ => ArtifactState::Declared,
            }
        } else {
            ArtifactState::Disabled
        }
    }

    /// Canonical label used for storage and API surfaces.
    pub fn as_str(&self) -> &'static str {
        match *self {
            ArtifactState::RequiresRebuild => "requires_rebuild",
            ArtifactState::Failed => "failed",
            ArtifactState::Stale => "stale",
            ArtifactState::Disabled => "disabled",
            ArtifactState::Ready => "ready",
            ArtifactState::Building => "building",
            ArtifactState::Declared => "declared",
        }
    }

    /// `true` only for `Ready`, the single state that may serve query reads.
    pub fn is_queryable(&self) -> bool {
        *self == ArtifactState::Ready
    }

    /// `true` when a rebuild may be started from this state.
    pub fn can_rebuild(&self) -> bool {
        match *self {
            ArtifactState::Declared
            | ArtifactState::Stale
            | ArtifactState::Failed
            | ArtifactState::RequiresRebuild => true,
            ArtifactState::Building | ArtifactState::Ready | ArtifactState::Disabled => false,
        }
    }

    /// `true` when the state signals that the artifact needs attention.
    pub fn needs_attention(&self) -> bool {
        match *self {
            ArtifactState::Stale | ArtifactState::Failed | ArtifactState::RequiresRebuild => true,
            ArtifactState::Declared
            | ArtifactState::Building
            | ArtifactState::Ready
            | ArtifactState::Disabled => false,
        }
    }
}

impl std::fmt::Display for ArtifactState {
    /// Writes the canonical label returned by `as_str`.
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        let label = self.as_str();
        f.write_str(label)
    }
}
316
317#[derive(Debug, Clone)]
318pub struct PhysicalIndexState {
319    pub name: String,
320    pub kind: IndexKind,
321    pub collection: Option<String>,
322    pub enabled: bool,
323    pub entries: usize,
324    pub estimated_memory_bytes: u64,
325    pub last_refresh_ms: Option<u128>,
326    pub backend: String,
327    pub artifact_kind: Option<String>,
328    pub artifact_root_page: Option<u32>,
329    pub artifact_checksum: Option<u64>,
330    pub build_state: String,
331}
332
333impl PhysicalIndexState {
334    /// Canonical artifact lifecycle state derived from physical state.
335    pub fn artifact_state(&self) -> ArtifactState {
336        ArtifactState::from_build_state(&self.build_state, self.enabled)
337    }
338}
339
/// A single persisted hypertable chunk. Mirror of
/// `storage::timeseries::ChunkMeta`, flattened for the metadata
/// sidecar so the registry's routing spine survives a restart
/// (issue #866). `start_ns` plus the owning hypertable name is the
/// chunk's stable identity.
#[derive(Debug, Clone)]
pub struct PhysicalHypertableChunk {
    /// Chunk start; together with the owning hypertable name this is the
    /// chunk's stable identity.
    pub start_ns: u64,
    /// Chunk end.
    /// NOTE(review): name indicates an exclusive upper bound in nanoseconds — confirm against `ChunkMeta`.
    pub end_ns_exclusive: u64,
    /// Number of rows recorded for the chunk.
    pub row_count: u64,
    /// Minimum timestamp recorded for the chunk.
    /// NOTE(review): value for an empty chunk is not visible here — confirm.
    pub min_ts_ns: u64,
    /// Maximum timestamp recorded for the chunk.
    pub max_ts_ns: u64,
    /// Whether the chunk has been sealed.
    pub sealed: bool,
    /// Optional per-chunk TTL override.
    /// NOTE(review): presumably takes precedence over the hypertable's `default_ttl_ns` — confirm.
    pub ttl_override_ns: Option<u64>,
    /// Columnar-vs-row migration discriminant — mirror of
    /// `ChunkMeta.columnar_page` (PRD #850, Phase 1). `Some` → the chunk's
    /// `RDCC` `ColumnBlock` location; `None` → legacy row-stored. Absent on
    /// sidecars written before the feature, decoding to `None`.
    pub columnar_page: Option<crate::storage::engine::PageLocation>,
}
360
/// A persisted hypertable spec plus all of its chunks. Stored in the
/// physical metadata sidecar alongside collection contracts so chunk
/// bounds / routing / TTL are recovered identically after a restart
/// — the same durability path the rest of the catalog already uses,
/// not a parallel one (issue #866).
#[derive(Debug, Clone)]
pub struct PhysicalHypertable {
    /// Hypertable name; part of each chunk's identity.
    pub name: String,
    /// Name of the time column.
    pub time_column: String,
    /// Chunk interval.
    /// NOTE(review): presumably the width of each chunk in nanoseconds — confirm.
    pub chunk_interval_ns: u64,
    /// Optional default TTL for the hypertable.
    pub default_ttl_ns: Option<u64>,
    /// All chunks belonging to this hypertable.
    pub chunks: Vec<PhysicalHypertableChunk>,
}
374
/// Aggregate of the physical metadata: schema manifest, catalog snapshot,
/// manifest events, index states, graph projections, analytics jobs, tree
/// definitions, TTL defaults, collection contracts, hypertables, exports,
/// superblock header and snapshots.
#[derive(Debug, Clone)]
pub struct PhysicalMetadataFile {
    /// Protocol version string.
    /// NOTE(review): presumably `PHYSICAL_METADATA_PROTOCOL_VERSION` when written by this build — confirm.
    pub protocol_version: String,
    /// Generation time, Unix milliseconds.
    pub generated_at_unix_ms: u128,
    /// Label of the source this metadata was last loaded from, if any.
    /// NOTE(review): presumably a `PhysicalMetadataSource::as_str` value — confirm.
    pub last_loaded_from: Option<String>,
    /// Time of the last heal, Unix milliseconds, if any.
    pub last_healed_at_unix_ms: Option<u128>,
    pub manifest: SchemaManifest,
    pub catalog: CatalogSnapshot,
    pub manifest_events: Vec<ManifestEvent>,
    pub indexes: Vec<PhysicalIndexState>,
    pub graph_projections: Vec<PhysicalGraphProjection>,
    pub analytics_jobs: Vec<PhysicalAnalyticsJob>,
    pub tree_definitions: Vec<PhysicalTreeDefinition>,
    /// Default TTLs in milliseconds, keyed by string.
    /// NOTE(review): key is presumably the collection name — confirm.
    pub collection_ttl_defaults_ms: BTreeMap<String, u64>,
    pub collection_contracts: Vec<CollectionContract>,
    /// Persisted hypertable chunk spine (issue #866). Empty on legacy
    /// sidecars written before the feature and for non-hypertable
    /// databases.
    pub hypertables: Vec<PhysicalHypertable>,
    pub exports: Vec<ExportDescriptor>,
    pub superblock: SuperblockHeader,
    pub snapshots: Vec<SnapshotDescriptor>,
}
398
399mod helpers;
400mod json_codec;
401mod metadata_file;
402pub mod shm;
403
404pub use self::shm::{
405    provision_shm, read_shm_header, set_shm_provisioning_enabled, shm_path_for,
406    shm_provisioning_enabled, ShmHandle, ShmHeader, ShmProvisionState, SHM_FILE_SIZE,
407    SHM_HEADER_SIZE, SHM_MAGIC, SHM_VERSION,
408};
409
410use self::helpers::*;
411use self::json_codec::*;