Skip to main content

moltendb_core/engine/
mod.rs

// ─── engine/mod.rs ────────────────────────────────────────────────────────────
// This is the root module of the database engine. It defines the `Db` struct —
// the central object that the rest of the application interacts with.
//
// The Db struct is a thin, cloneable handle to the shared database state.
// Cloning a Db is cheap — it just increments reference counts on the Arcs
// inside. All clones share the same underlying data, so any write made through
// one clone is immediately visible through all others. This is how Axum handler
// functions can each receive their own Db clone via State<> extraction while
// all operating on the same in-memory database.
//
// Internal structure:
//   state         — the actual document data: collection → (key → document state)
//   storage       — the persistence layer (disk, encrypted, or OPFS)
//   tx            — broadcast channel for real-time WebSocket notifications
//   indexes       — field indexes for fast WHERE queries
//   query_heatmap — tracks query frequency for auto-indexing
//   schemas       — per-collection JSON schemas (only with the "schema" feature)
//   hot_threshold, rate_limit_requests, rate_limit_window, max_body_size,
//   post_backup_script — plain settings copied from DbConfig when the Db is opened
//
// The Db struct has two constructors:
//   open()      — native (server) build, opens a disk file
//   open_wasm() — WASM (browser) build, opens an OPFS file
// Both are conditionally compiled with #[cfg(...)] attributes.
// ─────────────────────────────────────────────────────────────────────────────
24
25// Declare the sub-modules of the engine.
26mod types;      // LogEntry, DbError
27mod indexing;   // index_doc, unindex_doc, track_query, create_index
28mod storage;    // StorageBackend trait + concrete implementations
29mod config;     // DbConfig struct
30#[cfg(feature = "schema")]
31mod schema;     // JSON Schema validation
32mod operations; // get, get_all, insert_batch, update, delete, etc.
33
34// Re-export LogEntry so it can be used by tests and other crates.
35pub use types::{DbError, LogEntry};
36// Re-export DbConfig
37pub use config::DbConfig;
38// Re-export the StorageBackend trait so callers can use it without knowing
39// the internal module structure.
40pub use storage::{StorageBackend, EncryptedStorage};
41#[cfg(not(target_arch = "wasm32"))]
42pub use storage::{AsyncDiskStorage, SyncDiskStorage};
43
44// DashMap = concurrent hash map. DashSet = concurrent hash set.
45use dashmap::{DashMap, DashSet};
46use tracing::{info};
47// Value = dynamically-typed JSON value.
48use serde_json::Value;
49// Standard HashMap — used for return values from get operations.
50use std::collections::HashMap;
51// Arc = thread-safe reference-counted pointer.
52// Wrapping fields in Arc allows Db to be cheaply cloned — all clones share
53// the same underlying data.
54use std::ops::ControlFlow;
55use std::sync::Arc;
56// Tokio's broadcast channel: one sender, many receivers.
57// Used to push real-time change notifications to WebSocket subscribers.
58use tokio::sync::broadcast;
59
60/// The central database handle. Cheap to clone — all clones share the same state.
61///
62/// This struct is the public API of the engine. All database operations go
63/// through methods on this struct, which delegate to the operations module.
64#[derive(Clone)]
65pub struct Db {
66    /// The main document store.
67    /// Outer map: collection name (e.g. "users") → inner map.
68    /// Inner map: document key (e.g. "u1") → Hybrid Hot/Cold document state.
69    /// DashMap allows concurrent reads and writes from multiple threads.
70    state: Arc<DashMap<String, DashMap<String, crate::engine::types::DocumentState>>>,
71
72    /// The storage backend — handles persistence to disk or OPFS.
73    /// `pub` so handlers can access it directly if needed (e.g. for compaction).
74    /// `Arc<dyn StorageBackend>` = shared pointer to any type implementing the trait.
75    pub storage: Arc<dyn StorageBackend>,
76
77    /// Broadcast channel sender for real-time change notifications.
78    /// When a document is inserted, updated, or deleted, a JSON event is sent
79    /// on this channel. WebSocket handlers subscribe to receive these events.
80    /// `pub` so the WebSocket handler in main.rs can call subscribe().
81    pub tx: broadcast::Sender<String>,
82
83    /// The index store.
84    /// Key format: "collection:field" (e.g. "users:role").
85    /// Value: field_value → set of document keys with that value.
86    /// e.g. "users:role" → { "admin" → {"u1"}, "user" → {"u2", "u3"} }
87    /// `pub` so handlers.rs can check for index existence directly.
88    pub indexes: Arc<DashMap<String, DashMap<String, DashSet<String>>>>,
89
90    /// Query frequency counter for auto-indexing.
91    /// Key: "collection:field". Value: number of times queried.
92    /// When a field reaches 3 queries, an index is auto-created.
93    pub query_heatmap: Arc<DashMap<String, u32>>,
94
95    /// The maximum number of documents per collection to keep in RAM (Hot).
96    /// If a collection exceeds this, older documents are paged out to disk (Cold).
97    /// Default is 50,000.
98    pub hot_threshold: usize,
99
100    /// Max requests per window.
101    pub rate_limit_requests: u32,
102
103    /// Window size in seconds.
104    pub rate_limit_window: u64,
105
106    /// Maximum request body size in bytes.
107    pub max_body_size: usize,
108
109    /// Registered JSON schemas per collection.
110    /// Key: collection name → Value: (Original JSON, Compiled Validator).
111    #[cfg(feature = "schema")]
112    pub schemas: Arc<DashMap<String, Arc<(Value, jsonschema::Validator)>>>,
113
114    /// Optional shell command to execute after a successful backup.
115    /// Supports the {SNAPSHOT_PATH} placeholder.
116    pub post_backup_script: Option<String>,
117}
118
119impl Db {
120    /// Open (or create) a database at the given file path.
121    /// Only available on native (non-WASM) builds.
122    ///
123    /// `sync_mode`      — if true, use SyncDiskStorage (flush on every write).
124    ///                    if false, use AsyncDiskStorage (flush every 50ms).
125    ///                    Ignored when `tiered_mode` is true.
126    /// `tiered_mode`    — if true, use TieredStorage (hot + cold two-tier backend).
127    ///                    Hot writes go to the active log; cold data is archived and
128    ///                    read via mmap on startup. Best for large datasets (100k+ docs).
129    ///                    Enable with STORAGE_MODE=tiered environment variable.
130    /// `encryption_key` — if Some, wrap the storage in EncryptedStorage.
131    ///                    if None, data is stored in plaintext (not recommended).
132    #[cfg(not(target_arch = "wasm32"))]
133    pub fn open(config: DbConfig) -> Result<Self, DbError> {
134        let path = &config.path;
135        let sync_mode = config.sync_mode;
136        let tiered_mode = config.tiered_mode;
137        let hot_threshold = config.hot_threshold;
138        let rate_limit_requests = config.rate_limit_requests;
139        let rate_limit_window = config.rate_limit_window;
140        let max_body_size = config.max_body_size;
141        let encryption_key = config.encryption_key;
142        let post_backup_script = config.post_backup_script;
143
144        // Create the shared in-memory state containers.
145        let state = Arc::new(DashMap::new());
146        // Create the broadcast channel with a buffer of 100 messages.
147        // If the buffer fills up (no subscribers reading), old messages are dropped.
148        let (tx, _rx) = broadcast::channel(100);
149        let indexes: Arc<DashMap<String, DashMap<String, DashSet<String>>>> =
150            Arc::new(Default::default());
151        let query_heatmap = Arc::new(Default::default());
152        #[cfg(feature = "schema")]
153        let schemas = Arc::new(DashMap::new());
154
155        // Ensure the parent directory exists.
156        if let Some(parent) = std::path::Path::new(path).parent() {
157            std::fs::create_dir_all(parent)?;
158        }
159
160        // Choose the base storage backend based on the configured mode.
161        //
162        //   tiered_mode = true  → TieredStorage: hot log (async writes) + cold log
163        //                         (mmap reads). Best for large datasets. The cold log
164        //                         accumulates promoted hot data and is paged by the OS.
165        //
166        //   sync_mode = true    → SyncDiskStorage: every write is flushed to disk
167        //                         immediately. Zero data loss, lower throughput.
168        //
169        //   default             → AsyncDiskStorage: writes buffered in memory, flushed
170        //                         every 50ms. Highest throughput, up to 50ms data loss.
171        let base_storage: Arc<dyn StorageBackend> = if tiered_mode {
172            Arc::new(storage::TieredStorage::new(path)?)
173        } else if sync_mode {
174            Arc::new(storage::SyncDiskStorage::new(path)?)
175        } else {
176            Arc::new(storage::AsyncDiskStorage::new(path)?)
177        };
178
179        // Optionally wrap the base storage in EncryptedStorage.
180        // EncryptedStorage is transparent — it encrypts on write and decrypts
181        // on read, so the rest of the engine doesn't know encryption is happening.
182        let storage: Arc<dyn StorageBackend> = if let Some(key) = encryption_key {
183            Arc::new(storage::EncryptedStorage::new(base_storage, &key))
184        } else {
185            base_storage
186        };
187
188        // Replay the log (or snapshot + delta) into the in-memory state.
189        // After this call, `state` and `indexes` reflect the persisted data.
190        storage::stream_into_state(
191            &*storage,
192            &state,
193            &indexes,
194            #[cfg(feature = "schema")] &schemas,
195        )?;
196
197        Ok(Self {
198            state,
199            storage,
200            tx,
201            indexes,
202            query_heatmap,
203            hot_threshold,
204            rate_limit_requests,
205            rate_limit_window,
206            max_body_size,
207            #[cfg(feature = "schema")]
208            schemas,
209            post_backup_script,
210        })
211    }
212
213    /// Open (or create) a database in the browser using OPFS.
214    /// Only available on WASM builds. Async because OPFS APIs return Promises.
215    ///
216    /// `db_name` — the filename in the OPFS root directory (e.g. "analytics_db").
217    #[cfg(target_arch = "wasm32")]
218    pub async fn open_wasm(config: DbConfig) -> Result<Self, DbError> {
219        let db_name = &config.path;
220        let hot_threshold = config.hot_threshold;
221        let rate_limit_requests = config.rate_limit_requests;
222        let rate_limit_window = config.rate_limit_window;
223        let max_body_size = config.max_body_size;
224        let encryption_key = config.encryption_key;
225        let sync_mode = config.sync_mode;
226        let post_backup_script = config.post_backup_script;
227
228        let state = Arc::new(DashMap::new());
229        let (tx, _rx) = broadcast::channel(100);
230        let indexes: Arc<DashMap<String, DashMap<String, DashSet<String>>>> =
231            Arc::new(Default::default());
232        let query_heatmap = Arc::new(Default::default());
233        #[cfg(feature = "schema")]
234        let schemas = Arc::new(DashMap::new());
235
236        // Open the OPFS file. This is async because the browser's OPFS API
237        // uses Promises which we must await.
238        let mut storage: Arc<dyn StorageBackend> =
239            Arc::new(storage::OpfsStorage::new(db_name, sync_mode).await?);
240
241        // Apply encryption wrapper if a key is provided.
242        if let Some(key) = encryption_key {
243            storage = Arc::new(storage::EncryptedStorage::new(storage, &key));
244        }
245
246        // Replay the log into the in-memory state.
247        storage::stream_into_state(
248            &*storage,
249            &state,
250            &indexes,
251            #[cfg(feature = "schema")] &schemas,
252        )?;
253
254        Ok(Self {
255            state,
256            storage,
257            tx,
258            indexes,
259            query_heatmap,
260            hot_threshold,
261            rate_limit_requests,
262            rate_limit_window,
263            max_body_size,
264            #[cfg(feature = "schema")]
265            schemas,
266            post_backup_script,
267        })
268    }
269
270    /// Create a new broadcast receiver for real-time change notifications.
271    /// Each call returns an independent receiver — multiple WebSocket handlers
272    /// can each subscribe and receive all events independently.
273    pub fn subscribe(&self) -> broadcast::Receiver<String> {
274        self.tx.subscribe()
275    }
276
277    /// Retrieve a single document by key. Returns None if not found.
278    pub fn get(&self, collection: &str, key: &str) -> Option<Value> {
279        operations::get(&self.state, &self.storage, collection, key)
280    }
281
282    /// Retrieve all documents in a collection as a HashMap.
283    pub fn get_all(&self, collection: &str) -> HashMap<String, Value> {
284        operations::get_all(&self.state, &self.storage, collection)
285    }
286
287    /// Retrieve a specific set of documents by their keys.
288    pub fn get_batch(&self, collection: &str, keys: Vec<String>) -> HashMap<String, Value> {
289        operations::get_batch(&self.state, &self.storage, collection, keys)
290    }
291
292    /// Insert or overwrite multiple documents in one call.
293    /// Each item is a (key, value) pair. Writes are persisted to storage.
294    pub fn insert_batch(&self, collection: &str, items: Vec<(String, Value)>) -> Result<(), DbError> {
295        operations::insert_batch(
296            &self.state,
297            &self.indexes,
298            &self.storage,
299            &self.tx,
300            #[cfg(feature = "schema")] &self.schemas,
301            collection,
302            items,
303        )?;
304
305        // Auto-evict if the collection exceeds the threshold.
306        let _ = self.evict_collection(collection, self.hot_threshold);
307        Ok(())
308    }
309
310    /// Partially update a document — merges `updates` into the existing document.
311    /// Returns true if the document was found and updated, false if not found.
312    pub fn update(&self, collection: &str, key: &str, updates: Value) -> Result<bool, DbError> {
313        let updated = operations::update(
314            &self.state,
315            &self.indexes,
316            &self.storage,
317            &self.tx,
318            #[cfg(feature = "schema")] &self.schemas,
319            collection,
320            key,
321            updates,
322        )?;
323
324        if updated {
325            // Auto-evict if the collection exceeds the threshold.
326            let _ = self.evict_collection(collection, self.hot_threshold);
327        }
328        Ok(updated)
329    }
330
331    /// Delete a single document by key.
332    pub fn delete(&self, collection: &str, key: &str) -> Result<(), DbError> {
333        operations::delete(
334            &self.state,
335            &self.indexes,
336            &self.storage,
337            &self.tx,
338            collection,
339            key,
340        )
341    }
342
343    /// Delete multiple documents by key in one call.
344    pub fn delete_batch(&self, collection: &str, keys: Vec<String>) -> Result<(), DbError> {
345        operations::delete_batch(
346            &self.state,
347            &self.indexes,
348            &self.storage,
349            &self.tx,
350            collection,
351            keys,
352        )
353    }
354
355    /// Drop an entire collection — removes all documents and its indexes.
356    pub fn delete_collection(&self, collection: &str) -> Result<(), DbError> {
357        operations::delete_collection(
358            &self.state,
359            &self.indexes,
360            &self.storage,
361            &self.tx,
362            collection,
363        )
364    }
365
366    /// Track that `field` was queried in `collection` and auto-create an index
367    /// if this field has been queried 3 or more times.
368    /// Errors are silently ignored — auto-indexing is best-effort.
369    pub fn track_query(&self, collection: &str, field: &str) {
370        // The `let _ =` discards the Result — a failed auto-index is not fatal.
371        let _ = indexing::track_query(
372            &self.indexes,
373            &self.query_heatmap,
374            collection,
375            field,
376            &self.storage,
377            &self.state,
378        );
379    }
380
381    /// Register a JSON schema for a collection.
382    /// All subsequent writes to this collection must conform to this schema.
383    #[cfg(feature = "schema")]
384    pub fn set_schema(&self, collection: &str, schema: Value) -> Result<(), DbError> {
385        schema::set_schema(
386            &self.schemas,
387            &self.storage,
388            &self.tx,
389            collection,
390            schema
391        )
392    }
393    
394    /// Compact the log file — rewrite it to contain only the current state.
395    ///
396    /// This removes all dead entries (superseded INSERTs, DELETE tombstones)
397    /// and writes a binary snapshot for fast next startup.
398    ///
399    /// The compacted log contains:
400    ///   - One INSERT entry per live document (current value only).
401    ///   - One INDEX entry per registered index (index data is rebuilt on replay).
402    pub fn compact(&self) -> Result<(), DbError> {
403        info!("🔨 Starting Log Compaction...");
404
405        // Build the minimal set of entries representing the current state.
406        let mut entries = Vec::new();
407
408        // One INSERT per live document across all collections.
409        for col_ref in self.state.iter() {
410            let col_name = col_ref.key();
411            for item_ref in col_ref.value().iter() {
412                // To compact, we need the full Value. If it's Cold, we fetch it from storage.
413                let entry = match item_ref.value() {
414                    crate::engine::types::DocumentState::Hot(v) => {
415                        types::LogEntry::new(
416                            "INSERT".to_string(),
417                            col_name.clone(),
418                            item_ref.key().clone(),
419                            v.clone(),
420                        )
421                    }
422                    crate::engine::types::DocumentState::Cold(ptr) => {
423                        let bytes = self.storage.read_at(ptr.offset, ptr.length)?;
424                        serde_json::from_slice(&bytes)?
425                    }
426                };
427                entries.push(entry);
428            }
429        }
430
431        // One SCHEMA entry per collection.
432        #[cfg(feature = "schema")]
433        for schema_ref in self.schemas.iter() {
434            let col_name = schema_ref.key();
435            let (schema_json, _) = &**schema_ref.value();
436            entries.push(types::LogEntry::new(
437                "SCHEMA".to_string(),
438                col_name.clone(),
439                "".to_string(),
440                schema_json.clone(),
441            ));
442        }
443
444        // One INDEX entry per registered index.
445        // The index name format is "collection:field" — we split it to get both parts.
446        for index_ref in self.indexes.iter() {
447            let parts: Vec<&str> = index_ref.key().split(':').collect();
448            if parts.len() == 2 {
449                entries.push(types::LogEntry::new(
450                    "INDEX".to_string(),
451                    parts[0].to_string(),
452                    parts[1].to_string(),       // field name
453                    serde_json::json!(null),
454                ));
455            }
456        }
457
458        // Delegate the actual file rewrite (and snapshot write) to the storage backend.
459        self.storage.compact_with_hook(entries.clone(), self.post_backup_script.clone())?;
460
461        // After compaction the log is rewritten and all old RecordPointers are invalid.
462        // Promote every Cold entry in the in-memory state to Hot so subsequent reads
463        // don't try to seek to stale byte offsets in the now-truncated log file.
464        for entry in &entries {
465            if entry.cmd == "INSERT" {
466                if let Some(col) = self.state.get(&entry.collection) {
467                    if let Some(mut doc) = col.get_mut(&entry.key) {
468                        if matches!(*doc, crate::engine::types::DocumentState::Cold(_)) {
469                            *doc = crate::engine::types::DocumentState::Hot(entry.value.clone());
470                        }
471                    }
472                }
473            }
474        }
475
476        info!("✅ Log Compaction Finished!");
477        Ok(())
478    }
479
480    /// Evict documents from RAM to disk for a collection if it exceeds the threshold.
481    ///
482    /// This converts `Hot(Value)` entries into `Cold(RecordPointer)` entries.
483    /// In this v1, it re-scans the log to find the exact byte offsets for the documents.
484    pub fn evict_collection(&self, collection: &str, limit: usize) -> Result<usize, DbError> {
485        let col_len = if let Some(col) = self.state.get(collection) {
486            col.len()
487        } else {
488            return Err(DbError::CollectionNotFound);
489        };
490
491        if col_len <= limit {
492            return Ok(0);
493        }
494
495        let mut evicted_count = 0;
496        let mut offset = 0u64;
497        let to_evict = col_len - limit;
498
499        // To evict properly, we need the pointers. Since we don't store them for
500        // Hot documents, we re-scan the log to find them.
501        self.storage.stream_log_into(&mut |entry, length| {
502            if entry.collection == collection {
503                if evicted_count < to_evict {
504                    if let Some(col) = self.state.get(collection) {
505                        if let Some(mut doc_state) = col.get_mut(&entry.key) {
506                            if let crate::engine::types::DocumentState::Hot(_) = *doc_state {
507                                *doc_state = crate::engine::types::DocumentState::Cold(crate::engine::types::RecordPointer {
508                                    offset,
509                                    length,
510                                });
511                                evicted_count += 1;
512                            }
513                        }
514                    }
515                }
516            }
517            offset += (length + 1) as u64;
518            ControlFlow::Continue(())
519        })?;
520
521        Ok(evicted_count)
522    }
523
524    /// Recover the database state to a specific point in time or sequence number.
525    /// Returns the recovered state as a Vec of LogEntries that can be written to a snapshot.
526    ///
527    /// This is a utility function used by the CLI for PITR.
528    #[cfg(not(target_arch = "wasm32"))]
529    pub fn recover_to(
530        storage: &dyn StorageBackend,
531        to_time: Option<u64>,
532        to_seq: Option<u64>,
533    ) -> Result<Vec<LogEntry>, DbError> {
534        let state: DashMap<String, DashMap<String, crate::engine::types::DocumentState>> = DashMap::new();
535        let indexes: DashMap<String, DashMap<String, DashSet<String>>> = DashMap::new();
536        #[cfg(feature = "schema")]
537        let schemas: DashMap<String, Arc<(serde_json::Value, jsonschema::Validator)>> = DashMap::new();
538
539            let mut offset = 0u64;
540            let mut count = 0u64;
541            let mut current_tx_entries = Vec::new();
542            let mut current_tx_id = None;
543            
544            storage.stream_log_into(&mut |entry, length| {
545                // Condition 1: Check Timestamp
546                if let Some(t) = to_time {
547                    if entry._t > t {
548                        return ControlFlow::Break(());
549                    }
550                }
551    
552                // Condition 2: Check Sequence
553                if let Some(s) = to_seq {
554                    if count >= s {
555                        return ControlFlow::Break(());
556                    }
557                }
558
559            let pointer = crate::engine::types::RecordPointer {
560                offset,
561                length,
562            };
563
564            match entry.cmd.as_str() {
565                "TX_BEGIN" => {
566                    current_tx_id = Some(entry.key.clone());
567                    current_tx_entries.clear();
568                }
569                "TX_COMMIT" => {
570                    if current_tx_id.as_ref() == Some(&entry.key) {
571                        for (e, p) in current_tx_entries.drain(..) {
572                            crate::engine::storage::apply_entry(
573                                &e,
574                                &state,
575                                &indexes,
576                                #[cfg(feature = "schema")] &schemas,
577                                Some(p),
578                            );
579                        }
580                        current_tx_id = None;
581                    }
582                }
583                _ => {
584                    if current_tx_id.is_some() {
585                        current_tx_entries.push((entry, pointer));
586                    } else {
587                        crate::engine::storage::apply_entry(
588                            &entry,
589                            &state,
590                            &indexes,
591                            #[cfg(feature = "schema")] &schemas,
592                            Some(pointer),
593                        );
594                    }
595                }
596            }
597
598            count += 1;
599            offset += (length + 1) as u64;
600            ControlFlow::Continue(())
601        })?;
602
603        // Convert the recovered state into LogEntries (similar to compact logic)
604        let mut entries = Vec::new();
605        for col_ref in state.iter() {
606            let col_name = col_ref.key();
607            for item_ref in col_ref.value().iter() {
608                let entry = match item_ref.value() {
609                    crate::engine::types::DocumentState::Hot(v) => {
610                        LogEntry::new(
611                            "INSERT".to_string(),
612                            col_name.clone(),
613                            item_ref.key().clone(),
614                            v.clone(),
615                        )
616                    }
617                    crate::engine::types::DocumentState::Cold(ptr) => {
618                        let bytes = storage.read_at(ptr.offset, ptr.length).unwrap_or_default();
619                        serde_json::from_slice(&bytes).unwrap_or_else(|_| {
620                            LogEntry::new("INSERT".to_string(), col_name.clone(), item_ref.key().clone(), serde_json::Value::Null)
621                        })
622                    }
623                };
624                entries.push(entry);
625            }
626        }
627
628        #[cfg(feature = "schema")]
629        for schema_ref in schemas.iter() {
630            let col_name = schema_ref.key();
631            let (schema_json, _) = &**schema_ref.value();
632            entries.push(LogEntry::new(
633                "SCHEMA".to_string(),
634                col_name.clone(),
635                "".to_string(),
636                schema_json.clone(),
637            ));
638        }
639
640        for index_ref in indexes.iter() {
641            let parts: Vec<&str> = index_ref.key().split(':').collect();
642            if parts.len() == 2 {
643                entries.push(LogEntry::new(
644                    "INDEX".to_string(),
645                    parts[0].to_string(),
646                    parts[1].to_string(),
647                    serde_json::json!(null),
648                ));
649            }
650        }
651
652        Ok(entries)
653    }
654}