moltendb_core/engine/mod.rs
// ─── engine/mod.rs ────────────────────────────────────────────────────────────
// This is the root module of the database engine. It defines the `Db` struct —
// the central object that the rest of the application interacts with.
//
// The Db struct is a thin, cloneable handle to the shared database state.
// Cloning a Db is cheap — it just increments reference counts on the Arcs
// inside. All clones share the same underlying data, so any write made through
// one clone is immediately visible through all others. This is how Axum handler
// functions can each receive their own Db clone via State<> extraction while
// all operating on the same in-memory database.
//
// Internal structure:
//   state         — the actual document data: collection → (key → JSON value)
//   storage       — the persistence layer (disk, encrypted, or OPFS)
//   tx            — broadcast channel for real-time WebSocket notifications
//   indexes       — field indexes for fast WHERE queries
//   query_heatmap — tracks query frequency for auto-indexing
//
// The Db struct has two constructors:
//   open()      — native (server) build, opens a disk file
//   open_wasm() — WASM (browser) build, opens an OPFS file
// Both are conditionally compiled with #[cfg(...)] attributes.
// ─────────────────────────────────────────────────────────────────────────────
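
// A minimal sketch of the clone-and-share pattern described above (given an
// opened `db: Db`; the collection, key, and value are hypothetical):
//
//     let handle = db.clone();           // cheap: only Arc refcounts bump
//     handle.insert_batch("users", vec![
//         ("u1".to_string(), serde_json::json!({ "role": "admin" })),
//     ])?;
//     assert!(db.get("users", "u1").is_some()); // visible through every clone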

// Declare the sub-modules of the engine.
mod types;      // LogEntry, DbError
mod indexing;   // index_doc, unindex_doc, track_query, create_index
mod storage;    // StorageBackend trait + concrete implementations
mod config;     // DbConfig struct
#[cfg(feature = "schema")]
mod schema;     // JSON Schema validation
mod operations; // get, get_all, insert_batch, update, delete, etc.

// Re-export LogEntry and DbError so they can be used by tests and other crates.
pub use types::{DbError, LogEntry};
// Re-export DbConfig.
pub use config::DbConfig;
// Re-export the StorageBackend trait (and the encryption wrapper) so callers
// can use them without knowing the internal module structure.
pub use storage::{EncryptedStorage, StorageBackend};
#[cfg(not(target_arch = "wasm32"))]
pub use storage::{AsyncDiskStorage, SyncDiskStorage};

// DashMap = concurrent hash map. DashSet = concurrent hash set.
use dashmap::{DashMap, DashSet};
// Value = dynamically-typed JSON value.
use serde_json::Value;
// Standard HashMap — used for return values from get operations.
use std::collections::HashMap;
// ControlFlow — lets the log-streaming callbacks below signal whether to
// continue or stop replaying entries.
use std::ops::ControlFlow;
// Arc = thread-safe reference-counted pointer.
// Wrapping fields in Arc allows Db to be cheaply cloned — all clones share
// the same underlying data.
use std::sync::Arc;
// Tokio's broadcast channel: one sender, many receivers.
// Used to push real-time change notifications to WebSocket subscribers.
use tokio::sync::broadcast;
use tracing::info;

/// The central database handle. Cheap to clone — all clones share the same state.
///
/// This struct is the public API of the engine. All database operations go
/// through methods on this struct, which delegate to the operations module.
#[derive(Clone)]
pub struct Db {
    /// The main document store.
    /// Outer map: collection name (e.g. "users") → inner map.
    /// Inner map: document key (e.g. "u1") → hybrid Hot/Cold document state.
    /// DashMap allows concurrent reads and writes from multiple threads.
    state: Arc<DashMap<String, DashMap<String, crate::engine::types::DocumentState>>>,

    /// The storage backend — handles persistence to disk or OPFS.
    /// `pub` so handlers can access it directly if needed (e.g. for compaction).
    /// `Arc<dyn StorageBackend>` = shared pointer to any type implementing the trait.
    pub storage: Arc<dyn StorageBackend>,

    /// Broadcast channel sender for real-time change notifications.
    /// When a document is inserted, updated, or deleted, a JSON event is sent
    /// on this channel. WebSocket handlers subscribe to receive these events.
    /// `pub` so the WebSocket handler in main.rs can call subscribe().
    pub tx: broadcast::Sender<String>,

    /// The index store.
    /// Key format: "collection:field" (e.g. "users:role").
    /// Value: field_value → set of document keys with that value.
    /// e.g. "users:role" → { "admin" → {"u1"}, "user" → {"u2", "u3"} }
    /// `pub` so handlers.rs can check for index existence directly.
    pub indexes: Arc<DashMap<String, DashMap<String, DashSet<String>>>>,

    /// Query frequency counter for auto-indexing.
    /// Key: "collection:field". Value: number of times queried.
    /// When a field reaches 3 queries, an index is auto-created.
    pub query_heatmap: Arc<DashMap<String, u32>>,

    /// The maximum number of documents per collection to keep in RAM (Hot).
    /// If a collection exceeds this, older documents are paged out to disk (Cold).
    /// Default is 50,000.
    pub hot_threshold: usize,

    /// Max requests per rate-limit window.
    pub rate_limit_requests: u32,

    /// Rate-limit window size in seconds.
    pub rate_limit_window: u64,

    /// Maximum request body size in bytes.
    pub max_body_size: usize,

    /// Registered JSON schemas per collection.
    /// Key: collection name → Value: (original JSON, compiled validator).
    #[cfg(feature = "schema")]
    pub schemas: Arc<DashMap<String, Arc<(Value, jsonschema::Validator)>>>,

    /// Optional shell command to execute after a successful backup.
    /// Supports the {SNAPSHOT_PATH} placeholder.
    pub post_backup_script: Option<String>,
}
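
// A minimal sketch (given an opened `db: Db`; collection, field, and values
// hypothetical) of how the `indexes` layout above answers an equality query
// without a full collection scan:
//
//     // WHERE role == "admin" on collection "users":
//     if let Some(index) = db.indexes.get("users:role") {
//         if let Some(keys) = index.get("admin") {
//             for key in keys.iter() {
//                 // fetch each matching document via db.get("users", &key)
//             }
//         }
//     }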

impl Db {
    /// Open (or create) a database at the given file path.
    /// Only available on native (non-WASM) builds.
    ///
    /// `sync_mode`      — if true, use SyncDiskStorage (flush on every write).
    ///                    if false, use AsyncDiskStorage (flush every 50ms).
    ///                    Ignored when `tiered_mode` is true.
    /// `tiered_mode`    — if true, use TieredStorage (hot + cold two-tier backend).
    ///                    Hot writes go to the active log; cold data is archived and
    ///                    read via mmap on startup. Best for large datasets (100k+ docs).
    ///                    Enable with the STORAGE_MODE=tiered environment variable.
    /// `encryption_key` — if Some, wrap the storage in EncryptedStorage.
    ///                    if None, data is stored in plaintext (not recommended).
    #[cfg(not(target_arch = "wasm32"))]
    pub fn open(config: DbConfig) -> Result<Self, DbError> {
        let path = &config.path;
        let sync_mode = config.sync_mode;
        let tiered_mode = config.tiered_mode;
        let hot_threshold = config.hot_threshold;
        let rate_limit_requests = config.rate_limit_requests;
        let rate_limit_window = config.rate_limit_window;
        let max_body_size = config.max_body_size;
        let encryption_key = config.encryption_key;
        let post_backup_script = config.post_backup_script;

        // Create the shared in-memory state containers.
        let state = Arc::new(DashMap::new());
        // Create the broadcast channel with a buffer of 100 messages.
        // If the buffer fills up (no subscribers reading), old messages are dropped.
        let (tx, _rx) = broadcast::channel(100);
        let indexes: Arc<DashMap<String, DashMap<String, DashSet<String>>>> =
            Arc::new(Default::default());
        let query_heatmap = Arc::new(Default::default());
        #[cfg(feature = "schema")]
        let schemas = Arc::new(DashMap::new());

        // Ensure the parent directory exists.
        if let Some(parent) = std::path::Path::new(path).parent() {
            std::fs::create_dir_all(parent)?;
        }

        // Choose the base storage backend based on the configured mode.
        //
        // tiered_mode = true → TieredStorage: hot log (async writes) + cold log
        //                      (mmap reads). Best for large datasets. The cold log
        //                      accumulates promoted hot data and is paged by the OS.
        //
        // sync_mode = true   → SyncDiskStorage: every write is flushed to disk
        //                      immediately. Zero data loss, lower throughput.
        //
        // default            → AsyncDiskStorage: writes buffered in memory, flushed
        //                      every 50ms. Highest throughput, up to 50ms data loss.
        let base_storage: Arc<dyn StorageBackend> = if tiered_mode {
            Arc::new(storage::TieredStorage::new(path)?)
        } else if sync_mode {
            Arc::new(storage::SyncDiskStorage::new(path)?)
        } else {
            Arc::new(storage::AsyncDiskStorage::new(path)?)
        };

        // Optionally wrap the base storage in EncryptedStorage.
        // EncryptedStorage is transparent — it encrypts on write and decrypts
        // on read, so the rest of the engine doesn't know encryption is happening.
        let storage: Arc<dyn StorageBackend> = if let Some(key) = encryption_key {
            Arc::new(storage::EncryptedStorage::new(base_storage, &key))
        } else {
            base_storage
        };

        // Replay the log (or snapshot + delta) into the in-memory state.
        // After this call, `state` and `indexes` reflect the persisted data.
        storage::stream_into_state(
            &*storage,
            &state,
            &indexes,
            #[cfg(feature = "schema")] &schemas,
        )?;

        Ok(Self {
            state,
            storage,
            tx,
            indexes,
            query_heatmap,
            hot_threshold,
            rate_limit_requests,
            rate_limit_window,
            max_body_size,
            #[cfg(feature = "schema")]
            schemas,
            post_backup_script,
        })
    }
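
    // A minimal usage sketch (path and settings hypothetical; see config.rs
    // for the full set of DbConfig fields):
    //
    //     let config = DbConfig {
    //         path: "data/app.db".to_string(),
    //         sync_mode: false,      // buffered writes, flushed every 50ms
    //         tiered_mode: false,
    //         encryption_key: None,  // or Some(key) to encrypt at rest
    //         // ...remaining fields (thresholds, rate limits) as configured
    //     };
    //     let db = Db::open(config)?;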

    /// Open (or create) a database in the browser using OPFS.
    /// Only available on WASM builds. Async because OPFS APIs return Promises.
    ///
    /// `db_name` — the filename in the OPFS root directory (e.g. "analytics_db").
    #[cfg(target_arch = "wasm32")]
    pub async fn open_wasm(config: DbConfig) -> Result<Self, DbError> {
        let db_name = &config.path;
        let hot_threshold = config.hot_threshold;
        let rate_limit_requests = config.rate_limit_requests;
        let rate_limit_window = config.rate_limit_window;
        let max_body_size = config.max_body_size;
        let encryption_key = config.encryption_key;
        let sync_mode = config.sync_mode;
        let post_backup_script = config.post_backup_script;

        let state = Arc::new(DashMap::new());
        let (tx, _rx) = broadcast::channel(100);
        let indexes: Arc<DashMap<String, DashMap<String, DashSet<String>>>> =
            Arc::new(Default::default());
        let query_heatmap = Arc::new(Default::default());
        #[cfg(feature = "schema")]
        let schemas = Arc::new(DashMap::new());

        // Open the OPFS file. This is async because the browser's OPFS API
        // uses Promises which we must await.
        let mut storage: Arc<dyn StorageBackend> =
            Arc::new(storage::OpfsStorage::new(db_name, sync_mode).await?);

        // Apply the encryption wrapper if a key is provided.
        if let Some(key) = encryption_key {
            storage = Arc::new(storage::EncryptedStorage::new(storage, &key));
        }

        // Replay the log into the in-memory state.
        storage::stream_into_state(
            &*storage,
            &state,
            &indexes,
            #[cfg(feature = "schema")] &schemas,
        )?;

        Ok(Self {
            state,
            storage,
            tx,
            indexes,
            query_heatmap,
            hot_threshold,
            rate_limit_requests,
            rate_limit_window,
            max_body_size,
            #[cfg(feature = "schema")]
            schemas,
            post_backup_script,
        })
    }
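
    // Browser-side sketch (config values hypothetical, called from a
    // wasm-bindgen entry point); the database file lives in the OPFS root:
    //
    //     let db = Db::open_wasm(config).await?;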

    /// Create a new broadcast receiver for real-time change notifications.
    /// Each call returns an independent receiver — multiple WebSocket handlers
    /// can each subscribe and receive all events independently.
    pub fn subscribe(&self) -> broadcast::Receiver<String> {
        self.tx.subscribe()
    }
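
    // A minimal subscriber sketch (task body hypothetical). Each receiver
    // independently sees every event published after it subscribes; a receiver
    // that falls more than 100 events behind observes RecvError::Lagged.
    //
    //     let mut rx = db.subscribe();
    //     tokio::spawn(async move {
    //         while let Ok(event) = rx.recv().await {
    //             // forward the JSON event string to the WebSocket client
    //         }
    //     });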

    /// Retrieve a single document by key. Returns None if not found.
    pub fn get(&self, collection: &str, key: &str) -> Option<Value> {
        operations::get(&self.state, &self.storage, collection, key)
    }

    /// Retrieve all documents in a collection as a HashMap.
    pub fn get_all(&self, collection: &str) -> HashMap<String, Value> {
        operations::get_all(&self.state, &self.storage, collection)
    }

    /// Retrieve a specific set of documents by their keys.
    pub fn get_batch(&self, collection: &str, keys: Vec<String>) -> HashMap<String, Value> {
        operations::get_batch(&self.state, &self.storage, collection, keys)
    }

    /// Insert or overwrite multiple documents in one call.
    /// Each item is a (key, value) pair. Writes are persisted to storage.
    pub fn insert_batch(&self, collection: &str, items: Vec<(String, Value)>) -> Result<(), DbError> {
        operations::insert_batch(
            &self.state,
            &self.indexes,
            &self.storage,
            &self.tx,
            #[cfg(feature = "schema")] &self.schemas,
            collection,
            items,
        )?;

        // Auto-evict if the collection exceeds the threshold.
        let _ = self.evict_collection(collection, self.hot_threshold);
        Ok(())
    }

    /// Partially update a document — merges `updates` into the existing document.
    /// Returns true if the document was found and updated, false if not found.
    pub fn update(&self, collection: &str, key: &str, updates: Value) -> Result<bool, DbError> {
        let updated = operations::update(
            &self.state,
            &self.indexes,
            &self.storage,
            &self.tx,
            #[cfg(feature = "schema")] &self.schemas,
            collection,
            key,
            updates,
        )?;

        if updated {
            // Auto-evict if the collection exceeds the threshold.
            let _ = self.evict_collection(collection, self.hot_threshold);
        }
        Ok(updated)
    }
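
    // Partial-update sketch (document fields hypothetical): `updates` is merged
    // into the stored document, so unrelated fields are preserved:
    //
    //     // before: { "role": "admin", "name": "Ada" }
    //     let found = db.update("users", "u1", serde_json::json!({ "role": "user" }))?;
    //     // after:  { "role": "user", "name": "Ada" }   (found == true)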

    /// Delete a single document by key.
    pub fn delete(&self, collection: &str, key: &str) -> Result<(), DbError> {
        operations::delete(
            &self.state,
            &self.indexes,
            &self.storage,
            &self.tx,
            collection,
            key,
        )
    }

    /// Delete multiple documents by key in one call.
    pub fn delete_batch(&self, collection: &str, keys: Vec<String>) -> Result<(), DbError> {
        operations::delete_batch(
            &self.state,
            &self.indexes,
            &self.storage,
            &self.tx,
            collection,
            keys,
        )
    }

    /// Drop an entire collection — removes all documents and its indexes.
    pub fn delete_collection(&self, collection: &str) -> Result<(), DbError> {
        operations::delete_collection(
            &self.state,
            &self.indexes,
            &self.storage,
            &self.tx,
            collection,
        )
    }

    /// Track that `field` was queried in `collection` and auto-create an index
    /// if this field has been queried 3 or more times.
    /// Errors are silently ignored — auto-indexing is best-effort.
    pub fn track_query(&self, collection: &str, field: &str) {
        // The `let _ =` discards the Result — a failed auto-index is not fatal.
        let _ = indexing::track_query(
            &self.indexes,
            &self.query_heatmap,
            collection,
            field,
            &self.storage,
            &self.state,
        );
    }
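
    // Auto-indexing sketch (collection and field hypothetical): repeated
    // queries on the same field warm the heatmap until the index is created.
    //
    //     db.track_query("users", "role"); // heatmap "users:role" → 1
    //     db.track_query("users", "role"); // heatmap "users:role" → 2
    //     db.track_query("users", "role"); // 3rd query: index auto-created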

    /// Register a JSON schema for a collection.
    /// All subsequent writes to this collection must conform to this schema.
    #[cfg(feature = "schema")]
    pub fn set_schema(&self, collection: &str, schema: Value) -> Result<(), DbError> {
        schema::set_schema(
            &self.schemas,
            &self.storage,
            &self.tx,
            collection,
            schema,
        )
    }
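
    // Schema registration sketch (schema hypothetical; requires the "schema"
    // feature). Later writes to "users" must validate against it:
    //
    //     db.set_schema("users", serde_json::json!({
    //         "type": "object",
    //         "properties": { "role": { "type": "string" } },
    //         "required": ["role"]
    //     }))?;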

    /// Compact the log file — rewrite it to contain only the current state.
    ///
    /// This removes all dead entries (superseded INSERTs, DELETE tombstones)
    /// and writes a binary snapshot for fast next startup.
    ///
    /// The compacted log contains:
    /// - One INSERT entry per live document (current value only).
    /// - One SCHEMA entry per registered schema (with the `schema` feature).
    /// - One INDEX entry per registered index (index data is rebuilt on replay).
    pub fn compact(&self) -> Result<(), DbError> {
        info!("🔨 Starting Log Compaction...");

        // Build the minimal set of entries representing the current state.
        let mut entries = Vec::new();

        // One INSERT per live document across all collections.
        for col_ref in self.state.iter() {
            let col_name = col_ref.key();
            for item_ref in col_ref.value().iter() {
                // To compact, we need the full Value. If it's Cold, we fetch it from storage.
                let entry = match item_ref.value() {
                    crate::engine::types::DocumentState::Hot(v) => types::LogEntry::new(
                        "INSERT".to_string(),
                        col_name.clone(),
                        item_ref.key().clone(),
                        v.clone(),
                    ),
                    crate::engine::types::DocumentState::Cold(ptr) => {
                        // Cold documents live only on disk: read the raw log
                        // record back and deserialize it into a LogEntry.
                        let bytes = self.storage.read_at(ptr.offset, ptr.length)?;
                        serde_json::from_slice(&bytes)?
                    }
                };
                entries.push(entry);
            }
        }

        // One SCHEMA entry per collection with a registered schema.
        #[cfg(feature = "schema")]
        for schema_ref in self.schemas.iter() {
            let col_name = schema_ref.key();
            let (schema_json, _) = &**schema_ref.value();
            entries.push(types::LogEntry::new(
                "SCHEMA".to_string(),
                col_name.clone(),
                "".to_string(),
                schema_json.clone(),
            ));
        }

        // One INDEX entry per registered index.
        // The index name format is "collection:field" — we split it to get both parts.
        for index_ref in self.indexes.iter() {
            let parts: Vec<&str> = index_ref.key().split(':').collect();
            if parts.len() == 2 {
                entries.push(types::LogEntry::new(
                    "INDEX".to_string(),
                    parts[0].to_string(), // collection name
                    parts[1].to_string(), // field name
                    serde_json::json!(null),
                ));
            }
        }

        // Delegate the actual file rewrite (and snapshot write) to the storage backend.
        self.storage.compact_with_hook(entries.clone(), self.post_backup_script.clone())?;

        // After compaction the log is rewritten and all old RecordPointers are invalid.
        // Promote every Cold entry in the in-memory state to Hot so subsequent reads
        // don't try to seek to stale byte offsets in the now-truncated log file.
        for entry in &entries {
            if entry.cmd == "INSERT" {
                if let Some(col) = self.state.get(&entry.collection) {
                    if let Some(mut doc) = col.get_mut(&entry.key) {
                        if matches!(*doc, crate::engine::types::DocumentState::Cold(_)) {
                            *doc = crate::engine::types::DocumentState::Hot(entry.value.clone());
                        }
                    }
                }
            }
        }

        info!("✅ Log Compaction Finished!");
        Ok(())
    }

    /// Evict documents from RAM to disk for a collection if it exceeds the threshold.
    ///
    /// This converts `Hot(Value)` entries into `Cold(RecordPointer)` entries.
    /// In this v1, it re-scans the log to find the exact byte offsets for the documents.
    pub fn evict_collection(&self, collection: &str, limit: usize) -> Result<usize, DbError> {
        let col_len = if let Some(col) = self.state.get(collection) {
            col.len()
        } else {
            return Err(DbError::CollectionNotFound);
        };

        if col_len <= limit {
            return Ok(0);
        }

        let mut evicted_count = 0;
        let mut offset = 0u64;
        let to_evict = col_len - limit;

        // To evict properly, we need the pointers. Since we don't store them for
        // Hot documents, we re-scan the log to find them.
        self.storage.stream_log_into(&mut |entry, length| {
            if entry.collection == collection {
                if evicted_count < to_evict {
                    if let Some(col) = self.state.get(collection) {
                        if let Some(mut doc_state) = col.get_mut(&entry.key) {
                            if let crate::engine::types::DocumentState::Hot(_) = *doc_state {
                                *doc_state = crate::engine::types::DocumentState::Cold(
                                    crate::engine::types::RecordPointer { offset, length },
                                );
                                evicted_count += 1;
                            }
                        }
                    }
                }
            }
            // Advance past this record (+1 for the entry's trailing delimiter).
            offset += (length + 1) as u64;
            ControlFlow::Continue(())
        })?;

        Ok(evicted_count)
    }
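
    // Eviction sketch (collection and numbers hypothetical): with 5 documents
    // in "events" and a limit of 2, the first 3 log records for that collection
    // are demoted from Hot values to Cold pointers:
    //
    //     let demoted = db.evict_collection("events", 2)?; // demoted == 3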

    /// Recover the database state to a specific point in time or sequence number.
    /// Returns the recovered state as a Vec of LogEntries that can be written to a snapshot.
    ///
    /// This is a utility function used by the CLI for PITR.
    #[cfg(not(target_arch = "wasm32"))]
    pub fn recover_to(
        storage: &dyn StorageBackend,
        to_time: Option<u64>,
        to_seq: Option<u64>,
    ) -> Result<Vec<LogEntry>, DbError> {
        let state: DashMap<String, DashMap<String, crate::engine::types::DocumentState>> =
            DashMap::new();
        let indexes: DashMap<String, DashMap<String, DashSet<String>>> = DashMap::new();
        #[cfg(feature = "schema")]
        let schemas: DashMap<String, Arc<(serde_json::Value, jsonschema::Validator)>> =
            DashMap::new();

        let mut offset = 0u64;
        let mut count = 0u64;
        let mut current_tx_entries = Vec::new();
        let mut current_tx_id = None;

        storage.stream_log_into(&mut |entry, length| {
            // Condition 1: stop once the entry's timestamp passes the cutoff.
            if let Some(t) = to_time {
                if entry._t > t {
                    return ControlFlow::Break(());
                }
            }

            // Condition 2: stop once the requested number of entries is replayed.
            if let Some(s) = to_seq {
                if count >= s {
                    return ControlFlow::Break(());
                }
            }

            let pointer = crate::engine::types::RecordPointer { offset, length };

            match entry.cmd.as_str() {
                // A transaction begins: buffer its entries until the commit marker.
                "TX_BEGIN" => {
                    current_tx_id = Some(entry.key.clone());
                    current_tx_entries.clear();
                }
                // The transaction committed: apply the buffered entries. A
                // transaction with no TX_COMMIT before the cutoff is dropped.
                "TX_COMMIT" => {
                    if current_tx_id.as_ref() == Some(&entry.key) {
                        for (e, p) in current_tx_entries.drain(..) {
                            crate::engine::storage::apply_entry(
                                &e,
                                &state,
                                &indexes,
                                #[cfg(feature = "schema")] &schemas,
                                Some(p),
                            );
                        }
                        current_tx_id = None;
                    }
                }
                _ => {
                    if current_tx_id.is_some() {
                        current_tx_entries.push((entry, pointer));
                    } else {
                        crate::engine::storage::apply_entry(
                            &entry,
                            &state,
                            &indexes,
                            #[cfg(feature = "schema")] &schemas,
                            Some(pointer),
                        );
                    }
                }
            }

            count += 1;
            offset += (length + 1) as u64;
            ControlFlow::Continue(())
        })?;

        // Convert the recovered state into LogEntries (same shape as compact()).
        let mut entries = Vec::new();
        for col_ref in state.iter() {
            let col_name = col_ref.key();
            for item_ref in col_ref.value().iter() {
                let entry = match item_ref.value() {
                    crate::engine::types::DocumentState::Hot(v) => LogEntry::new(
                        "INSERT".to_string(),
                        col_name.clone(),
                        item_ref.key().clone(),
                        v.clone(),
                    ),
                    crate::engine::types::DocumentState::Cold(ptr) => {
                        // Best-effort read: an unreadable record becomes a null
                        // INSERT rather than aborting the whole recovery.
                        let bytes = storage.read_at(ptr.offset, ptr.length).unwrap_or_default();
                        serde_json::from_slice(&bytes).unwrap_or_else(|_| {
                            LogEntry::new(
                                "INSERT".to_string(),
                                col_name.clone(),
                                item_ref.key().clone(),
                                serde_json::Value::Null,
                            )
                        })
                    }
                };
                entries.push(entry);
            }
        }

        #[cfg(feature = "schema")]
        for schema_ref in schemas.iter() {
            let col_name = schema_ref.key();
            let (schema_json, _) = &**schema_ref.value();
            entries.push(LogEntry::new(
                "SCHEMA".to_string(),
                col_name.clone(),
                "".to_string(),
                schema_json.clone(),
            ));
        }

        for index_ref in indexes.iter() {
            let parts: Vec<&str> = index_ref.key().split(':').collect();
            if parts.len() == 2 {
                entries.push(LogEntry::new(
                    "INDEX".to_string(),
                    parts[0].to_string(),
                    parts[1].to_string(),
                    serde_json::json!(null),
                ));
            }
        }

        Ok(entries)
    }
}
654}