moltendb_core/engine/storage/mod.rs
// ─── storage/mod.rs ──────────────────────────────────────────────────────────
// This is the root module for all storage backends. It does three things:
//
// 1. Declares the concrete backend modules (disk, encrypted, tiered, wasm) and
//    gates the platform-specific ones behind cfg(target_arch) so each build
//    only compiles the backends it can actually use.
//
// 2. Defines the StorageBackend trait — the single interface that the rest
//    of the engine uses to read/write data. Any type that implements this
//    trait can be used as a storage backend, whether it writes to a disk
//    file, an encrypted file, or a browser OPFS file.
//
// 3. Provides the startup replay functions (stream_into_state, apply_entry)
//    that rebuild the in-memory database state from the persistent log on
//    server/worker startup. (A legacy replay_log_entries variant is kept
//    commented out at the bottom of the file.)
//
// The StorageBackend trait is the key abstraction that makes MoltenDB's
// "same engine, different storage" design possible. The engine (mod.rs,
// operations.rs, handlers.rs) never imports a concrete storage type — it
// only ever holds an Arc<dyn StorageBackend>, so you can swap the storage
// backend without changing any engine code.
// ─────────────────────────────────────────────────────────────────────────────
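//
// For a rough sense of what that swap looks like, the engine only ever sees
// the trait object. A sketch (illustrative only; the real constructor
// signatures and the engine-side `Engine` type are assumptions, not the
// actual API):
//
//     use std::sync::Arc;
//
//     let storage: Arc<dyn StorageBackend> = if use_encryption {
//         Arc::new(EncryptedStorage::new(path, key)?)   // hypothetical ctor
//     } else {
//         Arc::new(SyncDiskStorage::new(path)?)         // hypothetical ctor
//     };
//     // The engine only depends on the trait object, so either backend works:
//     let engine = Engine::new(storage);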

// ── Conditional module declarations ──────────────────────────────────────────
// The cfg attributes below mean "only compile this when NOT targeting wasm32".
// Native (server) builds get disk.rs and tiered.rs; encrypted.rs is compiled
// on every target; WASM (browser) builds additionally get wasm.rs (declared
// further down). This keeps browser-incompatible code (file I/O, Tokio tasks)
// out of the WASM binary.

#[cfg(not(target_arch = "wasm32"))]
mod disk;
mod encrypted;
// tiered.rs provides MmapLogReader (memory-mapped cold log reads) and
// TieredStorage (hot + cold two-tier backend for large-scale deployments).
#[cfg(not(target_arch = "wasm32"))]
mod tiered;
// Re-export the concrete types so callers can write `storage::AsyncDiskStorage`
// instead of `storage::disk::AsyncDiskStorage`.
#[cfg(not(target_arch = "wasm32"))]
pub use disk::{AsyncDiskStorage, SyncDiskStorage};
pub use encrypted::EncryptedStorage;
// Re-export TieredStorage so engine/mod.rs and main.rs can use it directly.
#[cfg(not(target_arch = "wasm32"))]
pub use tiered::TieredStorage;

// On WASM builds, expose the browser-side OPFS storage.
#[cfg(target_arch = "wasm32")]
pub mod wasm;
#[cfg(target_arch = "wasm32")]
pub use wasm::OpfsStorage;

// ── Shared imports ────────────────────────────────────────────────────────────
// These are used by both the trait definition and the replay functions below.
use crate::engine::types::{DbError, LogEntry};
// serde_json::Value is a dynamically-typed JSON value (can be object, array,
// string, number, bool, or null). All document data is stored as Value.
use serde_json::Value;
// DashMap is a concurrent hash map — like HashMap but safe to read/write from
// multiple threads simultaneously without a global lock.
// DashSet is the set equivalent.
use dashmap::{DashMap, DashSet};

// ─── StorageBackend trait ─────────────────────────────────────────────────────
//
// This is the core abstraction of the storage layer. Any type that implements
// the four required methods (write_entry, read_log, compact, read_at) can
// serve as a MoltenDB storage backend; get_size and stream_log_into come with
// default implementations.
//
// The trait requires Send + Sync because the backend is stored inside an
// Arc<dyn StorageBackend> and shared across multiple Tokio tasks/threads.
// • Send = the type can be moved to another thread
// • Sync = the type can be referenced from multiple threads simultaneously
// ─────────────────────────────────────────────────────────────────────────────

/// The core storage abstraction. Implement this trait to add a new storage backend.
///
/// The write, read, and compact methods all operate on `LogEntry` — the atomic
/// unit of data in MoltenDB. The engine never writes raw bytes; it always goes
/// through this interface.
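///
/// # Example (sketch)
///
/// A minimal in-memory backend, shown only to illustrate the required methods.
/// Real backends persist to a disk file or OPFS; the `Mutex<Vec<LogEntry>>` is
/// purely illustrative, and the example assumes `LogEntry` implements `Clone`
/// and that `StorageBackend`, `LogEntry`, and `DbError` are in scope (hence
/// the `ignore` fence).
///
/// ```ignore
/// use std::sync::Mutex;
///
/// struct MemStorage {
///     entries: Mutex<Vec<LogEntry>>,
/// }
///
/// impl StorageBackend for MemStorage {
///     fn write_entry(&self, entry: &LogEntry) -> Result<(), DbError> {
///         self.entries.lock().unwrap().push(entry.clone());
///         Ok(())
///     }
///     fn read_log(&self) -> Result<Vec<LogEntry>, DbError> {
///         Ok(self.entries.lock().unwrap().clone())
///     }
///     fn compact(&self, entries: Vec<LogEntry>) -> Result<(), DbError> {
///         // Replace the whole log with the compacted entry set.
///         *self.entries.lock().unwrap() = entries;
///         Ok(())
///     }
///     fn read_at(&self, _offset: u64, _length: u32) -> Result<Vec<u8>, DbError> {
///         // Everything is already in RAM, so there is no cold byte-range to read.
///         Ok(Vec::new())
///     }
/// }
/// ```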
pub trait StorageBackend: Send + Sync {
    /// Append a single log entry to the persistent store.
    ///
    /// This is called on every insert, update, delete, and index creation.
    /// Implementations may buffer writes (async) or flush immediately (sync).
    fn write_entry(&self, entry: &LogEntry) -> Result<(), DbError>;

    /// Read all log entries from persistent storage into a Vec.
    ///
    /// Called on startup to rebuild the in-memory state, and by EncryptedStorage
    /// which must decrypt entries before they can be streamed into state.
    /// For large databases, prefer `stream_log_into` which avoids holding the
    /// full log in RAM.
    fn read_log(&self) -> Result<Vec<LogEntry>, DbError>;

    /// Compact the log by writing only the current state (removing dead entries).
    ///
    /// `entries` is the complete current state of the database — every live
    /// document as a single INSERT entry. The implementation should atomically
    /// replace the existing log with this minimal set.
    fn compact(&self, entries: Vec<LogEntry>) -> Result<(), DbError>;

    /// Read exactly `length` bytes starting at `offset` from the log.
    ///
    /// This is used to fetch "Cold" documents from the append-only log without
    /// loading the entire file into memory.
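    ///
    /// A sketch of the cold-read path (assumes each entry is stored as one JSON
    /// line and that `LogEntry` implements `Deserialize`; the engine's real
    /// rehydration code may differ):
    ///
    /// ```ignore
    /// // `ptr` is the RecordPointer recorded for a Cold document during replay.
    /// let bytes = storage.read_at(ptr.offset, ptr.length)?;
    /// let entry: LogEntry = serde_json::from_slice(&bytes)?;
    /// let doc: serde_json::Value = entry.value;
    /// ```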
    fn read_at(&self, offset: u64, length: u32) -> Result<Vec<u8>, DbError>;

    /// Return the current size of the persistent log file in bytes.
    ///
    /// Used by the WASM worker to implement size-based auto-compaction — the JS
    /// side calls `get_size` after every INSERT batch and compacts if the file
    /// exceeds the configured threshold (default: 5 MB).
    ///
    /// The default implementation returns 0 (no size information available).
    /// `OpfsStorage` overrides this with a real `FileSystemSyncAccessHandle.getSize()` call.
    /// Native disk backends don't need this — they use OS-level file metadata instead.
    #[allow(dead_code)]
    fn get_size(&self) -> Result<u64, DbError> {
        Ok(0)
    }

    /// Stream log entries into state one at a time, without loading the full
    /// log into RAM. Implementations may load a binary snapshot first and only
    /// replay the delta lines written after the snapshot.
    ///
    /// The default implementation falls back to `read_log()` for backwards
    /// compatibility (used by WASM/EncryptedStorage which don't have snapshots).
    ///
    /// Returns the total number of entries processed.
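    ///
    /// A usage sketch (the closure receives each entry plus its byte length in
    /// the log; marked `ignore` because it assumes a `storage` value is in scope):
    ///
    /// ```ignore
    /// let mut total_bytes = 0u64;
    /// let count = storage.stream_log_into(&mut |entry, length| {
    ///     total_bytes += length as u64;
    ///     println!("{} {}/{}", entry.cmd, entry.collection, entry.key);
    /// })?;
    /// println!("replayed {count} entries ({total_bytes} bytes)");
    /// ```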
    fn stream_log_into(&self, f: &mut dyn FnMut(LogEntry, u32)) -> Result<u64, DbError> {
        // Default: load everything into a Vec, then iterate.
        // Concrete implementations (AsyncDiskStorage, SyncDiskStorage) override
        // this with a more efficient snapshot + streaming approach.
        let entries = self.read_log()?;
        let count = entries.len() as u64;
        for entry in entries {
            // Default re-serializes to get length.
            // Better implementations override this.
            let json = serde_json::to_vec(&entry).unwrap_or_default();
            let length = json.len() as u32;
            f(entry, length);
        }
        Ok(count)
    }
}

// ─── Startup replay ───────────────────────────────────────────────────────────
//
// When the server starts (or the WASM worker initialises), we need to rebuild
// the in-memory state from the persistent log. These functions handle that.
//
// The process is:
//   1. Call storage.stream_log_into() — this either loads a binary snapshot
//      + delta (fast path) or streams the full log line-by-line (slow path).
//   2. For each LogEntry, call apply_entry() to update the in-memory DashMaps.
//   3. After all entries are applied, the in-memory state matches the log.
// ─────────────────────────────────────────────────────────────────────────────
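//
// For intuition, here is a sample log and what replay does with it (the field
// names are shown as illustrative JSON; the exact serialized shape of LogEntry
// is defined in engine/types.rs and may differ):
//
//     {"cmd":"INSERT","collection":"users","key":"u1","value":{"role":"admin"}}
//     {"cmd":"TX_BEGIN","collection":"","key":"tx42","value":null}
//     {"cmd":"INSERT","collection":"users","key":"u2","value":{"role":"viewer"}}
//     {"cmd":"TX_COMMIT","collection":"","key":"tx42","value":null}
//     {"cmd":"TX_BEGIN","collection":"","key":"tx43","value":null}
//     {"cmd":"DELETE","collection":"users","key":"u1","value":null}
//     <file ends here: crash before TX_COMMIT>
//
// Replay applies u1 and u2 (the committed transaction tx42), but the dangling
// DELETE inside tx43 is only buffered and then dropped, so u1 survives.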

/// Drive startup by streaming all log entries from storage into the in-memory
/// state and index maps. Uses snapshot + delta replay when available.
///
/// `state` — the main data store: collection name → (key → document state)
/// `indexes` — the index store: "collection:field" → (field value → set of keys)
/// `schemas` — the schema store: collection name → (schema JSON, compiled validator)
///
/// Returns the total number of log entries processed.
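///
/// A startup sketch (marked `ignore`; assumes a `storage` value implementing
/// StorageBackend is in scope and the maps are freshly created):
///
/// ```ignore
/// use dashmap::DashMap;
///
/// let state = DashMap::new();
/// let indexes = DashMap::new();
/// let schemas = DashMap::new();
/// let replayed = stream_into_state(&storage, &state, &indexes, &schemas)?;
/// println!("replayed {replayed} entries");
/// ```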
pub fn stream_into_state(
    storage: &dyn StorageBackend,
    state: &DashMap<String, DashMap<String, crate::engine::types::DocumentState>>,
    indexes: &DashMap<String, DashMap<String, DashSet<String>>>,
    schemas: &DashMap<String, std::sync::Arc<(Value, jsonschema::Validator)>>,
) -> Result<u64, DbError> {
    let mut count = 0u64;
    let mut offset = 0u64;
    let mut tx_buffer: Vec<(LogEntry, crate::engine::types::RecordPointer)> = Vec::new();
    let mut active_tx: Option<String> = None;

    // stream_log_into calls our closure once per LogEntry, providing the
    // LogEntry and its raw byte length in the log file.
    storage.stream_log_into(&mut |entry, length| {
        let pointer = crate::engine::types::RecordPointer { offset, length };

        match entry.cmd.as_str() {
            "TX_BEGIN" => {
                active_tx = Some(entry.key.clone());
                tx_buffer.clear();
            }
            "TX_COMMIT" => {
                if active_tx.as_ref() == Some(&entry.key) {
                    // Flush buffer to DashMap
                    for (e, p) in tx_buffer.drain(..) {
                        apply_entry(&e, state, indexes, schemas, Some(p));
                    }
                    active_tx = None;
                }
            }
            _ => {
                if active_tx.is_some() {
                    // Hold in RAM until commit
                    tx_buffer.push((entry, pointer));
                } else {
                    // Standard non-transactional entry
                    apply_entry(&entry, state, indexes, schemas, Some(pointer));
                }
            }
        }

        count += 1;
        // +1 for the newline character appended to each JSON line in the log.
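        // e.g. a 120-byte JSON line written at offset 0 puts the next entry at offset 121.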
        offset += (length + 1) as u64;
    })?;

    // If active_tx is still Some here, the log ended mid-transaction (e.g. a
    // crash before TX_COMMIT was written). The buffered entries in tx_buffer
    // are simply dropped, so the partial transaction never touches the
    // in-memory state: atomicity is preserved.
    Ok(count)
}

/// Apply a single log entry to the in-memory state and indexes.
///
/// If `pointer` is provided (during log replay), INSERT entries are stored
/// as `DocumentState::Cold(pointer)` to save memory. Live writes stay `Hot`.
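///
/// Call-shape sketch (illustrative; `entry` and `ptr` stand for values built
/// by the caller):
///
/// ```ignore
/// // During replay: store the document as Cold(pointer), index its value.
/// apply_entry(&entry, &state, &indexes, &schemas, Some(ptr));
/// // For a live write: keep the full Value in RAM as Hot.
/// apply_entry(&entry, &state, &indexes, &schemas, None);
/// ```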
fn apply_entry(
    entry: &LogEntry,
    state: &DashMap<String, DashMap<String, crate::engine::types::DocumentState>>,
    indexes: &DashMap<String, DashMap<String, DashSet<String>>>,
    schemas: &DashMap<String, std::sync::Arc<(Value, jsonschema::Validator)>>,
    pointer: Option<crate::engine::types::RecordPointer>,
) {
    match entry.cmd.as_str() {
        "INSERT" => {
            let col = state
                .entry(entry.collection.clone())
                .or_insert_with(DashMap::new);

            // During replay, we use the pointer (Cold). For live writes, we store the Value (Hot).
            let doc_state = if let Some(p) = pointer {
                crate::engine::types::DocumentState::Cold(p)
            } else {
                crate::engine::types::DocumentState::Hot(entry.value.clone())
            };

            col.insert(entry.key.clone(), doc_state);

            // Indexes ALWAYS store values in RAM to keep searches O(1).
            crate::engine::indexing::index_doc(indexes, &entry.collection, &entry.key, &entry.value);
        }
        "DELETE" => {
            if let Some(col) = state.get(&entry.collection) {
                // Unindexing needs the document's Value. During replay the
                // existing entry may be Cold (pointer only), so the Value is
                // not in RAM and we would have to fetch it from the log to
                // unindex it. For this first version of the hybrid scheme we
                // only unindex Hot documents; a DELETE that follows a replayed
                // (Cold) INSERT can therefore leave a stale index entry.
                // TODO: fetch Cold values (or rework unindex_doc) so Cold
                // documents are unindexed correctly.
                if let Some(old_state) = col.get(&entry.key) {
                    if let crate::engine::types::DocumentState::Hot(old_val) = old_state.value() {
                        crate::engine::indexing::unindex_doc(
                            indexes,
                            &entry.collection,
                            &entry.key,
                            old_val,
                        );
                    }
                }
                col.remove(&entry.key);
            }
        }
        "DROP" => {
            // Remove the entire collection from the state map.
            state.remove(&entry.collection);
            // Remove all indexes that belong to this collection.
            // retain() keeps only entries where the closure returns true.
            // We drop any index whose key starts with "collection:" (e.g. "users:role").
            indexes.retain(|k, _| !k.starts_with(&format!("{}:", entry.collection)));
        }
        "INDEX" => {
            // Register an empty index slot for "collection:field".
            // The index will be populated as subsequent INSERT entries are applied.
            // `entry.key` holds the field name (e.g. "role" for "users:role").
            indexes.insert(
                format!("{}:{}", entry.collection, entry.key),
                DashMap::new(),
            );
        }
        "SCHEMA" => {
            // Re-compile and register the schema during replay.
            if let Ok(validator) = jsonschema::validator_for(&entry.value) {
                schemas.insert(
                    entry.collection.clone(),
                    std::sync::Arc::new((entry.value.clone(), validator)),
                );
            }
        }
        // Unknown command types are silently ignored for forward compatibility.
        // If a future version of MoltenDB adds a new command, older versions
        // will simply skip those entries rather than crashing.
        _ => {}
    }
}

// Legacy: replay a slice of already-decoded log entries into RAM state.
//
// This was the original alternative to stream_into_state(), used when the
// entries have already been loaded into memory (e.g. after decryption by
// EncryptedStorage). It is kept commented out for reference: it predates the
// Hot/Cold DocumentState split (its state map stores raw Values) and does not
// handle SCHEMA entries or transactions.

// pub fn replay_log_entries(
//     entries: &[LogEntry],
//     state: &DashMap<String, DashMap<String, Value>>,
//     indexes: &DashMap<String, DashMap<String, DashSet<String>>>,
// ) {
//     for entry in entries {
//         match entry.cmd.as_str() {
//             "INSERT" => {
//                 // Get or create the collection, then insert the document.
//                 let col = state
//                     .entry(entry.collection.clone())
//                     .or_insert_with(DashMap::new);
//                 col.insert(entry.key.clone(), entry.value.clone());
//                 // Keep indexes in sync with the inserted document.
//                 crate::engine::indexing::index_doc(indexes, &entry.collection, &entry.key, &entry.value);
//             }
//             "DELETE" => {
//                 if let Some(col) = state.get(&entry.collection) {
//                     // Remove from indexes before removing from state.
//                     if let Some(old_val) = col.get(&entry.key) {
//                         crate::engine::indexing::unindex_doc(
//                             indexes,
//                             &entry.collection,
//                             &entry.key,
//                             old_val.value(),
//                         );
//                     }
//                     col.remove(&entry.key);
//                 }
//             }
//             "DROP" => {
//                 // Remove the collection and all its associated indexes.
//                 state.remove(&entry.collection);
//                 indexes.retain(|k, _| !k.starts_with(&format!("{}:", entry.collection)));
//             }
//             "INDEX" => {
//                 // Register an empty index slot.
//                 indexes.insert(
//                     format!("{}:{}", entry.collection, entry.key),
//                     DashMap::new(),
//                 );
//             }
//             _ => {}
//         }
//     }
//     println!("✅ Database restored & Indexes rebuilt!");
// }