Skip to main content

moltendb_core/engine/storage/
encrypted.rs

1// ─── encrypted.rs ────────────────────────────────────────────────────────────
2// This file implements EncryptedStorage — a transparent encryption wrapper
3// around any StorageBackend (e.g. AsyncDiskStorage or SyncDiskStorage).
4//
5// How it works:
6//   • write_entry()  — serialises the LogEntry to JSON, encrypts the JSON
7//                      string with XChaCha20-Poly1305, base64-encodes the
8//                      result, and writes a sentinel "ENC" LogEntry whose
9//                      `value` field holds the ciphertext. The underlying
10//                      storage backend sees only opaque ENC entries.
11//
12//   • read_log()     — reads raw ENC entries from the inner backend, decrypts
13//                      each one, and returns the original LogEntry objects.
14//                      Unencrypted entries (from a migration / legacy DB) are
15//                      passed through unchanged.
16//
17//   • compact()      — re-encrypts every entry during compaction so the
18//                      compacted file is fully encrypted with no plaintext
19//                      remnants left over from a migration.
20//
21// Encryption algorithm: XChaCha20-Poly1305
22//   • XChaCha20 is a stream cipher — it generates a keystream that is XOR'd
23//     with the plaintext. It's fast and has no timing side-channels.
24//   • Poly1305 is a message authentication code (MAC) — it detects any
25//     tampering with the ciphertext. If the MAC check fails, decryption
26//     returns an error instead of silently returning garbage data.
27//   • "X" variant uses a 192-bit (24-byte) nonce instead of 96-bit, making
28//     random nonce generation safe even for billions of messages.
29//
30// Key derivation: Argon2id
31//   • Argon2id is a memory-hard password hashing algorithm — it's designed
32//     to be slow and expensive to brute-force even with GPUs/ASICs.
33//   • We derive a deterministic 32-byte key from (password, db_path) so the
34//     same password always produces the same key for the same database file.
35//
36// On-disk format of each encrypted entry:
37//   {"cmd":"ENC","collection":"_","key":"_","value":"<base64>"}
38//   where <base64> = base64( nonce[24 bytes] || ciphertext )
39// ─────────────────────────────────────────────────────────────────────────────
40
41// Encryption works on both native and WASM.
42// #![cfg(not(target_arch = "wasm32"))]
43
44// The StorageBackend trait that EncryptedStorage implements.
45use super::StorageBackend;
46// Our internal data types.
47use crate::engine::types::{DbError, LogEntry};
48// base64 encoding/decoding — used to safely store binary ciphertext as text.
49use base64::{Engine as _, engine::general_purpose::STANDARD};
50// XChaCha20-Poly1305 AEAD cipher from the chacha20poly1305 crate.
51use chacha20poly1305::{
52    aead::{Aead, KeyInit}, // Aead = Authenticated Encryption with Associated Data
53    XChaCha20Poly1305,     // The cipher type
54    XNonce,                // 24-byte nonce type
55    Key,                   // 32-byte key type
56};
57// OsRng = cryptographically secure random number generator from the OS.
58// RngCore = trait that provides fill_bytes() for generating random bytes.
59use rand_core::{OsRng, RngCore};
60// Arc = thread-safe reference-counted pointer for shared ownership.
61use std::sync::Arc;
62
63/// Transparent encryption wrapper around any StorageBackend.
64///
65/// All data written through this wrapper is encrypted before reaching the
66/// inner storage, and decrypted when read back. The encryption is completely
67/// transparent to the rest of the database engine.
68pub struct EncryptedStorage {
69    /// The underlying storage backend (e.g. AsyncDiskStorage).
70    /// Arc<dyn StorageBackend> means "a thread-safe pointer to any type that
71    /// implements StorageBackend" — this is Rust's way of doing polymorphism
72    /// at runtime (similar to an interface in Java/C#).
73    inner: Arc<dyn StorageBackend>,
74    /// The initialized cipher instance, ready to encrypt/decrypt.
75    /// XChaCha20Poly1305 holds the expanded key schedule internally.
76    cipher: XChaCha20Poly1305,
77}
78
79impl EncryptedStorage {
80    /// Create a new EncryptedStorage wrapping `inner` with the given 32-byte key.
81    ///
82    /// Call `derive_key()` first to turn a human-readable password into a key.
83    /// The key must be exactly 32 bytes (256 bits) — the size XChaCha20 requires.
84    pub fn new(inner: Arc<dyn StorageBackend>, master_key: &[u8; 32]) -> Self {
85        // Key::from_slice() wraps the raw bytes in the Key newtype.
86        // XChaCha20Poly1305::new() expands the key into the cipher's internal state.
87        let key = Key::from_slice(master_key);
88        Self {
89            inner,
90            cipher: XChaCha20Poly1305::new(key),
91        }
92    }
93
94    /// Derive a deterministic 32-byte encryption key from a password string
95    /// using the Argon2id algorithm.
96    ///
97    /// `salt_context` is typically the database file path — this ensures that
98    /// the same password produces different keys for different database files,
99    /// preventing cross-database attacks.
100    ///
101    /// Argon2id is intentionally slow (memory-hard) to make brute-force attacks
102    /// expensive. The default parameters require ~64 MB of RAM and ~0.5 s of CPU.
103    pub fn derive_key(password: &str, salt_context: &str) -> [u8; 32] {
104        use argon2::{Argon2, PasswordHasher};
105        use argon2::password_hash::SaltString;
106
107        // Argon2 requires a base64-encoded salt of at least 8 bytes.
108        // We build a deterministic 22-character salt from the context string
109        // (22 base64 chars = 16 bytes of entropy, well above the minimum).
110        // The format!("{:0<22}", ...) pads the string to 22 chars with '0's.
111        let raw_salt = format!("{:0<22}", &salt_context[..salt_context.len().min(22)]);
112        // SaltString::from_b64 validates the base64 encoding.
113        // If it fails (e.g. invalid chars in the path), fall back to a known-good salt.
114        let salt = SaltString::from_b64(&raw_salt)
115            .unwrap_or_else(|_| SaltString::from_b64("bW9sdGVuZGJkZWZhdWx0").unwrap());
116
117        // Argon2::default() uses Argon2id variant with standard parameters.
118        let argon2 = Argon2::default();
119        // hash_password() runs the Argon2 algorithm and returns a PHC string.
120        let hash = argon2
121            .hash_password(password.as_bytes(), &salt)
122            .expect("Argon2 key derivation failed");
123
124        // Extract the raw 32-byte hash output from the PHC string.
125        // The PHC format includes the algorithm, parameters, salt, and hash.
126        let hash_output = hash.hash.expect("Argon2 produced no hash output");
127        let bytes = hash_output.as_bytes();
128        // Copy the first 32 bytes into our fixed-size key array.
129        let mut key = [0u8; 32];
130        key.copy_from_slice(&bytes[..32]);
131        key
132    }
133
134    /// Encrypt a single LogEntry and return a new "ENC" LogEntry whose `value`
135    /// field contains the base64-encoded ciphertext.
136    ///
137    /// A fresh random 24-byte nonce is generated for every single write.
138    /// Using a unique nonce per message is critical for security — reusing a
139    /// nonce with the same key would completely break the encryption.
140    fn encrypt_entry(&self, entry: &LogEntry) -> Result<LogEntry, DbError> {
141        // Step 1: Serialize the real entry to a JSON string.
142        let plain_json = serde_json::to_string(entry)?;
143
144        // Step 2: Generate a cryptographically random 24-byte nonce.
145        // OsRng reads from /dev/urandom (Linux) or BCryptGenRandom (Windows).
146        let mut nonce_bytes = [0u8; 24];
147        OsRng.fill_bytes(&mut nonce_bytes);
148        // XNonce::from_slice() wraps the bytes in the nonce newtype.
149        let nonce = XNonce::from_slice(&nonce_bytes);
150
151        // Step 3: Encrypt the plaintext. The cipher also computes a 16-byte
152        // Poly1305 MAC and appends it to the ciphertext automatically.
153        // Result: ciphertext = encrypt(key, nonce, plaintext) || MAC[16 bytes]
154        let cipher_text = self
155            .cipher
156            .encrypt(nonce, plain_json.as_bytes())
157            .map_err(|_| DbError::WriteError)?;
158
159        // Step 4: Prepend the nonce to the ciphertext so we have everything
160        // needed for decryption in one blob: nonce[24] || ciphertext || MAC[16]
161        let mut payload = nonce_bytes.to_vec();
162        payload.extend(cipher_text);
163
164        // Step 5: Base64-encode the binary payload so it can be safely stored
165        // as a JSON string value (JSON doesn't support raw binary).
166        let b64 = STANDARD.encode(&payload);
167
168        // Step 6: Return a sentinel LogEntry that hides the real cmd/collection/key.
169        // The underlying storage only ever sees these opaque ENC entries.
170        Ok(LogEntry {
171            cmd: "ENC".to_string(),
172            collection: "_".to_string(), // placeholder — real collection is inside the ciphertext
173            key: "_".to_string(),        // placeholder — real key is inside the ciphertext
174            value: serde_json::json!(b64),
175        })
176    }
177
178    /// Decrypt a single "ENC" LogEntry and return the original LogEntry.
179    ///
180    /// The Poly1305 MAC is verified automatically during decryption — if the
181    /// ciphertext has been tampered with or the wrong key is used, this returns
182    /// an error instead of silently returning garbage data.
183    fn decrypt_entry(&self, entry: &LogEntry) -> Result<LogEntry, DbError> {
184        // Extract the base64 string from the value field.
185        let b64 = entry.value.as_str().unwrap_or("");
186
187        // Decode from base64 back to raw bytes: nonce[24] || ciphertext || MAC[16]
188        let payload = STANDARD
189            .decode(b64)
190            .map_err(|_| DbError::WriteError)?;
191
192        // Sanity check: the payload must be at least 24 bytes (nonce) + 16 bytes (MAC).
193        if payload.len() < 24 {
194            return Err(DbError::WriteError);
195        }
196
197        // Split the payload into nonce (first 24 bytes) and ciphertext (the rest).
198        // split_at() returns two slices sharing the same underlying memory — no copy.
199        let (nonce_bytes, cipher_text) = payload.split_at(24);
200        let nonce = XNonce::from_slice(nonce_bytes);
201
202        // Decrypt and verify the MAC. If the MAC doesn't match (wrong key or
203        // tampered data), decrypt() returns Err and we propagate it as WriteError.
204        let plain_bytes = self
205            .cipher
206            .decrypt(nonce, cipher_text)
207            .map_err(|_| DbError::WriteError)?;
208
209        // Convert the decrypted bytes back to a UTF-8 string.
210        let plain_json = String::from_utf8(plain_bytes).map_err(|_| DbError::WriteError)?;
211
212        // Deserialize the JSON string back into a LogEntry.
213        serde_json::from_str::<LogEntry>(&plain_json).map_err(|e| DbError::Serialization(e))
214    }
215}
216
217/// Implement the StorageBackend trait so EncryptedStorage can be used anywhere
218/// a StorageBackend is expected — the rest of the engine doesn't know or care
219/// that encryption is happening.
220impl StorageBackend for EncryptedStorage {
221    /// Encrypt `entry` and write the resulting ENC entry to the inner backend.
222    fn write_entry(&self, entry: &LogEntry) -> Result<(), DbError> {
223        let encrypted = self.encrypt_entry(entry)?;
224        // Delegate to the inner backend (e.g. AsyncDiskStorage.write_entry).
225        self.inner.write_entry(&encrypted)
226    }
227
228    /// Read all ENC entries from the inner backend and decrypt them.
229    ///
230    /// Entries that fail to decrypt are skipped with a warning (not a crash)
231    /// so a single corrupt entry doesn't bring down the whole database.
232    /// Unencrypted entries (cmd != "ENC") are passed through unchanged —
233    /// this supports migrating an existing plaintext database to encrypted.
234    fn read_log(&self) -> Result<Vec<LogEntry>, DbError> {
235        // Read the raw (encrypted) entries from the underlying storage.
236        let raw_entries = self.inner.read_log()?;
237        let mut decrypted = Vec::with_capacity(raw_entries.len());
238
239        for entry in raw_entries {
240            if entry.cmd == "ENC" {
241                // This is an encrypted entry — decrypt it.
242                match self.decrypt_entry(&entry) {
243                    Ok(real_entry) => decrypted.push(real_entry),
244                    Err(e) => {
245                        // Log a warning but continue — don't crash on one bad entry.
246                        // This can happen if the encryption key changed or the file
247                        // was partially corrupted.
248                        tracing::warn!("⚠️  Skipping undecryptable log entry: {}", e);
249                    }
250                }
251            } else {
252                // Unencrypted legacy entry — pass through as-is.
253                // This allows migrating an existing plaintext database by simply
254                // enabling encryption; old entries are still readable.
255                decrypted.push(entry);
256            }
257        }
258
259        Ok(decrypted)
260    }
261
262    fn stream_log_into(&self, f: &mut dyn FnMut(LogEntry, u32)) -> Result<u64, DbError> {
263        let mut count = 0u64;
264        // EncryptedStorage wraps the inner backend. Since it doesn't have a 
265        // specialized streaming implementation yet, we fall back to read_log 
266        // BUT we need the ENCRYPTED length for the pointers.
267        
268        // Use inner.stream_log_into to get the encrypted entries and their lengths.
269        self.inner.stream_log_into(&mut |enc_entry, length| {
270            if enc_entry.cmd == "ENC" {
271                match self.decrypt_entry(&enc_entry) {
272                    Ok(real_entry) => {
273                        f(real_entry, length);
274                        count += 1;
275                    }
276                    Err(e) => {
277                        tracing::warn!("⚠️  Skipping undecryptable log entry during streaming: {}", e);
278                    }
279                }
280            } else {
281                // Pass through plaintext entry with its original length
282                f(enc_entry, length);
283                count += 1;
284            }
285        })?;
286        
287        Ok(count)
288    }
289
290    /// Re-encrypt all entries during compaction so the compacted file is fully
291    /// encrypted with no plaintext remnants from a migration.
292    ///
293    /// Each entry gets a fresh random nonce — even if the plaintext is the same
294    /// as before, the ciphertext will be different (this is correct and expected).
295    fn compact(&self, entries: Vec<LogEntry>) -> Result<(), DbError> {
296        // Encrypt every entry. If any encryption fails, the whole compaction fails
297        // (we don't want a partially-encrypted compacted file).
298        // The `.collect::<Result<Vec<_>, _>>()` pattern: if any map() call returns
299        // Err, the whole collect() returns that Err immediately (short-circuits).
300        let encrypted: Result<Vec<LogEntry>, DbError> =
301            entries.iter().map(|e| self.encrypt_entry(e)).collect();
302        // Delegate the actual file writing to the inner backend's compact().
303        self.inner.compact(encrypted?)
304    }
305
306    /// Read exactly `length` bytes from the inner backend and decrypt the entry.
307    ///
308    /// Note: in the encrypted Bitcask model, the pointer refers to the offset
309    /// and length of the ENCRYPTED entry in the log.
310    fn read_at(&self, offset: u64, length: u32) -> Result<Vec<u8>, DbError> {
311        // 1. Read the encrypted bytes from the inner storage.
312        let raw_bytes = self.inner.read_at(offset, length)?;
313
314        // 2. Deserialize the ENC LogEntry.
315        let enc_entry: LogEntry = serde_json::from_slice(&raw_bytes).map_err(DbError::Serialization)?;
316
317        // 3. Decrypt the entry.
318        let decrypted = self.decrypt_entry(&enc_entry)?;
319
320        // 4. Return the original plaintext LogEntry serialized as JSON.
321        // This matches the format expected by the engine (e.g. operations::get).
322        Ok(serde_json::to_vec(&decrypted).map_err(DbError::Serialization)?)
323    }
324}