moltendb_core/engine/storage/encrypted.rs
1// ─── encrypted.rs ────────────────────────────────────────────────────────────
2// This file implements EncryptedStorage — a transparent encryption wrapper
3// around any StorageBackend (e.g. AsyncDiskStorage or SyncDiskStorage).
4//
5// How it works:
6// • write_entry() — serialises the LogEntry to JSON, encrypts the JSON
7// string with XChaCha20-Poly1305, base64-encodes the
8// result, and writes a sentinel "ENC" LogEntry whose
9// `value` field holds the ciphertext. The underlying
10// storage backend sees only opaque ENC entries.
11//
12// • read_log() — reads raw ENC entries from the inner backend, decrypts
13// each one, and returns the original LogEntry objects.
14// Unencrypted entries (from a migration / legacy DB) are
15// passed through unchanged.
16//
17// • compact() — re-encrypts every entry during compaction so the
18// compacted file is fully encrypted with no plaintext
19// remnants left over from a migration.
20//
21// Encryption algorithm: XChaCha20-Poly1305
22// • XChaCha20 is a stream cipher — it generates a keystream that is XOR'd
23// with the plaintext. It's fast and has no timing side-channels.
24// • Poly1305 is a message authentication code (MAC) — it detects any
25// tampering with the ciphertext. If the MAC check fails, decryption
26// returns an error instead of silently returning garbage data.
27// • "X" variant uses a 192-bit (24-byte) nonce instead of 96-bit, making
28// random nonce generation safe even for billions of messages.
29//
30// Key derivation: Argon2id
31// • Argon2id is a memory-hard password hashing algorithm — it's designed
32// to be slow and expensive to brute-force even with GPUs/ASICs.
33// • We derive a deterministic 32-byte key from (password, db_path) so the
34// same password always produces the same key for the same database file.
35//
36// On-disk format of each encrypted entry:
37// {"cmd":"ENC","collection":"_","key":"_","value":"<base64>"}
38// where <base64> = base64( nonce[24 bytes] || ciphertext )
39// ─────────────────────────────────────────────────────────────────────────────
40
41// Encryption works on both native and WASM.
42// #![cfg(not(target_arch = "wasm32"))]
43
44// The StorageBackend trait that EncryptedStorage implements.
45use super::StorageBackend;
46// Our internal data types.
47use crate::engine::types::{DbError, LogEntry};
48// base64 encoding/decoding — used to safely store binary ciphertext as text.
49use base64::{Engine as _, engine::general_purpose::STANDARD};
50// XChaCha20-Poly1305 AEAD cipher from the chacha20poly1305 crate.
51use chacha20poly1305::{
52 aead::{Aead, KeyInit}, // Aead = Authenticated Encryption with Associated Data
53 XChaCha20Poly1305, // The cipher type
54 XNonce, // 24-byte nonce type
55 Key, // 32-byte key type
56};
57// OsRng = cryptographically secure random number generator from the OS.
58// RngCore = trait that provides fill_bytes() for generating random bytes.
59use rand_core::{OsRng, RngCore};
// ControlFlow = lets a streaming callback signal "keep going" or "stop early".
// Arc = thread-safe reference-counted pointer for shared ownership.
61use std::ops::ControlFlow;
62use std::sync::Arc;
63
64/// Transparent encryption wrapper around any StorageBackend.
65///
66/// All data written through this wrapper is encrypted before reaching the
67/// inner storage, and decrypted when read back. The encryption is completely
68/// transparent to the rest of the database engine.
69pub struct EncryptedStorage {
70 /// The underlying storage backend (e.g. AsyncDiskStorage).
71 /// Arc<dyn StorageBackend> means "a thread-safe pointer to any type that
72 /// implements StorageBackend" — this is Rust's way of doing polymorphism
73 /// at runtime (similar to an interface in Java/C#).
74 inner: Arc<dyn StorageBackend>,
75 /// The initialized cipher instance, ready to encrypt/decrypt.
76 /// XChaCha20Poly1305 holds the expanded key schedule internally.
77 cipher: XChaCha20Poly1305,
78}
79
80impl EncryptedStorage {
81 /// Create a new EncryptedStorage wrapping `inner` with the given 32-byte key.
82 ///
83 /// Call `derive_key()` first to turn a human-readable password into a key.
84 /// The key must be exactly 32 bytes (256 bits) — the size XChaCha20 requires.
85 pub fn new(inner: Arc<dyn StorageBackend>, master_key: &[u8; 32]) -> Self {
86 // Key::from_slice() wraps the raw bytes in the Key newtype.
87 // XChaCha20Poly1305::new() expands the key into the cipher's internal state.
88 let key = Key::from_slice(master_key);
89 Self {
90 inner,
91 cipher: XChaCha20Poly1305::new(key),
92 }
93 }
94
95 /// Derive a deterministic 32-byte encryption key from a password string
96 /// using the Argon2id algorithm.
97 ///
98 /// `salt_context` is typically the database file path — this ensures that
99 /// the same password produces different keys for different database files,
100 /// preventing cross-database attacks.
101 ///
102 /// Argon2id is intentionally slow (memory-hard) to make brute-force attacks
103 /// expensive. The default parameters require ~64 MB of RAM and ~0.5 s of CPU.
104 pub fn derive_key(password: &str, salt_context: &str) -> [u8; 32] {
105 use argon2::{Argon2, PasswordHasher};
106 use argon2::password_hash::SaltString;
107
108 // Argon2 requires a base64-encoded salt of at least 8 bytes.
109 // We build a deterministic 22-character salt from the context string
110 // (22 base64 chars = 16 bytes of entropy, well above the minimum).
111 // The format!("{:0<22}", ...) pads the string to 22 chars with '0's.
112 let raw_salt = format!("{:0<22}", &salt_context[..salt_context.len().min(22)]);
113 // SaltString::from_b64 validates the base64 encoding.
114 // If it fails (e.g. invalid chars in the path), fall back to a known-good salt.
115 let salt = SaltString::from_b64(&raw_salt)
116 .unwrap_or_else(|_| SaltString::from_b64("bW9sdGVuZGJkZWZhdWx0").unwrap());
117
118 // Argon2::default() uses Argon2id variant with standard parameters.
119 let argon2 = Argon2::default();
120 // hash_password() runs the Argon2 algorithm and returns a PHC string.
121 let hash = argon2
122 .hash_password(password.as_bytes(), &salt)
123 .expect("Argon2 key derivation failed");
124
125 // Extract the raw 32-byte hash output from the PHC string.
126 // The PHC format includes the algorithm, parameters, salt, and hash.
127 let hash_output = hash.hash.expect("Argon2 produced no hash output");
128 let bytes = hash_output.as_bytes();
129 // Copy the first 32 bytes into our fixed-size key array.
130 let mut key = [0u8; 32];
131 key.copy_from_slice(&bytes[..32]);
132 key
133 }
134
135 /// Encrypt a single LogEntry and return a new "ENC" LogEntry whose `value`
136 /// field contains the base64-encoded ciphertext.
137 ///
138 /// A fresh random 24-byte nonce is generated for every single write.
139 /// Using a unique nonce per message is critical for security — reusing a
140 /// nonce with the same key would completely break the encryption.
141 fn encrypt_entry(&self, entry: &LogEntry) -> Result<LogEntry, DbError> {
142 // Step 1: Serialize the real entry to a JSON string.
143 let plain_json = serde_json::to_string(entry)?;
144
145 // Step 2: Generate a cryptographically random 24-byte nonce.
146 // OsRng reads from /dev/urandom (Linux) or BCryptGenRandom (Windows).
147 let mut nonce_bytes = [0u8; 24];
148 OsRng.fill_bytes(&mut nonce_bytes);
149 // XNonce::from_slice() wraps the bytes in the nonce newtype.
150 let nonce = XNonce::from_slice(&nonce_bytes);
151
152 // Step 3: Encrypt the plaintext. The cipher also computes a 16-byte
153 // Poly1305 MAC and appends it to the ciphertext automatically.
154 // Result: ciphertext = encrypt(key, nonce, plaintext) || MAC[16 bytes]
155 let cipher_text = self
156 .cipher
157 .encrypt(nonce, plain_json.as_bytes())
158 .map_err(|_| DbError::WriteError)?;
159
160 // Step 4: Prepend the nonce to the ciphertext so we have everything
161 // needed for decryption in one blob: nonce[24] || ciphertext || MAC[16]
162 let mut payload = nonce_bytes.to_vec();
163 payload.extend(cipher_text);
164
165 // Step 5: Base64-encode the binary payload so it can be safely stored
166 // as a JSON string value (JSON doesn't support raw binary).
167 let b64 = STANDARD.encode(&payload);
168
169 // Step 6: Return a sentinel LogEntry that hides the real cmd/collection/key.
170 // The underlying storage only ever sees these opaque ENC entries.
171 Ok(LogEntry::new(
172 "ENC".to_string(),
173 "_".to_string(), // placeholder — real collection is inside the ciphertext
174 "_".to_string(), // placeholder — real key is inside the ciphertext
175 serde_json::json!(b64),
176 ))
177 }
178
179 /// Decrypt a single "ENC" LogEntry and return the original LogEntry.
180 ///
181 /// The Poly1305 MAC is verified automatically during decryption — if the
182 /// ciphertext has been tampered with or the wrong key is used, this returns
183 /// an error instead of silently returning garbage data.
184 fn decrypt_entry(&self, entry: &LogEntry) -> Result<LogEntry, DbError> {
185 // Extract the base64 string from the value field.
186 let b64 = entry.value.as_str().unwrap_or("");
187
188 // Decode from base64 back to raw bytes: nonce[24] || ciphertext || MAC[16]
189 let payload = STANDARD
190 .decode(b64)
191 .map_err(|_| DbError::WriteError)?;
192
193 // Sanity check: the payload must be at least 24 bytes (nonce) + 16 bytes (MAC).
194 if payload.len() < 24 {
195 return Err(DbError::WriteError);
196 }
197
198 // Split the payload into nonce (first 24 bytes) and ciphertext (the rest).
199 // split_at() returns two slices sharing the same underlying memory — no copy.
200 let (nonce_bytes, cipher_text) = payload.split_at(24);
201 let nonce = XNonce::from_slice(nonce_bytes);
202
203 // Decrypt and verify the MAC. If the MAC doesn't match (wrong key or
204 // tampered data), decrypt() returns Err and we propagate it as WriteError.
205 let plain_bytes = self
206 .cipher
207 .decrypt(nonce, cipher_text)
208 .map_err(|_| DbError::WriteError)?;
209
210 // Convert the decrypted bytes back to a UTF-8 string.
211 let plain_json = String::from_utf8(plain_bytes).map_err(|_| DbError::WriteError)?;
212
213 // Deserialize the JSON string back into a LogEntry.
214 serde_json::from_str::<LogEntry>(&plain_json).map_err(|e| DbError::Serialization(e))
215 }
216}
217
218/// Implement the StorageBackend trait so EncryptedStorage can be used anywhere
219/// a StorageBackend is expected — the rest of the engine doesn't know or care
220/// that encryption is happening.
221impl StorageBackend for EncryptedStorage {
222 /// Encrypt `entry` and write the resulting ENC entry to the inner backend.
223 fn write_entry(&self, entry: &LogEntry) -> Result<(), DbError> {
224 let encrypted = self.encrypt_entry(entry)?;
225 // Delegate to the inner backend (e.g. AsyncDiskStorage.write_entry).
226 self.inner.write_entry(&encrypted)
227 }
228
229 /// Read all ENC entries from the inner backend and decrypt them.
230 ///
231 /// Entries that fail to decrypt are skipped with a warning (not a crash)
232 /// so a single corrupt entry doesn't bring down the whole database.
233 /// Unencrypted entries (cmd != "ENC") are passed through unchanged —
234 /// this supports migrating an existing plaintext database to encrypted.
235 fn read_log(&self) -> Result<Vec<LogEntry>, DbError> {
236 // Read the raw (encrypted) entries from the underlying storage.
237 let raw_entries = self.inner.read_log()?;
238 let mut decrypted = Vec::with_capacity(raw_entries.len());
239
240 for entry in raw_entries {
241 if entry.cmd == "ENC" {
242 // This is an encrypted entry — decrypt it.
243 match self.decrypt_entry(&entry) {
244 Ok(real_entry) => decrypted.push(real_entry),
245 Err(e) => {
246 // Log a warning but continue — don't crash on one bad entry.
247 // This can happen if the encryption key changed or the file
248 // was partially corrupted.
249 tracing::warn!("⚠️ Skipping undecryptable log entry: {}", e);
250 }
251 }
252 } else {
253 // Unencrypted legacy entry — pass through as-is.
254 // This allows migrating an existing plaintext database by simply
255 // enabling encryption; old entries are still readable.
256 decrypted.push(entry);
257 }
258 }
259
260 Ok(decrypted)
261 }
262
263 fn stream_log_into(&self, f: &mut dyn FnMut(LogEntry, u32) -> ControlFlow<(), ()>) -> Result<u64, DbError> {
264 let mut count = 0u64;
265 // EncryptedStorage wraps the inner backend. Since it doesn't have a
266 // specialized streaming implementation yet, we fall back to read_log
267 // BUT we need the ENCRYPTED length for the pointers.
268
269 // Use inner.stream_log_into to get the encrypted entries and their lengths.
270 self.inner.stream_log_into(&mut |enc_entry, length| {
271 if enc_entry.cmd == "ENC" {
272 match self.decrypt_entry(&enc_entry) {
273 Ok(real_entry) => {
274 let res = f(real_entry, length);
275 if let ControlFlow::Continue(_) = res {
276 count += 1;
277 }
278 res
279 }
280 Err(e) => {
281 tracing::warn!("⚠️ Skipping undecryptable log entry during streaming: {}", e);
282 ControlFlow::Continue(())
283 }
284 }
285 } else {
286 // Pass through plaintext entry with its original length
287 let res = f(enc_entry, length);
288 if let ControlFlow::Continue(_) = res {
289 count += 1;
290 }
291 res
292 }
293 })?;
294
295 Ok(count)
296 }
297
298 /// Re-encrypt all entries during compaction so the compacted file is fully
299 /// encrypted with no plaintext remnants from a migration.
300 ///
301 /// Each entry gets a fresh random nonce — even if the plaintext is the same
302 /// as before, the ciphertext will be different (this is correct and expected).
303 fn compact(&self, entries: Vec<LogEntry>) -> Result<(), DbError> {
304 // Encrypt every entry. If any encryption fails, the whole compaction fails
305 // (we don't want a partially-encrypted compacted file).
306 // The `.collect::<Result<Vec<_>, _>>()` pattern: if any map() call returns
307 // Err, the whole collect() returns that Err immediately (short-circuits).
308 let encrypted: Result<Vec<LogEntry>, DbError> =
309 entries.iter().map(|e| self.encrypt_entry(e)).collect();
310 // Delegate the actual file writing to the inner backend's compact().
311 self.inner.compact(encrypted?)
312 }
313
314 /// Read exactly `length` bytes from the inner backend and decrypt the entry.
315 ///
316 /// Note: in the encrypted Bitcask model, the pointer refers to the offset
317 /// and length of the ENCRYPTED entry in the log.
318 fn read_at(&self, offset: u64, length: u32) -> Result<Vec<u8>, DbError> {
319 // 1. Read the encrypted bytes from the inner storage.
320 let raw_bytes = self.inner.read_at(offset, length)?;
321
322 // 2. Deserialize the ENC LogEntry.
323 let enc_entry: LogEntry = serde_json::from_slice(&raw_bytes).map_err(DbError::Serialization)?;
324
325 // 3. Decrypt the entry.
326 let decrypted = self.decrypt_entry(&enc_entry)?;
327
328 // 4. Return the original plaintext LogEntry serialized as JSON.
329 // This matches the format expected by the engine (e.g. operations::get).
330 Ok(serde_json::to_vec(&decrypted).map_err(DbError::Serialization)?)
331 }
332}