moltendb_core/engine/storage/encrypted.rs
1// ─── encrypted.rs ────────────────────────────────────────────────────────────
2// This file implements EncryptedStorage — a transparent encryption wrapper
3// around any StorageBackend (e.g. AsyncDiskStorage or SyncDiskStorage).
4//
5// How it works:
6// • write_entry() — serialises the LogEntry to JSON, encrypts the JSON
7// string with XChaCha20-Poly1305, base64-encodes the
8// result, and writes a sentinel "ENC" LogEntry whose
9// `value` field holds the ciphertext. The underlying
10// storage backend sees only opaque ENC entries.
11//
12// • read_log() — reads raw ENC entries from the inner backend, decrypts
13// each one, and returns the original LogEntry objects.
14// Unencrypted entries (from a migration / legacy DB) are
15// passed through unchanged.
16//
17// • compact() — re-encrypts every entry during compaction so the
18// compacted file is fully encrypted with no plaintext
19// remnants left over from a migration.
20//
21// Encryption algorithm: XChaCha20-Poly1305
22// • XChaCha20 is a stream cipher — it generates a keystream that is XOR'd
23// with the plaintext. It's fast and has no timing side-channels.
24// • Poly1305 is a message authentication code (MAC) — it detects any
25// tampering with the ciphertext. If the MAC check fails, decryption
26// returns an error instead of silently returning garbage data.
27// • "X" variant uses a 192-bit (24-byte) nonce instead of 96-bit, making
28// random nonce generation safe even for billions of messages.
29//
30// Key derivation: Argon2id
31// • Argon2id is a memory-hard password hashing algorithm — it's designed
32// to be slow and expensive to brute-force even with GPUs/ASICs.
33// • We derive a deterministic 32-byte key from (password, db_path) so the
34// same password always produces the same key for the same database file.
35//
36// On-disk format of each encrypted entry:
37// {"cmd":"ENC","collection":"_","key":"_","value":"<base64>"}
// where <base64> = base64( nonce[24 bytes] || ciphertext || Poly1305 tag[16 bytes] )
39// ─────────────────────────────────────────────────────────────────────────────
40
41// Encryption works on both native and WASM.
42// #![cfg(not(target_arch = "wasm32"))]
43
44// The StorageBackend trait that EncryptedStorage implements.
45use super::StorageBackend;
46// Our internal data types.
47use crate::engine::types::{DbError, LogEntry};
48// base64 encoding/decoding — used to safely store binary ciphertext as text.
49use base64::{Engine as _, engine::general_purpose::STANDARD};
50// XChaCha20-Poly1305 AEAD cipher from the chacha20poly1305 crate.
51use chacha20poly1305::{
52 aead::{Aead, KeyInit}, // Aead = Authenticated Encryption with Associated Data
53 XChaCha20Poly1305, // The cipher type
54 XNonce, // 24-byte nonce type
55 Key, // 32-byte key type
56};
57// OsRng = cryptographically secure random number generator from the OS.
58// RngCore = trait that provides fill_bytes() for generating random bytes.
59use rand_core::{OsRng, RngCore};
60// Arc = thread-safe reference-counted pointer for shared ownership.
61use std::sync::Arc;
62
63/// Transparent encryption wrapper around any StorageBackend.
64///
65/// All data written through this wrapper is encrypted before reaching the
66/// inner storage, and decrypted when read back. The encryption is completely
67/// transparent to the rest of the database engine.
68pub struct EncryptedStorage {
69 /// The underlying storage backend (e.g. AsyncDiskStorage).
70 /// Arc<dyn StorageBackend> means "a thread-safe pointer to any type that
71 /// implements StorageBackend" — this is Rust's way of doing polymorphism
72 /// at runtime (similar to an interface in Java/C#).
73 inner: Arc<dyn StorageBackend>,
74 /// The initialized cipher instance, ready to encrypt/decrypt.
75 /// XChaCha20Poly1305 holds the expanded key schedule internally.
76 cipher: XChaCha20Poly1305,
77}
78
79impl EncryptedStorage {
80 /// Create a new EncryptedStorage wrapping `inner` with the given 32-byte key.
81 ///
82 /// Call `derive_key()` first to turn a human-readable password into a key.
83 /// The key must be exactly 32 bytes (256 bits) — the size XChaCha20 requires.
84 pub fn new(inner: Arc<dyn StorageBackend>, master_key: &[u8; 32]) -> Self {
85 // Key::from_slice() wraps the raw bytes in the Key newtype.
86 // XChaCha20Poly1305::new() expands the key into the cipher's internal state.
87 let key = Key::from_slice(master_key);
88 Self {
89 inner,
90 cipher: XChaCha20Poly1305::new(key),
91 }
92 }
93
94 /// Derive a deterministic 32-byte encryption key from a password string
95 /// using the Argon2id algorithm.
96 ///
97 /// `salt_context` is typically the database file path — this ensures that
98 /// the same password produces different keys for different database files,
99 /// preventing cross-database attacks.
100 ///
101 /// Argon2id is intentionally slow (memory-hard) to make brute-force attacks
102 /// expensive. The default parameters require ~64 MB of RAM and ~0.5 s of CPU.
103 pub fn derive_key(password: &str, salt_context: &str) -> [u8; 32] {
104 use argon2::{Argon2, PasswordHasher};
105 use argon2::password_hash::SaltString;
106
107 // Argon2 requires a base64-encoded salt of at least 8 bytes.
108 // We build a deterministic 22-character salt from the context string
109 // (22 base64 chars = 16 bytes of entropy, well above the minimum).
110 // The format!("{:0<22}", ...) pads the string to 22 chars with '0's.
111 let raw_salt = format!("{:0<22}", &salt_context[..salt_context.len().min(22)]);
112 // SaltString::from_b64 validates the base64 encoding.
113 // If it fails (e.g. invalid chars in the path), fall back to a known-good salt.
114 let salt = SaltString::from_b64(&raw_salt)
115 .unwrap_or_else(|_| SaltString::from_b64("bW9sdGVuZGJkZWZhdWx0").unwrap());
116
117 // Argon2::default() uses Argon2id variant with standard parameters.
118 let argon2 = Argon2::default();
119 // hash_password() runs the Argon2 algorithm and returns a PHC string.
120 let hash = argon2
121 .hash_password(password.as_bytes(), &salt)
122 .expect("Argon2 key derivation failed");
123
124 // Extract the raw 32-byte hash output from the PHC string.
125 // The PHC format includes the algorithm, parameters, salt, and hash.
126 let hash_output = hash.hash.expect("Argon2 produced no hash output");
127 let bytes = hash_output.as_bytes();
128 // Copy the first 32 bytes into our fixed-size key array.
129 let mut key = [0u8; 32];
130 key.copy_from_slice(&bytes[..32]);
131 key
132 }
133
134 /// Encrypt a single LogEntry and return a new "ENC" LogEntry whose `value`
135 /// field contains the base64-encoded ciphertext.
136 ///
137 /// A fresh random 24-byte nonce is generated for every single write.
138 /// Using a unique nonce per message is critical for security — reusing a
139 /// nonce with the same key would completely break the encryption.
140 fn encrypt_entry(&self, entry: &LogEntry) -> Result<LogEntry, DbError> {
141 // Step 1: Serialize the real entry to a JSON string.
142 let plain_json = serde_json::to_string(entry)?;
143
144 // Step 2: Generate a cryptographically random 24-byte nonce.
145 // OsRng reads from /dev/urandom (Linux) or BCryptGenRandom (Windows).
146 let mut nonce_bytes = [0u8; 24];
147 OsRng.fill_bytes(&mut nonce_bytes);
148 // XNonce::from_slice() wraps the bytes in the nonce newtype.
149 let nonce = XNonce::from_slice(&nonce_bytes);
150
151 // Step 3: Encrypt the plaintext. The cipher also computes a 16-byte
152 // Poly1305 MAC and appends it to the ciphertext automatically.
153 // Result: ciphertext = encrypt(key, nonce, plaintext) || MAC[16 bytes]
154 let cipher_text = self
155 .cipher
156 .encrypt(nonce, plain_json.as_bytes())
157 .map_err(|_| DbError::WriteError)?;
158
159 // Step 4: Prepend the nonce to the ciphertext so we have everything
160 // needed for decryption in one blob: nonce[24] || ciphertext || MAC[16]
161 let mut payload = nonce_bytes.to_vec();
162 payload.extend(cipher_text);
163
164 // Step 5: Base64-encode the binary payload so it can be safely stored
165 // as a JSON string value (JSON doesn't support raw binary).
166 let b64 = STANDARD.encode(&payload);
167
168 // Step 6: Return a sentinel LogEntry that hides the real cmd/collection/key.
169 // The underlying storage only ever sees these opaque ENC entries.
170 Ok(LogEntry {
171 cmd: "ENC".to_string(),
172 collection: "_".to_string(), // placeholder — real collection is inside the ciphertext
173 key: "_".to_string(), // placeholder — real key is inside the ciphertext
174 value: serde_json::json!(b64),
175 })
176 }
177
178 /// Decrypt a single "ENC" LogEntry and return the original LogEntry.
179 ///
180 /// The Poly1305 MAC is verified automatically during decryption — if the
181 /// ciphertext has been tampered with or the wrong key is used, this returns
182 /// an error instead of silently returning garbage data.
183 fn decrypt_entry(&self, entry: &LogEntry) -> Result<LogEntry, DbError> {
184 // Extract the base64 string from the value field.
185 let b64 = entry.value.as_str().unwrap_or("");
186
187 // Decode from base64 back to raw bytes: nonce[24] || ciphertext || MAC[16]
188 let payload = STANDARD
189 .decode(b64)
190 .map_err(|_| DbError::WriteError)?;
191
192 // Sanity check: the payload must be at least 24 bytes (nonce) + 16 bytes (MAC).
193 if payload.len() < 24 {
194 return Err(DbError::WriteError);
195 }
196
197 // Split the payload into nonce (first 24 bytes) and ciphertext (the rest).
198 // split_at() returns two slices sharing the same underlying memory — no copy.
199 let (nonce_bytes, cipher_text) = payload.split_at(24);
200 let nonce = XNonce::from_slice(nonce_bytes);
201
202 // Decrypt and verify the MAC. If the MAC doesn't match (wrong key or
203 // tampered data), decrypt() returns Err and we propagate it as WriteError.
204 let plain_bytes = self
205 .cipher
206 .decrypt(nonce, cipher_text)
207 .map_err(|_| DbError::WriteError)?;
208
209 // Convert the decrypted bytes back to a UTF-8 string.
210 let plain_json = String::from_utf8(plain_bytes).map_err(|_| DbError::WriteError)?;
211
212 // Deserialize the JSON string back into a LogEntry.
213 serde_json::from_str::<LogEntry>(&plain_json).map_err(|e| DbError::Serialization(e))
214 }
215}
216
217/// Implement the StorageBackend trait so EncryptedStorage can be used anywhere
218/// a StorageBackend is expected — the rest of the engine doesn't know or care
219/// that encryption is happening.
220impl StorageBackend for EncryptedStorage {
221 /// Encrypt `entry` and write the resulting ENC entry to the inner backend.
222 fn write_entry(&self, entry: &LogEntry) -> Result<(), DbError> {
223 let encrypted = self.encrypt_entry(entry)?;
224 // Delegate to the inner backend (e.g. AsyncDiskStorage.write_entry).
225 self.inner.write_entry(&encrypted)
226 }
227
228 /// Read all ENC entries from the inner backend and decrypt them.
229 ///
230 /// Entries that fail to decrypt are skipped with a warning (not a crash)
231 /// so a single corrupt entry doesn't bring down the whole database.
232 /// Unencrypted entries (cmd != "ENC") are passed through unchanged —
233 /// this supports migrating an existing plaintext database to encrypted.
234 fn read_log(&self) -> Result<Vec<LogEntry>, DbError> {
235 // Read the raw (encrypted) entries from the underlying storage.
236 let raw_entries = self.inner.read_log()?;
237 let mut decrypted = Vec::with_capacity(raw_entries.len());
238
239 for entry in raw_entries {
240 if entry.cmd == "ENC" {
241 // This is an encrypted entry — decrypt it.
242 match self.decrypt_entry(&entry) {
243 Ok(real_entry) => decrypted.push(real_entry),
244 Err(e) => {
245 // Log a warning but continue — don't crash on one bad entry.
246 // This can happen if the encryption key changed or the file
247 // was partially corrupted.
248 tracing::warn!("⚠️ Skipping undecryptable log entry: {}", e);
249 }
250 }
251 } else {
252 // Unencrypted legacy entry — pass through as-is.
253 // This allows migrating an existing plaintext database by simply
254 // enabling encryption; old entries are still readable.
255 decrypted.push(entry);
256 }
257 }
258
259 Ok(decrypted)
260 }
261
262 fn stream_log_into(&self, f: &mut dyn FnMut(LogEntry, u32)) -> Result<u64, DbError> {
263 let mut count = 0u64;
264 // EncryptedStorage wraps the inner backend. Since it doesn't have a
265 // specialized streaming implementation yet, we fall back to read_log
266 // BUT we need the ENCRYPTED length for the pointers.
267
268 // Use inner.stream_log_into to get the encrypted entries and their lengths.
269 self.inner.stream_log_into(&mut |enc_entry, length| {
270 if enc_entry.cmd == "ENC" {
271 match self.decrypt_entry(&enc_entry) {
272 Ok(real_entry) => {
273 f(real_entry, length);
274 count += 1;
275 }
276 Err(e) => {
277 tracing::warn!("⚠️ Skipping undecryptable log entry during streaming: {}", e);
278 }
279 }
280 } else {
281 // Pass through plaintext entry with its original length
282 f(enc_entry, length);
283 count += 1;
284 }
285 })?;
286
287 Ok(count)
288 }
289
290 /// Re-encrypt all entries during compaction so the compacted file is fully
291 /// encrypted with no plaintext remnants from a migration.
292 ///
293 /// Each entry gets a fresh random nonce — even if the plaintext is the same
294 /// as before, the ciphertext will be different (this is correct and expected).
295 fn compact(&self, entries: Vec<LogEntry>) -> Result<(), DbError> {
296 // Encrypt every entry. If any encryption fails, the whole compaction fails
297 // (we don't want a partially-encrypted compacted file).
298 // The `.collect::<Result<Vec<_>, _>>()` pattern: if any map() call returns
299 // Err, the whole collect() returns that Err immediately (short-circuits).
300 let encrypted: Result<Vec<LogEntry>, DbError> =
301 entries.iter().map(|e| self.encrypt_entry(e)).collect();
302 // Delegate the actual file writing to the inner backend's compact().
303 self.inner.compact(encrypted?)
304 }
305
306 /// Read exactly `length` bytes from the inner backend and decrypt the entry.
307 ///
308 /// Note: in the encrypted Bitcask model, the pointer refers to the offset
309 /// and length of the ENCRYPTED entry in the log.
310 fn read_at(&self, offset: u64, length: u32) -> Result<Vec<u8>, DbError> {
311 // 1. Read the encrypted bytes from the inner storage.
312 let raw_bytes = self.inner.read_at(offset, length)?;
313
314 // 2. Deserialize the ENC LogEntry.
315 let enc_entry: LogEntry = serde_json::from_slice(&raw_bytes).map_err(DbError::Serialization)?;
316
317 // 3. Decrypt the entry.
318 let decrypted = self.decrypt_entry(&enc_entry)?;
319
320 // 4. Return the original plaintext LogEntry serialized as JSON.
321 // This matches the format expected by the engine (e.g. operations::get).
322 Ok(serde_json::to_vec(&decrypted).map_err(DbError::Serialization)?)
323 }
324}