// paygress/luks.rs
1// LUKS-on-loop helpers for consumer-encrypted persistent volumes.
2//
3// Phase 2 of the volume-encryption work. Phase 1 (PR #46) shipped the
4// wire format + KDF; this module is what actually encrypts the bytes
5// on disk so the host operator's post-eviction `tar` reveals only
6// ciphertext.
7//
8// Layout on the host
9// ------------------
10// /var/lib/paygress/volumes/<id>.luks — sparse file, LUKS2 header + payload
11// /dev/mapper/paygress-<id>-luks — kernel device-mapper alias (after luksOpen)
12// /var/lib/paygress/mounts/<id>/ — ext4 mountpoint (the `-v` bind source)
13//
14// Lifecycle
15// ---------
16// `create_encrypted_volume` does the full create-format-open-mkfs-mount
17// dance, returning a handle whose `mount_path` the docker backend
18// bind-mounts into the container. `destroy_encrypted_volume` is the
19// inverse: umount, luksClose, luksErase (overwrites all keyslots so
20// the file's ciphertext is unrecoverable even by the host operator
21// who held the disk image), then rm.
22//
23// Idempotency
24// -----------
25// Both creation and destruction are best-effort idempotent:
26// - create rolls back any partial state on failure (so a half-
27// formatted file doesn't trap a future spawn at the same id),
28// - destroy never errors on "not present" — a half-leaked mapper
29// entry from a crashed previous run gets cleaned up on the next
30// `delete_container`.
31//
32// Why shell-out to cryptsetup
33// ---------------------------
34// libcryptsetup-rs exists, but it links against libcryptsetup (the
35// system C library) and hauls a large unsafe surface into the
36// process. Shelling out to `/sbin/cryptsetup` keeps the LUKS code
37// path entirely in a child process — easier to audit, easier to
38// strace, and matches how every other paygress subprocess (docker,
// nginx) is invoked. Performance is irrelevant: we exec cryptsetup
// only a handful of times per workload lifetime (during create and
// destroy).
41//
42// Threat model recap (mirrors the wire-format doc on
43// `nostr::VolumeEncryption`):
44// - Defends: post-eviction disk forensics, lazy host-operator
45// backups, co-tenant attacks on shared storage, cold-disk
46// seizure.
47// - Does NOT defend: live host kernel reading /proc/<pid>/mem or
48// extracting the LUKS key from the kernel keyring while the
49// workload runs. That requires hardware confidential VMs
50// (SEV-SNP / TDX), gated behind the `attested-research-tier`
51// `IsolationLevel`.
// - The key is fed to `cryptsetup` via stdin (`--key-file=-`) so it
//   never appears on the command line (where `ps` would leak it).
//   Provider holds the key only in memory, dropped when
//   `ContainerConfig` goes out of scope.
56
57use std::path::PathBuf;
58use std::process::Stdio;
59
60use anyhow::{Context, Result};
61use tokio::io::AsyncWriteExt;
62use tokio::process::Command;
63use tracing::{debug, info, warn};
64
/// Root directory for paygress-managed encrypted volumes. The path
/// helpers in this module derive two subdirectories from it:
/// - `volumes/<id>.luks` — sparse files holding LUKS2 containers.
/// - `mounts/<id>/` — ext4 mountpoints bind-mounted into the
///   container at `data_path`.
const VOLUME_ROOT: &str = "/var/lib/paygress";
71
/// Device-mapper name used for the open LUKS volume of workload `id`.
///
/// The name is a pure function of `id` (`paygress-<id>-luks`), so cleanup
/// code can recompute it without any stored state.
fn mapper_name(id: u32) -> String {
    let mut name = String::from("paygress-");
    name.push_str(&id.to_string());
    name.push_str("-luks");
    name
}
77
78/// Sparse file backing the LUKS container.
79fn image_path(id: u32) -> PathBuf {
80 PathBuf::from(VOLUME_ROOT)
81 .join("volumes")
82 .join(format!("{}.luks", id))
83}
84
85/// Mountpoint where the open LUKS volume's ext4 lives.
86fn mount_path(id: u32) -> PathBuf {
87 PathBuf::from(VOLUME_ROOT)
88 .join("mounts")
89 .join(id.to_string())
90}
91
92/// Fully-resolved /dev/mapper path (what `mount` and Docker bind
93/// mounts care about).
94fn mapper_device(id: u32) -> PathBuf {
95 PathBuf::from("/dev/mapper").join(mapper_name(id))
96}
97
/// Handle returned by `create_encrypted_volume` once the volume has been
/// created, opened and mounted.
///
/// The handle is plain data: dropping it does nothing. Teardown is always
/// explicit, via `destroy_encrypted_volume` (which the docker backend calls
/// from `delete_container`), so that retry paths cannot destroy a volume
/// twice by accident.
#[derive(Clone, Debug)]
pub struct EncryptedVolume {
    /// Workload id the volume belongs to.
    pub id: u32,
    /// Host directory where the volume is mounted; the Docker backend
    /// bind-mounts this path at `data_path` inside the container.
    pub mount_path: PathBuf,
}
110
111/// Verify cryptsetup is on PATH. Provider should call this at
112/// startup if any template it serves has `data_path: Some(_)` and
113/// the operator has not opted out of consumer-encrypted volumes.
114/// Returns the version string so the operator can log what they
115/// got.
116pub async fn check_cryptsetup_available() -> Result<String> {
117 let out = Command::new("cryptsetup")
118 .arg("--version")
119 .output()
120 .await
121 .context(
122 "cryptsetup binary not found on PATH; install cryptsetup or disable encrypted-volume support",
123 )?;
124 if !out.status.success() {
125 anyhow::bail!(
126 "cryptsetup --version returned non-zero: {}",
127 String::from_utf8_lossy(&out.stderr)
128 );
129 }
130 Ok(String::from_utf8_lossy(&out.stdout).trim().to_string())
131}
132
133/// Create + format + open + mount a LUKS-encrypted volume for the
134/// given workload id. Returns the mount path the caller should bind
135/// into the container.
136///
137/// On failure, attempts to roll back any partial state (close mapper,
138/// rm sparse file) so a retry at the same id starts clean.
139pub async fn create_encrypted_volume(
140 id: u32,
141 size_gb: u32,
142 key: &[u8; 32],
143) -> Result<EncryptedVolume> {
144 let img = image_path(id);
145 let mnt = mount_path(id);
146 let mapper = mapper_device(id);
147 let mapper_n = mapper_name(id);
148
149 info!(
150 "Creating LUKS-encrypted data volume: id={} size={}G image={}",
151 id,
152 size_gb,
153 img.display()
154 );
155
156 // 0. Pre-create cleanup. A previous spawn at the same id may
157 // have left a `/dev/mapper/paygress-<id>-luks` entry behind
158 // (e.g. our own `destroy_encrypted_volume` lazy-umount'd the
159 // mountpoint and the kernel hadn't released it by the time
160 // `luksClose` ran, so `luksClose` saw EBUSY and silently
161 // failed). Subsequent spawns at the same id then trip on
162 // `luksOpen: device already exists`. Make the create path
163 // self-healing by running destroy first — it's idempotent
164 // and a no-op when nothing is leftover.
165 if let Err(e) = destroy_encrypted_volume(id).await {
166 warn!(
167 "pre-create cleanup of id={} returned {}; continuing — \
168 create steps will surface any persistent state",
169 id, e
170 );
171 }
172
173 // 1. mkdir -p the parent directories. Both volumes/ and mounts/
174 // must exist before the next steps; they survive across
175 // spawns (best-effort once-per-host).
176 tokio::fs::create_dir_all(img.parent().unwrap())
177 .await
178 .context("create volumes/ directory")?;
179 tokio::fs::create_dir_all(&mnt)
180 .await
181 .context("create mountpoint directory")?;
182
183 // 2. Truncate to size. Sparse — only consumes disk on write.
184 // `truncate -s` is portable across the GNU coreutils on
185 // every Linux paygress runs on.
186 let bytes = (size_gb as u64) * 1024 * 1024 * 1024;
187 let img_str = img.to_string_lossy().to_string();
188 let trunc = Command::new("truncate")
189 .args(["-s", &bytes.to_string(), &img_str])
190 .output()
191 .await
192 .context("invoke truncate")?;
193 if !trunc.status.success() {
194 anyhow::bail!(
195 "truncate failed: {}",
196 String::from_utf8_lossy(&trunc.stderr)
197 );
198 }
199
200 // 3. luksFormat with the consumer key on stdin (--key-file=-).
201 // --batch-mode skips the interactive "are you sure" prompt;
202 // --type luks2 picks the modern header format with proper
203 // PBKDF2 + AEAD; defaults are fine for AES-XTS-Plain64.
204 if let Err(e) = run_with_key_stdin(
205 "cryptsetup",
206 &[
207 "luksFormat",
208 "--type",
209 "luks2",
210 "--batch-mode",
211 "--key-file=-",
212 &img_str,
213 ],
214 key,
215 )
216 .await
217 {
218 // Roll back: the truncate-d file is unusable junk. Don't
219 // leave it behind.
220 let _ = tokio::fs::remove_file(&img).await;
221 return Err(e.context("cryptsetup luksFormat"));
222 }
223
224 // 4. luksOpen → /dev/mapper/paygress-<id>-luks. Same key on
225 // stdin. After this the kernel device-mapper holds the key
226 // in keyring memory (visible to root via `dmsetup info`,
227 // which is exactly the threat-model boundary we documented).
228 if let Err(e) = run_with_key_stdin(
229 "cryptsetup",
230 &["luksOpen", "--key-file=-", &img_str, &mapper_n],
231 key,
232 )
233 .await
234 {
235 let _ = tokio::fs::remove_file(&img).await;
236 return Err(e.context("cryptsetup luksOpen"));
237 }
238
239 // 5. mkfs.ext4 on the mapper device. -F forces over any stale
240 // signature (a re-spawn at the same id with a new key would
241 // otherwise see leftover ext4 magic from a prior tenancy and
242 // refuse to reformat).
243 let mapper_str = mapper.to_string_lossy().to_string();
244 let mkfs = Command::new("mkfs.ext4")
245 .args(["-F", &mapper_str])
246 .output()
247 .await
248 .context("invoke mkfs.ext4")?;
249 if !mkfs.status.success() {
250 // Roll back: close the mapper, then drop the file.
251 let _ = run("cryptsetup", &["luksClose", &mapper_n]).await;
252 let _ = tokio::fs::remove_file(&img).await;
253 anyhow::bail!(
254 "mkfs.ext4 failed: {}",
255 String::from_utf8_lossy(&mkfs.stderr)
256 );
257 }
258
259 // 6. mount to /var/lib/paygress/mounts/<id>. The Docker backend
260 // bind-mounts this path at the template's `data_path`.
261 let mnt_str = mnt.to_string_lossy().to_string();
262 let mount = Command::new("mount")
263 .args([&mapper_str, &mnt_str])
264 .output()
265 .await
266 .context("invoke mount")?;
267 if !mount.status.success() {
268 let _ = run("cryptsetup", &["luksClose", &mapper_n]).await;
269 let _ = tokio::fs::remove_file(&img).await;
270 anyhow::bail!("mount failed: {}", String::from_utf8_lossy(&mount.stderr));
271 }
272
273 info!(
274 "LUKS volume id={} ready: mounted at {} (mapper {})",
275 id,
276 mnt.display(),
277 mapper.display()
278 );
279 Ok(EncryptedVolume {
280 id,
281 mount_path: mnt,
282 })
283}
284
285/// Tear down everything `create_encrypted_volume` set up. Idempotent
286/// — never errors on "already gone". Order matters:
287/// 1. umount the ext4 (releases the kernel block device handle)
288/// 2. luksClose (releases the mapper entry + the LUKS key from
289/// keyring memory)
290/// 3. luksErase (overwrites all keyslots → the underlying file's
291/// ciphertext is unrecoverable, even if the operator copied the
292/// file before this step ran)
293/// 4. rm the sparse file (free disk space; defense-in-depth even
294/// after luksErase)
295/// 5. rmdir the mountpoint (cosmetic; keeps /var/lib/paygress/mounts
296/// tidy)
297pub async fn destroy_encrypted_volume(id: u32) -> Result<()> {
298 let img = image_path(id);
299 let mnt = mount_path(id);
300 let mapper_n = mapper_name(id);
301 let img_str = img.to_string_lossy().to_string();
302 let mnt_str = mnt.to_string_lossy().to_string();
303
304 debug!("Destroying LUKS volume id={}", id);
305
306 // 1. umount. -l (lazy) handles the case where the container is
307 // still holding a file open during teardown — the kernel
308 // detaches the mount the moment the last reference drops.
309 if mnt.exists() {
310 let out = Command::new("umount").args(["-l", &mnt_str]).output().await;
311 match out {
312 Ok(o) if !o.status.success() => {
313 let stderr = String::from_utf8_lossy(&o.stderr);
314 if !stderr.contains("not mounted") {
315 warn!("umount {} non-fatal error: {}", mnt_str, stderr.trim());
316 }
317 }
318 Err(e) => warn!("umount {} could not exec: {}", mnt_str, e),
319 _ => {}
320 }
321 }
322
323 // 2. luksClose. Idempotent: cryptsetup returns 0 on success and
324 // a non-zero on "not active", which we tolerate.
325 let _ = run("cryptsetup", &["luksClose", &mapper_n]).await;
326
327 // 3. luksErase wipes ALL keyslots without needing the original
328 // key (--batch-mode bypasses the "are you really sure" prompt).
329 // After this, the LUKS header has no recoverable keyslot;
330 // even if the operator extracted the file before step 4,
331 // the AES-XTS payload is unreachable.
332 if img.exists() {
333 let out = Command::new("cryptsetup")
334 .args(["luksErase", "--batch-mode", &img_str])
335 .output()
336 .await;
337 if let Ok(o) = out {
338 if !o.status.success() {
339 warn!(
340 "cryptsetup luksErase {} non-fatal: {}",
341 img_str,
342 String::from_utf8_lossy(&o.stderr).trim()
343 );
344 }
345 }
346 }
347
348 // 4. rm the sparse file. Best-effort; the disk space matters
349 // more than the ciphertext (which is keyless after step 3).
350 if img.exists() {
351 if let Err(e) = tokio::fs::remove_file(&img).await {
352 warn!("remove {} non-fatal: {}", img.display(), e);
353 }
354 }
355
356 // 5. rmdir the mountpoint. Cosmetic.
357 if mnt.exists() {
358 let _ = tokio::fs::remove_dir(&mnt).await;
359 }
360
361 Ok(())
362}
363
364/// Spawn `prog` with `args` and feed `key` on stdin (for cryptsetup
365/// `--key-file=-`). The key bytes never appear on the command line
366/// (where `ps` would expose them) or in any log.
367async fn run_with_key_stdin(prog: &str, args: &[&str], key: &[u8; 32]) -> Result<()> {
368 let mut child = Command::new(prog)
369 .args(args)
370 .stdin(Stdio::piped())
371 .stdout(Stdio::piped())
372 .stderr(Stdio::piped())
373 .spawn()
374 .with_context(|| format!("spawn {}", prog))?;
375 {
376 let stdin = child.stdin.as_mut().context("child stdin not piped")?;
377 stdin.write_all(key).await.context("write key to stdin")?;
378 stdin.shutdown().await.context("close key stdin")?;
379 }
380 let out = child
381 .wait_with_output()
382 .await
383 .with_context(|| format!("wait for {}", prog))?;
384 if !out.status.success() {
385 anyhow::bail!(
386 "{} {:?} failed: {}",
387 prog,
388 args,
389 String::from_utf8_lossy(&out.stderr)
390 );
391 }
392 Ok(())
393}
394
395/// Spawn `prog` with `args` (no stdin), best-effort silent. Returns
396/// the success bool so callers can log without short-circuiting on
397/// "not present" cleanups.
398async fn run(prog: &str, args: &[&str]) -> bool {
399 Command::new(prog)
400 .args(args)
401 .stdout(Stdio::null())
402 .stderr(Stdio::null())
403 .status()
404 .await
405 .map(|s| s.success())
406 .unwrap_or(false)
407}
408
#[cfg(test)]
mod tests {
    use super::*;

    /// Image and mountpoint paths sit under `VOLUME_ROOT` and embed the id;
    /// the mapper device is the expected `/dev/mapper` entry.
    #[test]
    fn paths_are_id_scoped_and_under_volume_root() {
        let (img, mnt, dev) = (image_path(42), mount_path(42), mapper_device(42));

        assert!(
            img.starts_with(VOLUME_ROOT),
            "image not under VOLUME_ROOT: {}",
            img.display()
        );
        assert!(
            mnt.starts_with(VOLUME_ROOT),
            "mount not under VOLUME_ROOT: {}",
            mnt.display()
        );

        assert_eq!(img.file_name(), Some(std::ffi::OsStr::new("42.luks")));
        assert_eq!(mnt.file_name(), Some(std::ffi::OsStr::new("42")));
        assert_eq!(PathBuf::from("/dev/mapper/paygress-42-luks"), dev);
    }

    /// Mapper names differ between ids and follow `paygress-<id>-luks`.
    #[test]
    fn mapper_name_is_distinct_per_id() {
        let (first, second) = (mapper_name(1), mapper_name(2));
        assert_ne!(first, second);
        assert_eq!(mapper_name(7), "paygress-7-luks");
    }

    /// Two different ids never share an image file or a mountpoint.
    #[test]
    fn paths_for_different_ids_do_not_collide() {
        let (a, b) = (1, 2);
        assert_ne!(image_path(a), image_path(b));
        assert_ne!(mount_path(a), mount_path(b));
    }

    /// `destroy_encrypted_volume` must succeed when nothing exists at the
    /// given id. `create_encrypted_volume` runs it as a pre-create cleanup,
    /// so an error on "nothing to clean up" would show up on every fresh
    /// host.
    ///
    /// Marked `#[ignore]` because it shells out to system tools
    /// (`cryptsetup`, `umount`) and touches the real filesystem; it runs as
    /// part of the VPS acceptance suite, not on a build host.
    #[tokio::test]
    #[ignore]
    async fn destroy_is_a_no_op_when_nothing_exists() {
        // A high id, chosen to make a collision with a real spawn on the
        // host unlikely.
        let outcome = destroy_encrypted_volume(99_999).await;
        assert!(
            outcome.is_ok(),
            "destroy_encrypted_volume must succeed on a never-created id, got {:?}",
            outcome
        );
    }
}