Skip to main content

paygress/
luks.rs

1// LUKS-on-loop helpers for consumer-encrypted persistent volumes.
2//
3// Phase 2 of the volume-encryption work. Phase 1 (PR #46) shipped the
4// wire format + KDF; this module is what actually encrypts the bytes
5// on disk so the host operator's post-eviction `tar` reveals only
6// ciphertext.
7//
8// Layout on the host
9// ------------------
10// /var/lib/paygress/volumes/<id>.luks   — sparse file, LUKS2 header + payload
11// /dev/mapper/paygress-<id>-luks        — kernel device-mapper alias (after luksOpen)
12// /var/lib/paygress/mounts/<id>/        — ext4 mountpoint (the `-v` bind source)
13//
14// Lifecycle
15// ---------
16// `create_encrypted_volume` does the full create-format-open-mkfs-mount
17// dance, returning a handle whose `mount_path` the docker backend
18// bind-mounts into the container. `destroy_encrypted_volume` is the
19// inverse: umount, luksClose, luksErase (overwrites all keyslots so
20// the file's ciphertext is unrecoverable even by the host operator
21// who held the disk image), then rm.
22//
23// Idempotency
24// -----------
25// Both creation and destruction are best-effort idempotent:
26//   - create rolls back any partial state on failure (so a half-
27//     formatted file doesn't trap a future spawn at the same id),
28//   - destroy never errors on "not present" — a half-leaked mapper
29//     entry from a crashed previous run gets cleaned up on the next
30//     `delete_container`.
31//
32// Why shell-out to cryptsetup
33// ---------------------------
34// libcryptsetup-rs exists, but it links against libcryptsetup (the
35// system C library) and hauls a large unsafe surface into the
36// process. Shelling out to `/sbin/cryptsetup` keeps the LUKS code
37// path entirely in a child process — easier to audit, easier to
38// strace, and matches how every other paygress subprocess (docker,
39// nginx) is invoked. Performance is irrelevant: we exec cryptsetup
40// only a handful of times per workload lifetime (at create and destroy).
41//
42// Threat model recap (mirrors the wire-format doc on
43// `nostr::VolumeEncryption`):
44//   - Defends: post-eviction disk forensics, lazy host-operator
45//     backups, co-tenant attacks on shared storage, cold-disk
46//     seizure.
47//   - Does NOT defend: live host kernel reading /proc/<pid>/mem or
48//     extracting the LUKS key from the kernel keyring while the
49//     workload runs. That requires hardware confidential VMs
50//     (SEV-SNP / TDX), gated behind the `attested-research-tier`
51//     `IsolationLevel`.
52//   - The key is fed to `cryptsetup` via stdin (key-file=-) so it
53//     never appears on the command line (where `ps` would leak it).
54//     Provider holds the key only in memory, dropped when
55//     `ContainerConfig` goes out of scope.
56
57use std::path::PathBuf;
58use std::process::Stdio;
59
60use anyhow::{Context, Result};
61use tokio::io::AsyncWriteExt;
62use tokio::process::Command;
63use tracing::{debug, info, warn};
64
/// Base directory under which paygress keeps everything related to
/// encrypted volumes:
/// - `volumes/<id>.luks` — the sparse LUKS2 container files.
/// - `mounts/<id>/`      — the ext4 mountpoints that get bind-mounted
///                         into containers.
const VOLUME_ROOT: &str = "/var/lib/paygress";

/// Device-mapper name used for workload `id`'s open LUKS volume.
/// Derived purely from `id`, so the same name can be recomputed
/// later (e.g. during cleanup) without any stored state.
fn mapper_name(id: u32) -> String {
    ["paygress-", &id.to_string(), "-luks"].concat()
}

/// Location of the sparse file that backs workload `id`'s LUKS
/// container: `<VOLUME_ROOT>/volumes/<id>.luks`.
fn image_path(id: u32) -> PathBuf {
    let mut path = PathBuf::from(VOLUME_ROOT);
    path.push("volumes");
    path.push(format!("{}.luks", id));
    path
}

/// Directory where workload `id`'s decrypted ext4 filesystem is
/// mounted: `<VOLUME_ROOT>/mounts/<id>`.
fn mount_path(id: u32) -> PathBuf {
    let mut path = PathBuf::from(VOLUME_ROOT);
    path.push("mounts");
    path.push(format!("{}", id));
    path
}

/// Absolute `/dev/mapper/<mapper_name>` path of workload `id`'s
/// opened LUKS device (the block device handed to `mkfs` / `mount`).
fn mapper_device(id: u32) -> PathBuf {
    let mut device = PathBuf::from("/dev/mapper");
    device.push(mapper_name(id));
    device
}
97
/// Handle describing an encrypted volume that has been created,
/// opened and mounted by `create_encrypted_volume`.
///
/// The handle is plain data: it carries no `Drop` logic, so letting
/// it go out of scope leaves the volume mounted. Teardown is always
/// an explicit call to `destroy_encrypted_volume` (keyed by `id`),
/// which avoids tearing the same volume down twice on retry paths.
#[derive(Clone, Debug)]
pub struct EncryptedVolume {
    /// Workload id the volume belongs to; every host path and the
    /// device-mapper name are derived from it.
    pub id: u32,
    /// Host directory where the decrypted ext4 filesystem is mounted;
    /// this is the path to bind-mount into the container.
    pub mount_path: PathBuf,
}
110
111/// Verify cryptsetup is on PATH. Provider should call this at
112/// startup if any template it serves has `data_path: Some(_)` and
113/// the operator has not opted out of consumer-encrypted volumes.
114/// Returns the version string so the operator can log what they
115/// got.
116pub async fn check_cryptsetup_available() -> Result<String> {
117    let out = Command::new("cryptsetup")
118        .arg("--version")
119        .output()
120        .await
121        .context(
122            "cryptsetup binary not found on PATH; install cryptsetup or disable encrypted-volume support",
123        )?;
124    if !out.status.success() {
125        anyhow::bail!(
126            "cryptsetup --version returned non-zero: {}",
127            String::from_utf8_lossy(&out.stderr)
128        );
129    }
130    Ok(String::from_utf8_lossy(&out.stdout).trim().to_string())
131}
132
133/// Create + format + open + mount a LUKS-encrypted volume for the
134/// given workload id. Returns the mount path the caller should bind
135/// into the container.
136///
137/// On failure, attempts to roll back any partial state (close mapper,
138/// rm sparse file) so a retry at the same id starts clean.
139pub async fn create_encrypted_volume(
140    id: u32,
141    size_gb: u32,
142    key: &[u8; 32],
143) -> Result<EncryptedVolume> {
144    let img = image_path(id);
145    let mnt = mount_path(id);
146    let mapper = mapper_device(id);
147    let mapper_n = mapper_name(id);
148
149    info!(
150        "Creating LUKS-encrypted data volume: id={} size={}G image={}",
151        id,
152        size_gb,
153        img.display()
154    );
155
156    // 0. Pre-create cleanup. A previous spawn at the same id may
157    //    have left a `/dev/mapper/paygress-<id>-luks` entry behind
158    //    (e.g. our own `destroy_encrypted_volume` lazy-umount'd the
159    //    mountpoint and the kernel hadn't released it by the time
160    //    `luksClose` ran, so `luksClose` saw EBUSY and silently
161    //    failed). Subsequent spawns at the same id then trip on
162    //    `luksOpen: device already exists`. Make the create path
163    //    self-healing by running destroy first — it's idempotent
164    //    and a no-op when nothing is leftover.
165    if let Err(e) = destroy_encrypted_volume(id).await {
166        warn!(
167            "pre-create cleanup of id={} returned {}; continuing — \
168             create steps will surface any persistent state",
169            id, e
170        );
171    }
172
173    // 1. mkdir -p the parent directories. Both volumes/ and mounts/
174    //    must exist before the next steps; they survive across
175    //    spawns (best-effort once-per-host).
176    tokio::fs::create_dir_all(img.parent().unwrap())
177        .await
178        .context("create volumes/ directory")?;
179    tokio::fs::create_dir_all(&mnt)
180        .await
181        .context("create mountpoint directory")?;
182
183    // 2. Truncate to size. Sparse — only consumes disk on write.
184    //    `truncate -s` is portable across the GNU coreutils on
185    //    every Linux paygress runs on.
186    let bytes = (size_gb as u64) * 1024 * 1024 * 1024;
187    let img_str = img.to_string_lossy().to_string();
188    let trunc = Command::new("truncate")
189        .args(["-s", &bytes.to_string(), &img_str])
190        .output()
191        .await
192        .context("invoke truncate")?;
193    if !trunc.status.success() {
194        anyhow::bail!(
195            "truncate failed: {}",
196            String::from_utf8_lossy(&trunc.stderr)
197        );
198    }
199
200    // 3. luksFormat with the consumer key on stdin (--key-file=-).
201    //    --batch-mode skips the interactive "are you sure" prompt;
202    //    --type luks2 picks the modern header format with proper
203    //    PBKDF2 + AEAD; defaults are fine for AES-XTS-Plain64.
204    if let Err(e) = run_with_key_stdin(
205        "cryptsetup",
206        &[
207            "luksFormat",
208            "--type",
209            "luks2",
210            "--batch-mode",
211            "--key-file=-",
212            &img_str,
213        ],
214        key,
215    )
216    .await
217    {
218        // Roll back: the truncate-d file is unusable junk. Don't
219        // leave it behind.
220        let _ = tokio::fs::remove_file(&img).await;
221        return Err(e.context("cryptsetup luksFormat"));
222    }
223
224    // 4. luksOpen → /dev/mapper/paygress-<id>-luks. Same key on
225    //    stdin. After this the kernel device-mapper holds the key
226    //    in keyring memory (visible to root via `dmsetup info`,
227    //    which is exactly the threat-model boundary we documented).
228    if let Err(e) = run_with_key_stdin(
229        "cryptsetup",
230        &["luksOpen", "--key-file=-", &img_str, &mapper_n],
231        key,
232    )
233    .await
234    {
235        let _ = tokio::fs::remove_file(&img).await;
236        return Err(e.context("cryptsetup luksOpen"));
237    }
238
239    // 5. mkfs.ext4 on the mapper device. -F forces over any stale
240    //    signature (a re-spawn at the same id with a new key would
241    //    otherwise see leftover ext4 magic from a prior tenancy and
242    //    refuse to reformat).
243    let mapper_str = mapper.to_string_lossy().to_string();
244    let mkfs = Command::new("mkfs.ext4")
245        .args(["-F", &mapper_str])
246        .output()
247        .await
248        .context("invoke mkfs.ext4")?;
249    if !mkfs.status.success() {
250        // Roll back: close the mapper, then drop the file.
251        let _ = run("cryptsetup", &["luksClose", &mapper_n]).await;
252        let _ = tokio::fs::remove_file(&img).await;
253        anyhow::bail!(
254            "mkfs.ext4 failed: {}",
255            String::from_utf8_lossy(&mkfs.stderr)
256        );
257    }
258
259    // 6. mount to /var/lib/paygress/mounts/<id>. The Docker backend
260    //    bind-mounts this path at the template's `data_path`.
261    let mnt_str = mnt.to_string_lossy().to_string();
262    let mount = Command::new("mount")
263        .args([&mapper_str, &mnt_str])
264        .output()
265        .await
266        .context("invoke mount")?;
267    if !mount.status.success() {
268        let _ = run("cryptsetup", &["luksClose", &mapper_n]).await;
269        let _ = tokio::fs::remove_file(&img).await;
270        anyhow::bail!("mount failed: {}", String::from_utf8_lossy(&mount.stderr));
271    }
272
273    info!(
274        "LUKS volume id={} ready: mounted at {} (mapper {})",
275        id,
276        mnt.display(),
277        mapper.display()
278    );
279    Ok(EncryptedVolume {
280        id,
281        mount_path: mnt,
282    })
283}
284
285/// Tear down everything `create_encrypted_volume` set up. Idempotent
286/// — never errors on "already gone". Order matters:
287/// 1. umount the ext4 (releases the kernel block device handle)
288/// 2. luksClose (releases the mapper entry + the LUKS key from
289///    keyring memory)
290/// 3. luksErase (overwrites all keyslots → the underlying file's
291///    ciphertext is unrecoverable, even if the operator copied the
292///    file before this step ran)
293/// 4. rm the sparse file (free disk space; defense-in-depth even
294///    after luksErase)
295/// 5. rmdir the mountpoint (cosmetic; keeps /var/lib/paygress/mounts
296///    tidy)
297pub async fn destroy_encrypted_volume(id: u32) -> Result<()> {
298    let img = image_path(id);
299    let mnt = mount_path(id);
300    let mapper_n = mapper_name(id);
301    let img_str = img.to_string_lossy().to_string();
302    let mnt_str = mnt.to_string_lossy().to_string();
303
304    debug!("Destroying LUKS volume id={}", id);
305
306    // 1. umount. -l (lazy) handles the case where the container is
307    //    still holding a file open during teardown — the kernel
308    //    detaches the mount the moment the last reference drops.
309    if mnt.exists() {
310        let out = Command::new("umount").args(["-l", &mnt_str]).output().await;
311        match out {
312            Ok(o) if !o.status.success() => {
313                let stderr = String::from_utf8_lossy(&o.stderr);
314                if !stderr.contains("not mounted") {
315                    warn!("umount {} non-fatal error: {}", mnt_str, stderr.trim());
316                }
317            }
318            Err(e) => warn!("umount {} could not exec: {}", mnt_str, e),
319            _ => {}
320        }
321    }
322
323    // 2. luksClose. Idempotent: cryptsetup returns 0 on success and
324    //    a non-zero on "not active", which we tolerate.
325    let _ = run("cryptsetup", &["luksClose", &mapper_n]).await;
326
327    // 3. luksErase wipes ALL keyslots without needing the original
328    //    key (--batch-mode bypasses the "are you really sure" prompt).
329    //    After this, the LUKS header has no recoverable keyslot;
330    //    even if the operator extracted the file before step 4,
331    //    the AES-XTS payload is unreachable.
332    if img.exists() {
333        let out = Command::new("cryptsetup")
334            .args(["luksErase", "--batch-mode", &img_str])
335            .output()
336            .await;
337        if let Ok(o) = out {
338            if !o.status.success() {
339                warn!(
340                    "cryptsetup luksErase {} non-fatal: {}",
341                    img_str,
342                    String::from_utf8_lossy(&o.stderr).trim()
343                );
344            }
345        }
346    }
347
348    // 4. rm the sparse file. Best-effort; the disk space matters
349    //    more than the ciphertext (which is keyless after step 3).
350    if img.exists() {
351        if let Err(e) = tokio::fs::remove_file(&img).await {
352            warn!("remove {} non-fatal: {}", img.display(), e);
353        }
354    }
355
356    // 5. rmdir the mountpoint. Cosmetic.
357    if mnt.exists() {
358        let _ = tokio::fs::remove_dir(&mnt).await;
359    }
360
361    Ok(())
362}
363
364/// Spawn `prog` with `args` and feed `key` on stdin (for cryptsetup
365/// `--key-file=-`). The key bytes never appear on the command line
366/// (where `ps` would expose them) or in any log.
367async fn run_with_key_stdin(prog: &str, args: &[&str], key: &[u8; 32]) -> Result<()> {
368    let mut child = Command::new(prog)
369        .args(args)
370        .stdin(Stdio::piped())
371        .stdout(Stdio::piped())
372        .stderr(Stdio::piped())
373        .spawn()
374        .with_context(|| format!("spawn {}", prog))?;
375    {
376        let stdin = child.stdin.as_mut().context("child stdin not piped")?;
377        stdin.write_all(key).await.context("write key to stdin")?;
378        stdin.shutdown().await.context("close key stdin")?;
379    }
380    let out = child
381        .wait_with_output()
382        .await
383        .with_context(|| format!("wait for {}", prog))?;
384    if !out.status.success() {
385        anyhow::bail!(
386            "{} {:?} failed: {}",
387            prog,
388            args,
389            String::from_utf8_lossy(&out.stderr)
390        );
391    }
392    Ok(())
393}
394
395/// Spawn `prog` with `args` (no stdin), best-effort silent. Returns
396/// the success bool so callers can log without short-circuiting on
397/// "not present" cleanups.
398async fn run(prog: &str, args: &[&str]) -> bool {
399    Command::new(prog)
400        .args(args)
401        .stdout(Stdio::null())
402        .stderr(Stdio::null())
403        .status()
404        .await
405        .map(|s| s.success())
406        .unwrap_or(false)
407}
408
#[cfg(test)]
mod tests {
    use super::*;

    /// Image and mountpoint paths are derived from the id and stay
    /// inside `VOLUME_ROOT`; the mapper device sits under
    /// `/dev/mapper`.
    #[test]
    fn paths_are_id_scoped_and_under_volume_root() {
        let id = 42;
        let (image, mountpoint) = (image_path(id), mount_path(id));

        assert!(
            image.starts_with(VOLUME_ROOT),
            "image not under VOLUME_ROOT: {}",
            image.display()
        );
        assert!(
            mountpoint.starts_with(VOLUME_ROOT),
            "mount not under VOLUME_ROOT: {}",
            mountpoint.display()
        );

        assert_eq!(image.file_name().unwrap(), "42.luks");
        assert_eq!(mountpoint.file_name().unwrap(), "42");
        assert_eq!(
            mapper_device(id),
            PathBuf::from("/dev/mapper/paygress-42-luks")
        );
    }

    /// Different ids map to different mapper names, and the name has
    /// the fixed `paygress-<id>-luks` form.
    #[test]
    fn mapper_name_is_distinct_per_id() {
        let (first, second) = (mapper_name(1), mapper_name(2));
        assert_ne!(first, second);
        assert_eq!(mapper_name(7), "paygress-7-luks");
    }

    /// Two workloads never share an image file or a mountpoint.
    #[test]
    fn paths_for_different_ids_do_not_collide() {
        let (a, b) = (1, 2);
        assert_ne!(image_path(a), image_path(b));
        assert_ne!(mount_path(a), mount_path(b));
    }

    /// Destroying an id that was never created must succeed: the
    /// pre-create cleanup inside `create_encrypted_volume` calls
    /// destroy unconditionally and depends on it being harmless when
    /// there is nothing to clean up.
    ///
    /// `#[ignore]`d because it shells out to system tools and touches
    /// the real filesystem; it belongs to the VPS acceptance suite
    /// rather than a build host.
    #[tokio::test]
    #[ignore]
    async fn destroy_is_a_no_op_when_nothing_exists() {
        // A deliberately high id that should not belong to a real
        // spawn on the host.
        let outcome = destroy_encrypted_volume(99_999).await;
        assert!(
            outcome.is_ok(),
            "destroy_encrypted_volume must succeed on a never-created id, got {:?}",
            outcome
        );
    }
}