Skip to main content

paygress/
kvm.rs

1// KVM/qemu compute backend.
2//
3// Implements `ComputeBackend` by spawning a per-workload
4// qemu/KVM virtual machine. Each spawn is its own VM with its own
5// kernel, virtio devices, and disk. The host operator running on the
6// hypervisor can still see the guest's memory (no SEV-SNP), but
7// every container-escape path that exists in the Docker / LXD
8// backends is closed by the VM boundary.
9//
10// Where this fits in paygress's isolation tiers
11// ----------------------------------------------
12// (See `nostr::IsolationLevel`.)
13//   - `SharedKernel`        : Docker / LXD (today's default backends).
14//                             Cheapest, fastest, leakiest.
15//   - `DedicatedHost`       : THIS BACKEND. Per-workload VM. Defends
16//                             against co-tenant attacks and container
17//                             escape exploits. Does NOT defend against
18//                             the host operator with hypervisor root.
19//   - `AttestedResearchTier`: SEV-SNP / TDX. Defends against the
20//                             host operator. Needs hardware (AMD
21//                             EPYC 7003+ / Intel Sapphire Rapids).
22//                             This module is the foundation that
23//                             tier will extend — when an EPYC host
24//                             is provisioned, the changes are:
25//                               * `-machine confidential-guest-support=...`
26//                               * an attestation handshake before
27//                                 the consumer sends the spawn token
28//                               * a measured boot chain (kernel +
29//                                 initrd) the consumer can verify
30//                             Everything else (cloud-init seed, disk
31//                             overlay, network forwarding) reuses
32//                             this code unchanged.
33//
34// Why qemu+KVM and not Firecracker / Cloud Hypervisor
35// ----------------------------------------------------
36// qemu's larger surface buys us cloud-init compatibility (so we get
37// SSH onto a vanilla Ubuntu image without baking custom userdata
38// into a Firecracker rootfs), virtio device flexibility (we may add
39// a virtio-vsock channel later for the agent-sandbox HTTP exec
40// equivalent), and the eventual SEV-SNP path. Cold-start is slower
41// than Firecracker (~3-5s vs ~125ms) but the boot is amortized over
42// a multi-hour-to-multi-day workload lease — not a hot path.
43//
44// Storage layout
45// --------------
46// /var/lib/paygress/vm/base/<image>.img      — read-only cloud image
47// /var/lib/paygress/vm/<id>/disk.qcow2       — per-VM overlay
48// /var/lib/paygress/vm/<id>/seed.iso         — cloud-init seed
49// /var/lib/paygress/vm/<id>/qemu.pid         — qemu daemon pidfile
50// /var/lib/paygress/vm/<id>/serial.log       — guest serial console
51//
52// What this v1 deliberately does NOT do
53// -------------------------------------
54//   - Run the killer-template Docker images. The KVM backend serves
55//     a vanilla Ubuntu VM; consumers shell in via SSH and run their
56//     own software. Running templates inside a VM would require
57//     nested-Docker bootstrap inside cloud-init — out of scope here,
58//     tracked as a follow-up.
//   - Validate or restrict `ContainerConfig.template_ports`. Every VM
//     gets a host-port forward for SSH (the `host_port` field), and
//     each `template_ports` entry is passed through as an additional
//     qemu `hostfwd` rule as-is; running a service behind those ports
//     inside the guest lands with the v2 template-aware path.
63//   - Persistent data volumes (`data_path`). The VM's qcow2 disk IS
64//     the persistent state; LUKS-on-loop encryption from the Docker
65//     path doesn't apply (the disk is one file). Consumer-encrypted
66//     disks via LUKS-inside-the-guest is a follow-up.
67//   - Cleanup of VMs orphaned by a provider crash. A startup-time
68//     `pidfile-and-process-still-alive?` sweep lands separately.
69
70use std::path::PathBuf;
71use std::process::Stdio;
72
73use anyhow::{Context, Result};
74use async_trait::async_trait;
75use tokio::process::Command;
76use tracing::{debug, info, warn};
77
78use crate::compute::{ComputeBackend, ContainerConfig, NodeStatus};
79
/// Root directory for paygress-managed VMs.
const VM_ROOT: &str = "/var/lib/paygress/vm";

/// Default base image. Ubuntu 22.04 cloud image — small (~600 MB),
/// has cloud-init preinstalled, well-supported across hosting
/// providers. Operators can override via `KvmConfig`.
const DEFAULT_BASE_IMAGE_URL: &str =
    "https://cloud-images.ubuntu.com/jammy/current/jammy-server-cloudimg-amd64.img";
/// File name the default base image is stored under.
/// `KvmConfig::default` places it in `<VM_ROOT>/base/`.
const DEFAULT_BASE_IMAGE_FILE: &str = "jammy-server-cloudimg-amd64.img";
89
/// Per-instance configuration for the KVM backend. Provider
/// operators tweak these at startup; `Default` yields the stock
/// Ubuntu image under `/var/lib/paygress/vm`.
#[derive(Debug, Clone)]
pub struct KvmConfig {
    /// Path to the read-only base cloud image. The backend creates
    /// per-VM qcow2 overlays on top, so multiple workloads share one
    /// physical base. Downloaded on first spawn if absent. The
    /// overlay is created with backing format `qcow2`, so the file
    /// at this path must be a qcow2 image.
    pub base_image_path: PathBuf,
    /// URL the backend fetches the base image from when
    /// `base_image_path` is missing. Defaults to the Ubuntu cloud
    /// image.
    pub base_image_url: String,
    /// Where per-VM directories live. Each VM gets a subdirectory
    /// named after its numeric id.
    pub vm_root: PathBuf,
}
105
106impl Default for KvmConfig {
107    fn default() -> Self {
108        Self {
109            base_image_path: PathBuf::from(VM_ROOT)
110                .join("base")
111                .join(DEFAULT_BASE_IMAGE_FILE),
112            base_image_url: DEFAULT_BASE_IMAGE_URL.to_string(),
113            vm_root: PathBuf::from(VM_ROOT),
114        }
115    }
116}
117
/// KVM/qemu backend: runs each workload as its own qemu virtual
/// machine, with all per-VM files kept under `config.vm_root`.
pub struct KvmBackend {
    /// Image location, download URL and VM root used by every method.
    config: KvmConfig,
}
122
123impl KvmBackend {
124    pub fn new(config: KvmConfig) -> Self {
125        Self { config }
126    }
127
128    fn vm_dir(&self, id: u32) -> PathBuf {
129        self.config.vm_root.join(id.to_string())
130    }
131
132    fn disk_path(&self, id: u32) -> PathBuf {
133        self.vm_dir(id).join("disk.qcow2")
134    }
135
136    fn seed_path(&self, id: u32) -> PathBuf {
137        self.vm_dir(id).join("seed.iso")
138    }
139
140    fn pidfile_path(&self, id: u32) -> PathBuf {
141        self.vm_dir(id).join("qemu.pid")
142    }
143
144    fn serial_log_path(&self, id: u32) -> PathBuf {
145        self.vm_dir(id).join("serial.log")
146    }
147
148    /// Verify qemu + KVM support is present. Provider should call
149    /// this at startup; surfacing "your host doesn't support KVM"
150    /// at config time is much better than at first-spawn time.
151    pub async fn check_kvm_available() -> Result<String> {
152        if !PathBuf::from("/dev/kvm").exists() {
153            anyhow::bail!(
154                "/dev/kvm not present; this host does not support KVM. \
155                 Use the Docker or LXD backend, or move to a host with \
156                 nested virtualization enabled."
157            );
158        }
159        let out = Command::new("qemu-system-x86_64")
160            .arg("--version")
161            .output()
162            .await
163            .context("qemu-system-x86_64 not found on PATH; install qemu-system-x86")?;
164        if !out.status.success() {
165            anyhow::bail!(
166                "qemu-system-x86_64 --version failed: {}",
167                String::from_utf8_lossy(&out.stderr)
168            );
169        }
170        Ok(String::from_utf8_lossy(&out.stdout)
171            .lines()
172            .next()
173            .unwrap_or("")
174            .to_string())
175    }
176
177    /// Ensure the base image is present, downloading on first call.
178    /// Idempotent: subsequent calls no-op.
179    async fn ensure_base_image(&self) -> Result<()> {
180        if self.config.base_image_path.exists() {
181            return Ok(());
182        }
183        let parent = self
184            .config
185            .base_image_path
186            .parent()
187            .context("base_image_path has no parent")?;
188        tokio::fs::create_dir_all(parent)
189            .await
190            .context("create base image directory")?;
191        info!(
192            "Downloading base image from {} to {}",
193            self.config.base_image_url,
194            self.config.base_image_path.display()
195        );
196        let out = Command::new("curl")
197            .args([
198                "-fsSL",
199                "-o",
200                self.config.base_image_path.to_string_lossy().as_ref(),
201                &self.config.base_image_url,
202            ])
203            .output()
204            .await
205            .context("invoke curl to fetch base image")?;
206        if !out.status.success() {
207            anyhow::bail!(
208                "curl failed to fetch base image: {}",
209                String::from_utf8_lossy(&out.stderr)
210            );
211        }
212        Ok(())
213    }
214
215    /// Cloud-init userdata: enable SSH password auth, set the root
216    /// password the consumer supplied. Yes, password auth — not key
217    /// auth — because the consumer already trusts this provider with
218    /// the password (it lived in the spawn DM); requiring an extra
219    /// pubkey upload is friction without a security gain when the
220    /// VM is single-tenant.
221    fn user_data(password: &str) -> String {
222        format!(
223            "#cloud-config\n\
224             ssh_pwauth: true\n\
225             disable_root: false\n\
226             chpasswd:\n  \
227               list: |\n    \
228                 root:{}\n  \
229               expire: false\n\
230             # Keep the boot fast: skip waiting for slow services.\n\
231             timezone: Etc/UTC\n",
232            password
233        )
234    }
235
236    fn meta_data(id: u32) -> String {
237        format!(
238            "instance-id: paygress-{0}\nlocal-hostname: paygress-{0}\n",
239            id
240        )
241    }
242
243    /// Build the cloud-init seed ISO. Uses `genisoimage` — the
244    /// `cloud-localds` wrapper would be more concise but it's
245    /// less universally available across distros.
246    async fn make_seed_iso(&self, id: u32, password: &str) -> Result<()> {
247        let dir = self.vm_dir(id);
248        let user_path = dir.join("user-data");
249        let meta_path = dir.join("meta-data");
250        tokio::fs::write(&user_path, Self::user_data(password))
251            .await
252            .context("write user-data")?;
253        tokio::fs::write(&meta_path, Self::meta_data(id))
254            .await
255            .context("write meta-data")?;
256        let out = Command::new("genisoimage")
257            .args([
258                "-output",
259                self.seed_path(id).to_string_lossy().as_ref(),
260                "-volid",
261                "cidata",
262                "-joliet",
263                "-rock",
264                user_path.to_string_lossy().as_ref(),
265                meta_path.to_string_lossy().as_ref(),
266            ])
267            .output()
268            .await
269            .context("invoke genisoimage")?;
270        if !out.status.success() {
271            anyhow::bail!(
272                "genisoimage failed: {}",
273                String::from_utf8_lossy(&out.stderr)
274            );
275        }
276        Ok(())
277    }
278
279    /// Build the qemu-system-x86_64 argv. Pure function — testable
280    /// without spawning anything. Caller is responsible for the seed
281    /// ISO + qcow2 already existing.
282    pub fn qemu_argv(&self, config: &ContainerConfig) -> Vec<String> {
283        let id = config.id;
284        let cores = config.cpu_cores.max(1);
285        let mem_mb = config.memory_mb.max(512);
286        let host_port = config.host_port.unwrap_or(0);
287
288        // user-mode networking + hostfwd: SSH always, plus any
289        // template ports the consumer asked for.
290        let mut hostfwds = vec![format!("hostfwd=tcp::{}-:22", host_port)];
291        for p in &config.template_ports {
292            hostfwds.push(format!(
293                "hostfwd={}::{}-:{}",
294                p.protocol, p.host_port, p.container_port
295            ));
296        }
297        let netdev = format!("user,id=net0,{}", hostfwds.join(","));
298
299        vec![
300            // Acceleration + CPU model: -enable-kvm uses the host's
301            // KVM module; -cpu host passes through the physical CPU
302            // features so guests can use AES-NI / AVX / etc.
303            "-enable-kvm".to_string(),
304            "-cpu".to_string(),
305            "host".to_string(),
306            "-machine".to_string(),
307            "type=q35,accel=kvm".to_string(),
308            "-smp".to_string(),
309            cores.to_string(),
310            "-m".to_string(),
311            mem_mb.to_string(),
312            // Primary disk: per-VM qcow2 overlay on the read-only base.
313            "-drive".to_string(),
314            format!(
315                "file={},if=virtio,format=qcow2",
316                self.disk_path(id).display()
317            ),
318            // Cloud-init seed: rom image, qemu picks it up at boot.
319            "-drive".to_string(),
320            format!(
321                "file={},if=virtio,format=raw,readonly=on",
322                self.seed_path(id).display()
323            ),
324            "-netdev".to_string(),
325            netdev,
326            "-device".to_string(),
327            "virtio-net-pci,netdev=net0".to_string(),
328            // Daemonize + pidfile so we can manage the lifecycle
329            // post-spawn via the pid (kill / wait).
330            "-daemonize".to_string(),
331            "-pidfile".to_string(),
332            self.pidfile_path(id).to_string_lossy().to_string(),
333            // No graphical console; serial captured to a file for
334            // operator debugging when a guest fails to boot.
335            "-nographic".to_string(),
336            "-serial".to_string(),
337            format!("file:{}", self.serial_log_path(id).display()),
338        ]
339    }
340
341    async fn create_overlay_disk(&self, id: u32, size_gb: u32) -> Result<()> {
342        // qemu-img create -f qcow2 -b <base> -F qcow2 <overlay> <size>G
343        // The -b backing-file + -F backing-format pair makes the
344        // overlay reference the base; only modified blocks live in
345        // the overlay.
346        let out = Command::new("qemu-img")
347            .args([
348                "create",
349                "-f",
350                "qcow2",
351                "-b",
352                self.config.base_image_path.to_string_lossy().as_ref(),
353                "-F",
354                "qcow2",
355                self.disk_path(id).to_string_lossy().as_ref(),
356                &format!("{}G", size_gb.max(5)),
357            ])
358            .output()
359            .await
360            .context("invoke qemu-img create")?;
361        if !out.status.success() {
362            anyhow::bail!(
363                "qemu-img create failed: {}",
364                String::from_utf8_lossy(&out.stderr)
365            );
366        }
367        Ok(())
368    }
369
370    /// Read the daemonized qemu's PID from its pidfile.
371    async fn read_pid(&self, id: u32) -> Option<i32> {
372        let p = self.pidfile_path(id);
373        let bytes = tokio::fs::read_to_string(&p).await.ok()?;
374        bytes.trim().parse().ok()
375    }
376}
377
378#[async_trait]
379impl ComputeBackend for KvmBackend {
380    /// Scan vm_root for existing per-id directories and pick the
381    /// next free id in range.
382    async fn find_available_id(&self, range_start: u32, range_end: u32) -> Result<u32> {
383        let mut used = std::collections::HashSet::new();
384        if let Ok(mut entries) = tokio::fs::read_dir(&self.config.vm_root).await {
385            while let Ok(Some(entry)) = entries.next_entry().await {
386                if let Some(name) = entry.file_name().to_str() {
387                    if let Ok(n) = name.parse::<u32>() {
388                        used.insert(n);
389                    }
390                }
391            }
392        }
393        for id in range_start..=range_end {
394            if !used.contains(&id) {
395                return Ok(id);
396            }
397        }
398        anyhow::bail!(
399            "no available VM id in range {}..={}",
400            range_start,
401            range_end
402        );
403    }
404
405    async fn create_container(&self, config: &ContainerConfig) -> Result<String> {
406        let id = config.id;
407        info!(
408            "Provisioning KVM VM: id={} cores={} mem={}MB disk={}GB",
409            id, config.cpu_cores, config.memory_mb, config.storage_gb
410        );
411
412        self.ensure_base_image().await?;
413
414        let dir = self.vm_dir(id);
415        tokio::fs::create_dir_all(&dir)
416            .await
417            .context("create vm directory")?;
418
419        self.create_overlay_disk(id, config.storage_gb)
420            .await
421            .context("create overlay disk")?;
422        self.make_seed_iso(id, &config.password)
423            .await
424            .context("build cloud-init seed iso")?;
425
426        let argv = self.qemu_argv(config);
427        debug!("qemu argv: {:?}", argv);
428        let arg_refs: Vec<&str> = argv.iter().map(|s| s.as_str()).collect();
429        let out = Command::new("qemu-system-x86_64")
430            .args(&arg_refs)
431            .stdout(Stdio::piped())
432            .stderr(Stdio::piped())
433            .output()
434            .await
435            .context("invoke qemu-system-x86_64")?;
436        if !out.status.success() {
437            // qemu's own error landed on stderr; surface it to the
438            // operator so a "missing /dev/kvm" or "image not found"
439            // is obvious.
440            anyhow::bail!(
441                "qemu-system-x86_64 failed: {}",
442                String::from_utf8_lossy(&out.stderr)
443            );
444        }
445
446        let pid = self
447            .read_pid(id)
448            .await
449            .context("qemu daemonized but pidfile missing — boot failed before pidfile write?")?;
450        info!("KVM VM id={} live (pid {})", id, pid);
451        Ok(format!("paygress-vm-{}", id))
452    }
453
454    /// Daemonized qemu starts on `create_container`; this is a no-op
455    /// since the lifecycle is single-shot. If we ever migrate to a
456    /// non-daemonized qemu (e.g. for live migration support),
457    /// `start_container` becomes the place that spawns the process.
458    async fn start_container(&self, _id: u32) -> Result<()> {
459        Ok(())
460    }
461
462    async fn stop_container(&self, id: u32) -> Result<()> {
463        if let Some(pid) = self.read_pid(id).await {
464            // Best-effort SIGTERM; qemu shuts down its guest cleanly
465            // when it receives SIGTERM (sends ACPI power button).
466            let _ = Command::new("kill")
467                .args(["-TERM", &pid.to_string()])
468                .status()
469                .await;
470        }
471        Ok(())
472    }
473
474    async fn delete_container(&self, id: u32) -> Result<()> {
475        // Stop first (idempotent if already gone), then nuke the dir.
476        let _ = self.stop_container(id).await;
477        if let Some(pid) = self.read_pid(id).await {
478            // If TERM didn't take, escalate after a short wait.
479            tokio::time::sleep(std::time::Duration::from_secs(2)).await;
480            let _ = Command::new("kill")
481                .args(["-KILL", &pid.to_string()])
482                .status()
483                .await;
484        }
485        let dir = self.vm_dir(id);
486        if dir.exists() {
487            if let Err(e) = tokio::fs::remove_dir_all(&dir).await {
488                warn!("remove {} non-fatal: {}", dir.display(), e);
489            }
490        }
491        Ok(())
492    }
493
494    async fn get_node_status(&self) -> Result<NodeStatus> {
495        // Same minimal report as DockerBackend; a sysinfo-backed
496        // implementation lands in a follow-up.
497        Ok(NodeStatus {
498            cpu_usage: 0.0,
499            memory_used: 0,
500            memory_total: 0,
501            disk_used: 0,
502            disk_total: 0,
503        })
504    }
505
506    async fn get_container_ip(&self, _id: u32) -> Result<Option<String>> {
507        // Guest is reachable via the host's IP + the SSH host_port
508        // forward. We don't expose the guest's internal 10.0.2.x IP
509        // because user-mode networking NATs everything.
510        Ok(None)
511    }
512}
513
#[cfg(test)]
mod tests {
    use super::*;
    use crate::compute::PortMapping;

    /// Fixture: a 2-core / 2 GB / 10 GB workload with SSH on host
    /// port 31000 and one extra TCP forward on 18789.
    fn cfg(id: u32) -> ContainerConfig {
        let forwarded = PortMapping {
            host_port: 18789,
            container_port: 18789,
            protocol: "tcp",
        };
        ContainerConfig {
            id,
            name: format!("paygress-vm-{}", id),
            image: String::new(), // ignored by KVM v1
            cpu_cores: 2,
            memory_mb: 2048,
            storage_gb: 10,
            password: "secret".to_string(),
            ssh_key: None,
            host_port: Some(31000),
            template_ports: vec![forwarded],
            template_env: Default::default(),
            extra_runtime_args: vec![],
            data_path: None,
            volume_encryption_key: None,
        }
    }

    /// Backend built from the default configuration.
    fn default_backend() -> KvmBackend {
        KvmBackend::new(KvmConfig::default())
    }

    /// The argument that follows the first occurrence of `flag`.
    /// Panics when the flag is missing.
    fn value_of<'a>(argv: &'a [String], flag: &str) -> &'a str {
        let at = argv.iter().position(|a| a == flag).unwrap();
        &argv[at + 1]
    }

    #[test]
    fn qemu_argv_includes_kvm_acceleration_and_cpu_host() {
        let argv = default_backend().qemu_argv(&cfg(42));
        assert!(argv.iter().any(|a| a == "-enable-kvm"));
        assert_eq!(value_of(&argv, "-cpu"), "host");
    }

    #[test]
    fn qemu_argv_forwards_ssh_and_template_ports() {
        let argv = default_backend().qemu_argv(&cfg(42));
        let netdev = value_of(&argv, "-netdev").to_string();
        assert!(
            netdev.contains("hostfwd=tcp::31000-:22"),
            "ssh hostfwd missing in: {netdev}"
        );
        assert!(
            netdev.contains("hostfwd=tcp::18789-:18789"),
            "template hostfwd missing in: {netdev}"
        );
    }

    #[test]
    fn qemu_argv_pidfile_and_disk_paths_are_id_scoped() {
        let argv = default_backend().qemu_argv(&cfg(7));
        assert!(value_of(&argv, "-pidfile").contains("/7/qemu.pid"));

        // Every value that directly follows a `-drive` flag.
        let drives: Vec<&String> = argv
            .windows(2)
            .filter(|pair| pair[0] == "-drive")
            .map(|pair| &pair[1])
            .collect();
        assert!(drives.iter().any(|d| d.contains("/7/disk.qcow2")));
        assert!(drives.iter().any(|d| d.contains("/7/seed.iso")));
    }

    #[test]
    fn qemu_argv_memory_floor() {
        let mut tiny = cfg(1);
        tiny.memory_mb = 64; // way below qemu's reasonable floor
        let argv = default_backend().qemu_argv(&tiny);
        assert_eq!(
            value_of(&argv, "-m"),
            "512",
            "must clamp to 512 MB minimum"
        );
    }

    #[test]
    fn paths_are_id_scoped_and_under_vm_root() {
        let backend = default_backend();
        let pairs: [(u32, u32); 3] = [(1, 2), (10, 20), (999, 1000)];
        for (a, b) in pairs {
            assert_ne!(backend.vm_dir(a), backend.vm_dir(b));
            assert_ne!(backend.disk_path(a), backend.disk_path(b));
            assert!(backend.vm_dir(a).starts_with(VM_ROOT));
        }
    }

    #[test]
    fn user_data_includes_password_and_enables_pwauth() {
        let rendered = KvmBackend::user_data("hunter2");
        assert!(rendered.contains("ssh_pwauth: true"));
        assert!(rendered.contains("root:hunter2"));
    }
}