Skip to main content

ryra_vm/
image.rs

1use std::fmt;
2use std::path::{Path, PathBuf};
3use std::process::Stdio;
4
5use anyhow::{Context, Result};
6use tokio::process::Command;
7
/// Which distro/version to use as the base VM image.
///
/// Each variant selects a fixed arm64 cloud image together with the cache file
/// names and the package list used while preparing it.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum Distro {
    /// Debian 13 ("trixie") generic cloud image.
    Debian13,
    /// Fedora 43 Cloud Base image.
    Fedora43,
}

// All accessors below return compile-time constants, so they are typed
// `'static` rather than being tied to the lifetime of the `&self` borrow.
// Callers may therefore keep the returned names after the `Distro` value
// they were obtained from has gone out of scope.
impl Distro {
    /// Upstream URL the raw arm64 cloud image is downloaded from.
    fn cloud_image_url(&self) -> &'static str {
        match self {
            Distro::Debian13 => {
                "https://cloud.debian.org/images/cloud/trixie/latest/debian-13-generic-arm64.qcow2"
            }
            Distro::Fedora43 => {
                "https://download.fedoraproject.org/pub/fedora/linux/releases/43/Cloud/aarch64/images/Fedora-Cloud-Base-Generic-43-1.1.aarch64.qcow2"
            }
        }
    }

    /// File name (inside the cache directory) of the raw downloaded image.
    fn image_filename(&self) -> &'static str {
        match self {
            Distro::Debian13 => "debian-13-generic-arm64.qcow2",
            Distro::Fedora43 => "fedora-43-cloud-arm64.qcow2",
        }
    }

    /// File name (inside the cache directory) of the prepared base image.
    fn prepared_filename(&self) -> &'static str {
        match self {
            Distro::Debian13 => "debian-13-prepared-arm64.qcow2",
            Distro::Fedora43 => "fedora-43-prepared-arm64.qcow2",
        }
    }

    /// File name (inside the cache directory) of the browser-ready image.
    fn browser_prepared_filename(&self) -> &'static str {
        match self {
            Distro::Debian13 => "debian-13-browser-arm64.qcow2",
            Distro::Fedora43 => "fedora-43-browser-arm64.qcow2",
        }
    }

    /// Leading part of the file names used for base-image snapshots.
    fn snapshot_base(&self) -> &'static str {
        match self {
            Distro::Debian13 => "debian-13-arm64",
            Distro::Fedora43 => "fedora-43-arm64",
        }
    }

    /// Leading part of the file names used for browser-image snapshots.
    fn browser_snapshot_base(&self) -> &'static str {
        match self {
            Distro::Debian13 => "debian-13-browser-arm64",
            Distro::Fedora43 => "fedora-43-browser-arm64",
        }
    }

    /// Packages to install via cloud-init during image preparation.
    pub fn cloud_init_packages(&self) -> &'static [&'static str] {
        match self {
            // Runtime: podman, podman-compose (compose services), uidmap (rootless
            // namespaces), systemd-container (machined), git (registry fetch).
            // Test-only: curl (HTTP assertions), postgresql-client (postgres tests),
            // restic (backup tests).
            Distro::Debian13 => &[
                "podman",
                "podman-compose",
                "uidmap",
                "git",
                "systemd-container",
                "curl",
                "postgresql-client",
                "restic",
            ],
            // Fedora: uidmap is part of shadow-utils (already installed).
            Distro::Fedora43 => &[
                "podman",
                "podman-compose",
                "git",
                "systemd-container",
                "curl",
                "restic",
            ],
        }
    }
}
91
92impl fmt::Display for Distro {
93    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
94        match self {
95            Distro::Debian13 => write!(f, "debian-13"),
96            Distro::Fedora43 => write!(f, "fedora-43"),
97        }
98    }
99}
100
101impl std::str::FromStr for Distro {
102    type Err = String;
103
104    fn from_str(s: &str) -> std::result::Result<Self, Self::Err> {
105        match s {
106            "debian-13" => Ok(Distro::Debian13),
107            "fedora-43" => Ok(Distro::Fedora43),
108            other => Err(format!(
109                "unknown distro: {other} (available: debian-13, fedora-43)"
110            )),
111        }
112    }
113}
114
/// Paths to the cached base image and EFI firmware.
///
/// Produced by `ensure_image` and `ensure_browser_image`.
pub struct Image {
    /// Prepared qcow2 image.
    pub path: PathBuf,
    /// EFI firmware code image. The preparation steps in this file attach it
    /// to QEMU as a read-only pflash drive.
    pub efi_code: PathBuf,
    /// EFI variable-store template. The preparation steps in this file copy
    /// (or convert) it into a work directory before booting with it.
    pub efi_vars_template: PathBuf,
    /// If true, cloud-init packages are already installed — skip package install.
    /// Both constructors in this file set it to `true`.
    pub prepared: bool,
    /// Snapshot boot files (if available). When present, VMs restore from a
    /// saved QEMU snapshot instead of cold-booting — SSH is ready in <1s.
    pub snapshot: Option<SnapshotFiles>,
}
127
/// Files needed for QEMU snapshot restore.
///
/// Plain data (paths plus the RAM size), so it derives `Debug` for diagnostics
/// and `Clone` so callers can hand a copy to each VM they spawn.
#[derive(Debug, Clone)]
pub struct SnapshotFiles {
    /// qcow2 disk with the "ready" snapshot saved inside.
    pub disk: PathBuf,
    /// qcow2 EFI vars with snapshot state (QEMU splits snapshots across drives).
    pub efivars: PathBuf,
    /// cloud-init seed ISO (must be present for device topology match, but is
    /// not re-processed — cloud-init already ran in the snapshot).
    pub seed_iso: PathBuf,
    /// SSH private key baked into the snapshot via cloud-init.
    pub ssh_key: PathBuf,
    /// RAM size (MB) the snapshot was created with. VMs must use the same size.
    pub memory_mb: u32,
}
142
143/// Cache directory for downloaded images.
144fn cache_dir() -> Result<PathBuf> {
145    let base = dirs::cache_dir().context("could not determine cache directory (is $HOME set?)")?;
146    Ok(base.join("ryra-vm"))
147}
148
149/// Ensure the base cloud image, prepared image, and EFI firmware are available.
150///
151/// The "prepared" image has all packages pre-installed (podman, git, etc.)
152/// so VMs boot in ~30s instead of ~6 minutes. It's created by booting the raw
153/// cloud image once with cloud-init, then snapshotting.
154pub async fn ensure_image(
155    distro: &Distro,
156    redownload: bool,
157    use_kvm: bool,
158    max_memory_mb: u32,
159) -> Result<Image> {
160    let cache = cache_dir()?;
161    tokio::fs::create_dir_all(&cache)
162        .await
163        .context("failed to create image cache directory")?;
164
165    let raw_path = cache.join(distro.image_filename());
166    let prepared_path = cache.join(distro.prepared_filename());
167
168    // Download raw cloud image if needed
169    if redownload || !raw_path.exists() {
170        download_image(distro, &raw_path).await?;
171        // Force re-prepare if raw image changed
172        let _ = tokio::fs::remove_file(&prepared_path).await;
173    }
174
175    // Find EFI firmware
176    let efi = find_efi_firmware().await?;
177
178    // Create a vars template if we don't have one
179    let vars_template = cache.join("efivars.fd");
180    if !vars_template.exists() {
181        tokio::fs::copy(&efi.vars, &vars_template)
182            .await
183            .context("failed to copy EFI vars template")?;
184    }
185
186    // Build prepared image if it doesn't exist
187    if !prepared_path.exists() {
188        println!("Preparing base image (installing packages — this is a one-time operation)...");
189        let serial_log = cache_dir()?.join("prepare-base").join("serial.log");
190        println!("  Serial log: {}", serial_log.display());
191        prepare_image(
192            distro,
193            &raw_path,
194            &prepared_path,
195            &efi.code,
196            &vars_template,
197            use_kvm,
198        )
199        .await?;
200        println!("Prepared image cached at: {}", prepared_path.display());
201    } else {
202        println!("Using prepared image: {}", prepared_path.display());
203    }
204
205    // Create VM snapshot for instant boot (if not already created).
206    // Snapshot is cached per memory size — if tests need more RAM than the
207    // cached snapshot, a new one is created at the larger size.
208    let snapshot_prefix = format!("{}-snapshot-{max_memory_mb}", distro.snapshot_base());
209    let snapshot_disk = cache.join(format!("{snapshot_prefix}.qcow2"));
210    let snapshot_efivars = cache.join(format!("{snapshot_prefix}-efivars.qcow2"));
211    let snapshot_seed = cache.join(format!("{snapshot_prefix}-seed.iso"));
212    let snapshot_key = cache.join("test-ssh-key");
213
214    let snapshot = if snapshot_disk.exists() && snapshot_key.exists() {
215        Some(SnapshotFiles {
216            disk: snapshot_disk,
217            efivars: snapshot_efivars,
218            seed_iso: snapshot_seed,
219            ssh_key: snapshot_key,
220            memory_mb: max_memory_mb,
221        })
222    } else {
223        match create_snapshot(
224            &prepared_path,
225            &efi.code,
226            &vars_template,
227            &snapshot_disk,
228            &snapshot_efivars,
229            &snapshot_seed,
230            &snapshot_key,
231            max_memory_mb,
232            use_kvm,
233        )
234        .await
235        {
236            Ok(()) => {
237                println!("  VM snapshot created for instant boot ({max_memory_mb}MB)");
238                Some(SnapshotFiles {
239                    disk: snapshot_disk,
240                    efivars: snapshot_efivars,
241                    seed_iso: snapshot_seed,
242                    ssh_key: snapshot_key,
243                    memory_mb: max_memory_mb,
244                })
245            }
246            Err(e) => {
247                eprintln!(
248                    "  Warning: failed to create VM snapshot (falling back to cold boot): {e:#}"
249                );
250                None
251            }
252        }
253    };
254
255    Ok(Image {
256        path: prepared_path,
257        efi_code: efi.code,
258        efi_vars_template: vars_template,
259        prepared: true,
260        snapshot,
261    })
262}
263
264/// Ensure a browser-ready image exists (base image + bun + playwright + chromium).
265/// Built on top of the base prepared image — one-time operation.
266pub async fn ensure_browser_image(
267    base: &Image,
268    distro: &Distro,
269    redownload: bool,
270    use_kvm: bool,
271    max_memory_mb: u32,
272) -> Result<Image> {
273    let cache = cache_dir()?;
274    let browser_path = cache.join(distro.browser_prepared_filename());
275
276    if redownload {
277        let _ = tokio::fs::remove_file(&browser_path).await;
278    }
279
280    if !browser_path.exists() {
281        println!("Preparing browser image (installing bun + playwright + chromium)...");
282        println!("  This is a one-time operation.");
283        prepare_browser_image(base, &browser_path, use_kvm).await?;
284        println!("Browser image cached at: {}", browser_path.display());
285    } else {
286        println!("Using browser image: {}", browser_path.display());
287    }
288
289    // Create browser-specific snapshot
290    let cache = cache_dir()?;
291    let snap_prefix = format!(
292        "{}-snapshot-{max_memory_mb}",
293        distro.browser_snapshot_base()
294    );
295    let snap_disk = cache.join(format!("{snap_prefix}.qcow2"));
296    let snap_efivars = cache.join(format!("{snap_prefix}-efivars.qcow2"));
297    let snap_seed = cache.join(format!("{snap_prefix}-seed.iso"));
298    let snap_key = cache.join("test-ssh-key");
299
300    let snapshot = if snap_disk.exists() && snap_key.exists() {
301        Some(SnapshotFiles {
302            disk: snap_disk,
303            efivars: snap_efivars,
304            seed_iso: snap_seed,
305            ssh_key: snap_key,
306            memory_mb: max_memory_mb,
307        })
308    } else {
309        match create_snapshot(
310            &browser_path,
311            &base.efi_code,
312            &base.efi_vars_template,
313            &snap_disk,
314            &snap_efivars,
315            &snap_seed,
316            &snap_key,
317            max_memory_mb,
318            use_kvm,
319        )
320        .await
321        {
322            Ok(()) => Some(SnapshotFiles {
323                disk: snap_disk,
324                efivars: snap_efivars,
325                seed_iso: snap_seed,
326                ssh_key: snap_key,
327                memory_mb: max_memory_mb,
328            }),
329            Err(e) => {
330                eprintln!("  Warning: failed to create browser VM snapshot: {e:#}");
331                None
332            }
333        }
334    };
335
336    Ok(Image {
337        path: browser_path,
338        efi_code: base.efi_code.clone(),
339        efi_vars_template: base.efi_vars_template.clone(),
340        prepared: true,
341        snapshot,
342    })
343}
344
345/// Boot the base prepared image, install bun + playwright + chromium, then snapshot.
346async fn prepare_browser_image(base: &Image, browser_path: &Path, use_kvm: bool) -> Result<()> {
347    use crate::machine::{Machine, SpawnOpts};
348    use crate::ports;
349
350    let id = crate::machine::random_id();
351    let ssh_port = ports::allocate_ssh_port();
352    let opts = SpawnOpts {
353        use_kvm,
354        memory_mb: 4096, // chromium install needs decent RAM
355        cpus: 2,
356        disk_gb: 20,
357    };
358
359    let mut vm = Machine::spawn(base, &id, ssh_port, &opts).await?;
360
361    // Install unzip (needed by bun installer), bun, playwright + chromium.
362    // Runs as the ryra user; uses sudo for system-level operations.
363    let install_script = r#"
364set -e
365sudo apt-get update -qq && sudo apt-get install -y -qq unzip >/dev/null 2>&1
366curl -fsSL https://bun.sh/install | bash
367export BUN_INSTALL="$HOME/.bun"
368export PATH="$BUN_INSTALL/bin:$PATH"
369
370# Create a global playwright project so chromium is cached system-wide
371sudo mkdir -p /opt/playwright && sudo chown $USER:$USER /opt/playwright
372cd /opt/playwright
373bun init -y >/dev/null 2>&1
374bun add playwright @playwright/test
375bunx playwright install chromium --with-deps
376
377# Add bun to PATH for future SSH sessions
378echo 'export BUN_INSTALL="$HOME/.bun"' >> $HOME/.bashrc
379echo 'export PATH="$BUN_INSTALL/bin:$PATH"' >> $HOME/.bashrc
380"#;
381
382    println!("  Installing bun + playwright + chromium in VM...");
383    let result = vm.exec(install_script).await;
384    if let Err(e) = &result {
385        let _ = vm.destroy().await;
386        anyhow::bail!("failed to install browser tools: {e:#}");
387    }
388
389    // Shut down cleanly, wait for qemu to release the disk, then convert.
390    let disk = vm.work_dir.join("disk.qcow2");
391    let _ = vm.exec("sudo sync && sudo poweroff").await;
392    vm.wait_for_exit(std::time::Duration::from_secs(30)).await;
393
394    let status = Command::new("qemu-img")
395        .args([
396            "convert",
397            "-f",
398            "qcow2",
399            "-O",
400            "qcow2",
401            &disk.to_string_lossy(),
402            &browser_path.to_string_lossy(),
403        ])
404        .stdout(Stdio::null())
405        .stderr(Stdio::null())
406        .status()
407        .await
408        .context("qemu-img convert failed")?;
409    if !status.success() {
410        anyhow::bail!("qemu-img convert failed for browser image");
411    }
412
413    let _ = vm.destroy().await;
414    Ok(())
415}
416
/// Host paths of an aarch64 EFI firmware pair, as located by `find_efi_firmware`.
struct EfiFirmware {
    /// Firmware code image.
    code: PathBuf,
    /// Variable-store template that belongs to `code`.
    vars: PathBuf,
}
421
422async fn find_efi_firmware() -> Result<EfiFirmware> {
423    let candidates = [
424        // Debian/Ubuntu
425        (
426            "/usr/share/AAVMF/AAVMF_CODE.fd",
427            "/usr/share/AAVMF/AAVMF_VARS.fd",
428        ),
429        (
430            "/usr/share/qemu-efi-aarch64/QEMU_EFI.fd",
431            "/usr/share/qemu-efi-aarch64/vars-template-pflash.raw",
432        ),
433        // Fedora / Arch
434        (
435            "/usr/share/edk2/aarch64/QEMU_EFI-pflash.raw",
436            "/usr/share/edk2/aarch64/vars-template-pflash.raw",
437        ),
438    ];
439
440    for (code, vars) in &candidates {
441        let code_path = PathBuf::from(code);
442        let vars_path = PathBuf::from(vars);
443        if code_path.exists() && vars_path.exists() {
444            return Ok(EfiFirmware {
445                code: code_path,
446                vars: vars_path,
447            });
448        }
449    }
450
451    anyhow::bail!(
452        "EFI firmware not found. Install it with:\n  \
453         sudo apt install qemu-efi-aarch64    # Debian/Ubuntu\n  \
454         sudo dnf install edk2-aarch64        # Fedora\n  \
455         sudo pacman -S edk2-aarch64          # Arch"
456    )
457}
458
459async fn download_image(distro: &Distro, dest: &PathBuf) -> Result<()> {
460    let url = distro.cloud_image_url();
461    println!("Downloading {distro} cloud image...");
462    println!("  {url}");
463
464    let partial = dest.with_extension("qcow2.partial");
465
466    let status = Command::new("curl")
467        .args([
468            "-L",
469            "--progress-bar",
470            "-o",
471            &partial.to_string_lossy(),
472            url,
473        ])
474        .stdout(Stdio::inherit())
475        .stderr(Stdio::inherit())
476        .status()
477        .await
478        .context("failed to run curl — is it installed?")?;
479
480    if !status.success() {
481        let _ = tokio::fs::remove_file(&partial).await;
482        anyhow::bail!("failed to download cloud image from {url}");
483    }
484
485    tokio::fs::rename(&partial, dest)
486        .await
487        .context("failed to move downloaded image into place")?;
488
489    println!("Image cached at: {}", dest.display());
490    Ok(())
491}
492
493/// Boot the raw cloud image, let cloud-init install packages, then snapshot it.
494///
495/// This is a one-time operation. The resulting image has podman, git, etc.
496/// already installed, so subsequent VMs skip the slow package install step.
497async fn prepare_image(
498    distro: &Distro,
499    raw_image: &Path,
500    prepared_path: &Path,
501    efi_code: &Path,
502    efi_vars_template: &Path,
503    use_kvm: bool,
504) -> Result<()> {
505    let work_dir = cache_dir()?.join("prepare-base");
506    let _ = tokio::fs::remove_dir_all(&work_dir).await;
507    tokio::fs::create_dir_all(&work_dir)
508        .await
509        .context("failed to create prepare work dir")?;
510
511    // Create a working copy of the raw image (not COW — we want a standalone result)
512    let disk = work_dir.join("disk.qcow2");
513    let status = Command::new("qemu-img")
514        .args([
515            "create",
516            "-f",
517            "qcow2",
518            "-b",
519            &raw_image.to_string_lossy(),
520            "-F",
521            "qcow2",
522            &disk.to_string_lossy(),
523            "20G",
524        ])
525        .stdout(Stdio::null())
526        .stderr(Stdio::null())
527        .status()
528        .await
529        .context("qemu-img create failed")?;
530    if !status.success() {
531        anyhow::bail!("qemu-img create failed for prepare step");
532    }
533
534    // Copy EFI vars
535    let efi_vars = work_dir.join("efivars.fd");
536    tokio::fs::copy(efi_vars_template, &efi_vars)
537        .await
538        .context("failed to copy EFI vars")?;
539
540    // Generate temp SSH key
541    let key_path = work_dir.join("id_ed25519");
542    let status = Command::new("ssh-keygen")
543        .args([
544            "-t",
545            "ed25519",
546            "-f",
547            &key_path.to_string_lossy(),
548            "-N",
549            "",
550            "-q",
551        ])
552        .stdout(Stdio::null())
553        .stderr(Stdio::null())
554        .status()
555        .await
556        .context("ssh-keygen failed")?;
557    if !status.success() {
558        anyhow::bail!("ssh-keygen failed");
559    }
560    let pub_key = tokio::fs::read_to_string(format!("{}.pub", key_path.display()))
561        .await
562        .context("failed to read public key")?;
563
564    // Build seed ISO with full package install
565    let seed_iso = work_dir.join("seed.iso");
566    crate::machine::build_seed_iso_full(
567        &work_dir,
568        &seed_iso,
569        "ryra-prepare",
570        pub_key.trim(),
571        distro.cloud_init_packages(),
572    )
573    .await?;
574
575    // Boot VM
576    let ssh_port = crate::ports::allocate_ssh_port();
577    let serial_log = work_dir.join("serial.log");
578    let memory = "2048";
579    let cpus = "2";
580    let efi_code_arg = format!(
581        "if=pflash,format=raw,file={},readonly=on",
582        efi_code.display()
583    );
584    let efi_vars_arg = format!("if=pflash,format=raw,file={}", efi_vars.display());
585    let disk_arg = format!("if=virtio,file={},format=qcow2", disk.display());
586    let seed_arg = format!("if=virtio,file={},format=raw", seed_iso.display());
587    let nic_arg = format!("user,hostfwd=tcp::{ssh_port}-:22");
588    let serial_arg = format!("file:{}", serial_log.display());
589
590    let mut args: Vec<&str> = vec![
591        "-machine",
592        "virt",
593        "-cpu",
594        if use_kvm { "host" } else { "max" },
595        "-m",
596        memory,
597        "-smp",
598        cpus,
599        "-drive",
600        &efi_code_arg,
601        "-drive",
602        &efi_vars_arg,
603        "-drive",
604        &disk_arg,
605        "-drive",
606        &seed_arg,
607        "-nic",
608        &nic_arg,
609        "-nographic",
610        "-serial",
611        &serial_arg,
612        "-monitor",
613        "none",
614    ];
615    if use_kvm {
616        args.extend(crate::accel_args().iter().copied());
617    }
618
619    let mut qemu = Command::new("qemu-system-aarch64")
620        .args(&args)
621        .stdout(Stdio::null())
622        .stderr(Stdio::null())
623        .spawn()
624        .context("failed to start QEMU for image preparation")?;
625
626    // Wait for SSH
627    let timeout = if use_kvm {
628        std::time::Duration::from_secs(300)
629    } else {
630        std::time::Duration::from_secs(900)
631    };
632    let mut progress = crate::progress::WaitProgress::new("image SSH", "ssh", timeout)
633        .with_heartbeat(std::time::Duration::from_secs(30));
634    let port_str = ssh_port.to_string();
635    loop {
636        let result = Command::new("ssh")
637            .args([
638                "-o",
639                "StrictHostKeyChecking=no",
640                "-o",
641                "UserKnownHostsFile=/dev/null",
642                "-o",
643                "LogLevel=ERROR",
644                "-o",
645                "ConnectTimeout=3",
646                "-o",
647                "BatchMode=yes",
648                "-i",
649                &key_path.to_string_lossy(),
650                "-p",
651                &port_str,
652                "ryra@127.0.0.1",
653                "true",
654            ])
655            .stdout(Stdio::null())
656            .stderr(Stdio::null())
657            .status()
658            .await;
659
660        if let Ok(s) = result
661            && s.success()
662        {
663            break;
664        }
665
666        if progress.timed_out() {
667            let _ = qemu.kill().await;
668            anyhow::bail!(
669                "timed out waiting for SSH during image preparation after {}s\n  \
670                 Serial log: {}",
671                timeout.as_secs(),
672                serial_log.display()
673            );
674        }
675
676        progress.tick();
677        tokio::time::sleep(std::time::Duration::from_secs(2)).await;
678    }
679
680    // Wait for cloud-init to finish
681    println!("  SSH ready, waiting for cloud-init to finish installing packages...");
682    let ci_result = Command::new("ssh")
683        .args([
684            "-o",
685            "StrictHostKeyChecking=no",
686            "-o",
687            "UserKnownHostsFile=/dev/null",
688            "-o",
689            "LogLevel=ERROR",
690            "-o",
691            "ConnectTimeout=10",
692            "-o",
693            "BatchMode=yes",
694            "-i",
695            &key_path.to_string_lossy(),
696            "-p",
697            &port_str,
698            "ryra@127.0.0.1",
699            "cloud-init status --wait",
700        ])
701        .stdout(Stdio::null())
702        .stderr(Stdio::null())
703        .status()
704        .await
705        .context("cloud-init wait failed")?;
706
707    if !ci_result.success() {
708        let _ = qemu.kill().await;
709        anyhow::bail!("cloud-init failed during image preparation");
710    }
711
712    // Clean up cloud-init state so it runs again on next boot (for per-VM SSH keys)
713    let _ = Command::new("ssh")
714        .args([
715            "-o",
716            "StrictHostKeyChecking=no",
717            "-o",
718            "UserKnownHostsFile=/dev/null",
719            "-o",
720            "LogLevel=ERROR",
721            "-o",
722            "BatchMode=yes",
723            "-i",
724            &key_path.to_string_lossy(),
725            "-p",
726            &port_str,
727            "ryra@127.0.0.1",
728            "cloud-init clean --logs && rm -f /etc/ssh/ssh_host_*_key*",
729        ])
730        .stdout(Stdio::null())
731        .stderr(Stdio::null())
732        .status()
733        .await;
734
735    // Shut down gracefully
736    let _ = Command::new("ssh")
737        .args([
738            "-o",
739            "StrictHostKeyChecking=no",
740            "-o",
741            "UserKnownHostsFile=/dev/null",
742            "-o",
743            "LogLevel=ERROR",
744            "-o",
745            "BatchMode=yes",
746            "-i",
747            &key_path.to_string_lossy(),
748            "-p",
749            &port_str,
750            "ryra@127.0.0.1",
751            "sudo poweroff",
752        ])
753        .stdout(Stdio::null())
754        .stderr(Stdio::null())
755        .status()
756        .await;
757
758    tokio::time::sleep(std::time::Duration::from_secs(5)).await;
759    let _ = qemu.kill().await;
760    let _ = qemu.wait().await;
761
762    // Compact the image — squash the COW layer into a standalone file
763    let status = Command::new("qemu-img")
764        .args([
765            "convert",
766            "-O",
767            "qcow2",
768            "-c",
769            &disk.to_string_lossy(),
770            &prepared_path.to_string_lossy(),
771        ])
772        .stdout(Stdio::null())
773        .stderr(Stdio::null())
774        .status()
775        .await
776        .context("qemu-img convert failed")?;
777    if !status.success() {
778        anyhow::bail!("failed to compact prepared image");
779    }
780
781    // Clean up work dir
782    let _ = tokio::fs::remove_dir_all(&work_dir).await;
783
784    Ok(())
785}
786
787/// Create a QEMU snapshot for instant VM boot.
788///
789/// Boots the prepared image, waits for SSH, then saves a VM snapshot.
790/// Subsequent VMs restore from this snapshot in <1s instead of cold-booting.
791#[allow(clippy::too_many_arguments)]
792async fn create_snapshot(
793    prepared_path: &Path,
794    efi_code: &Path,
795    efi_vars_template: &Path,
796    snapshot_disk: &Path,
797    snapshot_efivars: &Path,
798    snapshot_seed: &Path,
799    ssh_key_path: &Path,
800    memory_mb: u32,
801    use_kvm: bool,
802) -> Result<()> {
803    let work_dir = cache_dir()?.join("prepare-snapshot");
804    let _ = tokio::fs::remove_dir_all(&work_dir).await;
805    tokio::fs::create_dir_all(&work_dir)
806        .await
807        .context("failed to create snapshot work dir")?;
808
809    // Generate shared test SSH key (reused by all VMs)
810    if !ssh_key_path.exists() {
811        let status = Command::new("ssh-keygen")
812            .args([
813                "-t",
814                "ed25519",
815                "-f",
816                &ssh_key_path.to_string_lossy(),
817                "-N",
818                "",
819                "-q",
820            ])
821            .stdout(Stdio::null())
822            .stderr(Stdio::null())
823            .status()
824            .await
825            .context("ssh-keygen failed")?;
826        if !status.success() {
827            anyhow::bail!("ssh-keygen failed for test SSH key");
828        }
829    }
830
831    let pub_key = tokio::fs::read_to_string(format!("{}.pub", ssh_key_path.display()))
832        .await
833        .context("failed to read test SSH public key")?;
834
835    // Create COW overlay for snapshot boot
836    let disk = work_dir.join("disk.qcow2");
837    let status = Command::new("qemu-img")
838        .args([
839            "create",
840            "-f",
841            "qcow2",
842            "-b",
843            &prepared_path.to_string_lossy(),
844            "-F",
845            "qcow2",
846            &disk.to_string_lossy(),
847            "20G",
848        ])
849        .stdout(Stdio::null())
850        .stderr(Stdio::null())
851        .status()
852        .await
853        .context("qemu-img create failed")?;
854    if !status.success() {
855        anyhow::bail!("qemu-img create failed for snapshot disk");
856    }
857
858    // Convert EFI vars to qcow2 (required for snapshot support)
859    let efivars = work_dir.join("efivars.qcow2");
860    let status = Command::new("qemu-img")
861        .args([
862            "convert",
863            "-f",
864            "raw",
865            "-O",
866            "qcow2",
867            &efi_vars_template.to_string_lossy(),
868            &efivars.to_string_lossy(),
869        ])
870        .stdout(Stdio::null())
871        .stderr(Stdio::null())
872        .status()
873        .await
874        .context("qemu-img convert failed for efivars")?;
875    if !status.success() {
876        anyhow::bail!("failed to convert EFI vars to qcow2");
877    }
878
879    // Build seed ISO with the shared SSH key
880    let seed_iso = work_dir.join("seed.iso");
881    crate::machine::build_seed_iso(&work_dir, &seed_iso, "snapshot-prep", pub_key.trim()).await?;
882
883    // Boot with HMP monitor for savevm
884    let ssh_port = crate::ports::allocate_ssh_port();
885    let serial_log = work_dir.join("serial.log");
886    let port_str = ssh_port.to_string();
887
888    // Share the image store via virtfs (must match what Machine::spawn uses)
889    let shared_store = crate::machine::image_shared_store_dir()?;
890    tokio::fs::create_dir_all(&shared_store).await.ok();
891
892    let efi_code_arg = format!(
893        "if=pflash,format=raw,file={},readonly=on",
894        efi_code.display()
895    );
896    let efi_vars_arg = format!("if=pflash,format=qcow2,file={}", efivars.display());
897    let disk_arg = format!("if=virtio,file={},format=qcow2", disk.display());
898    let seed_arg = format!(
899        "if=virtio,file={},format=raw,readonly=on",
900        seed_iso.display()
901    );
902    let nic_arg = format!("user,hostfwd=tcp::{ssh_port}-:22");
903    let serial_arg = format!("file:{}", serial_log.display());
904    let mon_sock = work_dir.join("mon.sock");
905    let mon_arg = format!("unix:{},server,nowait", mon_sock.display());
906    let virtfs_arg = format!(
907        "local,path={},mount_tag=images,security_model=none,readonly=on",
908        shared_store.display()
909    );
910
911    let memory_str = memory_mb.to_string();
912    let mut args: Vec<&str> = vec![
913        "-machine",
914        "virt",
915        "-cpu",
916        if use_kvm { "host" } else { "max" },
917        "-m",
918        &memory_str,
919        "-smp",
920        "2",
921        "-drive",
922        &efi_code_arg,
923        "-drive",
924        &efi_vars_arg,
925        "-drive",
926        &disk_arg,
927        "-drive",
928        &seed_arg,
929        "-nic",
930        &nic_arg,
931        "-nographic",
932        "-serial",
933        &serial_arg,
934        "-monitor",
935        &mon_arg,
936        "-virtfs",
937        &virtfs_arg,
938    ];
939    if use_kvm {
940        args.extend(crate::accel_args().iter().copied());
941    }
942
943    let mut qemu = Command::new("qemu-system-aarch64")
944        .args(&args)
945        .stdout(Stdio::null())
946        .stderr(Stdio::null())
947        .spawn()
948        .context("failed to start QEMU for snapshot creation")?;
949
950    // Wait for SSH
951    let timeout = std::time::Duration::from_secs(if use_kvm { 120 } else { 600 });
952    let mut progress = crate::progress::WaitProgress::new("snapshot SSH", "ssh", timeout)
953        .with_heartbeat(std::time::Duration::from_secs(30));
954    loop {
955        let result = Command::new("ssh")
956            .args([
957                "-o",
958                "StrictHostKeyChecking=no",
959                "-o",
960                "UserKnownHostsFile=/dev/null",
961                "-o",
962                "LogLevel=ERROR",
963                "-o",
964                "ConnectTimeout=2",
965                "-o",
966                "BatchMode=yes",
967                "-i",
968                &ssh_key_path.to_string_lossy(),
969                "-p",
970                &port_str,
971                "ryra@127.0.0.1",
972                "true",
973            ])
974            .stdout(Stdio::null())
975            .stderr(Stdio::null())
976            .status()
977            .await;
978
979        if let Ok(s) = result
980            && s.success()
981        {
982            break;
983        }
984        if progress.timed_out() {
985            let _ = qemu.kill().await;
986            anyhow::bail!("timed out waiting for SSH during snapshot creation");
987        }
988        progress.tick();
989        tokio::time::sleep(std::time::Duration::from_secs(1)).await;
990    }
991
992    // Wait for cloud-init
993    let _ = Command::new("ssh")
994        .args([
995            "-o",
996            "StrictHostKeyChecking=no",
997            "-o",
998            "UserKnownHostsFile=/dev/null",
999            "-o",
1000            "LogLevel=ERROR",
1001            "-o",
1002            "BatchMode=yes",
1003            "-i",
1004            &ssh_key_path.to_string_lossy(),
1005            "-p",
1006            &port_str,
1007            "ryra@127.0.0.1",
1008            "cloud-init status --wait",
1009        ])
1010        .stdout(Stdio::null())
1011        .stderr(Stdio::null())
1012        .status()
1013        .await;
1014
1015    // Configure the VM before snapshotting so every restored VM starts clean.
1016    // - `/mnt/images` dir created (but NOT mounted — see below)
1017    // - Rootless podman config at ~/.config/containers/ (user-level)
1018    //
1019    // QEMU refuses `savevm` with "Migration is disabled when VirtFS export
1020    // path is mounted in the guest". So we deliberately leave /mnt/images
1021    // un-mounted while the snapshot is being saved. On cold boot the test
1022    // runner's `load_images_into_vm` mounts it on demand; when restoring
1023    // from this snapshot the same helper runs and handles the mount too.
1024    // The podman config still references /mnt/images — that path is
1025    // resolved lazily on first podman operation, so a stale reference
1026    // during snapshot save is harmless.
1027    let setup_cmd = "\
1028        sudo mkdir -p /mnt/images; \
1029        mkdir -p ~/.config/containers && \
1030        printf '[storage]\\ndriver = \"overlay\"\\n[storage.options]\\nadditionalimagestores = [\"/mnt/images\"]\\n' > ~/.config/containers/storage.conf && \
1031        printf 'unqualified-search-registries = [\"docker.io\"]\\n' > ~/.config/containers/registries.conf; \
1032        systemctl --user daemon-reload";
1033    let setup_status = Command::new("ssh")
1034        .args([
1035            "-o",
1036            "StrictHostKeyChecking=no",
1037            "-o",
1038            "UserKnownHostsFile=/dev/null",
1039            "-o",
1040            "LogLevel=ERROR",
1041            "-o",
1042            "BatchMode=yes",
1043            "-i",
1044            &ssh_key_path.to_string_lossy(),
1045            "-p",
1046            &port_str,
1047            "ryra@127.0.0.1",
1048            setup_cmd,
1049        ])
1050        .output()
1051        .await
1052        .context("failed to SSH for snapshot setup")?;
1053    if !setup_status.status.success() {
1054        let stderr = String::from_utf8_lossy(&setup_status.stderr);
1055        anyhow::bail!("snapshot setup failed: {stderr}");
1056    }
1057
1058    // Save snapshot via HMP monitor using socat
1059    let socat_result = std::process::Command::new("socat")
1060        .args(["-", &format!("UNIX-CONNECT:{}", mon_sock.display())])
1061        .stdin(std::process::Stdio::piped())
1062        .stdout(std::process::Stdio::null())
1063        .stderr(std::process::Stdio::null())
1064        .spawn()
1065        .and_then(|mut child| {
1066            use std::io::Write;
1067            if let Some(ref mut stdin) = child.stdin {
1068                stdin.write_all(b"savevm ready\n")?;
1069                stdin.flush()?;
1070            }
1071            child.stdin.take();
1072            Ok(child)
1073        });
1074
1075    match socat_result {
1076        Ok(mut child) => {
1077            // Wait for savevm to finish writing RAM state into the qcow2. We
1078            // poll the disk file's size: savevm grows it as it streams memory
1079            // out, so once the size is steady for a handful of consecutive
1080            // polls, we know the write is done. A plain `sleep(N)` is fragile
1081            // because `N` has to be large enough for the slowest host (6GB
1082            // can take 5+ min on a loaded Asahi) yet we pay it every boot on
1083            // fast hosts too. Polling ends as soon as the disk goes quiet.
1084            //
1085            // A hard ceiling prevents a hung savevm from pinning the process
1086            // forever, scaled to VM memory since bigger VMs legitimately take
1087            // longer.
1088            let start = std::time::Instant::now();
1089            let max_wait =
1090                std::time::Duration::from_secs(std::cmp::max(300, (memory_mb as u64) * 2));
1091            let poll_interval = std::time::Duration::from_secs(2);
1092            // Consider savevm done when the file size hasn't changed for
1093            // this many polls in a row (≈ 6 seconds of quiet). QEMU buffers
1094            // writes, so we want enough stability to rule out "briefly idle
1095            // between chunks".
1096            let stable_polls_needed = 3;
1097
1098            let mut last_size: u64 = 0;
1099            let mut stable_polls: u32 = 0;
1100            // Give savevm a moment to actually start writing before we begin
1101            // checking — otherwise we'd see the disk at its initial tiny size
1102            // and declare it "stable" before a single byte was written.
1103            tokio::time::sleep(std::time::Duration::from_secs(5)).await;
1104
1105            loop {
1106                let size = tokio::fs::metadata(&disk)
1107                    .await
1108                    .map(|m| m.len())
1109                    .unwrap_or(0);
1110                if size == last_size && size > 0 {
1111                    stable_polls += 1;
1112                    if stable_polls >= stable_polls_needed {
1113                        break;
1114                    }
1115                } else {
1116                    stable_polls = 0;
1117                    last_size = size;
1118                }
1119                if start.elapsed() > max_wait {
1120                    eprintln!(
1121                        "  warning: savevm hit max wait ({}s) — qcow2 size {}MB, proceeding anyway",
1122                        max_wait.as_secs(),
1123                        size / (1024 * 1024),
1124                    );
1125                    break;
1126                }
1127                tokio::time::sleep(poll_interval).await;
1128            }
1129            let _ = child.kill();
1130            let _ = child.wait();
1131        }
1132        Err(e) => {
1133            let _ = qemu.kill().await;
1134            anyhow::bail!("failed to save VM snapshot via socat: {e}. Is socat installed?");
1135        }
1136    }
1137
1138    let _ = qemu.kill().await;
1139    let _ = qemu.wait().await;
1140
1141    // Verify the snapshot was actually saved
1142    let check = Command::new("qemu-img")
1143        .args(["snapshot", "-l", &disk.to_string_lossy()])
1144        .output()
1145        .await
1146        .context("failed to run qemu-img snapshot -l")?;
1147    let snapshot_list = String::from_utf8_lossy(&check.stdout);
1148    if !snapshot_list.contains("ready") {
1149        anyhow::bail!(
1150            "savevm failed — snapshot 'ready' not found in {}. \
1151             This can happen if the VM needed more time to save {}MB of RAM.",
1152            disk.display(),
1153            memory_mb
1154        );
1155    }
1156
1157    // Move snapshot files to their final locations
1158    tokio::fs::rename(&disk, snapshot_disk)
1159        .await
1160        .context("failed to move snapshot disk")?;
1161    tokio::fs::rename(&efivars, snapshot_efivars)
1162        .await
1163        .context("failed to move snapshot efivars")?;
1164    tokio::fs::rename(&seed_iso, snapshot_seed)
1165        .await
1166        .context("failed to move snapshot seed ISO")?;
1167
1168    let _ = tokio::fs::remove_dir_all(&work_dir).await;
1169    Ok(())
1170}