Skip to main content

ryra_vm/
image.rs

1use std::fmt;
2use std::path::{Path, PathBuf};
3use std::process::Stdio;
4
5use anyhow::{Context, Result};
6use tokio::process::Command;
7
/// Distribution and release that a VM base image is built from.
///
/// Each variant selects the upstream cloud image, the cache file names and
/// the package set used while preparing the image.
#[derive(Clone, Copy, Debug, Eq, PartialEq)]
pub enum Distro {
    /// Debian 13 ("trixie"), aarch64 generic cloud image.
    Debian13,
    /// Fedora 43, aarch64 cloud base image.
    Fedora43,
}
14
15impl Distro {
16    fn cloud_image_url(&self) -> &str {
17        match self {
18            Distro::Debian13 => {
19                "https://cloud.debian.org/images/cloud/trixie/latest/debian-13-generic-arm64.qcow2"
20            }
21            Distro::Fedora43 => {
22                "https://download.fedoraproject.org/pub/fedora/linux/releases/43/Cloud/aarch64/images/Fedora-Cloud-Base-Generic-43-1.1.aarch64.qcow2"
23            }
24        }
25    }
26
27    fn image_filename(&self) -> &str {
28        match self {
29            Distro::Debian13 => "debian-13-generic-arm64.qcow2",
30            Distro::Fedora43 => "fedora-43-cloud-arm64.qcow2",
31        }
32    }
33
34    fn prepared_filename(&self) -> &str {
35        match self {
36            Distro::Debian13 => "debian-13-prepared-arm64.qcow2",
37            Distro::Fedora43 => "fedora-43-prepared-arm64.qcow2",
38        }
39    }
40
41    fn browser_prepared_filename(&self) -> &str {
42        match self {
43            Distro::Debian13 => "debian-13-browser-arm64.qcow2",
44            Distro::Fedora43 => "fedora-43-browser-arm64.qcow2",
45        }
46    }
47
48    fn snapshot_base(&self) -> &str {
49        match self {
50            Distro::Debian13 => "debian-13-arm64",
51            Distro::Fedora43 => "fedora-43-arm64",
52        }
53    }
54
55    fn browser_snapshot_base(&self) -> &str {
56        match self {
57            Distro::Debian13 => "debian-13-browser-arm64",
58            Distro::Fedora43 => "fedora-43-browser-arm64",
59        }
60    }
61
62    /// Packages to install via cloud-init during image preparation.
63    pub fn cloud_init_packages(&self) -> &[&str] {
64        match self {
65            // Runtime: podman, podman-compose (compose services), uidmap (rootless
66            // namespaces), systemd-container (machined), git (registry fetch).
67            // Test-only: curl (HTTP assertions), postgresql-client (postgres tests).
68            Distro::Debian13 => &[
69                "podman",
70                "podman-compose",
71                "uidmap",
72                "git",
73                "systemd-container",
74                "curl",
75                "postgresql-client",
76            ],
77            // Fedora: uidmap is part of shadow-utils (already installed).
78            Distro::Fedora43 => &[
79                "podman",
80                "podman-compose",
81                "git",
82                "systemd-container",
83                "curl",
84            ],
85        }
86    }
87}
88
89impl fmt::Display for Distro {
90    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
91        match self {
92            Distro::Debian13 => write!(f, "debian-13"),
93            Distro::Fedora43 => write!(f, "fedora-43"),
94        }
95    }
96}
97
98impl std::str::FromStr for Distro {
99    type Err = String;
100
101    fn from_str(s: &str) -> std::result::Result<Self, Self::Err> {
102        match s {
103            "debian-13" => Ok(Distro::Debian13),
104            "fedora-43" => Ok(Distro::Fedora43),
105            other => Err(format!(
106                "unknown distro: {other} (available: debian-13, fedora-43)"
107            )),
108        }
109    }
110}
111
/// Paths to a cached, bootable VM image and the EFI firmware it boots with.
#[derive(Debug, Clone)]
pub struct Image {
    /// Prepared qcow2 image.
    pub path: PathBuf,
    /// EFI firmware code image.
    pub efi_code: PathBuf,
    /// Pristine EFI variable store that is copied/converted for each VM.
    pub efi_vars_template: PathBuf,
    /// If true, cloud-init packages are already installed — skip package install.
    pub prepared: bool,
    /// Snapshot boot files (if available). When present, VMs restore from a
    /// saved QEMU snapshot instead of cold-booting — SSH is ready in <1s.
    pub snapshot: Option<SnapshotFiles>,
}

/// Files needed for QEMU snapshot restore.
#[derive(Debug, Clone)]
pub struct SnapshotFiles {
    /// qcow2 disk with the "ready" snapshot saved inside.
    pub disk: PathBuf,
    /// qcow2 EFI vars with snapshot state (QEMU splits snapshots across drives).
    pub efivars: PathBuf,
    /// cloud-init seed ISO (must be present for device topology match, but is
    /// not re-processed — cloud-init already ran in the snapshot).
    pub seed_iso: PathBuf,
    /// SSH private key baked into the snapshot via cloud-init.
    pub ssh_key: PathBuf,
    /// RAM size (MB) the snapshot was created with. VMs must use the same size.
    pub memory_mb: u32,
}
139
140/// Cache directory for downloaded images.
141fn cache_dir() -> Result<PathBuf> {
142    let base = dirs::cache_dir().context("could not determine cache directory (is $HOME set?)")?;
143    Ok(base.join("ryra-vm"))
144}
145
146/// Ensure the base cloud image, prepared image, and EFI firmware are available.
147///
148/// The "prepared" image has all packages pre-installed (podman, git, etc.)
149/// so VMs boot in ~30s instead of ~6 minutes. It's created by booting the raw
150/// cloud image once with cloud-init, then snapshotting.
151pub async fn ensure_image(
152    distro: &Distro,
153    redownload: bool,
154    use_kvm: bool,
155    max_memory_mb: u32,
156) -> Result<Image> {
157    let cache = cache_dir()?;
158    tokio::fs::create_dir_all(&cache)
159        .await
160        .context("failed to create image cache directory")?;
161
162    let raw_path = cache.join(distro.image_filename());
163    let prepared_path = cache.join(distro.prepared_filename());
164
165    // Download raw cloud image if needed
166    if redownload || !raw_path.exists() {
167        download_image(distro, &raw_path).await?;
168        // Force re-prepare if raw image changed
169        let _ = tokio::fs::remove_file(&prepared_path).await;
170    }
171
172    // Find EFI firmware
173    let efi = find_efi_firmware().await?;
174
175    // Create a vars template if we don't have one
176    let vars_template = cache.join("efivars.fd");
177    if !vars_template.exists() {
178        tokio::fs::copy(&efi.vars, &vars_template)
179            .await
180            .context("failed to copy EFI vars template")?;
181    }
182
183    // Build prepared image if it doesn't exist
184    if !prepared_path.exists() {
185        println!("Preparing base image (installing packages — this is a one-time operation)...");
186        let serial_log = cache_dir()?.join("prepare-base").join("serial.log");
187        println!("  Serial log: {}", serial_log.display());
188        prepare_image(
189            distro,
190            &raw_path,
191            &prepared_path,
192            &efi.code,
193            &vars_template,
194            use_kvm,
195        )
196        .await?;
197        println!("Prepared image cached at: {}", prepared_path.display());
198    } else {
199        println!("Using prepared image: {}", prepared_path.display());
200    }
201
202    // Create VM snapshot for instant boot (if not already created).
203    // Snapshot is cached per memory size — if tests need more RAM than the
204    // cached snapshot, a new one is created at the larger size.
205    let snapshot_prefix = format!("{}-snapshot-{max_memory_mb}", distro.snapshot_base());
206    let snapshot_disk = cache.join(format!("{snapshot_prefix}.qcow2"));
207    let snapshot_efivars = cache.join(format!("{snapshot_prefix}-efivars.qcow2"));
208    let snapshot_seed = cache.join(format!("{snapshot_prefix}-seed.iso"));
209    let snapshot_key = cache.join("test-ssh-key");
210
211    let snapshot = if snapshot_disk.exists() && snapshot_key.exists() {
212        Some(SnapshotFiles {
213            disk: snapshot_disk,
214            efivars: snapshot_efivars,
215            seed_iso: snapshot_seed,
216            ssh_key: snapshot_key,
217            memory_mb: max_memory_mb,
218        })
219    } else {
220        match create_snapshot(
221            &prepared_path,
222            &efi.code,
223            &vars_template,
224            &snapshot_disk,
225            &snapshot_efivars,
226            &snapshot_seed,
227            &snapshot_key,
228            max_memory_mb,
229            use_kvm,
230        )
231        .await
232        {
233            Ok(()) => {
234                println!("  VM snapshot created for instant boot ({max_memory_mb}MB)");
235                Some(SnapshotFiles {
236                    disk: snapshot_disk,
237                    efivars: snapshot_efivars,
238                    seed_iso: snapshot_seed,
239                    ssh_key: snapshot_key,
240                    memory_mb: max_memory_mb,
241                })
242            }
243            Err(e) => {
244                eprintln!(
245                    "  Warning: failed to create VM snapshot (falling back to cold boot): {e:#}"
246                );
247                None
248            }
249        }
250    };
251
252    Ok(Image {
253        path: prepared_path,
254        efi_code: efi.code,
255        efi_vars_template: vars_template,
256        prepared: true,
257        snapshot,
258    })
259}
260
261/// Ensure a browser-ready image exists (base image + bun + playwright + chromium).
262/// Built on top of the base prepared image — one-time operation.
263pub async fn ensure_browser_image(
264    base: &Image,
265    distro: &Distro,
266    redownload: bool,
267    use_kvm: bool,
268    max_memory_mb: u32,
269) -> Result<Image> {
270    let cache = cache_dir()?;
271    let browser_path = cache.join(distro.browser_prepared_filename());
272
273    if redownload {
274        let _ = tokio::fs::remove_file(&browser_path).await;
275    }
276
277    if !browser_path.exists() {
278        println!("Preparing browser image (installing bun + playwright + chromium)...");
279        println!("  This is a one-time operation.");
280        prepare_browser_image(base, &browser_path, use_kvm).await?;
281        println!("Browser image cached at: {}", browser_path.display());
282    } else {
283        println!("Using browser image: {}", browser_path.display());
284    }
285
286    // Create browser-specific snapshot
287    let cache = cache_dir()?;
288    let snap_prefix = format!(
289        "{}-snapshot-{max_memory_mb}",
290        distro.browser_snapshot_base()
291    );
292    let snap_disk = cache.join(format!("{snap_prefix}.qcow2"));
293    let snap_efivars = cache.join(format!("{snap_prefix}-efivars.qcow2"));
294    let snap_seed = cache.join(format!("{snap_prefix}-seed.iso"));
295    let snap_key = cache.join("test-ssh-key");
296
297    let snapshot = if snap_disk.exists() && snap_key.exists() {
298        Some(SnapshotFiles {
299            disk: snap_disk,
300            efivars: snap_efivars,
301            seed_iso: snap_seed,
302            ssh_key: snap_key,
303            memory_mb: max_memory_mb,
304        })
305    } else {
306        match create_snapshot(
307            &browser_path,
308            &base.efi_code,
309            &base.efi_vars_template,
310            &snap_disk,
311            &snap_efivars,
312            &snap_seed,
313            &snap_key,
314            max_memory_mb,
315            use_kvm,
316        )
317        .await
318        {
319            Ok(()) => Some(SnapshotFiles {
320                disk: snap_disk,
321                efivars: snap_efivars,
322                seed_iso: snap_seed,
323                ssh_key: snap_key,
324                memory_mb: max_memory_mb,
325            }),
326            Err(e) => {
327                eprintln!("  Warning: failed to create browser VM snapshot: {e:#}");
328                None
329            }
330        }
331    };
332
333    Ok(Image {
334        path: browser_path,
335        efi_code: base.efi_code.clone(),
336        efi_vars_template: base.efi_vars_template.clone(),
337        prepared: true,
338        snapshot,
339    })
340}
341
342/// Boot the base prepared image, install bun + playwright + chromium, then snapshot.
343async fn prepare_browser_image(base: &Image, browser_path: &Path, use_kvm: bool) -> Result<()> {
344    use crate::machine::{Machine, SpawnOpts};
345    use crate::ports;
346
347    let id = crate::machine::random_id();
348    let ssh_port = ports::allocate_ssh_port();
349    let opts = SpawnOpts {
350        use_kvm,
351        memory_mb: 4096, // chromium install needs decent RAM
352        cpus: 2,
353        disk_gb: 20,
354    };
355
356    let mut vm = Machine::spawn(base, &id, ssh_port, &opts).await?;
357
358    // Install unzip (needed by bun installer), bun, playwright + chromium.
359    // Runs as the ryra user; uses sudo for system-level operations.
360    let install_script = r#"
361set -e
362sudo apt-get update -qq && sudo apt-get install -y -qq unzip >/dev/null 2>&1
363curl -fsSL https://bun.sh/install | bash
364export BUN_INSTALL="$HOME/.bun"
365export PATH="$BUN_INSTALL/bin:$PATH"
366
367# Create a global playwright project so chromium is cached system-wide
368sudo mkdir -p /opt/playwright && sudo chown $USER:$USER /opt/playwright
369cd /opt/playwright
370bun init -y >/dev/null 2>&1
371bun add playwright @playwright/test
372bunx playwright install chromium --with-deps
373
374# Add bun to PATH for future SSH sessions
375echo 'export BUN_INSTALL="$HOME/.bun"' >> $HOME/.bashrc
376echo 'export PATH="$BUN_INSTALL/bin:$PATH"' >> $HOME/.bashrc
377"#;
378
379    println!("  Installing bun + playwright + chromium in VM...");
380    let result = vm.exec(install_script).await;
381    if let Err(e) = &result {
382        let _ = vm.destroy().await;
383        anyhow::bail!("failed to install browser tools: {e:#}");
384    }
385
386    // Shut down cleanly, wait for qemu to release the disk, then convert.
387    let disk = vm.work_dir.join("disk.qcow2");
388    let _ = vm.exec("sudo sync && sudo poweroff").await;
389    vm.wait_for_exit(std::time::Duration::from_secs(30)).await;
390
391    let status = Command::new("qemu-img")
392        .args([
393            "convert",
394            "-f",
395            "qcow2",
396            "-O",
397            "qcow2",
398            &disk.to_string_lossy(),
399            &browser_path.to_string_lossy(),
400        ])
401        .stdout(Stdio::null())
402        .stderr(Stdio::null())
403        .status()
404        .await
405        .context("qemu-img convert failed")?;
406    if !status.success() {
407        anyhow::bail!("qemu-img convert failed for browser image");
408    }
409
410    let _ = vm.destroy().await;
411    Ok(())
412}
413
/// A matching pair of EFI firmware files found on the host.
struct EfiFirmware {
    /// Firmware code image.
    code: PathBuf,
    /// Variable-store template that belongs to `code`.
    vars: PathBuf,
}
418
419async fn find_efi_firmware() -> Result<EfiFirmware> {
420    let candidates = [
421        // Debian/Ubuntu
422        (
423            "/usr/share/AAVMF/AAVMF_CODE.fd",
424            "/usr/share/AAVMF/AAVMF_VARS.fd",
425        ),
426        (
427            "/usr/share/qemu-efi-aarch64/QEMU_EFI.fd",
428            "/usr/share/qemu-efi-aarch64/vars-template-pflash.raw",
429        ),
430        // Fedora / Arch
431        (
432            "/usr/share/edk2/aarch64/QEMU_EFI-pflash.raw",
433            "/usr/share/edk2/aarch64/vars-template-pflash.raw",
434        ),
435    ];
436
437    for (code, vars) in &candidates {
438        let code_path = PathBuf::from(code);
439        let vars_path = PathBuf::from(vars);
440        if code_path.exists() && vars_path.exists() {
441            return Ok(EfiFirmware {
442                code: code_path,
443                vars: vars_path,
444            });
445        }
446    }
447
448    anyhow::bail!(
449        "EFI firmware not found. Install it with:\n  \
450         sudo apt install qemu-efi-aarch64    # Debian/Ubuntu\n  \
451         sudo dnf install edk2-aarch64        # Fedora\n  \
452         sudo pacman -S edk2-aarch64          # Arch"
453    )
454}
455
456async fn download_image(distro: &Distro, dest: &PathBuf) -> Result<()> {
457    let url = distro.cloud_image_url();
458    println!("Downloading {distro} cloud image...");
459    println!("  {url}");
460
461    let partial = dest.with_extension("qcow2.partial");
462
463    let status = Command::new("curl")
464        .args([
465            "-L",
466            "--progress-bar",
467            "-o",
468            &partial.to_string_lossy(),
469            url,
470        ])
471        .stdout(Stdio::inherit())
472        .stderr(Stdio::inherit())
473        .status()
474        .await
475        .context("failed to run curl — is it installed?")?;
476
477    if !status.success() {
478        let _ = tokio::fs::remove_file(&partial).await;
479        anyhow::bail!("failed to download cloud image from {url}");
480    }
481
482    tokio::fs::rename(&partial, dest)
483        .await
484        .context("failed to move downloaded image into place")?;
485
486    println!("Image cached at: {}", dest.display());
487    Ok(())
488}
489
490/// Boot the raw cloud image, let cloud-init install packages, then snapshot it.
491///
492/// This is a one-time operation. The resulting image has podman, git, etc.
493/// already installed, so subsequent VMs skip the slow package install step.
494async fn prepare_image(
495    distro: &Distro,
496    raw_image: &Path,
497    prepared_path: &Path,
498    efi_code: &Path,
499    efi_vars_template: &Path,
500    use_kvm: bool,
501) -> Result<()> {
502    let work_dir = cache_dir()?.join("prepare-base");
503    let _ = tokio::fs::remove_dir_all(&work_dir).await;
504    tokio::fs::create_dir_all(&work_dir)
505        .await
506        .context("failed to create prepare work dir")?;
507
508    // Create a working copy of the raw image (not COW — we want a standalone result)
509    let disk = work_dir.join("disk.qcow2");
510    let status = Command::new("qemu-img")
511        .args([
512            "create",
513            "-f",
514            "qcow2",
515            "-b",
516            &raw_image.to_string_lossy(),
517            "-F",
518            "qcow2",
519            &disk.to_string_lossy(),
520            "20G",
521        ])
522        .stdout(Stdio::null())
523        .stderr(Stdio::null())
524        .status()
525        .await
526        .context("qemu-img create failed")?;
527    if !status.success() {
528        anyhow::bail!("qemu-img create failed for prepare step");
529    }
530
531    // Copy EFI vars
532    let efi_vars = work_dir.join("efivars.fd");
533    tokio::fs::copy(efi_vars_template, &efi_vars)
534        .await
535        .context("failed to copy EFI vars")?;
536
537    // Generate temp SSH key
538    let key_path = work_dir.join("id_ed25519");
539    let status = Command::new("ssh-keygen")
540        .args([
541            "-t",
542            "ed25519",
543            "-f",
544            &key_path.to_string_lossy(),
545            "-N",
546            "",
547            "-q",
548        ])
549        .stdout(Stdio::null())
550        .stderr(Stdio::null())
551        .status()
552        .await
553        .context("ssh-keygen failed")?;
554    if !status.success() {
555        anyhow::bail!("ssh-keygen failed");
556    }
557    let pub_key = tokio::fs::read_to_string(format!("{}.pub", key_path.display()))
558        .await
559        .context("failed to read public key")?;
560
561    // Build seed ISO with full package install
562    let seed_iso = work_dir.join("seed.iso");
563    crate::machine::build_seed_iso_full(
564        &work_dir,
565        &seed_iso,
566        "ryra-prepare",
567        pub_key.trim(),
568        distro.cloud_init_packages(),
569    )
570    .await?;
571
572    // Boot VM
573    let ssh_port = crate::ports::allocate_ssh_port();
574    let serial_log = work_dir.join("serial.log");
575    let memory = "2048";
576    let cpus = "2";
577    let efi_code_arg = format!(
578        "if=pflash,format=raw,file={},readonly=on",
579        efi_code.display()
580    );
581    let efi_vars_arg = format!("if=pflash,format=raw,file={}", efi_vars.display());
582    let disk_arg = format!("if=virtio,file={},format=qcow2", disk.display());
583    let seed_arg = format!("if=virtio,file={},format=raw", seed_iso.display());
584    let nic_arg = format!("user,hostfwd=tcp::{ssh_port}-:22");
585    let serial_arg = format!("file:{}", serial_log.display());
586
587    let mut args: Vec<&str> = vec![
588        "-machine",
589        "virt",
590        "-cpu",
591        if use_kvm { "host" } else { "max" },
592        "-m",
593        memory,
594        "-smp",
595        cpus,
596        "-drive",
597        &efi_code_arg,
598        "-drive",
599        &efi_vars_arg,
600        "-drive",
601        &disk_arg,
602        "-drive",
603        &seed_arg,
604        "-nic",
605        &nic_arg,
606        "-nographic",
607        "-serial",
608        &serial_arg,
609        "-monitor",
610        "none",
611    ];
612    if use_kvm {
613        args.extend(crate::accel_args().iter().copied());
614    }
615
616    let mut qemu = Command::new("qemu-system-aarch64")
617        .args(&args)
618        .stdout(Stdio::null())
619        .stderr(Stdio::null())
620        .spawn()
621        .context("failed to start QEMU for image preparation")?;
622
623    // Wait for SSH
624    let timeout = if use_kvm {
625        std::time::Duration::from_secs(300)
626    } else {
627        std::time::Duration::from_secs(900)
628    };
629    let start = std::time::Instant::now();
630    let port_str = ssh_port.to_string();
631    loop {
632        let result = Command::new("ssh")
633            .args([
634                "-o",
635                "StrictHostKeyChecking=no",
636                "-o",
637                "UserKnownHostsFile=/dev/null",
638                "-o",
639                "LogLevel=ERROR",
640                "-o",
641                "ConnectTimeout=3",
642                "-o",
643                "BatchMode=yes",
644                "-i",
645                &key_path.to_string_lossy(),
646                "-p",
647                &port_str,
648                "ryra@127.0.0.1",
649                "true",
650            ])
651            .stdout(Stdio::null())
652            .stderr(Stdio::null())
653            .status()
654            .await;
655
656        if let Ok(s) = result
657            && s.success()
658        {
659            break;
660        }
661
662        if start.elapsed() > timeout {
663            let _ = qemu.kill().await;
664            anyhow::bail!(
665                "timed out waiting for SSH during image preparation after {}s\n  \
666                 Serial log: {}",
667                timeout.as_secs(),
668                serial_log.display()
669            );
670        }
671
672        if start.elapsed().as_secs().is_multiple_of(30) && start.elapsed().as_secs() > 0 {
673            println!(
674                "  preparing image... ({:.0}s elapsed)",
675                start.elapsed().as_secs_f64()
676            );
677        }
678
679        tokio::time::sleep(std::time::Duration::from_secs(2)).await;
680    }
681
682    // Wait for cloud-init to finish
683    println!("  SSH ready, waiting for cloud-init to finish installing packages...");
684    let ci_result = Command::new("ssh")
685        .args([
686            "-o",
687            "StrictHostKeyChecking=no",
688            "-o",
689            "UserKnownHostsFile=/dev/null",
690            "-o",
691            "LogLevel=ERROR",
692            "-o",
693            "ConnectTimeout=10",
694            "-o",
695            "BatchMode=yes",
696            "-i",
697            &key_path.to_string_lossy(),
698            "-p",
699            &port_str,
700            "ryra@127.0.0.1",
701            "cloud-init status --wait",
702        ])
703        .stdout(Stdio::null())
704        .stderr(Stdio::null())
705        .status()
706        .await
707        .context("cloud-init wait failed")?;
708
709    if !ci_result.success() {
710        let _ = qemu.kill().await;
711        anyhow::bail!("cloud-init failed during image preparation");
712    }
713
714    // Clean up cloud-init state so it runs again on next boot (for per-VM SSH keys)
715    let _ = Command::new("ssh")
716        .args([
717            "-o",
718            "StrictHostKeyChecking=no",
719            "-o",
720            "UserKnownHostsFile=/dev/null",
721            "-o",
722            "LogLevel=ERROR",
723            "-o",
724            "BatchMode=yes",
725            "-i",
726            &key_path.to_string_lossy(),
727            "-p",
728            &port_str,
729            "ryra@127.0.0.1",
730            "cloud-init clean --logs && rm -f /etc/ssh/ssh_host_*_key*",
731        ])
732        .stdout(Stdio::null())
733        .stderr(Stdio::null())
734        .status()
735        .await;
736
737    // Shut down gracefully
738    let _ = Command::new("ssh")
739        .args([
740            "-o",
741            "StrictHostKeyChecking=no",
742            "-o",
743            "UserKnownHostsFile=/dev/null",
744            "-o",
745            "LogLevel=ERROR",
746            "-o",
747            "BatchMode=yes",
748            "-i",
749            &key_path.to_string_lossy(),
750            "-p",
751            &port_str,
752            "ryra@127.0.0.1",
753            "sudo poweroff",
754        ])
755        .stdout(Stdio::null())
756        .stderr(Stdio::null())
757        .status()
758        .await;
759
760    tokio::time::sleep(std::time::Duration::from_secs(5)).await;
761    let _ = qemu.kill().await;
762    let _ = qemu.wait().await;
763
764    // Compact the image — squash the COW layer into a standalone file
765    let status = Command::new("qemu-img")
766        .args([
767            "convert",
768            "-O",
769            "qcow2",
770            "-c",
771            &disk.to_string_lossy(),
772            &prepared_path.to_string_lossy(),
773        ])
774        .stdout(Stdio::null())
775        .stderr(Stdio::null())
776        .status()
777        .await
778        .context("qemu-img convert failed")?;
779    if !status.success() {
780        anyhow::bail!("failed to compact prepared image");
781    }
782
783    // Clean up work dir
784    let _ = tokio::fs::remove_dir_all(&work_dir).await;
785
786    Ok(())
787}
788
789/// Create a QEMU snapshot for instant VM boot.
790///
791/// Boots the prepared image, waits for SSH, then saves a VM snapshot.
792/// Subsequent VMs restore from this snapshot in <1s instead of cold-booting.
793#[allow(clippy::too_many_arguments)]
794async fn create_snapshot(
795    prepared_path: &Path,
796    efi_code: &Path,
797    efi_vars_template: &Path,
798    snapshot_disk: &Path,
799    snapshot_efivars: &Path,
800    snapshot_seed: &Path,
801    ssh_key_path: &Path,
802    memory_mb: u32,
803    use_kvm: bool,
804) -> Result<()> {
805    let work_dir = cache_dir()?.join("prepare-snapshot");
806    let _ = tokio::fs::remove_dir_all(&work_dir).await;
807    tokio::fs::create_dir_all(&work_dir)
808        .await
809        .context("failed to create snapshot work dir")?;
810
811    // Generate shared test SSH key (reused by all VMs)
812    if !ssh_key_path.exists() {
813        let status = Command::new("ssh-keygen")
814            .args([
815                "-t",
816                "ed25519",
817                "-f",
818                &ssh_key_path.to_string_lossy(),
819                "-N",
820                "",
821                "-q",
822            ])
823            .stdout(Stdio::null())
824            .stderr(Stdio::null())
825            .status()
826            .await
827            .context("ssh-keygen failed")?;
828        if !status.success() {
829            anyhow::bail!("ssh-keygen failed for test SSH key");
830        }
831    }
832
833    let pub_key = tokio::fs::read_to_string(format!("{}.pub", ssh_key_path.display()))
834        .await
835        .context("failed to read test SSH public key")?;
836
837    // Create COW overlay for snapshot boot
838    let disk = work_dir.join("disk.qcow2");
839    let status = Command::new("qemu-img")
840        .args([
841            "create",
842            "-f",
843            "qcow2",
844            "-b",
845            &prepared_path.to_string_lossy(),
846            "-F",
847            "qcow2",
848            &disk.to_string_lossy(),
849            "20G",
850        ])
851        .stdout(Stdio::null())
852        .stderr(Stdio::null())
853        .status()
854        .await
855        .context("qemu-img create failed")?;
856    if !status.success() {
857        anyhow::bail!("qemu-img create failed for snapshot disk");
858    }
859
860    // Convert EFI vars to qcow2 (required for snapshot support)
861    let efivars = work_dir.join("efivars.qcow2");
862    let status = Command::new("qemu-img")
863        .args([
864            "convert",
865            "-f",
866            "raw",
867            "-O",
868            "qcow2",
869            &efi_vars_template.to_string_lossy(),
870            &efivars.to_string_lossy(),
871        ])
872        .stdout(Stdio::null())
873        .stderr(Stdio::null())
874        .status()
875        .await
876        .context("qemu-img convert failed for efivars")?;
877    if !status.success() {
878        anyhow::bail!("failed to convert EFI vars to qcow2");
879    }
880
881    // Build seed ISO with the shared SSH key
882    let seed_iso = work_dir.join("seed.iso");
883    crate::machine::build_seed_iso(&work_dir, &seed_iso, "snapshot-prep", pub_key.trim()).await?;
884
885    // Boot with HMP monitor for savevm
886    let ssh_port = crate::ports::allocate_ssh_port();
887    let serial_log = work_dir.join("serial.log");
888    let port_str = ssh_port.to_string();
889
890    // Share the image store via virtfs (must match what Machine::spawn uses)
891    let shared_store = crate::machine::image_shared_store_dir()?;
892    tokio::fs::create_dir_all(&shared_store).await.ok();
893
894    let efi_code_arg = format!(
895        "if=pflash,format=raw,file={},readonly=on",
896        efi_code.display()
897    );
898    let efi_vars_arg = format!("if=pflash,format=qcow2,file={}", efivars.display());
899    let disk_arg = format!("if=virtio,file={},format=qcow2", disk.display());
900    let seed_arg = format!(
901        "if=virtio,file={},format=raw,readonly=on",
902        seed_iso.display()
903    );
904    let nic_arg = format!("user,hostfwd=tcp::{ssh_port}-:22");
905    let serial_arg = format!("file:{}", serial_log.display());
906    let mon_sock = work_dir.join("mon.sock");
907    let mon_arg = format!("unix:{},server,nowait", mon_sock.display());
908    let virtfs_arg = format!(
909        "local,path={},mount_tag=images,security_model=none,readonly=on",
910        shared_store.display()
911    );
912
913    let memory_str = memory_mb.to_string();
914    let mut args: Vec<&str> = vec![
915        "-machine",
916        "virt",
917        "-cpu",
918        if use_kvm { "host" } else { "max" },
919        "-m",
920        &memory_str,
921        "-smp",
922        "2",
923        "-drive",
924        &efi_code_arg,
925        "-drive",
926        &efi_vars_arg,
927        "-drive",
928        &disk_arg,
929        "-drive",
930        &seed_arg,
931        "-nic",
932        &nic_arg,
933        "-nographic",
934        "-serial",
935        &serial_arg,
936        "-monitor",
937        &mon_arg,
938        "-virtfs",
939        &virtfs_arg,
940    ];
941    if use_kvm {
942        args.extend(crate::accel_args().iter().copied());
943    }
944
945    let mut qemu = Command::new("qemu-system-aarch64")
946        .args(&args)
947        .stdout(Stdio::null())
948        .stderr(Stdio::null())
949        .spawn()
950        .context("failed to start QEMU for snapshot creation")?;
951
952    // Wait for SSH
953    let timeout = std::time::Duration::from_secs(if use_kvm { 120 } else { 600 });
954    let start = std::time::Instant::now();
955    loop {
956        let result = Command::new("ssh")
957            .args([
958                "-o",
959                "StrictHostKeyChecking=no",
960                "-o",
961                "UserKnownHostsFile=/dev/null",
962                "-o",
963                "LogLevel=ERROR",
964                "-o",
965                "ConnectTimeout=2",
966                "-o",
967                "BatchMode=yes",
968                "-i",
969                &ssh_key_path.to_string_lossy(),
970                "-p",
971                &port_str,
972                "ryra@127.0.0.1",
973                "true",
974            ])
975            .stdout(Stdio::null())
976            .stderr(Stdio::null())
977            .status()
978            .await;
979
980        if let Ok(s) = result
981            && s.success()
982        {
983            break;
984        }
985        if start.elapsed() > timeout {
986            let _ = qemu.kill().await;
987            anyhow::bail!("timed out waiting for SSH during snapshot creation");
988        }
989        tokio::time::sleep(std::time::Duration::from_secs(1)).await;
990    }
991
992    // Wait for cloud-init
993    let _ = Command::new("ssh")
994        .args([
995            "-o",
996            "StrictHostKeyChecking=no",
997            "-o",
998            "UserKnownHostsFile=/dev/null",
999            "-o",
1000            "LogLevel=ERROR",
1001            "-o",
1002            "BatchMode=yes",
1003            "-i",
1004            &ssh_key_path.to_string_lossy(),
1005            "-p",
1006            &port_str,
1007            "ryra@127.0.0.1",
1008            "cloud-init status --wait",
1009        ])
1010        .stdout(Stdio::null())
1011        .stderr(Stdio::null())
1012        .status()
1013        .await;
1014
1015    // Configure the VM before snapshotting so every restored VM starts clean.
1016    // - `/mnt/images` dir created (but NOT mounted — see below)
1017    // - Rootless podman config at ~/.config/containers/ (user-level)
1018    //
1019    // QEMU refuses `savevm` with "Migration is disabled when VirtFS export
1020    // path is mounted in the guest". So we deliberately leave /mnt/images
1021    // un-mounted while the snapshot is being saved. On cold boot the test
1022    // runner's `load_images_into_vm` mounts it on demand; when restoring
1023    // from this snapshot the same helper runs and handles the mount too.
1024    // The podman config still references /mnt/images — that path is
1025    // resolved lazily on first podman operation, so a stale reference
1026    // during snapshot save is harmless.
1027    let setup_cmd = "\
1028        sudo mkdir -p /mnt/images; \
1029        mkdir -p ~/.config/containers && \
1030        printf '[storage]\\ndriver = \"overlay\"\\n[storage.options]\\nadditionalimagestores = [\"/mnt/images\"]\\n' > ~/.config/containers/storage.conf && \
1031        printf 'unqualified-search-registries = [\"docker.io\"]\\n' > ~/.config/containers/registries.conf; \
1032        systemctl --user daemon-reload";
1033    let setup_status = Command::new("ssh")
1034        .args([
1035            "-o",
1036            "StrictHostKeyChecking=no",
1037            "-o",
1038            "UserKnownHostsFile=/dev/null",
1039            "-o",
1040            "LogLevel=ERROR",
1041            "-o",
1042            "BatchMode=yes",
1043            "-i",
1044            &ssh_key_path.to_string_lossy(),
1045            "-p",
1046            &port_str,
1047            "ryra@127.0.0.1",
1048            setup_cmd,
1049        ])
1050        .output()
1051        .await
1052        .context("failed to SSH for snapshot setup")?;
1053    if !setup_status.status.success() {
1054        let stderr = String::from_utf8_lossy(&setup_status.stderr);
1055        anyhow::bail!("snapshot setup failed: {stderr}");
1056    }
1057
1058    // Save snapshot via HMP monitor using socat
1059    let socat_result = std::process::Command::new("socat")
1060        .args(["-", &format!("UNIX-CONNECT:{}", mon_sock.display())])
1061        .stdin(std::process::Stdio::piped())
1062        .stdout(std::process::Stdio::null())
1063        .stderr(std::process::Stdio::null())
1064        .spawn()
1065        .and_then(|mut child| {
1066            use std::io::Write;
1067            if let Some(ref mut stdin) = child.stdin {
1068                stdin.write_all(b"savevm ready\n")?;
1069                stdin.flush()?;
1070            }
1071            child.stdin.take();
1072            Ok(child)
1073        });
1074
1075    match socat_result {
1076        Ok(mut child) => {
1077            // Wait for savevm to finish writing RAM state into the qcow2. We
1078            // poll the disk file's size: savevm grows it as it streams memory
1079            // out, so once the size is steady for a handful of consecutive
1080            // polls, we know the write is done. A plain `sleep(N)` is fragile
1081            // because `N` has to be large enough for the slowest host (6GB
1082            // can take 5+ min on a loaded Asahi) yet we pay it every boot on
1083            // fast hosts too. Polling ends as soon as the disk goes quiet.
1084            //
1085            // A hard ceiling prevents a hung savevm from pinning the process
1086            // forever, scaled to VM memory since bigger VMs legitimately take
1087            // longer.
1088            let start = std::time::Instant::now();
1089            let max_wait =
1090                std::time::Duration::from_secs(std::cmp::max(300, (memory_mb as u64) * 2));
1091            let poll_interval = std::time::Duration::from_secs(2);
1092            // Consider savevm done when the file size hasn't changed for
1093            // this many polls in a row (≈ 6 seconds of quiet). QEMU buffers
1094            // writes, so we want enough stability to rule out "briefly idle
1095            // between chunks".
1096            let stable_polls_needed = 3;
1097
1098            let mut last_size: u64 = 0;
1099            let mut stable_polls: u32 = 0;
1100            // Give savevm a moment to actually start writing before we begin
1101            // checking — otherwise we'd see the disk at its initial tiny size
1102            // and declare it "stable" before a single byte was written.
1103            tokio::time::sleep(std::time::Duration::from_secs(5)).await;
1104
1105            loop {
1106                let size = tokio::fs::metadata(&disk)
1107                    .await
1108                    .map(|m| m.len())
1109                    .unwrap_or(0);
1110                if size == last_size && size > 0 {
1111                    stable_polls += 1;
1112                    if stable_polls >= stable_polls_needed {
1113                        break;
1114                    }
1115                } else {
1116                    stable_polls = 0;
1117                    last_size = size;
1118                }
1119                if start.elapsed() > max_wait {
1120                    eprintln!(
1121                        "  warning: savevm hit max wait ({}s) — qcow2 size {}MB, proceeding anyway",
1122                        max_wait.as_secs(),
1123                        size / (1024 * 1024),
1124                    );
1125                    break;
1126                }
1127                tokio::time::sleep(poll_interval).await;
1128            }
1129            let _ = child.kill();
1130            let _ = child.wait();
1131        }
1132        Err(e) => {
1133            let _ = qemu.kill().await;
1134            anyhow::bail!("failed to save VM snapshot via socat: {e}. Is socat installed?");
1135        }
1136    }
1137
1138    let _ = qemu.kill().await;
1139    let _ = qemu.wait().await;
1140
1141    // Verify the snapshot was actually saved
1142    let check = Command::new("qemu-img")
1143        .args(["snapshot", "-l", &disk.to_string_lossy()])
1144        .output()
1145        .await
1146        .context("failed to run qemu-img snapshot -l")?;
1147    let snapshot_list = String::from_utf8_lossy(&check.stdout);
1148    if !snapshot_list.contains("ready") {
1149        anyhow::bail!(
1150            "savevm failed — snapshot 'ready' not found in {}. \
1151             This can happen if the VM needed more time to save {}MB of RAM.",
1152            disk.display(),
1153            memory_mb
1154        );
1155    }
1156
1157    // Move snapshot files to their final locations
1158    tokio::fs::rename(&disk, snapshot_disk)
1159        .await
1160        .context("failed to move snapshot disk")?;
1161    tokio::fs::rename(&efivars, snapshot_efivars)
1162        .await
1163        .context("failed to move snapshot efivars")?;
1164    tokio::fs::rename(&seed_iso, snapshot_seed)
1165        .await
1166        .context("failed to move snapshot seed ISO")?;
1167
1168    let _ = tokio::fs::remove_dir_all(&work_dir).await;
1169    Ok(())
1170}