//! supermachine 0.4.13
//!
//! Run any OCI/Docker image as a hardware-isolated microVM on macOS HVF (Linux KVM and Windows WHP in progress). Single library API, zero flags for the common case, sub-100 ms cold-restore from snapshot.
// virtio-balloon — guest cooperative memory release.
//
// Two queues: inflate (idx 0) where the guest hands us PFN lists of
// pages it's no longer using, and deflate (idx 1) for taking pages
// back. We `madvise(MADV_FREE)` the inflated pages on the host's
// CoW RAM mapping so the kernel can reclaim them under pressure
// (next access faults from the snapshot file or zero-fills).
//
// The host triggers inflation by bumping the device's `num_pages`
// config register and asserting the config-change IRQ; the guest's
// virtio_balloon driver then frees that many pages and pushes the
// PFN list onto the inflate queue.
//
// Config space layout (virtio 1.2 §5.5.6):
//   0x000  num_pages  LE u32 — set by host, read by guest
//   0x004  actual     LE u32 — written by guest as it inflates

#![allow(dead_code)]

use std::sync::atomic::{AtomicBool, AtomicU64, Ordering};
use std::sync::{Arc, Mutex};

use super::queue::{Queue, VRING_DESC_F_WRITE};
use super::VirtioDevice;

const VIRTIO_ID_BALLOON: u32 = 5;

pub static INFLATED_PAGES: AtomicU64 = AtomicU64::new(0);

/// Core virtio-balloon device state, shared (via `Arc`) with the
/// MMIO shell and the RAM-aware wrapper below.
pub struct VirtioBalloon {
    /// Virtqueues installed by `activate()`: idx 0 = inflate, idx 1 = deflate.
    queues: Mutex<Vec<Queue>>,
    /// Set once `activate()` has installed the queues; the drain paths
    /// no-op until then.
    activated: AtomicBool,
    /// Host-requested balloon size in 4 KiB pages (config offset 0x000).
    num_pages: Mutex<u32>,
    /// Guest-reported inflation progress (config offset 0x004),
    /// updated through `config_write`.
    actual: Mutex<u32>,
    /// Raises the used-buffer interrupt (interrupt_status bit 0) after
    /// a drain retires descriptors.
    irq_raise: Mutex<Option<Arc<dyn Fn() + Send + Sync>>>,
    /// Raised separately from the used-buffer IRQ when num_pages
    /// changes; sets interrupt_status bit 1 instead of bit 0. The
    /// MmioVirtio shell calls this via `raise_config_irq`.
    config_irq_raise: Mutex<Option<Arc<dyn Fn() + Send + Sync>>>,
}

impl VirtioBalloon {
    pub fn new() -> Self {
        Self {
            queues: Mutex::new(Vec::new()),
            activated: AtomicBool::new(false),
            num_pages: Mutex::new(0),
            actual: Mutex::new(0),
            irq_raise: Mutex::new(None),
            config_irq_raise: Mutex::new(None),
        }
    }

    /// Install the callback that raises the used-buffer IRQ.
    pub fn set_irq_raise(&self, f: Arc<dyn Fn() + Send + Sync>) {
        let mut slot = self.irq_raise.lock().unwrap();
        *slot = Some(f);
    }

    /// Install the callback that raises the config-change IRQ.
    pub fn set_config_irq_raise(&self, f: Arc<dyn Fn() + Send + Sync>) {
        let mut slot = self.config_irq_raise.lock().unwrap();
        *slot = Some(f);
    }

    /// Ask the guest to release `pages` 4 KiB pages. Bumps num_pages
    /// + fires config-change IRQ. The guest's balloon driver wakes,
    /// frees that many pages, and pushes their PFNs onto the inflate
    /// queue (which our `notify(0)` then madvise-FREEs).
    pub fn request_inflate(&self, pages: u32) {
        // Update num_pages inside a short scope so the lock is released
        // before the IRQ callback runs; bail out if nothing changed.
        {
            let mut target = self.num_pages.lock().unwrap();
            if *target == pages {
                return;
            }
            *target = pages;
        }
        let cb = self.config_irq_raise.lock().unwrap().clone();
        if let Some(f) = cb {
            f();
        }
    }

    /// Drain the inflate queue (idx 0): parse LE u32 PFN lists out of
    /// the guest's device-readable buffers and madvise the matching
    /// 4 KiB host pages away, then retire the chains and fire the
    /// used-buffer IRQ.
    ///
    /// `ram_host`/`ram_size` describe the host mapping of guest RAM and
    /// `ram_gpa` is the guest-physical address it starts at; PFNs that
    /// fall outside this window are silently skipped.
    fn drain_inflate(&self, ram_host: *mut u8, ram_size: usize, ram_gpa: u64) {
        // No queues yet — activate() hasn't run.
        if !self.activated.load(Ordering::Acquire) {
            return;
        }
        let mut qs = self.queues.lock().unwrap();
        let q = match qs.get_mut(0) {
            Some(q) => q,
            None => return,
        };
        if !q.ready {
            return;
        }
        let mut any = false; // retired at least one chain → raise IRQ
        let mut freed: u64 = 0; // pages madvised away in this drain
        loop {
            let (head, chain) = match q.pop_chain() {
                Some(p) => p,
                None => break,
            };
            let mut total: u32 = 0;
            for d in &chain {
                // PFN lists are device-readable; skip device-writable descs.
                if d.flags & VRING_DESC_F_WRITE != 0 {
                    continue;
                }
                if d.len == 0 {
                    continue;
                }
                let mut buf = vec![0u8; d.len as usize];
                q.mem.read_slice(d.addr, &mut buf);
                // Each entry is a little-endian u32 page frame number in
                // 4 KiB units; chunks_exact drops any trailing partial entry.
                for chunk in buf.chunks_exact(4) {
                    let pfn = u32::from_le_bytes([chunk[0], chunk[1], chunk[2], chunk[3]]) as u64;
                    let gpa = pfn * 4096;
                    if gpa < ram_gpa {
                        continue;
                    }
                    let off = (gpa - ram_gpa) as usize;
                    if off + 4096 > ram_size {
                        continue;
                    }
                    // SAFETY: ram_host valid for ram_size; off+4K bounded.
                    //
                    // Try MADV_FREE_REUSABLE first — on macOS, this marks
                    // pages as reusable AND immediately decrements the
                    // process's RSS / phys_footprint, exactly the
                    // behaviour ballooning needs. Plain MADV_FREE on
                    // macOS is "lazy reclaim under pressure" → RSS
                    // doesn't drop until the OS gets squeezed, which
                    // defeats the "host can pack more workers" win.
                    // Fall back to MADV_FREE on the rare ENOTSUP.
                    unsafe {
                        let p = ram_host.add(off) as *mut libc::c_void;
                        let r = libc::madvise(p, 4096, libc::MADV_FREE_REUSABLE);
                        if r != 0 {
                            libc::madvise(p, 4096, libc::MADV_FREE);
                        }
                    }
                    freed += 1;
                }
                // NOTE(review): the virtio spec defines used.len as bytes the
                // *device wrote*; these buffers are device-readable, so many
                // VMMs report 0 here. Reporting the read length looks
                // harmless but confirm the guest driver ignores it.
                total = total.saturating_add(d.len);
            }
            q.add_used(head, total);
            any = true;
        }
        drop(qs);
        if freed > 0 {
            INFLATED_PAGES.fetch_add(freed, Ordering::Relaxed);
            // Opt-in tracing: set SUPERMACHINE_BALLOON_TRACE to log totals.
            if std::env::var_os("SUPERMACHINE_BALLOON_TRACE").is_some() {
                let total = INFLATED_PAGES.load(Ordering::Relaxed);
                eprintln!(
                    "[virtio-balloon] inflated +{freed} pages \
                     (total={total} = {} MiB reclaimed)",
                    total * 4 / 1024
                );
            }
        }
        if any {
            if let Some(f) = self.irq_raise.lock().unwrap().clone() {
                f();
            }
        }
    }

    /// Drain the deflate queue (idx 1): the guest is taking pages back,
    /// so we only consume the PFN chains, credit `INFLATED_PAGES` back
    /// down, and ack with the used-buffer IRQ. No madvise is needed —
    /// reclaimed pages simply fault back in on next guest access.
    fn drain_deflate(&self) {
        if !self.activated.load(Ordering::Acquire) {
            return;
        }
        let mut qs = self.queues.lock().unwrap();
        let q = match qs.get_mut(1) {
            Some(q) => q,
            None => return,
        };
        if !q.ready {
            return;
        }
        let mut any = false;
        // Each descriptor carries LE u32 PFN entries, 4 bytes per page.
        let mut pages: u64 = 0;
        loop {
            let (head, chain) = match q.pop_chain() {
                Some(p) => p,
                None => break,
            };
            let mut total: u32 = 0;
            for d in &chain {
                pages += (d.len as u64) / 4;
                total = total.saturating_add(d.len);
            }
            q.add_used(head, total);
            any = true;
        }
        drop(qs);
        if pages > 0 {
            // Single atomic RMW. The previous load-then-store pair was a
            // lost-update race: a concurrent fetch_add from drain_inflate
            // between the load and the store would be overwritten.
            // saturating_sub keeps the counter from wrapping below zero.
            let _ = INFLATED_PAGES.fetch_update(Ordering::Relaxed, Ordering::Relaxed, |cur| {
                Some(cur.saturating_sub(pages))
            });
        }
        if any {
            if let Some(f) = self.irq_raise.lock().unwrap().clone() {
                f();
            }
        }
    }
}

/// Wrapper that knows the guest-RAM layout for the madvise call.
///
/// `VirtioDevice` is implemented on this wrapper (not on `VirtioBalloon`
/// itself) because `drain_inflate` needs the host mapping coordinates.
pub struct VirtioBalloonWithRam {
    pub inner: Arc<VirtioBalloon>,
    /// Host virtual address where guest RAM is mapped.
    pub ram_host: *mut u8,
    /// Length of that mapping in bytes.
    pub ram_size: usize,
    /// Guest-physical address the mapping starts at.
    pub ram_gpa: u64,
}
// SAFETY: the raw `ram_host` pointer is what strips the auto traits; the
// only access through it here is `madvise` in `drain_inflate`, which is
// bounds-checked against `ram_size`. NOTE(review): soundness also relies
// on the VM keeping the RAM mapping alive for this device's lifetime —
// confirm at the construction site.
unsafe impl Send for VirtioBalloonWithRam {}
unsafe impl Sync for VirtioBalloonWithRam {}

impl VirtioDevice for VirtioBalloonWithRam {
    fn device_id(&self) -> u32 {
        VIRTIO_ID_BALLOON
    }

    fn num_queues(&self) -> usize {
        2 // inflate (0) + deflate (1)
    }

    fn features(&self) -> u64 {
        // VIRTIO_F_VERSION_1: we only speak the modern (1.x) interface.
        1u64 << 32
    }

    /// Serialize the 8-byte config space: num_pages then actual,
    /// both little-endian u32 (virtio 1.2 §5.5.6).
    fn config(&self) -> Vec<u8> {
        let num_pages = *self.inner.num_pages.lock().unwrap();
        let actual = *self.inner.actual.lock().unwrap();
        let mut cfg = num_pages.to_le_bytes().to_vec();
        cfg.extend_from_slice(&actual.to_le_bytes());
        cfg
    }

    /// Queue-notify doorbell: dispatch to the matching drain path.
    fn notify(&self, q: u16) {
        if q == 0 {
            self.inner
                .drain_inflate(self.ram_host, self.ram_size, self.ram_gpa);
        } else if q == 1 {
            self.inner.drain_deflate();
        }
    }

    /// Install the negotiated queues and open the drain paths.
    fn activate(&self, queues: Vec<Queue>) {
        *self.inner.queues.lock().unwrap() = queues;
        self.inner.activated.store(true, Ordering::Release);
        eprintln!("[virtio-balloon] activated");
    }

    fn snapshot_queues(&self) -> Vec<Queue> {
        self.inner.queues.lock().unwrap().clone()
    }

    /// Guest writes "actual" at config-offset 0x004 as it inflates;
    /// every other offset is read-only and ignored.
    fn config_write(&self, offset: usize, value: u32) {
        match offset {
            0x004 => *self.inner.actual.lock().unwrap() = value,
            _ => {}
        }
    }
}