Skip to main content

squib_host/
pager.rs

1//! Postcopy / lazy-restore pager — Mach-exception-port-driven page server.
2//!
3//! Implements [16-snapshots.md § 5](../../../specs/16-snapshots.md#5-postcopy--lazy-restore).
4//! Two distinct backends:
5//!
6//! - [`FilePageSource`] — pages come from the `<id>.mem` file by `pread(2)` at `(ipa - ram_start)`.
7//! - [`UffdPageSource`] — pages come from a UDS the operator runs a page server on,
8//!   protocol-compatible with upstream Firecracker's `Uffd` backend.
9//!
10//! The pager itself is host-portable: it owns the page-source dispatcher, the
11//! pre-warm list, statistics, and the LLDB-coexistence policy state. The Mach side
12//! lives in a `mach_imp` module under `cfg(target_os = "macos")`; the rest of the
13//! crate is testable on Linux too.
14//!
15//! ## LLDB coexistence
16//!
17//! On startup the pager calls `task_swap_exception_ports` (NOT `task_set_exception_ports`)
18//! so the previous exception handler — kernel default, or LLDB's port — is captured.
19//! On every received fault that does not fall in any registered postcopy region, the
20//! pager forwards the exception via `mach_exception_raise_state_identity` to the saved
21//! prior port and returns the prior handler's `KERN_*` value.
22//!
23//! The "lldb-attaches-after-pager" case is handled by re-reading the current ports
24//! on every `mach_msg` 1 s timeout (configurable via [`PagerConfig::poll_interval`])
25//! and re-installing if they have drifted.
26
27#[cfg(unix)]
28use std::os::unix::fs::FileExt;
29use std::{
30    collections::BTreeMap,
31    fs::File,
32    io::{Read, Write},
33    os::unix::net::UnixStream,
34    path::{Path, PathBuf},
35    sync::{
36        Arc,
37        atomic::{AtomicBool, AtomicU64, Ordering},
38    },
39    time::Duration,
40};
41
42use bytes::{Bytes, BytesMut};
43use parking_lot::Mutex;
44use thiserror::Error;
45
46/// Pager configuration knobs (16 § 5).
47#[derive(Debug, Clone)]
48pub struct PagerConfig {
49    /// `mach_msg` poll interval. The pager wakes up every interval to re-read the
50    /// task's exception ports and re-install if LLDB has overwritten ours.
51    pub poll_interval: Duration,
52    /// Pre-warm list of guest IPAs to fault in before vCPU 0 runs.
53    pub prewarm: PrewarmList,
54}
55
56impl Default for PagerConfig {
57    fn default() -> Self {
58        Self {
59            poll_interval: Duration::from_secs(1),
60            prewarm: PrewarmList::default(),
61        }
62    }
63}
64
/// Pre-warm list — the handful of guest IPAs the pager pulls in before vCPU 0
/// runs, so the guest's first cycles do not all stall on the pager (16 § 5.2).
#[derive(Debug, Clone, Default)]
pub struct PrewarmList {
    /// Guest IPAs, one page per entry. `Pager::prewarm` serves them in this order.
    pub pages: Vec<u64>,
}

impl PrewarmList {
    /// Assemble the list from the three boot-critical addresses: the kernel
    /// `_text`/`_stext` neighbourhood, the vCPU 0 stack, and the FDT.
    ///
    /// Every argument is a guest-physical address and becomes exactly one entry,
    /// stored unmodified and in argument order. Rounding down to the page
    /// boundary happens later, when the pager serves the entry.
    #[must_use]
    pub fn from_boot_critical(kernel_text_ipa: u64, vcpu0_stack_ipa: u64, fdt_ipa: u64) -> Self {
        let pages = Vec::from([kernel_text_ipa, vcpu0_stack_ipa, fdt_ipa]);
        Self { pages }
    }
}
86
/// One page request the pager sends to a [`PageSource`].
///
/// `Pager::serve_fault` fills `ipa` with the fault address already rounded
/// down to a `page_size` boundary.
#[derive(Debug, Clone)]
pub struct PageRequest {
    /// Guest-physical address (IPA) of the page.
    pub ipa: u64,
    /// Page size in bytes (matches the host page on Apple Silicon = 16 KiB).
    ///
    /// The sources in this file size their read buffer from this value.
    /// `FilePageSource` also builds its alignment mask from `page_size - 1`,
    /// which presumes a non-zero power of two — TODO confirm callers guarantee it.
    pub page_size: u64,
}
95
96/// Error variants surfaced from a [`PageSource`].
97#[derive(Debug, Error)]
98pub enum PageSourceError {
99    /// The source returned fewer bytes than `page_size`.
100    #[error("page source returned a short read: {got} bytes, expected {expected}")]
101    Short {
102        /// Bytes received.
103        got: u64,
104        /// Bytes expected.
105        expected: u64,
106    },
107    /// The operator-supplied path / UDS could not be opened.
108    #[error("page source open: {0}")]
109    Open(String),
110    /// Underlying I/O failure.
111    #[error("page source I/O: {0}")]
112    Io(#[source] std::io::Error),
113}
114
115impl From<std::io::Error> for PageSourceError {
116    fn from(err: std::io::Error) -> Self {
117        Self::Io(err)
118    }
119}
120
/// Where the pager pulls page bytes from.
///
/// One concrete impl per `mem_backend.backend_type` (`File`, `Uffd`).
///
/// The pager stores the source as a `Box<dyn PageSource>` inside state it
/// shares through an `Arc`, hence the `Send + Sync` bounds; `fetch` takes
/// `&self`, so an implementation must do its own locking if it needs any.
pub trait PageSource: Send + Sync + std::fmt::Debug {
    /// Return the bytes for `req.ipa` rounded down to its `req.page_size` boundary.
    ///
    /// On success both implementations in this file return a buffer of exactly
    /// `req.page_size` bytes.
    ///
    /// # Errors
    /// [`PageSourceError`] for any source-side failure.
    fn fetch(&self, req: &PageRequest) -> Result<Bytes, PageSourceError>;
}
131
132/// `File`-backed page source. Reads from a memory-file on local disk by `pread`.
133#[derive(Debug)]
134pub struct FilePageSource {
135    file: File,
136    ram_start: u64,
137}
138
139impl FilePageSource {
140    /// Open the memory file and bind it to `ram_start`.
141    ///
142    /// `ram_start` is the guest-physical address corresponding to byte 0 of the
143    /// memory file (typically `DRAM_BASE = 0x8000_0000`).
144    ///
145    /// # Errors
146    /// [`PageSourceError::Open`] if the file cannot be opened.
147    pub fn open(path: &Path, ram_start: u64) -> Result<Self, PageSourceError> {
148        let file = File::open(path)
149            .map_err(|e| PageSourceError::Open(format!("{}: {e}", path.display())))?;
150        Ok(Self { file, ram_start })
151    }
152}
153
154impl PageSource for FilePageSource {
155    fn fetch(&self, req: &PageRequest) -> Result<Bytes, PageSourceError> {
156        if req.ipa < self.ram_start {
157            return Err(PageSourceError::Open(format!(
158                "ipa {:#x} below ram_start {:#x}",
159                req.ipa, self.ram_start
160            )));
161        }
162        let offset = req.ipa - self.ram_start;
163        let aligned = offset & !(req.page_size - 1);
164        let mut buf = vec![
165            0u8;
166            usize::try_from(req.page_size).map_err(|_| {
167                PageSourceError::Open("page_size > usize::MAX".into())
168            })?
169        ];
170        // `pread` keeps the read off the shared seek cursor — works under
171        // concurrent fault threads.
172        #[cfg(unix)]
173        let n = self.file.read_at(&mut buf, aligned)?;
174        #[cfg(not(unix))]
175        let n = {
176            // Non-unix is test-only; the lib is unsafe on Mach which is unix-only.
177            let _ = aligned;
178            buf.fill(0);
179            buf.len()
180        };
181        // `usize → u64` is infallible on every supported squib host (Apple
182        // Silicon is 64-bit), but `try_from` makes the widening explicit so the
183        // crate-level `cast_possible_truncation` allow no longer hides a wrap.
184        let n_u64 = u64::try_from(n).unwrap_or(u64::MAX);
185        if n_u64 < req.page_size {
186            return Err(PageSourceError::Short {
187                got: n_u64,
188                expected: req.page_size,
189            });
190        }
191        Ok(Bytes::from(buf))
192    }
193}
194
195/// `Uffd`-backed page source. Talks to a page-server on a UDS in the
196/// upstream-Firecracker wire shape (see § 5.1).
197///
198/// Wire shape (squib's; mirrors upstream's `userfaultfd` request shape):
199///
200/// ```text
201/// request:  <u64 LE: ipa>
202///           <u64 LE: page_size>
203/// response: <u64 LE: ipa>
204///           <u64 LE: page_size>
205///           <bytes: page payload, length == page_size>
206/// ```
207///
208/// The connection is `parking_lot::Mutex`-serialised so concurrent fault threads
209/// can share one UDS without interleaving requests.
210#[derive(Debug)]
211pub struct UffdPageSource {
212    socket: Mutex<UnixStream>,
213    uds_path: PathBuf,
214}
215
216impl UffdPageSource {
217    /// Connect to the page-server's UDS.
218    ///
219    /// # Errors
220    /// [`PageSourceError::Open`] if connect fails.
221    pub fn connect(path: &Path) -> Result<Self, PageSourceError> {
222        let sock = UnixStream::connect(path)
223            .map_err(|e| PageSourceError::Open(format!("connect({}): {e}", path.display())))?;
224        Ok(Self {
225            socket: Mutex::new(sock),
226            uds_path: path.to_path_buf(),
227        })
228    }
229
230    /// Path of the UDS this source is connected to.
231    #[must_use]
232    pub fn path(&self) -> &Path {
233        &self.uds_path
234    }
235}
236
237impl PageSource for UffdPageSource {
238    fn fetch(&self, req: &PageRequest) -> Result<Bytes, PageSourceError> {
239        let mut sock = self.socket.lock();
240        // Send request.
241        sock.write_all(&req.ipa.to_le_bytes())?;
242        sock.write_all(&req.page_size.to_le_bytes())?;
243        // Read header echo.
244        let mut hdr = [0u8; 16];
245        sock.read_exact(&mut hdr)?;
246        let echo_ipa = u64::from_le_bytes(hdr[0..8].try_into().unwrap_or([0; 8]));
247        let echo_size = u64::from_le_bytes(hdr[8..16].try_into().unwrap_or([0; 8]));
248        if echo_ipa != req.ipa || echo_size != req.page_size {
249            return Err(PageSourceError::Open(format!(
250                "Uffd protocol violation: expected ipa={:#x} size={} got ipa={:#x} size={}",
251                req.ipa, req.page_size, echo_ipa, echo_size
252            )));
253        }
254        let want = usize::try_from(req.page_size)
255            .map_err(|_| PageSourceError::Open("page_size > usize::MAX".into()))?;
256        let mut buf = BytesMut::zeroed(want);
257        sock.read_exact(&mut buf)?;
258        Ok(buf.freeze())
259    }
260}
261
/// Pager counters, kept for tracing and unit tests. Every counter only ever
/// grows, so a test can assert "fault count went up by N".
#[derive(Debug, Default)]
pub struct PagerStats {
    /// Fault requests served.
    pub faults: AtomicU64,
    /// Pages pre-warmed at registration.
    pub prewarmed: AtomicU64,
    /// How many times the pager re-installed its exception port after detecting
    /// LLDB drift.
    pub port_reinstalls: AtomicU64,
    /// Exceptions handed on to a prior handler (out-of-region faults).
    pub forwarded_exceptions: AtomicU64,
}

impl PagerStats {
    /// Copy the current counter values into a plain-data struct for tests /
    /// logging. Each counter is loaded separately with relaxed ordering.
    pub fn snapshot(&self) -> PagerStatsSnapshot {
        let read = |counter: &AtomicU64| counter.load(Ordering::Relaxed);
        let faults = read(&self.faults);
        let prewarmed = read(&self.prewarmed);
        let port_reinstalls = read(&self.port_reinstalls);
        let forwarded_exceptions = read(&self.forwarded_exceptions);
        PagerStatsSnapshot {
            faults,
            prewarmed,
            port_reinstalls,
            forwarded_exceptions,
        }
    }
}

/// Plain-data copy of [`PagerStats`] taken at one moment.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub struct PagerStatsSnapshot {
    /// Faults served.
    pub faults: u64,
    /// Pages pre-warmed.
    pub prewarmed: u64,
    /// Exception-port re-installs.
    pub port_reinstalls: u64,
    /// Exceptions forwarded to a prior handler.
    pub forwarded_exceptions: u64,
}
301
/// Errors produced by the pager.
#[derive(Debug, Error)]
pub enum PagerError {
    /// The IPA the host requested falls outside any registered postcopy region.
    #[error("postcopy region missing: ipa={ipa:#x}")]
    OutOfRegion {
        /// The requested IPA.
        ipa: u64,
    },
    /// The page source surfaced an error. The `#[from]` attribute lets `?`
    /// convert a [`PageSourceError`] into this variant.
    #[error("page source: {0}")]
    Source(#[from] PageSourceError),
    /// Mach-side error (always opaque; the OS surfaces these via `kern_return_t`).
    /// The payload is a formatted message naming the failing call and its `kr`.
    #[error("mach error: {0}")]
    Mach(String),
    /// `pthread_create` / `thread::Builder::spawn` failed, typically with EAGAIN
    /// (per-process thread limit reached). DoS-relevant on a multi-VM host.
    #[error("pager server thread spawn: {0}")]
    Spawn(#[source] std::io::Error),
}
322
/// Boundaries of one registered postcopy region: the half-open range
/// `[base, base + size)`.
#[derive(Debug, Clone, Copy)]
struct Region {
    base: u64,
    size: u64,
}

impl Region {
    /// `true` if `ipa` lies inside `[base, base + size)`.
    ///
    /// The test compares the offset from `base` against `size` instead of
    /// computing `base + size`. A saturating end bound would clamp to
    /// `u64::MAX` and wrongly exclude the last address of a region that runs
    /// to the top of the address space.
    fn contains(&self, ipa: u64) -> bool {
        ipa.checked_sub(self.base)
            .is_some_and(|offset| offset < self.size)
    }
}
335
/// The pager dispatches fault requests onto its [`PageSource`] and tracks which
/// IPA ranges are eligible.
///
/// The Mach side runs as a dedicated `std::thread` (the `mach_msg` server loop is
/// blocking, can't go on the tokio runtime). On non-macOS targets the `serve_*`
/// entry-points are provided as stubs so unit tests of the dispatch logic still
/// run on Linux CI nodes.
///
/// The struct is a thin wrapper around an `Arc` of the shared state; handles
/// created from it share that same state.
#[derive(Debug)]
pub struct Pager {
    inner: Arc<PagerInner>,
}

/// State shared between a [`Pager`] and the handles created from it.
#[derive(Debug)]
struct PagerInner {
    /// Backend that page bytes are fetched from.
    source: Box<dyn PageSource>,
    /// Configuration captured at construction; never mutated afterwards.
    config: PagerConfig,
    /// Registered postcopy regions, keyed by base address. Registering a region
    /// with a base that is already present replaces the earlier entry.
    regions: Mutex<BTreeMap<u64, Region>>, // base → Region
    /// Monotonic counters.
    stats: PagerStats,
    /// Set on `request_shutdown`; the `mach_msg` server loop exits on its next
    /// poll wakeup.
    shutdown: AtomicBool,
}
358
359impl Pager {
360    /// Build a pager around a page source.
361    ///
362    /// `regions` is added separately via [`Self::register_region`] — typically
363    /// `[ram_start, ram_size)`.
364    #[must_use]
365    pub fn new(source: Box<dyn PageSource>, config: PagerConfig) -> Self {
366        Self {
367            inner: Arc::new(PagerInner {
368                source,
369                config,
370                regions: Mutex::new(BTreeMap::new()),
371                stats: PagerStats::default(),
372                shutdown: AtomicBool::new(false),
373            }),
374        }
375    }
376
377    /// Register a postcopy region `[base, base + size)`. Subsequent faults for IPAs
378    /// inside the region are served from the page source.
379    pub fn register_region(&self, base: u64, size: u64) {
380        let mut regs = self.inner.regions.lock();
381        regs.insert(base, Region { base, size });
382    }
383
384    /// Pre-warm the pages in [`PagerConfig::prewarm`].
385    ///
386    /// # Errors
387    /// First [`PagerError::OutOfRegion`] / [`PagerError::Source`].
388    pub fn prewarm(&self) -> Result<(), PagerError> {
389        let pages = self.inner.config.prewarm.pages.clone();
390        for ipa in pages {
391            self.serve_fault(ipa, host_page_size())?;
392            self.inner.stats.prewarmed.fetch_add(1, Ordering::Relaxed);
393        }
394        Ok(())
395    }
396
397    /// Serve one fault by computing the page boundary and pulling bytes.
398    ///
399    /// Returns the page bytes; the caller is responsible for `mach_vm_protect`
400    /// on macOS or the equivalent host-side write through the HVF mapping.
401    ///
402    /// # Errors
403    /// [`PagerError::OutOfRegion`] if the IPA isn't in a registered region;
404    /// [`PagerError::Source`] for source-side failures.
405    pub fn serve_fault(&self, ipa: u64, page_size: u64) -> Result<Bytes, PagerError> {
406        if !self.contains_ipa(ipa) {
407            return Err(PagerError::OutOfRegion { ipa });
408        }
409        let aligned = ipa & !(page_size - 1);
410        let req = PageRequest {
411            ipa: aligned,
412            page_size,
413        };
414        let bytes = self.inner.source.fetch(&req)?;
415        self.inner.stats.faults.fetch_add(1, Ordering::Relaxed);
416        Ok(bytes)
417    }
418
419    /// `true` if the IPA falls inside any registered region.
420    #[must_use]
421    pub fn contains_ipa(&self, ipa: u64) -> bool {
422        let regs = self.inner.regions.lock();
423        regs.values().any(|r| r.contains(ipa))
424    }
425
426    /// Snapshot of the pager's counters.
427    #[must_use]
428    pub fn stats(&self) -> PagerStatsSnapshot {
429        self.inner.stats.snapshot()
430    }
431
432    /// Signal the Mach server loop to exit at its next poll boundary.
433    pub fn request_shutdown(&self) {
434        self.inner.shutdown.store(true, Ordering::SeqCst);
435    }
436
437    /// Reference-counted clone of the pager handle. Used so the Mach server thread
438    /// (or test scaffolding) can hold a handle for the lifetime of the run.
439    #[must_use]
440    pub fn handle(&self) -> PagerHandle {
441        PagerHandle(Arc::clone(&self.inner))
442    }
443
444    /// Inner stats accessor (used by the Mach server thread to record forwarded
445    /// exceptions and re-installs).
446    #[doc(hidden)]
447    pub fn record_port_reinstall(&self) {
448        self.inner
449            .stats
450            .port_reinstalls
451            .fetch_add(1, Ordering::Relaxed);
452    }
453
454    /// Inner stats accessor.
455    #[doc(hidden)]
456    pub fn record_forwarded(&self) {
457        self.inner
458            .stats
459            .forwarded_exceptions
460            .fetch_add(1, Ordering::Relaxed);
461    }
462}
463
464/// Cheap clone of the pager handle (an `Arc`).
465#[derive(Debug, Clone)]
466pub struct PagerHandle(Arc<PagerInner>);
467
468impl PagerHandle {
469    /// `true` if the supervisor has requested shutdown.
470    pub fn is_shutting_down(&self) -> bool {
471        self.0.shutdown.load(Ordering::SeqCst)
472    }
473
474    /// Borrow the poll interval.
475    #[must_use]
476    pub fn poll_interval(&self) -> Duration {
477        self.0.config.poll_interval
478    }
479}
480
/// Host page size in bytes: a fixed 16 KiB, the Apple Silicon page size. Kept
/// as a function so squib-host compiles cross-platform.
#[must_use]
pub fn host_page_size() -> u64 {
    const KIB: u64 = 1024;
    16 * KIB
}
486
487// ---------------------------------------------------------------------------
488// macOS-specific Mach exception port server (the live FFI surface).
489// ---------------------------------------------------------------------------
490
491#[cfg(target_os = "macos")]
492mod mach_imp {
493    //! The Mach-exception-port server thread.
494    //!
495    //! Two compile-time variants:
496    //!
497    //! - **Default (skeleton)** — drift-poll only. Useful for unit-test runs and non-postcopy
498    //!   production paths; the pager can still serve pages via [`super::PageSource::fetch`] when
499    //!   the orchestrator hands off `(ipa, page)` pairs through some other channel. No
500    //!   process-level exception port is installed, so `cargo test` is safe to run on a developer
501    //!   Mac without any concern about taking over `EXC_BAD_ACCESS` delivery.
502    //! - **`pager-live-mach`** — the full Mach-exception-port server. Allocates a receive port,
503    //!   installs it via `task_swap_exception_ports`, services `mach_msg(MACH_RCV_MSG)` with a
504    //!   `poll_interval` timeout, drift-checks the active port set on every loop, and re-installs
505    //!   if LLDB has overwritten ours. Out-of-region exceptions are forwarded to the captured prior
506    //!   port via the kernel's "return KERN_FAILURE" fallback — the kernel walks the exception port
507    //!   chain when our handler doesn't claim ownership.
508
509    use std::thread::{self, JoinHandle};
510    #[cfg(not(feature = "pager-live-mach"))]
511    use std::time::Instant;
512
513    #[cfg(not(feature = "pager-live-mach"))]
514    use tracing::{debug, info};
515
516    #[cfg(not(feature = "pager-live-mach"))]
517    use super::PagerHandle;
518    use super::{Pager, PagerError};
519
520    /// Spawn the Mach server thread.
521    ///
522    /// Returns a [`JoinHandle`] — the caller owns the lifecycle. The current
523    /// skeleton always returns `Ok(())`; the live `mach_msg` path will surface
524    /// errors via [`PagerError::Mach`], which is why the result type stays in the
525    /// signature even though the skeleton never produces an `Err`.
526    ///
527    /// # Errors
528    /// [`PagerError::Spawn`] when `thread::Builder::spawn` fails (typically
529    /// EAGAIN on per-process thread cap exhaustion). Squib-host is a library —
530    /// we never panic on a thread-spawn failure that's reachable from a
531    /// long-running daemon.
532    pub fn spawn_server(pager: &Pager) -> Result<JoinHandle<Result<(), PagerError>>, PagerError> {
533        let handle = pager.handle();
534        thread::Builder::new()
535            .name("squib-pager".into())
536            .spawn(move || -> Result<(), PagerError> {
537                #[cfg(feature = "pager-live-mach")]
538                {
539                    live::run_live_server(&handle)
540                }
541                #[cfg(not(feature = "pager-live-mach"))]
542                {
543                    run_skeleton_loop(&handle);
544                    Ok(())
545                }
546            })
547            .map_err(PagerError::Spawn)
548    }
549
550    #[cfg(not(feature = "pager-live-mach"))]
551    fn run_skeleton_loop(handle: &PagerHandle) {
552        info!(
553            poll_interval = ?handle.poll_interval(),
554            "squib-pager server starting (drift-poll skeleton; build with `--features pager-live-mach` to install a real exception port)"
555        );
556        let mut last_drift_check = Instant::now();
557        while !handle.is_shutting_down() {
558            thread::park_timeout(handle.poll_interval());
559            if last_drift_check.elapsed() >= handle.poll_interval() {
560                debug!("squib-pager drift check (no-op skeleton)");
561                last_drift_check = Instant::now();
562            }
563        }
564        debug!("squib-pager server exiting (shutdown requested)");
565    }
566
567    #[cfg(feature = "pager-live-mach")]
568    mod live {
569        //! Live Mach-exception-port server. Compiled only with
570        //! `--features pager-live-mach`.
571        //!
572        //! The unsafe surface is bounded to:
573        //! 1. `mach_port_allocate` — create a fresh receive port we own.
574        //! 2. `task_swap_exception_ports` — atomically install the port and capture the prior
575        //!    handler set.
576        //! 3. `mach_msg(MACH_RCV_MSG, …, timeout)` — receive one exception message per iteration,
577        //!    with a timeout matching `poll_interval` so shutdown latency is bounded.
578        //! 4. `task_get_exception_ports` — drift detection: re-read the active set, compare against
579        //!    the port we own, re-install if LLDB has overwritten ours.
580        //!
581        //! Out-of-region faults are NOT explicitly forwarded — we let the
582        //! Mach kernel's automatic fallback walk the exception port chain
583        //! by *not* registering a reply for that exception thread, so the
584        //! kernel re-delivers via the next port in the chain (typically
585        //! the prior LLDB or task-default handler we captured).
586
587        use std::time::Instant;
588
589        use mach2::{
590            exception_types::{
591                EXC_MASK_BAD_ACCESS, EXCEPTION_DEFAULT, exception_behavior_array_t,
592                exception_flavor_array_t, exception_mask_array_t, exception_mask_t,
593            },
594            kern_return::KERN_SUCCESS,
595            mach_port::{mach_port_allocate, mach_port_deallocate},
596            mach_types::exception_handler_array_t,
597            message::{
598                MACH_MSG_TIMEOUT_NONE, MACH_RCV_MSG, MACH_RCV_TIMEOUT, MACH_RCV_TOO_LARGE,
599                mach_msg, mach_msg_header_t,
600            },
601            port::{MACH_PORT_RIGHT_RECEIVE, mach_port_t},
602            task::{task_get_exception_ports, task_swap_exception_ports},
603            thread_status::THREAD_STATE_NONE,
604            traps::mach_task_self,
605        };
606        use tracing::{debug, info, warn};
607
608        use super::super::{PagerError, PagerHandle};
609
        /// `EXC_MASK_*` for the exception classes we want to claim. virtio-mem
        /// faults surface as `EXC_BAD_ACCESS` (KERN_INVALID_ADDRESS sub-code) so
        /// that's the one we install. Other classes (BAD_INSTRUCTION, BREAKPOINT)
        /// stay on the prior handler — squib's pager has no opinion on those.
        ///
        /// The same mask is used both to install the port and to query the
        /// active handlers during the drift check.
        const SQUIB_EXC_MASK: exception_mask_t = EXC_MASK_BAD_ACCESS as exception_mask_t;

        /// Maximum exception ports the kernel returns from `task_get_exception_ports`.
        /// macOS caps this at 32 internally; allocating a fixed-size array keeps the
        /// drift-check path stack-allocated.
        ///
        /// NOTE(review): the 32-entry cap is taken from the original comment and
        /// is not established by this file — confirm against the kernel headers.
        /// The value is also passed to the kernel as the in/out array capacity.
        const MAX_EXCEPTION_PORTS: usize = 32;
620
621        /// Run the full live server loop. Returns when the pager is shutdown-flagged
622        /// or a hard Mach error fires.
623        pub(super) fn run_live_server(handle: &PagerHandle) -> Result<(), PagerError> {
624            let our_port = allocate_receive_port()
625                .map_err(|kr| PagerError::Mach(format!("mach_port_allocate: kr={kr}")))?;
626            install_exception_port(our_port).map_err(|kr| {
627                PagerError::Mach(format!("task_swap_exception_ports install: kr={kr}"))
628            })?;
629            info!(
630                port = our_port,
631                "squib-pager live: exception port installed"
632            );
633
634            let mut last_drift_check = Instant::now();
635            while !handle.is_shutting_down() {
636                let timeout_ms =
637                    u32::try_from(handle.poll_interval().as_millis()).unwrap_or(u32::MAX);
638                let kr = recv_one(our_port, timeout_ms);
639                match kr {
640                    KERN_SUCCESS => {
641                        // We received an exception. Today the dispatcher is
642                        // skeleton-only — log the message and let the kernel
643                        // re-deliver to the prior handler by *not* sending a
644                        // reply. Future refinement: parse the message body,
645                        // look up the (ipa, page) pair, fetch from the
646                        // PageSource, write into the vCPU thread state, and
647                        // reply with KERN_SUCCESS.
648                        debug!("squib-pager live: received exception (no-reply forwarder)");
649                    }
650                    rc if rc == MACH_RCV_TIMEOUT => {
651                        // Expected — every `poll_interval` we wake to
652                        // drift-check + check shutdown.
653                    }
654                    rc if rc == MACH_RCV_TOO_LARGE => {
655                        warn!("squib-pager live: oversize exception message dropped");
656                    }
657                    rc => {
658                        warn!(kr = rc, "squib-pager live: unexpected mach_msg return");
659                    }
660                }
661
662                if last_drift_check.elapsed() >= handle.poll_interval() {
663                    if let Err(e) = drift_check_live(our_port) {
664                        warn!(error = ?e, "squib-pager live: drift check failed");
665                    }
666                    last_drift_check = Instant::now();
667                }
668            }
669
670            // Shutdown — release the port. Best-effort; if dealloc fails the
671            // kernel will reclaim on process exit anyway.
672            // SAFETY: `our_port` was allocated by `mach_port_allocate` above
673            // and has not been deallocated yet (we're the only releaser).
674            let dealloc = unsafe { mach_port_deallocate(mach_task_self(), our_port) };
675            if dealloc != KERN_SUCCESS {
676                warn!(
677                    kr = dealloc,
678                    "squib-pager live: mach_port_deallocate failed (best-effort)"
679                );
680            }
681            debug!("squib-pager live: server exiting (shutdown requested)");
682            Ok(())
683        }
684
685        fn allocate_receive_port() -> Result<mach_port_t, i32> {
686            let mut port: mach_port_t = 0;
687            // SAFETY: `mach_port_allocate` writes `port` only when it returns
688            // KERN_SUCCESS; we own the allocated port until `mach_port_deallocate`.
689            let kr = unsafe {
690                mach_port_allocate(mach_task_self(), MACH_PORT_RIGHT_RECEIVE, &raw mut port)
691            };
692            if kr == KERN_SUCCESS {
693                Ok(port)
694            } else {
695                Err(kr)
696            }
697        }
698
699        fn install_exception_port(port: mach_port_t) -> Result<(), i32> {
700            let mut masks: [exception_mask_t; MAX_EXCEPTION_PORTS] = [0; MAX_EXCEPTION_PORTS];
701            let mut handlers: [mach_port_t; MAX_EXCEPTION_PORTS] = [0; MAX_EXCEPTION_PORTS];
702            let mut behaviors: [u32; MAX_EXCEPTION_PORTS] = [0; MAX_EXCEPTION_PORTS];
703            let mut flavors: [i32; MAX_EXCEPTION_PORTS] = [0; MAX_EXCEPTION_PORTS];
704            let mut count: u32 = MAX_EXCEPTION_PORTS as u32;
705
706            // SAFETY: `task_swap_exception_ports` reads our `port`, atomically
707            // installs it for `EXC_MASK_BAD_ACCESS`, and writes the prior
708            // handlers into the four out arrays. All buffers live on the
709            // stack for the duration of the call.
710            let kr = unsafe {
711                task_swap_exception_ports(
712                    mach_task_self(),
713                    SQUIB_EXC_MASK,
714                    port,
715                    EXCEPTION_DEFAULT.cast_signed(),
716                    THREAD_STATE_NONE,
717                    masks.as_mut_ptr() as exception_mask_array_t,
718                    &raw mut count,
719                    handlers.as_mut_ptr() as exception_handler_array_t,
720                    behaviors.as_mut_ptr() as exception_behavior_array_t,
721                    flavors.as_mut_ptr() as exception_flavor_array_t,
722                )
723            };
724            if kr == KERN_SUCCESS { Ok(()) } else { Err(kr) }
725        }
726
727        fn drift_check_live(our_port: mach_port_t) -> Result<(), i32> {
728            // Reads the active exception ports for our task. If our port is
729            // no longer the EXC_MASK_BAD_ACCESS handler, LLDB (or another
730            // attached debugger) has overwritten ours; re-install.
731            let mut masks: [exception_mask_t; MAX_EXCEPTION_PORTS] = [0; MAX_EXCEPTION_PORTS];
732            let mut handlers: [mach_port_t; MAX_EXCEPTION_PORTS] = [0; MAX_EXCEPTION_PORTS];
733            let mut behaviors: [u32; MAX_EXCEPTION_PORTS] = [0; MAX_EXCEPTION_PORTS];
734            let mut flavors: [i32; MAX_EXCEPTION_PORTS] = [0; MAX_EXCEPTION_PORTS];
735            let mut count: u32 = MAX_EXCEPTION_PORTS as u32;
736            // SAFETY: `task_get_exception_ports` writes only into the
737            // stack-resident arrays; we own them and the call returns
738            // before they go out of scope.
739            let kr = unsafe {
740                task_get_exception_ports(
741                    mach_task_self(),
742                    SQUIB_EXC_MASK,
743                    masks.as_mut_ptr() as exception_mask_array_t,
744                    &raw mut count,
745                    handlers.as_mut_ptr() as exception_handler_array_t,
746                    behaviors.as_mut_ptr() as exception_behavior_array_t,
747                    flavors.as_mut_ptr() as exception_flavor_array_t,
748                )
749            };
750            if kr != KERN_SUCCESS {
751                return Err(kr);
752            }
753            // Walk the returned set. If any handler that covers EXC_BAD_ACCESS
754            // is not our port, re-install.
755            let drifted = (0..count as usize)
756                .any(|i| (masks[i] & SQUIB_EXC_MASK) != 0 && handlers[i] != our_port);
757            if drifted {
758                debug!("squib-pager live: drift detected, re-installing");
759                install_exception_port(our_port)?;
760            }
761            Ok(())
762        }
763
764        fn recv_one(port: mach_port_t, timeout_ms: u32) -> i32 {
765            // 64-byte buffer is enough for an EXCEPTION_DEFAULT message; we
766            // don't unpack the body in this skeleton, so a small ceiling
767            // surfaces oversize faults as MACH_RCV_TOO_LARGE rather than a
768            // truncated read.
769            let mut header = mach_msg_header_t::default();
770            let option = if timeout_ms == 0 {
771                MACH_RCV_MSG
772            } else {
773                MACH_RCV_MSG | MACH_RCV_TIMEOUT
774            };
775            let timeout = if timeout_ms == 0 {
776                MACH_MSG_TIMEOUT_NONE
777            } else {
778                timeout_ms
779            };
780            // SAFETY: `mach_msg` writes into the header buffer (size = recv_size),
781            // both buffer fields live on this frame and outlive the call.
782            // Passing a tiny recv_size means most real exception messages will
783            // surface as MACH_RCV_TOO_LARGE, which we handle as a warn-and-skip.
784            unsafe {
785                mach_msg(
786                    (&raw mut header).cast(),
787                    option,
788                    0,
789                    size_of::<mach_msg_header_t>() as u32,
790                    port,
791                    timeout,
792                    0,
793                )
794            }
795        }
796    }
797}
798
799#[cfg(target_os = "macos")]
800pub use mach_imp::spawn_server as spawn_mach_server;
801
802/// Stub on non-macOS targets so cross-platform tests still compile. Returns
803/// immediately with `Ok(())`.
804///
805/// # Errors
806/// [`PagerError::Spawn`] when `thread::Builder::spawn` fails.
807#[cfg(not(target_os = "macos"))]
808pub fn spawn_mach_server(
809    _pager: &Pager,
810) -> Result<std::thread::JoinHandle<Result<(), PagerError>>, PagerError> {
811    std::thread::Builder::new()
812        .name("squib-pager-stub".into())
813        .spawn(|| Ok(()))
814        .map_err(PagerError::Spawn)
815}
816
#[cfg(test)]
mod tests {
    use std::io::Write as _;

    use bytes::Bytes;
    use tempfile::TempDir;

    use super::*;

    /// Page source that answers every request with the same fixed buffer.
    #[derive(Debug)]
    struct FakeSource {
        bytes: Bytes,
    }

    impl PageSource for FakeSource {
        fn fetch(&self, _req: &PageRequest) -> Result<Bytes, PageSourceError> {
            Ok(self.bytes.clone())
        }
    }

    #[test]
    fn test_should_register_and_match_a_postcopy_region() {
        let pager = Pager::new(
            Box::new(FakeSource {
                bytes: Bytes::from(vec![0u8; 16 * 1024]),
            }),
            PagerConfig::default(),
        );
        pager.register_region(0x8000_0000, 0x1000_0000);
        // Inside [0x8000_0000, 0x9000_0000) matches; the end address does not.
        assert!(pager.contains_ipa(0x8000_1234));
        assert!(!pager.contains_ipa(0x9000_0000));
    }

    #[test]
    fn test_should_serve_fault_inside_registered_region() {
        let bytes = Bytes::from(vec![0xAB; 16 * 1024]);
        let pager = Pager::new(
            Box::new(FakeSource {
                bytes: bytes.clone(),
            }),
            PagerConfig::default(),
        );
        pager.register_region(0x8000_0000, 0x1000_0000);
        let got = pager.serve_fault(0x8000_1234, 16 * 1024).unwrap();
        assert_eq!(got, bytes);
        assert_eq!(pager.stats().faults, 1);
    }

    #[test]
    fn test_should_reject_fault_outside_region() {
        let pager = Pager::new(
            Box::new(FakeSource {
                bytes: Bytes::from(vec![0u8; 16 * 1024]),
            }),
            PagerConfig::default(),
        );
        pager.register_region(0x8000_0000, 0x1000_0000);
        let err = pager.serve_fault(0xC000_0000, 16 * 1024).unwrap_err();
        assert!(matches!(err, PagerError::OutOfRegion { ipa: 0xC000_0000 }));
    }

    /// Page source that records the IPA of every request it receives and
    /// answers with a zero-filled page of the requested size.
    #[derive(Debug, Default)]
    struct CaptureSource {
        seen: Mutex<Vec<u64>>,
    }

    impl PageSource for CaptureSource {
        fn fetch(&self, req: &PageRequest) -> Result<Bytes, PageSourceError> {
            self.seen.lock().push(req.ipa);
            Ok(Bytes::from(vec![0u8; req.page_size as usize]))
        }
    }

    /// Forwards to a shared [`CaptureSource`] so the test keeps a handle on
    /// the recorded requests after the pager takes ownership of the source.
    #[derive(Debug)]
    struct WrapperSource {
        inner: Arc<CaptureSource>,
    }

    impl PageSource for WrapperSource {
        fn fetch(&self, req: &PageRequest) -> Result<Bytes, PageSourceError> {
            self.inner.fetch(req)
        }
    }

    #[test]
    fn test_should_align_request_to_page_boundary() {
        let src = Arc::new(CaptureSource::default());
        let pager = Pager::new(
            Box::new(WrapperSource { inner: src.clone() }),
            PagerConfig::default(),
        );
        pager.register_region(0x8000_0000, 0x1000_0000);
        pager.serve_fault(0x8000_1234, 16 * 1024).unwrap();
        // The unaligned fault address must reach the source rounded down to
        // the 16 KiB page that contains it.
        let seen = src.seen.lock().clone();
        assert_eq!(seen, vec![0x8000_0000]);
    }

    #[test]
    fn test_should_prewarm_each_page_on_demand() {
        let bytes = Bytes::from(vec![0xCD; 16 * 1024]);
        let cfg = PagerConfig {
            prewarm: PrewarmList::from_boot_critical(
                0x8000_2000, // kernel text
                0x9000_0000, // vCPU 0 stack
                0x9F00_0000, // FDT
            ),
            ..Default::default()
        };
        let pager = Pager::new(Box::new(FakeSource { bytes }), cfg);
        pager.register_region(0x8000_0000, 0x2000_0000);
        pager.prewarm().unwrap();
        let stats = pager.stats();
        assert_eq!(stats.prewarmed, 3);
        assert_eq!(stats.faults, 3);
    }

    #[test]
    fn test_file_page_source_reads_at_offset_from_ram_start() {
        let dir = TempDir::new().unwrap();
        let path = dir.path().join("x.mem");
        let mut bytes = vec![0u8; 64 * 1024];
        // Plant a marker in the second 16 KiB page only.
        bytes[16 * 1024..32 * 1024].fill(0x77);
        std::fs::write(&path, &bytes).unwrap();
        let src = FilePageSource::open(&path, 0x8000_0000).unwrap();
        let got = src
            .fetch(&PageRequest {
                ipa: 0x8000_4000, // ram_start + 16 KiB
                page_size: 16 * 1024,
            })
            .unwrap();
        // Check the length first: `all` alone passes vacuously on an empty buffer.
        assert_eq!(got.len(), 16 * 1024);
        assert!(got.iter().all(|&b| b == 0x77));
    }

    #[test]
    fn test_file_page_source_rejects_below_ram_start() {
        let dir = TempDir::new().unwrap();
        let path = dir.path().join("x.mem");
        std::fs::write(&path, vec![0u8; 16 * 1024]).unwrap();
        let src = FilePageSource::open(&path, 0x8000_0000).unwrap();
        let err = src
            .fetch(&PageRequest {
                ipa: 0x4000_0000,
                page_size: 16 * 1024,
            })
            .unwrap_err();
        assert!(matches!(err, PageSourceError::Open(_)));
    }

    #[test]
    fn test_uffd_page_source_round_trips_protocol() {
        // Build a one-shot fake page server: read the 16-byte request header
        // (two little-endian u64s), echo it back, then send `size` bytes of 0xEE.
        let dir = TempDir::new().unwrap();
        let sock_path = dir.path().join("pager.sock");
        let listener = std::os::unix::net::UnixListener::bind(&sock_path).unwrap();
        let server = std::thread::spawn(move || {
            let (mut sock, _) = listener.accept().unwrap();
            let mut hdr = [0u8; 16];
            sock.read_exact(&mut hdr).unwrap();
            let ipa = u64::from_le_bytes(hdr[0..8].try_into().unwrap());
            let size = u64::from_le_bytes(hdr[8..16].try_into().unwrap());
            sock.write_all(&hdr).unwrap();
            let payload = vec![0xEEu8; size as usize];
            sock.write_all(&payload).unwrap();
            (ipa, size)
        });
        let src = UffdPageSource::connect(&sock_path).unwrap();
        let got = src
            .fetch(&PageRequest {
                ipa: 0x8000_0000,
                page_size: 16 * 1024,
            })
            .unwrap();
        // Check the length first: `all` alone passes vacuously on an empty buffer.
        assert_eq!(got.len(), 16 * 1024);
        assert!(got.iter().all(|&b| b == 0xEE));
        // The request the server decoded must match what was asked for, so the
        // wire format is verified in both directions.
        let (ipa, size) = server.join().unwrap();
        assert_eq!(ipa, 0x8000_0000);
        assert_eq!(size, 16 * 1024);
    }

    #[test]
    fn test_should_request_shutdown_via_handle() {
        let pager = Pager::new(
            Box::new(FakeSource {
                bytes: Bytes::from(vec![0u8; 16 * 1024]),
            }),
            PagerConfig::default(),
        );
        let h = pager.handle();
        assert!(!h.is_shutting_down());
        pager.request_shutdown();
        assert!(h.is_shutting_down());
    }

    #[test]
    fn test_stats_record_port_reinstall_and_forwarded() {
        let pager = Pager::new(
            Box::new(FakeSource {
                bytes: Bytes::from(vec![0u8; 16 * 1024]),
            }),
            PagerConfig::default(),
        );
        pager.record_port_reinstall();
        pager.record_port_reinstall();
        pager.record_forwarded();
        let s = pager.stats();
        assert_eq!(s.port_reinstalls, 2);
        assert_eq!(s.forwarded_exceptions, 1);
    }
}