arcbox-vmm 0.4.10

Virtual Machine Monitor for ArcBox
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
//! Per-vCPU run loop.
//!
//! Each vCPU thread calls [`vcpu_run_loop`], which creates an `HvVcpu` on
//! the calling thread (required because `HvVcpu` is `!Send`), programs the
//! ARM64 boot register state, and enters an exit-dispatch loop. Exits are
//! dispatched to the PL011 UART, the `DeviceManager` for VirtIO MMIO, or
//! to the HVC/PSCI handlers.

use std::sync::Arc;
use std::sync::atomic::{AtomicBool, Ordering};

use arcbox_hv::{ExceptionClass, HvVcpu, VcpuExit};

use super::hvc_blk::{
    ARCBOX_HVC_BLK_FLUSH, ARCBOX_HVC_BLK_READ, ARCBOX_HVC_BLK_WRITE, ARCBOX_HVC_PROBE,
    handle_hvc_blk_flush, handle_hvc_blk_io,
};
use super::psci::{CpuOnSenders, handle_psci};
use super::{HvVcpuIds, Pl011, VcpuThreadHandles};

/// Short aliases for the `arcbox_hv::reg` register IDs this module programs
/// at boot and reads/writes while servicing hypercalls.
pub(super) mod reg {
    pub use arcbox_hv::reg::HV_REG_CPSR as CPSR;
    pub use arcbox_hv::reg::HV_REG_PC as PC;
    pub use arcbox_hv::reg::HV_REG_X0 as X0;
    pub use arcbox_hv::reg::HV_REG_X1 as X1;
    pub use arcbox_hv::reg::HV_REG_X2 as X2;
    pub use arcbox_hv::reg::HV_REG_X3 as X3;
}

/// Boot-time CPSR: the D, A, I and F mask bits (bits 9..=6) are all set so
/// every interrupt is masked, and the mode field M[3:0] is `0b0101` (EL1h).
const CPSR_EL1H: u64 = (0b1111 << 6) | 0b0101;

/// SCTLR_EL1 value programmed before the first guest instruction: only the
/// ARMv8 RES1 bits are set, so the MMU and caches start disabled as the
/// ARM64 boot protocol expects.
const SCTLR_EL1_RESET: u64 = {
    // Bit positions that are RES1 in SCTLR_EL1.
    const RES1_BITS: [u32; 6] = [11, 20, 22, 23, 28, 29];

    let mut value = 0u64;
    let mut idx = 0;
    while idx < RES1_BITS.len() {
        value |= 1u64 << RES1_BITS[idx];
        idx += 1;
    }
    value
};

/// Shared state handed to each vCPU thread.
///
/// Bundles every resource the run loop needs so that `vcpu_run_loop` takes
/// a single context argument instead of one parameter per resource. The
/// loop destructures this struct on entry and owns the fields from then on.
pub(super) struct VcpuContext {
    /// Shared device manager; receives MMIO reads/writes that do not hit
    /// the PL011 and is polled for bridge RX data.
    pub device_manager: Arc<crate::device::DeviceManager>,
    /// Shared run flag; the loop exits once it observes `false`. The loop
    /// itself clears it when `vcpu.run()` fails.
    pub running: Arc<AtomicBool>,
    /// Cooperative pause flag. When `true`, the vCPU parks itself after its
    /// next `vcpu.run()` return instead of re-entering guest execution.
    /// Cleared by `resume`, which also unparks the thread.
    pub paused: Arc<AtomicBool>,
    /// Shared PL011 UART emulator for early console output. Checked before
    /// the device manager on every data abort.
    pub pl011: Arc<std::sync::Mutex<Pl011>>,
    /// Channel senders for waking secondary vCPUs via PSCI CPU_ON.
    /// `None` when the VM has only one vCPU.
    pub cpu_on_senders: Option<CpuOnSenders>,
    /// Registry of vCPU thread handles used by the IRQ callback to
    /// unpark WFI-blocked threads. The loop pushes its own thread handle
    /// once boot register setup has succeeded.
    pub vcpu_thread_handles: VcpuThreadHandles,
    /// Registry of Hypervisor.framework vCPU IDs. Populated by this loop
    /// after `HvVcpu::new()`; read by `pause`/`stop` when calling
    /// `hv_vcpus_exit` (which on arm64 requires a concrete list, not NULL).
    pub hv_vcpu_ids: HvVcpuIds,
    /// Per-block-device `(file descriptor, sector size)` pairs for the HVC
    /// fast path; the entry count is what `ARCBOX_HVC_PROBE` reports.
    pub hvc_blk_fds: Arc<Vec<(i32, u32)>>,
}

/// Fetches the value a trapped store wants to write, given the source
/// register number from the instruction encoding.
///
/// Register 31 in an ARM64 load/store encoding is XZR (always zero), not
/// SP, whereas `HvVcpu::get_reg(31)` returns SP — so that case is answered
/// with `0` without touching the vCPU.
///
/// Returns `None` (after logging) when the register read fails.
fn read_mmio_write_reg(vcpu: &HvVcpu, vcpu_id: u32, register: u8) -> Option<u64> {
    const XZR: u8 = 31;

    match register {
        XZR => Some(0),
        gpr => vcpu
            .get_reg(u32::from(gpr))
            .map_err(|e| {
                tracing::error!("vCPU {vcpu_id}: get_reg(X{register}) failed: {e}");
            })
            .ok(),
    }
}

/// Runs a single vCPU in a loop, dispatching MMIO traps to the device manager.
///
/// This function is intended to be called from a dedicated thread per vCPU.
/// `HvVcpu` is `!Send`, so it must be created inside this function on the
/// thread that will run it.
///
/// # Arguments
///
/// * `vcpu_id` — Logical vCPU index (0-based, for logging).
/// * `entry_addr` — Guest IPA where execution begins. For the BSP this is
///   the kernel entry point; for a secondary vCPU it is the address passed
///   in PSCI CPU_ON.
/// * `x0_value` — Initial value of X0. For the BSP this is the FDT address;
///   for a secondary vCPU it is the context_id from PSCI CPU_ON.
/// * `ctx` — Shared resources for the vCPU (device manager, IRQ state, etc.).
pub(super) fn vcpu_run_loop(vcpu_id: u32, entry_addr: u64, x0_value: u64, ctx: VcpuContext) {
    let VcpuContext {
        device_manager,
        running,
        paused,
        pl011,
        cpu_on_senders,
        vcpu_thread_handles,
        hv_vcpu_ids,
        hvc_blk_fds,
    } = ctx;

    let vcpu = match HvVcpu::new() {
        Ok(v) => v,
        Err(e) => {
            tracing::error!("vCPU {vcpu_id}: creation failed: {e}");
            return;
        }
    };

    // Set initial register state for ARM64 Linux boot protocol:
    //   PC   = entry address (kernel entry for BSP, PSCI entry for secondary)
    //   X0   = parameter (FDT address for BSP, context_id for secondary)
    //   X1-X3 = 0 (reserved per ARM64 boot protocol)
    //   CPSR = EL1h, DAIF masked
    if let Err(e) = vcpu.set_reg(reg::PC, entry_addr) {
        tracing::error!("vCPU {vcpu_id}: set PC failed: {e}");
        return;
    }
    if let Err(e) = vcpu.set_reg(reg::X0, x0_value) {
        tracing::error!("vCPU {vcpu_id}: set X0 failed: {e}");
        return;
    }
    let _ = vcpu.set_reg(reg::X1, 0);
    let _ = vcpu.set_reg(reg::X2, 0);
    let _ = vcpu.set_reg(reg::X3, 0);
    if let Err(e) = vcpu.set_reg(reg::CPSR, CPSR_EL1H) {
        tracing::error!("vCPU {vcpu_id}: set CPSR failed: {e}");
        return;
    }

    // ARM64 boot protocol: MMU must be off, caches can be on or off.
    if let Err(e) = vcpu.set_sys_reg(arcbox_hv::sys_reg::HV_SYS_REG_SCTLR_EL1, SCTLR_EL1_RESET) {
        tracing::warn!("vCPU {vcpu_id}: set SCTLR_EL1 failed: {e}");
    }

    // Register this vCPU's framework ID and this thread's handle after all
    // register-setup calls succeed. If any setup call fails above, the
    // early return drops `HvVcpu` (triggering `hv_vcpu_destroy`) and the
    // thread exits without leaving either a stale ID or a dead `Thread`
    // in the shared registries. A stale ID would make `hv_vcpus_exit`
    // pass a dangling handle to Apple's framework (UB per its contract,
    // ABX-367); a dead thread handle would grow the registry unboundedly
    // across failed boots.
    {
        let mut ids = hv_vcpu_ids
            .lock()
            .unwrap_or_else(std::sync::PoisonError::into_inner);
        ids.push(vcpu.raw_handle());
    }
    {
        let mut handles = vcpu_thread_handles
            .lock()
            .unwrap_or_else(std::sync::PoisonError::into_inner);
        handles.push(std::thread::current());
    }

    // Set MPIDR_EL1 for this vCPU (used by GIC affinity routing).
    // Simple layout: Aff0 = vcpu_id, all other affinity fields 0.
    let mpidr = u64::from(vcpu_id) & 0xFF;
    if let Err(e) = vcpu.set_sys_reg(arcbox_hv::sys_reg::HV_SYS_REG_MPIDR_EL1, mpidr) {
        tracing::warn!("vCPU {vcpu_id}: set MPIDR failed (may not be writable): {e}");
    }

    tracing::info!(
        "vCPU {vcpu_id}: starting at PC={:#x}, X0={:#x}, SCTLR={:#x}",
        entry_addr,
        x0_value,
        SCTLR_EL1_RESET,
    );

    loop {
        if !running.load(Ordering::Relaxed) {
            tracing::info!("vCPU {vcpu_id}: shutdown requested");
            break;
        }

        // Cooperative pause: if the host has requested a pause, park here
        // until resume clears the flag. Check after each `vcpu.run()` return
        // so the vCPU never re-enters guest execution while paused. The
        // host calls `hv_vcpus_exit` to kick us out of `vcpu.run()`, we
        // observe `paused`, and park. `resume` unparks every registered
        // vCPU thread via `vcpu_thread_handles`.
        while paused.load(Ordering::Acquire) && running.load(Ordering::Relaxed) {
            std::thread::park();
        }
        if !running.load(Ordering::Relaxed) {
            tracing::info!("vCPU {vcpu_id}: shutdown observed after pause");
            break;
        }

        // BSP (vCPU 0) handles bridge polling to avoid lock contention.
        // poll_net_rx removed — handled by net-io worker thread.
        // poll_vsock_rx removed — handled by vsock-io worker thread.
        if vcpu_id == 0 && device_manager.poll_bridge_rx() {
            if let Some(bid) = device_manager.bridge_device_id() {
                device_manager.raise_interrupt_for_device(bid, 1);
            }
        }

        // ABX-367 instrumentation: retained at trace level for future
        // diagnosis of vCPUs stuck inside or between `vcpu.run()` calls
        // during shutdown. Gated on `running=false` so normal operation
        // is unaffected. Not live debug logging — use RUST_LOG=trace to see.
        if !running.load(Ordering::Relaxed) {
            tracing::trace!("vCPU {vcpu_id}: iteration during shutdown (before vcpu.run)");
        }
        let exit = match vcpu.run() {
            Ok(e) => e,
            Err(e) => {
                tracing::error!("vCPU {vcpu_id}: run failed: {e}");
                running.store(false, Ordering::SeqCst);
                break;
            }
        };
        if !running.load(Ordering::Relaxed) {
            tracing::trace!(
                "vCPU {vcpu_id}: vcpu.run returned during shutdown, exit={:?}",
                core::mem::discriminant(&exit)
            );
        }

        match exit {
            VcpuExit::Exception {
                class: ExceptionClass::DataAbort(ref mmio),
                ..
            } => {
                // Check PL011 UART region first, then fall through to DeviceManager.
                let handled_by_pl011 = {
                    let uart_match = {
                        let guard = pl011.lock().unwrap();
                        guard.contains(mmio.address)
                    };
                    if uart_match {
                        if mmio.is_write {
                            let value =
                                read_mmio_write_reg(&vcpu, vcpu_id, mmio.register).unwrap_or(0);
                            pl011.lock().unwrap().write(
                                mmio.address,
                                mmio.access_size as usize,
                                value,
                            );
                        } else {
                            let value = pl011
                                .lock()
                                .unwrap()
                                .read(mmio.address, mmio.access_size as usize);
                            if let Err(e) = vcpu.set_reg(u32::from(mmio.register), value) {
                                tracing::error!(
                                    "vCPU {vcpu_id}: set_reg(X{}) failed: {e}",
                                    mmio.register
                                );
                            }
                        }
                        true
                    } else {
                        false
                    }
                };

                if !handled_by_pl011 {
                    // Dispatch to DeviceManager for VirtIO MMIO devices.
                    if mmio.is_write {
                        let Some(value) = read_mmio_write_reg(&vcpu, vcpu_id, mmio.register) else {
                            let pc = vcpu.get_reg(reg::PC).unwrap_or(0);
                            let _ = vcpu.set_reg(reg::PC, pc + 4);
                            continue;
                        };
                        tracing::trace!(
                            "MMIO write: addr={:#x} offset={:#x} X{}={:#x} size={}",
                            mmio.address,
                            mmio.address.saturating_sub(
                                mmio.address & !0xFFF // base = addr & ~0xFFF
                            ),
                            mmio.register,
                            value,
                            mmio.access_size,
                        );
                        if let Err(e) = device_manager.handle_mmio_write(
                            mmio.address,
                            mmio.access_size as usize,
                            value,
                        ) {
                            tracing::warn!(
                                "vCPU {vcpu_id}: MMIO write {:#x} failed: {e}",
                                mmio.address
                            );
                        }
                    } else {
                        let value = match device_manager
                            .handle_mmio_read(mmio.address, mmio.access_size as usize)
                        {
                            Ok(v) => v,
                            Err(e) => {
                                tracing::warn!(
                                    "vCPU {vcpu_id}: MMIO read {:#x} failed: {e}",
                                    mmio.address
                                );
                                0 // Return 0 for unknown reads.
                            }
                        };
                        if let Err(e) = vcpu.set_reg(u32::from(mmio.register), value) {
                            tracing::error!(
                                "vCPU {vcpu_id}: set_reg(X{}) failed: {e}",
                                mmio.register
                            );
                        }
                    }
                }

                // Advance PC past the trapped instruction (ARM64 = fixed 4 bytes).
                // Hypervisor.framework does NOT auto-advance PC on data aborts.
                let pc = vcpu.get_reg(reg::PC).unwrap_or(0);
                let _ = vcpu.set_reg(reg::PC, pc + 4);
            }

            VcpuExit::Exception {
                class: ExceptionClass::WaitForInterrupt,
                ..
            } => {
                // Guest executed WFI — it is idle and waiting for an interrupt.
                // Before parking, poll the bridge for incoming data. vsock and
                // net injection are handled by their dedicated worker threads.
                let wfi_has_bridge = device_manager.poll_bridge_rx();
                if wfi_has_bridge {
                    if let Some(bid) = device_manager.bridge_device_id() {
                        device_manager.raise_interrupt_for_device(bid, 1);
                    }
                    continue; // Re-enter run loop immediately.
                }
                // No pending data — park with timeout.
                std::thread::park_timeout(std::time::Duration::from_millis(1));
                // Re-check `running` immediately after the park returns so
                // that `stop_darwin_hv` does not have to race a fresh
                // `vcpu.run()` re-entry. Without this, a vCPU parked in WFI
                // could consume an `exit_all_vcpus` cancel intended for a
                // different vCPU and then slip back into `vcpu.run()`
                // unguarded. See ABX-367.
                if !running.load(Ordering::Relaxed) {
                    break;
                }
            }

            VcpuExit::Exception {
                class: ExceptionClass::HypercallHvc(_imm),
                ..
            } => {
                let func_id = match vcpu.get_reg(reg::X0) {
                    Ok(v) => v,
                    Err(_) => continue,
                };

                match func_id {
                    ARCBOX_HVC_PROBE => {
                        // Return number of block devices available for fast path.
                        // NOTE: Hypervisor.framework auto-advances PC on HVC exit.
                        // Do NOT manually advance PC — that would skip an instruction.
                        let _ = vcpu.set_reg(reg::X0, hvc_blk_fds.len() as u64);
                    }
                    ARCBOX_HVC_BLK_READ => {
                        let result = handle_hvc_blk_io(&vcpu, &hvc_blk_fds, &device_manager, false);
                        let _ = vcpu.set_reg(reg::X0, result);
                    }
                    ARCBOX_HVC_BLK_WRITE => {
                        let result = handle_hvc_blk_io(&vcpu, &hvc_blk_fds, &device_manager, true);
                        let _ = vcpu.set_reg(reg::X0, result);
                    }
                    ARCBOX_HVC_BLK_FLUSH => {
                        let result = handle_hvc_blk_flush(&vcpu, &hvc_blk_fds);
                        let _ = vcpu.set_reg(reg::X0, result);
                    }
                    _ => {
                        // PSCI and other standard calls.
                        handle_psci(vcpu_id, func_id, &vcpu, &running, cpu_on_senders.as_ref());
                        if !running.load(Ordering::Relaxed) {
                            break;
                        }
                    }
                }
            }

            VcpuExit::Exception {
                class: ExceptionClass::SmcCall(_),
                ..
            } => {
                // Some guests route PSCI through SMC instead of HVC.
                let func_id = match vcpu.get_reg(reg::X0) {
                    Ok(v) => v,
                    Err(_) => continue,
                };
                handle_psci(vcpu_id, func_id, &vcpu, &running, cpu_on_senders.as_ref());
                if !running.load(Ordering::Relaxed) {
                    break;
                }
            }

            VcpuExit::VtimerActivated => {
                // Virtual timer fired. Unmask it so the guest sees the interrupt.
                let _ = vcpu.set_vtimer_mask(false);
            }

            VcpuExit::Canceled => {
                if running.load(Ordering::Relaxed) {
                    // Woken by net-io thread for interrupt delivery.
                    continue;
                }
                tracing::info!("vCPU {vcpu_id}: canceled (shutdown)");
                break;
            }

            VcpuExit::Exception {
                class:
                    ExceptionClass::SystemRegister {
                        op0,
                        op1,
                        crn,
                        crm,
                        op2,
                        is_write,
                        rt,
                    },
                ..
            } => {
                // Apple's framework forwards unknown sysreg accesses as
                // EC=0x18 without auto-advancing ELR_EL2. If we re-enter
                // guest execution with PC unchanged, the same MSR/MRS
                // traps again — infinite loop (observed: Linux early boot
                // writes OSDLR_EL1 = S2_0_C1_C3_4 and wedges).
                //
                // Treat every unhandled sysreg as read-as-zero /
                // write-ignored: Linux boot touches debug regs
                // (OSDLR_EL1, MDSCR_EL1, DBGBCR*, etc.) that are safe
                // to silently drop, and reads of unknown regs yield 0.
                if !is_write && rt != 31 {
                    // MRS into Xrt. HV_REG_X0..X30 are 0..30 numerically,
                    // so rt maps directly to the register ID. rt==31 is
                    // XZR — the discard register; nothing to write.
                    let _ = vcpu.set_reg(u32::from(rt), 0);
                }
                // Advance PC past the trapping instruction (A64 = 4 bytes).
                if let Ok(pc) = vcpu.get_reg(reg::PC) {
                    let _ = vcpu.set_reg(reg::PC, pc.wrapping_add(4));
                }
                tracing::trace!(
                    vcpu_id,
                    is_write,
                    encoding = %format_args!("S{op0}_{op1}_C{crn}_C{crm}_{op2}"),
                    rt,
                    "sysreg access treated as RAZ/WI; PC advanced"
                );
            }

            VcpuExit::Exception {
                class: ref other, ..
            } => {
                tracing::warn!("vCPU {vcpu_id}: unhandled exception: {other:?}");
            }

            VcpuExit::Unknown(reason) => {
                tracing::warn!("vCPU {vcpu_id}: unknown exit reason {reason}");
            }
        }
    }

    // Flush any remaining UART output.
    pl011.lock().unwrap().flush();

    tracing::info!("vCPU {vcpu_id}: exited");
}