hyperlight_host/hypervisor/mod.rs
1/*
2Copyright 2025 The Hyperlight Authors.
3
4Licensed under the Apache License, Version 2.0 (the "License");
5you may not use this file except in compliance with the License.
6You may obtain a copy of the License at
7
8 http://www.apache.org/licenses/LICENSE-2.0
9
10Unless required by applicable law or agreed to in writing, software
11distributed under the License is distributed on an "AS IS" BASIS,
12WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13See the License for the specific language governing permissions and
14limitations under the License.
15*/
16
17/// GDB debugging support
18#[cfg(gdb)]
19pub(crate) mod gdb;
20
21/// Abstracts over different hypervisor register representations
22pub(crate) mod regs;
23
24pub(crate) mod virtual_machine;
25
26#[cfg(target_os = "windows")]
27/// Hyperlight Surrogate Process
28pub(crate) mod surrogate_process;
#[cfg(target_os = "windows")]
/// Hyperlight Surrogate Process Manager
pub(crate) mod surrogate_process_manager;
32/// Safe wrappers around windows types like `PSTR`
33#[cfg(target_os = "windows")]
34pub mod wrappers;
35
36#[cfg(crashdump)]
37pub(crate) mod crashdump;
38
39pub(crate) mod hyperlight_vm;
40
41use std::fmt::Debug;
42#[cfg(any(kvm, mshv3))]
43use std::sync::atomic::{AtomicBool, AtomicU8, AtomicU64, Ordering};
44#[cfg(target_os = "windows")]
45use std::sync::atomic::{AtomicU8, Ordering};
46#[cfg(any(kvm, mshv3))]
47use std::time::Duration;
48
49/// A trait for platform-specific interrupt handle implementation details
/// Crate-internal extension of [`InterruptHandle`]: the state transitions the
/// vcpu run loop performs (publishing the running state and thread id,
/// consuming cancel and debug-interrupt requests).
pub(crate) trait InterruptHandleImpl: InterruptHandle {
    /// Set the thread ID for the vcpu thread
    ///
    /// Should be called before `set_running()`: the Linux implementation
    /// relies on the tid store being published before the running bit is set
    /// (see `LinuxInterruptHandle::set_running`).
    #[cfg(any(kvm, mshv3))]
    fn set_tid(&self);

    /// Set the running state
    fn set_running(&self);

    /// Clear the running state
    fn clear_running(&self);

    /// Mark the handle as dropped
    fn set_dropped(&self);

    /// Check if cancellation was requested
    fn is_cancelled(&self) -> bool;

    /// Clear the cancellation request flag
    ///
    /// The cancel flag persists across vcpu exits/re-entries within a single
    /// run, so this is used to reset it for a fresh run.
    fn clear_cancel(&self);

    /// Check if debug interrupt was requested (always returns false when gdb feature is disabled)
    fn is_debug_interrupted(&self) -> bool;

    /// Clear the debug interrupt request flag
    #[cfg(gdb)]
    fn clear_debug_interrupt(&self);
}
77
78/// A trait for handling interrupts to a sandbox's vcpu
/// A trait for handling interrupts to a sandbox's vcpu
pub trait InterruptHandle: Send + Sync + Debug {
    /// Interrupt the corresponding sandbox from running.
    ///
    /// - If this is called while the sandbox is currently executing a guest function call, it will interrupt the sandbox and return `true`.
    /// - If this is called while the sandbox is not running (for example before or after calling a guest function), it will do nothing and return `false`.
    ///
    /// # Note
    /// This function will block for the duration of the time it takes for the vcpu thread to be interrupted.
    fn kill(&self) -> bool;

    /// Used by a debugger to interrupt the corresponding sandbox from running.
    ///
    /// - If this is called while the vcpu is running, then it will interrupt the vcpu and return `true`.
    /// - If this is called while the vcpu is not running, (for example during a host call), the
    ///   vcpu will not immediately be interrupted, but will prevent the vcpu from running **the next time**
    ///   it's scheduled, and returns `false`.
    ///
    /// # Note
    /// This function will block for the duration of the time it takes for the vcpu thread to be interrupted.
    #[cfg(gdb)]
    fn kill_from_debugger(&self) -> bool;

    /// Returns true if the corresponding sandbox has been dropped
    fn dropped(&self) -> bool;
}
104
/// Linux implementation of [`InterruptHandle`]: interrupts a running vcpu by
/// sending a POSIX real-time signal (`SIGRTMIN + sig_rt_min_offset`) to the
/// thread the vcpu is running on.
#[cfg(any(kvm, mshv3))]
#[derive(Debug)]
pub(super) struct LinuxInterruptHandle {
    /// Atomic value packing vcpu execution state.
    ///
    /// Bit layout:
    /// - Bit 2: DEBUG_INTERRUPT_BIT - set when debugger interrupt is requested
    /// - Bit 1: RUNNING_BIT - set when vcpu is actively running
    /// - Bit 0: CANCEL_BIT - set when cancellation has been requested
    ///
    /// CANCEL_BIT persists across vcpu exits/re-entries within a single `VirtualCPU::run()` call
    /// (e.g., during host function calls), but is cleared at the start of each new `VirtualCPU::run()` call.
    state: AtomicU8,

    /// Thread ID where the vcpu is running.
    ///
    /// Note: Multiple VMs may have the same `tid` (same thread runs multiple sandboxes sequentially),
    /// but at most one VM will have RUNNING_BIT set at any given time.
    tid: AtomicU64,

    /// Whether the corresponding VM has been dropped.
    dropped: AtomicBool,

    /// Delay between retry attempts when sending signals to interrupt the vcpu.
    retry_delay: Duration,

    /// Offset from SIGRTMIN for the signal used to interrupt the vcpu thread.
    sig_rt_min_offset: u8,
}
134
#[cfg(any(kvm, mshv3))]
impl LinuxInterruptHandle {
    const RUNNING_BIT: u8 = 1 << 1;
    const CANCEL_BIT: u8 = 1 << 0;
    #[cfg(gdb)]
    const DEBUG_INTERRUPT_BIT: u8 = 1 << 2;

    /// Atomically snapshot the (running, cancel, debug-interrupt) flags.
    ///
    /// # Memory Ordering
    /// `Acquire` pairs with the `Release` stores in `set_running()` and
    /// `kill()`, so observing running=true also guarantees a coherent `tid`.
    fn get_running_cancel_debug(&self) -> (bool, bool, bool) {
        let snapshot = self.state.load(Ordering::Acquire);
        #[cfg(gdb)]
        let debug_requested = snapshot & Self::DEBUG_INTERRUPT_BIT != 0;
        #[cfg(not(gdb))]
        let debug_requested = false;
        (
            snapshot & Self::RUNNING_BIT != 0,
            snapshot & Self::CANCEL_BIT != 0,
            debug_requested,
        )
    }

    /// Repeatedly signal the vcpu thread while it is running with a pending
    /// cancel or debug-interrupt request. Returns whether any signal was sent.
    fn send_signal(&self) -> bool {
        let signal_number = libc::SIGRTMIN() + self.sig_rt_min_offset as libc::c_int;
        let mut sent_signal = false;

        loop {
            let (running, cancel, debug) = self.get_running_cancel_debug();

            // Stop once the vcpu is no longer running, or once neither a
            // cancel nor a debug interrupt is pending.
            if !(running && (cancel || debug)) {
                break;
            }

            tracing::info!("Sending signal to kill vcpu thread...");
            sent_signal = true;
            // Acquire pairs with the Release store in set_tid(), so the tid
            // read here belongs to the vcpu thread we just observed running.
            unsafe {
                libc::pthread_kill(self.tid.load(Ordering::Acquire) as _, signal_number);
            }
            std::thread::sleep(self.retry_delay);
        }

        sent_signal
    }
}
186
#[cfg(any(kvm, mshv3))]
impl InterruptHandleImpl for LinuxInterruptHandle {
    fn set_tid(&self) {
        // Store with Release so that a thread which later observes
        // RUNNING_BIT=true (via Acquire in send_signal()) is guaranteed to
        // read this tid rather than a stale one.
        let current = unsafe { libc::pthread_self() as u64 };
        self.tid.store(current, Ordering::Release);
    }

    fn set_running(&self) {
        // Release publishes the preceding tid store (itself a Release) to any
        // thread that observes running=true with Acquire ordering, preventing
        // the interrupting thread from reading a stale tid.
        self.state.fetch_or(Self::RUNNING_BIT, Ordering::Release);
    }

    fn clear_running(&self) {
        // Release makes all vcpu-thread work visible before running is cleared.
        self.state.fetch_and(!Self::RUNNING_BIT, Ordering::Release);
    }

    fn is_cancelled(&self) -> bool {
        // Acquire pairs with the Release in kill(), so a cancel requested by
        // the interrupting thread is visible here.
        let state = self.state.load(Ordering::Acquire);
        (state & Self::CANCEL_BIT) == Self::CANCEL_BIT
    }

    fn clear_cancel(&self) {
        // Release so that work from the previous run() is published to other
        // threads; the VM may migrate between threads across guest calls even
        // though this is typically called by the vcpu thread at the start of
        // run().
        self.state.fetch_and(!Self::CANCEL_BIT, Ordering::Release);
    }

    fn is_debug_interrupted(&self) -> bool {
        #[cfg(gdb)]
        {
            let state = self.state.load(Ordering::Acquire);
            (state & Self::DEBUG_INTERRUPT_BIT) == Self::DEBUG_INTERRUPT_BIT
        }
        #[cfg(not(gdb))]
        {
            false
        }
    }

    #[cfg(gdb)]
    fn clear_debug_interrupt(&self) {
        self.state
            .fetch_and(!Self::DEBUG_INTERRUPT_BIT, Ordering::Release);
    }

    fn set_dropped(&self) {
        // Release so that all VM cleanup preceding the drop is visible to any
        // thread that reads dropped() with Acquire.
        self.dropped.store(true, Ordering::Release);
    }
}
245
#[cfg(any(kvm, mshv3))]
impl InterruptHandle for LinuxInterruptHandle {
    fn kill(&self) -> bool {
        // Publish the cancel request. Release pairs with the Acquire load in
        // is_cancelled() on the vcpu thread, so writes made before kill()
        // are visible there.
        self.state.fetch_or(Self::CANCEL_BIT, Ordering::Release);

        // Keep signalling the vcpu thread for as long as it is running with
        // the request pending; reports whether any signal was actually sent.
        self.send_signal()
    }

    #[cfg(gdb)]
    fn kill_from_debugger(&self) -> bool {
        // Same scheme as kill(), but for the debugger's interrupt flag.
        self.state
            .fetch_or(Self::DEBUG_INTERRUPT_BIT, Ordering::Release);
        self.send_signal()
    }

    fn dropped(&self) -> bool {
        // Acquire pairs with the Release in set_dropped(), making all
        // pre-drop VM cleanup visible to the caller.
        self.dropped.load(Ordering::Acquire)
    }
}
269
/// Windows implementation of [`InterruptHandle`]: interrupts a running vcpu
/// via `WHvCancelRunVirtualProcessor`.
#[cfg(target_os = "windows")]
#[derive(Debug)]
pub(super) struct WindowsInterruptHandle {
    /// Atomic value packing vcpu execution state.
    ///
    /// Bit layout:
    /// - Bit 2: DEBUG_INTERRUPT_BIT - set when debugger interrupt is requested
    /// - Bit 1: RUNNING_BIT - set when vcpu is actively running
    /// - Bit 0: CANCEL_BIT - set when cancellation has been requested
    ///
    /// `WHvCancelRunVirtualProcessor()` will return Ok even if the vcpu is not running,
    /// which is why we need the RUNNING_BIT.
    ///
    /// CANCEL_BIT persists across vcpu exits/re-entries within a single `VirtualCPU::run()` call
    /// (e.g., during host function calls), but is cleared at the start of each new `VirtualCPU::run()` call.
    state: AtomicU8,

    /// RwLock protecting the partition handle and dropped state.
    ///
    /// This lock prevents a race condition between `kill()` calling `WHvCancelRunVirtualProcessor`
    /// and `WhpVm::drop()` calling `WHvDeletePartition`. These two Windows Hypervisor Platform APIs
    /// must not execute concurrently - if `WHvDeletePartition` frees the partition while
    /// `WHvCancelRunVirtualProcessor` is still accessing it, the result is a use-after-free
    /// causing STATUS_ACCESS_VIOLATION or STATUS_HEAP_CORRUPTION.
    ///
    /// The synchronization works as follows:
    /// - `kill()` takes a read lock before calling `WHvCancelRunVirtualProcessor`
    /// - `set_dropped()` takes a write lock, which blocks until all in-flight `kill()` calls complete,
    ///   then sets `dropped = true`. This is called from `HyperlightVm::drop()` before `WhpVm::drop()`
    ///   runs, ensuring no `kill()` is accessing the partition when `WHvDeletePartition` is called.
    partition_state: std::sync::RwLock<PartitionState>,
}
302
/// State protected by the RwLock in `WindowsInterruptHandle`.
///
/// Contains a copy of the partition handle from `WhpVm` (not an owning reference).
/// The RwLock and `dropped` flag ensure this handle is never used after `WhpVm`
/// deletes the partition via `WHvDeletePartition`.
#[cfg(target_os = "windows")]
#[derive(Debug)]
pub(super) struct PartitionState {
    /// Copy of partition handle from `WhpVm`. Only valid while `dropped` is false.
    pub(super) handle: windows::Win32::System::Hypervisor::WHV_PARTITION_HANDLE,
    /// Set true before partition deletion; prevents further use of `handle`.
    pub(super) dropped: bool,
}
316
#[cfg(target_os = "windows")]
impl WindowsInterruptHandle {
    /// Bit in `state`: set while the vcpu is actively running.
    const RUNNING_BIT: u8 = 1 << 1;
    /// Bit in `state`: set when cancellation has been requested.
    const CANCEL_BIT: u8 = 1 << 0;
    /// Bit in `state`: set when a debugger interrupt has been requested.
    #[cfg(gdb)]
    const DEBUG_INTERRUPT_BIT: u8 = 1 << 2;
}
324
#[cfg(target_os = "windows")]
impl InterruptHandleImpl for WindowsInterruptHandle {
    fn set_running(&self) {
        // Release ordering to ensure prior memory operations are visible when
        // another thread observes running=true
        self.state.fetch_or(Self::RUNNING_BIT, Ordering::Release);
    }

    fn is_cancelled(&self) -> bool {
        // Acquire ordering to synchronize with the Release in kill()
        // This ensures we see the CANCEL_BIT set by the interrupt thread
        self.state.load(Ordering::Acquire) & Self::CANCEL_BIT != 0
    }

    fn clear_cancel(&self) {
        // Release ordering to ensure that any operations from the previous run()
        // are visible to other threads. While this is typically called by the vcpu thread
        // at the start of run(), the VM itself can move between threads across guest calls.
        self.state.fetch_and(!Self::CANCEL_BIT, Ordering::Release);
    }

    fn clear_running(&self) {
        // Release ordering to ensure all vcpu operations are visible before clearing running
        self.state.fetch_and(!Self::RUNNING_BIT, Ordering::Release);
    }

    fn is_debug_interrupted(&self) -> bool {
        #[cfg(gdb)]
        {
            self.state.load(Ordering::Acquire) & Self::DEBUG_INTERRUPT_BIT != 0
        }
        #[cfg(not(gdb))]
        {
            false
        }
    }

    #[cfg(gdb)]
    fn clear_debug_interrupt(&self) {
        self.state
            .fetch_and(!Self::DEBUG_INTERRUPT_BIT, Ordering::Release);
    }

    fn set_dropped(&self) {
        // Take write lock to:
        // 1. Wait for any in-flight kill() calls (holding read locks) to complete
        // 2. Block new kill() calls from starting while we hold the write lock
        // 3. Set dropped=true so no future kill() calls will use the handle
        // After this returns, no WHvCancelRunVirtualProcessor calls are in progress
        // or will ever be made, so WHvDeletePartition can safely be called.
        //
        // FIX: recover from a poisoned lock instead of bailing out. A poisoned
        // RwLock only means some thread panicked while holding a guard; the
        // protected data (a raw handle copy and a bool) cannot be left
        // half-updated by that. Previously a poisoned lock left
        // `dropped = false`, silently breaking the invariant that the flag is
        // set before WHvDeletePartition runs.
        let mut guard = self.partition_state.write().unwrap_or_else(|poisoned| {
            tracing::error!("partition_state write lock poisoned; recovering guard");
            poisoned.into_inner()
        });
        guard.dropped = true;
    }
}
384
#[cfg(target_os = "windows")]
impl InterruptHandle for WindowsInterruptHandle {
    fn kill(&self) -> bool {
        use windows::Win32::System::Hypervisor::WHvCancelRunVirtualProcessor;

        // Publish the cancel request. Release pairs with the Acquire load in
        // is_cancelled() on the vcpu thread, so writes made before kill()
        // are visible there.
        self.state.fetch_or(Self::CANCEL_BIT, Ordering::Release);

        // Acquire pairs with the Release in set_running(); a vcpu that is not
        // currently running has nothing to cancel.
        if self.state.load(Ordering::Acquire) & Self::RUNNING_BIT == 0 {
            return false;
        }

        // The read lock excludes WHvDeletePartition (set_dropped() takes the
        // write lock) while still allowing concurrent kill() calls.
        match self.partition_state.read() {
            Ok(partition) if partition.dropped => false,
            Ok(partition) => {
                // SAFETY: dropped is false and the read lock is held, so the
                // partition handle remains valid for the duration of the call.
                unsafe { WHvCancelRunVirtualProcessor(partition.handle, 0, 0).is_ok() }
            }
            Err(e) => {
                tracing::error!("Failed to acquire partition_state read lock: {}", e);
                false
            }
        }
    }

    #[cfg(gdb)]
    fn kill_from_debugger(&self) -> bool {
        use windows::Win32::System::Hypervisor::WHvCancelRunVirtualProcessor;

        // Publish the debugger's interrupt request.
        self.state
            .fetch_or(Self::DEBUG_INTERRUPT_BIT, Ordering::Release);

        // Acquire pairs with the Release in set_running().
        if self.state.load(Ordering::Acquire) & Self::RUNNING_BIT == 0 {
            return false;
        }

        // Read lock prevents racing WHvDeletePartition in set_dropped().
        match self.partition_state.read() {
            Ok(partition) if partition.dropped => false,
            Ok(partition) => {
                // SAFETY: dropped is false and the read lock is held, so the
                // partition handle remains valid for the duration of the call.
                unsafe { WHvCancelRunVirtualProcessor(partition.handle, 0, 0).is_ok() }
            }
            Err(e) => {
                tracing::error!("Failed to acquire partition_state read lock: {}", e);
                false
            }
        }
    }

    fn dropped(&self) -> bool {
        // Read the dropped flag under the lock for a consistent view.
        match self.partition_state.read() {
            Ok(partition) => partition.dropped,
            Err(e) => {
                tracing::error!("Failed to acquire partition_state read lock: {}", e);
                // Assume dropped if we can't acquire lock
                true
            }
        }
    }
}
458
#[cfg(all(test, any(target_os = "windows", kvm)))]
pub(crate) mod tests {
    use std::sync::{Arc, Mutex};

    use hyperlight_testing::dummy_guest_as_string;

    use crate::sandbox::uninitialized::GuestBinary;
    #[cfg(any(crashdump, gdb))]
    use crate::sandbox::uninitialized::SandboxRuntimeConfig;
    use crate::sandbox::uninitialized_evolve::set_up_hypervisor_partition;
    use crate::sandbox::{SandboxConfiguration, UninitializedSandbox};
    use crate::{Result, is_hypervisor_present, new_error};

    /// Smoke test: build a hypervisor partition for the dummy guest binary
    /// and drive `initialise` end-to-end with placeholder parameters.
    #[cfg_attr(feature = "hw-interrupts", ignore)]
    #[test]
    fn test_initialise() -> Result<()> {
        // Trivially succeed on machines without hypervisor support.
        if !is_hypervisor_present() {
            return Ok(());
        }

        use crate::mem::ptr::RawPtr;
        use crate::sandbox::host_funcs::FunctionRegistry;

        let filename = dummy_guest_as_string().map_err(|e| new_error!("{}", e))?;

        let config: SandboxConfiguration = Default::default();
        #[cfg(any(crashdump, gdb))]
        let rt_cfg: SandboxRuntimeConfig = Default::default();
        let sandbox =
            UninitializedSandbox::new(GuestBinary::FilePath(filename.clone()), Some(config))?;
        let (mut mem_mgr, gshm) = sandbox.mgr.build().unwrap();
        // Top of the exception stack in guest virtual address space, derived
        // from the shared layout constants.
        let exn_stack_top_gva = hyperlight_common::layout::MAX_GVA as u64
            - hyperlight_common::layout::SCRATCH_TOP_EXN_STACK_OFFSET
            + 1;
        let mut vm = set_up_hypervisor_partition(
            gshm,
            &config,
            exn_stack_top_gva,
            page_size::get(),
            #[cfg(any(crashdump, gdb))]
            rt_cfg,
            sandbox.load_info,
        )?;

        // Set up required parameters for initialise
        let peb_addr = RawPtr::from(0x1000u64); // Dummy PEB address
        let seed = 12345u64; // Random seed
        let page_size = 4096u32; // Standard page size
        let host_funcs = Arc::new(Mutex::new(FunctionRegistry::default()));
        let guest_max_log_level = Some(tracing_core::LevelFilter::ERROR);

        #[cfg(gdb)]
        let dbg_mem_access_fn = Arc::new(Mutex::new(mem_mgr.clone()));

        // Test the initialise method
        vm.initialise(
            peb_addr,
            seed,
            page_size,
            &mut mem_mgr,
            &host_funcs,
            guest_max_log_level,
            #[cfg(gdb)]
            dbg_mem_access_fn,
        )
        .unwrap();

        Ok(())
    }
}
528}