// hyperlight_host/hypervisor/mod.rs

/*
Copyright 2025 The Hyperlight Authors.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/

17/// GDB debugging support
18#[cfg(gdb)]
19pub(crate) mod gdb;
20
21/// Abstracts over different hypervisor register representations
22pub(crate) mod regs;
23
24pub(crate) mod virtual_machine;
25
26#[cfg(target_os = "windows")]
27/// Hyperlight Surrogate Process
28pub(crate) mod surrogate_process;
29#[cfg(target_os = "windows")]
30/// Hyperlight Surrogate Process
31pub(crate) mod surrogate_process_manager;
32/// Safe wrappers around windows types like `PSTR`
33#[cfg(target_os = "windows")]
34pub mod wrappers;
35
36#[cfg(crashdump)]
37pub(crate) mod crashdump;
38
39pub(crate) mod hyperlight_vm;
40
41use std::fmt::Debug;
42#[cfg(any(kvm, mshv3))]
43use std::sync::atomic::{AtomicBool, AtomicU8, AtomicU64, Ordering};
44#[cfg(target_os = "windows")]
45use std::sync::atomic::{AtomicU8, Ordering};
46#[cfg(any(kvm, mshv3))]
47use std::time::Duration;
48
49/// A trait for platform-specific interrupt handle implementation details
50pub(crate) trait InterruptHandleImpl: InterruptHandle {
51    /// Set the thread ID for the vcpu thread
52    #[cfg(any(kvm, mshv3))]
53    fn set_tid(&self);
54
55    /// Set the running state
56    fn set_running(&self);
57
58    /// Clear the running state
59    fn clear_running(&self);
60
61    /// Mark the handle as dropped
62    fn set_dropped(&self);
63
64    /// Check if cancellation was requested
65    fn is_cancelled(&self) -> bool;
66
67    /// Clear the cancellation request flag
68    fn clear_cancel(&self);
69
70    /// Check if debug interrupt was requested (always returns false when gdb feature is disabled)
71    fn is_debug_interrupted(&self) -> bool;
72
73    // Clear the debug interrupt request flag
74    #[cfg(gdb)]
75    fn clear_debug_interrupt(&self);
76}
77
/// A trait for handling interrupts to a sandbox's vcpu
pub trait InterruptHandle: Send + Sync + Debug {
    /// Interrupt the corresponding sandbox from running.
    ///
    /// - If this is called while the sandbox is currently executing a guest function call, it will interrupt the sandbox and return `true`.
    /// - If this is called while the sandbox is not running (for example before or after calling a guest function), it will do nothing and return `false`.
    ///
    /// # Note
    /// This function will block for the duration of the time it takes for the vcpu thread to be interrupted.
    fn kill(&self) -> bool;

    /// Used by a debugger to interrupt the corresponding sandbox from running.
    ///
    /// - If this is called while the vcpu is running, then it will interrupt the vcpu and return `true`.
    /// - If this is called while the vcpu is not running, (for example during a host call), the
    ///   vcpu will not immediately be interrupted, but will prevent the vcpu from running **the next time**
    ///   it's scheduled, and returns `false`.
    ///
    /// # Note
    /// This function will block for the duration of the time it takes for the vcpu thread to be interrupted.
    #[cfg(gdb)]
    fn kill_from_debugger(&self) -> bool;

    /// Returns true if the corresponding sandbox has been dropped
    fn dropped(&self) -> bool;
}

/// Linux interrupt handle for a sandbox's vcpu.
///
/// Packs the vcpu execution state into a single atomic and interrupts the
/// vcpu thread with a POSIX real-time signal (see `send_signal` below).
#[cfg(any(kvm, mshv3))]
#[derive(Debug)]
pub(super) struct LinuxInterruptHandle {
    /// Atomic value packing vcpu execution state.
    ///
    /// Bit layout:
    /// - Bit 2: DEBUG_INTERRUPT_BIT - set when debugger interrupt is requested
    /// - Bit 1: RUNNING_BIT - set when vcpu is actively running
    /// - Bit 0: CANCEL_BIT - set when cancellation has been requested
    ///
    /// CANCEL_BIT persists across vcpu exits/re-entries within a single `VirtualCPU::run()` call
    /// (e.g., during host function calls), but is cleared at the start of each new `VirtualCPU::run()` call.
    state: AtomicU8,

    /// Thread ID where the vcpu is running.
    ///
    /// Note: Multiple VMs may have the same `tid` (same thread runs multiple sandboxes sequentially),
    /// but at most one VM will have RUNNING_BIT set at any given time.
    tid: AtomicU64,

    /// Whether the corresponding VM has been dropped.
    dropped: AtomicBool,

    /// Delay between retry attempts when sending signals to interrupt the vcpu.
    retry_delay: Duration,

    /// Offset from SIGRTMIN for the signal used to interrupt the vcpu thread.
    sig_rt_min_offset: u8,
}

#[cfg(any(kvm, mshv3))]
impl LinuxInterruptHandle {
    const RUNNING_BIT: u8 = 1 << 1;
    const CANCEL_BIT: u8 = 1 << 0;
    #[cfg(gdb)]
    const DEBUG_INTERRUPT_BIT: u8 = 1 << 2;

    /// Atomically snapshot the (running, cancel, debug) flags from `state`.
    ///
    /// # Memory Ordering
    /// The `Acquire` load pairs with the `Release` stores in `set_running()`
    /// and `kill()`: observing running=true guarantees the `tid` value
    /// published before it is also visible.
    fn get_running_cancel_debug(&self) -> (bool, bool, bool) {
        let snapshot = self.state.load(Ordering::Acquire);
        #[cfg(gdb)]
        let debug = snapshot & Self::DEBUG_INTERRUPT_BIT != 0;
        #[cfg(not(gdb))]
        let debug = false;
        (
            snapshot & Self::RUNNING_BIT != 0,
            snapshot & Self::CANCEL_BIT != 0,
            debug,
        )
    }

    /// Repeatedly signal the vcpu thread for as long as it is running and a
    /// cancel or debug interrupt is still pending. Returns whether at least
    /// one signal was sent.
    fn send_signal(&self) -> bool {
        let signal_number = libc::SIGRTMIN() + self.sig_rt_min_offset as libc::c_int;
        let mut sent_signal = false;

        while {
            let (running, cancel, debug) = self.get_running_cancel_debug();
            // Keep signalling only while the vcpu is running AND an
            // interrupt request (cancel or debug) is still outstanding.
            running && (cancel || debug)
        } {
            tracing::info!("Sending signal to kill vcpu thread...");
            sent_signal = true;
            // The Acquire load pairs with the Release store in set_tid(),
            // so we read the tid of the currently running vcpu.
            unsafe {
                libc::pthread_kill(self.tid.load(Ordering::Acquire) as _, signal_number);
            }
            std::thread::sleep(self.retry_delay);
        }

        sent_signal
    }
}

#[cfg(any(kvm, mshv3))]
impl InterruptHandleImpl for LinuxInterruptHandle {
    fn set_tid(&self) {
        // SAFETY: pthread_self() has no preconditions; it returns the id of
        // the calling thread.
        let current = unsafe { libc::pthread_self() as u64 };
        // Release pairs with the Acquire load of `running` in send_signal():
        // once RUNNING_BIT is observed set, this tid store is visible too,
        // so the signaling thread never reads a stale tid.
        self.tid.store(current, Ordering::Release);
    }

    fn set_running(&self) {
        // Release: the preceding tid store (also Release) becomes visible to
        // any thread that observes running=true with an Acquire load.
        self.state.fetch_or(Self::RUNNING_BIT, Ordering::Release);
    }

    fn clear_running(&self) {
        // Release: publish all vcpu work before the running bit disappears.
        self.state.fetch_and(!Self::RUNNING_BIT, Ordering::Release);
    }

    fn is_cancelled(&self) -> bool {
        // Acquire pairs with the Release in kill(), making the cancel flag
        // written by the interrupting thread visible here.
        self.state.load(Ordering::Acquire) & Self::CANCEL_BIT != 0
    }

    fn clear_cancel(&self) {
        // Release: operations from the previous run() stay visible to other
        // threads. Typically invoked by the vcpu thread at the start of
        // run(), but the VM may migrate between threads across guest calls.
        self.state.fetch_and(!Self::CANCEL_BIT, Ordering::Release);
    }

    fn is_debug_interrupted(&self) -> bool {
        #[cfg(gdb)]
        {
            self.state.load(Ordering::Acquire) & Self::DEBUG_INTERRUPT_BIT != 0
        }
        #[cfg(not(gdb))]
        {
            false
        }
    }

    #[cfg(gdb)]
    fn clear_debug_interrupt(&self) {
        self.state
            .fetch_and(!Self::DEBUG_INTERRUPT_BIT, Ordering::Release);
    }

    fn set_dropped(&self) {
        // Release: every VM cleanup operation performed before the drop is
        // visible to whoever reads dropped() with Acquire.
        self.dropped.store(true, Ordering::Release);
    }
}

#[cfg(any(kvm, mshv3))]
impl InterruptHandle for LinuxInterruptHandle {
    fn kill(&self) -> bool {
        // Publish the cancellation request first; the Release pairs with the
        // Acquire load in is_cancelled() on the vcpu thread.
        self.state.fetch_or(Self::CANCEL_BIT, Ordering::Release);
        // Then poke the vcpu thread with signals while it is still running.
        self.send_signal()
    }

    #[cfg(gdb)]
    fn kill_from_debugger(&self) -> bool {
        // Same scheme as kill(), but flags a debugger interrupt instead.
        self.state
            .fetch_or(Self::DEBUG_INTERRUPT_BIT, Ordering::Release);
        self.send_signal()
    }

    fn dropped(&self) -> bool {
        // Acquire pairs with the Release in set_dropped(), so all cleanup
        // that happened before the drop is visible here.
        self.dropped.load(Ordering::Acquire)
    }
}

/// Windows interrupt handle for a sandbox's vcpu.
///
/// Cancellation is delivered through `WHvCancelRunVirtualProcessor`, so the
/// partition handle must be protected against concurrent deletion (see
/// `partition_state`).
#[cfg(target_os = "windows")]
#[derive(Debug)]
pub(super) struct WindowsInterruptHandle {
    /// Atomic value packing vcpu execution state.
    ///
    /// Bit layout:
    /// - Bit 2: DEBUG_INTERRUPT_BIT - set when debugger interrupt is requested
    /// - Bit 1: RUNNING_BIT - set when vcpu is actively running
    /// - Bit 0: CANCEL_BIT - set when cancellation has been requested
    ///
    /// `WHvCancelRunVirtualProcessor()` will return Ok even if the vcpu is not running,
    /// which is why we need the RUNNING_BIT.
    ///
    /// CANCEL_BIT persists across vcpu exits/re-entries within a single `VirtualCPU::run()` call
    /// (e.g., during host function calls), but is cleared at the start of each new `VirtualCPU::run()` call.
    state: AtomicU8,

    /// RwLock protecting the partition handle and dropped state.
    ///
    /// This lock prevents a race condition between `kill()` calling `WHvCancelRunVirtualProcessor`
    /// and `WhpVm::drop()` calling `WHvDeletePartition`. These two Windows Hypervisor Platform APIs
    /// must not execute concurrently - if `WHvDeletePartition` frees the partition while
    /// `WHvCancelRunVirtualProcessor` is still accessing it, the result is a use-after-free
    /// causing STATUS_ACCESS_VIOLATION or STATUS_HEAP_CORRUPTION.
    ///
    /// The synchronization works as follows:
    /// - `kill()` takes a read lock before calling `WHvCancelRunVirtualProcessor`
    /// - `set_dropped()` takes a write lock, which blocks until all in-flight `kill()` calls complete,
    ///   then sets `dropped = true`. This is called from `HyperlightVm::drop()` before `WhpVm::drop()`
    ///   runs, ensuring no `kill()` is accessing the partition when `WHvDeletePartition` is called.
    partition_state: std::sync::RwLock<PartitionState>,
}

/// State protected by the RwLock in `WindowsInterruptHandle`.
///
/// Contains a copy of the partition handle from `WhpVm` (not an owning reference).
/// The RwLock and `dropped` flag ensure this handle is never used after `WhpVm`
/// deletes the partition.
#[cfg(target_os = "windows")]
#[derive(Debug)]
pub(super) struct PartitionState {
    /// Copy of partition handle from `WhpVm`. Only valid while `dropped` is false.
    pub(super) handle: windows::Win32::System::Hypervisor::WHV_PARTITION_HANDLE,
    /// Set true before partition deletion; prevents further use of `handle`.
    /// Nothing in this file ever resets it to false once set.
    pub(super) dropped: bool,
}

#[cfg(target_os = "windows")]
impl WindowsInterruptHandle {
    /// Bit 0: a cancellation has been requested.
    const CANCEL_BIT: u8 = 1 << 0;
    /// Bit 1: the vcpu is actively running.
    const RUNNING_BIT: u8 = 1 << 1;
    /// Bit 2: a debugger interrupt has been requested.
    #[cfg(gdb)]
    const DEBUG_INTERRUPT_BIT: u8 = 1 << 2;
}

#[cfg(target_os = "windows")]
impl InterruptHandleImpl for WindowsInterruptHandle {
    fn set_running(&self) {
        // Release: memory operations performed before this store become
        // visible to any thread that observes running=true with Acquire.
        self.state.fetch_or(Self::RUNNING_BIT, Ordering::Release);
    }

    fn clear_running(&self) {
        // Release: publish all vcpu work before the running bit disappears.
        self.state.fetch_and(!Self::RUNNING_BIT, Ordering::Release);
    }

    fn is_cancelled(&self) -> bool {
        // Acquire pairs with the Release in kill(), making the CANCEL_BIT
        // written by the interrupting thread visible here.
        self.state.load(Ordering::Acquire) & Self::CANCEL_BIT != 0
    }

    fn clear_cancel(&self) {
        // Release: operations from the previous run() remain visible to
        // other threads. Typically invoked by the vcpu thread at the start
        // of run(), but the VM may migrate between threads across calls.
        self.state.fetch_and(!Self::CANCEL_BIT, Ordering::Release);
    }

    fn is_debug_interrupted(&self) -> bool {
        #[cfg(gdb)]
        {
            self.state.load(Ordering::Acquire) & Self::DEBUG_INTERRUPT_BIT != 0
        }
        #[cfg(not(gdb))]
        {
            false
        }
    }

    #[cfg(gdb)]
    fn clear_debug_interrupt(&self) {
        self.state
            .fetch_and(!Self::DEBUG_INTERRUPT_BIT, Ordering::Release);
    }

    fn set_dropped(&self) {
        // The write lock (1) waits for in-flight kill() readers to finish,
        // (2) blocks new kill() calls while held, and (3) flips dropped=true
        // so no later kill() touches the partition handle. After this
        // returns, no WHvCancelRunVirtualProcessor call is in progress or
        // will ever start, so WHvDeletePartition can safely run.
        match self.partition_state.write() {
            Ok(mut guard) => guard.dropped = true,
            Err(e) => {
                tracing::error!("Failed to acquire partition_state write lock: {}", e)
            }
        }
    }
}

#[cfg(target_os = "windows")]
impl WindowsInterruptHandle {
    /// Common cancellation path shared by `kill()` and `kill_from_debugger()`:
    /// if the vcpu is currently running and the partition has not been
    /// dropped, cancel its execution via `WHvCancelRunVirtualProcessor`.
    ///
    /// Returns `true` only if the cancel call was actually issued and succeeded.
    fn cancel_if_running(&self) -> bool {
        use windows::Win32::System::Hypervisor::WHvCancelRunVirtualProcessor;

        // Acquire ordering to synchronize with the Release in set_running():
        // we must see the running state set by the vcpu thread.
        let state = self.state.load(Ordering::Acquire);
        if state & Self::RUNNING_BIT == 0 {
            return false;
        }

        // Take read lock to prevent a race with WHvDeletePartition in
        // set_dropped(). Multiple callers can hold read locks concurrently,
        // but set_dropped() (write lock) waits for all of them to complete.
        let guard = match self.partition_state.read() {
            Ok(guard) => guard,
            Err(e) => {
                tracing::error!("Failed to acquire partition_state read lock: {}", e);
                return false;
            }
        };

        if guard.dropped {
            return false;
        }

        // SAFETY: `guard.dropped` is false and we hold the read lock, so the
        // partition handle is still valid and cannot be deleted concurrently
        // (set_dropped() must acquire the write lock first).
        unsafe { WHvCancelRunVirtualProcessor(guard.handle, 0, 0).is_ok() }
    }
}

#[cfg(target_os = "windows")]
impl InterruptHandle for WindowsInterruptHandle {
    fn kill(&self) -> bool {
        // Release ordering ensures that any writes before kill() are visible
        // to the vcpu thread when it checks is_cancelled() with Acquire.
        self.state.fetch_or(Self::CANCEL_BIT, Ordering::Release);
        self.cancel_if_running()
    }

    #[cfg(gdb)]
    fn kill_from_debugger(&self) -> bool {
        // Same as kill(), but flags a debugger interrupt instead.
        self.state
            .fetch_or(Self::DEBUG_INTERRUPT_BIT, Ordering::Release);
        self.cancel_if_running()
    }

    fn dropped(&self) -> bool {
        // Read lock gives a consistent view of the dropped flag.
        match self.partition_state.read() {
            Ok(guard) => guard.dropped,
            Err(e) => {
                tracing::error!("Failed to acquire partition_state read lock: {}", e);
                // A poisoned lock means a writer panicked mid-update; treat
                // the handle as unusable/dropped.
                true
            }
        }
    }
}

#[cfg(all(test, any(target_os = "windows", kvm)))]
pub(crate) mod tests {
    use std::sync::{Arc, Mutex};

    use hyperlight_testing::dummy_guest_as_string;

    use crate::sandbox::uninitialized::GuestBinary;
    #[cfg(any(crashdump, gdb))]
    use crate::sandbox::uninitialized::SandboxRuntimeConfig;
    use crate::sandbox::uninitialized_evolve::set_up_hypervisor_partition;
    use crate::sandbox::{SandboxConfiguration, UninitializedSandbox};
    use crate::{Result, is_hypervisor_present, new_error};

    // End-to-end smoke test: builds an uninitialized sandbox from the dummy
    // guest binary, sets up a hypervisor partition for it, and drives
    // `initialise` with placeholder parameters.
    #[cfg_attr(feature = "hw-interrupts", ignore)]
    #[test]
    fn test_initialise() -> Result<()> {
        // Skip silently on machines without hypervisor support.
        if !is_hypervisor_present() {
            return Ok(());
        }

        use crate::mem::ptr::RawPtr;
        use crate::sandbox::host_funcs::FunctionRegistry;

        let filename = dummy_guest_as_string().map_err(|e| new_error!("{}", e))?;

        let config: SandboxConfiguration = Default::default();
        #[cfg(any(crashdump, gdb))]
        let rt_cfg: SandboxRuntimeConfig = Default::default();
        let sandbox =
            UninitializedSandbox::new(GuestBinary::FilePath(filename.clone()), Some(config))?;
        let (mut mem_mgr, gshm) = sandbox.mgr.build().unwrap();
        // Guest virtual address of the top of the exception stack, derived
        // from the shared layout constants.
        // NOTE(review): assumes SCRATCH_TOP_EXN_STACK_OFFSET is measured
        // downward from MAX_GVA — confirm against hyperlight_common::layout.
        let exn_stack_top_gva = hyperlight_common::layout::MAX_GVA as u64
            - hyperlight_common::layout::SCRATCH_TOP_EXN_STACK_OFFSET
            + 1;
        let mut vm = set_up_hypervisor_partition(
            gshm,
            &config,
            exn_stack_top_gva,
            page_size::get(),
            #[cfg(any(crashdump, gdb))]
            rt_cfg,
            sandbox.load_info,
        )?;

        // Set up required parameters for initialise
        let peb_addr = RawPtr::from(0x1000u64); // Dummy PEB address
        let seed = 12345u64; // Random seed
        let page_size = 4096u32; // Standard page size
        let host_funcs = Arc::new(Mutex::new(FunctionRegistry::default()));
        let guest_max_log_level = Some(tracing_core::LevelFilter::ERROR);

        #[cfg(gdb)]
        let dbg_mem_access_fn = Arc::new(Mutex::new(mem_mgr.clone()));

        // Test the initialise method
        vm.initialise(
            peb_addr,
            seed,
            page_size,
            &mut mem_mgr,
            &host_funcs,
            guest_max_log_level,
            #[cfg(gdb)]
            dbg_mem_access_fn,
        )
        .unwrap();

        Ok(())
    }
}