hyperlight_host/hypervisor/mod.rs

/*
Copyright 2025 The Hyperlight Authors.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/

use log::LevelFilter;

use crate::Result;
use crate::hypervisor::regs::{CommonFpu, CommonRegisters, CommonSpecialRegisters};
use crate::mem::memory_region::MemoryRegion;

/// HyperV-on-linux functionality
#[cfg(mshv3)]
pub(crate) mod hyperv_linux;
#[cfg(target_os = "windows")]
pub(crate) mod hyperv_windows;

/// GDB debugging support
#[cfg(gdb)]
pub(crate) mod gdb;

/// Abstracts over different hypervisor register representations
pub(crate) mod regs;

#[cfg(kvm)]
/// Functionality to manipulate KVM-based virtual machines
pub(crate) mod kvm;

#[cfg(target_os = "windows")]
/// Hyperlight Surrogate Process
pub(crate) mod surrogate_process;
#[cfg(target_os = "windows")]
/// Hyperlight Surrogate Process Manager
pub(crate) mod surrogate_process_manager;
/// Safe wrappers around windows types like `PSTR`
#[cfg(target_os = "windows")]
pub(crate) mod wrappers;

#[cfg(crashdump)]
pub(crate) mod crashdump;

pub(crate) mod hyperlight_vm;

use std::fmt::Debug;
use std::str::FromStr;
#[cfg(any(kvm, mshv3))]
use std::sync::atomic::{AtomicBool, AtomicU8, AtomicU64, Ordering};
#[cfg(target_os = "windows")]
use std::sync::atomic::{AtomicBool, AtomicU8, Ordering};
#[cfg(any(kvm, mshv3))]
use std::time::Duration;

pub(crate) enum HyperlightExit {
    /// The vCPU has exited due to a debug event (usually a breakpoint)
    #[cfg(gdb)]
    Debug { dr6: u64, exception: u32 },
    /// The vCPU has halted
    Halt(),
    /// The vCPU has issued a write to the given port with the given value
    IoOut(u16, Vec<u8>),
    /// The vCPU tried to read from the given (unmapped) addr
    MmioRead(u64),
    /// The vCPU tried to write to the given (unmapped) addr
    MmioWrite(u64),
    /// The vCPU execution has been cancelled
    Cancelled(),
    /// The vCPU has exited for a reason that is not handled by Hyperlight
    Unknown(String),
    /// The operation should be retried; for example, on Linux a call to run the vCPU can return `EAGAIN`
    #[cfg_attr(
        target_os = "windows",
        expect(
            dead_code,
            reason = "Retry() is never constructed on Windows, but it is still matched on (which dead_code lint ignores)"
        )
    )]
    Retry(),
}
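// Illustrative only (not part of this module's API): a driver loop around `run_vcpu()`
// is expected to dispatch on `HyperlightExit` roughly as in the hedged sketch below.
// `handle_io_out`, `handle_mmio_read` and `handle_mmio_write` are hypothetical helper
// names used purely for illustration.
//
//     loop {
//         match vm.run_vcpu()? {
//             HyperlightExit::Halt() => break,
//             HyperlightExit::IoOut(port, data) => handle_io_out(port, &data)?,
//             HyperlightExit::MmioRead(addr) => handle_mmio_read(addr)?,
//             HyperlightExit::MmioWrite(addr) => handle_mmio_write(addr)?,
//             HyperlightExit::Cancelled() => return Err(new_error!("execution cancelled")),
//             HyperlightExit::Retry() => continue,
//             HyperlightExit::Unknown(reason) => return Err(new_error!("unknown exit: {}", reason)),
//             #[cfg(gdb)]
//             HyperlightExit::Debug { .. } => { /* hand control to the gdb stub */ }
//         }
//     }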

/// Trait for single-vCPU VMs. Provides a common interface for basic VM operations.
/// Abstracts over differences between the KVM, MSHV and WHP implementations.
pub(crate) trait Hypervisor: Debug + Send {
    /// Map a memory region into this VM
    ///
    /// # Safety
    /// The caller must ensure that the memory region is valid, points to valid memory,
    /// and lives long enough for the VM to use it.
    /// The caller must ensure that the given u32 slot number is not already mapped; otherwise a
    /// previously mapped memory region may be overwritten.
    /// The memory region must not overlap with an existing region and, depending on the platform,
    /// must be aligned to page boundaries.
    unsafe fn map_memory(&mut self, region: (u32, &MemoryRegion)) -> Result<()>;

    /// Unmap a memory region from this VM that was previously mapped using `map_memory`.
    fn unmap_memory(&mut self, region: (u32, &MemoryRegion)) -> Result<()>;

    /// Runs the vCPU until it exits.
    /// Note: this function should not emit any traces or spans, as it is called after the guest span is set up
    fn run_vcpu(&mut self) -> Result<HyperlightExit>;

    /// Get regs
    #[allow(dead_code)]
    fn regs(&self) -> Result<CommonRegisters>;
    /// Set regs
    fn set_regs(&self, regs: &CommonRegisters) -> Result<()>;
    /// Get fpu regs
    #[allow(dead_code)]
    fn fpu(&self) -> Result<CommonFpu>;
    /// Set fpu regs
    fn set_fpu(&self, fpu: &CommonFpu) -> Result<()>;
    /// Get special regs
    #[allow(dead_code)]
    fn sregs(&self) -> Result<CommonSpecialRegisters>;
    /// Set special regs
    fn set_sregs(&self, sregs: &CommonSpecialRegisters) -> Result<()>;

    /// Get the vCPU's XSAVE state
    #[cfg(crashdump)]
    fn xsave(&self) -> Result<Vec<u8>>;

    /// Get partition handle
    #[cfg(target_os = "windows")]
    fn partition_handle(&self) -> windows::Win32::System::Hypervisor::WHV_PARTITION_HANDLE;

    /// Mark that initial memory setup is complete. After this, `map_memory` will fail.
    /// This is only needed on Windows, where dynamic memory mapping is not yet supported.
    #[cfg(target_os = "windows")]
    fn complete_initial_memory_setup(&mut self);
}
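// Illustrative only: the `(u32, &MemoryRegion)` tuple pairs a caller-chosen slot number
// with the region to map, and the same slot is later passed to `unmap_memory`. A hedged
// sketch (construction of the `MemoryRegion` is elided, and `vm`/`region` are assumed):
//
//     let slot: u32 = 1;
//     unsafe { vm.map_memory((slot, &region))? };
//     // ... run the guest while the region is mapped ...
//     vm.unmap_memory((slot, &region))?;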

/// Get the logging level to pass to the guest entrypoint
fn get_max_log_level() -> u32 {
    // Check whether the RUST_LOG environment variable is set and, if so, parse it to get
    // the log level for hyperlight_guest. If that is not set, fall back to the level
    // configured for hyperlight_host, and finally to any crate-less default in RUST_LOG.

    // This is done because the guest will produce logs based on the log level returned here;
    // producing those logs is expensive, and we don't want to do it if the host is not
    // going to process them.

    let val = std::env::var("RUST_LOG").unwrap_or_default();

    let level = if val.contains("hyperlight_guest") {
        val.split(',')
            .find(|s| s.contains("hyperlight_guest"))
            .unwrap_or("")
            .split('=')
            .nth(1)
            .unwrap_or("")
    } else if val.contains("hyperlight_host") {
        val.split(',')
            .find(|s| s.contains("hyperlight_host"))
            .unwrap_or("")
            .split('=')
            .nth(1)
            .unwrap_or("")
    } else {
        // look for a value string that does not contain "="
        val.split(',').find(|s| !s.contains("=")).unwrap_or("")
    };

    log::info!("Determined guest log level: {}", level);
    // Convert the log level string to a LevelFilter.
    // If no value is found, default to Error.
    LevelFilter::from_str(level).unwrap_or(LevelFilter::Error) as u32
}
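// Examples of how the parsing above resolves RUST_LOG values (derived from the logic in
// `get_max_log_level`, shown here for illustration only):
//
//     RUST_LOG=hyperlight_guest=debug                      -> LevelFilter::Debug
//     RUST_LOG=hyperlight_guest=trace,hyperlight_host=info -> LevelFilter::Trace
//     RUST_LOG=hyperlight_host=warn                        -> LevelFilter::Warn
//     RUST_LOG=info                                        -> LevelFilter::Info
//     (unset, empty, or unparsable)                        -> LevelFilter::Error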

/// A trait for platform-specific interrupt handle implementation details
pub(crate) trait InterruptHandleImpl: InterruptHandle {
    /// Set the thread ID for the vcpu thread
    #[cfg(any(kvm, mshv3))]
    fn set_tid(&self);

    /// Set the running state
    fn set_running(&self);

    /// Clear the running state
    fn clear_running(&self);

    /// Mark the handle as dropped
    fn set_dropped(&self);

    /// Check if cancellation was requested
    fn is_cancelled(&self) -> bool;

    /// Clear the cancellation request flag
    fn clear_cancel(&self);

    /// Check if debug interrupt was requested (always returns false when gdb feature is disabled)
    fn is_debug_interrupted(&self) -> bool;

    /// Clear the debug interrupt request flag
    #[cfg(gdb)]
    fn clear_debug_interrupt(&self);
}

/// A trait for handling interrupts to a sandbox's vcpu
pub trait InterruptHandle: Send + Sync + Debug {
    /// Interrupt the corresponding sandbox from running.
    ///
    /// - If this is called while the sandbox is currently executing a guest function call, it will interrupt the sandbox and return `true`.
    /// - If this is called while the sandbox is not running (for example before or after calling a guest function), it will do nothing and return `false`.
    ///
    /// # Note
    /// This function blocks for as long as it takes for the vcpu thread to be interrupted.
    fn kill(&self) -> bool;

    /// Used by a debugger to interrupt the corresponding sandbox from running.
    ///
    /// - If this is called while the vcpu is running, it will interrupt the vcpu and return `true`.
    /// - If this is called while the vcpu is not running (for example during a host call), the
    ///   vcpu will not be interrupted immediately, but it will be prevented from running **the next time**
    ///   it is scheduled, and this function returns `false`.
    ///
    /// # Note
    /// This function blocks for as long as it takes for the vcpu thread to be interrupted.
    #[cfg(gdb)]
    fn kill_from_debugger(&self) -> bool;

    /// Returns true if the corresponding sandbox has been dropped
    fn dropped(&self) -> bool;
}
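// A hedged usage sketch for `InterruptHandle::kill()`. It assumes the sandbox exposes its
// handle via an accessor such as `interrupt_handle()` (the accessor name is an assumption
// for illustration) returning an `Arc<dyn InterruptHandle>`:
//
//     let handle = sandbox.interrupt_handle();
//     let watchdog = std::thread::spawn(move || {
//         std::thread::sleep(std::time::Duration::from_secs(1));
//         // Returns true only if the vcpu was actually running when interrupted.
//         let interrupted = handle.kill();
//         log::info!("kill() returned {}", interrupted);
//     });
//     // ... call a guest function on this thread; a long-running guest call is
//     // cancelled when the watchdog fires ...
//     watchdog.join().unwrap();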

#[cfg(any(kvm, mshv3))]
#[derive(Debug)]
pub(super) struct LinuxInterruptHandle {
    /// Atomic value packing vcpu execution state.
    ///
    /// Bit layout:
    /// - Bit 2: DEBUG_INTERRUPT_BIT - set when debugger interrupt is requested
    /// - Bit 1: RUNNING_BIT - set when vcpu is actively running
    /// - Bit 0: CANCEL_BIT - set when cancellation has been requested
    ///
    /// CANCEL_BIT persists across vcpu exits/re-entries within a single `VirtualCPU::run()` call
    /// (e.g., during host function calls), but is cleared at the start of each new `VirtualCPU::run()` call.
    state: AtomicU8,

    /// Thread ID where the vcpu is running.
    ///
    /// Note: Multiple VMs may have the same `tid` (same thread runs multiple sandboxes sequentially),
    /// but at most one VM will have RUNNING_BIT set at any given time.
    tid: AtomicU64,

    /// Whether the corresponding VM has been dropped.
    dropped: AtomicBool,

    /// Delay between retry attempts when sending signals to interrupt the vcpu.
    retry_delay: Duration,

    /// Offset from SIGRTMIN for the signal used to interrupt the vcpu thread.
    sig_rt_min_offset: u8,
}
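// Illustration (derived from the bit layout documented above): examples of packed `state`
// values this handle can hold.
//
//     0b000 -> idle, no cancellation requested
//     0b010 -> vcpu running                                  (RUNNING_BIT)
//     0b011 -> vcpu running, cancellation requested          (RUNNING_BIT | CANCEL_BIT)
//     0b001 -> cancellation requested while not running      (CANCEL_BIT)
//     0b110 -> vcpu running, debugger interrupt pending      (gdb builds only)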

#[cfg(any(kvm, mshv3))]
impl LinuxInterruptHandle {
    const RUNNING_BIT: u8 = 1 << 1;
    const CANCEL_BIT: u8 = 1 << 0;
    #[cfg(gdb)]
    const DEBUG_INTERRUPT_BIT: u8 = 1 << 2;

    /// Get the running, cancel and debug flags atomically.
    ///
    /// # Memory Ordering
    /// Uses `Acquire` ordering to synchronize with the `Release` in `set_running()` and `kill()`.
    /// This ensures that when we observe running=true, we also see the correct `tid` value.
    fn get_running_cancel_debug(&self) -> (bool, bool, bool) {
        let state = self.state.load(Ordering::Acquire);
        let running = state & Self::RUNNING_BIT != 0;
        let cancel = state & Self::CANCEL_BIT != 0;
        #[cfg(gdb)]
        let debug = state & Self::DEBUG_INTERRUPT_BIT != 0;
        #[cfg(not(gdb))]
        let debug = false;
        (running, cancel, debug)
    }

    fn send_signal(&self) -> bool {
        let signal_number = libc::SIGRTMIN() + self.sig_rt_min_offset as libc::c_int;
        let mut sent_signal = false;

        loop {
            let (running, cancel, debug) = self.get_running_cancel_debug();

            // Check if we should continue sending signals
            // Exit if not running OR if neither cancel nor debug_interrupt is set
            let should_continue = running && (cancel || debug);

            if !should_continue {
                break;
            }

            log::info!("Sending signal to kill vcpu thread...");
            sent_signal = true;
            // Acquire ordering to synchronize with the Release store in set_tid()
            // This ensures we see the correct tid value for the currently running vcpu
            unsafe {
                libc::pthread_kill(self.tid.load(Ordering::Acquire) as _, signal_number);
            }
            std::thread::sleep(self.retry_delay);
        }

        sent_signal
    }
}

#[cfg(any(kvm, mshv3))]
impl InterruptHandleImpl for LinuxInterruptHandle {
    fn set_tid(&self) {
        // Release ordering to synchronize with the Acquire load of `running` in send_signal()
        // This ensures that when send_signal() observes RUNNING_BIT=true (via Acquire),
        // it also sees the correct tid value stored here
        self.tid
            .store(unsafe { libc::pthread_self() as u64 }, Ordering::Release);
    }

    fn set_running(&self) {
        // Release ordering to ensure that the tid store (which uses Release)
        // is visible to any thread that observes running=true via Acquire ordering.
        // This prevents the interrupt thread from reading a stale tid value.
        self.state.fetch_or(Self::RUNNING_BIT, Ordering::Release);
    }

    fn is_cancelled(&self) -> bool {
        // Acquire ordering to synchronize with the Release in kill()
        // This ensures we see the cancel flag set by the interrupt thread
        self.state.load(Ordering::Acquire) & Self::CANCEL_BIT != 0
    }

    fn clear_cancel(&self) {
        // Release ordering to ensure that any operations from the previous run()
        // are visible to other threads. While this is typically called by the vcpu thread
        // at the start of run(), the VM itself can move between threads across guest calls.
        self.state.fetch_and(!Self::CANCEL_BIT, Ordering::Release);
    }

    fn clear_running(&self) {
        // Release ordering to ensure all vcpu operations are visible before clearing running
        self.state.fetch_and(!Self::RUNNING_BIT, Ordering::Release);
    }

    fn is_debug_interrupted(&self) -> bool {
        #[cfg(gdb)]
        {
            self.state.load(Ordering::Acquire) & Self::DEBUG_INTERRUPT_BIT != 0
        }
        #[cfg(not(gdb))]
        {
            false
        }
    }

    #[cfg(gdb)]
    fn clear_debug_interrupt(&self) {
        self.state
            .fetch_and(!Self::DEBUG_INTERRUPT_BIT, Ordering::Release);
    }

    fn set_dropped(&self) {
        // Release ordering to ensure all VM cleanup operations are visible
        // to any thread that checks dropped() via Acquire
        self.dropped.store(true, Ordering::Release);
    }
}

#[cfg(any(kvm, mshv3))]
impl InterruptHandle for LinuxInterruptHandle {
    fn kill(&self) -> bool {
        // Release ordering ensures that any writes before kill() are visible to the vcpu thread
        // when it checks is_cancelled() with Acquire ordering
        self.state.fetch_or(Self::CANCEL_BIT, Ordering::Release);

        // Send signals to interrupt the vcpu if it's currently running
        self.send_signal()
    }

    #[cfg(gdb)]
    fn kill_from_debugger(&self) -> bool {
        self.state
            .fetch_or(Self::DEBUG_INTERRUPT_BIT, Ordering::Release);
        self.send_signal()
    }

    fn dropped(&self) -> bool {
        // Acquire ordering to synchronize with the Release in set_dropped()
        // This ensures we see all VM cleanup operations that happened before drop
        self.dropped.load(Ordering::Acquire)
    }
}
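// Descriptive summary of the memory-ordering protocol used above (derived from the
// comments in the methods; shown here for illustration only):
//
//     vcpu thread                               interrupting thread
//     -----------                               -------------------
//     set_tid()       (Release store of tid)
//     set_running()   (Release set RUNNING_BIT)
//                                               kill(): Release set CANCEL_BIT
//                                               send_signal(): Acquire load of state
//                                                 sees RUNNING_BIT => Acquire load of tid
//                                                 pthread_kill(tid, SIGRTMIN + offset)
//     run_vcpu() is interrupted and exits
//     clear_running() (Release clear RUNNING_BIT)
//
// The Acquire load of `state` in `send_signal()` pairs with the Release in `set_running()`,
// which in turn guarantees the earlier Release store of `tid` is visible before any signal
// is sent.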

#[cfg(target_os = "windows")]
#[derive(Debug)]
pub(super) struct WindowsInterruptHandle {
    /// Atomic value packing vcpu execution state.
    ///
    /// Bit layout:
    /// - Bit 2: DEBUG_INTERRUPT_BIT - set when debugger interrupt is requested
    /// - Bit 1: RUNNING_BIT - set when vcpu is actively running
    /// - Bit 0: CANCEL_BIT - set when cancellation has been requested
    ///
    /// `WHvCancelRunVirtualProcessor()` will return Ok even if the vcpu is not running,
    /// which is why we need the RUNNING_BIT.
    ///
    /// CANCEL_BIT persists across vcpu exits/re-entries within a single `VirtualCPU::run()` call
    /// (e.g., during host function calls), but is cleared at the start of each new `VirtualCPU::run()` call.
    state: AtomicU8,

    partition_handle: windows::Win32::System::Hypervisor::WHV_PARTITION_HANDLE,
    dropped: AtomicBool,
}

#[cfg(target_os = "windows")]
impl WindowsInterruptHandle {
    const RUNNING_BIT: u8 = 1 << 1;
    const CANCEL_BIT: u8 = 1 << 0;
    #[cfg(gdb)]
    const DEBUG_INTERRUPT_BIT: u8 = 1 << 2;
}

#[cfg(target_os = "windows")]
impl InterruptHandleImpl for WindowsInterruptHandle {
    fn set_running(&self) {
        // Release ordering to ensure prior memory operations are visible when another thread observes running=true
        self.state.fetch_or(Self::RUNNING_BIT, Ordering::Release);
    }

    fn is_cancelled(&self) -> bool {
        // Acquire ordering to synchronize with the Release in kill()
        // This ensures we see the CANCEL_BIT set by the interrupt thread
        self.state.load(Ordering::Acquire) & Self::CANCEL_BIT != 0
    }

    fn clear_cancel(&self) {
        // Release ordering to ensure that any operations from the previous run()
        // are visible to other threads. While this is typically called by the vcpu thread
        // at the start of run(), the VM itself can move between threads across guest calls.
        self.state.fetch_and(!Self::CANCEL_BIT, Ordering::Release);
    }

    fn clear_running(&self) {
        // Release ordering to ensure all vcpu operations are visible before clearing running
        self.state.fetch_and(!Self::RUNNING_BIT, Ordering::Release);
    }

    fn is_debug_interrupted(&self) -> bool {
        #[cfg(gdb)]
        {
            self.state.load(Ordering::Acquire) & Self::DEBUG_INTERRUPT_BIT != 0
        }
        #[cfg(not(gdb))]
        {
            false
        }
    }

    #[cfg(gdb)]
    fn clear_debug_interrupt(&self) {
        self.state
            .fetch_and(!Self::DEBUG_INTERRUPT_BIT, Ordering::Release);
    }

    fn set_dropped(&self) {
        // Release ordering to ensure all VM cleanup operations are visible
        // to any thread that checks dropped() via Acquire
        self.dropped.store(true, Ordering::Release);
    }
}

#[cfg(target_os = "windows")]
impl InterruptHandle for WindowsInterruptHandle {
    fn kill(&self) -> bool {
        use windows::Win32::System::Hypervisor::WHvCancelRunVirtualProcessor;

        // Release ordering ensures that any writes before kill() are visible to the vcpu thread
        // when it checks is_cancelled() with Acquire ordering
        self.state.fetch_or(Self::CANCEL_BIT, Ordering::Release);

        // Acquire ordering to synchronize with the Release in set_running()
        // This ensures we see the running state set by the vcpu thread
        let state = self.state.load(Ordering::Acquire);
        if state & Self::RUNNING_BIT != 0 {
            unsafe { WHvCancelRunVirtualProcessor(self.partition_handle, 0, 0).is_ok() }
        } else {
            false
        }
    }

    #[cfg(gdb)]
    fn kill_from_debugger(&self) -> bool {
        use windows::Win32::System::Hypervisor::WHvCancelRunVirtualProcessor;

        self.state
            .fetch_or(Self::DEBUG_INTERRUPT_BIT, Ordering::Release);
        // Acquire ordering to synchronize with the Release in set_running()
        let state = self.state.load(Ordering::Acquire);
        if state & Self::RUNNING_BIT != 0 {
            unsafe { WHvCancelRunVirtualProcessor(self.partition_handle, 0, 0).is_ok() }
        } else {
            false
        }
    }

    fn dropped(&self) -> bool {
        // Acquire ordering to synchronize with the Release in set_dropped()
        // This ensures we see all VM cleanup operations that happened before drop
        self.dropped.load(Ordering::Acquire)
    }
}

#[cfg(all(test, any(target_os = "windows", kvm)))]
pub(crate) mod tests {
    use std::sync::{Arc, Mutex};

    use hyperlight_testing::dummy_guest_as_string;

    use crate::sandbox::uninitialized::GuestBinary;
    #[cfg(any(crashdump, gdb))]
    use crate::sandbox::uninitialized::SandboxRuntimeConfig;
    use crate::sandbox::uninitialized_evolve::set_up_hypervisor_partition;
    use crate::sandbox::{SandboxConfiguration, UninitializedSandbox};
    use crate::{Result, is_hypervisor_present, new_error};

    #[test]
    fn test_initialise() -> Result<()> {
        if !is_hypervisor_present() {
            return Ok(());
        }

        use crate::mem::ptr::RawPtr;
        use crate::sandbox::host_funcs::FunctionRegistry;

        let filename = dummy_guest_as_string().map_err(|e| new_error!("{}", e))?;

        let config: SandboxConfiguration = Default::default();
        #[cfg(any(crashdump, gdb))]
        let rt_cfg: SandboxRuntimeConfig = Default::default();
        let sandbox =
            UninitializedSandbox::new(GuestBinary::FilePath(filename.clone()), Some(config))?;
        let (mut mem_mgr, mut gshm) = sandbox.mgr.build();
        let mut vm = set_up_hypervisor_partition(
            &mut gshm,
            &config,
            #[cfg(any(crashdump, gdb))]
            &rt_cfg,
            sandbox.load_info,
        )?;

        // Set up required parameters for initialise
        let peb_addr = RawPtr::from(0x1000u64); // Dummy PEB address
        let seed = 12345u64; // Random seed
        let page_size = 4096u32; // Standard page size
        let host_funcs = Arc::new(Mutex::new(FunctionRegistry::default()));
        let guest_max_log_level = Some(log::LevelFilter::Error);

        #[cfg(gdb)]
        let dbg_mem_access_fn = Arc::new(Mutex::new(mem_mgr.clone()));

        // Test the initialise method
        vm.initialise(
            peb_addr,
            seed,
            page_size,
            &mut mem_mgr,
            &host_funcs,
            guest_max_log_level,
            #[cfg(gdb)]
            dbg_mem_access_fn,
        )?;

        Ok(())
    }
}