hyperlight_host/hypervisor/mod.rs

/*
Copyright 2025 The Hyperlight Authors.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/

use log::LevelFilter;
use tracing::{Span, instrument};

use crate::HyperlightError::StackOverflow;
use crate::error::HyperlightError::ExecutionCanceledByHost;
use crate::mem::memory_region::{MemoryRegion, MemoryRegionFlags};
use crate::metrics::METRIC_GUEST_CANCELLATION;
#[cfg(feature = "trace_guest")]
use crate::sandbox::TraceInfo;
use crate::{HyperlightError, Result, log_then_return};

/// Util for handling x87 FPU state
#[cfg(any(kvm, mshv, target_os = "windows"))]
pub mod fpu;

/// Hyper-V on Linux functionality
#[cfg(mshv)]
pub mod hyperv_linux;
#[cfg(target_os = "windows")]
/// Hyper-V on Windows functionality
pub(crate) mod hyperv_windows;

/// GDB debugging support
#[cfg(gdb)]
pub(crate) mod gdb;

#[cfg(kvm)]
/// Functionality to manipulate KVM-based virtual machines
pub mod kvm;
#[cfg(target_os = "windows")]
/// Hyperlight Surrogate Process
pub(crate) mod surrogate_process;
#[cfg(target_os = "windows")]
/// Hyperlight Surrogate Process Manager
pub(crate) mod surrogate_process_manager;
/// WindowsHypervisorPlatform utilities
#[cfg(target_os = "windows")]
pub(crate) mod windows_hypervisor_platform;
/// Safe wrappers around Windows types like `PSTR`
#[cfg(target_os = "windows")]
pub(crate) mod wrappers;

#[cfg(crashdump)]
pub(crate) mod crashdump;

use std::fmt::Debug;
use std::str::FromStr;
#[cfg(any(kvm, mshv))]
use std::sync::atomic::{AtomicBool, AtomicU64, Ordering};
use std::sync::{Arc, Mutex};
#[cfg(any(kvm, mshv))]
use std::time::Duration;

#[cfg(gdb)]
use gdb::VcpuStopReason;

use crate::mem::ptr::RawPtr;
use crate::mem::shared_mem::HostSharedMemory;
use crate::sandbox::host_funcs::FunctionRegistry;
use crate::sandbox::mem_mgr::MemMgrWrapper;

cfg_if::cfg_if! {
    if #[cfg(feature = "init-paging")] {
        pub(crate) const CR4_PAE: u64 = 1 << 5;
        pub(crate) const CR4_OSFXSR: u64 = 1 << 9;
        pub(crate) const CR4_OSXMMEXCPT: u64 = 1 << 10;
        pub(crate) const CR0_PE: u64 = 1;
        pub(crate) const CR0_MP: u64 = 1 << 1;
        pub(crate) const CR0_ET: u64 = 1 << 4;
        pub(crate) const CR0_NE: u64 = 1 << 5;
        pub(crate) const CR0_WP: u64 = 1 << 16;
        pub(crate) const CR0_AM: u64 = 1 << 18;
        pub(crate) const CR0_PG: u64 = 1 << 31;
        pub(crate) const EFER_LME: u64 = 1 << 8;
        pub(crate) const EFER_LMA: u64 = 1 << 10;
        pub(crate) const EFER_SCE: u64 = 1;
        pub(crate) const EFER_NX: u64 = 1 << 11;
    }
}
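
// Hypothetical helper (not used elsewhere in this module): a minimal sketch of how these
// control-register and EFER bits are typically combined when configuring a vCPU for 64-bit
// long mode with paging. The actual register setup lives in the per-backend code.
#[cfg(feature = "init-paging")]
#[allow(dead_code)]
fn example_long_mode_registers() -> (u64, u64, u64) {
    let cr0 = CR0_PE | CR0_MP | CR0_ET | CR0_NE | CR0_WP | CR0_AM | CR0_PG;
    let cr4 = CR4_PAE | CR4_OSFXSR | CR4_OSXMMEXCPT;
    // Some backends set LMA directly; the CPU also sets it once paging is enabled with LME.
    let efer = EFER_LME | EFER_LMA | EFER_SCE | EFER_NX;
    (cr0, cr4, efer)
}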

/// These are the generic exit reasons that we can handle from a hypervisor. The hypervisor's
/// `run` method is responsible for mapping its hypervisor-specific exit reasons to these
/// generic ones.
pub enum HyperlightExit {
    #[cfg(gdb)]
    /// The vCPU has exited due to a debug event
    Debug(VcpuStopReason),
    /// The vCPU has halted
    Halt(),
    /// The vCPU has issued a write to the given port with the given value
    IoOut(u16, Vec<u8>, u64, u64),
    /// The vCPU has attempted to read or write from an unmapped address
    Mmio(u64),
    /// The vCPU tried to access memory but was missing the required permissions
    AccessViolation(u64, MemoryRegionFlags, MemoryRegionFlags),
    /// The vCPU execution has been cancelled
    Cancelled(),
    /// The vCPU has exited for a reason that is not handled by Hyperlight
    Unknown(String),
    /// The operation should be retried; for example, this can happen on Linux where a call to run the vCPU can return EAGAIN
    Retry(),
}
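
// Hypothetical sketch (not the actual backend logic): a Linux backend's `run` translates raw
// errno values from the "run vcpu" call into these generic exit reasons, e.g. EAGAIN becomes
// `Retry` so `VirtualCPU::run` simply loops, while EINTR (a signal from the interrupt handle)
// is surfaced as `Cancelled`. Real backends also consult their interrupt bookkeeping.
#[cfg(any(kvm, mshv))]
#[allow(dead_code)]
fn example_map_run_errno(errno: i32) -> HyperlightExit {
    match errno {
        libc::EAGAIN => HyperlightExit::Retry(),
        libc::EINTR => HyperlightExit::Cancelled(),
        other => HyperlightExit::Unknown(format!("unexpected errno from vcpu run: {other}")),
    }
}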

/// Registers which may be useful for tracing/stack unwinding
#[cfg(feature = "trace_guest")]
pub enum TraceRegister {
    /// RAX
    RAX,
    /// RCX
    RCX,
    /// RIP
    RIP,
    /// RSP
    RSP,
    /// RBP
    RBP,
}

/// A common set of hypervisor functionality
pub(crate) trait Hypervisor: Debug + Send {
    /// Initialise the internally stored vCPU with the given PEB address and
    /// random number seed, then run it until a HLT instruction is reached.
    #[allow(clippy::too_many_arguments)]
    fn initialise(
        &mut self,
        peb_addr: RawPtr,
        seed: u64,
        page_size: u32,
        mem_mgr: MemMgrWrapper<HostSharedMemory>,
        host_funcs: Arc<Mutex<FunctionRegistry>>,
        guest_max_log_level: Option<LevelFilter>,
        #[cfg(gdb)] dbg_mem_access_fn: Arc<Mutex<MemMgrWrapper<HostSharedMemory>>>,
    ) -> Result<()>;

    /// Map a region of host memory into the sandbox.
    ///
    /// Depending on the host platform, there are likely alignment
    /// requirements of at least one page for base and len.
    unsafe fn map_region(&mut self, rgn: &MemoryRegion) -> Result<()>;

    /// Unmap a memory region from the sandbox
    unsafe fn unmap_region(&mut self, rgn: &MemoryRegion) -> Result<()>;

    /// Get the currently mapped dynamic memory regions (not including sandbox regions)
    ///
    /// Note: Box needed for trait to be object-safe :(
    fn get_mapped_regions(&self) -> Box<dyn ExactSizeIterator<Item = &MemoryRegion> + '_>;

    /// Dispatch a call from the host to the guest using the given pointer
    /// to the dispatch function _in the guest's address space_.
    ///
    /// Do this by setting the instruction pointer to `dispatch_func_addr`
    /// and then running the execution loop until a halt instruction.
    ///
    /// Returns `Ok` if the call succeeded, and an `Err` if it failed
    fn dispatch_call_from_host(
        &mut self,
        dispatch_func_addr: RawPtr,
        #[cfg(gdb)] dbg_mem_access_fn: Arc<Mutex<MemMgrWrapper<HostSharedMemory>>>,
    ) -> Result<()>;

    /// Handle an IO exit from the internally stored vCPU.
    fn handle_io(
        &mut self,
        port: u16,
        data: Vec<u8>,
        rip: u64,
        instruction_length: u64,
    ) -> Result<()>;

    /// Run the vCPU
    fn run(&mut self) -> Result<HyperlightExit>;

    /// Get an InterruptHandle to the underlying VM
    fn interrupt_handle(&self) -> Arc<dyn InterruptHandle>;

    /// Get the logging level to pass to the guest entrypoint
    fn get_max_log_level(&self) -> u32 {
        // Check whether the RUST_LOG environment variable is set and, if so,
        // parse it to get the log level for hyperlight_guest; if that is not
        // set, fall back to the log level for hyperlight_host.

        // This is done because the guest will produce logs based on the level
        // returned here; producing those logs is expensive, so we don't want
        // to do it if the host is not going to process them.

        let val = std::env::var("RUST_LOG").unwrap_or_default();

        let level = if val.contains("hyperlight_guest") {
            val.split(',')
                .find(|s| s.contains("hyperlight_guest"))
                .unwrap_or("")
                .split('=')
                .nth(1)
                .unwrap_or("")
        } else if val.contains("hyperlight_host") {
            val.split(',')
                .find(|s| s.contains("hyperlight_host"))
                .unwrap_or("")
                .split('=')
                .nth(1)
                .unwrap_or("")
        } else {
            // look for a value string that does not contain "="
            val.split(',').find(|s| !s.contains("=")).unwrap_or("")
        };

        log::info!("Determined guest log level: {}", level);
        // Convert the log level string to a LevelFilter
        // If no value is found, default to Error
        LevelFilter::from_str(level).unwrap_or(LevelFilter::Error) as u32
    }
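    // Worked example for `get_max_log_level` (illustrative): with
    // RUST_LOG="hyperlight_guest=debug,hyperlight_host=info" the first branch above picks the
    // "hyperlight_guest" entry and yields "debug", so the guest receives `LevelFilter::Debug`
    // as a u32; with RUST_LOG="warn" (no "=" present) the fallback branch yields "warn"; an
    // unparsable or empty value falls back to `LevelFilter::Error`.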

    /// Get a mutable trait object from self
    fn as_mut_hypervisor(&mut self) -> &mut dyn Hypervisor;

    #[cfg(crashdump)]
    fn crashdump_context(&self) -> Result<Option<crashdump::CrashDumpContext>>;

    #[cfg(gdb)]
    /// Handles the case when the vCPU stops due to a debug event
    fn handle_debug(
        &mut self,
        _dbg_mem_access_fn: Arc<Mutex<MemMgrWrapper<HostSharedMemory>>>,
        _stop_reason: VcpuStopReason,
    ) -> Result<()> {
        unimplemented!()
    }

    /// Check the stack guard to see if the stack is still valid
    fn check_stack_guard(&self) -> Result<bool>;

    /// Read a register for trace/unwind purposes
    #[cfg(feature = "trace_guest")]
    fn read_trace_reg(&self, reg: TraceRegister) -> Result<u64>;

    /// Get a reference to the trace info for the guest
    #[cfg(feature = "trace_guest")]
    fn trace_info_as_ref(&self) -> &TraceInfo;
    /// Get a mutable reference to the trace info for the guest
    #[cfg(feature = "trace_guest")]
    fn trace_info_as_mut(&mut self) -> &mut TraceInfo;
}

/// Returns `Some(HyperlightExit::AccessViolation(..))` if the region containing the given
/// gpa does not permit the requested access (or is a stack-guard region). Returns `None`
/// if the access is permitted or if no region contains the gpa.
pub(crate) fn get_memory_access_violation<'a>(
    gpa: usize,
    mut mem_regions: impl Iterator<Item = &'a MemoryRegion>,
    access_info: MemoryRegionFlags,
) -> Option<HyperlightExit> {
    // find the region containing the given gpa
    let region = mem_regions.find(|region| region.guest_region.contains(&gpa));

    if let Some(region) = region {
        if !region.flags.contains(access_info)
            || region.flags.contains(MemoryRegionFlags::STACK_GUARD)
        {
            return Some(HyperlightExit::AccessViolation(
                gpa as u64,
                access_info,
                region.flags,
            ));
        }
    }
    None
}
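
// Worked example (illustrative): a write (`access_info = WRITE`) to a gpa that lies inside a
// read-only region yields `AccessViolation(gpa, WRITE, READ)`, while the same access to a gpa
// outside every known region yields `None`, which callers typically surface as an MMIO exit.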

/// A virtual CPU that can be run until an exit occurs
pub struct VirtualCPU {}

impl VirtualCPU {
    /// Run the given hypervisor until a halt instruction is reached
    #[instrument(err(Debug), skip_all, parent = Span::current(), level = "Trace")]
    pub(crate) fn run(
        hv: &mut dyn Hypervisor,
        #[cfg(gdb)] dbg_mem_access_fn: Arc<Mutex<MemMgrWrapper<HostSharedMemory>>>,
    ) -> Result<()> {
        loop {
            match hv.run() {
                #[cfg(gdb)]
                Ok(HyperlightExit::Debug(stop_reason)) => {
                    if let Err(e) = hv.handle_debug(dbg_mem_access_fn.clone(), stop_reason) {
                        log_then_return!(e);
                    }
                }

                Ok(HyperlightExit::Halt()) => {
                    break;
                }
                Ok(HyperlightExit::IoOut(port, data, rip, instruction_length)) => {
                    hv.handle_io(port, data, rip, instruction_length)?
                }
                Ok(HyperlightExit::Mmio(addr)) => {
                    #[cfg(crashdump)]
                    crashdump::generate_crashdump(hv)?;

                    if !hv.check_stack_guard()? {
                        log_then_return!(StackOverflow());
                    }

                    log_then_return!("MMIO access address {:#x}", addr);
                }
                Ok(HyperlightExit::AccessViolation(addr, tried, region_permission)) => {
                    #[cfg(crashdump)]
                    crashdump::generate_crashdump(hv)?;

                    // If GDB is enabled, we handle the debug memory access
                    // Disregard the return value as we want to return the error
                    #[cfg(gdb)]
                    let _ = hv.handle_debug(dbg_mem_access_fn.clone(), VcpuStopReason::Crash);

                    if region_permission.intersects(MemoryRegionFlags::STACK_GUARD) {
                        return Err(HyperlightError::StackOverflow());
                    }
                    log_then_return!(HyperlightError::MemoryAccessViolation(
                        addr,
                        tried,
                        region_permission
                    ));
                }
                Ok(HyperlightExit::Cancelled()) => {
                    // Cancelled is returned when the host has cancelled execution
                    // After termination, the main thread will re-initialize the VM
                    metrics::counter!(METRIC_GUEST_CANCELLATION).increment(1);
                    log_then_return!(ExecutionCanceledByHost());
                }
                Ok(HyperlightExit::Unknown(reason)) => {
                    #[cfg(crashdump)]
                    crashdump::generate_crashdump(hv)?;
                    // If GDB is enabled, we handle the debug memory access
                    // Disregard the return value as we want to return the error
                    #[cfg(gdb)]
                    let _ = hv.handle_debug(dbg_mem_access_fn.clone(), VcpuStopReason::Crash);

                    log_then_return!("Unexpected VM Exit {:?}", reason);
                }
                Ok(HyperlightExit::Retry()) => continue,
                Err(e) => {
                    #[cfg(crashdump)]
                    crashdump::generate_crashdump(hv)?;
                    // If GDB is enabled, we handle the debug memory access
                    // Disregard the return value as we want to return the error
                    #[cfg(gdb)]
                    let _ = hv.handle_debug(dbg_mem_access_fn.clone(), VcpuStopReason::Crash);

                    return Err(e);
                }
            }
        }

        Ok(())
    }
}

/// A trait for handling interrupts to a sandbox's vcpu
pub trait InterruptHandle: Debug + Send + Sync {
    /// Interrupt the corresponding sandbox's vcpu, stopping it from running.
    ///
    /// - If this is called while the vcpu is running, it will interrupt the vcpu and return `true`.
    /// - If this is called while the vcpu is not running (for example during a host call), the
    ///   vcpu will not be interrupted immediately, but it will be prevented from running **the next
    ///   time** it's scheduled, and `false` is returned.
    ///
    /// # Note
    /// This function blocks for as long as it takes for the vcpu thread to be interrupted.
    fn kill(&self) -> bool;

    /// Used by a debugger to interrupt the corresponding sandbox's vcpu, stopping it from running.
    ///
    /// - If this is called while the vcpu is running, it will interrupt the vcpu and return `true`.
    /// - If this is called while the vcpu is not running (for example during a host call), the
    ///   vcpu will not be interrupted immediately, but it will be prevented from running **the next
    ///   time** it's scheduled, and `false` is returned.
    ///
    /// # Note
    /// This function blocks for as long as it takes for the vcpu thread to be interrupted.
    #[cfg(gdb)]
    fn kill_from_debugger(&self) -> bool;

    /// Returns true if the corresponding sandbox has been dropped
    fn dropped(&self) -> bool;
}
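
// Hypothetical usage sketch (this helper is not part of the crate): an embedder holds the
// `InterruptHandle` returned by `Hypervisor::interrupt_handle()` and cancels a long-running
// guest call from another thread.
#[allow(dead_code)]
fn example_cancel_guest_after(handle: Arc<dyn InterruptHandle>, timeout: std::time::Duration) {
    std::thread::spawn(move || {
        std::thread::sleep(timeout);
        // `true` means the vcpu was running and was interrupted; `false` means it was not
        // running and will instead be stopped before its next entry into the guest.
        let _interrupted = handle.kill();
    });
}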

#[cfg(any(kvm, mshv))]
#[derive(Debug)]
pub(super) struct LinuxInterruptHandle {
    /// Invariant: vcpu is running => the most significant bit (63) of `running` is set. (Neither converse nor inverse is true)
    ///
    /// Additionally, bits 0-62 track how many times the VCPU has been run. Incremented each time `run()` is called.
    ///
    /// This prevents an ABA problem where:
    /// 1. The VCPU is running (generation N),
    /// 2. It gets cancelled,
    /// 3. Then quickly restarted (generation N+1),
    ///    before the original thread has observed that it was cancelled.
    ///
    /// Without this generation counter, the interrupt logic might assume the VCPU is still
    /// in the *original* run (generation N), see that it's `running`, and re-send the signal.
    /// But the new VCPU run (generation N+1) would treat this as a stale signal and ignore it,
    /// potentially causing an infinite loop where no effective interrupt is delivered.
    ///
    /// Invariant: If the VCPU is running, `run_generation[bit 0-62]` matches the current run's generation.
    running: AtomicU64,
    /// Invariant: vcpu is running => `tid` is the thread on which it is running.
    /// Note: multiple vms may have the same `tid`, but at most one vm will have `running` set to true.
    tid: AtomicU64,
    /// True when an "interruptor" has requested the VM to be cancelled. Set immediately when
    /// `kill()` is called, and cleared when the vcpu is no longer running.
    /// This is used to
    /// 1. make sure stale signals do not interrupt the wrong vcpu
    ///    (a vcpu may only be interrupted if `cancel_requested` is true), and
    /// 2. ensure that if a vm is killed while a host call is running,
    ///    the vm will not re-enter the guest after the host call returns.
    cancel_requested: AtomicBool,
    /// True when the debugger has requested the VM to be interrupted. Set immediately when
    /// `kill_from_debugger()` is called, and cleared when the vcpu is no longer running.
    /// This is used to make sure stale signals do not interrupt the wrong vcpu
    /// (a vcpu may only be interrupted by a debugger if `debug_interrupt` is true).
    #[cfg(gdb)]
    debug_interrupt: AtomicBool,
    /// Whether the corresponding vm has been dropped
    dropped: AtomicBool,
    /// Retry delay between signals sent to the vcpu thread
    retry_delay: Duration,
    /// The offset from SIGRTMIN of the signal used to interrupt the vcpu thread
    sig_rt_min_offset: u8,
}
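
// Bit layout of `running` (illustrative): bit 63 is the RUNNING flag, set while the vcpu is
// inside `run()`; bits 0-62 hold the run generation, incremented on every `run()` call and
// wrapping back to 0 once it reaches `MAX_GENERATION`. For example, a raw value of
// `(1 << 63) | 5` means "running, generation 5".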

#[cfg(any(kvm, mshv))]
impl LinuxInterruptHandle {
    const RUNNING_BIT: u64 = 1 << 63;
    const MAX_GENERATION: u64 = Self::RUNNING_BIT - 1;

    // set running to true and increment the generation. Generation will wrap around at `MAX_GENERATION`.
    fn set_running_and_increment_generation(&self) -> std::result::Result<u64, u64> {
        self.running
            .fetch_update(Ordering::Relaxed, Ordering::Relaxed, |raw| {
                let generation = raw & !Self::RUNNING_BIT;
                if generation == Self::MAX_GENERATION {
                    // restart generation from 0
                    return Some(Self::RUNNING_BIT);
                }
                Some((generation + 1) | Self::RUNNING_BIT)
            })
    }

    // clear the running bit, returning the previous raw value (running bit | generation)
    fn clear_running_bit(&self) -> u64 {
        self.running
            .fetch_and(!Self::RUNNING_BIT, Ordering::Relaxed)
    }

    fn get_running_and_generation(&self) -> (bool, u64) {
        let raw = self.running.load(Ordering::Relaxed);
        let running = raw & Self::RUNNING_BIT != 0;
        let generation = raw & !Self::RUNNING_BIT;
        (running, generation)
    }

    fn send_signal(&self) -> bool {
        let signal_number = libc::SIGRTMIN() + self.sig_rt_min_offset as libc::c_int;
        let mut sent_signal = false;
        let mut target_generation: Option<u64> = None;

        loop {
            let (running, generation) = self.get_running_and_generation();

            if !running {
                break;
            }

            match target_generation {
                None => target_generation = Some(generation),
                // prevent ABA problem
                Some(expected) if expected != generation => break,
                _ => {}
            }

            log::info!("Sending signal to kill vcpu thread...");
            sent_signal = true;
            unsafe {
                libc::pthread_kill(self.tid.load(Ordering::Relaxed) as _, signal_number);
            }
            std::thread::sleep(self.retry_delay);
        }

        sent_signal
    }
}

#[cfg(any(kvm, mshv))]
impl InterruptHandle for LinuxInterruptHandle {
    fn kill(&self) -> bool {
        self.cancel_requested.store(true, Ordering::Relaxed);

        self.send_signal()
    }
    #[cfg(gdb)]
    fn kill_from_debugger(&self) -> bool {
        self.debug_interrupt.store(true, Ordering::Relaxed);
        self.send_signal()
    }
    fn dropped(&self) -> bool {
        self.dropped.load(Ordering::Relaxed)
    }
}

#[cfg(all(test, any(target_os = "windows", kvm)))]
pub(crate) mod tests {
    use std::sync::{Arc, Mutex};

    use hyperlight_testing::dummy_guest_as_string;

    use crate::sandbox::uninitialized::GuestBinary;
    #[cfg(any(crashdump, gdb))]
    use crate::sandbox::uninitialized::SandboxRuntimeConfig;
    use crate::sandbox::uninitialized_evolve::set_up_hypervisor_partition;
    use crate::sandbox::{SandboxConfiguration, UninitializedSandbox};
    use crate::{Result, is_hypervisor_present, new_error};

    #[test]
    fn test_initialise() -> Result<()> {
        if !is_hypervisor_present() {
            return Ok(());
        }

        use crate::mem::ptr::RawPtr;
        use crate::sandbox::host_funcs::FunctionRegistry;

        let filename = dummy_guest_as_string().map_err(|e| new_error!("{}", e))?;

        let config: SandboxConfiguration = Default::default();
        #[cfg(any(crashdump, gdb))]
        let rt_cfg: SandboxRuntimeConfig = Default::default();
        let sandbox =
            UninitializedSandbox::new(GuestBinary::FilePath(filename.clone()), Some(config))?;
        let (mem_mgr, mut gshm) = sandbox.mgr.build();
        let mut vm = set_up_hypervisor_partition(
            &mut gshm,
            &config,
            #[cfg(any(crashdump, gdb))]
            &rt_cfg,
            sandbox.load_info,
        )?;

        // Set up required parameters for initialise
        let peb_addr = RawPtr::from(0x1000u64); // Dummy PEB address
        let seed = 12345u64; // Random seed
        let page_size = 4096u32; // Standard page size
        let host_funcs = Arc::new(Mutex::new(FunctionRegistry::default()));
        let guest_max_log_level = Some(log::LevelFilter::Error);

        #[cfg(gdb)]
        let dbg_mem_access_fn = Arc::new(Mutex::new(mem_mgr.clone()));

        // Test the initialise method
        vm.initialise(
            peb_addr,
            seed,
            page_size,
            mem_mgr,
            host_funcs,
            guest_max_log_level,
            #[cfg(gdb)]
            dbg_mem_access_fn,
        )?;

        Ok(())
    }
}