hyperlight_host/hypervisor/
mod.rs

1/*
2Copyright 2025  The Hyperlight Authors.
3
4Licensed under the Apache License, Version 2.0 (the "License");
5you may not use this file except in compliance with the License.
6You may obtain a copy of the License at
7
8    http://www.apache.org/licenses/LICENSE-2.0
9
10Unless required by applicable law or agreed to in writing, software
11distributed under the License is distributed on an "AS IS" BASIS,
12WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13See the License for the specific language governing permissions and
14limitations under the License.
15*/
16
17use log::LevelFilter;
18use tracing::{Span, instrument};
19
20use crate::error::HyperlightError::ExecutionCanceledByHost;
21use crate::mem::memory_region::{MemoryRegion, MemoryRegionFlags};
22use crate::metrics::METRIC_GUEST_CANCELLATION;
23#[cfg(feature = "trace_guest")]
24use crate::sandbox::TraceInfo;
25use crate::{HyperlightError, Result, log_then_return};
26
27/// Util for handling x87 fpu state
28#[cfg(any(kvm, mshv, target_os = "windows"))]
29pub mod fpu;
30/// Handlers for Hypervisor custom logic
31#[cfg(gdb)]
32pub mod handlers;
33/// HyperV-on-linux functionality
34#[cfg(mshv)]
35pub mod hyperv_linux;
36#[cfg(target_os = "windows")]
37/// Hyperv-on-windows functionality
38pub(crate) mod hyperv_windows;
39
40/// GDB debugging support
41#[cfg(gdb)]
42pub(crate) mod gdb;
43
44#[cfg(kvm)]
45/// Functionality to manipulate KVM-based virtual machines
46pub mod kvm;
47#[cfg(target_os = "windows")]
48/// Hyperlight Surrogate Process
49pub(crate) mod surrogate_process;
50#[cfg(target_os = "windows")]
/// Hyperlight Surrogate Process Manager
52pub(crate) mod surrogate_process_manager;
53/// WindowsHypervisorPlatform utilities
54#[cfg(target_os = "windows")]
55pub(crate) mod windows_hypervisor_platform;
56/// Safe wrappers around windows types like `PSTR`
57#[cfg(target_os = "windows")]
58pub(crate) mod wrappers;
59
60#[cfg(crashdump)]
61pub(crate) mod crashdump;
62
63use std::fmt::Debug;
64use std::str::FromStr;
65#[cfg(any(kvm, mshv))]
66use std::sync::atomic::{AtomicBool, AtomicU64, Ordering};
67use std::sync::{Arc, Mutex};
68#[cfg(any(kvm, mshv))]
69use std::time::Duration;
70
71#[cfg(gdb)]
72use gdb::VcpuStopReason;
73
74#[cfg(gdb)]
75use self::handlers::{DbgMemAccessHandlerCaller, DbgMemAccessHandlerWrapper};
76use crate::mem::ptr::RawPtr;
77use crate::mem::shared_mem::HostSharedMemory;
78use crate::sandbox::host_funcs::FunctionRegistry;
79use crate::sandbox::mem_access::handle_mem_access;
80use crate::sandbox::mem_mgr::MemMgrWrapper;
81
82cfg_if::cfg_if! {
83    if #[cfg(feature = "init-paging")] {
84        pub(crate) const CR4_PAE: u64 = 1 << 5;
85        pub(crate) const CR4_OSFXSR: u64 = 1 << 9;
86        pub(crate) const CR4_OSXMMEXCPT: u64 = 1 << 10;
87        pub(crate) const CR0_PE: u64 = 1;
88        pub(crate) const CR0_MP: u64 = 1 << 1;
89        pub(crate) const CR0_ET: u64 = 1 << 4;
90        pub(crate) const CR0_NE: u64 = 1 << 5;
91        pub(crate) const CR0_WP: u64 = 1 << 16;
92        pub(crate) const CR0_AM: u64 = 1 << 18;
93        pub(crate) const CR0_PG: u64 = 1 << 31;
94        pub(crate) const EFER_LME: u64 = 1 << 8;
95        pub(crate) const EFER_LMA: u64 = 1 << 10;
96        pub(crate) const EFER_SCE: u64 = 1;
97        pub(crate) const EFER_NX: u64 = 1 << 11;
98    }
99}
100
/// The generic exit reasons that Hyperlight can handle from a hypervisor.
///
/// Each hypervisor's `run` method is responsible for mapping its own
/// hypervisor-specific exit reasons to these generic ones.
pub enum HyperlightExit {
    #[cfg(gdb)]
    /// The vCPU has exited due to a debug event
    Debug(VcpuStopReason),
    /// The vCPU has halted
    Halt(),
    /// The vCPU has issued a write to the given port with the given value.
    ///
    /// Fields, in order: port, data written, RIP, instruction length
    /// (this is how `VirtualCPU::run` destructures the variant before
    /// forwarding it to `Hypervisor::handle_io`).
    IoOut(u16, Vec<u8>, u64, u64),
    /// The vCPU has attempted to read or write from an unmapped address
    Mmio(u64),
    /// The vCPU tried to access memory but was missing the required permissions.
    ///
    /// Fields, in order: the accessed address, the access that was attempted,
    /// and the flags of the memory region containing the address.
    AccessViolation(u64, MemoryRegionFlags, MemoryRegionFlags),
    /// The vCPU execution has been cancelled
    Cancelled(),
    /// The vCPU has exited for a reason that is not handled by Hyperlight.
    /// The string describes the reason.
    Unknown(String),
    /// The operation should be retried, for example this can happen on Linux where a call to run the CPU can return EAGAIN
    Retry(),
}
122
/// Registers which may be useful for tracing/stack unwinding.
///
/// Passed to `Hypervisor::read_trace_reg` to select which register to read.
#[cfg(feature = "trace_guest")]
pub enum TraceRegister {
    /// The RAX register
    RAX,
    /// The RCX register
    RCX,
    /// The RIP register (instruction pointer)
    RIP,
    /// The RSP register (stack pointer)
    RSP,
    /// The RBP register (frame/base pointer)
    RBP,
}
137
138/// A common set of hypervisor functionality
139pub(crate) trait Hypervisor: Debug + Send {
140    /// Initialise the internally stored vCPU with the given PEB address and
141    /// random number seed, then run it until a HLT instruction.
142    #[allow(clippy::too_many_arguments)]
143    fn initialise(
144        &mut self,
145        peb_addr: RawPtr,
146        seed: u64,
147        page_size: u32,
148        mem_mgr: MemMgrWrapper<HostSharedMemory>,
149        host_funcs: Arc<Mutex<FunctionRegistry>>,
150        guest_max_log_level: Option<LevelFilter>,
151        #[cfg(gdb)] dbg_mem_access_fn: DbgMemAccessHandlerWrapper,
152    ) -> Result<()>;
153
154    /// Map a region of host memory into the sandbox.
155    ///
156    /// Depending on the host platform, there are likely alignment
157    /// requirements of at least one page for base and len.
158    unsafe fn map_region(&mut self, rgn: &MemoryRegion) -> Result<()>;
159
160    /// Unmap a memory region from the sandbox
161    unsafe fn unmap_region(&mut self, rgn: &MemoryRegion) -> Result<()>;
162
163    /// Get the currently mapped dynamic memory regions (not including sandbox regions)
164    ///
165    /// Note: Box needed for trait to be object-safe :(
166    fn get_mapped_regions(&self) -> Box<dyn ExactSizeIterator<Item = &MemoryRegion> + '_>;
167
168    /// Dispatch a call from the host to the guest using the given pointer
169    /// to the dispatch function _in the guest's address space_.
170    ///
171    /// Do this by setting the instruction pointer to `dispatch_func_addr`
172    /// and then running the execution loop until a halt instruction.
173    ///
174    /// Returns `Ok` if the call succeeded, and an `Err` if it failed
175    fn dispatch_call_from_host(
176        &mut self,
177        dispatch_func_addr: RawPtr,
178        #[cfg(gdb)] dbg_mem_access_fn: DbgMemAccessHandlerWrapper,
179    ) -> Result<()>;
180
181    /// Handle an IO exit from the internally stored vCPU.
182    fn handle_io(
183        &mut self,
184        port: u16,
185        data: Vec<u8>,
186        rip: u64,
187        instruction_length: u64,
188    ) -> Result<()>;
189
190    /// Run the vCPU
191    fn run(&mut self) -> Result<HyperlightExit>;
192
193    /// Get InterruptHandle to underlying VM
194    fn interrupt_handle(&self) -> Arc<dyn InterruptHandle>;
195
196    /// Get the logging level to pass to the guest entrypoint
197    fn get_max_log_level(&self) -> u32 {
198        // Check to see if the RUST_LOG environment variable is set
199        // and if so, parse it to get the log_level for hyperlight_guest
200        // if that is not set get the log level for the hyperlight_host
201
202        // This is done as the guest will produce logs based on the log level returned here
203        // producing those logs is expensive and we don't want to do it if the host is not
204        // going to process them
205
206        let val = std::env::var("RUST_LOG").unwrap_or_default();
207
208        let level = if val.contains("hyperlight_guest") {
209            val.split(',')
210                .find(|s| s.contains("hyperlight_guest"))
211                .unwrap_or("")
212                .split('=')
213                .nth(1)
214                .unwrap_or("")
215        } else if val.contains("hyperlight_host") {
216            val.split(',')
217                .find(|s| s.contains("hyperlight_host"))
218                .unwrap_or("")
219                .split('=')
220                .nth(1)
221                .unwrap_or("")
222        } else {
223            // look for a value string that does not contain "="
224            val.split(',').find(|s| !s.contains("=")).unwrap_or("")
225        };
226
227        log::info!("Determined guest log level: {}", level);
228        // Convert the log level string to a LevelFilter
229        // If no value is found, default to Error
230        LevelFilter::from_str(level).unwrap_or(LevelFilter::Error) as u32
231    }
232
233    /// get a mutable trait object from self
234    fn as_mut_hypervisor(&mut self) -> &mut dyn Hypervisor;
235
236    #[cfg(crashdump)]
237    fn crashdump_context(&self) -> Result<Option<crashdump::CrashDumpContext>>;
238
239    #[cfg(gdb)]
240    /// handles the cases when the vCPU stops due to a Debug event
241    fn handle_debug(
242        &mut self,
243        _dbg_mem_access_fn: Arc<Mutex<dyn DbgMemAccessHandlerCaller>>,
244        _stop_reason: VcpuStopReason,
245    ) -> Result<()> {
246        unimplemented!()
247    }
248
249    /// Check stack guard to see if the stack is still valid
250    fn check_stack_guard(&self) -> Result<bool>;
251
252    /// Read a register for trace/unwind purposes
253    #[cfg(feature = "trace_guest")]
254    fn read_trace_reg(&self, reg: TraceRegister) -> Result<u64>;
255
256    /// Get a reference of the trace info for the guest
257    #[cfg(feature = "trace_guest")]
258    fn trace_info_as_ref(&self) -> &TraceInfo;
259    /// Get a mutable reference of the trace info for the guest
260    #[cfg(feature = "trace_guest")]
261    fn trace_info_as_mut(&mut self) -> &mut TraceInfo;
262}
263
264/// Returns a Some(HyperlightExit::AccessViolation(..)) if the given gpa doesn't have
265/// access its corresponding region. Returns None otherwise, or if the region is not found.
266pub(crate) fn get_memory_access_violation<'a>(
267    gpa: usize,
268    mut mem_regions: impl Iterator<Item = &'a MemoryRegion>,
269    access_info: MemoryRegionFlags,
270) -> Option<HyperlightExit> {
271    // find the region containing the given gpa
272    let region = mem_regions.find(|region| region.guest_region.contains(&gpa));
273
274    if let Some(region) = region {
275        if !region.flags.contains(access_info)
276            || region.flags.contains(MemoryRegionFlags::STACK_GUARD)
277        {
278            return Some(HyperlightExit::AccessViolation(
279                gpa as u64,
280                access_info,
281                region.flags,
282            ));
283        }
284    }
285    None
286}
287
288/// A virtual CPU that can be run until an exit occurs
289pub struct VirtualCPU {}
290
291impl VirtualCPU {
292    /// Run the given hypervisor until a halt instruction is reached
293    #[instrument(err(Debug), skip_all, parent = Span::current(), level = "Trace")]
294    pub(crate) fn run(
295        hv: &mut dyn Hypervisor,
296        #[cfg(gdb)] dbg_mem_access_fn: Arc<Mutex<dyn DbgMemAccessHandlerCaller>>,
297    ) -> Result<()> {
298        loop {
299            match hv.run() {
300                #[cfg(gdb)]
301                Ok(HyperlightExit::Debug(stop_reason)) => {
302                    if let Err(e) = hv.handle_debug(dbg_mem_access_fn.clone(), stop_reason) {
303                        log_then_return!(e);
304                    }
305                }
306
307                Ok(HyperlightExit::Halt()) => {
308                    break;
309                }
310                Ok(HyperlightExit::IoOut(port, data, rip, instruction_length)) => {
311                    hv.handle_io(port, data, rip, instruction_length)?
312                }
313                Ok(HyperlightExit::Mmio(addr)) => {
314                    #[cfg(crashdump)]
315                    crashdump::generate_crashdump(hv)?;
316
317                    handle_mem_access(hv)?;
318
319                    log_then_return!("MMIO access address {:#x}", addr);
320                }
321                Ok(HyperlightExit::AccessViolation(addr, tried, region_permission)) => {
322                    #[cfg(crashdump)]
323                    crashdump::generate_crashdump(hv)?;
324
325                    // If GDB is enabled, we handle the debug memory access
326                    // Disregard return value as we want to return the error
327                    #[cfg(gdb)]
328                    let _ = hv.handle_debug(dbg_mem_access_fn.clone(), VcpuStopReason::Crash);
329
330                    if region_permission.intersects(MemoryRegionFlags::STACK_GUARD) {
331                        return Err(HyperlightError::StackOverflow());
332                    }
333                    log_then_return!(HyperlightError::MemoryAccessViolation(
334                        addr,
335                        tried,
336                        region_permission
337                    ));
338                }
339                Ok(HyperlightExit::Cancelled()) => {
340                    // Shutdown is returned when the host has cancelled execution
341                    // After termination, the main thread will re-initialize the VM
342                    metrics::counter!(METRIC_GUEST_CANCELLATION).increment(1);
343                    log_then_return!(ExecutionCanceledByHost());
344                }
345                Ok(HyperlightExit::Unknown(reason)) => {
346                    #[cfg(crashdump)]
347                    crashdump::generate_crashdump(hv)?;
348                    // If GDB is enabled, we handle the debug memory access
349                    // Disregard return value as we want to return the error
350                    #[cfg(gdb)]
351                    let _ = hv.handle_debug(dbg_mem_access_fn.clone(), VcpuStopReason::Crash);
352
353                    log_then_return!("Unexpected VM Exit {:?}", reason);
354                }
355                Ok(HyperlightExit::Retry()) => continue,
356                Err(e) => {
357                    #[cfg(crashdump)]
358                    crashdump::generate_crashdump(hv)?;
359                    // If GDB is enabled, we handle the debug memory access
360                    // Disregard return value as we want to return the error
361                    #[cfg(gdb)]
362                    let _ = hv.handle_debug(dbg_mem_access_fn.clone(), VcpuStopReason::Crash);
363
364                    return Err(e);
365                }
366            }
367        }
368
369        Ok(())
370    }
371}
372
/// A trait for handling interrupts to a sandbox's vcpu
pub trait InterruptHandle: Debug + Send + Sync {
    /// Interrupt the corresponding sandbox from running.
    ///
    /// - If this is called while the vcpu is running, then it will interrupt the vcpu and return `true`.
    /// - If this is called while the vcpu is not running (for example during a host call), the
    ///   vcpu will not immediately be interrupted, but it will be prevented from running **the next time**
    ///   it's scheduled, and this returns `false`.
    ///
    /// # Note
    /// This function will block for the duration of the time it takes for the vcpu thread to be interrupted.
    fn kill(&self) -> bool;

    /// Used by a debugger to interrupt the corresponding sandbox from running.
    ///
    /// - If this is called while the vcpu is running, then it will interrupt the vcpu and return `true`.
    /// - If this is called while the vcpu is not running (for example during a host call), the
    ///   vcpu will not immediately be interrupted, but it will be prevented from running **the next time**
    ///   it's scheduled, and this returns `false`.
    ///
    /// # Note
    /// This function will block for the duration of the time it takes for the vcpu thread to be interrupted.
    #[cfg(gdb)]
    fn kill_from_debugger(&self) -> bool;

    /// Returns true if the corresponding sandbox has been dropped
    fn dropped(&self) -> bool;
}
401
/// Interrupt handle for the Linux hypervisor backends (compiled when `kvm`
/// or `mshv` is enabled); interrupts the vcpu by signalling its thread.
#[cfg(any(kvm, mshv))]
#[derive(Debug)]
pub(super) struct LinuxInterruptHandle {
    /// Invariant: vcpu is running => most significant bit (63) of `running` is set. (Neither converse nor inverse is true)
    ///
    /// Additionally, bits 0-62 track how many times the VCPU has been run. Incremented each time `run()` is called.
    ///
    /// This prevents an ABA problem where:
    /// 1. The VCPU is running (generation N),
    /// 2. It gets cancelled,
    /// 3. Then quickly restarted (generation N+1),
    ///    before the original thread has observed that it was cancelled.
    ///
    /// Without this generation counter, the interrupt logic might assume the VCPU is still
    /// in the *original* run (generation N), see that it's `running`, and re-send the signal.
    /// But the new VCPU run (generation N+1) would treat this as a stale signal and ignore it,
    /// potentially causing an infinite loop where no effective interrupt is delivered.
    ///
    /// Invariant: If the VCPU is running, bits 0-62 of `running` match the current run's generation.
    running: AtomicU64,
    /// Invariant: vcpu is running => `tid` is the thread on which it is running.
    /// Note: multiple vms may have the same `tid`, but at most one vm will have `running` set to true.
    tid: AtomicU64,
    /// True when an "interruptor" has requested the VM to be cancelled. Set immediately when
    /// `kill()` is called, and cleared when the vcpu is no longer running.
    /// This is used to
    /// 1. make sure stale signals do not interrupt the wrong vcpu
    ///    (a vcpu may only be interrupted iff `cancel_requested` is true),
    /// 2. ensure that if a vm is killed while a host call is running,
    ///    the vm will not re-enter the guest after the host call returns.
    cancel_requested: AtomicBool,
    /// True when the debugger has requested the VM to be interrupted. Set immediately when
    /// `kill_from_debugger()` is called, and cleared when the vcpu is no longer running.
    /// This is used to make sure stale signals do not interrupt the wrong vcpu
    /// (a vcpu may only be interrupted by a debugger if `debug_interrupt` is true).
    #[cfg(gdb)]
    debug_interrupt: AtomicBool,
    /// Whether the corresponding vm is dropped
    dropped: AtomicBool,
    /// Retry delay between signals sent to the vcpu thread
    retry_delay: Duration,
    /// The offset of the SIGRTMIN signal used to interrupt the vcpu thread
    sig_rt_min_offset: u8,
}
446
#[cfg(any(kvm, mshv))]
impl LinuxInterruptHandle {
    /// Bit 63 of `running`: the "vcpu is running" flag.
    const RUNNING_BIT: u64 = 1 << 63;
    /// Largest generation that fits in bits 0-62 of `running`.
    const MAX_GENERATION: u64 = Self::RUNNING_BIT - 1;

    /// Sets the running bit and advances the generation by one, wrapping back
    /// to 0 once `MAX_GENERATION` has been reached.
    ///
    /// The update closure always produces a new value, so `fetch_update`
    /// yields `Ok` carrying the value `running` held before the update.
    fn set_running_and_increment_generation(&self) -> std::result::Result<u64, u64> {
        self.running
            .fetch_update(Ordering::Relaxed, Ordering::Relaxed, |raw| {
                let current = raw & !Self::RUNNING_BIT;
                let next = if current == Self::MAX_GENERATION {
                    // restart generation from 0
                    0
                } else {
                    current + 1
                };
                Some(next | Self::RUNNING_BIT)
            })
    }

    /// Clears the running bit while leaving the generation bits untouched.
    ///
    /// Returns the value `running` held *before* the update, so the running
    /// bit is still present in the result if it was set; mask with
    /// `!RUNNING_BIT` to obtain just the generation.
    fn clear_running_bit(&self) -> u64 {
        let keep_generation = !Self::RUNNING_BIT;
        self.running.fetch_and(keep_generation, Ordering::Relaxed)
    }

    /// Loads `running` once and splits it into `(is_running, generation)`.
    fn get_running_and_generation(&self) -> (bool, u64) {
        let snapshot = self.running.load(Ordering::Relaxed);
        (
            (snapshot & Self::RUNNING_BIT) != 0,
            snapshot & !Self::RUNNING_BIT,
        )
    }

    /// Repeatedly signals the vcpu thread until the vcpu is observed as not
    /// running, or until it has moved on to a different run generation.
    ///
    /// Sleeps for `retry_delay` after each signal. Returns `true` if at least
    /// one signal was sent, `false` if the vcpu was not running to begin with.
    fn send_signal(&self) -> bool {
        let signo = libc::SIGRTMIN() + self.sig_rt_min_offset as libc::c_int;
        // Generation of the run we set out to interrupt; fixed on the first
        // iteration that observes the vcpu running.
        let mut first_seen: Option<u64> = None;
        let mut signalled = false;

        loop {
            let (is_running, current) = self.get_running_and_generation();
            if !is_running {
                return signalled;
            }

            // prevent ABA problem: stop as soon as a newer run is observed
            if *first_seen.get_or_insert(current) != current {
                return signalled;
            }

            log::info!("Sending signal to kill vcpu thread...");
            signalled = true;
            // SAFETY (NOTE(review) — confirm): relies on the invariant
            // documented on `tid`, i.e. that while the vcpu is running `tid`
            // identifies the thread it runs on. The return value of
            // `pthread_kill` is deliberately not inspected.
            unsafe {
                libc::pthread_kill(self.tid.load(Ordering::Relaxed) as _, signo);
            }
            std::thread::sleep(self.retry_delay);
        }
    }
}
508
// `InterruptHandle` implementation for the Linux backends: each kill variant
// records the request in its flag and then signals the vcpu thread.
#[cfg(any(kvm, mshv))]
impl InterruptHandle for LinuxInterruptHandle {
    /// Flags the cancellation request, then signals the vcpu thread.
    /// Returns whatever `send_signal` reports, i.e. whether a signal was sent.
    fn kill(&self) -> bool {
        // Set the flag before signalling, so the request is recorded even if
        // the vcpu is not currently running and no signal ends up being sent.
        self.cancel_requested.store(true, Ordering::Relaxed);

        self.send_signal()
    }
    /// Debugger counterpart of `kill`: flags `debug_interrupt` instead of
    /// `cancel_requested`, then signals the vcpu thread.
    #[cfg(gdb)]
    fn kill_from_debugger(&self) -> bool {
        self.debug_interrupt.store(true, Ordering::Relaxed);
        self.send_signal()
    }
    /// Reads the `dropped` flag.
    fn dropped(&self) -> bool {
        self.dropped.load(Ordering::Relaxed)
    }
}
525
#[cfg(all(test, any(target_os = "windows", kvm)))]
pub(crate) mod tests {
    use std::sync::{Arc, Mutex};

    use hyperlight_testing::dummy_guest_as_string;

    use crate::sandbox::uninitialized::GuestBinary;
    #[cfg(any(crashdump, gdb))]
    use crate::sandbox::uninitialized::SandboxRuntimeConfig;
    use crate::sandbox::uninitialized_evolve::set_up_hypervisor_partition;
    use crate::sandbox::{SandboxConfiguration, UninitializedSandbox};
    use crate::{Result, is_hypervisor_present, new_error};

    /// Smoke test: builds a hypervisor partition for the dummy guest binary
    /// and checks that `initialise` succeeds on it. The test passes trivially
    /// when no hypervisor is present on the machine.
    #[test]
    fn test_initialise() -> Result<()> {
        use crate::mem::ptr::RawPtr;
        use crate::sandbox::host_funcs::FunctionRegistry;
        #[cfg(gdb)]
        use crate::sandbox::mem_access::dbg_mem_access_handler_wrapper;

        // Nothing to exercise without a hypervisor.
        if !is_hypervisor_present() {
            return Ok(());
        }

        let guest_path = dummy_guest_as_string().map_err(|e| new_error!("{}", e))?;

        let sandbox_cfg = SandboxConfiguration::default();
        #[cfg(any(crashdump, gdb))]
        let runtime_cfg = SandboxRuntimeConfig::default();

        let uninit = UninitializedSandbox::new(
            GuestBinary::FilePath(guest_path.clone()),
            Some(sandbox_cfg),
        )?;
        let (mem_mgr, mut guest_shm) = uninit.mgr.build();
        let mut vm = set_up_hypervisor_partition(
            &mut guest_shm,
            &sandbox_cfg,
            #[cfg(any(crashdump, gdb))]
            &runtime_cfg,
            uninit.load_info,
        )?;

        // Arguments for `initialise`
        let peb = RawPtr::from(0x1000u64); // Dummy PEB address
        let rng_seed = 12345u64; // Random seed
        let page_sz = 4096u32; // Standard page size
        let registry = Arc::new(Mutex::new(FunctionRegistry::default()));
        let max_log_level = Some(log::LevelFilter::Error);

        // Must be created before `mem_mgr` is moved into `initialise`.
        #[cfg(gdb)]
        let dbg_handler = dbg_mem_access_handler_wrapper(mem_mgr.clone());

        // Exercise the initialise method and propagate any failure
        vm.initialise(
            peb,
            rng_seed,
            page_sz,
            mem_mgr,
            registry,
            max_log_level,
            #[cfg(gdb)]
            dbg_handler,
        )
    }
}