hyperlight_host/hypervisor/
mod.rs

1/*
2Copyright 2025  The Hyperlight Authors.
3
4Licensed under the Apache License, Version 2.0 (the "License");
5you may not use this file except in compliance with the License.
6You may obtain a copy of the License at
7
8    http://www.apache.org/licenses/LICENSE-2.0
9
10Unless required by applicable law or agreed to in writing, software
11distributed under the License is distributed on an "AS IS" BASIS,
12WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13See the License for the specific language governing permissions and
14limitations under the License.
15*/
16
17use log::LevelFilter;
18use tracing::{Span, instrument};
19
20use crate::HyperlightError::StackOverflow;
21use crate::error::HyperlightError::ExecutionCanceledByHost;
22use crate::mem::memory_region::{MemoryRegion, MemoryRegionFlags};
23use crate::metrics::METRIC_GUEST_CANCELLATION;
24#[cfg(feature = "trace_guest")]
25use crate::sandbox::TraceInfo;
26use crate::{HyperlightError, Result, log_then_return};
27
28/// Util for handling x87 fpu state
29#[cfg(any(kvm, mshv, target_os = "windows"))]
30pub mod fpu;
31
32/// HyperV-on-linux functionality
33#[cfg(mshv)]
34pub mod hyperv_linux;
35#[cfg(target_os = "windows")]
36/// Hyperv-on-windows functionality
37pub(crate) mod hyperv_windows;
38
39/// GDB debugging support
40#[cfg(gdb)]
41pub(crate) mod gdb;
42
43#[cfg(kvm)]
44/// Functionality to manipulate KVM-based virtual machines
45pub mod kvm;
46#[cfg(target_os = "windows")]
47/// Hyperlight Surrogate Process
48pub(crate) mod surrogate_process;
49#[cfg(target_os = "windows")]
50/// Hyperlight Surrogate Process
51pub(crate) mod surrogate_process_manager;
52/// WindowsHypervisorPlatform utilities
53#[cfg(target_os = "windows")]
54pub(crate) mod windows_hypervisor_platform;
55/// Safe wrappers around windows types like `PSTR`
56#[cfg(target_os = "windows")]
57pub(crate) mod wrappers;
58
59#[cfg(crashdump)]
60pub(crate) mod crashdump;
61
62use std::fmt::Debug;
63use std::str::FromStr;
64#[cfg(any(kvm, mshv))]
65use std::sync::atomic::{AtomicBool, AtomicU64, Ordering};
66use std::sync::{Arc, Mutex};
67#[cfg(any(kvm, mshv))]
68use std::time::Duration;
69
70#[cfg(gdb)]
71use gdb::VcpuStopReason;
72
73use crate::mem::mgr::SandboxMemoryManager;
74use crate::mem::ptr::RawPtr;
75use crate::mem::shared_mem::HostSharedMemory;
76use crate::sandbox::host_funcs::FunctionRegistry;
77
// Bit masks for the x86-64 CR4, CR0 and EFER registers. They are only
// compiled in when the `init-paging` feature is enabled. The one-line
// descriptions expand the architectural bit names; the Intel SDM / AMD APM
// hold the authoritative definitions.
cfg_if::cfg_if! {
    if #[cfg(feature = "init-paging")] {
        /// CR4 bit 5 (PAE): Physical Address Extension.
        pub(crate) const CR4_PAE: u64 = 1 << 5;
        /// CR4 bit 9 (OSFXSR): OS support for FXSAVE/FXRSTOR.
        pub(crate) const CR4_OSFXSR: u64 = 1 << 9;
        /// CR4 bit 10 (OSXMMEXCPT): OS support for unmasked SIMD floating-point exceptions.
        pub(crate) const CR4_OSXMMEXCPT: u64 = 1 << 10;
        /// CR0 bit 0 (PE): Protection Enable.
        pub(crate) const CR0_PE: u64 = 1;
        /// CR0 bit 1 (MP): Monitor Coprocessor.
        pub(crate) const CR0_MP: u64 = 1 << 1;
        /// CR0 bit 4 (ET): Extension Type.
        pub(crate) const CR0_ET: u64 = 1 << 4;
        /// CR0 bit 5 (NE): Numeric Error.
        pub(crate) const CR0_NE: u64 = 1 << 5;
        /// CR0 bit 16 (WP): Write Protect.
        pub(crate) const CR0_WP: u64 = 1 << 16;
        /// CR0 bit 18 (AM): Alignment Mask.
        pub(crate) const CR0_AM: u64 = 1 << 18;
        /// CR0 bit 31 (PG): Paging.
        pub(crate) const CR0_PG: u64 = 1 << 31;
        /// EFER bit 8 (LME): Long Mode Enable.
        pub(crate) const EFER_LME: u64 = 1 << 8;
        /// EFER bit 10 (LMA): Long Mode Active.
        pub(crate) const EFER_LMA: u64 = 1 << 10;
        /// EFER bit 0 (SCE): System Call Extensions.
        pub(crate) const EFER_SCE: u64 = 1;
        /// EFER bit 11 (NXE): No-Execute Enable.
        pub(crate) const EFER_NX: u64 = 1 << 11;
    }
}
96
/// The generic vCPU exit reasons that Hyperlight knows how to handle.
///
/// A hypervisor's `run` method is responsible for mapping its own
/// hypervisor-specific exit reasons onto these generic ones.
pub enum HyperlightExit {
    #[cfg(gdb)]
    /// The vCPU has exited due to a debug event
    Debug(VcpuStopReason),
    /// The vCPU has halted
    Halt(),
    /// The vCPU has issued a write to the given port with the given value.
    ///
    /// Fields, in order: port, data written, RIP, instruction length
    /// (this is how `VirtualCPU::run` forwards them to `Hypervisor::handle_io`).
    IoOut(u16, Vec<u8>, u64, u64),
    /// The vCPU has attempted to read or write from an unmapped address.
    ///
    /// The field is the address that was accessed.
    Mmio(u64),
    /// The vCPU tried to access memory but was missing the required permissions.
    ///
    /// Fields, in order: the accessed address, the access that was attempted,
    /// and the flags of the region containing the address.
    AccessViolation(u64, MemoryRegionFlags, MemoryRegionFlags),
    /// The vCPU execution has been cancelled
    Cancelled(),
    /// The vCPU has exited for a reason that is not handled by Hyperlight.
    ///
    /// The field is a description of the reason.
    Unknown(String),
    /// The operation should be retried, for example this can happen on Linux
    /// where a call to run the CPU can return EAGAIN
    Retry(),
}
118
/// Registers which may be useful for tracing/stack unwinding.
///
/// A value of this type selects the register to read in
/// `Hypervisor::read_trace_reg`. Only compiled with the `trace_guest` feature.
#[cfg(feature = "trace_guest")]
pub enum TraceRegister {
    /// The RAX register
    RAX,
    /// The RCX register
    RCX,
    /// The RIP register (instruction pointer)
    RIP,
    /// The RSP register (stack pointer)
    RSP,
    /// The RBP register (frame/base pointer)
    RBP,
}
133
134/// A common set of hypervisor functionality
135pub(crate) trait Hypervisor: Debug + Send {
136    /// Initialise the internally stored vCPU with the given PEB address and
137    /// random number seed, then run it until a HLT instruction.
138    #[allow(clippy::too_many_arguments)]
139    fn initialise(
140        &mut self,
141        peb_addr: RawPtr,
142        seed: u64,
143        page_size: u32,
144        mem_mgr: SandboxMemoryManager<HostSharedMemory>,
145        host_funcs: Arc<Mutex<FunctionRegistry>>,
146        guest_max_log_level: Option<LevelFilter>,
147        #[cfg(gdb)] dbg_mem_access_fn: Arc<Mutex<SandboxMemoryManager<HostSharedMemory>>>,
148    ) -> Result<()>;
149
150    /// Map a region of host memory into the sandbox.
151    ///
152    /// Depending on the host platform, there are likely alignment
153    /// requirements of at least one page for base and len.
154    unsafe fn map_region(&mut self, rgn: &MemoryRegion) -> Result<()>;
155
156    /// Unmap a memory region from the sandbox
157    unsafe fn unmap_region(&mut self, rgn: &MemoryRegion) -> Result<()>;
158
159    /// Get the currently mapped dynamic memory regions (not including sandbox regions)
160    ///
161    /// Note: Box needed for trait to be object-safe :(
162    fn get_mapped_regions(&self) -> Box<dyn ExactSizeIterator<Item = &MemoryRegion> + '_>;
163
164    /// Dispatch a call from the host to the guest using the given pointer
165    /// to the dispatch function _in the guest's address space_.
166    ///
167    /// Do this by setting the instruction pointer to `dispatch_func_addr`
168    /// and then running the execution loop until a halt instruction.
169    ///
170    /// Returns `Ok` if the call succeeded, and an `Err` if it failed
171    fn dispatch_call_from_host(
172        &mut self,
173        dispatch_func_addr: RawPtr,
174        #[cfg(gdb)] dbg_mem_access_fn: Arc<Mutex<SandboxMemoryManager<HostSharedMemory>>>,
175    ) -> Result<()>;
176
177    /// Handle an IO exit from the internally stored vCPU.
178    fn handle_io(
179        &mut self,
180        port: u16,
181        data: Vec<u8>,
182        rip: u64,
183        instruction_length: u64,
184    ) -> Result<()>;
185
186    /// Run the vCPU
187    fn run(&mut self) -> Result<HyperlightExit>;
188
189    /// Get InterruptHandle to underlying VM
190    fn interrupt_handle(&self) -> Arc<dyn InterruptHandle>;
191
192    /// Get the logging level to pass to the guest entrypoint
193    fn get_max_log_level(&self) -> u32 {
194        // Check to see if the RUST_LOG environment variable is set
195        // and if so, parse it to get the log_level for hyperlight_guest
196        // if that is not set get the log level for the hyperlight_host
197
198        // This is done as the guest will produce logs based on the log level returned here
199        // producing those logs is expensive and we don't want to do it if the host is not
200        // going to process them
201
202        let val = std::env::var("RUST_LOG").unwrap_or_default();
203
204        let level = if val.contains("hyperlight_guest") {
205            val.split(',')
206                .find(|s| s.contains("hyperlight_guest"))
207                .unwrap_or("")
208                .split('=')
209                .nth(1)
210                .unwrap_or("")
211        } else if val.contains("hyperlight_host") {
212            val.split(',')
213                .find(|s| s.contains("hyperlight_host"))
214                .unwrap_or("")
215                .split('=')
216                .nth(1)
217                .unwrap_or("")
218        } else {
219            // look for a value string that does not contain "="
220            val.split(',').find(|s| !s.contains("=")).unwrap_or("")
221        };
222
223        log::info!("Determined guest log level: {}", level);
224        // Convert the log level string to a LevelFilter
225        // If no value is found, default to Error
226        LevelFilter::from_str(level).unwrap_or(LevelFilter::Error) as u32
227    }
228
229    /// get a mutable trait object from self
230    fn as_mut_hypervisor(&mut self) -> &mut dyn Hypervisor;
231
232    #[cfg(crashdump)]
233    fn crashdump_context(&self) -> Result<Option<crashdump::CrashDumpContext<'_>>>;
234
235    #[cfg(gdb)]
236    /// handles the cases when the vCPU stops due to a Debug event
237    fn handle_debug(
238        &mut self,
239        _dbg_mem_access_fn: Arc<Mutex<SandboxMemoryManager<HostSharedMemory>>>,
240        _stop_reason: VcpuStopReason,
241    ) -> Result<()> {
242        unimplemented!()
243    }
244
245    /// Check stack guard to see if the stack is still valid
246    fn check_stack_guard(&self) -> Result<bool>;
247
248    /// Read a register for trace/unwind purposes
249    #[cfg(feature = "trace_guest")]
250    fn read_trace_reg(&self, reg: TraceRegister) -> Result<u64>;
251
252    /// Get a reference of the trace info for the guest
253    #[cfg(feature = "trace_guest")]
254    fn trace_info_as_ref(&self) -> &TraceInfo;
255    /// Get a mutable reference of the trace info for the guest
256    #[cfg(feature = "trace_guest")]
257    fn trace_info_as_mut(&mut self) -> &mut TraceInfo;
258}
259
260/// Returns a Some(HyperlightExit::AccessViolation(..)) if the given gpa doesn't have
261/// access its corresponding region. Returns None otherwise, or if the region is not found.
262pub(crate) fn get_memory_access_violation<'a>(
263    gpa: usize,
264    mut mem_regions: impl Iterator<Item = &'a MemoryRegion>,
265    access_info: MemoryRegionFlags,
266) -> Option<HyperlightExit> {
267    // find the region containing the given gpa
268    let region = mem_regions.find(|region| region.guest_region.contains(&gpa));
269
270    if let Some(region) = region
271        && (!region.flags.contains(access_info)
272            || region.flags.contains(MemoryRegionFlags::STACK_GUARD))
273    {
274        return Some(HyperlightExit::AccessViolation(
275            gpa as u64,
276            access_info,
277            region.flags,
278        ));
279    }
280    None
281}
282
283/// A virtual CPU that can be run until an exit occurs
284pub struct VirtualCPU {}
285
286impl VirtualCPU {
287    /// Run the given hypervisor until a halt instruction is reached
288    #[instrument(err(Debug), skip_all, parent = Span::current(), level = "Trace")]
289    pub(crate) fn run(
290        hv: &mut dyn Hypervisor,
291        #[cfg(gdb)] dbg_mem_access_fn: Arc<Mutex<SandboxMemoryManager<HostSharedMemory>>>,
292    ) -> Result<()> {
293        loop {
294            match hv.run() {
295                #[cfg(gdb)]
296                Ok(HyperlightExit::Debug(stop_reason)) => {
297                    if let Err(e) = hv.handle_debug(dbg_mem_access_fn.clone(), stop_reason) {
298                        log_then_return!(e);
299                    }
300                }
301
302                Ok(HyperlightExit::Halt()) => {
303                    break;
304                }
305                Ok(HyperlightExit::IoOut(port, data, rip, instruction_length)) => {
306                    hv.handle_io(port, data, rip, instruction_length)?
307                }
308                Ok(HyperlightExit::Mmio(addr)) => {
309                    #[cfg(crashdump)]
310                    crashdump::generate_crashdump(hv)?;
311
312                    if !hv.check_stack_guard()? {
313                        log_then_return!(StackOverflow());
314                    }
315
316                    log_then_return!("MMIO access address {:#x}", addr);
317                }
318                Ok(HyperlightExit::AccessViolation(addr, tried, region_permission)) => {
319                    #[cfg(crashdump)]
320                    crashdump::generate_crashdump(hv)?;
321
322                    // If GDB is enabled, we handle the debug memory access
323                    // Disregard return value as we want to return the error
324                    #[cfg(gdb)]
325                    let _ = hv.handle_debug(dbg_mem_access_fn.clone(), VcpuStopReason::Crash);
326
327                    if region_permission.intersects(MemoryRegionFlags::STACK_GUARD) {
328                        return Err(HyperlightError::StackOverflow());
329                    }
330                    log_then_return!(HyperlightError::MemoryAccessViolation(
331                        addr,
332                        tried,
333                        region_permission
334                    ));
335                }
336                Ok(HyperlightExit::Cancelled()) => {
337                    // Shutdown is returned when the host has cancelled execution
338                    // After termination, the main thread will re-initialize the VM
339                    metrics::counter!(METRIC_GUEST_CANCELLATION).increment(1);
340                    log_then_return!(ExecutionCanceledByHost());
341                }
342                Ok(HyperlightExit::Unknown(reason)) => {
343                    #[cfg(crashdump)]
344                    crashdump::generate_crashdump(hv)?;
345                    // If GDB is enabled, we handle the debug memory access
346                    // Disregard return value as we want to return the error
347                    #[cfg(gdb)]
348                    let _ = hv.handle_debug(dbg_mem_access_fn.clone(), VcpuStopReason::Crash);
349
350                    log_then_return!("Unexpected VM Exit {:?}", reason);
351                }
352                Ok(HyperlightExit::Retry()) => continue,
353                Err(e) => {
354                    #[cfg(crashdump)]
355                    crashdump::generate_crashdump(hv)?;
356                    // If GDB is enabled, we handle the debug memory access
357                    // Disregard return value as we want to return the error
358                    #[cfg(gdb)]
359                    let _ = hv.handle_debug(dbg_mem_access_fn.clone(), VcpuStopReason::Crash);
360
361                    return Err(e);
362                }
363            }
364        }
365
366        Ok(())
367    }
368}
369
/// A trait for handling interrupts to a sandbox's vcpu.
///
/// Handles are shared as `Arc<dyn InterruptHandle>` (see
/// `Hypervisor::interrupt_handle`), hence the `Send + Sync` bounds.
pub trait InterruptHandle: Debug + Send + Sync {
    /// Interrupt the corresponding sandbox from running.
    ///
    /// - If this is called while the vcpu is running, then it will interrupt the vcpu and return `true`.
    /// - If this is called while the vcpu is not running (for example during a host call), the
    ///   vcpu will not immediately be interrupted, but it will be prevented from running **the next time**
    ///   it's scheduled, and `false` is returned.
    ///
    /// # Note
    /// This function will block for the duration of the time it takes for the vcpu thread to be interrupted.
    fn kill(&self) -> bool;

    /// Used by a debugger to interrupt the corresponding sandbox from running.
    ///
    /// - If this is called while the vcpu is running, then it will interrupt the vcpu and return `true`.
    /// - If this is called while the vcpu is not running (for example during a host call), the
    ///   vcpu will not immediately be interrupted, but it will be prevented from running **the next time**
    ///   it's scheduled, and `false` is returned.
    ///
    /// # Note
    /// This function will block for the duration of the time it takes for the vcpu thread to be interrupted.
    #[cfg(gdb)]
    fn kill_from_debugger(&self) -> bool;

    /// Returns true if the corresponding sandbox has been dropped
    fn dropped(&self) -> bool;
}
398
/// Interrupt handle for the Linux hypervisors (compiled for `kvm` and `mshv`).
///
/// It interrupts a vcpu by sending a real-time signal to the thread that is running it.
#[cfg(any(kvm, mshv))]
#[derive(Debug)]
pub(super) struct LinuxInterruptHandle {
    /// Invariant: vcpu is running => most significant bit (63) of `running` is set. (Neither converse nor inverse is true)
    ///
    /// Additionally, bits 0-62 track how many times the VCPU has been run. Incremented each time `run()` is called.
    ///
    /// This prevents an ABA problem where:
    /// 1. The VCPU is running (generation N),
    /// 2. It gets cancelled,
    /// 3. Then quickly restarted (generation N+1),
    ///    before the original thread has observed that it was cancelled.
    ///
    /// Without this generation counter, the interrupt logic might assume the VCPU is still
    /// in the *original* run (generation N), see that it's `running`, and re-send the signal.
    /// But the new VCPU run (generation N+1) would treat this as a stale signal and ignore it,
    /// potentially causing an infinite loop where no effective interrupt is delivered.
    ///
    /// Invariant: If the VCPU is running, bits 0-62 of `running` match the current run's generation.
    running: AtomicU64,
    /// Invariant: vcpu is running => `tid` is the thread on which it is running.
    /// Note: multiple vms may have the same `tid`, but at most one vm will have `running` set to true.
    tid: AtomicU64,
    /// True when an "interruptor" has requested the VM to be cancelled. Set immediately when
    /// `kill()` is called, and cleared when the vcpu is no longer running.
    /// This is used to
    /// 1. make sure stale signals do not interrupt the
    ///    wrong vcpu (a vcpu may only be interrupted iff `cancel_requested` is true),
    /// 2. ensure that if a vm is killed while a host call is running,
    ///    the vm will not re-enter the guest after the host call returns.
    cancel_requested: AtomicBool,
    /// True when the debugger has requested the VM to be interrupted. Set immediately when
    /// `kill_from_debugger()` is called, and cleared when the vcpu is no longer running.
    /// This is used to make sure stale signals do not interrupt the wrong vcpu
    /// (a vcpu may only be interrupted by a debugger if `debug_interrupt` is true).
    #[cfg(gdb)]
    debug_interrupt: AtomicBool,
    /// Whether the corresponding vm is dropped
    dropped: AtomicBool,
    /// Retry delay between signals sent to the vcpu thread
    retry_delay: Duration,
    /// The offset of the SIGRTMIN signal used to interrupt the vcpu thread
    sig_rt_min_offset: u8,
}
443
#[cfg(any(kvm, mshv))]
impl LinuxInterruptHandle {
    /// Mask for bit 63 of `running`: the "vcpu is running" flag.
    const RUNNING_BIT: u64 = 1 << 63;
    /// Largest value the generation counter (bits 0-62 of `running`) can hold.
    const MAX_GENERATION: u64 = Self::RUNNING_BIT - 1;

    /// Sets the running bit and advances the generation by one, wrapping back
    /// to 0 once `MAX_GENERATION` has been reached.
    ///
    /// The update closure always yields a new value, so the result is always
    /// `Ok` and carries the raw value stored *before* the update.
    fn set_running_and_increment_generation(&self) -> std::result::Result<u64, u64> {
        let advance = |previous: u64| {
            let generation = previous & !Self::RUNNING_BIT;
            let next_generation = if generation == Self::MAX_GENERATION {
                // restart generation from 0
                0
            } else {
                generation + 1
            };
            Some(next_generation | Self::RUNNING_BIT)
        };

        self.running
            .fetch_update(Ordering::Relaxed, Ordering::Relaxed, advance)
    }

    /// Clears the running bit and leaves the generation untouched.
    ///
    /// Returns the raw value stored *before* the bit was cleared, i.e. the
    /// generation together with the previous state of the running bit.
    fn clear_running_bit(&self) -> u64 {
        let keep_generation_only = !Self::RUNNING_BIT;
        self.running
            .fetch_and(keep_generation_only, Ordering::Relaxed)
    }

    /// Takes a snapshot of `running`, split into `(is_running, generation)`.
    fn get_running_and_generation(&self) -> (bool, u64) {
        let snapshot = self.running.load(Ordering::Relaxed);
        (
            snapshot & Self::RUNNING_BIT != 0,
            snapshot & !Self::RUNNING_BIT,
        )
    }

    /// Keeps signalling the vcpu thread until the vcpu stops running or a new
    /// run (a different generation) has started, sleeping `retry_delay`
    /// between attempts.
    ///
    /// Returns `true` if at least one signal was sent.
    fn send_signal(&self) -> bool {
        let signal_number = libc::SIGRTMIN() + self.sig_rt_min_offset as libc::c_int;
        let mut first_seen_generation: Option<u64> = None;
        let mut signalled = false;

        loop {
            let (is_running, generation) = self.get_running_and_generation();
            if !is_running {
                break;
            }

            // prevent ABA problem: remember the generation observed on the first
            // pass and stop as soon as a different run is seen.
            if *first_seen_generation.get_or_insert(generation) != generation {
                break;
            }

            log::info!("Sending signal to kill vcpu thread...");
            signalled = true;
            let vcpu_thread = self.tid.load(Ordering::Relaxed);
            // SAFETY: NOTE(review): this relies on `tid` identifying a thread that is
            // still valid to signal; the struct's invariants tie it to the thread
            // running the vcpu while `running` is set. The return value of
            // `pthread_kill` is not checked.
            unsafe {
                libc::pthread_kill(vcpu_thread as _, signal_number);
            }
            std::thread::sleep(self.retry_delay);
        }

        signalled
    }
}
505
#[cfg(any(kvm, mshv))]
impl InterruptHandle for LinuxInterruptHandle {
    /// Flags the cancellation request, then signals the vcpu thread for as
    /// long as the vcpu is observed running.
    ///
    /// Returns whether at least one signal was sent.
    fn kill(&self) -> bool {
        // Set the flag before signalling: per the field's documentation a vcpu
        // may only be interrupted while `cancel_requested` is true.
        self.cancel_requested.store(true, Ordering::Relaxed);

        self.send_signal()
    }
    /// Same as `kill`, but records the request in `debug_interrupt` instead
    /// of `cancel_requested`.
    #[cfg(gdb)]
    fn kill_from_debugger(&self) -> bool {
        self.debug_interrupt.store(true, Ordering::Relaxed);
        self.send_signal()
    }
    /// Reads the `dropped` flag.
    fn dropped(&self) -> bool {
        self.dropped.load(Ordering::Relaxed)
    }
}
522
#[cfg(all(test, any(target_os = "windows", kvm)))]
pub(crate) mod tests {
    use std::sync::{Arc, Mutex};

    use hyperlight_testing::dummy_guest_as_string;

    use crate::sandbox::uninitialized::GuestBinary;
    #[cfg(any(crashdump, gdb))]
    use crate::sandbox::uninitialized::SandboxRuntimeConfig;
    use crate::sandbox::uninitialized_evolve::set_up_hypervisor_partition;
    use crate::sandbox::{SandboxConfiguration, UninitializedSandbox};
    use crate::{Result, is_hypervisor_present, new_error};

    /// Builds a hypervisor partition for the dummy guest and checks that
    /// `initialise` succeeds on it. Does nothing when no hypervisor is present.
    #[test]
    fn test_initialise() -> Result<()> {
        use crate::mem::ptr::RawPtr;
        use crate::sandbox::host_funcs::FunctionRegistry;

        if !is_hypervisor_present() {
            return Ok(());
        }

        let guest_path = dummy_guest_as_string().map_err(|e| new_error!("{}", e))?;

        let config = SandboxConfiguration::default();
        #[cfg(any(crashdump, gdb))]
        let rt_cfg = SandboxRuntimeConfig::default();

        let uninitialized =
            UninitializedSandbox::new(GuestBinary::FilePath(guest_path.clone()), Some(config))?;
        let (mem_mgr, mut guest_shared_mem) = uninitialized.mgr.build();
        let mut vm = set_up_hypervisor_partition(
            &mut guest_shared_mem,
            &config,
            #[cfg(any(crashdump, gdb))]
            &rt_cfg,
            uninitialized.load_info,
        )?;

        // Arguments for `initialise`; the values are placeholders.
        let dummy_peb_addr = RawPtr::from(0x1000u64);
        let seed = 12345u64;
        let page_size = 4096u32;
        let host_funcs = Arc::new(Mutex::new(FunctionRegistry::default()));
        let guest_max_log_level = Some(log::LevelFilter::Error);

        // Must be created before `mem_mgr` is moved into `initialise` below.
        #[cfg(gdb)]
        let dbg_mem_access_fn = Arc::new(Mutex::new(mem_mgr.clone()));

        // The test passes iff `initialise` returns `Ok`.
        vm.initialise(
            dummy_peb_addr,
            seed,
            page_size,
            mem_mgr,
            host_funcs,
            guest_max_log_level,
            #[cfg(gdb)]
            dbg_mem_access_fn,
        )
    }
}