hyperlight_host/hypervisor/
hyperv_linux.rs

1/*
2Copyright 2025  The Hyperlight Authors.
3
4Licensed under the Apache License, Version 2.0 (the "License");
5you may not use this file except in compliance with the License.
6You may obtain a copy of the License at
7
8    http://www.apache.org/licenses/LICENSE-2.0
9
10Unless required by applicable law or agreed to in writing, software
11distributed under the License is distributed on an "AS IS" BASIS,
12WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13See the License for the specific language governing permissions and
14limitations under the License.
15*/
16
17extern crate mshv_bindings;
18extern crate mshv_ioctls;
19
20use std::fmt::{Debug, Formatter};
21use std::sync::atomic::{AtomicBool, AtomicU64, Ordering};
22use std::sync::{Arc, Mutex};
23
24use log::{LevelFilter, error};
25use mshv_bindings::{
26    FloatingPointUnit, SpecialRegisters, StandardRegisters, hv_message_type,
27    hv_message_type_HVMSG_GPA_INTERCEPT, hv_message_type_HVMSG_UNMAPPED_GPA,
28    hv_message_type_HVMSG_X64_HALT, hv_message_type_HVMSG_X64_IO_PORT_INTERCEPT,
29    hv_partition_property_code_HV_PARTITION_PROPERTY_SYNTHETIC_PROC_FEATURES,
30    hv_partition_synthetic_processor_features, hv_register_assoc,
31    hv_register_name_HV_X64_REGISTER_RIP, hv_register_value, mshv_user_mem_region,
32};
33#[cfg(gdb)]
34use mshv_bindings::{
35    HV_INTERCEPT_ACCESS_MASK_EXECUTE, hv_intercept_parameters,
36    hv_intercept_type_HV_INTERCEPT_TYPE_EXCEPTION, hv_message_type_HVMSG_X64_EXCEPTION_INTERCEPT,
37    mshv_install_intercept,
38};
39use mshv_ioctls::{Mshv, VcpuFd, VmFd};
40use tracing::{Span, instrument};
41#[cfg(feature = "trace_guest")]
42use tracing_opentelemetry::OpenTelemetrySpanExt;
43#[cfg(crashdump)]
44use {super::crashdump, std::path::Path};
45
46#[cfg(gdb)]
47use super::gdb::{
48    DebugCommChannel, DebugMemoryAccess, DebugMsg, DebugResponse, GuestDebug, MshvDebug,
49    VcpuStopReason,
50};
51use super::{HyperlightExit, Hypervisor, LinuxInterruptHandle, VirtualCPU};
52#[cfg(gdb)]
53use crate::HyperlightError;
54use crate::hypervisor::get_memory_access_violation;
55use crate::hypervisor::regs::CommonFpu;
56use crate::mem::memory_region::{MemoryRegion, MemoryRegionFlags};
57use crate::mem::mgr::SandboxMemoryManager;
58use crate::mem::ptr::{GuestPtr, RawPtr};
59use crate::mem::shared_mem::HostSharedMemory;
60use crate::sandbox::SandboxConfiguration;
61use crate::sandbox::host_funcs::FunctionRegistry;
62use crate::sandbox::outb::handle_outb;
63#[cfg(feature = "mem_profile")]
64use crate::sandbox::trace::MemTraceInfo;
65#[cfg(crashdump)]
66use crate::sandbox::uninitialized::SandboxRuntimeConfig;
67use crate::{Result, log_then_return, new_error};
68
#[cfg(gdb)]
mod debug {
    use super::mshv_bindings::hv_x64_exception_intercept_message;
    use super::{HypervLinuxDriver, *};
    use crate::hypervisor::gdb::{DebugMemoryAccess, DebugMsg, DebugResponse, VcpuStopReason};
    use crate::{Result, new_error};

    impl HypervLinuxDriver {
        /// Resets the debug information to disable debugging
        fn disable_debug(&mut self) -> Result<()> {
            // A default `MshvDebug` carries no configured breakpoints, so
            // storing it below discards any previously registered debug state.
            let mut debug = MshvDebug::default();

            // Also make sure the vCPU is no longer single-stepping.
            debug.set_single_step(&self.vcpu_fd, false)?;

            self.debug = Some(debug);

            Ok(())
        }

        /// Get the reason the vCPU has stopped
        ///
        /// Returns an error if debugging was never enabled on this driver.
        pub(crate) fn get_stop_reason(
            &mut self,
            ex_info: hv_x64_exception_intercept_message,
        ) -> Result<VcpuStopReason> {
            let debug = self
                .debug
                .as_mut()
                .ok_or_else(|| new_error!("Debug is not enabled"))?;

            // `self.entrypoint` is passed along so the debug layer can relate
            // the stop location to the guest entrypoint when classifying it.
            debug.get_stop_reason(&self.vcpu_fd, ex_info.exception_vector, self.entrypoint)
        }

        /// Dispatches a single debugger request to the underlying `MshvDebug`
        /// state and wraps the outcome in the matching `DebugResponse`.
        ///
        /// Note that breakpoint add/remove failures are logged and reported
        /// back as a `bool` inside the response (via `is_ok()`), rather than
        /// propagated as `Err`, so the debug session can keep running.
        /// Returns `Err` only when debugging is not enabled or when a
        /// memory/register operation fails outright.
        pub(crate) fn process_dbg_request(
            &mut self,
            req: DebugMsg,
            mem_access: &DebugMemoryAccess,
        ) -> Result<DebugResponse> {
            if let Some(debug) = self.debug.as_mut() {
                match req {
                    DebugMsg::AddHwBreakpoint(addr) => Ok(DebugResponse::AddHwBreakpoint(
                        debug
                            .add_hw_breakpoint(&self.vcpu_fd, addr)
                            .map_err(|e| {
                                log::error!("Failed to add hw breakpoint: {:?}", e);

                                e
                            })
                            .is_ok(),
                    )),
                    DebugMsg::AddSwBreakpoint(addr) => Ok(DebugResponse::AddSwBreakpoint(
                        debug
                            .add_sw_breakpoint(&self.vcpu_fd, addr, mem_access)
                            .map_err(|e| {
                                log::error!("Failed to add sw breakpoint: {:?}", e);

                                e
                            })
                            .is_ok(),
                    )),
                    // Continue: turn single-stepping off and let the vCPU run.
                    DebugMsg::Continue => {
                        debug.set_single_step(&self.vcpu_fd, false).map_err(|e| {
                            log::error!("Failed to continue execution: {:?}", e);

                            e
                        })?;

                        Ok(DebugResponse::Continue)
                    }
                    DebugMsg::DisableDebug => {
                        self.disable_debug().map_err(|e| {
                            log::error!("Failed to disable debugging: {:?}", e);

                            e
                        })?;

                        Ok(DebugResponse::DisableDebug)
                    }
                    // Reports where the guest code section lives so the
                    // debugger can translate symbol addresses.
                    DebugMsg::GetCodeSectionOffset => {
                        let offset = mem_access
                            .dbg_mem_access_fn
                            .try_lock()
                            .map_err(|e| {
                                new_error!("Error locking at {}:{}: {}", file!(), line!(), e)
                            })?
                            .layout
                            .get_guest_code_address();

                        Ok(DebugResponse::GetCodeSectionOffset(offset as u64))
                    }
                    DebugMsg::ReadAddr(addr, len) => {
                        let mut data = vec![0u8; len];

                        debug.read_addrs(&self.vcpu_fd, addr, &mut data, mem_access)?;

                        Ok(DebugResponse::ReadAddr(data))
                    }
                    DebugMsg::ReadRegisters => debug
                        .read_regs(&self.vcpu_fd)
                        .map_err(|e| {
                            log::error!("Failed to read registers: {:?}", e);

                            e
                        })
                        .map(|(regs, fpu)| DebugResponse::ReadRegisters(Box::new((regs, fpu)))),
                    DebugMsg::RemoveHwBreakpoint(addr) => Ok(DebugResponse::RemoveHwBreakpoint(
                        debug
                            .remove_hw_breakpoint(&self.vcpu_fd, addr)
                            .map_err(|e| {
                                log::error!("Failed to remove hw breakpoint: {:?}", e);

                                e
                            })
                            .is_ok(),
                    )),
                    DebugMsg::RemoveSwBreakpoint(addr) => Ok(DebugResponse::RemoveSwBreakpoint(
                        debug
                            .remove_sw_breakpoint(&self.vcpu_fd, addr, mem_access)
                            .map_err(|e| {
                                log::error!("Failed to remove sw breakpoint: {:?}", e);

                                e
                            })
                            .is_ok(),
                    )),
                    // Step: enable single-stepping for exactly one instruction.
                    DebugMsg::Step => {
                        debug.set_single_step(&self.vcpu_fd, true).map_err(|e| {
                            log::error!("Failed to enable step instruction: {:?}", e);

                            e
                        })?;

                        Ok(DebugResponse::Step)
                    }
                    DebugMsg::WriteAddr(addr, data) => {
                        debug.write_addrs(&self.vcpu_fd, addr, &data, mem_access)?;

                        Ok(DebugResponse::WriteAddr)
                    }
                    DebugMsg::WriteRegisters(boxed_regs) => {
                        let (regs, fpu) = boxed_regs.as_ref();
                        debug
                            .write_regs(&self.vcpu_fd, regs, fpu)
                            .map_err(|e| {
                                log::error!("Failed to write registers: {:?}", e);

                                e
                            })
                            .map(|_| DebugResponse::WriteRegisters)
                    }
                }
            } else {
                Err(new_error!("Debugging is not enabled"))
            }
        }

        /// Blocks until the next debugger message arrives on the GDB
        /// communication channel.
        ///
        /// Returns an error if debugging is not enabled or the channel fails.
        pub(crate) fn recv_dbg_msg(&mut self) -> Result<DebugMsg> {
            let gdb_conn = self
                .gdb_conn
                .as_mut()
                .ok_or_else(|| new_error!("Debug is not enabled"))?;

            gdb_conn.recv().map_err(|e| {
                new_error!(
                    "Got an error while waiting to receive a
                    message: {:?}",
                    e
                )
            })
        }

        /// Sends a response back to the debugger over the GDB communication
        /// channel.
        ///
        /// Returns an error if debugging is not enabled or the send fails.
        pub(crate) fn send_dbg_msg(&mut self, cmd: DebugResponse) -> Result<()> {
            log::debug!("Sending {:?}", cmd);

            let gdb_conn = self
                .gdb_conn
                .as_mut()
                .ok_or_else(|| new_error!("Debug is not enabled"))?;

            gdb_conn
                .send(cmd)
                .map_err(|e| new_error!("Got an error while sending a response message {:?}", e))
        }
    }
}
253
254/// Determine whether the HyperV for Linux hypervisor API is present
255/// and functional.
256#[instrument(skip_all, parent = Span::current(), level = "Trace")]
257pub(crate) fn is_hypervisor_present() -> bool {
258    match Mshv::new() {
259        Ok(_) => true,
260        Err(_) => {
261            log::info!("MSHV is not available on this system");
262            false
263        }
264    }
265}
266
/// A Hypervisor driver for HyperV-on-Linux. This hypervisor is often
/// called the Microsoft Hypervisor (MSHV)
pub(crate) struct HypervLinuxDriver {
    // Held but never read — presumably keeps the MSHV handle alive for the
    // lifetime of the VM/vCPU fds below; TODO confirm.
    _mshv: Mshv,
    // Guest page size; starts at 0 in `new` and is set by `initialise`.
    page_size: usize,
    // Handle to the MSHV partition (VM).
    vm_fd: VmFd,
    // Handle to the single vCPU created for this sandbox.
    vcpu_fd: VcpuFd,
    // Initial guest stack pointer; restored for each host-dispatched call.
    orig_rsp: GuestPtr,
    // Absolute guest address of the sandbox entrypoint.
    entrypoint: u64,
    // Shared handle used to interrupt (kill) a running vCPU from other threads.
    interrupt_handle: Arc<LinuxInterruptHandle>,
    // Set by `initialise`; required before IO-port exits can be handled.
    mem_mgr: Option<SandboxMemoryManager<HostSharedMemory>>,
    // Set by `initialise`; host function registry used by outb handling.
    host_funcs: Option<Arc<Mutex<FunctionRegistry>>>,

    sandbox_regions: Vec<MemoryRegion>, // Initially mapped regions when sandbox is created
    mmap_regions: Vec<MemoryRegion>,    // Later mapped regions

    // GDB debug state; `Some` only when a debugger connection was requested.
    #[cfg(gdb)]
    debug: Option<MshvDebug>,
    // Channel for exchanging messages with the GDB thread.
    #[cfg(gdb)]
    gdb_conn: Option<DebugCommChannel<DebugResponse, DebugMsg>>,
    // Runtime configuration consulted when generating crash dumps.
    #[cfg(crashdump)]
    rt_cfg: SandboxRuntimeConfig,
    // Trace metadata used by memory-profiling builds.
    #[cfg(feature = "mem_profile")]
    trace_info: MemTraceInfo,
}
292
impl HypervLinuxDriver {
    /// Create a new `HypervLinuxDriver`, complete with all registers
    /// set up to execute a Hyperlight binary inside a HyperV-powered
    /// sandbox on Linux.
    ///
    /// While registers are set up, they will not have been applied to
    /// the underlying virtual CPU after this function returns. Call the
    /// `apply_registers` method to do that, or more likely call
    /// `initialise` to do it for you.
    #[allow(clippy::too_many_arguments)]
    // TODO: refactor this function to take fewer arguments. Add trace_info to rt_cfg
    #[instrument(skip_all, parent = Span::current(), level = "Trace")]
    pub(crate) fn new(
        mem_regions: Vec<MemoryRegion>,
        entrypoint_ptr: GuestPtr,
        rsp_ptr: GuestPtr,
        pml4_ptr: GuestPtr,
        config: &SandboxConfiguration,
        #[cfg(gdb)] gdb_conn: Option<DebugCommChannel<DebugResponse, DebugMsg>>,
        #[cfg(crashdump)] rt_cfg: SandboxRuntimeConfig,
        #[cfg(feature = "mem_profile")] trace_info: MemTraceInfo,
    ) -> Result<Self> {
        let mshv = Mshv::new()?;
        let pr = Default::default();

        let vm_fd = {
            // It's important to avoid create_vm() and explicitly use
            // create_vm_with_args() with an empty arguments structure
            // here, because otherwise the partition is set up with a SynIC.

            let vm_fd = mshv.create_vm_with_args(&pr)?;
            // Explicitly set the synthetic processor features (a default
            // value) before initializing the partition.
            let features: hv_partition_synthetic_processor_features = Default::default();
            vm_fd.set_partition_property(
                hv_partition_property_code_HV_PARTITION_PROPERTY_SYNTHETIC_PROC_FEATURES,
                // SAFETY: reading the raw u64 view of the features union;
                // every bit pattern is a valid u64.
                unsafe { features.as_uint64[0] },
            )?;
            vm_fd.initialize()?;
            vm_fd
        };

        let vcpu_fd = vm_fd.create_vcpu(0)?;

        // When a GDB connection was supplied, set up debugging: break at the
        // entrypoint and install exception intercepts for #DB and #BP.
        #[cfg(gdb)]
        let (debug, gdb_conn) = if let Some(gdb_conn) = gdb_conn {
            let mut debug = MshvDebug::new();
            debug.add_hw_breakpoint(&vcpu_fd, entrypoint_ptr.absolute()?)?;

            // The below intercepts make the vCPU exit with the Exception Intercept exit code
            // Check Table 6-1. Exceptions and Interrupts at Page 6-13 Vol. 1
            // of Intel 64 and IA-32 Architectures Software Developer's Manual
            // Install intercept for #DB (1) exception
            vm_fd
                .install_intercept(mshv_install_intercept {
                    access_type_mask: HV_INTERCEPT_ACCESS_MASK_EXECUTE,
                    intercept_type: hv_intercept_type_HV_INTERCEPT_TYPE_EXCEPTION,
                    // Exception handler #DB (1)
                    intercept_parameter: hv_intercept_parameters {
                        exception_vector: 0x1,
                    },
                })
                .map_err(|e| new_error!("Cannot install debug exception intercept: {}", e))?;

            // Install intercept for #BP (3) exception
            vm_fd
                .install_intercept(mshv_install_intercept {
                    access_type_mask: HV_INTERCEPT_ACCESS_MASK_EXECUTE,
                    intercept_type: hv_intercept_type_HV_INTERCEPT_TYPE_EXCEPTION,
                    // Exception handler #BP (3)
                    intercept_parameter: hv_intercept_parameters {
                        exception_vector: 0x3,
                    },
                })
                .map_err(|e| new_error!("Cannot install breakpoint exception intercept: {}", e))?;

            (Some(debug), Some(gdb_conn))
        } else {
            (None, None)
        };

        // Map every initial sandbox region into the partition; fail fast on
        // the first region that cannot be mapped.
        mem_regions.iter().try_for_each(|region| {
            let mshv_region = region.to_owned().into();
            vm_fd.map_user_memory(mshv_region)
        })?;

        let interrupt_handle = Arc::new(LinuxInterruptHandle {
            running: AtomicU64::new(0),
            cancel_requested: AtomicU64::new(0),
            call_active: AtomicBool::new(false),
            #[cfg(gdb)]
            debug_interrupt: AtomicBool::new(false),
            // On musl targets `pthread_self()` does not return a plain
            // integer, hence the explicit cast there — presumably a pointer
            //-typed pthread_t; TODO confirm.
            #[cfg(all(
                target_arch = "x86_64",
                target_vendor = "unknown",
                target_os = "linux",
                target_env = "musl"
            ))]
            tid: AtomicU64::new(unsafe { libc::pthread_self() as u64 }),
            #[cfg(not(all(
                target_arch = "x86_64",
                target_vendor = "unknown",
                target_os = "linux",
                target_env = "musl"
            )))]
            tid: AtomicU64::new(unsafe { libc::pthread_self() }),
            retry_delay: config.get_interrupt_retry_delay(),
            sig_rt_min_offset: config.get_interrupt_vcpu_sigrtmin_offset(),
            dropped: AtomicBool::new(false),
        });

        let mut hv = Self {
            _mshv: mshv,
            // page_size stays 0 until `initialise` provides the real value.
            page_size: 0,
            vm_fd,
            vcpu_fd,
            sandbox_regions: mem_regions,
            mmap_regions: Vec::new(),
            entrypoint: entrypoint_ptr.absolute()?,
            orig_rsp: rsp_ptr,
            interrupt_handle: interrupt_handle.clone(),
            mem_mgr: None,
            host_funcs: None,
            #[cfg(gdb)]
            debug,
            #[cfg(gdb)]
            gdb_conn,
            #[cfg(crashdump)]
            rt_cfg,
            #[cfg(feature = "mem_profile")]
            trace_info,
        };

        hv.setup_initial_sregs(pml4_ptr.absolute()?)?;

        // Send the interrupt handle to the GDB thread if debugging is enabled
        // This is used to allow the GDB thread to stop the vCPU
        #[cfg(gdb)]
        if hv.debug.is_some() {
            hv.send_dbg_msg(DebugResponse::InterruptHandle(interrupt_handle))?;
        }

        Ok(hv)
    }
}
436
437impl Debug for HypervLinuxDriver {
438    fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result {
439        let mut f = f.debug_struct("Hyperv Linux Driver");
440
441        f.field("Entrypoint", &self.entrypoint)
442            .field("Original RSP", &self.orig_rsp);
443
444        for region in &self.sandbox_regions {
445            f.field("Sandbox Memory Region", &region);
446        }
447        for region in &self.mmap_regions {
448            f.field("Mapped Memory Region", &region);
449        }
450
451        let regs = self.vcpu_fd.get_regs();
452
453        if let Ok(regs) = regs {
454            f.field("Registers", &regs);
455        }
456
457        let sregs = self.vcpu_fd.get_sregs();
458
459        if let Ok(sregs) = sregs {
460            f.field("Special Registers", &sregs);
461        }
462
463        f.finish()
464    }
465}
466
467impl Hypervisor for HypervLinuxDriver {
468    #[instrument(err(Debug), skip_all, parent = Span::current(), level = "Trace")]
469    fn initialise(
470        &mut self,
471        peb_addr: RawPtr,
472        seed: u64,
473        page_size: u32,
474        mem_mgr: SandboxMemoryManager<HostSharedMemory>,
475        host_funcs: Arc<Mutex<FunctionRegistry>>,
476        max_guest_log_level: Option<LevelFilter>,
477        #[cfg(gdb)] dbg_mem_access_fn: Arc<Mutex<SandboxMemoryManager<HostSharedMemory>>>,
478    ) -> Result<()> {
479        self.mem_mgr = Some(mem_mgr);
480        self.host_funcs = Some(host_funcs);
481        self.page_size = page_size as usize;
482
483        let max_guest_log_level: u64 = match max_guest_log_level {
484            Some(level) => level as u64,
485            None => self.get_max_log_level().into(),
486        };
487
488        let regs = StandardRegisters {
489            rip: self.entrypoint,
490            rsp: self.orig_rsp.absolute()?,
491            rflags: 2, //bit 1 of rlags is required to be set
492
493            // function args
494            rdi: peb_addr.into(),
495            rsi: seed,
496            rdx: page_size.into(),
497            rcx: max_guest_log_level,
498
499            ..Default::default()
500        };
501        self.vcpu_fd.set_regs(&regs)?;
502
503        VirtualCPU::run(
504            self.as_mut_hypervisor(),
505            #[cfg(gdb)]
506            dbg_mem_access_fn,
507        )
508    }
509
510    #[instrument(err(Debug), skip_all, parent = Span::current(), level = "Trace")]
511    unsafe fn map_region(&mut self, rgn: &MemoryRegion) -> Result<()> {
512        if [
513            rgn.guest_region.start,
514            rgn.guest_region.end,
515            rgn.host_region.start,
516            rgn.host_region.end,
517        ]
518        .iter()
519        .any(|x| x % self.page_size != 0)
520        {
521            log_then_return!("region is not page-aligned");
522        }
523        let mshv_region: mshv_user_mem_region = rgn.to_owned().into();
524        self.vm_fd.map_user_memory(mshv_region)?;
525        self.mmap_regions.push(rgn.to_owned());
526        Ok(())
527    }
528
529    #[instrument(err(Debug), skip_all, parent = Span::current(), level = "Trace")]
530    unsafe fn unmap_region(&mut self, region: &MemoryRegion) -> Result<()> {
531        if let Some(pos) = self.mmap_regions.iter().position(|r| r == region) {
532            let removed_region = self.mmap_regions.remove(pos);
533            let mshv_region: mshv_user_mem_region = removed_region.into();
534            self.vm_fd.unmap_user_memory(mshv_region)?;
535            Ok(())
536        } else {
537            Err(new_error!("Tried to unmap region that is not mapped"))
538        }
539    }
540
541    fn get_mapped_regions(&self) -> Box<dyn ExactSizeIterator<Item = &MemoryRegion> + '_> {
542        Box::new(self.mmap_regions.iter())
543    }
544
545    #[instrument(err(Debug), skip_all, parent = Span::current(), level = "Trace")]
546    fn dispatch_call_from_host(
547        &mut self,
548        dispatch_func_addr: RawPtr,
549        #[cfg(gdb)] dbg_mem_access_fn: Arc<Mutex<SandboxMemoryManager<HostSharedMemory>>>,
550    ) -> Result<()> {
551        // Reset general purpose registers, then set RIP and RSP
552        let regs = StandardRegisters {
553            rip: dispatch_func_addr.into(),
554            rsp: self.orig_rsp.absolute()?,
555            rflags: 2, //bit 1 of rlags is required to be set
556            ..Default::default()
557        };
558        self.vcpu_fd.set_regs(&regs)?;
559
560        // reset fpu state
561        self.set_fpu(&CommonFpu::default())?;
562
563        // run
564        VirtualCPU::run(
565            self.as_mut_hypervisor(),
566            #[cfg(gdb)]
567            dbg_mem_access_fn,
568        )?;
569
570        Ok(())
571    }
572
    /// Handles an IO-port (outb) intercept: decodes the written value, runs
    /// the outb handler, then advances RIP past the intercepted instruction.
    ///
    /// Requires `mem_mgr` and `host_funcs` to have been set by `initialise`.
    #[instrument(err(Debug), skip_all, parent = Span::current(), level = "Trace")]
    fn handle_io(
        &mut self,
        port: u16,
        data: Vec<u8>,
        rip: u64,
        instruction_length: u64,
    ) -> Result<()> {
        // Zero-pad the payload to 4 bytes (extra bytes are ignored) and
        // decode it as a little-endian u32.
        let mut padded = [0u8; 4];
        let copy_len = data.len().min(4);
        padded[..copy_len].copy_from_slice(&data[..copy_len]);
        let val = u32::from_le_bytes(padded);

        // NOTE: `handle_outb` takes an extra `&mut dyn Hypervisor` argument
        // under `mem_profile`, which forces the take/put-back dance below.
        #[cfg(feature = "mem_profile")]
        {
            // We need to handle the borrow checker issue where we need both:
            // - &mut SandboxMemoryManager (from self.mem_mgr)
            // - &mut dyn Hypervisor (from self)
            // We'll use a temporary approach to extract the mem_mgr temporarily
            let mem_mgr_option = self.mem_mgr.take();
            let mut mem_mgr = mem_mgr_option
                .ok_or_else(|| new_error!("mem_mgr should be initialized before handling IO"))?;
            let host_funcs = self
                .host_funcs
                .as_ref()
                .ok_or_else(|| new_error!("host_funcs should be initialized before handling IO"))?
                .clone();

            handle_outb(&mut mem_mgr, host_funcs, self, port, val)?;

            // Put the mem_mgr back
            self.mem_mgr = Some(mem_mgr);
        }

        #[cfg(not(feature = "mem_profile"))]
        {
            let mem_mgr = self
                .mem_mgr
                .as_mut()
                .ok_or_else(|| new_error!("mem_mgr should be initialized before handling IO"))?;
            let host_funcs = self
                .host_funcs
                .as_ref()
                .ok_or_else(|| new_error!("host_funcs should be initialized before handling IO"))?
                .clone();

            handle_outb(mem_mgr, host_funcs, port, val)?;
        }

        // update rip: skip over the intercepted OUT instruction so the guest
        // resumes at the next instruction.
        self.vcpu_fd.set_reg(&[hv_register_assoc {
            name: hv_register_name_HV_X64_REGISTER_RIP,
            value: hv_register_value {
                reg64: rip + instruction_length,
            },
            ..Default::default()
        }])?;
        Ok(())
    }
632
633    #[instrument(err(Debug), skip_all, parent = Span::current(), level = "Trace")]
634    fn run(
635        &mut self,
636        #[cfg(feature = "trace_guest")] tc: &mut crate::sandbox::trace::TraceContext,
637    ) -> Result<super::HyperlightExit> {
638        const HALT_MESSAGE: hv_message_type = hv_message_type_HVMSG_X64_HALT;
639        const IO_PORT_INTERCEPT_MESSAGE: hv_message_type =
640            hv_message_type_HVMSG_X64_IO_PORT_INTERCEPT;
641        const UNMAPPED_GPA_MESSAGE: hv_message_type = hv_message_type_HVMSG_UNMAPPED_GPA;
642        const INVALID_GPA_ACCESS_MESSAGE: hv_message_type = hv_message_type_HVMSG_GPA_INTERCEPT;
643        #[cfg(gdb)]
644        const EXCEPTION_INTERCEPT: hv_message_type = hv_message_type_HVMSG_X64_EXCEPTION_INTERCEPT;
645
646        self.interrupt_handle
647            .tid
648            .store(unsafe { libc::pthread_self() as u64 }, Ordering::Release);
649        // Note: if `InterruptHandle::kill()` is called while this thread is **here**
650        // Cast to internal trait for access to internal methods
651        let interrupt_handle_internal =
652            self.interrupt_handle.as_ref() as &dyn super::InterruptHandleInternal;
653
654        // (after set_running_bit but before checking cancel_requested):
655        // - kill() will stamp cancel_requested with the current generation
656        // - We will check cancel_requested below and skip the VcpuFd::run() call
657        // - This is the desired behavior - the kill takes effect immediately
658        let generation = interrupt_handle_internal.set_running_bit();
659
660        #[cfg(not(gdb))]
661        let debug_interrupt = false;
662        #[cfg(gdb)]
663        let debug_interrupt = self
664            .interrupt_handle
665            .debug_interrupt
666            .load(Ordering::Relaxed);
667
668        // Don't run the vcpu if `cancel_requested` is set for our generation
669        //
670        // Note: if `InterruptHandle::kill()` is called while this thread is **here**
671        // (after checking cancel_requested but before vcpu.run()):
672        // - kill() will stamp cancel_requested with the current generation
673        // - We will proceed with vcpu.run(), but signals will be sent to interrupt it
674        // - The vcpu will be interrupted and return EINTR (handled below)
675        let exit_reason = if interrupt_handle_internal
676            .is_cancel_requested_for_generation(generation)
677            || debug_interrupt
678        {
679            Err(mshv_ioctls::MshvError::from(libc::EINTR))
680        } else {
681            #[cfg(feature = "trace_guest")]
682            tc.setup_guest_trace(Span::current().context());
683
684            // Note: if a `InterruptHandle::kill()` called while this thread is **here**
685            // Then the vcpu will run, but we will keep sending signals to this thread
686            // to interrupt it until `running` is set to false. The `vcpu_fd::run()` call will
687            // return either normally with an exit reason, or from being "kicked" by out signal handler, with an EINTR error,
688            // both of which are fine.
689            self.vcpu_fd.run()
690        };
691        // Note: if `InterruptHandle::kill()` is called while this thread is **here**
692        // (after vcpu.run() returns but before clear_running_bit):
693        // - kill() continues sending signals to this thread (running bit is still set)
694        // - The signals are harmless (no-op handler), we just need to check cancel_requested
695        // - We load cancel_requested below to determine if this run was cancelled
696        let cancel_requested =
697            interrupt_handle_internal.is_cancel_requested_for_generation(generation);
698        #[cfg(gdb)]
699        let debug_interrupt = self
700            .interrupt_handle
701            .debug_interrupt
702            .load(Ordering::Relaxed);
703        // Note: if `InterruptHandle::kill()` is called while this thread is **here**
704        // (after loading cancel_requested but before clear_running_bit):
705        // - kill() stamps cancel_requested with the CURRENT generation (not the one we just loaded)
706        // - kill() continues sending signals until running bit is cleared
707        // - The newly stamped cancel_requested will affect the NEXT vcpu.run() call
708        // - Signals sent now are harmless (no-op handler)
709        interrupt_handle_internal.clear_running_bit();
710        // At this point, running bit is clear so kill() will stop sending signals.
711        // However, we may still receive delayed signals that were sent before clear_running_bit.
712        // These stale signals are harmless because:
713        // - The signal handler is a no-op
714        // - We check generation matching in cancel_requested before treating EINTR as cancellation
715        // - If generation doesn't match, we return Retry instead of Cancelled
716        let result = match exit_reason {
717            Ok(m) => match m.header.message_type {
718                HALT_MESSAGE => {
719                    crate::debug!("mshv - Halt Details : {:#?}", &self);
720                    HyperlightExit::Halt()
721                }
722                IO_PORT_INTERCEPT_MESSAGE => {
723                    let io_message = m.to_ioport_info().map_err(mshv_ioctls::MshvError::from)?;
724                    let port_number = io_message.port_number;
725                    let rip = io_message.header.rip;
726                    let rax = io_message.rax;
727                    let instruction_length = io_message.header.instruction_length() as u64;
728                    crate::debug!("mshv IO Details : \nPort : {}\n{:#?}", port_number, &self);
729                    HyperlightExit::IoOut(
730                        port_number,
731                        rax.to_le_bytes().to_vec(),
732                        rip,
733                        instruction_length,
734                    )
735                }
736                UNMAPPED_GPA_MESSAGE => {
737                    let mimo_message = m.to_memory_info().map_err(mshv_ioctls::MshvError::from)?;
738                    let addr = mimo_message.guest_physical_address;
739                    crate::debug!(
740                        "mshv MMIO unmapped GPA -Details: Address: {} \n {:#?}",
741                        addr,
742                        &self
743                    );
744                    HyperlightExit::Mmio(addr)
745                }
746                INVALID_GPA_ACCESS_MESSAGE => {
747                    let mimo_message = m.to_memory_info().map_err(mshv_ioctls::MshvError::from)?;
748                    let gpa = mimo_message.guest_physical_address;
749                    let access_info = MemoryRegionFlags::try_from(mimo_message)?;
750                    crate::debug!(
751                        "mshv MMIO invalid GPA access -Details: Address: {} \n {:#?}",
752                        gpa,
753                        &self
754                    );
755                    match get_memory_access_violation(
756                        gpa as usize,
757                        self.sandbox_regions.iter().chain(self.mmap_regions.iter()),
758                        access_info,
759                    ) {
760                        Some(access_info_violation) => access_info_violation,
761                        None => HyperlightExit::Mmio(gpa),
762                    }
763                }
764                // The only case an intercept exit is expected is when debugging is enabled
765                // and the intercepts are installed.
766                // Provide the extra information about the exception to accurately determine
767                // the stop reason
768                #[cfg(gdb)]
769                EXCEPTION_INTERCEPT => {
770                    // Extract exception info from the message so we can figure out
771                    // more information about the vCPU state
772                    let ex_info = match m.to_exception_info().map_err(mshv_ioctls::MshvError::from)
773                    {
774                        Ok(info) => info,
775                        Err(e) => {
776                            log_then_return!("Error converting to exception info: {:?}", e);
777                        }
778                    };
779
780                    match self.get_stop_reason(ex_info) {
781                        Ok(reason) => HyperlightExit::Debug(reason),
782                        Err(e) => {
783                            log_then_return!("Error getting stop reason: {:?}", e);
784                        }
785                    }
786                }
787                other => {
788                    crate::debug!("mshv Other Exit: Exit: {:#?} \n {:#?}", other, &self);
789                    #[cfg(crashdump)]
790                    let _ = crashdump::generate_crashdump(self);
791                    log_then_return!("unknown Hyper-V run message type {:?}", other);
792                }
793            },
794            Err(e) => match e.errno() {
795                // We send a signal (SIGRTMIN+offset) to interrupt the vcpu, which causes EINTR
796                libc::EINTR => {
797                    // Check if cancellation was requested for THIS specific generation.
798                    // If not, the EINTR came from:
799                    // - A debug interrupt (if GDB is enabled)
800                    // - A stale signal from a previous guest call (generation mismatch)
801                    // - A signal meant for a different sandbox on the same thread
802                    // In these cases, we return Retry to continue execution.
803                    if cancel_requested {
804                        interrupt_handle_internal.clear_cancel_requested();
805                        HyperlightExit::Cancelled()
806                    } else {
807                        #[cfg(gdb)]
808                        if debug_interrupt {
809                            self.interrupt_handle
810                                .debug_interrupt
811                                .store(false, Ordering::Relaxed);
812
813                            // If the vCPU was stopped because of an interrupt, we need to
814                            // return a special exit reason so that the gdb thread can handle it
815                            // and resume execution
816                            HyperlightExit::Debug(VcpuStopReason::Interrupt)
817                        } else {
818                            HyperlightExit::Retry()
819                        }
820
821                        #[cfg(not(gdb))]
822                        HyperlightExit::Retry()
823                    }
824                }
825                libc::EAGAIN => HyperlightExit::Retry(),
826                _ => {
827                    crate::debug!("mshv Error - Details: Error: {} \n {:#?}", e, &self);
828                    log_then_return!("Error running VCPU {:?}", e);
829                }
830            },
831        };
832        Ok(result)
833    }
834
835    fn regs(&self) -> Result<super::regs::CommonRegisters> {
836        let mshv_regs = self.vcpu_fd.get_regs()?;
837        Ok((&mshv_regs).into())
838    }
839
840    fn set_regs(&mut self, regs: &super::regs::CommonRegisters) -> Result<()> {
841        let mshv_regs: StandardRegisters = regs.into();
842        self.vcpu_fd.set_regs(&mshv_regs)?;
843        Ok(())
844    }
845
846    fn fpu(&self) -> Result<super::regs::CommonFpu> {
847        let mshv_fpu = self.vcpu_fd.get_fpu()?;
848        Ok((&mshv_fpu).into())
849    }
850
851    fn set_fpu(&mut self, fpu: &super::regs::CommonFpu) -> Result<()> {
852        let mshv_fpu: FloatingPointUnit = fpu.into();
853        self.vcpu_fd.set_fpu(&mshv_fpu)?;
854        Ok(())
855    }
856
857    fn sregs(&self) -> Result<super::regs::CommonSpecialRegisters> {
858        let mshv_sregs = self.vcpu_fd.get_sregs()?;
859        Ok((&mshv_sregs).into())
860    }
861
862    fn set_sregs(&mut self, sregs: &super::regs::CommonSpecialRegisters) -> Result<()> {
863        let mshv_sregs: SpecialRegisters = sregs.into();
864        self.vcpu_fd.set_sregs(&mshv_sregs)?;
865        Ok(())
866    }
867
868    #[instrument(skip_all, parent = Span::current(), level = "Trace")]
869    fn as_mut_hypervisor(&mut self) -> &mut dyn Hypervisor {
870        self as &mut dyn Hypervisor
871    }
872
873    fn interrupt_handle(&self) -> Arc<dyn super::InterruptHandleInternal> {
874        self.interrupt_handle.clone()
875    }
876
877    #[cfg(crashdump)]
878    fn crashdump_context(&self) -> Result<Option<super::crashdump::CrashDumpContext>> {
879        if self.rt_cfg.guest_core_dump {
880            let mut regs = [0; 27];
881
882            let vcpu_regs = self.vcpu_fd.get_regs()?;
883            let sregs = self.vcpu_fd.get_sregs()?;
884            let xsave = self.vcpu_fd.get_xsave()?;
885
886            // Set up the registers for the crash dump
887            regs[0] = vcpu_regs.r15; // r15
888            regs[1] = vcpu_regs.r14; // r14
889            regs[2] = vcpu_regs.r13; // r13
890            regs[3] = vcpu_regs.r12; // r12
891            regs[4] = vcpu_regs.rbp; // rbp
892            regs[5] = vcpu_regs.rbx; // rbx
893            regs[6] = vcpu_regs.r11; // r11
894            regs[7] = vcpu_regs.r10; // r10
895            regs[8] = vcpu_regs.r9; // r9
896            regs[9] = vcpu_regs.r8; // r8
897            regs[10] = vcpu_regs.rax; // rax
898            regs[11] = vcpu_regs.rcx; // rcx
899            regs[12] = vcpu_regs.rdx; // rdx
900            regs[13] = vcpu_regs.rsi; // rsi
901            regs[14] = vcpu_regs.rdi; // rdi
902            regs[15] = 0; // orig rax
903            regs[16] = vcpu_regs.rip; // rip
904            regs[17] = sregs.cs.selector as u64; // cs
905            regs[18] = vcpu_regs.rflags; // eflags
906            regs[19] = vcpu_regs.rsp; // rsp
907            regs[20] = sregs.ss.selector as u64; // ss
908            regs[21] = sregs.fs.base; // fs_base
909            regs[22] = sregs.gs.base; // gs_base
910            regs[23] = sregs.ds.selector as u64; // ds
911            regs[24] = sregs.es.selector as u64; // es
912            regs[25] = sregs.fs.selector as u64; // fs
913            regs[26] = sregs.gs.selector as u64; // gs
914
915            // Get the filename from the binary path
916            let filename = self.rt_cfg.binary_path.clone().and_then(|path| {
917                Path::new(&path)
918                    .file_name()
919                    .and_then(|name| name.to_os_string().into_string().ok())
920            });
921
922            // Include both initial sandbox regions and dynamically mapped regions
923            let mut regions: Vec<MemoryRegion> = self.sandbox_regions.clone();
924            regions.extend(self.mmap_regions.iter().cloned());
925            Ok(Some(crashdump::CrashDumpContext::new(
926                regions,
927                regs,
928                xsave.buffer.to_vec(),
929                self.entrypoint,
930                self.rt_cfg.binary_path.clone(),
931                filename,
932            )))
933        } else {
934            Ok(None)
935        }
936    }
937
    /// Service a vCPU stop while a GDB session is attached.
    ///
    /// Reports `stop_reason` to the GDB thread over the debug channel, then
    /// loops receiving and answering debugger requests until the debugger
    /// resumes execution (Continue/Step) or detaches. A `Crash` stop is
    /// handled read-only: resume and state-mutating requests are denied so
    /// the debugger can only inspect the crashed guest.
    ///
    /// Returns an error if debugging was never enabled on this driver, if
    /// the debug channel breaks, or if request processing hits a fatal
    /// (non-address-translation) error.
    #[cfg(gdb)]
    fn handle_debug(
        &mut self,
        dbg_mem_access_fn: Arc<Mutex<SandboxMemoryManager<HostSharedMemory>>>,
        stop_reason: VcpuStopReason,
    ) -> Result<()> {
        if self.debug.is_none() {
            return Err(new_error!("Debugging is not enabled"));
        }

        // Memory accessor handed to request processing; carries the
        // dynamically mapped guest regions alongside the sandbox memory
        // manager so debugger reads can reach both.
        let mem_access = DebugMemoryAccess {
            dbg_mem_access_fn,
            guest_mmap_regions: self.mmap_regions.to_vec(),
        };

        match stop_reason {
            // If the vCPU stopped because of a crash, we need to handle it differently
            // We do not want to allow resuming execution or placing breakpoints
            // because the guest has crashed.
            // We only allow reading registers and memory
            VcpuStopReason::Crash => {
                self.send_dbg_msg(DebugResponse::VcpuStopped(stop_reason))
                    .map_err(|e| {
                        new_error!("Couldn't signal vCPU stopped event to GDB thread: {:?}", e)
                    })?;

                loop {
                    log::debug!("Debug wait for event to resume vCPU");
                    // Wait for a message from gdb
                    let req = self.recv_dbg_msg()?;

                    // Flag to store if we should deny continue or step requests
                    let mut deny_continue = false;
                    // Flag to store if we should detach from the gdb session
                    let mut detach = false;

                    let response = match req {
                        // Allow the detach request to disable debugging by continuing resuming
                        // hypervisor crash error reporting
                        DebugMsg::DisableDebug => {
                            detach = true;
                            DebugResponse::DisableDebug
                        }
                        // Do not allow continue or step requests
                        DebugMsg::Continue | DebugMsg::Step => {
                            deny_continue = true;
                            DebugResponse::NotAllowed
                        }
                        // Do not allow adding/removing breakpoints and writing to memory or registers
                        DebugMsg::AddHwBreakpoint(_)
                        | DebugMsg::AddSwBreakpoint(_)
                        | DebugMsg::RemoveHwBreakpoint(_)
                        | DebugMsg::RemoveSwBreakpoint(_)
                        | DebugMsg::WriteAddr(_, _)
                        | DebugMsg::WriteRegisters(_) => DebugResponse::NotAllowed,

                        // For all other requests, we will process them normally
                        _ => {
                            let result = self.process_dbg_request(req, &mem_access);
                            match result {
                                Ok(response) => response,
                                Err(HyperlightError::TranslateGuestAddress(_)) => {
                                    // Treat non fatal errors separately so the guest doesn't fail
                                    DebugResponse::ErrorOccurred
                                }
                                Err(e) => {
                                    log::error!("Error processing debug request: {:?}", e);
                                    return Err(e);
                                }
                            }
                        }
                    };

                    // Send the response to the request back to gdb
                    self.send_dbg_msg(response)
                        .map_err(|e| new_error!("Couldn't send response to gdb: {:?}", e))?;

                    // If we are denying continue or step requests, the debugger assumes the
                    // execution started so we need to report a stop reason as a crash and let
                    // it request to read registers/memory to figure out what happened
                    if deny_continue {
                        self.send_dbg_msg(DebugResponse::VcpuStopped(VcpuStopReason::Crash))
                            .map_err(|e| new_error!("Couldn't send response to gdb: {:?}", e))?;
                    }

                    // If we are detaching, we will break the loop and the Hypervisor will continue
                    // to handle the Crash reason
                    if detach {
                        break;
                    }
                }
            }
            // If the vCPU stopped because of any other reason except a crash, we can handle it
            // normally
            _ => {
                // Send the stop reason to the gdb thread
                self.send_dbg_msg(DebugResponse::VcpuStopped(stop_reason))
                    .map_err(|e| {
                        new_error!("Couldn't signal vCPU stopped event to GDB thread: {:?}", e)
                    })?;

                loop {
                    log::debug!("Debug wait for event to resume vCPU");
                    // Wait for a message from gdb
                    let req = self.recv_dbg_msg()?;

                    let result = self.process_dbg_request(req, &mem_access);

                    let response = match result {
                        Ok(response) => response,
                        // Treat non fatal errors separately so the guest doesn't fail
                        Err(HyperlightError::TranslateGuestAddress(_)) => {
                            DebugResponse::ErrorOccurred
                        }
                        Err(e) => {
                            return Err(e);
                        }
                    };

                    let cont = matches!(
                        response,
                        DebugResponse::Continue | DebugResponse::Step | DebugResponse::DisableDebug
                    );

                    self.send_dbg_msg(response)
                        .map_err(|e| new_error!("Couldn't send response to gdb: {:?}", e))?;

                    // Check if we should continue execution
                    // We continue if the response is one of the following: Step, Continue, or DisableDebug
                    if cont {
                        break;
                    }
                }
            }
        }

        Ok(())
    }
1076
1077    fn check_stack_guard(&self) -> Result<bool> {
1078        if let Some(mgr) = self.mem_mgr.as_ref() {
1079            mgr.check_stack_guard()
1080        } else {
1081            Err(new_error!("Memory manager is not initialized"))
1082        }
1083    }
1084
1085    #[cfg(feature = "trace_guest")]
1086    fn handle_trace(&mut self, tc: &mut crate::sandbox::trace::TraceContext) -> Result<()> {
1087        let regs = self.regs()?;
1088        tc.handle_trace(
1089            &regs,
1090            self.mem_mgr.as_mut().ok_or_else(|| {
1091                new_error!("Memory manager is not initialized before handling trace")
1092            })?,
1093        )
1094    }
1095
    /// Mutable access to the memory-profiling trace state for this driver.
    #[cfg(feature = "mem_profile")]
    fn trace_info_mut(&mut self) -> &mut MemTraceInfo {
        &mut self.trace_info
    }
1100}
1101
1102impl Drop for HypervLinuxDriver {
1103    #[instrument(skip_all, parent = Span::current(), level = "Trace")]
1104    fn drop(&mut self) {
1105        self.interrupt_handle.dropped.store(true, Ordering::Relaxed);
1106        for region in self.sandbox_regions.iter().chain(self.mmap_regions.iter()) {
1107            let mshv_region: mshv_user_mem_region = region.to_owned().into();
1108            match self.vm_fd.unmap_user_memory(mshv_region) {
1109                Ok(_) => (),
1110                Err(e) => error!("Failed to unmap user memory in HyperVOnLinux ({:?})", e),
1111            }
1112        }
1113    }
1114}
1115
#[cfg(test)]
mod tests {
    use super::*;
    #[cfg(feature = "mem_profile")]
    use crate::mem::exe::DummyUnwindInfo;
    use crate::mem::memory_region::MemoryRegionVecBuilder;
    use crate::mem::shared_mem::{ExclusiveSharedMemory, SharedMemory};

    // Tiny guest program used by the driver tests: adds two registers,
    // writes the ASCII result followed by a NUL byte to I/O port 0x3f8,
    // then halts.
    #[rustfmt::skip]
    const CODE: [u8; 12] = [
        0xba, 0xf8, 0x03, /* mov $0x3f8, %dx */
        0x00, 0xd8, /* add %bl, %al */
        0x04, b'0', /* add $'0', %al */
        0xee, /* out %al, (%dx) */
        /* send a 0 to indicate we're done */
        0xb0, b'\0', /* mov $'\0', %al */
        0xee, /* out %al, (%dx) */
        0xf4, /* HLT */
    ];

    // Allocates `mem_size` bytes of exclusive shared memory and copies
    // `code` into it at `load_offset`.
    //
    // Errors if `load_offset` lies beyond the end of the allocation.
    fn shared_mem_with_code(
        code: &[u8],
        mem_size: usize,
        load_offset: usize,
    ) -> Result<Box<ExclusiveSharedMemory>> {
        if load_offset > mem_size {
            log_then_return!(
                "code load offset ({}) > memory size ({})",
                load_offset,
                mem_size
            );
        }
        let mut shared_mem = ExclusiveSharedMemory::new(mem_size)?;
        shared_mem.copy_from_slice(code, load_offset)?;
        Ok(Box::new(shared_mem))
    }

    // Smoke test: constructing the driver over a single RWX code region
    // should succeed when an mshv hypervisor is available on the host.
    #[test]
    fn create_driver() {
        if !super::is_hypervisor_present() {
            // Skip silently on hosts without mshv support.
            return;
        }
        const MEM_SIZE: usize = 0x3000;
        let gm = shared_mem_with_code(CODE.as_slice(), MEM_SIZE, 0).unwrap();
        let rsp_ptr = GuestPtr::try_from(0).unwrap();
        let pml4_ptr = GuestPtr::try_from(0).unwrap();
        let entrypoint_ptr = GuestPtr::try_from(0).unwrap();
        let mut regions = MemoryRegionVecBuilder::new(0, gm.base_addr());
        regions.push_page_aligned(
            MEM_SIZE,
            MemoryRegionFlags::READ | MemoryRegionFlags::WRITE | MemoryRegionFlags::EXECUTE,
            crate::mem::memory_region::MemoryRegionType::Code,
        );
        let config: SandboxConfiguration = Default::default();

        super::HypervLinuxDriver::new(
            regions.build(),
            entrypoint_ptr,
            rsp_ptr,
            pml4_ptr,
            &config,
            #[cfg(gdb)]
            None,
            #[cfg(crashdump)]
            SandboxRuntimeConfig {
                #[cfg(crashdump)]
                binary_path: None,
                #[cfg(gdb)]
                debug_info: None,
                #[cfg(crashdump)]
                guest_core_dump: true,
            },
            #[cfg(feature = "mem_profile")]
            MemTraceInfo::new(Arc::new(DummyUnwindInfo {})).unwrap(),
        )
        .unwrap();
    }
}