hyperlight_host/hypervisor/
kvm.rs

1/*
2Copyright 2025  The Hyperlight Authors.
3
4Licensed under the Apache License, Version 2.0 (the "License");
5you may not use this file except in compliance with the License.
6You may obtain a copy of the License at
7
8    http://www.apache.org/licenses/LICENSE-2.0
9
10Unless required by applicable law or agreed to in writing, software
11distributed under the License is distributed on an "AS IS" BASIS,
12WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13See the License for the specific language governing permissions and
14limitations under the License.
15*/
16
17use std::convert::TryFrom;
18use std::fmt::Debug;
19use std::sync::atomic::{AtomicBool, AtomicU64, Ordering};
20use std::sync::{Arc, Mutex};
21
22use kvm_bindings::{kvm_fpu, kvm_regs, kvm_userspace_memory_region};
23use kvm_ioctls::Cap::UserMemory;
24use kvm_ioctls::{Kvm, VcpuExit, VcpuFd, VmFd};
25use log::LevelFilter;
26use tracing::{Span, instrument};
27#[cfg(crashdump)]
28use {super::crashdump, std::path::Path};
29
30#[cfg(feature = "trace_guest")]
31use super::TraceRegister;
32use super::fpu::{FP_CONTROL_WORD_DEFAULT, FP_TAG_WORD_DEFAULT, MXCSR_DEFAULT};
33#[cfg(gdb)]
34use super::gdb::{DebugCommChannel, DebugMsg, DebugResponse, GuestDebug, KvmDebug, VcpuStopReason};
35#[cfg(gdb)]
36use super::handlers::DbgMemAccessHandlerWrapper;
37#[cfg(feature = "init-paging")]
38use super::{
39    CR0_AM, CR0_ET, CR0_MP, CR0_NE, CR0_PE, CR0_PG, CR0_WP, CR4_OSFXSR, CR4_OSXMMEXCPT, CR4_PAE,
40    EFER_LMA, EFER_LME, EFER_NX, EFER_SCE,
41};
42use super::{HyperlightExit, Hypervisor, InterruptHandle, LinuxInterruptHandle, VirtualCPU};
43#[cfg(gdb)]
44use crate::HyperlightError;
45use crate::hypervisor::get_memory_access_violation;
46use crate::mem::memory_region::{MemoryRegion, MemoryRegionFlags};
47use crate::mem::ptr::{GuestPtr, RawPtr};
48use crate::mem::shared_mem::HostSharedMemory;
49use crate::sandbox::SandboxConfiguration;
50#[cfg(feature = "trace_guest")]
51use crate::sandbox::TraceInfo;
52use crate::sandbox::host_funcs::FunctionRegistry;
53use crate::sandbox::mem_mgr::MemMgrWrapper;
54use crate::sandbox::outb::handle_outb;
55#[cfg(crashdump)]
56use crate::sandbox::uninitialized::SandboxRuntimeConfig;
57use crate::{Result, log_then_return, new_error};
58
59/// Return `true` if the KVM API is available, version 12, and has UserMemory capability, or `false` otherwise
60#[instrument(skip_all, parent = Span::current(), level = "Trace")]
61pub(crate) fn is_hypervisor_present() -> bool {
62    if let Ok(kvm) = Kvm::new() {
63        let api_version = kvm.get_api_version();
64        match api_version {
65            version if version == 12 && kvm.check_extension(UserMemory) => true,
66            12 => {
67                log::info!("KVM does not have KVM_CAP_USER_MEMORY capability");
68                false
69            }
70            version => {
71                log::info!("KVM GET_API_VERSION returned {}, expected 12", version);
72                false
73            }
74        }
75    } else {
76        log::info!("KVM is not available on this system");
77        false
78    }
79}
80
#[cfg(gdb)]
/// GDB-debugging support for [`KVMDriver`].
///
/// Implements the request/response plumbing between the vCPU thread and the
/// gdb thread: dispatching [`DebugMsg`] requests against the vCPU's debug
/// state and exchanging messages over the driver's `gdb_conn` channel.
mod debug {
    use std::sync::{Arc, Mutex};

    use kvm_bindings::kvm_debug_exit_arch;

    use super::KVMDriver;
    use crate::hypervisor::gdb::{
        DebugMsg, DebugResponse, GuestDebug, KvmDebug, VcpuStopReason, X86_64Regs,
    };
    use crate::hypervisor::handlers::DbgMemAccessHandlerCaller;
    use crate::{Result, new_error};

    impl KVMDriver {
        /// Resets the debug information to disable debugging
        ///
        /// Replaces the current debug state with a fresh default and switches
        /// single-stepping off on the vCPU. Note that `self.debug` remains
        /// `Some(..)` afterwards — it holds cleared (default) settings rather
        /// than `None`.
        fn disable_debug(&mut self) -> Result<()> {
            let mut debug = KvmDebug::default();

            debug.set_single_step(&self.vcpu_fd, false)?;

            self.debug = Some(debug);

            Ok(())
        }

        /// Get the reason the vCPU has stopped
        ///
        /// Delegates to the active [`KvmDebug`] state, passing the
        /// architecture-specific exit info from KVM and the guest entrypoint.
        /// Errors if debugging was never enabled on this driver.
        pub(crate) fn get_stop_reason(
            &mut self,
            debug_exit: kvm_debug_exit_arch,
        ) -> Result<VcpuStopReason> {
            let debug = self
                .debug
                .as_mut()
                .ok_or_else(|| new_error!("Debug is not enabled"))?;

            debug.get_stop_reason(&self.vcpu_fd, debug_exit, self.entrypoint)
        }

        /// Dispatch a single debug request from the gdb thread.
        ///
        /// Maps each [`DebugMsg`] variant onto the corresponding operation on
        /// the vCPU's debug state and returns the matching [`DebugResponse`].
        /// Breakpoint add/remove requests report success as a boolean instead
        /// of propagating the error; other failures are logged and returned.
        /// `dbg_mem_access_fn` provides guest-memory access for operations
        /// that need it (sw breakpoints, reads/writes, code offset).
        pub(crate) fn process_dbg_request(
            &mut self,
            req: DebugMsg,
            dbg_mem_access_fn: Arc<Mutex<dyn DbgMemAccessHandlerCaller>>,
        ) -> Result<DebugResponse> {
            if let Some(debug) = self.debug.as_mut() {
                match req {
                    // Hw breakpoint outcome is reported as a bool, not an Err.
                    DebugMsg::AddHwBreakpoint(addr) => Ok(DebugResponse::AddHwBreakpoint(
                        debug
                            .add_hw_breakpoint(&self.vcpu_fd, addr)
                            .map_err(|e| {
                                log::error!("Failed to add hw breakpoint: {:?}", e);

                                e
                            })
                            .is_ok(),
                    )),
                    // Sw breakpoints need guest memory access to patch code.
                    DebugMsg::AddSwBreakpoint(addr) => Ok(DebugResponse::AddSwBreakpoint(
                        debug
                            .add_sw_breakpoint(&self.vcpu_fd, addr, dbg_mem_access_fn)
                            .map_err(|e| {
                                log::error!("Failed to add sw breakpoint: {:?}", e);

                                e
                            })
                            .is_ok(),
                    )),
                    // Continue: ensure single-step is off, then resume.
                    DebugMsg::Continue => {
                        debug.set_single_step(&self.vcpu_fd, false).map_err(|e| {
                            log::error!("Failed to continue execution: {:?}", e);

                            e
                        })?;

                        Ok(DebugResponse::Continue)
                    }
                    DebugMsg::DisableDebug => {
                        self.disable_debug().map_err(|e| {
                            log::error!("Failed to disable debugging: {:?}", e);

                            e
                        })?;

                        Ok(DebugResponse::DisableDebug)
                    }
                    DebugMsg::GetCodeSectionOffset => {
                        let offset = dbg_mem_access_fn
                            .try_lock()
                            .map_err(|e| {
                                new_error!("Error locking at {}:{}: {}", file!(), line!(), e)
                            })?
                            .get_code_offset()
                            .map_err(|e| {
                                log::error!("Failed to get code offset: {:?}", e);

                                e
                            })?;

                        Ok(DebugResponse::GetCodeSectionOffset(offset as u64))
                    }
                    DebugMsg::ReadAddr(addr, len) => {
                        // Buffer is sized by the request; read_addrs fills it.
                        let mut data = vec![0u8; len];

                        debug
                            .read_addrs(&self.vcpu_fd, addr, &mut data, dbg_mem_access_fn)
                            .map_err(|e| {
                                log::error!("Failed to read from address: {:?}", e);

                                e
                            })?;

                        Ok(DebugResponse::ReadAddr(data))
                    }
                    DebugMsg::ReadRegisters => {
                        let mut regs = X86_64Regs::default();

                        debug
                            .read_regs(&self.vcpu_fd, &mut regs)
                            .map_err(|e| {
                                log::error!("Failed to read registers: {:?}", e);

                                e
                            })
                            .map(|_| DebugResponse::ReadRegisters(regs))
                    }
                    DebugMsg::RemoveHwBreakpoint(addr) => Ok(DebugResponse::RemoveHwBreakpoint(
                        debug
                            .remove_hw_breakpoint(&self.vcpu_fd, addr)
                            .map_err(|e| {
                                log::error!("Failed to remove hw breakpoint: {:?}", e);

                                e
                            })
                            .is_ok(),
                    )),
                    DebugMsg::RemoveSwBreakpoint(addr) => Ok(DebugResponse::RemoveSwBreakpoint(
                        debug
                            .remove_sw_breakpoint(&self.vcpu_fd, addr, dbg_mem_access_fn)
                            .map_err(|e| {
                                log::error!("Failed to remove sw breakpoint: {:?}", e);

                                e
                            })
                            .is_ok(),
                    )),
                    // Step: enable single-step so the next run exits after
                    // one instruction.
                    DebugMsg::Step => {
                        debug.set_single_step(&self.vcpu_fd, true).map_err(|e| {
                            log::error!("Failed to enable step instruction: {:?}", e);

                            e
                        })?;

                        Ok(DebugResponse::Step)
                    }
                    DebugMsg::WriteAddr(addr, data) => {
                        debug
                            .write_addrs(&self.vcpu_fd, addr, &data, dbg_mem_access_fn)
                            .map_err(|e| {
                                log::error!("Failed to write to address: {:?}", e);

                                e
                            })?;

                        Ok(DebugResponse::WriteAddr)
                    }
                    DebugMsg::WriteRegisters(regs) => debug
                        .write_regs(&self.vcpu_fd, &regs)
                        .map_err(|e| {
                            log::error!("Failed to write registers: {:?}", e);

                            e
                        })
                        .map(|_| DebugResponse::WriteRegisters),
                }
            } else {
                Err(new_error!("Debugging is not enabled"))
            }
        }

        /// Block until the gdb thread sends the next debug request.
        ///
        /// Errors if debugging is not enabled or the channel receive fails.
        pub(crate) fn recv_dbg_msg(&mut self) -> Result<DebugMsg> {
            let gdb_conn = self
                .gdb_conn
                .as_mut()
                .ok_or_else(|| new_error!("Debug is not enabled"))?;

            gdb_conn.recv().map_err(|e| {
                new_error!(
                    "Got an error while waiting to receive a message from the gdb thread: {:?}",
                    e
                )
            })
        }

        /// Send a response back to the gdb thread.
        ///
        /// Errors if debugging is not enabled or the channel send fails.
        pub(crate) fn send_dbg_msg(&mut self, cmd: DebugResponse) -> Result<()> {
            log::debug!("Sending {:?}", cmd);

            let gdb_conn = self
                .gdb_conn
                .as_mut()
                .ok_or_else(|| new_error!("Debug is not enabled"))?;

            gdb_conn.send(cmd).map_err(|e| {
                new_error!(
                    "Got an error while sending a response message to the gdb thread: {:?}",
                    e
                )
            })
        }
    }
}
289
/// A Hypervisor driver for KVM on Linux
pub(crate) struct KVMDriver {
    // Keeps the /dev/kvm handle alive for the lifetime of the vm/vcpu fds.
    _kvm: Kvm,
    // The KVM VM file descriptor; used for memory-region (slot) management.
    vm_fd: VmFd,
    // Guest page size; set from the `page_size` argument in `initialise`
    // (0 until then) and used by `map_region` for alignment checks.
    page_size: usize,
    // The single vCPU backing this sandbox.
    vcpu_fd: VcpuFd,
    // Guest-physical address execution starts at (RIP for `initialise`).
    entrypoint: u64,
    // Original guest stack pointer; RSP is reset to this on each dispatch.
    orig_rsp: GuestPtr,
    // Shared handle used to interrupt/cancel vCPU execution from other threads.
    interrupt_handle: Arc<LinuxInterruptHandle>,
    // Populated by `initialise`; `None` before that.
    mem_mgr: Option<MemMgrWrapper<HostSharedMemory>>,
    // Host function registry for outb-based host calls; set in `initialise`.
    host_funcs: Option<Arc<Mutex<FunctionRegistry>>>,

    sandbox_regions: Vec<MemoryRegion>, // Initially mapped regions when sandbox is created
    mmap_regions: Vec<(MemoryRegion, u32)>, // Later mapped regions (region, slot number)
    next_slot: u32,                     // Monotonically increasing slot number
    freed_slots: Vec<u32>,              // Reusable slots from unmapped regions

    // Debug state for gdb support; `Some` only when a gdb connection exists.
    #[cfg(gdb)]
    debug: Option<KvmDebug>,
    // Channel to the gdb thread (responses out, requests in).
    #[cfg(gdb)]
    gdb_conn: Option<DebugCommChannel<DebugResponse, DebugMsg>>,
    // Runtime configuration used when producing crash dumps.
    #[cfg(crashdump)]
    rt_cfg: SandboxRuntimeConfig,
    // Guest-execution tracing state.
    #[cfg(feature = "trace_guest")]
    #[allow(dead_code)]
    trace_info: TraceInfo,
}
317
impl KVMDriver {
    /// Create a new instance of a `KVMDriver`, with only control registers
    /// set. Standard registers will not be set, and `initialise` must
    /// be called to do so.
    ///
    /// Opens KVM, creates a VM and one vCPU, registers each of `mem_regions`
    /// in its own memory slot (slot i = region i), configures the special
    /// registers from `pml4_addr`, and wires up the interrupt handle and —
    /// when enabled — gdb debugging (a hw breakpoint is placed on
    /// `entrypoint`) and crashdump/trace state.
    #[allow(clippy::too_many_arguments)]
    // TODO: refactor this function to take fewer arguments. Add trace_info to rt_cfg
    #[instrument(err(Debug), skip_all, parent = Span::current(), level = "Trace")]
    pub(crate) fn new(
        mem_regions: Vec<MemoryRegion>,
        pml4_addr: u64,
        entrypoint: u64,
        rsp: u64,
        config: &SandboxConfiguration,
        #[cfg(gdb)] gdb_conn: Option<DebugCommChannel<DebugResponse, DebugMsg>>,
        #[cfg(crashdump)] rt_cfg: SandboxRuntimeConfig,
        #[cfg(feature = "trace_guest")] trace_info: TraceInfo,
    ) -> Result<Self> {
        let kvm = Kvm::new()?;

        // Type 0 is the default VM type.
        let vm_fd = kvm.create_vm_with_type(0)?;

        // Register every initial sandbox region, one KVM slot per region,
        // numbered by position. `next_slot` below continues from here.
        mem_regions.iter().enumerate().try_for_each(|(i, region)| {
            let mut kvm_region: kvm_userspace_memory_region = region.clone().into();
            kvm_region.slot = i as u32;
            // SAFETY-relevant: the host memory backing each region must stay
            // valid while registered with KVM.
            unsafe { vm_fd.set_user_memory_region(kvm_region) }
        })?;

        let mut vcpu_fd = vm_fd.create_vcpu(0)?;
        Self::setup_initial_sregs(&mut vcpu_fd, pml4_addr)?;

        // Debugging is enabled iff a gdb connection was handed in.
        #[cfg(gdb)]
        let (debug, gdb_conn) = if let Some(gdb_conn) = gdb_conn {
            let mut debug = KvmDebug::new();
            // Add breakpoint to the entry point address
            debug.add_hw_breakpoint(&vcpu_fd, entrypoint)?;

            (Some(debug), Some(gdb_conn))
        } else {
            (None, None)
        };

        let rsp_gp = GuestPtr::try_from(RawPtr::from(rsp))?;

        let interrupt_handle = Arc::new(LinuxInterruptHandle {
            running: AtomicU64::new(0),
            cancel_requested: AtomicBool::new(false),
            #[cfg(gdb)]
            debug_interrupt: AtomicBool::new(false),
            // The `as u64` cast is needed only on musl, where pthread_self()
            // returns a different type than on glibc targets.
            #[cfg(all(
                target_arch = "x86_64",
                target_vendor = "unknown",
                target_os = "linux",
                target_env = "musl"
            ))]
            tid: AtomicU64::new(unsafe { libc::pthread_self() as u64 }),
            #[cfg(not(all(
                target_arch = "x86_64",
                target_vendor = "unknown",
                target_os = "linux",
                target_env = "musl"
            )))]
            tid: AtomicU64::new(unsafe { libc::pthread_self() }),
            retry_delay: config.get_interrupt_retry_delay(),
            dropped: AtomicBool::new(false),
            sig_rt_min_offset: config.get_interrupt_vcpu_sigrtmin_offset(),
        });

        // `mut` is only needed by the gdb-enabled build below.
        #[allow(unused_mut)]
        let mut hv = Self {
            _kvm: kvm,
            vm_fd,
            // page_size is filled in later by `initialise`.
            page_size: 0,
            vcpu_fd,
            entrypoint,
            orig_rsp: rsp_gp,
            // Dynamic mappings get slots after the initial regions'.
            next_slot: mem_regions.len() as u32,
            sandbox_regions: mem_regions,
            mmap_regions: Vec::new(),
            freed_slots: Vec::new(),
            interrupt_handle: interrupt_handle.clone(),
            mem_mgr: None,
            host_funcs: None,
            #[cfg(gdb)]
            debug,
            #[cfg(gdb)]
            gdb_conn,
            #[cfg(crashdump)]
            rt_cfg,
            #[cfg(feature = "trace_guest")]
            trace_info,
        };

        // Send the interrupt handle to the GDB thread if debugging is enabled
        // This is used to allow the GDB thread to stop the vCPU
        #[cfg(gdb)]
        if hv.debug.is_some() {
            hv.send_dbg_msg(DebugResponse::InterruptHandle(interrupt_handle))?;
        }

        Ok(hv)
    }

    /// Program the vCPU's special registers for the configured mode.
    ///
    /// With the "init-paging" feature: enables paging and IA-32e (64-bit)
    /// mode, with CR3 pointing at `_pml4_addr`. Without it: leaves the vCPU
    /// in its reset mode with a flat CS at base 0.
    #[instrument(err(Debug), skip_all, parent = Span::current(), level = "Trace")]
    fn setup_initial_sregs(vcpu_fd: &mut VcpuFd, _pml4_addr: u64) -> Result<()> {
        // setup paging and IA-32e (64-bit) mode
        let mut sregs = vcpu_fd.get_sregs()?;
        cfg_if::cfg_if! {
            if #[cfg(feature = "init-paging")] {
                sregs.cr3 = _pml4_addr;
                sregs.cr4 = CR4_PAE | CR4_OSFXSR | CR4_OSXMMEXCPT;
                sregs.cr0 = CR0_PE | CR0_MP | CR0_ET | CR0_NE | CR0_AM | CR0_PG | CR0_WP;
                sregs.efer = EFER_LME | EFER_LMA | EFER_SCE | EFER_NX;
                sregs.cs.l = 1; // required for 64-bit mode
            } else {
                sregs.cs.base = 0;
                sregs.cs.selector = 0;
            }
        }
        vcpu_fd.set_sregs(&sregs)?;
        Ok(())
    }
}
440
441impl Debug for KVMDriver {
442    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
443        let mut f = f.debug_struct("KVM Driver");
444        // Output each memory region
445
446        for region in &self.sandbox_regions {
447            f.field("Sandbox Memory Region", &region);
448        }
449        for region in &self.mmap_regions {
450            f.field("Mapped Memory Region", &region);
451        }
452        let regs = self.vcpu_fd.get_regs();
453        // check that regs is OK and then set field in debug struct
454
455        if let Ok(regs) = regs {
456            f.field("Registers", &regs);
457        }
458
459        let sregs = self.vcpu_fd.get_sregs();
460
461        // check that sregs is OK and then set field in debug struct
462
463        if let Ok(sregs) = sregs {
464            f.field("Special Registers", &sregs);
465        }
466
467        f.finish()
468    }
469}
470
471impl Hypervisor for KVMDriver {
472    /// Implementation of initialise for Hypervisor trait.
473    #[instrument(err(Debug), skip_all, parent = Span::current(), level = "Trace")]
474    fn initialise(
475        &mut self,
476        peb_addr: RawPtr,
477        seed: u64,
478        page_size: u32,
479        mem_mgr: MemMgrWrapper<HostSharedMemory>,
480        host_funcs: Arc<Mutex<FunctionRegistry>>,
481        max_guest_log_level: Option<LevelFilter>,
482        #[cfg(gdb)] dbg_mem_access_fn: DbgMemAccessHandlerWrapper,
483    ) -> Result<()> {
484        self.mem_mgr = Some(mem_mgr);
485        self.host_funcs = Some(host_funcs);
486        self.page_size = page_size as usize;
487
488        let max_guest_log_level: u64 = match max_guest_log_level {
489            Some(level) => level as u64,
490            None => self.get_max_log_level().into(),
491        };
492
493        let regs = kvm_regs {
494            rip: self.entrypoint,
495            rsp: self.orig_rsp.absolute()?,
496
497            // function args
498            rdi: peb_addr.into(),
499            rsi: seed,
500            rdx: page_size.into(),
501            rcx: max_guest_log_level,
502
503            ..Default::default()
504        };
505        self.vcpu_fd.set_regs(&regs)?;
506
507        VirtualCPU::run(
508            self.as_mut_hypervisor(),
509            #[cfg(gdb)]
510            dbg_mem_access_fn,
511        )
512    }
513
514    #[instrument(err(Debug), skip_all, parent = Span::current(), level = "Trace")]
515    unsafe fn map_region(&mut self, region: &MemoryRegion) -> Result<()> {
516        if [
517            region.guest_region.start,
518            region.guest_region.end,
519            region.host_region.start,
520            region.host_region.end,
521        ]
522        .iter()
523        .any(|x| x % self.page_size != 0)
524        {
525            log_then_return!(
526                "region is not page-aligned {:x}, {region:?}",
527                self.page_size
528            );
529        }
530
531        let mut kvm_region: kvm_userspace_memory_region = region.clone().into();
532
533        // Try to reuse a freed slot first, otherwise use next_slot
534        let slot = if let Some(freed_slot) = self.freed_slots.pop() {
535            freed_slot
536        } else {
537            let slot = self.next_slot;
538            self.next_slot += 1;
539            slot
540        };
541
542        kvm_region.slot = slot;
543        unsafe { self.vm_fd.set_user_memory_region(kvm_region) }?;
544        self.mmap_regions.push((region.to_owned(), slot));
545        Ok(())
546    }
547
548    #[instrument(err(Debug), skip_all, parent = Span::current(), level = "Trace")]
549    unsafe fn unmap_region(&mut self, region: &MemoryRegion) -> Result<()> {
550        if let Some(idx) = self.mmap_regions.iter().position(|(r, _)| r == region) {
551            let (region, slot) = self.mmap_regions.remove(idx);
552            let mut kvm_region: kvm_userspace_memory_region = region.into();
553            kvm_region.slot = slot;
554            // Setting memory_size to 0 unmaps the slot's region
555            // From https://docs.kernel.org/virt/kvm/api.html
556            // > Deleting a slot is done by passing zero for memory_size.
557            kvm_region.memory_size = 0;
558            unsafe { self.vm_fd.set_user_memory_region(kvm_region) }?;
559
560            // Add the freed slot to the reuse list
561            self.freed_slots.push(slot);
562
563            Ok(())
564        } else {
565            Err(new_error!("Tried to unmap region that is not mapped"))
566        }
567    }
568
569    fn get_mapped_regions(&self) -> Box<dyn ExactSizeIterator<Item = &MemoryRegion> + '_> {
570        Box::new(self.mmap_regions.iter().map(|(region, _)| region))
571    }
572
573    #[instrument(err(Debug), skip_all, parent = Span::current(), level = "Trace")]
574    fn dispatch_call_from_host(
575        &mut self,
576        dispatch_func_addr: RawPtr,
577        #[cfg(gdb)] dbg_mem_access_fn: DbgMemAccessHandlerWrapper,
578    ) -> Result<()> {
579        // Reset general purpose registers, then set RIP and RSP
580        let regs = kvm_regs {
581            rip: dispatch_func_addr.into(),
582            rsp: self.orig_rsp.absolute()?,
583            ..Default::default()
584        };
585        self.vcpu_fd.set_regs(&regs)?;
586
587        // reset fpu state
588        let fpu = kvm_fpu {
589            fcw: FP_CONTROL_WORD_DEFAULT,
590            ftwx: FP_TAG_WORD_DEFAULT,
591            mxcsr: MXCSR_DEFAULT,
592            ..Default::default() // zero out the rest
593        };
594        self.vcpu_fd.set_fpu(&fpu)?;
595
596        // run
597        VirtualCPU::run(
598            self.as_mut_hypervisor(),
599            #[cfg(gdb)]
600            dbg_mem_access_fn,
601        )?;
602
603        Ok(())
604    }
605
    /// Handle an `out` (port I/O) exit from the guest.
    ///
    /// Packs the first up-to-four bytes of `data` into a little-endian `u32`
    /// payload and forwards it, together with the port number, to
    /// `handle_outb`. Errors if the guest supplied no data. `_rip` and
    /// `_instruction_length` are unused because KVM advances RIP itself.
    #[instrument(err(Debug), skip_all, parent = Span::current(), level = "Trace")]
    fn handle_io(
        &mut self,
        port: u16,
        data: Vec<u8>,
        _rip: u64,
        _instruction_length: u64,
    ) -> Result<()> {
        // KVM does not need RIP or instruction length, as it automatically sets the RIP

        // The payload for handle_outb is built from the data bytes: we need
        // at least one byte, and we pack up to the first four of them
        // (zero-padded) into a little-endian u32. Extra bytes are ignored.
        if data.is_empty() {
            log_then_return!("no data was given in IO interrupt");
        } else {
            let mut padded = [0u8; 4];
            let copy_len = data.len().min(4);
            padded[..copy_len].copy_from_slice(&data[..copy_len]);
            let value = u32::from_le_bytes(padded);

            #[cfg(feature = "trace_guest")]
            {
                // We need to handle the borrow checker issue where we need both:
                // - &mut MemMgrWrapper (from self.mem_mgr.as_mut())
                // - &mut dyn Hypervisor (from self)
                // We'll use a temporary approach to extract the mem_mgr temporarily
                let mem_mgr_option = self.mem_mgr.take();
                let mut mem_mgr =
                    mem_mgr_option.ok_or_else(|| new_error!("mem_mgr not initialized"))?;
                let host_funcs = self
                    .host_funcs
                    .as_ref()
                    .ok_or_else(|| new_error!("host_funcs not initialized"))?
                    .clone();

                handle_outb(&mut mem_mgr, host_funcs, self, port, value)?;

                // Put the mem_mgr back
                self.mem_mgr = Some(mem_mgr);
            }

            #[cfg(not(feature = "trace_guest"))]
            {
                // Without tracing, no second &mut self borrow is needed, so
                // the mem_mgr can be borrowed in place.
                let mem_mgr = self
                    .mem_mgr
                    .as_mut()
                    .ok_or_else(|| new_error!("mem_mgr not initialized"))?;
                let host_funcs = self
                    .host_funcs
                    .as_ref()
                    .ok_or_else(|| new_error!("host_funcs not initialized"))?
                    .clone();

                handle_outb(mem_mgr, host_funcs, port, value)?;
            }
        }

        Ok(())
    }
666
667    #[instrument(err(Debug), skip_all, parent = Span::current(), level = "Trace")]
668    fn run(&mut self) -> Result<HyperlightExit> {
669        self.interrupt_handle
670            .tid
671            .store(unsafe { libc::pthread_self() as u64 }, Ordering::Relaxed);
672        // Note: if a `InterruptHandle::kill()` called while this thread is **here**
673        // Then this is fine since `cancel_requested` is set to true, so we will skip the `VcpuFd::run()` call
674        self.interrupt_handle
675            .set_running_and_increment_generation()
676            .map_err(|e| {
677                new_error!(
678                    "Error setting running state and incrementing generation: {}",
679                    e
680                )
681            })?;
682        #[cfg(not(gdb))]
683        let debug_interrupt = false;
684        #[cfg(gdb)]
685        let debug_interrupt = self
686            .interrupt_handle
687            .debug_interrupt
688            .load(Ordering::Relaxed);
689        // Don't run the vcpu if `cancel_requested` is true
690        //
691        // Note: if a `InterruptHandle::kill()` called while this thread is **here**
692        // Then this is fine since `cancel_requested` is set to true, so we will skip the `VcpuFd::run()` call
693        let exit_reason = if self
694            .interrupt_handle
695            .cancel_requested
696            .load(Ordering::Relaxed)
697            || debug_interrupt
698        {
699            Err(kvm_ioctls::Error::new(libc::EINTR))
700        } else {
701            #[cfg(feature = "trace_guest")]
702            if self.trace_info.guest_start_epoch.is_none() {
703                // Store the guest start epoch and cycles to trace the guest execution time
704                crate::debug!("KVM - Guest Start Epoch set");
705                self.trace_info.guest_start_epoch = Some(std::time::Instant::now());
706                self.trace_info.guest_start_tsc =
707                    Some(hyperlight_guest_tracing::invariant_tsc::read_tsc());
708            }
709
710            // Note: if a `InterruptHandle::kill()` called while this thread is **here**
711            // Then the vcpu will run, but we will keep sending signals to this thread
712            // to interrupt it until `running` is set to false. The `vcpu_fd::run()` call will
713            // return either normally with an exit reason, or from being "kicked" by out signal handler, with an EINTR error,
714            // both of which are fine.
715            self.vcpu_fd.run()
716        };
717        // Note: if a `InterruptHandle::kill()` called while this thread is **here**
718        // Then signals will be sent to this thread until `running` is set to false.
719        // This is fine since the signal handler is a no-op.
720        let cancel_requested = self
721            .interrupt_handle
722            .cancel_requested
723            .load(Ordering::Relaxed);
724        #[cfg(gdb)]
725        let debug_interrupt = self
726            .interrupt_handle
727            .debug_interrupt
728            .load(Ordering::Relaxed);
729        // Note: if a `InterruptHandle::kill()` called while this thread is **here**
730        // Then `cancel_requested` will be set to true again, which will cancel the **next vcpu run**.
731        // Additionally signals will be sent to this thread until `running` is set to false.
732        // This is fine since the signal handler is a no-op.
733        self.interrupt_handle.clear_running_bit();
734        // At this point, `running` is false so no more signals will be sent to this thread,
735        // but we may still receive async signals that were sent before this point.
736        // To prevent those signals from interrupting subsequent calls to `run()` (on other vms!),
737        // we make sure to check `cancel_requested` before cancelling (see `libc::EINTR` match-arm below).
738        let result = match exit_reason {
739            Ok(VcpuExit::Hlt) => {
740                crate::debug!("KVM - Halt Details : {:#?}", &self);
741                HyperlightExit::Halt()
742            }
743            Ok(VcpuExit::IoOut(port, data)) => {
744                // because vcpufd.run() mutably borrows self we cannot pass self to crate::debug! macro here
745                crate::debug!("KVM IO Details : \nPort : {}\nData : {:?}", port, data);
746                // KVM does not need to set RIP or instruction length so these are set to 0
747                HyperlightExit::IoOut(port, data.to_vec(), 0, 0)
748            }
749            Ok(VcpuExit::MmioRead(addr, _)) => {
750                crate::debug!("KVM MMIO Read -Details: Address: {} \n {:#?}", addr, &self);
751
752                match get_memory_access_violation(
753                    addr as usize,
754                    self.sandbox_regions
755                        .iter()
756                        .chain(self.mmap_regions.iter().map(|(r, _)| r)),
757                    MemoryRegionFlags::READ,
758                ) {
759                    Some(access_violation_exit) => access_violation_exit,
760                    None => HyperlightExit::Mmio(addr),
761                }
762            }
763            Ok(VcpuExit::MmioWrite(addr, _)) => {
764                crate::debug!("KVM MMIO Write -Details: Address: {} \n {:#?}", addr, &self);
765
766                match get_memory_access_violation(
767                    addr as usize,
768                    self.sandbox_regions
769                        .iter()
770                        .chain(self.mmap_regions.iter().map(|(r, _)| r)),
771                    MemoryRegionFlags::WRITE,
772                ) {
773                    Some(access_violation_exit) => access_violation_exit,
774                    None => HyperlightExit::Mmio(addr),
775                }
776            }
777            #[cfg(gdb)]
778            // KVM provides architecture specific information about the vCPU state when exiting
779            Ok(VcpuExit::Debug(debug_exit)) => match self.get_stop_reason(debug_exit) {
780                Ok(reason) => HyperlightExit::Debug(reason),
781                Err(e) => {
782                    log_then_return!("Error getting stop reason: {:?}", e);
783                }
784            },
785            Err(e) => match e.errno() {
786                // we send a signal to the thread to cancel execution this results in EINTR being returned by KVM so we return Cancelled
787                libc::EINTR => {
788                    // If cancellation was not requested for this specific vm, the vcpu was interrupted because of debug interrupt or
789                    // a stale signal that meant to be delivered to a previous/other vcpu on this same thread, so let's ignore it
790                    if cancel_requested {
791                        self.interrupt_handle
792                            .cancel_requested
793                            .store(false, Ordering::Relaxed);
794                        HyperlightExit::Cancelled()
795                    } else {
796                        #[cfg(gdb)]
797                        if debug_interrupt {
798                            self.interrupt_handle
799                                .debug_interrupt
800                                .store(false, Ordering::Relaxed);
801
802                            // If the vCPU was stopped because of an interrupt, we need to
803                            // return a special exit reason so that the gdb thread can handle it
804                            // and resume execution
805                            HyperlightExit::Debug(VcpuStopReason::Interrupt)
806                        } else {
807                            HyperlightExit::Retry()
808                        }
809
810                        #[cfg(not(gdb))]
811                        HyperlightExit::Retry()
812                    }
813                }
814                libc::EAGAIN => HyperlightExit::Retry(),
815                _ => {
816                    crate::debug!("KVM Error -Details: Address: {} \n {:#?}", e, &self);
817                    log_then_return!("Error running VCPU {:?}", e);
818                }
819            },
820            Ok(other) => {
821                let err_msg = format!("Unexpected KVM Exit {:?}", other);
822                crate::debug!("KVM Other Exit Details: {:#?}", &self);
823                HyperlightExit::Unknown(err_msg)
824            }
825        };
826        Ok(result)
827    }
828
829    #[instrument(skip_all, parent = Span::current(), level = "Trace")]
830    fn as_mut_hypervisor(&mut self) -> &mut dyn Hypervisor {
831        self as &mut dyn Hypervisor
832    }
833
834    fn interrupt_handle(&self) -> Arc<dyn InterruptHandle> {
835        self.interrupt_handle.clone()
836    }
837
838    #[cfg(crashdump)]
839    fn crashdump_context(&self) -> Result<Option<crashdump::CrashDumpContext>> {
840        if self.rt_cfg.guest_core_dump {
841            let mut regs = [0; 27];
842
843            let vcpu_regs = self.vcpu_fd.get_regs()?;
844            let sregs = self.vcpu_fd.get_sregs()?;
845            let xsave = self.vcpu_fd.get_xsave()?;
846
847            // Set the registers in the order expected by the crashdump context
848            regs[0] = vcpu_regs.r15; // r15
849            regs[1] = vcpu_regs.r14; // r14
850            regs[2] = vcpu_regs.r13; // r13
851            regs[3] = vcpu_regs.r12; // r12
852            regs[4] = vcpu_regs.rbp; // rbp
853            regs[5] = vcpu_regs.rbx; // rbx
854            regs[6] = vcpu_regs.r11; // r11
855            regs[7] = vcpu_regs.r10; // r10
856            regs[8] = vcpu_regs.r9; // r9
857            regs[9] = vcpu_regs.r8; // r8
858            regs[10] = vcpu_regs.rax; // rax
859            regs[11] = vcpu_regs.rcx; // rcx
860            regs[12] = vcpu_regs.rdx; // rdx
861            regs[13] = vcpu_regs.rsi; // rsi
862            regs[14] = vcpu_regs.rdi; // rdi
863            regs[15] = 0; // orig rax
864            regs[16] = vcpu_regs.rip; // rip
865            regs[17] = sregs.cs.selector as u64; // cs
866            regs[18] = vcpu_regs.rflags; // eflags
867            regs[19] = vcpu_regs.rsp; // rsp
868            regs[20] = sregs.ss.selector as u64; // ss
869            regs[21] = sregs.fs.base; // fs_base
870            regs[22] = sregs.gs.base; // gs_base
871            regs[23] = sregs.ds.selector as u64; // ds
872            regs[24] = sregs.es.selector as u64; // es
873            regs[25] = sregs.fs.selector as u64; // fs
874            regs[26] = sregs.gs.selector as u64; // gs
875
876            // Get the filename from the runtime config
877            let filename = self.rt_cfg.binary_path.clone().and_then(|path| {
878                Path::new(&path)
879                    .file_name()
880                    .and_then(|name| name.to_os_string().into_string().ok())
881            });
882
883            // The [`CrashDumpContext`] accepts xsave as a vector of u8, so we need to convert the
884            // xsave region to a vector of u8
885            Ok(Some(crashdump::CrashDumpContext::new(
886                &self.sandbox_regions,
887                regs,
888                xsave
889                    .region
890                    .iter()
891                    .flat_map(|item| item.to_le_bytes())
892                    .collect::<Vec<u8>>(),
893                self.entrypoint,
894                self.rt_cfg.binary_path.clone(),
895                filename,
896            )))
897        } else {
898            Ok(None)
899        }
900    }
901
    /// Blocks the vCPU thread and services GDB requests until the debugger
    /// asks to resume (`Continue`/`Step`) or detaches (`DisableDebug`).
    ///
    /// The given `stop_reason` is first reported to the GDB thread; requests
    /// are then processed in a loop. A `Crash` stop is special-cased: resume,
    /// step, breakpoint changes, and writes are all refused so the debugger
    /// can only inspect registers and memory of the crashed guest.
    ///
    /// # Errors
    /// Returns an error if debugging is not enabled, if sending/receiving on
    /// the debug channel fails, or if processing a request fails fatally
    /// (anything other than a guest-address translation failure).
    #[cfg(gdb)]
    fn handle_debug(
        &mut self,
        dbg_mem_access_fn: Arc<Mutex<dyn super::handlers::DbgMemAccessHandlerCaller>>,
        stop_reason: VcpuStopReason,
    ) -> Result<()> {
        // Can't service debug events without a configured debug session.
        if self.debug.is_none() {
            return Err(new_error!("Debugging is not enabled"));
        }

        match stop_reason {
            // If the vCPU stopped because of a crash, we need to handle it differently
            // We do not want to allow resuming execution or placing breakpoints
            // because the guest has crashed.
            // We only allow reading registers and memory
            VcpuStopReason::Crash => {
                // Tell the GDB thread the vCPU has stopped (and why).
                self.send_dbg_msg(DebugResponse::VcpuStopped(stop_reason))
                    .map_err(|e| {
                        new_error!("Couldn't signal vCPU stopped event to GDB thread: {:?}", e)
                    })?;

                loop {
                    log::debug!("Debug wait for event to resume vCPU");
                    // Wait for a message from gdb
                    let req = self.recv_dbg_msg()?;

                    // Flag to store if we should deny continue or step requests
                    let mut deny_continue = false;
                    // Flag to store if we should detach from the gdb session
                    let mut detach = false;

                    let response = match req {
                        // Allow the detach request to disable debugging by continuing resuming
                        // hypervisor crash error reporting
                        DebugMsg::DisableDebug => {
                            detach = true;
                            DebugResponse::DisableDebug
                        }
                        // Do not allow continue or step requests
                        DebugMsg::Continue | DebugMsg::Step => {
                            deny_continue = true;
                            DebugResponse::NotAllowed
                        }
                        // Do not allow adding/removing breakpoints and writing to memory or registers
                        DebugMsg::AddHwBreakpoint(_)
                        | DebugMsg::AddSwBreakpoint(_)
                        | DebugMsg::RemoveHwBreakpoint(_)
                        | DebugMsg::RemoveSwBreakpoint(_)
                        | DebugMsg::WriteAddr(_, _)
                        | DebugMsg::WriteRegisters(_) => DebugResponse::NotAllowed,

                        // For all other requests, we will process them normally
                        // (read-only operations such as reading registers/memory)
                        _ => {
                            let result = self.process_dbg_request(req, dbg_mem_access_fn.clone());
                            match result {
                                Ok(response) => response,
                                Err(HyperlightError::TranslateGuestAddress(_)) => {
                                    // Treat non fatal errors separately so the guest doesn't fail
                                    DebugResponse::ErrorOccurred
                                }
                                Err(e) => {
                                    log::error!("Error processing debug request: {:?}", e);
                                    return Err(e);
                                }
                            }
                        }
                    };

                    // Send the response to the request back to gdb
                    self.send_dbg_msg(response)
                        .map_err(|e| new_error!("Couldn't send response to gdb: {:?}", e))?;

                    // If we are denying continue or step requests, the debugger assumes the
                    // execution started so we need to report a stop reason as a crash and let
                    // it request to read registers/memory to figure out what happened
                    if deny_continue {
                        self.send_dbg_msg(DebugResponse::VcpuStopped(VcpuStopReason::Crash))
                            .map_err(|e| new_error!("Couldn't send response to gdb: {:?}", e))?;
                    }

                    // If we are detaching, we will break the loop and the Hypervisor will continue
                    // to handle the Crash reason
                    if detach {
                        break;
                    }
                }
            }
            // If the vCPU stopped because of any other reason except a crash, we can handle it
            // normally
            _ => {
                // Send the stop reason to the gdb thread
                self.send_dbg_msg(DebugResponse::VcpuStopped(stop_reason))
                    .map_err(|e| {
                        new_error!("Couldn't signal vCPU stopped event to GDB thread: {:?}", e)
                    })?;

                loop {
                    log::debug!("Debug wait for event to resume vCPU");
                    // Wait for a message from gdb
                    let req = self.recv_dbg_msg()?;

                    let result = self.process_dbg_request(req, dbg_mem_access_fn.clone());

                    let response = match result {
                        Ok(response) => response,
                        // Treat non fatal errors separately so the guest doesn't fail
                        Err(HyperlightError::TranslateGuestAddress(_)) => {
                            DebugResponse::ErrorOccurred
                        }
                        Err(e) => {
                            return Err(e);
                        }
                    };

                    // Decide whether to resume the vCPU *before* `response` is
                    // moved into send_dbg_msg below.
                    let cont = matches!(
                        response,
                        DebugResponse::Continue | DebugResponse::Step | DebugResponse::DisableDebug
                    );

                    self.send_dbg_msg(response)
                        .map_err(|e| new_error!("Couldn't send response to gdb: {:?}", e))?;

                    // Check if we should continue execution
                    // We continue if the response is one of the following: Step, Continue, or DisableDebug
                    if cont {
                        break;
                    }
                }
            }
        }

        Ok(())
    }
1035
1036    fn check_stack_guard(&self) -> Result<bool> {
1037        if let Some(mgr) = self.mem_mgr.as_ref() {
1038            mgr.check_stack_guard()
1039        } else {
1040            Err(new_error!("Memory manager is not initialized"))
1041        }
1042    }
1043
1044    #[cfg(feature = "trace_guest")]
1045    fn read_trace_reg(&self, reg: TraceRegister) -> Result<u64> {
1046        let regs = self.vcpu_fd.get_regs()?;
1047        Ok(match reg {
1048            TraceRegister::RAX => regs.rax,
1049            TraceRegister::RCX => regs.rcx,
1050            TraceRegister::RIP => regs.rip,
1051            TraceRegister::RSP => regs.rsp,
1052            TraceRegister::RBP => regs.rbp,
1053        })
1054    }
1055
    /// Shared borrow of this driver's guest-trace state.
    #[cfg(feature = "trace_guest")]
    fn trace_info_as_ref(&self) -> &TraceInfo {
        &self.trace_info
    }
    /// Mutable borrow of this driver's guest-trace state.
    #[cfg(feature = "trace_guest")]
    fn trace_info_as_mut(&mut self) -> &mut TraceInfo {
        &mut self.trace_info
    }
1064}
1065
impl Drop for KVMDriver {
    fn drop(&mut self) {
        // Record on the shared interrupt handle that this driver is gone, so
        // any outstanding `InterruptHandle` clones can observe the flag
        // (presumably to avoid signaling a vCPU that no longer exists —
        // see `LinuxInterruptHandle` for how `dropped` is consumed).
        self.interrupt_handle.dropped.store(true, Ordering::Relaxed);
    }
}