hyperlight_host/hypervisor/
kvm.rs

1/*
2Copyright 2025  The Hyperlight Authors.
3
4Licensed under the Apache License, Version 2.0 (the "License");
5you may not use this file except in compliance with the License.
6You may obtain a copy of the License at
7
8    http://www.apache.org/licenses/LICENSE-2.0
9
10Unless required by applicable law or agreed to in writing, software
11distributed under the License is distributed on an "AS IS" BASIS,
12WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13See the License for the specific language governing permissions and
14limitations under the License.
15*/
16
17use std::convert::TryFrom;
18use std::fmt::Debug;
19use std::sync::atomic::{AtomicBool, AtomicU64, Ordering};
20use std::sync::{Arc, Mutex};
21
22use kvm_bindings::{kvm_fpu, kvm_regs, kvm_userspace_memory_region};
23use kvm_ioctls::Cap::UserMemory;
24use kvm_ioctls::{Kvm, VcpuExit, VcpuFd, VmFd};
25use log::LevelFilter;
26use tracing::{Span, instrument};
27#[cfg(crashdump)]
28use {super::crashdump, std::path::Path};
29
30#[cfg(feature = "trace_guest")]
31use super::TraceRegister;
32use super::fpu::{FP_CONTROL_WORD_DEFAULT, FP_TAG_WORD_DEFAULT, MXCSR_DEFAULT};
33#[cfg(gdb)]
34use super::gdb::{DebugCommChannel, DebugMsg, DebugResponse, GuestDebug, KvmDebug, VcpuStopReason};
35#[cfg(feature = "init-paging")]
36use super::{
37    CR0_AM, CR0_ET, CR0_MP, CR0_NE, CR0_PE, CR0_PG, CR0_WP, CR4_OSFXSR, CR4_OSXMMEXCPT, CR4_PAE,
38    EFER_LMA, EFER_LME, EFER_NX, EFER_SCE,
39};
40use super::{HyperlightExit, Hypervisor, InterruptHandle, LinuxInterruptHandle, VirtualCPU};
41#[cfg(gdb)]
42use crate::HyperlightError;
43use crate::hypervisor::get_memory_access_violation;
44use crate::mem::memory_region::{MemoryRegion, MemoryRegionFlags};
45use crate::mem::ptr::{GuestPtr, RawPtr};
46use crate::mem::shared_mem::HostSharedMemory;
47use crate::sandbox::SandboxConfiguration;
48#[cfg(feature = "trace_guest")]
49use crate::sandbox::TraceInfo;
50use crate::sandbox::host_funcs::FunctionRegistry;
51use crate::sandbox::mem_mgr::MemMgrWrapper;
52use crate::sandbox::outb::handle_outb;
53#[cfg(crashdump)]
54use crate::sandbox::uninitialized::SandboxRuntimeConfig;
55use crate::{Result, log_then_return, new_error};
56
57/// Return `true` if the KVM API is available, version 12, and has UserMemory capability, or `false` otherwise
58#[instrument(skip_all, parent = Span::current(), level = "Trace")]
59pub(crate) fn is_hypervisor_present() -> bool {
60    if let Ok(kvm) = Kvm::new() {
61        let api_version = kvm.get_api_version();
62        match api_version {
63            version if version == 12 && kvm.check_extension(UserMemory) => true,
64            12 => {
65                log::info!("KVM does not have KVM_CAP_USER_MEMORY capability");
66                false
67            }
68            version => {
69                log::info!("KVM GET_API_VERSION returned {}, expected 12", version);
70                false
71            }
72        }
73    } else {
74        log::info!("KVM is not available on this system");
75        false
76    }
77}
78
#[cfg(gdb)]
mod debug {
    //! gdb-debugging support for `KVMDriver`: translates `DebugMsg` requests
    //! coming from the gdb thread into vCPU debug operations and sends the
    //! matching `DebugResponse` back over the driver's comm channel.

    use std::sync::{Arc, Mutex};

    use kvm_bindings::kvm_debug_exit_arch;

    use super::KVMDriver;
    use crate::hypervisor::gdb::{
        DebugMsg, DebugResponse, GuestDebug, KvmDebug, VcpuStopReason, X86_64Regs,
    };
    use crate::mem::shared_mem::HostSharedMemory;
    use crate::sandbox::mem_mgr::MemMgrWrapper;
    use crate::{Result, new_error};

    impl KVMDriver {
        /// Resets the debug information to disable debugging
        ///
        /// Replaces the current debug state with a fresh default one and
        /// turns single-stepping off on the vCPU.
        fn disable_debug(&mut self) -> Result<()> {
            let mut debug = KvmDebug::default();

            debug.set_single_step(&self.vcpu_fd, false)?;

            self.debug = Some(debug);

            Ok(())
        }

        /// Get the reason the vCPU has stopped
        ///
        /// Errors if debugging was never enabled on this driver.
        pub(crate) fn get_stop_reason(
            &mut self,
            debug_exit: kvm_debug_exit_arch,
        ) -> Result<VcpuStopReason> {
            let debug = self
                .debug
                .as_mut()
                .ok_or_else(|| new_error!("Debug is not enabled"))?;

            debug.get_stop_reason(&self.vcpu_fd, debug_exit, self.entrypoint)
        }

        /// Dispatches one gdb request against the vCPU debug state.
        ///
        /// Breakpoint add/remove requests report success as a boolean inside
        /// the response (failures are logged, not propagated); the other
        /// requests propagate their errors. `dbg_mem_access_fn` grants access
        /// to guest memory for the requests that need it (sw breakpoints,
        /// reads/writes, code-section offset).
        pub(crate) fn process_dbg_request(
            &mut self,
            req: DebugMsg,
            dbg_mem_access_fn: Arc<Mutex<MemMgrWrapper<HostSharedMemory>>>,
        ) -> Result<DebugResponse> {
            if let Some(debug) = self.debug.as_mut() {
                match req {
                    DebugMsg::AddHwBreakpoint(addr) => Ok(DebugResponse::AddHwBreakpoint(
                        debug
                            .add_hw_breakpoint(&self.vcpu_fd, addr)
                            .map_err(|e| {
                                log::error!("Failed to add hw breakpoint: {:?}", e);

                                e
                            })
                            .is_ok(),
                    )),
                    DebugMsg::AddSwBreakpoint(addr) => Ok(DebugResponse::AddSwBreakpoint(
                        debug
                            .add_sw_breakpoint(&self.vcpu_fd, addr, dbg_mem_access_fn)
                            .map_err(|e| {
                                log::error!("Failed to add sw breakpoint: {:?}", e);

                                e
                            })
                            .is_ok(),
                    )),
                    // Continue = clear single-step so the vCPU runs freely
                    DebugMsg::Continue => {
                        debug.set_single_step(&self.vcpu_fd, false).map_err(|e| {
                            log::error!("Failed to continue execution: {:?}", e);

                            e
                        })?;

                        Ok(DebugResponse::Continue)
                    }
                    DebugMsg::DisableDebug => {
                        self.disable_debug().map_err(|e| {
                            log::error!("Failed to disable debugging: {:?}", e);

                            e
                        })?;

                        Ok(DebugResponse::DisableDebug)
                    }
                    DebugMsg::GetCodeSectionOffset => {
                        let offset = dbg_mem_access_fn
                            .try_lock()
                            .map_err(|e| {
                                new_error!("Error locking at {}:{}: {}", file!(), line!(), e)
                            })?
                            .unwrap_mgr()
                            .layout
                            .get_guest_code_address();

                        Ok(DebugResponse::GetCodeSectionOffset(offset as u64))
                    }
                    DebugMsg::ReadAddr(addr, len) => {
                        let mut data = vec![0u8; len];

                        debug
                            .read_addrs(&self.vcpu_fd, addr, &mut data, dbg_mem_access_fn)
                            .map_err(|e| {
                                log::error!("Failed to read from address: {:?}", e);

                                e
                            })?;

                        Ok(DebugResponse::ReadAddr(data))
                    }
                    DebugMsg::ReadRegisters => {
                        let mut regs = X86_64Regs::default();

                        debug
                            .read_regs(&self.vcpu_fd, &mut regs)
                            .map_err(|e| {
                                log::error!("Failed to read registers: {:?}", e);

                                e
                            })
                            .map(|_| DebugResponse::ReadRegisters(regs))
                    }
                    DebugMsg::RemoveHwBreakpoint(addr) => Ok(DebugResponse::RemoveHwBreakpoint(
                        debug
                            .remove_hw_breakpoint(&self.vcpu_fd, addr)
                            .map_err(|e| {
                                log::error!("Failed to remove hw breakpoint: {:?}", e);

                                e
                            })
                            .is_ok(),
                    )),
                    DebugMsg::RemoveSwBreakpoint(addr) => Ok(DebugResponse::RemoveSwBreakpoint(
                        debug
                            .remove_sw_breakpoint(&self.vcpu_fd, addr, dbg_mem_access_fn)
                            .map_err(|e| {
                                log::error!("Failed to remove sw breakpoint: {:?}", e);

                                e
                            })
                            .is_ok(),
                    )),
                    // Step = enable single-step for exactly one instruction
                    DebugMsg::Step => {
                        debug.set_single_step(&self.vcpu_fd, true).map_err(|e| {
                            log::error!("Failed to enable step instruction: {:?}", e);

                            e
                        })?;

                        Ok(DebugResponse::Step)
                    }
                    DebugMsg::WriteAddr(addr, data) => {
                        debug
                            .write_addrs(&self.vcpu_fd, addr, &data, dbg_mem_access_fn)
                            .map_err(|e| {
                                log::error!("Failed to write to address: {:?}", e);

                                e
                            })?;

                        Ok(DebugResponse::WriteAddr)
                    }
                    DebugMsg::WriteRegisters(regs) => debug
                        .write_regs(&self.vcpu_fd, &regs)
                        .map_err(|e| {
                            log::error!("Failed to write registers: {:?}", e);

                            e
                        })
                        .map(|_| DebugResponse::WriteRegisters),
                }
            } else {
                Err(new_error!("Debugging is not enabled"))
            }
        }

        /// Blocks until a message arrives from the gdb thread.
        ///
        /// Errors if debugging is not enabled or the channel fails.
        pub(crate) fn recv_dbg_msg(&mut self) -> Result<DebugMsg> {
            let gdb_conn = self
                .gdb_conn
                .as_mut()
                .ok_or_else(|| new_error!("Debug is not enabled"))?;

            gdb_conn.recv().map_err(|e| {
                new_error!(
                    "Got an error while waiting to receive a message from the gdb thread: {:?}",
                    e
                )
            })
        }

        /// Sends a response message to the gdb thread.
        ///
        /// Errors if debugging is not enabled or the channel fails.
        pub(crate) fn send_dbg_msg(&mut self, cmd: DebugResponse) -> Result<()> {
            log::debug!("Sending {:?}", cmd);

            let gdb_conn = self
                .gdb_conn
                .as_mut()
                .ok_or_else(|| new_error!("Debug is not enabled"))?;

            gdb_conn.send(cmd).map_err(|e| {
                new_error!(
                    "Got an error while sending a response message to the gdb thread: {:?}",
                    e
                )
            })
        }
    }
}
285
/// A Hypervisor driver for KVM on Linux
pub(crate) struct KVMDriver {
    // Held only to keep the KVM system handle alive for the driver's lifetime
    // (never read after construction, hence the leading underscore)
    _kvm: Kvm,
    // The VM created from the KVM handle; owns the guest memory slots
    vm_fd: VmFd,
    // Guest page size used for alignment checks; 0 until `initialise` sets it
    page_size: usize,
    // The single vCPU backing this sandbox
    vcpu_fd: VcpuFd,
    // Guest address execution starts at
    entrypoint: u64,
    // Original stack pointer; restored before each dispatched guest call
    orig_rsp: GuestPtr,
    // Shared handle used by other threads to interrupt/cancel vCPU execution
    interrupt_handle: Arc<LinuxInterruptHandle>,
    // Set during `initialise`; `None` until then
    mem_mgr: Option<MemMgrWrapper<HostSharedMemory>>,
    // Host function registry; set during `initialise`, `None` until then
    host_funcs: Option<Arc<Mutex<FunctionRegistry>>>,

    sandbox_regions: Vec<MemoryRegion>, // Initially mapped regions when sandbox is created
    mmap_regions: Vec<(MemoryRegion, u32)>, // Later mapped regions (region, slot number)
    next_slot: u32,                     // Monotonically increasing slot number
    freed_slots: Vec<u32>,              // Reusable slots from unmapped regions

    // Debug state; present only when a gdb connection was supplied at creation
    #[cfg(gdb)]
    debug: Option<KvmDebug>,
    // Channel for exchanging messages with the gdb thread
    #[cfg(gdb)]
    gdb_conn: Option<DebugCommChannel<DebugResponse, DebugMsg>>,
    // Runtime configuration consumed by crash-dump handling
    #[cfg(crashdump)]
    rt_cfg: SandboxRuntimeConfig,
    // Guest execution trace state (start epoch / TSC captured in `run`)
    #[cfg(feature = "trace_guest")]
    #[allow(dead_code)]
    trace_info: TraceInfo,
}
313
impl KVMDriver {
    /// Create a new instance of a `KVMDriver`, with only control registers
    /// set. Standard registers will not be set, and `initialise` must
    /// be called to do so.
    ///
    /// Creates the VM and its single vCPU, registers `mem_regions` as KVM
    /// memory slots `0..mem_regions.len()`, configures the special registers
    /// via `setup_initial_sregs`, and (when gdb support is compiled in and a
    /// connection was supplied) arms a hardware breakpoint on `entrypoint`.
    #[allow(clippy::too_many_arguments)]
    // TODO: refactor this function to take fewer arguments. Add trace_info to rt_cfg
    #[instrument(err(Debug), skip_all, parent = Span::current(), level = "Trace")]
    pub(crate) fn new(
        mem_regions: Vec<MemoryRegion>,
        pml4_addr: u64,
        entrypoint: u64,
        rsp: u64,
        config: &SandboxConfiguration,
        #[cfg(gdb)] gdb_conn: Option<DebugCommChannel<DebugResponse, DebugMsg>>,
        #[cfg(crashdump)] rt_cfg: SandboxRuntimeConfig,
        #[cfg(feature = "trace_guest")] trace_info: TraceInfo,
    ) -> Result<Self> {
        let kvm = Kvm::new()?;

        let vm_fd = kvm.create_vm_with_type(0)?;

        // Register each initial sandbox region with KVM, using its index as
        // the slot number; `next_slot` below starts right after these.
        mem_regions.iter().enumerate().try_for_each(|(i, region)| {
            let mut kvm_region: kvm_userspace_memory_region = region.clone().into();
            kvm_region.slot = i as u32;
            unsafe { vm_fd.set_user_memory_region(kvm_region) }
        })?;

        let mut vcpu_fd = vm_fd.create_vcpu(0)?;
        Self::setup_initial_sregs(&mut vcpu_fd, pml4_addr)?;

        #[cfg(gdb)]
        let (debug, gdb_conn) = if let Some(gdb_conn) = gdb_conn {
            let mut debug = KvmDebug::new();
            // Add breakpoint to the entry point address
            debug.add_hw_breakpoint(&vcpu_fd, entrypoint)?;

            (Some(debug), Some(gdb_conn))
        } else {
            (None, None)
        };

        let rsp_gp = GuestPtr::try_from(RawPtr::from(rsp))?;

        let interrupt_handle = Arc::new(LinuxInterruptHandle {
            running: AtomicU64::new(0),
            cancel_requested: AtomicBool::new(false),
            #[cfg(gdb)]
            debug_interrupt: AtomicBool::new(false),
            // NOTE(review): the cfg split below suggests musl's `pthread_self()`
            // does not return a plain u64 and needs the explicit cast, while
            // other targets return u64 directly — confirm against libc docs.
            #[cfg(all(
                target_arch = "x86_64",
                target_vendor = "unknown",
                target_os = "linux",
                target_env = "musl"
            ))]
            tid: AtomicU64::new(unsafe { libc::pthread_self() as u64 }),
            #[cfg(not(all(
                target_arch = "x86_64",
                target_vendor = "unknown",
                target_os = "linux",
                target_env = "musl"
            )))]
            tid: AtomicU64::new(unsafe { libc::pthread_self() }),
            retry_delay: config.get_interrupt_retry_delay(),
            dropped: AtomicBool::new(false),
            sig_rt_min_offset: config.get_interrupt_vcpu_sigrtmin_offset(),
        });

        #[allow(unused_mut)]
        let mut hv = Self {
            _kvm: kvm,
            vm_fd,
            // page_size is filled in by `initialise`
            page_size: 0,
            vcpu_fd,
            entrypoint,
            orig_rsp: rsp_gp,
            // Dynamically mapped regions get slots after the sandbox regions
            next_slot: mem_regions.len() as u32,
            sandbox_regions: mem_regions,
            mmap_regions: Vec::new(),
            freed_slots: Vec::new(),
            interrupt_handle: interrupt_handle.clone(),
            mem_mgr: None,
            host_funcs: None,
            #[cfg(gdb)]
            debug,
            #[cfg(gdb)]
            gdb_conn,
            #[cfg(crashdump)]
            rt_cfg,
            #[cfg(feature = "trace_guest")]
            trace_info,
        };

        // Send the interrupt handle to the GDB thread if debugging is enabled
        // This is used to allow the GDB thread to stop the vCPU
        #[cfg(gdb)]
        if hv.debug.is_some() {
            hv.send_dbg_msg(DebugResponse::InterruptHandle(interrupt_handle))?;
        }

        Ok(hv)
    }

    /// Configure the vCPU special registers.
    ///
    /// With the `init-paging` feature this points CR3 at the supplied PML4
    /// table and enables paging/IA-32e (64-bit) mode; without it the CS
    /// base/selector are zeroed and paging is left untouched.
    #[instrument(err(Debug), skip_all, parent = Span::current(), level = "Trace")]
    fn setup_initial_sregs(vcpu_fd: &mut VcpuFd, _pml4_addr: u64) -> Result<()> {
        // setup paging and IA-32e (64-bit) mode
        let mut sregs = vcpu_fd.get_sregs()?;
        cfg_if::cfg_if! {
            if #[cfg(feature = "init-paging")] {
                sregs.cr3 = _pml4_addr;
                // PAE plus OSFXSR/OSXMMEXCPT (SSE state save + exceptions)
                sregs.cr4 = CR4_PAE | CR4_OSFXSR | CR4_OSXMMEXCPT;
                // Protected mode with paging and write-protect enabled
                sregs.cr0 = CR0_PE | CR0_MP | CR0_ET | CR0_NE | CR0_AM | CR0_PG | CR0_WP;
                // Long mode enabled+active, syscall/sysret, no-execute
                sregs.efer = EFER_LME | EFER_LMA | EFER_SCE | EFER_NX;
                sregs.cs.l = 1; // required for 64-bit mode
            } else {
                sregs.cs.base = 0;
                sregs.cs.selector = 0;
            }
        }
        vcpu_fd.set_sregs(&sregs)?;
        Ok(())
    }
}
436
437impl Debug for KVMDriver {
438    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
439        let mut f = f.debug_struct("KVM Driver");
440        // Output each memory region
441
442        for region in &self.sandbox_regions {
443            f.field("Sandbox Memory Region", &region);
444        }
445        for region in &self.mmap_regions {
446            f.field("Mapped Memory Region", &region);
447        }
448        let regs = self.vcpu_fd.get_regs();
449        // check that regs is OK and then set field in debug struct
450
451        if let Ok(regs) = regs {
452            f.field("Registers", &regs);
453        }
454
455        let sregs = self.vcpu_fd.get_sregs();
456
457        // check that sregs is OK and then set field in debug struct
458
459        if let Ok(sregs) = sregs {
460            f.field("Special Registers", &sregs);
461        }
462
463        f.finish()
464    }
465}
466
467impl Hypervisor for KVMDriver {
468    /// Implementation of initialise for Hypervisor trait.
469    #[instrument(err(Debug), skip_all, parent = Span::current(), level = "Trace")]
470    fn initialise(
471        &mut self,
472        peb_addr: RawPtr,
473        seed: u64,
474        page_size: u32,
475        mem_mgr: MemMgrWrapper<HostSharedMemory>,
476        host_funcs: Arc<Mutex<FunctionRegistry>>,
477        max_guest_log_level: Option<LevelFilter>,
478        #[cfg(gdb)] dbg_mem_access_fn: Arc<Mutex<MemMgrWrapper<HostSharedMemory>>>,
479    ) -> Result<()> {
480        self.mem_mgr = Some(mem_mgr);
481        self.host_funcs = Some(host_funcs);
482        self.page_size = page_size as usize;
483
484        let max_guest_log_level: u64 = match max_guest_log_level {
485            Some(level) => level as u64,
486            None => self.get_max_log_level().into(),
487        };
488
489        let regs = kvm_regs {
490            rip: self.entrypoint,
491            rsp: self.orig_rsp.absolute()?,
492
493            // function args
494            rdi: peb_addr.into(),
495            rsi: seed,
496            rdx: page_size.into(),
497            rcx: max_guest_log_level,
498
499            ..Default::default()
500        };
501        self.vcpu_fd.set_regs(&regs)?;
502
503        VirtualCPU::run(
504            self.as_mut_hypervisor(),
505            #[cfg(gdb)]
506            dbg_mem_access_fn,
507        )
508    }
509
510    #[instrument(err(Debug), skip_all, parent = Span::current(), level = "Trace")]
511    unsafe fn map_region(&mut self, region: &MemoryRegion) -> Result<()> {
512        if [
513            region.guest_region.start,
514            region.guest_region.end,
515            region.host_region.start,
516            region.host_region.end,
517        ]
518        .iter()
519        .any(|x| x % self.page_size != 0)
520        {
521            log_then_return!(
522                "region is not page-aligned {:x}, {region:?}",
523                self.page_size
524            );
525        }
526
527        let mut kvm_region: kvm_userspace_memory_region = region.clone().into();
528
529        // Try to reuse a freed slot first, otherwise use next_slot
530        let slot = if let Some(freed_slot) = self.freed_slots.pop() {
531            freed_slot
532        } else {
533            let slot = self.next_slot;
534            self.next_slot += 1;
535            slot
536        };
537
538        kvm_region.slot = slot;
539        unsafe { self.vm_fd.set_user_memory_region(kvm_region) }?;
540        self.mmap_regions.push((region.to_owned(), slot));
541        Ok(())
542    }
543
544    #[instrument(err(Debug), skip_all, parent = Span::current(), level = "Trace")]
545    unsafe fn unmap_region(&mut self, region: &MemoryRegion) -> Result<()> {
546        if let Some(idx) = self.mmap_regions.iter().position(|(r, _)| r == region) {
547            let (region, slot) = self.mmap_regions.remove(idx);
548            let mut kvm_region: kvm_userspace_memory_region = region.into();
549            kvm_region.slot = slot;
550            // Setting memory_size to 0 unmaps the slot's region
551            // From https://docs.kernel.org/virt/kvm/api.html
552            // > Deleting a slot is done by passing zero for memory_size.
553            kvm_region.memory_size = 0;
554            unsafe { self.vm_fd.set_user_memory_region(kvm_region) }?;
555
556            // Add the freed slot to the reuse list
557            self.freed_slots.push(slot);
558
559            Ok(())
560        } else {
561            Err(new_error!("Tried to unmap region that is not mapped"))
562        }
563    }
564
565    fn get_mapped_regions(&self) -> Box<dyn ExactSizeIterator<Item = &MemoryRegion> + '_> {
566        Box::new(self.mmap_regions.iter().map(|(region, _)| region))
567    }
568
569    #[instrument(err(Debug), skip_all, parent = Span::current(), level = "Trace")]
570    fn dispatch_call_from_host(
571        &mut self,
572        dispatch_func_addr: RawPtr,
573        #[cfg(gdb)] dbg_mem_access_fn: Arc<Mutex<MemMgrWrapper<HostSharedMemory>>>,
574    ) -> Result<()> {
575        // Reset general purpose registers, then set RIP and RSP
576        let regs = kvm_regs {
577            rip: dispatch_func_addr.into(),
578            rsp: self.orig_rsp.absolute()?,
579            ..Default::default()
580        };
581        self.vcpu_fd.set_regs(&regs)?;
582
583        // reset fpu state
584        let fpu = kvm_fpu {
585            fcw: FP_CONTROL_WORD_DEFAULT,
586            ftwx: FP_TAG_WORD_DEFAULT,
587            mxcsr: MXCSR_DEFAULT,
588            ..Default::default() // zero out the rest
589        };
590        self.vcpu_fd.set_fpu(&fpu)?;
591
592        // run
593        VirtualCPU::run(
594            self.as_mut_hypervisor(),
595            #[cfg(gdb)]
596            dbg_mem_access_fn,
597        )?;
598
599        Ok(())
600    }
601
    /// Handles an IO-out exit from the vCPU by forwarding it to `handle_outb`.
    ///
    /// `_rip` and `_instruction_length` are unused because KVM advances RIP
    /// itself. Errors if no data accompanied the IO exit or if the driver's
    /// `mem_mgr`/`host_funcs` were never initialised.
    #[instrument(err(Debug), skip_all, parent = Span::current(), level = "Trace")]
    fn handle_io(
        &mut self,
        port: u16,
        data: Vec<u8>,
        _rip: u64,
        _instruction_length: u64,
    ) -> Result<()> {
        // KVM does not need RIP or instruction length, as it automatically sets the RIP

        // The payload passed to handle_outb is built from the first (up to)
        // four bytes of `data`, zero-padded and interpreted as a
        // little-endian u32 — so the data array must be non-empty.
        if data.is_empty() {
            log_then_return!("no data was given in IO interrupt");
        } else {
            let mut padded = [0u8; 4];
            let copy_len = data.len().min(4);
            padded[..copy_len].copy_from_slice(&data[..copy_len]);
            let value = u32::from_le_bytes(padded);

            #[cfg(feature = "trace_guest")]
            {
                // We need to handle the borrow checker issue where we need both:
                // - &mut MemMgrWrapper (from self.mem_mgr.as_mut())
                // - &mut dyn Hypervisor (from self)
                // We'll use a temporary approach to extract the mem_mgr temporarily
                let mem_mgr_option = self.mem_mgr.take();
                let mut mem_mgr =
                    mem_mgr_option.ok_or_else(|| new_error!("mem_mgr not initialized"))?;
                let host_funcs = self
                    .host_funcs
                    .as_ref()
                    .ok_or_else(|| new_error!("host_funcs not initialized"))?
                    .clone();

                handle_outb(&mut mem_mgr, host_funcs, self, port, value)?;

                // Put the mem_mgr back
                self.mem_mgr = Some(mem_mgr);
            }

            #[cfg(not(feature = "trace_guest"))]
            {
                // Without tracing, handle_outb does not need the hypervisor,
                // so a plain mutable borrow of mem_mgr suffices
                let mem_mgr = self
                    .mem_mgr
                    .as_mut()
                    .ok_or_else(|| new_error!("mem_mgr not initialized"))?;
                let host_funcs = self
                    .host_funcs
                    .as_ref()
                    .ok_or_else(|| new_error!("host_funcs not initialized"))?
                    .clone();

                handle_outb(mem_mgr, host_funcs, port, value)?;
            }
        }

        Ok(())
    }
662
663    #[instrument(err(Debug), skip_all, parent = Span::current(), level = "Trace")]
664    fn run(&mut self) -> Result<HyperlightExit> {
665        self.interrupt_handle
666            .tid
667            .store(unsafe { libc::pthread_self() as u64 }, Ordering::Relaxed);
668        // Note: if a `InterruptHandle::kill()` called while this thread is **here**
669        // Then this is fine since `cancel_requested` is set to true, so we will skip the `VcpuFd::run()` call
670        self.interrupt_handle
671            .set_running_and_increment_generation()
672            .map_err(|e| {
673                new_error!(
674                    "Error setting running state and incrementing generation: {}",
675                    e
676                )
677            })?;
678        #[cfg(not(gdb))]
679        let debug_interrupt = false;
680        #[cfg(gdb)]
681        let debug_interrupt = self
682            .interrupt_handle
683            .debug_interrupt
684            .load(Ordering::Relaxed);
685        // Don't run the vcpu if `cancel_requested` is true
686        //
687        // Note: if a `InterruptHandle::kill()` called while this thread is **here**
688        // Then this is fine since `cancel_requested` is set to true, so we will skip the `VcpuFd::run()` call
689        let exit_reason = if self
690            .interrupt_handle
691            .cancel_requested
692            .load(Ordering::Relaxed)
693            || debug_interrupt
694        {
695            Err(kvm_ioctls::Error::new(libc::EINTR))
696        } else {
697            #[cfg(feature = "trace_guest")]
698            if self.trace_info.guest_start_epoch.is_none() {
699                // Store the guest start epoch and cycles to trace the guest execution time
700                crate::debug!("KVM - Guest Start Epoch set");
701                self.trace_info.guest_start_epoch = Some(std::time::Instant::now());
702                self.trace_info.guest_start_tsc =
703                    Some(hyperlight_guest_tracing::invariant_tsc::read_tsc());
704            }
705
706            // Note: if a `InterruptHandle::kill()` called while this thread is **here**
707            // Then the vcpu will run, but we will keep sending signals to this thread
708            // to interrupt it until `running` is set to false. The `vcpu_fd::run()` call will
709            // return either normally with an exit reason, or from being "kicked" by out signal handler, with an EINTR error,
710            // both of which are fine.
711            self.vcpu_fd.run()
712        };
713        // Note: if a `InterruptHandle::kill()` called while this thread is **here**
714        // Then signals will be sent to this thread until `running` is set to false.
715        // This is fine since the signal handler is a no-op.
716        let cancel_requested = self
717            .interrupt_handle
718            .cancel_requested
719            .load(Ordering::Relaxed);
720        #[cfg(gdb)]
721        let debug_interrupt = self
722            .interrupt_handle
723            .debug_interrupt
724            .load(Ordering::Relaxed);
725        // Note: if a `InterruptHandle::kill()` called while this thread is **here**
726        // Then `cancel_requested` will be set to true again, which will cancel the **next vcpu run**.
727        // Additionally signals will be sent to this thread until `running` is set to false.
728        // This is fine since the signal handler is a no-op.
729        self.interrupt_handle.clear_running_bit();
730        // At this point, `running` is false so no more signals will be sent to this thread,
731        // but we may still receive async signals that were sent before this point.
732        // To prevent those signals from interrupting subsequent calls to `run()` (on other vms!),
733        // we make sure to check `cancel_requested` before cancelling (see `libc::EINTR` match-arm below).
734        let result = match exit_reason {
735            Ok(VcpuExit::Hlt) => {
736                crate::debug!("KVM - Halt Details : {:#?}", &self);
737                HyperlightExit::Halt()
738            }
739            Ok(VcpuExit::IoOut(port, data)) => {
740                // because vcpufd.run() mutably borrows self we cannot pass self to crate::debug! macro here
741                crate::debug!("KVM IO Details : \nPort : {}\nData : {:?}", port, data);
742                // KVM does not need to set RIP or instruction length so these are set to 0
743                HyperlightExit::IoOut(port, data.to_vec(), 0, 0)
744            }
745            Ok(VcpuExit::MmioRead(addr, _)) => {
746                crate::debug!("KVM MMIO Read -Details: Address: {} \n {:#?}", addr, &self);
747
748                match get_memory_access_violation(
749                    addr as usize,
750                    self.sandbox_regions
751                        .iter()
752                        .chain(self.mmap_regions.iter().map(|(r, _)| r)),
753                    MemoryRegionFlags::READ,
754                ) {
755                    Some(access_violation_exit) => access_violation_exit,
756                    None => HyperlightExit::Mmio(addr),
757                }
758            }
759            Ok(VcpuExit::MmioWrite(addr, _)) => {
760                crate::debug!("KVM MMIO Write -Details: Address: {} \n {:#?}", addr, &self);
761
762                match get_memory_access_violation(
763                    addr as usize,
764                    self.sandbox_regions
765                        .iter()
766                        .chain(self.mmap_regions.iter().map(|(r, _)| r)),
767                    MemoryRegionFlags::WRITE,
768                ) {
769                    Some(access_violation_exit) => access_violation_exit,
770                    None => HyperlightExit::Mmio(addr),
771                }
772            }
773            #[cfg(gdb)]
774            // KVM provides architecture specific information about the vCPU state when exiting
775            Ok(VcpuExit::Debug(debug_exit)) => match self.get_stop_reason(debug_exit) {
776                Ok(reason) => HyperlightExit::Debug(reason),
777                Err(e) => {
778                    log_then_return!("Error getting stop reason: {:?}", e);
779                }
780            },
781            Err(e) => match e.errno() {
782                // we send a signal to the thread to cancel execution this results in EINTR being returned by KVM so we return Cancelled
783                libc::EINTR => {
784                    // If cancellation was not requested for this specific vm, the vcpu was interrupted because of debug interrupt or
785                    // a stale signal that meant to be delivered to a previous/other vcpu on this same thread, so let's ignore it
786                    if cancel_requested {
787                        self.interrupt_handle
788                            .cancel_requested
789                            .store(false, Ordering::Relaxed);
790                        HyperlightExit::Cancelled()
791                    } else {
792                        #[cfg(gdb)]
793                        if debug_interrupt {
794                            self.interrupt_handle
795                                .debug_interrupt
796                                .store(false, Ordering::Relaxed);
797
798                            // If the vCPU was stopped because of an interrupt, we need to
799                            // return a special exit reason so that the gdb thread can handle it
800                            // and resume execution
801                            HyperlightExit::Debug(VcpuStopReason::Interrupt)
802                        } else {
803                            HyperlightExit::Retry()
804                        }
805
806                        #[cfg(not(gdb))]
807                        HyperlightExit::Retry()
808                    }
809                }
810                libc::EAGAIN => HyperlightExit::Retry(),
811                _ => {
812                    crate::debug!("KVM Error -Details: Address: {} \n {:#?}", e, &self);
813                    log_then_return!("Error running VCPU {:?}", e);
814                }
815            },
816            Ok(other) => {
817                let err_msg = format!("Unexpected KVM Exit {:?}", other);
818                crate::debug!("KVM Other Exit Details: {:#?}", &self);
819                HyperlightExit::Unknown(err_msg)
820            }
821        };
822        Ok(result)
823    }
824
825    #[instrument(skip_all, parent = Span::current(), level = "Trace")]
826    fn as_mut_hypervisor(&mut self) -> &mut dyn Hypervisor {
827        self as &mut dyn Hypervisor
828    }
829
830    fn interrupt_handle(&self) -> Arc<dyn InterruptHandle> {
831        self.interrupt_handle.clone()
832    }
833
834    #[cfg(crashdump)]
835    fn crashdump_context(&self) -> Result<Option<crashdump::CrashDumpContext>> {
836        if self.rt_cfg.guest_core_dump {
837            let mut regs = [0; 27];
838
839            let vcpu_regs = self.vcpu_fd.get_regs()?;
840            let sregs = self.vcpu_fd.get_sregs()?;
841            let xsave = self.vcpu_fd.get_xsave()?;
842
843            // Set the registers in the order expected by the crashdump context
844            regs[0] = vcpu_regs.r15; // r15
845            regs[1] = vcpu_regs.r14; // r14
846            regs[2] = vcpu_regs.r13; // r13
847            regs[3] = vcpu_regs.r12; // r12
848            regs[4] = vcpu_regs.rbp; // rbp
849            regs[5] = vcpu_regs.rbx; // rbx
850            regs[6] = vcpu_regs.r11; // r11
851            regs[7] = vcpu_regs.r10; // r10
852            regs[8] = vcpu_regs.r9; // r9
853            regs[9] = vcpu_regs.r8; // r8
854            regs[10] = vcpu_regs.rax; // rax
855            regs[11] = vcpu_regs.rcx; // rcx
856            regs[12] = vcpu_regs.rdx; // rdx
857            regs[13] = vcpu_regs.rsi; // rsi
858            regs[14] = vcpu_regs.rdi; // rdi
859            regs[15] = 0; // orig rax
860            regs[16] = vcpu_regs.rip; // rip
861            regs[17] = sregs.cs.selector as u64; // cs
862            regs[18] = vcpu_regs.rflags; // eflags
863            regs[19] = vcpu_regs.rsp; // rsp
864            regs[20] = sregs.ss.selector as u64; // ss
865            regs[21] = sregs.fs.base; // fs_base
866            regs[22] = sregs.gs.base; // gs_base
867            regs[23] = sregs.ds.selector as u64; // ds
868            regs[24] = sregs.es.selector as u64; // es
869            regs[25] = sregs.fs.selector as u64; // fs
870            regs[26] = sregs.gs.selector as u64; // gs
871
872            // Get the filename from the runtime config
873            let filename = self.rt_cfg.binary_path.clone().and_then(|path| {
874                Path::new(&path)
875                    .file_name()
876                    .and_then(|name| name.to_os_string().into_string().ok())
877            });
878
879            // The [`CrashDumpContext`] accepts xsave as a vector of u8, so we need to convert the
880            // xsave region to a vector of u8
881            Ok(Some(crashdump::CrashDumpContext::new(
882                &self.sandbox_regions,
883                regs,
884                xsave
885                    .region
886                    .iter()
887                    .flat_map(|item| item.to_le_bytes())
888                    .collect::<Vec<u8>>(),
889                self.entrypoint,
890                self.rt_cfg.binary_path.clone(),
891                filename,
892            )))
893        } else {
894            Ok(None)
895        }
896    }
897
898    #[cfg(gdb)]
899    fn handle_debug(
900        &mut self,
901        dbg_mem_access_fn: Arc<Mutex<MemMgrWrapper<HostSharedMemory>>>,
902        stop_reason: VcpuStopReason,
903    ) -> Result<()> {
904        if self.debug.is_none() {
905            return Err(new_error!("Debugging is not enabled"));
906        }
907
908        match stop_reason {
909            // If the vCPU stopped because of a crash, we need to handle it differently
910            // We do not want to allow resuming execution or placing breakpoints
911            // because the guest has crashed.
912            // We only allow reading registers and memory
913            VcpuStopReason::Crash => {
914                self.send_dbg_msg(DebugResponse::VcpuStopped(stop_reason))
915                    .map_err(|e| {
916                        new_error!("Couldn't signal vCPU stopped event to GDB thread: {:?}", e)
917                    })?;
918
919                loop {
920                    log::debug!("Debug wait for event to resume vCPU");
921                    // Wait for a message from gdb
922                    let req = self.recv_dbg_msg()?;
923
924                    // Flag to store if we should deny continue or step requests
925                    let mut deny_continue = false;
926                    // Flag to store if we should detach from the gdb session
927                    let mut detach = false;
928
929                    let response = match req {
930                        // Allow the detach request to disable debugging by continuing resuming
931                        // hypervisor crash error reporting
932                        DebugMsg::DisableDebug => {
933                            detach = true;
934                            DebugResponse::DisableDebug
935                        }
936                        // Do not allow continue or step requests
937                        DebugMsg::Continue | DebugMsg::Step => {
938                            deny_continue = true;
939                            DebugResponse::NotAllowed
940                        }
941                        // Do not allow adding/removing breakpoints and writing to memory or registers
942                        DebugMsg::AddHwBreakpoint(_)
943                        | DebugMsg::AddSwBreakpoint(_)
944                        | DebugMsg::RemoveHwBreakpoint(_)
945                        | DebugMsg::RemoveSwBreakpoint(_)
946                        | DebugMsg::WriteAddr(_, _)
947                        | DebugMsg::WriteRegisters(_) => DebugResponse::NotAllowed,
948
949                        // For all other requests, we will process them normally
950                        _ => {
951                            let result = self.process_dbg_request(req, dbg_mem_access_fn.clone());
952                            match result {
953                                Ok(response) => response,
954                                Err(HyperlightError::TranslateGuestAddress(_)) => {
955                                    // Treat non fatal errors separately so the guest doesn't fail
956                                    DebugResponse::ErrorOccurred
957                                }
958                                Err(e) => {
959                                    log::error!("Error processing debug request: {:?}", e);
960                                    return Err(e);
961                                }
962                            }
963                        }
964                    };
965
966                    // Send the response to the request back to gdb
967                    self.send_dbg_msg(response)
968                        .map_err(|e| new_error!("Couldn't send response to gdb: {:?}", e))?;
969
970                    // If we are denying continue or step requests, the debugger assumes the
971                    // execution started so we need to report a stop reason as a crash and let
972                    // it request to read registers/memory to figure out what happened
973                    if deny_continue {
974                        self.send_dbg_msg(DebugResponse::VcpuStopped(VcpuStopReason::Crash))
975                            .map_err(|e| new_error!("Couldn't send response to gdb: {:?}", e))?;
976                    }
977
978                    // If we are detaching, we will break the loop and the Hypervisor will continue
979                    // to handle the Crash reason
980                    if detach {
981                        break;
982                    }
983                }
984            }
985            // If the vCPU stopped because of any other reason except a crash, we can handle it
986            // normally
987            _ => {
988                // Send the stop reason to the gdb thread
989                self.send_dbg_msg(DebugResponse::VcpuStopped(stop_reason))
990                    .map_err(|e| {
991                        new_error!("Couldn't signal vCPU stopped event to GDB thread: {:?}", e)
992                    })?;
993
994                loop {
995                    log::debug!("Debug wait for event to resume vCPU");
996                    // Wait for a message from gdb
997                    let req = self.recv_dbg_msg()?;
998
999                    let result = self.process_dbg_request(req, dbg_mem_access_fn.clone());
1000
1001                    let response = match result {
1002                        Ok(response) => response,
1003                        // Treat non fatal errors separately so the guest doesn't fail
1004                        Err(HyperlightError::TranslateGuestAddress(_)) => {
1005                            DebugResponse::ErrorOccurred
1006                        }
1007                        Err(e) => {
1008                            return Err(e);
1009                        }
1010                    };
1011
1012                    let cont = matches!(
1013                        response,
1014                        DebugResponse::Continue | DebugResponse::Step | DebugResponse::DisableDebug
1015                    );
1016
1017                    self.send_dbg_msg(response)
1018                        .map_err(|e| new_error!("Couldn't send response to gdb: {:?}", e))?;
1019
1020                    // Check if we should continue execution
1021                    // We continue if the response is one of the following: Step, Continue, or DisableDebug
1022                    if cont {
1023                        break;
1024                    }
1025                }
1026            }
1027        }
1028
1029        Ok(())
1030    }
1031
1032    fn check_stack_guard(&self) -> Result<bool> {
1033        if let Some(mgr) = self.mem_mgr.as_ref() {
1034            mgr.check_stack_guard()
1035        } else {
1036            Err(new_error!("Memory manager is not initialized"))
1037        }
1038    }
1039
1040    #[cfg(feature = "trace_guest")]
1041    fn read_trace_reg(&self, reg: TraceRegister) -> Result<u64> {
1042        let regs = self.vcpu_fd.get_regs()?;
1043        Ok(match reg {
1044            TraceRegister::RAX => regs.rax,
1045            TraceRegister::RCX => regs.rcx,
1046            TraceRegister::RIP => regs.rip,
1047            TraceRegister::RSP => regs.rsp,
1048            TraceRegister::RBP => regs.rbp,
1049        })
1050    }
1051
    /// Returns a shared reference to the driver's guest-trace state.
    #[cfg(feature = "trace_guest")]
    fn trace_info_as_ref(&self) -> &TraceInfo {
        &self.trace_info
    }
    /// Returns a mutable reference to the driver's guest-trace state.
    #[cfg(feature = "trace_guest")]
    fn trace_info_as_mut(&mut self) -> &mut TraceInfo {
        &mut self.trace_info
    }
1060}
1061
impl Drop for KVMDriver {
    fn drop(&mut self) {
        // Record on the shared interrupt handle that this driver is gone, so
        // outstanding `InterruptHandle` clones can observe that the vCPU no
        // longer exists (presumably so they stop trying to interrupt it —
        // confirm against `LinuxInterruptHandle` usage). Relaxed ordering
        // matches the other atomic flag stores in this file.
        self.interrupt_handle.dropped.store(true, Ordering::Relaxed);
    }
}