hyperlight_host/hypervisor/
kvm.rs

1/*
2Copyright 2025  The Hyperlight Authors.
3
4Licensed under the Apache License, Version 2.0 (the "License");
5you may not use this file except in compliance with the License.
6You may obtain a copy of the License at
7
8    http://www.apache.org/licenses/LICENSE-2.0
9
10Unless required by applicable law or agreed to in writing, software
11distributed under the License is distributed on an "AS IS" BASIS,
12WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13See the License for the specific language governing permissions and
14limitations under the License.
15*/
16
17use std::convert::TryFrom;
18use std::fmt::Debug;
19use std::sync::atomic::{AtomicBool, AtomicU64, Ordering};
20use std::sync::{Arc, Mutex};
21
22use kvm_bindings::{kvm_fpu, kvm_regs, kvm_userspace_memory_region};
23use kvm_ioctls::Cap::UserMemory;
24use kvm_ioctls::{Kvm, VcpuExit, VcpuFd, VmFd};
25use log::LevelFilter;
26use tracing::{Span, instrument};
27#[cfg(crashdump)]
28use {super::crashdump, std::path::Path};
29
30#[cfg(feature = "trace_guest")]
31use super::TraceRegister;
32use super::fpu::{FP_CONTROL_WORD_DEFAULT, FP_TAG_WORD_DEFAULT, MXCSR_DEFAULT};
33#[cfg(gdb)]
34use super::gdb::{DebugCommChannel, DebugMsg, DebugResponse, GuestDebug, KvmDebug, VcpuStopReason};
35#[cfg(feature = "init-paging")]
36use super::{
37    CR0_AM, CR0_ET, CR0_MP, CR0_NE, CR0_PE, CR0_PG, CR0_WP, CR4_OSFXSR, CR4_OSXMMEXCPT, CR4_PAE,
38    EFER_LMA, EFER_LME, EFER_NX, EFER_SCE,
39};
40use super::{HyperlightExit, Hypervisor, InterruptHandle, LinuxInterruptHandle, VirtualCPU};
41#[cfg(gdb)]
42use crate::HyperlightError;
43use crate::hypervisor::get_memory_access_violation;
44use crate::mem::memory_region::{MemoryRegion, MemoryRegionFlags};
45use crate::mem::mgr::SandboxMemoryManager;
46use crate::mem::ptr::{GuestPtr, RawPtr};
47use crate::mem::shared_mem::HostSharedMemory;
48use crate::sandbox::SandboxConfiguration;
49#[cfg(feature = "trace_guest")]
50use crate::sandbox::TraceInfo;
51use crate::sandbox::host_funcs::FunctionRegistry;
52use crate::sandbox::outb::handle_outb;
53#[cfg(crashdump)]
54use crate::sandbox::uninitialized::SandboxRuntimeConfig;
55use crate::{Result, log_then_return, new_error};
56
57/// Return `true` if the KVM API is available, version 12, and has UserMemory capability, or `false` otherwise
58#[instrument(skip_all, parent = Span::current(), level = "Trace")]
59pub(crate) fn is_hypervisor_present() -> bool {
60    if let Ok(kvm) = Kvm::new() {
61        let api_version = kvm.get_api_version();
62        match api_version {
63            version if version == 12 && kvm.check_extension(UserMemory) => true,
64            12 => {
65                log::info!("KVM does not have KVM_CAP_USER_MEMORY capability");
66                false
67            }
68            version => {
69                log::info!("KVM GET_API_VERSION returned {}, expected 12", version);
70                false
71            }
72        }
73    } else {
74        log::info!("KVM is not available on this system");
75        false
76    }
77}
78
#[cfg(gdb)]
mod debug {
    //! gdb-debugging support for [`KVMDriver`]: translates debugger requests
    //! into vCPU/debug-register operations and shuttles messages to and from
    //! the gdb thread over the `gdb_conn` channel.

    use std::sync::{Arc, Mutex};

    use kvm_bindings::kvm_debug_exit_arch;

    use super::KVMDriver;
    use crate::hypervisor::gdb::{
        DebugMsg, DebugResponse, GuestDebug, KvmDebug, VcpuStopReason, X86_64Regs,
    };
    use crate::mem::mgr::SandboxMemoryManager;
    use crate::mem::shared_mem::HostSharedMemory;
    use crate::{Result, new_error};

    impl KVMDriver {
        /// Resets the debug information to disable debugging
        ///
        /// Turns single-stepping off on the vCPU and replaces the stored debug
        /// state with a default `KvmDebug` (note: `self.debug` stays `Some`).
        fn disable_debug(&mut self) -> Result<()> {
            let mut debug = KvmDebug::default();

            debug.set_single_step(&self.vcpu_fd, false)?;

            self.debug = Some(debug);

            Ok(())
        }

        /// Get the reason the vCPU has stopped
        ///
        /// `debug_exit` is the architecture-specific payload KVM returned with
        /// the `VcpuExit::Debug` exit. Errors if debugging is not enabled.
        pub(crate) fn get_stop_reason(
            &mut self,
            debug_exit: kvm_debug_exit_arch,
        ) -> Result<VcpuStopReason> {
            let debug = self
                .debug
                .as_mut()
                .ok_or_else(|| new_error!("Debug is not enabled"))?;

            debug.get_stop_reason(&self.vcpu_fd, debug_exit, self.entrypoint)
        }

        /// Dispatches a single request from the gdb thread to the matching
        /// debug operation and produces the response to send back.
        ///
        /// Breakpoint add/remove requests report success as a boolean inside
        /// the response (logging any failure) instead of failing the call;
        /// most other operations propagate their errors. Returns an error if
        /// debugging is not enabled.
        pub(crate) fn process_dbg_request(
            &mut self,
            req: DebugMsg,
            dbg_mem_access_fn: Arc<Mutex<SandboxMemoryManager<HostSharedMemory>>>,
        ) -> Result<DebugResponse> {
            if let Some(debug) = self.debug.as_mut() {
                match req {
                    DebugMsg::AddHwBreakpoint(addr) => Ok(DebugResponse::AddHwBreakpoint(
                        debug
                            .add_hw_breakpoint(&self.vcpu_fd, addr)
                            .map_err(|e| {
                                log::error!("Failed to add hw breakpoint: {:?}", e);

                                e
                            })
                            .is_ok(),
                    )),
                    DebugMsg::AddSwBreakpoint(addr) => Ok(DebugResponse::AddSwBreakpoint(
                        debug
                            .add_sw_breakpoint(&self.vcpu_fd, addr, dbg_mem_access_fn)
                            .map_err(|e| {
                                log::error!("Failed to add sw breakpoint: {:?}", e);

                                e
                            })
                            .is_ok(),
                    )),
                    DebugMsg::Continue => {
                        // Resuming means clearing single-step; KVM then runs freely.
                        debug.set_single_step(&self.vcpu_fd, false).map_err(|e| {
                            log::error!("Failed to continue execution: {:?}", e);

                            e
                        })?;

                        Ok(DebugResponse::Continue)
                    }
                    DebugMsg::DisableDebug => {
                        self.disable_debug().map_err(|e| {
                            log::error!("Failed to disable debugging: {:?}", e);

                            e
                        })?;

                        Ok(DebugResponse::DisableDebug)
                    }
                    DebugMsg::GetCodeSectionOffset => {
                        let offset = dbg_mem_access_fn
                            .try_lock()
                            .map_err(|e| {
                                new_error!("Error locking at {}:{}: {}", file!(), line!(), e)
                            })?
                            .layout
                            .get_guest_code_address();

                        Ok(DebugResponse::GetCodeSectionOffset(offset as u64))
                    }
                    DebugMsg::ReadAddr(addr, len) => {
                        let mut data = vec![0u8; len];

                        debug
                            .read_addrs(&self.vcpu_fd, addr, &mut data, dbg_mem_access_fn)
                            .map_err(|e| {
                                log::error!("Failed to read from address: {:?}", e);

                                e
                            })?;

                        Ok(DebugResponse::ReadAddr(data))
                    }
                    DebugMsg::ReadRegisters => {
                        let mut regs = X86_64Regs::default();

                        debug
                            .read_regs(&self.vcpu_fd, &mut regs)
                            .map_err(|e| {
                                log::error!("Failed to read registers: {:?}", e);

                                e
                            })
                            .map(|_| DebugResponse::ReadRegisters(Box::new(regs)))
                    }
                    DebugMsg::RemoveHwBreakpoint(addr) => Ok(DebugResponse::RemoveHwBreakpoint(
                        debug
                            .remove_hw_breakpoint(&self.vcpu_fd, addr)
                            .map_err(|e| {
                                log::error!("Failed to remove hw breakpoint: {:?}", e);

                                e
                            })
                            .is_ok(),
                    )),
                    DebugMsg::RemoveSwBreakpoint(addr) => Ok(DebugResponse::RemoveSwBreakpoint(
                        debug
                            .remove_sw_breakpoint(&self.vcpu_fd, addr, dbg_mem_access_fn)
                            .map_err(|e| {
                                log::error!("Failed to remove sw breakpoint: {:?}", e);

                                e
                            })
                            .is_ok(),
                    )),
                    DebugMsg::Step => {
                        debug.set_single_step(&self.vcpu_fd, true).map_err(|e| {
                            log::error!("Failed to enable step instruction: {:?}", e);

                            e
                        })?;

                        Ok(DebugResponse::Step)
                    }
                    DebugMsg::WriteAddr(addr, data) => {
                        debug
                            .write_addrs(&self.vcpu_fd, addr, &data, dbg_mem_access_fn)
                            .map_err(|e| {
                                log::error!("Failed to write to address: {:?}", e);

                                e
                            })?;

                        Ok(DebugResponse::WriteAddr)
                    }
                    DebugMsg::WriteRegisters(regs) => debug
                        .write_regs(&self.vcpu_fd, &regs)
                        .map_err(|e| {
                            log::error!("Failed to write registers: {:?}", e);

                            e
                        })
                        .map(|_| DebugResponse::WriteRegisters),
                }
            } else {
                Err(new_error!("Debugging is not enabled"))
            }
        }

        /// Blocks until a message arrives from the gdb thread.
        ///
        /// Errors if debugging is not enabled or the channel receive fails.
        pub(crate) fn recv_dbg_msg(&mut self) -> Result<DebugMsg> {
            let gdb_conn = self
                .gdb_conn
                .as_mut()
                .ok_or_else(|| new_error!("Debug is not enabled"))?;

            gdb_conn.recv().map_err(|e| {
                new_error!(
                    "Got an error while waiting to receive a message from the gdb thread: {:?}",
                    e
                )
            })
        }

        /// Sends a response message to the gdb thread.
        ///
        /// Errors if debugging is not enabled or the channel send fails.
        pub(crate) fn send_dbg_msg(&mut self, cmd: DebugResponse) -> Result<()> {
            log::debug!("Sending {:?}", cmd);

            let gdb_conn = self
                .gdb_conn
                .as_mut()
                .ok_or_else(|| new_error!("Debug is not enabled"))?;

            gdb_conn.send(cmd).map_err(|e| {
                new_error!(
                    "Got an error while sending a response message to the gdb thread: {:?}",
                    e
                )
            })
        }
    }
}
284
/// A Hypervisor driver for KVM on Linux
pub(crate) struct KVMDriver {
    // Unused directly after construction; presumably kept so the KVM system
    // handle outlives the VM/vCPU fds — TODO confirm.
    _kvm: Kvm,
    vm_fd: VmFd,
    // Guest page size; 0 from `new()` until `initialise` stores the real value.
    page_size: usize,
    vcpu_fd: VcpuFd,
    // Guest address where execution starts (initial RIP).
    entrypoint: u64,
    // Stack pointer captured at creation; RSP is reset to this for each
    // host->guest dispatch.
    orig_rsp: GuestPtr,
    // Shared handle used to interrupt/cancel vCPU execution from other threads.
    interrupt_handle: Arc<LinuxInterruptHandle>,
    // Both set by `initialise`; `None` before that.
    mem_mgr: Option<SandboxMemoryManager<HostSharedMemory>>,
    host_funcs: Option<Arc<Mutex<FunctionRegistry>>>,

    sandbox_regions: Vec<MemoryRegion>, // Initially mapped regions when sandbox is created
    mmap_regions: Vec<(MemoryRegion, u32)>, // Later mapped regions (region, slot number)
    next_slot: u32,                     // Monotonically increasing slot number
    freed_slots: Vec<u32>,              // Reusable slots from unmapped regions

    #[cfg(gdb)]
    debug: Option<KvmDebug>,
    #[cfg(gdb)]
    gdb_conn: Option<DebugCommChannel<DebugResponse, DebugMsg>>,
    #[cfg(crashdump)]
    rt_cfg: SandboxRuntimeConfig,
    #[cfg(feature = "trace_guest")]
    #[allow(dead_code)]
    trace_info: TraceInfo,
}
312
impl KVMDriver {
    /// Create a new instance of a `KVMDriver`, with only control registers
    /// set. Standard registers will not be set, and `initialise` must
    /// be called to do so.
    #[allow(clippy::too_many_arguments)]
    // TODO: refactor this function to take fewer arguments. Add trace_info to rt_cfg
    #[instrument(err(Debug), skip_all, parent = Span::current(), level = "Trace")]
    pub(crate) fn new(
        mem_regions: Vec<MemoryRegion>,
        pml4_addr: u64,
        entrypoint: u64,
        rsp: u64,
        config: &SandboxConfiguration,
        #[cfg(gdb)] gdb_conn: Option<DebugCommChannel<DebugResponse, DebugMsg>>,
        #[cfg(crashdump)] rt_cfg: SandboxRuntimeConfig,
        #[cfg(feature = "trace_guest")] trace_info: TraceInfo,
    ) -> Result<Self> {
        let kvm = Kvm::new()?;

        let vm_fd = kvm.create_vm_with_type(0)?;

        // Register every initial sandbox region with KVM; the slot number is
        // simply the region's index in the input vector.
        mem_regions.iter().enumerate().try_for_each(|(i, region)| {
            let mut kvm_region: kvm_userspace_memory_region = region.clone().into();
            kvm_region.slot = i as u32;
            unsafe { vm_fd.set_user_memory_region(kvm_region) }
        })?;

        let mut vcpu_fd = vm_fd.create_vcpu(0)?;
        Self::setup_initial_sregs(&mut vcpu_fd, pml4_addr)?;

        // When a gdb channel is supplied, enable debugging and trap at the
        // entry point so the debugger gains control before guest code runs.
        #[cfg(gdb)]
        let (debug, gdb_conn) = if let Some(gdb_conn) = gdb_conn {
            let mut debug = KvmDebug::new();
            // Add breakpoint to the entry point address
            debug.add_hw_breakpoint(&vcpu_fd, entrypoint)?;

            (Some(debug), Some(gdb_conn))
        } else {
            (None, None)
        };

        let rsp_gp = GuestPtr::try_from(RawPtr::from(rsp))?;

        let interrupt_handle = Arc::new(LinuxInterruptHandle {
            running: AtomicU64::new(0),
            cancel_requested: AtomicBool::new(false),
            #[cfg(gdb)]
            debug_interrupt: AtomicBool::new(false),
            // musl's pthread_self() needs an explicit cast to u64; on other
            // targets the value is already the right type.
            #[cfg(all(
                target_arch = "x86_64",
                target_vendor = "unknown",
                target_os = "linux",
                target_env = "musl"
            ))]
            tid: AtomicU64::new(unsafe { libc::pthread_self() as u64 }),
            #[cfg(not(all(
                target_arch = "x86_64",
                target_vendor = "unknown",
                target_os = "linux",
                target_env = "musl"
            )))]
            tid: AtomicU64::new(unsafe { libc::pthread_self() }),
            retry_delay: config.get_interrupt_retry_delay(),
            dropped: AtomicBool::new(false),
            sig_rt_min_offset: config.get_interrupt_vcpu_sigrtmin_offset(),
        });

        // `next_slot` starts just past the sandbox regions registered above so
        // later `map_region` calls never collide with their slots.
        #[allow(unused_mut)]
        let mut hv = Self {
            _kvm: kvm,
            vm_fd,
            page_size: 0,
            vcpu_fd,
            entrypoint,
            orig_rsp: rsp_gp,
            next_slot: mem_regions.len() as u32,
            sandbox_regions: mem_regions,
            mmap_regions: Vec::new(),
            freed_slots: Vec::new(),
            interrupt_handle: interrupt_handle.clone(),
            mem_mgr: None,
            host_funcs: None,
            #[cfg(gdb)]
            debug,
            #[cfg(gdb)]
            gdb_conn,
            #[cfg(crashdump)]
            rt_cfg,
            #[cfg(feature = "trace_guest")]
            trace_info,
        };

        // Send the interrupt handle to the GDB thread if debugging is enabled
        // This is used to allow the GDB thread to stop the vCPU
        #[cfg(gdb)]
        if hv.debug.is_some() {
            hv.send_dbg_msg(DebugResponse::InterruptHandle(interrupt_handle))?;
        }

        Ok(hv)
    }

    /// Configure the vCPU's special registers before first run.
    ///
    /// With the "init-paging" feature, sets up paging (CR3 = `_pml4_addr`) and
    /// IA-32e 64-bit mode; otherwise zeroes the CS base/selector.
    #[instrument(err(Debug), skip_all, parent = Span::current(), level = "Trace")]
    fn setup_initial_sregs(vcpu_fd: &mut VcpuFd, _pml4_addr: u64) -> Result<()> {
        // setup paging and IA-32e (64-bit) mode
        let mut sregs = vcpu_fd.get_sregs()?;
        cfg_if::cfg_if! {
            if #[cfg(feature = "init-paging")] {
                sregs.cr3 = _pml4_addr;
                sregs.cr4 = CR4_PAE | CR4_OSFXSR | CR4_OSXMMEXCPT;
                sregs.cr0 = CR0_PE | CR0_MP | CR0_ET | CR0_NE | CR0_AM | CR0_PG | CR0_WP;
                sregs.efer = EFER_LME | EFER_LMA | EFER_SCE | EFER_NX;
                sregs.cs.l = 1; // required for 64-bit mode
            } else {
                sregs.cs.base = 0;
                sregs.cs.selector = 0;
            }
        }
        vcpu_fd.set_sregs(&sregs)?;
        Ok(())
    }
}
435
436impl Debug for KVMDriver {
437    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
438        let mut f = f.debug_struct("KVM Driver");
439        // Output each memory region
440
441        for region in &self.sandbox_regions {
442            f.field("Sandbox Memory Region", &region);
443        }
444        for region in &self.mmap_regions {
445            f.field("Mapped Memory Region", &region);
446        }
447        let regs = self.vcpu_fd.get_regs();
448        // check that regs is OK and then set field in debug struct
449
450        if let Ok(regs) = regs {
451            f.field("Registers", &regs);
452        }
453
454        let sregs = self.vcpu_fd.get_sregs();
455
456        // check that sregs is OK and then set field in debug struct
457
458        if let Ok(sregs) = sregs {
459            f.field("Special Registers", &sregs);
460        }
461
462        f.finish()
463    }
464}
465
466impl Hypervisor for KVMDriver {
467    /// Implementation of initialise for Hypervisor trait.
468    #[instrument(err(Debug), skip_all, parent = Span::current(), level = "Trace")]
469    fn initialise(
470        &mut self,
471        peb_addr: RawPtr,
472        seed: u64,
473        page_size: u32,
474        mem_mgr: SandboxMemoryManager<HostSharedMemory>,
475        host_funcs: Arc<Mutex<FunctionRegistry>>,
476        max_guest_log_level: Option<LevelFilter>,
477        #[cfg(gdb)] dbg_mem_access_fn: Arc<Mutex<SandboxMemoryManager<HostSharedMemory>>>,
478    ) -> Result<()> {
479        self.mem_mgr = Some(mem_mgr);
480        self.host_funcs = Some(host_funcs);
481        self.page_size = page_size as usize;
482
483        let max_guest_log_level: u64 = match max_guest_log_level {
484            Some(level) => level as u64,
485            None => self.get_max_log_level().into(),
486        };
487
488        let regs = kvm_regs {
489            rip: self.entrypoint,
490            rsp: self.orig_rsp.absolute()?,
491
492            // function args
493            rdi: peb_addr.into(),
494            rsi: seed,
495            rdx: page_size.into(),
496            rcx: max_guest_log_level,
497
498            ..Default::default()
499        };
500        self.vcpu_fd.set_regs(&regs)?;
501
502        VirtualCPU::run(
503            self.as_mut_hypervisor(),
504            #[cfg(gdb)]
505            dbg_mem_access_fn,
506        )
507    }
508
509    #[instrument(err(Debug), skip_all, parent = Span::current(), level = "Trace")]
510    unsafe fn map_region(&mut self, region: &MemoryRegion) -> Result<()> {
511        if [
512            region.guest_region.start,
513            region.guest_region.end,
514            region.host_region.start,
515            region.host_region.end,
516        ]
517        .iter()
518        .any(|x| x % self.page_size != 0)
519        {
520            log_then_return!(
521                "region is not page-aligned {:x}, {region:?}",
522                self.page_size
523            );
524        }
525
526        let mut kvm_region: kvm_userspace_memory_region = region.clone().into();
527
528        // Try to reuse a freed slot first, otherwise use next_slot
529        let slot = if let Some(freed_slot) = self.freed_slots.pop() {
530            freed_slot
531        } else {
532            let slot = self.next_slot;
533            self.next_slot += 1;
534            slot
535        };
536
537        kvm_region.slot = slot;
538        unsafe { self.vm_fd.set_user_memory_region(kvm_region) }?;
539        self.mmap_regions.push((region.to_owned(), slot));
540        Ok(())
541    }
542
543    #[instrument(err(Debug), skip_all, parent = Span::current(), level = "Trace")]
544    unsafe fn unmap_region(&mut self, region: &MemoryRegion) -> Result<()> {
545        if let Some(idx) = self.mmap_regions.iter().position(|(r, _)| r == region) {
546            let (region, slot) = self.mmap_regions.remove(idx);
547            let mut kvm_region: kvm_userspace_memory_region = region.into();
548            kvm_region.slot = slot;
549            // Setting memory_size to 0 unmaps the slot's region
550            // From https://docs.kernel.org/virt/kvm/api.html
551            // > Deleting a slot is done by passing zero for memory_size.
552            kvm_region.memory_size = 0;
553            unsafe { self.vm_fd.set_user_memory_region(kvm_region) }?;
554
555            // Add the freed slot to the reuse list
556            self.freed_slots.push(slot);
557
558            Ok(())
559        } else {
560            Err(new_error!("Tried to unmap region that is not mapped"))
561        }
562    }
563
564    fn get_mapped_regions(&self) -> Box<dyn ExactSizeIterator<Item = &MemoryRegion> + '_> {
565        Box::new(self.mmap_regions.iter().map(|(region, _)| region))
566    }
567
568    #[instrument(err(Debug), skip_all, parent = Span::current(), level = "Trace")]
569    fn dispatch_call_from_host(
570        &mut self,
571        dispatch_func_addr: RawPtr,
572        #[cfg(gdb)] dbg_mem_access_fn: Arc<Mutex<SandboxMemoryManager<HostSharedMemory>>>,
573    ) -> Result<()> {
574        // Reset general purpose registers, then set RIP and RSP
575        let regs = kvm_regs {
576            rip: dispatch_func_addr.into(),
577            rsp: self.orig_rsp.absolute()?,
578            ..Default::default()
579        };
580        self.vcpu_fd.set_regs(&regs)?;
581
582        // reset fpu state
583        let fpu = kvm_fpu {
584            fcw: FP_CONTROL_WORD_DEFAULT,
585            ftwx: FP_TAG_WORD_DEFAULT,
586            mxcsr: MXCSR_DEFAULT,
587            ..Default::default() // zero out the rest
588        };
589
590        // note kvm set_fpu doesn't actually set or read the mxcsr value
591        // https://elixir.bootlin.com/linux/v6.16/source/arch/x86/kvm/x86.c#L12229
592        self.vcpu_fd.set_fpu(&fpu)?;
593
594        // run
595        VirtualCPU::run(
596            self.as_mut_hypervisor(),
597            #[cfg(gdb)]
598            dbg_mem_access_fn,
599        )?;
600
601        Ok(())
602    }
603
604    #[instrument(err(Debug), skip_all, parent = Span::current(), level = "Trace")]
605    fn handle_io(
606        &mut self,
607        port: u16,
608        data: Vec<u8>,
609        _rip: u64,
610        _instruction_length: u64,
611    ) -> Result<()> {
612        // KVM does not need RIP or instruction length, as it automatically sets the RIP
613
614        // The payload param for the outb_handle_fn is the first byte
615        // of the data array cast to an u64. Thus, we need to make sure
616        // the data array has at least one u8, then convert that to an u64
617        if data.is_empty() {
618            log_then_return!("no data was given in IO interrupt");
619        } else {
620            let mut padded = [0u8; 4];
621            let copy_len = data.len().min(4);
622            padded[..copy_len].copy_from_slice(&data[..copy_len]);
623            let value = u32::from_le_bytes(padded);
624
625            #[cfg(feature = "trace_guest")]
626            {
627                // We need to handle the borrow checker issue where we need both:
628                // - &mut SandboxMemoryManager (from self.mem_mgr.as_mut())
629                // - &mut dyn Hypervisor (from self)
630                // We'll use a temporary approach to extract the mem_mgr temporarily
631                let mem_mgr_option = self.mem_mgr.take();
632                let mut mem_mgr =
633                    mem_mgr_option.ok_or_else(|| new_error!("mem_mgr not initialized"))?;
634                let host_funcs = self
635                    .host_funcs
636                    .as_ref()
637                    .ok_or_else(|| new_error!("host_funcs not initialized"))?
638                    .clone();
639
640                handle_outb(&mut mem_mgr, host_funcs, self, port, value)?;
641
642                // Put the mem_mgr back
643                self.mem_mgr = Some(mem_mgr);
644            }
645
646            #[cfg(not(feature = "trace_guest"))]
647            {
648                let mem_mgr = self
649                    .mem_mgr
650                    .as_mut()
651                    .ok_or_else(|| new_error!("mem_mgr not initialized"))?;
652                let host_funcs = self
653                    .host_funcs
654                    .as_ref()
655                    .ok_or_else(|| new_error!("host_funcs not initialized"))?
656                    .clone();
657
658                handle_outb(mem_mgr, host_funcs, port, value)?;
659            }
660        }
661
662        Ok(())
663    }
664
665    #[instrument(err(Debug), skip_all, parent = Span::current(), level = "Trace")]
666    fn run(&mut self) -> Result<HyperlightExit> {
667        self.interrupt_handle
668            .tid
669            .store(unsafe { libc::pthread_self() as u64 }, Ordering::Relaxed);
670        // Note: if a `InterruptHandle::kill()` called while this thread is **here**
671        // Then this is fine since `cancel_requested` is set to true, so we will skip the `VcpuFd::run()` call
672        self.interrupt_handle
673            .set_running_and_increment_generation()
674            .map_err(|e| {
675                new_error!(
676                    "Error setting running state and incrementing generation: {}",
677                    e
678                )
679            })?;
680        #[cfg(not(gdb))]
681        let debug_interrupt = false;
682        #[cfg(gdb)]
683        let debug_interrupt = self
684            .interrupt_handle
685            .debug_interrupt
686            .load(Ordering::Relaxed);
687        // Don't run the vcpu if `cancel_requested` is true
688        //
689        // Note: if a `InterruptHandle::kill()` called while this thread is **here**
690        // Then this is fine since `cancel_requested` is set to true, so we will skip the `VcpuFd::run()` call
691        let exit_reason = if self
692            .interrupt_handle
693            .cancel_requested
694            .load(Ordering::Relaxed)
695            || debug_interrupt
696        {
697            Err(kvm_ioctls::Error::new(libc::EINTR))
698        } else {
699            #[cfg(feature = "trace_guest")]
700            if self.trace_info.guest_start_epoch.is_none() {
701                // Store the guest start epoch and cycles to trace the guest execution time
702                crate::debug!("KVM - Guest Start Epoch set");
703                self.trace_info.guest_start_epoch = Some(std::time::Instant::now());
704                self.trace_info.guest_start_tsc =
705                    Some(hyperlight_guest_tracing::invariant_tsc::read_tsc());
706            }
707
708            // Note: if a `InterruptHandle::kill()` called while this thread is **here**
709            // Then the vcpu will run, but we will keep sending signals to this thread
710            // to interrupt it until `running` is set to false. The `vcpu_fd::run()` call will
711            // return either normally with an exit reason, or from being "kicked" by out signal handler, with an EINTR error,
712            // both of which are fine.
713            self.vcpu_fd.run()
714        };
715        // Note: if a `InterruptHandle::kill()` called while this thread is **here**
716        // Then signals will be sent to this thread until `running` is set to false.
717        // This is fine since the signal handler is a no-op.
718        let cancel_requested = self
719            .interrupt_handle
720            .cancel_requested
721            .load(Ordering::Relaxed);
722        #[cfg(gdb)]
723        let debug_interrupt = self
724            .interrupt_handle
725            .debug_interrupt
726            .load(Ordering::Relaxed);
727        // Note: if a `InterruptHandle::kill()` called while this thread is **here**
728        // Then `cancel_requested` will be set to true again, which will cancel the **next vcpu run**.
729        // Additionally signals will be sent to this thread until `running` is set to false.
730        // This is fine since the signal handler is a no-op.
731        self.interrupt_handle.clear_running_bit();
732        // At this point, `running` is false so no more signals will be sent to this thread,
733        // but we may still receive async signals that were sent before this point.
734        // To prevent those signals from interrupting subsequent calls to `run()` (on other vms!),
735        // we make sure to check `cancel_requested` before cancelling (see `libc::EINTR` match-arm below).
736        let result = match exit_reason {
737            Ok(VcpuExit::Hlt) => {
738                crate::debug!("KVM - Halt Details : {:#?}", &self);
739                HyperlightExit::Halt()
740            }
741            Ok(VcpuExit::IoOut(port, data)) => {
742                // because vcpufd.run() mutably borrows self we cannot pass self to crate::debug! macro here
743                crate::debug!("KVM IO Details : \nPort : {}\nData : {:?}", port, data);
744                // KVM does not need to set RIP or instruction length so these are set to 0
745                HyperlightExit::IoOut(port, data.to_vec(), 0, 0)
746            }
747            Ok(VcpuExit::MmioRead(addr, _)) => {
748                crate::debug!("KVM MMIO Read -Details: Address: {} \n {:#?}", addr, &self);
749
750                match get_memory_access_violation(
751                    addr as usize,
752                    self.sandbox_regions
753                        .iter()
754                        .chain(self.mmap_regions.iter().map(|(r, _)| r)),
755                    MemoryRegionFlags::READ,
756                ) {
757                    Some(access_violation_exit) => access_violation_exit,
758                    None => HyperlightExit::Mmio(addr),
759                }
760            }
761            Ok(VcpuExit::MmioWrite(addr, _)) => {
762                crate::debug!("KVM MMIO Write -Details: Address: {} \n {:#?}", addr, &self);
763
764                match get_memory_access_violation(
765                    addr as usize,
766                    self.sandbox_regions
767                        .iter()
768                        .chain(self.mmap_regions.iter().map(|(r, _)| r)),
769                    MemoryRegionFlags::WRITE,
770                ) {
771                    Some(access_violation_exit) => access_violation_exit,
772                    None => HyperlightExit::Mmio(addr),
773                }
774            }
775            #[cfg(gdb)]
776            // KVM provides architecture specific information about the vCPU state when exiting
777            Ok(VcpuExit::Debug(debug_exit)) => match self.get_stop_reason(debug_exit) {
778                Ok(reason) => HyperlightExit::Debug(reason),
779                Err(e) => {
780                    log_then_return!("Error getting stop reason: {:?}", e);
781                }
782            },
783            Err(e) => match e.errno() {
784                // we send a signal to the thread to cancel execution this results in EINTR being returned by KVM so we return Cancelled
785                libc::EINTR => {
786                    // If cancellation was not requested for this specific vm, the vcpu was interrupted because of debug interrupt or
787                    // a stale signal that meant to be delivered to a previous/other vcpu on this same thread, so let's ignore it
788                    if cancel_requested {
789                        self.interrupt_handle
790                            .cancel_requested
791                            .store(false, Ordering::Relaxed);
792                        HyperlightExit::Cancelled()
793                    } else {
794                        #[cfg(gdb)]
795                        if debug_interrupt {
796                            self.interrupt_handle
797                                .debug_interrupt
798                                .store(false, Ordering::Relaxed);
799
800                            // If the vCPU was stopped because of an interrupt, we need to
801                            // return a special exit reason so that the gdb thread can handle it
802                            // and resume execution
803                            HyperlightExit::Debug(VcpuStopReason::Interrupt)
804                        } else {
805                            HyperlightExit::Retry()
806                        }
807
808                        #[cfg(not(gdb))]
809                        HyperlightExit::Retry()
810                    }
811                }
812                libc::EAGAIN => HyperlightExit::Retry(),
813                _ => {
814                    crate::debug!("KVM Error -Details: Address: {} \n {:#?}", e, &self);
815                    log_then_return!("Error running VCPU {:?}", e);
816                }
817            },
818            Ok(other) => {
819                let err_msg = format!("Unexpected KVM Exit {:?}", other);
820                crate::debug!("KVM Other Exit Details: {:#?}", &self);
821                HyperlightExit::Unknown(err_msg)
822            }
823        };
824        Ok(result)
825    }
826
827    #[instrument(skip_all, parent = Span::current(), level = "Trace")]
828    fn as_mut_hypervisor(&mut self) -> &mut dyn Hypervisor {
829        self as &mut dyn Hypervisor
830    }
831
832    fn interrupt_handle(&self) -> Arc<dyn InterruptHandle> {
833        self.interrupt_handle.clone()
834    }
835
836    #[cfg(crashdump)]
837    fn crashdump_context(&self) -> Result<Option<crashdump::CrashDumpContext<'_>>> {
838        if self.rt_cfg.guest_core_dump {
839            let mut regs = [0; 27];
840
841            let vcpu_regs = self.vcpu_fd.get_regs()?;
842            let sregs = self.vcpu_fd.get_sregs()?;
843            let xsave = self.vcpu_fd.get_xsave()?;
844
845            // Set the registers in the order expected by the crashdump context
846            regs[0] = vcpu_regs.r15; // r15
847            regs[1] = vcpu_regs.r14; // r14
848            regs[2] = vcpu_regs.r13; // r13
849            regs[3] = vcpu_regs.r12; // r12
850            regs[4] = vcpu_regs.rbp; // rbp
851            regs[5] = vcpu_regs.rbx; // rbx
852            regs[6] = vcpu_regs.r11; // r11
853            regs[7] = vcpu_regs.r10; // r10
854            regs[8] = vcpu_regs.r9; // r9
855            regs[9] = vcpu_regs.r8; // r8
856            regs[10] = vcpu_regs.rax; // rax
857            regs[11] = vcpu_regs.rcx; // rcx
858            regs[12] = vcpu_regs.rdx; // rdx
859            regs[13] = vcpu_regs.rsi; // rsi
860            regs[14] = vcpu_regs.rdi; // rdi
861            regs[15] = 0; // orig rax
862            regs[16] = vcpu_regs.rip; // rip
863            regs[17] = sregs.cs.selector as u64; // cs
864            regs[18] = vcpu_regs.rflags; // eflags
865            regs[19] = vcpu_regs.rsp; // rsp
866            regs[20] = sregs.ss.selector as u64; // ss
867            regs[21] = sregs.fs.base; // fs_base
868            regs[22] = sregs.gs.base; // gs_base
869            regs[23] = sregs.ds.selector as u64; // ds
870            regs[24] = sregs.es.selector as u64; // es
871            regs[25] = sregs.fs.selector as u64; // fs
872            regs[26] = sregs.gs.selector as u64; // gs
873
874            // Get the filename from the runtime config
875            let filename = self.rt_cfg.binary_path.clone().and_then(|path| {
876                Path::new(&path)
877                    .file_name()
878                    .and_then(|name| name.to_os_string().into_string().ok())
879            });
880
881            // The [`CrashDumpContext`] accepts xsave as a vector of u8, so we need to convert the
882            // xsave region to a vector of u8
883            Ok(Some(crashdump::CrashDumpContext::new(
884                &self.sandbox_regions,
885                regs,
886                xsave
887                    .region
888                    .iter()
889                    .flat_map(|item| item.to_le_bytes())
890                    .collect::<Vec<u8>>(),
891                self.entrypoint,
892                self.rt_cfg.binary_path.clone(),
893                filename,
894            )))
895        } else {
896            Ok(None)
897        }
898    }
899
    /// Runs the GDB message loop while the vCPU is stopped for `stop_reason`.
    ///
    /// Reports the stop reason to the gdb thread, then serves debug requests
    /// until the debugger asks to resume (Continue/Step/DisableDebug). A
    /// `Crash` stop gets a restricted loop: only inspection requests are
    /// honored, since the guest cannot safely resume.
    ///
    /// # Errors
    /// Fails if debugging is not enabled, if communication with the gdb
    /// thread breaks, or if a debug request fails with a fatal error
    /// (anything other than `TranslateGuestAddress`).
    #[cfg(gdb)]
    fn handle_debug(
        &mut self,
        dbg_mem_access_fn: Arc<Mutex<SandboxMemoryManager<HostSharedMemory>>>,
        stop_reason: VcpuStopReason,
    ) -> Result<()> {
        // A debug channel only exists when the sandbox was built with gdb
        // support enabled at runtime.
        if self.debug.is_none() {
            return Err(new_error!("Debugging is not enabled"));
        }

        match stop_reason {
            // If the vCPU stopped because of a crash, we need to handle it differently
            // We do not want to allow resuming execution or placing breakpoints
            // because the guest has crashed.
            // We only allow reading registers and memory
            VcpuStopReason::Crash => {
                self.send_dbg_msg(DebugResponse::VcpuStopped(stop_reason))
                    .map_err(|e| {
                        new_error!("Couldn't signal vCPU stopped event to GDB thread: {:?}", e)
                    })?;

                loop {
                    log::debug!("Debug wait for event to resume vCPU");
                    // Wait for a message from gdb
                    let req = self.recv_dbg_msg()?;

                    // Flag to store if we should deny continue or step requests
                    let mut deny_continue = false;
                    // Flag to store if we should detach from the gdb session
                    let mut detach = false;

                    let response = match req {
                        // Allow the detach request to disable debugging by continuing resuming
                        // hypervisor crash error reporting
                        DebugMsg::DisableDebug => {
                            detach = true;
                            DebugResponse::DisableDebug
                        }
                        // Do not allow continue or step requests
                        DebugMsg::Continue | DebugMsg::Step => {
                            deny_continue = true;
                            DebugResponse::NotAllowed
                        }
                        // Do not allow adding/removing breakpoints and writing to memory or registers
                        DebugMsg::AddHwBreakpoint(_)
                        | DebugMsg::AddSwBreakpoint(_)
                        | DebugMsg::RemoveHwBreakpoint(_)
                        | DebugMsg::RemoveSwBreakpoint(_)
                        | DebugMsg::WriteAddr(_, _)
                        | DebugMsg::WriteRegisters(_) => DebugResponse::NotAllowed,

                        // For all other requests, we will process them normally
                        // (reads of registers/memory are safe on a crashed guest)
                        _ => {
                            let result = self.process_dbg_request(req, dbg_mem_access_fn.clone());
                            match result {
                                Ok(response) => response,
                                Err(HyperlightError::TranslateGuestAddress(_)) => {
                                    // Treat non fatal errors separately so the guest doesn't fail
                                    DebugResponse::ErrorOccurred
                                }
                                Err(e) => {
                                    log::error!("Error processing debug request: {:?}", e);
                                    return Err(e);
                                }
                            }
                        }
                    };

                    // Send the response to the request back to gdb
                    self.send_dbg_msg(response)
                        .map_err(|e| new_error!("Couldn't send response to gdb: {:?}", e))?;

                    // If we are denying continue or step requests, the debugger assumes the
                    // execution started so we need to report a stop reason as a crash and let
                    // it request to read registers/memory to figure out what happened
                    if deny_continue {
                        self.send_dbg_msg(DebugResponse::VcpuStopped(VcpuStopReason::Crash))
                            .map_err(|e| new_error!("Couldn't send response to gdb: {:?}", e))?;
                    }

                    // If we are detaching, we will break the loop and the Hypervisor will continue
                    // to handle the Crash reason
                    if detach {
                        break;
                    }
                }
            }
            // If the vCPU stopped because of any other reason except a crash, we can handle it
            // normally
            _ => {
                // Send the stop reason to the gdb thread
                self.send_dbg_msg(DebugResponse::VcpuStopped(stop_reason))
                    .map_err(|e| {
                        new_error!("Couldn't signal vCPU stopped event to GDB thread: {:?}", e)
                    })?;

                loop {
                    log::debug!("Debug wait for event to resume vCPU");
                    // Wait for a message from gdb
                    let req = self.recv_dbg_msg()?;

                    let result = self.process_dbg_request(req, dbg_mem_access_fn.clone());

                    let response = match result {
                        Ok(response) => response,
                        // Treat non fatal errors separately so the guest doesn't fail
                        Err(HyperlightError::TranslateGuestAddress(_)) => {
                            DebugResponse::ErrorOccurred
                        }
                        Err(e) => {
                            return Err(e);
                        }
                    };

                    // Decide whether to resume BEFORE sending, since `response`
                    // is moved into send_dbg_msg below.
                    let cont = matches!(
                        response,
                        DebugResponse::Continue | DebugResponse::Step | DebugResponse::DisableDebug
                    );

                    self.send_dbg_msg(response)
                        .map_err(|e| new_error!("Couldn't send response to gdb: {:?}", e))?;

                    // Check if we should continue execution
                    // We continue if the response is one of the following: Step, Continue, or DisableDebug
                    if cont {
                        break;
                    }
                }
            }
        }

        Ok(())
    }
1033
1034    fn check_stack_guard(&self) -> Result<bool> {
1035        if let Some(mgr) = self.mem_mgr.as_ref() {
1036            mgr.check_stack_guard()
1037        } else {
1038            Err(new_error!("Memory manager is not initialized"))
1039        }
1040    }
1041
1042    #[cfg(feature = "trace_guest")]
1043    fn read_trace_reg(&self, reg: TraceRegister) -> Result<u64> {
1044        let regs = self.vcpu_fd.get_regs()?;
1045        Ok(match reg {
1046            TraceRegister::RAX => regs.rax,
1047            TraceRegister::RCX => regs.rcx,
1048            TraceRegister::RIP => regs.rip,
1049            TraceRegister::RSP => regs.rsp,
1050            TraceRegister::RBP => regs.rbp,
1051        })
1052    }
1053
    /// Borrows the guest-trace state held by this driver.
    #[cfg(feature = "trace_guest")]
    fn trace_info_as_ref(&self) -> &TraceInfo {
        &self.trace_info
    }
    /// Mutably borrows the guest-trace state held by this driver.
    #[cfg(feature = "trace_guest")]
    fn trace_info_as_mut(&mut self) -> &mut TraceInfo {
        &mut self.trace_info
    }
1062}
1063
impl Drop for KVMDriver {
    fn drop(&mut self) {
        // Record on the shared interrupt handle that this VM is gone, so any
        // outstanding handle clones (handed out via `interrupt_handle()`) can
        // observe that the vCPU no longer exists.
        // NOTE(review): presumably this stops stale interrupt signals from
        // targeting a reused thread — confirm against LinuxInterruptHandle.
        // Relaxed suffices here: it's an independent flag with no required
        // ordering relative to other memory.
        self.interrupt_handle.dropped.store(true, Ordering::Relaxed);
    }
}