hyperlight_host/hypervisor/
kvm.rs

/*
Copyright 2025  The Hyperlight Authors.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
16
17use std::fmt::Debug;
18use std::sync::atomic::{AtomicBool, AtomicU64, Ordering};
19use std::sync::{Arc, Mutex};
20
21use kvm_bindings::{kvm_fpu, kvm_regs, kvm_sregs, kvm_userspace_memory_region};
22use kvm_ioctls::Cap::UserMemory;
23use kvm_ioctls::{Kvm, VcpuExit, VcpuFd, VmFd};
24use log::LevelFilter;
25use tracing::{Span, instrument};
26#[cfg(feature = "trace_guest")]
27use tracing_opentelemetry::OpenTelemetrySpanExt;
28#[cfg(crashdump)]
29use {super::crashdump, std::path::Path};
30
31#[cfg(gdb)]
32use super::gdb::{
33    DebugCommChannel, DebugMemoryAccess, DebugMsg, DebugResponse, GuestDebug, KvmDebug,
34    VcpuStopReason,
35};
36use super::{HyperlightExit, Hypervisor, LinuxInterruptHandle, VirtualCPU};
37#[cfg(gdb)]
38use crate::HyperlightError;
39use crate::hypervisor::get_memory_access_violation;
40use crate::hypervisor::regs::{CommonFpu, CommonRegisters};
41use crate::mem::memory_region::{MemoryRegion, MemoryRegionFlags};
42use crate::mem::mgr::SandboxMemoryManager;
43use crate::mem::ptr::{GuestPtr, RawPtr};
44use crate::mem::shared_mem::HostSharedMemory;
45use crate::sandbox::SandboxConfiguration;
46use crate::sandbox::host_funcs::FunctionRegistry;
47use crate::sandbox::outb::handle_outb;
48#[cfg(feature = "mem_profile")]
49use crate::sandbox::trace::MemTraceInfo;
50#[cfg(crashdump)]
51use crate::sandbox::uninitialized::SandboxRuntimeConfig;
52use crate::{Result, log_then_return, new_error};
53
54/// Return `true` if the KVM API is available, version 12, and has UserMemory capability, or `false` otherwise
55#[instrument(skip_all, parent = Span::current(), level = "Trace")]
56pub(crate) fn is_hypervisor_present() -> bool {
57    if let Ok(kvm) = Kvm::new() {
58        let api_version = kvm.get_api_version();
59        match api_version {
60            version if version == 12 && kvm.check_extension(UserMemory) => true,
61            12 => {
62                log::info!("KVM does not have KVM_CAP_USER_MEMORY capability");
63                false
64            }
65            version => {
66                log::info!("KVM GET_API_VERSION returned {}, expected 12", version);
67                false
68            }
69        }
70    } else {
71        log::info!("KVM is not available on this system");
72        false
73    }
74}
75
/// gdb debugging support for [`KVMDriver`]: translating debug requests
/// from the gdb thread into vCPU debug operations and shuttling
/// messages over the debug communication channel.
#[cfg(gdb)]
mod debug {
    use kvm_bindings::kvm_debug_exit_arch;

    use super::KVMDriver;
    use crate::hypervisor::gdb::{
        DebugMemoryAccess, DebugMsg, DebugResponse, GuestDebug, KvmDebug, VcpuStopReason,
    };
    use crate::{Result, new_error};

    impl KVMDriver {
        /// Resets the debug information to disable debugging
        fn disable_debug(&mut self) -> Result<()> {
            // A fresh default KvmDebug replaces any existing breakpoint state.
            let mut debug = KvmDebug::default();

            debug.set_single_step(&self.vcpu_fd, false)?;

            self.debug = Some(debug);

            Ok(())
        }

        /// Get the reason the vCPU has stopped
        pub(crate) fn get_stop_reason(
            &mut self,
            debug_exit: kvm_debug_exit_arch,
        ) -> Result<VcpuStopReason> {
            let debug = self
                .debug
                .as_mut()
                .ok_or_else(|| new_error!("Debug is not enabled"))?;

            debug.get_stop_reason(&self.vcpu_fd, debug_exit, self.entrypoint)
        }

        /// Dispatches a single request from the gdb thread to the debug
        /// implementation and maps the outcome to a `DebugResponse`.
        ///
        /// Returns an error if debugging is not enabled for this driver.
        pub(crate) fn process_dbg_request(
            &mut self,
            req: DebugMsg,
            mem_access: &DebugMemoryAccess,
        ) -> Result<DebugResponse> {
            if let Some(debug) = self.debug.as_mut() {
                match req {
                    // Breakpoint add/remove requests report success as a bool
                    // in the response rather than propagating the error.
                    DebugMsg::AddHwBreakpoint(addr) => Ok(DebugResponse::AddHwBreakpoint(
                        debug
                            .add_hw_breakpoint(&self.vcpu_fd, addr)
                            .map_err(|e| {
                                log::error!("Failed to add hw breakpoint: {:?}", e);

                                e
                            })
                            .is_ok(),
                    )),
                    DebugMsg::AddSwBreakpoint(addr) => Ok(DebugResponse::AddSwBreakpoint(
                        debug
                            .add_sw_breakpoint(&self.vcpu_fd, addr, mem_access)
                            .map_err(|e| {
                                log::error!("Failed to add sw breakpoint: {:?}", e);

                                e
                            })
                            .is_ok(),
                    )),
                    // Continue means "resume at full speed": single-step off.
                    DebugMsg::Continue => {
                        debug.set_single_step(&self.vcpu_fd, false).map_err(|e| {
                            log::error!("Failed to continue execution: {:?}", e);

                            e
                        })?;

                        Ok(DebugResponse::Continue)
                    }
                    DebugMsg::DisableDebug => {
                        self.disable_debug().map_err(|e| {
                            log::error!("Failed to disable debugging: {:?}", e);

                            e
                        })?;

                        Ok(DebugResponse::DisableDebug)
                    }
                    DebugMsg::GetCodeSectionOffset => {
                        // Read the guest code address from the sandbox memory
                        // layout; the lock is only held for this lookup.
                        let offset = mem_access
                            .dbg_mem_access_fn
                            .try_lock()
                            .map_err(|e| {
                                new_error!("Error locking at {}:{}: {}", file!(), line!(), e)
                            })?
                            .layout
                            .get_guest_code_address();

                        Ok(DebugResponse::GetCodeSectionOffset(offset as u64))
                    }
                    DebugMsg::ReadAddr(addr, len) => {
                        let mut data = vec![0u8; len];

                        debug.read_addrs(&self.vcpu_fd, addr, &mut data, mem_access)?;

                        Ok(DebugResponse::ReadAddr(data))
                    }
                    DebugMsg::ReadRegisters => debug
                        .read_regs(&self.vcpu_fd)
                        .map_err(|e| {
                            log::error!("Failed to read registers: {:?}", e);

                            e
                        })
                        .map(|(regs, fpu)| DebugResponse::ReadRegisters(Box::new((regs, fpu)))),
                    DebugMsg::RemoveHwBreakpoint(addr) => Ok(DebugResponse::RemoveHwBreakpoint(
                        debug
                            .remove_hw_breakpoint(&self.vcpu_fd, addr)
                            .map_err(|e| {
                                log::error!("Failed to remove hw breakpoint: {:?}", e);

                                e
                            })
                            .is_ok(),
                    )),
                    DebugMsg::RemoveSwBreakpoint(addr) => Ok(DebugResponse::RemoveSwBreakpoint(
                        debug
                            .remove_sw_breakpoint(&self.vcpu_fd, addr, mem_access)
                            .map_err(|e| {
                                log::error!("Failed to remove sw breakpoint: {:?}", e);

                                e
                            })
                            .is_ok(),
                    )),
                    // Step means "execute one instruction": single-step on.
                    DebugMsg::Step => {
                        debug.set_single_step(&self.vcpu_fd, true).map_err(|e| {
                            log::error!("Failed to enable step instruction: {:?}", e);

                            e
                        })?;

                        Ok(DebugResponse::Step)
                    }
                    DebugMsg::WriteAddr(addr, data) => {
                        debug.write_addrs(&self.vcpu_fd, addr, &data, mem_access)?;

                        Ok(DebugResponse::WriteAddr)
                    }
                    DebugMsg::WriteRegisters(boxed_regs) => {
                        let (regs, fpu) = boxed_regs.as_ref();
                        debug
                            .write_regs(&self.vcpu_fd, regs, fpu)
                            .map_err(|e| {
                                log::error!("Failed to write registers: {:?}", e);

                                e
                            })
                            .map(|_| DebugResponse::WriteRegisters)
                    }
                }
            } else {
                Err(new_error!("Debugging is not enabled"))
            }
        }

        /// Blocks until a request arrives from the gdb thread.
        ///
        /// Errors if debugging is not enabled or the channel fails.
        pub(crate) fn recv_dbg_msg(&mut self) -> Result<DebugMsg> {
            let gdb_conn = self
                .gdb_conn
                .as_mut()
                .ok_or_else(|| new_error!("Debug is not enabled"))?;

            gdb_conn.recv().map_err(|e| {
                new_error!(
                    "Got an error while waiting to receive a message from the gdb thread: {:?}",
                    e
                )
            })
        }

        /// Sends a response back to the gdb thread.
        ///
        /// Errors if debugging is not enabled or the channel fails.
        pub(crate) fn send_dbg_msg(&mut self, cmd: DebugResponse) -> Result<()> {
            log::debug!("Sending {:?}", cmd);

            let gdb_conn = self
                .gdb_conn
                .as_mut()
                .ok_or_else(|| new_error!("Debug is not enabled"))?;

            gdb_conn.send(cmd).map_err(|e| {
                new_error!(
                    "Got an error while sending a response message to the gdb thread: {:?}",
                    e
                )
            })
        }
    }
}
265
/// A Hypervisor driver for KVM on Linux
pub(crate) struct KVMDriver {
    // Keeps the KVM system handle alive for the driver's lifetime.
    _kvm: Kvm,
    // Handle to the KVM virtual machine.
    vm_fd: VmFd,
    // Guest page size; 0 until set in `initialise`.
    page_size: usize,
    // Handle to the single vCPU of this VM.
    vcpu_fd: VcpuFd,
    // Guest address execution starts at.
    entrypoint: u64,
    // Initial stack pointer; reused as RSP for each dispatched guest call.
    orig_rsp: GuestPtr,
    // Shared handle used to interrupt the running vCPU from other threads.
    interrupt_handle: Arc<LinuxInterruptHandle>,
    // Sandbox memory manager; populated in `initialise`, used for outb exits.
    mem_mgr: Option<SandboxMemoryManager<HostSharedMemory>>,
    // Host function registry; populated in `initialise`.
    host_funcs: Option<Arc<Mutex<FunctionRegistry>>>,

    sandbox_regions: Vec<MemoryRegion>, // Initially mapped regions when sandbox is created
    mmap_regions: Vec<(MemoryRegion, u32)>, // Later mapped regions (region, slot number)
    next_slot: u32,                     // Monotonically increasing slot number
    freed_slots: Vec<u32>,              // Reusable slots from unmapped regions

    // gdb debug state; present only when a gdb connection was supplied.
    #[cfg(gdb)]
    debug: Option<KvmDebug>,
    // Channel for exchanging messages with the gdb thread.
    #[cfg(gdb)]
    gdb_conn: Option<DebugCommChannel<DebugResponse, DebugMsg>>,
    #[cfg(crashdump)]
    rt_cfg: SandboxRuntimeConfig,
    #[cfg(feature = "mem_profile")]
    trace_info: MemTraceInfo,
}
292
impl KVMDriver {
    /// Create a new instance of a `KVMDriver`, with only control registers
    /// set. Standard registers will not be set, and `initialise` must
    /// be called to do so.
    #[allow(clippy::too_many_arguments)]
    // TODO: refactor this function to take fewer arguments. Add trace_info to rt_cfg
    #[instrument(err(Debug), skip_all, parent = Span::current(), level = "Trace")]
    pub(crate) fn new(
        mem_regions: Vec<MemoryRegion>,
        pml4_addr: u64,
        entrypoint: u64,
        rsp: u64,
        config: &SandboxConfiguration,
        #[cfg(gdb)] gdb_conn: Option<DebugCommChannel<DebugResponse, DebugMsg>>,
        #[cfg(crashdump)] rt_cfg: SandboxRuntimeConfig,
        #[cfg(feature = "mem_profile")] trace_info: MemTraceInfo,
    ) -> Result<Self> {
        let kvm = Kvm::new()?;

        // Create a VM of the default type (0).
        let vm_fd = kvm.create_vm_with_type(0)?;

        // Register each initial sandbox region with KVM, using the region's
        // index in the vector as its KVM memory slot number.
        mem_regions.iter().enumerate().try_for_each(|(i, region)| {
            let mut kvm_region: kvm_userspace_memory_region = region.clone().into();
            kvm_region.slot = i as u32;
            unsafe { vm_fd.set_user_memory_region(kvm_region) }
        })?;

        let vcpu_fd = vm_fd.create_vcpu(0)?;

        // When a gdb connection is supplied, enable debugging and stop the
        // guest at its entry point so the debugger can attach before any
        // guest code runs.
        #[cfg(gdb)]
        let (debug, gdb_conn) = if let Some(gdb_conn) = gdb_conn {
            let mut debug = KvmDebug::new();
            // Add breakpoint to the entry point address
            debug.add_hw_breakpoint(&vcpu_fd, entrypoint)?;

            (Some(debug), Some(gdb_conn))
        } else {
            (None, None)
        };

        let rsp_gp = GuestPtr::try_from(RawPtr::from(rsp))?;

        let interrupt_handle = Arc::new(LinuxInterruptHandle {
            running: AtomicU64::new(0),
            cancel_requested: AtomicU64::new(0),
            call_active: AtomicBool::new(false),
            #[cfg(gdb)]
            debug_interrupt: AtomicBool::new(false),
            // On x86_64 musl, `pthread_self()` does not return an integer
            // type (presumably `pthread_t` is a pointer there — TODO confirm),
            // so the value needs an explicit cast to u64.
            #[cfg(all(
                target_arch = "x86_64",
                target_vendor = "unknown",
                target_os = "linux",
                target_env = "musl"
            ))]
            tid: AtomicU64::new(unsafe { libc::pthread_self() as u64 }),
            #[cfg(not(all(
                target_arch = "x86_64",
                target_vendor = "unknown",
                target_os = "linux",
                target_env = "musl"
            )))]
            tid: AtomicU64::new(unsafe { libc::pthread_self() }),
            retry_delay: config.get_interrupt_retry_delay(),
            dropped: AtomicBool::new(false),
            sig_rt_min_offset: config.get_interrupt_vcpu_sigrtmin_offset(),
        });

        let mut kvm = Self {
            _kvm: kvm,
            vm_fd,
            page_size: 0,
            vcpu_fd,
            entrypoint,
            orig_rsp: rsp_gp,
            // Dynamically mapped regions get slot numbers after the
            // initial sandbox regions.
            next_slot: mem_regions.len() as u32,
            sandbox_regions: mem_regions,
            mmap_regions: Vec::new(),
            freed_slots: Vec::new(),
            interrupt_handle: interrupt_handle.clone(),
            mem_mgr: None,
            host_funcs: None,
            #[cfg(gdb)]
            debug,
            #[cfg(gdb)]
            gdb_conn,
            #[cfg(crashdump)]
            rt_cfg,
            #[cfg(feature = "mem_profile")]
            trace_info,
        };

        kvm.setup_initial_sregs(pml4_addr)?;

        // Send the interrupt handle to the GDB thread if debugging is enabled
        // This is used to allow the GDB thread to stop the vCPU
        #[cfg(gdb)]
        if kvm.debug.is_some() {
            kvm.send_dbg_msg(DebugResponse::InterruptHandle(interrupt_handle))?;
        }

        Ok(kvm)
    }
}
396
397impl Debug for KVMDriver {
398    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
399        let mut f = f.debug_struct("KVM Driver");
400        // Output each memory region
401
402        for region in &self.sandbox_regions {
403            f.field("Sandbox Memory Region", &region);
404        }
405        for region in &self.mmap_regions {
406            f.field("Mapped Memory Region", &region);
407        }
408        let regs = self.vcpu_fd.get_regs();
409        // check that regs is OK and then set field in debug struct
410
411        if let Ok(regs) = regs {
412            f.field("Registers", &regs);
413        }
414
415        let sregs = self.vcpu_fd.get_sregs();
416
417        // check that sregs is OK and then set field in debug struct
418
419        if let Ok(sregs) = sregs {
420            f.field("Special Registers", &sregs);
421        }
422
423        f.finish()
424    }
425}
426
427impl Hypervisor for KVMDriver {
428    /// Implementation of initialise for Hypervisor trait.
429    #[instrument(err(Debug), skip_all, parent = Span::current(), level = "Trace")]
430    fn initialise(
431        &mut self,
432        peb_addr: RawPtr,
433        seed: u64,
434        page_size: u32,
435        mem_mgr: SandboxMemoryManager<HostSharedMemory>,
436        host_funcs: Arc<Mutex<FunctionRegistry>>,
437        max_guest_log_level: Option<LevelFilter>,
438        #[cfg(gdb)] dbg_mem_access_fn: Arc<Mutex<SandboxMemoryManager<HostSharedMemory>>>,
439    ) -> Result<()> {
440        self.mem_mgr = Some(mem_mgr);
441        self.host_funcs = Some(host_funcs);
442        self.page_size = page_size as usize;
443
444        let max_guest_log_level: u64 = match max_guest_log_level {
445            Some(level) => level as u64,
446            None => self.get_max_log_level().into(),
447        };
448
449        let regs = CommonRegisters {
450            rip: self.entrypoint,
451            rsp: self.orig_rsp.absolute()?,
452
453            // function args
454            rdi: peb_addr.into(),
455            rsi: seed,
456            rdx: page_size.into(),
457            rcx: max_guest_log_level,
458
459            ..Default::default()
460        };
461        self.set_regs(&regs)?;
462
463        VirtualCPU::run(
464            self.as_mut_hypervisor(),
465            #[cfg(gdb)]
466            dbg_mem_access_fn,
467        )
468    }
469
470    #[instrument(err(Debug), skip_all, parent = Span::current(), level = "Trace")]
471    unsafe fn map_region(&mut self, region: &MemoryRegion) -> Result<()> {
472        if [
473            region.guest_region.start,
474            region.guest_region.end,
475            region.host_region.start,
476            region.host_region.end,
477        ]
478        .iter()
479        .any(|x| x % self.page_size != 0)
480        {
481            log_then_return!(
482                "region is not page-aligned {:x}, {region:?}",
483                self.page_size
484            );
485        }
486
487        let mut kvm_region: kvm_userspace_memory_region = region.clone().into();
488
489        // Try to reuse a freed slot first, otherwise use next_slot
490        let slot = if let Some(freed_slot) = self.freed_slots.pop() {
491            freed_slot
492        } else {
493            let slot = self.next_slot;
494            self.next_slot += 1;
495            slot
496        };
497
498        kvm_region.slot = slot;
499        unsafe { self.vm_fd.set_user_memory_region(kvm_region) }?;
500        self.mmap_regions.push((region.to_owned(), slot));
501        Ok(())
502    }
503
504    #[instrument(err(Debug), skip_all, parent = Span::current(), level = "Trace")]
505    unsafe fn unmap_region(&mut self, region: &MemoryRegion) -> Result<()> {
506        if let Some(idx) = self.mmap_regions.iter().position(|(r, _)| r == region) {
507            let (region, slot) = self.mmap_regions.remove(idx);
508            let mut kvm_region: kvm_userspace_memory_region = region.into();
509            kvm_region.slot = slot;
510            // Setting memory_size to 0 unmaps the slot's region
511            // From https://docs.kernel.org/virt/kvm/api.html
512            // > Deleting a slot is done by passing zero for memory_size.
513            kvm_region.memory_size = 0;
514            unsafe { self.vm_fd.set_user_memory_region(kvm_region) }?;
515
516            // Add the freed slot to the reuse list
517            self.freed_slots.push(slot);
518
519            Ok(())
520        } else {
521            Err(new_error!("Tried to unmap region that is not mapped"))
522        }
523    }
524
525    fn get_mapped_regions(&self) -> Box<dyn ExactSizeIterator<Item = &MemoryRegion> + '_> {
526        Box::new(self.mmap_regions.iter().map(|(region, _)| region))
527    }
528
529    #[instrument(err(Debug), skip_all, parent = Span::current(), level = "Trace")]
530    fn dispatch_call_from_host(
531        &mut self,
532        dispatch_func_addr: RawPtr,
533        #[cfg(gdb)] dbg_mem_access_fn: Arc<Mutex<SandboxMemoryManager<HostSharedMemory>>>,
534    ) -> Result<()> {
535        // Reset general purpose registers, then set RIP and RSP
536        let regs = CommonRegisters {
537            rip: dispatch_func_addr.into(),
538            rsp: self.orig_rsp.absolute()?,
539            ..Default::default()
540        };
541        self.set_regs(&regs)?;
542
543        // reset fpu state
544        self.set_fpu(&CommonFpu::default())?;
545
546        // run
547        VirtualCPU::run(
548            self.as_mut_hypervisor(),
549            #[cfg(gdb)]
550            dbg_mem_access_fn,
551        )?;
552
553        Ok(())
554    }
555
556    #[instrument(err(Debug), skip_all, parent = Span::current(), level = "Trace")]
557    fn handle_io(
558        &mut self,
559        port: u16,
560        data: Vec<u8>,
561        _rip: u64,
562        _instruction_length: u64,
563    ) -> Result<()> {
564        // KVM does not need RIP or instruction length, as it automatically sets the RIP
565
566        // The payload param for the outb_handle_fn is the first byte
567        // of the data array cast to an u64. Thus, we need to make sure
568        // the data array has at least one u8, then convert that to an u64
569        if data.is_empty() {
570            log_then_return!("no data was given in IO interrupt");
571        } else {
572            let mut padded = [0u8; 4];
573            let copy_len = data.len().min(4);
574            padded[..copy_len].copy_from_slice(&data[..copy_len]);
575            let value = u32::from_le_bytes(padded);
576
577            #[cfg(feature = "mem_profile")]
578            {
579                // We need to handle the borrow checker issue where we need both:
580                // - &mut SandboxMemoryManager (from self.mem_mgr.as_mut())
581                // - &mut dyn Hypervisor (from self)
582                // We'll use a temporary approach to extract the mem_mgr temporarily
583                let mem_mgr_option = self.mem_mgr.take();
584                let mut mem_mgr =
585                    mem_mgr_option.ok_or_else(|| new_error!("mem_mgr not initialized"))?;
586                let host_funcs = self
587                    .host_funcs
588                    .as_ref()
589                    .ok_or_else(|| new_error!("host_funcs not initialized"))?
590                    .clone();
591
592                handle_outb(&mut mem_mgr, host_funcs, self, port, value)?;
593
594                // Put the mem_mgr back
595                self.mem_mgr = Some(mem_mgr);
596            }
597
598            #[cfg(not(feature = "mem_profile"))]
599            {
600                let mem_mgr = self
601                    .mem_mgr
602                    .as_mut()
603                    .ok_or_else(|| new_error!("mem_mgr not initialized"))?;
604                let host_funcs = self
605                    .host_funcs
606                    .as_ref()
607                    .ok_or_else(|| new_error!("host_funcs not initialized"))?
608                    .clone();
609
610                handle_outb(mem_mgr, host_funcs, port, value)?;
611            }
612        }
613
614        Ok(())
615    }
616
    /// Runs the vCPU until its next exit and translates the KVM exit
    /// reason into a `HyperlightExit`, cooperating with
    /// `InterruptHandle::kill()` via the running-bit/generation protocol
    /// described in the inline comments below.
    #[instrument(err(Debug), skip_all, parent = Span::current(), level = "Trace")]
    fn run(
        &mut self,
        #[cfg(feature = "trace_guest")] tc: &mut crate::sandbox::trace::TraceContext,
    ) -> Result<HyperlightExit> {
        // Publish the current thread id so kill() knows where to send signals.
        self.interrupt_handle
            .tid
            .store(unsafe { libc::pthread_self() as u64 }, Ordering::Release);
        // Note: if `InterruptHandle::kill()` is called while this thread is **here**
        // Cast to internal trait for access to internal methods
        let interrupt_handle_internal =
            self.interrupt_handle.as_ref() as &dyn super::InterruptHandleInternal;

        // (after set_running_bit but before checking cancel_requested):
        // - kill() will stamp cancel_requested with the current generation
        // - We will check cancel_requested below and skip the VcpuFd::run() call
        // - This is the desired behavior - the kill takes effect immediately
        let generation = interrupt_handle_internal.set_running_bit();

        #[cfg(not(gdb))]
        let debug_interrupt = false;
        #[cfg(gdb)]
        let debug_interrupt = self
            .interrupt_handle
            .debug_interrupt
            .load(Ordering::Relaxed);
        // Don't run the vcpu if `cancel_requested` is set for our generation
        //
        // Note: if `InterruptHandle::kill()` is called while this thread is **here**
        // (after checking cancel_requested but before vcpu.run()):
        // - kill() will stamp cancel_requested with the current generation
        // - We will proceed with vcpu.run(), but signals will be sent to interrupt it
        // - The vcpu will be interrupted and return EINTR (handled below)
        let exit_reason = if interrupt_handle_internal
            .is_cancel_requested_for_generation(generation)
            || debug_interrupt
        {
            Err(kvm_ioctls::Error::new(libc::EINTR))
        } else {
            #[cfg(feature = "trace_guest")]
            tc.setup_guest_trace(Span::current().context());

            // Note: if `InterruptHandle::kill()` is called while this thread is **here**
            // (during vcpu.run() execution):
            // - kill() stamps cancel_requested with the current generation
            // - kill() sends signals (SIGRTMIN+offset) to this thread repeatedly
            // - The signal handler is a no-op, but it causes vcpu.run() to return EINTR
            // - We check cancel_requested below and return Cancelled if generation matches
            self.vcpu_fd.run()
        };
        // Note: if `InterruptHandle::kill()` is called while this thread is **here**
        // (after vcpu.run() returns but before clear_running_bit):
        // - kill() continues sending signals to this thread (running bit is still set)
        // - The signals are harmless (no-op handler), we just need to check cancel_requested
        // - We load cancel_requested below to determine if this run was cancelled
        let cancel_requested =
            interrupt_handle_internal.is_cancel_requested_for_generation(generation);
        #[cfg(gdb)]
        let debug_interrupt = self
            .interrupt_handle
            .debug_interrupt
            .load(Ordering::Relaxed);
        // Note: if `InterruptHandle::kill()` is called while this thread is **here**
        // (after loading cancel_requested but before clear_running_bit):
        // - kill() stamps cancel_requested with the CURRENT generation (not the one we just loaded)
        // - kill() continues sending signals until running bit is cleared
        // - The newly stamped cancel_requested will affect the NEXT vcpu.run() call
        // - Signals sent now are harmless (no-op handler)
        interrupt_handle_internal.clear_running_bit();
        // At this point, running bit is clear so kill() will stop sending signals.
        // However, we may still receive delayed signals that were sent before clear_running_bit.
        // These stale signals are harmless because:
        // - The signal handler is a no-op
        // - We check generation matching in cancel_requested before treating EINTR as cancellation
        // - If generation doesn't match, we return Retry instead of Cancelled
        let result = match exit_reason {
            Ok(VcpuExit::Hlt) => {
                crate::debug!("KVM - Halt Details : {:#?}", &self);
                HyperlightExit::Halt()
            }
            Ok(VcpuExit::IoOut(port, data)) => {
                // because vcpufd.run() mutably borrows self we cannot pass self to crate::debug! macro here
                crate::debug!("KVM IO Details : \nPort : {}\nData : {:?}", port, data);
                // KVM does not need to set RIP or instruction length so these are set to 0
                HyperlightExit::IoOut(port, data.to_vec(), 0, 0)
            }
            Ok(VcpuExit::MmioRead(addr, _)) => {
                crate::debug!("KVM MMIO Read -Details: Address: {} \n {:#?}", addr, &self);

                match get_memory_access_violation(
                    addr as usize,
                    self.sandbox_regions
                        .iter()
                        .chain(self.mmap_regions.iter().map(|(r, _)| r)),
                    MemoryRegionFlags::READ,
                ) {
                    Some(access_violation_exit) => access_violation_exit,
                    None => HyperlightExit::Mmio(addr),
                }
            }
            Ok(VcpuExit::MmioWrite(addr, _)) => {
                crate::debug!("KVM MMIO Write -Details: Address: {} \n {:#?}", addr, &self);

                match get_memory_access_violation(
                    addr as usize,
                    self.sandbox_regions
                        .iter()
                        .chain(self.mmap_regions.iter().map(|(r, _)| r)),
                    MemoryRegionFlags::WRITE,
                ) {
                    Some(access_violation_exit) => access_violation_exit,
                    None => HyperlightExit::Mmio(addr),
                }
            }
            #[cfg(gdb)]
            // KVM provides architecture specific information about the vCPU state when exiting
            Ok(VcpuExit::Debug(debug_exit)) => match self.get_stop_reason(debug_exit) {
                Ok(reason) => HyperlightExit::Debug(reason),
                Err(e) => {
                    log_then_return!("Error getting stop reason: {:?}", e);
                }
            },
            Err(e) => match e.errno() {
                // We send a signal (SIGRTMIN+offset) to interrupt the vcpu, which causes EINTR
                libc::EINTR => {
                    // Check if cancellation was requested for THIS specific generation.
                    // If not, the EINTR came from:
                    // - A debug interrupt (if GDB is enabled)
                    // - A stale signal from a previous guest call (generation mismatch)
                    // - A signal meant for a different sandbox on the same thread
                    // In these cases, we return Retry to continue execution.
                    if cancel_requested {
                        interrupt_handle_internal.clear_cancel_requested();
                        HyperlightExit::Cancelled()
                    } else {
                        #[cfg(gdb)]
                        if debug_interrupt {
                            self.interrupt_handle
                                .debug_interrupt
                                .store(false, Ordering::Relaxed);

                            // If the vCPU was stopped because of an interrupt, we need to
                            // return a special exit reason so that the gdb thread can handle it
                            // and resume execution
                            HyperlightExit::Debug(VcpuStopReason::Interrupt)
                        } else {
                            HyperlightExit::Retry()
                        }

                        #[cfg(not(gdb))]
                        HyperlightExit::Retry()
                    }
                }
                libc::EAGAIN => HyperlightExit::Retry(),
                _ => {
                    crate::debug!("KVM Error -Details: Address: {} \n {:#?}", e, &self);
                    log_then_return!("Error running VCPU {:?}", e);
                }
            },
            Ok(other) => {
                let err_msg = format!("Unexpected KVM Exit {:?}", other);
                crate::debug!("KVM Other Exit Details: {:#?}", &self);
                HyperlightExit::Unknown(err_msg)
            }
        };
        Ok(result)
    }
784
785    fn regs(&self) -> Result<super::regs::CommonRegisters> {
786        let kvm_regs = self.vcpu_fd.get_regs()?;
787        Ok((&kvm_regs).into())
788    }
789
790    fn set_regs(&mut self, regs: &super::regs::CommonRegisters) -> Result<()> {
791        let kvm_regs: kvm_regs = regs.into();
792        self.vcpu_fd.set_regs(&kvm_regs)?;
793        Ok(())
794    }
795
796    fn fpu(&self) -> Result<super::regs::CommonFpu> {
797        let kvm_fpu = self.vcpu_fd.get_fpu()?;
798        Ok((&kvm_fpu).into())
799    }
800
801    fn set_fpu(&mut self, fpu: &super::regs::CommonFpu) -> Result<()> {
802        let kvm_fpu: kvm_fpu = fpu.into();
803        self.vcpu_fd.set_fpu(&kvm_fpu)?;
804        Ok(())
805    }
806
807    fn sregs(&self) -> Result<super::regs::CommonSpecialRegisters> {
808        let kvm_sregs = self.vcpu_fd.get_sregs()?;
809        Ok((&kvm_sregs).into())
810    }
811
812    fn set_sregs(&mut self, sregs: &super::regs::CommonSpecialRegisters) -> Result<()> {
813        let kvm_sregs: kvm_sregs = sregs.into();
814        self.vcpu_fd.set_sregs(&kvm_sregs)?;
815        Ok(())
816    }
817
818    #[instrument(skip_all, parent = Span::current(), level = "Trace")]
819    fn as_mut_hypervisor(&mut self) -> &mut dyn Hypervisor {
820        self as &mut dyn Hypervisor
821    }
822
823    fn interrupt_handle(&self) -> Arc<dyn super::InterruptHandleInternal> {
824        self.interrupt_handle.clone()
825    }
826
827    #[cfg(crashdump)]
828    fn crashdump_context(&self) -> Result<Option<crashdump::CrashDumpContext>> {
829        if self.rt_cfg.guest_core_dump {
830            let mut regs = [0; 27];
831
832            let vcpu_regs = self.vcpu_fd.get_regs()?;
833            let sregs = self.vcpu_fd.get_sregs()?;
834            let xsave = self.vcpu_fd.get_xsave()?;
835
836            // Set the registers in the order expected by the crashdump context
837            regs[0] = vcpu_regs.r15; // r15
838            regs[1] = vcpu_regs.r14; // r14
839            regs[2] = vcpu_regs.r13; // r13
840            regs[3] = vcpu_regs.r12; // r12
841            regs[4] = vcpu_regs.rbp; // rbp
842            regs[5] = vcpu_regs.rbx; // rbx
843            regs[6] = vcpu_regs.r11; // r11
844            regs[7] = vcpu_regs.r10; // r10
845            regs[8] = vcpu_regs.r9; // r9
846            regs[9] = vcpu_regs.r8; // r8
847            regs[10] = vcpu_regs.rax; // rax
848            regs[11] = vcpu_regs.rcx; // rcx
849            regs[12] = vcpu_regs.rdx; // rdx
850            regs[13] = vcpu_regs.rsi; // rsi
851            regs[14] = vcpu_regs.rdi; // rdi
852            regs[15] = 0; // orig rax
853            regs[16] = vcpu_regs.rip; // rip
854            regs[17] = sregs.cs.selector as u64; // cs
855            regs[18] = vcpu_regs.rflags; // eflags
856            regs[19] = vcpu_regs.rsp; // rsp
857            regs[20] = sregs.ss.selector as u64; // ss
858            regs[21] = sregs.fs.base; // fs_base
859            regs[22] = sregs.gs.base; // gs_base
860            regs[23] = sregs.ds.selector as u64; // ds
861            regs[24] = sregs.es.selector as u64; // es
862            regs[25] = sregs.fs.selector as u64; // fs
863            regs[26] = sregs.gs.selector as u64; // gs
864
865            // Get the filename from the runtime config
866            let filename = self.rt_cfg.binary_path.clone().and_then(|path| {
867                Path::new(&path)
868                    .file_name()
869                    .and_then(|name| name.to_os_string().into_string().ok())
870            });
871
872            // The [`CrashDumpContext`] accepts xsave as a vector of u8, so we need to convert the
873            // xsave region to a vector of u8
874            // Also include mapped regions in addition to the initial sandbox regions
875            let mut regions: Vec<MemoryRegion> = self.sandbox_regions.clone();
876            regions.extend(self.mmap_regions.iter().map(|(r, _)| r.clone()));
877            Ok(Some(crashdump::CrashDumpContext::new(
878                regions,
879                regs,
880                xsave
881                    .region
882                    .iter()
883                    .flat_map(|item| item.to_le_bytes())
884                    .collect::<Vec<u8>>(),
885                self.entrypoint,
886                self.rt_cfg.binary_path.clone(),
887                filename,
888            )))
889        } else {
890            Ok(None)
891        }
892    }
893
    /// Blocks the vCPU thread and services debugger requests until the GDB
    /// side asks to resume (`Continue`/`Step`) or to detach (`DisableDebug`).
    ///
    /// `stop_reason` is first reported to the GDB thread over the debug
    /// channel; requests are then received and answered in a loop. A
    /// `Crash` stop is handled in a restricted, read-only mode: resuming,
    /// breakpoint changes, and writes to memory/registers are all refused.
    ///
    /// Returns an error if debugging is not enabled, if the debug channel
    /// breaks, or if a request fails with a fatal error.
    #[cfg(gdb)]
    fn handle_debug(
        &mut self,
        dbg_mem_access_fn: Arc<Mutex<SandboxMemoryManager<HostSharedMemory>>>,
        stop_reason: VcpuStopReason,
    ) -> Result<()> {
        // Guard: the debug channel only exists when the sandbox was created
        // with debugging enabled.
        if self.debug.is_none() {
            return Err(new_error!("Debugging is not enabled"));
        }

        // Bundle the memory manager with the current mmap regions so debug
        // requests can translate/read guest addresses in mapped regions too.
        let mem_access = DebugMemoryAccess {
            dbg_mem_access_fn,
            guest_mmap_regions: self.mmap_regions.iter().map(|(r, _)| r.clone()).collect(),
        };

        match stop_reason {
            // If the vCPU stopped because of a crash, we need to handle it differently
            // We do not want to allow resuming execution or placing breakpoints
            // because the guest has crashed.
            // We only allow reading registers and memory
            VcpuStopReason::Crash => {
                self.send_dbg_msg(DebugResponse::VcpuStopped(stop_reason))
                    .map_err(|e| {
                        new_error!("Couldn't signal vCPU stopped event to GDB thread: {:?}", e)
                    })?;

                loop {
                    log::debug!("Debug wait for event to resume vCPU");
                    // Wait for a message from gdb
                    let req = self.recv_dbg_msg()?;

                    // Flag to store if we should deny continue or step requests
                    let mut deny_continue = false;
                    // Flag to store if we should detach from the gdb session
                    let mut detach = false;

                    let response = match req {
                        // Allow the detach request to disable debugging by continuing resuming
                        // hypervisor crash error reporting
                        DebugMsg::DisableDebug => {
                            detach = true;
                            DebugResponse::DisableDebug
                        }
                        // Do not allow continue or step requests
                        DebugMsg::Continue | DebugMsg::Step => {
                            deny_continue = true;
                            DebugResponse::NotAllowed
                        }
                        // Do not allow adding/removing breakpoints and writing to memory or registers
                        DebugMsg::AddHwBreakpoint(_)
                        | DebugMsg::AddSwBreakpoint(_)
                        | DebugMsg::RemoveHwBreakpoint(_)
                        | DebugMsg::RemoveSwBreakpoint(_)
                        | DebugMsg::WriteAddr(_, _)
                        | DebugMsg::WriteRegisters(_) => DebugResponse::NotAllowed,

                        // For all other requests, we will process them normally
                        _ => {
                            let result = self.process_dbg_request(req, &mem_access);
                            match result {
                                Ok(response) => response,
                                Err(HyperlightError::TranslateGuestAddress(_)) => {
                                    // Treat non fatal errors separately so the guest doesn't fail
                                    DebugResponse::ErrorOccurred
                                }
                                Err(e) => {
                                    log::error!("Error processing debug request: {:?}", e);
                                    return Err(e);
                                }
                            }
                        }
                    };

                    // Send the response to the request back to gdb
                    self.send_dbg_msg(response)
                        .map_err(|e| new_error!("Couldn't send response to gdb: {:?}", e))?;

                    // If we are denying continue or step requests, the debugger assumes the
                    // execution started so we need to report a stop reason as a crash and let
                    // it request to read registers/memory to figure out what happened
                    if deny_continue {
                        self.send_dbg_msg(DebugResponse::VcpuStopped(VcpuStopReason::Crash))
                            .map_err(|e| new_error!("Couldn't send response to gdb: {:?}", e))?;
                    }

                    // If we are detaching, we will break the loop and the Hypervisor will continue
                    // to handle the Crash reason
                    if detach {
                        break;
                    }
                }
            }
            // If the vCPU stopped because of any other reason except a crash, we can handle it
            // normally
            _ => {
                // Send the stop reason to the gdb thread
                self.send_dbg_msg(DebugResponse::VcpuStopped(stop_reason))
                    .map_err(|e| {
                        new_error!("Couldn't signal vCPU stopped event to GDB thread: {:?}", e)
                    })?;

                loop {
                    log::debug!("Debug wait for event to resume vCPU");
                    // Wait for a message from gdb
                    let req = self.recv_dbg_msg()?;

                    let result = self.process_dbg_request(req, &mem_access);

                    let response = match result {
                        Ok(response) => response,
                        // Treat non fatal errors separately so the guest doesn't fail
                        Err(HyperlightError::TranslateGuestAddress(_)) => {
                            DebugResponse::ErrorOccurred
                        }
                        // NOTE(review): unlike the Crash branch above, this fatal
                        // error is returned without being logged first — confirm
                        // whether the asymmetry is intentional.
                        Err(e) => {
                            return Err(e);
                        }
                    };

                    // Decide whether to resume BEFORE sending, since `send_dbg_msg`
                    // consumes `response`.
                    let cont = matches!(
                        response,
                        DebugResponse::Continue | DebugResponse::Step | DebugResponse::DisableDebug
                    );

                    self.send_dbg_msg(response)
                        .map_err(|e| new_error!("Couldn't send response to gdb: {:?}", e))?;

                    // Check if we should continue execution
                    // We continue if the response is one of the following: Step, Continue, or DisableDebug
                    if cont {
                        break;
                    }
                }
            }
        }

        Ok(())
    }
1032
1033    fn check_stack_guard(&self) -> Result<bool> {
1034        if let Some(mgr) = self.mem_mgr.as_ref() {
1035            mgr.check_stack_guard()
1036        } else {
1037            Err(new_error!("Memory manager is not initialized"))
1038        }
1039    }
1040
1041    #[cfg(feature = "trace_guest")]
1042    fn handle_trace(&mut self, tc: &mut crate::sandbox::trace::TraceContext) -> Result<()> {
1043        let regs = self.regs()?;
1044        tc.handle_trace(
1045            &regs,
1046            self.mem_mgr.as_mut().ok_or_else(|| {
1047                new_error!("Memory manager is not initialized before handling trace")
1048            })?,
1049        )
1050    }
1051
    /// Returns a mutable reference to the memory-profiling trace state.
    #[cfg(feature = "mem_profile")]
    fn trace_info_mut(&mut self) -> &mut MemTraceInfo {
        &mut self.trace_info
    }
1056}
1057
impl Drop for KVMDriver {
    fn drop(&mut self) {
        // Mark the shared interrupt handle as dropped so any outstanding
        // clones of it (handed out via `interrupt_handle()`) can observe
        // that the vCPU no longer exists — presumably checked before
        // signalling; confirm against `LinuxInterruptHandle`.
        self.interrupt_handle.dropped.store(true, Ordering::Relaxed);
    }
}