// arcbox_hypervisor/linux/vm.rs
1//! Virtual machine implementation for Linux KVM.
2
3use std::sync::atomic::{AtomicBool, AtomicU32, AtomicU64, Ordering};
4use std::sync::{Arc, RwLock};
5
6use crate::{
7    config::VmConfig,
8    error::HypervisorError,
9    memory::{GuestAddress, PAGE_SIZE},
10    traits::VirtualMachine,
11    types::{DeviceSnapshot, DirtyPageInfo, VirtioDeviceConfig, VirtioDeviceType},
12};
13
14use std::os::unix::io::RawFd;
15
16use super::ffi::{self, KvmPitConfig, KvmSystem, KvmUserspaceMemoryRegion, KvmVmFd};
17use super::memory::KvmMemory;
18use super::vcpu::KvmVcpu;
19
/// Global monotonically-increasing VM ID counter, shared by every VM
/// created in this process.
static VM_ID_COUNTER: AtomicU64 = AtomicU64::new(0);

// ============================================================================
// VirtIO MMIO Constants
// ============================================================================

/// Base address for VirtIO MMIO devices (ARM64).
/// This is placed at 160MB to avoid conflicts with RAM and other devices.
const VIRTIO_MMIO_BASE: u64 = 0x0a00_0000;

/// Size of each VirtIO MMIO device region (512 bytes).
const VIRTIO_MMIO_SIZE: u64 = 0x200;

/// Gap between consecutive VirtIO MMIO device regions (for alignment).
const VIRTIO_MMIO_GAP: u64 = 0x200;

/// VirtIO MMIO register offset for queue notify (used for IOEVENTFD).
const VIRTIO_MMIO_QUEUE_NOTIFY: u64 = 0x50;

/// Base IRQ (GSI) for VirtIO devices.
/// On ARM64 GIC, SPI interrupts start at 32.
/// On x86 IOAPIC, we use IRQs starting at 5 (avoiding legacy devices).
#[cfg(target_arch = "aarch64")]
const VIRTIO_IRQ_BASE: u32 = 32;

#[cfg(target_arch = "x86_64")]
const VIRTIO_IRQ_BASE: u32 = 5;

/// Maximum number of VirtIO devices that can be attached to one VM.
const MAX_VIRTIO_DEVICES: usize = 32;
51
52// ============================================================================
53// Memory Slot Tracking for Dirty Logging
54// ============================================================================
55
/// Bookkeeping record for one KVM memory slot, kept so dirty-page logging
/// can later be toggled and queried per slot.
#[derive(Debug, Clone)]
struct MemorySlotInfo {
    /// KVM slot ID (as passed to KVM_SET_USER_MEMORY_REGION).
    slot: u32,
    /// Guest physical address where the slot is mapped.
    guest_phys_addr: u64,
    /// Size of the region in bytes.
    size: u64,
    /// Host virtual address backing the region.
    userspace_addr: u64,
}
68
69// ============================================================================
70// VirtIO Device Tracking
71// ============================================================================
72
73/// Information about an attached VirtIO device.
74#[derive(Debug)]
75pub struct VirtioDeviceInfo {
76    /// Device type.
77    pub device_type: VirtioDeviceType,
78    /// MMIO base address.
79    pub mmio_base: u64,
80    /// MMIO region size.
81    pub mmio_size: u64,
82    /// Assigned IRQ (GSI).
83    pub irq: u32,
84    /// Eventfd for IRQ injection.
85    pub irq_fd: RawFd,
86    /// Eventfd for queue notification.
87    pub notify_fd: RawFd,
88}
89
/// Lifecycle state of a virtual machine.
///
/// Transitions observed in this file:
/// `Created -> Starting -> Running <-> Paused -> Stopping -> Stopped`,
/// and `Stopped -> Starting` on restart.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum VmState {
    /// VM is created but not started.
    Created,
    /// VM is starting.
    Starting,
    /// VM is running.
    Running,
    /// VM is paused.
    Paused,
    /// VM is stopping.
    Stopping,
    /// VM is stopped.
    Stopped,
    /// VM encountered an error.
    Error,
}
108
/// Virtual machine implementation for Linux KVM.
///
/// This wraps a KVM VM and provides the platform-agnostic interface.
/// Interior mutability is used throughout (atomics + `RwLock`) so most
/// operations take `&self`.
pub struct KvmVm {
    /// Unique VM ID (allocated from `VM_ID_COUNTER`).
    id: u64,
    /// VM configuration (vCPU count, memory size, ...).
    config: VmConfig,
    /// KVM system handle, retained to keep /dev/kvm open for the VM's life.
    #[allow(dead_code)]
    kvm: Arc<KvmSystem>,
    /// KVM VM file descriptor.
    vm_fd: KvmVmFd,
    /// vCPU mmap size (size of the shared kvm_run mapping per vCPU).
    vcpu_mmap_size: usize,
    /// Guest memory (slot 0 backing).
    memory: KvmMemory,
    /// Next memory slot ID; slot 0 is taken by main memory.
    next_slot: AtomicU32,
    /// IDs of vCPUs created so far (used to reject duplicates).
    vcpus: RwLock<Vec<u32>>,
    /// Current lifecycle state.
    state: RwLock<VmState>,
    /// Whether the VM is running (fast flag for vCPU run loops).
    running: AtomicBool,
    /// Attached VirtIO devices.
    virtio_devices: RwLock<Vec<VirtioDeviceInfo>>,
    /// Next VirtIO IRQ offset (added to VIRTIO_IRQ_BASE).
    next_virtio_irq: AtomicU32,
    /// Memory slots tracked for dirty page logging.
    memory_slots: RwLock<Vec<MemorySlotInfo>>,
    /// Whether dirty page tracking is enabled.
    dirty_tracking_enabled: AtomicBool,
}
143
// SAFETY: All interior mutability in KvmVm goes through atomics and RwLocks,
// and the KVM fd/memory handles are used from multiple threads only via those
// synchronized paths. NOTE(review): these impls are presumably needed because
// a field (likely a raw pointer inside KvmMemory) is not auto-Send/Sync —
// confirm that invariant holds for every field before relying on it.
unsafe impl Send for KvmVm {}
unsafe impl Sync for KvmVm {}
147
148impl KvmVm {
    /// Creates a new KVM VM.
    ///
    /// Steps, in order:
    /// 1. Allocate a process-unique VM ID.
    /// 2. Create the KVM VM fd via the system handle.
    /// 3. On x86_64, perform arch-specific setup (TSS, identity map,
    ///    irqchip, PIT).
    /// 4. Allocate guest RAM and map it into the VM at guest physical 0
    ///    as memory slot 0.
    /// 5. Register the slot with the memory tracker for later dirty logging.
    ///
    /// # Errors
    /// Returns `VmCreationFailed` when a KVM ioctl fails, or propagates
    /// allocation/registration errors from `KvmMemory`.
    pub(crate) fn new(
        kvm: Arc<KvmSystem>,
        vcpu_mmap_size: usize,
        config: VmConfig,
    ) -> Result<Self, HypervisorError> {
        let id = VM_ID_COUNTER.fetch_add(1, Ordering::SeqCst);

        // Create the VM
        let vm_fd = kvm.create_vm().map_err(|e| {
            HypervisorError::VmCreationFailed(format!("Failed to create KVM VM: {}", e))
        })?;

        // Setup architecture-specific components
        #[cfg(target_arch = "x86_64")]
        Self::setup_x86_vm(&vm_fd)?;

        // Allocate guest memory
        let memory = KvmMemory::new(config.memory_size)?;

        // Map all of guest RAM as slot 0 at guest physical address 0.
        let region = KvmUserspaceMemoryRegion {
            slot: 0,
            flags: 0,
            guest_phys_addr: 0,
            memory_size: config.memory_size,
            userspace_addr: memory.host_address() as u64,
        };

        vm_fd.set_user_memory_region(&region).map_err(|e| {
            HypervisorError::VmCreationFailed(format!("Failed to map guest memory: {}", e))
        })?;

        // Track the main memory slot for dirty page tracking.
        let main_slot = MemorySlotInfo {
            slot: 0,
            guest_phys_addr: 0,
            size: config.memory_size,
            userspace_addr: memory.host_address() as u64,
        };

        // Give the memory tracker the VM fd and mirror the slot layout
        // (flags = 0: read-write, no dirty logging yet).
        memory.attach_vm_fd(vm_fd.as_raw_fd());
        memory.register_slot(
            main_slot.slot,
            main_slot.guest_phys_addr,
            main_slot.size,
            main_slot.userspace_addr,
            0,
        )?;

        tracing::info!(
            "Created KVM VM {}: vcpus={}, memory={}MB",
            id,
            config.vcpu_count,
            config.memory_size / (1024 * 1024)
        );

        Ok(Self {
            id,
            config,
            kvm,
            vm_fd,
            vcpu_mmap_size,
            memory,
            next_slot: AtomicU32::new(1), // Slot 0 is used for main memory
            vcpus: RwLock::new(Vec::new()),
            state: RwLock::new(VmState::Created),
            running: AtomicBool::new(false),
            virtio_devices: RwLock::new(Vec::new()),
            next_virtio_irq: AtomicU32::new(0),
            memory_slots: RwLock::new(vec![main_slot]),
            dirty_tracking_enabled: AtomicBool::new(false),
        })
    }
223
224    /// Sets up x86-specific VM components.
225    #[cfg(target_arch = "x86_64")]
226    fn setup_x86_vm(vm_fd: &KvmVmFd) -> Result<(), HypervisorError> {
227        // Set TSS address (required for Intel VT-x)
228        // The TSS is placed at the end of the 4GB space to avoid conflicts
229        const TSS_ADDR: u64 = 0xfffb_d000;
230        vm_fd
231            .set_tss_addr(TSS_ADDR)
232            .map_err(|e| HypervisorError::VmCreationFailed(format!("Failed to set TSS: {}", e)))?;
233
234        // Set identity map address
235        const IDENTITY_MAP_ADDR: u64 = 0xfffb_c000;
236        vm_fd
237            .set_identity_map_addr(IDENTITY_MAP_ADDR)
238            .map_err(|e| {
239                HypervisorError::VmCreationFailed(format!("Failed to set identity map: {}", e))
240            })?;
241
242        // Create in-kernel IRQ chip (APIC, IOAPIC, PIC)
243        vm_fd.create_irqchip().map_err(|e| {
244            HypervisorError::VmCreationFailed(format!("Failed to create IRQ chip: {}", e))
245        })?;
246
247        // Create PIT (Programmable Interval Timer)
248        let pit_config = KvmPitConfig::default();
249        vm_fd.create_pit2(&pit_config).map_err(|e| {
250            HypervisorError::VmCreationFailed(format!("Failed to create PIT: {}", e))
251        })?;
252
253        Ok(())
254    }
255
256    /// Returns the VM ID.
257    #[must_use]
258    pub fn id(&self) -> u64 {
259        self.id
260    }
261
262    /// Returns the VM configuration.
263    #[must_use]
264    pub fn config(&self) -> &VmConfig {
265        &self.config
266    }
267
268    /// Returns the current VM state.
269    pub fn state(&self) -> VmState {
270        *self.state.read().unwrap()
271    }
272
273    /// Returns whether the VM is running.
274    #[must_use]
275    pub fn is_running(&self) -> bool {
276        self.running.load(Ordering::SeqCst)
277    }
278
279    /// Sets the VM state.
280    fn set_state(&self, new_state: VmState) {
281        let mut state = self.state.write().unwrap();
282        tracing::debug!("VM {} state: {:?} -> {:?}", self.id, *state, new_state);
283        *state = new_state;
284    }
285
286    /// Returns the KVM VM file descriptor.
287    pub(crate) fn vm_fd(&self) -> &KvmVmFd {
288        &self.vm_fd
289    }
290
291    /// Returns the vCPU mmap size.
292    pub(crate) fn vcpu_mmap_size(&self) -> usize {
293        self.vcpu_mmap_size
294    }
295
    /// Adds an additional memory region to the VM.
    ///
    /// Allocates the next free slot ID, maps `size` bytes of host memory at
    /// `host_addr` into the guest at `guest_addr`, and mirrors the slot in
    /// local bookkeeping plus the memory tracker. If dirty tracking is
    /// currently enabled, the region is mapped with dirty logging already on
    /// so it is covered by subsequent `get_dirty_pages` calls.
    ///
    /// # Arguments
    /// * `guest_addr` - Guest physical address for the mapping
    /// * `host_addr` - Host virtual address backing the region
    /// * `size` - Region size in bytes
    /// * `read_only` - Map the region read-only for the guest
    ///
    /// # Returns
    /// The KVM slot ID assigned to the new region.
    ///
    /// # Errors
    /// Returns an error if the KVM ioctl or slot registration fails.
    /// NOTE(review): if `register_slot` fails at the end, the KVM mapping is
    /// not rolled back, leaving KVM and the tracker out of sync.
    pub fn add_memory_region(
        &self,
        guest_addr: GuestAddress,
        host_addr: *mut u8,
        size: u64,
        read_only: bool,
    ) -> Result<u32, HypervisorError> {
        let slot = self.next_slot.fetch_add(1, Ordering::SeqCst);

        // Base flags are intrinsic to the slot; the dirty-logging flag is
        // layered on top only for the KVM mapping below.
        let mut base_flags = 0u32;
        if read_only {
            base_flags |= ffi::KVM_MEM_READONLY;
        }

        let mut region_flags = base_flags;
        if self.dirty_tracking_enabled.load(Ordering::SeqCst) {
            region_flags |= ffi::KVM_MEM_LOG_DIRTY_PAGES;
        }

        let region = KvmUserspaceMemoryRegion {
            slot,
            flags: region_flags,
            guest_phys_addr: guest_addr.raw(),
            memory_size: size,
            userspace_addr: host_addr as u64,
        };

        self.vm_fd.set_user_memory_region(&region).map_err(|e| {
            HypervisorError::MemoryError(format!("Failed to add memory region: {}", e))
        })?;

        // Record the slot for dirty-page bookkeeping; the scope keeps the
        // write lock short.
        {
            let mut slots = self
                .memory_slots
                .write()
                .map_err(|_| HypervisorError::SnapshotError("Lock poisoned".to_string()))?;
            slots.push(MemorySlotInfo {
                slot,
                guest_phys_addr: guest_addr.raw(),
                size,
                userspace_addr: host_addr as u64,
            });
        }

        // The tracker is registered with base flags only (no dirty bit).
        self.memory
            .register_slot(slot, guest_addr.raw(), size, host_addr as u64, base_flags)?;

        tracing::debug!(
            "Added memory region {} at {}: {}MB, read_only={}",
            slot,
            guest_addr,
            size / (1024 * 1024),
            read_only
        );

        Ok(slot)
    }
354
355    /// Removes a memory region from the VM.
356    pub fn remove_memory_region(&self, slot: u32) -> Result<(), HypervisorError> {
357        let region = KvmUserspaceMemoryRegion {
358            slot,
359            flags: 0,
360            guest_phys_addr: 0,
361            memory_size: 0,
362            userspace_addr: 0,
363        };
364
365        self.vm_fd.set_user_memory_region(&region).map_err(|e| {
366            HypervisorError::MemoryError(format!("Failed to remove memory region: {}", e))
367        })?;
368
369        {
370            let mut slots = self
371                .memory_slots
372                .write()
373                .map_err(|_| HypervisorError::SnapshotError("Lock poisoned".to_string()))?;
374            slots.retain(|entry| entry.slot != slot);
375        }
376
377        self.memory.unregister_slot(slot)?;
378
379        tracing::debug!("Removed memory region {}", slot);
380
381        Ok(())
382    }
383
384    // ========================================================================
385    // IRQ Injection Interface
386    // ========================================================================
387
388    /// Sets the IRQ line level on the in-kernel irqchip.
389    ///
390    /// For edge-triggered interrupts, call with level=true then level=false.
391    /// For level-triggered interrupts, keep level=true until acknowledged.
392    ///
393    /// # Arguments
394    /// * `gsi` - Global System Interrupt number
395    /// * `level` - true to assert, false to deassert
396    pub fn set_irq_line(&self, gsi: u32, level: bool) -> Result<(), HypervisorError> {
397        self.vm_fd
398            .set_irq_line(gsi, level)
399            .map_err(|e| HypervisorError::DeviceError(format!("Failed to set IRQ line: {}", e)))
400    }
401
402    /// Triggers an edge-triggered interrupt.
403    ///
404    /// Convenience method that asserts then immediately deasserts the IRQ line.
405    pub fn trigger_edge_irq(&self, gsi: u32) -> Result<(), HypervisorError> {
406        self.set_irq_line(gsi, true)?;
407        self.set_irq_line(gsi, false)
408    }
409
410    /// Registers an eventfd for IRQ injection (IRQFD).
411    ///
412    /// When the eventfd is signaled (write 1 to it), KVM will automatically
413    /// inject the specified GSI into the guest. This is the most efficient
414    /// method for interrupt delivery.
415    ///
416    /// # Arguments
417    /// * `eventfd` - The eventfd file descriptor
418    /// * `gsi` - The GSI to inject when eventfd is signaled
419    /// * `resample_fd` - For level-triggered IRQs, optional resample eventfd
420    pub fn register_irqfd(
421        &self,
422        eventfd: RawFd,
423        gsi: u32,
424        resample_fd: Option<RawFd>,
425    ) -> Result<(), HypervisorError> {
426        self.vm_fd
427            .register_irqfd(eventfd, gsi, resample_fd)
428            .map_err(|e| HypervisorError::DeviceError(format!("Failed to register IRQFD: {}", e)))
429    }
430
431    /// Unregisters an eventfd for IRQ injection.
432    pub fn unregister_irqfd(&self, eventfd: RawFd, gsi: u32) -> Result<(), HypervisorError> {
433        self.vm_fd
434            .unregister_irqfd(eventfd, gsi)
435            .map_err(|e| HypervisorError::DeviceError(format!("Failed to unregister IRQFD: {}", e)))
436    }
437
438    // ========================================================================
439    // VirtIO Device Support
440    // ========================================================================
441
442    /// Allocates an MMIO region for a VirtIO device.
443    ///
444    /// Returns the base address for the device.
445    fn allocate_mmio_region(&self) -> Result<u64, HypervisorError> {
446        let devices = self
447            .virtio_devices
448            .read()
449            .map_err(|_| HypervisorError::DeviceError("Lock poisoned".to_string()))?;
450
451        if devices.len() >= MAX_VIRTIO_DEVICES {
452            return Err(HypervisorError::DeviceError(
453                "Maximum number of VirtIO devices reached".to_string(),
454            ));
455        }
456
457        // Calculate next available address.
458        let offset = devices.len() as u64 * (VIRTIO_MMIO_SIZE + VIRTIO_MMIO_GAP);
459        Ok(VIRTIO_MMIO_BASE + offset)
460    }
461
462    /// Allocates an IRQ (GSI) for a VirtIO device.
463    fn allocate_irq(&self) -> u32 {
464        let offset = self.next_virtio_irq.fetch_add(1, Ordering::SeqCst);
465        VIRTIO_IRQ_BASE + offset
466    }
467
468    /// Creates an eventfd.
469    fn create_eventfd() -> Result<RawFd, HypervisorError> {
470        let fd = unsafe { libc::eventfd(0, libc::EFD_NONBLOCK | libc::EFD_CLOEXEC) };
471        if fd < 0 {
472            return Err(HypervisorError::DeviceError(format!(
473                "Failed to create eventfd: {}",
474                std::io::Error::last_os_error()
475            )));
476        }
477        Ok(fd)
478    }
479
480    /// Sets up IOEVENTFD for VirtIO queue notification.
481    ///
482    /// This allows the guest to notify the host about queue updates by writing
483    /// to a specific MMIO address, without causing a VM exit.
484    fn setup_ioeventfd(&self, mmio_base: u64) -> Result<RawFd, HypervisorError> {
485        let notify_fd = Self::create_eventfd()?;
486
487        // Register IOEVENTFD at the queue notify register address.
488        // 4 bytes for 32-bit writes to VIRTIO_MMIO_QUEUE_NOTIFY.
489        let notify_addr = mmio_base + VIRTIO_MMIO_QUEUE_NOTIFY;
490
491        self.vm_fd
492            .register_ioeventfd(notify_addr, 4, notify_fd, None)
493            .map_err(|e| {
494                // Clean up the eventfd on failure.
495                unsafe { libc::close(notify_fd) };
496                HypervisorError::DeviceError(format!("Failed to register IOEVENTFD: {}", e))
497            })?;
498
499        tracing::debug!(
500            "Registered IOEVENTFD at {:#x} with fd={}",
501            notify_addr,
502            notify_fd
503        );
504
505        Ok(notify_fd)
506    }
507
508    /// Returns a copy of the attached VirtIO devices info.
509    pub fn virtio_devices(&self) -> Result<Vec<VirtioDeviceInfo>, HypervisorError> {
510        let devices = self
511            .virtio_devices
512            .read()
513            .map_err(|_| HypervisorError::DeviceError("Lock poisoned".to_string()))?;
514
515        Ok(devices
516            .iter()
517            .map(|d| VirtioDeviceInfo {
518                device_type: d.device_type.clone(),
519                mmio_base: d.mmio_base,
520                mmio_size: d.mmio_size,
521                irq: d.irq,
522                irq_fd: d.irq_fd,
523                notify_fd: d.notify_fd,
524            })
525            .collect())
526    }
527
528    // ========================================================================
529    // Dirty Page Tracking
530    // ========================================================================
531
    /// Enables dirty page tracking for all memory regions.
    ///
    /// When enabled, KVM tracks which pages have been written to by the guest.
    /// Use `get_dirty_pages` to retrieve and clear the dirty page bitmap.
    ///
    /// This is useful for:
    /// - Live migration: Only transfer modified pages
    /// - Snapshotting: Track incremental changes
    ///
    /// Idempotent: returns immediately if tracking is already on.
    ///
    /// # Errors
    ///
    /// Returns an error if dirty logging cannot be enabled.
    /// NOTE(review): on a mid-loop failure, slots already switched keep dirty
    /// logging on while the enabled flag stays false — no rollback here.
    pub fn enable_dirty_tracking(&self) -> Result<(), HypervisorError> {
        if self.dirty_tracking_enabled.load(Ordering::SeqCst) {
            // Already enabled.
            return Ok(());
        }

        // Hold the read lock for the whole loop so the slot set is stable.
        let slots = self
            .memory_slots
            .read()
            .map_err(|_| HypervisorError::SnapshotError("Lock poisoned".to_string()))?;

        // Enable dirty logging for all memory slots.
        for slot in slots.iter() {
            self.vm_fd
                .enable_dirty_logging(
                    slot.slot,
                    slot.guest_phys_addr,
                    slot.size,
                    slot.userspace_addr,
                )
                .map_err(|e| {
                    HypervisorError::SnapshotError(format!(
                        "Failed to enable dirty logging for slot {}: {}",
                        slot.slot, e
                    ))
                })?;

            tracing::debug!(
                "Enabled dirty logging for slot {}: guest={:#x}, size={}MB",
                slot.slot,
                slot.guest_phys_addr,
                slot.size / (1024 * 1024)
            );
        }

        // Flip the flag only after every slot succeeded, and mirror it in
        // the memory tracker.
        self.dirty_tracking_enabled.store(true, Ordering::SeqCst);
        self.memory.set_dirty_tracking_enabled(true);
        tracing::info!("Dirty page tracking enabled for VM {}", self.id);

        Ok(())
    }
585
    /// Disables dirty page tracking for all memory regions.
    ///
    /// Idempotent: returns immediately if tracking is already off.
    ///
    /// # Errors
    ///
    /// Returns an error if dirty logging cannot be disabled.
    /// NOTE(review): as with enabling, a mid-loop failure leaves slots in a
    /// mixed state with the enabled flag still true — no rollback here.
    pub fn disable_dirty_tracking(&self) -> Result<(), HypervisorError> {
        if !self.dirty_tracking_enabled.load(Ordering::SeqCst) {
            // Already disabled.
            return Ok(());
        }

        let slots = self
            .memory_slots
            .read()
            .map_err(|_| HypervisorError::SnapshotError("Lock poisoned".to_string()))?;

        // Disable dirty logging for all memory slots.
        for slot in slots.iter() {
            self.vm_fd
                .disable_dirty_logging(
                    slot.slot,
                    slot.guest_phys_addr,
                    slot.size,
                    slot.userspace_addr,
                )
                .map_err(|e| {
                    HypervisorError::SnapshotError(format!(
                        "Failed to disable dirty logging for slot {}: {}",
                        slot.slot, e
                    ))
                })?;
        }

        // Clear the flag only after every slot succeeded, and mirror it in
        // the memory tracker.
        self.dirty_tracking_enabled.store(false, Ordering::SeqCst);
        self.memory.set_dirty_tracking_enabled(false);
        tracing::info!("Dirty page tracking disabled for VM {}", self.id);

        Ok(())
    }
625
626    /// Returns whether dirty page tracking is enabled.
627    #[must_use]
628    pub fn is_dirty_tracking_enabled(&self) -> bool {
629        self.dirty_tracking_enabled.load(Ordering::SeqCst)
630    }
631
632    /// Gets the list of dirty pages across all memory regions.
633    ///
634    /// This retrieves and clears the dirty page bitmap from KVM.
635    /// Each call returns pages that were written since the last call.
636    ///
637    /// # Errors
638    ///
639    /// Returns an error if dirty tracking is not enabled or if the
640    /// dirty log cannot be retrieved.
641    pub fn get_dirty_pages(&self) -> Result<Vec<DirtyPageInfo>, HypervisorError> {
642        if !self.dirty_tracking_enabled.load(Ordering::SeqCst) {
643            return Err(HypervisorError::SnapshotError(
644                "Dirty tracking not enabled".to_string(),
645            ));
646        }
647
648        let slots = self
649            .memory_slots
650            .read()
651            .map_err(|_| HypervisorError::SnapshotError("Lock poisoned".to_string()))?;
652
653        let mut dirty_pages = Vec::new();
654
655        for slot in slots.iter() {
656            // Get the dirty bitmap for this slot.
657            let bitmap = self
658                .vm_fd
659                .get_dirty_log(slot.slot, slot.size, PAGE_SIZE)
660                .map_err(|e| {
661                    HypervisorError::SnapshotError(format!(
662                        "Failed to get dirty log for slot {}: {}",
663                        slot.slot, e
664                    ))
665                })?;
666
667            // Parse the bitmap to extract dirty page addresses.
668            let pages = Self::parse_dirty_bitmap(&bitmap, slot.guest_phys_addr, slot.size);
669
670            tracing::debug!(
671                "Slot {}: {} dirty pages out of {} total",
672                slot.slot,
673                pages.len(),
674                slot.size / PAGE_SIZE
675            );
676
677            dirty_pages.extend(pages);
678        }
679
680        tracing::debug!(
681            "get_dirty_pages: found {} dirty pages total",
682            dirty_pages.len()
683        );
684
685        Ok(dirty_pages)
686    }
687
688    /// Parses a dirty bitmap to extract individual dirty page addresses.
689    ///
690    /// # Arguments
691    /// * `bitmap` - The bitmap from KVM_GET_DIRTY_LOG
692    /// * `base_addr` - The guest physical address of the region start
693    /// * `size` - Total size of the region
694    ///
695    /// # Returns
696    /// A vector of DirtyPageInfo for each dirty page.
697    fn parse_dirty_bitmap(bitmap: &[u64], base_addr: u64, size: u64) -> Vec<DirtyPageInfo> {
698        let mut pages = Vec::new();
699        let num_pages = size / PAGE_SIZE;
700
701        for (word_idx, &word) in bitmap.iter().enumerate() {
702            if word == 0 {
703                // Skip words with no dirty pages.
704                continue;
705            }
706
707            // Check each bit in the word.
708            for bit_idx in 0..64 {
709                if (word >> bit_idx) & 1 != 0 {
710                    let page_num = (word_idx as u64 * 64) + bit_idx as u64;
711                    if page_num < num_pages {
712                        pages.push(DirtyPageInfo {
713                            guest_addr: base_addr + page_num * PAGE_SIZE,
714                            size: PAGE_SIZE,
715                        });
716                    }
717                }
718            }
719        }
720
721        pages
722    }
723}
724
725impl VirtualMachine for KvmVm {
726    type Vcpu = KvmVcpu;
727    type Memory = KvmMemory;
728
729    fn memory(&self) -> &Self::Memory {
730        &self.memory
731    }
732
    /// Creates a vCPU with the given ID.
    ///
    /// The ID must be below the configured vCPU count and not already in
    /// use. On success the ID is recorded so duplicates are rejected later.
    ///
    /// # Errors
    /// Returns `VcpuCreationFailed` when the ID is out of range, already
    /// used, the KVM ioctl fails, or a lock is poisoned.
    fn create_vcpu(&mut self, id: u32) -> Result<Self::Vcpu, HypervisorError> {
        if id >= self.config.vcpu_count {
            return Err(HypervisorError::VcpuCreationFailed {
                id,
                reason: format!(
                    "vCPU ID {} exceeds configured count {}",
                    id, self.config.vcpu_count
                ),
            });
        }

        // Check if already created (scope drops the read lock before the
        // KVM call below).
        {
            let vcpus = self
                .vcpus
                .read()
                .map_err(|_| HypervisorError::VcpuCreationFailed {
                    id,
                    reason: "Lock poisoned".to_string(),
                })?;

            if vcpus.contains(&id) {
                return Err(HypervisorError::VcpuCreationFailed {
                    id,
                    reason: "vCPU already created".to_string(),
                });
            }
        }

        // Create vCPU via KVM
        let vcpu_fd = self
            .vm_fd
            .create_vcpu(id, self.vcpu_mmap_size)
            .map_err(|e| HypervisorError::VcpuCreationFailed {
                id,
                reason: format!("KVM error: {}", e),
            })?;

        // Wrap the raw fd in the platform-agnostic vCPU type.
        let vcpu = KvmVcpu::new(id, vcpu_fd)?;

        // Record creation so a second create with the same ID is rejected.
        {
            let mut vcpus =
                self.vcpus
                    .write()
                    .map_err(|_| HypervisorError::VcpuCreationFailed {
                        id,
                        reason: "Lock poisoned".to_string(),
                    })?;
            vcpus.push(id);
        }

        tracing::debug!("Created vCPU {} for VM {}", id, self.id);

        Ok(vcpu)
    }
790
    /// Attaches a VirtIO MMIO device to the VM.
    ///
    /// Allocates an MMIO window and a GSI, creates two eventfds (IRQFD for
    /// injection, IOEVENTFD for queue notification), registers both with
    /// KVM, and records the device. Every failure path closes the fds it
    /// created and unregisters what it registered.
    ///
    /// NOTE(review): the GSI from `allocate_irq` is not reclaimed on
    /// failure, leaving a gap in the IRQ numbering — presumed harmless.
    ///
    /// # Errors
    /// Fails when the VM is not in `Created` state, when the device limit
    /// is reached, or when any eventfd/KVM registration fails.
    fn add_virtio_device(&mut self, device: VirtioDeviceConfig) -> Result<(), HypervisorError> {
        // 1. Check state - devices can only be added before VM starts.
        let state = self.state();
        if state != VmState::Created {
            return Err(HypervisorError::DeviceError(
                "Cannot add device: VM not in Created state".to_string(),
            ));
        }

        // 2. Allocate MMIO address space for the device.
        let mmio_base = self.allocate_mmio_region()?;

        // 3. Allocate an IRQ (GSI) for the device.
        let gsi = self.allocate_irq();

        // 4. Create eventfd for IRQ injection and register IRQFD.
        let irq_fd = Self::create_eventfd()?;

        if let Err(e) = self.register_irqfd(irq_fd, gsi, None) {
            // Clean up on failure.
            unsafe { libc::close(irq_fd) };
            return Err(e);
        }

        // 5. Set up IOEVENTFD for queue notification.
        let notify_fd = match self.setup_ioeventfd(mmio_base) {
            Ok(fd) => fd,
            Err(e) => {
                // Clean up on failure: undo the IRQFD and close its fd.
                let _ = self.unregister_irqfd(irq_fd, gsi);
                unsafe { libc::close(irq_fd) };
                return Err(e);
            }
        };

        // 6. Record the device information.
        let device_info = VirtioDeviceInfo {
            device_type: device.device_type.clone(),
            mmio_base,
            mmio_size: VIRTIO_MMIO_SIZE,
            irq: gsi,
            irq_fd,
            notify_fd,
        };

        {
            let mut devices = self.virtio_devices.write().map_err(|_| {
                // Clean up on failure: both fds and the IRQFD registration.
                let _ = self.unregister_irqfd(irq_fd, gsi);
                unsafe {
                    libc::close(irq_fd);
                    libc::close(notify_fd);
                }
                HypervisorError::DeviceError("Lock poisoned".to_string())
            })?;

            devices.push(device_info);
        }

        tracing::info!(
            "Added {:?} device to VM {}: MMIO={:#x}-{:#x}, IRQ={}, irq_fd={}, notify_fd={}",
            device.device_type,
            self.id,
            mmio_base,
            mmio_base + VIRTIO_MMIO_SIZE,
            gsi,
            irq_fd,
            notify_fd
        );

        Ok(())
    }
863
864    fn start(&mut self) -> Result<(), HypervisorError> {
865        let state = self.state();
866        if state != VmState::Created && state != VmState::Stopped {
867            return Err(HypervisorError::VmStateError {
868                expected: "Created or Stopped".to_string(),
869                actual: format!("{:?}", state),
870            });
871        }
872
873        self.set_state(VmState::Starting);
874
875        // Mark as running
876        self.running.store(true, Ordering::SeqCst);
877        self.set_state(VmState::Running);
878
879        tracing::info!("Started VM {}", self.id);
880
881        Ok(())
882    }
883
884    fn pause(&mut self) -> Result<(), HypervisorError> {
885        let state = self.state();
886        if state != VmState::Running {
887            return Err(HypervisorError::VmStateError {
888                expected: "Running".to_string(),
889                actual: format!("{:?}", state),
890            });
891        }
892
893        // Signal all vCPUs to pause
894        // In KVM, this is typically done by setting immediate_exit and signaling
895        // the vCPU threads
896
897        self.set_state(VmState::Paused);
898
899        tracing::info!("Paused VM {}", self.id);
900
901        Ok(())
902    }
903
904    fn resume(&mut self) -> Result<(), HypervisorError> {
905        let state = self.state();
906        if state != VmState::Paused {
907            return Err(HypervisorError::VmStateError {
908                expected: "Paused".to_string(),
909                actual: format!("{:?}", state),
910            });
911        }
912
913        self.set_state(VmState::Running);
914
915        tracing::info!("Resumed VM {}", self.id);
916
917        Ok(())
918    }
919
920    fn stop(&mut self) -> Result<(), HypervisorError> {
921        let state = self.state();
922        if state != VmState::Running && state != VmState::Paused {
923            return Err(HypervisorError::VmStateError {
924                expected: "Running or Paused".to_string(),
925                actual: format!("{:?}", state),
926            });
927        }
928
929        self.set_state(VmState::Stopping);
930
931        // Signal all vCPUs to stop
932        self.running.store(false, Ordering::SeqCst);
933
934        self.set_state(VmState::Stopped);
935
936        tracing::info!("Stopped VM {}", self.id);
937
938        Ok(())
939    }
940
    // Lets holders of a `dyn VirtualMachine` downcast to the concrete KvmVm.
    fn as_any(&self) -> &dyn std::any::Any {
        self
    }
944
    // Mutable counterpart of `as_any` for downcasting through the trait object.
    fn as_any_mut(&mut self) -> &mut dyn std::any::Any {
        self
    }
948
    // Configured number of vCPUs (from VmConfig); vCPUs themselves are
    // created separately via create_vcpu.
    fn vcpu_count(&self) -> u32 {
        self.config.vcpu_count
    }
952
953    fn snapshot_devices(&self) -> Result<Vec<DeviceSnapshot>, HypervisorError> {
954        let devices = self
955            .virtio_devices
956            .read()
957            .map_err(|_| HypervisorError::SnapshotError("Lock poisoned".to_string()))?;
958
959        let mut snapshots = Vec::with_capacity(devices.len());
960
961        for device in devices.iter() {
962            // KVM VirtIO devices don't have internal state accessible from host.
963            // The actual device state is managed by the VMM layer (e.g., arcbox-virtio).
964            // We record the device configuration here; the VMM would need to
965            // serialize its own device state.
966            let state_bytes = bincode_serialize_device_config(device);
967
968            snapshots.push(DeviceSnapshot {
969                device_type: device.device_type.clone(),
970                name: format!("{:?}-{}", device.device_type, snapshots.len()),
971                state: state_bytes,
972            });
973        }
974
975        tracing::debug!(
976            "snapshot_devices: captured {} device configurations",
977            snapshots.len()
978        );
979
980        Ok(snapshots)
981    }
982
983    fn restore_devices(&mut self, snapshots: &[DeviceSnapshot]) -> Result<(), HypervisorError> {
984        // KVM device restoration is complex:
985        // 1. VirtIO MMIO regions must be at the same addresses
986        // 2. IRQs must be assigned to the same GSIs
987        // 3. The VMM layer must restore internal device state
988        //
989        // For now, we verify that the device configuration matches.
990        let devices = self
991            .virtio_devices
992            .read()
993            .map_err(|_| HypervisorError::SnapshotError("Lock poisoned".to_string()))?;
994
995        if snapshots.len() != devices.len() {
996            return Err(HypervisorError::SnapshotError(format!(
997                "Device count mismatch: snapshot has {}, VM has {}",
998                snapshots.len(),
999                devices.len()
1000            )));
1001        }
1002
1003        for (snapshot, device) in snapshots.iter().zip(devices.iter()) {
1004            if snapshot.device_type != device.device_type {
1005                return Err(HypervisorError::SnapshotError(format!(
1006                    "Device type mismatch: snapshot has {:?}, VM has {:?}",
1007                    snapshot.device_type, device.device_type
1008                )));
1009            }
1010        }
1011
1012        tracing::debug!(
1013            "restore_devices: verified {} device configurations",
1014            snapshots.len()
1015        );
1016
1017        Ok(())
1018    }
1019}
1020
1021/// Serializes device configuration to bytes for snapshot storage.
1022fn bincode_serialize_device_config(device: &VirtioDeviceInfo) -> Vec<u8> {
1023    // Simple serialization of device config for snapshot purposes.
1024    // A full implementation would use serde/bincode.
1025    let mut bytes = Vec::new();
1026
1027    // Serialize device type as u8
1028    let type_byte = match device.device_type {
1029        VirtioDeviceType::Block => 0u8,
1030        VirtioDeviceType::Net => 1,
1031        VirtioDeviceType::Console => 2,
1032        VirtioDeviceType::Rng => 3,
1033        VirtioDeviceType::Balloon => 4,
1034        VirtioDeviceType::Fs => 5,
1035        VirtioDeviceType::Vsock => 6,
1036        VirtioDeviceType::Gpu => 7,
1037    };
1038    bytes.push(type_byte);
1039
1040    // Serialize MMIO base and size
1041    bytes.extend_from_slice(&device.mmio_base.to_le_bytes());
1042    bytes.extend_from_slice(&device.mmio_size.to_le_bytes());
1043
1044    // Serialize IRQ
1045    bytes.extend_from_slice(&device.irq.to_le_bytes());
1046
1047    bytes
1048}
1049
impl Drop for KvmVm {
    // Best-effort teardown: every failure is ignored because drop must not
    // panic and there is no caller to report errors to.
    fn drop(&mut self) {
        // Stop VM if running.
        if self.is_running() {
            let _ = self.stop();
        }

        // Clean up VirtIO device resources. If the lock is poisoned, skip
        // cleanup rather than panic inside drop.
        if let Ok(devices) = self.virtio_devices.read() {
            for device in devices.iter() {
                // Unregister IRQFD. Done before closing the fd so KVM is no
                // longer watching it when it goes away.
                let _ = self.unregister_irqfd(device.irq_fd, device.irq);

                // Unregister IOEVENTFD. The notify address is the device's
                // MMIO base plus the queue-notify register offset; width 4
                // matches the registration (see add-device path).
                let notify_addr = device.mmio_base + VIRTIO_MMIO_QUEUE_NOTIFY;
                let _ = self
                    .vm_fd
                    .unregister_ioeventfd(notify_addr, 4, device.notify_fd);

                // Close eventfds.
                // SAFETY: closing raw fds via libc. Presumably these fds were
                // created when the device was added and are owned solely by
                // this VM — TODO confirm no other holder closes them (double
                // close would hit an unrelated fd).
                unsafe {
                    libc::close(device.irq_fd);
                    libc::close(device.notify_fd);
                }
            }
        }

        tracing::debug!("Dropped VM {}", self.id);
    }
}
1080
#[cfg(test)]
mod tests {
    //! Unit tests for KvmVm. Tests that touch KVM are `#[ignore]`d because
    //! they need /dev/kvm; the dirty-bitmap tests are pure and always run.

    use super::*;
    use crate::types::CpuArch;

    // A freshly built VM must report `Created` and not be running.
    #[test]
    #[ignore] // Requires /dev/kvm
    fn test_vm_creation() {
        let kvm = Arc::new(KvmSystem::open().expect("Failed to open KVM"));
        let mmap_size = kvm.vcpu_mmap_size().expect("Failed to get mmap size");

        let config = VmConfig {
            vcpu_count: 2,
            memory_size: 128 * 1024 * 1024,
            arch: CpuArch::native(),
            ..Default::default()
        };

        let vm = KvmVm::new(kvm, mmap_size, config).unwrap();
        assert_eq!(vm.state(), VmState::Created);
        assert!(!vm.is_running());
    }

    // vCPU creation: valid IDs succeed once; duplicate and out-of-range
    // IDs are rejected.
    #[test]
    #[ignore] // Requires /dev/kvm
    fn test_vcpu_creation() {
        let kvm = Arc::new(KvmSystem::open().expect("Failed to open KVM"));
        let mmap_size = kvm.vcpu_mmap_size().expect("Failed to get mmap size");

        let config = VmConfig {
            vcpu_count: 4,
            memory_size: 128 * 1024 * 1024,
            ..Default::default()
        };

        let mut vm = KvmVm::new(kvm, mmap_size, config).unwrap();

        // Create valid vCPUs
        let vcpu0 = vm.create_vcpu(0);
        assert!(vcpu0.is_ok());
        assert_eq!(vcpu0.unwrap().id(), 0);

        let vcpu1 = vm.create_vcpu(1);
        assert!(vcpu1.is_ok());

        // Try to create same vCPU again
        let vcpu0_again = vm.create_vcpu(0);
        assert!(vcpu0_again.is_err());

        // Try to create vCPU with invalid ID (>= vcpu_count)
        let vcpu99 = vm.create_vcpu(99);
        assert!(vcpu99.is_err());
    }

    // Full state machine walk: Created -> Running -> Paused -> Running
    // -> Stopped, checking state() and is_running() at each step.
    #[test]
    #[ignore] // Requires /dev/kvm
    fn test_vm_lifecycle() {
        let kvm = Arc::new(KvmSystem::open().expect("Failed to open KVM"));
        let mmap_size = kvm.vcpu_mmap_size().expect("Failed to get mmap size");

        let config = VmConfig {
            vcpu_count: 1,
            memory_size: 64 * 1024 * 1024,
            ..Default::default()
        };

        let mut vm = KvmVm::new(kvm, mmap_size, config).unwrap();
        assert_eq!(vm.state(), VmState::Created);

        // Start
        vm.start().unwrap();
        assert_eq!(vm.state(), VmState::Running);
        assert!(vm.is_running());

        // Pause
        vm.pause().unwrap();
        assert_eq!(vm.state(), VmState::Paused);

        // Resume
        vm.resume().unwrap();
        assert_eq!(vm.state(), VmState::Running);

        // Stop
        vm.stop().unwrap();
        assert_eq!(vm.state(), VmState::Stopped);
        assert!(!vm.is_running());
    }

    // Devices are laid out sequentially from VIRTIO_MMIO_BASE with
    // SIZE + GAP spacing; adding a device after start() must fail.
    #[test]
    #[ignore] // Requires /dev/kvm
    fn test_add_virtio_device() {
        let kvm = Arc::new(KvmSystem::open().expect("Failed to open KVM"));
        let mmap_size = kvm.vcpu_mmap_size().expect("Failed to get mmap size");

        let config = VmConfig {
            vcpu_count: 1,
            memory_size: 64 * 1024 * 1024,
            ..Default::default()
        };

        let mut vm = KvmVm::new(kvm, mmap_size, config).unwrap();
        assert_eq!(vm.state(), VmState::Created);

        // Add a block device.
        let blk_config = VirtioDeviceConfig::block("/dev/null", true);
        vm.add_virtio_device(blk_config).unwrap();

        // Verify device info: first device sits at the MMIO base and has
        // valid (non-negative) eventfds.
        let devices = vm.virtio_devices().unwrap();
        assert_eq!(devices.len(), 1);
        assert_eq!(devices[0].device_type, VirtioDeviceType::Block);
        assert_eq!(devices[0].mmio_base, VIRTIO_MMIO_BASE);
        assert_eq!(devices[0].mmio_size, VIRTIO_MMIO_SIZE);
        assert!(devices[0].irq_fd >= 0);
        assert!(devices[0].notify_fd >= 0);

        // Add a network device.
        let net_config = VirtioDeviceConfig::net();
        vm.add_virtio_device(net_config).unwrap();

        // Verify second device lands one SIZE + GAP past the first.
        let devices = vm.virtio_devices().unwrap();
        assert_eq!(devices.len(), 2);
        assert_eq!(devices[1].device_type, VirtioDeviceType::Net);
        assert_eq!(
            devices[1].mmio_base,
            VIRTIO_MMIO_BASE + VIRTIO_MMIO_SIZE + VIRTIO_MMIO_GAP
        );

        // Cannot add device after VM starts.
        vm.start().unwrap();
        let fs_config = VirtioDeviceConfig::filesystem("/share", "share");
        let result = vm.add_virtio_device(fs_config);
        assert!(result.is_err());
    }

    // All-zero bitmap words -> no dirty pages reported.
    #[test]
    fn test_parse_dirty_bitmap_empty() {
        let bitmap: Vec<u64> = vec![0; 4];
        let pages = KvmVm::parse_dirty_bitmap(&bitmap, 0, 1024 * 1024);
        assert!(pages.is_empty());
    }

    // A fully-set word yields one entry per bit, in page order.
    #[test]
    fn test_parse_dirty_bitmap_all_dirty() {
        // 64 pages = 1 word, all bits set.
        let bitmap: Vec<u64> = vec![u64::MAX];
        let pages = KvmVm::parse_dirty_bitmap(&bitmap, 0, 64 * PAGE_SIZE);
        assert_eq!(pages.len(), 64);

        // Verify first and last page addresses.
        assert_eq!(pages[0].guest_addr, 0);
        assert_eq!(pages[0].size, PAGE_SIZE);
        assert_eq!(pages[63].guest_addr, 63 * PAGE_SIZE);
    }

    // Sparse bits across a word boundary map to the right guest addresses,
    // offset by the slot's base address.
    #[test]
    fn test_parse_dirty_bitmap_sparse() {
        // Only pages 0, 63, 64, and 127 are dirty (first and last of two words).
        let mut bitmap: Vec<u64> = vec![0; 2];
        bitmap[0] = 1 | (1 << 63); // Page 0 and 63.
        bitmap[1] = 1 | (1 << 63); // Page 64 and 127.

        let pages = KvmVm::parse_dirty_bitmap(&bitmap, 0x1000_0000, 128 * PAGE_SIZE);
        assert_eq!(pages.len(), 4);

        assert_eq!(pages[0].guest_addr, 0x1000_0000);
        assert_eq!(pages[1].guest_addr, 0x1000_0000 + 63 * PAGE_SIZE);
        assert_eq!(pages[2].guest_addr, 0x1000_0000 + 64 * PAGE_SIZE);
        assert_eq!(pages[3].guest_addr, 0x1000_0000 + 127 * PAGE_SIZE);
    }

    // Dirty tracking toggling: get_dirty_pages() errors while disabled,
    // enabling is idempotent, disabling restores the error.
    #[test]
    #[ignore] // Requires /dev/kvm
    fn test_dirty_tracking_enable_disable() {
        let kvm = Arc::new(KvmSystem::open().expect("Failed to open KVM"));
        let mmap_size = kvm.vcpu_mmap_size().expect("Failed to get mmap size");

        let config = VmConfig {
            vcpu_count: 1,
            memory_size: 16 * 1024 * 1024, // 16MB
            ..Default::default()
        };

        let vm = KvmVm::new(kvm, mmap_size, config).unwrap();

        // Initially disabled.
        assert!(!vm.is_dirty_tracking_enabled());

        // Cannot get dirty pages when not enabled.
        assert!(vm.get_dirty_pages().is_err());

        // Enable dirty tracking.
        vm.enable_dirty_tracking().unwrap();
        assert!(vm.is_dirty_tracking_enabled());

        // Re-enabling is a no-op.
        vm.enable_dirty_tracking().unwrap();
        assert!(vm.is_dirty_tracking_enabled());

        // Can get dirty pages (should be all pages initially).
        let pages = vm.get_dirty_pages().unwrap();
        // All pages might be marked dirty initially.
        println!("Initial dirty pages: {}", pages.len());

        // Disable dirty tracking.
        vm.disable_dirty_tracking().unwrap();
        assert!(!vm.is_dirty_tracking_enabled());

        // Cannot get dirty pages when disabled.
        assert!(vm.get_dirty_pages().is_err());
    }
}