Skip to main content

mimobox_vm/
restore_pool.rs

1#![cfg(all(target_os = "linux", feature = "kvm"))]
2
3use std::collections::VecDeque;
4use std::sync::{Arc, Mutex, MutexGuard};
5use std::time::{Duration, Instant};
6
7use kvm_bindings::kvm_userspace_memory_region;
8#[cfg(target_arch = "x86_64")]
9use kvm_bindings::{KVM_PIT_SPEAKER_DUMMY, kvm_pit_config};
10use kvm_ioctls::{Kvm, VcpuFd, VmFd};
11use mimobox_core::{DirEntry, FileStat, SandboxConfig, SandboxError, SandboxSnapshot};
12use thiserror::Error;
13use tracing::warn;
14use vm_memory::{GuestAddress, GuestMemory, GuestMemoryMmap};
15
16#[cfg(target_arch = "x86_64")]
17use crate::kvm::{KVM_IDENTITY_MAP_ADDR, KVM_TSS_ADDR};
18use crate::kvm::{KvmBackend, RestoreProfile, restore_runtime_state};
19#[cfg(feature = "zerocopy-fork")]
20use crate::snapshot::load_state_from_memory_file;
21use crate::vm::LifecycleError;
22use crate::{
23    GuestCommandResult, GuestExecOptions, HttpRequest, HttpResponse, MicrovmConfig, MicrovmError,
24    MicrovmSnapshot, StreamEvent,
25};
26
/// Pre-created KVM VM shell that has memory and vCPUs registered but no restored guest state.
///
/// A slot is built by [`EmptyVmSlot::new`] and consumed by one of the
/// `into_restored_backend*` methods, which hand every field over to
/// `KvmBackend::from_slot_components`.
///
/// NOTE(review): Rust drops struct fields in declaration order, so reordering
/// the fields changes the order in which the KVM handles and the guest memory
/// mapping are released — confirm that is acceptable before rearranging them.
pub(crate) struct EmptyVmSlot {
    /// KVM handle obtained from `Kvm::new`; the VM below was created from it.
    kvm: Kvm,
    /// VM file descriptor on which the guest memory region and vCPUs are registered.
    vm_fd: VmFd,
    /// One vCPU handle per configured vCPU, in creation (index) order.
    vcpus: Vec<VcpuFd>,
    /// Guest memory mapping: a single range that starts at guest address 0.
    guest_memory: GuestMemoryMmap,
    /// MicroVM configuration that determined the memory size and vCPU count.
    config: MicrovmConfig,
    /// Sandbox configuration passed through unchanged to the restored backend.
    base_config: SandboxConfig,
}
36
37impl EmptyVmSlot {
38    /// Creates an empty VM slot ready to accept snapshot memory and runtime state.
39    pub(crate) fn new(
40        base_config: SandboxConfig,
41        config: MicrovmConfig,
42    ) -> Result<Self, MicrovmError> {
43        config.validate()?;
44
45        let kvm = Kvm::new().map_err(to_backend_error)?;
46        let vm_fd = kvm.create_vm().map_err(to_backend_error)?;
47
48        #[cfg(target_arch = "x86_64")]
49        {
50            vm_fd.create_irq_chip().map_err(to_backend_error)?;
51            let pit_config = kvm_pit_config {
52                flags: KVM_PIT_SPEAKER_DUMMY,
53                ..Default::default()
54            };
55            vm_fd.create_pit2(pit_config).map_err(to_backend_error)?;
56            vm_fd
57                .set_identity_map_address(KVM_IDENTITY_MAP_ADDR)
58                .map_err(to_backend_error)?;
59            vm_fd
60                .set_tss_address(KVM_TSS_ADDR)
61                .map_err(to_backend_error)?;
62        }
63
64        let guest_memory =
65            GuestMemoryMmap::from_ranges(&[(GuestAddress(0), config.memory_bytes()?)])
66                .map_err(to_backend_error)?;
67        KvmBackend::try_enable_huge_pages(&guest_memory);
68
69        let mut vcpus = Vec::with_capacity(usize::from(config.vcpu_count));
70        for vcpu_index in 0..u64::from(config.vcpu_count) {
71            let vcpu = vm_fd.create_vcpu(vcpu_index).map_err(to_backend_error)?;
72            vcpus.push(vcpu);
73        }
74
75        register_guest_memory(&vm_fd, &guest_memory, &config)?;
76
77        Ok(Self {
78            kvm,
79            vm_fd,
80            vcpus,
81            guest_memory,
82            config,
83            base_config,
84        })
85    }
86
87    /// Converts this empty slot into a restored backend using in-memory snapshot data.
88    pub(crate) fn into_restored_backend(
89        self,
90        memory: &[u8],
91        vcpu_state: &[u8],
92    ) -> Result<KvmBackend, MicrovmError> {
93        let mut backend = KvmBackend::from_slot_components(
94            self.kvm,
95            self.vm_fd,
96            self.vcpus,
97            self.guest_memory,
98            self.base_config,
99            self.config,
100        );
101        backend.set_pending_restore_profile(RestoreProfile::default());
102
103        let mut restore_profile = backend.take_or_seed_restore_profile();
104
105        let restore_memory_started_at = Instant::now();
106        backend.restore_guest_memory(memory)?;
107        restore_profile.memory_state_write = restore_memory_started_at.elapsed();
108
109        restore_profile.cpuid_config = backend.prepare_restored_vcpus()?;
110
111        let runtime_restore_profile = restore_runtime_state(&mut backend, vcpu_state)?;
112        restore_profile.vcpu_state_restore = runtime_restore_profile.vcpu_state_restore;
113        restore_profile.device_state_restore = runtime_restore_profile.device_state_restore;
114
115        backend.set_lifecycle_ready();
116        backend.emit_restore_profile_without_resume(&restore_profile);
117        backend.set_pending_restore_profile(restore_profile);
118        Ok(backend)
119    }
120
121    /// Converts this empty slot into a restored backend using a file-backed memory snapshot.
122    #[cfg(feature = "zerocopy-fork")]
123    pub(crate) fn into_restored_backend_from_file(
124        self,
125        memory_path: &std::path::Path,
126        vcpu_state: &[u8],
127    ) -> Result<KvmBackend, MicrovmError> {
128        let mut backend = KvmBackend::from_slot_components(
129            self.kvm,
130            self.vm_fd,
131            self.vcpus,
132            self.guest_memory,
133            self.base_config,
134            self.config,
135        );
136        backend.set_pending_restore_profile(RestoreProfile::default());
137
138        let mut restore_profile = backend.take_or_seed_restore_profile();
139
140        let restore_memory_started_at = Instant::now();
141        backend.restore_from_file_zerocopy(memory_path)?;
142        restore_profile.memory_state_write = restore_memory_started_at.elapsed();
143
144        restore_profile.cpuid_config = backend.prepare_restored_vcpus()?;
145
146        let runtime_restore_profile = restore_runtime_state(&mut backend, vcpu_state)?;
147        restore_profile.vcpu_state_restore = runtime_restore_profile.vcpu_state_restore;
148        restore_profile.device_state_restore = runtime_restore_profile.device_state_restore;
149
150        backend.set_lifecycle_ready();
151        backend.emit_restore_profile_without_resume(&restore_profile);
152        backend.set_pending_restore_profile(restore_profile);
153        Ok(backend)
154    }
155}
156
/// Sizing limits for the idle empty-VM shells kept by a [`RestorePool`].
///
/// [`RestorePool::new`] rejects a configuration whose `max_size` is zero or
/// whose `min_size` is greater than `max_size`.
#[derive(Clone, Copy, Debug, Eq, PartialEq)]
pub struct RestorePoolConfig {
    /// Number of idle shells created up front and targeted when replenishing.
    pub min_size: usize,
    /// Upper bound on the number of idle shells the pool retains.
    pub max_size: usize,
}
165
/// Error returned by [`RestorePool`] operations.
#[derive(Debug, Error)]
pub enum RestorePoolError {
    /// Restore pool configuration is invalid.
    ///
    /// Produced by [`RestorePool::new`] when `max_size` is zero or `min_size`
    /// is greater than `max_size`.
    #[error("invalid restore pool config: min_size={min_size}, max_size={max_size}")]
    InvalidConfig {
        /// Rejected minimum idle target.
        min_size: usize,
        /// Rejected maximum capacity.
        max_size: usize,
    },

    /// Internal shared state lock is poisoned.
    ///
    /// Reported when locking the pool's state mutex returns a poison error.
    #[error("restore pool state lock poisoned")]
    StatePoisoned,

    /// Underlying microVM error.
    ///
    /// Displayed transparently; any [`MicrovmError`] converts into this variant
    /// via `From`, which is what lets `?` be used on microVM results.
    #[error(transparent)]
    Microvm(
        /// Source microVM error.
        #[from]
        MicrovmError,
    ),
}
190
/// Mutable bookkeeping guarded by the pool's state mutex.
#[derive(Default)]
struct RestorePoolState {
    /// Empty VM shells waiting to be restored into. Slots are pushed onto the
    /// back and popped from the back.
    idle: VecDeque<EmptyVmSlot>,
    /// Counter incremented when a restore starts and decremented (saturating at
    /// zero) when the restore is rolled back or the restored VM is released.
    in_use_count: usize,
}
196
/// Shared core of a [`RestorePool`]: fixed configuration plus lock-protected state.
struct RestorePoolInner {
    /// Sandbox configuration cloned into every newly created slot.
    base_config: SandboxConfig,
    /// MicroVM configuration cloned into every newly created slot.
    config: MicrovmConfig,
    /// Idle-slot sizing limits (`min_size` / `max_size`).
    pool_config: RestorePoolConfig,
    /// Idle queue and in-use counter.
    state: Mutex<RestorePoolState>,
}
203
204impl RestorePoolInner {
205    fn lock_state(&self) -> Result<MutexGuard<'_, RestorePoolState>, RestorePoolError> {
206        self.state
207            .lock()
208            .map_err(|_| RestorePoolError::StatePoisoned)
209    }
210
211    fn create_slot(&self) -> Result<EmptyVmSlot, RestorePoolError> {
212        EmptyVmSlot::new(self.base_config.clone(), self.config.clone()).map_err(Into::into)
213    }
214
215    fn rollback_in_use(&self) {
216        match self.state.lock() {
217            Ok(mut state) => {
218                state.in_use_count = state.in_use_count.saturating_sub(1);
219            }
220            Err(_) => {
221                warn!("回滚恢复池 in_use 计数失败:状态锁已中毒");
222            }
223        }
224    }
225
226    fn push_idle_slot(&self, slot: EmptyVmSlot) -> Result<bool, RestorePoolError> {
227        let mut state = self.lock_state()?;
228        if state.idle.len() >= self.pool_config.max_size {
229            return Ok(false);
230        }
231        state.idle.push_back(slot);
232        Ok(true)
233    }
234
235    fn warm(&self, target_idle_size: usize) -> Result<(), RestorePoolError> {
236        let target_idle_size = target_idle_size.min(self.pool_config.max_size);
237        let current_idle = self.lock_state()?.idle.len();
238        if current_idle >= target_idle_size {
239            return Ok(());
240        }
241
242        let create_count = target_idle_size.saturating_sub(current_idle);
243        let mut slots = Vec::with_capacity(create_count);
244        for _ in 0..create_count {
245            slots.push(self.create_slot()?);
246        }
247
248        let mut state = self.lock_state()?;
249        let available = self.pool_config.max_size.saturating_sub(state.idle.len());
250        let keep_count = available.min(slots.len());
251        for slot in slots.drain(..keep_count) {
252            state.idle.push_back(slot);
253        }
254        Ok(())
255    }
256
257    fn replenish_if_needed(&self) {
258        let should_replenish = match self.state.lock() {
259            Ok(state) => state.idle.len() < self.pool_config.min_size,
260            Err(_) => {
261                warn!("检查恢复池补充条件失败:状态锁已中毒");
262                return;
263            }
264        };
265
266        if !should_replenish {
267            return;
268        }
269
270        match self.create_slot() {
271            Ok(slot) => match self.push_idle_slot(slot) {
272                Ok(true) => {}
273                Ok(false) => {}
274                Err(err) => warn!("回填空壳 VM 失败: {err}"),
275            },
276            Err(err) => warn!("创建空壳 VM 失败,无法回填恢复池: {err}"),
277        }
278    }
279
280    fn release_backend(&self, mut backend: KvmBackend) {
281        if let Err(err) = backend.shutdown() {
282            warn!("销毁恢复态 VM 失败: {err}");
283        }
284
285        match self.state.lock() {
286            Ok(mut state) => {
287                state.in_use_count = state.in_use_count.saturating_sub(1);
288            }
289            Err(_) => {
290                warn!("释放恢复态 VM 失败:状态锁已中毒");
291                return;
292            }
293        }
294
295        self.replenish_if_needed();
296    }
297}
298
299impl Drop for RestorePoolInner {
300    fn drop(&mut self) {
301        // 获取 idle 队列中所有空壳 VM 并丢弃,释放 KVM/VmFd/VcpuFd 资源。
302        let idle = match self.state.lock() {
303            Ok(mut state) => std::mem::take(&mut state.idle),
304            Err(_) => {
305                warn!("RestorePool drop 时状态锁已中毒,无法清理 idle slot");
306                return;
307            }
308        };
309        let count = idle.len();
310        drop(idle);
311        if count > 0 {
312            tracing::debug!(count, "RestorePool drop 清理完成");
313        }
314    }
315}
316
/// Pool of empty VM shells optimized for snapshot restoration.
///
/// Unlike [`crate::pool::VmPool`], this pool does not keep fully booted guests.
/// Instead, it keeps KVM VM shells with memory and vCPU structures allocated so
/// snapshot state can be restored with less setup latency.
///
/// Clones share the same underlying pool: the only field is an [`Arc`] to the
/// shared state.
#[derive(Clone)]
pub struct RestorePool {
    /// Shared configuration and state; each [`PooledRestoreVm`] handed out holds
    /// another reference to it.
    inner: Arc<RestorePoolInner>,
}
326
327impl RestorePool {
328    /// Creates a pool of empty VM shells for snapshot restore.
329    ///
330    /// The pool validates capacity limits and microVM assets, then warms the
331    /// configured minimum number of empty slots.
332    pub fn new(
333        base_config: SandboxConfig,
334        config: MicrovmConfig,
335        pool_config: RestorePoolConfig,
336    ) -> Result<Self, RestorePoolError> {
337        if pool_config.max_size == 0 || pool_config.min_size > pool_config.max_size {
338            return Err(RestorePoolError::InvalidConfig {
339                min_size: pool_config.min_size,
340                max_size: pool_config.max_size,
341            });
342        }
343
344        config.validate()?;
345
346        let pool = Self {
347            inner: Arc::new(RestorePoolInner {
348                base_config,
349                config,
350                pool_config,
351                state: Mutex::new(RestorePoolState::default()),
352            }),
353        };
354
355        if pool.inner.pool_config.min_size > 0 {
356            pool.inner.warm(pool.inner.pool_config.min_size)?;
357        }
358
359        Ok(pool)
360    }
361
362    /// Restores a microVM from guest memory pages and serialized vCPU state.
363    ///
364    /// A pre-created slot is used when available; otherwise the pool creates one on
365    /// demand and rolls back accounting if restoration fails.
366    pub fn restore(
367        &self,
368        memory: &[u8],
369        vcpu_state: &[u8],
370    ) -> Result<PooledRestoreVm, RestorePoolError> {
371        let slot = {
372            let mut state = self.inner.lock_state()?;
373            state.in_use_count += 1;
374            state.idle.pop_back()
375        };
376
377        let slot = match slot {
378            Some(slot) => slot,
379            None => match self.inner.create_slot() {
380                Ok(slot) => slot,
381                Err(err) => {
382                    self.inner.rollback_in_use();
383                    return Err(err);
384                }
385            },
386        };
387
388        let backend = match slot.into_restored_backend(memory, vcpu_state) {
389            Ok(backend) => backend,
390            Err(err) => {
391                self.inner.rollback_in_use();
392                self.inner.replenish_if_needed();
393                return Err(err.into());
394            }
395        };
396
397        Ok(PooledRestoreVm {
398            backend: Some(backend),
399            pool: Arc::clone(&self.inner),
400        })
401    }
402
403    /// Restores a microVM from full self-describing snapshot bytes.
404    pub fn restore_from_bytes(&self, data: &[u8]) -> Result<PooledRestoreVm, RestorePoolError> {
405        let snapshot = MicrovmSnapshot::restore(data)?;
406        let (_, _, memory, vcpu_state) = snapshot.into_parts();
407        self.restore(memory.as_slice(), vcpu_state.as_slice())
408    }
409
410    /// Restores a microVM from a [`SandboxSnapshot`].
411    ///
412    /// File-backed snapshots use the optimized file path when the build supports it;
413    /// otherwise the snapshot is loaded into memory before restoration.
414    pub fn restore_snapshot(
415        &self,
416        snapshot: &SandboxSnapshot,
417    ) -> Result<PooledRestoreVm, RestorePoolError> {
418        if let Some(memory_path) = snapshot.memory_file_path() {
419            #[cfg(feature = "zerocopy-fork")]
420            {
421                let (_, _, vcpu_state) = load_state_from_memory_file(memory_path)?;
422                return self.restore_from_file(memory_path, vcpu_state.as_slice());
423            }
424
425            #[cfg(not(feature = "zerocopy-fork"))]
426            {
427                let snapshot = MicrovmSnapshot::from_memory_file(memory_path)?;
428                let (_, _, memory, vcpu_state) = snapshot.into_parts();
429                return self.restore(memory.as_slice(), vcpu_state.as_slice());
430            }
431        }
432
433        let data = snapshot.as_bytes().map_err(map_snapshot_access_error)?;
434        self.restore_from_bytes(data)
435    }
436
437    #[cfg(feature = "zerocopy-fork")]
438    fn restore_from_file(
439        &self,
440        memory_path: &std::path::Path,
441        vcpu_state: &[u8],
442    ) -> Result<PooledRestoreVm, RestorePoolError> {
443        let slot = {
444            let mut state = self.inner.lock_state()?;
445            state.in_use_count += 1;
446            state.idle.pop_back()
447        };
448
449        let slot = match slot {
450            Some(slot) => slot,
451            None => match self.inner.create_slot() {
452                Ok(slot) => slot,
453                Err(err) => {
454                    self.inner.rollback_in_use();
455                    return Err(err);
456                }
457            },
458        };
459
460        let backend = match slot.into_restored_backend_from_file(memory_path, vcpu_state) {
461            Ok(backend) => backend,
462            Err(err) => {
463                self.inner.rollback_in_use();
464                self.inner.replenish_if_needed();
465                return Err(err.into());
466            }
467        };
468
469        Ok(PooledRestoreVm {
470            backend: Some(backend),
471            pool: Arc::clone(&self.inner),
472        })
473    }
474
475    /// Returns the current number of idle slots in the restore pool.
476    ///
477    /// If the internal lock is poisoned, this method logs the condition and returns
478    /// `0` rather than panicking.
479    pub fn idle_count(&self) -> usize {
480        match self.inner.state.lock() {
481            Ok(state) => state.idle.len(),
482            Err(_) => {
483                warn!("查询恢复池空闲槽位失败:状态锁已中毒");
484                0
485            }
486        }
487    }
488
489    /// Warms the restore pool to at least `target` empty VM shells.
490    ///
491    /// The effective target is capped by [`RestorePoolConfig::max_size`].
492    pub fn warm(&self, target: usize) -> Result<(), RestorePoolError> {
493        self.inner.warm(target)
494    }
495}
496
/// Restored microVM handle borrowed from a [`RestorePool`].
///
/// Dropping the handle shuts down the restored VM and allows the pool to replenish
/// an empty slot if needed.
pub struct PooledRestoreVm {
    /// The restored backend. It is taken out (leaving `None`) when the handle is
    /// dropped; methods return a `Released` lifecycle error when it is `None`.
    backend: Option<KvmBackend>,
    /// Pool that produced this VM; used on drop to release the backend.
    pool: Arc<RestorePoolInner>,
}
505
506impl PooledRestoreVm {
507    /// Executes a guest command and waits for completion.
508    pub fn execute(&mut self, cmd: &[String]) -> Result<GuestCommandResult, MicrovmError> {
509        self.execute_with_options(cmd, GuestExecOptions::default())
510    }
511
512    /// Executes a guest command with command-level options.
513    ///
514    /// Returns [`LifecycleError::Released`] through [`MicrovmError::Lifecycle`] when
515    /// the restored handle has already been released.
516    pub fn execute_with_options(
517        &mut self,
518        cmd: &[String],
519        options: GuestExecOptions,
520    ) -> Result<GuestCommandResult, MicrovmError> {
521        match self.backend.as_mut() {
522            Some(backend) => backend.run_command_with_options(cmd, &options),
523            None => Err(MicrovmError::Lifecycle(LifecycleError::Released(
524                "restored VM has been released".into(),
525            ))),
526        }
527    }
528
529    /// Executes a guest command and returns a receiver for streaming output events.
530    pub fn stream_execute(
531        &mut self,
532        cmd: &[String],
533    ) -> Result<std::sync::mpsc::Receiver<StreamEvent>, MicrovmError> {
534        self.stream_execute_with_options(cmd, GuestExecOptions::default())
535    }
536
537    /// Executes a guest command as streaming output events with command-level options.
538    pub fn stream_execute_with_options(
539        &mut self,
540        cmd: &[String],
541        options: GuestExecOptions,
542    ) -> Result<std::sync::mpsc::Receiver<StreamEvent>, MicrovmError> {
543        match self.backend.as_mut() {
544            Some(backend) => backend.run_command_streaming_with_options(cmd, &options),
545            None => Err(MicrovmError::Lifecycle(LifecycleError::Released(
546                "restored VM has been released".into(),
547            ))),
548        }
549    }
550
551    /// Reads file contents from the restored guest filesystem.
552    pub fn read_file(&mut self, path: &str) -> Result<Vec<u8>, MicrovmError> {
553        match self.backend.as_mut() {
554            Some(backend) => backend.read_file(path),
555            None => Err(MicrovmError::Lifecycle(LifecycleError::Released(
556                "restored VM has been released".into(),
557            ))),
558        }
559    }
560
561    /// Writes file contents into the restored guest filesystem.
562    pub fn write_file(&mut self, path: &str, data: &[u8]) -> Result<(), MicrovmError> {
563        match self.backend.as_mut() {
564            Some(backend) => backend.write_file(path, data),
565            None => Err(MicrovmError::Lifecycle(LifecycleError::Released(
566                "restored VM has been released".into(),
567            ))),
568        }
569    }
570
571    /// Lists directory entries from the restored guest filesystem.
572    pub fn list_dir(&mut self, path: &str) -> Result<Vec<DirEntry>, MicrovmError> {
573        crate::guest_file_ops::list_dir(path, |cmd| self.execute(cmd))
574    }
575
576    /// Returns whether a guest path exists.
577    pub fn file_exists(&mut self, path: &str) -> Result<bool, MicrovmError> {
578        crate::guest_file_ops::file_exists(path, |cmd| self.execute(cmd))
579    }
580
581    /// Removes a file from the restored guest filesystem.
582    pub fn remove_file(&mut self, path: &str) -> Result<(), MicrovmError> {
583        crate::guest_file_ops::remove_file(path, |cmd| self.execute(cmd))
584    }
585
586    /// Renames or moves a file inside the restored guest filesystem.
587    pub fn rename(&mut self, from: &str, to: &str) -> Result<(), MicrovmError> {
588        crate::guest_file_ops::rename(from, to, |cmd| self.execute(cmd))
589    }
590
591    /// Returns restored guest file metadata.
592    pub fn stat(&mut self, path: &str) -> Result<FileStat, MicrovmError> {
593        crate::guest_file_ops::stat(path, |cmd| self.execute(cmd))
594    }
595
596    /// Runs one guest `PING`/`PONG` readiness probe and returns the round-trip duration.
597    pub fn ping(&mut self) -> Result<Duration, MicrovmError> {
598        match self.backend.as_mut() {
599            Some(backend) => backend.ping(),
600            None => Err(MicrovmError::Lifecycle(LifecycleError::Released(
601                "restored VM has been released".into(),
602            ))),
603        }
604    }
605
606    /// Sends a request through the host-controlled HTTP proxy for the restored VM.
607    pub fn http_request(&mut self, request: HttpRequest) -> Result<HttpResponse, MicrovmError> {
608        match self.backend.as_mut() {
609            Some(backend) => backend.http_request(request),
610            None => Err(MicrovmError::Lifecycle(LifecycleError::Released(
611                "restored VM has been released".into(),
612            ))),
613        }
614    }
615
616    /// Exports a file-backed snapshot of the current restored VM.
617    pub fn snapshot(&self) -> Result<SandboxSnapshot, MicrovmError> {
618        match self.backend.as_ref() {
619            Some(backend) => backend.snapshot_to_file(),
620            None => Err(MicrovmError::Lifecycle(LifecycleError::Released(
621                "restored VM has been released".into(),
622            ))),
623        }
624    }
625}
626
627impl Drop for PooledRestoreVm {
628    fn drop(&mut self) {
629        if let Some(backend) = self.backend.take() {
630            self.pool.release_backend(backend);
631        }
632    }
633}
634
635fn register_guest_memory(
636    vm_fd: &VmFd,
637    guest_memory: &GuestMemoryMmap,
638    config: &MicrovmConfig,
639) -> Result<(), MicrovmError> {
640    let host_addr = guest_memory
641        .get_host_address(GuestAddress(0))
642        .map_err(to_backend_error)? as u64;
643    let memory_size = u64::try_from(config.memory_bytes()?).map_err(|_| {
644        MicrovmError::Backend("guest memory size cannot be converted to u64".into())
645    })?;
646    let memory_region = kvm_userspace_memory_region {
647        slot: 0,
648        guest_phys_addr: 0,
649        memory_size,
650        userspace_addr: host_addr,
651        flags: 0,
652    };
653
654    // SAFETY: `userspace_addr` comes directly from a valid mmap owned by the current
655    // `GuestMemoryMmap`. Its lifetime covers the entire empty slot, and this only
656    // registers slot 0, so it cannot overlap with other regions.
657    unsafe {
658        vm_fd
659            .set_user_memory_region(memory_region)
660            .map_err(to_backend_error)?;
661    }
662    Ok(())
663}
664
665fn to_backend_error(err: impl std::fmt::Display) -> MicrovmError {
666    MicrovmError::Backend(err.to_string())
667}
668
669fn map_snapshot_access_error(error: SandboxError) -> RestorePoolError {
670    let error = match error {
671        SandboxError::Io(error) => MicrovmError::Io(error),
672        SandboxError::InvalidSnapshot => {
673            MicrovmError::SnapshotFormat("invalid sandbox snapshot".into())
674        }
675        other => MicrovmError::SnapshotFormat(other.to_string()),
676    };
677    RestorePoolError::Microvm(error)
678}