Skip to main content

squib_virtio/devices/
block.rs

1//! virtio-block — sync engine.
2//!
3//! Per [14-virtio-and-devices.md § 4.1](../../../specs/14-virtio-and-devices.md#41-virtio-block):
4//!
5//! > **Sync engine**: every queue notification handler reads/writes
6//! > synchronously on the device thread. Suitable for low-IOPS workloads.
7//! > **Async engine**: queue notifications dispatch onto
8//! > `tokio::task::spawn_blocking`, which calls `pread`/`pwrite` against an
9//! > `F_NOCACHE`-opened fd. The macOS analogue of Linux `O_DIRECT`.
10//!
//! The async engine + rate limiter are Phase 7 perf work; they are tracked in
//! [93-improvements-review.md](../../../specs/93-improvements-review.md)
//! as known follow-ups so the sync engine ships clean for 1.0 functional
//! coverage. Their building blocks — [`AsyncFileBackend`] (lockless
//! `pread`/`pwrite`) and [`RateLimitedBackend`] (token bucket) — are defined
//! further down in this file. The trait [`BlockBackend`] abstracts the I/O so
//! the async path can drop in without touching the device front-end logic.
16
17// The sync block engine deliberately uses `std::fs::File` and synchronous
18// I/O per [14-virtio-and-devices.md §
19// 4.1](../../../specs/14-virtio-and-devices.md#41-virtio-block): "every queue notification handler
20// reads/writes synchronously on the device thread". The workspace `disallowed-types` lint pushes
21// runtime I/O onto Tokio; the sync engine is the explicit blocking-I/O alternative for
22// low-IOPS workloads, so the allow is intentional. The async engine uses
23// `tokio::task::spawn_blocking` and lives in the deferred-findings list.
24#[allow(clippy::disallowed_types)]
25use std::fs::{File, OpenOptions};
26use std::{
27    io::{Read, Seek, SeekFrom, Write},
28    path::PathBuf,
29    sync::Arc,
30};
31
32use parking_lot::Mutex;
33use squib_core::{GuestAddress, GuestMemory};
34
35use crate::{
36    device::{ActivateError, VirtioDevice},
37    device_id::VirtioDeviceType,
38    interrupt::IrqLine,
39    queue::Queue,
40};
41
/// Sector size in bytes — fixed at 512 by the virtio-block spec.
///
/// Request sector numbers are multiplied by this to obtain byte offsets into
/// the backend, and the same value is published as `blk_size` in config space.
pub const SECTOR_SIZE: u64 = 512;

/// `VIRTIO_BLK_T_IN` — guest read.
pub const REQ_TYPE_IN: u32 = 0;
/// `VIRTIO_BLK_T_OUT` — guest write.
pub const REQ_TYPE_OUT: u32 = 1;
/// `VIRTIO_BLK_T_FLUSH` — durability barrier.
pub const REQ_TYPE_FLUSH: u32 = 4;
/// `VIRTIO_BLK_T_GET_ID` — fetch the device's identifier string.
pub const REQ_TYPE_GET_ID: u32 = 8;

/// `VIRTIO_BLK_S_OK` — request completed successfully.
pub const STATUS_OK: u8 = 0;
/// `VIRTIO_BLK_S_IOERR` — host-side I/O failure.
pub const STATUS_IOERR: u8 = 1;
/// `VIRTIO_BLK_S_UNSUPP` — request type unrecognized.
pub const STATUS_UNSUPP: u8 = 2;

/// `VIRTIO_BLK_F_RO` — device is read-only.
pub const F_RO: u64 = 1 << 5;
/// `VIRTIO_BLK_F_BLK_SIZE` — driver may read `blk_size` from config-space.
pub const F_BLK_SIZE: u64 = 1 << 6;
/// `VIRTIO_BLK_F_FLUSH` — device honours flush requests.
pub const F_FLUSH: u64 = 1 << 9;

/// Index of the request virtqueue in the device's queue list; the device
/// creates exactly one queue.
const REQ_QUEUE: usize = 0;
/// Maximum size of the request virtqueue. `read_config` also derives the
/// published `seg_max` from it (`QUEUE_MAX_SIZE - 2`).
const QUEUE_MAX_SIZE: u16 = 256;
70
/// Cache mode — surfaces in feature bits.
///
/// The only effect visible in this file is whether `BlockDevice::new` offers
/// `VIRTIO_BLK_F_FLUSH` to the driver.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum CacheType {
    /// `Unsafe` cache mode — no flush; matches Firecracker default.
    /// `VIRTIO_BLK_F_FLUSH` is not offered.
    Unsafe,
    /// `Writeback` cache mode — `VIRTIO_BLK_F_FLUSH` is offered.
    Writeback,
}
79
/// Block-device configuration.
///
/// Consumed by `BlockDevice::new`, which derives the offered feature bits
/// from `is_read_only` and `cache_type` and keeps the rest for later use.
#[derive(Debug, Clone)]
pub struct BlockConfig {
    /// Operator-supplied identifier — surfaces via `VIRTIO_BLK_T_GET_ID`.
    /// Only the first 20 bytes are returned to the guest.
    pub drive_id: String,
    /// Path to the host-side backing file.
    // NOTE(review): not read anywhere in this file; the backend is opened by
    // the caller and passed to `BlockDevice::new` separately.
    pub path_on_host: PathBuf,
    /// `true` if the drive is the root device (informational; no behavior).
    pub is_root_device: bool,
    /// `true` opens the backing file read-only and offers `VIRTIO_BLK_F_RO`.
    // NOTE(review): within this file the flag only controls the `F_RO` feature
    // bit; whether the file is actually opened read-only depends on how the
    // caller constructed the backend — confirm both stay in sync.
    pub is_read_only: bool,
    /// `Unsafe` (no flush) or `Writeback` (flush honored).
    pub cache_type: CacheType,
    /// PARTUUID, threaded into the FDT `boot_args` if `is_root_device`.
    pub partuuid: Option<String>,
}
96
/// Backend abstraction — the sync engine implements this against a
/// `parking_lot::Mutex<File>`; the async engine (Phase 7) wraps a Tokio task.
///
/// All methods take `&self`, so implementations provide their own interior
/// synchronization; the device front end holds the backend as
/// `Arc<dyn BlockBackend>`.
pub trait BlockBackend: Send + Sync + std::fmt::Debug {
    /// Total size of the backing storage in bytes.
    fn size_bytes(&self) -> u64;

    /// Read `buf.len()` bytes starting at `byte_offset`.
    ///
    /// The whole buffer must be filled; the implementations in this file
    /// report a short read as an error rather than returning a byte count.
    ///
    /// # Errors
    /// `std::io::Error` for any host failure.
    fn read_at(&self, byte_offset: u64, buf: &mut [u8]) -> std::io::Result<()>;

    /// Write `buf.len()` bytes starting at `byte_offset`.
    ///
    /// # Errors
    /// `std::io::Error` for any host failure. The file-backed implementations
    /// in this file return `PermissionDenied` when opened read-only.
    fn write_at(&self, byte_offset: u64, buf: &[u8]) -> std::io::Result<()>;

    /// Synchronous flush (`fsync`).
    ///
    /// # Errors
    /// `std::io::Error` for any host failure.
    fn flush(&self) -> std::io::Result<()>;

    /// `true` if the backing storage is opened read-only.
    fn read_only(&self) -> bool;
}
124
125/// Synchronous file-backed `BlockBackend` — opens the host file once and
126/// serializes I/O through a `parking_lot::Mutex<File>`. The mutex guard is
127/// released before the device thread re-acquires its own state lock so the
128/// bus is not blocked across host I/O.
129///
130/// The workspace `disallowed-types` lint pushes runtime I/O onto Tokio; the
131/// sync engine is the explicit blocking-I/O alternative for low-IOPS
132/// workloads (per [14-virtio-and-devices.md §
133/// 4.1](../../../specs/14-virtio-and-devices.md#41-virtio-block)) so the `std::fs::File` allow is
134/// intentional. The async engine uses `tokio::task::spawn_blocking` and lives in a follow-up.
135#[derive(Debug)]
136#[allow(clippy::disallowed_types)]
137pub struct SyncFileBackend {
138    file: Mutex<File>,
139    size_bytes: u64,
140    read_only: bool,
141}
142
143#[allow(clippy::disallowed_types, clippy::disallowed_methods)]
144impl SyncFileBackend {
145    /// Open `path` for sync I/O.
146    ///
147    /// # Errors
148    /// `std::io::Error` if the file cannot be opened.
149    pub fn open(path: &std::path::Path, read_only: bool) -> std::io::Result<Self> {
150        let mut opts = OpenOptions::new();
151        opts.read(true);
152        if !read_only {
153            opts.write(true);
154        }
155        let file = opts.open(path)?;
156        let size_bytes = file.metadata()?.len();
157        Ok(Self {
158            file: Mutex::new(file),
159            size_bytes,
160            read_only,
161        })
162    }
163}
164
165impl BlockBackend for SyncFileBackend {
166    fn size_bytes(&self) -> u64 {
167        self.size_bytes
168    }
169    fn read_at(&self, byte_offset: u64, buf: &mut [u8]) -> std::io::Result<()> {
170        let mut f = self.file.lock();
171        f.seek(SeekFrom::Start(byte_offset))?;
172        f.read_exact(buf)
173    }
174    fn write_at(&self, byte_offset: u64, buf: &[u8]) -> std::io::Result<()> {
175        if self.read_only {
176            return Err(std::io::Error::new(
177                std::io::ErrorKind::PermissionDenied,
178                "read-only block backend",
179            ));
180        }
181        let mut f = self.file.lock();
182        f.seek(SeekFrom::Start(byte_offset))?;
183        f.write_all(buf)
184    }
185    fn flush(&self) -> std::io::Result<()> {
186        let f = self.file.lock();
187        f.sync_data()
188    }
189    fn read_only(&self) -> bool {
190        self.read_only
191    }
192}
193
/// Direct-I/O file-backed `BlockBackend` for the high-IOPS path.
///
/// The structural difference from `SyncFileBackend` is **lockless positioned
/// I/O**: reads and writes go through `pread(2)` / `pwrite(2)` (exposed as
/// `std::os::unix::fs::FileExt`), so several threads can operate on the same
/// fd concurrently instead of queueing on a `Mutex<File>`. The 100 K IOPS
/// budget in [71 § 3](../../../specs/71-performance-budgets.md#3-block-io)
/// needs that — a mutex-serialised engine is capped at the latency of one
/// pread per op (≈ 30 μs ⇒ 33 K IOPS ceiling on a single thread).
///
/// **`F_NOCACHE` is not set here.** `squib-virtio` carries
/// `#![forbid(unsafe_code)]` (I-CRATE-2), so the `fcntl(F_NOCACHE)` call lives
/// in `squib-host::block_io::set_f_nocache`, which can prepare the fd before
/// handing it to [`Self::from_file`] (the operator-supplied path is opened
/// once at boot, so the small upfront cost is fine). The host-side cache
/// pressure without `F_NOCACHE` is small for the Lambda workload profile; the
/// perf-tuning lane benchmark
/// ([71 § 3](../../../specs/71-performance-budgets.md#3-block-io)) gates the
/// regression once the bench harness lights up.
///
/// Functionally interchangeable with `SyncFileBackend`: build it with
/// [`Self::open`] and pass it wherever the device front end accepts an
/// `Arc<dyn BlockBackend>`.
#[derive(Debug)]
#[allow(clippy::disallowed_types)]
pub struct AsyncFileBackend {
    /// Shared handle used for positioned I/O; never seeked.
    file: File,
    /// File length sampled once at construction.
    size_bytes: u64,
    /// Whether writes are rejected.
    read_only: bool,
}

#[allow(clippy::disallowed_types, clippy::disallowed_methods)]
impl AsyncFileBackend {
    /// Open `path` for lockless positioned I/O.
    ///
    /// Read access is always requested; write access only when `read_only` is
    /// `false`. The resulting fd is shared by every thread that calls into
    /// `BlockBackend`; concurrent positioned I/O is safe via `pread`/`pwrite`.
    ///
    /// # Errors
    /// `std::io::Error` if the file cannot be opened.
    pub fn open(path: &std::path::Path, read_only: bool) -> std::io::Result<Self> {
        OpenOptions::new()
            .read(true)
            .write(!read_only)
            .open(path)
            .and_then(|handle| Self::from_file(handle, read_only))
    }

    /// Wrap an already-open `File`. Used by `squib-host` after applying
    /// `F_NOCACHE` via the unsafe `fcntl` boundary: the operator constructs
    /// the fd, hands it over, and this type owns the read/write side.
    ///
    /// The file length is read from metadata once, here.
    ///
    /// # Errors
    /// `std::io::Error` if the file's metadata can't be queried.
    pub fn from_file(file: File, read_only: bool) -> std::io::Result<Self> {
        let meta = file.metadata()?;
        Ok(Self {
            size_bytes: meta.len(),
            file,
            read_only,
        })
    }
}
260
261impl BlockBackend for AsyncFileBackend {
262    fn size_bytes(&self) -> u64 {
263        self.size_bytes
264    }
265    fn read_at(&self, byte_offset: u64, buf: &mut [u8]) -> std::io::Result<()> {
266        // `read_at` (pread) doesn't change the fd seek position, so concurrent
267        // calls against the same fd are safe per `FileExt` contract. No mutex.
268        use std::os::unix::fs::FileExt as _;
269        // `read_exact_at` would loop on partial reads; we want exactly that
270        // semantic for virtio-block where requests are size-bounded.
271        self.file.read_exact_at(buf, byte_offset)
272    }
273    fn write_at(&self, byte_offset: u64, buf: &[u8]) -> std::io::Result<()> {
274        use std::os::unix::fs::FileExt as _;
275        if self.read_only {
276            return Err(std::io::Error::new(
277                std::io::ErrorKind::PermissionDenied,
278                "read-only block backend",
279            ));
280        }
281        self.file.write_all_at(buf, byte_offset)
282    }
283    fn flush(&self) -> std::io::Result<()> {
284        self.file.sync_data()
285    }
286    fn read_only(&self) -> bool {
287        self.read_only
288    }
289}
290
/// Token-bucket rate limiter wrapping any [`BlockBackend`].
///
/// `tower::Layer`-shaped (without the actual `tower` dependency — the
/// trait surface here is sync, not request/response): construct via
/// [`RateLimitedBackend::new`] with a wrapped backend and a [`RateLimit`]
/// budget. Every `read_at` / `write_at` calls `acquire(buf.len())` first;
/// if the bucket is empty the call blocks (`std::thread::sleep`) until
/// enough tokens replenish.
///
/// Reads and writes draw from the same bucket; `flush`, `size_bytes` and
/// `read_only` are forwarded to the wrapped backend without throttling.
///
/// Per [14 § 4.1](../../../specs/14-virtio-and-devices.md#41-virtio-block) and
/// the `D7-adjacent` rate-limiter requirement: bound aggregate throughput
/// within ±5 % of the configured rate. The bucket is refilled
/// continuously based on wall-clock elapsed time since the last drain,
/// which gives a smooth ±n % envelope at any rate.
#[derive(Debug)]
pub struct RateLimitedBackend {
    /// Backend that performs the actual I/O once tokens have been acquired.
    inner: Arc<dyn BlockBackend>,
    /// Current token count and last refill time, guarded by a mutex.
    state: Mutex<TokenBucket>,
    /// Refill rate and burst size this limiter enforces.
    config: RateLimit,
}
311
/// Throughput cap in bytes/sec plus the burst allowance that goes with it.
/// Use [`RateLimit::unlimited`] to bypass throttling.
#[derive(Debug, Clone, Copy)]
pub struct RateLimit {
    /// Refill rate (bytes per second). `u64::MAX` means "no limit".
    pub bytes_per_sec: u64,
    /// Burst — bucket size in bytes; max instantaneous draw.
    pub burst_bytes: u64,
}

impl RateLimit {
    /// No rate limit (pass-through): both fields are set to `u64::MAX`.
    #[must_use]
    pub const fn unlimited() -> Self {
        let no_cap = u64::MAX;
        Self {
            bytes_per_sec: no_cap,
            burst_bytes: no_cap,
        }
    }

    /// `bytes_per_sec` as a steady-state cap, with a 100ms-worth burst.
    ///
    /// The burst is `bytes_per_sec / 10` using integer division, so rates
    /// below 10 bytes/sec yield a burst of zero.
    #[must_use]
    pub const fn steady(bytes_per_sec: u64) -> Self {
        let burst_bytes = bytes_per_sec / 10;
        Self {
            bytes_per_sec,
            burst_bytes,
        }
    }
}
340
/// Mutable state of the rate limiter: how many bytes may be drawn right now
/// and when that figure was last brought up to date.
#[derive(Debug)]
struct TokenBucket {
    /// Bytes currently available to draw.
    tokens: u64,
    /// Instant at which `tokens` was last credited with elapsed-time earnings.
    last_refill: std::time::Instant,
}
346
347impl RateLimitedBackend {
348    /// Wrap `inner` with a rate-limit gate. `inner` is held by `Arc` so
349    /// the same backend can sit behind several rate-limit slices.
350    #[must_use]
351    pub fn new(inner: Arc<dyn BlockBackend>, config: RateLimit) -> Self {
352        Self {
353            state: Mutex::new(TokenBucket {
354                tokens: config.burst_bytes,
355                last_refill: std::time::Instant::now(),
356            }),
357            config,
358            inner,
359        }
360    }
361
362    fn refill(state: &mut TokenBucket, config: &RateLimit, ceiling: u64) {
363        if config.bytes_per_sec == u64::MAX {
364            state.tokens = ceiling;
365            return;
366        }
367        let now = std::time::Instant::now();
368        let elapsed = now.duration_since(state.last_refill);
369        // Saturating to defeat overflow on long pauses (e.g. snapshot
370        // restore where the bucket sits idle for hours).
371        let earned_u128 =
372            u128::from(config.bytes_per_sec).saturating_mul(elapsed.as_nanos()) / 1_000_000_000;
373        let earned = u64::try_from(earned_u128).unwrap_or(u64::MAX);
374        state.tokens = state.tokens.saturating_add(earned).min(ceiling);
375        state.last_refill = now;
376    }
377
378    fn acquire(&self, n: u64) {
379        if self.config.bytes_per_sec == u64::MAX {
380            return;
381        }
382        // For oversize requests (n > burst_bytes), let the bucket grow up to
383        // `n` for this call so the request can complete in finite time.
384        // Steady-state burst is still `burst_bytes`; this is the
385        // single-request override.
386        let ceiling = n.max(self.config.burst_bytes);
387        let mut g = self.state.lock();
388        Self::refill(&mut g, &self.config, ceiling);
389        if g.tokens >= n {
390            g.tokens -= n;
391            return;
392        }
393        // Sleep for the remainder of the time it would take to refill
394        // `n - tokens` bytes, then drain. `parking_lot::Mutex` is not
395        // poison-safe; we hold it across the sleep deliberately so a
396        // concurrent caller cannot bleed in and drain the bucket.
397        let needed = n - g.tokens;
398        let nanos_to_wait =
399            (u128::from(needed) * 1_000_000_000) / u128::from(self.config.bytes_per_sec.max(1));
400        let wait =
401            std::time::Duration::from_nanos(u64::try_from(nanos_to_wait).unwrap_or(u64::MAX));
402        // Suppressing the cv to avoid the wait_for-loop bug where a
403        // refill clamped to `burst_bytes` would never reach `n`. Single
404        // sleep then drain is correct: by the time we wake, at least
405        // `needed` more bytes have been "earned" by the rate limit.
406        std::thread::sleep(wait);
407        Self::refill(&mut g, &self.config, ceiling);
408        g.tokens = g.tokens.saturating_sub(n);
409    }
410}
411
412impl BlockBackend for RateLimitedBackend {
413    fn size_bytes(&self) -> u64 {
414        self.inner.size_bytes()
415    }
416    fn read_at(&self, byte_offset: u64, buf: &mut [u8]) -> std::io::Result<()> {
417        self.acquire(u64::try_from(buf.len()).unwrap_or(u64::MAX));
418        self.inner.read_at(byte_offset, buf)
419    }
420    fn write_at(&self, byte_offset: u64, buf: &[u8]) -> std::io::Result<()> {
421        self.acquire(u64::try_from(buf.len()).unwrap_or(u64::MAX));
422        self.inner.write_at(byte_offset, buf)
423    }
424    fn flush(&self) -> std::io::Result<()> {
425        self.inner.flush()
426    }
427    fn read_only(&self) -> bool {
428        self.inner.read_only()
429    }
430}
431
/// virtio-block frontend.
///
/// Owns the single request virtqueue, parses request chains popped from it,
/// and forwards the I/O to a [`BlockBackend`].
#[derive(Debug)]
pub struct BlockDevice {
    /// Feature bits offered to the driver, computed once in `new`.
    avail: u64,
    /// Feature bits the driver acknowledged (stored verbatim).
    acked: u64,
    /// Virtqueues; holds exactly one entry, the request queue.
    queues: Vec<Queue>,
    /// Configuration the device was built from.
    config: BlockConfig,
    /// Storage engine that services reads, writes and flushes.
    backend: Arc<dyn BlockBackend>,
    /// Guest memory / interrupt line installed by `activate`.
    state: Arc<Mutex<ActiveState>>,
}
442
/// Resources handed to the device at activation time. Everything is
/// empty/`false` until `activate` runs.
#[derive(Debug, Default)]
struct ActiveState {
    /// Guest memory used to read request headers and move payload data.
    mem: Option<Arc<dyn GuestMemory>>,
    /// Interrupt line triggered after requests have been completed.
    irq: Option<IrqLine>,
    /// Set to `true` by `activate`; reported by `is_activated`.
    activated: bool,
}
449
450impl BlockDevice {
451    /// Build a virtio-block from a validated [`BlockConfig`] and a
452    /// concrete backend.
453    #[must_use]
454    pub fn new(config: BlockConfig, backend: Arc<dyn BlockBackend>) -> Self {
455        let mut avail = F_BLK_SIZE;
456        if config.is_read_only || backend.read_only() {
457            avail |= F_RO;
458        }
459        if matches!(config.cache_type, CacheType::Writeback) {
460            avail |= F_FLUSH;
461        }
462        Self {
463            avail,
464            acked: 0,
465            queues: vec![Queue::new(QUEUE_MAX_SIZE)],
466            config,
467            backend,
468            state: Arc::new(Mutex::new(ActiveState::default())),
469        }
470    }
471
472    /// Backing-store size in 512-byte sectors (virtio-block config layout).
473    #[must_use]
474    pub fn capacity_sectors(&self) -> u64 {
475        self.backend.size_bytes() / SECTOR_SIZE
476    }
477
478    fn handle_request_inner(
479        backend: &dyn BlockBackend,
480        drive_id: &str,
481        mem: &dyn GuestMemory,
482        descs: &[crate::queue::Descriptor],
483    ) -> (u8, u32) {
484        // The driver lays out: header descriptor, payload descriptor(s),
485        // status descriptor (always device-write, 1 byte).
486        if descs.len() < 2 {
487            return (STATUS_IOERR, 0);
488        }
489        let header = descs[0];
490        if header.is_write_only() || header.len < 16 {
491            return (STATUS_IOERR, 0);
492        }
493        let Ok(req_type) = mem.read_u32_le(header.addr) else {
494            return (STATUS_IOERR, 0);
495        };
496        let Ok(sector) = mem.read_u64_le(GuestAddress(header.addr.raw() + 8)) else {
497            return (STATUS_IOERR, 0);
498        };
499        // Payload descriptors are everything between header and status.
500        let payload = &descs[1..descs.len() - 1];
501        let status_desc = descs.last().copied().unwrap_or(header);
502        if !status_desc.is_write_only() || status_desc.len < 1 {
503            return (STATUS_IOERR, 0);
504        }
505        let mut bytes_written: u32 = 0;
506        let status = match req_type {
507            REQ_TYPE_IN => match Self::do_read(backend, mem, payload, sector) {
508                Ok(written) => {
509                    bytes_written = written;
510                    STATUS_OK
511                }
512                Err(_) => STATUS_IOERR,
513            },
514            REQ_TYPE_OUT => match Self::do_write(backend, mem, payload, sector) {
515                Ok(()) => STATUS_OK,
516                Err(_) => STATUS_IOERR,
517            },
518            REQ_TYPE_FLUSH => match backend.flush() {
519                Ok(()) => STATUS_OK,
520                Err(_) => STATUS_IOERR,
521            },
522            REQ_TYPE_GET_ID => match Self::do_get_id(drive_id, mem, payload) {
523                Ok(written) => {
524                    bytes_written = written;
525                    STATUS_OK
526                }
527                Err(_) => STATUS_IOERR,
528            },
529            _ => STATUS_UNSUPP,
530        };
531        // Status byte is always one byte.
532        if mem.write(status_desc.addr, &[status]).is_err() {
533            return (STATUS_IOERR, bytes_written);
534        }
535        (status, bytes_written.saturating_add(1))
536    }
537
538    fn do_read(
539        backend: &dyn BlockBackend,
540        mem: &dyn GuestMemory,
541        payload: &[crate::queue::Descriptor],
542        sector: u64,
543    ) -> std::io::Result<u32> {
544        let mut byte_off = sector
545            .checked_mul(SECTOR_SIZE)
546            .ok_or_else(|| std::io::Error::other("sector*SECTOR_SIZE overflow"))?;
547        let mut total: u32 = 0;
548        for desc in payload {
549            if !desc.is_write_only() {
550                continue;
551            }
552            let len = desc.len as usize;
553            let mut buf = vec![0u8; len];
554            backend.read_at(byte_off, &mut buf)?;
555            mem.write(desc.addr, &buf)
556                .map_err(|e| std::io::Error::other(e.to_string()))?;
557            byte_off = byte_off
558                .checked_add(u64::from(desc.len))
559                .ok_or_else(|| std::io::Error::other("descriptor offset overflow"))?;
560            total = total.saturating_add(desc.len);
561        }
562        Ok(total)
563    }
564
565    fn do_write(
566        backend: &dyn BlockBackend,
567        mem: &dyn GuestMemory,
568        payload: &[crate::queue::Descriptor],
569        sector: u64,
570    ) -> std::io::Result<()> {
571        let mut byte_off = sector
572            .checked_mul(SECTOR_SIZE)
573            .ok_or_else(|| std::io::Error::other("sector*SECTOR_SIZE overflow"))?;
574        for desc in payload {
575            if desc.is_write_only() {
576                continue;
577            }
578            let len = desc.len as usize;
579            let mut buf = vec![0u8; len];
580            mem.read(desc.addr, &mut buf)
581                .map_err(|e| std::io::Error::other(e.to_string()))?;
582            backend.write_at(byte_off, &buf)?;
583            byte_off = byte_off
584                .checked_add(u64::from(desc.len))
585                .ok_or_else(|| std::io::Error::other("descriptor offset overflow"))?;
586        }
587        Ok(())
588    }
589
590    fn do_get_id(
591        drive_id: &str,
592        mem: &dyn GuestMemory,
593        payload: &[crate::queue::Descriptor],
594    ) -> std::io::Result<u32> {
595        if payload.is_empty() {
596            return Ok(0);
597        }
598        // Virtio spec: `VIRTIO_BLK_T_GET_ID` returns 20 bytes of ID, padded
599        // with zeros if shorter.
600        let mut id = [0u8; 20];
601        let bytes = drive_id.as_bytes();
602        let n = bytes.len().min(20);
603        id[..n].copy_from_slice(&bytes[..n]);
604        let desc = payload[0];
605        if !desc.is_write_only() {
606            return Ok(0);
607        }
608        let len = (desc.len as usize).min(20);
609        mem.write(desc.addr, &id[..len])
610            .map_err(|e| std::io::Error::other(e.to_string()))?;
611        Ok(len as u32)
612    }
613
614    fn drain_requests(&mut self) {
615        let (mem, irq) = {
616            let state = self.state.lock();
617            match (state.mem.clone(), state.irq.clone()) {
618                (Some(m), Some(i)) => (m, i),
619                _ => return,
620            }
621        };
622        let backend = Arc::clone(&self.backend);
623        let drive_id = self.config.drive_id.clone();
624        let queue = &mut self.queues[REQ_QUEUE];
625        let mut completed = false;
626        loop {
627            let chain = match queue.pop_avail(mem.as_ref()) {
628                Ok(Some(c)) => c,
629                Ok(None) => break,
630                Err(err) => {
631                    tracing::warn!(error = %err, "block: walk failed");
632                    break;
633                }
634            };
635            let head = chain.head_index();
636            let descs = match chain.collect(mem.as_ref()) {
637                Ok(d) => d,
638                Err(err) => {
639                    tracing::warn!(error = %err, "block: chain collect failed");
640                    break;
641                }
642            };
643            let (_status, written) =
644                Self::handle_request_inner(backend.as_ref(), &drive_id, mem.as_ref(), &descs);
645            if let Err(err) = queue.push_used(mem.as_ref(), head, written) {
646                tracing::warn!(error = %err, "block: push_used failed");
647                break;
648            }
649            completed = true;
650        }
651        if completed {
652            let _ = irq.trigger_queue();
653        }
654    }
655}
656
657impl VirtioDevice for BlockDevice {
658    fn device_type(&self) -> VirtioDeviceType {
659        VirtioDeviceType::Block
660    }
661    fn avail_features(&self) -> u64 {
662        self.avail
663    }
664    fn acked_features(&self) -> u64 {
665        self.acked
666    }
667    fn set_acked_features(&mut self, value: u64) {
668        self.acked = value;
669    }
670    fn queue_max_sizes(&self) -> &[u16] {
671        const SIZES: &[u16] = &[QUEUE_MAX_SIZE];
672        SIZES
673    }
674    fn queues(&self) -> &[Queue] {
675        &self.queues
676    }
677    fn queues_mut(&mut self) -> &mut [Queue] {
678        &mut self.queues
679    }
680    fn read_config(&self, offset: u64, data: &mut [u8]) {
681        // Config layout (virtio v1.2 § 5.2.4):
682        //   0x00 u64 capacity (sectors)
683        //   0x08 u32 size_max          (max single-segment size; 0 = no limit)
684        //   0x0C u32 seg_max           (max segments per request)
685        //   0x14 u32 blk_size          (only valid if F_BLK_SIZE)
686        // Per virtio v1.2 the driver assumes `seg_max = 1` if the field
687        // reads zero — that limits each request to a single descriptor and
688        // breaks chained reads/writes. Publish `QUEUE_MAX_SIZE - 2` so the
689        // driver can chain header + payload + status (matches upstream
690        // Firecracker's `vendors/firecracker/.../block/.../device.rs`).
691        let mut full = [0u8; 64];
692        full[0..8].copy_from_slice(&self.capacity_sectors().to_le_bytes());
693        // size_max stays 0 (no limit).
694        let seg_max = u32::from(QUEUE_MAX_SIZE - 2);
695        full[12..16].copy_from_slice(&seg_max.to_le_bytes());
696        full[20..24].copy_from_slice(&(SECTOR_SIZE as u32).to_le_bytes());
697        let off = offset as usize;
698        for (i, b) in data.iter_mut().enumerate() {
699            *b = full.get(off + i).copied().unwrap_or(0);
700        }
701    }
702    fn write_config(&mut self, _offset: u64, _data: &[u8]) {}
703    fn activate(&mut self, mem: Arc<dyn GuestMemory>, irq: IrqLine) -> Result<(), ActivateError> {
704        let mut state = self.state.lock();
705        state.mem = Some(mem);
706        state.irq = Some(irq);
707        state.activated = true;
708        Ok(())
709    }
710    fn is_activated(&self) -> bool {
711        self.state.lock().activated
712    }
713    fn process_queue(&mut self, queue_index: u16) {
714        if queue_index as usize == REQ_QUEUE {
715            self.drain_requests();
716        }
717    }
718}
719
720#[cfg(test)]
721mod tests {
722    use squib_arch::IntId;
723    use squib_core::SliceGuestMemory;
724    use squib_gic::Gic;
725
726    use super::*;
727    use crate::queue::{VIRTQ_DESC_F_NEXT, VIRTQ_DESC_F_WRITE};
728
729    #[test]
730    #[allow(clippy::disallowed_methods)]
731    fn test_should_round_trip_async_file_backend_via_pread_pwrite() {
732        // Use a tempfile so the test doesn't depend on /tmp pinned state.
733        let dir = tempfile::TempDir::new().unwrap();
734        let path = dir.path().join("blk.img");
735        std::fs::write(&path, [0u8; 4096]).unwrap();
736        let backend = AsyncFileBackend::open(&path, false).unwrap();
737        assert_eq!(backend.size_bytes(), 4096);
738        backend.write_at(512, b"hello-async").unwrap();
739        let mut buf = [0u8; 11];
740        backend.read_at(512, &mut buf).unwrap();
741        assert_eq!(&buf, b"hello-async");
742    }
743
744    #[test]
745    #[allow(clippy::disallowed_methods)]
746    fn test_should_reject_writes_through_async_file_backend_when_read_only() {
747        let dir = tempfile::TempDir::new().unwrap();
748        let path = dir.path().join("ro.img");
749        std::fs::write(&path, [0u8; 16]).unwrap();
750        let backend = AsyncFileBackend::open(&path, true).unwrap();
751        let err = backend.write_at(0, b"x").unwrap_err();
752        assert_eq!(err.kind(), std::io::ErrorKind::PermissionDenied);
753    }
754
755    #[test]
756    fn test_should_pass_through_when_rate_limit_is_unlimited() {
757        let inner: Arc<dyn BlockBackend> = Arc::new(MemoryBackend::new(1024));
758        let limited = RateLimitedBackend::new(inner, RateLimit::unlimited());
759        // 256-byte write — well above any meaningful per-call cap, but
760        // unlimited means it's instant.
761        let start = std::time::Instant::now();
762        limited.write_at(0, &[0xAA; 256]).unwrap();
763        assert!(start.elapsed() < std::time::Duration::from_millis(50));
764    }
765
766    #[test]
767    fn test_should_block_until_tokens_replenish() {
768        // 1 KiB/s with a 100-byte burst → 256-byte write must wait
769        // (256 - 100) / 1024 ≈ 152 ms for the additional 156 bytes to
770        // refill. Test asserts the call waited at least 100 ms.
771        let inner: Arc<dyn BlockBackend> = Arc::new(MemoryBackend::new(2048));
772        let limited = RateLimitedBackend::new(
773            inner,
774            RateLimit {
775                bytes_per_sec: 1024,
776                burst_bytes: 100,
777            },
778        );
779        let start = std::time::Instant::now();
780        // Drain the burst first.
781        limited.write_at(0, &[0; 100]).unwrap();
782        // Second write needs to wait for refill.
783        limited.write_at(0, &[0; 256]).unwrap();
784        let elapsed = start.elapsed();
785        assert!(
786            elapsed >= std::time::Duration::from_millis(100),
787            "limiter must throttle; elapsed = {elapsed:?}"
788        );
789    }
790
791    #[test]
792    fn test_should_recharge_steady_helper_to_one_tenth_of_rate() {
793        let cfg = RateLimit::steady(1_000_000);
794        assert_eq!(cfg.bytes_per_sec, 1_000_000);
795        assert_eq!(cfg.burst_bytes, 100_000);
796    }
797
798    /// In-memory backend for tests — 1 MiB of zero-initialized space.
799    #[derive(Debug)]
800    struct MemoryBackend {
801        bytes: Mutex<Vec<u8>>,
802    }
803    impl MemoryBackend {
804        fn new(size: usize) -> Self {
805            Self {
806                bytes: Mutex::new(vec![0u8; size]),
807            }
808        }
809    }
810    impl BlockBackend for MemoryBackend {
811        fn size_bytes(&self) -> u64 {
812            self.bytes.lock().len() as u64
813        }
814        fn read_at(&self, byte_offset: u64, buf: &mut [u8]) -> std::io::Result<()> {
815            let bytes = self.bytes.lock();
816            let off = byte_offset as usize;
817            let end = off + buf.len();
818            if end > bytes.len() {
819                return Err(std::io::Error::new(
820                    std::io::ErrorKind::UnexpectedEof,
821                    "out of range",
822                ));
823            }
824            buf.copy_from_slice(&bytes[off..end]);
825            Ok(())
826        }
827        fn write_at(&self, byte_offset: u64, buf: &[u8]) -> std::io::Result<()> {
828            let mut bytes = self.bytes.lock();
829            let off = byte_offset as usize;
830            let end = off + buf.len();
831            if end > bytes.len() {
832                return Err(std::io::Error::new(
833                    std::io::ErrorKind::UnexpectedEof,
834                    "out of range",
835                ));
836            }
837            bytes[off..end].copy_from_slice(buf);
838            Ok(())
839        }
840        fn flush(&self) -> std::io::Result<()> {
841            Ok(())
842        }
843        fn read_only(&self) -> bool {
844            false
845        }
846    }
847
848    #[derive(Debug, Default)]
849    struct StubGic;
850    impl Gic for StubGic {
851        fn pulse_spi(&self, _: IntId) -> Result<(), squib_gic::GicError> {
852            Ok(())
853        }
854        fn set_spi_level(&self, _: IntId, _: bool) -> Result<(), squib_gic::GicError> {
855            Ok(())
856        }
857        fn save_state(&self) -> Result<Vec<u8>, squib_gic::GicError> {
858            Ok(Vec::new())
859        }
860        fn restore_state(&self, _data: &[u8]) -> Result<(), squib_gic::GicError> {
861            Ok(())
862        }
863    }
864
865    fn line() -> IrqLine {
866        let gic: Arc<dyn Gic + Send + Sync> = Arc::new(StubGic);
867        IrqLine::new(gic, IntId::from_spi_cell(16).unwrap())
868    }
869
870    fn config(read_only: bool) -> BlockConfig {
871        BlockConfig {
872            drive_id: "rootfs".into(),
873            path_on_host: "/dev/null".into(),
874            is_root_device: true,
875            is_read_only: read_only,
876            cache_type: CacheType::Writeback,
877            partuuid: None,
878        }
879    }
880
881    #[test]
882    fn test_should_offer_ro_when_config_marks_read_only() {
883        let backend = Arc::new(MemoryBackend::new(SECTOR_SIZE as usize));
884        let dev = BlockDevice::new(config(true), backend);
885        assert_ne!(dev.avail_features() & F_RO, 0);
886    }
887
888    #[test]
889    fn test_should_offer_flush_for_writeback_cache() {
890        let backend = Arc::new(MemoryBackend::new(SECTOR_SIZE as usize));
891        let dev = BlockDevice::new(config(false), backend);
892        assert_ne!(dev.avail_features() & F_FLUSH, 0);
893    }
894
895    #[test]
896    fn test_should_publish_capacity_sectors_in_config() {
897        let backend = Arc::new(MemoryBackend::new((SECTOR_SIZE * 1024) as usize));
898        let dev = BlockDevice::new(config(false), backend);
899        let mut cfg = [0u8; 64];
900        dev.read_config(0, &mut cfg);
901        let cap = u64::from_le_bytes(cfg[0..8].try_into().unwrap());
902        assert_eq!(cap, 1024);
903    }
904
905    #[test]
906    fn test_should_complete_get_id_with_drive_id_padded_to_20_bytes() {
907        let backend = Arc::new(MemoryBackend::new((SECTOR_SIZE * 32) as usize));
908        let mut dev = BlockDevice::new(config(false), backend);
909        let mem = Arc::new(SliceGuestMemory::new(GuestAddress(0x4000_0000), 0x4000));
910        let q = &mut dev.queues_mut()[REQ_QUEUE];
911        q.size = 8;
912        q.desc_table_addr = GuestAddress(0x4000_0000);
913        q.avail_ring_addr = GuestAddress(0x4000_0800);
914        q.used_ring_addr = GuestAddress(0x4000_1000);
915        q.ready = true;
916        // Header at 0x4000_2000: type=GET_ID, sector=0.
917        mem.write_u32_le(GuestAddress(0x4000_2000), REQ_TYPE_GET_ID)
918            .unwrap();
919        // header descriptor (16 bytes), payload descriptor (20 bytes write-only),
920        // status descriptor (1 byte write-only).
921        let base = 0x4000_0000u64;
922        // Header
923        mem.write_u32_le(GuestAddress(base), 0x4000_2000).unwrap();
924        mem.write_u32_le(GuestAddress(base + 4), 0).unwrap();
925        mem.write_u32_le(GuestAddress(base + 8), 16).unwrap();
926        mem.write_u16_le(GuestAddress(base + 12), VIRTQ_DESC_F_NEXT)
927            .unwrap();
928        mem.write_u16_le(GuestAddress(base + 14), 1).unwrap();
929        // Payload (20 bytes)
930        let p = base + 16;
931        mem.write_u32_le(GuestAddress(p), 0x4000_2100).unwrap();
932        mem.write_u32_le(GuestAddress(p + 4), 0).unwrap();
933        mem.write_u32_le(GuestAddress(p + 8), 20).unwrap();
934        mem.write_u16_le(GuestAddress(p + 12), VIRTQ_DESC_F_NEXT | VIRTQ_DESC_F_WRITE)
935            .unwrap();
936        mem.write_u16_le(GuestAddress(p + 14), 2).unwrap();
937        // Status (1 byte)
938        let s = base + 32;
939        mem.write_u32_le(GuestAddress(s), 0x4000_2200).unwrap();
940        mem.write_u32_le(GuestAddress(s + 4), 0).unwrap();
941        mem.write_u32_le(GuestAddress(s + 8), 1).unwrap();
942        mem.write_u16_le(GuestAddress(s + 12), VIRTQ_DESC_F_WRITE)
943            .unwrap();
944        mem.write_u16_le(GuestAddress(s + 14), 0).unwrap();
945        mem.write_u16_le(GuestAddress(0x4000_0804), 0).unwrap();
946        mem.write_u16_le(GuestAddress(0x4000_0802), 1).unwrap();
947        dev.activate(mem.clone(), line()).unwrap();
948        dev.process_queue(REQ_QUEUE as u16);
949        let mut id = [0u8; 20];
950        mem.read(GuestAddress(0x4000_2100), &mut id).unwrap();
951        assert_eq!(&id[..6], b"rootfs");
952        let status = {
953            let mut b = [0u8; 1];
954            mem.read(GuestAddress(0x4000_2200), &mut b).unwrap();
955            b[0]
956        };
957        assert_eq!(status, STATUS_OK);
958    }
959
960    #[test]
961    fn test_should_complete_in_request_with_payload_data_from_backend() {
962        let backend = Arc::new(MemoryBackend::new((SECTOR_SIZE * 32) as usize));
963        // Pre-populate sector 1 with "hello\0\0\0...".
964        backend.write_at(SECTOR_SIZE, b"hello").unwrap();
965        let mut dev = BlockDevice::new(config(false), backend);
966        let mem = Arc::new(SliceGuestMemory::new(GuestAddress(0x4000_0000), 0x1_0000));
967        let q = &mut dev.queues_mut()[REQ_QUEUE];
968        q.size = 8;
969        q.desc_table_addr = GuestAddress(0x4000_0000);
970        q.avail_ring_addr = GuestAddress(0x4000_0800);
971        q.used_ring_addr = GuestAddress(0x4000_1000);
972        q.ready = true;
973        // Header: REQ_TYPE_IN, sector=1.
974        mem.write_u32_le(GuestAddress(0x4000_2000), REQ_TYPE_IN)
975            .unwrap();
976        mem.write_u64_le(GuestAddress(0x4000_2008), 1).unwrap();
977        let base = 0x4000_0000u64;
978        // Header descriptor.
979        mem.write_u32_le(GuestAddress(base), 0x4000_2000).unwrap();
980        mem.write_u32_le(GuestAddress(base + 4), 0).unwrap();
981        mem.write_u32_le(GuestAddress(base + 8), 16).unwrap();
982        mem.write_u16_le(GuestAddress(base + 12), VIRTQ_DESC_F_NEXT)
983            .unwrap();
984        mem.write_u16_le(GuestAddress(base + 14), 1).unwrap();
985        // Payload: 512-byte write-only.
986        let p = base + 16;
987        mem.write_u32_le(GuestAddress(p), 0x4000_3000).unwrap();
988        mem.write_u32_le(GuestAddress(p + 4), 0).unwrap();
989        mem.write_u32_le(GuestAddress(p + 8), 512).unwrap();
990        mem.write_u16_le(GuestAddress(p + 12), VIRTQ_DESC_F_NEXT | VIRTQ_DESC_F_WRITE)
991            .unwrap();
992        mem.write_u16_le(GuestAddress(p + 14), 2).unwrap();
993        // Status: 1-byte write-only.
994        let s = base + 32;
995        mem.write_u32_le(GuestAddress(s), 0x4000_3200).unwrap();
996        mem.write_u32_le(GuestAddress(s + 4), 0).unwrap();
997        mem.write_u32_le(GuestAddress(s + 8), 1).unwrap();
998        mem.write_u16_le(GuestAddress(s + 12), VIRTQ_DESC_F_WRITE)
999            .unwrap();
1000        mem.write_u16_le(GuestAddress(s + 14), 0).unwrap();
1001        mem.write_u16_le(GuestAddress(0x4000_0804), 0).unwrap();
1002        mem.write_u16_le(GuestAddress(0x4000_0802), 1).unwrap();
1003        dev.activate(mem.clone(), line()).unwrap();
1004        dev.process_queue(REQ_QUEUE as u16);
1005        let mut got = [0u8; 5];
1006        mem.read(GuestAddress(0x4000_3000), &mut got).unwrap();
1007        assert_eq!(&got, b"hello");
1008        let mut st = [0u8; 1];
1009        mem.read(GuestAddress(0x4000_3200), &mut st).unwrap();
1010        assert_eq!(st[0], STATUS_OK);
1011    }
1012
1013    #[test]
1014    fn test_should_return_unsupp_for_unknown_request_type() {
1015        let backend = Arc::new(MemoryBackend::new(SECTOR_SIZE as usize));
1016        let dev = BlockDevice::new(config(false), backend);
1017        let mem = SliceGuestMemory::new(GuestAddress(0), 0x1_0000);
1018        // Build a synthetic descriptor list manually for the unit-level test.
1019        let header_addr = GuestAddress(0x100);
1020        mem.write_u32_le(header_addr, 99).unwrap();
1021        let descs = vec![
1022            crate::queue::Descriptor {
1023                addr: header_addr,
1024                len: 16,
1025                flags: VIRTQ_DESC_F_NEXT,
1026                next: 1,
1027            },
1028            crate::queue::Descriptor {
1029                addr: GuestAddress(0x200),
1030                len: 1,
1031                flags: VIRTQ_DESC_F_WRITE,
1032                next: 0,
1033            },
1034        ];
1035        let (status, _) = BlockDevice::handle_request_inner(
1036            dev.backend.as_ref(),
1037            &dev.config.drive_id,
1038            &mem,
1039            &descs,
1040        );
1041        assert_eq!(status, STATUS_UNSUPP);
1042    }
1043}