Skip to main content

squib_virtio/
queue.rs

1//! Split-ring virtqueue handling.
2//!
3//! Implements the classic split-ring layout from
4//! [virtio v1.2 § 2.7][spec]. The packed-ring (`VIRTIO_F_RING_PACKED`,
5//! [`crate::feature_bits::RING_PACKED`]) is intentionally not implemented for
6//! 1.0 — split-ring is what every Firecracker-compat guest driver uses, and
7//! the simpler queue handler keeps the device modules small.
8//!
9//! ## Layout (per spec § 2.7)
10//!
11//! ```text
12//! Descriptor table:    desc_table_addr,  16 * queue_size bytes
13//! Available ring:      avail_ring_addr,  6 + 2 * queue_size bytes
14//! Used ring:           used_ring_addr,   6 + 8 * queue_size bytes
15//! ```
16//!
17//! Each descriptor:
18//!
19//! ```text
20//! 0x00 u64  addr     guest-physical address of the buffer
21//! 0x08 u32  len      buffer length in bytes
22//! 0x0C u16  flags    (NEXT | WRITE | INDIRECT)
23//! 0x0E u16  next     descriptor table index of the next descriptor
24//! ```
25//!
26//! Available ring:
27//!
28//! ```text
29//! 0x00 u16  flags
30//! 0x02 u16  idx        producer (driver) cursor
31//! 0x04 [u16; queue_size] ring   indices of descriptor heads
32//! ```
33//!
34//! Used ring:
35//!
36//! ```text
37//! 0x00 u16  flags
38//! 0x02 u16  idx                        producer (device) cursor
39//! 0x04 [(u32, u32); queue_size] ring   (head_index, total_bytes_written)
40//! ```
41//!
42//! ## Concurrency
43//!
44//! The device handler thread "owns" the queue's `last_avail_idx` cursor; the
45//! guest driver is the sole writer of `avail.idx`. The transport's queue
46//! state lives behind the device's mutex, so race-free across threads.
47//!
48//! [spec]: https://docs.oasis-open.org/virtio/virtio/v1.2/csd01/virtio-v1.2-csd01.html#x1-340007
49
50use squib_core::{GuestAddress, GuestMemory};
51use thiserror::Error;
52
53/// Maximum allowed `queue_size`. Per spec § 2.7 it must be a power of two
54/// ≤ 32768. Squib's per-device modules cap below this for memory pressure
55/// reasons; the spec ceiling is exposed so a future device that wants to push
56/// it can.
57pub const MAX_QUEUE_SIZE: u16 = 32768;
58
59/// Type-safe wrapper around a queue cursor index. Wraps modulo `queue_size`
60/// at the call site.
61#[derive(Debug, Clone, Copy, Default, PartialEq, Eq)]
62pub struct QueueIndex(pub u16);
63
64/// Descriptor flag: another descriptor follows in the chain (`next` field is
65/// valid).
66pub const VIRTQ_DESC_F_NEXT: u16 = 1;
67/// Descriptor flag: descriptor is device-write (the device fills it).
68pub const VIRTQ_DESC_F_WRITE: u16 = 2;
69/// Descriptor flag: this descriptor's buffer holds an indirect descriptor
70/// table.
71pub const VIRTQ_DESC_F_INDIRECT: u16 = 4;
72
73/// Per-queue setup state and runtime cursor.
74///
75/// The transport mutates `size`, `*_addr`, and `ready` in response to guest
76/// MMIO writes during driver init. The device handler mutates
77/// `next_avail_idx` as it consumes descriptor heads.
78#[derive(Debug, Clone)]
79pub struct Queue {
80    /// Maximum descriptor count negotiated at boot (the device exposes this
81    /// as `QueueNumMax`).
82    pub max_size: u16,
83    /// Active descriptor count. Bounded by `max_size`. Set by the driver
84    /// writing `QueueNum`.
85    pub size: u16,
86    /// `true` once the driver writes `QueueReady = 1`.
87    pub ready: bool,
88    /// Guest-physical address of the descriptor table.
89    pub desc_table_addr: GuestAddress,
90    /// Guest-physical address of the available ring.
91    pub avail_ring_addr: GuestAddress,
92    /// Guest-physical address of the used ring.
93    pub used_ring_addr: GuestAddress,
94    /// Device-side cursor into the available ring.
95    pub next_avail_idx: u16,
96    /// Device-side cursor into the used ring (the next slot to write to).
97    pub next_used_idx: u16,
98}
99
100impl Queue {
101    /// Build a fresh queue with the given maximum descriptor count.
102    #[must_use]
103    pub fn new(max_size: u16) -> Self {
104        Self {
105            max_size,
106            size: max_size,
107            ready: false,
108            desc_table_addr: GuestAddress(0),
109            avail_ring_addr: GuestAddress(0),
110            used_ring_addr: GuestAddress(0),
111            next_avail_idx: 0,
112            next_used_idx: 0,
113        }
114    }
115
116    /// Set the negotiated descriptor count, clamping to `max_size` and to a
117    /// power of two as the spec requires.
118    pub fn set_size(&mut self, requested: u16) {
119        // Spec § 2.7.4: the driver writes a power-of-two value ≤
120        // QueueNumMax. We silently clamp invalid values rather than refusing
121        // — Firecracker behaviour, observed by mainline virtio drivers.
122        let clamped = requested.min(self.max_size);
123        self.size = if clamped == 0 {
124            1
125        } else if clamped.is_power_of_two() {
126            clamped
127        } else {
128            // Round down to the previous power of two; never zero. Use
129            // `checked_next_power_of_two` to defend against a future bump of
130            // `MAX_QUEUE_SIZE` past 32768 — overflow returns `None` here, in
131            // which case we fall back to the largest representable power of
132            // two `≤ u16::MAX` (`1 << 15`).
133            clamped
134                .checked_next_power_of_two()
135                .map_or(1u16 << 15, |p| (p >> 1).max(1))
136        };
137    }
138
139    /// `true` when every required address is set and `ready = 1`.
140    #[must_use]
141    pub fn is_valid(&self) -> bool {
142        self.ready
143            && self.size > 0
144            && self.size <= self.max_size
145            && self.desc_table_addr.raw() != 0
146            && self.avail_ring_addr.raw() != 0
147            && self.used_ring_addr.raw() != 0
148    }
149
150    /// Reset the queue cursors and clear the ready flag. Called when the
151    /// driver writes `Status = INIT`.
152    pub fn reset(&mut self) {
153        self.size = self.max_size;
154        self.ready = false;
155        self.desc_table_addr = GuestAddress(0);
156        self.avail_ring_addr = GuestAddress(0);
157        self.used_ring_addr = GuestAddress(0);
158        self.next_avail_idx = 0;
159        self.next_used_idx = 0;
160    }
161
162    /// Read `avail.idx` from guest memory.
163    fn avail_idx<M: GuestMemory + ?Sized>(&self, mem: &M) -> Result<u16, QueueError> {
164        let addr = GuestAddress(self.avail_ring_addr.raw() + 2);
165        Ok(mem.read_u16_le(addr)?)
166    }
167
168    /// Read the descriptor head index at `avail.ring[i % size]`.
169    fn avail_ring_entry<M: GuestMemory + ?Sized>(
170        &self,
171        mem: &M,
172        i: u16,
173    ) -> Result<u16, QueueError> {
174        let off = u64::from(i % self.size) * 2;
175        let addr = GuestAddress(self.avail_ring_addr.raw() + 4 + off);
176        Ok(mem.read_u16_le(addr)?)
177    }
178
179    /// Pop the next descriptor head from the available ring.
180    ///
181    /// Returns `Ok(None)` if the driver has not made any new descriptors
182    /// available since the last call.
183    ///
184    /// # Errors
185    /// [`QueueError`] for any underlying memory access failure.
186    pub fn pop_avail<M: GuestMemory + ?Sized>(
187        &mut self,
188        mem: &M,
189    ) -> Result<Option<DescriptorChain>, QueueError> {
190        if !self.is_valid() {
191            return Err(QueueError::QueueNotReady);
192        }
193        let driver_idx = self.avail_idx(mem)?;
194        if driver_idx == self.next_avail_idx {
195            return Ok(None);
196        }
197        let head_idx = self.avail_ring_entry(mem, self.next_avail_idx)?;
198        if head_idx >= self.size {
199            return Err(QueueError::DescriptorOutOfRange {
200                head: head_idx,
201                size: self.size,
202            });
203        }
204        self.next_avail_idx = self.next_avail_idx.wrapping_add(1);
205        Ok(Some(DescriptorChain {
206            desc_table_addr: self.desc_table_addr,
207            queue_size: self.size,
208            head_index: head_idx,
209            current: head_idx,
210            length: 0,
211            done: false,
212        }))
213    }
214
215    /// Push a completed descriptor chain into the used ring.
216    ///
217    /// `head_index` is the descriptor index returned by `pop_avail`;
218    /// `bytes_written` is the total bytes the device wrote into device-write
219    /// descriptors of the chain (zero for purely device-read chains).
220    ///
221    /// # Errors
222    /// [`QueueError`] for any underlying memory access failure.
223    pub fn push_used<M: GuestMemory + ?Sized>(
224        &mut self,
225        mem: &M,
226        head_index: u16,
227        bytes_written: u32,
228    ) -> Result<(), QueueError> {
229        if !self.is_valid() {
230            return Err(QueueError::QueueNotReady);
231        }
232        let slot = self.next_used_idx % self.size;
233        let elem_addr = GuestAddress(self.used_ring_addr.raw() + 4 + u64::from(slot) * 8);
234        mem.write_u32_le(elem_addr, u32::from(head_index))?;
235        mem.write_u32_le(GuestAddress(elem_addr.raw() + 4), bytes_written)?;
236        self.next_used_idx = self.next_used_idx.wrapping_add(1);
237        // Publish the new device cursor.
238        let used_idx_addr = GuestAddress(self.used_ring_addr.raw() + 2);
239        mem.write_u16_le(used_idx_addr, self.next_used_idx)?;
240        Ok(())
241    }
242}
243
244/// Iterator over the descriptors in a single chain (head + linked descriptors
245/// via `NEXT`).
246#[derive(Debug)]
247pub struct DescriptorChain {
248    desc_table_addr: GuestAddress,
249    queue_size: u16,
250    /// Index of the head descriptor; needed when pushing the chain into the
251    /// used ring.
252    head_index: u16,
253    /// Index of the descriptor to be returned next (or the head if no
254    /// descriptors have been read yet).
255    current: u16,
256    /// Number of descriptors yielded so far. Bounded by `queue_size` to
257    /// detect cycles.
258    length: u16,
259    /// Set once a descriptor without `NEXT` has been yielded.
260    done: bool,
261}
262
263/// One descriptor entry from a chain: a single guest buffer with its flags.
264#[derive(Debug, Clone, Copy)]
265pub struct Descriptor {
266    /// Guest-physical address of the buffer.
267    pub addr: GuestAddress,
268    /// Buffer length in bytes.
269    pub len: u32,
270    /// `VIRTQ_DESC_F_*` flags.
271    pub flags: u16,
272    /// Next descriptor index (only valid if `flags & NEXT`).
273    pub next: u16,
274}
275
276impl Descriptor {
277    /// `true` if the device should treat this buffer as device-write (it
278    /// fills the buffer) rather than device-read (it consumes the buffer).
279    #[must_use]
280    pub fn is_write_only(&self) -> bool {
281        self.flags & VIRTQ_DESC_F_WRITE != 0
282    }
283
284    /// `true` if a successor descriptor exists.
285    #[must_use]
286    pub fn has_next(&self) -> bool {
287        self.flags & VIRTQ_DESC_F_NEXT != 0
288    }
289}
290
291impl DescriptorChain {
292    /// Index of the head descriptor — pass to `Queue::push_used` when
293    /// completing the chain.
294    #[must_use]
295    pub fn head_index(&self) -> u16 {
296        self.head_index
297    }
298
299    /// Read the next descriptor in the chain, if any.
300    ///
301    /// Returns `Ok(None)` when the chain is fully traversed.
302    ///
303    /// # Errors
304    /// - [`QueueError::DescriptorOutOfRange`] if `next` exceeds `queue_size`.
305    /// - [`QueueError::ChainTooLong`] if the chain length exceeds `queue_size` (cycle protection).
306    /// - [`QueueError::Memory`] for any descriptor table read failure.
307    pub fn next_descriptor<M: GuestMemory + ?Sized>(
308        &mut self,
309        mem: &M,
310    ) -> Result<Option<Descriptor>, QueueError> {
311        if self.done {
312            return Ok(None);
313        }
314        if self.current >= self.queue_size {
315            return Err(QueueError::DescriptorOutOfRange {
316                head: self.current,
317                size: self.queue_size,
318            });
319        }
320        if self.length >= self.queue_size {
321            return Err(QueueError::ChainTooLong {
322                size: self.queue_size,
323            });
324        }
325        let off = u64::from(self.current) * 16;
326        let base = GuestAddress(self.desc_table_addr.raw() + off);
327        let addr = mem.read_u64_le(base)?;
328        let len = mem.read_u32_le(GuestAddress(base.raw() + 8))?;
329        let flags = mem.read_u16_le(GuestAddress(base.raw() + 12))?;
330        let next = mem.read_u16_le(GuestAddress(base.raw() + 14))?;
331        let desc = Descriptor {
332            addr: GuestAddress(addr),
333            len,
334            flags,
335            next,
336        };
337        self.length = self.length.wrapping_add(1);
338        if desc.has_next() {
339            self.current = next;
340        } else {
341            self.done = true;
342        }
343        Ok(Some(desc))
344    }
345
346    /// Drain the chain into a `Vec<Descriptor>`. Convenience for short
347    /// chains where the caller wants to pre-collect.
348    ///
349    /// # Errors
350    /// Same as [`Self::next_descriptor`].
351    pub fn collect<M: GuestMemory + ?Sized>(
352        mut self,
353        mem: &M,
354    ) -> Result<Vec<Descriptor>, QueueError> {
355        let mut out = Vec::with_capacity(usize::from(self.queue_size).min(8));
356        while let Some(d) = self.next_descriptor(mem)? {
357            out.push(d);
358        }
359        Ok(out)
360    }
361}
362
363/// Queue / descriptor / ring errors.
364#[derive(Debug, Error)]
365#[non_exhaustive]
366pub enum QueueError {
367    /// Driver attempted to use the queue before posting `QueueReady = 1`.
368    #[error("queue not ready")]
369    QueueNotReady,
370    /// A descriptor index exceeded the negotiated `queue_size`.
371    #[error("descriptor index {head} >= queue_size {size}")]
372    DescriptorOutOfRange {
373        /// The offending index.
374        head: u16,
375        /// Negotiated `queue_size`.
376        size: u16,
377    },
378    /// A descriptor chain length exceeded `queue_size` (cycle protection).
379    #[error("descriptor chain length exceeds queue_size {size} (cycle?)")]
380    ChainTooLong {
381        /// Negotiated `queue_size`.
382        size: u16,
383    },
384    /// Guest memory access failed.
385    #[error("queue memory error: {0}")]
386    Memory(#[from] squib_core::Error),
387}
388
389#[cfg(test)]
390mod tests {
391    use squib_core::SliceGuestMemory;
392
393    use super::*;
394
395    fn mem() -> SliceGuestMemory {
396        SliceGuestMemory::new(GuestAddress(0x4000_0000), 0x1_0000)
397    }
398
399    fn setup_queue(mem: &SliceGuestMemory) -> Queue {
400        let mut q = Queue::new(64);
401        q.size = 8;
402        q.desc_table_addr = GuestAddress(0x4000_0000);
403        q.avail_ring_addr = GuestAddress(0x4000_0800);
404        q.used_ring_addr = GuestAddress(0x4000_1000);
405        q.ready = true;
406        // Wipe rings to deterministic state.
407        let zeros = vec![0u8; 0x100];
408        mem.write(q.desc_table_addr, &zeros).unwrap();
409        mem.write(q.avail_ring_addr, &zeros).unwrap();
410        mem.write(q.used_ring_addr, &zeros).unwrap();
411        q
412    }
413
414    fn write_desc(
415        mem: &SliceGuestMemory,
416        q: &Queue,
417        idx: u16,
418        addr: u64,
419        len: u32,
420        flags: u16,
421        next: u16,
422    ) {
423        let base = q.desc_table_addr.raw() + u64::from(idx) * 16;
424        mem.write_u32_le(GuestAddress(base), addr as u32).unwrap();
425        mem.write_u32_le(GuestAddress(base + 4), (addr >> 32) as u32)
426            .unwrap();
427        mem.write_u32_le(GuestAddress(base + 8), len).unwrap();
428        mem.write_u16_le(GuestAddress(base + 12), flags).unwrap();
429        mem.write_u16_le(GuestAddress(base + 14), next).unwrap();
430    }
431
432    #[test]
433    fn test_should_clamp_queue_size_to_power_of_two() {
434        let mut q = Queue::new(256);
435        q.set_size(200);
436        assert_eq!(q.size, 128);
437        q.set_size(256);
438        assert_eq!(q.size, 256);
439        q.set_size(1024); // > max
440        assert_eq!(q.size, 256);
441    }
442
443    #[test]
444    fn test_should_reject_pop_before_ready() {
445        let m = mem();
446        let mut q = Queue::new(8);
447        let err = q.pop_avail(&m).unwrap_err();
448        assert!(matches!(err, QueueError::QueueNotReady));
449    }
450
451    #[test]
452    fn test_should_yield_no_descriptor_when_avail_idx_unchanged() {
453        let m = mem();
454        let mut q = setup_queue(&m);
455        assert!(q.pop_avail(&m).unwrap().is_none());
456    }
457
458    #[test]
459    fn test_should_walk_single_descriptor_chain() {
460        let m = mem();
461        let mut q = setup_queue(&m);
462        write_desc(&m, &q, 0, 0x1234, 16, 0, 0);
463        m.write_u16_le(GuestAddress(q.avail_ring_addr.raw() + 4), 0)
464            .unwrap();
465        m.write_u16_le(GuestAddress(q.avail_ring_addr.raw() + 2), 1)
466            .unwrap();
467        let chain = q.pop_avail(&m).unwrap().expect("chain available");
468        let descs = chain.collect(&m).unwrap();
469        assert_eq!(descs.len(), 1);
470        assert_eq!(descs[0].addr.raw(), 0x1234);
471        assert_eq!(descs[0].len, 16);
472    }
473
474    #[test]
475    fn test_should_walk_two_descriptor_chain() {
476        let m = mem();
477        let mut q = setup_queue(&m);
478        write_desc(&m, &q, 0, 0x1000, 16, VIRTQ_DESC_F_NEXT, 1);
479        write_desc(&m, &q, 1, 0x2000, 32, VIRTQ_DESC_F_WRITE, 0);
480        m.write_u16_le(GuestAddress(q.avail_ring_addr.raw() + 4), 0)
481            .unwrap();
482        m.write_u16_le(GuestAddress(q.avail_ring_addr.raw() + 2), 1)
483            .unwrap();
484        let chain = q.pop_avail(&m).unwrap().expect("chain available");
485        let descs = chain.collect(&m).unwrap();
486        assert_eq!(descs.len(), 2);
487        assert_eq!(descs[0].addr.raw(), 0x1000);
488        assert!(!descs[0].is_write_only());
489        assert!(descs[0].has_next());
490        assert_eq!(descs[1].addr.raw(), 0x2000);
491        assert!(descs[1].is_write_only());
492        assert!(!descs[1].has_next());
493    }
494
495    #[test]
496    fn test_should_reject_descriptor_index_out_of_range() {
497        let m = mem();
498        let mut q = setup_queue(&m);
499        // Head index 9 with queue_size 8 → out of range.
500        m.write_u16_le(GuestAddress(q.avail_ring_addr.raw() + 4), 9)
501            .unwrap();
502        m.write_u16_le(GuestAddress(q.avail_ring_addr.raw() + 2), 1)
503            .unwrap();
504        let err = q.pop_avail(&m).unwrap_err();
505        assert!(matches!(err, QueueError::DescriptorOutOfRange { .. }));
506    }
507
508    #[test]
509    fn test_should_break_descriptor_chain_cycle_with_chain_too_long() {
510        let m = mem();
511        let mut q = setup_queue(&m);
512        // 0 → 1 → 0 → 1 → ... cycle.
513        write_desc(&m, &q, 0, 0x1000, 16, VIRTQ_DESC_F_NEXT, 1);
514        write_desc(&m, &q, 1, 0x2000, 16, VIRTQ_DESC_F_NEXT, 0);
515        m.write_u16_le(GuestAddress(q.avail_ring_addr.raw() + 4), 0)
516            .unwrap();
517        m.write_u16_le(GuestAddress(q.avail_ring_addr.raw() + 2), 1)
518            .unwrap();
519        let chain = q.pop_avail(&m).unwrap().expect("chain available");
520        let err = chain.collect(&m).unwrap_err();
521        assert!(matches!(err, QueueError::ChainTooLong { .. }));
522    }
523
524    #[test]
525    fn test_should_publish_used_ring_with_head_index_and_byte_count() {
526        let m = mem();
527        let mut q = setup_queue(&m);
528        q.push_used(&m, 5, 128).unwrap();
529        // First used elem at used_ring + 4.
530        let elem_addr = GuestAddress(q.used_ring_addr.raw() + 4);
531        let head = m.read_u32_le(elem_addr).unwrap();
532        let len = m.read_u32_le(GuestAddress(elem_addr.raw() + 4)).unwrap();
533        assert_eq!(head, 5);
534        assert_eq!(len, 128);
535        // used.idx now == 1.
536        let used_idx = m
537            .read_u16_le(GuestAddress(q.used_ring_addr.raw() + 2))
538            .unwrap();
539        assert_eq!(used_idx, 1);
540    }
541
542    #[test]
543    fn test_should_reset_queue_back_to_max_size_and_unready() {
544        let mut q = Queue::new(8);
545        q.size = 4;
546        q.ready = true;
547        q.next_avail_idx = 3;
548        q.reset();
549        assert_eq!(q.size, 8);
550        assert!(!q.ready);
551        assert_eq!(q.next_avail_idx, 0);
552    }
553}