virtio_driver/devices/
virtio_blk.rs

1// SPDX-License-Identifier: (MIT OR Apache-2.0)
2
3use crate::virtqueue::{Virtqueue, VirtqueueIter, VirtqueueLayout};
4use crate::{ByteValued, Completion, Le16, Le32, Le64, VirtioFeatureFlags, VirtioTransport};
5use bitflags::bitflags;
6use libc::{c_void, iovec, EIO, ENOTSUP, EPROTO};
7use std::convert::TryFrom;
8use std::io::{Error, ErrorKind};
9use std::iter;
10use std::mem;
11
bitflags! {
    /// Device-specific feature bits for virtio-blk (`VIRTIO_BLK_F_*`,
    /// VIRTIO 1.1 specification, section 5.2.3).
    pub struct VirtioBlkFeatureFlags: u64 {
        /// Maximum size of any single segment is reported in the config space.
        const SIZE_MAX = 1 << 1;
        /// Maximum number of segments per request is reported in the config space.
        const SEG_MAX = 1 << 2;
        /// Disk-style geometry (cylinders/heads/sectors) is available.
        const GEOMETRY = 1 << 4;
        /// Device is read-only.
        const RO = 1 << 5;
        /// Block size of the disk is reported in the config space.
        const BLK_SIZE = 1 << 6;
        /// Cache flush command support.
        const FLUSH = 1 << 9;
        /// Topology information is available.
        const TOPOLOGY = 1 << 10;
        /// Writeback cache mode can be toggled via the config space.
        const CONFIG_WCE = 1 << 11;
        /// Device supports more than one request queue.
        const MQ = 1 << 12;
        /// DISCARD command support.
        const DISCARD = 1 << 13;
        /// WRITE ZEROES command support.
        const WRITE_ZEROES = 1 << 14;
        /// Device can report lifetime information.
        const LIFETIME = 1 << 15;
        /// SECURE ERASE command support.
        const SECURE_ERASE = 1 << 16;
    }
}
29
/// The Device Configuration Space for a virtio-blk device.
///
/// This is `struct virtio_blk_config` from the VIRTIO 1.1 specification (see 5.2.4).
#[derive(Clone, Copy, Default)]
#[repr(C, packed)]
pub struct VirtioBlkConfig {
    // Capacity of the device, expressed in 512-byte sectors (per spec 5.2.4).
    pub capacity: Le64,
    pub size_max: Le32,
    pub seg_max: Le32,
    // Geometry fields; only meaningful when VirtioBlkFeatureFlags::GEOMETRY
    // was offered.
    pub cylinders: Le16,
    pub heads: u8,
    pub sectors: u8,
    pub blk_size: Le32,
    // Topology fields; only meaningful when VirtioBlkFeatureFlags::TOPOLOGY
    // was offered.
    pub physical_block_exp: u8,
    pub alignment_offset: u8,
    pub min_io_size: Le16,
    pub opt_io_size: Le32,
    pub writeback: u8,
    _unused0: u8,
    // Number of request queues; read by virtio_blk_max_queues() when
    // VirtioBlkFeatureFlags::MQ is offered.
    pub num_queues: Le16,
    // Discard / write-zeroes limits (DISCARD / WRITE_ZEROES features).
    pub max_discard_sectors: Le32,
    pub max_discard_seg: Le32,
    pub discard_sector_alignment: Le32,
    pub max_write_zeroes_sectors: Le32,
    pub max_write_zeroes_seg: Le32,
    pub write_zeroes_may_unmap: u8,
    _unused1: [u8; 3],
}

// SAFETY: `VirtioBlkConfig` is `#[repr(C, packed)]` and consists solely of
// fixed-width integer fields, so it contains no padding and can be viewed as
// plain bytes (presumably what the `ByteValued` contract requires — it is
// declared outside this file).
unsafe impl ByteValued for VirtioBlkConfig {}
60
/// Converts a byte offset into a 512-byte sector number (LBA).
///
/// Virtio-blk requests always address the disk in 512-byte sectors,
/// independent of the block size the device reports.
///
/// Returns an `InvalidInput` error when `offset` is not sector-aligned.
fn to_lba(offset: u64) -> Result<u64, Error> {
    const SECTOR_SIZE: u64 = 512;
    const SECTOR_SHIFT: u32 = 9; // log2(SECTOR_SIZE)

    if offset % SECTOR_SIZE == 0 {
        Ok(offset >> SECTOR_SHIFT)
    } else {
        Err(Error::new(ErrorKind::InvalidInput, "Unaligned request"))
    }
}
71
72pub fn validate_lba(offset: u64) -> Result<(), Error> {
73    to_lba(offset).map(|_| ())
74}
75
76pub fn virtio_blk_max_queues(transport: &VirtioBlkTransport) -> Result<usize, Error> {
77    // Some transports (e.g. vhost-vdpa before Linux v5.18) may not be able
78    // to provide the number of queues, so let's look in the config space.
79    let features = VirtioBlkFeatureFlags::from_bits_truncate(transport.get_features());
80    if features.contains(VirtioBlkFeatureFlags::MQ) {
81        let cfg = transport.get_config()?;
82        Ok(u16::from(cfg.num_queues) as usize)
83    } else {
84        // If VirtioBlkFeatureFlags::MQ is not negotiated, the device supports
85        // only a single queue
86        Ok(1)
87    }
88}
89
/// Payload placed in the descriptor chain of discard and write-zeroes
/// requests, describing the affected sector range.
#[derive(Clone, Copy, Default)]
#[repr(C, packed)]
#[allow(dead_code)]
struct DiscardWriteZeroesData {
    // First 512-byte sector of the range.
    sector: Le64,
    // Length of the range, in 512-byte sectors.
    num_sectors: Le32,
    // DiscardWriteZeroesFlags bits (currently only UNMAP).
    flags: Le32,
}
98
bitflags! {
    /// Flags for the `flags` field of [`DiscardWriteZeroesData`].
    pub struct DiscardWriteZeroesFlags: u32 {
        /// For write-zeroes: try to deallocate the zeroed range.
        const UNMAP = 1 << 0;
    }
}
104
105impl DiscardWriteZeroesData {
106    fn new(offset: u64, len: u64, unmap: bool) -> Result<Self, Error> {
107        let start = to_lba(offset)?;
108        let num_sectors = u32::try_from(to_lba(len)?)
109            .map_err(|_e| Error::new(ErrorKind::InvalidInput, "Discard length too large"))?;
110        let flags = if unmap {
111            DiscardWriteZeroesFlags::UNMAP.bits()
112        } else {
113            0
114        };
115
116        Ok(DiscardWriteZeroesData {
117            sector: start.into(),
118            num_sectors: num_sectors.into(),
119            flags: flags.into(),
120        })
121    }
122}
123
/// The request header for virtio-blk devices.
///
/// This is the first part of `struct virtio_blk_req` from the VIRTIO 1.1 specification (see
/// 5.2.6).
#[derive(Clone, Copy, Default)]
#[repr(C, packed)]
#[allow(dead_code)]
struct VirtioBlkReqHeader {
    // One of the VirtioBlkReqType values.
    req_type: Le32,
    _reserved: Le32,
    // Starting 512-byte sector of the request.
    sector: Le64,
}
136
/// Request type codes (`VIRTIO_BLK_T_*` from the VIRTIO specification) used
/// in the `req_type` field of [`VirtioBlkReqHeader`].
#[derive(Clone, Copy, PartialEq, Eq)]
#[repr(u32)]
enum VirtioBlkReqType {
    Read = 0,
    Write = 1,
    Flush = 4,
    Discard = 11,
    WriteZeroes = 13,
}
146
147impl VirtioBlkReqType {
148    fn is_from_dev(&self) -> bool {
149        // The return value for Flush doesn't matter because it doesn't have any buffers
150        *self == Self::Read
151    }
152}
153
154impl VirtioBlkReqHeader {
155    fn new(req_type: VirtioBlkReqType, offset: u64) -> Self {
156        Self {
157            req_type: (req_type as u32).into(),
158            _reserved: 0.into(),
159            sector: offset.into(),
160        }
161    }
162}
163
164unsafe impl ByteValued for VirtioBlkReqHeader {}
165
/// Driver-owned per-request buffers that are placed in the descriptor chain
/// alongside the caller's data buffers (see `queue_request_full`).
#[derive(Clone, Copy)]
pub struct VirtioBlkReqBuf {
    // Device-readable request header (type + starting sector).
    header: VirtioBlkReqHeader,
    // Device-writable status byte; translated to an errno in CompletionIter.
    status: u8,
    // Payload for discard/write-zeroes requests; left at its default value
    // for all other request types.
    dwz_data: DiscardWriteZeroesData,
}

/// Transport type used by all virtio-blk queues in this module.
pub type VirtioBlkTransport = dyn VirtioTransport<VirtioBlkConfig, VirtioBlkReqBuf>;
174
/// A queue of a virtio-blk device.
///
/// This is used to send block I/O requests to the device and receive completions. Note that
/// transport-specific functions may need to be called before or after certain operations on the
/// `VirtioBlkQueue`:
///
/// * All request methods only enqueue the requests in the rings. They don't notify the device of
///   new requests, so it may or may not start processing them. Call
///   [`crate::QueueNotifier::notify`] on the result of [`VirtioTransport::get_submission_notifier`]
///   after queuing requests to notify the device. You can queue multiple requests and then send a
///   single notification for all of them.
///
/// * To be notified of new completions, use the `EventFd` returned by
///   [`VirtioTransport::get_completion_fd`].
///
/// When a request is submitted, the user provides a "context" of type `C` that will later be
/// returned in the completion for that request.
///
/// Use [`setup_queues`] to create the queues for a device.
///
/// # Examples
///
/// ```no_run
/// # use virtio_driver::{
/// #     VhostUser, VirtioBlkQueue, VirtioBlkTransport, VirtioFeatureFlags, VirtioTransport
/// # };
/// use rustix::fs::{memfd_create, MemfdFlags};
/// use std::ffi::CStr;
/// use std::fs::File;
/// use std::os::unix::io::{AsRawFd, FromRawFd};
/// use std::sync::{Arc, RwLock};
///
/// // Connect to the vhost-user socket and create the queues
/// let mut vhost = VhostUser::new("/tmp/vhost.sock", VirtioFeatureFlags::VERSION_1.bits())?;
/// let mut vhost = Arc::new(RwLock::new(Box::new(vhost) as Box<VirtioBlkTransport>));
/// let mut queues = VirtioBlkQueue::<&'static str>::setup_queues(&vhost, 1, 128)?;
///
/// // Create shared memory that is visible for the device
/// let mem_file: File = memfd_create("guest-ram", MemfdFlags::empty())?.into();
/// mem_file.set_len(512)?;
/// let mut mem = unsafe { memmap2::MmapMut::map_mut(&mem_file) }?;
/// vhost.write().unwrap().map_mem_region(mem.as_ptr() as usize, 512, mem_file.as_raw_fd(), 0)?;
///
/// // Submit a request
/// queues[0].read(0, &mut mem, "my-request-context")?;
/// vhost.read().unwrap().get_submission_notifier(0).notify()?;
///
/// // Wait for its completion
/// let mut done = false;
/// while !done {
///     let ret = vhost.read().unwrap().get_completion_fd(0).read();
///     if ret.is_err() {
///         continue;
///     }
///
///     for c in queues[0].completions() {
///         println!("Completed request with context {:?}, return value {}", c.context, c.ret);
///         done = true;
///     }
/// }
/// # Result::<(), Box<dyn std::error::Error>>::Ok(())
/// ```
///
/// [`setup_queues`]: Self::setup_queues
pub struct VirtioBlkQueue<'a, C> {
    // Underlying virtqueue used to exchange requests with the device.
    vq: Virtqueue<'a, VirtioBlkReqBuf>,
    // Caller-supplied context of each in-flight request, indexed by the
    // descriptor index returned by Virtqueue::add_request.
    req_contexts: Box<[Option<C>]>,
}
243
impl<'a, C> VirtioBlkQueue<'a, C> {
    // Wraps a virtqueue, allocating one empty context slot per descriptor so
    // completions can later be matched back to the caller's context.
    fn new(vq: Virtqueue<'a, VirtioBlkReqBuf>) -> Self {
        let queue_size = vq.queue_size().into();
        let req_contexts = iter::repeat_with(|| None).take(queue_size).collect();

        Self { vq, req_contexts }
    }

    /// Creates the queues for a virtio-blk device.
    ///
    /// Fails with `InvalidInput` if `num_queues` exceeds what the device
    /// supports (see [`virtio_blk_max_queues`]); other errors come from the
    /// transport and virtqueue setup.
    pub fn setup_queues(
        transport: &mut VirtioBlkTransport,
        num_queues: usize,
        queue_size: u16,
    ) -> Result<Vec<Self>, Error> {
        if virtio_blk_max_queues(transport)? < num_queues {
            return Err(Error::new(
                ErrorKind::InvalidInput,
                "Too many queues requested",
            ));
        }

        let features = VirtioFeatureFlags::from_bits_truncate(transport.get_features());
        let layout =
            VirtqueueLayout::new::<VirtioBlkReqBuf>(num_queues, queue_size as usize, features)?;
        let queues: Vec<_> = {
            // Not actually needless: must drop the borrow on the transport before alloc_queue_mem()
            #[allow(clippy::needless_collect)]
            let iova_translators: Vec<_> = iter::repeat_with(|| transport.iova_translator())
                .take(num_queues)
                .collect();

            let mem = transport.alloc_queue_mem(&layout)?;

            iova_translators
                .into_iter()
                .enumerate()
                .map(|(i, iova_translator)| {
                    // Each queue gets its own `layout.end_offset`-sized slice
                    // of the shared queue memory allocation.
                    // NOTE(review): assumes alloc_queue_mem() returned at
                    // least num_queues * layout.end_offset bytes — confirm in
                    // the transport implementations.
                    let mem_queue = unsafe {
                        std::slice::from_raw_parts_mut(
                            &mut mem[i * layout.end_offset] as *mut u8,
                            layout.end_offset,
                        )
                    };
                    Virtqueue::new(iova_translator, mem_queue, queue_size, features)
                })
                .collect::<Result<_, _>>()?
        };
        transport.setup_queues(&queues)?;

        Ok(queues.into_iter().map(Self::new).collect())
    }

    // Builds the descriptor chain for one request and posts it on the
    // virtqueue: header (device-readable), then the optional discard /
    // write-zeroes payload, then the caller's data buffers, and finally the
    // device-writable status byte. The caller's `context` is stored under the
    // returned descriptor index so completions() can hand it back.
    fn queue_request_full(
        &mut self,
        req_type: VirtioBlkReqType,
        offset: u64,
        buf: &[iovec],
        dwz_data: Option<DiscardWriteZeroesData>,
        context: C,
    ) -> Result<(), Error> {
        // Fails for offsets that are not multiples of 512 bytes.
        let lba = to_lba(offset)?;

        let desc_idx = self.vq.add_request(|req, add_desc| {
            *req = VirtioBlkReqBuf {
                header: VirtioBlkReqHeader::new(req_type, lba),
                status: 0,
                dwz_data: dwz_data.unwrap_or_default(),
            };

            // Device-readable header descriptor.
            add_desc(
                iovec {
                    iov_base: &mut req.header as *mut _ as *mut c_void,
                    iov_len: mem::size_of::<VirtioBlkReqHeader>(),
                },
                false,
            )?;

            // Discard/write-zeroes requests carry their sector range in a
            // separate device-readable payload descriptor.
            if dwz_data.is_some() {
                add_desc(
                    iovec {
                        iov_base: &mut req.dwz_data as *mut _ as *mut c_void,
                        iov_len: mem::size_of::<DiscardWriteZeroesData>(),
                    },
                    false,
                )?;
            }

            // The caller's data buffers; device-writable only for reads.
            for b in buf {
                add_desc(*b, req_type.is_from_dev())?;
            }

            // Device-writable status byte, always last in the chain.
            add_desc(
                iovec {
                    iov_base: &mut req.status as *mut _ as *mut c_void,
                    iov_len: 1,
                },
                true,
            )?;

            Ok(())
        })?;

        // add_request returned a free descriptor index, so its context slot
        // must still be empty; anything else is an internal bookkeeping bug.
        let old = self.req_contexts[desc_idx as usize].replace(context);
        assert!(old.is_none());

        Ok(())
    }

    // Convenience wrapper for requests without a discard/write-zeroes payload.
    fn queue_request(
        &mut self,
        req_type: VirtioBlkReqType,
        offset: u64,
        buf: &[iovec],
        context: C,
    ) -> Result<(), Error> {
        self.queue_request_full(req_type, offset, buf, None, context)
    }

    /// Reads from the disk image into a given iovec.
    ///
    /// `context` is an arbitrary caller-defined value that is returned in the corresponding
    /// [`Completion`] to allow associating the result with a specific request.
    ///
    /// # Safety
    ///
    /// The caller must ensure that the `iovec`/`iovcnt` pair is valid and all memory regions
    /// referenced by it are safe to access.
    pub unsafe fn readv(
        &mut self,
        offset: u64,
        iovec: *const iovec,
        iovcnt: usize,
        context: C,
    ) -> Result<(), Error> {
        let iov = unsafe { std::slice::from_raw_parts(iovec, iovcnt) };
        self.queue_request(VirtioBlkReqType::Read, offset, iov, context)
    }

    /// Reads from the disk image into a given buffer.
    ///
    /// `context` is an arbitrary caller-defined value that is returned in the corresponding
    /// [`Completion`] to allow associating the result with a specific request.
    ///
    /// # Safety
    ///
    /// The caller must ensure that the buffer described by `buf` and `len` is safe to access.
    pub unsafe fn read_raw(
        &mut self,
        offset: u64,
        buf: *mut u8,
        len: usize,
        context: C,
    ) -> Result<(), Error> {
        let iov = iovec {
            iov_base: buf as *mut c_void,
            iov_len: len,
        };

        self.queue_request(VirtioBlkReqType::Read, offset, &[iov], context)
    }

    /// Reads from the disk image into a given byte slice.
    ///
    /// `context` is an arbitrary caller-defined value that is returned in the corresponding
    /// [`Completion`] to allow associating the result with a specific request.
    pub fn read(&mut self, offset: u64, buf: &mut [u8], context: C) -> Result<(), Error> {
        // Safe because the iovec is derived from a valid &mut [u8].
        unsafe { self.read_raw(offset, buf.as_mut_ptr(), buf.len(), context) }
    }

    /// Writes to the disk image from a given iovec.
    ///
    /// `context` is an arbitrary caller-defined value that is returned in the corresponding
    /// [`Completion`] to allow associating the result with a specific request.
    ///
    /// # Safety
    ///
    /// The caller must ensure that the `iovec`/`iovcnt` pair is valid and all memory regions
    /// referenced by it are safe to access.
    pub unsafe fn writev(
        &mut self,
        offset: u64,
        iovec: *const iovec,
        iovcnt: usize,
        context: C,
    ) -> Result<(), Error> {
        let iov = unsafe { std::slice::from_raw_parts(iovec, iovcnt) };
        self.queue_request(VirtioBlkReqType::Write, offset, iov, context)
    }

    /// Writes to the disk image from a given buffer.
    ///
    /// `context` is an arbitrary caller-defined value that is returned in the corresponding
    /// [`Completion`] to allow associating the result with a specific request.
    ///
    /// # Safety
    ///
    /// The caller must ensure that the buffer described by `buf` and `len` is safe to access.
    pub unsafe fn write_raw(
        &mut self,
        offset: u64,
        buf: *const u8,
        len: usize,
        context: C,
    ) -> Result<(), Error> {
        let iov = iovec {
            iov_base: buf as *mut c_void,
            iov_len: len,
        };

        self.queue_request(VirtioBlkReqType::Write, offset, &[iov], context)
    }

    /// Writes to the disk image from a given byte slice.
    ///
    /// `context` is an arbitrary caller-defined value that is returned in the corresponding
    /// [`Completion`] to allow associating the result with a specific request.
    pub fn write(&mut self, offset: u64, buf: &[u8], context: C) -> Result<(), Error> {
        // Safe because the iovec is derived from a valid &[u8].
        unsafe { self.write_raw(offset, buf.as_ptr(), buf.len(), context) }
    }

    /// Discards an area in the disk image.
    ///
    /// After completion, the content of the specified area is undefined. Discard is only a hint
    /// and doing nothing is a valid implementation. This means that the discarded data may remain
    /// accessible, this is not a way to safely delete data.
    ///
    /// `context` is an arbitrary caller-defined value that is returned in the corresponding
    /// [`Completion`] to allow associating the result with a specific request.
    pub fn discard(&mut self, offset: u64, len: u64, context: C) -> Result<(), Error> {
        let dwz_data = DiscardWriteZeroesData::new(offset, len, false)?;
        // The affected range lives in the dwz payload, so the request offset
        // itself is 0.
        self.queue_request_full(VirtioBlkReqType::Discard, 0, &[], Some(dwz_data), context)
    }

    /// Zeroes out an area in the disk image.
    ///
    /// If `unmap` is `true`, the area is tried to be deallocated if we know that it will read back
    /// as all zeroes afterwards. If it is `false`, allocated parts will remain allocated.
    ///
    /// `context` is an arbitrary caller-defined value that is returned in the corresponding
    /// [`Completion`] to allow associating the result with a specific request.
    pub fn write_zeroes(
        &mut self,
        offset: u64,
        len: u64,
        unmap: bool,
        context: C,
    ) -> Result<(), Error> {
        let dwz_data = DiscardWriteZeroesData::new(offset, len, unmap)?;
        // The affected range lives in the dwz payload, so the request offset
        // itself is 0.
        self.queue_request_full(
            VirtioBlkReqType::WriteZeroes,
            0,
            &[],
            Some(dwz_data),
            context,
        )
    }

    /// Flushes the disk cache.
    ///
    /// This ensures that on successful completion, any requests that had completed before this
    /// flush request was issued are not sitting in any writeback cache, but are actually stored on
    /// disk.
    ///
    /// `context` is an arbitrary caller-defined value that is returned in the corresponding
    /// [`Completion`] to allow associating the result with a specific request.
    pub fn flush(&mut self, context: C) -> Result<(), Error> {
        self.queue_request(VirtioBlkReqType::Flush, 0, &[], context)
    }

    /// Returns the result for any completed requests.
    pub fn completions(&mut self) -> CompletionIter<'_, 'a, C> {
        CompletionIter {
            it: self.vq.completions(),
            req_contexts: &mut self.req_contexts,
        }
    }

    /// Returns whether the device needs to be notified about newly queued
    /// requests; forwards to the underlying virtqueue.
    pub fn avail_notif_needed(&mut self) -> bool {
        self.vq.avail_notif_needed()
    }

    /// Enables or disables used-buffer (completion) notifications on the
    /// underlying virtqueue.
    pub fn set_used_notif_enabled(&mut self, enabled: bool) {
        self.vq.set_used_notif_enabled(enabled);
    }
}
529
/// Iterator over the completions of a [`VirtioBlkQueue`], returned by
/// [`VirtioBlkQueue::completions`].
pub struct CompletionIter<'a, 'queue, C> {
    // Raw completion iterator of the underlying virtqueue.
    it: VirtqueueIter<'a, 'queue, VirtioBlkReqBuf>,
    // Context slots of the owning queue; entries are taken as their requests
    // complete.
    req_contexts: &'a mut Box<[Option<C>]>,
}
534
impl<C> CompletionIter<'_, '_, C> {
    /// Returns whether the underlying virtqueue iterator has more
    /// completions, without consuming any.
    pub fn has_next(&self) -> bool {
        self.it.has_next()
    }
}
540
541impl<'queue, C> Iterator for CompletionIter<'_, 'queue, C> {
542    type Item = Completion<C>;
543
544    fn next(&mut self) -> Option<Self::Item> {
545        let completion = self.it.next()?;
546
547        // If the backend sent a completion for a request we never made, just ignore it.
548        let context = self.req_contexts[completion.id as usize].take()?;
549
550        Some(Completion {
551            context,
552            ret: match completion.req.status {
553                0 => 0,
554                1 => -EIO,
555                2 => -ENOTSUP,
556                _ => -EPROTO,
557            },
558        })
559    }
560
561    fn size_hint(&self) -> (usize, Option<usize>) {
562        self.it.size_hint()
563    }
564}