virtiofsd 1.13.3

A virtio-fs vhost-user device daemon
Documentation
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
// SPDX-License-Identifier: BSD-3-Clause

use crate::soft_idmap::{HostGid, HostUid, Id};
use bitflags::bitflags;
use std::ffi::{CStr, CString};
use std::fs::File;
use std::io::{self, Error, Result};
use std::os::unix::io::{AsRawFd, BorrowedFd, RawFd};
use std::os::unix::prelude::FromRawFd;

// Helper that inspects the return value of a C function call: the conventional
// failure value `-1` is turned into `Err` carrying the current `errno`, while
// any other value is handed back unchanged inside `Ok`.
fn check_retval<T: From<i8> + PartialEq>(t: T) -> Result<T> {
    let failure = T::from(-1_i8);
    if t != failure {
        return Ok(t);
    }
    // Only the failure path reads `errno`.
    Err(Error::last_os_error())
}

/// Small collection of facts about the running OS,
/// e.g., which syscalls are available.
pub struct OsFacts {
    /// `true` if the `openat2(2)` probe performed by [`OsFacts::new`] succeeded.
    pub has_openat2: bool,
}

#[allow(clippy::new_without_default)]
impl OsFacts {
    /// Probes the OS and records the results.
    /// This object should only be constructed using this function.
    #[must_use]
    pub fn new() -> Self {
        Self {
            has_openat2: Self::probe_openat2(),
        }
    }

    /// Tries to open the current directory with `openat2(2)` (which first appeared in
    /// Linux 5.6) and reports whether the call succeeded.
    fn probe_openat2() -> bool {
        // SAFETY: all-zero byte-pattern is a valid `libc::open_how`
        let how: libc::open_how = unsafe { std::mem::zeroed() };
        let cwd = CString::new(".").unwrap();

        // SAFETY: `cwd.as_ptr()` points to a valid NUL-terminated string,
        // and the `how` pointer is a valid pointer to an `open_how` struct.
        let ret = unsafe {
            libc::syscall(
                libc::SYS_openat2,
                libc::AT_FDCWD,
                cwd.as_ptr(),
                std::ptr::addr_of!(how),
                std::mem::size_of::<libc::open_how>(),
            )
        };

        if ret < 0 {
            return false;
        }

        // The descriptor was only needed for the probe, so release it right away.
        // SAFETY: `ret` is an open file descriptor
        unsafe {
            libc::close(ret as libc::c_int);
        }
        true
    }
}

/// Safe wrapper for `mount(2)`
///
/// A `source` or `fstype` of `None` is passed to `mount(2)` as an empty string, and the
/// `data` argument is always a null pointer.
///
/// # Errors
///
/// Will return `Err(errno)` if `mount(2)` fails.
/// Each filesystem type may have its own special errors and its own special behavior,
/// see `mount(2)` and the Linux kernel source for details.
///
/// # Panics
///
/// This function panics if the strings `source`, `target` or `fstype` contain an internal 0 byte.
pub fn mount(source: Option<&str>, target: &str, fstype: Option<&str>, flags: u64) -> Result<()> {
    // The `CString`s are kept in named locals so that they outlive the raw pointers
    // handed to `mount(2)` below.
    let c_source = CString::new(source.unwrap_or("")).unwrap();
    let c_target = CString::new(target).unwrap();
    let c_fstype = CString::new(fstype.unwrap_or("")).unwrap();

    // SAFETY: `c_source`, `c_target` and `c_fstype` are valid NUL-terminated C strings that
    // are alive for the whole call
    let ret = unsafe {
        libc::mount(
            c_source.as_ptr(),
            c_target.as_ptr(),
            c_fstype.as_ptr(),
            flags,
            std::ptr::null(),
        )
    };
    check_retval(ret).map(|_| ())
}

/// Safe wrapper for `umount2(2)`
///
/// # Errors
///
/// Will return `Err(errno)` if `umount2(2)` fails.
/// Each filesystem type may have its own special errors and its own special behavior,
/// see `umount2(2)` and the Linux kernel source for details.
///
/// # Panics
///
/// This function panics if the string `target` contains an internal 0 byte.
pub fn umount2(target: &str, flags: i32) -> Result<()> {
    // Kept in a named local so that it outlives the raw pointer used below.
    let c_target = CString::new(target).unwrap();

    // SAFETY: `c_target` is a valid NUL-terminated C string that is alive for the whole call
    let ret = unsafe { libc::umount2(c_target.as_ptr(), flags) };
    check_retval(ret).map(|_| ())
}

/// Safe wrapper for `fchdir(2)`
///
/// # Errors
///
/// Will return `Err(errno)` if `fchdir(2)` fails.
/// Each filesystem type may have its own special errors, see `fchdir(2)` for details.
pub fn fchdir(fd: RawFd) -> Result<()> {
    // SAFETY: the call only receives an integer file descriptor, no pointer is handed
    // to the kernel
    let ret = unsafe { libc::fchdir(fd) };
    check_retval(ret).map(|_| ())
}

/// Safe wrapper for `fchmod(2)`
///
/// # Errors
///
/// Will return `Err(errno)` if `fchmod(2)` fails.
/// Each filesystem type may have its own special errors, see `fchmod(2)` for details.
pub fn fchmod(fd: RawFd, mode: libc::mode_t) -> Result<()> {
    // SAFETY: the call only receives plain integers (a file descriptor and a mode),
    // no pointer is handed to the kernel
    let ret = unsafe { libc::fchmod(fd, mode) };
    check_retval(ret).map(|_| ())
}

/// Safe wrapper for `fchmodat(2)`
///
/// # Errors
///
/// Will return an error of kind `InvalidData` if `pathname` contains an internal 0 byte.
/// Will return `Err(errno)` if `fchmodat(2)` fails.
/// Each filesystem type may have its own special errors, see `fchmodat(2)` for details.
pub fn fchmodat(dirfd: RawFd, pathname: String, mode: libc::mode_t, flags: i32) -> Result<()> {
    // Kept in a named local so that it outlives the raw pointer used below.
    let c_path = match CString::new(pathname) {
        Ok(c_path) => c_path,
        Err(e) => return Err(io::Error::new(io::ErrorKind::InvalidData, e)),
    };

    // SAFETY: `c_path` is a valid NUL-terminated C string that is alive for the whole call,
    // the remaining arguments are plain integers
    let ret = unsafe { libc::fchmodat(dirfd, c_path.as_ptr(), mode, flags) };
    check_retval(ret).map(|_| ())
}

/// Safe wrapper for `umask(2)`
pub fn umask(mask: u32) -> u32 {
    // SAFETY: this call doesn't modify any memory and there is no need
    // to check the return value because this system call always succeeds.
    unsafe { libc::umask(mask) }
}

/// An RAII guard for the file mode creation mask (umask): creating it sets the new
/// umask, and dropping it (e.g., when it falls out of scope) sets the mask back to
/// the value that `umask()` returned at construction time.
pub struct ScopedUmask {
    // Value returned by `umask()` when the guard was created
    umask: libc::mode_t,
}

impl ScopedUmask {
    /// Sets `new_umask` as the current umask and remembers the value it replaced.
    pub fn new(new_umask: u32) -> Self {
        let previous = umask(new_umask);
        Self { umask: previous }
    }
}

impl Drop for ScopedUmask {
    /// Puts the remembered mask back in place.
    fn drop(&mut self) {
        let previous = self.umask;
        umask(previous);
    }
}

/// Safe wrapper around `openat(2)`.
///
/// A `mode` of `None` is passed to `openat(2)` as 0. On success the new file
/// descriptor is returned as a `RawFd`.
///
/// # Errors
///
/// Will return `Err(errno)` if `openat(2)` fails,
/// see `openat(2)` for details.
pub fn openat(dir: &impl AsRawFd, pathname: &CStr, flags: i32, mode: Option<u32>) -> Result<RawFd> {
    let dirfd = dir.as_raw_fd();
    let create_mode = mode.map_or(0_u64, u64::from);

    // SAFETY: `pathname` points to a valid NUL-terminated string.
    // However, the caller must ensure that `dir` can provide a valid file descriptor.
    let fd = unsafe { libc::openat(dirfd, pathname.as_ptr(), flags, create_mode) };
    check_retval(fd)
}

/// A utility function that uses `openat2(2)` to restrict how the provided pathname
/// is resolved. It always sets the following resolve flags:
/// - `RESOLVE_IN_ROOT`: Treat the directory referred to by dirfd as the root directory while
///   resolving pathname. This has the effect as though virtiofsd had used chroot(2) to modify its
///   root directory to dirfd.
/// - `RESOLVE_NO_MAGICLINKS`: Disallow all magic-link (i.e., proc(2) link-like files) resolution
///   during path resolution.
///
/// `flags` is handed to `openat2(2)` unchanged: this function does NOT add `O_NOFOLLOW` or
/// `O_CLOEXEC` by itself, so callers that want them must include them in `flags`.
///
/// Only the bits of `mode` in the range `0o7777` are passed on, and a `mode` of `None` is
/// treated as 0. On success the new file descriptor is returned as a `RawFd`.
///
/// The caller must ensure that `dir` can provide a valid file descriptor.
///
/// # Errors
///
/// Will return `Err(errno)` if `openat2(2)` fails, see the man page for details.
pub fn do_open_relative_to(
    dir: &impl AsRawFd,
    pathname: &CStr,
    flags: i32,
    mode: Option<u32>,
) -> Result<RawFd> {
    // SAFETY: all-zero byte-pattern represents a valid `libc::open_how`
    let mut how: libc::open_how = unsafe { std::mem::zeroed() };
    how.resolve = libc::RESOLVE_IN_ROOT | libc::RESOLVE_NO_MAGICLINKS;
    how.flags = flags as u64;
    // `openat2(2)` returns an error if `how.mode` contains bits other than those in range 07777,
    // let's ignore the extra bits to be compatible with `openat(2)`.
    how.mode = u64::from(mode.unwrap_or(0)) & 0o7777;

    // SAFETY: `pathname` points to a valid NUL-terminated string, and the `how` pointer is a valid
    // pointer to an `open_how` struct. However, the caller must ensure that `dir` can provide a
    // valid file descriptor (this can be changed to BorrowedFd).
    let ret = unsafe {
        libc::syscall(
            libc::SYS_openat2,
            dir.as_raw_fd(),
            pathname.as_ptr(),
            std::ptr::addr_of!(how),
            std::mem::size_of::<libc::open_how>(),
        )
    };

    // The raw syscall result is narrowed to `RawFd` before the `-1` check.
    check_retval(ret as RawFd)
}

mod filehandle {
    use crate::passthrough::file_handle::SerializableFileHandle;
    use crate::util::other_io_error;
    use std::convert::{TryFrom, TryInto};
    use std::io;

    /// Capacity (in bytes) of the handle buffer embedded in [`CFileHandle`].
    const MAX_HANDLE_SZ: usize = 128;

    /// C-layout file handle, passed by pointer to the `name_to_handle_at()` and
    /// `open_by_handle_at()` functions declared at the end of this module.
    #[derive(Clone, PartialOrd, Ord, PartialEq, Eq)]
    #[repr(C)]
    pub struct CFileHandle {
        // Number of bytes of `f_handle` that are in use (see `as_bytes()`)
        handle_bytes: libc::c_uint,
        // Handle type, exposed through `handle_type()`
        handle_type: libc::c_int,
        // Fixed-size buffer holding the handle data
        f_handle: [u8; MAX_HANDLE_SZ],
    }

    impl Default for CFileHandle {
        /// Returns a handle with a zeroed buffer and type, whose `handle_bytes`
        /// announces the full buffer capacity.
        fn default() -> Self {
            Self {
                f_handle: [0; MAX_HANDLE_SZ],
                handle_type: 0,
                handle_bytes: MAX_HANDLE_SZ as libc::c_uint,
            }
        }
    }

    impl CFileHandle {
        /// Returns the first `handle_bytes` bytes of the handle buffer.
        pub fn as_bytes(&self) -> &[u8] {
            let used = self.handle_bytes as usize;
            &self.f_handle[..used]
        }

        /// Returns the handle type.
        pub fn handle_type(&self) -> libc::c_int {
            self.handle_type
        }
    }

    impl TryFrom<&SerializableFileHandle> for CFileHandle {
        type Error = io::Error;

        /// Copies the bytes and the type of `sfh` into a new `CFileHandle`.
        ///
        /// Fails if the handle data does not fit into the fixed-size buffer, or if the
        /// length or the type cannot be represented by the corresponding C integer type.
        fn try_from(sfh: &SerializableFileHandle) -> io::Result<Self> {
            let src = sfh.as_bytes();
            let len = src.len();
            if len > MAX_HANDLE_SZ {
                return Err(other_io_error("File handle too long"));
            }

            // Bytes past `len` stay zeroed.
            let mut f_handle = [0u8; MAX_HANDLE_SZ];
            f_handle[..len].copy_from_slice(src);

            let handle_bytes: libc::c_uint = len.try_into().map_err(|err| {
                other_io_error(format!("Handle size ({} bytes) too big: {err}", len))
            })?;

            #[allow(clippy::useless_conversion)]
            let handle_type: libc::c_int = sfh.handle_type().try_into().map_err(|err| {
                other_io_error(format!(
                    "Handle type (0x{:x}) too large: {err}",
                    sfh.handle_type(),
                ))
            })?;

            Ok(Self {
                handle_bytes,
                handle_type,
                f_handle,
            })
        }
    }

    extern "C" {
        // Raw binding; wrapped by the `name_to_handle_at()` function of the parent module.
        pub fn name_to_handle_at(
            dirfd: libc::c_int,
            pathname: *const libc::c_char,
            file_handle: *mut CFileHandle,
            mount_id: *mut libc::c_int,
            flags: libc::c_int,
        ) -> libc::c_int;

        // Raw binding; wrapped by the `open_by_handle_at()` function of the parent module.
        //
        // Technically `file_handle` should be a `mut` pointer, but `open_by_handle_at()` is
        // specified not to change it, so we can declare it `const`.
        pub fn open_by_handle_at(
            mount_fd: libc::c_int,
            file_handle: *const CFileHandle,
            flags: libc::c_int,
        ) -> libc::c_int;
    }
}
pub use filehandle::CFileHandle;

/// Wrapper around the `name_to_handle_at()` C function declared in the `filehandle` module.
///
/// `file_handle` and `mount_id` are passed to the C function as mutable pointers, so they
/// may be modified by the call.
///
/// # Errors
///
/// Will return `Err(errno)` if the underlying call returns -1.
pub fn name_to_handle_at(
    dirfd: &impl AsRawFd,
    pathname: &CStr,
    file_handle: &mut CFileHandle,
    mount_id: &mut libc::c_int,
    flags: libc::c_int,
) -> Result<()> {
    let raw_dirfd = dirfd.as_raw_fd();
    let c_path = pathname.as_ptr();

    // SAFETY: `dirfd` is a valid file descriptor, `c_path` points to a valid
    // NUL-terminated string, `file_handle` is a valid reference to `CFileHandle`,
    // and `mount_id` is valid reference to an `int`
    let ret =
        unsafe { filehandle::name_to_handle_at(raw_dirfd, c_path, file_handle, mount_id, flags) };
    check_retval(ret).map(|_| ())
}

/// Wrapper around the `open_by_handle_at()` C function declared in the `filehandle` module.
///
/// On success the returned file descriptor is wrapped in a [`File`], which takes
/// ownership of it.
///
/// # Errors
///
/// Will return `Err(errno)` if the underlying call returns -1.
pub fn open_by_handle_at(
    mount_fd: &impl AsRawFd,
    file_handle: &CFileHandle,
    flags: libc::c_int,
) -> Result<File> {
    let raw_mount_fd = mount_fd.as_raw_fd();

    // SAFETY: `mount_fd` is a valid file descriptor and `file_handle`
    // is a valid reference to `CFileHandle`
    let ret = unsafe { filehandle::open_by_handle_at(raw_mount_fd, file_handle, flags) };

    // SAFETY: the closure only runs when `open_by_handle_at()` succeeded, which
    // guarantees `fd` is a valid file descriptor
    check_retval(ret).map(|fd| unsafe { File::from_raw_fd(fd) })
}

bitflags! {
    /// A bitwise OR of zero or more flags passed in as a parameter to the
    /// vectored write function `writev_at()`.
    ///
    /// Every constant takes its value from the `libc` constant of the same name.
    pub struct WritevFlags: i32 {
        /// High priority write. Allows block-based filesystems to use polling of the device,
        /// which provides lower latency, but may use additional resources. (Currently, this
        /// feature is usable only on a file descriptor opened using the `O_DIRECT` flag.)
        const RWF_HIPRI = libc::RWF_HIPRI;

        /// Provide a per-write equivalent of the `O_DSYNC` `open(2)` flag. Its effect applies
        /// only to the data range written by the system call.
        const RWF_DSYNC = libc::RWF_DSYNC;

        /// Provide a per-write equivalent of the `O_SYNC` `open(2)` flag. Its effect applies only
        /// to the data range written by the system call.
        const RWF_SYNC = libc::RWF_SYNC;

        /// Provide a per-write equivalent of the `O_APPEND` `open(2)` flag. Its effect applies only
        /// to the data range written by the system call. The offset argument does not affect the
        /// write operation; the data is always appended to the end of the file.
        /// However, if the offset argument is -1, the current file offset is updated.
        const RWF_APPEND = libc::RWF_APPEND;

        /// Do not honor the `O_APPEND` `open(2)` flag (since Linux 6.9).
        const RWF_NOAPPEND = libc::RWF_NOAPPEND;

        /// Require that writes to regular files in block-based filesystems be issued with
        /// torn-write protection. Torn-write protection means that on a power failure or any
        /// other hardware failure, either all or none of the data from the write will be
        /// stored, but never a mix of old and new data (since Linux 6.11).
        const RWF_ATOMIC = libc::RWF_ATOMIC;

        /// Uncached buffered write (since Linux 6.14).
        const RWF_DONTCACHE = libc::RWF_DONTCACHE;
    }
}

bitflags! {
    /// A bitwise OR of zero or more flags passed in as a parameter to the
    /// vectored read function `readv_at()`.
    ///
    /// Every constant takes its value from the `libc` constant of the same name.
    pub struct ReadvFlags: i32 {
        /// High priority read. Allows block-based filesystems to use polling of the device, which
        /// provides lower latency, but may use additional resources. (Currently, this feature is
        /// usable only on a file descriptor opened using the `O_DIRECT` flag.)
        const RWF_HIPRI = libc::RWF_HIPRI;

        /// Do not wait for data which is not immediately available. If this flag is specified,
        /// `readv_at()` will return instantly if it would have to read data from the backing
        /// storage or wait for a lock. If some data was successfully read, it will return the
        /// number of bytes read. If no bytes were read, it will return -1 and set errno to
        /// `EAGAIN`.
        const RWF_NOWAIT = libc::RWF_NOWAIT;

        /// Uncached buffered read: any data read will be removed from the page cache upon
        /// completion (since Linux 6.14).
        const RWF_DONTCACHE = libc::RWF_DONTCACHE;
    }
}

/// Wrapper for `pwritev2(2)`
///
/// This system call is similar to `pwritev(2)`, but adds a new argument,
/// flags, which modifies the behavior on a per-call basis.
/// Unlike `pwritev(2)`, if the offset argument is -1, then the current file offset
/// is used and updated.
///
/// A `flags` of `None` is equivalent to an empty set of flags. On success the value
/// returned by `pwritev2(2)` (the number of bytes written) is returned.
///
/// # Errors
///
/// Will return `Err(errno)` if `pwritev2(2)` fails, see `pwritev2(2)` for details.
///
/// # Safety
///
/// The caller must ensure that each iovec element is valid (i.e., it has a valid `iov_base`
/// pointer and `iov_len`).
pub unsafe fn writev_at(
    fd: BorrowedFd,
    iovecs: &[libc::iovec],
    offset: i64,
    flags: Option<WritevFlags>,
) -> Result<usize> {
    let raw_flags = flags.unwrap_or_else(WritevFlags::empty).bits();
    let iov = iovecs.as_ptr();
    let iovcnt = iovecs.len() as libc::c_int;

    // SAFETY: `fd` is a valid file descriptor, `iov` is a valid pointer
    // to the iovec slice `iovecs` of `iovcnt` elements. However, the caller
    // must ensure that each iovec element has a valid `iov_base` pointer and `iov_len`.
    let ret = unsafe { libc::pwritev2(fd.as_raw_fd(), iov, iovcnt, offset, raw_flags) };
    check_retval(ret).map(|bytes_written| bytes_written as usize)
}

/// Wrapper for `preadv2(2)`
///
/// This system call is similar to `preadv(2)`, but adds a new argument,
/// flags, which modifies the behavior on a per-call basis.
/// Unlike `preadv(2)`, if the offset argument is -1, then the current file offset
/// is used and updated.
///
/// A `flags` of `None` is equivalent to an empty set of flags. On success the value
/// returned by `preadv2(2)` (the number of bytes read) is returned.
///
/// # Errors
///
/// Will return `Err(errno)` if `preadv2(2)` fails, see `preadv2(2)` for details.
///
/// # Safety
///
/// The caller must ensure that each iovec element is valid (i.e., it has a valid `iov_base`
/// pointer and `iov_len`).
pub unsafe fn readv_at(
    fd: BorrowedFd,
    iovecs: &[libc::iovec],
    offset: i64,
    flags: Option<ReadvFlags>,
) -> Result<usize> {
    let raw_flags = flags.unwrap_or_else(ReadvFlags::empty).bits();
    let iov = iovecs.as_ptr();
    let iovcnt = iovecs.len() as libc::c_int;

    // SAFETY: `fd` is a valid file descriptor, `iov` is a valid pointer
    // to the iovec slice `iovecs` of `iovcnt` elements. However, the caller
    // must ensure that each iovec element has a valid `iov_base` pointer and `iov_len`.
    let ret = unsafe { libc::preadv2(fd.as_raw_fd(), iov, iovcnt, offset, raw_flags) };
    check_retval(ret).map(|bytes_read| bytes_read as usize)
}

/// Wrapper around a [`File`] that only exposes reading; `pipe()` uses it for the
/// first descriptor it gets from `pipe2(2)`.
pub struct PipeReader(File);

impl io::Read for PipeReader {
    /// Forwards the read to the wrapped file.
    fn read(&mut self, buf: &mut [u8]) -> io::Result<usize> {
        io::Read::read(&mut self.0, buf)
    }
}

/// Wrapper around a [`File`] that only exposes writing; `pipe()` uses it for the
/// second descriptor it gets from `pipe2(2)`.
pub struct PipeWriter(File);

impl io::Write for PipeWriter {
    /// Forwards the write to the wrapped file.
    fn write(&mut self, buf: &[u8]) -> io::Result<usize> {
        io::Write::write(&mut self.0, buf)
    }

    /// Forwards the flush to the wrapped file.
    fn flush(&mut self) -> io::Result<()> {
        io::Write::flush(&mut self.0)
    }
}

/// Creates a pipe with `pipe2(2)` (using the `O_CLOEXEC` flag) and returns its two ends
/// as a (`PipeReader`, `PipeWriter`) pair.
///
/// # Errors
///
/// Will return `Err(errno)` if `pipe2(2)` fails.
pub fn pipe() -> io::Result<(PipeReader, PipeWriter)> {
    let mut fds: [RawFd; 2] = [-1; 2];

    // SAFETY: `fds.as_mut_ptr()` points to a writable array of two file descriptors
    // that is alive for the whole call
    if unsafe { libc::pipe2(fds.as_mut_ptr(), libc::O_CLOEXEC) } == -1 {
        return Err(io::Error::last_os_error());
    }
    let [read_fd, write_fd] = fds;

    // SAFETY: `pipe2(2)` succeeded, so `read_fd` is a valid file descriptor that is
    // not wrapped by any other object
    let reader = PipeReader(unsafe { File::from_raw_fd(read_fd) });
    // SAFETY: `pipe2(2)` succeeded, so `write_fd` is a valid file descriptor that is
    // not wrapped by any other object
    let writer = PipeWriter(unsafe { File::from_raw_fd(write_fd) });

    Ok((reader, writer))
}

// We want credential changes to be per-thread because otherwise
// we might interfere with operations being carried out on other
// threads with different uids/gids. However, POSIX requires that
// all threads in a process share the same credentials. To achieve
// that, libc uses signals to ensure that when one thread changes its
// credentials the other threads do the same thing.
//
// So instead we invoke the syscall directly in order to get around
// this limitation. Another option is to use the setfsuid and
// setfsgid system calls. However, since those calls have no way to
// return an error, it's preferable to do this instead.
/// Set effective user ID
///
/// Invokes the `setresuid` syscall directly, passing -1 for both the real and the
/// saved user ID.
///
/// # Errors
///
/// Will return `Err(errno)` if the syscall fails.
pub fn seteffuid(uid: HostUid) -> io::Result<()> {
    let raw_uid = uid.into_inner();
    // SAFETY: all syscall arguments are plain integers, no pointer is handed to the kernel
    let ret = unsafe { libc::syscall(libc::SYS_setresuid, -1, raw_uid, -1) };
    check_retval(ret).map(|_| ())
}

/// Set effective group ID
///
/// Invokes the `setresgid` syscall directly, passing -1 for both the real and the
/// saved group ID.
///
/// # Errors
///
/// Will return `Err(errno)` if the syscall fails.
pub fn seteffgid(gid: HostGid) -> io::Result<()> {
    let raw_gid = gid.into_inner();
    // SAFETY: all syscall arguments are plain integers, no pointer is handed to the kernel
    let ret = unsafe { libc::syscall(libc::SYS_setresgid, -1, raw_gid, -1) };
    check_retval(ret).map(|_| ())
}

/// Set supplementary group
///
/// Invokes the `setgroups` syscall directly with a list that contains only `gid`.
///
/// # Errors
///
/// Will return `Err(errno)` if the syscall fails.
pub fn setsupgroup(gid: HostGid) -> io::Result<()> {
    // Kept in a local so that the reference passed to the syscall stays valid.
    let gid_raw = gid.into_inner();
    // SAFETY: `&gid_raw` points to one valid group ID that is alive for the whole call,
    // which matches the list length of 1
    let ret = unsafe { libc::syscall(libc::SYS_setgroups, 1, &gid_raw) };
    check_retval(ret).map(|_| ())
}

/// Drop all supplementary groups
///
/// Invokes the `setgroups` syscall directly with a list length of 0 and a null list pointer.
///
/// # Errors
///
/// Will return `Err(errno)` if the syscall fails.
pub fn dropsupgroups() -> io::Result<()> {
    let no_groups = std::ptr::null::<libc::gid_t>();
    // SAFETY: the list length is 0, so the null list pointer is never dereferenced
    let ret = unsafe { libc::syscall(libc::SYS_setgroups, 0, no_groups) };
    check_retval(ret).map(|_| ())
}