sandboxfs 0.2.0

A virtual file system for sandboxing
Documentation
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
// Copyright 2018 Google Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License"); you may not
// use this file except in compliance with the License.  You may obtain a copy
// of the License at:
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.  See the
// License for the specific language governing permissions and limitations
// under the License.

use failure::Fallible;
use nix::errno::Errno;
use nix::unistd;
use nix::sys::{self, signal};
use signal_hook;
use std::cmp;
use std::fs;
use std::io::{self, Read};
use std::os::unix::io as unix_io;
use std::path::{Path, PathBuf};
use std::process;
use std::sync::Arc;
use std::sync::atomic::{AtomicBool, Ordering};
use std::sync::mpsc;
use std::time;
use std::thread;

/// Converts a `nix::Error` that we expect to carry an errno to an `io::Error`.
fn nix_to_io_error(err: nix::Error) -> io::Error {
    match err {
        nix::Error::Sys(errno) => io::Error::from_raw_os_error(errno as i32),
        e => panic!("Did not expect to get an error without an errno from a nix call: {:?}", e),
    }
}

/// A scope-owned file with support for multiple non-owner readers in different threads.
///
/// A `ShareableFile` object owns the file passed to it at construction time and will close the
/// underlying file handle when the object is dropped.
///
/// Concurrent views into this same file, obtained via the `reader` method, do not own the
/// file handle.  Such views must accept the fact that the handle can be closed at any time, a
/// condition that is simply exposed as if the handle reached EOF.  Concurrent views can safely be
/// moved across threads.
pub struct ShareableFile {
    /// Underlying file descriptor shared across all views of this file.
    fd: unix_io::RawFd,

    /// Write ends of the pipes used by `ShareableFileReader` to know if the file has been closed.
    watchers: Vec<unix_io::RawFd>,

    /// Whether the file descriptor has already been closed or not.
    ///
    /// In principle, we don't need this field: concurrent readers of this file will get their
    /// selects abruptly terminated with an error and we should be able to rely on `EBADF` to tell
    /// that this happened because of us closing the file descriptor.  But it seems better to track
    /// our own closed condition so that we can distinguish actual errors from expected errors.
    closed: Arc<AtomicBool>,
}

impl ShareableFile {
    /// Constructs a new `ShareableFile` from an open file and takes ownership of it.
    pub fn from(file: fs::File) -> ShareableFile {
        use std::os::unix::io::IntoRawFd;
        ShareableFile {
            fd: file.into_raw_fd(),
            watchers: vec!(),
            closed: Arc::from(AtomicBool::new(false)),
        }
    }

    /// Returns an unowned view of the file.
    ///
    /// Users of this file must accept that the file can be closed at any time by the owner.
    pub fn reader(&mut self) -> io::Result<ShareableFileReader> {
        let (notifier, watcher) = unistd::pipe().map_err(nix_to_io_error)?;
        self.watchers.push(watcher);
        Ok(ShareableFileReader {
            fd: self.fd,
            notifier: notifier,
            closed: self.closed.clone(),
        })
    }
}

impl Drop for ShareableFile {
    fn drop(&mut self) {
        debug!("Closing ShareableFile with fd {}", self.fd);

        // We are about to touch file handles used by other threads.  If those threads are blocked
        // on a select call, the call may return an error on some systems due to the closed file
        // descriptors.  If those threads are about to issue a read after a select, the read will
        // fail with a bad file descriptor.  Prepare them about these potential failure conditions
        // before we actually touch anything.
        self.closed.store(true, Ordering::SeqCst);

        for watcher in &self.watchers {
            if let Err(e) = unistd::write(*watcher, &[0]) {
                // This write to a pipe we control really should not have failed.  If it did there
                // is not much we can do other than log an error.  We may get stuck threads on exit
                // though...
                //
                // TODO(jmmv): We expect the write to fail if we had any ShareableFileReader that
                // has already been dropped, as that drop would have closed the read end of the
                // pipe.  Note that this means that, if we create/drop ShareableFileReaders non-stop
                // for this ShareableFile, we'll exhaust the open file descriptors because we are
                // leaking the write ends of these pipes.  This should be fixed, but it's not a big
                // deal because we don't do this in sandboxfs.
                if e.as_errno().expect("Must have been a system error") != Errno::EPIPE {
                    warn!("Failed to tell ShareableFileReader with handle {} of close: {}",
                        *watcher, e)
                }
            }
            if let Err(e) = unistd::close(*watcher) {
                // Closing should really not have failed, but if it did, it does not hurt and there
                // is nothing we can do anyway.
                warn!("Failed to close pipe write end with handle {}: {}", *watcher, e)
            }
        }

        if let Err(e) = unistd::close(self.fd) {
            warn!("Failed to close fd {}: {}", self.fd, e);
        }
    }
}

/// A non-owned view of a `ShareableFile`.
pub struct ShareableFileReader {
    /// Underlying file descriptor shared across all views of this file.
    fd: unix_io::RawFd,

    /// Read end of the pipe used by `ShareableFile` to tell us that the file has been closed.
    notifier: unix_io::RawFd,

    /// Whether the file descriptor has already been closed or not.  See description in
    /// `ShareableFile` for more details.
    closed: Arc<AtomicBool>,
}

impl Drop for ShareableFileReader {
    fn drop(&mut self) {
        if let Err(e) = unistd::close(self.notifier) {
            warn!("Failed to close fd {}: {}", self.notifier, e);
        }
    }
}

impl Read for ShareableFileReader {
    fn read(&mut self, buf: &mut [u8]) -> io::Result<usize> {
        let mut read_set = sys::select::FdSet::new();
        read_set.insert(self.fd);
        read_set.insert(self.notifier);
        let result = sys::select::select(None, Some(&mut read_set), None, None, None)
            .and_then(|ready_count| {
                debug_assert!(ready_count <= 2);
                if read_set.contains(self.notifier) {
                    // The file has been closed by the owner.  There is no point in attempting to
                    // read from the fd even if there were data in it.
                    Ok(0)
                } else {
                    assert!(read_set.contains(self.fd));
                    unistd::read(self.fd, buf)
                }
            })
            .map_err(nix_to_io_error);

        match result {
            Ok(read_count) => Ok(read_count),
            err => {
                if self.closed.load(Ordering::SeqCst) {
                    Ok(0)  // Simulate EOF due to close in another thread.
                } else {
                    err
                }
            },
        }
    }
}

/// List of termination signals that cause the mount point to be correctly unmounted.
static CAPTURED_SIGNALS: [signal::Signal; 4] = [
    signal::Signal::SIGHUP,
    signal::Signal::SIGTERM,
    signal::Signal::SIGINT,
    signal::Signal::SIGQUIT,
];

/// Two-phase installer for `SignalsHandler`, which is responsible for unmounting a file system.
///
/// Installing the signals is tricky business because of a potential race: if the signal handler is
/// installed and a signal arrives *before* the mount point has been configured, the unmounting will
/// not succeed, which means we will enter the server loop and lose the signal.  Conversely, if we
/// did this backwards, we could receive a signal after the mount point has been configured but
/// before we install the signal handler, which means we'd terminate but leak the mount point.
///
/// To solve this, we must block signals while the mount point is being set up.  We achieve this by
/// exposing an interface that forces the caller to take two steps before it can obtain the actual
/// `SignalsHandler` object: the caller must call `prepare()` before mounting the file system and
/// then call `install()` once the file system is ready to serve.
///
/// Keeping this logic as a separate `SignalsInstaller` object, instead of trying to expose a "safe
/// mount function" helps ensure restoration of the original signal mask in all cases because this
/// type does so at drop time.
pub struct SignalsInstaller {
    /// Signal mask to restore at drop time.
    old_sigset: signal::SigSet,
}

impl SignalsInstaller {
    /// Blocks signals in preparation to mount the file system.
    pub fn prepare() -> SignalsInstaller {
        let mut old_sigset = signal::SigSet::empty();
        let mut sigset = signal::SigSet::empty();
        for signal in CAPTURED_SIGNALS.iter() {
            sigset.add(*signal);
        }
        signal::pthread_sigmask(
            signal::SigmaskHow::SIG_BLOCK, Some(&sigset), Some(&mut old_sigset))
            .expect("pthread_sigmask is not expected to fail");
        SignalsInstaller { old_sigset }
    }

    /// Installs all signal handlers to unmount the given `mount_point`.
    pub fn install(self, mount_point: PathBuf) -> Fallible<SignalsHandler> {
        let (signal_sender, signal_receiver) = mpsc::channel();

        let mut signums = vec!();
        for signal in CAPTURED_SIGNALS.iter() {
            signums.push(*signal as i32);
        }
        let signals = signal_hook::iterator::Signals::new(&signums)?;

        std::thread::spawn(move || SignalsHandler::handler(&signals, mount_point, &signal_sender));

        Ok(SignalsHandler { signal_receiver })

        // Consumes self which causes the original signal mask to be restored and thus unblocks
        // signals.
    }
}

impl Drop for SignalsInstaller {
    fn drop(&mut self) {
        signal::pthread_sigmask(signal::SigmaskHow::SIG_SETMASK, Some(&self.old_sigset), None)
            .expect("pthread_sigmask is not expected to fail and we cannot correctly clean up");
    }
}

/// Unmounts a file system by shelling out to the correct unmount tool.
///
/// Doing this in-process is very difficult because of differences across systems and the fact that
/// neither `nix` nor `libc` currently expose any of the unmounting functionality.
fn unmount(path: &Path) -> Fallible<()> {
    #[cfg(not(any(target_os = "linux")))]
    fn run_unmount(path: &Path) -> io::Result<process::Output> {
        process::Command::new("umount").arg(path).output()
    }

    #[cfg(any(target_os = "linux"))]
    fn run_unmount(path: &Path) -> io::Result<process::Output> {
        process::Command::new("fusermount").arg("-u").arg(path).output()
    }

    let output = run_unmount(path)?;
    if output.status.success() {
        Ok(())
    } else {
        Err(format_err!("stdout: {}, stderr: {}",
            String::from_utf8_lossy(&output.stdout).trim(),
            String::from_utf8_lossy(&output.stderr).trim()))
    }
}

/// Tries to unmount the given file system indefinitely.
///
/// If unmounting fails, it is probably because the file system is busy.  We don't know but it
/// doesn't matter: we have entered a terminal status: we do this at exit time so we'll keep trying
/// to unclog things while telling the user what's going on.  They are the ones that have to fix
/// this situation.
fn retry_unmount<P: AsRef<Path>>(mount_point: P) {
    let mut backoff = time::Duration::from_millis(10);
    let goal = time::Duration::from_secs(1);
    'retry: loop {
        match unmount(mount_point.as_ref()) {
            Ok(()) => break 'retry,
            Err(e) => {
                if backoff >= goal {
                    warn!("Unmounting file system failed with '{}'; will retry in {:?}",
                        e, backoff);
                }
                thread::sleep(backoff);
                if backoff < goal {
                    backoff = cmp::min(goal, backoff * 2);
                }
            },
        }
    }
}

/// Maintains state and allows interaction with the installed signal handler.
///
/// The signal handler is responsible for unmounting the file system, which in turn causes the
/// FUSE serve loop to either never start or to finish execution if it was already running.
pub struct SignalsHandler {
    /// Channel used to receive, on the main thread, the number of the signal that was captured.
    // TODO(https://github.com/vorner/signal-hook/pull/8): Replace i32 with SigNo once merged.
    signal_receiver: mpsc::Receiver<i32>,
}

impl SignalsHandler {
    /// Returns the signal that was caught, if any.
    ///
    /// This is *not* racy when used after the file system has been unmounted (i.e. once the FUSE
    /// loop terminates).
    pub fn caught(&self) -> Option<i32> {
        match self.signal_receiver.try_recv() {
            Ok(signo) => Some(signo),
            Err(_) => None,
        }
    }

    /// The signal handler.
    ///
    /// This blocks until the receipt of the first signal and then ignores the rest.
    ///
    /// Upon receipt of a signal from `signals`, the handler first updates `signal_sender` with the
    /// number of the received signal and then attempts to unmount `mount_point` indefinitely to
    /// unblock the main FUSE loop.
    fn handler(signals: &signal_hook::iterator::Signals, mount_point: PathBuf,
        signal_sender: &mpsc::Sender<i32>) {
        let signo = signals.forever().next().unwrap();
        if let Err(e) = signal_sender.send(signo) {
            warn!("Failed to propagate signal to main thread; will get stuck exiting: {}", e);
        }
        info!("Caught signal {}; unmounting {}", signo, mount_point.display());
        retry_unmount(mount_point);

        // It'd be nice if we could just "drop(signals)" here and then send the same received signal
        // to ourselves so that the program terminated with the correct exit status.  Unfortunately,
        // "drop(signals)" unregisters our hooks from the signal handlers installed by signal-hook
        // but it does not actually return the signal handlers to their original values.  This is a
        // limitation of the signal-hook crate.  Instead, we need the "caught" hack above to let the
        // main thread return an error code instead.
    }
}

#[cfg(test)]
mod tests {
    use nix::sys;
    use std::io::{Read, Write};
    use std::thread;
    use super::*;
    use tempfile;

    #[test]
    fn test_shareable_file_clones_share_descriptor_and_only_one_owns() {
        let dir = tempfile::tempdir().unwrap();
        let path = dir.path().join("file");
        fs::File::create(&path).unwrap().write_all(b"ABCDEFG").unwrap();

        fn read_one_byte(input: &mut impl Read) -> u8 {
            let mut buffer = [0];
            input.read_exact(&mut buffer).unwrap();
            buffer[0]
        }

        let mut file = ShareableFile::from(fs::File::open(&path).unwrap());
        let mut reader1 = file.reader().unwrap();
        let mut reader2 = file.reader().unwrap();
        assert_eq!(b'A', read_one_byte(&mut reader1));
        drop(reader1);  // Make sure dropping a non-owner copy doesn't close the file handle.
        assert_eq!(b'B', read_one_byte(&mut reader2));
        drop(file);  // Closes the file descriptor so the readers should now not be able to read.
        let mut buffer = [0];
        assert_eq!(0, reader2.read(&mut buffer).expect("Expected 0 byte count as EOF after close"));
    }

    fn try_shareable_file_close_unblocks_reads_without_error() {
        let dir = tempfile::tempdir().unwrap();
        let path = dir.path().join("pipe");
        unistd::mkfifo(&path, sys::stat::Mode::S_IRUSR | sys::stat::Mode::S_IWUSR).unwrap();

        // We need to open the FIFO for writes so that the open for reads doesn't block.  But we
        // have to do this open in a separate thread because it will block too.
        let writer_handle = {
            let path = path.clone();
            thread::spawn(move || fs::File::create(path))
        };

        // This will block until the thread we spawned above opens the file for writing.
        let mut file = ShareableFile::from(fs::File::open(&path).unwrap());

        let (reader_ready_tx, reader_ready_rx) = mpsc::channel();
        let reader_handle = {
            let mut file = file.reader().unwrap();
            thread::spawn(move || {
                let mut buffer = [0];
                reader_ready_tx.send(()).unwrap();
                file.read(&mut buffer)
            })
        };

        // This is racy: there is no guarantee that the reader thread is now blocked on the read
        // syscall although there is a high likelihood that it is (and our explicit yield makes this
        // even more likely, especially on single-core systems).  If we experience the race, the
        // test will silently pass even if there is a problem in the ShareableFile.
        reader_ready_rx.recv().unwrap();
        thread::sleep(time::Duration::from_millis(1));

        drop(file);  // This should unblock the reader thread and let the join complete.
        reader_handle.join().unwrap().expect("Read didn't return success on EOF-like condition");

        // We have already verified that the reader can be asynchronously terminated without the
        // write handle causing interference.  Retrieve the write end of the FIFO and close it.
        let writer = writer_handle.join().unwrap().expect("Write didn't finish successfully");
        drop(writer);
    }

    #[test]
    fn test_shareable_file_close_unblocks_reads_without_error() {
        // This test exercises a threading condition that cannot be reproduced in a non-racy manner.
        // If we are subject to the race condition, the test will falsely pass; otherwise it will
        // correctly fail.  Running this same test a few times ensures that we have higher chances
        // of catching a problem.
        for _ in 0..10 {
            try_shareable_file_close_unblocks_reads_without_error()
        }
    }
}