1
  2
  3
  4
  5
  6
  7
  8
  9
 10
 11
 12
 13
 14
 15
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
/*! panic_monitor helps you monitor your threads and deal with panics.

You might be tempted to use libstd's [`JoinHandle`]s for this use-case; however, they have two
major limitations:

 * [`JoinHandle::join`] blocks the current thread.  If you want to monitor multiple threads from a
   single "supervisor" thread, you would need something like `try_join`, and ideally you'd have an
   "epoll for [`JoinHandle`]s" as well to avoid busy-waiting.  [`JoinHandle`] doesn't implement
   these, however.
 * You can't clone a [`JoinHandle`].  If you want multiple threads to be notified when a particular
   thread panics, you can't use its [`JoinHandle`] to achieve it.

panic_monitor handles both of these issues.  [`PanicMonitor::wait`] allows you to specify a number
of threads.  As soon as one of them panics, it returns a list of [`Thread`] structs (each of which
contains the name and ID of a panicked thread).  When calling [`PanicMonitor::wait`], you specify
the watch-list in terms of [`ThreadId`]s.  Since these are clonable, multiple supervisor threads
can monitor the same worker thread.

Some other differences between [`PanicMonitor::wait`] and [`JoinHandle::join`]:

 * You don't receive the value which was passed to [`panic`].  (This would be impossible, given
   that such values are not required to implement [`Clone`].)
 * You aren't notified when a thread shuts down normally.  `PanicMonitor` is for handling
   panicking threads only.

[`PanicMonitor::wait`]: struct.PanicMonitor.html#method.wait
[`JoinHandle`]: https://doc.rust-lang.org/std/thread/struct.JoinHandle.html
[`JoinHandle::join`]: https://doc.rust-lang.org/std/thread/struct.JoinHandle.html#method.join
[`panic`]: https://doc.rust-lang.org/std/macro.panic.html
[`Clone`]: https://doc.rust-lang.org/std/clone/trait.Clone.html
[`Thread`]: https://doc.rust-lang.org/std/thread/struct.Thread.html
[`ThreadId`]: https://doc.rust-lang.org/std/thread/struct.ThreadId.html

## Usage

Create a global [`PanicMonitor`] using [`lazy_static`], and initialise it from your main thread.
Ideally you should do this before spawning any new threads.

[`PanicMonitor`]: struct.PanicMonitor.html
[`lazy_static`]: https://docs.rs/lazy_static/1.0.0/lazy_static/macro.lazy_static.html

```
#[macro_use] extern crate lazy_static;
extern crate panic_monitor;

use panic_monitor::PanicMonitor;
use std::thread;
use std::time::Duration;

lazy_static! {
    static ref PANIC_MONITOR: PanicMonitor = PanicMonitor::new();
}

fn main() {
    // Install a panic hook
    PANIC_MONITOR.init();

    let h = thread::spawn(|| {
        thread::sleep(Duration::from_millis(100));
        panic!();
    });

    PANIC_MONITOR.wait(&[h.thread().id()]);
    // ^ this will block until the thread panics

    PANIC_MONITOR.wait(&[h.thread().id()]);
    // ^ this will return immediately, since the thread is already dead

    h.join().unwrap_err();
}
```
*/

use std::collections::HashMap;
use std::panic;
use std::sync::*;
use std::thread::{self, Thread, ThreadId};
use std::time::*;

const POISON_MSG: &str = "panic_monitor: Inner lock poisoned (please submit a bug report)";

/// A list of all threads which have panicked, with the ability to notify interested parties when
/// this list is updated.
pub struct PanicMonitor {
    /// Every thread recorded by the panic hook so far, keyed by its ID.
    panicked: Mutex<HashMap<ThreadId, Thread>>,
    /// Notified by the panic hook each time a thread has been added to `panicked`.
    cvar: Condvar,
}

impl Default for PanicMonitor {
    /// Equivalent to [`PanicMonitor::new`].
    fn default() -> Self {
        PanicMonitor::new()
    }
}

impl PanicMonitor {
    /// Create a new `PanicMonitor`.
    ///
    /// Call this inside a [`lazy_static`] block.  You must call [`init`] after this.
    ///
    /// [`init`]: #method.init
    /// [`lazy_static`]: https://docs.rs/lazy_static/1.0.0/lazy_static/macro.lazy_static.html
    pub fn new() -> PanicMonitor {
        PanicMonitor {
            panicked: Mutex::new(HashMap::new()),
            cvar: Condvar::new(),
        }
    }

    /// Initialise the `PanicMonitor`.
    ///
    /// Call this method as early as you can: a thread which panics before the `PanicMonitor` is
    /// initialised will not trigger wake-ups.  Calling `init` multiple times is harmless: each
    /// call wraps the currently installed hook, so a panicking thread is simply recorded (and
    /// waiters notified) once per installed hook.
    //
    // If you need to uninstall some existing handlers by calling `std::panic::set_hook(|_| {})`,
    // or something, you can call `init` again afterwards to re-add `PanicMonitor`'s hook.
    pub fn init(&'static self) {
        // Install a panic hook which makes a record of the panicking thread and notifies all
        // threads waiting on the PanicMonitor, then chains to whatever hook was installed before.
        let previous = panic::take_hook();
        panic::set_hook(Box::new(move |info| {
            {
                // Keep the critical section minimal.  The guard MUST be released before calling
                // `previous`: if `init` was called more than once, `previous` is this very hook,
                // and re-locking the (non-reentrant) mutex on the same thread would deadlock.
                let mut panicked = self.panicked.lock().expect(POISON_MSG);
                let current = thread::current();
                panicked.insert(current.id(), current);
            }
            // The insertion happened under the lock, so a waiter either saw it during its own
            // check or is already parked on the condvar and receives this notification.
            self.cvar.notify_all();
            previous(info);
        }));
    }

    /// Collect a handle for every entry of `watch_list` that is recorded in `panicked`.
    ///
    /// The result follows the order of `watch_list` (duplicates in the watch list yield
    /// duplicate handles).
    fn panicked_among(
        panicked: &HashMap<ThreadId, Thread>,
        watch_list: &[ThreadId],
    ) -> Vec<Thread> {
        watch_list
            .iter()
            .filter_map(|tid| panicked.get(tid))
            .cloned()
            .collect()
    }

    /// Block the current thread until one of the watched threads panics.  The returned vector is
    /// always non-empty.
    ///
    /// Note that this function returns as soon as one or more of the threads on the watch list has
    /// panicked.  This means that if you specify a thread which has already panicked, this
    /// function will return immediately.  Think of it as level-triggered, not edge-triggered.
    ///
    /// # Panics
    ///
    /// Panics if the internal lock is poisoned.
    pub fn wait(&self, watch_list: &[ThreadId]) -> Vec<Thread> {
        let mut panicked = self.panicked.lock().expect(POISON_MSG);
        loop {
            let hits = Self::panicked_among(&panicked, watch_list);
            if !hits.is_empty() {
                return hits;
            }
            // Releases the lock while parked; re-check on every wake-up (spurious or not).
            panicked = self.cvar.wait(panicked).expect(POISON_MSG);
        }
    }

    /// Block the current thread until one of the watched threads panics, or the timeout expires.
    /// The returned vector is empty only if the timeout expired without any of the watched
    /// threads having panicked.
    ///
    /// See [`wait`] for more information.
    ///
    /// # Panics
    ///
    /// Panics if the internal lock is poisoned.
    ///
    /// [`wait`]: #method.wait
    pub fn wait_timeout(&self, watch_list: &[ThreadId], mut dur: Duration) -> Vec<Thread> {
        let mut panicked = self.panicked.lock().expect(POISON_MSG);
        loop {
            let hits = Self::panicked_among(&panicked, watch_list);
            if !hits.is_empty() {
                return hits;
            }
            let started = Instant::now();
            let (guard, res) = self.cvar.wait_timeout(panicked, dur).expect(POISON_MSG);
            let elapsed = started.elapsed();
            panicked = guard;
            if res.timed_out() || elapsed >= dur {
                // Out of time.  Take one last look: a panic may have been recorded while this
                // thread was re-acquiring the lock after the timeout fired.
                return Self::panicked_among(&panicked, watch_list);
            }
            // Woken early without a match: keep waiting for the remaining time only.
            dur -= elapsed; // cannot underflow: `elapsed < dur` was checked just above
        }
    }

    /// Check if any of the specified threads have panicked.  This function may block, but only
    /// very briefly.  The returned vector may be empty.
    ///
    /// See [`wait`] for more information.
    ///
    /// # Panics
    ///
    /// Panics if the internal lock is poisoned.
    ///
    /// [`wait`]: #method.wait
    pub fn check(&self, watch_list: &[ThreadId]) -> Vec<Thread> {
        let panicked = self.panicked.lock().expect(POISON_MSG);
        Self::panicked_among(&panicked, watch_list)
    }
}