1
  2
  3
  4
  5
  6
  7
  8
  9
 10
 11
 12
 13
 14
 15
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
//! A library for awaiting and killing child processes from multiple threads.
//!
//! - [Docs](https://docs.rs/shared_child)
//! - [Crate](https://crates.io/crates/shared_child)
//! - [Repo](https://github.com/oconnor663/shared_child.rs)
//!
//! The
//! [`std::process::Child`](https://doc.rust-lang.org/std/process/struct.Child.html)
//! type in the standard library provides
//! [`wait`](https://doc.rust-lang.org/std/process/struct.Child.html#method.wait)
//! and
//! [`kill`](https://doc.rust-lang.org/std/process/struct.Child.html#method.kill)
//! methods that take `&mut self`, making it impossible to kill a child process
//! while another thread is waiting on it. That design works around a race
//! condition in Unix's `waitpid` function, where a PID might get reused as soon
//! as the wait returns, so a signal sent around the same time could
//! accidentally get delivered to the wrong process.
//!
//! However, with the newer POSIX `waitid` function, we can wait on a child
//! without freeing its PID for reuse. That makes it safe to send signals
//! concurrently. Windows has actually always supported this, by preventing PID
//! reuse while there are still open handles to a child process. This library
//! wraps `std::process::Child` for concurrent use, backed by these APIs.
//!
//! Compatibility note: The `libc` crate doesn't currently support `waitid` on
//! NetBSD or OpenBSD, or on older versions of OSX. There [might also
//! be](https://bugs.python.org/msg167016) some version of OSX where the
//! `waitid` function exists but is broken. We can add a "best effort"
//! workaround using `waitpid` for these platforms as we run into them. Please
//! [file an issue](https://github.com/oconnor663/shared_child.rs/issues/new) if
//! you hit this.
//!
//! # Example
//!
//! ```rust
//! use shared_child::SharedChild;
//! use std::process::Command;
//! use std::sync::Arc;
//!
//! // Spawn a child that will just sleep for a long time,
//! // and put it in an Arc to share between threads.
//! let mut command = Command::new("python");
//! command.arg("-c").arg("import time; time.sleep(1000000000)");
//! let shared_child = SharedChild::spawn(&mut command).unwrap();
//! let child_arc = Arc::new(shared_child);
//!
//! // On another thread, wait on the child process.
//! let child_arc_clone = child_arc.clone();
//! let thread = std::thread::spawn(move || {
//!     child_arc_clone.wait().unwrap()
//! });
//!
//! // While the other thread is waiting, kill the child process.
//! // This wouldn't be possible with e.g. Arc<Mutex<Child>> from
//! // the standard library, because the waiting thread would be
//! // holding the mutex.
//! child_arc.kill().unwrap();
//!
//! // Join the waiting thread and get the exit status.
//! let exit_status = thread.join().unwrap();
//! assert!(!exit_status.success());
//! ```

use std::io;
use std::process::{Child, ChildStderr, ChildStdin, ChildStdout, Command, ExitStatus};
use std::sync::{Condvar, Mutex};

mod sys;

// Publish the Unix-only SharedChildExt trait.
#[cfg(unix)]
pub mod unix;

/// A wrapper around
/// [`std::process::Child`](https://doc.rust-lang.org/std/process/struct.Child.html)
/// whose `wait`, `try_wait` and `kill` methods take `&self` instead of
/// `&mut self`, so that one thread can kill the child while another thread is
/// blocked waiting on it.
#[derive(Debug)]
pub struct SharedChild {
    // This lock provides shared access to kill() and wait(). We never hold it
    // during a blocking wait, though, so that non-blocking waits and kills can
    // go through. (Blocking waits use libc::waitid with the WNOWAIT flag.)
    child: Mutex<Child>,

    // When there are multiple waiting threads, one of them will actually wait
    // on the child, and the rest will block on this condvar.
    //
    // `state_lock` records whether a thread is currently doing the blocking
    // wait and, once the child has been reaped, its exit status. Whenever a
    // method needs both locks it takes `state_lock` first and `child` second.
    state_lock: Mutex<ChildState>,
    // Notified (notify_all) every time a blocking waiter leaves the Waiting
    // state, whether it succeeded or failed.
    state_condvar: Condvar,
}

impl SharedChild {
    /// Spawn a new `SharedChild` from a
    /// [`std::process::Command`](https://doc.rust-lang.org/std/process/struct.Command.html).
    pub fn spawn(command: &mut Command) -> io::Result<Self> {
        let child = command.spawn()?;
        Ok(Self {
            child: Mutex::new(child),
            state_lock: Mutex::new(NotWaiting),
            state_condvar: Condvar::new(),
        })
    }

    /// Construct a new `SharedChild` from an already spawned
    /// [`std::process::Child`](https://doc.rust-lang.org/std/process/struct.Child.html).
    ///
    /// This constructor needs to know whether `child` has already been waited on, and the only way
    /// to find that out is to call `child.try_wait()` internally. If the child process is
    /// currently a zombie, that call will clean it up as a side effect. The [`SharedChild::spawn`]
    /// constructor doesn't need to do this.
    pub fn new(mut child: Child) -> io::Result<Self> {
        let state = match child.try_wait()? {
            Some(status) => Exited(status),
            None => NotWaiting,
        };
        Ok(Self {
            child: Mutex::new(child),
            state_lock: Mutex::new(state),
            state_condvar: Condvar::new(),
        })
    }

    /// Return the child process ID.
    pub fn id(&self) -> u32 {
        self.child.lock().unwrap().id()
    }

    fn get_handle(&self) -> sys::Handle {
        sys::get_handle(&self.child.lock().unwrap())
    }

    /// Wait for the child to exit, blocking the current thread, and return its
    /// exit status.
    pub fn wait(&self) -> io::Result<ExitStatus> {
        let mut state = self.state_lock.lock().unwrap();
        loop {
            match *state {
                NotWaiting => {
                    // Either no one is waiting on the child yet, or a previous
                    // waiter failed. That means we need to do it ourselves.
                    // Break out of this loop.
                    break;
                }
                Waiting => {
                    // Another thread is already waiting on the child. We'll
                    // block until it signal us on the condvar, then loop again.
                    // Spurious wakeups could bring us here multiple times
                    // though, see the Condvar docs.
                    state = self.state_condvar.wait(state).unwrap();
                }
                Exited(exit_status) => return Ok(exit_status),
            }
        }

        // If we get here, we have the state lock, and we're the thread
        // responsible for waiting on the child. Set the state to Waiting and
        // then release the state lock, so that other threads can observe it
        // while we block. Afterwards we must leave the Waiting state before
        // this function exits, or other waiters will deadlock.
        *state = Waiting;
        drop(state);

        // Block until the child exits without reaping it. (On Unix, that means
        // we need to call libc::waitid with the WNOWAIT flag. On Windows
        // waiting never reaps.) That makes it safe for another thread to kill
        // while we're here, without racing against some process reusing the
        // child's PID. Having only one thread in this section is important,
        // because POSIX doesn't guarantee much about what happens when multiple
        // threads wait on a child at the same time:
        // http://pubs.opengroup.org/onlinepubs/9699919799/functions/V2_chap02.html#tag_15_13
        let noreap_result = sys::wait_without_reaping(self.get_handle());

        // Now either we hit an error, or the child has exited and needs to be
        // reaped. Retake the state lock and handle all the different exit
        // cases. No matter what happened/happens, we'll leave the Waiting state
        // and signal the state condvar.
        let mut state = self.state_lock.lock().unwrap();
        // The child has already exited, so this wait should clean up without blocking.
        let final_result = noreap_result.and_then(|_| self.child.lock().unwrap().wait());
        *state = if let Ok(exit_status) = final_result {
            Exited(exit_status)
        } else {
            NotWaiting
        };
        self.state_condvar.notify_all();
        final_result
    }

    /// Return the child's exit status if it has already exited. If the child is
    /// still running, return `Ok(None)`.
    pub fn try_wait(&self) -> io::Result<Option<ExitStatus>> {
        let mut status = self.state_lock.lock().unwrap();

        // Unlike wait() above, we don't loop on the Condvar here. If the status
        // is Waiting or Exited, we return immediately. However, if the status
        // is NotWaiting, we'll do a non-blocking wait below, in case the child
        // has already exited.
        match *status {
            NotWaiting => {}
            Waiting => return Ok(None),
            Exited(exit_status) => return Ok(Some(exit_status)),
        };

        // No one is waiting on the child. Check to see if it's already exited.
        // If it has, put ourselves in the Exited state. (There can't be any
        // other waiters to signal, because the state was NotWaiting when we
        // started, and we're still holding the status lock.)
        if sys::try_wait_without_reaping(self.get_handle())? {
            // The child has exited. Reap it. This should not block.
            let exit_status = self.child.lock().unwrap().wait()?;
            *status = Exited(exit_status);
            Ok(Some(exit_status))
        } else {
            Ok(None)
        }
    }

    /// Send a kill signal to the child. On Unix this sends SIGKILL, and you
    /// should call `wait` afterwards to avoid leaving a zombie. If the process
    /// has already been waited on, this returns `Ok(())` and does nothing.
    pub fn kill(&self) -> io::Result<()> {
        let status = self.state_lock.lock().unwrap();
        if let Exited(_) = *status {
            return Ok(());
        }
        // The child is still running. Kill it. This assumes that the wait
        // functions above will never hold the child lock during a blocking
        // wait.
        self.child.lock().unwrap().kill()
    }

    /// Consume the `SharedChild` and return the
    /// [`std::process::Child`](https://doc.rust-lang.org/std/process/struct.Child.html)
    /// it contains.
    ///
    /// We never reap the child process except by calling `wait` or `try_wait`
    /// on it, so the child object's inner state is correct, even if it was
    /// waited on while it was shared.
    pub fn into_inner(self) -> Child {
        self.child.into_inner().unwrap()
    }

    /// Take the child's
    /// [`stdin`](https://doc.rust-lang.org/std/process/struct.Child.html#structfield.stdin)
    /// handle, if any.
    ///
    /// This will only return `Some` the first time it's called, and then only if the `Command`
    /// that created the child was configured with `.stdin(Stdio::piped())`.
    pub fn take_stdin(&self) -> Option<ChildStdin> {
        self.child.lock().unwrap().stdin.take()
    }

    /// Take the child's
    /// [`stdout`](https://doc.rust-lang.org/std/process/struct.Child.html#structfield.stdout)
    /// handle, if any.
    ///
    /// This will only return `Some` the first time it's called, and then only if the `Command`
    /// that created the child was configured with `.stdout(Stdio::piped())`.
    pub fn take_stdout(&self) -> Option<ChildStdout> {
        self.child.lock().unwrap().stdout.take()
    }

    /// Take the child's
    /// [`stderr`](https://doc.rust-lang.org/std/process/struct.Child.html#structfield.stderr)
    /// handle, if any.
    ///
    /// This will only return `Some` the first time it's called, and then only if the `Command`
    /// that created the child was configured with `.stderr(Stdio::piped())`.
    pub fn take_stderr(&self) -> Option<ChildStderr> {
        self.child.lock().unwrap().stderr.take()
    }
}

// The waiting state of a SharedChild, stored behind `state_lock`.
#[derive(Debug)]
enum ChildState {
    // No thread is currently doing a blocking wait and the child has not been
    // reaped: either nobody has waited yet, or a previous waiter hit an error.
    // The next call to wait() takes over the blocking wait.
    NotWaiting,
    // Exactly one thread is blocked waiting for the child to exit. Other
    // wait() callers sleep on the condvar, and try_wait() returns Ok(None).
    Waiting,
    // The child has been reaped. The exit status is cached here and returned
    // by later wait()/try_wait() calls; kill() becomes a no-op.
    Exited(ExitStatus),
}

use crate::ChildState::*;

#[cfg(test)]
mod tests {
    use super::*;
    use std::error::Error;
    use std::process::{Command, Stdio};
    use std::sync::Arc;

    // A command that exits immediately with success.
    // Python isn't available on some Unix platforms, e.g. Android, so we need this instead.
    #[cfg(unix)]
    pub fn true_cmd() -> Command {
        Command::new("true")
    }

    #[cfg(not(unix))]
    pub fn true_cmd() -> Command {
        let mut cmd = Command::new("python");
        cmd.arg("-c").arg("");
        cmd
    }

    // A command that keeps running until it is killed.
    // Python isn't available on some Unix platforms, e.g. Android, so we need this instead.
    #[cfg(unix)]
    pub fn sleep_forever_cmd() -> Command {
        let mut cmd = Command::new("sleep");
        cmd.arg("1000000");
        cmd
    }

    #[cfg(not(unix))]
    pub fn sleep_forever_cmd() -> Command {
        let mut cmd = Command::new("python");
        cmd.arg("-c").arg("import time; time.sleep(1000000)");
        cmd
    }

    // A command that blocks reading stdin and exits successfully once stdin is closed.
    // Python isn't available on some Unix platforms, e.g. Android, so we need this instead.
    #[cfg(unix)]
    pub fn cat_cmd() -> Command {
        Command::new("cat")
    }

    #[cfg(not(unix))]
    pub fn cat_cmd() -> Command {
        let mut cmd = Command::new("python");
        // Read stdin to EOF, like `cat` does. An empty script would exit
        // immediately instead of staying alive until stdin is closed, which is
        // what test_new relies on.
        cmd.arg("-c").arg("import sys; sys.stdin.read()");
        cmd
    }

    #[test]
    fn test_wait() {
        let child = SharedChild::spawn(&mut true_cmd()).unwrap();
        // Test the id() function while we're at it.
        let id = child.id();
        assert!(id > 0);
        let status = child.wait().unwrap();
        assert_eq!(status.code().unwrap(), 0);
    }

    #[test]
    fn test_kill() {
        let child = SharedChild::spawn(&mut sleep_forever_cmd()).unwrap();
        child.kill().unwrap();
        let status = child.wait().unwrap();
        assert!(!status.success());
    }

    #[test]
    fn test_try_wait() {
        let child = SharedChild::spawn(&mut sleep_forever_cmd()).unwrap();
        let maybe_status = child.try_wait().unwrap();
        assert_eq!(maybe_status, None);
        child.kill().unwrap();
        // The child will handle that signal asynchronously, so we poll it
        // repeatedly until an exit status shows up.
        let status = loop {
            if let Some(status) = child.try_wait().unwrap() {
                break status;
            }
            std::thread::yield_now();
        };
        assert!(!status.success());
    }

    #[test]
    fn test_many_waiters() {
        let child = Arc::new(SharedChild::spawn(&mut sleep_forever_cmd()).unwrap());
        let mut threads = Vec::new();
        for _ in 0..10 {
            let clone = child.clone();
            threads.push(std::thread::spawn(move || clone.wait()));
        }
        child.kill().unwrap();
        for thread in threads {
            // Every waiter must return the status of the killed child.
            let status = thread.join().unwrap().unwrap();
            assert!(!status.success());
        }
    }

    #[test]
    fn test_waitid_after_exit_doesnt_hang() {
        // There are ominous reports (https://bugs.python.org/issue10812) of a
        // broken waitid implementation on OSX, which might hang forever if it
        // tries to wait on a child that's already exited.
        let child = true_cmd().spawn().unwrap();
        sys::wait_without_reaping(sys::get_handle(&child)).unwrap();
        // At this point the child has definitely exited. Wait again to test
        // that a second wait doesn't block.
        sys::wait_without_reaping(sys::get_handle(&child)).unwrap();
    }

    #[test]
    fn test_into_inner_before_wait() {
        let shared_child = SharedChild::spawn(&mut sleep_forever_cmd()).unwrap();
        let mut child = shared_child.into_inner();
        child.kill().unwrap();
        child.wait().unwrap();
    }

    #[test]
    fn test_into_inner_after_wait() {
        // This makes sure the child's inner state is valid. If we used waitpid
        // on the side, the inner child would try to wait again and cause an
        // error.
        let shared_child = SharedChild::spawn(&mut sleep_forever_cmd()).unwrap();
        shared_child.kill().unwrap();
        shared_child.wait().unwrap();
        let mut child = shared_child.into_inner();
        // The child has already been waited on. Older toolchains report an
        // error from kill() here, but since Rust 1.72 `Child::kill` returns
        // `Ok(())` for a child that has already exited. Accept both, and only
        // check the error kind when an error is actually returned.
        if let Err(kill_err) = child.kill() {
            let expected_kind = if cfg!(windows) {
                std::io::ErrorKind::PermissionDenied
            } else {
                std::io::ErrorKind::InvalidInput
            };
            assert_eq!(expected_kind, kill_err.kind());
        }
        // Either way, wait should succeed.
        child.wait().unwrap();
    }

    #[test]
    fn test_new() -> Result<(), Box<dyn Error>> {
        // Spawn a child that stays alive until its stdin is closed.
        let mut command = cat_cmd();
        command.stdin(Stdio::piped());
        command.stdout(Stdio::null());
        let mut child = command.spawn()?;
        let child_stdin = child.stdin.take().unwrap();

        // Construct a SharedChild from the Child, which has not yet been waited on. The child is
        // blocked on stdin, so we know it hasn't yet exited.
        let mut shared_child = SharedChild::new(child)?;
        assert!(matches!(
            *shared_child.state_lock.lock().unwrap(),
            NotWaiting,
        ));

        // Now close the child's stdin. This will cause the child to exit.
        drop(child_stdin);

        // Construct more SharedChild objects from the same child, in a loop. Eventually one of
        // them will notice that the child has exited.
        loop {
            shared_child = SharedChild::new(shared_child.into_inner())?;
            if let Exited(status) = &*shared_child.state_lock.lock().unwrap() {
                assert!(status.success());
                return Ok(());
            }
        }
    }

    #[test]
    fn test_takes() -> Result<(), Box<dyn Error>> {
        let mut command = true_cmd();
        command.stdin(Stdio::piped());
        command.stdout(Stdio::piped());
        command.stderr(Stdio::piped());
        let shared_child = SharedChild::spawn(&mut command)?;

        // The first take of each pipe yields the handle...
        assert!(shared_child.take_stdin().is_some());
        assert!(shared_child.take_stdout().is_some());
        assert!(shared_child.take_stderr().is_some());

        // ...and every later take yields nothing.
        assert!(shared_child.take_stdin().is_none());
        assert!(shared_child.take_stdout().is_none());
        assert!(shared_child.take_stderr().is_none());

        shared_child.wait()?;
        Ok(())
    }
}