Skip to main content

libdd_common/unix_utils/
process.rs

1// Copyright 2025-Present Datadog, Inc. https://www.datadoghq.com/
2// SPDX-License-Identifier: Apache-2.0
3
4use libc::{_exit, nfds_t, poll, pollfd, EXIT_FAILURE, POLLHUP};
5use nix::sys::wait::{waitpid, WaitPidFlag, WaitStatus};
6use nix::unistd::Pid;
7use std::os::fd::RawFd;
8
9use crate::timeout::TimeoutManager;
10
11#[derive(Debug, Eq, PartialEq, thiserror::Error)]
12pub enum ReapError {
13    #[error("Timeout waiting for child process to exit")]
14    Timeout,
15    #[error("Error waiting for child process to exit: {0}")]
16    WaitError(#[from] nix::Error),
17}
18
19#[derive(Debug, Eq, PartialEq, thiserror::Error)]
20pub enum PollError {
21    #[error("Poll failed with errno: {0}")]
22    PollError(i32),
23    #[error("Poll returned unexpected result: revents = {0}")]
24    UnexpectedResult(i16),
25}
26
27/// Non-blocking child reaper
28/// * If the child process has exited, return true
29/// * If the child process cannot be found, return false
30/// * If the child is still alive, or some other error occurs, return an error Either way, after
31///   this returns, you probably don't have to do anything else.
32// Note: some resources indicate it is unsafe to call `waitpid` from a signal handler, especially
33//       on macos, where the OS will terminate an offending process.  This appears to be untrue
34//       and `waitpid()` is characterized as async-signal safe by POSIX.
35pub fn reap_child_non_blocking(
36    pid: Pid,
37    timeout_manager: &TimeoutManager,
38) -> Result<bool, ReapError> {
39    loop {
40        match waitpid(pid, Some(WaitPidFlag::WNOHANG)) {
41            Ok(WaitStatus::StillAlive) => {
42                if timeout_manager.elapsed() > timeout_manager.timeout() {
43                    return Err(ReapError::Timeout);
44                }
45                // TODO, this is currently a busy loop.  Consider sleeping for a short time.
46            }
47            Ok(_status) => return Ok(true),
48            Err(nix::Error::ECHILD) => {
49                // Non-availability of the specified process is weird, since we should have
50                // exclusive access to reaping its exit, but at the very least means there is
51                // nothing further for us to do.
52                return Ok(false);
53            }
54            Err(e) => return Err(ReapError::WaitError(e)),
55        }
56    }
57}
58
59/// Kills the program without raising an abort or calling at_exit
60pub fn terminate() -> ! {
61    // Safety: No preconditions
62    unsafe { _exit(EXIT_FAILURE) }
63}
64
65/// true if successful wait, false if timeout occurred.
66pub fn wait_for_pollhup(
67    target_fd: RawFd,
68    timeout_manager: &TimeoutManager,
69) -> Result<bool, PollError> {
70    let mut poll_fds = [pollfd {
71        fd: target_fd,
72        events: POLLHUP,
73        revents: 0,
74    }];
75
76    loop {
77        let timeout_ms = timeout_manager.remaining().as_millis() as i32;
78        let poll_result =
79            unsafe { poll(poll_fds.as_mut_ptr(), poll_fds.len() as nfds_t, timeout_ms) };
80        match poll_result {
81            -1 => {
82                match nix::Error::last_raw() {
83                    libc::EAGAIN | libc::EINTR => {
84                        // Retry on EAGAIN or EINTR
85                        continue;
86                    }
87                    errno => return Err(PollError::PollError(errno)),
88                }
89            }
90            0 => return Ok(false), // Timeout occurred
91            _ => {
92                let revents = poll_fds[0].revents;
93                if revents & POLLHUP != 0 {
94                    return Ok(true); // POLLHUP detected
95                } else {
96                    return Err(PollError::UnexpectedResult(revents));
97                }
98            }
99        }
100    }
101}
102
#[cfg(test)]
mod tests {
    use super::*;
    use std::process::{Child, Command};
    use std::time::Duration;

    /// Converts the OS-assigned id of a spawned child into a `nix` [`Pid`].
    fn pid_of(child: &Child) -> Pid {
        // Process ids handed out by the OS fit in `pid_t`, so this cast does not truncate.
        Pid::from_raw(child.id() as i32)
    }

    /// A child that outlives the timeout must be reported as `ReapError::Timeout`.
    #[cfg_attr(miri, ignore)] // miri doesn't support waitpid
    #[test]
    fn test_reap_child_non_blocking_timeout() {
        // `sleep 5` runs far longer than the 10ms budget given to the reaper.
        let mut child = Command::new("sleep")
            .arg("5")
            .spawn()
            .expect("failed to spawn `sleep`");

        let timeout = Duration::from_millis(10);
        let manager = TimeoutManager::new(timeout);
        let result = reap_child_non_blocking(pid_of(&child), &manager);

        // Clean up before asserting so a failed assertion does not leak the child.
        let _ = child.kill();
        let _ = child.wait();

        assert_eq!(result, Err(ReapError::Timeout));
    }

    /// A child that exits within the timeout must be reaped and reported as `Ok(true)`.
    #[cfg_attr(miri, ignore)] // miri doesn't support waitpid
    #[test]
    fn test_reap_child_non_blocking_exited_child() {
        // `true` exits almost immediately, well inside the generous timeout.
        let mut child = Command::new("true")
            .spawn()
            .expect("failed to spawn `true`");

        let timeout = Duration::from_secs(5);
        let manager = TimeoutManager::new(timeout);
        let result = reap_child_non_blocking(pid_of(&child), &manager);

        // The child is normally already reaped at this point, in which case this wait just
        // fails and the error is ignored; it only matters if the reap above did not succeed.
        let _ = child.wait();

        assert_eq!(result, Ok(true));
    }

    /// A PID that is not one of our children yields `Ok(false)` (the `ECHILD` path).
    #[cfg_attr(miri, ignore)] // miri doesn't support waitpid
    #[test]
    fn test_reap_child_non_blocking_nonexistent_pid() {
        let timeout = Duration::from_secs(1);
        let manager = TimeoutManager::new(timeout);

        let result = reap_child_non_blocking(Pid::from_raw(99999), &manager);
        assert!(matches!(result, Ok(false)));
    }

    #[cfg_attr(miri, ignore)] // miri needs to support poll https://github.com/rust-lang/miri/issues/4413
    #[test]
    fn test_wait_for_pollhup_timeout() {
        let timeout = Duration::from_millis(10);
        let manager = TimeoutManager::new(timeout);

        // Use an invalid file descriptor to test timeout
        let result = wait_for_pollhup(-1, &manager);
        assert!(matches!(result, Ok(false)));
    }

    #[cfg_attr(miri, ignore)] // miri doesn't support poll
    #[test]
    fn test_wait_for_pollhup_invalid_fd() {
        let timeout = Duration::from_secs(1);
        let manager = TimeoutManager::new(timeout);

        // Use a positive, almost certainly invalid file descriptor
        let invalid_fd = 999_999;
        let result = wait_for_pollhup(invalid_fd, &manager);

        // Invalid FD should result in an error, not a timeout
        match result {
            Err(PollError::PollError(errno)) => {
                // Should be a valid errno (EBADF or similar)
                assert!(errno > 0);
            }
            Err(PollError::UnexpectedResult(revents)) => {
                println!("wait_for_pollhup({invalid_fd}, ..) returned UnexpectedResult({revents}) as allowed on this platform");
            }
            _ => panic!("Expected error for invalid FD, got: {result:?}"),
        }
    }
}