oxur_repl/metrics/
subprocess.rs

1//! Subprocess metrics for restart tracking and lifecycle monitoring
2//!
3//! Provides:
4//! - [`RestartReason`]: Categorized restart reasons from exit status
5//! - [`SubprocessMetrics`]: Metrics recorder for subprocess lifecycle
6
7use metrics::{counter, gauge};
8use std::process::ExitStatus;
9use std::time::Instant;
10
11/// Categorized reasons for subprocess restart.
12///
13/// Determined from the subprocess exit status, distinguishing between
14/// clean exits, error exits, and signal-based terminations.
15///
16/// # Detection Methods
17///
18/// | Variant | Detection |
19/// |---------|-----------|
20/// | `UserRequested` | Explicit restart command |
21/// | `CleanShutdown` | `exit(0)` |
22/// | `ErrorExit(n)` | `exit(n)` where n ≠ 0 |
23/// | `Segfault` | SIGSEGV (signal 11) |
24/// | `Killed` | SIGKILL (signal 9), possibly OOM |
25/// | `Aborted` | SIGABRT (signal 6) |
26/// | `Terminated` | SIGTERM (signal 15) |
27/// | `SignalOther(n)` | Other signal |
28/// | `Unknown` | No exit code or signal available |
29#[derive(Debug, Clone, Copy, PartialEq, Eq, serde::Serialize, serde::Deserialize)]
30pub enum RestartReason {
31    /// User explicitly requested restart via command
32    UserRequested,
33    /// Clean shutdown with exit code 0
34    CleanShutdown,
35    /// Error exit with non-zero code
36    ErrorExit(i32),
37    /// Segmentation fault (SIGSEGV, signal 11)
38    Segfault,
39    /// Killed (SIGKILL, signal 9) - possibly OOM
40    Killed,
41    /// Aborted (SIGABRT, signal 6) - typically assertion failure
42    Aborted,
43    /// Terminated (SIGTERM, signal 15) - graceful shutdown request
44    Terminated,
45    /// Other signal termination
46    SignalOther(i32),
47    /// Unknown reason (no exit code or signal)
48    Unknown,
49}
50
51impl RestartReason {
52    /// Determine restart reason from process exit status.
53    ///
54    /// On Unix, checks for signal termination first, then exit code.
55    /// On other platforms, only exit code is available.
56    ///
57    /// # Example
58    ///
59    /// ```no_run
60    /// use std::process::Command;
61    /// use oxur_repl::metrics::RestartReason;
62    ///
63    /// let mut child = Command::new("ls").spawn().unwrap();
64    /// let status = child.wait().unwrap();
65    /// let reason = RestartReason::from_exit_status(status);
66    /// ```
67    pub fn from_exit_status(status: ExitStatus) -> Self {
68        #[cfg(unix)]
69        {
70            use std::os::unix::process::ExitStatusExt;
71            if let Some(signal) = status.signal() {
72                return match signal {
73                    6 => Self::Aborted,
74                    9 => Self::Killed,
75                    11 => Self::Segfault,
76                    15 => Self::Terminated,
77                    s => Self::SignalOther(s),
78                };
79            }
80        }
81
82        match status.code() {
83            Some(0) => Self::CleanShutdown,
84            Some(n) => Self::ErrorExit(n),
85            None => Self::Unknown,
86        }
87    }
88
89    /// Get the metrics label for this restart reason.
90    ///
91    /// Returns a static string suitable for use as a metrics label value.
92    pub fn as_label(&self) -> &'static str {
93        match self {
94            Self::UserRequested => "user_requested",
95            Self::CleanShutdown => "clean_shutdown",
96            Self::ErrorExit(_) => "error_exit",
97            Self::Segfault => "segfault",
98            Self::Killed => "killed",
99            Self::Aborted => "aborted",
100            Self::Terminated => "terminated",
101            Self::SignalOther(_) => "signal_other",
102            Self::Unknown => "unknown",
103        }
104    }
105
106    /// Check if this is a signal-based termination.
107    pub fn is_signal(&self) -> bool {
108        matches!(
109            self,
110            Self::Segfault | Self::Killed | Self::Aborted | Self::Terminated | Self::SignalOther(_)
111        )
112    }
113
114    /// Check if this is a clean shutdown.
115    pub fn is_clean(&self) -> bool {
116        matches!(self, Self::CleanShutdown | Self::UserRequested)
117    }
118}
119
120impl std::fmt::Display for RestartReason {
121    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
122        match self {
123            Self::UserRequested => write!(f, "user requested"),
124            Self::CleanShutdown => write!(f, "clean shutdown"),
125            Self::ErrorExit(code) => write!(f, "error exit (code {})", code),
126            Self::Segfault => write!(f, "segmentation fault (SIGSEGV)"),
127            Self::Killed => write!(f, "killed (SIGKILL)"),
128            Self::Aborted => write!(f, "aborted (SIGABRT)"),
129            Self::Terminated => write!(f, "terminated (SIGTERM)"),
130            Self::SignalOther(sig) => write!(f, "signal {}", sig),
131            Self::Unknown => write!(f, "unknown"),
132        }
133    }
134}
135
136/// Subprocess metrics recorder.
137///
138/// Tracks subprocess lifecycle including:
139/// - Restart counts by reason
140/// - Current uptime
141/// - Start time tracking
142///
143/// Maintains local state for `(stats subprocess)` display while also emitting
144/// to the `metrics` crate facade for external monitoring.
145///
146/// # Usage
147///
148/// ```
149/// use oxur_repl::metrics::{SubprocessMetrics, RestartReason};
150///
151/// let mut metrics = SubprocessMetrics::new();
152///
153/// // Record process start
154/// metrics.process_started();
155///
156/// // On subprocess death, record reason
157/// metrics.record_restart(RestartReason::ErrorExit(1));
158///
159/// // Get current uptime
160/// let uptime = metrics.uptime_seconds();
161///
162/// // Get snapshot for display
163/// let snapshot = metrics.snapshot();
164/// ```
165#[derive(Debug)]
166pub struct SubprocessMetrics {
167    started_at: Option<Instant>,
168    restart_count: u64,
169    last_restart_reason: Option<RestartReason>,
170}
171
172/// Snapshot of subprocess metrics for display.
173#[derive(Debug, Clone, PartialEq, serde::Serialize, serde::Deserialize)]
174pub struct SubprocessMetricsSnapshot {
175    /// Current uptime in seconds
176    pub uptime_seconds: f64,
177    /// Total restart count
178    pub restart_count: u64,
179    /// Last restart reason (if any)
180    pub last_restart_reason: Option<RestartReason>,
181    /// Whether the subprocess is currently running
182    pub is_running: bool,
183}
184
185impl SubprocessMetrics {
186    /// Create a new SubprocessMetrics instance.
187    pub fn new() -> Self {
188        Self { started_at: None, restart_count: 0, last_restart_reason: None }
189    }
190
191    /// Record that the subprocess has started.
192    ///
193    /// Resets the uptime tracking and updates the `repl.subprocess.uptime_seconds` gauge.
194    pub fn process_started(&mut self) {
195        self.started_at = Some(Instant::now());
196        gauge!("repl.subprocess.uptime_seconds").set(0.0);
197    }
198
199    /// Record a subprocess restart with the given reason.
200    ///
201    /// Increments `repl.subprocess.restarts_total` counter with reason label.
202    ///
203    /// # Arguments
204    ///
205    /// * `reason` - The categorized reason for the restart
206    pub fn record_restart(&mut self, reason: RestartReason) {
207        self.restart_count += 1;
208        self.last_restart_reason = Some(reason);
209        counter!("repl.subprocess.restarts_total", "reason" => reason.as_label()).increment(1);
210    }
211
212    /// Get the current uptime in seconds.
213    ///
214    /// Returns 0.0 if the subprocess hasn't been started.
215    pub fn uptime_seconds(&self) -> f64 {
216        self.started_at.map(|start| start.elapsed().as_secs_f64()).unwrap_or(0.0)
217    }
218
219    /// Update the uptime gauge with the current value.
220    ///
221    /// Should be called periodically to keep the gauge current.
222    pub fn update_uptime_gauge(&self) {
223        gauge!("repl.subprocess.uptime_seconds").set(self.uptime_seconds());
224    }
225
226    /// Get the total restart count.
227    pub fn restart_count(&self) -> u64 {
228        self.restart_count
229    }
230
231    /// Get the last restart reason.
232    pub fn last_restart_reason(&self) -> Option<RestartReason> {
233        self.last_restart_reason
234    }
235
236    /// Check if the subprocess is currently running.
237    pub fn is_running(&self) -> bool {
238        self.started_at.is_some()
239    }
240
241    /// Get a snapshot of current metrics for display.
242    pub fn snapshot(&self) -> SubprocessMetricsSnapshot {
243        SubprocessMetricsSnapshot {
244            uptime_seconds: self.uptime_seconds(),
245            restart_count: self.restart_count,
246            last_restart_reason: self.last_restart_reason,
247            is_running: self.is_running(),
248        }
249    }
250}
251
252impl Default for SubprocessMetrics {
253    fn default() -> Self {
254        Self::new()
255    }
256}
257
#[cfg(test)]
mod tests {
    use super::*;

    // Each variant must map to its own stable metrics label.
    #[test]
    fn test_restart_reason_labels() {
        assert_eq!(RestartReason::UserRequested.as_label(), "user_requested");
        assert_eq!(RestartReason::CleanShutdown.as_label(), "clean_shutdown");
        assert_eq!(RestartReason::ErrorExit(1).as_label(), "error_exit");
        assert_eq!(RestartReason::Segfault.as_label(), "segfault");
        assert_eq!(RestartReason::Killed.as_label(), "killed");
        assert_eq!(RestartReason::Aborted.as_label(), "aborted");
        assert_eq!(RestartReason::Terminated.as_label(), "terminated");
        assert_eq!(RestartReason::SignalOther(99).as_label(), "signal_other");
        assert_eq!(RestartReason::Unknown.as_label(), "unknown");
    }

    // Only the signal-derived variants count as signal terminations.
    #[test]
    fn test_restart_reason_is_signal() {
        assert!(!RestartReason::UserRequested.is_signal());
        assert!(!RestartReason::CleanShutdown.is_signal());
        assert!(!RestartReason::ErrorExit(1).is_signal());
        assert!(RestartReason::Segfault.is_signal());
        assert!(RestartReason::Killed.is_signal());
        assert!(RestartReason::Aborted.is_signal());
        assert!(RestartReason::Terminated.is_signal());
        assert!(RestartReason::SignalOther(99).is_signal());
        assert!(!RestartReason::Unknown.is_signal());
    }

    // Only a user request and an exit with code 0 are clean; every other
    // variant is checked explicitly so none can drift unnoticed.
    #[test]
    fn test_restart_reason_is_clean() {
        assert!(RestartReason::UserRequested.is_clean());
        assert!(RestartReason::CleanShutdown.is_clean());
        assert!(!RestartReason::ErrorExit(1).is_clean());
        assert!(!RestartReason::Segfault.is_clean());
        assert!(!RestartReason::Killed.is_clean());
        assert!(!RestartReason::Aborted.is_clean());
        assert!(!RestartReason::Terminated.is_clean());
        assert!(!RestartReason::SignalOther(99).is_clean());
        assert!(!RestartReason::Unknown.is_clean());
    }

    // Display text is checked for all nine variants, including payloads.
    #[test]
    fn test_restart_reason_display() {
        assert_eq!(format!("{}", RestartReason::UserRequested), "user requested");
        assert_eq!(format!("{}", RestartReason::CleanShutdown), "clean shutdown");
        assert_eq!(format!("{}", RestartReason::ErrorExit(42)), "error exit (code 42)");
        assert_eq!(format!("{}", RestartReason::Segfault), "segmentation fault (SIGSEGV)");
        assert_eq!(format!("{}", RestartReason::Killed), "killed (SIGKILL)");
        assert_eq!(format!("{}", RestartReason::Aborted), "aborted (SIGABRT)");
        assert_eq!(format!("{}", RestartReason::Terminated), "terminated (SIGTERM)");
        assert_eq!(format!("{}", RestartReason::SignalOther(2)), "signal 2");
        assert_eq!(format!("{}", RestartReason::Unknown), "unknown");
    }

    // Exit statuses are built from raw wait statuses: the exit code lives in
    // bits 8..16 and a terminating signal in the low 7 bits.
    #[cfg(unix)]
    #[test]
    fn test_restart_reason_from_exit_status() {
        use std::os::unix::process::ExitStatusExt;
        use std::process::ExitStatus;

        let classify = |raw: i32| RestartReason::from_exit_status(ExitStatus::from_raw(raw));

        assert_eq!(classify(0), RestartReason::CleanShutdown);
        assert_eq!(classify(1 << 8), RestartReason::ErrorExit(1));
        assert_eq!(classify(42 << 8), RestartReason::ErrorExit(42));
        assert_eq!(classify(6), RestartReason::Aborted);
        assert_eq!(classify(9), RestartReason::Killed);
        assert_eq!(classify(11), RestartReason::Segfault);
        assert_eq!(classify(15), RestartReason::Terminated);
        assert_eq!(classify(2), RestartReason::SignalOther(2));
    }

    // A fresh recorder has no uptime, no restarts and no recorded start.
    #[test]
    fn test_subprocess_metrics_creation() {
        let metrics = SubprocessMetrics::new();
        assert_eq!(metrics.uptime_seconds(), 0.0);
        assert_eq!(metrics.restart_count(), 0);
        assert_eq!(metrics.last_restart_reason(), None);
        assert!(!metrics.is_running());
    }

    // `Default` must yield the same initial state as `new`.
    #[test]
    fn test_subprocess_metrics_default() {
        let metrics = SubprocessMetrics::default();
        assert_eq!(metrics.uptime_seconds(), 0.0);
        assert_eq!(metrics.restart_count(), 0);
        assert_eq!(metrics.last_restart_reason(), None);
        assert!(!metrics.is_running());
    }

    #[test]
    fn test_subprocess_metrics_lifecycle() {
        let mut metrics = SubprocessMetrics::new();

        // Start process
        metrics.process_started();
        assert!(metrics.is_running());
        assert!(metrics.uptime_seconds() >= 0.0);

        // Record restart
        metrics.record_restart(RestartReason::ErrorExit(1));
        assert_eq!(metrics.restart_count(), 1);
        assert_eq!(metrics.last_restart_reason(), Some(RestartReason::ErrorExit(1)));

        // Start again
        metrics.process_started();
        metrics.record_restart(RestartReason::Segfault);
        assert_eq!(metrics.restart_count(), 2);
        assert_eq!(metrics.last_restart_reason(), Some(RestartReason::Segfault));
    }

    // The snapshot must mirror the recorder's accessors at the time of the call.
    #[test]
    fn test_subprocess_metrics_snapshot() {
        let mut metrics = SubprocessMetrics::new();
        assert_eq!(
            metrics.snapshot(),
            SubprocessMetricsSnapshot {
                uptime_seconds: 0.0,
                restart_count: 0,
                last_restart_reason: None,
                is_running: false,
            }
        );

        metrics.process_started();
        metrics.record_restart(RestartReason::Killed);

        let snapshot = metrics.snapshot();
        assert!(snapshot.uptime_seconds >= 0.0);
        assert_eq!(snapshot.restart_count, 1);
        assert_eq!(snapshot.last_restart_reason, Some(RestartReason::Killed));
        assert_eq!(snapshot.is_running, metrics.is_running());
    }
}