ralph_workflow/interrupt/mod.rs
1//! Interrupt signal handling for graceful checkpoint save.
2//!
3//! This module provides signal handling for the Ralph pipeline, ensuring
4//! clean shutdown when the user interrupts with Ctrl+C.
5//!
6//! When an interrupt is received:
7//!
8//! - If the reducer event loop is running, the handler sets a global interrupt request
9//! flag and returns. The event loop consumes that flag and performs the reducer-driven
10//! termination sequence (`RestorePromptPermissions` -> `SaveCheckpoint` -> shutdown).
11//! - If the event loop is not running yet (early startup), the handler falls back to a
12//! best-effort checkpoint save and exits with the standard SIGINT code (130).
13//!
14//! ## Ctrl+C Exception for Safety Check
15//!
16//! The `interrupted_by_user` flag distinguishes user-initiated interrupts (Ctrl+C)
17//! from programmatic interrupts (`AwaitingDevFix` exhaustion, completion marker emission).
18//! When set to `true`, the pre-termination commit safety check is skipped because
19//! the user explicitly chose to interrupt execution. This respects user intent while
20//! ensuring all other termination paths commit uncommitted work before exiting.
21
22use std::sync::atomic::{AtomicBool, AtomicUsize, Ordering};
23use std::sync::Mutex;
24
25use std::path::Path;
26
27pub(crate) mod checkpoint;
28
29pub use checkpoint::InterruptContext;
30
/// Global interrupt context for checkpoint saving on interrupt.
///
/// Installed by `set_interrupt_context()` and removed again by
/// `clear_interrupt_context()`; `None` means no context is currently installed.
/// The Ctrl+C handler clones the stored value to save a checkpoint when the
/// event loop is not running, and `remove_repo_root_ralph_dir_via_std_fs()`
/// reads the workspace root from it.
pub(crate) static INTERRUPT_CONTEXT: Mutex<Option<InterruptContext>> = Mutex::new(None);

/// True when a user interrupt (SIGINT / Ctrl+C) has been requested.
///
/// Set by `request_user_interrupt()`, observed without clearing by
/// `is_user_interrupt_requested()`, and cleared by `take_user_interrupt_request()`.
/// The reducer event loop consumes it and transitions the pipeline to an
/// Interrupted state so termination effects (`RestorePromptPermissions`,
/// `SaveCheckpoint`) execute deterministically.
static USER_INTERRUPT_REQUESTED: AtomicBool = AtomicBool::new(false);

/// True once a user interrupt has occurred during this process lifetime.
///
/// Unlike `USER_INTERRUPT_REQUESTED`, this flag is not cleared when the event
/// loop consumes the pending interrupt request via `take_user_interrupt_request()`.
/// The only code in this module that resets it is the `#[cfg(test)]` helper
/// `reset_user_interrupted_occurred()`. Use this flag in shutdown code paths
/// (e.g., `capture_git_state`) where you need to know whether the process is
/// shutting down due to Ctrl+C, even after the pending request has been consumed.
static USER_INTERRUPTED_OCCURRED: AtomicBool = AtomicBool::new(false);

/// True while the reducer event loop is running.
///
/// Set to `true` by `event_loop_active_guard()` and back to `false` when the
/// returned `EventLoopActiveGuard` is dropped.
///
/// When true, the Ctrl+C handler must NOT call `process::exit()` on the first
/// interrupt. Instead it requests interruption and lets the event loop drive:
/// - `RestorePromptPermissions`
/// - `SaveCheckpoint`
/// - orderly shutdown
pub(crate) static EVENT_LOOP_ACTIVE: AtomicBool = AtomicBool::new(false);

/// Number of SIGINTs received while the reducer event loop is active.
///
/// Incremented by `register_sigint_during_active_event_loop()` and reset to zero
/// both when an `EventLoopActiveGuard` is created and when it is dropped.
///
/// First Ctrl+C requests graceful reducer-driven shutdown.
/// Second Ctrl+C forces immediate process exit to avoid indefinite hangs.
pub(crate) static EVENT_LOOP_ACTIVE_SIGINT_COUNT: AtomicUsize = AtomicUsize::new(0);
67
/// Deferred "exit with status 130" request.
///
/// The pipeline runner deliberately avoids calling `process::exit(130)` itself,
/// since exiting from inside the runner would skip Rust destructors (RAII cleanup
/// such as `AgentPhaseGuard::drop()`). It raises this flag instead, and the exit
/// is expected to be performed by `main()` once the stack has unwound and
/// cleanup has finished.
static EXIT_130_AFTER_RUN: AtomicBool = AtomicBool::new(false);

/// Ask for the process to terminate with exit code 130 after the pipeline returns.
///
/// Only raises the flag; nothing exits here. Calling it more than once has the
/// same effect as calling it once.
pub fn request_exit_130_after_run() {
    EXIT_130_AFTER_RUN.store(true, Ordering::SeqCst);
}

/// Take the pending exit-130 request, if there is one.
///
/// Returns `true` when a request was pending. The flag is cleared in the same
/// atomic operation, so a second call returns `false` until a new request is made.
pub fn take_exit_130_after_run() -> bool {
    EXIT_130_AFTER_RUN.swap(false, Ordering::SeqCst)
}
85
86#[cfg(unix)]
87fn restore_prompt_md_writable_via_std_fs() {
88 use std::os::unix::fs::PermissionsExt;
89
90 fn make_writable(path: &std::path::Path) -> bool {
91 let Ok(metadata) = std::fs::metadata(path) else {
92 return false;
93 };
94
95 let mut perms = metadata.permissions();
96 // Preserve existing mode bits but ensure owner write is enabled.
97 perms.set_mode(perms.mode() | 0o200);
98 std::fs::set_permissions(path, perms).is_ok()
99 }
100
101 // Fast path: current working directory is already the repo root in normal runs.
102 if make_writable(std::path::Path::new("PROMPT.md")) {
103 return;
104 }
105
106 // Fallback: discover repo root.
107 let Ok(repo_root) = crate::git_helpers::get_repo_root() else {
108 return;
109 };
110
111 let prompt_path = repo_root.join("PROMPT.md");
112 let _ = make_writable(&prompt_path);
113}
114
115fn remove_repo_root_ralph_dir_via_std_fs() {
116 let repo_root = INTERRUPT_CONTEXT
117 .lock()
118 .unwrap_or_else(std::sync::PoisonError::into_inner)
119 .as_ref()
120 .map(|context| context.workspace.root().to_path_buf())
121 .or_else(|| crate::git_helpers::get_repo_root().ok());
122
123 if let Some(repo_root) = repo_root {
124 let _ = std::fs::remove_dir_all(repo_root.join(".git/ralph"));
125 }
126}
127
/// Non-Unix counterpart of the `PROMPT.md` permission restore: does nothing.
///
/// It exists so callers can invoke `restore_prompt_md_writable_via_std_fs()`
/// unconditionally on every target.
#[cfg(not(unix))]
fn restore_prompt_md_writable_via_std_fs() {
    // Intentionally empty.
}
130
131/// RAII guard that marks the reducer event loop as active.
132pub struct EventLoopActiveGuard;
133
134impl Drop for EventLoopActiveGuard {
135 fn drop(&mut self) {
136 EVENT_LOOP_ACTIVE.store(false, Ordering::SeqCst);
137 EVENT_LOOP_ACTIVE_SIGINT_COUNT.store(0, Ordering::SeqCst);
138 }
139}
140
141/// Mark the reducer event loop as active for the duration of the returned guard.
142pub fn event_loop_active_guard() -> EventLoopActiveGuard {
143 EVENT_LOOP_ACTIVE_SIGINT_COUNT.store(0, Ordering::SeqCst);
144 EVENT_LOOP_ACTIVE.store(true, Ordering::SeqCst);
145 EventLoopActiveGuard
146}
147
148fn is_event_loop_active() -> bool {
149 EVENT_LOOP_ACTIVE.load(Ordering::SeqCst)
150}
151
152pub(crate) fn register_sigint_during_active_event_loop() -> bool {
153 // Returns true on second (or later) SIGINT while event loop is active.
154 let count = EVENT_LOOP_ACTIVE_SIGINT_COUNT.fetch_add(1, Ordering::SeqCst) + 1;
155 count >= 2
156}
157
158/// Request that the running pipeline treat the run as user-interrupted.
159///
160/// This is called by the Ctrl+C handler. The event loop is responsible for
161/// consuming the request and translating it into a reducer-visible transition.
162///
163/// Also sets the persistent `USER_INTERRUPTED_OCCURRED` flag, which is never
164/// cleared and allows shutdown code paths (e.g., `capture_git_state`) to
165/// detect the interrupt even after the event loop has consumed the pending
166/// request via `take_user_interrupt_request()`.
167pub fn request_user_interrupt() {
168 USER_INTERRUPT_REQUESTED.store(true, Ordering::SeqCst);
169 USER_INTERRUPTED_OCCURRED.store(true, Ordering::SeqCst);
170}
171
172/// Check if a user interrupt has occurred at any point during this process lifetime.
173///
174/// Returns true once a Ctrl+C has been received, and remains true for the rest
175/// of the process lifetime even after `take_user_interrupt_request()` has consumed
176/// the pending request.
177///
178/// Use this in shutdown code paths where you need to know whether the process
179/// is shutting down due to user interruption, even if the event loop has already
180/// consumed the interrupt request. For example, `capture_git_state` uses this
181/// to skip git commands that could hang indefinitely during interrupt-triggered
182/// shutdown.
183pub fn user_interrupted_occurred() -> bool {
184 USER_INTERRUPTED_OCCURRED.load(Ordering::SeqCst)
185}
186
187/// Check if a user interrupt request is pending without consuming it.
188///
189/// Returns true if an interrupt is pending. The flag remains set so that
190/// the event loop can still consume it via `take_user_interrupt_request()`.
191///
192/// Use this when you need to react to an interrupt (e.g., kill a subprocess)
193/// without stealing the flag from the event loop's per-iteration check.
194pub fn is_user_interrupt_requested() -> bool {
195 USER_INTERRUPT_REQUESTED.load(Ordering::SeqCst)
196}
197
198/// Consume a pending user interrupt request.
199///
200/// Returns true if an interrupt was pending.
201pub fn take_user_interrupt_request() -> bool {
202 USER_INTERRUPT_REQUESTED.swap(false, Ordering::SeqCst)
203}
204
/// Clear the persistent "a user interrupt happened" flag.
///
/// Compiled only for test builds. Tests that exercise interrupt behavior use it
/// to return to a clean state between cases; production code has no access to it.
#[cfg(test)]
pub fn reset_user_interrupted_occurred() {
    USER_INTERRUPTED_OCCURRED.store(false, Ordering::SeqCst);
}
213
/// Test-only mutex that serializes access to the process-global interrupt flags.
///
/// The interrupt flags are `static` atomics shared by the whole process, and Rust
/// unit tests run in parallel by default. Tests that call
/// `request_user_interrupt()`, `take_user_interrupt_request()`, or
/// `reset_user_interrupted_occurred()` would otherwise interfere with each other.
///
/// Hold the lock for the whole duration of any test that:
/// - sets or consumes the interrupt request flag, or
/// - needs the interrupt flags to stay in a known state while it runs.
///
/// Production code must not use this.
#[cfg(test)]
static TEST_INTERRUPT_LOCK: Mutex<()> = Mutex::new(());

/// Acquire the test-only interrupt lock, blocking until it is free.
///
/// A poisoned lock (left behind by a test that panicked while holding it) is
/// recovered rather than propagated, so one failing test does not break the rest.
#[cfg(test)]
pub(crate) fn interrupt_test_lock() -> std::sync::MutexGuard<'static, ()> {
    match TEST_INTERRUPT_LOCK.lock() {
        Ok(guard) => guard,
        Err(poisoned) => poisoned.into_inner(),
    }
}
235
236/// Set the global interrupt context.
237///
238/// This function should be called during pipeline initialization to
239/// provide the interrupt handler with the context needed to save
240/// a checkpoint when interrupted.
241///
242/// # Arguments
243///
244/// * `context` - The interrupt context to store
245///
246/// # Note
247///
248/// This function is typically called at the start of `run_pipeline()`
249/// to ensure the interrupt handler has the most up-to-date context.
250pub fn set_interrupt_context(context: InterruptContext) {
251 let mut ctx = INTERRUPT_CONTEXT.lock().unwrap_or_else(|poison| {
252 // If mutex is poisoned, recover the guard and clear the state
253 poison.into_inner()
254 });
255 *ctx = Some(context);
256}
257
258/// Clear the global interrupt context.
259///
260/// This should be called when the pipeline completes successfully
261/// to prevent saving an interrupt checkpoint after normal completion.
262pub fn clear_interrupt_context() {
263 let mut ctx = INTERRUPT_CONTEXT.lock().unwrap_or_else(|poison| {
264 // If mutex is poisoned, recover the guard and clear the state
265 poison.into_inner()
266 });
267 *ctx = None;
268}
269
270/// Set up the interrupt handler for graceful shutdown with checkpoint saving.
271///
272/// This function registers a SIGINT handler that will:
273/// 1. Save a checkpoint with the current pipeline state
274/// 2. Clean up generated files
275/// 3. Exit gracefully
276///
277/// Call this early in `main()` after initializing the pipeline context.
278pub fn setup_interrupt_handler() {
279 let install = ctrlc::set_handler(|| {
280 request_user_interrupt();
281
282 // If the reducer event loop is running, do not exit here.
283 // The event loop will observe the request, restore permissions, and checkpoint.
284 if is_event_loop_active() {
285 if register_sigint_during_active_event_loop() {
286 eprintln!("\nSecond interrupt received; forcing immediate exit.");
287 restore_prompt_md_writable_via_std_fs();
288 eprintln!("Cleaning up...");
289 crate::git_helpers::cleanup_agent_phase_silent();
290 remove_repo_root_ralph_dir_via_std_fs();
291 std::process::exit(130);
292 }
293
294 eprintln!(
295 "\nInterrupt received; requesting graceful shutdown (waiting for checkpoint)..."
296 );
297 return;
298 }
299
300 eprintln!("\nInterrupt received; saving checkpoint...");
301
302 // Clone the entire context (small, Arc-backed) and then perform I/O without
303 // holding the mutex.
304 let context = {
305 let ctx = INTERRUPT_CONTEXT
306 .lock()
307 .unwrap_or_else(std::sync::PoisonError::into_inner);
308 ctx.clone()
309 };
310
311 if let Some(ref context) = context {
312 if let Err(e) = checkpoint::save_interrupt_checkpoint(context) {
313 eprintln!("Warning: Failed to save checkpoint: {e}");
314 } else {
315 eprintln!("Checkpoint saved. Resume with: ralph --resume");
316 }
317 }
318
319 // Best-effort: restore PROMPT.md permissions so we don't leave the repo locked.
320 // This is primarily for early-interrupt cases before the reducer event loop starts.
321 //
322 // Always attempt a std::fs fallback using repo discovery. This covers:
323 // - interrupt context not yet installed (very early SIGINT)
324 // - workspace implementations that cannot mutate real filesystem permissions
325 // (e.g., MemoryWorkspace)
326 restore_prompt_md_writable_via_std_fs();
327
328 if let Some(ref context) = context {
329 let _ = context.workspace.set_writable(Path::new("PROMPT.md"));
330 }
331
332 eprintln!("Cleaning up...");
333 crate::git_helpers::cleanup_agent_phase_silent();
334 remove_repo_root_ralph_dir_via_std_fs();
335 std::process::exit(130); // Standard exit code for SIGINT
336 });
337
338 if let Err(e) = install {
339 // Handler installation failure is a reliability issue: without it, Ctrl+C will not
340 // trigger checkpointing/cleanup and can leave the repo in a broken state.
341 eprintln!("Warning: failed to install Ctrl+C handler: {e}");
342 }
343}
344
// Unit tests for this module. The test code lives in the sibling file `tests.rs`
// and is spliced into this module body by `include!`; the module is compiled only
// for test builds.
#[cfg(test)]
mod tests {
    include!("tests.rs");
}
348}