ralph_workflow/interrupt/mod.rs
1//! Interrupt signal handling for graceful checkpoint save.
2//!
3//! This module provides signal handling for the Ralph pipeline, ensuring
4//! clean shutdown when the user interrupts with Ctrl+C.
5//!
6//! When an interrupt is received:
7//!
8//! - If the reducer event loop is running, the handler sets a global interrupt request
9//! flag and returns. The event loop consumes that flag and performs the reducer-driven
10//! termination sequence (`RestorePromptPermissions` -> `SaveCheckpoint` -> shutdown).
11//! - If the event loop is not running yet (early startup), the handler falls back to a
12//! best-effort checkpoint save and exits with the standard SIGINT code (130).
13//!
14//! ## Ctrl+C Exception for Safety Check
15//!
16//! The `interrupted_by_user` flag distinguishes user-initiated interrupts (Ctrl+C)
17//! from programmatic interrupts (`AwaitingDevFix` exhaustion, completion marker emission).
18//! When set to `true`, the pre-termination commit safety check is skipped because
19//! the user explicitly chose to interrupt execution. This respects user intent while
20//! ensuring all other termination paths commit uncommitted work before exiting.
21
22use std::sync::atomic::{AtomicBool, AtomicUsize, Ordering};
23#[cfg(test)]
24use std::sync::Mutex;
25
26use std::path::Path;
27
28pub(crate) mod checkpoint;
29#[path = "runtime.rs"]
30mod handling;
31pub use checkpoint::InterruptContext;
32pub use handling::{clear_interrupt_context, get_interrupt_context, set_interrupt_context};
33
/// Pending user-interrupt request (SIGINT / Ctrl+C).
///
/// Raised by the signal handler and consumed by the reducer event loop, which
/// then moves the pipeline into an Interrupted state so that the termination
/// effects (`RestorePromptPermissions`, `SaveCheckpoint`) run deterministically.
static USER_INTERRUPT_REQUESTED: AtomicBool = AtomicBool::new(false);

/// Sticky record that a user interrupt happened during this process lifetime.
///
/// In contrast to `USER_INTERRUPT_REQUESTED`, consuming the pending request via
/// `take_user_interrupt_request()` does not clear this flag; only the test-only
/// `reset_user_interrupted_occurred()` resets it. Shutdown code paths (e.g.,
/// `capture_git_state`) read it to learn that the process is going down because
/// of Ctrl+C even after the pending request has been consumed.
static USER_INTERRUPTED_OCCURRED: AtomicBool = AtomicBool::new(false);

/// Set while the reducer event loop is running.
///
/// While this is true the Ctrl+C handler must NOT call `process::exit()`.
/// It only requests interruption and leaves the rest to the event loop:
/// - `RestorePromptPermissions`
/// - `SaveCheckpoint`
/// - orderly shutdown
pub(crate) static EVENT_LOOP_ACTIVE: AtomicBool = AtomicBool::new(false);

/// How many SIGINTs have arrived while the reducer event loop was active.
///
/// The first Ctrl+C asks for a graceful, reducer-driven shutdown.
/// A second Ctrl+C forces an immediate process exit so the run cannot hang
/// indefinitely.
pub(crate) static EVENT_LOOP_ACTIVE_SIGINT_COUNT: AtomicUsize = AtomicUsize::new(0);
64
/// Deferred "exit with code 130" request, honoured after the pipeline returns.
///
/// Calling `process::exit(130)` from inside the pipeline runner would skip Rust
/// destructors (RAII cleanup such as `AgentPhaseGuard::drop()`). The pipeline
/// therefore only records the wish here, and `main()` performs the real exit
/// once the stack has unwound and cleanup has finished.
static EXIT_130_AFTER_RUN: AtomicBool = AtomicBool::new(false);

/// Ask for the process to exit with code 130 after the pipeline has returned.
///
/// Only sets the flag; the exit itself is performed by whoever later calls
/// [`take_exit_130_after_run`].
pub fn request_exit_130_after_run() {
    EXIT_130_AFTER_RUN.store(true, Ordering::SeqCst);
}

/// Take (and clear) a pending exit-130 request.
///
/// Returns `true` if a request was pending. The flag is reset in the same
/// atomic step, so a second call returns `false` until a new request is made.
pub fn take_exit_130_after_run() -> bool {
    EXIT_130_AFTER_RUN.swap(false, Ordering::SeqCst)
}
82
83#[cfg(unix)]
84fn restore_prompt_md_writable_via_std_fs() {
85 // Fast path: current working directory is already the repo root in normal runs.
86 if handling::restore_prompt_md_writable(std::path::Path::new("PROMPT.md")) {
87 return;
88 }
89
90 // Fallback: discover repo root.
91 let Ok(repo_root) = crate::git_helpers::get_repo_root() else {
92 return;
93 };
94
95 let _ = handling::restore_prompt_md_writable_in_repo(&repo_root);
96}
97
98fn remove_repo_root_ralph_dir_via_std_fs() {
99 let repo_root = handling::get_interrupt_context()
100 .map(|ctx| ctx.workspace.root().to_path_buf())
101 .or_else(|| crate::git_helpers::get_repo_root().ok());
102
103 if let Some(repo_root) = repo_root {
104 handling::remove_ralph_dir(&repo_root);
105 }
106}
107
/// Non-Unix counterpart of the `PROMPT.md` permission restore: a no-op.
///
/// It exists so the interrupt handler can call the same function name on every
/// platform.
#[cfg(not(unix))]
fn restore_prompt_md_writable_via_std_fs() {
    // Intentionally empty.
}
110
111/// RAII guard that marks the reducer event loop as active.
112pub struct EventLoopActiveGuard;
113
114impl Drop for EventLoopActiveGuard {
115 fn drop(&mut self) {
116 EVENT_LOOP_ACTIVE.store(false, Ordering::SeqCst);
117 EVENT_LOOP_ACTIVE_SIGINT_COUNT.store(0, Ordering::SeqCst);
118 }
119}
120
121/// Mark the reducer event loop as active for the duration of the returned guard.
122pub fn event_loop_active_guard() -> EventLoopActiveGuard {
123 EVENT_LOOP_ACTIVE_SIGINT_COUNT.store(0, Ordering::SeqCst);
124 EVENT_LOOP_ACTIVE.store(true, Ordering::SeqCst);
125 EventLoopActiveGuard
126}
127
128fn is_event_loop_active() -> bool {
129 EVENT_LOOP_ACTIVE.load(Ordering::SeqCst)
130}
131
132pub(crate) fn register_sigint_during_active_event_loop() -> bool {
133 // Returns true on second (or later) SIGINT while event loop is active.
134 let count = EVENT_LOOP_ACTIVE_SIGINT_COUNT.fetch_add(1, Ordering::SeqCst) + 1;
135 count >= 2
136}
137
138/// Request that the running pipeline treat the run as user-interrupted.
139///
140/// This is called by the Ctrl+C handler. The event loop is responsible for
141/// consuming the request and translating it into a reducer-visible transition.
142///
143/// Also sets the persistent `USER_INTERRUPTED_OCCURRED` flag, which is never
144/// cleared and allows shutdown code paths (e.g., `capture_git_state`) to
145/// detect the interrupt even after the event loop has consumed the pending
146/// request via `take_user_interrupt_request()`.
147pub fn request_user_interrupt() {
148 USER_INTERRUPT_REQUESTED.store(true, Ordering::SeqCst);
149 USER_INTERRUPTED_OCCURRED.store(true, Ordering::SeqCst);
150}
151
152/// Check if a user interrupt has occurred at any point during this process lifetime.
153///
154/// Returns true once a Ctrl+C has been received, and remains true for the rest
155/// of the process lifetime even after `take_user_interrupt_request()` has consumed
156/// the pending request.
157///
158/// Use this in shutdown code paths where you need to know whether the process
159/// is shutting down due to user interruption, even if the event loop has already
160/// consumed the interrupt request. For example, `capture_git_state` uses this
161/// to skip git commands that could hang indefinitely during interrupt-triggered
162/// shutdown.
163pub fn user_interrupted_occurred() -> bool {
164 USER_INTERRUPTED_OCCURRED.load(Ordering::SeqCst)
165}
166
167/// Check if a user interrupt request is pending without consuming it.
168///
169/// Returns true if an interrupt is pending. The flag remains set so that
170/// the event loop can still consume it via `take_user_interrupt_request()`.
171///
172/// Use this when you need to react to an interrupt (e.g., kill a subprocess)
173/// without stealing the flag from the event loop's per-iteration check.
174pub fn is_user_interrupt_requested() -> bool {
175 USER_INTERRUPT_REQUESTED.load(Ordering::SeqCst)
176}
177
178/// Consume a pending user interrupt request.
179///
180/// Returns true if an interrupt was pending.
181pub fn take_user_interrupt_request() -> bool {
182 USER_INTERRUPT_REQUESTED.swap(false, Ordering::SeqCst)
183}
184
/// Clear the persistent user-interrupted flag (test builds only).
///
/// Lets tests that exercise interrupt behavior start from a clean state.
/// The function is compiled only under `cfg(test)`, so production code cannot
/// call it.
#[cfg(test)]
pub fn reset_user_interrupted_occurred() {
    USER_INTERRUPTED_OCCURRED.store(false, Ordering::SeqCst);
}
193
/// Test-only mutex that serializes access to the process-global interrupt flags.
///
/// The interrupt flags are `static` atomics shared by the whole process, and
/// Rust unit tests run in parallel by default. Tests that call
/// `request_user_interrupt()`, `take_user_interrupt_request()`, or
/// `reset_user_interrupted_occurred()` would therefore interfere with each
/// other unless they coordinate through this lock.
///
/// Hold the lock for the full duration of any test that:
/// - sets or consumes the interrupt request flag, or
/// - needs the interrupt flags to stay in a known state while it runs.
///
/// Production code must not use this.
#[cfg(test)]
static TEST_INTERRUPT_LOCK: Mutex<()> = Mutex::new(());

/// Acquire the test-only interrupt lock, blocking until it is free.
///
/// A poisoned lock (a previous holder panicked) is recovered rather than
/// propagated: the mutex protects no data of its own, so one failing test
/// should not cascade into every later test that takes the lock.
#[cfg(test)]
pub(crate) fn interrupt_test_lock() -> std::sync::MutexGuard<'static, ()> {
    match TEST_INTERRUPT_LOCK.lock() {
        Ok(guard) => guard,
        Err(poisoned) => poisoned.into_inner(),
    }
}
215
216/// Set up the interrupt handler for graceful shutdown with checkpoint saving.
217///
218/// This function registers a SIGINT handler that will:
219/// 1. Save a checkpoint with the current pipeline state
220/// 2. Clean up generated files
221/// 3. Exit gracefully
222///
223/// Call this early in `main()` after initializing the pipeline context.
224#[expect(clippy::print_stderr, reason = "critical interrupt handling messages")]
225pub fn setup_interrupt_handler() {
226 let install = ctrlc::set_handler(|| {
227 request_user_interrupt();
228
229 // If the reducer event loop is running, do not exit here.
230 // The event loop will observe the request, restore permissions, and checkpoint.
231 if is_event_loop_active() {
232 if register_sigint_during_active_event_loop() {
233 eprintln!("\nSecond interrupt received; forcing immediate exit.");
234 restore_prompt_md_writable_via_std_fs();
235 eprintln!("Cleaning up...");
236 crate::git_helpers::cleanup_agent_phase_silent();
237 remove_repo_root_ralph_dir_via_std_fs();
238 handling::exit_sigint();
239 }
240
241 eprintln!(
242 "\nInterrupt received; requesting graceful shutdown (waiting for checkpoint)..."
243 );
244 return;
245 }
246
247 eprintln!("\nInterrupt received; saving checkpoint...");
248
249 // Clone the entire context (small, Arc-backed) and then perform I/O without
250 // holding the mutex.
251 let context = handling::get_interrupt_context();
252
253 if let Some(ref context) = context {
254 if let Err(e) = checkpoint::save_interrupt_checkpoint(context) {
255 eprintln!("Warning: Failed to save checkpoint: {e}");
256 } else {
257 eprintln!("Checkpoint saved. Resume with: ralph --resume");
258 }
259 }
260
261 // Best-effort: restore PROMPT.md permissions so we don't leave the repo locked.
262 // This is primarily for early-interrupt cases before the reducer event loop starts.
263 //
264 // Always attempt a std::fs fallback using repo discovery. This covers:
265 // - interrupt context not yet installed (very early SIGINT)
266 // - workspace implementations that cannot mutate real filesystem permissions
267 // (e.g., MemoryWorkspace)
268 restore_prompt_md_writable_via_std_fs();
269
270 if let Some(ref context) = context {
271 let _ = context.workspace.set_writable(Path::new("PROMPT.md"));
272 }
273
274 eprintln!("Cleaning up...");
275 crate::git_helpers::cleanup_agent_phase_silent();
276 remove_repo_root_ralph_dir_via_std_fs();
277 handling::exit_sigint();
278 });
279
280 if let Err(e) = install {
281 // Handler installation failure is a reliability issue: without it, Ctrl+C will not
282 // trigger checkpointing/cleanup and can leave the repo in a broken state.
283 eprintln!("Warning: failed to install Ctrl+C handler: {e}");
284 }
285}
286
// Test-only module whose body is textually included from `io_tests.rs`
// (path resolved relative to this file).
#[cfg(test)]
mod io_tests {
    include!("io_tests.rs");
}