ralph_workflow/files/protection/
monitoring.rs

1//! Real-time file system monitoring for PROMPT.md protection.
2//!
3//! This module provides proactive monitoring to detect deletion attempts
4//! on PROMPT.md immediately, rather than waiting for periodic checks.
5//! It uses the `notify` crate for cross-platform file system events.
6//!
7//! # Design
8//!
9//! The monitor runs in a background thread and watches for deletion events
10//! on PROMPT.md. When a deletion is detected, it's automatically restored
11//! from backup. The main thread can poll the monitor to check if any
12//! restoration events occurred.
13//!
14//! # Platform Support
15//!
//! - **Linux**: `inotify` via `notify` crate (BSD variants use `kqueue`)
17//! - **macOS**: `FSEvents` via `notify` crate
18//! - **Windows**: `ReadDirectoryChangesW` via `notify` crate
19
20use std::fs;
21use std::path::Path;
22use std::sync::atomic::{AtomicBool, Ordering};
23use std::sync::Arc;
24use std::thread;
25use std::time::Duration;
26
/// File system monitor for detecting PROMPT.md deletion events.
///
/// The monitor owns a background thread that watches the current working
/// directory. When PROMPT.md is removed, the thread restores it from a
/// backup under `.agent/` and raises a flag that the main thread can poll,
/// so the main thread is never blocked by monitoring.
///
/// Dropping the monitor signals the background thread to stop but does not
/// wait for it; call [`PromptMonitor::stop`] to wait for the thread to exit.
///
/// # Example
///
/// ```no_run
/// # use ralph_workflow::files::protection::monitoring::PromptMonitor;
/// let mut monitor = PromptMonitor::new()?;
/// monitor.start()?;
///
/// // ... run pipeline phases ...
///
/// // Check whether a restoration has been recorded
/// if monitor.check_and_restore() {
///     println!("PROMPT.md was restored!");
/// }
///
/// monitor.stop();
/// # Ok::<(), std::io::Error>(())
/// ```
#[derive(Debug)]
pub struct PromptMonitor {
    /// Set by the monitor thread once PROMPT.md was deleted and restored
    restoration_detected: Arc<AtomicBool>,
    /// Set by the owner to ask the monitor thread to exit
    stop_signal: Arc<AtomicBool>,
    /// Handle to the monitor thread (`None` if not started)
    monitor_thread: Option<thread::JoinHandle<()>>,
}
58
59impl PromptMonitor {
60    /// Create a new file system monitor for PROMPT.md.
61    ///
62    /// Returns an error if the current directory cannot be accessed or
63    /// if PROMPT.md doesn't exist (we need to know what to watch for).
64    pub fn new() -> std::io::Result<Self> {
65        // Verify we're in a valid directory with PROMPT.md
66        let prompt_path = Path::new("PROMPT.md");
67        if !prompt_path.exists() {
68            return Err(std::io::Error::new(
69                std::io::ErrorKind::NotFound,
70                "PROMPT.md does not exist - cannot monitor",
71            ));
72        }
73
74        Ok(Self {
75            restoration_detected: Arc::new(AtomicBool::new(false)),
76            stop_signal: Arc::new(AtomicBool::new(false)),
77            monitor_thread: None,
78        })
79    }
80
81    /// Start monitoring PROMPT.md for deletion events.
82    ///
83    /// This spawns a background thread that watches for file system events.
84    /// Returns immediately; monitoring happens asynchronously.
85    ///
86    /// The monitor will automatically restore PROMPT.md from backup if
87    /// deletion is detected.
88    pub fn start(&mut self) -> std::io::Result<()> {
89        if self.monitor_thread.is_some() {
90            return Err(std::io::Error::new(
91                std::io::ErrorKind::AlreadyExists,
92                "Monitor is already running",
93            ));
94        }
95
96        let restoration_flag = Arc::clone(&self.restoration_detected);
97        let stop_signal = Arc::clone(&self.stop_signal);
98
99        let handle = thread::spawn(move || {
100            Self::monitor_thread_main(&restoration_flag, &stop_signal);
101        });
102
103        self.monitor_thread = Some(handle);
104        Ok(())
105    }
106
107    /// Background thread entry point for file system monitoring.
108    ///
109    /// This thread watches the current directory for deletion events on
110    /// PROMPT.md and restores from backup when detected.
111    fn monitor_thread_main(restoration_detected: &Arc<AtomicBool>, stop_signal: &Arc<AtomicBool>) {
112        use notify::Watcher;
113
114        // Create a channel to receive file system events
115        let (tx, rx) = std::sync::mpsc::channel();
116
117        // Create a watcher for the current directory
118        let mut watcher = match notify::recommended_watcher(tx) {
119            Ok(w) => w,
120            Err(e) => {
121                eprintln!("Warning: Failed to create file system watcher: {e}");
122                eprintln!("Falling back to periodic polling for PROMPT.md protection");
123                // Fallback to polling if watcher creation fails
124                Self::polling_monitor(restoration_detected, stop_signal);
125                return;
126            }
127        };
128
129        // Watch the current directory for events
130        if let Err(e) = watcher.watch(Path::new("."), notify::RecursiveMode::NonRecursive) {
131            eprintln!("Warning: Failed to watch current directory: {e}");
132            eprintln!("Falling back to periodic polling for PROMPT.md protection");
133            Self::polling_monitor(restoration_detected, stop_signal);
134            return;
135        }
136
137        // Process events until stop signal is received
138        let mut prompt_existed_last_check = true;
139
140        while !stop_signal.load(Ordering::Relaxed) {
141            // Check for events with a short timeout
142            match rx.recv_timeout(Duration::from_millis(100)) {
143                Ok(Ok(event)) => {
144                    Self::handle_fs_event(
145                        &event,
146                        restoration_detected,
147                        &mut prompt_existed_last_check,
148                    );
149                }
150                Ok(Err(_)) | Err(std::sync::mpsc::RecvTimeoutError::Timeout) => {
151                    // Error in watcher or timeout - continue anyway
152                }
153                Err(_) => {
154                    // Channel disconnected - stop monitoring
155                    break;
156                }
157            }
158        }
159    }
160
161    /// Handle a file system event from the watcher.
162    fn handle_fs_event(
163        event: &notify::Event,
164        restoration_detected: &Arc<AtomicBool>,
165        _prompt_existed_last_check: &mut bool,
166    ) {
167        for path in &event.paths {
168            if path.as_os_str() == "PROMPT.md" {
169                // Check for remove event
170                if matches!(event.kind, notify::EventKind::Remove(_)) {
171                    // PROMPT.md was removed - restore it
172                    if Self::restore_from_backup() {
173                        restoration_detected.store(true, Ordering::Release);
174                    }
175                }
176            }
177        }
178    }
179
180    /// Fallback polling-based monitor when file system watcher fails.
181    ///
182    /// Some filesystems (NFS, network drives) don't support file system
183    /// events. This fallback polls every 100ms to check if PROMPT.md exists.
184    fn polling_monitor(restoration_detected: &Arc<AtomicBool>, stop_signal: &Arc<AtomicBool>) {
185        let mut prompt_existed = Path::new("PROMPT.md").exists();
186
187        while !stop_signal.load(Ordering::Relaxed) {
188            thread::sleep(Duration::from_millis(100));
189
190            let prompt_exists_now = Path::new("PROMPT.md").exists();
191
192            // Detect deletion (transition from exists to not exists)
193            if prompt_existed && !prompt_exists_now && Self::restore_from_backup() {
194                restoration_detected.store(true, Ordering::Release);
195            }
196
197            prompt_existed = prompt_exists_now;
198        }
199    }
200
201    /// Restore PROMPT.md from backup.
202    ///
203    /// Tries backups in order:
204    /// - .agent/PROMPT.md.backup
205    /// - .agent/PROMPT.md.backup.1
206    /// - .agent/PROMPT.md.backup.2
207    ///
208    /// Returns true if restoration succeeded, false otherwise.
209    ///
210    /// Uses atomic open to avoid TOCTOU race conditions - opens and reads
211    /// the file in one operation rather than checking existence separately.
212    fn restore_from_backup() -> bool {
213        let backup_paths = [
214            Path::new(".agent/PROMPT.md.backup"),
215            Path::new(".agent/PROMPT.md.backup.1"),
216            Path::new(".agent/PROMPT.md.backup.2"),
217        ];
218
219        for backup_path in &backup_paths {
220            // Use std::fs::File::open to atomically open the file, avoiding TOCTOU
221            // race conditions where the file could be replaced between exists() check
222            // and read operation
223            let backup_content = match std::fs::File::open(backup_path) {
224                Ok(mut file) => {
225                    // Verify it's a regular file, not a symlink or special file
226                    match file.metadata() {
227                        Ok(metadata) if metadata.is_file() => {
228                            // Read the content
229                            let mut buffer = String::new();
230                            match std::io::Read::read_to_string(&mut file, &mut buffer) {
231                                Ok(_) => buffer,
232                                Err(_) => continue,
233                            }
234                        }
235                        _ => continue, // Not a regular file, skip
236                    }
237                }
238                Err(_) => continue, // File doesn't exist or can't be opened
239            };
240
241            if backup_content.trim().is_empty() {
242                continue;
243            }
244
245            // Restore from backup - ensure parent directory exists
246            let prompt_path = Path::new("PROMPT.md");
247            if let Some(parent) = prompt_path.parent() {
248                if let Err(e) = fs::create_dir_all(parent) {
249                    eprintln!("Failed to create parent directory for PROMPT.md: {e}");
250                    continue;
251                }
252            }
253
254            if fs::write(prompt_path, backup_content).is_err() {
255                eprintln!("Failed to write PROMPT.md from backup");
256                continue;
257            }
258
259            // Set read-only permissions
260            #[cfg(unix)]
261            {
262                use std::os::unix::fs::PermissionsExt;
263                if let Ok(metadata) = fs::metadata(prompt_path) {
264                    let mut perms = metadata.permissions();
265                    perms.set_mode(0o444);
266                    let _ = fs::set_permissions(prompt_path, perms);
267                }
268            }
269
270            #[cfg(windows)]
271            {
272                if let Ok(metadata) = fs::metadata(prompt_path) {
273                    let mut perms = metadata.permissions();
274                    perms.set_readonly(true);
275                    let _ = fs::set_permissions(prompt_path, perms);
276                }
277            }
278
279            return true;
280        }
281
282        false
283    }
284
285    /// Check if any restoration events were detected and reset the flag.
286    ///
287    /// Returns true if PROMPT.md was deleted and restored since the last
288    /// check. This is a one-time check - the flag is reset after reading.
289    ///
290    /// # Example
291    ///
292    /// ```no_run
293    /// # use ralph_workflow::files::protection::monitoring::PromptMonitor;
294    /// # let mut monitor = PromptMonitor::new().unwrap();
295    /// # monitor.start().unwrap();
296    /// // After running some agent code
297    /// if monitor.check_and_restore() {
298    ///     println!("PROMPT.md was restored during this phase!");
299    /// }
300    /// ```
301    pub fn check_and_restore(&self) -> bool {
302        self.restoration_detected.load(Ordering::Acquire)
303    }
304
305    /// Stop monitoring and cleanup resources.
306    ///
307    /// Signals the monitor thread to stop and waits for it to complete.
308    pub fn stop(mut self) {
309        // Signal the thread to stop
310        self.stop_signal.store(true, Ordering::Release);
311
312        // Wait for the thread to finish and check for panics
313        if let Some(handle) = self.monitor_thread.take() {
314            if let Err(panic_payload) = handle.join() {
315                // Thread panicked - extract and log panic message for diagnostics
316                // Try common panic payload types
317                let panic_msg = panic_payload
318                    .downcast_ref::<String>()
319                    .cloned()
320                    .or_else(|| {
321                        panic_payload
322                            .downcast_ref::<&str>()
323                            .map(ToString::to_string)
324                    })
325                    .or_else(|| {
326                        panic_payload
327                            .downcast_ref::<&String>()
328                            .map(|s| (*s).clone())
329                    })
330                    .unwrap_or_else(|| {
331                        // Fallback: Try to get any available information
332                        format!(
333                            "<unknown panic type: {}>",
334                            std::any::type_name_of_val(&panic_payload)
335                        )
336                    });
337                eprintln!("Warning: File monitoring thread panicked: {panic_msg}");
338            }
339        }
340    }
341}
342
343impl Drop for PromptMonitor {
344    fn drop(&mut self) {
345        // Signal the thread to stop when dropped
346        self.stop_signal.store(true, Ordering::Release);
347
348        // Take the handle and let it finish on its own
349        // (we can't wait in Drop because we might be panicking)
350        let _ = self.monitor_thread.take();
351    }
352}
353
#[cfg(test)]
mod tests {
    // Intentionally empty.
    //
    // The monitor operates on paths relative to the current working
    // directory, and tests that change directories are problematic in test
    // suites. The monitoring functionality will be tested through
    // integration tests when the monitor is integrated into the pipeline.
}