Skip to main content

lean_ctx/core/
startup_guard.rs

1use std::io::Write as _;
2use std::path::PathBuf;
3use std::time::Duration;
4
/// Length, in seconds, of the sliding window over which recent startups are counted.
pub const CRASH_LOOP_WINDOW_SECS: u64 = 60;
/// Number of startups inside the window that are tolerated before a backoff sleep kicks in.
pub const CRASH_LOOP_THRESHOLD: usize = 8;
/// Upper bound, in seconds, on a single crash-loop backoff sleep.
pub const CRASH_LOOP_MAX_BACKOFF_SECS: u64 = 30;

/// Process name of the MCP server.
// NOTE(review): presumably the value callers pass as `process_name` to the
// crash-loop helpers — no use is visible in this file; confirm against callers.
pub const MCP_PROCESS_NAME: &str = "mcp-server";
10
11pub fn crash_loop_log_path(process_name: &str) -> Option<PathBuf> {
12    crate::core::data_dir::lean_ctx_data_dir()
13        .ok()
14        .map(|dir| dir.join(format!(".{}-starts.log", sanitize_lock_name(process_name))))
15}
16
/// Handle to a lock file acquired through [`try_acquire_lock`].
///
/// The lock is held for as long as the guard is alive and is released when the
/// guard is dropped, so bind it to a named variable (not `_`) for the duration
/// of the critical section.
#[derive(Debug)]
pub struct StartupLockGuard {
    /// Location of the lock file backing this guard.
    path: PathBuf,
}
20
21impl StartupLockGuard {
22    pub fn touch(&self) {
23        // Refresh the lock's mtime so stale eviction doesn't reclaim an active
24        // long-running holder, while preserving the owner PID line so a crashed
25        // holder can still be detected as dead by other processes.
26        if let Ok(mut f) = std::fs::OpenOptions::new()
27            .write(true)
28            .truncate(true)
29            .open(&self.path)
30        {
31            let _ = writeln!(f, "{}", std::process::id());
32        }
33    }
34}
35
36/// Decides whether a currently-held lock file can be reclaimed by a waiter.
37///
38/// A lock whose recorded owner PID is no longer alive is reclaimed immediately —
39/// this is what stops a crashed/killed holder's lock from lingering until
40/// `stale_after` elapses (the cause of the stale `.graph-idx-*.lock` build-up).
41/// If the owner is alive, or the lock predates PID tracking (legacy 0-byte
42/// file), we fall back to the long-standing mtime staleness safety valve.
43fn lock_is_reclaimable(path: &std::path::Path, stale_after: Duration) -> bool {
44    if let Ok(content) = std::fs::read_to_string(path) {
45        if let Some(pid) = content
46            .lines()
47            .next()
48            .and_then(|l| l.trim().parse::<u32>().ok())
49        {
50            if !crate::ipc::process::is_alive(pid) {
51                return true;
52            }
53        }
54    }
55    if let Ok(meta) = std::fs::metadata(path) {
56        if let Ok(modified) = meta.modified() {
57            return modified.elapsed().unwrap_or_default() > stale_after;
58        }
59    }
60    false
61}
62
63impl Drop for StartupLockGuard {
64    fn drop(&mut self) {
65        let _ = std::fs::remove_file(&self.path);
66    }
67}
68
/// Maps `name` onto a string that is safe to embed in a file name.
///
/// ASCII letters, ASCII digits, `-` and `_` are kept as they are; every other
/// character (path separators, dots, whitespace, non-ASCII, ...) is replaced
/// by a single `_`. The result has the same number of characters as `name`.
fn sanitize_lock_name(name: &str) -> String {
    let mut cleaned = String::with_capacity(name.len());
    for ch in name.chars() {
        let keep = matches!(ch, 'a'..='z' | 'A'..='Z' | '0'..='9' | '-' | '_');
        cleaned.push(if keep { ch } else { '_' });
    }
    cleaned
}
80
81/// Best-effort cross-process lock (create_new + stale eviction).
82///
83/// Returns `None` if the data dir can't be resolved or if the lock can't be acquired
84/// within `timeout`.
85pub fn try_acquire_lock(
86    name: &str,
87    timeout: Duration,
88    stale_after: Duration,
89) -> Option<StartupLockGuard> {
90    let dir = crate::core::data_dir::lean_ctx_data_dir().ok()?;
91    let _ = std::fs::create_dir_all(&dir);
92
93    let name = sanitize_lock_name(name);
94    let path = dir.join(format!(".{name}.lock"));
95
96    let deadline = std::time::Instant::now().checked_add(timeout)?;
97    let mut sleep_ms: u64 = 10;
98
99    loop {
100        match std::fs::OpenOptions::new()
101            .write(true)
102            .create_new(true)
103            .open(&path)
104        {
105            Ok(mut f) => {
106                // Record the owner PID so a crashed holder's lock can be
107                // reclaimed immediately instead of waiting out `stale_after`.
108                let _ = writeln!(f, "{}", std::process::id());
109                return Some(StartupLockGuard { path });
110            }
111            Err(_) => {
112                if lock_is_reclaimable(&path, stale_after) {
113                    let _ = std::fs::remove_file(&path);
114                }
115            }
116        }
117
118        if std::time::Instant::now() >= deadline {
119            return None;
120        }
121
122        std::thread::sleep(Duration::from_millis(sleep_ms));
123        sleep_ms = (sleep_ms.saturating_mul(2)).min(120);
124    }
125}
126
127/// Detects rapid restart loops (e.g., IDE keeps respawning a crashing MCP server).
128/// Records each startup timestamp; if too many happen within the window, sleeps
129/// with exponential backoff to break the loop and avoid host degradation.
130pub fn crash_loop_backoff(process_name: &str) {
131    let Some(dir) = crate::core::data_dir::lean_ctx_data_dir().ok() else {
132        return;
133    };
134    let _ = std::fs::create_dir_all(&dir);
135    let ts_path = dir.join(format!(".{}-starts.log", sanitize_lock_name(process_name)));
136
137    let now = std::time::SystemTime::now()
138        .duration_since(std::time::UNIX_EPOCH)
139        .unwrap_or_default()
140        .as_secs();
141
142    let cutoff = now.saturating_sub(CRASH_LOOP_WINDOW_SECS);
143
144    let mut recent: Vec<u64> = std::fs::read_to_string(&ts_path)
145        .unwrap_or_default()
146        .lines()
147        .filter_map(|l| l.trim().parse::<u64>().ok())
148        .filter(|&ts| ts >= cutoff)
149        .collect();
150    recent.push(now);
151
152    if let Ok(mut f) = std::fs::File::create(&ts_path) {
153        for ts in &recent {
154            let _ = writeln!(f, "{ts}");
155        }
156    }
157
158    if recent.len() > CRASH_LOOP_THRESHOLD {
159        let restarts_over = recent.len() - CRASH_LOOP_THRESHOLD;
160        let backoff_secs =
161            (2u64.saturating_pow(restarts_over as u32)).min(CRASH_LOOP_MAX_BACKOFF_SECS);
162        let msg = format!(
163            "lean-ctx: crash-loop protection — {process_name} started {} times in {CRASH_LOOP_WINDOW_SECS}s, \
164             waiting {backoff_secs}s before accepting connections. \
165             If your IDE is slow to initialize, this is normal.",
166            recent.len()
167        );
168        tracing::warn!("{msg}");
169        eprintln!("{msg}");
170        std::thread::sleep(Duration::from_secs(backoff_secs));
171    }
172}
173
174/// Clears the crash-loop history file, resetting any active backoff.
175pub fn reset_crash_loop(process_name: &str) {
176    let Some(dir) = crate::core::data_dir::lean_ctx_data_dir().ok() else {
177        return;
178    };
179    let ts_path = dir.join(format!(".{}-starts.log", sanitize_lock_name(process_name)));
180    let _ = std::fs::remove_file(&ts_path);
181}
182
#[cfg(test)]
mod tests {
    //! Unit tests for the startup lock and crash-loop protection.
    //!
    //! Every test that touches the file system points `LEAN_CTX_DATA_DIR` at a
    //! fresh temporary directory while holding the shared test-environment
    //! lock, so tests cannot observe each other's files or environment.

    use super::*;

    /// Sets an environment variable on construction and restores its previous
    /// state (old value, or unset) when dropped.
    struct EnvVarGuard {
        key: &'static str,
        prev: Option<String>,
    }

    impl EnvVarGuard {
        fn set(key: &'static str, value: &std::path::Path) -> Self {
            let guard = Self {
                key,
                prev: std::env::var(key).ok(),
            };
            std::env::set_var(key, value);
            guard
        }
    }

    impl Drop for EnvVarGuard {
        fn drop(&mut self) {
            if let Some(original) = self.prev.as_deref() {
                std::env::set_var(self.key, original);
            } else {
                std::env::remove_var(self.key);
            }
        }
    }

    /// Acquiring creates the lock file; dropping the guard removes it.
    #[test]
    fn lock_acquire_and_release() {
        let _env = crate::core::data_dir::test_env_lock();
        let dir = tempfile::tempdir().unwrap();
        let _guard = EnvVarGuard::set("LEAN_CTX_DATA_DIR", dir.path());

        let lock_path = dir.path().join(".unit-test.lock");
        let wait = Duration::from_millis(200);
        let stale_after = Duration::from_secs(30);

        let held = try_acquire_lock("unit-test", wait, stale_after);
        assert!(held.is_some());
        assert!(lock_path.exists());

        drop(held);
        assert!(!lock_path.exists());
    }

    /// A second acquisition times out while the first guard is alive and
    /// succeeds once it has been dropped.
    #[test]
    fn lock_times_out_while_held() {
        let _env = crate::core::data_dir::test_env_lock();
        let dir = tempfile::tempdir().unwrap();
        let _guard = EnvVarGuard::set("LEAN_CTX_DATA_DIR", dir.path());

        let stale_after = Duration::from_secs(30);

        let first = try_acquire_lock("unit-test-2", Duration::from_millis(200), stale_after)
            .expect("first lock should acquire");

        let second = try_acquire_lock("unit-test-2", Duration::from_millis(60), stale_after);
        assert!(second.is_none());

        drop(first);
        let third = try_acquire_lock("unit-test-2", Duration::from_millis(200), stale_after);
        assert!(third.is_some());
    }

    /// A fresh lock file whose recorded owner is not alive is taken over
    /// without waiting for the staleness timeout.
    #[test]
    fn dead_owner_lock_is_reclaimed_immediately() {
        let _env = crate::core::data_dir::test_env_lock();
        let dir = tempfile::tempdir().unwrap();
        let _guard = EnvVarGuard::set("LEAN_CTX_DATA_DIR", dir.path());

        // Seed a held lock whose owner PID cannot belong to a live process.
        let lock_path = dir.path().join(".dead-owner.lock");
        std::fs::write(&lock_path, "4294967294\n").unwrap();

        // The file was written just now, so the mtime check alone would not
        // release it within `stale_after`; only the dead-PID check can.
        let reclaimed = try_acquire_lock(
            "dead-owner",
            Duration::from_millis(300),
            Duration::from_secs(30),
        );
        assert!(
            reclaimed.is_some(),
            "lock with a dead owner PID must be reclaimable"
        );
    }

    /// Guards the tuning constants against accidental tightening.
    #[test]
    fn crash_loop_thresholds_are_resilient() {
        let (threshold, window, backoff) = (
            CRASH_LOOP_THRESHOLD,
            CRASH_LOOP_WINDOW_SECS,
            CRASH_LOOP_MAX_BACKOFF_SECS,
        );
        assert!(
            threshold >= 8,
            "threshold must tolerate IDE restart patterns (was {threshold})"
        );
        assert!(
            window >= 60,
            "window must cover slow IDE startup (was {window}s)"
        );
        assert!(
            backoff <= 30,
            "max backoff must not be too aggressive (was {backoff}s)"
        );
    }

    /// Exactly `CRASH_LOOP_THRESHOLD` startups must not trigger any sleep.
    #[test]
    fn crash_loop_backoff_under_threshold_no_sleep() {
        let _env = crate::core::data_dir::test_env_lock();
        let dir = tempfile::tempdir().unwrap();
        let _guard = EnvVarGuard::set("LEAN_CTX_DATA_DIR", dir.path());

        let started = std::time::Instant::now();
        (0..CRASH_LOOP_THRESHOLD).for_each(|_| crash_loop_backoff("test-no-sleep"));
        let took = started.elapsed();

        assert!(
            took < Duration::from_secs(1),
            "under threshold should not sleep"
        );
    }

    /// Resetting removes the history file written by earlier startups.
    #[test]
    fn reset_crash_loop_clears_history() {
        let _env = crate::core::data_dir::test_env_lock();
        let dir = tempfile::tempdir().unwrap();
        let _guard = EnvVarGuard::set("LEAN_CTX_DATA_DIR", dir.path());

        let log_path = dir.path().join(".test-reset-starts.log");
        (0..5).for_each(|_| crash_loop_backoff("test-reset"));
        assert!(log_path.exists(), "crash loop log should exist after calls");

        reset_crash_loop("test-reset");
        assert!(
            !log_path.exists(),
            "crash loop log should be removed after reset"
        );
    }

    /// Resetting a process that never recorded a startup must not panic.
    #[test]
    fn reset_crash_loop_nonexistent_is_noop() {
        let _env = crate::core::data_dir::test_env_lock();
        let dir = tempfile::tempdir().unwrap();
        let _guard = EnvVarGuard::set("LEAN_CTX_DATA_DIR", dir.path());

        reset_crash_loop("never-existed");
    }

    /// Entries older than the window are dropped when the log is rewritten.
    #[test]
    fn crash_loop_log_only_keeps_recent_entries() {
        let _env = crate::core::data_dir::test_env_lock();
        let dir = tempfile::tempdir().unwrap();
        let _guard = EnvVarGuard::set("LEAN_CTX_DATA_DIR", dir.path());

        let old_ts = 1000u64;
        let log_path = dir.path().join(".test-prune-starts.log");
        std::fs::write(&log_path, format!("{old_ts}\n")).unwrap();

        crash_loop_backoff("test-prune");

        let content = std::fs::read_to_string(&log_path).unwrap();
        assert_eq!(
            content.lines().count(),
            1,
            "old entry should be pruned, only current remains"
        );
        let ts: u64 = content.lines().next().unwrap().parse().unwrap();
        assert!(ts > old_ts, "remaining entry should be recent");
    }

    /// Characters outside `[A-Za-z0-9_-]` are replaced by underscores.
    #[test]
    fn sanitize_lock_name_strips_special_chars() {
        let cases = [
            ("mcp-stdio", "mcp-stdio"),
            ("mcp_http", "mcp_http"),
            ("a/b\\c:d", "a_b_c_d"),
            ("name with spaces", "name_with_spaces"),
        ];
        for (raw, expected) in cases {
            assert_eq!(sanitize_lock_name(raw), expected);
        }
    }

    /// Pins the doubling-with-cap backoff formula for a range of overshoots.
    #[test]
    fn crash_loop_backoff_formula_correctness() {
        let cases: [(u32, u64, &str); 6] = [
            (1, 2, "1 over threshold = 2s backoff"),
            (2, 4, "2 over threshold = 4s backoff"),
            (3, 8, "3 over threshold = 8s backoff"),
            (4, 16, "4 over threshold = 16s backoff"),
            (5, 30, "5 over threshold = capped at 30s"),
            (10, 30, "10 over threshold = still capped at 30s"),
        ];
        for (over, expected, label) in cases {
            assert_eq!(
                2u64.saturating_pow(over).min(CRASH_LOOP_MAX_BACKOFF_SECS),
                expected,
                "{label}"
            );
        }
    }
}