Skip to main content

lean_ctx/core/
startup_guard.rs

1use std::io::Write as _;
2use std::path::PathBuf;
3use std::time::Duration;
4
/// Length, in seconds, of the look-back window over which recorded startups are counted.
pub const CRASH_LOOP_WINDOW_SECS: u64 = 60;
/// Number of startups tolerated inside the window; backoff starts once the count exceeds it.
pub const CRASH_LOOP_THRESHOLD: usize = 8;
/// Upper bound, in seconds, on the crash-loop backoff sleep.
pub const CRASH_LOOP_MAX_BACKOFF_SECS: u64 = 30;

/// NOTE(review): not referenced in this file; presumably the `process_name` callers pass
/// for the MCP server — confirm at the call sites.
pub const MCP_PROCESS_NAME: &str = "mcp-server";
10
11pub fn crash_loop_log_path(process_name: &str) -> Option<PathBuf> {
12    crate::core::data_dir::lean_ctx_data_dir()
13        .ok()
14        .map(|dir| dir.join(format!(".{}-starts.log", sanitize_lock_name(process_name))))
15}
16
/// Handle for a lock file; the file at `path` is deleted when the guard is dropped.
pub struct StartupLockGuard {
    path: PathBuf,
}

impl StartupLockGuard {
    /// Rewrites the lock file with the current Unix time in milliseconds.
    ///
    /// Writing refreshes the file's modification time so that stale eviction does not
    /// treat a long-running holder as abandoned. The file is opened without `create`,
    /// so a missing lock file is not recreated; all I/O errors are ignored.
    pub fn touch(&self) {
        let since_epoch = std::time::SystemTime::now()
            .duration_since(std::time::UNIX_EPOCH)
            .unwrap_or_default();
        let now_ms = since_epoch.as_millis() as u64;

        let mut options = std::fs::OpenOptions::new();
        options.write(true).truncate(true);
        let Ok(mut file) = options.open(&self.path) else {
            return;
        };
        let _ = writeln!(file, "{now_ms}");
    }
}

impl Drop for StartupLockGuard {
    /// Deletes the lock file, ignoring any error (for example when it is already gone).
    fn drop(&mut self) {
        std::fs::remove_file(&self.path).ok();
    }
}
43
/// Restricts `name` to the character set used for lock/log file names.
///
/// ASCII letters, ASCII digits, `-` and `_` pass through unchanged; every other
/// character (including non-ASCII ones) is replaced by a single `_`.
fn sanitize_lock_name(name: &str) -> String {
    let mut sanitized = String::with_capacity(name.len());
    for ch in name.chars() {
        let keep = matches!(ch, 'a'..='z' | 'A'..='Z' | '0'..='9' | '-' | '_');
        sanitized.push(if keep { ch } else { '_' });
    }
    sanitized
}
55
56/// Best-effort cross-process lock (create_new + stale eviction).
57///
58/// Returns `None` if the data dir can't be resolved or if the lock can't be acquired
59/// within `timeout`.
60pub fn try_acquire_lock(
61    name: &str,
62    timeout: Duration,
63    stale_after: Duration,
64) -> Option<StartupLockGuard> {
65    let dir = crate::core::data_dir::lean_ctx_data_dir().ok()?;
66    let _ = std::fs::create_dir_all(&dir);
67
68    let name = sanitize_lock_name(name);
69    let path = dir.join(format!(".{name}.lock"));
70
71    let deadline = std::time::Instant::now().checked_add(timeout)?;
72    let mut sleep_ms: u64 = 10;
73
74    loop {
75        if std::fs::OpenOptions::new()
76            .write(true)
77            .create_new(true)
78            .open(&path)
79            .is_ok()
80        {
81            return Some(StartupLockGuard { path });
82        }
83
84        if let Ok(meta) = std::fs::metadata(&path) {
85            if let Ok(modified) = meta.modified() {
86                if modified
87                    .elapsed()
88                    .unwrap_or_default()
89                    .saturating_sub(stale_after)
90                    > Duration::from_secs(0)
91                {
92                    let _ = std::fs::remove_file(&path);
93                }
94            }
95        }
96
97        if std::time::Instant::now() >= deadline {
98            return None;
99        }
100
101        std::thread::sleep(Duration::from_millis(sleep_ms));
102        sleep_ms = (sleep_ms.saturating_mul(2)).min(120);
103    }
104}
105
106/// Detects rapid restart loops (e.g., IDE keeps respawning a crashing MCP server).
107/// Records each startup timestamp; if too many happen within the window, sleeps
108/// with exponential backoff to break the loop and avoid host degradation.
109pub fn crash_loop_backoff(process_name: &str) {
110    let Some(dir) = crate::core::data_dir::lean_ctx_data_dir().ok() else {
111        return;
112    };
113    let _ = std::fs::create_dir_all(&dir);
114    let ts_path = dir.join(format!(".{}-starts.log", sanitize_lock_name(process_name)));
115
116    let now = std::time::SystemTime::now()
117        .duration_since(std::time::UNIX_EPOCH)
118        .unwrap_or_default()
119        .as_secs();
120
121    let cutoff = now.saturating_sub(CRASH_LOOP_WINDOW_SECS);
122
123    let mut recent: Vec<u64> = std::fs::read_to_string(&ts_path)
124        .unwrap_or_default()
125        .lines()
126        .filter_map(|l| l.trim().parse::<u64>().ok())
127        .filter(|&ts| ts >= cutoff)
128        .collect();
129    recent.push(now);
130
131    if let Ok(mut f) = std::fs::File::create(&ts_path) {
132        for ts in &recent {
133            let _ = writeln!(f, "{ts}");
134        }
135    }
136
137    if recent.len() > CRASH_LOOP_THRESHOLD {
138        let restarts_over = recent.len() - CRASH_LOOP_THRESHOLD;
139        let backoff_secs =
140            (2u64.saturating_pow(restarts_over as u32)).min(CRASH_LOOP_MAX_BACKOFF_SECS);
141        let msg = format!(
142            "lean-ctx: crash-loop protection — {process_name} started {} times in {CRASH_LOOP_WINDOW_SECS}s, \
143             waiting {backoff_secs}s before accepting connections. \
144             If your IDE is slow to initialize, this is normal.",
145            recent.len()
146        );
147        tracing::warn!("{msg}");
148        eprintln!("{msg}");
149        std::thread::sleep(Duration::from_secs(backoff_secs));
150    }
151}
152
153/// Clears the crash-loop history file, resetting any active backoff.
154pub fn reset_crash_loop(process_name: &str) {
155    let Some(dir) = crate::core::data_dir::lean_ctx_data_dir().ok() else {
156        return;
157    };
158    let ts_path = dir.join(format!(".{}-starts.log", sanitize_lock_name(process_name)));
159    let _ = std::fs::remove_file(&ts_path);
160}
161
162#[cfg(test)]
163mod tests {
164    use super::*;
165
166    struct EnvVarGuard {
167        key: &'static str,
168        prev: Option<String>,
169    }
170
171    impl EnvVarGuard {
172        fn set(key: &'static str, value: &std::path::Path) -> Self {
173            let prev = std::env::var(key).ok();
174            std::env::set_var(key, value);
175            Self { key, prev }
176        }
177    }
178
179    impl Drop for EnvVarGuard {
180        fn drop(&mut self) {
181            match self.prev.as_deref() {
182                Some(v) => std::env::set_var(self.key, v),
183                None => std::env::remove_var(self.key),
184            }
185        }
186    }
187
188    #[test]
189    fn lock_acquire_and_release() {
190        let _env = crate::core::data_dir::test_env_lock();
191        let dir = tempfile::tempdir().unwrap();
192        let _guard = EnvVarGuard::set("LEAN_CTX_DATA_DIR", dir.path());
193
194        let g = try_acquire_lock(
195            "unit-test",
196            Duration::from_millis(200),
197            Duration::from_secs(30),
198        );
199        assert!(g.is_some());
200
201        let lock_path = dir.path().join(".unit-test.lock");
202        assert!(lock_path.exists());
203
204        drop(g);
205        assert!(!lock_path.exists());
206    }
207
208    #[test]
209    fn lock_times_out_while_held() {
210        let _env = crate::core::data_dir::test_env_lock();
211        let dir = tempfile::tempdir().unwrap();
212        let _guard = EnvVarGuard::set("LEAN_CTX_DATA_DIR", dir.path());
213
214        let g1 = try_acquire_lock(
215            "unit-test-2",
216            Duration::from_millis(200),
217            Duration::from_secs(30),
218        )
219        .expect("first lock should acquire");
220        let g2 = try_acquire_lock(
221            "unit-test-2",
222            Duration::from_millis(60),
223            Duration::from_secs(30),
224        );
225        assert!(g2.is_none());
226
227        drop(g1);
228        let g3 = try_acquire_lock(
229            "unit-test-2",
230            Duration::from_millis(200),
231            Duration::from_secs(30),
232        );
233        assert!(g3.is_some());
234    }
235
236    #[test]
237    fn crash_loop_thresholds_are_resilient() {
238        let threshold = CRASH_LOOP_THRESHOLD;
239        let window = CRASH_LOOP_WINDOW_SECS;
240        let backoff = CRASH_LOOP_MAX_BACKOFF_SECS;
241        assert!(
242            threshold >= 8,
243            "threshold must tolerate IDE restart patterns (was {threshold})"
244        );
245        assert!(
246            window >= 60,
247            "window must cover slow IDE startup (was {window}s)"
248        );
249        assert!(
250            backoff <= 30,
251            "max backoff must not be too aggressive (was {backoff}s)"
252        );
253    }
254
255    #[test]
256    fn crash_loop_backoff_under_threshold_no_sleep() {
257        let _env = crate::core::data_dir::test_env_lock();
258        let dir = tempfile::tempdir().unwrap();
259        let _guard = EnvVarGuard::set("LEAN_CTX_DATA_DIR", dir.path());
260
261        let start = std::time::Instant::now();
262        for _ in 0..CRASH_LOOP_THRESHOLD {
263            crash_loop_backoff("test-no-sleep");
264        }
265        assert!(
266            start.elapsed() < Duration::from_secs(1),
267            "under threshold should not sleep"
268        );
269    }
270
271    #[test]
272    fn reset_crash_loop_clears_history() {
273        let _env = crate::core::data_dir::test_env_lock();
274        let dir = tempfile::tempdir().unwrap();
275        let _guard = EnvVarGuard::set("LEAN_CTX_DATA_DIR", dir.path());
276
277        for _ in 0..5 {
278            crash_loop_backoff("test-reset");
279        }
280        let log_path = dir.path().join(".test-reset-starts.log");
281        assert!(log_path.exists(), "crash loop log should exist after calls");
282
283        reset_crash_loop("test-reset");
284        assert!(
285            !log_path.exists(),
286            "crash loop log should be removed after reset"
287        );
288    }
289
290    #[test]
291    fn reset_crash_loop_nonexistent_is_noop() {
292        let _env = crate::core::data_dir::test_env_lock();
293        let dir = tempfile::tempdir().unwrap();
294        let _guard = EnvVarGuard::set("LEAN_CTX_DATA_DIR", dir.path());
295
296        reset_crash_loop("never-existed");
297    }
298
299    #[test]
300    fn crash_loop_log_only_keeps_recent_entries() {
301        let _env = crate::core::data_dir::test_env_lock();
302        let dir = tempfile::tempdir().unwrap();
303        let _guard = EnvVarGuard::set("LEAN_CTX_DATA_DIR", dir.path());
304
305        let log_path = dir.path().join(".test-prune-starts.log");
306        let old_ts = 1000u64;
307        std::fs::write(&log_path, format!("{old_ts}\n")).unwrap();
308
309        crash_loop_backoff("test-prune");
310
311        let content = std::fs::read_to_string(&log_path).unwrap();
312        let lines: Vec<&str> = content.lines().collect();
313        assert_eq!(
314            lines.len(),
315            1,
316            "old entry should be pruned, only current remains"
317        );
318        let ts: u64 = lines[0].parse().unwrap();
319        assert!(ts > old_ts, "remaining entry should be recent");
320    }
321
322    #[test]
323    fn sanitize_lock_name_strips_special_chars() {
324        assert_eq!(sanitize_lock_name("mcp-stdio"), "mcp-stdio");
325        assert_eq!(sanitize_lock_name("mcp_http"), "mcp_http");
326        assert_eq!(sanitize_lock_name("a/b\\c:d"), "a_b_c_d");
327        assert_eq!(sanitize_lock_name("name with spaces"), "name_with_spaces");
328    }
329
330    #[test]
331    fn crash_loop_backoff_formula_correctness() {
332        assert_eq!(
333            2u64.saturating_pow(1).min(CRASH_LOOP_MAX_BACKOFF_SECS),
334            2,
335            "1 over threshold = 2s backoff"
336        );
337        assert_eq!(
338            2u64.saturating_pow(2).min(CRASH_LOOP_MAX_BACKOFF_SECS),
339            4,
340            "2 over threshold = 4s backoff"
341        );
342        assert_eq!(
343            2u64.saturating_pow(3).min(CRASH_LOOP_MAX_BACKOFF_SECS),
344            8,
345            "3 over threshold = 8s backoff"
346        );
347        assert_eq!(
348            2u64.saturating_pow(4).min(CRASH_LOOP_MAX_BACKOFF_SECS),
349            16,
350            "4 over threshold = 16s backoff"
351        );
352        assert_eq!(
353            2u64.saturating_pow(5).min(CRASH_LOOP_MAX_BACKOFF_SECS),
354            30,
355            "5 over threshold = capped at 30s"
356        );
357        assert_eq!(
358            2u64.saturating_pow(10).min(CRASH_LOOP_MAX_BACKOFF_SECS),
359            30,
360            "10 over threshold = still capped at 30s"
361        );
362    }
363}