codelens_engine/lsp/readiness.rs
1//! Per-session readiness tracking.
2//!
3//! LSP servers complete their `initialize` handshake in tens of
4//! milliseconds, but real workspace indexing (rust-analyzer's project
5//! model, pyright's module graph, tsserver's file-system walk) can
6//! take 15–60 seconds. Pre-P0-4 harnesses papered over this with a
7//! fixed `sleep 45` after `prepare_harness_session` — honest but
8//! wasteful: every bench run paid the worst-case wait regardless of
9//! how quickly indexing actually finished, and production agent
10//! sessions had no signal at all.
11//!
12//! This module exposes a cheap, lock-free readiness snapshot per LSP
13//! session. The pool records:
14//!
//! - `started_at` — the monotonic `Instant` at which the session was spawned.
16//! - `ms_to_first_response` — elapsed milliseconds when any LSP call
17//! first returned `Ok`. Usually the bootstrap `workspace/symbol`
18//! from the auto-attach prewarm. Proves the server's handshake
19//! completed.
20//! - `ms_to_first_nonempty` — elapsed milliseconds when a call first
21//! returned a **non-empty** result. This is the stronger signal
22//! that indexing has progressed far enough to serve real caller
23//! queries: rust-analyzer and pyright both reply with `[]` while
24//! the project is still being walked, then start returning real
25//! hits once the module graph is populated.
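//! - `ms_to_last_response` — elapsed milliseconds at the most recent
//!   `Ok` response.
//! - `response_count` / `nonempty_count` / `failure_count` — cumulative
//!   counters (they only ever increase) so callers can distinguish
//!   "indexing still warming" from "server is failing every request".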
29//!
30//! Reads are via `Arc<ReadinessState>` + atomics, so snapshot calls
31//! never contend with the per-session I/O mutex. That keeps the
32//! downstream MCP `get_lsp_readiness` handler cheap enough for a
33//! 500 ms polling loop to be the canonical wait-for-ready mechanism.
34
35use std::sync::atomic::{AtomicU64, Ordering};
36use std::time::Instant;
37
/// Readiness state shared between a session's owning thread and the
/// pool's snapshot readers. Created when a session is spawned and
/// retained until the session is dropped.
///
/// Every mutable field is an atomic, so the state can be updated and
/// read through a shared reference without taking a lock. The three
/// `ms_to_*` milestones use `0` as the "not recorded yet" sentinel;
/// recorded values are always at least `1`.
#[derive(Debug)]
pub struct ReadinessState {
    /// Command string supplied at construction and echoed in snapshots
    /// (presumably the LSP server executable — confirm against the pool).
    pub command: String,
    /// Argument list supplied at construction and echoed in snapshots.
    pub args: Vec<String>,
    /// Monotonic instant captured when this state was created; all
    /// millisecond values are measured from here.
    started_at: Instant,
    /// Elapsed ms at the first `Ok` response, or `0` if none yet.
    ms_to_first_response: AtomicU64,
    /// Elapsed ms at the first non-empty `Ok` response, or `0` if none yet.
    ms_to_first_nonempty: AtomicU64,
    /// Elapsed ms at the most recent `Ok` response, or `0` if none yet.
    ms_to_last_response: AtomicU64,
    /// Total number of `Ok` responses recorded.
    response_count: AtomicU64,
    /// Number of `Ok` responses that were flagged non-empty.
    nonempty_count: AtomicU64,
    /// Number of failed calls recorded.
    failure_count: AtomicU64,
}
53
54impl ReadinessState {
55 pub(super) fn new(command: String, args: Vec<String>) -> Self {
56 Self {
57 command,
58 args,
59 started_at: Instant::now(),
60 ms_to_first_response: AtomicU64::new(0),
61 ms_to_first_nonempty: AtomicU64::new(0),
62 ms_to_last_response: AtomicU64::new(0),
63 response_count: AtomicU64::new(0),
64 nonempty_count: AtomicU64::new(0),
65 failure_count: AtomicU64::new(0),
66 }
67 }
68
69 /// Record a successful LSP response. `was_nonempty` is the caller's
70 /// domain judgement (e.g. `references.len() > 0`,
71 /// `workspace_symbols.len() > 0`). A response with zero results is
72 /// still meaningful — it proves the server handshake is alive —
73 /// but indexing-readiness requires at least one hit.
74 pub(super) fn record_ok(&self, was_nonempty: bool) {
75 // `max(1)` so a response at exactly t=0 (test mock) is still
76 // distinguishable from "no response yet".
77 let elapsed = self.started_at.elapsed().as_millis() as u64;
78 let ms = elapsed.max(1);
79
80 // compare_exchange with expected=0 gives us a one-shot latch
81 // for the "first" milestones. Subsequent calls silently no-op.
82 let _ =
83 self.ms_to_first_response
84 .compare_exchange(0, ms, Ordering::Relaxed, Ordering::Relaxed);
85 if was_nonempty {
86 let _ = self.ms_to_first_nonempty.compare_exchange(
87 0,
88 ms,
89 Ordering::Relaxed,
90 Ordering::Relaxed,
91 );
92 self.nonempty_count.fetch_add(1, Ordering::Relaxed);
93 }
94 self.ms_to_last_response.store(ms, Ordering::Relaxed);
95 self.response_count.fetch_add(1, Ordering::Relaxed);
96 }
97
98 /// Record a failed LSP call. Failures bump a counter so callers
99 /// can treat a session with `failure_count > 0 && response_count == 0`
100 /// as unhealthy rather than warming.
101 pub(super) fn record_failure(&self) {
102 self.failure_count.fetch_add(1, Ordering::Relaxed);
103 }
104
105 pub fn snapshot(&self) -> ReadinessSnapshot {
106 let read = |a: &AtomicU64| a.load(Ordering::Relaxed);
107 let opt = |v: u64| if v == 0 { None } else { Some(v) };
108 ReadinessSnapshot {
109 command: self.command.clone(),
110 args: self.args.clone(),
111 elapsed_ms: self.started_at.elapsed().as_millis() as u64,
112 ms_to_first_response: opt(read(&self.ms_to_first_response)),
113 ms_to_first_nonempty: opt(read(&self.ms_to_first_nonempty)),
114 ms_to_last_response: opt(read(&self.ms_to_last_response)),
115 response_count: read(&self.response_count),
116 nonempty_count: read(&self.nonempty_count),
117 failure_count: read(&self.failure_count),
118 }
119 }
120}
121
122/// Plain-old-data readiness view for callers (MCP handlers, bench
123/// scripts). All milliseconds are relative to `session.started_at`.
124#[derive(Debug, Clone, serde::Serialize)]
125pub struct ReadinessSnapshot {
126 pub command: String,
127 pub args: Vec<String>,
128 pub elapsed_ms: u64,
129 pub ms_to_first_response: Option<u64>,
130 pub ms_to_first_nonempty: Option<u64>,
131 pub ms_to_last_response: Option<u64>,
132 pub response_count: u64,
133 pub nonempty_count: u64,
134 pub failure_count: u64,
135}
136
137impl ReadinessSnapshot {
138 /// A session is **ready** when it has returned at least one
139 /// non-empty response. Zero-result responses are not enough —
140 /// pyright and rust-analyzer both emit `[]` while the project is
141 /// being walked, and an agent that unblocks on the first empty
142 /// reply ends up issuing the real query before indexing is done
143 /// (which is the failure mode P0-4 was created to stop).
144 pub fn is_ready(&self) -> bool {
145 self.ms_to_first_nonempty.is_some()
146 }
147
148 /// A session is **alive** when its handshake round-tripped at
149 /// least once. Alive-but-not-ready means the LSP is up but has
150 /// not produced usable data yet.
151 pub fn is_alive(&self) -> bool {
152 self.ms_to_first_response.is_some()
153 }
154}