difflore_core/infra/startup.rs
1//! Fast-start gate that amortises first-command checks across the CLI.
2//!
3//! Every `difflore` invocation used to pay the cost of:
4//! - opening the `SQLite` pool + running migrations
5//! - reading provider config (auth check probe)
6//! - (if logged in) pinging the cloud for reachability
7//!
8//! On an interactive shell this is ~200–600 ms wasted per command when
9//! nothing has actually changed. `ensure_ready` keeps a tiny JSON file
10//! (`~/.difflore/startup-cache.json`) with the last-known-good timestamp
11//! for each check and short-circuits when all timestamps are fresh.
12//!
13//! TTL is deliberately short (5 min) so genuine outages surface within a
14//! few invocations rather than being masked for the whole session.
15
16use chrono::{DateTime, Duration, Utc};
17use serde::{Deserialize, Serialize};
18use std::path::PathBuf;
19use tokio::fs;
20
21use crate::errors::CoreError;
22use crate::paths;
23
/// Freshness window, in minutes, applied to every timestamp stored in the
/// startup cache. Five minutes balances "skip the probes on the next N
/// commands after a fresh one" (the common case) with "surface real drift
/// within minutes rather than masking it for the whole session" (when the
/// user just logged out of the cloud, swapped provider keys, etc.).
pub const STARTUP_TTL_MINUTES: i64 = 5;
28
29/// Serialized as `~/.difflore/startup-cache.json`. Every field except
30/// `version` and `migrations_applied_at` is optional so the struct can
31/// round-trip cleanly across future schema additions.
32#[derive(Debug, Clone, Serialize, Deserialize)]
33pub struct StartupStatus {
34 pub version: String,
35 pub migrations_applied_at: DateTime<Utc>,
36 #[serde(default, skip_serializing_if = "Option::is_none")]
37 pub provider_ok_at: Option<DateTime<Utc>>,
38 #[serde(default, skip_serializing_if = "Option::is_none")]
39 pub cloud_ok_at: Option<DateTime<Utc>>,
40}
41
42impl StartupStatus {
43 /// Are all probe timestamps newer than `now - TTL`? `provider_ok_at`
44 /// and `cloud_ok_at` are considered fresh when present and recent;
45 /// `None` means the probe was never run (e.g. not logged in yet) and
46 /// is NOT treated as stale — we don't want to re-ping the cloud on
47 /// every invocation just because the user isn't signed in.
48 fn is_fresh(&self, now: DateTime<Utc>) -> bool {
49 let ttl = Duration::minutes(STARTUP_TTL_MINUTES);
50 let migrations_fresh = (now - self.migrations_applied_at) < ttl;
51 let provider_fresh = self.provider_ok_at.is_none_or(|t| (now - t) < ttl);
52 let cloud_fresh = self.cloud_ok_at.is_none_or(|t| (now - t) < ttl);
53 migrations_fresh && provider_fresh && cloud_fresh
54 }
55}
56
57/// `~/.difflore/startup-cache.json` (overridable via `DIFFLORE_HOME`).
58fn cache_path() -> Result<PathBuf, CoreError> {
59 let dir = paths::data_home().map_err(CoreError::Internal)?;
60 Ok(dir.join("startup-cache.json"))
61}
62
63async fn read_cache() -> Option<StartupStatus> {
64 let path = cache_path().ok()?;
65 let bytes = fs::read(&path).await.ok()?;
66 serde_json::from_slice(&bytes).ok()
67}
68
69async fn write_cache(status: &StartupStatus) -> Result<(), CoreError> {
70 let path = cache_path()?;
71 if let Some(parent) = path.parent() {
72 fs::create_dir_all(parent).await?;
73 }
74 let bytes = serde_json::to_vec_pretty(status)?;
75 fs::write(&path, bytes).await?;
76 Ok(())
77}
78
/// In-process mirror of the on-disk cache, initially empty (`None`).
/// `ensure_ready` consults it before the file and refreshes it after every
/// file hit or full check, so repeated calls within one process skip the
/// file roundtrip. It also sidesteps Windows NTFS write-then-read
/// visibility races that surface on CI's cold-start virtual disk. The file
/// cache is still the source of truth across processes — each process
/// (CLI command, MCP server, hook subprocess) holds its own copy, and the
/// memory cache only short-circuits repeats within that process.
static MEMORY_CACHE: tokio::sync::Mutex<Option<StartupStatus>> =
    tokio::sync::Mutex::const_new(None);
88
89/// Run the full check battery: open the DB pool (running any pending
90/// migrations), read the provider config, and ping the cloud if the
91/// user is logged in. Returns a freshly-stamped `StartupStatus`.
92///
93/// Every individual probe is fault-tolerant — a cloud outage must not
94/// prevent the CLI from running. We record `None` for probes that
95/// failed so the next invocation will retry, rather than masking them
96/// behind a stale-but-present timestamp.
97async fn run_full_check() -> Result<StartupStatus, CoreError> {
98 let now = Utc::now();
99
100 // Migrations — re-running is a no-op when everything is already
101 // applied, so the cost is a single metadata query.
102 let _pool = crate::db::init_db().await.map_err(CoreError::Internal)?;
103
104 // Recover any cloud-outbox rows that got stuck in 'processing' after
105 // a crashed drain (e.g. SIGKILL'd hook). Anything older than 60 s is
106 // bounced back to 'pending' so the next drain can retry. Failures
107 // are logged but never block startup — a stale outbox row costs at
108 // most one retry's worth of duplicate work on the cloud side.
109 if let Ok(pool) = crate::db::init_db().await {
110 let queue = crate::cloud::outbox::OutboxQueue::new(pool);
111 if let Err(e) = queue
112 .reset_stale(crate::cloud::outbox::DEFAULT_STALE_SECONDS)
113 .await
114 {
115 eprintln!("[difflore] cloud_outbox reset_stale skipped: {e}");
116 }
117 }
118
119 // Provider config read. We only need to know "is there a usable
120 // provider table?" — the `list()` query walks the same rows as the
121 // CLI would. Failures are recorded as `None` so the next invocation
122 // retries.
123 let db = crate::db::init_db().await.map_err(CoreError::Internal)?;
124 let provider_ok_at = match crate::providers::list(&db).await {
125 Ok(_) => Some(now),
126 Err(_) => None,
127 };
128
129 // Cloud reachability: only exercised when the user is logged in.
130 // A logged-out user has no cloud to ping, and we leave the field
131 // unset so `is_fresh` treats it as "not applicable".
132 let cloud_ok_at = {
133 let client = crate::cloud::client::CloudClient::create().await;
134 if client.is_logged_in() {
135 // Simple HEAD-equivalent: we don't have a dedicated ping
136 // endpoint, so call the cheapest logged-in-required probe
137 // we do have. On any error we record `None` and move on.
138 // The `recall_past_verdicts` surface already swallows
139 // network failures and returns `Ok(vec![])`, which makes
140 // it a perfect cache-probe proxy.
141 // Minimal probe: a fixed sentinel query that satisfies the
142 // server-side `min(1)` validation on `queryText`. Empty
143 // string used to be accepted but the Zod schema now requires
144 // ≥1 char; using `"_ping_"` keeps this a no-op-on-the-data
145 // round-trip while no longer spamming a 400 to stderr on
146 // every CLI command. We don't care about the contents of
147 // the response — we only care that the round-trip succeeded.
148 let req = crate::cloud::api_types::RecallPastVerdictsRequest {
149 embedding: Vec::new(),
150 query_text: Some("_ping_".to_owned()),
151 repo_id: None,
152 scope: "personal".to_owned(),
153 team_id: None,
154 k: 1,
155 target_file: None,
156 };
157 match client.recall_past_verdicts(req).await {
158 Ok(_) => Some(now),
159 Err(_) => None,
160 }
161 } else {
162 None
163 }
164 };
165
166 let status = StartupStatus {
167 version: env!("CARGO_PKG_VERSION").to_owned(),
168 migrations_applied_at: now,
169 provider_ok_at,
170 cloud_ok_at,
171 };
172 write_cache(&status).await?;
173 Ok(status)
174}
175
176/// Gate every CLI command goes through on entry. When the cache is
177/// fresh and `force` is false, returns the cached status without
178/// touching the filesystem beyond the single read. When any probe is
179/// stale, runs the full check battery and updates the cache.
180///
181/// `force=true` always re-runs every probe. Use it from explicit
182/// "reset"-style subcommands (`difflore init`, `difflore cloud login`,
183/// etc.) where we want the next command to see the post-change state
184/// immediately, not up to 5 minutes later.
185pub async fn ensure_ready(force: bool) -> Result<StartupStatus, CoreError> {
186 let now = Utc::now();
187 if !force {
188 if let Some(cached) = MEMORY_CACHE.lock().await.as_ref()
189 && cached.is_fresh(now)
190 {
191 return Ok(cached.clone());
192 }
193 if let Some(cached) = read_cache().await
194 && cached.is_fresh(now)
195 {
196 *MEMORY_CACHE.lock().await = Some(cached.clone());
197 return Ok(cached);
198 }
199 }
200 let status = run_full_check().await?;
201 *MEMORY_CACHE.lock().await = Some(status.clone());
202 Ok(status)
203}
204
#[cfg(test)]
mod tests {
    use super::*;

    /// Serialises the two tests that exercise the real cache. They share
    /// `startup-cache.json` and the global `data.db` migration state under
    /// the crate-wide test home; run in parallel they produced false
    /// failures where one test's force-refresh read the other's
    /// half-written cache and `migrate!` raced on the `_sqlx_migrations`
    /// table. A `tokio::sync::Mutex` guard may be held across `await`
    /// points, which is why it is used here.
    static CACHE_SERIAL: tokio::sync::Mutex<()> = tokio::sync::Mutex::const_new(());

    /// Fixture: a status carrying the given stamps and a placeholder version.
    fn status_at(
        migrations_applied_at: DateTime<Utc>,
        provider_ok_at: Option<DateTime<Utc>>,
        cloud_ok_at: Option<DateTime<Utc>>,
    ) -> StartupStatus {
        StartupStatus {
            version: "0.1.0".into(),
            migrations_applied_at,
            provider_ok_at,
            cloud_ok_at,
        }
    }

    /// Fixture: a stamp one minute older than the TTL allows.
    fn expired_before(now: DateTime<Utc>) -> DateTime<Utc> {
        now - Duration::minutes(STARTUP_TTL_MINUTES + 1)
    }

    #[test]
    fn is_fresh_handles_missing_probes() {
        let now = Utc::now();
        // Absent probe stamps must not make the cache stale — a logged-out
        // user legitimately has no cloud timestamp.
        assert!(status_at(now, None, None).is_fresh(now));
    }

    #[test]
    fn is_fresh_rejects_stale_migrations() {
        let now = Utc::now();
        let status = status_at(expired_before(now), Some(now), Some(now));
        assert!(!status.is_fresh(now));
    }

    #[test]
    fn is_fresh_rejects_stale_cloud() {
        let now = Utc::now();
        let status = status_at(now, Some(now), Some(expired_before(now)));
        assert!(!status.is_fresh(now));
    }

    #[tokio::test]
    async fn ensure_ready_caches_between_calls() {
        let _guard = CACHE_SERIAL.lock().await;
        let _home = crate::db::shared_test_home();

        // Start from a forced probe so the baseline is known rather than
        // whatever an earlier test left in the shared cache file.
        let baseline = ensure_ready(true)
            .await
            .expect("first call")
            .migrations_applied_at;

        // An unforced follow-up must be served from cache — same stamp.
        let repeat = ensure_ready(false).await.expect("second call");
        assert_eq!(
            repeat.migrations_applied_at, baseline,
            "second call should come from cache, not re-run"
        );
    }

    #[tokio::test]
    async fn ensure_ready_force_refreshes_cache() {
        let _guard = CACHE_SERIAL.lock().await;
        let _home = crate::db::shared_test_home();

        let first_ts = ensure_ready(false)
            .await
            .expect("first call")
            .migrations_applied_at;

        // Brief pause so the two recorded stamps are guaranteed to differ;
        // a few milliseconds keeps the test fast.
        tokio::time::sleep(std::time::Duration::from_millis(15)).await;

        let second = ensure_ready(true).await.expect("force call");
        assert!(
            second.migrations_applied_at > first_ts,
            "force=true must re-run the full check (got {} vs {first_ts})",
            second.migrations_applied_at
        );
    }
}