Skip to main content

difflore_core/infra/
startup.rs

//! Fast-start gate that amortises first-command checks across the CLI.
//!
//! Every `difflore` invocation used to pay the cost of:
//!   - opening the `SQLite` pool + running migrations
//!   - reading provider config (auth check probe)
//!   - (if logged in) pinging the cloud for reachability
//!
//! On an interactive shell this is ~200–600 ms wasted per command when
//! nothing has actually changed. `ensure_ready` keeps a tiny JSON file
//! (`~/.difflore/startup-cache.json`) with the last-known-good timestamp
//! for each check and short-circuits when all timestamps are fresh.
//!
//! TTL is deliberately short (5 min) so genuine outages surface within a
//! few invocations rather than being masked for the whole session.
15
16use chrono::{DateTime, Duration, Utc};
17use serde::{Deserialize, Serialize};
18use std::path::PathBuf;
19use tokio::fs;
20
21use crate::errors::CoreError;
22use crate::paths;
23
/// Lifetime, in minutes, of every timestamp stored in the startup cache.
///
/// Five minutes balances "skip the probes on the next N commands after a
/// fresh check" (the common case) against "surface real drift within one
/// TTL window" (the user just logged out of the cloud, swapped provider
/// keys, etc.). `StartupStatus::is_fresh` compares each recorded
/// timestamp against this value.
pub const STARTUP_TTL_MINUTES: i64 = 5;
28
/// Result of the most recent startup check battery.
///
/// Serialized as `~/.difflore/startup-cache.json`. Every field except
/// `version` and `migrations_applied_at` is optional so the struct can
/// round-trip cleanly across future schema additions: the optional
/// fields default to `None` when absent on read and are omitted from the
/// output when `None` on write.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct StartupStatus {
    /// Crate version (`CARGO_PKG_VERSION`) of the binary that produced
    /// this status.
    pub version: String,
    /// Timestamp taken at the start of the check battery that produced
    /// this status; always present.
    pub migrations_applied_at: DateTime<Utc>,
    /// When the provider table was last listed successfully; `None` if
    /// that probe failed.
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub provider_ok_at: Option<DateTime<Utc>>,
    /// When the cloud probe last succeeded; `None` if the user was not
    /// logged in or the probe failed.
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub cloud_ok_at: Option<DateTime<Utc>>,
}
41
42impl StartupStatus {
43    /// Are all probe timestamps newer than `now - TTL`? `provider_ok_at`
44    /// and `cloud_ok_at` are considered fresh when present and recent;
45    /// `None` means the probe was never run (e.g. not logged in yet) and
46    /// is NOT treated as stale — we don't want to re-ping the cloud on
47    /// every invocation just because the user isn't signed in.
48    fn is_fresh(&self, now: DateTime<Utc>) -> bool {
49        let ttl = Duration::minutes(STARTUP_TTL_MINUTES);
50        let migrations_fresh = (now - self.migrations_applied_at) < ttl;
51        let provider_fresh = self.provider_ok_at.is_none_or(|t| (now - t) < ttl);
52        let cloud_fresh = self.cloud_ok_at.is_none_or(|t| (now - t) < ttl);
53        migrations_fresh && provider_fresh && cloud_fresh
54    }
55}
56
57/// `~/.difflore/startup-cache.json` (overridable via `DIFFLORE_HOME`).
58fn cache_path() -> Result<PathBuf, CoreError> {
59    let dir = paths::data_home().map_err(CoreError::Internal)?;
60    Ok(dir.join("startup-cache.json"))
61}
62
63async fn read_cache() -> Option<StartupStatus> {
64    let path = cache_path().ok()?;
65    let bytes = fs::read(&path).await.ok()?;
66    serde_json::from_slice(&bytes).ok()
67}
68
69async fn write_cache(status: &StartupStatus) -> Result<(), CoreError> {
70    let path = cache_path()?;
71    if let Some(parent) = path.parent() {
72        fs::create_dir_all(parent).await?;
73    }
74    let bytes = serde_json::to_vec_pretty(status)?;
75    fs::write(&path, bytes).await?;
76    Ok(())
77}
78
/// In-process mirror of the on-disk cache, initially empty.
///
/// Avoids re-paying the file roundtrip when `ensure_ready` is called
/// many times within a single CLI invocation (MCP server + hook
/// subprocess + CLI command), and sidesteps Windows NTFS
/// write-then-read visibility races that surface on CI's cold-start
/// virtual disk. The file cache is still the source of truth across
/// processes — the memory cache only short-circuits repeats within the
/// same process.
///
/// `ensure_ready` is the only writer: it stores a status here after a
/// fresh on-disk hit and after every full check.
static MEMORY_CACHE: tokio::sync::Mutex<Option<StartupStatus>> =
    tokio::sync::Mutex::const_new(None);
88
89/// Run the full check battery: open the DB pool (running any pending
90/// migrations), read the provider config, and ping the cloud if the
91/// user is logged in. Returns a freshly-stamped `StartupStatus`.
92///
93/// Every individual probe is fault-tolerant — a cloud outage must not
94/// prevent the CLI from running. We record `None` for probes that
95/// failed so the next invocation will retry, rather than masking them
96/// behind a stale-but-present timestamp.
97async fn run_full_check() -> Result<StartupStatus, CoreError> {
98    let now = Utc::now();
99
100    // Migrations — re-running is a no-op when everything is already
101    // applied, so the cost is a single metadata query.
102    let _pool = crate::db::init_db().await.map_err(CoreError::Internal)?;
103
104    // Recover any cloud-outbox rows that got stuck in 'processing' after
105    // a crashed drain (e.g. SIGKILL'd hook). Anything older than 60 s is
106    // bounced back to 'pending' so the next drain can retry. Failures
107    // are logged but never block startup — a stale outbox row costs at
108    // most one retry's worth of duplicate work on the cloud side.
109    if let Ok(pool) = crate::db::init_db().await {
110        let queue = crate::cloud::outbox::OutboxQueue::new(pool);
111        if let Err(e) = queue
112            .reset_stale(crate::cloud::outbox::DEFAULT_STALE_SECONDS)
113            .await
114        {
115            eprintln!("[difflore] cloud_outbox reset_stale skipped: {e}");
116        }
117    }
118
119    // Provider config read. We only need to know "is there a usable
120    // provider table?" — the `list()` query walks the same rows as the
121    // CLI would. Failures are recorded as `None` so the next invocation
122    // retries.
123    let db = crate::db::init_db().await.map_err(CoreError::Internal)?;
124    let provider_ok_at = match crate::providers::list(&db).await {
125        Ok(_) => Some(now),
126        Err(_) => None,
127    };
128
129    // Cloud reachability: only exercised when the user is logged in.
130    // A logged-out user has no cloud to ping, and we leave the field
131    // unset so `is_fresh` treats it as "not applicable".
132    let cloud_ok_at = {
133        let client = crate::cloud::client::CloudClient::create().await;
134        if client.is_logged_in() {
135            // Simple HEAD-equivalent: we don't have a dedicated ping
136            // endpoint, so call the cheapest logged-in-required probe
137            // we do have. On any error we record `None` and move on.
138            // The `recall_past_verdicts` surface already swallows
139            // network failures and returns `Ok(vec![])`, which makes
140            // it a perfect cache-probe proxy.
141            // Minimal probe: a fixed sentinel query that satisfies the
142            // server-side `min(1)` validation on `queryText`. Empty
143            // string used to be accepted but the Zod schema now requires
144            // ≥1 char; using `"_ping_"` keeps this a no-op-on-the-data
145            // round-trip while no longer spamming a 400 to stderr on
146            // every CLI command. We don't care about the contents of
147            // the response — we only care that the round-trip succeeded.
148            let req = crate::cloud::api_types::RecallPastVerdictsRequest {
149                embedding: Vec::new(),
150                query_text: Some("_ping_".to_owned()),
151                repo_id: None,
152                scope: "personal".to_owned(),
153                team_id: None,
154                k: 1,
155                target_file: None,
156            };
157            match client.recall_past_verdicts(req).await {
158                Ok(_) => Some(now),
159                Err(_) => None,
160            }
161        } else {
162            None
163        }
164    };
165
166    let status = StartupStatus {
167        version: env!("CARGO_PKG_VERSION").to_owned(),
168        migrations_applied_at: now,
169        provider_ok_at,
170        cloud_ok_at,
171    };
172    write_cache(&status).await?;
173    Ok(status)
174}
175
176/// Gate every CLI command goes through on entry. When the cache is
177/// fresh and `force` is false, returns the cached status without
178/// touching the filesystem beyond the single read. When any probe is
179/// stale, runs the full check battery and updates the cache.
180///
181/// `force=true` always re-runs every probe. Use it from explicit
182/// "reset"-style subcommands (`difflore init`, `difflore cloud login`,
183/// etc.) where we want the next command to see the post-change state
184/// immediately, not up to 5 minutes later.
185pub async fn ensure_ready(force: bool) -> Result<StartupStatus, CoreError> {
186    let now = Utc::now();
187    if !force {
188        if let Some(cached) = MEMORY_CACHE.lock().await.as_ref()
189            && cached.is_fresh(now)
190        {
191            return Ok(cached.clone());
192        }
193        if let Some(cached) = read_cache().await
194            && cached.is_fresh(now)
195        {
196            *MEMORY_CACHE.lock().await = Some(cached.clone());
197            return Ok(cached);
198        }
199    }
200    let status = run_full_check().await?;
201    *MEMORY_CACHE.lock().await = Some(status.clone());
202    Ok(status)
203}
204
#[cfg(test)]
mod tests {
    use super::*;

    /// Serialises the two cache-dependent tests in this module. They
    /// share `startup-cache.json` and the global `data.db` migration
    /// state under the crate-wide test home; running them in parallel
    /// produced false failures where one test's force-refresh read
    /// the other's half-written cache and `migrate!` raced on the
    /// `_sqlx_migrations` table. `tokio::sync::Mutex` guards may be
    /// held across `await` points, which is what these tests do.
    static CACHE_SERIAL: tokio::sync::Mutex<()> = tokio::sync::Mutex::const_new(());

    /// Builds a status fixture with the given timestamps and a fixed
    /// placeholder version.
    fn status_with(
        migrations_applied_at: DateTime<Utc>,
        provider_ok_at: Option<DateTime<Utc>>,
        cloud_ok_at: Option<DateTime<Utc>>,
    ) -> StartupStatus {
        StartupStatus {
            version: "0.1.0".to_owned(),
            migrations_applied_at,
            provider_ok_at,
            cloud_ok_at,
        }
    }

    /// A timestamp one minute past the TTL relative to `now`.
    fn expired(now: DateTime<Utc>) -> DateTime<Utc> {
        now - Duration::minutes(STARTUP_TTL_MINUTES + 1)
    }

    #[test]
    fn is_fresh_handles_missing_probes() {
        let now = Utc::now();
        // Missing probes shouldn't render the cache stale — a logged-
        // out user legitimately has no cloud timestamp.
        assert!(status_with(now, None, None).is_fresh(now));
    }

    #[test]
    fn is_fresh_rejects_stale_migrations() {
        let now = Utc::now();
        let status = status_with(expired(now), Some(now), Some(now));
        assert!(!status.is_fresh(now));
    }

    #[test]
    fn is_fresh_rejects_stale_cloud() {
        let now = Utc::now();
        let status = status_with(now, Some(now), Some(expired(now)));
        assert!(!status.is_fresh(now));
    }

    #[tokio::test]
    async fn ensure_ready_caches_between_calls() {
        let _guard = CACHE_SERIAL.lock().await;
        let _home = crate::db::shared_test_home();

        // Start from a forced probe so the baseline is known, rather
        // than whatever an earlier test left in the shared cache file.
        let baseline = ensure_ready(true)
            .await
            .expect("first call")
            .migrations_applied_at;

        // An unforced follow-up must be served from cache, which shows
        // up as an unchanged timestamp.
        let repeat = ensure_ready(false).await.expect("second call");
        assert_eq!(
            repeat.migrations_applied_at, baseline,
            "second call should come from cache, not re-run"
        );
    }

    #[tokio::test]
    async fn ensure_ready_force_refreshes_cache() {
        let _guard = CACHE_SERIAL.lock().await;
        let _home = crate::db::shared_test_home();

        let first_ts = ensure_ready(false)
            .await
            .expect("first call")
            .migrations_applied_at;

        // Short pause so the two recorded timestamps are guaranteed to
        // differ; a few milliseconds is enough and keeps the test from
        // being timing-sensitive.
        let pause = std::time::Duration::from_millis(15);
        tokio::time::sleep(pause).await;

        let second = ensure_ready(true).await.expect("force call");
        assert!(
            second.migrations_applied_at > first_ts,
            "force=true must re-run the full check (got {} vs {first_ts})",
            second.migrations_applied_at
        );
    }
}