zccache 1.12.8

Local-first compiler cache for C/C++/Rust/Emscripten
Documentation
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
//! Daemon lifecycle: start, stop, version probing, ensure-running, binary discovery.

use crate::core::NormalizedPath;
use std::process::ExitCode;

use super::super::status_probe_timeout;
use super::util::{connect, resolve_endpoint, run_async, LOST_CONNECTION_MSG};

/// Outcome of probing the daemon's version, as produced by
/// [`check_daemon_version`] and acted on by [`ensure_daemon`].
pub(crate) enum VersionCheck {
    /// Daemon version matches the client's.
    Ok,
    /// Daemon is newer than client — safe to proceed.
    DaemonNewer {
        /// Version string exactly as reported by the daemon.
        daemon_ver: String,
    },
    /// Daemon is older than client — must restart. Also reported when the
    /// daemon's version string cannot be parsed.
    DaemonOlder {
        /// Version string exactly as reported by the daemon.
        daemon_ver: String,
    },
    /// Could not connect to the daemon at all.
    Unreachable,
    /// Connected but could not complete the version exchange (protocol mismatch, etc.).
    CommError,
    /// Client-side daemon wire configuration is invalid.
    /// Carries the endpoint error message to show to the user.
    ClientConfigError(String),
}

/// Probe the daemon with a `Status` request and classify its version
/// relative to this client.
///
/// The round-trip is bounded by [`status_probe_timeout`] so that a wedged
/// daemon (alive socket, no response) surfaces as `CommError` in seconds
/// rather than the 5-minute global default. The caller's recovery path
/// (`ensure_daemon` → `stop_stale_daemon` → `spawn_and_wait`) then runs
/// promptly. See issue #554.
///
/// Classification:
/// * an endpoint error mentioning the wire-format env var →
///   [`VersionCheck::ClientConfigError`];
/// * an error recognised as "daemon unreachable" → [`VersionCheck::Unreachable`];
/// * any other error, a missing response, or a non-`Status` response →
///   [`VersionCheck::CommError`];
/// * otherwise the reported version is compared against the client's.
pub(crate) async fn check_daemon_version(endpoint: &str) -> VersionCheck {
    let probe = crate::ipc::daemon_control_roundtrip(
        endpoint,
        crate::ipc::DaemonControlRequest::Status,
        Some(status_probe_timeout()),
    )
    .await;

    // Step 1: either obtain the status payload or bail out with the
    // appropriate failure classification.
    let s = match probe {
        Ok(Some(crate::protocol::Response::Status(s))) => s,
        Err(crate::ipc::IpcError::Endpoint(message))
            if message.contains(crate::protocol::wire_prost::WIRE_FORMAT_ENV) =>
        {
            return VersionCheck::ClientConfigError(message);
        }
        Err(err) if crate::cli::client::is_daemon_unreachable_err(&err) => {
            return VersionCheck::Unreachable;
        }
        _ => return VersionCheck::CommError,
    };

    // Step 2: identical version strings need no parsing.
    if s.version == crate::core::VERSION {
        return VersionCheck::Ok;
    }

    // Step 3: order the parsed daemon version against the client's.
    let client_ver = crate::core::version::current();
    let ordering = crate::core::version::Version::parse(&s.version)
        .map(|daemon_ver| daemon_ver.cmp(&client_ver));
    match ordering {
        Some(std::cmp::Ordering::Equal) => VersionCheck::Ok,
        Some(std::cmp::Ordering::Greater) => VersionCheck::DaemonNewer {
            daemon_ver: s.version,
        },
        // Unparseable daemon version → treat as older (safe default)
        Some(std::cmp::Ordering::Less) | None => VersionCheck::DaemonOlder {
            daemon_ver: s.version,
        },
    }
}

/// Launch a fresh daemon process and block until it is ready to serve.
///
/// `reason` is recorded in the lifecycle log's spawn-attempt event.
///
/// `outbound_pid` is `Some(pid)` when this spawn is the second half of
/// a takeover orchestrated by `stop_stale_daemon` — the helper emits
/// the linked `daemon-died{reason: takeover}` + `pipe-handover` pair
/// once the new daemon's PID has been observed. `None` for a clean
/// initial-start (no predecessor to record).  Issue #755 acceptance #2.
///
/// # Errors
///
/// Returns an error string when the daemon binary cannot be located, when
/// spawning fails, or when the daemon does not become ready.
pub(crate) async fn spawn_and_wait(
    endpoint: &str,
    reason: &str,
    outbound_pid: Option<u32>,
) -> Result<(), String> {
    let Some(daemon_bin) = find_daemon_binary() else {
        return Err("cannot find zccache-daemon binary".into());
    };
    tracing::debug!(?daemon_bin, %endpoint, reason, "spawning daemon");

    // Record *why* the CLI is about to spawn a daemon so an operator
    // can correlate each CLI decision with the resulting daemon PID
    // by parsing the single `daemon-lifecycle.log`. See zccache#323
    // for the diagnostic gap that motivated this.
    let meta = crate::core::lifecycle::client_meta(crate::core::VERSION);
    let spawn_attempt = serde_json::json!({
        "reason": reason,
        "endpoint": endpoint,
        "daemon_namespace": crate::core::config::daemon_namespace_label(),
        "client_pid": std::process::id(),
        // #755 acceptance #4: see runtime.rs for rationale.
        "client_version": meta["client_version"],
        "client_binary_path": meta["client_binary_path"],
    });
    crate::core::lifecycle::write_event(
        crate::core::lifecycle::EVENT_SPAWN_ATTEMPT,
        spawn_attempt,
    );

    super::super::spawn_daemon(&daemon_bin, endpoint)?;

    // Adaptive wait keyed on the daemon-lifecycle lockfile PID (issue #673):
    // the previous 100-iteration / 10 s loop expired under thundering-herd
    // builds while individual ERROR_PIPE_BUSY backoffs were still in flight.
    // The shared helper polls past 10 s as long as a daemon owns the lockfile.
    super::super::wait_for_daemon_ready(endpoint).await?;

    // #755 acceptance #2: emit linked daemon-died + pipe-handover events
    // for the takeover case. Best-effort: if we can't read the new
    // daemon's PID right after `wait_for_daemon_ready` (unlikely but
    // possible under thundering-herd lockfile contention) we skip the
    // linkage; the regular `spawn` line still records the new daemon.
    // The lock file is consulted only when there is a predecessor PID.
    let takeover = outbound_pid.and_then(|killed_pid| {
        crate::ipc::check_running_daemon().map(|new_pid| (killed_pid, new_pid))
    });
    if let Some((killed_pid, new_pid)) = takeover {
        crate::core::lifecycle::emit_takeover_lifecycle_events(
            killed_pid,
            new_pid,
            crate::core::VERSION,
            endpoint,
        );
    }
    Ok(())
}

/// Stop a stale daemon that is unreachable or version-incompatible.
///
/// Attempts graceful shutdown via IPC first, then falls back to force-killing
/// the process via the lock file PID. Waits for the endpoint to be released.
///
/// Returns `Some(pid)` with the killed daemon's PID when a force-kill
/// actually fired — the caller threads this through `spawn_and_wait`
/// so the linked daemon-died + pipe-handover events get an
/// `outbound_pid`. `None` means no live daemon was found to kill
/// (graceful shutdown succeeded, or no daemon was running). #755.
pub(crate) async fn stop_stale_daemon(endpoint: &str) -> Option<u32> {
    // Try graceful shutdown via IPC.
    let _ = crate::ipc::daemon_control_roundtrip(
        endpoint,
        crate::ipc::DaemonControlRequest::Shutdown,
        None,
    )
    .await;
    tokio::time::sleep(std::time::Duration::from_millis(200)).await;

    // Force-kill via lock file PID if the daemon is still alive
    let killed_pid = if let Some(pid) = crate::ipc::check_running_daemon() {
        tracing::debug!(pid, "force-killing stale daemon process");
        let kill_ok = crate::ipc::force_kill_process(pid).is_ok();
        if kill_ok {
            for _ in 0..50 {
                if !crate::ipc::is_process_alive(pid) {
                    break;
                }
                tokio::time::sleep(std::time::Duration::from_millis(100)).await;
            }
        }
        crate::ipc::remove_lock_file();
        kill_ok.then_some(pid)
    } else {
        None
    };

    // Wait briefly for the endpoint (named pipe / socket) to be fully released
    tokio::time::sleep(std::time::Duration::from_millis(200)).await;
    killed_pid
}

/// Ensure the daemon is running **and version-compatible**.
///
/// Version checking is asymmetric: a newer daemon is accepted (it's
/// backward-compatible), but an older daemon triggers a hard error
/// telling the user to run `zccache stop` first.
///
/// Handles concurrent calls gracefully: when multiple processes race to start
/// the daemon, only one wins the bind. The losers detect this and connect to
/// the winning daemon instead of failing.
pub(crate) async fn ensure_daemon(endpoint: &str) -> Result<(), String> {
    // Fast path: connect + version check
    match check_daemon_version(endpoint).await {
        VersionCheck::Ok => return Ok(()),
        VersionCheck::DaemonNewer { daemon_ver } => {
            tracing::debug!(
                daemon_ver,
                client_ver = crate::core::VERSION,
                "daemon is newer than client, proceeding"
            );
            return Ok(());
        }
        VersionCheck::DaemonOlder { daemon_ver } => {
            tracing::info!(
                daemon_ver,
                client_ver = crate::core::VERSION,
                "daemon is older than client, auto-recovering"
            );
            let killed_pid = stop_stale_daemon(endpoint).await;
            return spawn_and_wait(
                endpoint,
                crate::core::lifecycle::REASON_REPLACED_STALE_VERSION,
                killed_pid,
            )
            .await;
        }
        VersionCheck::CommError => {
            tracing::info!("cannot communicate with daemon, auto-recovering");
            let killed_pid = stop_stale_daemon(endpoint).await;
            return spawn_and_wait(
                endpoint,
                crate::core::lifecycle::REASON_REPLACED_COMM_ERROR,
                killed_pid,
            )
            .await;
        }
        VersionCheck::ClientConfigError(message) => return Err(message),
        VersionCheck::Unreachable => {
            // Fall through to lock-file check / spawn
        }
    }

    // Check lock file for a running daemon we just can't reach yet
    if let Some(pid) = crate::ipc::check_running_daemon() {
        for _ in 0..20 {
            tokio::time::sleep(std::time::Duration::from_millis(100)).await;
            match check_daemon_version(endpoint).await {
                VersionCheck::Ok => return Ok(()),
                VersionCheck::DaemonNewer { daemon_ver } => {
                    tracing::debug!(
                        daemon_ver,
                        client_ver = crate::core::VERSION,
                        "daemon is newer than client, proceeding"
                    );
                    return Ok(());
                }
                VersionCheck::DaemonOlder { daemon_ver } => {
                    tracing::info!(
                        daemon_ver,
                        client_ver = crate::core::VERSION,
                        "daemon is older than client during startup, auto-recovering"
                    );
                    let killed_pid = stop_stale_daemon(endpoint).await;
                    return spawn_and_wait(
                        endpoint,
                        crate::core::lifecycle::REASON_REPLACED_STALE_VERSION,
                        killed_pid,
                    )
                    .await;
                }
                VersionCheck::CommError => {
                    tracing::info!(
                        "cannot communicate with daemon during startup, auto-recovering"
                    );
                    let killed_pid = stop_stale_daemon(endpoint).await;
                    return spawn_and_wait(
                        endpoint,
                        crate::core::lifecycle::REASON_REPLACED_COMM_ERROR,
                        killed_pid,
                    )
                    .await;
                }
                VersionCheck::ClientConfigError(message) => return Err(message),
                VersionCheck::Unreachable => continue,
            }
        }
        return Err(format!(
            "daemon process {pid} exists but not accepting connections"
        ));
    }

    // No daemon running — spawn one
    spawn_and_wait(endpoint, crate::core::lifecycle::REASON_INITIAL_START, None).await
}

/// Find the daemon binary. Looks next to the CLI binary first, then on PATH.
///
/// The binary name is `zccache-daemon.exe` on Windows and `zccache-daemon`
/// elsewhere. Returns `None` when neither location has a matching file.
pub(crate) fn find_daemon_binary() -> Option<NormalizedPath> {
    let name = if cfg!(windows) {
        "zccache-daemon.exe"
    } else {
        "zccache-daemon"
    };

    // Look next to the CLI binary. Require a regular file (not merely an
    // existing path) so that a *directory* called `zccache-daemon` sitting
    // beside the CLI is not mistaken for the executable; this matches the
    // `is_file()` check used by the PATH lookup.
    let sibling = std::env::current_exe()
        .ok()
        .and_then(|exe| exe.parent().map(|dir| dir.join(name)))
        .filter(|candidate| candidate.is_file());
    if let Some(candidate) = sibling {
        return Some(candidate.into());
    }

    // Fall back to PATH
    which_on_path(name)
}

/// Simple PATH lookup (no external crate needed).
/// On Windows, also tries appending `.exe` if the name has no extension.
pub(crate) fn which_on_path(name: &str) -> Option<NormalizedPath> {
    let path_var = std::env::var_os("PATH")?;
    for dir in std::env::split_paths(&path_var) {
        let candidate = dir.join(name);
        if candidate.is_file() {
            return Some(candidate.into());
        }
        // On Windows, try with .exe suffix
        #[cfg(windows)]
        if std::path::Path::new(name).extension().is_none() {
            let with_exe = dir.join(format!("{name}.exe"));
            if with_exe.is_file() {
                return Some(with_exe.into());
            }
        }
    }
    None
}

/// `start` command: make sure a compatible daemon is running at `endpoint`.
///
/// Prints the outcome to stderr and maps it to a process exit code.
pub(crate) async fn cmd_start(endpoint: &str) -> ExitCode {
    if let Err(e) = ensure_daemon(endpoint).await {
        eprintln!("failed to start daemon: {e}");
        return ExitCode::FAILURE;
    }
    eprintln!("daemon running at {endpoint}");
    ExitCode::SUCCESS
}

/// `stop` command: ask the daemon at `endpoint` to shut down.
///
/// Sends a `Shutdown` request. On acknowledgement, waits (bounded) for the
/// IPC endpoint to go away before reporting success. If the daemon cannot be
/// reached at all, falls back to the lock-file PID and force-kills that
/// process. All diagnostics go to stderr.
pub(crate) async fn cmd_stop(endpoint: &str) -> ExitCode {
    let shutdown = crate::ipc::daemon_control_roundtrip(
        endpoint,
        crate::ipc::DaemonControlRequest::Shutdown,
        None,
    )
    .await;

    let recv_result = match shutdown {
        Ok(response) => response,
        Err(e) if crate::cli::client::is_daemon_unreachable_err(&e) => {
            return stop_unreachable_daemon(endpoint).await;
        }
        Err(e) => {
            eprintln!("zccache[err][R]: broken connection to daemon: {e}");
            return ExitCode::FAILURE;
        }
    };

    match recv_result {
        None => {
            eprintln!("{LOST_CONNECTION_MSG}");
            ExitCode::FAILURE
        }
        Some(crate::protocol::Response::ShuttingDown) => {
            // The daemon acknowledges `Shutdown` immediately and continues
            // teardown asynchronously. Wait for the IPC endpoint to become
            // unreachable so callers like the CI post-step tar do not race
            // the daemon. See issue #182.
            wait_for_daemon_teardown(endpoint).await;
            eprintln!("daemon stopped");
            ExitCode::SUCCESS
        }
        Some(other) => {
            eprintln!("zccache[err][U]: unexpected response from daemon: {other:?}");
            ExitCode::FAILURE
        }
    }
}

/// `cmd_stop` fallback for when the daemon's IPC endpoint cannot be reached:
/// force-kill the process named by the lock file, if there is one.
async fn stop_unreachable_daemon(endpoint: &str) -> ExitCode {
    let Some(pid) = crate::ipc::check_running_daemon() else {
        eprintln!("daemon not running at {endpoint}");
        // No daemon recorded. Still run the bounded endpoint wait so callers
        // (e.g. a CI cache tar step) can rely on the endpoint being gone
        // once `zccache stop` returns.
        wait_for_daemon_teardown(endpoint).await;
        return ExitCode::SUCCESS;
    };

    if let Err(e) = crate::ipc::force_kill_process(pid) {
        eprintln!(
            "zccache: cannot connect to daemon at {endpoint}, and failed to kill \
             locked process {pid}: {e}"
        );
        return ExitCode::FAILURE;
    }

    // Termination was sent; poll up to 50 × 100 ms for the process to exit.
    for _ in 0..50 {
        if !crate::ipc::is_process_alive(pid) {
            crate::ipc::remove_lock_file();
            eprintln!("daemon process {pid} terminated after IPC connection failed");
            wait_for_daemon_teardown(endpoint).await;
            return ExitCode::SUCCESS;
        }
        tokio::time::sleep(std::time::Duration::from_millis(100)).await;
    }
    eprintln!("zccache: sent termination to daemon process {pid}, but it did not exit");
    ExitCode::FAILURE
}

/// Default cap, in seconds, on how long [`wait_for_daemon_teardown`] polls
/// for the daemon's IPC endpoint to become unreachable. Overridable with
/// `ZCCACHE_STOP_TIMEOUT_SECS`.
const STOP_WAIT_DEFAULT_SECS: u64 = 10;
/// Poll cadence inside the bounded wait loop.
const STOP_WAIT_POLL_INTERVAL: std::time::Duration = std::time::Duration::from_millis(100);

/// Total time `zccache stop` is allowed to wait for daemon teardown.
///
/// Reads `ZCCACHE_STOP_TIMEOUT_SECS`; surrounding whitespace is ignored and
/// the value must parse as a `u64` number of seconds (`0` is allowed). When
/// the variable is unset, not valid Unicode, or not a valid `u64`, falls
/// back to [`STOP_WAIT_DEFAULT_SECS`].
fn stop_wait_timeout() -> std::time::Duration {
    let configured = match std::env::var("ZCCACHE_STOP_TIMEOUT_SECS") {
        Ok(raw) => raw.trim().parse::<u64>().ok(),
        Err(_) => None,
    };
    std::time::Duration::from_secs(configured.unwrap_or(STOP_WAIT_DEFAULT_SECS))
}

/// Poll until the IPC endpoint is unreachable. Emits a warning on timeout
/// but never fails the caller — the worst case is that the caller (e.g. CI
/// cache tar) sees the same error it would have seen without this wait.
///
/// The wait is capped by [`stop_wait_timeout`]. If that duration is so large
/// that the deadline cannot be represented as an `Instant`, the wait has no
/// deadline and ends only when the endpoint becomes unreachable.
///
/// The legacy redb-era version of this routine also waited for the index
/// file's exclusive share lock to drop on Windows. With the bincode blob
/// there is no file lock — `flush()` writes via temp+rename, holding the
/// file handle only briefly during the rename — so endpoint reachability
/// is the only signal we need.
pub(crate) async fn wait_for_daemon_teardown(endpoint: &str) {
    // `checked_add` rather than `+`: `Instant + Duration` panics on overflow,
    // which a huge `ZCCACHE_STOP_TIMEOUT_SECS` (e.g. `u64::MAX`) would hit.
    // `None` means "effectively unbounded", matching what was asked for.
    let deadline = std::time::Instant::now().checked_add(stop_wait_timeout());
    loop {
        if !is_ipc_endpoint_reachable(endpoint).await {
            return;
        }
        if matches!(deadline, Some(limit) if std::time::Instant::now() >= limit) {
            eprintln!(
                "zccache: timed out waiting for daemon endpoint to disappear after stop; \
                 continuing anyway. set ZCCACHE_STOP_TIMEOUT_SECS to override."
            );
            return;
        }
        tokio::time::sleep(STOP_WAIT_POLL_INTERVAL).await;
    }
}

/// Reports whether a brand-new `connect()` to the daemon IPC endpoint
/// succeeds. The connection is dropped immediately; only success matters.
async fn is_ipc_endpoint_reachable(endpoint: &str) -> bool {
    matches!(connect(endpoint).await, Ok(_))
}

// Trampolines for top-level flags / `start`/`stop` so the dispatch
// match in `cli::mod` doesn't need its own runtime plumbing.
/// Synchronous entry point for `start`: resolves the default endpoint and
/// drives [`cmd_start`] to completion via `run_async`.
pub(crate) fn run_start() -> ExitCode {
    run_async(cmd_start(&resolve_endpoint(None)))
}

/// Synchronous entry point for `stop`: resolves the default endpoint and
/// drives [`cmd_stop`] to completion via `run_async`.
pub(crate) fn run_stop() -> ExitCode {
    run_async(cmd_stop(&resolve_endpoint(None)))
}