revka 2026.6.22

Revka — memory-native AI agent runtime powered by Kumiho
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
//! Local Kumiho Python SDK bridge launcher/client.
//!
//! The hosted FastAPI BFF remains the fallback transport. This module starts a
//! loopback-only Python sidecar from the existing Kumiho venv and lets gateway
//! routes issue FastAPI-shaped `/api/v1/*` calls against the SDK directly.

use super::kumiho_client::KumihoError;
use anyhow::{Context, Result};
use reqwest::{Client, Method, StatusCode};
use serde_json::Value;
use std::fs::OpenOptions;
use std::net::TcpListener;
use std::path::{Path, PathBuf};
use std::process::Stdio;
use std::sync::OnceLock;
use std::time::Duration;
use tokio::process::{Child, Command};
use tokio::sync::Mutex;

/// Source of the Python sidecar, embedded into the binary at compile time.
/// `materialize_bridge_script` writes it into the Kumiho directory before the
/// sidecar is spawned.
const BRIDGE_SCRIPT: &str = include_str!("../../resources/sidecars/kumiho_sdk_bridge.py");
/// File name the embedded sidecar script is written under inside the Kumiho
/// directory (see `bridge_script_path`).
const BRIDGE_SCRIPT_NAME: &str = "kumiho_sdk_bridge.py";

/// Process-env var the bridge reads to decide whether tokenless CE routing is
/// allowed. Set by daemon startup only in local-CE mode; cleared in cloud mode.
const CE_ENDPOINT_ENV: &str = "KUMIHO_LOCAL_SERVER_ENDPOINT";
/// CE-only Redis URL exported alongside the CE endpoint; cleared in cloud mode.
const CE_REDIS_ENV: &str = "KUMIHO_UPSTASH_REDIS_URL";

/// Cloud-routing creds that must be shadowed to empty for the CE sidecar. A
/// non-empty `KUMIHO_AUTH_TOKEN` makes the SDK take the cloud discovery path
/// instead of the local CE probe, and an inherited `KUMIHO_CONTROL_PLANE_URL`
/// would skew the bridge's cloud-path routing cache key. Mirrors the shadowing
/// in `src/agent/kumiho.rs` (memory MCP) and `src/main.rs` (startup shim).
const CE_SHADOWED_CLOUD_VARS: [&str; 3] = [
    "KUMIHO_AUTH_TOKEN",
    "KUMIHO_SERVICE_TOKEN",
    "KUMIHO_CONTROL_PLANE_URL",
];

/// Reports whether local self-hosted CE is active.
///
/// The signal is a `KUMIHO_LOCAL_SERVER_ENDPOINT` value that is present and
/// not blank after trimming. An unset (or unreadable) variable counts as "not
/// configured". `send_raw` keys its tokenless-CE routing decision on this same
/// check.
fn ce_configured() -> bool {
    match std::env::var(CE_ENDPOINT_ENV) {
        Ok(endpoint) => !endpoint.trim().is_empty(),
        Err(_) => false,
    }
}

/// Blank the inherited cloud-routing credentials on the child command when
/// local CE is configured.
///
/// Every name in `CE_SHADOWED_CLOUD_VARS` is overridden with an empty value on
/// `cmd`, so the loopback CE sidecar takes the tokenless CE probe path rather
/// than cloud discovery even if the parent process still carries non-empty
/// values. When CE is not configured the command is left untouched. Doing this
/// here keeps the bridge self-sufficient regardless of how the gateway was
/// launched, matching the shadowing the memory MCP path and the startup shim
/// perform. See memory-6.
fn apply_ce_cred_shadowing(cmd: &mut Command) {
    if !ce_configured() {
        return;
    }
    cmd.envs(CE_SHADOWED_CLOUD_VARS.iter().map(|&name| (name, "")));
}

/// Remove leftover local-CE env vars so cloud mode decides routing on its own.
///
/// `send_raw` treats a non-empty `KUMIHO_LOCAL_SERVER_ENDPOINT` in the process
/// env as permission to route tokenlessly. That variable can leak into a
/// cloud-mode daemon (workspace `.env` importer, a prior CE run, or a shell
/// export). If it stays set, a cloud daemon with an empty token would bypass
/// the token guard and send gateway traffic to a (typically dead) loopback CE
/// endpoint instead of falling back to hosted FastAPI. Call this once early in
/// cloud-mode startup; it is the counterpart of CE mode shadowing the cloud
/// creds. See memory-3.
pub fn clear_stale_ce_env() {
    for name in [CE_ENDPOINT_ENV, CE_REDIS_ENV] {
        // SAFETY: called once early in main before worker threads are spawned.
        unsafe { std::env::remove_var(name) };
    }
}

/// A sidecar that `start_bridge` spawned and saw pass its health check.
#[derive(Debug)]
struct BridgeState {
    /// Loopback origin of the sidecar, of the form `http://127.0.0.1:<port>`.
    base_url: String,
    /// Handle to the spawned Python process; polled with `try_wait` to detect
    /// exit and killed by `mark_dead`.
    child: Child,
}

/// Process-wide slot for the currently running bridge, if any. Lazily
/// initialised by `ensure_bridge`; the async mutex serialises start-up, status
/// checks and teardown.
static BRIDGE: OnceLock<Mutex<Option<BridgeState>>> = OnceLock::new();

/// Successful (2xx) reply from the bridge as returned by `send_raw`.
#[derive(Debug, Clone)]
pub struct BridgeResponse {
    /// HTTP status the sidecar answered with.
    pub status: StatusCode,
    /// Response body as text; empty if the body could not be read.
    pub body: String,
}

/// Whether the local SDK bridge may be used at all.
///
/// Controlled by `REVKA_KUMIHO_SDK_BRIDGE`. The bridge is on by default and is
/// switched off only when the variable, trimmed and compared
/// case-insensitively, is one of `0`, `false`, `no` or `off`. An unset or
/// unreadable variable behaves like `1`.
fn bridge_enabled() -> bool {
    let raw = match std::env::var("REVKA_KUMIHO_SDK_BRIDGE") {
        Ok(value) => value,
        Err(_) => return true,
    };
    let normalized = raw.trim().to_ascii_lowercase();
    !["0", "false", "no", "off"].contains(&normalized.as_str())
}

/// Directory holding the Kumiho sidecar assets: `<revka root>/kumiho`.
///
/// Fails if the Revka root cannot be resolved.
fn kumiho_dir() -> Result<PathBuf> {
    let root = crate::sidecars::revka_root()?;
    Ok(root.join("kumiho"))
}

/// Location of the materialized sidecar script inside the Kumiho directory.
fn bridge_script_path() -> Result<PathBuf> {
    let mut path = kumiho_dir()?;
    path.push(BRIDGE_SCRIPT_NAME);
    Ok(path)
}

/// Find the Python interpreter of the virtualenv at `<dir>/venv`.
///
/// On Windows the only candidate is `venv/Scripts/python.exe`. Elsewhere
/// `venv/bin/python3` is preferred, with `venv/bin/python` as the fallback.
/// Returns `None` when no candidate exists on disk.
fn venv_python(dir: &Path) -> Option<PathBuf> {
    let venv = dir.join("venv");
    let candidates: Vec<PathBuf> = if cfg!(windows) {
        vec![venv.join("Scripts").join("python.exe")]
    } else {
        let bin = venv.join("bin");
        vec![bin.join("python3"), bin.join("python")]
    };
    // Candidates are listed in preference order; take the first that exists.
    candidates.into_iter().find(|candidate| candidate.exists())
}

/// Ensure the embedded sidecar script exists on disk and return its path.
///
/// Creates the Kumiho directory if needed, then rewrites the script only when
/// the file is missing, unreadable, or differs from the embedded
/// `BRIDGE_SCRIPT`.
///
/// # Errors
/// Fails if the directory cannot be created or the script cannot be written.
fn materialize_bridge_script() -> Result<PathBuf> {
    let dir = kumiho_dir()?;
    std::fs::create_dir_all(&dir).with_context(|| format!("creating {}", dir.display()))?;

    let script = bridge_script_path()?;
    // Any read failure is treated the same as stale content: rewrite the file.
    let up_to_date =
        std::fs::read_to_string(&script).is_ok_and(|on_disk| on_disk == BRIDGE_SCRIPT);
    if !up_to_date {
        std::fs::write(&script, BRIDGE_SCRIPT)
            .with_context(|| format!("writing {}", script.display()))?;
    }
    Ok(script)
}

/// Ask the OS for a currently-free loopback TCP port.
///
/// Binds `127.0.0.1:0`, reads back the assigned port and releases the socket
/// before returning. The port is therefore free again when this returns;
/// nothing holds it until the sidecar binds it.
fn reserve_loopback_port() -> Result<u16> {
    let probe = TcpListener::bind("127.0.0.1:0").context("binding loopback bridge port")?;
    let addr = probe.local_addr()?;
    // `probe` is dropped on return, releasing the port for the sidecar.
    Ok(addr.port())
}

/// Open `<revka root>/logs/<name>` for appending, creating it if needed.
///
/// Best-effort: returns `None` if the root cannot be resolved, the `logs`
/// directory cannot be created, or the file cannot be opened.
fn log_file(name: &str) -> Option<std::fs::File> {
    let logs_dir = crate::sidecars::revka_root().ok()?.join("logs");
    if std::fs::create_dir_all(&logs_dir).is_err() {
        return None;
    }
    let mut options = OpenOptions::new();
    options.create(true).append(true);
    options.open(logs_dir.join(name)).ok()
}

async fn poll_health(client: &Client, base_url: &str) -> bool {
    let deadline = tokio::time::Instant::now() + Duration::from_secs(10);
    while tokio::time::Instant::now() < deadline {
        if let Ok(resp) = client
            .get(format!("{base_url}/health"))
            .timeout(Duration::from_millis(500))
            .send()
            .await
        {
            if resp.status().is_success() {
                return true;
            }
        }
        tokio::time::sleep(Duration::from_millis(150)).await;
    }
    false
}

async fn start_bridge(client: &Client) -> Result<BridgeState> {
    let dir = kumiho_dir()?;
    let python = venv_python(&dir).ok_or_else(|| {
        anyhow::anyhow!(
            "Kumiho sidecar venv not found under {}. Run `revka install --sidecars-only`.",
            dir.display()
        )
    })?;
    let script = materialize_bridge_script()?;
    let port = reserve_loopback_port()?;
    let base_url = format!("http://127.0.0.1:{port}");

    let stderr = log_file("kumiho-sdk-bridge.stderr.log").map(Stdio::from);
    let stdout = log_file("kumiho-sdk-bridge.stdout.log").map(Stdio::from);

    let mut cmd = Command::new(python);
    cmd.arg(script)
        .env("KUMIHO_SDK_BRIDGE_HOST", "127.0.0.1")
        .env("KUMIHO_SDK_BRIDGE_PORT", port.to_string())
        .env("PYTHONUNBUFFERED", "1")
        .env_remove("KUMIHO_AUTO_CONFIGURE")
        .stdin(Stdio::null())
        .stdout(stdout.unwrap_or_else(Stdio::null))
        .stderr(stderr.unwrap_or_else(Stdio::null));

    apply_ce_cred_shadowing(&mut cmd);

    #[cfg(windows)]
    {
        cmd.creation_flags(0x0800_0000);
    }

    let mut child = cmd.spawn().context("spawning Kumiho SDK bridge")?;
    if poll_health(client, &base_url).await {
        tracing::info!(%base_url, "Kumiho SDK bridge started");
        return Ok(BridgeState { base_url, child });
    }

    let _ = child.kill().await;
    anyhow::bail!("Kumiho SDK bridge did not become healthy");
}

/// Return the base URL of a live sidecar, starting one on demand.
///
/// Returns `None` when the bridge is disabled via `REVKA_KUMIHO_SDK_BRIDGE` or
/// when a sidecar could not be started; callers then fall back to hosted
/// FastAPI. The global `BRIDGE` lock is held for the whole call, so concurrent
/// callers never spawn more than one sidecar at a time.
///
/// A cached sidecar is reused while `try_wait()` reports it is still running.
/// If it has exited, or its status cannot be determined, the cached entry is
/// discarded and a replacement is started.
async fn ensure_bridge(client: &Client) -> Option<String> {
    if !bridge_enabled() {
        return None;
    }

    let lock = BRIDGE.get_or_init(|| Mutex::new(None));
    let mut guard = lock.lock().await;
    if let Some(state) = guard.as_mut() {
        match state.child.try_wait() {
            Ok(None) => return Some(state.base_url.clone()),
            Ok(Some(status)) => {
                tracing::warn!(?status, "Kumiho SDK bridge exited; restarting on demand");
                *guard = None;
            }
            Err(err) => {
                tracing::warn!(error = %err, "Kumiho SDK bridge status check failed");
                // The process may still be alive. Dropping a `Child` does not
                // kill it by default, so signal it explicitly before
                // forgetting the handle; otherwise the old sidecar is orphaned
                // while a replacement is spawned. Best-effort and
                // non-blocking: the error is ignored because the process may
                // already be gone.
                let _ = state.child.start_kill();
                *guard = None;
            }
        }
    }

    match start_bridge(client).await {
        Ok(state) => {
            let base_url = state.base_url.clone();
            *guard = Some(state);
            Some(base_url)
        }
        Err(err) => {
            tracing::warn!(error = %err, "Kumiho SDK bridge unavailable; falling back to FastAPI");
            None
        }
    }
}

/// Forget the cached sidecar and kill its process.
///
/// Does nothing if the bridge slot was never initialised or is already empty.
/// The `BRIDGE` lock stays held until the kill has completed.
async fn mark_dead() {
    if let Some(lock) = BRIDGE.get() {
        let mut slot = lock.lock().await;
        if let Some(mut stale) = slot.take() {
            let _ = stale.child.kill().await;
        }
    }
}

/// Classify a failed bridge request: is the sidecar gone, or was it a blip?
///
/// A `reqwest` error alone does not prove the process died. A per-request
/// timeout or another transient failure can hit a healthy sidecar, and tearing
/// it down would make every later caller pay the cold start again (process
/// spawn plus up to a 10s health poll). The rules, in order:
///
/// 1. a connect error (`is_connect()`) means the listener is gone: dead;
/// 2. a timeout (`is_timeout()`) is treated as transient: not dead;
/// 3. otherwise the cached child decides: dead if `try_wait()` reports an exit
///    or fails, not dead if it is still running or no sidecar is cached.
///
/// See memory-5.
async fn bridge_is_dead(err: &reqwest::Error) -> bool {
    match (err.is_connect(), err.is_timeout()) {
        (true, _) => return true,
        (false, true) => return false,
        (false, false) => {}
    }

    let Some(lock) = BRIDGE.get() else {
        return false;
    };
    let mut slot = lock.lock().await;
    // Anything other than "still running" (an exit status or a failed status
    // check) counts as dead.
    slot.as_mut()
        .is_some_and(|state| !matches!(state.child.try_wait(), Ok(None)))
}

fn is_unsupported_bridge_response(status: StatusCode, body: &str) -> bool {
    if status != StatusCode::NOT_IMPLEMENTED {
        return false;
    }
    serde_json::from_str::<Value>(body)
        .ok()
        .and_then(|v| {
            v.get("error_code")
                .and_then(|c| c.as_str())
                .map(str::to_string)
        })
        .is_some_and(|code| code == "kumiho_sdk_bridge_unsupported")
}

/// Issue a FastAPI-shaped request against the local SDK bridge.
///
/// `path` is appended to `<bridge>/api/v1`, except the literal `/health`,
/// which maps to `<bridge>/health`. The trimmed `token` is sent in the
/// `X-Kumiho-Token` header and each request has a 10 second timeout.
///
/// # Returns
/// * `None`: the caller should fall back to the hosted FastAPI transport.
///   This happens when the token is empty and no CE endpoint is configured,
///   the bridge is disabled or cannot be started, the request itself fails,
///   the bridge reports the route as unsupported, or it answers with a 5xx.
/// * `Some(Err(KumihoError::Api { .. }))`: the bridge answered with any other
///   non-2xx status; status and body are passed through.
/// * `Some(Ok(BridgeResponse))`: a 2xx reply.
pub async fn send_raw(
    client: &Client,
    method: Method,
    path: &str,
    query: Vec<(String, String)>,
    token: &str,
    body: Option<Value>,
) -> Option<std::result::Result<BridgeResponse, KumihoError>> {
    let token = token.trim();
    // Cloud needs a token to use the bridge. Local self-hosted CE is
    // tokenless, so when a CE endpoint is configured the request still goes
    // through the bridge: CE serves gRPC (not JSON REST), and the hosted
    // FastAPI `/api/v1` fallback would receive an undecodable gRPC body. The
    // bridge shim builds a tokenless CE client from
    // KUMIHO_LOCAL_SERVER_ENDPOINT.
    let tokenless = token.is_empty();
    if tokenless && !ce_configured() {
        return None;
    }

    let base_url = ensure_bridge(client).await?;
    let origin = base_url.trim_end_matches('/');
    let url = match path {
        "/health" => format!("{}/health", origin),
        other => format!("{}/api/v1{}", origin, other),
    };

    let mut req = client
        .request(method, &url)
        .header("X-Kumiho-Token", token)
        .timeout(Duration::from_secs(10));
    if !query.is_empty() {
        req = req.query(&query);
    }
    if let Some(payload) = body {
        req = req.json(&payload);
    }

    let resp = match req.send().await {
        Err(err) => {
            // Only tear the sidecar down when it is really gone; a transient
            // failure leaves it running for the next caller.
            if !bridge_is_dead(&err).await {
                tracing::warn!(error = %err, path = %path, "Kumiho SDK bridge request failed transiently; leaving sidecar running and falling back to FastAPI");
                return None;
            }
            tracing::warn!(error = %err, path = %path, "Kumiho SDK bridge request failed; sidecar appears dead, tearing down");
            mark_dead().await;
            return None;
        }
        Ok(resp) => resp,
    };

    let status = resp.status();
    // An unreadable body is treated as empty rather than as a failure.
    let text = resp.text().await.unwrap_or_default();

    if is_unsupported_bridge_response(status, &text) {
        return None;
    }
    if status.is_server_error() {
        tracing::warn!(
            upstream_status = status.as_u16(),
            path = %path,
            body = %text,
            "Kumiho SDK bridge returned 5xx; falling back to FastAPI"
        );
        return None;
    }

    if status.is_success() {
        Some(Ok(BridgeResponse { status, body: text }))
    } else {
        Some(Err(KumihoError::Api {
            status: status.as_u16(),
            body: text,
        }))
    }
}

#[cfg(test)]
mod tests {
    use super::*;

    /// Serializes the process-global env mutations across every test in this
    /// module that touches `KUMIHO_LOCAL_SERVER_ENDPOINT` / the cloud-cred vars,
    /// so they can't race each other's set/restore. A single shared guard is
    /// required because separate function-local statics are distinct mutexes.
    static ENV_GUARD: std::sync::Mutex<()> = std::sync::Mutex::new(());

    /// Snapshot of selected process-env vars that puts them back on drop.
    ///
    /// Restoring from `Drop` keeps the process env clean even when an
    /// assertion panics mid-test. Values are captured with `var_os`, so a
    /// prior value that is not valid UTF-8 is restored instead of being
    /// removed. Create it *after* taking `ENV_GUARD` so it is dropped (and the
    /// env restored) before the guard is released.
    struct EnvSnapshot {
        saved: Vec<(&'static str, Option<std::ffi::OsString>)>,
    }

    impl EnvSnapshot {
        /// Record the current value (or absence) of each named variable.
        fn capture(vars: impl IntoIterator<Item = &'static str>) -> Self {
            let saved = vars
                .into_iter()
                .map(|name| (name, std::env::var_os(name)))
                .collect();
            Self { saved }
        }
    }

    impl Drop for EnvSnapshot {
        fn drop(&mut self) {
            for (name, prior) in self.saved.drain(..) {
                // SAFETY: every test that builds a snapshot holds ENV_GUARD
                // for the snapshot's whole lifetime, so env mutation in this
                // module is serialized.
                unsafe {
                    match prior {
                        Some(value) => std::env::set_var(name, value),
                        None => std::env::remove_var(name),
                    }
                }
            }
        }
    }

    /// Regression for the stale-`KUMIHO_LOCAL_SERVER_ENDPOINT` misroute
    /// (memory-3). Reproduces the leak the fix targets: a cloud-mode daemon
    /// starts with a *stale* `KUMIHO_LOCAL_SERVER_ENDPOINT` set and an empty
    /// service token. Pre-fix, `send_raw` would compute `ce_configured == true`
    /// from that leaked var, skip the token guard, and route tokenlessly to a
    /// dead loopback CE endpoint. The cloud-mode fix is `clear_stale_ce_env()`;
    /// after it runs, the bridge guard must short-circuit to `None` so the
    /// caller falls back to hosted FastAPI. Reverting the clear would leave the
    /// var set and flip this assertion. The env is process-global, so the
    /// mutation is serialized and the prior values restored. Both vars that
    /// `clear_stale_ce_env()` removes are snapshotted, not just the endpoint.
    // ENV_GUARD must stay held across the send_raw().await below to keep the
    // process-global CE env stable while send_raw reads it; send_raw short-
    // circuits to None before any real await and never touches ENV_GUARD, so
    // there is no deadlock — the await_holding_lock lint is a false positive here.
    #[allow(clippy::await_holding_lock)]
    #[tokio::test]
    async fn cloud_startup_clears_stale_ce_endpoint_so_empty_token_returns_none() {
        let _guard = ENV_GUARD.lock().unwrap_or_else(|e| e.into_inner());
        let _env = EnvSnapshot::capture([CE_ENDPOINT_ENV, CE_REDIS_ENV]);

        // Simulate the leak: a stale CE endpoint is present in the process env
        // at cloud-mode startup. SAFETY: env mutation is serialized by
        // ENV_GUARD for the duration of this test; `_env` restores it on drop.
        unsafe { std::env::set_var(CE_ENDPOINT_ENV, "http://127.0.0.1:1/stale") };

        // Pre-condition: with the stale var set, the bridge would treat this as
        // CE-configured and bypass the empty-token guard — the bug.
        assert!(
            ce_configured(),
            "test setup: stale CE endpoint should read as CE-configured before the fix runs"
        );

        // Apply the cloud-mode fix: clear the leaked CE env vars.
        clear_stale_ce_env();

        let client = Client::new();
        let result = send_raw(&client, Method::GET, "/memory/recall", Vec::new(), "", None).await;

        assert!(
            result.is_none(),
            "after clearing the stale CE endpoint, a cloud-mode bridge call with \
             an empty token must return None to fall back to hosted FastAPI"
        );
    }

    /// Regression for memory-6: the bridge launcher must self-guarantee
    /// CE-correct child routing rather than relying on `main.rs` having blanked
    /// the cloud creds process-wide. With CE configured (a non-empty
    /// `KUMIHO_LOCAL_SERVER_ENDPOINT`), `apply_ce_cred_shadowing` must set the
    /// three cloud-routing vars to empty on the spawned child even when the
    /// parent process still carries non-empty inherited values — so an inherited
    /// cloud token can't push the loopback CE sidecar onto the cloud discovery
    /// path. The env is process-global, so the mutation is serialized and the
    /// prior values restored.
    #[test]
    fn ce_mode_shadows_inherited_cloud_creds_on_spawned_child() {
        let _guard = ENV_GUARD.lock().unwrap_or_else(|e| e.into_inner());
        let _env =
            EnvSnapshot::capture(std::iter::once(CE_ENDPOINT_ENV).chain(CE_SHADOWED_CLOUD_VARS));

        // Simulate a CE-configured daemon whose process env still carries an
        // inherited (non-empty) cloud token — the gap the fix closes. SAFETY:
        // env mutation is serialized by ENV_GUARD; `_env` restores it on drop.
        unsafe {
            std::env::set_var(CE_ENDPOINT_ENV, "127.0.0.1:9190");
            for var in CE_SHADOWED_CLOUD_VARS {
                std::env::set_var(var, "inherited-cloud-value");
            }
        }

        let mut cmd = Command::new("python");
        apply_ce_cred_shadowing(&mut cmd);

        // The child env override must blank every cloud-routing var.
        let overrides: std::collections::HashMap<String, Option<String>> = cmd
            .as_std()
            .get_envs()
            .map(|(k, v)| {
                (
                    k.to_string_lossy().into_owned(),
                    v.map(|v| v.to_string_lossy().into_owned()),
                )
            })
            .collect();

        for var in CE_SHADOWED_CLOUD_VARS {
            assert_eq!(
                overrides.get(var),
                Some(&Some(String::new())),
                "{var} must be shadowed to empty on the CE child env"
            );
        }
    }

    /// In cloud mode (no `KUMIHO_LOCAL_SERVER_ENDPOINT`), the bridge must NOT
    /// shadow the cloud creds — the child needs the real token to route through
    /// cloud discovery. Guards against the CE shadowing leaking into cloud mode.
    #[test]
    fn cloud_mode_does_not_shadow_cloud_creds_on_spawned_child() {
        let _guard = ENV_GUARD.lock().unwrap_or_else(|e| e.into_inner());
        let _env = EnvSnapshot::capture([CE_ENDPOINT_ENV]);

        // Cloud mode: no CE endpoint signal in the env. SAFETY: serialized by
        // ENV_GUARD; `_env` restores the prior value on drop.
        unsafe { std::env::remove_var(CE_ENDPOINT_ENV) };

        let mut cmd = Command::new("python");
        apply_ce_cred_shadowing(&mut cmd);

        let touched_cloud_var = cmd
            .as_std()
            .get_envs()
            .any(|(k, _)| CE_SHADOWED_CLOUD_VARS.contains(&k.to_string_lossy().as_ref()));

        assert!(
            !touched_cloud_var,
            "cloud mode must leave the cloud-routing creds untouched on the child env"
        );
    }

    /// Regression for memory-5: a transient request error must not tear down an
    /// otherwise-healthy sidecar. A connection refusal (`is_connect()`) means
    /// the listener is gone and should mark the bridge dead; a timeout
    /// (`is_timeout()`) is a transient blip against a possibly-healthy process
    /// and must *not*. These two `bridge_is_dead` branches return before
    /// touching the global `BRIDGE` state, so they can be exercised with real
    /// `reqwest::Error`s without a live sidecar.
    #[tokio::test]
    async fn timeout_does_not_mark_bridge_dead_but_connect_refused_does() {
        // Connect error: nothing is listening on this freshly-released port.
        let port = reserve_loopback_port().expect("reserve loopback port");
        let client = Client::new();
        let connect_err = client
            .get(format!("http://127.0.0.1:{port}/health"))
            .send()
            .await
            .expect_err("connecting to an unbound loopback port must fail");
        assert!(
            connect_err.is_connect(),
            "expected a connect error, got: {connect_err}"
        );
        assert!(
            bridge_is_dead(&connect_err).await,
            "a connection refusal indicates the sidecar listener is gone"
        );

        // Timeout error: a real loopback listener accepts the connection but
        // never sends a response, so the connect succeeds and the per-request
        // timeout fires while awaiting the reply. This keeps the test
        // deterministic and free of any external-network dependency.
        let listener = TcpListener::bind("127.0.0.1:0").expect("bind black-hole listener");
        let addr = listener.local_addr().expect("listener addr");
        // Hold the listener open without accepting/responding for the test.
        let _accept = std::thread::spawn(move || {
            // Accept the single connection so it doesn't get refused, then sit
            // on it; the OS keeps the accepted socket alive until the thread or
            // process ends.
            let _conn = listener.accept();
            std::thread::sleep(Duration::from_secs(2));
        });
        let timeout_err = client
            .get(format!("http://{addr}/health"))
            .timeout(Duration::from_millis(100))
            .send()
            .await
            .expect_err("a request to a non-responding listener must time out");
        assert!(
            timeout_err.is_timeout(),
            "expected a timeout error, got: {timeout_err}"
        );
        assert!(
            !bridge_is_dead(&timeout_err).await,
            "a transient timeout must not tear down an otherwise-healthy sidecar"
        );
    }
}