cellos-broker-vault 0.5.1

//! [`SecretBroker`] that fetches secrets from HashiCorp Vault using AppRole authentication.
//!
//! # Overview
//!
//! AppRole is the canonical machine-to-machine authentication method for Vault.
//! The operator pre-stages a `role_id` and `secret_id` for the runner; the broker
//! exchanges them for a short-lived Vault token at resolve time, then reads the
//! requested secret from Vault KV v2.
//!
//! # Configuration
//!
//! | Env var | Required | Description |
//! |---------|----------|-------------|
//! | `CELLOS_VAULT_ADDR` | yes | Vault server address, e.g. `https://vault.example.com` |
//! | `CELLOS_VAULT_ROLE_ID` | yes | AppRole role_id |
//! | `CELLOS_VAULT_SECRET_ID` | yes | AppRole secret_id |
//! | `CELLOS_VAULT_KV_MOUNT` | no | KV v2 mount name (default: `secret`) |
//! | `CELLOS_VAULT_KV_PATH_PREFIX` | no | Path prefix prepended to every key (e.g. `cellos/prod`) |
//! | `CELLOS_VAULT_NAMESPACE` | no | Vault Enterprise namespace (`X-Vault-Namespace` header) |
//! | `CELLOS_CA_BUNDLE` | no | PEM CA bundle for TLS to private Vault endpoints |
//!
//! # Secret path resolution
//!
//! For `resolve("DB_PASSWORD", ...)`, the broker reads from:
//! ```text
//! GET {CELLOS_VAULT_ADDR}/v1/{KV_MOUNT}/data/{PATH_PREFIX}/{key}
//! ```
//!
//! If `CELLOS_VAULT_KV_PATH_PREFIX` is not set, no prefix is added and the key is
//! used directly. The `data` wrapper is Vault KV v2 format.
//!
//! # Auth flow
//!
//! Classic `resolve(...)`:
//! 1. POST `{addr}/v1/auth/approle/login` with `{ "role_id": "...", "secret_id": "..." }`
//! 2. GET `{addr}/v1/{mount}/data/{path}` with `X-Vault-Token: {client_token}`
//! 3. Extract `data.data.{key}` from the KV v2 response envelope; if that field is
//!    absent, the first field present in `data.data` is returned instead
//!
//! A fresh token is obtained for each `resolve` call. This avoids persistent token
//! state in the supervisor process and ensures each secret fetch is independently
//! authenticated. Use a `use_limit=1` or short `ttl` policy on the AppRole if you
//! want one-shot tokens.
//!
//! `runtimeLeasedBroker`:
//! 1. `prepare_runtime_secret_lease(...)` performs a single AppRole login for the cell
//! 2. `fetch_runtime_secret(...)` reuses the cached token for on-demand KV reads
//! 3. `revoke_for_cell(...)` calls `/v1/auth/token/revoke-self` and drops the cached token
//!
//! # Revocation
//!
//! For classic env delivery, revocation still depends on short Vault TTLs because a raw
//! secret has already been materialized into the child process. For `runtimeLeasedBroker`,
//! `revoke_for_cell` actively revokes the cached Vault token and drops the local runtime
//! channel, so future fetches fail both locally and upstream.
//!
//! # TLS
//!
//! Honors `CELLOS_CA_BUNDLE` (PEM file path) for Vault endpoints behind a private CA.
//! `HTTP_PROXY`, `HTTPS_PROXY`, and `NO_PROXY` are respected automatically by reqwest.
//!
//! # Timeout contract (BROKER-VAULT-TIMEOUT)
//!
//! The reqwest client is built with **bounded** request and connect timeouts so a
//! hung Vault endpoint cannot stall a cell's secret-resolve phase indefinitely:
//!
//! - Request timeout: [`DEFAULT_REQUEST_TIMEOUT_MS`] (override via
//!   `CELLOS_VAULT_TIMEOUT_MS`). Default is 15 s — Vault calls are interactive
//!   (login + KV read), so a shorter ceiling than artifact upload is appropriate.
//! - Connect timeout: [`DEFAULT_CONNECT_TIMEOUT_MS`] (override via
//!   `CELLOS_VAULT_CONNECT_TIMEOUT_MS`).
//!
//! Both env vars accept a positive `u64` count of milliseconds; unparseable or
//! zero values fall back to the default. The client is **never** constructed
//! without explicit timeouts.
//!
//! # Correlation propagation (Tranche-1 seam-freeze G1)
//!
//! Each cell run gets a fresh AppRole login (or a single login when
//! `runtimeLeasedBroker` is active). The Vault `client_token` returned by
//! that login is the natural seed for a broker-side correlation ID, but
//! Vault tokens are sensitive material — we do **not** surface the token
//! itself. [`SecretBroker::broker_correlation_id`] therefore returns `None`
//! today, and the supervisor uses the operator-supplied
//! `spec.correlation.correlationId` for cross-tool correlation. A future
//! revision may emit `urn:vault:lease:<accessor>` once the seam-freeze
//! consumer tools (taudit, tencrypt) commit to that shape — accessors are
//! safe to expose because they bind to the lease's audit log entry rather
//! than the credential.

use std::collections::HashMap;
use std::sync::Mutex;
use std::time::Duration;

use async_trait::async_trait;
use cellos_core::ports::SecretBroker;
use cellos_core::{CellosError, RuntimeSecretLeaseRequest, SecretView};
use serde::Deserialize;
use std::fmt;
use tracing::instrument;
use zeroize::{Zeroize, ZeroizeOnDrop};

/// Default total request timeout (ms) applied to every Vault HTTP call.
///
/// 15 seconds: Vault interactions are interactive (login + KV read), so a
/// tighter default than the artifact-upload sinks is appropriate. Long enough
/// for a slow private network, short enough that a black-holed Vault endpoint
/// does not block the secret-resolve phase indefinitely.
pub const DEFAULT_REQUEST_TIMEOUT_MS: u64 = 15_000;

/// Default TCP connect timeout (ms) for the underlying reqwest client.
///
/// 10 seconds. Installed on the client builder together with the total
/// request timeout, so both bounds are always present.
pub const DEFAULT_CONNECT_TIMEOUT_MS: u64 = 10_000;

/// Env var to override [`DEFAULT_REQUEST_TIMEOUT_MS`].
///
/// The value is interpreted by [`resolve_timeout_ms`].
pub const ENV_REQUEST_TIMEOUT_MS: &str = "CELLOS_VAULT_TIMEOUT_MS";

/// Env var to override [`DEFAULT_CONNECT_TIMEOUT_MS`].
///
/// The value is interpreted by [`resolve_timeout_ms`].
pub const ENV_CONNECT_TIMEOUT_MS: &str = "CELLOS_VAULT_CONNECT_TIMEOUT_MS";

/// Look up a millisecond timeout in the environment, with a fallback.
///
/// The variable named by `env_var` is read, surrounding whitespace is
/// trimmed, and the remainder is parsed as a `u64`. `default_ms` is returned
/// whenever that does not yield a strictly positive number: the variable is
/// unset or unreadable, empty, non-numeric, or `0`.
///
/// Kept free of any client construction so callers (and contract tests) can
/// check the resolution policy in isolation.
pub fn resolve_timeout_ms(env_var: &str, default_ms: u64) -> u64 {
    std::env::var(env_var)
        .ok()
        .and_then(|raw| raw.trim().parse::<u64>().ok())
        .filter(|&ms| ms != 0)
        .unwrap_or(default_ms)
}

/// Vault AppRole secret broker.
///
/// Authenticates with Vault per `resolve` call for classic env delivery, and can
/// also hold a cell-scoped upstream token for `runtimeLeasedBroker`.
///
/// # D7 redaction
///
/// `Debug` is hand-written to redact both `role_id` and `secret_id` so
/// accidental `tracing::debug!("{:?}", broker)` calls cannot leak the credential
/// through logs. Zeroization on drop is delegated to a `ZeroizeOnDrop` derive
/// that wipes `secret_id`; cached [`RuntimeVaultLease`] tokens are zeroized via
/// their own `ZeroizeOnDrop` derive when the inner `Mutex<HashMap<..>>` drops.
#[derive(ZeroizeOnDrop)]
pub struct VaultAppRoleBroker {
    /// HTTP client used for every Vault request (login, KV read, revoke).
    #[zeroize(skip)]
    client: reqwest::Client,
    /// Vault server address — not sensitive.
    #[zeroize(skip)]
    addr: String,
    /// AppRole role_id — not a credential itself but treated as non-public.
    #[zeroize(skip)]
    role_id: String,
    /// AppRole secret_id — sensitive, zeroized on drop and redacted in Debug.
    secret_id: String,
    /// KV v2 mount name placed between `/v1/` and `/data/` in read URLs.
    #[zeroize(skip)]
    kv_mount: String,
    /// Optional path prefix inserted between `/data/` and the key in read URLs.
    #[zeroize(skip)]
    kv_path_prefix: Option<String>,
    /// Optional namespace sent as the `X-Vault-Namespace` header on each request.
    #[zeroize(skip)]
    namespace: Option<String>,
    /// Runtime-lease map. Each contained [`RuntimeVaultLease`] derives
    /// `ZeroizeOnDrop`, so dropping the map zeroizes every cached token.
    #[zeroize(skip)]
    runtime_leases: Mutex<HashMap<String, RuntimeVaultLease>>,
}

impl fmt::Debug for VaultAppRoleBroker {
    /// Render the broker for diagnostics without exposing credentials.
    ///
    /// `role_id` and `secret_id` are replaced by a fixed placeholder, and the
    /// runtime-lease map is summarised by its entry count only so no cached
    /// token is ever passed through `Debug`.
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        // A poisoned lock still yields a usable guard; only the count is read.
        let lease_count = match self.runtime_leases.lock() {
            Ok(guard) => guard.len(),
            Err(poisoned) => poisoned.into_inner().len(),
        };

        let mut out = f.debug_struct("VaultAppRoleBroker");
        out.field("addr", &self.addr);
        out.field("role_id", &"<redacted>");
        out.field("secret_id", &"<redacted>");
        out.field("kv_mount", &self.kv_mount);
        out.field("kv_path_prefix", &self.kv_path_prefix);
        out.field("namespace", &self.namespace);
        out.field(
            "runtime_leases",
            &format_args!("<{} cell(s)>", lease_count),
        );
        out.finish()
    }
}

/// Cell-scoped Vault token cached by the broker for `runtimeLeasedBroker`.
///
/// Stored in the broker's runtime-lease map keyed by cell id. The
/// `ZeroizeOnDrop` derive wipes the token when the lease is dropped.
#[derive(ZeroizeOnDrop)]
struct RuntimeVaultLease {
    /// Vault `client_token` — sensitive, zeroized on drop and redacted in Debug.
    token: String,
}

impl fmt::Debug for RuntimeVaultLease {
    /// Print the lease with its token replaced by a fixed placeholder.
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        let mut out = f.debug_struct("RuntimeVaultLease");
        out.field("token", &"<REDACTED>");
        out.finish()
    }
}

impl RuntimeVaultLease {
    /// Explicit early zeroization, for paths that revoke a token before the
    /// owning struct would otherwise drop (e.g. lease replacement).
    fn zeroize(&mut self) {
        self.token.zeroize();
    }
}

// ── Vault HTTP response shapes ─────────────────────────────────────────────

/// Parsed body of a Vault AppRole login response (only the `auth` object is read).
///
/// VAULT-CT-ZERO (reviewer wave 9, follow-up #1):
///
/// `serde::Deserialize` is not implemented for `zeroize::Zeroizing<String>` in
/// the upstream `zeroize` crate, so we cannot wrap the field directly at the
/// struct level. To minimize the un-zeroized window we:
///
/// 1. Keep `client_token: String` (serde-friendly), but
/// 2. Use [`std::mem::take`] in [`VaultAppRoleBroker::login`] to *move* the
///    token's heap buffer out of the parsed struct in a single op (leaving
///    an empty `String` behind), then explicitly drop the parsed
///    [`VaultLoginResponse`] before returning. The consumer (the caller of
///    `login`) then wraps the moved buffer in [`zeroize::Zeroizing`] (or
///    stores it in a [`RuntimeVaultLease`] that zeroizes on drop / revoke),
///    so the only live `String` holding the token bytes is now under our
///    zeroization regime.
/// 3. Implement [`Debug`] manually so the token cannot leak via `{:?}` /
///    `tracing::error!` spans on parse errors or future log additions.
///
/// We deliberately do NOT implement [`Drop`] on [`VaultLoginResponse`] /
/// [`VaultAuth`] because that would forbid moving the inner `String` out at
/// all (Rust forbids partial moves out of `Drop` types). The
/// [`std::mem::take`] pattern is the canonical replacement: it leaves the
/// original allocation in place to be freed by the normal struct drop while
/// the consumer wraps the live token in [`zeroize::Zeroizing`].
#[derive(Deserialize)]
struct VaultLoginResponse {
    /// The `auth` object of the login response; carries the client token.
    auth: VaultAuth,
}

/// The `auth` object inside a Vault login response.
///
/// Only `client_token` is deserialized; `Debug` is hand-written so the token
/// is never printed.
#[derive(Deserialize)]
struct VaultAuth {
    /// Vault token issued by the login — sensitive.
    client_token: String,
}

impl std::fmt::Debug for VaultLoginResponse {
    /// Delegate to the nested `auth` value, whose own `Debug` hides the token.
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        let mut out = f.debug_struct("VaultLoginResponse");
        out.field("auth", &self.auth);
        out.finish()
    }
}

impl std::fmt::Debug for VaultAuth {
    /// Print a fixed placeholder in place of the client token.
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        let mut out = f.debug_struct("VaultAuth");
        out.field("client_token", &"<redacted>");
        out.finish()
    }
}

/// Outer envelope of a Vault KV v2 read response.
#[derive(Deserialize)]
struct VaultKvResponse {
    /// The outer `data` object, which in turn wraps the secret's fields.
    data: VaultKvDataWrapper,
}

/// Inner `data` object of a KV v2 read response.
#[derive(Deserialize)]
struct VaultKvDataWrapper {
    /// The secret's fields as stored in Vault (field name → JSON value).
    data: serde_json::Map<String, serde_json::Value>,
}

// ── CA bundle helper (same pattern as cellos-export-http) ─────────────────

/// Prepare the reqwest client builder shared by all Vault calls.
///
/// Request and connect timeouts are always installed (see module docs). When
/// `CELLOS_CA_BUNDLE` is set, the file it names is read and every PEM block
/// found in it is added as a root certificate.
///
/// # Errors
///
/// Returns a message if the bundle cannot be read, a block fails to parse as
/// a certificate, or the file contains no PEM blocks at all.
fn http_client_builder() -> Result<reqwest::ClientBuilder, String> {
    let timeout_from = |env_var: &str, default_ms: u64| {
        Duration::from_millis(resolve_timeout_ms(env_var, default_ms))
    };
    let request_timeout = timeout_from(ENV_REQUEST_TIMEOUT_MS, DEFAULT_REQUEST_TIMEOUT_MS);
    let connect_timeout = timeout_from(ENV_CONNECT_TIMEOUT_MS, DEFAULT_CONNECT_TIMEOUT_MS);
    let mut builder = reqwest::Client::builder()
        .timeout(request_timeout)
        .connect_timeout(connect_timeout);

    // No bundle configured: the timeouts-only builder is the final answer.
    let path = match std::env::var("CELLOS_CA_BUNDLE") {
        Ok(path) => path,
        Err(_) => return Ok(builder),
    };

    let pem = std::fs::read(&path).map_err(|e| format!("CELLOS_CA_BUNDLE: read {path}: {e}"))?;
    let blocks = pem_cert_blocks(&pem);
    if blocks.is_empty() {
        return Err(format!("CELLOS_CA_BUNDLE: no certificates found in {path}"));
    }

    let loaded = blocks.len();
    for block in &blocks {
        let cert = reqwest::Certificate::from_pem(block)
            .map_err(|e| format!("CELLOS_CA_BUNDLE: parse cert in {path}: {e}"))?;
        builder = builder.add_root_certificate(cert);
    }
    tracing::debug!(path = %path, count = loaded, "CELLOS_CA_BUNDLE: loaded CA certificates");
    Ok(builder)
}

/// Split a PEM bundle into its individual armored blocks.
///
/// The input is decoded lossily as UTF-8 and scanned line by line. A line
/// starting with `-----BEGIN ` opens a block (discarding any block that was
/// still open), and a line starting with `-----END ` closes it. Each returned
/// block holds its lines, including the BEGIN/END markers, each terminated by
/// a single `\n`. Text outside blocks and unterminated blocks are ignored.
fn pem_cert_blocks(pem: &[u8]) -> Vec<Vec<u8>> {
    let text = String::from_utf8_lossy(pem);
    let mut blocks: Vec<Vec<u8>> = Vec::new();
    // `Some` while inside a BEGIN..END pair, holding the lines gathered so far.
    let mut open: Option<String> = None;

    for line in text.lines() {
        if line.starts_with("-----BEGIN ") {
            open = Some(String::new());
        }

        let mut finished = false;
        if let Some(buf) = open.as_mut() {
            buf.push_str(line);
            buf.push('\n');
            finished = line.starts_with("-----END ");
        }

        if finished {
            if let Some(done) = open.take() {
                blocks.push(done.into_bytes());
            }
        }
    }

    blocks
}

// ── Implementation ─────────────────────────────────────────────────────────

impl VaultAppRoleBroker {
    /// Construct from environment variables. Reads all config at construction time
    /// so misconfiguration fails at startup rather than at first secret resolve.
    ///
    /// Optional settings treat a blank value as unset: a `CELLOS_VAULT_KV_MOUNT`
    /// that is empty (or only slashes / whitespace) falls back to the default
    /// mount `secret` instead of producing `/v1//data/...` URLs, matching how
    /// the path prefix and namespace are handled.
    ///
    /// # Errors
    ///
    /// Returns [`CellosError::SecretBroker`] when `CELLOS_VAULT_ADDR`,
    /// `CELLOS_VAULT_ROLE_ID` or `CELLOS_VAULT_SECRET_ID` is missing or blank,
    /// when the address is not a valid `http`/`https` URL, or when the HTTP
    /// client (including the optional CA bundle) cannot be built.
    pub fn from_env() -> Result<Self, CellosError> {
        let addr = std::env::var("CELLOS_VAULT_ADDR")
            .map_err(|_| CellosError::SecretBroker("CELLOS_VAULT_ADDR not set".into()))?;
        // Drop trailing slashes so URL building can always append `/v1/...`.
        let addr = addr.trim().trim_end_matches('/').to_string();
        if addr.is_empty() {
            return Err(CellosError::SecretBroker(
                "CELLOS_VAULT_ADDR is empty after trim".into(),
            ));
        }
        let parsed = reqwest::Url::parse(&addr).map_err(|e| {
            CellosError::SecretBroker(format!("CELLOS_VAULT_ADDR invalid URL: {e}"))
        })?;
        let scheme = parsed.scheme();
        if scheme != "http" && scheme != "https" {
            return Err(CellosError::SecretBroker(format!(
                "CELLOS_VAULT_ADDR scheme must be http or https, got {scheme}"
            )));
        }

        let role_id = std::env::var("CELLOS_VAULT_ROLE_ID")
            .map_err(|_| CellosError::SecretBroker("CELLOS_VAULT_ROLE_ID not set".into()))?;
        if role_id.trim().is_empty() {
            return Err(CellosError::SecretBroker(
                "CELLOS_VAULT_ROLE_ID is empty".into(),
            ));
        }

        let secret_id = std::env::var("CELLOS_VAULT_SECRET_ID")
            .map_err(|_| CellosError::SecretBroker("CELLOS_VAULT_SECRET_ID not set".into()))?;
        if secret_id.trim().is_empty() {
            return Err(CellosError::SecretBroker(
                "CELLOS_VAULT_SECRET_ID is empty".into(),
            ));
        }

        // A blank mount would yield `/v1//data/...`; treat it as unset and use
        // the documented default instead.
        let kv_mount = std::env::var("CELLOS_VAULT_KV_MOUNT")
            .ok()
            .map(|m| m.trim().trim_matches('/').to_string())
            .filter(|m| !m.is_empty())
            .unwrap_or_else(|| "secret".to_string());

        let kv_path_prefix = std::env::var("CELLOS_VAULT_KV_PATH_PREFIX")
            .ok()
            .map(|p| p.trim().trim_matches('/').to_string())
            .filter(|p| !p.is_empty());

        let namespace = std::env::var("CELLOS_VAULT_NAMESPACE")
            .ok()
            .map(|n| n.trim().to_string())
            .filter(|n| !n.is_empty());

        let client = http_client_builder()
            .map_err(CellosError::SecretBroker)?
            .build()
            .map_err(|e| CellosError::SecretBroker(format!("vault http client init: {e}")))?;

        Ok(Self {
            client,
            addr,
            role_id,
            secret_id,
            kv_mount,
            kv_path_prefix,
            namespace,
            runtime_leases: Mutex::new(HashMap::new()),
        })
    }

    /// Exchange the configured AppRole credentials for a Vault token.
    ///
    /// POSTs `role_id` / `secret_id` to `/v1/auth/approle/login` (adding the
    /// namespace header when configured) and returns the `client_token` from
    /// the response. A transport failure, a non-success status, or an
    /// unparsable body is reported as [`CellosError::SecretBroker`].
    async fn login(&self) -> Result<String, CellosError> {
        let endpoint = format!("{}/v1/auth/approle/login", self.addr);
        let credentials = serde_json::json!({
            "role_id": self.role_id,
            "secret_id": self.secret_id,
        });

        let request = self.client.post(&endpoint).json(&credentials);
        let request = match &self.namespace {
            Some(ns) => request.header("X-Vault-Namespace", ns),
            None => request,
        };
        let response = request
            .send()
            .await
            .map_err(|e| CellosError::SecretBroker(format!("vault approle login request: {e}")))?;

        let status = response.status();
        if !status.is_success() {
            let body = response.text().await.unwrap_or_default();
            return Err(CellosError::SecretBroker(format!(
                "vault approle login returned {status}: {body}"
            )));
        }

        let mut parsed: VaultLoginResponse = response
            .json()
            .await
            .map_err(|e| CellosError::SecretBroker(format!("vault login response parse: {e}")))?;

        // VAULT-CT-ZERO: move the token buffer out of the parsed struct right
        // away (an empty `String` stays behind) and drop the struct. The
        // caller is responsible for placing the returned token under
        // zeroization (`Zeroizing` or a `RuntimeVaultLease`).
        let token = std::mem::take(&mut parsed.auth.client_token);
        drop(parsed);
        tracing::debug!("vault approle login succeeded");
        Ok(token)
    }

    /// Compose the full KV v2 read URL for `key`.
    ///
    /// The result is `{addr}/v1/{mount}/data/{prefix}/{key}` when a path
    /// prefix is configured and `{addr}/v1/{mount}/data/{key}` otherwise.
    /// `key` is inserted verbatim.
    fn kv_path(&self, key: &str) -> String {
        let base = format!("{}/v1/{}/data", self.addr, self.kv_mount);
        if let Some(prefix) = self.kv_path_prefix.as_deref() {
            format!("{base}/{prefix}/{key}")
        } else {
            format!("{base}/{key}")
        }
    }

    /// Read one secret from Vault KV v2 with a token the caller already holds.
    ///
    /// Issues a GET against [`Self::kv_path`] with `X-Vault-Token` (and the
    /// namespace header when configured). From the response's `data.data`
    /// object the field named `key` is returned; if no such field exists the
    /// first field of the object is used. String values are returned as-is,
    /// any other JSON value is returned in its serialized JSON form.
    ///
    /// # Errors
    ///
    /// [`CellosError::SecretBroker`] on transport failure, HTTP 404, any other
    /// non-success status, an unparsable body, or an empty `data.data` object.
    async fn fetch_secret(&self, token: &str, key: &str) -> Result<String, CellosError> {
        let endpoint = self.kv_path(key);
        let request = self.client.get(&endpoint).header("X-Vault-Token", token);
        let request = match &self.namespace {
            Some(ns) => request.header("X-Vault-Namespace", ns),
            None => request,
        };
        let response = request
            .send()
            .await
            .map_err(|e| CellosError::SecretBroker(format!("vault kv read request: {e}")))?;

        let status = response.status();
        if status.as_u16() == 404 {
            return Err(CellosError::SecretBroker(format!(
                "vault kv secret not found: {key}"
            )));
        }
        if !status.is_success() {
            let body = response.text().await.unwrap_or_default();
            return Err(CellosError::SecretBroker(format!(
                "vault kv read returned {status}: {body}"
            )));
        }

        let envelope: VaultKvResponse = response
            .json()
            .await
            .map_err(|e| CellosError::SecretBroker(format!("vault kv response parse: {e}")))?;

        // Prefer the field named after the key; otherwise take the first field.
        let fields = &envelope.data.data;
        let picked = match fields.get(key) {
            Some(exact) => exact,
            None => fields.values().next().ok_or_else(|| {
                CellosError::SecretBroker(format!(
                    "vault kv secret {key:?} has no fields in data.data"
                ))
            })?,
        };

        let rendered = if let serde_json::Value::String(text) = picked {
            text.clone()
        } else {
            picked.to_string()
        };
        Ok(rendered)
    }

    /// Ask Vault to revoke `token` via `/v1/auth/token/revoke-self`.
    ///
    /// The token authenticates its own revocation (sent as `X-Vault-Token`).
    /// A transport failure or non-success status is returned as
    /// [`CellosError::SecretBroker`].
    async fn revoke_token(&self, token: &str) -> Result<(), CellosError> {
        let endpoint = format!("{}/v1/auth/token/revoke-self", self.addr);
        let request = self.client.post(&endpoint).header("X-Vault-Token", token);
        let request = match &self.namespace {
            Some(ns) => request.header("X-Vault-Namespace", ns),
            None => request,
        };
        let response = request.send().await.map_err(|e| {
            CellosError::SecretBroker(format!("vault revoke-self request failed: {e}"))
        })?;

        let status = response.status();
        if status.is_success() {
            return Ok(());
        }

        let body = response.text().await.unwrap_or_default();
        Err(CellosError::SecretBroker(format!(
            "vault revoke-self returned {status}: {body}"
        )))
    }

    /// Remove and return the cached lease for `cell_id`, if there is one.
    ///
    /// A poisoned lock is recovered rather than propagated.
    fn take_runtime_lease(&self, cell_id: &str) -> Option<RuntimeVaultLease> {
        let mut leases = match self.runtime_leases.lock() {
            Ok(guard) => guard,
            Err(poisoned) => poisoned.into_inner(),
        };
        leases.remove(cell_id)
    }

    /// Cache `lease` for `cell_id`, wiping any lease it replaces.
    ///
    /// A poisoned lock is recovered rather than propagated.
    fn insert_runtime_lease(&self, cell_id: &str, lease: RuntimeVaultLease) {
        let replaced = {
            let mut leases = match self.runtime_leases.lock() {
                Ok(guard) => guard,
                Err(poisoned) => poisoned.into_inner(),
            };
            leases.insert(cell_id.to_string(), lease)
        };
        // Wipe the displaced token explicitly instead of relying on drop alone.
        if let Some(mut stale) = replaced {
            stale.zeroize();
        }
    }

    /// Introspection surface (E2-03): report whether a runtime lease is
    /// currently cached for `cell_id`. Tests use this to assert that a
    /// partial-resolve failure left no orphan lease in the local map.
    pub fn has_runtime_lease(&self, cell_id: &str) -> bool {
        let leases = match self.runtime_leases.lock() {
            Ok(guard) => guard,
            Err(poisoned) => poisoned.into_inner(),
        };
        leases.contains_key(cell_id)
    }

    /// Introspection surface (E2-03): how many runtime leases this broker is
    /// holding right now. Tests use this to assert that no orphan leases
    /// remain across cells after a forced mid-resolve failure.
    pub fn runtime_lease_count(&self) -> usize {
        let leases = match self.runtime_leases.lock() {
            Ok(guard) => guard,
            Err(poisoned) => poisoned.into_inner(),
        };
        leases.len()
    }
}

#[async_trait]
impl SecretBroker for VaultAppRoleBroker {
    /// Resolves a secret from Vault KV v2.
    ///
    /// Performs AppRole login + KV read per call. No token cache.
    ///
    /// `cell_id` is only recorded in tracing output here, and `_ttl_seconds`
    /// is not used. The token obtained for the call is not revoked by this
    /// method; it is wiped from memory when the function returns.
    ///
    /// # Errors
    ///
    /// Propagates any [`CellosError`] from the login or the KV read.
    #[instrument(skip(self), fields(key = %key, cell_id = %cell_id))]
    async fn resolve(
        &self,
        key: &str,
        cell_id: &str,
        _ttl_seconds: u64,
    ) -> Result<SecretView, CellosError> {
        tracing::debug!(key = %key, cell_id = %cell_id, "resolving vault secret");
        // VAULT-CT-ZERO: wrap the per-resolve Vault token in `Zeroizing` so its
        // bytes are wiped on scope exit (success or error) before this function
        // returns to the supervisor.
        let token = zeroize::Zeroizing::new(self.login().await?);
        let value = self.fetch_secret(token.as_str(), key).await?;
        tracing::info!(key = %key, cell_id = %cell_id, "vault secret resolved");
        // The secret value is handed back wrapped in `Zeroizing` as well.
        Ok(SecretView {
            key: key.to_string(),
            value: zeroize::Zeroizing::new(value),
        })
    }

    /// Log in once for `cell_id` and cache the resulting token as its runtime lease.
    ///
    /// Does nothing when `requests` is empty. Otherwise:
    /// 1. Any lease already cached for the cell is removed, revoked upstream
    ///    and wiped; a failed revoke aborts the call.
    /// 2. A fresh AppRole login is performed.
    /// 3. Every requested key is read once ("pre-warm") to surface missing
    ///    keys or policy gaps at admission time. The values read are only a
    ///    readability check: each one is zeroized immediately and never stored.
    /// 4. On success the token is cached for later `fetch_runtime_secret` calls.
    ///
    /// # Errors
    ///
    /// Returns the first [`CellosError`] from revoking the old lease, logging
    /// in, or pre-warming a key. On a pre-warm failure the new token is
    /// revoked (best effort) and wiped before the error is returned, and no
    /// lease is cached.
    async fn prepare_runtime_secret_lease(
        &self,
        cell_id: &str,
        requests: &[RuntimeSecretLeaseRequest],
    ) -> Result<(), CellosError> {
        if requests.is_empty() {
            return Ok(());
        }

        if let Some(mut previous) = self.take_runtime_lease(cell_id) {
            let revoke_result = self.revoke_token(&previous.token).await;
            // Wipe the old token regardless of whether the revoke succeeded.
            previous.zeroize();
            revoke_result?;
        }

        // E2-03: From the moment login() returns, we hold a materialised
        // upstream Vault lease. Any failure on the way out — including the
        // per-key pre-warm validation below — must revoke that lease before
        // returning Err, otherwise the cell's token outlives the prepare call
        // both at Vault (zombie token) and, if it had been inserted, locally
        // (orphan map entry). We therefore keep the token in a local before
        // inserting and explicitly revoke on every Err exit.
        let mut token = self.login().await?;

        // Pre-warm: validate that each requested key is readable with this
        // token. Catching failures here surfaces missing keys / policy gaps
        // at admission time rather than at first runtime fetch, and it is
        // also the realistic "subsequent step that fails after lease was
        // materialised" — the failure mode E2-03 hardens against.
        for req in requests {
            match self.fetch_secret(&token, &req.key).await {
                // The pre-warm read returns the real secret value. It is not
                // needed here, so wipe it instead of letting a plain `String`
                // holding secret bytes be freed un-zeroized.
                Ok(mut warmed) => warmed.zeroize(),
                Err(e) => {
                    // Revoke the upstream lease before propagating the error so
                    // we leave neither a zombie token at Vault nor an orphan
                    // entry in the local lease map (we have not inserted yet).
                    if let Err(revoke_err) = self.revoke_token(&token).await {
                        tracing::warn!(
                            cell_id = %cell_id,
                            revoke_error = %revoke_err,
                            "failed to revoke partial Vault lease after prepare error; \
                             upstream may rely on TTL"
                        );
                    }
                    tracing::warn!(
                        cell_id = %cell_id,
                        key = %req.key,
                        error = %e,
                        "Vault prepare aborted; partial lease revoked (E2-03)"
                    );
                    token.zeroize();
                    return Err(e);
                }
            }
        }

        self.insert_runtime_lease(cell_id, RuntimeVaultLease { token });
        tracing::info!(
            cell_id = %cell_id,
            secret_count = requests.len(),
            "prepared Vault runtime secret lease"
        );
        Ok(())
    }

    /// Read `key` from Vault using the token cached for `cell_id`.
    ///
    /// The cached token is copied out of the lease map (the lock is released
    /// before any network I/O) and held in a `Zeroizing` wrapper, so the copy
    /// is wiped on every exit path — success, error, or the future being
    /// dropped while the KV read is in flight. `_ttl_seconds` is not used.
    ///
    /// # Errors
    ///
    /// [`CellosError::SecretBroker`] when no lease has been prepared for the
    /// cell, or whatever error the KV read produces.
    async fn fetch_runtime_secret(
        &self,
        key: &str,
        cell_id: &str,
        _ttl_seconds: u64,
    ) -> Result<SecretView, CellosError> {
        // The lock guard is a temporary of this statement, so it is released
        // before the `.await` below.
        let token = zeroize::Zeroizing::new(
            self.runtime_leases
                .lock()
                .unwrap_or_else(|e| e.into_inner())
                .get(cell_id)
                .map(|lease| lease.token.clone())
                .ok_or_else(|| {
                    CellosError::SecretBroker(format!(
                        "no prepared Vault runtime lease for cell {cell_id:?}"
                    ))
                })?,
        );

        let value = self.fetch_secret(token.as_str(), key).await?;
        Ok(SecretView {
            key: key.to_string(),
            value: zeroize::Zeroizing::new(value),
        })
    }

    /// Drop the cached lease for `cell_id` and revoke its token at Vault.
    ///
    /// Succeeds without contacting Vault when no lease is cached. The lease is
    /// removed from the local map and wiped even if the upstream revoke fails;
    /// the revoke outcome is what gets returned.
    async fn revoke_for_cell(&self, cell_id: &str) -> Result<(), CellosError> {
        match self.take_runtime_lease(cell_id) {
            None => Ok(()),
            Some(mut lease) => {
                let outcome = self.revoke_token(&lease.token).await;
                lease.zeroize();
                outcome
            }
        }
    }
}

#[cfg(test)]
mod tests {
    use super::*;
    use std::io::{BufRead, BufReader, Read, Write};
    use std::net::{TcpListener, TcpStream};
    use std::thread;
    use std::time::{Duration, Instant};

    /// One HTTP request as seen by the mock Vault server.
    #[derive(Debug)]
    struct CapturedRequest {
        /// HTTP method from the request line (e.g. `GET`, `POST`).
        method: String,
        /// Request target (path) from the request line.
        target: String,
        /// Value of the `X-Vault-Token` header, if one was sent.
        token: Option<String>,
        /// Request body, read according to `Content-Length`.
        body: String,
    }

    /// Read and parse a single HTTP/1.1 request from `stream`.
    ///
    /// Minimal parser for the mock server: reads the request line, then
    /// headers up to the blank line (remembering `Content-Length` and
    /// `X-Vault-Token`), then exactly `Content-Length` body bytes. Panics on
    /// any I/O or format problem, which fails the calling test.
    fn read_request(stream: &mut TcpStream) -> CapturedRequest {
        // Buffer a clone so the caller keeps the original handle for writing.
        let mut reader = BufReader::new(stream.try_clone().expect("clone stream"));
        let mut request_line = String::new();
        reader
            .read_line(&mut request_line)
            .expect("read request line");
        assert!(!request_line.trim().is_empty(), "expected request line");

        let mut content_length = 0usize;
        let mut token = None;
        loop {
            let mut line = String::new();
            reader.read_line(&mut line).expect("read header");
            // Blank line ends the header section; an empty read means EOF.
            if line == "\r\n" || line.is_empty() {
                break;
            }
            if let Some((name, value)) = line.split_once(':') {
                // Header names are compared case-insensitively.
                let name = name.trim().to_ascii_lowercase();
                let value = value.trim().to_string();
                if name == "content-length" {
                    content_length = value.parse::<usize>().expect("parse content-length");
                } else if name == "x-vault-token" {
                    token = Some(value);
                }
            }
        }

        // Without a Content-Length header the body is treated as empty.
        let mut body = vec![0u8; content_length];
        reader.read_exact(&mut body).expect("read request body");

        let mut parts = request_line.split_whitespace();
        let method = parts.next().expect("method").to_string();
        let target = parts.next().expect("target").to_string();
        CapturedRequest {
            method,
            target,
            token,
            body: String::from_utf8(body).expect("utf8 request body"),
        }
    }

    /// Send a complete HTTP/1.1 response on `stream` and flush it.
    ///
    /// `status_line` is the code plus reason (e.g. `200 OK`). The response
    /// carries `Content-Type`, a `Content-Length` matching `body`, and
    /// `Connection: close`. Panics if writing or flushing fails.
    fn write_response(stream: &mut TcpStream, status_line: &str, body: &str, content_type: &str) {
        let payload = format!(
            "HTTP/1.1 {status_line}\r\nContent-Type: {content_type}\r\nContent-Length: {}\r\nConnection: close\r\n\r\n{body}",
            body.len()
        );
        stream
            .write_all(payload.as_bytes())
            .expect("write response");
        stream.flush().expect("flush response");
    }

    /// Start a single-threaded mock Vault on an ephemeral loopback port.
    ///
    /// Returns the base URL (`http://127.0.0.1:<port>`) and a join handle that
    /// yields every request captured. The server thread stops once
    /// `expected_requests` requests have been handled or 10 seconds have
    /// elapsed, whichever comes first.
    ///
    /// Canned routes: AppRole login (returns token `vault-token`), a KV read
    /// of `secret/data/API_TOKEN`, and `revoke-self`; anything else gets a 404.
    fn start_mock_vault(
        expected_requests: usize,
    ) -> (String, thread::JoinHandle<Vec<CapturedRequest>>) {
        let listener = TcpListener::bind("127.0.0.1:0").expect("bind mock vault");
        // Non-blocking accept lets the loop below observe the deadline.
        listener
            .set_nonblocking(true)
            .expect("set mock vault nonblocking");
        let addr = listener.local_addr().expect("mock vault addr");
        let handle = thread::spawn(move || {
            let deadline = Instant::now() + Duration::from_secs(10);
            let mut requests = Vec::new();
            while requests.len() < expected_requests && Instant::now() < deadline {
                match listener.accept() {
                    Ok((mut stream, _)) => {
                        // Switch the accepted connection to blocking mode for
                        // the request parser.
                        stream
                            .set_nonblocking(false)
                            .expect("set accepted stream blocking");
                        let request = read_request(&mut stream);
                        match (request.method.as_str(), request.target.as_str()) {
                            ("POST", "/v1/auth/approle/login") => write_response(
                                &mut stream,
                                "200 OK",
                                r#"{"auth":{"client_token":"vault-token"}}"#,
                                "application/json",
                            ),
                            ("GET", "/v1/secret/data/API_TOKEN") => write_response(
                                &mut stream,
                                "200 OK",
                                r#"{"data":{"data":{"API_TOKEN":"leased-secret"}}}"#,
                                "application/json",
                            ),
                            ("POST", "/v1/auth/token/revoke-self") => {
                                write_response(&mut stream, "204 No Content", "", "text/plain")
                            }
                            _ => write_response(
                                &mut stream,
                                "404 Not Found",
                                "unexpected request",
                                "text/plain",
                            ),
                        }
                        // Record every request, including unexpected ones.
                        requests.push(request);
                    }
                    // No pending connection yet: back off briefly and poll again.
                    Err(err) if err.kind() == std::io::ErrorKind::WouldBlock => {
                        thread::sleep(Duration::from_millis(20));
                    }
                    Err(err) => panic!("mock vault accept failed: {err}"),
                }
            }
            requests
        });

        (format!("http://{addr}"), handle)
    }

    /// Put the process environment into a known-good baseline: the three
    /// required Vault variables are set to fixed test values and every
    /// optional variable is removed.
    fn set_required_env() {
        let required = [
            ("CELLOS_VAULT_ADDR", "https://vault.example.com"),
            ("CELLOS_VAULT_ROLE_ID", "test-role-id"),
            ("CELLOS_VAULT_SECRET_ID", "test-secret-id"),
        ];
        for (name, value) in required {
            std::env::set_var(name, value);
        }

        let optional = [
            "CELLOS_VAULT_KV_MOUNT",
            "CELLOS_VAULT_KV_PATH_PREFIX",
            "CELLOS_VAULT_NAMESPACE",
            "CELLOS_CA_BUNDLE",
        ];
        for name in optional {
            std::env::remove_var(name);
        }
    }

    /// Unset the three required `CELLOS_VAULT_*` variables. Optional
    /// variables are left untouched.
    fn clear_required_env() {
        let required = [
            "CELLOS_VAULT_ADDR",
            "CELLOS_VAULT_ROLE_ID",
            "CELLOS_VAULT_SECRET_ID",
        ];
        for &name in &required {
            std::env::remove_var(name);
        }
    }

    use std::sync::Mutex;
    /// Lock taken by tests before they modify process environment variables.
    static ENV_LOCK: Mutex<()> = Mutex::new(());

    /// Take [`ENV_LOCK`] and return its guard.
    ///
    /// A poisoned lock (some earlier holder panicked) is not treated as an
    /// error: the guard is extracted from the poison error and returned.
    fn env_lock() -> std::sync::MutexGuard<'static, ()> {
        match ENV_LOCK.lock() {
            Ok(guard) => guard,
            Err(poisoned) => poisoned.into_inner(),
        }
    }

    /// With all three required variables present, `from_env` must succeed.
    ///
    /// If construction fails, the panic message includes the returned error
    /// so the reason is visible in the test output.
    #[test]
    fn constructs_with_required_env() {
        let _g = env_lock();
        set_required_env();
        let broker = VaultAppRoleBroker::from_env();
        // Clear the required variables before checking the result, so they
        // are removed even when the check below panics.
        clear_required_env();
        if let Err(err) = &broker {
            panic!("from_env should succeed with required env set, got: {err}");
        }
    }

    /// `from_env` must fail when `CELLOS_VAULT_ADDR` is unset, and the error
    /// text must name that variable.
    #[test]
    fn fails_when_addr_missing() {
        let _env_guard = env_lock();
        set_required_env();
        std::env::remove_var("CELLOS_VAULT_ADDR");

        let err = VaultAppRoleBroker::from_env().unwrap_err();
        clear_required_env();

        let message = err.to_string();
        assert!(message.contains("CELLOS_VAULT_ADDR"), "got: {err}");
    }

    /// `from_env` must fail when `CELLOS_VAULT_ROLE_ID` is unset, and the
    /// error text must name that variable.
    #[test]
    fn fails_when_role_id_missing() {
        let _env_guard = env_lock();
        set_required_env();
        std::env::remove_var("CELLOS_VAULT_ROLE_ID");

        let err = VaultAppRoleBroker::from_env().unwrap_err();
        clear_required_env();

        let message = err.to_string();
        assert!(message.contains("CELLOS_VAULT_ROLE_ID"), "got: {err}");
    }

    /// `from_env` must fail when `CELLOS_VAULT_SECRET_ID` is unset, and the
    /// error text must name that variable.
    #[test]
    fn fails_when_secret_id_missing() {
        let _env_guard = env_lock();
        set_required_env();
        std::env::remove_var("CELLOS_VAULT_SECRET_ID");

        let err = VaultAppRoleBroker::from_env().unwrap_err();
        clear_required_env();

        let message = err.to_string();
        assert!(message.contains("CELLOS_VAULT_SECRET_ID"), "got: {err}");
    }

    /// A `grpc://` address must be rejected by `from_env` with an error whose
    /// text contains `http or https`.
    #[test]
    fn fails_when_addr_not_http() {
        let _env_guard = env_lock();
        set_required_env();
        // Override the baseline address with a non-HTTP scheme.
        std::env::set_var("CELLOS_VAULT_ADDR", "grpc://vault.example.com");

        let err = VaultAppRoleBroker::from_env().unwrap_err();
        clear_required_env();

        let message = err.to_string();
        assert!(message.contains("http or https"), "got: {err}");
    }

    /// When `CELLOS_VAULT_KV_MOUNT` is unset, the broker's `kv_mount` must be
    /// `secret`.
    #[test]
    fn uses_default_kv_mount() {
        let _env_guard = env_lock();
        set_required_env();

        let built = VaultAppRoleBroker::from_env();
        let broker = built.unwrap();
        clear_required_env();

        assert_eq!(broker.kv_mount, "secret");
    }

    /// `CELLOS_VAULT_KV_MOUNT` and `CELLOS_VAULT_KV_PATH_PREFIX` must be
    /// carried into the broker's `kv_mount` and `kv_path_prefix` fields.
    #[test]
    fn custom_kv_mount_and_prefix() {
        let _env_guard = env_lock();
        set_required_env();

        let overrides = [
            ("CELLOS_VAULT_KV_MOUNT", "kv"),
            ("CELLOS_VAULT_KV_PATH_PREFIX", "cellos/prod"),
        ];
        for &(name, value) in &overrides {
            std::env::set_var(name, value);
        }

        let broker = VaultAppRoleBroker::from_env().unwrap();

        // Undo every variable this test touched before asserting.
        clear_required_env();
        for &(name, _) in &overrides {
            std::env::remove_var(name);
        }

        assert_eq!(broker.kv_mount, "kv");
        let prefix = broker.kv_path_prefix.as_deref();
        assert_eq!(prefix, Some("cellos/prod"));
    }

    /// Without a path prefix, `kv_path` must place the key directly under
    /// `/v1/secret/data/` on the configured address.
    #[test]
    fn kv_path_without_prefix() {
        let _env_guard = env_lock();
        set_required_env();
        let broker = VaultAppRoleBroker::from_env().unwrap();
        clear_required_env();

        let url = broker.kv_path("DB_PASSWORD");
        assert_eq!(url, "https://vault.example.com/v1/secret/data/DB_PASSWORD");
    }

    /// With a custom mount (`kv`) and prefix (`cellos/prod`), `kv_path` must
    /// place the key under `/v1/kv/data/cellos/prod/`.
    #[test]
    fn kv_path_with_prefix() {
        let _env_guard = env_lock();
        set_required_env();

        let overrides = [
            ("CELLOS_VAULT_KV_MOUNT", "kv"),
            ("CELLOS_VAULT_KV_PATH_PREFIX", "cellos/prod"),
        ];
        for &(name, value) in &overrides {
            std::env::set_var(name, value);
        }

        let broker = VaultAppRoleBroker::from_env().unwrap();

        // Undo the optional overrides first, then the required variables.
        for &(name, _) in &overrides {
            std::env::remove_var(name);
        }
        clear_required_env();

        let url = broker.kv_path("DB_PASSWORD");
        assert_eq!(
            url,
            "https://vault.example.com/v1/kv/data/cellos/prod/DB_PASSWORD"
        );
    }

    /// `resolve` against an address where no server is expected to listen
    /// must fail with an error mentioning `vault approle login`.
    #[tokio::test]
    async fn resolve_fails_without_vault_running() {
        // The env guard lives only inside this block, so it is released
        // before the `.await` below.
        let broker = {
            let _env_guard = env_lock();
            set_required_env();
            // Loopback port expected to be unbound — no Vault server running.
            std::env::set_var("CELLOS_VAULT_ADDR", "http://127.0.0.1:19999");
            let built = VaultAppRoleBroker::from_env().unwrap();
            clear_required_env();
            built
        };

        let outcome = broker.resolve("ANY_KEY", "cell-1", 60).await;
        let err = outcome.unwrap_err();
        let message = err.to_string();
        assert!(message.contains("vault approle login"), "got: {err}");
    }

    /// Full runtime-lease round trip against the mock Vault: prepare a lease,
    /// fetch the secret, revoke, then verify the exact request sequence the
    /// mock recorded.
    #[tokio::test]
    async fn runtime_leased_prepare_fetches_and_revokes_token() {
        // E2-03: prepare pre-warms each requested key, so the mock must see
        // login → GET (prewarm) → GET (runtime fetch) → POST revoke-self,
        // i.e. 4 requests in total.
        let (addr, server) = start_mock_vault(4);

        // The env guard lives only inside this block, so it is released
        // before any `.await`.
        let broker = {
            let _env_guard = env_lock();
            set_required_env();
            std::env::set_var("CELLOS_VAULT_ADDR", addr);
            let built = VaultAppRoleBroker::from_env().unwrap();
            clear_required_env();
            built
        };

        let lease_requests = [RuntimeSecretLeaseRequest {
            key: "API_TOKEN".into(),
            ttl_seconds: 60,
        }];
        broker
            .prepare_runtime_secret_lease("cell-1", &lease_requests)
            .await
            .unwrap();

        let view = broker
            .fetch_runtime_secret("API_TOKEN", "cell-1", 60)
            .await
            .unwrap();
        assert_eq!(view.key, "API_TOKEN");
        assert_eq!(view.value.as_str(), "leased-secret");

        broker.revoke_for_cell("cell-1").await.unwrap();

        let requests = server.join().expect("join mock vault");
        assert_eq!(requests.len(), 4);

        // AppRole login carrying both credentials in the body.
        let login = &requests[0];
        assert_eq!(login.method, "POST");
        assert_eq!(login.target, "/v1/auth/approle/login");
        assert!(login.body.contains("\"role_id\":\"test-role-id\""));
        assert!(login.body.contains("\"secret_id\":\"test-secret-id\""));

        // Prewarm GET followed by the runtime fetch GET: same target, both
        // authenticated with the token issued at login.
        for kv_read in &requests[1..3] {
            assert_eq!(kv_read.method, "GET");
            assert_eq!(kv_read.target, "/v1/secret/data/API_TOKEN");
            assert_eq!(kv_read.token.as_deref(), Some("vault-token"));
        }

        // Token self-revocation.
        let revoke = &requests[3];
        assert_eq!(revoke.method, "POST");
        assert_eq!(revoke.target, "/v1/auth/token/revoke-self");
        assert_eq!(revoke.token.as_deref(), Some("vault-token"));
    }

    /// `fetch_runtime_secret` for a cell with no prepared lease must fail
    /// with an error mentioning `no prepared Vault runtime lease`.
    #[tokio::test]
    async fn runtime_leased_fetch_requires_prepared_lease() {
        // The env guard lives only inside this block, so it is released
        // before the `.await` below.
        let broker = {
            let _env_guard = env_lock();
            set_required_env();
            let built = VaultAppRoleBroker::from_env().unwrap();
            clear_required_env();
            built
        };

        let outcome = broker
            .fetch_runtime_secret("API_TOKEN", "missing-cell", 60)
            .await;
        let err = outcome.unwrap_err();
        let message = err.to_string();
        assert!(
            message.contains("no prepared Vault runtime lease"),
            "got: {err}"
        );
    }

    /// Revoking for a cell that never had a lease prepared must return `Ok`.
    #[tokio::test]
    async fn revoke_without_prepared_lease_is_ok() {
        // The env guard lives only inside this block, so it is released
        // before the `.await` below.
        let broker = {
            let _env_guard = env_lock();
            set_required_env();
            let built = VaultAppRoleBroker::from_env().unwrap();
            clear_required_env();
            built
        };

        let outcome = broker.revoke_for_cell("any-cell").await;
        outcome.unwrap();
    }

    /// VAULT-CT-ZERO: `Debug` output of the parse-side `VaultLoginResponse`
    /// and `VaultAuth` types must never contain the `client_token` value, so
    /// that a stray `tracing::error!(?response)` cannot write the token to
    /// logs. The response-level output must also carry a `<redacted>` marker.
    #[test]
    fn vault_login_response_debug_redacts_client_token() {
        // Distinctive token value that must not appear in any Debug output.
        const SENTINEL: &str = "VAULT-CT-ZERO-INLINE-SENTINEL";

        let auth = VaultAuth {
            client_token: SENTINEL.to_string(),
        };
        let response = VaultLoginResponse { auth };

        let dbg = format!("{:?}", response);
        assert!(
            !dbg.contains(SENTINEL),
            "VaultLoginResponse Debug leaked client_token: {dbg}"
        );
        assert!(
            dbg.contains("<redacted>"),
            "VaultLoginResponse Debug should mark client_token as redacted: {dbg}"
        );

        // The nested type is checked on its own as well.
        let auth_dbg = format!("{:?}", &response.auth);
        assert!(
            !auth_dbg.contains(SENTINEL),
            "VaultAuth Debug leaked client_token: {auth_dbg}"
        );
    }
}