rsclaw-provider 0.1.0

//! rsclaw-server kvCacheMode=2 provider — incremental session protocol.
//!
//! Wire-level contract: see `~/dev/rsclaw-llm/docs/rsclaw-protocol.md` v1.1+.
//!
//! Stateful sessions where rsclaw-server is the source of truth for
//! conversation history. Per-turn the client sends only the delta
//! (new user message OR tool_results); the server's KV cache stays
//! hot across turns.
//!
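//! Only the stateful `/sessions` path requires `kv_cache_mode == 2`
//! (together with a `session_key`). Requests without a session key are
//! served statelessly via `/oneshot`, `/fastshot` or `/vision`; a request
//! that would reach `/sessions` with any other cache mode is rejected —
//! see `dispatch_decision` for the full precedence table.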

use std::{
    collections::HashMap,
    sync::{Arc, Mutex},
    time::Duration,
};

use anyhow::{Context, Result};
use futures::{StreamExt, TryStreamExt, future::BoxFuture};
use reqwest::StatusCode;
use serde::{Deserialize, Serialize};
use serde_json::{Value, json};

use super::{
    AgentEndpoint, ContentPart, LlmProvider, LlmRequest, LlmStream, Message, MessageContent,
    RecallMetadata, Role, StreamEvent, TokenUsage,
};

/// Default base for the rsclaw-server fleet: the OpenAI-compatible `/v1`
/// root. The incremental-protocol `/sessions/...` paths live under the
/// `/agent` mount (`/v1/agent/sessions`, …) which this provider prepends
/// itself (see `send_following_redirects`), so the base stays at `/v1`
/// where sibling resources like `/v1/models` resolve correctly. Legacy
/// configs that set the base to `…/v1/agent` are normalized in the
/// constructor (trailing `/agent` stripped). Setting `RSCLAW_URL` overrides.
///
/// The normalizing constructor is [`RsclawProvider::with_user_agent`]; it
/// also trims surrounding whitespace and trailing `/` characters from
/// whatever base it is handed, so this value is written without a
/// trailing slash.
///
/// `api.rsclaw.ai` fronts the fleet behind a 308-emitting LB that
/// pins clients (via the [`RedirectCache`] in this module) to their
/// resolved worker host for the response's `Cache-Control: max-age`
/// window (1h by default). First request through any provider
/// instance pays the redirect cost; everything within the TTL after
/// goes direct, so steady-state latency matches a direct deployment.
///
/// NOTE(review): the `RsclawProvider::fleet` field documents the redirect
/// cache as living in `rsclaw_embed::FleetHttp`; confirm that a
/// `RedirectCache` type still exists in this module, otherwise the
/// intra-doc link above is stale.
pub const RSCLAW_DEFAULT_BASE: &str = "https://api.rsclaw.ai/v1";

/// Default `prefix_id` per protocol §2.1.1 / §2.10.1 — namespaced
/// `<ns>/<ver>` string the gateway sends on `POST /sessions`. It's a
/// STATIC identifier (config-driven, never derived from `req.model` or
/// any hash of the request body) so the worker can route the call to
/// its static-registry prefix when one is registered, and otherwise
/// fall back to the dynamic-LRU keyed by `hash(system + tools)`
/// computed worker-side. `req.model` deliberately does NOT participate
/// in prefix_id construction — model selection is independent from the
/// prefix-cache identity. Override via the per-provider config field
/// `prefix_id` (see `ProviderConfig::prefix_id`).
///
/// In this module the value is installed by
/// [`RsclawProvider::with_user_agent`] and replaced by
/// [`RsclawProvider::with_prefix_id`]; an override that fails that
/// builder's validation is dropped, leaving this default in place.
///
/// The `<ver>` component here is the **baseline** version, NOT the
/// gateway's `CARGO_PKG_VERSION`. They are decoupled on purpose:
/// rsclaw-llm has to manually pre-register a base-layer KV slot for
/// each unique prefix_id, and a typical Cargo bump (patch release,
/// channel hot-fix, debug toggle) does not change the canonical
/// `shared_prefix + builtin_tools` payload that this identifier
/// names. Auto-tracking `CARGO_PKG_VERSION` would invalidate the
/// worker's registered slot on every gateway release. Bump this
/// string by hand only when the canonical baseline (asserted by
/// `tests/fixtures/baseline-<ver>.json`) actually changes, and
/// coordinate with rsclaw-llm to re-ingest the new fixture under
/// the new identifier.
pub const RSCLAW_DEFAULT_PREFIX_ID: &str = "rsclaw/2026.6.18";

/// Default HTTP timeout (seconds) for the `/sessions/<id>/compact` splice
/// call. The server holds the per-session lock and dispatches the splice
/// to a worker, and a large `keep_tail` can take a while to redecode, so
/// this must be ≥ the server's splice ceiling. Overridable per provider via
/// config `compactTimeoutSecs`.
///
/// [`RsclawProvider::with_user_agent`] converts this into the provider's
/// initial `compact_timeout`; [`RsclawProvider::with_compact_timeout_secs`]
/// replaces it, ignoring an override of `0`.
pub const RSCLAW_DEFAULT_COMPACT_TIMEOUT_SECS: u64 = 180;

/// Well-known model names served by the managed rsclaw fleet (see
/// `GET /v1/agent/models`). Used by `agent::runtime` to auto-resolve
/// `flash` and `vision` when the user only configured a `rsclaw/*`
/// primary model. The contract: as long as your `model.primary` lives
/// under the `rsclaw/` namespace, you get the rsclaw flash and vision
/// slots for free without repeating them in config. Override via
/// `agents.defaults.model.flash` / `agents.defaults.model.vision` when
/// you want a different model.
///
/// Keep these in sync with whatever the fleet ingests under the
/// version named by [`RSCLAW_DEFAULT_PREFIX_ID`]. When the fleet bumps
/// a model name (e.g. `rsclaw-flash-v2`), bump both — clients on the
/// new gateway version pick up the new defaults automatically without
/// every user having to edit their config.
///
/// This one is the default `flash` slot. `dispatch_decision` strips the
/// leading `rsclaw/` and matches the `rsclaw-flash-` prefix, so this name
/// routes to `/fastshot`.
pub const RSCLAW_DEFAULT_FLASH: &str = "rsclaw/rsclaw-flash-v1";
/// Default `vision` slot, the sibling of [`RSCLAW_DEFAULT_FLASH`] and
/// governed by the same resolution and sync rules. After the `rsclaw/`
/// prefix is stripped it matches `rsclaw-vision-`, so `dispatch_decision`
/// routes it to `/vision`.
pub const RSCLAW_DEFAULT_VISION: &str = "rsclaw/rsclaw-vision-v1";

/// How long to wait for `/sessions/<id>/turn` to start responding
/// (TCP connect + TLS + send body + receive headers + first byte).
/// Once the body stream begins this deadline no longer applies — the
/// SSE body is allowed to take as long as the model needs.
const TURN_HEADERS_TIMEOUT: Duration = Duration::from_secs(60);

/// Read-idle bound applied per chunk once the SSE body is streaming. The
/// headers deadline above no longer applies after the body begins, so this
/// is the only thing that catches a worker that goes silent mid-generation
/// (TCP alive, no bytes). 45s ≈ 15 missed heartbeats at the server's 3s
/// heartbeat cadence.
///
/// The replay path shares this exact constant: `read_sse_terminal_event`
/// wraps every chunk read in a timeout of this many seconds. It is kept as
/// a bare number of seconds (not a `Duration`) so the same value can be
/// interpolated into that function's timeout error message.
const STREAM_READ_IDLE_SECS: u64 = 45;

/// Hard cap on the in-memory `sessions` cache. Each entry is a few
/// dozen bytes (`session_id` + `prefix_id` + counter), so 10_000 caps
/// the per-process footprint at ~1MB even under churn. When the cap is
/// exceeded, [`evict_if_oversized`] trims the map back down to
/// `MAX_SESSIONS / 2` entries — victims are picked by HashMap iteration
/// order, which is good enough since we lack last-access timestamps and
/// the alternative (LRU bookkeeping) would add a synchronisation hot
/// spot. Evicted entries cause one extra replay on their next access,
/// which is the same recovery path used for server-side eviction (§2.2).
///
/// The check runs inline after every insert in `RsclawProvider::store`,
/// so the map never holds more than `MAX_SESSIONS + 1` entries at the
/// moment eviction fires.
///
/// Without this cap a long-running gateway with high session churn
/// (every WeChat user = one session_key) accumulates entries forever
/// and bleeds memory until OOM. Mirrors the pre-existing safeguard in
/// the previous OpenAI-provider implementation that this provider
/// replaced.
const MAX_SESSIONS: usize = 10_000;

// ---------------------------------------------------------------------------
// Provider
// ---------------------------------------------------------------------------

/// Provider for rsclaw-server's incremental session protocol.
///
/// Holds the fleet HTTP client, the normalized endpoint/credential
/// settings, the per-provider protocol knobs, and a process-local cache
/// that maps a caller-supplied session key to the [`SessionEntry`]
/// describing the server-side session opened for it.
pub struct RsclawProvider {
    /// Shared fleet HTTP client: a connection-reusing reqwest client
    /// (redirects DISABLED so the 308 `Cache-Control` survives) wrapped
    /// in the crate-wide [`rsclaw_embed::FleetHttp`] 307/308 redirect
    /// cache, so the LB hop is amortised across this provider AND the
    /// OCR / embed / rerank lanes. Redirect-caching is rsclaw-specific
    /// (only rsclaw-server is OUR infra); arbitrary upstreams keep the
    /// default redirect policy via `http_client_with_ua`.
    fleet: rsclaw_embed::FleetHttp,
    /// Base URL as normalized by the constructor: surrounding whitespace
    /// and trailing `/` removed, and a legacy trailing `/agent` stripped.
    base_url: String,
    /// Optional bearer credential, whitespace-trimmed by the constructor;
    /// `None` when the caller supplied nothing or only whitespace.
    bearer: Option<String>,
    /// Namespaced `<ns>/<ver>` string sent verbatim as the wire
    /// `prefix_id` on every `POST /sessions` / `POST /sessions/replay`.
    /// Resolved at provider construction from the per-provider config
    /// `prefix_id` field, falling back to [`RSCLAW_DEFAULT_PREFIX_ID`]
    /// when the field is absent. It is intentionally static for the
    /// lifetime of the provider: the worker-side dynamic-LRU is keyed
    /// by `hash(system + tools)` (not `user_system`, not `req.model`),
    /// so threading model or per-request data into this field would
    /// only fragment the cache.
    prefix_id: String,
    /// HTTP timeout for the `/sessions/<id>/compact` splice POST. Defaults
    /// to [`RSCLAW_DEFAULT_COMPACT_TIMEOUT_SECS`]; overridable via the
    /// per-provider config `compactTimeoutSecs` so it can be aligned with
    /// the server's splice ceiling without a rebuild.
    compact_timeout: Duration,
    /// When true, every turn whose request carries tools sends
    /// `options.constrain_tool_calls: true` so the worker constrains
    /// tool-call decoding with the lazy GBNF grammar it derived from the
    /// session's tools at create/replay. Per-provider config
    /// `constrainToolCalls`; defaults to false during fleet rollout
    /// (workers without `supports_constrained_tool_calls` ignore the
    /// option, so a stale mix is safe but pointless).
    constrain_tool_calls: bool,
    /// Session cache keyed by the request's session key. Guarded by a
    /// `std::sync::Mutex` and only touched through `lock_sessions`, which
    /// recovers from poisoning; size is bounded by [`MAX_SESSIONS`].
    sessions: Arc<Mutex<HashMap<String, SessionEntry>>>,
}

/// Drain an SSE response until its first terminal event, returning that
/// event as `(event_name, data)`.
///
/// `replay()` uses this for the keep-alive flavour of `/sessions/replay`:
/// the server emits comment heartbeats (`: ...`) every 3s while it works
/// and then finishes the stream with exactly one `result` or `error`
/// event. Line interpretation is delegated to [`SseTerminalParser`]; this
/// function only reassembles `\n`-terminated lines from the byte chunks.
///
/// # Errors
///
/// - no chunk arrives within [`STREAM_READ_IDLE_SECS`] (the connection is
///   considered dead even if TCP has not noticed),
/// - the body ends before a terminal event was seen,
/// - the underlying read fails.
///
/// There is no cap on total duration here; that is the caller's job.
async fn read_sse_terminal_event(resp: reqwest::Response) -> Result<(String, String)> {
    let idle_limit = Duration::from_secs(STREAM_READ_IDLE_SECS);
    let mut body = resp.bytes_stream();
    let mut pending: Vec<u8> = Vec::new();
    let mut parser = SseTerminalParser::default();

    loop {
        // Each individual read gets its own idle deadline.
        let next = match tokio::time::timeout(idle_limit, body.next()).await {
            Ok(item) => item,
            Err(_elapsed) => anyhow::bail!(
                "rsclaw replay: SSE idle for {STREAM_READ_IDLE_SECS}s (heartbeats stopped)"
            ),
        };
        let bytes = match next {
            Some(read) => read.context("rsclaw replay: SSE read error")?,
            None => anyhow::bail!("rsclaw replay: SSE stream ended without a terminal event"),
        };
        pending.extend_from_slice(&bytes);

        // Hand every complete line to the parser; a partial trailing line
        // stays in `pending` until a later chunk completes it. Splitting on
        // the `\n` byte is safe for UTF-8 because that byte never occurs
        // inside a multi-byte sequence.
        while let Some(newline_at) = pending.iter().position(|&b| b == b'\n') {
            let raw_line: Vec<u8> = pending.drain(..=newline_at).collect();
            let text = String::from_utf8_lossy(&raw_line);
            let line = text.trim_end_matches(['\n', '\r']);
            if let Some(terminal) = parser.push_line(line) {
                return Ok(terminal);
            }
        }
    }
}

/// Line-level state machine behind [`read_sse_terminal_event`]: feed
/// decoded SSE lines (no trailing newline), get `Some((event, data))`
/// back when a terminal `result` / `error` event completes. Comments
/// (heartbeats) and any non-terminal events are swallowed.
///
/// Multi-line payloads follow the SSE data-buffer rule: every `data:`
/// line contributes its value followed by a line feed, and one trailing
/// line feed is removed when the event is dispatched. This keeps empty
/// `data:` lines significant wherever they occur, including as the first
/// line of an event.
#[derive(Default)]
struct SseTerminalParser {
    /// Value of the latest `event:` field of the event being assembled.
    event_name: String,
    /// Data buffer of the event being assembled; each contributed line is
    /// stored with its terminating `\n`.
    data: String,
}

impl SseTerminalParser {
    /// Consume one SSE line (already stripped of its line terminator).
    ///
    /// Returns `Some((event_name, data))` only when `line` is the blank
    /// line that closes an event named `result` or `error`; every other
    /// input returns `None`. Parser state is reset at each event boundary.
    fn push_line(&mut self, line: &str) -> Option<(String, String)> {
        if line.is_empty() {
            // Blank line = event boundary. Heartbeat comments produce
            // empty events — skip those, return the first terminal event.
            if self.event_name == "result" || self.event_name == "error" {
                let mut data = std::mem::take(&mut self.data);
                // Drop only the single line feed that terminates the last
                // data line; interior and leading line feeds are payload.
                if data.ends_with('\n') {
                    data.pop();
                }
                return Some((std::mem::take(&mut self.event_name), data));
            }
            self.event_name.clear();
            self.data.clear();
            return None;
        }
        if line.starts_with(':') {
            return None; // SSE comment — the keep-alive heartbeat
        }
        if let Some(v) = line.strip_prefix("event:") {
            self.event_name = v.trim().to_owned();
        } else if let Some(v) = line.strip_prefix("data:") {
            // At most one leading space belongs to the field syntax; the
            // rest of the value is payload.
            self.data.push_str(v.strip_prefix(' ').unwrap_or(v));
            self.data.push('\n');
        }
        None
    }
}

/// Client-side record of one server session, stored in
/// `RsclawProvider::sessions` under the caller's session key.
///
/// `RsclawProvider::lookup_and_bump` treats an entry as stale when its
/// `prefix_id` differs from the provider's current one or when the
/// incoming message count drops below `last_seen_msgs_len`.
#[derive(Clone, Debug)]
struct SessionEntry {
    /// Server-issued, format `rs_<instance>_<random>`.
    session_id: String,
    /// `prefix_id` (post-rename, was `rsclaw_version` pre-spec-v1.7) this
    /// session was opened against. A bump triggers re-open since prefix
    /// cache layout changes invalidate the session.
    prefix_id: String,
    /// Largest `req.messages.len()` we've observed on this session.
    /// A subsequent call with a smaller list means the runtime trimmed
    /// history (compaction, repair, reset) and the server-side KV no
    /// longer matches what the gateway thinks the conversation is —
    /// trigger a re-hydrate via /sessions/replay.
    ///
    /// Updated on every successful `lookup_and_bump`; because lookups with
    /// a smaller count are refused, the stored value never decreases for a
    /// live entry.
    last_seen_msgs_len: usize,
}

impl RsclawProvider {
    /// Create a provider that uses the crate's default User-Agent.
    ///
    /// Shorthand for [`Self::with_user_agent`] with `user_agent = None`;
    /// see that constructor for how `base_url` and `bearer` are
    /// normalized.
    pub fn new(base_url: impl Into<String>, bearer: Option<String>) -> Self {
        Self::with_user_agent(base_url, bearer, None)
    }

    /// Create a provider with custom User-Agent.
    ///
    /// All three inputs are sanitised before use, because values loaded
    /// from env vars / dotenv files frequently carry stray whitespace:
    ///
    /// - `base_url`: surrounding whitespace and trailing `/` are removed,
    ///   then a legacy trailing `/agent` (plus any `/` before it) is
    ///   stripped.
    /// - `bearer`: trimmed; a blank value becomes `None`.
    /// - `user_agent`: trimmed; `None` or a blank value selects
    ///   `super::DEFAULT_USER_AGENT`.
    ///
    /// `prefix_id`, `compact_timeout` and `constrain_tool_calls` start at
    /// their defaults and can be changed with the `with_*` builders. The
    /// session cache starts empty.
    ///
    /// # Panics
    ///
    /// Panics if the underlying `reqwest` client cannot be built.
    pub fn with_user_agent(
        base_url: impl Into<String>,
        bearer: Option<String>,
        user_agent: Option<String>,
    ) -> Self {
        // Trim whitespace before trimming trailing slashes — env vars
        // loaded from a dotenv file frequently carry a trailing newline
        // (`RSCLAW_KEY=sk-abc\n`), and reqwest rejects header values
        // containing `\n` outright (RFC 7230 forbids CTLs in field
        // values). Without this, every signed request 500s with
        // "invalid HTTP header value" from inside the client builder
        // before it ever leaves the process. Same hazard applies to
        // base_url where stray whitespace flips reqwest into
        // url-parse-error territory.
        let base_url = base_url.into().trim().trim_end_matches('/').to_string();
        // Back-compat: older configs set base_url to `…/v1/agent`. The provider
        // now prepends the `/agent` protocol mount itself (so `/v1/models` and
        // friends sit at the `/v1` root), so strip a trailing `/agent` to avoid
        // doubling it into `/v1/agent/agent/sessions`.
        let base_url = base_url
            .strip_suffix("/agent")
            .map(|s| s.trim_end_matches('/').to_string())
            .unwrap_or(base_url);
        let bearer = bearer
            .map(|b| b.trim().to_string())
            .filter(|b| !b.is_empty());
        // The User-Agent is a header value too, so it gets the same
        // treatment as the bearer: a trailing newline would make the client
        // builder fail and trip the `expect` below at construction time. A
        // blank override falls back to the default instead of sending an
        // empty header.
        let user_agent = user_agent
            .as_deref()
            .map(str::trim)
            .filter(|ua| !ua.is_empty())
            .unwrap_or(super::DEFAULT_USER_AGENT);
        // Build a reqwest client with redirects DISABLED so the
        // 307/308 capture loop in `send_following_redirects` can read
        // each redirect response's `Cache-Control` header before
        // following. reqwest's default policy follows transparently
        // and discards intermediate headers, which would defeat the
        // 308 TTL caching the LB-aware routing depends on. The other
        // tuning (UA, connect timeout, keep-alive, idle pool window)
        // mirrors `http_client_with_ua` since rsclaw-server lives in
        // the same operational envelope as the OAI-compat upstreams
        // that helper was tuned for.
        // Tuned for rsclaw native protocol (NOT OpenAI-compat):
        // - connect_timeout 30s: localhost loopback connects fast (<1ms) but a
        //   saturated rsclaw-server under heavy session churn can accept-stall. 30s
        //   tolerates a brief bind() backlog without surfacing as timeout.
        // - pool_idle_timeout 30s: rsclaw-server (axum) keeps idle keepalive for 60s by
        //   default, so a 30s client-side idle is safely inside that window. Old 10s
        //   caused stale-pool refetches under bursty load.
        // - tcp_keepalive 30s: keeps long-prefill streaming connections alive through
        //   any intermediate timeouts during a 20+ second prefill.
        let client = reqwest::Client::builder()
            .user_agent(user_agent)
            .redirect(reqwest::redirect::Policy::none())
            .connect_timeout(Duration::from_secs(30))
            .pool_idle_timeout(Duration::from_secs(30))
            .tcp_keepalive(Duration::from_secs(30))
            .build()
            .expect("failed to build rsclaw HTTP client");
        Self {
            // Wrap the provider's connection-reusing, redirect-disabled client
            // in the shared 307/308 cache so the LB hop amortises across this
            // provider and the OCR / embed / rerank lanes alike.
            fleet: rsclaw_embed::FleetHttp::from_client(client),
            base_url,
            bearer,
            prefix_id: RSCLAW_DEFAULT_PREFIX_ID.to_owned(),
            compact_timeout: Duration::from_secs(RSCLAW_DEFAULT_COMPACT_TIMEOUT_SECS),
            constrain_tool_calls: false,
            sessions: Arc::new(Mutex::new(HashMap::new())),
        }
    }

    /// Enable per-turn grammar-constrained tool-call decoding. Used by the
    /// provider builder in `gateway::providers` when the config carries an
    /// explicit `constrainToolCalls: true`.
    ///
    /// Consumes and returns the provider so it can be chained with the
    /// other `with_*` builders. Passing `false` restores the constructor
    /// default.
    pub fn with_constrain_tool_calls(mut self, enabled: bool) -> Self {
        self.constrain_tool_calls = enabled;
        self
    }

    /// Set the HTTP timeout, in seconds, for the `/compact` splice call.
    ///
    /// The provider builder in `gateway::providers` calls this when the
    /// config carries an explicit `compactTimeoutSecs`. `0` is treated as
    /// "not configured" and leaves the current timeout untouched: a
    /// misconfigured zero must not be able to switch the timeout off, or a
    /// wedged splice could hang the compaction path forever.
    pub fn with_compact_timeout_secs(mut self, secs: u64) -> Self {
        if secs == 0 {
            return self;
        }
        self.compact_timeout = Duration::from_secs(secs);
        self
    }

    /// Override the default `prefix_id` sent on the wire. Used by the
    /// provider builder in `gateway::providers` when the config carries
    /// an explicit `prefix_id`. Trims whitespace and ignores empty
    /// strings (dotenv-style trailing newlines or unset config keys
    /// would otherwise produce an invalid wire value).
    ///
    /// Also validates the §2.10.1 `<ns>/<ver>` shape: exactly one `/`
    /// separator with a non-empty component on each side. Inputs like
    /// `"rsclaw-2026.5.15"` (zero slashes), `"foo/bar/baz"` (two
    /// slashes), `"rsclaw/"` or `"/2026.5.15"` (missing component) are
    /// rejected at the builder so a typo in config doesn't survive
    /// gateway boot and only surface as a per-session 400 from the
    /// server. Rejected inputs are logged at `warn` and the override is
    /// dropped, leaving the provider's current `prefix_id` (the default
    /// unless an earlier override succeeded) in place.
    pub fn with_prefix_id(mut self, prefix_id: impl Into<String>) -> Self {
        let s = prefix_id.into().trim().to_owned();
        if s.is_empty() {
            return self;
        }
        let slash_count = s.matches('/').count();
        // One slash alone is not enough: "ns/" and "/ver" have the right
        // count but are not a namespace/version pair.
        let well_formed = slash_count == 1
            && s.split_once('/')
                .is_some_and(|(ns, ver)| !ns.is_empty() && !ver.is_empty());
        if !well_formed {
            tracing::warn!(
                requested_prefix_id = %s,
                slash_count,
                default_prefix_id = RSCLAW_DEFAULT_PREFIX_ID,
                "rsclaw with_prefix_id: ignoring override that violates §2.10.1 \
                 (need exactly one '/' separator between non-empty <ns> and <ver>); \
                 falling back to default"
            );
            return self;
        }
        self.prefix_id = s;
        self
    }

    /// Lock the session cache, tolerating a poisoned mutex.
    ///
    /// `Mutex::lock()` only fails after some thread panicked while holding
    /// the guard. Earlier helpers handled that with `.ok()?` /
    /// `if let Ok(...)`, which quietly turned every later call into a
    /// no-op: lookups always missed, `store` / `forget` did nothing, and
    /// the logs gave no hint why. Here the poisoned guard is unwrapped with
    /// `into_inner()` instead, so the cached data stays usable. The map
    /// itself remains well-defined; at worst an interrupted insert/remove
    /// leaves a stale entry, and staleness is already handled by the
    /// replay path. The condition is reported once so operators can see it.
    fn lock_sessions(&self) -> std::sync::MutexGuard<'_, HashMap<String, SessionEntry>> {
        use std::sync::OnceLock;
        // Poison never clears on its own, so one log line per process is
        // enough; every later call would otherwise repeat it.
        static LOGGED: OnceLock<()> = OnceLock::new();

        self.sessions.lock().unwrap_or_else(|poisoned| {
            if LOGGED.set(()).is_ok() {
                tracing::error!(
                    "rsclaw: sessions mutex poisoned — a prior thread \
                     panicked while holding it. Recovering inner data \
                     and continuing; expect possible session-state \
                     drift until restart."
                );
            }
            poisoned.into_inner()
        })
    }

    /// Fetch the cached session for `session_key` and check, under a single
    /// lock acquisition, that it is still usable.
    ///
    /// Yields `None` — which makes the caller re-hydrate — when no entry
    /// exists, when the entry was opened against a different `prefix_id`,
    /// or when `msgs_len` is smaller than the entry's `last_seen_msgs_len`
    /// (the history shrank since the last call). A stale entry is left in
    /// the map untouched. On a hit, `last_seen_msgs_len` is advanced to
    /// `msgs_len` and a copy of the updated entry is returned.
    fn lookup_and_bump(
        &self,
        session_key: &str,
        prefix_id: &str,
        msgs_len: usize,
    ) -> Option<SessionEntry> {
        let mut sessions = self.lock_sessions();
        let cached = sessions.get_mut(session_key)?;

        let same_prefix = cached.prefix_id == prefix_id;
        let history_intact = cached.last_seen_msgs_len <= msgs_len;
        if !(same_prefix && history_intact) {
            return None;
        }

        cached.last_seen_msgs_len = msgs_len;
        Some(cached.clone())
    }

    /// Insert or replace the cache entry for `session_key`, then enforce
    /// the size cap.
    ///
    /// The cap check runs right here, under the same lock as the insert,
    /// rather than on a timer or in a background task: a burst of new
    /// sessions therefore cannot overshoot the high-water mark while
    /// waiting for a sweeper to wake up.
    fn store(&self, session_key: &str, entry: SessionEntry) {
        let mut sessions = self.lock_sessions();
        sessions.insert(session_key.to_owned(), entry);
        evict_if_oversized(&mut sessions);
    }

    /// Drop the cache entry for `session_key`, if any. Removing a key that
    /// is not cached is a no-op.
    fn forget(&self, session_key: &str) {
        // The guard is a temporary, so the lock is released as soon as the
        // removal completes.
        self.lock_sessions().remove(session_key);
    }
}

/// Shrink the session cache once it has grown past [`MAX_SESSIONS`].
///
/// Nothing happens at or below the cap. Above it, entries are removed
/// until `MAX_SESSIONS / 2` remain. Victims are whatever `HashMap`
/// iteration yields first: the order is unspecified, but that is
/// acceptable because the cache has no last-access information, and a
/// real LRU would need an extra structure plus a timestamp write on every
/// read. An evicted session simply pays for one extra replay round-trip
/// the next time it is used — the same recovery path that covers
/// eviction on the server side.
fn evict_if_oversized(map: &mut HashMap<String, SessionEntry>) {
    let before = map.len();
    if before > MAX_SESSIONS {
        let surplus = before - MAX_SESSIONS / 2;
        // Keys are copied out first: the map cannot be mutated while its
        // key iterator is alive.
        let doomed: Vec<String> = map.keys().take(surplus).cloned().collect();
        let dropped = doomed.len();
        for key in &doomed {
            map.remove(key);
        }
        tracing::info!(
            cap = MAX_SESSIONS,
            dropped,
            remaining = map.len(),
            "rsclaw: sessions cache over cap, evicted batch"
        );
    }
}

// ---------------------------------------------------------------------------
// LlmProvider impl
// ---------------------------------------------------------------------------

/// Outcome of [`dispatch_decision`] — either a stateless one-shot route
/// or a sentinel telling the caller to continue down the stateful
/// `/v1/agent/sessions/*` protocol path.
#[derive(Debug, PartialEq, Eq)]
enum DispatchRoute {
    /// Stateless request. The payload is the route chosen by
    /// `dispatch_decision`: `"/fastshot"`, `"/vision"` or `"/oneshot"`.
    OneShot(&'static str),
    /// Stateful request: the caller continues with the session protocol.
    /// Only produced when the request has a session key and
    /// `kv_cache_mode == 2`.
    Sessions,
}

/// Classify an `LlmRequest` into the rsclaw route it must take.
///
/// This is the single routing authority: `stream()` calls it and the test
/// suite asserts against it, so the safety checks cannot be skipped by a
/// caller that routes on its own. Apart from one diagnostic log line it
/// has no side effects.
///
/// The server enforces per-route model whitelists (400
/// `model_slot_mismatch` on violation), so a canonical model name always
/// beats the endpoint hint; the hint is consulted only for non-canonical
/// names. A leading `rsclaw/` on the model name is ignored for matching.
///
/// Evaluation order:
///   0. kv_cache_mode=2 + session_key=None              → error (stateful
///      traffic without a key would silently lose kvCache continuity)
///   1. model rsclaw-flash-*                            → /fastshot
///   2. model rsclaw-vision-*                           → /vision
///   3. model rsclaw-agent-* + session_key=None         → /oneshot
///   4. model rsclaw-agent-* + session_key=Some         → /sessions
///   5. non-canonical model + endpoint=Flash            → /fastshot (server
///      may 400)
///   6. non-canonical model + endpoint=Vision           → /vision (server
///      may 400)
///   7. other endpoint + session_key=Some               → /sessions
///   8. other endpoint + session_key=None               → /oneshot
///
/// Outcomes 4 and 7 additionally require kv_cache_mode=2 and return an
/// error otherwise. Unlike check 0, that check runs only once the request
/// is known to be headed for `/sessions`; a flash/vision request that
/// carries a session key with another cache mode is still routed by rules
/// 1–2 / 5–6.
///
/// The prefixes end in a dash so that look-alike names such as
/// `rsclaw-flashy` do not match.
fn dispatch_decision(req: &LlmRequest) -> Result<DispatchRoute> {
    let model = req.model.strip_prefix("rsclaw/").unwrap_or(&req.model);
    let has_session = req.session_key.is_some();

    // Check 0 — must precede every rule, otherwise rule 8 would quietly
    // downgrade a stateful request to a stateless one.
    if req.kv_cache_mode == 2 && !has_session {
        anyhow::bail!(
            "rsclaw kv_cache_mode=2 requires session_key (got None); \
             set session_key=Some(...) for stateful traffic or \
             kv_cache_mode=0 + session_key=None for /oneshot"
        );
    }

    // Rules 1–2: canonical flash / vision names win unconditionally.
    if model.starts_with("rsclaw-flash-") {
        return Ok(DispatchRoute::OneShot("/fastshot"));
    }
    if model.starts_with("rsclaw-vision-") {
        return Ok(DispatchRoute::OneShot("/vision"));
    }

    if model.starts_with("rsclaw-agent-") {
        // Rule 3.
        if !has_session {
            return Ok(DispatchRoute::OneShot("/oneshot"));
        }
        // The canonical agent model overrides a Flash/Vision endpoint hint
        // by design; say so, because "I asked for Flash but it went to
        // /sessions" is otherwise hard to diagnose across a large fleet.
        if !matches!(req.endpoint, AgentEndpoint::Primary) {
            tracing::warn!(
                model = %req.model,
                endpoint = ?req.endpoint,
                "rsclaw dispatch: agent-* model overrides endpoint hint; routing to /sessions"
            );
        }
    } else {
        // Rules 5–6: a non-canonical model follows the endpoint hint.
        match req.endpoint {
            AgentEndpoint::Flash => return Ok(DispatchRoute::OneShot("/fastshot")),
            AgentEndpoint::Vision => return Ok(DispatchRoute::OneShot("/vision")),
            _ => {}
        }
        // Rule 8.
        if !has_session {
            return Ok(DispatchRoute::OneShot("/oneshot"));
        }
    }

    // Rules 4 / 7: only the stateful path is left, and it needs mode 2.
    if req.kv_cache_mode != 2 {
        anyhow::bail!(
            "rsclaw session-mode call requires kv_cache_mode=2 (got {}); \
             pass session_key=None to route to /oneshot instead",
            req.kv_cache_mode
        );
    }
    Ok(DispatchRoute::Sessions)
}

impl LlmProvider for RsclawProvider {
    /// Provider identifier reported through [`LlmProvider::name`];
    /// always the literal `"rsclaw"`.
    fn name(&self) -> &str {
        "rsclaw"
    }

    /// Stream one LLM turn through rsclaw-server.
    ///
    /// [`dispatch_decision`] picks the route. One-shot routes are handed
    /// to `stream_oneshot`; the stateful `/sessions` route:
    ///
    /// 1. folds trailing `Role::System` messages back into the preceding
    ///    delta (`normalize_trailing_system`),
    /// 2. splits the request and builds the per-turn delta,
    /// 3. reuses the cached `SessionEntry` or hydrates a fresh session via
    ///    `open()` (no history) / `replay()` (history present),
    /// 4. sends the delta with `turn()`, recovering once through
    ///    `/sessions/replay` when `turn()` reports `SessionNotFound`,
    /// 5. wraps the response stream so the first error evicts the cached
    ///    entry (see [`invalidate_on_error`]).
    ///
    /// # Errors
    ///
    /// Propagates dispatch, split, delta-construction, open/replay and
    /// turn failures. Any failure after the session entry has been
    /// resolved forgets the cached entry so the next call re-anchors the
    /// server via a full replay.
    fn stream(&self, mut req: LlmRequest) -> BoxFuture<'_, Result<LlmStream>> {
        Box::pin(async move {
            // Single source of truth for dispatch routing — see
            // [`dispatch_decision`] for the full precedence table.
            match dispatch_decision(&req)? {
                DispatchRoute::OneShot(path) => return self.stream_oneshot(req, path).await,
                DispatchRoute::Sessions => { /* fall through to stateful path */ }
            }
            let session_key = req
                .session_key
                .clone()
                .context("rsclaw kv_cache_mode=2 requires session_key on the request")?;

            // The runtime appends `Role::System` messages AFTER the
            // User/Tool delta on the first iteration of any turn that
            // has dynamic /ctx or just-installed-skill blocks (see
            // agent/runtime.rs ~4068-4082). Without this, `from_request`
            // sees `Role::System` as the last message and aborts the
            // entire turn. Fold trailing System text back into the
            // preceding User delta so the model still gets the context.
            normalize_trailing_system(&mut req.messages);

            let split = split_request(&req, &self.prefix_id, self.constrain_tool_calls)?;

            // Build the per-turn delta BEFORE touching the session cache
            // or the server. `from_request` only looks at `req`, so a
            // request it rejects must fail here. Failing after the
            // lookup/hydrate below would leave a cached entry whose
            // `last_seen_msgs_len` (set to `req.messages.len()`) already
            // counts a delta that was never sent — the same silent
            // divergence the forget-on-Err handling around `turn()`
            // guards against — and, on a cache miss, would also pay for
            // an open()/replay() whose session is never used.
            let delta = TurnDelta::from_request(&req)?;

            // Lookup or hydrate. Cache miss / mutation happens on first
            // call, version drift, after a prior replay failure, or
            // after the runtime trimmed history (compaction, repair,
            // reset) — all cases where `req.messages` may not match
            // what the server has hydrated. open() can't hydrate, so
            // when history exists we go straight to replay; an empty
            // history list takes the cheaper open() path.
            let entry =
                match self.lookup_and_bump(&session_key, &split.prefix_id, req.messages.len()) {
                    Some(e) => e,
                    None => {
                        self.forget(&session_key);
                        let history = history_for_replay(&req.messages);
                        let resp = if history.is_empty() {
                            self.open(&split).await?
                        } else {
                            self.replay(&split, history).await?
                        };
                        let entry = SessionEntry {
                            session_id: resp.session_id.clone(),
                            // Cache key MUST be the request value, not the
                            // upstream canonical. open()'s response echoes
                            // the resolved prefix_id (per §2.1.6), which can
                            // differ from the requested alias — e.g. request
                            // `rsclaw/latest`, response `rsclaw/2026.5.15`.
                            // `lookup_and_bump` compares the cached value
                            // against the next call's `split.prefix_id` (also
                            // the alias), so caching the canonical
                            // guarantees a miss on every subsequent call:
                            // re-hydrate every turn, defeating kvCacheMode=2
                            // entirely. Replay's response per §2.2 omits the
                            // field, which happened to make recovery-path
                            // entries self-consistent — but open()-path
                            // entries were always broken. Version drift is
                            // detected server-side via 409 (handled by
                            // is_session_evicted), so we don't need the
                            // canonical here for freshness.
                            prefix_id: split.prefix_id.clone(),
                            last_seen_msgs_len: req.messages.len(),
                        };
                        self.store(&session_key, entry.clone());
                        entry
                    }
                };

            // Optional debug dump — when RSCLAW_DUMP_TURN env is set we
            // write the full request shape (LlmRequest + rsclaw turn body
            // + rsclaw replay body + equivalent OpenAI chat-completions
            // body) to `<base_dir>/debug/turn-<ms>-<sess>.json`. Lets
            // operators replay the SAME turn against different worker
            // endpoints (rsclaw `/sessions/<id>/turn`, `/sessions/replay`,
            // vanilla `/v1/chat/completions`) to bisect protocol-vs-model
            // truncation behavior. No-op when the env var is unset, so
            // production stays untouched.
            if std::env::var("RSCLAW_DUMP_TURN").is_ok() {
                dump_turn_for_debug(&session_key, &entry, &split, &delta, &req);
            }

            // Forget the cached session entry on any non-recoverable
            // turn failure. Without this, an Err here leaves the
            // SessionEntry in cache with a `last_seen_msgs_len` that
            // already counts the delta we tried to send. Failover
            // routes the user-facing turn through another provider,
            // the runtime stores that provider's assistant in session,
            // and on the next rsclaw call `lookup_and_bump` happily
            // returns the stale entry — `turn()` then sends only the
            // *next* delta, so the assistant generated by the fallback
            // never reaches rsclaw-server. Server-side history
            // diverges silently from the runtime's mental model;
            // subsequent generations base their reasoning on a partial
            // log. Forgetting forces a full /sessions/replay on the
            // next rsclaw call, which re-anchors server state to the
            // runtime's complete history.
            let resp = match self.turn(&entry.session_id, &delta, &req).await {
                Ok(o) => o,
                Err(e) => {
                    self.forget(&session_key);
                    return Err(e);
                }
            };
            let resp = match resp {
                TurnOutcome::Stream(s) => s,
                TurnOutcome::SessionNotFound => {
                    // Recover via /sessions/replay then retry the turn.
                    // History excludes the trailing delta — turn() below
                    // re-sends it. Including it in replay would hydrate
                    // the same message twice (once batched, once as the
                    // turn input) and confuse the model.
                    self.forget(&session_key);
                    let replay_history = history_for_replay(&req.messages);
                    let replayed = self.replay(&split, replay_history).await?;
                    let entry = SessionEntry {
                        session_id: replayed.session_id.clone(),
                        // Same rationale as the open/replay path above:
                        // cache key is the request alias, not the
                        // upstream canonical. (Replay's response per
                        // §2.2 doesn't even include prefix_id, so this
                        // site happened to be self-consistent before —
                        // but normalising both sites on
                        // `split.prefix_id` keeps the cache-key contract
                        // single-sourced.)
                        prefix_id: split.prefix_id.clone(),
                        last_seen_msgs_len: req.messages.len(),
                    };
                    self.store(&session_key, entry.clone());
                    // Same forget-on-Err treatment as the primary
                    // turn() path above — recovery doesn't grant
                    // immunity from divergence.
                    match self.turn(&entry.session_id, &delta, &req).await {
                        Ok(TurnOutcome::Stream(s)) => s,
                        Ok(TurnOutcome::SessionNotFound) => {
                            self.forget(&session_key);
                            anyhow::bail!(
                                "rsclaw: session vanished immediately after replay (id={})",
                                entry.session_id
                            );
                        }
                        Err(e) => {
                            self.forget(&session_key);
                            return Err(e);
                        }
                    }
                }
            };
            // Wrap the stream so that the FIRST transport error or
            // explicit `StreamEvent::Error` evicts the session entry.
            // If the stream tears down mid-turn, the runtime sees the
            // error and aborts the iteration — but the rsclaw provider
            // would otherwise keep the session cached, and rsclaw-
            // server's view of that turn could be partially-committed
            // or rolled back depending on where the failure landed.
            // Forcing a fresh replay on the next call re-anchors both
            // sides to the runtime's confirmed history.
            Ok(invalidate_on_error(
                resp,
                Arc::clone(&self.sessions),
                session_key,
            ))
        })
    }

    /// Resolve `session_key` → wire `session_id` via the cached
    /// `SessionEntry`, then delegate to the inner `compact_splice_inner`
    /// helper. On HTTP success, update the cached entry's
    /// `last_seen_msgs_len` to the post-splice value so subsequent
    /// `lookup_and_bump` calls don't (incorrectly) detect the message
    /// drop as "history was trimmed under us" and force a replay.
    ///
    /// Per the 2026-05-16 decision (Listen-first), the cached
    /// `last_seen_msgs_len` is updated from the gateway-local
    /// computation (`keep_head_messages + 1 + keep_tail_messages`, with
    /// the tail as grown by any 409 realignment) NOT from
    /// `resp.msgs_count`. The server-reported `msgs_count` is returned to
    /// the caller for cross-check / telemetry only.
    ///
    /// The cache update is applied only when the entry under
    /// `session_key` still carries the `session_id` that was spliced.
    /// The session lock is released across the network call, so the
    /// entry may have been evicted and re-hydrated with a different
    /// session in the meantime; writing the post-splice count onto that
    /// entry would describe a session that was never spliced and would
    /// suppress the replay that re-anchors it.
    ///
    /// # Errors
    ///
    /// Returns `Err` when no `SessionEntry` exists for `session_key`
    /// (no point splicing what we don't think is open), when the wire
    /// call fails, or when a 409 count mismatch cannot be realigned
    /// within the bounded retry budget — the caller falls back to the
    /// replay path, which re-opens the session anyway.
    fn compact_splice<'a>(
        &'a self,
        session_key: &'a str,
        keep_head_messages: usize,
        summary: &'a str,
        keep_tail_messages: usize,
        expected_msgs_count: Option<usize>,
    ) -> BoxFuture<'a, Result<usize>> {
        Box::pin(async move {
            // Snapshot the session_id under the lock, drop the lock
            // before the network call. Holding the mutex across an
            // await would block every other turn on this provider for
            // the duration of the splice.
            let session_id = {
                let map = self.lock_sessions();
                map.get(session_key)
                    .map(|e| e.session_id.clone())
                    .ok_or_else(|| {
                        anyhow::anyhow!(
                            "rsclaw compact splice: no cached session for key {session_key} — \
                             falling back to replay"
                        )
                    })?
            };

            // Bounded optimistic-concurrency retry (protocol §6.3.1).
            // A non-atomic turn (server appends the user slot before
            // dispatch and does not roll it back on failure) or genuine
            // concurrency can leave the server's slot count ahead of
            // ours. On 409 the server returns its authoritative `current`;
            // the extra slots are the most recent, so we keep them
            // verbatim by growing the tail. The dropped middle is
            // unchanged, so the already-computed `summary` stays valid —
            // no re-summarize needed. Falls back to replay (bail) only on
            // the genuinely unrecoverable shapes.
            const MAX_SPLICE_ATTEMPTS: usize = 3;
            let mut expected = expected_msgs_count;
            let mut keep_tail = keep_tail_messages;
            let mut attempt = 0usize;
            let resp = loop {
                attempt += 1;
                match self
                    .compact_splice_inner(
                        &session_id,
                        keep_head_messages,
                        summary,
                        keep_tail,
                        expected,
                    )
                    .await?
                {
                    SpliceOutcome::Done(r) => break r,
                    SpliceOutcome::CountMismatch { current } => {
                        let prev = expected.unwrap_or(current);
                        if current < prev {
                            // Local history is AHEAD of the server — should
                            // not happen with correct slot counting. Don't
                            // shrink the tail (would drop messages the
                            // summary doesn't cover); fall back to replay.
                            anyhow::bail!(
                                "rsclaw compact splice: server count {current} < local \
                                 {prev}; falling back to replay rather than dropping \
                                 unsummarized messages"
                            );
                        }
                        keep_tail += current - prev;
                        expected = Some(current);
                        if keep_head_messages + keep_tail >= current {
                            anyhow::bail!(
                                "rsclaw compact splice: realigned keep ranges (head \
                                 {keep_head_messages} + tail {keep_tail}) leave no middle \
                                 to drop against server count {current}; falling back to \
                                 replay"
                            );
                        }
                        if attempt >= MAX_SPLICE_ATTEMPTS {
                            anyhow::bail!(
                                "rsclaw compact splice: still 409 msg_count_mismatch \
                                 after {attempt} attempts (count drifting under \
                                 concurrent load); falling back to replay"
                            );
                        }
                        tracing::info!(
                            session_key,
                            current,
                            new_keep_tail = keep_tail,
                            attempt,
                            "rsclaw compact splice: 409 msg_count_mismatch — realigned \
                             expected to server count and retrying"
                        );
                    }
                }
            };

            // Optimistically update last_seen_msgs_len with the
            // gateway-local computation (head + summary(1) + tail).
            // `keep_tail` may have grown across 409 retries, so recompute
            // from the final value rather than the original argument.
            // The server's resp.msgs_count is also tracked but kept as
            // a sanity cross-check at log level.
            let local_msgs_count = keep_head_messages + 1 + keep_tail;
            if resp.msgs_count != local_msgs_count {
                tracing::warn!(
                    session_key,
                    server_count = resp.msgs_count,
                    local_count = local_msgs_count,
                    "rsclaw compact splice: server msgs_count diverges from gateway computation"
                );
            }
            // Emit the server's authoritative post-splice counters here
            // (rather than at the call site in `compact_inner`) because
            // tokens_count is exposed only by the wire response — the
            // trait surface itself returns msgs_count alone. Telemetry
            // tools that need to track KV slot tokens over time should
            // scrape this log.
            tracing::info!(
                session_key,
                msgs_count = resp.msgs_count,
                tokens_count = resp.tokens_count,
                "rsclaw compact splice: server-side splice complete"
            );
            // Re-take the lock and record the post-splice length, but only
            // on the entry we actually spliced. If the key now maps to a
            // different session (evicted + re-hydrated while the lock was
            // released), leave its bookkeeping alone so the next
            // `lookup_and_bump` sees the shrunken history and replays.
            let session_replaced = {
                let mut map = self.lock_sessions();
                match map.get_mut(session_key) {
                    Some(entry) => {
                        if entry.session_id == session_id {
                            entry.last_seen_msgs_len = local_msgs_count;
                            false
                        } else {
                            true
                        }
                    }
                    None => false,
                }
            };
            if session_replaced {
                tracing::warn!(
                    session_key,
                    "rsclaw compact splice: cached session changed during splice; \
                     leaving last_seen_msgs_len untouched"
                );
            }
            Ok(resp.msgs_count)
        })
    }
}

/// Wrap an `LlmStream` so the first error item evicts `session_key`
/// from the shared session cache. `errored` flips on the first
/// error to make the eviction idempotent — multiple `Err` items in
/// the same stream don't try to re-acquire the lock unnecessarily.
fn invalidate_on_error(
    inner: LlmStream,
    sessions: Arc<Mutex<HashMap<String, SessionEntry>>>,
    session_key: String,
) -> LlmStream {
    use futures::StreamExt;
    let mut errored = false;
    Box::pin(inner.inspect(move |item| {
        if errored {
            return;
        }
        let invalidate = match item {
            Err(_) => true,
            Ok(StreamEvent::Error(_)) => true,
            _ => false,
        };
        if !invalidate {
            return;
        }
        errored = true;
        // Best-effort lock; if poisoned, the parent provider's
        // `lock_sessions` already logged the original poison —
        // don't compound the noise here.
        match sessions.lock() {
            Ok(mut map) => {
                map.remove(&session_key);
            }
            Err(p) => {
                p.into_inner().remove(&session_key);
            }
        }
    }))
}

// ---------------------------------------------------------------------------
// Protocol operations: open / turn / replay (internal)
// ---------------------------------------------------------------------------

impl RsclawProvider {
    /// POST `body` to the protocol endpoint `path` (e.g. `/sessions`).
    ///
    /// The final URL is `{base_url}/agent{path}`: `base_url` is the `/v1`
    /// root (see `RSCLAW_DEFAULT_BASE`) and the incremental-protocol
    /// paths live under its `/agent` mount, while `/models` and other
    /// OpenAI-compat resources are addressed from the bare root
    /// elsewhere.
    ///
    /// Redirect handling is delegated to the shared
    /// [`rsclaw_embed::FleetHttp`] — the same redirect cache the OCR /
    /// embed / rerank lanes use, so the LB hop amortises fleet-wide:
    ///
    /// - 308: the target origin is cached (per `Cache-Control: max-age`,
    ///   or a short default) so later calls go DIRECT until the TTL
    ///   expires.
    /// - 307: followed without caching (temporary by spec).
    /// - Any other status, errors included, is returned as-is for the
    ///   caller to interpret (e.g. `turn()` reads 404/409/503 as
    ///   session-evicted and triggers replay).
    ///
    /// Parameters:
    /// - `builder_timeout`: reqwest's per-hop deadline (NOT cumulative).
    ///   `None` is used by streaming `turn()`, which bounds the headers
    ///   phase externally with `tokio::time::timeout`.
    /// - `idempotency_key`: sent as `Idempotency-Key` on every hop, for
    ///   safe-to-dedupe requests like `/sessions/replay`.
    /// - `accept_sse`: adds `Accept: text/event-stream` to opt into the
    ///   server's keep-alive long-poll variant.
    ///
    /// The bearer comes from `self.bearer`; FleetHttp drops it when
    /// empty, so an `RSCLAW_KEY=""` never emits a bare
    /// `Authorization: Bearer `.
    async fn send_following_redirects<B: Serialize>(
        &self,
        path: &str,
        body: &B,
        builder_timeout: Option<Duration>,
        idempotency_key: Option<&str>,
        accept_sse: bool,
    ) -> Result<reqwest::Response> {
        let bearer = self.bearer.as_deref();
        // `/agent` is prepended here so `base_url` can stay at the `/v1` root.
        let url = format!("{base}/agent{path}", base = self.base_url);
        self.fleet
            .post_following_redirects(
                &url,
                body,
                bearer,
                accept_sse,
                idempotency_key,
                builder_timeout,
            )
            .await
    }

    /// Create a fresh session via `POST …/sessions` from the prefix
    /// fields, model and options carried by `split`.
    ///
    /// # Errors
    ///
    /// Fails when the request cannot be sent, when the server answers
    /// with a non-2xx status (the response body is included in the
    /// message), or when the success body does not parse as
    /// `CreateSessionResp`.
    async fn open(&self, split: &SplitRequest<'_>) -> Result<CreateSessionResp> {
        let wire_prefix = DynamicPrefixWire {
            system: split.dynamic_system,
            tools: &split.dynamic_tools,
            user_tools: &split.dynamic_user_tools,
            user_system: split.dynamic_user_system,
        };
        let (prefix_id, dynamic_prefix, user_tools) = prefix_fields(&split.prefix_id, wire_prefix);
        let request = CreateSessionReq {
            prefix_id,
            model: &split.model,
            dynamic_prefix,
            user_tools,
            options: Some(split.options.clone()),
        };

        // 180s caps the worst-case prefix-decode time for a fresh
        // session. An explicit deadline is required: without one reqwest
        // waits forever on a stalled server, because the 20s
        // connect_timeout only covers TCP establishment, not the response
        // wait. It is applied per hop, so a redirected open still gets
        // the full budget against its final target.
        let per_hop_deadline = Duration::from_secs(180);
        let resp = self
            .send_following_redirects("/sessions", &request, Some(per_hop_deadline), None, false)
            .await?;

        let status = resp.status();
        if status.is_success() {
            return resp
                .json::<CreateSessionResp>()
                .await
                .context("rsclaw open: parse response");
        }
        let body = resp.text().await.unwrap_or_default();
        anyhow::bail!("rsclaw open session failed {status}: {body}");
    }

    /// Hydrate a new session from `messages` via `POST …/sessions/replay`.
    ///
    /// `Role::System` entries in `messages` are pulled out of the history
    /// and appended to `dynamic_prefix.user_system`; the remaining
    /// messages are serialized as the replay history.
    ///
    /// The request opts into the server's SSE keep-alive variant. A
    /// server that answers plain JSON instead is handled with a 300s
    /// body deadline that covers both the success body and the error
    /// body.
    ///
    /// # Errors
    ///
    /// Fails when no response headers arrive within 60s, when a plain
    /// JSON body is not read within 300s, when the SSE wait exceeds
    /// 30 minutes, when the server reports a failure (message shape
    /// `rsclaw replay failed <status>: <body>` on both transports), or
    /// when the result payload does not parse as `CreateSessionResp`.
    async fn replay(
        &self,
        split: &SplitRequest<'_>,
        messages: &[Message],
    ) -> Result<CreateSessionResp> {
        // Deadline for reading a plain-JSON (pre-SSE server) body.
        const PLAIN_BODY_DEADLINE: Duration = Duration::from_secs(300);

        // Protocol §2.2 history accepts only `role: "user"` and
        // `role: "assistant"`. The runtime, however, threads
        // `Role::System` messages into the conversation list for
        // plugins/skills prefixes, just-installed skills, and
        // dynamic /ctx blocks (see agent/runtime.rs ~4054). Sending
        // those through as-is would trigger `400 invalid_history`
        // and tank every replay. Pull them out and fold their text
        // into `dynamic_prefix.user_system` — the only protocol slot
        // that accepts non-conversational system content — so the
        // content still reaches the server.
        let (filtered, extra_suffix) = split_system_messages(messages);
        let merged_user_system: Option<String> = if extra_suffix.is_empty() {
            None
        } else if split.dynamic_user_system.is_empty() {
            Some(extra_suffix)
        } else {
            Some(format!("{}\n\n{}", split.dynamic_user_system, extra_suffix))
        };
        let user_system: &str = match merged_user_system.as_deref() {
            Some(merged) => merged,
            None => split.dynamic_user_system,
        };
        let history: Vec<Value> = serialize_replay_history(&filtered);
        let (prefix_id, dynamic_prefix, top_level_user_tools) = prefix_fields(
            &split.prefix_id,
            DynamicPrefixWire {
                system: split.dynamic_system,
                tools: &split.dynamic_tools,
                user_tools: &split.dynamic_user_tools,
                user_system,
            },
        );
        let body = ReplayReq {
            prefix_id,
            model: &split.model,
            dynamic_prefix,
            user_tools: top_level_user_tools,
            history,
            options: Some(split.options.clone()),
        };
        // Replay re-decodes prefix + full history and may sit minutes
        // in the worker's warm→hot hydrate, so we opt in to the
        // server's SSE keep-alive variant (Accept: text/event-stream):
        // comment heartbeats every 3s, then exactly one terminal
        // `result` / `error` event. Liveness is enforced by a 45s
        // read-idle timeout (15 missed heartbeats) plus a 30min total
        // cap, instead of one big request deadline that a slow-but-
        // healthy hydrate would trip. Old servers ignore the Accept
        // header and answer plain JSON — that path keeps the previous
        // 300s body deadline.
        //
        // No builder timeout: reqwest's `timeout()` covers the whole
        // body phase, which for SSE is exactly the long wait we're
        // keeping alive. The headers phase is guarded externally below.
        //
        // Idempotency-Key: per-call UUID so a transport retry (transient
        // timeout, 503) that may have actually reached the server is
        // safely deduped server-side instead of creating two sessions
        // for the same gateway turn. Server caches the response by key
        // for 5 minutes (rsclaw-server agent_proxy::IDEMPOTENCY_TTL).
        let idem_key = uuid::Uuid::new_v4().to_string();
        let resp = tokio::time::timeout(
            Duration::from_secs(60),
            self.send_following_redirects("/sessions/replay", &body, None, Some(&idem_key), true),
        )
        .await
        .map_err(|_| anyhow::anyhow!("rsclaw replay: no response headers within 60s"))??;
        let status = resp.status();
        let is_sse = resp
            .headers()
            .get(reqwest::header::CONTENT_TYPE)
            .and_then(|v| v.to_str().ok())
            .map(|v| v.contains("text/event-stream"))
            .unwrap_or(false);

        if !is_sse {
            // Pre-SSE server: plain JSON. The request carries no builder
            // timeout, so every body read on this path — error bodies
            // included — needs its own deadline or a stalled server
            // could hang the call forever.
            if !status.is_success() {
                // Best-effort detail: a timed-out or failed read yields
                // an empty body, and the status alone is reported.
                let body = tokio::time::timeout(PLAIN_BODY_DEADLINE, resp.text())
                    .await
                    .ok()
                    .and_then(|read| read.ok())
                    .unwrap_or_default();
                anyhow::bail!("rsclaw replay failed {status}: {body}");
            }
            return tokio::time::timeout(PLAIN_BODY_DEADLINE, resp.json::<CreateSessionResp>())
                .await
                .map_err(|_| anyhow::anyhow!("rsclaw replay: body read timed out after 300s"))?
                .context("rsclaw replay: parse response");
        }

        let (event, data) =
            tokio::time::timeout(Duration::from_secs(30 * 60), read_sse_terminal_event(resp))
                .await
                .map_err(|_| {
                    anyhow::anyhow!("rsclaw replay: SSE total deadline (30min) exceeded")
                })??;

        if event == "result" {
            return serde_json::from_str::<CreateSessionResp>(&data)
                .context("rsclaw replay: parse SSE result event");
        }
        // `error` event: data = {"status": <code>, "body": <upstream error>}.
        // Keep the bail message in the same shape as the plain-JSON error
        // path so failover classification treats both identically.
        let parsed: Value = serde_json::from_str(&data).unwrap_or(Value::Null);
        let code = parsed.get("status").and_then(Value::as_u64).unwrap_or(0);
        let detail = parsed.get("body").map(Value::to_string).unwrap_or(data);
        anyhow::bail!("rsclaw replay failed {code}: {detail}");
    }

    /// Wire-level in-place compact splice (protocol §2.4).
    ///
    /// Sends `POST /v1/agent/sessions/<session_id>/compact`, asking the
    /// server to drop the KV pages for the middle of the conversation,
    /// prefill `summary` in their place, and leave head/tail KV
    /// unchanged. The session's KV slot — and therefore its
    /// `session_id` — survives.
    ///
    /// Outcomes:
    /// - 2xx → `SpliceOutcome::Done` with the parsed `CompactSpliceResp`.
    /// - 409 → `SpliceOutcome::CountMismatch { current }` carrying the
    ///   server's authoritative slot count. This is optimistic
    ///   concurrency (protocol §6.3.1), not a hard failure, so it is
    ///   surfaced as a typed outcome for the wrapper's retry loop.
    /// - Any other status, a transport error, or an unparseable body →
    ///   `Err`.
    ///
    /// Caller responsibilities:
    /// - Provide `session_id` from the cached `SessionEntry`. Server
    ///   returns 410 if the slot has been evicted.
    /// - Choose `keep_head_messages` consistently across the lifetime of
    ///   a session (typically 2 = first user/assistant pair carrying
    ///   `[Session started: ...]`). Changing it mid-session breaks the
    ///   head-byte-stability invariant and forces a head re-prefill on
    ///   the server.
    /// - Provide a self-contained `summary` (no `[Session started:]` —
    ///   that's preserved in head — but a fresh `[CONTEXT COMPACTION
    ///   compacted at <ISO ts>]` header is the convention so the model
    ///   has a "recent-vs-summarized" temporal anchor; nothing here
    ///   enforces that format).
    /// - On any `Err`, callers MUST fall back to `/sessions/replay` —
    ///   `compact_inner` does this unconditionally (per user 2026-05-16
    ///   decision). 409 / 410 / 422 are the documented fallback codes
    ///   but the contract is "any non-2xx + any transport error →
    ///   replay".
    ///
    /// Timeout: `self.compact_timeout`, passed as the per-hop request
    /// deadline.
    ///
    /// The `_inner` suffix disambiguates this from the
    /// `LlmProvider::compact_splice` trait method, which sits one layer
    /// above and resolves `session_key` → `session_id` before delegating
    /// here. The trait method is the public API; this is the wire-level
    /// implementation.
    async fn compact_splice_inner(
        &self,
        session_id: &str,
        keep_head_messages: usize,
        summary: &str,
        keep_tail_messages: usize,
        expected_msgs_count: Option<usize>,
    ) -> Result<SpliceOutcome> {
        let body = CompactSpliceReq {
            keep_head_messages,
            summary,
            keep_tail_messages,
            expected_msgs_count,
        };
        let path = format!("/sessions/{session_id}/compact");
        let resp = self
            .send_following_redirects(&path, &body, Some(self.compact_timeout), None, false)
            .await?;

        let status = resp.status();
        // 409 carries the server's current slot count; hand it back as a
        // typed outcome rather than bailing so the caller can realign
        // and retry without a full replay.
        if status.as_u16() == 409 {
            let text = resp.text().await.unwrap_or_default();
            let parsed: CompactSplice409 = serde_json::from_str(&text)
                .with_context(|| format!("rsclaw compact: parse 409 body: {text}"))?;
            let current = parsed.error.current;
            return Ok(SpliceOutcome::CountMismatch { current });
        }
        if status.is_success() {
            let done = resp
                .json::<CompactSpliceResp>()
                .await
                .context("rsclaw compact: parse response")?;
            return Ok(SpliceOutcome::Done(done));
        }
        let body = resp.text().await.unwrap_or_default();
        anyhow::bail!("rsclaw compact splice failed {status}: {body}");
    }

    /// Dispatch a one-shot stateless request to `/fastshot`, `/vision`,
    /// or `/oneshot`. Route selection is made by the unified dispatcher
    /// in `stream()`; this method just sends the bytes.
    ///
    /// Wire shape (shared by all three paths; `/vision` additionally
    /// carries the `images: [...]` array and sends `"stream": false`;
    /// see protocol spec `docs/adr` notes 2026-05-15):
    ///
    /// ```text
    /// POST {base_url}/agent/{fastshot|vision|oneshot}
    /// {
    ///   "prompt": "...",
    ///   "max_tokens": N,                 // only when req.max_tokens is set
    ///   "options": {                     // only when at least one key is set
    ///     "temperature": 0.7,
    ///     "enable_thinking": false
    ///   },
    ///   "stream": true,                  // false for /vision
    ///   "model": "rsclaw-flash-v1"       // canonical id stamped per path
    /// }
    /// ```
    ///
    /// Response handling depends on the lane:
    /// - `/fastshot` and `/oneshot` stream fastshot-native SSE frames,
    ///   parsed locally by `parse_oneshot_sse_chunk` and guarded by a
    ///   per-chunk read-idle timeout.
    /// - `/vision` returns a single JSON body, which is wrapped as a
    ///   `TextDelta` followed by `Done` so callers still see an `LlmStream`.
    ///
    /// # Errors
    ///
    /// Fails when the flattened prompt is blank, when a `/vision` request
    /// carries no image, when response headers do not arrive within
    /// `TURN_HEADERS_TIMEOUT`, when the status is not a success, or when
    /// the non-streamed body cannot be decoded as JSON.
    async fn stream_oneshot(&self, req: LlmRequest, path: &'static str) -> Result<LlmStream> {
        use futures::StreamExt;

        let prompt = flatten_prompt_for_oneshot(&req);
        if prompt.trim().is_empty() {
            anyhow::bail!("rsclaw {path}: empty prompt after flattening req.messages");
        }

        let mut body = serde_json::Map::new();
        body.insert("prompt".to_owned(), Value::String(prompt));
        // /vision wants a single complete result — the caller (e.g. vlm_parse)
        // accumulates ALL deltas before using the text, so streaming buys
        // nothing and pays the cluster's per-SSE-chunk forwarding latency
        // (~250ms/chunk through the relay ⇒ tens of seconds per call, and long
        // replies time out → "error decoding response body"). Request a single
        // non-streamed JSON body for vision; keep SSE for /fastshot and
        // /oneshot where token-by-token delivery still matters.
        let use_stream = path != "/vision";
        body.insert("stream".to_owned(), Value::Bool(use_stream));
        if let Some(mt) = req.max_tokens {
            body.insert("max_tokens".to_owned(), Value::from(mt));
        }
        // Hard-bind the model id to the endpoint per the fleet's
        // model-slot whitelist:
        //   /fastshot → rsclaw-flash-v1
        //   /vision   → rsclaw-vision-v1
        //   /oneshot  → rsclaw-agent-v1
        // The dispatch chain in `route_for` (rules 1–3 + 5–6) already
        // routes requests here based on the caller's model hint or
        // endpoint hint, so by the time we land in this function the
        // path uniquely determines which slot the worker accepts.
        // Forwarding `req.model` verbatim risks
        // `model_slot_mismatch` 400s when the caller mixes
        // (model=anthropic/..., endpoint=Flash) — we already routed to
        // /fastshot, but the wire model field would have been wrong.
        // Stamping the canonical id keeps callers from having to know
        // the exact slot strings.
        let canonical_model = match path {
            "/fastshot" => "rsclaw-flash-v1",
            "/vision" => "rsclaw-vision-v1",
            _ => "rsclaw-agent-v1", // /oneshot and any future stateless variant
        };
        body.insert(
            "model".to_owned(),
            Value::String(canonical_model.to_owned()),
        );
        let mut options = serde_json::Map::new();
        if let Some(t) = req.temperature {
            options.insert("temperature".to_owned(), super::json_f32(t));
        }
        // Forward thinking control (mirrors TurnOptions for /sessions). Without
        // this, /oneshot + /fastshot always run with the reasoning model's
        // thinking ON, wrapping every answer in <think>…</think>. Structured
        // one-shot callers (distill: extraction, lessons, crystallization) pass
        // thinking_budget=Some(0) to get clean JSON directly.
        if let Some(budget) = req.thinking_budget {
            options.insert("enable_thinking".to_owned(), Value::Bool(budget > 0));
        }
        // An empty `options` object is never sent; the key is omitted instead.
        if !options.is_empty() {
            body.insert("options".to_owned(), Value::Object(options));
        }
        if path == "/vision" {
            let images = extract_images_for_oneshot(&req);
            if images.is_empty() {
                anyhow::bail!("rsclaw /vision: request has no image content");
            }
            body.insert(
                "images".to_owned(),
                Value::Array(images.into_iter().map(Value::String).collect()),
            );
        }
        let body = Value::Object(body);

        // Send via the same redirect-cache + 308-aware pipeline that
        // session traffic uses, so a fastshot worker pinned under the
        // LB benefits from the same per-origin caching as the primary
        // pool.
        // No per-request builder timeout: reqwest's `.timeout()` is a
        // TOTAL deadline that includes the streamed SSE body, so the old
        // 60s cap killed long vision/oneshot generations mid-stream —
        // surfacing as a misleading "error decoding response body" and
        // getting amplified 3× by the failover retry layer. Mirror
        // `turn()`: bound only time-to-headers with `tokio::time::timeout`;
        // once headers arrive the body streams as long as the generation
        // needs (TCP liveness is covered by the client-level
        // `tcp_keepalive(30s)`). Without this, /vision /oneshot /fastshot
        // /ocr all 503-by-timeout on any response that runs past 60s.
        let send_fut = self.send_following_redirects(path, &body, None, None, false);
        let resp = match tokio::time::timeout(TURN_HEADERS_TIMEOUT, send_fut).await {
            Ok(r) => r?,
            Err(_) => anyhow::bail!(
                "rsclaw {path}: timed out waiting for response headers after {}s ({}/agent{})",
                TURN_HEADERS_TIMEOUT.as_secs(),
                self.base_url,
                path,
            ),
        };
        let status = resp.status();
        if !status.is_success() {
            let body = resp.text().await.unwrap_or_default();
            anyhow::bail!("rsclaw {path} failed {status}: {body}");
        }

        // Non-streaming lane (vision): a single JSON {content, finish_reason,
        // usage} reply (same shape as the OCR lane — `content`, not `text`).
        // Wrap it as a one-delta stream so the LlmStream interface is
        // unchanged for callers, while skipping the slow per-chunk SSE relay.
        if !use_stream {
            let json: Value = resp
                .json()
                .await
                .map_err(|e| anyhow::anyhow!("rsclaw {path}: decode non-stream body: {e}"))?;
            // Only a string-valued top-level `error` is treated as a failure;
            // any other shape falls through to the content extraction below.
            if let Some(err) = json.get("error").and_then(|v| v.as_str()) {
                return Ok(Box::pin(futures::stream::iter(vec![Ok(StreamEvent::Error(
                    err.to_owned(),
                ))])) as LlmStream);
            }
            // Accept `content` first, then `text`, then the OAI-style
            // `choices[0].message.content`; a body with none of them yields
            // an empty text delta rather than an error.
            let content = json
                .get("content")
                .and_then(|v| v.as_str())
                .or_else(|| json.get("text").and_then(|v| v.as_str()))
                .or_else(|| {
                    json.pointer("/choices/0/message/content")
                        .and_then(|v| v.as_str())
                })
                .unwrap_or_default()
                .to_owned();
            // NOTE(review): the reply's `usage` object is not forwarded —
            // `Done` always carries `usage: None` on this lane.
            let events = vec![
                Ok(StreamEvent::TextDelta(content)),
                Ok(StreamEvent::Done { usage: None }),
            ];
            return Ok(Box::pin(futures::stream::iter(events)) as LlmStream);
        }

        // Streaming lane (every path except /vision): native rsclaw
        // fastshot-style SSE — distinct from the primary sessions
        // endpoint's OAI-style frames. The event types per
        // `docs/fastshot-vision-protocol.md §3`:
        //
        //   data: {"type":"delta","content":"..."}
        //   data: {"type":"done","finish_reason":"stop","usage":{...}}
        //   data: {"type":"error","error":"..."}
        //   data: [DONE]
        //
        // Line buffering + UTF-8 boundary handling mirrors the
        // openai.rs implementation (worker can split frames across
        // TCP segments) but the JSON shape is fastshot-native so we
        // parse it locally.
        // Per-chunk read-idle guard. We deliberately dropped the total
        // `.timeout()` above so long generations can stream past 60s, but
        // that leaves no bound on a worker that goes silent mid-stream (TCP
        // alive, no bytes) — the consumer would hang forever. Mirror the
        // replay path's 45s read-idle timeout: a silent gap surfaces as an
        // error event instead of an indefinite stall.
        let path_owned = path.to_string();
        let byte_stream = tokio_stream::StreamExt::timeout(
            resp.bytes_stream(),
            Duration::from_secs(STREAM_READ_IDLE_SECS),
        )
        .map(move |r| match r {
            Ok(Ok(bytes)) => Ok(bytes),
            Ok(Err(e)) => Err(anyhow::anyhow!("stream read error: {e}")),
            Err(_) => Err(anyhow::anyhow!(
                "rsclaw {path_owned}: SSE idle for {STREAM_READ_IDLE_SECS}s (worker stalled mid-generation)"
            )),
        });
        // Parser state shared across chunks: the partial line carried over
        // from the previous chunk, and any trailing bytes of an incomplete
        // UTF-8 sequence.
        let line_buffer = Arc::new(tokio::sync::Mutex::new(String::new()));
        let utf8_remainder = Arc::new(tokio::sync::Mutex::new(Vec::<u8>::new()));
        let event_stream = byte_stream
            .then(move |chunk| {
                let line_buffer = line_buffer.clone();
                let utf8_remainder = utf8_remainder.clone();
                async move { parse_oneshot_sse_chunk(chunk, &line_buffer, &utf8_remainder).await }
            })
            .flat_map(|events| futures::stream::iter(events));
        Ok(Box::pin(event_stream) as LlmStream)
    }

    /// Send one incremental turn (`POST /sessions/{session_id}/turn`) and
    /// hand back the streamed reply.
    ///
    /// The body is the flattened `delta` plus per-turn options, with
    /// `stream` always `true`. Recall fields are attached only when
    /// `req.recall` is present and its `context` is not blank.
    ///
    /// Outcomes, in the order they are checked for each attempt:
    /// - success status → `TurnOutcome::Stream` over the parsed SSE body;
    /// - `is_session_evicted(status, body)` → `TurnOutcome::SessionNotFound`
    ///   so the caller can replay and retry;
    /// - any other 503 → retried, at most `RETRY_BACKOFFS.len()` times
    ///   (500ms, then 2s);
    /// - everything else → an error carrying the status and response body.
    ///
    /// # Errors
    ///
    /// Fails when response headers do not arrive within
    /// `TURN_HEADERS_TIMEOUT`, when `send_following_redirects` itself
    /// errors, or on a non-success status that is neither an eviction
    /// signal nor a still-retryable 503.
    async fn turn(
        &self,
        session_id: &str,
        delta: &TurnDelta,
        req: &LlmRequest,
    ) -> Result<TurnOutcome> {
        let path = format!("/sessions/{}/turn", session_id);
        // Blank recall context is treated the same as no recall at all.
        let recall = req.recall.as_ref().filter(|r| !r.context.trim().is_empty());
        let body = TurnReq {
            delta,
            recall_context: recall.map(|r| r.context.as_str()),
            recall: recall.map(|r| &r.metadata),
            options: Some(TurnOptions::from_request(req, self.constrain_tool_calls)),
            stream: true,
        };
        // No per-hop builder timeout: reqwest's `.timeout()` is a
        // total deadline that includes the streaming body, so a 180s
        // cap would kill long generations (reasoning models with
        // extended thinking + large outputs routinely run past three
        // minutes). Instead bound only the time-to-response-headers
        // (PLUS any redirect-following hops along the way) with
        // `tokio::time::timeout` around the entire helper call so a
        // wedged server still surfaces fast — once headers arrive,
        // the body stream is allowed to take as long as it needs.
        // Connection liveness during streaming is covered by the
        // client-level `tcp_keepalive(30s)` configured above.
        // Retry policy for `/sessions/<id>/turn`:
        //
        // `POST /turn` is NOT idempotent — its body contains
        // `tool_results` that the server appends to the session log on
        // success. A blind retry on 502/504 risks double-appending if
        // the upstream reached the worker but failed to flush headers.
        // 429 surfaces a real rate-limit signal the caller should see.
        //
        // Per docs/client-server-integration.md §4.4: "Treat 503 on
        // /turn as retryable with backoff" — only 503 means the
        // server-side auto-replay attempted recovery and gave up
        // BEFORE the worker advanced state. We narrow retry to that
        // single status.
        //
        // 503s with a specific `session_diverged` / `version_drift` /
        // `backend_unavailable` (pinned-node) body code go through
        // `is_session_evicted` below and return SessionNotFound so the
        // caller can drive a clean `/sessions/replay` recovery.
        const RETRY_BACKOFFS: [Duration; 2] = [Duration::from_millis(500), Duration::from_secs(2)];
        let mut attempt: usize = 0;
        let resp = loop {
            // The headers timeout applies to each attempt separately.
            let send_fut = self.send_following_redirects(&path, &body, None, None, false);
            let resp = match tokio::time::timeout(TURN_HEADERS_TIMEOUT, send_fut).await {
                Ok(r) => r?,
                Err(_) => anyhow::bail!(
                    "rsclaw turn: timed out waiting for response headers after {}s ({}/agent{})",
                    TURN_HEADERS_TIMEOUT.as_secs(),
                    self.base_url,
                    path,
                ),
            };
            let status = resp.status();
            if status.is_success() {
                break resp;
            }
            let body_text = resp.text().await.unwrap_or_default();
            // 404 session_not_found (slot evicted), 409 version_drift
            // (pinned node upgraded past our rsclaw_version) and 503
            // backend_unavailable (pinned node gone via heartbeat
            // timeout) all share the same recovery path: replay against
            // current rsclaw_version and retry. Other 404s — typically
            // a misrouted request hitting a CDN/proxy 404 page — should
            // bail with the upstream body so operators can see the real
            // error instead of looping forever in replay.
            if is_session_evicted(status, &body_text) {
                return Ok(TurnOutcome::SessionNotFound);
            }
            // Eviction is checked first, so only 503s that are NOT eviction
            // signals reach the backoff-and-retry path.
            if status == StatusCode::SERVICE_UNAVAILABLE && attempt < RETRY_BACKOFFS.len() {
                let delay = RETRY_BACKOFFS[attempt];
                tracing::warn!(
                    status = %status,
                    attempt = attempt + 1,
                    delay_ms = delay.as_millis() as u64,
                    "rsclaw turn: 503 from upstream after server-side auto-replay; retrying"
                );
                tokio::time::sleep(delay).await;
                attempt += 1;
                continue;
            }
            anyhow::bail!("rsclaw turn failed {status}: {body_text}");
        };

        // NOTE(review): unlike `stream_oneshot`, this body stream has no
        // per-chunk read-idle timeout — confirm that is intentional.
        let byte_stream = resp.bytes_stream();
        // Parser state shared across chunks: the partial line carried over
        // from the previous chunk, trailing bytes of an incomplete UTF-8
        // sequence, and the SSE block state.
        let line_buffer = Arc::new(tokio::sync::Mutex::new(String::new()));
        let utf8_remainder = Arc::new(tokio::sync::Mutex::new(Vec::<u8>::new()));
        let block_state = Arc::new(tokio::sync::Mutex::new(SseState::default()));
        let event_stream =
            byte_stream
                .map_err(|e| anyhow::anyhow!("stream read error: {e}"))
                .then(move |chunk| {
                    let line_buffer = line_buffer.clone();
                    let utf8_remainder = utf8_remainder.clone();
                    let block_state = block_state.clone();
                    async move {
                        parse_sse_chunk(chunk, &line_buffer, &utf8_remainder, &block_state).await
                    }
                })
                .flat_map(futures::stream::iter);

        Ok(TurnOutcome::Stream(Box::pin(event_stream)))
    }
}

/// Result of a single `turn()` call that did not fail outright.
enum TurnOutcome {
    /// The server accepted the turn; events are read from this stream.
    Stream(LlmStream),
    /// `is_session_evicted` matched the error response: the session is no
    /// longer usable as-is and the caller is expected to recover via replay.
    SessionNotFound,
}

// ---------------------------------------------------------------------------
// Wire types — mirror rsclaw-protocol.md §2
// ---------------------------------------------------------------------------

/// Wire shape of `dynamic_prefix` per protocol §2.1.2 (v1.9 segment-aware).
///
/// Base `system` + `tools` participate in the worker's content-addressed
/// LRU hash; `user_tools` and `user_system` do NOT — they form a
/// per-session segment layered on top of the base. Render order on the
/// worker side, inside a single Qwen `<tools>` block, is:
/// `shared/system + base tools + user_tools + user_system + turns`.
///
/// Field order is the serialized key order; do not reorder.
#[derive(Debug, Serialize)]
struct DynamicPrefixWire<'a> {
    /// Shared/base system text. Omitted from the wire when empty.
    #[serde(skip_serializing_if = "str::is_empty")]
    system: &'a str,
    /// Shared/base tools — byte-stable across every client of this
    /// RsClaw version. Participates in `base_hash16` so per-client
    /// variance MUST go in `user_tools` instead, or the base cache
    /// fragments to 1-pool-per-client. Always serialized, even when empty.
    tools: &'a [Value],
    /// Per-session private tools: plugins, MCP, workspace-specific. Not
    /// part of `base_hash16`; folded into the user-segment cache key so
    /// changes here only invalidate the segment, not the base prefix.
    /// Skipped from the wire when empty so degenerate sessions match
    /// the v1.7 byte shape that didn't carry the field at all.
    #[serde(skip_serializing_if = "slice_ref_is_empty")]
    user_tools: &'a [Value],
    /// Per-session text that does NOT participate in the worker's
    /// system+tools hash. Maps to the worker's `user_system` KV cache
    /// layer. Server treats this as opaque text to prefill after the
    /// base layer and before session_tail. Omitted from the wire when
    /// empty.
    #[serde(skip_serializing_if = "str::is_empty")]
    user_system: &'a str,
}

/// `skip_serializing_if` predicate for borrowed-slice (`&[T]`) fields.
///
/// Serde calls the predicate with `&FieldType`; for a field declared as
/// `&'a [Value]` that makes the argument a `&&'a [Value]`. That extra
/// layer of reference is why this explicit helper exists instead of
/// naming `<[Value]>::is_empty` inline in the attribute.
///
/// Returns `true` when the referenced slice has no elements.
fn slice_ref_is_empty<T>(s: &&[T]) -> bool {
    let inner: &[T] = s;
    inner.is_empty()
}

/// Request body for opening a session (`POST /sessions`).
///
/// `prefix_id` and `dynamic_prefix` are mutually exclusive on the wire;
/// `prefix_fields` produces the matching triple of
/// `(prefix_id, dynamic_prefix, user_tools)`. Field order is the serialized
/// key order; do not reorder.
#[derive(Debug, Serialize)]
struct CreateSessionReq<'a> {
    /// Protocol §2.1.1 — namespaced `<ns>/<ver>`. Mutually exclusive with
    /// `dynamic_prefix`: a non-empty `prefix_id` forks the session from the
    /// worker's static registry slot, so `dynamic_prefix` is omitted. When
    /// `prefix_id` is empty the worker falls back to the dynamic-LRU keyed
    /// by hash(system+tools) — then `dynamic_prefix` is sent and `prefix_id`
    /// is omitted. See `prefix_fields`.
    #[serde(skip_serializing_if = "Option::is_none")]
    prefix_id: Option<&'a str>,
    /// Bare model id (no `rsclaw/` namespace prefix) — required since
    /// 2026-05 to route the request to the correct model slot on the
    /// worker. The session retains this binding for its lifetime, so
    /// `/turn` and `/replay` traffic against the same session_id never
    /// needs to repeat it. Strip the `rsclaw/` namespace before
    /// sending because the server records the bare id.
    model: &'a str,
    /// Sent only when `prefix_id` is empty (dynamic-LRU mode). Omitted
    /// when `prefix_id` is present (static-registry fork).
    #[serde(skip_serializing_if = "Option::is_none")]
    dynamic_prefix: Option<DynamicPrefixWire<'a>>,
    /// Protocol §2.1.1 registry-path field: per-session private tools
    /// (plugins, MCP, workspace) sent at the TOP level alongside
    /// `prefix_id`. The dynamic path carries the same payload inside
    /// `dynamic_prefix.user_tools` instead — exactly one of the two
    /// positions is populated per request (the other is empty and
    /// skipped from the wire body). Built that way so the worker's
    /// `tool_name_collision` check (§2.1.4) runs over the same
    /// (base, user) pair regardless of which path the client took.
    #[serde(skip_serializing_if = "slice_ref_is_empty")]
    user_tools: &'a [Value],
    /// Generation options to send with the request; the key is omitted
    /// when `None`.
    #[serde(skip_serializing_if = "Option::is_none")]
    options: Option<TurnOptions>,
}

/// Request body for rebuilding a session from history
/// (`/sessions/replay`).
///
/// Carries the same prefix/model/user-tools fields as `CreateSessionReq`
/// plus the `history` to replay. Field order is the serialized key order;
/// do not reorder.
#[derive(Debug, Serialize)]
struct ReplayReq<'a> {
    /// Same mutual-exclusion contract as `CreateSessionReq`: present
    /// only when non-empty, in which case `dynamic_prefix` is omitted.
    #[serde(skip_serializing_if = "Option::is_none")]
    prefix_id: Option<&'a str>,
    /// Same bare model id contract as `CreateSessionReq` — replay
    /// rebuilds the session from scratch, so the model binding must be
    /// declared again. Worker returns
    /// `missing_model` 400 if omitted.
    model: &'a str,
    /// Sent only when `prefix_id` is empty. See `CreateSessionReq`.
    #[serde(skip_serializing_if = "Option::is_none")]
    dynamic_prefix: Option<DynamicPrefixWire<'a>>,
    /// Same registry-path field as `CreateSessionReq.user_tools` — see
    /// the comment there for the position-selection rule.
    #[serde(skip_serializing_if = "slice_ref_is_empty")]
    user_tools: &'a [Value],
    /// Messages to replay, as pre-built JSON values. Always serialized,
    /// even when empty. The per-element shape is defined by the protocol
    /// and is not visible from this struct.
    history: Vec<Value>,
    /// Generation options to send with the request; the key is omitted
    /// when `None`.
    #[serde(skip_serializing_if = "Option::is_none")]
    options: Option<TurnOptions>,
}

/// Route a `(prefix_id, dynamic_prefix)` pair into the protocol's
/// mutually-exclusive wire positions, returning
/// `(prefix_id, dynamic_prefix, top_level_user_tools)`.
///
/// - **Dynamic path** (`prefix_id` empty, §2.1.2): no `prefix_id`, the
///   whole `dynamic` payload is sent, and the top-level `user_tools` slot
///   is an empty slice because those tools already travel inside
///   `dynamic_prefix.user_tools`.
/// - **Registry path** (`prefix_id` non-empty, §2.1.1): `prefix_id` is
///   sent, `dynamic_prefix` is dropped, and `dynamic.user_tools` is lifted
///   to the top level so the worker pairs it with the registered base.
///
/// Either way `dynamic.user_tools` is the single source of truth; this
/// helper only decides where it lands so callers don't repeat the branch.
fn prefix_fields<'a>(
    prefix_id: &'a str,
    dynamic: DynamicPrefixWire<'a>,
) -> (Option<&'a str>, Option<DynamicPrefixWire<'a>>, &'a [Value]) {
    match prefix_id {
        // Dynamic path: the empty top-level slice is dropped from the
        // wire body by `skip_serializing_if`.
        "" => (None, Some(dynamic), &[]),
        // Registry path: the base (system + builtin tools) lives in the
        // worker's static slot, so only the private tools are carried.
        registry_id => (Some(registry_id), None, dynamic.user_tools),
    }
}

/// Wire body for `POST /v1/agent/sessions/<id>/compact` per protocol §2.4.
/// In-place splice: keep first `keep_head_messages` messages' KV unchanged,
/// drop the middle KV pages, prefill `summary` in place, keep the last
/// `keep_tail_messages` messages' KV unchanged. Server returns the same
/// `session_id` (no slot reallocation).
///
/// `expected_msgs_count` is optimistic concurrency — gateway tells server
/// what total `msgs_count` it thinks the session has right now. A mismatch
/// returns `409 msg_count_mismatch` carrying the server's authoritative
/// `current` count; the client surfaces that as
/// `SpliceOutcome::CountMismatch` so the caller can re-align and retry
/// (protocol §6.3.1) rather than treating it as a hard failure.
///
/// The wire excludes `prefix_id` / `dynamic_prefix` because compact targets
/// an already-open session by `session_id`; the slot's existing prefix
/// stays bound to whatever was used at open time.
#[derive(Debug, Serialize)]
struct CompactSpliceReq<'a> {
    /// Number of leading messages whose KV is kept untouched.
    keep_head_messages: usize,
    /// Summary text prefilled in place of the dropped middle section.
    summary: &'a str,
    /// Number of trailing messages whose KV is kept untouched.
    keep_tail_messages: usize,
    /// Optimistic-concurrency guard; the key is omitted when `None`.
    #[serde(skip_serializing_if = "Option::is_none")]
    expected_msgs_count: Option<usize>,
}

/// Response from `POST /sessions/<id>/compact`. `session_id` mirrors the
/// path parameter and exists for sanity-check / log correlation only —
/// the §2.4 spec guarantees it does NOT change across splice. `msgs_count`
/// and `tokens_count` are server's authoritative post-splice counts; the
/// gateway uses them for cross-check against its own local computation
/// (head_count + 1 summary + tail_count, summed tokens of those).
///
/// All three fields are required: a response missing any of them fails
/// to deserialize.
#[derive(Debug, Deserialize, Clone)]
struct CompactSpliceResp {
    /// Echo of the session id from the request path.
    #[allow(dead_code)] // kept for log correlation when paired with session_id-keyed metrics
    session_id: String,
    /// Server-side message count after the splice.
    msgs_count: usize,
    /// Server-side token count after the splice.
    tokens_count: usize,
}

/// Outcome of a single `/compact` POST. Splits the success path from the
/// `409 msg_count_mismatch` optimistic-concurrency signal so the wrapper
/// can retry the latter (protocol §6.3.1) instead of treating it as a
/// hard failure that forces a full replay.
enum SpliceOutcome {
    /// The splice succeeded; carries the server's post-splice counts.
    Done(CompactSpliceResp),
    /// The server rejected `expected_msgs_count` with a 409. `current` is
    /// the count taken from the 409 body's `error.current` field.
    CountMismatch { current: usize },
}

/// Body of a `409 msg_count_mismatch` from `/compact`:
/// `{"error":{"code":"msg_count_mismatch","detail":"expected 20, got
/// 21","current":21}}`. `current` is the server's authoritative post-turn slot
/// count; the client re-aligns `expected_msgs_count` to it and retries.
///
/// This is only the outer envelope; the payload lives under `error`.
#[derive(Debug, Deserialize)]
struct CompactSplice409 {
    /// The nested error object; required for the parse to succeed.
    error: CompactSplice409Error,
}

/// Inner `error` object of a `/compact` 409 body. Only `current` is
/// modeled; other keys in the object (such as `code` and `detail`) are
/// not declared here and are skipped during deserialization.
#[derive(Debug, Deserialize)]
struct CompactSplice409Error {
    /// Slot count reported by the server; required for the parse to succeed.
    current: usize,
}

/// Response body carrying the id of a newly opened (or replayed) session.
#[derive(Debug, Deserialize, Clone)]
struct CreateSessionResp {
    /// Server-assigned session id; required for the parse to succeed.
    session_id: String,
    /// Post-rename canonical id from protocol §2.1.6 (`<namespace>/<id>`).
    ///
    /// Modeled as `Option<String>` rather than `#[serde(default)]
    /// String` so we accept three shapes:
    ///   - field absent              → `None` (replay path)
    ///   - field present, string     → `Some(v)` (open path)
    ///   - field present, JSON null  → `None`
    ///
    /// The null case is the trap: serde's `String` deserializer rejects
    /// `null` outright with `invalid type: null, expected a string` and
    /// the whole response parse dies. Upstream nodes occasionally emit
    /// `"prefix_id": null` mid-roll — accept that gracefully.
    ///
    /// We do NOT use `#[serde(alias = "rsclaw_version")]` here. The
    /// pre-rename `rsclaw_version` field is being dropped server-side;
    /// in the meantime some builds still emit it alongside `prefix_id`
    /// in the same payload. With an `alias` serde would treat the
    /// second occurrence as a duplicate field and bail the whole
    /// response parse with ``duplicate field `prefix_id` ``, which surfaced
    /// to callers as the opaque `rsclaw open: parse response` error
    /// (seen in production e2e against `:8443`). Without the alias,
    /// the legacy `rsclaw_version` field is just an unknown key serde
    /// ignores by default — exactly what we want as the field gets
    /// retired.
    ///
    /// Parsed for forward compat / observability only. NOT used as the
    /// session cache key — see the SessionEntry construction sites for
    /// why caching the upstream canonical breaks alias-based requests.
    #[serde(default)]
    #[allow(dead_code)]
    prefix_id: Option<String>,
}

/// Request body for `POST /sessions/<id>/turn`.
///
/// Field order is the serialized key order; do not reorder.
#[derive(Debug, Serialize)]
struct TurnReq<'a> {
    /// The new input for this turn. Flattened, so the variant's own key
    /// (`user_message` or `tool_results`) appears at the top level of the
    /// body rather than under a `delta` key.
    #[serde(flatten)]
    delta: &'a TurnDelta,
    /// Recall context text; the key is omitted when `None`.
    #[serde(skip_serializing_if = "Option::is_none")]
    recall_context: Option<&'a str>,
    /// Metadata accompanying `recall_context`; the key is omitted when
    /// `None`.
    #[serde(skip_serializing_if = "Option::is_none")]
    recall: Option<&'a RecallMetadata>,
    /// Per-turn generation options; the key is omitted when `None`.
    #[serde(skip_serializing_if = "Option::is_none")]
    options: Option<TurnOptions>,
    /// Whether the server should stream the reply. Always serialized.
    stream: bool,
}

/// The per-turn delta: either a new user message or a batch of tool
/// results, never both.
///
/// `#[serde(untagged)]` means no variant name is written — each variant
/// serializes as an object holding just its own field.
#[derive(Debug, Serialize)]
#[serde(untagged)]
enum TurnDelta {
    /// Serializes as `{"user_message": "..."}`.
    User { user_message: String },
    /// Serializes as `{"tool_results": [...]}`.
    Tools { tool_results: Vec<ToolResultDelta> },
}

impl TurnDelta {
    /// Build the delta for the next `/turn` call from the tail of
    /// `req.messages`.
    ///
    /// - A trailing `Role::Tool` message yields `TurnDelta::Tools` holding
    ///   every `ToolResult` part from the whole run of consecutive Tool
    ///   messages at the end of the list, in their original order.
    /// - A trailing `Role::User` message yields `TurnDelta::User` holding
    ///   its text (for `Parts`, the `Text` fragments concatenated in order
    ///   with no separator; other part kinds are ignored).
    ///
    /// # Errors
    ///
    /// Fails when `req.messages` is empty, when the last message is neither
    /// User nor Tool, when the trailing Tool run carries no `ToolResult`
    /// part, or when the user text comes out empty.
    fn from_request(req: &LlmRequest) -> Result<Self> {
        let newest = req
            .messages
            .last()
            .context("rsclaw: empty messages, no delta to send")?;

        match newest.role {
            Role::Tool => {
                // When the assistant calls N tools in parallel, the runtime
                // queues N consecutive Role::Tool messages (one per result).
                // Protocol §2.3 requires a single turn() carry ALL of them
                // in one tool_results array, else server bails with 400
                // tool_results_incomplete. The run starts right after the
                // last non-Tool message (or at index 0 if there is none).
                let run_start = req
                    .messages
                    .iter()
                    .rposition(|m| !matches!(m.role, Role::Tool))
                    .map_or(0, |idx| idx + 1);

                let tool_results: Vec<ToolResultDelta> = req.messages[run_start..]
                    .iter()
                    .filter_map(|m| match &m.content {
                        MessageContent::Parts(parts) => Some(parts),
                        MessageContent::Text(_) => None,
                    })
                    .flatten()
                    .filter_map(|part| match part {
                        ContentPart::ToolResult {
                            tool_use_id,
                            content,
                            is_error,
                        } => Some(ToolResultDelta {
                            tool_use_id: tool_use_id.clone(),
                            content: content.clone(),
                            is_error: is_error.unwrap_or(false),
                        }),
                        _ => None,
                    })
                    .collect();

                if tool_results.is_empty() {
                    anyhow::bail!("rsclaw: trailing Tool message(s) carried no tool_result parts");
                }
                Ok(TurnDelta::Tools { tool_results })
            }
            Role::User => {
                // The trailing message is treated as one user_message.
                // Empty content (Text("") or Parts with only empty Text
                // fragments) bails: protocol §2.3 requires a real delta,
                // and silently shipping "" still bills a prefill on the
                // upstream slot. Mirrors the empty-tool_results bail above.
                let mut user_message = String::new();
                match &newest.content {
                    MessageContent::Text(whole) => user_message.push_str(whole),
                    MessageContent::Parts(parts) => {
                        for part in parts {
                            if let ContentPart::Text { text } = part {
                                user_message.push_str(text);
                            }
                        }
                    }
                }
                if user_message.is_empty() {
                    anyhow::bail!("rsclaw: last message has no usable content for delta")
                }
                Ok(TurnDelta::User { user_message })
            }
            _ => anyhow::bail!(
                "rsclaw: last message must be User or Tool, got {:?}",
                newest.role
            ),
        }
    }
}

#[derive(Debug, Serialize)]
struct ToolResultDelta {
    tool_use_id: String,
    content: String,
    #[serde(default)]
    is_error: bool,
}

/// `serialize_with` helper for `Option<f32>` fields.
///
/// A present value is converted with `super::json_f32` (rounds to 2
/// decimal places) before being serialized, rather than taking the default
/// f32 → f64 path, which leaks IEEE 754 precision artefacts (0.6_f32 →
/// 0.6000000238418579 in JSON output). This matches what every other
/// provider in this crate does by hand with
/// `body["temperature"] = json_f32(t)`.
///
/// An absent value is written via `serialize_none`.
fn ser_opt_f32<S: serde::Serializer>(
    v: &Option<f32>,
    s: S,
) -> std::result::Result<S::Ok, S::Error> {
    if let Some(raw) = *v {
        return super::json_f32(raw).serialize(s);
    }
    s.serialize_none()
}

/// Generation options sent with session requests (`options` on
/// `CreateSessionReq`, `ReplayReq` and `TurnReq`).
///
/// Every field is optional and its key is left off the wire when `None`,
/// so a default value serializes as an empty object. Field order is the
/// serialized key order; do not reorder.
#[derive(Debug, Serialize, Clone, Default)]
struct TurnOptions {
    /// Upper bound on generated tokens.
    #[serde(skip_serializing_if = "Option::is_none")]
    max_tokens: Option<u32>,
    /// Sampling temperature; serialized through `ser_opt_f32`.
    #[serde(
        skip_serializing_if = "Option::is_none",
        serialize_with = "ser_opt_f32"
    )]
    temperature: Option<f32>,
    /// Nucleus-sampling cutoff; serialized through `ser_opt_f32`.
    #[serde(
        skip_serializing_if = "Option::is_none",
        serialize_with = "ser_opt_f32"
    )]
    top_p: Option<f32>,
    /// Explicit thinking on/off switch for the model.
    #[serde(skip_serializing_if = "Option::is_none")]
    enable_thinking: Option<bool>,
    /// Stop sequences.
    #[serde(skip_serializing_if = "Option::is_none")]
    stop: Option<Vec<String>>,
    /// Session idle time-to-live in seconds — presumably how long the
    /// server keeps an idle session; confirm against the protocol spec.
    #[serde(skip_serializing_if = "Option::is_none")]
    idle_ttl_secs: Option<u32>,
    /// Worker constrains tool-call decoding with the lazy GBNF grammar it
    /// derived from this session's tools at create/replay. `None` keeps the
    /// wire bytes identical to pre-grammar builds (and lets old workers
    /// stay oblivious).
    #[serde(skip_serializing_if = "Option::is_none")]
    constrain_tool_calls: Option<bool>,
}

impl TurnOptions {
    /// Derive the wire options from an `LlmRequest`.
    ///
    /// `max_tokens` and `temperature` are copied through, and
    /// `enable_thinking` is `Some(budget > 0)` whenever the request sets a
    /// `thinking_budget`. `top_p`, `stop` and `idle_ttl_secs` are left
    /// unset.
    ///
    /// `constrain_tool_calls` comes from the provider-level config flag;
    /// it is only put on the wire when the request actually carries tools
    /// (a tool-less request has no grammar to constrain with).
    fn from_request(req: &LlmRequest, constrain_tool_calls: bool) -> Self {
        let grammar_flag = if constrain_tool_calls && !req.tools.is_empty() {
            Some(true)
        } else {
            None
        };
        let thinking_flag = req.thinking_budget.map(|budget| budget > 0);

        Self {
            max_tokens: req.max_tokens,
            temperature: req.temperature,
            enable_thinking: thinking_flag,
            constrain_tool_calls: grammar_flag,
            // top_p, stop and idle_ttl_secs stay `None`.
            ..Self::default()
        }
    }
}

// ---------------------------------------------------------------------------
// Request splitting (LlmRequest → protocol fields)
// ---------------------------------------------------------------------------

/// Maps an `LlmRequest` onto the protocol's split fields per
/// rsclaw-protocol §2.1 (v1.9 segment-aware hybrid path).
///
/// When the runtime populated `req.system_shared` / `req.user_system`
/// (kvCacheMode=2 path on the main agent loop), the split lands in the
/// "real" hybrid shape:
/// - `dynamic_system`        ← shared system prefix (byte-stable across every
///   RsClaw client of this version) → wire `dynamic_prefix.system`
/// - `dynamic_user_system`   ← per-machine non-hashed segment → wire
///   `dynamic_prefix.user_system` (worker layer-2 cache key intentionally
///   EXCLUDES this, so it can vary per session without collapsing the hit rate)
/// - `dynamic_tools`         ← shared/base tools — names from
///   [`BUILTIN_TOOL_NAMES`]. Participates in `base_hash16`, so this set MUST be
///   byte-identical across clients of the same RsClaw version or the base
///   prefix cache fragments per-client.
/// - `dynamic_user_tools`    ← per-session/per-client private tools: plugins,
///   MCP, workspace-specific. Not in `base_hash16`; folded into the
///   user-segment key so two sessions differing only here share the base cache
///   but get separate segment slots.
///
/// When the split fields are missing (internal sessions / non-runtime
/// callers) we degrade gracefully: stuff `req.system` into
/// `dynamic_system` and leave `dynamic_user_system` empty. Tool
/// classification still applies — builtins go to `dynamic_tools`,
/// everything else to `dynamic_user_tools`. The base cache now hashes
/// only over the builtin slice, so even internal callers share the
/// per-version base slot (strict cache improvement over the v1.8
/// "everything-into-tools" degraded path).
///
/// The `'a` lifetime ties the two borrowed system strings to the
/// `LlmRequest` this value was split from; everything else is owned.
struct SplitRequest<'a> {
    /// Namespaced `rsclaw/<id>` per protocol §2.10.1.
    prefix_id: String,
    /// Bare model id with the `rsclaw/` namespace prefix stripped. Required
    /// in the wire body for both `POST /sessions` and `/sessions/replay`
    /// as of 2026-05; sessions carry this binding for their lifetime.
    model: String,
    /// Shared system text borrowed from the request: `req.system_shared`
    /// when the split fields are present, otherwise the whole `req.system`.
    /// Empty string when the chosen source is `None`.
    dynamic_system: &'a str,
    /// Per-machine system text borrowed from `req.user_system`; empty in
    /// the degraded (no split fields) path.
    dynamic_user_system: &'a str,
    /// Base / shared tools — only names in [`BUILTIN_TOOL_NAMES`].
    /// Drives `base_hash16`.
    dynamic_tools: Vec<Value>,
    /// Private tools — names NOT in [`BUILTIN_TOOL_NAMES`]. Drives only
    /// the user-segment key.
    dynamic_user_tools: Vec<Value>,
    /// Per-turn generation options built by `TurnOptions::from_request`.
    options: TurnOptions,
}

/// Dump the full request shape for one turn so the operator can
/// replay the same logical input against several rsclaw-llm endpoints
/// (rsclaw stateful `/sessions/<id>/turn`, `/sessions/replay`, vanilla
/// `/v1/chat/completions`) and compare the model's output. Used to
/// bisect a "truncation happens via rsclaw protocol but not via OpenAI
/// compat" symptom into "rsclaw-llm side problem" vs "model side
/// problem".
///
/// Writes one JSON file per turn to:
///   `<base_dir>/debug/turn-<unix_ms>-<session_suffix>.json`
///
/// The dump is meant to be gated on the `RSCLAW_DUMP_TURN` env var (any
/// non-empty value). This function does not read the variable itself and
/// writes unconditionally, so the caller is expected to perform the check.
/// Directory-creation, serialization and write failures are logged at WARN
/// and never abort the turn.
///
/// Parameters:
/// - `session_key`: caller-side key of the session, recorded verbatim.
/// - `entry`: cached session state (`session_id`, `prefix_id`,
///   `last_seen_msgs_len` are recorded; the id also seeds the file name).
/// - `split`: the protocol split of `req`, used for the replay body.
/// - `delta`: the turn delta about to be sent, used for the turn body.
/// - `req`: the originating request, used for the summary, the replay
///   history and the OpenAI-compatible body.
fn dump_turn_for_debug(
    session_key: &str,
    entry: &SessionEntry,
    split: &SplitRequest<'_>,
    delta: &TurnDelta,
    req: &LlmRequest,
) {
    use std::time::{SystemTime, UNIX_EPOCH};
    // Milliseconds since the epoch; 0 if the clock is before the epoch.
    // The u128 → u64 narrowing is checked instead of silently truncated.
    let now_ms = SystemTime::now()
        .duration_since(UNIX_EPOCH)
        .map(|d| u64::try_from(d.as_millis()).unwrap_or(u64::MAX))
        .unwrap_or(0);

    // Compose the replay history slice (everything before the trailing
    // delta — same logic `replay()` uses on the recovery path).
    // NOTE(review): `Role::System` messages are serialized into the history
    // as-is here; confirm whether the live replay path lifts them out first
    // (see `split_system_messages`), in which case the dump should too.
    let history_owned: Vec<&Message> = history_for_replay(&req.messages).iter().collect();
    let history_values = serialize_replay_history(&history_owned);

    // Equivalent OpenAI-compatible chat body — message-array + tools
    // shape that vanilla `/v1/chat/completions` accepts. The model id
    // is the bare form (no `rsclaw/` namespace prefix) since the local
    // llama-server's OpenAI surface doesn't recognize prefixed names.
    let openai_model = req.model.strip_prefix("rsclaw/").unwrap_or(&req.model);
    let openai_messages: Vec<Value> = req
        .messages
        .iter()
        .filter_map(|m| serde_json::to_value(m).ok())
        .collect();
    let openai_tools: Vec<Value> = req
        .tools
        .iter()
        .map(|t| {
            json!({
                "type": "function",
                "function": {
                    "name": t.name,
                    "description": t.description,
                    "parameters": t.parameters,
                }
            })
        })
        .collect();

    let opts = split.options.clone();

    // Wire body of the actual `/sessions/<id>/turn` request we're about
    // to send. Must match what `Provider::turn()` puts on the wire: the
    // TurnDelta uses `#[serde(flatten)]` inside TurnReq, so `user_message`
    // or `tool_results` sit at the TOP level alongside `options`/`stream`.
    // Nesting them under a `delta` wrapper would make the dump non-replayable
    // — the worker returns 400 `invalid_request: turn must include exactly
    // one of user_message (string) or tool_results (non-empty array)`.
    // Canonicalize through to_canonical_value so the dumped bytes match
    // the actual wire bytes the provider sends. The crate uses
    // serde_json's `preserve_order` feature, which keeps insertion-order
    // keys; the wire send path runs the body through `to_canonical_value`
    // (BTreeMap-sorted) so worker-side hashes are byte-stable. Without
    // the same pass on the dump, an operator using RSCLAW_DUMP_TURN to
    // bisect a "truncation in rsclaw protocol but not in OpenAI compat"
    // symptom would see *different* bytes from what `/sessions/.../turn`
    // received — masking exactly the kind of byte-level bug the dump
    // tool exists to diagnose. R1 review I2.
    let recall = req.recall.as_ref().filter(|r| !r.context.trim().is_empty());
    let turn_body = to_canonical_value(
        serde_json::to_value(&TurnReq {
            delta,
            recall_context: recall.map(|r| r.context.as_str()),
            recall: recall.map(|r| &r.metadata),
            options: Some(opts.clone()),
            stream: true,
        })
        .unwrap_or(Value::Null),
    );

    // Wire body that would rehydrate this session from scratch via
    // `/sessions/replay`. Useful when the session_id is no longer alive
    // on the worker and the operator wants to recreate the exact state.
    let mut dynamic_prefix_dump = serde_json::Map::new();
    dynamic_prefix_dump.insert("system".to_owned(), json!(split.dynamic_system));
    dynamic_prefix_dump.insert("tools".to_owned(), json!(split.dynamic_tools));
    if !split.dynamic_user_tools.is_empty() {
        dynamic_prefix_dump.insert("user_tools".to_owned(), json!(split.dynamic_user_tools));
    }
    if !split.dynamic_user_system.is_empty() {
        dynamic_prefix_dump.insert("user_system".to_owned(), json!(split.dynamic_user_system));
    }
    // `model` (bare, namespace-stripped id) is required in the
    // `/sessions/replay` wire body; omitting it would make the dumped
    // replay body non-replayable, defeating the purpose of the dump.
    let replay_body = to_canonical_value(json!({
        "prefix_id": split.prefix_id,
        "model": split.model,
        "dynamic_prefix": Value::Object(dynamic_prefix_dump),
        "history": history_values,
        "options": serde_json::to_value(&opts).unwrap_or(Value::Null),
    }));

    let dump = json!({
        "schema_version": 1,
        "timestamp_ms": now_ms,
        "session_key": session_key,
        "model": req.model,
        "rsclaw_session": {
            "session_id": entry.session_id,
            "prefix_id": entry.prefix_id,
            "last_seen_msgs_len": entry.last_seen_msgs_len,
        },
        "llm_request_summary": {
            "msg_count": req.messages.len(),
            "tool_count": req.tools.len(),
            "system_len": req.system.as_deref().map(|s| s.len()).unwrap_or(0),
            "max_tokens": req.max_tokens,
            "temperature": req.temperature,
            "kv_cache_mode": req.kv_cache_mode,
        },
        "rsclaw_turn_body": turn_body,
        "rsclaw_replay_body": replay_body,
        "openai_chat_completions_body": {
            "model": openai_model,
            "messages": openai_messages,
            "tools": openai_tools,
            "temperature": req.temperature,
            "max_tokens": req.max_tokens,
            "stream": true,
        },
        "replay_instructions": [
            "Pick ONE of the three replay paths and POST against the worker:",
            "  A. Stateful turn against a LIVE session (only works while session is alive):",
            "     curl -X POST $BASE/sessions/<session_id>/turn -d @<this-file>[.rsclaw_turn_body]",
            "  B. Re-hydrate then turn — recreates session deterministically:",
            "     curl -X POST $BASE/sessions/replay  -d @<this-file>[.rsclaw_replay_body]",
            "     curl -X POST $BASE/sessions/<new_session_id>/turn -d @<this-file>[.rsclaw_turn_body]",
            "  C. Stateless OpenAI-compat for comparison (no session, full history each time):",
            "     curl -X POST $BASE/v1/chat/completions -d @<this-file>[.openai_chat_completions_body]"
        ]
    });

    // Pick a short suffix for the file name: the first 8 characters of
    // the segment after the session_id's last `_`, which is enough to
    // disambiguate dumps written within the same millisecond.
    let sess_suffix: String = entry
        .session_id
        .rsplit('_')
        .next()
        .unwrap_or("unknown")
        .chars()
        .take(8)
        .collect();
    let dir = rsclaw_config::loader::base_dir().join("debug");
    if let Err(e) = std::fs::create_dir_all(&dir) {
        tracing::warn!(error = %e, "RSCLAW_DUMP_TURN: create_dir_all failed");
        return;
    }
    let path = dir.join(format!("turn-{now_ms}-{sess_suffix}.json"));
    match serde_json::to_string_pretty(&dump) {
        Ok(s) => match std::fs::write(&path, s) {
            Ok(_) => tracing::info!(path = %path.display(), "RSCLAW_DUMP_TURN: turn dumped"),
            Err(e) => {
                tracing::warn!(error = %e, path = %path.display(), "RSCLAW_DUMP_TURN: write failed")
            }
        },
        Err(e) => tracing::warn!(error = %e, "RSCLAW_DUMP_TURN: serialize failed"),
    }
}

/// Produce a canonical deep copy of a JSON value: every object has its
/// keys laid out in sorted (`String` ordering) order, recursively.
/// Array elements keep their positions — only their contents are
/// canonicalized — and scalars pass through untouched.
///
/// Why this exists: `serde_json` is built with `preserve_order`
/// crate-wide, so `Map<String, Value>` remembers insertion order. That
/// is fine for round-tripping JSON5 configs, but it lets the
/// non-determinism of upstream `HashMap` iteration leak into the wire
/// JSON sent to rsclaw-server. The worker hashes the
/// `dynamic_prefix.tools` payload byte-by-byte to form its prefix cache
/// key, so a single swapped key means `dynamic_miss` and a full prefill
/// (~200s on a 28k-token prefix). Sorting keys yields a
/// content-addressed form: identical tool definitions → identical
/// bytes → identical hash → `dynamic_hit`.
fn to_canonical_value(v: serde_json::Value) -> serde_json::Value {
    match v {
        serde_json::Value::Array(items) => {
            serde_json::Value::Array(items.into_iter().map(to_canonical_value).collect())
        }
        serde_json::Value::Object(fields) => {
            // Keys within one object are unique, so an unstable sort is
            // enough to fix the layout regardless of insertion sequence.
            let mut entries: Vec<(String, serde_json::Value)> = fields.into_iter().collect();
            entries.sort_unstable_by(|left, right| left.0.cmp(&right.0));

            let mut canon = serde_json::Map::new();
            for (key, value) in entries {
                canon.insert(key, to_canonical_value(value));
            }
            serde_json::Value::Object(canon)
        }
        scalar => scalar,
    }
}

/// Split an `LlmRequest` into the protocol's open/replay fields.
///
/// - `prefix_id` is provider-level configuration and is passed through
///   untouched.
/// - `constrain_tool_calls` is forwarded to `TurnOptions::from_request`.
///
/// Returns an error when the request's model id is empty after the
/// `rsclaw/` namespace prefix has been stripped.
fn split_request<'a>(
    req: &'a LlmRequest,
    prefix_id: &str,
    constrain_tool_calls: bool,
) -> Result<SplitRequest<'a>> {
    // `model` is required in the open/replay wire body since 2026-05.
    // The server records the bare slot id (e.g. `rsclaw-agent-v1`);
    // sending the namespaced `rsclaw/...` form trips the worker's
    // model-slot whitelist check, so drop the namespace when present.
    let model = match req.model.strip_prefix("rsclaw/") {
        Some(bare) => bare.to_owned(),
        None => req.model.as_str().to_owned(),
    };
    if model.is_empty() {
        anyhow::bail!("rsclaw: req.model is empty; cannot open session without a model id");
    }

    // Wire form of one tool, canonicalized so the bytes are stable across
    // gateway runs. With `preserve_order` enabled crate-wide, the schema's
    // key order is whatever the producing `HashMap` / macro / derive
    // emitted; one flipped key changes the worker-side dynamic_prefix
    // hash, forces `dynamic_miss` and costs a fresh 30-200s prefill on
    // every gateway restart even though the tool list is logically the
    // same. The worker hash is content-addressed, so sorted keys are
    // enough to make "same tools" land in the same slot.
    let wire_tool = |tool: &super::ToolDef| -> Value {
        to_canonical_value(json!({
            "name": tool.name,
            "description": tool.description,
            "input_schema": tool.parameters,
        }))
    };

    // v1.9 split by name membership in `BUILTIN_TOOL_NAMES`: builtins feed
    // `dynamic_prefix.tools` (base hash), everything else feeds
    // `dynamic_prefix.user_tools` (user segment). This is unconditional —
    // even when the system-text split degrades — so the base cache slot is
    // byte-stable across clients of this version no matter which
    // MCP/plugin tools they bring. Relative order within each group
    // follows `req.tools`.
    let (builtin, private): (Vec<&super::ToolDef>, Vec<&super::ToolDef>) = req
        .tools
        .iter()
        .partition(|tool| rsclaw_types::BUILTIN_TOOL_NAMES.contains(&tool.name.as_str()));
    let dynamic_tools: Vec<Value> = builtin.into_iter().map(&wire_tool).collect();
    let dynamic_user_tools: Vec<Value> = private.into_iter().map(&wire_tool).collect();

    let (dynamic_system, dynamic_user_system) = match (
        req.system_shared.as_deref(),
        req.user_system.as_deref(),
    ) {
        // Internal sessions / non-runtime callers carry no split fields:
        // collapse the full system text into the base position. The tool
        // classification above still yields cacheable base+user segments.
        (None, None) => (req.system.as_deref().unwrap_or(""), ""),
        // At least one split field is present: use both, empty if absent.
        (shared, user) => (shared.unwrap_or(""), user.unwrap_or("")),
    };

    Ok(SplitRequest {
        // Config-driven (provider-level, default
        // [`RSCLAW_DEFAULT_PREFIX_ID`]) — NOT derived from `req.model` or
        // per-turn data. Protocol §2.10.1 only mandates exactly one `/`
        // separator; keeping the shape valid is the provider builder's job.
        prefix_id: prefix_id.to_owned(),
        model,
        dynamic_system,
        dynamic_user_system,
        dynamic_tools,
        dynamic_user_tools,
        options: TurnOptions::from_request(req, constrain_tool_calls),
    })
}

// Note on `history_for_replay` (defined further down in this file; this
// paragraph is the head of its documentation and does NOT describe the
// function immediately below):
//
// It returns the history slice to send to `/sessions/replay`: every
// message except the trailing delta (which `turn()` will re-send).
// Empty input returns an empty slice — replay can still hydrate a
// fresh session with no prior turns.
//
// When the assistant calls N tools in parallel the runtime queues N
// consecutive `Role::Tool` messages, and `TurnDelta::from_request`
// folds ALL of them into a single tool_results delta (protocol §2.3
// requires it). To keep the two sides symmetric the history slice
// must drop every consecutive trailing `Role::Tool` — dropping just
// one would leave the other N-1 in history, the server would replay
// them into the KV, and then the turn would re-send the same
// tool_results, hydrating duplicates.

/// Collapse an `LlmRequest` (system text + messages) into one prompt
/// string for the one-shot `/fastshot` and `/vision` endpoints, which
/// accept a bare `prompt` field rather than OpenAI-style messages.
///
/// Segments appear in this order: the system text, then every text
/// segment of every message in message order. Each segment is trimmed,
/// blank segments are skipped, and the survivors are joined by a blank
/// line (`"\n\n"`).
///
/// Only text is kept. Image parts are skipped here (the caller collects
/// them via `extract_images_for_oneshot`), and tool use/result and
/// reasoning parts are skipped too — fastshot is a tool-less endpoint,
/// so historical tool traffic carries no meaning there.
fn flatten_prompt_for_oneshot(req: &LlmRequest) -> String {
    /// Trimmed owned copy of `raw`, or `None` when nothing but
    /// whitespace is left.
    fn non_blank(raw: &str) -> Option<String> {
        let stripped = raw.trim();
        if stripped.is_empty() {
            None
        } else {
            Some(stripped.to_owned())
        }
    }

    let mut segments: Vec<String> = Vec::new();
    segments.extend(req.system.as_deref().and_then(non_blank));

    for message in &req.messages {
        match &message.content {
            MessageContent::Text(body) => segments.extend(non_blank(body)),
            MessageContent::Parts(items) => {
                segments.extend(items.iter().filter_map(|item| match item {
                    ContentPart::Text { text } => non_blank(text),
                    _ => None,
                }));
            }
        }
    }

    segments.join("\n\n")
}

/// Parse one SSE chunk from the native fastshot/vision wire and
/// emit `StreamEvent`s. Buffers partial lines across chunks so a
/// frame split mid-JSON resolves cleanly; mirrors the strategy used
/// by `openai::parse_sse_chunk_with_buffer` but with the
/// fastshot-native JSON shape.
async fn parse_oneshot_sse_chunk(
    chunk: anyhow::Result<bytes::Bytes>,
    line_buffer: &tokio::sync::Mutex<String>,
    utf8_remainder: &tokio::sync::Mutex<Vec<u8>>,
) -> Vec<anyhow::Result<StreamEvent>> {
    let bytes = match chunk {
        Ok(b) => b,
        Err(e) => return vec![Err(e)],
    };

    // Carry forward any UTF-8 continuation bytes that landed at the
    // tail of the previous chunk — without this, CJK / emoji
    // characters that straddle a chunk boundary corrupt into U+FFFD.
    let mut remainder = utf8_remainder.lock().await;
    let combined = if remainder.is_empty() {
        bytes.to_vec()
    } else {
        let mut c = std::mem::take(&mut *remainder);
        c.extend_from_slice(&bytes);
        c
    };
    let text: String = match std::str::from_utf8(&combined) {
        Ok(t) => {
            drop(remainder);
            t.to_owned()
        }
        Err(e) => {
            let valid_up_to = e.valid_up_to();
            *remainder = combined[valid_up_to..].to_vec();
            drop(remainder);
            if valid_up_to == 0 {
                return Vec::new();
            }
            // SAFETY: valid_up_to is at a valid UTF-8 boundary by
            // construction of the `Utf8Error`.
            unsafe { std::str::from_utf8_unchecked(&combined[..valid_up_to]) }.to_owned()
        }
    };

    let mut buffer = line_buffer.lock().await;
    buffer.push_str(&text);
    let Some(last_newline) = buffer.rfind('\n') else {
        return Vec::new();
    };
    let complete = buffer[..last_newline].to_owned();
    let leftover = buffer[last_newline + 1..].to_owned();
    buffer.clear();
    buffer.push_str(&leftover);
    drop(buffer);

    let mut events: Vec<anyhow::Result<StreamEvent>> = Vec::new();
    for line in complete.lines() {
        let Some(payload) = line.strip_prefix("data:") else {
            continue;
        };
        let payload = payload.trim_start();
        if payload.is_empty() {
            continue;
        }
        if payload == "[DONE]" {
            // Spec §3: server always sends `data: [DONE]` after the
            // terminal frame. We only emit our own Done if the
            // worker never sent one — typical path is `done` event
            // first (which we already turned into StreamEvent::Done
            // with usage) then `[DONE]` which we swallow here.
            continue;
        }
        let val: Value = match serde_json::from_str(payload) {
            Ok(v) => v,
            Err(_) => {
                tracing::debug!(payload, "rsclaw fastshot: ignoring unparseable SSE line");
                continue;
            }
        };
        let ty = val.get("type").and_then(Value::as_str).unwrap_or("");
        match ty {
            // Legacy fastshot frame: {"type":"delta","content":"..."}.
            "delta" => {
                if let Some(content) = val.get("content").and_then(Value::as_str) {
                    if !content.is_empty() {
                        events.push(Ok(StreamEvent::TextDelta(content.to_owned())));
                    }
                }
            }
            // Current Anthropic-block-style frame the worker actually emits:
            //   {"type":"block_delta","index":0,"delta":"..."}
            // Text lives in `delta` (not `content`) and the type is
            // `block_delta` (not `delta`). Without this arm the entire stream
            // parses to zero TextDelta — `done` still fires, so callers get a
            // clean-but-EMPTY completion. That silent "empty output from LLM"
            // broke /oneshot AND /fastshot, taking down L1/lesson extraction
            // and crystallization. Prefer `delta`, fall back to
            // `content`/`text` for forward/backward compatibility.
            "block_delta" => {
                let piece = val
                    .get("delta")
                    .and_then(Value::as_str)
                    .or_else(|| val.get("content").and_then(Value::as_str))
                    .or_else(|| val.get("text").and_then(Value::as_str));
                if let Some(t) = piece {
                    if !t.is_empty() {
                        events.push(Ok(StreamEvent::TextDelta(t.to_owned())));
                    }
                }
            }
            // Block-framing markers carry no text — ignore quietly.
            "start" | "block_start" | "block_stop" | "ping" => {}
            "done" => {
                let usage = val
                    .get("usage")
                    .and_then(Value::as_object)
                    .map(|u| TokenUsage {
                        input: extract_usage_count(u, &["input_tokens", "prompt_tokens", "input"]),
                        output: extract_usage_count(
                            u,
                            &["output_tokens", "completion_tokens", "output"],
                        ),
                        cache_creation: extract_usage_count(
                            u,
                            &["cache_creation_input_tokens", "cache_creation_tokens"],
                        ),
                        cache_read: extract_usage_count(
                            u,
                            &[
                                "cache_read_input_tokens",
                                "cached_tokens",
                                "cache_read_tokens",
                            ],
                        ),
                        recall_tokens: extract_usage_count(u, &["recall_tokens"]),
                        recall_doc_ids: extract_usage_string_array(u, "recall_doc_ids"),
                        recall_hash: u
                            .get("recall_hash")
                            .and_then(Value::as_str)
                            .map(str::to_owned),
                        recall_truncated: u
                            .get("recall_truncated")
                            .and_then(Value::as_bool)
                            .unwrap_or(false),
                    });
                events.push(Ok(StreamEvent::Done { usage }));
            }
            "error" => {
                // Per §4.2 the error payload is `{code, message}`.
                let err = val.get("error");
                let code = err
                    .and_then(|e| e.get("code"))
                    .and_then(Value::as_str)
                    .unwrap_or("");
                let detail = err
                    .and_then(|e| e.get("message"))
                    .and_then(Value::as_str)
                    .unwrap_or("");
                let msg = match (code.is_empty(), detail.is_empty()) {
                    (false, false) => format!("rsclaw stream error [{code}]: {detail}"),
                    (false, true) => format!("rsclaw stream error [{code}]"),
                    (true, false) => format!("rsclaw stream error: {detail}"),
                    (true, true) => "rsclaw stream error".to_string(),
                };
                events.push(Ok(StreamEvent::Error(msg)));
            }
            "thinking" => {
                if let Some(s) = val.get("content").and_then(Value::as_str)
                    && !s.is_empty()
                {
                    events.push(Ok(StreamEvent::ReasoningDelta(s.to_string())));
                }
            }
            "tool_call" => {
                let id = val
                    .get("id")
                    .and_then(Value::as_str)
                    .unwrap_or("")
                    .to_string();
                let name = val
                    .get("name")
                    .and_then(Value::as_str)
                    .unwrap_or("")
                    .to_string();
                let input = val
                    .get("input")
                    .cloned()
                    .filter(Value::is_object)
                    .unwrap_or(Value::Object(Default::default()));
                events.push(Ok(StreamEvent::ToolCall { id, name, input }));
            }
            other => {
                tracing::debug!(ty = other, payload, "rsclaw fastshot: unknown event type");
            }
        }
    }
    events
}

/// Collect every non-empty image URL/data-URI found in the request's
/// multi-part messages, in message order and then part order.
///
/// Plain-text messages contribute nothing. Used by the `/vision`
/// one-shot endpoint, which expects an `images: [...]` array alongside
/// the flattened prompt.
fn extract_images_for_oneshot(req: &LlmRequest) -> Vec<String> {
    req.messages
        .iter()
        .filter_map(|message| match &message.content {
            MessageContent::Parts(parts) => Some(parts),
            _ => None,
        })
        .flatten()
        .filter_map(|part| match part {
            ContentPart::Image { url } if !url.is_empty() => Some(url.clone()),
            _ => None,
        })
        .collect()
}

/// Returns the history slice to send to `/sessions/replay`: every
/// message except the trailing delta (which `turn()` will re-send).
/// Empty input returns an empty slice — replay can still hydrate a
/// fresh session with no prior turns.
///
/// When the list ends in `Role::Tool`, the whole run of consecutive
/// trailing `Role::Tool` messages is dropped. With N parallel tool calls
/// the runtime queues N consecutive `Role::Tool` messages and
/// `TurnDelta::from_request` folds all of them into one tool_results
/// delta (protocol §2.3 requires it); dropping only one would leave the
/// other N-1 in history, the server would replay them into the KV, and
/// the turn would then re-send the same tool_results, hydrating
/// duplicates.
///
/// For any other trailing role (e.g. a User-trailing list, the iter-1
/// case after `normalize_trailing_system` ran) exactly one message is
/// dropped.
fn history_for_replay(messages: &[Message]) -> &[Message] {
    let trailing_tools = messages
        .iter()
        .rev()
        .take_while(|m| matches!(m.role, Role::Tool))
        .count();
    // No trailing tool run → the delta is the single last message.
    let delta_len = trailing_tools.max(1);
    // `saturating_sub` keeps the empty-input case at an empty slice.
    &messages[..messages.len().saturating_sub(delta_len)]
}

/// Separate a history slice into its non-system messages and the
/// combined text of its `Role::System` messages.
///
/// The runtime threads `Role::System` messages through the conversation
/// list for plugins/skills/ctx blocks, but protocol §2.2 only accepts
/// `user` / `assistant` in history — so the system text is lifted out
/// for the caller to append to `user_system`.
///
/// Returns `(others, system_text)`:
/// - `others` borrows every non-system message, in the original order.
/// - `system_text` joins the system blocks, in the original order, with
///   a blank line (`"\n\n"`). The text parts of a multi-part system
///   message are concatenated without a separator; non-text parts are
///   dropped (system messages are documented as text-only in the
///   runtime). Blocks that end up empty are skipped for both content
///   shapes, so an empty system message never injects a stray `\n\n`.
fn split_system_messages(messages: &[Message]) -> (Vec<&Message>, String) {
    let mut others: Vec<&Message> = Vec::with_capacity(messages.len());
    let mut system_blocks: Vec<String> = Vec::new();

    for message in messages {
        if !matches!(message.role, Role::System) {
            others.push(message);
            continue;
        }

        let block: String = match &message.content {
            MessageContent::Text(body) => body.clone(),
            MessageContent::Parts(parts) => parts
                .iter()
                .filter_map(|part| match part {
                    ContentPart::Text { text } => Some(text.as_str()),
                    _ => None,
                })
                .collect(),
        };
        if !block.is_empty() {
            system_blocks.push(block);
        }
    }

    (others, system_blocks.join("\n\n"))
}

/// Remove every trailing `Role::System` message from `messages` and
/// fold their text into the `Role::User` message that precedes them.
///
/// Background: the runtime appends `Role::System` blocks (dynamic /ctx,
/// just-installed skills) AFTER the User delta on the first iteration of
/// each turn (`turn_scratchpad` empty — see agent/runtime.rs). On later
/// iterations the scratchpad's Assistant/Tool entries follow, so a
/// trailing System only happens on iter-1. `TurnDelta::from_request`
/// rejects a trailing System ("last message must be User or Tool"),
/// which would fail the whole turn — folding the text inline lets the
/// model still see the dynamic context as part of the user_message body.
///
/// Details:
/// - The trailing system messages are always removed, even if their
///   text turns out to be empty.
/// - Text of a multi-part system message is the concatenation of its
///   text parts; empty blocks are skipped; the rest are joined in their
///   original order with a blank line (`"\n\n"`).
/// - A text User message gets the combined text appended after a blank
///   line (or simply becomes it when empty); a multi-part User message
///   gets one extra text part.
/// - If the message before the trailing System block(s) is not
///   `Role::User` (e.g. `Role::Tool` in the parallel tool_results case —
///   theoretical, the runtime doesn't currently inject System after
///   Tool) or the list is now empty, the text is dropped silently.
///   Persistent system content already lives in `user_system` from the
///   prior open/replay; only per-iteration dynamic context is lost,
///   which the protocol has no slot for in a tool_results delta.
fn normalize_trailing_system(messages: &mut Vec<Message>) {
    let trailing_count = messages
        .iter()
        .rev()
        .take_while(|m| matches!(m.role, Role::System))
        .count();
    if trailing_count == 0 {
        return;
    }

    // Detach the trailing run in one go; `split_off` keeps original order,
    // so no reversal is needed afterwards.
    let keep = messages.len() - trailing_count;
    let detached = messages.split_off(keep);

    let blocks: Vec<String> = detached
        .into_iter()
        .map(|message| match message.content {
            MessageContent::Text(body) => body,
            MessageContent::Parts(parts) => parts
                .into_iter()
                .filter_map(|part| match part {
                    ContentPart::Text { text } => Some(text),
                    _ => None,
                })
                .collect::<String>(),
        })
        .filter(|block| !block.is_empty())
        .collect();
    if blocks.is_empty() {
        return;
    }
    let combined = blocks.join("\n\n");

    let Some(target) = messages.last_mut() else {
        // Nothing left to fold into. Drop silently.
        return;
    };
    if !matches!(target.role, Role::User) {
        // Tool (or any other non-User role) — drop silently.
        return;
    }
    match &mut target.content {
        MessageContent::Text(body) if body.is_empty() => *body = combined,
        MessageContent::Text(body) => {
            body.push_str("\n\n");
            body.push_str(&combined);
        }
        MessageContent::Parts(parts) => parts.push(ContentPart::Text { text: combined }),
    }
}

/// Render one history [`Message`] as a `{"role", "content"}` JSON object.
///
/// `Role::Tool` is emitted under the `"user"` role. Text content becomes a
/// JSON string; multi-part content becomes an array with one entry per part
/// (see `serialize_history_part`). When the message carries `rsclaw_hidden`
/// it is attached under the `"rsclaw_hidden"` key, degrading to `null` if it
/// cannot be serialized.
fn serialize_history_message(msg: &Message) -> Value {
    let body = match &msg.content {
        MessageContent::Text(text) => Value::String(text.clone()),
        MessageContent::Parts(parts) => {
            Value::Array(parts.iter().map(serialize_history_part).collect())
        }
    };
    let wire_role = match msg.role {
        Role::System => "system",
        Role::Assistant => "assistant",
        // Tool results travel under the user role on the wire.
        Role::User | Role::Tool => "user",
    };

    let mut entry = serde_json::Map::new();
    entry.insert("role".to_owned(), Value::from(wire_role));
    entry.insert("content".to_owned(), body);
    if let Some(hidden) = &msg.rsclaw_hidden {
        entry.insert(
            "rsclaw_hidden".to_owned(),
            serde_json::to_value(hidden).unwrap_or(Value::Null),
        );
    }
    Value::Object(entry)
}

fn serialize_history_part(p: &ContentPart) -> Value {
    match p {
        ContentPart::Text { text } => json!({"type":"text","text":text}),
        ContentPart::Image { url } => {
            json!({"type":"image","source":{"type":"url","url":url}})
        }
        ContentPart::ToolUse { id, name, input } => {
            json!({"type":"tool_use","id":id,"name":name,"input":input})
        }
        ContentPart::ToolResult {
            tool_use_id,
            content,
            is_error,
        } => {
            let mut obj = json!({
                "type":"tool_result",
                "tool_use_id":tool_use_id,
                "content":content,
            });
            if let Some(e) = is_error {
                obj["is_error"] = json!(e);
            }
            obj
        }
        ContentPart::Reasoning { text } => json!({"type":"thinking","text":text}),
    }
}

/// Serialize replay history with consecutive `Role::Tool` messages
/// coalesced into one `user`-role entry whose `content` array carries
/// every `tool_result` part.
///
/// Why: when the assistant calls N tools in parallel, the runtime queues
/// N consecutive `Role::Tool` messages (one per result). `from_request`
/// already merges them into a single `tool_results` array on a live
/// turn; replay history needs the same shape per protocol §2.2 — the
/// example there shows tool_results inside ONE user-role entry, and
/// shipping N separate entries would either tokenize wrong or trip
/// `400 invalid_history` on stricter chat templates.
fn serialize_replay_history(messages: &[&Message]) -> Vec<Value> {
    let mut out: Vec<Value> = Vec::with_capacity(messages.len());
    let mut i = 0;
    while i < messages.len() {
        let m = messages[i];
        if !matches!(m.role, Role::Tool) {
            out.push(serialize_history_message(m));
            i += 1;
            continue;
        }
        let mut combined: Vec<Value> = Vec::new();
        while i < messages.len() && matches!(messages[i].role, Role::Tool) {
            match &messages[i].content {
                MessageContent::Parts(parts) => {
                    for p in parts {
                        if matches!(p, ContentPart::ToolResult { .. }) {
                            combined.push(serialize_history_part(p));
                        }
                    }
                }
                MessageContent::Text(_) => {
                    // Defensive: today's runtime always emits
                    // `Role::Tool` with `Parts(vec![ToolResult{..}])`,
                    // so this branch should never trigger. If it ever
                    // does — e.g. a future runtime path or a plugin
                    // injecting a synthesised tool message — the text
                    // has no `tool_use_id` to anchor it server-side
                    // (protocol §2.2 requires `tool_result` parts to
                    // pair with prior `tool_use` ids). Drop and surface
                    // a warning so we notice the contract change rather
                    // than silently producing a turn whose model
                    // response is shaped by missing context.
                    tracing::warn!(
                        "rsclaw: dropping Role::Tool with text-only content during \
                         replay (no tool_use_id to pair with — runtime contract \
                         expects Parts(ToolResult{{..}}))",
                    );
                    debug_assert!(
                        false,
                        "Role::Tool must carry Parts(ToolResult{{..}}); got Text"
                    );
                }
            }
            i += 1;
        }
        if !combined.is_empty() {
            out.push(json!({ "role": "user", "content": combined }));
        }
    }
    out
}

// ---------------------------------------------------------------------------
// SSE parsing — rsclaw-native event shape (docs/client-server-integration.md
// §4.2)
// ---------------------------------------------------------------------------
//
// Five top-level frame types share a flat `{type, ...}` shape across every
// rsclaw-server lane that speaks this provider (`/v1/agent/sessions/*/turn`,
// `/v1/agent/fastshot`, `/v1/agent/oneshot`, `/v1/agent/vision`):
//
//   data: {"type":"delta","content":"Hello"}
//   data: {"type":"thinking","content":"reasoning fragment..."}
//       (reasoning models)
//   data: {"type":"tool_call","id":"...","name":"...","input":{...}}
//       (whole frame, not accumulated)
//   data: {"type":"done","finish_reason":"...","usage":{...}}
//   data: {"type":"error","error":{"code":"...","message":"..."}}
//   data: [DONE]
//       (SSE framing sentinel)
//
// NOTE(review): the parser below (`parse_sse_chunk`) handles the
// block-oriented frames `start`, `ping`, `block_start`, `block_delta`,
// `block_stop`, `done` and `error`. Flat `delta` / `thinking` / `tool_call`
// frames are not matched and would be ignored as unknown types. The list
// above appears to predate the block protocol — confirm against the spec
// and update.
//
// Forward-compat rule: unknown `type` values are silently ignored — the
// server may add new types (e.g. `cache_hit_summary`) without breaking
// old clients.

/// Per-stream state for the v1 native protocol. Frames are
/// block-oriented (Anthropic Messages-style):
///
/// ```text
/// block_start{index, block:{type,id?,name?}}
/// block_delta{index, delta}     (zero or more, may interleave by index)
/// block_stop{index}
/// ```
///
/// `blocks` tracks open builders keyed by the `index` field so deltas
/// for interleaved parallel blocks (e.g. one `text` and one `tool_call`
/// open simultaneously) route to the right accumulator.
#[derive(Debug, Default)]
struct SseState {
    blocks: std::collections::HashMap<u64, BlockBuilder>,
}

/// Accumulator for one content block that has been opened by `block_start`
/// and not yet closed by `block_stop`.
#[derive(Debug)]
struct BlockBuilder {
    /// Which kind of block this is; decides how `block_delta` and
    /// `block_stop` frames for this index are handled.
    kind: BlockKind,
    /// Accumulates partial JSON for `tool_call` blocks. Text/thinking
    /// blocks emit incrementally on every `block_delta` for UI
    /// streaming, so `buf` stays empty for them — saves memory on
    /// long answers and lets `block_stop` distinguish the two paths
    /// purely by `kind`.
    buf: String,
    /// `block.id` from the `block_start` frame of a `tool_call` block;
    /// empty for other kinds or when the field is missing.
    tool_id: String,
    /// `block.name` from the `block_start` frame of a `tool_call` block;
    /// empty for other kinds or when the field is missing.
    tool_name: String,
}

/// Content block kinds the SSE parser understands. Any other `block.type`
/// on `block_start` is skipped, so its later deltas find no builder and
/// are dropped.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
enum BlockKind {
    /// Wire type `"text"`: each non-empty delta is emitted as a text delta.
    Text,
    /// Wire type `"thinking"`: each non-empty delta is emitted as a
    /// reasoning delta.
    Thinking,
    /// Wire type `"tool_call"`: deltas accumulate and a single tool-call
    /// event is emitted at `block_stop`.
    ToolCall,
}

async fn parse_sse_chunk(
    chunk: Result<bytes::Bytes>,
    line_buffer: &Arc<tokio::sync::Mutex<String>>,
    utf8_remainder: &Arc<tokio::sync::Mutex<Vec<u8>>>,
    block_state: &Arc<tokio::sync::Mutex<SseState>>,
) -> Vec<Result<StreamEvent>> {
    let mut events: Vec<Result<StreamEvent>> = Vec::new();
    let bytes = match chunk {
        Ok(b) => b,
        Err(e) => {
            events.push(Err(e));
            return events;
        }
    };

    // Stitch this chunk onto any partial UTF-8 left over from the
    // previous chunk; decode strict; stash the trailing invalid bytes
    // (an incomplete multi-byte sequence at the chunk boundary) for
    // the next call. from_utf8_lossy would corrupt CJK / emoji deltas
    // that straddle chunk boundaries by inserting U+FFFD.
    let mut remainder = utf8_remainder.lock().await;
    let stitched: Vec<u8> = if remainder.is_empty() {
        bytes.to_vec()
    } else {
        let mut combined = std::mem::take(&mut *remainder);
        combined.extend_from_slice(&bytes);
        combined
    };
    let decoded: String = match std::str::from_utf8(&stitched) {
        Ok(s) => s.to_owned(),
        Err(e) => {
            let valid_up_to = e.valid_up_to();
            // Two distinct error shapes per `Utf8Error`:
            //   error_len() == None    → trailing bytes are an
            //     INCOMPLETE multi-byte sequence; stash them so the
            //     next chunk completes the codepoint.
            //   error_len() == Some(n) → the next n bytes are
            //     INVALID and will never become valid; advance past
            //     them. Without this advance, every subsequent chunk
            //     stitches onto the bad prefix and fails at the same
            //     position — remainder grows unboundedly and the
            //     stream stalls forever (a single stray 0xFF from a
            //     buggy proxy is enough to wedge the turn). The lost
            //     bytes are unrecoverable garbage in either reading.
            let advance_past_invalid = e.error_len().unwrap_or(0);
            *remainder = stitched[valid_up_to + advance_past_invalid..].to_vec();
            if valid_up_to == 0 {
                return events;
            }
            std::str::from_utf8(&stitched[..valid_up_to])
                .expect("valid_up_to guarantees valid UTF-8")
                .to_owned()
        }
    };
    drop(remainder);

    let mut buf = line_buffer.lock().await;
    buf.push_str(&decoded);
    while let Some(idx) = buf.find('\n') {
        let line = buf[..idx].trim_end_matches('\r').to_string();
        buf.drain(..=idx);
        // SSE has two relevant line shapes: `event: <name>` (the named
        // event channel) and `data: <json>` (the payload). Anthropic
        // shape uses both: `event:` mirrors the JSON's `"type"` field
        // for SDK compatibility. We key off the JSON `"type"` since
        // that's authoritative — `event:` lines without a body, comment
        // lines (`:keepalive`), and stray blank lines are dropped here.
        let Some(payload) = line
            .strip_prefix("data:")
            .map(|s| s.trim_start_matches(' '))
        else {
            continue;
        };
        // v1 native protocol emits a trailing `[DONE]` sentinel after
        // the final `done` event (mirrors OAI translator convention so
        // generic SSE clients have a deterministic close signal). The
        // authoritative terminator is `done`, which already pushed a
        // `StreamEvent::Done` upstream. `[DONE]` itself carries no
        // payload — drop it silently rather than surface a parse error.
        if payload == "[DONE]" {
            continue;
        }
        // Skip empty `data:` payloads silently. SSE keep-alives sometimes
        // surface as `data:\n\n` (no body) when proxies translate
        // `:keepalive` comments. Pushing a parse error here would
        // surface as `Err(...)` down the stream — and runtime's
        // `match event?` (agent/runtime.rs ~4356) propagates that with
        // `?`, killing the whole turn. The empty line carries no model
        // signal; drop it the same way `[DONE]` is dropped.
        if payload.is_empty() {
            continue;
        }
        let value: Value = match serde_json::from_str(payload) {
            Ok(v) => v,
            Err(e) => {
                events.push(Err(anyhow::anyhow!(
                    "rsclaw SSE parse: {e}; line: {payload}"
                )));
                continue;
            }
        };
        let kind = value.get("type").and_then(|v| v.as_str()).unwrap_or("");
        let mut state = block_state.lock().await;
        match kind {
            // Stream prelude — carries cache stats / model / session
            // metadata for telemetry. No StreamEvent emission; clients
            // should tap this for observability if they care.
            "start" => {}
            // Server-side keepalive. SSE-comment translators may also
            // arrive here. Reset any keepalive timer the caller tracks;
            // no StreamEvent.
            "ping" => {}
            // Open a new content block. `index` is the routing key;
            // future deltas with the same index land in this builder.
            // Unknown `block.type` is skipped (forward-compat: server
            // may add e.g. `image` blocks later — old clients ignore).
            "block_start" => {
                let Some(index) = value.get("index").and_then(Value::as_u64) else {
                    continue;
                };
                let block = value.get("block");
                let block_type = block
                    .and_then(|b| b.get("type"))
                    .and_then(Value::as_str)
                    .unwrap_or("");
                let kind = match block_type {
                    "text" => BlockKind::Text,
                    "thinking" => BlockKind::Thinking,
                    "tool_call" => BlockKind::ToolCall,
                    _ => continue,
                };
                let (tool_id, tool_name) = if kind == BlockKind::ToolCall {
                    (
                        block
                            .and_then(|b| b.get("id"))
                            .and_then(Value::as_str)
                            .unwrap_or("")
                            .to_string(),
                        block
                            .and_then(|b| b.get("name"))
                            .and_then(Value::as_str)
                            .unwrap_or("")
                            .to_string(),
                    )
                } else {
                    (String::new(), String::new())
                };
                state.blocks.insert(
                    index,
                    BlockBuilder {
                        kind,
                        buf: String::new(),
                        tool_id,
                        tool_name,
                    },
                );
            }
            // Streaming content for an open block. Text/thinking emit
            // each delta IMMEDIATELY so the UI streams character-by-
            // character; tool_call deltas accumulate (partial JSON
            // shards) until block_stop, where the whole input parses
            // atomically. A delta arriving for an unknown index is
            // dropped silently (forward-compat with reordered or
            // dropped block_start frames).
            "block_delta" => {
                let Some(index) = value.get("index").and_then(Value::as_u64) else {
                    continue;
                };
                let Some(delta) = value.get("delta").and_then(Value::as_str) else {
                    continue;
                };
                let Some(b) = state.blocks.get_mut(&index) else {
                    continue;
                };
                match b.kind {
                    BlockKind::Text => {
                        if !delta.is_empty() {
                            events.push(Ok(StreamEvent::TextDelta(delta.to_string())));
                        }
                    }
                    BlockKind::Thinking => {
                        if !delta.is_empty() {
                            events.push(Ok(StreamEvent::ReasoningDelta(delta.to_string())));
                        }
                    }
                    BlockKind::ToolCall => {
                        b.buf.push_str(delta);
                    }
                }
            }
            // Close a block. For tool_call: parse the accumulated buf
            // as JSON and emit a single ToolCall event. Malformed JSON
            // collapses to an empty object so downstream `.as_object()`
            // consumers never have to match Null; the empty-args path
            // is the runtime's existing "tool with no args" branch.
            // Text/thinking already emitted incrementally — block_stop
            // is just a no-op cleanup for them.
            "block_stop" => {
                let Some(index) = value.get("index").and_then(Value::as_u64) else {
                    continue;
                };
                if let Some(b) = state.blocks.remove(&index)
                    && b.kind == BlockKind::ToolCall
                {
                    let input: Value = if b.buf.is_empty() {
                        Value::Object(Default::default())
                    } else {
                        serde_json::from_str(&b.buf)
                            .unwrap_or_else(|_| Value::Object(Default::default()))
                    };
                    events.push(Ok(StreamEvent::ToolCall {
                        id: b.tool_id,
                        name: b.tool_name,
                        input,
                    }));
                }
            }
            // Terminal frame — carries `finish_reason` (we don't
            // propagate it; runtime treats Done as "stream complete")
            // and `usage` with worker token counts. Field-name drift
            // across lanes documented in client-server-integration.md
            // §4.3 (input_tokens / prompt_tokens / input — try each).
            "done" => {
                let usage = value
                    .get("usage")
                    .and_then(Value::as_object)
                    .map(|u| TokenUsage {
                        input: extract_usage_count(u, &["input_tokens", "prompt_tokens", "input"]),
                        output: extract_usage_count(
                            u,
                            &["output_tokens", "completion_tokens", "output"],
                        ),
                        // Cache stats: Anthropic-style names primary, OpenAI's
                        // single `cached_tokens` accepted as a fallback for
                        // cache_read (OAI doesn't distinguish creation vs read).
                        cache_creation: extract_usage_count(
                            u,
                            &["cache_creation_input_tokens", "cache_creation_tokens"],
                        ),
                        cache_read: extract_usage_count(
                            u,
                            &[
                                "cache_read_input_tokens",
                                "cached_tokens",
                                "cache_read_tokens",
                            ],
                        ),
                        recall_tokens: extract_usage_count(u, &["recall_tokens"]),
                        recall_doc_ids: extract_usage_string_array(u, "recall_doc_ids"),
                        recall_hash: u
                            .get("recall_hash")
                            .and_then(Value::as_str)
                            .map(str::to_owned),
                        recall_truncated: u
                            .get("recall_truncated")
                            .and_then(Value::as_bool)
                            .unwrap_or(false),
                    });
                events.push(Ok(StreamEvent::Done { usage }));
            }
            // Mid-stream error frame: `{type:"error", error:{code, message}}`.
            // Empty fields collapse to a generic message rather than a
            // confusing "[]: " prefix.
            "error" => {
                let err = value.get("error");
                let code = err
                    .and_then(|e| e.get("code"))
                    .and_then(Value::as_str)
                    .unwrap_or("");
                let detail = err
                    .and_then(|e| e.get("message"))
                    .and_then(Value::as_str)
                    .unwrap_or("");
                let msg = match (code.is_empty(), detail.is_empty()) {
                    (false, false) => format!("rsclaw stream error [{code}]: {detail}"),
                    (false, true) => format!("rsclaw stream error [{code}]"),
                    (true, false) => format!("rsclaw stream error: {detail}"),
                    (true, true) => "rsclaw stream error".to_string(),
                };
                events.push(Ok(StreamEvent::Error(msg)));
            }
            // Unknown types: forward-compat — server may add new
            // frame types (e.g. `cache_hit_summary`) and old clients
            // should ignore them rather than fail the turn.
            _ => {}
        }
        drop(state);
    }
    events
}

/// Read a token counter from a worker `usage` object.
///
/// `names` lists the accepted spellings in priority order; the first one
/// that is present AND holds a value `Value::as_u64` accepts wins. A name
/// whose value is of another type is skipped and the next spelling is
/// tried. Returns 0 when none match. Lane-specific name drift is
/// documented in client-server-integration.md §4.3.
fn extract_usage_count(u: &serde_json::Map<String, Value>, names: &[&str]) -> u64 {
    names
        .iter()
        .find_map(|name| u.get(*name).and_then(Value::as_u64))
        .unwrap_or(0)
}

/// Read the array stored under `name` in a worker `usage` object and
/// return its string elements in order. Non-string elements are skipped;
/// a missing key or a non-array value yields an empty vector.
fn extract_usage_string_array(u: &serde_json::Map<String, Value>, name: &str) -> Vec<String> {
    let Some(Value::Array(items)) = u.get(name) else {
        return Vec::new();
    };
    let mut strings = Vec::new();
    for item in items {
        if let Value::String(s) = item {
            strings.push(s.clone());
        }
    }
    strings
}

/// Decide whether a failed response is one of the documented
/// session-eviction signals that the gateway should recover from via
/// replay. The body must be JSON carrying a string at `error.code`, and
/// the (status, code) pair must be one of:
/// - `404 session_not_found` — slot evicted (LRU, idle TTL) or upstream
///   restart; per protocol §5 the recovery is `POST /sessions/replay`
/// - `409 version_drift` — pinned node upgraded past our rsclaw_version
/// - `503 backend_unavailable` — pinned node gone (heartbeat timeout)
///
/// `503 no_backend_available` (capacity exhaustion) is deliberately NOT
/// treated as recoverable — replay would just hit the same wall. A body
/// that is not JSON, or has no string `error.code`, is never an eviction.
fn is_session_evicted(status: StatusCode, body: &str) -> bool {
    let Ok(parsed) = serde_json::from_str::<Value>(body) else {
        return false;
    };
    let Some(code) = parsed
        .get("error")
        .and_then(|e| e.get("code"))
        .and_then(Value::as_str)
    else {
        return false;
    };
    matches!(
        (status, code),
        (StatusCode::NOT_FOUND, "session_not_found")
            | (StatusCode::CONFLICT, "version_drift")
            | (StatusCode::SERVICE_UNAVAILABLE, "backend_unavailable")
    )
}

// ---------------------------------------------------------------------------
// Tests
// ---------------------------------------------------------------------------

#[cfg(test)]
mod tests {
    use super::*;
    use crate::ToolDef;

    fn feed_sse(parser: &mut SseTerminalParser, raw: &str) -> Option<(String, String)> {
        for line in raw.split('\n') {
            if let Some(t) = parser.push_line(line.trim_end_matches('\r')) {
                return Some(t);
            }
        }
        None
    }

    #[test]
    fn sse_parser_skips_heartbeats_and_returns_result() {
        let mut p = SseTerminalParser::default();
        let raw = ": replay-keepalive\n\n: replay-keepalive\n\nevent: result\ndata: {\"session_id\":\"rs_x\"}\n\n";
        let (ev, data) = feed_sse(&mut p, raw).expect("terminal event");
        assert_eq!(ev, "result");
        assert_eq!(data, "{\"session_id\":\"rs_x\"}");
    }

    #[test]
    fn sse_parser_returns_error_event_with_status() {
        let mut p = SseTerminalParser::default();
        let raw =
            ": hb\n\nevent: error\ndata: {\"status\":503,\"body\":{\"error\":\"no capacity\"}}\n\n";
        let (ev, data) = feed_sse(&mut p, raw).expect("terminal event");
        assert_eq!(ev, "error");
        let v: Value = serde_json::from_str(&data).unwrap();
        assert_eq!(v["status"], 503);
    }

    #[test]
    fn sse_parser_joins_multi_line_data() {
        let mut p = SseTerminalParser::default();
        let raw = "event: result\ndata: line1\ndata: line2\n\n";
        let (ev, data) = feed_sse(&mut p, raw).expect("terminal event");
        assert_eq!(ev, "result");
        assert_eq!(data, "line1\nline2");
    }

    #[test]
    fn sse_parser_ignores_unknown_events() {
        let mut p = SseTerminalParser::default();
        assert!(feed_sse(&mut p, "event: progress\ndata: 42\n\n").is_none());
        let (ev, _) = feed_sse(&mut p, "event: result\ndata: {}\n\n").expect("terminal");
        assert_eq!(ev, "result");
    }

    /// Thin shim mirroring `parse_sse_chunk`'s signature so tests don't
    /// have to repeat the lock-wrap boilerplate.
    async fn parse_sse_test(
        chunk: Result<bytes::Bytes>,
        buf: &Arc<tokio::sync::Mutex<String>>,
        rem: &Arc<tokio::sync::Mutex<Vec<u8>>>,
        state: &Arc<tokio::sync::Mutex<SseState>>,
    ) -> Vec<Result<StreamEvent>> {
        parse_sse_chunk(chunk, buf, rem, state).await
    }

    /// Convenience: create a fresh block state for a single-call test.
    fn new_state() -> Arc<tokio::sync::Mutex<SseState>> {
        Arc::new(tokio::sync::Mutex::new(SseState::default()))
    }

    #[tokio::test]
    async fn parse_sse_chunk_recovers_split_utf8() {
        // v1 block_delta line carrying "你好" (U+4F60 = E4 BD A0, U+597D
        // = E5 A5 BD). Open the block first, then split the delta
        // line so chunk boundaries land mid-CJK — UTF-8 stitching
        // must preserve the codepoints verbatim, no U+FFFD leak.
        let buf = Arc::new(tokio::sync::Mutex::new(String::new()));
        let rem = Arc::new(tokio::sync::Mutex::new(Vec::<u8>::new()));
        let state = new_state();
        // Prime the parser with block_start so subsequent block_delta
        // frames have somewhere to route.
        let _ = parse_sse_test(
            Ok(bytes::Bytes::from_static(
                b"data: {\"type\":\"block_start\",\"index\":0,\"block\":{\"type\":\"text\"}}\n",
            )),
            &buf,
            &rem,
            &state,
        )
        .await;

        let line_full = b"data: {\"type\":\"block_delta\",\"index\":0,\"delta\":\"\xe4\xbd\xa0\xe5\xa5\xbd\"}\n";
        // 51 bytes prefix before the CJK bytes; split inside the
        // first codepoint at byte 52 (E4 BD ...| A0 ...).
        let split = 52;
        let (a, b) = line_full.split_at(split);
        let (b, c) = b.split_at(2);

        for piece in [a, b, c] {
            let _ =
                parse_sse_test(Ok(bytes::Bytes::copy_from_slice(piece)), &buf, &rem, &state).await;
        }
        let evs = parse_sse_test(Ok(bytes::Bytes::from_static(b"")), &buf, &rem, &state).await;

        let texts: Vec<_> = evs
            .into_iter()
            .filter_map(|e| match e {
                Ok(StreamEvent::TextDelta(t)) => Some(t),
                _ => None,
            })
            .collect();
        let all_text: String = texts.into_iter().collect();
        // Final newline-terminated frame must produce 你好 verbatim.
        assert!(
            !all_text.contains('\u{FFFD}'),
            "expected no replacement char, got {all_text:?}"
        );
    }

    #[tokio::test]
    async fn parse_sse_chunk_advances_past_invalid_utf8_byte() {
        // A stray 0xFF (or any byte that's *invalid as a UTF-8 start*,
        // not just incomplete) MUST be skipped, not pinned in
        // `utf8_remainder`. Without this, every subsequent chunk
        // stitches onto the bad prefix, fails at the same position,
        // and remainder grows unboundedly while no events ever fire —
        // the stream stalls forever on a single bad byte.
        let buf = Arc::new(tokio::sync::Mutex::new(String::new()));
        let rem = Arc::new(tokio::sync::Mutex::new(Vec::<u8>::new()));
        let state = new_state();

        // Chunk 1: just an invalid byte. error_len() = Some(1).
        let evs = parse_sse_test(Ok(bytes::Bytes::from_static(b"\xff")), &buf, &rem, &state).await;
        assert!(
            evs.iter().all(|e| e.is_ok()),
            "stray 0xFF must not surface as Err — got {evs:?}"
        );
        {
            let r = rem.lock().await;
            assert!(
                !r.contains(&0xff),
                "0xFF must be advanced past, not pinned in remainder; got {:?}",
                *r
            );
        }

        // Chunk 2: a complete SSE event. The stream must recover and
        // emit it normally — no contamination from the prior bad byte.
        let evs = parse_sse_test(
            Ok(bytes::Bytes::from_static(
                b"data: {\"type\":\"block_start\",\"index\":0,\"block\":{\"type\":\"text\"}}\ndata: {\"type\":\"block_delta\",\"index\":0,\"delta\":\"hi\"}\ndata: {\"type\":\"block_stop\",\"index\":0}\n",
            )),
            &buf,
            &rem,
            &state,
        )
        .await;
        let texts: Vec<_> = evs
            .into_iter()
            .filter_map(|e| match e {
                Ok(StreamEvent::TextDelta(t)) => Some(t),
                _ => None,
            })
            .collect();
        assert_eq!(
            texts,
            vec!["hi".to_string()],
            "stream must recover and emit subsequent events after a bad byte"
        );
    }

    #[tokio::test]
    async fn parse_sse_chunk_invalid_byte_does_not_unbounded_grow_remainder() {
        // Regression: feeding the same invalid byte over and over MUST
        // NOT grow `utf8_remainder` linearly. Pre-fix, every call
        // appended the bad byte and re-saved the entire stitched buffer;
        // 1000 chunks → 1000-byte remainder → eventual OOM in long
        // streams. Post-fix the remainder stays empty after each call.
        let buf = Arc::new(tokio::sync::Mutex::new(String::new()));
        let rem = Arc::new(tokio::sync::Mutex::new(Vec::<u8>::new()));
        let state = new_state();
        for _ in 0..50 {
            let _ =
                parse_sse_test(Ok(bytes::Bytes::from_static(b"\xff")), &buf, &rem, &state).await;
        }
        let r = rem.lock().await;
        assert!(
            r.len() <= 3,
            "remainder must not accumulate invalid bytes (cap 3 for trailing incomplete UTF-8); got {} bytes",
            r.len()
        );
    }

    #[tokio::test]
    async fn parse_sse_chunk_skips_empty_data_payload() {
        // Empty `data:` lines (a heartbeat shape some proxies emit when
        // translating `:keepalive` comments) MUST NOT surface as Err in
        // the stream — the runtime propagates Err with `?` and would
        // kill an otherwise-healthy turn. Mix an empty line with a real
        // event and assert: only the real text delta fires, no Err.
        let buf = Arc::new(tokio::sync::Mutex::new(String::new()));
        let rem = Arc::new(tokio::sync::Mutex::new(Vec::<u8>::new()));
        let state = new_state();
        let line = b"data:\ndata: {\"type\":\"block_start\",\"index\":0,\"block\":{\"type\":\"text\"}}\ndata: {\"type\":\"block_delta\",\"index\":0,\"delta\":\"hi\"}\ndata: {\"type\":\"block_stop\",\"index\":0}\n";
        let evs = parse_sse_test(Ok(bytes::Bytes::copy_from_slice(line)), &buf, &rem, &state).await;
        let mut texts: Vec<String> = Vec::new();
        for e in evs {
            match e {
                Ok(StreamEvent::TextDelta(t)) => texts.push(t),
                Err(err) => panic!("empty data: must not surface as Err — got {err}"),
                _ => {}
            }
        }
        assert_eq!(texts, vec!["hi".to_string()]);
    }

    #[tokio::test]
    async fn parse_sse_chunk_skips_data_with_only_spaces() {
        // `data:    \n` (whitespace-only after the colon) trims to "" via
        // `trim_start_matches(' ')` and lands in the same empty-skip path.
        // Verify the same: no Err, no spurious event.
        let buf = Arc::new(tokio::sync::Mutex::new(String::new()));
        let rem = Arc::new(tokio::sync::Mutex::new(Vec::<u8>::new()));
        let state = new_state();
        let line = b"data:    \n";
        let evs = parse_sse_test(Ok(bytes::Bytes::copy_from_slice(line)), &buf, &rem, &state).await;
        for e in evs {
            if let Err(err) = e {
                panic!("whitespace-only data: must not surface as Err — got {err}");
            }
        }
    }

    #[tokio::test]
    async fn parse_sse_chunk_accepts_data_without_leading_space() {
        // SSE field syntax lets the space after the colon be omitted, so
        // rsclaw-server (or an intermediary on the path) may emit
        // `data:{...}` with no gap. The three v1 frames below all use the
        // compact form and must still decode to the one text delta.
        let pending_text = Arc::new(tokio::sync::Mutex::new(String::new()));
        let pending_bytes = Arc::new(tokio::sync::Mutex::new(Vec::<u8>::new()));
        let blocks = new_state();
        let payload = b"data:{\"type\":\"block_start\",\"index\":0,\"block\":{\"type\":\"text\"}}\ndata:{\"type\":\"block_delta\",\"index\":0,\"delta\":\"hi\"}\ndata:{\"type\":\"block_stop\",\"index\":0}\n";
        let evs = parse_sse_test(
            Ok(bytes::Bytes::copy_from_slice(payload)),
            &pending_text,
            &pending_bytes,
            &blocks,
        )
        .await;
        // Keep only the text deltas; every other item is ignored.
        let mut texts = Vec::new();
        for ev in evs {
            if let Ok(StreamEvent::TextDelta(fragment)) = ev {
                texts.push(fragment);
            }
        }
        assert_eq!(texts, vec!["hi".to_string()]);
    }

    #[tokio::test]
    async fn parse_sse_chunk_native_delta_emits_text() {
        // Baseline case: a text block carrying a single delta must come
        // out as one TextDelta holding the same payload.
        let pending_text = Arc::new(tokio::sync::Mutex::new(String::new()));
        let pending_bytes = Arc::new(tokio::sync::Mutex::new(Vec::<u8>::new()));
        let blocks = new_state();
        let payload = b"data: {\"type\":\"block_start\",\"index\":0,\"block\":{\"type\":\"text\"}}\ndata: {\"type\":\"block_delta\",\"index\":0,\"delta\":\"hello\"}\ndata: {\"type\":\"block_stop\",\"index\":0}\n";
        let evs = parse_sse_test(
            Ok(bytes::Bytes::copy_from_slice(payload)),
            &pending_text,
            &pending_bytes,
            &blocks,
        )
        .await;
        let mut texts: Vec<String> = Vec::new();
        for ev in evs {
            if let Ok(StreamEvent::TextDelta(fragment)) = ev {
                texts.push(fragment);
            }
        }
        assert_eq!(texts, vec!["hello".to_string()]);
    }

    #[tokio::test]
    async fn parse_sse_chunk_native_thinking_emits_reasoning() {
        // A block opened with `block.type = "thinking"` routes its deltas
        // to ReasoningDelta rather than TextDelta, so the agent runtime
        // can keep reasoning apart from user-visible text.
        let pending_text = Arc::new(tokio::sync::Mutex::new(String::new()));
        let pending_bytes = Arc::new(tokio::sync::Mutex::new(Vec::<u8>::new()));
        let blocks = new_state();
        let payload = b"data: {\"type\":\"block_start\",\"index\":0,\"block\":{\"type\":\"thinking\"}}\ndata: {\"type\":\"block_delta\",\"index\":0,\"delta\":\"step 1: parse\"}\ndata: {\"type\":\"block_stop\",\"index\":0}\n";
        let evs = parse_sse_test(
            Ok(bytes::Bytes::copy_from_slice(payload)),
            &pending_text,
            &pending_bytes,
            &blocks,
        )
        .await;
        let mut reasonings: Vec<String> = Vec::new();
        for ev in evs {
            if let Ok(StreamEvent::ReasoningDelta(step)) = ev {
                reasonings.push(step);
            }
        }
        assert_eq!(reasonings, vec!["step 1: parse".to_string()]);
    }

    #[tokio::test]
    async fn parse_sse_chunk_native_tool_call_emits_whole_frame() {
        // v1 tool_call block: `block_start` supplies id and name, the
        // `block_delta` frame(s) carry the JSON input, and `block_stop`
        // produces a ToolCall event holding the parsed input.
        let pending_text = Arc::new(tokio::sync::Mutex::new(String::new()));
        let pending_bytes = Arc::new(tokio::sync::Mutex::new(Vec::<u8>::new()));
        let blocks = new_state();
        let payload = br#"data: {"type":"block_start","index":0,"block":{"type":"tool_call","id":"call_42","name":"read_file"}}
data: {"type":"block_delta","index":0,"delta":"{\"path\":\"x.rs\"}"}
data: {"type":"block_stop","index":0}
"#;
        let evs = parse_sse_test(
            Ok(bytes::Bytes::copy_from_slice(payload)),
            &pending_text,
            &pending_bytes,
            &blocks,
        )
        .await;
        // Take the first ToolCall in the output.
        let mut first_call = None;
        for ev in evs {
            if let Ok(StreamEvent::ToolCall { id, name, input }) = ev {
                first_call = Some((id, name, input));
                break;
            }
        }
        let (id, name, input) = first_call.expect("expected one ToolCall event");
        assert_eq!(id, "call_42");
        assert_eq!(name, "read_file");
        assert_eq!(input.get("path").and_then(Value::as_str), Some("x.rs"));
    }

    #[tokio::test]
    async fn parse_sse_chunk_native_tool_call_missing_input_defaults_empty_object() {
        // When a tool_call block reaches `block_stop` with no
        // `block_delta` in between (the worker had no input args), the
        // emitted ToolCall must carry `{}` as input. Downstream consumers
        // call `.as_object()` directly without handling Null.
        let pending_text = Arc::new(tokio::sync::Mutex::new(String::new()));
        let pending_bytes = Arc::new(tokio::sync::Mutex::new(Vec::<u8>::new()));
        let blocks = new_state();
        let payload = b"data: {\"type\":\"block_start\",\"index\":0,\"block\":{\"type\":\"tool_call\",\"id\":\"c\",\"name\":\"get_time\"}}\ndata: {\"type\":\"block_stop\",\"index\":0}\n";
        let evs = parse_sse_test(
            Ok(bytes::Bytes::copy_from_slice(payload)),
            &pending_text,
            &pending_bytes,
            &blocks,
        )
        .await;
        let mut first_input = None;
        for ev in evs {
            if let Ok(StreamEvent::ToolCall { input, .. }) = ev {
                first_input = Some(input);
                break;
            }
        }
        let input = first_input.expect("expected one ToolCall event");
        let is_empty_object = matches!(input.as_object(), Some(fields) if fields.is_empty());
        assert!(
            is_empty_object,
            "missing input must default to empty object, got {input:?}"
        );
    }

    #[tokio::test]
    async fn parse_sse_chunk_native_done_emits_done_with_usage() {
        // A `done` frame with `input_tokens` / `output_tokens` must
        // produce a Done event whose usage carries both counts.
        let pending_text = Arc::new(tokio::sync::Mutex::new(String::new()));
        let pending_bytes = Arc::new(tokio::sync::Mutex::new(Vec::<u8>::new()));
        let blocks = new_state();
        let payload = b"data: {\"type\":\"done\",\"finish_reason\":\"end_turn\",\"usage\":{\"input_tokens\":11,\"output_tokens\":22}}\n";
        let evs = parse_sse_test(
            Ok(bytes::Bytes::copy_from_slice(payload)),
            &pending_text,
            &pending_bytes,
            &blocks,
        )
        .await;
        // Gather the usage of every Done event, then check each one.
        let done_usages: Vec<_> = evs
            .into_iter()
            .filter_map(|ev| match ev {
                Ok(StreamEvent::Done { usage }) => Some(usage),
                _ => None,
            })
            .collect();
        assert!(
            !done_usages.is_empty(),
            "expected Done from native done frame"
        );
        for usage in done_usages {
            let u = usage.expect("usage should be populated");
            assert_eq!(u.input, 11);
            assert_eq!(u.output, 22);
        }
    }

    #[tokio::test]
    async fn parse_sse_chunk_native_done_without_usage() {
        // The server may leave `usage` out on early termination; a Done
        // event must still be emitted, with usage = None.
        let pending_text = Arc::new(tokio::sync::Mutex::new(String::new()));
        let pending_bytes = Arc::new(tokio::sync::Mutex::new(Vec::<u8>::new()));
        let blocks = new_state();
        let payload = b"data: {\"type\":\"done\",\"finish_reason\":\"end_turn\"}\n";
        let evs = parse_sse_test(
            Ok(bytes::Bytes::copy_from_slice(payload)),
            &pending_text,
            &pending_bytes,
            &blocks,
        )
        .await;
        let mut done_count = 0usize;
        for ev in evs {
            let Ok(StreamEvent::Done { usage }) = ev else {
                continue;
            };
            assert!(usage.is_none());
            done_count += 1;
        }
        assert!(done_count > 0, "expected Done event even without usage");
    }

    #[tokio::test]
    async fn parse_sse_chunk_native_done_usage_field_name_fallback() {
        // §4.3: usage field names vary by lane. The parser is expected to
        // try, in order:
        //   input_tokens  || prompt_tokens     || input
        //   output_tokens || completion_tokens || output
        // and to default a missing side to 0 instead of discarding the
        // whole usage object. This case exercises the OAI-style names.
        let pending_text = Arc::new(tokio::sync::Mutex::new(String::new()));
        let pending_bytes = Arc::new(tokio::sync::Mutex::new(Vec::<u8>::new()));
        let blocks = new_state();
        let payload =
            b"data: {\"type\":\"done\",\"usage\":{\"prompt_tokens\":7,\"completion_tokens\":13}}\n";
        let evs = parse_sse_test(
            Ok(bytes::Bytes::copy_from_slice(payload)),
            &pending_text,
            &pending_bytes,
            &blocks,
        )
        .await;
        // First Done that actually carries a usage object.
        let mut found = None;
        for ev in evs {
            if let Ok(StreamEvent::Done { usage: Some(u) }) = ev {
                found = Some(u);
                break;
            }
        }
        let u = found.expect("expected Done with usage");
        assert_eq!(u.input, 7);
        assert_eq!(u.output, 13);
    }

    #[tokio::test]
    async fn parse_sse_chunk_native_done_usage_includes_recall_accounting() {
        // The `recall_*` members of `done.usage` (token count, doc ids,
        // hash, truncated flag) must be carried onto the parsed usage.
        let pending_text = Arc::new(tokio::sync::Mutex::new(String::new()));
        let pending_bytes = Arc::new(tokio::sync::Mutex::new(Vec::<u8>::new()));
        let blocks = new_state();
        let payload = b"data: {\"type\":\"done\",\"usage\":{\"input_tokens\":17,\"output_tokens\":5,\"recall_tokens\":3,\"recall_doc_ids\":[\"doc-1\",\"doc-2\"],\"recall_hash\":\"sha256:abc\",\"recall_truncated\":true}}\n";
        let evs = parse_sse_test(
            Ok(bytes::Bytes::copy_from_slice(payload)),
            &pending_text,
            &pending_bytes,
            &blocks,
        )
        .await;
        // First Done that actually carries a usage object.
        let mut found = None;
        for ev in evs {
            if let Ok(StreamEvent::Done { usage: Some(u) }) = ev {
                found = Some(u);
                break;
            }
        }
        let u = found.expect("expected Done with usage");
        assert_eq!(u.input, 17);
        assert_eq!(u.recall_tokens, 3);
        assert_eq!(u.recall_doc_ids, vec!["doc-1", "doc-2"]);
        assert_eq!(u.recall_hash.as_deref(), Some("sha256:abc"));
        assert!(u.recall_truncated);
    }

    #[test]
    fn serialize_history_message_preserves_rsclaw_hidden_recall() {
        // A user message carrying an `rsclaw_hidden` recall payload must
        // serialize with role, content, and the hidden recall fields intact.
        let question = "我的手机号是什么?";
        let recall_context = "- 用户手机号: 13900001234";
        let hidden = crate::RsclawHidden {
            recall_context: recall_context.into(),
            recall_format: "xml".into(),
            recall_mode: "committed".into(),
            recall_doc_ids: vec!["mem-1".into()],
            recall_hash: "sha256:abc".into(),
            recall_truncated: false,
            recall_input_tokens: Some(12),
            recall_trace_id: Some("recall_1".into()),
        };
        let msg = Message {
            role: Role::User,
            content: MessageContent::Text(question.into()),
            rsclaw_hidden: Some(hidden),
        };

        let out = serialize_history_message(&msg);

        assert_eq!(out["role"], "user");
        assert_eq!(out["content"], question);

        let serialized_hidden = &out["rsclaw_hidden"];
        assert_eq!(serialized_hidden["recall_context"], recall_context);
        assert_eq!(serialized_hidden["recall_doc_ids"][0], "mem-1");
        assert_eq!(serialized_hidden["recall_trace_id"], "recall_1");
    }

    #[tokio::test]
    async fn parse_sse_chunk_native_done_partial_usage_keeps_present_side() {
        // Regression guard: a `?` short-circuit used to discard the whole
        // TokenUsage when one field was absent. Each side now defaults to
        // 0, so the half that did arrive is kept.
        let pending_text = Arc::new(tokio::sync::Mutex::new(String::new()));
        let pending_bytes = Arc::new(tokio::sync::Mutex::new(Vec::<u8>::new()));
        let blocks = new_state();
        let payload = b"data: {\"type\":\"done\",\"usage\":{\"input_tokens\":17}}\n";
        let evs = parse_sse_test(
            Ok(bytes::Bytes::copy_from_slice(payload)),
            &pending_text,
            &pending_bytes,
            &blocks,
        )
        .await;
        // Only the first Done is inspected, whatever its usage.
        let mut first_done = None;
        for ev in evs {
            if let Ok(StreamEvent::Done { usage }) = ev {
                first_done = Some(usage);
                break;
            }
        }
        let usage = first_done
            .expect("expected one Done event")
            .expect("usage should survive partial fields");
        assert_eq!(usage.input, 17);
        assert_eq!(usage.output, 0);
    }

    #[tokio::test]
    async fn parse_sse_chunk_native_done_null_usage_is_none() {
        // `"usage": null` has to become None, not a zeroed TokenUsage: a
        // phantom zero-token turn would dilute accounting averages and
        // hide worker builds that drop the field.
        let pending_text = Arc::new(tokio::sync::Mutex::new(String::new()));
        let pending_bytes = Arc::new(tokio::sync::Mutex::new(Vec::<u8>::new()));
        let blocks = new_state();
        let payload = b"data: {\"type\":\"done\",\"usage\":null}\n";
        let evs = parse_sse_test(
            Ok(bytes::Bytes::copy_from_slice(payload)),
            &pending_text,
            &pending_bytes,
            &blocks,
        )
        .await;
        let mut done_count = 0usize;
        for ev in evs {
            let Ok(StreamEvent::Done { usage }) = ev else {
                continue;
            };
            assert!(
                usage.is_none(),
                "null usage must collapse to None, got {usage:?}"
            );
            done_count += 1;
        }
        assert!(done_count > 0, "expected Done event with usage=None");
    }

    #[tokio::test]
    async fn parse_sse_chunk_native_done_non_object_usage_is_none() {
        // A malformed `"usage": [1,2,3]` (array, not object) must NOT
        // turn into a zeroed TokenUsage; Done still fires with None.
        let pending_text = Arc::new(tokio::sync::Mutex::new(String::new()));
        let pending_bytes = Arc::new(tokio::sync::Mutex::new(Vec::<u8>::new()));
        let blocks = new_state();
        let payload = b"data: {\"type\":\"done\",\"usage\":[1,2,3]}\n";
        let evs = parse_sse_test(
            Ok(bytes::Bytes::copy_from_slice(payload)),
            &pending_text,
            &pending_bytes,
            &blocks,
        )
        .await;
        let mut done_count = 0usize;
        for ev in evs {
            let Ok(StreamEvent::Done { usage }) = ev else {
                continue;
            };
            assert!(usage.is_none(), "non-object usage must collapse to None");
            done_count += 1;
        }
        assert!(done_count > 0, "expected Done event");
    }

    #[tokio::test]
    async fn parse_sse_chunk_native_error_preserves_code_and_message() {
        // §4.2: error frame is `{type:"error", error:{code, message}}`.
        // Both fields must survive into StreamEvent::Error.
        let buf = Arc::new(tokio::sync::Mutex::new(String::new()));
        let rem = Arc::new(tokio::sync::Mutex::new(Vec::<u8>::new()));
        let state = new_state();
        let line = br#"data: {"type":"error","error":{"code":"slot_evicted","message":"slot was reclaimed mid-decode"}}
"#;
        let evs = parse_sse_test(Ok(bytes::Bytes::copy_from_slice(line)), &buf, &rem, &state).await;
        let msg = evs
            .into_iter()
            .find_map(|e| match e {
                Ok(StreamEvent::Error(m)) => Some(m),
                _ => None,
            })
            .expect("expected one Error event");
        assert!(msg.contains("slot_evicted"), "missing code: {msg}");
        assert!(
            msg.contains("slot was reclaimed mid-decode"),
            "missing message: {msg}"
        );
    }

    #[tokio::test]
    async fn parse_sse_chunk_native_error_code_missing_keeps_message() {
        let buf = Arc::new(tokio::sync::Mutex::new(String::new()));
        let rem = Arc::new(tokio::sync::Mutex::new(Vec::<u8>::new()));
        let state = new_state();
        let line = br#"data: {"type":"error","error":{"message":"upstream hung up"}}
"#;
        let evs = parse_sse_test(Ok(bytes::Bytes::copy_from_slice(line)), &buf, &rem, &state).await;
        let msg = evs
            .into_iter()
            .find_map(|e| match e {
                Ok(StreamEvent::Error(m)) => Some(m),
                _ => None,
            })
            .expect("expected one Error event");
        assert!(msg.contains("upstream hung up"), "missing message: {msg}");
        assert!(!msg.contains("[]"), "empty-code marker leaked: {msg}");
    }

    #[tokio::test]
    async fn parse_sse_chunk_native_error_message_missing_keeps_code() {
        let buf = Arc::new(tokio::sync::Mutex::new(String::new()));
        let rem = Arc::new(tokio::sync::Mutex::new(Vec::<u8>::new()));
        let state = new_state();
        let line = br#"data: {"type":"error","error":{"code":"version_drift"}}
"#;
        let evs = parse_sse_test(Ok(bytes::Bytes::copy_from_slice(line)), &buf, &rem, &state).await;
        let msg = evs
            .into_iter()
            .find_map(|e| match e {
                Ok(StreamEvent::Error(m)) => Some(m),
                _ => None,
            })
            .expect("expected one Error event");
        assert!(msg.contains("version_drift"), "missing code: {msg}");
        assert!(!msg.ends_with(": "), "trailing empty-message leaked: {msg}");
    }

    #[tokio::test]
    async fn parse_sse_chunk_native_error_uses_default_when_both_missing() {
        let buf = Arc::new(tokio::sync::Mutex::new(String::new()));
        let rem = Arc::new(tokio::sync::Mutex::new(Vec::<u8>::new()));
        let state = new_state();
        let line = b"data: {\"type\":\"error\",\"error\":{}}\n";
        let evs = parse_sse_test(Ok(bytes::Bytes::copy_from_slice(line)), &buf, &rem, &state).await;
        let msg = evs
            .into_iter()
            .find_map(|e| match e {
                Ok(StreamEvent::Error(m)) => Some(m),
                _ => None,
            })
            .expect("expected one Error event");
        assert_eq!(msg, "rsclaw stream error");
    }

    #[tokio::test]
    async fn parse_sse_chunk_unknown_type_ignored_for_forward_compat() {
        // §4.2 forward-compat rule: frame types the client does not know
        // (e.g. a future `cache_hit_summary`) are ignored silently, and
        // the known frames that follow still decode.
        let pending_text = Arc::new(tokio::sync::Mutex::new(String::new()));
        let pending_bytes = Arc::new(tokio::sync::Mutex::new(Vec::<u8>::new()));
        let blocks = new_state();
        let payload = br#"data: {"type":"cache_hit_summary","hits":42}
data: {"type":"block_start","index":0,"block":{"type":"text"}}
data: {"type":"block_delta","index":0,"delta":"hi"}
data: {"type":"block_stop","index":0}
"#;
        let evs = parse_sse_test(
            Ok(bytes::Bytes::copy_from_slice(payload)),
            &pending_text,
            &pending_bytes,
            &blocks,
        )
        .await;
        let mut texts: Vec<String> = Vec::new();
        for ev in evs {
            if let Ok(StreamEvent::TextDelta(fragment)) = ev {
                texts.push(fragment);
            }
        }
        assert_eq!(texts, vec!["hi".to_string()]);
    }

    // ----- v1 protocol: block-oriented streaming -----

    #[tokio::test]
    async fn parse_v1_text_emits_incremental_text_deltas() {
        // Each block_delta inside a text block yields its own TextDelta,
        // in arrival order (the UI streams as text arrives; nothing waits
        // for block_stop). Only TextDelta and Done are allowed in the
        // output of this full start→[DONE] exchange.
        let pending_text = Arc::new(tokio::sync::Mutex::new(String::new()));
        let pending_bytes = Arc::new(tokio::sync::Mutex::new(Vec::<u8>::new()));
        let blocks = new_state();
        let payload = br#"data: {"type":"start"}
data: {"type":"block_start","index":0,"block":{"type":"text"}}
data: {"type":"block_delta","index":0,"delta":"Hel"}
data: {"type":"block_delta","index":0,"delta":"lo"}
data: {"type":"block_stop","index":0}
data: {"type":"done","finish_reason":"stop","usage":{"input_tokens":1,"output_tokens":2}}
data: [DONE]
"#;
        let evs = parse_sse_test(
            Ok(bytes::Bytes::copy_from_slice(payload)),
            &pending_text,
            &pending_bytes,
            &blocks,
        )
        .await;
        let mut texts = Vec::new();
        let mut done_count = 0usize;
        // Unwrapping each item makes any Err fail the test.
        for ev in evs.into_iter().map(Result::unwrap) {
            match ev {
                StreamEvent::Done { usage } => {
                    let u = usage.expect("usage present");
                    assert_eq!(u.input, 1);
                    assert_eq!(u.output, 2);
                    done_count += 1;
                }
                StreamEvent::TextDelta(fragment) => texts.push(fragment),
                other => panic!("unexpected event {other:?}"),
            }
        }
        assert_eq!(texts, vec!["Hel".to_string(), "lo".to_string()]);
        assert!(done_count > 0, "Done event missing");
    }

    #[tokio::test]
    async fn parse_v1_tool_call_streams_args_then_emits_one_toolcall() {
        // The tool input is split into partial-JSON shards over several
        // block_delta frames. The parser buffers them and emits a single
        // ToolCall with the parsed input once block_stop arrives.
        let pending_text = Arc::new(tokio::sync::Mutex::new(String::new()));
        let pending_bytes = Arc::new(tokio::sync::Mutex::new(Vec::<u8>::new()));
        let blocks = new_state();
        let payload = br#"data: {"type":"block_start","index":0,"block":{"type":"tool_call","id":"c1","name":"write_file"}}
data: {"type":"block_delta","index":0,"delta":"{\"path\":\"a.txt\","}
data: {"type":"block_delta","index":0,"delta":"\"content\":\"hello\"}"}
data: {"type":"block_stop","index":0}
data: {"type":"done"}
data: [DONE]
"#;
        let evs = parse_sse_test(
            Ok(bytes::Bytes::copy_from_slice(payload)),
            &pending_text,
            &pending_bytes,
            &blocks,
        )
        .await;
        // The shards must never surface as intermediate TextDelta events.
        let leaked_text = evs
            .iter()
            .any(|ev| matches!(ev, Ok(StreamEvent::TextDelta(_))));
        assert!(!leaked_text, "tool_call deltas must NOT emit TextDelta");
        // Exactly one ToolCall overall.
        let mut tool_calls: Vec<_> = evs
            .into_iter()
            .filter_map(|ev| match ev {
                Ok(StreamEvent::ToolCall { id, name, input }) => Some((id, name, input)),
                _ => None,
            })
            .collect();
        assert_eq!(tool_calls.len(), 1, "expected exactly one ToolCall");
        let (id, name, input) = tool_calls.pop().unwrap();
        assert_eq!(id, "c1");
        assert_eq!(name, "write_file");
        assert_eq!(input.get("path").and_then(Value::as_str), Some("a.txt"));
        assert_eq!(input.get("content").and_then(Value::as_str), Some("hello"));
    }

    #[tokio::test]
    async fn parse_v1_parallel_blocks_by_index() {
        // A text block (index=0) and a tool_call block (index=1) are open
        // at the same time and their deltas interleave. Every delta must
        // reach the builder for its own index, with no cross-talk.
        let pending_text = Arc::new(tokio::sync::Mutex::new(String::new()));
        let pending_bytes = Arc::new(tokio::sync::Mutex::new(Vec::<u8>::new()));
        let blocks = new_state();
        let payload = br#"data: {"type":"block_start","index":0,"block":{"type":"text"}}
data: {"type":"block_start","index":1,"block":{"type":"tool_call","id":"t1","name":"shell"}}
data: {"type":"block_delta","index":0,"delta":"Running... "}
data: {"type":"block_delta","index":1,"delta":"{\"cmd\":\""}
data: {"type":"block_delta","index":0,"delta":"please wait"}
data: {"type":"block_delta","index":1,"delta":"ls\"}"}
data: {"type":"block_stop","index":1}
data: {"type":"block_stop","index":0}
"#;
        let evs = parse_sse_test(
            Ok(bytes::Bytes::copy_from_slice(payload)),
            &pending_text,
            &pending_bytes,
            &blocks,
        )
        .await;
        let mut texts = Vec::new();
        let mut tool_calls = Vec::new();
        // Unwrapping each item makes any Err fail the test.
        for ev in evs.into_iter().map(Result::unwrap) {
            match ev {
                StreamEvent::ToolCall { id, name, input } => tool_calls.push((id, name, input)),
                StreamEvent::TextDelta(fragment) => texts.push(fragment),
                _ => {}
            }
        }
        assert_eq!(
            texts,
            vec!["Running... ".to_string(), "please wait".to_string()]
        );
        assert_eq!(tool_calls.len(), 1);
        let (id, name, input) = tool_calls.pop().unwrap();
        assert_eq!(id, "t1");
        assert_eq!(name, "shell");
        assert_eq!(input.get("cmd").and_then(Value::as_str), Some("ls"));
    }

    #[tokio::test]
    async fn parse_v1_start_and_ping_emit_nothing() {
        // `start` only carries telemetry metadata and `ping` is a
        // keepalive, so neither may produce a StreamEvent or an Err.
        let pending_text = Arc::new(tokio::sync::Mutex::new(String::new()));
        let pending_bytes = Arc::new(tokio::sync::Mutex::new(Vec::<u8>::new()));
        let blocks = new_state();
        let payload = br#"data: {"type":"start","model":"foo","session_id":"s1"}
data: {"type":"ping"}
data: {"type":"ping"}
"#;
        let evs = parse_sse_test(
            Ok(bytes::Bytes::copy_from_slice(payload)),
            &pending_text,
            &pending_bytes,
            &blocks,
        )
        .await;
        // The output must be empty; the first item, if any, is reported.
        match evs.into_iter().next() {
            None => {}
            Some(Ok(ev)) => panic!("start/ping must not emit events; got {ev:?}"),
            Some(Err(err)) => panic!("start/ping must not surface as Err; got {err}"),
        }
    }

    #[tokio::test]
    async fn parse_v1_done_sentinel_is_silently_consumed() {
        // The `data: [DONE]` line that trails the `done` frame has no
        // JSON payload; the parser must swallow it rather than report a
        // parse error, leaving exactly one Done in the output.
        let pending_text = Arc::new(tokio::sync::Mutex::new(String::new()));
        let pending_bytes = Arc::new(tokio::sync::Mutex::new(Vec::<u8>::new()));
        let blocks = new_state();
        let payload = br#"data: {"type":"done"}
data: [DONE]
"#;
        let evs = parse_sse_test(
            Ok(bytes::Bytes::copy_from_slice(payload)),
            &pending_text,
            &pending_bytes,
            &blocks,
        )
        .await;
        // Single pass: count Done events and reject any Err.
        let mut done_count = 0usize;
        for ev in evs {
            match ev {
                Ok(StreamEvent::Done { .. }) => done_count += 1,
                Ok(_) => {}
                Err(err) => panic!("[DONE] sentinel must not surface as Err; got {err}"),
            }
        }
        assert_eq!(done_count, 1, "expected exactly one Done event");
    }

    #[tokio::test]
    async fn parse_v1_block_delta_for_unopened_index_dropped_silently() {
        // A block_delta / block_stop for an index that was never opened
        // by block_start (e.g. dropped frame, server bug) must produce no
        // event, no Err, and no panic. Forward-compat behavior.
        let pending_text = Arc::new(tokio::sync::Mutex::new(String::new()));
        let pending_bytes = Arc::new(tokio::sync::Mutex::new(Vec::<u8>::new()));
        let blocks = new_state();
        let payload = br#"data: {"type":"block_delta","index":99,"delta":"orphan"}
data: {"type":"block_stop","index":99}
"#;
        let evs = parse_sse_test(
            Ok(bytes::Bytes::copy_from_slice(payload)),
            &pending_text,
            &pending_bytes,
            &blocks,
        )
        .await;
        // The output must be empty; the first item, if any, is reported.
        match evs.into_iter().next() {
            None => {}
            Some(Ok(ev)) => panic!("orphan delta must not emit; got {ev:?}"),
            Some(Err(err)) => panic!("orphan delta must not surface as Err; got {err}"),
        }
    }

    #[tokio::test]
    async fn parse_v1_done_extracts_cache_stats() {
        // v1 `done.usage` includes a cache breakdown:
        //   cache_creation_input_tokens — tokens written to cache this turn
        //   cache_read_input_tokens     — tokens served from cache (savings)
        // Absent counters default to 0, and OAI's `cached_tokens` is
        // accepted as an alias for cache_read (one-counter providers).
        // This case supplies both explicit counters.
        let pending_text = Arc::new(tokio::sync::Mutex::new(String::new()));
        let pending_bytes = Arc::new(tokio::sync::Mutex::new(Vec::<u8>::new()));
        let blocks = new_state();
        let payload = b"data: {\"type\":\"done\",\"usage\":{\"input_tokens\":120,\"output_tokens\":40,\"cache_creation_input_tokens\":50,\"cache_read_input_tokens\":70}}\n";
        let evs = parse_sse_test(
            Ok(bytes::Bytes::copy_from_slice(payload)),
            &pending_text,
            &pending_bytes,
            &blocks,
        )
        .await;
        // First Done that actually carries a usage object.
        let mut found = None;
        for ev in evs {
            if let Ok(StreamEvent::Done { usage: Some(u) }) = ev {
                found = Some(u);
                break;
            }
        }
        let usage = found.expect("Done with usage");
        assert_eq!(
            (
                usage.input,
                usage.output,
                usage.cache_creation,
                usage.cache_read
            ),
            (120, 40, 50, 70)
        );
    }

    #[tokio::test]
    async fn parse_v1_done_cached_tokens_alias_maps_to_cache_read() {
        // OAI compat: the single `cached_tokens` counter fills cache_read
        // (cache_creation stays 0) so dashboards read one uniform field.
        let pending_text = Arc::new(tokio::sync::Mutex::new(String::new()));
        let pending_bytes = Arc::new(tokio::sync::Mutex::new(Vec::<u8>::new()));
        let blocks = new_state();
        let payload = b"data: {\"type\":\"done\",\"usage\":{\"input_tokens\":80,\"output_tokens\":20,\"cached_tokens\":60}}\n";
        let evs = parse_sse_test(
            Ok(bytes::Bytes::copy_from_slice(payload)),
            &pending_text,
            &pending_bytes,
            &blocks,
        )
        .await;
        // First Done that actually carries a usage object.
        let mut found = None;
        for ev in evs {
            if let Ok(StreamEvent::Done { usage: Some(u) }) = ev {
                found = Some(u);
                break;
            }
        }
        let usage = found.expect("Done with usage");
        assert_eq!((usage.cache_creation, usage.cache_read), (0, 60));
    }

    #[tokio::test]
    async fn parse_v1_done_without_cache_fields_defaults_to_zero() {
        // An old-shape usage object (no cache fields) must still parse;
        // both cache counters then read 0, same as "no cache activity".
        let pending_text = Arc::new(tokio::sync::Mutex::new(String::new()));
        let pending_bytes = Arc::new(tokio::sync::Mutex::new(Vec::<u8>::new()));
        let blocks = new_state();
        let payload =
            b"data: {\"type\":\"done\",\"usage\":{\"input_tokens\":10,\"output_tokens\":5}}\n";
        let evs = parse_sse_test(
            Ok(bytes::Bytes::copy_from_slice(payload)),
            &pending_text,
            &pending_bytes,
            &blocks,
        )
        .await;
        // First Done that actually carries a usage object.
        let mut found = None;
        for ev in evs {
            if let Ok(StreamEvent::Done { usage: Some(u) }) = ev {
                found = Some(u);
                break;
            }
        }
        let usage = found.expect("Done with usage");
        assert_eq!(
            (
                usage.input,
                usage.output,
                usage.cache_creation,
                usage.cache_read
            ),
            (10, 5, 0, 0)
        );
    }

    #[tokio::test]
    async fn parse_v1_tool_call_malformed_json_falls_back_to_empty_object() {
        // Unparseable tool_call input must not panic or surface as Err;
        // the parser substitutes `{}` so downstream consumers don't
        // crash. Matches the runtime's "no-args" branch behavior.
        let pending_text = Arc::new(tokio::sync::Mutex::new(String::new()));
        let pending_bytes = Arc::new(tokio::sync::Mutex::new(Vec::<u8>::new()));
        let blocks = new_state();
        let payload = br#"data: {"type":"block_start","index":0,"block":{"type":"tool_call","id":"c","name":"foo"}}
data: {"type":"block_delta","index":0,"delta":"{not valid json"}
data: {"type":"block_stop","index":0}
"#;
        let evs = parse_sse_test(
            Ok(bytes::Bytes::copy_from_slice(payload)),
            &pending_text,
            &pending_bytes,
            &blocks,
        )
        .await;
        let mut first_input = None;
        for ev in evs {
            if let Ok(StreamEvent::ToolCall { input, .. }) = ev {
                first_input = Some(input);
                break;
            }
        }
        let input = first_input.expect("expected one ToolCall event");
        // Must be an object, and that object must have no members.
        let member_count = input.as_object().map(|fields| fields.len());
        assert!(member_count.is_some());
        assert_eq!(member_count, Some(0));
    }

    #[test]
    fn turn_headers_timeout_is_bounded_and_finite() {
        // Guard rail on the constant that streaming turns use to detect a
        // wedged server: too small and a slow TLS handshake trips it
        // spuriously, too large and a dead server stalls the runtime.
        // Accepted window: 30 to 120 seconds inclusive.
        let s = TURN_HEADERS_TIMEOUT.as_secs();
        let in_window = matches!(s, 30..=120);
        assert!(in_window, "TURN_HEADERS_TIMEOUT={s}s out of range");
    }

    #[test]
    fn ctor_trims_whitespace_from_base_url_and_bearer() {
        // Env vars loaded through dotenv often end in a newline, so
        // `RSCLAW_KEY=sk-abc\n` reaches the provider as `Some("sk-abc\n")`.
        // reqwest refuses header values containing `\n` (RFC 7230), which
        // would fail every signed request before it leaves the process
        // unless the constructor trims. base_url has the same hazard:
        // surrounding whitespace breaks URL parsing. (The wire-side guard
        // that drops an empty bearer, so a bare `Authorization: Bearer `
        // is never sent, now lives in `FleetHttp`.)
        //
        // NOTE(review): the module docs say a trailing `/agent` on the
        // base is stripped by the constructor, yet this expects
        // `/v1/agent` to be retained — confirm which one is current.
        let raw_base = "  http://x:8090/v1/agent/  ";
        let raw_key: Option<String> = Some("  sk-abc\n  ".into());
        let provider = RsclawProvider::new(raw_base, raw_key);
        assert_eq!(provider.base_url, "http://x:8090/v1/agent");
        assert_eq!(provider.bearer.as_deref(), Some("sk-abc"));
    }

    #[test]
    fn ctor_blank_or_empty_bearer_becomes_none() {
        // `RSCLAW_KEY=""` (set but blank) arrives as `Some("")` from
        // `std::env::var(...).ok()`, and `RSCLAW_KEY="   "` as
        // `Some("   ")`. Neither may survive: emitting `Authorization:
        // Bearer ` (or `Bearer    `) gets rejected by stricter proxies
        // and hides the real "no auth configured" error. Absent, empty
        // and whitespace-only keys must all end up as `bearer == None`.
        let blank_bearers: [Option<String>; 3] =
            [None, Some(String::new()), Some("   \n\t".into())];
        for candidate in blank_bearers {
            assert!(
                RsclawProvider::new("http://x", candidate)
                    .bearer
                    .is_none()
            );
        }
    }

    #[test]
    fn is_session_evicted_recognizes_session_not_found() {
        // 404 + `session_not_found` in the error envelope counts as an
        // eviction.
        let evicted = is_session_evicted(
            StatusCode::NOT_FOUND,
            r#"{"error":{"code":"session_not_found","detail":"slot evicted"}}"#,
        );
        assert!(evicted);
    }

    #[test]
    fn is_session_evicted_rejects_404_with_other_code() {
        // Not every 404 is an eviction: a misrouted request can produce a
        // CDN 404 page, and /sessions/replay can answer
        // `404 unknown_version`. Earlier code mapped any 404 straight to
        // SessionNotFound, which would loop forever in replay; the unified
        // `is_session_evicted` check requires the body to confirm the
        // eviction code.
        let non_eviction_bodies = [
            r#"{"error":{"code":"unknown_version","detail":"v not registered"}}"#,
            "",
            "<html>not found</html>",
        ];
        for body in non_eviction_bodies {
            assert!(!is_session_evicted(StatusCode::NOT_FOUND, body));
        }
    }

    #[test]
    fn is_session_evicted_recognizes_version_drift() {
        // 409 + `version_drift` counts as an eviction.
        let evicted = is_session_evicted(
            StatusCode::CONFLICT,
            r#"{"error":{"code":"version_drift","detail":"node has been upgraded"}}"#,
        );
        assert!(evicted);
    }

    #[test]
    fn is_session_evicted_recognizes_backend_unavailable() {
        // 503 + `backend_unavailable` counts as an eviction.
        let evicted = is_session_evicted(
            StatusCode::SERVICE_UNAVAILABLE,
            r#"{"error":{"code":"backend_unavailable","detail":"heartbeat timeout"}}"#,
        );
        assert!(evicted);
    }

    #[test]
    fn is_session_evicted_excludes_no_backend_available() {
        // `no_backend_available` signals capacity exhaustion: replaying
        // the session won't help, so it must not be classified as an
        // eviction.
        let evicted = is_session_evicted(
            StatusCode::SERVICE_UNAVAILABLE,
            r#"{"error":{"code":"no_backend_available","detail":"all GPUs saturated"}}"#,
        );
        assert!(!evicted);
    }

    #[test]
    fn is_session_evicted_rejects_status_code_mismatch() {
        // A recognised error code paired with the wrong HTTP status must
        // not trigger recovery.
        let mismatched = [
            (
                StatusCode::SERVICE_UNAVAILABLE,
                r#"{"error":{"code":"version_drift","detail":"x"}}"#,
            ),
            (
                StatusCode::CONFLICT,
                r#"{"error":{"code":"backend_unavailable","detail":"x"}}"#,
            ),
        ];
        for (status, body) in mismatched {
            assert!(!is_session_evicted(status, body));
        }
    }

    #[test]
    fn is_session_evicted_rejects_malformed_body() {
        // Empty, non-JSON, and JSON without the `error` envelope are all
        // rejected even when the status would otherwise qualify.
        let malformed_bodies = ["", "not json", r#"{"code":"version_drift"}"#];
        for body in malformed_bodies {
            assert!(!is_session_evicted(StatusCode::CONFLICT, body));
        }
    }

    /// Test fixture: an `LlmRequest` with a fixed model and system
    /// prompt, carrying the given messages, kv-cache mode and optional
    /// session key. All remaining fields come from `Default`.
    fn req_with(messages: Vec<Message>, mode: u8, key: Option<&str>) -> LlmRequest {
        let session_key = key.map(str::to_owned);
        LlmRequest {
            messages,
            kv_cache_mode: mode,
            session_key,
            model: "2026.5.15".into(),
            system: Some("you are an agent".into()),
            fallback_models: Vec::new(),
            ..Default::default()
        }
    }

    #[test]
    fn canonical_value_sorts_object_keys_alphabetically() {
        // With `preserve_order` the JSON literal keeps its insertion
        // order, so writing the keys out of alphabetical order proves
        // the canonical pass really reorders them.
        let shuffled = json!({
            "z_last": 1,
            "a_first": 2,
            "m_mid": 3,
        });
        let rendered = serde_json::to_string(&to_canonical_value(shuffled)).unwrap();
        // Canonical output must list keys alphabetically.
        assert_eq!(rendered, r#"{"a_first":2,"m_mid":3,"z_last":1}"#);
    }

    #[test]
    fn canonical_value_recurses_into_nested_objects() {
        // Canonicalization has to reach nested schema bodies
        // (input_schema.properties.{...}) too — that is where tool
        // parameter HashMaps actually surface.
        let nested = json!({
            "outer_b": {"y": 1, "x": 2},
            "outer_a": {"inner": {"z": 0, "a": 1}},
        });
        let rendered = serde_json::to_string(&to_canonical_value(nested)).unwrap();
        let expected = r#"{"outer_a":{"inner":{"a":1,"z":0}},"outer_b":{"x":2,"y":1}}"#;
        assert_eq!(rendered, expected);
    }

    #[test]
    fn canonical_value_preserves_array_order() {
        // Array positions carry meaning (tool ordering, message
        // history), so elements stay where they are; only the object
        // nested inside the array gets its keys sorted.
        let positional = json!([3, 1, 2, {"b": 1, "a": 2}]);
        let rendered = serde_json::to_string(&to_canonical_value(positional)).unwrap();
        assert_eq!(rendered, r#"[3,1,2,{"a":2,"b":1}]"#);
    }

    #[test]
    fn canonical_value_is_idempotent() {
        // Worker prefix hashing relies on byte equality across runs, so
        // canonicalizing an already-canonical value must be a no-op
        // (the operation is a fixed point).
        let schema = json!({
            "tools": [{
                "name": "search",
                "input_schema": {
                    "type": "object",
                    "properties": {"q": {"type": "string"}, "k": {"type": "integer"}},
                    "required": ["q"]
                }
            }]
        });
        let first_pass = to_canonical_value(schema);
        let first_bytes = serde_json::to_string(&first_pass).unwrap();
        let second_bytes = serde_json::to_string(&to_canonical_value(first_pass)).unwrap();
        assert_eq!(first_bytes, second_bytes);
    }

    #[test]
    fn canonical_value_byte_stable_across_input_orderings() {
        // The key property: two logically identical JSON values built
        // with different insertion orders MUST serialize to the same
        // bytes once canonicalized. The worker's prefix hash depends on
        // exactly this.
        let one_order = json!({
            "z": [{"b": 1, "a": 2}],
            "a": {"inner_z": 1, "inner_a": 2},
        });
        let other_order = json!({
            "a": {"inner_a": 2, "inner_z": 1},
            "z": [{"a": 2, "b": 1}],
        });
        let bytes_one = serde_json::to_string(&to_canonical_value(one_order)).unwrap();
        let bytes_other = serde_json::to_string(&to_canonical_value(other_order)).unwrap();
        assert_eq!(bytes_one, bytes_other);
    }

    #[test]
    fn split_request_dynamic_tools_are_canonical() {
        // End-to-end check: a non-builtin tool whose input_schema keys
        // are deliberately out of alphabetical order must come out of
        // `split_request` alphabetized. After the v1.9 split such a tool
        // lands in `dynamic_user_tools`. Both arrays go through the same
        // `tool_json` closure, so covering one array covers the other.
        let search_tool = crate::ToolDef {
            name: "search".into(), // not in BUILTIN_TOOL_NAMES → user_tools
            description: "look stuff up".into(),
            parameters: json!({
                "type": "object",
                "required": ["q"],
                "properties": {
                    "q": {"type": "string"},
                    "k": {"type": "integer"},
                }
            }),
        };
        let mut req = req_with(Vec::new(), 2, Some("k"));
        req.tools = vec![search_tool];

        let split = split_request(&req, RSCLAW_DEFAULT_PREFIX_ID, false).unwrap();
        assert_eq!(
            split.dynamic_tools.len(),
            0,
            "non-builtin 'search' must NOT land in the base tools array"
        );
        assert_eq!(split.dynamic_user_tools.len(), 1);

        // Expected key order, all alphabetical:
        //   top level    → description, input_schema, name
        //   input_schema → properties, required, type
        //   properties   → k, q (each body has the single key `type`)
        let expected = concat!(
            r#"{"description":"look stuff up","input_schema":"#,
            r#"{"properties":{"k":{"type":"integer"},"q":{"type":"string"}},"#,
            r#""required":["q"],"type":"object"},"name":"search"}"#
        );
        let serialized = serde_json::to_string(&split.dynamic_user_tools[0]).unwrap();
        assert_eq!(serialized, expected);
    }

    #[test]
    fn split_request_uses_provided_prefix_id_verbatim() {
        // prefix_id comes from configuration; split_request forwards it
        // without looking at req.model. Two different model strings on
        // the same request must therefore produce the SAME wire
        // prefix_id when the caller passes the same value.
        let mut req = req_with(Vec::new(), 2, Some("k"));
        for model in ["qwen3-235b", "myorg/qwen3-235b"] {
            req.model = model.into();
            let split = split_request(&req, RSCLAW_DEFAULT_PREFIX_ID, false).unwrap();
            assert_eq!(split.prefix_id, RSCLAW_DEFAULT_PREFIX_ID);
        }
    }

    #[test]
    fn split_request_honours_custom_prefix_id_override() {
        // A provider configured with a non-default prefix_id (for
        // example a tenant's private namespace): split_request must
        // forward the override verbatim, regardless of req.model.
        let tenant_prefix = "myorg/2026.5.15";
        let mut req = req_with(Vec::new(), 2, Some("k"));
        req.model = "qwen3-235b".into();
        let split = split_request(&req, tenant_prefix, false).unwrap();
        assert_eq!(split.prefix_id, tenant_prefix);
    }

    #[test]
    fn with_prefix_id_overrides_default_and_ignores_blank() {
        // The builder replaces the default with the override, but an
        // empty / whitespace-only value is ignored so a misconfigured
        // config file cannot yield a §2.10.1-invalid wire value.
        let untouched = RsclawProvider::new("http://x", None);
        assert_eq!(untouched.prefix_id, RSCLAW_DEFAULT_PREFIX_ID);

        let overridden = RsclawProvider::new("http://x", None).with_prefix_id("tenant/2026.6.1");
        assert_eq!(overridden.prefix_id, "tenant/2026.6.1");

        let blank_override = RsclawProvider::new("http://x", None).with_prefix_id("   \n  ");
        assert_eq!(blank_override.prefix_id, RSCLAW_DEFAULT_PREFIX_ID);
    }

    #[test]
    fn with_prefix_id_rejects_invalid_slash_count() {
        // §2.10.1 requires exactly one '/' separator. A config typo with
        // none (e.g. "rsclaw-2026.5.15") or with two or more
        // ("foo/bar/baz") would otherwise survive boot and only fail on
        // the first wire call, which is annoying to debug. The builder
        // validates up front: the bad override is dropped and the safe
        // default stays in place.
        let default = RSCLAW_DEFAULT_PREFIX_ID;

        let rejected = [
            ("rsclaw-2026.5.15", "no slash → reject"),
            ("foo/bar/baz", "two slashes → reject"),
        ];
        for (candidate, why) in rejected {
            let p = RsclawProvider::new("http://x", None).with_prefix_id(candidate);
            assert_eq!(p.prefix_id, default, "{why}");
        }

        // Trimming happens before validation, so a valid value carrying
        // a stray dotenv newline is still accepted.
        let padded = RsclawProvider::new("http://x", None).with_prefix_id("  tenant/v1\n");
        assert_eq!(padded.prefix_id, "tenant/v1", "trim before count");
    }

    #[test]
    fn split_request_separates_builtin_and_user_tools_when_split_present() {
        // `system_shared` being populated means the runtime is in
        // real-split mode: builtins go to `dynamic_prefix.tools` (base
        // hash) and non-builtins to `dynamic_prefix.user_tools` (user
        // segment). v1.9 keeps them as two arrays; previously they were
        // concatenated builtin-first into a single `tools` array.
        let search = ToolDef {
            name: "search".into(), // not in BUILTIN_TOOL_NAMES
            description: "search the web".into(),
            parameters: json!({"type":"object","properties":{}}),
        };
        let memory = ToolDef {
            name: "memory".into(), // builtin
            description: "memory tool".into(),
            parameters: json!({"type":"object","properties":{}}),
        };

        let mut req = req_with(Vec::new(), 2, Some("k"));
        req.system_shared = Some("<shared system>".into());
        req.user_system = Some("<user suffix>".into());
        // Insertion order does not matter — classification is a name
        // lookup in BUILTIN_TOOL_NAMES.
        req.tools.extend([search, memory]);

        let split = split_request(&req, RSCLAW_DEFAULT_PREFIX_ID, false).unwrap();
        assert_eq!(split.dynamic_tools.len(), 1, "only 'memory' is builtin");
        assert_eq!(split.dynamic_tools[0]["name"], "memory");
        assert_eq!(split.dynamic_user_tools.len(), 1, "'search' is per-client");
        assert_eq!(split.dynamic_user_tools[0]["name"], "search");
        assert_eq!(split.dynamic_system, "<shared system>");
        assert_eq!(split.dynamic_user_system, "<user suffix>");
    }

    #[test]
    fn split_request_classifies_tools_in_degraded_mode_too() {
        // Internal sessions and other non-runtime callers leave the
        // system_shared/user_system text split unset. Tool
        // classification is unconditional though, so the base cache
        // still hashes over builtins only. v1.8 collapsed everything
        // into `tools` here and lost per-client cache sharing; v1.9
        // keeps the split.
        let search = ToolDef {
            name: "search".into(), // not builtin
            description: "search".into(),
            parameters: json!({"type":"object"}),
        };
        let memory = ToolDef {
            name: "memory".into(), // builtin
            description: "memory".into(),
            parameters: json!({"type":"object"}),
        };

        let mut req = req_with(Vec::new(), 2, Some("k"));
        req.tools.extend([search, memory]);

        let split = split_request(&req, RSCLAW_DEFAULT_PREFIX_ID, false).unwrap();
        assert_eq!(split.dynamic_tools.len(), 1);
        assert_eq!(split.dynamic_tools[0]["name"], "memory");
        assert_eq!(split.dynamic_user_tools.len(), 1);
        assert_eq!(split.dynamic_user_tools[0]["name"], "search");
        assert_eq!(split.dynamic_system, "you are an agent");
        assert_eq!(split.dynamic_user_system, "");
    }

    #[test]
    fn create_session_req_with_prefix_id_omits_dynamic_prefix() {
        // Non-empty prefix_id → static-registry fork: `prefix_id` is
        // sent and `dynamic_prefix` is OMITTED entirely. The table below
        // additionally checks that legacy / pre-rename field names never
        // leak at the top level.
        let mut req = req_with(Vec::new(), 2, Some("k"));
        req.system_shared = Some("<sys>".into());
        req.user_system = Some("<suf>".into());
        req.tools.push(ToolDef {
            name: "memory".into(),
            description: "memory tool".into(),
            parameters: json!({"type":"object"}),
        });

        let split = split_request(&req, RSCLAW_DEFAULT_PREFIX_ID, false).unwrap();
        let wire = DynamicPrefixWire {
            system: split.dynamic_system,
            tools: &split.dynamic_tools,
            user_tools: &split.dynamic_user_tools,
            user_system: split.dynamic_user_system,
        };
        let (registry_id, inline_prefix, lifted_tools) = prefix_fields(&split.prefix_id, wire);
        let body = CreateSessionReq {
            prefix_id: registry_id,
            model: &split.model,
            dynamic_prefix: inline_prefix,
            user_tools: lifted_tools,
            options: Some(split.options.clone()),
        };
        let v = serde_json::to_value(&body).unwrap();
        assert_eq!(v["prefix_id"], RSCLAW_DEFAULT_PREFIX_ID);

        // Fields that must be absent from the top level, with the reason
        // reported on failure. `user_tools` is absent because this
        // request only has the builtin `memory` tool: dynamic_user_tools
        // is empty and the skip-if-empty serde rule drops the slot (the
        // non-empty case has its own test).
        let absent_fields = [
            (
                "dynamic_prefix",
                "non-empty prefix_id must OMIT dynamic_prefix (mutually exclusive)",
            ),
            (
                "user_tools",
                "no per-session private tools → top-level user_tools omitted",
            ),
            (
                "rsclaw_version",
                "rsclaw_version is the pre-rename name; never send",
            ),
            (
                "user_suffix",
                "user_suffix is the legacy name; never send (top-level or otherwise)",
            ),
            (
                "user_system",
                "user_system lives inside dynamic_prefix, never at top-level",
            ),
            (
                "plugins_system",
                "pre-rename field; folded into dynamic_prefix.system",
            ),
            (
                "skills_system",
                "pre-rename field; folded into dynamic_prefix.system",
            ),
        ];
        for (field, reason) in absent_fields {
            assert!(v.get(field).is_none(), "{reason}");
        }
    }

    #[test]
    fn create_session_req_registry_path_sends_top_level_user_tools() {
        // Non-empty prefix_id plus a non-builtin tool → registry path:
        // user tools are lifted out of the (omitted) dynamic_prefix and
        // sent at the top level, per protocol §2.1.1.
        let builtin = ToolDef {
            // builtin → goes into the base, which the registered prefix
            // already owns
            name: "memory".into(),
            description: "memory".into(),
            parameters: json!({"type":"object"}),
        };
        let plugin = ToolDef {
            // namespaced plugin tool → user_tools
            name: "douyin__publish".into(),
            description: "publish to douyin".into(),
            parameters: json!({"type":"object"}),
        };
        let mut req = req_with(Vec::new(), 2, Some("k"));
        req.system_shared = Some("<sys>".into());
        req.tools.extend([builtin, plugin]);

        let split = split_request(&req, RSCLAW_DEFAULT_PREFIX_ID, false).unwrap();
        let wire = DynamicPrefixWire {
            system: split.dynamic_system,
            tools: &split.dynamic_tools,
            user_tools: &split.dynamic_user_tools,
            user_system: split.dynamic_user_system,
        };
        let (registry_id, inline_prefix, lifted_tools) = prefix_fields(&split.prefix_id, wire);
        let body = CreateSessionReq {
            prefix_id: registry_id,
            model: &split.model,
            dynamic_prefix: inline_prefix,
            user_tools: lifted_tools,
            options: Some(split.options.clone()),
        };
        let v = serde_json::to_value(&body).unwrap();

        assert_eq!(v["prefix_id"], RSCLAW_DEFAULT_PREFIX_ID);
        assert!(
            v.get("dynamic_prefix").is_none(),
            "registry path must omit dynamic_prefix"
        );
        let top_level = v["user_tools"]
            .as_array()
            .expect("top-level user_tools must be present");
        assert_eq!(top_level.len(), 1);
        assert_eq!(top_level[0]["name"], "douyin__publish");
    }

    #[test]
    fn create_session_req_dynamic_path_keeps_user_tools_inside_dynamic_prefix() {
        // Empty prefix_id → dynamic path: per-session private tools live
        // in dynamic_prefix.user_tools, and the top-level slot has to
        // stay empty so the same payload is not rendered twice.
        let builtin = ToolDef {
            name: "memory".into(),
            description: "memory".into(),
            parameters: json!({"type":"object"}),
        };
        let plugin = ToolDef {
            name: "douyin__publish".into(),
            description: "publish to douyin".into(),
            parameters: json!({"type":"object"}),
        };
        let mut req = req_with(Vec::new(), 2, Some("k"));
        req.system_shared = Some("<sys>".into());
        req.tools.extend([builtin, plugin]);

        let split = split_request(&req, "", false).unwrap();
        let wire = DynamicPrefixWire {
            system: split.dynamic_system,
            tools: &split.dynamic_tools,
            user_tools: &split.dynamic_user_tools,
            user_system: split.dynamic_user_system,
        };
        let (registry_id, inline_prefix, lifted_tools) = prefix_fields(&split.prefix_id, wire);
        let body = CreateSessionReq {
            prefix_id: registry_id,
            model: &split.model,
            dynamic_prefix: inline_prefix,
            user_tools: lifted_tools,
            options: Some(split.options.clone()),
        };
        let v = serde_json::to_value(&body).unwrap();

        assert!(
            v.get("prefix_id").is_none(),
            "dynamic path omits top-level prefix_id"
        );
        assert!(
            v.get("user_tools").is_none(),
            "dynamic path keeps user_tools inside dynamic_prefix"
        );

        let nested = &v["dynamic_prefix"];
        let dyn_user_tools = nested["user_tools"]
            .as_array()
            .expect("dynamic_prefix.user_tools must be present");
        assert_eq!(dyn_user_tools.len(), 1);
        assert_eq!(dyn_user_tools[0]["name"], "douyin__publish");

        let dyn_tools = nested["tools"]
            .as_array()
            .expect("dynamic_prefix.tools must be present");
        assert_eq!(dyn_tools.len(), 1);
        assert_eq!(
            dyn_tools[0]["name"], "memory",
            "builtins stay in dynamic_prefix.tools (base hash)"
        );
    }

    #[test]
    fn create_session_req_empty_prefix_id_sends_dynamic_prefix() {
        // Empty prefix_id → dynamic-LRU mode: `prefix_id` is OMITTED and
        // the full `dynamic_prefix` (system, tools, user_system checked
        // here) is sent instead.
        let mut req = req_with(Vec::new(), 2, Some("k"));
        req.system_shared = Some("<sys>".into());
        req.user_system = Some("<suf>".into());
        req.tools.push(ToolDef {
            name: "memory".into(),
            description: "memory tool".into(),
            parameters: json!({"type":"object"}),
        });

        // Passing "" as the prefix_id is what selects the dynamic-LRU
        // path.
        let split = split_request(&req, "", false).unwrap();
        let wire = DynamicPrefixWire {
            system: split.dynamic_system,
            tools: &split.dynamic_tools,
            user_tools: &split.dynamic_user_tools,
            user_system: split.dynamic_user_system,
        };
        let (registry_id, inline_prefix, lifted_tools) = prefix_fields(&split.prefix_id, wire);
        let body = CreateSessionReq {
            prefix_id: registry_id,
            model: &split.model,
            dynamic_prefix: inline_prefix,
            user_tools: lifted_tools,
            options: Some(split.options.clone()),
        };
        let v = serde_json::to_value(&body).unwrap();

        assert!(
            v.get("prefix_id").is_none(),
            "empty prefix_id must be OMITTED (mutually exclusive with dynamic_prefix)"
        );
        let nested = &v["dynamic_prefix"];
        assert_eq!(nested["system"], "<sys>");
        assert_eq!(nested["user_system"], "<suf>");
        assert_eq!(nested["tools"][0]["name"], "memory");
    }

    #[test]
    fn history_for_replay_drops_trailing_delta() {
        // The final user message is the current turn's delta, so replay
        // history must stop just before it.
        fn text(role: Role, body: &str) -> Message {
            Message {
                role,
                content: MessageContent::Text(body.into()),
                rsclaw_hidden: None,
            }
        }

        let msgs = vec![
            text(Role::User, "hi"),
            text(Role::Assistant, "yo"),
            text(Role::User, "again"),
        ];
        let kept = history_for_replay(&msgs);
        assert_eq!(kept.len(), 2);
        assert!(matches!(kept[0].role, Role::User));
        assert!(matches!(kept[1].role, Role::Assistant));
    }

    #[test]
    fn history_for_replay_handles_empty_and_singleton() {
        // Both an empty list and a lone message yield an empty replay
        // history.
        let no_messages = Vec::<Message>::new();
        assert!(history_for_replay(&no_messages).is_empty());

        let solo = Message {
            role: Role::User,
            content: MessageContent::Text("solo".into()),
            rsclaw_hidden: None,
        };
        let single = vec![solo];
        assert!(history_for_replay(&single).is_empty());
    }

    #[test]
    fn history_for_replay_drops_all_consecutive_trailing_tools() {
        // Parallel-tool case: the assistant emits N tool_use blocks, the
        // runtime queues N consecutive Role::Tool messages, and
        // from_request folds them into one Tools delta.
        // history_for_replay has to drop ALL N. Dropping only the last
        // would leave N-1 tool_results in history; the server would
        // replay them into KV and turn() would then re-send them as the
        // delta, hydrating duplicates.
        fn text(role: Role, body: &str) -> Message {
            Message {
                role,
                content: MessageContent::Text(body.into()),
                rsclaw_hidden: None,
            }
        }
        fn tool_result(id: &str) -> Message {
            Message {
                role: Role::Tool,
                content: MessageContent::Parts(vec![ContentPart::ToolResult {
                    tool_use_id: id.into(),
                    content: "ok".into(),
                    is_error: None,
                }]),
                rsclaw_hidden: None,
            }
        }

        let msgs = vec![
            text(Role::User, "do all three"),
            text(Role::Assistant, "calling tools"),
            tool_result("toolu_1"),
            tool_result("toolu_2"),
            tool_result("toolu_3"),
        ];
        let kept = history_for_replay(&msgs);
        assert_eq!(kept.len(), 2);
        assert!(matches!(kept[0].role, Role::User));
        assert!(matches!(kept[1].role, Role::Assistant));
    }

    #[test]
    fn history_for_replay_keeps_earlier_tool_messages() {
        // Sequential-tool case within a multi-iteration turn, e.g.
        // [User, Asst, Tool, Asst, Tool]: only the FINAL contiguous Tool
        // run is the current step's delta. Tool messages from earlier,
        // completed sub-iterations stay in history.
        fn text(role: Role, body: &str) -> Message {
            Message {
                role,
                content: MessageContent::Text(body.into()),
                rsclaw_hidden: None,
            }
        }
        fn tool_result(id: &str) -> Message {
            Message {
                role: Role::Tool,
                content: MessageContent::Parts(vec![ContentPart::ToolResult {
                    tool_use_id: id.into(),
                    content: "ok".into(),
                    is_error: None,
                }]),
                rsclaw_hidden: None,
            }
        }

        let msgs = vec![
            text(Role::User, "go"),
            text(Role::Assistant, "step1"),
            tool_result("a"),
            text(Role::Assistant, "step2"),
            tool_result("b"),
        ];
        let kept = history_for_replay(&msgs);
        assert_eq!(kept.len(), 4);
        // Tool "a" belongs to a finished sub-iteration and is kept.
        assert!(matches!(kept[2].role, Role::Tool));
        // Tool "b" was the trailing delta, so history now ends on the
        // second assistant message.
        assert!(matches!(kept[3].role, Role::Assistant));
    }

    #[test]
    fn serialize_replay_history_coalesces_parallel_tools() {
        // The assistant called 3 tools in parallel, so the runtime
        // queued 3 Tool messages. In replay history they MUST collapse
        // into a single user-role entry whose content[] holds all three
        // tool_results, matching the protocol §2.2 example shape.
        fn tool_result(id: &str, body: &str) -> Message {
            Message {
                role: Role::Tool,
                content: MessageContent::Parts(vec![ContentPart::ToolResult {
                    tool_use_id: id.into(),
                    content: body.into(),
                    is_error: None,
                }]),
                rsclaw_hidden: None,
            }
        }

        let user = Message {
            role: Role::User,
            content: MessageContent::Text("go".into()),
            rsclaw_hidden: None,
        };
        let asst = Message {
            role: Role::Assistant,
            content: MessageContent::Text("calling tools".into()),
            rsclaw_hidden: None,
        };
        let first = tool_result("a", "ra");
        let second = tool_result("b", "rb");
        let third = tool_result("c", "rc");

        let msgs = vec![&user, &asst, &first, &second, &third];
        let out = serialize_replay_history(&msgs);
        assert_eq!(
            out.len(),
            3,
            "user + assistant + 1 coalesced tool entry: {out:?}"
        );
        assert_eq!(out[0]["role"], "user");
        assert_eq!(out[1]["role"], "assistant");
        assert_eq!(out[2]["role"], "user");

        let parts = out[2]["content"].as_array().expect("content array");
        assert_eq!(parts.len(), 3);
        // Results keep the order in which the tools were queued.
        for (part, expected_id) in parts.iter().zip(["a", "b", "c"]) {
            assert_eq!(part["tool_use_id"], expected_id);
        }
        for part in parts {
            assert_eq!(part["type"], "tool_result");
        }
    }

    #[test]
    fn serialize_replay_history_keeps_separated_tool_runs_separate() {
        // Sequential sub-iterations (Tool, Asst, Tool) must serialize as
        // two distinct user-role entries, one per tool run, with the
        // assistant entry sitting between them.
        fn tool_result(id: &str) -> Message {
            Message {
                role: Role::Tool,
                content: MessageContent::Parts(vec![ContentPart::ToolResult {
                    tool_use_id: id.into(),
                    content: "ok".into(),
                    is_error: None,
                }]),
                rsclaw_hidden: None,
            }
        }

        let before = tool_result("a");
        let asst = Message {
            role: Role::Assistant,
            content: MessageContent::Text("step".into()),
            rsclaw_hidden: None,
        };
        let after = tool_result("b");

        let msgs = vec![&before, &asst, &after];
        let out = serialize_replay_history(&msgs);
        assert_eq!(out.len(), 3);

        assert_eq!(out[0]["role"], "user");
        assert_eq!(out[0]["content"][0]["tool_use_id"], "a");

        assert_eq!(out[1]["role"], "assistant");

        assert_eq!(out[2]["role"], "user");
        assert_eq!(out[2]["content"][0]["tool_use_id"], "b");
    }

    #[test]
    fn serialize_replay_history_drops_tool_run_with_no_tool_result_parts() {
        // Defensive case: a stray Role::Tool message whose parts are not
        // ToolResult (Text/Image/etc) must not turn into an empty
        // user-role entry. `{"role":"user","content":[]}` is rejected by
        // some chat templates.
        let user = Message {
            role: Role::User,
            content: MessageContent::Text("hi".into()),
            rsclaw_hidden: None,
        };
        let stray_parts = vec![ContentPart::Text {
            text: "noise".into(),
        }];
        let bad = Message {
            role: Role::Tool,
            content: MessageContent::Parts(stray_parts),
            rsclaw_hidden: None,
        };

        let msgs = vec![&user, &bad];
        let out = serialize_replay_history(&msgs);
        assert_eq!(out.len(), 1, "only the User survives: {out:?}");
        assert_eq!(out[0]["role"], "user");
    }

    #[test]
    fn turn_delta_user_text() {
        // A single plain-text user message serializes to a
        // `user_message` field carrying that text.
        let greeting = Message {
            role: Role::User,
            content: MessageContent::Text("hello".into()),
            rsclaw_hidden: None,
        };
        let req = req_with(vec![greeting], 2, Some("k"));
        let delta = TurnDelta::from_request(&req).unwrap();
        let wire = serde_json::to_value(&delta).unwrap();
        assert_eq!(wire["user_message"], "hello");
    }

    #[test]
    fn turn_request_serializes_recall_as_independent_top_level_fields() {
        // Recall travels next to the user message as its own top-level
        // fields (`recall_context` + `recall` metadata); it is never
        // spliced into the message text.
        let greeting = Message {
            role: Role::User,
            content: MessageContent::Text("hello".into()),
            rsclaw_hidden: None,
        };
        let metadata = crate::RecallMetadata {
            mode: "committed".into(),
            format: "xml".into(),
            source: "server".into(),
            trace_id: Some("recall_test".into()),
            max_tokens: Some(1200),
            doc_ids: vec!["doc-1".into()],
            hash: "sha256:abc".into(),
            truncated: false,
        };
        let mut req = req_with(vec![greeting], 2, Some("k"));
        req.recall = Some(crate::RecallBundle {
            context: "用户手机号: 13900001234".into(),
            metadata,
        });

        let delta = TurnDelta::from_request(&req).unwrap();
        let bundle = req.recall.as_ref();
        let turn = TurnReq {
            delta: &delta,
            recall_context: bundle.map(|r| r.context.as_str()),
            recall: bundle.map(|r| &r.metadata),
            options: Some(TurnOptions::from_request(&req, false)),
            stream: true,
        };
        let body = serde_json::to_value(&turn).unwrap();

        assert_eq!(body["user_message"], "hello");
        assert_eq!(body["recall_context"], "用户手机号: 13900001234");

        let recall = &body["recall"];
        assert_eq!(recall["mode"], "committed");
        assert_eq!(recall["format"], "xml");
        assert_eq!(recall["doc_ids"][0], "doc-1");

        // The client must not wrap the user text itself.
        let user_text = body["user_message"].as_str().unwrap();
        assert!(
            !user_text.contains("<recall>"),
            "worker owns canonical recall wrapper"
        );
    }

    #[test]
    fn turn_request_omits_empty_recall_fields() {
        // When the recall context is whitespace-only, the bundle is
        // filtered out here and both recall fields reach `TurnReq` as
        // None; neither key may then appear in the serialized body.
        let user = Message {
            role: Role::User,
            content: MessageContent::Text("hello".into()),
            rsclaw_hidden: None,
        };
        let mut req = req_with(vec![user], 2, Some("k"));
        req.recall = Some(crate::RecallBundle {
            context: "  ".into(),
            metadata: crate::RecallMetadata::default(),
        });

        let delta = TurnDelta::from_request(&req).unwrap();
        let usable = req
            .recall
            .as_ref()
            .filter(|bundle| !bundle.context.trim().is_empty());
        let turn = TurnReq {
            delta: &delta,
            recall_context: usable.map(|bundle| bundle.context.as_str()),
            recall: usable.map(|bundle| &bundle.metadata),
            options: Some(TurnOptions::from_request(&req, false)),
            stream: true,
        };
        let body = serde_json::to_value(&turn).unwrap();

        for key in ["recall_context", "recall"] {
            assert!(body.get(key).is_none());
        }
    }

    #[test]
    fn turn_delta_user_text_empty_bails() {
        // An empty-string User message gives the delta nothing to send;
        // `from_request` must fail with the "no usable content" error.
        let blank = Message {
            role: Role::User,
            content: MessageContent::Text(String::new()),
            rsclaw_hidden: None,
        };
        let req = req_with(vec![blank], 2, Some("k"));

        let err = TurnDelta::from_request(&req).unwrap_err().to_string();

        assert!(err.contains("no usable content"), "got: {err}");
    }

    #[test]
    fn turn_delta_user_parts_with_only_empty_text_bails() {
        // Parts-form User content made up solely of empty Text fragments
        // is rejected with the same "no usable content" error.
        let empty_part = || ContentPart::Text {
            text: String::new(),
        };
        let blank = Message {
            role: Role::User,
            content: MessageContent::Parts(vec![empty_part(), empty_part()]),
            rsclaw_hidden: None,
        };
        let req = req_with(vec![blank], 2, Some("k"));

        let err = TurnDelta::from_request(&req).unwrap_err().to_string();

        assert!(err.contains("no usable content"), "got: {err}");
    }

    #[test]
    fn turn_delta_user_parts_concatenates_text_fragments() {
        // Multiple Text parts in one User message are joined, in order
        // and without a separator, into a single `user_message` string.
        let text_part = |s: &str| ContentPart::Text { text: s.into() };
        let user = Message {
            role: Role::User,
            content: MessageContent::Parts(vec![text_part("hello "), text_part("world")]),
            rsclaw_hidden: None,
        };
        let req = req_with(vec![user], 2, Some("k"));

        let delta = TurnDelta::from_request(&req).unwrap();
        let wire = serde_json::to_value(&delta).unwrap();

        assert_eq!(wire["user_message"], "hello world");
    }

    #[test]
    fn turn_delta_tool_results() {
        // A trailing Role::Tool message serializes under `tool_results`,
        // keeping its `tool_use_id`.
        let result = ContentPart::ToolResult {
            tool_use_id: "toolu_1".into(),
            content: "ok".into(),
            is_error: None,
        };
        let tool = Message {
            role: Role::Tool,
            content: MessageContent::Parts(vec![result]),
            rsclaw_hidden: None,
        };
        let req = req_with(vec![tool], 2, Some("k"));

        let delta = TurnDelta::from_request(&req).unwrap();
        let wire = serde_json::to_value(&delta).unwrap();

        assert_eq!(wire["tool_results"][0]["tool_use_id"], "toolu_1");
    }

    #[test]
    fn lookup_and_bump_evicts_on_history_shrink() {
        // Walks `lookup_and_bump` through each outcome for one cached
        // session: same length, growth, shrink, prefix/version drift,
        // and an unknown key.
        const PREFIX: &str = "rsclaw/2026.5.28";
        let seed = |last_seen_msgs_len| SessionEntry {
            session_id: "rs_w7_abc".into(),
            prefix_id: PREFIX.into(),
            last_seen_msgs_len,
        };

        let provider = RsclawProvider::new("http://x", None);
        provider.store("k", seed(12));

        // Same len → cached entry returned.
        assert!(provider.lookup_and_bump("k", PREFIX, 12).is_some());
        // Growth → bumped, returned.
        assert!(provider.lookup_and_bump("k", PREFIX, 14).is_some());
        // Shrink (compaction trimmed history) → None, caller re-hydrates.
        assert!(provider.lookup_and_bump("k", PREFIX, 8).is_none());

        // Re-seed before the drift check. If the shrink lookup above
        // evicted the entry (as this test's name says), the next
        // assertion would otherwise pass merely because the key is
        // missing, never exercising the prefix comparison.
        provider.store("k", seed(14));
        // Version drift → None even if len matches.
        assert!(
            provider
                .lookup_and_bump("k", "rsclaw/2026.5.6", 14)
                .is_none()
        );

        // Missing key → None.
        assert!(provider.lookup_and_bump("missing", PREFIX, 14).is_none());
    }

    #[test]
    fn evict_if_oversized_culls_to_half_cap_when_over() {
        // Fill a map to 100 entries past MAX_SESSIONS and check that the
        // batched eviction leaves exactly MAX_SESSIONS / 2 behind — i.e.
        // it drops some entries, but neither all nor none.
        let overfill = MAX_SESSIONS + 100;
        let mut sessions: HashMap<String, SessionEntry> = (0..overfill)
            .map(|i| {
                let entry = SessionEntry {
                    session_id: format!("rs_w_{i}"),
                    prefix_id: "rsclaw/test".into(),
                    last_seen_msgs_len: 1,
                };
                (format!("k{i}"), entry)
            })
            .collect();

        evict_if_oversized(&mut sessions);

        // Retained count is half the cap: (overfill - MAX_SESSIONS/2)
        // entries were culled.
        assert_eq!(sessions.len(), MAX_SESSIONS / 2);
    }

    #[test]
    fn evict_if_oversized_no_op_when_under_cap() {
        // With only 100 entries the map must come back untouched:
        // eviction is a memory-safety measure, not routine GC.
        let mut sessions: HashMap<String, SessionEntry> = (0..100)
            .map(|i| {
                let entry = SessionEntry {
                    session_id: format!("rs_{i}"),
                    prefix_id: "rsclaw/test".into(),
                    last_seen_msgs_len: 1,
                };
                (format!("k{i}"), entry)
            })
            .collect();

        evict_if_oversized(&mut sessions);

        assert_eq!(sessions.len(), 100);
    }

    #[tokio::test]
    async fn invalidate_on_error_evicts_session_on_first_err() {
        // The inner stream yields one Ok item followed by an Err. Both
        // items must pass through the wrapper unchanged, and the cached
        // session must be gone once the Err has been observed.
        let provider = RsclawProvider::new("http://x", None);
        let cached = SessionEntry {
            session_id: "rs_w7_xyz".into(),
            prefix_id: "rsclaw/test".into(),
            last_seen_msgs_len: 5,
        };
        provider.store("session-key", cached);

        let events = vec![
            Ok(StreamEvent::TextDelta("hi".into())),
            Err(anyhow::anyhow!("boom")),
        ];
        let inner: LlmStream = Box::pin(futures::stream::iter(events));
        let sessions = Arc::clone(&provider.sessions);
        let wrapped = invalidate_on_error(inner, sessions, "session-key".to_owned());

        let seen: Vec<_> = wrapped.collect().await;
        assert_eq!(seen.len(), 2);
        assert!(matches!(seen[0], Ok(StreamEvent::TextDelta(_))));
        assert!(seen[1].is_err());

        // Session must be gone after the error item passed through.
        assert!(provider.lock_sessions().get("session-key").is_none());
    }

    #[tokio::test]
    async fn invalidate_on_error_evicts_on_stream_event_error() {
        // Protocol §2.3 `error` events surface as `Ok(StreamEvent::Error)`
        // — these MUST also force eviction, otherwise a server-issued
        // error mid-stream leaves the cached session pointing at a
        // partially-committed turn.
        let provider = RsclawProvider::new("http://x", None);
        provider.store(
            "k",
            SessionEntry {
                session_id: "rs_w7_abc".into(),
                prefix_id: "rsclaw/test".into(),
                last_seen_msgs_len: 5,
            },
        );
        let inner: LlmStream = Box::pin(futures::stream::iter(vec![Ok(StreamEvent::Error(
            "model_overloaded".into(),
        ))]));
        let wrapped = invalidate_on_error(inner, Arc::clone(&provider.sessions), "k".into());
        let _: Vec<_> = wrapped.collect().await;
        assert!(provider.lock_sessions().get("k").is_none());
    }

    #[tokio::test]
    async fn invalidate_on_error_keeps_session_on_clean_stream() {
        // An error-free stream must leave the session cached; evicting
        // here would force a replay round-trip after every successful
        // turn and defeat kvCacheMode=2.
        let provider = RsclawProvider::new("http://x", None);
        let cached = SessionEntry {
            session_id: "rs_w7_abc".into(),
            prefix_id: "rsclaw/test".into(),
            last_seen_msgs_len: 5,
        };
        provider.store("k", cached);

        let events = vec![
            Ok(StreamEvent::TextDelta("hello".into())),
            Ok(StreamEvent::Done { usage: None }),
        ];
        let inner: LlmStream = Box::pin(futures::stream::iter(events));
        let sessions = Arc::clone(&provider.sessions);
        let wrapped = invalidate_on_error(inner, sessions, String::from("k"));

        // Drain the stream; only the side effect on the cache matters.
        let _: Vec<_> = wrapped.collect().await;

        assert!(provider.lock_sessions().get("k").is_some());
    }

    #[test]
    fn turn_delta_collects_parallel_tool_results() {
        // Three trailing Tool messages (the assistant called 3 tools in
        // parallel) must all land in `tool_results`, in their original
        // order.
        let tool_reply = |id: &str, payload: &str| Message {
            role: Role::Tool,
            content: MessageContent::Parts(vec![ContentPart::ToolResult {
                tool_use_id: id.into(),
                content: payload.into(),
                is_error: None,
            }]),
            rsclaw_hidden: None,
        };
        let opener = Message {
            role: Role::User,
            content: MessageContent::Text("do three things".into()),
            rsclaw_hidden: None,
        };
        let history = vec![
            opener,
            tool_reply("toolu_a", "result a"),
            tool_reply("toolu_b", "result b"),
            tool_reply("toolu_c", "result c"),
        ];
        let req = req_with(history, 2, Some("k"));

        let delta = TurnDelta::from_request(&req).unwrap();
        let wire = serde_json::to_value(&delta).unwrap();
        let results = wire["tool_results"].as_array().unwrap();

        assert_eq!(results.len(), 3);
        for (entry, expected_id) in results.iter().zip(["toolu_a", "toolu_b", "toolu_c"]) {
            assert_eq!(entry["tool_use_id"], expected_id);
        }
    }

    #[test]
    fn turn_delta_does_not_cross_user_boundary() {
        // The back-walk over trailing Tool messages must stop at the
        // first non-Tool message (an Assistant message in this fixture):
        // the Tool result before it belongs to a prior turn and must not
        // be resent.
        let tool_reply = |id: &str, payload: &str| Message {
            role: Role::Tool,
            content: MessageContent::Parts(vec![ContentPart::ToolResult {
                tool_use_id: id.into(),
                content: payload.into(),
                is_error: None,
            }]),
            rsclaw_hidden: None,
        };
        let assistant_ack = Message {
            role: Role::Assistant,
            content: MessageContent::Text("ack".into()),
            rsclaw_hidden: None,
        };
        let history = vec![
            tool_reply("toolu_old", "stale"),
            assistant_ack,
            tool_reply("toolu_new", "fresh"),
        ];
        let req = req_with(history, 2, Some("k"));

        let delta = TurnDelta::from_request(&req).unwrap();
        let wire = serde_json::to_value(&delta).unwrap();
        let results = wire["tool_results"].as_array().unwrap();

        assert_eq!(results.len(), 1);
        assert_eq!(results[0]["tool_use_id"], "toolu_new");
    }

    #[test]
    fn split_system_messages_lifts_system_to_suffix() {
        // Role::System blocks (plugins, skills, /ctx), wherever they sit
        // in the conversation, must be pulled out of the history destined
        // for /sessions/replay — the protocol rejects role:"system" with
        // 400 invalid_history. Their text is returned joined by blank
        // lines, in original order.
        let text_msg = |role: Role, body: &str| Message {
            role,
            content: MessageContent::Text(body.into()),
            rsclaw_hidden: None,
        };
        let conversation = vec![
            text_msg(Role::System, "PLUGINS"),
            text_msg(Role::System, "SKILLS"),
            text_msg(Role::User, "hi"),
            text_msg(Role::Assistant, "yo"),
            text_msg(Role::System, "## New Skill Installed\nfoo"),
            text_msg(Role::User, "again"),
        ];

        let (kept, suffix) = split_system_messages(&conversation);

        assert_eq!(kept.len(), 3);
        for survivor in &kept {
            assert!(!matches!(survivor.role, Role::System));
        }
        assert_eq!(suffix, "PLUGINS\n\nSKILLS\n\n## New Skill Installed\nfoo");
    }

    #[test]
    fn split_system_messages_handles_text_parts() {
        // A System message in Parts form contributes the concatenation
        // of its Text fragments to the suffix.
        let text_part = |s: &str| ContentPart::Text { text: s.into() };
        let conversation = vec![Message {
            role: Role::System,
            content: MessageContent::Parts(vec![text_part("hello "), text_part("world")]),
            rsclaw_hidden: None,
        }];

        let (kept, suffix) = split_system_messages(&conversation);

        assert!(kept.is_empty());
        assert_eq!(suffix, "hello world");
    }

    #[test]
    fn split_system_messages_empty_when_no_system() {
        // With no System message present, the input passes through and
        // the suffix is empty.
        let only_user = Message {
            role: Role::User,
            content: MessageContent::Text("hi".into()),
            rsclaw_hidden: None,
        };
        let conversation = vec![only_user];

        let (kept, suffix) = split_system_messages(&conversation);

        assert_eq!(kept.len(), 1);
        assert!(suffix.is_empty());
    }

    #[test]
    fn split_system_messages_drops_empty_text_system() {
        // Regression: an empty System(Text("")) used to be collected and
        // yield a stray "\n\n" prefix once joined. It must now be
        // dropped, matching how the Parts path behaves.
        let text_msg = |role: Role, body: &str| Message {
            role,
            content: MessageContent::Text(body.into()),
            rsclaw_hidden: None,
        };
        let conversation = vec![
            text_msg(Role::User, "hi"),
            text_msg(Role::System, ""),
            text_msg(Role::System, "real ctx"),
        ];

        let (kept, suffix) = split_system_messages(&conversation);

        assert_eq!(kept.len(), 1);
        assert_eq!(
            suffix, "real ctx",
            "leading empty System must not produce a blank-line prefix; got {suffix:?}"
        );
    }

    #[test]
    fn split_system_messages_drops_parts_with_only_empty_text() {
        // Symmetry check for the Parts path: a System message whose only
        // Text part is empty (here next to an Image part) contributes
        // nothing to the suffix.
        let parts = vec![
            ContentPart::Text {
                text: String::new(),
            },
            ContentPart::Image {
                url: "https://x/i".into(),
            },
        ];
        let conversation = vec![Message {
            role: Role::System,
            content: MessageContent::Parts(parts),
            rsclaw_hidden: None,
        }];

        let suffix = split_system_messages(&conversation).1;

        assert!(
            suffix.is_empty(),
            "Parts whose only Text was empty must not leak; got {suffix:?}"
        );
    }

    #[test]
    fn normalize_trailing_system_folds_into_user_text() {
        // A dynamic-ctx Role::System appended after the User delta is
        // merged into the user text (separated by a blank line), so the
        // list ends on a User message again.
        let text_msg = |role: Role, body: &str| Message {
            role,
            content: MessageContent::Text(body.into()),
            rsclaw_hidden: None,
        };
        let mut conversation = vec![
            text_msg(Role::User, "fix the bug"),
            text_msg(Role::System, "## Dynamic /ctx\nworking on handler.py"),
        ];

        normalize_trailing_system(&mut conversation);

        assert_eq!(conversation.len(), 1);
        let folded = &conversation[0];
        assert!(matches!(folded.role, Role::User));
        let MessageContent::Text(body) = &folded.content else {
            panic!("expected Text content")
        };
        assert_eq!(
            body,
            "fix the bug\n\n## Dynamic /ctx\nworking on handler.py"
        );
    }

    #[test]
    fn normalize_trailing_system_concatenates_multiple_in_order() {
        // Two trailing System blocks (e.g. new_skills_tail followed by
        // dynamic_ctx) are appended in their original order, each
        // separated by "\n\n".
        let text_msg = |role: Role, body: &str| Message {
            role,
            content: MessageContent::Text(body.into()),
            rsclaw_hidden: None,
        };
        let mut conversation = vec![
            text_msg(Role::User, "go"),
            text_msg(Role::System, "FIRST"),
            text_msg(Role::System, "SECOND"),
        ];

        normalize_trailing_system(&mut conversation);

        assert_eq!(conversation.len(), 1);
        let MessageContent::Text(body) = &conversation[0].content else {
            panic!("expected Text content")
        };
        assert_eq!(body, "go\n\nFIRST\n\nSECOND");
    }

    #[test]
    fn normalize_trailing_system_noop_without_trailing_system() {
        // With no System message anywhere, the call must leave the list
        // alone: same length, same final message text.
        let text_msg = |role: Role, body: &str| Message {
            role,
            content: MessageContent::Text(body.into()),
            rsclaw_hidden: None,
        };
        let mut conversation = vec![text_msg(Role::User, "hi"), text_msg(Role::Assistant, "yo")];

        normalize_trailing_system(&mut conversation);

        assert_eq!(conversation.len(), 2);
        match &conversation[1].content {
            MessageContent::Text(tail) => assert_eq!(tail, "yo"),
            _ => panic!(),
        }
    }

    #[test]
    fn normalize_trailing_system_folds_into_user_parts() {
        // When the User content is already in Parts form (text + image),
        // the System text is folded in as a NEW trailing Text part; the
        // existing parts are not rewritten.
        let user = Message {
            role: Role::User,
            content: MessageContent::Parts(vec![
                ContentPart::Text {
                    text: "look at this".into(),
                },
                ContentPart::Image {
                    url: "https://x/y.png".into(),
                },
            ]),
            rsclaw_hidden: None,
        };
        let trailing_ctx = Message {
            role: Role::System,
            content: MessageContent::Text("CTX".into()),
            rsclaw_hidden: None,
        };
        let mut conversation = vec![user, trailing_ctx];

        normalize_trailing_system(&mut conversation);

        assert_eq!(conversation.len(), 1);
        let MessageContent::Parts(parts) = &conversation[0].content else {
            panic!("expected Parts content")
        };
        assert_eq!(parts.len(), 3);
        let ContentPart::Text { text } = &parts[2] else {
            panic!("expected appended Text part")
        };
        assert_eq!(text, "CTX");
    }

    #[test]
    fn normalize_trailing_system_drops_when_preceded_by_tool() {
        // Defensive case: a System message trailing a Tool message has
        // no user text to fold into. The System entry is removed and the
        // Tool tail stays in place, so a tool_results delta can still be
        // built from it.
        let tool = Message {
            role: Role::Tool,
            content: MessageContent::Parts(vec![ContentPart::ToolResult {
                tool_use_id: "toolu_1".into(),
                content: "result".into(),
                is_error: None,
            }]),
            rsclaw_hidden: None,
        };
        let trailing_ctx = Message {
            role: Role::System,
            content: MessageContent::Text("dynamic ctx".into()),
            rsclaw_hidden: None,
        };
        let mut conversation = vec![tool, trailing_ctx];

        normalize_trailing_system(&mut conversation);

        assert_eq!(conversation.len(), 1);
        assert!(matches!(conversation[0].role, Role::Tool));
    }

    #[test]
    fn normalize_trailing_system_skips_empty_system_blocks() {
        // An empty trailing System (text == "") must not introduce an
        // extra "\n\n" separator: empty blocks are dropped, non-empty
        // ones are folded.
        let text_msg = |role: Role, body: &str| Message {
            role,
            content: MessageContent::Text(body.into()),
            rsclaw_hidden: None,
        };
        let mut conversation = vec![
            text_msg(Role::User, "hi"),
            text_msg(Role::System, ""),
            text_msg(Role::System, "non-empty"),
        ];

        normalize_trailing_system(&mut conversation);

        assert_eq!(conversation.len(), 1);
        match &conversation[0].content {
            MessageContent::Text(body) => assert_eq!(body, "hi\n\nnon-empty"),
            _ => panic!(),
        }
    }

    #[test]
    fn turn_options_temperature_clamps_to_two_decimals() {
        // Plain serde f32 serialization leaks IEEE 754 noise: `0.6_f32`
        // widens to the f64 `0.6000000238418579`, which clutters the
        // request body, breaks request hashing in caching layers and
        // confuses anyone tailing logs. The float fields are expected to
        // serialize rounded to 2 decimals instead.
        let opts = TurnOptions {
            temperature: Some(0.6),
            top_p: Some(0.95),
            max_tokens: None,
            enable_thinking: None,
            stop: None,
            idle_ttl_secs: None,
            constrain_tool_calls: None,
        };

        // Compare serialized TEXT, not parsed values: serde_json compares
        // numbers by value, so equality against json!(0.6) could not
        // detect the noisy representation.
        let text = serde_json::to_string(&opts).unwrap();
        let value = serde_json::to_value(&opts).unwrap();

        assert!(
            text.contains("\"temperature\":0.6"),
            "expected temperature:0.6, got {text}"
        );
        assert!(
            !text.contains("0.6000000238418579"),
            "leaked f32→f64 noise: {text}"
        );
        assert!(
            text.contains("\"top_p\":0.95"),
            "expected top_p:0.95, got {text}"
        );
        // Sanity: the value form is still a JSON object.
        assert!(value.is_object());
    }

    #[test]
    fn turn_options_temperature_none_omits_field() {
        // Unset sampling parameters must be absent from the JSON rather
        // than present with a null value.
        let opts = TurnOptions {
            temperature: None,
            top_p: None,
            max_tokens: None,
            enable_thinking: None,
            stop: None,
            idle_ttl_secs: None,
            constrain_tool_calls: None,
        };

        let wire = serde_json::to_value(&opts).unwrap();

        for key in ["temperature", "top_p"] {
            assert!(wire.get(key).is_none());
        }
    }

    #[test]
    fn turn_options_constrain_tool_calls_wire_shape() {
        // Checks the three wire shapes of `constrain_tool_calls`:
        //   flag off            → field absent
        //   flag on, tools set  → bare `true`
        //   flag on, no tools   → field absent
        let user = Message {
            role: Role::User,
            content: MessageContent::Text("hi".into()),
            rsclaw_hidden: None,
        };
        let mut req = req_with(vec![user], 2, Some("k"));
        req.tools = vec![crate::ToolDef {
            name: "read_file".into(),
            description: "read".into(),
            parameters: serde_json::json!({"type": "object", "properties": {}}),
        }];

        // Off (provider default): the field must be absent so the wire
        // stays byte-identical to pre-grammar builds.
        let flag_off = serde_json::to_value(TurnOptions::from_request(&req, false)).unwrap();
        assert!(flag_off.get("constrain_tool_calls").is_none());

        // On + tools present: serialized as a bare true.
        let flag_on = serde_json::to_value(TurnOptions::from_request(&req, true)).unwrap();
        assert_eq!(flag_on["constrain_tool_calls"], true);

        // On but no tools: nothing to constrain, so the field stays
        // absent and the worker never arms a grammar for a tool-less
        // session.
        req.tools.clear();
        let toolless = serde_json::to_value(TurnOptions::from_request(&req, true)).unwrap();
        assert!(toolless.get("constrain_tool_calls").is_none());
    }

    #[test]
    fn create_session_resp_parses_replay_shape_without_prefix_id() {
        // The protocol §2.2 replay response has a session_id but NO
        // prefix_id. If the field were mandatory, parsing would fail
        // with "missing field prefix_id" and every replay path would
        // break.
        let body = r#"{
            "session_id": "rs_w7_8a3c1f2b",
            "n_prefix_tokens": 27981,
            "n_user_tokens": 612,
            "n_history_tokens": 8420,
            "n_tokens": 37013,
            "instance_id": "llama-worker-7",
            "replay_ms": 2340
        }"#;

        let parsed = serde_json::from_str::<CreateSessionResp>(body).expect("replay shape parses");

        assert_eq!(parsed.session_id, "rs_w7_8a3c1f2b");
        assert!(parsed.prefix_id.is_none());
    }

    #[test]
    fn create_session_resp_parses_create_shape_with_prefix_id() {
        // The protocol §2.1.6 (post-rename) create response includes
        // `prefix_id`, which new servers send under that name natively.
        let body = r#"{
            "session_id": "rs_w7_8a3c1f2b",
            "prefix_id": "rsclaw/2026.5.28"
        }"#;

        let parsed = serde_json::from_str::<CreateSessionResp>(body).expect("create shape parses");

        assert_eq!(parsed.prefix_id.as_deref(), Some("rsclaw/2026.5.28"));
    }

    #[test]
    fn create_session_resp_ignores_unknown_legacy_rsclaw_version() {
        // The pre-rename `rsclaw_version` key is being removed
        // server-side, but some builds still emit it next to `prefix_id`
        // mid-roll. The struct treats it as an unknown key, which serde
        // ignores silently. Production e2e against `:8443` sends exactly
        // this shape:
        //   {"prefix_id":"dynamic/...","prefix_source":"dynamic_miss",
        //    "rsclaw_version":""}
        // This regression test guards against reintroducing the earlier
        // `serde(alias)` approach, which tripped `duplicate field`
        // errors on such bodies.
        let body = r#"{
            "session_id":"rs_w7_8cebc736",
            "prefix_id":"dynamic/9e8598684ad34ff0a615899fefb811de",
            "prefix_source":"dynamic_miss",
            "rsclaw_version":""
        }"#;

        let parsed = serde_json::from_str::<CreateSessionResp>(body)
            .expect("mixed post-rename + legacy fields must parse");

        assert_eq!(parsed.session_id, "rs_w7_8cebc736");
        assert_eq!(
            parsed.prefix_id.as_deref(),
            Some("dynamic/9e8598684ad34ff0a615899fefb811de"),
        );
    }

    #[test]
    fn create_session_resp_parses_explicit_null_prefix_id() {
        // A `prefix_id: String` with `#[serde(default)]` would FAIL on
        // an explicit JSON null ("invalid type: null, expected a
        // string"), sinking the whole `/sessions` (or
        // `/sessions/replay`) response and surfacing as an opaque
        // "parse response" error to the caller. Upstream nodes
        // occasionally emit null while the version registry is
        // mid-roll. Option<String> maps null to None and keeps the rest
        // of the response usable.
        let body = r#"{"session_id":"rs_a_b","prefix_id":null}"#;

        let parsed =
            serde_json::from_str::<CreateSessionResp>(body).expect("null prefix_id must parse");

        assert_eq!(parsed.session_id, "rs_a_b");
        assert!(parsed.prefix_id.is_none());
    }

    #[test]
    fn create_session_resp_parses_missing_prefix_id() {
        // The §2.2 replay response leaves prefix_id out entirely. The
        // outcome must match the explicit-null case: a clean parse that
        // surfaces None.
        let body = r#"{"session_id":"rs_a_b"}"#;

        let parsed =
            serde_json::from_str::<CreateSessionResp>(body).expect("missing field must parse");

        assert!(parsed.prefix_id.is_none());
    }

    #[test]
    fn create_session_resp_parses_populated_prefix_id() {
        // Happy path: making the field an Option<String> must not cause
        // real string values to be coerced to None.
        let body = r#"{"session_id":"rs_a_b","prefix_id":"rsclaw/2026.5.28"}"#;

        let parsed =
            serde_json::from_str::<CreateSessionResp>(body).expect("string field must parse");

        assert_eq!(parsed.prefix_id.as_deref(), Some("rsclaw/2026.5.28"));
    }

    // -- 8-rule dispatch precedence (R1 C3) -----------------------------
    //
    // Table-driven coverage so a future drift between the comment-table
    // in `dispatch_decision` and the runtime decision is caught.

    /// Builds the minimal `LlmRequest` for the dispatch tests: only the
    /// model, endpoint hint, session key and KV-cache mode are set (plus
    /// an explicitly empty `fallback_models`); every other field takes
    /// its `Default` value.
    fn dispatch_req(
        model: &str,
        endpoint: AgentEndpoint,
        session_key: Option<&str>,
        kv_cache_mode: u8,
    ) -> LlmRequest {
        let session_key = session_key.map(String::from);
        LlmRequest {
            model: model.into(),
            endpoint,
            session_key,
            kv_cache_mode,
            fallback_models: Vec::new(),
            ..Default::default()
        }
    }

    #[test]
    fn dispatch_rule_1_flash_model_routes_fastshot() {
        // Rule 1: an rsclaw-flash-* model wins regardless of the
        // endpoint hint. Checked with the Primary hint and with an
        // explicit Vision hint — the model name decides both times.
        for endpoint in [AgentEndpoint::Primary, AgentEndpoint::Vision] {
            let req = dispatch_req("rsclaw/rsclaw-flash-v1", endpoint, None, 0);
            let route = dispatch_decision(&req).unwrap();
            assert_eq!(route, DispatchRoute::OneShot("/fastshot"));
        }
    }

    #[test]
    fn dispatch_rule_2_vision_model_routes_vision() {
        // Rule 2: a vision-family model name with a Primary hint and no
        // session goes to /vision.
        let req = dispatch_req("rsclaw/rsclaw-vision-v1", AgentEndpoint::Primary, None, 0);
        let route = dispatch_decision(&req).unwrap();
        assert_eq!(route, DispatchRoute::OneShot("/vision"));
    }

    #[test]
    fn dispatch_rule_3_agent_model_no_session_routes_oneshot() {
        // Per R2 review: a stateless call on the agent model must NOT
        // bail. It is routed to /oneshot, matching the server hint
        // "use /v1/agent/oneshot for agent model".
        let req = dispatch_req("rsclaw/rsclaw-agent-v1", AgentEndpoint::Primary, None, 0);
        let route = dispatch_decision(&req).unwrap();
        assert_eq!(route, DispatchRoute::OneShot("/oneshot"));
    }

    #[test]
    fn dispatch_rule_4_agent_model_with_session_routes_sessions() {
        // Rule 4: agent model + session key + kv_cache_mode=2 takes the
        // Sessions route.
        let req = dispatch_req(
            "rsclaw/rsclaw-agent-v1",
            AgentEndpoint::Primary,
            Some("sess-x"),
            2,
        );
        let route = dispatch_decision(&req).unwrap();
        assert_eq!(route, DispatchRoute::Sessions);
    }

    #[test]
    fn dispatch_rule_5_non_canonical_flash_endpoint_routes_fastshot() {
        // Rule 5: the model is not one of the canonical rsclaw names, so
        // the endpoint=Flash hint selects /fastshot.
        let req = dispatch_req("anthropic/claude-3-5-haiku", AgentEndpoint::Flash, None, 0);
        let route = dispatch_decision(&req).unwrap();
        assert_eq!(route, DispatchRoute::OneShot("/fastshot"));
    }

    #[test]
    fn dispatch_rule_6_non_canonical_vision_endpoint_routes_vision() {
        // Rule 6: non-canonical model + endpoint=Vision hint → /vision.
        let req = dispatch_req("anthropic/claude-3-5-sonnet", AgentEndpoint::Vision, None, 0);
        let route = dispatch_decision(&req).unwrap();
        assert_eq!(route, DispatchRoute::OneShot("/vision"));
    }

    #[test]
    fn dispatch_rule_7_primary_with_session_routes_sessions() {
        // Rule 7: non-canonical model, Primary hint, session key present
        // and kv_cache_mode=2 → Sessions.
        let req = dispatch_req(
            "anthropic/claude-3-5-sonnet",
            AgentEndpoint::Primary,
            Some("sess-y"),
            2,
        );
        let route = dispatch_decision(&req).unwrap();
        assert_eq!(route, DispatchRoute::Sessions);
    }

    #[test]
    fn dispatch_rule_8_primary_stateless_routes_oneshot() {
        // Rule 8: non-canonical model, Primary hint, no session key and
        // kv_cache_mode=0 → /oneshot.
        let req = dispatch_req("anthropic/claude-3-5-sonnet", AgentEndpoint::Primary, None, 0);
        let route = dispatch_decision(&req).unwrap();
        assert_eq!(route, DispatchRoute::OneShot("/oneshot"));
    }

    #[test]
    fn dispatch_bail_kv2_without_session_key() {
        // Safety net (R1 C2): asking for kv_cache_mode=2 without a
        // session_key is rejected BEFORE any route is picked, so a caller
        // cannot be silently downgraded to /oneshot and lose kvCache
        // continuity. The error text must name both halves of the problem.
        let req = dispatch_req("anthropic/claude-3-5-sonnet", AgentEndpoint::Primary, None, 2);
        let err = dispatch_decision(&req).unwrap_err().to_string();
        assert!(err.contains("session_key"), "got: {err}");
        assert!(err.contains("kv_cache_mode=2"), "got: {err}");
    }

    #[test]
    fn dispatch_bail_session_without_kv2() {
        // The Sessions path requires kv_cache_mode=2; a session key paired
        // with mode 1 must be rejected with an error that says so.
        let req = dispatch_req(
            "anthropic/claude-3-5-sonnet",
            AgentEndpoint::Primary,
            Some("sess-z"),
            1,
        );
        let err = dispatch_decision(&req).unwrap_err().to_string();
        assert!(err.contains("kv_cache_mode=2"), "got: {err}");
    }

    #[test]
    fn dispatch_canonical_model_overrides_endpoint_hint() {
        // Flash-family model combined with session-mode inputs
        // (session_key set, kv_cache_mode=2) and a Primary hint. The route
        // must still be /fastshot: rule 1 fires regardless of
        // session_key / kv_cache_mode, because the server-side /fastshot
        // whitelist accepts only rsclaw-flash-*. (Server may 400 on the
        // session_key field but routing is correct at the client.)
        let req = dispatch_req(
            "rsclaw/rsclaw-flash-v1",
            AgentEndpoint::Primary,
            Some("sess-q"),
            2,
        );
        let route = dispatch_decision(&req).unwrap();
        assert_eq!(route, DispatchRoute::OneShot("/fastshot"));
    }

    #[test]
    fn dispatch_rule_3_overrides_rule_5_for_agent_model() {
        // Precedence check: rule 3 (agent model, no session) is evaluated
        // before rule 5 (endpoint=Flash hint). An agent-* model carrying a
        // Flash hint therefore lands on /oneshot, NOT /fastshot (the
        // server would 400 the agent model on /fastshot).
        let req = dispatch_req("rsclaw/rsclaw-agent-v1", AgentEndpoint::Flash, None, 0);
        let route = dispatch_decision(&req).unwrap();
        assert_eq!(route, DispatchRoute::OneShot("/oneshot"));
    }

    #[test]
    fn rejects_non_kv2_mode() {
        let provider = RsclawProvider::new("http://x", None);
        let req = req_with(vec![], 1, Some("k"));
        let err = match futures::executor::block_on(provider.stream(req)) {
            Ok(_) => panic!("expected error for kv_cache_mode=1"),
            Err(e) => e,
        };
        assert!(err.to_string().contains("kv_cache_mode=2"));
    }

    // Renamed + re-scoped: the previous version expected an error when
    // session_key was `None` with kv_cache_mode=2. That combination is
    // pinned at the `dispatch_decision` level by
    // `dispatch_bail_kv2_without_session_key`; the session-mode contract
    // pinned here is the inverse: session_key=Some + kv_cache_mode!=2
    // must error rather than silently mis-route.
    // NOTE(review): this comment used to say the dispatch refactor
    // (commit cc6314a) routes session_key=None to /oneshot regardless of
    // kv_cache_mode, which contradicts that bail test — confirm which
    // one reflects current behaviour.
    // (Also: tokio::test instead of futures::executor::block_on —
    // provider.stream calls into reqwest, which needs a tokio reactor on
    // the current thread.)
    #[tokio::test]
    async fn rejects_session_mode_without_kv_cache_mode_2() {
        // session_key is set but kv_cache_mode is 0: the stream call must
        // fail, and the error text must point at kv_cache_mode=2.
        let provider = RsclawProvider::new("http://x", None);
        let req = req_with(vec![], 0, Some("session-xyz"));
        let err = provider
            .stream(req)
            .await
            .err()
            .expect("expected error for session_key + kv_cache_mode!=2");
        let rendered = err.to_string();
        assert!(
            rendered.contains("kv_cache_mode=2"),
            "unexpected error text: {err}"
        );
    }

    // ----- compact splice wire shape (§2.4) -------------------------------

    #[test]
    fn compact_splice_req_serialises_post_2_4_shape() {
        // Wire-shape pin so rsclaw-server and the gateway cannot drift
        // apart unnoticed. `expected_msgs_count` is optional: when it is
        // `None` the key MUST be absent from the body (not emitted as
        // `"expected_msgs_count": null`) so that a server which has not
        // shipped the field yet does not 400.
        let encode = |expected_msgs_count| {
            serde_json::to_value(&CompactSpliceReq {
                keep_head_messages: 2,
                summary: "<sum>",
                keep_tail_messages: 10,
                expected_msgs_count,
            })
            .unwrap()
        };

        // With the optional field populated, all four keys appear.
        let populated = encode(Some(80));
        assert_eq!(populated["keep_head_messages"], 2);
        assert_eq!(populated["summary"], "<sum>");
        assert_eq!(populated["keep_tail_messages"], 10);
        assert_eq!(populated["expected_msgs_count"], 80);

        // With it unset, the key is dropped entirely.
        let omitted = encode(None);
        assert!(
            omitted.get("expected_msgs_count").is_none(),
            "None must be omitted from the wire body, not emitted as null"
        );
    }

    #[test]
    fn compact_splice_resp_parses_happy_shape() {
        // A successful compact response carries the session id plus the
        // post-splice message and token counts; all three must decode.
        let wire = r#"{"session_id":"rs_w7_abc","msgs_count":13,"tokens_count":8421}"#;
        let parsed = serde_json::from_str::<CompactSpliceResp>(wire)
            .expect("happy compact response must parse");
        assert_eq!(parsed.session_id, "rs_w7_abc");
        assert_eq!(parsed.msgs_count, 13);
        assert_eq!(parsed.tokens_count, 8421);
    }

    #[test]
    fn compact_splice_trait_default_returns_err_for_non_rsclaw() {
        // The trait's default `compact_splice` is what non-rsclaw
        // providers inherit. It should fail with a "not supported" error
        // that names the provider, so callers can fall back cleanly.
        // Exercised here through a minimal stand-in implementing only the
        // required trait methods.
        use crate::LlmProvider;

        struct StubProvider;

        impl LlmProvider for StubProvider {
            fn name(&self) -> &str {
                "stub"
            }

            fn stream(
                &self,
                _req: crate::LlmRequest,
            ) -> futures::future::BoxFuture<'_, anyhow::Result<crate::LlmStream>> {
                Box::pin(async { anyhow::bail!("stub provider has no streaming") })
            }
        }

        let stub = StubProvider;
        let outcome = futures::executor::block_on(stub.compact_splice("k", 2, "x", 10, None));
        let msg = outcome.expect_err("default impl must Err").to_string();

        let mentions_unsupported = msg.contains("not supported");
        let mentions_provider = msg.contains("stub");
        assert!(
            mentions_unsupported && mentions_provider,
            "default impl Err should name the provider: {msg}"
        );
    }

    #[tokio::test]
    async fn compact_splice_errs_when_no_cached_session() {
        // With no cached SessionEntry for `session_key`, the splice is
        // expected to give up BEFORE issuing any HTTP call — there is no
        // point splicing a session we don't think is open. The caller
        // (compact_inner) sees the Err and falls back to the replay path.
        use crate::LlmProvider;

        let provider = RsclawProvider::new("http://nonexistent-host.invalid", None);
        let outcome = provider
            .compact_splice("missing-key", 2, "summary", 10, None)
            .await;
        let msg = outcome
            .expect_err("should Err when no cached SessionEntry exists")
            .to_string();
        assert!(
            msg.contains("no cached session"),
            "Err message should mention missing cached session, got: {msg}"
        );
    }

    #[tokio::test]
    async fn compact_splice_updates_last_seen_msgs_len_on_success() {
        // Pins the state mutation that must follow a successful splice:
        // the cached SessionEntry.last_seen_msgs_len has to be rewritten
        // to the gateway-local figure (head + 1 + tail). If it were left
        // stale, the next turn's lookup_and_bump would (incorrectly) see
        // msgs.len() < last_seen and force an unnecessary replay.
        use wiremock::{
            Mock, MockServer, ResponseTemplate,
            matchers::{method, path},
        };

        use crate::LlmProvider;

        const KEY: &str = "test-key";

        let server = MockServer::start().await;
        let session_id = "rs_w7_abc";

        // The compact endpoint answers 200 exactly once.
        let ok_body = serde_json::json!({
            "session_id": session_id,
            "msgs_count": 13,
            "tokens_count": 8421,
        });
        Mock::given(method("POST"))
            .and(path(format!("/sessions/{session_id}/compact")))
            .respond_with(ResponseTemplate::new(200).set_body_json(ok_body))
            .expect(1)
            .mount(&server)
            .await;

        let provider = RsclawProvider::new(server.uri(), None);

        // Seed the session cache so the splice has an entry to work on.
        // last_seen_msgs_len starts at 50 (a typical pre-compact value);
        // after success it should read 13 (head=2 + summary=1 + tail=10).
        let seeded = SessionEntry {
            session_id: session_id.to_owned(),
            prefix_id: RSCLAW_DEFAULT_PREFIX_ID.to_owned(),
            last_seen_msgs_len: 50,
        };
        provider.lock_sessions().insert(KEY.to_owned(), seeded);

        let result = provider
            .compact_splice(KEY, 2, "<summary>", 10, Some(50))
            .await
            .expect("happy-path splice should succeed");
        assert_eq!(result, 13, "trait method returns server's msgs_count");

        // Inspect the cached entry after the splice.
        let sessions = provider.lock_sessions();
        let entry = sessions
            .get(KEY)
            .expect("SessionEntry must still exist after splice — id is preserved");
        assert_eq!(
            entry.last_seen_msgs_len, 13,
            "last_seen_msgs_len must be updated to head(2) + summary(1) + tail(10)"
        );
        assert_eq!(
            entry.session_id, session_id,
            "session_id MUST be unchanged across splice (§2.4 invariant)"
        );
        assert_eq!(
            entry.prefix_id, RSCLAW_DEFAULT_PREFIX_ID,
            "prefix_id must be unchanged"
        );
    }

    #[test]
    fn compact_splice_409_body_parses() {
        // The 409 body nests the server's count under `error.current`;
        // it must decode into CompactSplice409.
        let wire = r#"{"error":{"code":"msg_count_mismatch","detail":"expected 50, got 52","current":52}}"#;
        let decoded = serde_json::from_str::<CompactSplice409>(wire).expect("409 body must parse");
        assert_eq!(decoded.error.current, 52);
    }

    #[tokio::test]
    async fn compact_splice_retries_on_409_then_succeeds() {
        // Protocol §6.3.1: a 409 msg_count_mismatch is an
        // optimistic-concurrency signal, NOT a fatal error. The client has
        // to read `current`, re-align and retry — a non-atomic turn leaves
        // the server's slot count ahead of ours. Scenario: the first POST
        // 409s (server has 52, we sent 50); the client grows the tail by
        // the delta (2) and the retry succeeds. No replay fallback, and
        // the session_id is preserved.
        use wiremock::{
            Mock, MockServer, ResponseTemplate,
            matchers::{method, path},
        };

        use crate::LlmProvider;

        const KEY: &str = "retry-key";

        let server = MockServer::start().await;
        let session_id = "rs_w7_retry";
        let compact_path = format!("/sessions/{session_id}/compact");

        // Mounted first and limited to one hit: the 409 carrying the
        // server's authoritative current=52.
        let conflict_body = serde_json::json!({
            "error": {
                "code": "msg_count_mismatch",
                "detail": "expected 50, got 52",
                "current": 52
            }
        });
        Mock::given(method("POST"))
            .and(path(compact_path.as_str()))
            .respond_with(ResponseTemplate::new(409).set_body_json(conflict_body))
            .up_to_n_times(1)
            .expect(1)
            .mount(&server)
            .await;

        // Mounted second: the success for the re-aligned retry.
        // keep_tail grew 10→12, so the server reports
        // head(2)+summary(1)+tail(12)=15.
        let success_body = serde_json::json!({
            "session_id": session_id,
            "msgs_count": 15,
            "tokens_count": 9000,
        });
        Mock::given(method("POST"))
            .and(path(compact_path.as_str()))
            .respond_with(ResponseTemplate::new(200).set_body_json(success_body))
            .expect(1)
            .mount(&server)
            .await;

        // Seed the cache with a pre-compact entry (last_seen = 50).
        let provider = RsclawProvider::new(server.uri(), None);
        let seeded = SessionEntry {
            session_id: session_id.to_owned(),
            prefix_id: RSCLAW_DEFAULT_PREFIX_ID.to_owned(),
            last_seen_msgs_len: 50,
        };
        provider.lock_sessions().insert(KEY.to_owned(), seeded);

        let result = provider
            .compact_splice(KEY, 2, "<summary>", 10, Some(50))
            .await
            .expect("splice should succeed after one 409 retry");
        assert_eq!(
            result, 15,
            "returns server msgs_count from the retried call"
        );

        let sessions = provider.lock_sessions();
        let entry = sessions.get(KEY).expect("entry preserved");
        assert_eq!(
            entry.last_seen_msgs_len, 15,
            "last_seen recomputed from the GROWN tail: head(2)+summary(1)+tail(12)"
        );
    }
}