Skip to main content

mockforge_http/
llm_mock.rs

1//! Mock LLM endpoint (#912, #915).
2//!
3//! Serves OpenAI-compatible (`POST /v1/chat/completions`, `GET /v1/models`)
4//! and Anthropic-compatible (`POST /v1/messages`) endpoints so an agent
5//! (Cursor, Claude Code, ChatGPT clients, custom agents) can point its base
6//! URL at MockForge and receive correctly-shaped completions with realistic
7//! envelopes (ids, `usage` token counts, `finish_reason` / `stop_reason`) and
8//! SSE streaming when the caller sets `stream: true`.
9//!
10//! Four modes ([`LlmMockMode`]), all opt-in via `--llm-mock-mode`; the default
11//! stays a pure offline mock:
12//! - `mock` (default): canned/templated text, never calls out. Deterministic.
13//! - `proxy`: forward every request to a configured OpenAI/Anthropic-compatible
14//!   upstream and return the real response (a man-in-the-middle for agent<->LLM
15//!   traffic; combine with `--latency`/`--failures` for chaos on real traffic).
16//! - `record`: on a cassette miss, forward to upstream and save the response;
17//!   on a hit, replay from the cassette. Real content, deterministic after
18//!   warm-up.
19//! - `replay`: serve only from the cassette (fully offline); a miss falls back
20//!   to the canned reply.
21//!
22//! Any request the caller sends is already in the upstream's wire shape, so
23//! upstream calls forward the model + messages verbatim (always non-streaming);
24//! streaming clients get the resolved text re-chunked locally.
25//!
26//! Mounted by `mockforge serve --llm-mock`.
27
28use axum::{
29    extract::State,
30    response::{
31        sse::{Event, KeepAlive, Sse},
32        IntoResponse, Response,
33    },
34    routing::{get, post},
35    Json, Router,
36};
37use futures::stream::{self, Stream};
38use serde_json::{json, Value};
39use std::collections::HashMap;
40use std::convert::Infallible;
41use std::path::PathBuf;
42use std::sync::{Arc, Mutex};
43use std::time::Duration;
44
/// Selects where the mock LLM endpoint gets the text it replies with.
#[derive(Debug, Default, Clone, Copy, Eq, PartialEq)]
pub enum LlmMockMode {
    /// Serve canned/templated text and never contact an upstream, so replies
    /// are deterministic and work offline. This is the default.
    #[default]
    Mock,
    /// Send every request to the configured upstream and hand back its reply.
    Proxy,
    /// Replay from the cassette on a hit; on a miss, ask the upstream and
    /// record what it returned.
    Record,
    /// Serve from the cassette only (offline); a miss yields the canned reply.
    Replay,
}
58
59impl std::str::FromStr for LlmMockMode {
60    type Err = String;
61    fn from_str(s: &str) -> Result<Self, Self::Err> {
62        match s.trim().to_lowercase().as_str() {
63            "mock" | "canned" | "off" => Ok(Self::Mock),
64            "proxy" | "passthrough" => Ok(Self::Proxy),
65            "record" => Ok(Self::Record),
66            "replay" => Ok(Self::Replay),
67            _ => Err(format!("unknown --llm-mock-mode '{s}' (mock|proxy|record|replay)")),
68        }
69    }
70}
71
72impl std::fmt::Display for LlmMockMode {
73    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
74        f.write_str(match self {
75            Self::Mock => "mock",
76            Self::Proxy => "proxy",
77            Self::Record => "record",
78            Self::Replay => "replay",
79        })
80    }
81}
82
83/// Runtime configuration for the mock LLM endpoint.
84#[derive(Clone, Debug)]
85pub struct LlmMockConfig {
86    /// Canned assistant reply used when no per-request override applies.
87    pub canned_reply: String,
88    /// Model id echoed back in responses when the request omits one.
89    pub default_model: String,
90    /// When true, prepend a short echo of the user's last message to the
91    /// reply so callers can confirm round-trip wiring.
92    pub echo_prompt: bool,
93    /// Per-chunk delay for streaming responses (milliseconds). 0 = no delay.
94    pub stream_chunk_delay_ms: u64,
95    /// Reply sourcing mode. Defaults to [`LlmMockMode::Mock`].
96    pub mode: LlmMockMode,
97    /// Base URL of the OpenAI/Anthropic-compatible upstream (no trailing path),
98    /// e.g. `https://api.openai.com` or `http://localhost:11434`. Required for
99    /// `proxy` and `record`.
100    pub upstream_base_url: Option<String>,
101    /// API key forwarded to the upstream (Bearer for OpenAI, `x-api-key` for
102    /// Anthropic). When None, no auth header is sent (fine for local upstreams).
103    pub upstream_api_key: Option<String>,
104    /// Cassette file for `record` / `replay`. Loaded at startup; `record`
105    /// rewrites it as new prompts are captured.
106    pub cassette_path: Option<PathBuf>,
107}
108
109impl Default for LlmMockConfig {
110    fn default() -> Self {
111        Self {
112            canned_reply: "This is a mock response from MockForge's LLM endpoint.".to_string(),
113            default_model: "mockforge-mock-1".to_string(),
114            echo_prompt: true,
115            stream_chunk_delay_ms: 0,
116            mode: LlmMockMode::Mock,
117            upstream_base_url: None,
118            upstream_api_key: None,
119            cassette_path: None,
120        }
121    }
122}
123
124/// One recorded completion, keyed by a hash of (endpoint, model, messages).
125#[derive(Clone, Debug, serde::Serialize, serde::Deserialize)]
126struct CassetteEntry {
127    text: String,
128    prompt_tokens: u32,
129    completion_tokens: u32,
130}
131
132/// In-memory cassette backed by a JSON file on disk.
133#[derive(Default)]
134struct Cassette {
135    entries: HashMap<String, CassetteEntry>,
136    path: Option<PathBuf>,
137}
138
139impl Cassette {
140    fn load(path: Option<PathBuf>) -> Self {
141        let entries = path
142            .as_ref()
143            .and_then(|p| std::fs::read(p).ok())
144            .and_then(|b| serde_json::from_slice::<HashMap<String, CassetteEntry>>(&b).ok())
145            .unwrap_or_default();
146        Self { entries, path }
147    }
148
149    fn save(&self) {
150        if let Some(ref p) = self.path {
151            if let Ok(json) = serde_json::to_string_pretty(&self.entries) {
152                let _ = std::fs::write(p, json);
153            }
154        }
155    }
156}
157
158/// Router state: config plus the shared cassette and an HTTP client for
159/// upstream calls. Cheap to clone (Arc + reqwest::Client are handle types).
160#[derive(Clone)]
161pub struct LlmMockState {
162    config: LlmMockConfig,
163    cassette: Arc<Mutex<Cassette>>,
164    http: reqwest::Client,
165}
166
167impl LlmMockState {
168    /// Build state from config, loading any existing cassette from disk.
169    pub fn new(config: LlmMockConfig) -> Self {
170        let cassette = Cassette::load(config.cassette_path.clone());
171        Self {
172            config,
173            cassette: Arc::new(Mutex::new(cassette)),
174            http: reqwest::Client::new(),
175        }
176    }
177}
178
179/// Build the axum router exposing the mock LLM endpoints.
180pub fn router(config: LlmMockConfig) -> Router {
181    Router::new()
182        .route("/v1/chat/completions", post(chat_completions))
183        .route("/v1/models", get(list_models))
184        .route("/v1/messages", post(anthropic_messages))
185        .with_state(LlmMockState::new(config))
186}
187
/// The wire format a request arrived in, which is also the format used when
/// talking to the upstream.
#[derive(Clone, Copy, Eq, PartialEq)]
enum Dialect {
    /// OpenAI chat-completions shape (`/v1/chat/completions`).
    OpenAi,
    /// Anthropic messages shape (`/v1/messages`).
    Anthropic,
}
194
/// Outcome of reply resolution: the reply text and the token counts to report
/// in the response's usage section.
struct Resolved {
    /// Assistant reply text.
    text: String,
    /// Token count attributed to the prompt.
    prompt_tokens: u32,
    /// Token count attributed to the reply.
    completion_tokens: u32,
}
201
202// ---------------------------------------------------------------------------
203// Shared helpers
204// ---------------------------------------------------------------------------
205
/// Rough token estimate: the number of whitespace-separated words in `text`,
/// never less than 1. Real tokenizers give different numbers; this is only
/// meant to be plausible and to grow with the length of the text.
fn approx_tokens(text: &str) -> u32 {
    let words = text.split_whitespace().count();
    // Empty or all-whitespace text still reports a single token.
    std::cmp::max(words, 1) as u32
}
212
213/// Extract the last user message text from a list of OpenAI/Anthropic-shaped
214/// messages. `content` may be a plain string or an array of content blocks
215/// (`[{"type":"text","text":"..."}]`); both are handled.
216fn last_user_text(messages: &[Value]) -> String {
217    for m in messages.iter().rev() {
218        if m.get("role").and_then(|r| r.as_str()) == Some("user") {
219            return content_to_text(m.get("content"));
220        }
221    }
222    // Fall back to the last message of any role.
223    messages.last().map(|m| content_to_text(m.get("content"))).unwrap_or_default()
224}
225
226fn content_to_text(content: Option<&Value>) -> String {
227    match content {
228        Some(Value::String(s)) => s.clone(),
229        Some(Value::Array(parts)) => parts
230            .iter()
231            .filter_map(|p| p.get("text").and_then(|t| t.as_str()))
232            .collect::<Vec<_>>()
233            .join(" "),
234        _ => String::new(),
235    }
236}
237
238/// Produce the deterministic reply text for a request.
239fn build_reply(config: &LlmMockConfig, messages: &[Value]) -> String {
240    if config.echo_prompt {
241        let prompt = last_user_text(messages);
242        if !prompt.is_empty() {
243            let trimmed: String = prompt.chars().take(120).collect();
244            return format!("{} (you said: \"{}\")", config.canned_reply, trimmed);
245        }
246    }
247    config.canned_reply.clone()
248}
249
/// Build a deterministic id: `prefix` followed by a 16-digit lowercase hex
/// hash of `seed`. Identical seeds always give identical ids, which keeps
/// snapshot tests stable without a random/uuid dependency.
///
/// The hash is an FNV-1a style xor-then-multiply over the seed's bytes.
///
/// NOTE(review): the starting value is not the canonical 64-bit FNV-1a offset
/// basis (14695981039346656037); it appears to be missing the final digit.
/// It is kept unchanged because cassette keys are derived from this function,
/// so changing it would orphan every recorded entry.
fn stable_id(prefix: &str, seed: &str) -> String {
    const START: u64 = 1469598103934665603;
    const FNV_PRIME: u64 = 1099511628211;
    let digest = seed
        .bytes()
        .fold(START, |acc, byte| (acc ^ u64::from(byte)).wrapping_mul(FNV_PRIME));
    format!("{prefix}{digest:016x}")
}
261
/// Split a reply into streaming "tokens" for chunked SSE delivery.
///
/// Each chunk is one word together with the whitespace that precedes it, so
/// the first chunk of `"alpha beta"` is `"alpha"` and the second `" beta"`.
/// The split is lossless: concatenating the chunks gives back `reply` exactly,
/// including newlines, tabs, repeated spaces and leading/trailing whitespace.
/// That keeps streamed output identical to the non-streamed reply.
///
/// Always returns at least one chunk; an empty or whitespace-only reply comes
/// back as a single chunk holding the reply unchanged.
fn stream_chunks(reply: &str) -> Vec<String> {
    let mut out = Vec::new();
    // Byte offset where the chunk currently being scanned begins.
    let mut start = 0;
    // Whether the current chunk already contains a non-whitespace character.
    let mut in_word = false;

    for (idx, ch) in reply.char_indices() {
        if ch.is_whitespace() {
            // Whitespace right after a word closes that word's chunk and
            // becomes the leading whitespace of the next one.
            if in_word {
                out.push(reply[start..idx].to_string());
                start = idx;
                in_word = false;
            }
        } else {
            in_word = true;
        }
    }

    let tail = &reply[start..];
    match out.last_mut() {
        // Trailing whitespace with no word after it rides on the last chunk.
        Some(last) if !in_word => last.push_str(tail),
        // Otherwise the tail is the final word, or the whole reply when it
        // contained no word boundary at all.
        _ => out.push(tail.to_string()),
    }
    out
}
278
279// ---------------------------------------------------------------------------
280// OpenAI: GET /v1/models
281// ---------------------------------------------------------------------------
282
283async fn list_models(State(state): State<LlmMockState>) -> Json<Value> {
284    Json(json!({
285        "object": "list",
286        "data": [{
287            "id": state.config.default_model,
288            "object": "model",
289            "created": 0,
290            "owned_by": "mockforge",
291        }],
292    }))
293}
294
295// ---------------------------------------------------------------------------
296// OpenAI: POST /v1/chat/completions
297// ---------------------------------------------------------------------------
298
299async fn chat_completions(State(state): State<LlmMockState>, Json(body): Json<Value>) -> Response {
300    let model = body
301        .get("model")
302        .and_then(|m| m.as_str())
303        .unwrap_or(&state.config.default_model)
304        .to_string();
305    let messages: Vec<Value> =
306        body.get("messages").and_then(|m| m.as_array()).cloned().unwrap_or_default();
307    let stream = body.get("stream").and_then(|s| s.as_bool()).unwrap_or(false);
308
309    let r = resolve_reply(&state, Dialect::OpenAi, &model, &messages).await;
310    let id = stable_id("chatcmpl-", &r.text);
311
312    if stream {
313        return openai_stream(state.config.stream_chunk_delay_ms, id, model, r.text)
314            .into_response();
315    }
316
317    Json(json!({
318        "id": id,
319        "object": "chat.completion",
320        "created": 0,
321        "model": model,
322        "choices": [{
323            "index": 0,
324            "message": { "role": "assistant", "content": r.text },
325            "finish_reason": "stop",
326        }],
327        "usage": {
328            "prompt_tokens": r.prompt_tokens,
329            "completion_tokens": r.completion_tokens,
330            "total_tokens": r.prompt_tokens + r.completion_tokens,
331        },
332    }))
333    .into_response()
334}
335
336fn openai_stream(
337    delay_ms: u64,
338    id: String,
339    model: String,
340    reply: String,
341) -> Sse<impl Stream<Item = Result<Event, Infallible>>> {
342    let mut events: Vec<Event> = Vec::new();
343    // Initial role delta.
344    events.push(sse_json(&json!({
345        "id": id, "object": "chat.completion.chunk", "created": 0, "model": model,
346        "choices": [{ "index": 0, "delta": { "role": "assistant" }, "finish_reason": Value::Null }],
347    })));
348    // One content delta per token.
349    for chunk in stream_chunks(&reply) {
350        events.push(sse_json(&json!({
351            "id": id, "object": "chat.completion.chunk", "created": 0, "model": model,
352            "choices": [{ "index": 0, "delta": { "content": chunk }, "finish_reason": Value::Null }],
353        })));
354    }
355    // Terminal chunk + [DONE].
356    events.push(sse_json(&json!({
357        "id": id, "object": "chat.completion.chunk", "created": 0, "model": model,
358        "choices": [{ "index": 0, "delta": {}, "finish_reason": "stop" }],
359    })));
360    events.push(Event::default().data("[DONE]"));
361
362    sse_response(events, delay_ms)
363}
364
365// ---------------------------------------------------------------------------
366// Anthropic: POST /v1/messages
367// ---------------------------------------------------------------------------
368
369async fn anthropic_messages(
370    State(state): State<LlmMockState>,
371    Json(body): Json<Value>,
372) -> Response {
373    let model = body
374        .get("model")
375        .and_then(|m| m.as_str())
376        .unwrap_or(&state.config.default_model)
377        .to_string();
378    let messages: Vec<Value> =
379        body.get("messages").and_then(|m| m.as_array()).cloned().unwrap_or_default();
380    let stream = body.get("stream").and_then(|s| s.as_bool()).unwrap_or(false);
381
382    let r = resolve_reply(&state, Dialect::Anthropic, &model, &messages).await;
383    let id = stable_id("msg_", &r.text);
384
385    if stream {
386        return anthropic_stream(
387            state.config.stream_chunk_delay_ms,
388            id,
389            model,
390            r.text,
391            r.prompt_tokens,
392            r.completion_tokens,
393        )
394        .into_response();
395    }
396
397    Json(json!({
398        "id": id,
399        "type": "message",
400        "role": "assistant",
401        "model": model,
402        "content": [{ "type": "text", "text": r.text }],
403        "stop_reason": "end_turn",
404        "stop_sequence": Value::Null,
405        "usage": { "input_tokens": r.prompt_tokens, "output_tokens": r.completion_tokens },
406    }))
407    .into_response()
408}
409
410fn anthropic_stream(
411    delay_ms: u64,
412    id: String,
413    model: String,
414    reply: String,
415    input_tokens: u32,
416    output_tokens: u32,
417) -> Sse<impl Stream<Item = Result<Event, Infallible>>> {
418    let mut events: Vec<Event> = Vec::new();
419    events.push(sse_named(
420        "message_start",
421        &json!({
422            "type": "message_start",
423            "message": {
424                "id": id, "type": "message", "role": "assistant", "model": model,
425                "content": [], "stop_reason": Value::Null, "stop_sequence": Value::Null,
426                "usage": { "input_tokens": input_tokens, "output_tokens": 0 },
427            },
428        }),
429    ));
430    events.push(sse_named(
431        "content_block_start",
432        &json!({ "type": "content_block_start", "index": 0, "content_block": { "type": "text", "text": "" } }),
433    ));
434    for chunk in stream_chunks(&reply) {
435        events.push(sse_named(
436            "content_block_delta",
437            &json!({ "type": "content_block_delta", "index": 0, "delta": { "type": "text_delta", "text": chunk } }),
438        ));
439    }
440    events.push(sse_named(
441        "content_block_stop",
442        &json!({ "type": "content_block_stop", "index": 0 }),
443    ));
444    events.push(sse_named(
445        "message_delta",
446        &json!({ "type": "message_delta", "delta": { "stop_reason": "end_turn", "stop_sequence": Value::Null }, "usage": { "output_tokens": output_tokens } }),
447    ));
448    events.push(sse_named("message_stop", &json!({ "type": "message_stop" })));
449
450    sse_response(events, delay_ms)
451}
452
453// ---------------------------------------------------------------------------
454// Reply resolution: mock / proxy / record / replay
455// ---------------------------------------------------------------------------
456
457/// Stable cassette key for a request: FNV hash of dialect + model + canonical
458/// messages JSON. serde_json's default `Map` sorts keys, so the key is stable
459/// across runs for identical inputs.
460fn cassette_key(dialect: Dialect, model: &str, messages: &[Value]) -> String {
461    let d = match dialect {
462        Dialect::OpenAi => "openai",
463        Dialect::Anthropic => "anthropic",
464    };
465    let msgs = serde_json::to_string(messages).unwrap_or_default();
466    stable_id("", &format!("{d}\n{model}\n{msgs}"))
467}
468
469/// Build the canned reply + approximate token counts (the default path and the
470/// fallback for every mode when the upstream/cassette can't serve).
471fn canned_resolved(cfg: &LlmMockConfig, messages: &[Value]) -> Resolved {
472    let text = build_reply(cfg, messages);
473    let prompt_text = messages
474        .iter()
475        .map(|m| content_to_text(m.get("content")))
476        .collect::<Vec<_>>()
477        .join(" ");
478    Resolved {
479        prompt_tokens: approx_tokens(&prompt_text),
480        completion_tokens: approx_tokens(&text),
481        text,
482    }
483}
484
485fn cassette_lookup(state: &LlmMockState, key: &str) -> Option<Resolved> {
486    let e = state.cassette.lock().ok()?.entries.get(key).cloned()?;
487    Some(Resolved {
488        text: e.text,
489        prompt_tokens: e.prompt_tokens,
490        completion_tokens: e.completion_tokens,
491    })
492}
493
494/// Resolve the reply text + token counts per the configured mode. Upstream and
495/// cassette failures degrade to the canned reply so the mock never hard-fails a
496/// caller (it logs the reason instead).
497async fn resolve_reply(
498    state: &LlmMockState,
499    dialect: Dialect,
500    model: &str,
501    messages: &[Value],
502) -> Resolved {
503    let cfg = &state.config;
504    match cfg.mode {
505        LlmMockMode::Mock => canned_resolved(cfg, messages),
506        LlmMockMode::Replay => {
507            let key = cassette_key(dialect, model, messages);
508            cassette_lookup(state, &key).unwrap_or_else(|| {
509                tracing::warn!(target: "mockforge::llm_mock", "replay cassette miss (key {key}); serving canned reply");
510                canned_resolved(cfg, messages)
511            })
512        }
513        LlmMockMode::Record => {
514            let key = cassette_key(dialect, model, messages);
515            if let Some(hit) = cassette_lookup(state, &key) {
516                return hit;
517            }
518            match call_upstream(state, dialect, model, messages).await {
519                Ok(r) => {
520                    if let Ok(mut c) = state.cassette.lock() {
521                        c.entries.insert(
522                            key,
523                            CassetteEntry {
524                                text: r.text.clone(),
525                                prompt_tokens: r.prompt_tokens,
526                                completion_tokens: r.completion_tokens,
527                            },
528                        );
529                        c.save();
530                    }
531                    r
532                }
533                Err(e) => {
534                    tracing::error!(target: "mockforge::llm_mock", "record: upstream call failed ({e}); serving canned reply");
535                    canned_resolved(cfg, messages)
536                }
537            }
538        }
539        LlmMockMode::Proxy => match call_upstream(state, dialect, model, messages).await {
540            Ok(r) => r,
541            Err(e) => {
542                tracing::error!(target: "mockforge::llm_mock", "proxy: upstream call failed ({e}); serving canned reply");
543                canned_resolved(cfg, messages)
544            }
545        },
546    }
547}
548
549/// Forward a request to the configured OpenAI/Anthropic-compatible upstream
550/// (always non-streaming) and parse the reply text + token usage back out.
551async fn call_upstream(
552    state: &LlmMockState,
553    dialect: Dialect,
554    model: &str,
555    messages: &[Value],
556) -> Result<Resolved, String> {
557    let base = state
558        .config
559        .upstream_base_url
560        .as_deref()
561        .ok_or("no upstream configured (set --llm-mock-upstream)")?
562        .trim_end_matches('/');
563
564    let (url, body) = match dialect {
565        Dialect::OpenAi => (
566            format!("{base}/v1/chat/completions"),
567            json!({ "model": model, "messages": messages, "stream": false }),
568        ),
569        // Anthropic requires max_tokens on the request.
570        Dialect::Anthropic => (
571            format!("{base}/v1/messages"),
572            json!({ "model": model, "messages": messages, "max_tokens": 1024, "stream": false }),
573        ),
574    };
575
576    let mut req = state.http.post(&url).json(&body);
577    if let Some(ref k) = state.config.upstream_api_key {
578        req = match dialect {
579            Dialect::OpenAi => req.header("authorization", format!("Bearer {k}")),
580            Dialect::Anthropic => {
581                req.header("x-api-key", k).header("anthropic-version", "2023-06-01")
582            }
583        };
584    }
585
586    let resp = req.send().await.map_err(|e| e.to_string())?;
587    let status = resp.status();
588    if !status.is_success() {
589        return Err(format!("{url} returned HTTP {status}"));
590    }
591    let v: Value = resp.json().await.map_err(|e| e.to_string())?;
592
593    let (text, pt, ct) = match dialect {
594        Dialect::OpenAi => (
595            v.pointer("/choices/0/message/content")
596                .and_then(|x| x.as_str())
597                .unwrap_or_default(),
598            v.pointer("/usage/prompt_tokens").and_then(|x| x.as_u64()).unwrap_or(0) as u32,
599            v.pointer("/usage/completion_tokens").and_then(|x| x.as_u64()).unwrap_or(0) as u32,
600        ),
601        Dialect::Anthropic => (
602            v.pointer("/content/0/text").and_then(|x| x.as_str()).unwrap_or_default(),
603            v.pointer("/usage/input_tokens").and_then(|x| x.as_u64()).unwrap_or(0) as u32,
604            v.pointer("/usage/output_tokens").and_then(|x| x.as_u64()).unwrap_or(0) as u32,
605        ),
606    };
607    let text = text.to_string();
608    // Backfill token counts if the upstream omitted usage.
609    let prompt_tokens = if pt > 0 {
610        pt
611    } else {
612        let pj = messages
613            .iter()
614            .map(|m| content_to_text(m.get("content")))
615            .collect::<Vec<_>>()
616            .join(" ");
617        approx_tokens(&pj)
618    };
619    let completion_tokens = if ct > 0 { ct } else { approx_tokens(&text) };
620    Ok(Resolved {
621        text,
622        prompt_tokens,
623        completion_tokens,
624    })
625}
626
627// ---------------------------------------------------------------------------
628// SSE plumbing
629// ---------------------------------------------------------------------------
630
631fn sse_json(value: &Value) -> Event {
632    Event::default().data(value.to_string())
633}
634
635fn sse_named(name: &str, value: &Value) -> Event {
636    Event::default().event(name).data(value.to_string())
637}
638
639/// Turn a precomputed list of events into an SSE stream, optionally spacing
640/// them out by `delay_ms` so callers can observe incremental delivery.
641fn sse_response(
642    events: Vec<Event>,
643    delay_ms: u64,
644) -> Sse<impl Stream<Item = Result<Event, Infallible>>> {
645    let s = stream::unfold(events.into_iter(), move |mut it| async move {
646        let next = it.next()?;
647        if delay_ms > 0 {
648            tokio::time::sleep(Duration::from_millis(delay_ms)).await;
649        }
650        Some((Ok::<Event, Infallible>(next), it))
651    });
652    Sse::new(s).keep_alive(KeepAlive::new().interval(Duration::from_secs(15)))
653}
654
#[cfg(test)]
mod tests {
    use super::*;

    /// Default configuration with prompt echoing turned off, so canned
    /// replies are exactly the canned text.
    fn cfg() -> LlmMockConfig {
        LlmMockConfig {
            echo_prompt: false,
            ..LlmMockConfig::default()
        }
    }

    /// Handler state built from [`cfg`].
    fn st() -> LlmMockState {
        LlmMockState::new(cfg())
    }

    /// A one-element message list holding a single user message.
    fn user(text: &str) -> Vec<Value> {
        vec![json!({ "role": "user", "content": text })]
    }

    /// Read a handler response to the end and parse its body as JSON.
    async fn body_json(resp: Response) -> Value {
        let bytes = axum::body::to_bytes(resp.into_body(), usize::MAX).await.unwrap();
        serde_json::from_slice(&bytes).unwrap()
    }

    #[test]
    fn approx_tokens_counts_words() {
        assert_eq!(approx_tokens("one two three"), 3);
        // Empty text is floored at one token.
        assert_eq!(approx_tokens(""), 1);
    }

    #[test]
    fn last_user_text_handles_string_and_array_content() {
        let plain = vec![
            json!({"role":"system","content":"be brief"}),
            json!({"role":"user","content":"hello world"}),
        ];
        assert_eq!(last_user_text(&plain), "hello world");

        let blocks = vec![
            json!({"role":"user","content":[{"type":"text","text":"a"},{"type":"text","text":"b"}]}),
        ];
        assert_eq!(last_user_text(&blocks), "a b");
    }

    #[test]
    fn echo_prompt_reflects_user_message() {
        let echoing = LlmMockConfig {
            echo_prompt: true,
            ..LlmMockConfig::default()
        };
        let reply = build_reply(&echoing, &user("ping"));
        assert!(reply.contains("ping"), "reply should echo the prompt: {reply}");
    }

    #[test]
    fn stable_id_is_deterministic_and_prefixed() {
        let first = stable_id("chatcmpl-", "same");
        let second = stable_id("chatcmpl-", "same");
        assert_eq!(first, second);
        assert!(first.starts_with("chatcmpl-"));
        assert_ne!(stable_id("chatcmpl-", "x"), stable_id("chatcmpl-", "y"));
    }

    #[test]
    fn stream_chunks_preserve_leading_space_after_first() {
        let pieces = stream_chunks("alpha beta gamma");
        assert_eq!(pieces, vec!["alpha", " beta", " gamma"]);
        assert_eq!(pieces.concat(), "alpha beta gamma");
    }

    #[tokio::test]
    async fn chat_completions_non_stream_shape() {
        let request = json!({"model":"gpt-x","messages":[{"role":"user","content":"hi there"}]});
        let v = body_json(chat_completions(State(st()), Json(request)).await).await;
        assert_eq!(v["object"], "chat.completion");
        assert_eq!(v["choices"][0]["message"]["role"], "assistant");
        assert_eq!(v["choices"][0]["finish_reason"], "stop");
        assert!(v["usage"]["total_tokens"].as_u64().unwrap() >= 2);
        assert!(v["id"].as_str().unwrap().starts_with("chatcmpl-"));
    }

    #[tokio::test]
    async fn anthropic_non_stream_shape() {
        let request = json!({"model":"claude-x","messages":[{"role":"user","content":"hi"}]});
        let v = body_json(anthropic_messages(State(st()), Json(request)).await).await;
        assert_eq!(v["type"], "message");
        assert_eq!(v["content"][0]["type"], "text");
        assert_eq!(v["stop_reason"], "end_turn");
        assert!(v["usage"]["output_tokens"].as_u64().unwrap() >= 1);
        assert!(v["id"].as_str().unwrap().starts_with("msg_"));
    }

    #[tokio::test]
    async fn models_list_shape() {
        let Json(v) = list_models(State(st())).await;
        assert_eq!(v["object"], "list");
        assert_eq!(v["data"][0]["owned_by"], "mockforge");
    }

    // ---- #915: modes + cassette ----

    #[test]
    fn mode_parses_and_displays() {
        let cases = [
            ("mock", LlmMockMode::Mock),
            ("off", LlmMockMode::Mock),
            ("proxy", LlmMockMode::Proxy),
            ("record", LlmMockMode::Record),
            ("replay", LlmMockMode::Replay),
        ];
        for (input, expected) in cases {
            assert_eq!(input.parse::<LlmMockMode>().unwrap(), expected);
        }
        assert!("bogus".parse::<LlmMockMode>().is_err());
        assert_eq!(LlmMockMode::Record.to_string(), "record");
    }

    #[test]
    fn cassette_key_is_stable_and_input_sensitive() {
        let msgs = user("hello");
        let baseline = cassette_key(Dialect::OpenAi, "gpt-4o", &msgs);
        let repeat = cassette_key(Dialect::OpenAi, "gpt-4o", &msgs);
        assert_eq!(baseline, repeat, "same input -> same key");
        assert_ne!(baseline, cassette_key(Dialect::OpenAi, "gpt-4o", &user("world")));
        assert_ne!(
            baseline,
            cassette_key(Dialect::OpenAi, "gpt-5", &msgs),
            "model is part of the key"
        );
        assert_ne!(
            baseline,
            cassette_key(Dialect::Anthropic, "gpt-4o", &msgs),
            "dialect is part of the key"
        );
    }

    #[test]
    fn cassette_roundtrips_through_disk() {
        let dir = std::env::temp_dir().join(format!("mf-cassette-{}", std::process::id()));
        std::fs::create_dir_all(&dir).unwrap();
        let path = dir.join("c.json");

        let mut tape = Cassette::load(Some(path.clone()));
        let entry = CassetteEntry {
            text: "recorded".into(),
            prompt_tokens: 3,
            completion_tokens: 1,
        };
        tape.entries.insert("k".into(), entry);
        tape.save();

        let reloaded = Cassette::load(Some(path.clone()));
        assert_eq!(reloaded.entries.get("k").unwrap().text, "recorded");
        let _ = std::fs::remove_dir_all(&dir);
    }

    #[tokio::test]
    async fn mock_mode_serves_canned_without_upstream() {
        // cfg() leaves the mode at Mock and configures no upstream.
        let state = LlmMockState::new(cfg());
        let r = resolve_reply(&state, Dialect::OpenAi, "gpt-4o", &user("hi")).await;
        assert!(r.text.contains("mock response"));
    }

    #[tokio::test]
    async fn replay_hit_serves_cassette_miss_serves_canned() {
        let replay_cfg = LlmMockConfig {
            mode: LlmMockMode::Replay,
            echo_prompt: false,
            ..LlmMockConfig::default()
        };
        let state = LlmMockState::new(replay_cfg);

        // Seed an entry under exactly the key the resolver will compute.
        let msgs = user("what is 2+2");
        let key = cassette_key(Dialect::OpenAi, "gpt-4o", &msgs);
        let seeded = CassetteEntry {
            text: "four".into(),
            prompt_tokens: 4,
            completion_tokens: 1,
        };
        state.cassette.lock().unwrap().entries.insert(key, seeded);

        // Hit -> cassette content.
        let hit = resolve_reply(&state, Dialect::OpenAi, "gpt-4o", &msgs).await;
        assert_eq!(hit.text, "four");
        assert_eq!(hit.completion_tokens, 1);

        // Miss -> canned fallback (offline, no upstream).
        let miss = resolve_reply(&state, Dialect::OpenAi, "gpt-4o", &user("unseen")).await;
        assert!(miss.text.contains("mock response"));
    }

    #[tokio::test]
    async fn proxy_without_upstream_degrades_to_canned() {
        // Proxy mode with no upstream configured must not hard-fail.
        let proxy_cfg = LlmMockConfig {
            mode: LlmMockMode::Proxy,
            echo_prompt: false,
            ..LlmMockConfig::default()
        };
        let state = LlmMockState::new(proxy_cfg);
        let r = resolve_reply(&state, Dialect::OpenAi, "gpt-4o", &user("hi")).await;
        assert!(r.text.contains("mock response"), "should fall back to canned, got: {}", r.text);
    }
}