# llmsim 0.4.0
#
# LLM Traffic Simulator - A lightweight, high-performance LLM API simulator
# Documentation
#!/usr/bin/env -S uv run --script
# /// script
# requires-python = ">=3.10"
# dependencies = [
#     "httpx>=0.27.0",
# ]
# ///
"""
OpenAI Responses API client example for llmsim.

This script demonstrates connecting to a running llmsim server using the
OpenAI Responses API format. The Responses API is OpenAI's newer stateful API
that unifies Chat Completions and Assistants capabilities.

Server endpoints:
    POST /openai/v1/responses - Create a response (streaming and non-streaming)

Prerequisites:
    Start the llmsim server first:
        llmsim serve --port 8080

    Or from source:
        cargo run --release -- serve --port 8080

Usage:
    uv run examples/responses_client.py

Environment variables:
    LLMSIM_URL: Server URL (default: http://localhost:8080)
"""

import json
import os
import sys
from typing import Any, Iterator

import httpx


def create_response(
    client: httpx.Client,
    model: str,
    input_data: str | list[dict[str, Any]],
    stream: bool = False,
    **kwargs: Any,
) -> dict[str, Any] | Iterator[dict[str, Any]]:
    """Send a request to ``POST /openai/v1/responses``.

    Args:
        client: HTTP client used to issue the request.
        model: Value sent as the ``model`` field.
        input_data: Value sent as the ``input`` field (a plain string or a
            list of message dicts).
        stream: Value sent as the ``stream`` field; also selects the return
            shape (see below).
        **kwargs: Extra top-level fields merged into the JSON body.

    Returns:
        When ``stream`` is false, the decoded JSON body of the response.
        When ``stream`` is true, the iterator of parsed events produced by
        ``_stream_response``.

    Raises:
        httpx.HTTPStatusError: For a 4xx/5xx reply on the non-streaming path.
    """
    request_body: dict[str, Any] = {
        "model": model,
        "input": input_data,
        "stream": stream,
    }
    # Merged last, exactly like a trailing ``**kwargs`` in the literal.
    request_body.update(kwargs)

    # Streaming is delegated entirely to the SSE helper.
    if stream:
        return _stream_response(client, request_body)

    reply = client.post("/openai/v1/responses", json=request_body)
    reply.raise_for_status()
    return reply.json()


def _parse_sse_event(raw_event: str) -> dict[str, Any]:
    """Parse one server-sent-event block (the text between blank lines).

    Returns a dict that has an ``"event"`` key when an ``event`` field is
    present and a ``"data"`` key when at least one ``data`` field is present.
    The data is JSON-decoded when possible and kept as the raw string
    otherwise. An empty dict means the block carried neither field.
    """
    event: dict[str, Any] = {}
    data_lines: list[str] = []

    for line in raw_event.strip().split("\n"):
        # Blank lines carry nothing; lines starting with ":" are SSE comments
        # (commonly used as keep-alives).
        if not line or line.startswith(":"):
            continue

        field, _, value = line.partition(":")
        # The single space after the colon is optional in the SSE format.
        value = value.removeprefix(" ")

        if field == "event":
            event["event"] = value
        elif field == "data":
            data_lines.append(value)

    if data_lines:
        # Multiple data lines belong to one payload, joined by newlines.
        data_text = "\n".join(data_lines)
        try:
            event["data"] = json.loads(data_text)
        except json.JSONDecodeError:
            event["data"] = data_text

    return event


def _stream_response(
    client: httpx.Client, payload: dict[str, Any]
) -> Iterator[dict[str, Any]]:
    """Stream a response and yield parsed events.

    Sends ``payload`` to ``POST /openai/v1/responses`` and parses the body as
    a server-sent-event stream. Each yielded dict may contain ``"event"``
    (the event name) and ``"data"`` (decoded JSON, or the raw string if it is
    not valid JSON). Blocks that contain neither field are skipped.

    Both LF and CRLF line endings are accepted. An event that is not
    terminated by a blank line before the stream ends is not yielded.

    Raises:
        httpx.HTTPStatusError: If the server answers with a 4xx/5xx status.
    """
    with client.stream("POST", "/openai/v1/responses", json=payload) as response:
        response.raise_for_status()

        buffer = ""
        for chunk in response.iter_text():
            # Normalise CRLF to LF. A lone trailing "\r" (a CRLF split across
            # two chunks) stays at the end of the buffer and is folded in on
            # the next pass, once its "\n" has arrived.
            buffer = (buffer + chunk).replace("\r\n", "\n")

            while "\n\n" in buffer:
                event_str, buffer = buffer.split("\n\n", 1)
                if not event_str.strip():
                    continue

                event = _parse_sse_event(event_str)
                if event:
                    yield event


def _print_section(title: str) -> None:
    """Print an example heading followed by a divider line."""
    print(title)
    print("-" * 40)


def _reasoning_tokens(usage: dict[str, Any]) -> Any:
    """Return ``usage["output_tokens_details"]["reasoning_tokens"]``, or 0."""
    # ``or {}`` tolerates both a missing key and an explicit null value.
    return (usage.get("output_tokens_details") or {}).get("reasoning_tokens", 0)


def main() -> None:
    """Run all Responses API examples against an llmsim server.

    The server URL comes from the ``LLMSIM_URL`` environment variable
    (default ``http://localhost:8080``). If the very first request fails,
    whether from a connection problem or an HTTP error status, a hint about
    starting the server is printed and the process exits with status 1.
    """
    base_url = os.environ.get("LLMSIM_URL", "http://localhost:8080")

    print("=" * 60)
    print("OpenAI Responses API + LLMSim Example")
    print("=" * 60)
    print(f"\nConnecting to: {base_url}")
    print()

    # The context manager closes the client on every exit path, including
    # the sys.exit() below and any exception raised by a later example.
    with httpx.Client(base_url=base_url, timeout=60.0) as client:
        # Example 1: Simple text input
        _print_section("1. Simple Text Input")
        try:
            response = create_response(
                client,
                model="gpt-5",
                input_data="What is the capital of France?",
            )
        except httpx.HTTPError as e:
            # httpx.HTTPError is the common base of HTTPStatusError (4xx/5xx)
            # and RequestError (e.g. connection refused), so the hint is also
            # shown when the server is not reachable at all.
            print(f"Error: {e}")
            print("\nMake sure the llmsim server is running:")
            print("  llmsim serve --port 8080")
            sys.exit(1)
        else:
            print(f"Response ID: {response['id']}")
            print(f"Status: {response['status']}")
            print(f"Output: {response.get('output_text', 'N/A')}")
            if response.get("usage"):
                print(f"Tokens: {response['usage']}")
        print()

        # Example 2: Message array input
        _print_section("2. Message Array Input")
        response = create_response(
            client,
            model="gpt-5",
            input_data=[
                {"type": "message", "role": "system", "content": "You are a helpful assistant."},
                {"type": "message", "role": "user", "content": "Tell me a joke."},
            ],
        )
        print(f"Response ID: {response['id']}")
        print(f"Output: {response.get('output_text', 'N/A')}")
        print()

        # Example 3: With instructions
        _print_section("3. With Instructions")
        response = create_response(
            client,
            model="gpt-5",
            input_data="Write something creative.",
            instructions="You are a creative writing assistant. Be poetic and imaginative.",
        )
        print(f"Output: {response.get('output_text', 'N/A')}")
        print()

        # Example 4: Different models
        _print_section("4. Different Models")
        models = ["gpt-5-mini", "claude-opus-4.5", "o3-mini"]
        for model in models:
            response = create_response(
                client,
                model=model,
                input_data="Hello!",
            )
            # ``or ""`` keeps the slice below safe if output_text is null.
            output = response.get("output_text") or ""
            print(f"{model}: {output[:60]}...")
        print()

        # Example 5: Streaming response
        _print_section("5. Streaming Response")
        print("Response: ", end="", flush=True)

        for event in create_response(
            client,
            model="gpt-5",
            input_data="Tell me a short story.",
            stream=True,
        ):
            event_type = event.get("event", "")
            data = event.get("data", {})

            if event_type == "response.output_text.delta":
                print(data.get("delta", ""), end="", flush=True)
            elif event_type == "response.completed":
                usage = data.get("response", {}).get("usage", {})
                print(f"\n\nTokens: {usage}")
        print()

        # Example 6: Examine full response structure
        _print_section("6. Full Response Structure")
        response = create_response(
            client,
            model="gpt-5",
            input_data="Hello, world!",
        )
        print(f"ID: {response['id']}")
        print(f"Object: {response['object']}")
        print(f"Model: {response['model']}")
        print(f"Status: {response['status']}")
        print(f"Output items: {len(response.get('output', []))}")
        if response.get("output"):
            item = response["output"][0]
            print(f"  - Type: {item.get('type')}")
            print(f"  - Role: {item.get('role')}")
            print(f"  - Status: {item.get('status')}")
        print()

        # Example 7: Reasoning / Thinking (non-streaming)
        _print_section("7. Reasoning / Thinking (non-streaming)")
        response = create_response(
            client,
            model="o3",
            input_data="What is 2 + 2?",
            reasoning={"effort": "medium", "summary": "auto"},
        )
        print(f"Model: {response['model']}")
        print(f"Output items: {len(response.get('output', []))}")
        for item in response.get("output", []):
            if item["type"] == "reasoning":
                print("\n  [Thinking]")
                print(f"  ID: {item['id']}")
                print(f"  Status: {item['status']}")
                if item.get("summary"):
                    for s in item["summary"]:
                        print(f"  Summary: {s['text']}")
                else:
                    print("  (no summary requested)")
            elif item["type"] == "message":
                print("\n  [Response]")
                for content in item.get("content", []):
                    if content["type"] == "output_text":
                        print(f"  Text: {content['text'][:100]}")
        if response.get("usage"):
            usage = response["usage"]
            reasoning_tokens = _reasoning_tokens(usage)
            print(f"\n  Tokens: input={usage['input_tokens']}, output={usage['output_tokens']}, "
                  f"reasoning={reasoning_tokens}, total={usage['total_tokens']}")
        print()

        # Example 8: Reasoning with different effort levels
        _print_section("8. Reasoning Effort Levels")
        for effort in ["low", "medium", "high"]:
            response = create_response(
                client,
                model="o3",
                input_data="Explain gravity.",
                reasoning={"effort": effort, "summary": "concise"},
            )
            usage = response.get("usage") or {}
            reasoning_tokens = _reasoning_tokens(usage)
            print(f"  effort={effort:6s} -> reasoning_tokens={reasoning_tokens}, total={usage.get('total_tokens', 0)}")
        print()

        # Example 9: Streaming with thinking visualization
        _print_section("9. Streaming with Thinking")

        for event in create_response(
            client,
            model="o3",
            input_data="Why is the sky blue?",
            stream=True,
            reasoning={"effort": "medium", "summary": "auto"},
        ):
            event_type = event.get("event", "")
            data = event.get("data", {})

            if event_type == "response.output_item.added":
                # Label the stream whenever a new output item begins.
                item = data.get("item", {})
                if item.get("type") == "reasoning":
                    print("[Thinking] ", end="", flush=True)
                elif item.get("type") == "message":
                    print("\n[Response] ", end="", flush=True)
            elif event_type == "response.reasoning_summary_text.delta":
                print(data.get("delta", ""), end="", flush=True)
            elif event_type == "response.output_text.delta":
                print(data.get("delta", ""), end="", flush=True)
            elif event_type == "response.completed":
                usage = data.get("response", {}).get("usage") or {}
                reasoning_tokens = _reasoning_tokens(usage)
                print(f"\n\n  Tokens: input={usage.get('input_tokens', 0)}, "
                      f"output={usage.get('output_tokens', 0)}, "
                      f"reasoning={reasoning_tokens}, "
                      f"total={usage.get('total_tokens', 0)}")
        print()

    print("=" * 60)
    print("Examples complete!")
    print("=" * 60)


# Run the examples only when this file is executed as a script, not when it
# is imported as a module.
if __name__ == "__main__":
    main()