openai-oxide 0.11.2

#!/usr/bin/env python3
"""Side-by-side benchmark: openai-oxide-python (Rust/PyO3) vs openai (Python/httpx)."""

import asyncio
import concurrent.futures
import json
import time
import sys

try:
    from openai import OpenAI as PyClient
except ImportError:
    print("Please install openai: pip install openai")
    sys.exit(1)

try:
    from openai_oxide import Client as OxideClient
except ImportError:
    print("Please install openai_oxide_python. Run `uv run maturin develop --release` in `openai-oxide-python`.")
    sys.exit(1)

# Model identifier passed with every request issued by either client.
MODEL = "gpt-5.4"
# Number of timed iterations run per client for each benchmark scenario.
N = 5  # iterations

# Both clients are constructed without arguments.
# NOTE(review): presumably each picks up credentials from the environment — confirm.
py = PyClient()        # `openai` package client; called synchronously
oxide = OxideClient()  # `openai_oxide` client; its request methods are awaited

def stats(times):
    """Return the median of ``times``, or 0 when there are no samples.

    For an even number of samples the upper of the two middle values is
    returned (no averaging), so the result is always one of the inputs.

    The input is not modified: a sorted copy is used so callers keep their
    samples in recording order.
    """
    ordered = sorted(times)
    return ordered[len(ordered) // 2] if ordered else 0

# Comparable rows as (name, oxide_ms, py_ms, winner); "N/A" rows are not recorded.
results = []

def row(name, oxide_ms, py_ms):
    """Print one line of the comparison table and record comparable results.

    Args:
        name: Scenario label, left-aligned in a 25-character column.
        oxide_ms: Oxide timing in milliseconds, or the string ``"N/A"`` when
            the scenario has no oxide measurement.
        py_ms: Timing of the ``openai`` client in milliseconds.

    When ``oxide_ms`` is ``"N/A"`` the line is printed with ``python`` as the
    winner and nothing is appended to ``results``. Otherwise the faster side
    wins (a tie goes to ``OXIDE``), and the printed percentage is the absolute
    difference relative to ``py_ms``, truncated to an integer.
    """
    if oxide_ms == "N/A":
        # Same 8-character "<value>ms" column as regular rows so the table stays aligned.
        print(f"{name:<25} {'N/A':>8} {py_ms:>6}ms  {'python':>8} (---%)")
        return
    winner = "OXIDE" if oxide_ms <= py_ms else "python"
    # max(..., 1) guards against dividing by zero when py_ms is 0.
    pct = abs(int((py_ms - oxide_ms) / max(py_ms, 1) * 100))
    results.append((name, oxide_ms, py_ms, winner))
    print(f"{name:<25} {oxide_ms:>6}ms {py_ms:>6}ms  {winner:>8} ({pct}%)")

def _elapsed_ms(t0):
    """Return the whole milliseconds elapsed since the ``time.perf_counter()`` reading ``t0``."""
    return int((time.perf_counter() - t0) * 1000)

def _oxide_version():
    """Return the installed ``openai_oxide`` version, or ``"unknown"`` if it cannot be found."""
    module_version = getattr(__import__("openai_oxide"), "__version__", None)
    if module_version:
        return module_version
    from importlib import metadata
    # NOTE(review): the distribution names below are assumptions — confirm the published name.
    for dist in ("openai-oxide", "openai-oxide-python"):
        try:
            return metadata.version(dist)
        except metadata.PackageNotFoundError:
            continue
    return "unknown"

async def main():
    """Run every benchmark scenario against both clients and print the comparison table.

    After one warm-up request per client, each scenario runs ``N`` timed
    iterations with the oxide client followed by ``N`` with the ``openai``
    client. Per-iteration wall-clock times (whole milliseconds) are reduced
    with ``stats`` and printed through ``row``; a summary with the library
    versions and the oxide win count is printed at the end.

    The ``openai`` client calls are synchronous and block the event loop while
    they run; the oxide calls are awaited. Any exception raised by a request
    propagates and aborts the run.
    """
    print("Warming up...")
    py.responses.create(model=MODEL, input="ping", max_output_tokens=16)
    await oxide.create(MODEL, "ping", max_output_tokens=16)
    print("Ready.\n")

    print(f"{'Test':<25} {'oxide-py':>8} {'python':>8}  {'winner':>8}")
    print("-" * 62)

    # 1. Plain text
    ot, pt = [], []
    for _ in range(N):
        t0 = time.perf_counter()
        await oxide.create(MODEL, "What is the capital of France? One word.", max_output_tokens=16)
        ot.append(_elapsed_ms(t0))
    for _ in range(N):
        t0 = time.perf_counter()
        py.responses.create(model=MODEL, input="What is the capital of France? One word.", max_output_tokens=16)
        pt.append(_elapsed_ms(t0))
    row("Plain text", stats(ot), stats(pt))

    # 2. Structured output
    schema_obj = {
        "type": "object",
        "properties": {"languages": {"type": "array", "items": {"type": "object",
            "properties": {"name": {"type": "string"}, "year": {"type": "integer"}},
            "required": ["name", "year"], "additionalProperties": False}}},
        "required": ["languages"], "additionalProperties": False}
    # The oxide helper takes the schema as a JSON string; the python client takes the dict.
    schema_str = json.dumps(schema_obj)
    ot, pt = [], []
    for _ in range(N):
        t0 = time.perf_counter()
        await oxide.create_structured(MODEL, "List 3 programming languages with year created", "langs", schema_str, max_output_tokens=200)
        ot.append(_elapsed_ms(t0))
    for _ in range(N):
        t0 = time.perf_counter()
        py.responses.create(model=MODEL, input="List 3 programming languages with year created", max_output_tokens=200,
            text={"format":{"type":"json_schema","name":"langs","strict":True,"schema":schema_obj}})
        pt.append(_elapsed_ms(t0))
    row("Structured output", stats(ot), stats(pt))

    # 3. Function calling
    tools_obj = [{"type":"function","name":"get_weather","description":"Get weather","parameters":{"type":"object","properties":{"city":{"type":"string"}, "unit": {"type": "string", "enum": ["celsius", "fahrenheit"]}},"required":["city", "unit"],"additionalProperties":False}}]
    tools_str = json.dumps(tools_obj)
    ot, pt = [], []
    for _ in range(N):
        t0 = time.perf_counter()
        await oxide.create_with_tools(MODEL, "What's the weather in Tokyo?", tools_str)
        ot.append(_elapsed_ms(t0))
    for _ in range(N):
        t0 = time.perf_counter()
        py.responses.create(model=MODEL, input="What's the weather in Tokyo?", tools=tools_obj)
        pt.append(_elapsed_ms(t0))
    row("Function calling", stats(ot), stats(pt))

    # 4. Multi-turn: the second request chains onto the first via previous_response_id.
    ot, pt = [], []
    for _ in range(N):
        t0 = time.perf_counter()
        r1_raw = await oxide.create_raw(json.dumps({"model": MODEL, "input": "Remember: the answer is 42.", "store": True, "max_output_tokens": 32}))
        r1_id = json.loads(r1_raw).get("id", "")
        await oxide.create_raw(json.dumps({"model": MODEL, "input": "What is the answer?", "previous_response_id": r1_id, "max_output_tokens": 16}))
        ot.append(_elapsed_ms(t0))
    for _ in range(N):
        t0 = time.perf_counter()
        r1 = py.responses.create(model=MODEL, input="Remember: the answer is 42.", store=True, max_output_tokens=32)
        py.responses.create(model=MODEL, input="What is the answer?", previous_response_id=r1.id, max_output_tokens=16)
        pt.append(_elapsed_ms(t0))
    row("Multi-turn (2 reqs)", stats(ot), stats(pt))

    # 5. Web search
    ot, pt = [], []
    for _ in range(N):
        t0 = time.perf_counter()
        await oxide.create_raw(json.dumps({"model": MODEL, "input": "What is the latest Rust version?", "tools": [{"type": "web_search", "search_context_size": "low"}], "max_output_tokens": 100}))
        ot.append(_elapsed_ms(t0))
    for _ in range(N):
        t0 = time.perf_counter()
        py.responses.create(model=MODEL, input="What is the latest Rust version?", max_output_tokens=100, tools=[{"type": "web_search", "search_context_size": "low"}])
        pt.append(_elapsed_ms(t0))
    row("Web search", stats(ot), stats(pt))

    # 6. Nested structured output
    complex_schema = {
        "type": "object",
        "properties": {
            "company": {"type": "object", "properties": {
                "name": {"type": "string"}, "founded": {"type": "integer"}, "ceo": {"type": "string"},
                "products": {"type": "array", "items": {"type": "object", "properties": {
                    "name": {"type": "string"},
                    "category": {"type": "string", "enum": ["hardware", "software", "service"]},
                    "revenue_billions": {"type": "number"}, "active": {"type": "boolean"}},
                    "required": ["name", "category", "revenue_billions", "active"], "additionalProperties": False}}},
                "required": ["name", "founded", "ceo", "products"], "additionalProperties": False},
            "competitors": {"type": "array", "items": {"type": "string"}},
            "summary": {"type": "string"}},
        "required": ["company", "competitors", "summary"], "additionalProperties": False}
    ot, pt = [], []
    for _ in range(N):
        t0 = time.perf_counter()
        await oxide.create_raw(json.dumps({"model": MODEL, "input": "Analyze Apple Inc: products with revenue, competitors, summary.", "max_output_tokens": 800, "text": {"format": {"type": "json_schema", "name": "company_analysis", "strict": True, "schema": complex_schema}}}))
        ot.append(_elapsed_ms(t0))
    for _ in range(N):
        t0 = time.perf_counter()
        py.responses.create(model=MODEL, input="Analyze Apple Inc: products with revenue, competitors, summary.", max_output_tokens=800, text={"format": {"type": "json_schema", "name": "company_analysis", "strict": True, "schema": complex_schema}})
        pt.append(_elapsed_ms(t0))
    row("Nested structured", stats(ot), stats(pt))

    # 7. Agent loop (2-step): tool-call request, then a follow-up carrying a canned tool result.
    weather_output = '{"temp":22,"condition":"sunny","humidity":45}'
    recommendation_text = {"format": {
        "type": "json_schema", "name": "recommendation", "strict": True,
        "schema": {
            "type": "object",
            "properties": {
                "outfit": {"type": "string"},
                "accessories": {"type": "array", "items": {"type": "string"}},
                "warning": {"type": "string"}},
            "required": ["outfit", "accessories", "warning"],
            "additionalProperties": False}}}
    ot, pt = [], []
    for _ in range(N):
        t0 = time.perf_counter()
        r1_raw = await oxide.create_raw(json.dumps({"model": MODEL, "input": "What's the weather in Tokyo and what should I wear?", "store": True, "tools": tools_obj}))
        r1_d = json.loads(r1_raw)
        # Fall back to a placeholder id when the first response holds no function call.
        call_id = "call_1"
        for item in r1_d.get("output", []):
            if item.get("type") == "function_call":
                call_id = item.get("call_id")
                break
        await oxide.create_raw(json.dumps({"model": MODEL, "previous_response_id": r1_d.get("id"), "max_output_tokens": 200, "input": [{"type": "function_call_output", "call_id": call_id, "output": weather_output}], "text": recommendation_text}))
        ot.append(_elapsed_ms(t0))
    for _ in range(N):
        t0 = time.perf_counter()
        step1 = py.responses.create(model=MODEL, input="What's the weather in Tokyo and what should I wear?", store=True, tools=tools_obj)
        call_id = next((item.call_id for item in step1.output if item.type == "function_call"), "call_1")
        py.responses.create(model=MODEL, previous_response_id=step1.id, max_output_tokens=200, input=[{"type": "function_call_output", "call_id": call_id, "output": weather_output}], text=recommendation_text)
        pt.append(_elapsed_ms(t0))
    row("Agent loop (2-step)", stats(ot), stats(pt))

    # 8. Rapid-fire (5 sequential calls per timed iteration)
    ot, pt = [], []
    for _ in range(N):
        t0 = time.perf_counter()
        for i in range(1, 6):
            await oxide.create(MODEL, f"What is {i}+{i}? Reply with just the number.", max_output_tokens=16)
        ot.append(_elapsed_ms(t0))
    for _ in range(N):
        t0 = time.perf_counter()
        for i in range(1, 6):
            py.responses.create(model=MODEL, input=f"What is {i}+{i}? Reply with just the number.", max_output_tokens=16)
        pt.append(_elapsed_ms(t0))
    row("Rapid-fire (5 calls)", stats(ot), stats(pt))

    # 9. Prompt-cached: one untimed request per client (separate cache keys) before timing.
    ot, pt = [], []
    system_prompt = "You are a senior software architect with 20 years of experience in distributed systems, microservices, and cloud-native architectures. Always provide specific, actionable advice with code examples where relevant. Consider scalability, maintainability, and security in every recommendation."
    await oxide.create_raw(json.dumps({"model": MODEL, "instructions": system_prompt, "input": "ping", "prompt_cache_key": "bench-architect-ox", "max_output_tokens": 16}))
    py.responses.create(model=MODEL, instructions=system_prompt, input="ping", prompt_cache_key="bench-architect-py", max_output_tokens=16)

    for _ in range(N):
        t0 = time.perf_counter()
        await oxide.create_raw(json.dumps({"model": MODEL, "instructions": system_prompt, "input": "How should I design a rate limiter for an API gateway?", "prompt_cache_key": "bench-architect-ox", "prompt_cache_retention": "24h", "max_output_tokens": 200}))
        ot.append(_elapsed_ms(t0))
    for _ in range(N):
        t0 = time.perf_counter()
        py.responses.create(model=MODEL, instructions=system_prompt, input="How should I design a rate limiter for an API gateway?", prompt_cache_key="bench-architect-py", prompt_cache_retention="24h", max_output_tokens=200)
        pt.append(_elapsed_ms(t0))
    row("Prompt-cached", stats(ot), stats(pt))

    # 10. Streaming TTFT: time until the first text delta; the rest of the stream is skipped.
    # An iteration that never yields a text delta contributes no sample.
    ot, pt = [], []
    for _ in range(N):
        t0 = time.perf_counter()
        stream_ox = await oxide.create_stream(MODEL, "Explain quicksort in 3 sentences.", max_output_tokens=200)
        # NOTE(review): the oxide stream is abandoned after the first delta; confirm whether
        # it exposes an explicit close.
        async for event_str in stream_ox:
            event = json.loads(event_str)
            if event.get("type") == "response.output_text.delta":
                ot.append(_elapsed_ms(t0))
                break
    for _ in range(N):
        t0 = time.perf_counter()
        stream_py = py.responses.create(model=MODEL, input="Explain quicksort in 3 sentences.", max_output_tokens=200, stream=True)
        try:
            for event in stream_py:
                if event.type == "response.output_text.delta":
                    pt.append(_elapsed_ms(t0))
                    break
        finally:
            # Release the HTTP response rather than leaving it open after the early exit;
            # this happens after the sample is recorded, so it is not part of the timing.
            stream_py.close()
    row("Streaming TTFT", stats(ot), stats(pt))

    # 11. Parallel 3x: all three requests must finish before the clock stops.
    ot, pt = [], []
    for _ in range(N):
        t0 = time.perf_counter()
        await asyncio.gather(
            oxide.create(MODEL, "Capital of France? One word.", max_output_tokens=16),
            oxide.create(MODEL, "Capital of Japan? One word.", max_output_tokens=16),
            oxide.create(MODEL, "Capital of Brazil? One word.", max_output_tokens=16),
        )
        ot.append(_elapsed_ms(t0))
    for _ in range(N):
        t0 = time.perf_counter()
        with concurrent.futures.ThreadPoolExecutor(max_workers=3) as ex:
            fs = [ex.submit(lambda q: py.responses.create(model=MODEL, input=f"Capital of {q}? One word.", max_output_tokens=16), q) for q in ["France", "Japan", "Brazil"]]
            for f in concurrent.futures.as_completed(fs):
                f.result()
        pt.append(_elapsed_ms(t0))
    row("Parallel 3x (fan-out)", stats(ot), stats(pt))

    # 12. Hedged (2x race): two identical requests, timed up to the first one to complete.
    ot, pt = [], []
    for _ in range(N):
        t0 = time.perf_counter()
        t1 = (oxide.create(MODEL, "What is 7*8? Number only.", max_output_tokens=16))
        t2 = (oxide.create(MODEL, "What is 7*8? Number only.", max_output_tokens=16))
        done, pending = await asyncio.wait([asyncio.ensure_future(t1), asyncio.ensure_future(t2)], return_when=asyncio.FIRST_COMPLETED)
        for p in pending:
            p.cancel()
        # Surface a failed winner instead of counting it as a fast success (as the python side does).
        next(iter(done)).result()
        ot.append(_elapsed_ms(t0))
    for _ in range(N):
        t0 = time.perf_counter()
        with concurrent.futures.ThreadPoolExecutor(max_workers=2) as ex:
            f1 = ex.submit(lambda: py.responses.create(model=MODEL, input="What is 7*8? Number only.", max_output_tokens=16))
            f2 = ex.submit(lambda: py.responses.create(model=MODEL, input="What is 7*8? Number only.", max_output_tokens=16))
            done, _pending = concurrent.futures.wait([f1, f2], return_when=concurrent.futures.FIRST_COMPLETED)
            next(iter(done)).result()
            # Stop the clock inside the `with`: leaving it joins the slower request as well,
            # which would turn the race into a wait-for-both measurement.
            pt.append(_elapsed_ms(t0))
    row("Hedged (2x race)", stats(ot), stats(pt))

    print(f"\n{N} iterations, median. Model: {MODEL}")
    print(f"oxide-py: openai-oxide v{_oxide_version()} (Rust via PyO3)")
    print(f"python:   openai v{__import__('openai').__version__} (httpx)")

    wins = sum(1 for _, _, _, w in results if w == "OXIDE")
    print(f"\noxide-py wins {wins}/{len(results)} tests")

# Script entry point: when executed directly, run the async benchmark via asyncio.run.
if __name__ == "__main__":
    asyncio.run(main())