car-inference 0.6.0

[
  {
    "id": "qwen/qwen3-embedding-0.6b:q8_0",
    "name": "Qwen3-Embedding-0.6B",
    "provider": "qwen",
    "family": "qwen3",
    "version": "1.0",
    "capabilities": [
      "embed"
    ],
    "context_length": 8192,
    "param_count": "0.6B",
    "quantization": "Q8_0",
    "performance": {
      "latency_p50_ms": null,
      "latency_p99_ms": null,
      "tokens_per_second": null
    },
    "cost": {
      "input_per_mtok": null,
      "output_per_mtok": null,
      "size_mb": 639,
      "ram_mb": 639
    },
    "source": {
      "type": "local",
      "hf_repo": "Qwen/Qwen3-Embedding-0.6B-GGUF",
      "hf_filename": "Qwen3-Embedding-0.6B-Q8_0.gguf",
      "tokenizer_repo": "Qwen/Qwen3-Embedding-0.6B"
    },
    "tags": [
      "builtin",
      "embedding"
    ],
    "supported_params": [],
    "public_benchmarks": []
  },
  {
    "id": "qwen/qwen3-reranker-0.6b:q8_0",
    "name": "Qwen3-Reranker-0.6B",
    "provider": "qwen",
    "family": "qwen3",
    "version": "1.0",
    "capabilities": [
      "rerank"
    ],
    "context_length": 32768,
    "param_count": "0.6B",
    "quantization": "Q8_0",
    "performance": {
      "latency_p50_ms": null,
      "latency_p99_ms": null,
      "tokens_per_second": null
    },
    "cost": {
      "input_per_mtok": null,
      "output_per_mtok": null,
      "size_mb": 640,
      "ram_mb": 640
    },
    "source": {
      "type": "local",
      "hf_repo": "Qwen/Qwen3-Reranker-0.6B-GGUF",
      "hf_filename": "Qwen3-Reranker-0.6B-Q8_0.gguf",
      "tokenizer_repo": "Qwen/Qwen3-Reranker-0.6B"
    },
    "tags": [
      "builtin",
      "reranker"
    ],
    "supported_params": [],
    "public_benchmarks": []
  },
  {
    "id": "qwen/qwen3-0.6b:q8_0",
    "name": "Qwen3-0.6B",
    "provider": "qwen",
    "family": "qwen3",
    "version": "1.0",
    "capabilities": [
      "generate",
      "classify",
      "reasoning"
    ],
    "context_length": 32768,
    "param_count": "0.6B",
    "quantization": "Q8_0",
    "performance": {
      "latency_p50_ms": null,
      "latency_p99_ms": null,
      "tokens_per_second": 100.0
    },
    "cost": {
      "input_per_mtok": null,
      "output_per_mtok": null,
      "size_mb": 650,
      "ram_mb": 650
    },
    "source": {
      "type": "local",
      "hf_repo": "Qwen/Qwen3-0.6B-GGUF",
      "hf_filename": "Qwen3-0.6B-Q8_0.gguf",
      "tokenizer_repo": "Qwen/Qwen3-0.6B"
    },
    "tags": [
      "builtin",
      "fast"
    ],
    "supported_params": [],
    "public_benchmarks": []
  },
  {
    "id": "qwen/qwen3-1.7b:q8_0",
    "name": "Qwen3-1.7B",
    "provider": "qwen",
    "family": "qwen3",
    "version": "1.0",
    "capabilities": [
      "generate",
      "code",
      "reasoning"
    ],
    "context_length": 32768,
    "param_count": "1.7B",
    "quantization": "Q8_0",
    "performance": {
      "latency_p50_ms": null,
      "latency_p99_ms": null,
      "tokens_per_second": 70.0
    },
    "cost": {
      "input_per_mtok": null,
      "output_per_mtok": null,
      "size_mb": 1800,
      "ram_mb": 1800
    },
    "source": {
      "type": "local",
      "hf_repo": "Qwen/Qwen3-1.7B-GGUF",
      "hf_filename": "Qwen3-1.7B-Q8_0.gguf",
      "tokenizer_repo": "Qwen/Qwen3-1.7B"
    },
    "tags": [
      "builtin"
    ],
    "supported_params": [],
    "public_benchmarks": []
  },
  {
    "id": "qwen/qwen3-4b:q4_k_m",
    "name": "Qwen3-4B",
    "provider": "qwen",
    "family": "qwen3",
    "version": "1.0",
    "capabilities": [
      "generate",
      "code",
      "reasoning",
      "tool_use",
      "multi_tool_call"
    ],
    "context_length": 32768,
    "param_count": "4B",
    "quantization": "Q4_K_M",
    "performance": {
      "latency_p50_ms": null,
      "latency_p99_ms": null,
      "tokens_per_second": 45.0
    },
    "cost": {
      "input_per_mtok": null,
      "output_per_mtok": null,
      "size_mb": 2500,
      "ram_mb": 2500
    },
    "source": {
      "type": "local",
      "hf_repo": "Qwen/Qwen3-4B-GGUF",
      "hf_filename": "Qwen3-4B-Q4_K_M.gguf",
      "tokenizer_repo": "Qwen/Qwen3-4B"
    },
    "tags": [
      "builtin",
      "code"
    ],
    "supported_params": [],
    "public_benchmarks": []
  },
  {
    "id": "qwen/qwen3-8b:q4_k_m",
    "name": "Qwen3-8B",
    "provider": "qwen",
    "family": "qwen3",
    "version": "1.0",
    "capabilities": [
      "generate",
      "code",
      "reasoning",
      "summarize",
      "tool_use",
      "multi_tool_call"
    ],
    "context_length": 131072,
    "param_count": "8B",
    "quantization": "Q4_K_M",
    "performance": {
      "latency_p50_ms": null,
      "latency_p99_ms": null,
      "tokens_per_second": 25.0
    },
    "cost": {
      "input_per_mtok": null,
      "output_per_mtok": null,
      "size_mb": 4900,
      "ram_mb": 4900
    },
    "source": {
      "type": "local",
      "hf_repo": "Qwen/Qwen3-8B-GGUF",
      "hf_filename": "Qwen3-8B-Q4_K_M.gguf",
      "tokenizer_repo": "Qwen/Qwen3-8B"
    },
    "tags": [
      "builtin",
      "reasoning"
    ],
    "supported_params": [],
    "public_benchmarks": []
  },
  {
    "id": "qwen/qwen3-30b-a3b:q4_k_m",
    "name": "Qwen3-30B-A3B",
    "provider": "qwen",
    "family": "qwen3",
    "version": "1.0",
    "capabilities": [
      "generate",
      "code",
      "reasoning",
      "summarize",
      "tool_use",
      "multi_tool_call"
    ],
    "context_length": 131072,
    "param_count": "30B (3B active)",
    "quantization": "Q4_K_M",
    "performance": {
      "latency_p50_ms": null,
      "latency_p99_ms": null,
      "tokens_per_second": 35.0
    },
    "cost": {
      "input_per_mtok": null,
      "output_per_mtok": null,
      "size_mb": 17000,
      "ram_mb": 17000
    },
    "source": {
      "type": "local",
      "hf_repo": "Qwen/Qwen3-30B-A3B-GGUF",
      "hf_filename": "Qwen3-30B-A3B-Q4_K_M.gguf",
      "tokenizer_repo": "Qwen/Qwen3-30B-A3B"
    },
    "tags": [
      "builtin",
      "moe",
      "reasoning"
    ],
    "supported_params": [],
    "public_benchmarks": []
  },
  {
    "id": "mlx/qwen3-embedding-0.6b:mxfp8",
    "name": "Qwen3-Embedding-0.6B-MLX",
    "provider": "qwen",
    "family": "qwen3",
    "version": "1.0",
    "capabilities": [
      "embed"
    ],
    "context_length": 8192,
    "param_count": "0.6B",
    "quantization": "mxfp8",
    "performance": {
      "latency_p50_ms": null,
      "latency_p99_ms": null,
      "tokens_per_second": null
    },
    "cost": {
      "input_per_mtok": null,
      "output_per_mtok": null,
      "size_mb": 620,
      "ram_mb": 620
    },
    "source": {
      "type": "mlx",
      "hf_repo": "mlx-community/Qwen3-Embedding-0.6B-mxfp8",
      "hf_weight_file": null
    },
    "tags": [
      "builtin",
      "mlx",
      "embedding"
    ],
    "supported_params": [],
    "public_benchmarks": []
  },
  {
    "id": "mlx/qwen3-reranker-0.6b:bf16",
    "name": "Qwen3-Reranker-0.6B-MLX",
    "provider": "qwen",
    "family": "qwen3",
    "version": "1.0",
    "capabilities": [
      "rerank"
    ],
    "context_length": 32768,
    "param_count": "0.6B",
    "quantization": "bf16",
    "performance": {
      "latency_p50_ms": null,
      "latency_p99_ms": null,
      "tokens_per_second": null
    },
    "cost": {
      "input_per_mtok": null,
      "output_per_mtok": null,
      "size_mb": 1200,
      "ram_mb": 1200
    },
    "source": {
      "type": "mlx",
      "hf_repo": "mlx-community/Qwen3-Reranker-0.6B-bf16",
      "hf_weight_file": null
    },
    "tags": [
      "builtin",
      "mlx",
      "reranker"
    ],
    "supported_params": [],
    "public_benchmarks": []
  },
  {
    "id": "mlx/qwen3-0.6b:6bit",
    "name": "Qwen3-0.6B-MLX",
    "provider": "qwen",
    "family": "qwen3",
    "version": "1.0",
    "capabilities": [
      "generate",
      "classify",
      "reasoning"
    ],
    "context_length": 32768,
    "param_count": "0.6B",
    "quantization": "6bit",
    "performance": {
      "latency_p50_ms": null,
      "latency_p99_ms": null,
      "tokens_per_second": 140.0
    },
    "cost": {
      "input_per_mtok": null,
      "output_per_mtok": null,
      "size_mb": 500,
      "ram_mb": 500
    },
    "source": {
      "type": "mlx",
      "hf_repo": "mlx-community/Qwen3-0.6B-6bit",
      "hf_weight_file": null
    },
    "tags": [
      "builtin",
      "mlx",
      "fast"
    ],
    "supported_params": [],
    "public_benchmarks": []
  },
  {
    "id": "mlx/qwen3-1.7b:3bit",
    "name": "Qwen3-1.7B-MLX",
    "provider": "qwen",
    "family": "qwen3",
    "version": "1.0",
    "capabilities": [
      "generate",
      "code",
      "reasoning"
    ],
    "context_length": 32768,
    "param_count": "1.7B",
    "quantization": "3bit",
    "performance": {
      "latency_p50_ms": null,
      "latency_p99_ms": null,
      "tokens_per_second": 110.0
    },
    "cost": {
      "input_per_mtok": null,
      "output_per_mtok": null,
      "size_mb": 800,
      "ram_mb": 800
    },
    "source": {
      "type": "mlx",
      "hf_repo": "mlx-community/Qwen3-1.7B-3bit",
      "hf_weight_file": null
    },
    "tags": [
      "builtin",
      "mlx"
    ],
    "supported_params": [],
    "public_benchmarks": []
  },
  {
    "id": "mlx/qwen3-4b:4bit",
    "name": "Qwen3-4B-MLX",
    "provider": "qwen",
    "family": "qwen3",
    "version": "1.0",
    "capabilities": [
      "generate",
      "code",
      "reasoning",
      "tool_use",
      "multi_tool_call"
    ],
    "context_length": 32768,
    "param_count": "4B",
    "quantization": "4bit",
    "performance": {
      "latency_p50_ms": null,
      "latency_p99_ms": null,
      "tokens_per_second": 90.0
    },
    "cost": {
      "input_per_mtok": null,
      "output_per_mtok": null,
      "size_mb": 2400,
      "ram_mb": 2400
    },
    "source": {
      "type": "mlx",
      "hf_repo": "mlx-community/Qwen3-4B-4bit",
      "hf_weight_file": null
    },
    "tags": [
      "builtin",
      "mlx",
      "code"
    ],
    "supported_params": [],
    "public_benchmarks": []
  },
  {
    "id": "mlx/qwen3-8b:4bit",
    "name": "Qwen3-8B-MLX",
    "provider": "qwen",
    "family": "qwen3",
    "version": "1.0",
    "capabilities": [
      "generate",
      "code",
      "reasoning",
      "summarize",
      "tool_use",
      "multi_tool_call"
    ],
    "context_length": 131072,
    "param_count": "8B",
    "quantization": "4bit",
    "performance": {
      "latency_p50_ms": null,
      "latency_p99_ms": null,
      "tokens_per_second": 55.0
    },
    "cost": {
      "input_per_mtok": null,
      "output_per_mtok": null,
      "size_mb": 4800,
      "ram_mb": 4800
    },
    "source": {
      "type": "mlx",
      "hf_repo": "mlx-community/Qwen3-8B-4bit",
      "hf_weight_file": null
    },
    "tags": [
      "builtin",
      "mlx",
      "reasoning"
    ],
    "supported_params": [],
    "public_benchmarks": []
  },
  {
    "id": "mlx/qwen3-30b-a3b:4bit",
    "name": "Qwen3-30B-A3B-MLX",
    "provider": "qwen",
    "family": "qwen3",
    "version": "1.0",
    "capabilities": [
      "generate",
      "code",
      "reasoning",
      "summarize",
      "tool_use",
      "multi_tool_call"
    ],
    "context_length": 131072,
    "param_count": "30B (3B active)",
    "quantization": "4bit",
    "performance": {
      "latency_p50_ms": null,
      "latency_p99_ms": null,
      "tokens_per_second": 80.0
    },
    "cost": {
      "input_per_mtok": null,
      "output_per_mtok": null,
      "size_mb": 16500,
      "ram_mb": 16500
    },
    "source": {
      "type": "mlx",
      "hf_repo": "mlx-community/Qwen3-30B-A3B-4bit",
      "hf_weight_file": null
    },
    "tags": [
      "builtin",
      "mlx",
      "moe",
      "reasoning"
    ],
    "supported_params": [],
    "public_benchmarks": []
  },
  {
    "id": "vllm-mlx/qwen3.6-35b-a3b:4bit",
    "name": "Qwen3.6-35B-A3B-MLX-vLLM",
    "provider": "qwen",
    "family": "qwen3.6",
    "version": "3.6",
    "capabilities": [
      "generate",
      "code",
      "reasoning",
      "summarize",
      "tool_use",
      "multi_tool_call",
      "vision"
    ],
    "context_length": 262144,
    "param_count": "35B (3B active)",
    "quantization": "4bit",
    "performance": {
      "latency_p50_ms": null,
      "latency_p99_ms": null,
      "tokens_per_second": 80.0
    },
    "cost": {
      "input_per_mtok": null,
      "output_per_mtok": null,
      "size_mb": 20400,
      "ram_mb": 24000
    },
    "source": {
      "type": "vllm_mlx",
      "endpoint": "http://localhost:8000",
      "model_name": "mlx-community/Qwen3.6-35B-A3B-4bit"
    },
    "tags": [
      "builtin",
      "vllm-mlx",
      "mlx-vlm",
      "local",
      "apple-silicon",
      "latest",
      "qwen3.6",
      "qwen3_5_moe",
      "moe",
      "reasoning"
    ],
    "supported_params": [
      "temperature",
      "top_p",
      "max_tokens",
      "stop_sequences"
    ],
    "public_benchmarks": []
  },
  {
    "id": "mlx-vlm/qwen3-vl-2b:bf16",
    "name": "Qwen3-VL-2B-mlx-vlm",
    "provider": "qwen",
    "family": "qwen3-vl",
    "version": "bf16",
    "capabilities": [
      "generate",
      "vision",
      "grounding"
    ],
    "context_length": 262144,
    "param_count": "2B",
    "quantization": null,
    "performance": {
      "latency_p50_ms": null,
      "latency_p99_ms": null,
      "tokens_per_second": 50.0
    },
    "cost": {
      "input_per_mtok": null,
      "output_per_mtok": null,
      "size_mb": 4000,
      "ram_mb": 6000
    },
    "source": {
      "type": "mlx",
      "hf_repo": "Qwen/Qwen3-VL-2B-Instruct",
      "hf_weight_file": null
    },
    "tags": [
      "builtin",
      "mlx-vlm-cli",
      "local",
      "vision",
      "requires-mlx-vlm",
      "qwen3-vl"
    ],
    "supported_params": [],
    "public_benchmarks": []
  },
  {
    "id": "vllm-mlx/gemma-4-e2b-it",
    "name": "gemma-4-E2B-it",
    "provider": "google",
    "family": "gemma-4",
    "version": "1.0",
    "capabilities": [
      "generate",
      "tool_use",
      "vision",
      "video_understanding",
      "audio_understanding",
      "reasoning"
    ],
    "context_length": 131072,
    "param_count": "2B",
    "quantization": null,
    "performance": {
      "latency_p50_ms": null,
      "latency_p99_ms": null,
      "tokens_per_second": null
    },
    "cost": {
      "input_per_mtok": null,
      "output_per_mtok": null,
      "size_mb": 2000,
      "ram_mb": 2000
    },
    "source": {
      "type": "vllm_mlx",
      "endpoint": "http://localhost:8000",
      "model_name": "google/gemma-4-E2B-it"
    },
    "tags": [
      "builtin",
      "vllm-mlx",
      "local",
      "multimodal"
    ],
    "supported_params": [],
    "public_benchmarks": []
  },
  {
    "id": "vllm-mlx/gemma-4-e4b-it",
    "name": "gemma-4-E4B-it",
    "provider": "google",
    "family": "gemma-4",
    "version": "1.0",
    "capabilities": [
      "generate",
      "tool_use",
      "vision",
      "video_understanding",
      "audio_understanding",
      "reasoning"
    ],
    "context_length": 131072,
    "param_count": "4B",
    "quantization": null,
    "performance": {
      "latency_p50_ms": null,
      "latency_p99_ms": null,
      "tokens_per_second": null
    },
    "cost": {
      "input_per_mtok": null,
      "output_per_mtok": null,
      "size_mb": 4000,
      "ram_mb": 4000
    },
    "source": {
      "type": "vllm_mlx",
      "endpoint": "http://localhost:8000",
      "model_name": "google/gemma-4-E4B-it"
    },
    "tags": [
      "builtin",
      "vllm-mlx",
      "local",
      "multimodal"
    ],
    "supported_params": [],
    "public_benchmarks": []
  },
  {
    "id": "vllm-mlx/gemma-4-26b-a4b-it",
    "name": "gemma-4-26B-A4B-it",
    "provider": "google",
    "family": "gemma-4",
    "version": "1.0",
    "capabilities": [
      "generate",
      "tool_use",
      "vision",
      "video_understanding",
      "reasoning"
    ],
    "context_length": 131072,
    "param_count": "26B (4B active)",
    "quantization": null,
    "performance": {
      "latency_p50_ms": null,
      "latency_p99_ms": null,
      "tokens_per_second": null
    },
    "cost": {
      "input_per_mtok": null,
      "output_per_mtok": null,
      "size_mb": 16000,
      "ram_mb": 16000
    },
    "source": {
      "type": "vllm_mlx",
      "endpoint": "http://localhost:8000",
      "model_name": "google/gemma-4-26B-A4B-it"
    },
    "tags": [
      "builtin",
      "vllm-mlx",
      "local",
      "moe",
      "multimodal"
    ],
    "supported_params": [],
    "public_benchmarks": []
  },
  {
    "id": "mlx/parakeet-tdt-0.6b-v3:default",
    "name": "Parakeet-TDT-0.6B-v3-MLX",
    "provider": "mlx-community",
    "family": "parakeet",
    "version": "v3",
    "capabilities": [
      "speech_to_text"
    ],
    "context_length": 0,
    "param_count": "0.6B",
    "quantization": null,
    "performance": {
      "latency_p50_ms": 350,
      "latency_p99_ms": null,
      "tokens_per_second": null
    },
    "cost": {
      "input_per_mtok": null,
      "output_per_mtok": null,
      "size_mb": 2510,
      "ram_mb": 3000
    },
    "source": {
      "type": "mlx",
      "hf_repo": "mlx-community/parakeet-tdt-0.6b-v3",
      "hf_weight_file": null
    },
    "tags": [
      "builtin",
      "mlx",
      "speech",
      "stt",
      "realtime"
    ],
    "supported_params": [],
    "public_benchmarks": []
  },
  {
    "id": "mlx/kokoro-82m:6bit",
    "name": "Kokoro-82M-6bit",
    "provider": "mlx-community",
    "family": "kokoro",
    "version": "6bit",
    "capabilities": [
      "text_to_speech"
    ],
    "context_length": 0,
    "param_count": "82M",
    "quantization": "6bit",
    "performance": {
      "latency_p50_ms": 180,
      "latency_p99_ms": null,
      "tokens_per_second": null
    },
    "cost": {
      "input_per_mtok": null,
      "output_per_mtok": null,
      "size_mb": 348,
      "ram_mb": 768
    },
    "source": {
      "type": "mlx",
      "hf_repo": "mlx-community/Kokoro-82M-6bit",
      "hf_weight_file": null
    },
    "tags": [
      "builtin",
      "mlx",
      "speech",
      "tts",
      "realtime",
      "fast"
    ],
    "supported_params": [],
    "public_benchmarks": []
  },
  {
    "id": "mlx/kokoro-82m:bf16",
    "name": "Kokoro-82M-bf16",
    "provider": "mlx-community",
    "family": "kokoro",
    "version": "bf16",
    "capabilities": [
      "text_to_speech"
    ],
    "context_length": 0,
    "param_count": "82M",
    "quantization": "bf16",
    "performance": {
      "latency_p50_ms": 220,
      "latency_p99_ms": null,
      "tokens_per_second": null
    },
    "cost": {
      "input_per_mtok": null,
      "output_per_mtok": null,
      "size_mb": 512,
      "ram_mb": 1024
    },
    "source": {
      "type": "mlx",
      "hf_repo": "mlx-community/Kokoro-82M-bf16",
      "hf_weight_file": null
    },
    "tags": [
      "builtin",
      "mlx",
      "speech",
      "tts",
      "realtime",
      "stable"
    ],
    "supported_params": [],
    "public_benchmarks": []
  },
  {
    "id": "mlx/qwen3-tts-1.7b:5bit",
    "name": "Qwen3-TTS-12Hz-1.7B-Base-5bit",
    "provider": "qwen",
    "family": "qwen3-tts",
    "version": "5bit",
    "capabilities": [
      "text_to_speech"
    ],
    "context_length": 0,
    "param_count": "1.7B",
    "quantization": "5bit",
    "performance": {
      "latency_p50_ms": 900,
      "latency_p99_ms": null,
      "tokens_per_second": null
    },
    "cost": {
      "input_per_mtok": null,
      "output_per_mtok": null,
      "size_mb": 2200,
      "ram_mb": 3000
    },
    "source": {
      "type": "mlx",
      "hf_repo": "mlx-community/Qwen3-TTS-12Hz-1.7B-Base-5bit",
      "hf_weight_file": null
    },
    "tags": [
      "builtin",
      "mlx",
      "speech",
      "tts",
      "quality"
    ],
    "supported_params": [],
    "public_benchmarks": []
  },
  {
    "id": "elevenlabs/scribe-v1:latest",
    "name": "scribe_v1",
    "provider": "elevenlabs",
    "family": "scribe",
    "version": "latest",
    "capabilities": [
      "speech_to_text"
    ],
    "context_length": 0,
    "param_count": "",
    "quantization": null,
    "performance": {
      "latency_p50_ms": 700,
      "latency_p99_ms": null,
      "tokens_per_second": null
    },
    "cost": {
      "input_per_mtok": null,
      "output_per_mtok": null,
      "size_mb": null,
      "ram_mb": null
    },
    "source": {
      "type": "proprietary",
      "provider": "elevenlabs",
      "endpoint": "https://api.elevenlabs.io",
      "auth": {
        "type": "api_key_env",
        "env_var": "ELEVENLABS_API_KEY"
      },
      "protocol": {
        "chat_path": "/chat",
        "content_type": "application/json",
        "streaming": false,
        "extra_headers": {}
      }
    },
    "tags": [
      "builtin",
      "speech",
      "stt",
      "remote"
    ],
    "supported_params": [],
    "public_benchmarks": []
  },
  {
    "id": "elevenlabs/eleven-flash-v2.5:latest",
    "name": "eleven_flash_v2_5",
    "provider": "elevenlabs",
    "family": "eleven-flash",
    "version": "latest",
    "capabilities": [
      "text_to_speech"
    ],
    "context_length": 0,
    "param_count": "",
    "quantization": null,
    "performance": {
      "latency_p50_ms": 250,
      "latency_p99_ms": null,
      "tokens_per_second": null
    },
    "cost": {
      "input_per_mtok": null,
      "output_per_mtok": null,
      "size_mb": null,
      "ram_mb": null
    },
    "source": {
      "type": "proprietary",
      "provider": "elevenlabs",
      "endpoint": "https://api.elevenlabs.io",
      "auth": {
        "type": "api_key_env",
        "env_var": "ELEVENLABS_API_KEY"
      },
      "protocol": {
        "chat_path": "/chat",
        "content_type": "application/json",
        "streaming": false,
        "extra_headers": {}
      }
    },
    "tags": [
      "builtin",
      "speech",
      "tts",
      "remote",
      "fast"
    ],
    "supported_params": [],
    "public_benchmarks": []
  },
  {
    "id": "anthropic/claude-opus-4-7:latest",
    "name": "claude-opus-4-7",
    "provider": "anthropic",
    "family": "claude-4",
    "version": "latest",
    "capabilities": [
      "generate",
      "code",
      "reasoning",
      "tool_use",
      "multi_tool_call",
      "summarize",
      "vision"
    ],
    "context_length": 1000000,
    "param_count": "",
    "quantization": null,
    "performance": {
      "latency_p50_ms": null,
      "latency_p99_ms": null,
      "tokens_per_second": null
    },
    "cost": {
      "input_per_mtok": null,
      "output_per_mtok": null,
      "size_mb": null,
      "ram_mb": null
    },
    "source": {
      "type": "remote_api",
      "endpoint": "https://api.anthropic.com/v1/messages",
      "api_key_env": "ANTHROPIC_API_KEY",
      "api_key_envs": [],
      "api_version": "2023-06-01",
      "protocol": "anthropic"
    },
    "tags": [
      "builtin",
      "frontier"
    ],
    "supported_params": [],
    "public_benchmarks": []
  },
  {
    "id": "anthropic/claude-opus-4-6:latest",
    "name": "claude-opus-4-6",
    "provider": "anthropic",
    "family": "claude-4",
    "version": "latest",
    "capabilities": [
      "generate",
      "code",
      "reasoning",
      "tool_use",
      "multi_tool_call",
      "summarize",
      "vision"
    ],
    "context_length": 1000000,
    "param_count": "",
    "quantization": null,
    "performance": {
      "latency_p50_ms": 3000,
      "latency_p99_ms": null,
      "tokens_per_second": null
    },
    "cost": {
      "input_per_mtok": 5.0,
      "output_per_mtok": 25.0,
      "size_mb": null,
      "ram_mb": null
    },
    "source": {
      "type": "remote_api",
      "endpoint": "https://api.anthropic.com/v1/messages",
      "api_key_env": "ANTHROPIC_API_KEY",
      "api_key_envs": [],
      "api_version": "2023-06-01",
      "protocol": "anthropic"
    },
    "tags": [
      "builtin",
      "frontier"
    ],
    "supported_params": [],
    "public_benchmarks": []
  },
  {
    "id": "anthropic/claude-sonnet-4-6:latest",
    "name": "claude-sonnet-4-6",
    "provider": "anthropic",
    "family": "claude-4",
    "version": "latest",
    "capabilities": [
      "generate",
      "code",
      "reasoning",
      "tool_use",
      "multi_tool_call",
      "summarize",
      "vision"
    ],
    "context_length": 200000,
    "param_count": "",
    "quantization": null,
    "performance": {
      "latency_p50_ms": 1500,
      "latency_p99_ms": null,
      "tokens_per_second": null
    },
    "cost": {
      "input_per_mtok": 3.0,
      "output_per_mtok": 15.0,
      "size_mb": null,
      "ram_mb": null
    },
    "source": {
      "type": "remote_api",
      "endpoint": "https://api.anthropic.com/v1/messages",
      "api_key_env": "ANTHROPIC_API_KEY",
      "api_key_envs": [],
      "api_version": "2023-06-01",
      "protocol": "anthropic"
    },
    "tags": [
      "builtin",
      "balanced"
    ],
    "supported_params": [],
    "public_benchmarks": []
  },
  {
    "id": "anthropic/claude-haiku-4-5:latest",
    "name": "claude-haiku-4-5",
    "provider": "anthropic",
    "family": "claude-4",
    "version": "latest",
    "capabilities": [
      "generate",
      "code",
      "tool_use",
      "multi_tool_call",
      "summarize",
      "vision"
    ],
    "context_length": 200000,
    "param_count": "",
    "quantization": null,
    "performance": {
      "latency_p50_ms": 600,
      "latency_p99_ms": null,
      "tokens_per_second": null
    },
    "cost": {
      "input_per_mtok": 1.0,
      "output_per_mtok": 5.0,
      "size_mb": null,
      "ram_mb": null
    },
    "source": {
      "type": "remote_api",
      "endpoint": "https://api.anthropic.com/v1/messages",
      "api_key_env": "ANTHROPIC_API_KEY",
      "api_key_envs": [],
      "api_version": "2023-06-01",
      "protocol": "anthropic"
    },
    "tags": [
      "builtin",
      "fast",
      "cheap"
    ],
    "supported_params": [],
    "public_benchmarks": []
  },
  {
    "id": "openai/gpt-5.4:latest",
    "name": "gpt-5.4",
    "provider": "openai",
    "family": "gpt-5",
    "version": "latest",
    "capabilities": [
      "generate",
      "code",
      "reasoning",
      "tool_use",
      "multi_tool_call",
      "summarize",
      "vision"
    ],
    "context_length": 1000000,
    "param_count": "",
    "quantization": null,
    "performance": {
      "latency_p50_ms": 2000,
      "latency_p99_ms": null,
      "tokens_per_second": null
    },
    "cost": {
      "input_per_mtok": 2.5,
      "output_per_mtok": 10.0,
      "size_mb": null,
      "ram_mb": null
    },
    "source": {
      "type": "remote_api",
      "endpoint": "https://api.openai.com",
      "api_key_env": "OPENAI_API_KEY",
      "api_key_envs": [],
      "api_version": null,
      "protocol": "open_ai_responses"
    },
    "tags": [
      "builtin",
      "frontier"
    ],
    "supported_params": [],
    "public_benchmarks": []
  },
  {
    "id": "openai/gpt-5.4-mini:latest",
    "name": "gpt-5.4-mini",
    "provider": "openai",
    "family": "gpt-5",
    "version": "latest",
    "capabilities": [
      "generate",
      "code",
      "reasoning",
      "tool_use",
      "multi_tool_call",
      "summarize",
      "vision"
    ],
    "context_length": 1000000,
    "param_count": "",
    "quantization": null,
    "performance": {
      "latency_p50_ms": 800,
      "latency_p99_ms": null,
      "tokens_per_second": null
    },
    "cost": {
      "input_per_mtok": 0.4,
      "output_per_mtok": 1.6,
      "size_mb": null,
      "ram_mb": null
    },
    "source": {
      "type": "remote_api",
      "endpoint": "https://api.openai.com",
      "api_key_env": "OPENAI_API_KEY",
      "api_key_envs": [],
      "api_version": null,
      "protocol": "open_ai_responses"
    },
    "tags": [
      "builtin",
      "fast",
      "cheap"
    ],
    "supported_params": [],
    "public_benchmarks": []
  },
  {
    "id": "openai/o3:latest",
    "name": "o3",
    "provider": "openai",
    "family": "o-series",
    "version": "latest",
    "capabilities": [
      "generate",
      "code",
      "reasoning",
      "tool_use",
      "multi_tool_call",
      "vision"
    ],
    "context_length": 200000,
    "param_count": "",
    "quantization": null,
    "performance": {
      "latency_p50_ms": 5000,
      "latency_p99_ms": null,
      "tokens_per_second": null
    },
    "cost": {
      "input_per_mtok": 2.0,
      "output_per_mtok": 8.0,
      "size_mb": null,
      "ram_mb": null
    },
    "source": {
      "type": "remote_api",
      "endpoint": "https://api.openai.com",
      "api_key_env": "OPENAI_API_KEY",
      "api_key_envs": [],
      "api_version": null,
      "protocol": "open_ai_responses"
    },
    "tags": [
      "builtin",
      "reasoning"
    ],
    "supported_params": [
      "max_tokens",
      "stop_sequences"
    ],
    "public_benchmarks": []
  },
  {
    "id": "openai/o4-mini:latest",
    "name": "o4-mini",
    "provider": "openai",
    "family": "o-series",
    "version": "latest",
    "capabilities": [
      "generate",
      "code",
      "reasoning",
      "tool_use",
      "vision"
    ],
    "context_length": 200000,
    "param_count": "",
    "quantization": null,
    "performance": {
      "latency_p50_ms": 2500,
      "latency_p99_ms": null,
      "tokens_per_second": null
    },
    "cost": {
      "input_per_mtok": 1.1,
      "output_per_mtok": 4.4,
      "size_mb": null,
      "ram_mb": null
    },
    "source": {
      "type": "remote_api",
      "endpoint": "https://api.openai.com",
      "api_key_env": "OPENAI_API_KEY",
      "api_key_envs": [],
      "api_version": null,
      "protocol": "open_ai_responses"
    },
    "tags": [
      "builtin",
      "reasoning",
      "cheap"
    ],
    "supported_params": [
      "max_tokens",
      "stop_sequences"
    ],
    "public_benchmarks": []
  },
  {
    "id": "openai/gpt-5.3-codex:latest",
    "name": "gpt-5.3-codex",
    "provider": "openai",
    "family": "gpt-5",
    "version": "latest",
    "capabilities": [
      "generate",
      "code",
      "reasoning",
      "tool_use",
      "multi_tool_call"
    ],
    "context_length": 192000,
    "param_count": "",
    "quantization": null,
    "performance": {
      "latency_p50_ms": 2000,
      "latency_p99_ms": null,
      "tokens_per_second": null
    },
    "cost": {
      "input_per_mtok": 3.0,
      "output_per_mtok": 15.0,
      "size_mb": null,
      "ram_mb": null
    },
    "source": {
      "type": "remote_api",
      "endpoint": "https://api.openai.com",
      "api_key_env": "OPENAI_API_KEY",
      "api_key_envs": [],
      "api_version": null,
      "protocol": "open_ai_responses"
    },
    "tags": [
      "builtin",
      "code"
    ],
    "supported_params": [],
    "public_benchmarks": []
  },
  {
    "id": "openai/gpt-4.1-mini:latest",
    "name": "gpt-4.1-mini",
    "provider": "openai",
    "family": "gpt-4",
    "version": "latest",
    "capabilities": [
      "generate",
      "code",
      "tool_use",
      "multi_tool_call",
      "vision"
    ],
    "context_length": 1000000,
    "param_count": "",
    "quantization": null,
    "performance": {
      "latency_p50_ms": 600,
      "latency_p99_ms": null,
      "tokens_per_second": null
    },
    "cost": {
      "input_per_mtok": 0.4,
      "output_per_mtok": 1.6,
      "size_mb": null,
      "ram_mb": null
    },
    "source": {
      "type": "remote_api",
      "endpoint": "https://api.openai.com",
      "api_key_env": "OPENAI_API_KEY",
      "api_key_envs": [],
      "api_version": null,
      "protocol": "open_ai_responses"
    },
    "tags": [
      "builtin",
      "fast",
      "cheap"
    ],
    "supported_params": [],
    "public_benchmarks": []
  },
  {
    "id": "google/gemini-2.5-pro:latest",
    "name": "gemini-2.5-pro",
    "provider": "google",
    "family": "gemini-2",
    "version": "latest",
    "capabilities": [
      "generate",
      "code",
      "reasoning",
      "summarize",
      "tool_use",
      "multi_tool_call",
      "vision"
    ],
    "context_length": 1000000,
    "param_count": "",
    "quantization": null,
    "performance": {
      "latency_p50_ms": 2500,
      "latency_p99_ms": null,
      "tokens_per_second": null
    },
    "cost": {
      "input_per_mtok": 1.25,
      "output_per_mtok": 10.0,
      "size_mb": null,
      "ram_mb": null
    },
    "source": {
      "type": "remote_api",
      "endpoint": "https://generativelanguage.googleapis.com",
      "api_key_env": "GOOGLE_API_KEY",
      "api_key_envs": [],
      "api_version": null,
      "protocol": "google"
    },
    "tags": [
      "builtin",
      "balanced"
    ],
    "supported_params": [],
    "public_benchmarks": []
  },
  {
    "id": "google/gemini-2.5-flash:latest",
    "name": "gemini-2.5-flash",
    "provider": "google",
    "family": "gemini-2",
    "version": "latest",
    "capabilities": [
      "generate",
      "code",
      "tool_use",
      "multi_tool_call",
      "summarize",
      "vision"
    ],
    "context_length": 1000000,
    "param_count": "",
    "quantization": null,
    "performance": {
      "latency_p50_ms": 500,
      "latency_p99_ms": null,
      "tokens_per_second": null
    },
    "cost": {
      "input_per_mtok": 0.3,
      "output_per_mtok": 2.5,
      "size_mb": null,
      "ram_mb": null
    },
    "source": {
      "type": "remote_api",
      "endpoint": "https://generativelanguage.googleapis.com",
      "api_key_env": "GOOGLE_API_KEY",
      "api_key_envs": [],
      "api_version": null,
      "protocol": "google"
    },
    "tags": [
      "builtin",
      "fast",
      "cheap"
    ],
    "supported_params": [],
    "public_benchmarks": []
  },
  {
    "id": "mlx/flux-1-lite-8b:q4",
    "name": "Flux-1.lite-8B-MLX-Q4",
    "provider": "mlx-community",
    "family": "flux",
    "version": "q4",
    "capabilities": [
      "image_generation"
    ],
    "context_length": 0,
    "param_count": "8B",
    "quantization": "Q4",
    "performance": {
      "latency_p50_ms": 12000,
      "latency_p99_ms": null,
      "tokens_per_second": null
    },
    "cost": {
      "input_per_mtok": null,
      "output_per_mtok": null,
      "size_mb": 7680,
      "ram_mb": 12288
    },
    "source": {
      "type": "mlx",
      "hf_repo": "mlx-community/Flux-1.lite-8B-MLX-Q4",
      "hf_weight_file": null
    },
    "tags": [
      "builtin",
      "mlx",
      "image"
    ],
    "supported_params": [],
    "public_benchmarks": []
  },
  {
    "id": "mlx/ltx-2.3:q4",
    "name": "ltx-2.3-mlx-q4",
    "provider": "dgrauet",
    "family": "ltx-2.3",
    "version": "4bit",
    "capabilities": [
      "video_generation"
    ],
    "context_length": 0,
    "param_count": "22B",
    "quantization": "4bit",
    "performance": {
      "latency_p50_ms": 45000,
      "latency_p99_ms": null,
      "tokens_per_second": null
    },
    "cost": {
      "input_per_mtok": null,
      "output_per_mtok": null,
      "size_mb": 40800,
      "ram_mb": 40800
    },
    "source": {
      "type": "mlx",
      "hf_repo": "dgrauet/ltx-2.3-mlx-q4",
      "hf_weight_file": null
    },
    "tags": [
      "builtin",
      "mlx",
      "video"
    ],
    "supported_params": [],
    "public_benchmarks": []
  },
  {
    "id": "apple/foundation:default",
    "name": "apple-foundation",
    "provider": "apple",
    "family": "foundation",
    "version": "1.0",
    "capabilities": [
      "generate",
      "summarize",
      "classify"
    ],
    "context_length": 8192,
    "param_count": "system",
    "quantization": null,
    "performance": {
      "latency_p50_ms": 50,
      "latency_p99_ms": null,
      "tokens_per_second": null
    },
    "cost": {
      "input_per_mtok": null,
      "output_per_mtok": null,
      "size_mb": null,
      "ram_mb": null
    },
    "source": {
      "type": "apple_foundation_models",
      "use_case": null
    },
    "tags": [
      "builtin",
      "local",
      "low_latency",
      "private"
    ],
    "supported_params": [
      "temperature",
      "max_tokens"
    ],
    "public_benchmarks": []
  }
]