---
openapi: 3.1.0
info:
title: Inference Gateway API
description: |
The API for interacting with various language models and other AI services.
OpenAI, Groq, Ollama, and other providers are supported.
OpenAI-compatible API for use with existing clients.
Unified API for all providers.
contact:
name: Inference Gateway
url: https://inference-gateway.github.io/docs/
version: 1.0.0
license:
name: MIT
url: https://github.com/inference-gateway/inference-gateway/blob/main/LICENSE
servers:
- url: http://localhost:8080
description: Default server without version prefix for healthcheck and proxy endpoints
x-server-tags: ['Health', 'Proxy']
- url: http://localhost:8080/v1
description: Default server with version prefix for listing models and chat completions
x-server-tags: ['Models', 'Completions']
- url: https://api.inference-gateway.local/v1
description: Local server with version prefix for listing models and chat completions
x-server-tags: ['Models', 'Completions']
tags:
- name: Models
description: List and describe the various models available in the API.
- name: Completions
description: Generate completions from the models.
- name: MCP
description: List and manage MCP tools.
- name: Proxy
description: Proxy requests to provider endpoints.
- name: Health
description: Health check
paths:
/models:
get:
operationId: listModels
tags:
- Models
description: |
Lists the currently available models, and provides basic information
about each one such as the owner and availability.
summary:
Lists the currently available models, and provides basic information
about each one such as the owner and availability.
security:
- bearerAuth: []
parameters:
- name: provider
in: query
required: false
schema:
$ref: '#/components/schemas/Provider'
description: Specific provider to query (optional)
responses:
'200':
description: List of available models
content:
application/json:
schema:
$ref: '#/components/schemas/ListModelsResponse'
examples:
allProviders:
summary: Models from all providers
value:
object: 'list'
data:
- id: 'openai/gpt-4o'
object: 'model'
created: 1686935002
owned_by: 'openai'
served_by: 'openai'
- id: 'groq/llama-3.3-70b-versatile'
object: 'model'
created: 1723651281
owned_by: 'groq'
served_by: 'groq'
- id: 'anthropic/claude-3-opus-20240229'
object: 'model'
created: 1708905600
owned_by: 'anthropic'
served_by: 'anthropic'
- id: 'cohere/command-r'
object: 'model'
created: 1707868800
owned_by: 'cohere'
served_by: 'cohere'
- id: 'ollama/phi3:3.8b'
object: 'model'
created: 1718441600
owned_by: 'ollama'
served_by: 'ollama'
- id: 'ollama_cloud/gpt-oss:20b'
object: 'model'
created: 1730419200
owned_by: 'ollama_cloud'
served_by: 'ollama_cloud'
- id: 'mistral/mistral-large-latest'
object: 'model'
created: 1698019200
owned_by: 'mistral'
served_by: 'mistral'
singleProvider:
summary: Models from a specific provider
value:
object: 'list'
data:
- id: 'openai/gpt-4o'
object: 'model'
created: 1686935002
owned_by: 'openai'
served_by: 'openai'
- id: 'openai/gpt-4-turbo'
object: 'model'
created: 1687882410
owned_by: 'openai'
served_by: 'openai'
- id: 'openai/gpt-3.5-turbo'
object: 'model'
created: 1677649963
owned_by: 'openai'
served_by: 'openai'
'401':
$ref: '#/components/responses/Unauthorized'
'500':
$ref: '#/components/responses/InternalError'
/chat/completions:
post:
operationId: createChatCompletion
tags:
- Completions
description: |
Generates a chat completion based on the provided input.
The completion can be streamed to the client as it is generated.
summary: Create a chat completion
security:
- bearerAuth: []
parameters:
- name: provider
in: query
required: false
schema:
$ref: '#/components/schemas/Provider'
description: Specific provider to use (default determined by model)
requestBody:
$ref: '#/components/requestBodies/CreateChatCompletionRequest'
responses:
'200':
description: Successful response
content:
application/json:
schema:
$ref: '#/components/schemas/CreateChatCompletionResponse'
text/event-stream:
schema:
$ref: '#/components/schemas/SSEvent'
'400':
$ref: '#/components/responses/BadRequest'
'401':
$ref: '#/components/responses/Unauthorized'
'500':
$ref: '#/components/responses/InternalError'
/mcp/tools:
get:
operationId: listTools
tags:
- MCP
description: |
Lists the currently available MCP tools. Only accessible when MCP_EXPOSE is enabled.
summary: Lists the currently available MCP tools
security:
- bearerAuth: []
responses:
'200':
description: Successful response
content:
application/json:
schema:
$ref: '#/components/schemas/ListToolsResponse'
'401':
$ref: '#/components/responses/Unauthorized'
'403':
$ref: '#/components/responses/MCPNotExposed'
'500':
$ref: '#/components/responses/InternalError'
/proxy/{provider}/{path}:
parameters:
- name: provider
in: path
required: true
schema:
$ref: '#/components/schemas/Provider'
- name: path
in: path
required: true
style: simple
explode: false
schema:
type: string
description: The remaining path to proxy to the provider
get:
operationId: proxyGet
tags:
- Proxy
description: |
Proxy GET request to provider
The request body depends on the specific provider and endpoint being called.
If you decide to use this approach, please follow the provider-specific documentations.
summary: Proxy GET request to provider
responses:
'200':
$ref: '#/components/responses/ProviderResponse'
'400':
$ref: '#/components/responses/BadRequest'
'401':
$ref: '#/components/responses/Unauthorized'
'500':
$ref: '#/components/responses/InternalError'
security:
- bearerAuth: []
post:
operationId: proxyPost
tags:
- Proxy
description: |
Proxy POST request to provider
The request body depends on the specific provider and endpoint being called.
If you decide to use this approach, please follow the provider-specific documentations.
summary: Proxy POST request to provider
requestBody:
$ref: '#/components/requestBodies/ProviderRequest'
responses:
'200':
$ref: '#/components/responses/ProviderResponse'
'400':
$ref: '#/components/responses/BadRequest'
'401':
$ref: '#/components/responses/Unauthorized'
'500':
$ref: '#/components/responses/InternalError'
security:
- bearerAuth: []
put:
operationId: proxyPut
tags:
- Proxy
description: |
Proxy PUT request to provider
The request body depends on the specific provider and endpoint being called.
If you decide to use this approach, please follow the provider-specific documentations.
summary: Proxy PUT request to provider
requestBody:
$ref: '#/components/requestBodies/ProviderRequest'
responses:
'200':
$ref: '#/components/responses/ProviderResponse'
'400':
$ref: '#/components/responses/BadRequest'
'401':
$ref: '#/components/responses/Unauthorized'
'500':
$ref: '#/components/responses/InternalError'
security:
- bearerAuth: []
delete:
operationId: proxyDelete
tags:
- Proxy
description: |
Proxy DELETE request to provider
The request body depends on the specific provider and endpoint being called.
If you decide to use this approach, please follow the provider-specific documentations.
summary: Proxy DELETE request to provider
responses:
'200':
$ref: '#/components/responses/ProviderResponse'
'400':
$ref: '#/components/responses/BadRequest'
'401':
$ref: '#/components/responses/Unauthorized'
'500':
$ref: '#/components/responses/InternalError'
security:
- bearerAuth: []
patch:
operationId: proxyPatch
tags:
- Proxy
description: |
Proxy PATCH request to provider
The request body depends on the specific provider and endpoint being called.
If you decide to use this approach, please follow the provider-specific documentations.
summary: Proxy PATCH request to provider
requestBody:
$ref: '#/components/requestBodies/ProviderRequest'
responses:
'200':
$ref: '#/components/responses/ProviderResponse'
'400':
$ref: '#/components/responses/BadRequest'
'401':
$ref: '#/components/responses/Unauthorized'
'500':
$ref: '#/components/responses/InternalError'
security:
- bearerAuth: []
/health:
get:
operationId: healthCheck
tags:
- Health
description: |
Health check endpoint
Returns a 200 status code if the service is healthy
summary: Health check
responses:
'200':
description: Health check successful
components:
requestBodies:
ProviderRequest:
required: true
description: |
ProviderRequest depends on the specific provider and endpoint being called
If you decide to use this approach, please follow the provider-specific documentations.
content:
application/json:
schema:
type: object
properties:
model:
type: string
messages:
type: array
items:
type: object
properties:
role:
type: string
content:
type: string
temperature:
type: number
format: float
default: 0.7
examples:
openai:
summary: OpenAI chat completion request
value:
model: 'gpt-3.5-turbo'
messages:
- role: 'user'
content: 'Hello! How can I assist you today?'
temperature: 0.7
anthropic:
summary: Anthropic Claude request
value:
model: 'claude-3-opus-20240229'
messages:
- role: 'user'
content: 'Explain quantum computing'
temperature: 0.5
mistral:
summary: Mistral AI request
value:
model: 'mistral-large-latest'
messages:
- role: 'user'
content: 'Write a Python function to calculate fibonacci numbers'
temperature: 0.3
CreateChatCompletionRequest:
required: true
description: |
The chat completion request to process.
The gateway forwards it to the selected provider, translating to the provider-specific format where needed.
content:
application/json:
schema:
$ref: '#/components/schemas/CreateChatCompletionRequest'
responses:
BadRequest:
description: Bad request
content:
application/json:
schema:
$ref: '#/components/schemas/Error'
Unauthorized:
description: Unauthorized
content:
application/json:
schema:
$ref: '#/components/schemas/Error'
InternalError:
description: Internal server error
content:
application/json:
schema:
$ref: '#/components/schemas/Error'
MCPNotExposed:
description: MCP tools endpoint is not exposed
content:
application/json:
schema:
$ref: '#/components/schemas/Error'
example:
error: 'MCP tools endpoint is not exposed. Set MCP_EXPOSE=true to enable.'
ProviderResponse:
description: |
ProviderResponse depends on the specific provider and endpoint being called
If you decide to use this approach, please follow the provider-specific documentations.
content:
application/json:
schema:
$ref: '#/components/schemas/ProviderSpecificResponse'
examples:
openai:
summary: OpenAI API response
value:
{
'id': 'chatcmpl-123',
'object': 'chat.completion',
'created': 1677652288,
'model': 'gpt-3.5-turbo',
'choices':
[
{
'index': 0,
'message':
{
'role': 'assistant',
'content': 'Hello! How can I help you today?',
},
'finish_reason': 'stop',
},
],
}
mistral:
summary: Mistral AI response
value:
{
'id': 'cmpl-123',
'object': 'chat.completion',
'created': 1677652288,
'model': 'mistral-large-latest',
'choices':
[
{
'index': 0,
'message':
{
'role': 'assistant',
'content': 'def fibonacci(n):\n if n <= 1:\n return n\n return fibonacci(n-1) + fibonacci(n-2)',
},
'finish_reason': 'stop',
},
],
}
securitySchemes:
bearerAuth:
type: http
scheme: bearer
bearerFormat: JWT
description: |
Authentication is optional by default.
To enable authentication, set AUTH_ENABLE to true.
When enabled, requests must include a valid JWT token in the Authorization header.
schemas:
Provider:
type: string
enum:
- ollama
- ollama_cloud
- groq
- openai
- cloudflare
- cohere
- anthropic
- deepseek
- google
- mistral
- moonshot
x-provider-configs:
ollama:
id: 'ollama'
url: 'http://ollama:8080/v1'
auth_type: 'none'
supports_vision: true
endpoints:
models:
name: 'list_models'
method: 'GET'
endpoint: '/models'
chat:
name: 'chat_completions'
method: 'POST'
endpoint: '/chat/completions'
ollama_cloud:
id: 'ollama_cloud'
url: 'https://ollama.com/v1'
auth_type: 'bearer'
supports_vision: true
endpoints:
models:
name: 'list_models'
method: 'GET'
endpoint: '/models'
chat:
name: 'chat_completions'
method: 'POST'
endpoint: '/chat/completions'
anthropic:
id: 'anthropic'
url: 'https://api.anthropic.com/v1'
auth_type: 'xheader'
supports_vision: true
endpoints:
models:
name: 'list_models'
method: 'GET'
endpoint: '/models'
chat:
name: 'chat_completions'
method: 'POST'
endpoint: '/chat/completions'
cohere:
id: 'cohere'
url: 'https://api.cohere.ai'
auth_type: 'bearer'
supports_vision: true
endpoints:
models:
name: 'list_models'
method: 'GET'
endpoint: '/v1/models'
chat:
name: 'chat_completions'
method: 'POST'
endpoint: '/compatibility/v1/chat/completions'
groq:
id: 'groq'
url: 'https://api.groq.com/openai/v1'
auth_type: 'bearer'
supports_vision: true
endpoints:
models:
name: 'list_models'
method: 'GET'
endpoint: '/models'
chat:
name: 'chat_completions'
method: 'POST'
endpoint: '/chat/completions'
openai:
id: 'openai'
url: 'https://api.openai.com/v1'
auth_type: 'bearer'
supports_vision: true
endpoints:
models:
name: 'list_models'
method: 'GET'
endpoint: '/models'
chat:
name: 'chat_completions'
method: 'POST'
endpoint: '/chat/completions'
cloudflare:
id: 'cloudflare'
url: 'https://api.cloudflare.com/client/v4/accounts/{ACCOUNT_ID}/ai'
auth_type: 'bearer'
supports_vision: false
endpoints:
models:
name: 'list_models'
method: 'GET'
endpoint: '/finetunes/public?limit=1000'
chat:
name: 'chat_completions'
method: 'POST'
endpoint: '/v1/chat/completions'
deepseek:
id: 'deepseek'
url: 'https://api.deepseek.com'
auth_type: 'bearer'
supports_vision: false
endpoints:
models:
name: 'list_models'
method: 'GET'
endpoint: '/models'
chat:
name: 'chat_completions'
method: 'POST'
endpoint: '/chat/completions'
google:
id: 'google'
url: 'https://generativelanguage.googleapis.com/v1beta/openai'
auth_type: 'bearer'
supports_vision: true
endpoints:
models:
name: 'list_models'
method: 'GET'
endpoint: '/models'
chat:
name: 'chat_completions'
method: 'POST'
endpoint: '/chat/completions'
mistral:
id: 'mistral'
url: 'https://api.mistral.ai/v1'
auth_type: 'bearer'
supports_vision: true
endpoints:
models:
name: 'list_models'
method: 'GET'
endpoint: '/models'
chat:
name: 'chat_completions'
method: 'POST'
endpoint: '/chat/completions'
moonshot:
id: 'moonshot'
url: 'https://api.moonshot.ai/v1'
auth_type: 'bearer'
supports_vision: false
endpoints:
models:
name: 'list_models'
method: 'GET'
endpoint: '/models'
chat:
name: 'chat_completions'
method: 'POST'
endpoint: '/chat/completions'
ProviderSpecificResponse:
type: object
description: |
Provider-specific response format. Examples:
OpenAI GET /v1/models?provider=openai response:
```json
{
"provider": "openai",
"object": "list",
"data": [
{
"id": "gpt-4",
"object": "model",
"created": 1687882410,
"owned_by": "openai",
"served_by": "openai"
}
]
}
```
Anthropic GET /v1/models?provider=anthropic response:
```json
{
"provider": "anthropic",
"object": "list",
"data": [
{
"id": "gpt-4",
"object": "model",
"created": 1687882410,
"owned_by": "openai",
"served_by": "openai"
}
]
}
```
ProviderAuthType:
type: string
description: Authentication type for providers
enum:
- bearer
- xheader
- query
- none
SSEvent:
type: object
properties:
event:
type: string
enum:
- message-start
- stream-start
- content-start
- content-delta
- content-end
- message-end
- stream-end
data:
type: string
format: byte
retry:
type: integer
Endpoints:
type: object
properties:
models:
type: string
chat:
type: string
required:
- models
- chat
Error:
type: object
properties:
error:
type: string
MessageRole:
type: string
description: Role of the message sender
enum:
- system
- user
- assistant
- tool
Message:
type: object
description: Message structure for provider requests
properties:
role:
$ref: '#/components/schemas/MessageRole'
content:
$ref: '#/components/schemas/MessageContent'
tool_calls:
type: array
items:
$ref: '#/components/schemas/ChatCompletionMessageToolCall'
tool_call_id:
type: string
reasoning_content:
type: string
description: The reasoning content of the chunk message.
reasoning:
type: string
description: The reasoning of the chunk message. Same as reasoning_content.
required:
- role
- content
MessageContent:
description: Message content - either text or multimodal content parts
oneOf:
- type: string
description: Text content (backward compatibility)
- type: array
items:
$ref: '#/components/schemas/ContentPart'
description: Array of content parts for multimodal messages
ContentPart:
type: object
description: A content part within a multimodal message
oneOf:
- $ref: '#/components/schemas/TextContentPart'
- $ref: '#/components/schemas/ImageContentPart'
TextContentPart:
type: object
description: Text content part
properties:
type:
type: string
enum: [text]
description: Content type identifier
text:
type: string
description: The text content
required:
- type
- text
ImageContentPart:
type: object
description: Image content part
properties:
type:
type: string
enum: [image_url]
description: Content type identifier
image_url:
$ref: '#/components/schemas/ImageURL'
required:
- type
- image_url
ImageURL:
type: object
description: Image URL configuration
properties:
url:
type: string
description: URL of the image (data URLs supported)
detail:
type: string
enum: [auto, low, high]
default: auto
description: Image detail level for vision processing
required:
- url
Model:
type: object
description: Common model information
properties:
id:
type: string
object:
type: string
created:
type: integer
format: int64
owned_by:
type: string
served_by:
$ref: '#/components/schemas/Provider'
required:
- id
- object
- created
- owned_by
- served_by
ListModelsResponse:
type: object
description: Response structure for listing models
properties:
provider:
$ref: '#/components/schemas/Provider'
object:
type: string
data:
type: array
items:
$ref: '#/components/schemas/Model'
default: []
required:
- object
- data
ListToolsResponse:
type: object
description: Response structure for listing MCP tools
properties:
object:
type: string
description: Always "list"
example: 'list'
data:
type: array
items:
$ref: '#/components/schemas/MCPTool'
default: []
description: Array of available MCP tools
required:
- object
- data
MCPTool:
type: object
description: An MCP tool definition
properties:
name:
type: string
description: The name of the tool
example: 'read_file'
description:
type: string
description: A description of what the tool does
example: 'Read content from a file'
server:
type: string
description: The MCP server that provides this tool
example: 'http://mcp-filesystem-server:8083/mcp'
input_schema:
type: object
description: JSON schema for the tool's input parameters
example:
type: 'object'
properties:
file_path:
type: 'string'
description: 'Path to the file to read'
required: ['file_path']
additionalProperties: true
required:
- name
- description
- server
FunctionObject:
type: object
properties:
description:
type: string
description:
A description of what the function does, used by the model to
choose when and how to call the function.
name:
type: string
description:
The name of the function to be called. Must be a-z, A-Z, 0-9, or
contain underscores and dashes, with a maximum length of 64.
parameters:
$ref: '#/components/schemas/FunctionParameters'
strict:
type: boolean
default: false
description:
Whether to enable strict schema adherence when generating the
function call. If set to true, the model will follow the exact
schema defined in the `parameters` field. Only a subset of JSON
Schema is supported when `strict` is `true`. Learn more about
Structured Outputs in the [function calling
guide](docs/guides/function-calling).
required:
- name
ChatCompletionTool:
type: object
properties:
type:
$ref: '#/components/schemas/ChatCompletionToolType'
function:
$ref: '#/components/schemas/FunctionObject'
required:
- type
- function
FunctionParameters:
type: object
description: >-
The parameters the function accepts, described as a JSON Schema object.
See the [guide](/docs/guides/function-calling) for examples, and the
[JSON Schema
reference](https://json-schema.org/understanding-json-schema/) for
documentation about the format.
Omitting `parameters` defines a function with an empty parameter list.
additionalProperties: true
ChatCompletionToolType:
type: string
description: The type of the tool. Currently, only `function` is supported.
enum:
- function
CompletionUsage:
type: object
description: Usage statistics for the completion request.
properties:
completion_tokens:
type: integer
default: 0
format: int64
description: Number of tokens in the generated completion.
prompt_tokens:
type: integer
default: 0
format: int64
description: Number of tokens in the prompt.
total_tokens:
type: integer
default: 0
format: int64
description: Total number of tokens used in the request (prompt + completion).
required:
- prompt_tokens
- completion_tokens
- total_tokens
ChatCompletionStreamOptions:
description: >
Options for streaming response. Only set this when you set `stream:
true`.
type: object
properties:
include_usage:
type: boolean
description: >
If set, an additional chunk will be streamed before the `data:
[DONE]` message. The `usage` field on this chunk shows the token
usage statistics for the entire request, and the `choices` field
will always be an empty array. All other chunks will also include a
`usage` field, but with a null value.
required:
- include_usage
CreateChatCompletionRequest:
type: object
properties:
model:
type: string
description: Model ID to use
messages:
description: >
A list of messages comprising the conversation so far.
type: array
minItems: 1
items:
$ref: '#/components/schemas/Message'
max_tokens:
description: >
An upper bound for the number of tokens that can be generated
for a completion, including visible output tokens and reasoning tokens.
type: integer
stream:
description: >
If set to true, the model response data will be streamed to the
client as it is generated using [server-sent
events](https://developer.mozilla.org/en-US/docs/Web/API/Server-sent_events/Using_server-sent_events#Event_stream_format).
type: boolean
default: false
stream_options:
$ref: '#/components/schemas/ChatCompletionStreamOptions'
tools:
type: array
description: >
A list of tools the model may call. Currently, only functions
are supported as a tool. Use this to provide a list of functions
the model may generate JSON inputs for. A max of 128 functions
are supported.
items:
$ref: '#/components/schemas/ChatCompletionTool'
reasoning_format:
type: string
description: >
The format of the reasoning content. Can be `raw` or `parsed`.
When specified as raw some reasoning models will output <think /> tags.
When specified as parsed the model will output the reasoning under
`reasoning` or `reasoning_content` attribute.
required:
- model
- messages
ChatCompletionMessageToolCallFunction:
type: object
description: The function that the model called.
properties:
name:
type: string
description: The name of the function to call.
arguments:
type: string
description:
The arguments to call the function with, as generated by the model
in JSON format. Note that the model does not always generate
valid JSON, and may hallucinate parameters not defined by your
function schema. Validate the arguments in your code before
calling your function.
required:
- name
- arguments
ChatCompletionMessageToolCall:
type: object
properties:
id:
type: string
description: The ID of the tool call.
type:
$ref: '#/components/schemas/ChatCompletionToolType'
function:
$ref: '#/components/schemas/ChatCompletionMessageToolCallFunction'
extra_content:
$ref: '#/components/schemas/ToolCallExtraContent'
required:
- id
- type
- function
ChatCompletionChoice:
type: object
properties:
finish_reason:
$ref: '#/components/schemas/FinishReason'
index:
type: integer
description: The index of the choice in the list of choices.
message:
$ref: '#/components/schemas/Message'
logprobs:
description: Log probability information for the choice.
type: ['object', 'null']
properties:
content:
description: A list of message content tokens with log probability information.
type: array
items:
$ref: '#/components/schemas/ChatCompletionTokenLogprob'
refusal:
description: A list of message refusal tokens with log probability information.
type: array
items:
$ref: '#/components/schemas/ChatCompletionTokenLogprob'
required:
- content
- refusal
required:
- finish_reason
- index
- message
ChatCompletionStreamChoice:
type: object
required:
- delta
- finish_reason
- index
properties:
delta:
$ref: '#/components/schemas/ChatCompletionStreamResponseDelta'
logprobs:
description: Log probability information for the choice.
type: object
properties:
content:
description: A list of message content tokens with log probability information.
type: array
items:
$ref: '#/components/schemas/ChatCompletionTokenLogprob'
refusal:
description: A list of message refusal tokens with log probability information.
type: array
items:
$ref: '#/components/schemas/ChatCompletionTokenLogprob'
required:
- content
- refusal
finish_reason:
$ref: '#/components/schemas/FinishReason'
index:
type: integer
description: The index of the choice in the list of choices.
CreateChatCompletionResponse:
type: object
description:
Represents a chat completion response returned by model, based on
the provided input.
properties:
id:
type: string
description: A unique identifier for the chat completion.
choices:
type: array
description:
A list of chat completion choices. Can be more than one if `n` is
greater than 1.
items:
$ref: '#/components/schemas/ChatCompletionChoice'
created:
type: integer
description:
The Unix timestamp (in seconds) of when the chat completion was
created.
model:
type: string
description: The model used for the chat completion.
object:
type: string
description: The object type, which is always `chat.completion`.
x-stainless-const: true
usage:
$ref: '#/components/schemas/CompletionUsage'
required:
- choices
- created
- id
- model
- object
ChatCompletionStreamResponseDelta:
type: object
description: A chat completion delta generated by streamed model responses.
properties:
content:
type: string
description: The contents of the chunk message.
reasoning_content:
type: string
description: The reasoning content of the chunk message.
reasoning:
type: string
description: The reasoning of the chunk message. Same as reasoning_content.
tool_calls:
type: array
items:
$ref: '#/components/schemas/ChatCompletionMessageToolCallChunk'
role:
$ref: '#/components/schemas/MessageRole'
refusal:
type: string
description: The refusal message generated by the model.
required:
- content
- role
ChatCompletionMessageToolCallChunk:
type: object
properties:
index:
type: integer
id:
type: string
description: The ID of the tool call.
type:
type: string
description: The type of the tool. Currently, only `function` is supported.
function:
$ref: '#/components/schemas/ChatCompletionMessageToolCallFunction'
extra_content:
$ref: '#/components/schemas/ToolCallExtraContent'
required:
- index
ToolCallExtraContent:
type: object
description: |
Provider-specific opaque data attached to a tool call. The contents are
not interpreted by the gateway, but must be echoed back verbatim on the
next request that references this tool call. Currently used by Google
Gemini extended-thinking models to carry the per-call `thought_signature`.
Other providers may ignore the field.
properties:
google:
type: object
description: Google Gemini-specific extra content.
properties:
thought_signature:
type: string
description: |
Opaque signature returned with reasoning-enabled tool calls.
Must be echoed back verbatim in the next request that includes
this tool call, or Google will reject the request.
additionalProperties: true
ChatCompletionTokenLogprob:
type: object
properties:
token: &a1
description: The token.
type: string
logprob: &a2
description:
The log probability of this token, if it is within the top 20 most
likely tokens. Otherwise, the value `-9999.0` is used to signify
that the token is very unlikely.
type: number
bytes: &a3
description:
A list of integers representing the UTF-8 bytes representation of
the token. Useful in instances where characters are represented by
multiple tokens and their byte representations must be combined to
generate the correct text representation. Can be `null` if there is
no bytes representation for the token.
type: array
items:
type: integer
top_logprobs:
description:
List of the most likely tokens and their log probability, at this
token position. In rare cases, there may be fewer than the number of
requested `top_logprobs` returned.
type: array
items:
type: object
properties:
token: *a1
logprob: *a2
bytes: *a3
required:
- token
- logprob
- bytes
required:
- token
- logprob
- bytes
- top_logprobs
FinishReason:
type: string
description: >
The reason the model stopped generating tokens. This will be
`stop` if the model hit a natural stop point or a provided
stop sequence,
`length` if the maximum number of tokens specified in the
request was reached,
`content_filter` if content was omitted due to a flag from our
content filters,
`tool_calls` if the model called a tool.
enum:
- stop
- length
- tool_calls
- content_filter
- function_call
CreateChatCompletionStreamResponse:
type: object
description: |
Represents a streamed chunk of a chat completion response returned
by the model, based on the provided input.
properties:
id:
type: string
description:
A unique identifier for the chat completion. Each chunk has the
same ID.
choices:
type: array
description: >
A list of chat completion choices. Can contain more than one
element if `n` is greater than 1. Can also be empty for the
last chunk if you set `stream_options: {"include_usage": true}`.
items:
$ref: '#/components/schemas/ChatCompletionStreamChoice'
created:
type: integer
description:
The Unix timestamp (in seconds) of when the chat completion was
created. Each chunk has the same timestamp.
model:
type: string
description: The model to generate the completion.
system_fingerprint:
type: string
description: >
This fingerprint represents the backend configuration that the model
runs with.
Can be used in conjunction with the `seed` request parameter to
understand when backend changes have been made that might impact
determinism.
object:
type: string
description: The object type, which is always `chat.completion.chunk`.
usage:
$ref: '#/components/schemas/CompletionUsage'
reasoning_format:
type: string
description: >
The format of the reasoning content. Can be `raw` or `parsed`.
When specified as raw some reasoning models will output <think /> tags.
When specified as parsed the model will output the reasoning under reasoning_content.
required:
- choices
- created
- id
- model
- object
Config:
x-config:
sections:
- general:
title: 'General settings'
settings:
- name: environment
env: 'ENVIRONMENT'
type: string
default: 'production'
description: 'The environment'
- name: allowed_models
env: 'ALLOWED_MODELS'
type: string
default: ''
description: 'Comma-separated list of models to allow. If empty, all models will be available'
- name: disallowed_models
env: 'DISALLOWED_MODELS'
type: string
default: ''
description: 'Comma-separated list of models to disallow. If empty, no models will be blocked. Takes lower precedence than ALLOWED_MODELS'
- name: enable_vision
env: 'ENABLE_VISION'
type: bool
default: 'false'
description: 'Enable vision/multimodal support for all providers. When disabled, image inputs will be rejected even if the provider and model support vision'
- name: debug_content_truncate_words
env: 'DEBUG_CONTENT_TRUNCATE_WORDS'
type: int
default: '10'
description: 'Number of words to truncate per content section in debug logs (development mode only)'
- name: debug_max_messages
env: 'DEBUG_MAX_MESSAGES'
type: int
default: '100'
description: 'Maximum number of messages to show in debug logs (development mode only)'
- telemetry:
title: 'Telemetry'
settings:
- name: telemetry_enable
env: 'TELEMETRY_ENABLE'
type: bool
default: 'false'
description: 'Enable telemetry'
- name: telemetry_metrics_port
env: 'TELEMETRY_METRICS_PORT'
type: string
default: '9464'
description: 'Port for telemetry metrics server'
          # Model Context Protocol client: server list, HTTP client timeouts,
          # connection retry/backoff, and health-check polling.
          - mcp:
              title: 'Model Context Protocol (MCP)'
              settings:
                - name: mcp_enable
                  env: 'MCP_ENABLE'
                  type: bool
                  default: 'false'
                  description: 'Enable MCP'
                - name: mcp_expose
                  env: 'MCP_EXPOSE'
                  type: bool
                  default: 'false'
                  description: 'Expose MCP tools endpoint'
                # No default: empty unless configured. NOTE(review): the list
                # format (comma-separated? URLs?) is not specified here —
                # confirm against the consumer.
                - name: mcp_servers
                  env: 'MCP_SERVERS'
                  type: string
                  description: 'List of MCP servers'
                # Per-phase HTTP client timeouts (Go time.Duration strings).
                - name: mcp_client_timeout
                  env: 'MCP_CLIENT_TIMEOUT'
                  type: time.Duration
                  default: '5s'
                  description: 'MCP client HTTP timeout'
                - name: mcp_dial_timeout
                  env: 'MCP_DIAL_TIMEOUT'
                  type: time.Duration
                  default: '3s'
                  description: 'MCP client dial timeout'
                - name: mcp_tls_handshake_timeout
                  env: 'MCP_TLS_HANDSHAKE_TIMEOUT'
                  type: time.Duration
                  default: '3s'
                  description: 'MCP client TLS handshake timeout'
                - name: mcp_response_header_timeout
                  env: 'MCP_RESPONSE_HEADER_TIMEOUT'
                  type: time.Duration
                  default: '3s'
                  description: 'MCP client response header timeout'
                - name: mcp_expect_continue_timeout
                  env: 'MCP_EXPECT_CONTINUE_TIMEOUT'
                  type: time.Duration
                  default: '1s'
                  description: 'MCP client expect continue timeout'
                - name: mcp_request_timeout
                  env: 'MCP_REQUEST_TIMEOUT'
                  type: time.Duration
                  default: '5s'
                  description: 'MCP client request timeout for initialize and tool calls'
                # Connection retry with exponential backoff, plus automatic
                # reconnection for servers that drop after startup.
                - name: mcp_max_retries
                  env: 'MCP_MAX_RETRIES'
                  type: int
                  default: '3'
                  description: 'Maximum number of connection retry attempts'
                - name: mcp_retry_interval
                  env: 'MCP_RETRY_INTERVAL'
                  type: time.Duration
                  default: '5s'
                  description: 'Interval between connection retry attempts'
                - name: mcp_initial_backoff
                  env: 'MCP_INITIAL_BACKOFF'
                  type: time.Duration
                  default: '1s'
                  description: 'Initial backoff duration for exponential backoff retry'
                - name: mcp_enable_reconnect
                  env: 'MCP_ENABLE_RECONNECT'
                  type: bool
                  default: 'true'
                  description: 'Enable automatic reconnection for failed servers'
                - name: mcp_reconnect_interval
                  env: 'MCP_RECONNECT_INTERVAL'
                  type: time.Duration
                  default: '30s'
                  description: 'Interval between reconnection attempts'
                # Periodic health-check polling of configured servers.
                - name: mcp_polling_enable
                  env: 'MCP_POLLING_ENABLE'
                  type: bool
                  default: 'true'
                  description: 'Enable health check polling'
                - name: mcp_polling_interval
                  env: 'MCP_POLLING_INTERVAL'
                  type: time.Duration
                  default: '30s'
                  description: 'Interval between health check polling requests'
                - name: mcp_polling_timeout
                  env: 'MCP_POLLING_TIMEOUT'
                  type: time.Duration
                  default: '5s'
                  description: 'Timeout for individual health check requests'
                - name: mcp_disable_healthcheck_logs
                  env: 'MCP_DISABLE_HEALTHCHECK_LOGS'
                  type: bool
                  default: 'true'
                  description: 'Disable health check log messages to reduce noise'
          # OIDC-based authentication; off by default. The default issuer
          # points at a local Keycloak realm (development convenience).
          - auth:
              title: 'Authentication'
              settings:
                - name: auth_enable
                  env: 'AUTH_ENABLE'
                  type: bool
                  default: 'false'
                  description: 'Enable authentication'
                - name: auth_oidc_issuer
                  env: 'AUTH_OIDC_ISSUER'
                  type: string
                  default: 'http://keycloak:8080/realms/inference-gateway-realm'
                  description: 'OIDC issuer URL'
                # NOTE(review): marked secret although OIDC client IDs are
                # usually public identifiers — confirm this is intentional.
                - name: auth_oidc_client_id
                  env: 'AUTH_OIDC_CLIENT_ID'
                  type: string
                  default: 'inference-gateway-client'
                  description: 'OIDC client ID'
                  secret: true
                # No default — must be supplied by the operator when enabled.
                - name: auth_oidc_client_secret
                  env: 'AUTH_OIDC_CLIENT_SECRET'
                  type: string
                  description: 'OIDC client secret'
                  secret: true
          # HTTP listener: bind address, timeouts, and optional TLS.
          - server:
              title: 'Server settings'
              settings:
                - name: host
                  env: 'SERVER_HOST'
                  type: string
                  default: '0.0.0.0'
                  description: 'Server host'
                - name: port
                  env: 'SERVER_PORT'
                  type: string
                  default: '8080'
                  description: 'Server port'
                - name: read_timeout
                  env: 'SERVER_READ_TIMEOUT'
                  type: time.Duration
                  default: '30s'
                  description: 'Read timeout'
                - name: write_timeout
                  env: 'SERVER_WRITE_TIMEOUT'
                  type: time.Duration
                  default: '30s'
                  description: 'Write timeout'
                - name: idle_timeout
                  env: 'SERVER_IDLE_TIMEOUT'
                  type: time.Duration
                  default: '120s'
                  description: 'Idle timeout'
                # TLS is opt-in: both paths have no default (plain HTTP when
                # left unset).
                - name: tls_cert_path
                  env: 'SERVER_TLS_CERT_PATH'
                  type: string
                  description: 'TLS certificate path'
                - name: tls_key_path
                  env: 'SERVER_TLS_KEY_PATH'
                  type: string
                  description: 'TLS key path'
          # Outbound HTTP client used for upstream provider calls: connection
          # pooling, TLS floor, and streaming-friendly defaults.
          - client:
              title: 'Client settings'
              settings:
                - name: timeout
                  env: 'CLIENT_TIMEOUT'
                  type: time.Duration
                  default: '30s'
                  description: 'Client timeout'
                - name: max_idle_conns
                  env: 'CLIENT_MAX_IDLE_CONNS'
                  type: int
                  default: '20'
                  description: 'Maximum idle connections'
                - name: max_idle_conns_per_host
                  env: 'CLIENT_MAX_IDLE_CONNS_PER_HOST'
                  type: int
                  default: '20'
                  description: 'Maximum idle connections per host'
                - name: idle_conn_timeout
                  env: 'CLIENT_IDLE_CONN_TIMEOUT'
                  type: time.Duration
                  default: '30s'
                  description: 'Idle connection timeout'
                - name: tls_min_version
                  env: 'CLIENT_TLS_MIN_VERSION'
                  type: string
                  default: 'TLS12'
                  description: 'Minimum TLS version'
                # Defaults to true so streamed responses are not buffered by
                # transparent gzip decompression (per the description).
                - name: disable_compression
                  env: 'CLIENT_DISABLE_COMPRESSION'
                  type: bool
                  default: 'true'
                  description: 'Disable compression for faster streaming'
                - name: response_header_timeout
                  env: 'CLIENT_RESPONSE_HEADER_TIMEOUT'
                  type: time.Duration
                  default: '10s'
                  description: 'Response header timeout'
                - name: expect_continue_timeout
                  env: 'CLIENT_EXPECT_CONTINUE_TIMEOUT'
                  type: time.Duration
                  default: '1s'
                  description: 'Expect continue timeout'
          # Per-provider upstream base URLs (overridable) and API keys
          # (always secret, no defaults).
          - providers:
              title: 'Providers'
              settings:
                - name: anthropic_api_url
                  env: 'ANTHROPIC_API_URL'
                  type: string
                  default: 'https://api.anthropic.com/v1'
                  description: 'Anthropic API URL'
                - name: anthropic_api_key
                  env: 'ANTHROPIC_API_KEY'
                  type: string
                  description: 'Anthropic API Key'
                  secret: true
                # {ACCOUNT_ID} is a placeholder the operator (or the gateway)
                # must substitute — it is not a literal URL segment.
                - name: cloudflare_api_url
                  env: 'CLOUDFLARE_API_URL'
                  type: string
                  default: 'https://api.cloudflare.com/client/v4/accounts/{ACCOUNT_ID}/ai'
                  description: 'Cloudflare API URL'
                - name: cloudflare_api_key
                  env: 'CLOUDFLARE_API_KEY'
                  type: string
                  description: 'Cloudflare API Key'
                  secret: true
                # NOTE(review): unlike most entries, this default carries no
                # version path (/v1) — confirm the version segment is appended
                # in code, or this is the intended base.
                - name: cohere_api_url
                  env: 'COHERE_API_URL'
                  type: string
                  default: 'https://api.cohere.ai'
                  description: 'Cohere API URL'
                - name: cohere_api_key
                  env: 'COHERE_API_KEY'
                  type: string
                  description: 'Cohere API Key'
                  secret: true
                - name: groq_api_url
                  env: 'GROQ_API_URL'
                  type: string
                  default: 'https://api.groq.com/openai/v1'
                  description: 'Groq API URL'
                - name: groq_api_key
                  env: 'GROQ_API_KEY'
                  type: string
                  description: 'Groq API Key'
                  secret: true
                # NOTE(review): Ollama's stock port is 11434; this default
                # assumes a container/service named "ollama" exposed on 8080 —
                # confirm it matches the reference deployment.
                - name: ollama_api_url
                  env: 'OLLAMA_API_URL'
                  type: string
                  default: 'http://ollama:8080/v1'
                  description: 'Ollama API URL'
                - name: ollama_api_key
                  env: 'OLLAMA_API_KEY'
                  type: string
                  description: 'Ollama API Key'
                  secret: true
                - name: ollama_cloud_api_url
                  env: 'OLLAMA_CLOUD_API_URL'
                  type: string
                  default: 'https://ollama.com/v1'
                  description: 'Ollama Cloud API URL'
                - name: ollama_cloud_api_key
                  env: 'OLLAMA_CLOUD_API_KEY'
                  type: string
                  description: 'Ollama Cloud API Key'
                  secret: true
                - name: openai_api_url
                  env: 'OPENAI_API_URL'
                  type: string
                  default: 'https://api.openai.com/v1'
                  description: 'OpenAI API URL'
                - name: openai_api_key
                  env: 'OPENAI_API_KEY'
                  type: string
                  description: 'OpenAI API Key'
                  secret: true
                - name: deepseek_api_url
                  env: 'DEEPSEEK_API_URL'
                  type: string
                  default: 'https://api.deepseek.com'
                  description: 'DeepSeek API URL'
                - name: deepseek_api_key
                  env: 'DEEPSEEK_API_KEY'
                  type: string
                  description: 'DeepSeek API Key'
                  secret: true
                - name: google_api_url
                  env: 'GOOGLE_API_URL'
                  type: string
                  default: 'https://generativelanguage.googleapis.com/v1beta/openai'
                  description: 'Google API URL'
                - name: google_api_key
                  env: 'GOOGLE_API_KEY'
                  type: string
                  description: 'Google API Key'
                  secret: true
                - name: mistral_api_url
                  env: 'MISTRAL_API_URL'
                  type: string
                  default: 'https://api.mistral.ai/v1'
                  description: 'Mistral API URL'
                - name: mistral_api_key
                  env: 'MISTRAL_API_KEY'
                  type: string
                  description: 'Mistral API Key'
                  secret: true
                - name: moonshot_api_url
                  env: 'MOONSHOT_API_URL'
                  type: string
                  default: 'https://api.moonshot.ai/v1'
                  description: 'Moonshot API URL'
                - name: moonshot_api_key
                  env: 'MOONSHOT_API_KEY'
                  type: string
                  description: 'Moonshot API Key'
                  secret: true