1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
use std::collections::HashMap;
use serde::{Deserialize, Serialize};
use crate::types::{
realtime::{
MaxOutputTokens, RealtimeAudioFormats, RealtimeConversationItem, RealtimeTool,
RealtimeVoice, ToolChoice,
},
responses::Prompt,
};
#[derive(Debug, Serialize, Deserialize, Clone)]
pub struct RealtimeResponseUsage {
/// Details about the input tokens used in the Response. Cached tokens are tokens from previous
/// turns in the conversation that are included as context for the current response. Cached tokens
/// here are counted as a subset of input tokens, meaning input tokens will include cached and
/// uncached tokens.
#[serde(skip_serializing_if = "Option::is_none")]
pub input_token_details: Option<InputTokenDetails>,
/// The number of input tokens used in the Response, including text and audio tokens.
pub input_tokens: u32,
#[serde(skip_serializing_if = "Option::is_none")]
pub output_token_details: Option<OutputTokenDetails>,
/// The number of output tokens sent in the Response, including text and audio tokens.
pub output_tokens: u32,
/// The total number of tokens in the Response including input and output text and audio tokens.
pub total_tokens: u32,
}
#[derive(Debug, Serialize, Deserialize, Clone)]
pub struct InputTokenDetails {
/// The number of audio tokens used as input for the Response.
#[serde(skip_serializing_if = "Option::is_none")]
pub audio_tokens: Option<u32>,
/// The number of cached tokens used as input for the Response.
#[serde(skip_serializing_if = "Option::is_none")]
pub cached_tokens: Option<u32>,
/// Details about the cached tokens used as input for the Response.
pub cached_token_details: Option<CachedTokenDetails>,
/// The number of image tokens used as input for the Response.
#[serde(skip_serializing_if = "Option::is_none")]
pub image_tokens: Option<u32>,
/// The number of text tokens used as input for the Response.
#[serde(skip_serializing_if = "Option::is_none")]
pub text_tokens: Option<u32>,
}
#[derive(Debug, Serialize, Deserialize, Clone)]
pub struct CachedTokenDetails {
/// The number of cached audio tokens used as input for the Response.
#[serde(skip_serializing_if = "Option::is_none")]
pub audio_tokens: Option<u32>,
/// The number of cached image tokens used as input for the Response.
#[serde(skip_serializing_if = "Option::is_none")]
pub image_tokens: Option<u32>,
/// The number of cached text tokens used as input for the Response.
#[serde(skip_serializing_if = "Option::is_none")]
pub text_tokens: Option<u32>,
}
#[derive(Debug, Serialize, Deserialize, Clone)]
pub struct OutputTokenDetails {
#[serde(skip_serializing_if = "Option::is_none")]
pub text_tokens: Option<u32>,
#[serde(skip_serializing_if = "Option::is_none")]
pub audio_tokens: Option<u32>,
}
#[derive(Debug, Serialize, Deserialize, Clone)]
#[serde(rename_all = "snake_case")]
pub enum RealtimeResponseStatus {
InProgress,
Completed,
Cancelled,
Failed,
Incomplete,
}
#[derive(Debug, Serialize, Deserialize, Clone)]
pub struct Error {
pub code: String,
pub r#type: String,
}
#[derive(Debug, Serialize, Deserialize, Clone)]
#[serde(rename_all = "lowercase")]
pub enum RealtimeResponseStatusDetailType {
Completed,
Cancelled,
Incomplete,
Failed,
}
#[derive(Debug, Serialize, Deserialize, Clone)]
#[serde(rename_all = "snake_case")]
pub enum RealtimeResponseStatusDetailReason {
TurnDetected,
ClientCancelled,
MaxOutputTokens,
ContentFilter,
}
#[derive(Debug, Serialize, Deserialize, Clone)]
pub struct RealtimeResponseStatusDetail {
/// A description of the error that caused the response to fail, populated when the status is failed.
pub error: Option<Error>,
/// The reason the Response did not complete. For a `cancelled` Response, one of `turn_detected`
/// (the server VAD detected a new start of speech) or `client_cancelled` (the client sent a cancel
/// event). For an incomplete Response, one of `max_output_tokens` or `content_filter` (the
/// server-side safety filter activated and cut off the response).
pub reason: Option<RealtimeResponseStatusDetailReason>,
/// The type of error that caused the response to fail, corresponding with the `status`
/// field (`completed`, `cancelled`, `incomplete`, `failed`).
pub r#type: RealtimeResponseStatusDetailType,
}
#[derive(Debug, Serialize, Deserialize, Clone)]
pub struct ResponseAudioOutput {
/// The format of the output audio.
pub format: RealtimeAudioFormats,
/// The voice the model uses to respond. Supported built-in voices are `alloy`, `ash`, `ballad`,
/// `coral`, `echo`, `sage`, `shimmer`, `verse`, `marin`, and `cedar`. Voice cannot be changed during
/// the session once the model has responded with audio at least once. We recommend `marin` and `cedar`
/// for best quality.
pub voice: RealtimeVoice,
}
#[derive(Debug, Serialize, Deserialize, Clone)]
pub struct ResponseAudio {
/// Configuration for audio output.
pub output: ResponseAudioOutput,
}
#[derive(Debug, Serialize, Deserialize, Clone, Default)]
#[serde(rename_all = "lowercase")]
pub enum Conversation {
#[default]
Auto,
None,
}
/// The response resource.
#[derive(Debug, Serialize, Deserialize, Clone)]
pub struct RealtimeResponseCreateParams {
/// Configuration for audio input and output.
pub audio: ResponseAudio,
/// Controls which conversation the response is added to. Currently supports auto and none,
/// with auto as the default value. The auto value means that the contents of the response
/// will be added to the default conversation. Set this to none to create an out-of-band
/// response which will not add items to default conversation.
pub conversation: Conversation,
/// Input items to include in the prompt for the model. Using this field creates a new context
/// for this Response instead of using the default conversation. An empty array `[]` will clear
/// the context for this Response. Note that this can include references to items that
/// previously appeared in the session using their id.
pub input: Vec<RealtimeConversationItem>,
/// The default system instructions (i.e. system message) prepended to model calls.
/// This field allows the client to guide the model on desired responses.
/// The model can be instructed on response content and format, (e.g. "be extremely succinct",
/// "act friendly", "here are examples of good responses") and on audio behavior
/// (e.g. "talk quickly", "inject emotion into your voice", "laugh frequently").
/// The instructions are not guaranteed to be followed by the model, but they provide
/// guidance to the model on the desired behavior. Note that the server sets default
/// instructions which will be used if this field is not set and are visible in
/// the `session.created` event at the start of the session.
pub instructions: String,
/// Maximum number of output tokens for a single assistant response, inclusive of tool calls.
/// Provide an integer between 1 and 4096 to limit output tokens, or inf for the maximum
/// available tokens for a given model. Defaults to `inf`.
pub max_output_tokens: MaxOutputTokens,
/// Set of 16 key-value pairs that can be attached to an object. This can be useful for
/// storing additional information about the object in a structured format, and querying
/// for objects via API or the dashboard.
///
/// Keys are strings with a maximum length of 64 characters. Values are strings with a
/// maximum length of 512 characters.
#[serde(skip_serializing_if = "Option::is_none")]
pub metadata: Option<serde_json::Value>,
/// The set of modalities the model used to respond, currently the only possible values
/// are [\"audio\"], [\"text\"]. Audio output always include a text transcript.
/// Setting the output to mode `text` will disable audio output from the model.
pub output_modalities: Vec<String>,
/// Reference to a prompt template and its variables.
/// [Learn more](https://platform.openai.com/docs/guides/text?api-mode=responses#reusable-prompts).
#[serde(skip_serializing_if = "Option::is_none")]
pub prompt: Option<Prompt>,
/// How the model chooses tools. Provide one of the string modes or force a specific
/// function/MCP tool.
#[serde(skip_serializing_if = "Option::is_none")]
pub tool_choice: Option<ToolChoice>,
/// Tools available to the model.
#[serde(skip_serializing_if = "Option::is_none")]
pub tools: Option<Vec<RealtimeTool>>,
}
#[derive(Debug, Serialize, Deserialize, Clone)]
pub struct RealtimeResponse {
/// Configuration for audio output.
pub audio: Option<ResponseAudio>,
/// Which conversation the response is added to, determined by the `conversation` field in the
/// `response.create` event. If `auto`, the response will be added to the default conversation
/// and the value of `conversation_id` will be an id like `conv_1234`. If `none`, the response
/// will not be added to any conversation and the value of `conversation_id` will be `null`.
/// If responses are being triggered automatically by VAD the response will be added to the
/// default conversation.
#[serde(skip_serializing_if = "Option::is_none")]
pub conversation_id: Option<String>,
/// The unique ID of the response, will look like `resp_1234`.
pub id: String,
/// Maximum number of output tokens for a single assistant response, inclusive of tool calls,
/// that was used in this response.
pub max_output_tokens: MaxOutputTokens,
/// Set of 16 key-value pairs that can be attached to an object. This can be useful for
/// storing additional information about the object in a structured format, and querying
/// for objects via API or the dashboard.
///
/// Keys are strings with a maximum length of 64 characters. Values are strings with a
/// maximum length of 512 characters.
#[serde(skip_serializing_if = "Option::is_none")]
pub metadata: Option<HashMap<String, String>>,
/// The object type, must be "realtime.response".
pub object: String,
/// The list of output items generated by the response.
pub output: Vec<RealtimeConversationItem>,
/// The set of modalities the model used to respond, currently the only possible values
/// are [\"audio\"], [\"text\"]. Audio output always include a text transcript.
/// Setting the output to mode `text` will disable audio output from the model.
pub output_modalities: Vec<String>,
/// The final status of the response (`completed`, `cancelled`, `failed`, or `incomplete`, `in_progress`).
pub status: RealtimeResponseStatus,
/// Additional details about the status.
pub status_details: Option<RealtimeResponseStatusDetail>,
/// Usage statistics for the Response, this will correspond to billing. A Realtime API session
/// will maintain a conversation context and append new Items to the Conversation, thus output
/// from previous turns (text and audio tokens) will become the input for later turns.
pub usage: Option<RealtimeResponseUsage>,
}