async_openai/types/realtime/response.rs
use std::collections::HashMap;

use serde::{Deserialize, Serialize};

use crate::types::{
    realtime::{
        MaxOutputTokens, RealtimeAudioFormats, RealtimeConversationItem, RealtimeTool,
        RealtimeVoice, ToolChoice,
    },
    responses::Prompt,
};

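/// Token usage for a Response; cached tokens are counted inside `input_tokens`,
/// and `total_tokens` covers both input and output.
///
/// A minimal reading sketch (token counts are hypothetical; `serde_json` is
/// already used elsewhere in this module):
///
/// ```ignore
/// let usage: RealtimeResponseUsage = serde_json::from_str(
///     r#"{"input_tokens": 120, "output_tokens": 55, "total_tokens": 175}"#,
/// ).unwrap();
/// assert_eq!(usage.total_tokens, usage.input_tokens + usage.output_tokens);
/// ```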
#[derive(Debug, Serialize, Deserialize, Clone)]
pub struct RealtimeResponseUsage {
    /// Details about the input tokens used in the Response. Cached tokens are tokens from previous
    /// turns in the conversation that are included as context for the current response. Cached tokens
    /// here are counted as a subset of input tokens, meaning input tokens will include cached and
    /// uncached tokens.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub input_token_details: Option<InputTokenDetails>,

    /// The number of input tokens used in the Response, including text and audio tokens.
    pub input_tokens: u32,

    /// Details about the output tokens used in the Response.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub output_token_details: Option<OutputTokenDetails>,

    /// The number of output tokens sent in the Response, including text and audio tokens.
    pub output_tokens: u32,

    /// The total number of tokens in the Response, including input and output text and audio tokens.
    pub total_tokens: u32,
}

#[derive(Debug, Serialize, Deserialize, Clone)]
pub struct InputTokenDetails {
    /// The number of audio tokens used as input for the Response.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub audio_tokens: Option<u32>,

    /// The number of cached tokens used as input for the Response.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub cached_tokens: Option<u32>,

    /// Details about the cached tokens used as input for the Response.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub cached_token_details: Option<CachedTokenDetails>,

    /// The number of image tokens used as input for the Response.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub image_tokens: Option<u32>,

    /// The number of text tokens used as input for the Response.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub text_tokens: Option<u32>,
}

#[derive(Debug, Serialize, Deserialize, Clone)]
pub struct CachedTokenDetails {
    /// The number of cached audio tokens used as input for the Response.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub audio_tokens: Option<u32>,

    /// The number of cached image tokens used as input for the Response.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub image_tokens: Option<u32>,

    /// The number of cached text tokens used as input for the Response.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub text_tokens: Option<u32>,
}

#[derive(Debug, Serialize, Deserialize, Clone)]
pub struct OutputTokenDetails {
    /// The number of text tokens used in the Response output.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub text_tokens: Option<u32>,

    /// The number of audio tokens used in the Response output.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub audio_tokens: Option<u32>,
}

#[derive(Debug, Serialize, Deserialize, Clone)]
#[serde(rename_all = "snake_case")]
pub enum RealtimeResponseStatus {
    InProgress,
    Completed,
    Cancelled,
    Failed,
    Incomplete,
}

#[derive(Debug, Serialize, Deserialize, Clone)]
pub struct Error {
    /// The error code.
    pub code: String,
    /// The type of error.
    pub r#type: String,
}

#[derive(Debug, Serialize, Deserialize, Clone)]
#[serde(rename_all = "lowercase")]
pub enum RealtimeResponseStatusDetailType {
    Completed,
    Cancelled,
    Incomplete,
    Failed,
}

#[derive(Debug, Serialize, Deserialize, Clone)]
#[serde(rename_all = "snake_case")]
pub enum RealtimeResponseStatusDetailReason {
    TurnDetected,
    ClientCancelled,
    MaxOutputTokens,
    ContentFilter,
}

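/// Additional details accompanying a terminal response `status`.
///
/// A minimal sketch of mapping the reported reason to a message (the `describe`
/// helper is hypothetical, not part of this crate):
///
/// ```ignore
/// fn describe(detail: &RealtimeResponseStatusDetail) -> &'static str {
///     match detail.reason {
///         Some(RealtimeResponseStatusDetailReason::TurnDetected) => "cancelled: server VAD detected new speech",
///         Some(RealtimeResponseStatusDetailReason::ClientCancelled) => "cancelled: the client sent a cancel event",
///         Some(RealtimeResponseStatusDetailReason::MaxOutputTokens) => "incomplete: output token limit reached",
///         Some(RealtimeResponseStatusDetailReason::ContentFilter) => "incomplete: the safety filter cut off the response",
///         None => "no reason reported",
///     }
/// }
/// ```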
#[derive(Debug, Serialize, Deserialize, Clone)]
pub struct RealtimeResponseStatusDetail {
    /// A description of the error that caused the response to fail, populated when the status is `failed`.
    pub error: Option<Error>,

    /// The reason the Response did not complete. For a `cancelled` Response, one of `turn_detected`
    /// (the server VAD detected a new start of speech) or `client_cancelled` (the client sent a cancel
    /// event). For an `incomplete` Response, one of `max_output_tokens` or `content_filter` (the
    /// server-side safety filter activated and cut off the response).
    pub reason: Option<RealtimeResponseStatusDetailReason>,

    /// The type of error that caused the response to fail, corresponding with the `status`
    /// field (`completed`, `cancelled`, `incomplete`, `failed`).
    pub r#type: RealtimeResponseStatusDetailType,
}

#[derive(Debug, Serialize, Deserialize, Clone)]
pub struct ResponseAudioOutput {
    /// The format of the output audio.
    pub format: RealtimeAudioFormats,

    /// The voice the model uses to respond. Voice cannot be changed during the session once
    /// the model has responded with audio at least once. Current voice options are
    /// `alloy`, `ash`, `ballad`, `coral`, `echo`, `sage`, `shimmer`, `verse`, `marin`, and `cedar`.
    /// We recommend `marin` and `cedar` for best quality.
    pub voice: RealtimeVoice,
}

#[derive(Debug, Serialize, Deserialize, Clone)]
pub struct ResponseAudio {
    /// Configuration for audio output.
    pub output: ResponseAudioOutput,
}

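/// Controls which conversation a response is added to: `Auto` appends the output
/// items to the default conversation, while `None` creates an out-of-band response
/// that adds nothing to it.
///
/// A minimal serialization sketch (uses `serde_json`, already a dependency of this
/// module):
///
/// ```ignore
/// assert_eq!(serde_json::to_string(&Conversation::Auto).unwrap(), r#""auto""#);
/// assert_eq!(serde_json::to_string(&Conversation::None).unwrap(), r#""none""#);
/// ```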
#[derive(Debug, Serialize, Deserialize, Clone, Default)]
#[serde(rename_all = "lowercase")]
pub enum Conversation {
    #[default]
    Auto,
    None,
}

/// Parameters for creating a new Realtime response via the `response.create` event.
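///
/// A rough sketch (hypothetical, abbreviated values) of the JSON this struct
/// serializes to; `"conversation": "none"` requests an out-of-band response and
/// `"max_output_tokens": "inf"` leaves the token limit unbounded:
///
/// ```ignore
/// let body = serde_json::json!({
///     "conversation": "none",
///     "output_modalities": ["text"],
///     "instructions": "Be extremely succinct.",
///     "max_output_tokens": "inf",
///     "metadata": { "topic": "demo" }
/// });
/// ```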
#[derive(Debug, Serialize, Deserialize, Clone)]
pub struct RealtimeResponseCreateParams {
    /// Configuration for audio input and output.
    pub audio: ResponseAudio,

    /// Controls which conversation the response is added to. Currently supports `auto` and `none`,
    /// with `auto` as the default value. The `auto` value means that the contents of the response
    /// will be added to the default conversation. Set this to `none` to create an out-of-band
    /// response which will not add items to the default conversation.
    pub conversation: Conversation,

    /// Input items to include in the prompt for the model. Using this field creates a new context
    /// for this Response instead of using the default conversation. An empty array `[]` will clear
    /// the context for this Response. Note that this can include references to items that
    /// previously appeared in the session using their id.
    pub input: Vec<RealtimeConversationItem>,

    /// The default system instructions (i.e. system message) prepended to model calls.
    /// This field allows the client to guide the model on desired responses.
    /// The model can be instructed on response content and format (e.g. "be extremely succinct",
    /// "act friendly", "here are examples of good responses") and on audio behavior
    /// (e.g. "talk quickly", "inject emotion into your voice", "laugh frequently").
    /// The instructions are not guaranteed to be followed by the model, but they provide
    /// guidance to the model on the desired behavior. Note that the server sets default
    /// instructions which will be used if this field is not set and are visible in
    /// the `session.created` event at the start of the session.
    pub instructions: String,

    /// Maximum number of output tokens for a single assistant response, inclusive of tool calls.
    /// Provide an integer between 1 and 4096 to limit output tokens, or `inf` for the maximum
    /// available tokens for a given model. Defaults to `inf`.
    pub max_output_tokens: MaxOutputTokens,

    /// Set of 16 key-value pairs that can be attached to an object. This can be useful for
    /// storing additional information about the object in a structured format, and querying
    /// for objects via API or the dashboard.
    ///
    /// Keys are strings with a maximum length of 64 characters. Values are strings with a
    /// maximum length of 512 characters.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub metadata: Option<serde_json::Value>,

    /// The set of modalities the model used to respond; currently the only possible values
    /// are `["audio"]` and `["text"]`. Audio output always includes a text transcript.
    /// Setting the output to mode `text` will disable audio output from the model.
    pub output_modalities: Vec<String>,

    /// Reference to a prompt template and its variables.
    /// [Learn more](https://platform.openai.com/docs/guides/text?api-mode=responses#reusable-prompts).
    #[serde(skip_serializing_if = "Option::is_none")]
    pub prompt: Option<Prompt>,

    /// How the model chooses tools. Provide one of the string modes or force a specific
    /// function/MCP tool.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub tool_choice: Option<ToolChoice>,

    /// Tools available to the model.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub tools: Option<Vec<RealtimeTool>>,
}

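/// The response resource returned in server events such as `response.created` and
/// `response.done`.
///
/// A minimal handling sketch (the `on_response_done` handler is hypothetical and
/// event plumbing is omitted):
///
/// ```ignore
/// fn on_response_done(resp: &RealtimeResponse) {
///     if let Some(usage) = &resp.usage {
///         println!("response {} used {} tokens", resp.id, usage.total_tokens);
///     }
///     match resp.status {
///         RealtimeResponseStatus::Completed => { /* consume resp.output */ }
///         _ => { /* inspect resp.status_details for the reason */ }
///     }
/// }
/// ```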
#[derive(Debug, Serialize, Deserialize, Clone)]
pub struct RealtimeResponse {
    /// Configuration for audio output.
    pub audio: Option<ResponseAudio>,

    /// Which conversation the response is added to, determined by the `conversation` field in the
    /// `response.create` event. If `auto`, the response will be added to the default conversation
    /// and the value of `conversation_id` will be an id like `conv_1234`. If `none`, the response
    /// will not be added to any conversation and the value of `conversation_id` will be `null`.
    /// If responses are being triggered automatically by VAD, the response will be added to the
    /// default conversation.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub conversation_id: Option<String>,

    /// The unique ID of the response; it will look like `resp_1234`.
    pub id: String,

    /// Maximum number of output tokens for a single assistant response, inclusive of tool calls,
    /// that was used in this response.
    pub max_output_tokens: MaxOutputTokens,

    /// Set of 16 key-value pairs that can be attached to an object. This can be useful for
    /// storing additional information about the object in a structured format, and querying
    /// for objects via API or the dashboard.
    ///
    /// Keys are strings with a maximum length of 64 characters. Values are strings with a
    /// maximum length of 512 characters.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub metadata: Option<HashMap<String, String>>,

    /// The object type, which must be `realtime.response`.
    pub object: String,

    /// The list of output items generated by the response.
    pub output: Vec<RealtimeConversationItem>,

    /// The set of modalities the model used to respond; currently the only possible values
    /// are `["audio"]` and `["text"]`. Audio output always includes a text transcript.
    /// Setting the output to mode `text` will disable audio output from the model.
    pub output_modalities: Vec<String>,

    /// The final status of the response (`completed`, `cancelled`, `failed`, `incomplete`, or `in_progress`).
    pub status: RealtimeResponseStatus,

    /// Additional details about the status.
    pub status_details: Option<RealtimeResponseStatusDetail>,

    /// Usage statistics for the Response; this will correspond to billing. A Realtime API session
    /// will maintain a conversation context and append new Items to the Conversation, thus output
    /// from previous turns (text and audio tokens) will become the input for later turns.
    pub usage: Option<RealtimeResponseUsage>,
}