async_openai/types/realtime/
response.rs

1use std::collections::HashMap;
2
3use serde::{Deserialize, Serialize};
4
5use crate::types::{
6    realtime::{
7        MaxOutputTokens, RealtimeAudioFormats, RealtimeConversationItem, RealtimeTool,
8        RealtimeVoice, ToolChoice,
9    },
10    responses::Prompt,
11};
12
/// Usage statistics for a realtime Response; these token counts correspond to billing.
#[derive(Debug, Serialize, Deserialize, Clone)]
pub struct RealtimeResponseUsage {
    /// Details about the input tokens used in the Response. Cached tokens are tokens from previous
    /// turns in the conversation that are included as context for the current response. Cached tokens
    /// here are counted as a subset of input tokens, meaning input tokens will include cached and
    /// uncached tokens.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub input_token_details: Option<InputTokenDetails>,

    /// The number of input tokens used in the Response, including text and audio tokens.
    pub input_tokens: u32,

    /// Breakdown of the output tokens used in the Response (text vs. audio).
    #[serde(skip_serializing_if = "Option::is_none")]
    pub output_token_details: Option<OutputTokenDetails>,

    /// The number of output tokens sent in the Response, including text and audio tokens.
    pub output_tokens: u32,

    /// The total number of tokens in the Response including input and output text and audio tokens.
    pub total_tokens: u32,
}
34
35#[derive(Debug, Serialize, Deserialize, Clone)]
36pub struct InputTokenDetails {
37    /// The number of audio tokens used as input for the Response.
38    #[serde(skip_serializing_if = "Option::is_none")]
39    pub audio_tokens: Option<u32>,
40    /// The number of cached tokens used as input for the Response.
41    #[serde(skip_serializing_if = "Option::is_none")]
42    pub cached_tokens: Option<u32>,
43
44    /// Details about the cached tokens used as input for the Response.
45    pub cached_token_details: Option<CachedTokenDetails>,
46
47    /// The number of image tokens used as input for the Response.
48    #[serde(skip_serializing_if = "Option::is_none")]
49    pub image_tokens: Option<u32>,
50
51    /// The number of text tokens used as input for the Response.
52    #[serde(skip_serializing_if = "Option::is_none")]
53    pub text_tokens: Option<u32>,
54}
55
/// Breakdown by modality of the cached tokens used as input for a Response.
#[derive(Debug, Serialize, Deserialize, Clone)]
pub struct CachedTokenDetails {
    /// The number of cached audio tokens used as input for the Response.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub audio_tokens: Option<u32>,

    /// The number of cached image tokens used as input for the Response.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub image_tokens: Option<u32>,

    /// The number of cached text tokens used as input for the Response.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub text_tokens: Option<u32>,
}
70
/// Breakdown by modality of the output tokens used in a Response.
#[derive(Debug, Serialize, Deserialize, Clone)]
pub struct OutputTokenDetails {
    /// The number of text tokens used in the Response output.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub text_tokens: Option<u32>,
    /// The number of audio tokens used in the Response output.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub audio_tokens: Option<u32>,
}
78
/// The lifecycle status of a realtime Response
/// (serialized as `in_progress`, `completed`, `cancelled`, `failed`, or `incomplete`).
#[derive(Debug, Serialize, Deserialize, Clone)]
#[serde(rename_all = "snake_case")]
pub enum RealtimeResponseStatus {
    InProgress,
    Completed,
    Cancelled,
    Failed,
    Incomplete,
}
88
/// Error information attached to a failed Response's status details.
#[derive(Debug, Serialize, Deserialize, Clone)]
pub struct Error {
    /// Error code identifying the failure.
    pub code: String,
    /// The type of the error.
    pub r#type: String,
}
94
95#[derive(Debug, Serialize, Deserialize, Clone)]
96#[serde(rename_all = "lowercase")]
97pub enum RealtimeResponseStatusDetailType {
98    Completed,
99    Cancelled,
100    Incomplete,
101    Failed,
102}
103
/// The reason a Response did not complete, reported in `RealtimeResponseStatusDetail`.
#[derive(Debug, Serialize, Deserialize, Clone)]
#[serde(rename_all = "snake_case")]
pub enum RealtimeResponseStatusDetailReason {
    /// The server VAD detected a new start of speech (applies to `cancelled` responses).
    TurnDetected,
    /// The client sent a cancel event (applies to `cancelled` responses).
    ClientCancelled,
    /// The output-token limit was reached (applies to `incomplete` responses).
    MaxOutputTokens,
    /// The server-side safety filter activated and cut off the response
    /// (applies to `incomplete` responses).
    ContentFilter,
}
112
/// Additional details about the final status of a realtime Response.
#[derive(Debug, Serialize, Deserialize, Clone)]
pub struct RealtimeResponseStatusDetail {
    /// A description of the error that caused the response to fail, populated when the status is failed.
    // NOTE(review): this `Option` has no `skip_serializing_if`, so it serializes as an
    // explicit `"error": null` when absent — confirm the API expects/tolerates that.
    pub error: Option<Error>,
    /// The reason the Response did not complete. For a `cancelled` Response, one of `turn_detected`
    /// (the server VAD detected a new start of speech) or `client_cancelled` (the client sent a cancel
    /// event). For an incomplete Response, one of `max_output_tokens` or `content_filter` (the
    ///  server-side safety filter activated and cut off the response).
    pub reason: Option<RealtimeResponseStatusDetailReason>,
    /// The type of error that caused the response to fail, corresponding with the `status`
    /// field (`completed`, `cancelled`, `incomplete`, `failed`).
    pub r#type: RealtimeResponseStatusDetailType,
}
126
/// Configuration for the audio a Response outputs: its wire format and the model voice.
#[derive(Debug, Serialize, Deserialize, Clone)]
pub struct ResponseAudioOutput {
    /// The format of the output audio.
    pub format: RealtimeAudioFormats,

    /// The voice the model uses to respond. Voice cannot be changed during the session once
    /// the model has responded with audio at least once. Current voice options are
    /// `alloy`, `ash`, `ballad`, `coral`, `echo`, `sage`, `shimmer`, `verse`, `marin`, and `cedar`.
    /// We recommend `marin` and `cedar` for best quality.
    pub voice: RealtimeVoice,
}
138
/// Audio configuration for a Response (currently output-side only).
#[derive(Debug, Serialize, Deserialize, Clone)]
pub struct ResponseAudio {
    /// Configuration for audio output.
    pub output: ResponseAudioOutput,
}
144
/// Controls which conversation a response is added to.
#[derive(Debug, Serialize, Deserialize, Clone, Default)]
#[serde(rename_all = "lowercase")]
pub enum Conversation {
    /// Add the contents of the response to the default conversation (the default behavior).
    #[default]
    Auto,
    /// Create an out-of-band response that does not add items to the default conversation.
    None,
}
152
/// Parameters for creating a realtime response (sent with the `response.create` event).
// NOTE(review): `metadata` is typed `serde_json::Value` here but `HashMap<String, String>`
// on `RealtimeResponse` below — confirm whether these should be unified.
#[derive(Debug, Serialize, Deserialize, Clone)]
pub struct RealtimeResponseCreateParams {
    /// Configuration for audio input and output.
    pub audio: ResponseAudio,

    /// Controls which conversation the response is added to. Currently supports auto and none,
    /// with auto as the default value. The auto value means that the contents of the response
    /// will be added to the default conversation. Set this to none to create an out-of-band
    /// response which will not add items to default conversation.
    pub conversation: Conversation,

    /// Input items to include in the prompt for the model. Using this field creates a new context
    /// for this Response instead of using the default conversation. An empty array `[]` will clear
    /// the context for this Response. Note that this can include references to items that
    /// previously appeared in the session using their id.
    pub input: Vec<RealtimeConversationItem>,

    /// The default system instructions (i.e. system message) prepended to model calls.
    /// This field allows the client to guide the model on desired responses.
    /// The model can be instructed on response content and format, (e.g. "be extremely succinct",
    /// "act friendly", "here are examples of good responses") and on audio behavior
    /// (e.g. "talk quickly", "inject emotion into your voice", "laugh frequently").
    /// The instructions are not guaranteed to be followed by the model, but they provide
    /// guidance to the model on the desired behavior. Note that the server sets default
    /// instructions which will be used if this field is not set and are visible in
    /// the `session.created` event at the start of the session.
    pub instructions: String,

    /// Maximum number of output tokens for a single assistant response, inclusive of tool calls.
    /// Provide an integer between 1 and 4096 to limit output tokens, or inf for the maximum
    /// available tokens for a given model. Defaults to `inf`.
    pub max_output_tokens: MaxOutputTokens,

    /// Set of 16 key-value pairs that can be attached to an object. This can be useful for
    /// storing additional information about the object in a structured format, and querying
    /// for objects via API or the dashboard.
    ///
    /// Keys are strings with a maximum length of 64 characters. Values are strings with a
    /// maximum length of 512 characters.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub metadata: Option<serde_json::Value>,

    /// The set of modalities the model used to respond, currently the only possible values
    /// are [\"audio\"], [\"text\"]. Audio output always include a text transcript.
    /// Setting the output to mode `text` will disable audio output from the model.
    pub output_modalities: Vec<String>,

    /// Reference to a prompt template and its variables.
    /// [Learn more](https://platform.openai.com/docs/guides/text?api-mode=responses#reusable-prompts).
    #[serde(skip_serializing_if = "Option::is_none")]
    pub prompt: Option<Prompt>,

    /// How the model chooses tools. Provide one of the string modes or force a specific
    /// function/MCP tool.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub tool_choice: Option<ToolChoice>,

    /// Tools available to the model.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub tools: Option<Vec<RealtimeTool>>,
}
215
/// The realtime response resource returned by the server (`object` is `"realtime.response"`).
#[derive(Debug, Serialize, Deserialize, Clone)]
pub struct RealtimeResponse {
    /// Configuration for audio output.
    pub audio: Option<ResponseAudio>,

    /// Which conversation the response is added to, determined by the `conversation` field in the
    /// `response.create` event. If `auto`, the response will be added to the default conversation
    /// and the value of `conversation_id` will be an id like `conv_1234`. If `none`, the response
    /// will not be added to any conversation and the value of `conversation_id` will be `null`.
    /// If responses are being triggered automatically by VAD the response will be added to the
    /// default conversation.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub conversation_id: Option<String>,

    /// The unique ID of the response, will look like `resp_1234`.
    pub id: String,

    /// Maximum number of output tokens for a single assistant response, inclusive of tool calls,
    /// that was used in this response.
    pub max_output_tokens: MaxOutputTokens,

    /// Set of 16 key-value pairs that can be attached to an object. This can be useful for
    /// storing additional information about the object in a structured format, and querying
    /// for objects via API or the dashboard.
    ///
    /// Keys are strings with a maximum length of 64 characters. Values are strings with a
    /// maximum length of 512 characters.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub metadata: Option<HashMap<String, String>>,

    /// The object type, must be "realtime.response".
    pub object: String,

    /// The list of output items generated by the response.
    pub output: Vec<RealtimeConversationItem>,

    /// The set of modalities the model used to respond, currently the only possible values
    /// are [\"audio\"], [\"text\"]. Audio output always include a text transcript.
    /// Setting the output to mode `text` will disable audio output from the model.
    pub output_modalities: Vec<String>,

    /// The final status of the response (`completed`, `cancelled`, `failed`, or `incomplete`, `in_progress`).
    pub status: RealtimeResponseStatus,

    /// Additional details about the status.
    pub status_details: Option<RealtimeResponseStatusDetail>,

    /// Usage statistics for the Response, this will correspond to billing. A Realtime API session
    /// will maintain a conversation context and append new Items to the Conversation, thus output
    /// from previous turns (text and audio tokens) will become the input for later turns.
    pub usage: Option<RealtimeResponseUsage>,
}
267}