1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
//! Structures related to audio transcription and translation.
/// Define a private namespace for all its items.
mod private
{
// Use full paths from crate root for components
use crate::components::common::LogProbProperties;
// Serde imports
use serde::{ Serialize, Deserialize };
/// One timed slice of transcribed audio, together with the decoding statistics reported for it.
///
/// # Used By
/// - `CreateTranscriptionResponseVerboseJson`
/// - `CreateTranslationResponseVerboseJson`
#[ derive( Clone, Debug, PartialEq, Serialize, Deserialize ) ]
pub struct TranscriptionSegment
{
  /// Unique identifier of this segment.
  pub id : i32,
  /// Seek offset of this segment.
  pub seek : i32,
  /// Time at which the segment begins, in seconds.
  pub start : f64,
  /// Time at which the segment ends, in seconds.
  pub end : f64,
  /// Text content of the segment.
  pub text : String,
  /// Token IDs that make up the text content.
  pub tokens : Vec< i32 >,
  /// Temperature parameter that was used while generating the segment.
  pub temperature : f64,
  /// Average log probability of the segment. A value lower than -1 means the logprobs should be considered failed.
  pub avg_logprob : f64,
  /// Compression ratio of the segment. A value greater than 2.4 means the compression should be considered failed.
  pub compression_ratio : f64,
  /// Probability that the segment contains no speech. The upstream description says to treat the segment as
  /// silent when this value is higher than 1.0 and `avg_logprob` is below -1.
  /// NOTE(review): a probability cannot exceed 1.0, so that threshold looks like an upstream typo
  /// (Whisper's default `no_speech_threshold` is 0.6) — confirm before relying on it.
  pub no_speech_prob : f64,
}
/// A single word of the transcript paired with the time span in which it was spoken.
///
/// # Used By
/// - `CreateTranscriptionResponseVerboseJson`
#[ derive( Clone, Debug, PartialEq, Serialize, Deserialize ) ]
pub struct TranscriptionWord
{
  /// Text of the word.
  pub word : String,
  /// Time at which the word begins, in seconds.
  pub start : f64,
  /// Time at which the word ends, in seconds.
  pub end : f64,
}
/// Minimal transcription response: the transcribed text plus optional token log probabilities.
///
/// # Used By
/// - `/audio/transcriptions` (POST response, when `response_format` is `json` or `text`)
#[ derive( Clone, Debug, PartialEq, Serialize, Deserialize ) ]
pub struct CreateTranscriptionResponseJson
{
  /// Text produced by the transcription.
  pub text : String,
  /// Token log probabilities, present only if they were requested.
  /// The field is left out of the serialized output when it is `None`.
  #[ serde( skip_serializing_if = "Option::is_none" ) ]
  pub logprobs : Option< Vec< LogProbProperties > >,
}
/// Detailed transcription response carrying language, duration and optional word/segment timings.
///
/// # Used By
/// - `/audio/transcriptions` (POST response, when `response_format` is `verbose_json`)
#[ derive( Clone, Debug, PartialEq, Serialize, Deserialize ) ]
pub struct CreateTranscriptionResponseVerboseJson
{
  /// Language of the input audio.
  pub language : String,
  /// Length of the input audio, in seconds.
  pub duration : f64,
  /// Text produced by the transcription.
  pub text : String,
  /// Individual words with their timestamps (if requested).
  /// Left out of the serialized output when `None`.
  #[ serde( skip_serializing_if = "Option::is_none" ) ]
  pub words : Option< Vec< TranscriptionWord > >,
  /// Segments of the transcript with their details (if requested or by default in verbose).
  /// Left out of the serialized output when `None`.
  #[ serde( skip_serializing_if = "Option::is_none" ) ]
  pub segments : Option< Vec< TranscriptionSegment > >,
}
/// Minimal translation response holding only the translated text.
///
/// # Used By
/// - `/audio/translations` (POST response, when `response_format` is `json` or `text`)
#[ derive( Clone, Debug, PartialEq, Serialize, Deserialize ) ]
pub struct CreateTranslationResponseJson
{
  /// Translated text (always English).
  pub text : String,
}
/// Detailed translation response carrying language, duration and optional segments.
///
/// # Used By
/// - `/audio/translations` (POST response, when `response_format` is `verbose_json`)
#[ derive( Clone, Debug, PartialEq, Serialize, Deserialize ) ]
pub struct CreateTranslationResponseVerboseJson
{
  /// Language of the output translation (always `english`).
  pub language : String,
  /// Length of the input audio, in seconds.
  pub duration : f64,
  /// Text produced by the translation.
  pub text : String,
  /// Segments of the translated text with their details.
  /// Left out of the serialized output when `None`.
  #[ serde( skip_serializing_if = "Option::is_none" ) ]
  pub segments : Option< Vec< TranscriptionSegment > >,
}
/// Incremental event of a streamed transcription: carries the newly transcribed chunk of text.
///
/// # Used By
/// - `CreateTranscriptionResponseStreamEvent`
#[ derive( Clone, Debug, PartialEq, Serialize, Deserialize ) ]
pub struct TranscriptTextDeltaEvent
{
  /// Event type, always `transcript.text.delta`.
  pub r#type : String,
  /// Text that was additionally transcribed since the previous event.
  pub delta : String,
  /// Log probabilities for this delta, if requested.
  /// Left out of the serialized output when `None`.
  #[ serde( skip_serializing_if = "Option::is_none" ) ]
  pub logprobs : Option< Vec< LogProbProperties > >,
}
/// Closing event of a streamed transcription: carries the complete transcribed text.
///
/// # Used By
/// - `CreateTranscriptionResponseStreamEvent`
#[ derive( Clone, Debug, PartialEq, Serialize, Deserialize ) ]
pub struct TranscriptTextDoneEvent
{
  /// Event type, always `transcript.text.done`.
  pub r#type : String,
  /// Full text of the transcription.
  pub text : String,
  /// Log probabilities for the full transcription, if requested.
  /// Left out of the serialized output when `None`.
  #[ serde( skip_serializing_if = "Option::is_none" ) ]
  pub logprobs : Option< Vec< LogProbProperties > >,
}
/// Any event that can arrive while a transcription response is being streamed.
///
/// The enum is `untagged`: serde tries the variants in declaration order and keeps the first one
/// whose fields match, so the variant is chosen by the presence of `delta` versus `text`; the
/// value of the `type` string itself is not checked here.
///
/// # Used By
/// - `/audio/transcriptions` (POST response, when `stream=true`)
#[ derive( Clone, Debug, PartialEq, Serialize, Deserialize ) ]
#[ serde( untagged ) ]
pub enum CreateTranscriptionResponseStreamEvent
{
  /// Incremental event holding one chunk of the transcription.
  Delta( TranscriptTextDeltaEvent ),
  /// Closing event holding the complete transcription.
  Done( TranscriptTextDoneEvent ),
}
/// Body of a text-to-speech request.
///
/// # Used By
/// - `/audio/speech` (POST request)
#[ derive( Clone, Debug, PartialEq, Serialize, Deserialize ) ]
pub struct CreateSpeechRequest
{
  /// TTS model used to generate the speech.
  pub model : String,
  /// Text to turn into audio. Max 4096 characters.
  pub input : String,
  /// Voice used when generating the audio.
  pub voice : SpeechVoice,
  /// Output audio format. Left out of the serialized request when `None`.
  #[ serde( skip_serializing_if = "Option::is_none" ) ]
  pub response_format : Option< SpeechResponseFormat >,
  /// Speed of the generated audio. Left out of the serialized request when `None`.
  #[ serde( skip_serializing_if = "Option::is_none" ) ]
  pub speed : Option< f64 >,
}
/// Available voices for speech generation.
///
/// Each variant is serialized as its lowercase name (for example `Alloy` becomes `"alloy"`).
/// `Eq` and `Hash` are derived in addition to `PartialEq` so a voice can be used as a map key
/// or set member. `Copy` is intentionally not derived, leaving room for a data-carrying
/// variant in the future.
#[ derive( Debug, Serialize, Deserialize, Clone, PartialEq, Eq, Hash ) ]
#[ serde( rename_all = "lowercase" ) ]
pub enum SpeechVoice
{
  /// Alloy voice
  Alloy,
  /// Echo voice
  Echo,
  /// Fable voice
  Fable,
  /// Onyx voice
  Onyx,
  /// Nova voice
  Nova,
  /// Shimmer voice
  Shimmer,
}
/// Available response formats for speech generation.
///
/// Each variant is serialized as its lowercase name (for example `Mp3` becomes `"mp3"`).
/// `Eq` and `Hash` are derived in addition to `PartialEq` so a format can be used as a map key
/// or set member.
#[ derive( Debug, Serialize, Deserialize, Clone, PartialEq, Eq, Hash ) ]
#[ serde( rename_all = "lowercase" ) ]
pub enum SpeechResponseFormat
{
  /// MP3 format
  Mp3,
  /// Opus format
  Opus,
  /// AAC format
  Aac,
  /// FLAC format
  Flac,
  /// WAV format
  Wav,
  /// PCM format
  Pcm,
}
/// Request parameters for transcribing audio to text.
///
/// This type carries the raw audio bytes and derives no serde traits. `PartialEq` is derived
/// so requests can be compared by value, matching the other types in this module.
///
/// # Used By
/// - `/audio/transcriptions` (POST request)
#[ derive( Debug, Clone, PartialEq ) ]
pub struct CreateTranscriptionRequest
{
  /// The audio file to transcribe, as raw bytes.
  pub file : Vec< u8 >,
  /// Filename for the audio file.
  pub filename : String,
  /// The model to use for transcription.
  pub model : String,
  /// The language of the input audio (ISO-639-1 format).
  pub language : Option< String >,
  /// An optional text to guide the model's style.
  pub prompt : Option< String >,
  /// The format of the transcript output.
  pub response_format : Option< TranscriptionResponseFormat >,
  /// The sampling temperature between 0 and 1.
  pub temperature : Option< f64 >,
  /// The timestamp granularities to request (word-level and/or segment-level).
  pub timestamp_granularities : Option< Vec< TimestampGranularity > >,
}
/// Request parameters for translating audio to English text.
///
/// This type carries the raw audio bytes and derives no serde traits. `PartialEq` is derived
/// so requests can be compared by value, matching the other types in this module.
///
/// # Used By
/// - `/audio/translations` (POST request)
#[ derive( Debug, Clone, PartialEq ) ]
pub struct CreateTranslationRequest
{
  /// The audio file to translate, as raw bytes.
  pub file : Vec< u8 >,
  /// Filename for the audio file.
  pub filename : String,
  /// The model to use for translation.
  pub model : String,
  /// An optional text to guide the model's style.
  pub prompt : Option< String >,
  /// The format of the transcript output.
  pub response_format : Option< TranscriptionResponseFormat >,
  /// The sampling temperature between 0 and 1.
  pub temperature : Option< f64 >,
}
/// Available response formats for transcription and translation.
///
/// Each variant is serialized in `snake_case` (for example `VerboseJson` becomes `"verbose_json"`).
/// `Eq` and `Hash` are derived in addition to `PartialEq` so a format can be used as a map key
/// or set member.
#[ derive( Debug, Serialize, Deserialize, Clone, PartialEq, Eq, Hash ) ]
#[ serde( rename_all = "snake_case" ) ]
pub enum TranscriptionResponseFormat
{
  /// JSON format
  Json,
  /// Plain text format
  Text,
  /// `SubRip` Subtitle format
  Srt,
  /// Verbose JSON with metadata
  VerboseJson,
  /// `WebVTT` format
  Vtt,
}
/// Timestamp granularity options.
///
/// Each variant is serialized as its lowercase name (`"word"` or `"segment"`).
/// `Eq` and `Hash` are derived in addition to `PartialEq` so a granularity can be used as a
/// map key or set member, for example to deduplicate a requested list.
#[ derive( Debug, Serialize, Deserialize, Clone, PartialEq, Eq, Hash ) ]
#[ serde( rename_all = "lowercase" ) ]
pub enum TimestampGranularity
{
  /// Word-level timestamps
  Word,
  /// Segment-level timestamps
  Segment,
}
} // end mod private
// Publish the audio types declared in `private` through the crate's `mod_interface!` macro.
// NOTE(review): `exposed use` presumably re-exports each listed item via this module's
// `exposed` namespace — confirm against the `mod_interface` documentation.
// Every public type defined in `private` above is listed here; a newly added type must be
// added to this list as well to be exported through the macro.
crate ::mod_interface!
{
exposed use
{
TranscriptionSegment,
TranscriptionWord,
CreateTranscriptionResponseJson,
CreateTranscriptionResponseVerboseJson,
CreateTranslationResponseJson,
CreateTranslationResponseVerboseJson,
TranscriptTextDeltaEvent,
TranscriptTextDoneEvent,
CreateTranscriptionResponseStreamEvent,
CreateSpeechRequest,
SpeechVoice,
SpeechResponseFormat,
CreateTranscriptionRequest,
CreateTranslationRequest,
TranscriptionResponseFormat,
TimestampGranularity,
};
}