audio_transcription.rs

#![allow(clippy::uninlined_format_args)]
//! Audio Transcription (Speech-to-Text) example for the openai-ergonomic crate.
//!
//! This example demonstrates speech-to-text and translation functionality using `OpenAI`'s
//! Whisper models. It shows how to transcribe audio files into text with various options.
//!
//! ## Features Demonstrated
//!
//! - Basic speech-to-text transcription
//! - Audio translation to English
//! - Different response formats (json, text, srt, `verbose_json`, vtt)
//! - Timestamp extraction and segment information
//! - Language detection and specification
//! - Temperature control for transcription consistency
//! - Support for different audio input formats
//! - Model selection (whisper-1, gpt-4o-mini-transcribe, gpt-4o-transcribe)
//! - Advanced features like word timestamps and log probabilities
//!
//! ## Prerequisites
//!
//! Set your `OpenAI` API key:
//! ```bash
//! export OPENAI_API_KEY="your-key-here"
//! ```
//!
//! You'll also need audio files to transcribe. If none are available, this
//! example generates sample audio files via the text-to-speech API first.
//!
//! ## Usage
//!
//! ```bash
//! cargo run --example audio_transcription
//! ```
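//!
//! ## Output Files
//!
//! Besides console output, the examples save their results to the working
//! directory: `basic_transcription.txt`, `transcription_sample.*` (one file
//! per response format), `detailed_transcription.json`,
//! `translation_result.txt`, and `advanced_transcription.json`.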

use openai_client_base::{
    apis::{audio_api, configuration::Configuration},
    models::{
        create_speech_request::ResponseFormat as SpeechResponseFormat,
        AudioResponseFormat,
        CreateSpeechRequest,
        // TranscriptionChunkingStrategy, TranscriptionChunkingStrategyTextVariantEnum,
        TranscriptionInclude,
    },
};
use openai_ergonomic::{Client, Error};
use std::io::Write;
use std::path::PathBuf;

#[tokio::main]
async fn main() -> Result<(), Box<dyn std::error::Error>> {
    println!(" OpenAI Ergonomic - Audio Transcription (Speech-to-Text) Example\n");

    // Initialize client from environment variables
    let client = match Client::from_env() {
        Ok(client_builder) => {
            println!(" Client initialized successfully");
            client_builder.build()
        }
        Err(e) => {
            eprintln!(" Failed to initialize client: {e}");
            eprintln!(" Make sure OPENAI_API_KEY is set in your environment");
            return Err(e.into());
        }
    };

    // First, create some sample audio files if they don't exist
    println!("\n Preparing sample audio files...");
    match create_sample_audio_files(&client).await {
        Ok(()) => println!(" Sample audio files ready"),
        Err(e) => {
            eprintln!(" Failed to create sample audio files: {e}");
            eprintln!(" You may need to provide your own audio files");
        }
    }

    // Example 1: Basic Speech-to-Text
    println!("\n Example 1: Basic Speech-to-Text Transcription");
    println!("===============================================");

    match basic_transcription_example(&client).await {
        Ok(()) => println!(" Basic transcription example completed"),
        Err(e) => {
            eprintln!(" Basic transcription example failed: {e}");
            handle_api_error(&e);
        }
    }

    // Example 2: Response Format Comparison
    println!("\n Example 2: Response Format Comparison");
    println!("==========================================");

    match response_format_example(&client).await {
        Ok(()) => println!(" Response format example completed"),
        Err(e) => {
            eprintln!(" Response format example failed: {e}");
            handle_api_error(&e);
        }
    }

    // Example 3: Detailed Transcription with Timestamps
    println!("\n⏰ Example 3: Detailed Transcription with Timestamps");
    println!("==================================================");

    match detailed_transcription_example(&client).await {
        Ok(()) => println!(" Detailed transcription example completed"),
        Err(e) => {
            eprintln!(" Detailed transcription example failed: {e}");
            handle_api_error(&e);
        }
    }

    // Example 4: Audio Translation
    println!("\n Example 4: Audio Translation to English");
    println!("===========================================");

    match translation_example(&client).await {
        Ok(()) => println!(" Translation example completed"),
        Err(e) => {
            eprintln!(" Translation example failed: {e}");
            handle_api_error(&e);
        }
    }

    // Example 5: Advanced Options
    println!("\n Example 5: Advanced Transcription Options");
    println!("============================================");

    match advanced_options_example(&client).await {
        Ok(()) => println!(" Advanced options example completed"),
        Err(e) => {
            eprintln!(" Advanced options example failed: {e}");
            handle_api_error(&e);
        }
    }

    println!("\n All audio transcription examples completed! Check the console output above for results.");
    Ok(())
}

/// Create sample audio files for testing if they don't exist
async fn create_sample_audio_files(client: &Client) -> Result<(), Error> {
    let sample_files = [
        ("sample_english.mp3", "Hello, this is a sample English audio for transcription testing. The quick brown fox jumps over the lazy dog."),
        ("sample_long.mp3", "This is a longer audio sample that will be used to demonstrate timestamp extraction and segmentation features. It contains multiple sentences with pauses between them. The purpose is to show how the transcription API can break down longer audio into meaningful segments with accurate timing information."),
        ("sample_numbers.mp3", "Here are some numbers for testing: one, two, three, four, five. The year is twenty twenty-four. My phone number is five five five, one two three four."),
    ];

    let configuration = create_configuration(client);

    for (filename, text) in &sample_files {
        let path = PathBuf::from(filename);
        if path.exists() {
            println!("   Sample audio already exists: {filename}");
        } else {
            println!("   Creating sample audio: {filename}");

            let request = CreateSpeechRequest::builder()
                .model("tts-1".to_string())
                .input((*text).to_string())
                .voice("alloy".to_string())
                .response_format(SpeechResponseFormat::Mp3)
                .speed(0.9) // Slightly slower for clearer transcription
                .build();

            match audio_api::create_speech()
                .configuration(&configuration)
                .create_speech_request(request)
                .call()
                .await
            {
                Ok(response) => {
                    let audio_data = response.bytes().await.map_err(Error::Http)?;
                    save_audio_file(&audio_data, filename)?;
                    println!("      Created: {filename}");
                }
                Err(e) => {
                    eprintln!("      Failed to create {filename}: {e}");
                }
            }
        }
    }

    Ok(())
}

/// Example 1: Basic speech-to-text transcription
async fn basic_transcription_example(client: &Client) -> Result<(), Error> {
    println!("Performing basic speech-to-text transcription...");

    let audio_file = PathBuf::from("sample_english.mp3");
    if !audio_file.exists() {
        eprintln!(" Audio file not found: {}", audio_file.display());
        eprintln!(" Run the audio creation step first or provide your own audio file");
        return Ok(());
    }

    // Note: Once audio builders are implemented, this would look like:
    // let transcription = client
    //     .audio()
    //     .transcription()
    //     .file(&audio_file)
    //     .model("whisper-1")
    //     .response_format("json")
    //     .execute()
    //     .await?;

    let configuration = create_configuration(client);

    println!("   Transcribing: {}", audio_file.display());

    match audio_api::create_transcription()
        .configuration(&configuration)
        .file(audio_file.clone())
        .model("whisper-1")
        .response_format(AudioResponseFormat::Json)
        .call()
        .await
    {
        Ok(response) => {
            println!("   Transcription Results:");
            println!("     Text: \"{}\"", response.text);
            println!("     Language: {}", response.language);
            println!("     Duration: {:.2} seconds", response.duration);

            if let Some(usage) = &response.usage {
                println!("     Usage: {} seconds", usage.seconds);
            }

            // Save transcription to file
            let output_file = "basic_transcription.txt";
            save_text_file(&response.text, output_file)?;
            println!("      Saved transcription to: {output_file}");
        }
        Err(e) => {
            eprintln!("      Transcription failed: {e}");
            return Err(Error::Api {
                status: 0,
                message: e.to_string(),
                error_type: None,
                error_code: None,
            });
        }
    }

    Ok(())
}

/// Example 2: Compare different response formats
async fn response_format_example(client: &Client) -> Result<(), Error> {
    println!("Comparing different response formats...");

    let audio_file = PathBuf::from("sample_english.mp3");
    if !audio_file.exists() {
        eprintln!(" Audio file not found: {}", audio_file.display());
        return Ok(());
    }

    let formats = [
        (
            AudioResponseFormat::Json,
            "json",
            "JSON format with metadata",
        ),
        (AudioResponseFormat::Text, "text", "Plain text only"),
        (AudioResponseFormat::Srt, "srt", "SubRip subtitle format"),
        (
            AudioResponseFormat::VerboseJson,
            "verbose_json",
            "JSON with detailed timing",
        ),
        (AudioResponseFormat::Vtt, "vtt", "WebVTT subtitle format"),
    ];
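
    // Note: whisper-1 accepts all of these formats; per OpenAI's docs, the
    // newer gpt-4o-transcribe models currently support only the json format.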

    let configuration = create_configuration(client);

    for (format, extension, description) in &formats {
        println!("   Testing format: {description}");

        match audio_api::create_transcription()
            .configuration(&configuration)
            .file(audio_file.clone())
            .model("whisper-1")
            .response_format(*format)
            .call()
            .await
        {
            Ok(response) => {
                let filename = format!("transcription_sample.{extension}");

                match format {
                    AudioResponseFormat::Text => {
                        // For text format, the response is just the text
                        save_text_file(&response.text, &filename)?;
                        println!("      Saved as: {filename}");
                    }
                    AudioResponseFormat::Json | AudioResponseFormat::VerboseJson => {
                        // For JSON formats, save the full structured response
                        let json_output = serde_json::to_string_pretty(&response)
                            .unwrap_or_else(|_| response.text.clone());
                        save_text_file(&json_output, &filename)?;
                        println!("      Saved as: {filename}");

                        if *format == AudioResponseFormat::VerboseJson {
                            if let Some(segments) = &response.segments {
                                println!("      Found {} segments", segments.len());
                            }
                            if let Some(words) = &response.words {
                                println!("      Found {} words with timestamps", words.len());
                            }
                        }
                    }
                    AudioResponseFormat::Srt | AudioResponseFormat::Vtt => {
                        // For subtitle formats, the text contains the formatted output
                        save_text_file(&response.text, &filename)?;
                        println!("      Saved as: {filename}");
                    }
                }
            }
            Err(e) => {
                eprintln!("      Failed to transcribe in format {extension}: {e}");
            }
        }
    }

    println!("\n Note: Different formats serve different purposes:");
    println!("   - JSON: Basic transcription with metadata");
    println!("   - Text: Just the transcribed text, no metadata");
    println!("   - SRT: SubRip subtitle format for video");
    println!("   - Verbose JSON: Includes word-level timestamps and confidence");
    println!("   - VTT: WebVTT format for web-based subtitles");
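
    // For reference, SRT output looks roughly like the following (timings and
    // line breaks depend on the audio; this snippet is illustrative only):
    //
    //   1
    //   00:00:00,000 --> 00:00:04,000
    //   Hello, this is a sample English audio for transcription testing.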

    Ok(())
}

/// Example 3: Detailed transcription with timestamps and segments
async fn detailed_transcription_example(client: &Client) -> Result<(), Error> {
    println!("Performing detailed transcription with timestamps...");

    let audio_file = PathBuf::from("sample_long.mp3");
    if !audio_file.exists() {
        eprintln!(" Audio file not found: {}", audio_file.display());
        return Ok(());
    }

    let configuration = create_configuration(client);

    println!("   Transcribing with detailed timing information...");

    // Request detailed transcription with timestamps
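    // A verbose_json response carries top-level `text`, `language`, and
    // `duration`, plus `segments` (each with `start`, `end`, `text`, and
    // `avg_logprob`) and `words` (each with `word`, `start`, `end`) when the
    // matching timestamp granularities are requested.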
    match audio_api::create_transcription()
        .configuration(&configuration)
        .file(audio_file.clone())
        .model("whisper-1")
        .response_format(AudioResponseFormat::VerboseJson)
        .timestamp_granularities(vec!["word".to_string(), "segment".to_string()])
        .include(vec![TranscriptionInclude::Logprobs])
        .temperature(0.0) // Low temperature for consistency
        .call()
        .await
    {
        Ok(response) => {
            println!("   Detailed Transcription Results:");
            println!("     Text: \"{}\"", response.text);
            println!("     Language: {}", response.language);
            println!("     Duration: {:.2} seconds", response.duration);

            // Display segment information
            if let Some(segments) = &response.segments {
                println!("\n   Segment Analysis ({} segments):", segments.len());
                for (i, segment) in segments.iter().enumerate() {
                    println!(
                        "     Segment {}: [{:.2}s - {:.2}s] \"{}\"",
                        i + 1,
                        segment.start,
                        segment.end,
                        segment.text
                    );
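                    // avg_logprob is the mean token log probability; values
                    // closer to 0.0 indicate higher confidence.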
                    let avg_logprob = segment.avg_logprob;
                    if avg_logprob != 0.0 {
                        println!("       Confidence: {avg_logprob:.3}");
                    }
                }
            }

            // Display word-level timestamps
            if let Some(words) = &response.words {
                println!("\n   Word-level Timestamps (first 10 words):");
                for (i, word) in words.iter().take(10).enumerate() {
                    println!(
                        "     {}: [{:.2}s - {:.2}s] \"{}\"",
                        i + 1,
                        word.start,
                        word.end,
                        word.word
                    );
                }
                if words.len() > 10 {
                    println!("     ... and {} more words", words.len() - 10);
                }
            }

            // Save detailed results
            let json_output =
                serde_json::to_string_pretty(&response).unwrap_or_else(|_| response.text.clone());
            save_text_file(&json_output, "detailed_transcription.json")?;
            println!("      Saved detailed results to: detailed_transcription.json");
        }
        Err(e) => {
            eprintln!("      Detailed transcription failed: {e}");
            return Err(Error::Api {
                status: 0,
                message: e.to_string(),
                error_type: None,
                error_code: None,
            });
        }
    }

    Ok(())
}

/// Example 4: Audio translation to English
async fn translation_example(client: &Client) -> Result<(), Error> {
    println!("Demonstrating audio translation to English...");

    // For this example, we'll use one of our existing audio files
    // In a real scenario, you might have audio in different languages
    let audio_file = PathBuf::from("sample_english.mp3");
    if !audio_file.exists() {
        eprintln!(" Audio file not found: {}", audio_file.display());
        return Ok(());
    }

    let configuration = create_configuration(client);

    println!("   Translating audio to English...");
    println!("     Note: This example uses English audio, but translation works with any language");
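
    // Note: per OpenAI's docs, the translations endpoint currently supports
    // only the whisper-1 model, and its output is always English text.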

    match audio_api::create_translation()
        .configuration(&configuration)
        .file(audio_file.clone())
        .model("whisper-1")
        .response_format("json")
        .temperature(0.2)
        .call()
        .await
    {
        Ok(response) => {
            println!("   Translation Results:");
            println!("     Translated Text: \"{}\"", response.text);

            // Save translation
            save_text_file(&response.text, "translation_result.txt")?;
            println!("      Saved translation to: translation_result.txt");

            println!("\n Translation Notes:");
            println!("   - Translation always outputs English text regardless of input language");
            println!(
                "   - It differs from transcription, which preserves the original language"
            );
            println!("   - Useful for creating English subtitles from foreign language audio");
            println!("   - Works with the same audio formats as transcription");
        }
        Err(e) => {
            eprintln!("      Translation failed: {e}");
            return Err(Error::Api {
                status: 0,
                message: e.to_string(),
                error_type: None,
                error_code: None,
            });
        }
    }

    Ok(())
}

/// Example 5: Advanced transcription options
async fn advanced_options_example(client: &Client) -> Result<(), Error> {
    println!("Demonstrating advanced transcription options...");

    let audio_file = PathBuf::from("sample_numbers.mp3");
    if !audio_file.exists() {
        eprintln!(" Audio file not found: {}", audio_file.display());
        return Ok(());
    }

    let configuration = create_configuration(client);

    // Example with language specification and prompt
    println!("   Advanced transcription with language and prompt...");

    let prompt = "This audio contains numbers and phone numbers. Please transcribe them accurately as digits where appropriate.";
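
    // Note: per OpenAI's docs, Whisper only considers roughly the final 224
    // tokens of the prompt, so keep the context short and relevant.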

    match audio_api::create_transcription()
        .configuration(&configuration)
        .file(audio_file.clone())
        .model("whisper-1")
        .language("en") // Specify language for better accuracy
        .prompt(prompt) // Provide context to improve accuracy
        .response_format(AudioResponseFormat::VerboseJson)
        .temperature(0.0) // Deterministic output
        // .chunking_strategy(TranscriptionChunkingStrategy::TextVariant(TranscriptionChunkingStrategyTextVariantEnum::Auto)) // Commented out due to type mismatch
        .include(vec![TranscriptionInclude::Logprobs])
        .call()
        .await
    {
        Ok(response) => {
            println!("   Advanced Transcription Results:");
            println!("     Text: \"{}\"", response.text);
            println!("     Language: {}", response.language);
            println!("     Duration: {:.2} seconds", response.duration);

            // Show confidence information if available
            if let Some(logprobs) = &response.logprobs {
                println!(
                    "     Log Probabilities: {} tokens with confidence scores",
                    logprobs.len()
                );
            }

            // Analyze segments for number detection
            if let Some(segments) = &response.segments {
                println!("\n   Number Detection Analysis:");
                for (i, segment) in segments.iter().enumerate() {
                    let contains_numbers = segment.text.chars().any(|c| c.is_ascii_digit());
                    if contains_numbers {
                        println!(
                            "     Segment {} (contains numbers): \"{}\"",
                            i + 1,
                            segment.text
                        );
                        let confidence = segment.avg_logprob;
                        if confidence != 0.0 {
                            println!("       Confidence: {confidence:.3}");
                        }
                    }
                }
            }

            // Save results
            let json_output =
                serde_json::to_string_pretty(&response).unwrap_or_else(|_| response.text.clone());
            save_text_file(&json_output, "advanced_transcription.json")?;
            println!("      Saved advanced results to: advanced_transcription.json");
        }
        Err(e) => {
            eprintln!("      Advanced transcription failed: {e}");
            return Err(Error::Api {
                status: 0,
                message: e.to_string(),
                error_type: None,
                error_code: None,
            });
        }
    }

    println!("\n Advanced Options Summary:");
    println!("   - Language: Specify input language for better accuracy");
    println!("   - Prompt: Provide context to guide transcription style");
    println!("   - Temperature: Control randomness (0.0 = deterministic)");
    println!("   - Chunking Strategy: How long audio is split into chunks (e.g. auto)");
    println!("   - Include: Additional data like log probabilities");
    println!("   - Timestamp Granularities: Word-level or segment-level timing");

    Ok(())
}

/// Helper function to create configuration from client
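///
/// This bridges the ergonomic `Client` to the generated `openai-client-base`
/// APIs until dedicated audio builders land on the ergonomic client (see the
/// commented builder sketch in `basic_transcription_example`).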
fn create_configuration(client: &Client) -> Configuration {
    let mut configuration = Configuration::new();
    configuration.bearer_access_token = Some(client.config().api_key().to_string());

    if let Some(base_url) = client.config().base_url() {
        configuration.base_path = base_url.to_string();
    }

    if let Some(org_id) = client.config().organization_id() {
        configuration.user_agent = Some(format!(
            "openai-ergonomic/{} org/{}",
            env!("CARGO_PKG_VERSION"),
            org_id
        ));
    }

    configuration
}

/// Helper function to save audio data to file
fn save_audio_file(audio_data: &[u8], filename: &str) -> Result<(), Error> {
    let path = PathBuf::from(filename);
    let mut file = std::fs::File::create(&path).map_err(Error::File)?;
    file.write_all(audio_data).map_err(Error::File)?;
    Ok(())
}

/// Helper function to save text data to file
fn save_text_file(text: &str, filename: &str) -> Result<(), Error> {
    let path = PathBuf::from(filename);
    let mut file = std::fs::File::create(&path).map_err(Error::File)?;
    file.write_all(text.as_bytes()).map_err(Error::File)?;
    Ok(())
}

/// Comprehensive error handling helper
fn handle_api_error(error: &Error) {
    match error {
        Error::Api {
            status,
            message,
            error_type,
            error_code,
        } => {
            eprintln!(" API Error [{status}]: {message}");
            if let Some(error_type) = error_type {
                eprintln!("   Type: {error_type}");
            }
            if let Some(error_code) = error_code {
                eprintln!("   Code: {error_code}");
            }

            // Provide specific guidance based on the HTTP status code
            match *status {
                401 => eprintln!(" Check your API key: export OPENAI_API_KEY=\"your-key\""),
                429 => eprintln!(" Rate limited - try again in a moment"),
                500..=599 => eprintln!(" Server error - try again later"),
                _ => {}
            }
        }
        Error::InvalidRequest(msg) => {
            eprintln!(" Invalid Request: {msg}");
            eprintln!(" Check your request parameters and audio file format");
        }
        Error::Config(msg) => {
            eprintln!(" Configuration Error: {msg}");
            eprintln!(" Check your client configuration");
        }
        Error::Http(err) => {
            eprintln!(" HTTP Error: {err}");
            eprintln!(" Check your network connection");
        }
        Error::HttpMiddleware(err) => {
            eprintln!(" HTTP Middleware Error: {err}");
            eprintln!(" Check your network connection and middleware configuration");
        }
        Error::Json(err) => {
            eprintln!(" JSON Error: {err}");
            eprintln!(" Response parsing failed - may be a temporary issue");
        }
        Error::Authentication(msg) => {
            eprintln!(" Authentication Error: {msg}");
            eprintln!(" Check your API key");
        }
        Error::RateLimit(msg) => {
            eprintln!(" Rate Limit Error: {msg}");
            eprintln!(" Try again in a moment");
        }
        Error::Stream(msg) => {
            eprintln!(" Stream Error: {msg}");
            eprintln!(" Connection issue with streaming");
        }
        Error::File(err) => {
            eprintln!(" File Error: {err}");
            eprintln!(" Check file permissions and paths, ensure audio file exists");
        }
        Error::Builder(msg) => {
            eprintln!(" Builder Error: {msg}");
            eprintln!(" Check your request builder configuration");
        }
        Error::Internal(msg) => {
            eprintln!(" Internal Error: {msg}");
            eprintln!(" This may be a bug, please report it");
        }
        Error::StreamConnection { message } => {
            eprintln!(" Stream Connection Error: {message}");
            eprintln!(" Check your network connection");
        }
        Error::StreamParsing { message, chunk } => {
            eprintln!(" Stream Parsing Error: {message}");
            eprintln!("   Problematic chunk: {chunk}");
            eprintln!(" The response stream may be corrupted");
        }
        Error::StreamBuffer { message } => {
            eprintln!(" Stream Buffer Error: {message}");
            eprintln!(" The stream buffer encountered an issue");
        }
    }
}
688}