multi_speaker_tts/
multi_speaker_tts.rs

1use base64::{engine::general_purpose, Engine as _};
2use gemini_rust::{Gemini, GenerationConfig, Part, SpeakerVoiceConfig, SpeechConfig};
3use std::fs::File;
4use std::io::Write;
5
6#[tokio::main]
7async fn main() -> Result<(), Box<dyn std::error::Error>> {
8    // Load API key from environment variable
9    let api_key =
10        std::env::var("GEMINI_API_KEY").expect("Please set GEMINI_API_KEY environment variable");
11
12    // Create client with TTS-enabled model
13    let client = Gemini::with_model(api_key, "models/gemini-2.5-flash-preview-tts".to_string());
14
15    println!("šŸŽ­ Gemini Multi-Speaker Speech Generation Example");
16    println!("Generating multi-speaker audio from dialogue...\n");
17
18    // Create multi-speaker configuration
19    let speakers = vec![
20        SpeakerVoiceConfig::new("Alice", "Puck"),
21        SpeakerVoiceConfig::new("Bob", "Charon"),
22    ];
23
24    // Create generation config with multi-speaker speech settings
25    let generation_config = GenerationConfig {
26        response_modalities: Some(vec!["AUDIO".to_string()]),
27        speech_config: Some(SpeechConfig::multi_speaker(speakers)),
28        ..Default::default()
29    };
30
31    // Create a dialogue with speaker tags
32    let dialogue = r#"
33Alice: Hello there! I'm excited to demonstrate multi-speaker text-to-speech with Gemini.
34
35Bob: That's amazing! I can't believe how natural this sounds. The different voices really bring the conversation to life.
36
37Alice: Exactly! Each speaker has their own distinct voice characteristics, making it easy to follow who's speaking.
38
39Bob: This technology opens up so many possibilities for audio content creation, educational materials, and accessibility features.
40
41Alice: I couldn't agree more. It's remarkable how far AI-generated speech has come!
42"#;
43
44    match client
45        .generate_content()
46        .with_user_message(dialogue)
47        .with_generation_config(generation_config)
48        .execute()
49        .await
50    {
51        Ok(response) => {
52            println!("āœ… Multi-speaker speech generation completed!");
53
54            // Check if we have candidates
55            for (i, candidate) in response.candidates.iter().enumerate() {
56                if let Some(parts) = &candidate.content.parts {
57                    for (j, part) in parts.iter().enumerate() {
58                        match part {
59                            // Look for inline data with audio MIME type
60                            Part::InlineData { inline_data } => {
61                                if inline_data.mime_type.starts_with("audio/") {
62                                    println!("šŸ“„ Found audio data: {}", inline_data.mime_type);
63
64                                    // Decode base64 audio data
65                                    match general_purpose::STANDARD.decode(&inline_data.data) {
66                                        Ok(audio_bytes) => {
67                                            let filename =
68                                                format!("multi_speaker_dialogue_{}_{}.pcm", i, j);
69
70                                            // Save audio to file
71                                            match File::create(&filename) {
72                                                Ok(mut file) => {
73                                                    if let Err(e) = file.write_all(&audio_bytes) {
74                                                        eprintln!(
75                                                            "āŒ Error writing audio file: {}",
76                                                            e
77                                                        );
78                                                    } else {
79                                                        println!(
80                                                            "šŸ’¾ Multi-speaker audio saved as: {}",
81                                                            filename
82                                                        );
83                                                        println!("šŸŽ§ Play with: aplay {} (Linux) or afplay {} (macOS)", filename, filename);
84                                                        println!("šŸ‘„ Features Alice (Puck voice) and Bob (Charon voice)");
85                                                    }
86                                                }
87                                                Err(e) => {
88                                                    eprintln!("āŒ Error creating audio file: {}", e)
89                                                }
90                                            }
91                                        }
92                                        Err(e) => {
93                                            eprintln!("āŒ Error decoding base64 audio: {}", e)
94                                        }
95                                    }
96                                }
97                            }
98                            // Display any text content
99                            Part::Text { text, thought } => {
100                                if thought.unwrap_or(false) {
101                                    println!("šŸ’­ Model thought: {}", text);
102                                } else {
103                                    println!("šŸ“ Generated text: {}", text);
104                                }
105                            }
106                            _ => {
107                                // Handle other part types if needed
108                            }
109                        }
110                    }
111                }
112            }
113
114            // Display usage metadata if available
115            if let Some(usage_metadata) = &response.usage_metadata {
116                println!("\nšŸ“Š Usage Statistics:");
117                println!("   Prompt tokens: {}", usage_metadata.prompt_token_count);
118                println!("   Total tokens: {}", usage_metadata.total_token_count);
119                if let Some(thoughts_tokens) = usage_metadata.thoughts_token_count {
120                    println!("   Thinking tokens: {}", thoughts_tokens);
121                }
122            }
123        }
124        Err(e) => {
125            eprintln!("āŒ Error generating multi-speaker speech: {}", e);
126            eprintln!("\nšŸ’” Troubleshooting tips:");
127            eprintln!("   1. Make sure GEMINI_API_KEY environment variable is set");
128            eprintln!("   2. Verify you have access to the Gemini TTS model");
129            eprintln!("   3. Check your internet connection");
130            eprintln!("   4. Ensure speaker names in dialogue match configured speakers");
131            eprintln!("   5. Make sure the model 'gemini-2.5-flash-preview-tts' supports multi-speaker TTS");
132        }
133    }
134
135    println!("\nšŸŽ‰ Example completed!");
136    println!("šŸ’” Tips for multi-speaker TTS:");
137    println!("   • Use clear speaker names (Alice:, Bob:, etc.)");
138    println!("   • Configure voice for each speaker beforehand");
139    println!("   • Available voices: Puck, Charon, Kore, Fenrir, Aoede");
140    println!("   • Each speaker maintains consistent voice characteristics");
141
142    Ok(())
143}