multi_speaker_tts/
multi_speaker_tts.rs1use base64::{engine::general_purpose, Engine as _};
2use gemini_rust::{Gemini, GenerationConfig, Part, SpeakerVoiceConfig, SpeechConfig};
3use std::fs::File;
4use std::io::Write;
5
6#[tokio::main]
7async fn main() -> Result<(), Box<dyn std::error::Error>> {
8 let api_key =
10 std::env::var("GEMINI_API_KEY").expect("Please set GEMINI_API_KEY environment variable");
11
12 let client = Gemini::with_model(api_key, "models/gemini-2.5-flash-preview-tts".to_string());
14
15 println!("š Gemini Multi-Speaker Speech Generation Example");
16 println!("Generating multi-speaker audio from dialogue...\n");
17
18 let speakers = vec![
20 SpeakerVoiceConfig::new("Alice", "Puck"),
21 SpeakerVoiceConfig::new("Bob", "Charon"),
22 ];
23
24 let generation_config = GenerationConfig {
26 response_modalities: Some(vec!["AUDIO".to_string()]),
27 speech_config: Some(SpeechConfig::multi_speaker(speakers)),
28 ..Default::default()
29 };
30
31 let dialogue = r#"
33Alice: Hello there! I'm excited to demonstrate multi-speaker text-to-speech with Gemini.
34
35Bob: That's amazing! I can't believe how natural this sounds. The different voices really bring the conversation to life.
36
37Alice: Exactly! Each speaker has their own distinct voice characteristics, making it easy to follow who's speaking.
38
39Bob: This technology opens up so many possibilities for audio content creation, educational materials, and accessibility features.
40
41Alice: I couldn't agree more. It's remarkable how far AI-generated speech has come!
42"#;
43
44 match client
45 .generate_content()
46 .with_user_message(dialogue)
47 .with_generation_config(generation_config)
48 .execute()
49 .await
50 {
51 Ok(response) => {
52 println!("ā
Multi-speaker speech generation completed!");
53
54 for (i, candidate) in response.candidates.iter().enumerate() {
56 if let Some(parts) = &candidate.content.parts {
57 for (j, part) in parts.iter().enumerate() {
58 match part {
59 Part::InlineData { inline_data } => {
61 if inline_data.mime_type.starts_with("audio/") {
62 println!("š Found audio data: {}", inline_data.mime_type);
63
64 match general_purpose::STANDARD.decode(&inline_data.data) {
66 Ok(audio_bytes) => {
67 let filename =
68 format!("multi_speaker_dialogue_{}_{}.pcm", i, j);
69
70 match File::create(&filename) {
72 Ok(mut file) => {
73 if let Err(e) = file.write_all(&audio_bytes) {
74 eprintln!(
75 "ā Error writing audio file: {}",
76 e
77 );
78 } else {
79 println!(
80 "š¾ Multi-speaker audio saved as: {}",
81 filename
82 );
83 println!("š§ Play with: aplay {} (Linux) or afplay {} (macOS)", filename, filename);
84 println!("š„ Features Alice (Puck voice) and Bob (Charon voice)");
85 }
86 }
87 Err(e) => {
88 eprintln!("ā Error creating audio file: {}", e)
89 }
90 }
91 }
92 Err(e) => {
93 eprintln!("ā Error decoding base64 audio: {}", e)
94 }
95 }
96 }
97 }
98 Part::Text { text, thought } => {
100 if thought.unwrap_or(false) {
101 println!("š Model thought: {}", text);
102 } else {
103 println!("š Generated text: {}", text);
104 }
105 }
106 _ => {
107 }
109 }
110 }
111 }
112 }
113
114 if let Some(usage_metadata) = &response.usage_metadata {
116 println!("\nš Usage Statistics:");
117 println!(" Prompt tokens: {}", usage_metadata.prompt_token_count);
118 println!(" Total tokens: {}", usage_metadata.total_token_count);
119 if let Some(thoughts_tokens) = usage_metadata.thoughts_token_count {
120 println!(" Thinking tokens: {}", thoughts_tokens);
121 }
122 }
123 }
124 Err(e) => {
125 eprintln!("ā Error generating multi-speaker speech: {}", e);
126 eprintln!("\nš” Troubleshooting tips:");
127 eprintln!(" 1. Make sure GEMINI_API_KEY environment variable is set");
128 eprintln!(" 2. Verify you have access to the Gemini TTS model");
129 eprintln!(" 3. Check your internet connection");
130 eprintln!(" 4. Ensure speaker names in dialogue match configured speakers");
131 eprintln!(" 5. Make sure the model 'gemini-2.5-flash-preview-tts' supports multi-speaker TTS");
132 }
133 }
134
135 println!("\nš Example completed!");
136 println!("š” Tips for multi-speaker TTS:");
137 println!(" ⢠Use clear speaker names (Alice:, Bob:, etc.)");
138 println!(" ⢠Configure voice for each speaker beforehand");
139 println!(" ⢠Available voices: Puck, Charon, Kore, Fenrir, Aoede");
140 println!(" ⢠Each speaker maintains consistent voice characteristics");
141
142 Ok(())
143}