// simple_speech_generation/simple_speech_generation.rs

use base64::{engine::general_purpose, Engine as _};
use gemini_rust::{Gemini, GenerationConfig, Part, PrebuiltVoiceConfig, SpeechConfig, VoiceConfig};
use std::fs::File;
use std::io::Write;

6#[tokio::main]
7async fn main() -> Result<(), Box<dyn std::error::Error>> {
8    // Load API key from environment variable
9    let api_key =
10        std::env::var("GEMINI_API_KEY").expect("Please set GEMINI_API_KEY environment variable");
11
12    // Create client with TTS-enabled model
13    let client = Gemini::with_model(api_key, "models/gemini-2.5-flash-preview-tts".to_string());
14
15    println!("šŸŽ¤ Gemini Speech Generation Example");
16    println!("Generating audio from text...\n");
17
18    // Create generation config with speech settings
19    let generation_config = GenerationConfig {
20        response_modalities: Some(vec!["AUDIO".to_string()]),
21        speech_config: Some(SpeechConfig {
22            voice_config: Some(VoiceConfig {
23                prebuilt_voice_config: Some(PrebuiltVoiceConfig {
24                    voice_name: "Puck".to_string(),
25                }),
26            }),
27            multi_speaker_voice_config: None,
28        }),
29        ..Default::default()
30    };
31
32    match client
33        .generate_content()
34        .with_user_message("Hello! This is a demonstration of text-to-speech using Google's Gemini API. The voice you're hearing is generated entirely by AI.")
35        .with_generation_config(generation_config)
36        .execute()
37        .await {
38        Ok(response) => {
39            println!("āœ… Speech generation completed!");
40
41            // Check if we have candidates
42            for (i, candidate) in response.candidates.iter().enumerate() {
43                if let Some(parts) = &candidate.content.parts {
44                    for (j, part) in parts.iter().enumerate() {
45                        match part {
46                            // Look for inline data with audio MIME type
47                            Part::InlineData { inline_data } => {
48                                if inline_data.mime_type.starts_with("audio/") {
49                                    println!("šŸ“„ Found audio data: {}", inline_data.mime_type);
50
51                                    // Decode base64 audio data using the new API
52                                    match general_purpose::STANDARD.decode(&inline_data.data) {
53                                        Ok(audio_bytes) => {
54                                            let filename = format!("speech_output_{}_{}.pcm", i, j);
55
56                                            // Save audio to file
57                                            match File::create(&filename) {
58                                                Ok(mut file) => {
59                                                    if let Err(e) = file.write_all(&audio_bytes) {
60                                                        eprintln!("āŒ Error writing audio file: {}", e);
61                                                    } else {
62                                                        println!("šŸ’¾ Audio saved as: {}", filename);
63                                                        println!("šŸ”Š You can play it with: aplay {} (Linux) or afplay {} (macOS)", filename, filename);
64                                                    }
65                                                },
66                                                Err(e) => eprintln!("āŒ Error creating audio file: {}", e),
67                                            }
68                                        },
69                                        Err(e) => eprintln!("āŒ Error decoding base64 audio: {}", e),
70                                    }
71                                }
72                            },
73                            // Display any text content
74                            Part::Text { text, thought } => {
75                                if thought.unwrap_or(false) {
76                                    println!("šŸ’­ Thought: {}", text);
77                                } else {
78                                    println!("šŸ“ Text content: {}", text);
79                                }
80                            },
81                            _ => {
82                                // Handle other part types if needed
83                            }
84                        }
85                    }
86                }
87            }
88
89            // Display usage metadata if available
90            if let Some(usage_metadata) = &response.usage_metadata {
91                println!("\nšŸ“Š Usage Statistics:");
92                println!("   Prompt tokens: {}", usage_metadata.prompt_token_count);
93                println!("   Total tokens: {}", usage_metadata.total_token_count);
94            }
95        },
96        Err(e) => {
97            eprintln!("āŒ Error generating speech: {}", e);
98            eprintln!("\nšŸ’” Troubleshooting tips:");
99            eprintln!("   1. Make sure GEMINI_API_KEY environment variable is set");
100            eprintln!("   2. Verify you have access to the Gemini TTS model");
101            eprintln!("   3. Check your internet connection");
102            eprintln!("   4. Ensure the model 'gemini-2.5-flash-preview-tts' is available");
103        }
104    }
105
106    Ok(())
107}