simple_speech_generation/
simple_speech_generation.rs1use base64::{engine::general_purpose, Engine as _};
2use gemini_rust::{Gemini, GenerationConfig, Part, PrebuiltVoiceConfig, SpeechConfig, VoiceConfig};
3use std::fs::File;
4use std::io::Write;
5
6#[tokio::main]
7async fn main() -> Result<(), Box<dyn std::error::Error>> {
8 let api_key =
10 std::env::var("GEMINI_API_KEY").expect("Please set GEMINI_API_KEY environment variable");
11
12 let client = Gemini::with_model(api_key, "models/gemini-2.5-flash-preview-tts".to_string());
14
15 println!("š¤ Gemini Speech Generation Example");
16 println!("Generating audio from text...\n");
17
18 let generation_config = GenerationConfig {
20 response_modalities: Some(vec!["AUDIO".to_string()]),
21 speech_config: Some(SpeechConfig {
22 voice_config: Some(VoiceConfig {
23 prebuilt_voice_config: Some(PrebuiltVoiceConfig {
24 voice_name: "Puck".to_string(),
25 }),
26 }),
27 multi_speaker_voice_config: None,
28 }),
29 ..Default::default()
30 };
31
32 match client
33 .generate_content()
34 .with_user_message("Hello! This is a demonstration of text-to-speech using Google's Gemini API. The voice you're hearing is generated entirely by AI.")
35 .with_generation_config(generation_config)
36 .execute()
37 .await {
38 Ok(response) => {
39 println!("ā
Speech generation completed!");
40
41 for (i, candidate) in response.candidates.iter().enumerate() {
43 if let Some(parts) = &candidate.content.parts {
44 for (j, part) in parts.iter().enumerate() {
45 match part {
46 Part::InlineData { inline_data } => {
48 if inline_data.mime_type.starts_with("audio/") {
49 println!("š Found audio data: {}", inline_data.mime_type);
50
51 match general_purpose::STANDARD.decode(&inline_data.data) {
53 Ok(audio_bytes) => {
54 let filename = format!("speech_output_{}_{}.pcm", i, j);
55
56 match File::create(&filename) {
58 Ok(mut file) => {
59 if let Err(e) = file.write_all(&audio_bytes) {
60 eprintln!("ā Error writing audio file: {}", e);
61 } else {
62 println!("š¾ Audio saved as: {}", filename);
63 println!("š You can play it with: aplay {} (Linux) or afplay {} (macOS)", filename, filename);
64 }
65 },
66 Err(e) => eprintln!("ā Error creating audio file: {}", e),
67 }
68 },
69 Err(e) => eprintln!("ā Error decoding base64 audio: {}", e),
70 }
71 }
72 },
73 Part::Text { text, thought } => {
75 if thought.unwrap_or(false) {
76 println!("š Thought: {}", text);
77 } else {
78 println!("š Text content: {}", text);
79 }
80 },
81 _ => {
82 }
84 }
85 }
86 }
87 }
88
89 if let Some(usage_metadata) = &response.usage_metadata {
91 println!("\nš Usage Statistics:");
92 println!(" Prompt tokens: {}", usage_metadata.prompt_token_count);
93 println!(" Total tokens: {}", usage_metadata.total_token_count);
94 }
95 },
96 Err(e) => {
97 eprintln!("ā Error generating speech: {}", e);
98 eprintln!("\nš” Troubleshooting tips:");
99 eprintln!(" 1. Make sure GEMINI_API_KEY environment variable is set");
100 eprintln!(" 2. Verify you have access to the Gemini TTS model");
101 eprintln!(" 3. Check your internet connection");
102 eprintln!(" 4. Ensure the model 'gemini-2.5-flash-preview-tts' is available");
103 }
104 }
105
106 Ok(())
107}