use siumai::prelude::*;
use std::path::Path;
/// Program entry point: prints the example banner and then runs each
/// multimodal demonstration in sequence.
///
/// # Errors
///
/// Returns the first error produced by any of the demonstrations; later
/// demonstrations are not run once one fails.
#[tokio::main]
async fn main() -> Result<(), Box<dyn std::error::Error>> {
    const TITLE: &str = "🎭 Multimodal Processing Example";
    const UNDERLINE: &str = "=================================\n";

    println!("{}", TITLE);
    println!("{}", UNDERLINE);

    // Each step is awaited to completion before the next one starts.
    demonstrate_image_analysis().await?;
    demonstrate_audio_processing().await?;
    demonstrate_combined_modalities().await?;
    demonstrate_multimodal_conversation().await?;

    Ok(())
}
/// Walks through the image-analysis scenarios.
///
/// A provider is chosen from the environment: `OPENAI_API_KEY` is preferred,
/// then `ANTHROPIC_API_KEY`. When neither variable is readable the function
/// prints a demo-mode notice and returns early without listing the scenarios.
///
/// # Errors
///
/// Propagates any error returned while building the provider client.
async fn demonstrate_image_analysis() -> Result<(), Box<dyn std::error::Error>> {
    println!("🖼️ 1. Image Analysis");
    println!(" Analyzing images with AI vision models\n");

    // Each variable is read once and the key is bound in the same branch that
    // uses it, so no placeholder key is ever needed.
    // The client is only constructed here; the scenarios below just print
    // what a real request would do, hence the underscore-prefixed binding.
    let _client = if let Ok(api_key) = std::env::var("OPENAI_API_KEY") {
        Siumai::builder()
            .openai()
            .api_key(&api_key)
            .model("gpt-4o")
            .temperature(0.3)
            .build()
            .await?
    } else if let Ok(api_key) = std::env::var("ANTHROPIC_API_KEY") {
        Siumai::builder()
            .anthropic()
            .api_key(&api_key)
            .model("claude-3-5-sonnet-20241022")
            .temperature(0.3)
            .build()
            .await?
    } else {
        println!(" ⚠️ No API key found. Using demo mode.");
        println!(" 📝 Demo: Would analyze image with vision model");
        println!(" 💡 Set OPENAI_API_KEY or ANTHROPIC_API_KEY to try real image analysis\n");
        return Ok(());
    };

    // (scenario title, prompt that would accompany the image)
    let scenarios = [
        (
            "Chart Analysis",
            "Analyze this chart and explain the trends you see.",
        ),
        (
            "Code Screenshot",
            "What programming language is this? Explain what the code does.",
        ),
        (
            "Document OCR",
            "Extract and summarize the text from this document.",
        ),
        (
            "Scene Description",
            "Describe this scene in detail, including objects, people, and setting.",
        ),
    ];

    for (scenario, prompt) in scenarios.iter() {
        println!(" 📊 {}", scenario);
        println!(" Prompt: {}", prompt);
        println!(" 📝 Demo: Would process image with vision model");
        println!(" 🔍 Expected: Detailed analysis based on image content\n");
    }

    Ok(())
}
/// Prints the audio-processing scenarios.
///
/// Nothing is read from disk and no model is contacted: each entry only
/// reports its title, description and the audio file it would use.
///
/// # Errors
///
/// The visible body has no failing operation; the `Result` return type keeps
/// the signature uniform with the other demonstrations.
async fn demonstrate_audio_processing() -> Result<(), Box<dyn std::error::Error>> {
    // (title, description, path of the audio file that would be processed)
    const AUDIO_SCENARIOS: [(&str, &str, &str); 4] = [
        (
            "Speech Transcription",
            "Convert speech to text",
            "audio/speech.mp3",
        ),
        (
            "Music Analysis",
            "Analyze musical content and style",
            "audio/music.wav",
        ),
        (
            "Sound Classification",
            "Identify and classify sounds",
            "audio/environment.wav",
        ),
        (
            "Language Detection",
            "Detect the language being spoken",
            "audio/multilingual.mp3",
        ),
    ];

    println!("🎵 2. Audio Processing");
    println!(" Processing audio content with AI models\n");

    for &(title, summary, audio_file) in AUDIO_SCENARIOS.iter() {
        println!(" 🎧 {}", title);
        println!(" Description: {}", summary);
        println!(" File: {}", audio_file);
        println!(" 📝 Demo: Would process audio file");
        println!(" 🔍 Expected: Transcription or analysis results\n");
    }

    Ok(())
}
/// Prints the scenarios that combine several content types in one request.
///
/// Each scenario lists a prompt together with the image and audio files that
/// would be submitted with it; nothing is actually loaded or sent.
///
/// # Errors
///
/// The visible body has no failing operation; the `Result` return type keeps
/// the signature uniform with the other demonstrations.
async fn demonstrate_combined_modalities() -> Result<(), Box<dyn std::error::Error>> {
    // (title, prompt, [image file, audio file])
    const COMBINED_SCENARIOS: [(&str, &str, [&str; 2]); 3] = [
        (
            "Video Analysis",
            "Analyze this video frame and its audio track",
            ["image/frame.jpg", "audio/soundtrack.mp3"],
        ),
        (
            "Presentation Review",
            "Review this slide and its speaker notes",
            ["image/slide.png", "audio/narration.wav"],
        ),
        (
            "Document Analysis",
            "Analyze this document image and related audio explanation",
            ["image/document.jpg", "audio/explanation.mp3"],
        ),
    ];

    println!("🎭 3. Combined Multimodal Content");
    println!(" Using text, images, and audio together\n");

    for (title, request, attachments) in COMBINED_SCENARIOS.iter() {
        println!(" 🎬 {}", title);
        println!(" Prompt: {}", request);
        // Debug formatting renders the attachment list as a bracketed,
        // quoted sequence.
        println!(" Files: {:?}", attachments);
        println!(" 📝 Demo: Would process multiple content types together");
        println!(" 🔍 Expected: Comprehensive analysis across modalities\n");
    }

    Ok(())
}
/// Prints a scripted conversation that mixes text, image and audio turns,
/// followed by three short lists of advice.
///
/// The conversation is fixed sample data; no model is contacted.
///
/// # Errors
///
/// The visible body has no failing operation; the `Result` return type keeps
/// the signature uniform with the other demonstrations.
async fn demonstrate_multimodal_conversation() -> Result<(), Box<dyn std::error::Error>> {
    // (speaker, content type, content shown for that turn)
    const TURNS: [(&str, &str, &str); 8] = [
        ("User", "Text", "I have a chart I'd like you to analyze"),
        (
            "Assistant",
            "Text",
            "I'd be happy to help! Please share the chart.",
        ),
        ("User", "Image", "[Uploads chart image]"),
        (
            "Assistant",
            "Text",
            "I can see this is a sales performance chart showing...",
        ),
        ("User", "Text", "Can you explain the trends in more detail?"),
        (
            "Assistant",
            "Text",
            "Certainly! The chart shows three key trends...",
        ),
        (
            "User",
            "Audio",
            "[Uploads audio question about specific data point]",
        ),
        (
            "Assistant",
            "Text",
            "Based on your audio question about Q3 data...",
        ),
    ];

    // (section heading, bullet lines printed beneath it)
    const ADVICE: [(&str, [&str; 4]); 3] = [
        (
            "\n 💡 Key Benefits of Multimodal Conversations:",
            [
                " • Rich context from multiple content types",
                " • Natural interaction patterns",
                " • Comprehensive understanding",
                " • Flexible communication methods",
            ],
        ),
        (
            "\n 🔧 Implementation Tips:",
            [
                " • Use appropriate models for each content type",
                " • Consider file size and format limitations",
                " • Handle different processing times gracefully",
                " • Provide fallbacks for unsupported content",
            ],
        ),
        (
            "\n 📊 Performance Considerations:",
            [
                " • Larger request sizes with multimodal content",
                " • Different pricing for different modalities",
                " • Provider-specific capabilities and limits",
                " • Network bandwidth requirements",
            ],
        ),
    ];

    println!("💬 4. Multimodal Conversation");
    println!(" Building a conversation with mixed content types\n");

    println!(" 📱 Conversation Flow:");
    for (index, &(speaker, kind, text)) in TURNS.iter().enumerate() {
        // Anyone other than the user is rendered with the robot avatar.
        let avatar = if speaker == "User" { "👤" } else { "🤖" };
        let kind_icon = match kind {
            "Text" => "💬",
            "Image" => "🖼️",
            "Audio" => "🎵",
            // Unknown content types fall back to a generic document icon.
            _ => "📄",
        };
        // Steps are numbered starting from 1.
        println!(
            " {}. {} {}: {} {}",
            index + 1,
            avatar,
            speaker,
            kind_icon,
            text
        );
    }

    for (heading, bullets) in ADVICE.iter() {
        println!("{}", heading);
        for bullet in bullets.iter() {
            println!("{}", bullet);
        }
    }

    println!("\n✨ Multimodal processing complete! You now understand how to work");
    println!(" with text, images, and audio in AI conversations.");

    Ok(())
}
/// Reports whether `path` refers to something whose metadata can be read.
///
/// Any failure to obtain metadata (missing entry, permission error, broken
/// symbolic link) is reported as `false`.
// Utility kept for readers adapting the example; the demos above do not call it.
#[allow(dead_code)]
fn file_exists(path: &str) -> bool {
    let candidate = Path::new(path);
    std::fs::metadata(candidate).is_ok()
}
/// Returns the extension of `path` (the text after the final `.` of the file
/// name), without the dot.
///
/// Yields `None` when the file name has no extension, for example `"README"`
/// or a dot-file such as `".hidden"`.
// Utility kept for readers adapting the example.
#[allow(dead_code)]
fn get_file_extension(path: &str) -> Option<&str> {
    Path::new(path)
        .extension()
        .and_then(|extension| extension.to_str())
}
/// Reports whether `path` ends in one of the image extensions this example
/// recognises: `jpg`, `jpeg`, `png`, `gif` or `webp`.
///
/// The comparison ignores ASCII case, so `photo.JPG` and `photo.jpg` are both
/// accepted. Only the extension is inspected; the file is never opened.
/// Paths without an extension return `false`.
// Utility kept for readers adapting the example; the demos above do not call it.
#[allow(dead_code)]
fn is_supported_image_format(path: &str) -> bool {
    const IMAGE_EXTENSIONS: [&str; 5] = ["jpg", "jpeg", "png", "gif", "webp"];

    match Path::new(path).extension().and_then(|ext| ext.to_str()) {
        // Extensions are commonly upper-cased (e.g. by cameras), so compare
        // without regard to ASCII case.
        Some(ext) => IMAGE_EXTENSIONS
            .iter()
            .any(|known| ext.eq_ignore_ascii_case(known)),
        None => false,
    }
}
/// Reports whether `path` ends in one of the audio extensions this example
/// recognises: `mp3`, `wav`, `m4a` or `ogg`.
///
/// The comparison ignores ASCII case, so `song.MP3` and `song.mp3` are both
/// accepted. Only the extension is inspected; the file is never opened.
/// Paths without an extension return `false`.
// Utility kept for readers adapting the example; the demos above do not call it.
#[allow(dead_code)]
fn is_supported_audio_format(path: &str) -> bool {
    const AUDIO_EXTENSIONS: [&str; 4] = ["mp3", "wav", "m4a", "ogg"];

    match Path::new(path).extension().and_then(|ext| ext.to_str()) {
        // Compare without regard to ASCII case so upper-cased extensions
        // are not rejected.
        Some(ext) => AUDIO_EXTENSIONS
            .iter()
            .any(|known| ext.eq_ignore_ascii_case(known)),
        None => false,
    }
}