1use std::io::Write;
2
3use anyhow::Result;
4use mistralrs::{
5 AudioInput, ChatCompletionChunkResponse, ChunkChoice, Delta, Response, TextMessageRole,
6 VisionMessages, VisionModelBuilder,
7};
8
9#[tokio::main]
10async fn main() -> Result<()> {
11 let model = VisionModelBuilder::new("microsoft/Phi-4-multimodal-instruct")
12 .with_logging()
13 .build()
14 .await?;
15
16 let audio_bytes =
17 reqwest::get("https://upload.wikimedia.org/wikipedia/commons/4/42/Bird_singing.ogg")
18 .await?
19 .bytes()
20 .await?
21 .to_vec();
22 let audio = AudioInput::from_bytes(&audio_bytes)?;
23
24 let image_bytes =
25 reqwest::get("https://www.allaboutbirds.org/guide/assets/og/528129121-1200px.jpg")
26 .await?
27 .bytes()
28 .await?
29 .to_vec();
30 let image = image::load_from_memory(&image_bytes)?;
31
32 let messages = VisionMessages::new().add_multimodal_message(
33 TextMessageRole::User,
34 "Describe in detail what is happening.",
35 vec![image],
36 vec![audio],
37 &model,
38 )?;
39
40 let mut stream = model.stream_chat_request(messages).await?;
41
42 while let Some(chunk) = stream.next().await {
43 if let Response::Chunk(ChatCompletionChunkResponse { choices, .. }) = chunk {
44 if let Some(ChunkChoice {
45 delta:
46 Delta {
47 content: Some(content),
48 ..
49 },
50 ..
51 }) = choices.first()
52 {
53 print!("{content}");
54 std::io::stdout().flush()?;
55 };
56 } else {
57 }
59 }
60 Ok(())
61}