Skip to main content

phi4mm_audio/
main.rs

1use std::io::Write;
2
3use anyhow::Result;
4use mistralrs::{
5    AudioInput, ChatCompletionChunkResponse, ChunkChoice, Delta, Response, TextMessageRole,
6    VisionMessages, VisionModelBuilder,
7};
8
9#[tokio::main]
10async fn main() -> Result<()> {
11    let model = VisionModelBuilder::new("microsoft/Phi-4-multimodal-instruct")
12        .with_logging()
13        .build()
14        .await?;
15
16    let audio_bytes =
17        reqwest::get("https://upload.wikimedia.org/wikipedia/commons/4/42/Bird_singing.ogg")
18            .await?
19            .bytes()
20            .await?
21            .to_vec();
22    let audio = AudioInput::from_bytes(&audio_bytes)?;
23
24    let image_bytes =
25        reqwest::get("https://www.allaboutbirds.org/guide/assets/og/528129121-1200px.jpg")
26            .await?
27            .bytes()
28            .await?
29            .to_vec();
30    let image = image::load_from_memory(&image_bytes)?;
31
32    let messages = VisionMessages::new().add_multimodal_message(
33        TextMessageRole::User,
34        "Describe in detail what is happening.",
35        vec![image],
36        vec![audio],
37        &model,
38    )?;
39
40    let mut stream = model.stream_chat_request(messages).await?;
41
42    while let Some(chunk) = stream.next().await {
43        if let Response::Chunk(ChatCompletionChunkResponse { choices, .. }) = chunk {
44            if let Some(ChunkChoice {
45                delta:
46                    Delta {
47                        content: Some(content),
48                        ..
49                    },
50                ..
51            }) = choices.first()
52            {
53                print!("{content}");
54                std::io::stdout().flush()?;
55            };
56        } else {
57            // Handle errors
58        }
59    }
60    Ok(())
61}