1use anyhow::Result;
2use mistralrs::{AudioInput, TextMessageRole, VisionMessages, VisionModelBuilder};
3
4#[tokio::main]
5async fn main() -> Result<()> {
6 let model = VisionModelBuilder::new("../hf_models/gemma3n_e4b")
7 .with_logging()
8 .build()
9 .await?;
10
11 let audio_bytes = std::fs::read("sample_speech.wav")?;
12 let audio = AudioInput::from_bytes(&audio_bytes)?;
13
14 let messages = VisionMessages::new().add_multimodal_message(
15 TextMessageRole::User,
16 "What is being said?",
17 vec![],
18 vec![audio],
19 &model,
20 )?;
21
22 let response = model.send_chat_request(messages).await?;
23
24 println!("{}", response.choices[0].message.content.as_ref().unwrap());
25 Ok(())
26}