//! # mistralrs: Blazing-Fast LLM Inference in Rust
//!
//! The Rust SDK for [mistral.rs](https://github.com/EricLBuehler/mistral.rs), a high-performance
//! LLM inference engine supporting text, multimodal, speech, image generation, and embedding models.
//!
//! ## Quick Start
//!
//! ```no_run
//! use mistralrs::{IsqBits, ModelBuilder};
//!
//! #[tokio::main]
//! async fn main() -> mistralrs::error::Result<()> {
//!     let model = ModelBuilder::new("Qwen/Qwen3-4B")
//!         .with_auto_isq(IsqBits::Four)
//!         .build()
//!         .await?;
//!
//!     let response = model.chat("What is Rust's ownership model?").await?;
//!     println!("{response}");
//!     Ok(())
//! }
//! ```
//!
//! ## Capabilities
//!
//! | Capability | Builder | Example |
//! |---|---|---|
//! | Any model (auto-detect) | [`ModelBuilder`] | `examples/getting_started/text_generation/` |
//! | Text generation | [`TextModelBuilder`] | `examples/getting_started/text_generation/` |
//! | Multimodal (image+text) | [`MultimodalModelBuilder`] | `examples/getting_started/multimodal/` |
//! | GGUF quantized models | [`GgufModelBuilder`] | `examples/getting_started/gguf/` |
//! | Image generation | [`DiffusionModelBuilder`] | `examples/models/diffusion/` |
//! | Speech synthesis | [`SpeechModelBuilder`] | `examples/models/speech/` |
//! | Embeddings | [`EmbeddingModelBuilder`] | `examples/getting_started/embedding/` |
//! | Structured output | [`Model::generate_structured`] | `examples/advanced/json_schema/` |
//! | Tool calling | [`Tool`], [`ToolChoice`] | `examples/advanced/tools/` |
//! | Agents | [`AgentBuilder`] | `examples/advanced/agent/` |
//! | Multi-model | [`MultiModelBuilder`] | `examples/advanced/multi_model/` |
//! | LoRA / X-LoRA | [`LoraModelBuilder`], [`XLoraModelBuilder`] | `examples/advanced/lora/` |
//! | AnyMoE | [`AnyMoeModelBuilder`] | `examples/advanced/anymoe/` |
//! | MCP client | [`McpClientConfig`] | `examples/advanced/mcp_client/` |
//!
//! ## Model Loading
//!
//! All models are created through builder structs that follow a consistent pattern:
//!
//! ```no_run
//! # use mistralrs::*;
//! # async fn example() -> error::Result<()> {
//! let model = ModelBuilder::new("Qwen/Qwen3-4B")
//!     .with_auto_isq(IsqBits::Four) // In-situ quantization (auto-selects best type)
//!     .with_logging() // Enable logging
//!     .with_paged_attn(PagedAttentionMetaBuilder::default().build()?)
//!     .build()
//!     .await?;
//! # Ok(())
//! # }
//! ```
//!
//! Use [`ModelBuilder::with_auto_isq`] for automatic platform-optimal quantization (e.g., `with_auto_isq(IsqBits::Four)`),
//! or [`ModelBuilder::with_isq`] with a specific [`IsqType`]: `Q4_0`, `Q4_1`, `Q4K`, `Q5_0`, `Q5_1`, `Q5K`,
//! `Q6K`, `Q8_0`, `Q8_1`, `HQQ4`, `HQQ8`, and more.
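//!
//! To pin a specific format instead of the platform default, pass the [`IsqType`]
//! explicitly. A minimal sketch using the `Q4K` variant from the list above:
//!
//! ```no_run
//! # use mistralrs::*;
//! # async fn example() -> error::Result<()> {
//! // Explicit 4-bit K-quantization, rather than letting with_auto_isq choose.
//! let model = ModelBuilder::new("Qwen/Qwen3-4B")
//!     .with_isq(IsqType::Q4K)
//!     .build()
//!     .await?;
//! # Ok(())
//! # }
//! ```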
//!
//! ## Choosing a Request Type
//!
//! | Type | Use When | Sampling |
//! |---|---|---|
//! | [`TextMessages`] | Simple text-only chat, no special settings needed | Deterministic |
//! | [`MultimodalMessages`] | Your prompt includes images or audio | Deterministic |
//! | [`RequestBuilder`] | You need tools, logprobs, custom sampling, constraints, adapters, or web search | Configurable |
//!
//! `TextMessages` and `MultimodalMessages` can be converted into a [`RequestBuilder`] via
//! `Into<RequestBuilder>` if you start simple and later need more control.
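//!
//! A minimal sketch of that upgrade path (it assumes [`Model::send_chat_request`]
//! accepts the builder; the exact sampling and tool setters live on [`RequestBuilder`]):
//!
//! ```no_run
//! # use mistralrs::*;
//! # async fn example(model: Model) -> error::Result<()> {
//! // Start with the simple message type...
//! let messages = TextMessages::new()
//!     .add_message(TextMessageRole::User, "Summarize the borrow checker.");
//!
//! // ...then convert once you need tools, logprobs, or custom sampling.
//! let request: RequestBuilder = messages.into();
//! // Configure `request` via RequestBuilder's setters here before sending.
//!
//! let _response = model.send_chat_request(request).await?;
//! # Ok(())
//! # }
//! ```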
//!
//! ## Streaming
//!
//! The stream returned by [`Model::stream_chat_request`] implements
//! [`futures::Stream`], so you can use `StreamExt` combinators:
//!
//! ```no_run
//! use futures::StreamExt;
//! use mistralrs::*;
//!
//! # async fn example(model: Model) -> error::Result<()> {
//! let messages = TextMessages::new()
//!     .add_message(TextMessageRole::User, "Tell me a joke.");
//!
//! let mut stream = model.stream_chat_request(messages).await?;
//! while let Some(chunk) = stream.next().await {
//!     if let Response::Chunk(c) = chunk {
//!         if let Some(text) = c.choices.first().and_then(|ch| ch.delta.content.as_ref()) {
//!             print!("{text}");
//!         }
//!     }
//! }
//! # Ok(())
//! # }
//! ```
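//!
//! Because the stream is a real [`futures::Stream`], the usual combinators compose as
//! well. For example, collecting the streamed deltas into a single `String` (a minimal
//! sketch reusing the chunk shape from the example above):
//!
//! ```no_run
//! use futures::StreamExt;
//! # use mistralrs::*;
//! # async fn example(model: Model) -> error::Result<()> {
//! let messages = TextMessages::new()
//!     .add_message(TextMessageRole::User, "Tell me a joke.");
//!
//! let full: String = model
//!     .stream_chat_request(messages)
//!     .await?
//!     .filter_map(|resp| async move {
//!         match resp {
//!             // Keep only the text deltas; ignore other response kinds.
//!             Response::Chunk(c) => c.choices.first().and_then(|ch| ch.delta.content.clone()),
//!             _ => None,
//!         }
//!     })
//!     .collect()
//!     .await;
//! println!("{full}");
//! # Ok(())
//! # }
//! ```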
//!
//! ## Structured Output
//!
//! Derive [`schemars::JsonSchema`] on your type and the model will be constrained to
//! produce valid JSON matching the schema:
//!
//! ```no_run
//! use mistralrs::*;
//! use schemars::JsonSchema;
//! use serde::Deserialize;
//!
//! #[derive(Deserialize, JsonSchema)]
//! struct City {
//!     name: String,
//!     country: String,
//!     population: u64,
//! }
//!
//! # async fn example(model: Model) -> error::Result<()> {
//! let messages = TextMessages::new()
//!     .add_message(TextMessageRole::User, "Give me info about Paris.");
//!
//! let city: City = model.generate_structured(messages).await?;
//! println!("{}: pop. {}", city.name, city.population);
//! # Ok(())
//! # }
//! ```
//!
//! ## Blocking API
//!
//! For non-async applications, use [`blocking::BlockingModel`]:
//!
//! ```no_run
//! use mistralrs::blocking::BlockingModel;
//! use mistralrs::{IsqBits, ModelBuilder};
//!
//! fn main() -> mistralrs::error::Result<()> {
//!     let model = BlockingModel::from_auto_builder(
//!         ModelBuilder::new("Qwen/Qwen3-4B").with_auto_isq(IsqBits::Four),
//!     )?;
//!     let answer = model.chat("What is 2+2?")?;
//!     println!("{answer}");
//!     Ok(())
//! }
//! ```
//!
//! ## Error Handling
//!
//! All public methods return [`error::Result<T>`](error::Result) with a structured
//! [`error::Error`] enum. Variants include [`ModelLoad`](error::Error::ModelLoad),
//! [`Inference`](error::Error::Inference), [`RequestValidation`](error::Error::RequestValidation),
//! and more. The error type implements `std::error::Error`, so it works seamlessly with
//! `anyhow` and `eyre`.
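//!
//! Since [`error::Error`] implements `std::error::Error`, the `?` operator converts it
//! automatically in an `anyhow` context. A minimal sketch (it assumes `anyhow` and
//! `tokio` are dependencies of your crate):
//!
//! ```ignore
//! use mistralrs::{IsqBits, ModelBuilder};
//!
//! #[tokio::main]
//! async fn main() -> anyhow::Result<()> {
//!     // mistralrs::error::Error converts into anyhow::Error through `?`.
//!     let model = ModelBuilder::new("Qwen/Qwen3-4B")
//!         .with_auto_isq(IsqBits::Four)
//!         .build()
//!         .await?;
//!     println!("{}", model.chat("Ping?").await?);
//!     Ok(())
//! }
//! ```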
//!
//! ## MCP (Model Context Protocol)
//!
//! ```no_run
//! # use mistralrs::*;
//! # async fn example() -> error::Result<()> {
//! let mcp_config = McpClientConfig {
//!     servers: vec![/* your server configs */],
//!     auto_register_tools: true,
//!     tool_timeout_secs: Some(30),
//!     max_concurrent_calls: Some(5),
//! };
//!
//! let model = ModelBuilder::new("path/to/model")
//!     .with_auto_isq(IsqBits::Eight)
//!     .with_mcp_client(mcp_config)
//!     .build()
//!     .await?;
//! # Ok(())
//! # }
//! ```
//!
//! ## Feature Flags
//!
//! | Flag | Effect |
//! |---|---|
//! | `cuda` | CUDA GPU support |
//! | `flash-attn` | Flash Attention 2 kernels (requires `cuda`) |
//! | `cudnn` | cuDNN acceleration (requires `cuda`) |
//! | `nccl` | Multi-GPU via NCCL (requires `cuda`) |
//! | `metal` | Apple Metal GPU support |
//! | `accelerate` | Apple Accelerate framework |
//! | `mkl` | Intel MKL acceleration |
//!
//! The default feature set (no flags) builds as pure Rust; no C compiler or system
//! libraries are required.
//!
//! ## Architecture
//!
//! ```text
//! ModelBuilder / TextModelBuilder / MultimodalModelBuilder / GgufModelBuilder / ...
//!       │
//!       ▼
//!     Model ──── send_chat_request() ──► Engine ──► Pipeline ──► Output
//!       │                                   │
//!       ├── chat()                Scheduler + PagedAttention
//!       ├── stream_chat_request()
//!       ├── generate_structured()
//!       └── send_*_with_model()   (multi-model dispatch)
//! ```
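//!
//! In code, the non-streaming path through that stack looks like the sketch below. It
//! assumes full responses expose an OpenAI-style `choices[..].message.content`, mirroring
//! the `delta.content` shape used for streaming chunks:
//!
//! ```no_run
//! # use mistralrs::*;
//! # async fn example(model: Model) -> error::Result<()> {
//! let messages = TextMessages::new()
//!     .add_message(TextMessageRole::User, "Explain lifetimes briefly.");
//!
//! // send_chat_request drives the Engine -> Pipeline path shown above.
//! let response = model.send_chat_request(messages).await?;
//! // Assumption: full responses mirror the streaming `delta.content` shape.
//! if let Some(text) = response.choices.first().and_then(|c| c.message.content.as_ref()) {
//!     println!("{text}");
//! }
//! # Ok(())
//! # }
//! ```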
pub use resolve_isq;
pub use ;
pub use ;
pub use AnyMoeModelBuilder;
pub use ModelBuilder;
pub use DiffusionModelBuilder;
pub use ;
pub use GgufModelBuilder;
pub use GgufLoraModelBuilder;
pub use GgufXLoraModelBuilder;
pub use LoraModelBuilder;
pub use ;
pub use ;
pub use ;
pub use ;
pub use ;
pub use ;
pub use TextSpeculativeBuilder;
pub use SpeechModelBuilder;
pub use ;
pub use XLoraModelBuilder;
pub use ;
pub use cross_entropy as cross_entropy_loss;
/// Low-level types and internals re-exported from `mistralrs_core`.
///
/// Most users don't need these types directly. They're available for advanced
/// use cases like custom pipelines, device mapping, or direct engine access.
// ========== Response Types ==========
pub use ;
// ========== Request Types ==========
pub use ;
// ========== Sampling ==========
pub use ;
// ========== Tool Types ==========
pub use ;
// ========== Config Types ==========
pub use ;
// ========== Audio Types ==========
pub use AudioInput;
// ========== Video Types ==========
pub use VideoInput;
// ========== Custom Logits ==========
pub use CustomLogitsProcessor;
// ========== Model Category ==========
pub use ModelCategory;
// ========== Search Types ==========
pub use ;
// ========== Speech Types ==========
pub use ;
// ========== AnyMoe Types ==========
pub use ;
// ========== Diffusion Types ==========
pub use ;
// ========== Speculative Types ==========
pub use SpeculativeConfig;
// ========== Device Mapping ==========
pub use ;
// ========== Topology ==========
pub use ;
// ========== Loader Types ==========
pub use ;
// ========== Token Source ==========
pub use TokenSource;
// ========== Engine (Advanced) ==========
pub use ;
// ========== Utilities ==========
pub use ;
// ========== llguidance ==========
pub use llguidance;
// Re-export the tool proc macro for ergonomic tool definition
pub use tool;
// Re-export schemars for use in tool definitions
pub use schemars;