qwen3_vl/engine.rs
1//! The [`Engine`] and [`EngineOptions`] types.
2
3use std::{
4 path::{Path, PathBuf},
5 sync::Arc,
6 time::Duration,
7};
8
9use mistralrs::{
10 Constraint, IsqType, MultimodalMessages, MultimodalModelBuilder, RequestBuilder, TextMessageRole,
11};
12use tracing::{debug, info, instrument};
13
14use crate::error::{Error, LoadError};
15use llmtask::Task;
16
/// Upper bound applied to every inference call unless overridden
/// (issue #1 H-001).
///
/// Five minutes leaves room for cold-cache Metal JIT specialization on
/// real keyframes and for pathological prompts, while still making a
/// stuck model (kernel deadlock, GPU OOM) surface as a timeout instead
/// of blocking the caller forever. Override per engine via
/// [`EngineOptions::with_inference_timeout`].
pub const DEFAULT_INFERENCE_TIMEOUT: Duration = Duration::from_secs(5 * 60);
24
25/// Configuration for [`Engine::load`].
26#[derive(Debug, Clone)]
27pub struct EngineOptions {
28 model_path: PathBuf,
29 quantization: IsqType,
30 max_tokens: usize,
31 request: RequestOptions,
32 inference_timeout: Duration,
33}
34
35impl EngineOptions {
36 /// Construct with the given model path, default quantization
37 /// (`IsqType::Q4K`), default `max_tokens` (`1024`), and an
38 /// indexing-safe default sampler profile
39 /// ([`RequestOptions::deterministic`]).
40 ///
41 /// The default request is deterministic because this crate's primary
42 /// use case is producing structured output that gets persisted to a
43 /// search index (see [`crate::image_analysis::ImageAnalysisTask`]).
44 /// Stochastic sampling means the same keyframes reprocessed after
45 /// a timeout, retry, or backfill can produce different
46 /// `ImageAnalysis` values,
47 /// silently drifting the index. Greedy decoding closes that hole at
48 /// the cost of diverging from the Qwen3-VL Instruct model card's
49 /// recommended sampler — see [`RequestOptions::deterministic`] for
50 /// the full trade-off.
51 ///
52 /// For one-shot, quality-prioritised use where reproducibility
53 /// doesn't matter, swap the engine default for the model-card
54 /// stochastic profile via
55 /// `EngineOptions::new(path).with_request(RequestOptions::new())`,
56 /// or override per-call via [`Engine::run_with`].
57 pub fn new(model_path: impl Into<PathBuf>) -> Self {
58 Self {
59 model_path: model_path.into(),
60 quantization: IsqType::Q4K,
61 // Issue #1 M-003: bumped from 512 → 1024. The 512 default
62 // truncated complex scenes mid-JSON (many subjects/objects/
63 // actions), surfacing as
64 // ParseError::Json(EOF while parsing a string). 1024 covers
65 // the long tail observed empirically without inflating
66 // worst-case latency materially under greedy decoding.
67 max_tokens: 1024,
68 request: RequestOptions::deterministic(),
69 inference_timeout: DEFAULT_INFERENCE_TIMEOUT,
70 }
71 }
72
73 /// Returns the configured model path.
74 #[cfg_attr(not(tarpaulin), inline(always))]
75 pub fn model_path(&self) -> &Path {
76 &self.model_path
77 }
78
79 /// Builder-style setter for `model_path`.
80 #[cfg_attr(not(tarpaulin), inline(always))]
81 pub fn with_model_path(mut self, val: impl Into<PathBuf>) -> Self {
82 self.model_path = val.into();
83 self
84 }
85
86 /// In-place setter for `model_path`.
87 #[cfg_attr(not(tarpaulin), inline(always))]
88 pub fn set_model_path(&mut self, val: impl Into<PathBuf>) -> &mut Self {
89 self.model_path = val.into();
90 self
91 }
92
93 /// Returns the configured quantization (default `IsqType::Q4K`).
94 #[cfg_attr(not(tarpaulin), inline(always))]
95 pub const fn quantization(&self) -> IsqType {
96 self.quantization
97 }
98
99 /// Builder-style setter for `quantization`.
100 #[cfg_attr(not(tarpaulin), inline(always))]
101 pub const fn with_quantization(mut self, val: IsqType) -> Self {
102 self.quantization = val;
103 self
104 }
105
106 /// In-place setter for `quantization`.
107 #[cfg_attr(not(tarpaulin), inline(always))]
108 pub const fn set_quantization(&mut self, val: IsqType) -> &mut Self {
109 self.quantization = val;
110 self
111 }
112
113 /// Returns the configured `max_tokens` ceiling (default `1024`).
114 #[cfg_attr(not(tarpaulin), inline(always))]
115 pub const fn max_tokens(&self) -> usize {
116 self.max_tokens
117 }
118
119 /// Builder-style setter for `max_tokens`. Any value is accepted at
120 /// the type level (no setter-side validation); a value of `0` is
121 /// clamped up to `1` at request time inside [`Engine::run_with`]
122 /// before being passed to mistralrs's `set_sampler_max_len`, so a
123 /// zero here means "let the model emit at least one token", not
124 /// "skip generation entirely".
125 #[cfg_attr(not(tarpaulin), inline(always))]
126 pub const fn with_max_tokens(mut self, val: usize) -> Self {
127 self.max_tokens = val;
128 self
129 }
130
131 /// In-place setter for `max_tokens`. See [`Self::with_max_tokens`]
132 /// for the runtime `0 → 1` clamp note.
133 #[cfg_attr(not(tarpaulin), inline(always))]
134 pub const fn set_max_tokens(&mut self, val: usize) -> &mut Self {
135 self.max_tokens = val;
136 self
137 }
138
139 /// Returns the engine-level default [`RequestOptions`]. This is the
140 /// sampler profile used by [`Engine::run`]; per-call overrides go
141 /// through [`Engine::run_with`].
142 ///
143 /// Default ([`EngineOptions::new`]): [`RequestOptions::deterministic`]
144 /// — see that constructor for the indexing-vs-quality trade-off.
145 #[cfg_attr(not(tarpaulin), inline(always))]
146 pub const fn request(&self) -> &RequestOptions {
147 &self.request
148 }
149
150 /// Builder-style setter for `request`. Replaces the engine-level
151 /// default sampler profile wholesale.
152 #[cfg_attr(not(tarpaulin), inline(always))]
153 pub fn with_request(mut self, val: RequestOptions) -> Self {
154 self.request = val;
155 self
156 }
157
158 /// In-place setter for `request`. Replaces the engine-level default
159 /// sampler profile wholesale.
160 #[cfg_attr(not(tarpaulin), inline(always))]
161 pub fn set_request(&mut self, val: RequestOptions) -> &mut Self {
162 self.request = val;
163 self
164 }
165
166 /// Returns the per-call inference timeout (default
167 /// [`DEFAULT_INFERENCE_TIMEOUT`] = 5 min). Issue #1 H-001.
168 #[cfg_attr(not(tarpaulin), inline(always))]
169 pub const fn inference_timeout(&self) -> Duration {
170 self.inference_timeout
171 }
172
173 /// Builder-style setter for `inference_timeout`.
174 #[cfg_attr(not(tarpaulin), inline(always))]
175 pub const fn with_inference_timeout(mut self, val: Duration) -> Self {
176 self.inference_timeout = val;
177 self
178 }
179
180 /// In-place setter for `inference_timeout`.
181 #[cfg_attr(not(tarpaulin), inline(always))]
182 pub const fn set_inference_timeout(&mut self, val: Duration) -> &mut Self {
183 self.inference_timeout = val;
184 self
185 }
186}
187
/// Sampler configuration applied per call by [`Engine::run`] /
/// [`Engine::run_with`].
///
/// Two named presets ship out of the box:
///
/// - [`RequestOptions::new`] / [`RequestOptions::default`] — the
///   Qwen3-VL Instruct (non-thinking) model card sampler
///   (`temperature 0.7`, `top_p 0.8`, `top_k 20`,
///   `presence_penalty 1.5`). Best output quality; the same input
///   can produce different outputs across runs.
/// - [`RequestOptions::deterministic`] — greedy decoding
///   (`temperature 0.0`, `top_p 1.0`, `top_k 1`) with
///   `presence_penalty 1.5` retained (greedy without it falls into
///   token loops). Bit-stable output for identical inputs; the
///   right choice for indexing pipelines that must avoid silent
///   drift on retries / backfills. This is the profile
///   [`EngineOptions::new`] embeds by default.
///
/// `Engine::run_with` applies all four fields to the underlying
/// mistralrs `RequestBuilder` uniformly — there is no separate
/// deterministic branch; the preset itself encodes the choice.
///
/// `repetition_penalty` and a sampler seed are intentionally absent —
/// mistralrs 0.8 has no `set_sampler_repetition_penalty` and no
/// `set_sampler_seed`. Do NOT substitute
/// `set_sampler_frequency_penalty(1.0)` for the missing
/// `repetition_penalty`: the math is different (additive vs
/// multiplicative).
///
/// Implements `PartialEq` so whole profiles can be compared directly
/// (e.g. checking a configured profile against a preset) instead of
/// field by field. Comparison is plain field-wise float equality, so a
/// profile holding `NaN` never equals anything, itself included.
#[derive(Debug, Clone, PartialEq)]
pub struct RequestOptions {
    /// Sampling temperature; `0.0` in the greedy preset.
    temperature: f64,
    /// Nucleus (`top_p`) threshold; `1.0` in the greedy preset.
    top_p: f64,
    /// Top-k cutoff; `1` in the greedy preset.
    top_k: usize,
    /// Presence penalty; both shipped presets use `1.5`.
    presence_penalty: f32,
}
223
224impl RequestOptions {
225 /// Construct with the Qwen3-VL Instruct (non-thinking) model card
226 /// defaults: `temperature 0.7`, `top_p 0.8`, `top_k 20`,
227 /// `presence_penalty 1.5`. Best output quality; not bit-stable
228 /// across runs. Pair with [`EngineOptions::with_request`] (or
229 /// [`Engine::run_with`]) when reproducibility doesn't matter and
230 /// quality does.
231 #[cfg_attr(not(tarpaulin), inline(always))]
232 pub const fn new() -> Self {
233 Self {
234 temperature: 0.7,
235 top_p: 0.8,
236 top_k: 20,
237 presence_penalty: 1.5,
238 }
239 }
240
241 /// Indexing-safe greedy decoding: `temperature 0.0`, `top_p 1.0`,
242 /// `top_k 1`, `presence_penalty 1.5`. Output is bit-stable for
243 /// identical inputs — the right choice for pipelines that persist
244 /// VLM output to a search index, where retries / timeouts /
245 /// backfills must not silently drift the index. This is the preset
246 /// [`EngineOptions::new`] embeds by default.
247 ///
248 /// Greedy decoding is the only deterministic mode mistralrs 0.8
249 /// supports (no `set_sampler_seed`).
250 ///
251 /// **The retained `presence_penalty 1.5` is a documented
252 /// trade-off:**
253 ///
254 /// - In mistralrs 0.8 (re-verify when upgrading; future patches
255 /// could change this and this preset's assumption would no
256 /// longer hold), `presence_penalty` is applied over the full
257 /// `seq.get_toks()` (prompt tokens plus generated tokens,
258 /// verified in
259 /// `mistralrs-core/src/sampler.rs::apply_freq_pres_rep_penalty`
260 /// and `mistralrs-core/src/pipeline/sampling.rs`). With
261 /// `temperature 0` there is no sampling spread, so every token
262 /// appearing in the task prompt gets a flat `-1.5` logit shift
263 /// even before the model emits anything. To minimize the
264 /// collateral on legitimate value tokens,
265 /// [`crate::image_analysis::ImageAnalysisTask`]'s
266 /// `IMAGE_ANALYSIS_PROMPT` is intentionally written without enumerated
267 /// value examples (H1) — format guidance is descriptive
268 /// (word counts, lowercase) rather than enumerative. Residual
269 /// bias only hits scaffolding/instruction tokens (e.g. "scene",
270 /// "describing", "lowercase"), which the model is unlikely to
271 /// want to emit as values, and the JSON schema constraint
272 /// preserves field names and structure regardless.
273 /// - Removing the penalty was tested and broke generation: greedy
274 /// without any repetition control falls into token loops that
275 /// exhaust `max_tokens` mid-string and surface as
276 /// `Error::Parse(Json(EOF while parsing a string))`. mistralrs
277 /// 0.8 has no generated-only repetition mechanism
278 /// (`frequency_penalty` and `repetition_penalty` also operate
279 /// over `seq.get_toks()`), so biased-but-parseable beats
280 /// unbiased-but-broken.
281 ///
282 /// Callers that genuinely want greedy with no repetition control
283 /// can chain `.with_presence_penalty(0.0)` and accept the
284 /// repetition-loop hazard themselves.
285 ///
286 /// Custom `Task` implementations should follow the same prompt
287 /// hygiene: avoid enumerating expected value tokens in the prompt,
288 /// because they will be penalized in this preset.
289 #[cfg_attr(not(tarpaulin), inline(always))]
290 pub const fn deterministic() -> Self {
291 Self {
292 temperature: 0.0,
293 top_p: 1.0,
294 top_k: 1,
295 presence_penalty: 1.5,
296 }
297 }
298
299 /// Returns the configured sampling temperature.
300 #[cfg_attr(not(tarpaulin), inline(always))]
301 pub const fn temperature(&self) -> f64 {
302 self.temperature
303 }
304
305 /// Builder-style setter for `temperature`.
306 #[cfg_attr(not(tarpaulin), inline(always))]
307 pub const fn with_temperature(mut self, val: f64) -> Self {
308 self.temperature = val;
309 self
310 }
311
312 /// In-place setter for `temperature`.
313 #[cfg_attr(not(tarpaulin), inline(always))]
314 pub const fn set_temperature(&mut self, val: f64) -> &mut Self {
315 self.temperature = val;
316 self
317 }
318
319 /// Returns the configured `top_p`. Note: mistralrs 0.8's builder
320 /// method is `set_sampler_topp` (no underscore between `top` and
321 /// `p`).
322 #[cfg_attr(not(tarpaulin), inline(always))]
323 pub const fn top_p(&self) -> f64 {
324 self.top_p
325 }
326
327 /// Builder-style setter for `top_p`.
328 #[cfg_attr(not(tarpaulin), inline(always))]
329 pub const fn with_top_p(mut self, val: f64) -> Self {
330 self.top_p = val;
331 self
332 }
333
334 /// In-place setter for `top_p`.
335 #[cfg_attr(not(tarpaulin), inline(always))]
336 pub const fn set_top_p(&mut self, val: f64) -> &mut Self {
337 self.top_p = val;
338 self
339 }
340
341 /// Returns the configured `top_k`.
342 #[cfg_attr(not(tarpaulin), inline(always))]
343 pub const fn top_k(&self) -> usize {
344 self.top_k
345 }
346
347 /// Builder-style setter for `top_k`.
348 #[cfg_attr(not(tarpaulin), inline(always))]
349 pub const fn with_top_k(mut self, val: usize) -> Self {
350 self.top_k = val;
351 self
352 }
353
354 /// In-place setter for `top_k`.
355 #[cfg_attr(not(tarpaulin), inline(always))]
356 pub const fn set_top_k(&mut self, val: usize) -> &mut Self {
357 self.top_k = val;
358 self
359 }
360
361 /// Returns the configured `presence_penalty`. With the
362 /// [`RequestOptions::deterministic`] preset this is the only
363 /// repetition control mistralrs 0.8 supports — greedy without it
364 /// falls into token loops; see that constructor for the trade-off.
365 #[cfg_attr(not(tarpaulin), inline(always))]
366 pub const fn presence_penalty(&self) -> f32 {
367 self.presence_penalty
368 }
369
370 /// Builder-style setter for `presence_penalty`.
371 #[cfg_attr(not(tarpaulin), inline(always))]
372 pub const fn with_presence_penalty(mut self, val: f32) -> Self {
373 self.presence_penalty = val;
374 self
375 }
376
377 /// In-place setter for `presence_penalty`.
378 #[cfg_attr(not(tarpaulin), inline(always))]
379 pub const fn set_presence_penalty(&mut self, val: f32) -> &mut Self {
380 self.presence_penalty = val;
381 self
382 }
383
384 /// Validate sampler parameters before they reach mistralrs (issue
385 /// #1 H-002).
386 ///
387 /// - `temperature` must be finite and ≥ 0 (negative values invert
388 /// the softmax sign and produce nonsensical distributions).
389 /// - `top_p` must be finite and in `(0, 1]` (0 selects nothing;
390 /// > 1 is meaningless; NaN poisons the sampler).
391 /// - `top_k` must be ≥ 1 (0 selects nothing).
392 /// - `presence_penalty` must be finite (NaN/Inf would poison the
393 /// logit shift).
394 ///
395 /// Called automatically by [`Engine::run_with`]; callers can
396 /// invoke it themselves to fail fast on a bad preset.
397 pub const fn validate(&self) -> Result<(), Error> {
398 if !self.temperature.is_finite() || self.temperature < 0.0 {
399 return Err(Error::InvalidRequest(
400 "temperature must be finite and >= 0.0",
401 ));
402 }
403 if !self.top_p.is_finite() || self.top_p <= 0.0 || self.top_p > 1.0 {
404 return Err(Error::InvalidRequest(
405 "top_p must be finite and in (0.0, 1.0]",
406 ));
407 }
408 if self.top_k == 0 {
409 return Err(Error::InvalidRequest("top_k must be >= 1"));
410 }
411 if !self.presence_penalty.is_finite() {
412 return Err(Error::InvalidRequest("presence_penalty must be finite"));
413 }
414 Ok(())
415 }
416}
417
418impl Default for RequestOptions {
419 fn default() -> Self {
420 Self::new()
421 }
422}
423
424/// A Qwen3-VL structured-output inference engine.
425///
426/// Construct via [`Engine::load`]. `Engine` is `Send + Sync + Clone` —
427/// `mistralrs::Model` is `Arc<MistralRs>` internally, so cloning is cheap.
428/// Concurrent `run()` calls from multiple tasks are safe and are
429/// continuous-batched by mistralrs's scheduler (not parallel decode).
430#[derive(Clone)]
431pub struct Engine {
432 model: Arc<mistralrs::Model>,
433 options: EngineOptions,
434}
435
436impl Engine {
437 /// Load the Qwen3-VL model at `opts.model_path()` with the given
438 /// quantization. Blocks for ~13s on Apple Silicon Metal at first call.
439 /// Holds GPU memory until the last clone is dropped.
440 #[instrument(name = "qwen3_vl::load", skip(opts), fields(model_path = %opts.model_path().display(), quantization = ?opts.quantization()))]
441 pub async fn load(opts: EngineOptions) -> Result<Self, LoadError> {
442 if !opts.model_path().exists() {
443 return Err(LoadError::NotFound(opts.model_path().to_path_buf()));
444 }
445 let started = std::time::Instant::now();
446 info!("loading Qwen3-VL model");
447 let model_id = opts.model_path().to_string_lossy().into_owned();
448 let model = MultimodalModelBuilder::new(model_id)
449 .with_isq(opts.quantization())
450 .build()
451 .await
452 .map_err(|e| LoadError::Build(e.to_string()))?;
453 info!(
454 elapsed_ms = started.elapsed().as_millis() as u64,
455 "model loaded"
456 );
457 Ok(Self {
458 model: Arc::new(model),
459 options: opts,
460 })
461 }
462
463 /// Returns the local model directory the engine was loaded from.
464 #[cfg_attr(not(tarpaulin), inline(always))]
465 pub fn model_path(&self) -> &Path {
466 self.options.model_path()
467 }
468
469 /// Returns the quantization the engine was loaded with.
470 #[cfg_attr(not(tarpaulin), inline(always))]
471 pub const fn quantization(&self) -> IsqType {
472 self.options.quantization()
473 }
474
475 /// Returns the configured `max_tokens` ceiling for [`Engine::run`].
476 #[cfg_attr(not(tarpaulin), inline(always))]
477 pub const fn max_tokens(&self) -> usize {
478 self.options.max_tokens()
479 }
480
481 /// Returns the engine-level default sampler profile. See
482 /// [`EngineOptions::request`].
483 #[cfg_attr(not(tarpaulin), inline(always))]
484 pub const fn request(&self) -> &RequestOptions {
485 self.options.request()
486 }
487
488 /// Returns the per-call inference timeout. Issue #1 H-001.
489 #[cfg_attr(not(tarpaulin), inline(always))]
490 pub const fn inference_timeout(&self) -> Duration {
491 self.options.inference_timeout()
492 }
493
494 /// Optional pre-warm: runs one tiny inference against a 1×1 black image
495 /// to JIT-compile Metal kernels before serving real requests. Logs
496 /// duration at `debug`. Errors are propagated to the caller — typically
497 /// you ignore them in production (warmup is best-effort).
498 ///
499 /// **Caveat:** Metal's kernel JIT specializes per tensor shape, so
500 /// the kernels compiled for a 1×1 image are not guaranteed to match
501 /// the kernels needed for production-sized keyframes (e.g.,
502 /// 720×1280). For shape-matched warmup, use
503 /// [`Self::warmup_with_image`] (issue #1 M-002).
504 #[instrument(name = "qwen3_vl::warmup", skip(self))]
505 pub async fn warmup(&self) -> Result<(), Error> {
506 use image::{DynamicImage, RgbImage};
507 let blank = DynamicImage::ImageRgb8(RgbImage::new(1, 1));
508 self.warmup_with_image(blank).await
509 }
510
511 /// Pre-warm with a caller-supplied image (issue #1 M-002). Use a
512 /// representative production-sized keyframe (e.g., 720×1280 black
513 /// frame, or a real fixture) so Metal's per-shape kernel JIT
514 /// specializes for the shapes the production path will hit. The
515 /// 1×1 [`Self::warmup`] only exercises the load → encode → decode
516 /// pipeline structurally; first real-keyframe inference can still
517 /// incur JIT cost without this.
518 #[instrument(name = "qwen3_vl::warmup_with_image", skip(self, image))]
519 pub async fn warmup_with_image(&self, image: image::DynamicImage) -> Result<(), Error> {
520 let started = std::time::Instant::now();
521 let messages = MultimodalMessages::new().add_image_message(
522 TextMessageRole::User,
523 "Reply with: ok",
524 vec![image],
525 );
526 let request = RequestBuilder::from(messages)
527 .set_sampler_max_len(4)
528 .enable_thinking(false);
529 // Same timeout as run_with: a stuck warmup shouldn't block
530 // worker startup forever.
531 let timeout = self.options.inference_timeout();
532 let _ = tokio::time::timeout(timeout, self.model.send_chat_request(request))
533 .await
534 .map_err(|_| Error::InferenceTimeout(timeout))?
535 .map_err(|e| Error::Inference(e.to_string()))?;
536 debug!(
537 elapsed_ms = started.elapsed().as_millis() as u64,
538 "warmup complete"
539 );
540 Ok(())
541 }
542
543 /// Single-turn, multi-image structured run with the engine-level
544 /// default sampler ([`EngineOptions::request`]). Equivalent to
545 /// [`Self::run_with`] called with that profile.
546 ///
547 /// Consumes `images` because mistralrs's
548 /// `MultimodalMessages::add_image_message` takes `Vec<DynamicImage>`
549 /// by value — borrowing here would force a silent `.to_vec()` clone
550 /// of decoded image data. Returns `Error::NoImages` for an empty
551 /// input.
552 ///
553 /// Dropping the returned future is a fast wakeup, not GPU
554 /// cancellation: mistralrs's engine loop completes the in-flight
555 /// scheduler step in the background; the response is silently
556 /// discarded on send. Wrap in `tokio::time::timeout(..)` for a
557 /// deadline.
558 #[cfg_attr(not(tarpaulin), inline(always))]
559 pub async fn run<T: Task>(
560 &self,
561 task: &T,
562 images: Vec<image::DynamicImage>,
563 ) -> Result<T::Output, Error>
564 where
565 T::ParseError: Send + Sync + 'static,
566 {
567 self.run_with(task, images, self.options.request()).await
568 }
569
570 /// Same as [`Self::run`] but with a caller-supplied
571 /// [`RequestOptions`] that replaces the engine-level default for
572 /// this call. Use this when a specific call needs a sampler profile
573 /// other than [`EngineOptions::request`].
574 ///
575 /// All four fields from `opts` are applied uniformly to the
576 /// underlying mistralrs sampler — there is no separate deterministic
577 /// branch; the preset itself encodes the choice between greedy
578 /// ([`RequestOptions::deterministic`]) and stochastic
579 /// ([`RequestOptions::new`] / `default`).
580 #[instrument(
581 name = "qwen3_vl::run_with",
582 skip(self, task, images, opts),
583 fields(
584 task_kind = std::any::type_name::<T>(),
585 image_count = images.len(),
586 max_tokens = self.options.max_tokens(),
587 temperature = opts.temperature(),
588 ),
589 )]
590 pub async fn run_with<T: Task>(
591 &self,
592 task: &T,
593 images: Vec<image::DynamicImage>,
594 opts: &RequestOptions,
595 ) -> Result<T::Output, Error>
596 where
597 // bound at the call site only.
598 // `Send + Sync + 'static` lets us box the parse error into
599 // `Error::Parse(Box<dyn Error + Send + Sync + 'static>)`,
600 // which works for any Task — including ones whose only
601 // purpose is to receive `UnsupportedGrammar` for routing.
602 T::ParseError: Send + Sync + 'static,
603 {
604 if images.is_empty() {
605 return Err(Error::NoImages);
606 }
607 // Issue #1 H-002: validate sampler parameters before mistralrs
608 // sees them. Negative temperature, top_p > 1.0, top_k = 0, or
609 // non-finite presence_penalty all produce undefined behavior in
610 // mistralrs's sampler.
611 opts.validate()?;
612
613 // Pull the task's grammar and route to mistralrs's
614 // Constraint::JsonSchema. mistralrs 0.8 only accepts JSON
615 // Schema; non-JSON variants (Lark, Regex) are rejected via
616 // UnsupportedGrammar so callers can route to an
617 // llguidance-backed engine instead (e.g., the `lfm` crate).
618 let grammar = task.grammar();
619 let schema = grammar
620 .as_json_schema()
621 .ok_or_else(|| {
622 Error::UnsupportedGrammar(llmtask::UnsupportedGrammar::new(
623 grammar.kind(),
624 "json_schema",
625 ))
626 })?
627 .clone();
628
629 let messages =
630 MultimodalMessages::new().add_image_message(TextMessageRole::User, task.prompt(), images);
631
632 let request = RequestBuilder::from(messages)
633 .set_sampler_max_len(self.options.max_tokens().max(1))
634 .enable_thinking(false)
635 .set_constraint(Constraint::JsonSchema(schema))
636 .set_sampler_temperature(opts.temperature())
637 .set_sampler_topp(opts.top_p())
638 .set_sampler_topk(opts.top_k())
639 .set_sampler_presence_penalty(opts.presence_penalty());
640
641 let started = std::time::Instant::now();
642 // Issue #1 H-001: bound inference duration. A stuck model
643 // (Metal JIT stall, GPU OOM, scheduler deadlock) would
644 // otherwise block the caller indefinitely. Drop on timeout —
645 // mistralrs will silently complete the in-flight scheduler
646 // step in the background and discard the response.
647 let timeout = self.options.inference_timeout();
648 let response = tokio::time::timeout(timeout, self.model.send_chat_request(request))
649 .await
650 .map_err(|_| Error::InferenceTimeout(timeout))?
651 .map_err(|e| Error::Inference(e.to_string()))?;
652 debug!(
653 elapsed_ms = started.elapsed().as_millis() as u64,
654 "inference complete"
655 );
656
657 let choice = response.choices.first().ok_or(Error::Empty)?;
658 // finding: reject length-truncated generations
659 // before parsing. mistralrs `Display for StopReason` maps Eos
660 // → "stop" and `Length`/`ModelLength` → "length"; "stop" is
661 // the only outcome where the constrained decoder produced a
662 // full natural completion. Anything else (length, error, etc.)
663 // means the JSON could be syntactically valid but semantically
664 // incomplete — persisting it to a search index would silently
665 // truncate metadata.
666 if choice.finish_reason != "stop" {
667 let raw_len = choice
668 .message
669 .content
670 .as_ref()
671 .map(|s| s.len())
672 .unwrap_or(0);
673 return Err(Error::Truncated {
674 finish_reason: choice.finish_reason.clone(),
675 raw_len,
676 });
677 }
678 let text = choice
679 .message
680 .content
681 .clone()
682 .filter(|s| !s.trim().is_empty())
683 .ok_or(Error::Empty)?;
684
685 #[cfg(feature = "trace-output")]
686 tracing::trace!(raw = %text, "model output");
687
688 task.parse(&text).map_err(|e| Error::Parse(Box::new(e)))
689 }
690}
691
#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn engine_options_defaults_to_deterministic_request() {
        // EngineOptions::new embeds RequestOptions::deterministic() as the
        // engine-level default sampler. This test guards against silent
        // reversion: if someone later flips the default back to the
        // stochastic model-card profile, every caller that uses the obvious
        // ::new() constructor would silently start drifting their search
        // index on retries/backfills. To get the model-card stochastic
        // sampler, callers must opt in explicitly via
        // .with_request(RequestOptions::new()).
        let opts = EngineOptions::new("/tmp/model");
        assert_eq!(opts.model_path(), Path::new("/tmp/model"));
        assert!(matches!(opts.quantization(), IsqType::Q4K));
        // Issue #1 M-003: default raised from 512 to 1024.
        assert_eq!(opts.max_tokens(), 1024);
        let req = opts.request();
        assert_eq!(req.temperature(), 0.0);
        assert_eq!(req.top_p(), 1.0);
        assert_eq!(req.top_k(), 1);
        assert_eq!(
            req.presence_penalty(),
            1.5,
            "deterministic preset must keep presence_penalty 1.5 — greedy \
             without it falls into token loops"
        );
    }

    #[test]
    fn engine_options_with_chains() {
        // Every value differs from the corresponding default so that a
        // setter silently doing nothing would fail the assertions.
        let opts = EngineOptions::new("/tmp/a")
            .with_model_path("/tmp/b")
            .with_quantization(IsqType::Q8_0)
            .with_max_tokens(2048)
            .with_request(RequestOptions::new())
            .with_inference_timeout(Duration::from_secs(10));
        assert_eq!(opts.model_path(), Path::new("/tmp/b"));
        assert!(matches!(opts.quantization(), IsqType::Q8_0));
        assert_eq!(opts.max_tokens(), 2048);
        // Swapping in RequestOptions::new() flips the engine to the
        // model-card stochastic profile (temperature 0.7).
        assert_eq!(opts.request().temperature(), 0.7);
        assert_eq!(opts.inference_timeout(), Duration::from_secs(10));
    }

    #[test]
    fn engine_options_set_chains() {
        // Same non-default values as the by-value chain above.
        let mut opts = EngineOptions::new("/tmp/a");
        opts.set_model_path("/tmp/b")
            .set_quantization(IsqType::Q8_0)
            .set_max_tokens(2048)
            .set_request(RequestOptions::new());
        assert_eq!(opts.model_path(), Path::new("/tmp/b"));
        assert!(matches!(opts.quantization(), IsqType::Q8_0));
        assert_eq!(opts.max_tokens(), 2048);
        assert_eq!(opts.request().temperature(), 0.7);
    }

    #[test]
    fn request_options_defaults_match_model_card() {
        // Hard-coded against the Qwen3-VL Instruct model card values to
        // catch silent drift if the defaults are ever edited without a
        // CHANGELOG note. See indexer/models/qwen3-vl-2b/README.md.
        let opts = RequestOptions::new();
        assert_eq!(opts.temperature(), 0.7);
        assert_eq!(opts.top_p(), 0.8);
        assert_eq!(opts.top_k(), 20);
        assert_eq!(opts.presence_penalty(), 1.5);
    }

    #[test]
    fn request_options_default_eq_new() {
        let new_opts = RequestOptions::new();
        let default_opts = RequestOptions::default();
        assert_eq!(new_opts.temperature(), default_opts.temperature());
        assert_eq!(new_opts.top_p(), default_opts.top_p());
        assert_eq!(new_opts.top_k(), default_opts.top_k());
        assert_eq!(new_opts.presence_penalty(), default_opts.presence_penalty());
    }

    #[test]
    fn request_options_with_chains() {
        let opts = RequestOptions::new()
            .with_temperature(0.3)
            .with_top_p(0.95)
            .with_top_k(50)
            .with_presence_penalty(0.0);
        assert_eq!(opts.temperature(), 0.3);
        assert_eq!(opts.top_p(), 0.95);
        assert_eq!(opts.top_k(), 50);
        assert_eq!(opts.presence_penalty(), 0.0);
    }

    #[test]
    fn request_options_set_chains() {
        let mut opts = RequestOptions::new();
        opts.set_temperature(0.3)
            .set_top_p(0.95)
            .set_top_k(50)
            .set_presence_penalty(0.0);
        assert_eq!(opts.temperature(), 0.3);
        assert_eq!(opts.top_p(), 0.95);
        assert_eq!(opts.top_k(), 50);
        assert_eq!(opts.presence_penalty(), 0.0);
    }

    #[test]
    fn request_options_deterministic_preset() {
        // Hard-coded greedy values: temperature=0 + top_k=1 forces argmax,
        // top_p=1 disables nucleus filtering. presence_penalty 1.5 is kept
        // (greedy without it falls into token loops). See
        // RequestOptions::deterministic doc for the trade-off.
        let opts = RequestOptions::deterministic();
        assert_eq!(opts.temperature(), 0.0);
        assert_eq!(opts.top_p(), 1.0);
        assert_eq!(opts.top_k(), 1);
        assert_eq!(opts.presence_penalty(), 1.5);
    }

    // ===== Issue #1 H-002: RequestOptions::validate =====

    #[test]
    fn request_options_validate_accepts_presets() {
        // Both shipped presets must validate.
        assert!(RequestOptions::new().validate().is_ok());
        assert!(RequestOptions::deterministic().validate().is_ok());
    }

    #[test]
    fn request_options_validate_rejects_negative_temperature() {
        let opts = RequestOptions::new().with_temperature(-0.1);
        assert!(matches!(opts.validate(), Err(Error::InvalidRequest(_))));
    }

    #[test]
    fn request_options_validate_rejects_non_finite_temperature() {
        assert!(matches!(
            RequestOptions::new().with_temperature(f64::NAN).validate(),
            Err(Error::InvalidRequest(_))
        ));
        assert!(matches!(
            RequestOptions::new()
                .with_temperature(f64::INFINITY)
                .validate(),
            Err(Error::InvalidRequest(_))
        ));
    }

    #[test]
    fn request_options_validate_rejects_top_p_out_of_range() {
        assert!(matches!(
            RequestOptions::new().with_top_p(0.0).validate(),
            Err(Error::InvalidRequest(_))
        ));
        assert!(matches!(
            RequestOptions::new().with_top_p(1.5).validate(),
            Err(Error::InvalidRequest(_))
        ));
        assert!(matches!(
            RequestOptions::new().with_top_p(-0.1).validate(),
            Err(Error::InvalidRequest(_))
        ));
        assert!(matches!(
            RequestOptions::new().with_top_p(f64::NAN).validate(),
            Err(Error::InvalidRequest(_))
        ));
    }

    #[test]
    fn request_options_validate_accepts_top_p_one() {
        // top_p = 1.0 disables nucleus filtering — used by the
        // deterministic preset. Must pass.
        assert!(RequestOptions::new().with_top_p(1.0).validate().is_ok());
    }

    #[test]
    fn request_options_validate_rejects_top_k_zero() {
        let opts = RequestOptions::new().with_top_k(0);
        assert!(matches!(opts.validate(), Err(Error::InvalidRequest(_))));
    }

    #[test]
    fn request_options_validate_rejects_non_finite_presence_penalty() {
        assert!(matches!(
            RequestOptions::new()
                .with_presence_penalty(f32::NAN)
                .validate(),
            Err(Error::InvalidRequest(_))
        ));
        assert!(matches!(
            RequestOptions::new()
                .with_presence_penalty(f32::INFINITY)
                .validate(),
            Err(Error::InvalidRequest(_))
        ));
    }

    #[test]
    fn request_options_validate_accepts_negative_presence_penalty() {
        // mistralrs allows negative presence_penalty (encourages
        // repetition). Validate only checks finiteness.
        assert!(
            RequestOptions::new()
                .with_presence_penalty(-1.0)
                .validate()
                .is_ok()
        );
    }

    // ===== Issue #1 H-001 + M-003 =====

    #[test]
    fn engine_options_default_inference_timeout() {
        let opts = EngineOptions::new("/nonexistent");
        assert_eq!(opts.inference_timeout(), DEFAULT_INFERENCE_TIMEOUT);
        assert_eq!(opts.inference_timeout(), Duration::from_secs(300));
    }

    #[test]
    fn engine_options_with_inference_timeout() {
        let opts =
            EngineOptions::new("/nonexistent").with_inference_timeout(Duration::from_secs(10));
        assert_eq!(opts.inference_timeout(), Duration::from_secs(10));
    }

    #[test]
    fn engine_options_set_inference_timeout() {
        // In-place counterpart of the builder-style setter above.
        let mut opts = EngineOptions::new("/nonexistent");
        opts.set_inference_timeout(Duration::from_secs(10));
        assert_eq!(opts.inference_timeout(), Duration::from_secs(10));
    }

    #[test]
    fn engine_options_default_max_tokens_bumped_to_1024() {
        // Issue #1 M-003: default raised from 512 to 1024 to avoid
        // mid-JSON truncation on complex scenes.
        let opts = EngineOptions::new("/nonexistent");
        assert_eq!(opts.max_tokens(), 1024);
    }
}