Skip to main content

qwen3_vl/
engine.rs

1//! The [`Engine`] and [`EngineOptions`] types.
2
3use std::{
4  path::{Path, PathBuf},
5  sync::Arc,
6  time::Duration,
7};
8
9use mistralrs::{
10  Constraint, IsqType, MultimodalMessages, MultimodalModelBuilder, RequestBuilder, TextMessageRole,
11};
12use tracing::{debug, info, instrument};
13
14use crate::error::{Error, LoadError};
15use llmtask::Task;
16
/// Deadline applied to every inference call unless the engine is
/// configured otherwise (issue #1 H-001).
///
/// Five minutes leaves room for cold-cache Metal JIT specialization on
/// real keyframes and for pathological prompts, while still ensuring
/// that a wedged model (kernel deadlock, GPU OOM) surfaces as a timeout
/// instead of blocking the caller forever. Change it per engine with
/// [`EngineOptions::with_inference_timeout`].
pub const DEFAULT_INFERENCE_TIMEOUT: Duration = Duration::from_secs(5 * 60);
24
25/// Configuration for [`Engine::load`].
26#[derive(Debug, Clone)]
27pub struct EngineOptions {
28  model_path: PathBuf,
29  quantization: IsqType,
30  max_tokens: usize,
31  request: RequestOptions,
32  inference_timeout: Duration,
33}
34
35impl EngineOptions {
36  /// Construct with the given model path, default quantization
37  /// (`IsqType::Q4K`), default `max_tokens` (`1024`), and an
38  /// indexing-safe default sampler profile
39  /// ([`RequestOptions::deterministic`]).
40  ///
41  /// The default request is deterministic because this crate's primary
42  /// use case is producing structured output that gets persisted to a
43  /// search index (see [`crate::image_analysis::ImageAnalysisTask`]).
44  /// Stochastic sampling means the same keyframes reprocessed after
45  /// a timeout, retry, or backfill can produce different
46  /// `ImageAnalysis` values,
47  /// silently drifting the index. Greedy decoding closes that hole at
48  /// the cost of diverging from the Qwen3-VL Instruct model card's
49  /// recommended sampler — see [`RequestOptions::deterministic`] for
50  /// the full trade-off.
51  ///
52  /// For one-shot, quality-prioritised use where reproducibility
53  /// doesn't matter, swap the engine default for the model-card
54  /// stochastic profile via
55  /// `EngineOptions::new(path).with_request(RequestOptions::new())`,
56  /// or override per-call via [`Engine::run_with`].
57  pub fn new(model_path: impl Into<PathBuf>) -> Self {
58    Self {
59      model_path: model_path.into(),
60      quantization: IsqType::Q4K,
61      // Issue #1 M-003: bumped from 512 → 1024. The 512 default
62      // truncated complex scenes mid-JSON (many subjects/objects/
63      // actions), surfacing as
64      // ParseError::Json(EOF while parsing a string). 1024 covers
65      // the long tail observed empirically without inflating
66      // worst-case latency materially under greedy decoding.
67      max_tokens: 1024,
68      request: RequestOptions::deterministic(),
69      inference_timeout: DEFAULT_INFERENCE_TIMEOUT,
70    }
71  }
72
73  /// Returns the configured model path.
74  #[cfg_attr(not(tarpaulin), inline(always))]
75  pub fn model_path(&self) -> &Path {
76    &self.model_path
77  }
78
79  /// Builder-style setter for `model_path`.
80  #[cfg_attr(not(tarpaulin), inline(always))]
81  pub fn with_model_path(mut self, val: impl Into<PathBuf>) -> Self {
82    self.model_path = val.into();
83    self
84  }
85
86  /// In-place setter for `model_path`.
87  #[cfg_attr(not(tarpaulin), inline(always))]
88  pub fn set_model_path(&mut self, val: impl Into<PathBuf>) -> &mut Self {
89    self.model_path = val.into();
90    self
91  }
92
93  /// Returns the configured quantization (default `IsqType::Q4K`).
94  #[cfg_attr(not(tarpaulin), inline(always))]
95  pub const fn quantization(&self) -> IsqType {
96    self.quantization
97  }
98
99  /// Builder-style setter for `quantization`.
100  #[cfg_attr(not(tarpaulin), inline(always))]
101  pub const fn with_quantization(mut self, val: IsqType) -> Self {
102    self.quantization = val;
103    self
104  }
105
106  /// In-place setter for `quantization`.
107  #[cfg_attr(not(tarpaulin), inline(always))]
108  pub const fn set_quantization(&mut self, val: IsqType) -> &mut Self {
109    self.quantization = val;
110    self
111  }
112
113  /// Returns the configured `max_tokens` ceiling (default `1024`).
114  #[cfg_attr(not(tarpaulin), inline(always))]
115  pub const fn max_tokens(&self) -> usize {
116    self.max_tokens
117  }
118
119  /// Builder-style setter for `max_tokens`. Any value is accepted at
120  /// the type level (no setter-side validation); a value of `0` is
121  /// clamped up to `1` at request time inside [`Engine::run_with`]
122  /// before being passed to mistralrs's `set_sampler_max_len`, so a
123  /// zero here means "let the model emit at least one token", not
124  /// "skip generation entirely".
125  #[cfg_attr(not(tarpaulin), inline(always))]
126  pub const fn with_max_tokens(mut self, val: usize) -> Self {
127    self.max_tokens = val;
128    self
129  }
130
131  /// In-place setter for `max_tokens`. See [`Self::with_max_tokens`]
132  /// for the runtime `0 → 1` clamp note.
133  #[cfg_attr(not(tarpaulin), inline(always))]
134  pub const fn set_max_tokens(&mut self, val: usize) -> &mut Self {
135    self.max_tokens = val;
136    self
137  }
138
139  /// Returns the engine-level default [`RequestOptions`]. This is the
140  /// sampler profile used by [`Engine::run`]; per-call overrides go
141  /// through [`Engine::run_with`].
142  ///
143  /// Default ([`EngineOptions::new`]): [`RequestOptions::deterministic`]
144  /// — see that constructor for the indexing-vs-quality trade-off.
145  #[cfg_attr(not(tarpaulin), inline(always))]
146  pub const fn request(&self) -> &RequestOptions {
147    &self.request
148  }
149
150  /// Builder-style setter for `request`. Replaces the engine-level
151  /// default sampler profile wholesale.
152  #[cfg_attr(not(tarpaulin), inline(always))]
153  pub fn with_request(mut self, val: RequestOptions) -> Self {
154    self.request = val;
155    self
156  }
157
158  /// In-place setter for `request`. Replaces the engine-level default
159  /// sampler profile wholesale.
160  #[cfg_attr(not(tarpaulin), inline(always))]
161  pub fn set_request(&mut self, val: RequestOptions) -> &mut Self {
162    self.request = val;
163    self
164  }
165
166  /// Returns the per-call inference timeout (default
167  /// [`DEFAULT_INFERENCE_TIMEOUT`] = 5 min). Issue #1 H-001.
168  #[cfg_attr(not(tarpaulin), inline(always))]
169  pub const fn inference_timeout(&self) -> Duration {
170    self.inference_timeout
171  }
172
173  /// Builder-style setter for `inference_timeout`.
174  #[cfg_attr(not(tarpaulin), inline(always))]
175  pub const fn with_inference_timeout(mut self, val: Duration) -> Self {
176    self.inference_timeout = val;
177    self
178  }
179
180  /// In-place setter for `inference_timeout`.
181  #[cfg_attr(not(tarpaulin), inline(always))]
182  pub const fn set_inference_timeout(&mut self, val: Duration) -> &mut Self {
183    self.inference_timeout = val;
184    self
185  }
186}
187
188/// Sampler configuration applied per call by [`Engine::run`] /
189/// [`Engine::run_with`].
190///
191/// Two named presets ship out of the box:
192///
193/// - [`RequestOptions::new`] / [`RequestOptions::default`] — the
194///   Qwen3-VL Instruct (non-thinking) model card sampler
195///   (`temperature 0.7`, `top_p 0.8`, `top_k 20`,
196///   `presence_penalty 1.5`). Best output quality; the same input
197///   can produce different outputs across runs.
198/// - [`RequestOptions::deterministic`] — greedy decoding
199///   (`temperature 0.0`, `top_p 1.0`, `top_k 1`) with
200///   `presence_penalty 1.5` retained (greedy without it falls into
201///   token loops). Bit-stable output for identical inputs; the
202///   right choice for indexing pipelines that must avoid silent
203///   drift on retries / backfills. This is the profile
204///   [`EngineOptions::new`] embeds by default.
205///
206/// `Engine::run_with` applies all four fields to the underlying
207/// mistralrs `RequestBuilder` uniformly — there is no separate
208/// deterministic branch; the preset itself encodes the choice.
209///
210/// `repetition_penalty` and a sampler seed are intentionally absent —
211/// mistralrs 0.8 has no `set_sampler_repetition_penalty` and no
212/// `set_sampler_seed`. Do NOT substitute
213/// `set_sampler_frequency_penalty(1.0)` for the missing
214/// `repetition_penalty`: the math is different (additive vs
215/// multiplicative).
216#[derive(Debug, Clone)]
217pub struct RequestOptions {
218  temperature: f64,
219  top_p: f64,
220  top_k: usize,
221  presence_penalty: f32,
222}
223
224impl RequestOptions {
225  /// Construct with the Qwen3-VL Instruct (non-thinking) model card
226  /// defaults: `temperature 0.7`, `top_p 0.8`, `top_k 20`,
227  /// `presence_penalty 1.5`. Best output quality; not bit-stable
228  /// across runs. Pair with [`EngineOptions::with_request`] (or
229  /// [`Engine::run_with`]) when reproducibility doesn't matter and
230  /// quality does.
231  #[cfg_attr(not(tarpaulin), inline(always))]
232  pub const fn new() -> Self {
233    Self {
234      temperature: 0.7,
235      top_p: 0.8,
236      top_k: 20,
237      presence_penalty: 1.5,
238    }
239  }
240
241  /// Indexing-safe greedy decoding: `temperature 0.0`, `top_p 1.0`,
242  /// `top_k 1`, `presence_penalty 1.5`. Output is bit-stable for
243  /// identical inputs — the right choice for pipelines that persist
244  /// VLM output to a search index, where retries / timeouts /
245  /// backfills must not silently drift the index. This is the preset
246  /// [`EngineOptions::new`] embeds by default.
247  ///
248  /// Greedy decoding is the only deterministic mode mistralrs 0.8
249  /// supports (no `set_sampler_seed`).
250  ///
251  /// **The retained `presence_penalty 1.5` is a documented
252  /// trade-off:**
253  ///
254  /// - In mistralrs 0.8 (re-verify when upgrading; future patches
255  ///   could change this and this preset's assumption would no
256  ///   longer hold), `presence_penalty` is applied over the full
257  ///   `seq.get_toks()` (prompt tokens plus generated tokens,
258  ///   verified in
259  ///   `mistralrs-core/src/sampler.rs::apply_freq_pres_rep_penalty`
260  ///   and `mistralrs-core/src/pipeline/sampling.rs`). With
261  ///   `temperature 0` there is no sampling spread, so every token
262  ///   appearing in the task prompt gets a flat `-1.5` logit shift
263  ///   even before the model emits anything. To minimize the
264  ///   collateral on legitimate value tokens,
265  ///   [`crate::image_analysis::ImageAnalysisTask`]'s
266  ///   `IMAGE_ANALYSIS_PROMPT` is intentionally written without enumerated
267  ///   value examples (H1) — format guidance is descriptive
268  ///   (word counts, lowercase) rather than enumerative. Residual
269  ///   bias only hits scaffolding/instruction tokens (e.g. "scene",
270  ///   "describing", "lowercase"), which the model is unlikely to
271  ///   want to emit as values, and the JSON schema constraint
272  ///   preserves field names and structure regardless.
273  /// - Removing the penalty was tested and broke generation: greedy
274  ///   without any repetition control falls into token loops that
275  ///   exhaust `max_tokens` mid-string and surface as
276  ///   `Error::Parse(Json(EOF while parsing a string))`. mistralrs
277  ///   0.8 has no generated-only repetition mechanism
278  ///   (`frequency_penalty` and `repetition_penalty` also operate
279  ///   over `seq.get_toks()`), so biased-but-parseable beats
280  ///   unbiased-but-broken.
281  ///
282  /// Callers that genuinely want greedy with no repetition control
283  /// can chain `.with_presence_penalty(0.0)` and accept the
284  /// repetition-loop hazard themselves.
285  ///
286  /// Custom `Task` implementations should follow the same prompt
287  /// hygiene: avoid enumerating expected value tokens in the prompt,
288  /// because they will be penalized in this preset.
289  #[cfg_attr(not(tarpaulin), inline(always))]
290  pub const fn deterministic() -> Self {
291    Self {
292      temperature: 0.0,
293      top_p: 1.0,
294      top_k: 1,
295      presence_penalty: 1.5,
296    }
297  }
298
299  /// Returns the configured sampling temperature.
300  #[cfg_attr(not(tarpaulin), inline(always))]
301  pub const fn temperature(&self) -> f64 {
302    self.temperature
303  }
304
305  /// Builder-style setter for `temperature`.
306  #[cfg_attr(not(tarpaulin), inline(always))]
307  pub const fn with_temperature(mut self, val: f64) -> Self {
308    self.temperature = val;
309    self
310  }
311
312  /// In-place setter for `temperature`.
313  #[cfg_attr(not(tarpaulin), inline(always))]
314  pub const fn set_temperature(&mut self, val: f64) -> &mut Self {
315    self.temperature = val;
316    self
317  }
318
319  /// Returns the configured `top_p`. Note: mistralrs 0.8's builder
320  /// method is `set_sampler_topp` (no underscore between `top` and
321  /// `p`).
322  #[cfg_attr(not(tarpaulin), inline(always))]
323  pub const fn top_p(&self) -> f64 {
324    self.top_p
325  }
326
327  /// Builder-style setter for `top_p`.
328  #[cfg_attr(not(tarpaulin), inline(always))]
329  pub const fn with_top_p(mut self, val: f64) -> Self {
330    self.top_p = val;
331    self
332  }
333
334  /// In-place setter for `top_p`.
335  #[cfg_attr(not(tarpaulin), inline(always))]
336  pub const fn set_top_p(&mut self, val: f64) -> &mut Self {
337    self.top_p = val;
338    self
339  }
340
341  /// Returns the configured `top_k`.
342  #[cfg_attr(not(tarpaulin), inline(always))]
343  pub const fn top_k(&self) -> usize {
344    self.top_k
345  }
346
347  /// Builder-style setter for `top_k`.
348  #[cfg_attr(not(tarpaulin), inline(always))]
349  pub const fn with_top_k(mut self, val: usize) -> Self {
350    self.top_k = val;
351    self
352  }
353
354  /// In-place setter for `top_k`.
355  #[cfg_attr(not(tarpaulin), inline(always))]
356  pub const fn set_top_k(&mut self, val: usize) -> &mut Self {
357    self.top_k = val;
358    self
359  }
360
361  /// Returns the configured `presence_penalty`. With the
362  /// [`RequestOptions::deterministic`] preset this is the only
363  /// repetition control mistralrs 0.8 supports — greedy without it
364  /// falls into token loops; see that constructor for the trade-off.
365  #[cfg_attr(not(tarpaulin), inline(always))]
366  pub const fn presence_penalty(&self) -> f32 {
367    self.presence_penalty
368  }
369
370  /// Builder-style setter for `presence_penalty`.
371  #[cfg_attr(not(tarpaulin), inline(always))]
372  pub const fn with_presence_penalty(mut self, val: f32) -> Self {
373    self.presence_penalty = val;
374    self
375  }
376
377  /// In-place setter for `presence_penalty`.
378  #[cfg_attr(not(tarpaulin), inline(always))]
379  pub const fn set_presence_penalty(&mut self, val: f32) -> &mut Self {
380    self.presence_penalty = val;
381    self
382  }
383
384  /// Validate sampler parameters before they reach mistralrs (issue
385  /// #1 H-002).
386  ///
387  /// - `temperature` must be finite and ≥ 0 (negative values invert
388  ///   the softmax sign and produce nonsensical distributions).
389  /// - `top_p` must be finite and in `(0, 1]` (0 selects nothing;
390  ///   > 1 is meaningless; NaN poisons the sampler).
391  /// - `top_k` must be ≥ 1 (0 selects nothing).
392  /// - `presence_penalty` must be finite (NaN/Inf would poison the
393  ///   logit shift).
394  ///
395  /// Called automatically by [`Engine::run_with`]; callers can
396  /// invoke it themselves to fail fast on a bad preset.
397  pub const fn validate(&self) -> Result<(), Error> {
398    if !self.temperature.is_finite() || self.temperature < 0.0 {
399      return Err(Error::InvalidRequest(
400        "temperature must be finite and >= 0.0",
401      ));
402    }
403    if !self.top_p.is_finite() || self.top_p <= 0.0 || self.top_p > 1.0 {
404      return Err(Error::InvalidRequest(
405        "top_p must be finite and in (0.0, 1.0]",
406      ));
407    }
408    if self.top_k == 0 {
409      return Err(Error::InvalidRequest("top_k must be >= 1"));
410    }
411    if !self.presence_penalty.is_finite() {
412      return Err(Error::InvalidRequest("presence_penalty must be finite"));
413    }
414    Ok(())
415  }
416}
417
418impl Default for RequestOptions {
419  fn default() -> Self {
420    Self::new()
421  }
422}
423
424/// A Qwen3-VL structured-output inference engine.
425///
426/// Construct via [`Engine::load`]. `Engine` is `Send + Sync + Clone` —
427/// `mistralrs::Model` is `Arc<MistralRs>` internally, so cloning is cheap.
428/// Concurrent `run()` calls from multiple tasks are safe and are
429/// continuous-batched by mistralrs's scheduler (not parallel decode).
430#[derive(Clone)]
431pub struct Engine {
432  model: Arc<mistralrs::Model>,
433  options: EngineOptions,
434}
435
436impl Engine {
437  /// Load the Qwen3-VL model at `opts.model_path()` with the given
438  /// quantization. Blocks for ~13s on Apple Silicon Metal at first call.
439  /// Holds GPU memory until the last clone is dropped.
440  #[instrument(name = "qwen3_vl::load", skip(opts), fields(model_path = %opts.model_path().display(), quantization = ?opts.quantization()))]
441  pub async fn load(opts: EngineOptions) -> Result<Self, LoadError> {
442    if !opts.model_path().exists() {
443      return Err(LoadError::NotFound(opts.model_path().to_path_buf()));
444    }
445    let started = std::time::Instant::now();
446    info!("loading Qwen3-VL model");
447    let model_id = opts.model_path().to_string_lossy().into_owned();
448    let model = MultimodalModelBuilder::new(model_id)
449      .with_isq(opts.quantization())
450      .build()
451      .await
452      .map_err(|e| LoadError::Build(e.to_string()))?;
453    info!(
454      elapsed_ms = started.elapsed().as_millis() as u64,
455      "model loaded"
456    );
457    Ok(Self {
458      model: Arc::new(model),
459      options: opts,
460    })
461  }
462
463  /// Returns the local model directory the engine was loaded from.
464  #[cfg_attr(not(tarpaulin), inline(always))]
465  pub fn model_path(&self) -> &Path {
466    self.options.model_path()
467  }
468
469  /// Returns the quantization the engine was loaded with.
470  #[cfg_attr(not(tarpaulin), inline(always))]
471  pub const fn quantization(&self) -> IsqType {
472    self.options.quantization()
473  }
474
475  /// Returns the configured `max_tokens` ceiling for [`Engine::run`].
476  #[cfg_attr(not(tarpaulin), inline(always))]
477  pub const fn max_tokens(&self) -> usize {
478    self.options.max_tokens()
479  }
480
481  /// Returns the engine-level default sampler profile. See
482  /// [`EngineOptions::request`].
483  #[cfg_attr(not(tarpaulin), inline(always))]
484  pub const fn request(&self) -> &RequestOptions {
485    self.options.request()
486  }
487
488  /// Returns the per-call inference timeout. Issue #1 H-001.
489  #[cfg_attr(not(tarpaulin), inline(always))]
490  pub const fn inference_timeout(&self) -> Duration {
491    self.options.inference_timeout()
492  }
493
494  /// Optional pre-warm: runs one tiny inference against a 1×1 black image
495  /// to JIT-compile Metal kernels before serving real requests. Logs
496  /// duration at `debug`. Errors are propagated to the caller — typically
497  /// you ignore them in production (warmup is best-effort).
498  ///
499  /// **Caveat:** Metal's kernel JIT specializes per tensor shape, so
500  /// the kernels compiled for a 1×1 image are not guaranteed to match
501  /// the kernels needed for production-sized keyframes (e.g.,
502  /// 720×1280). For shape-matched warmup, use
503  /// [`Self::warmup_with_image`] (issue #1 M-002).
504  #[instrument(name = "qwen3_vl::warmup", skip(self))]
505  pub async fn warmup(&self) -> Result<(), Error> {
506    use image::{DynamicImage, RgbImage};
507    let blank = DynamicImage::ImageRgb8(RgbImage::new(1, 1));
508    self.warmup_with_image(blank).await
509  }
510
511  /// Pre-warm with a caller-supplied image (issue #1 M-002). Use a
512  /// representative production-sized keyframe (e.g., 720×1280 black
513  /// frame, or a real fixture) so Metal's per-shape kernel JIT
514  /// specializes for the shapes the production path will hit. The
515  /// 1×1 [`Self::warmup`] only exercises the load → encode → decode
516  /// pipeline structurally; first real-keyframe inference can still
517  /// incur JIT cost without this.
518  #[instrument(name = "qwen3_vl::warmup_with_image", skip(self, image))]
519  pub async fn warmup_with_image(&self, image: image::DynamicImage) -> Result<(), Error> {
520    let started = std::time::Instant::now();
521    let messages = MultimodalMessages::new().add_image_message(
522      TextMessageRole::User,
523      "Reply with: ok",
524      vec![image],
525    );
526    let request = RequestBuilder::from(messages)
527      .set_sampler_max_len(4)
528      .enable_thinking(false);
529    // Same timeout as run_with: a stuck warmup shouldn't block
530    // worker startup forever.
531    let timeout = self.options.inference_timeout();
532    let _ = tokio::time::timeout(timeout, self.model.send_chat_request(request))
533      .await
534      .map_err(|_| Error::InferenceTimeout(timeout))?
535      .map_err(|e| Error::Inference(e.to_string()))?;
536    debug!(
537      elapsed_ms = started.elapsed().as_millis() as u64,
538      "warmup complete"
539    );
540    Ok(())
541  }
542
543  /// Single-turn, multi-image structured run with the engine-level
544  /// default sampler ([`EngineOptions::request`]). Equivalent to
545  /// [`Self::run_with`] called with that profile.
546  ///
547  /// Consumes `images` because mistralrs's
548  /// `MultimodalMessages::add_image_message` takes `Vec<DynamicImage>`
549  /// by value — borrowing here would force a silent `.to_vec()` clone
550  /// of decoded image data. Returns `Error::NoImages` for an empty
551  /// input.
552  ///
553  /// Dropping the returned future is a fast wakeup, not GPU
554  /// cancellation: mistralrs's engine loop completes the in-flight
555  /// scheduler step in the background; the response is silently
556  /// discarded on send. Wrap in `tokio::time::timeout(..)` for a
557  /// deadline.
558  #[cfg_attr(not(tarpaulin), inline(always))]
559  pub async fn run<T: Task>(
560    &self,
561    task: &T,
562    images: Vec<image::DynamicImage>,
563  ) -> Result<T::Output, Error>
564  where
565    T::ParseError: Send + Sync + 'static,
566  {
567    self.run_with(task, images, self.options.request()).await
568  }
569
570  /// Same as [`Self::run`] but with a caller-supplied
571  /// [`RequestOptions`] that replaces the engine-level default for
572  /// this call. Use this when a specific call needs a sampler profile
573  /// other than [`EngineOptions::request`].
574  ///
575  /// All four fields from `opts` are applied uniformly to the
576  /// underlying mistralrs sampler — there is no separate deterministic
577  /// branch; the preset itself encodes the choice between greedy
578  /// ([`RequestOptions::deterministic`]) and stochastic
579  /// ([`RequestOptions::new`] / `default`).
580  #[instrument(
581    name = "qwen3_vl::run_with",
582    skip(self, task, images, opts),
583    fields(
584      task_kind = std::any::type_name::<T>(),
585      image_count = images.len(),
586      max_tokens = self.options.max_tokens(),
587      temperature = opts.temperature(),
588    ),
589  )]
590  pub async fn run_with<T: Task>(
591    &self,
592    task: &T,
593    images: Vec<image::DynamicImage>,
594    opts: &RequestOptions,
595  ) -> Result<T::Output, Error>
596  where
597    // bound at the call site only.
598    // `Send + Sync + 'static` lets us box the parse error into
599    // `Error::Parse(Box<dyn Error + Send + Sync + 'static>)`,
600    // which works for any Task — including ones whose only
601    // purpose is to receive `UnsupportedGrammar` for routing.
602    T::ParseError: Send + Sync + 'static,
603  {
604    if images.is_empty() {
605      return Err(Error::NoImages);
606    }
607    // Issue #1 H-002: validate sampler parameters before mistralrs
608    // sees them. Negative temperature, top_p > 1.0, top_k = 0, or
609    // non-finite presence_penalty all produce undefined behavior in
610    // mistralrs's sampler.
611    opts.validate()?;
612
613    // Pull the task's grammar and route to mistralrs's
614    // Constraint::JsonSchema. mistralrs 0.8 only accepts JSON
615    // Schema; non-JSON variants (Lark, Regex) are rejected via
616    // UnsupportedGrammar so callers can route to an
617    // llguidance-backed engine instead (e.g., the `lfm` crate).
618    let grammar = task.grammar();
619    let schema = grammar
620      .as_json_schema()
621      .ok_or_else(|| {
622        Error::UnsupportedGrammar(llmtask::UnsupportedGrammar::new(
623          grammar.kind(),
624          "json_schema",
625        ))
626      })?
627      .clone();
628
629    let messages =
630      MultimodalMessages::new().add_image_message(TextMessageRole::User, task.prompt(), images);
631
632    let request = RequestBuilder::from(messages)
633      .set_sampler_max_len(self.options.max_tokens().max(1))
634      .enable_thinking(false)
635      .set_constraint(Constraint::JsonSchema(schema))
636      .set_sampler_temperature(opts.temperature())
637      .set_sampler_topp(opts.top_p())
638      .set_sampler_topk(opts.top_k())
639      .set_sampler_presence_penalty(opts.presence_penalty());
640
641    let started = std::time::Instant::now();
642    // Issue #1 H-001: bound inference duration. A stuck model
643    // (Metal JIT stall, GPU OOM, scheduler deadlock) would
644    // otherwise block the caller indefinitely. Drop on timeout —
645    // mistralrs will silently complete the in-flight scheduler
646    // step in the background and discard the response.
647    let timeout = self.options.inference_timeout();
648    let response = tokio::time::timeout(timeout, self.model.send_chat_request(request))
649      .await
650      .map_err(|_| Error::InferenceTimeout(timeout))?
651      .map_err(|e| Error::Inference(e.to_string()))?;
652    debug!(
653      elapsed_ms = started.elapsed().as_millis() as u64,
654      "inference complete"
655    );
656
657    let choice = response.choices.first().ok_or(Error::Empty)?;
658    // finding: reject length-truncated generations
659    // before parsing. mistralrs `Display for StopReason` maps Eos
660    // → "stop" and `Length`/`ModelLength` → "length"; "stop" is
661    // the only outcome where the constrained decoder produced a
662    // full natural completion. Anything else (length, error, etc.)
663    // means the JSON could be syntactically valid but semantically
664    // incomplete — persisting it to a search index would silently
665    // truncate metadata.
666    if choice.finish_reason != "stop" {
667      let raw_len = choice
668        .message
669        .content
670        .as_ref()
671        .map(|s| s.len())
672        .unwrap_or(0);
673      return Err(Error::Truncated {
674        finish_reason: choice.finish_reason.clone(),
675        raw_len,
676      });
677    }
678    let text = choice
679      .message
680      .content
681      .clone()
682      .filter(|s| !s.trim().is_empty())
683      .ok_or(Error::Empty)?;
684
685    #[cfg(feature = "trace-output")]
686    tracing::trace!(raw = %text, "model output");
687
688    task.parse(&text).map_err(|e| Error::Parse(Box::new(e)))
689  }
690}
691
#[cfg(test)]
mod tests {
  //! Unit tests for the option types. None of them load a model; they
  //! only exercise constructors, setters, and `RequestOptions::validate`.

  use super::*;

  #[test]
  fn engine_options_defaults_to_deterministic_request() {
    // EngineOptions::new embeds RequestOptions::deterministic() as the
    // engine-level default sampler. This test guards against silent
    // reversion: if someone later flips the default back to the
    // stochastic model-card profile, every caller that uses the obvious
    // ::new() constructor would silently start drifting their search
    // index on retries/backfills. To get the model-card stochastic
    // sampler, callers must opt in explicitly via
    // .with_request(RequestOptions::new()).
    let opts = EngineOptions::new("/tmp/model");
    assert_eq!(opts.model_path(), Path::new("/tmp/model"));
    assert!(matches!(opts.quantization(), IsqType::Q4K));
    // Issue #1 M-003: default raised from 512 to 1024.
    assert_eq!(opts.max_tokens(), 1024);
    let req = opts.request();
    assert_eq!(req.temperature(), 0.0);
    assert_eq!(req.top_p(), 1.0);
    assert_eq!(req.top_k(), 1);
    assert_eq!(
      req.presence_penalty(),
      1.5,
      "deterministic preset must keep presence_penalty 1.5 — greedy \
       without it falls into token loops"
    );
  }

  #[test]
  fn engine_options_with_chains() {
    // Every value set here differs from the `EngineOptions::new`
    // default, so a builder that silently does nothing fails the test.
    // In particular `max_tokens` must not be 1024, which has been the
    // default since issue #1 M-003.
    let opts = EngineOptions::new("/tmp/a")
      .with_model_path("/tmp/b")
      .with_quantization(IsqType::Q8_0)
      .with_max_tokens(2048)
      .with_request(RequestOptions::new());
    assert_eq!(opts.model_path(), Path::new("/tmp/b"));
    assert!(matches!(opts.quantization(), IsqType::Q8_0));
    assert_eq!(opts.max_tokens(), 2048);
    // Swapping in RequestOptions::new() flips the engine to the
    // model-card stochastic profile (temperature 0.7).
    assert_eq!(opts.request().temperature(), 0.7);
  }

  #[test]
  fn engine_options_set_chains() {
    // Non-default values throughout; see `engine_options_with_chains`.
    let mut opts = EngineOptions::new("/tmp/a");
    opts
      .set_model_path("/tmp/b")
      .set_quantization(IsqType::Q8_0)
      .set_max_tokens(256)
      .set_request(RequestOptions::new());
    assert_eq!(opts.model_path(), Path::new("/tmp/b"));
    assert!(matches!(opts.quantization(), IsqType::Q8_0));
    assert_eq!(opts.max_tokens(), 256);
    assert_eq!(opts.request().temperature(), 0.7);
  }

  #[test]
  fn request_options_defaults_match_model_card() {
    // Hard-coded against the Qwen3-VL Instruct model card values to
    // catch silent drift if the defaults are ever edited without a
    // CHANGELOG note. See indexer/models/qwen3-vl-2b/README.md.
    let opts = RequestOptions::new();
    assert_eq!(opts.temperature(), 0.7);
    assert_eq!(opts.top_p(), 0.8);
    assert_eq!(opts.top_k(), 20);
    assert_eq!(opts.presence_penalty(), 1.5);
  }

  #[test]
  fn request_options_default_eq_new() {
    let new_opts = RequestOptions::new();
    let default_opts = RequestOptions::default();
    assert_eq!(new_opts.temperature(), default_opts.temperature());
    assert_eq!(new_opts.top_p(), default_opts.top_p());
    assert_eq!(new_opts.top_k(), default_opts.top_k());
    assert_eq!(new_opts.presence_penalty(), default_opts.presence_penalty());
  }

  #[test]
  fn request_options_with_chains() {
    let opts = RequestOptions::new()
      .with_temperature(0.3)
      .with_top_p(0.95)
      .with_top_k(50)
      .with_presence_penalty(0.0);
    assert_eq!(opts.temperature(), 0.3);
    assert_eq!(opts.top_p(), 0.95);
    assert_eq!(opts.top_k(), 50);
    assert_eq!(opts.presence_penalty(), 0.0);
  }

  #[test]
  fn request_options_set_chains() {
    let mut opts = RequestOptions::new();
    opts
      .set_temperature(0.3)
      .set_top_p(0.95)
      .set_top_k(50)
      .set_presence_penalty(0.0);
    assert_eq!(opts.temperature(), 0.3);
    assert_eq!(opts.top_p(), 0.95);
    assert_eq!(opts.top_k(), 50);
    assert_eq!(opts.presence_penalty(), 0.0);
  }

  #[test]
  fn request_options_deterministic_preset() {
    // Hard-coded greedy values: temperature=0 + top_k=1 forces argmax,
    // top_p=1 disables nucleus filtering. presence_penalty 1.5 is kept
    // (greedy without it falls into token loops). See
    // RequestOptions::deterministic doc for the trade-off.
    let opts = RequestOptions::deterministic();
    assert_eq!(opts.temperature(), 0.0);
    assert_eq!(opts.top_p(), 1.0);
    assert_eq!(opts.top_k(), 1);
    assert_eq!(opts.presence_penalty(), 1.5);
  }

  // ===== Issue #1 H-002: RequestOptions::validate =====

  #[test]
  fn request_options_validate_accepts_presets() {
    // Both shipped presets must validate.
    assert!(RequestOptions::new().validate().is_ok());
    assert!(RequestOptions::deterministic().validate().is_ok());
  }

  #[test]
  fn request_options_validate_rejects_negative_temperature() {
    let opts = RequestOptions::new().with_temperature(-0.1);
    assert!(matches!(opts.validate(), Err(Error::InvalidRequest(_))));
  }

  #[test]
  fn request_options_validate_rejects_non_finite_temperature() {
    assert!(matches!(
      RequestOptions::new().with_temperature(f64::NAN).validate(),
      Err(Error::InvalidRequest(_))
    ));
    assert!(matches!(
      RequestOptions::new()
        .with_temperature(f64::INFINITY)
        .validate(),
      Err(Error::InvalidRequest(_))
    ));
  }

  #[test]
  fn request_options_validate_rejects_top_p_out_of_range() {
    assert!(matches!(
      RequestOptions::new().with_top_p(0.0).validate(),
      Err(Error::InvalidRequest(_))
    ));
    assert!(matches!(
      RequestOptions::new().with_top_p(1.5).validate(),
      Err(Error::InvalidRequest(_))
    ));
    assert!(matches!(
      RequestOptions::new().with_top_p(-0.1).validate(),
      Err(Error::InvalidRequest(_))
    ));
    assert!(matches!(
      RequestOptions::new().with_top_p(f64::NAN).validate(),
      Err(Error::InvalidRequest(_))
    ));
  }

  #[test]
  fn request_options_validate_accepts_top_p_one() {
    // top_p = 1.0 disables nucleus filtering — used by the
    // deterministic preset. Must pass.
    assert!(RequestOptions::new().with_top_p(1.0).validate().is_ok());
  }

  #[test]
  fn request_options_validate_rejects_top_k_zero() {
    let opts = RequestOptions::new().with_top_k(0);
    assert!(matches!(opts.validate(), Err(Error::InvalidRequest(_))));
  }

  #[test]
  fn request_options_validate_rejects_non_finite_presence_penalty() {
    assert!(matches!(
      RequestOptions::new()
        .with_presence_penalty(f32::NAN)
        .validate(),
      Err(Error::InvalidRequest(_))
    ));
    assert!(matches!(
      RequestOptions::new()
        .with_presence_penalty(f32::INFINITY)
        .validate(),
      Err(Error::InvalidRequest(_))
    ));
  }

  #[test]
  fn request_options_validate_accepts_negative_presence_penalty() {
    // mistralrs allows negative presence_penalty (encourages
    // repetition). Validate only checks finiteness.
    assert!(
      RequestOptions::new()
        .with_presence_penalty(-1.0)
        .validate()
        .is_ok()
    );
  }

  // ===== Issue #1 H-001 + M-003 =====

  #[test]
  fn engine_options_default_inference_timeout() {
    let opts = EngineOptions::new("/nonexistent");
    assert_eq!(opts.inference_timeout(), DEFAULT_INFERENCE_TIMEOUT);
    assert_eq!(opts.inference_timeout(), Duration::from_secs(300));
  }

  #[test]
  fn engine_options_with_inference_timeout() {
    let opts = EngineOptions::new("/nonexistent").with_inference_timeout(Duration::from_secs(10));
    assert_eq!(opts.inference_timeout(), Duration::from_secs(10));
  }

  #[test]
  fn engine_options_set_inference_timeout() {
    // In-place counterpart of `with_inference_timeout`; the value
    // differs from the 300 s default so a no-op setter is caught.
    let mut opts = EngineOptions::new("/nonexistent");
    opts.set_inference_timeout(Duration::from_secs(10));
    assert_eq!(opts.inference_timeout(), Duration::from_secs(10));
  }

  #[test]
  fn engine_options_default_max_tokens_bumped_to_1024() {
    // Issue #1 M-003: default raised from 512 to 1024 to avoid
    // mid-JSON truncation on complex scenes.
    let opts = EngineOptions::new("/nonexistent");
    assert_eq!(opts.max_tokens(), 1024);
  }
}