qwen3_vl/
image_analysis.rs

1//! The image-analysis preset: [`ImageAnalysisTask`] produces a
2//! typed [`ImageAnalysis`] with nine fields (scene category,
3//! free-form description, five detection lists, shot-type label,
4//! search tags). The "scene" wording survives in the `scene` field
5//! and in the prompt because the upstream use case is video keyframes
//! representing scenes — but the type itself describes the engine's
//! output for a single image and works for any single-image analysis pipeline.
8//!
9//! `ImageAnalysis` lives in the `llmtask` sibling crate (re-exported
10//! at the top of this module); this engine doesn't depend on
11//! `findit-proto`, so any consumer can map the result into its own
12//! wire shape. The legacy `findit-proto::database::SceneVlmResult`
13//! paired each detection-array entry with a `confidence` float;
14//! `llmtask::ImageAnalysis` exposes those buckets as plain
15//! `Vec<SmolStr>`. VLM self-reported per-detection confidence is
16//! poorly calibrated, and a flat hardcoded confidence on every entry
17//! is a no-op for both UX and search-time ranking. If a downstream
18//! consumer needs per-detection scores, the right place to get them
19//! is from search-time embedding similarity or scene-aggregation
20//! metrics (frame frequency, etc.), not from VLM self-report. The
21//! `findit-proto` mapping (when revived) can stamp a fixed value on
22//! its side or compute one from those non-VLM sources.
23//!
24//! `colors` is intentionally NOT a VLM output: dominant-color
25//! extraction is a closed-form image-processing problem (k-means /
26//! histogram clustering on pixel data + a perceptual-distance lookup
27//! against a named-color dataset like xkcd's), so making the VLM emit
28//! it would be slower, less accurate, and non-deterministic compared
29//! to running the algorithm on the keyframes directly. That belongs
30//! in whatever orchestrates keyframes → final record, not in this
31//! crate. `lighting` stays — semantic lighting terms ("backlit",
32//! "spotlight", "golden hour") need scene-level visual reasoning that
33//! pixel statistics alone can't reproduce.
34
35use serde::Deserialize;
36use serde_json::{Value, json};
37use smol_str::SmolStr;
38
39use llmtask::{JsonParseError, Task};
40
41pub use llmtask::ImageAnalysis;
42
/// The scene-analysis prompt returned by `ImageAnalysisTask::prompt`.
///
/// Ported from `findit-qwen/src/lib.rs:382-401`, with the enumerated
/// example values removed as described in the note below.
/// NOTE(review): an earlier revision of this doc called the prompt a
/// "verbatim port"; that no longer matches the note below — confirm
/// against the findit-qwen source.
// IMAGE_ANALYSIS_PROMPT is intentionally written WITHOUT enumerated
// example values. In deterministic (greedy) mode, mistralrs 0.8's
// `presence_penalty` is applied over `seq.get_toks()` (prompt +
// generated tokens), so every value-token the prompt enumerates as an
// example gets a `-presence_penalty` logit shift before the model
// emits anything. Listing example values like "office", "wide shot",
// or "birthday cake with candles" systematically biases the model
// AWAY from those exact terms when a scene legitimately matches one.
// The fix here removes value-token examples from the prompt; format
// guidance moves to descriptive constraints (word counts, lowercase)
// so the model still knows the expected shape without enumerating the
// vocabulary it's penalized against.
const IMAGE_ANALYSIS_PROMPT: &str = r#"Analyze the following video keyframes (in chronological order) from a single scene.

Return ONLY a valid JSON object with exactly these fields:
scene: a single short scene-category label in lowercase English, 1-3 words, no full sentence.
description: 1-2 concise sentences in English describing the stable visual facts across the scene. Cover who is present, what they are doing, the setting, and the overall mood or visual style. If readable on-screen text appears, quote that text first, then continue the description.
subjects: array of distinct people or animals as short noun phrases (each 2-6 words) with visible distinguishing features.
objects: array of notable, search-relevant objects as short noun phrases (each 2-6 words).
actions: array of visible actions as short verb phrases (each 1-4 words).
mood: array of single-word or two-word adjectives describing the scene's overall emotional tone.
shot_type: a single short camera-shot label in lowercase English, 1-2 words (a cinematography term).
lighting: array of single-word or two-word lighting descriptors.
tags: array of 8-12 short English search tags in lowercase. Prefer high-confidence search terms, complementary synonyms, style words, and culture-specific terms only when visually supported.

Rules:
- Use only information supported by the keyframes.
- Prefer concrete visual facts over speculation.
- Keep arrays deduplicated.
- Use empty arrays or empty strings when a field is unknown.
- Do not return markdown or any text outside the JSON object."#;
76
/// Names of the nine top-level keys every model response must carry.
///
/// Used as the `required` list of the JSON Schema produced by
/// `build_schema`, and as the set of keys that
/// `missing_required_fields` checks for absence or JSON `null`.
const REQUIRED_FIELDS: &[&str] = &[
  "scene",
  "description",
  "subjects",
  "objects",
  "actions",
  "mood",
  "shot_type",
  "lighting",
  "tags",
];
88
89/// The scene-analysis task. Construct via [`ImageAnalysisTask::new`].
90#[derive(Clone)]
91pub struct ImageAnalysisTask {
92  schema: Value,
93  accept_empty: bool,
94}
95
impl ImageAnalysisTask {
  /// Construct with `accept_empty = false` (a payload that lacks the
  /// required indexable content — `description` AND `tags` both
  /// populated, OR at least one of the substantive detection buckets
  /// `subjects` / `objects` / `actions` non-empty — is treated as a
  /// model regression and rejected; see [`Self::with_accept_empty`]
  /// for the full predicate and the opt-in alternative).
  ///
  /// The JSON Schema is built here once and cached on the task.
  pub fn new() -> Self {
    Self {
      schema: build_schema(),
      accept_empty: false,
    }
  }

  /// Returns whether the parser accepts payloads that lack the
  /// required indexable content, i.e. payloads where neither
  /// (`description` AND `tags` both non-empty) nor (at least one of
  /// `subjects` / `objects` / `actions` non-empty) holds.
  /// See [`Self::with_accept_empty`] for the trade-off.
  #[cfg_attr(not(tarpaulin), inline(always))]
  pub const fn accept_empty(&self) -> bool {
    self.accept_empty
  }

  /// Builder-style setter for `accept_empty`.
  ///
  /// When `false` (default), the parser rejects payloads that lack
  /// the required indexable content as [`JsonParseError::NoUsableFields`].
  /// The composite threshold accepts a
  /// payload when **either**:
  ///
  /// - `description` AND `tags` are both populated (the prose +
  ///   keyword path; matches the integration-test smoke criterion),
  ///   OR
  /// - at least one of the **substantive** detection buckets —
  ///   `subjects`, `objects`, or `actions` — is non-empty (the
  ///   substantive-detection path; preserves who/what/where search
  ///   metadata even when the model fails to summarize).
  ///
  /// Style/attribute buckets (`mood`, `lighting`) and single-label
  /// fields (`scene`, `shot_type`) are intentionally NOT in the
  /// substantive path. A payload like `lighting: ["natural light"]`
  /// or `mood: ["calm"]` alone (description and tags empty, no
  /// substantive detections) is more often a regression than a
  /// legitimate weak-but-real scene; rejecting it surfaces the
  /// failure instead of writing a single-attribute stub to the
  /// search index.
  ///
  /// Tags-only, scene-only, description-only, shot_type-only,
  /// mood/lighting-only, and fully-empty payloads all fail both
  /// paths and are rejected. This is the right setting for
  /// indexing pipelines: it surfaces decoder/model regressions that
  /// would otherwise silently overwrite real metadata with sparse
  /// search records.
  ///
  /// When `true`, the parser bypasses the indexable-content check and
  /// returns whatever round-trips through the schema. IMAGE_ANALYSIS_PROMPT
  /// explicitly tells the model to "Use empty arrays or empty strings
  /// when a field is unknown", so on truly low-information frames
  /// (blank, fade-to-black, plain color) compliant model output can
  /// legitimately be sparse or fully-empty. Use this knob if your
  /// pipeline distinguishes "low-information scene" from "no useful
  /// content" via something other than the parser (e.g. scenesdetect's
  /// keyframe scoring).
  #[cfg_attr(not(tarpaulin), inline(always))]
  pub const fn with_accept_empty(mut self, val: bool) -> Self {
    self.accept_empty = val;
    self
  }

  /// In-place setter for `accept_empty`; returns `&mut Self` so calls
  /// can be chained. See [`Self::with_accept_empty`] for the trade-off.
  #[cfg_attr(not(tarpaulin), inline(always))]
  pub const fn set_accept_empty(&mut self, val: bool) -> &mut Self {
    self.accept_empty = val;
    self
  }
}
172
impl Default for ImageAnalysisTask {
  /// Equivalent to [`ImageAnalysisTask::new`]: freshly built schema,
  /// `accept_empty = false`.
  fn default() -> Self {
    Self::new()
  }
}
178
179impl Task for ImageAnalysisTask {
180  type Output = ImageAnalysis;
181  type Value = serde_json::Value;
182  type ParseError = llmtask::JsonParseError;
183
184  fn prompt(&self) -> &str {
185    IMAGE_ANALYSIS_PROMPT
186  }
187
188  fn schema(&self) -> &serde_json::Value {
189    &self.schema
190  }
191
192  fn grammar(&self) -> llmtask::Grammar {
193    // Cached JSON Schema cloned once per call. mistralrs 0.8's
194    // Constraint::JsonSchema(Value) requires owned data anyway.
195    llmtask::Grammar::JsonSchema(self.schema.clone())
196  }
197
198  fn parse(&self, raw: &str) -> Result<Self::Output, JsonParseError> {
199    let value: Value = serde_json::from_str(raw.trim())?;
200    let object = value
201      .as_object()
202      .ok_or_else(|| JsonParseError::Json(serde::de::Error::custom("expected top-level object")))?;
203    let missing = missing_required_fields(object);
204    if !missing.is_empty() {
205      return Err(JsonParseError::MissingFields(missing));
206    }
207    let payload: QwenScenePayload = serde_json::from_value(value)?;
208    // Indexable-content gate. IMAGE_ANALYSIS_PROMPT instructs the model to "Use
209    // empty arrays or empty strings when a field is unknown", so a
210    // truly compliant response on a blank/fade-to-black frame can be
211    // partially or fully empty. But a decoder/model regression on a
212    // normal frame also produces sparse output, and silently
213    // overwriting real search metadata with that is worse than
214    // failing.
215    //
216    // Composite threshold:
217    // a payload is usable iff EITHER
218    //   (a) `description` AND `tags` are both populated (typical
219    //       "good" model output, matches the integration-test smoke
220    //       criterion), OR
221    //   (b) at least one of the substantive detection buckets
222    //       (`subjects` / `objects` / `actions`) is non-empty
223    //       (the model produced who/what/where evidence even
224    //       when prose+keywords are missing).
225    //
226    // Style/attribute buckets (`mood` / `lighting`) and single-label
227    // fields (`scene` / `shot_type`) are intentionally NOT in the
228    // substantive path — payloads that populate ONLY those (with
229    // description and tags empty) are more often regression signals
230    // than legitimate scenes, and writing a single-attribute stub to
231    // a search index masks the failure.
232    // Callers that distinguish "low-information scene" from
233    // "regression" elsewhere opt in via
234    // `ImageAnalysisTask::with_accept_empty(true)`.
235    if !self.accept_empty && payload.lacks_indexable_content() {
236      return Err(JsonParseError::NoUsableFields);
237    }
238    Ok(payload.into_scene_analysis())
239  }
240}
241
242fn build_schema() -> Value {
243  json!({
244    "type": "object",
245    "properties": {
246      "scene": { "type": "string" },
247      "description": { "type": "string" },
248      "subjects": { "type": "array", "items": { "type": "string" } },
249      "objects": { "type": "array", "items": { "type": "string" } },
250      "actions": { "type": "array", "items": { "type": "string" } },
251      "mood": { "type": "array", "items": { "type": "string" } },
252      "shot_type": { "type": "string" },
253      "lighting": { "type": "array", "items": { "type": "string" } },
254      "tags": { "type": "array", "items": { "type": "string" } }
255    },
256    "required": REQUIRED_FIELDS,
257    "additionalProperties": false
258  })
259}
260
261/// Returns required field names that are either absent from the object or
262/// present as JSON `null`.
263///
264/// Both cases violate the schema (every required field must be a string or
265/// an array of strings, never null). Treating them identically here matters
266/// because the per-field deserializers further down the pipeline silently
267/// coerce `null` into the field's default (`Option::None` for strings,
268/// empty `DetectionLabels` / `TagList` for arrays). Without this check, a
269/// model response like `{"subjects": null, "tags": ["x"], ...}` would be
270/// accepted as a valid `ImageAnalysis` with an empty `subjects` list —
271/// silently dropping schema-required content and hiding constrained-decoder
272/// drift.
273fn missing_required_fields(object: &serde_json::Map<String, Value>) -> Vec<&'static str> {
274  REQUIRED_FIELDS
275    .iter()
276    .copied()
277    .filter(|field| match object.get(*field) {
278      None => true,
279      Some(value) => value.is_null(),
280    })
281    .collect()
282}
283
/// Internal deserialization target for the model's JSON response.
///
/// Unknown keys are rejected (`deny_unknown_fields`). Every field
/// falls back to its default when absent; the string fields are
/// trimmed and collapse to `None` when blank, and the list fields go
/// through the tolerant `DetectionLabels` / `TagList` deserializers.
/// Converted to the public `ImageAnalysis` by `into_scene_analysis`.
#[derive(Debug, Default, Deserialize)]
#[serde(deny_unknown_fields)]
struct QwenScenePayload {
  // Scene-category label; `None` when empty or whitespace-only.
  #[serde(default, deserialize_with = "deserialize_optional_trimmed_string")]
  scene: Option<String>,
  // Free-form description; `None` when empty or whitespace-only.
  #[serde(default, deserialize_with = "deserialize_optional_trimmed_string")]
  description: Option<String>,
  #[serde(default)]
  subjects: DetectionLabels,
  #[serde(default)]
  objects: DetectionLabels,
  #[serde(default)]
  actions: DetectionLabels,
  #[serde(default)]
  mood: DetectionLabels,
  // Accepts a string or a list holding at most one non-blank label.
  #[serde(default, deserialize_with = "deserialize_optional_single_label")]
  shot_type: Option<String>,
  #[serde(default)]
  lighting: DetectionLabels,
  // Accepts an array or a flat comma/semicolon/newline-separated string.
  #[serde(default)]
  tags: TagList,
}
306
307impl QwenScenePayload {
308  /// `true` if the payload lacks the minimum content required to
309  /// produce a useful indexing record. Composite threshold:
310  ///
311  /// - **prose+keyword path**: `description` AND `tags` both
312  ///   populated — the typical "good" model output that the
313  ///   integration test smoke-pins.
314  /// - **substantive-detection path**: at least one of the
315  ///   substantive detection buckets `subjects` / `objects` /
316  ///   `actions` is non-empty — these answer "who/what is in the
317  ///   scene and what's happening", which is search-relevant content
318  ///   on its own even when prose+keywords are missing.
319  ///
320  /// Returns `true` (lacks content) when **neither** path holds.
321  /// Used by [`ImageAnalysisTask::parse`] to surface model regressions as
322  /// [`JsonParseError::NoUsableFields`] unless the caller opts into
323  /// `accept_empty = true`.
324  ///
325  /// **Buckets intentionally excluded from the substantive path:**
326  ///
327  /// - `mood` / `lighting` — these are style/attribute buckets
328  ///   (search filter axes), not standalone content. A regression
329  ///   that returns only `lighting: ["natural light"]` or
330  ///   `mood: ["calm"]` (description and tags empty, no
331  ///   subjects/objects/actions) is more likely a model failure
332  ///   than a legitimate "we managed to detect mood but nothing
333  ///   else" case, and silently overwriting a richer search record
334  ///   with a single-attribute stub is what this gate prevents.
335  /// - `scene` / `shot_type` — single-label fields. "Scene-only" or
336  ///   "shot_type-only" payloads remain regression signals this
337  ///   gate is designed to catch.
338  fn lacks_indexable_content(&self) -> bool {
339    // `description` deserializes via `deserialize_optional_trimmed_string`
340    // which collapses empty/whitespace strings to `None`, so checking
341    // `is_none()` suffices.
342    let has_prose_and_keywords = self.description.is_some() && !self.tags.0.is_empty();
343    let has_substantive_detection =
344      !self.subjects.0.is_empty() || !self.objects.0.is_empty() || !self.actions.0.is_empty();
345    !has_prose_and_keywords && !has_substantive_detection
346  }
347
348  fn into_scene_analysis(self) -> ImageAnalysis {
349    // Internal `Option<String>` collapses to `SmolStr` (empty for None).
350    // Public `ImageAnalysis` uses empty-string-as-absence to keep the
351    // accessor surface simple — see IMAGE_ANALYSIS_PROMPT, which already
352    // instructs the model to emit empty strings for unknown fields.
353    let to_labels =
354      |list: DetectionLabels| -> Vec<SmolStr> { list.0.into_iter().map(SmolStr::from).collect() };
355    ImageAnalysis::new()
356      .with_scene(self.scene.map(SmolStr::from).unwrap_or_default())
357      .with_description(self.description.map(SmolStr::from).unwrap_or_default())
358      .with_subjects(to_labels(self.subjects))
359      .with_objects(to_labels(self.objects))
360      .with_actions(to_labels(self.actions))
361      .with_mood(to_labels(self.mood))
362      .with_shot_type(self.shot_type.map(SmolStr::from).unwrap_or_default())
363      .with_lighting(to_labels(self.lighting))
364      .with_tags(self.tags.0.into_iter().map(SmolStr::from).collect())
365  }
366}
367
368/// Used for the `tags` field. The string fallback is split on commas /
369/// semicolons / newlines, because tag-list drift (model dropped the
370/// array around a flat comma-separated string) is the historically
371/// common case for that field.
372#[derive(Debug, Default)]
373struct TagList(Vec<String>);
374
375impl<'de> Deserialize<'de> for TagList {
376  fn deserialize<D>(deserializer: D) -> Result<Self, D::Error>
377  where
378    D: serde::Deserializer<'de>,
379  {
380    #[derive(Deserialize)]
381    #[serde(untagged)]
382    enum Repr {
383      String(String),
384      List(Vec<String>),
385    }
386
387    let raw = Option::<Repr>::deserialize(deserializer)?;
388    let mut values = Vec::new();
389    match raw {
390      // String fallback: model returned a flattened comma/semicolon/newline-
391      // separated tag string instead of an array (real production drift
392      // for the tags field).
393      Some(Repr::String(value)) => push_string_list_items(&mut values, &value),
394      // Array form: trim and dedupe each element verbatim. Do NOT split
395      // on commas — a tag like `"july 4, 2026"` must stay one entry.
396      Some(Repr::List(items)) => {
397        for item in items {
398          push_array_item(&mut values, item);
399        }
400      }
401      None => {}
402    }
403    Ok(Self(values))
404  }
405}
406
407/// Used for detection-array fields (`subjects`, `objects`, `actions`,
408/// `mood`, `lighting`). Detection labels can naturally contain commas
409/// (e.g. `"red, white, and blue flag"`, `"middle-aged man in red
410/// jacket, sunglasses"`). String-fallback splitting was wrong for
411/// these fields — caught the case
412/// where model drift could turn one comma-bearing label into three
413/// bogus detections.
414///
415/// Behavior:
416/// - JSON array: trim and dedupe each element verbatim (no splitting).
417/// - JSON string: treat as a single-element list. Single label, no
418///   comma-split. This preserves the data when the constrained
419///   decoder drifts to a scalar string (rare with `JsonSchema`
420///   constraint but defensive).
421/// - JSON null / missing: empty list.
422#[derive(Debug, Default)]
423struct DetectionLabels(Vec<String>);
424
425impl<'de> Deserialize<'de> for DetectionLabels {
426  fn deserialize<D>(deserializer: D) -> Result<Self, D::Error>
427  where
428    D: serde::Deserializer<'de>,
429  {
430    #[derive(Deserialize)]
431    #[serde(untagged)]
432    enum Repr {
433      String(String),
434      List(Vec<String>),
435    }
436
437    let raw = Option::<Repr>::deserialize(deserializer)?;
438    let mut values = Vec::new();
439    match raw {
440      // Single string → one detection label, no splitting.
441      Some(Repr::String(value)) => push_array_item(&mut values, value),
442      Some(Repr::List(items)) => {
443        for item in items {
444          push_array_item(&mut values, item);
445        }
446      }
447      None => {}
448    }
449    Ok(Self(values))
450  }
451}
452
/// Appends the trimmed form of `raw` to `values`, skipping it when it
/// is blank or already present (exact, case-sensitive match).
fn push_array_item(values: &mut Vec<String>, raw: String) {
  let label = raw.trim();
  if label.is_empty() {
    return;
  }
  let already_present = values.iter().any(|existing| existing.as_str() == label);
  if !already_present {
    values.push(label.to_owned());
  }
}
459
/// Splits `raw` on commas, semicolons and newlines and appends each
/// trimmed, non-blank piece to `values`, skipping pieces that are
/// already present (exact, case-sensitive match).
fn push_string_list_items(values: &mut Vec<String>, raw: &str) {
  let pieces = raw
    .split([',', ';', '\n'])
    .map(str::trim)
    .filter(|piece| !piece.is_empty());
  for piece in pieces {
    let is_new = values.iter().all(|existing| existing != piece);
    if is_new {
      values.push(piece.to_owned());
    }
  }
}
468
469fn deserialize_optional_trimmed_string<'de, D>(deserializer: D) -> Result<Option<String>, D::Error>
470where
471  D: serde::Deserializer<'de>,
472{
473  Ok(Option::<String>::deserialize(deserializer)?.and_then(normalize_string))
474}
475
476fn deserialize_optional_single_label<'de, D>(deserializer: D) -> Result<Option<String>, D::Error>
477where
478  D: serde::Deserializer<'de>,
479{
480  #[derive(Deserialize)]
481  #[serde(untagged)]
482  enum Repr {
483    String(String),
484    List(Vec<String>),
485  }
486
487  match Option::<Repr>::deserialize(deserializer)? {
488    Some(Repr::String(value)) => Ok(normalize_string(value)),
489    Some(Repr::List(values)) => {
490      let mut normalized = values.into_iter().filter_map(normalize_string);
491      let first = normalized.next();
492      if normalized.next().is_some() {
493        return Err(serde::de::Error::custom(
494          "expected a single shot_type label, got multiple values",
495        ));
496      }
497      Ok(first)
498    }
499    None => Ok(None),
500  }
501}
502
/// Trims `value`; returns `None` when nothing but whitespace remains,
/// otherwise the trimmed text as a new `String`.
fn normalize_string(value: String) -> Option<String> {
  match value.trim() {
    "" => None,
    trimmed => Some(trimmed.to_owned()),
  }
}
507
508#[cfg(test)]
509mod tests {
510  use smol_str::SmolStr;
511
512  use super::*;
513
  /// Invariant: `IMAGE_ANALYSIS_PROMPT` must not enumerate value
  /// tokens. mistralrs 0.8 applies
  /// `presence_penalty` over `seq.get_toks()` (prompt + generated),
  /// so any value-token in the prompt gets a `-presence_penalty`
  /// logit shift before generation — biasing the model away from
  /// legitimate matches in the deterministic-mode default. Format
  /// guidance must use descriptive constraints (word counts,
  /// lowercase, etc.) instead of enumerated examples.
  ///
  /// This guard catches accidental regressions (someone copy-pastes a
  /// new field-instruction line that includes `e.g. "..."` examples,
  /// or reverts to the earlier, example-laden prompt). It is not a
  /// defense against deliberate edits — a determined reverter can
  /// also remove tokens from this list.
  #[test]
  fn scene_prompt_does_not_enumerate_value_tokens() {
    // Compare case-insensitively: lowercase the prompt once up front.
    let prompt_lower = IMAGE_ANALYSIS_PROMPT.to_lowercase();
    // Distinctive multi-word phrases (and a few unambiguous
    // single words) that appeared in the earlier prompt's
    // `e.g.` enumerations across the nine field-instruction lines.
    let banned_tokens = [
      "stage performance",
      "middle-aged man",
      "golden retriever",
      "birthday cake",
      "vintage red sports car",
      "cutting cake",
      "taking photos",
      "wide shot",
      "close-up",
      "medium shot",
      "over-the-shoulder",
      "celebratory",
      "natural light",
      "low light",
      "backlit",
    ];
    for token in banned_tokens {
      assert!(
        !prompt_lower.contains(&token.to_lowercase()),
        "IMAGE_ANALYSIS_PROMPT must not enumerate value token {token:?} \
         (prompt-vocabulary tokens get \
         -presence_penalty logit shift in deterministic mode); \
         use descriptive format guidance (word counts, lowercase) \
         instead of `e.g. \"...\"` examples"
      );
    }
  }
562
563  // --- 7 ports verbatim from findit-qwen/src/lib.rs:1068-1124 ---
564
565  #[test]
566  fn parse_valid_json() {
567    let json = r#"{"scene":"beach","description":"Sunset over the ocean","subjects":["person"],"objects":["sun"],"actions":["watching"],"mood":["calm"],"shot_type":"wide shot","lighting":["golden hour"],"tags":["sunset","ocean"]}"#;
568    let task = ImageAnalysisTask::new();
569    let result = task.parse(json).expect("parse should succeed");
570    assert_eq!(result.scene(), "beach");
571    assert_eq!(result.description(), "Sunset over the ocean");
572    assert_eq!(result.mood().len(), 1);
573    assert_eq!(result.subjects().len(), 1);
574  }
575
576  #[test]
577  fn reject_json_with_wrapper_text() {
578    let text =
579      "Here is the analysis:\n{\"scene\":\"office\",\"description\":\"People working\"}\nDone.";
580    let task = ImageAnalysisTask::new();
581    assert!(task.parse(text).is_err());
582  }
583
584  #[test]
585  fn reject_plain_text_output() {
586    let text = "A beautiful sunset over the ocean.";
587    let task = ImageAnalysisTask::new();
588    assert!(task.parse(text).is_err());
589  }
590
591  #[test]
592  fn parse_comma_separated_tag_string() {
593    let json = r#"{"scene":"stage performance","description":"A singer on stage","subjects":[],"objects":["microphone"],"actions":["singing"],"mood":["energetic"],"shot_type":"medium shot","lighting":["spotlight"],"tags":"concert, live music, spotlight"}"#;
594    let task = ImageAnalysisTask::new();
595    let result = task.parse(json).expect("parse should succeed");
596    assert_eq!(
597      result.tags(),
598      &[
599        SmolStr::from("concert"),
600        SmolStr::from("live music"),
601        SmolStr::from("spotlight"),
602      ][..]
603    );
604  }
605
606  #[test]
607  fn reject_empty_json_payload() {
608    let task = ImageAnalysisTask::new();
609    assert!(task.parse("{}").is_err());
610  }
611
612  #[test]
613  fn reject_unknown_json_fields() {
614    let json = r#"{"description":"A singer on stage","extra":"unexpected"}"#;
615    let task = ImageAnalysisTask::new();
616    assert!(task.parse(json).is_err());
617  }
618
619  #[test]
620  fn reject_missing_required_fields() {
621    let json = r#"{"description":"A singer on stage","tags":["concert"]}"#;
622    let task = ImageAnalysisTask::new();
623    assert!(task.parse(json).is_err());
624  }
625
626  #[test]
627  fn parse_array_form_subjects() {
628    // Array form: each element becomes one detection, no splitting.
629    let json_list = r#"{"scene":"x","description":"y","subjects":["a","b"],"objects":[],"actions":[],"mood":[],"shot_type":"x","lighting":[],"tags":["t"]}"#;
630    let task = ImageAnalysisTask::new();
631    let result = task.parse(json_list).expect("list-form parse");
632    assert_eq!(result.subjects().len(), 2);
633    assert_eq!(result.subjects()[0], "a");
634    assert_eq!(result.subjects()[1], "b");
635  }
636
637  #[test]
638  fn subjects_string_form_treated_as_single_label() {
639    // Previously the string-fallback branch of StringList split
640    // scalar strings on commas, so a model drift to `"subjects":
641    // "red, white, and blue flag"` was silently turned into three
642    // bogus detections ("red", "white", "and blue
643    // flag"). The fix uses a separate `DetectionLabels` deserializer for
644    // detection-array fields that wraps the string as a single label —
645    // detection labels can naturally contain commas. (`tags` keeps
646    // comma-split behavior; that field is the historically common
647    // tag-list-as-string drift case and tests it separately.)
648    let json = r#"{"scene":"x","description":"y","subjects":"middle-aged man, in red jacket","objects":[],"actions":[],"mood":[],"shot_type":"x","lighting":[],"tags":["t"]}"#;
649    let task = ImageAnalysisTask::new();
650    let result = task.parse(json).expect("string-form parse");
651    assert_eq!(
652      result.subjects().len(),
653      1,
654      "string-form must wrap as a single label, not comma-split"
655    );
656    assert_eq!(result.subjects()[0], "middle-aged man, in red jacket");
657  }
658
659  /// Sparse payloads are a real failure mode in two distinct cases:
660  /// (a) decoder/model regression that overwrites real content with a
661  /// sparse output (only a few fields populated), and (b) a truly
662  /// low-information frame (blank, fade-to-black) where the model
663  /// legitimately complied with IMAGE_ANALYSIS_PROMPT's "Use empty arrays or
664  /// empty strings when a field is unknown" instruction. The default
665  /// behavior is to reject anything that lacks the indexable-content
666  /// threshold (`description` AND `tags` both populated) — surfacing
667  /// (a) as `JsonParseError::NoUsableFields` so the indexing pipeline
668  /// retries or skips. Callers that distinguish (b) elsewhere (e.g.,
669  /// through scenesdetect's keyframe scoring) opt into pass-through
670  /// via `ImageAnalysisTask::with_accept_empty(true)`.
671  ///
672  /// The cluster of tests below pins both halves of that contract:
673  /// the all-empty case, the partial-empty cases (tags-only,
674  /// scene-only, description-only), and the lower bound of
675  /// acceptance (description AND tags both populated, everything
676  /// else empty).
677  #[test]
678  fn reject_all_required_fields_empty_payload_by_default() {
679    let json = r#"{
680      "scene": "",
681      "description": "",
682      "subjects": [],
683      "objects": [],
684      "actions": [],
685      "mood": [],
686      "shot_type": "",
687      "lighting": [],
688      "tags": []
689    }"#;
690    let task = ImageAnalysisTask::new();
691    let err = task
692      .parse(json)
693      .expect_err("default ImageAnalysisTask must reject all-empty payload");
694    assert!(
695      matches!(err, JsonParseError::NoUsableFields),
696      "expected NoUsableFields, got {err:?}"
697    );
698  }
699
700  #[test]
701  fn accept_all_required_fields_empty_payload_when_opted_in() {
702    let json = r#"{
703      "scene": "",
704      "description": "",
705      "subjects": [],
706      "objects": [],
707      "actions": [],
708      "mood": [],
709      "shot_type": "",
710      "lighting": [],
711      "tags": []
712    }"#;
713    let task = ImageAnalysisTask::new().with_accept_empty(true);
714    let result = task
715      .parse(json)
716      .expect("opt-in must accept the all-empty payload");
717    assert!(result.scene().is_empty());
718    assert!(result.description().is_empty());
719    assert!(result.subjects().is_empty());
720    assert!(result.objects().is_empty());
721    assert!(result.actions().is_empty());
722    assert!(result.mood().is_empty());
723    assert!(result.shot_type().is_empty());
724    assert!(result.lighting().is_empty());
725    assert!(result.tags().is_empty());
726  }
727
728  /// A payload with only `tags` non-empty (description and every
729  /// detection bucket empty) carries keyword coverage but no
730  /// prose, and is more often a model
731  /// regression than a legitimate scene. The default predicate now
732  /// rejects it as `NoUsableFields` so the indexing pipeline doesn't
733  /// silently overwrite a rich record with a tags-only stub.
734  #[test]
735  fn reject_tags_only_payload_by_default() {
736    let json = r#"{
737      "scene": "",
738      "description": "",
739      "subjects": [],
740      "objects": [],
741      "actions": [],
742      "mood": [],
743      "shot_type": "",
744      "lighting": [],
745      "tags": ["concert", "live music"]
746    }"#;
747    let task = ImageAnalysisTask::new();
748    let err = task
749      .parse(json)
750      .expect_err("default ImageAnalysisTask must reject tags-only payload");
751    assert!(
752      matches!(err, JsonParseError::NoUsableFields),
753      "expected NoUsableFields, got {err:?}"
754    );
755  }
756
757  /// Companion to the tags-only case: a payload with only `scene`
758  /// populated has a single-label scene tag but lacks both prose
759  /// and tag coverage. Reject by default — the indexing pipeline
760  /// retries or skips.
761  #[test]
762  fn reject_scene_only_payload_by_default() {
763    let json = r#"{
764      "scene": "office",
765      "description": "",
766      "subjects": [],
767      "objects": [],
768      "actions": [],
769      "mood": [],
770      "shot_type": "",
771      "lighting": [],
772      "tags": []
773    }"#;
774    let task = ImageAnalysisTask::new();
775    let err = task
776      .parse(json)
777      .expect_err("default ImageAnalysisTask must reject scene-only payload");
778    assert!(
779      matches!(err, JsonParseError::NoUsableFields),
780      "expected NoUsableFields, got {err:?}"
781    );
782  }
783
784  /// Symmetric to tags-only: a payload with only `description`
785  /// populated carries prose but no keyword coverage. Reject by
786  /// default — the threshold is `description` AND `tags` both
787  /// populated.
788  #[test]
789  fn reject_description_only_payload_by_default() {
790    let json = r#"{
791      "scene": "",
792      "description": "People working in an office",
793      "subjects": [],
794      "objects": [],
795      "actions": [],
796      "mood": [],
797      "shot_type": "",
798      "lighting": [],
799      "tags": []
800    }"#;
801    let task = ImageAnalysisTask::new();
802    let err = task
803      .parse(json)
804      .expect_err("default ImageAnalysisTask must reject description-only payload");
805    assert!(
806      matches!(err, JsonParseError::NoUsableFields),
807      "expected NoUsableFields, got {err:?}"
808    );
809  }
810
811  /// Pins the lower bound of acceptance: `description` AND `tags`
812  /// both populated, everything else empty. The parser must accept
813  /// this minimal-but-indexable shape — it's not a regression, it's
814  /// a real scene whose semantic buckets the model couldn't classify.
815  #[test]
816  fn accept_minimal_indexable_payload() {
817    let json = r#"{
818      "scene": "",
819      "description": "Two people talking",
820      "subjects": [],
821      "objects": [],
822      "actions": [],
823      "mood": [],
824      "shot_type": "",
825      "lighting": [],
826      "tags": ["conversation"]
827    }"#;
828    let task = ImageAnalysisTask::new();
829    let result = task
830      .parse(json)
831      .expect("description+tags must clear the indexable threshold");
832    assert_eq!(result.description(), "Two people talking");
833    assert_eq!(result.tags(), &[SmolStr::from("conversation")][..]);
834    // Empty buckets are preserved, not coerced to defaults.
835    assert!(result.subjects().is_empty());
836    assert!(result.objects().is_empty());
837    assert!(result.scene().is_empty());
838  }
839
840  /// A payload with rich detection buckets (subjects + objects +
841  /// actions populated) but empty `description` and empty `tags`
842  /// carries real structured search metadata — the per-category
843  /// fields are the whole reason
844  /// this crate exposes them. The composite predicate accepts via
845  /// the detection-rich path so a partial decoder/model miss on
846  /// description+tags doesn't discard otherwise-indexable content.
847  #[test]
848  fn accept_detection_rich_payload_with_empty_description_and_tags() {
849    let json = r#"{
850      "scene": "",
851      "description": "",
852      "subjects": ["middle-aged woman in red dress"],
853      "objects": ["wedding cake"],
854      "actions": ["cutting cake"],
855      "mood": [],
856      "shot_type": "",
857      "lighting": [],
858      "tags": []
859    }"#;
860    let task = ImageAnalysisTask::new();
861    let result = task.parse(json).expect(
862      "detection-rich payload must clear the indexable threshold via \
863       the detection-bucket path even when description+tags are empty",
864    );
865    assert_eq!(result.subjects().len(), 1);
866    assert_eq!(result.objects().len(), 1);
867    assert_eq!(result.actions().len(), 1);
868    assert!(result.description().is_empty());
869    assert!(result.tags().is_empty());
870  }
871
872  /// Pins that a single substantive detection bucket (subjects
873  /// only here) is sufficient to clear the indexable threshold,
874  /// even with description and tags both empty. Locks down the
875  /// substantive-detection path of the
876  /// predicate so a future refactor can't accidentally narrow it
877  /// back to "all detection buckets non-empty". Companion tests
878  /// for `objects`-only and `actions`-only follow.
879  #[test]
880  fn accept_subjects_only_payload() {
881    let json = r#"{
882      "scene": "",
883      "description": "",
884      "subjects": ["a single subject label"],
885      "objects": [],
886      "actions": [],
887      "mood": [],
888      "shot_type": "",
889      "lighting": [],
890      "tags": []
891    }"#;
892    let task = ImageAnalysisTask::new();
893    let result = task
894      .parse(json)
895      .expect("subjects-only must clear the indexable threshold");
896    assert_eq!(result.subjects().len(), 1);
897  }
898
899  #[test]
900  fn accept_objects_only_payload() {
901    let json = r#"{
902      "scene": "",
903      "description": "",
904      "subjects": [],
905      "objects": ["a single object label"],
906      "actions": [],
907      "mood": [],
908      "shot_type": "",
909      "lighting": [],
910      "tags": []
911    }"#;
912    let task = ImageAnalysisTask::new();
913    let result = task
914      .parse(json)
915      .expect("objects-only must clear the indexable threshold");
916    assert_eq!(result.objects().len(), 1);
917  }
918
919  #[test]
920  fn accept_actions_only_payload() {
921    let json = r#"{
922      "scene": "",
923      "description": "",
924      "subjects": [],
925      "objects": [],
926      "actions": ["a single action label"],
927      "mood": [],
928      "shot_type": "",
929      "lighting": [],
930      "tags": []
931    }"#;
932    let task = ImageAnalysisTask::new();
933    let result = task
934      .parse(json)
935      .expect("actions-only must clear the indexable threshold");
936    assert_eq!(result.actions().len(), 1);
937  }
938
939  /// Style/attribute buckets (mood, lighting) are NOT substantive
940  /// on their own. A payload that populates only `mood: ["calm"]`
941  /// (description, tags, and all substantive detection buckets
942  /// empty) is more likely a model regression
943  /// than a legitimate scene where the model could detect mood but
944  /// nothing else, and writing that to the search index is the
945  /// failure this gate is designed to prevent.
946  #[test]
947  fn reject_mood_only_payload_by_default() {
948    let json = r#"{
949      "scene": "",
950      "description": "",
951      "subjects": [],
952      "objects": [],
953      "actions": [],
954      "mood": ["calm"],
955      "shot_type": "",
956      "lighting": [],
957      "tags": []
958    }"#;
959    let task = ImageAnalysisTask::new();
960    let err = task
961      .parse(json)
962      .expect_err("default ImageAnalysisTask must reject mood-only payload");
963    assert!(
964      matches!(err, JsonParseError::NoUsableFields),
965      "expected NoUsableFields, got {err:?}"
966    );
967  }
968
969  /// See `reject_mood_only_payload_by_default`. `lighting` is a
970  /// style/attribute bucket; lighting-only is a regression signal.
971  #[test]
972  fn reject_lighting_only_payload_by_default() {
973    let json = r#"{
974      "scene": "",
975      "description": "",
976      "subjects": [],
977      "objects": [],
978      "actions": [],
979      "mood": [],
980      "shot_type": "",
981      "lighting": ["natural light"],
982      "tags": []
983    }"#;
984    let task = ImageAnalysisTask::new();
985    let err = task
986      .parse(json)
987      .expect_err("default ImageAnalysisTask must reject lighting-only payload");
988    assert!(
989      matches!(err, JsonParseError::NoUsableFields),
990      "expected NoUsableFields, got {err:?}"
991    );
992  }
993
994  /// Companion to the two single-attribute reject tests: even when
995  /// BOTH style/attribute buckets are populated, without any
996  /// substantive content (subjects/objects/actions) and without
997  /// description+tags, the payload still fails the threshold. Pins
998  /// that the substantive-detection path can't be satisfied by
999  /// piling up attribute buckets — the categorical separation
1000  /// matters.
1001  #[test]
1002  fn reject_attribute_only_payload_by_default() {
1003    let json = r#"{
1004      "scene": "",
1005      "description": "",
1006      "subjects": [],
1007      "objects": [],
1008      "actions": [],
1009      "mood": ["tense"],
1010      "shot_type": "",
1011      "lighting": ["low light"],
1012      "tags": []
1013    }"#;
1014    let task = ImageAnalysisTask::new();
1015    let err = task
1016      .parse(json)
1017      .expect_err("style-attribute-only payload must reject regardless of bucket count");
1018    assert!(
1019      matches!(err, JsonParseError::NoUsableFields),
1020      "expected NoUsableFields, got {err:?}"
1021    );
1022  }
1023
1024  #[test]
1025  fn reject_null_required_array() {
1026    // Regression for the prior null-tolerance bug: a required array
1027    // field set to `null` was treated as an empty list
1028    // (because the array deserializer uses Option::<Repr>::deserialize
1029    // which maps null -> None). If at least one other field was non-
1030    // empty, the parse returned an Ok value with the null field
1031    // silently coerced to []. That hides constrained-
1032    // decoder drift and drops schema-required search content.
1033    //
1034    // The fix: missing_required_fields now flags both absent AND null
1035    // values, so this parse must return MissingFields("subjects").
1036    let json = r#"{
1037      "scene": "office",
1038      "description": "people working",
1039      "subjects": null,
1040      "objects": [],
1041      "actions": [],
1042      "mood": [],
1043      "shot_type": "wide",
1044      "lighting": [],
1045      "tags": ["work"]
1046    }"#;
1047    let task = ImageAnalysisTask::new();
1048    let err = task
1049      .parse(json)
1050      .expect_err("null required field must be rejected");
1051    match err {
1052      JsonParseError::MissingFields(fields) => {
1053        assert!(
1054          fields.contains(&"subjects"),
1055          "expected 'subjects' in MissingFields, got {fields:?}"
1056        );
1057      }
1058      other => panic!("expected MissingFields, got {other:?}"),
1059    }
1060  }
1061
1062  #[test]
1063  fn reject_null_required_string() {
1064    // Same hazard for string-typed required fields: deserialize_optional_
1065    // trimmed_string maps null -> None for `scene` / `description`. Without
1066    // the F2 fix, this would parse with scene=None and succeed because
1067    // tags is non-empty. Must be rejected.
1068    let json = r#"{
1069      "scene": null,
1070      "description": "people working",
1071      "subjects": ["person"],
1072      "objects": [],
1073      "actions": [],
1074      "mood": [],
1075      "shot_type": "wide",
1076      "lighting": [],
1077      "tags": ["work"]
1078    }"#;
1079    let task = ImageAnalysisTask::new();
1080    let err = task
1081      .parse(json)
1082      .expect_err("null required field must be rejected");
1083    match err {
1084      JsonParseError::MissingFields(fields) => {
1085        assert!(
1086          fields.contains(&"scene"),
1087          "expected 'scene' in MissingFields, got {fields:?}"
1088        );
1089      }
1090      other => panic!("expected MissingFields, got {other:?}"),
1091    }
1092  }
1093
1094  #[test]
1095  fn reject_multiple_null_required_fields() {
1096    // Multiple null fields must all be reported in one go.
1097    let json = r#"{
1098      "scene": null,
1099      "description": null,
1100      "subjects": null,
1101      "objects": [],
1102      "actions": [],
1103      "mood": [],
1104      "shot_type": "wide",
1105      "lighting": [],
1106      "tags": ["work"]
1107    }"#;
1108    let task = ImageAnalysisTask::new();
1109    let err = task
1110      .parse(json)
1111      .expect_err("null required fields must be rejected");
1112    match err {
1113      JsonParseError::MissingFields(fields) => {
1114        assert!(fields.contains(&"scene"), "missing 'scene' in {fields:?}");
1115        assert!(
1116          fields.contains(&"description"),
1117          "missing 'description' in {fields:?}"
1118        );
1119        assert!(
1120          fields.contains(&"subjects"),
1121          "missing 'subjects' in {fields:?}"
1122        );
1123      }
1124      other => panic!("expected MissingFields, got {other:?}"),
1125    }
1126  }
1127
1128  #[test]
1129  fn array_elements_are_not_comma_split() {
1130    // Regression: previously, the array branch ran every element
1131    // through the comma/semicolon/newline splitter. A valid
1132    // constrained response with a comma-containing label like
1133    // `"red, white, and blue flag"` would be corrupted into three
1134    // separate entries. The type split between `DetectionLabels`
1135    // (used here) and `TagList` makes this even stricter:
1136    // detection arrays never split on commas, even in the
1137    // string-fallback branch.
1138    let json = r#"{
1139      "scene": "patriotic event",
1140      "description": "Flag display",
1141      "subjects": ["middle-aged man, in red jacket"],
1142      "objects": ["red, white, and blue flag", "birthday cake with candles, balloons"],
1143      "actions": ["waving"],
1144      "mood": ["festive"],
1145      "shot_type": "wide shot",
1146      "lighting": ["natural, dramatic backlight"],
1147      "tags": ["july 4, 2026"]
1148    }"#;
1149    let task = ImageAnalysisTask::new();
1150    let result = task.parse(json).expect("parse should succeed");
1151    assert_eq!(result.subjects().len(), 1);
1152    assert_eq!(result.subjects()[0], "middle-aged man, in red jacket");
1153    assert_eq!(result.objects().len(), 2);
1154    assert_eq!(result.objects()[0], "red, white, and blue flag");
1155    assert_eq!(result.objects()[1], "birthday cake with candles, balloons");
1156    assert_eq!(result.lighting().len(), 1);
1157    assert_eq!(result.lighting()[0], "natural, dramatic backlight");
1158    assert_eq!(result.tags().len(), 1);
1159    assert_eq!(result.tags()[0].as_str(), "july 4, 2026");
1160  }
1161
1162  #[test]
1163  fn parse_shot_type_list_form() {
1164    // shot_type accepts the list form `["wide shot"]` (one element)
1165    // via `deserialize_optional_single_label`.
1166    let json_one = r#"{"scene":"x","description":"y","subjects":[],"objects":[],"actions":[],"mood":[],"shot_type":["wide shot"],"lighting":[],"tags":["t"]}"#;
1167    let task = ImageAnalysisTask::new();
1168    let result = task.parse(json_one).expect("single-element list parse");
1169    assert_eq!(result.shot_type(), "wide shot");
1170
1171    // Multi-element list is rejected.
1172    let json_many = r#"{"scene":"x","description":"y","subjects":[],"objects":[],"actions":[],"mood":[],"shot_type":["wide","close-up"],"lighting":[],"tags":["t"]}"#;
1173    assert!(task.parse(json_many).is_err());
1174  }
1175}
qwen3_vl/image_analysis.rs

qwen3_vl/
image_analysis.rs