use serde::Deserialize;
use serde_json::{Value, json};
use smol_str::SmolStr;
use llmtask::{JsonParseError, Task};
pub use llmtask::ImageAnalysis;
/// Instruction prompt returned by `ImageAnalysisTask::prompt`.
///
/// Asks the model for exactly one JSON object whose nine fields are the same
/// names listed in `REQUIRED_FIELDS` and described by `build_schema`.
///
/// The per-field guidance is phrased as format constraints (word counts,
/// lowercase, phrase shape) rather than example values: the
/// `scene_prompt_does_not_enumerate_value_tokens` test fails if concrete
/// value tokens are spelled out in this text.
const IMAGE_ANALYSIS_PROMPT: &str = r#"Analyze the following video keyframes (in chronological order) from a single scene.
Return ONLY a valid JSON object with exactly these fields:
scene: a single short scene-category label in lowercase English, 1-3 words, no full sentence.
description: 1-2 concise sentences in English describing the stable visual facts across the scene. Cover who is present, what they are doing, the setting, and the overall mood or visual style. If readable on-screen text appears, quote that text first, then continue the description.
subjects: array of distinct people or animals as short noun phrases (each 2-6 words) with visible distinguishing features.
objects: array of notable, search-relevant objects as short noun phrases (each 2-6 words).
actions: array of visible actions as short verb phrases (each 1-4 words).
mood: array of single-word or two-word adjectives describing the scene's overall emotional tone.
shot_type: a single short camera-shot label in lowercase English, 1-2 words (a cinematography term).
lighting: array of single-word or two-word lighting descriptors.
tags: array of 8-12 short English search tags in lowercase. Prefer high-confidence search terms, complementary synonyms, style words, and culture-specific terms only when visually supported.
Rules:
- Use only information supported by the keyframes.
- Prefer concrete visual facts over speculation.
- Keep arrays deduplicated.
- Use empty arrays or empty strings when a field is unknown.
- Do not return markdown or any text outside the JSON object."#;
/// Names of the top-level keys every model response must contain.
///
/// Used as the `required` list of the JSON schema in `build_schema` and by
/// `missing_required_fields`, which reports every entry that is absent or
/// `null` in a parsed response.
const REQUIRED_FIELDS: &[&str] = &[
"scene",
"description",
"subjects",
"objects",
"actions",
"mood",
"shot_type",
"lighting",
"tags",
];
/// Task definition for scene-level analysis of video keyframes.
///
/// Holds the JSON schema built at construction time together with the
/// `accept_empty` switch that `parse` consults before rejecting payloads
/// without indexable content.
///
/// `Debug` is derived so the task can be logged or embedded in other
/// `Debug`-deriving types, as expected of public types.
#[derive(Clone, Debug)]
pub struct ImageAnalysisTask {
    /// JSON schema describing the expected response object.
    schema: Value,
    /// When `false` (the value set by `new`), `parse` returns
    /// `JsonParseError::NoUsableFields` for payloads lacking indexable content.
    accept_empty: bool,
}
impl ImageAnalysisTask {
    /// Creates a task with a freshly built schema and `accept_empty` disabled.
    pub fn new() -> Self {
        let schema = build_schema();
        Self {
            schema,
            accept_empty: false,
        }
    }

    /// Returns the current value of the `accept_empty` switch.
    #[cfg_attr(not(tarpaulin), inline(always))]
    pub const fn accept_empty(&self) -> bool {
        self.accept_empty
    }

    /// Builder-style setter: consumes the task and returns it with
    /// `accept_empty` set to `val`.
    #[cfg_attr(not(tarpaulin), inline(always))]
    pub const fn with_accept_empty(mut self, val: bool) -> Self {
        self.accept_empty = val;
        self
    }

    /// In-place setter: stores `val` in `accept_empty` and returns `self`
    /// so further calls can be chained.
    #[cfg_attr(not(tarpaulin), inline(always))]
    pub const fn set_accept_empty(&mut self, val: bool) -> &mut Self {
        self.accept_empty = val;
        self
    }
}
impl Default for ImageAnalysisTask {
fn default() -> Self {
Self::new()
}
}
impl Task for ImageAnalysisTask {
type Output = ImageAnalysis;
type Value = serde_json::Value;
type ParseError = llmtask::JsonParseError;
fn prompt(&self) -> &str {
IMAGE_ANALYSIS_PROMPT
}
fn schema(&self) -> &serde_json::Value {
&self.schema
}
fn grammar(&self) -> llmtask::Grammar {
llmtask::Grammar::JsonSchema(self.schema.clone())
}
fn parse(&self, raw: &str) -> Result<Self::Output, JsonParseError> {
let value: Value = serde_json::from_str(raw.trim())?;
let object = value
.as_object()
.ok_or_else(|| JsonParseError::Json(serde::de::Error::custom("expected top-level object")))?;
let missing = missing_required_fields(object);
if !missing.is_empty() {
return Err(JsonParseError::MissingFields(missing));
}
let payload: LfmScenePayload = serde_json::from_value(value)?;
if !self.accept_empty && payload.lacks_indexable_content() {
return Err(JsonParseError::NoUsableFields);
}
Ok(payload.into_scene_analysis())
}
}
/// Builds the JSON schema for the expected response object.
///
/// Every property is either a plain string or an array of strings; all names
/// in `REQUIRED_FIELDS` are required and additional properties are forbidden.
fn build_schema() -> Value {
    // The two property shapes used by the schema.
    let text = || json!({ "type": "string" });
    let text_list = || json!({ "type": "array", "items": { "type": "string" } });

    json!({
        "type": "object",
        "properties": {
            "scene": text(),
            "description": text(),
            "subjects": text_list(),
            "objects": text_list(),
            "actions": text_list(),
            "mood": text_list(),
            "shot_type": text(),
            "lighting": text_list(),
            "tags": text_list()
        },
        "required": REQUIRED_FIELDS,
        "additionalProperties": false
    })
}
/// Lists, in `REQUIRED_FIELDS` order, every required key that is absent from
/// `object` or present with a `null` value.
fn missing_required_fields(object: &serde_json::Map<String, Value>) -> Vec<&'static str> {
    let mut missing = Vec::new();
    for &field in REQUIRED_FIELDS {
        // A key only counts as supplied when it maps to a non-null value.
        let supplied = matches!(object.get(field), Some(value) if !value.is_null());
        if !supplied {
            missing.push(field);
        }
    }
    missing
}
/// Leniently typed shape of the model response, deserialized before being
/// converted into an `ImageAnalysis`.
///
/// Keys that are not declared here are rejected (`deny_unknown_fields`).
/// Every field has a serde default, so absence is tolerated at this layer;
/// required-key enforcement is done separately by `missing_required_fields`.
#[derive(Debug, Default, Deserialize)]
#[serde(deny_unknown_fields)]
struct LfmScenePayload {
/// Scene label; trimmed, with an empty or whitespace-only value becoming `None`.
#[serde(default, deserialize_with = "deserialize_optional_trimmed_string")]
scene: Option<String>,
/// Free-text description; trimmed, with an empty or whitespace-only value becoming `None`.
#[serde(default, deserialize_with = "deserialize_optional_trimmed_string")]
description: Option<String>,
/// Subject labels; accepts an array of strings or a single string kept as one label.
#[serde(default)]
subjects: DetectionLabels,
/// Object labels; same accepted forms as `subjects`.
#[serde(default)]
objects: DetectionLabels,
/// Action labels; same accepted forms as `subjects`.
#[serde(default)]
actions: DetectionLabels,
/// Mood descriptors; same accepted forms as `subjects`.
#[serde(default)]
mood: DetectionLabels,
/// Camera-shot label; accepts a string or a list holding at most one non-empty entry.
#[serde(default, deserialize_with = "deserialize_optional_single_label")]
shot_type: Option<String>,
/// Lighting descriptors; same accepted forms as `subjects`.
#[serde(default)]
lighting: DetectionLabels,
/// Search tags; accepts an array of strings or one delimiter-separated string.
#[serde(default)]
tags: TagList,
}
impl LfmScenePayload {
    /// Returns `true` when the payload carries nothing worth indexing.
    ///
    /// A payload is considered indexable when it has either
    /// - a description together with at least one tag, or
    /// - at least one entry in `subjects`, `objects`, or `actions`.
    ///
    /// `scene`, `mood`, `shot_type`, and `lighting` are not consulted.
    fn lacks_indexable_content(&self) -> bool {
        let prose_with_tags = self.description.is_some() && !self.tags.0.is_empty();
        let any_detection = [&self.subjects, &self.objects, &self.actions]
            .iter()
            .any(|bucket| !bucket.0.is_empty());
        !(prose_with_tags || any_detection)
    }

    /// Converts the payload into an `ImageAnalysis`, mapping missing string
    /// fields to the empty `SmolStr` and every label to a `SmolStr`.
    fn into_scene_analysis(self) -> ImageAnalysis {
        /// Optional text field -> `SmolStr`, empty when absent.
        fn text(value: Option<String>) -> SmolStr {
            value.map(SmolStr::from).unwrap_or_default()
        }

        /// Label bucket -> vector of `SmolStr`, order preserved.
        fn labels(list: DetectionLabels) -> Vec<SmolStr> {
            list.0.into_iter().map(SmolStr::from).collect()
        }

        let Self {
            scene,
            description,
            subjects,
            objects,
            actions,
            mood,
            shot_type,
            lighting,
            tags,
        } = self;

        ImageAnalysis::new()
            .with_scene(text(scene))
            .with_description(text(description))
            .with_subjects(labels(subjects))
            .with_objects(labels(objects))
            .with_actions(labels(actions))
            .with_mood(labels(mood))
            .with_shot_type(text(shot_type))
            .with_lighting(labels(lighting))
            .with_tags(tags.0.into_iter().map(SmolStr::from).collect())
    }
}
/// Deduplicated, trimmed list of search tags.
///
/// Deserializes from `null`, a JSON array of strings, or a single string.
/// Only the single-string form is split on `,`, `;`, and newlines; array
/// elements are kept whole.
#[derive(Debug, Default)]
struct TagList(Vec<String>);

impl<'de> Deserialize<'de> for TagList {
    fn deserialize<D>(deserializer: D) -> Result<Self, D::Error>
    where
        D: serde::Deserializer<'de>,
    {
        /// Accepted wire forms for the tag field.
        #[derive(Deserialize)]
        #[serde(untagged)]
        enum Repr {
            String(String),
            List(Vec<String>),
        }

        let mut tags = Vec::new();
        if let Some(repr) = Option::<Repr>::deserialize(deserializer)? {
            match repr {
                // One string: treat it as a delimiter-separated list.
                Repr::String(joined) => push_string_list_items(&mut tags, &joined),
                // Array: each element is one tag, never split further.
                Repr::List(entries) => entries
                    .into_iter()
                    .for_each(|entry| push_array_item(&mut tags, entry)),
            }
        }
        Ok(Self(tags))
    }
}
/// Deduplicated, trimmed list of labels.
///
/// Deserializes from `null`, a JSON array of strings, or a single string.
/// A single string is kept as one label and is never split on delimiters.
#[derive(Debug, Default)]
struct DetectionLabels(Vec<String>);

impl<'de> Deserialize<'de> for DetectionLabels {
    fn deserialize<D>(deserializer: D) -> Result<Self, D::Error>
    where
        D: serde::Deserializer<'de>,
    {
        /// Accepted wire forms for a label field.
        #[derive(Deserialize)]
        #[serde(untagged)]
        enum Repr {
            String(String),
            List(Vec<String>),
        }

        // Normalize every accepted form to a plain list of candidates first.
        let candidates = match Option::<Repr>::deserialize(deserializer)? {
            Some(Repr::String(single)) => vec![single],
            Some(Repr::List(many)) => many,
            None => Vec::new(),
        };

        let mut labels = Vec::new();
        for candidate in candidates {
            push_array_item(&mut labels, candidate);
        }
        Ok(Self(labels))
    }
}
/// Appends the trimmed form of `raw` to `values` unless it is empty or an
/// identical entry is already present.
fn push_array_item(values: &mut Vec<String>, raw: String) {
    let label = raw.trim();
    if label.is_empty() {
        return;
    }
    let already_present = values.iter().any(|seen| seen == label);
    if !already_present {
        values.push(label.to_owned());
    }
}
/// Splits `raw` on `,`, `;`, and newlines and appends each trimmed, non-empty
/// piece to `values`, skipping pieces that are already present.
fn push_string_list_items(values: &mut Vec<String>, raw: &str) {
    let pieces = raw
        .split(|c: char| matches!(c, ',' | ';' | '\n'))
        .map(str::trim)
        .filter(|piece| !piece.is_empty());

    for piece in pieces {
        if values.iter().all(|seen| seen != piece) {
            values.push(piece.to_owned());
        }
    }
}
/// Serde helper: reads an optional string and normalizes it with
/// `normalize_string`, so `null`, empty, and whitespace-only values all
/// become `None`.
fn deserialize_optional_trimmed_string<'de, D>(deserializer: D) -> Result<Option<String>, D::Error>
where
    D: serde::Deserializer<'de>,
{
    let raw = Option::<String>::deserialize(deserializer)?;
    Ok(raw.and_then(normalize_string))
}
fn deserialize_optional_single_label<'de, D>(deserializer: D) -> Result<Option<String>, D::Error>
where
D: serde::Deserializer<'de>,
{
#[derive(Deserialize)]
#[serde(untagged)]
enum Repr {
String(String),
List(Vec<String>),
}
match Option::<Repr>::deserialize(deserializer)? {
Some(Repr::String(value)) => Ok(normalize_string(value)),
Some(Repr::List(values)) => {
let mut normalized = values.into_iter().filter_map(normalize_string);
let first = normalized.next();
if normalized.next().is_some() {
return Err(serde::de::Error::custom(
"expected a single shot_type label, got multiple values",
));
}
Ok(first)
}
None => Ok(None),
}
}
/// Trims `value` and returns it as an owned string, or `None` when nothing
/// but whitespace remains.
fn normalize_string(value: String) -> Option<String> {
    match value.trim() {
        "" => None,
        kept => Some(kept.to_owned()),
    }
}
// Unit tests for `ImageAnalysisTask::parse` and the prompt text.
//
// Multi-line JSON payloads are raw strings, so their content lines are kept
// unindented exactly as written.
#[cfg(test)]
mod tests {
    use smol_str::SmolStr;
    use super::*;

    // The prompt must describe value formats without spelling out example values.
    #[test]
    fn scene_prompt_does_not_enumerate_value_tokens() {
        let prompt_lower = IMAGE_ANALYSIS_PROMPT.to_lowercase();
        let banned_tokens = [
            "stage performance",
            "middle-aged man",
            "golden retriever",
            "birthday cake",
            "vintage red sports car",
            "cutting cake",
            "taking photos",
            "wide shot",
            "close-up",
            "medium shot",
            "over-the-shoulder",
            "celebratory",
            "natural light",
            "low light",
            "backlit",
        ];
        for token in banned_tokens {
            assert!(
                !prompt_lower.contains(&token.to_lowercase()),
                "IMAGE_ANALYSIS_PROMPT must not enumerate value token {token:?} \
                (prompt-vocabulary tokens get \
                -presence_penalty logit shift in deterministic mode); \
                use descriptive format guidance (word counts, lowercase) \
                instead of `e.g. \"...\"` examples"
            );
        }
    }

    // A fully populated payload parses into the expected fields.
    #[test]
    fn parse_valid_json() {
        let json = r#"{"scene":"beach","description":"Sunset over the ocean","subjects":["person"],"objects":["sun"],"actions":["watching"],"mood":["calm"],"shot_type":"wide shot","lighting":["golden hour"],"tags":["sunset","ocean"]}"#;
        let task = ImageAnalysisTask::new();
        let result = task.parse(json).expect("parse should succeed");
        assert_eq!(result.scene(), "beach");
        assert_eq!(result.description(), "Sunset over the ocean");
        assert_eq!(result.mood().len(), 1);
        assert_eq!(result.subjects().len(), 1);
    }

    // Text surrounding the JSON object makes the whole input invalid.
    #[test]
    fn reject_json_with_wrapper_text() {
        let text =
            "Here is the analysis:\n{\"scene\":\"office\",\"description\":\"People working\"}\nDone.";
        let task = ImageAnalysisTask::new();
        assert!(task.parse(text).is_err());
    }

    #[test]
    fn reject_plain_text_output() {
        let text = "A beautiful sunset over the ocean.";
        let task = ImageAnalysisTask::new();
        assert!(task.parse(text).is_err());
    }

    // A single string in `tags` is split on delimiters.
    #[test]
    fn parse_comma_separated_tag_string() {
        let json = r#"{"scene":"stage performance","description":"A singer on stage","subjects":[],"objects":["microphone"],"actions":["singing"],"mood":["energetic"],"shot_type":"medium shot","lighting":["spotlight"],"tags":"concert, live music, spotlight"}"#;
        let task = ImageAnalysisTask::new();
        let result = task.parse(json).expect("parse should succeed");
        assert_eq!(
            result.tags(),
            &[
                SmolStr::from("concert"),
                SmolStr::from("live music"),
                SmolStr::from("spotlight"),
            ][..]
        );
    }

    #[test]
    fn reject_empty_json_payload() {
        let task = ImageAnalysisTask::new();
        assert!(task.parse("{}").is_err());
    }

    // Every required field is present, so the only thing that can fail is the
    // unknown `extra` key. With required fields omitted the parse would fail at
    // the missing-field check and never reach `deny_unknown_fields`.
    #[test]
    fn reject_unknown_json_fields() {
        let json = r#"{"scene":"stage","description":"A singer on stage","subjects":[],"objects":["microphone"],"actions":["singing"],"mood":["energetic"],"shot_type":"medium shot","lighting":["spotlight"],"tags":["concert"],"extra":"unexpected"}"#;
        let task = ImageAnalysisTask::new();
        let err = task
            .parse(json)
            .expect_err("payload with an unknown field must be rejected");
        assert!(
            !matches!(
                err,
                JsonParseError::MissingFields(_) | JsonParseError::NoUsableFields
            ),
            "expected the unknown field itself to be rejected, got {err:?}"
        );
    }

    #[test]
    fn reject_missing_required_fields() {
        let json = r#"{"description":"A singer on stage","tags":["concert"]}"#;
        let task = ImageAnalysisTask::new();
        assert!(task.parse(json).is_err());
    }

    #[test]
    fn parse_array_form_subjects() {
        let json_list = r#"{"scene":"x","description":"y","subjects":["a","b"],"objects":[],"actions":[],"mood":[],"shot_type":"x","lighting":[],"tags":["t"]}"#;
        let task = ImageAnalysisTask::new();
        let result = task.parse(json_list).expect("list-form parse");
        assert_eq!(result.subjects().len(), 2);
        assert_eq!(result.subjects()[0], "a");
        assert_eq!(result.subjects()[1], "b");
    }

    // Unlike `tags`, a string-form label field is kept whole.
    #[test]
    fn subjects_string_form_treated_as_single_label() {
        let json = r#"{"scene":"x","description":"y","subjects":"middle-aged man, in red jacket","objects":[],"actions":[],"mood":[],"shot_type":"x","lighting":[],"tags":["t"]}"#;
        let task = ImageAnalysisTask::new();
        let result = task.parse(json).expect("string-form parse");
        assert_eq!(
            result.subjects().len(),
            1,
            "string-form must wrap as a single label, not comma-split"
        );
        assert_eq!(result.subjects()[0], "middle-aged man, in red jacket");
    }

    #[test]
    fn reject_all_required_fields_empty_payload_by_default() {
        let json = r#"{
"scene": "",
"description": "",
"subjects": [],
"objects": [],
"actions": [],
"mood": [],
"shot_type": "",
"lighting": [],
"tags": []
}"#;
        let task = ImageAnalysisTask::new();
        let err = task
            .parse(json)
            .expect_err("default ImageAnalysisTask must reject all-empty payload");
        assert!(
            matches!(err, JsonParseError::NoUsableFields),
            "expected NoUsableFields, got {err:?}"
        );
    }

    #[test]
    fn accept_all_required_fields_empty_payload_when_opted_in() {
        let json = r#"{
"scene": "",
"description": "",
"subjects": [],
"objects": [],
"actions": [],
"mood": [],
"shot_type": "",
"lighting": [],
"tags": []
}"#;
        let task = ImageAnalysisTask::new().with_accept_empty(true);
        let result = task
            .parse(json)
            .expect("opt-in must accept the all-empty payload");
        assert!(result.scene().is_empty());
        assert!(result.description().is_empty());
        assert!(result.subjects().is_empty());
        assert!(result.objects().is_empty());
        assert!(result.actions().is_empty());
        assert!(result.mood().is_empty());
        assert!(result.shot_type().is_empty());
        assert!(result.lighting().is_empty());
        assert!(result.tags().is_empty());
    }

    #[test]
    fn reject_tags_only_payload_by_default() {
        let json = r#"{
"scene": "",
"description": "",
"subjects": [],
"objects": [],
"actions": [],
"mood": [],
"shot_type": "",
"lighting": [],
"tags": ["concert", "live music"]
}"#;
        let task = ImageAnalysisTask::new();
        let err = task
            .parse(json)
            .expect_err("default ImageAnalysisTask must reject tags-only payload");
        assert!(
            matches!(err, JsonParseError::NoUsableFields),
            "expected NoUsableFields, got {err:?}"
        );
    }

    #[test]
    fn reject_scene_only_payload_by_default() {
        let json = r#"{
"scene": "office",
"description": "",
"subjects": [],
"objects": [],
"actions": [],
"mood": [],
"shot_type": "",
"lighting": [],
"tags": []
}"#;
        let task = ImageAnalysisTask::new();
        let err = task
            .parse(json)
            .expect_err("default ImageAnalysisTask must reject scene-only payload");
        assert!(
            matches!(err, JsonParseError::NoUsableFields),
            "expected NoUsableFields, got {err:?}"
        );
    }

    #[test]
    fn reject_description_only_payload_by_default() {
        let json = r#"{
"scene": "",
"description": "People working in an office",
"subjects": [],
"objects": [],
"actions": [],
"mood": [],
"shot_type": "",
"lighting": [],
"tags": []
}"#;
        let task = ImageAnalysisTask::new();
        let err = task
            .parse(json)
            .expect_err("default ImageAnalysisTask must reject description-only payload");
        assert!(
            matches!(err, JsonParseError::NoUsableFields),
            "expected NoUsableFields, got {err:?}"
        );
    }

    // Description plus at least one tag is the minimal prose-based indexable payload.
    #[test]
    fn accept_minimal_indexable_payload() {
        let json = r#"{
"scene": "",
"description": "Two people talking",
"subjects": [],
"objects": [],
"actions": [],
"mood": [],
"shot_type": "",
"lighting": [],
"tags": ["conversation"]
}"#;
        let task = ImageAnalysisTask::new();
        let result = task
            .parse(json)
            .expect("description+tags must clear the indexable threshold");
        assert_eq!(result.description(), "Two people talking");
        assert_eq!(result.tags(), &[SmolStr::from("conversation")][..]);
        assert!(result.subjects().is_empty());
        assert!(result.objects().is_empty());
        assert!(result.scene().is_empty());
    }

    #[test]
    fn accept_detection_rich_payload_with_empty_description_and_tags() {
        let json = r#"{
"scene": "",
"description": "",
"subjects": ["middle-aged woman in red dress"],
"objects": ["wedding cake"],
"actions": ["cutting cake"],
"mood": [],
"shot_type": "",
"lighting": [],
"tags": []
}"#;
        let task = ImageAnalysisTask::new();
        let result = task.parse(json).expect(
            "detection-rich payload must clear the indexable threshold via \
            the detection-bucket path even when description+tags are empty",
        );
        assert_eq!(result.subjects().len(), 1);
        assert_eq!(result.objects().len(), 1);
        assert_eq!(result.actions().len(), 1);
        assert!(result.description().is_empty());
        assert!(result.tags().is_empty());
    }

    #[test]
    fn accept_subjects_only_payload() {
        let json = r#"{
"scene": "",
"description": "",
"subjects": ["a single subject label"],
"objects": [],
"actions": [],
"mood": [],
"shot_type": "",
"lighting": [],
"tags": []
}"#;
        let task = ImageAnalysisTask::new();
        let result = task
            .parse(json)
            .expect("subjects-only must clear the indexable threshold");
        assert_eq!(result.subjects().len(), 1);
    }

    #[test]
    fn accept_objects_only_payload() {
        let json = r#"{
"scene": "",
"description": "",
"subjects": [],
"objects": ["a single object label"],
"actions": [],
"mood": [],
"shot_type": "",
"lighting": [],
"tags": []
}"#;
        let task = ImageAnalysisTask::new();
        let result = task
            .parse(json)
            .expect("objects-only must clear the indexable threshold");
        assert_eq!(result.objects().len(), 1);
    }

    #[test]
    fn accept_actions_only_payload() {
        let json = r#"{
"scene": "",
"description": "",
"subjects": [],
"objects": [],
"actions": ["a single action label"],
"mood": [],
"shot_type": "",
"lighting": [],
"tags": []
}"#;
        let task = ImageAnalysisTask::new();
        let result = task
            .parse(json)
            .expect("actions-only must clear the indexable threshold");
        assert_eq!(result.actions().len(), 1);
    }

    #[test]
    fn reject_mood_only_payload_by_default() {
        let json = r#"{
"scene": "",
"description": "",
"subjects": [],
"objects": [],
"actions": [],
"mood": ["calm"],
"shot_type": "",
"lighting": [],
"tags": []
}"#;
        let task = ImageAnalysisTask::new();
        let err = task
            .parse(json)
            .expect_err("default ImageAnalysisTask must reject mood-only payload");
        assert!(
            matches!(err, JsonParseError::NoUsableFields),
            "expected NoUsableFields, got {err:?}"
        );
    }

    #[test]
    fn reject_lighting_only_payload_by_default() {
        let json = r#"{
"scene": "",
"description": "",
"subjects": [],
"objects": [],
"actions": [],
"mood": [],
"shot_type": "",
"lighting": ["natural light"],
"tags": []
}"#;
        let task = ImageAnalysisTask::new();
        let err = task
            .parse(json)
            .expect_err("default ImageAnalysisTask must reject lighting-only payload");
        assert!(
            matches!(err, JsonParseError::NoUsableFields),
            "expected NoUsableFields, got {err:?}"
        );
    }

    #[test]
    fn reject_attribute_only_payload_by_default() {
        let json = r#"{
"scene": "",
"description": "",
"subjects": [],
"objects": [],
"actions": [],
"mood": ["tense"],
"shot_type": "",
"lighting": ["low light"],
"tags": []
}"#;
        let task = ImageAnalysisTask::new();
        let err = task
            .parse(json)
            .expect_err("style-attribute-only payload must reject regardless of bucket count");
        assert!(
            matches!(err, JsonParseError::NoUsableFields),
            "expected NoUsableFields, got {err:?}"
        );
    }

    // `null` in a required field is reported as a missing field by name.
    #[test]
    fn reject_null_required_array() {
        let json = r#"{
"scene": "office",
"description": "people working",
"subjects": null,
"objects": [],
"actions": [],
"mood": [],
"shot_type": "wide",
"lighting": [],
"tags": ["work"]
}"#;
        let task = ImageAnalysisTask::new();
        let err = task
            .parse(json)
            .expect_err("null required field must be rejected");
        match err {
            JsonParseError::MissingFields(fields) => {
                assert!(
                    fields.contains(&"subjects"),
                    "expected 'subjects' in MissingFields, got {fields:?}"
                );
            }
            other => panic!("expected MissingFields, got {other:?}"),
        }
    }

    #[test]
    fn reject_null_required_string() {
        let json = r#"{
"scene": null,
"description": "people working",
"subjects": ["person"],
"objects": [],
"actions": [],
"mood": [],
"shot_type": "wide",
"lighting": [],
"tags": ["work"]
}"#;
        let task = ImageAnalysisTask::new();
        let err = task
            .parse(json)
            .expect_err("null required field must be rejected");
        match err {
            JsonParseError::MissingFields(fields) => {
                assert!(
                    fields.contains(&"scene"),
                    "expected 'scene' in MissingFields, got {fields:?}"
                );
            }
            other => panic!("expected MissingFields, got {other:?}"),
        }
    }

    #[test]
    fn reject_multiple_null_required_fields() {
        let json = r#"{
"scene": null,
"description": null,
"subjects": null,
"objects": [],
"actions": [],
"mood": [],
"shot_type": "wide",
"lighting": [],
"tags": ["work"]
}"#;
        let task = ImageAnalysisTask::new();
        let err = task
            .parse(json)
            .expect_err("null required fields must be rejected");
        match err {
            JsonParseError::MissingFields(fields) => {
                assert!(fields.contains(&"scene"), "missing 'scene' in {fields:?}");
                assert!(
                    fields.contains(&"description"),
                    "missing 'description' in {fields:?}"
                );
                assert!(
                    fields.contains(&"subjects"),
                    "missing 'subjects' in {fields:?}"
                );
            }
            other => panic!("expected MissingFields, got {other:?}"),
        }
    }

    // Array elements containing commas stay intact in every list field, tags included.
    #[test]
    fn array_elements_are_not_comma_split() {
        let json = r#"{
"scene": "patriotic event",
"description": "Flag display",
"subjects": ["middle-aged man, in red jacket"],
"objects": ["red, white, and blue flag", "birthday cake with candles, balloons"],
"actions": ["waving"],
"mood": ["festive"],
"shot_type": "wide shot",
"lighting": ["natural, dramatic backlight"],
"tags": ["july 4, 2026"]
}"#;
        let task = ImageAnalysisTask::new();
        let result = task.parse(json).expect("parse should succeed");
        assert_eq!(result.subjects().len(), 1);
        assert_eq!(result.subjects()[0], "middle-aged man, in red jacket");
        assert_eq!(result.objects().len(), 2);
        assert_eq!(result.objects()[0], "red, white, and blue flag");
        assert_eq!(result.objects()[1], "birthday cake with candles, balloons");
        assert_eq!(result.lighting().len(), 1);
        assert_eq!(result.lighting()[0], "natural, dramatic backlight");
        assert_eq!(result.tags().len(), 1);
        assert_eq!(result.tags()[0].as_str(), "july 4, 2026");
    }

    // `shot_type` tolerates a one-element list but rejects several values.
    #[test]
    fn parse_shot_type_list_form() {
        let json_one = r#"{"scene":"x","description":"y","subjects":[],"objects":[],"actions":[],"mood":[],"shot_type":["wide shot"],"lighting":[],"tags":["t"]}"#;
        let task = ImageAnalysisTask::new();
        let result = task.parse(json_one).expect("single-element list parse");
        assert_eq!(result.shot_type(), "wide shot");
        let json_many = r#"{"scene":"x","description":"y","subjects":[],"objects":[],"actions":[],"mood":[],"shot_type":["wide","close-up"],"lighting":[],"tags":["t"]}"#;
        assert!(task.parse(json_many).is_err());
    }
}