//! `roder_api/media.rs`

1use serde::{Deserialize, Serialize};
2use time::OffsetDateTime;
3
/// Identifier of a media artifact; a plain `String` alias, so no format is
/// enforced by the type itself.
pub type MediaArtifactId = String;
5
/// Provider id set aside for the offline, deterministic image generator.
pub const FAKE_MEDIA_PROVIDER_ID: &str = "fake";
8
9#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq)]
10#[serde(rename_all = "camelCase")]
11pub enum MediaKind {
12    Image,
13    Video,
14    Audio,
15    Other,
16}
17
18#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq)]
19#[serde(rename_all = "camelCase")]
20pub enum MediaPreviewStrategy {
21    InlineImage,
22    Thumbnail,
23    MetadataOnly,
24}
25
26#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq)]
27#[serde(rename_all = "camelCase")]
28pub struct MediaDimensions {
29    pub width: u32,
30    pub height: u32,
31}
32
33/// Provider-reported metadata about how an artifact was generated. Persisted
34/// alongside the artifact so safety/watermark provenance survives restarts.
35#[derive(Debug, Clone, Default, Serialize, Deserialize, PartialEq, Eq)]
36#[serde(rename_all = "camelCase")]
37pub struct MediaGenerationMetadata {
38    pub provider: String,
39    #[serde(default, skip_serializing_if = "Option::is_none")]
40    pub model: Option<String>,
41    #[serde(default, skip_serializing_if = "Option::is_none")]
42    pub revised_prompt: Option<String>,
43    /// Watermark scheme applied by the provider, e.g. `synthid`.
44    #[serde(default, skip_serializing_if = "Option::is_none")]
45    pub watermark: Option<String>,
46    #[serde(default, skip_serializing_if = "Option::is_none")]
47    pub safety: Option<String>,
48    #[serde(default, skip_serializing_if = "Option::is_none")]
49    pub provider_response_id: Option<String>,
50}
51
52#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq)]
53#[serde(rename_all = "camelCase")]
54pub struct MediaArtifact {
55    pub id: MediaArtifactId,
56    pub kind: MediaKind,
57    pub mime_type: String,
58    #[serde(default, skip_serializing_if = "Option::is_none")]
59    pub dimensions: Option<MediaDimensions>,
60    #[serde(default, skip_serializing_if = "Option::is_none")]
61    pub duration_millis: Option<u64>,
62    pub byte_size: u64,
63    pub provider: String,
64    pub prompt_hash: String,
65    pub store_path: String,
66    #[serde(default, skip_serializing_if = "Option::is_none")]
67    pub thumbnail_path: Option<String>,
68    #[serde(default, skip_serializing_if = "Option::is_none")]
69    pub generation: Option<MediaGenerationMetadata>,
70    #[serde(with = "time::serde::rfc3339")]
71    pub created_at: OffsetDateTime,
72    #[serde(default)]
73    pub roder_owned: bool,
74}
75
76#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq)]
77#[serde(rename_all = "camelCase")]
78pub struct MediaPreview {
79    pub artifact_id: MediaArtifactId,
80    pub strategy: MediaPreviewStrategy,
81    #[serde(default, skip_serializing_if = "Option::is_none")]
82    pub thumbnail_path: Option<String>,
83    pub fallback_label: String,
84    #[serde(default, skip_serializing_if = "Option::is_none")]
85    pub warning: Option<String>,
86}
87
88#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq)]
89#[serde(rename_all = "camelCase")]
90pub struct MediaAttachment {
91    pub artifact_id: MediaArtifactId,
92    pub mime_type: String,
93    pub data_url: String,
94}
95
96#[derive(Debug, Clone, Copy, Serialize, Deserialize, PartialEq, Eq)]
97#[serde(rename_all = "camelCase")]
98pub enum ImageGenerationAction {
99    Auto,
100    Generate,
101    Edit,
102}
103
104/// Inline reference/edit image input passed to an image provider.
105#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq)]
106#[serde(rename_all = "camelCase")]
107pub struct MediaImageInput {
108    pub bytes_base64: String,
109    pub mime_type: String,
110}
111
112/// Canonical provider-neutral media generation request. All option fields are
113/// optional so legacy `{ "prompt": ... }` tool arguments keep decoding.
114#[derive(Debug, Clone, Default, Serialize, Deserialize, PartialEq)]
115#[serde(rename_all = "camelCase")]
116pub struct MediaGenerationRequest {
117    #[serde(default)]
118    pub prompt: String,
119    #[serde(default, skip_serializing_if = "Option::is_none")]
120    pub provider: Option<String>,
121    #[serde(default, skip_serializing_if = "Option::is_none")]
122    pub model: Option<String>,
123    #[serde(default, skip_serializing_if = "Option::is_none")]
124    pub action: Option<ImageGenerationAction>,
125    /// Roder artifact ids resolved into [`Self::input_images`] before the
126    /// provider call.
127    #[serde(default, skip_serializing_if = "Vec::is_empty")]
128    pub input_artifacts: Vec<MediaArtifactId>,
129    #[serde(default, skip_serializing_if = "Vec::is_empty")]
130    pub input_images: Vec<MediaImageInput>,
131    #[serde(default, skip_serializing_if = "Option::is_none")]
132    pub count: Option<u32>,
133    #[serde(default, skip_serializing_if = "Option::is_none")]
134    pub aspect_ratio: Option<String>,
135    /// Pixel size such as `1536x1024` (OpenAI Image API style).
136    #[serde(default, skip_serializing_if = "Option::is_none")]
137    pub size: Option<String>,
138    /// Resolution tier such as `1K`, `2K`, or `4K` (Gemini image config style).
139    #[serde(default, skip_serializing_if = "Option::is_none")]
140    pub image_size: Option<String>,
141    #[serde(default, skip_serializing_if = "Option::is_none")]
142    pub quality: Option<String>,
143    #[serde(default, skip_serializing_if = "Option::is_none")]
144    pub output_format: Option<String>,
145    #[serde(default, skip_serializing_if = "Option::is_none")]
146    pub background: Option<String>,
147    /// 0-100 compression for lossy output formats.
148    #[serde(default, skip_serializing_if = "Option::is_none")]
149    pub output_compression: Option<u8>,
150    #[serde(default, skip_serializing_if = "Option::is_none")]
151    pub moderation: Option<String>,
152    /// Requested partial-image preview count where the provider supports
153    /// streaming; providers that do not support it must reject or ignore it
154    /// explicitly rather than silently stream.
155    #[serde(default, skip_serializing_if = "Option::is_none")]
156    pub partial_images: Option<u32>,
157    #[serde(default, skip_serializing_if = "Option::is_none")]
158    pub output_path: Option<String>,
159    /// Bounded, documented pass-through settings for one provider. Values are
160    /// redacted from transcripts like all other request fields and must not
161    /// change safety or storage semantics.
162    #[serde(default, skip_serializing_if = "Option::is_none")]
163    pub provider_options: Option<serde_json::Map<String, serde_json::Value>>,
164}
165
166#[derive(Debug, Clone, Default, Serialize, Deserialize, PartialEq, Eq)]
167#[serde(rename_all = "camelCase")]
168pub struct MediaGenerationUsage {
169    #[serde(default, skip_serializing_if = "Option::is_none")]
170    pub input_tokens: Option<u64>,
171    #[serde(default, skip_serializing_if = "Option::is_none")]
172    pub input_image_tokens: Option<u64>,
173    #[serde(default, skip_serializing_if = "Option::is_none")]
174    pub output_tokens: Option<u64>,
175    #[serde(default, skip_serializing_if = "Option::is_none")]
176    pub total_tokens: Option<u64>,
177}
178
179#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq)]
180#[serde(rename_all = "camelCase")]
181pub struct MediaGenerationOutput {
182    pub artifact: MediaArtifact,
183    pub preview: MediaPreview,
184    #[serde(default, skip_serializing_if = "Option::is_none")]
185    pub revised_prompt: Option<String>,
186}
187
188#[derive(Debug, Clone, Serialize, Deserialize, PartialEq)]
189#[serde(rename_all = "camelCase")]
190pub struct MediaGenerationResponse {
191    pub provider: String,
192    #[serde(default, skip_serializing_if = "Option::is_none")]
193    pub model: Option<String>,
194    pub outputs: Vec<MediaGenerationOutput>,
195    #[serde(default, skip_serializing_if = "Option::is_none")]
196    pub revised_prompt: Option<String>,
197    #[serde(default, skip_serializing_if = "Option::is_none")]
198    pub provider_response_id: Option<String>,
199    #[serde(default, skip_serializing_if = "Option::is_none")]
200    pub usage: Option<MediaGenerationUsage>,
201    /// Watermark scheme applied to every output, e.g. `synthid`.
202    #[serde(default, skip_serializing_if = "Option::is_none")]
203    pub watermark: Option<String>,
204    #[serde(default, skip_serializing_if = "Option::is_none")]
205    pub safety: Option<String>,
206    /// Provider-reported errors for individual requested outputs that were
207    /// not generated, while the rest of the batch succeeded.
208    #[serde(default, skip_serializing_if = "Vec::is_empty")]
209    pub output_errors: Vec<String>,
210}
211
212impl MediaGenerationResponse {
213    pub fn primary_artifact(&self) -> Option<&MediaArtifact> {
214        self.outputs.first().map(|output| &output.artifact)
215    }
216}
217
218/// One raw provider output, before Roder persists it as an artifact.
219#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq)]
220#[serde(rename_all = "camelCase")]
221pub struct GeneratedImage {
222    pub bytes_base64: String,
223    pub mime_type: String,
224    #[serde(default, skip_serializing_if = "Option::is_none")]
225    pub dimensions: Option<MediaDimensions>,
226    #[serde(default, skip_serializing_if = "Option::is_none")]
227    pub revised_prompt: Option<String>,
228    #[serde(default, skip_serializing_if = "Option::is_none")]
229    pub watermark: Option<String>,
230    #[serde(default, skip_serializing_if = "Option::is_none")]
231    pub safety: Option<String>,
232}
233
234/// Provider result for one image generation call, prior to artifact storage.
235#[derive(Debug, Clone, Default, Serialize, Deserialize, PartialEq)]
236#[serde(rename_all = "camelCase")]
237pub struct ImageGenerationBatch {
238    pub provider: String,
239    pub model: String,
240    pub images: Vec<GeneratedImage>,
241    #[serde(default, skip_serializing_if = "Option::is_none")]
242    pub provider_response_id: Option<String>,
243    #[serde(default, skip_serializing_if = "Option::is_none")]
244    pub usage: Option<MediaGenerationUsage>,
245    #[serde(default, skip_serializing_if = "Vec::is_empty")]
246    pub output_errors: Vec<String>,
247}
248
249#[derive(Debug, Clone, Default, Serialize, Deserialize, PartialEq, Eq)]
250#[serde(rename_all = "camelCase")]
251pub struct ImageModelDescriptor {
252    pub id: String,
253    pub display_name: String,
254    pub provider: String,
255    #[serde(default)]
256    pub is_default: bool,
257    /// Compatibility/legacy model kept for callers pinned to older ids.
258    #[serde(default)]
259    pub legacy: bool,
260    #[serde(default)]
261    pub supports_edit: bool,
262    #[serde(default)]
263    pub supports_multiple_outputs: bool,
264    #[serde(default, skip_serializing_if = "Vec::is_empty")]
265    pub supported_aspect_ratios: Vec<String>,
266    #[serde(default, skip_serializing_if = "Vec::is_empty")]
267    pub supported_sizes: Vec<String>,
268    #[serde(default, skip_serializing_if = "Vec::is_empty")]
269    pub supported_image_sizes: Vec<String>,
270    #[serde(default)]
271    pub supports_transparent_background: bool,
272    #[serde(default)]
273    pub supports_partial_images: bool,
274}
275
276#[derive(Debug, Clone, Default, Serialize, Deserialize, PartialEq, Eq)]
277#[serde(rename_all = "camelCase")]
278pub struct MediaProviderDescriptor {
279    pub id: String,
280    pub display_name: String,
281    #[serde(default)]
282    pub supports_images: bool,
283    #[serde(default)]
284    pub supports_videos: bool,
285    /// Whether the provider has the credentials it needs to serve requests.
286    #[serde(default)]
287    pub configured: bool,
288    #[serde(default, skip_serializing_if = "Option::is_none")]
289    pub default_model: Option<String>,
290    #[serde(default, skip_serializing_if = "Vec::is_empty")]
291    pub image_models: Vec<ImageModelDescriptor>,
292}
293
294/// Extension-host service for media (image/video) generation providers.
295/// Providers return raw bytes; the core runtime persists artifacts, applies
296/// policy, and emits events.
297#[async_trait::async_trait]
298pub trait MediaGeneratorProvider: Send + Sync + 'static {
299    fn provider_id(&self) -> &str;
300
301    fn descriptor(&self) -> MediaProviderDescriptor;
302
303    async fn generate_image(
304        &self,
305        _request: MediaGenerationRequest,
306    ) -> anyhow::Result<ImageGenerationBatch> {
307        anyhow::bail!(
308            "image generation is not supported by provider {}",
309            self.provider_id()
310        )
311    }
312}
313
/// Builds a `data:` URL of the form `data:<mime_type>;base64,<bytes_base64>`.
///
/// Both arguments are inserted verbatim: nothing is validated or encoded
/// here, so `bytes_base64` must already be base64 text.
pub fn data_url(mime_type: &str, bytes_base64: &str) -> String {
    ["data:", mime_type, ";base64,", bytes_base64].concat()
}
317
/// Serialization-contract tests for the media API types: camelCase field
/// names, omission of unset optional fields, legacy-argument decoding, and
/// JSON round trips.
#[cfg(test)]
mod tests {
    use super::*;

    /// Minimal 1x1 PNG artifact owned by Roder, used as the base fixture.
    fn image_artifact() -> MediaArtifact {
        MediaArtifact {
            id: "media-image-1".to_string(),
            kind: MediaKind::Image,
            mime_type: "image/png".to_string(),
            dimensions: Some(MediaDimensions {
                width: 1,
                height: 1,
            }),
            duration_millis: None,
            byte_size: 67,
            provider: "fake".to_string(),
            prompt_hash: "hash".to_string(),
            store_path: "/tmp/image.png".to_string(),
            thumbnail_path: Some("/tmp/image.thumb.png".to_string()),
            generation: None,
            created_at: OffsetDateTime::UNIX_EPOCH,
            roder_owned: true,
        }
    }

    #[test]
    fn image_and_video_artifacts_serialize_as_camel_case_metadata() {
        let image = image_artifact();
        let video = MediaArtifact {
            kind: MediaKind::Video,
            mime_type: "video/mp4".to_string(),
            duration_millis: Some(1_000),
            ..image.clone()
        };

        let value = serde_json::to_value(&image).unwrap();
        assert_eq!(value["mimeType"], "image/png");
        assert_eq!(value["dimensions"]["width"], 1);
        assert_eq!(value["thumbnailPath"], "/tmp/image.thumb.png");
        assert!(value.get("generation").is_none());
        assert_eq!(serde_json::to_value(video).unwrap()["durationMillis"], 1000);
    }

    #[test]
    fn minimum_text_to_image_request_decodes_from_legacy_arguments() {
        let request: MediaGenerationRequest =
            serde_json::from_value(serde_json::json!({ "prompt": "tiny" })).unwrap();
        assert_eq!(request.prompt, "tiny");
        assert!(request.provider.is_none());
        assert!(request.model.is_none());
        assert!(request.input_artifacts.is_empty());
        assert!(request.provider_options.is_none());

        let legacy: MediaGenerationRequest = serde_json::from_value(serde_json::json!({
            "prompt": "tiny",
            "model": "gpt-image-2",
            "outputPath": "/tmp/out.png"
        }))
        .unwrap();
        assert_eq!(legacy.model.as_deref(), Some("gpt-image-2"));
        assert_eq!(legacy.output_path.as_deref(), Some("/tmp/out.png"));
    }

    #[test]
    fn image_edit_request_serializes_canonical_camel_case_fields() {
        let request = MediaGenerationRequest {
            prompt: "Make this screenshot look like a clean launch graphic".to_string(),
            provider: Some("openai".to_string()),
            model: Some("gpt-image-2".to_string()),
            action: Some(ImageGenerationAction::Edit),
            input_artifacts: vec!["media-image-123".to_string()],
            size: Some("1536x1024".to_string()),
            output_format: Some("png".to_string()),
            ..MediaGenerationRequest::default()
        };

        let value = serde_json::to_value(&request).unwrap();
        assert_eq!(value["provider"], "openai");
        assert_eq!(value["action"], "edit");
        assert_eq!(value["inputArtifacts"][0], "media-image-123");
        assert_eq!(value["size"], "1536x1024");
        assert_eq!(value["outputFormat"], "png");
        assert!(value.get("inputImages").is_none());
    }

    #[test]
    fn google_style_request_serializes_aspect_ratio_and_image_size() {
        let request = MediaGenerationRequest {
            prompt: "A polished product hero image".to_string(),
            provider: Some("google".to_string()),
            model: Some("gemini-3-pro-image".to_string()),
            aspect_ratio: Some("16:9".to_string()),
            image_size: Some("2K".to_string()),
            ..MediaGenerationRequest::default()
        };

        let value = serde_json::to_value(&request).unwrap();
        assert_eq!(value["aspectRatio"], "16:9");
        assert_eq!(value["imageSize"], "2K");
    }

    #[test]
    fn multi_output_response_round_trips_with_usage_and_metadata() {
        let artifact = image_artifact();
        let preview = MediaPreview {
            artifact_id: artifact.id.clone(),
            strategy: MediaPreviewStrategy::Thumbnail,
            thumbnail_path: None,
            fallback_label: "fake image/png".to_string(),
            warning: None,
        };
        let response = MediaGenerationResponse {
            provider: "openai".to_string(),
            model: Some("gpt-image-2".to_string()),
            outputs: vec![
                MediaGenerationOutput {
                    artifact: artifact.clone(),
                    preview: preview.clone(),
                    revised_prompt: Some("a tiny test image".to_string()),
                },
                MediaGenerationOutput {
                    artifact,
                    preview,
                    revised_prompt: None,
                },
            ],
            revised_prompt: Some("a tiny test image".to_string()),
            provider_response_id: Some("resp_123".to_string()),
            usage: Some(MediaGenerationUsage {
                input_tokens: Some(12),
                input_image_tokens: None,
                output_tokens: Some(4_160),
                total_tokens: Some(4_172),
            }),
            watermark: None,
            safety: None,
            output_errors: vec!["third output was rejected by moderation".to_string()],
        };

        let value = serde_json::to_value(&response).unwrap();
        assert_eq!(value["outputs"].as_array().unwrap().len(), 2);
        assert_eq!(value["outputs"][0]["revisedPrompt"], "a tiny test image");
        assert_eq!(value["providerResponseId"], "resp_123");
        assert_eq!(value["usage"]["totalTokens"], 4_172);
        assert_eq!(
            value["outputErrors"][0],
            "third output was rejected by moderation"
        );
        let round_trip: MediaGenerationResponse = serde_json::from_value(value).unwrap();
        assert_eq!(round_trip, response);
    }

    #[test]
    fn primary_artifact_returns_first_output_or_none() {
        let first = image_artifact();
        let second = MediaArtifact {
            id: "media-image-2".to_string(),
            ..image_artifact()
        };
        // `preview` is built before `artifact` is moved into the output.
        let output_for = |artifact: MediaArtifact| MediaGenerationOutput {
            preview: MediaPreview {
                artifact_id: artifact.id.clone(),
                strategy: MediaPreviewStrategy::MetadataOnly,
                thumbnail_path: None,
                fallback_label: "fake image/png".to_string(),
                warning: None,
            },
            artifact,
            revised_prompt: None,
        };
        let mut response = MediaGenerationResponse {
            provider: "fake".to_string(),
            model: None,
            outputs: vec![output_for(first.clone()), output_for(second)],
            revised_prompt: None,
            provider_response_id: None,
            usage: None,
            watermark: None,
            safety: None,
            output_errors: Vec::new(),
        };

        assert_eq!(response.primary_artifact(), Some(&first));
        response.outputs.clear();
        assert_eq!(response.primary_artifact(), None);
    }

    #[test]
    fn partial_stream_preference_and_provider_options_round_trip() {
        let request: MediaGenerationRequest = serde_json::from_value(serde_json::json!({
            "prompt": "stream me",
            "provider": "openai",
            "partialImages": 2,
            "providerOptions": { "user": "roder-tests" }
        }))
        .unwrap();
        assert_eq!(request.partial_images, Some(2));
        assert_eq!(
            request
                .provider_options
                .as_ref()
                .and_then(|options| options.get("user"))
                .and_then(|value| value.as_str()),
            Some("roder-tests")
        );

        // Re-encode and decode again so the test covers the full round trip
        // its name promises, not only the decode half.
        let value = serde_json::to_value(&request).unwrap();
        assert_eq!(value["partialImages"], 2);
        assert_eq!(value["providerOptions"]["user"], "roder-tests");
        let round_trip: MediaGenerationRequest = serde_json::from_value(value).unwrap();
        assert_eq!(round_trip, request);
    }

    #[test]
    fn google_generation_metadata_persists_synthid_watermark() {
        let mut artifact = image_artifact();
        artifact.generation = Some(MediaGenerationMetadata {
            provider: "google".to_string(),
            model: Some("gemini-3.1-flash-image".to_string()),
            revised_prompt: None,
            watermark: Some("synthid".to_string()),
            safety: None,
            provider_response_id: None,
        });

        let value = serde_json::to_value(&artifact).unwrap();
        assert_eq!(value["generation"]["provider"], "google");
        assert_eq!(value["generation"]["watermark"], "synthid");
        let round_trip: MediaArtifact = serde_json::from_value(value).unwrap();
        assert_eq!(
            round_trip.generation.unwrap().watermark.as_deref(),
            Some("synthid")
        );
    }

    #[test]
    fn openai_batch_metadata_round_trips() {
        let batch = ImageGenerationBatch {
            provider: "openai".to_string(),
            model: "gpt-image-2".to_string(),
            images: vec![GeneratedImage {
                bytes_base64: "iVBORw0KGgo=".to_string(),
                mime_type: "image/png".to_string(),
                dimensions: Some(MediaDimensions {
                    width: 1024,
                    height: 1024,
                }),
                revised_prompt: Some("a revised prompt".to_string()),
                watermark: None,
                safety: None,
            }],
            provider_response_id: Some("img_123".to_string()),
            usage: Some(MediaGenerationUsage {
                input_tokens: Some(10),
                input_image_tokens: Some(0),
                output_tokens: Some(1_056),
                total_tokens: Some(1_066),
            }),
            output_errors: Vec::new(),
        };

        let value = serde_json::to_value(&batch).unwrap();
        assert_eq!(value["images"][0]["revisedPrompt"], "a revised prompt");
        assert_eq!(value["usage"]["inputImageTokens"], 0);
        let round_trip: ImageGenerationBatch = serde_json::from_value(value).unwrap();
        assert_eq!(round_trip, batch);
    }

    #[test]
    fn data_url_joins_mime_type_and_payload_verbatim() {
        assert_eq!(
            data_url("image/png", "iVBORw0KGgo="),
            "data:image/png;base64,iVBORw0KGgo="
        );
    }
}