// enact_core/providers/trait.rs
1//! Model provider trait and capabilities
2//!
3//! This module defines the `ModelProvider` trait and related types. Provider implementations
4//! are in the separate `enact-providers` crate to enable independent release cadence.
5//!
6//! ## ⚠️ CODE OWNERSHIP & FORBIDDEN PATTERNS
7//!
8//! **Providers are replaceable adapters - execution semantics must never depend on provider behavior.**
9//!
10//! ### Code Ownership
11//! - Provider implementations live in `enact-providers` crate
12//! - This module contains only the trait and types (`ChatRequest`, `ChatResponse`, `ModelCapabilities`)
13//! - `enact-providers` depends on `enact-core` (for the trait), but `enact-core` must NOT depend on `enact-providers`
14//! - Providers are thin HTTP adapters
15//! - They should only see `ChatRequest`/`ChatResponse` and tool schemas
16//! - If providers need kernel types, that logic belongs in the kernel
17//!
18//! ### Explicitly Forbidden Patterns
19//!
20//! These patterns are **forbidden forever**. If any of these happen, Enact loses its "Now" guarantee.
21//!
22//! 1. **Providers enforcing policy** – Policy enforcement belongs to kernel, not providers.
23//!    - Providers are thin adapters, not policy engines
24//!    - No policy checks in provider implementations
25//!    - No quota enforcement in providers
26//!
27//! 2. **Providers importing kernel/flow/policy** – Providers must not import kernel types.
28//!    - `enact-providers` must NOT import `enact_core::kernel`, `enact_core::flow`, or `enact_core::policy`
29//!    - Providers should only see `ChatRequest`/`ChatResponse` and tool schemas
30//!    - If providers need kernel types, that logic belongs in the kernel
31//!    - No circular dependencies: `enact-core` must NOT depend on `enact-providers`
32//!
33//! 3. **Global registries or dynamic discovery in kernel** – Providers are resolved before kernel execution.
34//!    - The kernel receives resolved providers via `ExecutionRequest`, not provider names or registry lookups
35//!    - Provider resolution happens outside kernel (in runner/control plane)
36//!    - No provider registry in kernel
37//!
38//! 4. **Execution semantics depending on provider behavior** – Providers are replaceable adapters.
39//!    - Execution semantics must never depend on provider-specific behavior
40//!    - All providers must implement the same trait interface
41//!    - No provider-specific logic in kernel
42//!
43//! ### Invariants Enforced
44//!
45//! - **Providers are replaceable adapters**: Execution semantics must never depend on provider behavior
46//! - **No global registries or dynamic discovery in kernel**: Providers are resolved before kernel execution
47//! - **Provider resolution happens outside kernel**: Resolution belongs in runner/control plane, not in the kernel
48//! - **Providers must not import kernel/flow/policy**: `enact-providers` must NOT import `enact_core::kernel`, `enact_core::flow`, or `enact_core::policy`
49//! - **Crate boundary separation**: Provider implementations in `enact-providers`, trait in `enact-core`, one-way dependency only
50//!
51//! @see docs/TECHNICAL/04-KERNEL_INVARIANTS.md
52
53use async_trait::async_trait;
54use serde::{Deserialize, Serialize};
55use serde_json::Value;
56
57// =============================================================================
58// Model Capabilities - What a model can do
59// =============================================================================
60
61/// Model capabilities - declares what a model supports.
62///
63/// Aligned with enact-providers config.yml `capabilities` and related fields:
64/// - `tool_calls` → supports_tools, `reasoning` → supports_reasoning
65/// - runtime.max_tokens_default, cost.cost_per_1m_input/output etc. map to max_tokens, cost_per_1m_*, cost_per_1m_pixels
/// Model capabilities - declares what a model supports.
///
/// Aligned with enact-providers config.yml `capabilities` and related fields:
/// - `tool_calls` → supports_tools, `reasoning` → supports_reasoning
/// - runtime.max_tokens_default, cost.cost_per_1m_input/output etc. map to max_tokens, cost_per_1m_*, cost_per_1m_pixels
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct ModelCapabilities {
    /// Maximum context window (tokens)
    pub max_tokens: u32,
    /// Maximum output tokens
    pub max_output_tokens: u32,
    /// Supports streaming responses
    pub supports_streaming: bool,
    /// Supports tool/function calling (config: capabilities.tool_calls)
    pub supports_tools: bool,
    /// Supports reasoning/thinking (config: capabilities.reasoning)
    pub supports_reasoning: bool,
    /// Supports vision/images as input
    pub supports_vision: bool,
    /// Supports structured output (JSON mode)
    pub supports_json_mode: bool,
    /// Supports embedding generation
    pub supports_embeddings: bool,
    /// Supports image generation (DALL-E, Flux, etc.)
    pub supports_image_generation: bool,
    /// Supports audio transcription (speech-to-text)
    pub supports_audio_transcription: bool,
    /// Supports text-to-speech
    pub supports_speech: bool,
    /// Supports video generation
    pub supports_video_generation: bool,
    /// Is PII-safe (no data retention)
    pub pii_safe: bool,
    /// Cost per 1M input tokens (USD); from config cost.cost_per_1m_input.
    /// `None` means cost is unknown/not configured.
    pub cost_per_1m_input: Option<f64>,
    /// Cost per 1M output tokens (USD); from config cost.cost_per_1m_output
    pub cost_per_1m_output: Option<f64>,
    /// Cost per 1M pixels for image generation (USD); from config cost.cost_per_1m_pixels
    pub cost_per_1m_pixels: Option<f64>,
}
101
102impl Default for ModelCapabilities {
103    fn default() -> Self {
104        Self {
105            max_tokens: 4096,
106            max_output_tokens: 4096,
107            supports_streaming: true,
108            supports_tools: false,
109            supports_reasoning: false,
110            supports_vision: false,
111            supports_json_mode: false,
112            supports_embeddings: false,
113            supports_image_generation: false,
114            supports_audio_transcription: false,
115            supports_speech: false,
116            supports_video_generation: false,
117            pii_safe: false,
118            cost_per_1m_input: None,
119            cost_per_1m_output: None,
120            cost_per_1m_pixels: None,
121        }
122    }
123}
124
125impl ModelCapabilities {
126    /// GPT-4 capabilities
127    pub fn gpt4() -> Self {
128        Self {
129            max_tokens: 128_000,
130            max_output_tokens: 4096,
131            supports_streaming: true,
132            supports_tools: true,
133            supports_reasoning: false,
134            supports_vision: true,
135            supports_json_mode: true,
136            supports_embeddings: true,
137            supports_image_generation: false,
138            supports_audio_transcription: false,
139            supports_speech: false,
140            supports_video_generation: false,
141            pii_safe: false,
142            cost_per_1m_input: Some(0.03),
143            cost_per_1m_output: Some(0.06),
144            cost_per_1m_pixels: None,
145        }
146    }
147
148    /// Claude 3 Opus capabilities
149    pub fn claude3_opus() -> Self {
150        Self {
151            max_tokens: 200_000,
152            max_output_tokens: 4096,
153            supports_streaming: true,
154            supports_tools: true,
155            supports_reasoning: false,
156            supports_vision: true,
157            supports_json_mode: true,
158            supports_embeddings: true,
159            supports_image_generation: false,
160            supports_audio_transcription: false,
161            supports_speech: false,
162            supports_video_generation: false,
163            pii_safe: false,
164            cost_per_1m_input: Some(0.015),
165            cost_per_1m_output: Some(0.075),
166            cost_per_1m_pixels: None,
167        }
168    }
169
170    /// Gemini Pro capabilities
171    pub fn gemini_pro() -> Self {
172        Self {
173            max_tokens: 1_000_000,
174            max_output_tokens: 8192,
175            supports_streaming: true,
176            supports_tools: true,
177            supports_reasoning: false,
178            supports_vision: true,
179            supports_json_mode: true,
180            supports_embeddings: true,
181            supports_image_generation: false,
182            supports_audio_transcription: false,
183            supports_speech: false,
184            supports_video_generation: false,
185            pii_safe: false,
186            cost_per_1m_input: Some(0.00125),
187            cost_per_1m_output: Some(0.005),
188            cost_per_1m_pixels: None,
189        }
190    }
191}
192
193// =============================================================================
194// Chat Types (OpenAI-compatible)
195// =============================================================================
196
/// Tool definition for chat request (OpenAI shape: type "function", function { name, description, parameters })
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct ChatTool {
    /// Tool kind; OpenAI currently only defines "function".
    #[serde(rename = "type")]
    pub tool_type: String,
    /// The callable function this tool exposes.
    pub function: ChatToolFunction,
}
204
/// Function part of a ChatTool
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct ChatToolFunction {
    /// Function name the model will reference in tool calls.
    pub name: String,
    /// Human-readable description shown to the model.
    pub description: String,
    /// JSON Schema for the function's arguments.
    pub parameters: Value,
}
212
/// Tool choice: "auto" | "none" | or specific function (OpenAI shape)
///
/// `untagged` lets this deserialize either a bare string ("auto"/"none")
/// or the `{ "type": "function", "function": { "name": … } }` object form.
#[derive(Debug, Clone, Serialize, Deserialize)]
#[serde(untagged)]
pub enum ToolChoice {
    /// Bare-string mode, e.g. "auto" or "none".
    String(String),
    /// Force a specific function to be called.
    Specific {
        #[serde(rename = "type")]
        choice_type: String,
        function: ToolChoiceFunction,
    },
}
224
/// Function selector used by [`ToolChoice::Specific`].
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct ToolChoiceFunction {
    /// Name of the function the model must call.
    pub name: String,
}
229
/// One tool call in an assistant message (OpenAI shape: id, type "function", function { name, arguments })
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct MessageToolCall {
    /// Provider-assigned call id; echoed back via `tool_call_id` in the result message.
    pub id: String,
    /// Call kind; OpenAI currently only defines "function".
    #[serde(rename = "type")]
    pub call_type: String,
    /// The function name and serialized arguments for this call.
    pub function: MessageToolCallFunction,
}
238
/// Function payload of a [`MessageToolCall`].
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct MessageToolCallFunction {
    /// Name of the function being invoked.
    pub name: String,
    /// Arguments as a JSON-encoded string (OpenAI sends a string, not an object).
    pub arguments: String,
}
244
245// =============================================================================
246// Multimodal Content Types
247// =============================================================================
248
/// Image URL structure for vision messages (OpenAI format)
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct ImageUrlContent {
    /// The URL of the image (can be a data URL with base64)
    pub url: String,
    /// Optional detail level: "low", "high", or "auto"
    /// Omitted from the wire payload when `None`.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub detail: Option<String>,
}
258
/// A content part for multimodal messages (text or image)
///
/// Serializes with an OpenAI-style `"type"` tag: `{"type": "text", …}` or
/// `{"type": "image_url", …}` (variant names rendered in snake_case).
#[derive(Debug, Clone, Serialize, Deserialize)]
#[serde(tag = "type", rename_all = "snake_case")]
pub enum ContentPart {
    /// Text content part
    Text { text: String },
    /// Image URL content part (for vision models)
    ImageUrl { image_url: ImageUrlContent },
}
268
269impl ContentPart {
270    /// Create a text content part
271    pub fn text(text: impl Into<String>) -> Self {
272        ContentPart::Text { text: text.into() }
273    }
274
275    /// Create an image URL content part from a URL
276    pub fn image_url(url: impl Into<String>) -> Self {
277        ContentPart::ImageUrl {
278            image_url: ImageUrlContent {
279                url: url.into(),
280                detail: None,
281            },
282        }
283    }
284
285    /// Create an image content part from base64 data
286    pub fn image_base64(base64_data: impl Into<String>, mime_type: impl Into<String>) -> Self {
287        let data_url = format!("data:{};base64,{}", mime_type.into(), base64_data.into());
288        ContentPart::ImageUrl {
289            image_url: ImageUrlContent {
290                url: data_url,
291                detail: None,
292            },
293        }
294    }
295}
296
/// Message content - can be either a simple string or multimodal content parts
///
/// `untagged` means a plain JSON string maps to `Text` and a JSON array maps
/// to `Parts`, matching the OpenAI chat content wire format.
#[derive(Debug, Clone, Serialize, Deserialize)]
#[serde(untagged)]
pub enum MessageContent {
    /// Simple text content
    Text(String),
    /// Multimodal content (text + images)
    Parts(Vec<ContentPart>),
}
306
307impl MessageContent {
308    /// Check if this content contains any images
309    pub fn has_images(&self) -> bool {
310        match self {
311            MessageContent::Text(_) => false,
312            MessageContent::Parts(parts) => parts
313                .iter()
314                .any(|p| matches!(p, ContentPart::ImageUrl { .. })),
315        }
316    }
317
318    /// Get the text content (concatenated if multimodal)
319    pub fn as_text(&self) -> String {
320        match self {
321            MessageContent::Text(s) => s.clone(),
322            MessageContent::Parts(parts) => parts
323                .iter()
324                .filter_map(|p| match p {
325                    ContentPart::Text { text } => Some(text.as_str()),
326                    _ => None,
327                })
328                .collect::<Vec<_>>()
329                .join("\n"),
330        }
331    }
332}
333
334impl From<String> for MessageContent {
335    fn from(s: String) -> Self {
336        MessageContent::Text(s)
337    }
338}
339
340impl From<&str> for MessageContent {
341    fn from(s: &str) -> Self {
342        MessageContent::Text(s.to_string())
343    }
344}
345
/// Chat message (OpenAI shape: role, optional content, optional tool_calls, optional tool_call_id for role "tool")
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct ChatMessage {
    /// "system" | "user" | "assistant" | "tool"
    pub role: String,
    /// Text-only content (for backward compatibility)
    #[serde(skip_serializing_if = "Option::is_none")]
    pub content: Option<String>,
    /// Multimodal content (text + images) - providers should prefer this if present
    #[serde(skip_serializing_if = "Option::is_none")]
    pub multimodal_content: Option<Vec<ContentPart>>,
    /// Tool calls requested by an assistant message.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub tool_calls: Option<Vec<MessageToolCall>>,
    /// For role "tool": id of the tool call this message answers.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub tool_call_id: Option<String>,
}
361
impl ChatMessage {
    /// System message (role "system") with plain-text content.
    pub fn system(content: impl Into<String>) -> Self {
        Self {
            role: "system".to_string(),
            content: Some(content.into()),
            multimodal_content: None,
            tool_calls: None,
            tool_call_id: None,
        }
    }

    /// User message (role "user") with plain-text content.
    pub fn user(content: impl Into<String>) -> Self {
        Self {
            role: "user".to_string(),
            content: Some(content.into()),
            multimodal_content: None,
            tool_calls: None,
            tool_call_id: None,
        }
    }

    /// User message with images (multimodal content for vision models)
    ///
    /// Creates a user message with text and one or more images.
    /// Each image is supplied as raw bytes plus its MIME type; the bytes are
    /// base64-encoded here and embedded as a `data:` URL content part.
    ///
    /// # Example
    /// ```ignore
    /// let jpeg_bytes: Vec<u8> = std::fs::read("photo.jpg")?;
    /// let msg = ChatMessage::user_with_images(
    ///     "Describe this image",
    ///     vec![(jpeg_bytes, "image/jpeg".to_string())],
    /// );
    /// ```
    pub fn user_with_images<S: Into<String>>(
        text: S,
        images: Vec<(Vec<u8>, String)>, // (raw bytes, mime_type)
    ) -> Self {
        use base64::Engine;
        // Text part always comes first, followed by one image part per input.
        let mut parts = vec![ContentPart::text(text)];

        for (data, mime_type) in images {
            let b64 = base64::engine::general_purpose::STANDARD.encode(&data);
            parts.push(ContentPart::image_base64(b64, mime_type));
        }

        Self {
            role: "user".to_string(),
            content: None, // content is None when using multimodal
            multimodal_content: Some(parts),
            tool_calls: None,
            tool_call_id: None,
        }
    }

    /// Assistant message (role "assistant") with plain-text content.
    pub fn assistant(content: impl Into<String>) -> Self {
        Self {
            role: "assistant".to_string(),
            content: Some(content.into()),
            multimodal_content: None,
            tool_calls: None,
            tool_call_id: None,
        }
    }

    /// Assistant message with tool calls (content may be None or empty)
    pub fn assistant_with_tool_calls(
        content: Option<String>,
        tool_calls: Vec<MessageToolCall>,
    ) -> Self {
        Self {
            role: "assistant".to_string(),
            content,
            multimodal_content: None,
            tool_calls: Some(tool_calls),
            tool_call_id: None,
        }
    }

    /// Tool result message (role "tool"); `tool_call_id` links it back to the
    /// assistant's [`MessageToolCall::id`] it answers.
    pub fn tool_result(tool_call_id: impl Into<String>, content: impl Into<String>) -> Self {
        Self {
            role: "tool".to_string(),
            content: Some(content.into()),
            multimodal_content: None,
            tool_calls: None,
            tool_call_id: Some(tool_call_id.into()),
        }
    }

    /// Check if this message contains multimodal content (images).
    /// Only inspects `multimodal_content`; plain `content` never counts.
    pub fn has_images(&self) -> bool {
        self.multimodal_content
            .as_ref()
            .map(|parts| {
                parts
                    .iter()
                    .any(|p| matches!(p, ContentPart::ImageUrl { .. }))
            })
            .unwrap_or(false)
    }

    /// Get the effective content for serialization to providers.
    /// Precedence: multimodal parts, then text content, then empty text
    /// (so callers never have to handle a "no content" case).
    pub fn effective_content(&self) -> MessageContent {
        if let Some(parts) = &self.multimodal_content {
            MessageContent::Parts(parts.clone())
        } else if let Some(text) = &self.content {
            MessageContent::Text(text.clone())
        } else {
            MessageContent::Text(String::new())
        }
    }
}
475
/// Chat completion request
#[derive(Debug, Clone, Serialize)]
pub struct ChatRequest {
    /// Conversation history, oldest first.
    pub messages: Vec<ChatMessage>,
    /// Maximum tokens to generate; provider default when `None`.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub max_tokens: Option<u32>,
    /// Sampling temperature; provider default when `None`.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub temperature: Option<f32>,
    /// Tools the model may call.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub tools: Option<Vec<ChatTool>>,
    /// How the model should pick a tool ("auto"/"none"/specific function).
    #[serde(skip_serializing_if = "Option::is_none")]
    pub tool_choice: Option<ToolChoice>,
}
489
/// Chat completion response
#[derive(Debug, Clone, Deserialize)]
pub struct ChatResponse {
    /// Provider-assigned response id.
    pub id: String,
    /// Generated completions (usually one).
    pub choices: Vec<ChatChoice>,
    /// Token usage, when the provider reports it.
    pub usage: Option<ChatUsage>,
}
497
/// One completion choice within a [`ChatResponse`].
#[derive(Debug, Clone, Deserialize)]
pub struct ChatChoice {
    /// Position of this choice in the response.
    pub index: u32,
    /// The generated assistant message.
    pub message: ChatMessage,
    /// Why generation stopped (e.g. "stop", "length", "tool_calls"), if reported.
    pub finish_reason: Option<String>,
}
504
/// Token accounting for a chat completion.
#[derive(Debug, Clone, Deserialize)]
pub struct ChatUsage {
    /// Tokens consumed by the prompt.
    pub prompt_tokens: u32,
    /// Tokens generated in the completion.
    pub completion_tokens: u32,
    /// Sum of prompt and completion tokens.
    pub total_tokens: u32,
}
511
512// =============================================================================
513// Embedding Types
514// =============================================================================
515
/// Embedding request
#[derive(Debug, Clone, Serialize)]
pub struct EmbeddingRequest {
    /// Text to embed. Only a single string is supported by this type;
    /// batch input (array of strings) is not representable here.
    pub input: String,
    /// Optional model override
    #[serde(skip_serializing_if = "Option::is_none")]
    pub model: Option<String>,
}
525
/// Embedding response
#[derive(Debug, Clone, Deserialize)]
pub struct EmbeddingResponse {
    /// Embedding data (one per input)
    pub data: Vec<EmbeddingData>,
    /// Model used for embedding
    pub model: String,
    /// Token usage information, when the provider reports it
    pub usage: Option<EmbeddingUsage>,
}
536
/// A single embedding result within an [`EmbeddingResponse`].
#[derive(Debug, Clone, Deserialize)]
pub struct EmbeddingData {
    /// The embedding vector
    pub embedding: Vec<f32>,
    /// Index of the embedding in the batch
    pub index: u32,
}
544
/// Token accounting for an embedding request.
#[derive(Debug, Clone, Deserialize)]
pub struct EmbeddingUsage {
    /// Number of tokens in the input
    pub prompt_tokens: u32,
    /// Total tokens used
    pub total_tokens: u32,
}
552
553// =============================================================================
554// Image Generation Types (OpenAI /v1/images/generations compatible)
555// =============================================================================
556
/// Image generation request (OpenAI-compatible, /v1/images/generations)
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct ImageGenerationRequest {
    /// The text prompt describing the image to generate
    pub prompt: String,
    /// Optional model override
    #[serde(skip_serializing_if = "Option::is_none")]
    pub model: Option<String>,
    /// Number of images to generate (default: 1)
    #[serde(skip_serializing_if = "Option::is_none")]
    pub n: Option<u32>,
    /// Image size (e.g., "1024x1024", "1792x1024", "1024x1792")
    #[serde(skip_serializing_if = "Option::is_none")]
    pub size: Option<String>,
    /// Quality level ("standard" or "hd")
    #[serde(skip_serializing_if = "Option::is_none")]
    pub quality: Option<String>,
    /// Style ("vivid" or "natural")
    #[serde(skip_serializing_if = "Option::is_none")]
    pub style: Option<String>,
    /// Response format ("url" or "b64_json"); determines which field of
    /// [`ImageData`] is populated
    #[serde(skip_serializing_if = "Option::is_none")]
    pub response_format: Option<String>,
    /// A unique identifier for the end-user
    #[serde(skip_serializing_if = "Option::is_none")]
    pub user: Option<String>,
}
584
/// Image generation response (OpenAI-compatible)
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct ImageGenerationResponse {
    /// Unix timestamp of when the response was created
    pub created: u64,
    /// Array of generated images
    pub data: Vec<ImageData>,
}
593
/// Individual generated image data.
/// Exactly one of `url` / `b64_json` is expected depending on the request's
/// `response_format`.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct ImageData {
    /// URL of the generated image (if response_format is "url")
    #[serde(skip_serializing_if = "Option::is_none")]
    pub url: Option<String>,
    /// Base64-encoded image (if response_format is "b64_json")
    #[serde(skip_serializing_if = "Option::is_none")]
    pub b64_json: Option<String>,
    /// Revised prompt (if model revised the prompt)
    #[serde(skip_serializing_if = "Option::is_none")]
    pub revised_prompt: Option<String>,
}
607
608// =============================================================================
609// Audio Transcription Types (OpenAI /v1/audio/transcriptions compatible)
610// =============================================================================
611
/// Audio transcription request (OpenAI-compatible, /v1/audio/transcriptions)
///
/// Not serde-derived: this request is sent as multipart form data (raw file
/// bytes), not JSON.
#[derive(Debug, Clone)]
pub struct AudioTranscriptionRequest {
    /// Audio file bytes
    pub file: Vec<u8>,
    /// Original filename (for format detection)
    pub filename: String,
    /// Optional model override
    pub model: Option<String>,
    /// Language of the audio (ISO-639-1 code, e.g., "en")
    pub language: Option<String>,
    /// Prompt to guide the model's style
    pub prompt: Option<String>,
    /// Response format ("json", "text", "srt", "verbose_json", "vtt")
    pub response_format: Option<String>,
    /// Temperature for sampling (0-1)
    pub temperature: Option<f32>,
}
630
/// Audio transcription response (OpenAI-compatible)
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct AudioTranscriptionResponse {
    /// The transcribed text
    pub text: String,
    /// Task type (always "transcribe")
    #[serde(skip_serializing_if = "Option::is_none")]
    pub task: Option<String>,
    /// Language detected or specified
    #[serde(skip_serializing_if = "Option::is_none")]
    pub language: Option<String>,
    /// Duration of the audio in seconds
    #[serde(skip_serializing_if = "Option::is_none")]
    pub duration: Option<f64>,
    /// Word-level timestamps (verbose_json only)
    #[serde(skip_serializing_if = "Option::is_none")]
    pub words: Option<Vec<TranscriptionWord>>,
    /// Segment-level timestamps (verbose_json only)
    #[serde(skip_serializing_if = "Option::is_none")]
    pub segments: Option<Vec<TranscriptionSegment>>,
}
652
/// Word-level transcription timing
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct TranscriptionWord {
    /// The transcribed word
    pub word: String,
    /// Start time in seconds
    pub start: f64,
    /// End time in seconds
    pub end: f64,
}
663
/// Segment-level transcription
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct TranscriptionSegment {
    /// Segment ID
    pub id: u32,
    /// Start time in seconds
    pub start: f64,
    /// End time in seconds
    pub end: f64,
    /// Transcribed text
    pub text: String,
}
676
677// =============================================================================
678// Text-to-Speech Types (OpenAI /v1/audio/speech compatible)
679// =============================================================================
680
/// Text-to-speech request (OpenAI-compatible, /v1/audio/speech)
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct SpeechRequest {
    /// The text to convert to speech (max 4096 chars)
    pub input: String,
    /// Optional model override
    #[serde(skip_serializing_if = "Option::is_none")]
    pub model: Option<String>,
    /// Voice to use (e.g., "alloy", "echo", "fable", "onyx", "nova", "shimmer")
    pub voice: String,
    /// Audio format ("mp3", "opus", "aac", "flac", "wav", "pcm")
    #[serde(skip_serializing_if = "Option::is_none")]
    pub response_format: Option<String>,
    /// Speed of speech (0.25-4.0, default 1.0)
    #[serde(skip_serializing_if = "Option::is_none")]
    pub speed: Option<f32>,
}
698
/// Text-to-speech response.
///
/// Not serde-derived: the audio arrives as a raw binary body, not JSON.
#[derive(Debug, Clone)]
pub struct SpeechResponse {
    /// Audio data bytes
    pub audio: Vec<u8>,
    /// Audio format/content type (e.g., "audio/mpeg", "audio/opus")
    pub content_type: String,
}
707
708// =============================================================================
709// Video Generation Types
710// =============================================================================
711
/// Video generation request
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct VideoGenerationRequest {
    /// Text prompt describing the video to generate
    pub prompt: String,
    /// Optional model override
    #[serde(skip_serializing_if = "Option::is_none")]
    pub model: Option<String>,
    /// Duration in seconds
    #[serde(skip_serializing_if = "Option::is_none")]
    pub duration: Option<f32>,
    /// Video size (e.g., "1920x1080", "1280x720")
    #[serde(skip_serializing_if = "Option::is_none")]
    pub size: Option<String>,
    /// Frames per second
    #[serde(skip_serializing_if = "Option::is_none")]
    pub fps: Option<u32>,
    /// Reference image for image-to-video (base64)
    #[serde(skip_serializing_if = "Option::is_none")]
    pub image: Option<String>,
    /// Negative prompt (what to avoid)
    #[serde(skip_serializing_if = "Option::is_none")]
    pub negative_prompt: Option<String>,
    /// Seed for reproducibility
    #[serde(skip_serializing_if = "Option::is_none")]
    pub seed: Option<u64>,
}
739
/// Video generation response
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct VideoGenerationResponse {
    /// Unix timestamp of when the response was created
    pub created: u64,
    /// Array of generated videos
    pub data: Vec<VideoData>,
}
748
/// Individual generated video data.
/// Exactly one of `url` / `b64_json` is expected to be populated.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct VideoData {
    /// URL of the generated video
    #[serde(skip_serializing_if = "Option::is_none")]
    pub url: Option<String>,
    /// Base64-encoded video
    #[serde(skip_serializing_if = "Option::is_none")]
    pub b64_json: Option<String>,
    /// Revised prompt (if model revised the prompt)
    #[serde(skip_serializing_if = "Option::is_none")]
    pub revised_prompt: Option<String>,
}
762
/// Model provider trait.
///
/// The single abstraction the kernel sees: a resolved, ready-to-call model
/// endpoint. Only `chat` is required; every other modality has a default
/// implementation that returns an error, so providers override exactly what
/// they support (and should advertise it via `capabilities`).
///
/// `Send + Sync` so a provider can be shared across async tasks; `async_trait`
/// keeps the trait object-safe despite the async methods.
#[async_trait]
pub trait ModelProvider: Send + Sync {
    /// Provider name
    fn name(&self) -> &str;

    /// Model name being used
    fn model(&self) -> &str {
        "default"
    }

    /// Get model capabilities.
    /// Defaults to the conservative `ModelCapabilities::default()`.
    fn capabilities(&self) -> ModelCapabilities {
        ModelCapabilities::default()
    }

    /// Whether this provider requires network access.
    /// Default is `true` for cloud LLM providers.
    /// Override to `false` for local providers (e.g., Ollama).
    fn requires_network(&self) -> bool {
        true
    }

    /// Create a chat completion. The only method providers must implement.
    async fn chat(&self, request: ChatRequest) -> anyhow::Result<ChatResponse>;

    /// Generate embeddings (if supported)
    ///
    /// Default implementation returns an error. Providers that support embeddings
    /// should override this method.
    async fn embed(&self, _request: EmbeddingRequest) -> anyhow::Result<EmbeddingResponse> {
        anyhow::bail!("Embeddings not supported by this provider")
    }

    /// Generate images (if supported)
    ///
    /// Default implementation returns an error. Providers that support image generation
    /// (e.g., DALL-E, Flux, Stable Diffusion) should override this method.
    async fn generate_image(
        &self,
        _request: ImageGenerationRequest,
    ) -> anyhow::Result<ImageGenerationResponse> {
        anyhow::bail!("Image generation not supported by this provider")
    }

    /// Transcribe audio to text (if supported)
    ///
    /// Default implementation returns an error. Providers that support audio transcription
    /// (e.g., Whisper) should override this method.
    async fn transcribe(
        &self,
        _request: AudioTranscriptionRequest,
    ) -> anyhow::Result<AudioTranscriptionResponse> {
        anyhow::bail!("Audio transcription not supported by this provider")
    }

    /// Generate speech from text (if supported)
    ///
    /// Default implementation returns an error. Providers that support text-to-speech
    /// should override this method.
    async fn speak(&self, _request: SpeechRequest) -> anyhow::Result<SpeechResponse> {
        anyhow::bail!("Text-to-speech not supported by this provider")
    }

    /// Generate video (if supported)
    ///
    /// Default implementation returns an error. Providers that support video generation
    /// (e.g., Runway, Pika) should override this method.
    async fn generate_video(
        &self,
        _request: VideoGenerationRequest,
    ) -> anyhow::Result<VideoGenerationResponse> {
        anyhow::bail!("Video generation not supported by this provider")
    }
}
837}