enact-core 0.0.2

//! Model provider trait and capabilities
//!
//! This module defines the `ModelProvider` trait and related types. Provider implementations
//! are in the separate `enact-providers` crate to enable independent release cadence.
//!
//! ## ⚠️ CODE OWNERSHIP & FORBIDDEN PATTERNS
//!
//! **Providers are replaceable adapters - execution semantics must never depend on provider behavior.**
//!
//! ### Code Ownership
//! - Provider implementations live in `enact-providers` crate
//! - This module contains only the trait and types (`ChatRequest`, `ChatResponse`, `ModelCapabilities`)
//! - `enact-providers` depends on `enact-core` (for the trait), but `enact-core` must NOT depend on `enact-providers`
//! - Providers are thin HTTP adapters
//! - They should only see `ChatRequest`/`ChatResponse` and tool schemas
//! - If providers need kernel types, that logic belongs in the kernel
//!
//! ### Explicitly Forbidden Patterns
//!
//! These patterns are **forbidden forever**. If any of these happen, Enact loses its "Now" guarantee.
//!
//! 1. **Providers enforcing policy** – Policy enforcement belongs to kernel, not providers.
//!    - Providers are thin adapters, not policy engines
//!    - No policy checks in provider implementations
//!    - No quota enforcement in providers
//!
//! 2. **Providers importing kernel/flow/policy** – Providers must not import kernel types.
//!    - `enact-providers` must NOT import `enact_core::kernel`, `enact_core::flow`, or `enact_core::policy`
//!    - Providers should only see `ChatRequest`/`ChatResponse` and tool schemas
//!    - If providers need kernel types, that logic belongs in the kernel
//!    - No circular dependencies: `enact-core` must NOT depend on `enact-providers`
//!
//! 3. **Global registries or dynamic discovery in kernel** – Providers are resolved before kernel execution.
//!    - The kernel receives resolved providers via `ExecutionRequest`, not provider names or registry lookups
//!    - Provider resolution happens outside kernel (in runner/control plane)
//!    - No provider registry in kernel
//!
//! 4. **Execution semantics depending on provider behavior** – Providers are replaceable adapters.
//!    - Execution semantics must never depend on provider-specific behavior
//!    - All providers must implement the same trait interface
//!    - No provider-specific logic in kernel
//!
//! ### Invariants Enforced
//!
//! - **Providers are replaceable adapters**: Execution semantics must never depend on provider behavior
//! - **No global registries or dynamic discovery in kernel**: Providers are resolved before kernel execution
//! - **Provider resolution happens outside kernel**: Resolution belongs in runner/control plane, not in the kernel
//! - **Providers must not import kernel/flow/policy**: `enact-providers` must NOT import `enact_core::kernel`, `enact_core::flow`, or `enact_core::policy`
//! - **Crate boundary separation**: Provider implementations in `enact-providers`, trait in `enact-core`, one-way dependency only
//!
//! @see docs/TECHNICAL/04-KERNEL_INVARIANTS.md

use async_trait::async_trait;
use serde::{Deserialize, Serialize};
use serde_json::Value;

// =============================================================================
// Model Capabilities - What a model can do
// =============================================================================

/// Model capabilities - declares what a model supports.
///
/// Aligned with enact-providers config.yml `capabilities` and related fields:
/// - `tool_calls` → supports_tools, `reasoning` → supports_reasoning
/// - runtime.max_tokens_default, cost.cost_per_1m_input/output etc. map to max_tokens, cost_per_1m_*, cost_per_1m_pixels
///
/// The `supports_*` flags are plain booleans and the three cost fields are
/// optional, so a capability set can be declared without any pricing data.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct ModelCapabilities {
    /// Maximum context window (tokens)
    pub max_tokens: u32,
    /// Maximum output tokens
    pub max_output_tokens: u32,
    /// Supports streaming responses
    pub supports_streaming: bool,
    /// Supports tool/function calling (config: capabilities.tool_calls)
    pub supports_tools: bool,
    /// Supports reasoning/thinking (config: capabilities.reasoning)
    pub supports_reasoning: bool,
    /// Supports vision/images as input
    pub supports_vision: bool,
    /// Supports structured output (JSON mode)
    pub supports_json_mode: bool,
    /// Supports embedding generation
    pub supports_embeddings: bool,
    /// Supports image generation (DALL-E, Flux, etc.)
    pub supports_image_generation: bool,
    /// Supports audio transcription (speech-to-text)
    pub supports_audio_transcription: bool,
    /// Supports text-to-speech
    pub supports_speech: bool,
    /// Supports video generation
    pub supports_video_generation: bool,
    /// Is PII-safe (no data retention)
    pub pii_safe: bool,
    /// Cost per 1M input tokens (USD); from config cost.cost_per_1m_input
    ///
    /// NOTE(review): the unit is taken from the field name and config key;
    /// confirm that every producer of this value supplies per-1M figures.
    pub cost_per_1m_input: Option<f64>,
    /// Cost per 1M output tokens (USD); from config cost.cost_per_1m_output
    pub cost_per_1m_output: Option<f64>,
    /// Cost per 1M pixels for image (USD); from config cost.cost_per_1m_pixels
    pub cost_per_1m_pixels: Option<f64>,
}

impl Default for ModelCapabilities {
    /// Baseline capability set: 4096-token context and output limits,
    /// streaming enabled, every other capability disabled, no pricing data.
    fn default() -> Self {
        // The context window and the output cap share one baseline value.
        const BASELINE_TOKEN_LIMIT: u32 = 4096;

        Self {
            // Token limits
            max_tokens: BASELINE_TOKEN_LIMIT,
            max_output_tokens: BASELINE_TOKEN_LIMIT,

            // Streaming is the only capability that is on by default
            supports_streaming: true,

            // Everything else must be opted into explicitly
            supports_tools: false,
            supports_reasoning: false,
            supports_vision: false,
            supports_json_mode: false,
            supports_embeddings: false,
            supports_image_generation: false,
            supports_audio_transcription: false,
            supports_speech: false,
            supports_video_generation: false,
            pii_safe: false,

            // No pricing information
            cost_per_1m_input: None,
            cost_per_1m_output: None,
            cost_per_1m_pixels: None,
        }
    }
}

impl ModelCapabilities {
    // Pricing note for the presets below: the `cost_per_1m_*` fields are USD
    // per 1,000,000 tokens. The presets previously stored per-1K figures in
    // these fields (e.g. 0.03 for GPT-4 input), which under-stated cost by
    // 1000x for any consumer that honours the field name.
    // NOTE(review): the values were obtained by scaling the earlier constants
    // by 1000 — confirm against current provider list prices.

    /// GPT-4 capabilities
    ///
    /// 128K context window, 4096 output tokens; tools, vision and JSON mode
    /// enabled. Pricing: 30.00 / 60.00 USD per 1M input / output tokens.
    pub fn gpt4() -> Self {
        Self {
            max_tokens: 128_000,
            max_output_tokens: 4096,
            supports_streaming: true,
            supports_tools: true,
            supports_reasoning: false,
            supports_vision: true,
            supports_json_mode: true,
            supports_embeddings: true,
            supports_image_generation: false,
            supports_audio_transcription: false,
            supports_speech: false,
            supports_video_generation: false,
            pii_safe: false,
            cost_per_1m_input: Some(30.0),
            cost_per_1m_output: Some(60.0),
            cost_per_1m_pixels: None,
        }
    }

    /// Claude 3 Opus capabilities
    ///
    /// 200K context window, 4096 output tokens; tools, vision and JSON mode
    /// enabled. Pricing: 15.00 / 75.00 USD per 1M input / output tokens.
    pub fn claude3_opus() -> Self {
        Self {
            max_tokens: 200_000,
            max_output_tokens: 4096,
            supports_streaming: true,
            supports_tools: true,
            supports_reasoning: false,
            supports_vision: true,
            supports_json_mode: true,
            supports_embeddings: true,
            supports_image_generation: false,
            supports_audio_transcription: false,
            supports_speech: false,
            supports_video_generation: false,
            pii_safe: false,
            cost_per_1m_input: Some(15.0),
            cost_per_1m_output: Some(75.0),
            cost_per_1m_pixels: None,
        }
    }

    /// Gemini Pro capabilities
    ///
    /// 1M context window, 8192 output tokens; tools, vision and JSON mode
    /// enabled. Pricing: 1.25 / 5.00 USD per 1M input / output tokens.
    pub fn gemini_pro() -> Self {
        Self {
            max_tokens: 1_000_000,
            max_output_tokens: 8192,
            supports_streaming: true,
            supports_tools: true,
            supports_reasoning: false,
            supports_vision: true,
            supports_json_mode: true,
            supports_embeddings: true,
            supports_image_generation: false,
            supports_audio_transcription: false,
            supports_speech: false,
            supports_video_generation: false,
            pii_safe: false,
            cost_per_1m_input: Some(1.25),
            cost_per_1m_output: Some(5.0),
            cost_per_1m_pixels: None,
        }
    }
}

// =============================================================================
// Chat Types (OpenAI-compatible)
// =============================================================================

/// Tool definition for chat request (OpenAI shape: type "function", function { name, description, parameters })
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct ChatTool {
    /// Tool kind; (de)serialized under the JSON key `type` ("function" in the OpenAI shape)
    #[serde(rename = "type")]
    pub tool_type: String,
    /// The function exposed by this tool
    pub function: ChatToolFunction,
}

/// Function part of a ChatTool
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct ChatToolFunction {
    /// Function name
    pub name: String,
    /// Description of the function
    pub description: String,
    /// Parameter definition as arbitrary JSON
    /// (presumably a JSON Schema object, as in the OpenAI shape — not validated here)
    pub parameters: Value,
}

/// Tool choice: "auto" | "none" | or specific function (OpenAI shape)
///
/// The enum is `#[serde(untagged)]`, so no variant tag is written: `String`
/// (de)serializes as a bare JSON string and `Specific` as a JSON object.
#[derive(Debug, Clone, Serialize, Deserialize)]
#[serde(untagged)]
pub enum ToolChoice {
    /// A plain string choice such as "auto" or "none"
    String(String),
    /// Selects one specific function by name
    Specific {
        /// Choice kind; (de)serialized under the JSON key `type`
        #[serde(rename = "type")]
        choice_type: String,
        /// The function being selected
        function: ToolChoiceFunction,
    },
}

/// Function reference used by `ToolChoice::Specific`
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct ToolChoiceFunction {
    /// Name of the selected function
    pub name: String,
}

/// One tool call in an assistant message (OpenAI shape: id, type "function", function { name, arguments })
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct MessageToolCall {
    /// Identifier of this tool call
    /// (presumably what a later "tool" message echoes in `tool_call_id` — confirm against callers)
    pub id: String,
    /// Call kind; (de)serialized under the JSON key `type` ("function" in the OpenAI shape)
    #[serde(rename = "type")]
    pub call_type: String,
    /// The function invocation: name plus arguments
    pub function: MessageToolCallFunction,
}

/// Function part of a `MessageToolCall`
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct MessageToolCallFunction {
    /// Name of the function being called
    pub name: String,
    /// Call arguments carried as a string
    /// (presumably JSON-encoded, as in the OpenAI shape — not parsed by this type)
    pub arguments: String,
}

// =============================================================================
// Multimodal Content Types
// =============================================================================

/// Image URL structure for vision messages (OpenAI format)
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct ImageUrlContent {
    /// The URL of the image (can be a data URL with base64)
    pub url: String,
    /// Optional detail level: "low", "high", or "auto"
    ///
    /// Omitted from the serialized output when `None`.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub detail: Option<String>,
}

/// A content part for multimodal messages (text or image)
///
/// Internally tagged for serde: the variant is written to a `type` field in
/// snake_case, i.e. `"text"` for `Text` and `"image_url"` for `ImageUrl`.
#[derive(Debug, Clone, Serialize, Deserialize)]
#[serde(tag = "type", rename_all = "snake_case")]
pub enum ContentPart {
    /// Text content part
    Text { text: String },
    /// Image URL content part (for vision models)
    ImageUrl { image_url: ImageUrlContent },
}

impl ContentPart {
    /// Build a text part from anything convertible into a `String`.
    pub fn text(text: impl Into<String>) -> Self {
        let text = text.into();
        Self::Text { text }
    }

    /// Build an image part that references `url`; no detail level is set.
    pub fn image_url(url: impl Into<String>) -> Self {
        let image_url = ImageUrlContent {
            url: url.into(),
            detail: None,
        };
        Self::ImageUrl { image_url }
    }

    /// Build an image part from already base64-encoded data.
    ///
    /// The data is embedded as a data URL of the form
    /// `data:<mime_type>;base64,<base64_data>`; no detail level is set.
    pub fn image_base64(base64_data: impl Into<String>, mime_type: impl Into<String>) -> Self {
        let mime = mime_type.into();
        let payload = base64_data.into();
        Self::image_url(format!("data:{};base64,{}", mime, payload))
    }
}

/// Message content - can be either a simple string or multimodal content parts
///
/// The enum is `#[serde(untagged)]`: `Text` (de)serializes as a bare JSON
/// string and `Parts` as a JSON array of content parts.
#[derive(Debug, Clone, Serialize, Deserialize)]
#[serde(untagged)]
pub enum MessageContent {
    /// Simple text content
    Text(String),
    /// Multimodal content (text + images)
    Parts(Vec<ContentPart>),
}

impl MessageContent {
    /// Returns `true` when at least one part is an image.
    ///
    /// Plain text content never contains images.
    pub fn has_images(&self) -> bool {
        if let Self::Parts(parts) = self {
            parts.iter().any(|part| match part {
                ContentPart::ImageUrl { .. } => true,
                ContentPart::Text { .. } => false,
            })
        } else {
            false
        }
    }

    /// Returns the textual content as an owned `String`.
    ///
    /// For multimodal content the text parts are joined with `"\n"` in their
    /// original order; image parts are skipped.
    pub fn as_text(&self) -> String {
        match self {
            Self::Text(text) => text.to_owned(),
            Self::Parts(parts) => {
                let mut fragments: Vec<&str> = Vec::new();
                for part in parts {
                    if let ContentPart::Text { text } = part {
                        fragments.push(text.as_str());
                    }
                }
                fragments.join("\n")
            }
        }
    }
}

impl From<String> for MessageContent {
    fn from(s: String) -> Self {
        MessageContent::Text(s)
    }
}

impl From<&str> for MessageContent {
    fn from(s: &str) -> Self {
        MessageContent::Text(s.to_string())
    }
}

/// Chat message (OpenAI shape: role, optional content, optional tool_calls, optional tool_call_id for role "tool")
///
/// Every optional field is omitted from the serialized output when it is `None`.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct ChatMessage {
    /// Message role; the constructors in this file use "system", "user", "assistant" and "tool"
    pub role: String,
    /// Text-only content (for backward compatibility)
    #[serde(skip_serializing_if = "Option::is_none")]
    pub content: Option<String>,
    /// Multimodal content (text + images) - providers should prefer this if present
    #[serde(skip_serializing_if = "Option::is_none")]
    pub multimodal_content: Option<Vec<ContentPart>>,
    /// Tool calls attached to an assistant message
    #[serde(skip_serializing_if = "Option::is_none")]
    pub tool_calls: Option<Vec<MessageToolCall>>,
    /// Id of the tool call this message answers (set by `ChatMessage::tool_result`)
    #[serde(skip_serializing_if = "Option::is_none")]
    pub tool_call_id: Option<String>,
}

impl ChatMessage {
    /// Shared constructor for text-only messages: sets `role` and `content`
    /// and leaves every other field `None`.
    fn text_only(role: &str, content: String) -> Self {
        Self {
            role: role.to_string(),
            content: Some(content),
            multimodal_content: None,
            tool_calls: None,
            tool_call_id: None,
        }
    }

    /// System message (role "system") with text content.
    pub fn system(content: impl Into<String>) -> Self {
        Self::text_only("system", content.into())
    }

    /// User message (role "user") with text content.
    pub fn user(content: impl Into<String>) -> Self {
        Self::text_only("user", content.into())
    }

    /// User message with images (multimodal content for vision models)
    ///
    /// Creates a user message whose first part is `text`, followed by one
    /// image part per entry of `images`, in order.
    ///
    /// Each image is given as its **raw bytes** plus its MIME type. The bytes
    /// are base64-encoded here and embedded as a `data:<mime>;base64,<data>`
    /// URL, so callers must not pre-encode them.
    ///
    /// The resulting message has `content: None`; the text lives in
    /// `multimodal_content` instead.
    ///
    /// # Example
    /// ```ignore
    /// let jpeg_bytes: Vec<u8> = std::fs::read("photo.jpg")?;
    /// let msg = ChatMessage::user_with_images(
    ///     "Describe this image",
    ///     vec![(jpeg_bytes, "image/jpeg".to_string())],
    /// );
    /// assert!(msg.has_images());
    /// ```
    pub fn user_with_images<S: Into<String>>(
        text: S,
        images: Vec<(Vec<u8>, String)>, // (raw bytes, mime_type)
    ) -> Self {
        use base64::Engine;

        // One text part plus one part per image.
        let mut parts = Vec::with_capacity(images.len() + 1);
        parts.push(ContentPart::text(text));
        parts.extend(images.into_iter().map(|(bytes, mime_type)| {
            let encoded = base64::engine::general_purpose::STANDARD.encode(&bytes);
            ContentPart::image_base64(encoded, mime_type)
        }));

        Self {
            role: "user".to_string(),
            content: None, // content is None when using multimodal
            multimodal_content: Some(parts),
            tool_calls: None,
            tool_call_id: None,
        }
    }

    /// Assistant message (role "assistant") with text content.
    pub fn assistant(content: impl Into<String>) -> Self {
        Self::text_only("assistant", content.into())
    }

    /// Assistant message with tool calls (content may be None or empty)
    pub fn assistant_with_tool_calls(
        content: Option<String>,
        tool_calls: Vec<MessageToolCall>,
    ) -> Self {
        Self {
            role: "assistant".to_string(),
            content,
            multimodal_content: None,
            tool_calls: Some(tool_calls),
            tool_call_id: None,
        }
    }

    /// Tool result message (role "tool")
    ///
    /// `tool_call_id` is stored in the message's `tool_call_id` field and
    /// `content` becomes its text content.
    pub fn tool_result(tool_call_id: impl Into<String>, content: impl Into<String>) -> Self {
        let mut message = Self::text_only("tool", content.into());
        message.tool_call_id = Some(tool_call_id.into());
        message
    }

    /// Check if this message contains multimodal content (images)
    ///
    /// Returns `true` only when `multimodal_content` is present and holds at
    /// least one image part.
    pub fn has_images(&self) -> bool {
        // `Option::iter` yields zero or one `&Vec`, which `flatten` walks part by part.
        self.multimodal_content
            .iter()
            .flatten()
            .any(|part| matches!(part, ContentPart::ImageUrl { .. }))
    }

    /// Get the effective content for serialization to providers
    ///
    /// Precedence: multimodal parts if present, otherwise the text content,
    /// otherwise an empty text value.
    pub fn effective_content(&self) -> MessageContent {
        match (&self.multimodal_content, &self.content) {
            (Some(parts), _) => MessageContent::Parts(parts.clone()),
            (None, Some(text)) => MessageContent::Text(text.clone()),
            (None, None) => MessageContent::Text(String::new()),
        }
    }
}

/// Chat completion request
///
/// Serialize-only. Every optional field is omitted from the serialized output
/// when it is `None`.
#[derive(Debug, Clone, Serialize)]
pub struct ChatRequest {
    /// Messages that make up the request
    pub messages: Vec<ChatMessage>,
    /// Optional token limit for the request
    #[serde(skip_serializing_if = "Option::is_none")]
    pub max_tokens: Option<u32>,
    /// Optional sampling temperature
    #[serde(skip_serializing_if = "Option::is_none")]
    pub temperature: Option<f32>,
    /// Optional tool definitions offered with the request
    #[serde(skip_serializing_if = "Option::is_none")]
    pub tools: Option<Vec<ChatTool>>,
    /// Optional tool choice ("auto", "none", or a specific function)
    #[serde(skip_serializing_if = "Option::is_none")]
    pub tool_choice: Option<ToolChoice>,
}

/// Chat completion response
///
/// Deserialize-only.
#[derive(Debug, Clone, Deserialize)]
pub struct ChatResponse {
    /// Response identifier
    pub id: String,
    /// Returned choices
    pub choices: Vec<ChatChoice>,
    /// Token usage, when present in the response
    pub usage: Option<ChatUsage>,
}

/// One choice within a `ChatResponse`
#[derive(Debug, Clone, Deserialize)]
pub struct ChatChoice {
    /// Index of this choice as given in the response
    pub index: u32,
    /// The message for this choice
    pub message: ChatMessage,
    /// Finish reason string, when present in the response
    pub finish_reason: Option<String>,
}

/// Token usage reported in a `ChatResponse`
#[derive(Debug, Clone, Deserialize)]
pub struct ChatUsage {
    /// Tokens counted for the prompt
    pub prompt_tokens: u32,
    /// Tokens counted for the completion
    pub completion_tokens: u32,
    /// Total token count as given in the response (not computed here)
    pub total_tokens: u32,
}

// =============================================================================
// Embedding Types
// =============================================================================

/// Embedding request
///
/// Serialize-only.
#[derive(Debug, Clone, Serialize)]
pub struct EmbeddingRequest {
    /// Text to embed.
    ///
    /// This field is a single `String`, so one request carries one input; an
    /// array of strings (batch input) cannot be represented by this type.
    pub input: String,
    /// Optional model override; omitted from the serialized output when `None`
    #[serde(skip_serializing_if = "Option::is_none")]
    pub model: Option<String>,
}

/// Embedding response
///
/// Deserialize-only.
#[derive(Debug, Clone, Deserialize)]
pub struct EmbeddingResponse {
    /// Embedding data (one per input)
    pub data: Vec<EmbeddingData>,
    /// Model used for embedding
    pub model: String,
    /// Token usage information, when present in the response
    pub usage: Option<EmbeddingUsage>,
}

/// One embedding entry within an `EmbeddingResponse`
#[derive(Debug, Clone, Deserialize)]
pub struct EmbeddingData {
    /// The embedding vector
    pub embedding: Vec<f32>,
    /// Index of the embedding in the batch
    pub index: u32,
}

/// Token usage reported in an `EmbeddingResponse`
#[derive(Debug, Clone, Deserialize)]
pub struct EmbeddingUsage {
    /// Number of tokens in the input
    pub prompt_tokens: u32,
    /// Total tokens used
    pub total_tokens: u32,
}

// =============================================================================
// Image Generation Types (OpenAI /v1/images/generations compatible)
// =============================================================================

/// Image generation request (OpenAI-compatible)
///
/// Only `prompt` is required. Every optional field is omitted from the
/// serialized output when it is `None`.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct ImageGenerationRequest {
    /// The text prompt describing the image to generate
    pub prompt: String,
    /// Optional model override
    #[serde(skip_serializing_if = "Option::is_none")]
    pub model: Option<String>,
    /// Number of images to generate (default: 1)
    #[serde(skip_serializing_if = "Option::is_none")]
    pub n: Option<u32>,
    /// Image size (e.g., "1024x1024", "1792x1024", "1024x1792")
    #[serde(skip_serializing_if = "Option::is_none")]
    pub size: Option<String>,
    /// Quality level ("standard" or "hd")
    #[serde(skip_serializing_if = "Option::is_none")]
    pub quality: Option<String>,
    /// Style ("vivid" or "natural")
    #[serde(skip_serializing_if = "Option::is_none")]
    pub style: Option<String>,
    /// Response format ("url" or "b64_json")
    #[serde(skip_serializing_if = "Option::is_none")]
    pub response_format: Option<String>,
    /// A unique identifier for the end-user
    #[serde(skip_serializing_if = "Option::is_none")]
    pub user: Option<String>,
}

/// Image generation response (OpenAI-compatible)
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct ImageGenerationResponse {
    /// Unix timestamp of when the response was created
    pub created: u64,
    /// Array of generated images; each entry is an `ImageData`
    pub data: Vec<ImageData>,
}

/// Individual generated image data
///
/// All fields are optional and omitted from the serialized output when `None`.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct ImageData {
    /// URL of the generated image (if response_format is "url")
    #[serde(skip_serializing_if = "Option::is_none")]
    pub url: Option<String>,
    /// Base64-encoded image (if response_format is "b64_json")
    #[serde(skip_serializing_if = "Option::is_none")]
    pub b64_json: Option<String>,
    /// Revised prompt (if model revised the prompt)
    #[serde(skip_serializing_if = "Option::is_none")]
    pub revised_prompt: Option<String>,
}

// =============================================================================
// Audio Transcription Types (OpenAI /v1/audio/transcriptions compatible)
// =============================================================================

/// Audio transcription request (OpenAI-compatible)
///
/// Unlike most request types in this module, this one derives neither
/// `Serialize` nor `Deserialize`
/// (presumably because the audio is uploaded as a file rather than as JSON — confirm with provider implementations).
#[derive(Debug, Clone)]
pub struct AudioTranscriptionRequest {
    /// Audio file bytes
    pub file: Vec<u8>,
    /// Original filename (for format detection)
    pub filename: String,
    /// Optional model override
    pub model: Option<String>,
    /// Language of the audio (ISO-639-1 code, e.g., "en")
    pub language: Option<String>,
    /// Prompt to guide the model's style
    pub prompt: Option<String>,
    /// Response format ("json", "text", "srt", "verbose_json", "vtt")
    pub response_format: Option<String>,
    /// Temperature for sampling (0-1)
    pub temperature: Option<f32>,
}

/// Audio transcription response (OpenAI-compatible)
///
/// Only `text` is required. Every optional field is omitted from the
/// serialized output when it is `None`.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct AudioTranscriptionResponse {
    /// The transcribed text
    pub text: String,
    /// Task type (always "transcribe")
    #[serde(skip_serializing_if = "Option::is_none")]
    pub task: Option<String>,
    /// Language detected or specified
    #[serde(skip_serializing_if = "Option::is_none")]
    pub language: Option<String>,
    /// Duration of the audio in seconds
    #[serde(skip_serializing_if = "Option::is_none")]
    pub duration: Option<f64>,
    /// Word-level timestamps (verbose_json only)
    #[serde(skip_serializing_if = "Option::is_none")]
    pub words: Option<Vec<TranscriptionWord>>,
    /// Segment-level timestamps (verbose_json only)
    #[serde(skip_serializing_if = "Option::is_none")]
    pub segments: Option<Vec<TranscriptionSegment>>,
}

/// Word-level transcription timing
///
/// Element type of `AudioTranscriptionResponse::words`.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct TranscriptionWord {
    /// The transcribed word
    pub word: String,
    /// Start time in seconds
    pub start: f64,
    /// End time in seconds
    pub end: f64,
}

/// Segment-level transcription
///
/// Element type of `AudioTranscriptionResponse::segments`.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct TranscriptionSegment {
    /// Segment ID
    pub id: u32,
    /// Start time in seconds
    pub start: f64,
    /// End time in seconds
    pub end: f64,
    /// Transcribed text
    pub text: String,
}

// =============================================================================
// Text-to-Speech Types (OpenAI /v1/audio/speech compatible)
// =============================================================================

/// Text-to-speech request (OpenAI-compatible)
///
/// `input` and `voice` are required. Every optional field is omitted from the
/// serialized output when it is `None`.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct SpeechRequest {
    /// The text to convert to speech (max 4096 chars)
    ///
    /// The length limit is not enforced by this type.
    pub input: String,
    /// Optional model override
    #[serde(skip_serializing_if = "Option::is_none")]
    pub model: Option<String>,
    /// Voice to use (e.g., "alloy", "echo", "fable", "onyx", "nova", "shimmer")
    pub voice: String,
    /// Audio format ("mp3", "opus", "aac", "flac", "wav", "pcm")
    #[serde(skip_serializing_if = "Option::is_none")]
    pub response_format: Option<String>,
    /// Speed of speech (0.25-4.0, default 1.0)
    ///
    /// The range is not enforced by this type.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub speed: Option<f32>,
}

/// Text-to-speech response
///
/// Carries raw audio bytes; this type derives neither `Serialize` nor `Deserialize`.
#[derive(Debug, Clone)]
pub struct SpeechResponse {
    /// Audio data bytes
    pub audio: Vec<u8>,
    /// Audio format/content type (e.g., "audio/mpeg", "audio/opus")
    pub content_type: String,
}

// =============================================================================
// Video Generation Types
// =============================================================================

/// Video generation request
///
/// Only `prompt` is required. Every optional field is omitted from the
/// serialized output when it is `None`.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct VideoGenerationRequest {
    /// Text prompt describing the video to generate
    pub prompt: String,
    /// Optional model override
    #[serde(skip_serializing_if = "Option::is_none")]
    pub model: Option<String>,
    /// Duration in seconds
    #[serde(skip_serializing_if = "Option::is_none")]
    pub duration: Option<f32>,
    /// Video size (e.g., "1920x1080", "1280x720")
    #[serde(skip_serializing_if = "Option::is_none")]
    pub size: Option<String>,
    /// Frames per second
    #[serde(skip_serializing_if = "Option::is_none")]
    pub fps: Option<u32>,
    /// Reference image for image-to-video (base64)
    #[serde(skip_serializing_if = "Option::is_none")]
    pub image: Option<String>,
    /// Negative prompt (what to avoid)
    #[serde(skip_serializing_if = "Option::is_none")]
    pub negative_prompt: Option<String>,
    /// Seed for reproducibility
    #[serde(skip_serializing_if = "Option::is_none")]
    pub seed: Option<u64>,
}

/// Video generation response
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct VideoGenerationResponse {
    /// Unix timestamp of when the response was created
    pub created: u64,
    /// Array of generated videos; each entry is a `VideoData`
    pub data: Vec<VideoData>,
}

/// Individual generated video data
///
/// All fields are optional and omitted from the serialized output when `None`.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct VideoData {
    /// URL of the generated video
    #[serde(skip_serializing_if = "Option::is_none")]
    pub url: Option<String>,
    /// Base64-encoded video
    #[serde(skip_serializing_if = "Option::is_none")]
    pub b64_json: Option<String>,
    /// Revised prompt (if model revised the prompt)
    #[serde(skip_serializing_if = "Option::is_none")]
    pub revised_prompt: Option<String>,
}

/// Model provider trait
#[async_trait]
pub trait ModelProvider: Send + Sync {
    /// Provider name
    fn name(&self) -> &str;

    /// Model name being used
    fn model(&self) -> &str {
        "default"
    }

    /// Get model capabilities
    fn capabilities(&self) -> ModelCapabilities {
        ModelCapabilities::default()
    }

    /// Whether this provider requires network access.
    /// Default is `true` for cloud LLM providers.
    /// Override to `false` for local providers (e.g., Ollama).
    fn requires_network(&self) -> bool {
        true
    }

    /// Create a chat completion
    async fn chat(&self, request: ChatRequest) -> anyhow::Result<ChatResponse>;

    /// Generate embeddings (if supported)
    ///
    /// Default implementation returns an error. Providers that support embeddings
    /// should override this method.
    async fn embed(&self, _request: EmbeddingRequest) -> anyhow::Result<EmbeddingResponse> {
        anyhow::bail!("Embeddings not supported by this provider")
    }

    /// Generate images (if supported)
    ///
    /// Default implementation returns an error. Providers that support image generation
    /// (e.g., DALL-E, Flux, Stable Diffusion) should override this method.
    async fn generate_image(
        &self,
        _request: ImageGenerationRequest,
    ) -> anyhow::Result<ImageGenerationResponse> {
        anyhow::bail!("Image generation not supported by this provider")
    }

    /// Transcribe audio to text (if supported)
    ///
    /// Default implementation returns an error. Providers that support audio transcription
    /// (e.g., Whisper) should override this method.
    async fn transcribe(
        &self,
        _request: AudioTranscriptionRequest,
    ) -> anyhow::Result<AudioTranscriptionResponse> {
        anyhow::bail!("Audio transcription not supported by this provider")
    }

    /// Generate speech from text (if supported)
    ///
    /// Default implementation returns an error. Providers that support text-to-speech
    /// should override this method.
    async fn speak(&self, _request: SpeechRequest) -> anyhow::Result<SpeechResponse> {
        anyhow::bail!("Text-to-speech not supported by this provider")
    }

    /// Generate video (if supported)
    ///
    /// Default implementation returns an error. Providers that support video generation
    /// (e.g., Runway, Pika) should override this method.
    async fn generate_video(
        &self,
        _request: VideoGenerationRequest,
    ) -> anyhow::Result<VideoGenerationResponse> {
        anyhow::bail!("Video generation not supported by this provider")
    }
}