roder-protocol 0.1.2

use std::collections::BTreeMap;

use roder_api::inference::ProviderAuthType;
use roder_api::speech::{
    SpeechCapabilities, SpeechModelDescriptor, SpeechSegment, SpeechSynthesisCapabilities,
    SpeechSynthesisModelDescriptor,
};
use serde::{Deserialize, Serialize};

#[derive(Debug, Clone, Serialize, Deserialize)]
#[serde(rename_all = "camelCase")]
pub struct SpeechProviderDescriptor {
    pub id: String,
    pub name: String,
    pub description: Option<String>,
    pub auth_type: ProviderAuthType,
    pub auth_label: Option<String>,
    pub authenticated: bool,
    pub auth_detail: Option<String>,
    pub recommended: bool,
    pub sort_order: i32,
    pub capabilities: SpeechCapabilities,
    pub models: Vec<SpeechModelDescriptor>,
}

#[derive(Debug, Clone, Serialize, Deserialize)]
#[serde(rename_all = "camelCase")]
pub struct SpeechProvidersListResult {
    pub providers: Vec<SpeechProviderDescriptor>,
}

#[derive(Debug, Clone, Serialize, Deserialize)]
#[serde(rename_all = "camelCase")]
pub struct SpeechSynthesisProviderDescriptor {
    pub id: String,
    pub name: String,
    pub description: Option<String>,
    pub auth_type: ProviderAuthType,
    pub auth_label: Option<String>,
    pub authenticated: bool,
    pub auth_detail: Option<String>,
    pub recommended: bool,
    pub sort_order: i32,
    pub capabilities: SpeechSynthesisCapabilities,
    pub models: Vec<SpeechSynthesisModelDescriptor>,
}

#[derive(Debug, Clone, Serialize, Deserialize)]
#[serde(rename_all = "camelCase")]
pub struct SpeechSynthesisProvidersListResult {
    pub providers: Vec<SpeechSynthesisProviderDescriptor>,
}

#[derive(Debug, Clone, Serialize, Deserialize)]
#[serde(rename_all = "camelCase")]
pub struct SpeechAudioPayload {
    pub bytes_base64: String,
    pub mime_type: String,
    pub filename: Option<String>,
}

#[derive(Debug, Clone, Serialize, Deserialize)]
#[serde(rename_all = "camelCase")]
pub struct SpeechTranscribeParams {
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub provider: Option<String>,
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub model: Option<String>,
    pub audio: SpeechAudioPayload,
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub language: Option<String>,
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub prompt: Option<String>,
    #[serde(default)]
    pub diarization: bool,
    #[serde(default, skip_serializing_if = "BTreeMap::is_empty")]
    pub metadata: BTreeMap<String, serde_json::Value>,
}

#[derive(Debug, Clone, Serialize, Deserialize)]
#[serde(rename_all = "camelCase")]
pub struct SpeechTranscribeResult {
    pub provider: String,
    pub model: String,
    pub text: String,
    pub language: Option<String>,
    pub duration_millis: Option<u64>,
    pub segments: Vec<SpeechSegment>,
    pub provider_response_id: Option<String>,
    #[serde(default)]
    pub metadata: serde_json::Value,
}

#[derive(Debug, Clone, Serialize, Deserialize)]
#[serde(rename_all = "camelCase")]
pub struct SpeechSynthesizeParams {
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub provider: Option<String>,
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub model: Option<String>,
    pub text: String,
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub voice: Option<String>,
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub audio_format: Option<String>,
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub prompt: Option<String>,
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub voice_sample: Option<SpeechAudioPayload>,
    #[serde(default, skip_serializing_if = "BTreeMap::is_empty")]
    pub metadata: BTreeMap<String, serde_json::Value>,
}

#[derive(Debug, Clone, Serialize, Deserialize)]
#[serde(rename_all = "camelCase")]
pub struct SpeechSynthesizeResult {
    pub provider: String,
    pub model: String,
    pub audio: SpeechAudioPayload,
    pub duration_millis: Option<u64>,
    pub provider_response_id: Option<String>,
    #[serde(default)]
    pub metadata: serde_json::Value,
}

#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn transcribe_params_use_camel_case_audio_bytes() {
        let params: SpeechTranscribeParams = serde_json::from_value(serde_json::json!({
            "provider": "openai-speech",
            "model": "gpt-4o-mini-transcribe",
            "audio": {
                "bytesBase64": "YXVkaW8=",
                "mimeType": "audio/wav",
                "filename": "clip.wav"
            },
            "language": "en",
            "diarization": false
        }))
        .unwrap();

        assert_eq!(params.audio.bytes_base64, "YXVkaW8=");
        assert_eq!(params.audio.mime_type, "audio/wav");
        assert_eq!(params.audio.filename.as_deref(), Some("clip.wav"));
    }

    #[test]
    fn synthesize_params_use_camel_case_audio_format() {
        let params: SpeechSynthesizeParams = serde_json::from_value(serde_json::json!({
            "provider": "xiaomi-mimo",
            "model": "mimo-v2.5-tts",
            "text": "hello",
            "audioFormat": "wav",
            "voiceSample": {
                "bytesBase64": "dm9pY2U=",
                "mimeType": "audio/wav",
                "filename": "voice.wav"
            }
        }))
        .unwrap();

        assert_eq!(params.audio_format.as_deref(), Some("wav"));
        assert_eq!(
            params.voice_sample.unwrap().bytes_base64.as_str(),
            "dm9pY2U="
        );
    }
}