roder-api 0.1.0

use std::collections::BTreeMap;

use serde::{Deserialize, Serialize};

use crate::extension::{SpeechSynthesizerId, SpeechTranscriberId};
use crate::inference::ProviderAuthType;

/// Feature flags advertised for speech transcription.
///
/// Returned by [`SpeechTranscriber::capabilities`] and carried per model in
/// [`SpeechModelDescriptor`]. Keys are serialized in camelCase (for example
/// `language_hints` becomes `languageHints`), and the derived `Default`
/// leaves every flag `false`.
// NOTE(review): the per-flag meanings below are inferred from the field names
// and the request/result types in this file — confirm against implementors.
#[derive(Debug, Clone, Copy, Default, Serialize, Deserialize, PartialEq, Eq)]
#[serde(rename_all = "camelCase")]
pub struct SpeechCapabilities {
    /// Presumably: one-shot (non-streaming) transcription is supported.
    pub batch: bool,
    /// Presumably: incremental/streaming transcription is supported.
    pub streaming: bool,
    /// Presumably: speaker separation is supported; likely pairs with
    /// [`SpeechTranscriptionRequest::diarization`] and [`SpeechSegment::speaker`].
    pub diarization: bool,
    /// Presumably: segment timing is reported; likely pairs with
    /// [`SpeechSegment::start_millis`] and [`SpeechSegment::end_millis`].
    pub timestamps: bool,
    /// Presumably: [`SpeechTranscriptionRequest::language`] is honored.
    pub language_hints: bool,
    /// Presumably: [`SpeechTranscriptionRequest::prompt`] is honored.
    pub prompt: bool,
}

/// Feature flags advertised for speech synthesis.
///
/// Returned by [`SpeechSynthesizer::capabilities`] and carried per model in
/// [`SpeechSynthesisModelDescriptor`]. Keys are serialized in camelCase
/// (`builtin_voices` becomes `builtinVoices`, `voice_design` becomes
/// `voiceDesign`, `voice_clone` becomes `voiceClone`), and the derived
/// `Default` leaves every flag `false`.
// NOTE(review): the per-flag meanings below are inferred from the field names
// and the request type in this file — confirm against implementors.
#[derive(Debug, Clone, Copy, Default, Serialize, Deserialize, PartialEq, Eq)]
#[serde(rename_all = "camelCase")]
pub struct SpeechSynthesisCapabilities {
    /// Presumably: one-shot (non-streaming) synthesis is supported.
    pub batch: bool,
    /// Presumably: incremental/streaming synthesis is supported.
    pub streaming: bool,
    /// Presumably: the provider ships predefined voices selectable through
    /// [`SpeechSynthesisRequest::voice`].
    pub builtin_voices: bool,
    /// Presumably: a voice can be described/designed rather than picked — TODO confirm.
    pub voice_design: bool,
    /// Presumably: a voice can be cloned from
    /// [`SpeechSynthesisRequest::voice_sample`] — TODO confirm.
    pub voice_clone: bool,
    /// Presumably: [`SpeechSynthesisRequest::prompt`] is honored.
    pub prompt: bool,
}

/// Descriptive information about a speech provider.
///
/// Returned by both [`SpeechTranscriber::metadata`] and
/// [`SpeechSynthesizer::metadata`].
// NOTE(review): unlike the capability structs, no `rename_all` is applied here,
// so fields serialize under their snake_case names — confirm this is intended.
#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq)]
pub struct SpeechProviderMetadata {
    /// Provider name.
    pub name: String,
    /// Optional free-form description.
    pub description: Option<String>,
    /// Authentication scheme the provider uses.
    pub auth_type: ProviderAuthType,
    /// Optional label associated with the auth scheme (presumably shown next
    /// to the credential input — confirm with the UI consumer).
    pub auth_label: Option<String>,
    /// Whether authentication is configured; `None` presumably means the
    /// state is unknown — TODO confirm.
    pub auth_configured: Option<bool>,
    /// Whether this provider is flagged as recommended.
    pub recommended: bool,
    /// Ordering key for provider listings (direction is not defined in this
    /// file; presumably ascending — confirm with the consumer).
    pub sort_order: i32,
}

impl SpeechProviderMetadata {
    /// Builds metadata for a provider named `name` using fixed defaults.
    ///
    /// `name` is converted into an owned `String`. Every other field takes a
    /// constant value: no description, [`ProviderAuthType::None`], no auth
    /// label, `auth_configured` of `Some(true)`, `recommended` of `false`, and
    /// a `sort_order` of `100`.
    // NOTE(review): the constructor name suggests it targets locally hosted
    // providers that need no credentials — confirm with callers.
    pub fn local(name: impl Into<String>) -> Self {
        let name: String = name.into();
        let auth_type = ProviderAuthType::None;
        let auth_configured = Some(true);

        Self {
            name,
            auth_type,
            auth_configured,
            sort_order: 100,
            recommended: false,
            description: None,
            auth_label: None,
        }
    }
}

/// Describes one transcription model, as listed by
/// [`SpeechTranscriber::list_models`].
#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq)]
pub struct SpeechModelDescriptor {
    /// Model identifier (presumably the value expected in
    /// [`SpeechTranscriptionRequest::model`] — confirm with implementors).
    pub id: String,
    /// Model name.
    pub name: String,
    /// Optional free-form description.
    pub description: Option<String>,
    /// Capability flags for this specific model.
    pub capabilities: SpeechCapabilities,
}

/// Describes one synthesis model, as listed by
/// [`SpeechSynthesizer::list_models`].
#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq)]
pub struct SpeechSynthesisModelDescriptor {
    /// Model identifier (presumably the value expected in
    /// [`SpeechSynthesisRequest::model`] — confirm with implementors).
    pub id: String,
    /// Model name.
    pub name: String,
    /// Optional free-form description.
    pub description: Option<String>,
    /// Capability flags for this specific model.
    pub capabilities: SpeechSynthesisCapabilities,
}

/// An in-memory audio payload.
///
/// Used as transcription input ([`SpeechTranscriptionRequest::audio`]), as an
/// optional voice sample ([`SpeechSynthesisRequest::voice_sample`]), and as
/// synthesis output ([`SpeechSynthesisResult::audio`]).
// NOTE(review): with the plain serde derive, `bytes` is serialized as a
// sequence of integers (a JSON array of numbers), not as base64 — confirm this
// is acceptable wherever the type crosses a serialization boundary.
#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq)]
pub struct SpeechAudio {
    /// Raw audio data; the encoding is presumably the one named by `mime_type`.
    pub bytes: Vec<u8>,
    /// MIME type of the audio (format is not validated in this file).
    pub mime_type: String,
    /// Optional file name associated with the audio.
    pub filename: Option<String>,
}

/// Input to [`SpeechTranscriber::transcribe`].
// NOTE(review): fields serialize under their snake_case names (no `rename_all`),
// unlike the camelCase capability structs — confirm this is intended.
#[derive(Debug, Clone, Serialize, Deserialize, PartialEq)]
pub struct SpeechTranscriptionRequest {
    /// Identifier of the model to use.
    pub model: String,
    /// Audio to transcribe.
    pub audio: SpeechAudio,
    /// Optional language hint (expected format, e.g. a language tag, is not
    /// defined in this file — TODO confirm).
    pub language: Option<String>,
    /// Optional prompt text passed along with the request.
    pub prompt: Option<String>,
    /// Whether diarization is requested. This field has no serde default, so
    /// it must be present when deserializing.
    pub diarization: bool,
    /// Extra key/value data; deserializes to an empty map when the key is
    /// absent. How providers interpret the entries is not defined here.
    #[serde(default)]
    pub metadata: BTreeMap<String, serde_json::Value>,
}

/// One segment of a transcription, as carried in
/// [`SpeechTranscriptionResult::segments`].
#[derive(Debug, Clone, Serialize, Deserialize, PartialEq)]
pub struct SpeechSegment {
    /// Transcribed text of the segment.
    pub text: String,
    /// Optional segment start in milliseconds (presumably an offset from the
    /// start of the audio — confirm).
    pub start_millis: Option<u64>,
    /// Optional segment end in milliseconds (presumably an offset from the
    /// start of the audio — confirm).
    pub end_millis: Option<u64>,
    /// Optional speaker label for the segment.
    pub speaker: Option<String>,
    /// Optional confidence score (scale/range is not defined in this file —
    /// TODO confirm).
    pub confidence: Option<f32>,
}

/// Output of [`SpeechTranscriber::transcribe`].
#[derive(Debug, Clone, Serialize, Deserialize, PartialEq)]
pub struct SpeechTranscriptionResult {
    /// Transcribed text (presumably the full transcript, with `segments`
    /// giving the breakdown — confirm).
    pub text: String,
    /// Optional language of the result (presumably the detected or used
    /// language — confirm).
    pub language: Option<String>,
    /// Optional duration in milliseconds.
    pub duration_millis: Option<u64>,
    /// Transcription segments. This field has no serde default, so the key
    /// must be present when deserializing (it may be an empty list).
    pub segments: Vec<SpeechSegment>,
    /// Optional identifier of the provider's response.
    pub provider_response_id: Option<String>,
    /// Free-form extra data; deserializes to `serde_json::Value::Null` when
    /// the key is absent.
    #[serde(default)]
    pub metadata: serde_json::Value,
}

/// Input to [`SpeechSynthesizer::synthesize`].
// NOTE(review): fields serialize under their snake_case names (no `rename_all`),
// unlike the camelCase capability structs — confirm this is intended.
#[derive(Debug, Clone, Serialize, Deserialize, PartialEq)]
pub struct SpeechSynthesisRequest {
    /// Identifier of the model to use.
    pub model: String,
    /// Text to synthesize.
    pub text: String,
    /// Optional voice selector (accepted values are provider-defined and not
    /// specified in this file).
    pub voice: Option<String>,
    /// Optional requested output audio format (accepted values are not
    /// specified in this file — TODO confirm whether this is a MIME type or a
    /// short format name).
    pub audio_format: Option<String>,
    /// Optional prompt text passed along with the request.
    pub prompt: Option<String>,
    /// Optional audio sample of a voice (presumably used for voice cloning —
    /// confirm with implementors).
    pub voice_sample: Option<SpeechAudio>,
    /// Extra key/value data; deserializes to an empty map when the key is
    /// absent. How providers interpret the entries is not defined here.
    #[serde(default)]
    pub metadata: BTreeMap<String, serde_json::Value>,
}

/// Output of [`SpeechSynthesizer::synthesize`].
#[derive(Debug, Clone, Serialize, Deserialize, PartialEq)]
pub struct SpeechSynthesisResult {
    /// Synthesized audio.
    pub audio: SpeechAudio,
    /// Optional duration in milliseconds.
    pub duration_millis: Option<u64>,
    /// Optional identifier of the provider's response.
    pub provider_response_id: Option<String>,
    /// Free-form extra data; deserializes to `serde_json::Value::Null` when
    /// the key is absent.
    #[serde(default)]
    pub metadata: serde_json::Value,
}

/// Borrowed per-call context handed to the async methods of
/// [`SpeechTranscriber`] and [`SpeechSynthesizer`].
///
/// The type is `Copy`, so it is passed by value.
#[derive(Debug, Clone, Copy)]
pub struct SpeechProviderContext<'a> {
    /// Identifier of the provider the call is made for, borrowed for `'a`.
    pub provider_id: &'a str,
}

/// A speech-to-text provider.
///
/// Implementors must be `Send + Sync + 'static`. The async methods are
/// declared through `async_trait`, so implementations must apply the same
/// attribute to their `impl` block.
#[async_trait::async_trait]
pub trait SpeechTranscriber: Send + Sync + 'static {
    /// Returns the identifier of this transcriber.
    fn id(&self) -> SpeechTranscriberId;

    /// Returns the capability flags of this transcriber.
    fn capabilities(&self) -> SpeechCapabilities;

    /// Returns descriptive metadata about the provider.
    fn metadata(&self) -> SpeechProviderMetadata;

    /// Lists the transcription models available for the given context.
    ///
    /// # Errors
    ///
    /// Returns an `anyhow::Error` on failure; the failure conditions are
    /// defined by the implementation.
    async fn list_models(
        &self,
        ctx: SpeechProviderContext<'_>,
    ) -> anyhow::Result<Vec<SpeechModelDescriptor>>;

    /// Transcribes the audio in `request`, which is taken by value.
    ///
    /// # Errors
    ///
    /// Returns an `anyhow::Error` on failure; the failure conditions are
    /// defined by the implementation.
    async fn transcribe(
        &self,
        ctx: SpeechProviderContext<'_>,
        request: SpeechTranscriptionRequest,
    ) -> anyhow::Result<SpeechTranscriptionResult>;
}

/// A text-to-speech provider.
///
/// Implementors must be `Send + Sync + 'static`. The async methods are
/// declared through `async_trait`, so implementations must apply the same
/// attribute to their `impl` block.
#[async_trait::async_trait]
pub trait SpeechSynthesizer: Send + Sync + 'static {
    /// Returns the identifier of this synthesizer.
    fn id(&self) -> SpeechSynthesizerId;

    /// Returns the capability flags of this synthesizer.
    fn capabilities(&self) -> SpeechSynthesisCapabilities;

    /// Returns descriptive metadata about the provider.
    fn metadata(&self) -> SpeechProviderMetadata;

    /// Lists the synthesis models available for the given context.
    ///
    /// # Errors
    ///
    /// Returns an `anyhow::Error` on failure; the failure conditions are
    /// defined by the implementation.
    async fn list_models(
        &self,
        ctx: SpeechProviderContext<'_>,
    ) -> anyhow::Result<Vec<SpeechSynthesisModelDescriptor>>;

    /// Synthesizes audio for `request`, which is taken by value.
    ///
    /// # Errors
    ///
    /// Returns an `anyhow::Error` on failure; the failure conditions are
    /// defined by the implementation.
    async fn synthesize(
        &self,
        ctx: SpeechProviderContext<'_>,
        request: SpeechSynthesisRequest,
    ) -> anyhow::Result<SpeechSynthesisResult>;
}