// vona-core 0.1.1
//
// Core traits, event types, session driver, skill registry, and runtime policy surface for real-time speech-to-speech runtimes.
// Documentation
use crate::types::{AudioInputFrame, AudioOutputFrame, ExternalContextEvent, SkillCall};
use async_trait::async_trait;
use serde::{Deserialize, Serialize};
use serde_json::Value;
use thiserror::Error;

/// Describes which kind of realtime voice model a session is configured for.
///
/// Stored in `RealtimeVoiceSessionConfig::model_family`. Because of
/// `rename_all = "snake_case"`, the variant names are serialized in snake case
/// (e.g. `hosted_realtime_api`, `moshi_family`). All payload fields are
/// free-form strings; nothing in this file validates or enumerates them.
#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
#[serde(rename_all = "snake_case")]
pub enum RealtimeVoiceModelFamily {
    /// A hosted realtime API, described by a `provider` string and a `model` string.
    HostedRealtimeApi { provider: String, model: String },
    /// A Moshi-family model; `variant` names the specific variant.
    MoshiFamily { variant: String },
    /// A Seamless-family model; `variant` names the specific variant.
    SeamlessFamily { variant: String },
    /// An open realtime model identified by `name`.
    // NOTE(review): "open" presumably means openly available weights — confirm.
    OpenRealtimeModel { name: String },
    /// Any other model, identified by `name`. The `Default` impl of
    /// `RealtimeVoiceSessionConfig` uses this variant with the name `"unspecified"`.
    Custom { name: String },
}

/// Configuration passed by value to `RealtimeVoiceBackend::start_realtime_session`.
///
/// Serializable with serde. When deserializing, every field except `metadata`
/// must be present in the input.
#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)]
pub struct RealtimeVoiceSessionConfig {
    /// Identifier string for the session.
    pub session_id: String,
    /// Sample rate of input audio, in hertz.
    pub input_sample_rate_hz: u32,
    /// Sample rate of output audio, in hertz.
    pub output_sample_rate_hz: u32,
    /// Number of audio channels.
    // NOTE(review): presumably applies to both input and output audio — confirm.
    pub channels: u16,
    /// The model family this session targets.
    pub model_family: RealtimeVoiceModelFamily,
    /// Arbitrary JSON metadata. Thanks to `#[serde(default)]`, a missing field
    /// deserializes to `Value::default()`, i.e. `Value::Null`.
    #[serde(default)]
    pub metadata: Value,
}

impl Default for RealtimeVoiceSessionConfig {
    fn default() -> Self {
        Self {
            session_id: "default-realtime-session".to_string(),
            input_sample_rate_hz: 24_000,
            output_sample_rate_hz: 24_000,
            channels: 1,
            model_family: RealtimeVoiceModelFamily::Custom {
                name: "unspecified".to_string(),
            },
            metadata: Value::Null,
        }
    }
}

/// Feature flags a backend reports via
/// `RealtimeVoiceBackend::realtime_capabilities`.
///
/// The derived `Default` yields every flag `false` and `max_input_chunk_ms`
/// set to `None`.
#[derive(Debug, Clone, Default, PartialEq, Eq, Serialize, Deserialize)]
pub struct RealtimeVoiceCapabilities {
    /// Whether the backend supports full-duplex operation.
    pub supports_full_duplex: bool,
    /// Whether audio input can be streamed to the backend.
    pub supports_streaming_audio_input: bool,
    /// Whether the backend streams audio output.
    pub supports_streaming_audio_output: bool,
    /// Whether the backend can emit tool calls.
    pub supports_tool_calls: bool,
    /// Whether the backend supports interruption.
    pub supports_interruption: bool,
    /// Whether the backend supports context injection.
    pub supports_context_injection: bool,
    /// Whether the backend is a hosted service.
    pub is_hosted_service: bool,
    /// Maximum input chunk length in milliseconds, if the backend reports one.
    // NOTE(review): `None` presumably means "no limit advertised" — confirm.
    pub max_input_chunk_ms: Option<u32>,
}

/// Control messages that can be sent to a realtime session, wrapped in
/// `RealtimeVoiceInput::Control`.
///
/// Variant names are serialized in snake case (e.g. `start_response`).
/// How a backend reacts to each message is defined by the backend
/// implementation, not by this file.
#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
#[serde(rename_all = "snake_case")]
pub enum RealtimeVoiceControl {
    /// Request that a response be started.
    StartResponse,
    /// Commit the input.
    // NOTE(review): presumably commits audio/text buffered so far — confirm.
    CommitInput,
    /// Clear the output.
    // NOTE(review): presumably discards pending output audio — confirm.
    ClearOutput,
    /// Interrupt, with an optional free-form `reason`.
    Interrupt { reason: Option<String> },
    /// Close.
    // NOTE(review): relationship to `close_realtime_session` is not visible here.
    Close,
}

/// An event sent to a backend via `RealtimeVoiceBackend::send_realtime_event`.
///
/// Variant names are serialized in snake case (`audio`, `text`,
/// `tool_result`, `control`).
#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)]
#[serde(rename_all = "snake_case")]
pub enum RealtimeVoiceInput {
    /// An audio input frame.
    Audio(AudioInputFrame),
    /// Text input.
    Text { text: String },
    /// A tool result, carried as an `ExternalContextEvent`.
    ToolResult(ExternalContextEvent),
    /// A control message.
    Control(RealtimeVoiceControl),
}

/// The stage a `RealtimeLatencyMark` refers to.
///
/// Variant names are serialized in snake case (e.g. `first_audio`).
#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
#[serde(rename_all = "snake_case")]
pub enum RealtimeLatencyStage {
    /// Input was received.
    InputReceived,
    /// First audio.
    // NOTE(review): presumably the first output audio of a response — confirm.
    FirstAudio,
    /// A tool call was emitted.
    ToolCallEmitted,
    /// Output was cleared.
    OutputCleared,
}

/// A latency measurement for one stage, emitted as
/// `RealtimeVoiceOutput::LatencyMark`.
#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
pub struct RealtimeLatencyMark {
    /// Which stage this mark refers to.
    pub stage: RealtimeLatencyStage,
    /// Elapsed time in milliseconds.
    // NOTE(review): the reference point the time is measured from is not
    // defined in this file — confirm with the producer.
    pub elapsed_ms: u64,
}

/// An event produced by a backend, returned from
/// `RealtimeVoiceBackend::recv_realtime_event`.
///
/// Variant names are serialized in snake case (e.g. `transcript_fragment`,
/// `tool_call`, `response_completed`).
#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)]
#[serde(rename_all = "snake_case")]
pub enum RealtimeVoiceOutput {
    /// An audio output frame.
    Audio(AudioOutputFrame),
    /// A piece of transcript text; `final_fragment` flags the fragment as final.
    // NOTE(review): whether "final" is per utterance or per response is not
    // specified here.
    TranscriptFragment { text: String, final_fragment: bool },
    /// A tool call, carried as a `SkillCall`.
    ToolCall(SkillCall),
    /// An interruption, with an optional free-form `reason`.
    Interruption { reason: Option<String> },
    /// A latency measurement.
    LatencyMark(RealtimeLatencyMark),
    /// A response completed, with an optional free-form `reason`.
    ResponseCompleted { reason: Option<String> },
    /// The session closed, with an optional free-form `reason`.
    Closed { reason: Option<String> },
}

/// Error type returned by every fallible `RealtimeVoiceBackend` method.
///
/// Each variant carries a free-form message that `thiserror` interpolates into
/// the `Display` text given by its `#[error]` attribute. The trait does not
/// restrict which variant a given method may return.
#[derive(Debug, Clone, Error, PartialEq, Eq)]
pub enum RealtimeVoiceError {
    /// Starting a session failed.
    #[error("realtime session start failed: {0}")]
    Start(String),
    /// Sending an event failed.
    #[error("realtime send failed: {0}")]
    Send(String),
    /// Receiving an event failed.
    #[error("realtime receive failed: {0}")]
    Receive(String),
    /// Closing a session failed.
    #[error("realtime session close failed: {0}")]
    Close(String),
}

/// Interface a realtime voice backend implements: report capabilities, start a
/// session, exchange events with it, and close it.
///
/// Implementors must be `Send + Sync`. The async methods are provided through
/// the `async_trait` macro. All session operations go through `&self` on the
/// backend, with the session passed in explicitly.
#[async_trait]
pub trait RealtimeVoiceBackend: Send + Sync {
    /// Backend-specific session state; must be `Send + Sync`.
    type Session: Send + Sync;

    /// Returns the capabilities of this backend.
    fn realtime_capabilities(&self) -> RealtimeVoiceCapabilities;

    /// Starts a session from `config` (taken by value).
    ///
    /// # Errors
    ///
    /// Returns a `RealtimeVoiceError` if the session cannot be started.
    async fn start_realtime_session(
        &self,
        config: RealtimeVoiceSessionConfig,
    ) -> Result<Self::Session, RealtimeVoiceError>;

    /// Sends one `input` event to `session`.
    ///
    /// # Errors
    ///
    /// Returns a `RealtimeVoiceError` if the event cannot be sent.
    async fn send_realtime_event(
        &self,
        session: &mut Self::Session,
        input: RealtimeVoiceInput,
    ) -> Result<(), RealtimeVoiceError>;

    /// Receives the next output event from `session`, if any.
    ///
    /// # Errors
    ///
    /// Returns a `RealtimeVoiceError` if receiving fails.
    // NOTE(review): the meaning of `Ok(None)` (no event ready yet vs. stream
    // ended) is not specified in this file — confirm with implementations.
    async fn recv_realtime_event(
        &self,
        session: &mut Self::Session,
    ) -> Result<Option<RealtimeVoiceOutput>, RealtimeVoiceError>;

    /// Closes `session`, consuming it.
    ///
    /// # Errors
    ///
    /// Returns a `RealtimeVoiceError` if closing fails.
    async fn close_realtime_session(
        &self,
        session: Self::Session,
    ) -> Result<(), RealtimeVoiceError>;
}