smart-tree 8.0.1

Smart Tree - An intelligent, AI-friendly directory visualization tool
Documentation
//! Voice Integration - Transcription and TTS via liquid-rust
//!
//! Provides voice capabilities for the dashboard:
//! - Speech-to-text transcription with salience analysis
//! - Speaker recognition via Phoenix Protocol
//! - Text-to-speech with multiple voice personas
//!
//! Requires the `voice` feature flag and liquid-rust models.
//! Until liquid-rust is integrated, every handler in this module is a stub that
//! responds with `501 Not Implemented`.

use axum::{
    extract::Multipart,
    http::StatusCode,
    Json,
};
use serde::{Deserialize, Serialize};

/// Transcription result with salience and speaker info.
///
/// This is the success payload type declared by [`transcribe`]. It derives both
/// `Serialize` and `Deserialize`, so it can be emitted as JSON and parsed back.
/// None of the documented numeric ranges below are enforced by the type itself.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct TranscriptionResult {
    /// Transcribed text.
    pub text: String,
    /// Salience score (0.0 to 1.0): how important/urgent the utterance is.
    pub salience: f32,
    /// Identified speaker (if registered in Phoenix DB); `None` when no
    /// speaker was identified.
    pub speaker: Option<String>,
    /// Speaker identification confidence.
    // NOTE(review): scale is not stated here; presumably 0.0 to 1.0 like
    // `salience`. Confirm once the real engine is wired in.
    pub speaker_confidence: Option<f32>,
    /// Emotional profile derived from the voice, when available.
    pub emotion: Option<EmotionProfile>,
}

/// Emotional profile from voice analysis.
///
/// Carried as the optional `emotion` field of [`TranscriptionResult`]. The
/// ranges below are documentation only; the type does not validate them.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct EmotionProfile {
    /// Valence: positive (1.0) to negative (-1.0).
    pub valence: f32,
    /// Arousal: excited (1.0) to calm (0.0).
    pub arousal: f32,
    /// Voice stability (0.0 to 1.0).
    pub stability: f32,
}

/// TTS request: the JSON body deserialized by [`speak`].
///
/// Only `text` is required; `voice` may be omitted from the payload.
#[derive(Debug, Deserialize)]
pub struct SpeakRequest {
    /// Text to speak.
    pub text: String,
    /// Voice persona to use.
    ///
    /// When the field is absent from the incoming JSON, serde fills it in by
    /// calling `default_voice`, which yields `"aye"`.
    #[serde(default = "default_voice")]
    pub voice: String,
}

/// Fallback persona for [`SpeakRequest::voice`], used by serde when the
/// request body does not carry a `voice` field.
fn default_voice() -> String {
    String::from("aye")
}

/// Speaker registration request.
///
/// Deserialize-only description of the registration metadata.
// NOTE(review): the `register_speaker` handler in this file takes a raw
// `Multipart` body rather than this struct, so nothing visible here consumes
// it yet. Presumably it is meant for parsing the `label` form field later.
#[derive(Debug, Deserialize)]
pub struct RegisterSpeakerRequest {
    /// Label for the speaker (e.g., "Hue").
    pub label: String,
}

// =============================================================================
// Stub API Handlers (until liquid-rust is integrated)
// =============================================================================

/// Speech-to-text endpoint for uploaded audio (stub).
///
/// Route: `POST /api/voice/transcribe`
/// Content-Type: `multipart/form-data`
///
/// The declared success payload is a [`TranscriptionResult`] carrying the
/// text, salience, and optional speaker ID.
///
/// # Errors
///
/// Always fails for now: the multipart body is accepted but never read, and
/// the handler answers `501 Not Implemented` with a message pointing at the
/// design document.
pub async fn transcribe(
    mut _multipart: Multipart,
) -> Result<Json<TranscriptionResult>, (StatusCode, String)> {
    // TODO: replace this stub with real inference once liquid-rust is integrated.
    let reason = String::from(
        "Voice transcription requires liquid-rust integration. \
         See docs/plans/2025-11-11-realtime-collaborative-dashboard-design.md",
    );
    Err((StatusCode::NOT_IMPLEMENTED, reason))
}

/// Speaker enrolment endpoint for Phoenix Protocol recognition (stub).
///
/// Route: `POST /api/voice/register`
/// Content-Type: `multipart/form-data`
/// Intended fields: `label` (text), `audio` (file)
///
/// # Errors
///
/// Always fails for now: the multipart body is accepted but never read, and
/// the handler answers `501 Not Implemented`.
pub async fn register_speaker(
    mut _multipart: Multipart,
) -> Result<Json<serde_json::Value>, (StatusCode, String)> {
    let reason = String::from("Speaker registration requires liquid-rust integration.");
    Err((StatusCode::NOT_IMPLEMENTED, reason))
}

/// Text-to-speech endpoint (stub).
///
/// Route: `POST /api/voice/speak`
/// Content-Type: `application/json`
/// Body: `{ "text": "Hello", "voice": "aye" }`
///
/// Available voices: aye, omnimom, claude, alert, sky, adam, bella, nicole, michael
///
/// # Errors
///
/// Always fails for now: the request body is deserialized but otherwise
/// ignored, and the handler answers `501 Not Implemented`.
pub async fn speak(
    Json(_req): Json<SpeakRequest>,
) -> Result<impl axum::response::IntoResponse, (StatusCode, String)> {
    // The success type is spelled out because `impl IntoResponse` needs one
    // concrete type and this stub never builds an `Ok` value to infer it from.
    let response: Result<
        ([(axum::http::header::HeaderName, &str); 1], Vec<u8>),
        (StatusCode, String),
    > = Err((
        StatusCode::NOT_IMPLEMENTED,
        String::from("TTS requires liquid-rust integration."),
    ));
    response
}

// =============================================================================
// Future: Full implementation when liquid-rust is integrated
// =============================================================================
//
// When liquid-rust is ready, this module will provide:
//
// 1. VoiceEngine struct holding:
//    - LfmModel for transcription
//    - PhoenixSpeakerDB for speaker recognition
//    - TtsEngine for text-to-speech
//
// 2. Real implementations of:
//    - transcribe() -> Decode audio, run inference, analyze salience
//    - register_speaker() -> Add voice to Phoenix DB
//    - speak() -> Generate WAV audio from text
//
// 3. Integration with dashboard state:
//    - Voice hints sent via WebSocket
//    - Salience metrics displayed in UI
//    - Speaker identification in activity log
//
// See ../liquid-rust/examples/aye_ears.rs for reference implementation.