// src/web_dashboard/voice.rs
//! Voice Integration - Transcription and TTS via liquid-rust
//!
//! Provides voice capabilities for the dashboard:
//! - Speech-to-text transcription with salience analysis
//! - Speaker recognition via Phoenix Protocol
//! - Text-to-speech with multiple voice personas
//!
//! Requires the `voice` feature flag and liquid-rust models.
//! Currently returns "not implemented" stubs until liquid-rust is integrated.

use axum::{
    extract::Multipart,
    http::StatusCode,
    Json,
};
use serde::{Deserialize, Serialize};

18/// Transcription result with salience and speaker info
19#[derive(Debug, Clone, Serialize, Deserialize)]
20pub struct TranscriptionResult {
21 /// Transcribed text
22 pub text: String,
23 /// Salience score (0.0 to 1.0) - how important/urgent
24 pub salience: f32,
25 /// Identified speaker (if registered in Phoenix DB)
26 pub speaker: Option<String>,
27 /// Speaker identification confidence
28 pub speaker_confidence: Option<f32>,
29 /// Emotional profile
30 pub emotion: Option<EmotionProfile>,
31}
32
33/// Emotional profile from voice analysis
34#[derive(Debug, Clone, Serialize, Deserialize)]
35pub struct EmotionProfile {
36 /// Valence: positive (1.0) to negative (-1.0)
37 pub valence: f32,
38 /// Arousal: excited (1.0) to calm (0.0)
39 pub arousal: f32,
40 /// Voice stability (0.0 to 1.0)
41 pub stability: f32,
42}
43
44/// TTS request
45#[derive(Debug, Deserialize)]
46pub struct SpeakRequest {
47 /// Text to speak
48 pub text: String,
49 /// Voice persona to use
50 #[serde(default = "default_voice")]
51 pub voice: String,
52}
53
/// Default TTS voice persona, used by serde when a request omits `voice`.
fn default_voice() -> String {
    "aye".to_string()
}

58/// Speaker registration request
59#[derive(Debug, Deserialize)]
60pub struct RegisterSpeakerRequest {
61 /// Label for the speaker (e.g., "Hue")
62 pub label: String,
63}
64
// =============================================================================
// Stub API Handlers (until liquid-rust is integrated)
// =============================================================================

69/// Transcribe uploaded audio
70///
71/// POST /api/voice/transcribe
72/// Content-Type: multipart/form-data
73///
74/// Returns: TranscriptionResult with text, salience, and optional speaker ID
75pub async fn transcribe(
76 mut _multipart: Multipart,
77) -> Result<Json<TranscriptionResult>, (StatusCode, String)> {
78 // TODO: Enable when liquid-rust is integrated
79 // For now, return a stub response
80 Err((
81 StatusCode::NOT_IMPLEMENTED,
82 "Voice transcription requires liquid-rust integration. \
83 See docs/plans/2025-11-11-realtime-collaborative-dashboard-design.md"
84 .to_string(),
85 ))
86}
87
88/// Register a speaker for Phoenix Protocol recognition
89///
90/// POST /api/voice/register
91/// Content-Type: multipart/form-data
92/// Fields: label (text), audio (file)
93pub async fn register_speaker(
94 mut _multipart: Multipart,
95) -> Result<Json<serde_json::Value>, (StatusCode, String)> {
96 Err((
97 StatusCode::NOT_IMPLEMENTED,
98 "Speaker registration requires liquid-rust integration.".to_string(),
99 ))
100}
101
102/// Generate speech from text using TTS
103///
104/// POST /api/voice/speak
105/// Content-Type: application/json
106/// Body: { "text": "Hello", "voice": "aye" }
107///
108/// Available voices: aye, omnimom, claude, alert, sky, adam, bella, nicole, michael
109pub async fn speak(
110 Json(_req): Json<SpeakRequest>,
111) -> Result<impl axum::response::IntoResponse, (StatusCode, String)> {
112 Err::<([(axum::http::header::HeaderName, &str); 1], Vec<u8>), _>((
113 StatusCode::NOT_IMPLEMENTED,
114 "TTS requires liquid-rust integration.".to_string(),
115 ))
116}
117
// =============================================================================
// Future: Full implementation when liquid-rust is integrated
// =============================================================================
//
// When liquid-rust is ready, this module will provide:
//
// 1. VoiceEngine struct holding:
//    - LfmModel for transcription
//    - PhoenixSpeakerDB for speaker recognition
//    - TtsEngine for text-to-speech
//
// 2. Real implementations of:
//    - transcribe() -> Decode audio, run inference, analyze salience
//    - register_speaker() -> Add voice to Phoenix DB
//    - speak() -> Generate WAV audio from text
//
// 3. Integration with dashboard state:
//    - Voice hints sent via WebSocket
//    - Salience metrics displayed in UI
//    - Speaker identification in activity log
//
// See ../liquid-rust/examples/aye_ears.rs for reference implementation.