whisrs 0.1.6

Linux-first voice-to-text dictation tool with Groq, OpenAI, and local Whisper backends
//! OpenAI Realtime API transcription backend (true streaming via WebSocket).
//!
//! Connects to `wss://api.openai.com/v1/realtime?intent=transcription` and
//! streams base64-encoded 24kHz PCM16 audio (16kHz input is resampled up).
//! Receives incremental transcription deltas as the server recognizes speech.
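//!
//! A typical exchange, sketched with abbreviated JSON (the shapes follow the
//! message types defined in this module):
//!
//! ```text
//! -> {"type":"transcription_session.update","session":{"input_audio_format":"pcm16",...}}
//! -> {"type":"input_audio_buffer.append","audio":"<base64 PCM16>"}
//! <- {"type":"input_audio_buffer.speech_started"}
//! <- {"type":"conversation.item.input_audio_transcription.delta","delta":"Hello"}
//! <- {"type":"conversation.item.input_audio_transcription.completed","transcript":"Hello world"}
//! ```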

use async_trait::async_trait;
use base64::Engine;
use futures_util::{SinkExt, StreamExt};
use serde::{Deserialize, Serialize};
use tokio::sync::mpsc;
use tokio_tungstenite::tungstenite;
use tracing::{debug, error, info};

use crate::audio::AudioChunk;

use super::{TranscriptionBackend, TranscriptionConfig};

/// OpenAI Realtime API transcription backend.
pub struct OpenAIRealtimeBackend {
    api_key: String,
}

impl OpenAIRealtimeBackend {
    /// Create a new OpenAI Realtime backend.
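    ///
    /// An empty key is allowed; `resolve_api_key` will then fall back to the
    /// `WHISRS_OPENAI_API_KEY` environment variable when the backend is used.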
    pub fn new(api_key: String) -> Self {
        Self { api_key }
    }

    /// Resolve the API key from the struct field or environment variable.
    fn resolve_api_key(&self) -> anyhow::Result<String> {
        if !self.api_key.is_empty() {
            return Ok(self.api_key.clone());
        }
        std::env::var("WHISRS_OPENAI_API_KEY")
            .map_err(|_| anyhow::anyhow!("no OpenAI API key configured — set WHISRS_OPENAI_API_KEY or add [openai] to config.toml"))
    }
}

// ---------------------------------------------------------------------------
// WebSocket message types (hand-defined; only the fields this backend uses)
// ---------------------------------------------------------------------------

/// Client message: session.update
#[derive(Debug, Serialize)]
struct SessionUpdate {
    #[serde(rename = "type")]
    msg_type: String,
    session: SessionConfig,
}

#[derive(Debug, Serialize)]
struct SessionConfig {
    input_audio_format: String,
    input_audio_transcription: AudioTranscriptionConfig,
    turn_detection: TurnDetectionConfig,
}

#[derive(Debug, Serialize)]
struct AudioTranscriptionConfig {
    model: String,
    #[serde(skip_serializing_if = "String::is_empty")]
    language: String,
}

#[derive(Debug, Serialize)]
struct TurnDetectionConfig {
    #[serde(rename = "type")]
    detection_type: String,
}

/// Client message: input_audio_buffer.append
#[derive(Debug, Serialize)]
struct AudioBufferAppend {
    #[serde(rename = "type")]
    msg_type: String,
    audio: String,
}

/// Server message envelope. A flat struct capturing only the fields this
/// backend reads; the `type` string determines which of them are populated.
#[derive(Debug, Deserialize)]
struct ServerMessage {
    #[serde(rename = "type")]
    msg_type: String,
    /// Transcription delta text.
    #[serde(default)]
    delta: Option<String>,
    /// Completed transcript text.
    #[serde(default)]
    transcript: Option<String>,
    /// Error details.
    #[serde(default)]
    error: Option<ServerError>,
}

#[derive(Debug, Deserialize)]
struct ServerError {
    #[serde(default)]
    message: String,
}

impl SessionUpdate {
    fn new(model: &str, language: &str) -> Self {
        // Map "auto" to empty string (let the API auto-detect).
        let lang = if language == "auto" {
            String::new()
        } else {
            language.to_string()
        };
        Self {
            msg_type: "transcription_session.update".to_string(),
            session: SessionConfig {
                input_audio_format: "pcm16".to_string(),
                input_audio_transcription: AudioTranscriptionConfig {
                    model: model.to_string(),
                    language: lang,
                },
                turn_detection: TurnDetectionConfig {
                    detection_type: "server_vad".to_string(),
                },
            },
        }
    }
}

impl AudioBufferAppend {
    fn new(base64_audio: String) -> Self {
        Self {
            msg_type: "input_audio_buffer.append".to_string(),
            audio: base64_audio,
        }
    }
}

/// Resample 16kHz i16 samples to 24kHz i16 samples using linear interpolation.
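///
/// For example, `[0, 6]` expands to `[0, 4, 6]`: output index 1 maps to
/// source position 2/3 and interpolates two thirds of the way from 0 to 6.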
fn resample_16k_to_24k(samples: &[i16]) -> Vec<i16> {
    if samples.is_empty() {
        return Vec::new();
    }

    let ratio = 24_000.0 / 16_000.0; // 1.5
    let output_len = (samples.len() as f64 * ratio).ceil() as usize;
    let mut output = Vec::with_capacity(output_len);

    for i in 0..output_len {
        let src_pos = i as f64 / ratio;
        let src_idx = src_pos as usize;
        let frac = src_pos - src_idx as f64;

        let sample = if src_idx + 1 < samples.len() {
            let a = samples[src_idx] as f64;
            let b = samples[src_idx + 1] as f64;
            (a + frac * (b - a)) as i16
        } else if src_idx < samples.len() {
            samples[src_idx]
        } else {
            0
        };

        output.push(sample);
    }

    output
}

/// Encode i16 PCM samples to base64.
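///
/// E.g. `[1]` yields little-endian bytes `[0x01, 0x00]`, which encode as `"AQA="`.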
fn encode_pcm_base64(samples: &[i16]) -> String {
    let bytes: Vec<u8> = samples.iter().flat_map(|s| s.to_le_bytes()).collect();
    base64::engine::general_purpose::STANDARD.encode(&bytes)
}

#[async_trait]
impl TranscriptionBackend for OpenAIRealtimeBackend {
    async fn transcribe(
        &self,
        audio: &[u8],
        config: &TranscriptionConfig,
    ) -> anyhow::Result<String> {
        // For non-streaming use, we set up the full WebSocket pipeline with a
        // single audio chunk, then collect the result.
        let (audio_tx, audio_rx) = mpsc::channel::<AudioChunk>(16);
        let (text_tx, mut text_rx) = mpsc::channel::<String>(16);

        // Decode the WAV payload into raw i16 samples (assumed 16-bit PCM at
        // 16kHz, matching the resampler in transcribe_stream).
        let cursor = std::io::Cursor::new(audio);
        let reader = hound::WavReader::new(cursor)?;
        let samples: Vec<i16> = reader.into_samples::<i16>().collect::<Result<_, _>>()?;

        // Send all audio as one chunk, then drop the sender to signal end-of-input.
        audio_tx.send(samples).await.ok();
        drop(audio_tx);

        let config_clone = config.clone();
        let stream_result = self.transcribe_stream(audio_rx, text_tx, &config_clone);

        // Collect all text pieces. Deltas usually carry their own spacing, so
        // the extra space-join here favors separating VAD-split utterances
        // over byte-exact concatenation.
        let collector = async {
            let mut full_text = String::new();
            while let Some(text) = text_rx.recv().await {
                if !full_text.is_empty() {
                    full_text.push(' ');
                }
                full_text.push_str(&text);
            }
            full_text
        };

        let (stream_res, text) = tokio::join!(stream_result, collector);
        stream_res?;

        Ok(text)
    }

    async fn transcribe_stream(
        &self,
        mut audio_rx: mpsc::Receiver<AudioChunk>,
        text_tx: mpsc::Sender<String>,
        config: &TranscriptionConfig,
    ) -> anyhow::Result<()> {
        let api_key = self.resolve_api_key()?;
        let model = &config.model;
        let url = "wss://api.openai.com/v1/realtime?intent=transcription".to_string();

        info!("connecting to OpenAI Realtime API: {url}");

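        // Build the upgrade request by hand so the Authorization and
        // OpenAI-Beta headers can be attached; the standard WebSocket
        // handshake headers then have to be supplied explicitly as well.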
        let request = tungstenite::http::Request::builder()
            .uri(&url)
            .header("Authorization", format!("Bearer {api_key}"))
            .header("OpenAI-Beta", "realtime=v1")
            .header(
                "Sec-WebSocket-Key",
                tungstenite::handshake::client::generate_key(),
            )
            .header("Sec-WebSocket-Version", "13")
            .header("Host", "api.openai.com")
            .header("Connection", "Upgrade")
            .header("Upgrade", "websocket")
            .body(())?;

        let (ws_stream, _response) = tokio_tungstenite::connect_async(request).await?;
        let (mut ws_sink, mut ws_source) = ws_stream.split();

        info!("connected to OpenAI Realtime API");

        // Send transcription session configuration.
        let session_update = SessionUpdate::new(model, &config.language);
        let session_json = serde_json::to_string(&session_update)?;
        ws_sink
            .send(tungstenite::Message::Text(session_json.into()))
            .await?;
        debug!("sent transcription_session.update for model={model}");

        // Spawn a task to send audio.
        let send_task = tokio::spawn(async move {
            while let Some(chunk) = audio_rx.recv().await {
                // Resample 16kHz to 24kHz for the Realtime API.
                let resampled = resample_16k_to_24k(&chunk);
                let b64 = encode_pcm_base64(&resampled);
                let msg = AudioBufferAppend::new(b64);
                let json = match serde_json::to_string(&msg) {
                    Ok(j) => j,
                    Err(e) => {
                        error!("failed to serialize audio buffer append: {e}");
                        continue;
                    }
                };
                if ws_sink
                    .send(tungstenite::Message::Text(json.into()))
                    .await
                    .is_err()
                {
                    error!("WebSocket send failed — connection may be closed");
                    break;
                }
            }

            // All real audio sent. Send a short silence burst so the
            // server VAD detects end-of-speech and triggers transcription
            // for any remaining buffered audio.
            debug!("sending silence for VAD end-of-speech detection");
            let silence_samples = vec![0i16; 12_000]; // 0.5s at 24kHz
            let silence_b64 = encode_pcm_base64(&silence_samples);
            let msg = AudioBufferAppend::new(silence_b64);
            if let Ok(json) = serde_json::to_string(&msg) {
                ws_sink
                    .send(tungstenite::Message::Text(json.into()))
                    .await
                    .ok();
            }

            // Wait for the server to process remaining audio and send
            // transcription events, then close the WebSocket. This ends
            // the receive loop via the Close frame.
            tokio::time::sleep(std::time::Duration::from_secs(3)).await;
            debug!("closing WebSocket after post-silence delay");
            ws_sink.send(tungstenite::Message::Close(None)).await.ok();
        });

        // Receive transcription events (with a timeout to avoid hanging forever).
        let timeout_duration = std::time::Duration::from_secs(15);
        while let Ok(Some(msg_result)) =
            tokio::time::timeout(timeout_duration, ws_source.next()).await
        {
            match msg_result {
                Ok(tungstenite::Message::Text(text)) => {
                    match serde_json::from_str::<ServerMessage>(&text) {
                        Ok(server_msg) => match server_msg.msg_type.as_str() {
                            "conversation.item.input_audio_transcription.delta" => {
                                if let Some(delta) = server_msg.delta {
                                    if !delta.is_empty() {
                                        debug!("realtime delta: {delta}");
                                        text_tx.send(delta).await.ok();
                                    }
                                }
                            }
                            "conversation.item.input_audio_transcription.completed" => {
                                if let Some(transcript) = server_msg.transcript {
                                    debug!("realtime completed: {transcript}");
                                }
                            }
                            "error" | "conversation.item.input_audio_transcription.failed" => {
                                let err_msg = server_msg
                                    .error
                                    .map(|e| e.message)
                                    .unwrap_or_else(|| "unknown error".to_string());
                                error!("OpenAI Realtime error: {err_msg}");
                                // Log the raw message for debugging.
                                debug!("raw error message: {text}");
                            }
                            "session.created"
                            | "session.updated"
                            | "transcription_session.created"
                            | "transcription_session.updated" => {
                                debug!("session event: {}", server_msg.msg_type);
                            }
                            "input_audio_buffer.committed"
                            | "input_audio_buffer.speech_started"
                            | "input_audio_buffer.speech_stopped" => {
                                debug!("audio buffer event: {}", server_msg.msg_type);
                            }
                            other => {
                                debug!("unhandled server message type: {other}");
                            }
                        },
                        Err(e) => {
                            debug!("failed to parse server message: {e}");
                        }
                    }
                }
                Ok(tungstenite::Message::Close(_)) => {
                    info!("WebSocket closed by server");
                    break;
                }
                Err(e) => {
                    error!("WebSocket receive error: {e}");
                    break;
                }
                _ => {}
            }
        }

        send_task.await.ok();
        info!("OpenAI Realtime stream finished");

        Ok(())
    }

    fn supports_streaming(&self) -> bool {
        true
    }
}

#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn session_update_serialization() {
        let msg = SessionUpdate::new("gpt-4o-mini-transcribe", "en");
        let json = serde_json::to_value(&msg).unwrap();

        assert_eq!(json["type"], "transcription_session.update");
        assert_eq!(json["session"]["input_audio_format"], "pcm16");
        assert_eq!(
            json["session"]["input_audio_transcription"]["model"],
            "gpt-4o-mini-transcribe"
        );
        assert_eq!(
            json["session"]["input_audio_transcription"]["language"],
            "en"
        );
        assert_eq!(json["session"]["turn_detection"]["type"], "server_vad");
    }

    #[test]
    fn session_update_auto_language_omitted() {
        let msg = SessionUpdate::new("gpt-4o-transcribe", "auto");
        let json = serde_json::to_value(&msg).unwrap();

        // "auto" should be converted to empty string and skipped
        assert!(json["session"]["input_audio_transcription"]
            .get("language")
            .is_none());
    }

    #[test]
    fn audio_buffer_append_serialization() {
        let msg = AudioBufferAppend::new("AQID".to_string());
        let json = serde_json::to_value(&msg).unwrap();

        assert_eq!(json["type"], "input_audio_buffer.append");
        assert_eq!(json["audio"], "AQID");
    }

    #[test]
    fn parse_delta_message() {
        let json =
            r#"{"type": "conversation.item.input_audio_transcription.delta", "delta": "Hello "}"#;
        let msg: ServerMessage = serde_json::from_str(json).unwrap();
        assert_eq!(
            msg.msg_type,
            "conversation.item.input_audio_transcription.delta"
        );
        assert_eq!(msg.delta.as_deref(), Some("Hello "));
    }

    #[test]
    fn parse_completed_message() {
        let json = r#"{"type": "conversation.item.input_audio_transcription.completed", "transcript": "Hello world"}"#;
        let msg: ServerMessage = serde_json::from_str(json).unwrap();
        assert_eq!(
            msg.msg_type,
            "conversation.item.input_audio_transcription.completed"
        );
        assert_eq!(msg.transcript.as_deref(), Some("Hello world"));
    }

    #[test]
    fn parse_error_message() {
        let json = r#"{"type": "error", "error": {"message": "Invalid API key"}}"#;
        let msg: ServerMessage = serde_json::from_str(json).unwrap();
        assert_eq!(msg.msg_type, "error");
        assert_eq!(msg.error.unwrap().message, "Invalid API key");
    }

    #[test]
    fn parse_session_created() {
        let json = r#"{"type": "session.created"}"#;
        let msg: ServerMessage = serde_json::from_str(json).unwrap();
        assert_eq!(msg.msg_type, "session.created");
    }

    #[test]
    fn resample_empty() {
        let result = resample_16k_to_24k(&[]);
        assert!(result.is_empty());
    }

    #[test]
    fn resample_ratio() {
        // 16000 samples at 16kHz = 1 second.
        // At 24kHz, 1 second = 24000 samples.
        let input: Vec<i16> = vec![100; 16000];
        let output = resample_16k_to_24k(&input);
        // Allow some rounding tolerance.
        assert!(
            (output.len() as i64 - 24000).abs() <= 2,
            "expected ~24000, got {}",
            output.len()
        );
    }
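
    #[test]
    fn resample_interpolates_between_samples() {
        // ceil(2 * 1.5) = 3 output samples; the middle one is the linear
        // interpolation two thirds of the way from 0 toward 6.
        let output = resample_16k_to_24k(&[0, 6]);
        assert_eq!(output, vec![0, 4, 6]);
    }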

    #[test]
    fn encode_pcm_base64_roundtrip() {
        let samples: Vec<i16> = vec![1, 2, 3, -1];
        let encoded = encode_pcm_base64(&samples);

        let decoded_bytes = base64::engine::general_purpose::STANDARD
            .decode(&encoded)
            .unwrap();
        let decoded_samples: Vec<i16> = decoded_bytes
            .chunks_exact(2)
            .map(|c| i16::from_le_bytes([c[0], c[1]]))
            .collect();
        assert_eq!(decoded_samples, samples);
    }
}