llmsdk_provider/transcription_model.rs
1//! Speech-to-text model trait and supporting types.
2//!
3//! Mirrors `@ai-sdk/provider/src/transcription-model/v4/*`. Implementations
4//! turn binary audio into transcribed text plus per-segment timing info.
5// Rust guideline compliant 2026-02-21
6
7use async_trait::async_trait;
8use serde::{Deserialize, Serialize};
9
10use crate::error::Result;
11use crate::shared::{FileBytes, Headers, ProviderMetadata, ProviderOptions, RequestInfo, Warning};
12
13/// Contract every speech-to-text model implements.
14///
15/// Mirrors `TranscriptionModelV4`.
16#[async_trait]
17pub trait TranscriptionModel: Send + Sync + std::fmt::Debug {
18 /// Provider id, e.g. `"openai.transcription"`.
19 fn provider(&self) -> &str;
20
21 /// Model id, e.g. `"whisper-1"` / `"gpt-4o-transcribe"`.
22 fn model_id(&self) -> &str;
23
24 /// Specification version (currently `"v4"`).
25 fn specification_version(&self) -> &'static str {
26 "v4"
27 }
28
29 /// Transcribe audio into text + segments.
30 ///
31 /// # Errors
32 ///
33 /// Returns a [`crate::ProviderError`] when the upstream call fails or
34 /// the response is malformed.
35 async fn do_generate(&self, options: TranscriptionOptions) -> Result<TranscriptionResult>;
36}
37
38/// Options for one [`TranscriptionModel::do_generate`] call.
39///
40/// Mirrors `TranscriptionModelV4CallOptions`.
41#[derive(Debug, Clone, Serialize, Deserialize)]
42pub struct TranscriptionOptions {
43 /// Raw audio bytes (or a base64 string captured as `FileBytes::Base64`).
44 pub audio: FileBytes,
45 /// IANA media type of the audio (e.g. `"audio/wav"`).
46 #[serde(rename = "mediaType")]
47 pub media_type: String,
48 /// Provider-specific options.
49 #[serde(
50 default,
51 rename = "providerOptions",
52 skip_serializing_if = "Option::is_none"
53 )]
54 pub provider_options: Option<ProviderOptions>,
55 /// Extra HTTP headers.
56 #[serde(default, skip_serializing_if = "Option::is_none")]
57 pub headers: Option<Headers>,
58}
59
60/// Result of [`TranscriptionModel::do_generate`].
61///
62/// Mirrors `TranscriptionModelV4Result`.
63#[derive(Debug, Clone)]
64pub struct TranscriptionResult {
65 /// Full transcribed text.
66 pub text: String,
67 /// Time-indexed segments (or words promoted to segments when only word
68 /// timings are available).
69 pub segments: Vec<TranscriptionSegment>,
70 /// Detected ISO 639-1 language code (e.g. `"en"`).
71 pub language: Option<String>,
72 /// Audio duration in seconds.
73 pub duration_in_seconds: Option<f64>,
74 /// Warnings.
75 pub warnings: Vec<Warning>,
76 /// Wire-level request information for telemetry and debugging.
77 ///
78 /// Mirrors `TranscriptionModelV4Result.request` (upstream
79 /// `transcription-model-v4-result.ts:53-58`); `None` for providers that
80 /// do not have a meaningful HTTP request body (e.g. local runtimes).
81 pub request: Option<RequestInfo>,
82 /// Response info (timestamp / headers / model id).
83 pub response: TranscriptionResponseInfo,
84 /// Provider-specific metadata.
85 pub provider_metadata: Option<ProviderMetadata>,
86}
87
88/// One transcription segment.
89#[derive(Debug, Clone, Serialize, Deserialize)]
90pub struct TranscriptionSegment {
91 /// Segment text.
92 pub text: String,
93 /// Start time in seconds.
94 #[serde(rename = "startSecond")]
95 pub start_second: f64,
96 /// End time in seconds.
97 #[serde(rename = "endSecond")]
98 pub end_second: f64,
99}
100
101/// Response metadata for transcription results.
102#[derive(Debug, Clone)]
103pub struct TranscriptionResponseInfo {
104 /// Timestamp the call started (RFC 3339).
105 pub timestamp: String,
106 /// Model id used.
107 pub model_id: String,
108 /// Response headers.
109 pub headers: Option<Headers>,
110 /// Raw response body (for debugging).
111 pub body: Option<serde_json::Value>,
112}