// llmsdk_provider/transcription_model.rs

//! Speech-to-text model trait and supporting types.
//!
//! Mirrors `@ai-sdk/provider/src/transcription-model/v4/*`. Implementations
//! turn binary audio into transcribed text plus per-segment timing info.
// Rust guideline compliant 2026-02-21

use async_trait::async_trait;
use serde::{Deserialize, Serialize};

use crate::error::Result;
use crate::shared::{FileBytes, Headers, ProviderMetadata, ProviderOptions, RequestInfo, Warning};

13/// Contract every speech-to-text model implements.
14///
15/// Mirrors `TranscriptionModelV4`.
16#[async_trait]
17pub trait TranscriptionModel: Send + Sync + std::fmt::Debug {
18    /// Provider id, e.g. `"openai.transcription"`.
19    fn provider(&self) -> &str;
20
21    /// Model id, e.g. `"whisper-1"` / `"gpt-4o-transcribe"`.
22    fn model_id(&self) -> &str;
23
24    /// Specification version (currently `"v4"`).
25    fn specification_version(&self) -> &'static str {
26        "v4"
27    }
28
29    /// Transcribe audio into text + segments.
30    ///
31    /// # Errors
32    ///
33    /// Returns a [`crate::ProviderError`] when the upstream call fails or
34    /// the response is malformed.
35    async fn do_generate(&self, options: TranscriptionOptions) -> Result<TranscriptionResult>;
36}
37
38/// Options for one [`TranscriptionModel::do_generate`] call.
39///
40/// Mirrors `TranscriptionModelV4CallOptions`.
41#[derive(Debug, Clone, Serialize, Deserialize)]
42pub struct TranscriptionOptions {
43    /// Raw audio bytes (or a base64 string captured as `FileBytes::Base64`).
44    pub audio: FileBytes,
45    /// IANA media type of the audio (e.g. `"audio/wav"`).
46    #[serde(rename = "mediaType")]
47    pub media_type: String,
48    /// Provider-specific options.
49    #[serde(
50        default,
51        rename = "providerOptions",
52        skip_serializing_if = "Option::is_none"
53    )]
54    pub provider_options: Option<ProviderOptions>,
55    /// Extra HTTP headers.
56    #[serde(default, skip_serializing_if = "Option::is_none")]
57    pub headers: Option<Headers>,
58}
59
60/// Result of [`TranscriptionModel::do_generate`].
61///
62/// Mirrors `TranscriptionModelV4Result`.
63#[derive(Debug, Clone)]
64pub struct TranscriptionResult {
65    /// Full transcribed text.
66    pub text: String,
67    /// Time-indexed segments (or words promoted to segments when only word
68    /// timings are available).
69    pub segments: Vec<TranscriptionSegment>,
70    /// Detected ISO 639-1 language code (e.g. `"en"`).
71    pub language: Option<String>,
72    /// Audio duration in seconds.
73    pub duration_in_seconds: Option<f64>,
74    /// Warnings.
75    pub warnings: Vec<Warning>,
76    /// Wire-level request information for telemetry and debugging.
77    ///
78    /// Mirrors `TranscriptionModelV4Result.request` (upstream
79    /// `transcription-model-v4-result.ts:53-58`); `None` for providers that
80    /// do not have a meaningful HTTP request body (e.g. local runtimes).
81    pub request: Option<RequestInfo>,
82    /// Response info (timestamp / headers / model id).
83    pub response: TranscriptionResponseInfo,
84    /// Provider-specific metadata.
85    pub provider_metadata: Option<ProviderMetadata>,
86}
87
88/// One transcription segment.
89#[derive(Debug, Clone, Serialize, Deserialize)]
90pub struct TranscriptionSegment {
91    /// Segment text.
92    pub text: String,
93    /// Start time in seconds.
94    #[serde(rename = "startSecond")]
95    pub start_second: f64,
96    /// End time in seconds.
97    #[serde(rename = "endSecond")]
98    pub end_second: f64,
99}
100
101/// Response metadata for transcription results.
102#[derive(Debug, Clone)]
103pub struct TranscriptionResponseInfo {
104    /// Timestamp the call started (RFC 3339).
105    pub timestamp: String,
106    /// Model id used.
107    pub model_id: String,
108    /// Response headers.
109    pub headers: Option<Headers>,
110    /// Raw response body (for debugging).
111    pub body: Option<serde_json::Value>,
112}