sarvam-rs 0.2.0

Rust SDK for Sarvam AI APIs — chat, translation, speech-to-text, text-to-speech, transliteration, and language identification
Documentation
use std::path::Path;

use crate::config::SarvamConfig;
use crate::error::{Result, SarvamError};
use crate::types::speech_to_text::*;

pub struct SpeechToTextApi {
    config: SarvamConfig,
    client: reqwest::Client,
}

pub struct SpeechToTextRequestBuilder {
    config: SarvamConfig,
    client: reqwest::Client,
    file_path: String,
    model: Option<SpeechToTextModel>,
    mode: Option<SttMode>,
    language_code: Option<SpeechToTextLanguage>,
    input_audio_codec: Option<InputAudioCodec>,
}

impl SpeechToTextRequestBuilder {
    pub fn model(mut self, model: SpeechToTextModel) -> Self {
        self.model = Some(model);
        self
    }

    pub fn mode(mut self, mode: SttMode) -> Self {
        self.mode = Some(mode);
        self
    }

    pub fn language_code(mut self, lang: impl Into<String>) -> Self {
        self.language_code = Some(SpeechToTextLanguage::from_str(&lang.into()));
        self
    }

    pub fn language_enum(mut self, lang: SpeechToTextLanguage) -> Self {
        self.language_code = Some(lang);
        self
    }

    pub fn input_audio_codec(mut self, codec: InputAudioCodec) -> Self {
        self.input_audio_codec = Some(codec);
        self
    }

    pub async fn send(self) -> Result<SpeechToTextResponse> {
        let file_name = Path::new(&self.file_path)
            .file_name()
            .map(|n| n.to_string_lossy().to_string())
            .unwrap_or_else(|| "audio.wav".to_string());

        let file_bytes = tokio::fs::read(&self.file_path).await.map_err(|e| {
            SarvamError::Custom(format!("Failed to read file '{}': {}", self.file_path, e))
        })?;

        let file_part = reqwest::multipart::Part::bytes(file_bytes)
            .file_name(file_name)
            .mime_str("application/octet-stream")
            .map_err(|e| SarvamError::Custom(e.to_string()))?;

        let mut form = reqwest::multipart::Form::new().part("file", file_part);

        if let Some(model) = &self.model {
            let val = serde_json::to_value(model)
                .map_err(|e| SarvamError::Custom(e.to_string()))?
                .as_str()
                .ok_or_else(|| SarvamError::Custom("Invalid model value".into()))?
                .to_string();
            form = form.text("model", val);
        }

        if let Some(mode) = &self.mode {
            let val = serde_json::to_value(mode)
                .map_err(|e| SarvamError::Custom(e.to_string()))?
                .as_str()
                .ok_or_else(|| SarvamError::Custom("Invalid mode value".into()))?
                .to_string();
            form = form.text("mode", val);
        }

        if let Some(lang) = &self.language_code {
            let val = serde_json::to_value(lang)
                .map_err(|e| SarvamError::Custom(e.to_string()))?
                .as_str()
                .ok_or_else(|| SarvamError::Custom("Invalid language value".into()))?
                .to_string();
            form = form.text("language_code", val);
        }

        if let Some(codec) = &self.input_audio_codec {
            let val = serde_json::to_value(codec)
                .map_err(|e| SarvamError::Custom(e.to_string()))?
                .as_str()
                .ok_or_else(|| SarvamError::Custom("Invalid codec value".into()))?
                .to_string();
            form = form.text("input_audio_codec", val);
        }

        let url = format!("{}/speech-to-text", self.config.base_url);

        let response = self
            .client
            .post(&url)
            .header("api-subscription-key", &self.config.api_subscription_key)
            .multipart(form)
            .timeout(self.config.timeout)
            .send()
            .await?;

        let status = response.status();
        if !status.is_success() {
            let body = response.text().await.unwrap_or_default();
            return Err(SarvamError::from_response(status, &body));
        }

        response
            .json::<SpeechToTextResponse>()
            .await
            .map_err(SarvamError::from)
    }
}

impl SpeechToTextApi {
    pub(crate) fn new(config: SarvamConfig, client: reqwest::Client) -> Self {
        Self { config, client }
    }

    pub fn transcribe(&self, file_path: impl Into<String>) -> SpeechToTextRequestBuilder {
        SpeechToTextRequestBuilder {
            config: self.config.clone(),
            client: self.client.clone(),
            file_path: file_path.into(),
            model: None,
            mode: None,
            language_code: None,
            input_audio_codec: None,
        }
    }

    #[cfg(feature = "streaming")]
    pub fn stream(&self) -> crate::streaming::SttStreamBuilder {
        crate::streaming::SttStreamBuilder::new(self.config.clone())
    }
}

impl SpeechToTextLanguage {
    pub fn from_str(s: &str) -> Self {
        match s {
            "unknown" => SpeechToTextLanguage::Unknown,
            "hi-IN" => SpeechToTextLanguage::HiIn,
            "bn-IN" => SpeechToTextLanguage::BnIn,
            "kn-IN" => SpeechToTextLanguage::KnIn,
            "ml-IN" => SpeechToTextLanguage::MlIn,
            "mr-IN" => SpeechToTextLanguage::MrIn,
            "od-IN" => SpeechToTextLanguage::OdIn,
            "pa-IN" => SpeechToTextLanguage::PaIn,
            "ta-IN" => SpeechToTextLanguage::TaIn,
            "te-IN" => SpeechToTextLanguage::TeIn,
            "en-IN" => SpeechToTextLanguage::EnIn,
            "gu-IN" => SpeechToTextLanguage::GuIn,
            "as-IN" => SpeechToTextLanguage::AsIn,
            "ur-IN" => SpeechToTextLanguage::UrIn,
            "ne-IN" => SpeechToTextLanguage::NeIn,
            "kok-IN" => SpeechToTextLanguage::KokIn,
            "ks-IN" => SpeechToTextLanguage::KsIn,
            "sd-IN" => SpeechToTextLanguage::SdIn,
            "sa-IN" => SpeechToTextLanguage::SaIn,
            "sat-IN" => SpeechToTextLanguage::SatIn,
            "mni-IN" => SpeechToTextLanguage::MniIn,
            "brx-IN" => SpeechToTextLanguage::BrxIn,
            "mai-IN" => SpeechToTextLanguage::MaiIn,
            "doi-IN" => SpeechToTextLanguage::DoiIn,
            _ => SpeechToTextLanguage::Unknown,
        }
    }
}