use std::path::{Path, PathBuf};
use crate::{TranscriptionEngine, TranscriptionResult, TranscriptionSegment};
use super::model::{SenseVoiceError, SenseVoiceModel};
/// Language selection for a transcription request.
///
/// Each variant corresponds to a short code string (see `as_str`), which is
/// what `transcribe_samples` hands to the model. `Auto` is the default.
#[derive(Debug, Clone, Default, PartialEq, Eq)]
pub enum Language {
    /// Code `"auto"`.
    #[default]
    Auto,
    /// Code `"zh"`.
    Chinese,
    /// Code `"en"`.
    English,
    /// Code `"ja"`.
    Japanese,
    /// Code `"ko"`.
    Korean,
    /// Code `"yue"`.
    Cantonese,
}

impl Language {
    /// Returns the short code string for this language.
    ///
    /// Every code is a string literal, so the returned slice is `'static`
    /// and does not keep the `Language` value borrowed.
    fn as_str(&self) -> &'static str {
        match self {
            Language::Auto => "auto",
            Language::Chinese => "zh",
            Language::English => "en",
            Language::Japanese => "ja",
            Language::Korean => "ko",
            Language::Cantonese => "yue",
        }
    }
}
/// Selects which variant of the model is loaded.
///
/// `load_model_with_params` reduces this to a single `quantized` flag that is
/// passed to `SenseVoiceModel::new`: `true` for `Int8`, `false` otherwise.
#[derive(Debug, Clone, Default, PartialEq)]
pub enum QuantizationType {
/// Default variant; loads with `quantized = false`.
/// NOTE(review): presumably 32-bit floating-point weights — confirm against the model loader.
#[default]
FP32,
/// Loads with `quantized = true`.
/// NOTE(review): presumably 8-bit integer quantized weights — confirm against the model loader.
Int8,
}
/// Options consumed when a SenseVoice model is loaded.
///
/// The derived `Default` uses `QuantizationType`'s default variant.
#[derive(Debug, Clone, Default)]
pub struct SenseVoiceModelParams {
    /// Which model variant to load.
    pub quantization: QuantizationType,
}

impl SenseVoiceModelParams {
    /// Builds parameters with `quantization` set to `QuantizationType::FP32`.
    pub fn fp32() -> Self {
        let quantization = QuantizationType::FP32;
        Self { quantization }
    }

    /// Builds parameters with `quantization` set to `QuantizationType::Int8`.
    pub fn int8() -> Self {
        let quantization = QuantizationType::Int8;
        Self { quantization }
    }
}
/// Per-call options for `transcribe_samples`.
#[derive(Debug, Clone)]
pub struct SenseVoiceInferenceParams {
    /// Language whose code string is passed to the model.
    pub language: Language,
    /// Forwarded unchanged as the `use_itn` argument of the model's
    /// `transcribe` call.
    /// NOTE(review): presumably toggles inverse text normalization — confirm.
    pub use_itn: bool,
}

impl Default for SenseVoiceInferenceParams {
    /// Defaults to `Language::Auto` with `use_itn` enabled.
    fn default() -> Self {
        let language = Language::Auto;
        let use_itn = true;
        SenseVoiceInferenceParams { language, use_itn }
    }
}
/// Transcription engine that owns at most one loaded `SenseVoiceModel`.
///
/// A freshly created engine (via `new` or `Default`) has no model loaded.
#[derive(Default)]
pub struct SenseVoiceEngine {
    /// Path the current model was loaded from; `None` while no model is loaded.
    loaded_model_path: Option<PathBuf>,
    /// The loaded model, if any.
    model: Option<SenseVoiceModel>,
}

impl SenseVoiceEngine {
    /// Creates an engine with no model loaded.
    pub fn new() -> Self {
        // Both fields are `Option`s, so the derived default is the empty engine.
        Self::default()
    }
}
/// Unloads any loaded model when the engine goes out of scope.
impl Drop for SenseVoiceEngine {
fn drop(&mut self) {
// Go through the regular unload path so the "Unloading" debug log is
// emitted and both the model and its recorded path are cleared.
self.unload_model();
}
}
/// `TranscriptionEngine` implementation backed by a `SenseVoiceModel`.
impl TranscriptionEngine for SenseVoiceEngine {
    type InferenceParams = SenseVoiceInferenceParams;
    type ModelParams = SenseVoiceModelParams;

    /// Loads the model at `model_path`, replacing any model already loaded.
    ///
    /// The previous model is unloaded *before* the new one is constructed, so
    /// if loading fails the engine is left with no model.
    ///
    /// # Errors
    ///
    /// Propagates any error returned by `SenseVoiceModel::new`.
    fn load_model_with_params(
        &mut self,
        model_path: &Path,
        params: Self::ModelParams,
    ) -> Result<(), Box<dyn std::error::Error>> {
        self.unload_model();

        // The model loader only distinguishes quantized from non-quantized.
        let quantized = matches!(params.quantization, QuantizationType::Int8);
        let model = SenseVoiceModel::new(model_path, quantized)?;

        self.model = Some(model);
        self.loaded_model_path = Some(model_path.to_path_buf());
        log::info!("Loaded SenseVoice model from {:?}", model_path);
        Ok(())
    }

    /// Drops the loaded model and forgets its path; a no-op when nothing is
    /// loaded.
    fn unload_model(&mut self) {
        if self.model.is_none() {
            return;
        }
        log::debug!("Unloading SenseVoice model");
        self.model = None;
        self.loaded_model_path = None;
    }

    /// Transcribes `samples` with the loaded model.
    ///
    /// `params` falls back to `SenseVoiceInferenceParams::default()` when
    /// `None`. When the model reports timestamps, one segment is produced per
    /// token; otherwise `segments` is `None`.
    ///
    /// # Errors
    ///
    /// Returns `SenseVoiceError::ModelNotLoaded` when no model is loaded, and
    /// propagates any error from the model's `transcribe` call.
    fn transcribe_samples(
        &mut self,
        samples: Vec<f32>,
        params: Option<Self::InferenceParams>,
    ) -> Result<TranscriptionResult, Box<dyn std::error::Error>> {
        // Samples per second assumed when printing the clip duration in the
        // debug log below; it is not used for anything else.
        const LOG_SAMPLE_RATE_HZ: f32 = 16000.0;

        // The error is a unit-like value, so there is nothing to defer:
        // `ok_or` is enough and avoids a needless closure.
        let model = self
            .model
            .as_mut()
            .ok_or(SenseVoiceError::ModelNotLoaded)?;
        let params = params.unwrap_or_default();

        log::debug!(
            "Transcribing {} samples ({:.2}s), language={:?}, use_itn={}",
            samples.len(),
            samples.len() as f32 / LOG_SAMPLE_RATE_HZ,
            params.language,
            params.use_itn,
        );

        let result = model.transcribe(&samples, params.language.as_str(), params.use_itn)?;

        let segments = if result.timestamps.is_empty() {
            None
        } else {
            let per_token = result
                .tokens
                .iter()
                .enumerate()
                .map(|(i, token)| {
                    // A token without its own timestamp starts at 0.0.
                    let start = result.timestamps.get(i).copied().unwrap_or(0.0);
                    // A token ends where the next one starts; when there is no
                    // next timestamp a fixed 0.06 is added to the start.
                    // NOTE(review): 0.06 is presumably seconds — confirm.
                    let end = result
                        .timestamps
                        .get(i + 1)
                        .copied()
                        .unwrap_or(start + 0.06);
                    TranscriptionSegment {
                        start,
                        end,
                        text: token.clone(),
                    }
                })
                .collect::<Vec<_>>();
            Some(per_token)
        };

        Ok(TranscriptionResult {
            text: result.text,
            segments,
        })
    }
}