use std::path::Path;
use async_trait::async_trait;
use serde::{Deserialize, Serialize};
use super::Result;
/// Timing and confidence information for a single recognized word.
///
/// Derives serde `Serialize`/`Deserialize` and `PartialEq`, so values can be
/// written out, read back, and compared with `==`.
#[derive(Debug, Clone, Serialize, Deserialize, PartialEq)]
pub struct WordTiming {
/// The word text.
pub word: String,
/// Where the word begins.
/// NOTE(review): presumably seconds from the start of the audio — confirm.
pub start: f64,
/// Where the word ends (same unit as `start`).
pub end: f64,
/// Confidence score for this word.
/// NOTE(review): presumably in the range 0.0..=1.0 — confirm against the backends.
pub confidence: f32,
}
/// One contiguous piece of transcribed text with its time span.
///
/// The optional fields (`language`, `speaker`, `words`) are left out of the
/// serialized output entirely when they are `None`.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct TranscriptSegment {
/// The transcribed text of this segment.
pub text: String,
/// Where the segment begins.
/// NOTE(review): presumably seconds from the start of the audio — confirm.
pub start: f64,
/// Where the segment ends (same unit as `start`).
pub end: f64,
/// Confidence score for the segment.
/// NOTE(review): presumably in the range 0.0..=1.0 — confirm against the backends.
pub confidence: f32,
/// Language of this segment, if known; omitted from serialized output when `None`.
/// NOTE(review): the tests use a short code such as "fi" — confirm the expected format.
#[serde(skip_serializing_if = "Option::is_none")]
pub language: Option<String>,
/// Speaker label for this segment, if any; omitted from serialized output when `None`.
#[serde(skip_serializing_if = "Option::is_none")]
pub speaker: Option<String>,
/// Per-word timings, if any; omitted from serialized output when `None`.
#[serde(skip_serializing_if = "Option::is_none")]
pub words: Option<Vec<WordTiming>>,
}
/// A time span attributed to a single speaker.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct SpeakerSegment {
/// Label identifying the speaker (the tests use values like "SPEAKER_00").
pub speaker: String,
/// Where the speaker's span begins.
/// NOTE(review): presumably seconds from the start of the audio — confirm.
pub start: f64,
/// Where the speaker's span ends (same unit as `start`).
pub end: f64,
/// Optional embedding vector for this span; omitted from serialized output when `None`.
/// NOTE(review): presumably a speaker/voice embedding gated by
/// `TranscribeOptions::include_embeddings` — confirm against the backends.
#[serde(skip_serializing_if = "Option::is_none")]
pub embedding: Option<Vec<f32>>,
}
/// Complete output of a transcription run: the segments plus run metadata.
///
/// The optional fields (`speakers`, `footnotes`, `active_reading`) are left
/// out of the serialized output entirely when they are `None`.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct TranscriptionResult {
/// The transcribed segments.
pub segments: Vec<TranscriptSegment>,
/// Language of the transcription (the tests use a short code such as "en").
pub language: String,
/// Duration of the audio, in seconds.
pub duration_seconds: f64,
/// Name of the model that produced the result.
pub model: String,
/// Name of the backend that produced the result.
pub backend: String,
/// NOTE(review): presumably the real-time factor, i.e. audio duration divided
/// by processing time (the test fixture's 30.0 / 0.21 ≈ 143 is consistent
/// with that) — confirm.
pub rtfx: f64,
/// Time spent processing, in seconds.
pub processing_time_seconds: f64,
/// Speaker spans, if any; omitted from serialized output when `None`.
#[serde(skip_serializing_if = "Option::is_none")]
pub speakers: Option<Vec<SpeakerSegment>>,
/// Footnote strings, if any; omitted from serialized output when `None`.
#[serde(skip_serializing_if = "Option::is_none")]
pub footnotes: Option<Vec<String>>,
/// Active-reading metadata, if any; omitted from serialized output when `None`.
/// The type is defined in `crate::analyze::active_reading`.
#[serde(skip_serializing_if = "Option::is_none")]
pub active_reading: Option<crate::analyze::active_reading::ActiveReadingMetadata>,
}
/// Options passed to [`AsrBackend::transcribe`].
///
/// The derived `Default` yields `None` for every `Option` field and `false`
/// for every flag.
#[derive(Debug, Clone, Default)]
pub struct TranscribeOptions {
/// Optional language setting.
/// NOTE(review): presumably a language hint, with `None` meaning auto-detect — confirm.
pub language: Option<String>,
/// NOTE(review): presumably requests per-word timings (`TranscriptSegment::words`) — confirm.
pub word_timestamps: bool,
/// NOTE(review): presumably requests speaker diarization (`TranscriptionResult::speakers`) — confirm.
pub diarize: bool,
/// Optional duration limit, in seconds.
/// NOTE(review): whether longer audio is truncated or rejected is not visible here — confirm.
pub max_duration_seconds: Option<u32>,
/// NOTE(review): presumably requests `SpeakerSegment::embedding` to be filled in — confirm.
pub include_embeddings: bool,
}
/// Interface implemented by each speech-recognition backend.
///
/// Implementors must be `Send + Sync`. The `#[async_trait]` attribute is what
/// allows the `async fn` below to be declared in the trait.
#[async_trait]
pub trait AsrBackend: Send + Sync {
/// Returns the backend's name as a static string.
fn name(&self) -> &'static str;
/// Returns the languages this backend supports, as static strings.
/// NOTE(review): the expected format (e.g. short language codes) is not visible here — confirm.
fn supported_languages(&self) -> &'static [&'static str];
/// Reports whether the backend is available.
/// NOTE(review): what "available" means (binary installed, model present, ...) is
/// up to each implementation — confirm.
fn is_available(&self) -> bool;
/// Transcribes the audio file at `audio_path` using the given options.
///
/// Takes `opts` by value and returns the project's `Result` wrapping a
/// [`TranscriptionResult`].
async fn transcribe(
&self,
audio_path: &Path,
opts: TranscribeOptions,
) -> Result<TranscriptionResult>;
}
#[cfg(test)]
mod tests {
    // Serialization and default-value tests for the ASR data types.
    //
    // The "omits"/"includes" tests check the JSON text for field names, so the
    // fixture values are chosen not to contain those names themselves.
    use super::*;

    /// A `WordTiming` must survive a JSON round trip unchanged.
    #[test]
    fn word_timing_json_roundtrip() {
        let wt = WordTiming {
            word: "hello".to_string(),
            start: 1.23,
            end: 1.89,
            confidence: 0.97,
        };
        let json = serde_json::to_string(&wt).expect("serialize");
        let decoded: WordTiming = serde_json::from_str(&json).expect("deserialize");
        assert_eq!(decoded, wt);
    }

    /// Optional segment fields set to `None` must not appear in the JSON.
    #[test]
    fn transcript_segment_omits_none_fields() {
        let seg = TranscriptSegment {
            text: "Hei maailma".to_string(),
            start: 0.0,
            end: 1.5,
            confidence: 0.94,
            language: None,
            speaker: None,
            words: None,
        };
        let json = serde_json::to_string(&seg).expect("serialize");
        assert!(!json.contains("language"));
        assert!(!json.contains("speaker"));
        assert!(!json.contains("words"));
    }

    /// Optional segment fields set to `Some` must appear, with their values.
    #[test]
    fn transcript_segment_includes_some_fields() {
        let seg = TranscriptSegment {
            text: "Test".to_string(),
            start: 0.0,
            end: 1.0,
            confidence: 0.9,
            language: Some("fi".to_string()),
            speaker: Some("SPEAKER_00".to_string()),
            words: Some(vec![WordTiming {
                word: "Test".to_string(),
                start: 0.0,
                end: 1.0,
                confidence: 0.9,
            }]),
        };
        let json = serde_json::to_string(&seg).expect("serialize");
        assert!(json.contains("\"language\""));
        assert!(json.contains("\"speaker\""));
        assert!(json.contains("\"words\""));
        assert!(json.contains("SPEAKER_00"));
        assert!(json.contains("\"fi\""));
    }

    /// Optional result fields set to `None` must not appear in the JSON.
    #[test]
    fn transcription_result_omits_speakers_when_none() {
        let result = TranscriptionResult {
            segments: vec![],
            language: "en".to_string(),
            duration_seconds: 30.0,
            model: "parakeet-tdt-0.6b-v3".to_string(),
            backend: "fluidaudio".to_string(),
            rtfx: 143.0,
            processing_time_seconds: 0.21,
            speakers: None,
            footnotes: None,
            active_reading: None,
        };
        let json = serde_json::to_string(&result).expect("serialize");
        assert!(!json.contains("speakers"));
        // `footnotes` and `active_reading` carry the same skip attribute and
        // are also `None` here, so they must be absent as well.
        assert!(!json.contains("footnotes"));
        assert!(!json.contains("active_reading"));
    }

    /// `TranscribeOptions::default()` must leave every option unset/false.
    #[test]
    fn transcribe_options_default_is_minimal() {
        let opts = TranscribeOptions::default();
        assert!(opts.language.is_none());
        assert!(!opts.word_timestamps);
        assert!(!opts.diarize);
        assert!(opts.max_duration_seconds.is_none());
        assert!(
            !opts.include_embeddings,
            "include_embeddings must default to false"
        );
    }

    /// A `None` embedding must not appear in the JSON.
    #[test]
    fn speaker_segment_omits_embedding_when_none() {
        let seg = SpeakerSegment {
            speaker: "SPEAKER_00".to_string(),
            start: 0.0,
            end: 1.5,
            embedding: None,
        };
        let json = serde_json::to_string(&seg).expect("serialize");
        assert!(
            !json.contains("embedding"),
            "embedding must be absent when None: {json}"
        );
    }

    /// A `Some` embedding must appear in the JSON and round-trip intact.
    #[test]
    fn speaker_segment_includes_embedding_when_some() {
        let emb: Vec<f32> = (0..256).map(|i| i as f32 / 256.0).collect();
        let seg = SpeakerSegment {
            speaker: "SPEAKER_01".to_string(),
            start: 1.5,
            end: 3.0,
            embedding: Some(emb.clone()),
        };
        let json = serde_json::to_string(&seg).expect("serialize");
        assert!(
            json.contains("\"embedding\""),
            "embedding must be present: {json}"
        );
        let decoded: SpeakerSegment = serde_json::from_str(&json).expect("deserialize");
        let decoded_emb = decoded
            .embedding
            .expect("embedding present after roundtrip");
        assert_eq!(decoded_emb.len(), 256);
        // Compare every element: index 0 alone is 0.0, which a zeroed-out or
        // otherwise corrupted vector would also match.
        for (i, (got, want)) in decoded_emb.iter().zip(&emb).enumerate() {
            assert!(
                (got - want).abs() < f32::EPSILON,
                "embedding[{i}] changed in roundtrip: {got} != {want}"
            );
        }
    }

    /// `include_embeddings` can be switched on via struct-update syntax.
    #[test]
    fn transcribe_options_include_embeddings_can_be_enabled() {
        let opts = TranscribeOptions {
            include_embeddings: true,
            ..Default::default()
        };
        assert!(opts.include_embeddings);
    }
}