entrenar/eval/generative/asr.rs
1//! Automatic Speech Recognition (ASR) evaluation metrics
2//!
3//! Provides Word Error Rate (WER) and Real-Time Factor inverse (RTFx)
4//! for evaluating speech recognition and audio processing models.
5
/// Compute Word Error Rate (WER) via word-level Levenshtein edit distance.
///
/// WER = (Substitutions + Deletions + Insertions) / Reference Length
///
/// Both inputs are tokenized with [`str::split_whitespace`], so leading,
/// trailing, and repeated whitespace do not affect the result. Words are
/// compared exactly (case- and punctuation-sensitive).
///
/// # Arguments
///
/// * `reference` - The ground-truth transcript.
/// * `hypothesis` - The transcript produced by the model under evaluation.
///
/// # Returns
///
/// * `0.0` when both transcripts contain the same word sequence, including
///   the case where both are empty.
/// * `f64::INFINITY` when the reference is empty and the hypothesis is not,
///   since the error count would be divided by a zero-length reference.
/// * Otherwise the edit distance divided by the number of reference words.
///   This can exceed `1.0` when the hypothesis is much longer than the
///   reference.
///
/// This function does not panic.
pub fn word_error_rate(reference: &str, hypothesis: &str) -> f64 {
    let ref_words: Vec<&str> = reference.split_whitespace().collect();
    let hyp_words: Vec<&str> = hypothesis.split_whitespace().collect();

    if ref_words.is_empty() {
        return if hyp_words.is_empty() {
            0.0
        } else {
            f64::INFINITY
        };
    }

    // Only the previous row of the edit-distance table is ever read, so two
    // rolling rows are kept instead of the full (n + 1) x (m + 1) matrix.
    // `prev[j]` starts as the cost of turning an empty reference prefix into
    // the first `j` hypothesis words, i.e. `j` insertions.
    let mut prev: Vec<usize> = (0..=hyp_words.len()).collect();
    let mut curr = vec![0usize; hyp_words.len() + 1];

    for (i, ref_word) in ref_words.iter().enumerate() {
        // Turning the first `i + 1` reference words into an empty hypothesis
        // takes `i + 1` deletions.
        curr[0] = i + 1;
        for (j, hyp_word) in hyp_words.iter().enumerate() {
            let cost = usize::from(ref_word != hyp_word);
            curr[j + 1] = (prev[j + 1] + 1) // deletion
                .min(curr[j] + 1) // insertion
                .min(prev[j] + cost); // substitution (or match when cost is 0)
        }
        std::mem::swap(&mut prev, &mut curr);
    }

    // After the final swap `prev` holds the row for the full reference.
    prev[hyp_words.len()] as f64 / ref_words.len() as f64
}
51
/// Compute the inverse Real-Time Factor (RTFx) of a processing run.
///
/// RTFx = audio_duration / processing_time
///
/// Larger values are better: an RTFx of 100 means the audio was processed
/// 100x faster than real-time.
///
/// # Arguments
///
/// * `processing_secs` - Time spent processing, in seconds.
/// * `audio_duration_secs` - Length of the processed audio, in seconds.
///
/// # Returns
///
/// `audio_duration_secs / processing_secs`, or `0.0` when `processing_secs`
/// is zero or negative.
pub fn real_time_factor_inverse(processing_secs: f64, audio_duration_secs: f64) -> f64 {
    // The comparison is deliberately `<= 0.0`: a NaN processing time compares
    // false and therefore falls through to the division.
    let no_elapsed_time = processing_secs <= 0.0;
    if no_elapsed_time {
        0.0
    } else {
        audio_duration_secs / processing_secs
    }
}