Skip to main content

entrenar/eval/generative/
asr.rs

//! Automatic Speech Recognition (ASR) evaluation metrics
//!
//! Provides Word Error Rate (WER) and Real-Time Factor inverse (RTFx)
//! for evaluating speech recognition and audio processing models.
5
/// Compute Word Error Rate via word-level Levenshtein edit distance.
///
/// WER = (Substitutions + Deletions + Insertions) / Reference Length
///
/// Both inputs are tokenized with [`str::split_whitespace`], so leading,
/// trailing, and repeated whitespace is ignored. Words are compared exactly:
/// no case folding or punctuation normalization is applied.
///
/// # Returns
///
/// - `0.0` when the word sequences are identical (including both empty).
/// - `f64::INFINITY` when `reference` has no words but `hypothesis` does,
///   because the error count cannot be normalized by a zero-length reference.
/// - Otherwise the edit distance divided by the number of reference words.
///   This can exceed `1.0` when the hypothesis is much longer than the
///   reference.
///
/// This function does not panic.
pub fn word_error_rate(reference: &str, hypothesis: &str) -> f64 {
    let ref_words: Vec<&str> = reference.split_whitespace().collect();
    let hyp_words: Vec<&str> = hypothesis.split_whitespace().collect();

    if ref_words.is_empty() {
        return if hyp_words.is_empty() {
            0.0
        } else {
            f64::INFINITY
        };
    }

    // Only the previous and current rows of the edit-distance table are ever
    // read, so two rolling rows replace the full (n + 1) x (m + 1) table.
    // `prev[j]` is the distance between the reference words consumed so far
    // and the first `j` hypothesis words; row 0 is therefore 0, 1, ..., m.
    let mut prev: Vec<usize> = (0..=hyp_words.len()).collect();
    let mut curr = vec![0usize; hyp_words.len() + 1];

    for (i, ref_word) in ref_words.iter().enumerate() {
        // Matching `i + 1` reference words against an empty hypothesis
        // requires `i + 1` deletions.
        curr[0] = i + 1;
        for (j, hyp_word) in hyp_words.iter().enumerate() {
            let cost = usize::from(ref_word != hyp_word);
            curr[j + 1] = (prev[j + 1] + 1) // deletion
                .min(curr[j] + 1) // insertion
                .min(prev[j] + cost); // substitution
        }
        // The freshly filled row becomes `prev` for the next reference word.
        std::mem::swap(&mut prev, &mut curr);
    }

    // After the final swap `prev` holds the last computed row. Word counts are
    // far below 2^53, so the conversions to f64 are exact in practice.
    prev[hyp_words.len()] as f64 / ref_words.len() as f64
}
51
/// Compute inverse Real-Time Factor (RTFx).
///
/// RTFx = audio_duration / processing_time
///
/// Higher is better: RTFx=100 means the model processes audio 100x faster
/// than real-time.
///
/// Note the argument order: the processing time comes first, the audio
/// duration second. Both are expected in the same unit (seconds).
///
/// Returns 0.0 if `processing_secs` is zero or negative. A NaN
/// `processing_secs` is not caught by that guard and yields NaN, as does a
/// NaN `audio_duration_secs` with a positive `processing_secs`.
/// `audio_duration_secs` is not validated, so a negative duration produces a
/// negative result.
pub fn real_time_factor_inverse(processing_secs: f64, audio_duration_secs: f64) -> f64 {
    // Written as `<= 0.0` (not `> 0.0` on the other branch) so that a NaN
    // processing time falls through to the division and propagates as NaN.
    if processing_secs <= 0.0 {
        0.0
    } else {
        audio_duration_secs / processing_secs
    }
}