Skip to main content

oxicuda_seq/metrics/
ter.rs

1//! Translation Edit Rate (TER).
2//!
3//! Reference: Snover, M., Dorr, B., Schwartz, R., Micciulla, L. & Makhoul, J.
4//! (2006). *A Study of Translation Edit Rate with Targeted Human Annotation*.
5//! AMTA 2006. <https://aclanthology.org/2006.amta-papers.25/>.
6//!
7//! # Metric
8//!
9//! TER is the minimum number of word-level edits needed to turn a hypothesis into
10//! a reference, normalised by the reference length:
11//!
12//! ```text
13//! TER = (num_edits + num_shifts) / ref_length
14//! ```
15//!
16//! The edits are the usual insert / delete / substitute of Levenshtein distance,
17//! **plus block shifts**: a contiguous run of hypothesis words may be moved to a
18//! different position as a single edit. Finding the optimal set of shifts is
19//! NP-hard, so TER uses a **greedy shift heuristic** (Snover et al. §2.2):
20//!
21//! 1. Compute the current edit distance between the (possibly shifted) hypothesis
22//!    and the reference.
23//! 2. Search every contiguous block of hypothesis words that also occurs in the
24//!    reference and whose two endpoints are currently *mis-aligned* (so moving the
25//!    block could help). For each candidate destination, tentatively apply the
26//!    shift and recompute the edit distance.
27//! 3. Apply the single shift that **reduces** the edit distance the most (each
28//!    shift counts as one edit). Repeat until no shift helps.
29//! 4. Add the remaining insert/delete/substitute edit distance.
30//!
31//! This module reuses [`crate::metrics::edit_distance::align`] for the word-level
32//! edit distance and alignment backtrace. Production code never panics: all
33//! fallible paths return [`SeqError`] (notably an empty reference is rejected).
34
35use crate::error::{SeqError, SeqResult};
36use crate::metrics::edit_distance::{EditOp, align};
37
38/// Result of a Translation Edit Rate computation.
39#[derive(Debug, Clone, Copy, PartialEq)]
40pub struct TerResult {
41    /// TER score = `(num_edits + num_shifts) / ref_len`.
42    pub score: f64,
43    /// Number of insert/delete/substitute edits after all shifts were applied.
44    pub num_edits: usize,
45    /// Number of block shifts the greedy heuristic applied.
46    pub num_shifts: usize,
47    /// Reference length used for normalisation.
48    pub ref_len: usize,
49}
50
51/// Maximum block length considered by the greedy shift search.
52///
53/// Snover et al. cap shifts at 10 words; longer matching blocks are rare and the
54/// cap bounds the search to keep the heuristic tractable.
55const MAX_SHIFT_LEN: usize = 10;
56
57/// Number of insert/delete/substitute edits between two slices.
58fn edit_distance<T: Eq>(a: &[T], b: &[T]) -> usize {
59    align(a, b).counts.distance()
60}
61
62/// Mark, for each hypothesis position, whether it is currently aligned to an
63/// equal reference word (a "match" in the optimal alignment).
64///
65/// Positions that are *not* matched are the ones a shift might usefully relocate.
66fn aligned_mask<T: Eq>(hyp: &[T], ref_: &[T]) -> Vec<bool> {
67    let mut mask = vec![false; hyp.len()];
68    for op in align(hyp, ref_).ops {
69        if let EditOp::Match { src, .. } = op {
70            if src < mask.len() {
71                mask[src] = true;
72            }
73        }
74    }
75    mask
76}
77
78/// Apply a block shift: move `len` words starting at `from` so the block begins at
79/// `to` (index expressed in the *post-removal* sequence). Returns the new vector.
80fn apply_shift<T: Clone>(seq: &[T], from: usize, len: usize, to: usize) -> Vec<T> {
81    let mut block: Vec<T> = seq[from..from + len].to_vec();
82    let mut rest: Vec<T> = Vec::with_capacity(seq.len() - len);
83    rest.extend_from_slice(&seq[..from]);
84    rest.extend_from_slice(&seq[from + len..]);
85    let mut out = Vec::with_capacity(seq.len());
86    out.extend_from_slice(&rest[..to]);
87    out.append(&mut block);
88    out.extend_from_slice(&rest[to..]);
89    out
90}
91
92/// Find the single best shift that reduces the edit distance the most.
93///
94/// Returns `Some((from, len, to, new_distance))` for the most-reducing shift, or
95/// `None` if no shift strictly lowers the current distance.
96fn best_shift<T: Eq + Clone>(
97    hyp: &[T],
98    ref_: &[T],
99    current: usize,
100) -> Option<(usize, usize, usize, usize)> {
101    let h = hyp.len();
102    if h == 0 {
103        return None;
104    }
105    let mask = aligned_mask(hyp, ref_);
106
107    let mut best: Option<(usize, usize, usize, usize)> = None;
108    // For every candidate block [from, from+len) in the hypothesis …
109    let max_len = MAX_SHIFT_LEN.min(h);
110    for len in 1..=max_len {
111        for from in 0..=h - len {
112            // Only consider blocks that are not already fully aligned in place
113            // (shifting an already-matched block cannot help and wastes an edit).
114            let block_aligned = (from..from + len).all(|p| mask[p]);
115            if block_aligned {
116                continue;
117            }
118            // The block must occur in the reference for a shift to plausibly help.
119            if !occurs_in(ref_, &hyp[from..from + len]) {
120                continue;
121            }
122            // Try every destination in the post-removal sequence.
123            let rest_len = h - len;
124            for to in 0..=rest_len {
125                // Skip the no-op shift (putting the block back where it was).
126                if to == from {
127                    continue;
128                }
129                let shifted = apply_shift(hyp, from, len, to);
130                let dist = edit_distance(&shifted, ref_);
131                // One shift costs one edit; only keep it if it nets a reduction.
132                if dist + 1 < current {
133                    let better = match best {
134                        None => true,
135                        Some((_, _, _, bd)) => dist < bd,
136                    };
137                    if better {
138                        best = Some((from, len, to, dist));
139                    }
140                }
141            }
142        }
143    }
144    best
145}
146
147/// Whether `needle` appears as a contiguous sub-slice of `haystack`.
148fn occurs_in<T: Eq>(haystack: &[T], needle: &[T]) -> bool {
149    if needle.is_empty() || needle.len() > haystack.len() {
150        return false;
151    }
152    let last = haystack.len() - needle.len();
153    for start in 0..=last {
154        if haystack[start..start + needle.len()] == *needle {
155            return true;
156        }
157    }
158    false
159}
160
161/// Core TER computation over generic comparable tokens.
162///
163/// Greedily applies block shifts (each counting as one edit) while they reduce the
164/// edit distance, then adds the residual insert/delete/substitute distance and
165/// normalises by `ref_.len()`. Returns [`SeqError::EmptyInput`] for an empty
166/// reference (the metric is undefined when `ref_len == 0`).
167fn ter_tokens<T: Eq + Clone>(hyp: &[T], ref_: &[T]) -> SeqResult<TerResult> {
168    let ref_len = ref_.len();
169    if ref_len == 0 {
170        return Err(SeqError::EmptyInput);
171    }
172
173    let mut current_hyp: Vec<T> = hyp.to_vec();
174    let mut num_shifts = 0usize;
175    let mut current_dist = edit_distance(&current_hyp, ref_);
176
177    // Greedy shift loop: keep applying the most-reducing shift until none helps.
178    // The loop must terminate because each accepted shift strictly lowers the
179    // total (distance + shifts) lower bound: distance drops by at least 2 while
180    // shifts rise by 1, so the bound `current_dist + num_shifts` strictly
181    // decreases and is bounded below by 0.
182    loop {
183        if current_dist == 0 {
184            break;
185        }
186        match best_shift(&current_hyp, ref_, current_dist) {
187            Some((from, len, to, new_dist)) => {
188                current_hyp = apply_shift(&current_hyp, from, len, to);
189                current_dist = new_dist;
190                num_shifts += 1;
191            }
192            None => break,
193        }
194    }
195
196    let num_edits = current_dist;
197    let score = (num_edits + num_shifts) as f64 / ref_len as f64;
198    Ok(TerResult {
199        score,
200        num_edits,
201        num_shifts,
202        ref_len,
203    })
204}
205
206/// Translation Edit Rate between a hypothesis and a reference (word strings).
207///
208/// `TER = (num_edits + num_shifts) / ref_length`. Returns
209/// [`SeqError::EmptyInput`] if the reference is empty.
210pub fn ter(hyp: &[&str], ref_: &[&str]) -> SeqResult<TerResult> {
211    ter_tokens(hyp, ref_)
212}
213
214/// Token-id variant of [`ter`] for already-tokenised integer sequences.
215pub fn ter_ids(hyp: &[usize], ref_: &[usize]) -> SeqResult<TerResult> {
216    ter_tokens(hyp, ref_)
217}
218
219#[cfg(test)]
220mod tests {
221    use super::*;
222
223    fn words(s: &str) -> Vec<&str> {
224        s.split_whitespace().collect()
225    }
226
227    #[test]
228    fn identical_sentences_score_zero() {
229        let h = words("the cat sat on the mat");
230        let r = words("the cat sat on the mat");
231        let res = ter(&h, &r).expect("ter");
232        assert_eq!(res.num_edits, 0);
233        assert_eq!(res.num_shifts, 0);
234        assert!(res.score.abs() < 1e-12, "score={}", res.score);
235        assert_eq!(res.ref_len, 6);
236    }
237
238    #[test]
239    fn pure_substitutions_no_shifts() {
240        // Two words substituted, none movable → 2 edits, 0 shifts, 2/5.
241        let r = words("a b c d e");
242        let h = words("a x c y e");
243        let res = ter(&h, &r).expect("ter");
244        assert_eq!(res.num_shifts, 0);
245        assert_eq!(res.num_edits, 2);
246        assert!((res.score - 2.0 / 5.0).abs() < 1e-12, "score={}", res.score);
247    }
248
249    #[test]
250    fn block_reordering_finds_shift_and_lowers_score() {
251        // Reference: A B C D E F ; hypothesis has the block "E F" moved up front.
252        // A pure edit distance must delete "E F" at the front and insert them at
253        // the end (or substitute), whereas one block shift fixes it cheaply.
254        let r = words("A B C D E F");
255        let h = words("E F A B C D");
256        let no_shift = align(&h, &r).counts.distance() as f64 / r.len() as f64;
257        let res = ter(&h, &r).expect("ter");
258        assert!(
259            res.num_shifts >= 1,
260            "expected a shift, got {}",
261            res.num_shifts
262        );
263        assert!(
264            res.score < no_shift - 1e-12,
265            "shifted score {} should beat no-shift {}",
266            res.score,
267            no_shift
268        );
269        // After the optimal shift the sentences are identical: 1 shift, 0 edits.
270        assert_eq!(res.num_edits, 0);
271        assert_eq!(res.num_shifts, 1);
272        assert!((res.score - 1.0 / 6.0).abs() < 1e-12, "score={}", res.score);
273    }
274
275    #[test]
276    fn single_swap_is_one_shift() {
277        // Swapping a single adjacent pair: "b a" → "a b" is one shift.
278        let r = words("a b c");
279        let h = words("b a c");
280        let res = ter(&h, &r).expect("ter");
281        assert_eq!(res.num_edits, 0);
282        assert_eq!(res.num_shifts, 1);
283        assert!((res.score - 1.0 / 3.0).abs() < 1e-12);
284    }
285
286    #[test]
287    fn insertions_counted() {
288        // Hypothesis has one extra word → one deletion edit, no shifts.
289        let r = words("the quick fox");
290        let h = words("the quick brown fox");
291        let res = ter(&h, &r).expect("ter");
292        assert_eq!(res.num_shifts, 0);
293        assert_eq!(res.num_edits, 1);
294        assert!((res.score - 1.0 / 3.0).abs() < 1e-12);
295    }
296
297    #[test]
298    fn deletions_counted() {
299        // Hypothesis is missing one word → one insertion edit, no shifts.
300        let r = words("the quick brown fox");
301        let h = words("the quick fox");
302        let res = ter(&h, &r).expect("ter");
303        assert_eq!(res.num_shifts, 0);
304        assert_eq!(res.num_edits, 1);
305        assert!((res.score - 1.0 / 4.0).abs() < 1e-12);
306    }
307
308    #[test]
309    fn normalisation_by_reference_length() {
310        // Identical-content but different reference lengths scale the score.
311        let r_short = words("a b");
312        let h_short = words("a x");
313        let res_short = ter(&h_short, &r_short).expect("ter");
314        assert!((res_short.score - 1.0 / 2.0).abs() < 1e-12);
315
316        let r_long = words("a b c d");
317        let h_long = words("a x c d");
318        let res_long = ter(&h_long, &r_long).expect("ter");
319        assert!((res_long.score - 1.0 / 4.0).abs() < 1e-12);
320    }
321
322    #[test]
323    fn empty_reference_is_error() {
324        let h = words("a b c");
325        let r: Vec<&str> = Vec::new();
326        assert!(ter(&h, &r).is_err());
327    }
328
329    #[test]
330    fn empty_hypothesis_against_reference() {
331        // Empty hypothesis: all reference words are insertions, no shifts.
332        let h: Vec<&str> = Vec::new();
333        let r = words("a b c");
334        let res = ter(&h, &r).expect("ter");
335        assert_eq!(res.num_shifts, 0);
336        assert_eq!(res.num_edits, 3);
337        assert!((res.score - 1.0).abs() < 1e-12);
338    }
339
340    #[test]
341    fn token_id_variant_matches_string_variant() {
342        // Same structural reordering expressed as token ids.
343        let h_ids = vec![4usize, 5, 0, 1, 2, 3];
344        let r_ids = vec![0usize, 1, 2, 3, 4, 5];
345        let res = ter_ids(&h_ids, &r_ids).expect("ter");
346        assert_eq!(res.num_edits, 0);
347        assert_eq!(res.num_shifts, 1);
348        assert!((res.score - 1.0 / 6.0).abs() < 1e-12);
349    }
350
351    #[test]
352    fn shift_never_increases_total_cost() {
353        // The greedy heuristic must never produce a TER above the plain
354        // edit-distance / ref-len (shifts are only taken when they reduce cost).
355        let cases = [
356            ("the cat sat", "the sat cat"),
357            ("one two three four", "four three two one"),
358            ("a b c d e", "b c d e a"),
359            ("hello world foo bar", "foo bar hello world"),
360        ];
361        for (hs, rs) in cases {
362            let h = words(hs);
363            let r = words(rs);
364            let baseline = align(&h, &r).counts.distance() as f64 / r.len() as f64;
365            let res = ter(&h, &r).expect("ter");
366            assert!(
367                res.score <= baseline + 1e-12,
368                "case ({hs} | {rs}): ter {} > baseline {}",
369                res.score,
370                baseline
371            );
372        }
373    }
374
375    #[test]
376    fn far_block_move_is_single_shift() {
377        // Move a 2-word block across a long sentence; still one shift, zero edits.
378        let r = words("w x a b c d y z");
379        let h = words("a b w x c d y z");
380        let res = ter(&h, &r).expect("ter");
381        assert_eq!(res.num_edits, 0);
382        assert_eq!(res.num_shifts, 1);
383        assert!((res.score - 1.0 / 8.0).abs() < 1e-12, "score={}", res.score);
384    }
385}