textdistance/algorithms/
roberts.rs

1//! Roberts similarity
2#![cfg(feature = "std")]
3use crate::counter::Counter;
4use crate::{Algorithm, Result};
5
/// [Roberts similarity].
///
/// Both inputs are reduced to per-element occurrence counts. For every element
/// with counts `a` and `b` in the first and second input respectively, the
/// similarity is
///
/// ```text
/// sum((a + b) * min(a, b) / max(a, b)) / sum(a + b)
/// ```
///
/// where the numerator only includes elements that occur in both inputs.
/// Two empty inputs are defined to have similarity 1.0.
///
/// The metric is always normalized on the interval from 0.0 to 1.0.
///
/// [Roberts similarity]: https://github.com/chrislit/abydos/blob/master/abydos/distance/_roberts.py
#[derive(Debug, Clone, Default)]
pub struct Roberts {}
13
14impl Algorithm<f64> for Roberts {
15    fn for_iter<C, E>(&self, s1: C, s2: C) -> Result<f64>
16    where
17        C: Iterator<Item = E>,
18        E: Eq + core::hash::Hash,
19    {
20        let c1 = Counter::from_iter(s1);
21        let c2 = Counter::from_iter(s2);
22        let n1 = c1.count();
23        let n2 = c2.count();
24        if n1 == 0 && n2 == 0 {
25            return Result {
26                abs: 1.0,
27                is_distance: false,
28                max: 1.,
29                len1: n1,
30                len2: n2,
31            };
32        }
33
34        let cm = c1.merge(&c2);
35        let alphabet = cm.keys();
36        let mut s1: f64 = 0.;
37        let mut s2: usize = 0;
38        for key in alphabet {
39            let v1 = c1.get(key).unwrap_or(&0);
40            let v2 = c2.get(key).unwrap_or(&0);
41            if v1 != &0 && v2 != &0 {
42                s1 += ((v1 + v2) * v1.min(v2)) as f64 / *v1.max(v2) as f64;
43            }
44            s2 += v1 + v2;
45        }
46
47        Result {
48            abs: s1 / s2 as f64,
49            is_distance: false,
50            max: 1.,
51            len1: n1,
52            len2: n2,
53        }
54    }
55}
56
#[cfg(test)]
mod tests {
    use crate::str::roberts;
    use assert2::assert;
    use rstest::rstest;

    /// Absolute tolerance used when comparing similarity scores.
    const TOLERANCE: f64 = 1E-5;

    /// Returns `true` when `left` and `right` differ by less than `TOLERANCE`.
    fn approx_eq(left: f64, right: f64) -> bool {
        let delta = left - right;
        delta.abs() < TOLERANCE
    }

    #[rstest]
    #[case("", "", 1.)]
    #[case("a", "a", 1.)]
    #[case("", "a", 0.)]
    #[case("a", "", 0.)]
    // Parity with abydos.
    // abydos tokenizes input for Roberts into bi-grams with word separators by
    // default, and its own test suite checks that configuration. textdistance
    // works on a bag of characters by default and never adds word separators,
    // so the expected values below are not taken from the abydos tests; they
    // are the results of running `Roberts(qval=1).sim(a, b)` on the same inputs.
    #[case("cat", "hat", 0.6666666666666666)]
    #[case("Niall", "Neil", 0.6111111111111112)]
    #[case("aluminum", "Catalan", 0.3555555555555555)]
    #[case("ATCG", "TAGC", 1.0)]
    #[case("Nigel", "Niall", 0.55)]
    #[case("Niall", "Nigel", 0.55)]
    #[case("Colin", "Coiln", 1.0)]
    #[case("Coiln", "Colin", 1.0)]
    #[case("ATCAACGAGT", "AACGATTAG", 0.9210526315789473)]
    fn function_str(#[case] s1: &str, #[case] s2: &str, #[case] exp: f64) {
        let actual = roberts(s1, s2);
        let matches = approx_eq(actual, exp);
        assert!(matches, "roberts({}, {}) is {}, not {}", s1, s2, actual, exp);
    }
}