use lazy_static::lazy_static;
use regex::Regex;
use std::collections::HashSet;
use std::hash::Hash;
use std::iter::FromIterator;
pub fn find_words_iter<'n, 'h>(
needle: &'n str,
haystack: &'h str,
threshold: f32,
) -> Matches<'n, 'h> {
lazy_static! {
static ref WORD_RX: Regex = Regex::new(r"\w+").unwrap();
}
let words = WORD_RX.find_iter(haystack);
Matches {
needle,
haystack_words: words,
threshold,
}
}
pub struct Matches<'n, 'h> {
needle: &'n str,
haystack_words: regex::Matches<'static, 'h>,
threshold: f32,
}
impl<'n, 'h> Iterator for Matches<'n, 'h> {
type Item = Match<'h>;
fn next(&mut self) -> Option<Self::Item> {
while let Some(m) = self.haystack_words.next() {
let w = m.as_str();
if similarity(self.needle, w) > self.threshold {
let m2 = Match {
text: w,
start: m.start(),
end: m.end(),
};
return Some(m2);
}
}
None
}
}
#[derive(Copy, Clone, Debug, Eq, PartialEq)]
pub struct Match<'t> {
text: &'t str,
start: usize,
end: usize,
}
impl<'t> Match<'t> {
pub fn start(self) -> usize {
self.start
}
pub fn end(self) -> usize {
self.end
}
pub fn as_str(self) -> &'t str {
self.text
}
}
pub fn similarity(a: &str, b: &str) -> f32 {
lazy_static! {
static ref RX: Regex = Regex::new(r"^|$|\W+").unwrap();
}
let a = RX.replace_all(a, " ").to_lowercase();
let b = RX.replace_all(b, " ").to_lowercase();
let ta = trigrams(&a);
let tb = trigrams(&b);
jaccard(&ta, &tb)
}
fn jaccard<T>(s1: &HashSet<T>, s2: &HashSet<T>) -> f32
where
T: Hash + Eq,
{
let i = s1.intersection(&s2).count() as f32;
let u = s1.union(&s2).count() as f32;
if u == 0.0 {
1.0
} else {
i / u
}
}
fn trigrams(s: &str) -> HashSet<&str> {
let idxs = rune_indexes(&s);
HashSet::from_iter(
(0..idxs.len() - 3)
.map(|i| &s[idxs[i]..idxs[i + 3]])
.filter(|t| !t.ends_with(" ")),
)
}
fn rune_indexes(s: &str) -> Vec<usize> {
let mut idxs: Vec<usize> = s.char_indices().map(|(i, _)| i).collect();
idxs.push(s.len());
idxs
}
#[cfg(test)]
mod tests {
use super::*;
use table_test::table_test;
#[test]
fn empty() {
assert_eq!(similarity(&"", &""), 1.0, "checking similarity of '' to ''");
}
#[test]
fn same_string() {
let strs = vec!["", "a", "ab", "abc", "abcd"];
for a in strs {
assert_eq!(
similarity(&a, &a),
1.0,
"checking similarity of '{}' to itself",
a
);
}
}
#[test]
fn zero_similarity_for_nothing_in_common() {
let va = vec!["abc", "abcd"];
for a in va {
let vb = vec!["def", "efgh"];
for b in vb {
assert_eq!(
similarity(&a, &b),
0.0,
"checking that '{}' and '{}' have similarity of zero",
a,
b
);
assert_eq!(
similarity(&b, &a),
0.0,
"checking that '{}' and '{}' have similarity of zero",
b,
a
);
}
}
}
#[test]
fn non_ascii_unicode() {
assert_eq!(similarity(&"🐕", &"🐕"), 1.0, "dog matches dog");
assert_eq!(
similarity(&"ö`üǜ", &"asd"),
0.0,
"no match between ö`üǜ and asd"
);
assert_eq!(
similarity(&"ö`üǜ", &"ouu"),
0.0,
"no match between ö`üǜ… and ouu"
);
}
#[test]
fn case_ignored() {
assert_eq!(similarity("A", "a"), 1.0, "A is a");
assert_eq!(similarity("a", "A"), 1.0, "a is A");
}
#[test]
fn fuzzy_matches() {
assert_eq!(similarity(&"a", &"ab"), 0.25, "checking a and ab");
assert_eq!(similarity(&"foo", &"food"), 0.5, "checking foo and food");
assert_eq!(
similarity(&"bar", &"barred"),
0.375,
"checking bar and barred"
);
assert_eq!(
similarity(&"ing bear", &"ing boar"),
0.5,
"checking ing bear and ing boar"
);
assert_eq!(
similarity(&"dancing bear", &"dancing boar"),
0.625,
"checking dancing bear and dancing boar"
);
assert_eq!(
similarity(&"sir sly", &"srsly"),
0.3,
"checking sir sly and srsly"
);
assert_eq!(
similarity(&"same, but different?", &"same but different"),
1.0,
"checking same but different"
);
}
#[test]
fn finding() {
let table = vec![
(("", ""), vec![]),
(("a", ""), vec![]),
(("a", "a"), vec![(0, 1)]),
(("a", "ab"), vec![]),
(("a", "ba"), vec![]),
(("ab", "abc"), vec![(0, 3)]),
(("a", "ababa"), vec![]),
(("a", "a b a b a"), vec![(0, 1), (4, 5), (8, 9)]),
(("riddums", "riddims"), vec![(0, "riddums".len())]),
(
("riddums", "funky riddims"),
vec![("funky ".len(), "funky riddums".len())],
),
];
for (validator, (needle, haystack), expected) in table_test!(table) {
let threshold = 0.3;
let actual: Vec<(usize, usize)> = find_words_iter(needle, haystack, threshold)
.map(|m| (m.start, m.end))
.collect();
validator
.given(&format!("needle = '{}', haystack = '{}'", needle, haystack))
.when("find_vec")
.then(&format!("it should return {:?}", expected))
.assert_eq(expected, actual);
}
}
}