1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275
#![deny(missing_docs)]
//! This crate enables grouping values based on string similarity via
//! [Jaro-Winkler distance](https://en.wikipedia.org/wiki/Jaro%E2%80%93Winkler_distance) and
//! [complete-linkage clustering](https://en.wikipedia.org/wiki/Complete-linkage_clustering).
//!
//! # Example: Identify likely repeated merchants based on merchant name
//!
//! ```
//! use group_similar::{Config, Threshold, group_similar};
//!
//! #[derive(Eq, PartialEq, std::hash::Hash, Debug, Ord, PartialOrd)]
//! struct Merchant {
//! id: usize,
//! name: String
//! }
//!
//!
//! impl std::fmt::Display for Merchant {
//! fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result {
//! write!(f, "{}", self.name)
//! }
//! }
//!
//! impl AsRef<str> for Merchant {
//! fn as_ref(&self) -> &str {
//! &self.name
//! }
//! }
//!
//! let merchants = vec![
//! Merchant {
//! id: 1,
//! name: "McDonalds 105109".to_string()
//! },
//! Merchant {
//! id: 2,
//! name: "McDonalds 105110".to_string()
//! },
//! Merchant {
//! id: 3,
//! name: "Target ID1244".to_string()
//! },
//! Merchant {
//! id: 4,
//! name: "Target ID125".to_string()
//! },
//! Merchant {
//! id: 5,
//! name: "Amazon.com TID120159120".to_string()
//! },
//! Merchant {
//! id: 6,
//! name: "Target".to_string()
//! },
//! Merchant {
//! id: 7,
//! name: "Target.com".to_string()
//! },
//! ];
//!
//! let config = Config::jaro_winkler(Threshold::default());
//! let results = group_similar(&merchants, &config);
//!
//! assert_eq!(results.get(&merchants[0]), Some(&vec![&merchants[1]]));
//! assert_eq!(results.get(&merchants[2]), Some(&vec![&merchants[3], &merchants[5], &merchants[6]]));
//! assert_eq!(results.get(&merchants[4]), Some(&vec![]));
//! ```
//!
mod config;
pub use config::{Config, Threshold};
use kodama::linkage;
use rayon::iter::{IntoParallelRefIterator, ParallelIterator};
use std::collections::BTreeMap;
/// Group records based on a particular configuration
pub fn group_similar<'a, 'b, V>(
records: &'a [V],
config: &'b Config<V>,
) -> BTreeMap<&'a V, Vec<&'a V>>
where
V: std::hash::Hash + AsRef<str> + Eq + Sync + Ord,
{
let mut results = BTreeMap::new();
let mut condensed = similarity_matrix(records, &config.compare);
let dend = linkage(&mut condensed, records.len(), config.method);
let mut dendro: Dendro<f64, &V> = BTreeMap::default();
for (idx, record) in records.iter().enumerate() {
dendro.insert(idx, Dendrogram::Node(record));
}
let base = records.len();
for (idx, step) in dend.steps().iter().enumerate() {
dendro.insert(base + idx, Dendrogram::Group(step));
}
// We reverse this to start with the most dissimilar first, ultimately operating within the
// threshold. The result is that the largest groups bubble up to the top, ensuring that we're
// casting the widest net from closer matches when building the result set.
for (idx, step) in dend.steps().iter().enumerate().rev() {
if config.threshold.within(step.dissimilarity) {
match extract_values(&mut dendro, base + idx).as_slice() {
[first, rest @ ..] => {
results.entry(*first).or_insert(vec![]).extend(rest);
}
[] => {}
}
}
}
for node in get_remaining_nodes(&dendro) {
results.insert(node, vec![]);
}
results
}
type Dendro<'a, F, V> = BTreeMap<usize, Dendrogram<'a, F, V>>;
/// Recursively retrieve this and all child values to completion
///
/// If the value retrieved by index is present in the BTreeMap, we remove it. If it's an item
/// directly, we push that onto the list of results. If it's a `Step` (which contains pointers to
/// other steps or values), we recurse both sides (as a step can be linked to a value, or even
/// another step).
fn extract_values<'a, 'b, F, V>(dendro: &mut Dendro<'a, F, V>, at: usize) -> Vec<V> {
let mut results = vec![];
if let Some(result) = dendro.remove(&at) {
match result {
Dendrogram::Group(step) => {
results.extend(extract_values(dendro, step.cluster1));
results.extend(extract_values(dendro, step.cluster2));
}
Dendrogram::Node(v) => results.push(v),
}
}
results
}
/// Extract all item values from the BTreeMap
///
/// Given `extract_values` mutates the BTreeMap, removing values (so as not to duplicate results across
/// different groups, the net result is a BTreeMap that may contain steps (e.g. that didn't meet the
/// threshold criterion as the steps are too dissimilar) or items (if an item itself is too
/// dissimilar from any other items or steps).
///
/// In this case, we still need to return those items, and have their associated matches be an
/// emtpy Vec, when we add them to the returned BTreeMap.
fn get_remaining_nodes<'a, 'b, F, V>(dendro: &'b Dendro<'a, F, V>) -> Vec<&'b V> {
let mut results = vec![];
for value in dendro.values() {
if let Dendrogram::Node(v) = value {
results.push(v);
}
}
results
}
#[derive(Debug)]
enum Dendrogram<'a, F, V> {
Group(&'a kodama::Step<F>),
Node(V),
}
/// This function is very heavily inspired by the example provided within the kodama documentation
/// (at https://docs.rs/kodama)
///
/// The biggest differences? This allows for the comparison operation to be provided, and it
/// compares values leveraging Rayon to speed up the operations. Due to the use of Rayon, we
/// construct an intermediate vector to hold positions for calculations.
pub fn similarity_matrix<V, F>(inputs: &[V], compare: &F) -> Vec<f64>
where
F: Fn(&V, &V) -> f64 + Send + Sync,
V: Sync + AsRef<str>,
{
if inputs.is_empty() {
return vec![];
}
let ilen = inputs.len();
let intermediate = (0..ilen - 1)
.flat_map(|row| (row + 1..ilen).map(move |col| (row, col)))
.collect::<Vec<_>>();
intermediate
.par_iter()
.map(|(row, col)| (compare)(&inputs[*row], &inputs[*col]))
.collect::<Vec<_>>()
}
#[cfg(test)]
mod tests {
use crate::{group_similar, Config};
use std::collections::BTreeMap;
use std::convert::TryInto;
#[test]
fn allows_for_empty_list() {
let threshold = 1.0_f64.try_into().expect("permissive threshold");
let config: Config<&str> = Config::jaro_winkler(threshold);
let values = vec![];
let outcome = BTreeMap::new();
assert_eq!(outcome, group_similar(&values, &config));
}
#[test]
fn groups_all_together_with_permissive_threshold() {
let threshold = 1.0_f64.try_into().expect("permissive threshold");
let config = Config::jaro_winkler(threshold);
let values = vec!["hello", "world"];
let mut outcome = BTreeMap::new();
outcome.insert(&"hello", vec![&"world"]);
assert_eq!(outcome, group_similar(&values, &config));
}
#[test]
fn groups_nothing_together_with_strict_threshold() {
let threshold = 0.0_f64.try_into().expect("strict threshold");
let config = Config::jaro_winkler(threshold);
let values = vec!["hello", "world"];
let mut outcome = BTreeMap::new();
outcome.insert(&"hello", vec![]);
outcome.insert(&"world", vec![]);
assert_eq!(outcome, group_similar(&values, &config));
}
#[test]
fn groups_similar_values() {
let threshold = 0.25_f64.try_into().expect("fairly strict threshold");
let config = Config::jaro_winkler(threshold);
let values = vec!["Jane", "June", "Joan", "Joseph"];
let mut outcome = BTreeMap::new();
outcome.insert(&"Jane", vec![&"June"]);
outcome.insert(&"Joan", vec![]);
outcome.insert(&"Joseph", vec![]);
assert_eq!(outcome, group_similar(&values, &config));
}
#[test]
fn groups_similar_values_loosely() {
let threshold = 0.5_f64.try_into().expect("fairly permissive threshold");
let config = Config::jaro_winkler(threshold);
let values = vec![
"Henry", "Jane", "June", "Joan", "José", "Barry", "Joseph", "Mary", "Henry", "Harry",
];
let mut outcome = BTreeMap::new();
outcome.insert(&"José", vec![&"Joseph", &"Joan", &"Jane", &"June"]);
outcome.insert(&"Henry", vec![&"Henry", &"Mary", &"Barry", &"Harry"]);
assert_eq!(outcome, group_similar(&values, &config));
}
}