Function cluster_strings

Source
pub fn cluster_strings<'a>(
    inputs: &'a Vec<&'a str>,
    max_edit_frac: f32,
    n_threads: usize,
) -> Result<Vec<Vec<&'a str>>, ValueError>
Expand description

Group similar input strings into clusters.

Strings are grouped into a cluster if the Levenshtein distance between them is no greater than `max_edit_frac` times the shorter string’s length (a distance exactly at the threshold still counts, as the examples below show).

§Examples

Basic usage:

let inputs = vec!["aaaa", "aaax", "bbbb", "bbbz"];
let expected = vec![vec!["aaaa", "aaax"], vec!["bbbb", "bbbz"]];

let clusters = clustr::cluster_strings(&inputs, 0.25, 1)?;

assert_eq!(clusters, expected);

Multiple threads:

let inputs = vec!["aa", "bb", "aa", "bb"];
let expected = vec![vec!["aa", "aa"], vec!["bb", "bb"]];

let results = clustr::cluster_strings(&inputs, 0.0, 4)?;
  
// The order of the returned clusters is nondeterministic.
for e in expected {
    assert!(results.contains(&e));
}