gecliht 0.2.0

A disparate collection of text manipulation and formatting algorithms.
Documentation
//! A collection of algorithms for measuring the distance between two strings.
//!
//! These metrics are based on the number of changes needed to make the sequence
//! of characters in one string match that in the other. Different metrics use 
//! different operations.
//!

use std::cmp;
use std::collections::HashSet;
use std::iter::FromIterator;

use super::ngrams::word_ngrams;

/// Returns a count of the number of different characters in the two words,
/// assuming they are the same size.
///
/// # Examples
///
/// ```
/// assert_eq!(gecliht::hamming_distance("This string", "that strong"), 
///            Ok(4));
/// assert_eq!(gecliht::hamming_distance("This string", "and that strong"), 
///            Err("The words are not the same size"));
/// ```
///
/// # Errors
///
/// An error is raised if the two words are not of equal length.
///
pub fn hamming_distance (word1: &str, word2: &str) -> Result<usize, &'static str> {
    let chars1: Vec<char> = word1.chars().collect();
    let chars2: Vec<char> = word2.chars().collect();

    if chars1.len() != chars2.len() {
        return Err("The words are not the same size");
    }

    let count = chars1.iter()
        .zip(chars2.iter())
        .filter(|(c1, c2)| c1 != c2)
        .count();

    Ok(count)
}

/// Returns the minimum number of single-character edits (insertions, deletions 
/// or substitutions) required to change one word into the other.
///
/// # Example
///
/// ```
/// assert_eq!(gecliht::levenshtein_distance("Sunday", "saturday"), 4);
/// assert_eq!(gecliht::levenshtein_distance("cast", "cats"), 2);
/// ```
///
pub fn levenshtein_distance (word1: &str, word2: &str) -> usize {
    compute_seq_distance(word1.chars().collect(), word2.chars().collect(), false)
}

/// Returns the minimum number of single-character edits (insertions, deletions, 
/// substitutions or transpositions) required to change one word into the other.
///
/// Also known as the Damerau-Levenshtein distance.
///
/// # Example
///
/// ```
/// assert_eq!(gecliht::optimal_string_alignment_distance("kitten", "sitting"), 3);
/// assert_eq!(gecliht::optimal_string_alignment_distance("cast", "cats"), 1);
/// ```
///
pub fn optimal_string_alignment_distance (word1: &str, word2: &str) -> usize {
    compute_seq_distance(word1.chars().collect(), word2.chars().collect(), true)
}

// Depending on settings of osa, will compute the levenshtein or the optimal 
// alignment of the two given vectors.
fn compute_seq_distance<T> (items1: Vec<T>, items2: Vec<T>, osa: bool) -> usize
where T: std::cmp::Eq {
    if items1 == items2 { 
        0
    } else if items1.is_empty() {
        items2.len()
    } else if items2.is_empty() { 
        items1.len()
    } else {
        let mut curr1: Vec<usize> = (0..=items2.len()).collect();
        let mut curr2 = vec![0; items2.len()+1];
        for i in 0..items1.len() {
            let mut curr = vec![0; curr1.len()];
            // compute current row values from the previous row
            curr[0] = i+1;
            for j in 0..items2.len() {
                let cost = if items1[i] == items2[j] { 0 } else { 1 };
                curr[j+1] = cmp::min(1+curr[j], cmp::min(1+curr1[j+1], cost+curr1[j]));
                // transpositions also counted for optimal-string alignment 
                if osa && i>1 && j>1 && items1[i] == items2[j-1] && items1[i-1] == items2[j] {
                    curr[j+1] = cmp::min(curr[j+1], cost+curr2[j-1]);
                }
            }
            curr2 = curr1;
            curr1 = curr;
        }
        curr1.pop().unwrap_or_else(|| 0) // return last element of curr1
    }
}

/// Returns the sorenson_dice_index measure based on 2-grams of letters in the 
/// two words.
///
/// # Example
///
/// ```
/// let measure = gecliht::sorenson_dice_similarity("sympathize", "sympthise");
///
/// assert!((measure-0.5882).abs() < 0.001);
/// ```
///
pub fn sorenson_dice_similarity (word1: &str, word2: &str) -> f64 {
    sorenson_dice_similarity_n(word1, word2, 2)
}

/// Returns the sorenson_dice_index measure based on n-grams of letters in the 
/// two words.
///
/// # Example
///
/// ```
/// let measure_2 = gecliht::sorenson_dice_similarity_n("sympathize", "sympthise", 2);
/// let measure_3 = gecliht::sorenson_dice_similarity_n("sympathize", "sympthise", 3);
/// let measure_4 = gecliht::sorenson_dice_similarity_n("sympathize", "sympthise", 4);
///
/// assert!((measure_2-0.5882).abs() < 0.001);
/// assert!((measure_3-0.4000).abs() < 0.001);
/// assert!((measure_4-0.1538).abs() < 0.001);
/// ```
///
pub fn sorenson_dice_similarity_n (word1: &str, word2: &str, n: usize) -> f64 {
    let ngrams1: HashSet<_> = HashSet::from_iter(word_ngrams(word1, n));
    let ngrams2: HashSet<_> = HashSet::from_iter(word_ngrams(word2, n));

    if ngrams1.is_empty() || ngrams2.is_empty() {
        1.0
    } else {
        let total_size = (ngrams1.len() + ngrams2.len()) as f64;
        let intersection: HashSet<_> = ngrams1.intersection(&ngrams2).collect();
        let intersection_size = intersection.len() as f64;

        (2.0 * intersection_size) / total_size
    }
}

#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn test_hamming_distance () {
        assert_eq!(Ok(0), hamming_distance("", ""));
        assert_eq!(Ok(0), hamming_distance("string", "string"));
        assert_eq!(Ok(3), hamming_distance("this string", "that strong"));
        assert_eq!(Ok(4), hamming_distance("This string", "that strong"));
        assert_eq!(Err("The words are not the same size"), hamming_distance("no", "not"));
    }

    #[test]
    fn test_levenshtein_distance () {
        assert_eq!(0, levenshtein_distance("sitting", "sitting"));
        assert_eq!(7, levenshtein_distance("sitting", ""));
        assert_eq!(6, levenshtein_distance("", "kitten"));
        assert_eq!(3, levenshtein_distance("sitting", "kitten"));
        assert_eq!(3, levenshtein_distance("Sunday", "Saturday"));
        assert_eq!(4, levenshtein_distance("Sunday", "saturday"));
        assert_eq!(2, levenshtein_distance("cast", "cats"));
    }

    #[test]
    fn test_optimal_string_alignment_distance () {
        assert_eq!(3, optimal_string_alignment_distance("kitten", "sitting"));
        assert_eq!(3, optimal_string_alignment_distance("this string", "that strnig"));
        assert_eq!(3, optimal_string_alignment_distance("this string", "that strnig"));
        assert_eq!(1, optimal_string_alignment_distance("cast", "cats"));
    }

    fn test_approx_same (num1: f64, num2: f64, eps: f64) {
        assert!((num1-num2).abs() < eps);
    }

    #[test]
    fn test_sorenson_dice_similarity () {
        test_approx_same(0.8, sorenson_dice_similarity("Healed", "Sealed"), 0.01);
        test_approx_same(0.55, sorenson_dice_similarity("Healed", "Healthy"), 0.01);
        test_approx_same(0.44, sorenson_dice_similarity("Healed", "Heard"), 0.01);
        test_approx_same(0.40, sorenson_dice_similarity("Healed", "Herded"), 0.01);
        test_approx_same(0.25, sorenson_dice_similarity("Healed", "Help"), 0.01);
        test_approx_same(0.0, sorenson_dice_similarity("Healed", "Sold"), 0.01);
        test_approx_same(1.0, sorenson_dice_similarity("abcde fghi", "abcde fghi"), 0.01);
        test_approx_same(0.5882, sorenson_dice_similarity("sympathize", "sympthise"), 0.01);
        test_approx_same(0.5882, sorenson_dice_similarity_n("sympathize", "sympthise", 2), 0.01);
        test_approx_same(0.4, sorenson_dice_similarity_n("sympathize", "sympthise", 3), 0.01);
        test_approx_same(0.1538, sorenson_dice_similarity_n("sympathize", "sympthise", 4), 0.01);
    }
}