mailidator 0.1.0

A lightweight Rust library for checking email address misspellings
Documentation
//! Implementation of the Sift3 string distance algorithm
//!
//! Sift3 is a fast and accurate string distance algorithm that provides
//! a good balance between performance and accuracy for fuzzy string matching.

/// Calculate the Sift3 distance between two strings
///
/// Returns a normalized distance value between 0.0 and 1.0, where:
/// - 0.0 means strings are identical
/// - 1.0 means strings are completely different
///
/// # Examples
///
/// ```rust
/// use mailidator::sift3;
///
/// let dist = sift3::distance("gmail", "gmaik");
/// assert!(dist > 0.0 && dist < 1.0);
/// ```
pub fn distance(s1: &str, s2: &str) -> f64 {
    // Fast path for identical strings
    if s1 == s2 {
        return 0.0;
    }

    let len1 = s1.len();
    let len2 = s2.len();

    // Fast path for empty strings
    if len1 == 0 && len2 == 0 {
        return 0.0;
    }
    if len1 == 0 || len2 == 0 {
        return 1.0;
    }

    // Early exit for very different lengths
    let max_len = len1.max(len2);
    let min_len = len1.min(len2);
    if max_len > min_len * 3 {
        return 1.0;
    }

    // Use bytes for ASCII domains (email domains are ASCII-only)
    // This is much faster than char iteration for typical email domains
    if s1.is_ascii() && s2.is_ascii() {
        return distance_ascii_fast(s1.as_bytes(), s2.as_bytes());
    }

    // Fallback to Unicode-aware version for non-ASCII
    distance_unicode(s1, s2)
}

/// Fast ASCII-only distance calculation using bytes
#[inline]
fn distance_ascii_fast(s1: &[u8], s2: &[u8]) -> f64 {
    let len1 = s1.len();
    let len2 = s2.len();
    let max_len = len1.max(len2);

    let mut c1 = 0;
    let mut c2 = 0;
    let mut lcss = 0; // longest common subsequence
    let mut local_cs = 0; // local common substring
    const OFFSET_THRESHOLD: usize = 5;

    while c1 < len1 && c2 < len2 {
        if s1[c1] == s2[c2] {
            local_cs += 1;
        } else {
            lcss += local_cs;
            local_cs = 0;

            // Look for the character in a limited range
            if c1 != c2 {
                let start1 = c1.saturating_sub(OFFSET_THRESHOLD);
                let end1 = (c1 + OFFSET_THRESHOLD + 1).min(len1);
                let start2 = c2.saturating_sub(OFFSET_THRESHOLD);
                let end2 = (c2 + OFFSET_THRESHOLD + 1).min(len2);

                // Look for s1[c1] in s2 range
                if let Some(pos) = s2[start2..end2].iter().position(|&b| b == s1[c1]) {
                    c2 = start2 + pos;
                } else if let Some(pos) = s1[start1..end1].iter().position(|&b| b == s2[c2]) {
                    // Look for s2[c2] in s1 range
                    c1 = start1 + pos;
                }
            }
        }

        c1 += 1;
        c2 += 1;
    }

    lcss += local_cs;
    1.0 - (lcss as f64 / max_len as f64)
}

/// Unicode-aware distance calculation (slower fallback)
fn distance_unicode(s1: &str, s2: &str) -> f64 {
    let s1_chars: Vec<char> = s1.chars().collect();
    let s2_chars: Vec<char> = s2.chars().collect();
    let len1 = s1_chars.len();
    let len2 = s2_chars.len();
    let max_len = len1.max(len2);

    let mut c1 = 0;
    let mut c2 = 0;
    let mut lcss = 0;
    let mut local_cs = 0;
    const OFFSET_THRESHOLD: usize = 5;

    while c1 < len1 && c2 < len2 {
        if s1_chars[c1] == s2_chars[c2] {
            local_cs += 1;
        } else {
            lcss += local_cs;
            local_cs = 0;

            if c1 != c2 {
                let start1 = c1.saturating_sub(OFFSET_THRESHOLD);
                let end1 = (c1 + OFFSET_THRESHOLD + 1).min(len1);
                let start2 = c2.saturating_sub(OFFSET_THRESHOLD);
                let end2 = (c2 + OFFSET_THRESHOLD + 1).min(len2);

                // Look for s1[c1] in s2 range
                if let Some(pos) = s2_chars[start2..end2]
                    .iter()
                    .position(|&ch| ch == s1_chars[c1])
                {
                    c2 = start2 + pos;
                } else if let Some(pos) = s1_chars[start1..end1]
                    .iter()
                    .position(|&ch| ch == s2_chars[c2])
                {
                    // Look for s2[c2] in s1 range
                    c1 = start1 + pos;
                }
            }
        }

        c1 += 1;
        c2 += 1;
    }

    lcss += local_cs;
    1.0 - (lcss as f64 / max_len as f64)
}

#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn test_identical_strings() {
        assert_eq!(distance("hello", "hello"), 0.0);
        assert_eq!(distance("", ""), 0.0);
    }

    #[test]
    fn test_completely_different_strings() {
        let dist = distance("abc", "xyz");
        assert!(dist > 0.5); // Should be high distance
    }

    #[test]
    fn test_empty_strings() {
        assert_eq!(distance("", "hello"), 1.0);
        assert_eq!(distance("hello", ""), 1.0);
    }

    #[test]
    fn test_similar_strings() {
        let dist = distance("gmail", "gmaik");
        assert!(dist > 0.0 && dist < 0.5); // Should detect similarity

        let dist = distance("yahoo", "yaho");
        assert!(dist > 0.0 && dist < 0.5); // Should detect similarity
    }

    #[test]
    fn test_case_sensitivity() {
        let dist = distance("Gmail", "gmail");
        assert!(dist > 0.0); // Different cases should have some distance
    }

    #[test]
    fn test_substring() {
        let dist = distance("test", "testing");
        assert!(dist > 0.0 && dist < 1.0);
    }
}