/// Computes a four-character Soundex-style phonetic key for a Persian word.
///
/// The key consists of the word's first character, folded through
/// `first_letter_repr`, followed by up to three digits obtained from
/// `phoneme_code` for the remaining characters, right-padded with `'0'`.
///
/// Characters that only affect how the word is written are dropped before
/// encoding: whitespace, the zero-width non-joiner (U+200C), tatweel
/// (U+0640) and the Arabic combining marks (U+064B–U+065F, U+0670). The marks
/// are optional in writing, so a word must produce the same key with or
/// without them.
///
/// Rules applied to every character after the first:
/// * a character coded `'0'` emits nothing but ends the current run, so the
///   same digit may be emitted again afterwards;
/// * a character whose digit equals the previous character's digit is
///   skipped (the first character's own digit seeds this comparison);
/// * encoding stops as soon as the key holds four characters.
///
/// Returns `"0000"` when nothing is left after filtering (empty or
/// whitespace-only input).
#[must_use]
pub fn soundex(word: &str) -> String {
    const KEY_LEN: usize = 4;

    // Written as a `match` rather than `matches!` to avoid visual confusion
    // with the `matches` function defined in this file.
    let is_noise = |c: char| match c {
        '\u{200C}' | '\u{0640}' | '\u{0670}' | '\u{064B}'..='\u{065F}' => true,
        _ => c.is_whitespace(),
    };

    let mut letters = word.chars().filter(|&c| !is_noise(c));
    let first = match letters.next() {
        Some(c) => c,
        None => return String::from("0000"),
    };

    let head = first_letter_repr(first);
    // Capacity is measured in bytes: the head letter may be multi-byte, the
    // three trailing digits are ASCII.
    let mut code = String::with_capacity(head.len_utf8() + (KEY_LEN - 1));
    code.push(head);
    let mut emitted = 1;

    let mut prev_digit = phoneme_code(first);
    for c in letters {
        let digit = phoneme_code(c);
        if digit != '0' && digit != prev_digit {
            code.push(digit);
            emitted += 1;
            if emitted == KEY_LEN {
                break;
            }
        }
        // A '0' here resets the run; otherwise this records the current digit.
        prev_digit = digit;
    }

    for _ in emitted..KEY_LEN {
        code.push('0');
    }
    code
}
/// Reports whether two words are phonetically equivalent, i.e. whether
/// `soundex` produces the same key for both of them.
#[must_use]
pub fn matches(a: &str, b: &str) -> bool {
    let key_a = soundex(a);
    let key_b = soundex(b);
    key_a == key_b
}
/// Folds the leading character of a word onto a canonical representative so
/// that words beginning with letters this scheme treats as interchangeable
/// share the same key prefix.
///
/// Characters without an entry in the table are returned unchanged, which
/// includes non-letters.
fn first_letter_repr(c: char) -> char {
    match c {
        'ص' | 'ث' => 'س',
        'ذ' | 'ض' | 'ظ' => 'ز',
        'ط' => 'ت',
        'غ' => 'ق',
        'ح' => 'ه',
        'ا' | 'آ' | 'إ' | 'أ' => 'ا',
        // Arabic-codepoint look-alikes that frequently appear in Persian text
        // typed on an Arabic keyboard layout: fold them onto the Persian code
        // points so both spellings of a word get the same prefix.
        // U+0643 ARABIC LETTER KAF -> U+06A9 ARABIC LETTER KEHEH
        '\u{0643}' => '\u{06A9}',
        // U+064A ARABIC LETTER YEH / U+0649 ALEF MAKSURA -> U+06CC FARSI YEH
        '\u{064A}' | '\u{0649}' => '\u{06CC}',
        _ => c,
    }
}
/// Maps a character to the digit of the consonant group it belongs to.
///
/// Letters listed on the same arm share a digit and are therefore
/// indistinguishable in the resulting key. Every character not listed
/// (including non-letters) yields `'0'`, which callers treat as
/// "contributes no digit".
fn phoneme_code(c: char) -> char {
    match c {
        'ب' | 'پ' => '1',
        'ف' | 'و' => '2',
        'ت' | 'د' | 'ط' => '3',
        'ث' | 'س' | 'ص' | 'ز' | 'ذ' | 'ض' | 'ظ' | 'ش' | 'ژ' => '4',
        // '\u{0643}' is ARABIC LETTER KAF, the Arabic-keyboard code point for
        // the same consonant as Persian 'ک'; without it that spelling would
        // fall through to '0' and drop the consonant from the key.
        'ج' | 'چ' | 'ک' | '\u{0643}' | 'گ' | 'ق' | 'غ' | 'خ' => '5',
        'ل' => '6',
        'م' | 'ن' => '7',
        'ر' => '8',
        _ => '0',
    }
}
#[cfg(test)]
mod tests {
    use super::*;

    /// Number of characters every key is expected to contain.
    const KEY_CHARS: usize = 4;

    /// Keys have the same length for one-letter, short and long words.
    #[test]
    fn fixed_length() {
        for &word in &["ا", "کتاب", "سپاسگزاری"] {
            assert_eq!(soundex(word).chars().count(), KEY_CHARS);
        }
    }

    /// Spellings that differ only in interchangeable letters share a key.
    #[test]
    fn homophones_same_code() {
        for &(left, right) in &[("صبر", "سبر"), ("ذرت", "زرت")] {
            assert_eq!(soundex(left), soundex(right));
        }
    }

    /// Unrelated words must not collapse onto the same key.
    #[test]
    fn distinct_words_distinct_codes() {
        for &(left, right) in &[("کتاب", "کنار"), ("سفر", "کتاب")] {
            assert_ne!(soundex(left), soundex(right));
        }
    }

    /// The empty string maps to the all-zero key.
    #[test]
    fn empty_input() {
        let key = soundex("");
        assert_eq!(key.as_str(), "0000");
    }

    /// `matches` agrees with comparing the two keys directly.
    #[test]
    fn matches_helper() {
        let same = matches("صبر", "سبر");
        let different = matches("کتاب", "سفر");
        assert!(same);
        assert!(!different);
    }

    /// A one-letter word is padded with zeros up to the full key length.
    #[test]
    fn pads_short_word() {
        let key = soundex("ا");
        assert_eq!(key.chars().count(), KEY_CHARS);
        assert!(key.ends_with("000"));
    }
}