/// Computes the Russell Soundex key for `word`.
///
/// The key is the first retained character (upper-cased) followed by three
/// code digits, padded with `'0'` when the word yields fewer than three codes.
/// Characters that are not alphanumeric are ignored. Digits and letters
/// without a Soundex code are kept and pass through `encode` unchanged.
///
/// Returns an empty string when `word` contains no alphanumeric character.
pub fn russell_soundex(word: &str) -> String {
    /// Maps an upper-cased letter to its Soundex digit; any other key is
    /// returned unchanged.
    fn encode(key: &str) -> &str {
        match key {
            "A" | "E" | "H" | "I" | "O" | "U" | "W" | "Y" => "0",
            "B" | "F" | "P" | "V" => "1",
            "C" | "S" | "K" | "G" | "J" | "Q" | "X" | "Z" => "2",
            "D" | "T" => "3",
            "L" => "4",
            "M" | "N" => "5",
            "R" => "6",
            _ => key,
        }
    }

    let mut letters = word
        .chars()
        .filter(|c| c.is_alphanumeric())
        .map(|c| c.to_uppercase().to_string());

    let first = match letters.next() {
        Some(first) => first,
        None => return String::new(),
    };

    // Encode everything after the first character, collapsing runs of the
    // same code as we go (zeros still separate equal codes at this stage).
    let mut codes: Vec<String> = Vec::new();
    for letter in letters {
        let code = encode(&letter);
        if codes.last().map(|c| c.as_str()) != Some(code) {
            codes.push(code.to_string());
        }
    }

    // A code that merely repeats the first letter's own code is dropped.
    // `first()` is `None` for single-character words, which used to panic.
    if codes.first().map(|c| c.as_str()) == Some(encode(&first)) {
        codes.remove(0);
    }

    // Vowel-like letters (code "0") never appear in the key.
    codes.retain(|c| c.as_str() != "0");

    let mut result = first;
    for slot in 0..3 {
        result.push_str(codes.get(slot).map_or("0", |c| c.as_str()));
    }
    result
}
/// Computes the Refined Soundex key for `word`.
///
/// The key starts with the first alphabetic character (upper-cased) and is
/// followed by the code of every letter, the first one included, with runs of
/// identical consecutive codes written only once. The key is neither padded
/// nor truncated. Non-alphabetic characters are ignored, and letters without
/// a code are emitted unchanged.
///
/// Returns an empty string when `word` contains no alphabetic character.
pub fn refined_soundex(word: &str) -> String {
    /// Maps an upper-cased letter to its Refined Soundex digit; any other key
    /// is returned unchanged.
    fn encode(key: &str) -> &str {
        match key {
            "A" | "E" | "H" | "I" | "O" | "U" | "W" | "Y" => "0",
            "B" | "P" => "1",
            "F" | "V" => "2",
            "C" | "S" | "K" => "3",
            "G" | "J" => "4",
            "Q" | "X" | "Z" => "5",
            "D" | "T" => "6",
            "L" => "7",
            "M" | "N" => "8",
            "R" => "9",
            _ => key,
        }
    }

    let letters: Vec<String> = word
        .chars()
        .filter(|c| c.is_alphabetic())
        .map(|c| c.to_uppercase().to_string())
        .collect();

    let initial = match letters.first() {
        Some(initial) => initial,
        None => return String::new(),
    };

    let mut key = String::with_capacity(letters.len() + 1);
    key.push_str(initial);

    // Emit each letter's code, skipping a code equal to the one just before it.
    let mut previous: Option<&str> = None;
    for letter in &letters {
        let code = encode(letter);
        if previous != Some(code) {
            key.push_str(code);
        }
        previous = Some(code);
    }
    key
}
/// Computes a six-digit Daitch-Mokotoff Soundex key for `word`.
///
/// The word is upper-cased and scanned left to right. At each position the
/// first rule in `CODES` whose pattern is a prefix of the remaining text is
/// applied (longer patterns are listed before their shorter prefixes), and
/// the digits it yields are appended to the key. Characters that match no
/// rule are skipped. When a rule lists alternatives, only the first one is
/// used, so a single key is returned rather than every branch.
///
/// The key is always exactly six ASCII digits: it is cut at six and padded
/// with `'0'` when the word yields fewer.
pub fn daitch_mokotoff(word: &str) -> String {
    /// `(pattern, codes at the start of the word, codes before a vowel,
    /// codes in any other position)`. A code of `"-"` means "not coded".
    type Rule = (
        &'static str,
        &'static [&'static str],
        &'static [&'static str],
        &'static [&'static str],
    );

    const CODES: &[Rule] = &[
        ("SCHTSCH", &["2"], &["4"], &["4"]),
        ("SCHTSH", &["2"], &["4"], &["4"]),
        ("SCHTCH", &["2"], &["4"], &["4"]),
        ("SHTCH", &["2"], &["4"], &["4"]),
        ("SHTSH", &["2"], &["4"], &["4"]),
        ("STSCH", &["2"], &["4"], &["4"]),
        ("TTSCH", &["4"], &["4"], &["4"]),
        ("ZHDZH", &["2"], &["4"], &["4"]),
        ("SHCH", &["2"], &["4"], &["4"]),
        ("STCH", &["2"], &["4"], &["4"]),
        ("STRZ", &["2"], &["4"], &["4"]),
        ("STRS", &["2"], &["4"], &["4"]),
        ("STSH", &["2"], &["4"], &["4"]),
        ("SZCZ", &["2"], &["4"], &["4"]),
        ("SZCS", &["2"], &["4"], &["4"]),
        ("TTCH", &["4"], &["4"], &["4"]),
        ("TTSZ", &["4"], &["4"], &["4"]),
        ("ZDZH", &["2"], &["4"], &["4"]),
        ("ZSCH", &["4"], &["4"], &["4"]),
        ("CHS", &["5"], &["54"], &["54"]),
        ("CSZ", &["4"], &["4"], &["4"]),
        ("CZS", &["4"], &["4"], &["4"]),
        ("DRZ", &["4"], &["4"], &["4"]),
        ("DRS", &["4"], &["4"], &["4"]),
        ("DSH", &["4"], &["4"], &["4"]),
        ("DSZ", &["4"], &["4"], &["4"]),
        ("DZH", &["4"], &["4"], &["4"]),
        ("DZS", &["4"], &["4"], &["4"]),
        ("SCH", &["4"], &["4"], &["4"]),
        ("SHT", &["2"], &["43"], &["43"]),
        ("SZT", &["2"], &["43"], &["43"]),
        ("SHD", &["2"], &["43"], &["43"]),
        ("SZD", &["2"], &["43"], &["43"]),
        ("TCH", &["4"], &["4"], &["4"]),
        ("TRZ", &["4"], &["4"], &["4"]),
        ("TRS", &["4"], &["4"], &["4"]),
        ("TSCH", &["4"], &["4"], &["4"]),
        ("TSH", &["4"], &["4"], &["4"]),
        ("TTS", &["4"], &["4"], &["4"]),
        ("TTZ", &["4"], &["4"], &["4"]),
        ("TZS", &["4"], &["4"], &["4"]),
        ("TSZ", &["4"], &["4"], &["4"]),
        ("ZDZ", &["2"], &["4"], &["4"]),
        ("ZHD", &["2"], &["43"], &["43"]),
        ("ZSH", &["4"], &["4"], &["4"]),
        ("AI", &["0"], &["1"], &["-"]),
        ("AJ", &["0"], &["1"], &["-"]),
        ("AY", &["0"], &["1"], &["-"]),
        ("AU", &["0"], &["7"], &["-"]),
        ("CH", &["5", "4"], &["5", "4"], &["5", "4"]),
        ("CK", &["5", "45"], &["5", "45"], &["5", "45"]),
        ("CS", &["4"], &["4"], &["4"]),
        ("CZ", &["4"], &["4"], &["4"]),
        ("DS", &["4"], &["4"], &["4"]),
        ("DT", &["3"], &["3"], &["3"]),
        ("DZ", &["4"], &["4"], &["4"]),
        ("EI", &["0"], &["1"], &["-"]),
        ("EJ", &["0"], &["1"], &["-"]),
        ("EY", &["0"], &["1"], &["-"]),
        ("EU", &["1"], &["1"], &["-"]),
        ("FB", &["7"], &["7"], &["7"]),
        ("IA", &["1"], &["-"], &["-"]),
        ("IE", &["1"], &["-"], &["-"]),
        ("IO", &["1"], &["-"], &["-"]),
        ("IU", &["1"], &["-"], &["-"]),
        ("KS", &["5"], &["54"], &["54"]),
        ("KH", &["5"], &["5"], &["5"]),
        ("MN", &["-"], &["66"], &["66"]),
        ("NM", &["-"], &["66"], &["66"]),
        ("OI", &["0"], &["1"], &["-"]),
        ("OJ", &["0"], &["1"], &["-"]),
        ("OY", &["0"], &["1"], &["-"]),
        ("PF", &["7"], &["7"], &["7"]),
        ("PH", &["7"], &["7"], &["7"]),
        ("RS", &["4", "94"], &["4", "94"], &["4", "94"]),
        ("RZ", &["4", "94"], &["4", "94"], &["4", "94"]),
        ("SC", &["2"], &["4"], &["4"]),
        ("SD", &["2"], &["43"], &["43"]),
        ("SH", &["4"], &["4"], &["4"]),
        ("ST", &["2"], &["43"], &["43"]),
        ("SZ", &["4"], &["4"], &["4"]),
        ("TC", &["4"], &["4"], &["4"]),
        ("TH", &["3"], &["3"], &["3"]),
        ("TS", &["4"], &["4"], &["4"]),
        ("TZ", &["4"], &["4"], &["4"]),
        ("UI", &["0"], &["1"], &["-"]),
        ("UJ", &["0"], &["1"], &["-"]),
        ("UY", &["0"], &["1"], &["-"]),
        ("UE", &["0"], &["-"], &["-"]),
        ("ZD", &["2"], &["43"], &["43"]),
        ("ZH", &["4"], &["4"], &["4"]),
        ("ZS", &["4"], &["4"], &["4"]),
        ("A", &["0"], &["-"], &["-"]),
        // U+0104 LATIN CAPITAL LETTER A WITH OGONEK, written as an escape so
        // the key survives any re-encoding of this file.
        ("\u{104}", &["-"], &["-"], &["6", "-"]),
        ("B", &["7"], &["7"], &["7"]),
        ("C", &["4", "5"], &["4", "5"], &["4", "5"]),
        ("D", &["3"], &["3"], &["3"]),
        // U+0118 LATIN CAPITAL LETTER E WITH OGONEK.
        ("\u{118}", &["-"], &["-"], &["6", "-"]),
        ("E", &["0"], &["-"], &["-"]),
        ("F", &["7"], &["7"], &["7"]),
        ("G", &["5"], &["5"], &["5"]),
        ("H", &["5"], &["5"], &["-"]),
        ("I", &["0"], &["-"], &["-"]),
        ("J", &["1", "4"], &["-", "4"], &["-", "4"]),
        ("K", &["5"], &["5"], &["5"]),
        ("L", &["8"], &["8"], &["8"]),
        ("M", &["6"], &["6"], &["6"]),
        ("N", &["6"], &["6"], &["6"]),
        ("O", &["0"], &["-"], &["-"]),
        ("P", &["7"], &["7"], &["7"]),
        ("Q", &["5"], &["5"], &["5"]),
        ("R", &["9"], &["9"], &["9"]),
        ("S", &["4"], &["4"], &["4"]),
        // U+0162 LATIN CAPITAL LETTER T WITH CEDILLA.
        ("\u{162}", &["3", "4"], &["3", "4"], &["3", "4"]),
        ("T", &["3"], &["3"], &["3"]),
        ("U", &["0"], &["-"], &["-"]),
        ("V", &["7"], &["7"], &["7"]),
        ("W", &["7"], &["7"], &["7"]),
        ("X", &["5"], &["54"], &["54"]),
        ("Y", &["1"], &["-"], &["-"]),
        ("Z", &["4"], &["4"], &["4"]),
    ];

    /// Returns the first rule whose pattern is a prefix of `rest`, if any.
    fn longest_match(rest: &str) -> Option<&'static Rule> {
        CODES.iter().find(|rule| rest.starts_with(rule.0))
    }

    /// Picks the code list of `rule` that applies at byte offset `posn`:
    /// the start-of-word list at offset 0, the before-a-vowel list when the
    /// byte following the pattern is one of `AEIOU`, the default otherwise.
    fn select_codes(
        word: &str,
        posn: usize,
        next_posn: usize,
        rule: &Rule,
    ) -> &'static [&'static str] {
        if posn == 0 {
            rule.1
        } else {
            // `get` yields `None` at the end of the word and when the next
            // character is multi-byte; both count as "not a vowel".
            match word.get(next_posn..next_posn + 1) {
                Some(next) if "AEIOU".contains(next) => rule.2,
                _ => rule.3,
            }
        }
    }

    let word = word.to_uppercase();
    let mut digits = String::with_capacity(8);
    let mut posn = 0;

    while posn < word.len() && digits.len() < 6 {
        let rest = &word[posn..];
        match longest_match(rest) {
            Some(rule) => {
                let next_posn = posn + rule.0.len();
                if let Some(&code) = select_codes(&word, posn, next_posn, rule).first() {
                    if code != "-" {
                        digits.push_str(code);
                    }
                }
                posn = next_posn;
            }
            None => {
                // Skip one whole character, not one byte, so that `posn`
                // always stays on a UTF-8 boundary.
                posn += rest.chars().next().map_or(1, |c| c.len_utf8());
            }
        }
    }

    // A two-digit code can overshoot the limit by one digit.
    digits.truncate(6);
    while digits.len() < 6 {
        digits.push('0');
    }
    digits
}
/// Computes the Metaphone key for `word`.
///
/// The word is upper-cased and stripped of non-alphabetic characters, then:
///
/// 1. doubled letters listed in `DUPLICATES` are collapsed;
/// 2. an initial `X` becomes `S`, initial `WH` becomes `W`, and the first
///    letter of initial `AE`, `GN`, `KN`, `PN` or `WR` is dropped;
/// 3. a trailing `MB` loses its `B`;
/// 4. each remaining letter is translated by the rules in the main loop.
///
/// `'0'` in the key stands for the "th" sound. Letters not covered by any
/// rule (for example non-ASCII letters) contribute nothing. Returns an empty
/// string when `word` has no alphabetic character.
pub fn metaphone(word: &str) -> String {
    const DUPLICATES: &[char] = &[
        'B', 'F', 'H', 'J', 'K', 'L', 'M', 'N', 'P', 'R', 'S', 'T', 'W', 'X', 'Y',
    ];
    const VOWELS: &[char] = &['A', 'E', 'I', 'O', 'U'];

    /// True when the letter before position `i` is `c`.
    fn last_was(chars: &[char], i: usize, c: char) -> bool {
        i > 0 && chars[i - 1] == c
    }
    /// True when the letter after position `i` is `c`.
    fn next_is(chars: &[char], i: usize, c: char) -> bool {
        i + 1 < chars.len() && chars[i + 1] == c
    }
    /// True when the letter after position `i` is `E`, `I` or `Y`.
    fn next_is_eiy(chars: &[char], i: usize) -> bool {
        i + 1 < chars.len() && ['E', 'I', 'Y'].contains(&chars[i + 1])
    }
    /// True when the letter before position `i` is a vowel.
    fn last_was_vowel(chars: &[char], i: usize) -> bool {
        i > 0 && VOWELS.contains(&chars[i - 1])
    }
    /// True when the letter after position `i` is a vowel.
    fn next_is_vowel(chars: &[char], i: usize) -> bool {
        i + 1 < chars.len() && VOWELS.contains(&chars[i + 1])
    }
    /// True when the two letters after position `i` are `c` then `d`.
    fn next_two(chars: &[char], i: usize, c: char, d: char) -> bool {
        i + 2 < chars.len() && chars[i + 1] == c && chars[i + 2] == d
    }

    let mut chars: Vec<char> = word
        .to_uppercase()
        .chars()
        .filter(|c| c.is_alphabetic())
        .collect();

    // Collapse doubled letters, but only those listed in DUPLICATES.
    chars.dedup_by(|a, b| *a == *b && DUPLICATES.contains(&*a));

    if chars.first() == Some(&'X') {
        chars[0] = 'S';
    }

    if chars.len() > 1 {
        match (chars[0], chars[1]) {
            ('W', 'H') => {
                chars.remove(1);
            }
            ('A', 'E') | ('G', 'N') | ('K', 'N') | ('P', 'N') | ('W', 'R') => {
                chars.remove(0);
            }
            _ => {}
        }
    }

    // `ends_with` is false for fewer than two letters; indexing `len() - 2`
    // directly used to panic on empty and single-letter input.
    if chars.ends_with(&['M', 'B']) {
        chars.pop();
    }

    let mut result = String::with_capacity(chars.len());
    let mut i = 0;
    while i < chars.len() {
        let c = chars[i];
        match c {
            // Vowels are only kept in first position.
            'A' | 'E' | 'I' | 'O' | 'U' => {
                if i == 0 {
                    result.push(c);
                }
            }
            'B' | 'F' | 'J' | 'L' | 'M' | 'N' | 'R' => result.push(c),
            'C' => {
                if next_is(&chars, i, 'H') {
                    // "SCH" sounds like K, other "CH" like X.
                    result.push(if last_was(&chars, i, 'S') { 'K' } else { 'X' });
                } else if next_two(&chars, i, 'I', 'A') {
                    result.push('X');
                } else if next_is_eiy(&chars, i) {
                    // Silent in "SCE", "SCI", "SCY".
                    if !last_was(&chars, i, 'S') {
                        result.push('S');
                    }
                } else {
                    result.push('K');
                }
            }
            'D' => {
                if next_is(&chars, i, 'G') && next_is_eiy(&chars, i + 1) {
                    result.push('J');
                } else {
                    result.push('T');
                }
            }
            'G' => {
                // NOTE(review): `next_is_vowel` inspects chars[i + 1], which is
                // the 'H' itself in the second clause, so that clause holds
                // whenever G is followed by H. Confirm whether the letter
                // after the H was meant.
                let silent = next_is(&chars, i, 'G')
                    || (next_is(&chars, i, 'H')
                        && i < chars.len() - 1
                        && !next_is_vowel(&chars, i))
                    || next_is(&chars, i, 'N')
                    || (last_was(&chars, i, 'D') && next_is_eiy(&chars, i));
                if !silent {
                    if !last_was(&chars, i, 'G') && next_is_eiy(&chars, i) {
                        result.push('J');
                    } else {
                        result.push('K');
                    }
                }
            }
            'H' => {
                let after_vowel_only =
                    last_was_vowel(&chars, i) && !next_is_vowel(&chars, i);
                let part_of_digraph =
                    i > 0 && ['C', 'S', 'P', 'T', 'G'].contains(&chars[i - 1]);
                if !(after_vowel_only || part_of_digraph) {
                    result.push('H');
                }
            }
            'K' => {
                // "CK" was already emitted as K by the C rule.
                if !last_was(&chars, i, 'C') {
                    result.push('K');
                }
            }
            'P' => {
                result.push(if next_is(&chars, i, 'H') { 'F' } else { 'P' });
            }
            'Q' => result.push('K'),
            'S' => {
                if next_is(&chars, i, 'H')
                    || next_two(&chars, i, 'I', 'O')
                    || next_two(&chars, i, 'I', 'A')
                {
                    result.push('X');
                } else {
                    result.push('S');
                }
            }
            'T' => {
                if next_two(&chars, i, 'C', 'H') {
                    // Silent in "TCH".
                } else if next_is(&chars, i, 'H') {
                    result.push('0');
                    // Consume the H as part of "TH".
                    i += 1;
                } else if next_two(&chars, i, 'I', 'A') || next_two(&chars, i, 'I', 'O') {
                    result.push('X');
                } else {
                    result.push('T');
                }
            }
            'V' => result.push('F'),
            'W' | 'Y' => {
                if next_is_vowel(&chars, i) {
                    result.push(c);
                }
            }
            // A non-initial X sounds like "KS"; the two letters were
            // previously emitted in the wrong order.
            'X' => result.push_str("KS"),
            'Z' => result.push('S'),
            _ => {}
        }
        i += 1;
    }
    result
}
#[cfg(test)]
mod tests {
    use super::*;

    /// Runs `encode` over every `(input, expected key)` pair and asserts that
    /// the produced key equals the expected one.
    fn check(encode: fn(&str) -> String, cases: &[(&str, &str)]) {
        for &(input, expected) in cases {
            assert_eq!(expected.to_string(), encode(input));
        }
    }

    #[test]
    fn test_russell_soundex() {
        check(
            russell_soundex,
            &[
                ("Ackermann", "A265"),
                ("Azuron", "A265"),
                ("Euler", "E460"),
                ("Ellery", "E460"),
                ("Gauss", "G200"),
                ("Ghosh", "G200"),
                ("Hilbert", "H416"),
                ("Heilbronn", "H416"),
                ("Knuth", "K530"),
                ("Kant", "K530"),
                ("Lloyd", "L300"),
                ("Ladd", "L300"),
                ("Lukasiewicz", "L222"),
                ("Lissajous", "L222"),
                ("SanFrancisco", "S516"),
                ("\"SanFrancisco\"", "S516"),
                ("", ""),
            ],
        );
    }

    #[test]
    fn test_refined_soundex() {
        check(
            refined_soundex,
            &[
                ("Braz", "B1905"),
                ("Broz", "B1905"),
                ("Caren", "C30908"),
                ("Caron", "C30908"),
                ("Carren", "C30908"),
                ("Charon", "C30908"),
                ("Corain", "C30908"),
                ("Coram", "C30908"),
                ("Corran", "C30908"),
                ("Corrin", "C30908"),
                ("Corwin", "C30908"),
                ("Curran", "C30908"),
                ("Curreen", "C30908"),
                ("Currin", "C30908"),
                ("Currom", "C30908"),
                ("Currum", "C30908"),
                ("Curwen", "C30908"),
                ("Hairs", "H093"),
                ("Hark", "H093"),
                ("Hars", "H093"),
                ("Hayers", "H093"),
                ("Heers", "H093"),
                ("Hiers", "H093"),
                ("Lambard", "L7081096"),
                ("Lambart", "L7081096"),
                ("Lambert", "L7081096"),
                ("Lambird", "L7081096"),
                ("Lampaert", "L7081096"),
                ("Lampard", "L7081096"),
                ("Lampart", "L7081096"),
                ("Lamperd", "L7081096"),
                ("Lampert", "L7081096"),
                ("Lamport", "L7081096"),
                ("Limbert", "L7081096"),
                ("Lombard", "L7081096"),
                ("Nolton", "N807608"),
                ("Noulton", "N807608"),
                ("", ""),
            ],
        );
    }

    #[test]
    fn test_daitch_mokotoff() {
        check(
            daitch_mokotoff,
            &[
                ("MANHEIM", "665600"),
                ("MINTZ", "664000"),
                ("TOPF", "370000"),
                ("AUERBACH", "097500"),
                ("OHRBACH", "097500"),
                ("LIPSHITZ", "874400"),
                ("LIPPSZYC", "877440"),
                ("LEWINSKY", "876450"),
                ("lewinsky", "876450"),
                ("LEVINSKI", "876450"),
                ("SZLAMAWICZ", "486740"),
                ("SHLAMOVITZ", "486740"),
            ],
        );
    }

    #[test]
    fn test_metaphone() {
        check(
            metaphone,
            &[
                ("ANASTHA", "ANS0"),
                ("DAVIS-CARTER", "TFSKRTR"),
                ("ESCARMANT", "ESKRMNT"),
                ("MCCALL", "MKKL"),
                ("MCCROREY", "MKKRR"),
                ("MERSEAL", "MRSL"),
                ("PIEURISSAINT", "PRSNT"),
                ("ROTMAN", "RTMN"),
                ("SCHEVEL", "SKFL"),
                ("SCHROM", "SKRM"),
                ("SEAL", "SL"),
                ("SPARR", "SPR"),
                ("STARLEPER", "STRLPR"),
                ("THRASH", "0RX"),
                ("LOGGING", "LKNK"),
                ("LOGIC", "LJK"),
                ("JUDGES", "JJS"),
                ("SHOOS", "XS"),
                ("SHOES", "XS"),
                ("CHUTE", "XT"),
                ("SCHUSS", "SKS"),
                ("OTTO", "OT"),
                ("ERIC", "ERK"),
                ("BUCK", "BK"),
                ("COCK", "KK"),
                ("DAVE", "TF"),
                ("CATHERINE", "K0RN"),
                ("KATHERINE", "K0RN"),
                ("AUBREY", "ABR"),
                ("BRYAN", "BRYN"),
                ("BRYCE", "BRS"),
                ("STEVEN", "STFN"),
                ("RICHARD", "RXRT"),
                ("HEIDI", "HT"),
                ("AUTO", "AT"),
                ("MAURICE", "MRS"),
                ("RANDY", "RNT"),
                ("CAMBRILLO", "KMBRL"),
                ("BRIAN", "BRN"),
                ("RAY", "R"),
                ("GEOFF", "JF"),
                ("BOB", "BB"),
                ("AHA", "AH"),
                ("AAH", "A"),
                ("PAUL", "PL"),
                ("BATTLEY", "BTL"),
                ("WROTE", "RT"),
                ("THIS", "0S"),
            ],
        );
    }
}