// American Soundex phonetic encoding.
extern crate itertools;
use itertools::Itertools;
use std::ascii::AsciiExt;
/// Returns the American Soundex digit for an upper-case ASCII letter.
///
/// The coded consonants map to the ASCII digits `b'1'` through `b'6'`.
/// Every other byte (vowels, `H`, `W`, `Y`, lower-case letters, digits,
/// punctuation, ...) has no code and yields `None`.
fn american_soundex_code(c: u8) -> Option<u8> {
    match c {
        b'B' | b'F' | b'P' | b'V' => Some(b'1'),
        b'C' | b'G' | b'J' | b'K' | b'Q' | b'S' | b'X' | b'Z' => Some(b'2'),
        b'D' | b'T' => Some(b'3'),
        b'L' => Some(b'4'),
        b'M' | b'N' => Some(b'5'),
        b'R' => Some(b'6'),
        _ => None,
    }
}

/// Computes the four-character American Soundex key of `s`.
///
/// * Non-ASCII characters are ignored; ASCII letters are upper-cased first.
/// * The first remaining character is copied to the key as-is, even when it
///   is not a letter (so `"007bond"` yields `"0153"`).
/// * After the first character, `H` and `W` are dropped entirely, so equal
///   codes on either side of them collapse into a single digit.
/// * Consecutive characters with the same code contribute one digit; any
///   other uncoded character (vowels, digits, spaces, ...) separates runs.
/// * The key is truncated to four characters or right-padded with `'0'`;
///   input without any ASCII character yields `"0000"`.
pub fn american_soundex(s: &str) -> String {
    const KEY_LEN: usize = 4;

    // Every byte of a multi-byte UTF-8 sequence has its high bit set, so
    // filtering bytes keeps exactly the ASCII characters of `s`.
    let mut symbols = s
        .bytes()
        .filter(|b| b.is_ascii())
        .map(|b| b.to_ascii_uppercase());

    // Only ASCII is ever pushed, so `key.len()` equals the character count.
    let mut key = String::with_capacity(KEY_LEN);

    if let Some(first) = symbols.next() {
        key.push(char::from(first));

        // Code of the previously seen symbol; a symbol with the same code
        // belongs to the same run and must not emit another digit.
        let mut previous = american_soundex_code(first);

        for symbol in symbols.filter(|&b| b != b'H' && b != b'W') {
            if key.len() == KEY_LEN {
                break;
            }
            let code = american_soundex_code(symbol);
            if code != previous {
                if let Some(digit) = code {
                    key.push(char::from(digit));
                }
                previous = code;
            }
        }
    }

    while key.len() < KEY_LEN {
        key.push('0');
    }
    key
}
#[cfg(test)]
mod tests {
    use super::*;

    /// Checks `american_soundex` against a table of `(input, expected key)`
    /// pairs covering empty input, a leading digit, adjacent equal codes,
    /// `H` between equal codes, padding, and a trailing non-ASCII character.
    #[test]
    fn american_soundex_correct() {
        const CASES: [(&str, &str); 12] = [
            ("", "0000"),
            ("007bond", "0153"),
            ("Ashcraft", "A261"),
            ("Pfister", "P236"),
            ("Robert", "R163"),
            ("Rubin", "R150"),
            ("Rupert", "R163"),
            ("Toto", "T300"),
            ("Tymczak", "T522"),
            ("husobee", "H210"),
            ("touchstone", "T235"),
            ("heart ❤", "H630"),
        ];

        for &(input, expected) in CASES.iter() {
            let actual = american_soundex(input);
            assert_eq!(actual, String::from(expected));
        }
    }
}