use alloc::string::String;
use alloc::vec::Vec;
/// Selects which Thai phonetic encoder [`soundex`] and [`sounds_like`] use.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum SoundexAlgorithm {
/// Encode with [`lk82`] (fixed four-character code).
Lk82,
/// Encode with [`udom83`] (fixed four-character code).
Udom83,
/// Encode with [`metasound`] (three characters per recognised syllable).
MetaSound,
}
pub fn soundex(word: &str, algo: SoundexAlgorithm) -> String {
match algo {
SoundexAlgorithm::Lk82 => lk82(word),
SoundexAlgorithm::Udom83 => udom83(word),
SoundexAlgorithm::MetaSound => metasound(word),
}
}
/// Reports whether `a` and `b` receive the same phonetic code under `algo`.
///
/// Returns `false` when either input is empty, and also when the code of `a`
/// consists solely of `'0'` characters (the value the encoders produce when
/// they find nothing to encode), so two unencodable inputs never "match".
pub fn sounds_like(a: &str, b: &str, algo: SoundexAlgorithm) -> bool {
    if a.is_empty() || b.is_empty() {
        return false;
    }

    let code_a = soundex(a, algo);
    // An all-zero code carries no phonetic information.
    let has_signal = code_a.bytes().any(|byte| byte != b'0');
    if !has_signal {
        return false;
    }

    code_a == soundex(b, algo)
}
/// Computes the four-character LK82-style code of `word`.
///
/// Only Thai consonants contribute; each is mapped through the LK82 class
/// table and the shared encoder collapses adjacent repeats and pads the
/// result with `'0'` to four characters.
pub fn lk82(word: &str) -> String {
    let class_table: fn(char) -> u8 = lk82_code;
    encode(word, class_table)
}
/// Maps a Thai consonant to its LK82 class as an ASCII byte (`b'0'`..=`b'B'`).
///
/// Consonants that share a class are treated as sounding alike. Anything not
/// listed (including `อ`) maps to `b'0'`.
fn lk82_code(c: char) -> u8 {
    match c {
        'อ' => b'0',
        // The obsolete velars ฃ and ฅ sound like ข and ค, so they belong to
        // the velar class (the cross-language table in this file does the same).
        'ก' | 'ข' | 'ฃ' | 'ค' | 'ฅ' | 'ฆ' => b'1',
        'จ' | 'ช' | 'ซ' | 'ศ' | 'ษ' | 'ส' | 'ฉ' | 'ฌ' | 'ญ' => b'2',
        'ต' | 'ถ' | 'ท' | 'ธ' | 'ฏ' | 'ฐ' | 'ฑ' | 'ฒ' | 'ด' | 'ฎ' => b'3',
        'บ' | 'ป' | 'พ' | 'ผ' | 'ภ' | 'ฝ' | 'ฟ' => b'4',
        'ม' => b'5',
        'น' | 'ณ' => b'6',
        'ง' => b'7',
        'ล' | 'ร' | 'ฬ' => b'8',
        'ว' => b'9',
        'ย' => b'A',
        'ห' | 'ฮ' => b'B',
        _ => b'0',
    }
}
/// Computes the four-character Udom83-style code of `word`.
///
/// Works like [`lk82`] but with a finer class table: sibilants are kept apart
/// from affricates and `ร` is kept apart from `ล`/`ฬ`.
pub fn udom83(word: &str) -> String {
    let class_table: fn(char) -> u8 = udom83_code;
    encode(word, class_table)
}
/// Maps a Thai consonant to its Udom83 class as an ASCII byte (`b'0'`..=`b'D'`).
///
/// Anything not listed (including `อ`) maps to `b'0'`.
fn udom83_code(c: char) -> u8 {
    match c {
        'อ' => b'0',
        // The obsolete velars ฃ and ฅ sound like ข and ค, so they belong to
        // the velar class (the cross-language table in this file does the same).
        'ก' | 'ข' | 'ฃ' | 'ค' | 'ฅ' | 'ฆ' => b'1',
        'จ' | 'ช' | 'ฉ' | 'ฌ' => b'2',
        'ซ' | 'ศ' | 'ษ' | 'ส' => b'3',
        'ต' | 'ถ' | 'ท' | 'ธ' | 'ฏ' | 'ฐ' | 'ฑ' | 'ฒ' | 'ด' | 'ฎ' => b'4',
        'บ' | 'ป' | 'พ' | 'ผ' | 'ภ' | 'ฝ' | 'ฟ' => b'5',
        'ม' => b'6',
        'น' | 'ณ' | 'ญ' => b'7',
        'ง' => b'8',
        'ล' | 'ฬ' => b'9',
        'ร' => b'A',
        'ว' => b'B',
        'ย' => b'C',
        'ห' | 'ฮ' => b'D',
        _ => b'0',
    }
}
/// Encodes `word` syllable by syllable.
///
/// For every syllable the scanner recognises it appends three ASCII
/// characters: the initial-consonant class (`ms_initial_code`), the vowel
/// class (`ms_vowel_code`) and the final-consonant class (`ms_final_code`).
/// Characters that cannot start a syllable are skipped. If nothing was
/// encoded the result is the literal `"000"`.
pub fn metasound(word: &str) -> String {
let chars: Vec<char> = word.chars().collect();
let len = chars.len();
let mut result = String::new();
let mut i = 0;
while i < len {
// Optional leading vowel (U+0E40..=U+0E44), written before its consonant.
let lead = if is_ms_lead(chars[i]) {
let v = chars[i];
i += 1;
Some(v)
} else {
None
};
// No consonant to anchor a syllable here. If a leading vowel was consumed
// the cursor already moved; otherwise step over the current character so
// the loop always makes progress.
if i >= len || !is_thai_consonant(chars[i]) {
if lead.is_none() {
i += 1; }
continue;
}
let initial = chars[i];
i += 1;
// Consonant directly followed by U+0E4C (thanthakhat): skip both and emit
// nothing for it (a consumed leading vowel is discarded as well).
if i < len && chars[i] == '\u{0E4C}' {
i += 1;
continue;
}
// Consume the marks attached to the initial consonant: the last vowel mark
// seen is kept, tone marks are ignored, U+0E4D (nikhahit) sets a flag.
let mut upper: Option<char> = None;
let mut nikhahit = false;
while i < len {
match chars[i] {
c if is_ms_upper(c) => {
upper = Some(c);
i += 1;
}
c if is_ms_tone(c) => {
i += 1;
}
'\u{0E4D}' => {
nikhahit = true;
i += 1;
}
_ => break,
}
}
// Optional vowel written after the consonant (U+0E30, U+0E32 or U+0E33).
let follow = if i < len && is_ms_follow(chars[i]) {
let v = chars[i];
i += 1;
Some(v)
} else {
None
};
// Decide whether the next consonant closes this syllable.
let final_c = if i < len && is_thai_consonant(chars[i]) {
let next = i + 1;
if next < len && chars[next] == '\u{0E4C}' {
// Consonant + thanthakhat: consume both, the syllable has no final.
i += 2;
None
} else if next < len
&& (is_ms_upper(chars[next])
|| is_ms_follow(chars[next])
|| is_ms_lead(chars[next]))
{
// The consonant is followed by a vowel sign, so it is left unconsumed and
// becomes the initial of the next iteration.
// NOTE(review): this also applies when the following sign is a *leading*
// vowel; the consonant is then encoded as a syllable of its own rather
// than as this syllable's final — confirm that this is intended.
None
} else {
let fc = chars[i];
i += 1;
Some(fc)
}
} else {
None
};
// The class codes are ASCII bytes, so `as char` is a plain widening.
result.push(ms_initial_code(initial) as char);
result.push(ms_vowel_code(lead, upper, follow, nikhahit) as char);
result.push(ms_final_code(final_c) as char);
}
// Fallback for input in which no syllable was recognised.
if result.is_empty() {
"000".into()
} else {
result
}
}
/// Maps a syllable-initial consonant to its MetaSound class (`b'1'`..=`b'J'`).
///
/// Every character that is not listed, including `อ`, falls into the
/// catch-all class `b'J'`.
fn ms_initial_code(c: char) -> u8 {
    match c {
        // The obsolete velars ฃ and ฅ sound like ข and ค, so they belong to
        // the velar class (the cross-language table in this file does the same).
        'ก' | 'ข' | 'ฃ' | 'ค' | 'ฅ' | 'ฆ' => b'1',
        'ง' => b'2',
        'จ' | 'ช' | 'ฉ' | 'ฌ' => b'3',
        'ซ' | 'ศ' | 'ษ' | 'ส' => b'4',
        'ญ' | 'ย' => b'5',
        'ฎ' | 'ด' => b'6',
        'ฏ' | 'ต' => b'7',
        'ฐ' | 'ฑ' | 'ฒ' | 'ถ' | 'ท' | 'ธ' => b'8',
        'น' | 'ณ' => b'9',
        'บ' => b'A',
        'ป' => b'B',
        'ผ' | 'พ' | 'ภ' => b'C',
        'ฝ' | 'ฟ' => b'D',
        'ม' => b'E',
        'ร' => b'F',
        'ล' | 'ฬ' => b'G',
        'ว' => b'H',
        'ห' | 'ฮ' => b'I',
        _ => b'J',
    }
}
/// Chooses the MetaSound vowel class for one syllable.
///
/// Precedence, highest first:
/// 1. `nikhahit` set -> `b'D'`.
/// 2. A recognised leading vowel: `ไ`/`ใ` -> `b'E'`; `เ` -> `b'F'` when
///    followed by U+0E32, otherwise `b'8'`; `แ` -> `b'8'`; `โ` -> `b'9'`.
/// 3. A recognised vowel mark on the consonant (U+0E31, U+0E34..=U+0E39).
/// 4. The following vowel: U+0E32 -> `b'1'`, U+0E33 -> `b'D'`.
/// 5. Everything else -> `b'0'`.
fn ms_vowel_code(
    lead: Option<char>,
    upper: Option<char>,
    follow: Option<char>,
    nikhahit: bool,
) -> u8 {
    if nikhahit {
        return b'D';
    }

    // Leading vowels win over every mark on or after the consonant.
    if let Some(leading) = lead {
        match leading {
            'ไ' | 'ใ' => return b'E',
            'เ' => {
                return if follow == Some('\u{0E32}') {
                    b'F'
                } else {
                    b'8'
                };
            }
            'แ' => return b'8',
            'โ' => return b'9',
            _ => {}
        }
    }

    // Vowel marks attached to the consonant; unlisted marks fall through.
    match upper {
        Some('\u{0E31}') => return b'0',
        Some('\u{0E34}') => return b'2',
        Some('\u{0E35}') => return b'3',
        Some('\u{0E36}') => return b'4',
        Some('\u{0E37}') => return b'5',
        Some('\u{0E38}') => return b'6',
        Some('\u{0E39}') => return b'7',
        _ => {}
    }

    // Finally the vowel written after the consonant, if any.
    match follow {
        Some('\u{0E32}') => b'1',
        Some('\u{0E33}') => b'D',
        _ => b'0',
    }
}
/// Maps an optional syllable-final consonant to its MetaSound class.
///
/// `ก` -> `b'1'`; `น ณ ญ ร ล ฬ` -> `b'2'`; `ม` -> `b'3'`; `ง` -> `b'4'`;
/// `ย ว` -> `b'5'`. A missing final and every other character share the
/// class `b'6'`.
fn ms_final_code(c: Option<char>) -> u8 {
    const OTHER_OR_NONE: u8 = b'6';
    c.map_or(OTHER_OR_NONE, |coda| match coda {
        'ก' => b'1',
        'น' | 'ณ' | 'ญ' | 'ร' | 'ล' | 'ฬ' => b'2',
        'ม' => b'3',
        'ง' => b'4',
        'ย' | 'ว' => b'5',
        _ => OTHER_OR_NONE,
    })
}
/// Returns `true` for the leading vowels U+0E40..=U+0E44 (`เ แ โ ใ ไ`).
fn is_ms_lead(c: char) -> bool {
    ('\u{0E40}'..='\u{0E44}').contains(&c)
}
/// Returns `true` for the vowel signs attached to a consonant that the
/// MetaSound scanner consumes: U+0E31 and U+0E34..=U+0E3A.
fn is_ms_upper(c: char) -> bool {
    matches!(c, '\u{0E31}' | '\u{0E34}'..='\u{0E3A}')
}
/// Returns `true` for the vowels written after a consonant that the MetaSound
/// scanner recognises: U+0E30, U+0E32 and U+0E33.
fn is_ms_follow(c: char) -> bool {
    const FOLLOWING_VOWELS: [char; 3] = ['\u{0E30}', '\u{0E32}', '\u{0E33}'];
    FOLLOWING_VOWELS.contains(&c)
}
/// Returns `true` for the four tone marks U+0E48..=U+0E4B.
fn is_ms_tone(c: char) -> bool {
    ('\u{0E48}'..='\u{0E4B}').contains(&c)
}
/// Removes letters silenced by thanthakhat (U+0E4C) from `s`.
///
/// Two spellings are recognised:
/// * any character immediately followed by thanthakhat (e.g. `ณ์`): both
///   characters are dropped;
/// * a consonant carrying sara i (U+0E34) or sara u (U+0E38) that is then
///   followed by thanthakhat (e.g. `ธิ์`, `ธุ์`): all three are dropped.
///   Without this case only the vowel mark would be removed and the silent
///   consonant would still be encoded.
///
/// Everything else is copied through unchanged.
fn strip_silent(s: &str) -> String {
    const THANTHAKHAT: char = '\u{0E4C}';

    let chars: Vec<char> = s.chars().collect();
    let mut out = String::with_capacity(s.len());
    let mut i = 0;
    while i < chars.len() {
        let next = chars.get(i + 1).copied();

        // Character directly under the silencing mark.
        if next == Some(THANTHAKHAT) {
            i += 2;
            continue;
        }

        // Consonant + vowel mark + silencing mark: the whole cluster is mute.
        // The consonant range is the same U+0E01..=U+0E2E block used
        // throughout this file.
        let silenced_with_vowel = matches!(chars[i], '\u{0E01}'..='\u{0E2E}')
            && matches!(next, Some('\u{0E34}') | Some('\u{0E38}'))
            && chars.get(i + 2).copied() == Some(THANTHAKHAT);
        if silenced_with_vowel {
            i += 3;
            continue;
        }

        out.push(chars[i]);
        i += 1;
    }
    out
}
/// Returns `true` when `c` lies in the Thai letter range U+0E01..=U+0E2E
/// (`ก` through `ฮ`).
fn is_thai_consonant(c: char) -> bool {
    matches!(c, '\u{0E01}'..='\u{0E2E}')
}
/// Shared driver for the fixed-length consonant encoders.
///
/// Steps:
/// 1. drop thanthakhat-silenced letters via `strip_silent`;
/// 2. keep only Thai consonants and map each through `code_fn`;
/// 3. collapse runs of identical adjacent codes;
/// 4. stop after four codes, or pad with `b'0'` up to four.
///
/// # Panics
///
/// Panics if `code_fn` yields bytes that are not valid UTF-8; the tables in
/// this file only return ASCII.
fn encode(word: &str, code_fn: fn(char) -> u8) -> String {
    const LEN: usize = 4;

    let cleaned = strip_silent(word);
    let mut codes: Vec<u8> = Vec::with_capacity(LEN);

    let consonant_codes = cleaned
        .chars()
        .filter(|&ch| is_thai_consonant(ch))
        .map(code_fn);
    for code in consonant_codes {
        // The last emitted code doubles as the "previous class" marker.
        if codes.last() != Some(&code) {
            codes.push(code);
        }
        if codes.len() == LEN {
            break;
        }
    }

    // Never longer than LEN here, so this only ever pads.
    codes.resize(LEN, b'0');
    String::from_utf8(codes).expect("soundex codes are ASCII")
}
/// Produces a variable-length digit code that is comparable between Thai and
/// English spellings.
///
/// * Thai vowel signs, tone marks and other marks matched by `is_cl_skip`
///   are ignored.
/// * A Thai consonant silenced by thanthakhat (U+0E4C) is ignored, both when
///   the mark follows directly (`ด์`) and when the consonant also carries
///   sara i / sara u (`ธิ์`, `ธุ์`).
/// * Every remaining ASCII letter or Thai consonant is mapped through
///   `cl_code`; the first encoded letter uses the "first position" table.
/// * A digit equal to the previously emitted digit is not repeated.
///
/// Returns an empty string when nothing in `word` can be encoded.
pub fn thai_english_soundex(word: &str) -> String {
    const THANTHAKHAT: char = '\u{0E4C}';

    let chars: Vec<char> = word.chars().collect();
    let len = chars.len();
    let mut result = String::new();
    let mut last_digit: Option<char> = None;
    let mut is_first = true;
    let mut i = 0;
    while i < len {
        let c = chars[i];
        if is_cl_skip(c) {
            i += 1;
            continue;
        }
        if is_thai_consonant(c) {
            // Silent consonant: mark sits directly on it.
            if i + 1 < len && chars[i + 1] == THANTHAKHAT {
                i += 2;
                continue;
            }
            // Silent consonant that also carries sara i / sara u. Without this
            // case the vowel and mark would be skipped but the mute consonant
            // would still be encoded.
            if i + 2 < len
                && matches!(chars[i + 1], '\u{0E34}' | '\u{0E38}')
                && chars[i + 2] == THANTHAKHAT
            {
                i += 3;
                continue;
            }
        }
        if !c.is_ascii_alphabetic() && !is_thai_consonant(c) {
            i += 1;
            continue;
        }
        let code = cl_code(c, is_first);
        // Letters that encode to nothing do not use up the "first" slot.
        if !code.is_empty() {
            is_first = false;
        }
        for digit in code.chars() {
            if Some(digit) != last_digit {
                result.push(digit);
                last_digit = Some(digit);
            }
        }
        i += 1;
    }
    result
}
/// Computes the classic four-character American Soundex code of `word`.
///
/// Non-ASCII-alphabetic characters are ignored and case is folded. The first
/// letter is kept verbatim and followed by up to three digits; the result is
/// padded with `'0'`. Returns an empty string when `word` contains no ASCII
/// letter.
///
/// Rules applied to the letters after the first:
/// * letters with the same digit that are adjacent collapse into one;
/// * vowels (and `Y`) separate letters, so the same digit may repeat across
///   them;
/// * `H` and `W` are transparent: consonants with the same digit on either
///   side of them still collapse (so "Ashcraft" is `A261`, not `A226`).
pub fn english_soundex(word: &str) -> String {
    let mut letters = word
        .chars()
        .filter(|c| c.is_ascii_alphabetic())
        .map(|c| c.to_ascii_uppercase());
    let first = match letters.next() {
        Some(c) => c,
        None => return String::new(),
    };

    let mut code = String::with_capacity(4);
    code.push(first);
    // Digit of the most recent letter that is not H/W ('0' for a vowel).
    let mut last = std_soundex_digit(first);
    for c in letters {
        // H and W neither emit a digit nor break a run of equal digits.
        if matches!(c, 'H' | 'W') {
            continue;
        }
        let digit = std_soundex_digit(c);
        if digit != '0' && digit != last {
            code.push(digit);
            if code.len() == 4 {
                break;
            }
        }
        last = digit;
    }
    while code.len() < 4 {
        code.push('0');
    }
    code
}

/// Maps an upper-case ASCII letter to its Soundex digit; letters that carry
/// no digit (vowels, `H`, `W`, `Y`) and any other character map to `'0'`.
fn std_soundex_digit(c: char) -> char {
    match c {
        'B' | 'F' | 'P' | 'V' => '1',
        'C' | 'G' | 'J' | 'K' | 'Q' | 'S' | 'X' | 'Z' => '2',
        'D' | 'T' => '3',
        'L' => '4',
        'M' | 'N' => '5',
        'R' => '6',
        _ => '0',
    }
}
/// Reports whether `a` and `b` have the same [`thai_english_soundex`] code.
///
/// Returns `false` when `a` yields an empty code, so inputs with nothing to
/// encode never match anything.
pub fn sounds_like_cross_lang(a: &str, b: &str) -> bool {
    let code_a = thai_english_soundex(a);
    if code_a.is_empty() {
        return false;
    }
    code_a == thai_english_soundex(b)
}
/// Maps one ASCII letter or Thai consonant to its cross-language digit(s).
///
/// Most classes are position independent. Vowel-like and glide letters are
/// the exception: as the first encoded letter they all yield `"0"`, later in
/// the word they yield their own digit (or nothing, for `อ`). `ง` always
/// yields the two digits `"52"`. Characters outside both tables yield `""`.
fn cl_code(c: char, is_first: bool) -> &'static str {
    // Selects between the "first letter" and "later letter" value of a class.
    let by_position = |leading: &'static str, later: &'static str| -> &'static str {
        if is_first {
            leading
        } else {
            later
        }
    };

    // Upper-casing only affects ASCII letters; Thai characters pass through.
    match c.to_ascii_uppercase() {
        // Position-dependent classes.
        'A' | 'E' | 'I' | 'O' | 'U' => by_position("0", "7"),
        'H' | 'ห' | 'ฮ' => by_position("0", "8"),
        'W' | 'ว' => by_position("0", "1"),
        'Y' | 'ญ' | 'ย' => by_position("0", "9"),
        'อ' => by_position("0", ""),

        // Position-independent classes.
        'B' | 'F' | 'P' | 'V' => "1",
        'บ' | 'ป' | 'ผ' | 'พ' | 'ภ' | 'ฝ' | 'ฟ' => "1",
        'C' | 'G' | 'J' | 'K' | 'Q' | 'S' | 'X' | 'Z' => "2",
        'ก' | 'ข' | 'ฃ' | 'ค' | 'ฅ' | 'ฆ' => "2",
        'จ' | 'ฉ' | 'ช' | 'ฌ' => "2",
        'ซ' | 'ศ' | 'ษ' | 'ส' => "2",
        'D' | 'T' => "3",
        'ฎ' | 'ด' | 'ฏ' | 'ต' | 'ฐ' | 'ฑ' | 'ฒ' | 'ถ' | 'ท' | 'ธ' => "3",
        'L' | 'ล' | 'ฬ' => "4",
        'M' | 'N' | 'ม' | 'ณ' | 'น' => "5",
        'ง' => "52",
        'R' | 'ร' => "6",
        _ => "",
    }
}
/// Returns `true` for Thai signs the cross-language encoder ignores:
/// U+0E30..=U+0E3A, U+0E40..=U+0E44 and U+0E47..=U+0E4E.
fn is_cl_skip(c: char) -> bool {
    let within = |low: char, high: char| low <= c && c <= high;
    within('\u{0E30}', '\u{0E3A}')
        || within('\u{0E40}', '\u{0E44}')
        || within('\u{0E47}', '\u{0E4E}')
}
// Unit tests for the phonetic encoders in this file, grouped by encoder:
// LK82, Udom83, MetaSound, the `soundex`/`sounds_like` front ends, the
// English Soundex and finally the cross-language encoder.
#[cfg(test)]
mod tests {
use super::*;
// ---- LK82 ----
// Exact codes: consonant classes in order, zero-padded to four characters.
#[test]
fn lk82_worked_examples() {
assert_eq!(lk82("กาน"), "1600");
assert_eq!(lk82("ขาน"), "1600");
assert_eq!(lk82("คาน"), "1600");
assert_eq!(lk82("บ้าน"), "4600");
assert_eq!(lk82("มาก"), "5100");
assert_eq!(lk82("นาค"), "6100");
assert_eq!(lk82("กรุงเทพ"), "1873");
}
#[test]
fn lk82_same_initial_velar() {
assert_eq!(lk82("กาน"), lk82("ขาน"));
assert_eq!(lk82("กาน"), lk82("คาน"));
}
#[test]
fn lk82_different_initials() {
assert_ne!(lk82("กาน"), lk82("ปาน"));
assert_ne!(lk82("มาน"), lk82("นาน"));
}
#[test]
fn lk82_always_four_chars() {
assert_eq!(lk82("ก").len(), 4);
assert_eq!(lk82("กรุงเทพมหานคร").len(), 4);
}
// Input without any Thai consonant yields the all-zero code.
#[test]
fn lk82_empty_and_no_thai() {
assert_eq!(lk82(""), "0000");
assert_eq!(lk82("123"), "0000");
assert_eq!(lk82("hello"), "0000");
}
// A consonant marked with thanthakhat does not contribute to the code.
#[test]
fn lk82_strips_silent_consonant() {
assert_eq!(lk82("กรณ์"), lk82("กร"));
}
// Adjacent consonants of the same class are emitted once.
#[test]
fn lk82_deduplicates_adjacent_same_group() {
assert_eq!(lk82("กข"), "1000");
}
// ---- Udom83 ----
#[test]
fn udom83_always_four_chars() {
assert_eq!(udom83("ก").len(), 4);
assert_eq!(udom83("กรุงเทพมหานคร").len(), 4);
}
#[test]
fn udom83_separates_liquids() {
assert_ne!(udom83("ลาน"), udom83("ราน"));
}
#[test]
fn udom83_sibilant_separate_from_affricate() {
assert_ne!(udom83("สาน"), udom83("ชาน"));
assert_eq!(udom83("สาน"), udom83("ซาน"));
}
#[test]
fn udom83_empty_and_no_thai() {
assert_eq!(udom83(""), "0000");
assert_eq!(udom83("abc"), "0000");
}
// ---- MetaSound ----
// Each syllable yields initial, vowel and final class, in that order.
#[test]
fn metasound_worked_examples() {
assert_eq!(metasound("กาน"), "112");
assert_eq!(metasound("ขาน"), "112");
assert_eq!(metasound("กาม"), "113");
}
#[test]
fn metasound_same_initial_group() {
assert_eq!(metasound("กาน"), metasound("ขาน"));
assert_eq!(metasound("กาน"), metasound("คาน"));
}
#[test]
fn metasound_distinguishes_finals() {
assert_ne!(metasound("กาน"), metasound("กาม"));
assert_ne!(metasound("กาน"), metasound("กาง"));
}
#[test]
fn metasound_vowel_length() {
assert_ne!(metasound("กาน"), metasound("กะ"));
}
// The second character of the code is the vowel class.
#[test]
fn metasound_lead_vowel_classes() {
let e_code = metasound("เกน");
assert_eq!(&e_code[1..2], "8");
let ai_code = metasound("ไก");
assert_eq!(&ai_code[1..2], "E");
}
// Input with no recognisable syllable yields the "000" fallback.
#[test]
fn metasound_empty_and_no_thai() {
assert_eq!(metasound(""), "000");
assert_eq!(metasound("abc"), "000");
assert_eq!(metasound("123"), "000");
}
// A syllable without a final consonant ends in class '6'.
#[test]
fn metasound_open_syllable() {
assert_eq!(metasound("กา"), "116");
}
#[test]
fn metasound_sara_am() {
let code = metasound("กำ");
assert_eq!(&code[1..2], "D");
}
// ---- soundex / sounds_like front ends ----
#[test]
fn soundex_dispatches_to_lk82() {
assert_eq!(soundex("กาน", SoundexAlgorithm::Lk82), lk82("กาน"));
}
#[test]
fn soundex_dispatches_to_udom83() {
assert_eq!(soundex("กาน", SoundexAlgorithm::Udom83), udom83("กาน"));
}
#[test]
fn soundex_dispatches_to_metasound() {
assert_eq!(
soundex("กาน", SoundexAlgorithm::MetaSound),
metasound("กาน")
);
}
#[test]
fn sounds_like_lk82_positive() {
assert!(sounds_like("กาน", "ขาน", SoundexAlgorithm::Lk82));
}
#[test]
fn sounds_like_lk82_negative() {
assert!(!sounds_like("กิน", "มิน", SoundexAlgorithm::Lk82));
}
#[test]
fn sounds_like_udom83_splits_liquids() {
assert!(!sounds_like("ลาน", "ราน", SoundexAlgorithm::Udom83));
}
#[test]
fn sounds_like_metasound_positive() {
assert!(sounds_like("กาน", "ขาน", SoundexAlgorithm::MetaSound));
}
#[test]
fn sounds_like_metasound_negative() {
assert!(!sounds_like("กาน", "กาม", SoundexAlgorithm::MetaSound));
}
#[test]
fn sounds_like_empty_returns_false() {
assert!(!sounds_like("", "กาน", SoundexAlgorithm::Lk82));
assert!(!sounds_like("กาน", "", SoundexAlgorithm::Lk82));
}
// ---- English Soundex ----
#[test]
fn english_soundex_standard_examples() {
assert_eq!(english_soundex("Robert"), "R163");
assert_eq!(english_soundex("Rupert"), "R163"); assert_eq!(english_soundex("McDonald"), "M235");
assert_eq!(english_soundex("Smith"), "S530");
assert_eq!(english_soundex("Thompson"), "T512");
}
#[test]
fn english_soundex_always_four_chars() {
assert_eq!(english_soundex("A").len(), 4);
assert_eq!(english_soundex("Robert").len(), 4);
}
// Unlike the Thai encoders, input without letters yields an empty string.
#[test]
fn english_soundex_empty_and_no_alpha() {
assert_eq!(english_soundex(""), "");
assert_eq!(english_soundex("123"), "");
}
#[test]
fn english_soundex_case_insensitive() {
assert_eq!(english_soundex("robert"), english_soundex("Robert"));
assert_eq!(english_soundex("ROBERT"), english_soundex("Robert"));
}
#[test]
fn english_soundex_vowel_separates_same_code() {
assert_eq!(english_soundex("Abba"), "A100");
assert_eq!(&english_soundex("Ababar")[..2], "A1");
}
#[test]
fn english_soundex_adjacent_same_code_collapsed() {
assert_eq!(english_soundex("Jack"), "J200");
}
// ---- Cross-language encoder ----
// English input is encoded entirely as digits, vowels included.
#[test]
fn thai_english_soundex_english_numeric_codes() {
assert_eq!(thai_english_soundex("Robert"), "671763");
assert_eq!(thai_english_soundex("Rupert"), "671763"); }
#[test]
fn thai_english_soundex_thai_direct_encoding() {
assert_eq!(thai_english_soundex("กน"), "25"); assert_eq!(thai_english_soundex("ร"), "6"); assert_eq!(thai_english_soundex("ก"), "2"); }
#[test]
fn thai_english_soundex_ng_two_digits() {
assert_eq!(thai_english_soundex("ง"), "52");
}
#[test]
fn thai_english_soundex_thai_vowels_skipped_english_vowels_to_7() {
assert_eq!(thai_english_soundex("กิน"), "25"); assert!(thai_english_soundex("Robert").contains('7')); }
// Only the first three digits are compared between the two spellings.
#[test]
fn thai_english_soundex_cross_lang_prefix_match() {
let en = thai_english_soundex("McDonald");
let th = thai_english_soundex("แมคโดนัลด์");
assert!(en.len() >= 3 && th.len() >= 3, "codes too short");
assert_eq!(&en[..3], &th[..3]);
}
#[test]
fn thai_english_soundex_variable_length_and_empty() {
assert_eq!(thai_english_soundex(""), "");
assert_eq!(thai_english_soundex("123"), "");
let long = thai_english_soundex("กรุงเทพมหานคร");
assert!(long.len() > 2);
}
#[test]
fn sounds_like_cross_lang_same_english() {
assert!(sounds_like_cross_lang("Robert", "Rupert"));
}
#[test]
fn sounds_like_cross_lang_same_thai_initial_group() {
assert!(sounds_like_cross_lang("กาน", "คาน"));
}
#[test]
fn sounds_like_cross_lang_different() {
assert!(!sounds_like_cross_lang("Robert", "Smith"));
assert!(!sounds_like_cross_lang("กาน", "บาน")); }
#[test]
fn sounds_like_cross_lang_empty_returns_false() {
assert!(!sounds_like_cross_lang("", "Robert"));
assert!(!sounds_like_cross_lang("Robert", ""));
}
}