use alloc::string::String;
use alloc::vec::Vec;
/// Returns `true` when `c` is one of the four code points in the inclusive
/// range U+0E48..=U+0E4B (the Thai tone marks mai ek, mai tho, mai tri and
/// mai chattawa).
#[inline]
fn is_tone(c: char) -> bool {
    ('\u{0E48}'..='\u{0E4B}').contains(&c)
}
/// Normalizes common encoding irregularities in Thai text.
///
/// Two rewrites are applied in a single left-to-right pass:
///
/// * A run of consecutive tone marks (U+0E48..=U+0E4B) is collapsed to the
///   **last** mark of the run.
/// * Nikhahit (U+0E4D) immediately followed by sara aa (U+0E32) is composed
///   into the single code point sara am (U+0E33).
///
/// Every other character is copied through unchanged. The result never has
/// more characters (or bytes) than the input; an empty input yields an empty
/// string.
pub fn normalize(text: &str) -> String {
    const NIKHAHIT: char = '\u{0E4D}';
    const SARA_AA: char = '\u{0E32}';
    const SARA_AM: char = '\u{0E33}';

    // Both rewrites only ever shrink the text, so the input byte length is a
    // sufficient upper bound for the output buffer.
    let mut result = String::with_capacity(text.len());
    let mut rest = text.chars().peekable();

    while let Some(current) = rest.next() {
        if is_tone(current) {
            // Swallow the whole run of tone marks, remembering only the final one.
            let mut kept = current;
            while let Some(following) = rest.next_if(|&ch| is_tone(ch)) {
                kept = following;
            }
            result.push(kept);
        } else if current == NIKHAHIT && rest.next_if_eq(&SARA_AA).is_some() {
            // The sara aa was consumed by `next_if_eq`; emit the composed form.
            result.push(SARA_AM);
        } else {
            result.push(current);
        }
    }

    result
}
#[cfg(test)]
mod tests {
    use super::*;

    /// Asserts that `normalize(input)` produces exactly `expected`.
    #[track_caller]
    fn check(input: &str, expected: &str) {
        assert_eq!(normalize(input), expected);
    }

    /// Asserts that `normalize` leaves `input` untouched.
    #[track_caller]
    fn check_unchanged(input: &str) {
        check(input, input);
    }

    // ---- Passthrough behaviour -------------------------------------------

    #[test]
    fn empty_string() {
        check("", "");
    }

    #[test]
    fn ascii_passthrough() {
        check("hello 123", "hello 123");
    }

    #[test]
    fn correctly_encoded_thai_unchanged() {
        check("กินข้าว", "กินข้าว");
        check("สวัสดี", "สวัสดี");
        check("สวัสดีชาวโลก", "สวัสดีชาวโลก");
        check("ธนาคารแห่งนั้น", "ธนาคารแห่งนั้น");
    }

    #[test]
    fn mixed_script_passthrough() {
        check_unchanged("ธนาคาร100แห่ง");
    }

    // ---- Tone-mark runs collapse to the last mark --------------------------

    #[test]
    fn duplicate_same_tone_removed() {
        check("\u{0E01}\u{0E48}\u{0E48}", "\u{0E01}\u{0E48}");
    }

    #[test]
    fn different_tone_keeps_last() {
        check("\u{0E01}\u{0E48}\u{0E49}", "\u{0E01}\u{0E49}");
    }

    #[test]
    fn triple_tone_keeps_last() {
        check("\u{0E01}\u{0E48}\u{0E49}\u{0E4A}", "\u{0E01}\u{0E4A}");
    }

    #[test]
    fn single_tone_unchanged() {
        check_unchanged("\u{0E01}\u{0E49}");
    }

    #[test]
    fn tone_in_real_word() {
        check(
            "\u{0E02}\u{0E49}\u{0E49}\u{0E32}\u{0E27}",
            "\u{0E02}\u{0E49}\u{0E32}\u{0E27}",
        );
    }

    // ---- U+0E4D followed by U+0E32 composes to U+0E33 ----------------------

    #[test]
    fn nikhahit_plus_sara_aa_composed() {
        check("\u{0E4D}\u{0E32}", "\u{0E33}");
    }

    #[test]
    fn sara_am_in_word_context() {
        check("\u{0E01}\u{0E4D}\u{0E32}", "\u{0E01}\u{0E33}");
    }

    #[test]
    fn nikhahit_without_sara_aa_unchanged() {
        check_unchanged("\u{0E01}\u{0E4D}\u{0E01}");
    }

    #[test]
    fn already_sara_am_unchanged() {
        check_unchanged("\u{0E01}\u{0E33}");
    }

    #[test]
    fn nam_decomposed_composes_to_nam() {
        check("\u{0E19}\u{0E49}\u{0E4D}\u{0E32}", "\u{0E19}\u{0E49}\u{0E33}");
    }

    // ---- Size invariant ----------------------------------------------------

    #[test]
    fn output_length_never_exceeds_input_length() {
        let inputs = [
            "กินข้าว",
            "สวัสดีชาวโลก",
            "\u{0E01}\u{0E48}\u{0E48}",
            "\u{0E4D}\u{0E32}",
        ];
        for s in inputs {
            let out = normalize(s);
            // Compare in characters, not bytes.
            let chars_before = s.chars().count();
            let chars_after = out.chars().count();
            assert!(
                chars_after <= chars_before,
                "normalize grew the string: {s:?} → {out:?}"
            );
        }
    }
}