/// Normalizes `text` into a lowercase, diacritic-free form.
///
/// Every character of the input is lowercased and then handled by the first
/// rule that applies:
///
/// 1. characters accepted by `is_combining_mark` are dropped;
/// 2. letters known to `fold_non_decomposing` are replaced by their ASCII
///    expansion, which may be longer than one character (`ß` becomes `ss`);
/// 3. letters known to `strip_latin_diacritic` are replaced by their base
///    letter;
/// 4. characters known to `halfwidth_to_fullwidth` are replaced by their
///    full-width counterpart;
/// 5. anything else is copied through unchanged.
pub fn normalize(text: &str) -> String {
    // Lowercasing and folding can change the byte length, so this is only a
    // starting estimate; the string grows on demand if it is exceeded.
    let mut out = String::with_capacity(text.len());

    // Lowercase lazily: a single `char` may lowercase to several, and feeding
    // them straight into the loop avoids building a throwaway `String`.
    for c in text.chars().flat_map(char::to_lowercase) {
        if is_combining_mark(c) {
            continue;
        }
        if let Some(folded) = fold_non_decomposing(c) {
            out.push_str(folded);
        } else {
            let mapped = strip_latin_diacritic(c)
                .or_else(|| halfwidth_to_fullwidth(c))
                .unwrap_or(c);
            out.push(mapped);
        }
    }
    out
}
/// Returns the ASCII replacement for the lowercase letters handled here.
///
/// Some replacements are two characters long, which is why the result is a
/// string slice rather than a single `char`. Any character not listed below
/// yields `None`.
fn fold_non_decomposing(c: char) -> Option<&'static str> {
    match c {
        'æ' => Some("ae"),
        'ð' | 'đ' => Some("d"),
        'ħ' => Some("h"),
        'ı' => Some("i"),
        'ł' => Some("l"),
        'ŋ' => Some("n"),
        'ø' => Some("o"),
        'œ' => Some("oe"),
        'ß' => Some("ss"),
        'þ' => Some("th"),
        _ => None,
    }
}
/// Returns the unaccented ASCII base letter for a lowercase accented Latin
/// letter, or `None` when `c` is not one of the letters listed below.
///
/// Only lowercase forms are listed, so uppercase input yields `None`.
fn strip_latin_diacritic(c: char) -> Option<char> {
    // One arm per base letter, in alphabetical order of the result.
    let base = match c {
        'à' | 'á' | 'â' | 'ã' | 'ä' | 'å' | 'ā' | 'ă' | 'ą' => 'a',
        'ç' | 'ć' | 'ĉ' | 'ċ' | 'č' => 'c',
        'ď' => 'd',
        'è' | 'é' | 'ê' | 'ë' | 'ē' | 'ĕ' | 'ė' | 'ę' | 'ě' => 'e',
        'ĝ' | 'ğ' | 'ġ' | 'ģ' => 'g',
        'ĥ' => 'h',
        'ì' | 'í' | 'î' | 'ï' | 'ĩ' | 'ī' | 'ĭ' | 'į' => 'i',
        'ĵ' => 'j',
        'ķ' => 'k',
        'ĺ' | 'ļ' | 'ľ' | 'ŀ' => 'l',
        'ñ' | 'ń' | 'ņ' | 'ň' => 'n',
        'ò' | 'ó' | 'ô' | 'õ' | 'ö' | 'ō' | 'ŏ' | 'ő' => 'o',
        'ŕ' | 'ŗ' | 'ř' => 'r',
        'ś' | 'ŝ' | 'ş' | 'š' => 's',
        'ţ' | 'ť' | 'ŧ' => 't',
        'ù' | 'ú' | 'û' | 'ü' | 'ũ' | 'ū' | 'ŭ' | 'ů' | 'ű' | 'ų' => 'u',
        'ŵ' => 'w',
        'ý' | 'ÿ' | 'ŷ' => 'y',
        'ź' | 'ż' | 'ž' => 'z',
        _ => return None,
    };
    Some(base)
}
/// Reports whether `c` is one of the combining marks that `normalize` drops.
///
/// The set is a fixed list of code-point ranges; anything outside them,
/// including combining marks of other scripts, returns `false`.
fn is_combining_mark(c: char) -> bool {
    matches!(
        c,
        // Combining Diacritical Marks block.
        '\u{0300}'..='\u{036F}'
        // Cyrillic combining marks.
        | '\u{0483}'..='\u{0489}'
        // Hebrew points and accents; the gaps are deliberately excluded.
        | '\u{0591}'..='\u{05BD}'
        | '\u{05BF}'
        | '\u{05C1}'..='\u{05C2}'
        | '\u{05C4}'..='\u{05C5}'
        | '\u{05C7}'
        // Arabic marks; the gaps are deliberately excluded.
        | '\u{0610}'..='\u{061A}'
        | '\u{064B}'..='\u{065F}'
        | '\u{0670}'
        | '\u{06D6}'..='\u{06DC}'
        | '\u{06DF}'..='\u{06E4}'
        | '\u{06E7}'..='\u{06E8}'
        | '\u{06EA}'..='\u{06ED}'
        // Combining Half Marks block.
        | '\u{FE20}'..='\u{FE2F}'
    )
}
/// Converts a half-width katakana character to its full-width counterpart.
///
/// Handles exactly the code points U+FF66..=U+FF9D. Every other character,
/// including the half-width sound marks U+FF9E and U+FF9F that directly
/// follow that range, yields `None`.
fn halfwidth_to_fullwidth(c: char) -> Option<char> {
    // First half-width code point covered by the table.
    const FIRST_HALFWIDTH: u32 = 0xFF66;
    // Full-width code points, indexed by distance from `FIRST_HALFWIDTH`.
    const FULLWIDTH_CODES: [u32; 56] = [
        // U+FF66..=U+FF70: wo, the small kana, and the long-vowel mark.
        0x30F2, 0x30A1, 0x30A3, 0x30A5, 0x30A7, 0x30A9, 0x30E3, 0x30E5, 0x30E7, 0x30C3, 0x30FC,
        // U+FF71..=U+FF75: a, i, u, e, o.
        0x30A2, 0x30A4, 0x30A6, 0x30A8, 0x30AA,
        // U+FF76..=U+FF7A: ka row.
        0x30AB, 0x30AD, 0x30AF, 0x30B1, 0x30B3,
        // U+FF7B..=U+FF7F: sa row.
        0x30B5, 0x30B7, 0x30B9, 0x30BB, 0x30BD,
        // U+FF80..=U+FF84: ta row.
        0x30BF, 0x30C1, 0x30C4, 0x30C6, 0x30C8,
        // U+FF85..=U+FF89: na row.
        0x30CA, 0x30CB, 0x30CC, 0x30CD, 0x30CE,
        // U+FF8A..=U+FF8E: ha row.
        0x30CF, 0x30D2, 0x30D5, 0x30D8, 0x30DB,
        // U+FF8F..=U+FF93: ma row.
        0x30DE, 0x30DF, 0x30E0, 0x30E1, 0x30E2,
        // U+FF94..=U+FF96: ya, yu, yo.
        0x30E4, 0x30E6, 0x30E8,
        // U+FF97..=U+FF9B: ra row.
        0x30E9, 0x30EA, 0x30EB, 0x30EC, 0x30ED,
        // U+FF9C..=U+FF9D: wa, n.
        0x30EF, 0x30F3,
    ];

    // Characters below the range fail the subtraction; characters above it
    // fall off the end of the table.
    let offset = (c as u32).checked_sub(FIRST_HALFWIDTH)? as usize;
    FULLWIDTH_CODES
        .get(offset)
        .and_then(|&code| char::from_u32(code))
}
/// Reports whether `c` falls in one of the CJK code-point ranges listed below.
///
/// Characters outside these ranges return `false`, including ideographs
/// beyond the Basic Multilingual Plane and half-width katakana.
pub fn is_cjk(c: char) -> bool {
    matches!(
        c,
        // Hangul Jamo.
        '\u{1100}'..='\u{11FF}'
        // Hiragana.
        | '\u{3040}'..='\u{309F}'
        // Katakana.
        | '\u{30A0}'..='\u{30FF}'
        // CJK Unified Ideographs Extension A.
        | '\u{3400}'..='\u{4DBF}'
        // CJK Unified Ideographs.
        | '\u{4E00}'..='\u{9FFF}'
        // Hangul Syllables.
        | '\u{AC00}'..='\u{D7AF}'
        // CJK Compatibility Ideographs.
        | '\u{F900}'..='\u{FAFF}'
    )
}
/// Splits `normalized` into overlapping character n-grams, returned as slices
/// borrowed from the input.
///
/// The text is partitioned into maximal runs of consecutive characters that
/// agree on `is_cjk`. A CJK run yields every window of 2 characters; any
/// other run yields every window of 3 characters. A run shorter than its
/// window width contributes nothing, and no n-gram spans two runs. N-grams
/// appear in text order; an empty input gives an empty vector.
pub fn ngrams(normalized: &str) -> Vec<&str> {
    // Byte offset and CJK flag of every character, in text order.
    let marks: Vec<(usize, bool)> = normalized
        .char_indices()
        .map(|(offset, ch)| (offset, is_cjk(ch)))
        .collect();

    let mut grams = Vec::new();
    let mut run_start = 0;
    while run_start < marks.len() {
        let in_cjk = marks[run_start].1;
        // Always at least 1, because the first character matches itself.
        let run_len = marks[run_start..]
            .iter()
            .take_while(|mark| mark.1 == in_cjk)
            .count();
        let width = if in_cjk { 2 } else { 3 };

        // `windows` yields nothing when the run is shorter than `width`.
        let run = &marks[run_start..run_start + run_len];
        for (idx, window) in run.windows(width).enumerate() {
            let begin = window[0].0;
            // An n-gram ends where the following character begins, or at the
            // end of the text when it covers the final character.
            let finish = marks
                .get(run_start + idx + width)
                .map_or(normalized.len(), |next| next.0);
            grams.push(&normalized[begin..finish]);
        }
        run_start += run_len;
    }
    grams
}
// Unit tests for this module are kept in `tokenize_tests.rs` (located via the
// `#[path]` attribute) and are compiled only when building tests.
#[cfg(test)]
#[path = "tokenize_tests.rs"]
mod tests;