/// Expands `input` with extra whitespace-separated CJK n-gram tokens.
///
/// The returned string starts with `input` unchanged. If `input` contains at
/// least one character accepted by [`is_cjk`], a space is appended, followed
/// by every CJK unigram, then every bigram, then every trigram, each followed
/// by a single space (so the result ends with a trailing space).
///
/// N-grams are built only from characters that are adjacent in `input`:
/// the text is split into maximal runs of consecutive CJK characters and
/// windows never cross a run boundary. For example `"苹a果"` yields the
/// unigrams `苹` and `果` but not the bigram `苹果`, which does not occur in
/// the text.
///
/// Input without any CJK character is returned as-is.
pub fn expand_for_search(input: &str) -> String {
    let chars: Vec<char> = input.chars().collect();
    // Maximal runs of consecutive CJK characters, in order of appearance.
    let runs: Vec<&[char]> = chars
        .split(|ch| !is_cjk(*ch))
        .filter(|run| !run.is_empty())
        .collect();

    let mut out = String::with_capacity(input.len() * 2);
    out.push_str(input);
    if runs.is_empty() {
        return out;
    }

    out.push(' ');
    // Emit all unigrams first, then bigrams, then trigrams, so the token
    // order matches what a single contiguous run has always produced.
    for size in 1..=3 {
        for run in &runs {
            // `windows` yields nothing when the run is shorter than `size`.
            for window in run.windows(size) {
                out.extend(window);
                out.push(' ');
            }
        }
    }
    out
}
/// Returns `true` when `ch` lies in one of the Unicode CJK ideograph blocks.
///
/// Only Han ideograph blocks are matched (unified ideographs, their
/// extensions, and the compatibility ideograph blocks). Kana, Hangul and CJK
/// punctuation are not treated as CJK by this function.
pub fn is_cjk(ch: char) -> bool {
    matches!(
        ch as u32,
        0x3400..=0x4DBF // CJK Unified Ideographs Extension A
            | 0x4E00..=0x9FFF // CJK Unified Ideographs
            | 0xF900..=0xFAFF // CJK Compatibility Ideographs
            | 0x20000..=0x2A6DF // Extension B
            | 0x2A700..=0x2B73F // Extension C
            | 0x2B740..=0x2B81F // Extension D
            | 0x2B820..=0x2CEAF // Extension E
            | 0x2CEB0..=0x2EBEF // Extension F
            | 0x2EBF0..=0x2EE5F // Extension I
            | 0x2F800..=0x2FA1F // CJK Compatibility Ideographs Supplement
            | 0x30000..=0x3134F // Extension G
            | 0x31350..=0x323AF // Extension H
    )
}
#[cfg(test)]
mod tests {
    use super::*;

    /// A two-character CJK input must expose both single characters and the
    /// two-character sequence in the expanded text.
    #[test]
    fn expands_cjk_unigrams_and_bigrams() {
        let expanded = expand_for_search("苹果");
        for token in ["苹", "果", "苹果"] {
            assert!(expanded.contains(token));
        }
    }
}