use unicode_normalization::{UnicodeNormalization, char::is_combining_mark};
/// Splits `input` into a sorted, de-duplicated list of folded search tokens.
///
/// A token boundary is placed at:
/// * every non-alphanumeric character (the separator itself is discarded),
/// * a lowercase-to-uppercase step (`fooBar` -> `foo`, `bar`),
/// * a letter-to-digit or digit-to-letter step (`report2023` -> `report`, `2023`),
/// * both sides of any character accepted by `is_cjk`, so each such character
///   becomes a token of its own.
///
/// Every token is lowercased. Tokens containing non-ASCII characters are also
/// NFD-decomposed and stripped of combining marks (`café` -> `cafe`).
///
/// Combining marks that appear in the input are ignored instead of being treated
/// as separators, so decomposed text (`e` followed by U+0301) produces the same
/// tokens as its precomposed equivalent.
///
/// The result is sorted and contains each token at most once; the order in which
/// tokens appeared in `input` is not preserved. An input without any alphanumeric
/// character yields an empty vector.
pub fn tokenize(input: &str) -> Vec<String> {
    // Folds the pending token, appends it to `tokens`, and resets `pending`.
    fn flush(tokens: &mut Vec<String>, pending: &mut String) {
        if pending.is_empty() {
            return;
        }
        if pending.is_ascii() {
            // Fast path: pure ASCII has nothing to decompose.
            tokens.push(pending.to_ascii_lowercase());
        } else {
            let folded: String = pending
                .nfd()
                .filter(|ch| !is_combining_mark(*ch))
                .collect();
            // Never emit an empty token, even if folding removed every character.
            if !folded.is_empty() {
                tokens.push(folded.to_lowercase());
            }
        }
        pending.clear();
    }

    let mut tokens = Vec::new();
    let mut current = String::new();
    let mut prev_char: Option<char> = None;

    for c in input.chars() {
        // Folding strips combining marks anyway. Skipping them here, without
        // touching `prev_char`, keeps a decomposed accent from cutting a word
        // in two (most such marks are not alphanumeric and would otherwise be
        // handled as separators below).
        if is_combining_mark(c) {
            continue;
        }

        if !c.is_alphanumeric() {
            flush(&mut tokens, &mut current);
            prev_char = Some(c);
            continue;
        }

        if let Some(p) = prev_char {
            let is_camel = p.is_lowercase() && c.is_uppercase();
            let is_num_transition =
                (p.is_alphabetic() && c.is_numeric()) || (p.is_numeric() && c.is_alphabetic());
            let is_cjk_transition = is_cjk(p) || is_cjk(c);
            if is_camel || is_num_transition || is_cjk_transition {
                flush(&mut tokens, &mut current);
            }
        }

        current.push(c);
        prev_char = Some(c);
    }
    flush(&mut tokens, &mut current);

    // Callers receive a set-like view: ordered and free of duplicates.
    tokens.sort_unstable();
    tokens.dedup();
    tokens
}
/// Reports whether `c` lies in one of the code-point ranges that `tokenize`
/// splits character by character:
///
/// * U+3040..=U+309F (Hiragana)
/// * U+30A0..=U+30FF (Katakana)
/// * U+4E00..=U+9FFF (CJK Unified Ideographs)
/// * U+AC00..=U+D7AF (Hangul Syllables)
fn is_cjk(c: char) -> bool {
    matches!(
        c,
        '\u{3040}'..='\u{309F}'
            | '\u{30A0}'..='\u{30FF}'
            | '\u{4E00}'..='\u{9FFF}'
            | '\u{AC00}'..='\u{D7AF}'
    )
}
/// Returns `input` with accents removed and letters lowercased.
///
/// The text is NFD-decomposed, every combining mark is dropped, and the
/// remaining characters are lowercased as one string. Separators such as `/`
/// and `_` are left in place.
pub(crate) fn fold_path(input: &str) -> String {
    let mut stripped = String::with_capacity(input.len());
    for ch in input.nfd() {
        if is_combining_mark(ch) {
            continue;
        }
        stripped.push(ch);
    }
    // Lowercase the whole string at once rather than per character.
    stripped.to_lowercase()
}
// Tag characters placed in front of synthesized tokens. All three are control
// characters; `tokenize` only accumulates alphanumeric characters into a token,
// so a tagged token should not collide with one it produces.

/// Leading character of tokens built by `synthesize_path_token`.
const SYNTH_PATH_TOKEN_TAG: char = '\x00';
/// Leading character of tokens built by `synthesize_volume_token`.
const SYNTH_VOLUME_TOKEN_TAG: char = '\x01';
/// Leading character of tokens built by `synthesize_ext_token`.
const SYNTH_EXT_TOKEN_TAG: char = '\x02';
pub(crate) fn synthesize_path_token(orig: &str) -> String {
format!("{SYNTH_PATH_TOKEN_TAG}{orig}")
}
pub(crate) fn synthesize_volume_token(orig: &str) -> String {
format!("{SYNTH_VOLUME_TOKEN_TAG}{orig}")
}
pub(crate) fn synthesize_ext_token(orig: &str) -> String {
format!("{SYNTH_EXT_TOKEN_TAG}{orig}")
}
#[cfg(test)]
mod tests {
    use super::*;

    // NOTE: `tokenize` returns its tokens sorted and de-duplicated, so the
    // expected values below are listed in sorted order, not input order.

    /// Whitespace separates tokens.
    #[test]
    fn test_tokenize_basic() {
        assert_eq!(tokenize("hello world"), ["hello", "world"]);
    }

    /// A lowercase-to-uppercase step starts a new token.
    #[test]
    fn test_tokenize_camel_case() {
        assert_eq!(tokenize("MySuperFile"), ["file", "my", "super"]);
    }

    /// Letter/digit steps and punctuation both start a new token.
    #[test]
    fn test_tokenize_numeric_transition() {
        assert_eq!(tokenize("report2023.txt"), ["2023", "report", "txt"]);
    }

    /// Accents are folded away.
    #[test]
    fn test_tokenize_normalization() {
        assert_eq!(tokenize("café"), ["cafe"]);
    }

    /// Every CJK character is a token of its own.
    #[test]
    fn test_tokenize_cjk() {
        assert_eq!(tokenize("日本語"), ["日", "本", "語"]);
    }

    /// Path folding lowercases and strips accents but keeps separators.
    #[test]
    fn test_fold_path() {
        let folded = fold_path("Café/Report_2023");
        assert_eq!(folded, "cafe/report_2023");
    }

    /// Each synthesized token is its tag character followed by the input.
    #[test]
    fn test_synthetic_tokens() {
        let path = synthesize_path_token("abc");
        let volume = synthesize_volume_token("c:");
        let ext = synthesize_ext_token("pdf");
        assert_eq!(path, "\x00abc");
        assert_eq!(volume, "\x01c:");
        assert_eq!(ext, "\x02pdf");
    }

    /// Mixed case splits at each lowercase-to-uppercase step; tokens that
    /// differ only by case collapse into one.
    #[test]
    fn test_tokenize_case_insensitivity() {
        let mixed = tokenize("hElLo");
        assert_eq!(mixed, ["el", "h", "lo"]);

        let repeated = tokenize("Hello HELLO");
        assert_eq!(repeated, ["hello"]);
    }
}