/// A single term emitted by the tokenizer.
///
/// `Eq` and `Hash` are derived alongside `PartialEq` so tokens can be used as
/// `HashSet` members or `HashMap` keys; both fields support them.
#[derive(Debug, Clone, PartialEq, Eq, Hash)]
pub struct Token {
    /// The normalized term text (width-folded and lowercased).
    pub term: String,
    /// Index of the term's first character, counted in `char`s (not bytes).
    pub offset: usize,
}
/// Returns `true` when `c` falls in one of the code-point ranges that the
/// tokenizer treats as CJK (ideographs, kana, and Hangul syllables).
fn is_cjk(c: char) -> bool {
    // Inclusive (first, last) code-point pairs, labelled by Unicode block.
    const CJK_RANGES: [(char, char); 8] = [
        ('\u{4E00}', '\u{9FFF}'),   // CJK Unified Ideographs
        ('\u{3400}', '\u{4DBF}'),   // CJK Unified Ideographs Extension A
        ('\u{20000}', '\u{2A6DF}'), // CJK Unified Ideographs Extension B
        ('\u{F900}', '\u{FAFF}'),   // CJK Compatibility Ideographs
        ('\u{2E80}', '\u{2EFF}'),   // CJK Radicals Supplement
        ('\u{3040}', '\u{309F}'),   // Hiragana
        ('\u{30A0}', '\u{30FF}'),   // Katakana
        ('\u{AC00}', '\u{D7AF}'),   // Hangul Syllables
    ];

    CJK_RANGES
        .iter()
        .any(|&(first, last)| (first..=last).contains(&c))
}
/// Returns `true` for characters that belong to a whitespace/punctuation
/// delimited word: anything alphanumeric that is not classified as CJK.
fn is_word_char(c: char) -> bool {
    // CJK characters are alphanumeric too, but they are tokenized as bigrams
    // rather than as part of a word run, so rule them out first.
    if is_cjk(c) {
        return false;
    }
    c.is_alphanumeric()
}
/// Folds fullwidth forms to ASCII and lowercases `s`, one `char` at a time.
///
/// Despite the name this is not full Unicode NFKC normalization. It only:
/// * maps U+FF01..=U+FF5E (fullwidth ASCII variants) onto U+0021..=U+007E,
/// * maps U+3000 (ideographic space) to an ASCII space,
/// * lowercases the result, keeping just the first `char` of the lowercase
///   mapping.
///
/// Every input `char` yields exactly one output `char`, so character indices
/// in the result line up with character indices in `s`.
fn nfkc_lower(s: &str) -> String {
    let mut folded = String::with_capacity(s.len());
    for ch in s.chars() {
        let narrow = if ch == '\u{3000}' {
            ' '
        } else if ('\u{FF01}'..='\u{FF5E}').contains(&ch) {
            // 0xFEE0 == 0xFF01 - 0x21: the fixed distance between a fullwidth
            // form and its ASCII counterpart.
            char::from_u32(ch as u32 - 0xFEE0).unwrap_or(ch)
        } else {
            ch
        };
        folded.push(narrow.to_lowercase().next().unwrap_or(narrow));
    }
    folded
}
/// Stateless tokenizer for mixed Latin/CJK text.
///
/// The type carries no data, so it is `Copy`; `Debug` is derived so that
/// structs embedding a tokenizer can derive `Debug` themselves.
#[derive(Debug, Clone, Copy)]
pub struct Tokenizer;
impl Tokenizer {
    /// Creates a tokenizer. The type holds no state or configuration.
    pub fn new() -> Self {
        Tokenizer
    }

    /// Splits `text` into tokens after width-folding and lowercasing it.
    ///
    /// * A run of CJK characters yields one token per overlapping pair of
    ///   adjacent characters (bigrams); a run of exactly one character yields
    ///   that single character.
    /// * A run of non-CJK alphanumeric characters yields one token.
    /// * Every other character is a separator and produces nothing.
    ///
    /// Each token's `offset` is the index, counted in `char`s, of its first
    /// character in the normalized text.
    pub fn tokenize(&self, text: &str) -> Vec<Token> {
        let folded: Vec<char> = nfkc_lower(text).chars().collect();
        let mut out = Vec::new();
        let mut pos = 0;

        while let Some(&head) = folded.get(pos) {
            if is_cjk(head) {
                let len = folded[pos..].iter().take_while(|&&ch| is_cjk(ch)).count();
                let run = &folded[pos..pos + len];
                if let [only] = run {
                    // A lone CJK character has no neighbour to pair with.
                    out.push(Token { term: only.to_string(), offset: pos });
                } else {
                    out.extend(run.windows(2).enumerate().map(|(k, pair)| Token {
                        term: pair.iter().collect(),
                        offset: pos + k,
                    }));
                }
                pos += len;
            } else if is_word_char(head) {
                let len = folded[pos..]
                    .iter()
                    .take_while(|&&ch| is_word_char(ch))
                    .count();
                out.push(Token {
                    term: folded[pos..pos + len].iter().collect(),
                    offset: pos,
                });
                pos += len;
            } else {
                // Separator (whitespace, punctuation, symbols): skip it.
                pos += 1;
            }
        }

        out
    }

    /// Returns the distinct terms of `text`, in order of first appearance.
    pub fn terms(&self, text: &str) -> Vec<String> {
        let mut seen = std::collections::HashSet::new();
        let mut unique = Vec::new();
        for Token { term, .. } in self.tokenize(text) {
            // `insert` returns false for a term that was already recorded.
            if seen.insert(term.clone()) {
                unique.push(term);
            }
        }
        unique
    }
}
impl Default for Tokenizer {
fn default() -> Self { Self::new() }
}
#[cfg(test)]
mod tests {
    use super::*;

    /// Tokenizes `text` and keeps only the term strings, in emission order.
    fn terms(text: &str) -> Vec<String> {
        Tokenizer::new()
            .tokenize(text)
            .into_iter()
            .map(|t| t.term)
            .collect()
    }

    #[test]
    fn english_basic() {
        assert_eq!(terms("Hello World"), vec!["hello", "world"]);
    }

    #[test]
    fn english_punctuation() {
        assert_eq!(terms("foo, bar! baz."), vec!["foo", "bar", "baz"]);
    }

    #[test]
    fn english_lowercase() {
        assert_eq!(terms("SQL DATABASE"), vec!["sql", "database"]);
    }

    #[test]
    fn cjk_bigram() {
        assert_eq!(terms("資料庫"), vec!["資料", "料庫"]);
    }

    #[test]
    fn cjk_single_char() {
        assert_eq!(terms("書"), vec!["書"]);
    }

    #[test]
    fn mixed_text() {
        assert_eq!(terms("SQL 資料庫"), vec!["sql", "資料", "料庫"]);
    }

    #[test]
    fn adjacent_scripts_split_without_separator() {
        // A word run ends where a CJK run begins even with no space between.
        assert_eq!(terms("SQL資料庫"), vec!["sql", "資料", "料庫"]);
    }

    #[test]
    fn japanese() {
        let t = terms("データベース");
        assert!(t.len() >= 2);
        assert_eq!(t[0], "デー");
    }

    #[test]
    fn korean() {
        let t = terms("데이터베이스");
        assert!(t.len() >= 2);
    }

    #[test]
    fn fullwidth_ascii() {
        // Fullwidth S, Q, L written as escapes so the input is unambiguously
        // the fullwidth forms and the width-folding path is really exercised.
        assert_eq!(terms("\u{FF33}\u{FF31}\u{FF2C}"), vec!["sql"]);
    }

    #[test]
    fn empty_string() {
        assert!(terms("").is_empty());
    }

    #[test]
    fn numbers_and_letters() {
        // '.' is a separator, so "v2.0" splits into "v2" and "0".
        assert_eq!(terms("v2.0 release"), vec!["v2", "0", "release"]);
    }

    #[test]
    fn offsets_are_char_indices() {
        // Byte offsets would be 0, 4, 7; character offsets are 0, 4, 5.
        let offsets: Vec<usize> = Tokenizer::new()
            .tokenize("SQL 資料庫")
            .iter()
            .map(|t| t.offset)
            .collect();
        assert_eq!(offsets, vec![0, 4, 5]);
    }

    #[test]
    fn dedup_terms() {
        let unique = Tokenizer::new().terms("the cat sat on the mat");
        // Duplicates are dropped and first-appearance order is kept.
        assert_eq!(unique, vec!["the", "cat", "sat", "on", "mat"]);
    }
}