use rust_stemmers::{Algorithm, Stemmer};
use unicode_normalization::UnicodeNormalization;
use super::language::cjk::{bigram, script};
use super::language::stop_words;
pub trait TextAnalyzer: Send + Sync {
fn analyze(&self, text: &str) -> Vec<String>;
fn name(&self) -> &str;
}
pub fn analyze(text: &str) -> Vec<String> {
let stemmer = Stemmer::create(Algorithm::English);
let en_stops = stop_words::stop_words("en");
tokenize_with_stemmer(text, &stemmer, "en", en_stops)
}
pub fn tokenize_no_stem(text: &str) -> Vec<String> {
let en_stops = stop_words::stop_words("en");
tokenize_raw(text, "en", en_stops)
}
pub(crate) fn tokenize_raw(text: &str, lang: &str, stop_list: &[&str]) -> Vec<String> {
let mut normalized = String::with_capacity(text.len());
for c in text.chars() {
if script::is_cjk(c) || script::is_hangul_jamo(c) || script::is_thai(c) {
for lc in c.to_lowercase() {
normalized.push(lc);
}
} else {
for decomposed in c.nfd() {
if unicode_normalization::char::is_combining_mark(decomposed) {
continue;
}
for lc in decomposed.to_lowercase() {
normalized.push(lc);
}
}
}
}
let mut tokens = Vec::new();
for word in normalized.split(|c: char| !c.is_alphanumeric() && c != '-' && c != '_') {
let trimmed = word.trim_matches(|c: char| c == '-' || c == '_');
if trimmed.is_empty() || trimmed.len() <= 1 {
continue;
}
if trimmed.chars().any(script::needs_segmentation) {
let cjk_tokens = if matches!(lang, "ja" | "zh" | "ko" | "th") {
super::language::cjk::segmenter::segment(trimmed, lang)
} else {
bigram::tokenize_cjk(trimmed)
};
for token in cjk_tokens {
if !token.is_empty() && !is_stop_word_in_list(&token, stop_list) {
tokens.push(token);
}
}
continue;
}
if !is_stop_word_in_list(trimmed, stop_list) {
tokens.push(trimmed.to_string());
}
}
tokens
}
pub(crate) fn tokenize_with_stemmer(
text: &str,
stemmer: &Stemmer,
lang: &str,
stop_list: &[&str],
) -> Vec<String> {
let mut normalized = String::with_capacity(text.len());
for c in text.chars() {
if script::is_cjk(c) || script::is_hangul_jamo(c) || script::is_thai(c) {
for lc in c.to_lowercase() {
normalized.push(lc);
}
} else {
for decomposed in c.nfd() {
if unicode_normalization::char::is_combining_mark(decomposed) {
continue;
}
for lc in decomposed.to_lowercase() {
normalized.push(lc);
}
}
}
}
let mut tokens = Vec::new();
for word in normalized.split(|c: char| !c.is_alphanumeric() && c != '-' && c != '_') {
let trimmed = word.trim_matches(|c: char| c == '-' || c == '_');
if trimmed.is_empty() {
continue;
}
if trimmed.chars().any(script::needs_segmentation) {
let chars: Vec<char> = trimmed.chars().collect();
let mut i = 0;
while i < chars.len() {
let is_seg = script::needs_segmentation(chars[i]);
let run_end = chars[i..]
.iter()
.position(|&c| script::needs_segmentation(c) != is_seg)
.map_or(chars.len(), |p| i + p);
let run: String = chars[i..run_end].iter().collect();
if is_seg {
let cjk_tokens = if matches!(lang, "ja" | "zh" | "ko" | "th") {
super::language::cjk::segmenter::segment(&run, lang)
} else {
bigram::tokenize_cjk(&run)
};
for token in cjk_tokens {
if !token.is_empty() && !is_stop_word_in_list(&token, stop_list) {
tokens.push(token);
}
}
} else if run.len() > 1 && !is_stop_word_in_list(&run, stop_list) {
let stemmed = stemmer.stem(&run);
if !stemmed.is_empty() {
tokens.push(stemmed.into_owned());
}
}
i = run_end;
}
continue;
}
if trimmed.len() <= 1 {
continue;
}
if is_stop_word_in_list(trimmed, stop_list) {
continue;
}
let stemmed = stemmer.stem(trimmed);
if !stemmed.is_empty() {
tokens.push(stemmed.into_owned());
}
}
tokens
}
fn is_stop_word_in_list(word: &str, list: &[&str]) -> bool {
list.binary_search(&word).is_ok()
}
#[cfg(test)]
mod tests {
use super::*;
#[test]
fn basic_analysis() {
let tokens = analyze("The quick Brown FOX jumped over the lazy dog");
assert!(tokens.contains(&"quick".to_string()));
assert!(tokens.contains(&"brown".to_string()));
assert!(tokens.contains(&"fox".to_string()));
assert!(tokens.contains(&"jump".to_string()));
assert!(tokens.contains(&"lazi".to_string()));
assert!(tokens.contains(&"dog".to_string()));
assert!(!tokens.contains(&"the".to_string()));
}
#[test]
fn stop_words_removed() {
let tokens = analyze("this is a test of the system");
assert_eq!(tokens, vec!["test", "system"]);
}
#[test]
fn stemming_works() {
let tokens = analyze("running databases distributed systems");
assert!(tokens.contains(&"run".to_string()));
assert!(tokens.contains(&"databas".to_string()));
assert!(tokens.contains(&"distribut".to_string()));
assert!(tokens.contains(&"system".to_string()));
}
#[test]
fn unicode_normalization() {
let tokens = analyze("cafe\u{0301}");
assert_eq!(tokens, vec!["cafe"]);
}
#[test]
fn hyphenated_words_preserved() {
let tokens = analyze("e-mail real-time");
assert!(tokens.contains(&"e-mail".to_string()) || tokens.contains(&"email".to_string()));
}
#[test]
fn empty_and_single_char_filtered() {
let tokens = analyze("I a x ");
assert!(tokens.is_empty());
}
#[test]
fn cjk_routed_to_bigrams() {
let tokens = analyze("全文検索 system");
assert!(tokens.contains(&"全文".to_string()));
assert!(tokens.contains(&"文検".to_string()));
assert!(tokens.contains(&"検索".to_string()));
assert!(tokens.contains(&"system".to_string()));
}
#[test]
fn mixed_cjk_latin() {
let tokens = analyze("Tokyo東京Tower");
assert!(tokens.iter().any(|t| t == "tokyo"));
assert!(tokens.iter().any(|t| t == "tower"));
}
#[test]
fn korean_text_bigrams() {
let tokens = analyze("한국어 검색");
assert!(tokens.contains(&"한국".to_string()));
assert!(tokens.contains(&"국어".to_string()));
}
}