use jieba_rs::Jieba;
use std::sync::LazyLock;
// Process-wide Jieba segmenter, initialized lazily on first use so the
// dictionary is loaded at most once (LazyLock is thread-safe).
static JIEBA: LazyLock<Jieba> = LazyLock::new(Jieba::new);
/// Segment `text` with the shared Jieba tokenizer and return the
/// non-whitespace tokens as owned strings.
///
/// HMM is disabled (`cut(text, false)`), i.e. dictionary-only segmentation.
/// Returns an empty `Vec` for empty input.
pub fn jieba_cut(text: &str) -> Vec<String> {
    JIEBA
        .cut(text, false)
        .into_iter()
        // Filter on the borrowed &str *before* allocating, so tokens that
        // are whitespace-only never pay for a String allocation.
        .filter(|s| !s.trim().is_empty())
        .map(str::to_string)
        .collect()
}
/// Tokenize `text` for indexing: each maximal run of CJK characters is
/// expanded into overlapping character bigrams (a lone CJK character is
/// kept as-is), every other non-whitespace character becomes its own
/// lowercased token, and whitespace only terminates runs.
pub fn bigrams(text: &str) -> Vec<String> {
    let mut out = Vec::new();
    let mut run: Vec<char> = Vec::new();
    for c in text.chars() {
        if is_cjk_char(c) {
            // Still inside a CJK run: keep accumulating.
            run.push(c);
            continue;
        }
        // Any non-CJK character terminates the run; emit its bigrams first.
        if !run.is_empty() {
            out.extend(bigrams_from_chars(&run));
            run.clear();
        }
        // Non-CJK, non-whitespace chars are emitted individually, lowercased
        // (char::to_lowercase may expand to more than one character).
        if !c.is_whitespace() {
            out.push(c.to_lowercase().to_string());
        }
    }
    // Flush a CJK run that reaches the end of the input.
    if !run.is_empty() {
        out.extend(bigrams_from_chars(&run));
    }
    out
}
/// Turn a run of characters into overlapping two-character strings.
/// An empty run yields nothing; a single character is returned as-is
/// (it cannot form a bigram but must still be indexable).
fn bigrams_from_chars(chars: &[char]) -> Vec<String> {
    match chars {
        [] => Vec::new(),
        [only] => vec![only.to_string()],
        _ => chars
            .windows(2)
            .map(|pair| pair.iter().collect())
            .collect(),
    }
}
/// Tokenize `text` one character at a time: CJK characters are kept
/// verbatim as single-char tokens, whitespace is dropped, and everything
/// else is emitted lowercased (which may expand to multiple chars).
pub fn chars(text: &str) -> Vec<String> {
    text.chars()
        .filter_map(|c| {
            if is_cjk_char(c) {
                Some(c.to_string())
            } else if c.is_whitespace() {
                None
            } else {
                Some(c.to_lowercase().to_string())
            }
        })
        .collect()
}
/// True if `ch` falls in one of the Unicode blocks this tokenizer treats
/// as CJK. Note: punctuation blocks and rarer extensions (e.g. CJK
/// Extension B+, half-width Katakana) are intentionally not included.
fn is_cjk_char(ch: char) -> bool {
    const CJK_RANGES: [(char, char); 6] = [
        ('\u{4e00}', '\u{9fff}'), // CJK Unified Ideographs
        ('\u{3400}', '\u{4dbf}'), // CJK Unified Ideographs Extension A
        ('\u{3040}', '\u{309f}'), // Hiragana
        ('\u{30a0}', '\u{30ff}'), // Katakana
        ('\u{ac00}', '\u{d7af}'), // Hangul Syllables
        ('\u{f900}', '\u{faff}'), // CJK Compatibility Ideographs
    ];
    CJK_RANGES.iter().any(|&(lo, hi)| (lo..=hi).contains(&ch))
}
#[cfg(test)]
mod tests {
    use super::*;

    /// Segmenting a Chinese sentence yields real words. Substring checks
    /// (rather than an exact token list) keep this stable across versions
    /// of the bundled jieba dictionary.
    #[test]
    fn test_jieba_cut_chinese() {
        let tokens = jieba_cut("机器学习是人工智能的子领域");
        assert!(!tokens.is_empty());
        let joined = tokens.join(" ");
        assert!(joined.contains("机器"), "expected '机器' in: {joined}");
        assert!(joined.contains("学习"), "expected '学习' in: {joined}");
    }

    /// Whitespace-only tokens are filtered out of the segmentation result.
    #[test]
    fn test_jieba_cut_filters_whitespace() {
        let tokens = jieba_cut(" 你好 世界 ");
        for t in &tokens {
            assert!(!t.trim().is_empty(), "got whitespace token: '{t}'");
        }
    }

    /// Empty input produces no tokens.
    #[test]
    fn test_jieba_cut_empty() {
        let tokens = jieba_cut("");
        assert!(tokens.is_empty());
    }

    /// A CJK run of n characters yields n-1 overlapping bigrams.
    #[test]
    fn test_bigrams_basic() {
        let tokens = bigrams("机器学习");
        assert_eq!(tokens, vec!["机器", "器学", "学习"]);
    }

    /// A lone CJK character cannot form a bigram and is kept as-is.
    #[test]
    fn test_bigrams_single_char() {
        let tokens = bigrams("学");
        assert_eq!(tokens, vec!["学"]);
    }

    /// Latin characters split the CJK run and are emitted lowercased,
    /// one token per character.
    #[test]
    fn test_bigrams_mixed() {
        let tokens = bigrams("机器AI学习");
        assert!(tokens.contains(&"机器".to_string()));
        assert!(tokens.contains(&"a".to_string()));
        assert!(tokens.contains(&"i".to_string()));
        assert!(tokens.contains(&"学习".to_string()));
    }

    /// chars() splits CJK text into one token per character.
    #[test]
    fn test_chars_basic() {
        let tokens = chars("机器学习");
        assert_eq!(tokens, vec!["机", "器", "学", "习"]);
    }

    /// Non-CJK characters are lowercased by chars().
    #[test]
    fn test_chars_mixed() {
        let tokens = chars("机器AI");
        assert_eq!(tokens, vec!["机", "器", "a", "i"]);
    }

    /// Empty input produces no tokens.
    #[test]
    fn test_chars_empty() {
        let tokens = chars("");
        assert!(tokens.is_empty());
    }

    /// Whitespace alone never produces tokens.
    #[test]
    fn test_chars_whitespace_only() {
        let tokens = chars(" ");
        assert!(tokens.is_empty());
    }

    /// Hiragana is treated as CJK, so it is bigrammed too.
    #[test]
    fn test_bigrams_japanese_hiragana() {
        let tokens = bigrams("あいう");
        assert_eq!(tokens, vec!["あい", "いう"]);
    }

    /// Hangul syllables are treated as CJK by chars().
    #[test]
    fn test_chars_korean() {
        let tokens = chars("한글");
        assert_eq!(tokens, vec!["한", "글"]);
    }
}