use std::sync::OnceLock;
use jieba_rs::Jieba;
/// Returns the shared [`Jieba`] segmenter, building it on the first call.
///
/// The instance lives in a function-local `OnceLock`, so `Jieba::new` runs at
/// most once and every later call hands back the same reference.
fn jieba() -> &'static Jieba {
    static SEGMENTER: OnceLock<Jieba> = OnceLock::new();
    SEGMENTER.get_or_init(Jieba::new)
}
/// Reports whether `c` is allowed inside a search token.
///
/// Accepted characters are the underscore and anything for which
/// [`char::is_alphanumeric`] returns `true` (this is Unicode-aware, so CJK
/// ideographs and non-ASCII letters/digits qualify).
fn is_search_char(c: char) -> bool {
    match c {
        '_' => true,
        other => other.is_alphanumeric(),
    }
}
/// Reports whether `token` is a usable search token.
///
/// A token qualifies when it is non-empty and every one of its characters
/// passes `is_search_char`.
fn is_search_token(token: &str) -> bool {
    // `all` is vacuously true on an empty iterator, so reject "" explicitly.
    if token.is_empty() {
        return false;
    }
    token.chars().all(is_search_char)
}
/// Segments `text` into a space-separated stream of unique search tokens.
///
/// The input is trimmed and cut with jieba's search-mode segmentation; each
/// resulting segment is trimmed again and kept only if `is_search_token`
/// accepts it. Repeated tokens are emitted once, at the position of their
/// first occurrence.
///
/// Returns an empty string when `text` is empty or whitespace-only, or when
/// no segment survives filtering.
///
/// NOTE(review): a segment containing any rejected character is discarded
/// whole rather than split into its valid parts — confirm that jieba never
/// emits mixed segments (e.g. version-like strings) that should be searchable.
pub fn tokenize_indexing(text: &str) -> String {
    let text = text.trim();
    if text.is_empty() {
        return String::new();
    }

    // The segments are slices of `text`, so the dedup set and the output can
    // borrow them instead of allocating a `String` (twice) per token.
    let mut seen = std::collections::HashSet::<&str>::new();
    let unique: Vec<&str> = jieba()
        .cut_for_search(text, true)
        .into_iter()
        .map(str::trim)
        // `insert` returns `false` for a repeat, which drops later duplicates
        // while preserving first-seen order.
        .filter(|&tok| is_search_token(tok) && seen.insert(tok))
        .collect();

    unique.join(" ")
}
/// Segments `text` and renders the unique tokens as double-quoted terms
/// separated by single spaces.
///
/// Segmentation, filtering and first-occurrence dedup are the same as in
/// `tokenize_indexing`. Each surviving token is wrapped in `"` with any
/// embedded `"` doubled. (Presumably the result feeds a full-text MATCH
/// expression — confirm against the caller.)
///
/// Returns an empty string when `text` is empty or whitespace-only, or when
/// no segment survives filtering.
///
/// NOTE(review): a segment containing any rejected character is discarded
/// whole rather than split into its valid parts — confirm that jieba never
/// emits mixed segments (e.g. version-like strings) that should be searchable.
pub fn tokenize_query(text: &str) -> String {
    let text = text.trim();
    if text.is_empty() {
        return String::new();
    }

    // Dedup on borrowed slices of `text`; only the final quoted term allocates.
    let mut seen = std::collections::HashSet::<&str>::new();
    let terms: Vec<String> = jieba()
        .cut_for_search(text, true)
        .into_iter()
        .map(str::trim)
        .filter(|&tok| is_search_token(tok) && seen.insert(tok))
        .map(|tok| {
            // Tokens accepted by `is_search_token` cannot currently contain a
            // quote; the doubling is kept as a guard should that filter relax.
            let escaped = tok.replace('"', "\"\"");
            format!("\"{escaped}\"")
        })
        .collect();

    terms.join(" ")
}
#[cfg(test)]
mod tests {
    use super::*;

    /// Space-separated ASCII words both show up in the index stream.
    #[test]
    fn ascii_passes_through_word_boundaries() {
        let tokens = tokenize_indexing("hello world");
        assert!(tokens.contains("hello"));
        assert!(tokens.contains("world"));
    }

    /// A Chinese phrase yields at least one token of two or more non-ASCII
    /// characters, i.e. it is not split purely character by character.
    #[test]
    fn chinese_phrase_segments_into_words() {
        let tokens = tokenize_indexing("我的项目偏好");
        let any_multi_char_token = tokens
            .split_whitespace()
            .any(|t| t.chars().filter(|c| !c.is_ascii()).count() >= 2);
        assert!(
            any_multi_char_token,
            "expected at least one multi-char Chinese token in {tokens:?}"
        );
    }

    /// Punctuation never appears inside any emitted token.
    #[test]
    fn punctuation_is_dropped() {
        let tokens = tokenize_indexing("hello, world!");
        let toks: Vec<_> = tokens.split_whitespace().collect();
        assert!(!toks.iter().any(|t| t.contains(',')));
        assert!(!toks.iter().any(|t| t.contains('!')));
    }

    /// Duplicates are removed and the first occurrence fixes the order.
    #[test]
    fn dedup_preserves_first_position() {
        let tokens = tokenize_indexing("alpha beta alpha gamma alpha");
        assert_eq!(tokens, "alpha beta gamma");
    }

    /// Every term of the query form is wrapped in double quotes.
    #[test]
    fn query_form_quotes_each_token() {
        let q = tokenize_query("项目 偏好");
        let parts: Vec<_> = q.split_whitespace().collect();
        assert!(!parts.is_empty());
        for p in &parts {
            assert!(p.starts_with('"') && p.ends_with('"'), "bad quote: {p}");
        }
    }

    /// Input containing literal quotes still produces well-formed quoted
    /// terms with no stray quote between the delimiters.
    #[test]
    fn query_form_escapes_embedded_quote() {
        let q = tokenize_query(r#"say "hi""#);
        // Without this guard an empty result would skip the loop and the
        // test would pass without checking anything.
        assert!(!q.is_empty(), "expected at least one query term");
        for tok in q.split_whitespace() {
            assert!(tok.starts_with('"') && tok.ends_with('"'));
            assert!(tok.len() >= 2);
            // Both delimiters are one-byte ASCII quotes, so this slice is on
            // valid char boundaries.
            let inner = &tok[1..tok.len() - 1];
            // After removing escaped pairs (`""`), no lone quote may remain.
            assert!(
                !inner.replace("\"\"", "").contains('"'),
                "unescaped quote in {tok}"
            );
        }
    }

    /// Empty and whitespace-only input produce empty output in both forms.
    #[test]
    fn empty_input_yields_empty_output() {
        assert!(tokenize_indexing("").is_empty());
        assert!(tokenize_indexing(" ").is_empty());
        assert!(tokenize_query("").is_empty());
        assert!(tokenize_query(" ").is_empty());
    }

    /// A term produced by the query form is present verbatim in the index
    /// stream of a mixed Chinese/English document.
    #[test]
    fn mixed_chinese_english_round_trip_via_query() {
        let indexed = tokenize_indexing("Anamnesis 是跨 agent 记忆基础设施");
        let query = tokenize_query("记忆");
        // Take the first quoted term of the query and strip its delimiters.
        let q_inner: String = query
            .split_whitespace()
            .next()
            .map(|term| term.trim_matches('"').to_owned())
            .unwrap_or_default();
        assert!(
            indexed.split_whitespace().any(|w| w == q_inner),
            "indexed stream {indexed:?} should contain query token {q_inner:?}"
        );
    }
}