use std::sync::Arc;
use jieba_rs::{Jieba, TokenizeMode};
use tantivy::tokenizer::{Token, TokenStream, Tokenizer};
/// Tokenizer that segments text with a [`Jieba`] instance.
///
/// The segmenter is held behind an [`Arc`], so the derived `Clone` copies only
/// the reference-counted handle and every clone shares the same `Jieba`.
#[derive(Clone)]
pub struct JiebaTokenizer {
/// Shared segmenter used by `token_stream` to split the input text.
jieba: Arc<Jieba>,
}
impl JiebaTokenizer {
    /// Builds a tokenizer around a freshly constructed [`Jieba`] instance.
    ///
    /// Each call invokes `Jieba::new()` again, so callers that need several
    /// tokenizers should prefer cloning an existing one: clones share the same
    /// `Arc<Jieba>` instead of constructing a new segmenter.
    pub fn new() -> Self {
        let jieba = Arc::new(Jieba::new());
        Self { jieba }
    }
}
impl Default for JiebaTokenizer {
fn default() -> Self {
Self::new()
}
}
/// Token stream over a list of tokens that was fully computed up front.
///
/// `cursor` counts how many tokens have been handed out by `advance`; the
/// current token is therefore the one at index `cursor - 1`.
pub struct JiebaTokenStream {
/// All tokens produced for the input text, in emission order.
tokens: Vec<Token>,
/// Number of successful `advance` calls so far (0 = not yet advanced).
cursor: usize,
}
impl TokenStream for JiebaTokenStream {
    /// Moves to the next token.
    ///
    /// Returns `true` when a token is available afterwards, and `false` once
    /// every token has been consumed (the cursor is left untouched then).
    fn advance(&mut self) -> bool {
        let has_next = self.cursor < self.tokens.len();
        if has_next {
            self.cursor += 1;
        }
        has_next
    }

    /// Returns the token selected by the most recent successful `advance`.
    ///
    /// # Panics
    ///
    /// Panics if called before the first successful `advance`, because there
    /// is no current token at that point.
    fn token(&self) -> &Token {
        let current = self.cursor - 1;
        &self.tokens[current]
    }

    /// Mutable counterpart of [`token`](Self::token).
    ///
    /// # Panics
    ///
    /// Panics if called before the first successful `advance`.
    fn token_mut(&mut self) -> &mut Token {
        let current = self.cursor - 1;
        &mut self.tokens[current]
    }
}
impl Tokenizer for JiebaTokenizer {
type TokenStream<'a> = JiebaTokenStream;
fn token_stream<'a>(&'a mut self, text: &'a str) -> Self::TokenStream<'a> {
let mut char_byte: Vec<usize> = text.char_indices().map(|(b, _)| b).collect();
char_byte.push(text.len());
let segs = self.jieba.tokenize(text, TokenizeMode::Search, true);
let mut tokens = Vec::with_capacity(segs.len());
let mut position: usize = 0;
for seg in segs {
if seg.word.trim().is_empty() {
continue;
}
tokens.push(Token {
offset_from: char_byte[seg.start],
offset_to: char_byte[seg.end],
position,
text: seg.word.to_lowercase(),
position_length: 1,
});
position += 1;
}
JiebaTokenStream { tokens, cursor: 0 }
}
}
#[cfg(test)]
mod tests {
    use super::*;

    /// Runs the tokenizer over `text` and collects the text of every token.
    fn tokenize(text: &str) -> Vec<String> {
        let mut tokenizer = JiebaTokenizer::new();
        let mut stream = tokenizer.token_stream(text);
        let texts: Vec<String> = std::iter::from_fn(|| {
            if stream.advance() {
                Some(stream.token().text.clone())
            } else {
                None
            }
        })
        .collect();
        texts
    }

    /// True when `word` appears verbatim among `tokens`.
    fn has(tokens: &[String], word: &str) -> bool {
        tokens.iter().any(|t| t == word)
    }

    /// ASCII words come out lowercased.
    #[test]
    fn english_lowercased() {
        let tokens = tokenize("The Quick Brown Fox");
        assert!(has(&tokens, "quick"));
        assert!(has(&tokens, "brown"));
        assert!(!has(&tokens, "Brown"), "should be lowercased");
    }

    /// A run of Chinese text is split into dictionary words.
    #[test]
    fn chinese_splits_into_words() {
        let tokens = tokenize("蒙牛奶粉冲泡指南");
        assert!(
            has(&tokens, "蒙牛"),
            "expected 蒙牛 as a token, got: {tokens:?}"
        );
        assert!(
            has(&tokens, "奶粉"),
            "expected 奶粉 as a token, got: {tokens:?}"
        );
    }

    /// Chinese and English words interleaved with spaces are all recovered.
    #[test]
    fn mixed_zh_en() {
        let tokens = tokenize("Apple 苹果 brand 品牌");
        for expected in ["apple", "苹果", "brand", "品牌"] {
            assert!(has(&tokens, expected));
        }
    }

    /// Multi-line input mixing multi-byte and single-byte characters must map
    /// offsets without panicking.
    #[test]
    fn mixed_cjk_ascii_multiline_does_not_panic() {
        let text = "量子纠缠是量子力学中两个粒子相互关联的现象 quantum entanglement.\n\
                    The capital of France is Paris, a city famous for the Eiffel Tower.\n\
                    Rust ownership and borrowing prevent data races at compile time.";
        let tokens = tokenize(text);
        assert!(has(&tokens, "量子"), "got: {tokens:?}");
        assert!(has(&tokens, "quantum"), "got: {tokens:?}");
        assert!(has(&tokens, "paris"), "got: {tokens:?}");
    }

    /// Every token's byte offsets are in range, fall on char boundaries, and
    /// slice back to the (lowercased) token text.
    #[test]
    fn offsets_slice_back_to_token() {
        let text = "量子纠缠 quantum 力学 entanglement";
        let mut tokenizer = JiebaTokenizer::new();
        let mut stream = tokenizer.token_stream(text);
        while stream.advance() {
            let current = stream.token();
            let (from, to) = (current.offset_from, current.offset_to);
            assert!(to <= text.len(), "offset_to {} > len {}", to, text.len());
            assert!(
                text.is_char_boundary(from),
                "from not char boundary: {}",
                from
            );
            assert!(text.is_char_boundary(to), "to not char boundary: {}", to);
            assert_eq!(
                text[from..to].to_lowercase(),
                current.text,
                "slice must equal token text"
            );
        }
    }

    /// Empty input yields no tokens.
    #[test]
    fn empty_input() {
        assert!(tokenize("").is_empty());
    }

    /// Whitespace-only input yields no tokens.
    #[test]
    fn whitespace_only() {
        let tokens = tokenize(" \n\t ");
        assert!(
            tokens.is_empty(),
            "whitespace should not produce tokens, got: {tokens:?}"
        );
    }
}