pub use crate::index::text_types::{Tokenizer, Token, Position};
pub use crate::index::text_types::{WhitespaceTokenizer, NgramTokenizer};
#[cfg(feature = "tokenizer-jieba")]
mod jieba_plugin {
use super::*;
use jieba_rs::Jieba;
use std::sync::Arc;
/// Tokenizer backed by the `jieba-rs` Chinese word segmenter.
///
/// Configured via the builder methods on the inherent impl
/// (`with_mode`, `case_sensitive`, `with_length_range`).
pub struct JiebaTokenizer {
    // Segmenter instance; Arc-wrapped so the handle is cheap to share.
    jieba: Arc<Jieba>,
    // Segmentation strategy (precise / full / search).
    mode: JiebaMode,
    // When false (the default), token text is lowercased on output.
    case_sensitive: bool,
    // Tokens with fewer chars than this are dropped.
    min_len: usize,
    // Tokens with more chars than this are dropped.
    max_len: usize,
}
/// Segmentation strategy used by `JiebaTokenizer`.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum JiebaMode {
    /// Accurate cut with no overlapping tokens (jieba "precise" mode).
    Precise,
    /// Emit every possible word; tokens may overlap (jieba "full" mode).
    Full,
    /// Precise cut plus extra sub-words for long terms; suited to search indexing.
    Search,
}
impl Default for JiebaTokenizer {
fn default() -> Self {
Self {
jieba: Arc::new(Jieba::new()),
mode: JiebaMode::Search, case_sensitive: false,
min_len: 1,
max_len: 64,
}
}
}
impl JiebaTokenizer {
pub fn new() -> Self {
Self::default()
}
pub fn with_mode(mut self, mode: JiebaMode) -> Self {
self.mode = mode;
self
}
pub fn case_sensitive(mut self, sensitive: bool) -> Self {
self.case_sensitive = sensitive;
self
}
pub fn with_length_range(mut self, min: usize, max: usize) -> Self {
self.min_len = min;
self.max_len = max;
self
}
pub fn load_dict(&mut self, _dict_path: &str) -> Result<(), String> {
Ok(())
}
}
impl Tokenizer for JiebaTokenizer {
    /// Segments `text` and returns the tokens that survive filtering.
    ///
    /// Each token's `position` is its index in the raw segmentation output,
    /// so positions may have gaps where words were filtered out
    /// (whitespace-only words, or words outside the length range).
    fn tokenize(&self, text: &str) -> Vec<Token> {
        let segments = match self.mode {
            JiebaMode::Precise => self.jieba.cut(text, false),
            JiebaMode::Full => self.jieba.cut(text, true),
            JiebaMode::Search => self.jieba.cut_for_search(text, false),
        };
        let mut tokens = Vec::with_capacity(segments.len());
        for (idx, raw) in segments.into_iter().enumerate() {
            let trimmed = raw.trim();
            if trimmed.is_empty() {
                continue;
            }
            // Length limits are in Unicode scalar values, not bytes.
            let char_count = trimmed.chars().count();
            if char_count < self.min_len || char_count > self.max_len {
                continue;
            }
            let normalized = if self.case_sensitive {
                trimmed.to_string()
            } else {
                trimmed.to_lowercase()
            };
            tokens.push(Token {
                text: normalized,
                position: idx as u32,
            });
        }
        tokens
    }

    /// Registry name for this tokenizer.
    fn name(&self) -> &str {
        "jieba"
    }
}
#[cfg(test)]
mod tests {
    use super::*;

    // Smoke test: default (Search) mode should yield tokens for a Chinese
    // sentence, including the multi-character word "自然语言".
    #[test]
    fn test_jieba_tokenizer() {
        let tok = JiebaTokenizer::default();
        let result = tok.tokenize("我爱自然语言处理");
        debug_log!("Tokens: {:?}", result.iter().map(|t| &t.text).collect::<Vec<_>>());
        assert!(!result.is_empty());
        assert!(result.iter().any(|t| t.text == "自然语言"));
    }

    // Exercises all three segmentation modes on the same sentence; output
    // is only logged, so this mainly guards against panics.
    #[test]
    fn test_jieba_modes() {
        let sentence = "我来到北京清华大学";

        let precise_tok = JiebaTokenizer::default().with_mode(JiebaMode::Precise);
        let out = precise_tok.tokenize(sentence);
        debug_log!("Precise: {:?}", out.iter().map(|t| &t.text).collect::<Vec<_>>());

        let full_tok = JiebaTokenizer::default().with_mode(JiebaMode::Full);
        let out = full_tok.tokenize(sentence);
        debug_log!("Full: {:?}", out.iter().map(|t| &t.text).collect::<Vec<_>>());

        let search_tok = JiebaTokenizer::default().with_mode(JiebaMode::Search);
        let out = search_tok.tokenize(sentence);
        debug_log!("Search: {:?}", out.iter().map(|t| &t.text).collect::<Vec<_>>());
    }
}
}
#[cfg(feature = "tokenizer-jieba")]
pub use jieba_plugin::{JiebaTokenizer, JiebaMode};
/// Identifies a built-in tokenizer for `TokenizerFactory::create`.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum TokenizerType {
    /// Whitespace splitting (see `WhitespaceTokenizer`).
    Whitespace,
    /// Character n-grams of size 2 (see `NgramTokenizer`).
    Ngram,
    /// Chinese segmentation via jieba; only with the `tokenizer-jieba` feature.
    #[cfg(feature = "tokenizer-jieba")]
    Jieba,
}
/// Builds boxed [`Tokenizer`] instances from a type tag or a name string.
pub struct TokenizerFactory;

impl TokenizerFactory {
    /// Instantiates the tokenizer identified by `tokenizer_type` with its
    /// default configuration (n-gram size 2 for `Ngram`).
    pub fn create(tokenizer_type: TokenizerType) -> Box<dyn Tokenizer> {
        match tokenizer_type {
            TokenizerType::Whitespace => Box::new(WhitespaceTokenizer::default()),
            TokenizerType::Ngram => Box::new(NgramTokenizer::new(2)),
            #[cfg(feature = "tokenizer-jieba")]
            TokenizerType::Jieba => Box::new(JiebaTokenizer::default()),
        }
    }

    /// Looks up a tokenizer by name: "whitespace", "ngram"/"ngram2",
    /// "ngram3", and "jieba" (feature-gated). Returns `None` for unknown
    /// names.
    pub fn from_name(name: &str) -> Option<Box<dyn Tokenizer>> {
        let tokenizer: Box<dyn Tokenizer> = match name {
            "whitespace" => Box::new(WhitespaceTokenizer::default()),
            "ngram" | "ngram2" => Box::new(NgramTokenizer::new(2)),
            "ngram3" => Box::new(NgramTokenizer::new(3)),
            #[cfg(feature = "tokenizer-jieba")]
            "jieba" => Box::new(JiebaTokenizer::default()),
            _ => return None,
        };
        Some(tokenizer)
    }

    /// Canonical names accepted by [`TokenizerFactory::from_name`] under
    /// the current feature set (aliases like "ngram2"/"ngram3" are not
    /// listed).
    pub fn available_tokenizers() -> Vec<&'static str> {
        let mut names = vec!["whitespace", "ngram"];
        // cfg! evaluates to a plain bool, so this compiles (and the push is
        // dead code) even when the feature is off.
        if cfg!(feature = "tokenizer-jieba") {
            names.push("jieba");
        }
        names
    }
}
#[cfg(test)]
mod tests {
    use super::*;

    // Whitespace splitting lowercases tokens by default.
    #[test]
    fn test_whitespace_tokenizer() {
        let tok = WhitespaceTokenizer::default();
        let out = tok.tokenize("Hello World Test");
        assert_eq!(out.len(), 3);
        assert_eq!(out[0].text, "hello");
        assert_eq!(out[1].text, "world");
    }

    // The bigram tokenizer should emit at least one token for CJK input.
    #[test]
    fn test_ngram_tokenizer() {
        let tok = NgramTokenizer::new(2);
        let out = tok.tokenize("你好世界");
        assert!(!out.is_empty());
    }

    // Factory lookup by name, plus the advertised tokenizer list.
    #[test]
    fn test_tokenizer_factory() {
        let tok = TokenizerFactory::from_name("whitespace").unwrap();
        let out = tok.tokenize("test");
        assert_eq!(out.len(), 1);

        let names = TokenizerFactory::available_tokenizers();
        debug_log!("Available tokenizers: {:?}", names);
        assert!(names.contains(&"whitespace"));
        assert!(names.contains(&"ngram"));
    }

    // The Tokenizer trait can be implemented by user code; a per-character
    // tokenizer is the simplest possible example.
    #[test]
    fn test_custom_tokenizer() {
        struct CharTokenizer;

        impl Tokenizer for CharTokenizer {
            fn tokenize(&self, text: &str) -> Vec<Token> {
                let mut out = Vec::new();
                for (idx, ch) in text.chars().enumerate() {
                    out.push(Token {
                        text: ch.to_string(),
                        position: idx as u32,
                    });
                }
                out
            }

            fn name(&self) -> &str {
                "char"
            }
        }

        let tok = CharTokenizer;
        let out = tok.tokenize("ABC");
        assert_eq!(out.len(), 3);
        assert_eq!(out[0].text, "A");
        assert_eq!(out[1].text, "B");
        assert_eq!(out[2].text, "C");
    }
}