use unicode_segmentation::UnicodeSegmentation;
use super::LanguageTag;
/// Common interface for strategies that split a string into borrowed tokens.
///
/// Tokens are returned as `&str` slices carrying the lifetime of the input,
/// so no token text is copied by the interface itself. Implementors must be
/// `Send + Sync`.
pub trait Tokenizer: Send + Sync {
    /// Splits `text` eagerly and returns every token in a `Vec`.
    ///
    /// The returned slices are tied to `text` only, not to `self`.
    fn tokenize<'t>(&self, text: &'t str) -> Vec<&'t str>;

    /// Splits `text` and returns the tokens through a boxed iterator.
    ///
    /// The iterator borrows both `self` and `text` for the same lifetime.
    fn tokenize_iter<'t>(&'t self, text: &'t str) -> Box<dyn Iterator<Item = &'t str> + 't>;
}
/// Tokenizer configuration for whitespace-based splitting.
///
/// Built with [`WhitespaceTokenizer::new`] and adjusted through the
/// builder-style [`WhitespaceTokenizer::lowercase`] method.
#[derive(Clone, Debug, Default)]
pub struct WhitespaceTokenizer {
    /// Lowercasing option; `false` unless set through the builder.
    // NOTE(review): the `Tokenizer` impl in this file does not read this flag.
    lowercase: bool,
}

impl WhitespaceTokenizer {
    /// Creates a tokenizer with the `lowercase` option switched off.
    pub fn new() -> Self {
        Self { lowercase: false }
    }

    /// Creates a tokenizer for the given locale.
    ///
    /// The locale argument is currently ignored; the result is identical to
    /// the one produced by [`WhitespaceTokenizer::new`].
    pub fn with_locale(_locale: &str) -> Self {
        Self::new()
    }

    /// Consumes the tokenizer and returns it with the `lowercase` option set
    /// to the given value.
    pub fn lowercase(self, lowercase: bool) -> Self {
        let mut configured = self;
        configured.lowercase = lowercase;
        configured
    }
}
/// Whitespace-delimited tokenization built on [`str::split_whitespace`].
impl Tokenizer for WhitespaceTokenizer {
    /// Gathers every whitespace-separated piece of `text` into a `Vec`.
    ///
    /// Runs of whitespace never produce empty tokens, so an all-whitespace or
    /// empty input yields an empty vector.
    fn tokenize<'a>(&self, text: &'a str) -> Vec<&'a str> {
        let mut pieces = Vec::new();
        pieces.extend(text.split_whitespace());
        pieces
    }

    /// Returns a boxed iterator over the whitespace-separated pieces of `text`.
    fn tokenize_iter<'a>(&'a self, text: &'a str) -> Box<dyn Iterator<Item = &'a str> + 'a> {
        let pieces = text.split_whitespace();
        Box::new(pieces)
    }
}
/// Stateless tokenizer that extracts the words of a text.
///
/// The type is a unit struct: it carries no configuration.
#[derive(Clone, Debug, Default)]
pub struct UnicodeWordTokenizer;

impl UnicodeWordTokenizer {
    /// Creates a new tokenizer.
    pub fn new() -> Self {
        UnicodeWordTokenizer
    }
}
/// Word-level tokenization delegated to `UnicodeSegmentation::unicode_words`.
impl Tokenizer for UnicodeWordTokenizer {
    /// Gathers the words of `text`, as reported by `unicode_words`, into a `Vec`.
    fn tokenize<'a>(&self, text: &'a str) -> Vec<&'a str> {
        let mut words = Vec::new();
        words.extend(text.unicode_words());
        words
    }

    /// Returns a boxed iterator over the words of `text`.
    fn tokenize_iter<'a>(&'a self, text: &'a str) -> Box<dyn Iterator<Item = &'a str> + 'a> {
        let words = text.unicode_words();
        Box::new(words)
    }
}
/// Tokenizer configuration for character-level (grapheme) splitting.
///
/// Built with [`CharacterTokenizer::new`] and adjusted through the
/// builder-style [`CharacterTokenizer::include_punctuation`] method.
#[derive(Clone, Debug, Default)]
pub struct CharacterTokenizer {
    /// Whether graphemes classified as punctuation are kept as tokens;
    /// `false` unless set through the builder.
    include_punctuation: bool,
}

impl CharacterTokenizer {
    /// Creates a tokenizer that leaves punctuation out of its output.
    pub fn new() -> Self {
        Self {
            include_punctuation: false,
        }
    }

    /// Consumes the tokenizer and returns it with the punctuation option set
    /// to `include`.
    pub fn include_punctuation(self, include: bool) -> Self {
        let mut configured = self;
        configured.include_punctuation = include;
        configured
    }
}
/// Private classification helpers shared by both `Tokenizer` methods, so the
/// eager and the lazy path can never drift apart.
impl CharacterTokenizer {
    /// Returns `true` when `c` is treated as punctuation.
    ///
    /// ASCII characters follow [`char::is_ascii_punctuation`]. Beyond ASCII
    /// the standard library has no punctuation test, so a small table of
    /// Unicode ranges is used. It covers general, CJK and full-width
    /// punctuation and is not an exhaustive list of every script's marks.
    /// Like the ASCII rule, the ranges also contain symbol characters.
    fn is_punctuation(c: char) -> bool {
        if c.is_ascii() {
            return c.is_ascii_punctuation();
        }
        // Letters and numerals that live inside the ranges below (for example
        // U+3005 `々`, U+3007 `〇`, full-width `Ａ` and `１`) must stay tokens.
        if c.is_alphanumeric() {
            return false;
        }
        matches!(
            c,
            '\u{2010}'..='\u{2027}'       // General Punctuation: dashes, quotes, bullets, ellipsis
                | '\u{2030}'..='\u{205E}' // General Punctuation: primes, angle quotes, reference marks
                | '\u{3001}'..='\u{303F}' // CJK Symbols and Punctuation
                | '\u{30A0}'              // KATAKANA-HIRAGANA DOUBLE HYPHEN
                | '\u{30FB}'              // KATAKANA MIDDLE DOT
                | '\u{FE10}'..='\u{FE19}' // Vertical Forms
                | '\u{FE30}'..='\u{FE6B}' // CJK Compatibility Forms, Small Form Variants
                | '\u{FF01}'..='\u{FF65}' // Full-width and half-width punctuation
        )
    }

    /// Decides whether a grapheme cluster becomes a token.
    ///
    /// The cluster is classified by its first `char`: whitespace is always
    /// dropped, punctuation is dropped unless `include_punctuation` is set.
    fn keeps(grapheme: &str, include_punctuation: bool) -> bool {
        match grapheme.chars().next() {
            Some(first) if first.is_whitespace() => false,
            Some(first) => include_punctuation || !Self::is_punctuation(first),
            // An empty cluster carries no text; drop it.
            None => false,
        }
    }
}

/// Character-level tokenization: one token per extended grapheme cluster.
impl Tokenizer for CharacterTokenizer {
    /// Returns the grapheme clusters of `text`, skipping whitespace and,
    /// unless `include_punctuation` is enabled, punctuation.
    ///
    /// Punctuation is recognised for ASCII as well as for the common CJK and
    /// full-width forms (for example `。`, `、`, `「`, `！`).
    fn tokenize<'a>(&self, text: &'a str) -> Vec<&'a str> {
        let include_punctuation = self.include_punctuation;
        text.graphemes(true)
            .filter(|grapheme| Self::keeps(grapheme, include_punctuation))
            .collect()
    }

    /// Lazy counterpart of [`Tokenizer::tokenize`] applying the same filter.
    fn tokenize_iter<'a>(&'a self, text: &'a str) -> Box<dyn Iterator<Item = &'a str> + 'a> {
        // Copy the flag so the closure owns everything it needs.
        let include_punctuation = self.include_punctuation;
        Box::new(
            text.graphemes(true)
                .filter(move |grapheme| Self::keeps(grapheme, include_punctuation)),
        )
    }
}
/// Picks a tokenizer implementation for the given language tag.
///
/// Tags whose `language()` is `"zh"`, `"ja"`, `"ko"`, `"th"`, `"km"`, `"lo"`
/// or `"my"` get a [`CharacterTokenizer`] with default settings. Every other
/// language falls back to a [`UnicodeWordTokenizer`].
pub fn create_tokenizer(lang: &LanguageTag) -> Box<dyn Tokenizer> {
    let per_character = matches!(
        lang.language(),
        "zh" | "ja" | "ko" | "th" | "km" | "lo" | "my"
    );
    if per_character {
        Box::new(CharacterTokenizer::new())
    } else {
        Box::new(UnicodeWordTokenizer::new())
    }
}
#[cfg(test)]
mod tests {
    use super::*;

    /// Whitespace splitting returns the words unchanged, without separators.
    #[test]
    fn test_whitespace_tokenizer() {
        let words = WhitespaceTokenizer::new().tokenize("The quick brown fox");
        assert_eq!(words, ["The", "quick", "brown", "fox"]);
    }

    /// Word tokenization drops punctuation around the words.
    #[test]
    fn test_unicode_word_tokenizer() {
        let words = UnicodeWordTokenizer::new().tokenize("Hello, world! How are you?");
        assert_eq!(words, ["Hello", "world", "How", "are", "you"]);
    }

    /// Character tokenization yields one token per letter and skips the space.
    #[test]
    fn test_character_tokenizer() {
        let letters = CharacterTokenizer::new().tokenize("hello world");
        let expected = ["h", "e", "l", "l", "o", "w", "o", "r", "l", "d"];
        assert_eq!(letters, expected);
    }

    /// An English tag selects word-level tokenization.
    #[test]
    fn test_create_tokenizer_english() {
        let lang = LanguageTag::new("en");
        let words = create_tokenizer(&lang).tokenize("Hello world");
        assert_eq!(words, ["Hello", "world"]);
    }

    /// A Chinese tag selects character-level tokenization.
    #[test]
    fn test_create_tokenizer_chinese() {
        let lang = LanguageTag::new("zh");
        let letters = create_tokenizer(&lang).tokenize("hello");
        assert_eq!(letters, ["h", "e", "l", "l", "o"]);
    }
}