#[cfg(any(feature = "native", feature = "wasm"))]
mod hf_tokenizer;
#[cfg(any(feature = "native", feature = "wasm"))]
pub use hf_tokenizer::{HfTokenizer, TokenizerSource};
#[cfg(feature = "native")]
pub use hf_tokenizer::{TokenizerCache, tokenizer_cache};
use std::collections::{HashMap, HashSet};
use std::sync::Arc;
use parking_lot::RwLock;
use rust_stemmers::Algorithm;
use serde::{Deserialize, Serialize};
use stop_words::LANGUAGE;
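/// A single token produced by a [`Tokenizer`]: the token text, its position
/// in the token stream, and its byte-offset span in the source text.
///
/// Illustrative sketch (doctest `ignore`d; assumes the items are in scope):
///
/// ```ignore
/// let token = Token::new("hello".to_string(), 0, 0, 5);
/// assert_eq!(token.text, "hello");
/// assert_eq!((token.offset_from, token.offset_to), (0, 5));
/// ```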
#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
pub struct Token {
pub text: String,
pub position: u32,
pub offset_from: usize,
pub offset_to: usize,
}
impl Token {
pub fn new(text: String, position: u32, offset_from: usize, offset_to: usize) -> Self {
Self {
text,
position,
offset_from,
offset_to,
}
}
}
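/// Splits text into [`Token`]s. Implementors must be `Send + Sync + Clone`
/// so they can be shared across threads and stored in a [`TokenizerRegistry`].
///
/// Illustrative sketch (doctest `ignore`d; assumes the items are in scope):
///
/// ```ignore
/// let tokens = SimpleTokenizer.tokenize("hello world");
/// assert_eq!(tokens.len(), 2);
/// assert_eq!(tokens[1].text, "world");
/// ```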
pub trait Tokenizer: Send + Sync + Clone + 'static {
fn tokenize(&self, text: &str) -> Vec<Token>;
}
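/// Whitespace tokenizer that keeps each word verbatim, including case and
/// punctuation.
///
/// Illustrative sketch (doctest `ignore`d; assumes the items are in scope):
///
/// ```ignore
/// let tokens = SimpleTokenizer.tokenize("Hello, World!");
/// assert_eq!(tokens[0].text, "Hello,"); // case and punctuation preserved
/// ```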
#[derive(Debug, Clone, Default)]
pub struct SimpleTokenizer;
impl Tokenizer for SimpleTokenizer {
fn tokenize(&self, text: &str) -> Vec<Token> {
let mut tokens = Vec::new();
let mut position = 0u32;
for (offset, word) in split_whitespace_with_offsets(text) {
if !word.is_empty() {
tokens.push(Token::new(
word.to_string(),
position,
offset,
offset + word.len(),
));
position += 1;
}
}
tokens
}
}
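/// Whitespace tokenizer that lowercases each word and strips
/// non-alphanumeric characters; words that become empty are dropped.
///
/// Illustrative sketch (doctest `ignore`d; assumes the items are in scope):
///
/// ```ignore
/// let tokens = LowercaseTokenizer.tokenize("Hello, World!");
/// assert_eq!(tokens[0].text, "hello"); // comma stripped, case folded
/// ```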
#[derive(Debug, Clone, Default)]
pub struct LowercaseTokenizer;
impl Tokenizer for LowercaseTokenizer {
fn tokenize(&self, text: &str) -> Vec<Token> {
let mut tokens = Vec::new();
let mut position = 0u32;
for (offset, word) in split_whitespace_with_offsets(text) {
if !word.is_empty() {
let cleaned: String = word
.chars()
.filter(|c| c.is_alphanumeric())
.flat_map(|c| c.to_lowercase())
.collect();
if !cleaned.is_empty() {
tokens.push(Token::new(cleaned, position, offset, offset + word.len()));
position += 1;
}
}
}
tokens
}
}
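/// Yields each whitespace-separated word along with its starting byte offset
/// in `text`. Only whitespace lies between `offset` and the next word, so
/// `find` always locates the word at its true position and the `unwrap`
/// cannot fail; offsets and lengths are in bytes, not chars.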
fn split_whitespace_with_offsets(text: &str) -> impl Iterator<Item = (usize, &str)> {
let mut offset = 0;
text.split_whitespace().map(move |word| {
let word_start = text[offset..].find(word).unwrap() + offset;
offset = word_start + word.len();
(word_start, word)
})
}
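/// Languages supported by the stemming and stop-word tokenizers; defaults
/// to English.
///
/// Illustrative sketch (doctest `ignore`d; assumes the items are in scope):
///
/// ```ignore
/// assert_eq!(Language::default(), Language::English);
/// assert_eq!(parse_language("fr"), Language::French);
/// ```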
#[derive(Debug, Clone, Copy, Default, PartialEq, Eq, Hash, Serialize, Deserialize)]
#[allow(missing_docs)]
pub enum Language {
Arabic,
Danish,
Dutch,
#[default]
English,
Finnish,
French,
German,
Greek,
Hungarian,
Italian,
Norwegian,
Portuguese,
Romanian,
Russian,
Spanish,
Swedish,
Tamil,
Turkish,
}
impl Language {
fn to_algorithm(self) -> Algorithm {
match self {
Language::Arabic => Algorithm::Arabic,
Language::Danish => Algorithm::Danish,
Language::Dutch => Algorithm::Dutch,
Language::English => Algorithm::English,
Language::Finnish => Algorithm::Finnish,
Language::French => Algorithm::French,
Language::German => Algorithm::German,
Language::Greek => Algorithm::Greek,
Language::Hungarian => Algorithm::Hungarian,
Language::Italian => Algorithm::Italian,
Language::Norwegian => Algorithm::Norwegian,
Language::Portuguese => Algorithm::Portuguese,
Language::Romanian => Algorithm::Romanian,
Language::Russian => Algorithm::Russian,
Language::Spanish => Algorithm::Spanish,
Language::Swedish => Algorithm::Swedish,
Language::Tamil => Algorithm::Tamil,
Language::Turkish => Algorithm::Turkish,
}
}
fn to_stop_words_language(self) -> LANGUAGE {
match self {
Language::Arabic => LANGUAGE::Arabic,
Language::Danish => LANGUAGE::Danish,
Language::Dutch => LANGUAGE::Dutch,
Language::English => LANGUAGE::English,
Language::Finnish => LANGUAGE::Finnish,
Language::French => LANGUAGE::French,
Language::German => LANGUAGE::German,
Language::Greek => LANGUAGE::Greek,
Language::Hungarian => LANGUAGE::Hungarian,
Language::Italian => LANGUAGE::Italian,
Language::Norwegian => LANGUAGE::Norwegian,
Language::Portuguese => LANGUAGE::Portuguese,
Language::Romanian => LANGUAGE::Romanian,
Language::Russian => LANGUAGE::Russian,
Language::Spanish => LANGUAGE::Spanish,
Language::Swedish => LANGUAGE::Swedish,
Language::Tamil => LANGUAGE::English, // no Tamil stop-word mapping here; falls back to English
Language::Turkish => LANGUAGE::Turkish,
}
}
}
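/// Wraps another tokenizer and drops tokens whose text appears in a
/// stop-word list, either a built-in per-language list or a custom set.
///
/// Illustrative sketch (doctest `ignore`d; assumes the items are in scope):
///
/// ```ignore
/// let tokenizer = StopWordTokenizer::english(LowercaseTokenizer);
/// let tokens = tokenizer.tokenize("the quick fox");
/// let texts: Vec<_> = tokens.iter().map(|t| t.text.as_str()).collect();
/// assert_eq!(texts, vec!["quick", "fox"]); // "the" filtered out
/// ```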
#[derive(Debug, Clone)]
pub struct StopWordTokenizer<T: Tokenizer> {
inner: T,
stop_words: HashSet<String>,
}
impl<T: Tokenizer> StopWordTokenizer<T> {
pub fn new(inner: T, language: Language) -> Self {
let stop_words: HashSet<String> = stop_words::get(language.to_stop_words_language())
.into_iter()
.map(|s| s.to_string())
.collect();
Self { inner, stop_words }
}
pub fn english(inner: T) -> Self {
Self::new(inner, Language::English)
}
pub fn with_custom_stop_words(inner: T, stop_words: HashSet<String>) -> Self {
Self { inner, stop_words }
}
pub fn is_stop_word(&self, word: &str) -> bool {
self.stop_words.contains(word)
}
}
impl<T: Tokenizer> Tokenizer for StopWordTokenizer<T> {
fn tokenize(&self, text: &str) -> Vec<Token> {
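// Filtering runs on the inner tokenizer's output, so with a stemming
// inner tokenizer the stop-word check compares against stemmed text.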
self.inner
.tokenize(text)
.into_iter()
.filter(|token| !self.stop_words.contains(&token.text))
.collect()
}
}
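/// Tokenizer that lowercases, strips non-alphanumeric characters, and
/// applies the Snowball stemmer for the configured language.
///
/// Illustrative sketch (doctest `ignore`d; assumes the items are in scope):
///
/// ```ignore
/// let tokenizer = StemmerTokenizer::english();
/// let tokens = tokenizer.tokenize("running dogs");
/// assert_eq!(tokens[0].text, "run");
/// assert_eq!(tokens[1].text, "dog");
/// ```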
#[derive(Debug, Clone)]
pub struct StemmerTokenizer {
language: Language,
}
impl StemmerTokenizer {
pub fn new(language: Language) -> Self {
Self { language }
}
pub fn english() -> Self {
Self::new(Language::English)
}
}
impl Default for StemmerTokenizer {
fn default() -> Self {
Self::english()
}
}
impl Tokenizer for StemmerTokenizer {
fn tokenize(&self, text: &str) -> Vec<Token> {
let stemmer = rust_stemmers::Stemmer::create(self.language.to_algorithm());
let mut tokens = Vec::new();
let mut position = 0u32;
for (offset, word) in split_whitespace_with_offsets(text) {
if !word.is_empty() {
let cleaned: String = word
.chars()
.filter(|c| c.is_alphanumeric())
.flat_map(|c| c.to_lowercase())
.collect();
if !cleaned.is_empty() {
let stemmed = stemmer.stem(&cleaned);
tokens.push(Token::new(
stemmed.into_owned(),
position,
offset,
offset + word.len(),
));
position += 1;
}
}
}
tokens
}
}
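/// Stemming tokenizer whose language can be chosen per call; the plain
/// [`Tokenizer`] impl falls back to the configured default language.
///
/// Illustrative sketch (doctest `ignore`d; assumes the items are in scope):
///
/// ```ignore
/// let stemmer = MultiLanguageStemmer::new(Language::English);
/// let tokens = stemmer.tokenize_with_language("Häuser", Language::German);
/// assert_eq!(tokens[0].text, "haus");
/// ```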
#[derive(Debug, Clone)]
pub struct MultiLanguageStemmer {
default_language: Language,
}
impl MultiLanguageStemmer {
pub fn new(default_language: Language) -> Self {
Self { default_language }
}
pub fn tokenize_with_language(&self, text: &str, language: Language) -> Vec<Token> {
let stemmer = rust_stemmers::Stemmer::create(language.to_algorithm());
let mut tokens = Vec::new();
let mut position = 0u32;
for (offset, word) in split_whitespace_with_offsets(text) {
if !word.is_empty() {
let cleaned: String = word
.chars()
.filter(|c| c.is_alphanumeric())
.flat_map(|c| c.to_lowercase())
.collect();
if !cleaned.is_empty() {
let stemmed = stemmer.stem(&cleaned);
tokens.push(Token::new(
stemmed.into_owned(),
position,
offset,
offset + word.len(),
));
position += 1;
}
}
}
tokens
}
pub fn default_language(&self) -> Language {
self.default_language
}
}
impl Default for MultiLanguageStemmer {
fn default() -> Self {
Self::new(Language::English)
}
}
impl Tokenizer for MultiLanguageStemmer {
fn tokenize(&self, text: &str) -> Vec<Token> {
self.tokenize_with_language(text, self.default_language)
}
}
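/// Tokenizer that resolves a [`Language`] from a caller-supplied hint string
/// (e.g. "en" or "german") via the selector function, then stems in that
/// language; without a hint it uses the stemmer's default language.
///
/// Illustrative sketch (doctest `ignore`d; assumes the items are in scope):
///
/// ```ignore
/// let tokenizer = LanguageAwareTokenizer::new(parse_language);
/// let tokens = tokenizer.tokenize_with_hint("Häuser Bücher", "de");
/// assert_eq!(tokens[0].text, "haus");
/// ```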
#[derive(Clone)]
pub struct LanguageAwareTokenizer<F>
where
F: Fn(&str) -> Language + Clone + Send + Sync + 'static,
{
language_selector: F,
stemmer: MultiLanguageStemmer,
}
impl<F> LanguageAwareTokenizer<F>
where
F: Fn(&str) -> Language + Clone + Send + Sync + 'static,
{
pub fn new(language_selector: F) -> Self {
Self {
language_selector,
stemmer: MultiLanguageStemmer::default(),
}
}
pub fn tokenize_with_hint(&self, text: &str, language_hint: &str) -> Vec<Token> {
let language = (self.language_selector)(language_hint);
self.stemmer.tokenize_with_language(text, language)
}
}
impl<F> Tokenizer for LanguageAwareTokenizer<F>
where
F: Fn(&str) -> Language + Clone + Send + Sync + 'static,
{
fn tokenize(&self, text: &str) -> Vec<Token> {
// With no per-call hint available, fall back to the stemmer's default language.
self.stemmer
.tokenize_with_language(text, self.stemmer.default_language())
}
}
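/// Parses an ISO 639-1 code or an English language name (case-insensitive)
/// into a [`Language`], falling back to English for unrecognized input.
///
/// Illustrative sketch (doctest `ignore`d; assumes the items are in scope):
///
/// ```ignore
/// assert_eq!(parse_language("de"), Language::German);
/// assert_eq!(parse_language("German"), Language::German);
/// assert_eq!(parse_language("??"), Language::English); // fallback
/// ```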
pub fn parse_language(s: &str) -> Language {
match s.to_lowercase().as_str() {
"ar" | "arabic" => Language::Arabic,
"da" | "danish" => Language::Danish,
"nl" | "dutch" => Language::Dutch,
"en" | "english" => Language::English,
"fi" | "finnish" => Language::Finnish,
"fr" | "french" => Language::French,
"de" | "german" => Language::German,
"el" | "greek" => Language::Greek,
"hu" | "hungarian" => Language::Hungarian,
"it" | "italian" => Language::Italian,
"no" | "norwegian" => Language::Norwegian,
"pt" | "portuguese" => Language::Portuguese,
"ro" | "romanian" => Language::Romanian,
"ru" | "russian" => Language::Russian,
"es" | "spanish" => Language::Spanish,
"sv" | "swedish" => Language::Swedish,
"ta" | "tamil" => Language::Tamil,
"tr" | "turkish" => Language::Turkish,
_ => Language::English,
}
}
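/// Type-erased, cloneable tokenizer handle used by [`TokenizerRegistry`].
///
/// [`Tokenizer`] is not object-safe (it requires `Clone + 'static`), so
/// [`TokenizerClone`] provides an object-safe bridge: `clone_box` returns a
/// fresh boxed copy, and the blanket impl covers every [`Tokenizer`].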
pub type BoxedTokenizer = Box<dyn TokenizerClone>;
pub trait TokenizerClone: Send + Sync {
fn tokenize(&self, text: &str) -> Vec<Token>;
fn clone_box(&self) -> BoxedTokenizer;
}
impl<T: Tokenizer> TokenizerClone for T {
fn tokenize(&self, text: &str) -> Vec<Token> {
Tokenizer::tokenize(self, text)
}
fn clone_box(&self) -> BoxedTokenizer {
Box::new(self.clone())
}
}
impl Clone for BoxedTokenizer {
fn clone(&self) -> Self {
self.clone_box()
}
}
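/// Thread-safe, shareable registry mapping names such as `"en_stem"` or
/// `"de_stop"` to tokenizers; clones share the same underlying map.
///
/// Illustrative sketch (doctest `ignore`d; assumes the items are in scope):
///
/// ```ignore
/// let registry = TokenizerRegistry::new();
/// registry.register("my_tokenizer", LowercaseTokenizer);
/// let tokenizer = registry.get("en_stem").unwrap();
/// assert_eq!(tokenizer.tokenize("running dogs")[0].text, "run");
/// ```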
#[derive(Clone)]
pub struct TokenizerRegistry {
tokenizers: Arc<RwLock<HashMap<String, BoxedTokenizer>>>,
}
impl TokenizerRegistry {
pub fn new() -> Self {
let registry = Self {
tokenizers: Arc::new(RwLock::new(HashMap::new())),
};
registry.register_defaults();
registry
}
fn register_defaults(&self) {
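// Built-in names follow a consistent scheme: plain tokenizers ("simple",
// "lowercase", "default", "raw"), stemmers as "<iso>_stem" plus the full
// language name, stop-word filters as "<iso>_stop", and combined
// stem-plus-stop-word pipelines as "<iso>_stem_stop".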
self.register("default", LowercaseTokenizer);
self.register("simple", SimpleTokenizer);
self.register("lowercase", LowercaseTokenizer);
self.register("raw", SimpleTokenizer);
self.register("en_stem", StemmerTokenizer::new(Language::English));
self.register("english", StemmerTokenizer::new(Language::English));
self.register("ar_stem", StemmerTokenizer::new(Language::Arabic));
self.register("arabic", StemmerTokenizer::new(Language::Arabic));
self.register("da_stem", StemmerTokenizer::new(Language::Danish));
self.register("danish", StemmerTokenizer::new(Language::Danish));
self.register("nl_stem", StemmerTokenizer::new(Language::Dutch));
self.register("dutch", StemmerTokenizer::new(Language::Dutch));
self.register("fi_stem", StemmerTokenizer::new(Language::Finnish));
self.register("finnish", StemmerTokenizer::new(Language::Finnish));
self.register("fr_stem", StemmerTokenizer::new(Language::French));
self.register("french", StemmerTokenizer::new(Language::French));
self.register("de_stem", StemmerTokenizer::new(Language::German));
self.register("german", StemmerTokenizer::new(Language::German));
self.register("el_stem", StemmerTokenizer::new(Language::Greek));
self.register("greek", StemmerTokenizer::new(Language::Greek));
self.register("hu_stem", StemmerTokenizer::new(Language::Hungarian));
self.register("hungarian", StemmerTokenizer::new(Language::Hungarian));
self.register("it_stem", StemmerTokenizer::new(Language::Italian));
self.register("italian", StemmerTokenizer::new(Language::Italian));
self.register("no_stem", StemmerTokenizer::new(Language::Norwegian));
self.register("norwegian", StemmerTokenizer::new(Language::Norwegian));
self.register("pt_stem", StemmerTokenizer::new(Language::Portuguese));
self.register("portuguese", StemmerTokenizer::new(Language::Portuguese));
self.register("ro_stem", StemmerTokenizer::new(Language::Romanian));
self.register("romanian", StemmerTokenizer::new(Language::Romanian));
self.register("ru_stem", StemmerTokenizer::new(Language::Russian));
self.register("russian", StemmerTokenizer::new(Language::Russian));
self.register("es_stem", StemmerTokenizer::new(Language::Spanish));
self.register("spanish", StemmerTokenizer::new(Language::Spanish));
self.register("sv_stem", StemmerTokenizer::new(Language::Swedish));
self.register("swedish", StemmerTokenizer::new(Language::Swedish));
self.register("ta_stem", StemmerTokenizer::new(Language::Tamil));
self.register("tamil", StemmerTokenizer::new(Language::Tamil));
self.register("tr_stem", StemmerTokenizer::new(Language::Turkish));
self.register("turkish", StemmerTokenizer::new(Language::Turkish));
self.register(
"en_stop",
StopWordTokenizer::new(LowercaseTokenizer, Language::English),
);
self.register(
"de_stop",
StopWordTokenizer::new(LowercaseTokenizer, Language::German),
);
self.register(
"fr_stop",
StopWordTokenizer::new(LowercaseTokenizer, Language::French),
);
self.register(
"ru_stop",
StopWordTokenizer::new(LowercaseTokenizer, Language::Russian),
);
self.register(
"es_stop",
StopWordTokenizer::new(LowercaseTokenizer, Language::Spanish),
);
self.register(
"en_stem_stop",
StopWordTokenizer::new(StemmerTokenizer::new(Language::English), Language::English),
);
self.register(
"de_stem_stop",
StopWordTokenizer::new(StemmerTokenizer::new(Language::German), Language::German),
);
self.register(
"fr_stem_stop",
StopWordTokenizer::new(StemmerTokenizer::new(Language::French), Language::French),
);
self.register(
"ru_stem_stop",
StopWordTokenizer::new(StemmerTokenizer::new(Language::Russian), Language::Russian),
);
self.register(
"es_stem_stop",
StopWordTokenizer::new(StemmerTokenizer::new(Language::Spanish), Language::Spanish),
);
}
pub fn register<T: Tokenizer>(&self, name: &str, tokenizer: T) {
let mut tokenizers = self.tokenizers.write();
tokenizers.insert(name.to_string(), Box::new(tokenizer));
}
pub fn get(&self, name: &str) -> Option<BoxedTokenizer> {
let tokenizers = self.tokenizers.read();
tokenizers.get(name).cloned()
}
pub fn contains(&self, name: &str) -> bool {
let tokenizers = self.tokenizers.read();
tokenizers.contains_key(name)
}
pub fn names(&self) -> Vec<String> {
let tokenizers = self.tokenizers.read();
tokenizers.keys().cloned().collect()
}
}
impl Default for TokenizerRegistry {
fn default() -> Self {
Self::new()
}
}
#[cfg(test)]
mod tests {
use super::*;
#[test]
fn test_simple_tokenizer() {
let tokenizer = SimpleTokenizer;
let tokens = Tokenizer::tokenize(&tokenizer, "hello world");
assert_eq!(tokens.len(), 2);
assert_eq!(tokens[0].text, "hello");
assert_eq!(tokens[0].position, 0);
assert_eq!(tokens[1].text, "world");
assert_eq!(tokens[1].position, 1);
}
#[test]
fn test_lowercase_tokenizer() {
let tokenizer = LowercaseTokenizer;
let tokens = Tokenizer::tokenize(&tokenizer, "Hello, World!");
assert_eq!(tokens.len(), 2);
assert_eq!(tokens[0].text, "hello");
assert_eq!(tokens[1].text, "world");
}
#[test]
fn test_empty_text() {
let tokenizer = SimpleTokenizer;
let tokens = Tokenizer::tokenize(&tokenizer, "");
assert!(tokens.is_empty());
}
#[test]
fn test_stemmer_tokenizer_english() {
let tokenizer = StemmerTokenizer::english();
let tokens = Tokenizer::tokenize(&tokenizer, "Dogs are running quickly");
assert_eq!(tokens.len(), 4);
assert_eq!(tokens[0].text, "dog"); assert_eq!(tokens[1].text, "are"); assert_eq!(tokens[2].text, "run"); assert_eq!(tokens[3].text, "quick"); }
#[test]
fn test_stemmer_tokenizer_preserves_offsets() {
let tokenizer = StemmerTokenizer::english();
let tokens = Tokenizer::tokenize(&tokenizer, "Running dogs");
assert_eq!(tokens.len(), 2);
assert_eq!(tokens[0].text, "run");
assert_eq!(tokens[0].offset_from, 0);
assert_eq!(tokens[0].offset_to, 7);
assert_eq!(tokens[1].text, "dog");
assert_eq!(tokens[1].offset_from, 8);
assert_eq!(tokens[1].offset_to, 12);
}
#[test]
fn test_stemmer_tokenizer_german() {
let tokenizer = StemmerTokenizer::new(Language::German);
let tokens = Tokenizer::tokenize(&tokenizer, "Häuser Bücher");
assert_eq!(tokens.len(), 2);
assert_eq!(tokens[0].text, "haus"); assert_eq!(tokens[1].text, "buch"); }
#[test]
fn test_stemmer_tokenizer_russian() {
let tokenizer = StemmerTokenizer::new(Language::Russian);
let tokens = Tokenizer::tokenize(&tokenizer, "бегущие собаки");
assert_eq!(tokens.len(), 2);
assert_eq!(tokens[0].text, "бегущ"); assert_eq!(tokens[1].text, "собак"); }
#[test]
fn test_multi_language_stemmer() {
let stemmer = MultiLanguageStemmer::new(Language::English);
let tokens = stemmer.tokenize_with_language("running dogs", Language::English);
assert_eq!(tokens[0].text, "run");
assert_eq!(tokens[1].text, "dog");
let tokens = stemmer.tokenize_with_language("Häuser Bücher", Language::German);
assert_eq!(tokens[0].text, "haus");
assert_eq!(tokens[1].text, "buch");
let tokens = stemmer.tokenize_with_language("бегущие собаки", Language::Russian);
assert_eq!(tokens[0].text, "бегущ");
assert_eq!(tokens[1].text, "собак");
}
#[test]
fn test_language_aware_tokenizer() {
let tokenizer = LanguageAwareTokenizer::new(parse_language);
let tokens = tokenizer.tokenize_with_hint("running dogs", "en");
assert_eq!(tokens[0].text, "run");
assert_eq!(tokens[1].text, "dog");
let tokens = tokenizer.tokenize_with_hint("Häuser Bücher", "de");
assert_eq!(tokens[0].text, "haus");
assert_eq!(tokens[1].text, "buch");
let tokens = tokenizer.tokenize_with_hint("бегущие собаки", "russian");
assert_eq!(tokens[0].text, "бегущ");
assert_eq!(tokens[1].text, "собак");
}
#[test]
fn test_parse_language() {
assert_eq!(parse_language("en"), Language::English);
assert_eq!(parse_language("english"), Language::English);
assert_eq!(parse_language("English"), Language::English);
assert_eq!(parse_language("de"), Language::German);
assert_eq!(parse_language("german"), Language::German);
assert_eq!(parse_language("ru"), Language::Russian);
assert_eq!(parse_language("russian"), Language::Russian);
assert_eq!(parse_language("unknown"), Language::English); }
#[test]
fn test_tokenizer_registry_defaults() {
let registry = TokenizerRegistry::new();
assert!(registry.contains("default"));
assert!(registry.contains("simple"));
assert!(registry.contains("lowercase"));
assert!(registry.contains("en_stem"));
assert!(registry.contains("german"));
assert!(registry.contains("russian"));
}
#[test]
fn test_tokenizer_registry_get() {
let registry = TokenizerRegistry::new();
let tokenizer = registry.get("en_stem").unwrap();
let tokens = tokenizer.tokenize("running dogs");
assert_eq!(tokens[0].text, "run");
assert_eq!(tokens[1].text, "dog");
let tokenizer = registry.get("german").unwrap();
let tokens = tokenizer.tokenize("Häuser Bücher");
assert_eq!(tokens[0].text, "haus");
assert_eq!(tokens[1].text, "buch");
}
#[test]
fn test_tokenizer_registry_custom() {
let registry = TokenizerRegistry::new();
registry.register("my_tokenizer", LowercaseTokenizer);
assert!(registry.contains("my_tokenizer"));
let tokenizer = registry.get("my_tokenizer").unwrap();
let tokens = tokenizer.tokenize("Hello World");
assert_eq!(tokens[0].text, "hello");
assert_eq!(tokens[1].text, "world");
}
#[test]
fn test_tokenizer_registry_nonexistent() {
let registry = TokenizerRegistry::new();
assert!(registry.get("nonexistent").is_none());
}
#[test]
fn test_stop_word_tokenizer_english() {
let tokenizer = StopWordTokenizer::english(LowercaseTokenizer);
let tokens = Tokenizer::tokenize(&tokenizer, "The quick brown fox jumps over the lazy dog");
let texts: Vec<&str> = tokens.iter().map(|t| t.text.as_str()).collect();
assert!(!texts.contains(&"the"));
assert!(!texts.contains(&"over"));
assert!(texts.contains(&"quick"));
assert!(texts.contains(&"brown"));
assert!(texts.contains(&"fox"));
assert!(texts.contains(&"jumps"));
assert!(texts.contains(&"lazy"));
assert!(texts.contains(&"dog"));
}
#[test]
fn test_stop_word_tokenizer_with_stemmer() {
let tokenizer = StopWordTokenizer::new(StemmerTokenizer::english(), Language::English);
let tokens = Tokenizer::tokenize(&tokenizer, "elephants galaxies quantum");
let texts: Vec<&str> = tokens.iter().map(|t| t.text.as_str()).collect();
assert!(texts.contains(&"eleph")); assert!(texts.contains(&"galaxi")); assert!(texts.contains(&"quantum")); }
#[test]
fn test_stop_word_tokenizer_german() {
let tokenizer = StopWordTokenizer::new(LowercaseTokenizer, Language::German);
let tokens = Tokenizer::tokenize(&tokenizer, "Der Hund und die Katze");
let texts: Vec<&str> = tokens.iter().map(|t| t.text.as_str()).collect();
assert!(!texts.contains(&"der"));
assert!(!texts.contains(&"und"));
assert!(!texts.contains(&"die"));
assert!(texts.contains(&"hund"));
assert!(texts.contains(&"katze"));
}
#[test]
fn test_stop_word_tokenizer_custom() {
let custom_stops: HashSet<String> = ["foo", "bar"].iter().map(|s| s.to_string()).collect();
let tokenizer = StopWordTokenizer::with_custom_stop_words(LowercaseTokenizer, custom_stops);
let tokens = Tokenizer::tokenize(&tokenizer, "foo baz bar qux");
let texts: Vec<&str> = tokens.iter().map(|t| t.text.as_str()).collect();
assert!(!texts.contains(&"foo"));
assert!(!texts.contains(&"bar"));
assert!(texts.contains(&"baz"));
assert!(texts.contains(&"qux"));
}
#[test]
fn test_stop_word_tokenizer_is_stop_word() {
let tokenizer = StopWordTokenizer::english(LowercaseTokenizer);
assert!(tokenizer.is_stop_word("the"));
assert!(tokenizer.is_stop_word("and"));
assert!(tokenizer.is_stop_word("is"));
assert!(!tokenizer.is_stop_word("elephant"));
assert!(!tokenizer.is_stop_word("quantum"));
}
#[test]
fn test_tokenizer_registry_stop_word_tokenizers() {
let registry = TokenizerRegistry::new();
assert!(registry.contains("en_stop"));
assert!(registry.contains("en_stem_stop"));
assert!(registry.contains("de_stop"));
assert!(registry.contains("ru_stop"));
let tokenizer = registry.get("en_stop").unwrap();
let tokens = tokenizer.tokenize("The quick fox");
let texts: Vec<&str> = tokens.iter().map(|t| t.text.as_str()).collect();
assert!(!texts.contains(&"the"));
assert!(texts.contains(&"quick"));
assert!(texts.contains(&"fox"));
let tokenizer = registry.get("en_stem_stop").unwrap();
let tokens = tokenizer.tokenize("elephants galaxies");
let texts: Vec<&str> = tokens.iter().map(|t| t.text.as_str()).collect();
assert!(texts.contains(&"eleph")); assert!(texts.contains(&"galaxi")); }
}