mod alphanum_only;
mod ascii_folding_filter;
mod empty_tokenizer;
mod facet_tokenizer;
mod lower_caser;
mod ngram_tokenizer;
mod raw_tokenizer;
mod regex_tokenizer;
mod remove_long;
mod simple_tokenizer;
mod split_compound_words;
mod stop_word_filter;
mod tokenized_string;
mod tokenizer;
mod tokenizer_manager;
mod whitespace_tokenizer;
#[cfg(feature = "stemmer")]
mod stemmer;
pub use tokenizer_api::{BoxTokenStream, Token, TokenFilter, TokenStream, Tokenizer};
pub use self::alphanum_only::AlphaNumOnlyFilter;
pub use self::ascii_folding_filter::{AsciiFoldingFilter, to_ascii};
pub use self::facet_tokenizer::FacetTokenizer;
pub use self::lower_caser::LowerCaser;
pub use self::ngram_tokenizer::NgramTokenizer;
pub use self::raw_tokenizer::RawTokenizer;
pub use self::regex_tokenizer::RegexTokenizer;
pub use self::remove_long::RemoveLongFilter;
pub use self::simple_tokenizer::{SimpleTokenStream, SimpleTokenizer};
pub use self::split_compound_words::SplitCompoundWords;
#[cfg(feature = "stemmer")]
pub use self::stemmer::{Language, Stemmer};
pub use self::stop_word_filter::StopWordFilter;
pub use self::tokenized_string::{PreTokenizedStream, PreTokenizedString};
pub use self::tokenizer::{TextAnalyzer, TextAnalyzerBuilder};
pub use self::tokenizer_manager::TokenizerManager;
pub use self::whitespace_tokenizer::WhitespaceTokenizer;
/// Maximum length (in bytes) a token may have.
///
/// Set to `u16::MAX - 5`; the 5-byte headroom below the `u16` limit is
/// presumably reserved for serialization overhead of terms —
/// NOTE(review): confirm the exact margin against the term-encoding code.
pub const MAX_TOKEN_LEN: usize = u16::MAX as usize - 5;
#[cfg(test)]
pub(crate) mod tests {
    use super::{Token, TokenizerManager};

    /// Asserts that `token` carries the expected position, text, and byte
    /// offsets, panicking with a message that shows the offending token.
    ///
    /// Shared by tokenizer tests across the crate (hence `pub`).
    pub fn assert_token(token: &Token, position: usize, text: &str, from: usize, to: usize) {
        assert_eq!(
            token.position, position,
            "expected position {position} but got {token:?}"
        );
        assert_eq!(token.text, text, "expected text {text} but got {token:?}");
        assert_eq!(
            token.offset_from, from,
            "expected offset_from {from} but got {token:?}"
        );
        assert_eq!(
            token.offset_to, to,
            "expected offset_to {to} but got {token:?}"
        );
    }

    /// Runs the tokenizer registered under `tokenizer_name` in the default
    /// `TokenizerManager` on `text` and collects every emitted token.
    fn tokens_for(tokenizer_name: &str, text: &str) -> Vec<Token> {
        let tokenizer_manager = TokenizerManager::default();
        let mut tokenizer = tokenizer_manager
            .get(tokenizer_name)
            .expect("tokenizer should be registered in the default manager");
        let mut tokens: Vec<Token> = Vec::new();
        tokenizer
            .token_stream(text)
            .process(&mut |token: &Token| tokens.push(token.clone()));
        tokens
    }

    // The "raw" tokenizer emits the entire input as one token.
    #[test]
    fn test_raw_tokenizer2() {
        let tokens = tokens_for("raw", "Hello, happy tax payer!");
        assert_eq!(tokens.len(), 1);
        assert_token(&tokens[0], 0, "Hello, happy tax payer!", 0, 23);
    }

    // Looking up an unregistered tokenizer yields `None`, not a panic.
    #[test]
    fn test_tokenizer_does_not_exist() {
        let tokenizer_manager = TokenizerManager::default();
        assert!(tokenizer_manager.get("en_doesnotexist").is_none());
    }

    // Whitespace-only input yields no tokens, and the same tokenizer
    // instance can be reused for a second stream (loop runs twice on
    // purpose to exercise the reuse path).
    #[test]
    fn test_tokenizer_empty() {
        let tokenizer_manager = TokenizerManager::default();
        let mut en_tokenizer = tokenizer_manager.get("default").unwrap();
        for _ in 0..2 {
            let mut tokens: Vec<Token> = Vec::new();
            en_tokenizer
                .token_stream(" ")
                .process(&mut |token: &Token| tokens.push(token.clone()));
            assert!(tokens.is_empty());
        }
    }

    // The whitespace tokenizer splits on whitespace only, so punctuation
    // stays attached to its word.
    #[test]
    fn test_whitespace_tokenizer() {
        let tokens = tokens_for("whitespace", "Hello, happy tax payer!");
        assert_eq!(tokens.len(), 4);
        assert_token(&tokens[0], 0, "Hello,", 0, 6);
        assert_token(&tokens[1], 1, "happy", 7, 12);
        assert_token(&tokens[2], 2, "tax", 13, 16);
        assert_token(&tokens[3], 3, "payer!", 17, 23);
    }
}