#![allow(rustdoc::broken_intra_doc_links)]
mod char_filter;
pub mod config;
mod filter;
mod token;
mod tokenizer;
pub use char_filter::{
CharFilter, HtmlStripCharFilter, MappingCharFilter, OffsetCorrection, PatternReplaceCharFilter,
correct_offset,
};
pub use filter::{
AsciiFoldingFilter, EdgeNGramTokenFilter, LowercaseFilter, NGramTokenFilter, ShingleFilter,
StemmerAlgorithm, StemmerFilter, StopFilter, SynonymFilter, TokenFilter,
};
pub use token::Token;
pub use tokenizer::{
EdgeNGramTokenizer, KeywordTokenizer, LetterTokenizer, NGramTokenizer, PathHierarchyTokenizer,
PatternTokenizer, StandardTokenizer, Tokenizer, WhitespaceTokenizer,
};
use std::collections::HashMap;
/// A named text-analysis pipeline: character filters, then a tokenizer, then
/// token filters.
///
/// [`Analyzer::analyze`] runs the stages in that order.
pub struct Analyzer {
/// Name returned by [`Analyzer::name`]; also used as the key when the
/// analyzer is stored in an [`AnalyzerRegistry`].
name: String,
/// Character filters applied, in order, to the raw input before tokenization.
char_filters: Vec<Box<dyn CharFilter>>,
/// Splits the (possibly char-filtered) text into tokens.
tokenizer: Box<dyn Tokenizer>,
/// Token filters applied, in order, to the tokenizer's output.
filters: Vec<Box<dyn TokenFilter>>,
}
impl Analyzer {
    /// Creates an analyzer without character filters.
    ///
    /// `tokenizer` is boxed internally; `filters` are applied to the token
    /// stream in the order given.
    pub fn new(
        name: impl Into<String>,
        tokenizer: impl Tokenizer + 'static,
        filters: Vec<Box<dyn TokenFilter>>,
    ) -> Self {
        Self::from_boxed(name, Vec::new(), Box::new(tokenizer), filters)
    }

    /// Creates an analyzer whose input is first rewritten by `char_filters`
    /// (in the order given) before being handed to `tokenizer`.
    pub fn with_char_filters(
        name: impl Into<String>,
        char_filters: Vec<Box<dyn CharFilter>>,
        tokenizer: impl Tokenizer + 'static,
        filters: Vec<Box<dyn TokenFilter>>,
    ) -> Self {
        Self::from_boxed(name, char_filters, Box::new(tokenizer), filters)
    }

    /// Creates an analyzer from an already boxed tokenizer.
    ///
    /// Useful when the tokenizer type is only known at runtime.
    pub fn from_boxed(
        name: impl Into<String>,
        char_filters: Vec<Box<dyn CharFilter>>,
        tokenizer: Box<dyn Tokenizer>,
        filters: Vec<Box<dyn TokenFilter>>,
    ) -> Self {
        Self {
            name: name.into(),
            char_filters,
            tokenizer,
            filters,
        }
    }

    /// Runs the full pipeline over `text` and returns the resulting tokens.
    ///
    /// Steps:
    /// 1. every character filter rewrites the text in turn;
    /// 2. the tokenizer splits the rewritten text;
    /// 3. token offsets are mapped back onto the original `text`;
    /// 4. every token filter is applied in order.
    pub fn analyze(&self, text: &str) -> Vec<Token> {
        let (filtered, stages) = self.apply_char_filters(text);
        // Whenever a char filter ran, its output is what must be tokenized,
        // even if it produced no offset corrections (e.g. a same-length
        // replacement). Only fall back to `text` when no filter is configured.
        let input = filtered.as_deref().unwrap_or(text);

        let mut tokens = Vec::new();
        self.tokenizer.tokenize(input, &mut tokens);

        if !stages.is_empty() {
            for token in &mut tokens {
                // Each stage's corrections describe that filter's own
                // output -> input mapping, so unwind them last-to-first to
                // get back to offsets in the original text.
                for stage in stages.iter().rev() {
                    token.offset_from = correct_offset(token.offset_from, stage);
                    token.offset_to = correct_offset(token.offset_to, stage);
                }
            }
        }

        for filter in &self.filters {
            filter.apply(&mut tokens);
        }
        tokens
    }

    /// Returns the analyzer's name.
    pub fn name(&self) -> &str {
        &self.name
    }

    /// Feeds `text` through every character filter in order.
    ///
    /// Returns `None` for the text when no character filter is configured
    /// (the caller should use the original input), otherwise the fully
    /// filtered text. The second element holds one correction list per filter
    /// that reported any corrections, in application order.
    fn apply_char_filters(&self, text: &str) -> (Option<String>, Vec<Vec<OffsetCorrection>>) {
        if self.char_filters.is_empty() {
            return (None, Vec::new());
        }
        let mut current = text.to_string();
        let mut stages = Vec::new();
        for cf in &self.char_filters {
            let (filtered, corrections) = cf.filter(&current);
            let mut stage = Vec::new();
            stage.extend(corrections);
            // A filter without corrections leaves offsets untouched; skip it
            // so the offset-mapping loop does no needless work.
            if !stage.is_empty() {
                stages.push(stage);
            }
            current = filtered;
        }
        (Some(current), stages)
    }
}
/// A collection of [`Analyzer`]s that can be looked up by name.
pub struct AnalyzerRegistry {
/// Registered analyzers keyed by their own name; registering a second
/// analyzer under an existing name replaces the earlier one.
analyzers: HashMap<String, Analyzer>,
}
impl std::fmt::Debug for AnalyzerRegistry {
    /// Formats the registry as a struct whose `analyzers` field lists only the
    /// registered names; the analyzers themselves are not formatted.
    ///
    /// Names appear in the map's iteration order, which is unspecified.
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        let registered: Vec<&String> = self.analyzers.keys().collect();
        let mut out = f.debug_struct("AnalyzerRegistry");
        out.field("analyzers", &registered);
        out.finish()
    }
}
impl AnalyzerRegistry {
    /// Creates a registry pre-populated with the built-in analyzers:
    /// `standard`, `simple`, `whitespace`, `keyword` and `stop`.
    pub fn new() -> Self {
        let mut registry = Self {
            analyzers: HashMap::new(),
        };
        registry.register(standard_analyzer());
        registry.register(simple_analyzer());
        registry.register(whitespace_analyzer());
        registry.register(keyword_analyzer());
        registry.register(stop_analyzer());
        registry
    }

    /// Registers `analyzer` under its own name, replacing any analyzer
    /// previously registered under that name.
    pub fn register(&mut self, analyzer: Analyzer) {
        self.analyzers.insert(analyzer.name.clone(), analyzer);
    }

    /// Returns the analyzer registered as `name`, falling back to the
    /// `standard` analyzer when `name` is unknown.
    ///
    /// Use [`AnalyzerRegistry::try_get`] to detect unknown names instead of
    /// silently falling back.
    ///
    /// # Panics
    ///
    /// Panics if `name` is unknown and no `standard` analyzer is registered.
    /// [`AnalyzerRegistry::new`] always registers one and the methods in this
    /// block never remove entries, so this indicates a broken invariant.
    pub fn get(&self, name: &str) -> &Analyzer {
        self.analyzers.get(name).unwrap_or_else(|| {
            self.analyzers
                .get("standard")
                .expect("the built-in \"standard\" analyzer is registered by AnalyzerRegistry::new")
        })
    }

    /// Returns the analyzer registered as `name`, or `None` if there is none.
    pub fn try_get(&self, name: &str) -> Option<&Analyzer> {
        self.analyzers.get(name)
    }

    /// Returns the names of all registered analyzers, in unspecified order.
    pub fn names(&self) -> Vec<&str> {
        self.analyzers.keys().map(String::as_str).collect()
    }
}
impl Default for AnalyzerRegistry {
fn default() -> Self {
Self::new()
}
}
pub fn standard_analyzer() -> Analyzer {
Analyzer::new(
"standard",
StandardTokenizer,
vec![Box::new(LowercaseFilter)],
)
}
pub fn simple_analyzer() -> Analyzer {
Analyzer::new("simple", LetterTokenizer, vec![Box::new(LowercaseFilter)])
}
/// Builds the `whitespace` analyzer: [`WhitespaceTokenizer`] with no token
/// filters.
pub fn whitespace_analyzer() -> Analyzer {
    let no_filters = Vec::new();
    Analyzer::new("whitespace", WhitespaceTokenizer, no_filters)
}
/// Builds the `keyword` analyzer: [`KeywordTokenizer`] with no token filters.
pub fn keyword_analyzer() -> Analyzer {
    let no_filters = Vec::new();
    Analyzer::new("keyword", KeywordTokenizer, no_filters)
}
pub fn stop_analyzer() -> Analyzer {
Analyzer::new(
"stop",
StandardTokenizer,
vec![Box::new(LowercaseFilter), Box::new(StopFilter::english())],
)
}
pub fn language_analyzer(algorithm: StemmerAlgorithm) -> Analyzer {
Analyzer::new(
"language",
StandardTokenizer,
vec![
Box::new(LowercaseFilter),
Box::new(StopFilter::english()),
Box::new(StemmerFilter::new(algorithm)),
],
)
}
#[cfg(test)]
mod tests {
    use super::*;

    /// Borrows the text of each token, in order, so expectations can be
    /// written as plain string slices.
    fn token_texts(tokens: &[Token]) -> Vec<&str> {
        tokens.iter().map(|token| token.text.as_str()).collect()
    }

    #[test]
    fn standard_analyzer_basic() {
        let tokens = standard_analyzer().analyze("The Quick Brown Fox");
        assert_eq!(token_texts(&tokens), vec!["the", "quick", "brown", "fox"]);
    }

    #[test]
    fn standard_analyzer_name() {
        assert_eq!(standard_analyzer().name(), "standard");
    }

    #[test]
    fn simple_analyzer_strips_numbers() {
        let tokens = simple_analyzer().analyze("Hello123World");
        assert_eq!(token_texts(&tokens), vec!["hello", "world"]);
    }

    #[test]
    fn whitespace_analyzer_preserves_everything() {
        let tokens = whitespace_analyzer().analyze("Hello, World!");
        assert_eq!(token_texts(&tokens), vec!["Hello,", "World!"]);
    }

    #[test]
    fn keyword_analyzer_single_token() {
        let tokens = keyword_analyzer().analyze("Hello, World!");
        assert_eq!(tokens.len(), 1);
        assert_eq!(tokens[0].text, "Hello, World!");
    }

    #[test]
    fn stop_analyzer_removes_stop_words() {
        let tokens = stop_analyzer().analyze("The quick brown fox is a test");
        assert_eq!(token_texts(&tokens), vec!["quick", "brown", "fox", "test"]);
    }

    #[test]
    fn language_analyzer_stems() {
        let analyzer = language_analyzer(StemmerAlgorithm::English);
        let tokens = analyzer.analyze("The cats are running quickly");
        assert_eq!(token_texts(&tokens), vec!["cat", "run", "quick"]);
    }

    #[test]
    fn analyzer_preserves_positions() {
        let tokens = stop_analyzer().analyze("the quick brown fox");
        assert_eq!(tokens[0].text, "quick");
        // "quick" is the second word of the input; the first surviving token
        // is expected to still carry position 1.
        assert_eq!(tokens[0].position, 1);
    }

    #[test]
    fn analyzer_empty_input() {
        let tokens = standard_analyzer().analyze("");
        assert!(tokens.is_empty());
    }

    #[test]
    fn registry_has_builtins() {
        let registry = AnalyzerRegistry::new();
        let names = registry.names();
        for builtin in &["standard", "simple", "whitespace", "keyword", "stop"] {
            assert!(names.contains(builtin));
        }
    }

    #[test]
    fn registry_get_standard() {
        let registry = AnalyzerRegistry::new();
        assert_eq!(registry.get("standard").name(), "standard");
    }

    #[test]
    fn registry_fallback_to_standard() {
        let registry = AnalyzerRegistry::new();
        // Unknown names resolve to the standard analyzer.
        assert_eq!(registry.get("nonexistent").name(), "standard");
    }

    #[test]
    fn registry_try_get_returns_none() {
        let registry = AnalyzerRegistry::new();
        assert!(registry.try_get("nonexistent").is_none());
        assert!(registry.try_get("standard").is_some());
    }

    #[test]
    fn registry_custom_analyzer() {
        let custom = Analyzer::new(
            "custom",
            WhitespaceTokenizer,
            vec![Box::new(LowercaseFilter)],
        );
        let mut registry = AnalyzerRegistry::new();
        registry.register(custom);

        let analyzer = registry.get("custom");
        assert_eq!(analyzer.name(), "custom");

        let tokens = analyzer.analyze("Hello World");
        assert_eq!(tokens[0].text, "hello");
    }

    #[test]
    fn analyze_realistic_document() {
        let text = "Elasticsearch is a distributed, RESTful search and \
                    analytics engine. It centrally stores your data for \
                    lightning fast search.";
        let tokens = standard_analyzer().analyze(text);

        assert!(tokens.len() > 10);
        assert!(tokens.iter().all(|t| t.text == t.text.to_lowercase()));

        // Every token's offsets must slice the original text back to the
        // token (modulo case).
        for token in &tokens {
            assert_eq!(
                text[token.offset_from..token.offset_to].to_lowercase(),
                token.text
            );
        }
    }

    #[test]
    fn stop_analyzer_realistic() {
        let text = "The quick brown fox jumps over the lazy dog";
        let tokens = stop_analyzer().analyze(text);
        let texts = token_texts(&tokens);

        assert!(!texts.contains(&"the"));
        assert!(texts.contains(&"quick"));
        assert!(texts.contains(&"over"));
    }

    #[test]
    fn language_analyzer_realistic() {
        let analyzer = language_analyzer(StemmerAlgorithm::English);
        let text = "The users were searching for documents containing these keywords";
        let tokens = analyzer.analyze(text);
        let texts = token_texts(&tokens);

        // Stop words are dropped.
        assert!(!texts.contains(&"the"));
        assert!(!texts.contains(&"these"));

        // Remaining words are reduced to their stems.
        assert!(texts.contains(&"user"));
        assert!(texts.contains(&"search"));
        assert!(texts.contains(&"document"));
        assert!(texts.contains(&"keyword"));
    }
}