1#![allow(rustdoc::broken_intra_doc_links)]
4
5mod char_filter;
20pub mod config;
21mod filter;
22mod token;
23mod tokenizer;
24
25pub use char_filter::{
26 CharFilter, HtmlStripCharFilter, MappingCharFilter, OffsetCorrection, PatternReplaceCharFilter,
27 correct_offset,
28};
29pub use filter::{
30 AsciiFoldingFilter, EdgeNGramTokenFilter, LowercaseFilter, NGramTokenFilter, ShingleFilter,
31 StemmerAlgorithm, StemmerFilter, StopFilter, SynonymFilter, TokenFilter,
32};
33pub use token::Token;
34pub use tokenizer::{
35 EdgeNGramTokenizer, KeywordTokenizer, LetterTokenizer, NGramTokenizer, PathHierarchyTokenizer,
36 PatternTokenizer, StandardTokenizer, Tokenizer, WhitespaceTokenizer,
37};
38
39use std::collections::HashMap;
40
41pub struct Analyzer {
50 name: String,
51 char_filters: Vec<Box<dyn CharFilter>>,
52 tokenizer: Box<dyn Tokenizer>,
53 filters: Vec<Box<dyn TokenFilter>>,
54}
55
56impl Analyzer {
57 pub fn new(
59 name: impl Into<String>,
60 tokenizer: impl Tokenizer + 'static,
61 filters: Vec<Box<dyn TokenFilter>>,
62 ) -> Self {
63 Self {
64 name: name.into(),
65 char_filters: Vec::new(),
66 tokenizer: Box::new(tokenizer),
67 filters,
68 }
69 }
70
71 pub fn with_char_filters(
73 name: impl Into<String>,
74 char_filters: Vec<Box<dyn CharFilter>>,
75 tokenizer: impl Tokenizer + 'static,
76 filters: Vec<Box<dyn TokenFilter>>,
77 ) -> Self {
78 Self {
79 name: name.into(),
80 char_filters,
81 tokenizer: Box::new(tokenizer),
82 filters,
83 }
84 }
85
86 pub fn from_boxed(
88 name: impl Into<String>,
89 char_filters: Vec<Box<dyn CharFilter>>,
90 tokenizer: Box<dyn Tokenizer>,
91 filters: Vec<Box<dyn TokenFilter>>,
92 ) -> Self {
93 Self {
94 name: name.into(),
95 char_filters,
96 tokenizer,
97 filters,
98 }
99 }
100
101 pub fn analyze(&self, text: &str) -> Vec<Token> {
106 let (filtered_text, corrections) = self.apply_char_filters(text);
108 let tokenize_input = if corrections.is_empty() {
109 text
110 } else {
111 &filtered_text
112 };
113
114 let mut tokens = Vec::new();
116 self.tokenizer.tokenize(tokenize_input, &mut tokens);
117
118 if !corrections.is_empty() {
120 for token in &mut tokens {
121 token.offset_from = correct_offset(token.offset_from, &corrections);
122 token.offset_to = correct_offset(token.offset_to, &corrections);
123 }
124 }
125
126 for filter in &self.filters {
128 filter.apply(&mut tokens);
129 }
130 tokens
131 }
132
133 pub fn name(&self) -> &str {
135 &self.name
136 }
137
138 fn apply_char_filters(&self, text: &str) -> (String, Vec<OffsetCorrection>) {
140 if self.char_filters.is_empty() {
141 return (String::new(), Vec::new());
142 }
143
144 let mut current = text.to_string();
145 let mut all_corrections = Vec::new();
146
147 for cf in &self.char_filters {
148 let (filtered, corrections) = cf.filter(¤t);
149 all_corrections.extend(corrections);
150 current = filtered;
151 }
152
153 (current, all_corrections)
154 }
155}
156
157pub struct AnalyzerRegistry {
165 analyzers: HashMap<String, Analyzer>,
166}
167
168impl std::fmt::Debug for AnalyzerRegistry {
169 fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
170 f.debug_struct("AnalyzerRegistry")
171 .field("analyzers", &self.analyzers.keys().collect::<Vec<_>>())
172 .finish()
173 }
174}
175
176impl AnalyzerRegistry {
177 pub fn new() -> Self {
179 let mut registry = Self {
180 analyzers: HashMap::new(),
181 };
182 registry.register(standard_analyzer());
183 registry.register(simple_analyzer());
184 registry.register(whitespace_analyzer());
185 registry.register(keyword_analyzer());
186 registry.register(stop_analyzer());
187 registry
188 }
189
190 pub fn register(&mut self, analyzer: Analyzer) {
193 self.analyzers.insert(analyzer.name.clone(), analyzer);
194 }
195
196 pub fn get(&self, name: &str) -> &Analyzer {
198 self.analyzers
199 .get(name)
200 .unwrap_or_else(|| self.analyzers.get("standard").unwrap())
201 }
202
203 pub fn try_get(&self, name: &str) -> Option<&Analyzer> {
205 self.analyzers.get(name)
206 }
207
208 pub fn names(&self) -> Vec<&str> {
210 self.analyzers.keys().map(String::as_str).collect()
211 }
212}
213
214impl Default for AnalyzerRegistry {
215 fn default() -> Self {
216 Self::new()
217 }
218}
219
220pub fn standard_analyzer() -> Analyzer {
228 Analyzer::new(
229 "standard",
230 StandardTokenizer,
231 vec![Box::new(LowercaseFilter)],
232 )
233}
234
235pub fn simple_analyzer() -> Analyzer {
241 Analyzer::new("simple", LetterTokenizer, vec![Box::new(LowercaseFilter)])
242}
243
244pub fn whitespace_analyzer() -> Analyzer {
250 Analyzer::new("whitespace", WhitespaceTokenizer, vec![])
251}
252
253pub fn keyword_analyzer() -> Analyzer {
259 Analyzer::new("keyword", KeywordTokenizer, vec![])
260}
261
262pub fn stop_analyzer() -> Analyzer {
268 Analyzer::new(
269 "stop",
270 StandardTokenizer,
271 vec![Box::new(LowercaseFilter), Box::new(StopFilter::english())],
272 )
273}
274
275pub fn language_analyzer(algorithm: StemmerAlgorithm) -> Analyzer {
282 Analyzer::new(
283 "language",
284 StandardTokenizer,
285 vec![
286 Box::new(LowercaseFilter),
287 Box::new(StopFilter::english()),
288 Box::new(StemmerFilter::new(algorithm)),
289 ],
290 )
291}
292
#[cfg(test)]
mod tests {
    use super::*;

    /// Collects the text of every token, in order.
    fn texts(tokens: &[Token]) -> Vec<&str> {
        tokens.iter().map(|token| token.text.as_str()).collect()
    }

    // --- Built-in analyzers -------------------------------------------------

    #[test]
    fn standard_analyzer_basic() {
        let tokens = standard_analyzer().analyze("The Quick Brown Fox");
        assert_eq!(texts(&tokens), vec!["the", "quick", "brown", "fox"]);
    }

    #[test]
    fn standard_analyzer_name() {
        assert_eq!(standard_analyzer().name(), "standard");
    }

    #[test]
    fn simple_analyzer_strips_numbers() {
        let tokens = simple_analyzer().analyze("Hello123World");
        assert_eq!(texts(&tokens), vec!["hello", "world"]);
    }

    #[test]
    fn whitespace_analyzer_preserves_everything() {
        let tokens = whitespace_analyzer().analyze("Hello, World!");
        assert_eq!(texts(&tokens), vec!["Hello,", "World!"]);
    }

    #[test]
    fn keyword_analyzer_single_token() {
        let tokens = keyword_analyzer().analyze("Hello, World!");
        assert_eq!(tokens.len(), 1);
        assert_eq!(tokens[0].text, "Hello, World!");
    }

    #[test]
    fn stop_analyzer_removes_stop_words() {
        let tokens = stop_analyzer().analyze("The quick brown fox is a test");
        assert_eq!(texts(&tokens), vec!["quick", "brown", "fox", "test"]);
    }

    #[test]
    fn language_analyzer_stems() {
        let tokens =
            language_analyzer(StemmerAlgorithm::English).analyze("The cats are running quickly");
        assert_eq!(texts(&tokens), vec!["cat", "run", "quick"]);
    }

    #[test]
    fn analyzer_preserves_positions() {
        let tokens = stop_analyzer().analyze("the quick brown fox");
        let first = &tokens[0];
        assert_eq!(first.text, "quick");
        // "the" is dropped, yet "quick" still reports position 1.
        assert_eq!(first.position, 1);
    }

    #[test]
    fn analyzer_empty_input() {
        assert!(standard_analyzer().analyze("").is_empty());
    }

    // --- Registry -----------------------------------------------------------

    #[test]
    fn registry_has_builtins() {
        let registry = AnalyzerRegistry::new();
        let names = registry.names();
        assert!(names.contains(&"standard"));
        assert!(names.contains(&"simple"));
        assert!(names.contains(&"whitespace"));
        assert!(names.contains(&"keyword"));
        assert!(names.contains(&"stop"));
    }

    #[test]
    fn registry_get_standard() {
        let registry = AnalyzerRegistry::new();
        assert_eq!(registry.get("standard").name(), "standard");
    }

    #[test]
    fn registry_fallback_to_standard() {
        let registry = AnalyzerRegistry::new();
        assert_eq!(registry.get("nonexistent").name(), "standard");
    }

    #[test]
    fn registry_try_get_returns_none() {
        let registry = AnalyzerRegistry::new();
        assert!(registry.try_get("nonexistent").is_none());
        assert!(registry.try_get("standard").is_some());
    }

    #[test]
    fn registry_custom_analyzer() {
        let custom = Analyzer::new(
            "custom",
            WhitespaceTokenizer,
            vec![Box::new(LowercaseFilter)],
        );
        let mut registry = AnalyzerRegistry::new();
        registry.register(custom);

        let analyzer = registry.get("custom");
        assert_eq!(analyzer.name(), "custom");
        let tokens = analyzer.analyze("Hello World");
        assert_eq!(tokens[0].text, "hello");
    }

    // --- Realistic inputs ---------------------------------------------------

    #[test]
    fn analyze_realistic_document() {
        let text = "Elasticsearch is a distributed, RESTful search and \
                    analytics engine. It centrally stores your data for \
                    lightning fast search.";
        let tokens = standard_analyzer().analyze(text);

        assert!(tokens.len() > 10);
        assert!(tokens.iter().all(|t| t.text == t.text.to_lowercase()));

        // Each token's offsets must point back at its source span in `text`.
        for token in &tokens {
            let source_span = &text[token.offset_from..token.offset_to];
            assert_eq!(source_span.to_lowercase(), token.text);
        }
    }

    #[test]
    fn stop_analyzer_realistic() {
        let tokens = stop_analyzer().analyze("The quick brown fox jumps over the lazy dog");
        let texts = texts(&tokens);

        assert!(!texts.contains(&"the"));
        assert!(texts.contains(&"quick"));
        assert!(texts.contains(&"over"));
    }

    #[test]
    fn language_analyzer_realistic() {
        let tokens = language_analyzer(StemmerAlgorithm::English)
            .analyze("The users were searching for documents containing these keywords");
        let texts = texts(&tokens);

        assert!(!texts.contains(&"the"));
        assert!(!texts.contains(&"these"));
        assert!(texts.contains(&"user"));
        assert!(texts.contains(&"search"));
        assert!(texts.contains(&"document"));
        assert!(texts.contains(&"keyword"));
    }
}