Skip to main content

luci/analysis/
mod.rs

1// Obsidian [[wikilinks]] in doc comments are intentional — they link to
2// design and reference docs in docs/. Rustdoc doesn't understand them.
3#![allow(rustdoc::broken_intra_doc_links)]
4
5//! `luci-analysis` — text analysis pipeline for Luci.
6//!
//! Transforms raw text into indexed terms via a three-stage pipeline:
//!
//! ```text
//! Raw Text → Char Filters → Tokenizer → Token Filters → Indexed Terms
//! ```
//!
//! Provides the `CharFilter`, `Tokenizer`, and `TokenFilter` traits with
//! built-in implementations matching Elasticsearch's analyzer model:
//! `standard`, `simple`, `whitespace`, `keyword`, `stop`, and `language`
//! analyzers.
16//!
17//! See [[analyzers]] for the full specification.
18
19mod char_filter;
20pub mod config;
21mod filter;
22mod token;
23mod tokenizer;
24
25pub use char_filter::{
26    CharFilter, HtmlStripCharFilter, MappingCharFilter, OffsetCorrection, PatternReplaceCharFilter,
27    correct_offset,
28};
29pub use filter::{
30    AsciiFoldingFilter, EdgeNGramTokenFilter, LowercaseFilter, NGramTokenFilter, ShingleFilter,
31    StemmerAlgorithm, StemmerFilter, StopFilter, SynonymFilter, TokenFilter,
32};
33pub use token::Token;
34pub use tokenizer::{
35    EdgeNGramTokenizer, KeywordTokenizer, LetterTokenizer, NGramTokenizer, PathHierarchyTokenizer,
36    PatternTokenizer, StandardTokenizer, Tokenizer, WhitespaceTokenizer,
37};
38
39use std::collections::HashMap;
40
41/// A complete text analysis pipeline: char filters + tokenizer + token filters.
42///
43/// Combines zero or more [`CharFilter`]s, a single [`Tokenizer`], and zero or
44/// more [`TokenFilter`]s. Character filters preprocess the raw text, the
45/// tokenizer breaks it into tokens, then each token filter transforms the
46/// stream in order.
47///
48/// See [[analyzers#Pipeline Stages]].
49pub struct Analyzer {
50    name: String,
51    char_filters: Vec<Box<dyn CharFilter>>,
52    tokenizer: Box<dyn Tokenizer>,
53    filters: Vec<Box<dyn TokenFilter>>,
54}
55
56impl Analyzer {
57    /// Create a new analyzer with the given name, tokenizer, and filters.
58    pub fn new(
59        name: impl Into<String>,
60        tokenizer: impl Tokenizer + 'static,
61        filters: Vec<Box<dyn TokenFilter>>,
62    ) -> Self {
63        Self {
64            name: name.into(),
65            char_filters: Vec::new(),
66            tokenizer: Box::new(tokenizer),
67            filters,
68        }
69    }
70
71    /// Create a new analyzer with char filters, tokenizer, and token filters.
72    pub fn with_char_filters(
73        name: impl Into<String>,
74        char_filters: Vec<Box<dyn CharFilter>>,
75        tokenizer: impl Tokenizer + 'static,
76        filters: Vec<Box<dyn TokenFilter>>,
77    ) -> Self {
78        Self {
79            name: name.into(),
80            char_filters,
81            tokenizer: Box::new(tokenizer),
82            filters,
83        }
84    }
85
86    /// Create from already-boxed components (used by config builder).
87    pub fn from_boxed(
88        name: impl Into<String>,
89        char_filters: Vec<Box<dyn CharFilter>>,
90        tokenizer: Box<dyn Tokenizer>,
91        filters: Vec<Box<dyn TokenFilter>>,
92    ) -> Self {
93        Self {
94            name: name.into(),
95            char_filters,
96            tokenizer,
97            filters,
98        }
99    }
100
101    /// Run the full analysis pipeline on the input text.
102    ///
103    /// Applies char filters, tokenizes, corrects offsets, then applies
104    /// token filters in order.
105    pub fn analyze(&self, text: &str) -> Vec<Token> {
106        // Phase 1: apply char filters
107        let (filtered_text, corrections) = self.apply_char_filters(text);
108        let tokenize_input = if corrections.is_empty() {
109            text
110        } else {
111            &filtered_text
112        };
113
114        // Phase 2: tokenize
115        let mut tokens = Vec::new();
116        self.tokenizer.tokenize(tokenize_input, &mut tokens);
117
118        // Phase 3: correct offsets back to original text
119        if !corrections.is_empty() {
120            for token in &mut tokens {
121                token.offset_from = correct_offset(token.offset_from, &corrections);
122                token.offset_to = correct_offset(token.offset_to, &corrections);
123            }
124        }
125
126        // Phase 4: token filters
127        for filter in &self.filters {
128            filter.apply(&mut tokens);
129        }
130        tokens
131    }
132
133    /// The analyzer's name (e.g., `"standard"`, `"simple"`).
134    pub fn name(&self) -> &str {
135        &self.name
136    }
137
138    /// Apply all char filters in sequence, accumulating corrections.
139    fn apply_char_filters(&self, text: &str) -> (String, Vec<OffsetCorrection>) {
140        if self.char_filters.is_empty() {
141            return (String::new(), Vec::new());
142        }
143
144        let mut current = text.to_string();
145        let mut all_corrections = Vec::new();
146
147        for cf in &self.char_filters {
148            let (filtered, corrections) = cf.filter(&current);
149            all_corrections.extend(corrections);
150            current = filtered;
151        }
152
153        (current, all_corrections)
154    }
155}
156
157/// Registry of named analyzers with fallback resolution.
158///
159/// Implements the analyzer resolution chain from [[analyzers#Analyzer Resolution]]:
160/// 1. Look up by exact name
161/// 2. Fall back to the `standard` analyzer
162///
163/// All built-in analyzers are registered on construction.
164pub struct AnalyzerRegistry {
165    analyzers: HashMap<String, Analyzer>,
166}
167
168impl std::fmt::Debug for AnalyzerRegistry {
169    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
170        f.debug_struct("AnalyzerRegistry")
171            .field("analyzers", &self.analyzers.keys().collect::<Vec<_>>())
172            .finish()
173    }
174}
175
176impl AnalyzerRegistry {
177    /// Create a new registry with all built-in analyzers pre-registered.
178    pub fn new() -> Self {
179        let mut registry = Self {
180            analyzers: HashMap::new(),
181        };
182        registry.register(standard_analyzer());
183        registry.register(simple_analyzer());
184        registry.register(whitespace_analyzer());
185        registry.register(keyword_analyzer());
186        registry.register(stop_analyzer());
187        registry
188    }
189
190    /// Register a custom analyzer. Overwrites any existing analyzer with
191    /// the same name.
192    pub fn register(&mut self, analyzer: Analyzer) {
193        self.analyzers.insert(analyzer.name.clone(), analyzer);
194    }
195
196    /// Look up an analyzer by name, falling back to `standard`.
197    pub fn get(&self, name: &str) -> &Analyzer {
198        self.analyzers
199            .get(name)
200            .unwrap_or_else(|| self.analyzers.get("standard").unwrap())
201    }
202
203    /// Look up an analyzer by name. Returns `None` if not found.
204    pub fn try_get(&self, name: &str) -> Option<&Analyzer> {
205        self.analyzers.get(name)
206    }
207
208    /// List all registered analyzer names.
209    pub fn names(&self) -> Vec<&str> {
210        self.analyzers.keys().map(String::as_str).collect()
211    }
212}
213
214impl Default for AnalyzerRegistry {
215    fn default() -> Self {
216        Self::new()
217    }
218}
219
220// --- Built-in analyzer constructors ---
221
222/// `standard` analyzer: UAX#29 tokenizer + lowercase filter.
223///
224/// The default analyzer for `text` fields.
225///
226/// See [[analyzers#Built-in Analyzers]].
227pub fn standard_analyzer() -> Analyzer {
228    Analyzer::new(
229        "standard",
230        StandardTokenizer,
231        vec![Box::new(LowercaseFilter)],
232    )
233}
234
235/// `simple` analyzer: letter tokenizer + lowercase filter.
236///
237/// Splits on non-letter characters and lowercases.
238///
239/// See [[analyzers#Built-in Analyzers]].
240pub fn simple_analyzer() -> Analyzer {
241    Analyzer::new("simple", LetterTokenizer, vec![Box::new(LowercaseFilter)])
242}
243
244/// `whitespace` analyzer: whitespace tokenizer, no filters.
245///
246/// Splits on whitespace only, preserving case and punctuation.
247///
248/// See [[analyzers#Built-in Analyzers]].
249pub fn whitespace_analyzer() -> Analyzer {
250    Analyzer::new("whitespace", WhitespaceTokenizer, vec![])
251}
252
253/// `keyword` analyzer: keyword tokenizer, no filters.
254///
255/// Emits the entire input as a single token. Used for exact-match fields.
256///
257/// See [[analyzers#Built-in Analyzers]].
258pub fn keyword_analyzer() -> Analyzer {
259    Analyzer::new("keyword", KeywordTokenizer, vec![])
260}
261
262/// `stop` analyzer: UAX#29 tokenizer + lowercase + English stop words.
263///
264/// Like `standard` but removes common English stop words.
265///
266/// See [[analyzers#Built-in Analyzers]].
267pub fn stop_analyzer() -> Analyzer {
268    Analyzer::new(
269        "stop",
270        StandardTokenizer,
271        vec![Box::new(LowercaseFilter), Box::new(StopFilter::english())],
272    )
273}
274
275/// `language` analyzer: UAX#29 tokenizer + lowercase + stop words + stemmer.
276///
277/// The most aggressive built-in analyzer — normalizes, removes stop words,
278/// and stems. Best recall for free-text search.
279///
280/// See [[analyzers#Built-in Analyzers]].
281pub fn language_analyzer(algorithm: StemmerAlgorithm) -> Analyzer {
282    Analyzer::new(
283        "language",
284        StandardTokenizer,
285        vec![
286            Box::new(LowercaseFilter),
287            Box::new(StopFilter::english()),
288            Box::new(StemmerFilter::new(algorithm)),
289        ],
290    )
291}
292
#[cfg(test)]
mod tests {
    use super::*;

    /// Collects the term text of every token, in stream order.
    fn texts(tokens: &[Token]) -> Vec<&str> {
        tokens.iter().map(|token| token.text.as_str()).collect()
    }

    // --- Analyzer ---

    #[test]
    fn standard_analyzer_basic() {
        let tokens = standard_analyzer().analyze("The Quick Brown Fox");
        assert_eq!(texts(&tokens), vec!["the", "quick", "brown", "fox"]);
    }

    #[test]
    fn standard_analyzer_name() {
        assert_eq!(standard_analyzer().name(), "standard");
    }

    #[test]
    fn simple_analyzer_strips_numbers() {
        let tokens = simple_analyzer().analyze("Hello123World");
        assert_eq!(texts(&tokens), vec!["hello", "world"]);
    }

    #[test]
    fn whitespace_analyzer_preserves_everything() {
        let tokens = whitespace_analyzer().analyze("Hello, World!");
        assert_eq!(texts(&tokens), vec!["Hello,", "World!"]);
    }

    #[test]
    fn keyword_analyzer_single_token() {
        let tokens = keyword_analyzer().analyze("Hello, World!");
        assert_eq!(tokens.len(), 1);
        assert_eq!(tokens[0].text, "Hello, World!");
    }

    #[test]
    fn stop_analyzer_removes_stop_words() {
        let tokens = stop_analyzer().analyze("The quick brown fox is a test");
        assert_eq!(texts(&tokens), vec!["quick", "brown", "fox", "test"]);
    }

    #[test]
    fn language_analyzer_stems() {
        let tokens =
            language_analyzer(StemmerAlgorithm::English).analyze("The cats are running quickly");
        assert_eq!(texts(&tokens), vec!["cat", "run", "quick"]);
    }

    #[test]
    fn analyzer_preserves_positions() {
        let tokens = stop_analyzer().analyze("the quick brown fox");
        // Dropping "the" must not renumber the tokens that follow it.
        let first = &tokens[0];
        assert_eq!(first.text, "quick");
        assert_eq!(first.position, 1); // position 0 belonged to "the"
    }

    #[test]
    fn analyzer_empty_input() {
        assert!(standard_analyzer().analyze("").is_empty());
    }

    // --- AnalyzerRegistry ---

    #[test]
    fn registry_has_builtins() {
        let registry = AnalyzerRegistry::new();
        let names = registry.names();
        for builtin in ["standard", "simple", "whitespace", "keyword", "stop"] {
            assert!(names.contains(&builtin));
        }
    }

    #[test]
    fn registry_get_standard() {
        let registry = AnalyzerRegistry::new();
        assert_eq!(registry.get("standard").name(), "standard");
    }

    #[test]
    fn registry_fallback_to_standard() {
        let registry = AnalyzerRegistry::new();
        assert_eq!(registry.get("nonexistent").name(), "standard");
    }

    #[test]
    fn registry_try_get_returns_none() {
        let registry = AnalyzerRegistry::new();
        assert!(registry.try_get("nonexistent").is_none());
        assert!(registry.try_get("standard").is_some());
    }

    #[test]
    fn registry_custom_analyzer() {
        let custom = Analyzer::new(
            "custom",
            WhitespaceTokenizer,
            vec![Box::new(LowercaseFilter)],
        );
        let mut registry = AnalyzerRegistry::new();
        registry.register(custom);

        let analyzer = registry.get("custom");
        assert_eq!(analyzer.name(), "custom");
        let tokens = analyzer.analyze("Hello World");
        assert_eq!(tokens[0].text, "hello");
    }

    // --- End-to-end ---

    #[test]
    fn analyze_realistic_document() {
        let text = "Elasticsearch is a distributed, RESTful search and \
                    analytics engine. It centrally stores your data for \
                    lightning fast search.";
        let tokens = standard_analyzer().analyze(text);

        // Expect more than ten terms, every one of them already lowercased.
        assert!(tokens.len() > 10);
        assert!(tokens.iter().all(|t| t.text == t.text.to_lowercase()));

        // Each token's offsets must slice its own source span out of `text`.
        for token in &tokens {
            let span = &text[token.offset_from..token.offset_to];
            assert_eq!(span.to_lowercase(), token.text);
        }
    }

    #[test]
    fn stop_analyzer_realistic() {
        let tokens = stop_analyzer().analyze("The quick brown fox jumps over the lazy dog");
        let terms = texts(&tokens);

        // Both occurrences of "the" are dropped; "over" is not in the default
        // stop list and must survive.
        assert!(!terms.contains(&"the"));
        assert!(terms.contains(&"quick"));
        assert!(terms.contains(&"over"));
    }

    #[test]
    fn language_analyzer_realistic() {
        let tokens = language_analyzer(StemmerAlgorithm::English)
            .analyze("The users were searching for documents containing these keywords");
        let terms = texts(&tokens);

        // Stop words are gone...
        assert!(!terms.contains(&"the"));
        assert!(!terms.contains(&"these"));
        // ...and the remaining words are reduced to their stems.
        assert!(terms.contains(&"user")); // "users" → "user"
        assert!(terms.contains(&"search")); // "searching" → "search"
        assert!(terms.contains(&"document")); // "documents" → "document"
        assert!(terms.contains(&"keyword")); // "keywords" → "keyword"
    }
}