use tantivy::tokenizer::{
Language, LowerCaser, NgramTokenizer, RawTokenizer, RemoveLongFilter, SimpleTokenizer, Stemmer,
StopWordFilter, TextAnalyzer, WhitespaceTokenizer,
};
use super::TextIndexError;
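/// Byte-length limit handed to `RemoveLongFilter`; overlong tokens are
/// dropped rather than indexed.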
const MAX_TOKEN_LEN: usize = 255;
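/// Analyzer configurations supported by the text index. Each variant maps to
/// a fixed tantivy tokenizer/filter pipeline and a stable tokenizer name.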
#[derive(Debug, Clone, PartialEq, Eq)]
pub enum Analyzer {
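/// Simple tokenization, lowercasing, English stop-word removal, and English stemming.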
StandardEnglish,
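/// Simple tokenization and lowercasing, with no stop-word removal or stemming.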
Standard,
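/// Splits on whitespace only; case and punctuation are preserved.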
Whitespace,
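/// Emits the entire input as a single token.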
Raw,
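/// Lowercased character n-grams of length `min` through `max` (inclusive).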
NGram { min: usize, max: usize },
}
impl Analyzer {
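/// Stable, unique tokenizer name for this analyzer configuration (see the
/// `tokenizer_names_are_stable_and_distinct` test).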
pub fn tokenizer_name(&self) -> String {
match self {
Analyzer::StandardEnglish => "interstellar_standard_en".to_string(),
Analyzer::Standard => "interstellar_standard".to_string(),
Analyzer::Whitespace => "interstellar_whitespace".to_string(),
Analyzer::Raw => "interstellar_raw".to_string(),
Analyzer::NGram { min, max } => format!("interstellar_ngram_{}_{}", min, max),
}
}
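/// Builds the tantivy `TextAnalyzer` for this variant, validating the NGram
/// bounds before constructing the tokenizer.
///
/// A minimal sketch of how the result might be wired into an index (`index`
/// here is an assumed `tantivy::Index` whose schema text fields reference
/// `tokenizer_name()`):
///
/// ```ignore
/// let analyzer = Analyzer::StandardEnglish;
/// index
///     .tokenizers()
///     .register(&analyzer.tokenizer_name(), analyzer.build()?);
/// ```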
pub fn build(&self) -> Result<TextAnalyzer, TextIndexError> {
Ok(match self {
Analyzer::StandardEnglish => TextAnalyzer::builder(SimpleTokenizer::default())
.filter(RemoveLongFilter::limit(MAX_TOKEN_LEN))
.filter(LowerCaser)
.filter(
StopWordFilter::new(Language::English).expect("English stopwords are bundled"),
)
.filter(Stemmer::new(Language::English))
.build(),
Analyzer::Standard => TextAnalyzer::builder(SimpleTokenizer::default())
.filter(RemoveLongFilter::limit(MAX_TOKEN_LEN))
.filter(LowerCaser)
.build(),
Analyzer::Whitespace => TextAnalyzer::builder(WhitespaceTokenizer::default())
.filter(RemoveLongFilter::limit(MAX_TOKEN_LEN))
.build(),
Analyzer::Raw => TextAnalyzer::builder(RawTokenizer::default()).build(),
Analyzer::NGram { min, max } => {
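// Validate the n-gram bounds up front so callers get a descriptive
// UnsupportedConfig error instead of relying solely on the tokenizer
// constructor failing below.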
if *min == 0 || *max == 0 {
return Err(TextIndexError::UnsupportedConfig(format!(
"NGram analyzer requires min >= 1 and max >= 1 (got min={}, max={})",
min, max
)));
}
if *min > *max {
return Err(TextIndexError::UnsupportedConfig(format!(
"NGram analyzer requires min <= max (got min={}, max={})",
min, max
)));
}
let ngram = NgramTokenizer::new(*min, *max, false).map_err(|e| {
TextIndexError::UnsupportedConfig(format!("invalid NGram parameters: {}", e))
})?;
TextAnalyzer::builder(ngram)
.filter(RemoveLongFilter::limit(MAX_TOKEN_LEN))
.filter(LowerCaser)
.build()
}
})
}
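/// Test-only helper: runs the built analyzer over `text` and collects the
/// emitted token strings.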
#[cfg(test)]
pub fn tokens(&self, text: &str) -> Result<Vec<String>, TextIndexError> {
use tantivy::tokenizer::TokenStream;
let mut analyzer = self.build()?;
let mut stream = analyzer.token_stream(text);
let mut out = Vec::new();
while stream.advance() {
out.push(stream.token().text.clone());
}
Ok(out)
}
}
#[cfg(test)]
mod tests {
use super::*;
#[test]
fn standard_english_lowercases_stems_and_removes_stopwords() {
let toks = Analyzer::StandardEnglish
.tokens("The quick brown foxes are jumping")
.unwrap();
assert_eq!(toks, vec!["quick", "brown", "fox", "jump"]);
}
#[test]
fn standard_lowercases_but_does_not_stem() {
let toks = Analyzer::Standard.tokens("The Quick Brown FOXES").unwrap();
assert_eq!(toks, vec!["the", "quick", "brown", "foxes"]);
}
#[test]
fn whitespace_preserves_case_and_splits_on_whitespace_only() {
let toks = Analyzer::Whitespace
.tokens("Hello, WORLD! quick-brown")
.unwrap();
assert_eq!(toks, vec!["Hello,", "WORLD!", "quick-brown"]);
}
#[test]
fn raw_emits_a_single_token() {
let toks = Analyzer::Raw.tokens("hello world HELLO").unwrap();
assert_eq!(toks, vec!["hello world HELLO"]);
}
#[test]
fn ngram_emits_all_windows() {
let toks = Analyzer::NGram { min: 2, max: 3 }.tokens("abcd").unwrap();
assert!(toks.contains(&"ab".to_string()));
assert!(toks.contains(&"bc".to_string()));
assert!(toks.contains(&"cd".to_string()));
assert!(toks.contains(&"abc".to_string()));
assert!(toks.contains(&"bcd".to_string()));
}
#[test]
fn ngram_zero_min_rejected() {
let result = Analyzer::NGram { min: 0, max: 3 }.build();
assert!(matches!(result, Err(TextIndexError::UnsupportedConfig(_))));
}
#[test]
fn ngram_min_greater_than_max_rejected() {
let result = Analyzer::NGram { min: 5, max: 2 }.build();
assert!(matches!(result, Err(TextIndexError::UnsupportedConfig(_))));
}
#[test]
fn tokenizer_names_are_stable_and_distinct() {
let names = [
Analyzer::StandardEnglish.tokenizer_name(),
Analyzer::Standard.tokenizer_name(),
Analyzer::Whitespace.tokenizer_name(),
Analyzer::Raw.tokenizer_name(),
Analyzer::NGram { min: 2, max: 3 }.tokenizer_name(),
Analyzer::NGram { min: 3, max: 4 }.tokenizer_name(),
];
let unique: std::collections::HashSet<_> = names.iter().collect();
assert_eq!(unique.len(), names.len());
}
#[test]
fn long_tokens_are_filtered() {
let big = "a".repeat(MAX_TOKEN_LEN + 10);
let input = format!("ok {} fine", big);
let toks = Analyzer::Standard.tokens(&input).unwrap();
assert_eq!(toks, vec!["ok", "fine"]);
}
}