use serde_json::Value;
use std::collections::HashMap;
use crate::analysis::tokenizer::TokenChar;
use crate::analysis::{
Analyzer, AnalyzerRegistry, AsciiFoldingFilter, CharFilter, EdgeNGramTokenFilter,
EdgeNGramTokenizer, HtmlStripCharFilter, KeywordTokenizer, LetterTokenizer, LowercaseFilter,
MappingCharFilter, NGramTokenFilter, NGramTokenizer, PathHierarchyTokenizer,
PatternReplaceCharFilter, PatternTokenizer, ShingleFilter, StandardTokenizer, StemmerFilter,
StopFilter, SynonymFilter, TokenFilter, Tokenizer, WhitespaceTokenizer,
};
/// Raw analysis settings, grouped by component kind.
///
/// Each map is keyed by the component's configured name and stores that
/// component's JSON definition as-is. The definitions are interpreted only
/// when analyzers are built from them (see `build_registry`).
#[derive(Clone, Debug, Default)]
pub struct AnalysisConfig {
/// Character filter definitions (the `"char_filter"` JSON section).
pub char_filters: HashMap<String, Value>,
/// Tokenizer definitions (the `"tokenizer"` JSON section).
pub tokenizers: HashMap<String, Value>,
/// Token filter definitions (the `"filter"` JSON section).
pub filters: HashMap<String, Value>,
/// Analyzer definitions (the `"analyzer"` JSON section).
pub analyzers: HashMap<String, Value>,
}
impl AnalysisConfig {
pub fn from_json(analysis: &Value) -> Result<Self, String> {
let obj = analysis.as_object().ok_or("analysis must be an object")?;
let parse_section = |key: &str| -> HashMap<String, Value> {
obj.get(key)
.and_then(|v| v.as_object())
.map(|m| m.iter().map(|(k, v)| (k.clone(), v.clone())).collect())
.unwrap_or_default()
};
Ok(Self {
char_filters: parse_section("char_filter"),
tokenizers: parse_section("tokenizer"),
filters: parse_section("filter"),
analyzers: parse_section("analyzer"),
})
}
/// Serialises the configuration into a JSON object.
///
/// Each non-empty map is written under its section key (`"char_filter"`,
/// `"tokenizer"`, `"filter"`, `"analyzer"`); empty maps are omitted, so a
/// default configuration serialises to `{}`.
pub fn to_json(&self) -> Value {
    let sections = [
        ("char_filter", &self.char_filters),
        ("tokenizer", &self.tokenizers),
        ("filter", &self.filters),
        ("analyzer", &self.analyzers),
    ];
    let mut root = serde_json::Map::new();
    for (key, entries) in sections {
        if entries.is_empty() {
            continue;
        }
        let section: serde_json::Map<String, Value> = entries
            .iter()
            .map(|(name, definition)| (name.clone(), definition.clone()))
            .collect();
        root.insert(key.to_string(), Value::Object(section));
    }
    Value::Object(root)
}
/// Builds an [`AnalyzerRegistry`] from the configured analyzers.
///
/// Starts from `AnalyzerRegistry::new()` and registers one analyzer for each
/// entry of `analyzers`.
///
/// # Errors
///
/// Stops at, and returns, the first error produced while building an analyzer.
pub fn build_registry(&self) -> Result<AnalyzerRegistry, String> {
    let mut registry = AnalyzerRegistry::new();
    self.analyzers.iter().try_for_each(|(name, definition)| {
        let built = self.build_analyzer(name, definition)?;
        registry.register(built);
        Ok::<(), String>(())
    })?;
    Ok(registry)
}
fn build_analyzer(&self, name: &str, config: &Value) -> Result<Analyzer, String> {
let obj = config
.as_object()
.ok_or_else(|| format!("analyzer '{name}' must be an object"))?;
let tokenizer_name = obj
.get("tokenizer")
.and_then(|v| v.as_str())
.ok_or_else(|| format!("analyzer '{name}' requires a 'tokenizer' field"))?;
let tokenizer = self.build_tokenizer(tokenizer_name)?;
let char_filter_names: Vec<&str> = obj
.get("char_filter")
.and_then(|v| v.as_array())
.map(|arr| arr.iter().filter_map(|v| v.as_str()).collect())
.unwrap_or_default();
let mut char_filters: Vec<Box<dyn CharFilter>> = Vec::new();
for cf_name in &char_filter_names {
char_filters.push(self.build_char_filter(cf_name)?);
}
let filter_names: Vec<&str> = obj
.get("filter")
.and_then(|v| v.as_array())
.map(|arr| arr.iter().filter_map(|v| v.as_str()).collect())
.unwrap_or_default();
let mut filters: Vec<Box<dyn TokenFilter>> = Vec::new();
for f_name in &filter_names {
filters.push(self.build_token_filter(f_name)?);
}
Ok(Analyzer::from_boxed(name, char_filters, tokenizer, filters))
}
/// Resolves a tokenizer by name.
///
/// A definition registered under `name` in `tokenizers` takes precedence;
/// otherwise `name` must be one of the built-in tokenizers `standard`,
/// `whitespace`, `letter` or `keyword`.
///
/// # Errors
///
/// Returns an error for an unknown name, or when a custom definition fails to
/// build.
fn build_tokenizer(&self, name: &str) -> Result<Box<dyn Tokenizer>, String> {
    match self.tokenizers.get(name) {
        Some(definition) => self.build_custom_tokenizer(name, definition),
        None => {
            let builtin: Box<dyn Tokenizer> = match name {
                "standard" => Box::new(StandardTokenizer),
                "whitespace" => Box::new(WhitespaceTokenizer),
                "letter" => Box::new(LetterTokenizer),
                "keyword" => Box::new(KeywordTokenizer),
                _ => return Err(format!("unknown tokenizer: '{name}'")),
            };
            Ok(builtin)
        }
    }
}
fn build_custom_tokenizer(
&self,
name: &str,
config: &Value,
) -> Result<Box<dyn Tokenizer>, String> {
let obj = config
.as_object()
.ok_or_else(|| format!("tokenizer '{name}' must be an object"))?;
let typ = obj.get("type").and_then(|v| v.as_str()).unwrap_or(name);
match typ {
"standard" => Ok(Box::new(StandardTokenizer)),
"whitespace" => Ok(Box::new(WhitespaceTokenizer)),
"letter" => Ok(Box::new(LetterTokenizer)),
"keyword" => Ok(Box::new(KeywordTokenizer)),
"ngram" => {
let min_gram = obj.get("min_gram").and_then(|v| v.as_u64()).unwrap_or(1) as usize;
let max_gram = obj.get("max_gram").and_then(|v| v.as_u64()).unwrap_or(2) as usize;
if min_gram > max_gram {
return Err(format!(
"tokenizer '{name}': min_gram ({min_gram}) > max_gram ({max_gram})"
));
}
let token_chars = parse_token_chars(obj)?;
Ok(Box::new(NGramTokenizer::new(
min_gram,
max_gram,
token_chars,
)))
}
"edge_ngram" => {
let min_gram = obj.get("min_gram").and_then(|v| v.as_u64()).unwrap_or(1) as usize;
let max_gram = obj.get("max_gram").and_then(|v| v.as_u64()).unwrap_or(2) as usize;
if min_gram > max_gram {
return Err(format!(
"tokenizer '{name}': min_gram ({min_gram}) > max_gram ({max_gram})"
));
}
let token_chars = parse_token_chars(obj)?;
Ok(Box::new(EdgeNGramTokenizer::new(
min_gram,
max_gram,
token_chars,
)))
}
"pattern" => {
let pattern = obj
.get("pattern")
.and_then(|v| v.as_str())
.unwrap_or(r"\W+");
let tok = PatternTokenizer::new(pattern)
.map_err(|e| format!("tokenizer '{name}': invalid pattern: {e}"))?;
Ok(Box::new(tok))
}
"path_hierarchy" => {
let separator = obj
.get("separator")
.and_then(|v| v.as_str())
.and_then(|s| s.chars().next())
.unwrap_or('/');
let replacement = obj
.get("replacement")
.and_then(|v| v.as_str())
.and_then(|s| s.chars().next());
Ok(Box::new(PathHierarchyTokenizer::new(
separator,
replacement,
)))
}
_ => Err(format!("unknown tokenizer type: '{typ}'")),
}
}
/// Resolves a character filter by name.
///
/// A definition registered under `name` in `char_filters` takes precedence;
/// otherwise the only built-in name is `html_strip`.
///
/// # Errors
///
/// Returns an error for an unknown name, or when a custom definition fails to
/// build.
fn build_char_filter(&self, name: &str) -> Result<Box<dyn CharFilter>, String> {
    if let Some(definition) = self.char_filters.get(name) {
        self.build_custom_char_filter(name, definition)
    } else if name == "html_strip" {
        Ok(Box::new(HtmlStripCharFilter))
    } else {
        Err(format!("unknown char_filter: '{name}'"))
    }
}
fn build_custom_char_filter(
&self,
name: &str,
config: &Value,
) -> Result<Box<dyn CharFilter>, String> {
let obj = config
.as_object()
.ok_or_else(|| format!("char_filter '{name}' must be an object"))?;
let typ = obj.get("type").and_then(|v| v.as_str()).unwrap_or(name);
match typ {
"html_strip" => Ok(Box::new(HtmlStripCharFilter)),
"mapping" => {
let mappings = obj
.get("mappings")
.and_then(|v| v.as_array())
.map(|arr| {
arr.iter()
.filter_map(|v| v.as_str())
.filter_map(|s| {
let parts: Vec<&str> = s.splitn(2, "=>").collect();
if parts.len() == 2 {
Some((parts[0].trim().to_string(), parts[1].trim().to_string()))
} else {
None
}
})
.collect()
})
.unwrap_or_default();
Ok(Box::new(MappingCharFilter::new(mappings)))
}
"pattern_replace" => {
let pattern = obj
.get("pattern")
.and_then(|v| v.as_str())
.ok_or_else(|| format!("char_filter '{name}': 'pattern' is required"))?;
let replacement = obj
.get("replacement")
.and_then(|v| v.as_str())
.unwrap_or("");
let cf = PatternReplaceCharFilter::new(pattern, replacement)
.map_err(|e| format!("char_filter '{name}': invalid pattern: {e}"))?;
Ok(Box::new(cf))
}
_ => Err(format!("unknown char_filter type: '{typ}'")),
}
}
/// Resolves a token filter by name.
///
/// A definition registered under `name` in `filters` takes precedence;
/// otherwise `name` must be one of the built-ins `lowercase`, `stop`
/// (built with `StopFilter::english()`) or `asciifolding` (built with
/// `AsciiFoldingFilter::new(false)`).
///
/// # Errors
///
/// Returns an error for an unknown name, or when a custom definition fails to
/// build.
fn build_token_filter(&self, name: &str) -> Result<Box<dyn TokenFilter>, String> {
    match (self.filters.get(name), name) {
        (Some(definition), _) => self.build_custom_token_filter(name, definition),
        (None, "lowercase") => Ok(Box::new(LowercaseFilter)),
        (None, "stop") => Ok(Box::new(StopFilter::english())),
        (None, "asciifolding") => Ok(Box::new(AsciiFoldingFilter::new(false))),
        (None, _) => Err(format!("unknown filter: '{name}'")),
    }
}
fn build_custom_token_filter(
&self,
name: &str,
config: &Value,
) -> Result<Box<dyn TokenFilter>, String> {
let obj = config
.as_object()
.ok_or_else(|| format!("filter '{name}' must be an object"))?;
let typ = obj.get("type").and_then(|v| v.as_str()).unwrap_or(name);
match typ {
"lowercase" => Ok(Box::new(LowercaseFilter)),
"stop" => {
let stopwords = obj
.get("stopwords")
.and_then(|v| v.as_array())
.map(|arr| {
arr.iter()
.filter_map(|v| v.as_str().map(String::from))
.collect::<Vec<_>>()
})
.map(|words| StopFilter::new(words))
.unwrap_or_else(StopFilter::english);
Ok(Box::new(stopwords))
}
"stemmer" => {
let lang = obj
.get("language")
.and_then(|v| v.as_str())
.unwrap_or("english");
let algorithm = parse_stemmer_language(lang)?;
Ok(Box::new(StemmerFilter::new(algorithm)))
}
"asciifolding" => {
let preserve = obj
.get("preserve_original")
.and_then(|v| v.as_bool())
.unwrap_or(false);
Ok(Box::new(AsciiFoldingFilter::new(preserve)))
}
"ngram" => {
let min_gram = obj.get("min_gram").and_then(|v| v.as_u64()).unwrap_or(1) as usize;
let max_gram = obj.get("max_gram").and_then(|v| v.as_u64()).unwrap_or(2) as usize;
Ok(Box::new(NGramTokenFilter::new(min_gram, max_gram)))
}
"edge_ngram" => {
let min_gram = obj.get("min_gram").and_then(|v| v.as_u64()).unwrap_or(1) as usize;
let max_gram = obj.get("max_gram").and_then(|v| v.as_u64()).unwrap_or(2) as usize;
let preserve = obj
.get("preserve_original")
.and_then(|v| v.as_bool())
.unwrap_or(false);
Ok(Box::new(EdgeNGramTokenFilter::new(
min_gram, max_gram, preserve,
)))
}
"synonym" => {
let expand = obj.get("expand").and_then(|v| v.as_bool()).unwrap_or(true);
let synonyms: Vec<String> = obj
.get("synonyms")
.and_then(|v| v.as_array())
.map(|arr| {
arr.iter()
.filter_map(|v| v.as_str().map(String::from))
.collect()
})
.unwrap_or_default();
Ok(Box::new(SynonymFilter::new(&synonyms, expand)))
}
"shingle" => {
let min_size = obj
.get("min_shingle_size")
.and_then(|v| v.as_u64())
.unwrap_or(2) as usize;
let max_size = obj
.get("max_shingle_size")
.and_then(|v| v.as_u64())
.unwrap_or(2) as usize;
let output_unigrams = obj
.get("output_unigrams")
.and_then(|v| v.as_bool())
.unwrap_or(true);
Ok(Box::new(ShingleFilter::new(
min_size,
max_size,
output_unigrams,
)))
}
_ => Err(format!("unknown filter type: '{typ}'")),
}
}
}
fn parse_token_chars(obj: &serde_json::Map<String, Value>) -> Result<Vec<TokenChar>, String> {
obj.get("token_chars")
.and_then(|v| v.as_array())
.map(|arr| {
arr.iter()
.filter_map(|v| v.as_str())
.map(|s| {
TokenChar::from_str(s)
.ok_or_else(|| format!("unknown token_chars value: '{s}'"))
})
.collect::<Result<Vec<_>, _>>()
})
.unwrap_or(Ok(Vec::new()))
}
fn parse_stemmer_language(lang: &str) -> Result<rust_stemmers::Algorithm, String> {
match lang.to_lowercase().as_str() {
"english" => Ok(rust_stemmers::Algorithm::English),
"french" => Ok(rust_stemmers::Algorithm::French),
"german" => Ok(rust_stemmers::Algorithm::German),
"spanish" => Ok(rust_stemmers::Algorithm::Spanish),
"italian" => Ok(rust_stemmers::Algorithm::Italian),
"portuguese" => Ok(rust_stemmers::Algorithm::Portuguese),
"dutch" => Ok(rust_stemmers::Algorithm::Dutch),
"swedish" => Ok(rust_stemmers::Algorithm::Swedish),
"norwegian" => Ok(rust_stemmers::Algorithm::Norwegian),
"danish" => Ok(rust_stemmers::Algorithm::Danish),
"finnish" => Ok(rust_stemmers::Algorithm::Finnish),
"hungarian" => Ok(rust_stemmers::Algorithm::Hungarian),
"romanian" => Ok(rust_stemmers::Algorithm::Romanian),
"russian" => Ok(rust_stemmers::Algorithm::Russian),
"turkish" => Ok(rust_stemmers::Algorithm::Turkish),
"arabic" => Ok(rust_stemmers::Algorithm::Arabic),
"greek" => Ok(rust_stemmers::Algorithm::Greek),
_ => Err(format!("unsupported stemmer language: '{lang}'")),
}
}
#[cfg(test)]
mod tests {
    use super::*;
    use serde_json::json;

    /// Builds a registry from `analysis`, runs `input` through the analyzer
    /// named `analyzer_name`, and returns the token texts in order.
    fn analyze(analysis: &serde_json::Value, analyzer_name: &str, input: &str) -> Vec<String> {
        let config = AnalysisConfig::from_json(analysis).unwrap();
        let registry = config.build_registry().unwrap();
        let analyzer = registry.try_get(analyzer_name).unwrap();
        analyzer
            .analyze(input)
            .iter()
            .map(|token| token.text.as_str().to_owned())
            .collect()
    }

    /// Parses `analysis` and returns the error produced by `build_registry`.
    fn build_error(analysis: &serde_json::Value) -> String {
        AnalysisConfig::from_json(analysis)
            .unwrap()
            .build_registry()
            .unwrap_err()
    }

    #[test]
    fn empty_config() {
        let registry = AnalysisConfig::default().build_registry().unwrap();
        assert!(registry.try_get("standard").is_some());
    }

    #[test]
    fn custom_analyzer_basic() {
        let analysis = json!({
            "analyzer": {
                "my_analyzer": {
                    "type": "custom",
                    "tokenizer": "standard",
                    "filter": ["lowercase"]
                }
            }
        });
        let texts = analyze(&analysis, "my_analyzer", "Hello World");
        assert_eq!(texts, vec!["hello", "world"]);
    }

    #[test]
    fn custom_edge_ngram_analyzer() {
        let analysis = json!({
            "tokenizer": {
                "my_tok": {
                    "type": "edge_ngram",
                    "min_gram": 2,
                    "max_gram": 5,
                    "token_chars": ["letter"]
                }
            },
            "analyzer": {
                "autocomplete": {
                    "tokenizer": "my_tok",
                    "filter": ["lowercase"]
                }
            }
        });
        let texts = analyze(&analysis, "autocomplete", "Quick");
        assert_eq!(texts, vec!["qu", "qui", "quic", "quick"]);
    }

    #[test]
    fn custom_char_filter_html() {
        let analysis = json!({
            "analyzer": {
                "html_analyzer": {
                    "char_filter": ["html_strip"],
                    "tokenizer": "standard",
                    "filter": ["lowercase"]
                }
            }
        });
        let texts = analyze(&analysis, "html_analyzer", "<p>Hello <b>World</b></p>");
        assert_eq!(texts, vec!["hello", "world"]);
    }

    #[test]
    fn custom_synonym_filter() {
        let analysis = json!({
            "filter": {
                "my_synonyms": {
                    "type": "synonym",
                    "synonyms": ["quick, fast, speedy"]
                }
            },
            "analyzer": {
                "syn_analyzer": {
                    "tokenizer": "standard",
                    "filter": ["lowercase", "my_synonyms"]
                }
            }
        });
        let texts = analyze(&analysis, "syn_analyzer", "Quick fox");
        let has = |word: &str| texts.iter().any(|text| text == word);
        assert!(has("quick"));
        assert!(has("fast"));
        assert!(has("fox"));
    }

    #[test]
    fn custom_asciifolding() {
        let analysis = json!({
            "analyzer": {
                "folding": {
                    "tokenizer": "standard",
                    "filter": ["lowercase", "asciifolding"]
                }
            }
        });
        let texts = analyze(&analysis, "folding", "Café résumé");
        assert_eq!(texts, vec!["cafe", "resume"]);
    }

    #[test]
    fn missing_tokenizer_error() {
        let analysis = json!({
            "analyzer": {
                "bad": {
                    "tokenizer": "nonexistent",
                    "filter": ["lowercase"]
                }
            }
        });
        let err = build_error(&analysis);
        assert!(err.contains("unknown tokenizer"), "got: {err}");
    }

    #[test]
    fn missing_filter_error() {
        let analysis = json!({
            "analyzer": {
                "bad": {
                    "tokenizer": "standard",
                    "filter": ["nonexistent"]
                }
            }
        });
        let err = build_error(&analysis);
        assert!(err.contains("unknown filter"), "got: {err}");
    }

    #[test]
    fn missing_tokenizer_field_error() {
        let analysis = json!({
            "analyzer": {
                "bad": {
                    "filter": ["lowercase"]
                }
            }
        });
        let err = build_error(&analysis);
        assert!(err.contains("requires a 'tokenizer'"), "got: {err}");
    }

    #[test]
    fn invalid_ngram_params() {
        let analysis = json!({
            "tokenizer": {
                "bad_tok": {
                    "type": "ngram",
                    "min_gram": 5,
                    "max_gram": 2
                }
            },
            "analyzer": {
                "bad": {
                    "tokenizer": "bad_tok"
                }
            }
        });
        let err = build_error(&analysis);
        assert!(err.contains("min_gram"), "got: {err}");
    }

    #[test]
    fn round_trip_serialization() {
        let analysis = json!({
            "tokenizer": {
                "my_tok": {
                    "type": "edge_ngram",
                    "min_gram": 2,
                    "max_gram": 10
                }
            },
            "analyzer": {
                "my_analyzer": {
                    "tokenizer": "my_tok",
                    "filter": ["lowercase"]
                }
            }
        });
        // Serialise the parsed config and parse it again; the analyzer must
        // still be buildable from the round-tripped form.
        let serialized = AnalysisConfig::from_json(&analysis).unwrap().to_json();
        let reparsed = AnalysisConfig::from_json(&serialized).unwrap();
        let registry = reparsed.build_registry().unwrap();
        assert!(registry.try_get("my_analyzer").is_some());
    }

    #[test]
    fn custom_mapping_char_filter() {
        let analysis = json!({
            "char_filter": {
                "emoticons": {
                    "type": "mapping",
                    "mappings": [":) => happy", ":( => sad"]
                }
            },
            "analyzer": {
                "emo": {
                    "char_filter": ["emoticons"],
                    "tokenizer": "standard",
                    "filter": ["lowercase"]
                }
            }
        });
        let texts = analyze(&analysis, "emo", "I am :) today");
        assert!(texts.iter().any(|text| text == "happy"));
    }

    #[test]
    fn custom_stop_words() {
        let analysis = json!({
            "filter": {
                "my_stop": {
                    "type": "stop",
                    "stopwords": ["hello", "world"]
                }
            },
            "analyzer": {
                "custom_stop": {
                    "tokenizer": "standard",
                    "filter": ["lowercase", "my_stop"]
                }
            }
        });
        let texts = analyze(&analysis, "custom_stop", "Hello World Foo");
        assert_eq!(texts, vec!["foo"]);
    }

    #[test]
    fn pattern_tokenizer_config() {
        let analysis = json!({
            "tokenizer": {
                "comma_tok": {
                    "type": "pattern",
                    "pattern": ","
                }
            },
            "analyzer": {
                "csv": {
                    "tokenizer": "comma_tok"
                }
            }
        });
        let texts = analyze(&analysis, "csv", "a,b,c");
        assert_eq!(texts, vec!["a", "b", "c"]);
    }
}