//! Serde-(de)serializable configuration types for text analyzers: character
//! filters, a tokenizer, and token filters.

use std::collections::HashMap;

use serde::{Deserialize, Serialize};
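
/// A declarative analyzer configuration: zero or more character filters, a
/// required tokenizer, and zero or more token filters, applied in that order.
///
/// Both filter lists default to empty and are omitted from serialized output
/// when empty, so a minimal definition is just a tokenizer. For example:
///
/// ```json
/// {
///   "char_filters": [{"type": "unicode_normalization", "form": "nfkc"}],
///   "tokenizer": {"type": "whitespace"},
///   "token_filters": [{"type": "lowercase"}]
/// }
/// ```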
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct AnalyzerDefinition {
#[serde(default, skip_serializing_if = "Vec::is_empty")]
pub char_filters: Vec<CharFilterConfig>,
pub tokenizer: TokenizerConfig,
#[serde(default, skip_serializing_if = "Vec::is_empty")]
pub token_filters: Vec<TokenFilterConfig>,
}
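
/// Tokenizer selection, serialized as an internally tagged enum: each JSON
/// object carries a snake_case `"type"` field (e.g. `{"type": "unicode_word"}`).
///
/// For the `regex` variant, `pattern` falls back to `\w+` and `gaps` to
/// `false` when omitted.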
#[derive(Debug, Clone, Serialize, Deserialize)]
#[serde(tag = "type", rename_all = "snake_case")]
pub enum TokenizerConfig {
Whitespace,
UnicodeWord,
Regex {
#[serde(default = "default_regex_pattern")]
pattern: String,
#[serde(default)]
gaps: bool,
},
Ngram {
min_gram: usize,
max_gram: usize,
},
Lindera {
mode: String,
dict: String,
#[serde(default)]
user_dict: Option<String>,
},
Whole,
}
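
/// Character-level filters applied to the text before tokenization, tagged by
/// a snake_case `"type"` field. For `japanese_iteration_mark`, both `kanji`
/// and `kana` default to `true` when omitted.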
#[derive(Debug, Clone, Serialize, Deserialize)]
#[serde(tag = "type", rename_all = "snake_case")]
pub enum CharFilterConfig {
UnicodeNormalization {
form: String,
},
PatternReplace {
pattern: String,
replacement: String,
},
Mapping {
mapping: HashMap<String, String>,
},
JapaneseIterationMark {
#[serde(default = "default_true")]
kanji: bool,
#[serde(default = "default_true")]
kana: bool,
},
}
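
/// Token-level filters applied to the token stream after tokenization, tagged
/// by a snake_case `"type"` field. The optional fields (`words` on `stop`,
/// `stem_type` on `stem`) default to `None` when omitted.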
#[derive(Debug, Clone, Serialize, Deserialize)]
#[serde(tag = "type", rename_all = "snake_case")]
pub enum TokenFilterConfig {
Lowercase,
Stop {
#[serde(default)]
words: Option<Vec<String>>,
},
Stem {
#[serde(default)]
stem_type: Option<String>,
},
Boost {
boost: f32,
},
Limit {
limit: usize,
},
Strip,
RemoveEmpty,
FlattenGraph,
}
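
/// Default `pattern` for the `regex` tokenizer: one or more word characters.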
fn default_regex_pattern() -> String {
r"\w+".to_string()
}
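
/// Serde default helper for boolean flags that are `true` when omitted.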
fn default_true() -> bool {
true
}

#[cfg(test)]
mod tests {
use super::*;

    #[test]
fn test_analyzer_definition_serde_roundtrip() {
let def = AnalyzerDefinition {
char_filters: vec![CharFilterConfig::UnicodeNormalization {
form: "nfkc".into(),
}],
tokenizer: TokenizerConfig::Regex {
pattern: r"\w+".into(),
gaps: false,
},
token_filters: vec![
TokenFilterConfig::Lowercase,
TokenFilterConfig::Stop {
words: Some(vec!["the".into(), "a".into()]),
},
TokenFilterConfig::Stem { stem_type: None },
],
};
let json = serde_json::to_string(&def).unwrap();
let deserialized: AnalyzerDefinition = serde_json::from_str(&json).unwrap();
assert_eq!(deserialized.token_filters.len(), 3);
assert_eq!(deserialized.char_filters.len(), 1);
}

    #[test]
fn test_tokenizer_config_variants() {
        // Covers every variant, including `lindera` (same JSON shape as the
        // full-definition test below).
        let configs = vec![
            r#"{"type": "whitespace"}"#,
            r#"{"type": "unicode_word"}"#,
            r#"{"type": "regex", "pattern": "\\w+", "gaps": false}"#,
            r#"{"type": "ngram", "min_gram": 2, "max_gram": 3}"#,
            r#"{"type": "lindera", "mode": "normal", "dict": "embedded://unidic"}"#,
            r#"{"type": "whole"}"#,
        ];
for json in configs {
let config: TokenizerConfig = serde_json::from_str(json).unwrap();
let serialized = serde_json::to_string(&config).unwrap();
let _roundtrip: TokenizerConfig = serde_json::from_str(&serialized).unwrap();
}
}
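
    // Added test (a sketch exercising the serde defaults declared on the
    // `regex` variant): an absent `pattern` should fall back to `\w+` and an
    // absent `gaps` to `false`.
    #[test]
    fn test_regex_tokenizer_defaults() {
        let config: TokenizerConfig = serde_json::from_str(r#"{"type": "regex"}"#).unwrap();
        match config {
            TokenizerConfig::Regex { pattern, gaps } => {
                assert_eq!(pattern, r"\w+");
                assert!(!gaps);
            }
            other => panic!("expected a regex tokenizer, got {:?}", other),
        }
    }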

    #[test]
fn test_char_filter_config_variants() {
let configs = vec![
r#"{"type": "unicode_normalization", "form": "nfkc"}"#,
r#"{"type": "pattern_replace", "pattern": "foo", "replacement": "bar"}"#,
r#"{"type": "mapping", "mapping": {"a": "b"}}"#,
r#"{"type": "japanese_iteration_mark"}"#,
];
for json in configs {
let config: CharFilterConfig = serde_json::from_str(json).unwrap();
let serialized = serde_json::to_string(&config).unwrap();
let _roundtrip: CharFilterConfig = serde_json::from_str(&serialized).unwrap();
}
}
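
    // Added test (a sketch of the `default_true` helper's effect): omitting
    // both flags should yield `kanji: true` and `kana: true`.
    #[test]
    fn test_japanese_iteration_mark_defaults() {
        let config: CharFilterConfig =
            serde_json::from_str(r#"{"type": "japanese_iteration_mark"}"#).unwrap();
        match config {
            CharFilterConfig::JapaneseIterationMark { kanji, kana } => {
                assert!(kanji);
                assert!(kana);
            }
            other => panic!("expected japanese_iteration_mark, got {:?}", other),
        }
    }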

    #[test]
fn test_token_filter_config_variants() {
let configs = vec![
r#"{"type": "lowercase"}"#,
r#"{"type": "stop"}"#,
r#"{"type": "stop", "words": ["the", "a"]}"#,
r#"{"type": "stem"}"#,
r#"{"type": "stem", "stem_type": "porter"}"#,
r#"{"type": "boost", "boost": 2.0}"#,
r#"{"type": "limit", "limit": 100}"#,
r#"{"type": "strip"}"#,
r#"{"type": "remove_empty"}"#,
r#"{"type": "flatten_graph"}"#,
];
for json in configs {
let config: TokenFilterConfig = serde_json::from_str(json).unwrap();
let serialized = serde_json::to_string(&config).unwrap();
let _roundtrip: TokenFilterConfig = serde_json::from_str(&serialized).unwrap();
}
}

    #[test]
fn test_full_schema_with_analyzers_json() {
let json = r#"{
"char_filters": [{"type": "unicode_normalization", "form": "nfkc"}],
"tokenizer": {"type": "lindera", "mode": "normal", "dict": "embedded://unidic"},
"token_filters": [{"type": "lowercase"}]
}"#;
let def: AnalyzerDefinition = serde_json::from_str(json).unwrap();
assert!(matches!(def.tokenizer, TokenizerConfig::Lindera { .. }));
}

    #[test]
fn test_minimal_definition() {
let json = r#"{"tokenizer": {"type": "whitespace"}}"#;
let def: AnalyzerDefinition = serde_json::from_str(json).unwrap();
assert!(def.char_filters.is_empty());
assert!(def.token_filters.is_empty());
}
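
    // Added test (a sketch of `skip_serializing_if = "Vec::is_empty"`): a
    // definition with no filters should serialize to just its tokenizer.
    #[test]
    fn test_empty_filter_lists_are_skipped() {
        let def = AnalyzerDefinition {
            char_filters: vec![],
            tokenizer: TokenizerConfig::Whole,
            token_filters: vec![],
        };
        let json = serde_json::to_string(&def).unwrap();
        assert_eq!(json, r#"{"tokenizer":{"type":"whole"}}"#);
    }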
}