nodedb_fts/index/analyzer_config.rs
//! Per-collection analyzer configuration stored in backend metadata.
//!
//! Schema:
//! - `"{collection}:analyzer"` → analyzer name (e.g., "german", "cjk_bigram")
//! - `"{collection}:language"` → language code (e.g., "de", "ja")
//!
//! Applied automatically at both index time and query time.

use crate::analyzer::language::stemmer::{LanguageAnalyzer, NoStemAnalyzer};
use crate::analyzer::pipeline::{TextAnalyzer, analyze};
use crate::analyzer::standard::StandardAnalyzer;
use crate::backend::FtsBackend;
use crate::index::FtsIndex;

15impl<B: FtsBackend> FtsIndex<B> {
16    /// Set the analyzer for a collection. Persists to backend metadata.
17    pub fn set_collection_analyzer(
18        &self,
19        collection: &str,
20        analyzer_name: &str,
21    ) -> Result<(), B::Error> {
22        let key = format!("{collection}:analyzer");
23        self.backend.write_meta(&key, analyzer_name.as_bytes())
24    }
25
26    /// Set the language for a collection. Persists to backend metadata.
27    pub fn set_collection_language(
28        &self,
29        collection: &str,
30        lang_code: &str,
31    ) -> Result<(), B::Error> {
32        let key = format!("{collection}:language");
33        self.backend.write_meta(&key, lang_code.as_bytes())
34    }
35
36    /// Get the configured analyzer name for a collection.
37    pub fn get_collection_analyzer(&self, collection: &str) -> Result<Option<String>, B::Error> {
38        let key = format!("{collection}:analyzer");
39        match self.backend.read_meta(&key)? {
40            Some(bytes) => Ok(std::str::from_utf8(&bytes).ok().map(String::from)),
41            None => Ok(None),
42        }
43    }
44
45    /// Get the configured language for a collection.
46    pub fn get_collection_language(&self, collection: &str) -> Result<Option<String>, B::Error> {
47        let key = format!("{collection}:language");
48        match self.backend.read_meta(&key)? {
49            Some(bytes) => Ok(std::str::from_utf8(&bytes).ok().map(String::from)),
50            None => Ok(None),
51        }
52    }
53
54    /// Analyze text using the collection's configured analyzer.
55    ///
56    /// Falls back to the standard English analyzer if no analyzer is configured.
57    pub fn analyze_for_collection(
58        &self,
59        collection: &str,
60        text: &str,
61    ) -> Result<Vec<String>, B::Error> {
62        let analyzer_name = self.get_collection_analyzer(collection)?;
63        match analyzer_name.as_deref() {
64            Some(name) => Ok(resolve_analyzer(name).analyze(text)),
65            None => Ok(analyze(text)),
66        }
67    }
68
69    /// Tokenize text WITHOUT stemming for fuzzy matching.
70    ///
71    /// Returns raw (unstemmed but normalized) tokens so that fuzzy edit
72    /// distance is computed on original word forms, not stemmed forms.
73    pub fn tokenize_raw_for_collection(
74        &self,
75        collection: &str,
76        text: &str,
77    ) -> Result<Vec<String>, B::Error> {
78        let lang = self.get_collection_language(collection)?;
79        let lang_code = lang.as_deref().unwrap_or("en");
80        let stop_list = crate::analyzer::language::stop_words::stop_words(lang_code);
81        Ok(crate::analyzer::pipeline::tokenize_raw(
82            text, lang_code, stop_list,
83        ))
84    }
85}
86
87/// Resolve an analyzer name to a `Box<dyn TextAnalyzer>`.
88fn resolve_analyzer(name: &str) -> Box<dyn TextAnalyzer> {
89    match name {
90        "standard" => Box::new(StandardAnalyzer),
91        _ => {
92            if let Some(a) = LanguageAnalyzer::new(name) {
93                Box::new(a)
94            } else if let Some(a) = NoStemAnalyzer::new(name) {
95                Box::new(a)
96            } else {
97                Box::new(StandardAnalyzer)
98            }
99        }
100    }
101}
102
#[cfg(test)]
mod tests {
    use crate::backend::memory::MemoryBackend;
    use crate::index::FtsIndex;

    /// With no configuration, the standard English analyzer is applied:
    /// content words survive, English stop words do not.
    #[test]
    fn default_analyzer() {
        let index = FtsIndex::new(MemoryBackend::new());
        let tokens = index
            .analyze_for_collection("col", "The quick brown fox")
            .unwrap();
        assert!(tokens.contains(&"quick".to_string()));
        assert!(!tokens.contains(&"the".to_string()));
    }

    /// A configured "german" analyzer removes German stop words.
    #[test]
    fn configured_german_analyzer() {
        let index = FtsIndex::new(MemoryBackend::new());
        index.set_collection_analyzer("col", "german").unwrap();

        let tokens = index
            .analyze_for_collection("col", "Die Datenbanken sind schnell")
            .unwrap();
        assert!(!tokens.iter().any(|t| t == "die" || t == "sind"));
        assert!(!tokens.is_empty());
    }

    /// A configured "hindi" analyzer removes Hindi stop words.
    #[test]
    fn configured_hindi_no_stem() {
        let index = FtsIndex::new(MemoryBackend::new());
        index.set_collection_analyzer("col", "hindi").unwrap();

        let tokens = index.analyze_for_collection("col", "यह एक परीक्षा है").unwrap();
        assert!(!tokens.iter().any(|t| t == "यह" || t == "है"));
    }

    /// Analyzer and language settings round-trip through backend metadata.
    #[test]
    fn analyzer_persists() {
        let index = FtsIndex::new(MemoryBackend::new());
        index.set_collection_analyzer("col", "french").unwrap();
        index.set_collection_language("col", "fr").unwrap();

        assert_eq!(
            index.get_collection_analyzer("col").unwrap().as_deref(),
            Some("french")
        );
        assert_eq!(
            index.get_collection_language("col").unwrap().as_deref(),
            Some("fr")
        );
    }
}
154}