// nodedb_fts/index/analyzer_config.rs
// SPDX-License-Identifier: Apache-2.0

//! Per-collection analyzer configuration stored in backend metadata.
//!
//! Uses structural `(tid, collection, subkey)` meta blobs:
//! - `subkey = "analyzer"` → analyzer name (e.g. "german", "cjk_bigram")
//! - `subkey = "language"` → lang code (e.g. "de", "ja")
//!
//! Applied automatically at both index time and query time.
use crate::analyzer::language::stemmer::{LanguageAnalyzer, NoStemAnalyzer};
use crate::analyzer::pipeline::{TextAnalyzer, analyze};
use crate::analyzer::standard::StandardAnalyzer;
use crate::backend::FtsBackend;
use crate::index::FtsIndex;

17impl<B: FtsBackend> FtsIndex<B> {
18    /// Set the analyzer for a collection. Persists to backend metadata.
19    pub fn set_collection_analyzer(
20        &self,
21        tid: u64,
22        collection: &str,
23        analyzer_name: &str,
24    ) -> Result<(), B::Error> {
25        self.backend
26            .write_meta(tid, collection, "analyzer", analyzer_name.as_bytes())
27    }
28
29    /// Set the language for a collection. Persists to backend metadata.
30    pub fn set_collection_language(
31        &self,
32        tid: u64,
33        collection: &str,
34        lang_code: &str,
35    ) -> Result<(), B::Error> {
36        self.backend
37            .write_meta(tid, collection, "language", lang_code.as_bytes())
38    }
39
40    /// Get the configured analyzer name for a collection.
41    pub fn get_collection_analyzer(
42        &self,
43        tid: u64,
44        collection: &str,
45    ) -> Result<Option<String>, B::Error> {
46        match self.backend.read_meta(tid, collection, "analyzer")? {
47            Some(bytes) => Ok(std::str::from_utf8(&bytes).ok().map(String::from)),
48            None => Ok(None),
49        }
50    }
51
52    /// Get the configured language for a collection.
53    pub fn get_collection_language(
54        &self,
55        tid: u64,
56        collection: &str,
57    ) -> Result<Option<String>, B::Error> {
58        match self.backend.read_meta(tid, collection, "language")? {
59            Some(bytes) => Ok(std::str::from_utf8(&bytes).ok().map(String::from)),
60            None => Ok(None),
61        }
62    }
63
64    /// Analyze text using the collection's configured analyzer.
65    ///
66    /// Falls back to the standard English analyzer if no analyzer is configured.
67    pub fn analyze_for_collection(
68        &self,
69        tid: u64,
70        collection: &str,
71        text: &str,
72    ) -> Result<Vec<String>, B::Error> {
73        let analyzer_name = self.get_collection_analyzer(tid, collection)?;
74        match analyzer_name.as_deref() {
75            Some(name) => Ok(resolve_analyzer(name).analyze(text)),
76            None => Ok(analyze(text)),
77        }
78    }
79
80    /// Tokenize text WITHOUT stemming for fuzzy matching.
81    pub fn tokenize_raw_for_collection(
82        &self,
83        tid: u64,
84        collection: &str,
85        text: &str,
86    ) -> Result<Vec<String>, B::Error> {
87        let lang = self.get_collection_language(tid, collection)?;
88        let lang_code = lang.as_deref().unwrap_or("en");
89        let stop_list = crate::analyzer::language::stop_words::stop_words(lang_code);
90        Ok(crate::analyzer::pipeline::tokenize_raw(
91            text, lang_code, stop_list,
92        ))
93    }
94}
96/// Resolve an analyzer name to a `Box<dyn TextAnalyzer>`.
97fn resolve_analyzer(name: &str) -> Box<dyn TextAnalyzer> {
98    match name {
99        "standard" => Box::new(StandardAnalyzer),
100        _ => {
101            if let Some(a) = LanguageAnalyzer::new(name) {
102                Box::new(a)
103            } else if let Some(a) = NoStemAnalyzer::new(name) {
104                Box::new(a)
105            } else {
106                Box::new(StandardAnalyzer)
107            }
108        }
109    }
110}
#[cfg(test)]
mod tests {
    use crate::backend::memory::MemoryBackend;
    use crate::index::FtsIndex;

    // Tenant id shared by every test in this module.
    const T: u64 = 1;

    #[test]
    fn default_analyzer() {
        let index = FtsIndex::new(MemoryBackend::new());
        let tokens = index
            .analyze_for_collection(T, "col", "The quick brown fox")
            .unwrap();
        // Standard English pipeline: content word kept, stop word dropped.
        assert!(tokens.iter().any(|t| t == "quick"));
        assert!(tokens.iter().all(|t| t != "the"));
    }

    #[test]
    fn configured_german_analyzer() {
        let index = FtsIndex::new(MemoryBackend::new());
        index.set_collection_analyzer(T, "col", "german").unwrap();

        let tokens = index
            .analyze_for_collection(T, "col", "Die Datenbanken sind schnell")
            .unwrap();
        // German stop words are removed; some content tokens must survive.
        assert!(tokens.iter().all(|t| t != "die" && t != "sind"));
        assert!(!tokens.is_empty());
    }

    #[test]
    fn configured_hindi_no_stem() {
        let index = FtsIndex::new(MemoryBackend::new());
        index.set_collection_analyzer(T, "col", "hindi").unwrap();

        let tokens = index
            .analyze_for_collection(T, "col", "यह एक परीक्षा है")
            .unwrap();
        // Hindi stop words should be filtered out.
        assert!(tokens.iter().all(|t| t != "यह" && t != "है"));
    }

    #[test]
    fn analyzer_persists() {
        let index = FtsIndex::new(MemoryBackend::new());
        index.set_collection_analyzer(T, "col", "french").unwrap();
        index.set_collection_language(T, "col", "fr").unwrap();

        // Both settings round-trip through backend metadata.
        let name = index.get_collection_analyzer(T, "col").unwrap();
        let lang = index.get_collection_language(T, "col").unwrap();
        assert_eq!(name.as_deref(), Some("french"));
        assert_eq!(lang.as_deref(), Some("fr"));
    }
}