Skip to main content

nodedb_fts/index/
analyzer_config.rs

1//! Per-collection analyzer configuration stored in backend metadata.
2//!
3//! Uses structural `(tid, collection, subkey)` meta blobs:
4//! - `subkey = "analyzer"` → analyzer name (e.g. "german", "cjk_bigram")
5//! - `subkey = "language"` → lang code (e.g. "de", "ja")
6//!
7//! Applied automatically at both index time and query time.
8
9use crate::analyzer::language::stemmer::{LanguageAnalyzer, NoStemAnalyzer};
10use crate::analyzer::pipeline::{TextAnalyzer, analyze};
11use crate::analyzer::standard::StandardAnalyzer;
12use crate::backend::FtsBackend;
13use crate::index::FtsIndex;
14
15impl<B: FtsBackend> FtsIndex<B> {
16    /// Set the analyzer for a collection. Persists to backend metadata.
17    pub fn set_collection_analyzer(
18        &self,
19        tid: u32,
20        collection: &str,
21        analyzer_name: &str,
22    ) -> Result<(), B::Error> {
23        self.backend
24            .write_meta(tid, collection, "analyzer", analyzer_name.as_bytes())
25    }
26
27    /// Set the language for a collection. Persists to backend metadata.
28    pub fn set_collection_language(
29        &self,
30        tid: u32,
31        collection: &str,
32        lang_code: &str,
33    ) -> Result<(), B::Error> {
34        self.backend
35            .write_meta(tid, collection, "language", lang_code.as_bytes())
36    }
37
38    /// Get the configured analyzer name for a collection.
39    pub fn get_collection_analyzer(
40        &self,
41        tid: u32,
42        collection: &str,
43    ) -> Result<Option<String>, B::Error> {
44        match self.backend.read_meta(tid, collection, "analyzer")? {
45            Some(bytes) => Ok(std::str::from_utf8(&bytes).ok().map(String::from)),
46            None => Ok(None),
47        }
48    }
49
50    /// Get the configured language for a collection.
51    pub fn get_collection_language(
52        &self,
53        tid: u32,
54        collection: &str,
55    ) -> Result<Option<String>, B::Error> {
56        match self.backend.read_meta(tid, collection, "language")? {
57            Some(bytes) => Ok(std::str::from_utf8(&bytes).ok().map(String::from)),
58            None => Ok(None),
59        }
60    }
61
62    /// Analyze text using the collection's configured analyzer.
63    ///
64    /// Falls back to the standard English analyzer if no analyzer is configured.
65    pub fn analyze_for_collection(
66        &self,
67        tid: u32,
68        collection: &str,
69        text: &str,
70    ) -> Result<Vec<String>, B::Error> {
71        let analyzer_name = self.get_collection_analyzer(tid, collection)?;
72        match analyzer_name.as_deref() {
73            Some(name) => Ok(resolve_analyzer(name).analyze(text)),
74            None => Ok(analyze(text)),
75        }
76    }
77
78    /// Tokenize text WITHOUT stemming for fuzzy matching.
79    pub fn tokenize_raw_for_collection(
80        &self,
81        tid: u32,
82        collection: &str,
83        text: &str,
84    ) -> Result<Vec<String>, B::Error> {
85        let lang = self.get_collection_language(tid, collection)?;
86        let lang_code = lang.as_deref().unwrap_or("en");
87        let stop_list = crate::analyzer::language::stop_words::stop_words(lang_code);
88        Ok(crate::analyzer::pipeline::tokenize_raw(
89            text, lang_code, stop_list,
90        ))
91    }
92}
93
94/// Resolve an analyzer name to a `Box<dyn TextAnalyzer>`.
95fn resolve_analyzer(name: &str) -> Box<dyn TextAnalyzer> {
96    match name {
97        "standard" => Box::new(StandardAnalyzer),
98        _ => {
99            if let Some(a) = LanguageAnalyzer::new(name) {
100                Box::new(a)
101            } else if let Some(a) = NoStemAnalyzer::new(name) {
102                Box::new(a)
103            } else {
104                Box::new(StandardAnalyzer)
105            }
106        }
107    }
108}
109
#[cfg(test)]
mod tests {
    use crate::backend::memory::MemoryBackend;
    use crate::index::FtsIndex;

    /// `tid` value shared by every test in this module.
    const TID: u32 = 1;

    /// With nothing configured, the default pipeline keeps "quick" and drops "the".
    #[test]
    fn default_analyzer() {
        let index = FtsIndex::new(MemoryBackend::new());
        let terms = index
            .analyze_for_collection(TID, "col", "The quick brown fox")
            .unwrap();
        assert!(terms.iter().any(|term| term == "quick"));
        assert!(terms.iter().all(|term| term != "the"));
    }

    /// The "german" analyzer removes "die" / "sind" but still emits tokens.
    #[test]
    fn configured_german_analyzer() {
        let index = FtsIndex::new(MemoryBackend::new());
        index.set_collection_analyzer(TID, "col", "german").unwrap();

        let terms = index
            .analyze_for_collection(TID, "col", "Die Datenbanken sind schnell")
            .unwrap();
        assert!(terms.iter().all(|term| term != "die" && term != "sind"));
        assert!(!terms.is_empty());
    }

    /// The "hindi" analyzer removes the words "यह" and "है" from the output.
    #[test]
    fn configured_hindi_no_stem() {
        let index = FtsIndex::new(MemoryBackend::new());
        index.set_collection_analyzer(TID, "col", "hindi").unwrap();

        let terms = index
            .analyze_for_collection(TID, "col", "यह एक परीक्षा है")
            .unwrap();
        assert!(terms.iter().all(|term| term != "यह" && term != "है"));
    }

    /// Values written through the setters are returned by the getters.
    #[test]
    fn analyzer_persists() {
        let index = FtsIndex::new(MemoryBackend::new());
        index.set_collection_analyzer(TID, "col", "french").unwrap();
        index.set_collection_language(TID, "col", "fr").unwrap();

        let analyzer = index.get_collection_analyzer(TID, "col").unwrap();
        assert_eq!(analyzer.as_deref(), Some("french"));

        let language = index.get_collection_language(TID, "col").unwrap();
        assert_eq!(language.as_deref(), Some("fr"));
    }
}