nodedb_fts/index/
analyzer_config.rs1use crate::analyzer::language::stemmer::{LanguageAnalyzer, NoStemAnalyzer};
10use crate::analyzer::pipeline::{TextAnalyzer, analyze};
11use crate::analyzer::standard::StandardAnalyzer;
12use crate::backend::FtsBackend;
13use crate::index::FtsIndex;
14
15impl<B: FtsBackend> FtsIndex<B> {
16 pub fn set_collection_analyzer(
18 &self,
19 collection: &str,
20 analyzer_name: &str,
21 ) -> Result<(), B::Error> {
22 let key = format!("{collection}:analyzer");
23 self.backend.write_meta(&key, analyzer_name.as_bytes())
24 }
25
26 pub fn set_collection_language(
28 &self,
29 collection: &str,
30 lang_code: &str,
31 ) -> Result<(), B::Error> {
32 let key = format!("{collection}:language");
33 self.backend.write_meta(&key, lang_code.as_bytes())
34 }
35
36 pub fn get_collection_analyzer(&self, collection: &str) -> Result<Option<String>, B::Error> {
38 let key = format!("{collection}:analyzer");
39 match self.backend.read_meta(&key)? {
40 Some(bytes) => Ok(std::str::from_utf8(&bytes).ok().map(String::from)),
41 None => Ok(None),
42 }
43 }
44
45 pub fn get_collection_language(&self, collection: &str) -> Result<Option<String>, B::Error> {
47 let key = format!("{collection}:language");
48 match self.backend.read_meta(&key)? {
49 Some(bytes) => Ok(std::str::from_utf8(&bytes).ok().map(String::from)),
50 None => Ok(None),
51 }
52 }
53
54 pub fn analyze_for_collection(
58 &self,
59 collection: &str,
60 text: &str,
61 ) -> Result<Vec<String>, B::Error> {
62 let analyzer_name = self.get_collection_analyzer(collection)?;
63 match analyzer_name.as_deref() {
64 Some(name) => Ok(resolve_analyzer(name).analyze(text)),
65 None => Ok(analyze(text)),
66 }
67 }
68
69 pub fn tokenize_raw_for_collection(
74 &self,
75 collection: &str,
76 text: &str,
77 ) -> Result<Vec<String>, B::Error> {
78 let lang = self.get_collection_language(collection)?;
79 let lang_code = lang.as_deref().unwrap_or("en");
80 let stop_list = crate::analyzer::language::stop_words::stop_words(lang_code);
81 Ok(crate::analyzer::pipeline::tokenize_raw(
82 text, lang_code, stop_list,
83 ))
84 }
85}
86
87fn resolve_analyzer(name: &str) -> Box<dyn TextAnalyzer> {
89 match name {
90 "standard" => Box::new(StandardAnalyzer),
91 _ => {
92 if let Some(a) = LanguageAnalyzer::new(name) {
93 Box::new(a)
94 } else if let Some(a) = NoStemAnalyzer::new(name) {
95 Box::new(a)
96 } else {
97 Box::new(StandardAnalyzer)
98 }
99 }
100 }
101}
102
#[cfg(test)]
mod tests {
    use crate::backend::memory::MemoryBackend;
    use crate::index::FtsIndex;

    /// Without configuration, the default pipeline keeps content words and
    /// drops English stop words.
    #[test]
    fn default_analyzer() {
        let index = FtsIndex::new(MemoryBackend::new());
        let tokens = index
            .analyze_for_collection("col", "The quick brown fox")
            .unwrap();
        assert!(tokens.iter().any(|t| t == "quick"));
        assert!(tokens.iter().all(|t| t != "the"));
    }

    /// A collection configured with the German analyzer drops German stop
    /// words while still producing tokens.
    #[test]
    fn configured_german_analyzer() {
        let index = FtsIndex::new(MemoryBackend::new());
        index.set_collection_analyzer("col", "german").unwrap();

        let tokens = index
            .analyze_for_collection("col", "Die Datenbanken sind schnell")
            .unwrap();
        assert!(!tokens.is_empty());
        for stop in ["die", "sind"] {
            assert!(tokens.iter().all(|t| t != stop));
        }
    }

    /// A collection configured with the Hindi analyzer drops Hindi stop
    /// words (no stemming expected for Hindi).
    #[test]
    fn configured_hindi_no_stem() {
        let index = FtsIndex::new(MemoryBackend::new());
        index.set_collection_analyzer("col", "hindi").unwrap();

        let tokens = index.analyze_for_collection("col", "यह एक परीक्षा है").unwrap();
        for stop in ["यह", "है"] {
            assert!(tokens.iter().all(|t| t != stop));
        }
    }

    /// Analyzer and language settings written to the backend read back
    /// unchanged.
    #[test]
    fn analyzer_persists() {
        let index = FtsIndex::new(MemoryBackend::new());
        index.set_collection_analyzer("col", "french").unwrap();
        index.set_collection_language("col", "fr").unwrap();

        let analyzer = index.get_collection_analyzer("col").unwrap();
        let language = index.get_collection_language("col").unwrap();
        assert_eq!(analyzer.as_deref(), Some("french"));
        assert_eq!(language.as_deref(), Some("fr"));
    }
}