nodedb_fts/index/
analyzer_config.rs1use crate::analyzer::language::stemmer::{LanguageAnalyzer, NoStemAnalyzer};
10use crate::analyzer::pipeline::{TextAnalyzer, analyze};
11use crate::analyzer::standard::StandardAnalyzer;
12use crate::backend::FtsBackend;
13use crate::index::FtsIndex;
14
15impl<B: FtsBackend> FtsIndex<B> {
16 pub fn set_collection_analyzer(
18 &self,
19 tid: u32,
20 collection: &str,
21 analyzer_name: &str,
22 ) -> Result<(), B::Error> {
23 self.backend
24 .write_meta(tid, collection, "analyzer", analyzer_name.as_bytes())
25 }
26
27 pub fn set_collection_language(
29 &self,
30 tid: u32,
31 collection: &str,
32 lang_code: &str,
33 ) -> Result<(), B::Error> {
34 self.backend
35 .write_meta(tid, collection, "language", lang_code.as_bytes())
36 }
37
38 pub fn get_collection_analyzer(
40 &self,
41 tid: u32,
42 collection: &str,
43 ) -> Result<Option<String>, B::Error> {
44 match self.backend.read_meta(tid, collection, "analyzer")? {
45 Some(bytes) => Ok(std::str::from_utf8(&bytes).ok().map(String::from)),
46 None => Ok(None),
47 }
48 }
49
50 pub fn get_collection_language(
52 &self,
53 tid: u32,
54 collection: &str,
55 ) -> Result<Option<String>, B::Error> {
56 match self.backend.read_meta(tid, collection, "language")? {
57 Some(bytes) => Ok(std::str::from_utf8(&bytes).ok().map(String::from)),
58 None => Ok(None),
59 }
60 }
61
62 pub fn analyze_for_collection(
66 &self,
67 tid: u32,
68 collection: &str,
69 text: &str,
70 ) -> Result<Vec<String>, B::Error> {
71 let analyzer_name = self.get_collection_analyzer(tid, collection)?;
72 match analyzer_name.as_deref() {
73 Some(name) => Ok(resolve_analyzer(name).analyze(text)),
74 None => Ok(analyze(text)),
75 }
76 }
77
78 pub fn tokenize_raw_for_collection(
80 &self,
81 tid: u32,
82 collection: &str,
83 text: &str,
84 ) -> Result<Vec<String>, B::Error> {
85 let lang = self.get_collection_language(tid, collection)?;
86 let lang_code = lang.as_deref().unwrap_or("en");
87 let stop_list = crate::analyzer::language::stop_words::stop_words(lang_code);
88 Ok(crate::analyzer::pipeline::tokenize_raw(
89 text, lang_code, stop_list,
90 ))
91 }
92}
93
94fn resolve_analyzer(name: &str) -> Box<dyn TextAnalyzer> {
96 match name {
97 "standard" => Box::new(StandardAnalyzer),
98 _ => {
99 if let Some(a) = LanguageAnalyzer::new(name) {
100 Box::new(a)
101 } else if let Some(a) = NoStemAnalyzer::new(name) {
102 Box::new(a)
103 } else {
104 Box::new(StandardAnalyzer)
105 }
106 }
107 }
108}
109
#[cfg(test)]
mod tests {
    use crate::backend::memory::MemoryBackend;
    use crate::index::FtsIndex;

    /// `tid` argument shared by every test in this module.
    const TID: u32 = 1;

    /// With nothing configured, "quick" must be produced and "the" must not.
    #[test]
    fn default_analyzer() {
        let index = FtsIndex::new(MemoryBackend::new());
        let terms = index
            .analyze_for_collection(TID, "col", "The quick brown fox")
            .unwrap();
        assert!(terms.iter().any(|term| term == "quick"));
        assert!(terms.iter().all(|term| term != "the"));
    }

    /// With the "german" analyzer configured, "die" and "sind" must be absent
    /// while some tokens still remain.
    #[test]
    fn configured_german_analyzer() {
        let index = FtsIndex::new(MemoryBackend::new());
        index.set_collection_analyzer(TID, "col", "german").unwrap();

        let terms = index
            .analyze_for_collection(TID, "col", "Die Datenbanken sind schnell")
            .unwrap();
        assert!(terms.iter().all(|term| term != "die" && term != "sind"));
        assert!(!terms.is_empty());
    }

    /// With the "hindi" analyzer configured, "यह" and "है" must be absent.
    #[test]
    fn configured_hindi_no_stem() {
        let index = FtsIndex::new(MemoryBackend::new());
        index.set_collection_analyzer(TID, "col", "hindi").unwrap();

        let terms = index
            .analyze_for_collection(TID, "col", "यह एक परीक्षा है")
            .unwrap();
        assert!(terms.iter().all(|term| term != "यह" && term != "है"));
    }

    /// Values written by the setters must be returned unchanged by the getters.
    #[test]
    fn analyzer_persists() {
        let index = FtsIndex::new(MemoryBackend::new());
        index.set_collection_analyzer(TID, "col", "french").unwrap();
        index.set_collection_language(TID, "col", "fr").unwrap();

        let stored_analyzer = index.get_collection_analyzer(TID, "col").unwrap();
        assert_eq!(stored_analyzer.as_deref(), Some("french"));

        let stored_language = index.get_collection_language(TID, "col").unwrap();
        assert_eq!(stored_language.as_deref(), Some("fr"));
    }
}