// nodedb_fts/index/analyzer_config.rs
use crate::analyzer::language::stemmer::{LanguageAnalyzer, NoStemAnalyzer};
use crate::analyzer::pipeline::{TextAnalyzer, analyze};
use crate::analyzer::standard::StandardAnalyzer;
use crate::backend::FtsBackend;
use crate::index::FtsIndex;

17impl<B: FtsBackend> FtsIndex<B> {
18 pub fn set_collection_analyzer(
20 &self,
21 tid: u64,
22 collection: &str,
23 analyzer_name: &str,
24 ) -> Result<(), B::Error> {
25 self.backend
26 .write_meta(tid, collection, "analyzer", analyzer_name.as_bytes())
27 }
28
29 pub fn set_collection_language(
31 &self,
32 tid: u64,
33 collection: &str,
34 lang_code: &str,
35 ) -> Result<(), B::Error> {
36 self.backend
37 .write_meta(tid, collection, "language", lang_code.as_bytes())
38 }
39
40 pub fn get_collection_analyzer(
42 &self,
43 tid: u64,
44 collection: &str,
45 ) -> Result<Option<String>, B::Error> {
46 match self.backend.read_meta(tid, collection, "analyzer")? {
47 Some(bytes) => Ok(std::str::from_utf8(&bytes).ok().map(String::from)),
48 None => Ok(None),
49 }
50 }
51
52 pub fn get_collection_language(
54 &self,
55 tid: u64,
56 collection: &str,
57 ) -> Result<Option<String>, B::Error> {
58 match self.backend.read_meta(tid, collection, "language")? {
59 Some(bytes) => Ok(std::str::from_utf8(&bytes).ok().map(String::from)),
60 None => Ok(None),
61 }
62 }
63
64 pub fn analyze_for_collection(
68 &self,
69 tid: u64,
70 collection: &str,
71 text: &str,
72 ) -> Result<Vec<String>, B::Error> {
73 let analyzer_name = self.get_collection_analyzer(tid, collection)?;
74 match analyzer_name.as_deref() {
75 Some(name) => Ok(resolve_analyzer(name).analyze(text)),
76 None => Ok(analyze(text)),
77 }
78 }
79
80 pub fn tokenize_raw_for_collection(
82 &self,
83 tid: u64,
84 collection: &str,
85 text: &str,
86 ) -> Result<Vec<String>, B::Error> {
87 let lang = self.get_collection_language(tid, collection)?;
88 let lang_code = lang.as_deref().unwrap_or("en");
89 let stop_list = crate::analyzer::language::stop_words::stop_words(lang_code);
90 Ok(crate::analyzer::pipeline::tokenize_raw(
91 text, lang_code, stop_list,
92 ))
93 }
94}
95
96fn resolve_analyzer(name: &str) -> Box<dyn TextAnalyzer> {
98 match name {
99 "standard" => Box::new(StandardAnalyzer),
100 _ => {
101 if let Some(a) = LanguageAnalyzer::new(name) {
102 Box::new(a)
103 } else if let Some(a) = NoStemAnalyzer::new(name) {
104 Box::new(a)
105 } else {
106 Box::new(StandardAnalyzer)
107 }
108 }
109 }
110}
111
#[cfg(test)]
mod tests {
    use crate::backend::memory::MemoryBackend;
    use crate::index::FtsIndex;

    // Tenant id shared by every test case.
    const T: u64 = 1;

    // With no configuration, the default pipeline stems and strips English
    // stop words.
    #[test]
    fn default_analyzer() {
        let index = FtsIndex::new(MemoryBackend::new());
        let tokens = index
            .analyze_for_collection(T, "col", "The quick brown fox")
            .unwrap();
        assert!(tokens.iter().any(|t| t == "quick"));
        assert!(tokens.iter().all(|t| t != "the"));
    }

    // A configured German analyzer removes German stop words.
    #[test]
    fn configured_german_analyzer() {
        let index = FtsIndex::new(MemoryBackend::new());
        index.set_collection_analyzer(T, "col", "german").unwrap();

        let tokens = index
            .analyze_for_collection(T, "col", "Die Datenbanken sind schnell")
            .unwrap();
        assert!(tokens.iter().all(|t| t != "die" && t != "sind"));
        assert!(!tokens.is_empty());
    }

    // Hindi has no stemmer; the no-stem analyzer still drops stop words.
    #[test]
    fn configured_hindi_no_stem() {
        let index = FtsIndex::new(MemoryBackend::new());
        index.set_collection_analyzer(T, "col", "hindi").unwrap();

        let tokens = index
            .analyze_for_collection(T, "col", "यह एक परीक्षा है")
            .unwrap();
        assert!(tokens.iter().all(|t| t != "यह" && t != "है"));
    }

    // Analyzer and language settings round-trip through backend meta.
    #[test]
    fn analyzer_persists() {
        let index = FtsIndex::new(MemoryBackend::new());
        index.set_collection_analyzer(T, "col", "french").unwrap();
        index.set_collection_language(T, "col", "fr").unwrap();

        let analyzer = index.get_collection_analyzer(T, "col").unwrap();
        assert_eq!(analyzer.as_deref(), Some("french"));

        let language = index.get_collection_language(T, "col").unwrap();
        assert_eq!(language.as_deref(), Some("fr"));
    }
}