Skip to main content

nodedb_fts/index/
synonym_groups.rs

1// SPDX-License-Identifier: Apache-2.0
2
3//! Tenant-scoped synonym group persistence in the FTS backend.
4//!
5//! Synonym groups are stored in backend meta using a sentinel collection
6//! `"_synonym_groups"` with:
7//! - subkey `"_index"` → JSON array of all group names for the tenant (the index)
8//! - subkey `<group_name>` → JSON-serialized `SynonymGroupRecord`
9//!
10//! At query time, all groups for the tenant are loaded, merged into a
11//! `SynonymMap`, and applied to the query token stream.
12//!
13//! **OR-expansion semantics**: for a group `{a, b, c}`, querying any term
14//! expands to all other terms. A query for `db` with group `{db, database,
15//! datastore}` matches documents containing `database` or `datastore` as
16//! well. This is the only sensible default for synonym search.
17
18use crate::analyzer::synonym::SynonymMap;
19use crate::backend::FtsBackend;
20use crate::index::writer::FtsIndex;
21
/// Name of the sentinel collection under which all synonym-group meta is stored.
const SYNONYM_GROUPS_COLLECTION: &str = "_synonym_groups";

/// Meta subkey, inside the sentinel collection, whose value is the JSON array
/// of all group names. It shares a key space with the per-group subkeys, which
/// are the group names themselves.
const INDEX_SUBKEY: &str = "_index";
27
28/// Serialized group record: name + terms list.
29#[derive(serde::Serialize, serde::Deserialize, Debug, Clone)]
30pub struct SynonymGroupRecord {
31    pub name: String,
32    pub terms: Vec<String>,
33    pub created_at: u64,
34}
35
36impl<B: FtsBackend> FtsIndex<B> {
37    /// Persist a synonym group. Overwrites any existing group with the same name.
38    pub fn put_synonym_group(&self, tid: u64, record: &SynonymGroupRecord) -> Result<(), B::Error> {
39        // Write the record itself.
40        let bytes = sonic_rs::to_vec(record).unwrap_or_default();
41        self.backend
42            .write_meta(tid, SYNONYM_GROUPS_COLLECTION, &record.name, &bytes)?;
43
44        // Update the name index.
45        let mut names = self.read_name_index(tid)?;
46        if !names.contains(&record.name) {
47            names.push(record.name.clone());
48            self.write_name_index(tid, &names)?;
49        }
50        Ok(())
51    }
52
53    /// Delete a synonym group. Returns `true` if it existed.
54    pub fn delete_synonym_group(&self, tid: u64, name: &str) -> Result<bool, B::Error> {
55        // An empty byte slice is a tombstone written by a prior delete.
56        // Only treat a non-empty (non-tombstoned) record as "existing".
57        let existed = self
58            .backend
59            .read_meta(tid, SYNONYM_GROUPS_COLLECTION, name)?
60            .is_some_and(|b| !b.is_empty());
61
62        if existed {
63            // Tombstone the record.
64            self.backend
65                .write_meta(tid, SYNONYM_GROUPS_COLLECTION, name, &[])?;
66            // Remove from the name index.
67            let mut names = self.read_name_index(tid)?;
68            names.retain(|n| n != name);
69            self.write_name_index(tid, &names)?;
70        }
71        Ok(existed)
72    }
73
74    /// Read a single synonym group by name. Returns `None` if not found or tombstoned.
75    pub fn get_synonym_group(
76        &self,
77        tid: u64,
78        name: &str,
79    ) -> Result<Option<SynonymGroupRecord>, B::Error> {
80        match self
81            .backend
82            .read_meta(tid, SYNONYM_GROUPS_COLLECTION, name)?
83        {
84            None => Ok(None),
85            Some(bytes) if bytes.is_empty() => Ok(None),
86            Some(bytes) => Ok(sonic_rs::from_slice::<SynonymGroupRecord>(&bytes).ok()),
87        }
88    }
89
90    /// List all synonym group records for a tenant.
91    pub fn list_synonym_groups(&self, tid: u64) -> Result<Vec<SynonymGroupRecord>, B::Error> {
92        let names = self.read_name_index(tid)?;
93        let mut groups = Vec::with_capacity(names.len());
94        for name in &names {
95            if let Some(rec) = self.get_synonym_group(tid, name)? {
96                groups.push(rec);
97            }
98        }
99        Ok(groups)
100    }
101
102    /// Build an in-memory `SynonymMap` from a slice of synonym group records.
103    ///
104    /// Each term in every group maps to all other terms in that group
105    /// (bidirectional OR-expansion). Terms are analyzed with the default
106    /// analyzer so synonym keys match the stemmed tokens produced at query
107    /// time by `search_with_mode`.
108    pub fn build_synonym_map_for_tenant(
109        &self,
110        _tid: u64,
111        all_groups: &[SynonymGroupRecord],
112    ) -> SynonymMap {
113        let mut map = SynonymMap::new();
114        for group in all_groups {
115            if group.terms.len() < 2 {
116                continue;
117            }
118            // Analyze each term through the same pipeline used at query time
119            // so that synonym keys align with stemmed query tokens.
120            let analyzed: Vec<Vec<String>> = group
121                .terms
122                .iter()
123                .map(|t| crate::analyzer::pipeline::analyze(t))
124                .collect();
125
126            for (i, my_tokens) in analyzed.iter().enumerate() {
127                let other_tokens: Vec<&str> = analyzed
128                    .iter()
129                    .enumerate()
130                    .filter(|(j, _)| *j != i)
131                    .flat_map(|(_, ts)| ts.iter().map(|s| s.as_str()))
132                    .collect();
133                for my_token in my_tokens {
134                    map.add(my_token, &other_tokens);
135                }
136            }
137        }
138        map
139    }
140
141    /// Load all synonym groups for a tenant and build the expansion map.
142    ///
143    /// Called at FTS query time inside `search_with_mode` to expand query
144    /// tokens before BM25 scoring.
145    pub fn expand_query_with_synonyms(
146        &self,
147        tid: u64,
148        tokens: Vec<String>,
149    ) -> Result<Vec<String>, B::Error> {
150        let groups = self.list_synonym_groups(tid)?;
151        if groups.is_empty() {
152            return Ok(tokens);
153        }
154        let map = self.build_synonym_map_for_tenant(tid, &groups);
155        let expanded = map.expand(&tokens);
156        Ok(expanded)
157    }
158
159    // ── internal helpers ──────────────────────────────────────────────────────
160
161    fn read_name_index(&self, tid: u64) -> Result<Vec<String>, B::Error> {
162        match self
163            .backend
164            .read_meta(tid, SYNONYM_GROUPS_COLLECTION, INDEX_SUBKEY)?
165        {
166            None => Ok(Vec::new()),
167            Some(bytes) if bytes.is_empty() => Ok(Vec::new()),
168            Some(bytes) => Ok(sonic_rs::from_slice::<Vec<String>>(&bytes).unwrap_or_default()),
169        }
170    }
171
172    fn write_name_index(&self, tid: u64, names: &[String]) -> Result<(), B::Error> {
173        let bytes = sonic_rs::to_vec(names).unwrap_or_default();
174        self.backend
175            .write_meta(tid, SYNONYM_GROUPS_COLLECTION, INDEX_SUBKEY, &bytes)
176    }
177}
178
#[cfg(test)]
mod tests {
    use crate::backend::memory::MemoryBackend;
    use crate::index::writer::FtsIndex;

    use super::SynonymGroupRecord;

    /// Tenant id used by every test.
    const T: u64 = 1;

    /// Fresh index over an empty in-memory backend.
    fn idx() -> FtsIndex<MemoryBackend> {
        FtsIndex::new(MemoryBackend::new())
    }

    /// Build a record with the given name and terms and a zero `created_at`.
    fn rec(name: &str, terms: &[&str]) -> SynonymGroupRecord {
        SynonymGroupRecord {
            name: name.to_string(),
            terms: terms.iter().map(|s| s.to_string()).collect(),
            created_at: 0,
        }
    }

    #[test]
    fn put_and_get() {
        let i = idx();
        i.put_synonym_group(T, &rec("db_terms", &["database", "db", "datastore"]))
            .unwrap();
        let got = i.get_synonym_group(T, "db_terms").unwrap().unwrap();
        assert_eq!(got.name, "db_terms");
        assert_eq!(got.terms.len(), 3);
    }

    #[test]
    fn get_missing_returns_none() {
        let i = idx();
        assert!(i.get_synonym_group(T, "nope").unwrap().is_none());
    }

    #[test]
    fn put_overwrites_existing_group() {
        let i = idx();
        i.put_synonym_group(T, &rec("g1", &["a", "b"])).unwrap();
        i.put_synonym_group(T, &rec("g1", &["x", "y", "z"])).unwrap();

        // The name must be listed once, and the latest terms must win.
        let groups = i.list_synonym_groups(T).unwrap();
        assert_eq!(groups.len(), 1);
        assert_eq!(groups[0].terms, vec!["x", "y", "z"]);
    }

    #[test]
    fn delete_removes() {
        let i = idx();
        i.put_synonym_group(T, &rec("g1", &["a", "b"])).unwrap();
        assert!(i.delete_synonym_group(T, "g1").unwrap());
        assert!(!i.delete_synonym_group(T, "g1").unwrap());
        assert!(i.get_synonym_group(T, "g1").unwrap().is_none());
    }

    #[test]
    fn delete_missing_returns_false() {
        let i = idx();
        assert!(!i.delete_synonym_group(T, "nope").unwrap());
        assert!(i.list_synonym_groups(T).unwrap().is_empty());
    }

    #[test]
    fn put_after_delete_restores_group() {
        let i = idx();
        i.put_synonym_group(T, &rec("g1", &["a", "b"])).unwrap();
        assert!(i.delete_synonym_group(T, "g1").unwrap());

        // Re-creating a tombstoned group must make it visible again.
        i.put_synonym_group(T, &rec("g1", &["c", "d"])).unwrap();
        let got = i.get_synonym_group(T, "g1").unwrap().unwrap();
        assert_eq!(got.terms, vec!["c", "d"]);
        assert_eq!(i.list_synonym_groups(T).unwrap().len(), 1);
    }

    #[test]
    fn list_reflects_puts_and_deletes() {
        let i = idx();
        i.put_synonym_group(T, &rec("g1", &["a", "b"])).unwrap();
        i.put_synonym_group(T, &rec("g2", &["x", "y"])).unwrap();
        let names: Vec<String> = i
            .list_synonym_groups(T)
            .unwrap()
            .into_iter()
            .map(|r| r.name)
            .collect();
        assert_eq!(names.len(), 2);

        i.delete_synonym_group(T, "g1").unwrap();
        let names2: Vec<String> = i
            .list_synonym_groups(T)
            .unwrap()
            .into_iter()
            .map(|r| r.name)
            .collect();
        assert_eq!(names2, vec!["g2"]);
    }

    #[test]
    fn synonym_map_bidirectional() {
        let i = idx();
        let recs = vec![rec("db_terms", &["db", "database", "datastore"])];
        let map = i.build_synonym_map_for_tenant(T, &recs);

        // Terms are analyzed/stemmed before building the map:
        // "database" → "databas", "datastore" → "datastor", "db" → "db"
        let expanded = map.expand(&["db".to_string()]);
        assert!(expanded.contains(&"databas".to_string()));
        assert!(expanded.contains(&"datastor".to_string()));

        let expanded2 = map.expand(&["databas".to_string()]);
        assert!(expanded2.contains(&"db".to_string()));
        assert!(expanded2.contains(&"datastor".to_string()));
    }

    #[test]
    fn expand_query_with_synonyms_no_groups() {
        let i = idx();
        let tokens = vec!["hello".to_string(), "world".to_string()];
        let expanded = i.expand_query_with_synonyms(T, tokens.clone()).unwrap();
        assert_eq!(expanded, tokens);
    }

    #[test]
    fn expand_query_expands_matching_token() {
        let i = idx();
        i.put_synonym_group(T, &rec("db_terms", &["db", "database", "datastore"]))
            .unwrap();
        // expand_query_with_synonyms receives already-analyzed tokens ("databas" not "database")
        // because search_with_mode analyzes first, then expands.
        // The synonym map stores analyzed stems: "database" → "databas".
        let tokens = vec!["db".to_string(), "perform".to_string()];
        let expanded = i.expand_query_with_synonyms(T, tokens).unwrap();
        assert!(expanded.contains(&"db".to_string()));
        assert!(expanded.contains(&"databas".to_string()));
        assert!(expanded.contains(&"datastor".to_string()));
        assert!(expanded.contains(&"perform".to_string()));
    }
}
282}