Skip to main content

nodedb_fts/index/
synonym_groups.rs

1// SPDX-License-Identifier: Apache-2.0
2
3//! Tenant-scoped synonym group persistence in the FTS backend.
4//!
5//! Synonym groups are stored in backend meta using a sentinel collection
6//! `"_synonym_groups"` with:
7//! - subkey `"_index"` → JSON array of all group names for the tenant (the index)
8//! - subkey `<group_name>` → JSON-serialized `SynonymGroupRecord`
9//!
10//! At query time, all groups for the tenant are loaded, merged into a
11//! `SynonymMap`, and applied to the query token stream.
12//!
13//! **OR-expansion semantics**: for a group `{a, b, c}`, querying any term
14//! expands to all other terms. A query for `db` with group `{db, database,
15//! datastore}` matches documents containing `database` or `datastore` as
16//! well. This is the only sensible default for synonym search.
17
18use crate::analyzer::synonym::SynonymMap;
19use crate::backend::FtsBackend;
20use crate::index::writer::FtsIndex;
21
/// Sentinel collection name for synonym group meta storage.
///
/// Both the per-group records and the name index are stored as subkeys of
/// this collection.
const SYNONYM_GROUPS_COLLECTION: &str = "_synonym_groups";

/// Special meta subkey that holds the JSON array of all group names.
///
/// Group records are keyed by their own name inside the same collection, so
/// `"_index"` is not usable as a group name.
const INDEX_SUBKEY: &str = "_index";
27
28/// Serialized group record: name + terms list.
29#[derive(serde::Serialize, serde::Deserialize, Debug, Clone)]
30pub struct SynonymGroupRecord {
31    pub name: String,
32    pub terms: Vec<String>,
33    pub created_at: u64,
34}
35
36impl<B: FtsBackend> FtsIndex<B> {
37    /// Persist a synonym group. Overwrites any existing group with the same name.
38    pub fn put_synonym_group(&self, tid: u64, record: &SynonymGroupRecord) -> Result<(), B::Error> {
39        // Write the record itself.
40        let bytes = sonic_rs::to_vec(record).unwrap_or_default();
41        self.backend
42            .write_meta(tid, SYNONYM_GROUPS_COLLECTION, &record.name, &bytes)?;
43
44        // Update the name index.
45        let mut names = self.read_name_index(tid)?;
46        if !names.contains(&record.name) {
47            names.push(record.name.clone());
48            self.write_name_index(tid, &names)?;
49        }
50        Ok(())
51    }
52
53    /// Delete a synonym group. Returns `true` if it existed.
54    pub fn delete_synonym_group(&self, tid: u64, name: &str) -> Result<bool, B::Error> {
55        // An empty byte slice is a tombstone written by a prior delete.
56        // Only treat a non-empty (non-tombstoned) record as "existing".
57        let existed = self
58            .backend
59            .read_meta(tid, SYNONYM_GROUPS_COLLECTION, name)?
60            .is_some_and(|b| !b.is_empty());
61
62        if existed {
63            // Tombstone the record.
64            self.backend
65                .write_meta(tid, SYNONYM_GROUPS_COLLECTION, name, &[])?;
66            // Remove from the name index.
67            let mut names = self.read_name_index(tid)?;
68            names.retain(|n| n != name);
69            self.write_name_index(tid, &names)?;
70        }
71        Ok(existed)
72    }
73
74    /// Read a single synonym group by name. Returns `None` if not found or tombstoned.
75    pub fn get_synonym_group(
76        &self,
77        tid: u64,
78        name: &str,
79    ) -> Result<Option<SynonymGroupRecord>, B::Error> {
80        match self
81            .backend
82            .read_meta(tid, SYNONYM_GROUPS_COLLECTION, name)?
83        {
84            None => Ok(None),
85            Some(bytes) if bytes.is_empty() => Ok(None),
86            Some(bytes) => Ok(sonic_rs::from_slice::<SynonymGroupRecord>(&bytes).ok()),
87        }
88    }
89
90    /// List all synonym group records for a tenant.
91    pub fn list_synonym_groups(&self, tid: u64) -> Result<Vec<SynonymGroupRecord>, B::Error> {
92        let names = self.read_name_index(tid)?;
93        // no-governor: list_synonym_groups returns Result<_, B::Error> and cannot propagate MemError; synonym group counts are bounded by tenant config (typically <1000 entries of ~200 bytes each)
94        let mut groups = Vec::with_capacity(names.len());
95        for name in &names {
96            if let Some(rec) = self.get_synonym_group(tid, name)? {
97                groups.push(rec);
98            }
99        }
100        Ok(groups)
101    }
102
103    /// Build an in-memory `SynonymMap` from a slice of synonym group records.
104    ///
105    /// Each term in every group maps to all other terms in that group
106    /// (bidirectional OR-expansion). Terms are analyzed with the default
107    /// analyzer so synonym keys match the stemmed tokens produced at query
108    /// time by `search_with_mode`.
109    pub fn build_synonym_map_for_tenant(
110        &self,
111        _tid: u64,
112        all_groups: &[SynonymGroupRecord],
113    ) -> SynonymMap {
114        let mut map = SynonymMap::new();
115        for group in all_groups {
116            if group.terms.len() < 2 {
117                continue;
118            }
119            // Analyze each term through the same pipeline used at query time
120            // so that synonym keys align with stemmed query tokens.
121            let analyzed: Vec<Vec<String>> = group
122                .terms
123                .iter()
124                .map(|t| crate::analyzer::pipeline::analyze(t))
125                .collect();
126
127            for (i, my_tokens) in analyzed.iter().enumerate() {
128                let other_tokens: Vec<&str> = analyzed
129                    .iter()
130                    .enumerate()
131                    .filter(|(j, _)| *j != i)
132                    .flat_map(|(_, ts)| ts.iter().map(|s| s.as_str()))
133                    .collect();
134                for my_token in my_tokens {
135                    map.add(my_token, &other_tokens);
136                }
137            }
138        }
139        map
140    }
141
142    /// Load all synonym groups for a tenant and build the expansion map.
143    ///
144    /// Called at FTS query time inside `search_with_mode` to expand query
145    /// tokens before BM25 scoring.
146    pub fn expand_query_with_synonyms(
147        &self,
148        tid: u64,
149        tokens: Vec<String>,
150    ) -> Result<Vec<String>, B::Error> {
151        let groups = self.list_synonym_groups(tid)?;
152        eprintln!(
153            "[synonym_debug] tid={tid} tokens={tokens:?} groups_count={}",
154            groups.len()
155        );
156        if groups.is_empty() {
157            return Ok(tokens);
158        }
159        let map = self.build_synonym_map_for_tenant(tid, &groups);
160        let expanded = map.expand(&tokens);
161        eprintln!("[synonym_debug] expanded={expanded:?}");
162        Ok(expanded)
163    }
164
165    // ── internal helpers ──────────────────────────────────────────────────────
166
167    fn read_name_index(&self, tid: u64) -> Result<Vec<String>, B::Error> {
168        match self
169            .backend
170            .read_meta(tid, SYNONYM_GROUPS_COLLECTION, INDEX_SUBKEY)?
171        {
172            None => Ok(Vec::new()),
173            Some(bytes) if bytes.is_empty() => Ok(Vec::new()),
174            Some(bytes) => Ok(sonic_rs::from_slice::<Vec<String>>(&bytes).unwrap_or_default()),
175        }
176    }
177
178    fn write_name_index(&self, tid: u64, names: &[String]) -> Result<(), B::Error> {
179        let bytes = sonic_rs::to_vec(names).unwrap_or_default();
180        self.backend
181            .write_meta(tid, SYNONYM_GROUPS_COLLECTION, INDEX_SUBKEY, &bytes)
182    }
183}
184
#[cfg(test)]
mod tests {
    use super::SynonymGroupRecord;
    use crate::backend::memory::MemoryBackend;
    use crate::index::writer::FtsIndex;

    /// Tenant id shared by every test in this module.
    const T: u64 = 1;

    /// Fresh index on top of an in-memory backend.
    fn idx() -> FtsIndex<MemoryBackend> {
        let backend = MemoryBackend::new();
        FtsIndex::new(backend)
    }

    /// Build a record with the given name and terms and a zero creation stamp.
    fn rec(name: &str, terms: &[&str]) -> SynonymGroupRecord {
        let mut owned_terms = Vec::with_capacity(terms.len());
        for term in terms {
            owned_terms.push(term.to_string());
        }
        SynonymGroupRecord {
            name: name.to_string(),
            terms: owned_terms,
            created_at: 0,
        }
    }

    /// Names of all groups currently listed for tenant `T`.
    fn listed_names(index: &FtsIndex<MemoryBackend>) -> Vec<String> {
        let mut names = Vec::new();
        for group in index.list_synonym_groups(T).unwrap() {
            names.push(group.name);
        }
        names
    }

    #[test]
    fn put_and_get() {
        let index = idx();
        let record = rec("db_terms", &["database", "db", "datastore"]);
        index.put_synonym_group(T, &record).unwrap();

        let fetched = index.get_synonym_group(T, "db_terms").unwrap().unwrap();
        assert_eq!(fetched.name, "db_terms");
        assert_eq!(fetched.terms.len(), 3);
    }

    #[test]
    fn delete_removes() {
        let index = idx();
        index.put_synonym_group(T, &rec("g1", &["a", "b"])).unwrap();

        // First delete hits the record, second one only sees the tombstone.
        let first = index.delete_synonym_group(T, "g1").unwrap();
        assert!(first);
        let second = index.delete_synonym_group(T, "g1").unwrap();
        assert!(!second);

        let after = index.get_synonym_group(T, "g1").unwrap();
        assert!(after.is_none());
    }

    #[test]
    fn list_reflects_puts_and_deletes() {
        let index = idx();
        for record in [rec("g1", &["a", "b"]), rec("g2", &["x", "y"])] {
            index.put_synonym_group(T, &record).unwrap();
        }
        assert_eq!(listed_names(&index).len(), 2);

        index.delete_synonym_group(T, "g1").unwrap();
        assert_eq!(listed_names(&index), vec!["g2"]);
    }

    #[test]
    fn synonym_map_bidirectional() {
        let index = idx();
        let groups = vec![rec("db_terms", &["db", "database", "datastore"])];
        let map = index.build_synonym_map_for_tenant(T, &groups);

        // Terms are analyzed/stemmed before building the map:
        // "database" → "databas", "datastore" → "datastor", "db" → "db"
        let from_db = map.expand(&["db".to_string()]);
        for stem in ["databas", "datastor"] {
            assert!(from_db.contains(&stem.to_string()));
        }

        let from_database = map.expand(&["databas".to_string()]);
        for stem in ["db", "datastor"] {
            assert!(from_database.contains(&stem.to_string()));
        }
    }

    #[test]
    fn expand_query_with_synonyms_no_groups() {
        let index = idx();
        let query = vec!["hello".to_string(), "world".to_string()];
        let result = index
            .expand_query_with_synonyms(T, query.clone())
            .unwrap();
        assert_eq!(result, query);
    }

    #[test]
    fn expand_query_expands_matching_token() {
        let index = idx();
        index
            .put_synonym_group(T, &rec("db_terms", &["db", "database", "datastore"]))
            .unwrap();

        // The input is already analyzed ("databas", not "database") because
        // search_with_mode analyzes first and expands afterwards; the map
        // stores the analyzed stems as well.
        let query = vec!["db".to_string(), "perform".to_string()];
        let result = index.expand_query_with_synonyms(T, query).unwrap();
        for expected in ["db", "databas", "datastor", "perform"] {
            assert!(result.contains(&expected.to_string()));
        }
    }
}