use crate::analyzer::synonym::SynonymMap;
use crate::backend::FtsBackend;
use crate::index::writer::FtsIndex;
/// Backend meta collection holding one serialized record per synonym group, keyed by group name.
const SYNONYM_GROUPS_COLLECTION: &str = "_synonym_groups";
/// Reserved subkey inside the collection that stores the per-tenant list of group names.
const INDEX_SUBKEY: &str = "_index";
/// A named group of mutually synonymous terms, persisted per tenant in the
/// backend's meta store as JSON.
#[derive(serde::Serialize, serde::Deserialize, Debug, Clone)]
pub struct SynonymGroupRecord {
    // Unique key within the tenant's synonym collection.
    pub name: String,
    // Raw (un-analyzed) terms; analysis happens when the synonym map is built.
    pub terms: Vec<String>,
    // Creation timestamp — presumably epoch seconds; TODO confirm with writers.
    pub created_at: u64,
}
impl<B: FtsBackend> FtsIndex<B> {
    /// Stores (or overwrites) a synonym group for tenant `tid` and registers
    /// its name in the per-tenant name index so it appears in listings (the
    /// backend meta API offers no key scan, hence the explicit index).
    ///
    /// # Errors
    /// Propagates any backend read/write error.
    pub fn put_synonym_group(&self, tid: u64, record: &SynonymGroupRecord) -> Result<(), B::Error> {
        // Serializing a plain struct of String / Vec<String> / u64 cannot fail;
        // a panic here indicates a bug, never a runtime condition. (Previously
        // a failure was swallowed into empty bytes, which the read path treats
        // as a tombstone — silent data loss.)
        let bytes =
            sonic_rs::to_vec(record).expect("SynonymGroupRecord serialization is infallible");
        self.backend
            .write_meta(tid, SYNONYM_GROUPS_COLLECTION, &record.name, &bytes)?;
        let mut names = self.read_name_index(tid)?;
        if !names.contains(&record.name) {
            names.push(record.name.clone());
            self.write_name_index(tid, &names)?;
        }
        Ok(())
    }

    /// Deletes the group `name` for tenant `tid`, returning `true` if it existed.
    ///
    /// Deletion writes an empty-bytes tombstone because the backend meta API
    /// used here has no explicit delete; the read path maps empty to absent.
    pub fn delete_synonym_group(&self, tid: u64, name: &str) -> Result<bool, B::Error> {
        let existed = self
            .backend
            .read_meta(tid, SYNONYM_GROUPS_COLLECTION, name)?
            .is_some_and(|b| !b.is_empty());
        if existed {
            // Tombstone the record, then drop its name from the index.
            self.backend
                .write_meta(tid, SYNONYM_GROUPS_COLLECTION, name, &[])?;
            let mut names = self.read_name_index(tid)?;
            names.retain(|n| n != name);
            self.write_name_index(tid, &names)?;
        }
        Ok(existed)
    }

    /// Fetches a single synonym group. Missing keys, tombstones (empty bytes),
    /// and undecodable payloads are all reported as `None`.
    pub fn get_synonym_group(
        &self,
        tid: u64,
        name: &str,
    ) -> Result<Option<SynonymGroupRecord>, B::Error> {
        match self
            .backend
            .read_meta(tid, SYNONYM_GROUPS_COLLECTION, name)?
        {
            None => Ok(None),
            Some(bytes) if bytes.is_empty() => Ok(None),
            // Corrupt payloads are skipped rather than surfaced as errors.
            Some(bytes) => Ok(sonic_rs::from_slice::<SynonymGroupRecord>(&bytes).ok()),
        }
    }

    /// Lists every live synonym group for the tenant, in name-index order.
    pub fn list_synonym_groups(&self, tid: u64) -> Result<Vec<SynonymGroupRecord>, B::Error> {
        let names = self.read_name_index(tid)?;
        let mut groups = Vec::with_capacity(names.len());
        for name in &names {
            // A name may linger in the index without a live record (e.g. a
            // partially completed write); skip such holes instead of failing.
            if let Some(rec) = self.get_synonym_group(tid, name)? {
                groups.push(rec);
            }
        }
        Ok(groups)
    }

    /// Builds a bidirectional [`SynonymMap`] from `all_groups`.
    ///
    /// Each term is run through the analyzer pipeline first, so the map keys
    /// are analyzed (e.g. stemmed) tokens. Within a group, every token of term
    /// `i` maps to the tokens of every OTHER term in that group. Groups with
    /// fewer than two terms carry no information and are skipped.
    pub fn build_synonym_map_for_tenant(
        &self,
        _tid: u64,
        all_groups: &[SynonymGroupRecord],
    ) -> SynonymMap {
        let mut map = SynonymMap::new();
        for group in all_groups {
            if group.terms.len() < 2 {
                continue;
            }
            let analyzed: Vec<Vec<String>> = group
                .terms
                .iter()
                .map(|t| crate::analyzer::pipeline::analyze(t))
                .collect();
            for (i, my_tokens) in analyzed.iter().enumerate() {
                // Every token in the group except those belonging to term `i`.
                let other_tokens: Vec<&str> = analyzed
                    .iter()
                    .enumerate()
                    .filter(|(j, _)| *j != i)
                    .flat_map(|(_, ts)| ts.iter().map(|s| s.as_str()))
                    .collect();
                for my_token in my_tokens {
                    map.add(my_token, &other_tokens);
                }
            }
        }
        map
    }

    /// Expands analyzed query `tokens` with the tenant's synonyms.
    ///
    /// Returns the tokens unchanged when the tenant has no groups. NOTE: the
    /// map is rebuilt from storage on every call; hot query paths may want a
    /// cached map in the future.
    pub fn expand_query_with_synonyms(
        &self,
        tid: u64,
        tokens: Vec<String>,
    ) -> Result<Vec<String>, B::Error> {
        let groups = self.list_synonym_groups(tid)?;
        if groups.is_empty() {
            return Ok(tokens);
        }
        let map = self.build_synonym_map_for_tenant(tid, &groups);
        Ok(map.expand(&tokens))
    }

    /// Reads the per-tenant list of group names. Absent, tombstoned, or
    /// undecodable indexes are treated as empty.
    fn read_name_index(&self, tid: u64) -> Result<Vec<String>, B::Error> {
        match self
            .backend
            .read_meta(tid, SYNONYM_GROUPS_COLLECTION, INDEX_SUBKEY)?
        {
            None => Ok(Vec::new()),
            Some(bytes) if bytes.is_empty() => Ok(Vec::new()),
            Some(bytes) => Ok(sonic_rs::from_slice::<Vec<String>>(&bytes).unwrap_or_default()),
        }
    }

    /// Persists the per-tenant list of group names under the reserved subkey.
    fn write_name_index(&self, tid: u64, names: &[String]) -> Result<(), B::Error> {
        // Vec<String> serialization cannot fail; see put_synonym_group.
        let bytes = sonic_rs::to_vec(names).expect("Vec<String> serialization is infallible");
        self.backend
            .write_meta(tid, SYNONYM_GROUPS_COLLECTION, INDEX_SUBKEY, &bytes)
    }
}
#[cfg(test)]
mod tests {
    use super::SynonymGroupRecord;
    use crate::backend::memory::MemoryBackend;
    use crate::index::writer::FtsIndex;

    const T: u64 = 1;

    /// Fresh in-memory index for each test case.
    fn idx() -> FtsIndex<MemoryBackend> {
        FtsIndex::new(MemoryBackend::new())
    }

    /// Builds a group record from borrowed parts.
    fn rec(name: &str, terms: &[&str]) -> SynonymGroupRecord {
        SynonymGroupRecord {
            name: name.to_owned(),
            terms: terms.iter().map(ToString::to_string).collect(),
            created_at: 0,
        }
    }

    /// Collects just the group names currently listed for a tenant.
    fn names_of(index: &FtsIndex<MemoryBackend>, tid: u64) -> Vec<String> {
        index
            .list_synonym_groups(tid)
            .unwrap()
            .into_iter()
            .map(|r| r.name)
            .collect()
    }

    #[test]
    fn put_and_get() {
        let index = idx();
        index
            .put_synonym_group(T, &rec("db_terms", &["database", "db", "datastore"]))
            .unwrap();
        let fetched = index.get_synonym_group(T, "db_terms").unwrap().unwrap();
        assert_eq!(fetched.name, "db_terms");
        assert_eq!(fetched.terms.len(), 3);
    }

    #[test]
    fn delete_removes() {
        let index = idx();
        index.put_synonym_group(T, &rec("g1", &["a", "b"])).unwrap();
        // First delete reports the group was present; the second reports absence.
        assert!(index.delete_synonym_group(T, "g1").unwrap());
        assert!(!index.delete_synonym_group(T, "g1").unwrap());
        assert!(index.get_synonym_group(T, "g1").unwrap().is_none());
    }

    #[test]
    fn list_reflects_puts_and_deletes() {
        let index = idx();
        index.put_synonym_group(T, &rec("g1", &["a", "b"])).unwrap();
        index.put_synonym_group(T, &rec("g2", &["x", "y"])).unwrap();
        assert_eq!(names_of(&index, T).len(), 2);
        index.delete_synonym_group(T, "g1").unwrap();
        assert_eq!(names_of(&index, T), vec!["g2"]);
    }

    #[test]
    fn synonym_map_bidirectional() {
        let index = idx();
        let groups = vec![rec("db_terms", &["db", "database", "datastore"])];
        let map = index.build_synonym_map_for_tenant(T, &groups);
        // Expansion works from either direction; stored tokens are stemmed.
        let from_db = map.expand(&["db".to_string()]);
        assert!(from_db.contains(&"databas".to_string()));
        assert!(from_db.contains(&"datastor".to_string()));
        let from_database = map.expand(&["databas".to_string()]);
        assert!(from_database.contains(&"db".to_string()));
        assert!(from_database.contains(&"datastor".to_string()));
    }

    #[test]
    fn expand_query_with_synonyms_no_groups() {
        let index = idx();
        let query = vec!["hello".to_string(), "world".to_string()];
        let expanded = index.expand_query_with_synonyms(T, query.clone()).unwrap();
        assert_eq!(expanded, query);
    }

    #[test]
    fn expand_query_expands_matching_token() {
        let index = idx();
        index
            .put_synonym_group(T, &rec("db_terms", &["db", "database", "datastore"]))
            .unwrap();
        let expanded = index
            .expand_query_with_synonyms(T, vec!["db".to_string(), "perform".to_string()])
            .unwrap();
        for expected in ["db", "databas", "datastor", "perform"] {
            assert!(expanded.contains(&expected.to_string()));
        }
    }
}