Skip to main content

synaptic_vectorstores/
multi_vector.rs

1use std::collections::HashMap;
2use std::sync::Arc;
3
4use async_trait::async_trait;
5use synaptic_core::SynapticError;
6use synaptic_embeddings::Embeddings;
7use synaptic_retrieval::{Document, Retriever};
8use tokio::sync::RwLock;
9
10use crate::VectorStore;
11
12/// A retriever that maps multiple child vectors back to parent documents.
13///
14/// Each parent document can have multiple sub-documents (e.g., summaries,
15/// smaller chunks) stored in the vector store. Retrieval finds relevant
16/// sub-documents, then returns the original parent documents.
17pub struct MultiVectorRetriever<S: VectorStore> {
18    vectorstore: Arc<S>,
19    embeddings: Arc<dyn Embeddings>,
20    /// Parent document store, keyed by document ID.
21    docstore: Arc<RwLock<HashMap<String, Document>>>,
22    /// Metadata key linking child documents to their parent.
23    id_key: String,
24    k: usize,
25}
26
27impl<S: VectorStore + 'static> MultiVectorRetriever<S> {
28    /// Create a new `MultiVectorRetriever`.
29    ///
30    /// - `vectorstore`: the vector store to search for child documents.
31    /// - `embeddings`: the embeddings provider for embedding child documents.
32    /// - `k`: the number of child documents to retrieve for parent lookup.
33    pub fn new(vectorstore: Arc<S>, embeddings: Arc<dyn Embeddings>, k: usize) -> Self {
34        Self {
35            vectorstore,
36            embeddings,
37            docstore: Arc::new(RwLock::new(HashMap::new())),
38            id_key: "parent_id".to_string(),
39            k,
40        }
41    }
42
43    /// Set a custom metadata key linking child documents to their parent ID.
44    /// Defaults to `"parent_id"`.
45    pub fn with_id_key(mut self, key: impl Into<String>) -> Self {
46        self.id_key = key.into();
47        self
48    }
49
50    /// Add parent documents and their associated child documents.
51    ///
52    /// Parents are stored in the internal docstore. Children are embedded and
53    /// added to the vector store. Each child document must have the `id_key`
54    /// metadata field set to the parent document's ID.
55    pub async fn add_documents(
56        &self,
57        parent_docs: Vec<Document>,
58        child_docs: Vec<Document>,
59    ) -> Result<(), SynapticError> {
60        // Store parents in the docstore
61        {
62            let mut store = self.docstore.write().await;
63            for doc in parent_docs {
64                store.insert(doc.id.clone(), doc);
65            }
66        }
67
68        // Add children to the vector store
69        self.vectorstore
70            .add_documents(child_docs, self.embeddings.as_ref())
71            .await?;
72
73        Ok(())
74    }
75}
76
77#[async_trait]
78impl<S: VectorStore + 'static> Retriever for MultiVectorRetriever<S> {
79    async fn retrieve(&self, query: &str, top_k: usize) -> Result<Vec<Document>, SynapticError> {
80        let k = if top_k > 0 { top_k } else { self.k };
81
82        // Search vectorstore for child documents
83        let children = self
84            .vectorstore
85            .similarity_search(query, k, self.embeddings.as_ref())
86            .await?;
87
88        // Look up parent documents from the docstore, deduplicating
89        let docstore = self.docstore.read().await;
90        let mut seen = std::collections::HashSet::new();
91        let mut parents = Vec::new();
92
93        for child in &children {
94            if let Some(parent_id_value) = child.metadata.get(&self.id_key) {
95                if let Some(parent_id) = parent_id_value.as_str() {
96                    if seen.insert(parent_id.to_string()) {
97                        if let Some(parent) = docstore.get(parent_id) {
98                            parents.push(parent.clone());
99                        }
100                    }
101                }
102            }
103        }
104
105        Ok(parents)
106    }
107}