Skip to main content

cognis_rag/vectorstore/
mod.rs

1//! Vector store trait + SearchResult + Filter.
2
3use std::collections::HashMap;
4
5use async_trait::async_trait;
6use cognis_core::schemars::{self, JsonSchema};
7use serde::{Deserialize, Serialize};
8
9use cognis_core::Result;
10
11mod in_memory;
12pub use in_memory::InMemoryVectorStore;
13
14#[cfg(feature = "vectorstore-chroma")]
15pub mod chroma;
16#[cfg(feature = "vectorstore-chroma")]
17pub use chroma::{ChromaBuilder, ChromaProvider};
18
19#[cfg(feature = "vectorstore-qdrant")]
20pub mod qdrant;
21#[cfg(feature = "vectorstore-qdrant")]
22pub use qdrant::{QdrantBuilder, QdrantProvider};
23
24#[cfg(feature = "vectorstore-pinecone")]
25pub mod pinecone;
26#[cfg(feature = "vectorstore-pinecone")]
27pub use pinecone::{PineconeBuilder, PineconeProvider};
28
29#[cfg(feature = "vectorstore-weaviate")]
30pub mod weaviate;
31#[cfg(feature = "vectorstore-weaviate")]
32pub use weaviate::{WeaviateBuilder, WeaviateProvider};
33
34#[cfg(feature = "vectorstore-faiss")]
35pub mod faiss;
36#[cfg(feature = "vectorstore-faiss")]
37pub use faiss::{
38    FaissConfig, FaissIndex, FaissIndexType, FaissMetric, FaissVectorStore, FlatIndex, HNSWIndex,
39    IVFFlatIndex,
40};
41
/// Metadata filter applied to similarity search.
///
/// All conditions on a filter combine with AND semantics. An empty filter
/// matches everything (the same as no filter at all).
///
/// Every field is a map keyed by metadata key. Each field carries
/// `#[serde(default, skip_serializing_if = "HashMap::is_empty")]`, so a field
/// absent from the input deserializes to an empty map and an empty map is
/// omitted from the serialized output.
#[derive(Debug, Default, Clone, Serialize, Deserialize, JsonSchema)]
pub struct Filter {
    /// Metadata key/value pairs every result must match exactly.
    /// Compared with serde_json `==` after both sides are coerced to
    /// `serde_json::Value`. A key that is missing from the metadata fails
    /// the match.
    #[serde(default, skip_serializing_if = "HashMap::is_empty")]
    pub equals: HashMap<String, serde_json::Value>,
    /// Metadata keys whose values must appear in the listed set.
    /// Equivalent to `metadata[key] in values`; a missing key, or a value
    /// equal to none of the listed ones, fails the match.
    #[serde(default, skip_serializing_if = "HashMap::is_empty")]
    pub r#in: HashMap<String, Vec<serde_json::Value>>,
    /// Numeric metadata keys with `>=` lower bound.
    /// The metadata value is read through `as_f64`; a missing key or a value
    /// that does not yield a number fails the match.
    #[serde(default, skip_serializing_if = "HashMap::is_empty")]
    pub gte: HashMap<String, f64>,
    /// Numeric metadata keys with `<=` upper bound.
    /// The metadata value is read through `as_f64`; a missing key or a value
    /// that does not yield a number fails the match.
    #[serde(default, skip_serializing_if = "HashMap::is_empty")]
    pub lte: HashMap<String, f64>,
}
64
65impl Filter {
66    /// Empty filter (matches everything).
67    pub fn new() -> Self {
68        Self::default()
69    }
70
71    /// Builder: require `metadata[key] == value`.
72    pub fn equals(mut self, key: impl Into<String>, value: impl Into<serde_json::Value>) -> Self {
73        self.equals.insert(key.into(), value.into());
74        self
75    }
76
77    /// Builder: require `metadata[key]` to be one of `values`.
78    pub fn one_of<I, V>(mut self, key: impl Into<String>, values: I) -> Self
79    where
80        I: IntoIterator<Item = V>,
81        V: Into<serde_json::Value>,
82    {
83        self.r#in
84            .insert(key.into(), values.into_iter().map(Into::into).collect());
85        self
86    }
87
88    /// Builder: require `metadata[key] >= n` (numeric).
89    pub fn gte(mut self, key: impl Into<String>, n: f64) -> Self {
90        self.gte.insert(key.into(), n);
91        self
92    }
93
94    /// Builder: require `metadata[key] <= n` (numeric).
95    pub fn lte(mut self, key: impl Into<String>, n: f64) -> Self {
96        self.lte.insert(key.into(), n);
97        self
98    }
99
100    /// True if no conditions are set.
101    pub fn is_empty(&self) -> bool {
102        self.equals.is_empty() && self.r#in.is_empty() && self.gte.is_empty() && self.lte.is_empty()
103    }
104
105    /// Whether `metadata` satisfies every condition on this filter.
106    pub fn matches(&self, metadata: &HashMap<String, serde_json::Value>) -> bool {
107        for (k, v) in &self.equals {
108            match metadata.get(k) {
109                Some(actual) if actual == v => {}
110                _ => return false,
111            }
112        }
113        for (k, allowed) in &self.r#in {
114            match metadata.get(k) {
115                Some(actual) if allowed.iter().any(|v| v == actual) => {}
116                _ => return false,
117            }
118        }
119        for (k, lo) in &self.gte {
120            match metadata.get(k).and_then(|v| v.as_f64()) {
121                Some(n) if n >= *lo => {}
122                _ => return false,
123            }
124        }
125        for (k, hi) in &self.lte {
126            match metadata.get(k).and_then(|v| v.as_f64()) {
127                Some(n) if n <= *hi => {}
128                _ => return false,
129            }
130        }
131        true
132    }
133}
134
/// One document returned by a similarity search.
///
/// This is the element type of the vectors returned by the `VectorStore`
/// search methods.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct SearchResult {
    /// Document ID assigned by the store.
    pub id: String,
    /// Original text content.
    pub text: String,
    /// Similarity score — higher = more similar (per the store's
    /// configured Distance metric).
    pub score: f32,
    /// User-supplied metadata stored with the document. This is the map
    /// that `Filter::matches` is evaluated against when results are
    /// post-filtered.
    pub metadata: HashMap<String, serde_json::Value>,
}
148
149/// A vector store: holds documents + their embeddings, supports
150/// add + similarity search + delete.
151#[async_trait]
152pub trait VectorStore: Send + Sync {
153    /// Add documents (text + optional metadata). The store is responsible
154    /// for embedding them. Returns the IDs assigned.
155    async fn add_texts(
156        &mut self,
157        texts: Vec<String>,
158        metadata: Option<Vec<HashMap<String, serde_json::Value>>>,
159    ) -> Result<Vec<String>>;
160
161    /// Add pre-embedded vectors directly. Useful when the caller has
162    /// already paid the embedding cost.
163    async fn add_vectors(
164        &mut self,
165        vectors: Vec<Vec<f32>>,
166        texts: Vec<String>,
167        metadata: Option<Vec<HashMap<String, serde_json::Value>>>,
168    ) -> Result<Vec<String>>;
169
170    /// Similarity search: embed the query, return top-k matches.
171    async fn similarity_search(&self, query: &str, k: usize) -> Result<Vec<SearchResult>>;
172
173    /// Similarity search by pre-computed query vector.
174    async fn similarity_search_by_vector(
175        &self,
176        query_vector: Vec<f32>,
177        k: usize,
178    ) -> Result<Vec<SearchResult>>;
179
180    /// Similarity search with a metadata filter.
181    ///
182    /// Default impl runs `similarity_search` with `k * 4` candidates and
183    /// post-filters in the caller. Stores with native filter support
184    /// (Qdrant, Pinecone, ...) override for efficiency.
185    async fn similarity_search_with_filter(
186        &self,
187        query: &str,
188        k: usize,
189        filter: &Filter,
190    ) -> Result<Vec<SearchResult>> {
191        if filter.is_empty() {
192            return self.similarity_search(query, k).await;
193        }
194        let candidates = self.similarity_search(query, k.saturating_mul(4)).await?;
195        Ok(candidates
196            .into_iter()
197            .filter(|r| filter.matches(&r.metadata))
198            .take(k)
199            .collect())
200    }
201
202    /// Delete documents by ID. IDs not found are silently ignored.
203    async fn delete(&mut self, ids: Vec<String>) -> Result<()>;
204
205    /// Number of documents currently stored.
206    fn len(&self) -> usize;
207
208    /// True if no documents are stored.
209    fn is_empty(&self) -> bool {
210        self.len() == 0
211    }
212}