Skip to main content

velesdb_core/collection/search/
text.rs

1//! Text and hybrid search methods for Collection.
2
3use super::resolve;
4use super::OrderedFloat;
5use crate::collection::types::Collection;
6use crate::error::Result;
7use crate::point::{Point, SearchResult};
8use crate::storage::{PayloadStorage, VectorStorage};
9use crate::validation::validate_dimension_match;
10
11impl Collection {
12    /// Performs full-text search using BM25.
13    ///
14    /// # Arguments
15    ///
16    /// * `query` - Text query to search for
17    /// * `k` - Maximum number of results to return
18    ///
19    /// # Returns
20    ///
21    /// Vector of search results sorted by BM25 score (descending).
22    ///
23    /// # Errors
24    ///
25    /// Returns an error if storage retrieval fails.
26    pub fn text_search(&self, query: &str, k: usize) -> Result<Vec<SearchResult>> {
27        let bm25_results = self.text_index.search(query, k);
28
29        let vector_storage = self.vector_storage.read();
30        let payload_storage = self.payload_storage.read();
31
32        Ok(resolve::resolve_id_score_pairs(
33            &bm25_results,
34            bm25_results.len(),
35            &*vector_storage,
36            &*payload_storage,
37        ))
38    }
39
40    /// Performs full-text search with metadata filtering.
41    ///
42    /// # Arguments
43    ///
44    /// * `query` - Text query to search for
45    /// * `k` - Maximum number of results to return
46    /// * `filter` - Metadata filter to apply
47    ///
48    /// # Returns
49    ///
50    /// Vector of search results sorted by BM25 score (descending).
51    ///
52    /// # Errors
53    ///
54    /// Returns an error if storage retrieval fails.
55    pub fn text_search_with_filter(
56        &self,
57        query: &str,
58        k: usize,
59        filter: &crate::filter::Filter,
60    ) -> Result<Vec<SearchResult>> {
61        // Retrieve more candidates for filtering
62        let candidates_k = k.saturating_mul(4).max(k + 10);
63        let bm25_results = self.text_index.search(query, candidates_k);
64
65        let vector_storage = self.vector_storage.read();
66        let payload_storage = self.payload_storage.read();
67
68        Ok(bm25_results
69            .into_iter()
70            .filter_map(|(id, score)| {
71                let vector = vector_storage.retrieve(id).ok().flatten()?;
72                let payload = payload_storage.retrieve(id).ok().flatten();
73
74                // Apply filter - if no payload, filter fails
75                let payload_ref = payload.as_ref()?;
76                if !filter.matches(payload_ref) {
77                    return None;
78                }
79
80                let point = Point {
81                    id,
82                    vector,
83                    payload,
84                    sparse_vectors: None,
85                };
86
87                Some(SearchResult::new(point, score))
88            })
89            .take(k)
90            .collect())
91    }
92
93    /// Performs hybrid search combining vector similarity and full-text search.
94    ///
95    /// Uses Reciprocal Rank Fusion (RRF) to combine results from both searches.
96    ///
97    /// # Arguments
98    ///
99    /// * `vector_query` - Query vector for similarity search
100    /// * `text_query` - Text query for BM25 search
101    /// * `k` - Maximum number of results to return
102    /// * `vector_weight` - Weight for vector results (0.0-1.0, default 0.5)
103    ///
104    /// # Performance (v0.9+)
105    ///
106    /// - **Streaming RRF**: `BinaryHeap` maintains top-k during fusion (O(n log k) vs O(n log n))
107    /// - **Vector-first gating**: Text search limited to 2k candidates for efficiency
108    /// - **`FxHashMap`**: Faster hashing for score aggregation
109    ///
110    /// # Errors
111    ///
112    /// Returns an error if the query vector dimension doesn't match.
113    pub fn hybrid_search(
114        &self,
115        vector_query: &[f32],
116        text_query: &str,
117        k: usize,
118        vector_weight: Option<f32>,
119    ) -> Result<Vec<SearchResult>> {
120        use crate::index::VectorIndex;
121
122        let config = self.config.read();
123        validate_dimension_match(config.dimension, vector_query.len())?;
124        let metric = config.metric;
125        drop(config);
126
127        let weight = vector_weight.unwrap_or(0.5).clamp(0.0, 1.0);
128        let text_weight = 1.0 - weight;
129
130        let overfetch_k = k * 2;
131        let raw_vector_results = self.index.search(vector_query, overfetch_k);
132        let vector_results =
133            self.merge_delta(raw_vector_results, vector_query, overfetch_k, metric);
134        let text_results = self.text_index.search(text_query, k * 2);
135
136        let fused_scores =
137            Self::compute_rrf_scores(&vector_results, &text_results, weight, text_weight);
138
139        let scored_ids = Self::top_k_from_scores(fused_scores, k);
140        Ok(self.resolve_scored_ids(&scored_ids))
141    }
142
143    /// Computes RRF fused scores from vector and text search results.
144    #[allow(clippy::cast_precision_loss)]
145    fn compute_rrf_scores(
146        vector_results: &[crate::scored_result::ScoredResult],
147        text_results: &[(u64, f32)],
148        vector_weight: f32,
149        text_weight: f32,
150    ) -> rustc_hash::FxHashMap<u64, f32> {
151        let mut fused: rustc_hash::FxHashMap<u64, f32> =
152            rustc_hash::FxHashMap::with_capacity_and_hasher(
153                vector_results.len() + text_results.len(),
154                rustc_hash::FxBuildHasher,
155            );
156        for (rank, sr) in vector_results.iter().enumerate() {
157            *fused.entry(sr.id).or_insert(0.0) += vector_weight / (rank as f32 + 60.0);
158        }
159        for (rank, (id, _)) in text_results.iter().enumerate() {
160            *fused.entry(*id).or_insert(0.0) += text_weight / (rank as f32 + 60.0);
161        }
162        fused
163    }
164
165    /// Extracts top-k IDs from fused scores using a streaming min-heap.
166    fn top_k_from_scores(
167        fused_scores: rustc_hash::FxHashMap<u64, f32>,
168        k: usize,
169    ) -> Vec<(u64, f32)> {
170        use std::cmp::Reverse;
171        use std::collections::BinaryHeap;
172
173        let mut heap: BinaryHeap<Reverse<(OrderedFloat, u64)>> = BinaryHeap::with_capacity(k + 1);
174        for (id, score) in fused_scores {
175            heap.push(Reverse((OrderedFloat(score), id)));
176            if heap.len() > k {
177                heap.pop();
178            }
179        }
180        let mut scored: Vec<(u64, f32)> = heap
181            .into_iter()
182            .map(|Reverse((OrderedFloat(s), id))| (id, s))
183            .collect();
184        scored.sort_by(|a, b| b.1.total_cmp(&a.1));
185        scored
186    }
187
188    /// Resolves scored IDs to full `SearchResult` with point data.
189    fn resolve_scored_ids(&self, scored_ids: &[(u64, f32)]) -> Vec<SearchResult> {
190        let vector_storage = self.vector_storage.read();
191        let payload_storage = self.payload_storage.read();
192
193        resolve::resolve_id_score_pairs(
194            scored_ids,
195            scored_ids.len(),
196            &*vector_storage,
197            &*payload_storage,
198        )
199    }
200
201    /// Performs hybrid search (vector + text) with metadata filtering.
202    ///
203    /// Uses Reciprocal Rank Fusion (RRF) to combine results from both searches,
204    /// then applies metadata filter.
205    ///
206    /// # Arguments
207    ///
208    /// * `vector_query` - Query vector for similarity search
209    /// * `text_query` - Text query for BM25 search
210    /// * `k` - Maximum number of results to return
211    /// * `vector_weight` - Weight for vector results (0.0-1.0, default 0.5)
212    /// * `filter` - Metadata filter to apply
213    ///
214    /// # Errors
215    ///
216    /// Returns an error if the query vector dimension doesn't match.
217    pub fn hybrid_search_with_filter(
218        &self,
219        vector_query: &[f32],
220        text_query: &str,
221        k: usize,
222        vector_weight: Option<f32>,
223        filter: &crate::filter::Filter,
224    ) -> Result<Vec<SearchResult>> {
225        use crate::index::VectorIndex;
226
227        let config = self.config.read();
228        validate_dimension_match(config.dimension, vector_query.len())?;
229        let metric = config.metric;
230        drop(config);
231
232        let weight = vector_weight.unwrap_or(0.5).clamp(0.0, 1.0);
233        let text_weight = 1.0 - weight;
234        let candidates_k = k.saturating_mul(4).max(k + 10);
235
236        let raw_vector_results = self.index.search(vector_query, candidates_k);
237        let vector_results =
238            self.merge_delta(raw_vector_results, vector_query, candidates_k, metric);
239        let text_results = self.text_index.search(text_query, candidates_k);
240
241        let fused_scores =
242            Self::compute_rrf_scores(&vector_results, &text_results, weight, text_weight);
243
244        let mut scored_ids: Vec<_> = fused_scores.into_iter().collect();
245        scored_ids.sort_by(|a, b| b.1.total_cmp(&a.1));
246
247        Ok(self.resolve_scored_ids_filtered(&scored_ids, filter, k))
248    }
249
250    /// Resolves scored IDs to `SearchResult` with metadata filter applied.
251    fn resolve_scored_ids_filtered(
252        &self,
253        scored_ids: &[(u64, f32)],
254        filter: &crate::filter::Filter,
255        k: usize,
256    ) -> Vec<SearchResult> {
257        let vector_storage = self.vector_storage.read();
258        let payload_storage = self.payload_storage.read();
259
260        scored_ids
261            .iter()
262            .filter_map(|&(id, score)| {
263                let vector = vector_storage.retrieve(id).ok().flatten()?;
264                let payload = payload_storage.retrieve(id).ok().flatten();
265                let payload_ref = payload.as_ref()?;
266                if !filter.matches(payload_ref) {
267                    return None;
268                }
269                Some(SearchResult::new(
270                    Point {
271                        id,
272                        vector,
273                        payload,
274                        sparse_vectors: None,
275                    },
276                    score,
277                ))
278            })
279            .take(k)
280            .collect()
281    }
282}