manifoldb_vector/ops/maxsim.rs
1//! MaxSim scoring for ColBERT-style late interaction models.
2//!
3//! MaxSim computes similarity between multi-vector embeddings by taking the
4//! maximum similarity for each query token across all document tokens:
5//!
6//! ```text
7//! MaxSim(Q, D) = sum_{q_i in Q} max_{d_j in D} (q_i · d_j)
8//! ```
9//!
10//! This module provides:
11//! - [`maxsim`] - Compute MaxSim score between two multi-vectors
12//! - [`maxsim_batch`] - Compute MaxSim scores against multiple documents
13//! - [`MaxSimScorer`] - Reusable scorer with pre-computed query data
//! - `MaxSimScan` - Operator for multi-vector similarity search (defined outside this module)
15
16use crate::distance::dot_product;
17use crate::types::MultiVectorEmbedding;
18
19/// Compute the MaxSim score between a query and document multi-vector.
20///
21/// MaxSim sums the maximum dot product for each query token across all document tokens:
22/// ```text
23/// MaxSim(Q, D) = sum_{q_i in Q} max_{d_j in D} (q_i · d_j)
24/// ```
25///
26/// # Arguments
27///
28/// * `query` - Query multi-vector embedding (per-token embeddings)
29/// * `document` - Document multi-vector embedding (per-token embeddings)
30///
31/// # Returns
32///
33/// The MaxSim score (higher is more similar).
34///
35/// # Panics
36///
37/// Panics if query and document have different dimensions.
38///
39/// # Example
40///
41/// ```
42/// use manifoldb_vector::ops::maxsim::maxsim;
43/// use manifoldb_vector::types::MultiVectorEmbedding;
44///
45/// let query = MultiVectorEmbedding::new(vec![
46/// vec![1.0, 0.0, 0.0], // Query token 1
47/// vec![0.0, 1.0, 0.0], // Query token 2
48/// ]).unwrap();
49///
50/// let doc = MultiVectorEmbedding::new(vec![
51/// vec![1.0, 0.0, 0.0], // Doc token 1 (matches query token 1)
52/// vec![0.0, 0.5, 0.5], // Doc token 2
53/// vec![0.0, 0.0, 1.0], // Doc token 3
54/// ]).unwrap();
55///
56/// let score = maxsim(&query, &doc);
57/// // Query token 1: max with doc tokens = 1.0 (with doc token 1)
58/// // Query token 2: max with doc tokens = 0.5 (with doc token 2)
59/// // Total MaxSim = 1.0 + 0.5 = 1.5
60/// assert!((score - 1.5).abs() < 1e-6);
61/// ```
62#[must_use]
63pub fn maxsim(query: &MultiVectorEmbedding, document: &MultiVectorEmbedding) -> f32 {
64 assert_eq!(
65 query.dimension(),
66 document.dimension(),
67 "query and document must have the same dimension"
68 );
69
70 let mut total_score = 0.0_f32;
71
72 // For each query token, find the max dot product with any document token
73 for q in query.iter() {
74 let mut max_sim = f32::NEG_INFINITY;
75
76 for d in document.iter() {
77 let sim = dot_product(q, d);
78 if sim > max_sim {
79 max_sim = sim;
80 }
81 }
82
83 // Handle empty document case
84 if max_sim.is_finite() {
85 total_score += max_sim;
86 }
87 }
88
89 total_score
90}
91
92/// Compute MaxSim scores between a query and multiple documents.
93///
94/// This is more efficient than calling `maxsim` repeatedly because it can
95/// batch operations and reuse query data.
96///
97/// # Returns
98///
99/// A vector of (document_index, score) pairs, sorted by score descending.
100///
101/// # Panics
102///
103/// Panics if any document has a different dimension than the query.
104#[must_use]
105pub fn maxsim_batch(
106 query: &MultiVectorEmbedding,
107 documents: &[MultiVectorEmbedding],
108) -> Vec<(usize, f32)> {
109 let mut scores: Vec<(usize, f32)> =
110 documents.iter().enumerate().map(|(i, doc)| (i, maxsim(query, doc))).collect();
111
112 // Sort by score descending
113 scores.sort_by(|a, b| b.1.partial_cmp(&a.1).unwrap_or(std::cmp::Ordering::Equal));
114
115 scores
116}
117
/// A reusable MaxSim scorer with pre-computed query data.
///
/// Use this when scoring the same query against multiple documents to
/// avoid redundant computation.
///
/// # Example
///
/// ```
/// use manifoldb_vector::ops::maxsim::MaxSimScorer;
/// use manifoldb_vector::types::MultiVectorEmbedding;
///
/// let query = MultiVectorEmbedding::new(vec![
///     vec![1.0, 0.0],
///     vec![0.0, 1.0],
/// ]).unwrap();
///
/// let scorer = MaxSimScorer::new(query);
///
/// let doc1 = MultiVectorEmbedding::new(vec![vec![1.0, 0.0]]).unwrap();
/// let doc2 = MultiVectorEmbedding::new(vec![vec![0.5, 0.5]]).unwrap();
///
/// let score1 = scorer.score(&doc1);
/// let score2 = scorer.score(&doc2);
/// ```
#[derive(Debug, Clone)]
pub struct MaxSimScorer {
    // Owned copy of the query multi-vector; all `score*` calls borrow it.
    // NOTE(review): no derived data is cached beyond the embedding itself,
    // so "pre-computed" currently means only that the query is stored once.
    query: MultiVectorEmbedding,
}
146
147impl MaxSimScorer {
148 /// Create a new MaxSim scorer for the given query.
149 #[must_use]
150 pub fn new(query: MultiVectorEmbedding) -> Self {
151 Self { query }
152 }
153
154 /// Score a document against the query.
155 #[must_use]
156 pub fn score(&self, document: &MultiVectorEmbedding) -> f32 {
157 maxsim(&self.query, document)
158 }
159
160 /// Score multiple documents and return results sorted by score descending.
161 #[must_use]
162 pub fn score_batch(&self, documents: &[MultiVectorEmbedding]) -> Vec<(usize, f32)> {
163 maxsim_batch(&self.query, documents)
164 }
165
166 /// Get the query multi-vector.
167 #[must_use]
168 pub fn query(&self) -> &MultiVectorEmbedding {
169 &self.query
170 }
171
172 /// Get the dimension of the query vectors.
173 #[must_use]
174 pub fn dimension(&self) -> usize {
175 self.query.dimension()
176 }
177}
178
179/// Compute MaxSim with normalized vectors (for cosine-like similarity).
180///
181/// This normalizes both query and document token vectors to unit length
182/// before computing MaxSim, making it equivalent to MaxSim over cosine
183/// similarities.
184///
185/// # Returns
186///
187/// MaxSim score where each token similarity is in [-1, 1].
188#[must_use]
189pub fn maxsim_cosine(query: &MultiVectorEmbedding, document: &MultiVectorEmbedding) -> f32 {
190 maxsim(&query.normalize(), &document.normalize())
191}
192
/// Convert a MaxSim score into a distance (lower is more similar).
///
/// The distance is `num_query_tokens - score`: with unit-normalized vectors
/// each query token contributes at most a dot product of 1.0, so
/// `num_query_tokens` is the maximum attainable score.
#[inline]
#[must_use]
pub fn maxsim_to_distance(score: f32, num_query_tokens: usize) -> f32 {
    num_query_tokens as f32 - score
}
202
/// Convert a distance back into a MaxSim score.
///
/// Inverse of [`maxsim_to_distance`]: returns `num_query_tokens - distance`.
#[inline]
#[must_use]
pub fn distance_to_maxsim(distance: f32, num_query_tokens: usize) -> f32 {
    num_query_tokens as f32 - distance
}
209
#[cfg(test)]
mod tests {
    //! Unit tests covering exact MaxSim scores on small hand-computed
    //! multi-vectors, batch ordering, the reusable scorer, cosine
    //! normalization, and the score/distance conversion round-trip.

    use super::*;

    #[test]
    fn test_maxsim_identical() {
        let query = MultiVectorEmbedding::new(vec![vec![1.0, 0.0], vec![0.0, 1.0]]).unwrap();

        let doc = MultiVectorEmbedding::new(vec![vec![1.0, 0.0], vec![0.0, 1.0]]).unwrap();

        let score = maxsim(&query, &doc);
        // Each query token matches perfectly with one doc token
        // Score = 1.0 + 1.0 = 2.0
        assert!((score - 2.0).abs() < 1e-6);
    }

    #[test]
    fn test_maxsim_partial_match() {
        let query = MultiVectorEmbedding::new(vec![vec![1.0, 0.0], vec![0.0, 1.0]]).unwrap();

        let doc = MultiVectorEmbedding::new(vec![
            vec![1.0, 0.0], // Matches query token 1
            vec![0.5, 0.5], // Partial match
        ])
        .unwrap();

        let score = maxsim(&query, &doc);
        // Query token 1: max(1.0, 0.5) = 1.0
        // Query token 2: max(0.0, 0.5) = 0.5
        // Total = 1.5
        assert!((score - 1.5).abs() < 1e-6);
    }

    #[test]
    fn test_maxsim_orthogonal() {
        let query = MultiVectorEmbedding::new(vec![vec![1.0, 0.0, 0.0]]).unwrap();

        let doc = MultiVectorEmbedding::new(vec![vec![0.0, 1.0, 0.0]]).unwrap();

        let score = maxsim(&query, &doc);
        // Orthogonal vectors have dot product 0
        assert!((score - 0.0).abs() < 1e-6);
    }

    #[test]
    fn test_maxsim_batch() {
        let query = MultiVectorEmbedding::new(vec![vec![1.0, 0.0]]).unwrap();

        let docs = vec![
            MultiVectorEmbedding::new(vec![vec![0.5, 0.0]]).unwrap(),
            MultiVectorEmbedding::new(vec![vec![1.0, 0.0]]).unwrap(),
            MultiVectorEmbedding::new(vec![vec![0.0, 1.0]]).unwrap(),
        ];

        let scores = maxsim_batch(&query, &docs);

        // Should be sorted by score descending
        assert_eq!(scores.len(), 3);
        assert_eq!(scores[0].0, 1); // doc1 has score 1.0
        assert_eq!(scores[1].0, 0); // doc0 has score 0.5
        assert_eq!(scores[2].0, 2); // doc2 has score 0.0
    }

    #[test]
    fn test_scorer() {
        let query = MultiVectorEmbedding::new(vec![vec![1.0, 0.0]]).unwrap();

        let scorer = MaxSimScorer::new(query);

        let doc1 = MultiVectorEmbedding::new(vec![vec![1.0, 0.0]]).unwrap();
        let doc2 = MultiVectorEmbedding::new(vec![vec![0.5, 0.5]]).unwrap();

        assert!((scorer.score(&doc1) - 1.0).abs() < 1e-6);
        assert!((scorer.score(&doc2) - 0.5).abs() < 1e-6);
    }

    #[test]
    fn test_maxsim_cosine() {
        let query = MultiVectorEmbedding::new(vec![vec![2.0, 0.0]]).unwrap();

        let doc = MultiVectorEmbedding::new(vec![vec![3.0, 0.0]]).unwrap();

        // With cosine, vectors are normalized, so [2,0] and [3,0] both become [1,0]
        let score = maxsim_cosine(&query, &doc);
        assert!((score - 1.0).abs() < 1e-6);
    }

    #[test]
    fn test_distance_conversion() {
        let score = 1.5;
        let num_tokens = 2;

        let distance = maxsim_to_distance(score, num_tokens);
        assert!((distance - 0.5).abs() < 1e-6);

        // Converting back must recover the original score exactly (within eps).
        let recovered = distance_to_maxsim(distance, num_tokens);
        assert!((recovered - score).abs() < 1e-6);
    }

    #[test]
    fn test_maxsim_many_tokens() {
        // Simulate a more realistic ColBERT scenario with multiple tokens
        let query = MultiVectorEmbedding::new(vec![
            vec![1.0, 0.0, 0.0, 0.0],
            vec![0.0, 1.0, 0.0, 0.0],
            vec![0.0, 0.0, 1.0, 0.0],
        ])
        .unwrap();

        let doc = MultiVectorEmbedding::new(vec![
            vec![0.9, 0.1, 0.0, 0.0], // Close to query token 1
            vec![0.0, 0.8, 0.2, 0.0], // Close to query token 2
            vec![0.0, 0.0, 0.7, 0.3], // Close to query token 3
            vec![0.5, 0.5, 0.0, 0.0], // Another token
        ])
        .unwrap();

        let score = maxsim(&query, &doc);
        // Query token 1: max over doc = 0.9 (doc token 1)
        // Query token 2: max over doc = 0.8 (doc token 2)
        // Query token 3: max over doc = 0.7 (doc token 3)
        // Total = 0.9 + 0.8 + 0.7 = 2.4
        assert!((score - 2.4).abs() < 1e-6);
    }

    #[test]
    fn test_maxsim_single_doc_token() {
        let query = MultiVectorEmbedding::new(vec![vec![1.0, 0.0], vec![0.0, 1.0], vec![0.5, 0.5]])
            .unwrap();

        // Document with only one token
        let doc = MultiVectorEmbedding::new(vec![vec![0.6, 0.8]]).unwrap();

        let score = maxsim(&query, &doc);
        // All query tokens match against the single doc token
        // Query token 1: 1.0*0.6 + 0.0*0.8 = 0.6
        // Query token 2: 0.0*0.6 + 1.0*0.8 = 0.8
        // Query token 3: 0.5*0.6 + 0.5*0.8 = 0.7
        // Total = 0.6 + 0.8 + 0.7 = 2.1
        assert!((score - 2.1).abs() < 1e-6);
    }
}