// manifoldb_vector/ops/maxsim.rs

1//! MaxSim scoring for ColBERT-style late interaction models.
2//!
3//! MaxSim computes similarity between multi-vector embeddings by taking the
4//! maximum similarity for each query token across all document tokens:
5//!
6//! ```text
7//! MaxSim(Q, D) = sum_{q_i in Q} max_{d_j in D} (q_i · d_j)
8//! ```
9//!
10//! This module provides:
11//! - [`maxsim`] - Compute MaxSim score between two multi-vectors
12//! - [`maxsim_batch`] - Compute MaxSim scores against multiple documents
13//! - [`MaxSimScorer`] - Reusable scorer with pre-computed query data
14//! - [`MaxSimScan`] - Operator for multi-vector similarity search
15
16use crate::distance::dot_product;
17use crate::types::MultiVectorEmbedding;
18
19/// Compute the MaxSim score between a query and document multi-vector.
20///
21/// MaxSim sums the maximum dot product for each query token across all document tokens:
22/// ```text
23/// MaxSim(Q, D) = sum_{q_i in Q} max_{d_j in D} (q_i · d_j)
24/// ```
25///
26/// # Arguments
27///
28/// * `query` - Query multi-vector embedding (per-token embeddings)
29/// * `document` - Document multi-vector embedding (per-token embeddings)
30///
31/// # Returns
32///
33/// The MaxSim score (higher is more similar).
34///
35/// # Panics
36///
37/// Panics if query and document have different dimensions.
38///
39/// # Example
40///
41/// ```
42/// use manifoldb_vector::ops::maxsim::maxsim;
43/// use manifoldb_vector::types::MultiVectorEmbedding;
44///
45/// let query = MultiVectorEmbedding::new(vec![
46///     vec![1.0, 0.0, 0.0],  // Query token 1
47///     vec![0.0, 1.0, 0.0],  // Query token 2
48/// ]).unwrap();
49///
50/// let doc = MultiVectorEmbedding::new(vec![
51///     vec![1.0, 0.0, 0.0],  // Doc token 1 (matches query token 1)
52///     vec![0.0, 0.5, 0.5],  // Doc token 2
53///     vec![0.0, 0.0, 1.0],  // Doc token 3
54/// ]).unwrap();
55///
56/// let score = maxsim(&query, &doc);
57/// // Query token 1: max with doc tokens = 1.0 (with doc token 1)
58/// // Query token 2: max with doc tokens = 0.5 (with doc token 2)
59/// // Total MaxSim = 1.0 + 0.5 = 1.5
60/// assert!((score - 1.5).abs() < 1e-6);
61/// ```
62#[must_use]
63pub fn maxsim(query: &MultiVectorEmbedding, document: &MultiVectorEmbedding) -> f32 {
64    assert_eq!(
65        query.dimension(),
66        document.dimension(),
67        "query and document must have the same dimension"
68    );
69
70    let mut total_score = 0.0_f32;
71
72    // For each query token, find the max dot product with any document token
73    for q in query.iter() {
74        let mut max_sim = f32::NEG_INFINITY;
75
76        for d in document.iter() {
77            let sim = dot_product(q, d);
78            if sim > max_sim {
79                max_sim = sim;
80            }
81        }
82
83        // Handle empty document case
84        if max_sim.is_finite() {
85            total_score += max_sim;
86        }
87    }
88
89    total_score
90}
91
92/// Compute MaxSim scores between a query and multiple documents.
93///
94/// This is more efficient than calling `maxsim` repeatedly because it can
95/// batch operations and reuse query data.
96///
97/// # Returns
98///
99/// A vector of (document_index, score) pairs, sorted by score descending.
100///
101/// # Panics
102///
103/// Panics if any document has a different dimension than the query.
104#[must_use]
105pub fn maxsim_batch(
106    query: &MultiVectorEmbedding,
107    documents: &[MultiVectorEmbedding],
108) -> Vec<(usize, f32)> {
109    let mut scores: Vec<(usize, f32)> =
110        documents.iter().enumerate().map(|(i, doc)| (i, maxsim(query, doc))).collect();
111
112    // Sort by score descending
113    scores.sort_by(|a, b| b.1.partial_cmp(&a.1).unwrap_or(std::cmp::Ordering::Equal));
114
115    scores
116}
117
118/// A reusable MaxSim scorer with pre-computed query data.
119///
120/// Use this when scoring the same query against multiple documents to
121/// avoid redundant computation.
122///
123/// # Example
124///
125/// ```
126/// use manifoldb_vector::ops::maxsim::MaxSimScorer;
127/// use manifoldb_vector::types::MultiVectorEmbedding;
128///
129/// let query = MultiVectorEmbedding::new(vec![
130///     vec![1.0, 0.0],
131///     vec![0.0, 1.0],
132/// ]).unwrap();
133///
134/// let scorer = MaxSimScorer::new(query);
135///
136/// let doc1 = MultiVectorEmbedding::new(vec![vec![1.0, 0.0]]).unwrap();
137/// let doc2 = MultiVectorEmbedding::new(vec![vec![0.5, 0.5]]).unwrap();
138///
139/// let score1 = scorer.score(&doc1);
140/// let score2 = scorer.score(&doc2);
141/// ```
142#[derive(Debug, Clone)]
143pub struct MaxSimScorer {
144    query: MultiVectorEmbedding,
145}
146
147impl MaxSimScorer {
148    /// Create a new MaxSim scorer for the given query.
149    #[must_use]
150    pub fn new(query: MultiVectorEmbedding) -> Self {
151        Self { query }
152    }
153
154    /// Score a document against the query.
155    #[must_use]
156    pub fn score(&self, document: &MultiVectorEmbedding) -> f32 {
157        maxsim(&self.query, document)
158    }
159
160    /// Score multiple documents and return results sorted by score descending.
161    #[must_use]
162    pub fn score_batch(&self, documents: &[MultiVectorEmbedding]) -> Vec<(usize, f32)> {
163        maxsim_batch(&self.query, documents)
164    }
165
166    /// Get the query multi-vector.
167    #[must_use]
168    pub fn query(&self) -> &MultiVectorEmbedding {
169        &self.query
170    }
171
172    /// Get the dimension of the query vectors.
173    #[must_use]
174    pub fn dimension(&self) -> usize {
175        self.query.dimension()
176    }
177}
178
179/// Compute MaxSim with normalized vectors (for cosine-like similarity).
180///
181/// This normalizes both query and document token vectors to unit length
182/// before computing MaxSim, making it equivalent to MaxSim over cosine
183/// similarities.
184///
185/// # Returns
186///
187/// MaxSim score where each token similarity is in [-1, 1].
188#[must_use]
189pub fn maxsim_cosine(query: &MultiVectorEmbedding, document: &MultiVectorEmbedding) -> f32 {
190    maxsim(&query.normalize(), &document.normalize())
191}
192
/// Convert a MaxSim score into a distance (lower is more similar).
///
/// With unit-normalized vectors each per-token dot product is at most 1.0,
/// so the maximum achievable score is the number of query tokens; the
/// distance is that ceiling minus the actual score.
#[inline]
#[must_use]
pub fn maxsim_to_distance(score: f32, num_query_tokens: usize) -> f32 {
    let ceiling = num_query_tokens as f32;
    ceiling - score
}
202
/// Convert a distance back into a MaxSim score.
///
/// Inverse of [`maxsim_to_distance`]: the score is the maximum possible
/// score (one per query token, assuming normalized vectors) minus the
/// distance.
#[inline]
#[must_use]
pub fn distance_to_maxsim(distance: f32, num_query_tokens: usize) -> f32 {
    let ceiling = num_query_tokens as f32;
    ceiling - distance
}
209
#[cfg(test)]
mod tests {
    use super::*;

    // All tests use small hand-computed embeddings so the expected scores
    // can be stated exactly (up to 1e-6 float tolerance).

    #[test]
    fn test_maxsim_identical() {
        let query = MultiVectorEmbedding::new(vec![vec![1.0, 0.0], vec![0.0, 1.0]]).unwrap();

        let doc = MultiVectorEmbedding::new(vec![vec![1.0, 0.0], vec![0.0, 1.0]]).unwrap();

        let score = maxsim(&query, &doc);
        // Each query token matches perfectly with one doc token
        // Score = 1.0 + 1.0 = 2.0
        assert!((score - 2.0).abs() < 1e-6);
    }

    #[test]
    fn test_maxsim_partial_match() {
        let query = MultiVectorEmbedding::new(vec![vec![1.0, 0.0], vec![0.0, 1.0]]).unwrap();

        let doc = MultiVectorEmbedding::new(vec![
            vec![1.0, 0.0], // Matches query token 1
            vec![0.5, 0.5], // Partial match
        ])
        .unwrap();

        let score = maxsim(&query, &doc);
        // Query token 1: max(1.0, 0.5) = 1.0
        // Query token 2: max(0.0, 0.5) = 0.5
        // Total = 1.5
        assert!((score - 1.5).abs() < 1e-6);
    }

    #[test]
    fn test_maxsim_orthogonal() {
        let query = MultiVectorEmbedding::new(vec![vec![1.0, 0.0, 0.0]]).unwrap();

        let doc = MultiVectorEmbedding::new(vec![vec![0.0, 1.0, 0.0]]).unwrap();

        let score = maxsim(&query, &doc);
        // Orthogonal vectors have dot product 0
        assert!((score - 0.0).abs() < 1e-6);
    }

    #[test]
    fn test_maxsim_batch() {
        let query = MultiVectorEmbedding::new(vec![vec![1.0, 0.0]]).unwrap();

        let docs = vec![
            MultiVectorEmbedding::new(vec![vec![0.5, 0.0]]).unwrap(),
            MultiVectorEmbedding::new(vec![vec![1.0, 0.0]]).unwrap(),
            MultiVectorEmbedding::new(vec![vec![0.0, 1.0]]).unwrap(),
        ];

        let scores = maxsim_batch(&query, &docs);

        // Should be sorted by score descending
        // (each pair is (original_document_index, score))
        assert_eq!(scores.len(), 3);
        assert_eq!(scores[0].0, 1); // doc1 has score 1.0
        assert_eq!(scores[1].0, 0); // doc0 has score 0.5
        assert_eq!(scores[2].0, 2); // doc2 has score 0.0
    }

    #[test]
    fn test_scorer() {
        let query = MultiVectorEmbedding::new(vec![vec![1.0, 0.0]]).unwrap();

        let scorer = MaxSimScorer::new(query);

        let doc1 = MultiVectorEmbedding::new(vec![vec![1.0, 0.0]]).unwrap();
        let doc2 = MultiVectorEmbedding::new(vec![vec![0.5, 0.5]]).unwrap();

        // Scorer must agree with the free `maxsim` function.
        assert!((scorer.score(&doc1) - 1.0).abs() < 1e-6);
        assert!((scorer.score(&doc2) - 0.5).abs() < 1e-6);
    }

    #[test]
    fn test_maxsim_cosine() {
        let query = MultiVectorEmbedding::new(vec![vec![2.0, 0.0]]).unwrap();

        let doc = MultiVectorEmbedding::new(vec![vec![3.0, 0.0]]).unwrap();

        // With cosine, vectors are normalized, so [2,0] and [3,0] both become [1,0]
        let score = maxsim_cosine(&query, &doc);
        assert!((score - 1.0).abs() < 1e-6);
    }

    #[test]
    fn test_distance_conversion() {
        let score = 1.5;
        let num_tokens = 2;

        let distance = maxsim_to_distance(score, num_tokens);
        assert!((distance - 0.5).abs() < 1e-6);

        // Round-trip: distance_to_maxsim must invert maxsim_to_distance.
        let recovered = distance_to_maxsim(distance, num_tokens);
        assert!((recovered - score).abs() < 1e-6);
    }

    #[test]
    fn test_maxsim_many_tokens() {
        // Simulate a more realistic ColBERT scenario with multiple tokens
        let query = MultiVectorEmbedding::new(vec![
            vec![1.0, 0.0, 0.0, 0.0],
            vec![0.0, 1.0, 0.0, 0.0],
            vec![0.0, 0.0, 1.0, 0.0],
        ])
        .unwrap();

        let doc = MultiVectorEmbedding::new(vec![
            vec![0.9, 0.1, 0.0, 0.0], // Close to query token 1
            vec![0.0, 0.8, 0.2, 0.0], // Close to query token 2
            vec![0.0, 0.0, 0.7, 0.3], // Close to query token 3
            vec![0.5, 0.5, 0.0, 0.0], // Another token
        ])
        .unwrap();

        let score = maxsim(&query, &doc);
        // Query token 1: max over doc = 0.9 (doc token 1)
        // Query token 2: max over doc = 0.8 (doc token 2)
        // Query token 3: max over doc = 0.7 (doc token 3)
        // Total = 0.9 + 0.8 + 0.7 = 2.4
        assert!((score - 2.4).abs() < 1e-6);
    }

    #[test]
    fn test_maxsim_single_doc_token() {
        let query = MultiVectorEmbedding::new(vec![vec![1.0, 0.0], vec![0.0, 1.0], vec![0.5, 0.5]])
            .unwrap();

        // Document with only one token
        let doc = MultiVectorEmbedding::new(vec![vec![0.6, 0.8]]).unwrap();

        let score = maxsim(&query, &doc);
        // All query tokens match against the single doc token
        // Query token 1: 1.0*0.6 + 0.0*0.8 = 0.6
        // Query token 2: 0.0*0.6 + 1.0*0.8 = 0.8
        // Query token 3: 0.5*0.6 + 0.5*0.8 = 0.7
        // Total = 0.6 + 0.8 + 0.7 = 2.1
        assert!((score - 2.1).abs() < 1e-6);
    }
}