ruvector-postgres 2.0.5

High-performance PostgreSQL vector database extension v2 - pgvector drop-in replacement with 230+ SQL functions, SIMD acceleration, Flash Attention, GNN layers, hybrid search, multi-tenancy, self-healing, and self-learning capabilities
//! Fusion algorithms for combining vector and keyword search results
//!
//! Provides:
//! - RRF (Reciprocal Rank Fusion) - default, robust
//! - Linear blend - simple weighted combination
//! - Learned fusion - query-adaptive weights

use serde::{Deserialize, Serialize};
use std::collections::HashMap;

/// Document ID type (matches with database row IDs)
pub type DocId = i64;

/// Default RRF constant (k = 60 is the value recommended in the original
/// RRF paper; larger k flattens the rank contribution curve)
pub const DEFAULT_RRF_K: usize = 60;

/// Default alpha for linear fusion (0.5 = equal weight between the
/// vector and keyword branches)
pub const DEFAULT_ALPHA: f32 = 0.5;

/// Fusion method selection
///
/// Serialized in lowercase (e.g. `"rrf"`) via serde; the `FromStr` impl
/// additionally accepts a few human-friendly aliases.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
#[serde(rename_all = "lowercase")]
pub enum FusionMethod {
    /// Reciprocal Rank Fusion (default) - rank-based, robust to differing score scales
    Rrf,
    /// Linear weighted combination of min-max-normalized branch scores
    Linear,
    /// Learned/adaptive fusion based on query features
    Learned,
}

impl Default for FusionMethod {
    fn default() -> Self {
        FusionMethod::Rrf
    }
}

impl std::str::FromStr for FusionMethod {
    type Err = String;

    fn from_str(s: &str) -> Result<Self, Self::Err> {
        match s.to_lowercase().as_str() {
            "rrf" | "reciprocal" | "reciprocal_rank" => Ok(FusionMethod::Rrf),
            "linear" | "blend" | "weighted" => Ok(FusionMethod::Linear),
            "learned" | "adaptive" | "auto" => Ok(FusionMethod::Learned),
            _ => Err(format!("Unknown fusion method: {}", s)),
        }
    }
}

/// Fusion configuration
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct FusionConfig {
    /// Fusion method to use
    pub method: FusionMethod,
    /// RRF constant (typically 60); used by `Rrf` and by the `Learned` fallback
    pub rrf_k: usize,
    /// Alpha for linear fusion (0 = all keyword, 1 = all vector); used by `Linear`
    pub alpha: f32,
}

impl Default for FusionConfig {
    fn default() -> Self {
        Self {
            method: FusionMethod::Rrf,
            rrf_k: DEFAULT_RRF_K,
            alpha: DEFAULT_ALPHA,
        }
    }
}

/// Search result from a single branch (vector or keyword)
///
/// NOTE(review): not referenced elsewhere in this module — presumably part
/// of the public API consumed by callers outside this file; confirm before
/// removing.
#[derive(Debug, Clone)]
pub struct BranchResult {
    /// Document ID
    pub doc_id: DocId,
    /// Original score from the branch
    pub score: f32,
    /// Rank in the result set (0-indexed)
    pub rank: usize,
}

/// Fused search result combining both branches
#[derive(Debug, Clone)]
pub struct FusedResult {
    /// Document ID
    pub doc_id: DocId,
    /// Final fused score; its scale depends on the fusion method
    /// (RRF sums of 1/(k+rank) vs. normalized [0, 1] blends)
    pub hybrid_score: f32,
    /// Original vector score (if the document appeared in the vector branch);
    /// this is the raw distance, lower is better
    pub vector_score: Option<f32>,
    /// Original keyword score (if the document appeared in the keyword branch);
    /// this is the raw BM25 score, higher is better
    pub keyword_score: Option<f32>,
}

/// Reciprocal Rank Fusion (RRF)
///
/// Each document scores `1 / (k + rank + 1)` per branch it appears in
/// (rank is 0-indexed, so the best hit contributes `1 / (k + 1)`), and the
/// contributions are summed across branches.
///
/// Properties:
/// - Robust to different score scales
/// - No need for score calibration
/// - Works well with partial overlap
///
/// Reference: "Reciprocal Rank Fusion outperforms Condorcet and individual Rank Learning Methods"
pub fn rrf_fusion(
    vector_results: &[(DocId, f32)],  // (id, distance - lower is better)
    keyword_results: &[(DocId, f32)], // (id, BM25 score - higher is better)
    k: usize,
    limit: usize,
) -> Vec<FusedResult> {
    // Per-document accumulator: (fused score, raw distance, raw BM25 score).
    let mut acc: HashMap<DocId, (f32, Option<f32>, Option<f32>)> = HashMap::new();
    let rrf = |rank: usize| 1.0 / (k + rank + 1) as f32;

    // Both inputs arrive sorted best-first, so the slice position is the rank.
    for (rank, &(doc_id, distance)) in vector_results.iter().enumerate() {
        let slot = acc.entry(doc_id).or_insert((0.0, None, None));
        slot.0 += rrf(rank);
        slot.1 = Some(distance);
    }

    for (rank, &(doc_id, bm25)) in keyword_results.iter().enumerate() {
        let slot = acc.entry(doc_id).or_insert((0.0, None, None));
        slot.0 += rrf(rank);
        slot.2 = Some(bm25);
    }

    let mut fused: Vec<FusedResult> = acc
        .into_iter()
        .map(
            |(doc_id, (hybrid_score, vector_score, keyword_score))| FusedResult {
                doc_id,
                hybrid_score,
                vector_score,
                keyword_score,
            },
        )
        .collect();

    // Descending by fused score; NaN-safe (incomparable pairs treated as equal).
    fused.sort_by(|a, b| {
        b.hybrid_score
            .partial_cmp(&a.hybrid_score)
            .unwrap_or(std::cmp::Ordering::Equal)
    });
    fused.truncate(limit);
    fused
}

/// Normalize vector distances to similarity scores [0, 1]
///
/// Converts distance (lower = better) to similarity (higher = better) via
/// min-max scaling over the batch; output order matches input order.
fn normalize_to_similarity(results: &[(DocId, f32)]) -> Vec<(DocId, f32)> {
    if results.is_empty() {
        return Vec::new();
    }

    let mut lo = f32::MAX;
    let mut hi = f32::MIN;
    for &(_, d) in results {
        lo = lo.min(d);
        hi = hi.max(d);
    }

    // Floor the range so a constant batch doesn't divide by zero.
    let span = (hi - lo).max(1e-6);

    results
        .iter()
        .map(|&(id, d)| (id, 1.0 - (d - lo) / span))
        .collect()
}

/// Min-max normalize scores to [0, 1]; output order matches input order.
fn min_max_normalize(results: &[(DocId, f32)]) -> Vec<(DocId, f32)> {
    if results.is_empty() {
        return Vec::new();
    }

    let (lo, hi) = results
        .iter()
        .map(|&(_, s)| s)
        .fold((f32::MAX, f32::MIN), |(lo, hi), s| (lo.min(s), hi.max(s)));

    // Floor the range so a constant batch doesn't divide by zero.
    let span = (hi - lo).max(1e-6);

    results
        .iter()
        .map(|&(id, s)| (id, (s - lo) / span))
        .collect()
}

/// Linear fusion with alpha blending
///
/// score = alpha * vector_similarity + (1 - alpha) * keyword_score
///
/// Both branches are min-max normalized to [0, 1] before blending, so raw
/// distance/BM25 scales do not matter. Assumes doc ids are unique within
/// each branch (standard for top-k result lists).
pub fn linear_fusion(
    vector_results: &[(DocId, f32)],  // (id, distance - lower is better)
    keyword_results: &[(DocId, f32)], // (id, BM25 score - higher is better)
    alpha: f32,
    limit: usize,
) -> Vec<FusedResult> {
    // Per-document accumulator: (fused score, raw distance, raw BM25 score).
    let mut combined: HashMap<DocId, (f32, Option<f32>, Option<f32>)> = HashMap::new();

    // The normalizers preserve input order, so zipping pairs each normalized
    // value with its original entry. This replaces the previous per-document
    // linear `find` to recover the raw score, turning the merge from O(n^2)
    // into O(n).
    for (&(doc_id, distance), (_, sim)) in vector_results
        .iter()
        .zip(normalize_to_similarity(vector_results))
    {
        let entry = combined.entry(doc_id).or_insert((0.0, None, None));
        entry.0 += alpha * sim;
        entry.1 = Some(distance);
    }

    for (&(doc_id, bm25), (_, norm)) in keyword_results
        .iter()
        .zip(min_max_normalize(keyword_results))
    {
        let entry = combined.entry(doc_id).or_insert((0.0, None, None));
        entry.0 += (1.0 - alpha) * norm;
        entry.2 = Some(bm25);
    }

    let mut results: Vec<FusedResult> = combined
        .into_iter()
        .map(
            |(doc_id, (hybrid_score, vector_score, keyword_score))| FusedResult {
                doc_id,
                hybrid_score,
                vector_score,
                keyword_score,
            },
        )
        .collect();

    // Descending by fused score; NaN-safe (incomparable pairs treated as equal).
    results.sort_by(|a, b| {
        b.hybrid_score
            .partial_cmp(&a.hybrid_score)
            .unwrap_or(std::cmp::Ordering::Equal)
    });
    results.truncate(limit);
    results
}

/// Query features for learned fusion
///
/// All features are cheap to compute at query time; see `learned_fusion`
/// for how they are derived.
#[derive(Debug, Clone)]
pub struct QueryFeatures {
    /// L2 norm of query embedding
    pub embedding_norm: f32,
    /// Number of query terms
    pub term_count: usize,
    /// Average IDF of query terms (pre-computed by the caller; presumably
    /// from corpus statistics — confirm against call sites)
    pub avg_term_idf: f32,
    /// Whether query appears to need exact matching (codes/IDs, very short queries)
    pub has_exact_match: bool,
    /// Classified query type
    pub query_type: QueryType,
}

/// Query type classification for learned fusion
///
/// Assigned heuristically by `classify_query_type` from indicator words.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum QueryType {
    /// Navigational query (looking for specific entity)
    Navigational,
    /// Informational query (seeking information)
    Informational,
    /// Transactional query (action-oriented)
    Transactional,
    /// Unknown/mixed
    Unknown,
}

/// Simple fusion model for learned/adaptive fusion
///
/// A hand-tuned linear model over `QueryFeatures`; each weight shifts the
/// predicted alpha (higher alpha = more vector weight, lower = more keyword).
/// In production, this would be a trained ML model (e.g., GNN, logistic regression)
pub struct FusionModel {
    /// Default alpha when model can't make prediction
    pub default_alpha: f32,
    /// Weight for embedding norm
    pub norm_weight: f32,
    /// Weight for term count
    pub term_weight: f32,
    /// Weight for avg IDF
    pub idf_weight: f32,
    /// Bias for exact match queries (favor keyword)
    pub exact_match_bias: f32,
}

impl Default for FusionModel {
    fn default() -> Self {
        Self {
            default_alpha: 0.5,
            norm_weight: 0.1,
            term_weight: -0.05,     // More terms -> slight keyword preference
            idf_weight: 0.15,       // Rare terms -> vector preference
            exact_match_bias: -0.2, // Exact match -> keyword preference
        }
    }
}

impl FusionModel {
    /// Predict optimal alpha for a query.
    ///
    /// Starts from `default_alpha` and adds one clamped contribution per
    /// feature; the result is clamped to [0, 1] (0 = all keyword, 1 = all vector).
    pub fn predict_alpha(&self, features: &QueryFeatures) -> f32 {
        // Distinctive embeddings (norm above 1) shift toward vector search.
        let norm_term = self.norm_weight * (features.embedding_norm - 1.0).clamp(-1.0, 1.0);
        // Deviation from a "typical" 3-term query.
        let count_term = self.term_weight * (features.term_count as f32 - 3.0).clamp(-3.0, 3.0);
        // Rare terms (high IDF) shift toward vector search.
        let idf_term = self.idf_weight * (features.avg_term_idf - 3.0).clamp(-3.0, 3.0);
        // Exact-match intent biases toward keyword search.
        let exact_term = if features.has_exact_match {
            self.exact_match_bias
        } else {
            0.0
        };
        // Fixed per-category adjustment.
        let type_term = match features.query_type {
            QueryType::Navigational => -0.15, // Favor keyword
            QueryType::Informational => 0.1,  // Favor vector
            QueryType::Transactional => -0.05,
            QueryType::Unknown => 0.0,
        };

        (self.default_alpha + norm_term + count_term + idf_term + exact_term + type_term)
            .clamp(0.0, 1.0)
    }
}

/// Learned fusion using query characteristics
///
/// Extracts lightweight features from the query, asks the model for a blend
/// weight, then delegates to linear fusion with that weight.
pub fn learned_fusion(
    query_embedding: &[f32],
    query_terms: &[String],
    vector_results: &[(DocId, f32)],
    keyword_results: &[(DocId, f32)],
    model: &FusionModel,
    avg_term_idf: f32, // Pre-computed average IDF
    limit: usize,
) -> Vec<FusedResult> {
    let features = QueryFeatures {
        embedding_norm: l2_norm(query_embedding),
        term_count: query_terms.len(),
        avg_term_idf,
        has_exact_match: detect_exact_match_intent(query_terms),
        query_type: classify_query_type(query_terms),
    };

    let alpha = model.predict_alpha(&features);
    linear_fusion(vector_results, keyword_results, alpha, limit)
}

/// Euclidean (L2) norm of a vector; 0.0 for an empty slice.
fn l2_norm(v: &[f32]) -> f32 {
    let sum_sq = v.iter().fold(0.0f32, |acc, &x| acc + x * x);
    sum_sq.sqrt()
}

/// Detect if query seems to need exact matching.
///
/// Heuristics (quoted phrases are handled upstream):
/// - Very short queries (1-2 terms) read as lookups of a specific thing.
/// - Terms mixing digits and a plausible length look like product codes,
///   error codes, or IDs.
fn detect_exact_match_intent(terms: &[String]) -> bool {
    if terms.len() <= 2 {
        return true;
    }

    terms
        .iter()
        .any(|t| (3..=20).contains(&t.len()) && t.chars().any(|c| c.is_numeric()))
}

/// Classify query type based on indicator words in the terms.
///
/// Checks navigational, then transactional, then informational indicators;
/// first match wins, otherwise `Unknown`.
fn classify_query_type(terms: &[String]) -> QueryType {
    let lowered: Vec<String> = terms.iter().map(|t| t.to_lowercase()).collect();
    let mentions_any =
        |indicators: &[&str]| lowered.iter().any(|t| indicators.contains(&t.as_str()));

    if mentions_any(&["website", "login", "home", "official", "download"]) {
        QueryType::Navigational
    } else if mentions_any(&["buy", "purchase", "order", "price", "cheap", "best", "deal"]) {
        QueryType::Transactional
    } else if mentions_any(&[
        "how", "what", "why", "when", "where", "guide", "tutorial", "explain",
    ]) {
        QueryType::Informational
    } else {
        QueryType::Unknown
    }
}

/// Fuse results using the specified method.
///
/// `Learned` falls back to RRF here because this entry point has no access
/// to the query features a fusion model needs; call `learned_fusion`
/// directly when those are available.
pub fn fuse_results(
    vector_results: &[(DocId, f32)],
    keyword_results: &[(DocId, f32)],
    config: &FusionConfig,
    limit: usize,
) -> Vec<FusedResult> {
    match config.method {
        FusionMethod::Linear => {
            linear_fusion(vector_results, keyword_results, config.alpha, limit)
        }
        FusionMethod::Rrf | FusionMethod::Learned => {
            rrf_fusion(vector_results, keyword_results, config.rrf_k, limit)
        }
    }
}

#[cfg(test)]
mod tests {
    //! Unit tests for the fusion algorithms. Fixtures are small hand-built
    //! result lists with deliberate overlap between the two branches
    //! (docs 1, 2, 3 appear in both; 4, 5 vector-only; 6, 7 keyword-only).
    use super::*;

    // Vector branch fixture: sorted best-first by distance (lower = better).
    fn sample_vector_results() -> Vec<(DocId, f32)> {
        vec![
            (1, 0.1), // Best (lowest distance)
            (2, 0.2),
            (3, 0.3),
            (4, 0.5),
            (5, 0.8),
        ]
    }

    // Keyword branch fixture: sorted best-first by BM25 (higher = better).
    fn sample_keyword_results() -> Vec<(DocId, f32)> {
        vec![
            (3, 8.5), // Best (highest BM25)
            (1, 7.2),
            (6, 5.0),
            (2, 3.5),
            (7, 2.0),
        ]
    }

    // Docs present in both branches should accumulate two RRF contributions
    // and therefore rank near the top.
    #[test]
    fn test_rrf_fusion() {
        let vector = sample_vector_results();
        let keyword = sample_keyword_results();

        let results = rrf_fusion(&vector, &keyword, 60, 5);

        assert!(!results.is_empty());
        // Doc 1 and 3 should be near top (they appear in both)
        let top_ids: Vec<DocId> = results.iter().map(|r| r.doc_id).collect();
        assert!(top_ids.contains(&1) || top_ids.contains(&3));
    }

    // At the alpha extremes, one branch should fully determine the ranking.
    #[test]
    fn test_linear_fusion_alpha_1() {
        let vector = sample_vector_results();
        let keyword = sample_keyword_results();

        // Alpha = 1.0 means only vector
        let results = linear_fusion(&vector, &keyword, 1.0, 5);

        assert!(!results.is_empty());
        // With alpha=1, vector-only docs should dominate
        let first = &results[0];
        assert!(first.vector_score.is_some());
    }

    #[test]
    fn test_linear_fusion_alpha_0() {
        let vector = sample_vector_results();
        let keyword = sample_keyword_results();

        // Alpha = 0.0 means only keyword
        let results = linear_fusion(&vector, &keyword, 0.0, 5);

        assert!(!results.is_empty());
        // With alpha=0, keyword-only docs can appear
        let first = &results[0];
        assert!(first.keyword_score.is_some());
    }

    // Min-max normalization should map the lowest distance to similarity ~1.0
    // and preserve input order.
    #[test]
    fn test_normalization() {
        let results = vec![(1, 0.1), (2, 0.5), (3, 0.9)];
        let normalized = normalize_to_similarity(&results);

        assert_eq!(normalized.len(), 3);
        // Lowest distance should have highest similarity
        let (id1, sim1) = normalized[0];
        assert_eq!(id1, 1);
        assert!((sim1 - 1.0).abs() < 0.01, "Best should be ~1.0");
    }

    // The default model should push alpha below 0.5 for keyword-ish queries
    // and keep it high for vector-ish queries.
    #[test]
    fn test_fusion_model() {
        let model = FusionModel::default();

        // Short navigational query
        let features = QueryFeatures {
            embedding_norm: 1.0,
            term_count: 2,
            avg_term_idf: 2.0,
            has_exact_match: true,
            query_type: QueryType::Navigational,
        };

        let alpha = model.predict_alpha(&features);
        assert!(
            alpha < 0.5,
            "Navigational query should favor keyword (alpha < 0.5)"
        );

        // Long informational query
        let features2 = QueryFeatures {
            embedding_norm: 1.2,
            term_count: 6,
            avg_term_idf: 5.0,
            has_exact_match: false,
            query_type: QueryType::Informational,
        };

        let alpha2 = model.predict_alpha(&features2);
        assert!(
            alpha2 > 0.4,
            "Informational query with rare terms should favor vector"
        );
    }

    // One query per indicator category exercises each classification branch.
    #[test]
    fn test_query_type_classification() {
        let nav = classify_query_type(&["github".into(), "login".into()]);
        assert_eq!(nav, QueryType::Navigational);

        let info = classify_query_type(&["how".into(), "to".into(), "cook".into(), "pasta".into()]);
        assert_eq!(info, QueryType::Informational);

        let trans = classify_query_type(&["buy".into(), "laptop".into()]);
        assert_eq!(trans, QueryType::Transactional);
    }

    // Code-like terms and short queries trigger exact-match intent; a long
    // all-word query does not.
    #[test]
    fn test_exact_match_detection() {
        assert!(detect_exact_match_intent(&["ERR001".into()]));
        assert!(detect_exact_match_intent(&["SKU12345".into()]));
        assert!(!detect_exact_match_intent(&[
            "database".into(),
            "connection".into(),
            "error".into(),
            "handling".into()
        ]));
    }

    // Both fusion paths must tolerate empty inputs without panicking.
    #[test]
    fn test_empty_results() {
        let results = rrf_fusion(&[], &[], 60, 10);
        assert!(results.is_empty());

        let results2 = linear_fusion(&[], &[], 0.5, 10);
        assert!(results2.is_empty());
    }
}