sqlite_graphrag/storage/fusion.rs
1//! RRF (Reciprocal Rank Fusion) utilities shared between `hybrid-search` and
2//! `deep-research`.
3//!
4//! The formula used is the canonical RRF score:
5//!
6//! ```text
7//! score(d) = sum_over_lists { weight * 1 / (rrf_k + rank(d)) }
8//! ```
9//!
10//! where `rank` is 1-indexed position in each ordered list. The map returned
11//! by [`rrf_fuse`] contains un-normalised scores; callers that need a `[0,1]`
12//! range should divide by the theoretical maximum:
13//!
14//! ```text
15//! max_possible = sum_over_lists { weight * 1 / (rrf_k + 1) }
16//! ```
17
18use std::collections::BTreeMap;
19
20/// Fuse multiple ranked lists of integer IDs via Reciprocal Rank Fusion.
21///
22/// Each element of `lists` is `(weight, ranked_ids)` where `ranked_ids` is
23/// ordered best-first (index 0 = rank 1).
24///
25/// Returns a `BTreeMap<id, combined_score>` using un-normalised RRF scores.
26/// Higher score means higher relevance. The `BTreeMap` is chosen over
27/// `HashMap` so iteration order is **deterministic** (sorted by id) —
28/// this lets callers serialise the result to canonical JSON and run
29/// equality assertions across processes and platforms.
30///
31/// # Examples
32///
33/// ```
34/// use sqlite_graphrag::storage::fusion::rrf_fuse;
35///
36/// // Two lists with equal weight — item 1 appears in both at rank 1 and 2
37/// // so it accumulates more score than item 2 (rank 2) or item 3 (rank 1 only).
38/// let knn: Vec<i64> = vec![1, 2];
39/// let fts: Vec<i64> = vec![1, 3];
40/// let scores = rrf_fuse(&[(1.0, &knn), (1.0, &fts)], 60.0);
41/// assert!(scores[&1] > scores[&2]);
42/// assert!(scores[&1] > scores[&3]);
43/// ```
44pub fn rrf_fuse(lists: &[(f64, &Vec<i64>)], rrf_k: f64) -> BTreeMap<i64, f64> {
45 // BTreeMap is constructed empty and grown via `entry().or_insert()` —
46 // there is no `with_capacity` constructor for BTreeMap, so we skip the
47 // pre-allocation that HashMap could express.
48 let mut combined: BTreeMap<i64, f64> = BTreeMap::new();
49 let _total_ids: usize = 0; // reserved for future diagnostics; silences unused warning
50 for (weight, ids) in lists {
51 for (rank, &id) in ids.iter().enumerate() {
52 // rank is 0-indexed here; formula uses 1-indexed, so we add 1.
53 let contribution = weight * (1.0 / (rrf_k + rank as f64 + 1.0));
54 *combined.entry(id).or_insert(0.0) += contribution;
55 }
56 }
57 combined
58}
59
60/// Compute the theoretical maximum RRF score for a given set of weights and
61/// `rrf_k`.
62///
63/// Useful for normalising `rrf_fuse` scores to `[0, 1]`:
64///
65/// ```
66/// use sqlite_graphrag::storage::fusion::{rrf_fuse, rrf_max_possible};
67///
68/// let weights = vec![1.0_f64, 1.0_f64];
69/// let max = rrf_max_possible(&weights, 60.0);
70/// assert!(max > 0.0);
71/// ```
72pub fn rrf_max_possible(weights: &[f64], rrf_k: f64) -> f64 {
73 weights.iter().map(|w| w * (1.0 / (rrf_k + 1.0))).sum()
74}
75
76#[cfg(test)]
77mod tests {
78 use super::*;
79
80 #[test]
81 fn rrf_fuse_single_list_rank_order_preserved() {
82 // Items at lower rank index get higher scores.
83 let list = vec![10i64, 20, 30];
84 let scores = rrf_fuse(&[(1.0, &list)], 60.0);
85 assert!(scores[&10] > scores[&20]);
86 assert!(scores[&20] > scores[&30]);
87 }
88
89 #[test]
90 fn rrf_fuse_two_lists_overlap_accumulates() {
91 // Item 1 appears first in both lists — must beat item 2 (rank 1 in one list only).
92 let knn = vec![1i64, 2];
93 let fts = vec![1i64, 3];
94 let scores = rrf_fuse(&[(1.0, &knn), (1.0, &fts)], 60.0);
95 assert!(scores[&1] > scores[&2], "overlap item must score higher");
96 assert!(scores[&1] > scores[&3], "overlap item must score higher");
97 }
98
99 #[test]
100 fn rrf_fuse_empty_lists_returns_empty() {
101 let empty: Vec<i64> = vec![];
102 let scores = rrf_fuse(&[(1.0, &empty)], 60.0);
103 assert!(scores.is_empty());
104 }
105
106 #[test]
107 fn rrf_fuse_zero_weight_list_has_no_effect() {
108 let list_a = vec![1i64, 2];
109 let list_b = vec![3i64, 4];
110 let scores_with = rrf_fuse(&[(1.0, &list_a), (0.0, &list_b)], 60.0);
111 // Items 3 and 4 should have score 0.0 (or not present).
112 assert_eq!(scores_with.get(&3).copied().unwrap_or(0.0), 0.0);
113 assert_eq!(scores_with.get(&4).copied().unwrap_or(0.0), 0.0);
114 }
115
116 #[test]
117 fn rrf_fuse_weights_scale_contribution() {
118 // Higher weight means higher score for same rank.
119 let list = vec![1i64];
120 let low = rrf_fuse(&[(0.5, &list)], 60.0);
121 let high = rrf_fuse(&[(2.0, &list)], 60.0);
122 assert!(high[&1] > low[&1]);
123 }
124
125 #[test]
126 fn rrf_max_possible_sums_weights() {
127 // With rrf_k=60, max for one list of weight 1.0 is 1/(60+1) ≈ 0.01639.
128 let max = rrf_max_possible(&[1.0], 60.0);
129 let expected = 1.0 / 61.0;
130 assert!((max - expected).abs() < 1e-9);
131
132 // Two equal-weight lists: sum of both.
133 let max2 = rrf_max_possible(&[1.0, 1.0], 60.0);
134 assert!((max2 - 2.0 / 61.0).abs() < 1e-9);
135 }
136
137 #[test]
138 fn rrf_fuse_deterministic_for_same_input() {
139 let list_a = vec![1i64, 2, 3];
140 let list_b = vec![2i64, 1, 4];
141 let scores_1 = rrf_fuse(&[(1.0, &list_a), (1.0, &list_b)], 60.0);
142 let scores_2 = rrf_fuse(&[(1.0, &list_a), (1.0, &list_b)], 60.0);
143 for id in [1i64, 2, 3, 4] {
144 assert_eq!(
145 scores_1.get(&id).copied().unwrap_or(0.0),
146 scores_2.get(&id).copied().unwrap_or(0.0)
147 );
148 }
149 }
150}