sqlite_graphrag/storage/
fusion.rs

1//! RRF (Reciprocal Rank Fusion) utilities shared between `hybrid-search` and
2//! `deep-research`.
3//!
4//! The formula used is the canonical RRF score:
5//!
6//! ```text
7//! score(d) = sum_over_lists { weight * 1 / (rrf_k + rank(d)) }
8//! ```
9//!
10//! where `rank` is 1-indexed position in each ordered list.  The map returned
11//! by [`rrf_fuse`] contains un-normalised scores; callers that need a `[0,1]`
12//! range should divide by the theoretical maximum:
13//!
14//! ```text
15//! max_possible = sum_over_lists { weight * 1 / (rrf_k + 1) }
16//! ```
17
18use std::collections::BTreeMap;
19
20/// Fuse multiple ranked lists of integer IDs via Reciprocal Rank Fusion.
21///
22/// Each element of `lists` is `(weight, ranked_ids)` where `ranked_ids` is
23/// ordered best-first (index 0 = rank 1).
24///
25/// Returns a `BTreeMap<id, combined_score>` using un-normalised RRF scores.
26/// Higher score means higher relevance. The `BTreeMap` is chosen over
27/// `HashMap` so iteration order is **deterministic** (sorted by id) —
28/// this lets callers serialise the result to canonical JSON and run
29/// equality assertions across processes and platforms.
30///
31/// # Examples
32///
33/// ```
34/// use sqlite_graphrag::storage::fusion::rrf_fuse;
35///
36/// // Two lists with equal weight — item 1 appears in both at rank 1 and 2
37/// // so it accumulates more score than item 2 (rank 2) or item 3 (rank 1 only).
38/// let knn: Vec<i64> = vec![1, 2];
39/// let fts: Vec<i64> = vec![1, 3];
40/// let scores = rrf_fuse(&[(1.0, &knn), (1.0, &fts)], 60.0);
41/// assert!(scores[&1] > scores[&2]);
42/// assert!(scores[&1] > scores[&3]);
43/// ```
44pub fn rrf_fuse(lists: &[(f64, &Vec<i64>)], rrf_k: f64) -> BTreeMap<i64, f64> {
45    // BTreeMap is constructed empty and grown via `entry().or_insert()` —
46    // there is no `with_capacity` constructor for BTreeMap, so we skip the
47    // pre-allocation that HashMap could express.
48    let mut combined: BTreeMap<i64, f64> = BTreeMap::new();
49    let _total_ids: usize = 0; // reserved for future diagnostics; silences unused warning
50    for (weight, ids) in lists {
51        for (rank, &id) in ids.iter().enumerate() {
52            // rank is 0-indexed here; formula uses 1-indexed, so we add 1.
53            let contribution = weight * (1.0 / (rrf_k + rank as f64 + 1.0));
54            *combined.entry(id).or_insert(0.0) += contribution;
55        }
56    }
57    combined
58}
59
60/// Compute the theoretical maximum RRF score for a given set of weights and
61/// `rrf_k`.
62///
63/// Useful for normalising `rrf_fuse` scores to `[0, 1]`:
64///
65/// ```
66/// use sqlite_graphrag::storage::fusion::{rrf_fuse, rrf_max_possible};
67///
68/// let weights = vec![1.0_f64, 1.0_f64];
69/// let max = rrf_max_possible(&weights, 60.0);
70/// assert!(max > 0.0);
71/// ```
72pub fn rrf_max_possible(weights: &[f64], rrf_k: f64) -> f64 {
73    weights.iter().map(|w| w * (1.0 / (rrf_k + 1.0))).sum()
74}
75
76#[cfg(test)]
77mod tests {
78    use super::*;
79
80    #[test]
81    fn rrf_fuse_single_list_rank_order_preserved() {
82        // Items at lower rank index get higher scores.
83        let list = vec![10i64, 20, 30];
84        let scores = rrf_fuse(&[(1.0, &list)], 60.0);
85        assert!(scores[&10] > scores[&20]);
86        assert!(scores[&20] > scores[&30]);
87    }
88
89    #[test]
90    fn rrf_fuse_two_lists_overlap_accumulates() {
91        // Item 1 appears first in both lists — must beat item 2 (rank 1 in one list only).
92        let knn = vec![1i64, 2];
93        let fts = vec![1i64, 3];
94        let scores = rrf_fuse(&[(1.0, &knn), (1.0, &fts)], 60.0);
95        assert!(scores[&1] > scores[&2], "overlap item must score higher");
96        assert!(scores[&1] > scores[&3], "overlap item must score higher");
97    }
98
99    #[test]
100    fn rrf_fuse_empty_lists_returns_empty() {
101        let empty: Vec<i64> = vec![];
102        let scores = rrf_fuse(&[(1.0, &empty)], 60.0);
103        assert!(scores.is_empty());
104    }
105
106    #[test]
107    fn rrf_fuse_zero_weight_list_has_no_effect() {
108        let list_a = vec![1i64, 2];
109        let list_b = vec![3i64, 4];
110        let scores_with = rrf_fuse(&[(1.0, &list_a), (0.0, &list_b)], 60.0);
111        // Items 3 and 4 should have score 0.0 (or not present).
112        assert_eq!(scores_with.get(&3).copied().unwrap_or(0.0), 0.0);
113        assert_eq!(scores_with.get(&4).copied().unwrap_or(0.0), 0.0);
114    }
115
116    #[test]
117    fn rrf_fuse_weights_scale_contribution() {
118        // Higher weight means higher score for same rank.
119        let list = vec![1i64];
120        let low = rrf_fuse(&[(0.5, &list)], 60.0);
121        let high = rrf_fuse(&[(2.0, &list)], 60.0);
122        assert!(high[&1] > low[&1]);
123    }
124
125    #[test]
126    fn rrf_max_possible_sums_weights() {
127        // With rrf_k=60, max for one list of weight 1.0 is 1/(60+1) ≈ 0.01639.
128        let max = rrf_max_possible(&[1.0], 60.0);
129        let expected = 1.0 / 61.0;
130        assert!((max - expected).abs() < 1e-9);
131
132        // Two equal-weight lists: sum of both.
133        let max2 = rrf_max_possible(&[1.0, 1.0], 60.0);
134        assert!((max2 - 2.0 / 61.0).abs() < 1e-9);
135    }
136
137    #[test]
138    fn rrf_fuse_deterministic_for_same_input() {
139        let list_a = vec![1i64, 2, 3];
140        let list_b = vec![2i64, 1, 4];
141        let scores_1 = rrf_fuse(&[(1.0, &list_a), (1.0, &list_b)], 60.0);
142        let scores_2 = rrf_fuse(&[(1.0, &list_a), (1.0, &list_b)], 60.0);
143        for id in [1i64, 2, 3, 4] {
144            assert_eq!(
145                scores_1.get(&id).copied().unwrap_or(0.0),
146                scores_2.get(&id).copied().unwrap_or(0.0)
147            );
148        }
149    }
150}
sqlite_graphrag/storage/fusion.rs

sqlite_graphrag/storage/
fusion.rs