Skip to main content

ripvec_core/
hybrid.rs

//! Hybrid semantic + keyword search with Reciprocal Rank Fusion (RRF).
//!
//! [`HybridIndex`] wraps a [`SearchIndex`] (dense vector search) and a
//! [`Bm25Index`] (BM25 keyword search) and fuses their ranked results via
//! Reciprocal Rank Fusion so that chunks appearing high in either list
//! bubble to the top of the combined ranking.
7
8use std::collections::HashMap;
9use std::fmt;
10use std::str::FromStr;
11
12use crate::bm25::Bm25Index;
13use crate::chunk::CodeChunk;
14use crate::index::SearchIndex;
15
/// Controls which retrieval strategy is used during search.
///
/// The default is [`SearchMode::Hybrid`]. The type is `Copy` and also
/// derives `Hash`, so a mode can be used directly as a map or set key
/// (for example, to keep per-mode statistics).
#[derive(Debug, Clone, Copy, Default, PartialEq, Eq, Hash)]
pub enum SearchMode {
    /// Fuse semantic (vector) and keyword (BM25) results via RRF.
    #[default]
    Hybrid,
    /// Dense vector cosine-similarity ranking only.
    Semantic,
    /// BM25 keyword ranking only.
    Keyword,
}
27
28impl fmt::Display for SearchMode {
29    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
30        match self {
31            Self::Hybrid => f.write_str("hybrid"),
32            Self::Semantic => f.write_str("semantic"),
33            Self::Keyword => f.write_str("keyword"),
34        }
35    }
36}
37
/// Failure produced when a string does not name any `SearchMode`.
///
/// The wrapped `String` is the rejected input; the `Display` output
/// echoes it (debug-quoted) followed by the list of accepted names.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct ParseSearchModeError(String);

impl fmt::Display for ParseSearchModeError {
    /// Renders `unknown search mode "<input>"; expected hybrid, semantic, or keyword`.
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        let Self(rejected) = self;
        f.write_fmt(format_args!(
            "unknown search mode {:?}; expected hybrid, semantic, or keyword",
            rejected
        ))
    }
}

// No underlying cause to expose, so the trait's default methods suffice.
impl std::error::Error for ParseSearchModeError {}
53
54impl FromStr for SearchMode {
55    type Err = ParseSearchModeError;
56
57    fn from_str(s: &str) -> Result<Self, Self::Err> {
58        match s {
59            "hybrid" => Ok(Self::Hybrid),
60            "semantic" => Ok(Self::Semantic),
61            "keyword" => Ok(Self::Keyword),
62            other => Err(ParseSearchModeError(other.to_string())),
63        }
64    }
65}
66
67/// Combined semantic + keyword search index with RRF fusion.
68///
69/// Build once from chunks and pre-computed embeddings; query repeatedly
70/// via [`search`](Self::search).
71pub struct HybridIndex {
72    /// Semantic (dense vector) search index.
73    pub semantic: SearchIndex,
74    /// BM25 keyword search index.
75    bm25: Bm25Index,
76}
77
78impl HybridIndex {
79    /// Build a `HybridIndex` from raw chunks and their pre-computed embeddings.
80    ///
81    /// Constructs both the [`SearchIndex`] and [`Bm25Index`] in one call.
82    /// `cascade_dim` is forwarded to [`SearchIndex::new`] for optional MRL
83    /// cascade pre-filtering.
84    ///
85    /// # Errors
86    ///
87    /// Returns an error if the BM25 index cannot be built (e.g., tantivy
88    /// schema or writer failure).
89    pub fn new(
90        chunks: Vec<CodeChunk>,
91        embeddings: &[Vec<f32>],
92        cascade_dim: Option<usize>,
93    ) -> crate::Result<Self> {
94        let bm25 = Bm25Index::build(&chunks)?;
95        let semantic = SearchIndex::new(chunks, embeddings, cascade_dim);
96        Ok(Self { semantic, bm25 })
97    }
98
99    /// Assemble a `HybridIndex` from pre-built components.
100    ///
101    /// Useful when the caller has already constructed the sub-indices
102    /// separately (e.g., loaded from a cache).
103    #[must_use]
104    pub fn from_parts(semantic: SearchIndex, bm25: Bm25Index) -> Self {
105        Self { semantic, bm25 }
106    }
107
108    /// Search the index and return `(chunk_index, score)` pairs.
109    ///
110    /// Dispatches based on `mode`:
111    /// - [`SearchMode::Semantic`] — pure dense vector search via
112    ///   [`SearchIndex::rank`].
113    /// - [`SearchMode::Keyword`] — pure BM25 keyword search, truncated to
114    ///   `top_k`.
115    /// - [`SearchMode::Hybrid`] — retrieves both ranked lists, fuses them
116    ///   with [`rrf_fuse`], then truncates to `top_k`.
117    ///
118    /// Scores are min-max normalized to `[0, 1]` regardless of mode, so
119    /// a threshold of 0.5 always means "above midpoint of the score range"
120    /// whether the underlying scores are cosine similarity, BM25, or RRF.
121    #[must_use]
122    pub fn search(
123        &self,
124        query_embedding: &[f32],
125        query_text: &str,
126        top_k: usize,
127        threshold: f32,
128        mode: SearchMode,
129    ) -> Vec<(usize, f32)> {
130        let mut raw = match mode {
131            SearchMode::Semantic => {
132                // Fetch more than top_k so normalization has a meaningful range.
133                self.semantic
134                    .rank_turboquant(query_embedding, top_k.max(100), 0.0)
135            }
136            SearchMode::Keyword => self.bm25.search(query_text, top_k.max(100)),
137            SearchMode::Hybrid => {
138                let sem = self
139                    .semantic
140                    .rank_turboquant(query_embedding, top_k.max(100), 0.0);
141                let kw = self.bm25.search(query_text, top_k.max(100));
142                rrf_fuse(&sem, &kw, 60.0)
143            }
144        };
145
146        // Min-max normalize scores to [0, 1] so threshold is model-agnostic.
147        if let (Some(max), Some(min)) = (raw.first().map(|(_, s)| *s), raw.last().map(|(_, s)| *s))
148        {
149            let range = max - min;
150            if range > f32::EPSILON {
151                for (_, score) in &mut raw {
152                    *score = (*score - min) / range;
153                }
154            } else {
155                // All scores identical — normalize to 1.0
156                for (_, score) in &mut raw {
157                    *score = 1.0;
158                }
159            }
160        }
161
162        // Apply threshold on normalized scores, then truncate
163        raw.retain(|(_, score)| *score >= threshold);
164        raw.truncate(top_k);
165        raw
166    }
167
168    /// All chunks in the index.
169    #[must_use]
170    pub fn chunks(&self) -> &[CodeChunk] {
171        &self.semantic.chunks
172    }
173}
174
/// Reciprocal Rank Fusion of two ranked lists.
///
/// Each entry in `semantic` and `bm25` is `(chunk_index, _score)`; only the
/// position of an entry within its list matters, the incoming scores are
/// ignored. The fused score for a chunk is the sum of
/// `1 / (k + rank + 1)` across every list the chunk appears in, where
/// `rank` is 0-based.
///
/// Returns all chunks that appear in either list, sorted descending by
/// fused RRF score; ties are broken by ascending chunk index so the output
/// order is deterministic.
///
/// `k` should typically be 60.0 — a conventional constant that smooths the
/// ranking boost for the very top results.
#[must_use]
pub fn rrf_fuse(semantic: &[(usize, f32)], bm25: &[(usize, f32)], k: f32) -> Vec<(usize, f32)> {
    // At most one map entry per input row, so this never needs to regrow.
    let mut fused: HashMap<usize, f32> = HashMap::with_capacity(semantic.len() + bm25.len());

    // Same accumulation for both lists; `semantic` is processed first so
    // per-chunk floating-point sums are formed in a fixed order.
    for list in [semantic, bm25] {
        for (rank, &(chunk, _)) in list.iter().enumerate() {
            *fused.entry(chunk).or_default() += 1.0 / (k + rank as f32 + 1.0);
        }
    }

    let mut ranked: Vec<(usize, f32)> = fused.into_iter().collect();
    // HashMap iteration order is arbitrary; the index tie-break makes the
    // final order fully determined by the inputs.
    ranked.sort_unstable_by(|a, b| b.1.total_cmp(&a.1).then_with(|| a.0.cmp(&b.0)));
    ranked
}
203
204/// Apply a multiplicative PageRank boost to search results.
205///
206/// For each result, looks up the file's PageRank score and applies:
207///   `boosted_score = score * (1.0 + alpha * normalized_pagerank)`
208///
209/// This amplifies already-relevant results from structurally important files
210/// without promoting irrelevant results (multiplicative, not additive).
211/// Results are re-sorted after boosting.
212///
213/// `pagerank_by_file` maps relative file paths to their PageRank scores
214/// (pre-normalized to [0, 1] by dividing by max rank).
215/// `alpha` controls boost strength — 0.3 means the top-ranked file gets
216/// a 30% score boost. The `alpha` field from [`RepoGraph`] is recommended.
217pub fn boost_with_pagerank<S: std::hash::BuildHasher>(
218    results: &mut [(usize, f32)],
219    chunks: &[CodeChunk],
220    pagerank_by_file: &HashMap<String, f32, S>,
221    alpha: f32,
222) {
223    for (idx, score) in results.iter_mut() {
224        if let Some(chunk) = chunks.get(*idx) {
225            let rank = pagerank_by_file
226                .get(&chunk.file_path)
227                .copied()
228                .unwrap_or(0.0);
229            *score *= 1.0 + alpha * rank;
230        }
231    }
232    // Re-sort descending by boosted score
233    results.sort_unstable_by(|a, b| b.1.total_cmp(&a.1).then_with(|| a.0.cmp(&b.0)));
234}
235
236/// Build a normalized PageRank lookup table from a [`RepoGraph`].
237///
238/// Returns a map from relative file path to PageRank score normalized
239/// to [0, 1] (divided by the maximum rank in the graph). Files with
240/// no PageRank score map to 0.0.
241#[must_use]
242pub fn pagerank_lookup(graph: &crate::repo_map::RepoGraph) -> HashMap<String, f32> {
243    let max_rank = graph.base_ranks.iter().copied().fold(0.0_f32, f32::max);
244    if max_rank <= f32::EPSILON {
245        return HashMap::new();
246    }
247    graph
248        .files
249        .iter()
250        .zip(graph.base_ranks.iter())
251        .map(|(file, &rank)| (file.path.clone(), rank / max_rank))
252        .collect()
253}
254
#[cfg(test)]
mod tests {
    use super::*;

    /// Build a `function` chunk spanning lines 1-10 with empty content for
    /// the given file path and symbol name.
    fn function_chunk(path: &'static str, name: &'static str) -> CodeChunk {
        CodeChunk {
            file_path: path.into(),
            name: name.into(),
            kind: "function".into(),
            start_line: 1,
            end_line: 10,
            content: String::new(),
            enriched_content: String::new(),
        }
    }

    #[test]
    fn rrf_union_semantics() {
        // Semantic order: 0, 1, 2 — keyword order: 3, 0, 4.
        // Chunk 0 is the only one present in both lists, so it must win;
        // the other four appear once each and must all survive fusion.
        let dense = vec![(0, 0.9), (1, 0.8), (2, 0.7)];
        let keyword = vec![(3, 10.0), (0, 8.0), (4, 6.0)];

        let fused = rrf_fuse(&dense, &keyword, 60.0);
        let indices: Vec<usize> = fused.iter().map(|hit| hit.0).collect();

        // All 5 unique chunks must appear
        for expected in 0..5 {
            assert!(
                indices.contains(&expected),
                "chunk {expected} missing from fused results"
            );
        }
        assert_eq!(fused.len(), 5);

        // Chunk 0 must rank first (double-list bonus)
        assert_eq!(indices[0], 0, "chunk 0 should rank first");
    }

    #[test]
    fn rrf_single_list() {
        // Keyword side is empty, so fusion must preserve the semantic order.
        let dense = vec![(0, 0.9), (1, 0.8)];
        let keyword: Vec<(usize, f32)> = Vec::new();

        let fused = rrf_fuse(&dense, &keyword, 60.0);

        assert_eq!(fused.len(), 2);
        // Rank 0 in the only list → strictly higher RRF score than rank 1.
        assert_eq!(fused[0].0, 0);
        assert_eq!(fused[1].0, 1);
        assert!(fused[0].1 > fused[1].1);
    }

    #[test]
    fn search_mode_roundtrip() {
        let cases = [
            ("hybrid", SearchMode::Hybrid),
            ("semantic", SearchMode::Semantic),
            ("keyword", SearchMode::Keyword),
        ];
        for (text, mode) in cases {
            assert_eq!(text.parse::<SearchMode>().unwrap(), mode);
        }

        let err = "invalid".parse::<SearchMode>();
        assert!(err.is_err(), "expected parse error for 'invalid'");
        let msg = err.unwrap_err().to_string();
        assert!(
            msg.contains("invalid"),
            "error message should echo the bad input"
        );
    }

    #[test]
    fn search_mode_display() {
        assert_eq!(SearchMode::Hybrid.to_string(), "hybrid");
        assert_eq!(SearchMode::Semantic.to_string(), "semantic");
        assert_eq!(SearchMode::Keyword.to_string(), "keyword");
    }

    #[test]
    fn pagerank_boost_amplifies_relevant() {
        let chunks = vec![
            function_chunk("important.rs", "a"),
            function_chunk("obscure.rs", "b"),
        ];

        // Identical starting scores; only the PageRank of the files differs.
        let mut results = vec![(0, 0.8_f32), (1, 0.8)];
        let pr = HashMap::from([
            ("important.rs".to_string(), 1.0), // max PageRank
            ("obscure.rs".to_string(), 0.1),   // low PageRank
        ]);

        boost_with_pagerank(&mut results, &chunks, &pr, 0.3);

        // important.rs should now rank higher
        assert_eq!(
            results[0].0, 0,
            "important.rs should rank first after boost"
        );
        assert!(results[0].1 > results[1].1);

        // Verify the math: 0.8 * (1 + 0.3 * 1.0) = 1.04
        assert!((results[0].1 - 1.04).abs() < 0.001);
        // 0.8 * (1 + 0.3 * 0.1) = 0.824
        assert!((results[1].1 - 0.824).abs() < 0.001);
    }

    #[test]
    fn pagerank_boost_zero_relevance_stays_zero() {
        let chunks = vec![function_chunk("important.rs", "a")];

        let mut results = vec![(0, 0.0_f32)];
        let pr = HashMap::from([("important.rs".to_string(), 1.0)]);

        boost_with_pagerank(&mut results, &chunks, &pr, 0.3);

        // Multiplicative boost: zero relevance cannot be promoted.
        assert_eq!(results[0].1, 0.0);
    }

    #[test]
    fn pagerank_boost_unknown_file_no_effect() {
        let chunks = vec![function_chunk("unknown.rs", "a")];

        let mut results = vec![(0, 0.5_f32)];
        let pr: HashMap<String, f32> = HashMap::new(); // empty — no PageRank data

        boost_with_pagerank(&mut results, &chunks, &pr, 0.3);

        // No PageRank data → no boost
        assert_eq!(results[0].1, 0.5);
    }
}