Skip to main content

ripvec_core/
hybrid.rs

1//! Hybrid semantic + keyword search with Reciprocal Rank Fusion (RRF).
2//!
3//! Search mode enum + helper functions for PageRank boosting and lookup
4//! used by the ripvec engine. Pre-v3.0.0 this also contained `HybridIndex`
5//! (the transformer-engine search index), which is now gone.
6
7use std::collections::HashMap;
8use std::fmt;
9use std::str::FromStr;
10
11use crate::chunk::CodeChunk;
12
13/// Controls which retrieval strategy is used during search.
14#[derive(Debug, Clone, Copy, Default, PartialEq, Eq)]
15pub enum SearchMode {
16    /// Fuse semantic (vector) and keyword (BM25) results via RRF.
17    #[default]
18    Hybrid,
19    /// Dense vector cosine-similarity ranking only.
20    Semantic,
21    /// BM25 keyword ranking only.
22    Keyword,
23}
24
25impl fmt::Display for SearchMode {
26    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
27        match self {
28            Self::Hybrid => f.write_str("hybrid"),
29            Self::Semantic => f.write_str("semantic"),
30            Self::Keyword => f.write_str("keyword"),
31        }
32    }
33}
34
35/// Error returned when a `SearchMode` string cannot be parsed.
36#[derive(Debug, Clone, PartialEq, Eq)]
37pub struct ParseSearchModeError(String);
38
39impl fmt::Display for ParseSearchModeError {
40    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
41        write!(
42            f,
43            "unknown search mode {:?}; expected hybrid, semantic, or keyword",
44            self.0
45        )
46    }
47}
48
49impl std::error::Error for ParseSearchModeError {}
50
51impl FromStr for SearchMode {
52    type Err = ParseSearchModeError;
53
54    fn from_str(s: &str) -> Result<Self, Self::Err> {
55        match s {
56            "hybrid" => Ok(Self::Hybrid),
57            "semantic" => Ok(Self::Semantic),
58            "keyword" => Ok(Self::Keyword),
59            other => Err(ParseSearchModeError(other.to_string())),
60        }
61    }
62}
63
64#[must_use]
65pub fn pagerank_boost_factor(percentile: f32, alpha: f32) -> f32 {
66    if percentile <= 0.0 || alpha <= 0.0 {
67        return 1.0;
68    }
69    let z = (percentile.clamp(0.0, 1.0) - 0.5) / PAGERANK_SIGMOID_STEEPNESS;
70    let sigmoid = 1.0 / (1.0 + (-z).exp());
71    1.0 + alpha * sigmoid
72}
73
74/// Apply a multiplicative PageRank boost to search results.
75///
76/// For each result, looks up the chunk's PageRank percentile and applies
77/// the sigmoid boost from [`pagerank_boost_factor`].
78///
79/// Results are re-sorted after boosting.
80///
81/// `pagerank_by_file` maps relative file paths to their **PageRank
82/// percentile** in the corpus distribution — not the raw rank value.
83/// Build it via [`pagerank_lookup`], which switched to percentile in
84/// service of the sigmoid curve.
85///
86/// `alpha` controls the maximum boost (ceiling = `1 + alpha`). The
87/// `alpha` field from [`RepoGraph`] is recommended (auto-tuned from
88/// graph density).
89pub fn boost_with_pagerank<S: std::hash::BuildHasher>(
90    results: &mut [(usize, f32)],
91    chunks: &[CodeChunk],
92    pagerank_by_file: &HashMap<String, f32, S>,
93    alpha: f32,
94) {
95    // Operates on `&mut [_]` (not `&mut Vec<_>`) so we can't delegate
96    // to `crate::ranking::PageRankBoost::apply` directly (the trait
97    // method takes `&mut Vec` to allow truncation layers). Replicate
98    // the boost loop inline; both paths share `lookup_rank` +
99    // `pagerank_boost_factor` so the curve stays consistent.
100    for (idx, score) in results.iter_mut() {
101        if let Some(chunk) = chunks.get(*idx) {
102            let rank = lookup_rank(pagerank_by_file, &chunk.file_path, &chunk.name);
103            *score *= pagerank_boost_factor(rank, alpha);
104        }
105    }
106    results.sort_unstable_by(|a, b| b.1.total_cmp(&a.1).then_with(|| a.0.cmp(&b.0)));
107}
108
109/// Build a per-file PageRank lookup table from a `RepoGraph`.
110///
111/// Returns a `file_path -> percentile` map, plus `file_path::name`
112/// entries for definitions. Percentiles are in `[0, 1]`.
113#[must_use]
114pub fn pagerank_lookup(graph: &crate::repo_map::RepoGraph) -> HashMap<String, f32> {
115    // Switched from `rank / max_rank` (proportional) to percentile in
116    // the corpus distribution. Rationale: a top-K result set typically
117    // contains files whose raw ranks are all in a tiny band near zero
118    // (Tokio: max in top-10 was 0.028 out of 1.0). Proportional
119    // normalization gave uniformly tiny boosts. Percentile separates
120    // "bottom decile (tests, leaves)" from "top half (impls, hubs)"
121    // crisply, and pairs with the sigmoid in `pagerank_boost_factor`
122    // to put the rank-transition where the action is.
123    //
124    // Definition-level and file-level percentiles use independent
125    // distributions: `def_ranks` and `base_ranks`. A file that has no
126    // defs still gets a file-level percentile from `base_ranks`.
127    let def_pct = make_percentile_fn(&graph.def_ranks);
128    let base_pct = make_percentile_fn(&graph.base_ranks);
129    let mut map = HashMap::new();
130    for (file_idx, file) in graph.files.iter().enumerate() {
131        for (def_idx, def) in file.defs.iter().enumerate() {
132            let flat = graph.def_offsets[file_idx] + def_idx;
133            if let Some(&rank) = graph.def_ranks.get(flat) {
134                let key = format!("{}::{}", file.path, def.name);
135                map.insert(key, def_pct(rank));
136            }
137        }
138        if file_idx < graph.base_ranks.len() {
139            map.insert(file.path.clone(), base_pct(graph.base_ranks[file_idx]));
140        }
141    }
142    map
143}
144
145/// Build a `value → percentile` function from a slice of rank values.
146///
147/// Sorts a copy once at build time, then each lookup is a binary search
148/// over the sorted slice. Returns the empirical CDF: the fraction of
149/// values strictly less than the queried value. Handles empty input
150/// and `NaN` defensively.
151fn make_percentile_fn(values: &[f32]) -> impl Fn(f32) -> f32 + '_ {
152    let mut sorted: Vec<f32> = values.iter().copied().filter(|v| v.is_finite()).collect();
153    sorted.sort_unstable_by(f32::total_cmp);
154    move |value: f32| {
155        if sorted.is_empty() {
156            return 0.0;
157        }
158        // partition_point returns the count of elements strictly less
159        // than `value` (because the predicate is `<`).
160        let count_below = sorted.partition_point(|&v| v < value);
161        #[expect(
162            clippy::cast_precision_loss,
163            reason = "rank counts well below f32 precision threshold"
164        )]
165        let pct = count_below as f32 / sorted.len() as f32;
166        pct
167    }
168}
169
170// -----------------------------------------------------------------------------
171// Helpers preserved post-surgery (v3.0.0)
172// -----------------------------------------------------------------------------
173
174/// PageRank percentile -> boost sigmoid steepness.
175///
176/// Controls how steeply the sigmoid transitions from "no boost" (low
177/// percentile) to "max boost" (high percentile). A smaller value produces
178/// a sharper transition centered at the median.
179const PAGERANK_SIGMOID_STEEPNESS: f32 = 0.15;
180
181/// Look up the PageRank score for a chunk, with cascading fallbacks.
182///
183/// Tries the chunk's `definition::name` key first, then the bare file path,
184/// then strips leading path components one at a time until a hit. Returns
185/// 0.0 if no key matches.
186#[must_use]
187pub(crate) fn lookup_rank_for_chunk<S: std::hash::BuildHasher>(
188    pr: &HashMap<String, f32, S>,
189    file_path: &str,
190    name: &str,
191) -> f32 {
192    lookup_rank(pr, file_path, name)
193}
194
195#[must_use]
196fn lookup_rank<S: std::hash::BuildHasher>(
197    pr: &HashMap<String, f32, S>,
198    file_path: &str,
199    name: &str,
200) -> f32 {
201    let def_key = format!("{file_path}::{name}");
202    if let Some(&r) = pr.get(&def_key) {
203        return r;
204    }
205    if let Some(&r) = pr.get(file_path) {
206        return r;
207    }
208    let mut rest = file_path;
209    while let Some(idx) = rest.find('/') {
210        rest = &rest[idx + 1..];
211        if rest.is_empty() {
212            break;
213        }
214        let def_key = format!("{rest}::{name}");
215        if let Some(&r) = pr.get(&def_key) {
216            return r;
217        }
218        if let Some(&r) = pr.get(rest) {
219            return r;
220        }
221    }
222    0.0
223}
224
225#[cfg(test)]
226mod tests {
227    use super::*;
228
229    #[test]
230    fn search_mode_roundtrip() {
231        assert_eq!("hybrid".parse::<SearchMode>().unwrap(), SearchMode::Hybrid);
232        assert_eq!(
233            "semantic".parse::<SearchMode>().unwrap(),
234            SearchMode::Semantic
235        );
236        assert_eq!(
237            "keyword".parse::<SearchMode>().unwrap(),
238            SearchMode::Keyword
239        );
240
241        let err = "invalid".parse::<SearchMode>();
242        assert!(err.is_err(), "expected parse error for 'invalid'");
243        let msg = err.unwrap_err().to_string();
244        assert!(
245            msg.contains("invalid"),
246            "error message should echo the bad input"
247        );
248    }
249
250    #[test]
251    fn search_mode_display() {
252        assert_eq!(SearchMode::Hybrid.to_string(), "hybrid");
253        assert_eq!(SearchMode::Semantic.to_string(), "semantic");
254        assert_eq!(SearchMode::Keyword.to_string(), "keyword");
255    }
256
257    #[test]
258    fn pagerank_boost_amplifies_relevant() {
259        let chunks = vec![
260            CodeChunk {
261                file_path: "important.rs".into(),
262                name: "a".into(),
263                kind: "function".into(),
264                start_line: 1,
265                end_line: 10,
266                content: String::new(),
267                enriched_content: String::new(),
268            },
269            CodeChunk {
270                file_path: "obscure.rs".into(),
271                name: "b".into(),
272                kind: "function".into(),
273                start_line: 1,
274                end_line: 10,
275                content: String::new(),
276                enriched_content: String::new(),
277            },
278        ];
279
280        // Both start with same score; important.rs has high PageRank
281        let mut results = vec![(0, 0.8_f32), (1, 0.8)];
282        let mut pr = HashMap::new();
283        pr.insert("important.rs".to_string(), 1.0); // max PageRank
284        pr.insert("obscure.rs".to_string(), 0.1); // low PageRank
285
286        boost_with_pagerank(&mut results, &chunks, &pr, 0.3);
287
288        // important.rs should now rank higher
289        assert_eq!(
290            results[0].0, 0,
291            "important.rs should rank first after boost"
292        );
293        assert!(results[0].1 > results[1].1);
294
295        // Boost values reflect the sigmoid-on-percentile curve in
296        // `pagerank_boost_factor` (alpha=0.3 here):
297        // - percentile=1.0: sigmoid(3.33) ≈ 0.965, boost ≈ 1.29 → 1.032
298        // - percentile=0.1: sigmoid(-2.67) ≈ 0.065, boost ≈ 1.02 → 0.816
299        assert!(
300            (results[0].1 - 1.032).abs() < 0.01,
301            "rank=1.0 boost: expected ~1.032, got {}",
302            results[0].1
303        );
304        assert!(
305            (results[1].1 - 0.816).abs() < 0.01,
306            "rank=0.1 boost: expected ~0.816, got {}",
307            results[1].1
308        );
309    }
310
311    #[test]
312    fn pagerank_boost_zero_relevance_stays_zero() {
313        let chunks = vec![CodeChunk {
314            file_path: "important.rs".into(),
315            name: "a".into(),
316            kind: "function".into(),
317            start_line: 1,
318            end_line: 10,
319            content: String::new(),
320            enriched_content: String::new(),
321        }];
322
323        let mut results = vec![(0, 0.0_f32)];
324        let mut pr = HashMap::new();
325        pr.insert("important.rs".to_string(), 1.0);
326
327        boost_with_pagerank(&mut results, &chunks, &pr, 0.3);
328
329        // Zero score stays zero regardless of PageRank
330        assert!(results[0].1.abs() < f32::EPSILON);
331    }
332
333    #[test]
334    fn pagerank_boost_unknown_file_no_effect() {
335        let chunks = vec![CodeChunk {
336            file_path: "unknown.rs".into(),
337            name: "a".into(),
338            kind: "function".into(),
339            start_line: 1,
340            end_line: 10,
341            content: String::new(),
342            enriched_content: String::new(),
343        }];
344
345        let mut results = vec![(0, 0.5_f32)];
346        let pr = HashMap::new(); // empty — no PageRank data
347
348        boost_with_pagerank(&mut results, &chunks, &pr, 0.3);
349
350        // No PageRank data → no boost
351        assert!((results[0].1 - 0.5).abs() < f32::EPSILON);
352    }
353}