ripvec_core/hybrid.rs
1//! Hybrid semantic + keyword search with Reciprocal Rank Fusion (RRF).
2//!
3//! [`HybridIndex`] wraps a [`SearchIndex`] (dense vector search) and a
4//! [`Bm25Index`] (BM25 keyword search) and fuses their ranked results via
5//! Reciprocal Rank Fusion so that chunks appearing high in either list
6//! bubble to the top of the combined ranking.
7
8use std::collections::HashMap;
9use std::fmt;
10use std::str::FromStr;
11
12use crate::bm25::Bm25Index;
13use crate::chunk::CodeChunk;
14use crate::index::SearchIndex;
15
/// Retrieval strategy selector for a search call.
#[derive(Debug, Clone, Copy, Default, PartialEq, Eq)]
pub enum SearchMode {
    /// Run both retrievers and merge their rankings with Reciprocal Rank
    /// Fusion. This is the default mode.
    #[default]
    Hybrid,
    /// Rank by dense-vector cosine similarity alone.
    Semantic,
    /// Rank by BM25 keyword relevance alone.
    Keyword,
}

impl fmt::Display for SearchMode {
    /// Writes the lowercase mode name, which is exactly the spelling the
    /// [`FromStr`] implementation accepts, so formatting and parsing
    /// round-trip.
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        let label = match self {
            Self::Hybrid => "hybrid",
            Self::Semantic => "semantic",
            Self::Keyword => "keyword",
        };
        f.write_str(label)
    }
}

/// Error produced when text does not name a [`SearchMode`].
///
/// Holds the rejected input so the message can echo it back to the user.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct ParseSearchModeError(String);

impl fmt::Display for ParseSearchModeError {
    /// Reports the offending input (debug-quoted) together with the list
    /// of accepted mode names.
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        f.write_fmt(format_args!(
            "unknown search mode {:?}; expected hybrid, semantic, or keyword",
            self.0
        ))
    }
}

impl std::error::Error for ParseSearchModeError {}

impl FromStr for SearchMode {
    type Err = ParseSearchModeError;

    /// Parses one of the exact lowercase names `hybrid`, `semantic`, or
    /// `keyword`.
    ///
    /// # Errors
    ///
    /// Any other input (including different casing or surrounding
    /// whitespace) yields a [`ParseSearchModeError`] carrying that input.
    fn from_str(s: &str) -> Result<Self, Self::Err> {
        let mode = match s {
            "hybrid" => Self::Hybrid,
            "semantic" => Self::Semantic,
            "keyword" => Self::Keyword,
            unrecognized => return Err(ParseSearchModeError(unrecognized.to_owned())),
        };
        Ok(mode)
    }
}
66
67/// Combined semantic + keyword search index with RRF fusion.
68///
69/// Build once from chunks and pre-computed embeddings; query repeatedly
70/// via [`search`](Self::search).
71pub struct HybridIndex {
72 /// Semantic (dense vector) search index.
73 pub semantic: SearchIndex,
74 /// BM25 keyword search index.
75 bm25: Bm25Index,
76}
77
78impl HybridIndex {
79 /// Build a `HybridIndex` from raw chunks and their pre-computed embeddings.
80 ///
81 /// Constructs both the [`SearchIndex`] and [`Bm25Index`] in one call.
82 /// `cascade_dim` is forwarded to [`SearchIndex::new`] for optional MRL
83 /// cascade pre-filtering.
84 ///
85 /// # Errors
86 ///
87 /// Returns an error if the BM25 index cannot be built (e.g., tantivy
88 /// schema or writer failure).
89 pub fn new(
90 chunks: Vec<CodeChunk>,
91 embeddings: &[Vec<f32>],
92 cascade_dim: Option<usize>,
93 ) -> crate::Result<Self> {
94 let bm25 = Bm25Index::build(&chunks)?;
95 let semantic = SearchIndex::new(chunks, embeddings, cascade_dim);
96 Ok(Self { semantic, bm25 })
97 }
98
99 /// Assemble a `HybridIndex` from pre-built components.
100 ///
101 /// Useful when the caller has already constructed the sub-indices
102 /// separately (e.g., loaded from a cache).
103 #[must_use]
104 pub fn from_parts(semantic: SearchIndex, bm25: Bm25Index) -> Self {
105 Self { semantic, bm25 }
106 }
107
108 /// Search the index and return `(chunk_index, score)` pairs.
109 ///
110 /// Dispatches based on `mode`:
111 /// - [`SearchMode::Semantic`] — pure dense vector search via
112 /// [`SearchIndex::rank`].
113 /// - [`SearchMode::Keyword`] — pure BM25 keyword search, truncated to
114 /// `top_k`.
115 /// - [`SearchMode::Hybrid`] — retrieves both ranked lists, fuses them
116 /// with [`rrf_fuse`], then truncates to `top_k`.
117 ///
118 /// Scores are min-max normalized to `[0, 1]` regardless of mode, so
119 /// a threshold of 0.5 always means "above midpoint of the score range"
120 /// whether the underlying scores are cosine similarity, BM25, or RRF.
121 #[must_use]
122 pub fn search(
123 &self,
124 query_embedding: &[f32],
125 query_text: &str,
126 top_k: usize,
127 threshold: f32,
128 mode: SearchMode,
129 ) -> Vec<(usize, f32)> {
130 let mut raw = match mode {
131 SearchMode::Semantic => {
132 // Fetch more than top_k so normalization has a meaningful range.
133 self.semantic
134 .rank_turboquant(query_embedding, top_k.max(100), 0.0)
135 }
136 SearchMode::Keyword => self.bm25.search(query_text, top_k.max(100)),
137 SearchMode::Hybrid => {
138 let sem = self
139 .semantic
140 .rank_turboquant(query_embedding, top_k.max(100), 0.0);
141 let kw = self.bm25.search(query_text, top_k.max(100));
142 rrf_fuse(&sem, &kw, 60.0)
143 }
144 };
145
146 // Min-max normalize scores to [0, 1] so threshold is model-agnostic.
147 if let (Some(max), Some(min)) = (raw.first().map(|(_, s)| *s), raw.last().map(|(_, s)| *s))
148 {
149 let range = max - min;
150 if range > f32::EPSILON {
151 for (_, score) in &mut raw {
152 *score = (*score - min) / range;
153 }
154 } else {
155 // All scores identical — normalize to 1.0
156 for (_, score) in &mut raw {
157 *score = 1.0;
158 }
159 }
160 }
161
162 // Apply threshold on normalized scores, then truncate
163 raw.retain(|(_, score)| *score >= threshold);
164 raw.truncate(top_k);
165 raw
166 }
167
168 /// All chunks in the index.
169 #[must_use]
170 pub fn chunks(&self) -> &[CodeChunk] {
171 &self.semantic.chunks
172 }
173}
174
175impl crate::searchable::SearchableIndex for HybridIndex {
176 fn chunks(&self) -> &[CodeChunk] {
177 HybridIndex::chunks(self)
178 }
179
180 fn search(&self, query_text: &str, top_k: usize, mode: SearchMode) -> Vec<(usize, f32)> {
181 // Trait-shape search: no caller-supplied query embedding.
182 // BM25 handles text directly. Semantic and hybrid modes would
183 // require an embedded query vector — without one they would
184 // search against a zero vector, which matches nothing
185 // useful — so we route all three modes through keyword. The
186 // caller wanting semantic results should use
187 // `search_from_chunk` (the canonical goto-definition pattern)
188 // which supplies the source chunk's embedding.
189 let _ = mode;
190 HybridIndex::search(self, &[], query_text, top_k, 0.0, SearchMode::Keyword)
191 }
192
193 fn search_from_chunk(
194 &self,
195 chunk_idx: usize,
196 query_text: &str,
197 top_k: usize,
198 mode: SearchMode,
199 ) -> Vec<(usize, f32)> {
200 let embedding = self.semantic.embedding(chunk_idx).unwrap_or_default();
201 let effective_mode = if embedding.is_empty() {
202 SearchMode::Keyword
203 } else {
204 mode
205 };
206 HybridIndex::search(self, &embedding, query_text, top_k, 0.0, effective_mode)
207 }
208
209 fn as_any(&self) -> &dyn std::any::Any {
210 self
211 }
212}
213
/// Merge two ranked lists with Reciprocal Rank Fusion.
///
/// Only the order of each input matters: entries are
/// `(chunk_index, _score)` and the score component is ignored. A chunk
/// at 0-based position `rank` in a list contributes
/// `1 / (k + rank + 1)`; a chunk's fused score is the sum of its
/// contributions across both lists.
///
/// The result contains every chunk present in either list, ordered by
/// descending fused score with ties broken by ascending chunk index.
///
/// `k` is conventionally 60.0; it damps the advantage of the very top
/// positions.
#[must_use]
pub fn rrf_fuse(semantic: &[(usize, f32)], bm25: &[(usize, f32)], k: f32) -> Vec<(usize, f32)> {
    let mut fused: HashMap<usize, f32> = HashMap::new();

    // Positions restart at zero for the second list because each list is
    // enumerated on its own before chaining.
    let ranked = semantic
        .iter()
        .enumerate()
        .chain(bm25.iter().enumerate());
    for (position, (chunk, _)) in ranked {
        let contribution = 1.0 / (k + position as f32 + 1.0);
        *fused.entry(*chunk).or_insert(0.0) += contribution;
    }

    let mut ordered: Vec<(usize, f32)> = fused.into_iter().collect();
    ordered.sort_unstable_by(|lhs, rhs| match rhs.1.total_cmp(&lhs.1) {
        // Equal scores: fall back to chunk index for a deterministic order.
        std::cmp::Ordering::Equal => lhs.0.cmp(&rhs.0),
        decided => decided,
    });
    ordered
}
242
/// Sigmoid steepness for the PageRank percentile boost. The percentile's
/// distance from the median is divided by this value, so lower values
/// produce a sharper transition between "below median" (low boost) and
/// "above median" (full boost).
const PAGERANK_SIGMOID_STEEPNESS: f32 = 0.15;

/// Sigmoid-shaped multiplicative boost factor for a single PageRank
/// **percentile** in the corpus (not the raw rank value).
///
/// Returns the multiplier, so the final score is `dense_score * factor`.
///
/// ```text
/// factor     = 1 + alpha * sigmoid((percentile - 0.5) / s)
/// sigmoid(z) = 1 / (1 + exp(-z))
/// ```
///
/// where `s = PAGERANK_SIGMOID_STEEPNESS` and `percentile` is clamped to
/// `[0, 1]` before use.
///
/// ## Neutral inputs
///
/// The factor is exactly `1.0` (no boost) when `percentile <= 0`, when
/// `alpha <= 0`, or when either argument is `NaN`. A zero percentile is
/// what a failed PageRank lookup yields, so unknown files are left
/// untouched. The `NaN` case is handled explicitly: `NaN` fails every
/// ordered comparison, so without the guard it would flow through the
/// formula and turn the boosted score into `NaN`.
///
/// ## Why this shape, with examples
///
/// The first attempt used a logarithmic saturation curve on raw rank
/// values. That failed because raw ranks in a top-K result set
/// concentrate in a tiny band (max ≈ 0.028 in Tokio), producing
/// uniformly tiny boosts. The next attempt added a "presence floor"
/// for `rank > 0`, which failed because tests also have tiny-but-
/// positive PR from PageRank's damping term — both impl and test
/// cleared the floor equally.
///
/// Switching the input to **percentile in the corpus** fixes both
/// pathologies. A test with no inbound edges sits in the bottom decile
/// of the PR distribution (percentile ≈ 0.05); a typical
/// implementation file sits above the median. The sigmoid then makes
/// the transition between "below median" (almost no boost) and
/// "above median" (near-full boost) sharp:
///
/// | percentile       | sigmoid | boost (α=0.5) |
/// |------------------|---------|---------------|
/// | 0.05 (low test)  | 0.05    | 1.02×         |
/// | 0.30             | 0.21    | 1.10×         |
/// | 0.50 (median)    | 0.50    | 1.25×         |
/// | 0.70             | 0.79    | 1.40×         |
/// | 0.95 (top impl)  | 0.95    | 1.48×         |
///
/// Ceiling at `1 + α` — with `α = 0.5` that's 1.5×, bounded enough to
/// keep PageRank a tiebreaker rather than a dominator: an irrelevant
/// top-PR file with dense score 0.6 gets `0.6 × 1.5 = 0.9` and still
/// loses to a relevant low-PR file scoring above 0.9.
///
/// This matches the two design constraints:
/// 1. A test (low percentile) should not be lifted above an impl
///    (high percentile) on similar dense scores. Sigmoid centered at
///    0.5 makes "below median" almost-no-boost.
/// 2. A heavily-imported file shouldn't dominate. The sigmoid plateau
///    above `percentile > 0.85` means a singularly-popular file gets
///    barely more boost than a moderately-popular one.
#[must_use]
pub fn pagerank_boost_factor(percentile: f32, alpha: f32) -> f32 {
    // `NaN <= 0.0` is false, so the ordered guards alone would let NaN
    // through and poison the score; reject it up front.
    if percentile.is_nan() || alpha.is_nan() {
        return 1.0;
    }
    if percentile <= 0.0 || alpha <= 0.0 {
        return 1.0;
    }
    // Distance from the median, scaled so the transition is sharp.
    let z = (percentile.clamp(0.0, 1.0) - 0.5) / PAGERANK_SIGMOID_STEEPNESS;
    let sigmoid = 1.0 / (1.0 + (-z).exp());
    1.0 + alpha * sigmoid
}
306
307/// Apply a multiplicative PageRank boost to search results.
308///
309/// For each result, looks up the chunk's PageRank percentile and applies
310/// the sigmoid boost from [`pagerank_boost_factor`].
311///
312/// Results are re-sorted after boosting.
313///
314/// `pagerank_by_file` maps relative file paths to their **PageRank
315/// percentile** in the corpus distribution — not the raw rank value.
316/// Build it via [`pagerank_lookup`], which switched to percentile in
317/// service of the sigmoid curve.
318///
319/// `alpha` controls the maximum boost (ceiling = `1 + alpha`). The
320/// `alpha` field from [`RepoGraph`] is recommended (auto-tuned from
321/// graph density).
322pub fn boost_with_pagerank<S: std::hash::BuildHasher>(
323 results: &mut [(usize, f32)],
324 chunks: &[CodeChunk],
325 pagerank_by_file: &HashMap<String, f32, S>,
326 alpha: f32,
327) {
328 // Operates on `&mut [_]` (not `&mut Vec<_>`) so we can't delegate
329 // to `crate::ranking::PageRankBoost::apply` directly (the trait
330 // method takes `&mut Vec` to allow truncation layers). Replicate
331 // the boost loop inline; both paths share `lookup_rank` +
332 // `pagerank_boost_factor` so the curve stays consistent.
333 for (idx, score) in results.iter_mut() {
334 if let Some(chunk) = chunks.get(*idx) {
335 let rank = lookup_rank(pagerank_by_file, &chunk.file_path, &chunk.name);
336 *score *= pagerank_boost_factor(rank, alpha);
337 }
338 }
339 results.sort_unstable_by(|a, b| b.1.total_cmp(&a.1).then_with(|| a.0.cmp(&b.0)));
340}
341
342/// `boost_with_pagerank` variant that operates on `SearchResult` directly,
343/// for callers that don't have the raw `(usize, f32)` pair at hand.
344///
345/// Same boost math as [`boost_with_pagerank`]; re-sorts in place.
346pub fn boost_with_pagerank_results<S: std::hash::BuildHasher>(
347 results: &mut [crate::embed::SearchResult],
348 pagerank_by_file: &HashMap<String, f32, S>,
349 alpha: f32,
350) {
351 // SearchResult shape; inline math like `boost_with_pagerank`.
352 for r in results.iter_mut() {
353 let rank = lookup_rank(pagerank_by_file, &r.chunk.file_path, &r.chunk.name);
354 r.similarity *= pagerank_boost_factor(rank, alpha);
355 }
356 results.sort_unstable_by(|a, b| b.similarity.total_cmp(&a.similarity));
357}
358
/// Crate-visible alias of [`lookup_rank`] for the [`crate::ranking`]
/// module.
///
/// Kept `pub(crate)` so it does not leak into the public surface; the
/// canonical access point is [`crate::ranking::PageRankBoost`]. Behaves
/// exactly like [`lookup_rank`].
pub(crate) fn lookup_rank_for_chunk<S: std::hash::BuildHasher>(
    pr: &HashMap<String, f32, S>,
    file_path: &str,
    name: &str,
) -> f32 {
    lookup_rank(pr, file_path, name)
}

/// Resolve a chunk's PageRank value from a path that may be rooted
/// differently than the graph keys. Returns `0.0` when nothing matches.
///
/// Background: `RepoGraph` stores `FileNode.path` as
/// `path.strip_prefix(root)` where `root` is the **canonicalized** corpus
/// root. Chunk `file_path` is `path.display()` where `path` came from the
/// walker, which uses the caller-supplied root **as-is** (not
/// canonicalized). When the caller passes `tests/corpus/code/tokio`,
/// chunk paths look like `tests/corpus/code/tokio/tokio/src/.../foo.rs`
/// while graph keys look like `tokio/src/.../foo.rs`, so a direct lookup
/// never hits.
///
/// Lookup order, first match wins:
/// 1. the full path, definition-level key (`"file::name"`), then
///    file-level key;
/// 2. the same two probes for each suffix obtained by dropping leading
///    path segments one at a time (`a/b/foo.rs` → `b/foo.rs` → `foo.rs`).
///
/// Only `/` is treated as a separator, and an empty suffix (path ending
/// in `/`) ends the search.
///
/// (The proper fix is to normalize chunk paths at chunk-creation time to
/// be relative to the canonicalized corpus root; that's a larger refactor
/// planned alongside the `RankingLayer` work. Suffix matching is the
/// surgical patch that makes PageRank actually function.)
fn lookup_rank<S: std::hash::BuildHasher>(
    pr: &HashMap<String, f32, S>,
    file_path: &str,
    name: &str,
) -> f32 {
    // One candidate path: definition-level key first, file-level second.
    let probe = |candidate: &str| -> Option<f32> {
        pr.get(&format!("{candidate}::{name}"))
            .or_else(|| pr.get(candidate))
            .copied()
    };

    if let Some(hit) = probe(file_path) {
        return hit;
    }

    // Peel one leading segment per iteration. Paths are typically only a
    // handful of segments deep, so the repeated probing is cheap.
    let mut tail = file_path;
    while let Some((_, shorter)) = tail.split_once('/') {
        if shorter.is_empty() {
            break;
        }
        if let Some(hit) = probe(shorter) {
            return hit;
        }
        tail = shorter;
    }

    0.0
}
421
422/// Build a normalized PageRank lookup table from a [`RepoGraph`].
423///
424/// Returns a map from `"file_path::def_name"` to definition-level PageRank
425/// normalized to `[0, 1]`. Also inserts file-level entries (`"file_path"`)
426/// as aggregated fallback for chunks that don't match a specific definition.
427#[must_use]
428pub fn pagerank_lookup(graph: &crate::repo_map::RepoGraph) -> HashMap<String, f32> {
429 // Switched from `rank / max_rank` (proportional) to percentile in
430 // the corpus distribution. Rationale: a top-K result set typically
431 // contains files whose raw ranks are all in a tiny band near zero
432 // (Tokio: max in top-10 was 0.028 out of 1.0). Proportional
433 // normalization gave uniformly tiny boosts. Percentile separates
434 // "bottom decile (tests, leaves)" from "top half (impls, hubs)"
435 // crisply, and pairs with the sigmoid in `pagerank_boost_factor`
436 // to put the rank-transition where the action is.
437 //
438 // Definition-level and file-level percentiles use independent
439 // distributions: `def_ranks` and `base_ranks`. A file that has no
440 // defs still gets a file-level percentile from `base_ranks`.
441 let def_pct = make_percentile_fn(&graph.def_ranks);
442 let base_pct = make_percentile_fn(&graph.base_ranks);
443 let mut map = HashMap::new();
444 for (file_idx, file) in graph.files.iter().enumerate() {
445 for (def_idx, def) in file.defs.iter().enumerate() {
446 let flat = graph.def_offsets[file_idx] + def_idx;
447 if let Some(&rank) = graph.def_ranks.get(flat) {
448 let key = format!("{}::{}", file.path, def.name);
449 map.insert(key, def_pct(rank));
450 }
451 }
452 if file_idx < graph.base_ranks.len() {
453 map.insert(file.path.clone(), base_pct(graph.base_ranks[file_idx]));
454 }
455 }
456 map
457}
458
/// Turn a slice of rank values into a `value → percentile` closure.
///
/// A filtered copy of `values` (non-finite entries removed) is sorted
/// once up front; each call to the closure is then a binary search over
/// that copy. The closure returns the empirical CDF at the query: the
/// fraction of retained values **strictly less** than it, in `[0, 1]`.
///
/// Edge cases:
/// - no finite input values → the closure always returns `0.0`;
/// - a `NaN` query compares less-than nothing and therefore yields `0.0`.
fn make_percentile_fn(values: &[f32]) -> impl Fn(f32) -> f32 + '_ {
    let mut ascending = values.to_vec();
    ascending.retain(|v| v.is_finite());
    ascending.sort_unstable_by(|a, b| a.total_cmp(b));

    move |query: f32| {
        let total = ascending.len();
        if total == 0 {
            return 0.0;
        }
        // With a `<` predicate the partition point equals the number of
        // samples strictly below the query.
        let below = ascending.partition_point(|&sample| sample < query);
        #[expect(
            clippy::cast_precision_loss,
            reason = "rank counts well below f32 precision threshold"
        )]
        let fraction = below as f32 / total as f32;
        fraction
    }
}
483
#[cfg(test)]
mod tests {
    use super::*;

    /// Minimal ten-line "function" chunk with empty content; the tests
    /// below only care about the path and name.
    fn chunk(file_path: &str, name: &str) -> CodeChunk {
        CodeChunk {
            file_path: file_path.into(),
            name: name.into(),
            kind: "function".into(),
            start_line: 1,
            end_line: 10,
            content: String::new(),
            enriched_content: String::new(),
        }
    }

    /// Chunk indices of a result list, in order.
    fn indices_of(results: &[(usize, f32)]) -> Vec<usize> {
        results.iter().map(|&(idx, _)| idx).collect()
    }

    #[test]
    fn rrf_union_semantics() {
        // sem: [0, 1, 2], bm25: [3, 0, 4]
        // Chunk 0 is in both lists and must get the highest fused score;
        // chunks 1, 2, 3, 4 are in exactly one list, so all five appear.
        let sem = [(0, 0.9), (1, 0.8), (2, 0.7)];
        let bm25 = [(3, 10.0), (0, 8.0), (4, 6.0)];

        let fused = rrf_fuse(&sem, &bm25, 60.0);
        let indices = indices_of(&fused);

        // Union of both lists, nothing dropped and nothing duplicated.
        assert_eq!(fused.len(), 5);
        for expected in 0..5 {
            assert!(
                indices.contains(&expected),
                "chunk {expected} missing from fused results"
            );
        }

        // Double-list bonus puts chunk 0 on top.
        assert_eq!(indices[0], 0, "chunk 0 should rank first");
    }

    #[test]
    fn rrf_single_list() {
        // Semantic results only; the BM25 list is empty.
        let sem = [(0, 0.9), (1, 0.8)];
        let bm25: [(usize, f32); 0] = [];

        let fused = rrf_fuse(&sem, &bm25, 60.0);

        // Order follows the semantic list, with strictly decreasing scores.
        assert_eq!(fused.len(), 2);
        assert_eq!(fused[0].0, 0);
        assert_eq!(fused[1].0, 1);
        assert!(fused[0].1 > fused[1].1);
    }

    #[test]
    fn search_mode_roundtrip() {
        let accepted = [
            ("hybrid", SearchMode::Hybrid),
            ("semantic", SearchMode::Semantic),
            ("keyword", SearchMode::Keyword),
        ];
        for (text, mode) in accepted {
            assert_eq!(text.parse::<SearchMode>().unwrap(), mode);
        }

        let err = "invalid".parse::<SearchMode>();
        assert!(err.is_err(), "expected parse error for 'invalid'");
        let msg = err.unwrap_err().to_string();
        assert!(
            msg.contains("invalid"),
            "error message should echo the bad input"
        );
    }

    #[test]
    fn search_mode_display() {
        assert_eq!(SearchMode::Hybrid.to_string(), "hybrid");
        assert_eq!(SearchMode::Semantic.to_string(), "semantic");
        assert_eq!(SearchMode::Keyword.to_string(), "keyword");
    }

    #[test]
    fn pagerank_boost_amplifies_relevant() {
        let chunks = vec![chunk("important.rs", "a"), chunk("obscure.rs", "b")];

        // Identical starting scores; only the PageRank percentile differs.
        let mut results = vec![(0, 0.8_f32), (1, 0.8)];
        let pr: HashMap<String, f32> = HashMap::from([
            ("important.rs".to_string(), 1.0), // max PageRank
            ("obscure.rs".to_string(), 0.1),   // low PageRank
        ]);

        boost_with_pagerank(&mut results, &chunks, &pr, 0.3);

        // important.rs must now be ahead.
        assert_eq!(
            results[0].0, 0,
            "important.rs should rank first after boost"
        );
        assert!(results[0].1 > results[1].1);

        // Expected values from the sigmoid-on-percentile curve in
        // `pagerank_boost_factor` with alpha = 0.3:
        // - percentile=1.0: sigmoid(3.33) ≈ 0.965, boost ≈ 1.29 → 1.032
        // - percentile=0.1: sigmoid(-2.67) ≈ 0.065, boost ≈ 1.02 → 0.816
        assert!(
            (results[0].1 - 1.032).abs() < 0.01,
            "rank=1.0 boost: expected ~1.032, got {}",
            results[0].1
        );
        assert!(
            (results[1].1 - 0.816).abs() < 0.01,
            "rank=0.1 boost: expected ~0.816, got {}",
            results[1].1
        );
    }

    #[test]
    fn pagerank_boost_zero_relevance_stays_zero() {
        let chunks = vec![chunk("important.rs", "a")];

        let mut results = vec![(0, 0.0_f32)];
        let pr: HashMap<String, f32> = HashMap::from([("important.rs".to_string(), 1.0)]);

        boost_with_pagerank(&mut results, &chunks, &pr, 0.3);

        // The boost is multiplicative, so a zero score stays zero.
        assert!(results[0].1.abs() < f32::EPSILON);
    }

    #[test]
    fn pagerank_boost_unknown_file_no_effect() {
        let chunks = vec![chunk("unknown.rs", "a")];

        let mut results = vec![(0, 0.5_f32)];
        let pr: HashMap<String, f32> = HashMap::new(); // no PageRank data at all

        boost_with_pagerank(&mut results, &chunks, &pr, 0.3);

        // Lookup misses resolve to percentile 0, which means no boost.
        assert!((results[0].1 - 0.5).abs() < f32::EPSILON);
    }
}