ripvec-core 3.1.1

Semantic code + document search engine. Cacheless static-embedding + cross-encoder rerank by default; optional ModernBERT/BGE transformer engines with GPU backends. Tree-sitter chunking, hybrid BM25 + PageRank, composable ranking layers.
Documentation
//! Hybrid semantic + keyword search with Reciprocal Rank Fusion (RRF).
//!
//! Search mode enum + helper functions for PageRank boosting and lookup
//! used by the ripvec engine. Pre-v3.0.0 this also contained `HybridIndex`
//! (the transformer-engine search index), which is now gone.

use std::collections::HashMap;
use std::fmt;
use std::str::FromStr;

use crate::chunk::CodeChunk;

/// Retrieval strategy selected for a search request.
///
/// Each variant's textual form (see the `Display` impl) is its lowercase
/// name. The default strategy is [`SearchMode::Hybrid`].
#[derive(Debug, Clone, Copy, Default, PartialEq, Eq)]
pub enum SearchMode {
    /// Semantic (vector) and keyword (BM25) results fused via RRF.
    #[default]
    Hybrid,
    /// Ranking by dense-vector cosine similarity alone.
    Semantic,
    /// Ranking by BM25 keyword score alone.
    Keyword,
}

impl fmt::Display for SearchMode {
    /// Writes the lowercase mode name: `hybrid`, `semantic`, or `keyword`.
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        let label = match *self {
            Self::Hybrid => "hybrid",
            Self::Semantic => "semantic",
            Self::Keyword => "keyword",
        };
        f.write_str(label)
    }
}

/// Error produced when a string does not name any `SearchMode`.
///
/// Wraps the rejected input so the message can echo it back to the user.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct ParseSearchModeError(String);

impl fmt::Display for ParseSearchModeError {
    /// Reports the rejected input (debug-quoted) followed by the accepted
    /// mode names.
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        let Self(rejected) = self;
        write!(
            f,
            "unknown search mode {rejected:?}; expected hybrid, semantic, or keyword"
        )
    }
}

impl std::error::Error for ParseSearchModeError {}

impl FromStr for SearchMode {
    type Err = ParseSearchModeError;

    fn from_str(s: &str) -> Result<Self, Self::Err> {
        match s {
            "hybrid" => Ok(Self::Hybrid),
            "semantic" => Ok(Self::Semantic),
            "keyword" => Ok(Self::Keyword),
            other => Err(ParseSearchModeError(other.to_string())),
        }
    }
}

/// Multiplicative score boost for a chunk at the given PageRank percentile.
///
/// Computes `1 + alpha * sigmoid((p - 0.5) / PAGERANK_SIGMOID_STEEPNESS)`,
/// where `p` is `percentile` clamped to `[0, 1]`. For a finite positive
/// `alpha` the result lies strictly between `1` and `1 + alpha`.
///
/// Returns exactly `1.0` (no boost) when:
///
/// - `percentile <= 0.0` — this includes the `0.0` that `lookup_rank`
///   falls back to when no key matches, so unknown files are left
///   untouched;
/// - `alpha <= 0.0` — boosting is disabled;
/// - either argument is `NaN` — a `NaN` factor would otherwise turn the
///   multiplied score into `NaN`, which `f32::total_cmp` orders above every
///   finite score in a descending sort.
#[must_use]
pub fn pagerank_boost_factor(percentile: f32, alpha: f32) -> f32 {
    // `NaN <= 0.0` is false, so NaN must be rejected explicitly before the
    // ordinary "no boost" guards.
    if percentile.is_nan() || alpha.is_nan() {
        return 1.0;
    }
    if percentile <= 0.0 || alpha <= 0.0 {
        return 1.0;
    }
    // Center on the median so the sigmoid's transition sits at p = 0.5.
    let z = (percentile.clamp(0.0, 1.0) - 0.5) / PAGERANK_SIGMOID_STEEPNESS;
    let sigmoid = 1.0 / (1.0 + (-z).exp());
    1.0 + alpha * sigmoid
}

/// Multiply each result's score by its PageRank boost, then re-rank.
///
/// Every `(chunk index, score)` pair in `results` is resolved to its chunk
/// in `chunks`; the chunk's percentile is looked up in `pagerank_by_file`
/// and the score is scaled by [`pagerank_boost_factor`]. Pairs whose index
/// is out of range for `chunks` keep their score unchanged.
///
/// Afterwards `results` is sorted by descending score, with ties broken by
/// ascending chunk index.
///
/// `pagerank_by_file` maps relative file paths (and `file_path::name`
/// definition keys) to their **PageRank percentile** in the corpus
/// distribution — not the raw rank value. Build it via
/// [`pagerank_lookup`], which switched to percentile in service of the
/// sigmoid curve.
///
/// `alpha` controls the maximum boost (ceiling = `1 + alpha`). The `alpha`
/// field from [`RepoGraph`](crate::repo_map::RepoGraph) is recommended
/// (auto-tuned from graph density).
pub fn boost_with_pagerank<S: std::hash::BuildHasher>(
    results: &mut [(usize, f32)],
    chunks: &[CodeChunk],
    pagerank_by_file: &HashMap<String, f32, S>,
    alpha: f32,
) {
    // This takes `&mut [_]` rather than `&mut Vec<_>`, so it cannot simply
    // delegate to `crate::ranking::PageRankBoost::apply` (that trait method
    // takes `&mut Vec` to allow truncation layers). The boost loop is
    // repeated here instead; both paths go through `lookup_rank` and
    // `pagerank_boost_factor`, which keeps the curve consistent.
    for (chunk_idx, score) in results.iter_mut() {
        let Some(chunk) = chunks.get(*chunk_idx) else {
            continue;
        };
        let percentile = lookup_rank(pagerank_by_file, &chunk.file_path, &chunk.name);
        *score *= pagerank_boost_factor(percentile, alpha);
    }
    // Highest score first; equal scores fall back to the lower chunk index.
    results.sort_unstable_by(|&(left_idx, left_score), &(right_idx, right_score)| {
        right_score
            .total_cmp(&left_score)
            .then_with(|| left_idx.cmp(&right_idx))
    });
}

/// Build a PageRank percentile lookup table from a `RepoGraph`.
///
/// Returns a map containing one `file_path -> percentile` entry per file
/// that has a base rank, plus one `file_path::name -> percentile` entry per
/// definition that has a definition rank. Percentiles are in `[0, 1]`.
///
/// Files or definitions whose offset or rank is missing from the graph's
/// tables are skipped rather than causing a panic.
#[must_use]
pub fn pagerank_lookup(graph: &crate::repo_map::RepoGraph) -> HashMap<String, f32> {
    // Switched from `rank / max_rank` (proportional) to percentile in
    // the corpus distribution. Rationale: a top-K result set typically
    // contains files whose raw ranks are all in a tiny band near zero
    // (Tokio: max in top-10 was 0.028 out of 1.0). Proportional
    // normalization gave uniformly tiny boosts. Percentile separates
    // "bottom decile (tests, leaves)" from "top half (impls, hubs)"
    // crisply, and pairs with the sigmoid in `pagerank_boost_factor`
    // to put the rank-transition where the action is.
    //
    // Definition-level and file-level percentiles use independent
    // distributions: `def_ranks` and `base_ranks`. A file that has no
    // defs still gets a file-level percentile from `base_ranks`.
    let def_pct = make_percentile_fn(&graph.def_ranks);
    let base_pct = make_percentile_fn(&graph.base_ranks);
    // Upper bound on the entry count: one per ranked def plus one per
    // ranked file (duplicate keys collapse).
    let mut map = HashMap::with_capacity(graph.def_ranks.len() + graph.base_ranks.len());
    for (file_idx, file) in graph.files.iter().enumerate() {
        // `def_offsets[file_idx]` is where this file's defs start in the
        // flat `def_ranks` table. Checked access keeps this consistent with
        // the defensive `def_ranks` / `base_ranks` lookups below.
        if let Some(&first_def) = graph.def_offsets.get(file_idx) {
            for (def_idx, def) in file.defs.iter().enumerate() {
                if let Some(&rank) = graph.def_ranks.get(first_def + def_idx) {
                    let key = format!("{}::{}", file.path, def.name);
                    map.insert(key, def_pct(rank));
                }
            }
        }
        if let Some(&rank) = graph.base_ranks.get(file_idx) {
            map.insert(file.path.clone(), base_pct(rank));
        }
    }
    map
}

/// Turn a slice of rank values into a `value → percentile` closure.
///
/// The finite values are copied and sorted once up front; each call to the
/// returned closure then does a binary search over that sorted copy and
/// yields the empirical CDF, i.e. the fraction of stored values strictly
/// less than the queried value.
///
/// Non-finite inputs (`NaN`, `±∞`) are dropped before sorting. If nothing
/// remains the closure always returns `0.0`; a `NaN` query also returns
/// `0.0` because no stored value compares less than it.
fn make_percentile_fn(values: &[f32]) -> impl Fn(f32) -> f32 + '_ {
    let mut finite = values.to_vec();
    finite.retain(|v| v.is_finite());
    finite.sort_unstable_by(f32::total_cmp);
    move |value: f32| match finite.len() {
        0 => 0.0,
        total => {
            // The predicate is `<`, so the partition point is the number of
            // stored values strictly below `value`.
            let below = finite.partition_point(|&v| v < value);
            #[expect(
                clippy::cast_precision_loss,
                reason = "rank counts well below f32 precision threshold"
            )]
            let pct = below as f32 / total as f32;
            pct
        }
    }
}

// -----------------------------------------------------------------------------
// Helpers retained after the v3.0.0 removal of `HybridIndex`
// -----------------------------------------------------------------------------

/// Steepness of the PageRank percentile -> boost sigmoid.
///
/// `pagerank_boost_factor` divides the centered percentile
/// (`percentile - 0.5`) by this value before applying the sigmoid, so it
/// controls how steeply the curve transitions from "no boost" (low
/// percentile) to "max boost" (high percentile). A smaller value produces
/// a sharper transition centered at the median.
const PAGERANK_SIGMOID_STEEPNESS: f32 = 0.15;

/// Look up the PageRank value for a chunk, with cascading fallbacks.
///
/// Crate-visible entry point that forwards to `lookup_rank`. That helper
/// tries the `"{file_path}::{name}"` definition key first, then the bare
/// file path, then repeats both probes after stripping leading path
/// components one at a time until a key matches.
///
/// Returns the value stored in `pr` for the first matching key, or `0.0`
/// if no key matches.
#[must_use]
pub(crate) fn lookup_rank_for_chunk<S: std::hash::BuildHasher>(
    pr: &HashMap<String, f32, S>,
    file_path: &str,
    name: &str,
) -> f32 {
    lookup_rank(pr, file_path, name)
}

/// Resolve the stored PageRank value for `file_path` / `name`.
///
/// Probes, in order: the `"{file_path}::{name}"` definition key, then the
/// bare `file_path`. If neither is present, the leading path component (up
/// to and including the first `/`) is stripped and both probes are retried
/// on the remainder, repeating until a key matches or nothing is left.
///
/// Returns `0.0` when no key matches.
#[must_use]
fn lookup_rank<S: std::hash::BuildHasher>(
    pr: &HashMap<String, f32, S>,
    file_path: &str,
    name: &str,
) -> f32 {
    // Definition-level key wins over the file-level key for the same path.
    let probe = |path: &str| -> Option<f32> {
        pr.get(&format!("{path}::{name}"))
            .or_else(|| pr.get(path))
            .copied()
    };

    if let Some(hit) = probe(file_path) {
        return hit;
    }

    let mut remaining = file_path;
    while let Some((_, suffix)) = remaining.split_once('/') {
        // A trailing `/` leaves nothing to look up.
        if suffix.is_empty() {
            break;
        }
        if let Some(hit) = probe(suffix) {
            return hit;
        }
        remaining = suffix;
    }
    0.0
}

#[cfg(test)]
mod tests {
    use super::*;

    /// Ten-line `function` chunk fixture with empty content; only the file
    /// path and name differ between tests.
    fn chunk(file_path: &'static str, name: &'static str) -> CodeChunk {
        CodeChunk {
            file_path: file_path.into(),
            name: name.into(),
            kind: "function".into(),
            start_line: 1,
            end_line: 10,
            content: String::new(),
            enriched_content: String::new(),
        }
    }

    #[test]
    fn search_mode_roundtrip() {
        let accepted = [
            ("hybrid", SearchMode::Hybrid),
            ("semantic", SearchMode::Semantic),
            ("keyword", SearchMode::Keyword),
        ];
        for (text, mode) in accepted {
            assert_eq!(text.parse::<SearchMode>().unwrap(), mode);
        }

        let rejected = "invalid".parse::<SearchMode>();
        assert!(rejected.is_err(), "expected parse error for 'invalid'");
        let message = rejected.unwrap_err().to_string();
        assert!(
            message.contains("invalid"),
            "error message should echo the bad input"
        );
    }

    #[test]
    fn search_mode_display() {
        let rendered = [
            SearchMode::Hybrid,
            SearchMode::Semantic,
            SearchMode::Keyword,
        ]
        .map(|mode| mode.to_string());
        assert_eq!(rendered, ["hybrid", "semantic", "keyword"]);
    }

    #[test]
    fn pagerank_boost_amplifies_relevant() {
        let chunks = [chunk("important.rs", "a"), chunk("obscure.rs", "b")];
        let pr = HashMap::from([
            ("important.rs".to_string(), 1.0), // max PageRank
            ("obscure.rs".to_string(), 0.1),   // low PageRank
        ]);

        // Identical starting scores: only PageRank can separate them.
        let mut results = vec![(0, 0.8_f32), (1, 0.8)];
        boost_with_pagerank(&mut results, &chunks, &pr, 0.3);

        let (first, second) = (results[0], results[1]);

        // important.rs should now rank higher
        assert_eq!(first.0, 0, "important.rs should rank first after boost");
        assert!(first.1 > second.1);

        // Expected values follow the sigmoid-on-percentile curve in
        // `pagerank_boost_factor` with alpha=0.3:
        // - percentile=1.0: sigmoid(3.33) ≈ 0.965, boost ≈ 1.29 → 1.032
        // - percentile=0.1: sigmoid(-2.67) ≈ 0.065, boost ≈ 1.02 → 0.816
        assert!(
            (first.1 - 1.032).abs() < 0.01,
            "rank=1.0 boost: expected ~1.032, got {}",
            first.1
        );
        assert!(
            (second.1 - 0.816).abs() < 0.01,
            "rank=0.1 boost: expected ~0.816, got {}",
            second.1
        );
    }

    #[test]
    fn pagerank_boost_zero_relevance_stays_zero() {
        let chunks = [chunk("important.rs", "a")];
        let pr = HashMap::from([("important.rs".to_string(), 1.0)]);

        let mut results = vec![(0, 0.0_f32)];
        boost_with_pagerank(&mut results, &chunks, &pr, 0.3);

        // A multiplicative boost cannot lift a zero score.
        let boosted = results[0].1;
        assert!(boosted.abs() < f32::EPSILON);
    }

    #[test]
    fn pagerank_boost_unknown_file_no_effect() {
        let chunks = [chunk("unknown.rs", "a")];
        let pr = HashMap::new(); // empty — no PageRank data

        let mut results = vec![(0, 0.5_f32)];
        boost_with_pagerank(&mut results, &chunks, &pr, 0.3);

        // No PageRank data → no boost
        let boosted = results[0].1;
        assert!((boosted - 0.5).abs() < f32::EPSILON);
    }
}