// ripvec-core 3.1.2

//! Composable ranking layers for search results.
//!
//! ## Why this module exists
//!
//! Before this refactor, ranking logic was scattered across the call
//! sites below (five configurations), each with a bespoke layer combination:
//!
//! | Call site | PageRank | Path penalty | Threshold | TopK |
//! |---|---|---|---|---|
//! | CLI `run_oneshot` (indexed) | ❌ | ❌ | inside `search()` | inside `search()` |
//! | CLI `run_oneshot` (stateless) | ❌ | ❌ | inside `search()` | inside `search()` |
//! | MCP `search_code` | ✅ | ❌ | inside `search()` | inside `search()` |
//! | LSP nav/symbols | ✅ (α=0.3 hardcoded) | ❌ | inside `search()` | inside `search()` |
//! | `RipvecIndex::search` | ✅ (optional) | ✅ | inside reranker | inside reranker |
//!
//! Three concrete bugs landed today because of this scatter: (1)
//! PageRank silently absent from the CLI; (2) PageRank lookups hit
//! zero entries due to path-rooting mismatch — the same bug present
//! in every call site that used `boost_with_pagerank` before today's
//! fix; (3) path penalty regex matched the corpus-root prefix when
//! invoked from CWD-rooted chunk paths.
//!
//! The fix: a single [`RankingLayer`] trait that each call site
//! composes into a pipeline. Layers are independently testable, the
//! pipeline shape at each call site is explicit, and adding a new
//! ranking signal (e.g., recency, file-saturation diversification)
//! is a single new `impl RankingLayer`.
//!
//! ## Convention
//!
//! Layers operate on `Vec<(chunk_idx, score)>` with a parallel
//! `&[CodeChunk]` for metadata lookup. Layers MAY:
//!
//! - Mutate scores in place (boost / penalty layers).
//! - Reorder the vec (sort layers — most boost layers re-sort
//!   internally so downstream layers see descending order).
//! - Drop entries (threshold / topK layers).
//!
//! When a layer reorders, it MUST leave the vec sorted descending by
//! score so downstream layers (especially threshold + topK) operate
//! on a meaningful ordering.

use std::collections::HashMap;
use std::path::PathBuf;

use crate::chunk::CodeChunk;

/// A composable layer in the ranking pipeline.
///
/// Implementations operate on the full `(idx, score)` list plus the
/// canonical chunks slice. See the module-level docs for ordering
/// conventions.
///
/// The `Send + Sync` supertraits require every implementation to be
/// shareable and movable across threads.
pub trait RankingLayer: Send + Sync {
    /// Apply this layer's transformation.
    ///
    /// `items` holds `(chunk index, score)` pairs and is modified in
    /// place: a layer may rescore, reorder, or remove entries. `chunks`
    /// is the slice that the indices in `items` refer to. Nothing is
    /// returned; the result is whatever is left in `items`.
    fn apply(&self, items: &mut Vec<(usize, f32)>, chunks: &[CodeChunk]);
}

/// Run every layer in `layers` over `items`, front to back.
///
/// `items` is mutated in place and nothing is returned: after the call it
/// holds the output of the last layer. Each layer sees the list exactly as
/// the previous layer left it, so the order of `layers` matters. An empty
/// `layers` slice leaves `items` untouched.
pub fn apply_chain(
    items: &mut Vec<(usize, f32)>,
    chunks: &[CodeChunk],
    layers: &[Box<dyn RankingLayer>],
) {
    layers
        .iter()
        .for_each(|stage| stage.apply(items, chunks));
}

// ---------------------------------------------------------------------------
// PageRankBoost
// ---------------------------------------------------------------------------

/// Ranking layer that multiplies each score by a PageRank-derived factor.
///
/// The factor follows the sigmoid-on-percentile curve implemented by
/// [`crate::hybrid::pagerank_boost_factor`]. The lookup table maps
/// relative file paths to percentile values in the corpus distribution;
/// build it with [`crate::hybrid::pagerank_lookup`]. `alpha` sets the
/// maximum boost, so the largest possible factor is `1 + alpha`.
pub struct PageRankBoost {
    /// Percentile lookup, held behind an `Arc` so that building a layer
    /// per query costs a reference-count bump rather than a deep copy.
    ///
    /// History: this field used to be a plain `HashMap<String, f32>` and
    /// `RipvecIndex::apply_pagerank_layer` cloned the whole map on every
    /// query. On a 1M-chunk corpus that showed up as ~5.9 s of
    /// `String::clone` per 20-query batch (profile, samply, 2026-05-21).
    pagerank: std::sync::Arc<HashMap<String, f32>>,
    /// Boost strength; passed through to `pagerank_boost_factor`.
    alpha: f32,
}

impl PageRankBoost {
    /// Build the layer from an already-computed percentile lookup.
    ///
    /// The lookup is taken as `Arc<HashMap<..>>` so callers can compute it
    /// once at index time and pass a cheap `Arc` clone for each query.
    #[must_use]
    pub fn new(pagerank: std::sync::Arc<HashMap<String, f32>>, alpha: f32) -> Self {
        PageRankBoost { alpha, pagerank }
    }
}

impl RankingLayer for PageRankBoost {
    /// Scale each score by its chunk's boost factor, then re-sort.
    ///
    /// Entries whose index is not present in `chunks` keep their score
    /// unchanged. The final order is descending by score, with ties broken
    /// by ascending chunk index.
    fn apply(&self, items: &mut Vec<(usize, f32)>, chunks: &[CodeChunk]) {
        for entry in items.iter_mut() {
            // Unknown index: nothing to look up, leave the score alone.
            let Some(chunk) = chunks.get(entry.0) else {
                continue;
            };
            let file_rank = crate::hybrid::lookup_rank_for_chunk(
                &self.pagerank,
                &chunk.file_path,
                &chunk.name,
            );
            let factor = crate::hybrid::pagerank_boost_factor(file_rank, self.alpha);
            entry.1 *= factor;
        }
        // Downstream layers expect best-first order.
        items.sort_unstable_by(|lhs, rhs| {
            rhs.1
                .total_cmp(&lhs.1)
                .then_with(|| lhs.0.cmp(&rhs.0))
        });
    }
}

// ---------------------------------------------------------------------------
// PathPenalty
// ---------------------------------------------------------------------------

/// Ranking layer that scales scores down based on the shape of the file
/// path (test files, examples, and similar).
///
/// This is a thin wrapper around
/// [`crate::encoder::ripvec::penalties::file_path_penalty`]. Before the
/// path is handed to that function, `corpus_root` is removed from its
/// front so the test/examples/d.ts regexes see a repo-relative path.
/// Without that step a corpus stored under e.g. `tests/corpus/code/X`
/// would trip `test_dir_re` for every single chunk.
///
/// **Not used in the default BERT pipeline.** Path-name heuristics are
/// brittle and hide intent from the user; the "structural importance"
/// signal is carried by the percentile-based PageRank boost instead. The
/// layer is kept so the semble pipeline stays in parity with the Python
/// upstream reference implementation.
pub struct PathPenalty {
    /// Prefix removed from each chunk path before the penalty is computed.
    corpus_root: PathBuf,
}

impl PathPenalty {
    /// Build the layer for a corpus rooted at `corpus_root`.
    #[must_use]
    pub fn new(corpus_root: PathBuf) -> Self {
        PathPenalty { corpus_root }
    }
}

impl RankingLayer for PathPenalty {
    /// Multiply each score by the path penalty of its chunk's file, then
    /// re-sort descending by score (ties broken by ascending chunk index).
    ///
    /// The penalty is computed on the chunk path with `corpus_root`
    /// removed. Entries whose index is not present in `chunks` keep their
    /// score unchanged.
    fn apply(&self, items: &mut Vec<(usize, f32)>, chunks: &[CodeChunk]) {
        // Remove `root` from the front of `path`, but only when the match
        // ends on a path-component boundary. A plain string prefix strip
        // would turn `/repo-tests/a.rs` into `-tests/a.rs` for root
        // `/repo`; such a path is not under the root and is returned as is.
        // An empty root matches everything and only strips leading slashes.
        fn strip_root<'a>(path: &'a str, root: &str) -> &'a str {
            match path.strip_prefix(root) {
                Some(rest) if root.is_empty() || rest.is_empty() || rest.starts_with('/') => {
                    rest.trim_start_matches('/')
                }
                _ => path,
            }
        }

        let root_lossy = self.corpus_root.to_string_lossy();
        // Ignore trailing slashes so `/repo` and `/repo/` behave the same.
        let root = root_lossy.trim_end_matches('/');
        for (idx, score) in items.iter_mut() {
            let Some(chunk) = chunks.get(*idx) else {
                continue;
            };
            let rel = strip_root(&chunk.file_path, root);
            let penalty = crate::encoder::ripvec::penalties::file_path_penalty(rel);
            *score *= penalty;
        }
        // Downstream layers expect best-first order.
        items.sort_unstable_by(|a, b| b.1.total_cmp(&a.1).then_with(|| a.0.cmp(&b.0)));
    }
}

// ---------------------------------------------------------------------------
// Threshold + TopK
// ---------------------------------------------------------------------------

/// Drop items with score below `min_score`. Preserves ordering.
///
/// The bound is inclusive: an item whose score equals `min_score` is
/// kept. A NaN score never satisfies `score >= min_score` and is dropped.
#[derive(Debug, Clone, Copy, PartialEq)]
pub struct Threshold {
    /// Smallest score that survives this layer.
    pub min_score: f32,
}

impl RankingLayer for Threshold {
    /// Keep only the entries scoring at least `min_score`; the relative
    /// order of the survivors is unchanged. `_chunks` is not consulted.
    fn apply(&self, items: &mut Vec<(usize, f32)>, _chunks: &[CodeChunk]) {
        let floor = self.min_score;
        items.retain(|entry| entry.1 >= floor);
    }
}

/// Truncate to the top `k` items. Caller is responsible for ensuring
/// the list is sorted descending by score before this layer runs;
/// most boost layers re-sort internally so the typical pipeline
/// order is `boosts...` → `Threshold` → `TopK`.
///
/// `k == 0` means "no limit": the layer leaves the list untouched
/// rather than emptying it.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub struct TopK {
    /// Maximum number of items to keep; `0` disables truncation.
    pub k: usize,
}

impl RankingLayer for TopK {
    /// Cut the list down to its first `k` entries. A `k` of zero is
    /// treated as "no limit" and leaves the list as it is. `_chunks` is
    /// not consulted.
    fn apply(&self, items: &mut Vec<(usize, f32)>, _chunks: &[CodeChunk]) {
        if self.k == 0 {
            return;
        }
        items.truncate(self.k);
    }
}

/// Cross-encoder rerank layer.
///
/// Scores each candidate `(query, chunk content)` pair with the
/// cross-encoder, blends that score with the incoming score according
/// to `blend`, then re-sorts. The reranker
/// joins query and document text and runs a BERT-with-classifier
/// forward pass per pair, which is structurally higher quality than
/// the bi-encoder's dual-tower similarity but O(candidates) cost.
///
/// Construct via [`Self::new`]. Holds an [`std::sync::Arc`] to a
/// [`crate::rerank::Reranker`], so the layer shares a reranker the
/// caller already owns and the model isn't reloaded per call.
///
/// ## Auto-detect via `query`
///
/// The layer's score-rewrite is unconditional: if it's in the
/// pipeline, it runs. Auto-detect (skip rerank for symbol-shaped
/// queries) belongs at the call site that builds the layer chain,
/// not here. The call site already knows the query; it can decide
/// whether to push this layer into the chain at all.
pub struct CrossEncoderRerank {
    /// Shared handle to the cross-encoder used for scoring.
    reranker: std::sync::Arc<crate::rerank::Reranker>,
    /// Query text paired with every candidate's content.
    query: String,
    /// Cap on candidates the reranker sees. Cost is linear in this.
    candidates: usize,
    /// Blend factor between bi-encoder and cross-encoder scores.
    /// `blend = 1.0`: pure cross-encoder (replace). `blend = 0.0`:
    /// pure bi-encoder — the incoming ordering is kept, although the
    /// scores are still rescaled to `[0, 1]` and the reranker still
    /// runs. The default `0.7` puts
    /// most weight on the cross-encoder while preserving the bi-
    /// encoder's coarse ordering as a tiebreaker — important when
    /// the cross-encoder's sigmoid scores are compressed near 0.5
    /// (no candidate clearly relevant), the original bi-encoder
    /// ordering shouldn't get blown up by 1% rerank-score noise.
    blend: f32,
}

impl CrossEncoderRerank {
    /// Blend factor used by [`Self::new`]: heavy cross-encoder weight,
    /// with the bi-encoder score acting as a tiebreaker.
    const DEFAULT_BLEND: f32 = 0.7;

    /// Build a rerank layer over `reranker` for `query`. Limits the
    /// pool to `candidates` (typical: 100). Uses the default blend
    /// factor of 0.7 (heavy cross-encoder, tiebroken by bi-encoder).
    #[must_use]
    pub fn new(
        reranker: std::sync::Arc<crate::rerank::Reranker>,
        query: String,
        candidates: usize,
    ) -> Self {
        Self {
            reranker,
            query,
            candidates,
            blend: Self::DEFAULT_BLEND,
        }
    }

    /// Override the bi/cross-encoder blend factor. `0.0` = pure
    /// bi-encoder ordering, `1.0` = pure cross-encoder (replace).
    ///
    /// Values outside `[0, 1]` are clamped into that range. A NaN
    /// argument is ignored and the current factor is kept: `f32::clamp`
    /// passes NaN through, and a NaN blend would turn every blended
    /// score into NaN.
    #[must_use]
    pub fn with_blend(mut self, blend: f32) -> Self {
        if !blend.is_nan() {
            self.blend = blend.clamp(0.0, 1.0);
        }
        self
    }
}

impl RankingLayer for CrossEncoderRerank {
    /// Rescore the top `candidates` items with the cross-encoder, blend
    /// with the incoming scores, and re-sort descending (ties broken by
    /// ascending chunk index).
    ///
    /// Side effects on `items` even when reranking is skipped or fails:
    /// the list is truncated to `candidates` entries and entries whose
    /// index is not present in `chunks` are removed. If the reranker
    /// returns an error, or a score count that does not match the
    /// candidate count, the remaining scores and order are left as they
    /// were.
    fn apply(&self, items: &mut Vec<(usize, f32)>, chunks: &[CodeChunk]) {
        // Cap to top-`candidates` before invoking the reranker. Cost
        // is linear in the cap, so trimming is meaningful — even
        // 100 vs 200 is a doubling.
        items.truncate(self.candidates);

        // Build `(query, doc_text)` pairs and drop out-of-range indices
        // from `items` in the same pass. `Vec::retain` visits elements
        // once, in order, so `pairs[i]` always belongs to `items[i]`.
        // Filtering only `pairs` (and not `items`) would shift every
        // later score onto the wrong candidate.
        let query = self.query.as_str();
        let mut pairs: Vec<(&str, &str)> = Vec::with_capacity(items.len());
        items.retain(|&(idx, _)| match chunks.get(idx) {
            Some(chunk) => {
                pairs.push((query, chunk.content.as_str()));
                true
            }
            None => false,
        });
        if items.is_empty() {
            return;
        }

        let Ok(scores) = self.reranker.score_pairs(&pairs) else {
            // Rerank failed — leave the existing scores untouched.
            // Logging happens at the call site; the layer is silent
            // about errors since it has no logging context.
            return;
        };
        // A score vector of the wrong length cannot be matched to the
        // candidates; blending a prefix would leave a mix of blended and
        // raw scores on different scales, so treat it like a failure.
        if scores.len() != items.len() {
            return;
        }

        // Min-max normalize both score arrays to [0, 1] within this
        // candidate set, then blend. The reranker returns raw logits
        // (sentence-transformers ms-marco config declares Identity
        // activation), which span ~[-11, +5]; bi-encoder scores arrive
        // here in the RRF-normalized [0, ~1] range. Without per-set
        // normalization the magnitude-dominant signal wins by default
        // — the `blend = 0.7` weighting is only meaningful when both
        // sides live on the same scale. Min-max preserves ordering
        // within each signal while making the blend a true convex
        // combination.
        let bi: Vec<f32> = items.iter().map(|&(_, s)| s).collect();
        let cross_norm = min_max_normalize(&scores);
        let bi_norm = min_max_normalize(&bi);
        for ((item, &cross), &bi_n) in items.iter_mut().zip(cross_norm.iter()).zip(bi_norm.iter()) {
            item.1 = self.blend * cross + (1.0 - self.blend) * bi_n;
        }
        items.sort_unstable_by(|a, b| b.1.total_cmp(&a.1).then_with(|| a.0.cmp(&b.0)));
    }
}

/// Linear-rescale a score array into `[0, 1]` based on its own
/// min/max. Used by [`CrossEncoderRerank`] to put raw cross-encoder
/// logits and RRF bi-encoder scores on the same scale before
/// convex-combining them.
///
/// The output has the same length as the input (an empty input gives an
/// empty output). The min and max are taken over the finite inputs only:
///
/// - Finite inputs map to `(x - min) / (max - min)`.
/// - When the finite inputs are all (nearly) equal, when there are none,
///   or when `max - min` overflows, every finite input maps to `0.5` so
///   the signal stays neutral instead of collapsing to NaN or `0.0`.
/// - `+inf` maps to `1.0`; `-inf` and NaN map to `0.0`. This keeps a
///   single bad score from producing NaN outputs, which `total_cmp`
///   would otherwise be free to sort to the top of the list.
fn min_max_normalize(xs: &[f32]) -> Vec<f32> {
    // Bounds over finite values only, so one inf/NaN cannot poison them.
    let (lo, hi) = xs
        .iter()
        .copied()
        .filter(|x| x.is_finite())
        .fold((f32::INFINITY, f32::NEG_INFINITY), |(lo, hi), x| {
            (lo.min(x), hi.max(x))
        });
    let span = hi - lo;
    // No finite values gives a span of -inf; extreme finite values can
    // overflow to +inf. Both are treated like the all-equal case.
    let degenerate = !span.is_finite() || span < f32::EPSILON;

    xs.iter()
        .map(|&x| {
            if x.is_nan() || x == f32::NEG_INFINITY {
                0.0
            } else if x == f32::INFINITY {
                1.0
            } else if degenerate {
                0.5
            } else {
                (x - lo) / span
            }
        })
        .collect()
}

#[cfg(test)]
mod tests {
    use super::*;

    /// Minimal chunk fixture: only the path and name vary between tests.
    fn dummy_chunk(file: &str, name: &str) -> CodeChunk {
        CodeChunk {
            file_path: file.into(),
            name: name.into(),
            kind: "function".into(),
            start_line: 1,
            end_line: 10,
            content: String::new(),
            enriched_content: String::new(),
        }
    }

    #[test]
    fn threshold_drops_below_min() {
        let chunks = vec![dummy_chunk("a.rs", "f"), dummy_chunk("b.rs", "g")];
        let mut items = vec![(0, 0.9), (1, 0.3)];
        Threshold { min_score: 0.5 }.apply(&mut items, &chunks);
        assert_eq!(items, vec![(0, 0.9)]);
    }

    #[test]
    fn threshold_keeps_score_equal_to_min() {
        // The bound is inclusive: a score exactly at `min_score` survives.
        let chunks = vec![dummy_chunk("a.rs", "f"), dummy_chunk("b.rs", "g")];
        let mut items = vec![(0, 0.5), (1, 0.25)];
        Threshold { min_score: 0.5 }.apply(&mut items, &chunks);
        assert_eq!(items, vec![(0, 0.5)]);
    }

    #[test]
    fn topk_truncates() {
        let chunks = vec![
            dummy_chunk("a.rs", "f"),
            dummy_chunk("b.rs", "g"),
            dummy_chunk("c.rs", "h"),
        ];
        let mut items = vec![(0, 0.9), (1, 0.8), (2, 0.7)];
        TopK { k: 2 }.apply(&mut items, &chunks);
        assert_eq!(items, vec![(0, 0.9), (1, 0.8)]);
    }

    #[test]
    fn topk_zero_keeps_all() {
        // `k == 0` means "no limit", not "keep nothing".
        let chunks = vec![dummy_chunk("a.rs", "f"), dummy_chunk("b.rs", "g")];
        let mut items = vec![(0, 0.9), (1, 0.8)];
        TopK { k: 0 }.apply(&mut items, &chunks);
        assert_eq!(items.len(), 2);
    }

    #[test]
    fn chain_runs_layers_in_order() {
        // Three items: scores 1.0, 0.6, 0.3. Threshold at 0.5, then top 1.
        let chunks = vec![
            dummy_chunk("a.rs", "f"),
            dummy_chunk("b.rs", "g"),
            dummy_chunk("c.rs", "h"),
        ];
        let mut items = vec![(0, 1.0), (1, 0.6), (2, 0.3)];
        let layers: Vec<Box<dyn RankingLayer>> = vec![
            Box::new(Threshold { min_score: 0.5 }),
            Box::new(TopK { k: 1 }),
        ];
        apply_chain(&mut items, &chunks, &layers);
        assert_eq!(items, vec![(0, 1.0)]);
    }

    #[test]
    fn chain_with_no_layers_is_a_no_op() {
        let chunks = vec![dummy_chunk("a.rs", "f")];
        let mut items = vec![(0, 0.4)];
        let layers: Vec<Box<dyn RankingLayer>> = Vec::new();
        apply_chain(&mut items, &chunks, &layers);
        assert_eq!(items, vec![(0, 0.4)]);
    }

    #[test]
    fn pagerank_boost_layer_reorders() {
        let chunks = vec![
            dummy_chunk("important.rs", "a"),
            dummy_chunk("obscure.rs", "b"),
        ];
        let mut items = vec![(0, 0.8), (1, 0.8)];
        let mut pr = HashMap::new();
        pr.insert("important.rs".to_string(), 1.0); // top percentile
        pr.insert("obscure.rs".to_string(), 0.1); // bottom decile
        PageRankBoost::new(std::sync::Arc::new(pr), 0.3).apply(&mut items, &chunks);
        // important.rs should rank first after boost.
        assert_eq!(items[0].0, 0);
        assert!(items[0].1 > items[1].1);
    }

    #[test]
    fn min_max_normalize_rescales_to_unit_range() {
        assert_eq!(min_max_normalize(&[1.0, 3.0, 2.0]), vec![0.0, 1.0, 0.5]);
        assert_eq!(min_max_normalize(&[-4.0, 4.0, 0.0]), vec![0.0, 1.0, 0.5]);
    }

    #[test]
    fn min_max_normalize_degenerate_inputs() {
        // Empty in, empty out; all-equal inputs collapse to the neutral 0.5.
        assert!(min_max_normalize(&[]).is_empty());
        assert_eq!(min_max_normalize(&[2.0, 2.0, 2.0]), vec![0.5, 0.5, 0.5]);
        assert_eq!(min_max_normalize(&[7.0]), vec![0.5]);
    }
}