ripvec-core 3.0.2

//! Search configuration, results, and file I/O helpers.
//!
//! The transformer streaming pipeline (`embed_all`, `embed_all_batch`,
//! `embed_all_streaming`, `embed_distributed`) was removed together with the
//! transformer engines.
//! Embedding is now dispatched exclusively through
//! [`VectorEncoder::embed_root`](crate::encoder::VectorEncoder::embed_root).
//!
//! Surviving items:
//! - [`SearchConfig`] — pipeline tuning parameters (walk filters, batch size, scope).
//! - [`Scope`] — intent-shaped corpus axis (code / docs / all).
//! - [`PROSE_EXTENSIONS`] — canonical prose file extensions.
//! - [`SearchResult`] — chunk + similarity score pair.
//! - [`apply_structural_boost`] — PageRank boost post-processing for MCP.

use std::path::Path;

use crate::chunk::{ChunkConfig, CodeChunk};

/// Default batch size for embedding inference.
///
/// Used as the initial value of [`SearchConfig::batch_size`] by
/// `SearchConfig::default()`.
pub const DEFAULT_BATCH_SIZE: usize = 32;

/// Runtime configuration for the search pipeline.
///
/// All tuning parameters that were previously compile-time constants are
/// gathered here so they can be set from CLI arguments without recompiling.
///
/// The walk-related fields (`file_type`, `include_extensions`,
/// `exclude_extensions`, `ignore_patterns`, `scope`) are turned into walk
/// filters by [`SearchConfig::walk_options`].
#[derive(Debug, Clone)]
pub struct SearchConfig {
    /// Chunks per inference call. Larger values amortize call overhead
    /// but consume more memory. Default: [`DEFAULT_BATCH_SIZE`] (32).
    pub batch_size: usize,
    /// Maximum tokens fed to the model per chunk. `0` means no limit.
    /// Capping tokens controls inference cost for minified or dense source.
    /// Default: `0` (no limit), as set by the `Default` impl.
    ///
    /// NOTE(review): this comment previously listed a default of 128
    /// ("7.7× faster than 512, with minimal quality loss") and justified it
    /// with BERT attention cost and CLS pooling. That value does not match
    /// the `Default` impl — confirm which default is intended.
    pub max_tokens: usize,
    /// Chunking parameters forwarded to the chunking phase.
    pub chunk: ChunkConfig,
    /// Force all files to be chunked as plain text (sliding windows only).
    /// When `false` (default), files with recognized extensions use tree-sitter
    /// semantic chunking, and unrecognized extensions fall back to sliding windows.
    pub text_mode: bool,
    /// MRL cascade pre-filter dimension.
    ///
    /// When set, [`SearchIndex`](crate::index::SearchIndex) stores a truncated
    /// and L2-re-normalized copy of the embedding matrix at this dimension for
    /// fast two-phase cascade search. `None` (default) disables cascade search.
    pub cascade_dim: Option<usize>,
    /// Optional file type filter (e.g. "rust", "python", "js").
    ///
    /// When set, only files matching this type (using ripgrep's built-in type
    /// database) are collected during the walk phase. Default: `None`.
    pub file_type: Option<String>,
    /// File extensions to exclude during the walk phase.
    ///
    /// With `Scope::Code` and an empty `include_extensions`,
    /// [`SearchConfig::walk_options`] appends every entry of
    /// [`PROSE_EXTENSIONS`] that is not already listed here
    /// (ASCII case-insensitive comparison). Default: empty.
    pub exclude_extensions: Vec<String>,
    /// File extensions to include during the walk phase. Empty means
    /// "no extension whitelist" (other filters still apply). Non-empty
    /// restricts walking to files whose extension matches one of these
    /// (normalized lowercase, with or without leading dot).
    ///
    /// A non-empty list also disables every scope-implied adjustment made by
    /// [`SearchConfig::walk_options`]. Default: empty.
    pub include_extensions: Vec<String>,
    /// Additional `.gitignore`-style patterns to exclude during the walk phase.
    ///
    /// [`SearchConfig::apply_repo_config`] appends non-blank patterns from the
    /// repository config that are not already present. Default: empty.
    pub ignore_patterns: Vec<String>,
    /// Intent-shaped scope: code, docs, or all. Drives the default
    /// extension filters when `include_extensions` is empty (`docs` whitelists
    /// [`PROSE_EXTENSIONS`], `code` excludes them, `all` adds nothing) and the
    /// rerank gate in the MCP layer (`docs` and `all`-on-mixed-corpus
    /// fire rerank; `code` skips). See [`Scope`]. Default: `Scope::All`.
    pub scope: Scope,
    /// Search mode: hybrid (default), semantic, or keyword.
    pub mode: crate::hybrid::SearchMode,
}

/// Intent-shaped scope for a search invocation.
///
/// Used as the user-facing axis for picking what kind of files
/// participate in a search and whether the prose-tuned cross-encoder
/// reranker fires. Maps internally to extension allow-lists and to
/// the rerank gate's policy table.
///
/// Serialized with lowercase variant names (`"code"`, `"docs"`, `"all"`)
/// via `#[serde(rename_all = "lowercase")]`.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Default, serde::Serialize, serde::Deserialize)]
#[serde(rename_all = "lowercase")]
pub enum Scope {
    /// Only code-language files. Cross-encoder rerank is skipped — the
    /// ms-marco model is out-of-domain for code chunks.
    ///
    /// In [`SearchConfig::walk_options`] this adds [`PROSE_EXTENSIONS`] to
    /// the exclusion list when no explicit include list is set.
    Code,
    /// Only prose / documentation files: the extensions listed in
    /// [`PROSE_EXTENSIONS`] (`md`, `markdown`, `mdx`, `rst`, `txt`, `text`,
    /// `adoc`, `asciidoc`, `org`). Cross-encoder rerank fires by default on
    /// NL queries.
    Docs,
    /// No extension whitelist; the rerank gate decides based on the
    /// indexed corpus's prose fraction (see
    /// `RipvecIndex::corpus_class`). Default.
    #[default]
    All,
}

/// Canonical prose file extensions for `Scope::Docs`. Kept in sync with
/// [`crate::encoder::ripvec::ranking::is_prose_path`].
///
/// Entries are lowercase and carry no leading dot.
/// [`SearchConfig::walk_options`] uses this list as the include whitelist for
/// `Scope::Docs` and as extra exclusions for `Scope::Code`.
pub const PROSE_EXTENSIONS: &[&str] = &[
    "md", "markdown", "mdx", "rst", "txt", "text", "adoc", "asciidoc", "org",
];

impl SearchConfig {
    /// Build the walk-phase filters implied by this configuration.
    ///
    /// `file_type` and `ignore_patterns` are copied through unchanged. The
    /// extension lists start as copies of `include_extensions` and
    /// `exclude_extensions`; `scope` then adjusts them, but only when no
    /// explicit include list was supplied:
    ///
    /// - A non-empty `include_extensions` always wins — scope is ignored.
    /// - `Scope::Docs` whitelists the canonical prose set
    ///   ([`PROSE_EXTENSIONS`]).
    /// - `Scope::Code` adds the canonical prose set to the *exclusions*,
    ///   skipping entries already excluded (ASCII case-insensitive match).
    /// - `Scope::All` changes nothing (no whitelist).
    #[must_use]
    pub fn walk_options(&self) -> crate::walk::WalkOptions {
        let mut include_extensions = self.include_extensions.clone();
        let mut exclude_extensions = self.exclude_extensions.clone();

        match (include_extensions.is_empty(), self.scope) {
            (true, Scope::Docs) => {
                include_extensions.extend(PROSE_EXTENSIONS.iter().map(|ext| String::from(*ext)));
            }
            (true, Scope::Code) => {
                for &prose_ext in PROSE_EXTENSIONS {
                    let already_listed = exclude_extensions
                        .iter()
                        .any(|listed| listed.eq_ignore_ascii_case(prose_ext));
                    if !already_listed {
                        exclude_extensions.push(prose_ext.to_owned());
                    }
                }
            }
            // Either the scope adds nothing, or an explicit whitelist overrides it.
            (true, Scope::All) | (false, Scope::Docs | Scope::Code | Scope::All) => {}
        }

        crate::walk::WalkOptions {
            file_type: self.file_type.clone(),
            include_extensions,
            exclude_extensions,
            ignore_patterns: self.ignore_patterns.clone(),
        }
    }

    /// Append ignore patterns from `.ripvec/config.toml`, if one is found for `root`.
    ///
    /// Patterns that are blank after trimming, or that are already present in
    /// `ignore_patterns`, are skipped. Does nothing when no config is found.
    pub fn apply_repo_config(&mut self, root: &Path) {
        if let Some((_, config)) = crate::cache::config::find_config(root) {
            for pattern in config.ignore.patterns {
                let is_blank = pattern.trim().is_empty();
                if is_blank || self.ignore_patterns.contains(&pattern) {
                    continue;
                }
                self.ignore_patterns.push(pattern);
            }
        }
    }
}

impl Default for SearchConfig {
    /// Baseline configuration: default batch size, no token cap, semantic
    /// chunking, no cascade, no walk filters, `Scope::All`, hybrid search.
    fn default() -> Self {
        Self {
            // Inference tuning.
            batch_size: DEFAULT_BATCH_SIZE,
            max_tokens: 0, // 0 = unlimited
            cascade_dim: None,

            // Chunking.
            chunk: ChunkConfig::default(),
            text_mode: false,

            // Walk filters: nothing restricted until the caller opts in.
            file_type: None,
            include_extensions: vec![],
            exclude_extensions: vec![],
            ignore_patterns: vec![],

            // Search intent.
            scope: Scope::default(),
            mode: crate::hybrid::SearchMode::Hybrid,
        }
    }
}

/// A search result pairing a code chunk with its similarity score.
#[derive(Debug, Clone)]
pub struct SearchResult {
    /// The matched code chunk.
    pub chunk: CodeChunk,
    /// Cosine similarity to the query (0.0 to 1.0).
    ///
    /// After [`apply_structural_boost`] runs, this field instead holds the
    /// min-max normalized score plus `alpha * pagerank`, which can fall
    /// outside `[0.0, 1.0]`.
    pub similarity: f32,
}

/// Min-max normalize similarity scores and add a weighted `PageRank` boost.
///
/// Every score is first rescaled to `[0,1]` across `results`, then replaced
/// by `normalized + alpha * pagerank`, where `pagerank` is looked up in
/// `file_ranks` by the chunk's `file_path`. This promotes architecturally
/// important files (many dependents) in search results.
///
/// Details:
/// - Empty `results` or `alpha == 0.0` is a no-op; scores are left untouched
///   (not even normalized).
/// - Files missing from `file_ranks` get a boost of `0.0`.
/// - The score range is floored at `1e-12`, so when all scores are equal
///   every normalized score is `0.0` and only the boost remains.
/// - Scores are rewritten in place; the slice is not re-sorted.
///
/// Called from the MCP search handler which has access to the `RepoGraph`,
/// rather than from [`search`](crate::encoder::ripvec::index) directly.
pub fn apply_structural_boost<S: ::std::hash::BuildHasher>(
    results: &mut [SearchResult],
    file_ranks: &std::collections::HashMap<String, f32, S>,
    alpha: f32,
) {
    if alpha == 0.0 || results.is_empty() {
        return;
    }

    // Find both extremes in one pass over the scores.
    let (lowest, highest) = results.iter().fold(
        (f32::INFINITY, f32::NEG_INFINITY),
        |(lo, hi), hit| (lo.min(hit.similarity), hi.max(hit.similarity)),
    );
    // Floor the span so uniform scores do not divide by zero.
    let span = (highest - lowest).max(1e-12);

    for hit in results {
        let rank = file_ranks
            .get(&hit.chunk.file_path)
            .copied()
            .unwrap_or(0.0);
        hit.similarity = (hit.similarity - lowest) / span + alpha * rank;
    }
}

#[cfg(test)]
mod tests {
    use super::*;
    use std::collections::HashMap;

    /// Absolute tolerance used when comparing boosted scores.
    const TOLERANCE: f32 = 1e-6;

    /// Builds a result whose chunk carries the given path and placeholder metadata.
    fn make_result(file_path: &str, similarity: f32) -> SearchResult {
        let chunk = CodeChunk {
            file_path: file_path.to_owned(),
            name: String::from("test"),
            kind: String::from("function"),
            start_line: 1,
            end_line: 10,
            enriched_content: String::new(),
            content: String::new(),
        };
        SearchResult { chunk, similarity }
    }

    /// Asserts that `actual` is within [`TOLERANCE`] of `expected`.
    #[track_caller]
    fn assert_close(actual: f32, expected: f32) {
        assert!((actual - expected).abs() < TOLERANCE);
    }

    #[test]
    fn structural_boost_normalizes_and_applies() {
        let mut hits = vec![
            make_result("src/a.rs", 0.8),
            make_result("src/b.rs", 0.4),
            make_result("src/c.rs", 0.6),
        ];
        let ranks: HashMap<String, f32> = HashMap::from([
            ("src/a.rs".to_owned(), 0.5),
            ("src/b.rs".to_owned(), 1.0),
            ("src/c.rs".to_owned(), 0.0),
        ]);

        apply_structural_boost(&mut hits, &ranks, 0.2);

        // Scores span 0.4..=0.8, so the normalization range is 0.4.
        // a: (0.8 - 0.4) / 0.4 = 1.0, plus 0.2 * 0.5 = 0.1  => 1.1
        assert_close(hits[0].similarity, 1.1);
        // b: (0.4 - 0.4) / 0.4 = 0.0, plus 0.2 * 1.0 = 0.2  => 0.2
        assert_close(hits[1].similarity, 0.2);
        // c: (0.6 - 0.4) / 0.4 = 0.5, plus 0.2 * 0.0 = 0.0  => 0.5
        assert_close(hits[2].similarity, 0.5);
    }

    #[test]
    fn structural_boost_noop_on_empty() {
        let mut hits: Vec<SearchResult> = Vec::new();
        let ranks: HashMap<String, f32> = HashMap::new();

        apply_structural_boost(&mut hits, &ranks, 0.2);

        assert!(hits.is_empty());
    }

    #[test]
    fn structural_boost_noop_on_zero_alpha() {
        let mut hits = vec![make_result("src/a.rs", 0.8)];
        let ranks: HashMap<String, f32> = HashMap::from([("src/a.rs".to_owned(), 1.0)]);

        apply_structural_boost(&mut hits, &ranks, 0.0);

        // A zero alpha must leave the raw score untouched.
        assert_close(hits[0].similarity, 0.8);
    }
}