Skip to main content

ripvec_core/
embed.rs

//! Search configuration, results, and file I/O helpers.
//!
//! The transformer streaming pipeline (`embed_all`, `embed_all_batch`,
//! `embed_all_streaming`, `embed_distributed`) was removed together with the
//! transformer engines. Embedding is now dispatched exclusively through
//! [`VectorEncoder::embed_root`](crate::encoder::VectorEncoder::embed_root).
//!
//! Surviving items:
//! - [`SearchConfig`] — pipeline tuning parameters (walk filters, batch size, corpus).
//! - [`Corpus`] — which chunks participate (code / docs / all).
//! - [`RerankPolicy`] — whether the cross-encoder reranker fires (auto / always / never).
//! - [`PROSE_EXTENSIONS`] — canonical prose file extensions.
//! - [`SearchResult`] — chunk + similarity score pair.
//! - [`apply_structural_boost`] — PageRank boost post-processing for MCP.
//!
//! ## 4.0 API change
//!
//! `Scope` is replaced by two orthogonal axes:
//! - [`Corpus`] controls *which* chunks are searched (previously coupled to rerank policy in `Scope`).
//! - [`RerankPolicy`] controls *whether* the cross-encoder reranker fires (previously implicit).
//!
//! A backward-compatibility type alias `Scope = Corpus` is provided so that callers
//! outside the MCP layer (examples, CLI) continue to compile; prefer [`Corpus`] in new code.
24
25use std::path::Path;
26
27use crate::chunk::{ChunkConfig, CodeChunk};
28
/// Default number of chunks handed to a single embedding inference call.
///
/// Used as the initial `batch_size` of [`SearchConfig`].
pub const DEFAULT_BATCH_SIZE: usize = 32;
31
32/// Runtime configuration for the search pipeline.
33///
34/// All tuning parameters that were previously compile-time constants are
35/// gathered here so they can be set from CLI arguments without recompiling.
36#[derive(Debug, Clone)]
37pub struct SearchConfig {
38    /// Chunks per inference call. Larger values amortize call overhead
39    /// but consume more memory. Default: 32.
40    pub batch_size: usize,
41    /// Maximum tokens fed to the model per chunk. `0` means no limit.
42    /// Capping tokens controls inference cost for minified or dense source.
43    /// BERT attention cost scales linearly with token count, and CLS pooling
44    /// means the first token's representation carries most semantic weight.
45    /// Default: 128 (7.7× faster than 512, with minimal quality loss).
46    pub max_tokens: usize,
47    /// Chunking parameters forwarded to the chunking phase.
48    pub chunk: ChunkConfig,
49    /// Force all files to be chunked as plain text (sliding windows only).
50    /// When `false` (default), files with recognized extensions use tree-sitter
51    /// semantic chunking, and unrecognized extensions fall back to sliding windows.
52    pub text_mode: bool,
53    /// MRL cascade pre-filter dimension.
54    ///
55    /// When set, [`SearchIndex`](crate::index::SearchIndex) stores a truncated
56    /// and L2-re-normalized copy of the embedding matrix at this dimension for
57    /// fast two-phase cascade search. `None` (default) disables cascade search.
58    pub cascade_dim: Option<usize>,
59    /// Optional file type filter (e.g. "rust", "python", "js").
60    ///
61    /// When set, only files matching this type (using ripgrep's built-in type
62    /// database) are collected during the walk phase.
63    pub file_type: Option<String>,
64    /// File extensions to exclude during the walk phase.
65    pub exclude_extensions: Vec<String>,
66    /// File extensions to include during the walk phase. Empty means
67    /// "no extension whitelist" (other filters still apply). Non-empty
68    /// restricts walking to files whose extension matches one of these
69    /// (normalized lowercase, with or without leading dot).
70    pub include_extensions: Vec<String>,
71    /// Additional `.gitignore`-style patterns to exclude during the walk phase.
72    pub ignore_patterns: Vec<String>,
73    /// Corpus axis: which chunks participate in the search. Drives the default
74    /// extension whitelist when `include_extensions` is empty. See [`Corpus`].
75    ///
76    /// The `scope` field alias is provided for backward compatibility; prefer
77    /// `corpus` in new code.
78    pub corpus: Corpus,
79    /// Search mode: hybrid (default), semantic, or keyword.
80    pub mode: crate::hybrid::SearchMode,
81}
82
83/// Corpus axis for a search invocation: which chunks participate.
84///
85/// Controls the extension allow-list applied to the indexed corpus before
86/// scoring. Orthogonal to rerank policy — see [`RerankPolicy`] for the
87/// reranker control axis.
88///
89/// Maps internally to extension allow-lists (via `SearchConfig::walk_options`).
90/// The MCP layer maps user-facing `corpus=` parameter values to this enum.
91#[derive(Debug, Clone, Copy, PartialEq, Eq, Default, serde::Serialize, serde::Deserialize)]
92#[serde(rename_all = "lowercase")]
93pub enum Corpus {
94    /// Only code-language files. The default for all search tools.
95    #[default]
96    Code,
97    /// Only prose / documentation files (`md`, `rst`, `txt`, `adoc`,
98    /// `mdx`, `org`).
99    Docs,
100    /// No extension whitelist — all file types participate.
101    All,
102}
103
104/// Reranker firing policy for a search invocation.
105///
106/// Controls whether the cross-encoder reranker (TinyBERT-L-2-v2) runs
107/// after the bi-encoder retrieval step. Orthogonal to corpus selection —
108/// see [`Corpus`] for the corpus axis.
109///
110/// The `Auto` policy mirrors the pre-4.0 implicit behavior: rerank fires
111/// when the corpus is prose-heavy (≥ 30% of indexed chunks are prose) and
112/// the query is NL-shaped (not a symbol identifier).
113#[derive(Debug, Clone, Copy, PartialEq, Eq, Default, serde::Serialize, serde::Deserialize)]
114#[serde(rename_all = "lowercase")]
115pub enum RerankPolicy {
116    /// Rerank fires when the corpus is prose-heavy and the query is NL-shaped.
117    /// This was the pre-4.0 implicit default.
118    #[default]
119    Auto,
120    /// Force rerank on regardless of corpus shape or query type.
121    Always,
122    /// Skip rerank unconditionally — fastest path.
123    Never,
124}
125
126/// Backward-compatibility alias: `Scope` is now [`Corpus`].
127///
128/// Callers outside the MCP layer (examples, CLI) that reference `Scope`
129/// continue to compile. Prefer `Corpus` in new code.
130pub type Scope = Corpus;
131
/// Canonical prose file extensions (lowercase, no leading dot).
///
/// This is the include list for [`Corpus::Docs`] and the exclude list for
/// [`Corpus::Code`]. Kept in sync with
/// [`crate::encoder::ripvec::ranking::is_prose_path`].
pub const PROSE_EXTENSIONS: &[&str] = &[
    "md", "markdown", "mdx", "rst", "txt", "text", "adoc", "asciidoc", "org",
];
137
138impl SearchConfig {
139    /// Convert search configuration into shared walk filters.
140    ///
141    /// Resolves the corpus-implied extension whitelist:
142    ///
143    /// - Explicit `include_extensions` always wins.
144    /// - Otherwise `Corpus::Docs` injects the canonical prose set
145    ///   ([`PROSE_EXTENSIONS`]).
146    /// - `Corpus::Code` injects the canonical prose set as
147    ///   *exclusions* (so prose files are skipped during walk).
148    /// - `Corpus::All` leaves the include set empty (no whitelist).
149    #[must_use]
150    pub fn walk_options(&self) -> crate::walk::WalkOptions {
151        let mut include = self.include_extensions.clone();
152        let mut exclude = self.exclude_extensions.clone();
153        if include.is_empty() {
154            match self.corpus {
155                Corpus::Docs => {
156                    include.extend(PROSE_EXTENSIONS.iter().map(|s| (*s).to_string()));
157                }
158                Corpus::Code => {
159                    for ext in PROSE_EXTENSIONS {
160                        if !exclude.iter().any(|e| e.eq_ignore_ascii_case(ext)) {
161                            exclude.push((*ext).to_string());
162                        }
163                    }
164                }
165                Corpus::All => {}
166            }
167        }
168        crate::walk::WalkOptions {
169            file_type: self.file_type.clone(),
170            include_extensions: include,
171            exclude_extensions: exclude,
172            ignore_patterns: self.ignore_patterns.clone(),
173        }
174    }
175
176    /// Merge ignore patterns from `.ripvec/config.toml`, if present.
177    pub fn apply_repo_config(&mut self, root: &Path) {
178        let Some((_, config)) = crate::cache::config::find_config(root) else {
179            return;
180        };
181        for pattern in config.ignore.patterns {
182            if !pattern.trim().is_empty() && !self.ignore_patterns.contains(&pattern) {
183                self.ignore_patterns.push(pattern);
184            }
185        }
186    }
187}
188
189impl Default for SearchConfig {
190    fn default() -> Self {
191        Self {
192            batch_size: DEFAULT_BATCH_SIZE,
193            max_tokens: 0,
194            chunk: ChunkConfig::default(),
195            text_mode: false,
196            cascade_dim: None,
197            file_type: None,
198            exclude_extensions: Vec::new(),
199            include_extensions: Vec::new(),
200            ignore_patterns: Vec::new(),
201            // 4.0: default corpus is Code (only code files); the MCP layer's
202            // index build always uses Corpus::All so the full corpus is indexed
203            // and any per-query corpus filter is applied at search time.
204            corpus: Corpus::All,
205            mode: crate::hybrid::SearchMode::Hybrid,
206        }
207    }
208}
209
210/// A search result pairing a code chunk with its similarity score.
211#[derive(Debug, Clone)]
212pub struct SearchResult {
213    /// The matched code chunk.
214    pub chunk: CodeChunk,
215    /// Cosine similarity to the query (0.0 to 1.0).
216    pub similarity: f32,
217}
218
219/// Normalize similarity scores to `[0,1]` and apply a `PageRank` structural boost.
220///
221/// Each result's similarity is min-max normalized, then a weighted `PageRank`
222/// score is added: `final = normalized + alpha * pagerank`. This promotes
223/// architecturally important files (many dependents) in search results.
224///
225/// Called from the MCP search handler which has access to the `RepoGraph`,
226/// rather than from [`search`](crate::encoder::ripvec::index) directly.
227pub fn apply_structural_boost<S: ::std::hash::BuildHasher>(
228    results: &mut [SearchResult],
229    file_ranks: &std::collections::HashMap<String, f32, S>,
230    alpha: f32,
231) {
232    if results.is_empty() || alpha == 0.0 {
233        return;
234    }
235
236    let min = results
237        .iter()
238        .map(|r| r.similarity)
239        .fold(f32::INFINITY, f32::min);
240    let max = results
241        .iter()
242        .map(|r| r.similarity)
243        .fold(f32::NEG_INFINITY, f32::max);
244    let range = (max - min).max(1e-12);
245
246    for r in results.iter_mut() {
247        let normalized = (r.similarity - min) / range;
248        let pr = file_ranks.get(&r.chunk.file_path).copied().unwrap_or(0.0);
249        r.similarity = normalized + alpha * pr;
250    }
251}
252
#[cfg(test)]
mod tests {
    use super::*;
    use std::collections::HashMap;

    /// Absolute tolerance for comparing boosted scores.
    const TOLERANCE: f32 = 1e-6;

    /// Build a minimal result for `file_path` carrying the given score.
    fn make_result(file_path: &str, similarity: f32) -> SearchResult {
        let chunk = CodeChunk {
            file_path: String::from(file_path),
            name: String::from("test"),
            kind: String::from("function"),
            content_kind: crate::chunk::ContentKind::Code,
            start_line: 1,
            end_line: 10,
            symbol_line: 1,
            enriched_content: String::new(),
            content: String::new(),
            qualified_name: None,
        };
        SearchResult { chunk, similarity }
    }

    /// True when `actual` is within [`TOLERANCE`] of `expected`.
    fn close_to(actual: f32, expected: f32) -> bool {
        (actual - expected).abs() < TOLERANCE
    }

    #[test]
    fn structural_boost_normalizes_and_applies() {
        let mut results = vec![
            make_result("src/a.rs", 0.8),
            make_result("src/b.rs", 0.4),
            make_result("src/c.rs", 0.6),
        ];
        let ranks = HashMap::from([
            ("src/a.rs".to_string(), 0.5),
            ("src/b.rs".to_string(), 1.0),
            ("src/c.rs".to_string(), 0.0),
        ]);

        apply_structural_boost(&mut results, &ranks, 0.2);

        // a: normalized=(0.8-0.4)/0.4=1.0, boost=0.2*0.5=0.1 => 1.1
        assert!(close_to(results[0].similarity, 1.1));
        // b: normalized=(0.4-0.4)/0.4=0.0, boost=0.2*1.0=0.2 => 0.2
        assert!(close_to(results[1].similarity, 0.2));
        // c: normalized=(0.6-0.4)/0.4=0.5, boost=0.2*0.0=0.0 => 0.5
        assert!(close_to(results[2].similarity, 0.5));
    }

    #[test]
    fn structural_boost_noop_on_empty() {
        let mut results: Vec<SearchResult> = Vec::new();
        let ranks: HashMap<String, f32> = HashMap::new();
        apply_structural_boost(&mut results, &ranks, 0.2);
        assert!(results.is_empty());
    }

    #[test]
    fn structural_boost_noop_on_zero_alpha() {
        let mut results = vec![make_result("src/a.rs", 0.8)];
        let ranks = HashMap::from([("src/a.rs".to_string(), 1.0)]);
        apply_structural_boost(&mut results, &ranks, 0.0);
        // A zero alpha returns early, so the raw score must survive untouched.
        assert!(close_to(results[0].similarity, 0.8));
    }
}