ripvec_core/embed.rs
1//! Search configuration, results, and file I/O helpers.
2//!
//! The transformer streaming pipeline (`embed_all`, `embed_all_batch`,
//! `embed_all_streaming`, `embed_distributed`) was removed together with the
//! transformer engines.
5//! Embedding is now dispatched exclusively through
6//! [`VectorEncoder::embed_root`](crate::encoder::VectorEncoder::embed_root).
7//!
8//! Surviving items:
9//! - [`SearchConfig`] — pipeline tuning parameters (walk filters, batch size, scope).
10//! - [`Scope`] — intent-shaped corpus axis (code / docs / all).
11//! - [`PROSE_EXTENSIONS`] — canonical prose file extensions.
12//! - [`SearchResult`] — chunk + similarity score pair.
13//! - [`apply_structural_boost`] — PageRank boost post-processing for MCP.
14
15use std::path::Path;
16
17use crate::chunk::{ChunkConfig, CodeChunk};
18
/// Number of chunks sent to the embedding model per inference call when the
/// caller does not override [`SearchConfig::batch_size`].
pub const DEFAULT_BATCH_SIZE: usize = 32;
21
22/// Runtime configuration for the search pipeline.
23///
24/// All tuning parameters that were previously compile-time constants are
25/// gathered here so they can be set from CLI arguments without recompiling.
26#[derive(Debug, Clone)]
27pub struct SearchConfig {
28 /// Chunks per inference call. Larger values amortize call overhead
29 /// but consume more memory. Default: 32.
30 pub batch_size: usize,
31 /// Maximum tokens fed to the model per chunk. `0` means no limit.
32 /// Capping tokens controls inference cost for minified or dense source.
33 /// BERT attention cost scales linearly with token count, and CLS pooling
34 /// means the first token's representation carries most semantic weight.
35 /// Default: 128 (7.7× faster than 512, with minimal quality loss).
36 pub max_tokens: usize,
37 /// Chunking parameters forwarded to the chunking phase.
38 pub chunk: ChunkConfig,
39 /// Force all files to be chunked as plain text (sliding windows only).
40 /// When `false` (default), files with recognized extensions use tree-sitter
41 /// semantic chunking, and unrecognized extensions fall back to sliding windows.
42 pub text_mode: bool,
43 /// MRL cascade pre-filter dimension.
44 ///
45 /// When set, [`SearchIndex`](crate::index::SearchIndex) stores a truncated
46 /// and L2-re-normalized copy of the embedding matrix at this dimension for
47 /// fast two-phase cascade search. `None` (default) disables cascade search.
48 pub cascade_dim: Option<usize>,
49 /// Optional file type filter (e.g. "rust", "python", "js").
50 ///
51 /// When set, only files matching this type (using ripgrep's built-in type
52 /// database) are collected during the walk phase.
53 pub file_type: Option<String>,
54 /// File extensions to exclude during the walk phase.
55 pub exclude_extensions: Vec<String>,
56 /// File extensions to include during the walk phase. Empty means
57 /// "no extension whitelist" (other filters still apply). Non-empty
58 /// restricts walking to files whose extension matches one of these
59 /// (normalized lowercase, with or without leading dot).
60 pub include_extensions: Vec<String>,
61 /// Additional `.gitignore`-style patterns to exclude during the walk phase.
62 pub ignore_patterns: Vec<String>,
63 /// Intent-shaped scope: code, docs, or all. Drives the default
64 /// extension whitelist when `include_extensions` is empty and the
65 /// rerank gate in the MCP layer (`docs` and `all`-on-mixed-corpus
66 /// fire rerank; `code` skips). See [`Scope`].
67 pub scope: Scope,
68 /// Search mode: hybrid (default), semantic, or keyword.
69 pub mode: crate::hybrid::SearchMode,
70}
71
72/// Intent-shaped scope for a search invocation.
73///
74/// Used as the user-facing axis for picking what kind of files
75/// participate in a search and whether the prose-tuned cross-encoder
76/// reranker fires. Maps internally to extension allow-lists and to
77/// the rerank gate's policy table.
78#[derive(Debug, Clone, Copy, PartialEq, Eq, Default, serde::Serialize, serde::Deserialize)]
79#[serde(rename_all = "lowercase")]
80pub enum Scope {
81 /// Only code-language files. Cross-encoder rerank is skipped — the
82 /// ms-marco model is out-of-domain for code chunks.
83 Code,
84 /// Only prose / documentation files (`md`, `rst`, `txt`, `adoc`,
85 /// `mdx`, `org`). Cross-encoder rerank fires by default on NL
86 /// queries.
87 Docs,
88 /// No extension whitelist; the rerank gate decides based on the
89 /// indexed corpus's prose fraction (see
90 /// `RipvecIndex::corpus_class`). Default.
91 #[default]
92 All,
93}
94
/// File extensions (lowercase, no leading dot) that count as prose for
/// `Scope::Docs`. Must stay in sync with
/// [`crate::encoder::ripvec::ranking::is_prose_path`].
pub const PROSE_EXTENSIONS: &[&str] = &[
    "md", "markdown", "mdx", "rst", "txt", "text", "adoc", "asciidoc", "org",
];
100
101impl SearchConfig {
102 /// Convert search configuration into shared walk filters.
103 ///
104 /// Resolves the scope-implied extension whitelist:
105 ///
106 /// - Explicit `include_extensions` always wins.
107 /// - Otherwise `Scope::Docs` injects the canonical prose set
108 /// ([`PROSE_EXTENSIONS`]).
109 /// - `Scope::Code` injects the canonical prose set as
110 /// *exclusions* (so prose files are skipped during walk).
111 /// - `Scope::All` leaves the include set empty (no whitelist).
112 #[must_use]
113 pub fn walk_options(&self) -> crate::walk::WalkOptions {
114 let mut include = self.include_extensions.clone();
115 let mut exclude = self.exclude_extensions.clone();
116 if include.is_empty() {
117 match self.scope {
118 Scope::Docs => {
119 include.extend(PROSE_EXTENSIONS.iter().map(|s| (*s).to_string()));
120 }
121 Scope::Code => {
122 for ext in PROSE_EXTENSIONS {
123 if !exclude.iter().any(|e| e.eq_ignore_ascii_case(ext)) {
124 exclude.push((*ext).to_string());
125 }
126 }
127 }
128 Scope::All => {}
129 }
130 }
131 crate::walk::WalkOptions {
132 file_type: self.file_type.clone(),
133 include_extensions: include,
134 exclude_extensions: exclude,
135 ignore_patterns: self.ignore_patterns.clone(),
136 }
137 }
138
139 /// Merge ignore patterns from `.ripvec/config.toml`, if present.
140 pub fn apply_repo_config(&mut self, root: &Path) {
141 let Some((_, config)) = crate::cache::config::find_config(root) else {
142 return;
143 };
144 for pattern in config.ignore.patterns {
145 if !pattern.trim().is_empty() && !self.ignore_patterns.contains(&pattern) {
146 self.ignore_patterns.push(pattern);
147 }
148 }
149 }
150}
151
152impl Default for SearchConfig {
153 fn default() -> Self {
154 Self {
155 batch_size: DEFAULT_BATCH_SIZE,
156 max_tokens: 0,
157 chunk: ChunkConfig::default(),
158 text_mode: false,
159 cascade_dim: None,
160 file_type: None,
161 exclude_extensions: Vec::new(),
162 include_extensions: Vec::new(),
163 ignore_patterns: Vec::new(),
164 scope: Scope::All,
165 mode: crate::hybrid::SearchMode::Hybrid,
166 }
167 }
168}
169
170/// A search result pairing a code chunk with its similarity score.
171#[derive(Debug, Clone)]
172pub struct SearchResult {
173 /// The matched code chunk.
174 pub chunk: CodeChunk,
175 /// Cosine similarity to the query (0.0 to 1.0).
176 pub similarity: f32,
177}
178
179/// Normalize similarity scores to `[0,1]` and apply a `PageRank` structural boost.
180///
181/// Each result's similarity is min-max normalized, then a weighted `PageRank`
182/// score is added: `final = normalized + alpha * pagerank`. This promotes
183/// architecturally important files (many dependents) in search results.
184///
185/// Called from the MCP search handler which has access to the `RepoGraph`,
186/// rather than from [`search`](crate::encoder::ripvec::index) directly.
187pub fn apply_structural_boost<S: ::std::hash::BuildHasher>(
188 results: &mut [SearchResult],
189 file_ranks: &std::collections::HashMap<String, f32, S>,
190 alpha: f32,
191) {
192 if results.is_empty() || alpha == 0.0 {
193 return;
194 }
195
196 let min = results
197 .iter()
198 .map(|r| r.similarity)
199 .fold(f32::INFINITY, f32::min);
200 let max = results
201 .iter()
202 .map(|r| r.similarity)
203 .fold(f32::NEG_INFINITY, f32::max);
204 let range = (max - min).max(1e-12);
205
206 for r in results.iter_mut() {
207 let normalized = (r.similarity - min) / range;
208 let pr = file_ranks.get(&r.chunk.file_path).copied().unwrap_or(0.0);
209 r.similarity = normalized + alpha * pr;
210 }
211}
212
#[cfg(test)]
mod tests {
    use super::*;
    use std::collections::HashMap;

    /// Absolute tolerance used when comparing boosted scores.
    const TOLERANCE: f32 = 1e-6;

    /// Build a result for `file_path` with the given score; every other
    /// chunk field is filler.
    fn make_result(file_path: &str, similarity: f32) -> SearchResult {
        let chunk = CodeChunk {
            file_path: file_path.to_owned(),
            name: "test".to_owned(),
            kind: "function".to_owned(),
            start_line: 1,
            end_line: 10,
            enriched_content: String::new(),
            content: String::new(),
        };
        SearchResult { chunk, similarity }
    }

    /// Assert that `actual` is within [`TOLERANCE`] of `expected`.
    fn assert_close(actual: f32, expected: f32) {
        assert!(
            (actual - expected).abs() < TOLERANCE,
            "expected {expected}, got {actual}"
        );
    }

    #[test]
    fn structural_boost_normalizes_and_applies() {
        let mut hits = vec![
            make_result("src/a.rs", 0.8),
            make_result("src/b.rs", 0.4),
            make_result("src/c.rs", 0.6),
        ];
        let page_ranks: HashMap<String, f32> = HashMap::from([
            ("src/a.rs".to_owned(), 0.5),
            ("src/b.rs".to_owned(), 1.0),
            ("src/c.rs".to_owned(), 0.0),
        ]);

        apply_structural_boost(&mut hits, &page_ranks, 0.2);

        // Scores span 0.4..=0.8, so the normalization range is 0.4.
        // a: (0.8-0.4)/0.4 = 1.0, plus 0.2*0.5 = 0.1 => 1.1
        assert_close(hits[0].similarity, 1.1);
        // b: (0.4-0.4)/0.4 = 0.0, plus 0.2*1.0 = 0.2 => 0.2
        assert_close(hits[1].similarity, 0.2);
        // c: (0.6-0.4)/0.4 = 0.5, plus 0.2*0.0 = 0.0 => 0.5
        assert_close(hits[2].similarity, 0.5);
    }

    #[test]
    fn structural_boost_noop_on_empty() {
        let mut hits: Vec<SearchResult> = Vec::new();
        let page_ranks: HashMap<String, f32> = HashMap::new();

        apply_structural_boost(&mut hits, &page_ranks, 0.2);

        assert!(hits.is_empty());
    }

    #[test]
    fn structural_boost_noop_on_zero_alpha() {
        let mut hits = vec![make_result("src/a.rs", 0.8)];
        let page_ranks: HashMap<String, f32> = HashMap::from([("src/a.rs".to_owned(), 1.0)]);

        apply_structural_boost(&mut hits, &page_ranks, 0.0);

        // A zero alpha short-circuits, so the raw score survives untouched.
        assert_close(hits[0].similarity, 0.8);
    }
}