lean_ctx/tools/
ctx_search.rs1use std::collections::HashSet;
2use std::path::Path;
3use std::path::PathBuf;
4
5use ignore::WalkBuilder;
6use regex::RegexBuilder;
7
8use crate::core::protocol;
9use crate::core::symbol_map::{self, SymbolMap};
10use crate::core::tokens::count_tokens;
11use crate::tools::CrpMode;
12
/// Largest file size, in bytes, that the search will read; bigger files are
/// skipped and reported in the "files skipped" note.
const MAX_FILE_SIZE: u64 = 512_000;
/// Maximum directory depth handed to the file walker.
const MAX_WALK_DEPTH: usize = 20;
15
16pub fn handle(
18 pattern: &str,
19 dir: &str,
20 ext_filter: Option<&str>,
21 max_results: usize,
22 _crp_mode: CrpMode,
23 respect_gitignore: bool,
24 allow_secret_paths: bool,
25) -> (String, usize) {
26 const MAX_PATTERN_LEN: usize = 1024;
27 const MAX_REGEX_SIZE: usize = 1 << 20; let redact = crate::core::redaction::redaction_enabled_for_active_role();
30 if pattern.len() > MAX_PATTERN_LEN {
31 return (
32 format!(
33 "ERROR: pattern too long ({} > {MAX_PATTERN_LEN} chars)",
34 pattern.len()
35 ),
36 0,
37 );
38 }
39 let re = match RegexBuilder::new(pattern)
40 .size_limit(MAX_REGEX_SIZE)
41 .dfa_size_limit(MAX_REGEX_SIZE)
42 .build()
43 {
44 Ok(r) => r,
45 Err(e) => return (format!("ERROR: invalid regex: {e}"), 0),
46 };
47
48 let root = Path::new(dir);
49 if !root.exists() {
50 return (format!("ERROR: {dir} does not exist"), 0);
51 }
52
53 let walker = WalkBuilder::new(root)
54 .hidden(true)
55 .max_depth(Some(MAX_WALK_DEPTH))
56 .git_ignore(respect_gitignore)
57 .git_global(respect_gitignore)
58 .git_exclude(respect_gitignore)
59 .build();
60
61 let mut files: Vec<PathBuf> = Vec::new();
62 let mut matches = Vec::new();
63 let mut raw_tokens_accum: usize = 0;
64 let mut files_searched = 0u32;
65 let mut files_skipped_size = 0u32;
66 let mut files_skipped_encoding = 0u32;
67 let mut files_skipped_boundary = 0u32;
68
69 for entry in walker.filter_map(std::result::Result::ok) {
70 if entry.file_type().is_none_or(|ft| ft.is_dir()) {
71 continue;
72 }
73
74 if entry.file_type().is_some_and(|ft| ft.is_symlink()) {
75 continue;
76 }
77
78 let path = entry.path();
79
80 if is_binary_ext(path) || is_generated_file(path) {
81 continue;
82 }
83
84 if !allow_secret_paths && crate::core::io_boundary::is_secret_like(path).is_some() {
85 files_skipped_boundary += 1;
86 continue;
87 }
88
89 if let Some(ext) = ext_filter {
90 let file_ext = path.extension().and_then(|e| e.to_str()).unwrap_or("");
91 if file_ext != ext {
92 continue;
93 }
94 }
95
96 if let Ok(meta) = std::fs::metadata(path) {
97 if meta.len() > MAX_FILE_SIZE {
98 files_skipped_size += 1;
99 continue;
100 }
101 }
102
103 files.push(path.to_path_buf());
104 }
105
106 files.sort_unstable_by(|a, b| a.as_os_str().cmp(b.as_os_str()));
108
109 let root_str = root.to_string_lossy();
110 for path in &files {
111 if matches.len() >= max_results {
112 break;
113 }
114
115 let Ok(content) = std::fs::read_to_string(path) else {
116 files_skipped_encoding += 1;
117 continue;
118 };
119
120 files_searched += 1;
121
122 for (i, line) in content.lines().enumerate() {
123 if re.is_match(line) {
124 let short_path =
125 protocol::shorten_path_relative(&path.to_string_lossy(), &root_str);
126 raw_tokens_accum += count_tokens(line.trim()) + 2;
128 let shown = if redact {
129 crate::core::redaction::redact_text(line.trim())
130 } else {
131 line.trim().to_string()
132 };
133 matches.push(format!("{short_path}:{} {}", i + 1, shown));
134 if matches.len() >= max_results {
135 break;
136 }
137 }
138 }
139 }
140
141 if matches.is_empty() {
142 let mut msg = format!("0 matches for '{pattern}' in {files_searched} files");
143 if files_skipped_size > 0 {
144 msg.push_str(&format!(" ({files_skipped_size} large files skipped)"));
145 }
146 if files_skipped_encoding > 0 {
147 msg.push_str(&format!(
148 " ({files_skipped_encoding} files skipped: binary/encoding)"
149 ));
150 }
151 if files_skipped_boundary > 0 {
152 msg.push_str(&format!(
153 " ({files_skipped_boundary} secret-like files skipped by boundary policy)"
154 ));
155 }
156 return (msg, 0);
157 }
158
159 let matched_files: Vec<&str> = {
161 let mut seen = HashSet::new();
162 matches
163 .iter()
164 .filter_map(|m| {
165 let file = extract_file_from_match(m);
166 if seen.insert(file) {
167 Some(file)
168 } else {
169 None
170 }
171 })
172 .collect()
173 };
174
175 let mut result = format!("{} matches in {} files", matches.len(), files_searched);
176 if matched_files.len() > 1 {
177 result.push_str(" [");
178 result.push_str(&matched_files.join(", "));
179 result.push(']');
180 }
181 result.push_str(":\n");
182 result.push_str(&matches.join("\n"));
183
184 if files_skipped_size > 0 {
185 result.push_str(&format!("\n({files_skipped_size} files >512KB skipped)"));
186 }
187 if files_skipped_encoding > 0 {
188 result.push_str(&format!(
189 "\n({files_skipped_encoding} files skipped: binary/encoding)"
190 ));
191 }
192 if files_skipped_boundary > 0 {
193 result.push_str(&format!(
194 "\n({files_skipped_boundary} secret-like files skipped by boundary policy)"
195 ));
196 }
197
198 let scope_hint = monorepo_scope_hint(&matches, dir);
199
200 {
201 let file_ext = ext_filter.unwrap_or("rs");
202 let mut sym = SymbolMap::new();
203 let idents = symbol_map::extract_identifiers(&result, file_ext);
204 for ident in &idents {
205 sym.register(ident);
206 }
207 if sym.len() >= 3 {
208 let sym_table = sym.format_table();
209 let compressed = sym.apply(&result);
210 let original_tok = count_tokens(&result);
211 let compressed_tok = count_tokens(&compressed) + count_tokens(&sym_table);
212 let net_saving = original_tok.saturating_sub(compressed_tok);
213 if original_tok > 0 && net_saving * 100 / original_tok >= 5 {
214 result = format!("{compressed}{sym_table}");
215 }
216 }
217 }
218
219 if let Some(hint) = scope_hint {
220 result.push_str(&hint);
221 }
222
223 let sent = count_tokens(&result);
224
225 let native_estimate = (raw_tokens_accum as f64 * 2.5).ceil() as usize;
229 let original = native_estimate.max(raw_tokens_accum);
230 let savings = protocol::format_savings(original, sent);
231
232 (format!("{result}\n{savings}"), original)
233}
234
/// Returns `true` when the extension of `path` is on the search skip-list.
///
/// The list covers binary formats (images, fonts, archives, media, native
/// libraries, databases, bytecode) as well as a few text formats that are
/// listed alongside them (`svg`, `lock`, `map`, `snap`, `patch`).
///
/// The comparison is ASCII-case-insensitive, so `photo.JPG` and `SETUP.EXE`
/// are skipped just like their lowercase spellings. Paths without an
/// extension, or with a non-UTF-8 extension, yield `false`.
fn is_binary_ext(path: &Path) -> bool {
    const SKIPPED_EXTS: &[&str] = &[
        "png", "jpg", "jpeg", "gif", "webp", "ico", "svg", "woff", "woff2", "ttf", "eot", "pdf",
        "zip", "tar", "gz", "br", "zst", "bz2", "xz", "mp3", "mp4", "webm", "ogg", "wasm", "so",
        "dylib", "dll", "exe", "lock", "map", "snap", "patch", "db", "sqlite", "parquet", "arrow",
        "bin", "o", "a", "class", "pyc", "pyo",
    ];

    path.extension()
        .and_then(|e| e.to_str())
        .is_some_and(|ext| {
            // `eq_ignore_ascii_case` avoids allocating a lowercased copy per file.
            SKIPPED_EXTS
                .iter()
                .any(|known| ext.eq_ignore_ascii_case(known))
        })
}
283
/// Returns `true` when the file name of `path` ends with one of the suffixes
/// treated as generated output (minified/bundled/chunked scripts, TypeScript
/// declaration files, and JS/CSS source maps).
///
/// The check is case-sensitive and looks only at the final path component;
/// a path without a UTF-8 file name yields `false`.
fn is_generated_file(path: &Path) -> bool {
    const GENERATED_SUFFIXES: [&str; 7] = [
        ".min.js",
        ".min.css",
        ".bundle.js",
        ".chunk.js",
        ".d.ts",
        ".js.map",
        ".css.map",
    ];

    let Some(file_name) = path.file_name().and_then(|n| n.to_str()) else {
        return false;
    };

    GENERATED_SUFFIXES
        .iter()
        .any(|suffix| file_name.ends_with(suffix))
}
294
/// Extracts the file-path portion of a formatted match line.
///
/// Match lines have the shape `<path>:<line-number> <text>`, so the path is
/// everything before the first `:`. A Windows drive prefix (`C:\...` or
/// `C:/...`) contains a colon of its own, which is skipped before searching.
///
/// The drive prefix is recognised only when the colon is followed by a path
/// separator. Without that requirement a match from a one-letter file name
/// such as `a:12 foo` would be mistaken for a drive and the whole line (or
/// the path plus part of the text) would be returned as the "file".
///
/// Returns the whole input when no separating colon is found.
fn extract_file_from_match(line: &str) -> &str {
    let has_drive_prefix = matches!(
        line.as_bytes(),
        [drive, b':', b'\\' | b'/', ..] if drive.is_ascii_alphabetic()
    );
    // The skipped prefix is two ASCII bytes, so slicing at 2 stays on a
    // character boundary.
    let start = if has_drive_prefix { 2 } else { 0 };

    line[start..]
        .find(':')
        .map_or(line, |pos| &line[..start + pos])
}
310
311fn monorepo_scope_hint(matches: &[String], search_dir: &str) -> Option<String> {
312 let top_dirs: HashSet<&str> = matches
313 .iter()
314 .filter_map(|m| {
315 let path = extract_file_from_match(m);
316 let relative = path.strip_prefix("./").unwrap_or(path);
317 let relative = relative.strip_prefix(search_dir).unwrap_or(relative);
318 let relative = relative.strip_prefix('/').unwrap_or(relative);
319 relative.split('/').next()
320 })
321 .collect();
322
323 if top_dirs.len() > 3 {
324 let mut dirs: Vec<&&str> = top_dirs.iter().collect();
325 dirs.sort();
326 let dir_list: Vec<String> = dirs.iter().take(6).map(|d| format!("'{d}'")).collect();
327 let extra = if top_dirs.len() > 6 {
328 format!(", +{} more", top_dirs.len() - 6)
329 } else {
330 String::new()
331 };
332 Some(format!(
333 "\n\nResults span {} directories ({}{}). \
334 Use the 'path' parameter to scope to a specific service, \
335 e.g. path=\"{}/\".",
336 top_dirs.len(),
337 dir_list.join(", "),
338 extra,
339 dirs[0]
340 ))
341 } else {
342 None
343 }
344}
345
#[cfg(test)]
mod tests {
    use super::*;
    use crate::tools::CrpMode;

    /// Matches must be listed in path order regardless of the order in which
    /// the files were created on disk.
    #[test]
    fn search_results_are_deterministically_ordered_by_path() {
        let tmp = tempfile::tempdir().unwrap();
        let root = tmp.path();
        // Deliberately written in reverse alphabetical order.
        std::fs::write(root.join("b.txt"), "match\n").unwrap();
        std::fs::write(root.join("a.txt"), "match\n").unwrap();

        let search_dir = root.to_string_lossy();
        let (out, _orig) = handle(
            "match",
            search_dir.as_ref(),
            Some("txt"),
            10,
            CrpMode::Off,
            true,
            true,
        );

        // Keep only the first two lines that look like `<file>.txt:<n> match`.
        let match_lines: Vec<&str> = out
            .lines()
            .filter(|l| l.contains(".txt:") && l.contains("match"))
            .take(2)
            .collect();

        assert_eq!(match_lines.len(), 2);
        assert!(
            match_lines[0].contains("a.txt:"),
            "first match should come from a.txt, got: {}",
            match_lines[0]
        );
        assert!(
            match_lines[1].contains("b.txt:"),
            "second match should come from b.txt, got: {}",
            match_lines[1]
        );
    }

    /// With `allow_secret_paths == false`, a secret-like file is neither
    /// searched nor listed, and the output carries the boundary-skip note.
    #[test]
    fn secret_like_files_are_skipped_by_default() {
        let tmp = tempfile::tempdir().unwrap();
        let root = tmp.path();
        std::fs::write(root.join("key.pem"), "match\n").unwrap();
        std::fs::write(root.join("ok.txt"), "match\n").unwrap();

        let search_dir = root.to_string_lossy();
        let (out, _orig) = handle(
            "match",
            search_dir.as_ref(),
            None,
            10,
            CrpMode::Off,
            true,
            false,
        );

        assert!(out.contains("ok.txt:"), "expected ok.txt match, got: {out}");
        assert!(
            !out.contains("key.pem:"),
            "secret-like file should be skipped, got: {out}"
        );
        assert!(
            out.contains("secret-like files skipped"),
            "expected boundary skip note, got: {out}"
        );
    }
}