Skip to main content

lean_ctx/tools/
ctx_search.rs

1use std::collections::HashSet;
2use std::path::Path;
3use std::path::PathBuf;
4
5use ignore::WalkBuilder;
6use regex::RegexBuilder;
7
8use crate::core::protocol;
9use crate::core::symbol_map::{self, SymbolMap};
10use crate::core::tokens::count_tokens;
11use crate::tools::CrpMode;
12
13const MAX_FILE_SIZE: u64 = 512_000;
14const MAX_WALK_DEPTH: usize = 20;
15
16/// Searches files for a regex pattern with compressed output and monorepo scope hints.
17pub fn handle(
18    pattern: &str,
19    dir: &str,
20    ext_filter: Option<&str>,
21    max_results: usize,
22    _crp_mode: CrpMode,
23    respect_gitignore: bool,
24    allow_secret_paths: bool,
25) -> (String, usize) {
26    const MAX_PATTERN_LEN: usize = 1024;
27    const MAX_REGEX_SIZE: usize = 1 << 20; // 1 MiB DFA limit
28
29    let redact = crate::core::redaction::redaction_enabled_for_active_role();
30    if pattern.len() > MAX_PATTERN_LEN {
31        return (
32            format!(
33                "ERROR: pattern too long ({} > {MAX_PATTERN_LEN} chars)",
34                pattern.len()
35            ),
36            0,
37        );
38    }
39    let re = match RegexBuilder::new(pattern)
40        .size_limit(MAX_REGEX_SIZE)
41        .dfa_size_limit(MAX_REGEX_SIZE)
42        .build()
43    {
44        Ok(r) => r,
45        Err(e) => return (format!("ERROR: invalid regex: {e}"), 0),
46    };
47
48    let root = Path::new(dir);
49    if !root.exists() {
50        return (format!("ERROR: {dir} does not exist"), 0);
51    }
52
53    let walker = WalkBuilder::new(root)
54        .hidden(true)
55        .max_depth(Some(MAX_WALK_DEPTH))
56        .git_ignore(respect_gitignore)
57        .git_global(respect_gitignore)
58        .git_exclude(respect_gitignore)
59        .build();
60
61    let mut files: Vec<PathBuf> = Vec::new();
62    let mut matches = Vec::new();
63    let mut raw_result_lines = Vec::new();
64    let mut files_searched = 0u32;
65    let mut files_skipped_size = 0u32;
66    let mut files_skipped_encoding = 0u32;
67    let mut files_skipped_boundary = 0u32;
68
69    for entry in walker.filter_map(std::result::Result::ok) {
70        if entry.file_type().is_none_or(|ft| ft.is_dir()) {
71            continue;
72        }
73
74        if entry.file_type().is_some_and(|ft| ft.is_symlink()) {
75            continue;
76        }
77
78        let path = entry.path();
79
80        if is_binary_ext(path) || is_generated_file(path) {
81            continue;
82        }
83
84        if !allow_secret_paths && crate::core::io_boundary::is_secret_like(path).is_some() {
85            files_skipped_boundary += 1;
86            continue;
87        }
88
89        if let Some(ext) = ext_filter {
90            let file_ext = path.extension().and_then(|e| e.to_str()).unwrap_or("");
91            if file_ext != ext {
92                continue;
93            }
94        }
95
96        if let Ok(meta) = std::fs::metadata(path) {
97            if meta.len() > MAX_FILE_SIZE {
98                files_skipped_size += 1;
99                continue;
100            }
101        }
102
103        files.push(path.to_path_buf());
104    }
105
106    // Deterministic search: stable file ordering makes max_results truncation reproducible.
107    files.sort_by(|a, b| a.to_string_lossy().cmp(&b.to_string_lossy()));
108
109    for path in files {
110        let Ok(content) = std::fs::read_to_string(&path) else {
111            files_skipped_encoding += 1;
112            continue;
113        };
114
115        files_searched += 1;
116
117        for (i, line) in content.lines().enumerate() {
118            if re.is_match(line) {
119                let short_path = protocol::shorten_path(&path.to_string_lossy());
120                let full_path = path.to_string_lossy();
121                raw_result_lines.push(format!("{full_path}:{}: {}", i + 1, line.trim()));
122                let shown = if redact {
123                    crate::core::redaction::redact_text(line.trim())
124                } else {
125                    line.trim().to_string()
126                };
127                matches.push(format!("{short_path}:{} {}", i + 1, shown));
128                if matches.len() >= max_results {
129                    break;
130                }
131            }
132        }
133
134        if matches.len() >= max_results {
135            break;
136        }
137    }
138
139    if matches.is_empty() {
140        let mut msg = format!("0 matches for '{pattern}' in {files_searched} files");
141        if files_skipped_size > 0 {
142            msg.push_str(&format!(" ({files_skipped_size} large files skipped)"));
143        }
144        if files_skipped_encoding > 0 {
145            msg.push_str(&format!(
146                " ({files_skipped_encoding} files skipped: binary/encoding)"
147            ));
148        }
149        if files_skipped_boundary > 0 {
150            msg.push_str(&format!(
151                " ({files_skipped_boundary} secret-like files skipped by boundary policy)"
152            ));
153        }
154        return (msg, 0);
155    }
156
157    // Prefix-cache-friendly: structural file list before per-query match content
158    let matched_files: Vec<&str> = {
159        let mut seen = HashSet::new();
160        matches
161            .iter()
162            .filter_map(|m| {
163                let file = m.split(':').next()?;
164                if seen.insert(file) {
165                    Some(file)
166                } else {
167                    None
168                }
169            })
170            .collect()
171    };
172
173    let mut result = format!("{} matches in {} files", matches.len(), files_searched);
174    if matched_files.len() > 1 {
175        result.push_str(" [");
176        result.push_str(&matched_files.join(", "));
177        result.push(']');
178    }
179    result.push_str(":\n");
180    result.push_str(&matches.join("\n"));
181
182    if files_skipped_size > 0 {
183        result.push_str(&format!("\n({files_skipped_size} files >512KB skipped)"));
184    }
185    if files_skipped_encoding > 0 {
186        result.push_str(&format!(
187            "\n({files_skipped_encoding} files skipped: binary/encoding)"
188        ));
189    }
190    if files_skipped_boundary > 0 {
191        result.push_str(&format!(
192            "\n({files_skipped_boundary} secret-like files skipped by boundary policy)"
193        ));
194    }
195
196    let scope_hint = monorepo_scope_hint(&matches, dir);
197
198    {
199        let file_ext = ext_filter.unwrap_or("rs");
200        let mut sym = SymbolMap::new();
201        let idents = symbol_map::extract_identifiers(&result, file_ext);
202        for ident in &idents {
203            sym.register(ident);
204        }
205        if sym.len() >= 3 {
206            let sym_table = sym.format_table();
207            let compressed = sym.apply(&result);
208            let original_tok = count_tokens(&result);
209            let compressed_tok = count_tokens(&compressed) + count_tokens(&sym_table);
210            let net_saving = original_tok.saturating_sub(compressed_tok);
211            if original_tok > 0 && net_saving * 100 / original_tok >= 5 {
212                result = format!("{compressed}{sym_table}");
213            }
214        }
215    }
216
217    if let Some(hint) = scope_hint {
218        result.push_str(&hint);
219    }
220
221    let raw_output = raw_result_lines.join("\n");
222    let raw_tokens = count_tokens(&raw_output);
223    let sent = count_tokens(&result);
224
225    // The "original" cost is what a native grep with context lines would produce.
226    // rg defaults to showing full paths + 2 context lines per match. We estimate
227    // the native cost as ~3x the raw match output (context + separators + headers).
228    let native_estimate = (raw_tokens as f64 * 2.5).ceil() as usize;
229    let original = native_estimate.max(raw_tokens);
230    let savings = protocol::format_savings(original, sent);
231
232    (format!("{result}\n{savings}"), original)
233}
234
/// Returns `true` when the path's extension is on the list of file types that are never
/// searched.
///
/// The list covers images, fonts, archives, media, native/compiled artifacts and data
/// stores, plus a few formats such as `lock`, `map`, `snap` and `patch` (presumably
/// excluded because they are bulky generated artifacts rather than true binaries).
///
/// The comparison is ASCII case-insensitive, so `LOGO.PNG` is treated like `logo.png`.
/// Paths without an extension, or with a non-UTF-8 extension, return `false`.
fn is_binary_ext(path: &Path) -> bool {
    const SKIPPED_EXTENSIONS: &[&str] = &[
        "png", "jpg", "jpeg", "gif", "webp", "ico", "svg", "woff", "woff2", "ttf", "eot", "pdf",
        "zip", "tar", "gz", "br", "zst", "bz2", "xz", "mp3", "mp4", "webm", "ogg", "wasm", "so",
        "dylib", "dll", "exe", "lock", "map", "snap", "patch", "db", "sqlite", "parquet", "arrow",
        "bin", "o", "a", "class", "pyc", "pyo",
    ];

    path.extension()
        .and_then(|ext| ext.to_str())
        .is_some_and(|ext| {
            SKIPPED_EXTENSIONS
                .iter()
                .any(|skipped| ext.eq_ignore_ascii_case(skipped))
        })
}
283
/// Returns `true` when the file name ends with one of the suffixes used for minified,
/// bundled, chunked, declaration or source-map files.
///
/// The check is a case-sensitive suffix match on the final path component; paths without
/// a UTF-8 file name return `false`.
fn is_generated_file(path: &Path) -> bool {
    const GENERATED_SUFFIXES: [&str; 7] = [
        ".min.js",
        ".min.css",
        ".bundle.js",
        ".chunk.js",
        ".d.ts",
        ".js.map",
        ".css.map",
    ];

    let Some(file_name) = path.file_name().and_then(|n| n.to_str()) else {
        return false;
    };

    GENERATED_SUFFIXES
        .iter()
        .any(|suffix| file_name.ends_with(*suffix))
}
294
/// Builds a hint suggesting a narrower search scope when the matches are spread over
/// more than three distinct top-level directories.
///
/// # Arguments
/// * `matches` - formatted match lines; the text before the first `:` is taken as the path.
/// * `search_dir` - directory the search started from; stripped from the front of a path
///   when it is a whole-component prefix.
///
/// # Returns
/// `Some(hint)` listing up to six directories in sorted order (with a `+N more` suffix
/// beyond that) and an example `path=` value, or `None` when three or fewer directories
/// are involved. Files that sit directly in the search root have no directory component
/// and are not counted.
fn monorepo_scope_hint(matches: &[String], search_dir: &str) -> Option<String> {
    const MAX_LISTED_DIRS: usize = 6;

    let search_root = search_dir.trim_end_matches('/');

    let mut top_dirs: Vec<&str> = matches
        .iter()
        .filter_map(|m| top_level_dir(m, search_root))
        .collect();
    // Sort + dedup yields the distinct directories in a stable, alphabetical order.
    top_dirs.sort_unstable();
    top_dirs.dedup();

    if top_dirs.len() <= 3 {
        return None;
    }

    let example = *top_dirs.first()?;
    let dir_list: Vec<String> = top_dirs
        .iter()
        .take(MAX_LISTED_DIRS)
        .map(|d| format!("'{d}'"))
        .collect();
    let extra = if top_dirs.len() > MAX_LISTED_DIRS {
        format!(", +{} more", top_dirs.len() - MAX_LISTED_DIRS)
    } else {
        String::new()
    };

    Some(format!(
        "\n\nResults span {} directories ({}{}). \
         Use the 'path' parameter to scope to a specific service, \
         e.g. path=\"{}/\".",
        top_dirs.len(),
        dir_list.join(", "),
        extra,
        example
    ))
}

/// Extracts the first directory component of the path at the start of a match line, or
/// `None` when the path has no directory component (a file directly in the search root).
fn top_level_dir<'a>(match_line: &'a str, search_root: &str) -> Option<&'a str> {
    let path = match_line.split(':').next()?;
    let relative = path.strip_prefix("./").unwrap_or(path);

    // Only strip the search root on a component boundary, so that root "app" does not
    // turn "apple/x.rs" into "le/x.rs".
    let relative = match relative.strip_prefix(search_root) {
        Some(rest) if rest.is_empty() || rest.starts_with('/') => rest,
        _ => relative,
    };
    let relative = relative.strip_prefix('/').unwrap_or(relative);

    // Without a '/' the remainder is a bare file name, not a directory.
    let (top, _rest) = relative.split_once('/')?;
    (!top.is_empty()).then_some(top)
}
329
#[cfg(test)]
mod tests {
    use super::*;
    use crate::tools::CrpMode;

    /// Matches must be reported in sorted path order, independent of file creation order.
    #[test]
    fn search_results_are_deterministically_ordered_by_path() {
        let tmp = tempfile::tempdir().unwrap();
        // b.txt is written first so that creation order cannot explain an a-then-b result.
        for file_name in ["b.txt", "a.txt"] {
            std::fs::write(tmp.path().join(file_name), "match\n").unwrap();
        }

        let search_root = tmp.path().to_string_lossy();
        let (out, _orig) = handle(
            "match",
            &search_root,
            Some("txt"),
            10,
            CrpMode::Off,
            true,
            true,
        );

        // Keep the first two lines that look like match lines: a.txt, then b.txt.
        let match_lines: Vec<&str> = out
            .lines()
            .filter(|line| line.contains(".txt:") && line.contains("match"))
            .take(2)
            .collect();
        assert_eq!(match_lines.len(), 2);

        let (first, second) = (match_lines[0], match_lines[1]);
        assert!(
            first.contains("a.txt:"),
            "first match should come from a.txt, got: {}",
            first
        );
        assert!(
            second.contains("b.txt:"),
            "second match should come from b.txt, got: {}",
            second
        );
    }

    /// With `allow_secret_paths = false`, a `.pem` file is excluded and the skip is reported.
    #[test]
    fn secret_like_files_are_skipped_by_default() {
        let tmp = tempfile::tempdir().unwrap();
        for file_name in ["key.pem", "ok.txt"] {
            std::fs::write(tmp.path().join(file_name), "match\n").unwrap();
        }

        let search_root = tmp.path().to_string_lossy();
        let (out, _orig) = handle(
            "match",
            &search_root,
            None,
            10,
            CrpMode::Off,
            true,
            false,
        );

        assert!(out.contains("ok.txt:"), "expected ok.txt match, got: {out}");

        let leaked_secret = out.contains("key.pem:");
        assert!(
            !leaked_secret,
            "secret-like file should be skipped, got: {out}"
        );

        let has_skip_note = out.contains("secret-like files skipped");
        assert!(has_skip_note, "expected boundary skip note, got: {out}");
    }
}