// lean_ctx/tools/ctx_search.rs

1use std::collections::HashSet;
2use std::path::Path;
3use std::path::PathBuf;
4
5use ignore::WalkBuilder;
6use regex::RegexBuilder;
7
8use crate::core::protocol;
9use crate::core::symbol_map::{self, SymbolMap};
10use crate::core::tokens::count_tokens;
11use crate::tools::CrpMode;
12
13const MAX_FILE_SIZE: u64 = 512_000;
14const MAX_WALK_DEPTH: usize = 20;
15
16/// Searches files for a regex pattern with compressed output and monorepo scope hints.
17pub fn handle(
18    pattern: &str,
19    dir: &str,
20    ext_filter: Option<&str>,
21    max_results: usize,
22    _crp_mode: CrpMode,
23    respect_gitignore: bool,
24    allow_secret_paths: bool,
25) -> (String, usize) {
26    const MAX_PATTERN_LEN: usize = 1024;
27    const MAX_REGEX_SIZE: usize = 1 << 20; // 1 MiB DFA limit
28
29    let redact = crate::core::redaction::redaction_enabled_for_active_role();
30    if pattern.len() > MAX_PATTERN_LEN {
31        return (
32            format!(
33                "ERROR: pattern too long ({} > {MAX_PATTERN_LEN} chars)",
34                pattern.len()
35            ),
36            0,
37        );
38    }
39    let re = match RegexBuilder::new(pattern)
40        .size_limit(MAX_REGEX_SIZE)
41        .dfa_size_limit(MAX_REGEX_SIZE)
42        .build()
43    {
44        Ok(r) => r,
45        Err(e) => return (format!("ERROR: invalid regex: {e}"), 0),
46    };
47
48    let root = Path::new(dir);
49    if !root.exists() {
50        return (format!("ERROR: {dir} does not exist"), 0);
51    }
52
53    let walker = WalkBuilder::new(root)
54        .hidden(true)
55        .max_depth(Some(MAX_WALK_DEPTH))
56        .git_ignore(respect_gitignore)
57        .git_global(respect_gitignore)
58        .git_exclude(respect_gitignore)
59        .build();
60
61    let mut files: Vec<PathBuf> = Vec::new();
62    let mut matches = Vec::new();
63    let mut raw_tokens_accum: usize = 0;
64    let mut files_searched = 0u32;
65    let mut files_skipped_size = 0u32;
66    let mut files_skipped_encoding = 0u32;
67    let mut files_skipped_boundary = 0u32;
68
69    for entry in walker.filter_map(std::result::Result::ok) {
70        if entry.file_type().is_none_or(|ft| ft.is_dir()) {
71            continue;
72        }
73
74        if entry.file_type().is_some_and(|ft| ft.is_symlink()) {
75            continue;
76        }
77
78        let path = entry.path();
79
80        if is_binary_ext(path) || is_generated_file(path) {
81            continue;
82        }
83
84        if !allow_secret_paths && crate::core::io_boundary::is_secret_like(path).is_some() {
85            files_skipped_boundary += 1;
86            continue;
87        }
88
89        if let Some(ext) = ext_filter {
90            let file_ext = path.extension().and_then(|e| e.to_str()).unwrap_or("");
91            if file_ext != ext {
92                continue;
93            }
94        }
95
96        if let Ok(meta) = std::fs::metadata(path) {
97            if meta.len() > MAX_FILE_SIZE {
98                files_skipped_size += 1;
99                continue;
100            }
101        }
102
103        files.push(path.to_path_buf());
104    }
105
106    // Deterministic search: stable file ordering makes max_results truncation reproducible.
107    files.sort_unstable_by(|a, b| a.as_os_str().cmp(b.as_os_str()));
108
109    let root_str = root.to_string_lossy();
110    for path in &files {
111        if matches.len() >= max_results {
112            break;
113        }
114
115        let Ok(content) = std::fs::read_to_string(path) else {
116            files_skipped_encoding += 1;
117            continue;
118        };
119
120        files_searched += 1;
121
122        for (i, line) in content.lines().enumerate() {
123            if re.is_match(line) {
124                let short_path =
125                    protocol::shorten_path_relative(&path.to_string_lossy(), &root_str);
126                // Count raw tokens incrementally (avoids separate Vec + join)
127                raw_tokens_accum += count_tokens(line.trim()) + 2;
128                let shown = if redact {
129                    crate::core::redaction::redact_text(line.trim())
130                } else {
131                    line.trim().to_string()
132                };
133                matches.push(format!("{short_path}:{} {}", i + 1, shown));
134                if matches.len() >= max_results {
135                    break;
136                }
137            }
138        }
139    }
140
141    if matches.is_empty() {
142        let mut msg = format!("0 matches for '{pattern}' in {files_searched} files");
143        if files_skipped_size > 0 {
144            msg.push_str(&format!(" ({files_skipped_size} large files skipped)"));
145        }
146        if files_skipped_encoding > 0 {
147            msg.push_str(&format!(
148                " ({files_skipped_encoding} files skipped: binary/encoding)"
149            ));
150        }
151        if files_skipped_boundary > 0 {
152            msg.push_str(&format!(
153                " ({files_skipped_boundary} secret-like files skipped by boundary policy)"
154            ));
155        }
156        return (msg, 0);
157    }
158
159    // Prefix-cache-friendly: structural file list before per-query match content
160    let matched_files: Vec<&str> = {
161        let mut seen = HashSet::new();
162        matches
163            .iter()
164            .filter_map(|m| {
165                let file = extract_file_from_match(m);
166                if seen.insert(file) {
167                    Some(file)
168                } else {
169                    None
170                }
171            })
172            .collect()
173    };
174
175    let mut result = format!("{} matches in {} files", matches.len(), files_searched);
176    if matched_files.len() > 1 {
177        result.push_str(" [");
178        result.push_str(&matched_files.join(", "));
179        result.push(']');
180    }
181    result.push_str(":\n");
182    result.push_str(&matches.join("\n"));
183
184    if files_skipped_size > 0 {
185        result.push_str(&format!("\n({files_skipped_size} files >512KB skipped)"));
186    }
187    if files_skipped_encoding > 0 {
188        result.push_str(&format!(
189            "\n({files_skipped_encoding} files skipped: binary/encoding)"
190        ));
191    }
192    if files_skipped_boundary > 0 {
193        result.push_str(&format!(
194            "\n({files_skipped_boundary} secret-like files skipped by boundary policy)"
195        ));
196    }
197
198    let scope_hint = monorepo_scope_hint(&matches, dir);
199
200    {
201        let file_ext = ext_filter.unwrap_or("rs");
202        let mut sym = SymbolMap::new();
203        let idents = symbol_map::extract_identifiers(&result, file_ext);
204        for ident in &idents {
205            sym.register(ident);
206        }
207        if sym.len() >= 3 {
208            let sym_table = sym.format_table();
209            let compressed = sym.apply(&result);
210            let original_tok = count_tokens(&result);
211            let compressed_tok = count_tokens(&compressed) + count_tokens(&sym_table);
212            let net_saving = original_tok.saturating_sub(compressed_tok);
213            if original_tok > 0 && net_saving * 100 / original_tok >= 5 {
214                result = format!("{compressed}{sym_table}");
215            }
216        }
217    }
218
219    if let Some(hint) = scope_hint {
220        result.push_str(&hint);
221    }
222
223    let sent = count_tokens(&result);
224
225    // The "original" cost is what a native grep with context lines would produce.
226    // rg defaults to showing full paths + 2 context lines per match. We estimate
227    // the native cost as ~3x the raw match output (context + separators + headers).
228    let native_estimate = (raw_tokens_accum as f64 * 2.5).ceil() as usize;
229    let original = native_estimate.max(raw_tokens_accum);
230    let savings = protocol::format_savings(original, sent);
231
232    (format!("{result}\n{savings}"), original)
233}
234
/// Returns `true` when the path's extension is on the list of formats excluded from search.
///
/// The list covers binary media, archives, compiled artefacts and databases, plus a few
/// textual formats (`svg`, `lock`, `map`, `snap`, `patch`) that are excluded all the same.
/// The comparison is ASCII case-insensitive, so `LOGO.PNG` is treated like `logo.png`.
/// Paths without an extension, or with a non-UTF-8 one, return `false`.
fn is_binary_ext(path: &Path) -> bool {
    const SKIPPED_EXTENSIONS: &[&str] = &[
        // images and fonts
        "png", "jpg", "jpeg", "gif", "webp", "ico", "svg", "woff", "woff2", "ttf", "eot",
        // documents and archives
        "pdf", "zip", "tar", "gz", "br", "zst", "bz2", "xz",
        // audio and video
        "mp3", "mp4", "webm", "ogg",
        // compiled code and libraries
        "wasm", "so", "dylib", "dll", "exe", "bin", "o", "a", "class", "pyc", "pyo",
        // lockfiles, source maps, snapshots, patches
        "lock", "map", "snap", "patch",
        // databases and columnar data
        "db", "sqlite", "parquet", "arrow",
    ];

    path.extension()
        .and_then(|ext| ext.to_str())
        .is_some_and(|ext| {
            SKIPPED_EXTENSIONS
                .iter()
                .any(|skipped| ext.eq_ignore_ascii_case(skipped))
        })
}
283
/// Returns `true` when the file name ends with one of the suffixes this module treats as
/// generated output: minified or bundled assets, TypeScript declaration files and source maps.
///
/// Only the final path component is inspected. A missing or non-UTF-8 file name yields `false`.
fn is_generated_file(path: &Path) -> bool {
    const GENERATED_SUFFIXES: [&str; 7] = [
        ".min.js",
        ".min.css",
        ".bundle.js",
        ".chunk.js",
        ".d.ts",
        ".js.map",
        ".css.map",
    ];

    path.file_name()
        .and_then(|file_name| file_name.to_str())
        .is_some_and(|file_name| {
            GENERATED_SUFFIXES
                .iter()
                .any(|&suffix| file_name.ends_with(suffix))
        })
}
294
/// Extracts the file path from a grep-style match line of the form `path:line text`.
///
/// The path ends at the first `:`. A leading Windows drive prefix is skipped over first, so
/// `C:\src\a.rs:3 foo` yields `C:\src\a.rs`. A prefix only counts as a drive when the colon
/// is followed by a path separator (`/` or `\`); otherwise a single-letter relative file
/// name such as `a:12 let x: u8` would be mistaken for a drive and the result would run into
/// the match text.
///
/// Returns the whole line when no separating `:` is found.
fn extract_file_from_match(line: &str) -> &str {
    let has_drive_prefix = matches!(
        line.as_bytes(),
        [drive, b':', b'/' | b'\\', ..] if drive.is_ascii_alphabetic()
    );
    // Both skipped bytes are ASCII, so offset 2 is always a valid char boundary.
    let start = if has_drive_prefix { 2 } else { 0 };

    match line[start..].find(':') {
        Some(pos) => &line[..start + pos],
        None => line,
    }
}
310
311fn monorepo_scope_hint(matches: &[String], search_dir: &str) -> Option<String> {
312    let top_dirs: HashSet<&str> = matches
313        .iter()
314        .filter_map(|m| {
315            let path = extract_file_from_match(m);
316            let relative = path.strip_prefix("./").unwrap_or(path);
317            let relative = relative.strip_prefix(search_dir).unwrap_or(relative);
318            let relative = relative.strip_prefix('/').unwrap_or(relative);
319            relative.split('/').next()
320        })
321        .collect();
322
323    if top_dirs.len() > 3 {
324        let mut dirs: Vec<&&str> = top_dirs.iter().collect();
325        dirs.sort();
326        let dir_list: Vec<String> = dirs.iter().take(6).map(|d| format!("'{d}'")).collect();
327        let extra = if top_dirs.len() > 6 {
328            format!(", +{} more", top_dirs.len() - 6)
329        } else {
330            String::new()
331        };
332        Some(format!(
333            "\n\nResults span {} directories ({}{}). \
334             Use the 'path' parameter to scope to a specific service, \
335             e.g. path=\"{}/\".",
336            top_dirs.len(),
337            dir_list.join(", "),
338            extra,
339            dirs[0]
340        ))
341    } else {
342        None
343    }
344}
345
#[cfg(test)]
mod tests {
    use super::*;
    use crate::tools::CrpMode;

    /// Matches must be reported in path order even when the files were created in the
    /// opposite order.
    #[test]
    fn search_results_are_deterministically_ordered_by_path() {
        let tmp = tempfile::tempdir().unwrap();
        // Write b.txt before a.txt so creation order disagrees with path order.
        for name in ["b.txt", "a.txt"] {
            std::fs::write(tmp.path().join(name), "match\n").unwrap();
        }
        let root = tmp.path().to_string_lossy();

        let (out, _orig) = handle("match", &root, Some("txt"), 10, CrpMode::Off, true, true);

        // Keep the first two match lines; they must be a.txt followed by b.txt.
        let match_lines: Vec<&str> = out
            .lines()
            .filter(|l| l.contains(".txt:") && l.contains("match"))
            .take(2)
            .collect();
        assert_eq!(match_lines.len(), 2);

        let (first, second) = (match_lines[0], match_lines[1]);
        assert!(
            first.contains("a.txt:"),
            "first match should come from a.txt, got: {}",
            first
        );
        assert!(
            second.contains("b.txt:"),
            "second match should come from b.txt, got: {}",
            second
        );
    }

    /// With `allow_secret_paths` disabled, a `.pem` file is left out of the results and the
    /// report mentions the skip.
    #[test]
    fn secret_like_files_are_skipped_by_default() {
        let tmp = tempfile::tempdir().unwrap();
        for name in ["key.pem", "ok.txt"] {
            std::fs::write(tmp.path().join(name), "match\n").unwrap();
        }
        let root = tmp.path().to_string_lossy();

        let (out, _orig) = handle("match", &root, None, 10, CrpMode::Off, true, false);

        assert!(out.contains("ok.txt:"), "expected ok.txt match, got: {out}");
        assert!(
            !out.contains("key.pem:"),
            "secret-like file should be skipped, got: {out}"
        );
        assert!(
            out.contains("secret-like files skipped"),
            "expected boundary skip note, got: {out}"
        );
    }
}