1use anyhow::Result;
2use ignore::WalkBuilder;
3use lazy_static::lazy_static;
4use probe_code::search::tokenization;
5use std::collections::{HashMap, HashSet};
6use std::path::{Path, PathBuf};
7use std::sync::{Arc, RwLock};
8use std::time::Instant;
9
/// A list of file paths together with the instant at which it was assembled.
#[derive(Clone, Debug)]
pub struct FileList {
    /// Paths of the files contained in this list.
    pub files: Vec<PathBuf>,
    /// Moment at which this list was created.
    // The visible code only ever writes this field, hence the lint suppression.
    #[allow(dead_code)]
    pub created_at: Instant,
}
19
20lazy_static! {
22 static ref FILE_LIST_CACHE: RwLock<HashMap<String, Arc<FileList>>> =
23 RwLock::new(HashMap::new());
24}
25
/// Renders a duration for log output.
///
/// Durations of one second or more are shown as seconds with two decimals
/// (`"1.50s"`); anything shorter is shown as whole milliseconds (`"500ms"`).
fn format_duration(duration: std::time::Duration) -> String {
    let millis = duration.as_millis();
    if millis >= 1000 {
        format!("{:.2}s", duration.as_secs_f64())
    } else {
        format!("{millis}ms")
    }
}
36
/// Builds the string key under which a file list is stored in the cache.
///
/// The key has the shape `{path}_{with_tests|no_tests}_{ignores}`, where
/// `{ignores}` is the literal `no_ignores` for an empty pattern list and
/// `ignores_{hash:x}` otherwise.
///
/// The hash folds a terminator byte after every pattern. Without it the
/// pattern lists `["ab", "c"]` and `["a", "bc"]` would feed the very same byte
/// stream into the hash and share one cache entry, handing one caller the file
/// list that was built for the other. The hash is still only 64 bits wide, so
/// accidental collisions remain theoretically possible.
fn generate_cache_key(path: &Path, allow_tests: bool, custom_ignores: &[String]) -> String {
    // 0xFF never occurs in valid UTF-8, so it cannot be mistaken for pattern content.
    const PATTERN_TERMINATOR: u8 = 0xFF;

    let path_str = path.to_string_lossy();
    let allow_tests_str = if allow_tests {
        "with_tests"
    } else {
        "no_tests"
    };

    let ignores_hash = if custom_ignores.is_empty() {
        "no_ignores".to_string()
    } else {
        let hash = custom_ignores
            .iter()
            .flat_map(|pattern| pattern.bytes().chain(std::iter::once(PATTERN_TERMINATOR)))
            .fold(0u64, |acc, byte| {
                acc.wrapping_mul(31).wrapping_add(u64::from(byte))
            });
        format!("ignores_{hash:x}")
    };

    format!("{path_str}_{allow_tests_str}_{ignores_hash}")
}
63
64pub fn get_file_list(
67 path: &Path,
68 allow_tests: bool,
69 custom_ignores: &[String],
70) -> Result<Arc<FileList>> {
71 let debug_mode = std::env::var("DEBUG").unwrap_or_default() == "1";
72 let start_time = Instant::now();
73
74 if debug_mode {
75 println!("DEBUG: Getting file list for path: {path:?}");
76 println!("DEBUG: allow_tests: {allow_tests}");
77 println!("DEBUG: custom_ignores: {custom_ignores:?}");
78 }
79
80 let cache_key = generate_cache_key(path, allow_tests, custom_ignores);
82
83 {
85 let cache = FILE_LIST_CACHE.read().unwrap();
86 if let Some(file_list) = cache.get(&cache_key) {
87 let elapsed = start_time.elapsed();
88 if debug_mode {
89 println!(
90 "DEBUG: Found file list in cache with {} files (retrieved in {})",
91 file_list.files.len(),
92 format_duration(elapsed)
93 );
94 }
95 return Ok(Arc::clone(file_list));
96 }
97 }
98
99 if debug_mode {
101 println!("DEBUG: File list not found in cache, building new list");
102 }
103
104 let file_list = build_file_list(path, allow_tests, custom_ignores)?;
105 let file_count = file_list.files.len();
106
107 let file_list = Arc::new(file_list);
109 {
110 let mut cache = FILE_LIST_CACHE.write().unwrap();
111 cache.insert(cache_key, Arc::clone(&file_list));
112 }
113
114 let elapsed = start_time.elapsed();
115 if debug_mode {
116 println!(
117 "DEBUG: Built and cached new file list with {} files in {}",
118 file_count,
119 format_duration(elapsed)
120 );
121 }
122
123 Ok(file_list)
124}
125
126fn build_file_list(path: &Path, allow_tests: bool, custom_ignores: &[String]) -> Result<FileList> {
128 let debug_mode = std::env::var("DEBUG").unwrap_or_default() == "1";
129 let start_time = Instant::now();
130
131 if debug_mode {
132 println!("DEBUG: Building file list for path: {path:?}");
133 }
134
135 let builder_start = Instant::now();
137 let mut builder = WalkBuilder::new(path);
138
139 builder.git_ignore(true);
141 builder.git_global(true);
142 builder.git_exclude(true);
143
144 builder.threads(rayon::current_num_threads());
146
147 let mut common_ignores: Vec<String> = vec![
149 "node_modules",
150 "vendor",
151 "target",
152 "dist",
153 "build",
154 ".git",
155 ".svn",
156 ".hg",
157 ".idea",
158 ".vscode",
159 "__pycache__",
160 "*.pyc",
161 "*.pyo",
162 "*.class",
163 "*.o",
164 "*.obj",
165 "*.a",
166 "*.lib",
167 "*.so",
168 "*.dylib",
169 "*.dll",
170 "*.exe",
171 "*.out",
172 "*.app",
173 "*.jar",
174 "*.war",
175 "*.ear",
176 "*.zip",
177 "*.tar.gz",
178 "*.rar",
179 "*.log",
180 "*.tmp",
181 "*.temp",
182 "*.swp",
183 "*.swo",
184 "*.bak",
185 "*.orig",
186 "*.DS_Store",
187 "Thumbs.db",
188 "*.yml",
189 "*.yaml",
190 "*.json",
191 "*.tconf",
192 "*.conf",
193 "go.sum",
194 ]
195 .into_iter()
196 .map(String::from)
197 .collect();
198
199 if !allow_tests {
201 let test_patterns: Vec<String> = vec![
202 "*_test.rs",
203 "*_tests.rs",
204 "test_*.rs",
205 "tests.rs",
206 "*.spec.js",
207 "*.test.js",
208 "*.spec.ts",
209 "*.test.ts",
210 "*.spec.jsx",
211 "*.test.jsx",
212 "*.spec.tsx",
213 "*.test.tsx",
214 "test_*.py",
215 "*_test.go",
216 "test_*.c",
217 "*_test.c",
218 "*_test.cpp",
219 "*_test.cc",
220 "*_test.cxx",
221 "*Test.java",
222 "*_test.rb",
223 "test_*.rb",
224 "*_spec.rb",
225 "*Test.php",
226 "test_*.php",
227 "**/tests/**",
228 "**/test/**",
229 "**/__tests__/**",
230 "**/__test__/**",
231 "**/spec/**",
232 "**/specs/**",
233 ]
234 .into_iter()
235 .map(String::from)
236 .collect();
237 common_ignores.extend(test_patterns);
238 }
239
240 for pattern in custom_ignores {
242 common_ignores.push(pattern.clone());
243 }
244
245 let mut override_builder = ignore::overrides::OverrideBuilder::new(path);
247
248 for pattern in &common_ignores {
250 if let Err(err) = override_builder.add(&format!("!{pattern}")) {
251 eprintln!("Error adding ignore pattern {pattern:?}: {err}");
252 }
253 }
254
255 match override_builder.build() {
257 Ok(overrides) => {
258 builder.overrides(overrides);
259 }
260 Err(err) => {
261 eprintln!("Error building ignore overrides: {err}");
262 }
263 }
264
265 let builder_duration = builder_start.elapsed();
266
267 if debug_mode {
268 println!(
269 "DEBUG: Builder configuration completed in {}",
270 format_duration(builder_duration)
271 );
272 }
273
274 let walk_start = Instant::now();
276 let mut files = Vec::new();
277 let mut total_files = 0;
278
279 for result in builder.build() {
280 total_files += 1;
281 let entry = match result {
282 Ok(entry) => entry,
283 Err(err) => {
284 eprintln!("Error walking directory: {err}");
285 continue;
286 }
287 };
288
289 if !entry.file_type().is_some_and(|ft| ft.is_file()) {
291 continue;
292 }
293
294 files.push(entry.path().to_path_buf());
295 }
296
297 let walk_duration = walk_start.elapsed();
298
299 if debug_mode {
300 println!(
301 "DEBUG: Directory walk completed in {} - Found {} files out of {} entries",
302 format_duration(walk_duration),
303 files.len(),
304 total_files
305 );
306 }
307
308 let total_duration = start_time.elapsed();
309
310 if debug_mode {
311 println!(
312 "DEBUG: Total file list building completed in {}",
313 format_duration(total_duration)
314 );
315 }
316
317 Ok(FileList {
318 files,
319 created_at: Instant::now(),
320 })
321}
322
323pub fn find_matching_filenames(
326 path: &Path,
327 queries: &[String],
328 already_found_files: &HashSet<PathBuf>,
329 custom_ignores: &[String],
330 allow_tests: bool,
331 term_indices: &HashMap<String, usize>,
332 language: Option<&str>,
333) -> Result<HashMap<PathBuf, HashSet<usize>>> {
334 let debug_mode = std::env::var("DEBUG").unwrap_or_default() == "1";
335 let start_time = Instant::now();
336
337 if debug_mode {
338 println!("DEBUG: Finding files with matching filenames");
339 println!("DEBUG: Queries: {queries:?}");
340 println!(
341 "DEBUG: Already found files count: {}",
342 already_found_files.len()
343 );
344 println!("DEBUG: Term indices: {term_indices:?}");
345 }
346
347 let file_list = get_file_list_by_language(path, allow_tests, custom_ignores, language)?;
349
350 if debug_mode {
351 println!(
352 "DEBUG: Searching through {} files from cache",
353 file_list.files.len()
354 );
355 }
356
357 let query_tokens: Vec<String> = queries
359 .iter()
360 .flat_map(|q| tokenization::tokenize(q))
361 .collect();
362
363 if debug_mode {
364 println!("DEBUG: Query tokens for filename matching: {query_tokens:?}");
365 }
366
367 let mut matching_files = HashMap::new();
369
370 for file_path in &file_list.files {
371 if already_found_files.contains(file_path) {
373 continue;
374 }
375
376 let relative_path = file_path.to_string_lossy().to_string();
378
379 let filename_tokens = tokenization::tokenize(&relative_path);
381
382 if debug_mode && !filename_tokens.is_empty() {
383 println!("DEBUG: Path '{relative_path}' tokenized as: {filename_tokens:?}");
384 }
385 let mut matched_terms = HashSet::new();
387
388 for (term, &idx) in term_indices {
389 let term_tokens = tokenization::tokenize(term);
390
391 let matched = term_tokens.iter().any(|term_token| {
393 filename_tokens.iter().any(|filename_token| {
394 filename_token.contains(term_token) || term_token.contains(filename_token)
395 })
396 });
397
398 if matched {
399 matched_terms.insert(idx);
400 if debug_mode {
401 println!(
402 "DEBUG: Term '{term}' matched path '{relative_path}', adding index {idx}"
403 );
404 }
405 }
406 }
407
408 if !matched_terms.is_empty() {
410 matching_files.insert(file_path.clone(), matched_terms);
411 }
412 }
413
414 let elapsed = start_time.elapsed();
415
416 if debug_mode {
417 println!(
418 "DEBUG: Found {} files with matching filenames in {}",
419 matching_files.len(),
420 format_duration(elapsed)
421 );
422 }
423
424 Ok(matching_files)
425}
426
/// Returns the file extensions (each with its leading dot) associated with a
/// language name.
///
/// The lookup ignores the case of `language`. A name that is not in the table
/// yields an empty vector.
fn get_language_extensions(language: &str) -> Vec<String> {
    let normalized = language.to_lowercase();
    let extensions: &[&str] = match normalized.as_str() {
        "rust" => &[".rs"],
        "javascript" => &[".js", ".jsx", ".mjs"],
        "typescript" => &[".ts", ".tsx"],
        "python" => &[".py", ".pyw", ".pyi"],
        "go" => &[".go"],
        "c" => &[".c", ".h"],
        "cpp" => &[".cpp", ".cc", ".cxx", ".hpp", ".hxx", ".h"],
        "java" => &[".java"],
        "ruby" => &[".rb", ".rake"],
        "php" => &[".php"],
        "swift" => &[".swift"],
        "csharp" => &[".cs"],
        _ => &[],
    };

    extensions.iter().map(|ext| (*ext).to_string()).collect()
}
452
453pub fn get_file_list_by_language(
455 path: &Path,
456 allow_tests: bool,
457 custom_ignores: &[String],
458 language: Option<&str>,
459) -> Result<Arc<FileList>> {
460 if language.is_none() {
462 return get_file_list(path, allow_tests, custom_ignores);
463 }
464
465 let debug_mode = std::env::var("DEBUG").unwrap_or_default() == "1";
466 let start_time = Instant::now();
467
468 if debug_mode {
469 println!("DEBUG: Getting file list for path: {path:?} with language filter: {language:?}");
470 }
471
472 let full_file_list = get_file_list(path, allow_tests, custom_ignores)?;
474
475 let extensions = get_language_extensions(language.unwrap());
477
478 if debug_mode {
479 println!("DEBUG: Filtering files by extensions: {extensions:?}");
480 }
481
482 let filtered_files = if extensions.is_empty() {
484 full_file_list.files.clone()
486 } else {
487 full_file_list
488 .files
489 .iter()
490 .filter(|file| {
491 if let Some(ext) = file.extension() {
492 let ext_lossy = ext.to_string_lossy();
493 let ext_str = format!(".{ext_lossy}");
494 extensions.iter().any(|e| e == &ext_str)
495 } else {
496 false
497 }
498 })
499 .cloned()
500 .collect()
501 };
502
503 let elapsed = start_time.elapsed();
504 if debug_mode {
505 println!(
506 "DEBUG: Filtered file list by language in {} - Found {} files out of {}",
507 format_duration(elapsed),
508 filtered_files.len(),
509 full_file_list.files.len()
510 );
511 }
512
513 Ok(Arc::new(FileList {
515 files: filtered_files,
516 created_at: Instant::now(),
517 }))
518}
519
#[cfg(test)]
mod tests {
    //! Checks that directories with underscores in their names are walked and
    //! that the built-in and caller-supplied ignore patterns are applied.
    //! Every test works in its own temporary directory, so the cache entries
    //! of different tests never share a key.

    use super::*;
    use std::fs;
    use tempfile::TempDir;

    /// Writes `contents` to `path`, creating missing parent directories first.
    fn write_file(path: &Path, contents: &str) {
        if let Some(parent) = path.parent() {
            fs::create_dir_all(parent).unwrap();
        }
        fs::write(path, contents).unwrap();
    }

    /// Fails with `what` unless `file` is part of `list`.
    #[track_caller]
    fn assert_listed(list: &FileList, file: &Path, what: &str) {
        assert!(
            list.files.iter().any(|f| f.as_path() == file),
            "{what}: {file:?}"
        );
    }

    /// Fails with `what` if `file` is part of `list`.
    #[track_caller]
    fn assert_not_listed(list: &FileList, file: &Path, what: &str) {
        assert!(
            !list.files.iter().any(|f| f.as_path() == file),
            "{what}: {file:?}"
        );
    }

    #[test]
    fn test_underscore_directory_traversal_unix_paths() {
        let temp_dir = TempDir::new().unwrap();
        let docs_packages = temp_dir.path().join("docs_packages");

        let test_file = docs_packages.join("hello_kitty").join("test.txt");
        write_file(&test_file, "test content with search term");

        let parent_file = docs_packages.join("parent.txt");
        write_file(&parent_file, "parent content");

        let file_list = get_file_list(temp_dir.path(), true, &[]).unwrap();

        assert_listed(
            &file_list,
            &test_file,
            "File in nested underscore directory should be found",
        );
        assert_listed(
            &file_list,
            &parent_file,
            "File in underscore directory should be found",
        );
    }

    #[test]
    fn test_underscore_directory_traversal_windows_style_paths() {
        let temp_dir = TempDir::new().unwrap();
        let ai_dir = temp_dir.path().join("C_drive").join("_ai");
        let docs_packages = ai_dir.join("docs").join("docs_packages");

        let test_file = docs_packages.join("helloKitty").join("dog.txt");
        write_file(&test_file, "bad kitty > dog.txt");

        let ai_dir_file = ai_dir.join("config.txt");
        write_file(&ai_dir_file, "ai configuration");

        let docs_packages_file = docs_packages.join("readme.md");
        write_file(&docs_packages_file, "documentation packages");

        let file_list = get_file_list(temp_dir.path(), true, &[]).unwrap();

        assert_listed(
            &file_list,
            &test_file,
            "File in deeply nested underscore directory should be found",
        );
        assert_listed(
            &file_list,
            &ai_dir_file,
            "File in _ai directory should be found",
        );
        assert_listed(
            &file_list,
            &docs_packages_file,
            "File in docs_packages directory should be found",
        );
    }

    #[test]
    fn test_underscore_directory_with_custom_ignores() {
        let temp_dir = TempDir::new().unwrap();
        let underscore_dir = temp_dir.path().join("test_packages").join("sub_dir");

        let test_file = underscore_dir.join("test.rs");
        write_file(&test_file, "fn test() {}");

        let ignored_file = underscore_dir.join("ignored.tmp");
        write_file(&ignored_file, "temporary content");

        // `*.tmp` is also part of the built-in ignore list, so on its own it
        // cannot show that `custom_ignores` is honoured. This extension is
        // excluded by the custom pattern only.
        let custom_only_file = underscore_dir.join("notes.probeskip");
        write_file(&custom_only_file, "custom ignored content");

        let custom_ignores = vec!["*.tmp".to_string(), "*.probeskip".to_string()];
        let file_list = get_file_list(temp_dir.path(), true, &custom_ignores).unwrap();

        assert_listed(
            &file_list,
            &test_file,
            "Rust file in underscore directory should be found",
        );
        assert_not_listed(&file_list, &ignored_file, "Ignored file should not be found");
        assert_not_listed(
            &file_list,
            &custom_only_file,
            "File matching a custom-only ignore pattern should not be found",
        );
    }

    #[test]
    fn test_multiple_underscore_patterns() {
        let temp_dir = TempDir::new().unwrap();

        let patterns = [
            "single_underscore",
            "multiple_under_scores",
            "_leading_underscore",
            "trailing_underscore_",
            "__double__underscore__",
            "mixed-dash_underscore",
        ];

        let expected_files: Vec<PathBuf> = patterns
            .iter()
            .map(|pattern| {
                let file = temp_dir.path().join(pattern).join("content.txt");
                write_file(&file, &format!("content in {pattern}"));
                file
            })
            .collect();

        let file_list = get_file_list(temp_dir.path(), true, &[]).unwrap();

        for expected_file in &expected_files {
            assert_listed(
                &file_list,
                expected_file,
                "File in underscore directory should be found",
            );
        }
    }

    #[test]
    fn test_underscore_directories_respect_gitignore_patterns() {
        let temp_dir = TempDir::new().unwrap();

        let node_file = temp_dir
            .path()
            .join("project_dir")
            .join("node_modules")
            .join("package.js");
        write_file(&node_file, "module content");

        let target_file = temp_dir
            .path()
            .join("rust_project")
            .join("target")
            .join("binary");
        write_file(&target_file, "binary content");

        let valid_file = temp_dir
            .path()
            .join("valid_project")
            .join("src_files")
            .join("main.rs");
        write_file(&valid_file, "fn main() {}");

        let file_list = get_file_list(temp_dir.path(), true, &[]).unwrap();

        assert_not_listed(
            &file_list,
            &node_file,
            "Files in node_modules should be ignored",
        );
        assert_not_listed(
            &file_list,
            &target_file,
            "Files in target directory should be ignored",
        );
        assert_listed(
            &file_list,
            &valid_file,
            "Files in valid underscore directories should be found",
        );
    }
}