probe_code/search/
file_list_cache.rs

1use anyhow::Result;
2use ignore::WalkBuilder;
3use lazy_static::lazy_static;
4use probe_code::search::tokenization;
5use std::collections::{HashMap, HashSet};
6use std::path::{Path, PathBuf};
7use std::sync::{Arc, RwLock};
8use std::time::Instant;
9
/// Snapshot of the files discovered under one directory.
///
/// Values are produced by `build_file_list` and by the language filter in
/// `get_file_list_by_language`, and are handed out wrapped in an `Arc`.
#[derive(Debug, Clone)]
pub struct FileList {
    /// Paths of the regular files found by the directory walk, after the
    /// ignore patterns have been applied.
    pub files: Vec<PathBuf>,
    /// Moment at which this list was constructed.
    // The field is only written, never read, in the visible part of this
    // file, which is why the dead-code lint is silenced here.
    #[allow(dead_code)]
    pub created_at: Instant,
}
19
// Process-wide, in-memory cache of file lists.
//
// Keys are the strings produced by `generate_cache_key`; values are shared
// `Arc<FileList>` handles, so a cache hit only clones a pointer.
// The visible code only ever inserts entries — no eviction or invalidation
// is performed here.
lazy_static! {
    static ref FILE_LIST_CACHE: RwLock<HashMap<String, Arc<FileList>>> =
        RwLock::new(HashMap::new());
}
25
/// Render a duration for log output.
///
/// Durations shorter than one second are printed as whole milliseconds
/// (for example `250ms`); everything else is printed as seconds with two
/// decimal places (for example `1.50s`).
fn format_duration(duration: std::time::Duration) -> String {
    let millis = duration.as_millis();
    if millis >= 1000 {
        return format!("{:.2}s", duration.as_secs_f64());
    }
    format!("{millis}ms")
}
36
/// Build the cache key identifying one (directory, options) combination.
///
/// The key has the shape `<path>_<with_tests|no_tests>_<ignores part>`, where
/// the ignores part is `no_ignores` for an empty pattern list and
/// `ignores_<hex hash>` otherwise.
///
/// The hash is a simple multiplicative (x31) rolling hash over the pattern
/// bytes. A terminator byte is mixed in after every pattern so that pattern
/// boundaries contribute to the hash: without it, `["ab", "c"]` and
/// `["a", "bc"]` feed the same byte stream into the hash and would end up
/// sharing one cache entry (likewise `[""]` and `["", ""]`).
fn generate_cache_key(path: &Path, allow_tests: bool, custom_ignores: &[String]) -> String {
    // 0xFF never occurs inside UTF-8 text, so it cannot be mistaken for
    // pattern content.
    const PATTERN_TERMINATOR: u8 = 0xFF;

    let path_str = path.to_string_lossy();
    let allow_tests_str = if allow_tests {
        "with_tests"
    } else {
        "no_tests"
    };

    let ignores_part = if custom_ignores.is_empty() {
        "no_ignores".to_string()
    } else {
        let hash = custom_ignores
            .iter()
            .flat_map(|pattern| pattern.bytes().chain(std::iter::once(PATTERN_TERMINATOR)))
            .fold(0u64, |acc, byte| {
                acc.wrapping_mul(31).wrapping_add(u64::from(byte))
            });
        format!("ignores_{hash:x}")
    };

    format!("{path_str}_{allow_tests_str}_{ignores_part}")
}
63
64/// Get a list of files in a directory, respecting ignore patterns and test file exclusions.
65/// This function will use a cached list if available, or build and cache a new list if not.
66pub fn get_file_list(
67    path: &Path,
68    allow_tests: bool,
69    custom_ignores: &[String],
70) -> Result<Arc<FileList>> {
71    let debug_mode = std::env::var("DEBUG").unwrap_or_default() == "1";
72    let start_time = Instant::now();
73
74    if debug_mode {
75        println!("DEBUG: Getting file list for path: {path:?}");
76        println!("DEBUG: allow_tests: {allow_tests}");
77        println!("DEBUG: custom_ignores: {custom_ignores:?}");
78    }
79
80    // Create a cache key for this request
81    let cache_key = generate_cache_key(path, allow_tests, custom_ignores);
82
83    // Check if we have this file list in the cache
84    {
85        let cache = FILE_LIST_CACHE.read().unwrap();
86        if let Some(file_list) = cache.get(&cache_key) {
87            let elapsed = start_time.elapsed();
88            if debug_mode {
89                println!(
90                    "DEBUG: Found file list in cache with {} files (retrieved in {})",
91                    file_list.files.len(),
92                    format_duration(elapsed)
93                );
94            }
95            return Ok(Arc::clone(file_list));
96        }
97    }
98
99    // If not in cache, build the file list
100    if debug_mode {
101        println!("DEBUG: File list not found in cache, building new list");
102    }
103
104    let file_list = build_file_list(path, allow_tests, custom_ignores)?;
105    let file_count = file_list.files.len();
106
107    // Cache the file list
108    let file_list = Arc::new(file_list);
109    {
110        let mut cache = FILE_LIST_CACHE.write().unwrap();
111        cache.insert(cache_key, Arc::clone(&file_list));
112    }
113
114    let elapsed = start_time.elapsed();
115    if debug_mode {
116        println!(
117            "DEBUG: Built and cached new file list with {} files in {}",
118            file_count,
119            format_duration(elapsed)
120        );
121    }
122
123    Ok(file_list)
124}
125
126/// Build a list of files in a directory, respecting ignore patterns and test file exclusions.
127fn build_file_list(path: &Path, allow_tests: bool, custom_ignores: &[String]) -> Result<FileList> {
128    let debug_mode = std::env::var("DEBUG").unwrap_or_default() == "1";
129    let start_time = Instant::now();
130
131    if debug_mode {
132        println!("DEBUG: Building file list for path: {path:?}");
133    }
134
135    // Create a WalkBuilder that respects .gitignore files and common ignore patterns
136    let builder_start = Instant::now();
137    let mut builder = WalkBuilder::new(path);
138
139    // Configure the builder
140    builder.git_ignore(true);
141    builder.git_global(true);
142    builder.git_exclude(true);
143
144    // Enable parallel walking for large directories
145    builder.threads(rayon::current_num_threads());
146
147    // Add common directories to ignore
148    let mut common_ignores: Vec<String> = vec![
149        "node_modules",
150        "vendor",
151        "target",
152        "dist",
153        "build",
154        ".git",
155        ".svn",
156        ".hg",
157        ".idea",
158        ".vscode",
159        "__pycache__",
160        "*.pyc",
161        "*.pyo",
162        "*.class",
163        "*.o",
164        "*.obj",
165        "*.a",
166        "*.lib",
167        "*.so",
168        "*.dylib",
169        "*.dll",
170        "*.exe",
171        "*.out",
172        "*.app",
173        "*.jar",
174        "*.war",
175        "*.ear",
176        "*.zip",
177        "*.tar.gz",
178        "*.rar",
179        "*.log",
180        "*.tmp",
181        "*.temp",
182        "*.swp",
183        "*.swo",
184        "*.bak",
185        "*.orig",
186        "*.DS_Store",
187        "Thumbs.db",
188        "*.yml",
189        "*.yaml",
190        "*.json",
191        "*.tconf",
192        "*.conf",
193        "go.sum",
194    ]
195    .into_iter()
196    .map(String::from)
197    .collect();
198
199    // Add test file patterns if allow_tests is false
200    if !allow_tests {
201        let test_patterns: Vec<String> = vec![
202            "*_test.rs",
203            "*_tests.rs",
204            "test_*.rs",
205            "tests.rs",
206            "*.spec.js",
207            "*.test.js",
208            "*.spec.ts",
209            "*.test.ts",
210            "*.spec.jsx",
211            "*.test.jsx",
212            "*.spec.tsx",
213            "*.test.tsx",
214            "test_*.py",
215            "*_test.go",
216            "test_*.c",
217            "*_test.c",
218            "*_test.cpp",
219            "*_test.cc",
220            "*_test.cxx",
221            "*Test.java",
222            "*_test.rb",
223            "test_*.rb",
224            "*_spec.rb",
225            "*Test.php",
226            "test_*.php",
227            "**/tests/**",
228            "**/test/**",
229            "**/__tests__/**",
230            "**/__test__/**",
231            "**/spec/**",
232            "**/specs/**",
233        ]
234        .into_iter()
235        .map(String::from)
236        .collect();
237        common_ignores.extend(test_patterns);
238    }
239
240    // Add custom ignore patterns to the common ignores
241    for pattern in custom_ignores {
242        common_ignores.push(pattern.clone());
243    }
244
245    // Create a single override builder for all ignore patterns
246    let mut override_builder = ignore::overrides::OverrideBuilder::new(path);
247
248    // Add all ignore patterns to the override builder
249    for pattern in &common_ignores {
250        if let Err(err) = override_builder.add(&format!("!{pattern}")) {
251            eprintln!("Error adding ignore pattern {pattern:?}: {err}");
252        }
253    }
254
255    // Build and apply the overrides
256    match override_builder.build() {
257        Ok(overrides) => {
258            builder.overrides(overrides);
259        }
260        Err(err) => {
261            eprintln!("Error building ignore overrides: {err}");
262        }
263    }
264
265    let builder_duration = builder_start.elapsed();
266
267    if debug_mode {
268        println!(
269            "DEBUG: Builder configuration completed in {}",
270            format_duration(builder_duration)
271        );
272    }
273
274    // Collect files
275    let walk_start = Instant::now();
276    let mut files = Vec::new();
277    let mut total_files = 0;
278
279    for result in builder.build() {
280        total_files += 1;
281        let entry = match result {
282            Ok(entry) => entry,
283            Err(err) => {
284                eprintln!("Error walking directory: {err}");
285                continue;
286            }
287        };
288
289        // Skip directories
290        if !entry.file_type().is_some_and(|ft| ft.is_file()) {
291            continue;
292        }
293
294        files.push(entry.path().to_path_buf());
295    }
296
297    let walk_duration = walk_start.elapsed();
298
299    if debug_mode {
300        println!(
301            "DEBUG: Directory walk completed in {} - Found {} files out of {} entries",
302            format_duration(walk_duration),
303            files.len(),
304            total_files
305        );
306    }
307
308    let total_duration = start_time.elapsed();
309
310    if debug_mode {
311        println!(
312            "DEBUG: Total file list building completed in {}",
313            format_duration(total_duration)
314        );
315    }
316
317    Ok(FileList {
318        files,
319        created_at: Instant::now(),
320    })
321}
322
323/// Find files whose names match query words
324/// Returns a map of file paths to the term indices that matched the filename
325pub fn find_matching_filenames(
326    path: &Path,
327    queries: &[String],
328    already_found_files: &HashSet<PathBuf>,
329    custom_ignores: &[String],
330    allow_tests: bool,
331    term_indices: &HashMap<String, usize>,
332    language: Option<&str>,
333) -> Result<HashMap<PathBuf, HashSet<usize>>> {
334    let debug_mode = std::env::var("DEBUG").unwrap_or_default() == "1";
335    let start_time = Instant::now();
336
337    if debug_mode {
338        println!("DEBUG: Finding files with matching filenames");
339        println!("DEBUG: Queries: {queries:?}");
340        println!(
341            "DEBUG: Already found files count: {}",
342            already_found_files.len()
343        );
344        println!("DEBUG: Term indices: {term_indices:?}");
345    }
346
347    // Get the cached file list, with language filtering if specified
348    let file_list = get_file_list_by_language(path, allow_tests, custom_ignores, language)?;
349
350    if debug_mode {
351        println!(
352            "DEBUG: Searching through {} files from cache",
353            file_list.files.len()
354        );
355    }
356
357    // Tokenize query terms for matching using the standard tokenizer
358    let query_tokens: Vec<String> = queries
359        .iter()
360        .flat_map(|q| tokenization::tokenize(q))
361        .collect();
362
363    if debug_mode {
364        println!("DEBUG: Query tokens for filename matching: {query_tokens:?}");
365    }
366
367    // Search each file for matching filenames
368    let mut matching_files = HashMap::new();
369
370    for file_path in &file_list.files {
371        // Skip if this file is already in the results
372        if already_found_files.contains(file_path) {
373            continue;
374        }
375
376        // Get the full relative path including directory structure
377        let relative_path = file_path.to_string_lossy().to_string();
378
379        // Tokenize the full relative path using the standard tokenizer
380        let filename_tokens = tokenization::tokenize(&relative_path);
381
382        if debug_mode && !filename_tokens.is_empty() {
383            println!("DEBUG: Path '{relative_path}' tokenized as: {filename_tokens:?}");
384        }
385        // Find which terms match the filename
386        let mut matched_terms = HashSet::new();
387
388        for (term, &idx) in term_indices {
389            let term_tokens = tokenization::tokenize(term);
390
391            // Check if any term token matches any filename token
392            let matched = term_tokens.iter().any(|term_token| {
393                filename_tokens.iter().any(|filename_token| {
394                    filename_token.contains(term_token) || term_token.contains(filename_token)
395                })
396            });
397
398            if matched {
399                matched_terms.insert(idx);
400                if debug_mode {
401                    println!(
402                        "DEBUG: Term '{term}' matched path '{relative_path}', adding index {idx}"
403                    );
404                }
405            }
406        }
407
408        // Only add the file if we found at least one matching term
409        if !matched_terms.is_empty() {
410            matching_files.insert(file_path.clone(), matched_terms);
411        }
412    }
413
414    let elapsed = start_time.elapsed();
415
416    if debug_mode {
417        println!(
418            "DEBUG: Found {} files with matching filenames in {}",
419            matching_files.len(),
420            format_duration(elapsed)
421        );
422    }
423
424    Ok(matching_files)
425}
426
/// Map a language name to the file extensions associated with it.
///
/// The lookup is case-insensitive. Each returned extension includes its
/// leading dot (for example `.rs`). An unrecognised language yields an empty
/// vector.
fn get_language_extensions(language: &str) -> Vec<String> {
    let extensions: &[&str] = match language.to_lowercase().as_str() {
        "rust" => &[".rs"],
        "javascript" => &[".js", ".jsx", ".mjs"],
        "typescript" => &[".ts", ".tsx"],
        "python" => &[".py", ".pyw", ".pyi"],
        "go" => &[".go"],
        "c" => &[".c", ".h"],
        "cpp" => &[".cpp", ".cc", ".cxx", ".hpp", ".hxx", ".h"],
        "java" => &[".java"],
        "ruby" => &[".rb", ".rake"],
        "php" => &[".php"],
        "swift" => &[".swift"],
        "csharp" => &[".cs"],
        // Unknown language: no extensions.
        _ => &[],
    };

    extensions.iter().map(|ext| (*ext).to_string()).collect()
}
452
453/// Get a list of files in a directory, filtered by language if specified
454pub fn get_file_list_by_language(
455    path: &Path,
456    allow_tests: bool,
457    custom_ignores: &[String],
458    language: Option<&str>,
459) -> Result<Arc<FileList>> {
460    // If no language is specified, use the regular get_file_list function
461    if language.is_none() {
462        return get_file_list(path, allow_tests, custom_ignores);
463    }
464
465    let debug_mode = std::env::var("DEBUG").unwrap_or_default() == "1";
466    let start_time = Instant::now();
467
468    if debug_mode {
469        println!("DEBUG: Getting file list for path: {path:?} with language filter: {language:?}");
470    }
471
472    // Get the full file list first
473    let full_file_list = get_file_list(path, allow_tests, custom_ignores)?;
474
475    // Get the extensions for the specified language
476    let extensions = get_language_extensions(language.unwrap());
477
478    if debug_mode {
479        println!("DEBUG: Filtering files by extensions: {extensions:?}");
480    }
481
482    // Filter the files by extension
483    let filtered_files = if extensions.is_empty() {
484        // If no extensions are defined for this language, return the full list
485        full_file_list.files.clone()
486    } else {
487        full_file_list
488            .files
489            .iter()
490            .filter(|file| {
491                if let Some(ext) = file.extension() {
492                    let ext_lossy = ext.to_string_lossy();
493                    let ext_str = format!(".{ext_lossy}");
494                    extensions.iter().any(|e| e == &ext_str)
495                } else {
496                    false
497                }
498            })
499            .cloned()
500            .collect()
501    };
502
503    let elapsed = start_time.elapsed();
504    if debug_mode {
505        println!(
506            "DEBUG: Filtered file list by language in {} - Found {} files out of {}",
507            format_duration(elapsed),
508            filtered_files.len(),
509            full_file_list.files.len()
510        );
511    }
512
513    // Create a new FileList with the filtered files
514    Ok(Arc::new(FileList {
515        files: filtered_files,
516        created_at: Instant::now(),
517    }))
518}
519
#[cfg(test)]
mod tests {
    use super::*;
    use std::fs;
    use std::path::{Path, PathBuf};
    use tempfile::TempDir;

    /// Ensure `dir` exists, write `contents` to `dir/name`, and return the file's path.
    fn write_file_in(dir: &Path, name: &str, contents: &str) -> PathBuf {
        fs::create_dir_all(dir).unwrap();
        let file = dir.join(name);
        fs::write(&file, contents).unwrap();
        file
    }

    /// Whether `file` appears in `list`.
    fn is_listed(list: &FileList, file: &Path) -> bool {
        list.files.iter().any(|f| f.as_path() == file)
    }

    /// Files inside (nested) directories whose names contain underscores are listed.
    #[test]
    fn test_underscore_directory_traversal_unix_paths() {
        let temp_dir = TempDir::new().unwrap();
        let root = temp_dir.path();

        let packages_dir = root.join("docs_packages");
        let test_file = write_file_in(
            &packages_dir.join("hello_kitty"),
            "test.txt",
            "test content with search term",
        );
        let parent_file = write_file_in(&packages_dir, "parent.txt", "parent content");

        let file_list = get_file_list(root, true, &[]).unwrap();

        assert!(
            is_listed(&file_list, &test_file),
            "File in nested underscore directory should be found: {test_file:?}"
        );
        assert!(
            is_listed(&file_list, &parent_file),
            "File in underscore directory should be found: {parent_file:?}"
        );
    }

    /// A deep layout resembling a Windows drive path with underscore components is traversed.
    #[test]
    fn test_underscore_directory_traversal_windows_style_paths() {
        let temp_dir = TempDir::new().unwrap();
        let root = temp_dir.path();

        let ai_dir = root.join("C_drive").join("_ai");
        let docs_packages_dir = ai_dir.join("docs").join("docs_packages");

        let test_file = write_file_in(
            &docs_packages_dir.join("helloKitty"),
            "dog.txt",
            "bad kitty > dog.txt",
        );
        let ai_dir_file = write_file_in(&ai_dir, "config.txt", "ai configuration");
        let docs_packages_file =
            write_file_in(&docs_packages_dir, "readme.md", "documentation packages");

        let file_list = get_file_list(root, true, &[]).unwrap();

        assert!(
            is_listed(&file_list, &test_file),
            "File in deeply nested underscore directory should be found: {test_file:?}"
        );
        assert!(
            is_listed(&file_list, &ai_dir_file),
            "File in _ai directory should be found: {ai_dir_file:?}"
        );
        assert!(
            is_listed(&file_list, &docs_packages_file),
            "File in docs_packages directory should be found: {docs_packages_file:?}"
        );
    }

    /// Custom ignore patterns still apply inside underscore directories.
    #[test]
    fn test_underscore_directory_with_custom_ignores() {
        let temp_dir = TempDir::new().unwrap();
        let root = temp_dir.path();

        let underscore_dir = root.join("test_packages").join("sub_dir");
        let test_file = write_file_in(&underscore_dir, "test.rs", "fn test() {}");
        let ignored_file = write_file_in(&underscore_dir, "ignored.tmp", "temporary content");

        let custom_ignores = vec!["*.tmp".to_string()];
        let file_list = get_file_list(root, true, &custom_ignores).unwrap();

        assert!(
            is_listed(&file_list, &test_file),
            "Rust file in underscore directory should be found: {test_file:?}"
        );
        assert!(
            !is_listed(&file_list, &ignored_file),
            "Ignored file should not be found: {ignored_file:?}"
        );
    }

    /// Leading, trailing, doubled and mixed underscores in directory names are all handled.
    #[test]
    fn test_multiple_underscore_patterns() {
        let temp_dir = TempDir::new().unwrap();
        let root = temp_dir.path();

        let dir_names = [
            "single_underscore",
            "multiple_under_scores",
            "_leading_underscore",
            "trailing_underscore_",
            "__double__underscore__",
            "mixed-dash_underscore",
        ];

        let expected_files: Vec<PathBuf> = dir_names
            .iter()
            .map(|name| {
                write_file_in(
                    &root.join(name),
                    "content.txt",
                    &format!("content in {name}"),
                )
            })
            .collect();

        let file_list = get_file_list(root, true, &[]).unwrap();

        for expected_file in &expected_files {
            assert!(
                is_listed(&file_list, expected_file),
                "File in underscore directory should be found: {expected_file:?}"
            );
        }
    }

    /// Built-in ignores (`node_modules`, `target`) still win under underscore parents.
    #[test]
    fn test_underscore_directories_respect_gitignore_patterns() {
        let temp_dir = TempDir::new().unwrap();
        let root = temp_dir.path();

        // Should be excluded by the built-in patterns.
        let node_file = write_file_in(
            &root.join("project_dir").join("node_modules"),
            "package.js",
            "module content",
        );
        let target_file = write_file_in(
            &root.join("rust_project").join("target"),
            "binary",
            "binary content",
        );

        // Should be kept.
        let valid_file = write_file_in(
            &root.join("valid_project").join("src_files"),
            "main.rs",
            "fn main() {}",
        );

        let file_list = get_file_list(root, true, &[]).unwrap();

        assert!(
            !is_listed(&file_list, &node_file),
            "Files in node_modules should be ignored: {node_file:?}"
        );
        assert!(
            !is_listed(&file_list, &target_file),
            "Files in target directory should be ignored: {target_file:?}"
        );
        assert!(
            is_listed(&file_list, &valid_file),
            "Files in valid underscore directories should be found: {valid_file:?}"
        );
    }
}