ricecoder_research/
codebase_scanner.rs

1//! Codebase scanning and file discovery
2
3use crate::error::ResearchError;
4use crate::models::{Framework, Language};
5use ignore::WalkBuilder;
6use std::collections::HashSet;
7use std::path::{Path, PathBuf};
8
9/// File metadata extracted during scanning
10#[derive(Debug, Clone)]
11pub struct FileMetadata {
12    /// Path to the file
13    pub path: PathBuf,
14    /// File language
15    pub language: Option<Language>,
16    /// File size in bytes
17    pub size: u64,
18    /// Whether the file is a test file
19    pub is_test: bool,
20}
21
22/// Result of codebase scanning
23#[derive(Debug, Clone)]
24pub struct ScanResult {
25    /// All files found in the codebase
26    pub files: Vec<FileMetadata>,
27    /// Languages detected in the codebase
28    pub languages: Vec<Language>,
29    /// Frameworks detected in the codebase
30    pub frameworks: Vec<Framework>,
31    /// Source directories
32    pub source_dirs: Vec<PathBuf>,
33    /// Test directories
34    pub test_dirs: Vec<PathBuf>,
35}
36
/// Stateless entry point for discovering the files of a codebase and
/// extracting their metadata
pub struct CodebaseScanner;
39
40impl CodebaseScanner {
41    /// Scan a project directory and extract file metadata
42    ///
43    /// # Arguments
44    /// * `root` - Root directory of the project
45    ///
46    /// # Returns
47    /// A `ScanResult` containing all discovered files and metadata
48    pub fn scan(root: &Path) -> Result<ScanResult, ResearchError> {
49        if !root.exists() {
50            return Err(ResearchError::ProjectNotFound {
51                path: root.to_path_buf(),
52                reason: "Cannot scan codebase: root directory does not exist".to_string(),
53            });
54        }
55
56        let mut files = Vec::new();
57        let mut languages = HashSet::new();
58        let mut source_dirs = HashSet::new();
59        let mut test_dirs = HashSet::new();
60
61        // Use ignore crate to respect .gitignore
62        let walker = WalkBuilder::new(root).hidden(true).git_ignore(true).build();
63
64        for entry in walker {
65            let entry = match entry {
66                Ok(e) => e,
67                Err(_) => continue,
68            };
69
70            let path = entry.path();
71
72            // Skip directories
73            if path.is_dir() {
74                continue;
75            }
76
77            // Extract language from file extension
78            let language = Self::detect_language(path);
79            let is_test = Self::is_test_file(path);
80
81            // Track source and test directories
82            if let Some(parent) = path.parent() {
83                if is_test {
84                    test_dirs.insert(parent.to_path_buf());
85                } else if language.is_some() {
86                    source_dirs.insert(parent.to_path_buf());
87                }
88            }
89
90            if let Ok(metadata) = std::fs::metadata(path) {
91                files.push(FileMetadata {
92                    path: path.to_path_buf(),
93                    language: language.clone(),
94                    size: metadata.len(),
95                    is_test,
96                });
97
98                if let Some(lang) = language {
99                    languages.insert(lang);
100                }
101            }
102        }
103
104        // Convert HashSets to Vecs
105        let mut languages_vec: Vec<Language> = languages.into_iter().collect();
106        languages_vec.sort_by(|a, b| format!("{:?}", a).cmp(&format!("{:?}", b)));
107
108        let mut source_dirs_vec: Vec<PathBuf> = source_dirs.into_iter().collect();
109        source_dirs_vec.sort();
110
111        let mut test_dirs_vec: Vec<PathBuf> = test_dirs.into_iter().collect();
112        test_dirs_vec.sort();
113
114        Ok(ScanResult {
115            files,
116            languages: languages_vec,
117            frameworks: Vec::new(), // Will be populated by other components
118            source_dirs: source_dirs_vec,
119            test_dirs: test_dirs_vec,
120        })
121    }
122
123    /// Detect the language of a file based on its extension
124    fn detect_language(path: &Path) -> Option<Language> {
125        let extension = path.extension()?.to_str()?;
126
127        match extension {
128            "rs" => Some(Language::Rust),
129            "ts" | "tsx" | "js" | "jsx" => Some(Language::TypeScript),
130            "py" => Some(Language::Python),
131            "go" => Some(Language::Go),
132            "java" => Some(Language::Java),
133            "kt" | "kts" => Some(Language::Kotlin),
134            "cs" => Some(Language::CSharp),
135            "php" => Some(Language::Php),
136            "rb" => Some(Language::Ruby),
137            "swift" => Some(Language::Swift),
138            "dart" => Some(Language::Dart),
139            _ => None,
140        }
141    }
142
143    /// Check if a file is a test file
144    fn is_test_file(path: &Path) -> bool {
145        // Check for common test directory patterns
146        for component in path.components() {
147            if let std::path::Component::Normal(name) = component {
148                let name_str = name.to_string_lossy();
149                if name_str == "tests" || name_str == "test" || name_str == "__tests__" {
150                    return true;
151                }
152            }
153        }
154
155        // Check for test file naming patterns
156        let file_name = path.file_name().unwrap_or_default().to_string_lossy();
157        file_name.ends_with("_test.rs")
158            || file_name.ends_with(".test.ts")
159            || file_name.ends_with(".test.js")
160            || file_name.ends_with("_test.py")
161            || file_name.ends_with("_test.go")
162            || file_name.ends_with("Test.java")
163            || file_name.ends_with("Test.kt")
164            || file_name.ends_with("Tests.cs")
165            || file_name.ends_with("_test.rb")
166            || file_name.ends_with("Tests.swift")
167    }
168}
169
#[cfg(test)]
mod tests {
    use super::*;
    use std::fs;
    use tempfile::TempDir;

    // --- detect_language -------------------------------------------------

    #[test]
    fn test_detect_language_rust() {
        let detected = CodebaseScanner::detect_language(Path::new("main.rs"));
        assert_eq!(detected, Some(Language::Rust));
    }

    #[test]
    fn test_detect_language_typescript() {
        let detected = CodebaseScanner::detect_language(Path::new("main.ts"));
        assert_eq!(detected, Some(Language::TypeScript));
    }

    #[test]
    fn test_detect_language_python() {
        let detected = CodebaseScanner::detect_language(Path::new("main.py"));
        assert_eq!(detected, Some(Language::Python));
    }

    #[test]
    fn test_detect_language_unknown() {
        // Markdown is not a recognised source language.
        let detected = CodebaseScanner::detect_language(Path::new("README.md"));
        assert!(detected.is_none());
    }

    // --- is_test_file ----------------------------------------------------

    #[test]
    fn test_is_test_file_rust() {
        assert!(CodebaseScanner::is_test_file(Path::new("src/lib_test.rs")));
    }

    #[test]
    fn test_is_test_file_typescript() {
        assert!(CodebaseScanner::is_test_file(Path::new("src/main.test.ts")));
    }

    #[test]
    fn test_is_test_file_directory() {
        // Anything below a `tests` directory counts, whatever its name.
        assert!(CodebaseScanner::is_test_file(Path::new(
            "tests/integration.rs"
        )));
    }

    #[test]
    fn test_is_test_file_not_test() {
        assert!(!CodebaseScanner::is_test_file(Path::new("src/main.rs")));
    }

    // --- scan ------------------------------------------------------------

    #[test]
    fn test_scan_simple_project() -> Result<(), Box<dyn std::error::Error>> {
        let temp_dir = TempDir::new()?;
        let root = temp_dir.path();

        // Lay out a tiny project: two source files and one integration test.
        let fixture = [
            ("src/main.rs", "fn main() {}"),
            ("src/lib.rs", "pub fn lib() {}"),
            ("tests/integration_test.rs", "#[test]\nfn test() {}"),
        ];
        for (relative, contents) in fixture {
            let target = root.join(relative);
            if let Some(dir) = target.parent() {
                fs::create_dir_all(dir)?;
            }
            fs::write(&target, contents)?;
        }

        let result = CodebaseScanner::scan(root)?;

        assert_eq!(result.files.len(), 3);
        assert!(result.languages.contains(&Language::Rust));
        assert!(!result.source_dirs.is_empty());
        assert!(!result.test_dirs.is_empty());

        Ok(())
    }

    #[test]
    fn test_scan_nonexistent_directory() {
        let outcome = CodebaseScanner::scan(Path::new("/nonexistent/path"));
        assert!(outcome.is_err());
    }
}
263}