codeprysm_core/
discovery.rs

1//! Root Discovery Module
2//!
3//! Discovers git repositories and code directories under a workspace root.
4//! Used for multi-root workspace support.
5
6use std::collections::HashSet;
7use std::path::{Path, PathBuf};
8
9use thiserror::Error;
10use tracing::{debug, info, warn};
11use walkdir::WalkDir;
12
13use crate::parser::SupportedLanguage;
14
/// Errors during root discovery
#[derive(Debug, Error)]
pub enum DiscoveryError {
    /// The workspace root passed to discovery could not be resolved.
    /// Carries the path exactly as the caller supplied it.
    #[error("Root path does not exist: {0}")]
    RootNotFound(PathBuf),

    /// An underlying I/O operation failed. Any `std::io::Error` converts
    /// into this variant automatically via `#[from]`.
    #[error("IO error: {0}")]
    Io(#[from] std::io::Error),

    /// Discovery completed without finding a git repository or a code
    /// directory. Carries the root path that was searched.
    #[error("No code roots found under {0}")]
    NoRootsFound(PathBuf),
}
27
/// Result alias used throughout this module; the error type is fixed to
/// [`DiscoveryError`].
pub type Result<T> = std::result::Result<T, DiscoveryError>;
29
/// Classification of a discovered root.
#[derive(Debug, Clone, PartialEq, Eq)]
pub enum RootType {
    /// A git repository. Each metadata field is optional and is `None` when
    /// the corresponding value is not available.
    GitRepository {
        remote: Option<String>,
        branch: Option<String>,
        commit: Option<String>,
    },
    /// A directory that holds source files but has no `.git` entry.
    CodeDirectory,
}

impl RootType {
    /// Returns `true` for [`RootType::GitRepository`] and `false` otherwise.
    pub fn is_git(&self) -> bool {
        match self {
            RootType::GitRepository { .. } => true,
            RootType::CodeDirectory => false,
        }
    }
}
49
50/// A discovered code root
51#[derive(Debug, Clone)]
52pub struct DiscoveredRoot {
53    /// Absolute path to the root
54    pub path: PathBuf,
55    /// Relative path from workspace root
56    pub relative_path: String,
57    /// Type of root (git repo or code directory)
58    pub root_type: RootType,
59    /// Name derived from directory name
60    pub name: String,
61}
62
63impl DiscoveredRoot {
64    /// Check if this is a git repository
65    pub fn is_git(&self) -> bool {
66        self.root_type.is_git()
67    }
68}
69
/// Settings that control how roots are discovered.
#[derive(Debug, Clone)]
pub struct DiscoveryConfig {
    /// How many directory levels below the workspace root are searched
    pub max_depth: usize,
    /// Directory names that are never descended into
    pub exclude_dirs: HashSet<String>,
    /// When `true`, directories with source files but no `.git` also count as roots
    pub include_code_dirs: bool,
}

impl Default for DiscoveryConfig {
    /// Builds the default configuration: depth 3, code directories enabled,
    /// and a built-in list of excluded directory names.
    fn default() -> Self {
        // Directory names skipped by default.
        const DEFAULT_EXCLUDES: [&str; 19] = [
            "node_modules",
            "target",
            "build",
            "dist",
            "__pycache__",
            ".venv",
            "venv",
            ".idea",
            ".vscode",
            "vendor",
            "bin",
            "obj",
            ".tox",
            ".mypy_cache",
            ".pytest_cache",
            ".coverage",
            "coverage",
            ".next",
            ".nuxt",
        ];

        Self {
            max_depth: 3,
            exclude_dirs: DEFAULT_EXCLUDES.iter().copied().map(String::from).collect(),
            include_code_dirs: true,
        }
    }
}
115
116/// Root discovery service
117pub struct RootDiscovery {
118    config: DiscoveryConfig,
119}
120
121impl Default for RootDiscovery {
122    fn default() -> Self {
123        Self::with_defaults()
124    }
125}
126
127impl RootDiscovery {
128    /// Create a new RootDiscovery with custom configuration
129    pub fn new(config: DiscoveryConfig) -> Self {
130        Self { config }
131    }
132
133    /// Create a new RootDiscovery with default configuration
134    pub fn with_defaults() -> Self {
135        Self::new(DiscoveryConfig::default())
136    }
137
138    /// Create with a custom max depth
139    pub fn with_max_depth(mut self, max_depth: usize) -> Self {
140        self.config.max_depth = max_depth;
141        self
142    }
143
144    /// Discover all code roots under the given path
145    ///
146    /// Returns a list of discovered roots. If the root path itself is a git repo
147    /// or contains source files, it will be the only root returned.
148    pub fn discover(&self, root_path: &Path) -> Result<Vec<DiscoveredRoot>> {
149        let root_path = root_path
150            .canonicalize()
151            .map_err(|_| DiscoveryError::RootNotFound(root_path.to_path_buf()))?;
152
153        info!("Discovering code roots under {:?}", root_path);
154
155        // Check if root itself is a git repo
156        if self.is_git_repo(&root_path) {
157            info!("Root is a git repository");
158            return Ok(vec![self.create_discovered_root(&root_path, &root_path)?]);
159        }
160
161        // Check if root has source files but no subdirectories to search
162        if self.has_source_files(&root_path) && !self.has_discoverable_subdirs(&root_path) {
163            info!("Root is a code directory");
164            return Ok(vec![self.create_discovered_root(&root_path, &root_path)?]);
165        }
166
167        let mut roots = Vec::new();
168        let mut discovered_paths: HashSet<PathBuf> = HashSet::new();
169
170        // Walk the directory tree looking for git repos and code directories
171        for entry in WalkDir::new(&root_path)
172            .max_depth(self.config.max_depth)
173            .into_iter()
174            .filter_entry(|e| {
175                if !e.file_type().is_dir() {
176                    return true;
177                }
178                // Always allow the root entry (depth 0) to be traversed
179                // This handles temp directories with names like ".tmpXXXXXX"
180                if e.depth() == 0 {
181                    return true;
182                }
183                let name = e.file_name().to_string_lossy();
184                // Skip hidden directories and excluded directories
185                !name.starts_with('.') && !self.config.exclude_dirs.contains(name.as_ref())
186            })
187        {
188            let entry = match entry {
189                Ok(e) => e,
190                Err(e) => {
191                    warn!("Error walking directory: {}", e);
192                    continue;
193                }
194            };
195
196            if !entry.file_type().is_dir() {
197                continue;
198            }
199
200            let path = entry.path();
201
202            // Skip the root itself (we already checked it)
203            if path == root_path {
204                continue;
205            }
206
207            // Skip if this path is under an already-discovered root
208            if discovered_paths.iter().any(|p| path.starts_with(p)) {
209                continue;
210            }
211
212            // Check if this is a git repo
213            if self.is_git_repo(path) {
214                debug!("Found git repository: {:?}", path);
215                if let Ok(root) = self.create_discovered_root(path, &root_path) {
216                    discovered_paths.insert(path.to_path_buf());
217                    roots.push(root);
218                }
219                continue;
220            }
221
222            // Check for code directories only if enabled and at appropriate depth
223            if self.config.include_code_dirs {
224                // Only consider as a code dir if it has source files
225                // and is not nested under another potential code dir
226                let is_nested = roots.iter().any(|r| path.starts_with(&r.path));
227                if !is_nested && self.has_source_files(path) {
228                    // Don't add intermediate directories as code dirs if they have
229                    // subdirectories that might be git repos
230                    if !self.has_git_subdirs(path) {
231                        debug!("Found code directory: {:?}", path);
232                        if let Ok(root) = self.create_discovered_root(path, &root_path) {
233                            roots.push(root);
234                        }
235                    }
236                }
237            }
238        }
239
240        // If no roots found and root has source files, treat root as code directory
241        if roots.is_empty() && self.has_source_files(&root_path) {
242            info!("No sub-roots found, treating root as code directory");
243            roots.push(self.create_discovered_root(&root_path, &root_path)?);
244        }
245
246        if roots.is_empty() {
247            return Err(DiscoveryError::NoRootsFound(root_path));
248        }
249
250        // Sort roots by path for deterministic ordering
251        roots.sort_by(|a, b| a.path.cmp(&b.path));
252
253        info!("Discovered {} code root(s)", roots.len());
254        for root in &roots {
255            info!(
256                "  - {} ({:?}) at {}",
257                root.name,
258                if root.is_git() { "git" } else { "code" },
259                root.relative_path
260            );
261        }
262
263        Ok(roots)
264    }
265
266    /// Check if a directory is a git repository
267    fn is_git_repo(&self, path: &Path) -> bool {
268        path.join(".git").exists()
269    }
270
271    /// Check if a directory has any discoverable subdirectories
272    fn has_discoverable_subdirs(&self, path: &Path) -> bool {
273        if let Ok(entries) = std::fs::read_dir(path) {
274            for entry in entries.flatten() {
275                if entry.file_type().map(|t| t.is_dir()).unwrap_or(false) {
276                    let name = entry.file_name().to_string_lossy().to_string();
277                    if !name.starts_with('.') && !self.config.exclude_dirs.contains(&name) {
278                        return true;
279                    }
280                }
281            }
282        }
283        false
284    }
285
286    /// Check if a directory has any git repos in its subdirectories
287    fn has_git_subdirs(&self, path: &Path) -> bool {
288        if let Ok(entries) = std::fs::read_dir(path) {
289            for entry in entries.flatten() {
290                if entry.file_type().map(|t| t.is_dir()).unwrap_or(false) {
291                    let subpath = entry.path();
292                    if self.is_git_repo(&subpath) {
293                        return true;
294                    }
295                }
296            }
297        }
298        false
299    }
300
301    /// Check if a directory directly contains supported source files
302    fn has_source_files(&self, path: &Path) -> bool {
303        if let Ok(entries) = std::fs::read_dir(path) {
304            for entry in entries.flatten() {
305                let entry_path = entry.path();
306                if entry_path.is_file() && SupportedLanguage::from_path(&entry_path).is_some() {
307                    return true;
308                }
309            }
310        }
311        false
312    }
313
314    /// Create a DiscoveredRoot from a path
315    fn create_discovered_root(&self, path: &Path, root_path: &Path) -> Result<DiscoveredRoot> {
316        let relative_path = path
317            .strip_prefix(root_path)
318            .map(|p| {
319                let s = p.to_string_lossy().to_string();
320                if s.is_empty() { ".".to_string() } else { s }
321            })
322            .unwrap_or_else(|_| ".".to_string());
323
324        let name = path
325            .file_name()
326            .map(|n| n.to_string_lossy().to_string())
327            .unwrap_or_else(|| {
328                // For root path, try to get the directory name
329                root_path
330                    .file_name()
331                    .map(|n| n.to_string_lossy().to_string())
332                    .unwrap_or_else(|| "root".to_string())
333            });
334
335        let root_type = if self.is_git_repo(path) {
336            let git_info = extract_git_metadata(path);
337            RootType::GitRepository {
338                remote: git_info.0,
339                branch: git_info.1,
340                commit: git_info.2,
341            }
342        } else {
343            RootType::CodeDirectory
344        };
345
346        Ok(DiscoveredRoot {
347            path: path.to_path_buf(),
348            relative_path,
349            root_type,
350            name,
351        })
352    }
353}
354
/// Run `git` with `args` inside `repo_path` and return its trimmed stdout.
///
/// Returns `None` when the process cannot be started, exits unsuccessfully,
/// or prints nothing but whitespace.
fn git_output(repo_path: &Path, args: &[&str]) -> Option<String> {
    let output = std::process::Command::new("git")
        .args(args)
        .current_dir(repo_path)
        .output()
        .ok()?;

    if !output.status.success() {
        return None;
    }

    let text = String::from_utf8_lossy(&output.stdout).trim().to_string();
    if text.is_empty() {
        None
    } else {
        Some(text)
    }
}

/// Extract git metadata from a repository.
///
/// Returns `(remote, branch, commit)`:
///
/// * `remote` - output of `git remote get-url origin`
/// * `branch` - output of `git rev-parse --abbrev-ref HEAD`
/// * `commit` - output of `git rev-parse HEAD`
///
/// Extraction is best effort: each element is `None` when its command
/// cannot be run, fails, or prints nothing. If `repo_path` has no `.git`
/// entry, no command is run and all three elements are `None`.
fn extract_git_metadata(repo_path: &Path) -> (Option<String>, Option<String>, Option<String>) {
    if !repo_path.join(".git").exists() {
        return (None, None, None);
    }

    let remote = git_output(repo_path, &["remote", "get-url", "origin"]);
    // NOTE(review): presumably this yields the literal "HEAD" rather than a
    // branch name when the checkout is detached - confirm whether callers
    // expect that.
    let branch = git_output(repo_path, &["rev-parse", "--abbrev-ref", "HEAD"]);
    let commit = git_output(repo_path, &["rev-parse", "HEAD"]);

    (remote, branch, commit)
}
394
// Unit tests for root discovery. Each filesystem test builds its fixture
// inside a fresh temporary directory.
#[cfg(test)]
mod tests {
    use super::*;
    use tempfile::TempDir;

    // The default configuration uses depth 3, excludes common dependency and
    // build directories, and has code directories enabled.
    #[test]
    fn test_discovery_config_defaults() {
        let config = DiscoveryConfig::default();
        assert_eq!(config.max_depth, 3);
        assert!(config.exclude_dirs.contains("node_modules"));
        assert!(config.exclude_dirs.contains("target"));
        assert!(config.include_code_dirs);
    }

    // A directory counts as a git repo as soon as it contains `.git`.
    #[test]
    fn test_is_git_repo() {
        let temp = TempDir::new().unwrap();
        let discovery = RootDiscovery::with_defaults();

        assert!(!discovery.is_git_repo(temp.path()));

        std::fs::create_dir(temp.path().join(".git")).unwrap();
        assert!(discovery.is_git_repo(temp.path()));
    }

    // Only files with a supported language extension count as source files.
    #[test]
    fn test_has_source_files() {
        let temp = TempDir::new().unwrap();
        let discovery = RootDiscovery::with_defaults();

        // Empty directory
        assert!(!discovery.has_source_files(temp.path()));

        // Add a non-source file
        std::fs::write(temp.path().join("readme.txt"), "hello").unwrap();
        assert!(!discovery.has_source_files(temp.path()));

        // Add a source file
        std::fs::write(temp.path().join("main.py"), "print('hello')").unwrap();
        assert!(discovery.has_source_files(temp.path()));
    }

    // When the workspace root is itself a git repo it is the single result,
    // reported with relative path ".".
    #[test]
    fn test_discover_single_git_repo() {
        let temp = TempDir::new().unwrap();

        // Create a git repo
        std::fs::create_dir(temp.path().join(".git")).unwrap();
        std::fs::write(temp.path().join("main.py"), "print('hello')").unwrap();

        let discovery = RootDiscovery::with_defaults();
        let roots = discovery.discover(temp.path()).unwrap();

        assert_eq!(roots.len(), 1);
        assert!(roots[0].is_git());
        assert_eq!(roots[0].relative_path, ".");
    }

    // Sibling git repos below a non-repo root are each reported.
    #[test]
    fn test_discover_multiple_git_repos() {
        let temp = TempDir::new().unwrap();

        // Create two git repos
        let repo_a = temp.path().join("repo-a");
        let repo_b = temp.path().join("repo-b");

        std::fs::create_dir_all(repo_a.join(".git")).unwrap();
        std::fs::write(repo_a.join("main.py"), "# repo a").unwrap();

        std::fs::create_dir_all(repo_b.join(".git")).unwrap();
        std::fs::write(repo_b.join("main.rs"), "fn main() {}").unwrap();

        let discovery = RootDiscovery::with_defaults();
        let roots = discovery.discover(temp.path()).unwrap();

        assert_eq!(roots.len(), 2);
        assert!(roots.iter().any(|r| r.name == "repo-a"));
        assert!(roots.iter().any(|r| r.name == "repo-b"));
    }

    // A root with source files, no `.git`, and no subdirectories is reported
    // as a code directory.
    #[test]
    fn test_discover_code_directory() {
        let temp = TempDir::new().unwrap();

        // Create a code directory (no .git)
        std::fs::write(temp.path().join("main.py"), "print('hello')").unwrap();

        let discovery = RootDiscovery::with_defaults();
        let roots = discovery.discover(temp.path()).unwrap();

        assert_eq!(roots.len(), 1);
        assert!(!roots[0].is_git());
        assert_eq!(roots[0].root_type, RootType::CodeDirectory);
    }

    // A git repo and a plain code directory side by side are both reported,
    // each with the right type.
    #[test]
    fn test_discover_mixed_roots() {
        let temp = TempDir::new().unwrap();

        // Git repo
        let git_repo = temp.path().join("git-project");
        std::fs::create_dir_all(git_repo.join(".git")).unwrap();
        std::fs::write(git_repo.join("main.py"), "# git project").unwrap();

        // Code directory
        let code_dir = temp.path().join("scripts");
        std::fs::create_dir_all(&code_dir).unwrap();
        std::fs::write(code_dir.join("util.py"), "# utilities").unwrap();

        let discovery = RootDiscovery::with_defaults();
        let roots = discovery.discover(temp.path()).unwrap();

        assert_eq!(roots.len(), 2);

        let git_root = roots.iter().find(|r| r.name == "git-project").unwrap();
        assert!(git_root.is_git());

        let code_root = roots.iter().find(|r| r.name == "scripts").unwrap();
        assert!(!code_root.is_git());
    }

    // When the root is a git repo, discovery stops there and repos nested
    // inside it are not reported separately.
    #[test]
    fn test_discover_skips_nested_repos() {
        let temp = TempDir::new().unwrap();

        // Parent git repo
        std::fs::create_dir(temp.path().join(".git")).unwrap();
        std::fs::write(temp.path().join("main.py"), "# parent").unwrap();

        // Nested git repo (should be skipped)
        let nested = temp.path().join("nested");
        std::fs::create_dir_all(nested.join(".git")).unwrap();
        std::fs::write(nested.join("lib.py"), "# nested").unwrap();

        let discovery = RootDiscovery::with_defaults();
        let roots = discovery.discover(temp.path()).unwrap();

        // Only parent should be discovered since root is a git repo
        assert_eq!(roots.len(), 1);
        assert_eq!(roots[0].relative_path, ".");
    }

    // Source files under an excluded directory name do not produce a root;
    // only the workspace root itself is reported.
    #[test]
    fn test_discover_skips_excluded_dirs() {
        let temp = TempDir::new().unwrap();

        // Create node_modules with source files (should be skipped)
        let node_modules = temp.path().join("node_modules").join("some-package");
        std::fs::create_dir_all(&node_modules).unwrap();
        std::fs::write(node_modules.join("index.js"), "// package").unwrap();

        // Create actual code
        std::fs::write(temp.path().join("app.js"), "// app").unwrap();

        let discovery = RootDiscovery::with_defaults();
        let roots = discovery.discover(temp.path()).unwrap();

        assert_eq!(roots.len(), 1);
        assert_eq!(roots[0].relative_path, ".");
    }

    // `RootType::is_git` is true only for the git repository variant,
    // regardless of which metadata fields are populated.
    #[test]
    fn test_root_type_is_git() {
        let git_type = RootType::GitRepository {
            remote: Some("origin".to_string()),
            branch: Some("main".to_string()),
            commit: None,
        };
        assert!(git_type.is_git());

        let code_type = RootType::CodeDirectory;
        assert!(!code_type.is_git());
    }

    // The builder-style setter overrides the configured search depth.
    #[test]
    fn test_with_max_depth() {
        let discovery = RootDiscovery::with_defaults().with_max_depth(5);
        assert_eq!(discovery.config.max_depth, 5);
    }

    // An empty workspace yields `NoRootsFound` rather than an empty list.
    #[test]
    fn test_no_roots_found_error() {
        let temp = TempDir::new().unwrap();

        // Empty directory with no source files
        let discovery = RootDiscovery::with_defaults();
        let result = discovery.discover(temp.path());

        assert!(matches!(result, Err(DiscoveryError::NoRootsFound(_))));
    }
}
585}