Skip to main content

cgx_engine/
walker.rs

1use std::path::{Path, PathBuf};
2
3use ignore::WalkBuilder;
4
/// Source language detected from file extension.
///
/// This is a fieldless enum, so it derives `Copy` in addition to `Clone`:
/// values can be passed and compared without explicit `.clone()` calls.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
pub enum Language {
    /// TypeScript sources (`.ts`, `.tsx`).
    TypeScript,
    /// JavaScript sources (`.js`, `.jsx`, `.mjs`, `.cjs`).
    JavaScript,
    /// Python sources (`.py`).
    Python,
    /// Rust sources (`.rs`).
    Rust,
    /// Go sources (`.go`).
    Go,
    /// Java sources (`.java`).
    Java,
    /// C# sources (`.cs`).
    CSharp,
    /// PHP sources (`.php`).
    Php,
    /// Extension not recognised — file is skipped by the parser.
    Unknown,
}
19
/// A source file that has been read from disk and is ready for parsing.
#[derive(Debug, Clone)]
pub struct SourceFile {
    /// Absolute path on disk.
    pub path: PathBuf,
    /// Path relative to the repo root, used as the stable identifier in the graph.
    pub relative_path: String,
    /// Language inferred from the file's extension.
    pub language: Language,
    /// Full text of the file, read as UTF-8.
    pub content: String,
    /// Size of the file in bytes, as reported by filesystem metadata.
    pub size_bytes: u64,
}
31
/// Exact directory names that hold dependencies, build output, or tooling
/// state rather than source code; a directory with one of these names is
/// pruned from the walk.
const SKIP_DIRS: &[&str] = &[
    "node_modules",
    "target",
    "dist",
    "__pycache__",
    ".git",
    ".next",
    "out",
    "coverage",
    "vendor",
    "venv",
    ".venv",
    ".tox",
    "build",
    "generated",
];

/// Name endings (a `-` or `_` separator followed by `dist`, `build`, or `out`)
/// that also mark a directory as skippable, e.g. `web-dist` or `cmake_build`.
const SKIP_DIR_SUFFIXES: &[&str] = &["-dist", "_dist", "-build", "_build", "-out", "_out"];

/// Returns `true` when `name` is listed in [`SKIP_DIRS`] or ends with one of
/// the [`SKIP_DIR_SUFFIXES`].
fn should_skip_dir(name: &str) -> bool {
    let listed_exactly = SKIP_DIRS.iter().any(|&dir| dir == name);
    listed_exactly
        || SKIP_DIR_SUFFIXES
            .iter()
            .any(|&suffix| name.ends_with(suffix))
}
59
60/// Walk a repository and return every parseable source file.
61///
62/// Respects `.gitignore` and `.cgxignore` rules, skips build-artifact
63/// directories (`target/`, `node_modules/`, `dist/`, …), binary files,
64/// minified bundles, and files larger than 2 MB.
65pub fn walk_repo(repo_path: &Path) -> anyhow::Result<Vec<SourceFile>> {
66    let mut files = Vec::new();
67    let canonical = repo_path.canonicalize()?;
68
69    let mut walker = WalkBuilder::new(&canonical);
70    walker.standard_filters(true);
71    walker.hidden(true);
72    // Respect .cgxignore files anywhere in the tree (same semantics as .gitignore)
73    walker.add_custom_ignore_filename(".cgxignore");
74    // Programmatic directory filter: prune entire build-artifact trees early
75    walker.filter_entry(|e| {
76        if e.file_type().map(|ft| ft.is_dir()).unwrap_or(false) {
77            let name = e.file_name().to_string_lossy();
78            // Also skip minified single-file bundles inside any directory
79            !should_skip_dir(&name)
80        } else {
81            // Skip minified / bundled JS files by name convention
82            let name = e.file_name().to_string_lossy();
83            !name.ends_with(".min.js")
84                && !name.ends_with(".min.ts")
85                && !name.ends_with(".bundle.js")
86                && !name.ends_with(".chunk.js")
87        }
88    });
89
90    for entry in walker.build() {
91        let entry = match entry {
92            Ok(e) => e,
93            Err(_) => continue,
94        };
95
96        if !entry.file_type().map(|ft| ft.is_file()).unwrap_or(false) {
97            continue;
98        }
99
100        let path = entry.path().to_path_buf();
101
102        let metadata = match std::fs::metadata(&path) {
103            Ok(m) => m,
104            Err(_) => continue,
105        };
106        let size_bytes = metadata.len();
107
108        if size_bytes > 2 * 1024 * 1024 {
109            continue;
110        }
111
112        let Some(ext) = path.extension().and_then(|e| e.to_str()) else {
113            continue;
114        };
115        let language = detect_language(ext);
116
117        if matches!(language, Language::Unknown) {
118            continue;
119        }
120
121        if is_binary(&path)? {
122            continue;
123        }
124
125        let content = match std::fs::read_to_string(&path) {
126            Ok(c) => c,
127            Err(_) => continue,
128        };
129
130        let relative_path = match path.strip_prefix(&canonical) {
131            Ok(r) => r.to_string_lossy().to_string(),
132            Err(_) => path.to_string_lossy().to_string(),
133        };
134
135        // Belt-and-suspenders: reject files inside any excluded directory component
136        if relative_path.split('/').any(should_skip_dir) {
137            continue;
138        }
139
140        files.push(SourceFile {
141            path,
142            relative_path,
143            language,
144            content,
145            size_bytes,
146        });
147    }
148
149    Ok(files)
150}
151
152fn detect_language(ext: &str) -> Language {
153    match ext {
154        "ts" | "tsx" => Language::TypeScript,
155        "js" | "jsx" | "mjs" | "cjs" => Language::JavaScript,
156        "py" => Language::Python,
157        "rs" => Language::Rust,
158        "go" => Language::Go,
159        "java" => Language::Java,
160        "cs" => Language::CSharp,
161        "php" => Language::Php,
162        _ => Language::Unknown,
163    }
164}
165
166fn is_binary(path: &Path) -> anyhow::Result<bool> {
167    let mut file = std::fs::File::open(path)?;
168    let mut buf = vec![0u8; 8192];
169    let n = std::io::Read::read(&mut file, &mut buf).unwrap_or(0);
170    Ok(buf[..n].contains(&0))
171}