Skip to main content

cgx_engine/
walker.rs

1use std::path::{Path, PathBuf};
2
3use ignore::WalkBuilder;
4
/// Programming language assigned to a file, derived from its extension.
///
/// The enum carries no data, so it is `Copy`: values can be passed around and
/// stored without explicit `.clone()` calls (which still work).
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
pub enum Language {
    /// `.ts` and `.tsx` files.
    TypeScript,
    /// `.js`, `.jsx`, `.mjs` and `.cjs` files.
    JavaScript,
    /// `.py` files.
    Python,
    /// `.rs` files.
    Rust,
    /// `.go` files.
    Go,
    /// `.java` files.
    Java,
    /// `.cs` files.
    CSharp,
    /// `.php` files.
    Php,
    /// Any extension that is not recognised.
    Unknown,
}
17
18#[derive(Debug, Clone)]
19pub struct SourceFile {
20    pub path: PathBuf,
21    pub relative_path: String,
22    pub language: Language,
23    pub content: String,
24    pub size_bytes: u64,
25}
26
/// Exact directory names that are pruned from the walk because they are
/// never treated as source code. Matching is on the whole name, so order is
/// irrelevant; entries are kept alphabetical for easy scanning.
const SKIP_DIRS: &[&str] = &[
    ".git",
    ".next",
    ".tox",
    ".venv",
    "__pycache__",
    "build",
    "coverage",
    "dist",
    "generated",
    "node_modules",
    "out",
    "target",
    "vendor",
    "venv",
];

/// Name endings that also cause a directory to be pruned: `build`, `dist` or
/// `out` preceded by a `-` or `_` separator (for example `my-app-dist`).
const SKIP_DIR_SUFFIXES: &[&str] = &["-build", "_build", "-dist", "_dist", "-out", "_out"];
47
48fn should_skip_dir(name: &str) -> bool {
49    if SKIP_DIRS.contains(&name) {
50        return true;
51    }
52    SKIP_DIR_SUFFIXES.iter().any(|suf| name.ends_with(suf))
53}
54
55pub fn walk_repo(repo_path: &Path) -> anyhow::Result<Vec<SourceFile>> {
56    let mut files = Vec::new();
57    let canonical = repo_path.canonicalize()?;
58
59    let mut walker = WalkBuilder::new(&canonical);
60    walker.standard_filters(true);
61    walker.hidden(true);
62    // Respect .cgxignore files anywhere in the tree (same semantics as .gitignore)
63    walker.add_custom_ignore_filename(".cgxignore");
64    // Programmatic directory filter: prune entire build-artifact trees early
65    walker.filter_entry(|e| {
66        if e.file_type().map(|ft| ft.is_dir()).unwrap_or(false) {
67            let name = e.file_name().to_string_lossy();
68            // Also skip minified single-file bundles inside any directory
69            !should_skip_dir(&name)
70        } else {
71            // Skip minified / bundled JS files by name convention
72            let name = e.file_name().to_string_lossy();
73            !name.ends_with(".min.js")
74                && !name.ends_with(".min.ts")
75                && !name.ends_with(".bundle.js")
76                && !name.ends_with(".chunk.js")
77        }
78    });
79
80    for entry in walker.build() {
81        let entry = match entry {
82            Ok(e) => e,
83            Err(_) => continue,
84        };
85
86        if !entry.file_type().map(|ft| ft.is_file()).unwrap_or(false) {
87            continue;
88        }
89
90        let path = entry.path().to_path_buf();
91
92        let metadata = match std::fs::metadata(&path) {
93            Ok(m) => m,
94            Err(_) => continue,
95        };
96        let size_bytes = metadata.len();
97
98        if size_bytes > 2 * 1024 * 1024 {
99            continue;
100        }
101
102        let Some(ext) = path.extension().and_then(|e| e.to_str()) else {
103            continue;
104        };
105        let language = detect_language(ext);
106
107        if matches!(language, Language::Unknown) {
108            continue;
109        }
110
111        if is_binary(&path)? {
112            continue;
113        }
114
115        let content = match std::fs::read_to_string(&path) {
116            Ok(c) => c,
117            Err(_) => continue,
118        };
119
120        let relative_path = match path.strip_prefix(&canonical) {
121            Ok(r) => r.to_string_lossy().to_string(),
122            Err(_) => path.to_string_lossy().to_string(),
123        };
124
125        // Belt-and-suspenders: reject files inside any excluded directory component
126        if relative_path.split('/').any(should_skip_dir) {
127            continue;
128        }
129
130        files.push(SourceFile {
131            path,
132            relative_path,
133            language,
134            content,
135            size_bytes,
136        });
137    }
138
139    Ok(files)
140}
141
142fn detect_language(ext: &str) -> Language {
143    match ext {
144        "ts" | "tsx" => Language::TypeScript,
145        "js" | "jsx" | "mjs" | "cjs" => Language::JavaScript,
146        "py" => Language::Python,
147        "rs" => Language::Rust,
148        "go" => Language::Go,
149        "java" => Language::Java,
150        "cs" => Language::CSharp,
151        "php" => Language::Php,
152        _ => Language::Unknown,
153    }
154}
155
156fn is_binary(path: &Path) -> anyhow::Result<bool> {
157    let mut file = std::fs::File::open(path)?;
158    let mut buf = vec![0u8; 8192];
159    let n = std::io::Read::read(&mut file, &mut buf).unwrap_or(0);
160    Ok(buf[..n].contains(&0))
161}