Skip to main content

cgx_engine/
walker.rs

1use std::path::{Path, PathBuf};
2
3use ignore::WalkBuilder;
4
/// Programming language assigned to a file, derived from its extension by `detect_language`.
#[derive(Debug, Clone, PartialEq, Eq, Hash)]
pub enum Language {
    /// Extensions `ts` and `tsx`.
    TypeScript,
    /// Extensions `js`, `jsx`, `mjs` and `cjs`.
    JavaScript,
    /// Extension `py`.
    Python,
    /// Extension `rs`.
    Rust,
    /// Extension `go`.
    Go,
    /// Extension `java`.
    Java,
    /// Extension `cs`.
    CSharp,
    /// Extension `php`.
    Php,
    /// Any other extension; `walk_repo` drops files classified this way.
    Unknown,
}
17
18#[derive(Debug, Clone)]
19pub struct SourceFile {
20    pub path: PathBuf,
21    pub relative_path: String,
22    pub language: Language,
23    pub content: String,
24    pub size_bytes: u64,
25}
26
/// Exact directory names that are pruned from the walk: dependency caches, build output,
/// VCS/tooling state and generated code.
const SKIP_DIRS: &[&str] = &[
    "node_modules",
    "target",
    "dist",
    "__pycache__",
    ".git",
    ".next",
    "out",
    "coverage",
    "vendor",
    "venv",
    ".venv",
    ".tox",
    "build",
    "generated",
];

/// Directory-name suffixes (a `-` or `_` followed by `dist`, `build` or `out`) that also mark a
/// directory for pruning, e.g. `web-dist` or `server_build`.
const SKIP_DIR_SUFFIXES: &[&str] = &["-dist", "_dist", "-build", "_build", "-out", "_out"];

/// Returns `true` when a directory called `name` must be excluded from the walk.
///
/// `name` is a single path component, not a full path. It is excluded when it equals an entry
/// of [`SKIP_DIRS`] or ends with an entry of [`SKIP_DIR_SUFFIXES`].
fn should_skip_dir(name: &str) -> bool {
    let is_exact_match = SKIP_DIRS.iter().any(|dir| *dir == name);
    is_exact_match
        || SKIP_DIR_SUFFIXES
            .iter()
            .any(|suffix| name.ends_with(*suffix))
}
56
57pub fn walk_repo(repo_path: &Path) -> anyhow::Result<Vec<SourceFile>> {
58    let mut files = Vec::new();
59    let canonical = repo_path.canonicalize()?;
60
61    let mut walker = WalkBuilder::new(&canonical);
62    walker.standard_filters(true);
63    walker.hidden(true);
64    // Respect .cgxignore files anywhere in the tree (same semantics as .gitignore)
65    walker.add_custom_ignore_filename(".cgxignore");
66    // Programmatic directory filter: prune entire build-artifact trees early
67    walker.filter_entry(|e| {
68        if e.file_type().map(|ft| ft.is_dir()).unwrap_or(false) {
69            let name = e.file_name().to_string_lossy();
70            // Also skip minified single-file bundles inside any directory
71            !should_skip_dir(&name)
72        } else {
73            // Skip minified / bundled JS files by name convention
74            let name = e.file_name().to_string_lossy();
75            !name.ends_with(".min.js")
76                && !name.ends_with(".min.ts")
77                && !name.ends_with(".bundle.js")
78                && !name.ends_with(".chunk.js")
79        }
80    });
81
82    for entry in walker.build() {
83        let entry = match entry {
84            Ok(e) => e,
85            Err(_) => continue,
86        };
87
88        if !entry.file_type().map(|ft| ft.is_file()).unwrap_or(false) {
89            continue;
90        }
91
92        let path = entry.path().to_path_buf();
93
94        let metadata = match std::fs::metadata(&path) {
95            Ok(m) => m,
96            Err(_) => continue,
97        };
98        let size_bytes = metadata.len();
99
100        if size_bytes > 2 * 1024 * 1024 {
101            continue;
102        }
103
104        let Some(ext) = path.extension().and_then(|e| e.to_str()) else {
105            continue;
106        };
107        let language = detect_language(ext);
108
109        if matches!(language, Language::Unknown) {
110            continue;
111        }
112
113        if is_binary(&path)? {
114            continue;
115        }
116
117        let content = match std::fs::read_to_string(&path) {
118            Ok(c) => c,
119            Err(_) => continue,
120        };
121
122        let relative_path = match path.strip_prefix(&canonical) {
123            Ok(r) => r.to_string_lossy().to_string(),
124            Err(_) => path.to_string_lossy().to_string(),
125        };
126
127        // Belt-and-suspenders: reject files inside any excluded directory component
128        if relative_path
129            .split('/')
130            .any(|component| should_skip_dir(component))
131        {
132            continue;
133        }
134
135        files.push(SourceFile {
136            path,
137            relative_path,
138            language,
139            content,
140            size_bytes,
141        });
142    }
143
144    Ok(files)
145}
146
147fn detect_language(ext: &str) -> Language {
148    match ext {
149        "ts" | "tsx" => Language::TypeScript,
150        "js" | "jsx" | "mjs" | "cjs" => Language::JavaScript,
151        "py" => Language::Python,
152        "rs" => Language::Rust,
153        "go" => Language::Go,
154        "java" => Language::Java,
155        "cs" => Language::CSharp,
156        "php" => Language::Php,
157        _ => Language::Unknown,
158    }
159}
160
161fn is_binary(path: &Path) -> anyhow::Result<bool> {
162    let mut file = std::fs::File::open(path)?;
163    let mut buf = vec![0u8; 8192];
164    let n = std::io::Read::read(&mut file, &mut buf).unwrap_or(0);
165    Ok(buf[..n].contains(&0))
166}