1use std::path::{Path, PathBuf};
2
3use ignore::WalkBuilder;
4
/// Programming language of a source file, as inferred from its file extension
/// by `detect_language`.
#[derive(Debug, Clone, PartialEq, Eq, Hash)]
pub enum Language {
    /// Files with a `ts` or `tsx` extension.
    TypeScript,
    /// Files with a `js`, `jsx`, `mjs` or `cjs` extension.
    JavaScript,
    /// Files with a `py` extension.
    Python,
    /// Files with an `rs` extension.
    Rust,
    /// Files with a `go` extension.
    Go,
    /// Files with a `java` extension.
    Java,
    /// Files with a `cs` extension.
    CSharp,
    /// Files with a `php` extension.
    Php,
    /// Any extension `detect_language` does not recognise; `walk_repo` drops
    /// such files instead of returning them.
    Unknown,
}
19
/// A single source file collected by `walk_repo`, together with its contents.
#[derive(Debug, Clone)]
pub struct SourceFile {
    /// Path of the file as yielded by the walker, which is rooted at the
    /// canonicalised repository path.
    pub path: PathBuf,
    /// `path` with the canonical repository root stripped, converted lossily
    /// to a string. Falls back to the full path if the prefix cannot be stripped.
    pub relative_path: String,
    /// Language detected from the file extension; never `Language::Unknown`
    /// for files produced by `walk_repo`.
    pub language: Language,
    /// Entire file contents, read as UTF-8 text.
    pub content: String,
    /// File size in bytes as reported by the filesystem metadata.
    pub size_bytes: u64,
}
31
/// Directory names that are excluded from the walk.
///
/// Each entry is compared for exact equality against a single directory name
/// (see `should_skip_dir`); the list covers dependency, build-output, cache
/// and VCS directories.
const SKIP_DIRS: &[&str] = &[
    "node_modules",
    "target",
    "dist",
    "__pycache__",
    ".git",
    ".next",
    "out",
    "coverage",
    "vendor",
    "venv",
    ".venv",
    ".tox",
    "build",
    "generated",
];
49
/// Directory-name suffixes that mark a directory as excluded from the walk,
/// e.g. `web-dist` or `docs_build` (see `should_skip_dir`).
const SKIP_DIR_SUFFIXES: &[&str] = &["-dist", "_dist", "-build", "_build", "-out", "_out"];
52
53fn should_skip_dir(name: &str) -> bool {
54 if SKIP_DIRS.contains(&name) {
55 return true;
56 }
57 SKIP_DIR_SUFFIXES.iter().any(|suf| name.ends_with(suf))
58}
59
60pub fn walk_repo(repo_path: &Path) -> anyhow::Result<Vec<SourceFile>> {
66 let mut files = Vec::new();
67 let canonical = repo_path.canonicalize()?;
68
69 let mut walker = WalkBuilder::new(&canonical);
70 walker.standard_filters(true);
71 walker.hidden(true);
72 walker.add_custom_ignore_filename(".cgxignore");
74 walker.filter_entry(|e| {
76 if e.file_type().map(|ft| ft.is_dir()).unwrap_or(false) {
77 let name = e.file_name().to_string_lossy();
78 !should_skip_dir(&name)
80 } else {
81 let name = e.file_name().to_string_lossy();
83 !name.ends_with(".min.js")
84 && !name.ends_with(".min.ts")
85 && !name.ends_with(".bundle.js")
86 && !name.ends_with(".chunk.js")
87 }
88 });
89
90 for entry in walker.build() {
91 let entry = match entry {
92 Ok(e) => e,
93 Err(_) => continue,
94 };
95
96 if !entry.file_type().map(|ft| ft.is_file()).unwrap_or(false) {
97 continue;
98 }
99
100 let path = entry.path().to_path_buf();
101
102 let metadata = match std::fs::metadata(&path) {
103 Ok(m) => m,
104 Err(_) => continue,
105 };
106 let size_bytes = metadata.len();
107
108 if size_bytes > 2 * 1024 * 1024 {
109 continue;
110 }
111
112 let Some(ext) = path.extension().and_then(|e| e.to_str()) else {
113 continue;
114 };
115 let language = detect_language(ext);
116
117 if matches!(language, Language::Unknown) {
118 continue;
119 }
120
121 if is_binary(&path)? {
122 continue;
123 }
124
125 let content = match std::fs::read_to_string(&path) {
126 Ok(c) => c,
127 Err(_) => continue,
128 };
129
130 let relative_path = match path.strip_prefix(&canonical) {
131 Ok(r) => r.to_string_lossy().to_string(),
132 Err(_) => path.to_string_lossy().to_string(),
133 };
134
135 if relative_path.split('/').any(should_skip_dir) {
137 continue;
138 }
139
140 files.push(SourceFile {
141 path,
142 relative_path,
143 language,
144 content,
145 size_bytes,
146 });
147 }
148
149 Ok(files)
150}
151
152fn detect_language(ext: &str) -> Language {
153 match ext {
154 "ts" | "tsx" => Language::TypeScript,
155 "js" | "jsx" | "mjs" | "cjs" => Language::JavaScript,
156 "py" => Language::Python,
157 "rs" => Language::Rust,
158 "go" => Language::Go,
159 "java" => Language::Java,
160 "cs" => Language::CSharp,
161 "php" => Language::Php,
162 _ => Language::Unknown,
163 }
164}
165
166fn is_binary(path: &Path) -> anyhow::Result<bool> {
167 let mut file = std::fs::File::open(path)?;
168 let mut buf = vec![0u8; 8192];
169 let n = std::io::Read::read(&mut file, &mut buf).unwrap_or(0);
170 Ok(buf[..n].contains(&0))
171}