1use std::path::{Path, PathBuf};
2
3use ignore::WalkBuilder;
4
/// Programming language of a source file, as inferred from its file extension.
///
/// The enum is fieldless, so it is `Copy`: values can be passed and stored by value
/// without explicit `.clone()` calls.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
pub enum Language {
    /// TypeScript sources.
    TypeScript,
    /// JavaScript sources.
    JavaScript,
    /// Python sources.
    Python,
    /// Rust sources.
    Rust,
    /// Go sources.
    Go,
    /// Java sources.
    Java,
    /// C# sources.
    CSharp,
    /// PHP sources.
    Php,
    /// Any extension that is not recognized.
    Unknown,
}
17
18#[derive(Debug, Clone)]
19pub struct SourceFile {
20 pub path: PathBuf,
21 pub relative_path: String,
22 pub language: Language,
23 pub content: String,
24 pub size_bytes: u64,
25}
26
/// Directory names that are skipped when the name matches exactly
/// (see `should_skip_dir`).
///
/// The entries are typically dependency trees, build output, caches, VCS metadata
/// and generated code.
const SKIP_DIRS: &[&str] = &[
    "node_modules",
    "target",
    "dist",
    "__pycache__",
    ".git",
    ".next",
    "out",
    "coverage",
    "vendor",
    "venv",
    ".venv",
    ".tox",
    "build",
    "generated",
];
44
/// Directory-name suffixes that cause a directory to be skipped
/// (see `should_skip_dir`), e.g. `web-dist` or `docs_build`.
const SKIP_DIR_SUFFIXES: &[&str] = &[
    "-dist",
    "_dist",
    "-build",
    "_build",
    "-out",
    "_out",
];
47
48fn should_skip_dir(name: &str) -> bool {
49 if SKIP_DIRS.contains(&name) {
50 return true;
51 }
52 SKIP_DIR_SUFFIXES
53 .iter()
54 .any(|suf| name.ends_with(suf))
55}
56
57pub fn walk_repo(repo_path: &Path) -> anyhow::Result<Vec<SourceFile>> {
58 let mut files = Vec::new();
59 let canonical = repo_path.canonicalize()?;
60
61 let mut walker = WalkBuilder::new(&canonical);
62 walker.standard_filters(true);
63 walker.hidden(true);
64 walker.add_custom_ignore_filename(".cgxignore");
66 walker.filter_entry(|e| {
68 if e.file_type().map(|ft| ft.is_dir()).unwrap_or(false) {
69 let name = e.file_name().to_string_lossy();
70 !should_skip_dir(&name)
72 } else {
73 let name = e.file_name().to_string_lossy();
75 !name.ends_with(".min.js")
76 && !name.ends_with(".min.ts")
77 && !name.ends_with(".bundle.js")
78 && !name.ends_with(".chunk.js")
79 }
80 });
81
82 for entry in walker.build() {
83 let entry = match entry {
84 Ok(e) => e,
85 Err(_) => continue,
86 };
87
88 if !entry.file_type().map(|ft| ft.is_file()).unwrap_or(false) {
89 continue;
90 }
91
92 let path = entry.path().to_path_buf();
93
94 let metadata = match std::fs::metadata(&path) {
95 Ok(m) => m,
96 Err(_) => continue,
97 };
98 let size_bytes = metadata.len();
99
100 if size_bytes > 2 * 1024 * 1024 {
101 continue;
102 }
103
104 let Some(ext) = path.extension().and_then(|e| e.to_str()) else {
105 continue;
106 };
107 let language = detect_language(ext);
108
109 if matches!(language, Language::Unknown) {
110 continue;
111 }
112
113 if is_binary(&path)? {
114 continue;
115 }
116
117 let content = match std::fs::read_to_string(&path) {
118 Ok(c) => c,
119 Err(_) => continue,
120 };
121
122 let relative_path = match path.strip_prefix(&canonical) {
123 Ok(r) => r.to_string_lossy().to_string(),
124 Err(_) => path.to_string_lossy().to_string(),
125 };
126
127 if relative_path
129 .split('/')
130 .any(|component| should_skip_dir(component))
131 {
132 continue;
133 }
134
135 files.push(SourceFile {
136 path,
137 relative_path,
138 language,
139 content,
140 size_bytes,
141 });
142 }
143
144 Ok(files)
145}
146
147fn detect_language(ext: &str) -> Language {
148 match ext {
149 "ts" | "tsx" => Language::TypeScript,
150 "js" | "jsx" | "mjs" | "cjs" => Language::JavaScript,
151 "py" => Language::Python,
152 "rs" => Language::Rust,
153 "go" => Language::Go,
154 "java" => Language::Java,
155 "cs" => Language::CSharp,
156 "php" => Language::Php,
157 _ => Language::Unknown,
158 }
159}
160
161fn is_binary(path: &Path) -> anyhow::Result<bool> {
162 let mut file = std::fs::File::open(path)?;
163 let mut buf = vec![0u8; 8192];
164 let n = std::io::Read::read(&mut file, &mut buf).unwrap_or(0);
165 Ok(buf[..n].contains(&0))
166}