1use std::path::{Path, PathBuf};
2
3use ignore::WalkBuilder;
4
/// Programming language assigned to a source file.
///
/// The value is a plain tag with no payload, so it is `Copy` and can be passed
/// around by value without cloning.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
pub enum Language {
    /// TypeScript sources.
    TypeScript,
    /// JavaScript sources.
    JavaScript,
    /// Python sources.
    Python,
    /// Rust sources.
    Rust,
    /// Go sources.
    Go,
    /// Java sources.
    Java,
    /// C# sources.
    CSharp,
    /// PHP sources.
    Php,
    /// The file extension did not map to any supported language.
    Unknown,
}
17
18#[derive(Debug, Clone)]
19pub struct SourceFile {
20 pub path: PathBuf,
21 pub relative_path: String,
22 pub language: Language,
23 pub content: String,
24 pub size_bytes: u64,
25}
26
/// Exact directory names that `should_skip_dir` rejects.
const SKIP_DIRS: &[&str] = &[
    "node_modules",
    "target",
    "dist",
    "__pycache__",
    ".git",
    ".next",
    "out",
    "coverage",
    "vendor",
    "venv",
    ".venv",
    ".tox",
    "build",
    "generated",
];

/// Directory-name endings that `should_skip_dir` rejects (for example `web-dist`).
const SKIP_DIR_SUFFIXES: &[&str] = &["-dist", "_dist", "-build", "_build", "-out", "_out"];

/// Returns `true` when a directory called `name` should be excluded.
///
/// A name is excluded when it equals an entry of `SKIP_DIRS` or ends with an entry of
/// `SKIP_DIR_SUFFIXES`. Both comparisons are case-sensitive.
fn should_skip_dir(name: &str) -> bool {
    let is_listed = SKIP_DIRS.iter().any(|&dir| dir == name);
    is_listed || SKIP_DIR_SUFFIXES.iter().any(|&suffix| name.ends_with(suffix))
}
54
55pub fn walk_repo(repo_path: &Path) -> anyhow::Result<Vec<SourceFile>> {
56 let mut files = Vec::new();
57 let canonical = repo_path.canonicalize()?;
58
59 let mut walker = WalkBuilder::new(&canonical);
60 walker.standard_filters(true);
61 walker.hidden(true);
62 walker.add_custom_ignore_filename(".cgxignore");
64 walker.filter_entry(|e| {
66 if e.file_type().map(|ft| ft.is_dir()).unwrap_or(false) {
67 let name = e.file_name().to_string_lossy();
68 !should_skip_dir(&name)
70 } else {
71 let name = e.file_name().to_string_lossy();
73 !name.ends_with(".min.js")
74 && !name.ends_with(".min.ts")
75 && !name.ends_with(".bundle.js")
76 && !name.ends_with(".chunk.js")
77 }
78 });
79
80 for entry in walker.build() {
81 let entry = match entry {
82 Ok(e) => e,
83 Err(_) => continue,
84 };
85
86 if !entry.file_type().map(|ft| ft.is_file()).unwrap_or(false) {
87 continue;
88 }
89
90 let path = entry.path().to_path_buf();
91
92 let metadata = match std::fs::metadata(&path) {
93 Ok(m) => m,
94 Err(_) => continue,
95 };
96 let size_bytes = metadata.len();
97
98 if size_bytes > 2 * 1024 * 1024 {
99 continue;
100 }
101
102 let Some(ext) = path.extension().and_then(|e| e.to_str()) else {
103 continue;
104 };
105 let language = detect_language(ext);
106
107 if matches!(language, Language::Unknown) {
108 continue;
109 }
110
111 if is_binary(&path)? {
112 continue;
113 }
114
115 let content = match std::fs::read_to_string(&path) {
116 Ok(c) => c,
117 Err(_) => continue,
118 };
119
120 let relative_path = match path.strip_prefix(&canonical) {
121 Ok(r) => r.to_string_lossy().to_string(),
122 Err(_) => path.to_string_lossy().to_string(),
123 };
124
125 if relative_path.split('/').any(should_skip_dir) {
127 continue;
128 }
129
130 files.push(SourceFile {
131 path,
132 relative_path,
133 language,
134 content,
135 size_bytes,
136 });
137 }
138
139 Ok(files)
140}
141
142fn detect_language(ext: &str) -> Language {
143 match ext {
144 "ts" | "tsx" => Language::TypeScript,
145 "js" | "jsx" | "mjs" | "cjs" => Language::JavaScript,
146 "py" => Language::Python,
147 "rs" => Language::Rust,
148 "go" => Language::Go,
149 "java" => Language::Java,
150 "cs" => Language::CSharp,
151 "php" => Language::Php,
152 _ => Language::Unknown,
153 }
154}
155
156fn is_binary(path: &Path) -> anyhow::Result<bool> {
157 let mut file = std::fs::File::open(path)?;
158 let mut buf = vec![0u8; 8192];
159 let n = std::io::Read::read(&mut file, &mut buf).unwrap_or(0);
160 Ok(buf[..n].contains(&0))
161}