1use std::path::{Path, PathBuf};
2
3use ignore::WalkBuilder;
4
/// Programming language assigned to a source file.
///
/// `Unknown` is the fallback for anything that is not one of the
/// recognised languages.
#[derive(Debug, Clone, PartialEq, Eq, Hash)]
pub enum Language {
    /// TypeScript sources.
    TypeScript,
    /// JavaScript sources.
    JavaScript,
    /// Python sources.
    Python,
    /// Rust sources.
    Rust,
    /// Go sources.
    Go,
    /// Java sources.
    Java,
    /// C# sources.
    CSharp,
    /// PHP sources.
    Php,
    /// Anything that could not be classified.
    Unknown,
}
17
18#[derive(Debug, Clone)]
19pub struct SourceFile {
20 pub path: PathBuf,
21 pub relative_path: String,
22 pub language: Language,
23 pub content: String,
24 pub size_bytes: u64,
25}
26
27pub fn walk_repo(repo_path: &Path) -> anyhow::Result<Vec<SourceFile>> {
28 let mut files = Vec::new();
29 let canonical = repo_path.canonicalize()?;
30
31 let mut walker = WalkBuilder::new(&canonical);
32 walker.standard_filters(true);
33 walker.hidden(true);
34 let mut override_builder = ignore::overrides::OverrideBuilder::new(&canonical);
36 for pattern in &[
37 "!node_modules/",
38 "!target/",
39 "!dist/",
40 "!__pycache__/",
41 "!.git/",
42 ] {
43 let _ = override_builder.add(pattern);
44 }
45 let overrides = override_builder.build()?;
46 walker.overrides(overrides);
47
48 for entry in walker.build() {
49 let entry = match entry {
50 Ok(e) => e,
51 Err(_) => continue,
52 };
53
54 if !entry.file_type().map(|ft| ft.is_file()).unwrap_or(false) {
55 continue;
56 }
57
58 let path = entry.path().to_path_buf();
59
60 let metadata = match std::fs::metadata(&path) {
61 Ok(m) => m,
62 Err(_) => continue,
63 };
64 let size_bytes = metadata.len();
65
66 if size_bytes > 2 * 1024 * 1024 {
67 continue;
68 }
69
70 let Some(ext) = path.extension().and_then(|e| e.to_str()) else {
71 continue;
72 };
73 let language = detect_language(ext);
74
75 if matches!(language, Language::Unknown) {
76 continue;
77 }
78
79 if is_binary(&path)? {
80 continue;
81 }
82
83 let content = match std::fs::read_to_string(&path) {
84 Ok(c) => c,
85 Err(_) => continue,
86 };
87
88 let relative_path = match path.strip_prefix(&canonical) {
89 Ok(r) => r.to_string_lossy().to_string(),
90 Err(_) => path.to_string_lossy().to_string(),
91 };
92
93 files.push(SourceFile {
94 path,
95 relative_path,
96 language,
97 content,
98 size_bytes,
99 });
100 }
101
102 Ok(files)
103}
104
105fn detect_language(ext: &str) -> Language {
106 match ext {
107 "ts" | "tsx" => Language::TypeScript,
108 "js" | "jsx" | "mjs" | "cjs" => Language::JavaScript,
109 "py" => Language::Python,
110 "rs" => Language::Rust,
111 "go" => Language::Go,
112 "java" => Language::Java,
113 "cs" => Language::CSharp,
114 "php" => Language::Php,
115 _ => Language::Unknown,
116 }
117}
118
119fn is_binary(path: &Path) -> anyhow::Result<bool> {
120 let mut file = std::fs::File::open(path)?;
121 let mut buf = vec![0u8; 8192];
122 let n = std::io::Read::read(&mut file, &mut buf).unwrap_or(0);
123 Ok(buf[..n].contains(&0))
124}