1use std::path::{Path, PathBuf};
2
3use ignore::WalkBuilder;
4
/// Programming language of a source file, as inferred from its file extension.
///
/// This is a plain tag with no payload, so it is `Copy` and can be passed by
/// value, compared, and used as a hash-map key without cloning.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
pub enum Language {
    /// TypeScript sources (`.ts`, `.tsx`).
    TypeScript,
    /// JavaScript sources (`.js`, `.jsx`, `.mjs`, `.cjs`).
    JavaScript,
    /// Python sources (`.py`).
    Python,
    /// Rust sources (`.rs`).
    Rust,
    /// Go sources (`.go`).
    Go,
    /// Java sources (`.java`).
    Java,
    /// C# sources (`.cs`).
    CSharp,
    /// PHP sources (`.php`).
    Php,
    /// Any extension that is not recognised as one of the languages above.
    Unknown,
}
17
18#[derive(Debug, Clone)]
19pub struct SourceFile {
20 pub path: PathBuf,
21 pub relative_path: String,
22 pub language: Language,
23 pub content: String,
24 pub size_bytes: u64,
25}
26
27pub fn walk_repo(repo_path: &Path) -> anyhow::Result<Vec<SourceFile>> {
28 let mut files = Vec::new();
29 let canonical = repo_path.canonicalize()?;
30
31 let mut walker = WalkBuilder::new(&canonical);
32 walker.standard_filters(true);
33 walker.hidden(true);
34 let mut override_builder = ignore::overrides::OverrideBuilder::new(&canonical);
36 for pattern in &["!node_modules/", "!target/", "!dist/", "!__pycache__/", "!.git/"] {
37 let _ = override_builder.add(pattern);
38 }
39 let overrides = override_builder.build()?;
40 walker.overrides(overrides);
41
42 for entry in walker.build() {
43 let entry = match entry {
44 Ok(e) => e,
45 Err(_) => continue,
46 };
47
48 if !entry.file_type().map(|ft| ft.is_file()).unwrap_or(false) {
49 continue;
50 }
51
52 let path = entry.path().to_path_buf();
53
54 let metadata = match std::fs::metadata(&path) {
55 Ok(m) => m,
56 Err(_) => continue,
57 };
58 let size_bytes = metadata.len();
59
60 if size_bytes > 2 * 1024 * 1024 {
61 continue;
62 }
63
64 let Some(ext) = path.extension().and_then(|e| e.to_str()) else {
65 continue;
66 };
67 let language = detect_language(ext);
68
69 if matches!(language, Language::Unknown) {
70 continue;
71 }
72
73 if is_binary(&path)? {
74 continue;
75 }
76
77 let content = match std::fs::read_to_string(&path) {
78 Ok(c) => c,
79 Err(_) => continue,
80 };
81
82 let relative_path = match path.strip_prefix(&canonical) {
83 Ok(r) => r.to_string_lossy().to_string(),
84 Err(_) => path.to_string_lossy().to_string(),
85 };
86
87 files.push(SourceFile {
88 path,
89 relative_path,
90 language,
91 content,
92 size_bytes,
93 });
94 }
95
96 Ok(files)
97}
98
99fn detect_language(ext: &str) -> Language {
100 match ext {
101 "ts" | "tsx" => Language::TypeScript,
102 "js" | "jsx" | "mjs" | "cjs" => Language::JavaScript,
103 "py" => Language::Python,
104 "rs" => Language::Rust,
105 "go" => Language::Go,
106 "java" => Language::Java,
107 "cs" => Language::CSharp,
108 "php" => Language::Php,
109 _ => Language::Unknown,
110 }
111}
112
113fn is_binary(path: &Path) -> anyhow::Result<bool> {
114 let mut file = std::fs::File::open(path)?;
115 let mut buf = vec![0u8; 8192];
116 let n = std::io::Read::read(&mut file, &mut buf).unwrap_or(0);
117 Ok(buf[..n].contains(&0))
118}