Skip to main content

cgx_engine/
walker.rs

1use std::path::{Path, PathBuf};
2
3use ignore::WalkBuilder;
4
/// Programming language of a source file, as inferred from its extension.
///
/// The enum is field-less, so it is `Copy` and can be passed around by value
/// without explicit `.clone()` calls.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
pub enum Language {
    /// TypeScript sources (`ts`, `tsx`).
    TypeScript,
    /// JavaScript sources (`js`, `jsx`, `mjs`, `cjs`).
    JavaScript,
    /// Python sources (`py`).
    Python,
    /// Rust sources (`rs`).
    Rust,
    /// Go sources (`go`).
    Go,
    /// Java sources (`java`).
    Java,
    /// C# sources (`cs`).
    CSharp,
    /// PHP sources (`php`).
    Php,
    /// Any extension that is not mapped to one of the languages above.
    Unknown,
}
17
18#[derive(Debug, Clone)]
19pub struct SourceFile {
20    pub path: PathBuf,
21    pub relative_path: String,
22    pub language: Language,
23    pub content: String,
24    pub size_bytes: u64,
25}
26
27pub fn walk_repo(repo_path: &Path) -> anyhow::Result<Vec<SourceFile>> {
28    let mut files = Vec::new();
29    let canonical = repo_path.canonicalize()?;
30
31    let mut walker = WalkBuilder::new(&canonical);
32    walker.standard_filters(true);
33    walker.hidden(true);
34    // Explicitly add overrides for common non-source directories
35    let mut override_builder = ignore::overrides::OverrideBuilder::new(&canonical);
36    for pattern in &["!node_modules/", "!target/", "!dist/", "!__pycache__/", "!.git/"] {
37        let _ = override_builder.add(pattern);
38    }
39    let overrides = override_builder.build()?;
40    walker.overrides(overrides);
41
42    for entry in walker.build() {
43        let entry = match entry {
44            Ok(e) => e,
45            Err(_) => continue,
46        };
47
48        if !entry.file_type().map(|ft| ft.is_file()).unwrap_or(false) {
49            continue;
50        }
51
52        let path = entry.path().to_path_buf();
53
54        let metadata = match std::fs::metadata(&path) {
55            Ok(m) => m,
56            Err(_) => continue,
57        };
58        let size_bytes = metadata.len();
59
60        if size_bytes > 2 * 1024 * 1024 {
61            continue;
62        }
63
64        let Some(ext) = path.extension().and_then(|e| e.to_str()) else {
65            continue;
66        };
67        let language = detect_language(ext);
68
69        if matches!(language, Language::Unknown) {
70            continue;
71        }
72
73        if is_binary(&path)? {
74            continue;
75        }
76
77        let content = match std::fs::read_to_string(&path) {
78            Ok(c) => c,
79            Err(_) => continue,
80        };
81
82        let relative_path = match path.strip_prefix(&canonical) {
83            Ok(r) => r.to_string_lossy().to_string(),
84            Err(_) => path.to_string_lossy().to_string(),
85        };
86
87        files.push(SourceFile {
88            path,
89            relative_path,
90            language,
91            content,
92            size_bytes,
93        });
94    }
95
96    Ok(files)
97}
98
99fn detect_language(ext: &str) -> Language {
100    match ext {
101        "ts" | "tsx" => Language::TypeScript,
102        "js" | "jsx" | "mjs" | "cjs" => Language::JavaScript,
103        "py" => Language::Python,
104        "rs" => Language::Rust,
105        "go" => Language::Go,
106        "java" => Language::Java,
107        "cs" => Language::CSharp,
108        "php" => Language::Php,
109        _ => Language::Unknown,
110    }
111}
112
113fn is_binary(path: &Path) -> anyhow::Result<bool> {
114    let mut file = std::fs::File::open(path)?;
115    let mut buf = vec![0u8; 8192];
116    let n = std::io::Read::read(&mut file, &mut buf).unwrap_or(0);
117    Ok(buf[..n].contains(&0))
118}