Skip to main content

sem_core/utils/
scan.rs

/// File names that are excluded from repo-wide scans by default.
///
/// `is_default_excluded` compares each entry against the final path component
/// with an ASCII case-insensitive match, so mixed-case entries such as
/// `Cargo.lock` are fine here. The current entries are dependency lock and
/// checksum files.
const DEFAULT_EXCLUDED_FILES: &[&str] = &[
    "Cargo.lock",
    "package-lock.json",
    "yarn.lock",
    "pnpm-lock.yaml",
    "Gemfile.lock",
    "Pipfile.lock",
    "poetry.lock",
    "composer.lock",
    "go.sum",
    "flake.lock",
];
14
/// Directory names that are excluded wherever they appear in repo-wide scans.
///
/// `is_default_excluded` looks for an exact match between these entries and
/// every `/`-separated component of the lowercased path, so entries must be
/// lowercase. The final component is tested too, which means a file that
/// carries one of these names is excluded as well.
const DEFAULT_EXCLUDED_ANY_DIRS: &[&str] = &[
    "fixtures",
    "fixture",
    "benchmarks",
    "vendor",
    "node_modules",
    "test-harness",
    ".next",
    ".turbo",
    ".cache",
    "coverage",
];
28
/// Top-level output directories excluded by default.
///
/// Only the first component of the lowercased path is compared against this
/// list, so nested directories with the same name (for example `src/build/`
/// or `tools/dist/`) are still scanned. Entries must be lowercase.
const DEFAULT_EXCLUDED_ROOT_DIRS: &[&str] = &["out", "dist", "build", "target"];
31
/// File suffixes for generated text assets that do not produce useful entities.
///
/// `is_default_excluded` applies these with `ends_with` to the lowercased
/// path, so entries must be lowercase.
const DEFAULT_EXCLUDED_SUFFIXES: &[&str] = &[".min.js", ".min.css"];
34
/// File suffixes that are not useful source text for semantic extraction.
///
/// `is_probably_binary_path` applies these with `ends_with` to the lowercased
/// path, so entries must be lowercase and include the leading dot. The
/// grouping comments below are only a reading aid; order has no effect on
/// matching.
const BINARY_FILE_SUFFIXES: &[&str] = &[
    // Images and icons.
    ".png",
    ".jpg",
    ".jpeg",
    ".gif",
    ".webp",
    ".ico",
    ".tiff",
    ".tif",
    ".bmp",
    ".heic",
    ".heif",
    ".avif",
    // Fonts.
    ".woff",
    ".woff2",
    ".ttf",
    ".otf",
    ".eot",
    // Audio and video.
    ".mp3",
    ".mp4",
    ".mov",
    ".avi",
    ".webm",
    ".ogg",
    ".wav",
    ".flac",
    ".m4a",
    ".m4v",
    ".mkv",
    // Archives and compressed files.
    ".zip",
    ".tar",
    ".tar.gz",
    ".tgz",
    ".bz2",
    ".gz",
    ".xz",
    ".7z",
    ".rar",
    // Native libraries and object files.
    ".so",
    ".dylib",
    ".dll",
    ".a",
    ".lib",
    ".o",
    ".obj",
    // Compiled bytecode, debug data, executables and app packages.
    ".class",
    ".jar",
    ".pyc",
    ".pyo",
    ".pdb",
    ".exe",
    ".app",
    ".apk",
    ".ipa",
    ".aar",
    // Swift compiler outputs and WebAssembly modules.
    ".swiftmodule",
    ".swiftdoc",
    ".swiftsourceinfo",
    ".wasm",
    // Other packaged assets and documents.
    ".car",
    ".icns",
    ".riv",
    ".pdf",
    ".nib",
    ".storyboardc",
    // Database files and profiling data.
    ".db",
    ".sqlite",
    ".sqlite3",
    ".realm",
    ".profdata",
];
107
/// Directory/package suffixes that contain compiled assets rather than source.
///
/// `is_probably_binary_path` tests every `/`-separated component of the
/// lowercased path with `ends_with`, so anything located beneath a matching
/// directory is reported as binary. Entries must be lowercase.
const BINARY_DIR_SUFFIXES: &[&str] = &[".framework", ".xcframework", ".dsym", ".app"];
110
111pub fn is_default_excluded(rel_path: &str) -> bool {
112    let normalized = rel_path.replace('\\', "/");
113    let lower = normalized.to_ascii_lowercase();
114
115    if let Some(file_name) = lower.rsplit('/').next() {
116        if DEFAULT_EXCLUDED_FILES
117            .iter()
118            .any(|excluded| excluded.eq_ignore_ascii_case(file_name))
119        {
120            return true;
121        }
122    }
123
124    if DEFAULT_EXCLUDED_SUFFIXES
125        .iter()
126        .any(|suffix| lower.ends_with(suffix))
127    {
128        return true;
129    }
130
131    let components: Vec<&str> = lower.split('/').collect();
132    if components
133        .iter()
134        .any(|component| DEFAULT_EXCLUDED_ANY_DIRS.contains(component))
135    {
136        return true;
137    }
138
139    if components
140        .first()
141        .is_some_and(|component| DEFAULT_EXCLUDED_ROOT_DIRS.contains(component))
142    {
143        return true;
144    }
145
146    components
147        .windows(3)
148        .any(|window| window == ["_next", "static", "chunks"])
149}
150
151pub fn is_probably_binary_path(rel_path: &str) -> bool {
152    let normalized = rel_path.replace('\\', "/");
153    let lower = normalized.to_ascii_lowercase();
154
155    if BINARY_FILE_SUFFIXES
156        .iter()
157        .any(|suffix| lower.ends_with(suffix))
158    {
159        return true;
160    }
161
162    lower.split('/').any(|component| {
163        BINARY_DIR_SUFFIXES
164            .iter()
165            .any(|suffix| component.ends_with(suffix))
166    })
167}
168
#[cfg(test)]
mod tests {
    use super::*;

    /// Generated, minified and top-level build output is excluded, while
    /// same-named directories nested deeper in the tree are kept.
    #[test]
    fn default_excludes_generated_and_build_paths() {
        let excluded = [
            "dist/app.js",
            "site/out/_next/static/chunks/app.js",
            "src/generated.min.js",
            "target/debug/build.rs",
        ];
        for path in excluded {
            assert!(is_default_excluded(path), "expected `{path}` to be excluded");
        }

        let kept = [
            "src/app.js",
            "src/build/mod.rs",
            "packages/compiler/build/index.ts",
            "tools/dist/analyzer.py",
        ];
        for path in kept {
            assert!(!is_default_excluded(path), "expected `{path}` to be kept");
        }
    }

    /// Binary detection covers file suffixes and bundle-style directories, and
    /// leaves text assets and source files alone.
    #[test]
    fn detects_binary_asset_paths() {
        let binary = [
            "Snapshots/icon.png",
            "Frameworks/Foo.framework/Foo",
            "Modules/Foo.swiftmodule",
        ];
        for path in binary {
            assert!(is_probably_binary_path(path), "expected `{path}` to be binary");
        }

        let text = ["Assets/icon.svg", "src/main.rs"];
        for path in text {
            assert!(!is_probably_binary_path(path), "expected `{path}` to be text");
        }
    }
}