1use std::collections::HashMap;
14use std::path::Path;
15
/// Coarse semantic role of a file or directory within a project.
///
/// Values are produced by `SemanticAnalyzer::categorize` and used as the
/// grouping key of `group_by_semantics`.
#[derive(Debug, Clone, PartialEq, Eq, Hash)]
pub enum SemanticCategory {
    /// Prose and project documentation.
    Documentation,
    /// Program source files.
    SourceCode,
    /// Test code and test directories.
    Tests,
    /// Configuration and settings files.
    Configuration,
    /// Build manifests and build scripts.
    BuildSystem,
    /// Third-party code, lock files and dependency/output directories.
    Dependencies,
    /// Images, audio, video, fonts and stylesheets.
    Assets,
    /// Data sets and database files.
    Data,
    /// Shell scripts and helper tooling.
    Scripts,
    /// Compiled, bundled or otherwise machine-generated artifacts.
    Generated,

    /// A marker file that sits at the top level of a project.
    ProjectRoot,
    /// Development tooling. No rule in this module currently produces it.
    Development,
    /// Deployment material. No rule in this module currently produces it.
    Deployment,

    /// Nothing matched.
    Unknown,
}

impl SemanticCategory {
    /// Returns the human-readable label for this category: an emoji followed
    /// by the category name (for example `"💻 Source Code"`).
    ///
    /// The label for [`SemanticCategory::Unknown`] is `"❓ Other"`.
    pub fn display_name(&self) -> &'static str {
        // These literals must be stored as UTF-8 emoji. They had previously
        // been corrupted into Thai-script mojibake by a wrong-codepage
        // round trip.
        match self {
            Self::Documentation => "📚 Documentation",
            Self::SourceCode => "💻 Source Code",
            Self::Tests => "🧪 Tests",
            Self::Configuration => "⚙️ Configuration",
            Self::BuildSystem => "🔨 Build System",
            Self::Dependencies => "📦 Dependencies",
            Self::Assets => "🎨 Assets",
            Self::Data => "💾 Data",
            Self::Scripts => "📜 Scripts",
            Self::Generated => "🤖 Generated",
            Self::ProjectRoot => "🌳 Project Root",
            Self::Development => "🛠️ Development",
            Self::Deployment => "🚀 Deployment",
            Self::Unknown => "❓ Other",
        }
    }

    /// Returns the fixed 32-bit signature associated with this category.
    ///
    /// Every variant has its own constant. `SemanticAnalyzer::similarity`
    /// scores two different categories by counting the bits in which their
    /// signatures differ.
    pub fn wave_signature(&self) -> u32 {
        match self {
            Self::Documentation => 0x1B8D4C7A,
            Self::SourceCode => 0x73A9E2F5,
            Self::Tests => 0x9F2E6B31,
            Self::Configuration => 0x2C7DB5A3,
            Self::BuildSystem => 0xE4739AC2,
            Self::Dependencies => 0x5BA3F18E,
            Self::Assets => 0xA7E2C94D,
            Self::Data => 0x3F91D6B8,
            Self::Scripts => 0x8C5A7E2F,
            Self::Generated => 0xD2B847A6,
            Self::ProjectRoot => 0x618033FF,
            Self::Development => 0xB4E9A5C7,
            Self::Deployment => 0x7F3DA928,
            Self::Unknown => 0x4B1D8A73,
        }
    }
}
83
84pub struct SemanticAnalyzer {
86 patterns: Vec<(SemanticCategory, Vec<&'static str>)>,
88}
89
90impl SemanticAnalyzer {
91 pub fn new() -> Self {
92 let patterns = vec![
94 (
96 SemanticCategory::Generated,
97 vec![
98 ".o",
99 ".a",
100 ".so",
101 ".dll",
102 ".dylib",
103 ".exe",
104 ".app",
105 ".class",
106 ".jar",
107 ".war",
108 ".pyc",
109 ".pyo",
110 ".pyd",
111 ".min.js",
112 ".min.css",
113 ".bundle.js",
114 ".chunk.js",
115 "generated",
116 "gen",
117 "auto",
118 "autogen",
119 ".g.dart",
120 ],
121 ),
122 (
124 SemanticCategory::Data,
125 vec![
126 ".csv", ".tsv", ".parquet", ".feather", ".arrow", ".db", ".sqlite", ".sql",
127 ".mdb", ".dbf", ".h5", ".hdf5", ".nc", ".zarr", ".npy", ".npz", "data",
128 "datasets", "corpus", "samples",
129 ],
130 ),
131 (
133 SemanticCategory::Assets,
134 vec![
135 ".png",
136 ".jpg",
137 ".jpeg",
138 ".gif",
139 ".svg",
140 ".ico",
141 ".webp",
142 ".mp3",
143 ".wav",
144 ".ogg",
145 ".mp4",
146 ".webm",
147 ".mov",
148 ".ttf",
149 ".otf",
150 ".woff",
151 ".woff2",
152 ".eot",
153 ".css",
154 ".scss",
155 ".sass",
156 ".less",
157 ".styl",
158 "assets",
159 "static",
160 "public",
161 "resources",
162 "media",
163 ],
164 ),
165 (
167 SemanticCategory::Scripts,
168 vec![
169 ".sh", ".bash", ".zsh", ".fish", ".ps1", ".bat", ".cmd", "scripts", "bin",
170 "tools", "utils", "hooks", "install", "setup", "deploy", "run", "start",
171 "stop",
172 ],
173 ),
174 (
176 SemanticCategory::Tests,
177 vec![
178 "test",
179 "tests",
180 "spec",
181 "specs",
182 "__tests__",
183 "_test",
184 "test_",
185 ".test.",
186 ".spec.",
187 "_spec.",
188 "integration",
189 "unit",
190 "e2e",
191 ],
192 ),
193 (
195 SemanticCategory::BuildSystem,
196 vec![
197 "Makefile",
198 "makefile",
199 "CMakeLists",
200 "build",
201 "BUILD",
202 "Cargo.toml",
203 "package.json",
204 "pom.xml",
205 "build.gradle",
206 "setup.py",
207 "setup.cfg",
208 "pyproject.toml",
209 "composer.json",
210 ".bazel",
211 "meson.build",
212 "SConstruct",
213 "Rakefile",
214 ],
215 ),
216 (
218 SemanticCategory::Configuration,
219 vec![
220 ".config",
221 ".conf",
222 ".cfg",
223 ".ini",
224 ".env",
225 ".properties",
226 ".json",
227 ".yaml",
228 ".yml",
229 ".toml",
230 ".xml",
231 "settings",
232 "config",
233 "configuration",
234 ".gitignore",
235 ".dockerignore",
236 ],
237 ),
238 (
240 SemanticCategory::Dependencies,
241 vec![
242 "node_modules",
243 "vendor",
244 "packages",
245 ".packages",
246 "target",
247 "venv",
248 ".venv",
249 "env",
250 ".env",
251 "virtualenv",
252 "__pycache__",
253 "dist",
254 "build",
255 ".gradle",
256 ".m2",
257 "Cargo.lock",
258 "package-lock.json",
259 "yarn.lock",
260 "poetry.lock",
261 "Gemfile.lock",
262 "requirements.txt",
263 ],
264 ),
265 (
267 SemanticCategory::Documentation,
268 vec![
269 "README",
270 "readme",
271 "LICENSE",
272 "CHANGELOG",
273 "AUTHORS",
274 "CONTRIBUTORS",
275 "INSTALL",
276 "GUIDE",
277 "TUTORIAL",
278 "DOCS",
279 "NOTES",
280 "TODO",
281 ".md",
282 ".rst",
283 ".txt",
284 ".adoc",
285 ".org",
286 ".tex",
287 ],
288 ),
289 (
291 SemanticCategory::SourceCode,
292 vec![
293 ".rs", ".py", ".js", ".ts", ".jsx", ".tsx", ".go", ".java", ".c", ".cpp", ".h",
294 ".hpp", ".cs", ".rb", ".php", ".swift", ".kt", ".scala", ".r", ".jl", ".ml",
295 ".hs", ".ex", ".exs", ".clj", ".dart", ".nim",
296 ],
297 ),
298 ];
299
300 Self { patterns }
301 }
302
303 pub fn categorize(&self, path: &Path) -> SemanticCategory {
305 let path_str = path.to_string_lossy().to_lowercase();
306 let file_name = path
307 .file_name()
308 .and_then(|n| n.to_str())
309 .unwrap_or("")
310 .to_lowercase();
311
312 if file_name == "cargo.toml"
314 || file_name == "package.json"
315 || file_name == "makefile"
316 || file_name == "cmakelists.txt"
317 || file_name == "build.gradle"
318 || file_name == "setup.py"
319 {
320 return SemanticCategory::BuildSystem;
321 }
322
323 if self.is_test_file(&path_str, &file_name) {
325 return SemanticCategory::Tests;
326 }
327
328 for (category, patterns) in &self.patterns {
330 for pattern in patterns {
331 if self.matches_pattern(&file_name, &path_str, pattern) {
332 return category.clone();
333 }
334 }
335 }
336
337 if (path.parent().is_none() || path.components().count() == 1)
339 && (file_name == "cargo.toml"
340 || file_name == "package.json"
341 || file_name == "setup.py"
342 || file_name == "go.mod")
343 {
344 return SemanticCategory::ProjectRoot;
345 }
346
347 SemanticCategory::Unknown
348 }
349
350 fn matches_pattern(&self, file_name: &str, path_str: &str, pattern: &str) -> bool {
352 if pattern.starts_with('.') && pattern.len() > 1 {
353 file_name.ends_with(pattern) || path_str.contains(&format!("{}/", pattern))
355 } else {
356 file_name.contains(pattern) || path_str.contains(pattern)
358 }
359 }
360
361 fn is_test_file(&self, path_str: &str, file_name: &str) -> bool {
363 for (category, patterns) in &self.patterns {
365 if *category == SemanticCategory::Tests {
366 return patterns
367 .iter()
368 .any(|pattern| self.matches_pattern(file_name, path_str, pattern));
369 }
370 }
371 false
372 }
373
374 pub fn similarity(&self, path1: &Path, path2: &Path) -> f32 {
377 let cat1 = self.categorize(path1);
378 let cat2 = self.categorize(path2);
379
380 if cat1 == cat2 {
381 let mut similarity = 0.8;
383
384 if path1.extension() == path2.extension() {
386 similarity += 0.1;
387 }
388
389 if path1.parent() == path2.parent() {
391 similarity += 0.1;
392 }
393
394 similarity
395 } else {
396 let wave1 = cat1.wave_signature();
398 let wave2 = cat2.wave_signature();
399
400 let interference = (wave1 ^ wave2).count_ones();
402 let max_bits = 32;
403
404 1.0 - (interference as f32 / max_bits as f32)
406 }
407 }
408}
409
410impl Default for SemanticAnalyzer {
411 fn default() -> Self {
412 Self::new()
413 }
414}
415
416pub fn group_by_semantics<'a>(files: &[&'a Path]) -> HashMap<SemanticCategory, Vec<&'a Path>> {
418 let analyzer = SemanticAnalyzer::new();
419 let mut groups: HashMap<SemanticCategory, Vec<&'a Path>> = HashMap::new();
420
421 for file in files {
422 let category = analyzer.categorize(file);
423 groups.entry(category).or_default().push(file);
424 }
425
426 groups
427}
428
#[cfg(test)]
mod tests {
    use super::*;
    use std::path::PathBuf;

    /// One representative file name per category lands where expected.
    #[test]
    fn test_categorization() {
        let analyzer = SemanticAnalyzer::new();
        let classify = |name: &str| analyzer.categorize(&PathBuf::from(name));

        let cases = vec![
            ("README.md", SemanticCategory::Documentation),
            ("main.rs", SemanticCategory::SourceCode),
            ("test_utils.rs", SemanticCategory::Tests),
            ("Cargo.toml", SemanticCategory::BuildSystem),
            ("config.yaml", SemanticCategory::Configuration),
            ("logo.png", SemanticCategory::Assets),
            ("data.csv", SemanticCategory::Data),
            ("install.sh", SemanticCategory::Scripts),
            ("main.o", SemanticCategory::Generated),
        ];

        for (name, expected) in cases {
            assert_eq!(classify(name), expected);
        }
    }

    /// The three most common categories have pairwise distinct signatures.
    #[test]
    fn test_wave_signatures() {
        let doc_wave = SemanticCategory::Documentation.wave_signature();
        let code_wave = SemanticCategory::SourceCode.wave_signature();
        let test_wave = SemanticCategory::Tests.wave_signature();

        for (left, right) in vec![
            (doc_wave, code_wave),
            (doc_wave, test_wave),
            (code_wave, test_wave),
        ] {
            assert_ne!(left, right);
        }
    }

    /// Same-category files score high; cross-category files score low.
    #[test]
    fn test_similarity() {
        let analyzer = SemanticAnalyzer::new();
        let score = |a: &str, b: &str| analyzer.similarity(&PathBuf::from(a), &PathBuf::from(b));

        let similarity = score("main.rs", "lib.rs");
        assert!(
            similarity > 0.7,
            "Expected similarity > 0.7, got {}",
            similarity
        );

        let similarity = score("main.rs", "README.md");
        assert!(
            similarity < 0.6,
            "Expected similarity < 0.6, got {}",
            similarity
        );
    }
}