Skip to main content

st/
semantic.rs

// -----------------------------------------------------------------------------
// WELCOME TO THE SEMANTIC WAVE FIELD! 🌊🧠
//
// Inspired by Omni's vision of treating files as waves in a semantic ocean,
// this module groups files by their conceptual similarity. It's like having
// a philosopher organizing your file cabinet!
//
// "Don't store what's already remembered" - Omni, 2024
//
// Brought to you by The Cheet, with wisdom from Omni's Hot Tub sessions! 🛁✨
// -----------------------------------------------------------------------------
12
13use std::collections::HashMap;
14use std::path::Path;
15
/// Semantic categories that files can belong to.
///
/// The enum is a plain, field-less tag: it is cheap to clone, can be compared
/// for equality, and can be used as a `HashMap` key.
#[derive(Debug, Clone, PartialEq, Eq, Hash)]
pub enum SemanticCategory {
    // Core categories
    /// Prose and reference material (README, LICENSE, `.md`, ...).
    Documentation,
    /// Program source files.
    SourceCode,
    /// Test files and test directories.
    Tests,
    /// Configuration files.
    Configuration,
    /// Build manifests and build scripts.
    BuildSystem,
    /// Dependency directories and lock files.
    Dependencies,
    /// Images, audio, video, fonts and stylesheets.
    Assets,
    /// Data files and data directories.
    Data,
    /// Shell scripts and script directories.
    Scripts,
    /// Compiled or machine-generated artifacts.
    Generated,

    // Meta categories
    /// Marker files found at the root of a project.
    ProjectRoot,
    /// Development tooling.
    Development,
    /// Deployment tooling.
    Deployment,

    // Catch-all
    /// Anything that matched no other category.
    Unknown,
}

impl SemanticCategory {
    /// Returns a human-friendly label for the category, prefixed with an emoji.
    ///
    /// The returned string is a `'static` literal, so no allocation happens.
    /// Note that `Unknown` is presented to users as "Other".
    pub fn display_name(&self) -> &'static str {
        match self {
            Self::Documentation => "📚 Documentation",
            Self::SourceCode => "💻 Source Code",
            Self::Tests => "🧪 Tests",
            Self::Configuration => "⚙️ Configuration",
            Self::BuildSystem => "🔨 Build System",
            Self::Dependencies => "📦 Dependencies",
            Self::Assets => "🎨 Assets",
            Self::Data => "💾 Data",
            Self::Scripts => "📜 Scripts",
            Self::Generated => "🤖 Generated",
            Self::ProjectRoot => "🌳 Project Root",
            Self::Development => "🛠️ Development",
            Self::Deployment => "🚀 Deployment",
            Self::Unknown => "❓ Other",
        }
    }

    /// Returns the fixed 32-bit "wave signature" of the category.
    ///
    /// Each category maps to its own distinct constant. The value is only
    /// meaningful relative to other signatures: `SemanticAnalyzer::similarity`
    /// XORs two signatures and counts the differing bits.
    pub fn wave_signature(&self) -> u32 {
        // Author's layout note: [torsion|amplitude|phase|frequency].
        // Every category gets a unique, non-repeating bit pattern.
        match self {
            Self::Documentation => 0x1B8D4C7A, // Golden ratio harmonics - docs flow like prose
            Self::SourceCode => 0x73A9E2F5,    // Complex interference - code creates reality
            Self::Tests => 0x9F2E6B31,         // Torsion knots - tests verify truth
            Self::Configuration => 0x2C7DB5A3, // MEM8 baseline - config drives consciousness
            Self::BuildSystem => 0xE4739AC2,   // Marine salience - builds like dolphin clicks
            Self::Dependencies => 0x5BA3F18E,  // Entangled states - deps are quantum linked
            Self::Assets => 0xA7E2C94D,        // Visual cortex patterns - assets are seen
            Self::Data => 0x3F91D6B8,          // Information entropy - data is potential
            Self::Scripts => 0x8C5A7E2F,       // Automation waves - scripts do work
            Self::Generated => 0xD2B847A6,     // Emergence patterns - generated from void
            Self::ProjectRoot => 0x618033FF,   // φ perfection - root is foundation
            Self::Development => 0xB4E9A5C7,   // Creative chaos - dev is exploration
            Self::Deployment => 0x7F3DA928,    // Crystallization - deploy solidifies
            Self::Unknown => 0x4B1D8A73,       // Mystery waves - unknown isn't empty!
        }
    }
}
83
84/// Analyzes files and determines their semantic category
85pub struct SemanticAnalyzer {
86    // Pattern matching for different file types in priority order
87    patterns: Vec<(SemanticCategory, Vec<&'static str>)>,
88}
89
90impl SemanticAnalyzer {
91    pub fn new() -> Self {
92        // Patterns in priority order - more specific categories first
93        let patterns = vec![
94            // Generated patterns - most specific, should be checked first
95            (
96                SemanticCategory::Generated,
97                vec![
98                    ".o",
99                    ".a",
100                    ".so",
101                    ".dll",
102                    ".dylib",
103                    ".exe",
104                    ".app",
105                    ".class",
106                    ".jar",
107                    ".war",
108                    ".pyc",
109                    ".pyo",
110                    ".pyd",
111                    ".min.js",
112                    ".min.css",
113                    ".bundle.js",
114                    ".chunk.js",
115                    "generated",
116                    "gen",
117                    "auto",
118                    "autogen",
119                    ".g.dart",
120                ],
121            ),
122            // Data patterns - specific data formats
123            (
124                SemanticCategory::Data,
125                vec![
126                    ".csv", ".tsv", ".parquet", ".feather", ".arrow", ".db", ".sqlite", ".sql",
127                    ".mdb", ".dbf", ".h5", ".hdf5", ".nc", ".zarr", ".npy", ".npz", "data",
128                    "datasets", "corpus", "samples",
129                ],
130            ),
131            // Assets patterns - multimedia and static files
132            (
133                SemanticCategory::Assets,
134                vec![
135                    ".png",
136                    ".jpg",
137                    ".jpeg",
138                    ".gif",
139                    ".svg",
140                    ".ico",
141                    ".webp",
142                    ".mp3",
143                    ".wav",
144                    ".ogg",
145                    ".mp4",
146                    ".webm",
147                    ".mov",
148                    ".ttf",
149                    ".otf",
150                    ".woff",
151                    ".woff2",
152                    ".eot",
153                    ".css",
154                    ".scss",
155                    ".sass",
156                    ".less",
157                    ".styl",
158                    "assets",
159                    "static",
160                    "public",
161                    "resources",
162                    "media",
163                ],
164            ),
165            // Scripts patterns - executable scripts
166            (
167                SemanticCategory::Scripts,
168                vec![
169                    ".sh", ".bash", ".zsh", ".fish", ".ps1", ".bat", ".cmd", "scripts", "bin",
170                    "tools", "utils", "hooks", "install", "setup", "deploy", "run", "start",
171                    "stop",
172                ],
173            ),
174            // Test patterns - testing files
175            (
176                SemanticCategory::Tests,
177                vec![
178                    "test",
179                    "tests",
180                    "spec",
181                    "specs",
182                    "__tests__",
183                    "_test",
184                    "test_",
185                    ".test.",
186                    ".spec.",
187                    "_spec.",
188                    "integration",
189                    "unit",
190                    "e2e",
191                ],
192            ),
193            // Build system patterns - build files
194            (
195                SemanticCategory::BuildSystem,
196                vec![
197                    "Makefile",
198                    "makefile",
199                    "CMakeLists",
200                    "build",
201                    "BUILD",
202                    "Cargo.toml",
203                    "package.json",
204                    "pom.xml",
205                    "build.gradle",
206                    "setup.py",
207                    "setup.cfg",
208                    "pyproject.toml",
209                    "composer.json",
210                    ".bazel",
211                    "meson.build",
212                    "SConstruct",
213                    "Rakefile",
214                ],
215            ),
216            // Configuration patterns - config files
217            (
218                SemanticCategory::Configuration,
219                vec![
220                    ".config",
221                    ".conf",
222                    ".cfg",
223                    ".ini",
224                    ".env",
225                    ".properties",
226                    ".json",
227                    ".yaml",
228                    ".yml",
229                    ".toml",
230                    ".xml",
231                    "settings",
232                    "config",
233                    "configuration",
234                    ".gitignore",
235                    ".dockerignore",
236                ],
237            ),
238            // Dependencies patterns - dependency directories
239            (
240                SemanticCategory::Dependencies,
241                vec![
242                    "node_modules",
243                    "vendor",
244                    "packages",
245                    ".packages",
246                    "target",
247                    "venv",
248                    ".venv",
249                    "env",
250                    ".env",
251                    "virtualenv",
252                    "__pycache__",
253                    "dist",
254                    "build",
255                    ".gradle",
256                    ".m2",
257                    "Cargo.lock",
258                    "package-lock.json",
259                    "yarn.lock",
260                    "poetry.lock",
261                    "Gemfile.lock",
262                    "requirements.txt",
263                ],
264            ),
265            // Documentation patterns
266            (
267                SemanticCategory::Documentation,
268                vec![
269                    "README",
270                    "readme",
271                    "LICENSE",
272                    "CHANGELOG",
273                    "AUTHORS",
274                    "CONTRIBUTORS",
275                    "INSTALL",
276                    "GUIDE",
277                    "TUTORIAL",
278                    "DOCS",
279                    "NOTES",
280                    "TODO",
281                    ".md",
282                    ".rst",
283                    ".txt",
284                    ".adoc",
285                    ".org",
286                    ".tex",
287                ],
288            ),
289            // Source code patterns - most general, should be last
290            (
291                SemanticCategory::SourceCode,
292                vec![
293                    ".rs", ".py", ".js", ".ts", ".jsx", ".tsx", ".go", ".java", ".c", ".cpp", ".h",
294                    ".hpp", ".cs", ".rb", ".php", ".swift", ".kt", ".scala", ".r", ".jl", ".ml",
295                    ".hs", ".ex", ".exs", ".clj", ".dart", ".nim",
296                ],
297            ),
298        ];
299
300        Self { patterns }
301    }
302
303    /// Analyze a file path and determine its semantic category
304    pub fn categorize(&self, path: &Path) -> SemanticCategory {
305        let path_str = path.to_string_lossy().to_lowercase();
306        let file_name = path
307            .file_name()
308            .and_then(|n| n.to_str())
309            .unwrap_or("")
310            .to_lowercase();
311
312        // First, check for specific build system files that should override other patterns
313        if file_name == "cargo.toml"
314            || file_name == "package.json"
315            || file_name == "makefile"
316            || file_name == "cmakelists.txt"
317            || file_name == "build.gradle"
318            || file_name == "setup.py"
319        {
320            return SemanticCategory::BuildSystem;
321        }
322
323        // Check if it's a test file first (high priority)
324        if self.is_test_file(&path_str, &file_name) {
325            return SemanticCategory::Tests;
326        }
327
328        // Check patterns in the predefined priority order
329        for (category, patterns) in &self.patterns {
330            for pattern in patterns {
331                if self.matches_pattern(&file_name, &path_str, pattern) {
332                    return category.clone();
333                }
334            }
335        }
336
337        // Check if it's a project root file
338        if (path.parent().is_none() || path.components().count() == 1)
339            && (file_name == "cargo.toml"
340                || file_name == "package.json"
341                || file_name == "setup.py"
342                || file_name == "go.mod")
343        {
344            return SemanticCategory::ProjectRoot;
345        }
346
347        SemanticCategory::Unknown
348    }
349
350    /// Check if a pattern matches a file, with better precision for extensions
351    fn matches_pattern(&self, file_name: &str, path_str: &str, pattern: &str) -> bool {
352        if pattern.starts_with('.') && pattern.len() > 1 {
353            // This is a file extension - match it precisely
354            file_name.ends_with(pattern) || path_str.contains(&format!("{}/", pattern))
355        } else {
356            // This is a name pattern - use contains matching
357            file_name.contains(pattern) || path_str.contains(pattern)
358        }
359    }
360
361    /// Check if a file is a test file
362    fn is_test_file(&self, path_str: &str, file_name: &str) -> bool {
363        // Find the test patterns in the ordered list
364        for (category, patterns) in &self.patterns {
365            if *category == SemanticCategory::Tests {
366                return patterns
367                    .iter()
368                    .any(|pattern| self.matches_pattern(file_name, path_str, pattern));
369            }
370        }
371        false
372    }
373
374    /// Calculate semantic similarity between two files (0.0 to 1.0)
375    /// This uses Omni's wave-based approach!
376    pub fn similarity(&self, path1: &Path, path2: &Path) -> f32 {
377        let cat1 = self.categorize(path1);
378        let cat2 = self.categorize(path2);
379
380        if cat1 == cat2 {
381            // Same category = high base similarity
382            let mut similarity = 0.8;
383
384            // Boost similarity if extensions match
385            if path1.extension() == path2.extension() {
386                similarity += 0.1;
387            }
388
389            // Boost if in same directory
390            if path1.parent() == path2.parent() {
391                similarity += 0.1;
392            }
393
394            similarity
395        } else {
396            // Different categories - check wave interference
397            let wave1 = cat1.wave_signature();
398            let wave2 = cat2.wave_signature();
399
400            // Calculate wave interference (simplified)
401            let interference = (wave1 ^ wave2).count_ones();
402            let max_bits = 32;
403
404            // Convert to similarity (0 = identical, 32 = completely different)
405            1.0 - (interference as f32 / max_bits as f32)
406        }
407    }
408}
409
410impl Default for SemanticAnalyzer {
411    fn default() -> Self {
412        Self::new()
413    }
414}
415
416/// Groups files by semantic similarity
417pub fn group_by_semantics<'a>(files: &[&'a Path]) -> HashMap<SemanticCategory, Vec<&'a Path>> {
418    let analyzer = SemanticAnalyzer::new();
419    let mut groups: HashMap<SemanticCategory, Vec<&'a Path>> = HashMap::new();
420
421    for file in files {
422        let category = analyzer.categorize(file);
423        groups.entry(category).or_default().push(file);
424    }
425
426    groups
427}
428
#[cfg(test)]
mod tests {
    use super::*;
    use std::path::PathBuf;

    /// One representative file name per category must land in that category.
    #[test]
    fn test_categorization() {
        let analyzer = SemanticAnalyzer::new();

        let expectations = [
            ("README.md", SemanticCategory::Documentation),
            ("main.rs", SemanticCategory::SourceCode),
            ("test_utils.rs", SemanticCategory::Tests),
            ("Cargo.toml", SemanticCategory::BuildSystem),
            ("config.yaml", SemanticCategory::Configuration),
            ("logo.png", SemanticCategory::Assets),
            ("data.csv", SemanticCategory::Data),
            ("install.sh", SemanticCategory::Scripts),
            ("main.o", SemanticCategory::Generated),
        ];

        for (name, expected) in expectations.iter() {
            assert_eq!(analyzer.categorize(&PathBuf::from(*name)), *expected);
        }
    }

    /// Documentation, source code and tests must have pairwise distinct signatures.
    #[test]
    fn test_wave_signatures() {
        let waves = [
            SemanticCategory::Documentation.wave_signature(),
            SemanticCategory::SourceCode.wave_signature(),
            SemanticCategory::Tests.wave_signature(),
        ];

        for (index, left) in waves.iter().enumerate() {
            for right in waves.iter().skip(index + 1) {
                assert_ne!(left, right);
            }
        }
    }

    /// Files of one category score high; files of different categories score lower.
    #[test]
    fn test_similarity() {
        let analyzer = SemanticAnalyzer::new();
        let main_rs = PathBuf::from("main.rs");

        // Same category files should have high similarity
        let same_category = analyzer.similarity(&main_rs, &PathBuf::from("lib.rs"));
        assert!(
            same_category > 0.7,
            "Expected similarity > 0.7, got {}",
            same_category
        );

        // Different category files should have lower similarity
        let cross_category = analyzer.similarity(&main_rs, &PathBuf::from("README.md"));
        assert!(
            cross_category < 0.6,
            "Expected similarity < 0.6, got {}",
            cross_category
        );
    }
}