aurora_semantic/
config.rs

1//! Configuration types for the aurora-semantic engine.
2
3use serde::{Deserialize, Serialize};
4use std::path::PathBuf;
5
6use crate::types::Language;
7
/// Main configuration for the semantic search engine.
///
/// Bundles the index location with one settings section per subsystem
/// (chunking, embedding, search, ignore rules, performance). Construct it
/// with `EngineConfig::new` or `Default::default`, then swap individual
/// sections with the `with_*` builder methods.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct EngineConfig {
    /// Directory where indexes are stored (`.aurora` when built via `Default`).
    pub index_dir: PathBuf,
    /// Chunking configuration.
    pub chunking: ChunkingConfig,
    /// Embedding configuration.
    pub embedding: EmbeddingConfig,
    /// Search configuration.
    pub search: SearchConfig,
    /// Ignore patterns configuration.
    pub ignore: IgnoreConfig,
    /// Performance tuning.
    pub performance: PerformanceConfig,
}
24
25impl EngineConfig {
26    /// Create a new configuration with the given index directory.
27    pub fn new(index_dir: PathBuf) -> Self {
28        Self {
29            index_dir,
30            chunking: ChunkingConfig::default(),
31            embedding: EmbeddingConfig::default(),
32            search: SearchConfig::default(),
33            ignore: IgnoreConfig::default(),
34            performance: PerformanceConfig::default(),
35        }
36    }
37
38    /// Builder-style method to set chunking config.
39    pub fn with_chunking(mut self, config: ChunkingConfig) -> Self {
40        self.chunking = config;
41        self
42    }
43
44    /// Builder-style method to set embedding config.
45    pub fn with_embedding(mut self, config: EmbeddingConfig) -> Self {
46        self.embedding = config;
47        self
48    }
49
50    /// Builder-style method to set search config.
51    pub fn with_search(mut self, config: SearchConfig) -> Self {
52        self.search = config;
53        self
54    }
55
56    /// Builder-style method to set ignore config.
57    pub fn with_ignore(mut self, config: IgnoreConfig) -> Self {
58        self.ignore = config;
59        self
60    }
61
62    /// Builder-style method to set performance config.
63    pub fn with_performance(mut self, config: PerformanceConfig) -> Self {
64        self.performance = config;
65        self
66    }
67}
68
69impl Default for EngineConfig {
70    fn default() -> Self {
71        Self::new(PathBuf::from(".aurora"))
72    }
73}
74
/// Configuration for code chunking.
///
/// Both size limits are expressed in characters. The `Default`
/// implementation uses a minimum of 50, a maximum of 2000, and enables
/// comment extraction.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct ChunkingConfig {
    /// Maximum chunk size in characters (default: 2000).
    pub max_chunk_size: usize,
    /// Minimum chunk size in characters (default: 50).
    pub min_chunk_size: usize,
    /// Whether to extract documentation comments (default: `true`).
    pub extract_comments: bool,
}
85
86impl Default for ChunkingConfig {
87    fn default() -> Self {
88        Self {
89            max_chunk_size: 2000,
90            min_chunk_size: 50,
91            extract_comments: true,
92        }
93    }
94}
95
/// Configuration for embedding generation.
///
/// The `Default` implementation uses 768-dimensional vectors, batches of 32,
/// a maximum sequence length of 512, and normalization enabled.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct EmbeddingConfig {
    /// Embedding dimension (default: 768).
    pub dimension: usize,
    /// Batch size for embedding generation (default: 32).
    pub batch_size: usize,
    /// Maximum sequence length (default: 512).
    // NOTE(review): the unit (tokens vs. characters) is not stated anywhere in
    // this file; confirm against the embedding backend.
    pub max_length: usize,
    /// Whether to normalize embeddings (default: `true`).
    pub normalize: bool,
}
108
109impl Default for EmbeddingConfig {
110    fn default() -> Self {
111        Self {
112            dimension: 768, // Common for code models like jina-code
113            batch_size: 32,
114            max_length: 512,
115            normalize: true,
116        }
117    }
118}
119
/// Configuration for search behavior.
///
/// The `Default` implementation selects hybrid search with a lexical weight
/// of 0.4 and a semantic weight of 0.6, returns 20 results by default (at
/// most 100), drops results scoring below 0.1, and enables fuzzy matching
/// with an edit distance of 2.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct SearchConfig {
    /// Default number of results to return (default: 20).
    pub default_limit: usize,
    /// Maximum number of results to return (default: 100).
    pub max_limit: usize,
    /// Default search mode (default: `SearchMode::Hybrid`).
    pub default_mode: SearchMode,
    /// Weight for lexical results in hybrid search (0.0 to 1.0; default: 0.4).
    pub lexical_weight: f32,
    /// Weight for semantic results in hybrid search (0.0 to 1.0; default: 0.6).
    pub semantic_weight: f32,
    /// Minimum score threshold for results (0.0 to 1.0; default: 0.1).
    pub min_score: f32,
    /// Enable fuzzy matching in lexical search (default: `true`).
    pub fuzzy_matching: bool,
    /// Fuzzy matching distance (edit distance; default: 2).
    pub fuzzy_distance: u8,
}
140
141impl Default for SearchConfig {
142    fn default() -> Self {
143        Self {
144            default_limit: 20,
145            max_limit: 100,
146            default_mode: SearchMode::Hybrid,
147            lexical_weight: 0.4,
148            semantic_weight: 0.6,
149            min_score: 0.1,
150            fuzzy_matching: true,
151            fuzzy_distance: 2,
152        }
153    }
154}
155
/// Search mode selection.
///
/// Picks which retrieval strategy is used. `Hybrid` is the variant returned
/// by the `Default` implementation.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
pub enum SearchMode {
    /// Keyword-based lexical search only.
    Lexical,
    /// Embedding-based semantic search only.
    Semantic,
    /// Combined lexical and semantic search.
    Hybrid,
}
166
167impl Default for SearchMode {
168    fn default() -> Self {
169        Self::Hybrid
170    }
171}
172
/// Configuration for ignore patterns.
///
/// Combines VCS-style ignore files, glob patterns, extension and directory
/// name lists, a file-size cap, and explicit path exclusions. The `Default`
/// implementation ships large built-in extension and directory lists and
/// excludes the `.aurora` directory.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct IgnoreConfig {
    /// Respect .gitignore files (default: `true`).
    pub use_gitignore: bool,
    /// Respect .ignore files (default: `true`).
    pub use_ignore_files: bool,
    /// Additional patterns to ignore (glob patterns; empty by default).
    pub patterns: Vec<String>,
    /// File extensions to ignore.
    ///
    /// The built-in defaults are written without a leading dot (e.g. `png`).
    pub ignored_extensions: Vec<String>,
    /// Directories to always ignore (by name, matches anywhere in path).
    pub ignored_directories: Vec<String>,
    /// Maximum file size to index (in bytes; default: `512 * 1024`).
    pub max_file_size: u64,
    /// Specific file paths to exclude (relative to workspace root; empty by default).
    pub excluded_files: Vec<PathBuf>,
    /// Specific directory paths to exclude (relative to workspace root).
    ///
    /// Defaults to a single entry, `.aurora`.
    pub excluded_directories: Vec<PathBuf>,
}
193
194impl Default for IgnoreConfig {
195    fn default() -> Self {
196        Self {
197            use_gitignore: true,
198            use_ignore_files: true,
199            patterns: vec![],
200            ignored_extensions: vec![
201                // ============================================
202                // BINARY / COMPILED
203                // ============================================
204                "exe".into(), "dll".into(), "so".into(), "dylib".into(), "a".into(),
205                "lib".into(), "obj".into(), "o".into(), "ko".into(), "elf".into(),
206                "bin".into(), "out".into(), "app".into(), "msi".into(), "dmg".into(),
207                "deb".into(), "rpm".into(), "apk".into(), "ipa".into(), "aab".into(),
208                "class".into(), "jar".into(), "war".into(), "ear".into(),
209                "pyc".into(), "pyo".into(), "pyd".into(), "whl".into(),
210                "wasm".into(), "wat".into(),
211                "rlib".into(), "rmeta".into(), "d".into(),
212                
213                // ============================================
214                // IMAGES / MEDIA
215                // ============================================
216                "png".into(), "jpg".into(), "jpeg".into(), "gif".into(), "bmp".into(),
217                "ico".into(), "icns".into(), "svg".into(), "webp".into(), "avif".into(),
218                "tiff".into(), "tif".into(), "psd".into(), "ai".into(), "eps".into(),
219                "raw".into(), "cr2".into(), "nef".into(), "heic".into(), "heif".into(),
220                // Video
221                "mp4".into(), "avi".into(), "mov".into(), "wmv".into(), "flv".into(),
222                "mkv".into(), "webm".into(), "m4v".into(), "mpeg".into(), "mpg".into(),
223                "3gp".into(), "ogv".into(),
224                // Audio
225                "mp3".into(), "wav".into(), "ogg".into(), "flac".into(), "aac".into(),
226                "wma".into(), "m4a".into(), "opus".into(), "aiff".into(),
227                
228                // ============================================
229                // FONTS
230                // ============================================
231                "woff".into(), "woff2".into(), "ttf".into(), "otf".into(), "eot".into(),
232                
233                // ============================================
234                // ARCHIVES / COMPRESSED
235                // ============================================
236                "zip".into(), "tar".into(), "gz".into(), "bz2".into(), "xz".into(),
237                "rar".into(), "7z".into(), "tgz".into(), "tbz2".into(), "lz".into(),
238                "lzma".into(), "cab".into(), "iso".into(), "dmg".into(), "pkg".into(),
239                "zst".into(), "lz4".into(), "br".into(),
240                
241                // ============================================
242                // DOCUMENTS / NON-CODE (CRITICAL FOR CODE SEARCH!)
243                // ============================================
244                // Markdown - creates too many chunks, dominates search results
245                "md".into(), "markdown".into(), "mdx".into(),
246                // reStructuredText
247                "rst".into(), "rest".into(),
248                // Plain text docs
249                "txt".into(),
250                // Office documents
251                "pdf".into(), "doc".into(), "docx".into(), "xls".into(), "xlsx".into(),
252                "ppt".into(), "pptx".into(), "odt".into(), "ods".into(), "odp".into(),
253                "rtf".into(), "pages".into(), "numbers".into(), "key".into(),
254                
255                // ============================================
256                // DATABASE FILES
257                // ============================================
258                "db".into(), "sqlite".into(), "sqlite3".into(), "mdb".into(),
259                "accdb".into(), "frm".into(), "myd".into(), "myi".into(),
260                "ibd".into(), "dbf".into(), "sav".into(),
261                
262                // ============================================
263                // LOCK FILES (CRITICAL!)
264                // ============================================
265                "lock".into(), "lockb".into(),
266                
267                // ============================================
268                // MAP / SOURCE MAP FILES
269                // ============================================
270                "map".into(),
271                
272                // ============================================
273                // LOG FILES
274                // ============================================
275                "log".into(),
276                
277                // ============================================
278                // BACKUP / TEMP FILES
279                // ============================================
280                "bak".into(), "backup".into(), "tmp".into(), "temp".into(),
281                "swp".into(), "swo".into(), "swn".into(), // Vim swap files
282                
283                // ============================================
284                // OS SPECIFIC
285                // ============================================
286                "DS_Store".into(), "Thumbs.db".into(), "desktop.ini".into(),
287                
288                // ============================================
289                // CERTIFICATES / KEYS (Security)
290                // ============================================
291                "pem".into(), "crt".into(), "cer".into(), "der".into(),
292                "p12".into(), "pfx".into(), "jks".into(), "keystore".into(),
293            ],
294            ignored_directories: vec![
295                // ============================================
296                // VERSION CONTROL
297                // ============================================
298                ".git".into(), ".svn".into(), ".hg".into(), ".bzr".into(),
299                "_darcs".into(), ".fossil".into(),
300                
301                // ============================================
302                // JAVASCRIPT / NODE.JS / WEB
303                // ============================================
304                "node_modules".into(),
305                ".npm".into(), ".pnpm".into(), ".pnpm-store".into(),
306                ".yarn".into(), ".yarnrc".into(), ".yarn-cache".into(),
307                "bower_components".into(),
308                ".parcel-cache".into(), ".cache".into(),
309                ".turbo".into(), ".vercel".into(), ".netlify".into(),
310                ".next".into(), ".nuxt".into(), ".output".into(),
311                ".svelte-kit".into(), ".astro".into(),
312                ".docusaurus".into(), ".vuepress".into(), ".vitepress".into(),
313                "storybook-static".into(), ".storybook".into(),
314                
315                // ============================================
316                // PYTHON
317                // ============================================
318                "__pycache__".into(), ".pytest_cache".into(), ".mypy_cache".into(),
319                ".ruff_cache".into(), ".pytype".into(),
320                "venv".into(), ".venv".into(), "env".into(), ".env".into(),
321                "virtualenv".into(), ".virtualenv".into(),
322                ".conda".into(), "conda-meta".into(), "envs".into(),
323                ".tox".into(), ".nox".into(),
324                "*.egg-info".into(), ".eggs".into(), "eggs".into(),
325                "site-packages".into(), "dist-packages".into(),
326                ".ipynb_checkpoints".into(),
327                "htmlcov".into(), ".coverage".into(),
328                ".hypothesis".into(),
329                
330                // ============================================
331                // RUST
332                // ============================================
333                "target".into(),
334                ".cargo".into(),
335                
336                // ============================================
337                // GO
338                // ============================================
339                "vendor".into(),
340                "pkg".into(),
341                
342                // ============================================
343                // JAVA / KOTLIN / GRADLE / MAVEN
344                // ============================================
345                ".gradle".into(), "gradle".into(),
346                ".m2".into(), ".mvn".into(),
347                "bin".into(), "out".into(),
348                ".apt_generated".into(), ".apt_generated_tests".into(),
349                "generated-sources".into(), "generated-test-sources".into(),
350                
351                // ============================================
352                // .NET / C#
353                // ============================================
354                "obj".into(), "bin".into(),
355                "packages".into(), ".nuget".into(),
356                "Debug".into(), "Release".into(),
357                "x64".into(), "x86".into(), "ARM".into(), "ARM64".into(),
358                "TestResults".into(),
359                
360                // ============================================
361                // C / C++
362                // ============================================
363                "CMakeFiles".into(), "cmake-build-debug".into(), "cmake-build-release".into(),
364                ".ccache".into(), ".sccache".into(),
365                "Debug".into(), "Release".into(), "MinSizeRel".into(), "RelWithDebInfo".into(),
366                
367                // ============================================
368                // RUBY
369                // ============================================
370                ".bundle".into(), "vendor/bundle".into(),
371                ".gem".into(), "gems".into(),
372                
373                // ============================================
374                // PHP
375                // ============================================
376                "vendor".into(),
377                ".phpunit.cache".into(), ".php-cs-fixer.cache".into(),
378                
379                // ============================================
380                // SWIFT / IOS / MACOS
381                // ============================================
382                ".build".into(), "Build".into(),
383                "DerivedData".into(), "Pods".into(),
384                ".swiftpm".into(), "Carthage".into(),
385                "xcuserdata".into(), "*.xcworkspace".into(),
386                
387                // ============================================
388                // ANDROID
389                // ============================================
390                ".gradle".into(), "gradle".into(),
391                "build".into(), "app/build".into(),
392                ".cxx".into(), ".externalNativeBuild".into(),
393                "captures".into(), ".navigation".into(),
394                "local.properties".into(),
395                
396                // ============================================
397                // FLUTTER / DART
398                // ============================================
399                ".dart_tool".into(), ".pub-cache".into(), ".pub".into(),
400                "build".into(), ".flutter-plugins".into(),
401                "ephemeral".into(),
402                
403                // ============================================
404                // ELECTRON / TAURI
405                // ============================================
406                "release".into(), "src-tauri/target".into(),
407                ".webpack".into(), ".electron".into(),
408                
409                // ============================================
410                // UNITY / GAME DEV
411                // ============================================
412                "Library".into(), "Temp".into(), "Obj".into(),
413                "Build".into(), "Builds".into(), "Logs".into(),
414                "UserSettings".into(), "MemoryCaptures".into(),
415                "Recordings".into(), "Asset Store-5.x".into(),
416                
417                // ============================================
418                // UNREAL ENGINE
419                // ============================================
420                "Binaries".into(), "Intermediate".into(), "Saved".into(),
421                "DerivedDataCache".into(),
422                
423                // ============================================
424                // JUCE (Audio Development)
425                // ============================================
426                "Builds".into(), "JuceLibraryCode".into(),
427                
428                // ============================================
429                // TIZEN
430                // ============================================
431                ".sign".into(), ".build".into(), "Debug-Tizen".into(),
432                "Release-Tizen".into(),
433                
434                // ============================================
435                // IDE / EDITOR CONFIGS
436                // ============================================
437                ".idea".into(), ".vscode".into(), ".vs".into(),
438                ".cursor".into(), ".atom".into(), ".sublime".into(),
439                ".eclipse".into(), ".settings".into(), ".project".into(),
440                ".classpath".into(), ".factorypath".into(),
441                "*.xcodeproj".into(), "*.xcworkspace".into(),
442                ".metals".into(), ".bloop".into(), ".bsp".into(),
443                
444                // ============================================
445                // BUILD OUTPUTS (GENERIC)
446                // ============================================
447                "dist".into(), "build".into(), "out".into(), "output".into(),
448                "_build".into(), ".build".into(),
449                "public".into(), "static".into(), // Often generated
450                "generated".into(), "gen".into(), "auto-generated".into(),
451                
452                // ============================================
453                // TESTING / COVERAGE
454                // ============================================
455                "coverage".into(), ".nyc_output".into(),
456                "test-results".into(), "test-output".into(),
457                "__tests__".into(), "__mocks__".into(),
458                ".jest".into(), "jest-cache".into(),
459                "cypress/videos".into(), "cypress/screenshots".into(),
460                "playwright-report".into(), "test-results".into(),
461                
462                // ============================================
463                // DOCUMENTATION (Generated and Source - CRITICAL!)
464                // ============================================
465                // Documentation directories pollute code search results
466                "docs".into(), "doc".into(), "DOCS".into(), "DOC".into(),
467                "documentation".into(), "Documentation".into(),
468                "docs/_build".into(), "site".into(), "_site".into(),
469                "javadoc".into(), "apidoc".into(), "doxygen".into(),
470                "typedoc".into(), "rustdoc".into(),
471                // Spec/planning directories
472                "specs".into(), "spec".into(), "SPECS".into(),
473                ".specify".into(), ".windsurf".into(),
474                "resources".into(), // Often contains non-code docs
475                
476                // ============================================
477                // LOGS / TEMP
478                // ============================================
479                "logs".into(), "log".into(),
480                "tmp".into(), "temp".into(), ".tmp".into(), ".temp".into(),
481                
482                // ============================================
483                // AURORA / PROJECT SPECIFIC
484                // ============================================
485                ".aurora".into(),
486                
487                // ============================================
488                // MISC / OTHER
489                // ============================================
490                ".terraform".into(), ".pulumi".into(),
491                ".serverless".into(), ".amplify".into(),
492                "cdk.out".into(), ".aws-sam".into(),
493                ".docker".into(), ".vagrant".into(),
494                "helm-charts".into(),
495            ],
496            max_file_size: 512 * 1024, // 512KB - reduced from 1MB
497            // Explicit path exclusions (empty by default, users can add specific paths)
498            excluded_files: vec![],
499            // Always exclude the Aurora index directory by default
500            excluded_directories: vec![PathBuf::from(".aurora")],
501        }
502    }
503}
504
505impl IgnoreConfig {
506    /// Builder-style method to exclude a specific file path (relative to workspace root).
507    ///
508    /// # Example
509    /// ```rust,ignore
510    /// let config = IgnoreConfig::default()
511    ///     .with_excluded_file("src/generated/types.rs");
512    /// ```
513    pub fn with_excluded_file(mut self, path: impl Into<PathBuf>) -> Self {
514        self.excluded_files.push(path.into());
515        self
516    }
517
518    /// Builder-style method to exclude multiple file paths (relative to workspace root).
519    ///
520    /// # Example
521    /// ```rust,ignore
522    /// let config = IgnoreConfig::default()
523    ///     .with_excluded_files(vec![
524    ///         "src/proto/generated.rs".into(),
525    ///         "src/bindings/ffi.rs".into(),
526    ///     ]);
527    /// ```
528    pub fn with_excluded_files(mut self, paths: Vec<PathBuf>) -> Self {
529        self.excluded_files.extend(paths);
530        self
531    }
532
533    /// Builder-style method to exclude a specific directory path (relative to workspace root).
534    ///
535    /// # Example
536    /// ```rust,ignore
537    /// let config = IgnoreConfig::default()
538    ///     .with_excluded_directory("vendor/third-party");
539    /// ```
540    pub fn with_excluded_directory(mut self, path: impl Into<PathBuf>) -> Self {
541        self.excluded_directories.push(path.into());
542        self
543    }
544
545    /// Builder-style method to exclude multiple directory paths (relative to workspace root).
546    ///
547    /// # Example
548    /// ```rust,ignore
549    /// let config = IgnoreConfig::default()
550    ///     .with_excluded_directories(vec![
551    ///         "generated".into(),
552    ///         "vendor/libs".into(),
553    ///     ]);
554    /// ```
555    pub fn with_excluded_directories(mut self, paths: Vec<PathBuf>) -> Self {
556        self.excluded_directories.extend(paths);
557        self
558    }
559
560    /// Builder-style method to add an additional ignore pattern (glob format).
561    pub fn with_pattern(mut self, pattern: impl Into<String>) -> Self {
562        self.patterns.push(pattern.into());
563        self
564    }
565
566    /// Builder-style method to add an ignored file extension.
567    pub fn with_ignored_extension(mut self, ext: impl Into<String>) -> Self {
568        self.ignored_extensions.push(ext.into());
569        self
570    }
571
572    /// Builder-style method to add an ignored directory name.
573    pub fn with_ignored_directory(mut self, dir: impl Into<String>) -> Self {
574        self.ignored_directories.push(dir.into());
575        self
576    }
577
578    /// Builder-style method to set the maximum file size.
579    pub fn with_max_file_size(mut self, size: u64) -> Self {
580        self.max_file_size = size;
581        self
582    }
583}
584
/// Performance tuning configuration.
///
/// The `Default` implementation takes the thread count from
/// `std::thread::available_parallelism` (falling back to 4 when that call
/// fails), sets a 512 MiB memory limit, and enables incremental indexing.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct PerformanceConfig {
    /// Number of threads for parallel processing.
    pub num_threads: usize,
    /// Memory limit for indexing (in bytes; default: `512 * 1024 * 1024`).
    pub memory_limit: usize,
    /// Enable incremental indexing (default: `true`).
    pub incremental: bool,
}
595
596impl Default for PerformanceConfig {
597    fn default() -> Self {
598        Self {
599            num_threads: std::thread::available_parallelism()
600                .map(|p| p.get())
601                .unwrap_or(4),
602            memory_limit: 512 * 1024 * 1024, // 512MB
603            incremental: true,
604        }
605    }
606}
607
/// Configuration for a specific workspace.
///
/// `WorkspaceConfig::new` starts with no additional ignore patterns, no
/// language filter, and file watching disabled.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct WorkspaceConfig {
    /// Root path of the workspace.
    pub root_path: PathBuf,
    /// Workspace-specific ignore patterns (in addition to global).
    pub additional_ignores: Vec<String>,
    /// Languages to index (None = all).
    pub languages: Option<Vec<Language>>,
    /// Watch for file changes (`false` unless enabled via `with_watch`).
    pub watch_changes: bool,
}
620
621impl WorkspaceConfig {
622    /// Create a new workspace configuration.
623    pub fn new(root_path: PathBuf) -> Self {
624        Self {
625            root_path,
626            additional_ignores: vec![],
627            languages: None,
628            watch_changes: false,
629        }
630    }
631
632    /// Builder-style method to add ignore patterns.
633    pub fn with_ignores(mut self, patterns: Vec<String>) -> Self {
634        self.additional_ignores = patterns;
635        self
636    }
637
638    /// Builder-style method to set languages.
639    pub fn with_languages(mut self, languages: Vec<Language>) -> Self {
640        self.languages = Some(languages);
641        self
642    }
643
644    /// Builder-style method to enable file watching.
645    pub fn with_watch(mut self) -> Self {
646        self.watch_changes = true;
647        self
648    }
649}