aurora_semantic/config.rs
1//! Configuration types for the aurora-semantic engine.
2
3use serde::{Deserialize, Serialize};
4use std::path::PathBuf;
5
6use crate::types::Language;
7
8/// Main configuration for the semantic search engine.
9#[derive(Debug, Clone, Serialize, Deserialize)]
10pub struct EngineConfig {
11 /// Directory where indexes are stored.
12 pub index_dir: PathBuf,
13 /// Chunking configuration.
14 pub chunking: ChunkingConfig,
15 /// Embedding configuration.
16 pub embedding: EmbeddingConfig,
17 /// Search configuration.
18 pub search: SearchConfig,
19 /// Ignore patterns configuration.
20 pub ignore: IgnoreConfig,
21 /// Performance tuning.
22 pub performance: PerformanceConfig,
23}
24
25impl EngineConfig {
26 /// Create a new configuration with the given index directory.
27 pub fn new(index_dir: PathBuf) -> Self {
28 Self {
29 index_dir,
30 chunking: ChunkingConfig::default(),
31 embedding: EmbeddingConfig::default(),
32 search: SearchConfig::default(),
33 ignore: IgnoreConfig::default(),
34 performance: PerformanceConfig::default(),
35 }
36 }
37
38 /// Builder-style method to set chunking config.
39 pub fn with_chunking(mut self, config: ChunkingConfig) -> Self {
40 self.chunking = config;
41 self
42 }
43
44 /// Builder-style method to set embedding config.
45 pub fn with_embedding(mut self, config: EmbeddingConfig) -> Self {
46 self.embedding = config;
47 self
48 }
49
50 /// Builder-style method to set search config.
51 pub fn with_search(mut self, config: SearchConfig) -> Self {
52 self.search = config;
53 self
54 }
55
56 /// Builder-style method to set ignore config.
57 pub fn with_ignore(mut self, config: IgnoreConfig) -> Self {
58 self.ignore = config;
59 self
60 }
61
62 /// Builder-style method to set performance config.
63 pub fn with_performance(mut self, config: PerformanceConfig) -> Self {
64 self.performance = config;
65 self
66 }
67}
68
69impl Default for EngineConfig {
70 fn default() -> Self {
71 Self::new(PathBuf::from(".aurora"))
72 }
73}
74
75/// Configuration for code chunking.
76#[derive(Debug, Clone, Serialize, Deserialize)]
77pub struct ChunkingConfig {
78 /// Maximum chunk size in characters.
79 pub max_chunk_size: usize,
80 /// Minimum chunk size in characters.
81 pub min_chunk_size: usize,
82 /// Whether to extract documentation comments.
83 pub extract_comments: bool,
84}
85
86impl Default for ChunkingConfig {
87 fn default() -> Self {
88 Self {
89 max_chunk_size: 2000,
90 min_chunk_size: 50,
91 extract_comments: true,
92 }
93 }
94}
95
96/// Configuration for embedding generation.
97#[derive(Debug, Clone, Serialize, Deserialize)]
98pub struct EmbeddingConfig {
99 /// Embedding dimension.
100 pub dimension: usize,
101 /// Batch size for embedding generation.
102 pub batch_size: usize,
103 /// Maximum sequence length.
104 pub max_length: usize,
105 /// Whether to normalize embeddings.
106 pub normalize: bool,
107}
108
109impl Default for EmbeddingConfig {
110 fn default() -> Self {
111 Self {
112 dimension: 768, // Common for code models like jina-code
113 batch_size: 32,
114 max_length: 512,
115 normalize: true,
116 }
117 }
118}
119
120/// Configuration for search behavior.
121#[derive(Debug, Clone, Serialize, Deserialize)]
122pub struct SearchConfig {
123 /// Default number of results to return.
124 pub default_limit: usize,
125 /// Maximum number of results to return.
126 pub max_limit: usize,
127 /// Default search mode.
128 pub default_mode: SearchMode,
129 /// Weight for lexical results in hybrid search (0.0 to 1.0).
130 pub lexical_weight: f32,
131 /// Weight for semantic results in hybrid search (0.0 to 1.0).
132 pub semantic_weight: f32,
133 /// Minimum score threshold for results (0.0 to 1.0).
134 pub min_score: f32,
135 /// Enable fuzzy matching in lexical search.
136 pub fuzzy_matching: bool,
137 /// Fuzzy matching distance (edit distance).
138 pub fuzzy_distance: u8,
139}
140
141impl Default for SearchConfig {
142 fn default() -> Self {
143 Self {
144 default_limit: 20,
145 max_limit: 100,
146 default_mode: SearchMode::Hybrid,
147 lexical_weight: 0.4,
148 semantic_weight: 0.6,
149 min_score: 0.1,
150 fuzzy_matching: true,
151 fuzzy_distance: 2,
152 }
153 }
154}
155
156/// Search mode selection.
157#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
158pub enum SearchMode {
159 /// Keyword-based lexical search only.
160 Lexical,
161 /// Embedding-based semantic search only.
162 Semantic,
163 /// Combined lexical and semantic search.
164 Hybrid,
165}
166
167impl Default for SearchMode {
168 fn default() -> Self {
169 Self::Hybrid
170 }
171}
172
173/// Configuration for ignore patterns.
174#[derive(Debug, Clone, Serialize, Deserialize)]
175pub struct IgnoreConfig {
176 /// Respect .gitignore files.
177 pub use_gitignore: bool,
178 /// Respect .ignore files.
179 pub use_ignore_files: bool,
180 /// Additional patterns to ignore (glob patterns).
181 pub patterns: Vec<String>,
182 /// File extensions to ignore.
183 pub ignored_extensions: Vec<String>,
184 /// Directories to always ignore (by name, matches anywhere in path).
185 pub ignored_directories: Vec<String>,
186 /// Maximum file size to index (in bytes).
187 pub max_file_size: u64,
188 /// Specific file paths to exclude (relative to workspace root).
189 pub excluded_files: Vec<PathBuf>,
190 /// Specific directory paths to exclude (relative to workspace root).
191 pub excluded_directories: Vec<PathBuf>,
192}
193
194impl Default for IgnoreConfig {
195 fn default() -> Self {
196 Self {
197 use_gitignore: true,
198 use_ignore_files: true,
199 patterns: vec![],
200 ignored_extensions: vec![
201 // ============================================
202 // BINARY / COMPILED
203 // ============================================
204 "exe".into(), "dll".into(), "so".into(), "dylib".into(), "a".into(),
205 "lib".into(), "obj".into(), "o".into(), "ko".into(), "elf".into(),
206 "bin".into(), "out".into(), "app".into(), "msi".into(), "dmg".into(),
207 "deb".into(), "rpm".into(), "apk".into(), "ipa".into(), "aab".into(),
208 "class".into(), "jar".into(), "war".into(), "ear".into(),
209 "pyc".into(), "pyo".into(), "pyd".into(), "whl".into(),
210 "wasm".into(), "wat".into(),
211 "rlib".into(), "rmeta".into(), "d".into(),
212
213 // ============================================
214 // IMAGES / MEDIA
215 // ============================================
216 "png".into(), "jpg".into(), "jpeg".into(), "gif".into(), "bmp".into(),
217 "ico".into(), "icns".into(), "svg".into(), "webp".into(), "avif".into(),
218 "tiff".into(), "tif".into(), "psd".into(), "ai".into(), "eps".into(),
219 "raw".into(), "cr2".into(), "nef".into(), "heic".into(), "heif".into(),
220 // Video
221 "mp4".into(), "avi".into(), "mov".into(), "wmv".into(), "flv".into(),
222 "mkv".into(), "webm".into(), "m4v".into(), "mpeg".into(), "mpg".into(),
223 "3gp".into(), "ogv".into(),
224 // Audio
225 "mp3".into(), "wav".into(), "ogg".into(), "flac".into(), "aac".into(),
226 "wma".into(), "m4a".into(), "opus".into(), "aiff".into(),
227
228 // ============================================
229 // FONTS
230 // ============================================
231 "woff".into(), "woff2".into(), "ttf".into(), "otf".into(), "eot".into(),
232
233 // ============================================
234 // ARCHIVES / COMPRESSED
235 // ============================================
236 "zip".into(), "tar".into(), "gz".into(), "bz2".into(), "xz".into(),
237 "rar".into(), "7z".into(), "tgz".into(), "tbz2".into(), "lz".into(),
238 "lzma".into(), "cab".into(), "iso".into(), "dmg".into(), "pkg".into(),
239 "zst".into(), "lz4".into(), "br".into(),
240
241 // ============================================
242 // DOCUMENTS / NON-CODE (CRITICAL FOR CODE SEARCH!)
243 // ============================================
244 // Markdown - creates too many chunks, dominates search results
245 "md".into(), "markdown".into(), "mdx".into(),
246 // reStructuredText
247 "rst".into(), "rest".into(),
248 // Plain text docs
249 "txt".into(),
250 // Office documents
251 "pdf".into(), "doc".into(), "docx".into(), "xls".into(), "xlsx".into(),
252 "ppt".into(), "pptx".into(), "odt".into(), "ods".into(), "odp".into(),
253 "rtf".into(), "pages".into(), "numbers".into(), "key".into(),
254
255 // ============================================
256 // DATABASE FILES
257 // ============================================
258 "db".into(), "sqlite".into(), "sqlite3".into(), "mdb".into(),
259 "accdb".into(), "frm".into(), "myd".into(), "myi".into(),
260 "ibd".into(), "dbf".into(), "sav".into(),
261
262 // ============================================
263 // LOCK FILES (CRITICAL!)
264 // ============================================
265 "lock".into(), "lockb".into(),
266
267 // ============================================
268 // MAP / SOURCE MAP FILES
269 // ============================================
270 "map".into(),
271
272 // ============================================
273 // LOG FILES
274 // ============================================
275 "log".into(),
276
277 // ============================================
278 // BACKUP / TEMP FILES
279 // ============================================
280 "bak".into(), "backup".into(), "tmp".into(), "temp".into(),
281 "swp".into(), "swo".into(), "swn".into(), // Vim swap files
282
283 // ============================================
284 // OS SPECIFIC
285 // ============================================
286 "DS_Store".into(), "Thumbs.db".into(), "desktop.ini".into(),
287
288 // ============================================
289 // CERTIFICATES / KEYS (Security)
290 // ============================================
291 "pem".into(), "crt".into(), "cer".into(), "der".into(),
292 "p12".into(), "pfx".into(), "jks".into(), "keystore".into(),
293 ],
294 ignored_directories: vec![
295 // ============================================
296 // VERSION CONTROL
297 // ============================================
298 ".git".into(), ".svn".into(), ".hg".into(), ".bzr".into(),
299 "_darcs".into(), ".fossil".into(),
300
301 // ============================================
302 // JAVASCRIPT / NODE.JS / WEB
303 // ============================================
304 "node_modules".into(),
305 ".npm".into(), ".pnpm".into(), ".pnpm-store".into(),
306 ".yarn".into(), ".yarnrc".into(), ".yarn-cache".into(),
307 "bower_components".into(),
308 ".parcel-cache".into(), ".cache".into(),
309 ".turbo".into(), ".vercel".into(), ".netlify".into(),
310 ".next".into(), ".nuxt".into(), ".output".into(),
311 ".svelte-kit".into(), ".astro".into(),
312 ".docusaurus".into(), ".vuepress".into(), ".vitepress".into(),
313 "storybook-static".into(), ".storybook".into(),
314
315 // ============================================
316 // PYTHON
317 // ============================================
318 "__pycache__".into(), ".pytest_cache".into(), ".mypy_cache".into(),
319 ".ruff_cache".into(), ".pytype".into(),
320 "venv".into(), ".venv".into(), "env".into(), ".env".into(),
321 "virtualenv".into(), ".virtualenv".into(),
322 ".conda".into(), "conda-meta".into(), "envs".into(),
323 ".tox".into(), ".nox".into(),
324 "*.egg-info".into(), ".eggs".into(), "eggs".into(),
325 "site-packages".into(), "dist-packages".into(),
326 ".ipynb_checkpoints".into(),
327 "htmlcov".into(), ".coverage".into(),
328 ".hypothesis".into(),
329
330 // ============================================
331 // RUST
332 // ============================================
333 "target".into(),
334 ".cargo".into(),
335
336 // ============================================
337 // GO
338 // ============================================
339 "vendor".into(),
340 "pkg".into(),
341
342 // ============================================
343 // JAVA / KOTLIN / GRADLE / MAVEN
344 // ============================================
345 ".gradle".into(), "gradle".into(),
346 ".m2".into(), ".mvn".into(),
347 "bin".into(), "out".into(),
348 ".apt_generated".into(), ".apt_generated_tests".into(),
349 "generated-sources".into(), "generated-test-sources".into(),
350
351 // ============================================
352 // .NET / C#
353 // ============================================
354 "obj".into(), "bin".into(),
355 "packages".into(), ".nuget".into(),
356 "Debug".into(), "Release".into(),
357 "x64".into(), "x86".into(), "ARM".into(), "ARM64".into(),
358 "TestResults".into(),
359
360 // ============================================
361 // C / C++
362 // ============================================
363 "CMakeFiles".into(), "cmake-build-debug".into(), "cmake-build-release".into(),
364 ".ccache".into(), ".sccache".into(),
365 "Debug".into(), "Release".into(), "MinSizeRel".into(), "RelWithDebInfo".into(),
366
367 // ============================================
368 // RUBY
369 // ============================================
370 ".bundle".into(), "vendor/bundle".into(),
371 ".gem".into(), "gems".into(),
372
373 // ============================================
374 // PHP
375 // ============================================
376 "vendor".into(),
377 ".phpunit.cache".into(), ".php-cs-fixer.cache".into(),
378
379 // ============================================
380 // SWIFT / IOS / MACOS
381 // ============================================
382 ".build".into(), "Build".into(),
383 "DerivedData".into(), "Pods".into(),
384 ".swiftpm".into(), "Carthage".into(),
385 "xcuserdata".into(), "*.xcworkspace".into(),
386
387 // ============================================
388 // ANDROID
389 // ============================================
390 ".gradle".into(), "gradle".into(),
391 "build".into(), "app/build".into(),
392 ".cxx".into(), ".externalNativeBuild".into(),
393 "captures".into(), ".navigation".into(),
394 "local.properties".into(),
395
396 // ============================================
397 // FLUTTER / DART
398 // ============================================
399 ".dart_tool".into(), ".pub-cache".into(), ".pub".into(),
400 "build".into(), ".flutter-plugins".into(),
401 "ephemeral".into(),
402
403 // ============================================
404 // ELECTRON / TAURI
405 // ============================================
406 "release".into(), "src-tauri/target".into(),
407 ".webpack".into(), ".electron".into(),
408
409 // ============================================
410 // UNITY / GAME DEV
411 // ============================================
412 "Library".into(), "Temp".into(), "Obj".into(),
413 "Build".into(), "Builds".into(), "Logs".into(),
414 "UserSettings".into(), "MemoryCaptures".into(),
415 "Recordings".into(), "Asset Store-5.x".into(),
416
417 // ============================================
418 // UNREAL ENGINE
419 // ============================================
420 "Binaries".into(), "Intermediate".into(), "Saved".into(),
421 "DerivedDataCache".into(),
422
423 // ============================================
424 // JUCE (Audio Development)
425 // ============================================
426 "Builds".into(), "JuceLibraryCode".into(),
427
428 // ============================================
429 // TIZEN
430 // ============================================
431 ".sign".into(), ".build".into(), "Debug-Tizen".into(),
432 "Release-Tizen".into(),
433
434 // ============================================
435 // IDE / EDITOR CONFIGS
436 // ============================================
437 ".idea".into(), ".vscode".into(), ".vs".into(),
438 ".cursor".into(), ".atom".into(), ".sublime".into(),
439 ".eclipse".into(), ".settings".into(), ".project".into(),
440 ".classpath".into(), ".factorypath".into(),
441 "*.xcodeproj".into(), "*.xcworkspace".into(),
442 ".metals".into(), ".bloop".into(), ".bsp".into(),
443
444 // ============================================
445 // BUILD OUTPUTS (GENERIC)
446 // ============================================
447 "dist".into(), "build".into(), "out".into(), "output".into(),
448 "_build".into(), ".build".into(),
449 "public".into(), "static".into(), // Often generated
450 "generated".into(), "gen".into(), "auto-generated".into(),
451
452 // ============================================
453 // TESTING / COVERAGE
454 // ============================================
455 "coverage".into(), ".nyc_output".into(),
456 "test-results".into(), "test-output".into(),
457 "__tests__".into(), "__mocks__".into(),
458 ".jest".into(), "jest-cache".into(),
459 "cypress/videos".into(), "cypress/screenshots".into(),
460 "playwright-report".into(), "test-results".into(),
461
462 // ============================================
463 // DOCUMENTATION (Generated and Source - CRITICAL!)
464 // ============================================
465 // Documentation directories pollute code search results
466 "docs".into(), "doc".into(), "DOCS".into(), "DOC".into(),
467 "documentation".into(), "Documentation".into(),
468 "docs/_build".into(), "site".into(), "_site".into(),
469 "javadoc".into(), "apidoc".into(), "doxygen".into(),
470 "typedoc".into(), "rustdoc".into(),
471 // Spec/planning directories
472 "specs".into(), "spec".into(), "SPECS".into(),
473 ".specify".into(), ".windsurf".into(),
474 "resources".into(), // Often contains non-code docs
475
476 // ============================================
477 // LOGS / TEMP
478 // ============================================
479 "logs".into(), "log".into(),
480 "tmp".into(), "temp".into(), ".tmp".into(), ".temp".into(),
481
482 // ============================================
483 // AURORA / PROJECT SPECIFIC
484 // ============================================
485 ".aurora".into(),
486
487 // ============================================
488 // MISC / OTHER
489 // ============================================
490 ".terraform".into(), ".pulumi".into(),
491 ".serverless".into(), ".amplify".into(),
492 "cdk.out".into(), ".aws-sam".into(),
493 ".docker".into(), ".vagrant".into(),
494 "helm-charts".into(),
495 ],
496 max_file_size: 512 * 1024, // 512KB - reduced from 1MB
497 // Explicit path exclusions (empty by default, users can add specific paths)
498 excluded_files: vec![],
499 // Always exclude the Aurora index directory by default
500 excluded_directories: vec![PathBuf::from(".aurora")],
501 }
502 }
503}
504
505impl IgnoreConfig {
506 /// Builder-style method to exclude a specific file path (relative to workspace root).
507 ///
508 /// # Example
509 /// ```rust,ignore
510 /// let config = IgnoreConfig::default()
511 /// .with_excluded_file("src/generated/types.rs");
512 /// ```
513 pub fn with_excluded_file(mut self, path: impl Into<PathBuf>) -> Self {
514 self.excluded_files.push(path.into());
515 self
516 }
517
518 /// Builder-style method to exclude multiple file paths (relative to workspace root).
519 ///
520 /// # Example
521 /// ```rust,ignore
522 /// let config = IgnoreConfig::default()
523 /// .with_excluded_files(vec![
524 /// "src/proto/generated.rs".into(),
525 /// "src/bindings/ffi.rs".into(),
526 /// ]);
527 /// ```
528 pub fn with_excluded_files(mut self, paths: Vec<PathBuf>) -> Self {
529 self.excluded_files.extend(paths);
530 self
531 }
532
533 /// Builder-style method to exclude a specific directory path (relative to workspace root).
534 ///
535 /// # Example
536 /// ```rust,ignore
537 /// let config = IgnoreConfig::default()
538 /// .with_excluded_directory("vendor/third-party");
539 /// ```
540 pub fn with_excluded_directory(mut self, path: impl Into<PathBuf>) -> Self {
541 self.excluded_directories.push(path.into());
542 self
543 }
544
545 /// Builder-style method to exclude multiple directory paths (relative to workspace root).
546 ///
547 /// # Example
548 /// ```rust,ignore
549 /// let config = IgnoreConfig::default()
550 /// .with_excluded_directories(vec![
551 /// "generated".into(),
552 /// "vendor/libs".into(),
553 /// ]);
554 /// ```
555 pub fn with_excluded_directories(mut self, paths: Vec<PathBuf>) -> Self {
556 self.excluded_directories.extend(paths);
557 self
558 }
559
560 /// Builder-style method to add an additional ignore pattern (glob format).
561 pub fn with_pattern(mut self, pattern: impl Into<String>) -> Self {
562 self.patterns.push(pattern.into());
563 self
564 }
565
566 /// Builder-style method to add an ignored file extension.
567 pub fn with_ignored_extension(mut self, ext: impl Into<String>) -> Self {
568 self.ignored_extensions.push(ext.into());
569 self
570 }
571
572 /// Builder-style method to add an ignored directory name.
573 pub fn with_ignored_directory(mut self, dir: impl Into<String>) -> Self {
574 self.ignored_directories.push(dir.into());
575 self
576 }
577
578 /// Builder-style method to set the maximum file size.
579 pub fn with_max_file_size(mut self, size: u64) -> Self {
580 self.max_file_size = size;
581 self
582 }
583}
584
585/// Performance tuning configuration.
586#[derive(Debug, Clone, Serialize, Deserialize)]
587pub struct PerformanceConfig {
588 /// Number of threads for parallel processing.
589 pub num_threads: usize,
590 /// Memory limit for indexing (in bytes).
591 pub memory_limit: usize,
592 /// Enable incremental indexing.
593 pub incremental: bool,
594}
595
596impl Default for PerformanceConfig {
597 fn default() -> Self {
598 Self {
599 num_threads: std::thread::available_parallelism()
600 .map(|p| p.get())
601 .unwrap_or(4),
602 memory_limit: 512 * 1024 * 1024, // 512MB
603 incremental: true,
604 }
605 }
606}
607
608/// Configuration for a specific workspace.
609#[derive(Debug, Clone, Serialize, Deserialize)]
610pub struct WorkspaceConfig {
611 /// Root path of the workspace.
612 pub root_path: PathBuf,
613 /// Workspace-specific ignore patterns (in addition to global).
614 pub additional_ignores: Vec<String>,
615 /// Languages to index (None = all).
616 pub languages: Option<Vec<Language>>,
617 /// Watch for file changes.
618 pub watch_changes: bool,
619}
620
621impl WorkspaceConfig {
622 /// Create a new workspace configuration.
623 pub fn new(root_path: PathBuf) -> Self {
624 Self {
625 root_path,
626 additional_ignores: vec![],
627 languages: None,
628 watch_changes: false,
629 }
630 }
631
632 /// Builder-style method to add ignore patterns.
633 pub fn with_ignores(mut self, patterns: Vec<String>) -> Self {
634 self.additional_ignores = patterns;
635 self
636 }
637
638 /// Builder-style method to set languages.
639 pub fn with_languages(mut self, languages: Vec<Language>) -> Self {
640 self.languages = Some(languages);
641 self
642 }
643
644 /// Builder-style method to enable file watching.
645 pub fn with_watch(mut self) -> Self {
646 self.watch_changes = true;
647 self
648 }
649}