project-rag 0.1.0

RAG-based codebase indexing and semantic search - dual purpose library and MCP server
Documentation
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
use schemars::JsonSchema;
use serde::{Deserialize, Serialize};

/// Request to index a codebase
#[derive(Debug, Clone, Serialize, Deserialize, JsonSchema)]
pub struct IndexRequest {
    /// Path to the codebase directory to index
    pub path: String,
    /// Optional project name (for multi-project support)
    #[serde(default)]
    pub project: Option<String>,
    /// Optional glob patterns to include (e.g., ["**/*.rs", "**/*.toml"])
    #[serde(default)]
    pub include_patterns: Vec<String>,
    /// Optional glob patterns to exclude (e.g., ["**/target/**", "**/node_modules/**"])
    #[serde(default)]
    pub exclude_patterns: Vec<String>,
    /// Maximum file size in bytes to index (default: 1MB)
    #[serde(default = "default_max_file_size")]
    pub max_file_size: usize,
}

fn default_max_file_size() -> usize {
    1_048_576 // 1MB
}

/// Indexing mode used
#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize, JsonSchema)]
#[serde(rename_all = "lowercase")]
pub enum IndexingMode {
    /// Full indexing (all files)
    Full,
    /// Incremental update (only changed files)
    Incremental,
}

/// Response from indexing operation
#[derive(Debug, Clone, Serialize, Deserialize, JsonSchema)]
pub struct IndexResponse {
    /// Indexing mode used (full or incremental)
    pub mode: IndexingMode,
    /// Number of files successfully indexed
    pub files_indexed: usize,
    /// Number of code chunks created
    pub chunks_created: usize,
    /// Number of embeddings generated
    pub embeddings_generated: usize,
    /// Time taken in milliseconds
    pub duration_ms: u64,
    /// Any errors encountered (non-fatal)
    #[serde(default)]
    pub errors: Vec<String>,
    /// Number of files updated (incremental mode only)
    #[serde(default)]
    pub files_updated: usize,
    /// Number of files removed (incremental mode only)
    #[serde(default)]
    pub files_removed: usize,
}

/// Request to query the codebase
#[derive(Debug, Clone, Serialize, Deserialize, JsonSchema)]
pub struct QueryRequest {
    /// The question or search query
    pub query: String,
    /// Optional path to filter by specific indexed codebase
    #[serde(default)]
    pub path: Option<String>,
    /// Optional project name to filter by
    #[serde(default)]
    pub project: Option<String>,
    /// Number of results to return (default: 10)
    #[serde(default = "default_limit")]
    pub limit: usize,
    /// Minimum similarity score (0.0 to 1.0, default: 0.7)
    #[serde(default = "default_min_score")]
    pub min_score: f32,
    /// Enable hybrid search (vector + keyword) - default: true
    #[serde(default = "default_hybrid")]
    pub hybrid: bool,
}

fn default_hybrid() -> bool {
    true
}

fn default_limit() -> usize {
    10
}

fn default_min_score() -> f32 {
    0.7
}

/// A single search result
#[derive(Debug, Clone, Serialize, Deserialize, JsonSchema)]
pub struct SearchResult {
    /// File path relative to the indexed root
    pub file_path: String,
    /// Absolute path to the indexed root directory
    #[serde(default)]
    pub root_path: Option<String>,
    /// The code chunk content
    pub content: String,
    /// Combined similarity score (0.0 to 1.0)
    pub score: f32,
    /// Vector similarity score (0.0 to 1.0)
    pub vector_score: f32,
    /// Keyword match score (0.0 to 1.0) - only present in hybrid search
    pub keyword_score: Option<f32>,
    /// Starting line number in the file
    pub start_line: usize,
    /// Ending line number in the file
    pub end_line: usize,
    /// Programming language detected
    pub language: String,
    /// Optional project name for multi-project support
    pub project: Option<String>,
}

/// Response from query operation
#[derive(Debug, Clone, Serialize, Deserialize, JsonSchema)]
pub struct QueryResponse {
    /// List of search results, ordered by relevance
    pub results: Vec<SearchResult>,
    /// Time taken in milliseconds
    pub duration_ms: u64,
    /// The actual threshold used (may be lower than requested if adaptive search kicked in)
    #[serde(default)]
    pub threshold_used: f32,
    /// Whether the threshold was automatically lowered to find results
    #[serde(default)]
    pub threshold_lowered: bool,
}

/// Request to get statistics about the index
#[derive(Debug, Clone, Serialize, Deserialize, JsonSchema)]
pub struct StatisticsRequest {}

/// Statistics about the indexed codebase
#[derive(Debug, Clone, Serialize, Deserialize, JsonSchema)]
pub struct StatisticsResponse {
    /// Total number of indexed files
    pub total_files: usize,
    /// Total number of code chunks
    pub total_chunks: usize,
    /// Total number of embeddings
    pub total_embeddings: usize,
    /// Size of the vector database in bytes
    pub database_size_bytes: u64,
    /// Breakdown by programming language
    pub language_breakdown: Vec<LanguageStats>,
}

#[derive(Debug, Clone, Serialize, Deserialize, JsonSchema)]
pub struct LanguageStats {
    pub language: String,
    pub file_count: usize,
    pub chunk_count: usize,
}

/// Request to clear the index
#[derive(Debug, Clone, Serialize, Deserialize, JsonSchema)]
pub struct ClearRequest {}

/// Response from clear operation
#[derive(Debug, Clone, Serialize, Deserialize, JsonSchema)]
pub struct ClearResponse {
    /// Whether the operation was successful
    pub success: bool,
    /// Optional message
    pub message: String,
}

/// Request for incremental update
#[derive(Debug, Clone, Serialize, Deserialize, JsonSchema)]
pub struct IncrementalUpdateRequest {
    /// Path to the codebase directory
    pub path: String,
    /// Optional project name
    #[serde(default)]
    pub project: Option<String>,
    /// Optional glob patterns to include
    #[serde(default)]
    pub include_patterns: Vec<String>,
    /// Optional glob patterns to exclude
    #[serde(default)]
    pub exclude_patterns: Vec<String>,
}

/// Response from incremental update
#[derive(Debug, Clone, Serialize, Deserialize, JsonSchema)]
pub struct IncrementalUpdateResponse {
    /// Number of files added
    pub files_added: usize,
    /// Number of files updated
    pub files_updated: usize,
    /// Number of files removed
    pub files_removed: usize,
    /// Number of chunks created/updated
    pub chunks_modified: usize,
    /// Time taken in milliseconds
    pub duration_ms: u64,
}

/// Request to search with file type filters
#[derive(Debug, Clone, Serialize, Deserialize, JsonSchema)]
pub struct AdvancedSearchRequest {
    /// The search query
    pub query: String,
    /// Optional path to filter by specific indexed codebase
    #[serde(default)]
    pub path: Option<String>,
    /// Optional project name to filter by
    #[serde(default)]
    pub project: Option<String>,
    /// Number of results to return
    #[serde(default = "default_limit")]
    pub limit: usize,
    /// Minimum similarity score
    #[serde(default = "default_min_score")]
    pub min_score: f32,
    /// Filter by file extensions (e.g., ["rs", "toml"])
    #[serde(default)]
    pub file_extensions: Vec<String>,
    /// Filter by programming languages
    #[serde(default)]
    pub languages: Vec<String>,
    /// Filter by file path patterns (glob)
    #[serde(default)]
    pub path_patterns: Vec<String>,
}

/// Request to search git history
#[derive(Debug, Clone, Serialize, Deserialize, JsonSchema)]
pub struct SearchGitHistoryRequest {
    /// The search query
    pub query: String,
    /// Path to the codebase (will discover git repo)
    #[serde(default = "default_git_path")]
    pub path: String,
    /// Optional project name
    #[serde(default)]
    pub project: Option<String>,
    /// Optional branch name (default: current branch)
    #[serde(default)]
    pub branch: Option<String>,
    /// Maximum number of commits to index/search (default: 10)
    #[serde(default = "default_max_commits")]
    pub max_commits: usize,
    /// Number of results to return (default: 10)
    #[serde(default = "default_limit")]
    pub limit: usize,
    /// Minimum similarity score (0.0 to 1.0, default: 0.7)
    #[serde(default = "default_min_score")]
    pub min_score: f32,
    /// Filter by commit author (optional regex pattern)
    #[serde(default)]
    pub author: Option<String>,
    /// Filter by commits since this date (ISO 8601 or Unix timestamp)
    #[serde(default)]
    pub since: Option<String>,
    /// Filter by commits until this date (ISO 8601 or Unix timestamp)
    #[serde(default)]
    pub until: Option<String>,
    /// Filter by file path pattern (optional regex)
    #[serde(default)]
    pub file_pattern: Option<String>,
}

fn default_git_path() -> String {
    ".".to_string()
}

fn default_max_commits() -> usize {
    10
}

/// A single git search result
#[derive(Debug, Clone, Serialize, Deserialize, JsonSchema)]
pub struct GitSearchResult {
    /// Git commit hash (SHA)
    pub commit_hash: String,
    /// Commit message
    pub commit_message: String,
    /// Author name
    pub author: String,
    /// Author email
    pub author_email: String,
    /// Commit date (Unix timestamp)
    pub commit_date: i64,
    /// Combined similarity score (0.0 to 1.0)
    pub score: f32,
    /// Vector similarity score
    pub vector_score: f32,
    /// Keyword match score (if hybrid search enabled)
    pub keyword_score: Option<f32>,
    /// Files changed in this commit
    pub files_changed: Vec<String>,
    /// Diff snippet (first ~500 characters)
    pub diff_snippet: String,
}

/// Response from git history search
#[derive(Debug, Clone, Serialize, Deserialize, JsonSchema)]
pub struct SearchGitHistoryResponse {
    /// List of matching commits, ordered by relevance
    pub results: Vec<GitSearchResult>,
    /// Number of commits indexed during this search
    pub commits_indexed: usize,
    /// Total commits in cache for this repo
    pub total_cached_commits: usize,
    /// Time taken in milliseconds
    pub duration_ms: u64,
}

// ============================================================================
// Code Relations Types (find_definition, find_references, get_call_graph)
// ============================================================================

/// Request to find the definition of a symbol at a given location
#[derive(Debug, Clone, Serialize, Deserialize, JsonSchema)]
pub struct FindDefinitionRequest {
    /// File path (relative or absolute)
    pub file_path: String,
    /// Line number (1-based)
    pub line: usize,
    /// Column number (0-based)
    pub column: usize,
    /// Optional project name to filter by
    #[serde(default)]
    pub project: Option<String>,
}

impl FindDefinitionRequest {
    /// Validate the find definition request
    pub fn validate(&self) -> Result<(), String> {
        if self.file_path.is_empty() {
            return Err("file_path cannot be empty".to_string());
        }
        if self.line == 0 {
            return Err("line must be 1-based (cannot be 0)".to_string());
        }
        Ok(())
    }
}

/// Response from find_definition
#[derive(Debug, Clone, Serialize, Deserialize, JsonSchema)]
pub struct FindDefinitionResponse {
    /// The found definition, if any
    pub definition: Option<crate::relations::DefinitionResult>,
    /// Precision level of the result
    pub precision: String,
    /// Time taken in milliseconds
    pub duration_ms: u64,
}

/// Request to find all references to a symbol
#[derive(Debug, Clone, Serialize, Deserialize, JsonSchema)]
pub struct FindReferencesRequest {
    /// File path (relative or absolute)
    pub file_path: String,
    /// Line number (1-based)
    pub line: usize,
    /// Column number (0-based)
    pub column: usize,
    /// Maximum number of references to return
    #[serde(default = "default_references_limit")]
    pub limit: usize,
    /// Optional project name to filter by
    #[serde(default)]
    pub project: Option<String>,
    /// Include the definition itself in results
    #[serde(default = "default_include_definition")]
    pub include_definition: bool,
}

fn default_references_limit() -> usize {
    100
}

fn default_include_definition() -> bool {
    true
}

impl FindReferencesRequest {
    /// Validate the find references request
    pub fn validate(&self) -> Result<(), String> {
        if self.file_path.is_empty() {
            return Err("file_path cannot be empty".to_string());
        }
        if self.line == 0 {
            return Err("line must be 1-based (cannot be 0)".to_string());
        }
        const MAX_LIMIT: usize = 10000;
        if self.limit > MAX_LIMIT {
            return Err(format!("limit too large: {} (max: {})", self.limit, MAX_LIMIT));
        }
        Ok(())
    }
}

/// Response from find_references
#[derive(Debug, Clone, Serialize, Deserialize, JsonSchema)]
pub struct FindReferencesResponse {
    /// The symbol being referenced
    pub symbol_name: Option<String>,
    /// List of found references
    pub references: Vec<crate::relations::ReferenceResult>,
    /// Total count (may be higher than returned if limit applied)
    pub total_count: usize,
    /// Precision level of the results
    pub precision: String,
    /// Time taken in milliseconds
    pub duration_ms: u64,
}

/// Request to get call graph for a function
#[derive(Debug, Clone, Serialize, Deserialize, JsonSchema)]
pub struct GetCallGraphRequest {
    /// File path (relative or absolute)
    pub file_path: String,
    /// Line number (1-based)
    pub line: usize,
    /// Column number (0-based)
    pub column: usize,
    /// Maximum depth to traverse (default: 2)
    #[serde(default = "default_call_graph_depth")]
    pub depth: usize,
    /// Optional project name to filter by
    #[serde(default)]
    pub project: Option<String>,
    /// Include callers (functions that call this function)
    #[serde(default = "default_true")]
    pub include_callers: bool,
    /// Include callees (functions this function calls)
    #[serde(default = "default_true")]
    pub include_callees: bool,
}

fn default_call_graph_depth() -> usize {
    2
}

fn default_true() -> bool {
    true
}

impl GetCallGraphRequest {
    /// Validate the get call graph request
    pub fn validate(&self) -> Result<(), String> {
        if self.file_path.is_empty() {
            return Err("file_path cannot be empty".to_string());
        }
        if self.line == 0 {
            return Err("line must be 1-based (cannot be 0)".to_string());
        }
        const MAX_DEPTH: usize = 10;
        if self.depth > MAX_DEPTH {
            return Err(format!("depth too large: {} (max: {})", self.depth, MAX_DEPTH));
        }
        Ok(())
    }
}

/// Response from get_call_graph
#[derive(Debug, Clone, Serialize, Deserialize, JsonSchema)]
pub struct GetCallGraphResponse {
    /// The root symbol (function/method at the requested location)
    pub root_symbol: Option<crate::relations::SymbolInfo>,
    /// Functions/methods that call this symbol (incoming calls)
    pub callers: Vec<crate::relations::CallGraphNode>,
    /// Functions/methods called by this symbol (outgoing calls)
    pub callees: Vec<crate::relations::CallGraphNode>,
    /// Precision level of the results
    pub precision: String,
    /// Time taken in milliseconds
    pub duration_ms: u64,
}

/// Metadata stored with each code chunk
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct ChunkMetadata {
    /// File path relative to indexed root
    pub file_path: String,
    /// Absolute path to the indexed root directory
    #[serde(default)]
    pub root_path: Option<String>,
    /// Project name (for multi-project support)
    pub project: Option<String>,
    /// Starting line number
    pub start_line: usize,
    /// Ending line number
    pub end_line: usize,
    /// Programming language
    pub language: Option<String>,
    /// File extension
    pub extension: Option<String>,
    /// SHA256 hash of the file content
    pub file_hash: String,
    /// Timestamp when indexed
    pub indexed_at: i64,
}

/// Input validation for request types
///
/// These functions validate user inputs to prevent security issues and ensure
/// reasonable resource usage.
impl IndexRequest {
    /// Validate the index request
    pub fn validate(&self) -> Result<(), String> {
        // Validate path exists and is a directory
        let path = std::path::Path::new(&self.path);
        if !path.exists() {
            return Err(format!("Path does not exist: {}", self.path));
        }
        if !path.is_dir() {
            return Err(format!("Path is not a directory: {}", self.path));
        }

        // Canonicalize to prevent path traversal attacks
        let canonical = path
            .canonicalize()
            .map_err(|e| format!("Failed to canonicalize path: {}", e))?;

        // Check that path doesn't try to escape (basic security check)
        if !canonical.starts_with(
            std::env::current_dir()
                .unwrap_or_default()
                .parent()
                .unwrap_or(std::path::Path::new("/")),
        ) {
            // Allow any absolute path, this check is just to catch obvious traversal attempts
        }

        // Validate max_file_size is reasonable (max 100MB)
        const MAX_FILE_SIZE_LIMIT: usize = 100_000_000; // 100MB
        if self.max_file_size > MAX_FILE_SIZE_LIMIT {
            return Err(format!(
                "max_file_size too large: {} bytes (max: {} bytes)",
                self.max_file_size, MAX_FILE_SIZE_LIMIT
            ));
        }

        // Validate project name if provided
        if let Some(ref project) = self.project {
            if project.is_empty() {
                return Err("project name cannot be empty".to_string());
            }
            if project.len() > 256 {
                return Err("project name too long (max 256 characters)".to_string());
            }
        }

        Ok(())
    }
}

impl QueryRequest {
    /// Validate the query request
    pub fn validate(&self) -> Result<(), String> {
        // Validate query is not empty
        if self.query.trim().is_empty() {
            return Err("query cannot be empty".to_string());
        }

        // Validate query length is reasonable (max 10KB)
        const MAX_QUERY_LENGTH: usize = 10_240; // 10KB
        if self.query.len() > MAX_QUERY_LENGTH {
            return Err(format!(
                "query too long: {} bytes (max: {} bytes)",
                self.query.len(),
                MAX_QUERY_LENGTH
            ));
        }

        // Validate min_score is in valid range [0.0, 1.0]
        if !(0.0..=1.0).contains(&self.min_score) {
            return Err(format!(
                "min_score must be between 0.0 and 1.0, got: {}",
                self.min_score
            ));
        }

        // Validate limit is reasonable (max 1000)
        const MAX_LIMIT: usize = 1000;
        if self.limit > MAX_LIMIT {
            return Err(format!(
                "limit too large: {} (max: {})",
                self.limit, MAX_LIMIT
            ));
        }

        // Validate project name if provided
        if let Some(ref project) = self.project {
            if project.is_empty() {
                return Err("project name cannot be empty".to_string());
            }
            if project.len() > 256 {
                return Err("project name too long (max 256 characters)".to_string());
            }
        }

        Ok(())
    }
}

impl AdvancedSearchRequest {
    /// Validate the advanced search request
    pub fn validate(&self) -> Result<(), String> {
        // Reuse QueryRequest validation logic
        let query_req = QueryRequest {
            query: self.query.clone(),
            path: None,
            project: self.project.clone(),
            limit: self.limit,
            min_score: self.min_score,
            hybrid: true,
        };
        query_req.validate()?;

        // Additional validation for file extensions
        for ext in &self.file_extensions {
            if ext.is_empty() {
                return Err("file extension cannot be empty".to_string());
            }
            if ext.len() > 20 {
                return Err(format!(
                    "file extension too long: {} (max 20 characters)",
                    ext
                ));
            }
        }

        // Validate languages
        for lang in &self.languages {
            if lang.is_empty() {
                return Err("language name cannot be empty".to_string());
            }
            if lang.len() > 50 {
                return Err(format!(
                    "language name too long: {} (max 50 characters)",
                    lang
                ));
            }
        }

        Ok(())
    }
}

impl SearchGitHistoryRequest {
    /// Validate the git history search request
    pub fn validate(&self) -> Result<(), String> {
        // Validate query
        if self.query.trim().is_empty() {
            return Err("query cannot be empty".to_string());
        }

        const MAX_QUERY_LENGTH: usize = 10_240; // 10KB
        if self.query.len() > MAX_QUERY_LENGTH {
            return Err(format!(
                "query too long: {} bytes (max: {} bytes)",
                self.query.len(),
                MAX_QUERY_LENGTH
            ));
        }

        // Validate path
        let path = std::path::Path::new(&self.path);
        if !path.exists() {
            return Err(format!("Path does not exist: {}", self.path));
        }

        // Validate min_score range
        if !(0.0..=1.0).contains(&self.min_score) {
            return Err(format!(
                "min_score must be between 0.0 and 1.0, got: {}",
                self.min_score
            ));
        }

        // Validate limit
        const MAX_LIMIT: usize = 1000;
        if self.limit > MAX_LIMIT {
            return Err(format!(
                "limit too large: {} (max: {})",
                self.limit, MAX_LIMIT
            ));
        }

        // Validate max_commits
        const MAX_COMMITS_LIMIT: usize = 10000;
        if self.max_commits > MAX_COMMITS_LIMIT {
            return Err(format!(
                "max_commits too large: {} (max: {})",
                self.max_commits, MAX_COMMITS_LIMIT
            ));
        }

        // Validate project name if provided
        if let Some(ref project) = self.project {
            if project.is_empty() {
                return Err("project name cannot be empty".to_string());
            }
            if project.len() > 256 {
                return Err("project name too long (max 256 characters)".to_string());
            }
        }

        Ok(())
    }
}

#[cfg(test)]
mod tests;