Skip to main content

rustant_core/
indexer.rs

1//! Project Context Auto-Indexer
2//!
3//! Background workspace indexer that walks the project directory, respects
4//! `.gitignore`, extracts file paths, function signatures, and content summaries,
5//! then indexes them into the `HybridSearchEngine` for semantic codebase search.
6
7use crate::project_detect::{ProjectInfo, detect_project};
8use crate::search::{HybridSearchEngine, SearchConfig, SearchResult};
9use ignore::WalkBuilder;
10use std::path::{Path, PathBuf};
11use tracing::{debug, info};
12
/// Maximum file size to index (256 KB).
///
/// Default value of `IndexerConfig::max_file_size`; files whose metadata
/// reports a larger size are skipped during the workspace walk.
const MAX_FILE_SIZE: u64 = 256 * 1024;

/// Maximum number of files to index.
///
/// Default value of `IndexerConfig::max_files`; the workspace walk stops once
/// this many files have been processed.
const MAX_FILES: usize = 5000;

/// File extensions considered indexable source code.
///
/// Entries are lowercase and written without the leading dot. `is_indexable`
/// lowercases a file's extension before looking it up here, so the match is
/// case-insensitive. Well-known extensionless file names (`Makefile`,
/// `Dockerfile`, ...) are handled separately in `is_indexable`; the
/// `dockerfile` / `makefile` entries below only apply when they appear as an
/// extension (for example `api.dockerfile`).
const SOURCE_EXTENSIONS: &[&str] = &[
    "rs",
    "py",
    "js",
    "ts",
    "jsx",
    "tsx",
    "go",
    "java",
    "rb",
    "c",
    "cpp",
    "cc",
    "h",
    "hpp",
    "cs",
    "swift",
    "kt",
    "scala",
    "lua",
    "sh",
    "bash",
    "zsh",
    "toml",
    "yaml",
    "yml",
    "json",
    "xml",
    "html",
    "css",
    "scss",
    "sql",
    "md",
    "txt",
    "cfg",
    "ini",
    "env",
    "dockerfile",
    "makefile",
];
60
/// Result of indexing a workspace.
///
/// Returned by `ProjectIndexer::index_workspace` to summarise a single
/// indexing pass.
#[derive(Debug, Clone)]
pub struct IndexStats {
    /// Number of files indexed.
    pub files_indexed: usize,
    /// Number of entries (facts) written to the search engine.
    ///
    /// Covers the project structure summary plus the per-file path,
    /// content-summary and signature entries.
    pub entries_indexed: usize,
    /// Number of files skipped (too large, binary, etc.).
    ///
    /// A file is counted here when it exceeds the configured size limit or
    /// when `is_indexable` rejects its name/extension.
    pub files_skipped: usize,
    /// Detected project info.
    pub project_info: Option<ProjectInfo>,
}
73
/// The project context indexer.
///
/// Owns the search engine it writes to, together with the workspace root and
/// the limits that control an indexing pass.
pub struct ProjectIndexer {
    // Root directory that is walked; indexed paths are stored relative to it.
    workspace: PathBuf,
    // Search engine that receives every indexed fact and answers queries.
    engine: HybridSearchEngine,
    // Size/count limits and content/signature toggles for the walk.
    config: IndexerConfig,
}
80
/// Configuration for the indexer.
///
/// `IndexerConfig::default()` uses `MAX_FILE_SIZE` and `MAX_FILES` as limits
/// and enables both content and signature indexing.
#[derive(Debug, Clone)]
pub struct IndexerConfig {
    /// Maximum file size in bytes to index.
    pub max_file_size: u64,
    /// Maximum number of files to index.
    pub max_files: usize,
    /// Whether to index file content (not just paths).
    pub index_content: bool,
    /// Whether to extract and index function signatures.
    ///
    /// Signatures are extracted from the file content, so this flag only has
    /// an effect when `index_content` is also enabled.
    pub index_signatures: bool,
}
93
94impl Default for IndexerConfig {
95    fn default() -> Self {
96        Self {
97            max_file_size: MAX_FILE_SIZE,
98            max_files: MAX_FILES,
99            index_content: true,
100            index_signatures: true,
101        }
102    }
103}
104
105impl ProjectIndexer {
106    /// Create a new indexer for the given workspace.
107    pub fn new(
108        workspace: PathBuf,
109        search_config: SearchConfig,
110    ) -> Result<Self, crate::search::SearchError> {
111        let engine = HybridSearchEngine::open(search_config)?;
112        Ok(Self {
113            workspace,
114            engine,
115            config: IndexerConfig::default(),
116        })
117    }
118
119    /// Create a new indexer with custom configuration.
120    pub fn with_config(
121        workspace: PathBuf,
122        search_config: SearchConfig,
123        config: IndexerConfig,
124    ) -> Result<Self, crate::search::SearchError> {
125        let engine = HybridSearchEngine::open(search_config)?;
126        Ok(Self {
127            workspace,
128            engine,
129            config,
130        })
131    }
132
133    /// Run the full indexing pass over the workspace.
134    /// Returns statistics about what was indexed.
135    pub fn index_workspace(&mut self) -> IndexStats {
136        let project_info = detect_project(&self.workspace);
137        info!(
138            "Indexing workspace: {:?} (type: {:?})",
139            self.workspace, project_info.project_type
140        );
141
142        // Index the project structure summary first
143        let structure = self.build_structure_summary(&project_info);
144        let _ = self.engine.index_fact("__project_structure__", &structure);
145
146        let mut files_indexed = 0;
147        let mut entries_indexed = 1; // structure summary counts as 1
148        let mut files_skipped = 0;
149
150        // Walk the workspace respecting .gitignore
151        let walker = WalkBuilder::new(&self.workspace)
152            .hidden(true) // respect hidden files
153            .git_ignore(true) // respect .gitignore
154            .git_global(true) // respect global gitignore
155            .git_exclude(true) // respect .git/info/exclude
156            .max_depth(Some(10))
157            .build();
158
159        for entry in walker.flatten() {
160            if files_indexed >= self.config.max_files {
161                debug!("Reached max files limit ({})", self.config.max_files);
162                break;
163            }
164
165            let path = entry.path();
166
167            // Skip directories and non-files
168            if !path.is_file() {
169                continue;
170            }
171
172            // Skip files that are too large
173            if let Ok(meta) = path.metadata()
174                && meta.len() > self.config.max_file_size
175            {
176                files_skipped += 1;
177                continue;
178            }
179
180            // Check file extension
181            if !is_indexable(path) {
182                files_skipped += 1;
183                continue;
184            }
185
186            // Get relative path
187            let rel_path = path
188                .strip_prefix(&self.workspace)
189                .unwrap_or(path)
190                .to_string_lossy()
191                .to_string();
192
193            // Index the file path as an entry
194            let path_entry = format!("file: {}", rel_path);
195            let fact_id = format!("file:{}", rel_path);
196            if self.engine.index_fact(&fact_id, &path_entry).is_ok() {
197                entries_indexed += 1;
198            }
199
200            // Optionally index file content
201            if self.config.index_content
202                && let Ok(content) = std::fs::read_to_string(path)
203            {
204                // Index a content summary (first N lines + function signatures)
205                let summary = self.summarize_file(&rel_path, &content);
206                if !summary.is_empty() {
207                    let content_id = format!("content:{}", rel_path);
208                    if self.engine.index_fact(&content_id, &summary).is_ok() {
209                        entries_indexed += 1;
210                    }
211                }
212
213                // Extract and index function signatures
214                if self.config.index_signatures {
215                    let signatures = extract_signatures(&content, &rel_path);
216                    for (i, sig) in signatures.iter().enumerate() {
217                        let sig_id = format!("sig:{}:{}", rel_path, i);
218                        if self.engine.index_fact(&sig_id, sig).is_ok() {
219                            entries_indexed += 1;
220                        }
221                    }
222                }
223            }
224
225            files_indexed += 1;
226        }
227
228        info!(
229            "Indexing complete: {} files indexed, {} entries, {} skipped",
230            files_indexed, entries_indexed, files_skipped
231        );
232
233        IndexStats {
234            files_indexed,
235            entries_indexed,
236            files_skipped,
237            project_info: Some(project_info),
238        }
239    }
240
241    /// Search the indexed codebase.
242    pub fn search(&self, query: &str) -> Result<Vec<SearchResult>, crate::search::SearchError> {
243        self.engine.search(query)
244    }
245
246    /// Get the number of indexed entries.
247    pub fn indexed_count(&self) -> usize {
248        self.engine.indexed_count()
249    }
250
251    /// Get a reference to the underlying search engine.
252    pub fn engine(&self) -> &HybridSearchEngine {
253        &self.engine
254    }
255
256    /// Build a project structure summary for the system prompt.
257    pub fn build_structure_summary(&self, info: &ProjectInfo) -> String {
258        let mut summary = String::new();
259
260        summary.push_str(&format!("Project type: {:?}\n", info.project_type));
261
262        if let Some(ref framework) = info.framework {
263            summary.push_str(&format!("Framework: {}\n", framework));
264        }
265        if let Some(ref pm) = info.package_manager {
266            summary.push_str(&format!("Package manager: {}\n", pm));
267        }
268
269        if !info.source_dirs.is_empty() {
270            summary.push_str(&format!(
271                "Source directories: {}\n",
272                info.source_dirs.join(", ")
273            ));
274        }
275
276        // Add directory tree (top-level)
277        summary.push_str("\nTop-level structure:\n");
278        if let Ok(entries) = std::fs::read_dir(&self.workspace) {
279            let mut dirs: Vec<String> = Vec::new();
280            let mut files: Vec<String> = Vec::new();
281
282            for entry in entries.flatten() {
283                let name = entry.file_name().to_string_lossy().to_string();
284                if name.starts_with('.') {
285                    continue;
286                }
287                if entry.path().is_dir() {
288                    dirs.push(format!("  {}/", name));
289                } else {
290                    files.push(format!("  {}", name));
291                }
292            }
293
294            dirs.sort();
295            files.sort();
296
297            for d in &dirs {
298                summary.push_str(d);
299                summary.push('\n');
300            }
301            for f in &files {
302                summary.push_str(f);
303                summary.push('\n');
304            }
305        }
306
307        summary
308    }
309
310    /// Summarize a file's content for indexing.
311    fn summarize_file(&self, path: &str, content: &str) -> String {
312        let lines: Vec<&str> = content.lines().collect();
313        let total_lines = lines.len();
314
315        // Take first few lines (imports, module declaration)
316        let head: Vec<&str> = lines.iter().take(20).copied().collect();
317
318        // Build summary
319        let mut summary = format!("{} ({} lines)\n{}", path, total_lines, head.join("\n"));
320
321        // If file is longer, add a note
322        if total_lines > 20 {
323            summary.push_str(&format!("\n... ({} more lines)", total_lines - 20));
324        }
325
326        summary
327    }
328}
329
330/// Check if a file is indexable based on its extension.
331fn is_indexable(path: &Path) -> bool {
332    // Handle files without extension (Makefile, Dockerfile, etc.)
333    let name = path
334        .file_name()
335        .map(|n| n.to_string_lossy().to_lowercase())
336        .unwrap_or_default();
337
338    if ["makefile", "dockerfile", "rakefile", "gemfile", "procfile"].contains(&name.as_str()) {
339        return true;
340    }
341
342    // Check extension
343    path.extension()
344        .and_then(|ext| ext.to_str())
345        .map(|ext| SOURCE_EXTENSIONS.contains(&ext.to_lowercase().as_str()))
346        .unwrap_or(false)
347}
348
349/// Extract function/method/class signatures from source code.
350fn extract_signatures(content: &str, path: &str) -> Vec<String> {
351    let mut signatures = Vec::new();
352    let ext = Path::new(path)
353        .extension()
354        .and_then(|e| e.to_str())
355        .unwrap_or("");
356
357    for (i, line) in content.lines().enumerate() {
358        let trimmed = line.trim();
359        let sig = match ext {
360            "rs" => extract_rust_signature(trimmed),
361            "py" => extract_python_signature(trimmed),
362            "js" | "jsx" | "ts" | "tsx" => extract_js_signature(trimmed),
363            "go" => extract_go_signature(trimmed),
364            "java" | "kt" | "scala" => extract_java_signature(trimmed),
365            "rb" => extract_ruby_signature(trimmed),
366            "c" | "cpp" | "cc" | "h" | "hpp" => extract_c_signature(trimmed),
367            _ => None,
368        };
369
370        if let Some(sig_text) = sig {
371            signatures.push(format!("{}:{} {}", path, i + 1, sig_text));
372        }
373    }
374
375    signatures
376}
377
/// Recognise a Rust item declaration on a single, already-trimmed line.
///
/// Matches `fn`, `struct`, `enum`, `trait`, `impl` (including generic
/// `impl<...>`) and `mod` items. An optional visibility (`pub`, `pub(crate)`,
/// `pub(super)`, `pub(in path)`) and any run of `async` / `const` / `unsafe`
/// qualifiers may precede the item keyword.
///
/// Returns the line with a trailing `{` and surrounding whitespace removed,
/// or `None` when the line does not start an item.
fn extract_rust_signature(line: &str) -> Option<String> {
    const QUALIFIERS: [&str; 3] = ["async ", "const ", "unsafe "];
    const ITEM_STARTS: [&str; 7] = [
        "fn ", "struct ", "enum ", "trait ", "impl ", "impl<", "mod ",
    ];

    // Peel off the visibility, if any.
    let mut rest = line;
    if let Some(after_pub) = line.strip_prefix("pub") {
        rest = if after_pub.starts_with('(') {
            // Restricted visibility such as `pub(crate)`.
            after_pub.split_once(')')?.1.trim_start()
        } else if after_pub.starts_with(char::is_whitespace) {
            after_pub.trim_start()
        } else {
            // An identifier that merely begins with "pub" (e.g. `publish()`).
            return None;
        };
    }

    // Peel off qualifiers; `const FOO: ...` falls through to the keyword
    // check below and is rejected there.
    loop {
        let next = QUALIFIERS.iter().find_map(|q| rest.strip_prefix(*q));
        let Some(after) = next else { break };
        rest = after.trim_start();
    }

    if ITEM_STARTS.iter().any(|kw| rest.starts_with(*kw)) {
        Some(line.trim_end_matches('{').trim().to_string())
    } else {
        None
    }
}
398
/// Recognise a Python `def`, `async def` or `class` header on a single,
/// already-trimmed line.
///
/// Returns the line without trailing colons and surrounding whitespace, or
/// `None` when the line is not a definition header.
fn extract_python_signature(line: &str) -> Option<String> {
    const HEADERS: [&str; 3] = ["def ", "async def ", "class "];

    let is_header = HEADERS.iter().any(|kw| line.starts_with(*kw));
    if !is_header {
        return None;
    }

    let without_colon = line.trim_end_matches(':');
    Some(without_colon.trim().to_string())
}
406
/// Recognise a JavaScript/TypeScript declaration on a single,
/// already-trimmed line.
///
/// A line qualifies when it starts with a function or class declaration
/// (optionally `async` / `export` / `export default`), or when it contains
/// the opening of a block-bodied arrow function (`=> {`) anywhere.
///
/// Returns the line without a trailing `{` and surrounding whitespace, or
/// `None` when nothing matches.
fn extract_js_signature(line: &str) -> Option<String> {
    const DECLARATION_PREFIXES: [&str; 7] = [
        "function ",
        "async function ",
        "export function ",
        "export async function ",
        "export default function ",
        "class ",
        "export class ",
    ];

    let declares = DECLARATION_PREFIXES
        .iter()
        .any(|prefix| line.starts_with(*prefix));
    let has_arrow_body = line.contains("=> {");

    if !(declares || has_arrow_body) {
        return None;
    }
    Some(line.trim_end_matches('{').trim().to_string())
}
422
/// Recognise a Go `func` or `type` declaration on a single, already-trimmed
/// line.
///
/// Returns the line without a trailing `{` and surrounding whitespace, or
/// `None` when the line starts with neither keyword.
fn extract_go_signature(line: &str) -> Option<String> {
    let is_declaration = ["func ", "type "]
        .iter()
        .any(|kw| line.starts_with(*kw));

    match is_declaration {
        true => Some(line.trim_end_matches('{').trim().to_string()),
        false => None,
    }
}
430
/// Recognise a Java-style declaration on a single, already-trimmed line
/// (also used for Kotlin and Scala sources).
///
/// A line qualifies when it starts with `class ` / `interface `, or when it
/// starts with an access/modifier keyword and additionally contains a `(`,
/// `class ` or `interface `.
///
/// Returns the line without a trailing `{` and surrounding whitespace, or
/// `None` when nothing matches.
fn extract_java_signature(line: &str) -> Option<String> {
    const MODIFIERS: [&str; 6] = [
        "public ",
        "private ",
        "protected ",
        "static ",
        "abstract ",
        "final ",
    ];

    let bare_type = line.starts_with("class ") || line.starts_with("interface ");
    let has_modifier = MODIFIERS.iter().any(|m| line.starts_with(*m));
    let mentions_member =
        line.contains('(') || line.contains("class ") || line.contains("interface ");

    if bare_type || (has_modifier && mentions_member) {
        let signature = line.trim_end_matches('{').trim();
        return Some(signature.to_string());
    }
    None
}
449
/// Recognise a Ruby `def`, `class` or `module` header on a single,
/// already-trimmed line.
///
/// Returns the whitespace-trimmed line, or `None` when the line starts with
/// none of those keywords.
fn extract_ruby_signature(line: &str) -> Option<String> {
    let opens_definition = ["def ", "class ", "module "]
        .iter()
        .any(|kw| line.starts_with(*kw));

    if !opens_definition {
        return None;
    }
    Some(line.trim().to_string())
}
457
/// Recognise a C/C++ declaration on a single, already-trimmed line.
///
/// This is a deliberately simple heuristic: a line qualifies when it contains
/// a `(` or starts with `struct `, `class ` or `typedef `, unless it is
///
/// * a preprocessor directive or comment (`#`, `//`, `/*`, or a `*`
///   continuation line of a block comment),
/// * a control-flow or jump statement (`if`, `for`, `while`, `return`, ...),
///   optionally preceded by a closing `}`, or
/// * an assignment (contains `=` but neither `==` nor `!=`).
///
/// Returns the line without a trailing `{` and surrounding whitespace, or
/// `None` when the line is rejected.
fn extract_c_signature(line: &str) -> Option<String> {
    // Statements that contain parentheses but never declare anything.
    const STATEMENT_KEYWORDS: [&str; 11] = [
        "if", "else", "for", "while", "do", "switch", "case", "return", "goto", "throw", "catch",
    ];

    // Skip preprocessor and comments
    if line.starts_with('#')
        || line.starts_with("//")
        || line.starts_with("/*")
        || line.starts_with('*')
    {
        return None;
    }

    // Skip control flow and returns. Compare the whole first word so that
    // identifiers such as `returns_value` are not mistaken for keywords.
    let head = line.trim_start_matches(|c: char| c == '}' || c.is_whitespace());
    let first_word = head
        .split(|c: char| !(c.is_alphanumeric() || c == '_'))
        .next()
        .unwrap_or("");
    if STATEMENT_KEYWORDS.contains(&first_word) {
        return None;
    }

    let looks_like_declaration = line.contains('(')
        || line.starts_with("struct ")
        || line.starts_with("class ")
        || line.starts_with("typedef ");
    if !looks_like_declaration {
        return None;
    }

    // Skip simple assignments
    if line.contains('=') && !line.contains("==") && !line.contains("!=") {
        return None;
    }

    Some(line.trim_end_matches('{').trim().to_string())
}
478
#[cfg(test)]
mod tests {
    use super::*;
    use std::fs;
    use tempfile::TempDir;

    /// Create a temporary workspace containing a small Rust project, a
    /// `.gitignore`, an ignored `target/` file and a binary file.
    ///
    /// The `TempDir` guard is returned so the directory outlives the test.
    fn setup_test_workspace() -> (TempDir, PathBuf) {
        let dir = TempDir::new().unwrap();
        let root = dir.path().to_path_buf();

        fs::create_dir_all(root.join("src")).unwrap();
        fs::create_dir_all(root.join("target")).unwrap();

        // Text fixtures: sources, manifest, readme, ignore rules and a file
        // under target/ that the ignore rules are expected to hide.
        let text_files = [
            (
                "src/main.rs",
                "fn main() {\n    println!(\"hello\");\n}\n\npub fn helper() -> bool {\n    true\n}\n",
            ),
            (
                "src/lib.rs",
                "pub mod utils;\n\npub struct Config {\n    pub name: String,\n}\n\nimpl Config {\n    pub fn new() -> Self {\n        Self { name: String::new() }\n    }\n}\n",
            ),
            (
                "Cargo.toml",
                "[package]\nname = \"test\"\nversion = \"0.1.0\"\n",
            ),
            ("README.md", "# Test Project\n\nA test project.\n"),
            (".gitignore", "target/\n*.tmp\n"),
            ("target/debug.rs", "should be ignored"),
        ];
        for (rel, body) in text_files {
            fs::write(root.join(rel), body).unwrap();
        }

        // A binary file (should be skipped)
        fs::write(root.join("image.png"), [0x89, 0x50, 0x4E, 0x47]).unwrap();

        (dir, root)
    }

    /// Search configuration whose index and database live under
    /// `.rustant/` inside the given workspace.
    fn search_config_in(workspace: &Path) -> SearchConfig {
        SearchConfig {
            index_path: workspace.join(".rustant/search_index"),
            db_path: workspace.join(".rustant/vectors.db"),
            ..Default::default()
        }
    }

    #[test]
    fn test_is_indexable() {
        let accepted = [
            "src/main.rs",
            "app.py",
            "index.js",
            "Makefile",
            "Dockerfile",
        ];
        for name in accepted {
            assert!(is_indexable(Path::new(name)));
        }

        let rejected = ["image.png", "archive.zip", "binary.exe"];
        for name in rejected {
            assert!(!is_indexable(Path::new(name)));
        }
    }

    #[test]
    fn test_extract_rust_signatures() {
        let content = "use std::io;\n\npub fn process(data: &[u8]) -> Result<(), Error> {\n    Ok(())\n}\n\nstruct Config {\n    name: String,\n}\n\nimpl Config {\n    fn new() -> Self { todo!() }\n}\n";
        let sigs = extract_signatures(content, "lib.rs");
        let found = |needle: &str| sigs.iter().any(|s| s.contains(needle));

        assert!(found("pub fn process"));
        assert!(found("struct Config"));
        assert!(found("impl Config"));
        assert!(found("fn new"));
    }

    #[test]
    fn test_extract_python_signatures() {
        let content = "import os\n\nclass Handler:\n    def process(self, data):\n        pass\n\nasync def fetch(url):\n    pass\n";
        let sigs = extract_signatures(content, "handler.py");
        let found = |needle: &str| sigs.iter().any(|s| s.contains(needle));

        assert!(found("class Handler"));
        assert!(found("def process"));
        assert!(found("async def fetch"));
    }

    #[test]
    fn test_extract_js_signatures() {
        let content = "const x = 1;\n\nfunction handleRequest(req) {\n    return null;\n}\n\nexport class Server {\n}\n";
        let sigs = extract_signatures(content, "server.js");
        let found = |needle: &str| sigs.iter().any(|s| s.contains(needle));

        assert!(found("function handleRequest"));
        assert!(found("export class Server"));
    }

    #[test]
    fn test_index_workspace() {
        let (_dir, path) = setup_test_workspace();
        let search_config = search_config_in(&path);

        let mut indexer = ProjectIndexer::new(path, search_config).unwrap();
        let stats = indexer.index_workspace();

        // Should have indexed some files
        assert!(stats.files_indexed > 0, "Should index at least one file");
        assert!(
            stats.entries_indexed > 0,
            "Should create at least one entry"
        );

        // Project info should be detected
        assert!(stats.project_info.is_some());
    }

    #[test]
    fn test_search_indexed_workspace() {
        let (_dir, path) = setup_test_workspace();
        let search_config = search_config_in(&path);

        let mut indexer = ProjectIndexer::new(path, search_config).unwrap();
        indexer.index_workspace();

        // Search for something we know is in the workspace
        let results = indexer.search("main function").unwrap();
        assert!(
            !results.is_empty(),
            "Should find results for 'main function'"
        );

        // At least one result should reference main.rs
        let has_main = results.iter().any(|hit| hit.content.contains("main"));
        assert!(has_main, "Should find main.rs related content");
    }

    #[test]
    fn test_indexer_config() {
        let defaults = IndexerConfig::default();

        assert_eq!(defaults.max_file_size, MAX_FILE_SIZE);
        assert_eq!(defaults.max_files, MAX_FILES);
        assert!(defaults.index_content);
        assert!(defaults.index_signatures);
    }

    #[test]
    fn test_indexer_with_custom_config() {
        let (_dir, path) = setup_test_workspace();
        let search_config = search_config_in(&path);

        // Paths only, capped at two files.
        let custom = IndexerConfig {
            max_files: 2,
            index_content: false,
            index_signatures: false,
            ..Default::default()
        };

        let mut indexer = ProjectIndexer::with_config(path, search_config, custom).unwrap();
        let stats = indexer.index_workspace();

        // Should respect max_files limit
        assert!(stats.files_indexed <= 2);
    }

    #[test]
    fn test_build_structure_summary() {
        let (_dir, path) = setup_test_workspace();
        let search_config = search_config_in(&path);

        let indexer = ProjectIndexer::new(path.clone(), search_config).unwrap();
        let info = detect_project(&path);
        let summary = indexer.build_structure_summary(&info);

        for expected in ["Project type:", "Top-level structure:"] {
            assert!(summary.contains(expected));
        }
    }

    #[test]
    fn test_ignored_files_not_indexed() {
        let (_dir, path) = setup_test_workspace();

        // Initialize a git repo so .gitignore is respected by the `ignore` crate
        std::process::Command::new("git")
            .args(["init"])
            .current_dir(&path)
            .output()
            .expect("git init");

        let search_config = search_config_in(&path);

        let mut indexer = ProjectIndexer::new(path, search_config).unwrap();
        indexer.index_workspace();

        // Search for content that should have been ignored
        let results = indexer.search("should be ignored").unwrap();
        let has_target = results
            .iter()
            .any(|hit| hit.content.contains("target/debug.rs"));
        assert!(
            !has_target,
            "Files in target/ should be ignored by .gitignore"
        );
    }
}