1use crate::project_detect::{ProjectInfo, detect_project};
8use crate::search::{HybridSearchEngine, SearchConfig, SearchResult};
9use ignore::WalkBuilder;
10use std::path::{Path, PathBuf};
11use tracing::{debug, info};
12
/// Default upper bound, in bytes, on the size of a file that will be indexed (256 KiB).
///
/// Used as the default for `IndexerConfig::max_file_size`; larger files are skipped.
const MAX_FILE_SIZE: u64 = 256 * 1024;
15
/// Default cap on the number of files indexed in one workspace pass.
///
/// Used as the default for `IndexerConfig::max_files`.
const MAX_FILES: usize = 5000;
18
/// Lower-case file extensions (without the leading dot) that are considered indexable.
///
/// `is_indexable` lower-cases a file's extension before looking it up here, so
/// every entry must stay lower-case.
const SOURCE_EXTENSIONS: &[&str] = &[
    "rs",
    "py",
    "js",
    "ts",
    "jsx",
    "tsx",
    "go",
    "java",
    "rb",
    "c",
    "cpp",
    "cc",
    "h",
    "hpp",
    "cs",
    "swift",
    "kt",
    "scala",
    "lua",
    "sh",
    "bash",
    "zsh",
    "toml",
    "yaml",
    "yml",
    "json",
    "xml",
    "html",
    "css",
    "scss",
    "sql",
    "md",
    "txt",
    "cfg",
    "ini",
    "env",
    "dockerfile",
    "makefile",
];
60
/// Counters describing the outcome of one `ProjectIndexer::index_workspace` run.
#[derive(Debug, Clone)]
pub struct IndexStats {
    /// Number of files that passed every filter and were processed.
    pub files_indexed: usize,
    /// Number of facts counted as stored in the search engine.
    pub entries_indexed: usize,
    /// Number of files skipped because they were too large or not indexable.
    pub files_skipped: usize,
    /// Project metadata detected for the workspace, when available.
    pub project_info: Option<ProjectInfo>,
}
73
/// Indexes the files of a workspace into a `HybridSearchEngine` so they can be searched.
pub struct ProjectIndexer {
    /// Root directory that is walked and indexed.
    workspace: PathBuf,
    /// Search engine that stores every indexed fact and answers queries.
    engine: HybridSearchEngine,
    /// Limits and feature switches applied while indexing.
    config: IndexerConfig,
}
80
/// Tunable limits and feature switches for `ProjectIndexer`.
#[derive(Debug, Clone)]
pub struct IndexerConfig {
    /// Files larger than this many bytes are skipped.
    pub max_file_size: u64,
    /// Indexing stops once this many files have been indexed.
    pub max_files: usize,
    /// Whether to index a short content summary for each file.
    pub index_content: bool,
    /// Whether to extract and index declaration signatures from file contents.
    pub index_signatures: bool,
}
93
impl Default for IndexerConfig {
    /// Returns the default configuration: `MAX_FILE_SIZE` / `MAX_FILES` limits with
    /// both content and signature indexing enabled.
    fn default() -> Self {
        Self {
            max_file_size: MAX_FILE_SIZE,
            max_files: MAX_FILES,
            index_content: true,
            index_signatures: true,
        }
    }
}
104
105impl ProjectIndexer {
106 pub fn new(
108 workspace: PathBuf,
109 search_config: SearchConfig,
110 ) -> Result<Self, crate::search::SearchError> {
111 let engine = HybridSearchEngine::open(search_config)?;
112 Ok(Self {
113 workspace,
114 engine,
115 config: IndexerConfig::default(),
116 })
117 }
118
119 pub fn with_config(
121 workspace: PathBuf,
122 search_config: SearchConfig,
123 config: IndexerConfig,
124 ) -> Result<Self, crate::search::SearchError> {
125 let engine = HybridSearchEngine::open(search_config)?;
126 Ok(Self {
127 workspace,
128 engine,
129 config,
130 })
131 }
132
133 pub fn index_workspace(&mut self) -> IndexStats {
136 let project_info = detect_project(&self.workspace);
137 info!(
138 "Indexing workspace: {:?} (type: {:?})",
139 self.workspace, project_info.project_type
140 );
141
142 let structure = self.build_structure_summary(&project_info);
144 let _ = self.engine.index_fact("__project_structure__", &structure);
145
146 let mut files_indexed = 0;
147 let mut entries_indexed = 1; let mut files_skipped = 0;
149
150 let walker = WalkBuilder::new(&self.workspace)
152 .hidden(true) .git_ignore(true) .git_global(true) .git_exclude(true) .max_depth(Some(10))
157 .build();
158
159 for entry in walker.flatten() {
160 if files_indexed >= self.config.max_files {
161 debug!("Reached max files limit ({})", self.config.max_files);
162 break;
163 }
164
165 let path = entry.path();
166
167 if !path.is_file() {
169 continue;
170 }
171
172 if let Ok(meta) = path.metadata()
174 && meta.len() > self.config.max_file_size
175 {
176 files_skipped += 1;
177 continue;
178 }
179
180 if !is_indexable(path) {
182 files_skipped += 1;
183 continue;
184 }
185
186 let rel_path = path
188 .strip_prefix(&self.workspace)
189 .unwrap_or(path)
190 .to_string_lossy()
191 .to_string();
192
193 let path_entry = format!("file: {}", rel_path);
195 let fact_id = format!("file:{}", rel_path);
196 if self.engine.index_fact(&fact_id, &path_entry).is_ok() {
197 entries_indexed += 1;
198 }
199
200 if self.config.index_content
202 && let Ok(content) = std::fs::read_to_string(path)
203 {
204 let summary = self.summarize_file(&rel_path, &content);
206 if !summary.is_empty() {
207 let content_id = format!("content:{}", rel_path);
208 if self.engine.index_fact(&content_id, &summary).is_ok() {
209 entries_indexed += 1;
210 }
211 }
212
213 if self.config.index_signatures {
215 let signatures = extract_signatures(&content, &rel_path);
216 for (i, sig) in signatures.iter().enumerate() {
217 let sig_id = format!("sig:{}:{}", rel_path, i);
218 if self.engine.index_fact(&sig_id, sig).is_ok() {
219 entries_indexed += 1;
220 }
221 }
222 }
223 }
224
225 files_indexed += 1;
226 }
227
228 info!(
229 "Indexing complete: {} files indexed, {} entries, {} skipped",
230 files_indexed, entries_indexed, files_skipped
231 );
232
233 IndexStats {
234 files_indexed,
235 entries_indexed,
236 files_skipped,
237 project_info: Some(project_info),
238 }
239 }
240
    /// Runs `query` against the underlying search engine and returns its results.
    ///
    /// # Errors
    ///
    /// Propagates any error returned by the engine's `search` call.
    pub fn search(&self, query: &str) -> Result<Vec<SearchResult>, crate::search::SearchError> {
        self.engine.search(query)
    }
245
    /// Returns the indexed-entry count reported by the underlying search engine.
    pub fn indexed_count(&self) -> usize {
        self.engine.indexed_count()
    }
250
    /// Returns a shared reference to the underlying search engine.
    pub fn engine(&self) -> &HybridSearchEngine {
        &self.engine
    }
255
256 pub fn build_structure_summary(&self, info: &ProjectInfo) -> String {
258 let mut summary = String::new();
259
260 summary.push_str(&format!("Project type: {:?}\n", info.project_type));
261
262 if let Some(ref framework) = info.framework {
263 summary.push_str(&format!("Framework: {}\n", framework));
264 }
265 if let Some(ref pm) = info.package_manager {
266 summary.push_str(&format!("Package manager: {}\n", pm));
267 }
268
269 if !info.source_dirs.is_empty() {
270 summary.push_str(&format!(
271 "Source directories: {}\n",
272 info.source_dirs.join(", ")
273 ));
274 }
275
276 summary.push_str("\nTop-level structure:\n");
278 if let Ok(entries) = std::fs::read_dir(&self.workspace) {
279 let mut dirs: Vec<String> = Vec::new();
280 let mut files: Vec<String> = Vec::new();
281
282 for entry in entries.flatten() {
283 let name = entry.file_name().to_string_lossy().to_string();
284 if name.starts_with('.') {
285 continue;
286 }
287 if entry.path().is_dir() {
288 dirs.push(format!(" {}/", name));
289 } else {
290 files.push(format!(" {}", name));
291 }
292 }
293
294 dirs.sort();
295 files.sort();
296
297 for d in &dirs {
298 summary.push_str(d);
299 summary.push('\n');
300 }
301 for f in &files {
302 summary.push_str(f);
303 summary.push('\n');
304 }
305 }
306
307 summary
308 }
309
310 fn summarize_file(&self, path: &str, content: &str) -> String {
312 let lines: Vec<&str> = content.lines().collect();
313 let total_lines = lines.len();
314
315 let head: Vec<&str> = lines.iter().take(20).copied().collect();
317
318 let mut summary = format!("{} ({} lines)\n{}", path, total_lines, head.join("\n"));
320
321 if total_lines > 20 {
323 summary.push_str(&format!("\n... ({} more lines)", total_lines - 20));
324 }
325
326 summary
327 }
328}
329
330fn is_indexable(path: &Path) -> bool {
332 let name = path
334 .file_name()
335 .map(|n| n.to_string_lossy().to_lowercase())
336 .unwrap_or_default();
337
338 if ["makefile", "dockerfile", "rakefile", "gemfile", "procfile"].contains(&name.as_str()) {
339 return true;
340 }
341
342 path.extension()
344 .and_then(|ext| ext.to_str())
345 .map(|ext| SOURCE_EXTENSIONS.contains(&ext.to_lowercase().as_str()))
346 .unwrap_or(false)
347}
348
349fn extract_signatures(content: &str, path: &str) -> Vec<String> {
351 let mut signatures = Vec::new();
352 let ext = Path::new(path)
353 .extension()
354 .and_then(|e| e.to_str())
355 .unwrap_or("");
356
357 for (i, line) in content.lines().enumerate() {
358 let trimmed = line.trim();
359 let sig = match ext {
360 "rs" => extract_rust_signature(trimmed),
361 "py" => extract_python_signature(trimmed),
362 "js" | "jsx" | "ts" | "tsx" => extract_js_signature(trimmed),
363 "go" => extract_go_signature(trimmed),
364 "java" | "kt" | "scala" => extract_java_signature(trimmed),
365 "rb" => extract_ruby_signature(trimmed),
366 "c" | "cpp" | "cc" | "h" | "hpp" => extract_c_signature(trimmed),
367 _ => None,
368 };
369
370 if let Some(sig_text) = sig {
371 signatures.push(format!("{}:{} {}", path, i + 1, sig_text));
372 }
373 }
374
375 signatures
376}
377
/// Returns the signature text if `line` (already trimmed) starts a Rust item
/// declaration: `fn`, `struct`, `enum`, `trait`, `impl` or `mod`.
///
/// An optional visibility (`pub`, `pub(crate)`, `pub(super)`, `pub(in ...)`),
/// any of the `const` / `async` / `unsafe` qualifiers and an `extern "ABI"`
/// marker may precede the item keyword. Generic impl blocks (`impl<T> ...`) are
/// recognised too. A trailing `{` is stripped from the returned text.
fn extract_rust_signature(line: &str) -> Option<String> {
    const QUALIFIERS: &[&str] = &["const ", "async ", "unsafe "];
    const ITEM_KEYWORDS: &[&str] = &["fn ", "struct ", "enum ", "trait ", "impl ", "mod "];

    // Skip an optional visibility modifier.
    let mut rest = line;
    if let Some(after_pub) = line.strip_prefix("pub") {
        if let Some(scoped) = after_pub.strip_prefix('(') {
            // `pub(crate)`, `pub(super)`, `pub(in path)`; give up if unterminated.
            let (_, tail) = scoped.split_once(')')?;
            rest = tail.trim_start();
        } else if after_pub.starts_with(char::is_whitespace) {
            rest = after_pub.trim_start();
        }
        // Otherwise "pub" is merely the start of an identifier (e.g. `publish`).
    }

    // Skip any number of function/impl qualifiers.
    while let Some(tail) = QUALIFIERS.iter().find_map(|q| rest.strip_prefix(*q)) {
        rest = tail.trim_start();
    }

    // Skip `extern` together with its optional quoted ABI string.
    if let Some(tail) = rest.strip_prefix("extern ") {
        let tail = tail.trim_start();
        rest = match tail.strip_prefix('"').and_then(|t| t.split_once('"')) {
            Some((_, after_abi)) => after_abi.trim_start(),
            None => tail,
        };
    }

    let is_item =
        ITEM_KEYWORDS.iter().any(|kw| rest.starts_with(*kw)) || rest.starts_with("impl<");

    is_item.then(|| line.trim_end_matches('{').trim().to_string())
}
398
/// Returns the signature text if `line` (already trimmed) starts a Python
/// `def`, `async def` or `class` statement; trailing `:` characters are removed.
fn extract_python_signature(line: &str) -> Option<String> {
    const PREFIXES: [&str; 3] = ["def ", "async def ", "class "];

    let is_declaration = PREFIXES.iter().any(|prefix| line.starts_with(*prefix));
    if !is_declaration {
        return None;
    }

    let signature = line.trim_end_matches(':').trim();
    Some(signature.to_string())
}
406
/// Returns the signature text if `line` (already trimmed) looks like a
/// JavaScript/TypeScript declaration.
///
/// Recognised forms are function and class declarations (optionally prefixed
/// by `export` / `export default` and `async`), TypeScript `interface`
/// declarations, and any line containing an arrow function that opens a block
/// (`=> {`). A trailing `{` is stripped from the returned text.
fn extract_js_signature(line: &str) -> Option<String> {
    const DECLARATION_PREFIXES: &[&str] = &[
        "function ",
        "async function ",
        "export function ",
        "export async function ",
        "export default function ",
        "export default async function ",
        "class ",
        "export class ",
        "export default class ",
        "interface ",
        "export interface ",
    ];

    let is_declaration = DECLARATION_PREFIXES
        .iter()
        .any(|prefix| line.starts_with(*prefix))
        || line.contains("=> {");

    is_declaration.then(|| line.trim_end_matches('{').trim().to_string())
}
422
/// Returns the signature text if `line` (already trimmed) starts a Go `func`
/// or `type` declaration; a trailing `{` is removed.
fn extract_go_signature(line: &str) -> Option<String> {
    let starts_declaration = ["func ", "type "]
        .iter()
        .any(|keyword| line.starts_with(*keyword));

    match starts_declaration {
        true => Some(line.trim_end_matches('{').trim().to_string()),
        false => None,
    }
}
430
/// Returns the signature text if `line` (already trimmed) looks like a Java,
/// Kotlin or Scala declaration.
///
/// A line qualifies when it either
/// * starts with a Java-style modifier (`public`, `private`, ...) and contains
///   a `(` or one of the type keywords `class`, `interface`, `enum`, or
/// * starts directly with `class`, `interface`, `enum`, Kotlin `fun`, or Scala
///   `def` / `trait`.
///
/// This is a line-based heuristic, so statements such as
/// `private Foo foo = new Foo();` are still reported. A trailing `{` is
/// stripped from the returned text.
fn extract_java_signature(line: &str) -> Option<String> {
    const MODIFIERS: &[&str] = &[
        "public ",
        "private ",
        "protected ",
        "static ",
        "abstract ",
        "final ",
    ];
    const TYPE_KEYWORDS: &[&str] = &["class ", "interface ", "enum "];
    // Kotlin (`fun`) and Scala (`def`, `trait`) files are routed here as well.
    const BARE_PREFIXES: &[&str] = &["class ", "interface ", "enum ", "fun ", "def ", "trait "];

    let has_modifier = MODIFIERS.iter().any(|m| line.starts_with(*m));
    let modified_declaration = has_modifier
        && (line.contains('(') || TYPE_KEYWORDS.iter().any(|k| line.contains(*k)));
    let bare_declaration = BARE_PREFIXES.iter().any(|p| line.starts_with(*p));

    (modified_declaration || bare_declaration)
        .then(|| line.trim_end_matches('{').trim().to_string())
}
449
/// Returns the signature text if `line` (already trimmed) starts a Ruby `def`,
/// `class` or `module` definition.
fn extract_ruby_signature(line: &str) -> Option<String> {
    const KEYWORDS: [&str; 3] = ["def ", "class ", "module "];

    if !KEYWORDS.iter().any(|keyword| line.starts_with(*keyword)) {
        return None;
    }

    Some(line.trim().to_string())
}
457
/// Returns the signature text if `line` (already trimmed) looks like a C/C++
/// declaration: a line containing `(`, or one starting with `struct`, `class`
/// or `typedef`.
///
/// The following are rejected so they do not flood the index:
/// * preprocessor directives and comments (`#`, `//`, `/*`, and `*`
///   continuation lines of block comments),
/// * lines starting with `}` (e.g. `} while (cond);`, `} else if (...)`),
/// * statements starting with a control-flow keyword (`if`, `while`,
///   `return`, ...),
/// * lines containing an assignment, i.e. an `=` that is not accompanied by
///   `==` or `!=` anywhere on the line.
///
/// This remains a line-based heuristic; plain call statements such as
/// `foo(x);` are still reported. A trailing `{` is stripped from the result.
fn extract_c_signature(line: &str) -> Option<String> {
    const STATEMENT_KEYWORDS: &[&str] = &[
        "if", "else", "for", "while", "do", "switch", "case", "return", "goto", "catch", "throw",
        "delete",
    ];

    let is_comment_or_directive = line.starts_with('#')
        || line.starts_with("//")
        || line.starts_with("/*")
        || line.starts_with('*');
    if is_comment_or_directive || line.starts_with('}') {
        return None;
    }

    let is_type_declaration = line.starts_with("struct ")
        || line.starts_with("class ")
        || line.starts_with("typedef ");
    if !is_type_declaration && !line.contains('(') {
        return None;
    }

    // Treat a lone `=` as an assignment/initialisation rather than a declaration.
    if line.contains('=') && !line.contains("==") && !line.contains("!=") {
        return None;
    }

    // The leading identifier tells control-flow statements apart from declarations.
    let first_word = line
        .split(|c: char| !(c.is_alphanumeric() || c == '_'))
        .next()
        .unwrap_or("");
    if STATEMENT_KEYWORDS.contains(&first_word) {
        return None;
    }

    Some(line.trim_end_matches('{').trim().to_string())
}
478
#[cfg(test)]
mod tests {
    use super::*;
    use std::fs;
    use tempfile::TempDir;

    /// Creates a temporary workspace containing a small Rust project, a
    /// `.gitignore`, an ignored `target/` directory and a non-indexable binary
    /// file. The returned `TempDir` must be kept alive for the test's duration.
    fn setup_test_workspace() -> (TempDir, PathBuf) {
        let dir = TempDir::new().unwrap();
        let path = dir.path().to_path_buf();

        fs::create_dir_all(path.join("src")).unwrap();
        fs::write(
            path.join("src/main.rs"),
            "fn main() {\n println!(\"hello\");\n}\n\npub fn helper() -> bool {\n true\n}\n",
        )
        .unwrap();
        fs::write(
            path.join("src/lib.rs"),
            "pub mod utils;\n\npub struct Config {\n pub name: String,\n}\n\nimpl Config {\n pub fn new() -> Self {\n Self { name: String::new() }\n }\n}\n",
        )
        .unwrap();
        fs::write(
            path.join("Cargo.toml"),
            "[package]\nname = \"test\"\nversion = \"0.1.0\"\n",
        )
        .unwrap();
        fs::write(
            path.join("README.md"),
            "# Test Project\n\nA test project.\n",
        )
        .unwrap();

        fs::write(path.join(".gitignore"), "target/\n*.tmp\n").unwrap();

        // Content that must never show up in the index (covered by .gitignore).
        fs::create_dir_all(path.join("target")).unwrap();
        fs::write(path.join("target/debug.rs"), "should be ignored").unwrap();

        // A binary file with a non-indexable extension.
        fs::write(path.join("image.png"), [0x89, 0x50, 0x4E, 0x47]).unwrap();

        (dir, path)
    }

    /// Builds a `SearchConfig` whose index and database live under
    /// `<workspace>/.rustant`, shared by every test that needs an indexer.
    fn search_config_for(workspace: &Path) -> SearchConfig {
        SearchConfig {
            index_path: workspace.join(".rustant/search_index"),
            db_path: workspace.join(".rustant/vectors.db"),
            ..Default::default()
        }
    }

    #[test]
    fn test_is_indexable() {
        assert!(is_indexable(Path::new("src/main.rs")));
        assert!(is_indexable(Path::new("app.py")));
        assert!(is_indexable(Path::new("index.js")));
        assert!(is_indexable(Path::new("Makefile")));
        assert!(is_indexable(Path::new("Dockerfile")));
        assert!(!is_indexable(Path::new("image.png")));
        assert!(!is_indexable(Path::new("archive.zip")));
        assert!(!is_indexable(Path::new("binary.exe")));
    }

    #[test]
    fn test_extract_rust_signatures() {
        let content = "use std::io;\n\npub fn process(data: &[u8]) -> Result<(), Error> {\n Ok(())\n}\n\nstruct Config {\n name: String,\n}\n\nimpl Config {\n fn new() -> Self { todo!() }\n}\n";
        let sigs = extract_signatures(content, "lib.rs");
        assert!(sigs.iter().any(|s| s.contains("pub fn process")));
        assert!(sigs.iter().any(|s| s.contains("struct Config")));
        assert!(sigs.iter().any(|s| s.contains("impl Config")));
        assert!(sigs.iter().any(|s| s.contains("fn new")));
    }

    #[test]
    fn test_extract_python_signatures() {
        let content = "import os\n\nclass Handler:\n def process(self, data):\n pass\n\nasync def fetch(url):\n pass\n";
        let sigs = extract_signatures(content, "handler.py");
        assert!(sigs.iter().any(|s| s.contains("class Handler")));
        assert!(sigs.iter().any(|s| s.contains("def process")));
        assert!(sigs.iter().any(|s| s.contains("async def fetch")));
    }

    #[test]
    fn test_extract_js_signatures() {
        let content = "const x = 1;\n\nfunction handleRequest(req) {\n return null;\n}\n\nexport class Server {\n}\n";
        let sigs = extract_signatures(content, "server.js");
        assert!(sigs.iter().any(|s| s.contains("function handleRequest")));
        assert!(sigs.iter().any(|s| s.contains("export class Server")));
    }

    #[test]
    fn test_index_workspace() {
        let (_dir, path) = setup_test_workspace();
        let search_config = search_config_for(&path);

        let mut indexer = ProjectIndexer::new(path, search_config).unwrap();
        let stats = indexer.index_workspace();

        assert!(stats.files_indexed > 0, "Should index at least one file");
        assert!(
            stats.entries_indexed > 0,
            "Should create at least one entry"
        );

        assert!(stats.project_info.is_some());
    }

    #[test]
    fn test_search_indexed_workspace() {
        let (_dir, path) = setup_test_workspace();
        let search_config = search_config_for(&path);

        let mut indexer = ProjectIndexer::new(path, search_config).unwrap();
        indexer.index_workspace();

        let results = indexer.search("main function").unwrap();
        assert!(
            !results.is_empty(),
            "Should find results for 'main function'"
        );

        let has_main = results.iter().any(|r| r.content.contains("main"));
        assert!(has_main, "Should find main.rs related content");
    }

    #[test]
    fn test_indexer_config() {
        let config = IndexerConfig::default();
        assert_eq!(config.max_file_size, MAX_FILE_SIZE);
        assert_eq!(config.max_files, MAX_FILES);
        assert!(config.index_content);
        assert!(config.index_signatures);
    }

    #[test]
    fn test_indexer_with_custom_config() {
        let (_dir, path) = setup_test_workspace();
        let search_config = search_config_for(&path);

        let custom = IndexerConfig {
            max_files: 2,
            index_content: false,
            index_signatures: false,
            ..Default::default()
        };

        let mut indexer = ProjectIndexer::with_config(path, search_config, custom).unwrap();
        let stats = indexer.index_workspace();

        assert!(stats.files_indexed <= 2);
    }

    #[test]
    fn test_build_structure_summary() {
        let (_dir, path) = setup_test_workspace();
        let search_config = search_config_for(&path);

        let indexer = ProjectIndexer::new(path.clone(), search_config).unwrap();
        let info = detect_project(&path);
        let summary = indexer.build_structure_summary(&info);

        assert!(summary.contains("Project type:"));
        assert!(summary.contains("Top-level structure:"));
    }

    #[test]
    fn test_ignored_files_not_indexed() {
        let (_dir, path) = setup_test_workspace();

        // Initialise a git repository in the workspace before indexing.
        std::process::Command::new("git")
            .args(["init"])
            .current_dir(&path)
            .output()
            .expect("git init");

        let search_config = search_config_for(&path);

        let mut indexer = ProjectIndexer::new(path, search_config).unwrap();
        indexer.index_workspace();

        let results = indexer.search("should be ignored").unwrap();
        let has_target = results
            .iter()
            .any(|r| r.content.contains("target/debug.rs"));
        assert!(
            !has_target,
            "Files in target/ should be ignored by .gitignore"
        );
    }
}