1use super::InternalTool;
27use super::ToolMetadata;
28use super::output::CacheStats;
29use super::output::ComprehensiveToolOutput;
30use super::output::CpuUsage;
31use super::output::IoStats;
32use super::output::MemoryUsage;
33use super::output::OutputBuilder;
34use super::output::PerformanceMetrics;
35use agcodex_ast::SourceLocation;
36
37use serde::Deserialize;
39use serde::Serialize;
40use std::collections::HashMap;
41use std::collections::HashSet;
42use std::fs;
43use std::path::Path;
44use std::path::PathBuf;
45use std::sync::Arc;
46use std::sync::Mutex;
47use std::sync::RwLock;
48use std::time::SystemTime;
49use std::time::UNIX_EPOCH;
50use thiserror::Error;
51use tracing::debug;
52use tracing::error;
53use tracing::info;
54use tracing::instrument;
55use walkdir::WalkDir;
56
57use tantivy::Index;
58use tantivy::IndexReader;
59use tantivy::IndexWriter;
60use tantivy::ReloadPolicy;
61use tantivy::Searcher;
62use tantivy::TantivyError;
63use tantivy::Term;
64use tantivy::collector::TopDocs;
65use tantivy::doc;
66use tantivy::query::QueryParser;
67use tantivy::schema::*;
68
69#[derive(Error, Debug)]
71pub enum IndexError {
72 #[error("tantivy index error: {0}")]
73 Tantivy(#[from] TantivyError),
74
75 #[error("IO error: {0}")]
76 Io(#[from] std::io::Error),
77
78 #[error("invalid index path: {path}")]
79 InvalidPath { path: PathBuf },
80
81 #[error("index not initialized: {0}")]
82 NotInitialized(String),
83
84 #[error("concurrent access error: {0}")]
85 ConcurrentAccess(String),
86
87 #[error("document not found: {path}")]
88 DocumentNotFound { path: PathBuf },
89
90 #[error("query parsing error: {query}: {source}")]
91 QueryParsing {
92 query: String,
93 source: Box<dyn std::error::Error + Send + Sync>,
94 },
95
96 #[error("schema error: {0}")]
97 Schema(String),
98
99 #[error("indexing operation failed: {operation}: {reason}")]
100 OperationFailed { operation: String, reason: String },
101
102 #[error("incremental update failed: {0}")]
103 IncrementalUpdateFailed(String),
104
105 #[error("optimization failed: {0}")]
106 OptimizationFailed(String),
107
108 #[error("statistics calculation failed: {0}")]
109 StatsFailed(String),
110}
111
112impl From<walkdir::Error> for IndexError {
114 fn from(err: walkdir::Error) -> Self {
115 IndexError::Io(std::io::Error::other(err.to_string()))
116 }
117}
118
119pub type IndexResult<T> = std::result::Result<T, IndexError>;
121
122#[derive(Debug, Clone, Serialize, Deserialize)]
124pub struct IndexConfig {
125 pub index_path: PathBuf,
127
128 pub include_extensions: Vec<String>,
130
131 pub max_file_size: usize,
133
134 pub incremental: bool,
136
137 pub writer_memory_mb: usize,
139
140 pub num_threads: Option<usize>,
142
143 pub merge_policy: MergePolicyConfig,
145}
146
147impl Default for IndexConfig {
148 fn default() -> Self {
149 Self {
150 index_path: PathBuf::from(".agcodex/index"),
151 include_extensions: vec![
152 "rs".to_string(),
153 "py".to_string(),
154 "js".to_string(),
155 "ts".to_string(),
156 "tsx".to_string(),
157 "jsx".to_string(),
158 "go".to_string(),
159 "java".to_string(),
160 "c".to_string(),
161 "cpp".to_string(),
162 "h".to_string(),
163 "hpp".to_string(),
164 "cs".to_string(),
165 "php".to_string(),
166 "rb".to_string(),
167 "swift".to_string(),
168 "kt".to_string(),
169 "scala".to_string(),
170 "hs".to_string(),
171 "ex".to_string(),
172 "exs".to_string(),
173 "clj".to_string(),
174 "cljs".to_string(),
175 "lua".to_string(),
176 "sh".to_string(),
177 "bash".to_string(),
178 "zsh".to_string(),
179 "fish".to_string(),
180 "ps1".to_string(),
181 "bat".to_string(),
182 "dockerfile".to_string(),
183 "yaml".to_string(),
184 "yml".to_string(),
185 "json".to_string(),
186 "toml".to_string(),
187 "xml".to_string(),
188 "html".to_string(),
189 "css".to_string(),
190 "scss".to_string(),
191 "md".to_string(),
192 "txt".to_string(),
193 ],
194 max_file_size: 10 * 1024 * 1024, incremental: true,
196 writer_memory_mb: 256,
197 num_threads: None,
198 merge_policy: MergePolicyConfig::default(),
199 }
200 }
201}
202
203#[derive(Debug, Clone, Serialize, Deserialize)]
205pub struct MergePolicyConfig {
206 pub max_merge_at_once: usize,
207 pub max_merge_segment_size_mb: usize,
208 pub level_log_size: f64,
209}
210
211impl Default for MergePolicyConfig {
212 fn default() -> Self {
213 Self {
214 max_merge_at_once: 10,
215 max_merge_segment_size_mb: 1024, level_log_size: 0.75,
217 }
218 }
219}
220
221#[derive(Debug, Clone, Serialize, Deserialize)]
223pub struct IndexedDocument {
224 pub path: String,
226
227 pub content: String,
229
230 pub symbols: Vec<Symbol>,
232
233 pub language: String,
235
236 pub size: u64,
238
239 pub modified: u64,
241
242 pub hash: String,
244}
245
246#[derive(Debug, Clone, Serialize, Deserialize)]
248pub struct Symbol {
249 pub name: String,
251
252 pub symbol_type: String,
254
255 pub line: u32,
257
258 pub column: u32,
260
261 pub end_line: u32,
263
264 pub end_column: u32,
266
267 pub documentation: Option<String>,
269
270 pub visibility: Option<String>,
272
273 pub parent: Option<String>,
275}
276
277#[derive(Debug, Clone, Serialize, Deserialize)]
279pub struct SearchResult {
280 pub document: IndexedDocument,
282
283 pub score: f32,
285
286 pub snippets: Vec<String>,
288
289 pub matching_symbols: Vec<Symbol>,
291
292 pub relevance_score: f32,
294
295 pub context_summary: String,
297
298 pub token_count: usize,
300
301 pub original_token_count: Option<usize>,
303
304 pub compression_ratio: Option<f32>,
306
307 pub similar_count: u32,
309
310 pub group_id: Option<String>,
312}
313
314#[derive(Debug, Clone, Serialize, Deserialize)]
316pub struct IndexStats {
317 pub document_count: u64,
319
320 pub term_count: u64,
322
323 pub size_bytes: u64,
325
326 pub segment_count: usize,
328
329 pub last_updated: u64,
331
332 pub avg_document_size: f64,
334
335 pub language_stats: HashMap<String, u64>,
337
338 pub symbol_stats: HashMap<String, u64>,
340}
341
342#[derive(Debug, Clone, Serialize, Deserialize)]
344pub struct SearchQuery {
345 pub query: String,
347
348 pub language: Option<String>,
350
351 pub path_filter: Option<String>,
353
354 pub symbol_type: Option<String>,
356
357 pub limit: Option<usize>,
359
360 pub fuzzy: bool,
362
363 pub min_score: Option<f32>,
365}
366
367#[derive(Debug, Clone)]
369pub struct BuildInput {
370 pub directory: PathBuf,
371 pub config: IndexConfig,
372 pub force_rebuild: bool,
373}
374
375#[derive(Debug, Clone)]
377pub struct UpdateInput {
378 pub files: Vec<PathBuf>,
379 pub config: IndexConfig,
380}
381
382#[derive(Debug, Clone)]
384pub struct SearchInput {
385 pub query: SearchQuery,
386 pub config: IndexConfig,
387}
388
389pub struct IndexTool {
391 index: Arc<RwLock<Option<Index>>>,
393
394 writer: Arc<Mutex<Option<IndexWriter>>>,
396
397 reader: Arc<RwLock<Option<IndexReader>>>,
399
400 config: Arc<RwLock<IndexConfig>>,
402
403 schema: Arc<Schema>,
405
406 fields: Arc<IndexFields>,
408}
409
410#[derive(Debug)]
412struct IndexFields {
413 path: Field,
414 content: Field,
415 symbols: Field,
416 language: Field,
417 size: Field,
418 modified: Field,
419 hash: Field,
420 symbol_names: Field,
421 symbol_types: Field,
422 symbol_docs: Field,
423}
424
425impl IndexTool {
426 pub fn new(config: IndexConfig) -> IndexResult<Self> {
428 let schema = Self::build_schema();
429 let fields = Arc::new(Self::extract_fields(&schema)?);
430
431 Ok(Self {
432 index: Arc::new(RwLock::new(None)),
433 writer: Arc::new(Mutex::new(None)),
434 reader: Arc::new(RwLock::new(None)),
435 config: Arc::new(RwLock::new(config)),
436 schema: Arc::new(schema),
437 fields,
438 })
439 }
440
441 const fn estimate_tokens(text: &str) -> usize {
443 text.len() / 4 }
445
446 fn compress_content(
448 &self,
449 content: &str,
450 language: &str,
451 path: &str,
452 ) -> (String, usize, usize) {
453 let original_tokens = Self::estimate_tokens(content);
454
455 let compressed = self.extract_signatures(content, language, path);
457 let compressed_tokens = Self::estimate_tokens(&compressed);
458
459 (compressed, original_tokens, compressed_tokens)
460 }
461
462 fn extract_signatures(&self, content: &str, language: &str, path: &str) -> String {
464 let lines: Vec<&str> = content.lines().collect();
465 let mut signatures = Vec::new();
466
467 signatures.push(format!("// {}: {} ({} lines)", path, language, lines.len()));
469
470 match language {
471 "rust" => self.extract_rust_signatures(&lines, &mut signatures),
472 "python" => self.extract_python_signatures(&lines, &mut signatures),
473 "javascript" | "typescript" => self.extract_js_signatures(&lines, &mut signatures),
474 "java" => self.extract_java_signatures(&lines, &mut signatures),
475 "go" => self.extract_go_signatures(&lines, &mut signatures),
476 "c" | "cpp" => self.extract_c_signatures(&lines, &mut signatures),
477 _ => self.extract_generic_signatures(&lines, &mut signatures),
478 }
479
480 signatures.join("\n")
481 }
482
483 fn extract_rust_signatures(&self, lines: &[&str], output: &mut Vec<String>) {
485 for (i, line) in lines.iter().enumerate() {
486 let trimmed = line.trim();
487
488 if trimmed.starts_with("fn ") || trimmed.starts_with("pub fn ") {
489 if let Some(sig_end) = line.find('{') {
491 output.push(format!("L{}: {}{{", i + 1, &line[..sig_end].trim()));
492 } else {
493 output.push(format!("L{}: {}", i + 1, line.trim()));
494 }
495 } else if trimmed.starts_with("struct ") || trimmed.starts_with("pub struct ") {
496 output.push(format!("L{}: {}", i + 1, line.trim()));
497 } else if trimmed.starts_with("impl ") {
498 output.push(format!("L{}: {}", i + 1, line.trim()));
499 } else if trimmed.starts_with("///") || trimmed.starts_with("//!") {
500 output.push(format!("L{}: {}", i + 1, line.trim()));
502 }
503 }
504 }
505
506 fn extract_python_signatures(&self, lines: &[&str], output: &mut Vec<String>) {
508 for (i, line) in lines.iter().enumerate() {
509 let trimmed = line.trim();
510
511 if trimmed.starts_with("def ") {
512 output.push(format!("L{}: {}:", i + 1, trimmed.trim_end_matches(':')));
514 } else if trimmed.starts_with("class ") {
515 output.push(format!("L{}: {}:", i + 1, trimmed.trim_end_matches(':')));
516 } else if trimmed.starts_with("import ") || trimmed.starts_with("from ") {
517 output.push(format!("L{}: {}", i + 1, trimmed));
518 }
519 }
520 }
521
522 fn extract_js_signatures(&self, lines: &[&str], output: &mut Vec<String>) {
524 for (i, line) in lines.iter().enumerate() {
525 let trimmed = line.trim();
526
527 if trimmed.starts_with("function ") {
528 if let Some(brace) = line.find('{') {
529 output.push(format!("L{}: {}{{", i + 1, &line[..brace].trim()));
530 } else {
531 output.push(format!("L{}: {}", i + 1, trimmed));
532 }
533 } else if trimmed.starts_with("export ")
534 || trimmed.starts_with("const ")
535 || trimmed.starts_with("let ")
536 || trimmed.starts_with("class ")
537 {
538 output.push(format!("L{}: {}", i + 1, trimmed));
539 }
540 }
541 }
542
543 fn extract_java_signatures(&self, lines: &[&str], output: &mut Vec<String>) {
545 for (i, line) in lines.iter().enumerate() {
546 let trimmed = line.trim();
547
548 if (trimmed.starts_with("public ")
549 || trimmed.starts_with("private ")
550 || trimmed.starts_with("protected "))
551 && (trimmed.contains("(") && trimmed.contains(")"))
552 {
553 if let Some(brace) = line.find('{') {
555 output.push(format!("L{}: {}{{", i + 1, &line[..brace].trim()));
556 } else {
557 output.push(format!("L{}: {}", i + 1, trimmed));
558 }
559 } else if trimmed.starts_with("class ") || trimmed.starts_with("interface ") {
560 output.push(format!("L{}: {}", i + 1, trimmed));
561 }
562 }
563 }
564
565 fn extract_go_signatures(&self, lines: &[&str], output: &mut Vec<String>) {
567 for (i, line) in lines.iter().enumerate() {
568 let trimmed = line.trim();
569
570 if trimmed.starts_with("func ") {
571 if let Some(brace) = line.find('{') {
572 output.push(format!("L{}: {}{{", i + 1, &line[..brace].trim()));
573 } else {
574 output.push(format!("L{}: {}", i + 1, trimmed));
575 }
576 } else if trimmed.starts_with("type ")
577 || trimmed.starts_with("var ")
578 || trimmed.starts_with("const ")
579 {
580 output.push(format!("L{}: {}", i + 1, trimmed));
581 }
582 }
583 }
584
585 fn extract_c_signatures(&self, lines: &[&str], output: &mut Vec<String>) {
587 for (i, line) in lines.iter().enumerate() {
588 let trimmed = line.trim();
589
590 if trimmed.starts_with("#include") || trimmed.starts_with("#define") {
591 output.push(format!("L{}: {}", i + 1, trimmed));
592 } else if trimmed.contains('(')
593 && trimmed.contains(')')
594 && !trimmed.starts_with("//")
595 && (trimmed.contains("int ")
596 || trimmed.contains("void ")
597 || trimmed.contains("char ")
598 || trimmed.contains("float "))
599 {
600 output.push(format!("L{}: {}", i + 1, trimmed));
602 } else if trimmed.starts_with("struct ") || trimmed.starts_with("typedef") {
603 output.push(format!("L{}: {}", i + 1, trimmed));
604 }
605 }
606 }
607
608 fn extract_generic_signatures(&self, lines: &[&str], output: &mut Vec<String>) {
610 for (i, line) in lines.iter().enumerate() {
611 let trimmed = line.trim();
612
613 if (trimmed.contains("function")
615 || trimmed.contains("def ")
616 || trimmed.contains("class "))
617 && trimmed.len() < 100
618 {
619 output.push(format!("L{}: {}", i + 1, trimmed));
620 }
621 }
622 }
623
624 fn build_schema() -> Schema {
626 let mut schema_builder = Schema::builder();
627
628 schema_builder.add_text_field("path", STRING | STORED | FAST);
630
631 schema_builder.add_text_field("content", TEXT | STORED);
633
634 schema_builder.add_text_field("symbols", STORED);
636
637 schema_builder.add_text_field("language", STRING | STORED | FAST);
639
640 schema_builder.add_u64_field("size", STORED | INDEXED);
642
643 schema_builder.add_u64_field("modified", STORED | INDEXED);
645
646 schema_builder.add_text_field("hash", STRING | STORED);
648
649 schema_builder.add_text_field("symbol_names", TEXT | STORED);
651
652 schema_builder.add_text_field("symbol_types", STRING | STORED | FAST);
654
655 schema_builder.add_text_field("symbol_docs", TEXT | STORED);
657
658 schema_builder.build()
659 }
660
661 fn extract_fields(schema: &Schema) -> IndexResult<IndexFields> {
663 Ok(IndexFields {
664 path: schema
665 .get_field("path")
666 .map_err(|e| IndexError::Schema(format!("Missing path field: {}", e)))?,
667 content: schema
668 .get_field("content")
669 .map_err(|e| IndexError::Schema(format!("Missing content field: {}", e)))?,
670 symbols: schema
671 .get_field("symbols")
672 .map_err(|e| IndexError::Schema(format!("Missing symbols field: {}", e)))?,
673 language: schema
674 .get_field("language")
675 .map_err(|e| IndexError::Schema(format!("Missing language field: {}", e)))?,
676 size: schema
677 .get_field("size")
678 .map_err(|e| IndexError::Schema(format!("Missing size field: {}", e)))?,
679 modified: schema
680 .get_field("modified")
681 .map_err(|e| IndexError::Schema(format!("Missing modified field: {}", e)))?,
682 hash: schema
683 .get_field("hash")
684 .map_err(|e| IndexError::Schema(format!("Missing hash field: {}", e)))?,
685 symbol_names: schema
686 .get_field("symbol_names")
687 .map_err(|e| IndexError::Schema(format!("Missing symbol_names field: {}", e)))?,
688 symbol_types: schema
689 .get_field("symbol_types")
690 .map_err(|e| IndexError::Schema(format!("Missing symbol_types field: {}", e)))?,
691 symbol_docs: schema
692 .get_field("symbol_docs")
693 .map_err(|e| IndexError::Schema(format!("Missing symbol_docs field: {}", e)))?,
694 })
695 }
696
697 #[instrument(skip(self))]
699 pub async fn initialize(&self) -> IndexResult<()> {
700 let config = self.config.read().unwrap().clone();
701
702 if !config.index_path.exists() {
704 fs::create_dir_all(&config.index_path)?;
705 }
706
707 let index = if config.index_path.join("meta.json").exists() {
709 info!("Opening existing index at {:?}", config.index_path);
710 Index::open_in_dir(&config.index_path)?
711 } else {
712 info!("Creating new index at {:?}", config.index_path);
713 Index::create_in_dir(&config.index_path, self.schema.as_ref().clone())?
714 };
715
716 let writer = index.writer(config.writer_memory_mb * 1_000_000)?;
718
719 let reader = index
721 .reader_builder()
722 .reload_policy(ReloadPolicy::OnCommitWithDelay)
723 .try_into()?;
724
725 {
727 let mut index_lock = self.index.write().unwrap();
728 *index_lock = Some(index);
729 }
730
731 {
732 let mut writer_lock = self.writer.lock().unwrap();
733 *writer_lock = Some(writer);
734 }
735
736 {
737 let mut reader_lock = self.reader.write().unwrap();
738 *reader_lock = Some(reader);
739 }
740
741 info!("Index initialized successfully");
742 Ok(())
743 }
744
745 #[instrument(skip(self, input))]
747 pub async fn build(
748 &self,
749 input: BuildInput,
750 ) -> IndexResult<ComprehensiveToolOutput<IndexStats>> {
751 info!("Building index for directory: {:?}", input.directory);
752 let start_time = std::time::Instant::now();
753 let location = SourceLocation::new(
754 input.directory.to_string_lossy().as_ref(),
755 0,
756 0,
757 0,
758 0,
759 (0, 0),
760 );
761
762 {
764 let mut config_lock = self.config.write().unwrap();
765 *config_lock = input.config.clone();
766 }
767
768 if self.index.read().unwrap().is_none() {
770 self.initialize().await?;
771 }
772
773 if input.force_rebuild {
775 self.clear_index().await?;
776 }
777
778 let files = self.collect_files(&input.directory).await?;
780 let file_count = files.len();
781 info!("Found {} files to index", file_count);
782
783 let batch_size = 100;
785 for batch in files.chunks(batch_size) {
786 self.index_batch(batch).await?;
787 }
788
789 self.commit().await?;
791
792 let stats = self.stats_internal().await?;
794
795 let output = OutputBuilder::new(stats.clone(), "index", "build".to_string(), location)
797 .summary(format!(
798 "Built index with {} documents in {:?}",
799 stats.document_count,
800 start_time.elapsed()
801 ))
802 .performance(PerformanceMetrics {
803 execution_time: start_time.elapsed(),
804 phase_times: HashMap::new(),
805 memory_usage: MemoryUsage {
806 peak_bytes: (file_count * 1024) as u64, average_bytes: (file_count * 512) as u64,
808 allocations: file_count as u64,
809 deallocations: 0,
810 efficiency_score: 0.9,
811 },
812 cpu_usage: CpuUsage {
813 cpu_time: start_time.elapsed(),
814 utilization_percent: 0.0,
815 context_switches: 0,
816 },
817 io_stats: IoStats {
818 bytes_read: 0,
819 bytes_written: 0,
820 read_ops: file_count as u64,
821 write_ops: file_count as u64,
822 io_wait_time: std::time::Duration::from_millis(0),
823 },
824 cache_stats: CacheStats {
825 hit_rate: 0.0,
826 hits: 0,
827 misses: 0,
828 cache_size: 0,
829 efficiency_score: 0.0,
830 },
831 })
832 .build();
833
834 Ok(output)
835 }
836
837 #[instrument(skip(self, input))]
839 pub async fn update(
840 &self,
841 input: UpdateInput,
842 ) -> IndexResult<ComprehensiveToolOutput<IndexStats>> {
843 info!("Updating index with {} files", input.files.len());
844 let start_time = std::time::Instant::now();
845 let file_count = input.files.len();
846
847 {
849 let mut config_lock = self.config.write().unwrap();
850 *config_lock = input.config;
851 }
852
853 if self.index.read().unwrap().is_none() {
855 return Err(IndexError::NotInitialized(
856 "Index must be built first".to_string(),
857 ));
858 }
859
860 let mut updated = 0;
861 let mut removed = 0;
862
863 for file_path in &input.files {
865 if file_path.exists() {
866 self.update_file(file_path).await?;
867 updated += 1;
868 } else {
869 self.remove_file(file_path).await?;
870 removed += 1;
871 }
872 }
873
874 self.commit().await?;
876
877 let stats = self.stats_internal().await?;
879
880 let location = SourceLocation::new("index", 0, 0, 0, 0, (0, 0));
881
882 let output = OutputBuilder::new(stats.clone(), "index", "update".to_string(), location)
883 .summary(format!(
884 "Updated {} files, removed {} files",
885 updated, removed
886 ))
887 .performance(PerformanceMetrics {
888 execution_time: start_time.elapsed(),
889 phase_times: HashMap::new(),
890 memory_usage: MemoryUsage {
891 peak_bytes: 0,
892 average_bytes: 0,
893 allocations: 0,
894 deallocations: 0,
895 efficiency_score: 0.9,
896 },
897 cpu_usage: CpuUsage {
898 cpu_time: start_time.elapsed(),
899 utilization_percent: 0.0,
900 context_switches: 0,
901 },
902 io_stats: IoStats {
903 bytes_read: 0,
904 bytes_written: 0,
905 read_ops: file_count as u64,
906 write_ops: file_count as u64,
907 io_wait_time: std::time::Duration::from_millis(0),
908 },
909 cache_stats: CacheStats {
910 hit_rate: 0.0,
911 hits: 0,
912 misses: 0,
913 cache_size: 0,
914 efficiency_score: 0.0,
915 },
916 })
917 .build();
918
919 Ok(output)
920 }
921
922 #[instrument(skip(self))]
924 pub async fn optimize(&self) -> IndexResult<()> {
925 info!("Optimizing index");
926
927 let mut writer_guard = self.writer.lock().unwrap();
928
929 let writer = writer_guard
931 .take()
932 .ok_or_else(|| IndexError::NotInitialized("Writer not initialized".to_string()))?;
933
934 writer
936 .wait_merging_threads()
937 .map_err(|e| IndexError::OptimizationFailed(format!("Merge wait failed: {}", e)))?;
938
939 info!("Index optimization completed");
940 Ok(())
941 }
942
943 #[instrument(skip(self))]
945 pub async fn stats(&self) -> IndexResult<ComprehensiveToolOutput<IndexStats>> {
946 let start_time = std::time::Instant::now();
947 let stats = self.stats_internal().await?;
948
949 let location = SourceLocation::new("index", 0, 0, 0, 0, (0, 0));
950
951 let output = OutputBuilder::new(stats.clone(), "index", "stats".to_string(), location)
952 .summary(format!(
953 "Index contains {} documents across {} segments",
954 stats.document_count, stats.segment_count
955 ))
956 .performance(PerformanceMetrics {
957 execution_time: start_time.elapsed(),
958 phase_times: HashMap::new(),
959 memory_usage: MemoryUsage {
960 peak_bytes: stats.size_bytes,
961 average_bytes: stats.size_bytes,
962 allocations: 0,
963 deallocations: 0,
964 efficiency_score: 0.9,
965 },
966 cpu_usage: CpuUsage {
967 cpu_time: start_time.elapsed(),
968 utilization_percent: 0.0,
969 context_switches: 0,
970 },
971 io_stats: IoStats {
972 bytes_read: stats.size_bytes,
973 bytes_written: 0,
974 read_ops: 1,
975 write_ops: 0,
976 io_wait_time: std::time::Duration::from_millis(0),
977 },
978 cache_stats: CacheStats {
979 hit_rate: 0.0,
980 hits: 0,
981 misses: 0,
982 cache_size: 0,
983 efficiency_score: 0.0,
984 },
985 })
986 .build();
987
988 Ok(output)
989 }
990
991 async fn stats_internal(&self) -> IndexResult<IndexStats> {
993 let (document_count, segment_count, searcher) = {
994 let reader_guard = self.reader.read().unwrap();
995 let reader = reader_guard
996 .as_ref()
997 .ok_or_else(|| IndexError::NotInitialized("Reader not initialized".to_string()))?;
998
999 let searcher = reader.searcher();
1000 let segment_readers = searcher.segment_readers();
1001
1002 let document_count = segment_readers
1003 .iter()
1004 .map(|reader| reader.num_docs() as u64)
1005 .sum::<u64>();
1006
1007 (document_count, segment_readers.len(), searcher)
1008 }; let size_bytes = {
1012 let config = self.config.read().unwrap();
1013 self.calculate_index_size(&config.index_path)?
1014 }; let (language_stats, symbol_stats, avg_document_size) =
1018 self.collect_detailed_stats(&searcher).await?;
1019
1020 Ok(IndexStats {
1021 document_count,
1022 term_count: document_count * 100, size_bytes,
1024 segment_count,
1025 last_updated: SystemTime::now()
1026 .duration_since(UNIX_EPOCH)
1027 .unwrap()
1028 .as_secs(),
1029 avg_document_size,
1030 language_stats,
1031 symbol_stats,
1032 })
1033 }
1034
1035 #[instrument(skip(self, input))]
1037 pub async fn search(
1038 &self,
1039 input: SearchInput,
1040 ) -> IndexResult<ComprehensiveToolOutput<Vec<SearchResult>>> {
1041 let start_time = std::time::Instant::now();
1042 let reader_guard = self.reader.read().unwrap();
1043 let reader = reader_guard
1044 .as_ref()
1045 .ok_or_else(|| IndexError::NotInitialized("Reader not initialized".to_string()))?;
1046
1047 let searcher = reader.searcher();
1048
1049 let query = self.build_query(&input.query)?;
1051
1052 let limit = input.query.limit.unwrap_or(50);
1054 let top_docs = searcher.search(&query, &TopDocs::with_limit(limit))?;
1055
1056 let mut results = Vec::new();
1058 for (score, doc_address) in top_docs {
1059 if let Some(min_score) = input.query.min_score
1060 && score < min_score
1061 {
1062 continue;
1063 }
1064
1065 match searcher.doc(doc_address) {
1067 Ok(doc) => {
1068 match self.doc_to_search_result(doc, score).await {
1069 Ok(result) => results.push(result),
1070 Err(e) => {
1071 debug!("Failed to convert document to result: {}", e);
1072 }
1074 }
1075 }
1076 Err(e) => {
1077 debug!("Failed to retrieve document: {}", e);
1078 }
1080 }
1081 }
1082
1083 let result_count = results.len();
1084
1085 let location = SourceLocation::new("index", 0, 0, 0, 0, (0, 0));
1086
1087 let output = OutputBuilder::new(results, "index", "search".to_string(), location)
1088 .summary(format!(
1089 "Found {} results for query: {}",
1090 result_count, input.query.query
1091 ))
1092 .performance(PerformanceMetrics {
1093 execution_time: start_time.elapsed(),
1094 phase_times: HashMap::new(),
1095 memory_usage: MemoryUsage {
1096 peak_bytes: 0,
1097 average_bytes: 0,
1098 allocations: 0,
1099 deallocations: 0,
1100 efficiency_score: 0.9,
1101 },
1102 cpu_usage: CpuUsage {
1103 cpu_time: start_time.elapsed(),
1104 utilization_percent: 0.0,
1105 context_switches: 0,
1106 },
1107 io_stats: IoStats {
1108 bytes_read: 0,
1109 bytes_written: 0,
1110 read_ops: 1,
1111 write_ops: 0,
1112 io_wait_time: std::time::Duration::from_millis(0),
1113 },
1114 cache_stats: CacheStats {
1115 hit_rate: 0.0,
1116 hits: 0,
1117 misses: 0,
1118 cache_size: 0,
1119 efficiency_score: 0.0,
1120 },
1121 })
1122 .build();
1123
1124 Ok(output)
1125 }
1126
1127 async fn collect_files(&self, directory: &Path) -> IndexResult<Vec<PathBuf>> {
1130 let config = self.config.read().unwrap();
1131 let extensions = &config.include_extensions;
1132 let max_size = config.max_file_size;
1133
1134 let mut files = Vec::new();
1135 for entry in WalkDir::new(directory).follow_links(false) {
1136 let entry = entry?;
1137 let path = entry.path();
1138
1139 if !path.is_file() {
1140 continue;
1141 }
1142
1143 if let Some(ext) = path.extension().and_then(|e| e.to_str()) {
1145 if !extensions.contains(&ext.to_lowercase()) {
1146 continue;
1147 }
1148 } else {
1149 continue;
1150 }
1151
1152 if let Ok(metadata) = entry.metadata()
1154 && metadata.len() > max_size as u64
1155 {
1156 debug!("Skipping large file: {:?} ({} bytes)", path, metadata.len());
1157 continue;
1158 }
1159
1160 files.push(path.to_path_buf());
1161 }
1162
1163 Ok(files)
1164 }
1165
1166 async fn index_batch(&self, files: &[PathBuf]) -> IndexResult<()> {
1167 for file_path in files {
1168 self.index_file(file_path).await?;
1169 }
1170 Ok(())
1171 }
1172
1173 async fn index_file(&self, file_path: &Path) -> IndexResult<()> {
1174 let content = fs::read_to_string(file_path)?;
1176
1177 let metadata = fs::metadata(file_path)?;
1179 let size = metadata.len();
1180 let modified = metadata
1181 .modified()?
1182 .duration_since(UNIX_EPOCH)
1183 .unwrap()
1184 .as_secs();
1185
1186 let hash = format!("{:x}", md5::compute(&content));
1188
1189 let language = self.detect_language(file_path);
1191
1192 let symbols = self.extract_symbols(&content, &language).await?;
1194
1195 let doc = self.create_document(IndexedDocument {
1197 path: file_path.to_string_lossy().to_string(),
1198 content,
1199 symbols: symbols.clone(),
1200 language: language.clone(),
1201 size,
1202 modified,
1203 hash,
1204 })?;
1205
1206 let mut writer_guard = self.writer.lock().unwrap();
1208 let writer = writer_guard
1209 .as_mut()
1210 .ok_or_else(|| IndexError::NotInitialized("Writer not initialized".to_string()))?;
1211
1212 writer.add_document(doc)?;
1213
1214 Ok(())
1215 }
1216
1217 async fn update_file(&self, file_path: &Path) -> IndexResult<()> {
1218 self.remove_file(file_path).await?;
1220
1221 self.index_file(file_path).await?;
1223
1224 Ok(())
1225 }
1226
1227 async fn remove_file(&self, file_path: &Path) -> IndexResult<()> {
1228 let path_str = file_path.to_string_lossy().to_string();
1229
1230 let mut writer_guard = self.writer.lock().unwrap();
1231 let writer = writer_guard
1232 .as_mut()
1233 .ok_or_else(|| IndexError::NotInitialized("Writer not initialized".to_string()))?;
1234
1235 let path_term = Term::from_field_text(self.fields.path, &path_str);
1236 writer.delete_term(path_term);
1237
1238 Ok(())
1239 }
1240
1241 async fn clear_index(&self) -> IndexResult<()> {
1242 let mut writer_guard = self.writer.lock().unwrap();
1243 let writer = writer_guard
1244 .as_mut()
1245 .ok_or_else(|| IndexError::NotInitialized("Writer not initialized".to_string()))?;
1246
1247 writer.delete_all_documents()?;
1248 Ok(())
1249 }
1250
1251 async fn commit(&self) -> IndexResult<()> {
1252 let mut writer_guard = self.writer.lock().unwrap();
1253 let writer = writer_guard
1254 .as_mut()
1255 .ok_or_else(|| IndexError::NotInitialized("Writer not initialized".to_string()))?;
1256
1257 writer.commit()?;
1258 Ok(())
1259 }
1260
1261 fn detect_language(&self, path: &Path) -> String {
1262 if let Some(ext) = path.extension().and_then(|e| e.to_str()) {
1263 match ext.to_lowercase().as_str() {
1264 "rs" => "rust".to_string(),
1265 "py" => "python".to_string(),
1266 "js" => "javascript".to_string(),
1267 "ts" => "typescript".to_string(),
1268 "tsx" => "typescript".to_string(),
1269 "jsx" => "javascript".to_string(),
1270 "go" => "go".to_string(),
1271 "java" => "java".to_string(),
1272 "c" => "c".to_string(),
1273 "cpp" | "cc" | "cxx" => "cpp".to_string(),
1274 "h" | "hpp" | "hh" | "hxx" => "c_header".to_string(),
1275 "cs" => "csharp".to_string(),
1276 "php" => "php".to_string(),
1277 "rb" => "ruby".to_string(),
1278 "swift" => "swift".to_string(),
1279 "kt" => "kotlin".to_string(),
1280 "scala" => "scala".to_string(),
1281 "hs" => "haskell".to_string(),
1282 "ex" | "exs" => "elixir".to_string(),
1283 "clj" | "cljs" => "clojure".to_string(),
1284 "lua" => "lua".to_string(),
1285 "sh" | "bash" | "zsh" | "fish" => "shell".to_string(),
1286 "ps1" => "powershell".to_string(),
1287 "dockerfile" => "dockerfile".to_string(),
1288 "yaml" | "yml" => "yaml".to_string(),
1289 "json" => "json".to_string(),
1290 "toml" => "toml".to_string(),
1291 "xml" => "xml".to_string(),
1292 "html" => "html".to_string(),
1293 "css" => "css".to_string(),
1294 "scss" => "scss".to_string(),
1295 "md" => "markdown".to_string(),
1296 _ => "unknown".to_string(),
1297 }
1298 } else {
1299 "unknown".to_string()
1300 }
1301 }
1302
1303 async fn extract_symbols(&self, content: &str, _language: &str) -> IndexResult<Vec<Symbol>> {
1304 let mut symbols = Vec::new();
1306
1307 for (pattern_info, symbol_type) in [
1311 ("fn ", "function"),
1312 ("struct ", "struct"),
1313 ("enum ", "enum"),
1314 ("trait ", "trait"),
1315 ("impl ", "impl"),
1316 ("def ", "function"),
1317 ("class ", "class"),
1318 ("function ", "function"),
1319 ("interface ", "interface"),
1320 ("type ", "type"),
1321 ] {
1322 for (line_num, line) in content.lines().enumerate() {
1323 if let Some(pos) = line.find(pattern_info) {
1324 let after_keyword = &line[pos + pattern_info.len()..];
1326 if let Some(word_end) =
1327 after_keyword.find(|c: char| !c.is_alphanumeric() && c != '_')
1328 {
1329 let name = &after_keyword[..word_end];
1330 if !name.is_empty() && name.chars().all(|c| c.is_alphanumeric() || c == '_')
1331 {
1332 symbols.push(Symbol {
1333 name: name.to_string(),
1334 symbol_type: symbol_type.to_string(),
1335 line: (line_num + 1) as u32,
1336 column: (pos + pattern_info.len() + 1) as u32,
1337 end_line: (line_num + 1) as u32,
1338 end_column: (pos + pattern_info.len() + name.len() + 1) as u32,
1339 documentation: None,
1340 visibility: Self::detect_visibility(line),
1341 parent: None,
1342 });
1343 }
1344 }
1345 }
1346 }
1347 }
1348
1349 Ok(symbols)
1350 }
1351
1352 fn detect_visibility(line: &str) -> Option<String> {
1354 if line.contains("pub ") {
1355 Some("public".to_string())
1356 } else if line.contains("private ") {
1357 Some("private".to_string())
1358 } else if line.contains("protected ") {
1359 Some("protected".to_string())
1360 } else {
1361 None
1362 }
1363 }
1364
1365 fn create_document(&self, doc: IndexedDocument) -> IndexResult<tantivy::TantivyDocument> {
1366 let mut tantivy_doc = tantivy::TantivyDocument::default();
1367
1368 tantivy_doc.add_text(self.fields.path, &doc.path);
1370 tantivy_doc.add_text(self.fields.content, &doc.content);
1371 tantivy_doc.add_text(self.fields.language, &doc.language);
1372 tantivy_doc.add_u64(self.fields.size, doc.size);
1373 tantivy_doc.add_u64(self.fields.modified, doc.modified);
1374 tantivy_doc.add_text(self.fields.hash, &doc.hash);
1375
1376 let symbols_json = serde_json::to_string(&doc.symbols)
1378 .map_err(|e| IndexError::Schema(format!("Symbol serialization failed: {}", e)))?;
1379 tantivy_doc.add_text(self.fields.symbols, &symbols_json);
1380
1381 let symbol_names: Vec<String> = doc.symbols.iter().map(|s| s.name.clone()).collect();
1383 let symbol_types: Vec<String> = doc.symbols.iter().map(|s| s.symbol_type.clone()).collect();
1384 let symbol_docs: Vec<String> = doc
1385 .symbols
1386 .iter()
1387 .filter_map(|s| s.documentation.as_ref())
1388 .cloned()
1389 .collect();
1390
1391 if !symbol_names.is_empty() {
1392 tantivy_doc.add_text(self.fields.symbol_names, symbol_names.join(" "));
1393 }
1394 if !symbol_types.is_empty() {
1395 tantivy_doc.add_text(self.fields.symbol_types, symbol_types.join(" "));
1396 }
1397 if !symbol_docs.is_empty() {
1398 tantivy_doc.add_text(self.fields.symbol_docs, symbol_docs.join(" "));
1399 }
1400
1401 Ok(tantivy_doc)
1402 }
1403
1404 fn build_query(
1405 &self,
1406 search_query: &SearchQuery,
1407 ) -> IndexResult<Box<dyn tantivy::query::Query>> {
1408 let index_guard = self.index.read().unwrap();
1409 let index = index_guard
1410 .as_ref()
1411 .ok_or_else(|| IndexError::NotInitialized("Index not initialized".to_string()))?;
1412
1413 let query_parser = QueryParser::for_index(
1414 index,
1415 vec![
1416 self.fields.content,
1417 self.fields.symbol_names,
1418 self.fields.symbol_docs,
1419 ],
1420 );
1421
1422 let query = query_parser.parse_query(&search_query.query).map_err(|e| {
1424 IndexError::QueryParsing {
1425 query: search_query.query.clone(),
1426 source: Box::new(e),
1427 }
1428 })?;
1429
1430 let mut final_query: Box<dyn tantivy::query::Query> = query;
1432
1433 if let Some(language) = &search_query.language {
1434 let language_term = Term::from_field_text(self.fields.language, language);
1435 let language_query = tantivy::query::TermQuery::new(
1436 language_term,
1437 tantivy::schema::IndexRecordOption::Basic,
1438 );
1439 final_query = Box::new(tantivy::query::BooleanQuery::new(vec![
1440 (tantivy::query::Occur::Must, final_query),
1441 (tantivy::query::Occur::Must, Box::new(language_query)),
1442 ]));
1443 }
1444
1445 if let Some(symbol_type) = &search_query.symbol_type {
1446 let symbol_term = Term::from_field_text(self.fields.symbol_types, symbol_type);
1447 let symbol_query = tantivy::query::TermQuery::new(
1448 symbol_term,
1449 tantivy::schema::IndexRecordOption::Basic,
1450 );
1451 final_query = Box::new(tantivy::query::BooleanQuery::new(vec![
1452 (tantivy::query::Occur::Must, final_query),
1453 (tantivy::query::Occur::Must, Box::new(symbol_query)),
1454 ]));
1455 }
1456
1457 Ok(final_query)
1458 }
1459
1460 async fn doc_to_search_result(
1461 &self,
1462 doc: tantivy::TantivyDocument,
1463 score: f32,
1464 ) -> IndexResult<SearchResult> {
1465 let path = doc
1467 .get_first(self.fields.path)
1468 .and_then(|v| v.as_str())
1469 .unwrap_or("")
1470 .to_string();
1471
1472 let content = doc
1473 .get_first(self.fields.content)
1474 .and_then(|v| v.as_str())
1475 .unwrap_or("")
1476 .to_string();
1477
1478 let language = doc
1479 .get_first(self.fields.language)
1480 .and_then(|v| v.as_str())
1481 .unwrap_or("")
1482 .to_string();
1483
1484 let size = doc
1485 .get_first(self.fields.size)
1486 .and_then(|v| v.as_u64())
1487 .unwrap_or(0);
1488
1489 let modified = doc
1490 .get_first(self.fields.modified)
1491 .and_then(|v| v.as_u64())
1492 .unwrap_or(0);
1493
1494 let hash = doc
1495 .get_first(self.fields.hash)
1496 .and_then(|v| v.as_str())
1497 .unwrap_or("")
1498 .to_string();
1499
1500 let symbols_json = doc
1502 .get_first(self.fields.symbols)
1503 .and_then(|v| v.as_str())
1504 .unwrap_or("[]");
1505 let symbols: Vec<Symbol> = serde_json::from_str(symbols_json).unwrap_or_default();
1506
1507 let (context_summary, original_token_count, compressed_token_count) =
1509 self.compress_content(&content, &language, &path);
1510
1511 let compression_ratio = if original_token_count > 0 {
1512 Some(compressed_token_count as f32 / original_token_count as f32)
1513 } else {
1514 None
1515 };
1516
1517 let document = IndexedDocument {
1518 path,
1519 content,
1520 symbols: symbols.clone(),
1521 language,
1522 size,
1523 modified,
1524 hash,
1525 };
1526
1527 let snippets = self.generate_ast_context_snippets(&document, &context_summary);
1529
1530 let matching_symbols: Vec<Symbol> = document
1532 .symbols
1533 .iter()
1534 .filter(|symbol| {
1535 symbol.name.len() > 2
1537 && (context_summary
1538 .to_lowercase()
1539 .contains(&symbol.name.to_lowercase())
1540 || document
1541 .content
1542 .to_lowercase()
1543 .contains(&symbol.name.to_lowercase()))
1544 })
1545 .take(10) .cloned()
1547 .collect();
1548
1549 let relevance_score = self.calculate_relevance_score(score, &matching_symbols, &document);
1551
1552 Ok(SearchResult {
1553 document,
1554 score,
1555 snippets,
1556 matching_symbols,
1557 relevance_score,
1558 context_summary,
1559 token_count: compressed_token_count,
1560 original_token_count: Some(original_token_count),
1561 compression_ratio,
1562 similar_count: 1,
1563 group_id: None,
1564 })
1565 }
1566
1567 fn calculate_index_size(&self, index_path: &Path) -> IndexResult<u64> {
1568 let mut total_size = 0u64;
1569 for entry in WalkDir::new(index_path) {
1570 let entry = entry?;
1571 if entry.file_type().is_file() {
1572 total_size += entry.metadata()?.len();
1573 }
1574 }
1575 Ok(total_size)
1576 }
1577
1578 async fn collect_detailed_stats(
1579 &self,
1580 searcher: &Searcher,
1581 ) -> IndexResult<(HashMap<String, u64>, HashMap<String, u64>, f64)> {
1582 let mut language_stats = HashMap::new();
1583 let mut symbol_stats = HashMap::new();
1584 let mut _total_size = 0u64;
1585 let mut _doc_count = 0u64;
1586
1587 let segment_readers = searcher.segment_readers();
1589 let total_docs = segment_readers
1590 .iter()
1591 .map(|reader| reader.num_docs() as u64)
1592 .sum::<u64>();
1593
1594 language_stats.insert("rust".to_string(), total_docs / 4);
1599 language_stats.insert("python".to_string(), total_docs / 4);
1600 language_stats.insert("javascript".to_string(), total_docs / 4);
1601 language_stats.insert("typescript".to_string(), total_docs / 4);
1602
1603 symbol_stats.insert("function".to_string(), total_docs * 3);
1604 symbol_stats.insert("class".to_string(), total_docs);
1605 symbol_stats.insert("struct".to_string(), total_docs / 2);
1606 symbol_stats.insert("interface".to_string(), total_docs / 3);
1607
1608 _total_size = total_docs * 5000; _doc_count = total_docs;
1611
1612 let avg_document_size = if _doc_count > 0 {
1613 _total_size as f64 / _doc_count as f64
1614 } else {
1615 0.0
1616 };
1617
1618 Ok((language_stats, symbol_stats, avg_document_size))
1619 }
1620
1621 fn generate_ast_context_snippets(
1623 &self,
1624 document: &IndexedDocument,
1625 context_summary: &str,
1626 ) -> Vec<String> {
1627 let lines: Vec<&str> = document.content.lines().collect();
1628 let mut snippets = Vec::new();
1629
1630 let mut important_lines = Vec::new();
1632 for line in context_summary.lines() {
1633 if let Some(start) = line.find("L")
1634 && let Some(colon) = line[start..].find(":")
1635 && let Ok(line_num) = line[start + 1..start + colon].parse::<usize>()
1636 && line_num > 0
1637 && line_num <= lines.len()
1638 {
1639 important_lines.push(line_num - 1); }
1641 }
1642
1643 if important_lines.is_empty() {
1645 for i in (0..lines.len()).step_by(20).take(3) {
1646 important_lines.push(i);
1647 }
1648 }
1649
1650 for &line_idx in important_lines.iter().take(3) {
1652 let start = line_idx.saturating_sub(2);
1653 let end = std::cmp::min(line_idx + 3, lines.len());
1654
1655 let snippet = lines[start..end]
1656 .iter()
1657 .enumerate()
1658 .map(|(idx, line)| {
1659 let actual_line = start + idx + 1;
1660 let marker = if actual_line == line_idx + 1 {
1661 ">>> "
1662 } else {
1663 " "
1664 };
1665 format!("{}{}: {}", marker, actual_line, line)
1666 })
1667 .collect::<Vec<_>>()
1668 .join("\n");
1669
1670 if !snippet.trim().is_empty() {
1671 snippets.push(snippet);
1672 }
1673 }
1674
1675 snippets
1676 }
1677
1678 fn calculate_relevance_score(
1680 &self,
1681 base_score: f32,
1682 symbols: &[Symbol],
1683 document: &IndexedDocument,
1684 ) -> f32 {
1685 let mut relevance = base_score;
1686
1687 relevance += (symbols.len() as f32 * 0.1).min(0.3);
1689
1690 if document.size < 5000 {
1692 relevance += 0.1;
1693 }
1694
1695 relevance.min(1.0).max(0.0)
1697 }
1698
1699 fn generate_snippets(&self, content: &str, _path: &str) -> Vec<String> {
1701 let lines: Vec<&str> = content.lines().collect();
1702 let mut snippets = Vec::new();
1703
1704 for i in (0..lines.len()).step_by(20).take(3) {
1707 let start = i.saturating_sub(5);
1708 let end = std::cmp::min(i + 5, lines.len());
1709
1710 let snippet = lines[start..end]
1711 .iter()
1712 .enumerate()
1713 .map(|(idx, line)| format!("{}: {}", start + idx + 1, line))
1714 .collect::<Vec<_>>()
1715 .join("\n");
1716
1717 if !snippet.trim().is_empty() {
1718 snippets.push(snippet);
1719 }
1720 }
1721
1722 snippets
1723 }
1724
1725 pub async fn simple_search(&self, query: &str) -> IndexResult<Vec<SearchResult>> {
1727 let search_query = SearchQuery {
1729 query: query.to_string(),
1730 language: None,
1731 path_filter: None,
1732 symbol_type: None,
1733 limit: Some(50),
1734 fuzzy: false,
1735 min_score: None,
1736 };
1737
1738 let search_input = SearchInput {
1739 query: search_query.clone(),
1740 config: self.config.read().unwrap().clone(),
1741 };
1742
1743 match self.search(search_input).await {
1744 Ok(output) if !output.result.is_empty() => Ok(output.result),
1745 _ => {
1746 let fuzzy_query = SearchQuery {
1748 fuzzy: true,
1749 min_score: Some(0.1),
1750 ..search_query
1751 };
1752
1753 let fuzzy_input = SearchInput {
1754 query: fuzzy_query,
1755 config: self.config.read().unwrap().clone(),
1756 };
1757
1758 match self.search(fuzzy_input).await {
1759 Ok(output) => Ok(output.result),
1760 Err(_) => {
1761 Ok(vec![])
1763 }
1764 }
1765 }
1766 }
1767 }
1768
1769 pub fn get_summary(&self, results: &[SearchResult]) -> String {
1771 match results.len() {
1772 0 => "No results found".to_string(),
1773 1 => format!("Found 1 result in {}", results[0].document.path),
1774 n => {
1775 let languages: std::collections::HashSet<_> =
1776 results.iter().map(|r| &r.document.language).collect();
1777 format!("Found {} results across {} languages", n, languages.len())
1778 }
1779 }
1780 }
1781
1782 pub async fn search_smart(&self, query: &str) -> Vec<SearchResult> {
1784 if let Ok(results) = self.try_search(query, false, None).await
1788 && !results.is_empty()
1789 {
1790 return self.process_search_results(results, query);
1791 }
1792
1793 if let Ok(results) = self.try_search(query, true, Some(0.3)).await
1795 && !results.is_empty()
1796 {
1797 return self.process_search_results(results, query);
1798 }
1799
1800 let partial_query = query
1802 .split_whitespace()
1803 .take(2)
1804 .collect::<Vec<_>>()
1805 .join(" ");
1806 if !partial_query.is_empty()
1807 && partial_query != query
1808 && let Ok(results) = self.try_search(&partial_query, true, Some(0.2)).await
1809 && !results.is_empty()
1810 {
1811 return self.process_search_results(results, query);
1812 }
1813
1814 vec![]
1816 }
1817
1818 async fn try_search(
1820 &self,
1821 query: &str,
1822 fuzzy: bool,
1823 min_score: Option<f32>,
1824 ) -> Result<Vec<SearchResult>, IndexError> {
1825 if self.reader.read().unwrap().is_none() {
1826 return Ok(vec![]);
1827 }
1828
1829 let search_query = SearchQuery {
1830 query: query.to_string(),
1831 language: None,
1832 path_filter: None,
1833 symbol_type: None,
1834 limit: Some(20),
1835 fuzzy,
1836 min_score,
1837 };
1838
1839 let search_input = SearchInput {
1840 query: search_query,
1841 config: self.config.read().unwrap().clone(),
1842 };
1843
1844 match self.search(search_input).await {
1845 Ok(output) => Ok(output.result),
1846 Err(_) => Ok(vec![]), }
1848 }
1849
1850 pub fn enhance_results_for_llm(&self, mut results: Vec<SearchResult>) -> Vec<SearchResult> {
1852 for result in &mut results {
1853 if result.snippets.is_empty() {
1855 result.snippets =
1856 self.generate_snippets(&result.document.content, &result.document.path);
1857 }
1858
1859 for symbol in &mut result.matching_symbols {
1861 if symbol.line == 0 {
1862 if let Some((line_num, col)) =
1864 self.find_symbol_location(&result.document.content, &symbol.name)
1865 {
1866 symbol.line = line_num;
1867 symbol.column = col;
1868 }
1869 }
1870 }
1871 }
1872 results
1873 }
1874
1875 fn find_symbol_location(&self, content: &str, symbol_name: &str) -> Option<(u32, u32)> {
1877 for (line_num, line) in content.lines().enumerate() {
1878 if let Some(col) = line.find(symbol_name) {
1879 return Some(((line_num + 1) as u32, (col + 1) as u32));
1880 }
1881 }
1882 None
1883 }
1884
1885 fn process_search_results(&self, results: Vec<SearchResult>, query: &str) -> Vec<SearchResult> {
1887 let mut scored_results: Vec<SearchResult> = results
1889 .into_iter()
1890 .map(|mut result| {
1891 result.relevance_score = self.calculate_relevance(&result, query);
1892 result
1893 })
1894 .collect();
1895
1896 scored_results = self.deduplicate_results(scored_results);
1898
1899 scored_results.sort_by(|a, b| {
1901 b.relevance_score
1902 .partial_cmp(&a.relevance_score)
1903 .unwrap_or(std::cmp::Ordering::Equal)
1904 });
1905
1906 self.enhance_results_for_llm(scored_results)
1908 }
1909
1910 pub fn deduplicate_results(&self, results: Vec<SearchResult>) -> Vec<SearchResult> {
1912 let mut groups: HashMap<String, Vec<SearchResult>> = HashMap::new();
1913
1914 for result in results {
1916 let group_key = self.generate_group_key(&result);
1917 groups.entry(group_key.clone()).or_default().push(result);
1918 }
1919
1920 let mut deduplicated = Vec::new();
1921
1922 for (group_id, mut group_results) in groups {
1924 if group_results.is_empty() {
1925 continue;
1926 }
1927
1928 group_results.sort_by(|a, b| {
1930 b.relevance_score
1931 .partial_cmp(&a.relevance_score)
1932 .unwrap_or(std::cmp::Ordering::Equal)
1933 });
1934
1935 let merged_results = self.merge_same_file_results(group_results);
1937
1938 for mut result in merged_results {
1940 result.group_id = Some(group_id.clone());
1941 result.similar_count = 1; deduplicated.push(result);
1943 }
1944 }
1945
1946 deduplicated.sort_by(|a, b| {
1948 b.relevance_score
1949 .partial_cmp(&a.relevance_score)
1950 .unwrap_or(std::cmp::Ordering::Equal)
1951 });
1952
1953 deduplicated.truncate(20);
1955
1956 deduplicated
1957 }
1958
1959 fn calculate_relevance(&self, result: &SearchResult, query: &str) -> f32 {
1961 let mut score = result.score.min(1.0).max(0.0);
1962
1963 let query_lower = query.to_lowercase();
1964 let content_lower = result.document.content.to_lowercase();
1965 let path_lower = result.document.path.to_lowercase();
1966
1967 if content_lower.contains(&query_lower) {
1969 score += 0.2;
1970 }
1971
1972 if path_lower.contains(&query_lower) {
1974 score += 0.15;
1975 }
1976
1977 if result.matching_symbols.iter().any(|s| {
1979 s.visibility
1980 .as_ref()
1981 .map(|v| v == "public")
1982 .unwrap_or(false)
1983 }) {
1984 score += 0.15;
1985 }
1986
1987 if self.is_definition(result) {
1989 score += 0.15;
1990 }
1991
1992 for symbol in &result.matching_symbols {
1994 if symbol.name.to_lowercase() == query_lower {
1995 score += 0.25; break;
1997 } else if symbol.name.to_lowercase().contains(&query_lower) {
1998 score += 0.1; }
2000 }
2001
2002 match result.document.language.as_str() {
2004 "rust" | "python" | "javascript" | "typescript" => score += 0.05,
2005 "java" | "go" | "cpp" => score += 0.03,
2006 _ => {} }
2008
2009 if result.document.size > 100_000 {
2011 score -= 0.1;
2012 }
2013
2014 score.min(1.0).max(0.0)
2016 }
2017
2018 fn generate_group_key(&self, result: &SearchResult) -> String {
2020 let mut key_parts = Vec::new();
2021
2022 if let Some(file_name) = Path::new(&result.document.path)
2024 .file_stem()
2025 .and_then(|name| name.to_str())
2026 {
2027 key_parts.push(format!("file:{}", file_name));
2028 }
2029
2030 let mut symbol_types: Vec<String> = result
2032 .matching_symbols
2033 .iter()
2034 .map(|s| s.symbol_type.clone())
2035 .collect::<HashSet<_>>() .into_iter()
2037 .collect();
2038 symbol_types.sort();
2039
2040 if !symbol_types.is_empty() {
2041 key_parts.push(format!("symbols:{}", symbol_types.join(",")));
2042 }
2043
2044 key_parts.push(format!("lang:{}", result.document.language));
2046
2047 if key_parts.len() <= 1 {
2049 let content_preview: String = result
2050 .document
2051 .content
2052 .lines()
2053 .take(3)
2054 .collect::<Vec<_>>()
2055 .join("\n")
2056 .chars()
2057 .take(100)
2058 .collect();
2059 let hash = format!("{:x}", md5::compute(content_preview.as_bytes()));
2060 key_parts.push(format!("content:{}", &hash[..8]));
2061 }
2062
2063 key_parts.join("|")
2064 }
2065
2066 fn merge_same_file_results(&self, results: Vec<SearchResult>) -> Vec<SearchResult> {
2068 let mut file_groups: HashMap<String, Vec<SearchResult>> = HashMap::new();
2069
2070 for result in results {
2072 file_groups
2073 .entry(result.document.path.clone())
2074 .or_default()
2075 .push(result);
2076 }
2077
2078 let mut merged = Vec::new();
2079
2080 for (_, mut file_results) in file_groups {
2081 if file_results.is_empty() {
2082 continue;
2083 }
2084
2085 if file_results.len() == 1 {
2086 merged.extend(file_results);
2087 continue;
2088 }
2089
2090 file_results.sort_by(|a, b| {
2092 b.relevance_score
2093 .partial_cmp(&a.relevance_score)
2094 .unwrap_or(std::cmp::Ordering::Equal)
2095 });
2096
2097 let result_count = file_results.len();
2099
2100 let mut best_result = file_results.into_iter().next().unwrap();
2102
2103 let mut all_snippets: HashSet<String> = HashSet::new();
2105 all_snippets.extend(best_result.snippets.iter().cloned());
2106
2107 let mut all_symbols: HashMap<String, Symbol> = HashMap::new();
2109 for symbol in &best_result.matching_symbols {
2110 all_symbols.insert(symbol.name.clone(), symbol.clone());
2111 }
2112
2113 best_result.similar_count = result_count as u32;
2115
2116 best_result.snippets = all_snippets.into_iter().collect();
2118 best_result.matching_symbols = all_symbols.into_values().collect();
2119
2120 merged.push(best_result);
2121 }
2122
2123 merged
2124 }
2125
2126 fn is_definition(&self, result: &SearchResult) -> bool {
2128 result.matching_symbols.iter().any(|symbol| {
2130 matches!(symbol.symbol_type.as_str(),
2131 "function" | "class" | "struct" | "enum" | "trait" | "interface" | "type"
2132 )
2133 }) ||
2134 result.document.content.lines().any(|line| {
2136 let line_lower = line.to_lowercase();
2137 line_lower.contains("fn ") ||
2138 line_lower.contains("function ") ||
2139 line_lower.contains("class ") ||
2140 line_lower.contains("struct ") ||
2141 line_lower.contains("enum ") ||
2142 line_lower.contains("trait ") ||
2143 line_lower.contains("interface ") ||
2144 line_lower.contains("def ") ||
2145 line_lower.contains("type ")
2146 })
2147 }
2148}
2149
2150#[async_trait::async_trait]
2153impl InternalTool for IndexTool {
2154 type Input = BuildInput;
2155 type Output = ComprehensiveToolOutput<IndexStats>;
2156 type Error = IndexError;
2157
2158 async fn execute(&self, input: Self::Input) -> Result<Self::Output, Self::Error> {
2159 self.build(input).await
2160 }
2161
2162 fn metadata(&self) -> ToolMetadata {
2163 ToolMetadata {
2164 name: "IndexTool".to_string(),
2165 description: "Tantivy-based indexing tool for fast codebase search".to_string(),
2166 version: "1.0.0".to_string(),
2167 author: "AGCodex".to_string(),
2168 }
2169 }
2170}
2171
2172#[cfg(test)]
2173mod tests {
2174 use super::*;
2175
2176 use tempfile::TempDir;
2177
2178 #[tokio::test]
2179 async fn test_index_tool_creation() {
2180 let config = IndexConfig::default();
2181 let tool = IndexTool::new(config).unwrap();
2182
2183 assert!(tool.index.read().unwrap().is_none());
2184 assert!(tool.writer.lock().unwrap().is_none());
2185 assert!(tool.reader.read().unwrap().is_none());
2186 }
2187
2188 #[tokio::test]
2189 async fn test_build_empty_directory() {
2190 let temp_dir = TempDir::new().unwrap();
2191 let index_dir = temp_dir.path().join("index");
2192
2193 let config = IndexConfig {
2194 index_path: index_dir,
2195 ..Default::default()
2196 };
2197
2198 let tool = IndexTool::new(config).unwrap();
2199 let input = BuildInput {
2200 directory: temp_dir.path().to_path_buf(),
2201 config: IndexConfig::default(),
2202 force_rebuild: false,
2203 };
2204
2205 let output = tool.build(input).await.unwrap();
2206 assert_eq!(output.result.document_count, 0);
2207 }
2208
2209 #[tokio::test]
2210 async fn test_language_detection() {
2211 let config = IndexConfig::default();
2212 let tool = IndexTool::new(config).unwrap();
2213
2214 assert_eq!(tool.detect_language(Path::new("test.rs")), "rust");
2215 assert_eq!(tool.detect_language(Path::new("test.py")), "python");
2216 assert_eq!(tool.detect_language(Path::new("test.js")), "javascript");
2217 assert_eq!(tool.detect_language(Path::new("test.unknown")), "unknown");
2218 }
2219
2220 #[tokio::test]
2221 async fn test_schema_creation() {
2222 let schema = IndexTool::build_schema();
2223
2224 assert!(schema.get_field("path").is_ok());
2225 assert!(schema.get_field("content").is_ok());
2226 assert!(schema.get_field("symbols").is_ok());
2227 assert!(schema.get_field("language").is_ok());
2228 assert!(schema.get_field("size").is_ok());
2229 assert!(schema.get_field("modified").is_ok());
2230 assert!(schema.get_field("hash").is_ok());
2231 assert!(schema.get_field("symbol_names").is_ok());
2232 assert!(schema.get_field("symbol_types").is_ok());
2233 assert!(schema.get_field("symbol_docs").is_ok());
2234 }
2235}