codesearch/search/
mod.rs

1use anyhow::Result;
2use colored::Colorize;
3use rayon::prelude::*;
4use serde::Serialize;
5use std::path::{Path, PathBuf};
6use std::time::{Duration, Instant};
7
8use crate::cache::FileMetaStore;
9use crate::chunker::SemanticChunker;
10use crate::embed::{EmbeddingService, ModelType};
11use crate::file::FileWalker;
12use crate::fts::FtsStore;
13use crate::rerank::{rrf_fusion, vector_only, FusedResult, NeuralReranker, DEFAULT_RRF_K};
14use crate::vectordb::VectorStore;
15
/// Configuration options for search operations.
///
/// A value of this struct is passed to `search`; `SearchOptions::default()`
/// supplies the baseline configuration.
#[derive(Debug, Clone)]
pub struct SearchOptions {
    /// Maximum number of results to return. `search` also derives its
    /// candidate retrieval limit from this value.
    pub max_results: usize,
    /// Maximum number of results per file, if limited
    pub per_file: Option<usize>,
    /// Number of content lines to show
    pub content_lines: usize,
    /// Whether to show scores
    pub show_scores: bool,
    /// Compact output mode
    pub compact: bool,
    /// Sync database before search
    pub sync: bool,
    /// JSON output mode
    pub json: bool,
    /// Optional path filter. `search` strips a leading `./` from both the
    /// filter and each result path, then keeps results whose path starts
    /// with the filter.
    pub filter_path: Option<String>,
    /// Optional model override. When set, `search` uses this model name
    /// instead of the one recorded in the database metadata.
    pub model_override: Option<String>,
    /// Vector-only mode (skip FTS)
    pub vector_only: bool,
    /// RRF fusion constant; `search` falls back to `DEFAULT_RRF_K` when `None`
    pub rrf_k: Option<usize>,
    /// Enable neural reranking
    pub rerank: bool,
    /// Number of results to rerank; `search` falls back to `max_results`
    /// when `None`
    pub rerank_top: Option<usize>,
}
46
47impl Default for SearchOptions {
48    fn default() -> Self {
49        Self {
50            max_results: 10,
51            per_file: None,
52            content_lines: 3,
53            show_scores: false,
54            compact: false,
55            sync: false,
56            json: false,
57            filter_path: None,
58            model_override: None,
59            vector_only: false,
60            rrf_k: None,
61            rerank: false,
62            rerank_top: None,
63        }
64    }
65}
66
/// JSON output format for search results.
///
/// Top-level document; serialized through the derived `Serialize` impl.
#[derive(Serialize)]
struct JsonOutput {
    /// Query string echoed in the output (populated outside this view —
    /// presumably the user's original query).
    query: String,
    /// One entry per search result.
    results: Vec<JsonResult>,
    /// Optional timing breakdown; the key is omitted entirely when `None`.
    #[serde(skip_serializing_if = "Option::is_none")]
    timing: Option<JsonTiming>,
}
75
/// A single search result as it appears in the JSON output.
///
/// The three optional fields are left out of the serialized object when they
/// are `None` rather than being emitted as `null`.
#[derive(Serialize)]
struct JsonResult {
    /// Path of the file the result comes from.
    path: String,
    /// First line of the result within the file.
    start_line: usize,
    /// Last line of the result within the file.
    end_line: usize,
    /// Kind of the result as a string (NOTE(review): presumably the chunk
    /// kind name, e.g. a function or struct — confirm where this is filled).
    kind: String,
    /// Text content of the result.
    content: String,
    /// Score attached to the result.
    score: f32,
    /// Optional signature; omitted when `None`.
    #[serde(skip_serializing_if = "Option::is_none")]
    signature: Option<String>,
    /// Optional preceding context; omitted when `None`.
    #[serde(skip_serializing_if = "Option::is_none")]
    context_prev: Option<String>,
    /// Optional following context; omitted when `None`.
    #[serde(skip_serializing_if = "Option::is_none")]
    context_next: Option<String>,
}
91
/// Timing breakdown included in the JSON output.
///
/// Values are presumably milliseconds, per the `_ms` suffix; they are
/// filled in outside this view.
#[derive(Serialize)]
struct JsonTiming {
    /// Total time.
    total_ms: u64,
    /// Time spent embedding the query.
    embed_ms: u64,
    /// Time spent searching.
    search_ms: u64,
    /// Time spent reranking; omitted from the output when `None`.
    #[serde(skip_serializing_if = "Option::is_none")]
    rerank_ms: Option<u64>,
}
100
101/// Get the database path and project path for a given project directory
102/// Uses automatic database discovery to find indexes in parent/global directories
103fn get_db_path(path: Option<PathBuf>) -> Result<(PathBuf, PathBuf)> {
104    use crate::db_discovery::resolve_database_with_message;
105    resolve_database_with_message(path.as_deref(), "searching")
106}
107
108/// Read model metadata from database
109pub fn read_metadata(db_path: &Path) -> Option<(String, usize, Option<String>)> {
110    let metadata_path = db_path.join("metadata.json");
111    if let Ok(content) = std::fs::read_to_string(&metadata_path) {
112        if let Ok(json) = serde_json::from_str::<serde_json::Value>(&content) {
113            let model = json.get("model_short_name")?.as_str()?.to_string();
114            let dims = json.get("dimensions")?.as_u64()? as usize;
115            let primary_language = json
116                .get("primary_language")
117                .and_then(|v| v.as_str())
118                .map(|s| s.to_string());
119            return Some((model, dims, primary_language));
120        }
121    }
122    None
123}
124
/// Collect the whitespace-separated tokens of `query` that look like code
/// identifiers, in their original order.
///
/// A token is kept when at least one of these holds:
/// - PascalCase-like: starts with an uppercase letter, contains a lowercase
///   letter, and is not one of a few capitalised query words ("Find",
///   "Show", "Get", "Where", "How", "What", "All")
/// - snake_case-like: contains `_` and consists only of alphanumerics and `_`
/// - camelCase-like: starts with a lowercase letter and contains an
///   uppercase letter
pub fn detect_identifiers(query: &str) -> Vec<String> {
    // Capitalised words that typically open a question rather than name a type.
    const QUERY_WORDS: [&str; 7] = ["Find", "Show", "Get", "Where", "How", "What", "All"];

    query
        .split_whitespace()
        .filter(|word| {
            let first = word.chars().next();
            let starts_upper = first.map_or(false, char::is_uppercase);
            let starts_lower = first.map_or(false, char::is_lowercase);

            let pascal = starts_upper
                && word.chars().any(char::is_lowercase)
                && !QUERY_WORDS.contains(word);
            let snake =
                word.contains('_') && word.chars().all(|c| c == '_' || c.is_alphanumeric());
            let camel = starts_lower && word.chars().any(char::is_uppercase);

            pascal || snake || camel
        })
        .map(|word| word.to_string())
        .collect()
}
156
157/// Detects structural intent in user queries (e.g., "class X", "function foo")
158/// Returns the ChunkKind that matches the intent, if any
159///
160/// This function now only returns a kind when the query contains BOTH:
161/// 1. A structural keyword (class, struct, function, method, enum, interface, trait)
162/// 2. A PascalCase or snake_case identifier suggesting a specific type/function
163///
164/// This prevents excessive noise where "enum" would boost ALL enums in results
165pub fn detect_structural_intent(query: &str) -> Option<crate::chunker::ChunkKind> {
166    use crate::chunker::ChunkKind;
167
168    let query_lower = query.to_lowercase();
169
170    // Check if query contains a PascalCase or snake_case identifier
171    // This indicates the user is looking for a specific type/function, not just any of that kind
172    let has_identifier = contains_identifier(query);
173
174    eprintln!(
175        "🔍 detect_structural_intent: query='{}', has_identifier={}",
176        query, has_identifier
177    );
178
179    if !has_identifier {
180        return None; // No specific identifier - don't apply kind boost
181    }
182
183    let kind = if query_lower.contains("class ") {
184        Some(ChunkKind::Class)
185    } else if query_lower.contains("struct ") {
186        Some(ChunkKind::Struct)
187    } else if query_lower.contains("function ") || query_lower.contains("fn ") {
188        Some(ChunkKind::Function)
189    } else if query_lower.contains("method ") {
190        Some(ChunkKind::Method)
191    } else if query_lower.contains("enum ") {
192        Some(ChunkKind::Enum)
193    } else if query_lower.contains("interface ") {
194        Some(ChunkKind::Interface)
195    } else if query_lower.contains("trait ") {
196        Some(ChunkKind::Trait)
197    } else {
198        None
199    };
200
201    eprintln!("🔍 detect_structural_intent: kind={:?}", kind);
202    kind
203}
204
/// Reports whether `query` contains something shaped like a code identifier,
/// which suggests a specific type/function name is being searched for.
///
/// Regex-free heuristic over adjacent characters anywhere in the query:
/// - PascalCase: an uppercase letter directly followed by a lowercase letter
///   or an ASCII digit
/// - camelCase: a lowercase letter directly followed by an uppercase letter
/// - snake_case: an underscore with a lowercase letter on each side
fn contains_identifier(query: &str) -> bool {
    let chars: Vec<char> = query.chars().collect();

    // Both case-based patterns only need to inspect neighbouring pairs.
    let has_case_transition = chars.windows(2).any(|pair| {
        let (left, right) = (pair[0], pair[1]);
        let pascal = left.is_uppercase() && (right.is_lowercase() || right.is_ascii_digit());
        let camel = left.is_lowercase() && right.is_uppercase();
        pascal || camel
    });
    if has_case_transition {
        return true;
    }

    // snake_case needs the character on either side of the underscore.
    chars
        .windows(3)
        .any(|triple| triple[1] == '_' && triple[0].is_lowercase() && triple[2].is_lowercase())
}
239
240/// Boosts results that match a specific ChunkKind by a factor
241pub fn boost_kind(
242    results: &mut Vec<crate::vectordb::SearchResult>,
243    target_kind: crate::chunker::ChunkKind,
244) {
245    let boost_factor = 0.15; // 15% boost for matching kind
246                             // Convert ChunkKind to string for comparison
247    let target_kind_str = format!("{:?}", target_kind);
248    for result in results.iter_mut() {
249        if result.kind == target_kind_str {
250            result.score *= 1.0 + boost_factor;
251        }
252    }
253    // Re-sort after boosting
254    results.sort_by(|a, b| b.score.partial_cmp(&a.score).unwrap());
255}
256
/// Expand a query into a small set of variants to embed alongside the original.
///
/// The original query is always the first element. Queries shorter than 4 or
/// longer than 50 bytes are returned unexpanded (short ones gain little, long
/// ones are already descriptive). Otherwise variants depend on the query shape:
///
/// - `handle_file_modified` (contains `_`, no spaces) →
///   `fn …`, `async fn …`, `pub fn …`, `… method`, `Function: …`
/// - `UserService` (starts uppercase, no spaces) →
///   `struct …`, `impl …`, `enum …`, `class …`, `Struct: …`
/// - `parser` (single lowercase-initial word without `_`) →
///   `fn parser`, `parser function`
/// - `auth_token` (contains a known abbreviation as a whole word) →
///   additionally `authentication_token`
///
/// Abbreviations are expanded only where they stand alone as a word segment.
/// A query that already spells the word out (`authentication`,
/// `error_handler`) is therefore left alone instead of being rewritten into
/// nonsense such as `authenticationentication`.
///
/// At most two abbreviation expansions are added, and the result is capped at
/// nine entries (the original plus eight variants). When `CODESEARCH_VERBOSE`
/// is set and at least one variant was added, the count is logged to stderr.
fn expand_query(query: &str) -> Vec<String> {
    /// Replace every whole-word occurrence of `abbr` in `text` with `full`.
    ///
    /// An occurrence counts as a whole word when it is preceded by the start
    /// of the text or a non-alphanumeric character, and followed by the end of
    /// the text, a non-alphanumeric character, or an uppercase letter (a
    /// camelCase hump, as in `authService`). Returns `None` when nothing was
    /// replaced.
    fn expand_abbreviation(text: &str, abbr: &str, full: &str) -> Option<String> {
        let mut expanded = String::with_capacity(text.len() + full.len());
        let mut cursor = 0;
        let mut replaced = false;

        for (start, matched) in text.match_indices(abbr) {
            let end = start + matched.len();
            let bounded_before = text[..start]
                .chars()
                .next_back()
                .map_or(true, |c| !c.is_alphanumeric());
            let bounded_after = text[end..]
                .chars()
                .next()
                .map_or(true, |c| !c.is_alphanumeric() || c.is_uppercase());

            expanded.push_str(&text[cursor..start]);
            if bounded_before && bounded_after {
                expanded.push_str(full);
                replaced = true;
            } else {
                expanded.push_str(matched);
            }
            cursor = end;
        }

        if !replaced {
            return None;
        }
        expanded.push_str(&text[cursor..]);
        Some(expanded)
    }

    // Per-category limits on the number of added variants.
    const MAX_FUNCTION_VARIANTS: usize = 5;
    const MAX_TYPE_VARIANTS: usize = 5;
    const MAX_CONCEPT_VARIANTS: usize = 2;
    const MAX_ABBREV_VARIANTS: usize = 2;
    // Original + at most 8 additional variants.
    const MAX_TOTAL_VARIANTS: usize = 9;

    // Only a few common abbreviations are expanded.
    const ABBREVIATIONS: &[(&str, &str)] = &[
        ("auth", "authentication"),
        ("config", "configuration"),
        ("db", "database"),
        ("conn", "connection"),
        ("err", "error"),
        ("msg", "message"),
    ];

    // Always include the original query first.
    let mut variants = vec![query.to_string()];

    // Very short or very long queries are not expanded.
    if query.len() < 4 || query.len() > 50 {
        return variants;
    }

    let single_token = !query.contains(' ');
    let first_char = query.chars().next();

    // snake_case without spaces: probably a function name.
    let looks_like_function = single_token && query.contains('_');
    // Starts uppercase without spaces: probably a type name.
    let looks_like_type = single_token && first_char.map_or(false, char::is_uppercase);
    // A single lowercase-initial word without underscores: probably a concept.
    let is_single_concept =
        single_token && !query.contains('_') && first_char.map_or(false, char::is_lowercase);

    if looks_like_function {
        variants.push(format!("fn {}", query));
        variants.push(format!("async fn {}", query));
        variants.push(format!("pub fn {}", query));

        // `len() - 1` is the number of variants added so far (excludes the original).
        if variants.len() - 1 < MAX_FUNCTION_VARIANTS {
            variants.push(format!("{} method", query));
        }
        if variants.len() - 1 < MAX_FUNCTION_VARIANTS {
            variants.push(format!("Function: {}", query));
        }
    }

    if looks_like_type {
        variants.push(format!("struct {}", query));
        variants.push(format!("impl {}", query));
        variants.push(format!("enum {}", query));

        if variants.len() - 1 < MAX_TYPE_VARIANTS {
            variants.push(format!("class {}", query));
        }
        if variants.len() - 1 < MAX_TYPE_VARIANTS {
            variants.push(format!("Struct: {}", query));
        }
    }

    if is_single_concept {
        variants.push(format!("fn {}", query));
        if variants.len() - 1 < MAX_CONCEPT_VARIANTS {
            variants.push(format!("{} function", query));
        }
    }

    // Whole-word abbreviation expansions, in table order, at most two.
    variants.extend(
        ABBREVIATIONS
            .iter()
            .filter_map(|(abbr, full)| expand_abbreviation(query, abbr, full))
            .take(MAX_ABBREV_VARIANTS),
    );

    // Cap total variants to avoid excessive embedding/search work.
    variants.truncate(MAX_TOTAL_VARIANTS);

    // Log variant count for monitoring (verbose mode only).
    if std::env::var("CODESEARCH_VERBOSE").is_ok() && variants.len() > 1 {
        eprintln!(
            "[optimization] Query expansion: {} -> {} variants (original + {} expansions)",
            query,
            variants.len(),
            variants.len() - 1
        );
    }

    variants
}
392
393/// Detect query type and adapt RRF-k accordingly
394/// Returns (vector_k, fts_k) based on query characteristics
395pub fn adapt_rrf_k(query: &str) -> (f64, f64) {
396    let has_identifiers = !detect_identifiers(query).is_empty();
397    let has_structural_intent = detect_structural_intent(query).is_some();
398
399    match (has_identifiers, has_structural_intent) {
400        // Identifier queries: Prioritize vector search (semantic similarity)
401        (true, _) => (12.0, 28.0), // Lower vector k, higher FTS k
402
403        // Structural queries: Balance both
404        (_, true) => (15.0, 25.0),
405
406        // Semantic queries: Balanced
407        _ => (20.0, 20.0),
408    }
409}
410
411/// Search the codebase
412pub async fn search(query: &str, path: Option<PathBuf>, options: SearchOptions) -> Result<()> {
413    let (db_path, _project_path) = get_db_path(path)?;
414
415    if !db_path.exists() {
416        println!("{}", "❌ No database found!".red());
417        println!("   Run {} first", "codesearch index".bright_cyan());
418        println!();
419        println!(
420            "{}",
421            "💡 Tip: codesearch can find databases in parent directories. Use 'codesearch list' to see all indexed projects.".dimmed()
422        );
423        return Ok(());
424    }
425
426    // Read model metadata from database FIRST (needed for sync)
427    let (model_type, dimensions, primary_language) =
428        if let Some(ref model_name) = options.model_override {
429            // User specified a model - use it (warning: may not match indexed data!)
430            let mt = ModelType::parse(model_name).unwrap_or_default();
431            (mt, mt.dimensions(), None)
432        } else if let Some((model_name, dims, lang)) = read_metadata(&db_path) {
433            // Use model from metadata
434            if let Some(mt) = ModelType::parse(&model_name) {
435                (mt, dims, lang)
436            } else {
437                // Model name not recognized, fall back to default
438                eprintln!(
439                    "{}",
440                    "⚠️  Unknown model in metadata, using default".yellow()
441                );
442                (ModelType::default(), 384, None)
443            }
444        } else {
445            // No metadata, fall back to default
446            (ModelType::default(), 384, None)
447        };
448
449    // Perform incremental sync if requested (after we know the model)
450    if options.sync {
451        println!("{}", "🔄 Syncing database...".yellow());
452        sync_database(&db_path, model_type)?;
453    }
454
455    // Load database
456    let start = Instant::now();
457    let store = VectorStore::new(&db_path, dimensions)?;
458    let load_duration = start.elapsed();
459
460    // Initialize embedding service with the correct model
461    let start = Instant::now();
462    let cache_dir = crate::constants::get_global_models_cache_dir()?;
463    let mut embedding_service = EmbeddingService::with_cache_dir(model_type, Some(&cache_dir))?;
464    let model_load_duration = start.elapsed();
465
466    // Expand query with variants for better matching
467    let query_variants = expand_query(query);
468
469    // Embed all query variants in a single batch (OPTIMIZATION: batched ONNX calls)
470    let start = Instant::now();
471    let all_query_embeddings = embedding_service.embed_queries_batch(&query_variants)?;
472
473    let embed_duration = start.elapsed();
474
475    // Search - hybrid by default, vector-only if requested
476    let start = Instant::now();
477
478    // Adaptive retrieval limit based on query type and max_results
479    // For semantic queries, we need more candidates for good RRF fusion
480    // For exact identifier queries, fewer candidates may suffice
481    let has_identifiers = !detect_identifiers(query).is_empty();
482    let retrieval_limit = if options.vector_only {
483        options.max_results
484    } else if has_identifiers {
485        // Identifier queries: fetch fewer results as exact matches are prioritized
486        std::cmp::max(options.max_results * 3, 100)
487    } else {
488        // Semantic queries: need more candidates for good fusion
489        std::cmp::max(options.max_results * 5, 200)
490    };
491
492    // Search with all query variants in parallel and combine results
493    // OPTIMIZATION: Use efficient deduplication with top-N tracking
494    use std::collections::BinaryHeap;
495
496    let vector_search_results: Vec<Vec<crate::vectordb::SearchResult>> = all_query_embeddings
497        .par_iter()
498        .map(|query_emb| store.search(query_emb, retrieval_limit))
499        .collect::<Result<Vec<_>>>()?;
500
501    // OPTIMIZATION: Deduplicate with top-N tracking using BinaryHeap
502    // This avoids collecting all results and then truncating
503    struct HeapEntry {
504        id: u32,
505        score: f32,
506        distance: f32,
507    }
508
509    impl PartialEq for HeapEntry {
510        fn eq(&self, other: &Self) -> bool {
511            self.id == other.id
512        }
513    }
514
515    impl Eq for HeapEntry {}
516
517    impl PartialOrd for HeapEntry {
518        fn partial_cmp(&self, other: &Self) -> Option<std::cmp::Ordering> {
519            Some(self.cmp(other))
520        }
521    }
522
523    impl Ord for HeapEntry {
524        fn cmp(&self, other: &Self) -> std::cmp::Ordering {
525            // Max-heap based on score
526            self.score
527                .partial_cmp(&other.score)
528                .unwrap_or(std::cmp::Ordering::Equal)
529        }
530    }
531
532    // Track top results per chunk ID AND keep one full result per ID
533    let mut top_by_id: std::collections::HashMap<u32, HeapEntry> = std::collections::HashMap::new();
534    let mut full_results_by_id: std::collections::HashMap<u32, crate::vectordb::SearchResult> =
535        std::collections::HashMap::new();
536
537    for results in vector_search_results {
538        for result in results {
539            top_by_id
540                .entry(result.id)
541                .and_modify(|e| {
542                    if result.score > e.score {
543                        e.score = result.score;
544                        e.distance = result.distance;
545                        // Update the stored full result
546                        full_results_by_id.insert(result.id, result.clone());
547                    }
548                })
549                .or_insert_with(|| {
550                    let entry = HeapEntry {
551                        id: result.id,
552                        score: result.score,
553                        distance: result.distance,
554                    };
555                    full_results_by_id.insert(result.id, result.clone());
556                    entry
557                });
558        }
559    }
560
561    // Convert to heap and extract top N
562    let mut heap: BinaryHeap<HeapEntry> = top_by_id.into_values().collect();
563    let mut vector_results: Vec<crate::vectordb::SearchResult> =
564        Vec::with_capacity(retrieval_limit);
565
566    while let Some(entry) = heap.pop() {
567        if vector_results.len() >= retrieval_limit {
568            break;
569        }
570        if let Some(mut result) = full_results_by_id.get(&entry.id).cloned() {
571            result.score = entry.score;
572            result.distance = entry.distance;
573            vector_results.push(result);
574        }
575    }
576
577    // Sort by score descending
578    vector_results.sort_by(|a, b| b.score.partial_cmp(&a.score).unwrap());
579
580    // OPTIMIZATION: Early termination for high-confidence exact matches
581    // If top results have very high confidence (very low distance), skip FTS search
582    // This saves ~30-50ms per search for queries with clear matches
583    const HIGH_CONFIDENCE_THRESHOLD: f32 = 0.15; // Distance < 0.15 = very high confidence
584    const EARLY_TERMINATION_TOP_N: usize = 5; // Check top 5 results
585
586    let should_use_vector_only = !options.vector_only && {
587        // Check if top N results all have high confidence
588        let top_results: Vec<_> = vector_results
589            .iter()
590            .take(EARLY_TERMINATION_TOP_N.min(vector_results.len()))
591            .collect();
592
593        let all_high_confidence = top_results
594            .iter()
595            .all(|r| r.distance < HIGH_CONFIDENCE_THRESHOLD);
596
597        // Also ensure we have at least one result
598        !top_results.is_empty() && all_high_confidence
599    };
600
601    // Use vector-only mode if early termination conditions are met
602    let vector_only_mode = options.vector_only || should_use_vector_only;
603
604    // OPTIMIZATION: Log early termination for monitoring
605    if should_use_vector_only && !options.vector_only {
606        eprintln!(
607            "{}",
608            "⚡ Early termination: High-confidence results found, skipping FTS search".green()
609        );
610    }
611
612    let fused_results: Vec<FusedResult> = if vector_only_mode {
613        // Vector-only mode
614        vector_only(&vector_results)
615    } else {
616        // Hybrid search with RRF fusion
617        match FtsStore::new(&db_path) {
618            Ok(fts_store) => {
619                // Detect identifiers for exact match boosting
620                let identifiers = detect_identifiers(query);
621                // Detect structural intent for kind field boosting
622                let structural_intent = detect_structural_intent(query);
623
624                if identifiers.is_empty() {
625                    // No identifiers - standard hybrid search
626                    let fts_results =
627                        fts_store.search(query, retrieval_limit, structural_intent)?;
628                    let k = options.rrf_k.unwrap_or(DEFAULT_RRF_K as usize) as f32;
629                    rrf_fusion(&vector_results, &fts_results, k)
630                } else {
631                    // Has identifiers - use exact match boosting
632                    let fts_results =
633                        fts_store.search(query, retrieval_limit, structural_intent)?;
634
635                    // Search for each identifier and combine exact results
636                    let mut all_exact_results = Vec::new();
637                    let mut seen_exact_ids = std::collections::HashSet::new();
638
639                    for identifier in &identifiers {
640                        if let Ok(exact_matches) =
641                            fts_store.search_exact(identifier, retrieval_limit, structural_intent)
642                        {
643                            for exact_match in exact_matches {
644                                // Deduplicate exact results by chunk ID
645                                if seen_exact_ids.insert(exact_match.chunk_id) {
646                                    all_exact_results.push(exact_match);
647                                }
648                            }
649                        }
650                    }
651
652                    // Use adaptive RRF-k based on query type
653                    let (vector_k, fts_k) = adapt_rrf_k(query);
654                    let k = options.rrf_k.unwrap_or(DEFAULT_RRF_K as usize) as f32;
655                    // Use the smaller of user-specified k and adaptive k (more conservative)
656                    let vector_k_adaptive = vector_k.min(k as f64) as f32;
657                    let fts_k_adaptive = fts_k.min(k as f64) as f32;
658
659                    use crate::rerank::{rrf_fusion_with_exact, EXACT_MATCH_RRF_K};
660                    rrf_fusion_with_exact(
661                        &vector_results,
662                        &fts_results,
663                        &all_exact_results,
664                        vector_k_adaptive,
665                        fts_k_adaptive,
666                        EXACT_MATCH_RRF_K,
667                    )
668                }
669            }
670            Err(_) => {
671                // FTS not available, fall back to vector-only
672                eprintln!(
673                    "{}",
674                    "⚠️  FTS index not found, using vector-only search".yellow()
675                );
676                vector_only(&vector_results)
677            }
678        }
679    };
680
681    // Map fused results back to full SearchResult
682    let mut results: Vec<crate::vectordb::SearchResult> = Vec::new();
683    let chunk_id_to_result: std::collections::HashMap<u32, &crate::vectordb::SearchResult> =
684        vector_results.iter().map(|r| (r.id, r)).collect();
685
686    // OPTIMIZATION: Apply path filter BEFORE expensive operations (reranking, boosting)
687    // This avoids processing results that will be filtered out anyway
688    let should_filter_by_path = options.filter_path.is_some();
689    let filter_path_normalized = options
690        .filter_path
691        .as_ref()
692        .map(|f| f.trim_start_matches("./").to_string());
693
694    // Take top rerank_top results for reranking (or max_results if not reranking)
695    // OPTIMIZATION: Take extra results when path filtering is active to ensure we have enough after filtering
696    let take_multiplier = if should_filter_by_path { 3 } else { 1 };
697    let take_count = if options.rerank {
698        options
699            .rerank_top
700            .unwrap_or(options.max_results)
701            .min(fused_results.len())
702    } else {
703        options.max_results * take_multiplier
704    };
705
706    for fused in fused_results.iter().take(take_count) {
707        if let Some(result) = chunk_id_to_result.get(&fused.chunk_id) {
708            // OPTIMIZATION: Skip early if path filter doesn't match
709            if should_filter_by_path {
710                if let Some(ref filter) = filter_path_normalized {
711                    let path_normalized = result.path.trim_start_matches("./");
712                    if !path_normalized.starts_with(filter) {
713                        continue;
714                    }
715                }
716            }
717
718            // Update score to RRF score
719            let mut r = (*result).clone();
720            r.score = fused.rrf_score;
721            results.push(r);
722        } else {
723            // Result only from FTS, need to fetch from store
724            if let Ok(Some(mut result)) = store.get_chunk_as_result(fused.chunk_id) {
725                // OPTIMIZATION: Skip early if path filter doesn't match
726                if should_filter_by_path {
727                    if let Some(ref filter) = filter_path_normalized {
728                        let path_normalized = result.path.trim_start_matches("./");
729                        if !path_normalized.starts_with(filter) {
730                            continue;
731                        }
732                    }
733                }
734
735                result.score = fused.rrf_score;
736                results.push(result);
737            }
738        }
739    }
740
741    // Log path filtering optimization (verbose mode)
742    if should_filter_by_path {
743        let candidates_processed = take_count;
744        let results_after_filtering = results.len();
745        let filtered_out = candidates_processed.saturating_sub(results_after_filtering);
746        eprintln!(
747            "{}",
748            format!(
749                "🔍 Path filter '{}': {} candidates → {} results ({} filtered out)",
750                filter_path_normalized.as_ref().unwrap_or(&"".to_string()),
751                candidates_processed,
752                results_after_filtering,
753                filtered_out
754            )
755            .blue()
756        );
757    }
758
759    // Language awareness: Boost results from primary language
760    // Extract language from file path (since SearchResult doesn't have language field)
761    if let Some(ref lang) = primary_language {
762        use crate::file::Language;
763        let lang_boost = 0.2; // Boost results from primary language by 20%
764        for result in results.iter_mut() {
765            // Detect language from file path
766            let file_lang = format!(
767                "{:?}",
768                Language::from_path(std::path::Path::new(&result.path))
769            );
770            if file_lang == *lang {
771                result.score *= 1.0 + lang_boost;
772            }
773        }
774        // Re-sort after boosting
775        results.sort_by(|a, b| b.score.partial_cmp(&a.score).unwrap());
776    }
777
778    // ChunkKind-Aware Ranking: Boost results matching structural intent
779    if let Some(intent) = detect_structural_intent(query) {
780        boost_kind(&mut results, intent);
781    }
782
783    // Negative Result Check: Report when no exact matches found for identifier queries
784    let identifiers = detect_identifiers(query);
785    if !identifiers.is_empty() && results.is_empty() {
786        eprintln!(
787            "{}",
788            format!(
789                "❓ No exact matches found for identifiers: {}",
790                identifiers.join(", ")
791            )
792            .yellow()
793        );
794        eprintln!("{}", "  Try using broader search terms or running `codesearch index --sync` if the codebase changed.".dimmed());
795    }
796
797    let search_duration = start.elapsed();
798
799    // Neural reranking (if enabled)
800    let mut rerank_duration = Duration::ZERO;
801    if options.rerank && !results.is_empty() {
802        let start = Instant::now();
803
804        // Initialize neural reranker (Jina Reranker v1 Turbo)
805        match NeuralReranker::new() {
806            Ok(mut reranker) => {
807                // Prepare documents for reranking
808                let documents: Vec<String> = results.iter().map(|r| r.content.clone()).collect();
809                let rrf_scores: Vec<f32> = results.iter().map(|r| r.score).collect();
810
811                // Rerank and blend scores
812                match reranker.rerank_and_blend(query, &documents, &rrf_scores) {
813                    Ok(reranked) => {
814                        // Reorder results based on reranked indices
815                        let mut reordered: Vec<crate::vectordb::SearchResult> =
816                            Vec::with_capacity(results.len());
817                        for (idx, score) in reranked {
818                            let mut result = results[idx].clone();
819                            result.score = score;
820                            reordered.push(result);
821                        }
822                        results = reordered;
823                        println!("{}", "✅ Neural reranking applied".green());
824                    }
825                    Err(e) => {
826                        eprintln!("{}", format!("⚠️  Reranking failed: {}", e).yellow());
827                    }
828                }
829            }
830            Err(e) => {
831                eprintln!("{}", format!("⚠️  Could not load reranker: {}", e).yellow());
832            }
833        }
834
835        rerank_duration = start.elapsed();
836    }
837
838    // Filter by path if specified
839    if let Some(ref filter) = options.filter_path {
840        let filter_normalized = filter.trim_start_matches("./");
841        results.retain(|r| {
842            let path_normalized = r.path.trim_start_matches("./");
843            path_normalized.starts_with(filter_normalized)
844        });
845    }
846
847    // Truncate to max_results after reranking and filtering
848    results.truncate(options.max_results);
849
850    // Output results
851    if options.json {
852        let json_results: Vec<JsonResult> = results
853            .iter()
854            .map(|r| JsonResult {
855                path: r.path.clone(),
856                start_line: r.start_line,
857                end_line: r.end_line,
858                kind: r.kind.clone(),
859                content: r.content.clone(),
860                score: r.score,
861                signature: r.signature.clone(),
862                context_prev: r.context_prev.clone(),
863                context_next: r.context_next.clone(),
864            })
865            .collect();
866
867        let timing = if options.show_scores {
868            Some(JsonTiming {
869                total_ms: (load_duration
870                    + model_load_duration
871                    + embed_duration
872                    + search_duration
873                    + rerank_duration)
874                    .as_millis() as u64,
875                embed_ms: embed_duration.as_millis() as u64,
876                search_ms: search_duration.as_millis() as u64,
877                rerank_ms: if options.rerank {
878                    Some(rerank_duration.as_millis() as u64)
879                } else {
880                    None
881                },
882            })
883        } else {
884            None
885        };
886
887        let output = JsonOutput {
888            query: query.to_string(),
889            results: json_results,
890            timing,
891        };
892
893        println!("{}", serde_json::to_string(&output)?);
894        return Ok(());
895    }
896
897    if options.compact {
898        // Show only file paths (like grep -l)
899        let mut seen_files = std::collections::HashSet::new();
900        for result in &results {
901            if !seen_files.contains(&result.path) {
902                println!("{}", result.path);
903                seen_files.insert(result.path.clone());
904            }
905        }
906        return Ok(());
907    }
908
909    // Standard output
910    println!("{}", "🔍 Search Results".bright_cyan().bold());
911    println!("{}", "=".repeat(60));
912    println!("Query: \"{}\"", query.bright_yellow());
913    println!("Found {} results", results.len());
914    println!();
915
916    if options.show_scores {
917        println!("Timing:");
918        println!("   Database load: {:?}", load_duration);
919        println!("   Model load:    {:?}", model_load_duration);
920        println!("   Query embed:   {:?}", embed_duration);
921        println!("   Search:        {:?}", search_duration);
922        if options.rerank {
923            println!("   Reranking:     {:?}", rerank_duration);
924        }
925        println!(
926            "   Total:         {:?}",
927            load_duration
928                + model_load_duration
929                + embed_duration
930                + search_duration
931                + rerank_duration
932        );
933        println!();
934    }
935
936    // Check if no results
937    if results.is_empty() {
938        println!("{}", "No matches found.".dimmed());
939        println!("Try:");
940        println!("  - Using different keywords");
941        println!("  - Making your query more general");
942        println!(
943            "  - Running {} if the codebase changed",
944            "codesearch index --force".bright_cyan()
945        );
946        return Ok(());
947    }
948
949    // Group results by file if per_file > 0
950    if let Some(per_file) = options.per_file {
951        if per_file > 0 && per_file < options.max_results {
952            let mut by_file: std::collections::HashMap<String, Vec<_>> =
953                std::collections::HashMap::new();
954
955            for result in results {
956                by_file.entry(result.path.clone()).or_default().push(result);
957            }
958
959            let mut files: Vec<_> = by_file.into_iter().collect();
960            files.sort_by(|a, b| {
961                b.1.iter()
962                    .map(|r| r.score)
963                    .fold(0.0f32, f32::max)
964                    .partial_cmp(&a.1.iter().map(|r| r.score).fold(0.0f32, f32::max))
965                    .unwrap()
966            });
967
968            for (_file_path, mut file_results) in files {
969                file_results.sort_by(|a, b| b.score.partial_cmp(&a.score).unwrap());
970                file_results.truncate(per_file);
971
972                for (idx, result) in file_results.iter().enumerate() {
973                    print_result(
974                        result,
975                        idx == 0,
976                        options.content_lines > 0,
977                        options.show_scores,
978                    )?;
979                }
980            }
981        } else {
982            // Show all results
983            for result in &results {
984                print_result(result, true, options.content_lines > 0, options.show_scores)?;
985            }
986        }
987    } else {
988        // Show all results
989        for result in &results {
990            print_result(result, true, options.content_lines > 0, options.show_scores)?;
991        }
992    }
993
994    Ok(())
995}
996
997/// Sync database by re-indexing changed files
998fn sync_database(db_path: &Path, model_type: ModelType) -> Result<()> {
999    let project_path = db_path.parent().unwrap_or(std::path::Path::new("."));
1000
1001    // Load file metadata store
1002    let mut file_meta =
1003        FileMetaStore::load_or_create(db_path, model_type.short_name(), model_type.dimensions())?;
1004
1005    // Walk the file system
1006    let walker = FileWalker::new(project_path.to_path_buf());
1007    let (files, _stats) = walker.walk()?;
1008
1009    // Initialize services
1010    let cache_dir = crate::constants::get_global_models_cache_dir()?;
1011    let mut embedding_service = EmbeddingService::with_cache_dir(model_type, Some(&cache_dir))?;
1012    let mut chunker = SemanticChunker::new(100, 2000, 10);
1013    let mut store = VectorStore::new(db_path, model_type.dimensions())?;
1014
1015    let mut changes = 0;
1016
1017    // Check for changed files
1018    for file in &files {
1019        let (needs_reindex, old_chunk_ids) = file_meta.check_file(&file.path)?;
1020
1021        if !needs_reindex {
1022            continue;
1023        }
1024
1025        changes += 1;
1026        println!("  📝 {}", file.path.display());
1027
1028        // Delete old chunks
1029        if !old_chunk_ids.is_empty() {
1030            store.delete_chunks(&old_chunk_ids)?;
1031        }
1032
1033        // Read and chunk file
1034        let source_code = match std::fs::read_to_string(&file.path) {
1035            Ok(content) => content,
1036            Err(_) => continue,
1037        };
1038
1039        let chunks = chunker.chunk_semantic(file.language, &file.path, &source_code)?;
1040
1041        if chunks.is_empty() {
1042            file_meta.update_file(&file.path, vec![])?;
1043            continue;
1044        }
1045
1046        // Embed and insert
1047        let embedded_chunks = embedding_service.embed_chunks(chunks)?;
1048        let chunk_ids = store.insert_chunks_with_ids(embedded_chunks)?;
1049        file_meta.update_file(&file.path, chunk_ids)?;
1050    }
1051
1052    // Check for deleted files
1053    let deleted_files = file_meta.find_deleted_files();
1054    for (path, chunk_ids) in &deleted_files {
1055        changes += 1;
1056        println!("  🗑️  {} (deleted)", path);
1057        if !chunk_ids.is_empty() {
1058            store.delete_chunks(chunk_ids)?;
1059        }
1060        file_meta.remove_file(std::path::Path::new(path));
1061    }
1062
1063    // Rebuild index if changes were made
1064    if changes > 0 {
1065        println!("  🔨 Rebuilding index...");
1066        store.build_index()?;
1067        file_meta.save(db_path)?;
1068        println!("  ✅ {} file(s) synced", changes);
1069    } else {
1070        println!("  ✅ Already up to date");
1071    }
1072
1073    Ok(())
1074}
1075
1076fn print_result(
1077    result: &crate::vectordb::SearchResult,
1078    show_file: bool,
1079    show_content: bool,
1080    show_scores: bool,
1081) -> Result<()> {
1082    if show_file {
1083        println!("{}", "─".repeat(60));
1084        let file_display = format!("📄 {}", result.path);
1085        println!("{}", file_display.bright_green());
1086    }
1087
1088    // Show location and kind
1089    let location = format!(
1090        "   Lines {}-{} • {}",
1091        result.start_line, result.end_line, result.kind
1092    );
1093    println!("{}", location.dimmed());
1094
1095    // Show signature if available
1096    if let Some(sig) = &result.signature {
1097        println!("   {}", sig.bright_cyan());
1098    }
1099
1100    // Show score if requested
1101    if show_scores {
1102        let score_color = if result.score > 0.8 {
1103            "green"
1104        } else if result.score > 0.6 {
1105            "yellow"
1106        } else {
1107            "red"
1108        };
1109
1110        let score_text = format!("   Score: {:.3}", result.score);
1111        println!(
1112            "{}",
1113            match score_color {
1114                "green" => score_text.green(),
1115                "yellow" => score_text.yellow(),
1116                _ => score_text.red(),
1117            }
1118        );
1119    }
1120
1121    // Show context if available
1122    if let Some(ctx) = &result.context {
1123        println!("   Context: {}", ctx.dimmed());
1124    }
1125
1126    // Show content if requested
1127    if show_content {
1128        // Show context before (if available)
1129        if let Some(ctx_prev) = &result.context_prev {
1130            println!("\n   {}:", "Context (before)".dimmed());
1131            for line in ctx_prev.lines() {
1132                println!("   │ {}", line.bright_black());
1133            }
1134        }
1135
1136        println!("\n   {}:", "Content".bright_yellow());
1137        for line in result.content.lines().take(10) {
1138            println!("   │ {}", line.dimmed());
1139        }
1140        if result.content.lines().count() > 10 {
1141            println!("   │ {}", "...".dimmed());
1142        }
1143
1144        // Show context after (if available)
1145        if let Some(ctx_next) = &result.context_next {
1146            println!("\n   {}:", "Context (after)".dimmed());
1147            for line in ctx_next.lines() {
1148                println!("   │ {}", line.bright_black());
1149            }
1150        }
1151    } else {
1152        // Show a snippet
1153        let snippet: String = result.content.lines().take(3).collect::<Vec<_>>().join(" ");
1154
1155        let snippet = if snippet.len() > 100 {
1156            format!("{}...", &snippet[..100])
1157        } else {
1158            snippet
1159        };
1160
1161        println!("   {}", snippet.dimmed());
1162    }
1163
1164    println!();
1165
1166    Ok(())
1167}
codesearch/search/mod.rs

codesearch/search/
mod.rs