Skip to main content

qex_core/index/
mod.rs

1pub mod storage;
2
3use crate::chunk::multi_language::MultiLanguageChunker;
4use crate::ignore::walk_files;
5use crate::merkle::change_detector::ChangeDetector;
6use crate::merkle::snapshot::SnapshotManager;
7use crate::merkle::MerkleDAG;
8use crate::search::bm25::BM25Index;
9use crate::search::query::analyze_query;
10use crate::search::ranking::rank_results;
11use crate::search::SearchResult;
12use anyhow::{Context, Result};
13use serde::{Deserialize, Serialize};
14use std::collections::HashSet;
15use std::path::Path;
16use std::time::Instant;
17use storage::ProjectStorage;
18use tracing::{debug, info, warn};
19
20#[cfg(feature = "dense")]
21use crate::search::dense::DenseIndex;
22#[cfg(feature = "dense")]
23use crate::search::hybrid::reciprocal_rank_fusion;
24#[cfg(feature = "dense")]
25use std::collections::HashMap;
26
/// Directory names handed to `MerkleDAG::build` as its ignore list.
///
/// The entries are version-control metadata, dependency and virtual-environment
/// folders, build output, caches, and `.qex`.
const MERKLE_IGNORE_DIRS: &[&str] = &[
    "__pycache__",
    // Version-control metadata
    ".git", ".hg", ".svn",
    // Dependencies and virtual environments
    "node_modules", ".venv", "venv",
    // Build output
    "target", "build", "dist", ".next",
    // Caches and `.qex`
    ".cache", ".qex",
];
43
/// Snapshot age, in seconds, above which the snapshot is considered stale and an
/// incremental re-index is triggered (five minutes).
const MAX_SNAPSHOT_AGE_SECS: i64 = 5 * 60;
46
47/// Result of an indexing operation
48#[derive(Debug, Clone, Serialize, Deserialize)]
49pub struct IndexResult {
50    pub files_indexed: usize,
51    pub chunks_created: usize,
52    pub time_taken_ms: u64,
53    pub languages: Vec<String>,
54    pub incremental: bool,
55    pub files_added: usize,
56    pub files_removed: usize,
57    pub files_modified: usize,
58}
59
60/// Indexing status for a project
61#[derive(Debug, Clone, Serialize, Deserialize)]
62pub struct IndexStatus {
63    pub indexed: bool,
64    pub file_count: usize,
65    pub chunk_count: usize,
66    pub last_indexed: Option<String>,
67    pub languages: Vec<String>,
68}
69
70/// Incremental indexer that manages the full indexing pipeline
71pub struct IncrementalIndexer {
72    chunker: MultiLanguageChunker,
73}
74
75impl IncrementalIndexer {
76    pub fn new() -> Self {
77        Self {
78            chunker: MultiLanguageChunker::new(),
79        }
80    }
81
82    /// Perform a full index of a project directory
83    pub fn full_index(
84        &self,
85        project_path: &Path,
86        extensions: Option<&[&str]>,
87    ) -> Result<IndexResult> {
88        let start = Instant::now();
89        let storage = ProjectStorage::for_project(project_path)?;
90
91        info!("Starting full index of {}", project_path.display());
92
93        // Clear existing index
94        if let Ok(bm25) = BM25Index::open(&storage.tantivy_dir()) {
95            let _ = bm25.clear();
96        }
97        storage.clear()?;
98
99        // Build Merkle DAG
100        let dag = MerkleDAG::build(project_path, MERKLE_IGNORE_DIRS)
101            .context("Failed to build Merkle DAG")?;
102
103        // Walk files
104        let files = walk_files(project_path, extensions);
105        let supported_files: Vec<(String, String)> = files
106            .into_iter()
107            .filter(|(abs, _)| self.chunker.is_supported(abs))
108            .collect();
109
110        info!("Found {} supported files", supported_files.len());
111
112        // Chunk all files in parallel
113        let chunk_results = self.chunker.chunk_files(&supported_files);
114        let mut all_chunks = Vec::new();
115        let mut languages = HashSet::new();
116        let mut error_count = 0;
117
118        for (rel_path, result) in chunk_results {
119            match result {
120                Ok(chunks) => {
121                    for chunk in &chunks {
122                        languages.insert(chunk.language.clone());
123                    }
124                    all_chunks.extend(chunks);
125                }
126                Err(e) => {
127                    debug!("Failed to chunk {}: {}", rel_path, e);
128                    error_count += 1;
129                }
130            }
131        }
132
133        if error_count > 0 {
134            warn!("{} files failed to chunk", error_count);
135        }
136
137        // Index chunks in BM25
138        let bm25 = BM25Index::open(&storage.tantivy_dir())
139            .context("Failed to open BM25 index")?;
140        let chunk_count = bm25.add_chunks(&all_chunks)
141            .context("Failed to add chunks to BM25 index")?;
142
143        // Dense vector indexing (if embedder available)
144        #[cfg(feature = "dense")]
145        {
146            if let Ok(mut embedder) = Self::load_embedder() {
147                info!("Dense search enabled — embedding {} chunks", all_chunks.len());
148                let dims = embedder.info().dimensions;
149                let mut dense = DenseIndex::new(dims)?;
150                dense.add_chunks(&all_chunks, embedder.as_mut())?;
151                dense.save(&storage.dense_dir())?;
152                Self::save_dense_meta(&storage, &embedder.info())?;
153                info!("Dense index saved: {} vectors", dense.len());
154            }
155        }
156
157        // Save snapshot
158        let snapshot_manager = SnapshotManager::new(storage.base_dir().to_path_buf());
159        snapshot_manager.save(&dag)?;
160
161        // Save stats
162        let mut lang_list: Vec<String> = languages.into_iter().collect();
163        lang_list.sort();
164
165        let elapsed = start.elapsed();
166
167        let result = IndexResult {
168            files_indexed: supported_files.len(),
169            chunks_created: chunk_count,
170            time_taken_ms: elapsed.as_millis() as u64,
171            languages: lang_list,
172            incremental: false,
173            files_added: supported_files.len(),
174            files_removed: 0,
175            files_modified: 0,
176        };
177
178        storage.save_stats(&result)?;
179
180        info!(
181            "Full index complete: {} files, {} chunks in {}ms",
182            result.files_indexed, result.chunks_created, result.time_taken_ms
183        );
184
185        Ok(result)
186    }
187
188    /// Perform an incremental index update
189    pub fn incremental_index(
190        &self,
191        project_path: &Path,
192        extensions: Option<&[&str]>,
193    ) -> Result<IndexResult> {
194        let start = Instant::now();
195        let storage = ProjectStorage::for_project(project_path)?;
196        let snapshot_manager = SnapshotManager::new(storage.base_dir().to_path_buf());
197
198        // Load previous snapshot
199        let old_dag = match snapshot_manager.load()? {
200            Some(dag) => dag,
201            None => {
202                info!("No previous snapshot found, performing full index");
203                return self.full_index(project_path, extensions);
204            }
205        };
206
207        // Build current DAG
208        let new_dag = MerkleDAG::build(project_path, MERKLE_IGNORE_DIRS)?;
209
210        // Quick check
211        if !ChangeDetector::has_changes(&old_dag, &new_dag) {
212            info!("No changes detected, skipping index update");
213            return Ok(IndexResult {
214                files_indexed: 0,
215                chunks_created: 0,
216                time_taken_ms: start.elapsed().as_millis() as u64,
217                languages: Vec::new(),
218                incremental: true,
219                files_added: 0,
220                files_removed: 0,
221                files_modified: 0,
222            });
223        }
224
225        // Detect changes
226        let changes = ChangeDetector::detect_changes(&old_dag, &new_dag);
227        info!(
228            "Detected changes: {} added, {} removed, {} modified",
229            changes.added.len(),
230            changes.removed.len(),
231            changes.modified.len()
232        );
233
234        let bm25 = BM25Index::open(&storage.tantivy_dir())?;
235
236        // Remove old chunks for removed and modified files
237        let files_to_remove: Vec<&String> = changes
238            .removed
239            .iter()
240            .chain(changes.modified.iter())
241            .collect();
242
243        for rel_path in &files_to_remove {
244            let abs_path = project_path.join(rel_path);
245            let _ = bm25.remove_file(&abs_path.to_string_lossy());
246        }
247
248        // Chunk and index new/modified files
249        let files_to_add: Vec<(String, String)> = changes
250            .added
251            .iter()
252            .chain(changes.modified.iter())
253            .map(|rel| {
254                let abs = project_path.join(rel).to_string_lossy().to_string();
255                (abs, rel.clone())
256            })
257            .filter(|(abs, _)| self.chunker.is_supported(abs))
258            .collect();
259
260        let chunk_results = self.chunker.chunk_files(&files_to_add);
261        let mut all_chunks = Vec::new();
262        let mut languages = HashSet::new();
263
264        for (_rel_path, result) in chunk_results {
265            if let Ok(chunks) = result {
266                for chunk in &chunks {
267                    languages.insert(chunk.language.clone());
268                }
269                all_chunks.extend(chunks);
270            }
271        }
272
273        let chunk_count = bm25.add_chunks(&all_chunks)?;
274
275        // Dense vector indexing (if embedder available)
276        #[cfg(feature = "dense")]
277        {
278            if let Ok(mut embedder) = Self::load_embedder() {
279                let info = embedder.info();
280                let dims = info.dimensions;
281
282                // Check for dimension mismatch with existing index
283                let mut dense = match Self::check_dense_meta(&storage, &info) {
284                    Ok(()) => DenseIndex::open(&storage.dense_dir(), dims)
285                        .or_else(|_| DenseIndex::new(dims))?,
286                    Err(e) => {
287                        warn!("Dense index mismatch: {}. Rebuilding.", e);
288                        DenseIndex::new(dims)?
289                    }
290                };
291
292                // Remove vectors for deleted/modified files (preserves unchanged files)
293                for rel_path in &files_to_remove {
294                    let abs_path = project_path.join(rel_path);
295                    dense.remove_file(&abs_path.to_string_lossy());
296                }
297
298                if !all_chunks.is_empty() {
299                    dense.add_chunks(&all_chunks, embedder.as_mut())?;
300                }
301                dense.save(&storage.dense_dir())?;
302                Self::save_dense_meta(&storage, &info)?;
303                debug!("Dense index updated: {} vectors", dense.len());
304            }
305        }
306
307        // Update snapshot
308        snapshot_manager.save(&new_dag)?;
309
310        let mut lang_list: Vec<String> = languages.into_iter().collect();
311        lang_list.sort();
312
313        let elapsed = start.elapsed();
314
315        let result = IndexResult {
316            files_indexed: files_to_add.len(),
317            chunks_created: chunk_count,
318            time_taken_ms: elapsed.as_millis() as u64,
319            languages: lang_list,
320            incremental: true,
321            files_added: changes.added.len(),
322            files_removed: changes.removed.len(),
323            files_modified: changes.modified.len(),
324        };
325
326        storage.save_stats(&result)?;
327
328        info!(
329            "Incremental index complete: {} chunks in {}ms",
330            result.chunks_created, result.time_taken_ms
331        );
332
333        Ok(result)
334    }
335
336    /// Auto-index: full if no index, incremental if stale
337    pub fn auto_index(
338        &self,
339        project_path: &Path,
340        force: bool,
341        extensions: Option<&[&str]>,
342    ) -> Result<IndexResult> {
343        let storage = ProjectStorage::for_project(project_path)?;
344
345        if force || !storage.has_index() {
346            return self.full_index(project_path, extensions);
347        }
348
349        let snapshot_manager = SnapshotManager::new(storage.base_dir().to_path_buf());
350
351        // Check if snapshot is stale by age
352        let age_stale = snapshot_manager
353            .snapshot_age_secs()
354            .map(|age| age > MAX_SNAPSHOT_AGE_SECS)
355            .unwrap_or(true);
356
357        if age_stale {
358            return self.incremental_index(project_path, extensions);
359        }
360
361        // Even if age is fresh, check root hash for changes
362        let hash_changed = snapshot_manager
363            .load()
364            .ok()
365            .flatten()
366            .and_then(|old_dag| {
367                let new_dag = MerkleDAG::build(project_path, MERKLE_IGNORE_DIRS).ok()?;
368                Some(ChangeDetector::has_changes(&old_dag, &new_dag))
369            })
370            .unwrap_or(true);
371
372        if hash_changed {
373            self.incremental_index(project_path, extensions)
374        } else {
375            Ok(IndexResult {
376                files_indexed: 0,
377                chunks_created: 0,
378                time_taken_ms: 0,
379                languages: Vec::new(),
380                incremental: true,
381                files_added: 0,
382                files_removed: 0,
383                files_modified: 0,
384            })
385        }
386    }
387
388    /// Search with auto-indexing
389    pub fn search(
390        &self,
391        project_path: &Path,
392        query: &str,
393        limit: usize,
394        extension_filter: Option<&str>,
395    ) -> Result<Vec<SearchResult>> {
396        let storage = ProjectStorage::for_project(project_path)?;
397
398        // Auto-index if needed
399        if !storage.has_index() {
400            info!("No index found, auto-indexing before search");
401            self.full_index(project_path, None)?;
402        }
403
404        let bm25 = BM25Index::open(&storage.tantivy_dir())?;
405        let analyzed = analyze_query(query);
406
407        // Perform BM25 search with processed query (stop words removed + synonyms expanded)
408        let mut results = bm25.search(&analyzed.search_query, limit)?;
409
410        // Hybrid search: combine BM25 + dense results if available
411        #[cfg(feature = "dense")]
412        {
413            if let Some(fused) = Self::try_hybrid_search(&storage, &bm25, &results, query, limit) {
414                results = fused;
415            }
416        }
417
418        // Filter by extension if specified
419        if let Some(ext) = extension_filter {
420            results.retain(|r| r.relative_path.ends_with(&format!(".{}", ext)));
421        }
422
423        // Apply multi-factor ranking (includes dedup, thresholding, truncation)
424        rank_results(&mut results, &analyzed, limit);
425
426        Ok(results)
427    }
428
429    /// Get indexing status
430    pub fn get_status(&self, project_path: &Path) -> Result<IndexStatus> {
431        let storage = ProjectStorage::for_project(project_path)?;
432
433        if !storage.has_index() {
434            return Ok(IndexStatus {
435                indexed: false,
436                file_count: 0,
437                chunk_count: 0,
438                last_indexed: None,
439                languages: Vec::new(),
440            });
441        }
442
443        let snapshot_manager = SnapshotManager::new(storage.base_dir().to_path_buf());
444        let metadata = snapshot_manager.load_metadata()?;
445
446        let bm25 = BM25Index::open(&storage.tantivy_dir())?;
447        let chunk_count = bm25.doc_count().unwrap_or(0) as usize;
448
449        let stats = storage.load_stats()?;
450
451        Ok(IndexStatus {
452            indexed: true,
453            file_count: metadata.as_ref().map(|m| m.file_count).unwrap_or(0),
454            chunk_count,
455            last_indexed: metadata.map(|m| m.timestamp.to_rfc3339()),
456            languages: stats.map(|s| s.languages).unwrap_or_default(),
457        })
458    }
459
460    /// Clear the index for a project
461    pub fn clear_index(&self, project_path: &Path) -> Result<()> {
462        let storage = ProjectStorage::for_project(project_path)?;
463        storage.clear_all()?;
464        info!("Cleared index for {}", project_path.display());
465        Ok(())
466    }
467}
468
469impl IncrementalIndexer {
470    /// Attempt hybrid BM25 + dense vector search.
471    /// Returns fused results on success, None on any failure (graceful fallback to BM25-only).
472    #[cfg(feature = "dense")]
473    fn try_hybrid_search(
474        storage: &ProjectStorage,
475        bm25: &BM25Index,
476        bm25_results: &[SearchResult],
477        query: &str,
478        limit: usize,
479    ) -> Option<Vec<SearchResult>> {
480        let dense_dir = storage.dense_dir();
481        if !dense_dir.join("dense.usearch").exists() {
482            return None;
483        }
484
485        let mut embedder = match Self::load_embedder() {
486            Ok(e) => e,
487            Err(e) => {
488                warn!("Failed to load embedder for hybrid search: {}", e);
489                return None;
490            }
491        };
492
493        let dims = embedder.info().dimensions;
494        let dense = match DenseIndex::open(&dense_dir, dims) {
495            Ok(d) => d,
496            Err(e) => {
497                warn!("Failed to open dense index: {}", e);
498                return None;
499            }
500        };
501
502        if dense.is_empty() {
503            return None;
504        }
505
506        let query_vec = match embedder.encode_query(query) {
507            Ok(v) => v,
508            Err(e) => {
509                warn!("Failed to encode query for dense search: {}", e);
510                return None;
511            }
512        };
513
514        let dense_k = (limit * 3).max(20);
515        let dense_matches = match dense.search(&query_vec, dense_k) {
516            Ok(m) => m,
517            Err(e) => {
518                warn!("Dense search failed: {}", e);
519                return None;
520            }
521        };
522
523        // Build lookup map from BM25 results
524        let mut full_map: HashMap<String, SearchResult> = bm25_results
525            .iter()
526            .map(|r| (r.chunk_id.clone(), r.clone()))
527            .collect();
528
529        // Fetch dense-only results from BM25 by chunk_id
530        let missing_ids: Vec<&str> = dense_matches
531            .iter()
532            .filter(|(cid, _)| !full_map.contains_key(cid))
533            .map(|(cid, _)| cid.as_str())
534            .collect();
535        if !missing_ids.is_empty() {
536            if let Ok(extra) = bm25.get_by_chunk_ids(&missing_ids) {
537                full_map.extend(extra);
538            }
539        }
540
541        let fused = reciprocal_rank_fusion(bm25_results, &dense_matches, &full_map);
542        debug!(
543            "Hybrid search: BM25={} dense={} fused={}",
544            full_map.len(),
545            dense_matches.len(),
546            fused.len()
547        );
548
549        Some(fused)
550    }
551
552    #[cfg(feature = "dense")]
553    fn load_embedder() -> Result<Box<dyn crate::search::embedding::Embedder>> {
554        crate::search::embedding::load_embedder()
555    }
556
557    #[cfg(feature = "dense")]
558    fn save_dense_meta(
559        storage: &ProjectStorage,
560        info: &crate::search::embedding::EmbedderInfo,
561    ) -> Result<()> {
562        let meta_path = storage.dense_dir().join("dense_meta.json");
563        std::fs::create_dir_all(storage.dense_dir())?;
564        let json = serde_json::to_string(info)?;
565        std::fs::write(&meta_path, json)?;
566        Ok(())
567    }
568
569    #[cfg(feature = "dense")]
570    fn check_dense_meta(
571        storage: &ProjectStorage,
572        current: &crate::search::embedding::EmbedderInfo,
573    ) -> Result<()> {
574        let meta_path = storage.dense_dir().join("dense_meta.json");
575        if !meta_path.exists() {
576            return Ok(());
577        }
578        let data = std::fs::read_to_string(&meta_path)?;
579        let saved: crate::search::embedding::EmbedderInfo = serde_json::from_str(&data)?;
580        if saved.dimensions != current.dimensions
581            || saved.provider != current.provider
582            || saved.model_name != current.model_name
583        {
584            anyhow::bail!(
585                "Embedder mismatch: index built with {} / {} ({}d), current is {} / {} ({}d). Re-index required.",
586                saved.provider,
587                saved.model_name,
588                saved.dimensions,
589                current.provider,
590                current.model_name,
591                current.dimensions,
592            );
593        }
594        Ok(())
595    }
596}
597
598impl Default for IncrementalIndexer {
599    fn default() -> Self {
600        Self::new()
601    }
602}