1pub mod storage;
2
3use crate::chunk::multi_language::MultiLanguageChunker;
4use crate::ignore::walk_files;
5use crate::merkle::change_detector::ChangeDetector;
6use crate::merkle::snapshot::SnapshotManager;
7use crate::merkle::MerkleDAG;
8use crate::search::bm25::BM25Index;
9use crate::search::query::analyze_query;
10use crate::search::ranking::rank_results;
11use crate::search::SearchResult;
12use anyhow::{Context, Result};
13use serde::{Deserialize, Serialize};
14use std::collections::HashSet;
15use std::path::Path;
16use std::time::Instant;
17use storage::ProjectStorage;
18use tracing::{debug, info, warn};
19
20#[cfg(feature = "dense")]
21use crate::search::dense::DenseIndex;
22#[cfg(feature = "dense")]
23use crate::search::hybrid::reciprocal_rank_fusion;
24#[cfg(feature = "dense")]
25use std::collections::HashMap;
26
/// Directory names passed to `MerkleDAG::build` as its ignore list every time
/// this module builds a Merkle DAG for a project.
// NOTE(review): presumably these directories are skipped while hashing the tree,
// and `.qex` looks like this tool's own storage directory — confirm against
// `MerkleDAG::build` and `ProjectStorage`.
const MERKLE_IGNORE_DIRS: &[&str] = &[
    "__pycache__",
    ".git",
    ".hg",
    ".svn",
    "node_modules",
    ".venv",
    "venv",
    "target",
    "build",
    "dist",
    ".next",
    ".cache",
    ".qex",
];
43
/// Snapshot age threshold in seconds: `auto_index` treats a snapshot older than
/// this (or one whose age cannot be determined) as stale and goes straight to
/// an incremental index run.
const MAX_SNAPSHOT_AGE_SECS: i64 = 300;

/// Summary of a single indexing run, returned by the indexing entry points and
/// persisted through `ProjectStorage::save_stats`.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct IndexResult {
    /// Number of supported files handed to the chunker during this run.
    pub files_indexed: usize,
    /// Count reported by `BM25Index::add_chunks` for this run.
    pub chunks_created: usize,
    /// Wall-clock duration of the run in milliseconds.
    pub time_taken_ms: u64,
    /// Sorted, de-duplicated languages of the chunks produced in this run.
    pub languages: Vec<String>,
    /// `true` for incremental (or no-op) runs, `false` for a full rebuild.
    pub incremental: bool,
    /// Files detected as added (for a full rebuild: all supported files).
    pub files_added: usize,
    /// Files detected as removed since the previous snapshot.
    pub files_removed: usize,
    /// Files detected as modified since the previous snapshot.
    pub files_modified: usize,
}
59
/// Snapshot of a project's index state as reported by
/// `IncrementalIndexer::get_status`.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct IndexStatus {
    /// Whether project storage reports an existing index.
    pub indexed: bool,
    /// File count taken from the snapshot metadata (0 when no metadata exists).
    pub file_count: usize,
    /// Document count reported by the BM25 index (0 if it cannot be read).
    pub chunk_count: usize,
    /// RFC 3339 timestamp from the snapshot metadata, if any.
    pub last_indexed: Option<String>,
    /// Languages recorded in the stats of the last saved indexing run.
    pub languages: Vec<String>,
}
69
/// Entry point for building, updating, querying and clearing a project's
/// search index.
pub struct IncrementalIndexer {
    /// Chunker used to decide which files are supported and to split them
    /// into chunks for indexing.
    chunker: MultiLanguageChunker,
}
74
75impl IncrementalIndexer {
76 pub fn new() -> Self {
77 Self {
78 chunker: MultiLanguageChunker::new(),
79 }
80 }
81
82 pub fn full_index(
84 &self,
85 project_path: &Path,
86 extensions: Option<&[&str]>,
87 ) -> Result<IndexResult> {
88 let start = Instant::now();
89 let storage = ProjectStorage::for_project(project_path)?;
90
91 info!("Starting full index of {}", project_path.display());
92
93 if let Ok(bm25) = BM25Index::open(&storage.tantivy_dir()) {
95 let _ = bm25.clear();
96 }
97 storage.clear()?;
98
99 let dag = MerkleDAG::build(project_path, MERKLE_IGNORE_DIRS)
101 .context("Failed to build Merkle DAG")?;
102
103 let files = walk_files(project_path, extensions);
105 let supported_files: Vec<(String, String)> = files
106 .into_iter()
107 .filter(|(abs, _)| self.chunker.is_supported(abs))
108 .collect();
109
110 info!("Found {} supported files", supported_files.len());
111
112 let chunk_results = self.chunker.chunk_files(&supported_files);
114 let mut all_chunks = Vec::new();
115 let mut languages = HashSet::new();
116 let mut error_count = 0;
117
118 for (rel_path, result) in chunk_results {
119 match result {
120 Ok(chunks) => {
121 for chunk in &chunks {
122 languages.insert(chunk.language.clone());
123 }
124 all_chunks.extend(chunks);
125 }
126 Err(e) => {
127 debug!("Failed to chunk {}: {}", rel_path, e);
128 error_count += 1;
129 }
130 }
131 }
132
133 if error_count > 0 {
134 warn!("{} files failed to chunk", error_count);
135 }
136
137 let bm25 = BM25Index::open(&storage.tantivy_dir())
139 .context("Failed to open BM25 index")?;
140 let chunk_count = bm25.add_chunks(&all_chunks)
141 .context("Failed to add chunks to BM25 index")?;
142
143 #[cfg(feature = "dense")]
145 {
146 if let Ok(mut embedder) = Self::load_embedder() {
147 info!("Dense search enabled — embedding {} chunks", all_chunks.len());
148 let dims = embedder.info().dimensions;
149 let mut dense = DenseIndex::new(dims)?;
150 dense.add_chunks(&all_chunks, embedder.as_mut())?;
151 dense.save(&storage.dense_dir())?;
152 Self::save_dense_meta(&storage, &embedder.info())?;
153 info!("Dense index saved: {} vectors", dense.len());
154 }
155 }
156
157 let snapshot_manager = SnapshotManager::new(storage.base_dir().to_path_buf());
159 snapshot_manager.save(&dag)?;
160
161 let mut lang_list: Vec<String> = languages.into_iter().collect();
163 lang_list.sort();
164
165 let elapsed = start.elapsed();
166
167 let result = IndexResult {
168 files_indexed: supported_files.len(),
169 chunks_created: chunk_count,
170 time_taken_ms: elapsed.as_millis() as u64,
171 languages: lang_list,
172 incremental: false,
173 files_added: supported_files.len(),
174 files_removed: 0,
175 files_modified: 0,
176 };
177
178 storage.save_stats(&result)?;
179
180 info!(
181 "Full index complete: {} files, {} chunks in {}ms",
182 result.files_indexed, result.chunks_created, result.time_taken_ms
183 );
184
185 Ok(result)
186 }
187
188 pub fn incremental_index(
190 &self,
191 project_path: &Path,
192 extensions: Option<&[&str]>,
193 ) -> Result<IndexResult> {
194 let start = Instant::now();
195 let storage = ProjectStorage::for_project(project_path)?;
196 let snapshot_manager = SnapshotManager::new(storage.base_dir().to_path_buf());
197
198 let old_dag = match snapshot_manager.load()? {
200 Some(dag) => dag,
201 None => {
202 info!("No previous snapshot found, performing full index");
203 return self.full_index(project_path, extensions);
204 }
205 };
206
207 let new_dag = MerkleDAG::build(project_path, MERKLE_IGNORE_DIRS)?;
209
210 if !ChangeDetector::has_changes(&old_dag, &new_dag) {
212 info!("No changes detected, skipping index update");
213 return Ok(IndexResult {
214 files_indexed: 0,
215 chunks_created: 0,
216 time_taken_ms: start.elapsed().as_millis() as u64,
217 languages: Vec::new(),
218 incremental: true,
219 files_added: 0,
220 files_removed: 0,
221 files_modified: 0,
222 });
223 }
224
225 let changes = ChangeDetector::detect_changes(&old_dag, &new_dag);
227 info!(
228 "Detected changes: {} added, {} removed, {} modified",
229 changes.added.len(),
230 changes.removed.len(),
231 changes.modified.len()
232 );
233
234 let bm25 = BM25Index::open(&storage.tantivy_dir())?;
235
236 let files_to_remove: Vec<&String> = changes
238 .removed
239 .iter()
240 .chain(changes.modified.iter())
241 .collect();
242
243 for rel_path in &files_to_remove {
244 let abs_path = project_path.join(rel_path);
245 let _ = bm25.remove_file(&abs_path.to_string_lossy());
246 }
247
248 let files_to_add: Vec<(String, String)> = changes
250 .added
251 .iter()
252 .chain(changes.modified.iter())
253 .map(|rel| {
254 let abs = project_path.join(rel).to_string_lossy().to_string();
255 (abs, rel.clone())
256 })
257 .filter(|(abs, _)| self.chunker.is_supported(abs))
258 .collect();
259
260 let chunk_results = self.chunker.chunk_files(&files_to_add);
261 let mut all_chunks = Vec::new();
262 let mut languages = HashSet::new();
263
264 for (_rel_path, result) in chunk_results {
265 if let Ok(chunks) = result {
266 for chunk in &chunks {
267 languages.insert(chunk.language.clone());
268 }
269 all_chunks.extend(chunks);
270 }
271 }
272
273 let chunk_count = bm25.add_chunks(&all_chunks)?;
274
275 #[cfg(feature = "dense")]
277 {
278 if let Ok(mut embedder) = Self::load_embedder() {
279 let info = embedder.info();
280 let dims = info.dimensions;
281
282 let mut dense = match Self::check_dense_meta(&storage, &info) {
284 Ok(()) => DenseIndex::open(&storage.dense_dir(), dims)
285 .or_else(|_| DenseIndex::new(dims))?,
286 Err(e) => {
287 warn!("Dense index mismatch: {}. Rebuilding.", e);
288 DenseIndex::new(dims)?
289 }
290 };
291
292 for rel_path in &files_to_remove {
294 let abs_path = project_path.join(rel_path);
295 dense.remove_file(&abs_path.to_string_lossy());
296 }
297
298 if !all_chunks.is_empty() {
299 dense.add_chunks(&all_chunks, embedder.as_mut())?;
300 }
301 dense.save(&storage.dense_dir())?;
302 Self::save_dense_meta(&storage, &info)?;
303 debug!("Dense index updated: {} vectors", dense.len());
304 }
305 }
306
307 snapshot_manager.save(&new_dag)?;
309
310 let mut lang_list: Vec<String> = languages.into_iter().collect();
311 lang_list.sort();
312
313 let elapsed = start.elapsed();
314
315 let result = IndexResult {
316 files_indexed: files_to_add.len(),
317 chunks_created: chunk_count,
318 time_taken_ms: elapsed.as_millis() as u64,
319 languages: lang_list,
320 incremental: true,
321 files_added: changes.added.len(),
322 files_removed: changes.removed.len(),
323 files_modified: changes.modified.len(),
324 };
325
326 storage.save_stats(&result)?;
327
328 info!(
329 "Incremental index complete: {} chunks in {}ms",
330 result.chunks_created, result.time_taken_ms
331 );
332
333 Ok(result)
334 }
335
336 pub fn auto_index(
338 &self,
339 project_path: &Path,
340 force: bool,
341 extensions: Option<&[&str]>,
342 ) -> Result<IndexResult> {
343 let storage = ProjectStorage::for_project(project_path)?;
344
345 if force || !storage.has_index() {
346 return self.full_index(project_path, extensions);
347 }
348
349 let snapshot_manager = SnapshotManager::new(storage.base_dir().to_path_buf());
350
351 let age_stale = snapshot_manager
353 .snapshot_age_secs()
354 .map(|age| age > MAX_SNAPSHOT_AGE_SECS)
355 .unwrap_or(true);
356
357 if age_stale {
358 return self.incremental_index(project_path, extensions);
359 }
360
361 let hash_changed = snapshot_manager
363 .load()
364 .ok()
365 .flatten()
366 .and_then(|old_dag| {
367 let new_dag = MerkleDAG::build(project_path, MERKLE_IGNORE_DIRS).ok()?;
368 Some(ChangeDetector::has_changes(&old_dag, &new_dag))
369 })
370 .unwrap_or(true);
371
372 if hash_changed {
373 self.incremental_index(project_path, extensions)
374 } else {
375 Ok(IndexResult {
376 files_indexed: 0,
377 chunks_created: 0,
378 time_taken_ms: 0,
379 languages: Vec::new(),
380 incremental: true,
381 files_added: 0,
382 files_removed: 0,
383 files_modified: 0,
384 })
385 }
386 }
387
388 pub fn search(
390 &self,
391 project_path: &Path,
392 query: &str,
393 limit: usize,
394 extension_filter: Option<&str>,
395 ) -> Result<Vec<SearchResult>> {
396 let storage = ProjectStorage::for_project(project_path)?;
397
398 if !storage.has_index() {
400 info!("No index found, auto-indexing before search");
401 self.full_index(project_path, None)?;
402 }
403
404 let bm25 = BM25Index::open(&storage.tantivy_dir())?;
405 let analyzed = analyze_query(query);
406
407 let mut results = bm25.search(&analyzed.search_query, limit)?;
409
410 #[cfg(feature = "dense")]
412 {
413 if let Some(fused) = Self::try_hybrid_search(&storage, &bm25, &results, query, limit) {
414 results = fused;
415 }
416 }
417
418 if let Some(ext) = extension_filter {
420 results.retain(|r| r.relative_path.ends_with(&format!(".{}", ext)));
421 }
422
423 rank_results(&mut results, &analyzed, limit);
425
426 Ok(results)
427 }
428
429 pub fn get_status(&self, project_path: &Path) -> Result<IndexStatus> {
431 let storage = ProjectStorage::for_project(project_path)?;
432
433 if !storage.has_index() {
434 return Ok(IndexStatus {
435 indexed: false,
436 file_count: 0,
437 chunk_count: 0,
438 last_indexed: None,
439 languages: Vec::new(),
440 });
441 }
442
443 let snapshot_manager = SnapshotManager::new(storage.base_dir().to_path_buf());
444 let metadata = snapshot_manager.load_metadata()?;
445
446 let bm25 = BM25Index::open(&storage.tantivy_dir())?;
447 let chunk_count = bm25.doc_count().unwrap_or(0) as usize;
448
449 let stats = storage.load_stats()?;
450
451 Ok(IndexStatus {
452 indexed: true,
453 file_count: metadata.as_ref().map(|m| m.file_count).unwrap_or(0),
454 chunk_count,
455 last_indexed: metadata.map(|m| m.timestamp.to_rfc3339()),
456 languages: stats.map(|s| s.languages).unwrap_or_default(),
457 })
458 }
459
460 pub fn clear_index(&self, project_path: &Path) -> Result<()> {
462 let storage = ProjectStorage::for_project(project_path)?;
463 storage.clear_all()?;
464 info!("Cleared index for {}", project_path.display());
465 Ok(())
466 }
467}
468
469impl IncrementalIndexer {
470 #[cfg(feature = "dense")]
473 fn try_hybrid_search(
474 storage: &ProjectStorage,
475 bm25: &BM25Index,
476 bm25_results: &[SearchResult],
477 query: &str,
478 limit: usize,
479 ) -> Option<Vec<SearchResult>> {
480 let dense_dir = storage.dense_dir();
481 if !dense_dir.join("dense.usearch").exists() {
482 return None;
483 }
484
485 let mut embedder = match Self::load_embedder() {
486 Ok(e) => e,
487 Err(e) => {
488 warn!("Failed to load embedder for hybrid search: {}", e);
489 return None;
490 }
491 };
492
493 let dims = embedder.info().dimensions;
494 let dense = match DenseIndex::open(&dense_dir, dims) {
495 Ok(d) => d,
496 Err(e) => {
497 warn!("Failed to open dense index: {}", e);
498 return None;
499 }
500 };
501
502 if dense.is_empty() {
503 return None;
504 }
505
506 let query_vec = match embedder.encode_query(query) {
507 Ok(v) => v,
508 Err(e) => {
509 warn!("Failed to encode query for dense search: {}", e);
510 return None;
511 }
512 };
513
514 let dense_k = (limit * 3).max(20);
515 let dense_matches = match dense.search(&query_vec, dense_k) {
516 Ok(m) => m,
517 Err(e) => {
518 warn!("Dense search failed: {}", e);
519 return None;
520 }
521 };
522
523 let mut full_map: HashMap<String, SearchResult> = bm25_results
525 .iter()
526 .map(|r| (r.chunk_id.clone(), r.clone()))
527 .collect();
528
529 let missing_ids: Vec<&str> = dense_matches
531 .iter()
532 .filter(|(cid, _)| !full_map.contains_key(cid))
533 .map(|(cid, _)| cid.as_str())
534 .collect();
535 if !missing_ids.is_empty() {
536 if let Ok(extra) = bm25.get_by_chunk_ids(&missing_ids) {
537 full_map.extend(extra);
538 }
539 }
540
541 let fused = reciprocal_rank_fusion(bm25_results, &dense_matches, &full_map);
542 debug!(
543 "Hybrid search: BM25={} dense={} fused={}",
544 full_map.len(),
545 dense_matches.len(),
546 fused.len()
547 );
548
549 Some(fused)
550 }
551
552 #[cfg(feature = "dense")]
553 fn load_embedder() -> Result<Box<dyn crate::search::embedding::Embedder>> {
554 crate::search::embedding::load_embedder()
555 }
556
557 #[cfg(feature = "dense")]
558 fn save_dense_meta(
559 storage: &ProjectStorage,
560 info: &crate::search::embedding::EmbedderInfo,
561 ) -> Result<()> {
562 let meta_path = storage.dense_dir().join("dense_meta.json");
563 std::fs::create_dir_all(storage.dense_dir())?;
564 let json = serde_json::to_string(info)?;
565 std::fs::write(&meta_path, json)?;
566 Ok(())
567 }
568
569 #[cfg(feature = "dense")]
570 fn check_dense_meta(
571 storage: &ProjectStorage,
572 current: &crate::search::embedding::EmbedderInfo,
573 ) -> Result<()> {
574 let meta_path = storage.dense_dir().join("dense_meta.json");
575 if !meta_path.exists() {
576 return Ok(());
577 }
578 let data = std::fs::read_to_string(&meta_path)?;
579 let saved: crate::search::embedding::EmbedderInfo = serde_json::from_str(&data)?;
580 if saved.dimensions != current.dimensions
581 || saved.provider != current.provider
582 || saved.model_name != current.model_name
583 {
584 anyhow::bail!(
585 "Embedder mismatch: index built with {} / {} ({}d), current is {} / {} ({}d). Re-index required.",
586 saved.provider,
587 saved.model_name,
588 saved.dimensions,
589 current.provider,
590 current.model_name,
591 current.dimensions,
592 );
593 }
594 Ok(())
595 }
596}
597
598impl Default for IncrementalIndexer {
599 fn default() -> Self {
600 Self::new()
601 }
602}