1use anyhow::Result;
2use ck_core::{
3 FileMetadata, Language, Span, compute_chunk_hash, compute_file_hash, get_sidecar_path,
4};
5use ignore::{WalkBuilder, overrides::OverrideBuilder};
6use rayon::prelude::*;
7use serde::{Deserialize, Serialize};
8use std::collections::{HashMap, HashSet};
9use std::fs;
10use std::io::{Read, Write};
11use std::path::{Path, PathBuf};
12use std::sync::Once;
13use std::sync::atomic::{AtomicBool, Ordering};
14use std::time::SystemTime;
15use tempfile::NamedTempFile;
16use walkdir::WalkDir;
17
18fn legacy_model_config(name: &str, dimensions: Option<usize>) -> ck_models::ModelConfig {
19 ck_models::ModelConfig {
20 name: name.to_string(),
21 provider: "fastembed".to_string(),
22 dimensions: dimensions.unwrap_or(384),
23 max_tokens: 8192,
24 description: "Legacy ck embedding model (inferred from manifest)".to_string(),
25 }
26}
27
/// Callback that receives a short, human-readable progress string.
pub type ProgressCallback = Box<dyn Fn(&str) + Send + Sync>;

/// Snapshot handed to a [`DetailedProgressCallback`] describing which chunk of
/// which file is currently being processed.
///
/// Derives `PartialEq`/`Eq` so that callers and tests can compare snapshots
/// directly.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct EmbeddingProgress {
    /// Name of the file being processed.
    pub file_name: String,
    /// Index of the file within the current run.
    pub file_index: usize,
    /// Number of files in the current run.
    pub total_files: usize,
    /// Index of the chunk within the file.
    pub chunk_index: usize,
    /// Number of chunks in the file.
    pub total_chunks: usize,
    /// Size of the chunk being processed.
    pub chunk_size: usize,
}

/// Callback that receives one [`EmbeddingProgress`] per reported step.
pub type DetailedProgressCallback = Box<dyn Fn(EmbeddingProgress) + Send + Sync>;
42
/// Event describing one stage of an indexing run, delivered to an
/// [`EnhancedProgressCallback`].
///
/// Derives `PartialEq`/`Eq` so that events can be compared directly (for
/// example when asserting on a recorded event stream).
// NOTE(review): the code that emits these events is not in this part of the
// file; the per-variant descriptions below follow the field names only.
#[derive(Debug, Clone, PartialEq, Eq)]
pub enum IndexingProgress {
    /// A run is starting over `total_files` files.
    Starting { total_files: usize },
    /// A file is being processed.
    ProcessingFile {
        file: String,
        file_number: usize,
        total_files: usize,
        file_size: u64,
    },
    /// A file has been split into `chunks_found` chunks.
    ChunkingFile { file: String, chunks_found: usize },
    /// A single chunk of a file is being processed.
    ProcessingChunk {
        file: String,
        chunk_number: usize,
        total_chunks: usize,
        chunk_size: usize,
    },
    /// A file has been fully processed.
    FileComplete {
        file: String,
        chunks_processed: usize,
        file_number: usize,
        total_files: usize,
        elapsed_ms: u64,
    },
    /// The whole run has finished.
    Complete {
        total_files: usize,
        total_chunks: usize,
        total_elapsed_ms: u64,
    },
}

/// Callback that receives one [`IndexingProgress`] event at a time.
pub type EnhancedProgressCallback = Box<dyn Fn(IndexingProgress) + Send + Sync>;
81
/// Process-wide flag that indexing loops poll to find out whether they should
/// stop early.
static INTERRUPTED: AtomicBool = AtomicBool::new(false);

/// One-time guard, presumably used to install the interrupt handler only once.
static HANDLER_INIT: Once = Once::new();

/// Error text used to report that indexing was stopped by an interrupt.
pub const INDEX_INTERRUPTED_MSG: &str = "Indexing interrupted by user";

/// Raises the interrupt flag so that running indexing work stops at its next
/// check.
pub fn request_interrupt() {
    // `swap` writes `true` just like `store`; the previous value is irrelevant.
    let _ = INTERRUPTED.swap(true, Ordering::SeqCst);
}
91
92fn build_overrides(
94 base_path: &Path,
95 exclude_patterns: &[String],
96) -> Result<ignore::overrides::Override> {
97 let mut builder = OverrideBuilder::new(base_path);
98
99 for pattern in exclude_patterns {
100 if pattern.starts_with('!') {
101 builder.add(pattern)?;
102 } else {
103 builder.add(&format!("!{}", pattern))?;
104 }
105 }
106
107 Ok(builder.build()?)
108}
109
110#[derive(Debug, Clone, Serialize, Deserialize)]
111pub struct IndexEntry {
112 pub metadata: FileMetadata,
113 pub chunks: Vec<ChunkEntry>,
114}
115
116#[derive(Debug, Clone, Serialize, Deserialize)]
117pub struct ChunkEntry {
118 pub span: Span,
119 pub embedding: Option<Vec<f32>>,
120 pub chunk_type: Option<String>, #[serde(default)]
122 pub breadcrumb: Option<String>,
123 #[serde(default)]
124 pub ancestry: Option<Vec<String>>,
125 #[serde(default)]
126 pub byte_length: Option<usize>,
127 #[serde(default)]
128 pub estimated_tokens: Option<usize>,
129 #[serde(default)]
130 pub leading_trivia: Option<Vec<String>>,
131 #[serde(default)]
132 pub trailing_trivia: Option<Vec<String>>,
133 #[serde(default)]
135 pub chunk_hash: Option<String>,
136}
137
138#[derive(Debug, Clone, Serialize, Deserialize)]
139pub struct IndexManifest {
140 pub version: String,
141 pub created: u64,
142 pub updated: u64,
143 pub files: HashMap<PathBuf, FileMetadata>,
144 pub embedding_model: Option<String>,
146 pub embedding_dimensions: Option<usize>,
148 #[serde(default)]
152 pub chunk_hash_version: Option<u32>,
153}
154
155impl Default for IndexManifest {
156 fn default() -> Self {
157 let now = SystemTime::now()
158 .duration_since(SystemTime::UNIX_EPOCH)
159 .unwrap()
160 .as_secs();
161
162 Self {
163 version: "0.1.0".to_string(),
164 created: now,
165 updated: now,
166 files: HashMap::new(),
167 embedding_model: None, embedding_dimensions: None,
169 chunk_hash_version: Some(2), }
171 }
172}
173
174fn should_include_file(entry: &ignore::DirEntry, index_dir: &Path) -> bool {
176 let path = entry.path();
177 entry.file_type().is_some_and(|ft| ft.is_file())
178 && is_text_file(path)
179 && !path.starts_with(index_dir)
180}
181
182fn filter_and_collect_files(walker: ignore::Walk, index_dir: &Path) -> Vec<PathBuf> {
184 walker
185 .filter_map(|entry| entry.ok())
186 .filter(|entry| should_include_file(entry, index_dir))
187 .map(|entry| entry.path().to_path_buf())
188 .collect()
189}
190
191pub fn collect_files(
192 path: &Path,
193 options: &ck_core::FileCollectionOptions,
194) -> Result<Vec<PathBuf>> {
195 let index_dir = path.join(".ck");
196
197 if options.respect_gitignore {
198 let overrides = build_overrides(path, &options.exclude_patterns)?;
199 let mut walker_builder = WalkBuilder::new(path);
200 walker_builder
201 .git_ignore(true)
202 .git_global(true)
203 .git_exclude(true)
204 .hidden(true);
205
206 if options.use_ckignore {
208 walker_builder.add_custom_ignore_filename(".ckignore");
209 }
210
211 walker_builder.overrides(overrides);
212 let walker = walker_builder.build();
213
214 Ok(filter_and_collect_files(walker, &index_dir))
215 } else {
216 use ck_core::get_default_exclude_patterns;
218 let default_patterns = get_default_exclude_patterns();
219
220 let mut all_patterns = default_patterns;
222 all_patterns.extend(options.exclude_patterns.iter().cloned());
223 let combined_overrides = build_overrides(path, &all_patterns)?;
224
225 let mut walker_builder = WalkBuilder::new(path);
226 walker_builder
227 .git_ignore(false)
228 .git_global(false)
229 .git_exclude(false)
230 .hidden(true);
231
232 if options.use_ckignore {
234 walker_builder.add_custom_ignore_filename(".ckignore");
235 }
236
237 walker_builder.overrides(combined_overrides);
238 let walker = walker_builder.build();
239
240 Ok(filter_and_collect_files(walker, &index_dir))
241 }
242}
243
244fn collect_files_as_hashset(
245 path: &Path,
246 options: &ck_core::FileCollectionOptions,
247) -> Result<HashSet<PathBuf>> {
248 Ok(collect_files(path, options)?.into_iter().collect())
249}
250
251pub async fn index_directory(
252 path: &Path,
253 compute_embeddings: bool,
254 options: &ck_core::FileCollectionOptions,
255 model: Option<&str>,
256) -> Result<()> {
257 tracing::info!(
258 "index_directory called with compute_embeddings={}",
259 compute_embeddings
260 );
261 let index_dir = path.join(".ck");
262 fs::create_dir_all(&index_dir)?;
263
264 let manifest_path = index_dir.join("manifest.json");
265 let mut manifest = load_or_create_manifest(&manifest_path)?;
266 normalize_manifest_paths(&mut manifest, path);
267
268 let resolved_model = if compute_embeddings {
270 let model_registry = ck_models::ModelRegistry::default();
271 let (alias, config) = model_registry
272 .resolve(model)
273 .map_err(|e| anyhow::anyhow!(e.to_string()))?;
274
275 if let Some(existing_model) = &manifest.embedding_model
276 && existing_model != &config.name
277 {
278 return Err(anyhow::anyhow!(
279 "Model mismatch: Index was created with '{}', but you're trying to use '{}'. \
280 Please run 'ck --clean {}' to remove the old index, then rerun with the new model.",
281 existing_model,
282 config.name,
283 path.display()
284 ));
285 }
286
287 manifest.embedding_model = Some(config.name.clone());
288 manifest.embedding_dimensions = Some(config.dimensions);
289
290 Some((alias, config))
291 } else {
292 None
293 };
294
295 let files = collect_files(path, options)?;
296
297 if compute_embeddings {
298 tracing::info!("Creating embedder for {} files", files.len());
300 let (_, config) = resolved_model
301 .as_ref()
302 .expect("resolved model must be present when computing embeddings");
303 let mut embedder = ck_embed::create_embedder_for_config(config, None)?;
304
305 for file_path in files.iter() {
306 match index_single_file(file_path, path, Some(&mut embedder)) {
307 Ok(entry) => {
308 let sidecar_path = get_sidecar_path(path, file_path);
310 save_index_entry(&sidecar_path, &entry)?;
311
312 let manifest_key = entry.metadata.path.clone();
314 manifest.files.insert(manifest_key, entry.metadata);
315 manifest.updated = SystemTime::now()
316 .duration_since(SystemTime::UNIX_EPOCH)
317 .unwrap()
318 .as_secs();
319 save_manifest(&manifest_path, &manifest)?;
320 }
321 Err(e) => {
322 let error_msg = e.to_string();
324 let is_binary_skip = error_msg.contains("Binary file, skipping");
325 let is_utf8_error = error_msg.contains("stream did not contain valid UTF-8");
326 let is_git_file = file_path.components().any(|c| c.as_os_str() == ".git");
327
328 if !(is_binary_skip || is_utf8_error && is_git_file) {
329 tracing::warn!("Failed to index {:?}: {}", file_path, e);
330 }
331 }
332 }
333 }
334 } else {
335 use std::sync::mpsc;
337 use std::thread;
338
339 let (tx, rx) = mpsc::channel();
340 let files_clone = files.clone();
341 let path_clone = path.to_path_buf();
342
343 let worker_handle = thread::spawn(move || {
345 files_clone.par_iter().for_each(|file_path| {
346 match index_single_file(file_path, &path_clone, None) {
347 Ok(entry) => {
348 if tx.send((file_path.clone(), entry)).is_err() {
349 }
351 }
352 Err(e) => {
353 let error_msg = e.to_string();
355 let is_binary_skip = error_msg.contains("Binary file, skipping");
356 let is_utf8_error =
357 error_msg.contains("stream did not contain valid UTF-8");
358 let is_git_file = file_path.components().any(|c| c.as_os_str() == ".git");
359
360 if !(is_binary_skip || is_utf8_error && is_git_file) {
361 tracing::warn!("Failed to index {:?}: {}", file_path, e);
362 }
363 }
364 }
365 });
366 });
367
368 while let Ok((file_path, entry)) = rx.recv() {
370 let sidecar_path = get_sidecar_path(path, &file_path);
372 save_index_entry(&sidecar_path, &entry)?;
373
374 let manifest_key = entry.metadata.path.clone();
376 manifest.files.insert(manifest_key, entry.metadata);
377 manifest.updated = SystemTime::now()
378 .duration_since(SystemTime::UNIX_EPOCH)
379 .unwrap()
380 .as_secs();
381 save_manifest(&manifest_path, &manifest)?;
382 }
383
384 worker_handle
386 .join()
387 .map_err(|_| anyhow::anyhow!("Worker thread panicked"))?;
388 }
389
390 if !compute_embeddings {
393 manifest.updated = SystemTime::now()
394 .duration_since(SystemTime::UNIX_EPOCH)
395 .unwrap()
396 .as_secs();
397 save_manifest(&manifest_path, &manifest)?;
398 }
399
400 Ok(())
401}
402
403pub async fn index_file(file_path: &Path, compute_embeddings: bool) -> Result<()> {
404 let repo_root = find_repo_root(file_path)?;
405 let index_dir = repo_root.join(".ck");
406 fs::create_dir_all(&index_dir)?;
407
408 let manifest_path = index_dir.join("manifest.json");
409 let mut manifest = load_or_create_manifest(&manifest_path)?;
410
411 let entry = if compute_embeddings {
412 let model_registry = ck_models::ModelRegistry::default();
413 let (alias, config) = if let Some(existing) = manifest.embedding_model.as_deref() {
414 match model_registry.resolve(Some(existing)) {
415 Ok(resolved) => resolved,
416 Err(_) => (
417 existing.to_string(),
418 legacy_model_config(existing, manifest.embedding_dimensions),
419 ),
420 }
421 } else {
422 model_registry
423 .resolve(None)
424 .map_err(|e| anyhow::anyhow!(e.to_string()))?
425 };
426
427 manifest.embedding_model = Some(config.name.clone());
428 manifest.embedding_dimensions = Some(config.dimensions);
429 tracing::debug!("Using embedding model '{}' ({})", config.name, alias);
430
431 let mut embedder = ck_embed::create_embedder_for_config(&config, None)?;
432 index_single_file(file_path, &repo_root, Some(&mut embedder))?
433 } else {
434 index_single_file(file_path, &repo_root, None)?
435 };
436 let sidecar_path = get_sidecar_path(&repo_root, file_path);
437
438 save_index_entry(&sidecar_path, &entry)?;
439 let manifest_key = entry.metadata.path.clone();
440 manifest.files.insert(manifest_key, entry.metadata);
441 manifest.updated = SystemTime::now()
442 .duration_since(SystemTime::UNIX_EPOCH)
443 .unwrap()
444 .as_secs();
445
446 save_manifest(&manifest_path, &manifest)?;
447
448 Ok(())
449}
450
451pub async fn update_index(
452 path: &Path,
453 compute_embeddings: bool,
454 options: &ck_core::FileCollectionOptions,
455) -> Result<()> {
456 let index_dir = path.join(".ck");
457 if !index_dir.exists() {
458 return index_directory(
459 path,
460 compute_embeddings,
461 options,
462 None, )
464 .await;
465 }
466
467 let manifest_path = index_dir.join("manifest.json");
468 let mut manifest = load_or_create_manifest(&manifest_path)?;
469
470 let files = collect_files(path, options)?;
471
472 let updates: Vec<(PathBuf, IndexEntry)> = if compute_embeddings {
473 let model_registry = ck_models::ModelRegistry::default();
475 let (alias, config) = if let Some(existing) = manifest.embedding_model.as_deref() {
476 match model_registry.resolve(Some(existing)) {
477 Ok(resolved) => resolved,
478 Err(_) => (
479 existing.to_string(),
480 legacy_model_config(existing, manifest.embedding_dimensions),
481 ),
482 }
483 } else {
484 model_registry
485 .resolve(None)
486 .map_err(|e| anyhow::anyhow!(e.to_string()))?
487 };
488
489 manifest.embedding_model = Some(config.name.clone());
490 manifest.embedding_dimensions = Some(config.dimensions);
491 tracing::debug!(
492 "Updating index with embedding model '{}' ({})",
493 config.name,
494 alias
495 );
496
497 let mut embedder = ck_embed::create_embedder_for_config(&config, None)?;
498 files
499 .iter()
500 .filter_map(|file_path| {
501 let manifest_key =
502 path_utils::to_manifest_path(&path_utils::to_standard_path(file_path, path));
503
504 let needs_update = match manifest.files.get(&manifest_key) {
505 Some(metadata) => match compute_file_hash(file_path) {
506 Ok(hash) => hash != metadata.hash,
507 Err(_) => false,
508 },
509 None => true,
510 };
511 if needs_update {
512 match index_single_file(file_path, path, Some(&mut embedder)) {
513 Ok(entry) => Some((file_path.clone(), entry)),
514 Err(e) => {
515 let error_msg = e.to_string();
517 let is_binary_skip = error_msg.contains("Binary file, skipping");
518 let is_utf8_error =
519 error_msg.contains("stream did not contain valid UTF-8");
520 let is_git_file =
521 file_path.components().any(|c| c.as_os_str() == ".git");
522
523 if !(is_binary_skip || is_utf8_error && is_git_file) {
524 tracing::warn!("Failed to index {:?}: {}", file_path, e);
525 }
526 None
527 }
528 }
529 } else {
530 None
531 }
532 })
533 .collect()
534 } else {
535 files
537 .par_iter()
538 .filter_map(|file_path| {
539 let manifest_key =
540 path_utils::to_manifest_path(&path_utils::to_standard_path(file_path, path));
541
542 let needs_update = match manifest.files.get(&manifest_key) {
543 Some(metadata) => match compute_file_hash(file_path) {
544 Ok(hash) => hash != metadata.hash,
545 Err(_) => false,
546 },
547 None => true,
548 };
549
550 if needs_update {
551 match index_single_file(file_path, path, None) {
552 Ok(entry) => Some((file_path.clone(), entry)),
553 Err(e) => {
554 let error_msg = e.to_string();
556 let is_binary_skip = error_msg.contains("Binary file, skipping");
557 let is_utf8_error =
558 error_msg.contains("stream did not contain valid UTF-8");
559 let is_git_file =
560 file_path.components().any(|c| c.as_os_str() == ".git");
561
562 if !(is_binary_skip || is_utf8_error && is_git_file) {
563 tracing::warn!("Failed to index {:?}: {}", file_path, e);
564 }
565 None
566 }
567 }
568 } else {
569 None
570 }
571 })
572 .collect()
573 };
574
575 for (file_path, entry) in updates {
576 let sidecar_path = get_sidecar_path(path, &file_path);
577 save_index_entry(&sidecar_path, &entry)?;
578 let manifest_key = entry.metadata.path.clone();
579 manifest.files.insert(manifest_key, entry.metadata);
580 }
581
582 if !manifest.files.is_empty() {
583 manifest.updated = SystemTime::now()
584 .duration_since(SystemTime::UNIX_EPOCH)
585 .unwrap()
586 .as_secs();
587 save_manifest(&manifest_path, &manifest)?;
588 }
589
590 Ok(())
591}
592
593pub fn clean_index(path: &Path) -> Result<()> {
594 let index_dir = path.join(".ck");
595 if index_dir.exists() {
596 fs::remove_dir_all(&index_dir)?;
597 }
598 Ok(())
599}
600
601pub fn cleanup_index(
602 path: &Path,
603 options: &ck_core::FileCollectionOptions,
604) -> Result<CleanupStats> {
605 let index_dir = path.join(".ck");
606 if !index_dir.exists() {
607 return Ok(CleanupStats::default());
608 }
609
610 let manifest_path = index_dir.join("manifest.json");
611 let mut manifest = load_or_create_manifest(&manifest_path)?;
612 normalize_manifest_paths(&mut manifest, path);
613
614 let stats =
616 cleanup_validation::validate_and_cleanup_index(path, &index_dir, &mut manifest, options)?;
617
618 remove_empty_dirs(&index_dir)?;
622
623 if stats.orphaned_entries_removed > 0 {
625 manifest.updated = SystemTime::now()
626 .duration_since(SystemTime::UNIX_EPOCH)
627 .unwrap()
628 .as_secs();
629 save_manifest(&manifest_path, &manifest)?;
630 }
631
632 Ok(stats)
633}
634
635pub fn get_index_stats(path: &Path) -> Result<IndexStats> {
636 let index_dir = path.join(".ck");
637 if !index_dir.exists() {
638 return Ok(IndexStats::default());
639 }
640
641 let manifest_path = index_dir.join("manifest.json");
642 let mut manifest = load_or_create_manifest(&manifest_path)?;
643 normalize_manifest_paths(&mut manifest, path);
644
645 let mut stats = IndexStats {
646 total_files: manifest.files.len(),
647 index_created: manifest.created,
648 index_updated: manifest.updated,
649 ..Default::default()
650 };
651
652 for file_path in manifest.files.keys() {
654 let standard_path = path_utils::from_manifest_path(file_path);
655 let sidecar_path =
656 path_utils::get_sidecar_path_for_standard_path(&index_dir, &standard_path);
657 if sidecar_path.exists()
658 && let Ok(entry) = load_index_entry(&sidecar_path)
659 {
660 stats.total_chunks += entry.chunks.len();
661 stats.total_size_bytes += entry.metadata.size;
662
663 let embedded = entry
665 .chunks
666 .iter()
667 .filter(|c| c.embedding.is_some())
668 .count();
669 stats.embedded_chunks += embedded;
670 }
671 }
672
673 if let Ok(entries) = WalkDir::new(&index_dir)
675 .into_iter()
676 .collect::<Result<Vec<_>, _>>()
677 {
678 for entry in entries {
679 if entry.file_type().is_file()
680 && let Ok(metadata) = entry.metadata()
681 {
682 stats.index_size_bytes += metadata.len();
683 }
684 }
685 }
686
687 Ok(stats)
688}
689
690pub async fn smart_update_index(
691 path: &Path,
692 compute_embeddings: bool,
693 options: &ck_core::FileCollectionOptions,
694) -> Result<UpdateStats> {
695 smart_update_index_with_progress(
696 path,
697 false,
698 None,
699 compute_embeddings,
700 options,
701 None, )
703 .await
704}
705
706pub async fn smart_update_index_with_progress(
707 path: &Path,
708 force_rebuild: bool,
709 progress_callback: Option<ProgressCallback>,
710 compute_embeddings: bool,
711 options: &ck_core::FileCollectionOptions,
712 model: Option<&str>,
713) -> Result<UpdateStats> {
714 smart_update_index_with_detailed_progress(
715 path,
716 force_rebuild,
717 progress_callback,
718 None, compute_embeddings,
720 options,
721 model,
722 )
723 .await
724}
725
726pub async fn smart_update_index_with_detailed_progress(
728 path: &Path,
729 force_rebuild: bool,
730 progress_callback: Option<ProgressCallback>,
731 detailed_progress_callback: Option<DetailedProgressCallback>,
732 compute_embeddings: bool,
733 options: &ck_core::FileCollectionOptions,
734 model: Option<&str>,
735) -> Result<UpdateStats> {
736 let index_dir = path.join(".ck");
737 let mut stats = UpdateStats::default();
738
739 HANDLER_INIT.call_once(|| {
741 let _ = ctrlc::set_handler(move || {
742 INTERRUPTED.store(true, Ordering::SeqCst);
743 eprintln!("\nIndexing interrupted by user. Cleaning up...");
744 });
745 });
746
747 INTERRUPTED.store(false, Ordering::SeqCst);
749
750 if force_rebuild {
751 clean_index(path)?;
752 index_directory(path, compute_embeddings, options, model).await?;
753 let index_stats = get_index_stats(path)?;
754 stats.files_indexed = index_stats.total_files;
755 return Ok(stats);
756 }
757
758 let repo_root = find_repo_root(path)?;
760
761 fs::create_dir_all(&index_dir)?;
767 let manifest_path = index_dir.join("manifest.json");
768 let mut manifest = load_or_create_manifest(&manifest_path)?;
769 normalize_manifest_paths(&mut manifest, &repo_root);
770
771 let resolved_model = if compute_embeddings {
773 let model_registry = ck_models::ModelRegistry::default();
774
775 let resolved = if let Some(requested) = model {
776 model_registry
777 .resolve(Some(requested))
778 .map_err(|e| anyhow::anyhow!(e.to_string()))?
779 } else if let Some(existing_model) = &manifest.embedding_model {
780 match model_registry.resolve(Some(existing_model.as_str())) {
781 Ok(resolved) => resolved,
782 Err(_) => (
783 existing_model.clone(),
784 legacy_model_config(existing_model, manifest.embedding_dimensions),
785 ),
786 }
787 } else {
788 model_registry
789 .resolve(None)
790 .map_err(|e| anyhow::anyhow!(e.to_string()))?
791 };
792
793 if let Some(existing_model) = &manifest.embedding_model
794 && existing_model != &resolved.1.name
795 {
796 return Err(anyhow::anyhow!(
797 "Model mismatch: Index was created with '{}', but you're trying to use '{}'. \
798 Please run 'ck --clean .' to remove the old index, then 'ck --index --model {}' to rebuild with the new model.",
799 existing_model,
800 resolved.1.name,
801 model.unwrap_or("default")
802 ));
803 }
804
805 manifest.embedding_model = Some(resolved.1.name.clone());
806 manifest.embedding_dimensions = Some(resolved.1.dimensions);
807
808 Some(resolved)
809 } else {
810 None
811 };
812
813 let current_files = collect_files(path, options)?;
816
817 let mut files_to_update = Vec::new();
819 let mut manifest_changed = false;
820
821 for file_path in current_files {
822 if INTERRUPTED.load(Ordering::SeqCst) {
824 eprintln!("Indexing interrupted during file scanning.");
825 return Ok(stats);
826 }
827
828 let manifest_key =
829 path_utils::to_manifest_path(&path_utils::to_standard_path(&file_path, &repo_root));
830
831 if let Some(metadata) = manifest.files.get(&manifest_key) {
832 let fs_meta = match fs::metadata(&file_path) {
833 Ok(m) => m,
834 Err(_) => {
835 stats.files_errored += 1;
836 continue;
837 }
838 };
839
840 let fs_last_modified = match fs_meta.modified().and_then(|m| {
841 m.duration_since(SystemTime::UNIX_EPOCH)
842 .map_err(|_| std::io::Error::other("Time error"))
843 }) {
844 Ok(dur) => dur.as_secs(),
845 Err(_) => {
846 stats.files_errored += 1;
847 continue;
848 }
849 };
850 let fs_size = fs_meta.len();
851
852 if fs_last_modified == metadata.last_modified && fs_size == metadata.size {
853 stats.files_up_to_date += 1;
854 continue;
855 }
856
857 let hash = match compute_file_hash(&file_path) {
858 Ok(h) => h,
859 Err(_) => {
860 stats.files_errored += 1;
861 continue;
862 }
863 };
864
865 if hash != metadata.hash {
866 stats.files_modified += 1;
867 files_to_update.push(file_path);
868 } else {
869 stats.files_up_to_date += 1;
870 let standard_path = path_utils::to_standard_path(&file_path, &repo_root);
872 let manifest_path = path_utils::to_manifest_path(&standard_path);
873 let new_metadata = FileMetadata {
874 path: manifest_path.clone(),
875 hash,
876 last_modified: fs_last_modified,
877 size: fs_size,
878 };
879 manifest.files.insert(manifest_path, new_metadata);
880 manifest_changed = true;
881 }
882 } else {
883 stats.files_added += 1;
884 files_to_update.push(file_path);
885 }
886 }
887
888 if compute_embeddings {
890 let (_, config) = resolved_model
892 .as_ref()
893 .expect("resolved model must exist for embedding updates");
894 let mut embedder = ck_embed::create_embedder_for_config(config, None)?;
895 let mut _processed_count = 0;
896
897 for file_path in files_to_update.iter() {
898 if INTERRUPTED.load(Ordering::SeqCst) {
900 eprintln!(
901 "Indexing interrupted. {} files processed.",
902 _processed_count
903 );
904 break;
905 }
906
907 if let Some(ref callback) = progress_callback
908 && let Some(file_name) = file_path.file_name()
909 {
910 callback(&file_name.to_string_lossy());
911 }
912
913 let result = if let Some(ref detailed_callback) = detailed_progress_callback {
915 index_single_file_with_progress(
916 file_path,
917 path,
918 Some(&mut embedder),
919 Some(detailed_callback),
920 _processed_count,
921 files_to_update.len(),
922 )
923 } else {
924 index_single_file_with_progress(file_path, path, Some(&mut embedder), None, 0, 1)
925 };
926
927 match result {
928 Ok((entry, file_chunks_reused, file_chunks_embedded)) => {
929 stats.chunks_reused += file_chunks_reused;
931 stats.chunks_embedded += file_chunks_embedded;
932
933 let sidecar_path = get_sidecar_path(path, file_path);
935 save_index_entry(&sidecar_path, &entry)?;
936
937 let manifest_key = entry.metadata.path.clone();
939 manifest.files.insert(manifest_key, entry.metadata);
940 manifest.updated = SystemTime::now()
941 .duration_since(SystemTime::UNIX_EPOCH)
942 .unwrap()
943 .as_secs();
944 save_manifest(&manifest_path, &manifest)?;
945 _processed_count += 1;
946 }
947 Err(e) => {
948 let error_msg = e.to_string();
950 let is_binary_skip = error_msg.contains("Binary file, skipping");
951 let is_utf8_error = error_msg.contains("stream did not contain valid UTF-8");
952 let is_git_file = file_path.components().any(|c| c.as_os_str() == ".git");
953
954 if !(is_binary_skip || is_utf8_error && is_git_file) {
955 tracing::warn!("Failed to index {:?}: {}", file_path, e);
956 }
957 stats.files_errored += 1;
958 }
959 }
960 }
961
962 stats.files_indexed = _processed_count;
963 } else {
964 use std::sync::mpsc;
966 use std::thread;
967
968 let (tx, rx) = mpsc::channel();
969 let files_clone = files_to_update.clone();
970 let path_clone = path.to_path_buf();
971
972 let worker_handle = thread::spawn(move || {
974 use rayon::prelude::*;
975
976 let result = files_clone.par_iter().try_for_each(|file_path| {
978 if INTERRUPTED.load(Ordering::SeqCst) {
980 return Err("interrupted");
981 }
982
983 match index_single_file(file_path, &path_clone, None) {
984 Ok(entry) => {
985 if tx.send((file_path.clone(), entry)).is_err() {
986 return Err("receiver_dropped");
988 }
989 }
990 Err(e) => {
991 let error_msg = e.to_string();
993 let is_binary_skip = error_msg.contains("Binary file, skipping");
994 let is_utf8_error =
995 error_msg.contains("stream did not contain valid UTF-8");
996 let is_git_file = file_path.components().any(|c| c.as_os_str() == ".git");
997
998 if !(is_binary_skip || is_utf8_error && is_git_file) {
999 tracing::warn!("Failed to index {:?}: {}", file_path, e);
1000 }
1001 }
1002 }
1003 Ok(())
1004 });
1005
1006 if let Err(reason) = result {
1008 tracing::debug!("Worker thread stopped due to: {}", reason);
1009 }
1010 });
1011
1012 let mut _processed_count = 0;
1014 while let Ok((file_path, entry)) = rx.recv() {
1015 if INTERRUPTED.load(Ordering::SeqCst) {
1017 eprintln!(
1018 "Indexing interrupted. {} files processed.",
1019 _processed_count
1020 );
1021 drop(rx); break;
1023 }
1024
1025 if let Some(ref callback) = progress_callback
1026 && let Some(file_name) = file_path.file_name()
1027 {
1028 callback(&file_name.to_string_lossy());
1029 }
1030
1031 let sidecar_path = get_sidecar_path(path, &file_path);
1033 save_index_entry(&sidecar_path, &entry)?;
1034
1035 let manifest_key = entry.metadata.path.clone();
1037 manifest.files.insert(manifest_key, entry.metadata);
1038 manifest.updated = SystemTime::now()
1039 .duration_since(SystemTime::UNIX_EPOCH)
1040 .unwrap()
1041 .as_secs();
1042 save_manifest(&manifest_path, &manifest)?;
1043 _processed_count += 1;
1044 }
1045
1046 stats.files_indexed = _processed_count;
1047
1048 worker_handle
1050 .join()
1051 .map_err(|_| anyhow::anyhow!("Worker thread panicked"))?;
1052 }
1053
1054 if !compute_embeddings
1057 && (stats.files_indexed > 0 || stats.orphaned_files_removed > 0 || manifest_changed)
1058 {
1059 manifest.updated = SystemTime::now()
1060 .duration_since(SystemTime::UNIX_EPOCH)
1061 .unwrap()
1062 .as_secs();
1063 save_manifest(&manifest_path, &manifest)?;
1064 }
1065
1066 Ok(stats)
1067}
1068
1069fn index_single_file(
1070 file_path: &Path,
1071 repo_root: &Path,
1072 embedder: Option<&mut Box<dyn ck_embed::Embedder>>,
1073) -> Result<IndexEntry> {
1074 let (entry, _chunks_reused, _chunks_embedded) =
1075 index_single_file_with_progress(file_path, repo_root, embedder, None, 0, 1)?;
1076 Ok(entry)
1077}
1078
1079fn index_single_file_with_progress(
1080 file_path: &Path,
1081 repo_root: &Path,
1082 embedder: Option<&mut Box<dyn ck_embed::Embedder>>,
1083 detailed_progress: Option<&DetailedProgressCallback>,
1084 file_index: usize,
1085 total_files: usize,
1086) -> Result<(IndexEntry, usize, usize)> {
1087 if !is_text_file(file_path) {
1089 return Err(anyhow::anyhow!("Binary file, skipping"));
1090 }
1091
1092 let chunk_cache: HashMap<String, Vec<f32>> = if embedder.is_some() {
1094 let sidecar_path = get_sidecar_path(repo_root, file_path);
1095 if sidecar_path.exists() {
1096 match load_index_entry(&sidecar_path) {
1097 Ok(old_entry) => old_entry
1098 .chunks
1099 .into_iter()
1100 .filter_map(|chunk| {
1101 if let (Some(hash), Some(embedding)) = (chunk.chunk_hash, chunk.embedding) {
1102 Some((hash, embedding))
1103 } else {
1104 None
1105 }
1106 })
1107 .collect(),
1108 Err(_) => HashMap::new(),
1109 }
1110 } else {
1111 HashMap::new()
1112 }
1113 } else {
1114 HashMap::new()
1115 };
1116
1117 let content_path = preprocess_file(file_path, repo_root)?;
1119 let content = fs::read_to_string(&content_path)?;
1120
1121 let hash = compute_file_hash(file_path)?;
1123 let metadata = fs::metadata(file_path)?;
1124
1125 let standard_path = path_utils::to_standard_path(file_path, repo_root);
1126 let manifest_path = path_utils::to_manifest_path(&standard_path);
1127
1128 let file_metadata = FileMetadata {
1129 path: manifest_path,
1130 hash,
1131 last_modified: metadata
1132 .modified()?
1133 .duration_since(SystemTime::UNIX_EPOCH)?
1134 .as_secs(),
1135 size: metadata.len(),
1136 };
1137
1138 let lang = if ck_core::pdf::is_pdf_file(file_path) {
1140 Some(Language::Pdf)
1141 } else {
1142 ck_core::Language::from_path(file_path)
1143 };
1144
1145 let model_name = embedder.as_ref().map(|e| e.model_name());
1146 let chunks = ck_chunk::chunk_text_with_model(&content, lang, model_name)?;
1147
1148 let mut chunks_reused = 0;
1150 let mut chunks_embedded = 0;
1151
1152 let chunk_entries: Vec<ChunkEntry> = if let Some(embedder) = embedder {
1153 let total_chunks = chunks.len();
1154 let file_name = file_path
1155 .file_name()
1156 .unwrap_or_default()
1157 .to_string_lossy()
1158 .to_string();
1159
1160 if let Some(ref callback) = detailed_progress {
1162 tracing::info!(
1163 "Computing embeddings for {} chunks in {:?}",
1164 total_chunks,
1165 file_path
1166 );
1167
1168 let mut chunk_entries = Vec::new();
1169 for (chunk_index, chunk) in chunks.into_iter().enumerate() {
1170 if INTERRUPTED.load(Ordering::SeqCst) {
1171 return Err(anyhow::anyhow!(INDEX_INTERRUPTED_MSG));
1172 }
1173 callback(EmbeddingProgress {
1175 file_name: file_name.clone(),
1176 file_index,
1177 total_files,
1178 chunk_index,
1179 total_chunks,
1180 chunk_size: chunk.text.len(),
1181 });
1182
1183 let chunk_hash = compute_chunk_hash(
1186 &chunk.text,
1187 &chunk.metadata.leading_trivia,
1188 &chunk.metadata.trailing_trivia,
1189 );
1190
1191 let expected_dim = embedder.dim();
1193 let embedding = if let Some(cached_embedding) = chunk_cache.get(&chunk_hash) {
1194 if cached_embedding.len() == expected_dim {
1195 chunks_reused += 1;
1197 cached_embedding.clone()
1198 } else {
1199 chunks_embedded += 1;
1201 tracing::warn!(
1202 "Chunk in {:?} has cached embedding with dimension {} but current model expects {}. Re-embedding.",
1203 file_path,
1204 cached_embedding.len(),
1205 expected_dim
1206 );
1207 let embeddings = embedder.embed(std::slice::from_ref(&chunk.text))?;
1208 embeddings.into_iter().next().ok_or_else(|| {
1209 anyhow::anyhow!(
1210 "Embedder returned empty results for chunk {} in file {:?}. This may indicate an issue with the embedding model or chunk content.",
1211 chunk_index,
1212 file_path
1213 )
1214 })?
1215 }
1216 } else {
1217 chunks_embedded += 1;
1219 let embeddings = embedder.embed(std::slice::from_ref(&chunk.text))?;
1220 embeddings.into_iter().next().ok_or_else(|| {
1221 anyhow::anyhow!(
1222 "Embedder returned empty results for chunk {} in file {:?}. This may indicate an issue with the embedding model or chunk content.",
1223 chunk_index,
1224 file_path
1225 )
1226 })?
1227 };
1228
1229 let chunk_type_str = match chunk.chunk_type {
1230 ck_chunk::ChunkType::Function => Some("function".to_string()),
1231 ck_chunk::ChunkType::Class => Some("class".to_string()),
1232 ck_chunk::ChunkType::Method => Some("method".to_string()),
1233 ck_chunk::ChunkType::Module => Some("module".to_string()),
1234 ck_chunk::ChunkType::Text => None,
1235 };
1236
1237 let breadcrumb = chunk.metadata.breadcrumb.clone();
1238 let ancestry = if chunk.metadata.ancestry.is_empty() {
1239 None
1240 } else {
1241 Some(chunk.metadata.ancestry.clone())
1242 };
1243 let leading_trivia = if chunk.metadata.leading_trivia.is_empty() {
1244 None
1245 } else {
1246 Some(chunk.metadata.leading_trivia.clone())
1247 };
1248 let trailing_trivia = if chunk.metadata.trailing_trivia.is_empty() {
1249 None
1250 } else {
1251 Some(chunk.metadata.trailing_trivia.clone())
1252 };
1253
1254 chunk_entries.push(ChunkEntry {
1255 span: chunk.span,
1256 embedding: Some(embedding),
1257 chunk_type: chunk_type_str,
1258 breadcrumb,
1259 ancestry,
1260 byte_length: Some(chunk.metadata.byte_length),
1261 estimated_tokens: Some(chunk.metadata.estimated_tokens),
1262 leading_trivia,
1263 trailing_trivia,
1264 chunk_hash: Some(chunk_hash),
1265 });
1266 }
1267 chunk_entries
1268 } else {
1269 let expected_dim = embedder.dim();
1272 let mut chunks_to_embed = Vec::new();
1273 let mut chunk_results: Vec<(ck_chunk::Chunk, String, Option<Vec<f32>>)> = Vec::new();
1274
1275 for chunk in chunks {
1276 let chunk_hash = compute_chunk_hash(
1278 &chunk.text,
1279 &chunk.metadata.leading_trivia,
1280 &chunk.metadata.trailing_trivia,
1281 );
1282 if let Some(cached_embedding) = chunk_cache.get(&chunk_hash) {
1283 if cached_embedding.len() == expected_dim {
1284 chunks_reused += 1;
1286 chunk_results.push((chunk, chunk_hash, Some(cached_embedding.clone())));
1287 } else {
1288 tracing::warn!(
1290 "Chunk in {:?} has cached embedding with dimension {} but current model expects {}. Re-embedding.",
1291 file_path,
1292 cached_embedding.len(),
1293 expected_dim
1294 );
1295 chunks_to_embed.push((chunk.text.clone(), chunk_results.len()));
1296 chunk_results.push((chunk, chunk_hash, None));
1297 }
1298 } else {
1299 chunks_to_embed.push((chunk.text.clone(), chunk_results.len()));
1301 chunk_results.push((chunk, chunk_hash, None));
1302 }
1303 }
1304
1305 if !chunks_to_embed.is_empty() {
1307 let texts: Vec<String> = chunks_to_embed
1308 .iter()
1309 .map(|(text, _)| text.clone())
1310 .collect();
1311 tracing::info!(
1312 "Computing embeddings for {}/{} chunks in {:?} ({} reused from cache)",
1313 texts.len(),
1314 chunk_results.len(),
1315 file_path,
1316 chunks_reused
1317 );
1318 let embeddings = embedder.embed(&texts)?;
1319
1320 if embeddings.len() != chunks_to_embed.len() {
1321 return Err(anyhow::anyhow!(
1322 "Embedder returned {} embeddings for {} chunks in file {:?}. Expected equal counts.",
1323 embeddings.len(),
1324 chunks_to_embed.len(),
1325 file_path
1326 ));
1327 }
1328
1329 chunks_embedded += embeddings.len();
1330
1331 for ((_, result_idx), embedding) in chunks_to_embed.into_iter().zip(embeddings) {
1333 chunk_results[result_idx].2 = Some(embedding);
1334 }
1335 }
1336
1337 chunk_results
1338 .into_iter()
1339 .map(|(chunk, chunk_hash, embedding)| {
1340 let embedding = embedding.expect("All chunks should have embeddings by now");
1341 let chunk_type_str = match chunk.chunk_type {
1342 ck_chunk::ChunkType::Function => Some("function".to_string()),
1343 ck_chunk::ChunkType::Class => Some("class".to_string()),
1344 ck_chunk::ChunkType::Method => Some("method".to_string()),
1345 ck_chunk::ChunkType::Module => Some("module".to_string()),
1346 ck_chunk::ChunkType::Text => None,
1347 };
1348 let breadcrumb = chunk.metadata.breadcrumb.clone();
1349 let ancestry = if chunk.metadata.ancestry.is_empty() {
1350 None
1351 } else {
1352 Some(chunk.metadata.ancestry.clone())
1353 };
1354 let leading_trivia = if chunk.metadata.leading_trivia.is_empty() {
1355 None
1356 } else {
1357 Some(chunk.metadata.leading_trivia.clone())
1358 };
1359 let trailing_trivia = if chunk.metadata.trailing_trivia.is_empty() {
1360 None
1361 } else {
1362 Some(chunk.metadata.trailing_trivia.clone())
1363 };
1364 ChunkEntry {
1365 span: chunk.span,
1366 embedding: Some(embedding),
1367 chunk_type: chunk_type_str,
1368 breadcrumb,
1369 ancestry,
1370 byte_length: Some(chunk.metadata.byte_length),
1371 estimated_tokens: Some(chunk.metadata.estimated_tokens),
1372 leading_trivia,
1373 trailing_trivia,
1374 chunk_hash: Some(chunk_hash),
1375 }
1376 })
1377 .collect()
1378 }
1379 } else {
1380 chunks
1382 .into_iter()
1383 .map(|chunk| {
1384 let chunk_type_str = match chunk.chunk_type {
1385 ck_chunk::ChunkType::Function => Some("function".to_string()),
1386 ck_chunk::ChunkType::Class => Some("class".to_string()),
1387 ck_chunk::ChunkType::Method => Some("method".to_string()),
1388 ck_chunk::ChunkType::Module => Some("module".to_string()),
1389 ck_chunk::ChunkType::Text => None,
1390 };
1391 let breadcrumb = chunk.metadata.breadcrumb.clone();
1392 let ancestry = if chunk.metadata.ancestry.is_empty() {
1393 None
1394 } else {
1395 Some(chunk.metadata.ancestry.clone())
1396 };
1397 let leading_trivia = if chunk.metadata.leading_trivia.is_empty() {
1398 None
1399 } else {
1400 Some(chunk.metadata.leading_trivia.clone())
1401 };
1402 let trailing_trivia = if chunk.metadata.trailing_trivia.is_empty() {
1403 None
1404 } else {
1405 Some(chunk.metadata.trailing_trivia.clone())
1406 };
1407 ChunkEntry {
1408 span: chunk.span,
1409 embedding: None,
1410 chunk_type: chunk_type_str,
1411 breadcrumb,
1412 ancestry,
1413 byte_length: Some(chunk.metadata.byte_length),
1414 estimated_tokens: Some(chunk.metadata.estimated_tokens),
1415 leading_trivia: leading_trivia.clone(),
1416 trailing_trivia: trailing_trivia.clone(),
1417 chunk_hash: Some(compute_chunk_hash(
1418 &chunk.text,
1419 &chunk.metadata.leading_trivia,
1420 &chunk.metadata.trailing_trivia,
1421 )),
1422 }
1423 })
1424 .collect()
1425 };
1426
1427 Ok((
1428 IndexEntry {
1429 metadata: file_metadata,
1430 chunks: chunk_entries,
1431 },
1432 chunks_reused,
1433 chunks_embedded,
1434 ))
1435}
1436
1437fn load_or_create_manifest(path: &Path) -> Result<IndexManifest> {
1438 let mut manifest = if path.exists() {
1439 let data = fs::read(path)?;
1440 serde_json::from_slice(&data)?
1441 } else {
1442 IndexManifest::default()
1443 };
1444
1445 if manifest.chunk_hash_version.is_none() {
1448 manifest.chunk_hash_version = Some(2);
1449 }
1450
1451 Ok(manifest)
1452}
1453
1454fn normalize_manifest_paths(manifest: &mut IndexManifest, repo_root: &Path) {
1455 let original_entries = std::mem::take(&mut manifest.files);
1456 let mut normalized = HashMap::with_capacity(original_entries.len());
1457
1458 for (key, mut metadata) in original_entries {
1459 let standard_key = if key.is_absolute() {
1460 path_utils::to_standard_path(&key, repo_root)
1461 } else {
1462 path_utils::from_manifest_path(&key)
1463 };
1464 let manifest_key = path_utils::to_manifest_path(&standard_key);
1465
1466 let metadata_standard = if metadata.path.is_absolute() {
1467 path_utils::to_standard_path(&metadata.path, repo_root)
1468 } else {
1469 path_utils::from_manifest_path(&metadata.path)
1470 };
1471 metadata.path = path_utils::to_manifest_path(&metadata_standard);
1472
1473 normalized.insert(manifest_key, metadata);
1474 }
1475
1476 manifest.files = normalized;
1477}
1478
1479fn save_manifest(path: &Path, manifest: &IndexManifest) -> Result<()> {
1480 let data = serde_json::to_vec_pretty(manifest)?;
1481 atomic_write(path, &data)
1482}
1483
1484fn save_index_entry(path: &Path, entry: &IndexEntry) -> Result<()> {
1485 let data = bincode::serialize(entry)?;
1486 atomic_write(path, &data)
1487}
1488
1489fn atomic_write(path: &Path, data: &[u8]) -> Result<()> {
1490 let parent = path.parent().unwrap_or_else(|| Path::new("."));
1491 fs::create_dir_all(parent)?;
1492
1493 let mut tmp = NamedTempFile::new_in(parent)?;
1494 tmp.write_all(data)?;
1495 tmp.as_file().sync_all()?;
1496
1497 if path.exists() {
1498 fs::remove_file(path)?;
1499 }
1500
1501 tmp.persist(path)?;
1502 Ok(())
1503}
1504
1505pub fn load_index_entry(path: &Path) -> Result<IndexEntry> {
1506 let data = fs::read(path)?;
1507 Ok(bincode::deserialize(&data)?)
1508}
1509
/// Finds the nearest directory at or above `path` that contains a `.ck` or
/// `.git` entry.
///
/// When `path` is a file the search starts from its parent directory. If no
/// ancestor contains either marker, `path` itself is returned unchanged.
fn find_repo_root(path: &Path) -> Result<PathBuf> {
    let start = if path.is_file() {
        path.parent().unwrap_or(path)
    } else {
        path
    };

    // `ancestors` yields `start` first and then each successive parent.
    let root = start
        .ancestors()
        .find(|dir| dir.join(".ck").exists() || dir.join(".git").exists())
        .unwrap_or(path);

    Ok(root.to_path_buf())
}
1528
/// Decides whether the cached extraction at `cache_path` is out of date with
/// respect to `source_path`.
///
/// Returns `true` when the cache file does not exist, or when the source's
/// modification time is strictly later than the cache's.
///
/// # Errors
///
/// Returns an error if the cache exists but metadata or modification time of
/// either file cannot be read.
fn should_reextract(source_path: &Path, cache_path: &Path) -> Result<bool> {
    if cache_path.exists() {
        let source_mtime = fs::metadata(source_path)?.modified()?;
        let cache_mtime = fs::metadata(cache_path)?.modified()?;
        Ok(cache_mtime < source_mtime)
    } else {
        // Nothing cached yet, so extraction is always required.
        Ok(true)
    }
}
1540
1541fn extract_pdf_text(path: &Path) -> Result<String> {
1543 pdf_extract::extract_text(path)
1544 .map_err(|e| anyhow::anyhow!("Failed to extract text from PDF {}: {}", path.display(), e))
1545}
1546
1547fn preprocess_file(file_path: &Path, repo_root: &Path) -> Result<PathBuf> {
1551 if ck_core::pdf::is_pdf_file(file_path) {
1552 let cache_path = ck_core::pdf::get_content_cache_path(repo_root, file_path);
1553
1554 if should_reextract(file_path, &cache_path)? {
1556 tracing::debug!(
1557 "Extracting PDF content from {:?} to {:?}",
1558 file_path,
1559 cache_path
1560 );
1561 let extracted_text = extract_pdf_text(file_path)?;
1562
1563 if let Some(parent) = cache_path.parent() {
1565 fs::create_dir_all(parent)?;
1566 }
1567
1568 fs::write(&cache_path, extracted_text)?;
1570 }
1571
1572 Ok(cache_path) } else {
1574 Ok(file_path.to_path_buf()) }
1576}
1577
1578fn is_text_file(path: &Path) -> bool {
1579 if ck_core::pdf::is_pdf_file(path) {
1581 return true;
1582 }
1583
1584 const BUFFER_SIZE: usize = 8192;
1586
1587 match std::fs::File::open(path) {
1588 Ok(mut file) => {
1589 let mut buffer = vec![0; BUFFER_SIZE];
1590 match file.read(&mut buffer) {
1591 Ok(bytes_read) => {
1592 if bytes_read == 0 {
1594 return true;
1595 }
1596
1597 !buffer[..bytes_read].contains(&0)
1599 }
1600 Err(_) => false, }
1602 }
1603 Err(_) => false, }
1605}
1606
/// Test-only helper that maps a sidecar path back to the path it was derived
/// from, relative to `index_dir`.
///
/// Returns `None` when `sidecar_path` is not located under `index_dir`.
/// Otherwise the final extension is removed, and if the remaining file name
/// still ends in `.ck` that suffix is removed as well. `_repo_root` is unused.
#[cfg(test)]
fn sidecar_to_original_path(
    sidecar_path: &Path,
    index_dir: &Path,
    _repo_root: &Path,
) -> Option<PathBuf> {
    let trimmed = sidecar_path
        .strip_prefix(index_dir)
        .ok()?
        .with_extension("");

    // Drop a second trailing ".ck" from the file name when one is present.
    let renamed = trimmed.file_name().and_then(|name| {
        name.to_string_lossy()
            .strip_suffix(".ck")
            .map(|bare| trimmed.with_file_name(bare))
    });

    Some(renamed.unwrap_or(trimmed))
}
1628
/// Recursively deletes empty subdirectories beneath `dir`.
///
/// `dir` itself is never removed. A path that is not a directory is a no-op.
/// Failure to delete an individual empty directory is ignored.
///
/// # Errors
///
/// Returns an error if a directory listing cannot be read.
fn remove_empty_dirs(dir: &Path) -> Result<()> {
    if dir.is_dir() {
        for child in fs::read_dir(dir)? {
            let child_path = child?.path();
            if !child_path.is_dir() {
                continue;
            }

            // Clean the subtree first so that nested empty directories
            // collapse from the bottom up.
            remove_empty_dirs(&child_path)?;

            let now_empty = fs::read_dir(&child_path)?.next().is_none();
            if now_empty {
                let _ = fs::remove_dir(&child_path);
            }
        }
    }

    Ok(())
}
1648
/// Counters describing what an index cleanup pass removed.
#[derive(Debug, Clone, Default, Serialize, Deserialize)]
pub struct CleanupStats {
    /// Number of manifest entries that were removed.
    pub orphaned_entries_removed: usize,
    /// Number of sidecar files that were deleted from the index directory.
    pub orphaned_sidecars_removed: usize,
}
1654
/// Summary figures describing an index.
///
/// NOTE(review): the code that fills this struct is not visible here; the
/// field descriptions below are inferred from the field names and should be
/// confirmed against `get_index_stats`.
#[derive(Debug, Clone, Default, Serialize, Deserialize)]
pub struct IndexStats {
    /// Number of files tracked by the index.
    pub total_files: usize,
    /// Presumably the number of chunks across all files (confirm).
    pub total_chunks: usize,
    /// Presumably the number of chunks that carry an embedding (confirm).
    pub embedded_chunks: usize,
    /// Presumably the combined size of the indexed source files, in bytes (confirm).
    pub total_size_bytes: u64,
    /// Presumably the on-disk size of the index itself, in bytes (confirm).
    pub index_size_bytes: u64,
    /// Presumably a timestamp for index creation; unit not shown here (confirm).
    pub index_created: u64,
    /// Presumably a timestamp for the last index update; unit not shown here (confirm).
    pub index_updated: u64,
}
1665
/// Counters describing the outcome of an incremental index update.
///
/// NOTE(review): the code that fills this struct is not visible here. The
/// descriptions of the first four fields follow the expectations asserted in
/// `test_smart_update_index`; the rest are inferred from the field names.
#[derive(Debug, Clone, Default, Serialize, Deserialize)]
pub struct UpdateStats {
    /// Files that were (re)indexed in this run; 0 when nothing changed.
    pub files_indexed: usize,
    /// Files seen for the first time in this run.
    pub files_added: usize,
    /// Previously indexed files whose content changed.
    pub files_modified: usize,
    /// Previously indexed files that needed no work.
    pub files_up_to_date: usize,
    /// Presumably files whose indexing failed (confirm).
    pub files_errored: usize,
    /// Presumably index entries removed because their file no longer exists (confirm).
    pub orphaned_files_removed: usize,
    /// Presumably chunks whose cached embedding was reused (confirm).
    pub chunks_reused: usize,
    /// Presumably chunks for which a new embedding was computed (confirm).
    pub chunks_embedded: usize,
}
1677
#[cfg(test)]
mod tests {
    use super::*;
    use std::fs;
    use tempfile::TempDir;

    /// Test embedder whose `embed` always returns an empty list of vectors,
    /// regardless of how many texts it is given.
    struct EmptyResultsEmbedder;

    impl ck_embed::Embedder for EmptyResultsEmbedder {
        fn id(&self) -> &'static str {
            "empty-results-test"
        }

        fn dim(&self) -> usize {
            384
        }

        fn model_name(&self) -> &str {
            "test-empty-results"
        }

        fn embed(&mut self, _texts: &[String]) -> Result<Vec<Vec<f32>>> {
            // Deliberately ignore the input and produce no embeddings.
            Ok(Vec::new())
        }
    }

    /// Test embedder that returns one embedding fewer than the number of
    /// texts it is given (and none for an empty input).
    struct MismatchedCountEmbedder;

    impl ck_embed::Embedder for MismatchedCountEmbedder {
        fn id(&self) -> &'static str {
            "mismatched-count-test"
        }

        fn dim(&self) -> usize {
            384
        }

        fn model_name(&self) -> &str {
            "test-mismatched-count"
        }

        fn embed(&mut self, texts: &[String]) -> Result<Vec<Vec<f32>>> {
            if texts.is_empty() {
                // Guard so that `texts.len() - 1` below cannot underflow.
                Ok(Vec::new())
            } else {
                // One zero-filled vector short of what was requested.
                Ok(vec![vec![0.0; self.dim()]; texts.len() - 1])
            }
        }
    }

    /// An embedder that returns nothing must surface as a count-mismatch error
    /// that names the file.
    #[test]
    fn test_index_single_file_handles_empty_embedding_results() {
        let temp_dir = TempDir::new().unwrap();
        let test_path = temp_dir.path();

        // Single small text file to index.
        let test_file = test_path.join("test.txt");
        fs::write(&test_file, "hello world").unwrap();

        let mut empty_embedder: Box<dyn ck_embed::Embedder> = Box::new(EmptyResultsEmbedder);

        let result = index_single_file(&test_file, test_path, Some(&mut empty_embedder));

        // Indexing must fail with a descriptive message rather than panic.
        assert!(result.is_err());
        let error_msg = result.unwrap_err().to_string();
        assert!(error_msg.contains("Embedder returned 0 embeddings for 1 chunks"));
        assert!(error_msg.contains("Expected equal counts"));
        assert!(error_msg.contains("test.txt"));
    }

    /// Same scenario as above, but through the variant that takes a detailed
    /// progress callback; the error then names the chunk index.
    #[test]
    fn test_index_single_file_with_progress_handles_empty_embedding_results() {
        let temp_dir = TempDir::new().unwrap();
        let test_path = temp_dir.path();

        // Single small text file to index.
        let test_file = test_path.join("test.txt");
        fs::write(&test_file, "hello world").unwrap();

        let mut empty_embedder: Box<dyn ck_embed::Embedder> = Box::new(EmptyResultsEmbedder);

        // No-op callback; only its presence matters for this test.
        let dummy_callback: DetailedProgressCallback = Box::new(|_progress: EmbeddingProgress| {});
        let result = index_single_file_with_progress(
            &test_file,
            test_path,
            Some(&mut empty_embedder),
            Some(&dummy_callback),
            0,
            1,
        );

        assert!(result.is_err());
        let error_msg = result.unwrap_err().to_string();
        assert!(error_msg.contains("Embedder returned empty results"));
        assert!(error_msg.contains("chunk 0"));
        assert!(error_msg.contains("test.txt"));
    }

    /// An embedder that returns too few embeddings must produce the
    /// count-mismatch error.
    #[test]
    fn test_index_single_file_handles_mismatched_embedding_count() {
        let temp_dir = TempDir::new().unwrap();
        let test_path = temp_dir.path();

        // Rust source containing two functions.
        let test_file = test_path.join("test.rs");
        fs::write(
            &test_file,
            "fn main() {\n println!(\"hello\");\n}\n\nfn other() {\n println!(\"world\");\n}",
        )
        .unwrap();

        let mut mismatched_embedder: Box<dyn ck_embed::Embedder> =
            Box::new(MismatchedCountEmbedder);

        let result = index_single_file(&test_file, test_path, Some(&mut mismatched_embedder));

        assert!(result.is_err());
        let error_msg = result.unwrap_err().to_string();
        assert!(error_msg.contains("Embedder returned"));
        assert!(error_msg.contains("embeddings for"));
        assert!(error_msg.contains("chunks"));
        assert!(error_msg.contains("Expected equal counts"));
    }

    /// Sanity check: a well-behaved embedder still yields an entry in which
    /// every chunk has a 384-dimensional embedding.
    #[test]
    fn test_index_single_file_with_valid_embedder_still_works() {
        let temp_dir = TempDir::new().unwrap();
        let test_path = temp_dir.path();

        // Single small text file to index.
        let test_file = test_path.join("test.txt");
        fs::write(&test_file, "hello world").unwrap();

        let dummy_embedder = ck_embed::DummyEmbedder::new();
        let mut boxed_embedder: Box<dyn ck_embed::Embedder> = Box::new(dummy_embedder);

        let result = index_single_file(&test_file, test_path, Some(&mut boxed_embedder));

        assert!(result.is_ok());
        let entry = result.unwrap();
        assert!(!entry.chunks.is_empty());
        for chunk in &entry.chunks {
            assert!(chunk.embedding.is_some());
            assert_eq!(chunk.embedding.as_ref().unwrap().len(), 384);
        }
    }

    /// Walks `smart_update_index` through add, no-op, modify and add-second-file
    /// runs and checks the reported counters after each.
    #[tokio::test]
    async fn test_smart_update_index() {
        let temp_dir = TempDir::new().unwrap();
        let test_path = temp_dir.path();

        // Initial file.
        fs::write(test_path.join("file1.txt"), "initial content").unwrap();

        let file_options = ck_core::FileCollectionOptions {
            respect_gitignore: true,
            use_ckignore: true,
            exclude_patterns: vec![],
        };

        // First run: the file is new.
        let stats1 = smart_update_index(test_path, false, &file_options)
            .await
            .unwrap();
        assert_eq!(stats1.files_added, 1);
        assert_eq!(stats1.files_indexed, 1);

        // Second run without changes: nothing is re-indexed.
        let stats2 = smart_update_index(test_path, false, &file_options)
            .await
            .unwrap();
        assert_eq!(stats2.files_up_to_date, 1);
        assert_eq!(stats2.files_indexed, 0);

        // Change the file's content: it is reported as modified.
        fs::write(test_path.join("file1.txt"), "modified content").unwrap();
        let stats3 = smart_update_index(test_path, false, &file_options)
            .await
            .unwrap();
        assert_eq!(stats3.files_modified, 1);
        assert_eq!(stats3.files_indexed, 1);

        // Add a second file: only the new one is indexed.
        fs::write(test_path.join("file2.txt"), "new file content").unwrap();
        let stats4 = smart_update_index(test_path, false, &file_options)
            .await
            .unwrap();
        assert_eq!(stats4.files_added, 1);
        assert_eq!(stats4.files_up_to_date, 1);
        assert_eq!(stats4.files_indexed, 1);
    }

    /// A manifest entry for a file that does not exist on disk is removed by
    /// `cleanup_index`.
    #[test]
    fn test_cleanup_index() {
        let temp_dir = TempDir::new().unwrap();
        let test_path = temp_dir.path();

        // Hand-built index directory.
        let index_dir = test_path.join(".ck");
        fs::create_dir_all(&index_dir).unwrap();

        // Manifest that references a file which was never created.
        let mut manifest = IndexManifest::default();
        manifest.files.insert(
            test_path.join("deleted_file.txt"),
            FileMetadata {
                path: test_path.join("deleted_file.txt"),
                hash: "fake_hash".to_string(),
                last_modified: 0,
                size: 0,
            },
        );

        let manifest_path = index_dir.join("manifest.json");
        save_manifest(&manifest_path, &manifest).unwrap();

        let file_options = ck_core::FileCollectionOptions {
            respect_gitignore: true,
            use_ckignore: true,
            exclude_patterns: vec![],
        };
        let stats = cleanup_index(test_path, &file_options).unwrap();
        assert_eq!(stats.orphaned_entries_removed, 1);

        // The saved manifest no longer lists the missing file.
        let updated_manifest = load_or_create_manifest(&manifest_path).unwrap();
        assert_eq!(updated_manifest.files.len(), 0);
    }

    /// `get_index_stats` reports zero files without an index and counts the
    /// entries of a saved manifest.
    #[test]
    fn test_get_index_stats() {
        let temp_dir = TempDir::new().unwrap();
        let test_path = temp_dir.path();

        // No index directory yet.
        let stats = get_index_stats(test_path).unwrap();
        assert_eq!(stats.total_files, 0);

        // Hand-built index with a single manifest entry.
        let index_dir = test_path.join(".ck");
        fs::create_dir_all(&index_dir).unwrap();

        let mut manifest = IndexManifest::default();
        manifest.files.insert(
            test_path.join("test.txt"),
            FileMetadata {
                path: test_path.join("test.txt"),
                hash: "test_hash".to_string(),
                last_modified: 1234567890,
                size: 100,
            },
        );

        let manifest_path = index_dir.join("manifest.json");
        save_manifest(&manifest_path, &manifest).unwrap();

        let stats = get_index_stats(test_path).unwrap();
        assert_eq!(stats.total_files, 1);
    }

    /// Sidecar paths map back to index-relative source paths, both at the top
    /// level and in a nested directory.
    #[test]
    fn test_sidecar_to_original_path() {
        let temp_dir = TempDir::new().unwrap();
        let index_dir = temp_dir.path().join(".ck");

        // Top-level sidecar.
        let sidecar = index_dir.join("test.txt.ck");
        let original = sidecar_to_original_path(&sidecar, &index_dir, temp_dir.path());
        assert_eq!(original, Some(PathBuf::from("test.txt")));

        // Sidecar inside a subdirectory.
        let nested_sidecar = index_dir.join("src").join("main.rs.ck");
        let nested_original =
            sidecar_to_original_path(&nested_sidecar, &index_dir, temp_dir.path());
        assert_eq!(nested_original, Some(PathBuf::from("src/main.rs")));
    }

    /// Exercises `is_text_file` on text, log, extension-less, binary, empty
    /// and missing files.
    #[test]
    fn test_is_text_file() {
        use std::fs::File;
        use std::io::Write;
        use tempfile::TempDir;

        let temp_dir = TempDir::new().unwrap();
        let temp_path = temp_dir.path();

        // Plain text file.
        let text_file = temp_path.join("test.txt");
        let mut file = File::create(&text_file).unwrap();
        file.write_all(b"Hello world\nThis is text content")
            .unwrap();
        assert!(is_text_file(&text_file));

        // Log file: the extension plays no role.
        let log_file = temp_path.join("app.log");
        let mut file = File::create(&log_file).unwrap();
        file.write_all(b"2024-01-15 ERROR: Failed to connect")
            .unwrap();
        assert!(is_text_file(&log_file));

        // File without any extension.
        let no_ext_file = temp_path.join("README");
        let mut file = File::create(&no_ext_file).unwrap();
        file.write_all(b"This is a README file").unwrap();
        assert!(is_text_file(&no_ext_file));

        // Content with an embedded NUL byte is classified as binary.
        let binary_file = temp_path.join("test.bin");
        let mut file = File::create(&binary_file).unwrap();
        file.write_all(&[
            0x48, 0x65, 0x6C, 0x6C, 0x6F, 0x00, 0x57, 0x6F, 0x72, 0x6C, 0x64,
        ])
        .unwrap();
        assert!(!is_text_file(&binary_file));

        // An empty file counts as text.
        let empty_file = temp_path.join("empty.txt");
        File::create(&empty_file).unwrap();
        assert!(is_text_file(&empty_file));

        // A file that cannot be opened is not text.
        let nonexistent = temp_path.join("nonexistent.txt");
        assert!(!is_text_file(&nonexistent));
    }

    /// A chain of nested empty directories is removed completely.
    #[test]
    fn test_remove_empty_dirs() {
        let temp_dir = TempDir::new().unwrap();
        let test_path = temp_dir.path();

        // Three levels of empty directories.
        let nested_dir = test_path.join("level1").join("level2").join("level3");
        fs::create_dir_all(&nested_dir).unwrap();

        remove_empty_dirs(test_path).unwrap();

        // Every level below the root is gone.
        assert!(!nested_dir.exists());
        assert!(!test_path.join("level1").join("level2").exists());
        assert!(!test_path.join("level1").exists());
    }

    /// `.git/info/exclude` hides files only while `respect_gitignore` is on.
    #[test]
    fn test_no_ignore_disables_git_exclude() {
        let temp_dir = TempDir::new().unwrap();
        let test_path = temp_dir.path();

        // Minimal `.git` layout so the exclude file has a home.
        fs::create_dir_all(test_path.join(".git/info")).unwrap();

        // A file that is always visible.
        fs::write(test_path.join("visible.txt"), "visible content").unwrap();

        // A directory that the exclude file will hide.
        let excluded_dir = test_path.join("excluded_dir");
        fs::create_dir(&excluded_dir).unwrap();
        fs::write(excluded_dir.join("hidden.txt"), "hidden content").unwrap();

        // Exclude rule for that directory.
        fs::write(test_path.join(".git/info/exclude"), "/excluded_dir\n").unwrap();

        // With gitignore handling on, only the visible file is collected.
        let options_respect = ck_core::FileCollectionOptions {
            respect_gitignore: true,
            use_ckignore: false,
            exclude_patterns: vec![],
        };
        let files = collect_files(test_path, &options_respect).unwrap();
        assert_eq!(
            files.len(),
            1,
            "With respect_gitignore=true, .git/info/exclude should hide files, found: {:?}",
            files
        );

        // With gitignore handling off, both files are collected.
        let options_no_ignore = ck_core::FileCollectionOptions {
            respect_gitignore: false,
            use_ckignore: false,
            exclude_patterns: vec![],
        };
        let files = collect_files(test_path, &options_no_ignore).unwrap();
        assert_eq!(
            files.len(),
            2,
            "With respect_gitignore=false, .git/info/exclude should be ignored, found: {:?}",
            files
        );
    }

    /// `.ckignore` is honoured independently of `.gitignore`.
    #[test]
    fn test_ckignore_works_without_gitignore() {
        let temp_dir = TempDir::new().unwrap();
        let test_path = temp_dir.path();

        // One ignore file of each kind, with different patterns.
        fs::write(test_path.join(".gitignore"), "*.git\n").unwrap();
        fs::write(test_path.join(".ckignore"), "*.ck\n").unwrap();

        // One file matching neither pattern and one matching each.
        fs::write(test_path.join("normal.txt"), "normal content").unwrap();
        fs::write(test_path.join("ignored_by_git.git"), "git ignored").unwrap();
        fs::write(test_path.join("ignored_by_ck.ck"), "ck ignored").unwrap();

        // gitignore off, ckignore on.
        let options = ck_core::FileCollectionOptions {
            respect_gitignore: false,
            use_ckignore: true,
            exclude_patterns: vec![],
        };

        let files = collect_files(test_path, &options).unwrap();
        let file_names: Vec<String> = files
            .iter()
            .filter_map(|p| p.file_name())
            .map(|n| n.to_string_lossy().to_string())
            .collect();

        assert!(
            file_names.contains(&"normal.txt".to_string()),
            "Should find normal.txt"
        );

        assert!(
            file_names.contains(&"ignored_by_git.git".to_string()),
            "Should find .git file when respect_gitignore=false"
        );

        assert!(
            !file_names.contains(&"ignored_by_ck.ck".to_string()),
            "Should NOT find .ck file when use_ckignore=true"
        );

        // Both mechanisms off: every file is collected.
        let options_both_disabled = ck_core::FileCollectionOptions {
            respect_gitignore: false,
            use_ckignore: false,
            exclude_patterns: vec![],
        };

        let files_all = collect_files(test_path, &options_both_disabled).unwrap();
        let file_names_all: Vec<String> = files_all
            .iter()
            .filter_map(|p| p.file_name())
            .map(|n| n.to_string_lossy().to_string())
            .collect();

        assert!(
            file_names_all.contains(&"ignored_by_git.git".to_string()),
            "Should find .git file"
        );
        assert!(
            file_names_all.contains(&"ignored_by_ck.ck".to_string()),
            "Should find .ck file when use_ckignore=false"
        );
    }
}
2158
2159mod cleanup_validation {
2165 use super::*;
2166 pub fn validate_and_cleanup_index(
2170 repo_root: &Path,
2171 index_dir: &Path,
2172 manifest: &mut IndexManifest,
2173 options: &ck_core::FileCollectionOptions,
2174 ) -> Result<CleanupStats> {
2175 let mut stats = CleanupStats::default();
2176
2177 let existing_files = collect_files_as_hashset(repo_root, options)?;
2179 let standard_existing_files: HashSet<PathBuf> = existing_files
2180 .into_iter()
2181 .map(|path| path_utils::to_standard_path(&path, repo_root))
2182 .collect();
2183
2184 let manifest_entries: Vec<PathBuf> =
2186 manifest.files.keys().map(|k| k.to_path_buf()).collect();
2187 for manifest_path in manifest_entries {
2188 let standard_path = path_utils::from_manifest_path(&manifest_path);
2189
2190 if !standard_existing_files.contains(&standard_path) {
2192 remove_manifest_entry(manifest, &manifest_path, repo_root, index_dir, &mut stats)?;
2193 continue;
2194 }
2195
2196 let sidecar_path =
2198 path_utils::get_sidecar_path_for_standard_path(index_dir, &standard_path);
2199 if !sidecar_path.exists() {
2200 remove_manifest_entry(manifest, &manifest_path, repo_root, index_dir, &mut stats)?;
2201 continue;
2202 }
2203 }
2204
2205 cleanup_orphaned_sidecars(index_dir, &standard_existing_files, manifest, &mut stats)?;
2207
2208 Ok(stats)
2209 }
2210
2211 fn remove_manifest_entry(
2213 manifest: &mut IndexManifest,
2214 manifest_path: &Path,
2215 repo_root: &Path,
2216 index_dir: &Path,
2217 stats: &mut CleanupStats,
2218 ) -> Result<()> {
2219 manifest.files.remove(manifest_path);
2220
2221 let standard_path = path_utils::from_manifest_path(manifest_path);
2223 let sidecar_path =
2224 path_utils::get_sidecar_path_for_standard_path(index_dir, &standard_path);
2225 if sidecar_path.exists() {
2226 fs::remove_file(&sidecar_path)?;
2227 stats.orphaned_sidecars_removed += 1;
2228 }
2229
2230 if ck_core::pdf::is_pdf_file(&standard_path) {
2232 let absolute_path = repo_root.join(&standard_path);
2233 let cache_path = ck_core::pdf::get_content_cache_path(repo_root, &absolute_path);
2234 if cache_path.exists() {
2235 fs::remove_file(&cache_path)?;
2236 tracing::debug!("Removed orphaned content cache: {:?}", cache_path);
2237 }
2238 }
2239
2240 stats.orphaned_entries_removed += 1;
2241 tracing::warn!("Removed manifest entry: {:?}", manifest_path);
2242 Ok(())
2243 }
2244
2245 fn cleanup_orphaned_sidecars(
2247 index_dir: &Path,
2248 standard_existing_files: &HashSet<PathBuf>,
2249 manifest: &IndexManifest,
2250 stats: &mut CleanupStats,
2251 ) -> Result<()> {
2252 if !index_dir.exists() {
2253 return Ok(());
2254 }
2255
2256 for entry in WalkDir::new(index_dir) {
2257 let entry = entry?;
2258 if entry.file_type().is_file() {
2259 let sidecar_path = entry.path();
2260 if sidecar_path.extension().and_then(|s| s.to_str()) == Some("ck")
2261 && let Some(standard_path) =
2262 path_utils::sidecar_to_standard_path(sidecar_path, index_dir)
2263 {
2264 let manifest_path = path_utils::to_manifest_path(&standard_path);
2265
2266 if !standard_existing_files.contains(&standard_path)
2268 || !manifest.files.contains_key(&manifest_path)
2269 {
2270 fs::remove_file(sidecar_path)?;
2271 stats.orphaned_sidecars_removed += 1;
2272 }
2273 }
2274 }
2275 }
2276
2277 Ok(())
2278 }
2279}
2280
/// Conversions between the path forms used by the index.
///
/// * standard path: relative to the repository root, e.g. `src/main.rs`
/// * manifest path: a standard path prefixed with `.`, e.g. `./src/main.rs`
/// * sidecar path: a standard path with `.ck` appended, placed under the
///   index directory
mod path_utils {
    use super::*;

    /// Makes `absolute_path` relative to `repo_root`.
    ///
    /// A path that does not start with `repo_root` is returned unchanged.
    pub fn to_standard_path(absolute_path: &Path, repo_root: &Path) -> PathBuf {
        absolute_path
            .strip_prefix(repo_root)
            .unwrap_or(absolute_path)
            .to_path_buf()
    }

    /// Prefixes a standard path with `.` to obtain its manifest form.
    pub fn to_manifest_path(standard_path: &Path) -> PathBuf {
        PathBuf::from(".").join(standard_path)
    }

    /// Removes a leading `.` component from a manifest path.
    ///
    /// A path without that prefix is returned unchanged.
    pub fn from_manifest_path(manifest_path: &Path) -> PathBuf {
        manifest_path
            .strip_prefix(".")
            .unwrap_or(manifest_path)
            .to_path_buf()
    }

    /// Builds the sidecar location for `standard_path`: the same relative
    /// path with `.ck` appended, joined onto `index_dir`.
    pub fn get_sidecar_path_for_standard_path(index_dir: &Path, standard_path: &Path) -> PathBuf {
        let sidecar_name = format!("{}.ck", standard_path.display());
        index_dir.join(sidecar_name)
    }

    /// Inverse of [`get_sidecar_path_for_standard_path`]: recovers the
    /// standard path from a sidecar path.
    ///
    /// Returns `None` when `sidecar_path` is not located under `index_dir`.
    /// Exactly one trailing extension is removed, so a source file whose own
    /// name ends in `.ck` (sidecar `foo.ck.ck`) maps back to `foo.ck`.
    pub fn sidecar_to_standard_path(sidecar_path: &Path, index_dir: &Path) -> Option<PathBuf> {
        let relative_path = sidecar_path.strip_prefix(index_dir).ok()?;
        // Only the sidecar's own extension is dropped. Stripping a second
        // ".ck" would break the round trip for source files named `*.ck`.
        Some(relative_path.with_extension(""))
    }
}