1use anyhow::Result;
2use ck_core::{
3 FileMetadata, Language, Span, compute_chunk_hash, compute_file_hash, get_sidecar_path,
4};
5use ignore::{WalkBuilder, overrides::OverrideBuilder};
6use rayon::prelude::*;
7use serde::{Deserialize, Serialize};
8use std::collections::{HashMap, HashSet};
9use std::fs;
10use std::io::{Read, Write};
11use std::path::{Path, PathBuf};
12use std::sync::Once;
13use std::sync::atomic::{AtomicBool, Ordering};
14use std::time::SystemTime;
15use tempfile::NamedTempFile;
16use walkdir::WalkDir;
17
18fn legacy_model_config(name: &str, dimensions: Option<usize>) -> ck_models::ModelConfig {
19 ck_models::ModelConfig {
20 name: name.to_string(),
21 provider: "fastembed".to_string(),
22 dimensions: dimensions.unwrap_or(384),
23 max_tokens: 8192,
24 description: "Legacy ck embedding model (inferred from manifest)".to_string(),
25 }
26}
27
/// Boxed, thread-safe callback that receives one string slice per call.
///
/// The incremental update in this file invokes it with the (lossily converted)
/// file name of each file it handles.
pub type ProgressCallback = Box<dyn Fn(&str) + Send + Sync>;
29
/// Snapshot handed to a [`DetailedProgressCallback`] for each chunk, just
/// before that chunk's embedding is looked up or computed.
#[derive(Debug, Clone)]
pub struct EmbeddingProgress {
    /// Final path component of the file being processed, lossily converted to UTF-8.
    pub file_name: String,
    /// Position of the file within the current batch, as supplied by the caller.
    pub file_index: usize,
    /// Number of files in the current batch, as supplied by the caller.
    pub total_files: usize,
    /// Zero-based position of the chunk within the file.
    pub chunk_index: usize,
    /// Number of chunks the file was split into.
    pub total_chunks: usize,
    /// Length of the chunk's text as reported by `len()`.
    pub chunk_size: usize,
}
40
/// Boxed, thread-safe callback that receives an [`EmbeddingProgress`] value by
/// ownership for every chunk processed while embedding a file.
pub type DetailedProgressCallback = Box<dyn Fn(EmbeddingProgress) + Send + Sync>;
42
/// Event type carried by an [`EnhancedProgressCallback`].
///
/// NOTE(review): no code that emits these events is visible in this part of
/// the file; the variant descriptions below restate the field names only.
#[derive(Debug, Clone)]
pub enum IndexingProgress {
    /// Indexing is beginning; carries the number of files.
    Starting { total_files: usize },
    /// A file is being processed; carries its name, position, the batch size
    /// and the file size.
    ProcessingFile {
        file: String,
        file_number: usize,
        total_files: usize,
        file_size: u64,
    },
    /// A file has been chunked; carries the number of chunks found.
    ChunkingFile { file: String, chunks_found: usize },
    /// A chunk of a file is being processed.
    ProcessingChunk {
        file: String,
        chunk_number: usize,
        total_chunks: usize,
        chunk_size: usize,
    },
    /// A file has been completed; `elapsed_ms` is presumably the time spent on
    /// that file in milliseconds — TODO confirm against the emitter.
    FileComplete {
        file: String,
        chunks_processed: usize,
        file_number: usize,
        total_files: usize,
        elapsed_ms: u64,
    },
    /// The whole run has completed; carries the totals.
    Complete {
        total_files: usize,
        total_chunks: usize,
        total_elapsed_ms: u64,
    },
}
79
/// Boxed, thread-safe callback that receives [`IndexingProgress`] events by ownership.
pub type EnhancedProgressCallback = Box<dyn Fn(IndexingProgress) + Send + Sync>;
81
/// Process-wide flag raised by `request_interrupt` and by the Ctrl-C handler;
/// the indexing loops in this file poll it to stop early.
static INTERRUPTED: AtomicBool = AtomicBool::new(false);
/// Ensures the Ctrl-C handler is installed at most once per process.
static HANDLER_INIT: Once = Once::new();

/// Message of the error returned when embedding is abandoned because the
/// interrupt flag was raised.
pub const INDEX_INTERRUPTED_MSG: &str = "Indexing interrupted by user";
87
88pub fn request_interrupt() {
89 INTERRUPTED.store(true, Ordering::SeqCst);
90}
91
92fn build_overrides(
94 base_path: &Path,
95 exclude_patterns: &[String],
96) -> Result<ignore::overrides::Override> {
97 let mut builder = OverrideBuilder::new(base_path);
98
99 for pattern in exclude_patterns {
100 if pattern.starts_with('!') {
101 builder.add(pattern)?;
102 } else {
103 builder.add(&format!("!{pattern}"))?;
104 }
105 }
106
107 Ok(builder.build()?)
108}
109
/// Per-file index record that is written to, and read back from, the file's
/// sidecar path.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct IndexEntry {
    /// Path, hash, modification time and size of the indexed file.
    pub metadata: FileMetadata,
    /// Chunks the file was split into, in the order they were produced.
    pub chunks: Vec<ChunkEntry>,
}
115
/// One chunk of an indexed file, with its optional embedding and metadata.
///
/// Fields marked `#[serde(default)]` deserialize to `None` when missing from
/// the stored data.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct ChunkEntry {
    /// Location of the chunk within the file.
    pub span: Span,
    /// Embedding vector, present only when the file was indexed with an embedder.
    pub embedding: Option<Vec<f32>>,
    /// `"function"`, `"class"`, `"method"` or `"module"`; `None` for plain text chunks.
    pub chunk_type: Option<String>,
    /// Breadcrumb copied from the chunk metadata.
    #[serde(default)]
    pub breadcrumb: Option<String>,
    /// Ancestry copied from the chunk metadata; `None` when it is empty.
    #[serde(default)]
    pub ancestry: Option<Vec<String>>,
    /// Byte length copied from the chunk metadata.
    #[serde(default)]
    pub byte_length: Option<usize>,
    /// Token estimate copied from the chunk metadata.
    #[serde(default)]
    pub estimated_tokens: Option<usize>,
    /// Leading trivia copied from the chunk metadata; `None` when it is empty.
    #[serde(default)]
    pub leading_trivia: Option<Vec<String>>,
    /// Trailing trivia copied from the chunk metadata; `None` when it is empty.
    #[serde(default)]
    pub trailing_trivia: Option<Vec<String>>,
    /// Hash of the chunk text plus its leading and trailing trivia; used as the
    /// key for reusing a previously stored embedding.
    #[serde(default)]
    pub chunk_hash: Option<String>,
}
137
/// Contents of `.ck/manifest.json`: the list of indexed files plus index-wide settings.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct IndexManifest {
    /// Manifest format version string.
    pub version: String,
    /// Creation time in seconds since the Unix epoch.
    pub created: u64,
    /// Time of the last update in seconds since the Unix epoch.
    pub updated: u64,
    /// Metadata of every indexed file, keyed by its manifest path.
    pub files: HashMap<PathBuf, FileMetadata>,
    /// Name of the embedding model the index was built with, if any.
    pub embedding_model: Option<String>,
    /// Embedding width of that model, if recorded.
    pub embedding_dimensions: Option<usize>,
    /// Version of the chunk hashing scheme; `None` when missing from the stored manifest.
    #[serde(default)]
    pub chunk_hash_version: Option<u32>,
}
154
155impl Default for IndexManifest {
156 fn default() -> Self {
157 let now = SystemTime::now()
158 .duration_since(SystemTime::UNIX_EPOCH)
159 .unwrap()
160 .as_secs();
161
162 Self {
163 version: "0.1.0".to_string(),
164 created: now,
165 updated: now,
166 files: HashMap::new(),
167 embedding_model: None, embedding_dimensions: None,
169 chunk_hash_version: Some(2), }
171 }
172}
173
174fn should_include_file(entry: &ignore::DirEntry, index_dir: &Path) -> bool {
176 let path = entry.path();
177 entry.file_type().is_some_and(|ft| ft.is_file())
178 && is_text_file(path)
179 && !path.starts_with(index_dir)
180}
181
182fn filter_and_collect_files(walker: ignore::Walk, index_dir: &Path) -> Vec<PathBuf> {
184 walker
185 .filter_map(std::result::Result::ok)
186 .filter(|entry| should_include_file(entry, index_dir))
187 .map(|entry| entry.path().to_path_buf())
188 .collect()
189}
190
191pub fn collect_files(
192 path: &Path,
193 options: &ck_core::FileCollectionOptions,
194) -> Result<Vec<PathBuf>> {
195 let index_dir = path.join(".ck");
196
197 if options.respect_gitignore {
198 let overrides = build_overrides(path, &options.exclude_patterns)?;
199 let mut walker_builder = WalkBuilder::new(path);
200 walker_builder
201 .git_ignore(true)
202 .git_global(true)
203 .git_exclude(true)
204 .hidden(true);
205
206 if options.use_ckignore {
208 walker_builder.add_custom_ignore_filename(".ckignore");
209 }
210
211 walker_builder.overrides(overrides);
212 let walker = walker_builder.build();
213
214 Ok(filter_and_collect_files(walker, &index_dir))
215 } else {
216 use ck_core::get_default_exclude_patterns;
218 let default_patterns = get_default_exclude_patterns();
219
220 let mut all_patterns = default_patterns;
222 all_patterns.extend(options.exclude_patterns.iter().cloned());
223 let combined_overrides = build_overrides(path, &all_patterns)?;
224
225 let mut walker_builder = WalkBuilder::new(path);
226 walker_builder
227 .git_ignore(false)
228 .git_global(false)
229 .git_exclude(false)
230 .hidden(true);
231
232 if options.use_ckignore {
234 walker_builder.add_custom_ignore_filename(".ckignore");
235 }
236
237 walker_builder.overrides(combined_overrides);
238 let walker = walker_builder.build();
239
240 Ok(filter_and_collect_files(walker, &index_dir))
241 }
242}
243
244fn collect_files_as_hashset(
245 path: &Path,
246 options: &ck_core::FileCollectionOptions,
247) -> Result<HashSet<PathBuf>> {
248 Ok(collect_files(path, options)?.into_iter().collect())
249}
250
251pub async fn index_directory(
252 path: &Path,
253 compute_embeddings: bool,
254 options: &ck_core::FileCollectionOptions,
255 model: Option<&str>,
256) -> Result<()> {
257 tracing::info!(
258 "index_directory called with compute_embeddings={}",
259 compute_embeddings
260 );
261 let index_dir = path.join(".ck");
262 fs::create_dir_all(&index_dir)?;
263
264 let manifest_path = index_dir.join("manifest.json");
265 let mut manifest = load_or_create_manifest(&manifest_path)?;
266 normalize_manifest_paths(&mut manifest, path);
267
268 let resolved_model = if compute_embeddings {
270 let model_registry = ck_models::ModelRegistry::default();
271 let (alias, config) = model_registry
272 .resolve(model)
273 .map_err(|e| anyhow::anyhow!(e.to_string()))?;
274
275 if let Some(existing_model) = &manifest.embedding_model
276 && existing_model != &config.name
277 {
278 return Err(anyhow::anyhow!(
279 "Model mismatch: Index was created with '{}', but you're trying to use '{}'. \
280 Please run 'ck --clean {}' to remove the old index, then rerun with the new model.",
281 existing_model,
282 config.name,
283 path.display()
284 ));
285 }
286
287 manifest.embedding_model = Some(config.name.clone());
288 manifest.embedding_dimensions = Some(config.dimensions);
289
290 Some((alias, config))
291 } else {
292 None
293 };
294
295 let files = collect_files(path, options)?;
296
297 if compute_embeddings {
298 tracing::info!("Creating embedder for {} files", files.len());
300 let (_, config) = resolved_model
301 .as_ref()
302 .expect("resolved model must be present when computing embeddings");
303 let mut embedder = ck_embed::create_embedder_for_config(config, None)?;
304
305 for file_path in files.iter() {
306 match index_single_file(file_path, path, Some(&mut embedder)) {
307 Ok(entry) => {
308 let sidecar_path = get_sidecar_path(path, file_path);
310 save_index_entry(&sidecar_path, &entry)?;
311
312 let manifest_key = entry.metadata.path.clone();
314 manifest.files.insert(manifest_key, entry.metadata);
315 manifest.updated = SystemTime::now()
316 .duration_since(SystemTime::UNIX_EPOCH)
317 .unwrap()
318 .as_secs();
319 save_manifest(&manifest_path, &manifest)?;
320 }
321 Err(e) => {
322 let error_msg = e.to_string();
324 let is_binary_skip = error_msg.contains("Binary file, skipping");
325 let is_utf8_error = error_msg.contains("stream did not contain valid UTF-8");
326 let is_git_file = file_path.components().any(|c| c.as_os_str() == ".git");
327
328 if !(is_binary_skip || is_utf8_error && is_git_file) {
329 tracing::warn!("Failed to index {:?}: {}", file_path, e);
330 }
331 }
332 }
333 }
334 } else {
335 use std::sync::mpsc;
337 use std::thread;
338
339 let (tx, rx) = mpsc::channel();
340 let files_clone = files.clone();
341 let path_clone = path.to_path_buf();
342
343 let worker_handle = thread::spawn(move || {
345 files_clone.par_iter().for_each(|file_path| {
346 match index_single_file(file_path, &path_clone, None) {
347 Ok(entry) => {
348 if tx.send((file_path.clone(), entry)).is_err() {
349 }
351 }
352 Err(e) => {
353 let error_msg = e.to_string();
355 let is_binary_skip = error_msg.contains("Binary file, skipping");
356 let is_utf8_error =
357 error_msg.contains("stream did not contain valid UTF-8");
358 let is_git_file = file_path.components().any(|c| c.as_os_str() == ".git");
359
360 if !(is_binary_skip || is_utf8_error && is_git_file) {
361 tracing::warn!("Failed to index {:?}: {}", file_path, e);
362 }
363 }
364 }
365 });
366 });
367
368 while let Ok((file_path, entry)) = rx.recv() {
370 let sidecar_path = get_sidecar_path(path, &file_path);
372 save_index_entry(&sidecar_path, &entry)?;
373
374 let manifest_key = entry.metadata.path.clone();
376 manifest.files.insert(manifest_key, entry.metadata);
377 manifest.updated = SystemTime::now()
378 .duration_since(SystemTime::UNIX_EPOCH)
379 .unwrap()
380 .as_secs();
381 save_manifest(&manifest_path, &manifest)?;
382 }
383
384 worker_handle
386 .join()
387 .map_err(|_| anyhow::anyhow!("Worker thread panicked"))?;
388 }
389
390 if !compute_embeddings {
393 manifest.updated = SystemTime::now()
394 .duration_since(SystemTime::UNIX_EPOCH)
395 .unwrap()
396 .as_secs();
397 save_manifest(&manifest_path, &manifest)?;
398 }
399
400 Ok(())
401}
402
403pub async fn index_file(file_path: &Path, compute_embeddings: bool) -> Result<()> {
404 let repo_root = find_repo_root(file_path)?;
405 let index_dir = repo_root.join(".ck");
406 fs::create_dir_all(&index_dir)?;
407
408 let manifest_path = index_dir.join("manifest.json");
409 let mut manifest = load_or_create_manifest(&manifest_path)?;
410
411 let entry = if compute_embeddings {
412 let model_registry = ck_models::ModelRegistry::default();
413 let (alias, config) = if let Some(existing) = manifest.embedding_model.as_deref() {
414 match model_registry.resolve(Some(existing)) {
415 Ok(resolved) => resolved,
416 Err(_) => (
417 existing.to_string(),
418 legacy_model_config(existing, manifest.embedding_dimensions),
419 ),
420 }
421 } else {
422 model_registry
423 .resolve(None)
424 .map_err(|e| anyhow::anyhow!(e.to_string()))?
425 };
426
427 manifest.embedding_model = Some(config.name.clone());
428 manifest.embedding_dimensions = Some(config.dimensions);
429 tracing::debug!("Using embedding model '{}' ({})", config.name, alias);
430
431 let mut embedder = ck_embed::create_embedder_for_config(&config, None)?;
432 index_single_file(file_path, &repo_root, Some(&mut embedder))?
433 } else {
434 index_single_file(file_path, &repo_root, None)?
435 };
436 let sidecar_path = get_sidecar_path(&repo_root, file_path);
437
438 save_index_entry(&sidecar_path, &entry)?;
439 let manifest_key = entry.metadata.path.clone();
440 manifest.files.insert(manifest_key, entry.metadata);
441 manifest.updated = SystemTime::now()
442 .duration_since(SystemTime::UNIX_EPOCH)
443 .unwrap()
444 .as_secs();
445
446 save_manifest(&manifest_path, &manifest)?;
447
448 Ok(())
449}
450
451pub async fn update_index(
452 path: &Path,
453 compute_embeddings: bool,
454 options: &ck_core::FileCollectionOptions,
455) -> Result<()> {
456 let index_dir = path.join(".ck");
457 if !index_dir.exists() {
458 return index_directory(
459 path,
460 compute_embeddings,
461 options,
462 None, )
464 .await;
465 }
466
467 let manifest_path = index_dir.join("manifest.json");
468 let mut manifest = load_or_create_manifest(&manifest_path)?;
469
470 let files = collect_files(path, options)?;
471
472 let updates: Vec<(PathBuf, IndexEntry)> = if compute_embeddings {
473 let model_registry = ck_models::ModelRegistry::default();
475 let (alias, config) = if let Some(existing) = manifest.embedding_model.as_deref() {
476 match model_registry.resolve(Some(existing)) {
477 Ok(resolved) => resolved,
478 Err(_) => (
479 existing.to_string(),
480 legacy_model_config(existing, manifest.embedding_dimensions),
481 ),
482 }
483 } else {
484 model_registry
485 .resolve(None)
486 .map_err(|e| anyhow::anyhow!(e.to_string()))?
487 };
488
489 manifest.embedding_model = Some(config.name.clone());
490 manifest.embedding_dimensions = Some(config.dimensions);
491 tracing::debug!(
492 "Updating index with embedding model '{}' ({})",
493 config.name,
494 alias
495 );
496
497 let mut embedder = ck_embed::create_embedder_for_config(&config, None)?;
498 files
499 .iter()
500 .filter_map(|file_path| {
501 let manifest_key =
502 path_utils::to_manifest_path(&path_utils::to_standard_path(file_path, path));
503
504 let needs_update = match manifest.files.get(&manifest_key) {
505 Some(metadata) => match compute_file_hash(file_path) {
506 Ok(hash) => hash != metadata.hash,
507 Err(_) => false,
508 },
509 None => true,
510 };
511 if needs_update {
512 match index_single_file(file_path, path, Some(&mut embedder)) {
513 Ok(entry) => Some((file_path.clone(), entry)),
514 Err(e) => {
515 let error_msg = e.to_string();
517 let is_binary_skip = error_msg.contains("Binary file, skipping");
518 let is_utf8_error =
519 error_msg.contains("stream did not contain valid UTF-8");
520 let is_git_file =
521 file_path.components().any(|c| c.as_os_str() == ".git");
522
523 if !(is_binary_skip || is_utf8_error && is_git_file) {
524 tracing::warn!("Failed to index {:?}: {}", file_path, e);
525 }
526 None
527 }
528 }
529 } else {
530 None
531 }
532 })
533 .collect()
534 } else {
535 files
537 .par_iter()
538 .filter_map(|file_path| {
539 let manifest_key =
540 path_utils::to_manifest_path(&path_utils::to_standard_path(file_path, path));
541
542 let needs_update = match manifest.files.get(&manifest_key) {
543 Some(metadata) => match compute_file_hash(file_path) {
544 Ok(hash) => hash != metadata.hash,
545 Err(_) => false,
546 },
547 None => true,
548 };
549
550 if needs_update {
551 match index_single_file(file_path, path, None) {
552 Ok(entry) => Some((file_path.clone(), entry)),
553 Err(e) => {
554 let error_msg = e.to_string();
556 let is_binary_skip = error_msg.contains("Binary file, skipping");
557 let is_utf8_error =
558 error_msg.contains("stream did not contain valid UTF-8");
559 let is_git_file =
560 file_path.components().any(|c| c.as_os_str() == ".git");
561
562 if !(is_binary_skip || is_utf8_error && is_git_file) {
563 tracing::warn!("Failed to index {:?}: {}", file_path, e);
564 }
565 None
566 }
567 }
568 } else {
569 None
570 }
571 })
572 .collect()
573 };
574
575 for (file_path, entry) in updates {
576 let sidecar_path = get_sidecar_path(path, &file_path);
577 save_index_entry(&sidecar_path, &entry)?;
578 let manifest_key = entry.metadata.path.clone();
579 manifest.files.insert(manifest_key, entry.metadata);
580 }
581
582 if !manifest.files.is_empty() {
583 manifest.updated = SystemTime::now()
584 .duration_since(SystemTime::UNIX_EPOCH)
585 .unwrap()
586 .as_secs();
587 save_manifest(&manifest_path, &manifest)?;
588 }
589
590 Ok(())
591}
592
593pub fn clean_index(path: &Path) -> Result<()> {
594 let index_dir = path.join(".ck");
595 if index_dir.exists() {
596 fs::remove_dir_all(&index_dir)?;
597 }
598 Ok(())
599}
600
601pub fn cleanup_index(
602 path: &Path,
603 options: &ck_core::FileCollectionOptions,
604) -> Result<CleanupStats> {
605 let index_dir = path.join(".ck");
606 if !index_dir.exists() {
607 return Ok(CleanupStats::default());
608 }
609
610 let manifest_path = index_dir.join("manifest.json");
611 let mut manifest = load_or_create_manifest(&manifest_path)?;
612 normalize_manifest_paths(&mut manifest, path);
613
614 let stats =
616 cleanup_validation::validate_and_cleanup_index(path, &index_dir, &mut manifest, options)?;
617
618 remove_empty_dirs(&index_dir)?;
622
623 if stats.orphaned_entries_removed > 0 {
625 manifest.updated = SystemTime::now()
626 .duration_since(SystemTime::UNIX_EPOCH)
627 .unwrap()
628 .as_secs();
629 save_manifest(&manifest_path, &manifest)?;
630 }
631
632 Ok(stats)
633}
634
635pub fn get_index_stats(path: &Path) -> Result<IndexStats> {
636 let index_dir = path.join(".ck");
637 if !index_dir.exists() {
638 return Ok(IndexStats::default());
639 }
640
641 let manifest_path = index_dir.join("manifest.json");
642 let mut manifest = load_or_create_manifest(&manifest_path)?;
643 normalize_manifest_paths(&mut manifest, path);
644
645 let mut stats = IndexStats {
646 total_files: manifest.files.len(),
647 index_created: manifest.created,
648 index_updated: manifest.updated,
649 ..Default::default()
650 };
651
652 for file_path in manifest.files.keys() {
654 let standard_path = path_utils::from_manifest_path(file_path);
655 let sidecar_path =
656 path_utils::get_sidecar_path_for_standard_path(&index_dir, &standard_path);
657 if sidecar_path.exists()
658 && let Ok(entry) = load_index_entry(&sidecar_path)
659 {
660 stats.total_chunks += entry.chunks.len();
661 stats.total_size_bytes += entry.metadata.size;
662
663 let embedded = entry
665 .chunks
666 .iter()
667 .filter(|c| c.embedding.is_some())
668 .count();
669 stats.embedded_chunks += embedded;
670 }
671 }
672
673 if let Ok(entries) = WalkDir::new(&index_dir)
675 .into_iter()
676 .collect::<Result<Vec<_>, _>>()
677 {
678 for entry in entries {
679 if entry.file_type().is_file()
680 && let Ok(metadata) = entry.metadata()
681 {
682 stats.index_size_bytes += metadata.len();
683 }
684 }
685 }
686
687 Ok(stats)
688}
689
690pub async fn smart_update_index(
691 path: &Path,
692 compute_embeddings: bool,
693 options: &ck_core::FileCollectionOptions,
694) -> Result<UpdateStats> {
695 smart_update_index_with_progress(
696 path,
697 false,
698 None,
699 compute_embeddings,
700 options,
701 None, )
703 .await
704}
705
706pub async fn smart_update_index_with_progress(
707 path: &Path,
708 force_rebuild: bool,
709 progress_callback: Option<ProgressCallback>,
710 compute_embeddings: bool,
711 options: &ck_core::FileCollectionOptions,
712 model: Option<&str>,
713) -> Result<UpdateStats> {
714 smart_update_index_with_detailed_progress(
715 path,
716 force_rebuild,
717 progress_callback,
718 None, compute_embeddings,
720 options,
721 model,
722 )
723 .await
724}
725
726pub async fn smart_update_index_with_detailed_progress(
728 path: &Path,
729 force_rebuild: bool,
730 progress_callback: Option<ProgressCallback>,
731 detailed_progress_callback: Option<DetailedProgressCallback>,
732 compute_embeddings: bool,
733 options: &ck_core::FileCollectionOptions,
734 model: Option<&str>,
735) -> Result<UpdateStats> {
736 let index_dir = path.join(".ck");
737 let mut stats = UpdateStats::default();
738
739 HANDLER_INIT.call_once(|| {
741 let _ = ctrlc::set_handler(move || {
742 INTERRUPTED.store(true, Ordering::SeqCst);
743 eprintln!("\nIndexing interrupted by user. Cleaning up...");
744 });
745 });
746
747 INTERRUPTED.store(false, Ordering::SeqCst);
749
750 if force_rebuild {
751 clean_index(path)?;
752 index_directory(path, compute_embeddings, options, model).await?;
753 let index_stats = get_index_stats(path)?;
754 stats.files_indexed = index_stats.total_files;
755 return Ok(stats);
756 }
757
758 let repo_root = find_repo_root(path)?;
760
761 fs::create_dir_all(&index_dir)?;
767 let manifest_path = index_dir.join("manifest.json");
768 let mut manifest = load_or_create_manifest(&manifest_path)?;
769 normalize_manifest_paths(&mut manifest, &repo_root);
770
771 let resolved_model = if compute_embeddings {
773 let model_registry = ck_models::ModelRegistry::default();
774
775 let resolved = if let Some(requested) = model {
776 model_registry
777 .resolve(Some(requested))
778 .map_err(|e| anyhow::anyhow!(e.to_string()))?
779 } else if let Some(existing_model) = &manifest.embedding_model {
780 match model_registry.resolve(Some(existing_model.as_str())) {
781 Ok(resolved) => resolved,
782 Err(_) => (
783 existing_model.clone(),
784 legacy_model_config(existing_model, manifest.embedding_dimensions),
785 ),
786 }
787 } else {
788 model_registry
789 .resolve(None)
790 .map_err(|e| anyhow::anyhow!(e.to_string()))?
791 };
792
793 if let Some(existing_model) = &manifest.embedding_model
794 && existing_model != &resolved.1.name
795 {
796 return Err(anyhow::anyhow!(
797 "Model mismatch: Index was created with '{}', but you're trying to use '{}'. \
798 Please run 'ck --clean .' to remove the old index, then 'ck --index --model {}' to rebuild with the new model.",
799 existing_model,
800 resolved.1.name,
801 model.unwrap_or("default")
802 ));
803 }
804
805 manifest.embedding_model = Some(resolved.1.name.clone());
806 manifest.embedding_dimensions = Some(resolved.1.dimensions);
807
808 Some(resolved)
809 } else {
810 None
811 };
812
813 let current_files = collect_files(path, options)?;
816
817 let mut files_to_update = Vec::new();
819 let mut manifest_changed = false;
820
821 for file_path in current_files {
822 if INTERRUPTED.load(Ordering::SeqCst) {
824 eprintln!("Indexing interrupted during file scanning.");
825 return Ok(stats);
826 }
827
828 let manifest_key =
829 path_utils::to_manifest_path(&path_utils::to_standard_path(&file_path, &repo_root));
830
831 if let Some(metadata) = manifest.files.get(&manifest_key) {
832 let fs_meta = match fs::metadata(&file_path) {
833 Ok(m) => m,
834 Err(_) => {
835 stats.files_errored += 1;
836 continue;
837 }
838 };
839
840 let fs_last_modified = match fs_meta.modified().and_then(|m| {
841 m.duration_since(SystemTime::UNIX_EPOCH)
842 .map_err(|_| std::io::Error::other("Time error"))
843 }) {
844 Ok(dur) => dur.as_secs(),
845 Err(_) => {
846 stats.files_errored += 1;
847 continue;
848 }
849 };
850 let fs_size = fs_meta.len();
851
852 if fs_last_modified == metadata.last_modified && fs_size == metadata.size {
853 stats.files_up_to_date += 1;
854 continue;
855 }
856
857 let hash = match compute_file_hash(&file_path) {
858 Ok(h) => h,
859 Err(_) => {
860 stats.files_errored += 1;
861 continue;
862 }
863 };
864
865 if hash != metadata.hash {
866 stats.files_modified += 1;
867 files_to_update.push(file_path);
868 } else {
869 stats.files_up_to_date += 1;
870 let standard_path = path_utils::to_standard_path(&file_path, &repo_root);
872 let manifest_path = path_utils::to_manifest_path(&standard_path);
873 let new_metadata = FileMetadata {
874 path: manifest_path.clone(),
875 hash,
876 last_modified: fs_last_modified,
877 size: fs_size,
878 };
879 manifest.files.insert(manifest_path, new_metadata);
880 manifest_changed = true;
881 }
882 } else {
883 stats.files_added += 1;
884 files_to_update.push(file_path);
885 }
886 }
887
888 if compute_embeddings {
890 let (_, config) = resolved_model
892 .as_ref()
893 .expect("resolved model must exist for embedding updates");
894 let mut embedder = ck_embed::create_embedder_for_config(config, None)?;
895 let mut _processed_count = 0;
896
897 for file_path in files_to_update.iter() {
898 if INTERRUPTED.load(Ordering::SeqCst) {
900 eprintln!("Indexing interrupted. {_processed_count} files processed.");
901 break;
902 }
903
904 if let Some(ref callback) = progress_callback
905 && let Some(file_name) = file_path.file_name()
906 {
907 callback(&file_name.to_string_lossy());
908 }
909
910 let result = if let Some(ref detailed_callback) = detailed_progress_callback {
912 index_single_file_with_progress(
913 file_path,
914 path,
915 Some(&mut embedder),
916 Some(detailed_callback),
917 _processed_count,
918 files_to_update.len(),
919 )
920 } else {
921 index_single_file_with_progress(file_path, path, Some(&mut embedder), None, 0, 1)
922 };
923
924 match result {
925 Ok((entry, file_chunks_reused, file_chunks_embedded)) => {
926 stats.chunks_reused += file_chunks_reused;
928 stats.chunks_embedded += file_chunks_embedded;
929
930 let sidecar_path = get_sidecar_path(path, file_path);
932 save_index_entry(&sidecar_path, &entry)?;
933
934 let manifest_key = entry.metadata.path.clone();
936 manifest.files.insert(manifest_key, entry.metadata);
937 manifest.updated = SystemTime::now()
938 .duration_since(SystemTime::UNIX_EPOCH)
939 .unwrap()
940 .as_secs();
941 save_manifest(&manifest_path, &manifest)?;
942 _processed_count += 1;
943 }
944 Err(e) => {
945 let error_msg = e.to_string();
947 let is_binary_skip = error_msg.contains("Binary file, skipping");
948 let is_utf8_error = error_msg.contains("stream did not contain valid UTF-8");
949 let is_git_file = file_path.components().any(|c| c.as_os_str() == ".git");
950
951 if !(is_binary_skip || is_utf8_error && is_git_file) {
952 tracing::warn!("Failed to index {:?}: {}", file_path, e);
953 }
954 stats.files_errored += 1;
955 }
956 }
957 }
958
959 stats.files_indexed = _processed_count;
960 } else {
961 use std::sync::mpsc;
963 use std::thread;
964
965 let (tx, rx) = mpsc::channel();
966 let files_clone = files_to_update.clone();
967 let path_clone = path.to_path_buf();
968
969 let worker_handle = thread::spawn(move || {
971 use rayon::prelude::*;
972
973 let result = files_clone.par_iter().try_for_each(|file_path| {
975 if INTERRUPTED.load(Ordering::SeqCst) {
977 return Err("interrupted");
978 }
979
980 match index_single_file(file_path, &path_clone, None) {
981 Ok(entry) => {
982 if tx.send((file_path.clone(), entry)).is_err() {
983 return Err("receiver_dropped");
985 }
986 }
987 Err(e) => {
988 let error_msg = e.to_string();
990 let is_binary_skip = error_msg.contains("Binary file, skipping");
991 let is_utf8_error =
992 error_msg.contains("stream did not contain valid UTF-8");
993 let is_git_file = file_path.components().any(|c| c.as_os_str() == ".git");
994
995 if !(is_binary_skip || is_utf8_error && is_git_file) {
996 tracing::warn!("Failed to index {:?}: {}", file_path, e);
997 }
998 }
999 }
1000 Ok(())
1001 });
1002
1003 if let Err(reason) = result {
1005 tracing::debug!("Worker thread stopped due to: {}", reason);
1006 }
1007 });
1008
1009 let mut _processed_count = 0;
1011 while let Ok((file_path, entry)) = rx.recv() {
1012 if INTERRUPTED.load(Ordering::SeqCst) {
1014 eprintln!("Indexing interrupted. {_processed_count} files processed.");
1015 drop(rx); break;
1017 }
1018
1019 if let Some(ref callback) = progress_callback
1020 && let Some(file_name) = file_path.file_name()
1021 {
1022 callback(&file_name.to_string_lossy());
1023 }
1024
1025 let sidecar_path = get_sidecar_path(path, &file_path);
1027 save_index_entry(&sidecar_path, &entry)?;
1028
1029 let manifest_key = entry.metadata.path.clone();
1031 manifest.files.insert(manifest_key, entry.metadata);
1032 manifest.updated = SystemTime::now()
1033 .duration_since(SystemTime::UNIX_EPOCH)
1034 .unwrap()
1035 .as_secs();
1036 save_manifest(&manifest_path, &manifest)?;
1037 _processed_count += 1;
1038 }
1039
1040 stats.files_indexed = _processed_count;
1041
1042 worker_handle
1044 .join()
1045 .map_err(|_| anyhow::anyhow!("Worker thread panicked"))?;
1046 }
1047
1048 if !compute_embeddings
1051 && (stats.files_indexed > 0 || stats.orphaned_files_removed > 0 || manifest_changed)
1052 {
1053 manifest.updated = SystemTime::now()
1054 .duration_since(SystemTime::UNIX_EPOCH)
1055 .unwrap()
1056 .as_secs();
1057 save_manifest(&manifest_path, &manifest)?;
1058 }
1059
1060 Ok(stats)
1061}
1062
1063fn index_single_file(
1064 file_path: &Path,
1065 repo_root: &Path,
1066 embedder: Option<&mut Box<dyn ck_embed::Embedder>>,
1067) -> Result<IndexEntry> {
1068 let (entry, _chunks_reused, _chunks_embedded) =
1069 index_single_file_with_progress(file_path, repo_root, embedder, None, 0, 1)?;
1070 Ok(entry)
1071}
1072
1073fn index_single_file_with_progress(
1074 file_path: &Path,
1075 repo_root: &Path,
1076 embedder: Option<&mut Box<dyn ck_embed::Embedder>>,
1077 detailed_progress: Option<&DetailedProgressCallback>,
1078 file_index: usize,
1079 total_files: usize,
1080) -> Result<(IndexEntry, usize, usize)> {
1081 if !is_text_file(file_path) {
1083 return Err(anyhow::anyhow!("Binary file, skipping"));
1084 }
1085
1086 let chunk_cache: HashMap<String, Vec<f32>> = if embedder.is_some() {
1088 let sidecar_path = get_sidecar_path(repo_root, file_path);
1089 if sidecar_path.exists() {
1090 match load_index_entry(&sidecar_path) {
1091 Ok(old_entry) => old_entry
1092 .chunks
1093 .into_iter()
1094 .filter_map(|chunk| {
1095 if let (Some(hash), Some(embedding)) = (chunk.chunk_hash, chunk.embedding) {
1096 Some((hash, embedding))
1097 } else {
1098 None
1099 }
1100 })
1101 .collect(),
1102 Err(_) => HashMap::new(),
1103 }
1104 } else {
1105 HashMap::new()
1106 }
1107 } else {
1108 HashMap::new()
1109 };
1110
1111 let content_path = preprocess_file(file_path, repo_root)?;
1113 let content = fs::read_to_string(&content_path)?;
1114
1115 let hash = compute_file_hash(file_path)?;
1117 let metadata = fs::metadata(file_path)?;
1118
1119 let standard_path = path_utils::to_standard_path(file_path, repo_root);
1120 let manifest_path = path_utils::to_manifest_path(&standard_path);
1121
1122 let file_metadata = FileMetadata {
1123 path: manifest_path,
1124 hash,
1125 last_modified: metadata
1126 .modified()?
1127 .duration_since(SystemTime::UNIX_EPOCH)?
1128 .as_secs(),
1129 size: metadata.len(),
1130 };
1131
1132 let lang = if ck_core::pdf::is_pdf_file(file_path) {
1134 Some(Language::Pdf)
1135 } else {
1136 ck_core::Language::from_path(file_path)
1137 };
1138
1139 let model_name = embedder.as_ref().map(|e| e.model_name());
1140 let chunks = ck_chunk::chunk_text_with_model(&content, lang, model_name)?;
1141
1142 let mut chunks_reused = 0;
1144 let mut chunks_embedded = 0;
1145
1146 let chunk_entries: Vec<ChunkEntry> = if let Some(embedder) = embedder {
1147 let total_chunks = chunks.len();
1148 let file_name = file_path
1149 .file_name()
1150 .unwrap_or_default()
1151 .to_string_lossy()
1152 .to_string();
1153
1154 if let Some(ref callback) = detailed_progress {
1156 tracing::info!(
1157 "Computing embeddings for {} chunks in {:?}",
1158 total_chunks,
1159 file_path
1160 );
1161
1162 let mut chunk_entries = Vec::new();
1163 for (chunk_index, chunk) in chunks.into_iter().enumerate() {
1164 if INTERRUPTED.load(Ordering::SeqCst) {
1165 return Err(anyhow::anyhow!(INDEX_INTERRUPTED_MSG));
1166 }
1167 callback(EmbeddingProgress {
1169 file_name: file_name.clone(),
1170 file_index,
1171 total_files,
1172 chunk_index,
1173 total_chunks,
1174 chunk_size: chunk.text.len(),
1175 });
1176
1177 let chunk_hash = compute_chunk_hash(
1180 &chunk.text,
1181 &chunk.metadata.leading_trivia,
1182 &chunk.metadata.trailing_trivia,
1183 );
1184
1185 let expected_dim = embedder.dim();
1187 let embedding = if let Some(cached_embedding) = chunk_cache.get(&chunk_hash) {
1188 if cached_embedding.len() == expected_dim {
1189 chunks_reused += 1;
1191 cached_embedding.clone()
1192 } else {
1193 chunks_embedded += 1;
1195 tracing::warn!(
1196 "Chunk in {:?} has cached embedding with dimension {} but current model expects {}. Re-embedding.",
1197 file_path,
1198 cached_embedding.len(),
1199 expected_dim
1200 );
1201 let embeddings = embedder.embed(std::slice::from_ref(&chunk.text))?;
1202 embeddings.into_iter().next().ok_or_else(|| {
1203 anyhow::anyhow!(
1204 "Embedder returned empty results for chunk {chunk_index} in file {file_path:?}. This may indicate an issue with the embedding model or chunk content."
1205 )
1206 })?
1207 }
1208 } else {
1209 chunks_embedded += 1;
1211 let embeddings = embedder.embed(std::slice::from_ref(&chunk.text))?;
1212 embeddings.into_iter().next().ok_or_else(|| {
1213 anyhow::anyhow!(
1214 "Embedder returned empty results for chunk {chunk_index} in file {file_path:?}. This may indicate an issue with the embedding model or chunk content."
1215 )
1216 })?
1217 };
1218
1219 let chunk_type_str = match chunk.chunk_type {
1220 ck_chunk::ChunkType::Function => Some("function".to_string()),
1221 ck_chunk::ChunkType::Class => Some("class".to_string()),
1222 ck_chunk::ChunkType::Method => Some("method".to_string()),
1223 ck_chunk::ChunkType::Module => Some("module".to_string()),
1224 ck_chunk::ChunkType::Text => None,
1225 };
1226
1227 let breadcrumb = chunk.metadata.breadcrumb.clone();
1228 let ancestry = if chunk.metadata.ancestry.is_empty() {
1229 None
1230 } else {
1231 Some(chunk.metadata.ancestry.clone())
1232 };
1233 let leading_trivia = if chunk.metadata.leading_trivia.is_empty() {
1234 None
1235 } else {
1236 Some(chunk.metadata.leading_trivia.clone())
1237 };
1238 let trailing_trivia = if chunk.metadata.trailing_trivia.is_empty() {
1239 None
1240 } else {
1241 Some(chunk.metadata.trailing_trivia.clone())
1242 };
1243
1244 chunk_entries.push(ChunkEntry {
1245 span: chunk.span,
1246 embedding: Some(embedding),
1247 chunk_type: chunk_type_str,
1248 breadcrumb,
1249 ancestry,
1250 byte_length: Some(chunk.metadata.byte_length),
1251 estimated_tokens: Some(chunk.metadata.estimated_tokens),
1252 leading_trivia,
1253 trailing_trivia,
1254 chunk_hash: Some(chunk_hash),
1255 });
1256 }
1257 chunk_entries
1258 } else {
1259 let expected_dim = embedder.dim();
1262 let mut chunks_to_embed = Vec::new();
1263 let mut chunk_results: Vec<(ck_chunk::Chunk, String, Option<Vec<f32>>)> = Vec::new();
1264
1265 for chunk in chunks {
1266 let chunk_hash = compute_chunk_hash(
1268 &chunk.text,
1269 &chunk.metadata.leading_trivia,
1270 &chunk.metadata.trailing_trivia,
1271 );
1272 if let Some(cached_embedding) = chunk_cache.get(&chunk_hash) {
1273 if cached_embedding.len() == expected_dim {
1274 chunks_reused += 1;
1276 chunk_results.push((chunk, chunk_hash, Some(cached_embedding.clone())));
1277 } else {
1278 tracing::warn!(
1280 "Chunk in {:?} has cached embedding with dimension {} but current model expects {}. Re-embedding.",
1281 file_path,
1282 cached_embedding.len(),
1283 expected_dim
1284 );
1285 chunks_to_embed.push((chunk.text.clone(), chunk_results.len()));
1286 chunk_results.push((chunk, chunk_hash, None));
1287 }
1288 } else {
1289 chunks_to_embed.push((chunk.text.clone(), chunk_results.len()));
1291 chunk_results.push((chunk, chunk_hash, None));
1292 }
1293 }
1294
1295 if !chunks_to_embed.is_empty() {
1297 let texts: Vec<String> = chunks_to_embed
1298 .iter()
1299 .map(|(text, _)| text.clone())
1300 .collect();
1301 tracing::info!(
1302 "Computing embeddings for {}/{} chunks in {:?} ({} reused from cache)",
1303 texts.len(),
1304 chunk_results.len(),
1305 file_path,
1306 chunks_reused
1307 );
1308 let embeddings = embedder.embed(&texts)?;
1309
1310 if embeddings.len() != chunks_to_embed.len() {
1311 return Err(anyhow::anyhow!(
1312 "Embedder returned {} embeddings for {} chunks in file {:?}. Expected equal counts.",
1313 embeddings.len(),
1314 chunks_to_embed.len(),
1315 file_path
1316 ));
1317 }
1318
1319 chunks_embedded += embeddings.len();
1320
1321 for ((_, result_idx), embedding) in chunks_to_embed.into_iter().zip(embeddings) {
1323 chunk_results[result_idx].2 = Some(embedding);
1324 }
1325 }
1326
1327 chunk_results
1328 .into_iter()
1329 .map(|(chunk, chunk_hash, embedding)| {
1330 let embedding = embedding.expect("All chunks should have embeddings by now");
1331 let chunk_type_str = match chunk.chunk_type {
1332 ck_chunk::ChunkType::Function => Some("function".to_string()),
1333 ck_chunk::ChunkType::Class => Some("class".to_string()),
1334 ck_chunk::ChunkType::Method => Some("method".to_string()),
1335 ck_chunk::ChunkType::Module => Some("module".to_string()),
1336 ck_chunk::ChunkType::Text => None,
1337 };
1338 let breadcrumb = chunk.metadata.breadcrumb.clone();
1339 let ancestry = if chunk.metadata.ancestry.is_empty() {
1340 None
1341 } else {
1342 Some(chunk.metadata.ancestry.clone())
1343 };
1344 let leading_trivia = if chunk.metadata.leading_trivia.is_empty() {
1345 None
1346 } else {
1347 Some(chunk.metadata.leading_trivia.clone())
1348 };
1349 let trailing_trivia = if chunk.metadata.trailing_trivia.is_empty() {
1350 None
1351 } else {
1352 Some(chunk.metadata.trailing_trivia.clone())
1353 };
1354 ChunkEntry {
1355 span: chunk.span,
1356 embedding: Some(embedding),
1357 chunk_type: chunk_type_str,
1358 breadcrumb,
1359 ancestry,
1360 byte_length: Some(chunk.metadata.byte_length),
1361 estimated_tokens: Some(chunk.metadata.estimated_tokens),
1362 leading_trivia,
1363 trailing_trivia,
1364 chunk_hash: Some(chunk_hash),
1365 }
1366 })
1367 .collect()
1368 }
1369 } else {
1370 chunks
1372 .into_iter()
1373 .map(|chunk| {
1374 let chunk_type_str = match chunk.chunk_type {
1375 ck_chunk::ChunkType::Function => Some("function".to_string()),
1376 ck_chunk::ChunkType::Class => Some("class".to_string()),
1377 ck_chunk::ChunkType::Method => Some("method".to_string()),
1378 ck_chunk::ChunkType::Module => Some("module".to_string()),
1379 ck_chunk::ChunkType::Text => None,
1380 };
1381 let breadcrumb = chunk.metadata.breadcrumb.clone();
1382 let ancestry = if chunk.metadata.ancestry.is_empty() {
1383 None
1384 } else {
1385 Some(chunk.metadata.ancestry.clone())
1386 };
1387 let leading_trivia = if chunk.metadata.leading_trivia.is_empty() {
1388 None
1389 } else {
1390 Some(chunk.metadata.leading_trivia.clone())
1391 };
1392 let trailing_trivia = if chunk.metadata.trailing_trivia.is_empty() {
1393 None
1394 } else {
1395 Some(chunk.metadata.trailing_trivia.clone())
1396 };
1397 ChunkEntry {
1398 span: chunk.span,
1399 embedding: None,
1400 chunk_type: chunk_type_str,
1401 breadcrumb,
1402 ancestry,
1403 byte_length: Some(chunk.metadata.byte_length),
1404 estimated_tokens: Some(chunk.metadata.estimated_tokens),
1405 leading_trivia: leading_trivia.clone(),
1406 trailing_trivia: trailing_trivia.clone(),
1407 chunk_hash: Some(compute_chunk_hash(
1408 &chunk.text,
1409 &chunk.metadata.leading_trivia,
1410 &chunk.metadata.trailing_trivia,
1411 )),
1412 }
1413 })
1414 .collect()
1415 };
1416
1417 Ok((
1418 IndexEntry {
1419 metadata: file_metadata,
1420 chunks: chunk_entries,
1421 },
1422 chunks_reused,
1423 chunks_embedded,
1424 ))
1425}
1426
1427fn load_or_create_manifest(path: &Path) -> Result<IndexManifest> {
1428 let mut manifest = if path.exists() {
1429 let data = fs::read(path)?;
1430 serde_json::from_slice(&data)?
1431 } else {
1432 IndexManifest::default()
1433 };
1434
1435 if manifest.chunk_hash_version.is_none() {
1438 manifest.chunk_hash_version = Some(2);
1439 }
1440
1441 Ok(manifest)
1442}
1443
1444fn normalize_manifest_paths(manifest: &mut IndexManifest, repo_root: &Path) {
1445 let original_entries = std::mem::take(&mut manifest.files);
1446 let mut normalized = HashMap::with_capacity(original_entries.len());
1447
1448 for (key, mut metadata) in original_entries {
1449 let standard_key = if key.is_absolute() {
1450 path_utils::to_standard_path(&key, repo_root)
1451 } else {
1452 path_utils::from_manifest_path(&key)
1453 };
1454 let manifest_key = path_utils::to_manifest_path(&standard_key);
1455
1456 let metadata_standard = if metadata.path.is_absolute() {
1457 path_utils::to_standard_path(&metadata.path, repo_root)
1458 } else {
1459 path_utils::from_manifest_path(&metadata.path)
1460 };
1461 metadata.path = path_utils::to_manifest_path(&metadata_standard);
1462
1463 normalized.insert(manifest_key, metadata);
1464 }
1465
1466 manifest.files = normalized;
1467}
1468
1469fn save_manifest(path: &Path, manifest: &IndexManifest) -> Result<()> {
1470 let data = serde_json::to_vec_pretty(manifest)?;
1471 atomic_write(path, &data)
1472}
1473
1474fn save_index_entry(path: &Path, entry: &IndexEntry) -> Result<()> {
1475 let data = bincode::serialize(entry)?;
1476 atomic_write(path, &data)
1477}
1478
1479fn atomic_write(path: &Path, data: &[u8]) -> Result<()> {
1480 let parent = path.parent().unwrap_or_else(|| Path::new("."));
1481 fs::create_dir_all(parent)?;
1482
1483 let mut tmp = NamedTempFile::new_in(parent)?;
1484 tmp.write_all(data)?;
1485 tmp.as_file().sync_all()?;
1486
1487 if path.exists() {
1488 fs::remove_file(path)?;
1489 }
1490
1491 tmp.persist(path)?;
1492 Ok(())
1493}
1494
1495pub fn load_index_entry(path: &Path) -> Result<IndexEntry> {
1496 let data = fs::read(path)?;
1497 Ok(bincode::deserialize(&data)?)
1498}
1499
1500fn find_repo_root(path: &Path) -> Result<PathBuf> {
1501 let mut current = if path.is_file() {
1502 path.parent().unwrap_or(path)
1503 } else {
1504 path
1505 };
1506
1507 loop {
1508 if current.join(".ck").exists() || current.join(".git").exists() {
1509 return Ok(current.to_path_buf());
1510 }
1511
1512 match current.parent() {
1513 Some(parent) => current = parent,
1514 None => return Ok(path.to_path_buf()),
1515 }
1516 }
1517}
1518
1519fn should_reextract(source_path: &Path, cache_path: &Path) -> Result<bool> {
1521 if !cache_path.exists() {
1522 return Ok(true);
1523 }
1524
1525 let source_modified = fs::metadata(source_path)?.modified()?;
1526 let cache_modified = fs::metadata(cache_path)?.modified()?;
1527
1528 Ok(source_modified > cache_modified)
1529}
1530
1531fn extract_pdf_text(path: &Path) -> Result<String> {
1533 pdf_extract::extract_text(path)
1534 .map_err(|e| anyhow::anyhow!("Failed to extract text from PDF {}: {}", path.display(), e))
1535}
1536
1537fn preprocess_file(file_path: &Path, repo_root: &Path) -> Result<PathBuf> {
1541 if ck_core::pdf::is_pdf_file(file_path) {
1542 let cache_path = ck_core::pdf::get_content_cache_path(repo_root, file_path);
1543
1544 if should_reextract(file_path, &cache_path)? {
1546 tracing::debug!(
1547 "Extracting PDF content from {:?} to {:?}",
1548 file_path,
1549 cache_path
1550 );
1551 let extracted_text = extract_pdf_text(file_path)?;
1552
1553 if let Some(parent) = cache_path.parent() {
1555 fs::create_dir_all(parent)?;
1556 }
1557
1558 fs::write(&cache_path, extracted_text)?;
1560 }
1561
1562 Ok(cache_path) } else {
1564 Ok(file_path.to_path_buf()) }
1566}
1567
1568fn is_text_file(path: &Path) -> bool {
1569 if ck_core::pdf::is_pdf_file(path) {
1571 return true;
1572 }
1573
1574 const BUFFER_SIZE: usize = 8192;
1576
1577 match std::fs::File::open(path) {
1578 Ok(mut file) => {
1579 let mut buffer = vec![0; BUFFER_SIZE];
1580 match file.read(&mut buffer) {
1581 Ok(bytes_read) => {
1582 if bytes_read == 0 {
1584 return true;
1585 }
1586
1587 !buffer[..bytes_read].contains(&0)
1589 }
1590 Err(_) => false, }
1592 }
1593 Err(_) => false, }
1595}
1596
/// Maps a sidecar path inside `index_dir` back to the repo-relative path of
/// the file it describes, by dropping the sidecar's final extension.
///
/// Exactly one extension is removed, so the sidecar `notes.ck.ck` maps back to
/// `notes.ck` rather than `notes`.
///
/// Returns `None` when `sidecar_path` does not live under `index_dir`.
/// `_repo_root` is accepted for signature compatibility and is not used.
#[cfg(test)]
fn sidecar_to_original_path(
    sidecar_path: &Path,
    index_dir: &Path,
    _repo_root: &Path,
) -> Option<PathBuf> {
    let relative_path = sidecar_path.strip_prefix(index_dir).ok()?;
    Some(relative_path.with_extension(""))
}
1618
1619fn remove_empty_dirs(dir: &Path) -> Result<()> {
1620 if !dir.is_dir() {
1621 return Ok(());
1622 }
1623
1624 for entry in fs::read_dir(dir)? {
1625 let entry = entry?;
1626 let path = entry.path();
1627 if path.is_dir() {
1628 remove_empty_dirs(&path)?;
1629 if fs::read_dir(&path)?.next().is_none() {
1631 let _ = fs::remove_dir(&path);
1632 }
1633 }
1634 }
1635
1636 Ok(())
1637}
1638
1639#[derive(Debug, Clone, Default, Serialize, Deserialize)]
1640pub struct CleanupStats {
1641 pub orphaned_entries_removed: usize,
1642 pub orphaned_sidecars_removed: usize,
1643}
1644
1645#[derive(Debug, Clone, Default, Serialize, Deserialize)]
1646pub struct IndexStats {
1647 pub total_files: usize,
1648 pub total_chunks: usize,
1649 pub embedded_chunks: usize,
1650 pub total_size_bytes: u64,
1651 pub index_size_bytes: u64,
1652 pub index_created: u64,
1653 pub index_updated: u64,
1654}
1655
1656#[derive(Debug, Clone, Default, Serialize, Deserialize)]
1657pub struct UpdateStats {
1658 pub files_indexed: usize,
1659 pub files_added: usize,
1660 pub files_modified: usize,
1661 pub files_up_to_date: usize,
1662 pub files_errored: usize,
1663 pub orphaned_files_removed: usize,
1664 pub chunks_reused: usize,
1665 pub chunks_embedded: usize,
1666}
1667
#[cfg(test)]
mod tests {
    use super::*;
    use std::fs;
    use tempfile::TempDir;

    /// Collection options with no extra exclude patterns.
    fn collection_options(
        respect_gitignore: bool,
        use_ckignore: bool,
    ) -> ck_core::FileCollectionOptions {
        ck_core::FileCollectionOptions {
            respect_gitignore,
            use_ckignore,
            exclude_patterns: vec![],
        }
    }

    /// Final path component of every collected path, as owned strings.
    fn file_names<'a>(paths: impl IntoIterator<Item = &'a PathBuf>) -> Vec<String> {
        paths
            .into_iter()
            .filter_map(|p| p.file_name())
            .map(|n| n.to_string_lossy().to_string())
            .collect()
    }

    /// Creates `<repo>/.ck/manifest.json` tracking a single entry keyed by
    /// `metadata.path`, and returns the manifest's location.
    fn write_manifest_tracking(repo: &Path, metadata: FileMetadata) -> PathBuf {
        let index_dir = repo.join(".ck");
        fs::create_dir_all(&index_dir).unwrap();

        let mut manifest = IndexManifest::default();
        manifest.files.insert(metadata.path.clone(), metadata);

        let manifest_path = index_dir.join("manifest.json");
        save_manifest(&manifest_path, &manifest).unwrap();
        manifest_path
    }

    /// Embedder stub whose `embed` always returns an empty batch.
    struct EmptyResultsEmbedder;

    impl ck_embed::Embedder for EmptyResultsEmbedder {
        fn id(&self) -> &'static str {
            "empty-results-test"
        }

        fn dim(&self) -> usize {
            384
        }

        fn model_name(&self) -> &str {
            "test-empty-results"
        }

        fn embed(&mut self, _texts: &[String]) -> Result<Vec<Vec<f32>>> {
            Ok(vec![])
        }
    }

    /// Embedder stub that returns one embedding fewer than it was asked for.
    struct MismatchedCountEmbedder;

    impl ck_embed::Embedder for MismatchedCountEmbedder {
        fn id(&self) -> &'static str {
            "mismatched-count-test"
        }

        fn dim(&self) -> usize {
            384
        }

        fn model_name(&self) -> &str {
            "test-mismatched-count"
        }

        fn embed(&mut self, texts: &[String]) -> Result<Vec<Vec<f32>>> {
            // An empty request yields an empty batch; otherwise drop one.
            let short_count = texts.len().saturating_sub(1);
            Ok(vec![vec![0.0; self.dim()]; short_count])
        }
    }

    #[test]
    fn test_index_single_file_handles_empty_embedding_results() {
        let workspace = TempDir::new().unwrap();
        let root = workspace.path();

        let target = root.join("test.txt");
        fs::write(&target, "hello world").unwrap();

        let mut embedder: Box<dyn ck_embed::Embedder> = Box::new(EmptyResultsEmbedder);
        let outcome = index_single_file(&target, root, Some(&mut embedder));

        // The batch path reports the count mismatch and names the file.
        assert!(outcome.is_err());
        let message = outcome.unwrap_err().to_string();
        assert!(message.contains("Embedder returned 0 embeddings for 1 chunks"));
        assert!(message.contains("Expected equal counts"));
        assert!(message.contains("test.txt"));
    }

    #[test]
    fn test_index_single_file_with_progress_handles_empty_embedding_results() {
        let workspace = TempDir::new().unwrap();
        let root = workspace.path();

        let target = root.join("test.txt");
        fs::write(&target, "hello world").unwrap();

        let mut embedder: Box<dyn ck_embed::Embedder> = Box::new(EmptyResultsEmbedder);
        let noop_progress: DetailedProgressCallback = Box::new(|_progress: EmbeddingProgress| {});

        let outcome = index_single_file_with_progress(
            &target,
            root,
            Some(&mut embedder),
            Some(&noop_progress),
            0,
            1,
        );

        // The per-chunk path reports which chunk came back empty.
        assert!(outcome.is_err());
        let message = outcome.unwrap_err().to_string();
        assert!(message.contains("Embedder returned empty results"));
        assert!(message.contains("chunk 0"));
        assert!(message.contains("test.txt"));
    }

    #[test]
    fn test_index_single_file_handles_mismatched_embedding_count() {
        let workspace = TempDir::new().unwrap();
        let root = workspace.path();

        // Two functions so the chunker can produce more than one chunk.
        let target = root.join("test.rs");
        fs::write(
            &target,
            "fn main() {\n    println!(\"hello\");\n}\n\nfn other() {\n    println!(\"world\");\n}",
        )
        .unwrap();

        let mut embedder: Box<dyn ck_embed::Embedder> = Box::new(MismatchedCountEmbedder);
        let outcome = index_single_file(&target, root, Some(&mut embedder));

        assert!(outcome.is_err());
        let message = outcome.unwrap_err().to_string();
        assert!(message.contains("Embedder returned"));
        assert!(message.contains("embeddings for"));
        assert!(message.contains("chunks"));
        assert!(message.contains("Expected equal counts"));
    }

    #[test]
    fn test_index_single_file_with_valid_embedder_still_works() {
        let workspace = TempDir::new().unwrap();
        let root = workspace.path();

        let target = root.join("test.txt");
        fs::write(&target, "hello world").unwrap();

        let mut embedder: Box<dyn ck_embed::Embedder> = Box::new(ck_embed::DummyEmbedder::new());
        let outcome = index_single_file(&target, root, Some(&mut embedder));

        assert!(outcome.is_ok());
        let entry = outcome.unwrap();
        assert!(!entry.chunks.is_empty());
        // Every chunk must carry an embedding of the expected width.
        for chunk in &entry.chunks {
            let width = chunk.embedding.as_ref().map(|vector| vector.len());
            assert_eq!(width, Some(384));
        }
    }

    #[tokio::test]
    async fn test_smart_update_index() {
        let workspace = TempDir::new().unwrap();
        let root = workspace.path();
        let options = collection_options(true, true);

        // First run indexes the only file as a new addition.
        fs::write(root.join("file1.txt"), "initial content").unwrap();
        let first = smart_update_index(root, false, &options).await.unwrap();
        assert_eq!(first.files_added, 1);
        assert_eq!(first.files_indexed, 1);

        // Nothing changed: the file is reported as up to date.
        let second = smart_update_index(root, false, &options).await.unwrap();
        assert_eq!(second.files_up_to_date, 1);
        assert_eq!(second.files_indexed, 0);

        // Changed content is picked up as a modification.
        fs::write(root.join("file1.txt"), "modified content").unwrap();
        let third = smart_update_index(root, false, &options).await.unwrap();
        assert_eq!(third.files_modified, 1);
        assert_eq!(third.files_indexed, 1);

        // A new file is added while the existing one stays up to date.
        fs::write(root.join("file2.txt"), "new file content").unwrap();
        let fourth = smart_update_index(root, false, &options).await.unwrap();
        assert_eq!(fourth.files_added, 1);
        assert_eq!(fourth.files_up_to_date, 1);
        assert_eq!(fourth.files_indexed, 1);
    }

    #[test]
    fn test_cleanup_index() {
        let workspace = TempDir::new().unwrap();
        let root = workspace.path();

        // The manifest tracks a file that does not exist on disk.
        let manifest_path = write_manifest_tracking(
            root,
            FileMetadata {
                path: root.join("deleted_file.txt"),
                hash: "fake_hash".to_string(),
                last_modified: 0,
                size: 0,
            },
        );

        let stats = cleanup_index(root, &collection_options(true, true)).unwrap();
        assert_eq!(stats.orphaned_entries_removed, 1);

        let reloaded = load_or_create_manifest(&manifest_path).unwrap();
        assert!(reloaded.files.is_empty());
    }

    #[test]
    fn test_get_index_stats() {
        let workspace = TempDir::new().unwrap();
        let root = workspace.path();

        // Without any index there is nothing to count.
        let before = get_index_stats(root).unwrap();
        assert_eq!(before.total_files, 0);

        write_manifest_tracking(
            root,
            FileMetadata {
                path: root.join("test.txt"),
                hash: "test_hash".to_string(),
                last_modified: 1234567890,
                size: 100,
            },
        );

        let after = get_index_stats(root).unwrap();
        assert_eq!(after.total_files, 1);
    }

    #[test]
    fn test_sidecar_to_original_path() {
        let workspace = TempDir::new().unwrap();
        let repo = workspace.path();
        let index_dir = repo.join(".ck");

        let flat = sidecar_to_original_path(&index_dir.join("test.txt.ck"), &index_dir, repo);
        assert_eq!(flat, Some(PathBuf::from("test.txt")));

        let nested_sidecar = index_dir.join("src").join("main.rs.ck");
        let nested = sidecar_to_original_path(&nested_sidecar, &index_dir, repo);
        assert_eq!(nested, Some(PathBuf::from("src/main.rs")));
    }

    #[test]
    fn test_is_text_file() {
        let workspace = TempDir::new().unwrap();
        let dir = workspace.path();

        // Plain text, regardless of extension (or lack of one), is accepted.
        let text_file = dir.join("test.txt");
        fs::write(&text_file, b"Hello world\nThis is text content").unwrap();
        assert!(is_text_file(&text_file));

        let log_file = dir.join("app.log");
        fs::write(&log_file, b"2024-01-15 ERROR: Failed to connect").unwrap();
        assert!(is_text_file(&log_file));

        let no_ext_file = dir.join("README");
        fs::write(&no_ext_file, b"This is a README file").unwrap();
        assert!(is_text_file(&no_ext_file));

        // An embedded NUL byte marks the file as binary.
        let binary_file = dir.join("test.bin");
        fs::write(&binary_file, b"Hello\0World").unwrap();
        assert!(!is_text_file(&binary_file));

        // Empty files count as text.
        let empty_file = dir.join("empty.txt");
        fs::File::create(&empty_file).unwrap();
        assert!(is_text_file(&empty_file));

        // Files that cannot be opened do not.
        assert!(!is_text_file(&dir.join("nonexistent.txt")));
    }

    #[test]
    fn test_remove_empty_dirs() {
        let workspace = TempDir::new().unwrap();
        let root = workspace.path();

        let level1 = root.join("level1");
        let level2 = level1.join("level2");
        let level3 = level2.join("level3");
        fs::create_dir_all(&level3).unwrap();

        remove_empty_dirs(root).unwrap();

        // The whole empty chain is gone, innermost to outermost.
        assert!(!level3.exists());
        assert!(!level2.exists());
        assert!(!level1.exists());
    }

    #[test]
    fn test_no_ignore_disables_git_exclude() {
        let workspace = TempDir::new().unwrap();
        let root = workspace.path();

        // Minimal repository layout with a local exclude rule.
        fs::create_dir_all(root.join(".git/info")).unwrap();
        fs::write(root.join("visible.txt"), "visible content").unwrap();

        let excluded_dir = root.join("excluded_dir");
        fs::create_dir(&excluded_dir).unwrap();
        fs::write(excluded_dir.join("hidden.txt"), "hidden content").unwrap();

        fs::write(root.join(".git/info/exclude"), "/excluded_dir\n").unwrap();

        let files = collect_files(root, &collection_options(true, false)).unwrap();
        assert_eq!(
            files.len(),
            1,
            "With respect_gitignore=true, .git/info/exclude should hide files, found: {files:?}"
        );

        let files = collect_files(root, &collection_options(false, false)).unwrap();
        assert_eq!(
            files.len(),
            2,
            "With respect_gitignore=false, .git/info/exclude should be ignored, found: {files:?}"
        );
    }

    #[test]
    fn test_ckignore_works_without_gitignore() {
        let workspace = TempDir::new().unwrap();
        let root = workspace.path();

        // Each ignore file targets a different extension.
        fs::write(root.join(".gitignore"), "*.git\n").unwrap();
        fs::write(root.join(".ckignore"), "*.ck\n").unwrap();

        fs::write(root.join("normal.txt"), "normal content").unwrap();
        fs::write(root.join("ignored_by_git.git"), "git ignored").unwrap();
        fs::write(root.join("ignored_by_ck.ck"), "ck ignored").unwrap();

        // .gitignore off, .ckignore on.
        let ck_only = collect_files(root, &collection_options(false, true)).unwrap();
        let ck_only_names = file_names(&ck_only);

        assert!(
            ck_only_names.iter().any(|name| name == "normal.txt"),
            "Should find normal.txt"
        );
        assert!(
            ck_only_names.iter().any(|name| name == "ignored_by_git.git"),
            "Should find .git file when respect_gitignore=false"
        );
        assert!(
            !ck_only_names.iter().any(|name| name == "ignored_by_ck.ck"),
            "Should NOT find .ck file when use_ckignore=true"
        );

        // Both ignore mechanisms off.
        let unfiltered = collect_files(root, &collection_options(false, false)).unwrap();
        let unfiltered_names = file_names(&unfiltered);

        assert!(
            unfiltered_names.iter().any(|name| name == "ignored_by_git.git"),
            "Should find .git file"
        );
        assert!(
            unfiltered_names.iter().any(|name| name == "ignored_by_ck.ck"),
            "Should find .ck file when use_ckignore=false"
        );
    }
}
2146
2147mod cleanup_validation {
2153 use super::*;
2154 pub fn validate_and_cleanup_index(
2158 repo_root: &Path,
2159 index_dir: &Path,
2160 manifest: &mut IndexManifest,
2161 options: &ck_core::FileCollectionOptions,
2162 ) -> Result<CleanupStats> {
2163 let mut stats = CleanupStats::default();
2164
2165 let existing_files = collect_files_as_hashset(repo_root, options)?;
2167 let standard_existing_files: HashSet<PathBuf> = existing_files
2168 .into_iter()
2169 .map(|path| path_utils::to_standard_path(&path, repo_root))
2170 .collect();
2171
2172 let manifest_entries: Vec<PathBuf> =
2174 manifest.files.keys().map(|k| k.to_path_buf()).collect();
2175 for manifest_path in manifest_entries {
2176 let standard_path = path_utils::from_manifest_path(&manifest_path);
2177
2178 if !standard_existing_files.contains(&standard_path) {
2180 remove_manifest_entry(manifest, &manifest_path, repo_root, index_dir, &mut stats)?;
2181 continue;
2182 }
2183
2184 let sidecar_path =
2186 path_utils::get_sidecar_path_for_standard_path(index_dir, &standard_path);
2187 if !sidecar_path.exists() {
2188 remove_manifest_entry(manifest, &manifest_path, repo_root, index_dir, &mut stats)?;
2189 continue;
2190 }
2191 }
2192
2193 cleanup_orphaned_sidecars(index_dir, &standard_existing_files, manifest, &mut stats)?;
2195
2196 Ok(stats)
2197 }
2198
2199 fn remove_manifest_entry(
2201 manifest: &mut IndexManifest,
2202 manifest_path: &Path,
2203 repo_root: &Path,
2204 index_dir: &Path,
2205 stats: &mut CleanupStats,
2206 ) -> Result<()> {
2207 manifest.files.remove(manifest_path);
2208
2209 let standard_path = path_utils::from_manifest_path(manifest_path);
2211 let sidecar_path =
2212 path_utils::get_sidecar_path_for_standard_path(index_dir, &standard_path);
2213 if sidecar_path.exists() {
2214 fs::remove_file(&sidecar_path)?;
2215 stats.orphaned_sidecars_removed += 1;
2216 }
2217
2218 if ck_core::pdf::is_pdf_file(&standard_path) {
2220 let absolute_path = repo_root.join(&standard_path);
2221 let cache_path = ck_core::pdf::get_content_cache_path(repo_root, &absolute_path);
2222 if cache_path.exists() {
2223 fs::remove_file(&cache_path)?;
2224 tracing::debug!("Removed orphaned content cache: {:?}", cache_path);
2225 }
2226 }
2227
2228 stats.orphaned_entries_removed += 1;
2229 tracing::warn!("Removed manifest entry: {:?}", manifest_path);
2230 Ok(())
2231 }
2232
2233 fn cleanup_orphaned_sidecars(
2235 index_dir: &Path,
2236 standard_existing_files: &HashSet<PathBuf>,
2237 manifest: &IndexManifest,
2238 stats: &mut CleanupStats,
2239 ) -> Result<()> {
2240 if !index_dir.exists() {
2241 return Ok(());
2242 }
2243
2244 for entry in WalkDir::new(index_dir) {
2245 let entry = entry?;
2246 if entry.file_type().is_file() {
2247 let sidecar_path = entry.path();
2248 if sidecar_path.extension().and_then(|s| s.to_str()) == Some("ck")
2249 && let Some(standard_path) =
2250 path_utils::sidecar_to_standard_path(sidecar_path, index_dir)
2251 {
2252 let manifest_path = path_utils::to_manifest_path(&standard_path);
2253
2254 if !standard_existing_files.contains(&standard_path)
2256 || !manifest.files.contains_key(&manifest_path)
2257 {
2258 fs::remove_file(sidecar_path)?;
2259 stats.orphaned_sidecars_removed += 1;
2260 }
2261 }
2262 }
2263 }
2264
2265 Ok(())
2266 }
2267}
2268
/// Conversions between the path forms used by the index:
///
/// * "standard" paths are relative to the repository root (`src/main.rs`);
/// * "manifest" paths are standard paths prefixed with `./` (`./src/main.rs`);
/// * "sidecar" paths live in the index directory and are the standard path
///   with `.ck` appended (`<index_dir>/src/main.rs.ck`).
mod path_utils {
    use super::*;

    /// Makes `absolute_path` relative to `repo_root`.
    ///
    /// A path that does not lie under `repo_root` is returned unchanged.
    pub fn to_standard_path(absolute_path: &Path, repo_root: &Path) -> PathBuf {
        absolute_path
            .strip_prefix(repo_root)
            .unwrap_or(absolute_path)
            .to_path_buf()
    }

    /// Converts a standard path into manifest form by joining it onto `.`.
    pub fn to_manifest_path(standard_path: &Path) -> PathBuf {
        Path::new(".").join(standard_path)
    }

    /// Converts a manifest path back into standard form by dropping a leading
    /// `.` component; paths without one are returned unchanged.
    pub fn from_manifest_path(manifest_path: &Path) -> PathBuf {
        manifest_path
            .strip_prefix(".")
            .unwrap_or(manifest_path)
            .to_path_buf()
    }

    /// Returns the sidecar location for `standard_path` inside `index_dir`.
    ///
    /// The name is built from the raw OS string, so paths that are not valid
    /// UTF-8 are preserved exactly instead of being lossily re-encoded.
    pub fn get_sidecar_path_for_standard_path(index_dir: &Path, standard_path: &Path) -> PathBuf {
        let mut sidecar_name = standard_path.as_os_str().to_os_string();
        sidecar_name.push(".ck");
        index_dir.join(sidecar_name)
    }

    /// Maps a sidecar path back to the standard path it was created for.
    ///
    /// Exactly one extension is removed, mirroring the single `.ck` appended
    /// by `get_sidecar_path_for_standard_path`. A file that itself ends in
    /// `.ck` (sidecar `notes.ck.ck`) therefore maps back to `notes.ck`.
    ///
    /// Returns `None` when `sidecar_path` does not live under `index_dir`.
    pub fn sidecar_to_standard_path(sidecar_path: &Path, index_dir: &Path) -> Option<PathBuf> {
        let relative_path = sidecar_path.strip_prefix(index_dir).ok()?;
        Some(relative_path.with_extension(""))
    }
}