Skip to main content

ck_index/
lib.rs

1use anyhow::Result;
2use ck_core::{
3    FileMetadata, Language, Span, compute_chunk_hash, compute_file_hash, get_sidecar_path,
4};
5use ignore::{WalkBuilder, overrides::OverrideBuilder};
6use rayon::prelude::*;
7use serde::{Deserialize, Serialize};
8use std::collections::{HashMap, HashSet};
9use std::fs;
10use std::io::{Read, Write};
11use std::path::{Path, PathBuf};
12use std::sync::Once;
13use std::sync::atomic::{AtomicBool, Ordering};
14use std::time::SystemTime;
15use tempfile::NamedTempFile;
16use walkdir::WalkDir;
17
18fn legacy_model_config(name: &str, dimensions: Option<usize>) -> ck_models::ModelConfig {
19    ck_models::ModelConfig {
20        name: name.to_string(),
21        provider: "fastembed".to_string(),
22        dimensions: dimensions.unwrap_or(384),
23        max_tokens: 8192,
24        description: "Legacy ck embedding model (inferred from manifest)".to_string(),
25    }
26}
27
/// Boxed callback that receives a progress message as a string slice.
///
/// The `Send + Sync` bounds allow it to be shared across threads.
pub type ProgressCallback = Box<dyn Fn(&str) + Send + Sync>;
29
/// Detailed progress information for embedding operations
///
/// Passed by value to a [`DetailedProgressCallback`].
// NOTE(review): the code that fills these fields is outside this view; the field descriptions
// below are inferred from the names (including whether indices are 0- or 1-based) — confirm.
#[derive(Debug, Clone)]
pub struct EmbeddingProgress {
    /// Presumably the name of the file currently being embedded.
    pub file_name: String,
    /// Presumably the position of that file among `total_files`.
    pub file_index: usize,
    /// Presumably the number of files in the current run.
    pub total_files: usize,
    /// Presumably the position of the current chunk among `total_chunks`.
    pub chunk_index: usize,
    /// Presumably the number of chunks in the current file.
    pub total_chunks: usize,
    /// Presumably the size of the current chunk; the unit is not visible here.
    pub chunk_size: usize,
}
40
/// Boxed callback that receives an [`EmbeddingProgress`] value by value.
///
/// The `Send + Sync` bounds allow it to be shared across threads.
pub type DetailedProgressCallback = Box<dyn Fn(EmbeddingProgress) + Send + Sync>;
42
/// Enhanced progress information for granular indexing feedback
///
/// Passed by value to an [`EnhancedProgressCallback`].
// NOTE(review): the code that emits these events is outside this view. Units are inferred from
// field names only (`*_ms` presumably milliseconds, `file_size` presumably bytes) — confirm.
#[derive(Debug, Clone)]
pub enum IndexingProgress {
    /// Starting indexing process
    Starting { total_files: usize },
    /// Processing a specific file
    ProcessingFile {
        file: String,
        file_number: usize,
        total_files: usize,
        file_size: u64,
    },
    /// Chunking a file
    ChunkingFile { file: String, chunks_found: usize },
    /// Processing chunk for embedding
    ProcessingChunk {
        file: String,
        chunk_number: usize,
        total_chunks: usize,
        chunk_size: usize,
    },
    /// Finished processing a file
    FileComplete {
        file: String,
        chunks_processed: usize,
        file_number: usize,
        total_files: usize,
        elapsed_ms: u64,
    },
    /// Overall completion
    Complete {
        total_files: usize,
        total_chunks: usize,
        total_elapsed_ms: u64,
    },
}
79
/// Boxed callback that receives an [`IndexingProgress`] event by value.
///
/// The `Send + Sync` bounds allow it to be shared across threads.
pub type EnhancedProgressCallback = Box<dyn Fn(IndexingProgress) + Send + Sync>;
81
// Process-wide interrupt flag. It is raised by `request_interrupt` and by the Ctrl-C handler
// installed during smart updates, cleared at the start of a smart update, and polled while
// scanning files.
static INTERRUPTED: AtomicBool = AtomicBool::new(false);
// Guards the one-time registration of the Ctrl-C handler.
static HANDLER_INIT: Once = Once::new();

/// Human-readable message describing a user-initiated interruption.
pub const INDEX_INTERRUPTED_MSG: &str = "Indexing interrupted by user";

/// Raises the global interrupt flag so that indexing code polling it can stop early.
pub fn request_interrupt() {
    // OR-ing with `true` leaves the flag set regardless of its previous value.
    INTERRUPTED.fetch_or(true, Ordering::SeqCst);
}
91
92/// Build override patterns for excluding files during directory traversal
93fn build_overrides(
94    base_path: &Path,
95    exclude_patterns: &[String],
96) -> Result<ignore::overrides::Override> {
97    let mut builder = OverrideBuilder::new(base_path);
98
99    for pattern in exclude_patterns {
100        if pattern.starts_with('!') {
101            builder.add(pattern)?;
102        } else {
103            builder.add(&format!("!{}", pattern))?;
104        }
105    }
106
107    Ok(builder.build()?)
108}
109
/// Index record for a single file: its metadata plus the chunks extracted from it.
///
/// This is the value written to and read from a file's sidecar path.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct IndexEntry {
    /// Metadata of the indexed file; a copy of it is also stored in the manifest.
    pub metadata: FileMetadata,
    /// Chunks produced for the file.
    pub chunks: Vec<ChunkEntry>,
}
115
/// One chunk of an indexed file, as stored inside an [`IndexEntry`].
///
/// Fields marked `#[serde(default)]` deserialize to `None` when absent from the stored data.
// NOTE(review): the code that fills these fields is outside this view; descriptions marked
// "presumably" are inferred from field names — confirm.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct ChunkEntry {
    /// Location of the chunk within its file.
    pub span: Span,
    /// Embedding vector for the chunk, or `None` when none is stored.
    pub embedding: Option<Vec<f32>>,
    pub chunk_type: Option<String>, // "function", "class", "method", or None for generic
    /// Presumably a human-readable path to the chunk within the file's structure.
    #[serde(default)]
    pub breadcrumb: Option<String>,
    /// Presumably the names of the enclosing items.
    #[serde(default)]
    pub ancestry: Option<Vec<String>>,
    /// Presumably the chunk length in bytes.
    #[serde(default)]
    pub byte_length: Option<usize>,
    /// Presumably an estimate of the chunk's token count.
    #[serde(default)]
    pub estimated_tokens: Option<usize>,
    /// Presumably text (such as comments) immediately preceding the chunk.
    #[serde(default)]
    pub leading_trivia: Option<Vec<String>>,
    /// Presumably text (such as comments) immediately following the chunk.
    #[serde(default)]
    pub trailing_trivia: Option<Vec<String>>,
    /// Blake3 hash of the chunk text for incremental indexing
    #[serde(default)]
    pub chunk_hash: Option<String>,
}
137
/// Manifest of an index, persisted as `manifest.json` inside the `.ck` directory.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct IndexManifest {
    /// Manifest format version string.
    pub version: String,
    /// Creation time, in seconds since the Unix epoch.
    pub created: u64,
    /// Time of the last update, in seconds since the Unix epoch.
    pub updated: u64,
    /// Metadata of every indexed file, keyed by the path stored in that metadata.
    pub files: HashMap<PathBuf, FileMetadata>,
    /// Embedding model used for this index (added in v0.4.2+)
    pub embedding_model: Option<String>,
    /// Embedding model dimensions (for validation)
    pub embedding_dimensions: Option<usize>,
    /// Chunk hash version for incremental indexing
    /// - v1 = blake3 of chunk text only
    /// - v2 = blake3 of chunk text + leading_trivia + trailing_trivia
    #[serde(default)]
    pub chunk_hash_version: Option<u32>,
}
154
155impl Default for IndexManifest {
156    fn default() -> Self {
157        let now = SystemTime::now()
158            .duration_since(SystemTime::UNIX_EPOCH)
159            .unwrap()
160            .as_secs();
161
162        Self {
163            version: "0.1.0".to_string(),
164            created: now,
165            updated: now,
166            files: HashMap::new(),
167            embedding_model: None, // Default to None for backward compatibility
168            embedding_dimensions: None,
169            chunk_hash_version: Some(2), // v2 = blake3 of chunk text + trivia
170        }
171    }
172}
173
174/// Common filtering logic for directory traversal entries
175fn should_include_file(entry: &ignore::DirEntry, index_dir: &Path) -> bool {
176    let path = entry.path();
177    entry.file_type().is_some_and(|ft| ft.is_file())
178        && is_text_file(path)
179        && !path.starts_with(index_dir)
180}
181
182/// Apply common filtering to a WalkBuilder iterator
183fn filter_and_collect_files(walker: ignore::Walk, index_dir: &Path) -> Vec<PathBuf> {
184    walker
185        .filter_map(|entry| entry.ok())
186        .filter(|entry| should_include_file(entry, index_dir))
187        .map(|entry| entry.path().to_path_buf())
188        .collect()
189}
190
191pub fn collect_files(
192    path: &Path,
193    options: &ck_core::FileCollectionOptions,
194) -> Result<Vec<PathBuf>> {
195    let index_dir = path.join(".ck");
196
197    if options.respect_gitignore {
198        let overrides = build_overrides(path, &options.exclude_patterns)?;
199        let mut walker_builder = WalkBuilder::new(path);
200        walker_builder
201            .git_ignore(true)
202            .git_global(true)
203            .git_exclude(true)
204            .hidden(true);
205
206        // Add .ckignore support (hierarchical, like .gitignore)
207        if options.use_ckignore {
208            walker_builder.add_custom_ignore_filename(".ckignore");
209        }
210
211        walker_builder.overrides(overrides);
212        let walker = walker_builder.build();
213
214        Ok(filter_and_collect_files(walker, &index_dir))
215    } else {
216        // Use WalkBuilder without gitignore support, but still apply overrides
217        use ck_core::get_default_exclude_patterns;
218        let default_patterns = get_default_exclude_patterns();
219
220        // Combine default patterns with user exclude patterns
221        let mut all_patterns = default_patterns;
222        all_patterns.extend(options.exclude_patterns.iter().cloned());
223        let combined_overrides = build_overrides(path, &all_patterns)?;
224
225        let mut walker_builder = WalkBuilder::new(path);
226        walker_builder
227            .git_ignore(false)
228            .git_global(false)
229            .git_exclude(false)
230            .hidden(true);
231
232        // Add .ckignore support even without gitignore
233        if options.use_ckignore {
234            walker_builder.add_custom_ignore_filename(".ckignore");
235        }
236
237        walker_builder.overrides(combined_overrides);
238        let walker = walker_builder.build();
239
240        Ok(filter_and_collect_files(walker, &index_dir))
241    }
242}
243
244fn collect_files_as_hashset(
245    path: &Path,
246    options: &ck_core::FileCollectionOptions,
247) -> Result<HashSet<PathBuf>> {
248    Ok(collect_files(path, options)?.into_iter().collect())
249}
250
251pub async fn index_directory(
252    path: &Path,
253    compute_embeddings: bool,
254    options: &ck_core::FileCollectionOptions,
255    model: Option<&str>,
256) -> Result<()> {
257    tracing::info!(
258        "index_directory called with compute_embeddings={}",
259        compute_embeddings
260    );
261    let index_dir = path.join(".ck");
262    fs::create_dir_all(&index_dir)?;
263
264    let manifest_path = index_dir.join("manifest.json");
265    let mut manifest = load_or_create_manifest(&manifest_path)?;
266    normalize_manifest_paths(&mut manifest, path);
267
268    // Handle model configuration for embeddings
269    let resolved_model = if compute_embeddings {
270        let model_registry = ck_models::ModelRegistry::default();
271        let (alias, config) = model_registry
272            .resolve(model)
273            .map_err(|e| anyhow::anyhow!(e.to_string()))?;
274
275        if let Some(existing_model) = &manifest.embedding_model
276            && existing_model != &config.name
277        {
278            return Err(anyhow::anyhow!(
279                "Model mismatch: Index was created with '{}', but you're trying to use '{}'. \
280                Please run 'ck --clean {}' to remove the old index, then rerun with the new model.",
281                existing_model,
282                config.name,
283                path.display()
284            ));
285        }
286
287        manifest.embedding_model = Some(config.name.clone());
288        manifest.embedding_dimensions = Some(config.dimensions);
289
290        Some((alias, config))
291    } else {
292        None
293    };
294
295    let files = collect_files(path, options)?;
296
297    if compute_embeddings {
298        // Sequential processing with small-batch embeddings for streaming performance
299        tracing::info!("Creating embedder for {} files", files.len());
300        let (_, config) = resolved_model
301            .as_ref()
302            .expect("resolved model must be present when computing embeddings");
303        let mut embedder = ck_embed::create_embedder_for_config(config, None)?;
304
305        for file_path in files.iter() {
306            match index_single_file(file_path, path, Some(&mut embedder)) {
307                Ok(entry) => {
308                    // Write sidecar immediately
309                    let sidecar_path = get_sidecar_path(path, file_path);
310                    save_index_entry(&sidecar_path, &entry)?;
311
312                    // Update and save manifest immediately
313                    let manifest_key = entry.metadata.path.clone();
314                    manifest.files.insert(manifest_key, entry.metadata);
315                    manifest.updated = SystemTime::now()
316                        .duration_since(SystemTime::UNIX_EPOCH)
317                        .unwrap()
318                        .as_secs();
319                    save_manifest(&manifest_path, &manifest)?;
320                }
321                Err(e) => {
322                    // Suppress warnings for binary files and UTF-8 errors in .git directories
323                    let error_msg = e.to_string();
324                    let is_binary_skip = error_msg.contains("Binary file, skipping");
325                    let is_utf8_error = error_msg.contains("stream did not contain valid UTF-8");
326                    let is_git_file = file_path.components().any(|c| c.as_os_str() == ".git");
327
328                    if !(is_binary_skip || is_utf8_error && is_git_file) {
329                        tracing::warn!("Failed to index {:?}: {}", file_path, e);
330                    }
331                }
332            }
333        }
334    } else {
335        // Parallel processing with streaming using producer-consumer pattern
336        use std::sync::mpsc;
337        use std::thread;
338
339        let (tx, rx) = mpsc::channel();
340        let files_clone = files.clone();
341        let path_clone = path.to_path_buf();
342
343        // Spawn worker thread for parallel processing
344        let worker_handle = thread::spawn(move || {
345            files_clone.par_iter().for_each(|file_path| {
346                match index_single_file(file_path, &path_clone, None) {
347                    Ok(entry) => {
348                        if tx.send((file_path.clone(), entry)).is_err() {
349                            // Receiver dropped, stop processing
350                        }
351                    }
352                    Err(e) => {
353                        // Suppress warnings for binary files and UTF-8 errors in .git directories
354                        let error_msg = e.to_string();
355                        let is_binary_skip = error_msg.contains("Binary file, skipping");
356                        let is_utf8_error =
357                            error_msg.contains("stream did not contain valid UTF-8");
358                        let is_git_file = file_path.components().any(|c| c.as_os_str() == ".git");
359
360                        if !(is_binary_skip || is_utf8_error && is_git_file) {
361                            tracing::warn!("Failed to index {:?}: {}", file_path, e);
362                        }
363                    }
364                }
365            });
366        });
367
368        // Main thread: stream results as they arrive
369        while let Ok((file_path, entry)) = rx.recv() {
370            // Write sidecar immediately
371            let sidecar_path = get_sidecar_path(path, &file_path);
372            save_index_entry(&sidecar_path, &entry)?;
373
374            // Update and save manifest immediately
375            let manifest_key = entry.metadata.path.clone();
376            manifest.files.insert(manifest_key, entry.metadata);
377            manifest.updated = SystemTime::now()
378                .duration_since(SystemTime::UNIX_EPOCH)
379                .unwrap()
380                .as_secs();
381            save_manifest(&manifest_path, &manifest)?;
382        }
383
384        // Wait for worker to complete
385        worker_handle
386            .join()
387            .map_err(|_| anyhow::anyhow!("Worker thread panicked"))?;
388    }
389
390    // Manifest is already updated after each file in streaming mode
391    // Only save manifest if using parallel processing (non-embedding case)
392    if !compute_embeddings {
393        manifest.updated = SystemTime::now()
394            .duration_since(SystemTime::UNIX_EPOCH)
395            .unwrap()
396            .as_secs();
397        save_manifest(&manifest_path, &manifest)?;
398    }
399
400    Ok(())
401}
402
403pub async fn index_file(file_path: &Path, compute_embeddings: bool) -> Result<()> {
404    let repo_root = find_repo_root(file_path)?;
405    let index_dir = repo_root.join(".ck");
406    fs::create_dir_all(&index_dir)?;
407
408    let manifest_path = index_dir.join("manifest.json");
409    let mut manifest = load_or_create_manifest(&manifest_path)?;
410
411    let entry = if compute_embeddings {
412        let model_registry = ck_models::ModelRegistry::default();
413        let (alias, config) = if let Some(existing) = manifest.embedding_model.as_deref() {
414            match model_registry.resolve(Some(existing)) {
415                Ok(resolved) => resolved,
416                Err(_) => (
417                    existing.to_string(),
418                    legacy_model_config(existing, manifest.embedding_dimensions),
419                ),
420            }
421        } else {
422            model_registry
423                .resolve(None)
424                .map_err(|e| anyhow::anyhow!(e.to_string()))?
425        };
426
427        manifest.embedding_model = Some(config.name.clone());
428        manifest.embedding_dimensions = Some(config.dimensions);
429        tracing::debug!("Using embedding model '{}' ({})", config.name, alias);
430
431        let mut embedder = ck_embed::create_embedder_for_config(&config, None)?;
432        index_single_file(file_path, &repo_root, Some(&mut embedder))?
433    } else {
434        index_single_file(file_path, &repo_root, None)?
435    };
436    let sidecar_path = get_sidecar_path(&repo_root, file_path);
437
438    save_index_entry(&sidecar_path, &entry)?;
439    let manifest_key = entry.metadata.path.clone();
440    manifest.files.insert(manifest_key, entry.metadata);
441    manifest.updated = SystemTime::now()
442        .duration_since(SystemTime::UNIX_EPOCH)
443        .unwrap()
444        .as_secs();
445
446    save_manifest(&manifest_path, &manifest)?;
447
448    Ok(())
449}
450
451pub async fn update_index(
452    path: &Path,
453    compute_embeddings: bool,
454    options: &ck_core::FileCollectionOptions,
455) -> Result<()> {
456    let index_dir = path.join(".ck");
457    if !index_dir.exists() {
458        return index_directory(
459            path,
460            compute_embeddings,
461            options,
462            None, // model - use existing from manifest for update
463        )
464        .await;
465    }
466
467    let manifest_path = index_dir.join("manifest.json");
468    let mut manifest = load_or_create_manifest(&manifest_path)?;
469
470    let files = collect_files(path, options)?;
471
472    let updates: Vec<(PathBuf, IndexEntry)> = if compute_embeddings {
473        // Sequential processing when computing embeddings (for memory efficiency)
474        let model_registry = ck_models::ModelRegistry::default();
475        let (alias, config) = if let Some(existing) = manifest.embedding_model.as_deref() {
476            match model_registry.resolve(Some(existing)) {
477                Ok(resolved) => resolved,
478                Err(_) => (
479                    existing.to_string(),
480                    legacy_model_config(existing, manifest.embedding_dimensions),
481                ),
482            }
483        } else {
484            model_registry
485                .resolve(None)
486                .map_err(|e| anyhow::anyhow!(e.to_string()))?
487        };
488
489        manifest.embedding_model = Some(config.name.clone());
490        manifest.embedding_dimensions = Some(config.dimensions);
491        tracing::debug!(
492            "Updating index with embedding model '{}' ({})",
493            config.name,
494            alias
495        );
496
497        let mut embedder = ck_embed::create_embedder_for_config(&config, None)?;
498        files
499            .iter()
500            .filter_map(|file_path| {
501                let manifest_key =
502                    path_utils::to_manifest_path(&path_utils::to_standard_path(file_path, path));
503
504                let needs_update = match manifest.files.get(&manifest_key) {
505                    Some(metadata) => match compute_file_hash(file_path) {
506                        Ok(hash) => hash != metadata.hash,
507                        Err(_) => false,
508                    },
509                    None => true,
510                };
511                if needs_update {
512                    match index_single_file(file_path, path, Some(&mut embedder)) {
513                        Ok(entry) => Some((file_path.clone(), entry)),
514                        Err(e) => {
515                            // Suppress warnings for binary files and UTF-8 errors in .git directories
516                            let error_msg = e.to_string();
517                            let is_binary_skip = error_msg.contains("Binary file, skipping");
518                            let is_utf8_error =
519                                error_msg.contains("stream did not contain valid UTF-8");
520                            let is_git_file =
521                                file_path.components().any(|c| c.as_os_str() == ".git");
522
523                            if !(is_binary_skip || is_utf8_error && is_git_file) {
524                                tracing::warn!("Failed to index {:?}: {}", file_path, e);
525                            }
526                            None
527                        }
528                    }
529                } else {
530                    None
531                }
532            })
533            .collect()
534    } else {
535        // Parallel processing when not computing embeddings
536        files
537            .par_iter()
538            .filter_map(|file_path| {
539                let manifest_key =
540                    path_utils::to_manifest_path(&path_utils::to_standard_path(file_path, path));
541
542                let needs_update = match manifest.files.get(&manifest_key) {
543                    Some(metadata) => match compute_file_hash(file_path) {
544                        Ok(hash) => hash != metadata.hash,
545                        Err(_) => false,
546                    },
547                    None => true,
548                };
549
550                if needs_update {
551                    match index_single_file(file_path, path, None) {
552                        Ok(entry) => Some((file_path.clone(), entry)),
553                        Err(e) => {
554                            // Suppress warnings for binary files and UTF-8 errors in .git directories
555                            let error_msg = e.to_string();
556                            let is_binary_skip = error_msg.contains("Binary file, skipping");
557                            let is_utf8_error =
558                                error_msg.contains("stream did not contain valid UTF-8");
559                            let is_git_file =
560                                file_path.components().any(|c| c.as_os_str() == ".git");
561
562                            if !(is_binary_skip || is_utf8_error && is_git_file) {
563                                tracing::warn!("Failed to index {:?}: {}", file_path, e);
564                            }
565                            None
566                        }
567                    }
568                } else {
569                    None
570                }
571            })
572            .collect()
573    };
574
575    for (file_path, entry) in updates {
576        let sidecar_path = get_sidecar_path(path, &file_path);
577        save_index_entry(&sidecar_path, &entry)?;
578        let manifest_key = entry.metadata.path.clone();
579        manifest.files.insert(manifest_key, entry.metadata);
580    }
581
582    if !manifest.files.is_empty() {
583        manifest.updated = SystemTime::now()
584            .duration_since(SystemTime::UNIX_EPOCH)
585            .unwrap()
586            .as_secs();
587        save_manifest(&manifest_path, &manifest)?;
588    }
589
590    Ok(())
591}
592
593pub fn clean_index(path: &Path) -> Result<()> {
594    let index_dir = path.join(".ck");
595    if index_dir.exists() {
596        fs::remove_dir_all(&index_dir)?;
597    }
598    Ok(())
599}
600
601pub fn cleanup_index(
602    path: &Path,
603    options: &ck_core::FileCollectionOptions,
604) -> Result<CleanupStats> {
605    let index_dir = path.join(".ck");
606    if !index_dir.exists() {
607        return Ok(CleanupStats::default());
608    }
609
610    let manifest_path = index_dir.join("manifest.json");
611    let mut manifest = load_or_create_manifest(&manifest_path)?;
612    normalize_manifest_paths(&mut manifest, path);
613
614    // Use the new unified cleanup validation
615    let stats =
616        cleanup_validation::validate_and_cleanup_index(path, &index_dir, &mut manifest, options)?;
617
618    // Content cache cleanup is now handled by the unified cleanup validation
619
620    // Remove empty directories in .ck
621    remove_empty_dirs(&index_dir)?;
622
623    // Update manifest if changes were made
624    if stats.orphaned_entries_removed > 0 {
625        manifest.updated = SystemTime::now()
626            .duration_since(SystemTime::UNIX_EPOCH)
627            .unwrap()
628            .as_secs();
629        save_manifest(&manifest_path, &manifest)?;
630    }
631
632    Ok(stats)
633}
634
635pub fn get_index_stats(path: &Path) -> Result<IndexStats> {
636    let index_dir = path.join(".ck");
637    if !index_dir.exists() {
638        return Ok(IndexStats::default());
639    }
640
641    let manifest_path = index_dir.join("manifest.json");
642    let mut manifest = load_or_create_manifest(&manifest_path)?;
643    normalize_manifest_paths(&mut manifest, path);
644
645    let mut stats = IndexStats {
646        total_files: manifest.files.len(),
647        index_created: manifest.created,
648        index_updated: manifest.updated,
649        ..Default::default()
650    };
651
652    // Calculate total chunks and size
653    for file_path in manifest.files.keys() {
654        let standard_path = path_utils::from_manifest_path(file_path);
655        let sidecar_path =
656            path_utils::get_sidecar_path_for_standard_path(&index_dir, &standard_path);
657        if sidecar_path.exists()
658            && let Ok(entry) = load_index_entry(&sidecar_path)
659        {
660            stats.total_chunks += entry.chunks.len();
661            stats.total_size_bytes += entry.metadata.size;
662
663            // Count embedded chunks
664            let embedded = entry
665                .chunks
666                .iter()
667                .filter(|c| c.embedding.is_some())
668                .count();
669            stats.embedded_chunks += embedded;
670        }
671    }
672
673    // Calculate index size on disk
674    if let Ok(entries) = WalkDir::new(&index_dir)
675        .into_iter()
676        .collect::<Result<Vec<_>, _>>()
677    {
678        for entry in entries {
679            if entry.file_type().is_file()
680                && let Ok(metadata) = entry.metadata()
681            {
682                stats.index_size_bytes += metadata.len();
683            }
684        }
685    }
686
687    Ok(stats)
688}
689
690pub async fn smart_update_index(
691    path: &Path,
692    compute_embeddings: bool,
693    options: &ck_core::FileCollectionOptions,
694) -> Result<UpdateStats> {
695    smart_update_index_with_progress(
696        path,
697        false,
698        None,
699        compute_embeddings,
700        options,
701        None, // model - use default for backward compatibility
702    )
703    .await
704}
705
706pub async fn smart_update_index_with_progress(
707    path: &Path,
708    force_rebuild: bool,
709    progress_callback: Option<ProgressCallback>,
710    compute_embeddings: bool,
711    options: &ck_core::FileCollectionOptions,
712    model: Option<&str>,
713) -> Result<UpdateStats> {
714    smart_update_index_with_detailed_progress(
715        path,
716        force_rebuild,
717        progress_callback,
718        None, // No detailed progress callback for backward compatibility
719        compute_embeddings,
720        options,
721        model,
722    )
723    .await
724}
725
726/// Enhanced indexing with detailed embedding progress
727pub async fn smart_update_index_with_detailed_progress(
728    path: &Path,
729    force_rebuild: bool,
730    progress_callback: Option<ProgressCallback>,
731    detailed_progress_callback: Option<DetailedProgressCallback>,
732    compute_embeddings: bool,
733    options: &ck_core::FileCollectionOptions,
734    model: Option<&str>,
735) -> Result<UpdateStats> {
736    let index_dir = path.join(".ck");
737    let mut stats = UpdateStats::default();
738
739    // Set up interrupt handler (only once per process)
740    HANDLER_INIT.call_once(|| {
741        let _ = ctrlc::set_handler(move || {
742            INTERRUPTED.store(true, Ordering::SeqCst);
743            eprintln!("\nIndexing interrupted by user. Cleaning up...");
744        });
745    });
746
747    // Reset interrupt flag for this indexing operation
748    INTERRUPTED.store(false, Ordering::SeqCst);
749
750    if force_rebuild {
751        clean_index(path)?;
752        index_directory(path, compute_embeddings, options, model).await?;
753        let index_stats = get_index_stats(path)?;
754        stats.files_indexed = index_stats.total_files;
755        return Ok(stats);
756    }
757
758    // Find repo root for path normalization
759    let repo_root = find_repo_root(path)?;
760
761    // Skip cleanup during incremental updates to avoid removing valid entries
762    // that may be outside the current search scope or have path normalization issues
763    // Cleanup should be done explicitly with --clean-orphans when needed
764
765    // Then perform incremental update
766    fs::create_dir_all(&index_dir)?;
767    let manifest_path = index_dir.join("manifest.json");
768    let mut manifest = load_or_create_manifest(&manifest_path)?;
769    normalize_manifest_paths(&mut manifest, &repo_root);
770
771    // Handle model configuration for embeddings
772    let resolved_model = if compute_embeddings {
773        let model_registry = ck_models::ModelRegistry::default();
774
775        let resolved = if let Some(requested) = model {
776            model_registry
777                .resolve(Some(requested))
778                .map_err(|e| anyhow::anyhow!(e.to_string()))?
779        } else if let Some(existing_model) = &manifest.embedding_model {
780            match model_registry.resolve(Some(existing_model.as_str())) {
781                Ok(resolved) => resolved,
782                Err(_) => (
783                    existing_model.clone(),
784                    legacy_model_config(existing_model, manifest.embedding_dimensions),
785                ),
786            }
787        } else {
788            model_registry
789                .resolve(None)
790                .map_err(|e| anyhow::anyhow!(e.to_string()))?
791        };
792
793        if let Some(existing_model) = &manifest.embedding_model
794            && existing_model != &resolved.1.name
795        {
796            return Err(anyhow::anyhow!(
797                "Model mismatch: Index was created with '{}', but you're trying to use '{}'. \
798                    Please run 'ck --clean .' to remove the old index, then 'ck --index --model {}' to rebuild with the new model.",
799                existing_model,
800                resolved.1.name,
801                model.unwrap_or("default")
802            ));
803        }
804
805        manifest.embedding_model = Some(resolved.1.name.clone());
806        manifest.embedding_dimensions = Some(resolved.1.dimensions);
807
808        Some(resolved)
809    } else {
810        None
811    };
812
813    // For incremental updates, only process files in the search scope
814    // The cleanup phase already handled removing orphaned files from the entire repo
815    let current_files = collect_files(path, options)?;
816
817    // First pass: determine which files need updating and collect stats
818    let mut files_to_update = Vec::new();
819    let mut manifest_changed = false;
820
821    for file_path in current_files {
822        // Check for interrupt
823        if INTERRUPTED.load(Ordering::SeqCst) {
824            eprintln!("Indexing interrupted during file scanning.");
825            return Ok(stats);
826        }
827
828        let manifest_key =
829            path_utils::to_manifest_path(&path_utils::to_standard_path(&file_path, &repo_root));
830
831        if let Some(metadata) = manifest.files.get(&manifest_key) {
832            let fs_meta = match fs::metadata(&file_path) {
833                Ok(m) => m,
834                Err(_) => {
835                    stats.files_errored += 1;
836                    continue;
837                }
838            };
839
840            let fs_last_modified = match fs_meta.modified().and_then(|m| {
841                m.duration_since(SystemTime::UNIX_EPOCH)
842                    .map_err(|_| std::io::Error::other("Time error"))
843            }) {
844                Ok(dur) => dur.as_secs(),
845                Err(_) => {
846                    stats.files_errored += 1;
847                    continue;
848                }
849            };
850            let fs_size = fs_meta.len();
851
852            if fs_last_modified == metadata.last_modified && fs_size == metadata.size {
853                stats.files_up_to_date += 1;
854                continue;
855            }
856
857            let hash = match compute_file_hash(&file_path) {
858                Ok(h) => h,
859                Err(_) => {
860                    stats.files_errored += 1;
861                    continue;
862                }
863            };
864
865            if hash != metadata.hash {
866                stats.files_modified += 1;
867                files_to_update.push(file_path);
868            } else {
869                stats.files_up_to_date += 1;
870                // Convert to standardized path for manifest storage
871                let standard_path = path_utils::to_standard_path(&file_path, &repo_root);
872                let manifest_path = path_utils::to_manifest_path(&standard_path);
873                let new_metadata = FileMetadata {
874                    path: manifest_path.clone(),
875                    hash,
876                    last_modified: fs_last_modified,
877                    size: fs_size,
878                };
879                manifest.files.insert(manifest_path, new_metadata);
880                manifest_changed = true;
881            }
882        } else {
883            stats.files_added += 1;
884            files_to_update.push(file_path);
885        }
886    }
887
888    // Second pass: index the files that need updating
889    if compute_embeddings {
890        // Sequential processing with streaming - write each file immediately
891        let (_, config) = resolved_model
892            .as_ref()
893            .expect("resolved model must exist for embedding updates");
894        let mut embedder = ck_embed::create_embedder_for_config(config, None)?;
895        let mut _processed_count = 0;
896
897        for file_path in files_to_update.iter() {
898            // Check for interrupt
899            if INTERRUPTED.load(Ordering::SeqCst) {
900                eprintln!(
901                    "Indexing interrupted. {} files processed.",
902                    _processed_count
903                );
904                break;
905            }
906
907            if let Some(ref callback) = progress_callback
908                && let Some(file_name) = file_path.file_name()
909            {
910                callback(&file_name.to_string_lossy());
911            }
912
913            // Call detailed progress version if callback is provided, otherwise use regular version
914            let result = if let Some(ref detailed_callback) = detailed_progress_callback {
915                index_single_file_with_progress(
916                    file_path,
917                    path,
918                    Some(&mut embedder),
919                    Some(detailed_callback),
920                    _processed_count,
921                    files_to_update.len(),
922                )
923            } else {
924                index_single_file_with_progress(file_path, path, Some(&mut embedder), None, 0, 1)
925            };
926
927            match result {
928                Ok((entry, file_chunks_reused, file_chunks_embedded)) => {
929                    // Aggregate chunk statistics
930                    stats.chunks_reused += file_chunks_reused;
931                    stats.chunks_embedded += file_chunks_embedded;
932
933                    // Write sidecar immediately
934                    let sidecar_path = get_sidecar_path(path, file_path);
935                    save_index_entry(&sidecar_path, &entry)?;
936
937                    // Update and save manifest immediately
938                    let manifest_key = entry.metadata.path.clone();
939                    manifest.files.insert(manifest_key, entry.metadata);
940                    manifest.updated = SystemTime::now()
941                        .duration_since(SystemTime::UNIX_EPOCH)
942                        .unwrap()
943                        .as_secs();
944                    save_manifest(&manifest_path, &manifest)?;
945                    _processed_count += 1;
946                }
947                Err(e) => {
948                    // Suppress warnings for binary files and UTF-8 errors in .git directories
949                    let error_msg = e.to_string();
950                    let is_binary_skip = error_msg.contains("Binary file, skipping");
951                    let is_utf8_error = error_msg.contains("stream did not contain valid UTF-8");
952                    let is_git_file = file_path.components().any(|c| c.as_os_str() == ".git");
953
954                    if !(is_binary_skip || is_utf8_error && is_git_file) {
955                        tracing::warn!("Failed to index {:?}: {}", file_path, e);
956                    }
957                    stats.files_errored += 1;
958                }
959            }
960        }
961
962        stats.files_indexed = _processed_count;
963    } else {
964        // Parallel processing with streaming using producer-consumer pattern
965        use std::sync::mpsc;
966        use std::thread;
967
968        let (tx, rx) = mpsc::channel();
969        let files_clone = files_to_update.clone();
970        let path_clone = path.to_path_buf();
971
972        // Spawn worker thread for parallel processing
973        let worker_handle = thread::spawn(move || {
974            use rayon::prelude::*;
975
976            // Use par_iter with try_for_each to allow early exit on interrupt
977            let result = files_clone.par_iter().try_for_each(|file_path| {
978                // Check for interrupt
979                if INTERRUPTED.load(Ordering::SeqCst) {
980                    return Err("interrupted");
981                }
982
983                match index_single_file(file_path, &path_clone, None) {
984                    Ok(entry) => {
985                        if tx.send((file_path.clone(), entry)).is_err() {
986                            // Receiver dropped, stop processing
987                            return Err("receiver_dropped");
988                        }
989                    }
990                    Err(e) => {
991                        // Suppress warnings for binary files and UTF-8 errors in .git directories
992                        let error_msg = e.to_string();
993                        let is_binary_skip = error_msg.contains("Binary file, skipping");
994                        let is_utf8_error =
995                            error_msg.contains("stream did not contain valid UTF-8");
996                        let is_git_file = file_path.components().any(|c| c.as_os_str() == ".git");
997
998                        if !(is_binary_skip || is_utf8_error && is_git_file) {
999                            tracing::warn!("Failed to index {:?}: {}", file_path, e);
1000                        }
1001                    }
1002                }
1003                Ok(())
1004            });
1005
1006            // Log the result for debugging
1007            if let Err(reason) = result {
1008                tracing::debug!("Worker thread stopped due to: {}", reason);
1009            }
1010        });
1011
1012        // Main thread: stream results as they arrive
1013        let mut _processed_count = 0;
1014        while let Ok((file_path, entry)) = rx.recv() {
1015            // Check for interrupt
1016            if INTERRUPTED.load(Ordering::SeqCst) {
1017                eprintln!(
1018                    "Indexing interrupted. {} files processed.",
1019                    _processed_count
1020                );
1021                drop(rx); // Drop receiver to signal worker to stop
1022                break;
1023            }
1024
1025            if let Some(ref callback) = progress_callback
1026                && let Some(file_name) = file_path.file_name()
1027            {
1028                callback(&file_name.to_string_lossy());
1029            }
1030
1031            // Write sidecar immediately
1032            let sidecar_path = get_sidecar_path(path, &file_path);
1033            save_index_entry(&sidecar_path, &entry)?;
1034
1035            // Update and save manifest immediately
1036            let manifest_key = entry.metadata.path.clone();
1037            manifest.files.insert(manifest_key, entry.metadata);
1038            manifest.updated = SystemTime::now()
1039                .duration_since(SystemTime::UNIX_EPOCH)
1040                .unwrap()
1041                .as_secs();
1042            save_manifest(&manifest_path, &manifest)?;
1043            _processed_count += 1;
1044        }
1045
1046        stats.files_indexed = _processed_count;
1047
1048        // Wait for worker to complete
1049        worker_handle
1050            .join()
1051            .map_err(|_| anyhow::anyhow!("Worker thread panicked"))?;
1052    }
1053
1054    // For sequential processing (embeddings), manifest is already saved after each file
1055    // Only save manifest for parallel processing or if there were metadata-only changes
1056    if !compute_embeddings
1057        && (stats.files_indexed > 0 || stats.orphaned_files_removed > 0 || manifest_changed)
1058    {
1059        manifest.updated = SystemTime::now()
1060            .duration_since(SystemTime::UNIX_EPOCH)
1061            .unwrap()
1062            .as_secs();
1063        save_manifest(&manifest_path, &manifest)?;
1064    }
1065
1066    Ok(stats)
1067}
1068
1069fn index_single_file(
1070    file_path: &Path,
1071    repo_root: &Path,
1072    embedder: Option<&mut Box<dyn ck_embed::Embedder>>,
1073) -> Result<IndexEntry> {
1074    let (entry, _chunks_reused, _chunks_embedded) =
1075        index_single_file_with_progress(file_path, repo_root, embedder, None, 0, 1)?;
1076    Ok(entry)
1077}
1078
1079fn index_single_file_with_progress(
1080    file_path: &Path,
1081    repo_root: &Path,
1082    embedder: Option<&mut Box<dyn ck_embed::Embedder>>,
1083    detailed_progress: Option<&DetailedProgressCallback>,
1084    file_index: usize,
1085    total_files: usize,
1086) -> Result<(IndexEntry, usize, usize)> {
1087    // Skip binary files to avoid UTF-8 warnings
1088    if !is_text_file(file_path) {
1089        return Err(anyhow::anyhow!("Binary file, skipping"));
1090    }
1091
1092    // Build chunk cache from old sidecar if it exists (for chunk reuse)
1093    let chunk_cache: HashMap<String, Vec<f32>> = if embedder.is_some() {
1094        let sidecar_path = get_sidecar_path(repo_root, file_path);
1095        if sidecar_path.exists() {
1096            match load_index_entry(&sidecar_path) {
1097                Ok(old_entry) => old_entry
1098                    .chunks
1099                    .into_iter()
1100                    .filter_map(|chunk| {
1101                        if let (Some(hash), Some(embedding)) = (chunk.chunk_hash, chunk.embedding) {
1102                            Some((hash, embedding))
1103                        } else {
1104                            None
1105                        }
1106                    })
1107                    .collect(),
1108                Err(_) => HashMap::new(),
1109            }
1110        } else {
1111            HashMap::new()
1112        }
1113    } else {
1114        HashMap::new()
1115    };
1116
1117    // Preprocess file (extracts PDFs to cache, returns path to readable content)
1118    let content_path = preprocess_file(file_path, repo_root)?;
1119    let content = fs::read_to_string(&content_path)?;
1120
1121    // Always use the ORIGINAL file for hash and metadata
1122    let hash = compute_file_hash(file_path)?;
1123    let metadata = fs::metadata(file_path)?;
1124
1125    let standard_path = path_utils::to_standard_path(file_path, repo_root);
1126    let manifest_path = path_utils::to_manifest_path(&standard_path);
1127
1128    let file_metadata = FileMetadata {
1129        path: manifest_path,
1130        hash,
1131        last_modified: metadata
1132            .modified()?
1133            .duration_since(SystemTime::UNIX_EPOCH)?
1134            .as_secs(),
1135        size: metadata.len(),
1136    };
1137
1138    // Detect language for tree-sitter parsing
1139    let lang = if ck_core::pdf::is_pdf_file(file_path) {
1140        Some(Language::Pdf)
1141    } else {
1142        ck_core::Language::from_path(file_path)
1143    };
1144
1145    let model_name = embedder.as_ref().map(|e| e.model_name());
1146    let chunks = ck_chunk::chunk_text_with_model(&content, lang, model_name)?;
1147
1148    // Track chunk reuse statistics
1149    let mut chunks_reused = 0;
1150    let mut chunks_embedded = 0;
1151
1152    let chunk_entries: Vec<ChunkEntry> = if let Some(embedder) = embedder {
1153        let total_chunks = chunks.len();
1154        let file_name = file_path
1155            .file_name()
1156            .unwrap_or_default()
1157            .to_string_lossy()
1158            .to_string();
1159
1160        // Process chunks with progress reporting
1161        if let Some(ref callback) = detailed_progress {
1162            tracing::info!(
1163                "Computing embeddings for {} chunks in {:?}",
1164                total_chunks,
1165                file_path
1166            );
1167
1168            let mut chunk_entries = Vec::new();
1169            for (chunk_index, chunk) in chunks.into_iter().enumerate() {
1170                if INTERRUPTED.load(Ordering::SeqCst) {
1171                    return Err(anyhow::anyhow!(INDEX_INTERRUPTED_MSG));
1172                }
1173                // Report progress before processing chunk
1174                callback(EmbeddingProgress {
1175                    file_name: file_name.clone(),
1176                    file_index,
1177                    total_files,
1178                    chunk_index,
1179                    total_chunks,
1180                    chunk_size: chunk.text.len(),
1181                });
1182
1183                // Compute chunk hash for cache lookup or storage
1184                // Include trivia so that doc comment changes invalidate the cache
1185                let chunk_hash = compute_chunk_hash(
1186                    &chunk.text,
1187                    &chunk.metadata.leading_trivia,
1188                    &chunk.metadata.trailing_trivia,
1189                );
1190
1191                // Check cache first, but validate dimension matches current embedder
1192                let expected_dim = embedder.dim();
1193                let embedding = if let Some(cached_embedding) = chunk_cache.get(&chunk_hash) {
1194                    if cached_embedding.len() == expected_dim {
1195                        // Dimension matches, safe to reuse
1196                        chunks_reused += 1;
1197                        cached_embedding.clone()
1198                    } else {
1199                        // Dimension mismatch, re-embed (model changed)
1200                        chunks_embedded += 1;
1201                        tracing::warn!(
1202                            "Chunk in {:?} has cached embedding with dimension {} but current model expects {}. Re-embedding.",
1203                            file_path,
1204                            cached_embedding.len(),
1205                            expected_dim
1206                        );
1207                        let embeddings = embedder.embed(std::slice::from_ref(&chunk.text))?;
1208                        embeddings.into_iter().next().ok_or_else(|| {
1209                            anyhow::anyhow!(
1210                                "Embedder returned empty results for chunk {} in file {:?}. This may indicate an issue with the embedding model or chunk content.",
1211                                chunk_index,
1212                                file_path
1213                            )
1214                        })?
1215                    }
1216                } else {
1217                    // No cache hit, compute embedding
1218                    chunks_embedded += 1;
1219                    let embeddings = embedder.embed(std::slice::from_ref(&chunk.text))?;
1220                    embeddings.into_iter().next().ok_or_else(|| {
1221                        anyhow::anyhow!(
1222                            "Embedder returned empty results for chunk {} in file {:?}. This may indicate an issue with the embedding model or chunk content.",
1223                            chunk_index,
1224                            file_path
1225                        )
1226                    })?
1227                };
1228
1229                let chunk_type_str = match chunk.chunk_type {
1230                    ck_chunk::ChunkType::Function => Some("function".to_string()),
1231                    ck_chunk::ChunkType::Class => Some("class".to_string()),
1232                    ck_chunk::ChunkType::Method => Some("method".to_string()),
1233                    ck_chunk::ChunkType::Module => Some("module".to_string()),
1234                    ck_chunk::ChunkType::Text => None,
1235                };
1236
1237                let breadcrumb = chunk.metadata.breadcrumb.clone();
1238                let ancestry = if chunk.metadata.ancestry.is_empty() {
1239                    None
1240                } else {
1241                    Some(chunk.metadata.ancestry.clone())
1242                };
1243                let leading_trivia = if chunk.metadata.leading_trivia.is_empty() {
1244                    None
1245                } else {
1246                    Some(chunk.metadata.leading_trivia.clone())
1247                };
1248                let trailing_trivia = if chunk.metadata.trailing_trivia.is_empty() {
1249                    None
1250                } else {
1251                    Some(chunk.metadata.trailing_trivia.clone())
1252                };
1253
1254                chunk_entries.push(ChunkEntry {
1255                    span: chunk.span,
1256                    embedding: Some(embedding),
1257                    chunk_type: chunk_type_str,
1258                    breadcrumb,
1259                    ancestry,
1260                    byte_length: Some(chunk.metadata.byte_length),
1261                    estimated_tokens: Some(chunk.metadata.estimated_tokens),
1262                    leading_trivia,
1263                    trailing_trivia,
1264                    chunk_hash: Some(chunk_hash),
1265                });
1266            }
1267            chunk_entries
1268        } else {
1269            // Fallback to batch processing for backward compatibility
1270            // First, check which chunks have cached embeddings with dimension validation
1271            let expected_dim = embedder.dim();
1272            let mut chunks_to_embed = Vec::new();
1273            let mut chunk_results: Vec<(ck_chunk::Chunk, String, Option<Vec<f32>>)> = Vec::new();
1274
1275            for chunk in chunks {
1276                // Include trivia so that doc comment changes invalidate the cache
1277                let chunk_hash = compute_chunk_hash(
1278                    &chunk.text,
1279                    &chunk.metadata.leading_trivia,
1280                    &chunk.metadata.trailing_trivia,
1281                );
1282                if let Some(cached_embedding) = chunk_cache.get(&chunk_hash) {
1283                    if cached_embedding.len() == expected_dim {
1284                        // Dimension matches, safe to reuse
1285                        chunks_reused += 1;
1286                        chunk_results.push((chunk, chunk_hash, Some(cached_embedding.clone())));
1287                    } else {
1288                        // Dimension mismatch, need to re-embed
1289                        tracing::warn!(
1290                            "Chunk in {:?} has cached embedding with dimension {} but current model expects {}. Re-embedding.",
1291                            file_path,
1292                            cached_embedding.len(),
1293                            expected_dim
1294                        );
1295                        chunks_to_embed.push((chunk.text.clone(), chunk_results.len()));
1296                        chunk_results.push((chunk, chunk_hash, None));
1297                    }
1298                } else {
1299                    // No cache hit, need to embed
1300                    chunks_to_embed.push((chunk.text.clone(), chunk_results.len()));
1301                    chunk_results.push((chunk, chunk_hash, None));
1302                }
1303            }
1304
1305            // Batch embed only the chunks without cache hits
1306            if !chunks_to_embed.is_empty() {
1307                let texts: Vec<String> = chunks_to_embed
1308                    .iter()
1309                    .map(|(text, _)| text.clone())
1310                    .collect();
1311                tracing::info!(
1312                    "Computing embeddings for {}/{} chunks in {:?} ({} reused from cache)",
1313                    texts.len(),
1314                    chunk_results.len(),
1315                    file_path,
1316                    chunks_reused
1317                );
1318                let embeddings = embedder.embed(&texts)?;
1319
1320                if embeddings.len() != chunks_to_embed.len() {
1321                    return Err(anyhow::anyhow!(
1322                        "Embedder returned {} embeddings for {} chunks in file {:?}. Expected equal counts.",
1323                        embeddings.len(),
1324                        chunks_to_embed.len(),
1325                        file_path
1326                    ));
1327                }
1328
1329                chunks_embedded += embeddings.len();
1330
1331                // Fill in the computed embeddings
1332                for ((_, result_idx), embedding) in chunks_to_embed.into_iter().zip(embeddings) {
1333                    chunk_results[result_idx].2 = Some(embedding);
1334                }
1335            }
1336
1337            chunk_results
1338                .into_iter()
1339                .map(|(chunk, chunk_hash, embedding)| {
1340                    let embedding = embedding.expect("All chunks should have embeddings by now");
1341                    let chunk_type_str = match chunk.chunk_type {
1342                        ck_chunk::ChunkType::Function => Some("function".to_string()),
1343                        ck_chunk::ChunkType::Class => Some("class".to_string()),
1344                        ck_chunk::ChunkType::Method => Some("method".to_string()),
1345                        ck_chunk::ChunkType::Module => Some("module".to_string()),
1346                        ck_chunk::ChunkType::Text => None,
1347                    };
1348                    let breadcrumb = chunk.metadata.breadcrumb.clone();
1349                    let ancestry = if chunk.metadata.ancestry.is_empty() {
1350                        None
1351                    } else {
1352                        Some(chunk.metadata.ancestry.clone())
1353                    };
1354                    let leading_trivia = if chunk.metadata.leading_trivia.is_empty() {
1355                        None
1356                    } else {
1357                        Some(chunk.metadata.leading_trivia.clone())
1358                    };
1359                    let trailing_trivia = if chunk.metadata.trailing_trivia.is_empty() {
1360                        None
1361                    } else {
1362                        Some(chunk.metadata.trailing_trivia.clone())
1363                    };
1364                    ChunkEntry {
1365                        span: chunk.span,
1366                        embedding: Some(embedding),
1367                        chunk_type: chunk_type_str,
1368                        breadcrumb,
1369                        ancestry,
1370                        byte_length: Some(chunk.metadata.byte_length),
1371                        estimated_tokens: Some(chunk.metadata.estimated_tokens),
1372                        leading_trivia,
1373                        trailing_trivia,
1374                        chunk_hash: Some(chunk_hash),
1375                    }
1376                })
1377                .collect()
1378        }
1379    } else {
1380        // No embedder, just store spans without embeddings
1381        chunks
1382            .into_iter()
1383            .map(|chunk| {
1384                let chunk_type_str = match chunk.chunk_type {
1385                    ck_chunk::ChunkType::Function => Some("function".to_string()),
1386                    ck_chunk::ChunkType::Class => Some("class".to_string()),
1387                    ck_chunk::ChunkType::Method => Some("method".to_string()),
1388                    ck_chunk::ChunkType::Module => Some("module".to_string()),
1389                    ck_chunk::ChunkType::Text => None,
1390                };
1391                let breadcrumb = chunk.metadata.breadcrumb.clone();
1392                let ancestry = if chunk.metadata.ancestry.is_empty() {
1393                    None
1394                } else {
1395                    Some(chunk.metadata.ancestry.clone())
1396                };
1397                let leading_trivia = if chunk.metadata.leading_trivia.is_empty() {
1398                    None
1399                } else {
1400                    Some(chunk.metadata.leading_trivia.clone())
1401                };
1402                let trailing_trivia = if chunk.metadata.trailing_trivia.is_empty() {
1403                    None
1404                } else {
1405                    Some(chunk.metadata.trailing_trivia.clone())
1406                };
1407                ChunkEntry {
1408                    span: chunk.span,
1409                    embedding: None,
1410                    chunk_type: chunk_type_str,
1411                    breadcrumb,
1412                    ancestry,
1413                    byte_length: Some(chunk.metadata.byte_length),
1414                    estimated_tokens: Some(chunk.metadata.estimated_tokens),
1415                    leading_trivia: leading_trivia.clone(),
1416                    trailing_trivia: trailing_trivia.clone(),
1417                    chunk_hash: Some(compute_chunk_hash(
1418                        &chunk.text,
1419                        &chunk.metadata.leading_trivia,
1420                        &chunk.metadata.trailing_trivia,
1421                    )),
1422                }
1423            })
1424            .collect()
1425    };
1426
1427    Ok((
1428        IndexEntry {
1429            metadata: file_metadata,
1430            chunks: chunk_entries,
1431        },
1432        chunks_reused,
1433        chunks_embedded,
1434    ))
1435}
1436
1437fn load_or_create_manifest(path: &Path) -> Result<IndexManifest> {
1438    let mut manifest = if path.exists() {
1439        let data = fs::read(path)?;
1440        serde_json::from_slice(&data)?
1441    } else {
1442        IndexManifest::default()
1443    };
1444
1445    // Ensure chunk_hash_version is set to v2 if not already set
1446    // This handles manifests created before the field existed
1447    if manifest.chunk_hash_version.is_none() {
1448        manifest.chunk_hash_version = Some(2);
1449    }
1450
1451    Ok(manifest)
1452}
1453
1454fn normalize_manifest_paths(manifest: &mut IndexManifest, repo_root: &Path) {
1455    let original_entries = std::mem::take(&mut manifest.files);
1456    let mut normalized = HashMap::with_capacity(original_entries.len());
1457
1458    for (key, mut metadata) in original_entries {
1459        let standard_key = if key.is_absolute() {
1460            path_utils::to_standard_path(&key, repo_root)
1461        } else {
1462            path_utils::from_manifest_path(&key)
1463        };
1464        let manifest_key = path_utils::to_manifest_path(&standard_key);
1465
1466        let metadata_standard = if metadata.path.is_absolute() {
1467            path_utils::to_standard_path(&metadata.path, repo_root)
1468        } else {
1469            path_utils::from_manifest_path(&metadata.path)
1470        };
1471        metadata.path = path_utils::to_manifest_path(&metadata_standard);
1472
1473        normalized.insert(manifest_key, metadata);
1474    }
1475
1476    manifest.files = normalized;
1477}
1478
1479fn save_manifest(path: &Path, manifest: &IndexManifest) -> Result<()> {
1480    let data = serde_json::to_vec_pretty(manifest)?;
1481    atomic_write(path, &data)
1482}
1483
1484fn save_index_entry(path: &Path, entry: &IndexEntry) -> Result<()> {
1485    let data = bincode::serialize(entry)?;
1486    atomic_write(path, &data)
1487}
1488
1489fn atomic_write(path: &Path, data: &[u8]) -> Result<()> {
1490    let parent = path.parent().unwrap_or_else(|| Path::new("."));
1491    fs::create_dir_all(parent)?;
1492
1493    let mut tmp = NamedTempFile::new_in(parent)?;
1494    tmp.write_all(data)?;
1495    tmp.as_file().sync_all()?;
1496
1497    if path.exists() {
1498        fs::remove_file(path)?;
1499    }
1500
1501    tmp.persist(path)?;
1502    Ok(())
1503}
1504
1505pub fn load_index_entry(path: &Path) -> Result<IndexEntry> {
1506    let data = fs::read(path)?;
1507    Ok(bincode::deserialize(&data)?)
1508}
1509
1510fn find_repo_root(path: &Path) -> Result<PathBuf> {
1511    let mut current = if path.is_file() {
1512        path.parent().unwrap_or(path)
1513    } else {
1514        path
1515    };
1516
1517    loop {
1518        if current.join(".ck").exists() || current.join(".git").exists() {
1519            return Ok(current.to_path_buf());
1520        }
1521
1522        match current.parent() {
1523            Some(parent) => current = parent,
1524            None => return Ok(path.to_path_buf()),
1525        }
1526    }
1527}
1528
1529/// Check if content needs re-extraction
1530fn should_reextract(source_path: &Path, cache_path: &Path) -> Result<bool> {
1531    if !cache_path.exists() {
1532        return Ok(true);
1533    }
1534
1535    let source_modified = fs::metadata(source_path)?.modified()?;
1536    let cache_modified = fs::metadata(cache_path)?.modified()?;
1537
1538    Ok(source_modified > cache_modified)
1539}
1540
1541/// Extract text content from a PDF file
1542fn extract_pdf_text(path: &Path) -> Result<String> {
1543    pdf_extract::extract_text(path)
1544        .map_err(|e| anyhow::anyhow!("Failed to extract text from PDF {}: {}", path.display(), e))
1545}
1546
1547/// Preprocess a file if needed, returning path to readable content
1548/// For regular files: returns the original path (no preprocessing)
1549/// For PDFs: extracts text to cache, returns cache path
1550fn preprocess_file(file_path: &Path, repo_root: &Path) -> Result<PathBuf> {
1551    if ck_core::pdf::is_pdf_file(file_path) {
1552        let cache_path = ck_core::pdf::get_content_cache_path(repo_root, file_path);
1553
1554        // Check if re-extraction needed
1555        if should_reextract(file_path, &cache_path)? {
1556            tracing::debug!(
1557                "Extracting PDF content from {:?} to {:?}",
1558                file_path,
1559                cache_path
1560            );
1561            let extracted_text = extract_pdf_text(file_path)?;
1562
1563            // Ensure cache directory exists
1564            if let Some(parent) = cache_path.parent() {
1565                fs::create_dir_all(parent)?;
1566            }
1567
1568            // Write extracted text
1569            fs::write(&cache_path, extracted_text)?;
1570        }
1571
1572        Ok(cache_path) // Return path to extracted text
1573    } else {
1574        Ok(file_path.to_path_buf()) // Return original path for regular files
1575    }
1576}
1577
1578fn is_text_file(path: &Path) -> bool {
1579    // PDFs are considered indexable even though they're binary
1580    if ck_core::pdf::is_pdf_file(path) {
1581        return true;
1582    }
1583
1584    // Use NUL byte heuristic like ripgrep - read first 8KB and check for NUL bytes
1585    const BUFFER_SIZE: usize = 8192;
1586
1587    match std::fs::File::open(path) {
1588        Ok(mut file) => {
1589            let mut buffer = vec![0; BUFFER_SIZE];
1590            match file.read(&mut buffer) {
1591                Ok(bytes_read) => {
1592                    // If file is empty, consider it text
1593                    if bytes_read == 0 {
1594                        return true;
1595                    }
1596
1597                    // Check for NUL bytes in the read portion
1598                    !buffer[..bytes_read].contains(&0)
1599                }
1600                Err(_) => false, // If we can't read, assume binary
1601            }
1602        }
1603        Err(_) => false, // If we can't open, assume binary
1604    }
1605}
1606
/// Map a sidecar file stored under `index_dir` back to the repository-relative
/// path of the file it describes.
///
/// Sidecars are named `<original file name>.ck`, so exactly one trailing
/// extension is removed. Stripping only once matters for files whose own name
/// ends in `.ck`: the sidecar `notes.ck.ck` belongs to `notes.ck`, not `notes`.
/// The caller is expected to pass a path that actually ends in `.ck`.
///
/// Returns `None` when `sidecar_path` is not located under `index_dir`.
/// `_repo_root` is unused and kept only for signature compatibility.
#[cfg(test)]
fn sidecar_to_original_path(
    sidecar_path: &Path,
    index_dir: &Path,
    _repo_root: &Path,
) -> Option<PathBuf> {
    let relative_path = sidecar_path.strip_prefix(index_dir).ok()?;

    // `with_extension("")` drops just the final extension (the sidecar's `.ck`)
    // and works on OS strings, so non-UTF-8 names survive untouched.
    Some(relative_path.with_extension(""))
}
1628
/// Recursively delete empty sub-directories beneath `dir`.
///
/// Children are processed depth-first, so a directory that only contained
/// empty directories is removed as well. `dir` itself is never removed, and a
/// `dir` that is not a directory is a no-op.
///
/// # Errors
/// Propagates failures to list a directory; a failed `remove_dir` on an
/// apparently empty directory is deliberately ignored.
fn remove_empty_dirs(dir: &Path) -> Result<()> {
    if !dir.is_dir() {
        return Ok(());
    }

    for entry in fs::read_dir(dir)? {
        let child = entry?.path();
        if !child.is_dir() {
            continue;
        }

        // Clear out the subtree first so that the emptiness check below sees
        // the final state of `child`.
        remove_empty_dirs(&child)?;

        let is_empty = fs::read_dir(&child)?.next().is_none();
        if is_empty {
            let _ = fs::remove_dir(&child);
        }
    }

    Ok(())
}
1648
/// Counters describing what an index cleanup pass removed.
#[derive(Debug, Clone, Default, Serialize, Deserialize)]
pub struct CleanupStats {
    /// Number of manifest entries that were dropped from the manifest.
    pub orphaned_entries_removed: usize,
    /// Number of sidecar files that were deleted from the index directory.
    pub orphaned_sidecars_removed: usize,
}
1654
/// Summary figures describing an index.
///
/// All fields default to zero; the tests in this file observe
/// `total_files == 0` when no index exists.
// NOTE(review): the code that populates these fields is outside this chunk, so
// the per-field notes below are assumptions based on names — confirm there.
#[derive(Debug, Clone, Default, Serialize, Deserialize)]
pub struct IndexStats {
    /// Number of files tracked by the index (one per manifest entry in the tests).
    pub total_files: usize,
    /// Presumably the number of chunks across all indexed files — TODO confirm.
    pub total_chunks: usize,
    /// Presumably the number of chunks that carry an embedding — TODO confirm.
    pub embedded_chunks: usize,
    /// Presumably the combined size in bytes of the indexed source files — TODO confirm.
    pub total_size_bytes: u64,
    /// Presumably the on-disk size in bytes of the index itself — TODO confirm.
    pub index_size_bytes: u64,
    /// Presumably a creation timestamp; unit/epoch not visible here — TODO confirm.
    pub index_created: u64,
    /// Presumably a last-update timestamp; unit/epoch not visible here — TODO confirm.
    pub index_updated: u64,
}
1665
/// Counters reported by an index update run (e.g. `smart_update_index`).
///
/// All fields default to zero.
// NOTE(review): the code that populates these fields is outside this chunk; the
// notes below reflect what the tests in this file assert plus name-based
// assumptions that should be confirmed against the updater.
#[derive(Debug, Clone, Default, Serialize, Deserialize)]
pub struct UpdateStats {
    /// Files that were (re)indexed during this run; the tests see 1 for a new
    /// or modified file and 0 when nothing changed.
    pub files_indexed: usize,
    /// Files seen for the first time during this run.
    pub files_added: usize,
    /// Previously indexed files whose content changed.
    pub files_modified: usize,
    /// Previously indexed files that needed no work.
    pub files_up_to_date: usize,
    /// Presumably files whose indexing failed — TODO confirm.
    pub files_errored: usize,
    /// Presumably index entries removed because their file vanished — TODO confirm.
    pub orphaned_files_removed: usize,
    /// Presumably chunks whose existing embedding was reused — TODO confirm.
    pub chunks_reused: usize,
    /// Presumably chunks that were freshly embedded — TODO confirm.
    pub chunks_embedded: usize,
}
1677
1678#[cfg(test)]
1679mod tests {
1680    use super::*;
1681    use std::fs;
1682    use tempfile::TempDir;
1683
1684    /// Test embedder that can return empty results to test error handling
1685    struct EmptyResultsEmbedder;
1686
1687    impl ck_embed::Embedder for EmptyResultsEmbedder {
1688        fn id(&self) -> &'static str {
1689            "empty-results-test"
1690        }
1691
1692        fn dim(&self) -> usize {
1693            384
1694        }
1695
1696        fn model_name(&self) -> &str {
1697            "test-empty-results"
1698        }
1699
1700        fn embed(&mut self, _texts: &[String]) -> Result<Vec<Vec<f32>>> {
1701            // Always return empty vector to trigger the panic scenario
1702            Ok(Vec::new())
1703        }
1704    }
1705
1706    /// Test embedder that returns mismatched count of embeddings
1707    struct MismatchedCountEmbedder;
1708
1709    impl ck_embed::Embedder for MismatchedCountEmbedder {
1710        fn id(&self) -> &'static str {
1711            "mismatched-count-test"
1712        }
1713
1714        fn dim(&self) -> usize {
1715            384
1716        }
1717
1718        fn model_name(&self) -> &str {
1719            "test-mismatched-count"
1720        }
1721
1722        fn embed(&mut self, texts: &[String]) -> Result<Vec<Vec<f32>>> {
1723            // Always return one less embedding than requested
1724            if texts.is_empty() {
1725                Ok(Vec::new())
1726            } else {
1727                Ok(vec![vec![0.0; self.dim()]; texts.len() - 1])
1728            }
1729        }
1730    }
1731
1732    #[test]
1733    fn test_index_single_file_handles_empty_embedding_results() {
1734        let temp_dir = TempDir::new().unwrap();
1735        let test_path = temp_dir.path();
1736
1737        // Create a simple test file
1738        let test_file = test_path.join("test.txt");
1739        fs::write(&test_file, "hello world").unwrap();
1740
1741        // Create an embedder that returns empty results
1742        let mut empty_embedder: Box<dyn ck_embed::Embedder> = Box::new(EmptyResultsEmbedder);
1743
1744        // This should return an error, not panic
1745        let result = index_single_file(&test_file, test_path, Some(&mut empty_embedder));
1746
1747        assert!(result.is_err());
1748        let error_msg = result.unwrap_err().to_string();
1749        // The empty embedder triggers the count mismatch error (0 embeddings for 1 chunk)
1750        assert!(error_msg.contains("Embedder returned 0 embeddings for 1 chunks"));
1751        assert!(error_msg.contains("Expected equal counts"));
1752        assert!(error_msg.contains("test.txt"));
1753    }
1754
1755    #[test]
1756    fn test_index_single_file_with_progress_handles_empty_embedding_results() {
1757        let temp_dir = TempDir::new().unwrap();
1758        let test_path = temp_dir.path();
1759
1760        // Create a simple test file
1761        let test_file = test_path.join("test.txt");
1762        fs::write(&test_file, "hello world").unwrap();
1763
1764        // Create an embedder that returns empty results
1765        let mut empty_embedder: Box<dyn ck_embed::Embedder> = Box::new(EmptyResultsEmbedder);
1766
1767        // Use the detailed progress callback to trigger the single-chunk processing path
1768        let dummy_callback: DetailedProgressCallback = Box::new(|_progress: EmbeddingProgress| {});
1769        let result = index_single_file_with_progress(
1770            &test_file,
1771            test_path,
1772            Some(&mut empty_embedder),
1773            Some(&dummy_callback),
1774            0,
1775            1,
1776        );
1777
1778        assert!(result.is_err());
1779        let error_msg = result.unwrap_err().to_string();
1780        // This should hit the single-chunk path and get the specific error
1781        assert!(error_msg.contains("Embedder returned empty results"));
1782        assert!(error_msg.contains("chunk 0"));
1783        assert!(error_msg.contains("test.txt"));
1784    }
1785
1786    #[test]
1787    fn test_index_single_file_handles_mismatched_embedding_count() {
1788        let temp_dir = TempDir::new().unwrap();
1789        let test_path = temp_dir.path();
1790
1791        // Create a test file with multiple chunks (use some code content)
1792        let test_file = test_path.join("test.rs");
1793        fs::write(
1794            &test_file,
1795            "fn main() {\n    println!(\"hello\");\n}\n\nfn other() {\n    println!(\"world\");\n}",
1796        )
1797        .unwrap();
1798
1799        // Create an embedder that returns mismatched count
1800        let mut mismatched_embedder: Box<dyn ck_embed::Embedder> =
1801            Box::new(MismatchedCountEmbedder);
1802
1803        // This should return an error, not silently mismatch
1804        let result = index_single_file(&test_file, test_path, Some(&mut mismatched_embedder));
1805
1806        assert!(result.is_err());
1807        let error_msg = result.unwrap_err().to_string();
1808        assert!(error_msg.contains("Embedder returned"));
1809        assert!(error_msg.contains("embeddings for"));
1810        assert!(error_msg.contains("chunks"));
1811        assert!(error_msg.contains("Expected equal counts"));
1812    }
1813
1814    #[test]
1815    fn test_index_single_file_with_valid_embedder_still_works() {
1816        let temp_dir = TempDir::new().unwrap();
1817        let test_path = temp_dir.path();
1818
1819        // Create a simple test file
1820        let test_file = test_path.join("test.txt");
1821        fs::write(&test_file, "hello world").unwrap();
1822
1823        // Create a dummy embedder that returns proper results
1824        let dummy_embedder = ck_embed::DummyEmbedder::new();
1825        let mut boxed_embedder: Box<dyn ck_embed::Embedder> = Box::new(dummy_embedder);
1826
1827        // This should work fine
1828        let result = index_single_file(&test_file, test_path, Some(&mut boxed_embedder));
1829
1830        assert!(result.is_ok());
1831        let entry = result.unwrap();
1832        assert!(!entry.chunks.is_empty());
1833        // Verify that embeddings are present
1834        for chunk in &entry.chunks {
1835            assert!(chunk.embedding.is_some());
1836            assert_eq!(chunk.embedding.as_ref().unwrap().len(), 384); // DummyEmbedder dimension
1837        }
1838    }
1839
1840    #[tokio::test]
1841    async fn test_smart_update_index() {
1842        let temp_dir = TempDir::new().unwrap();
1843        let test_path = temp_dir.path();
1844
1845        // Create initial file
1846        fs::write(test_path.join("file1.txt"), "initial content").unwrap();
1847
1848        let file_options = ck_core::FileCollectionOptions {
1849            respect_gitignore: true,
1850            use_ckignore: true,
1851            exclude_patterns: vec![],
1852        };
1853
1854        // First index
1855        let stats1 = smart_update_index(test_path, false, &file_options)
1856            .await
1857            .unwrap();
1858        assert_eq!(stats1.files_added, 1);
1859        assert_eq!(stats1.files_indexed, 1);
1860
1861        // No changes, should be up to date
1862        let stats2 = smart_update_index(test_path, false, &file_options)
1863            .await
1864            .unwrap();
1865        assert_eq!(stats2.files_up_to_date, 1);
1866        assert_eq!(stats2.files_indexed, 0);
1867
1868        // Modify file
1869        fs::write(test_path.join("file1.txt"), "modified content").unwrap();
1870        let stats3 = smart_update_index(test_path, false, &file_options)
1871            .await
1872            .unwrap();
1873        assert_eq!(stats3.files_modified, 1);
1874        assert_eq!(stats3.files_indexed, 1);
1875
1876        // Add new file
1877        fs::write(test_path.join("file2.txt"), "new file content").unwrap();
1878        let stats4 = smart_update_index(test_path, false, &file_options)
1879            .await
1880            .unwrap();
1881        assert_eq!(stats4.files_added, 1);
1882        assert_eq!(stats4.files_up_to_date, 1);
1883        assert_eq!(stats4.files_indexed, 1);
1884    }
1885
1886    #[test]
1887    fn test_cleanup_index() {
1888        let temp_dir = TempDir::new().unwrap();
1889        let test_path = temp_dir.path();
1890
1891        // Create index directory and manifest
1892        let index_dir = test_path.join(".ck");
1893        fs::create_dir_all(&index_dir).unwrap();
1894
1895        let mut manifest = IndexManifest::default();
1896        manifest.files.insert(
1897            test_path.join("deleted_file.txt"),
1898            FileMetadata {
1899                path: test_path.join("deleted_file.txt"),
1900                hash: "fake_hash".to_string(),
1901                last_modified: 0,
1902                size: 0,
1903            },
1904        );
1905
1906        let manifest_path = index_dir.join("manifest.json");
1907        save_manifest(&manifest_path, &manifest).unwrap();
1908
1909        // Cleanup should remove orphaned entry
1910        let file_options = ck_core::FileCollectionOptions {
1911            respect_gitignore: true,
1912            use_ckignore: true,
1913            exclude_patterns: vec![],
1914        };
1915        let stats = cleanup_index(test_path, &file_options).unwrap();
1916        assert_eq!(stats.orphaned_entries_removed, 1);
1917
1918        // Check that manifest was updated
1919        let updated_manifest = load_or_create_manifest(&manifest_path).unwrap();
1920        assert_eq!(updated_manifest.files.len(), 0);
1921    }
1922
1923    #[test]
1924    fn test_get_index_stats() {
1925        let temp_dir = TempDir::new().unwrap();
1926        let test_path = temp_dir.path();
1927
1928        // No index exists
1929        let stats = get_index_stats(test_path).unwrap();
1930        assert_eq!(stats.total_files, 0);
1931
1932        // Create index
1933        let index_dir = test_path.join(".ck");
1934        fs::create_dir_all(&index_dir).unwrap();
1935
1936        let mut manifest = IndexManifest::default();
1937        manifest.files.insert(
1938            test_path.join("test.txt"),
1939            FileMetadata {
1940                path: test_path.join("test.txt"),
1941                hash: "test_hash".to_string(),
1942                last_modified: 1234567890,
1943                size: 100,
1944            },
1945        );
1946
1947        let manifest_path = index_dir.join("manifest.json");
1948        save_manifest(&manifest_path, &manifest).unwrap();
1949
1950        let stats = get_index_stats(test_path).unwrap();
1951        assert_eq!(stats.total_files, 1);
1952    }
1953
1954    #[test]
1955    fn test_sidecar_to_original_path() {
1956        let temp_dir = TempDir::new().unwrap();
1957        let index_dir = temp_dir.path().join(".ck");
1958
1959        // Test normal file
1960        let sidecar = index_dir.join("test.txt.ck");
1961        let original = sidecar_to_original_path(&sidecar, &index_dir, temp_dir.path());
1962        assert_eq!(original, Some(PathBuf::from("test.txt")));
1963
1964        // Test nested file
1965        let nested_sidecar = index_dir.join("src").join("main.rs.ck");
1966        let nested_original =
1967            sidecar_to_original_path(&nested_sidecar, &index_dir, temp_dir.path());
1968        assert_eq!(nested_original, Some(PathBuf::from("src/main.rs")));
1969    }
1970
1971    #[test]
1972    fn test_is_text_file() {
1973        use std::fs::File;
1974        use std::io::Write;
1975        use tempfile::TempDir;
1976
1977        let temp_dir = TempDir::new().unwrap();
1978        let temp_path = temp_dir.path();
1979
1980        // Create a text file (no NUL bytes)
1981        let text_file = temp_path.join("test.txt");
1982        let mut file = File::create(&text_file).unwrap();
1983        file.write_all(b"Hello world\nThis is text content")
1984            .unwrap();
1985        assert!(is_text_file(&text_file));
1986
1987        // Create a text file with unusual extension
1988        let log_file = temp_path.join("app.log");
1989        let mut file = File::create(&log_file).unwrap();
1990        file.write_all(b"2024-01-15 ERROR: Failed to connect")
1991            .unwrap();
1992        assert!(is_text_file(&log_file));
1993
1994        // Create a file without extension but with text content
1995        let no_ext_file = temp_path.join("README");
1996        let mut file = File::create(&no_ext_file).unwrap();
1997        file.write_all(b"This is a README file").unwrap();
1998        assert!(is_text_file(&no_ext_file));
1999
2000        // Create a binary file with NUL bytes
2001        let binary_file = temp_path.join("test.bin");
2002        let mut file = File::create(&binary_file).unwrap();
2003        file.write_all(&[
2004            0x48, 0x65, 0x6C, 0x6C, 0x6F, 0x00, 0x57, 0x6F, 0x72, 0x6C, 0x64,
2005        ])
2006        .unwrap(); // "Hello\0World"
2007        assert!(!is_text_file(&binary_file));
2008
2009        // Create an empty file (should be considered text)
2010        let empty_file = temp_path.join("empty.txt");
2011        File::create(&empty_file).unwrap();
2012        assert!(is_text_file(&empty_file));
2013
2014        // Test non-existent file (should return false)
2015        let nonexistent = temp_path.join("nonexistent.txt");
2016        assert!(!is_text_file(&nonexistent));
2017    }
2018
2019    #[test]
2020    fn test_remove_empty_dirs() {
2021        let temp_dir = TempDir::new().unwrap();
2022        let test_path = temp_dir.path();
2023
2024        // Create nested empty directories
2025        let nested_dir = test_path.join("level1").join("level2").join("level3");
2026        fs::create_dir_all(&nested_dir).unwrap();
2027
2028        // Remove empty dirs
2029        remove_empty_dirs(test_path).unwrap();
2030
2031        // Check that empty dirs were removed
2032        assert!(!nested_dir.exists());
2033        assert!(!test_path.join("level1").join("level2").exists());
2034        assert!(!test_path.join("level1").exists());
2035    }
2036
2037    /// Tests that respect_gitignore=false disables .git/info/exclude patterns.
2038    #[test]
2039    fn test_no_ignore_disables_git_exclude() {
2040        let temp_dir = TempDir::new().unwrap();
2041        let test_path = temp_dir.path();
2042
2043        // Create .git/info directory structure
2044        fs::create_dir_all(test_path.join(".git/info")).unwrap();
2045
2046        // Create a visible file at root
2047        fs::write(test_path.join("visible.txt"), "visible content").unwrap();
2048
2049        // Create a directory that will be excluded via .git/info/exclude
2050        let excluded_dir = test_path.join("excluded_dir");
2051        fs::create_dir(&excluded_dir).unwrap();
2052        fs::write(excluded_dir.join("hidden.txt"), "hidden content").unwrap();
2053
2054        // Use .git/info/exclude (not .gitignore) to test git_exclude() behavior
2055        fs::write(test_path.join(".git/info/exclude"), "/excluded_dir\n").unwrap();
2056
2057        // With respect_gitignore=true, .git/info/exclude should be honored
2058        let options_respect = ck_core::FileCollectionOptions {
2059            respect_gitignore: true,
2060            use_ckignore: false,
2061            exclude_patterns: vec![],
2062        };
2063        let files = collect_files(test_path, &options_respect).unwrap();
2064        assert_eq!(
2065            files.len(),
2066            1,
2067            "With respect_gitignore=true, .git/info/exclude should hide files, found: {:?}",
2068            files
2069        );
2070
2071        // With respect_gitignore=false, .git/info/exclude should be ignored
2072        let options_no_ignore = ck_core::FileCollectionOptions {
2073            respect_gitignore: false,
2074            use_ckignore: false,
2075            exclude_patterns: vec![],
2076        };
2077        let files = collect_files(test_path, &options_no_ignore).unwrap();
2078        assert_eq!(
2079            files.len(),
2080            2,
2081            "With respect_gitignore=false, .git/info/exclude should be ignored, found: {:?}",
2082            files
2083        );
2084    }
2085
2086    #[test]
2087    fn test_ckignore_works_without_gitignore() {
2088        // Test that .ckignore is respected even when respect_gitignore is false
2089        let temp_dir = TempDir::new().unwrap();
2090        let test_path = temp_dir.path();
2091
2092        // Create .gitignore and .ckignore with different patterns
2093        fs::write(test_path.join(".gitignore"), "*.git\n").unwrap();
2094        fs::write(test_path.join(".ckignore"), "*.ck\n").unwrap();
2095
2096        // Create test files
2097        fs::write(test_path.join("normal.txt"), "normal content").unwrap();
2098        fs::write(test_path.join("ignored_by_git.git"), "git ignored").unwrap();
2099        fs::write(test_path.join("ignored_by_ck.ck"), "ck ignored").unwrap();
2100
2101        // Test with respect_gitignore=false, use_ckignore=true
2102        let options = ck_core::FileCollectionOptions {
2103            respect_gitignore: false,
2104            use_ckignore: true,
2105            exclude_patterns: vec![],
2106        };
2107
2108        let files = collect_files(test_path, &options).unwrap();
2109        let file_names: Vec<String> = files
2110            .iter()
2111            .filter_map(|p| p.file_name())
2112            .map(|n| n.to_string_lossy().to_string())
2113            .collect();
2114
2115        // Should find normal.txt
2116        assert!(
2117            file_names.contains(&"normal.txt".to_string()),
2118            "Should find normal.txt"
2119        );
2120
2121        // Should find .git file (gitignore not respected)
2122        assert!(
2123            file_names.contains(&"ignored_by_git.git".to_string()),
2124            "Should find .git file when respect_gitignore=false"
2125        );
2126
2127        // Should NOT find .ck file (ckignore is respected)
2128        assert!(
2129            !file_names.contains(&"ignored_by_ck.ck".to_string()),
2130            "Should NOT find .ck file when use_ckignore=true"
2131        );
2132
2133        // Test with both disabled
2134        let options_both_disabled = ck_core::FileCollectionOptions {
2135            respect_gitignore: false,
2136            use_ckignore: false,
2137            exclude_patterns: vec![],
2138        };
2139
2140        let files_all = collect_files(test_path, &options_both_disabled).unwrap();
2141        let file_names_all: Vec<String> = files_all
2142            .iter()
2143            .filter_map(|p| p.file_name())
2144            .map(|n| n.to_string_lossy().to_string())
2145            .collect();
2146
2147        // Should find ALL files when both are disabled
2148        assert!(
2149            file_names_all.contains(&"ignored_by_git.git".to_string()),
2150            "Should find .git file"
2151        );
2152        assert!(
2153            file_names_all.contains(&"ignored_by_ck.ck".to_string()),
2154            "Should find .ck file when use_ckignore=false"
2155        );
2156    }
2157}
2158
2159// ============================================================================
2160// Cleanup Validation Module
2161// ============================================================================
2162
2163/// Comprehensive cleanup and validation for the index
2164mod cleanup_validation {
2165    use super::*;
2166    // IndexManifest is defined in this module
2167
2168    /// Validates and cleans up the index to ensure consistency
2169    pub fn validate_and_cleanup_index(
2170        repo_root: &Path,
2171        index_dir: &Path,
2172        manifest: &mut IndexManifest,
2173        options: &ck_core::FileCollectionOptions,
2174    ) -> Result<CleanupStats> {
2175        let mut stats = CleanupStats::default();
2176
2177        // Step 1: Get all files that actually exist in the repository
2178        let existing_files = collect_files_as_hashset(repo_root, options)?;
2179        let standard_existing_files: HashSet<PathBuf> = existing_files
2180            .into_iter()
2181            .map(|path| path_utils::to_standard_path(&path, repo_root))
2182            .collect();
2183
2184        // Step 2: Validate manifest entries
2185        let manifest_entries: Vec<PathBuf> =
2186            manifest.files.keys().map(|k| k.to_path_buf()).collect();
2187        for manifest_path in manifest_entries {
2188            let standard_path = path_utils::from_manifest_path(&manifest_path);
2189
2190            // Check if file exists in reality
2191            if !standard_existing_files.contains(&standard_path) {
2192                remove_manifest_entry(manifest, &manifest_path, repo_root, index_dir, &mut stats)?;
2193                continue;
2194            }
2195
2196            // Check if sidecar file exists
2197            let sidecar_path =
2198                path_utils::get_sidecar_path_for_standard_path(index_dir, &standard_path);
2199            if !sidecar_path.exists() {
2200                remove_manifest_entry(manifest, &manifest_path, repo_root, index_dir, &mut stats)?;
2201                continue;
2202            }
2203        }
2204
2205        // Step 3: Clean up orphaned sidecar files
2206        cleanup_orphaned_sidecars(index_dir, &standard_existing_files, manifest, &mut stats)?;
2207
2208        Ok(stats)
2209    }
2210
2211    /// Remove a manifest entry and its associated files
2212    fn remove_manifest_entry(
2213        manifest: &mut IndexManifest,
2214        manifest_path: &Path,
2215        repo_root: &Path,
2216        index_dir: &Path,
2217        stats: &mut CleanupStats,
2218    ) -> Result<()> {
2219        manifest.files.remove(manifest_path);
2220
2221        // Remove sidecar file
2222        let standard_path = path_utils::from_manifest_path(manifest_path);
2223        let sidecar_path =
2224            path_utils::get_sidecar_path_for_standard_path(index_dir, &standard_path);
2225        if sidecar_path.exists() {
2226            fs::remove_file(&sidecar_path)?;
2227            stats.orphaned_sidecars_removed += 1;
2228        }
2229
2230        // Remove content cache for PDFs
2231        if ck_core::pdf::is_pdf_file(&standard_path) {
2232            let absolute_path = repo_root.join(&standard_path);
2233            let cache_path = ck_core::pdf::get_content_cache_path(repo_root, &absolute_path);
2234            if cache_path.exists() {
2235                fs::remove_file(&cache_path)?;
2236                tracing::debug!("Removed orphaned content cache: {:?}", cache_path);
2237            }
2238        }
2239
2240        stats.orphaned_entries_removed += 1;
2241        tracing::warn!("Removed manifest entry: {:?}", manifest_path);
2242        Ok(())
2243    }
2244
2245    /// Clean up sidecar files that don't have corresponding manifest entries
2246    fn cleanup_orphaned_sidecars(
2247        index_dir: &Path,
2248        standard_existing_files: &HashSet<PathBuf>,
2249        manifest: &IndexManifest,
2250        stats: &mut CleanupStats,
2251    ) -> Result<()> {
2252        if !index_dir.exists() {
2253            return Ok(());
2254        }
2255
2256        for entry in WalkDir::new(index_dir) {
2257            let entry = entry?;
2258            if entry.file_type().is_file() {
2259                let sidecar_path = entry.path();
2260                if sidecar_path.extension().and_then(|s| s.to_str()) == Some("ck")
2261                    && let Some(standard_path) =
2262                        path_utils::sidecar_to_standard_path(sidecar_path, index_dir)
2263                {
2264                    let manifest_path = path_utils::to_manifest_path(&standard_path);
2265
2266                    // Remove if file doesn't exist in reality or isn't in manifest
2267                    if !standard_existing_files.contains(&standard_path)
2268                        || !manifest.files.contains_key(&manifest_path)
2269                    {
2270                        fs::remove_file(sidecar_path)?;
2271                        stats.orphaned_sidecars_removed += 1;
2272                    }
2273                }
2274            }
2275        }
2276
2277        Ok(())
2278    }
2279}
2280
2281// ============================================================================
2282// Path Utilities Module
2283// ============================================================================
2284
2285/// Standardized path format for the indexing system.
2286/// All paths are stored as relative paths from the repository root without "./" prefix.
2287/// Example: "examples/code/api_client.js" instead of "./examples/code/api_client.js"
2288mod path_utils {
2289    use super::*;
2290
2291    /// Convert an absolute path to a standardized relative path from repo root
2292    pub fn to_standard_path(absolute_path: &Path, repo_root: &Path) -> PathBuf {
2293        if let Ok(relative) = absolute_path.strip_prefix(repo_root) {
2294            relative.to_path_buf()
2295        } else {
2296            absolute_path.to_path_buf()
2297        }
2298    }
2299
2300    /// Convert a standardized path to a manifest path (with "./" prefix for compatibility)
2301    pub fn to_manifest_path(standard_path: &Path) -> PathBuf {
2302        PathBuf::from(".").join(standard_path)
2303    }
2304
2305    /// Convert a manifest path (with "./" prefix) to a standardized path
2306    pub fn from_manifest_path(manifest_path: &Path) -> PathBuf {
2307        if let Ok(relative) = manifest_path.strip_prefix(".") {
2308            relative.to_path_buf()
2309        } else {
2310            manifest_path.to_path_buf()
2311        }
2312    }
2313
2314    /// Get the sidecar path for a standardized file path
2315    pub fn get_sidecar_path_for_standard_path(index_dir: &Path, standard_path: &Path) -> PathBuf {
2316        let sidecar_name = format!("{}.ck", standard_path.display());
2317        index_dir.join(sidecar_name)
2318    }
2319
2320    /// Convert a sidecar path back to a standardized original path
2321    pub fn sidecar_to_standard_path(sidecar_path: &Path, index_dir: &Path) -> Option<PathBuf> {
2322        let relative_path = sidecar_path.strip_prefix(index_dir).ok()?;
2323        let original_path = relative_path.with_extension("");
2324
2325        // Handle the .ck extension removal
2326        if let Some(name) = original_path.file_name() {
2327            let name_str = name.to_string_lossy();
2328            if let Some(original_name) = name_str.strip_suffix(".ck") {
2329                let mut result = original_path.clone();
2330                result.set_file_name(original_name);
2331                return Some(result);
2332            }
2333        }
2334
2335        Some(original_path)
2336    }
2337}