// ygrep_core/lib.rs
1//! ygrep-core - Core library for ygrep semantic code search
2//!
3//! This crate provides the core functionality for indexing and searching code:
4//! - Tantivy-based full-text indexing
5//! - File system walking with symlink handling
6//! - BM25 text search + semantic vector search (with `embeddings` feature)
7//! - Hybrid search with Reciprocal Rank Fusion
8//! - Configuration management
9
10pub mod config;
11pub mod dashboard;
12#[cfg(feature = "embeddings")]
13pub mod embeddings;
14pub mod error;
15pub mod fs;
16pub mod index;
17pub mod search;
18pub mod watcher;
19
20pub use config::Config;
21pub use error::{Result, YgrepError};
22pub use watcher::{FileWatcher, WatchEvent};
23
24use std::collections::HashMap;
25use std::path::Path;
26use tantivy::Index;
27
28#[cfg(feature = "embeddings")]
29use embeddings::{EmbeddingCache, EmbeddingModel};
30#[cfg(feature = "embeddings")]
31use index::VectorIndex;
32#[cfg(feature = "embeddings")]
33use std::sync::Arc;
34
/// Embedding dimension for all-MiniLM-L6-v2 (the model used by `EmbeddingModel::default()`)
#[cfg(feature = "embeddings")]
const EMBEDDING_DIM: usize = 384;

/// Sender for routing log messages (e.g. to dashboard TUI instead of stderr).
/// Installed via `Workspace::set_log_tx`; when unset, output goes to stderr.
pub type LogSender = std::sync::mpsc::Sender<String>;
41
/// High-level workspace for indexing and searching
///
/// Owns the Tantivy index for a single workspace root and — with the
/// `embeddings` feature — the vector index plus the embedding model and
/// cache used for semantic/hybrid search. Construct via `open`/`create`
/// (or the `_with_config` variants).
pub struct Workspace {
    /// Workspace root directory (canonicalized in `open_internal`)
    root: std::path::PathBuf,
    /// Configuration
    config: Config,
    /// Tantivy index
    index: Index,
    /// Index directory path
    index_path: std::path::PathBuf,
    /// Optional log channel — messages go here instead of stderr when set
    log_tx: Option<LogSender>,
    /// Vector index for semantic search
    #[cfg(feature = "embeddings")]
    vector_index: Arc<VectorIndex>,
    /// Embedding model
    #[cfg(feature = "embeddings")]
    embedding_model: Arc<EmbeddingModel>,
    /// Embedding cache
    #[cfg(feature = "embeddings")]
    embedding_cache: Arc<EmbeddingCache>,
}
64
65impl Workspace {
66    /// Open an existing workspace (fails if not indexed)
67    pub fn open(root: &Path) -> Result<Self> {
68        let config = Config::load();
69        Self::open_internal(root, config, false)
70    }
71
    /// Open an existing workspace with custom config (fails if not indexed)
    ///
    /// Same as [`Workspace::open`] but uses the caller-supplied `config`
    /// instead of loading the global one.
    pub fn open_with_config(root: &Path, config: Config) -> Result<Self> {
        Self::open_internal(root, config, false)
    }
76
77    /// Create or open a workspace for indexing
78    pub fn create(root: &Path) -> Result<Self> {
79        let config = Config::load();
80        Self::open_internal(root, config, true)
81    }
82
    /// Create or open a workspace with custom config for indexing
    ///
    /// Same as [`Workspace::create`] but uses the caller-supplied `config`
    /// instead of loading the global one.
    pub fn create_with_config(root: &Path, config: Config) -> Result<Self> {
        Self::open_internal(root, config, true)
    }
87
88    /// Open or create a workspace with custom config
89    /// If create is false, returns an error if the index doesn't exist
90    fn open_internal(root: &Path, config: Config, create: bool) -> Result<Self> {
91        let root = std::fs::canonicalize(root)?;
92
93        // Resolve data directory:
94        // 1. Auto-detect: .ygrep/ directory in workspace root
95        // 2. Relative data_dir in config: resolve against workspace root
96        // 3. Absolute data_dir from config: use as-is
97        let local_ygrep = root.join(".ygrep");
98        let data_dir = if local_ygrep.is_dir() {
99            local_ygrep
100        } else if config.indexer.data_dir.is_relative() {
101            root.join(&config.indexer.data_dir)
102        } else {
103            config.indexer.data_dir.clone()
104        };
105
106        let workspace_hash = hash_path(&root);
107        let index_path = data_dir.join("indexes").join(&workspace_hash);
108
109        // Check if workspace has been properly indexed (workspace.json is written after indexing)
110        let workspace_indexed = index_path.join("workspace.json").exists();
111        // Check if Tantivy files exist (meta.json is created by Tantivy)
112        let tantivy_exists = index_path.join("meta.json").exists();
113
114        // If not creating and workspace not indexed, return error
115        if !create && !workspace_indexed {
116            return Err(YgrepError::Config(format!(
117                "Workspace not indexed: {}",
118                root.display()
119            )));
120        }
121
122        // Open or create Tantivy index
123        let schema = index::build_document_schema();
124
125        // Early check: verify index directory is writable (issue #7).
126        // On macOS, the default data path (~/Library/Application Support/ygrep/) may not
127        // be writable in sandboxed environments, causing cryptic lockfile PermissionDenied
128        // errors.  Detect this upfront and suggest XDG_DATA_HOME.
129        if index_path.exists() {
130            let probe = index_path.join(".ygrep-write-probe");
131            match std::fs::write(&probe, b"") {
132                Ok(()) => {
133                    let _ = std::fs::remove_file(&probe);
134                }
135                Err(e) if e.kind() == std::io::ErrorKind::PermissionDenied => {
136                    return Err(YgrepError::Config(format!(
137                        "Index directory is not writable: {}\n\n\
138                         Hint: Create a .ygrep/ directory in your project root for local indexes,\n\
139                         or set YGREP_HOME to a writable location.",
140                        index_path.display()
141                    )));
142                }
143                Err(_) => {} // Other errors (e.g. disk full) — let Tantivy handle them
144            }
145        }
146
147        // Clean up stale lockfiles that may block readers on macOS (issue #7).
148        // Tantivy's reader acquires META_LOCK via flock(); on macOS Intel this can
149        // fail with EPERM if a stale lockfile inode still has an unreleased flock.
150        // Removing the file forces the next acquire_lock() to create a fresh inode.
151        if !create {
152            let _ = std::fs::remove_file(index_path.join(".tantivy-meta.lock"));
153            let _ = std::fs::remove_file(index_path.join(".tantivy-writer.lock"));
154        }
155
156        let index = if tantivy_exists {
157            match Index::open_in_dir(&index_path) {
158                Ok(idx) => idx,
159                Err(e) if create => {
160                    // Corrupt index detected, silently recreate
161                    // Remove corrupted index and create fresh
162                    std::fs::remove_dir_all(&index_path)?;
163                    std::fs::create_dir_all(&index_path)?;
164                    Index::create_in_dir(&index_path, schema)?
165                }
166                Err(e) => return Err(e.into()),
167            }
168        } else {
169            // Create directory only when explicitly creating the index
170            std::fs::create_dir_all(&index_path)?;
171            Index::create_in_dir(&index_path, schema)?
172        };
173
174        // Register our custom code tokenizer
175        index::register_tokenizers(index.tokenizers());
176
177        #[cfg(feature = "embeddings")]
178        let (vector_index, embedding_model, embedding_cache) = {
179            // Create vector index path
180            let vector_path = index_path.join("vectors");
181
182            // Load or create vector index
183            let vector_index = if VectorIndex::exists(&vector_path) {
184                match VectorIndex::load(vector_path.clone()) {
185                    Ok(vi) => Arc::new(vi),
186                    Err(_e) => {
187                        // Corrupt vector index detected, silently recreate
188                        // Remove corrupted vector files and create fresh
189                        if vector_path.exists() {
190                            let _ = std::fs::remove_dir_all(&vector_path);
191                        }
192                        Arc::new(VectorIndex::new(vector_path, EMBEDDING_DIM)?)
193                    }
194                }
195            } else {
196                Arc::new(VectorIndex::new(vector_path, EMBEDDING_DIM)?)
197            };
198
199            // Create embedding model (lazy-loaded on first use)
200            let embedding_model = Arc::new(EmbeddingModel::default()); // Uses all-MiniLM-L6-v2
201
202            // Create embedding cache (100MB cache, 384 dimensions)
203            let embedding_cache = Arc::new(EmbeddingCache::new(100, EMBEDDING_DIM));
204
205            (vector_index, embedding_model, embedding_cache)
206        };
207
208        Ok(Self {
209            root,
210            config,
211            index,
212            index_path,
213            log_tx: None,
214            #[cfg(feature = "embeddings")]
215            vector_index,
216            #[cfg(feature = "embeddings")]
217            embedding_model,
218            #[cfg(feature = "embeddings")]
219            embedding_cache,
220        })
221    }
222
223    /// Set a log channel — all progress/warning output goes here instead of stderr
224    pub fn set_log_tx(&mut self, tx: LogSender) {
225        self.log_tx = Some(tx);
226        #[cfg(feature = "embeddings")]
227        self.embedding_model.set_quiet(true);
228    }
229
230    /// Route a message to the log channel or stderr
231    fn log(&self, msg: impl std::fmt::Display) {
232        if let Some(ref tx) = self.log_tx {
233            let _ = tx.send(msg.to_string());
234        } else {
235            eprintln!("{}", msg);
236        }
237    }
238
239    /// Route a partial-line message (no newline) to the log channel or stderr
240    fn log_inline(&self, msg: impl std::fmt::Display) {
241        if let Some(ref tx) = self.log_tx {
242            let _ = tx.send(msg.to_string());
243        } else {
244            eprint!("{}", msg);
245        }
246    }
247
    /// Index all files in the workspace (text-only by default, fast)
    ///
    /// Equivalent to `index_all_with_options(false)`: a BM25-only pass
    /// with no embedding generation.
    pub fn index_all(&self) -> Result<IndexStats> {
        self.index_all_with_options(false)
    }
252
    /// Index all files with options
    ///
    /// Performs a full (non-incremental) re-index:
    /// - clears the vector index (with the `embeddings` feature),
    /// - Phase 1: walks the workspace and indexes every file into Tantivy (BM25),
    /// - Phase 2: if `with_embeddings`, batch-generates embeddings for the
    ///   collected documents and stores them in the vector index,
    /// - finally rewrites `workspace.json` metadata and returns `IndexStats`.
    ///
    /// Without the `embeddings` feature, `with_embeddings = true` only logs a warning.
    #[allow(unused_variables)]
    pub fn index_all_with_options(&self, with_embeddings: bool) -> Result<IndexStats> {
        // Clear vector index for fresh re-index
        #[cfg(feature = "embeddings")]
        self.vector_index.clear();

        // Phase 1: Index all files with BM25 (fast)
        let indexer =
            index::Indexer::new(self.config.indexer.clone(), self.index.clone(), &self.root)?;

        let mut walker = fs::FileWalker::new(self.root.clone(), self.config.indexer.clone())?;

        let mut indexed = 0;
        let mut skipped = 0;
        let mut errors = 0;

        // Collect content for batch embedding
        #[cfg(feature = "embeddings")]
        let mut embedding_batch: Vec<(String, String)> = Vec::new(); // (doc_id, content)
        // Larger batch size = more efficient SIMD/vectorization in ONNX Runtime
        #[cfg(feature = "embeddings")]
        const BATCH_SIZE: usize = 64;

        for entry in walker.walk() {
            match indexer.index_file(&entry.path) {
                Ok((doc_id, content)) => {
                    indexed += 1;
                    // Periodic single-line progress (overwritten via \r).
                    if indexed % 500 == 0 {
                        self.log_inline(format!("\r  Indexed {} files...          ", indexed));
                    }

                    // Collect for embedding if enabled (reuse content from indexer)
                    #[cfg(feature = "embeddings")]
                    if with_embeddings {
                        embedding_batch.push((doc_id, content));
                    }
                    #[cfg(not(feature = "embeddings"))]
                    {
                        let _ = doc_id;
                        let _ = content;
                    }
                }
                // Oversized files are counted as skipped, not as errors.
                Err(YgrepError::FileTooLarge { .. }) => {
                    skipped += 1;
                }
                Err(e) => {
                    tracing::debug!("Error indexing {}: {}", entry.path.display(), e);
                    errors += 1;
                }
            }
        }

        self.log(format!("\r  Indexed {} files.              ", indexed));
        indexer.commit()?;

        // Track embedded count
        let mut total_embedded = 0usize;

        // Phase 2: Generate embeddings in batches (if enabled)
        #[cfg(feature = "embeddings")]
        if with_embeddings && !embedding_batch.is_empty() {
            // Filter out very short content (< 50 chars) and very long content (> 50KB)
            // These don't embed well or are too slow
            let filtered_batch: Vec<_> = embedding_batch
                .into_iter()
                .filter(|(_, content)| {
                    let len = content.len();
                    len >= 50 && len <= 50_000
                })
                .collect();

            if filtered_batch.is_empty() {
                self.log("No documents suitable for semantic indexing.");
            } else {
                use indicatif::{ProgressBar, ProgressStyle};

                let total_docs = filtered_batch.len() as u64;
                self.log(format!(
                    "Building semantic index for {} documents...",
                    total_docs
                ));

                // Pre-load the semantic model before starting progress bar
                self.embedding_model.preload()?;

                // Hide the bar when a log channel is installed (dashboard mode).
                let pb = if self.log_tx.is_some() {
                    ProgressBar::hidden()
                } else {
                    ProgressBar::new(total_docs)
                };
                pb.set_style(
                    ProgressStyle::default_bar()
                        .template("  [{bar:40.cyan/blue}] {pos}/{len} ({percent}%)")
                        .unwrap()
                        .progress_chars("━╸─"),
                );
                pb.enable_steady_tick(std::time::Duration::from_millis(100));

                for chunk in filtered_batch.chunks(BATCH_SIZE) {
                    // Truncate to ~4KB for embedding - sufficient context for code, faster tokenization
                    // Use floor_char_boundary to avoid slicing in the middle of multi-byte UTF-8 characters
                    const EMBED_TRUNCATE: usize = 4096;
                    let texts: Vec<&str> = chunk
                        .iter()
                        .map(|(_, content)| {
                            if content.len() > EMBED_TRUNCATE {
                                let boundary = content.floor_char_boundary(EMBED_TRUNCATE);
                                &content[..boundary]
                            } else {
                                content.as_str()
                            }
                        })
                        .collect();

                    match self.embedding_model.embed_batch(&texts) {
                        Ok(embeddings) => {
                            for ((doc_id, _), embedding) in chunk.iter().zip(embeddings) {
                                // Per-doc insertion failures are logged, not fatal.
                                if let Err(e) = self.vector_index.insert(doc_id, &embedding) {
                                    tracing::debug!(
                                        "Failed to insert embedding for {}: {}",
                                        doc_id,
                                        e
                                    );
                                }
                            }
                            total_embedded += chunk.len();
                            pb.set_position(total_embedded as u64);
                        }
                        // A failed batch is skipped; indexing continues with the next chunk.
                        Err(e) => {
                            tracing::warn!("Batch embedding failed: {}", e);
                            pb.inc(chunk.len() as u64);
                        }
                    }
                }

                pb.finish_and_clear();
                self.log(format!("  Indexed {} documents.", total_embedded));
                self.vector_index.save()?;
            }
        }

        #[cfg(not(feature = "embeddings"))]
        if with_embeddings {
            self.log("Warning: Semantic search feature not available in this build.");
        }

        let stats = walker.stats();

        // Save workspace metadata for index management
        // (workspace.json also serves as the "has been indexed" marker).
        let metadata = serde_json::json!({
            "workspace": self.root.to_string_lossy(),
            "indexed_at": chrono::Utc::now().to_rfc3339(),
            "files_indexed": indexed,
            "semantic": with_embeddings,
            "schema_version": index::SCHEMA_VERSION,
        });
        let metadata_path = self.index_path.join("workspace.json");
        if let Err(e) = std::fs::write(
            &metadata_path,
            serde_json::to_string_pretty(&metadata).unwrap_or_default(),
        ) {
            tracing::warn!("Failed to save workspace metadata: {}", e);
        }

        Ok(IndexStats {
            indexed,
            embedded: total_embedded,
            skipped,
            errors,
            unique_paths: stats.visited_paths,
            unchanged: 0,
            removed: 0,
        })
    }
428
    /// Build a map of all indexed files: relative_path -> (mtime, doc_id)
    /// Uses fast fields for efficient columnar reads, skipping chunk documents.
    /// Returns an empty map if the index is empty or unreadable.
    ///
    /// Used by `index_incremental_with_options` to detect unchanged and
    /// deleted files without loading full documents.
    pub fn build_indexed_files_map(&self) -> HashMap<String, (u64, String)> {
        let mut map = HashMap::new();

        // Best-effort: an unreadable index yields an empty map, not an error.
        let reader = match self.index.reader() {
            Ok(r) => r,
            Err(_) => return map,
        };

        let searcher = reader.searcher();

        for segment_reader in searcher.segment_readers() {
            let alive_bitset = segment_reader.alive_bitset();
            let fast_fields = segment_reader.fast_fields();

            // Get fast field columns for path, mtime, chunk_id, doc_id
            // Segments missing any column are skipped entirely.
            let path_col = match fast_fields.str("path") {
                Ok(Some(col)) => col,
                _ => continue,
            };
            let mtime_col = match fast_fields.u64("mtime") {
                Ok(col) => col,
                Err(_) => continue,
            };
            let chunk_id_col = match fast_fields.str("chunk_id") {
                Ok(Some(col)) => col,
                _ => continue,
            };
            let doc_id_col = match fast_fields.str("doc_id") {
                Ok(Some(col)) => col,
                _ => continue,
            };

            // Reusable buffers for ord -> string decoding (one allocation each).
            let mut path_buf = String::new();
            let mut chunk_id_buf = String::new();
            let mut doc_id_buf = String::new();

            for row_id in 0..segment_reader.max_doc() {
                // Skip deleted docs
                if let Some(bitset) = &alive_bitset {
                    if !bitset.is_alive(row_id) {
                        continue;
                    }
                }

                // Skip chunk documents (chunk_id is non-empty for chunks)
                chunk_id_buf.clear();
                let mut is_chunk = false;
                for ord in chunk_id_col.term_ords(row_id) {
                    let _ = chunk_id_col.ord_to_str(ord, &mut chunk_id_buf);
                    if !chunk_id_buf.is_empty() {
                        is_chunk = true;
                        break;
                    }
                }
                if is_chunk {
                    continue;
                }

                // Read path
                path_buf.clear();
                for ord in path_col.term_ords(row_id) {
                    let _ = path_col.ord_to_str(ord, &mut path_buf);
                }
                if path_buf.is_empty() {
                    continue;
                }

                // Read mtime from fast field (0 when the column has no value)
                let mtime_val = mtime_col.values_for_doc(row_id).next().unwrap_or(0);

                // Read doc_id
                doc_id_buf.clear();
                for ord in doc_id_col.term_ords(row_id) {
                    let _ = doc_id_col.ord_to_str(ord, &mut doc_id_buf);
                }

                // Later segments overwrite earlier ones for duplicate paths.
                map.insert(path_buf.clone(), (mtime_val, doc_id_buf.clone()));
            }
        }

        map
    }
514
    /// Incremental index: only re-index files that changed since last index
    ///
    /// Strategy:
    /// - Build a path -> (mtime, doc_id) map from the existing index.
    /// - Walk the workspace; files whose stored mtime matches the on-disk
    ///   mtime are counted as `unchanged`, everything else is (re-)indexed.
    /// - Paths left over in the map no longer exist on disk: they are removed
    ///   from the Tantivy index and their embeddings are marked deleted.
    /// - If `with_embeddings`, embeddings are generated for the changed
    ///   documents only; finally `workspace.json` metadata is rewritten.
    #[allow(unused_variables)]
    pub fn index_incremental_with_options(&self, with_embeddings: bool) -> Result<IndexStats> {
        // Build map of currently indexed files
        let mut indexed_map = self.build_indexed_files_map();

        // Create indexer (does NOT clear vector index)
        let indexer =
            index::Indexer::new(self.config.indexer.clone(), self.index.clone(), &self.root)?;

        let mut walker = fs::FileWalker::new(self.root.clone(), self.config.indexer.clone())?;

        let mut indexed = 0;
        let mut skipped = 0;
        let mut errors = 0;
        let mut unchanged = 0;

        // (doc_id, content) pairs collected for the embedding phase below.
        #[cfg(feature = "embeddings")]
        let mut embedding_batch: Vec<(String, String)> = Vec::new();
        #[cfg(feature = "embeddings")]
        const BATCH_SIZE: usize = 64;

        for entry in walker.walk() {
            // Get relative path for this file
            let rel_path = entry
                .path
                .strip_prefix(&self.root)
                .unwrap_or(&entry.path)
                .to_string_lossy()
                .to_string();

            // Get current file mtime (seconds since epoch; 0 when unreadable,
            // which forces a re-index rather than a false "unchanged").
            let current_mtime = std::fs::metadata(&entry.path)
                .ok()
                .and_then(|m| m.modified().ok())
                .and_then(|t| t.duration_since(std::time::UNIX_EPOCH).ok())
                .map(|d| d.as_secs())
                .unwrap_or(0);

            // Check if file is unchanged.
            // NOTE: remove() is intentional — whatever is left in indexed_map
            // after the walk is treated as deleted files below.
            if let Some((stored_mtime, _stored_doc_id)) = indexed_map.remove(&rel_path) {
                if stored_mtime == current_mtime {
                    unchanged += 1;
                    continue;
                }
                // mtime differs - re-index below
            }
            // else: new file, not in map

            match indexer.index_file(&entry.path) {
                Ok((doc_id, content)) => {
                    indexed += 1;
                    if indexed % 500 == 0 {
                        self.log_inline(format!("\r  Indexed {} files...          ", indexed));
                    }

                    #[cfg(feature = "embeddings")]
                    if with_embeddings {
                        embedding_batch.push((doc_id, content));
                    }
                    #[cfg(not(feature = "embeddings"))]
                    {
                        let _ = doc_id;
                        let _ = content;
                    }
                }
                // Oversized files are counted as skipped, not as errors.
                Err(YgrepError::FileTooLarge { .. }) => {
                    skipped += 1;
                }
                Err(e) => {
                    tracing::debug!("Error indexing {}: {}", entry.path.display(), e);
                    errors += 1;
                }
            }
        }

        if indexed > 0 {
            self.log(format!("\r  Indexed {} files.              ", indexed));
        }

        // Remove files that no longer exist on disk
        // Any paths remaining in indexed_map are deleted files
        let removed = indexed_map.len();
        for (deleted_path, (_mtime, doc_id)) in &indexed_map {
            indexer.delete_by_path(deleted_path)?;

            // Also remove stale embeddings from vector index
            #[cfg(feature = "embeddings")]
            if !doc_id.is_empty() {
                self.vector_index.mark_deleted(doc_id);
            }
        }

        // Save vector index if we removed any embeddings
        #[cfg(feature = "embeddings")]
        if removed > 0 {
            if let Err(e) = self.vector_index.save() {
                tracing::debug!("Failed to save vector index after removals: {}", e);
            }
        }

        indexer.commit()?;

        // Track embedded count
        let mut total_embedded = 0usize;

        // Generate embeddings for changed files
        #[cfg(feature = "embeddings")]
        if with_embeddings && !embedding_batch.is_empty() {
            // Same size filter as the full re-index: < 50 chars or > 50KB
            // content doesn't embed well (or is too slow).
            let filtered_batch: Vec<_> = embedding_batch
                .into_iter()
                .filter(|(_, content)| {
                    let len = content.len();
                    len >= 50 && len <= 50_000
                })
                .collect();

            if !filtered_batch.is_empty() {
                use indicatif::{ProgressBar, ProgressStyle};

                let total_docs = filtered_batch.len() as u64;
                self.log(format!(
                    "Building semantic index for {} changed documents...",
                    total_docs
                ));

                // Load the model before the bar starts so load time isn't counted.
                self.embedding_model.preload()?;

                // Hide the bar when a log channel is installed (dashboard mode).
                let pb = if self.log_tx.is_some() {
                    ProgressBar::hidden()
                } else {
                    ProgressBar::new(total_docs)
                };
                pb.set_style(
                    ProgressStyle::default_bar()
                        .template("  [{bar:40.cyan/blue}] {pos}/{len} ({percent}%)")
                        .unwrap()
                        .progress_chars("━╸─"),
                );
                pb.enable_steady_tick(std::time::Duration::from_millis(100));

                for chunk in filtered_batch.chunks(BATCH_SIZE) {
                    // Truncate to ~4KB on a UTF-8 character boundary before embedding.
                    const EMBED_TRUNCATE: usize = 4096;
                    let texts: Vec<&str> = chunk
                        .iter()
                        .map(|(_, content)| {
                            if content.len() > EMBED_TRUNCATE {
                                let boundary = content.floor_char_boundary(EMBED_TRUNCATE);
                                &content[..boundary]
                            } else {
                                content.as_str()
                            }
                        })
                        .collect();

                    match self.embedding_model.embed_batch(&texts) {
                        Ok(embeddings) => {
                            for ((doc_id, _), embedding) in chunk.iter().zip(embeddings) {
                                // Per-doc insertion failures are logged, not fatal.
                                if let Err(e) = self.vector_index.insert(doc_id, &embedding) {
                                    tracing::debug!(
                                        "Failed to insert embedding for {}: {}",
                                        doc_id,
                                        e
                                    );
                                }
                            }
                            total_embedded += chunk.len();
                            pb.set_position(total_embedded as u64);
                        }
                        // A failed batch is skipped; indexing continues with the next chunk.
                        Err(e) => {
                            tracing::warn!("Batch embedding failed: {}", e);
                            pb.inc(chunk.len() as u64);
                        }
                    }
                }

                pb.finish_and_clear();
                self.log(format!("  Indexed {} documents.", total_embedded));
                self.vector_index.save()?;
            }
        }

        #[cfg(not(feature = "embeddings"))]
        if with_embeddings {
            self.log("Warning: Semantic search feature not available in this build.");
        }

        let walk_stats = walker.stats();

        // Save workspace metadata (workspace.json doubles as the indexed marker)
        let total_files = unchanged + indexed;
        let metadata = serde_json::json!({
            "workspace": self.root.to_string_lossy(),
            "indexed_at": chrono::Utc::now().to_rfc3339(),
            "files_indexed": total_files,
            "semantic": with_embeddings,
            "schema_version": index::SCHEMA_VERSION,
        });
        let metadata_path = self.index_path.join("workspace.json");
        if let Err(e) = std::fs::write(
            &metadata_path,
            serde_json::to_string_pretty(&metadata).unwrap_or_default(),
        ) {
            tracing::warn!("Failed to save workspace metadata: {}", e);
        }

        Ok(IndexStats {
            indexed,
            embedded: total_embedded,
            skipped,
            errors,
            unique_paths: walk_stats.visited_paths,
            unchanged,
            removed,
        })
    }
731
732    /// Search the workspace
733    pub fn search(&self, query: &str, limit: Option<usize>) -> Result<search::SearchResult> {
734        let searcher = search::Searcher::new(self.config.search.clone(), self.index.clone());
735        searcher.search(query, limit, false, None, None)
736    }
737
738    /// Search with filters
739    pub fn search_filtered(
740        &self,
741        query: &str,
742        limit: Option<usize>,
743        extensions: Option<Vec<String>>,
744        paths: Option<Vec<String>>,
745        use_regex: bool,
746        case_sensitive: bool,
747        context_before: Option<usize>,
748        context_after: Option<usize>,
749        verbose: bool,
750    ) -> Result<search::SearchResult> {
751        let searcher = search::Searcher::new(self.config.search.clone(), self.index.clone());
752        let filters = search::SearchFilters { extensions, paths };
753        searcher.search_filtered(
754            query,
755            limit,
756            filters,
757            use_regex,
758            case_sensitive,
759            context_before,
760            context_after,
761            verbose,
762        )
763    }
764
765    /// Hybrid search combining BM25 and vector search
766    #[cfg(feature = "embeddings")]
767    pub fn search_hybrid(&self, query: &str, limit: Option<usize>) -> Result<search::SearchResult> {
768        let searcher = search::HybridSearcher::new(
769            self.config.search.clone(),
770            self.index.clone(),
771            self.vector_index.clone(),
772            self.embedding_model.clone(),
773            self.embedding_cache.clone(),
774        );
775        searcher.search(query, limit)
776    }
777
    /// Check if semantic search is available (vector index has data)
    ///
    /// True once at least one embedding has been stored in the vector index.
    #[cfg(feature = "embeddings")]
    pub fn has_semantic_index(&self) -> bool {
        !self.vector_index.is_empty()
    }
783
    /// Check if semantic search is available (always false without embeddings feature)
    ///
    /// Stub so callers can probe semantic availability without cfg-gating.
    #[cfg(not(feature = "embeddings"))]
    pub fn has_semantic_index(&self) -> bool {
        false
    }
789
790    /// Get the workspace root
791    pub fn root(&self) -> &Path {
792        &self.root
793    }
794
795    /// Get the index path
796    pub fn index_path(&self) -> &Path {
797        &self.index_path
798    }
799
800    /// Check if the workspace has been indexed
801    /// (workspace.json is only created after actual indexing, not just opening)
802    pub fn is_indexed(&self) -> bool {
803        self.index_path.join("workspace.json").exists()
804    }
805
806    /// Index or re-index a single file (for incremental updates)
807    /// Note: path can be under workspace root OR under a symlink target
808    pub fn index_file(&self, path: &Path) -> Result<()> {
809        // Create indexer and index the file
810        let indexer =
811            index::Indexer::new(self.config.indexer.clone(), self.index.clone(), &self.root)?;
812
813        match indexer.index_file(path) {
814            Ok((_doc_id, _content)) => {
815                indexer.commit()?;
816                tracing::debug!("Indexed: {}", path.display());
817                Ok(())
818            }
819            Err(YgrepError::FileTooLarge { .. }) => {
820                tracing::debug!("Skipped (too large): {}", path.display());
821                Ok(())
822            }
823            Err(e) => Err(e),
824        }
825    }
826
827    /// Delete a file from the index (for incremental updates)
828    pub fn delete_file(&self, path: &Path) -> Result<()> {
829        use tantivy::Term;
830
831        // Get the relative path as doc_id
832        let relative_path = path
833            .strip_prefix(&self.root)
834            .unwrap_or(path)
835            .to_string_lossy();
836
837        let schema = self.index.schema();
838        let path_field = schema
839            .get_field("path")
840            .map_err(|_| YgrepError::Config("path field not found in schema".to_string()))?;
841
842        let term = Term::from_field_text(path_field, &relative_path);
843
844        let mut writer = self.index.writer::<tantivy::TantivyDocument>(50_000_000)?;
845        writer.delete_term(term);
846        writer.commit()?;
847
848        tracing::debug!("Deleted from index: {}", path.display());
849        Ok(())
850    }
851
852    /// Create a file watcher for this workspace
853    pub fn create_watcher(&self) -> Result<FileWatcher> {
854        FileWatcher::new(self.root.clone(), self.config.indexer.clone())
855    }
856
    /// Get the indexer config
    ///
    /// Returns a borrow of this workspace's indexer configuration;
    /// callers clone it if they need ownership.
    pub fn indexer_config(&self) -> &config::IndexerConfig {
        &self.config.indexer
    }
861
862    /// Read the stored semantic flag from workspace.json metadata
863    /// Returns None if no metadata exists or flag is not set
864    pub fn stored_semantic_flag(&self) -> Option<bool> {
865        self.read_metadata()
866            .and_then(|v| v.get("semantic").and_then(|s| s.as_bool()))
867    }
868
869    /// Read the stored schema version from workspace.json metadata
870    /// Returns None if no metadata exists or version is not set
871    pub fn stored_schema_version(&self) -> Option<u32> {
872        self.read_metadata()
873            .and_then(|v| v.get("schema_version").and_then(|s| s.as_u64()))
874            .map(|v| v as u32)
875    }
876
877    /// Read workspace.json metadata
878    fn read_metadata(&self) -> Option<serde_json::Value> {
879        let metadata_path = self.index_path.join("workspace.json");
880        if metadata_path.exists() {
881            std::fs::read_to_string(&metadata_path)
882                .ok()
883                .and_then(|s| serde_json::from_str::<serde_json::Value>(&s).ok())
884        } else {
885            None
886        }
887    }
888
889    /// Index or re-index a single file with optional semantic indexing (for incremental updates)
890    #[allow(unused_variables)]
891    pub fn index_file_with_options(&self, path: &Path, with_embeddings: bool) -> Result<()> {
892        // Create indexer and index the file
893        let indexer =
894            index::Indexer::new(self.config.indexer.clone(), self.index.clone(), &self.root)?;
895        self.index_file_with_indexer(&indexer, path, with_embeddings)
896    }
897
    /// Index or re-index a single file using an existing Indexer (avoids lock churn)
    ///
    /// Commits after this single file so the change is searchable right away.
    /// With the `embeddings` feature and `with_embeddings == true`, also
    /// generates a vector embedding for the file content and saves the vector
    /// index; embedding failures are logged at debug level and never fail the
    /// call. `FileTooLarge` is swallowed (file skipped); other errors propagate.
    // `with_embeddings` is unused when built without the `embeddings` feature.
    #[allow(unused_variables)]
    pub fn index_file_with_indexer(
        &self,
        indexer: &index::Indexer,
        path: &Path,
        with_embeddings: bool,
    ) -> Result<()> {
        match indexer.index_file(path) {
            Ok((doc_id, content)) => {
                // Commit first: text search works even if embedding fails below.
                indexer.commit()?;
                tracing::debug!("Indexed: {}", path.display());

                // Generate embedding if semantic indexing is enabled (reuse content from indexer)
                #[cfg(feature = "embeddings")]
                if with_embeddings {
                    // Only embed files within size bounds
                    let len = content.len();
                    if (50..=50_000).contains(&len) {
                        // Truncate for embedding (floor_char_boundary keeps the
                        // cut on a UTF-8 boundary so the slice below can't panic)
                        const EMBED_TRUNCATE: usize = 4096;
                        let text = if content.len() > EMBED_TRUNCATE {
                            let boundary = content.floor_char_boundary(EMBED_TRUNCATE);
                            &content[..boundary]
                        } else {
                            content.as_str()
                        };

                        match self.embedding_model.embed(text) {
                            Ok(embedding) => {
                                if let Err(e) = self.vector_index.insert(&doc_id, &embedding) {
                                    tracing::debug!(
                                        "Failed to insert embedding for {}: {}",
                                        doc_id,
                                        e
                                    );
                                } else {
                                    // Save vector index after each file (incremental)
                                    if let Err(e) = self.vector_index.save() {
                                        tracing::debug!("Failed to save vector index: {}", e);
                                    }
                                }
                            }
                            Err(e) => {
                                tracing::debug!(
                                    "Failed to generate embedding for {}: {}",
                                    doc_id,
                                    e
                                );
                            }
                        }
                    }
                }

                // Without embeddings support, explicitly discard the unused
                // tuple parts to keep the build warning-free.
                #[cfg(not(feature = "embeddings"))]
                {
                    let _ = doc_id;
                    let _ = content;
                }

                Ok(())
            }
            Err(YgrepError::FileTooLarge { .. }) => {
                tracing::debug!("Skipped (too large): {}", path.display());
                Ok(())
            }
            Err(e) => Err(e),
        }
    }
967
968    /// Create a persistent Indexer for this workspace (holds a single writer lock)
969    pub fn create_indexer(&self) -> Result<index::Indexer> {
970        index::Indexer::new(self.config.indexer.clone(), self.index.clone(), &self.root)
971    }
972
973    /// Create a persistent Indexer with NoMergePolicy (for watch mode)
974    /// Prevents background merge threads from racing with commits.
975    /// Segments accumulate but are consolidated on next incremental index.
976    pub fn create_watch_indexer(&self) -> Result<index::Indexer> {
977        index::Indexer::new_no_merge(self.config.indexer.clone(), self.index.clone(), &self.root)
978    }
979
    /// Index a single file without committing (for batched watch operations)
    ///
    /// Stages the document on `indexer`; nothing is visible to searchers
    /// until [`Self::commit_indexer`] is called. With the `embeddings`
    /// feature and `with_embeddings == true`, also inserts an embedding into
    /// the in-memory vector index. Embedding failures are logged at debug
    /// level and never fail the call; `FileTooLarge` is swallowed (skipped).
    // `with_embeddings` is unused when built without the `embeddings` feature.
    #[allow(unused_variables)]
    pub fn index_file_no_commit(
        &self,
        indexer: &index::Indexer,
        path: &Path,
        with_embeddings: bool,
    ) -> Result<()> {
        match indexer.index_file(path) {
            Ok((doc_id, content)) => {
                tracing::debug!("Staged: {}", path.display());

                #[cfg(feature = "embeddings")]
                if with_embeddings {
                    // Only embed files within size bounds.
                    let len = content.len();
                    if (50..=50_000).contains(&len) {
                        // Truncate for embedding (floor_char_boundary keeps the
                        // cut on a UTF-8 boundary so the slice below can't panic)
                        const EMBED_TRUNCATE: usize = 4096;
                        let text = if content.len() > EMBED_TRUNCATE {
                            let boundary = content.floor_char_boundary(EMBED_TRUNCATE);
                            &content[..boundary]
                        } else {
                            content.as_str()
                        };

                        match self.embedding_model.embed(text) {
                            Ok(embedding) => {
                                // NOTE(review): unlike index_file_with_indexer,
                                // no vector_index.save() here — presumably the
                                // caller persists once per batch; confirm.
                                if let Err(e) = self.vector_index.insert(&doc_id, &embedding) {
                                    tracing::debug!(
                                        "Failed to insert embedding for {}: {}",
                                        doc_id,
                                        e
                                    );
                                }
                            }
                            Err(e) => {
                                tracing::debug!(
                                    "Failed to generate embedding for {}: {}",
                                    doc_id,
                                    e
                                );
                            }
                        }
                    }
                }

                // Without embeddings support, explicitly discard the unused
                // tuple parts to keep the build warning-free.
                #[cfg(not(feature = "embeddings"))]
                {
                    let _ = doc_id;
                    let _ = content;
                }

                Ok(())
            }
            Err(YgrepError::FileTooLarge { .. }) => {
                tracing::debug!("Skipped (too large): {}", path.display());
                Ok(())
            }
            Err(e) => Err(e),
        }
    }
1040
1041    /// Delete a file from the index without committing (for batched watch operations)
1042    pub fn delete_file_no_commit(&self, indexer: &index::Indexer, path: &Path) -> Result<()> {
1043        let relative_path = path
1044            .strip_prefix(&self.root)
1045            .unwrap_or(path)
1046            .to_string_lossy();
1047
1048        indexer.delete_by_path(&relative_path)?;
1049        tracing::debug!("Staged delete: {}", path.display());
1050        Ok(())
1051    }
1052
    /// Commit all pending indexer changes
    ///
    /// Makes every add/delete staged since the last commit visible to
    /// searchers; errors come straight from the underlying indexer.
    pub fn commit_indexer(&self, indexer: &index::Indexer) -> Result<()> {
        indexer.commit()
    }
1057
1058    /// Delete a file from the index using an existing Indexer (avoids lock churn)
1059    pub fn delete_file_with_indexer(&self, indexer: &index::Indexer, path: &Path) -> Result<()> {
1060        let relative_path = path
1061            .strip_prefix(&self.root)
1062            .unwrap_or(path)
1063            .to_string_lossy();
1064
1065        indexer.delete_by_path(&relative_path)?;
1066        indexer.commit()?;
1067
1068        tracing::debug!("Deleted from index: {}", path.display());
1069        Ok(())
1070    }
1071}
1072
/// Statistics from an indexing operation
///
/// Field semantics mirror the counters updated by the indexing loop
/// (defined elsewhere in this crate); descriptions below reflect the
/// naming — confirm against the loop when in doubt.
#[derive(Debug, Clone, Default)]
pub struct IndexStats {
    /// Files added/updated in the text index
    pub indexed: usize,
    /// Files for which an embedding was generated (semantic indexing)
    pub embedded: usize,
    /// Files skipped (e.g. too large or otherwise filtered)
    pub skipped: usize,
    /// Files that failed to index
    pub errors: usize,
    /// Distinct file paths encountered during the run
    pub unique_paths: usize,
    /// Files left untouched because they had not changed
    pub unchanged: usize,
    /// Stale entries removed from the index
    pub removed: usize,
}
1084
1085/// Hash a path to create a unique identifier
1086fn hash_path(path: &Path) -> String {
1087    use xxhash_rust::xxh3::xxh3_64;
1088    let hash = xxh3_64(path.to_string_lossy().as_bytes());
1089    format!("{:016x}", hash)
1090}
1091
#[cfg(test)]
mod tests {
    // End-to-end tests that drive Workspace against real tempdir workspaces:
    // open/create, full indexing, incremental indexing with shared and
    // per-call indexers, and batched (no-commit) staging under churn.
    use super::*;
    use tempfile::tempdir;

    // Smoke test: a freshly created workspace points at an existing root.
    #[test]
    fn test_workspace_open() -> Result<()> {
        let temp_dir = tempdir().unwrap();

        // Create a test file
        std::fs::write(temp_dir.path().join("test.rs"), "fn main() {}").unwrap();

        let workspace = Workspace::create(temp_dir.path())?;
        assert!(workspace.root().exists());

        Ok(())
    }

    // Full pipeline: index a small workspace, then find a file by content.
    #[test]
    fn test_workspace_index_and_search() -> Result<()> {
        let temp_dir = tempdir().unwrap();

        // Create a workspace subdirectory to avoid the walker's hardcoded
        // ignore list (which includes "tmp", "var" — common tempdir components)
        let workspace_dir = temp_dir.path().join("workspace");
        std::fs::create_dir_all(&workspace_dir).unwrap();

        // Create test files
        std::fs::write(
            workspace_dir.join("hello.rs"),
            "fn hello_world() { println!(\"Hello!\"); }",
        )
        .unwrap();
        std::fs::write(
            workspace_dir.join("goodbye.rs"),
            "fn goodbye_world() { println!(\"Bye!\"); }",
        )
        .unwrap();

        let mut config = Config::default();
        config.indexer.data_dir = temp_dir.path().join("data");
        config.indexer.ignore_patterns = vec![];

        let workspace = Workspace::create_with_config(&workspace_dir, config)?;

        // Index
        let stats = workspace.index_all()?;
        assert!(
            stats.indexed >= 2,
            "Expected at least 2 indexed files, got {}",
            stats.indexed
        );

        // Search
        let result = workspace.search("hello", None)?;
        assert!(!result.is_empty());
        assert!(result.hits.iter().any(|h| h.path.contains("hello")));

        Ok(())
    }

    // A single long-lived Indexer must handle many sequential adds/deletes.
    #[test]
    fn test_shared_indexer_multiple_files() -> Result<()> {
        let temp_dir = tempdir().unwrap();
        let workspace_dir = temp_dir.path().join("workspace");
        std::fs::create_dir_all(&workspace_dir).unwrap();

        // Create initial file and index
        std::fs::write(workspace_dir.join("initial.rs"), "fn initial() {}").unwrap();

        let mut config = Config::default();
        config.indexer.data_dir = temp_dir.path().join("data");
        config.indexer.ignore_patterns = vec![];

        let workspace = Workspace::create_with_config(&workspace_dir, config)?;
        workspace.index_all()?;

        // Create a shared indexer and use it for multiple file operations
        let indexer = workspace.create_indexer()?;

        // Index several files rapidly with the same indexer (simulates watch loop)
        for i in 0..20 {
            let filename = format!("file_{}.rs", i);
            let content = format!("fn func_{}() {{ /* content {} */ }}", i, i);
            std::fs::write(workspace_dir.join(&filename), &content).unwrap();
            workspace.index_file_with_indexer(&indexer, &workspace_dir.join(&filename), false)?;
        }

        // Verify all files are searchable
        for i in 0..20 {
            let query = format!("func_{}", i);
            let result = workspace.search(&query, None)?;
            assert!(
                !result.is_empty(),
                "File {} should be searchable after indexing with shared indexer",
                i
            );
        }

        // Delete some files with the same indexer
        for i in 0..5 {
            let path = workspace_dir.join(format!("file_{}.rs", i));
            workspace.delete_file_with_indexer(&indexer, &path)?;
        }

        // Verify deleted files are gone, others remain
        let result = workspace.search("func_0", None)?;
        assert!(
            result.is_empty(),
            "Deleted file should not appear in search"
        );

        let result = workspace.search("func_10", None)?;
        assert!(
            !result.is_empty(),
            "Non-deleted file should still be searchable"
        );

        Ok(())
    }

    #[test]
    fn test_shared_indexer_no_lock_contention() -> Result<()> {
        // This test verifies that creating a single indexer and reusing it
        // for rapid sequential operations doesn't cause LockBusy errors,
        // which was the bug when creating a new indexer per file event.
        let temp_dir = tempdir().unwrap();
        let workspace_dir = temp_dir.path().join("workspace");
        std::fs::create_dir_all(&workspace_dir).unwrap();

        std::fs::write(workspace_dir.join("seed.rs"), "fn seed() {}").unwrap();

        let mut config = Config::default();
        config.indexer.data_dir = temp_dir.path().join("data");
        config.indexer.ignore_patterns = vec![];

        let workspace = Workspace::create_with_config(&workspace_dir, config)?;
        workspace.index_all()?;

        let indexer = workspace.create_indexer()?;

        // Simulate heavy churn: rapidly create, index, modify, re-index, delete
        for i in 0..50 {
            let path = workspace_dir.join(format!("churn_{}.rs", i));

            // Create and index
            std::fs::write(&path, format!("fn v1_{}() {{}}", i)).unwrap();
            workspace.index_file_with_indexer(&indexer, &path, false)?;

            // Modify and re-index (simulates rapid file changes)
            std::fs::write(&path, format!("fn v2_{}() {{}}", i)).unwrap();
            workspace.index_file_with_indexer(&indexer, &path, false)?;

            // Delete half of them
            if i % 2 == 0 {
                workspace.delete_file_with_indexer(&indexer, &path)?;
            }
        }

        // Verify odd-numbered files are still searchable with updated content
        let result = workspace.search("v2_1", None)?;
        assert!(
            !result.is_empty(),
            "Surviving file should have latest content"
        );

        // Verify even-numbered (deleted) files are gone
        let result = workspace.search("v2_0", None)?;
        assert!(result.is_empty(), "Deleted file should not appear");

        Ok(())
    }

    #[test]
    fn test_per_file_indexer_still_works() -> Result<()> {
        // Ensure the original index_file_with_options path (one indexer per call)
        // still works correctly for non-watch usage.
        let temp_dir = tempdir().unwrap();
        let workspace_dir = temp_dir.path().join("workspace");
        std::fs::create_dir_all(&workspace_dir).unwrap();

        std::fs::write(workspace_dir.join("base.rs"), "fn base() {}").unwrap();

        let mut config = Config::default();
        config.indexer.data_dir = temp_dir.path().join("data");
        config.indexer.ignore_patterns = vec![];

        let workspace = Workspace::create_with_config(&workspace_dir, config)?;
        workspace.index_all()?;

        // Use the per-call indexer path (index_file_with_options)
        let path = workspace_dir.join("standalone.rs");
        std::fs::write(&path, "fn standalone_function() {}").unwrap();
        workspace.index_file_with_options(&path, false)?;

        let result = workspace.search("standalone_function", None)?;
        assert!(
            !result.is_empty(),
            "File indexed via index_file_with_options should be searchable"
        );

        Ok(())
    }

    #[test]
    fn test_batched_commit_multiple_files() -> Result<()> {
        // Simulates the watch loop pattern: stage many files, commit once.
        // This is the pattern that prevents segment merge warnings under churn.
        let temp_dir = tempdir().unwrap();
        let workspace_dir = temp_dir.path().join("workspace");
        std::fs::create_dir_all(&workspace_dir).unwrap();

        std::fs::write(workspace_dir.join("seed.rs"), "fn seed() {}").unwrap();

        let mut config = Config::default();
        config.indexer.data_dir = temp_dir.path().join("data");
        config.indexer.ignore_patterns = vec![];

        let workspace = Workspace::create_with_config(&workspace_dir, config)?;
        workspace.index_all()?;

        let indexer = workspace.create_indexer()?;

        // Stage 30 file additions without committing (simulates a git branch switch)
        for i in 0..30 {
            let path = workspace_dir.join(format!("batch_{}.rs", i));
            std::fs::write(&path, format!("fn batch_func_{}() {{}}", i)).unwrap();
            workspace.index_file_no_commit(&indexer, &path, false)?;
        }

        // Single commit for all 30 files
        workspace.commit_indexer(&indexer)?;

        // All files should be searchable
        for i in 0..30 {
            let result = workspace.search(&format!("batch_func_{}", i), None)?;
            assert!(
                !result.is_empty(),
                "File {} should be searchable after batched commit",
                i
            );
        }

        // Now stage mixed deletes and adds without committing
        for i in 0..10 {
            let path = workspace_dir.join(format!("batch_{}.rs", i));
            workspace.delete_file_no_commit(&indexer, &path)?;
        }
        for i in 30..40 {
            let path = workspace_dir.join(format!("batch_{}.rs", i));
            std::fs::write(&path, format!("fn batch_func_{}() {{}}", i)).unwrap();
            workspace.index_file_no_commit(&indexer, &path, false)?;
        }

        // Single commit for mixed batch
        workspace.commit_indexer(&indexer)?;

        // Deleted files gone
        let result = workspace.search("batch_func_0", None)?;
        assert!(result.is_empty(), "Deleted file should not appear");

        // Surviving and new files present
        let result = workspace.search("batch_func_15", None)?;
        assert!(!result.is_empty(), "Surviving file should be searchable");

        let result = workspace.search("batch_func_35", None)?;
        assert!(!result.is_empty(), "Newly added file should be searchable");

        Ok(())
    }

    #[test]
    fn test_batched_commit_heavy_churn() -> Result<()> {
        // Simulates git-style churn: many files created, modified, and deleted
        // in rapid succession with only periodic commits (not per-file).
        let temp_dir = tempdir().unwrap();
        let workspace_dir = temp_dir.path().join("workspace");
        std::fs::create_dir_all(&workspace_dir).unwrap();

        std::fs::write(workspace_dir.join("seed.rs"), "fn seed() {}").unwrap();

        let mut config = Config::default();
        config.indexer.data_dir = temp_dir.path().join("data");
        config.indexer.ignore_patterns = vec![];

        let workspace = Workspace::create_with_config(&workspace_dir, config)?;
        workspace.index_all()?;

        let indexer = workspace.create_indexer()?;

        // Simulate 5 rapid batches (like 5 debounce windows during git checkout)
        for batch in 0..5 {
            // Each batch: create/modify/delete 20 files, commit once
            for i in 0..20 {
                let idx = batch * 20 + i;
                let path = workspace_dir.join(format!("churn_{}.rs", idx));

                std::fs::write(&path, format!("fn churn_v{}_{} () {{}}", batch, idx)).unwrap();
                workspace.index_file_no_commit(&indexer, &path, false)?;
            }

            // Delete files from previous batch
            if batch > 0 {
                for i in 0..10 {
                    let idx = (batch - 1) * 20 + i;
                    let path = workspace_dir.join(format!("churn_{}.rs", idx));
                    workspace.delete_file_no_commit(&indexer, &path)?;
                }
            }

            // One commit per batch
            workspace.commit_indexer(&indexer)?;
        }

        // Verify last batch's files are searchable
        let result = workspace.search("churn_v4_80", None)?;
        assert!(!result.is_empty(), "Latest batch file should be searchable");

        // Verify early deleted files are gone
        let result = workspace.search("churn_v0_0", None)?;
        assert!(
            result.is_empty(),
            "Deleted file from early batch should be gone"
        );

        Ok(())
    }
}