1pub mod config;
11pub mod dashboard;
12#[cfg(feature = "embeddings")]
13pub mod embeddings;
14pub mod error;
15pub mod fs;
16pub mod index;
17pub mod search;
18pub mod watcher;
19
20pub use config::Config;
21pub use error::{Result, YgrepError};
22pub use watcher::{FileWatcher, WatchEvent};
23
24use std::collections::HashMap;
25use std::path::Path;
26use tantivy::Index;
27
28#[cfg(feature = "embeddings")]
29use embeddings::{EmbeddingCache, EmbeddingModel};
30#[cfg(feature = "embeddings")]
31use index::VectorIndex;
32#[cfg(feature = "embeddings")]
33use std::sync::Arc;
34
/// Dimensionality of embedding vectors stored in the vector index.
/// NOTE(review): 384 must match the output size of the configured
/// `EmbeddingModel` — confirm against the `embeddings` module.
#[cfg(feature = "embeddings")]
const EMBEDDING_DIM: usize = 384;

/// Channel used to forward log messages to an external consumer (e.g. the
/// dashboard) instead of printing to stderr.
pub type LogSender = std::sync::mpsc::Sender<String>;
41
/// A searchable workspace: a canonicalized root directory plus its on-disk
/// Tantivy index and, with the `embeddings` feature, a vector index for
/// semantic search.
pub struct Workspace {
    // Canonicalized workspace root directory.
    root: std::path::PathBuf,
    // Effective configuration this workspace was opened with.
    config: Config,
    // Full-text (Tantivy) index.
    index: Index,
    // Directory holding the index files (`<data_dir>/indexes/<path-hash>`).
    index_path: std::path::PathBuf,
    // Optional log channel; when set, messages go here instead of stderr.
    log_tx: Option<LogSender>,
    // ANN index over document embeddings.
    #[cfg(feature = "embeddings")]
    vector_index: Arc<VectorIndex>,
    // Model that produces document/query embeddings.
    #[cfg(feature = "embeddings")]
    embedding_model: Arc<EmbeddingModel>,
    // Embedding cache — presumably for repeated query embeddings; confirm
    // against the search module.
    #[cfg(feature = "embeddings")]
    embedding_cache: Arc<EmbeddingCache>,
}
64
65impl Workspace {
66 pub fn open(root: &Path) -> Result<Self> {
68 let config = Config::load();
69 Self::open_internal(root, config, false)
70 }
71
    /// Opens an already-indexed workspace at `root` with an explicit `config`
    /// (instead of loading one from disk).
    pub fn open_with_config(root: &Path, config: Config) -> Result<Self> {
        Self::open_internal(root, config, false)
    }
76
77 pub fn create(root: &Path) -> Result<Self> {
79 let config = Config::load();
80 Self::open_internal(root, config, true)
81 }
82
    /// Creates (or recreates) the index for the workspace at `root` with an
    /// explicit `config`.
    pub fn create_with_config(root: &Path, config: Config) -> Result<Self> {
        Self::open_internal(root, config, true)
    }
87
88 fn open_internal(root: &Path, config: Config, create: bool) -> Result<Self> {
91 let root = std::fs::canonicalize(root)?;
92
93 let local_ygrep = root.join(".ygrep");
98 let data_dir = if local_ygrep.is_dir() {
99 local_ygrep
100 } else if config.indexer.data_dir.is_relative() {
101 root.join(&config.indexer.data_dir)
102 } else {
103 config.indexer.data_dir.clone()
104 };
105
106 let workspace_hash = hash_path(&root);
107 let index_path = data_dir.join("indexes").join(&workspace_hash);
108
109 let workspace_indexed = index_path.join("workspace.json").exists();
111 let tantivy_exists = index_path.join("meta.json").exists();
113
114 if !create && !workspace_indexed {
116 return Err(YgrepError::Config(format!(
117 "Workspace not indexed: {}",
118 root.display()
119 )));
120 }
121
122 let schema = index::build_document_schema();
124
125 if index_path.exists() {
130 let probe = index_path.join(".ygrep-write-probe");
131 match std::fs::write(&probe, b"") {
132 Ok(()) => {
133 let _ = std::fs::remove_file(&probe);
134 }
135 Err(e) if e.kind() == std::io::ErrorKind::PermissionDenied => {
136 return Err(YgrepError::Config(format!(
137 "Index directory is not writable: {}\n\n\
138 Hint: Create a .ygrep/ directory in your project root for local indexes,\n\
139 or set YGREP_HOME to a writable location.",
140 index_path.display()
141 )));
142 }
143 Err(_) => {} }
145 }
146
147 if !create {
152 let _ = std::fs::remove_file(index_path.join(".tantivy-meta.lock"));
153 let _ = std::fs::remove_file(index_path.join(".tantivy-writer.lock"));
154 }
155
156 let index = if tantivy_exists {
157 match Index::open_in_dir(&index_path) {
158 Ok(idx) => idx,
159 Err(e) if create => {
160 std::fs::remove_dir_all(&index_path)?;
163 std::fs::create_dir_all(&index_path)?;
164 Index::create_in_dir(&index_path, schema)?
165 }
166 Err(e) => return Err(e.into()),
167 }
168 } else {
169 std::fs::create_dir_all(&index_path)?;
171 Index::create_in_dir(&index_path, schema)?
172 };
173
174 index::register_tokenizers(index.tokenizers());
176
177 #[cfg(feature = "embeddings")]
178 let (vector_index, embedding_model, embedding_cache) = {
179 let vector_path = index_path.join("vectors");
181
182 let vector_index = if VectorIndex::exists(&vector_path) {
184 match VectorIndex::load(vector_path.clone()) {
185 Ok(vi) => Arc::new(vi),
186 Err(_e) => {
187 if vector_path.exists() {
190 let _ = std::fs::remove_dir_all(&vector_path);
191 }
192 Arc::new(VectorIndex::new(vector_path, EMBEDDING_DIM)?)
193 }
194 }
195 } else {
196 Arc::new(VectorIndex::new(vector_path, EMBEDDING_DIM)?)
197 };
198
199 let embedding_model = Arc::new(EmbeddingModel::default()); let embedding_cache = Arc::new(EmbeddingCache::new(100, EMBEDDING_DIM));
204
205 (vector_index, embedding_model, embedding_cache)
206 };
207
208 Ok(Self {
209 root,
210 config,
211 index,
212 index_path,
213 log_tx: None,
214 #[cfg(feature = "embeddings")]
215 vector_index,
216 #[cfg(feature = "embeddings")]
217 embedding_model,
218 #[cfg(feature = "embeddings")]
219 embedding_cache,
220 })
221 }
222
    /// Redirects all workspace log output to `tx` (used by the dashboard UI).
    /// With embeddings enabled, also silences the model's own stderr output.
    pub fn set_log_tx(&mut self, tx: LogSender) {
        self.log_tx = Some(tx);
        #[cfg(feature = "embeddings")]
        self.embedding_model.set_quiet(true);
    }
229
230 fn log(&self, msg: impl std::fmt::Display) {
232 if let Some(ref tx) = self.log_tx {
233 let _ = tx.send(msg.to_string());
234 } else {
235 eprintln!("{}", msg);
236 }
237 }
238
239 fn log_inline(&self, msg: impl std::fmt::Display) {
241 if let Some(ref tx) = self.log_tx {
242 let _ = tx.send(msg.to_string());
243 } else {
244 eprint!("{}", msg);
245 }
246 }
247
    /// Rebuilds the full-text index for every file in the workspace, without
    /// generating embeddings.
    pub fn index_all(&self) -> Result<IndexStats> {
        self.index_all_with_options(false)
    }
252
    /// Rebuilds the index from scratch: walks the workspace, indexes every
    /// eligible file and, when `with_embeddings` is true (and the `embeddings`
    /// feature is compiled in), builds the semantic vector index as well.
    ///
    /// `unchanged` and `removed` in the returned stats are always 0 here,
    /// since this is a full rebuild rather than an incremental pass.
    #[allow(unused_variables)]
    pub fn index_all_with_options(&self, with_embeddings: bool) -> Result<IndexStats> {
        // Full rebuild: discard any previously stored embedding vectors.
        #[cfg(feature = "embeddings")]
        self.vector_index.clear();

        let indexer =
            index::Indexer::new(self.config.indexer.clone(), self.index.clone(), &self.root)?;

        let mut walker = fs::FileWalker::new(self.root.clone(), self.config.indexer.clone())?;

        let mut indexed = 0;
        let mut skipped = 0;
        let mut errors = 0;

        // (doc_id, content) pairs collected during the text pass and embedded
        // in batches afterwards.
        #[cfg(feature = "embeddings")]
        let mut embedding_batch: Vec<(String, String)> = Vec::new(); #[cfg(feature = "embeddings")]
        const BATCH_SIZE: usize = 64;

        for entry in walker.walk() {
            match indexer.index_file(&entry.path) {
                Ok((doc_id, content)) => {
                    indexed += 1;
                    // Lightweight progress output every 500 files.
                    if indexed % 500 == 0 {
                        self.log_inline(format!("\r Indexed {} files... ", indexed));
                    }

                    #[cfg(feature = "embeddings")]
                    if with_embeddings {
                        embedding_batch.push((doc_id, content));
                    }
                    // Without the feature, explicitly discard to stay warning-free.
                    #[cfg(not(feature = "embeddings"))]
                    {
                        let _ = doc_id;
                        let _ = content;
                    }
                }
                // Oversized files are skipped, not counted as errors.
                Err(YgrepError::FileTooLarge { .. }) => {
                    skipped += 1;
                }
                Err(e) => {
                    tracing::debug!("Error indexing {}: {}", entry.path.display(), e);
                    errors += 1;
                }
            }
        }

        self.log(format!("\r Indexed {} files. ", indexed));
        indexer.commit()?;

        let mut total_embedded = 0usize;

        #[cfg(feature = "embeddings")]
        if with_embeddings && !embedding_batch.is_empty() {
            // Only embed documents in a useful size window (50 B..50 KB):
            // tiny files carry little signal, huge ones are expensive.
            let filtered_batch: Vec<_> = embedding_batch
                .into_iter()
                .filter(|(_, content)| {
                    let len = content.len();
                    len >= 50 && len <= 50_000
                })
                .collect();

            if filtered_batch.is_empty() {
                self.log("No documents suitable for semantic indexing.");
            } else {
                use indicatif::{ProgressBar, ProgressStyle};

                let total_docs = filtered_batch.len() as u64;
                self.log(format!(
                    "Building semantic index for {} documents...",
                    total_docs
                ));

                self.embedding_model.preload()?;

                // Hide the progress bar when logs are forwarded to a UI channel.
                let pb = if self.log_tx.is_some() {
                    ProgressBar::hidden()
                } else {
                    ProgressBar::new(total_docs)
                };
                pb.set_style(
                    ProgressStyle::default_bar()
                        .template(" [{bar:40.cyan/blue}] {pos}/{len} ({percent}%)")
                        .unwrap()
                        .progress_chars("━╸─"),
                );
                pb.enable_steady_tick(std::time::Duration::from_millis(100));

                for chunk in filtered_batch.chunks(BATCH_SIZE) {
                    // Truncate long documents at a char boundary before embedding.
                    const EMBED_TRUNCATE: usize = 4096;
                    let texts: Vec<&str> = chunk
                        .iter()
                        .map(|(_, content)| {
                            if content.len() > EMBED_TRUNCATE {
                                let boundary = content.floor_char_boundary(EMBED_TRUNCATE);
                                &content[..boundary]
                            } else {
                                content.as_str()
                            }
                        })
                        .collect();

                    match self.embedding_model.embed_batch(&texts) {
                        Ok(embeddings) => {
                            for ((doc_id, _), embedding) in chunk.iter().zip(embeddings) {
                                if let Err(e) = self.vector_index.insert(doc_id, &embedding) {
                                    tracing::debug!(
                                        "Failed to insert embedding for {}: {}",
                                        doc_id,
                                        e
                                    );
                                }
                            }
                            total_embedded += chunk.len();
                            pb.set_position(total_embedded as u64);
                        }
                        Err(e) => {
                            // A failed batch is logged and skipped; the run continues.
                            tracing::warn!("Batch embedding failed: {}", e);
                            pb.inc(chunk.len() as u64);
                        }
                    }
                }

                pb.finish_and_clear();
                self.log(format!(" Indexed {} documents.", total_embedded));
                self.vector_index.save()?;
            }
        }

        #[cfg(not(feature = "embeddings"))]
        if with_embeddings {
            self.log("Warning: Semantic search feature not available in this build.");
        }

        let stats = walker.stats();

        // Persist run metadata; `workspace.json` also marks the workspace as indexed.
        let metadata = serde_json::json!({
            "workspace": self.root.to_string_lossy(),
            "indexed_at": chrono::Utc::now().to_rfc3339(),
            "files_indexed": indexed,
            "semantic": with_embeddings,
            "schema_version": index::SCHEMA_VERSION,
        });
        let metadata_path = self.index_path.join("workspace.json");
        if let Err(e) = std::fs::write(
            &metadata_path,
            serde_json::to_string_pretty(&metadata).unwrap_or_default(),
        ) {
            tracing::warn!("Failed to save workspace metadata: {}", e);
        }

        Ok(IndexStats {
            indexed,
            embedded: total_embedded,
            skipped,
            errors,
            unique_paths: stats.visited_paths,
            unchanged: 0,
            removed: 0,
        })
    }
428
    /// Builds a map of `relative path -> (mtime, doc_id)` for every live,
    /// non-chunk document currently in the Tantivy index, by scanning the
    /// fast-field columns of each segment directly (no query execution).
    ///
    /// Used by incremental indexing to classify files as unchanged, modified,
    /// or deleted. Returns an empty map if the index reader can't be built.
    pub fn build_indexed_files_map(&self) -> HashMap<String, (u64, String)> {
        let mut map = HashMap::new();

        let reader = match self.index.reader() {
            Ok(r) => r,
            Err(_) => return map,
        };

        let searcher = reader.searcher();

        for segment_reader in searcher.segment_readers() {
            let alive_bitset = segment_reader.alive_bitset();
            let fast_fields = segment_reader.fast_fields();

            // All four columns must exist in the segment; otherwise skip it.
            let path_col = match fast_fields.str("path") {
                Ok(Some(col)) => col,
                _ => continue,
            };
            let mtime_col = match fast_fields.u64("mtime") {
                Ok(col) => col,
                Err(_) => continue,
            };
            let chunk_id_col = match fast_fields.str("chunk_id") {
                Ok(Some(col)) => col,
                _ => continue,
            };
            let doc_id_col = match fast_fields.str("doc_id") {
                Ok(Some(col)) => col,
                _ => continue,
            };

            // Buffers reused across rows to avoid per-row allocations.
            let mut path_buf = String::new();
            let mut chunk_id_buf = String::new();
            let mut doc_id_buf = String::new();

            for row_id in 0..segment_reader.max_doc() {
                // Skip rows marked deleted in this segment.
                if let Some(bitset) = &alive_bitset {
                    if !bitset.is_alive(row_id) {
                        continue;
                    }
                }

                // Rows with a non-empty chunk_id are sub-document chunks;
                // only whole-file documents belong in the map.
                chunk_id_buf.clear();
                let mut is_chunk = false;
                for ord in chunk_id_col.term_ords(row_id) {
                    let _ = chunk_id_col.ord_to_str(ord, &mut chunk_id_buf);
                    if !chunk_id_buf.is_empty() {
                        is_chunk = true;
                        break;
                    }
                }
                if is_chunk {
                    continue;
                }

                path_buf.clear();
                for ord in path_col.term_ords(row_id) {
                    let _ = path_col.ord_to_str(ord, &mut path_buf);
                }
                if path_buf.is_empty() {
                    continue;
                }

                // First mtime value for the row; 0 when absent.
                let mtime_val = mtime_col.values_for_doc(row_id).next().unwrap_or(0);

                doc_id_buf.clear();
                for ord in doc_id_col.term_ords(row_id) {
                    let _ = doc_id_col.ord_to_str(ord, &mut doc_id_buf);
                }

                map.insert(path_buf.clone(), (mtime_val, doc_id_buf.clone()));
            }
        }

        map
    }
514
    /// Incrementally updates the index: re-indexes only files whose mtime
    /// changed, removes documents for files that disappeared, and (optionally,
    /// with the `embeddings` feature) embeds the changed documents.
    #[allow(unused_variables)]
    pub fn index_incremental_with_options(&self, with_embeddings: bool) -> Result<IndexStats> {
        // Snapshot of what is currently indexed: rel path -> (mtime, doc_id).
        // Entries are removed as files are visited; leftovers are deletions.
        let mut indexed_map = self.build_indexed_files_map();

        let indexer =
            index::Indexer::new(self.config.indexer.clone(), self.index.clone(), &self.root)?;

        let mut walker = fs::FileWalker::new(self.root.clone(), self.config.indexer.clone())?;

        let mut indexed = 0;
        let mut skipped = 0;
        let mut errors = 0;
        let mut unchanged = 0;

        #[cfg(feature = "embeddings")]
        let mut embedding_batch: Vec<(String, String)> = Vec::new();
        #[cfg(feature = "embeddings")]
        const BATCH_SIZE: usize = 64;

        for entry in walker.walk() {
            // Index stores workspace-relative paths.
            let rel_path = entry
                .path
                .strip_prefix(&self.root)
                .unwrap_or(&entry.path)
                .to_string_lossy()
                .to_string();

            // Filesystem mtime in whole seconds since the epoch; 0 when unavailable.
            let current_mtime = std::fs::metadata(&entry.path)
                .ok()
                .and_then(|m| m.modified().ok())
                .and_then(|t| t.duration_since(std::time::UNIX_EPOCH).ok())
                .map(|d| d.as_secs())
                .unwrap_or(0);

            // Already indexed with the same mtime: nothing to do.
            if let Some((stored_mtime, _stored_doc_id)) = indexed_map.remove(&rel_path) {
                if stored_mtime == current_mtime {
                    unchanged += 1;
                    continue;
                }
            }
            match indexer.index_file(&entry.path) {
                Ok((doc_id, content)) => {
                    indexed += 1;
                    // Lightweight progress output every 500 files.
                    if indexed % 500 == 0 {
                        self.log_inline(format!("\r Indexed {} files... ", indexed));
                    }

                    #[cfg(feature = "embeddings")]
                    if with_embeddings {
                        embedding_batch.push((doc_id, content));
                    }
                    #[cfg(not(feature = "embeddings"))]
                    {
                        let _ = doc_id;
                        let _ = content;
                    }
                }
                // Oversized files are skipped, not counted as errors.
                Err(YgrepError::FileTooLarge { .. }) => {
                    skipped += 1;
                }
                Err(e) => {
                    tracing::debug!("Error indexing {}: {}", entry.path.display(), e);
                    errors += 1;
                }
            }
        }

        if indexed > 0 {
            self.log(format!("\r Indexed {} files. ", indexed));
        }

        // Whatever remains in the map was indexed before but no longer exists
        // on disk: remove it from the text index (and mark vectors deleted).
        let removed = indexed_map.len();
        for (deleted_path, (_mtime, doc_id)) in &indexed_map {
            indexer.delete_by_path(deleted_path)?;

            #[cfg(feature = "embeddings")]
            if !doc_id.is_empty() {
                self.vector_index.mark_deleted(doc_id);
            }
        }

        #[cfg(feature = "embeddings")]
        if removed > 0 {
            if let Err(e) = self.vector_index.save() {
                tracing::debug!("Failed to save vector index after removals: {}", e);
            }
        }

        indexer.commit()?;

        let mut total_embedded = 0usize;

        #[cfg(feature = "embeddings")]
        if with_embeddings && !embedding_batch.is_empty() {
            // Same size window as the full rebuild: skip tiny and huge docs.
            let filtered_batch: Vec<_> = embedding_batch
                .into_iter()
                .filter(|(_, content)| {
                    let len = content.len();
                    len >= 50 && len <= 50_000
                })
                .collect();

            if !filtered_batch.is_empty() {
                use indicatif::{ProgressBar, ProgressStyle};

                let total_docs = filtered_batch.len() as u64;
                self.log(format!(
                    "Building semantic index for {} changed documents...",
                    total_docs
                ));

                self.embedding_model.preload()?;

                // Hide the progress bar when logs are forwarded to a UI channel.
                let pb = if self.log_tx.is_some() {
                    ProgressBar::hidden()
                } else {
                    ProgressBar::new(total_docs)
                };
                pb.set_style(
                    ProgressStyle::default_bar()
                        .template(" [{bar:40.cyan/blue}] {pos}/{len} ({percent}%)")
                        .unwrap()
                        .progress_chars("━╸─"),
                );
                pb.enable_steady_tick(std::time::Duration::from_millis(100));

                for chunk in filtered_batch.chunks(BATCH_SIZE) {
                    // Truncate long documents at a char boundary before embedding.
                    const EMBED_TRUNCATE: usize = 4096;
                    let texts: Vec<&str> = chunk
                        .iter()
                        .map(|(_, content)| {
                            if content.len() > EMBED_TRUNCATE {
                                let boundary = content.floor_char_boundary(EMBED_TRUNCATE);
                                &content[..boundary]
                            } else {
                                content.as_str()
                            }
                        })
                        .collect();

                    match self.embedding_model.embed_batch(&texts) {
                        Ok(embeddings) => {
                            for ((doc_id, _), embedding) in chunk.iter().zip(embeddings) {
                                if let Err(e) = self.vector_index.insert(doc_id, &embedding) {
                                    tracing::debug!(
                                        "Failed to insert embedding for {}: {}",
                                        doc_id,
                                        e
                                    );
                                }
                            }
                            total_embedded += chunk.len();
                            pb.set_position(total_embedded as u64);
                        }
                        Err(e) => {
                            // A failed batch is logged and skipped; the run continues.
                            tracing::warn!("Batch embedding failed: {}", e);
                            pb.inc(chunk.len() as u64);
                        }
                    }
                }

                pb.finish_and_clear();
                self.log(format!(" Indexed {} documents.", total_embedded));
                self.vector_index.save()?;
            }
        }

        #[cfg(not(feature = "embeddings"))]
        if with_embeddings {
            self.log("Warning: Semantic search feature not available in this build.");
        }

        let walk_stats = walker.stats();

        // `files_indexed` reflects the full current file count, not just this run.
        let total_files = unchanged + indexed;
        let metadata = serde_json::json!({
            "workspace": self.root.to_string_lossy(),
            "indexed_at": chrono::Utc::now().to_rfc3339(),
            "files_indexed": total_files,
            "semantic": with_embeddings,
            "schema_version": index::SCHEMA_VERSION,
        });
        let metadata_path = self.index_path.join("workspace.json");
        if let Err(e) = std::fs::write(
            &metadata_path,
            serde_json::to_string_pretty(&metadata).unwrap_or_default(),
        ) {
            tracing::warn!("Failed to save workspace metadata: {}", e);
        }

        Ok(IndexStats {
            indexed,
            embedded: total_embedded,
            skipped,
            errors,
            unique_paths: walk_stats.visited_paths,
            unchanged,
            removed,
        })
    }
731
732 pub fn search(&self, query: &str, limit: Option<usize>) -> Result<search::SearchResult> {
734 let searcher = search::Searcher::new(self.config.search.clone(), self.index.clone());
735 searcher.search(query, limit, false, None, None)
736 }
737
738 pub fn search_filtered(
740 &self,
741 query: &str,
742 limit: Option<usize>,
743 extensions: Option<Vec<String>>,
744 paths: Option<Vec<String>>,
745 use_regex: bool,
746 case_sensitive: bool,
747 context_before: Option<usize>,
748 context_after: Option<usize>,
749 verbose: bool,
750 ) -> Result<search::SearchResult> {
751 let searcher = search::Searcher::new(self.config.search.clone(), self.index.clone());
752 let filters = search::SearchFilters { extensions, paths };
753 searcher.search_filtered(
754 query,
755 limit,
756 filters,
757 use_regex,
758 case_sensitive,
759 context_before,
760 context_after,
761 verbose,
762 )
763 }
764
    /// Hybrid search that combines full-text and semantic (vector) results,
    /// sharing the workspace's vector index, model, and embedding cache.
    #[cfg(feature = "embeddings")]
    pub fn search_hybrid(&self, query: &str, limit: Option<usize>) -> Result<search::SearchResult> {
        let searcher = search::HybridSearcher::new(
            self.config.search.clone(),
            self.index.clone(),
            self.vector_index.clone(),
            self.embedding_model.clone(),
            self.embedding_cache.clone(),
        );
        searcher.search(query, limit)
    }
777
    /// True when the vector index contains at least one embedding.
    #[cfg(feature = "embeddings")]
    pub fn has_semantic_index(&self) -> bool {
        !self.vector_index.is_empty()
    }
783
    /// Always false: this build was compiled without embeddings support.
    #[cfg(not(feature = "embeddings"))]
    pub fn has_semantic_index(&self) -> bool {
        false
    }
789
    /// The canonicalized workspace root directory.
    pub fn root(&self) -> &Path {
        &self.root
    }
794
    /// The directory where this workspace's index files are stored.
    pub fn index_path(&self) -> &Path {
        &self.index_path
    }
799
    /// Whether a completed indexing run exists for this workspace
    /// (`workspace.json` is written at the end of each run).
    pub fn is_indexed(&self) -> bool {
        self.index_path.join("workspace.json").exists()
    }
805
806 pub fn index_file(&self, path: &Path) -> Result<()> {
809 let indexer =
811 index::Indexer::new(self.config.indexer.clone(), self.index.clone(), &self.root)?;
812
813 match indexer.index_file(path) {
814 Ok((_doc_id, _content)) => {
815 indexer.commit()?;
816 tracing::debug!("Indexed: {}", path.display());
817 Ok(())
818 }
819 Err(YgrepError::FileTooLarge { .. }) => {
820 tracing::debug!("Skipped (too large): {}", path.display());
821 Ok(())
822 }
823 Err(e) => Err(e),
824 }
825 }
826
    /// Removes a file's documents from the full-text index by its
    /// workspace-relative path, using a short-lived dedicated writer.
    ///
    /// NOTE(review): creating a fresh 50 MB writer per delete acquires the
    /// Tantivy writer lock — confirm this is never called while another
    /// writer (e.g. a shared indexer) is alive.
    pub fn delete_file(&self, path: &Path) -> Result<()> {
        use tantivy::Term;

        // The index stores workspace-relative paths; fall back to the raw
        // path if it isn't under the root.
        let relative_path = path
            .strip_prefix(&self.root)
            .unwrap_or(path)
            .to_string_lossy();

        let schema = self.index.schema();
        let path_field = schema
            .get_field("path")
            .map_err(|_| YgrepError::Config("path field not found in schema".to_string()))?;

        let term = Term::from_field_text(path_field, &relative_path);

        let mut writer = self.index.writer::<tantivy::TantivyDocument>(50_000_000)?;
        writer.delete_term(term);
        writer.commit()?;

        tracing::debug!("Deleted from index: {}", path.display());
        Ok(())
    }
851
    /// Creates a filesystem watcher over the workspace root, configured with
    /// the same indexer settings (e.g. ignore patterns) as this workspace.
    pub fn create_watcher(&self) -> Result<FileWatcher> {
        FileWatcher::new(self.root.clone(), self.config.indexer.clone())
    }
856
    /// The indexer section of this workspace's configuration.
    pub fn indexer_config(&self) -> &config::IndexerConfig {
        &self.config.indexer
    }
861
    /// Whether the stored index was built with semantic embeddings, as
    /// recorded in `workspace.json`; `None` if the metadata is missing,
    /// unreadable, or lacks the flag.
    pub fn stored_semantic_flag(&self) -> Option<bool> {
        self.read_metadata()
            .and_then(|v| v.get("semantic").and_then(|s| s.as_bool()))
    }
868
    /// The schema version recorded in `workspace.json`, used to detect
    /// indexes built by an incompatible version; `None` when unavailable.
    pub fn stored_schema_version(&self) -> Option<u32> {
        self.read_metadata()
            .and_then(|v| v.get("schema_version").and_then(|s| s.as_u64()))
            .map(|v| v as u32)
    }
876
877 fn read_metadata(&self) -> Option<serde_json::Value> {
879 let metadata_path = self.index_path.join("workspace.json");
880 if metadata_path.exists() {
881 std::fs::read_to_string(&metadata_path)
882 .ok()
883 .and_then(|s| serde_json::from_str::<serde_json::Value>(&s).ok())
884 } else {
885 None
886 }
887 }
888
    /// Indexes a single file with a freshly created indexer, optionally
    /// generating its embedding (see `index_file_with_indexer`).
    #[allow(unused_variables)]
    pub fn index_file_with_options(&self, path: &Path, with_embeddings: bool) -> Result<()> {
        let indexer =
            index::Indexer::new(self.config.indexer.clone(), self.index.clone(), &self.root)?;
        self.index_file_with_indexer(&indexer, path, with_embeddings)
    }
897
898 #[allow(unused_variables)]
900 pub fn index_file_with_indexer(
901 &self,
902 indexer: &index::Indexer,
903 path: &Path,
904 with_embeddings: bool,
905 ) -> Result<()> {
906 match indexer.index_file(path) {
907 Ok((doc_id, content)) => {
908 indexer.commit()?;
909 tracing::debug!("Indexed: {}", path.display());
910
911 #[cfg(feature = "embeddings")]
913 if with_embeddings {
914 let len = content.len();
916 if (50..=50_000).contains(&len) {
917 const EMBED_TRUNCATE: usize = 4096;
919 let text = if content.len() > EMBED_TRUNCATE {
920 let boundary = content.floor_char_boundary(EMBED_TRUNCATE);
921 &content[..boundary]
922 } else {
923 content.as_str()
924 };
925
926 match self.embedding_model.embed(text) {
927 Ok(embedding) => {
928 if let Err(e) = self.vector_index.insert(&doc_id, &embedding) {
929 tracing::debug!(
930 "Failed to insert embedding for {}: {}",
931 doc_id,
932 e
933 );
934 } else {
935 if let Err(e) = self.vector_index.save() {
937 tracing::debug!("Failed to save vector index: {}", e);
938 }
939 }
940 }
941 Err(e) => {
942 tracing::debug!(
943 "Failed to generate embedding for {}: {}",
944 doc_id,
945 e
946 );
947 }
948 }
949 }
950 }
951
952 #[cfg(not(feature = "embeddings"))]
953 {
954 let _ = doc_id;
955 let _ = content;
956 }
957
958 Ok(())
959 }
960 Err(YgrepError::FileTooLarge { .. }) => {
961 tracing::debug!("Skipped (too large): {}", path.display());
962 Ok(())
963 }
964 Err(e) => Err(e),
965 }
966 }
967
    /// Creates a long-lived indexer that callers can reuse across many
    /// add/delete operations (avoids re-acquiring the writer each time).
    pub fn create_indexer(&self) -> Result<index::Indexer> {
        index::Indexer::new(self.config.indexer.clone(), self.index.clone(), &self.root)
    }
972
    /// Creates an indexer via the no-merge constructor, intended for watch
    /// mode's frequent small commits. NOTE(review): merge-policy details live
    /// in `index::Indexer::new_no_merge` — confirm there.
    pub fn create_watch_indexer(&self) -> Result<index::Indexer> {
        index::Indexer::new_no_merge(self.config.indexer.clone(), self.index.clone(), &self.root)
    }
979
    /// Indexes a single file via a shared indexer WITHOUT committing, so the
    /// caller can batch many files into one commit (see `commit_indexer`).
    ///
    /// With embeddings enabled, the vector is inserted but the vector index
    /// is not saved here either. Oversized files are skipped silently.
    #[allow(unused_variables)]
    pub fn index_file_no_commit(
        &self,
        indexer: &index::Indexer,
        path: &Path,
        with_embeddings: bool,
    ) -> Result<()> {
        match indexer.index_file(path) {
            Ok((doc_id, content)) => {
                tracing::debug!("Staged: {}", path.display());

                #[cfg(feature = "embeddings")]
                if with_embeddings {
                    let len = content.len();
                    // Same size window as batch indexing: 50 B .. 50 KB.
                    if (50..=50_000).contains(&len) {
                        // Truncate at a char boundary before embedding.
                        const EMBED_TRUNCATE: usize = 4096;
                        let text = if content.len() > EMBED_TRUNCATE {
                            let boundary = content.floor_char_boundary(EMBED_TRUNCATE);
                            &content[..boundary]
                        } else {
                            content.as_str()
                        };

                        match self.embedding_model.embed(text) {
                            Ok(embedding) => {
                                if let Err(e) = self.vector_index.insert(&doc_id, &embedding) {
                                    tracing::debug!(
                                        "Failed to insert embedding for {}: {}",
                                        doc_id,
                                        e
                                    );
                                }
                            }
                            Err(e) => {
                                // Embedding failure is non-fatal: the text
                                // index still receives the document.
                                tracing::debug!(
                                    "Failed to generate embedding for {}: {}",
                                    doc_id,
                                    e
                                );
                            }
                        }
                    }
                }

                #[cfg(not(feature = "embeddings"))]
                {
                    let _ = doc_id;
                    let _ = content;
                }

                Ok(())
            }
            Err(YgrepError::FileTooLarge { .. }) => {
                tracing::debug!("Skipped (too large): {}", path.display());
                Ok(())
            }
            Err(e) => Err(e),
        }
    }
1040
1041 pub fn delete_file_no_commit(&self, indexer: &index::Indexer, path: &Path) -> Result<()> {
1043 let relative_path = path
1044 .strip_prefix(&self.root)
1045 .unwrap_or(path)
1046 .to_string_lossy();
1047
1048 indexer.delete_by_path(&relative_path)?;
1049 tracing::debug!("Staged delete: {}", path.display());
1050 Ok(())
1051 }
1052
    /// Commits all staged adds/deletes on a shared indexer, making them
    /// visible to subsequent searches.
    pub fn commit_indexer(&self, indexer: &index::Indexer) -> Result<()> {
        indexer.commit()
    }
1057
1058 pub fn delete_file_with_indexer(&self, indexer: &index::Indexer, path: &Path) -> Result<()> {
1060 let relative_path = path
1061 .strip_prefix(&self.root)
1062 .unwrap_or(path)
1063 .to_string_lossy();
1064
1065 indexer.delete_by_path(&relative_path)?;
1066 indexer.commit()?;
1067
1068 tracing::debug!("Deleted from index: {}", path.display());
1069 Ok(())
1070 }
1071}
1072
/// Statistics for a single indexing run (full or incremental).
#[derive(Debug, Clone, Default)]
pub struct IndexStats {
    /// Files (re)indexed during this run.
    pub indexed: usize,
    /// Documents embedded into the vector index during this run.
    pub embedded: usize,
    /// Files skipped (e.g. too large to index).
    pub skipped: usize,
    /// Files that failed to index.
    pub errors: usize,
    /// Unique paths visited by the file walker.
    pub unique_paths: usize,
    /// Files left untouched because their mtime was unchanged (incremental runs only).
    pub unchanged: usize,
    /// Previously indexed files no longer on disk that were deleted (incremental runs only).
    pub removed: usize,
}
1084
1085fn hash_path(path: &Path) -> String {
1087 use xxhash_rust::xxh3::xxh3_64;
1088 let hash = xxh3_64(path.to_string_lossy().as_bytes());
1089 format!("{:016x}", hash)
1090}
1091
#[cfg(test)]
mod tests {
    use super::*;
    use tempfile::tempdir;

    // A freshly created workspace points at an existing root directory.
    #[test]
    fn test_workspace_open() -> Result<()> {
        let temp_dir = tempdir().unwrap();

        std::fs::write(temp_dir.path().join("test.rs"), "fn main() {}").unwrap();

        let workspace = Workspace::create(temp_dir.path())?;
        assert!(workspace.root().exists());

        Ok(())
    }

    // Full indexing picks up files and makes their content searchable.
    #[test]
    fn test_workspace_index_and_search() -> Result<()> {
        let temp_dir = tempdir().unwrap();

        // Keep workspace files and index data in separate directories.
        let workspace_dir = temp_dir.path().join("workspace");
        std::fs::create_dir_all(&workspace_dir).unwrap();

        std::fs::write(
            workspace_dir.join("hello.rs"),
            "fn hello_world() { println!(\"Hello!\"); }",
        )
        .unwrap();
        std::fs::write(
            workspace_dir.join("goodbye.rs"),
            "fn goodbye_world() { println!(\"Bye!\"); }",
        )
        .unwrap();

        let mut config = Config::default();
        config.indexer.data_dir = temp_dir.path().join("data");
        config.indexer.ignore_patterns = vec![];

        let workspace = Workspace::create_with_config(&workspace_dir, config)?;

        let stats = workspace.index_all()?;
        assert!(
            stats.indexed >= 2,
            "Expected at least 2 indexed files, got {}",
            stats.indexed
        );

        let result = workspace.search("hello", None)?;
        assert!(!result.is_empty());
        assert!(result.hits.iter().any(|h| h.path.contains("hello")));

        Ok(())
    }

    // One long-lived indexer can serve many add and delete operations,
    // with each change visible to search after its commit.
    #[test]
    fn test_shared_indexer_multiple_files() -> Result<()> {
        let temp_dir = tempdir().unwrap();
        let workspace_dir = temp_dir.path().join("workspace");
        std::fs::create_dir_all(&workspace_dir).unwrap();

        std::fs::write(workspace_dir.join("initial.rs"), "fn initial() {}").unwrap();

        let mut config = Config::default();
        config.indexer.data_dir = temp_dir.path().join("data");
        config.indexer.ignore_patterns = vec![];

        let workspace = Workspace::create_with_config(&workspace_dir, config)?;
        workspace.index_all()?;

        let indexer = workspace.create_indexer()?;

        // Add 20 files through the shared indexer.
        for i in 0..20 {
            let filename = format!("file_{}.rs", i);
            let content = format!("fn func_{}() {{ /* content {} */ }}", i, i);
            std::fs::write(workspace_dir.join(&filename), &content).unwrap();
            workspace.index_file_with_indexer(&indexer, &workspace_dir.join(&filename), false)?;
        }

        // All 20 must be visible to search.
        for i in 0..20 {
            let query = format!("func_{}", i);
            let result = workspace.search(&query, None)?;
            assert!(
                !result.is_empty(),
                "File {} should be searchable after indexing with shared indexer",
                i
            );
        }

        // Delete the first 5 through the same indexer.
        for i in 0..5 {
            let path = workspace_dir.join(format!("file_{}.rs", i));
            workspace.delete_file_with_indexer(&indexer, &path)?;
        }

        let result = workspace.search("func_0", None)?;
        assert!(
            result.is_empty(),
            "Deleted file should not appear in search"
        );

        let result = workspace.search("func_10", None)?;
        assert!(
            !result.is_empty(),
            "Non-deleted file should still be searchable"
        );

        Ok(())
    }

    // Heavy write/rewrite/delete churn through one shared indexer must not
    // deadlock on the writer lock, and the latest content must win.
    #[test]
    fn test_shared_indexer_no_lock_contention() -> Result<()> {
        let temp_dir = tempdir().unwrap();
        let workspace_dir = temp_dir.path().join("workspace");
        std::fs::create_dir_all(&workspace_dir).unwrap();

        std::fs::write(workspace_dir.join("seed.rs"), "fn seed() {}").unwrap();

        let mut config = Config::default();
        config.indexer.data_dir = temp_dir.path().join("data");
        config.indexer.ignore_patterns = vec![];

        let workspace = Workspace::create_with_config(&workspace_dir, config)?;
        workspace.index_all()?;

        let indexer = workspace.create_indexer()?;

        // Write, rewrite, and (for even i) delete each file in quick succession.
        for i in 0..50 {
            let path = workspace_dir.join(format!("churn_{}.rs", i));

            std::fs::write(&path, format!("fn v1_{}() {{}}", i)).unwrap();
            workspace.index_file_with_indexer(&indexer, &path, false)?;

            std::fs::write(&path, format!("fn v2_{}() {{}}", i)).unwrap();
            workspace.index_file_with_indexer(&indexer, &path, false)?;

            if i % 2 == 0 {
                workspace.delete_file_with_indexer(&indexer, &path)?;
            }
        }

        let result = workspace.search("v2_1", None)?;
        assert!(
            !result.is_empty(),
            "Surviving file should have latest content"
        );

        let result = workspace.search("v2_0", None)?;
        assert!(result.is_empty(), "Deleted file should not appear");

        Ok(())
    }

    // The per-file convenience path (fresh indexer per call) still works
    // alongside the shared-indexer APIs.
    #[test]
    fn test_per_file_indexer_still_works() -> Result<()> {
        let temp_dir = tempdir().unwrap();
        let workspace_dir = temp_dir.path().join("workspace");
        std::fs::create_dir_all(&workspace_dir).unwrap();

        std::fs::write(workspace_dir.join("base.rs"), "fn base() {}").unwrap();

        let mut config = Config::default();
        config.indexer.data_dir = temp_dir.path().join("data");
        config.indexer.ignore_patterns = vec![];

        let workspace = Workspace::create_with_config(&workspace_dir, config)?;
        workspace.index_all()?;

        let path = workspace_dir.join("standalone.rs");
        std::fs::write(&path, "fn standalone_function() {}").unwrap();
        workspace.index_file_with_options(&path, false)?;

        let result = workspace.search("standalone_function", None)?;
        assert!(
            !result.is_empty(),
            "File indexed via index_file_with_options should be searchable"
        );

        Ok(())
    }

    // Staging many files with index_file_no_commit and committing once makes
    // them all visible; staged deletes and adds in a second batch also land.
    #[test]
    fn test_batched_commit_multiple_files() -> Result<()> {
        let temp_dir = tempdir().unwrap();
        let workspace_dir = temp_dir.path().join("workspace");
        std::fs::create_dir_all(&workspace_dir).unwrap();

        std::fs::write(workspace_dir.join("seed.rs"), "fn seed() {}").unwrap();

        let mut config = Config::default();
        config.indexer.data_dir = temp_dir.path().join("data");
        config.indexer.ignore_patterns = vec![];

        let workspace = Workspace::create_with_config(&workspace_dir, config)?;
        workspace.index_all()?;

        let indexer = workspace.create_indexer()?;

        // Stage 30 files without committing.
        for i in 0..30 {
            let path = workspace_dir.join(format!("batch_{}.rs", i));
            std::fs::write(&path, format!("fn batch_func_{}() {{}}", i)).unwrap();
            workspace.index_file_no_commit(&indexer, &path, false)?;
        }

        workspace.commit_indexer(&indexer)?;

        for i in 0..30 {
            let result = workspace.search(&format!("batch_func_{}", i), None)?;
            assert!(
                !result.is_empty(),
                "File {} should be searchable after batched commit",
                i
            );
        }

        // Second batch: stage 10 deletes and 10 new adds, then one commit.
        for i in 0..10 {
            let path = workspace_dir.join(format!("batch_{}.rs", i));
            workspace.delete_file_no_commit(&indexer, &path)?;
        }
        for i in 30..40 {
            let path = workspace_dir.join(format!("batch_{}.rs", i));
            std::fs::write(&path, format!("fn batch_func_{}() {{}}", i)).unwrap();
            workspace.index_file_no_commit(&indexer, &path, false)?;
        }

        workspace.commit_indexer(&indexer)?;

        let result = workspace.search("batch_func_0", None)?;
        assert!(result.is_empty(), "Deleted file should not appear");

        let result = workspace.search("batch_func_15", None)?;
        assert!(!result.is_empty(), "Surviving file should be searchable");

        let result = workspace.search("batch_func_35", None)?;
        assert!(!result.is_empty(), "Newly added file should be searchable");

        Ok(())
    }

    // Repeated batches of staged adds plus deletes of the previous batch,
    // each followed by a single commit, converge to the expected index state.
    #[test]
    fn test_batched_commit_heavy_churn() -> Result<()> {
        let temp_dir = tempdir().unwrap();
        let workspace_dir = temp_dir.path().join("workspace");
        std::fs::create_dir_all(&workspace_dir).unwrap();

        std::fs::write(workspace_dir.join("seed.rs"), "fn seed() {}").unwrap();

        let mut config = Config::default();
        config.indexer.data_dir = temp_dir.path().join("data");
        config.indexer.ignore_patterns = vec![];

        let workspace = Workspace::create_with_config(&workspace_dir, config)?;
        workspace.index_all()?;

        let indexer = workspace.create_indexer()?;

        for batch in 0..5 {
            // Stage 20 new files per batch.
            for i in 0..20 {
                let idx = batch * 20 + i;
                let path = workspace_dir.join(format!("churn_{}.rs", idx));

                std::fs::write(&path, format!("fn churn_v{}_{} () {{}}", batch, idx)).unwrap();
                workspace.index_file_no_commit(&indexer, &path, false)?;
            }

            // Stage deletion of half of the previous batch.
            if batch > 0 {
                for i in 0..10 {
                    let idx = (batch - 1) * 20 + i;
                    let path = workspace_dir.join(format!("churn_{}.rs", idx));
                    workspace.delete_file_no_commit(&indexer, &path)?;
                }
            }

            workspace.commit_indexer(&indexer)?;
        }

        let result = workspace.search("churn_v4_80", None)?;
        assert!(!result.is_empty(), "Latest batch file should be searchable");

        let result = workspace.search("churn_v0_0", None)?;
        assert!(
            result.is_empty(),
            "Deleted file from early batch should be gone"
        );

        Ok(())
    }
}