1pub mod config;
11pub mod dashboard;
12#[cfg(feature = "embeddings")]
13pub mod embeddings;
14pub mod error;
15pub mod fs;
16pub mod index;
17pub mod search;
18pub mod watcher;
19
20pub use config::Config;
21pub use error::{Result, YgrepError};
22pub use watcher::{FileWatcher, WatchEvent};
23
24use std::collections::HashMap;
25use std::path::Path;
26use tantivy::Index;
27
28#[cfg(feature = "embeddings")]
29use embeddings::{EmbeddingCache, EmbeddingModel};
30#[cfg(feature = "embeddings")]
31use index::VectorIndex;
32#[cfg(feature = "embeddings")]
33use std::sync::Arc;
34
#[cfg(feature = "embeddings")]
// Dimensionality of vectors produced by the bundled embedding model.
// assumes the model emits 384-d vectors (MiniLM-class) — TODO confirm against EmbeddingModel
const EMBEDDING_DIM: usize = 384;

/// Channel used to forward progress/log lines to a consumer (e.g. a dashboard)
/// instead of printing to stderr; see `Workspace::set_log_tx`.
pub type LogSender = std::sync::mpsc::Sender<String>;
41
/// An indexed workspace: a project root plus its on-disk full-text index
/// (and, with the `embeddings` feature, a vector index for semantic search).
pub struct Workspace {
    // Canonicalized project root directory.
    root: std::path::PathBuf,
    // Effective configuration (loaded from disk or injected by the caller).
    config: Config,
    // Tantivy full-text index handle.
    index: Index,
    // Per-workspace directory holding all index data (derived via `hash_path`).
    index_path: std::path::PathBuf,
    // Optional log channel; when `None`, log output goes to stderr.
    log_tx: Option<LogSender>,
    #[cfg(feature = "embeddings")]
    // Vector index backing hybrid/semantic search.
    vector_index: Arc<VectorIndex>,
    #[cfg(feature = "embeddings")]
    // Shared embedding model (loaded lazily; see `preload` call sites).
    embedding_model: Arc<EmbeddingModel>,
    #[cfg(feature = "embeddings")]
    // Cache of computed embeddings used by hybrid search.
    embedding_cache: Arc<EmbeddingCache>,
}
64
65impl Workspace {
    /// Opens an existing (already indexed) workspace using the default config.
    ///
    /// # Errors
    /// Fails with `YgrepError::Config` if the workspace has not been indexed.
    pub fn open(root: &Path) -> Result<Self> {
        let config = Config::load();
        Self::open_internal(root, config, false)
    }

    /// Opens an existing workspace with an explicit configuration.
    pub fn open_with_config(root: &Path, config: Config) -> Result<Self> {
        Self::open_internal(root, config, false)
    }

    /// Creates (or re-creates) the index for a workspace using the default config.
    pub fn create(root: &Path) -> Result<Self> {
        let config = Config::load();
        Self::open_internal(root, config, true)
    }

    /// Creates (or re-creates) the index for a workspace with an explicit configuration.
    pub fn create_with_config(root: &Path, config: Config) -> Result<Self> {
        Self::open_internal(root, config, true)
    }
87
88 fn open_internal(root: &Path, config: Config, create: bool) -> Result<Self> {
91 let root = std::fs::canonicalize(root)?;
92
93 let local_ygrep = root.join(".ygrep");
98 let data_dir = if local_ygrep.is_dir() {
99 local_ygrep
100 } else if config.indexer.data_dir.is_relative() {
101 root.join(&config.indexer.data_dir)
102 } else {
103 config.indexer.data_dir.clone()
104 };
105
106 let workspace_hash = hash_path(&root);
107 let index_path = data_dir.join("indexes").join(&workspace_hash);
108
109 let workspace_indexed = index_path.join("workspace.json").exists();
111 let tantivy_exists = index_path.join("meta.json").exists();
113
114 if !create && !workspace_indexed {
116 return Err(YgrepError::Config(format!(
117 "Workspace not indexed: {}",
118 root.display()
119 )));
120 }
121
122 let schema = index::build_document_schema();
124
125 if index_path.exists() {
130 let probe = index_path.join(".ygrep-write-probe");
131 match std::fs::write(&probe, b"") {
132 Ok(()) => {
133 let _ = std::fs::remove_file(&probe);
134 }
135 Err(e) if e.kind() == std::io::ErrorKind::PermissionDenied => {
136 return Err(YgrepError::Config(format!(
137 "Index directory is not writable: {}\n\n\
138 Hint: Create a .ygrep/ directory in your project root for local indexes,\n\
139 or set YGREP_HOME to a writable location.",
140 index_path.display()
141 )));
142 }
143 Err(_) => {} }
145 }
146
147 if !create {
152 let _ = std::fs::remove_file(index_path.join(".tantivy-meta.lock"));
153 let _ = std::fs::remove_file(index_path.join(".tantivy-writer.lock"));
154 }
155
156 let index = if tantivy_exists {
157 match Index::open_in_dir(&index_path) {
158 Ok(idx) => idx,
159 Err(e) if create => {
160 std::fs::remove_dir_all(&index_path)?;
163 std::fs::create_dir_all(&index_path)?;
164 Index::create_in_dir(&index_path, schema)?
165 }
166 Err(e) => return Err(e.into()),
167 }
168 } else {
169 std::fs::create_dir_all(&index_path)?;
171 Index::create_in_dir(&index_path, schema)?
172 };
173
174 index::register_tokenizers(index.tokenizers());
176
177 #[cfg(feature = "embeddings")]
178 let (vector_index, embedding_model, embedding_cache) = {
179 let vector_path = index_path.join("vectors");
181
182 let vector_index = if VectorIndex::exists(&vector_path) {
184 match VectorIndex::load(vector_path.clone()) {
185 Ok(vi) => Arc::new(vi),
186 Err(_e) => {
187 if vector_path.exists() {
190 let _ = std::fs::remove_dir_all(&vector_path);
191 }
192 Arc::new(VectorIndex::new(vector_path, EMBEDDING_DIM)?)
193 }
194 }
195 } else {
196 Arc::new(VectorIndex::new(vector_path, EMBEDDING_DIM)?)
197 };
198
199 let embedding_model = Arc::new(EmbeddingModel::default()); let embedding_cache = Arc::new(EmbeddingCache::new(100, EMBEDDING_DIM));
204
205 (vector_index, embedding_model, embedding_cache)
206 };
207
208 Ok(Self {
209 root,
210 config,
211 index,
212 index_path,
213 log_tx: None,
214 #[cfg(feature = "embeddings")]
215 vector_index,
216 #[cfg(feature = "embeddings")]
217 embedding_model,
218 #[cfg(feature = "embeddings")]
219 embedding_cache,
220 })
221 }
222
    /// Routes subsequent log output to `tx` instead of stderr.
    ///
    /// Also quiets the embedding model's own console output, since a log
    /// channel implies another component (e.g. the dashboard) owns the terminal.
    pub fn set_log_tx(&mut self, tx: LogSender) {
        self.log_tx = Some(tx);
        #[cfg(feature = "embeddings")]
        self.embedding_model.set_quiet(true);
    }
229
230 fn log(&self, msg: impl std::fmt::Display) {
232 if let Some(ref tx) = self.log_tx {
233 let _ = tx.send(msg.to_string());
234 } else {
235 eprintln!("{}", msg);
236 }
237 }
238
239 fn log_inline(&self, msg: impl std::fmt::Display) {
241 if let Some(ref tx) = self.log_tx {
242 let _ = tx.send(msg.to_string());
243 } else {
244 eprint!("{}", msg);
245 }
246 }
247
    /// Performs a full reindex without building embeddings.
    /// Shorthand for `index_all_with_options(false)`.
    pub fn index_all(&self) -> Result<IndexStats> {
        self.index_all_with_options(false)
    }
252
    /// Rebuilds the full-text index from scratch for every file the walker
    /// yields, optionally also (re)building the semantic vector index.
    ///
    /// Progress goes through `log`/`log_inline`; per-file failures are counted
    /// in the returned `IndexStats` rather than aborting the run.
    ///
    /// # Errors
    /// Fails on indexer construction/commit, walker setup, embedding-model
    /// preload, or vector-index save errors.
    #[allow(unused_variables)]
    pub fn index_all_with_options(&self, with_embeddings: bool) -> Result<IndexStats> {
        // NOTE(review): the vector index is cleared even when `with_embeddings`
        // is false — confirm that discarding any existing semantic index on a
        // plain full reindex is intended.
        #[cfg(feature = "embeddings")]
        self.vector_index.clear();

        let indexer =
            index::Indexer::new(self.config.indexer.clone(), self.index.clone(), &self.root)?;

        let mut walker = fs::FileWalker::new(self.root.clone(), self.config.indexer.clone())?;

        let mut indexed = 0;
        let mut skipped = 0; // files rejected as too large
        let mut errors = 0;

        // Buffer (doc_id, content) pairs so embeddings can be computed in
        // batches after the text pass completes.
        #[cfg(feature = "embeddings")]
        let mut embedding_batch: Vec<(String, String)> = Vec::new();
        #[cfg(feature = "embeddings")]
        const BATCH_SIZE: usize = 64;

        for entry in walker.walk() {
            match indexer.index_file(&entry.path) {
                Ok((doc_id, content)) => {
                    indexed += 1;
                    if indexed % 500 == 0 {
                        // `\r` keeps progress on a single terminal line.
                        self.log_inline(format!("\r Indexed {} files... ", indexed));
                    }

                    #[cfg(feature = "embeddings")]
                    if with_embeddings {
                        embedding_batch.push((doc_id, content));
                    }
                    // Without the feature, consume the tuple to avoid warnings.
                    #[cfg(not(feature = "embeddings"))]
                    {
                        let _ = doc_id;
                        let _ = content;
                    }
                }
                Err(YgrepError::FileTooLarge { .. }) => {
                    skipped += 1;
                }
                Err(e) => {
                    tracing::debug!("Error indexing {}: {}", entry.path.display(), e);
                    errors += 1;
                }
            }
        }

        self.log(format!("\r Indexed {} files. ", indexed));
        indexer.commit()?;

        let mut total_embedded = 0usize;

        #[cfg(feature = "embeddings")]
        if with_embeddings && !embedding_batch.is_empty() {
            // Drop documents with too little signal or excessive size.
            let filtered_batch: Vec<_> = embedding_batch
                .into_iter()
                .filter(|(_, content)| {
                    let len = content.len();
                    len >= 50 && len <= 50_000
                })
                .collect();

            if filtered_batch.is_empty() {
                self.log("No documents suitable for semantic indexing.");
            } else {
                use indicatif::{ProgressBar, ProgressStyle};

                let total_docs = filtered_batch.len() as u64;
                self.log(format!(
                    "Building semantic index for {} documents...",
                    total_docs
                ));

                // Load model weights up front so the first batch isn't slow.
                self.embedding_model.preload()?;

                // Hide the progress bar when a log channel owns the output.
                let pb = if self.log_tx.is_some() {
                    ProgressBar::hidden()
                } else {
                    ProgressBar::new(total_docs)
                };
                pb.set_style(
                    ProgressStyle::default_bar()
                        .template(" [{bar:40.cyan/blue}] {pos}/{len} ({percent}%)")
                        .unwrap()
                        .progress_chars("━╸─"),
                );
                pb.enable_steady_tick(std::time::Duration::from_millis(100));

                for chunk in filtered_batch.chunks(BATCH_SIZE) {
                    // Cap per-document text sent to the model.
                    const EMBED_TRUNCATE: usize = 4096;
                    let texts: Vec<&str> = chunk
                        .iter()
                        .map(|(_, content)| {
                            if content.len() > EMBED_TRUNCATE {
                                // Truncate on a char boundary to keep valid UTF-8.
                                let boundary = content.floor_char_boundary(EMBED_TRUNCATE);
                                &content[..boundary]
                            } else {
                                content.as_str()
                            }
                        })
                        .collect();

                    match self.embedding_model.embed_batch(&texts) {
                        Ok(embeddings) => {
                            for ((doc_id, _), embedding) in chunk.iter().zip(embeddings) {
                                if let Err(e) = self.vector_index.insert(doc_id, &embedding) {
                                    tracing::debug!(
                                        "Failed to insert embedding for {}: {}",
                                        doc_id,
                                        e
                                    );
                                }
                            }
                            // NOTE(review): counts the whole chunk even when some
                            // inserts fail above — stats may over-report.
                            total_embedded += chunk.len();
                            pb.set_position(total_embedded as u64);
                        }
                        Err(e) => {
                            // A failed batch is skipped, not retried.
                            tracing::warn!("Batch embedding failed: {}", e);
                            pb.inc(chunk.len() as u64);
                        }
                    }
                }

                pb.finish_and_clear();
                self.log(format!(" Indexed {} documents.", total_embedded));
                self.vector_index.save()?;
            }
        }

        #[cfg(not(feature = "embeddings"))]
        if with_embeddings {
            self.log("Warning: Semantic search feature not available in this build.");
        }

        let stats = walker.stats();

        // Persist workspace metadata; failure is non-fatal since the index
        // itself is already committed.
        let metadata = serde_json::json!({
            "workspace": self.root.to_string_lossy(),
            "indexed_at": chrono::Utc::now().to_rfc3339(),
            "files_indexed": indexed,
            "semantic": with_embeddings,
            "schema_version": index::SCHEMA_VERSION,
        });
        let metadata_path = self.index_path.join("workspace.json");
        if let Err(e) = std::fs::write(
            &metadata_path,
            serde_json::to_string_pretty(&metadata).unwrap_or_default(),
        ) {
            tracing::warn!("Failed to save workspace metadata: {}", e);
        }

        Ok(IndexStats {
            indexed,
            embedded: total_embedded,
            skipped,
            errors,
            unique_paths: stats.visited_paths,
            unchanged: 0, // full reindex: nothing is skipped as unchanged
            removed: 0,
        })
    }
428
    /// Scans the live index and returns `relative_path -> (mtime, doc_id)` for
    /// every top-level (non-chunk) document, reading fast fields directly
    /// instead of running queries.
    ///
    /// Returns an empty map when the reader cannot be created; segments lacking
    /// a required fast-field column (e.g. from an older schema) are skipped.
    pub fn build_indexed_files_map(&self) -> HashMap<String, (u64, String)> {
        let mut map = HashMap::new();

        let reader = match self.index.reader() {
            Ok(r) => r,
            Err(_) => return map,
        };

        let searcher = reader.searcher();

        for segment_reader in searcher.segment_readers() {
            // Needed to skip documents deleted but not yet merged away.
            let alive_bitset = segment_reader.alive_bitset();
            let fast_fields = segment_reader.fast_fields();

            let path_col = match fast_fields.str("path") {
                Ok(Some(col)) => col,
                _ => continue,
            };
            let mtime_col = match fast_fields.u64("mtime") {
                Ok(col) => col,
                Err(_) => continue,
            };
            let chunk_id_col = match fast_fields.str("chunk_id") {
                Ok(Some(col)) => col,
                _ => continue,
            };
            let doc_id_col = match fast_fields.str("doc_id") {
                Ok(Some(col)) => col,
                _ => continue,
            };

            // Reused buffers to avoid a String allocation per row.
            let mut path_buf = String::new();
            let mut chunk_id_buf = String::new();
            let mut doc_id_buf = String::new();

            for row_id in 0..segment_reader.max_doc() {
                if let Some(bitset) = &alive_bitset {
                    if !bitset.is_alive(row_id) {
                        continue;
                    }
                }

                // A non-empty chunk_id marks a chunk document; only whole-file
                // documents carry the authoritative path/mtime pair.
                chunk_id_buf.clear();
                let mut is_chunk = false;
                for ord in chunk_id_col.term_ords(row_id) {
                    let _ = chunk_id_col.ord_to_str(ord, &mut chunk_id_buf);
                    if !chunk_id_buf.is_empty() {
                        is_chunk = true;
                        break;
                    }
                }
                if is_chunk {
                    continue;
                }

                path_buf.clear();
                for ord in path_col.term_ords(row_id) {
                    let _ = path_col.ord_to_str(ord, &mut path_buf);
                }
                if path_buf.is_empty() {
                    continue;
                }

                // Missing mtime degrades to 0 (treated as "always changed").
                let mtime_val = mtime_col.values_for_doc(row_id).next().unwrap_or(0);

                doc_id_buf.clear();
                for ord in doc_id_col.term_ords(row_id) {
                    let _ = doc_id_col.ord_to_str(ord, &mut doc_id_buf);
                }

                map.insert(path_buf.clone(), (mtime_val, doc_id_buf.clone()));
            }
        }

        map
    }
514
    /// Incrementally reindexes the workspace: only new files and files whose
    /// mtime changed are reindexed; files that disappeared from disk are
    /// removed from both the text index and (if present) the vector index.
    ///
    /// The comparison baseline comes from `build_indexed_files_map`; whatever
    /// remains in that map after the walk is treated as deleted.
    ///
    /// # Errors
    /// Fails on indexer/walker construction, deletion, commit, embedding-model
    /// preload, or vector-index save errors.
    #[allow(unused_variables)]
    pub fn index_incremental_with_options(&self, with_embeddings: bool) -> Result<IndexStats> {
        // path -> (mtime, doc_id) of everything currently indexed; entries are
        // removed below as the walker re-visits their files.
        let mut indexed_map = self.build_indexed_files_map();

        let indexer =
            index::Indexer::new(self.config.indexer.clone(), self.index.clone(), &self.root)?;

        let mut walker = fs::FileWalker::new(self.root.clone(), self.config.indexer.clone())?;

        let mut indexed = 0;
        let mut skipped = 0;
        let mut errors = 0;
        let mut unchanged = 0;

        #[cfg(feature = "embeddings")]
        let mut embedding_batch: Vec<(String, String)> = Vec::new();
        #[cfg(feature = "embeddings")]
        const BATCH_SIZE: usize = 64;

        for entry in walker.walk() {
            // The index stores workspace-relative paths.
            let rel_path = entry
                .path
                .strip_prefix(&self.root)
                .unwrap_or(&entry.path)
                .to_string_lossy()
                .to_string();

            // Seconds since epoch; unreadable metadata degrades to 0, which
            // forces a reindex of the file.
            let current_mtime = std::fs::metadata(&entry.path)
                .ok()
                .and_then(|m| m.modified().ok())
                .and_then(|t| t.duration_since(std::time::UNIX_EPOCH).ok())
                .map(|d| d.as_secs())
                .unwrap_or(0);

            // `remove` so that leftovers afterwards == files deleted on disk.
            if let Some((stored_mtime, _stored_doc_id)) = indexed_map.remove(&rel_path) {
                if stored_mtime == current_mtime {
                    unchanged += 1;
                    continue;
                }
                // NOTE(review): for a changed file the old doc's embedding is
                // not removed from the vector index before reinsertion —
                // confirm doc_id reuse makes this safe.
            }
            match indexer.index_file(&entry.path) {
                Ok((doc_id, content)) => {
                    indexed += 1;
                    if indexed % 500 == 0 {
                        self.log_inline(format!("\r Indexed {} files... ", indexed));
                    }

                    #[cfg(feature = "embeddings")]
                    if with_embeddings {
                        embedding_batch.push((doc_id, content));
                    }
                    #[cfg(not(feature = "embeddings"))]
                    {
                        let _ = doc_id;
                        let _ = content;
                    }
                }
                Err(YgrepError::FileTooLarge { .. }) => {
                    skipped += 1;
                }
                Err(e) => {
                    tracing::debug!("Error indexing {}: {}", entry.path.display(), e);
                    errors += 1;
                }
            }
        }

        if indexed > 0 {
            self.log(format!("\r Indexed {} files. ", indexed));
        }

        // Leftover map entries correspond to files deleted since the last run.
        let removed = indexed_map.len();
        for (deleted_path, (_mtime, doc_id)) in &indexed_map {
            indexer.delete_by_path(deleted_path)?;

            #[cfg(feature = "embeddings")]
            if !doc_id.is_empty() {
                self.vector_index.mark_deleted(doc_id);
            }
        }

        // Persist tombstones even if no embedding pass runs below.
        #[cfg(feature = "embeddings")]
        if removed > 0 {
            if let Err(e) = self.vector_index.save() {
                tracing::debug!("Failed to save vector index after removals: {}", e);
            }
        }

        indexer.commit()?;

        let mut total_embedded = 0usize;

        #[cfg(feature = "embeddings")]
        if with_embeddings && !embedding_batch.is_empty() {
            // Same size gate as the full-index path (50..=50_000 bytes).
            let filtered_batch: Vec<_> = embedding_batch
                .into_iter()
                .filter(|(_, content)| {
                    let len = content.len();
                    len >= 50 && len <= 50_000
                })
                .collect();

            if !filtered_batch.is_empty() {
                use indicatif::{ProgressBar, ProgressStyle};

                let total_docs = filtered_batch.len() as u64;
                self.log(format!(
                    "Building semantic index for {} changed documents...",
                    total_docs
                ));

                self.embedding_model.preload()?;

                // Progress bar only on a TTY (no log channel registered).
                let pb = if self.log_tx.is_some() {
                    ProgressBar::hidden()
                } else {
                    ProgressBar::new(total_docs)
                };
                pb.set_style(
                    ProgressStyle::default_bar()
                        .template(" [{bar:40.cyan/blue}] {pos}/{len} ({percent}%)")
                        .unwrap()
                        .progress_chars("━╸─"),
                );
                pb.enable_steady_tick(std::time::Duration::from_millis(100));

                for chunk in filtered_batch.chunks(BATCH_SIZE) {
                    // Cap text per document; cut on a char boundary (valid UTF-8).
                    const EMBED_TRUNCATE: usize = 4096;
                    let texts: Vec<&str> = chunk
                        .iter()
                        .map(|(_, content)| {
                            if content.len() > EMBED_TRUNCATE {
                                let boundary = content.floor_char_boundary(EMBED_TRUNCATE);
                                &content[..boundary]
                            } else {
                                content.as_str()
                            }
                        })
                        .collect();

                    match self.embedding_model.embed_batch(&texts) {
                        Ok(embeddings) => {
                            for ((doc_id, _), embedding) in chunk.iter().zip(embeddings) {
                                if let Err(e) = self.vector_index.insert(doc_id, &embedding) {
                                    tracing::debug!(
                                        "Failed to insert embedding for {}: {}",
                                        doc_id,
                                        e
                                    );
                                }
                            }
                            total_embedded += chunk.len();
                            pb.set_position(total_embedded as u64);
                        }
                        Err(e) => {
                            // Failed batches are skipped, not retried.
                            tracing::warn!("Batch embedding failed: {}", e);
                            pb.inc(chunk.len() as u64);
                        }
                    }
                }

                pb.finish_and_clear();
                self.log(format!(" Indexed {} documents.", total_embedded));
                self.vector_index.save()?;
            }
        }

        #[cfg(not(feature = "embeddings"))]
        if with_embeddings {
            self.log("Warning: Semantic search feature not available in this build.");
        }

        let walk_stats = walker.stats();

        // Metadata records the total live file count, not just this run's work.
        let total_files = unchanged + indexed;
        let metadata = serde_json::json!({
            "workspace": self.root.to_string_lossy(),
            "indexed_at": chrono::Utc::now().to_rfc3339(),
            "files_indexed": total_files,
            "semantic": with_embeddings,
            "schema_version": index::SCHEMA_VERSION,
        });
        let metadata_path = self.index_path.join("workspace.json");
        if let Err(e) = std::fs::write(
            &metadata_path,
            serde_json::to_string_pretty(&metadata).unwrap_or_default(),
        ) {
            tracing::warn!("Failed to save workspace metadata: {}", e);
        }

        Ok(IndexStats {
            indexed,
            embedded: total_embedded,
            skipped,
            errors,
            unique_paths: walk_stats.visited_paths,
            unchanged,
            removed,
        })
    }
731
    /// Plain full-text search with default options (no regex, no filters,
    /// no context lines).
    pub fn search(&self, query: &str, limit: Option<usize>) -> Result<search::SearchResult> {
        let searcher = search::Searcher::new(self.config.search.clone(), self.index.clone());
        searcher.search(query, limit, false, None, None)
    }

    /// Full-text search with extension/path filters, regex and case-sensitivity
    /// options, and optional context lines around each hit.
    pub fn search_filtered(
        &self,
        query: &str,
        limit: Option<usize>,
        extensions: Option<Vec<String>>,
        paths: Option<Vec<String>>,
        use_regex: bool,
        case_sensitive: bool,
        context_before: Option<usize>,
        context_after: Option<usize>,
        verbose: bool,
    ) -> Result<search::SearchResult> {
        let searcher = search::Searcher::new(self.config.search.clone(), self.index.clone());
        let filters = search::SearchFilters { extensions, paths };
        searcher.search_filtered(
            query,
            limit,
            filters,
            use_regex,
            case_sensitive,
            context_before,
            context_after,
            verbose,
        )
    }
764
    /// Hybrid search combining full-text results with the vector index via the
    /// shared embedding model and cache.
    #[cfg(feature = "embeddings")]
    pub fn search_hybrid(&self, query: &str, limit: Option<usize>) -> Result<search::SearchResult> {
        let searcher = search::HybridSearcher::new(
            self.config.search.clone(),
            self.index.clone(),
            self.vector_index.clone(),
            self.embedding_model.clone(),
            self.embedding_cache.clone(),
        );
        searcher.search(query, limit)
    }

    /// True when a non-empty semantic (vector) index is loaded.
    #[cfg(feature = "embeddings")]
    pub fn has_semantic_index(&self) -> bool {
        !self.vector_index.is_empty()
    }

    /// Builds without the `embeddings` feature never have a semantic index.
    #[cfg(not(feature = "embeddings"))]
    pub fn has_semantic_index(&self) -> bool {
        false
    }
789
    /// Canonicalized workspace root directory.
    pub fn root(&self) -> &Path {
        &self.root
    }

    /// Directory holding this workspace's index files.
    pub fn index_path(&self) -> &Path {
        &self.index_path
    }

    /// True once an index run has written `workspace.json` metadata.
    pub fn is_indexed(&self) -> bool {
        self.index_path.join("workspace.json").exists()
    }
805
806 pub fn index_file(&self, path: &Path) -> Result<()> {
809 let indexer =
811 index::Indexer::new(self.config.indexer.clone(), self.index.clone(), &self.root)?;
812
813 match indexer.index_file(path) {
814 Ok((_doc_id, _content)) => {
815 indexer.commit()?;
816 tracing::debug!("Indexed: {}", path.display());
817 Ok(())
818 }
819 Err(YgrepError::FileTooLarge { .. }) => {
820 tracing::debug!("Skipped (too large): {}", path.display());
821 Ok(())
822 }
823 Err(e) => Err(e),
824 }
825 }
826
827 pub fn delete_file(&self, path: &Path) -> Result<()> {
829 use tantivy::Term;
830
831 let relative_path = path
833 .strip_prefix(&self.root)
834 .unwrap_or(path)
835 .to_string_lossy();
836
837 let schema = self.index.schema();
838 let path_field = schema
839 .get_field("path")
840 .map_err(|_| YgrepError::Config("path field not found in schema".to_string()))?;
841
842 let term = Term::from_field_text(path_field, &relative_path);
843
844 let mut writer = self.index.writer::<tantivy::TantivyDocument>(50_000_000)?;
845 writer.delete_term(term);
846 writer.commit()?;
847
848 tracing::debug!("Deleted from index: {}", path.display());
849 Ok(())
850 }
851
    /// Creates a filesystem watcher rooted at the workspace, honoring the
    /// indexer's ignore configuration.
    pub fn create_watcher(&self) -> Result<FileWatcher> {
        FileWatcher::new(self.root.clone(), self.config.indexer.clone())
    }

    /// The effective indexer configuration.
    pub fn indexer_config(&self) -> &config::IndexerConfig {
        &self.config.indexer
    }

    /// Whether the last index run built embeddings, per stored metadata
    /// (`None` when metadata is missing or predates the field).
    pub fn stored_semantic_flag(&self) -> Option<bool> {
        self.read_metadata()
            .and_then(|v| v.get("semantic").and_then(|s| s.as_bool()))
    }

    /// Schema version recorded at index time; used to detect stale indexes.
    pub fn stored_schema_version(&self) -> Option<u32> {
        self.read_metadata()
            .and_then(|v| v.get("schema_version").and_then(|s| s.as_u64()))
            .map(|v| v as u32)
    }
876
877 fn read_metadata(&self) -> Option<serde_json::Value> {
879 let metadata_path = self.index_path.join("workspace.json");
880 if metadata_path.exists() {
881 std::fs::read_to_string(&metadata_path)
882 .ok()
883 .and_then(|s| serde_json::from_str::<serde_json::Value>(&s).ok())
884 } else {
885 None
886 }
887 }
888
    /// Indexes one file with a freshly constructed indexer; see
    /// `index_file_with_indexer` for the per-file behavior.
    #[allow(unused_variables)]
    pub fn index_file_with_options(&self, path: &Path, with_embeddings: bool) -> Result<()> {
        let indexer =
            index::Indexer::new(self.config.indexer.clone(), self.index.clone(), &self.root)?;
        self.index_file_with_indexer(&indexer, path, with_embeddings)
    }
897
    /// Indexes one file using a caller-supplied (reusable) indexer and commits.
    ///
    /// With `with_embeddings` (and the feature enabled), also embeds the
    /// content — subject to the same 50..=50_000-byte size window and 4 KiB
    /// truncation as bulk indexing — and persists the vector index.
    /// Embedding failures are best-effort: logged, never returned.
    ///
    /// Too-large files are skipped and reported as `Ok(())`.
    #[allow(unused_variables)]
    pub fn index_file_with_indexer(
        &self,
        indexer: &index::Indexer,
        path: &Path,
        with_embeddings: bool,
    ) -> Result<()> {
        match indexer.index_file(path) {
            Ok((doc_id, content)) => {
                indexer.commit()?;
                tracing::debug!("Indexed: {}", path.display());

                #[cfg(feature = "embeddings")]
                if with_embeddings {
                    let len = content.len();
                    // Same size gate as bulk semantic indexing.
                    if (50..=50_000).contains(&len) {
                        const EMBED_TRUNCATE: usize = 4096;
                        let text = if content.len() > EMBED_TRUNCATE {
                            // Truncate on a char boundary to keep valid UTF-8.
                            let boundary = content.floor_char_boundary(EMBED_TRUNCATE);
                            &content[..boundary]
                        } else {
                            content.as_str()
                        };

                        match self.embedding_model.embed(text) {
                            Ok(embedding) => {
                                if let Err(e) = self.vector_index.insert(&doc_id, &embedding) {
                                    tracing::debug!(
                                        "Failed to insert embedding for {}: {}",
                                        doc_id,
                                        e
                                    );
                                } else {
                                    // Persist immediately so a crash doesn't lose the vector.
                                    if let Err(e) = self.vector_index.save() {
                                        tracing::debug!("Failed to save vector index: {}", e);
                                    }
                                }
                            }
                            Err(e) => {
                                tracing::debug!(
                                    "Failed to generate embedding for {}: {}",
                                    doc_id,
                                    e
                                );
                            }
                        }
                    }
                }

                #[cfg(not(feature = "embeddings"))]
                {
                    let _ = doc_id;
                    let _ = content;
                }

                Ok(())
            }
            Err(YgrepError::FileTooLarge { .. }) => {
                tracing::debug!("Skipped (too large): {}", path.display());
                Ok(())
            }
            Err(e) => Err(e),
        }
    }
967
    /// Builds a reusable indexer for batched updates, avoiding the cost of
    /// re-acquiring a writer for every file.
    pub fn create_indexer(&self) -> Result<index::Indexer> {
        index::Indexer::new(self.config.indexer.clone(), self.index.clone(), &self.root)
    }
972
    /// Deletes a file's documents via a caller-supplied indexer and commits.
    /// Paths are stored (and therefore deleted) relative to the workspace root.
    pub fn delete_file_with_indexer(&self, indexer: &index::Indexer, path: &Path) -> Result<()> {
        let relative_path = path
            .strip_prefix(&self.root)
            .unwrap_or(path)
            .to_string_lossy();

        indexer.delete_by_path(&relative_path)?;
        indexer.commit()?;

        tracing::debug!("Deleted from index: {}", path.display());
        Ok(())
    }
986}
987
/// Summary counters from a full or incremental indexing run.
#[derive(Debug, Clone, Default)]
pub struct IndexStats {
    /// Files (re)indexed during this run.
    pub indexed: usize,
    /// Documents embedded into the vector index.
    pub embedded: usize,
    /// Files skipped (e.g. over the size limit).
    pub skipped: usize,
    /// Files that failed to index.
    pub errors: usize,
    /// Distinct paths visited by the walker.
    pub unique_paths: usize,
    /// Files left untouched by an incremental run (mtime unchanged).
    pub unchanged: usize,
    /// Previously indexed files no longer on disk (incremental runs only).
    pub removed: usize,
}
999
1000fn hash_path(path: &Path) -> String {
1002 use xxhash_rust::xxh3::xxh3_64;
1003 let hash = xxh3_64(path.to_string_lossy().as_bytes());
1004 format!("{:016x}", hash)
1005}
1006
#[cfg(test)]
mod tests {
    use super::*;
    use tempfile::tempdir;

    // Creating a workspace over a fresh directory succeeds and yields a
    // canonicalized, existing root.
    #[test]
    fn test_workspace_open() -> Result<()> {
        let temp_dir = tempdir().unwrap();

        std::fs::write(temp_dir.path().join("test.rs"), "fn main() {}").unwrap();

        let workspace = Workspace::create(temp_dir.path())?;
        assert!(workspace.root().exists());

        Ok(())
    }

    // End-to-end: index two files, then locate one via full-text search.
    #[test]
    fn test_workspace_index_and_search() -> Result<()> {
        let temp_dir = tempdir().unwrap();

        // Keep workspace content separate from the index data dir.
        let workspace_dir = temp_dir.path().join("workspace");
        std::fs::create_dir_all(&workspace_dir).unwrap();

        std::fs::write(
            workspace_dir.join("hello.rs"),
            "fn hello_world() { println!(\"Hello!\"); }",
        )
        .unwrap();
        std::fs::write(
            workspace_dir.join("goodbye.rs"),
            "fn goodbye_world() { println!(\"Bye!\"); }",
        )
        .unwrap();

        let mut config = Config::default();
        config.indexer.data_dir = temp_dir.path().join("data");
        config.indexer.ignore_patterns = vec![];

        let workspace = Workspace::create_with_config(&workspace_dir, config)?;

        let stats = workspace.index_all()?;
        assert!(
            stats.indexed >= 2,
            "Expected at least 2 indexed files, got {}",
            stats.indexed
        );

        let result = workspace.search("hello", None)?;
        assert!(!result.is_empty());
        assert!(result.hits.iter().any(|h| h.path.contains("hello")));

        Ok(())
    }

    // A single shared indexer must support many add/delete cycles with the
    // results visible to subsequent searches.
    #[test]
    fn test_shared_indexer_multiple_files() -> Result<()> {
        let temp_dir = tempdir().unwrap();
        let workspace_dir = temp_dir.path().join("workspace");
        std::fs::create_dir_all(&workspace_dir).unwrap();

        std::fs::write(workspace_dir.join("initial.rs"), "fn initial() {}").unwrap();

        let mut config = Config::default();
        config.indexer.data_dir = temp_dir.path().join("data");
        config.indexer.ignore_patterns = vec![];

        let workspace = Workspace::create_with_config(&workspace_dir, config)?;
        workspace.index_all()?;

        let indexer = workspace.create_indexer()?;

        // Add 20 files through the shared indexer.
        for i in 0..20 {
            let filename = format!("file_{}.rs", i);
            let content = format!("fn func_{}() {{ /* content {} */ }}", i, i);
            std::fs::write(workspace_dir.join(&filename), &content).unwrap();
            workspace.index_file_with_indexer(&indexer, &workspace_dir.join(&filename), false)?;
        }

        // Every file must be immediately searchable (commits are per file).
        for i in 0..20 {
            let query = format!("func_{}", i);
            let result = workspace.search(&query, None)?;
            assert!(
                !result.is_empty(),
                "File {} should be searchable after indexing with shared indexer",
                i
            );
        }

        // Delete the first five through the same indexer.
        for i in 0..5 {
            let path = workspace_dir.join(format!("file_{}.rs", i));
            workspace.delete_file_with_indexer(&indexer, &path)?;
        }

        let result = workspace.search("func_0", None)?;
        assert!(
            result.is_empty(),
            "Deleted file should not appear in search"
        );

        let result = workspace.search("func_10", None)?;
        assert!(
            !result.is_empty(),
            "Non-deleted file should still be searchable"
        );

        Ok(())
    }

    // Rapid write/reindex/delete churn through one shared indexer must not
    // deadlock on tantivy writer locks or serve stale content.
    #[test]
    fn test_shared_indexer_no_lock_contention() -> Result<()> {
        let temp_dir = tempdir().unwrap();
        let workspace_dir = temp_dir.path().join("workspace");
        std::fs::create_dir_all(&workspace_dir).unwrap();

        std::fs::write(workspace_dir.join("seed.rs"), "fn seed() {}").unwrap();

        let mut config = Config::default();
        config.indexer.data_dir = temp_dir.path().join("data");
        config.indexer.ignore_patterns = vec![];

        let workspace = Workspace::create_with_config(&workspace_dir, config)?;
        workspace.index_all()?;

        let indexer = workspace.create_indexer()?;

        for i in 0..50 {
            let path = workspace_dir.join(format!("churn_{}.rs", i));

            // First version...
            std::fs::write(&path, format!("fn v1_{}() {{}}", i)).unwrap();
            workspace.index_file_with_indexer(&indexer, &path, false)?;

            // ...then an overwrite of the same path.
            std::fs::write(&path, format!("fn v2_{}() {{}}", i)).unwrap();
            workspace.index_file_with_indexer(&indexer, &path, false)?;

            // Every other file is deleted again.
            if i % 2 == 0 {
                workspace.delete_file_with_indexer(&indexer, &path)?;
            }
        }

        let result = workspace.search("v2_1", None)?;
        assert!(
            !result.is_empty(),
            "Surviving file should have latest content"
        );

        let result = workspace.search("v2_0", None)?;
        assert!(result.is_empty(), "Deleted file should not appear");

        Ok(())
    }

    // The convenience per-file path (fresh indexer per call) must keep working
    // alongside the shared-indexer API.
    #[test]
    fn test_per_file_indexer_still_works() -> Result<()> {
        let temp_dir = tempdir().unwrap();
        let workspace_dir = temp_dir.path().join("workspace");
        std::fs::create_dir_all(&workspace_dir).unwrap();

        std::fs::write(workspace_dir.join("base.rs"), "fn base() {}").unwrap();

        let mut config = Config::default();
        config.indexer.data_dir = temp_dir.path().join("data");
        config.indexer.ignore_patterns = vec![];

        let workspace = Workspace::create_with_config(&workspace_dir, config)?;
        workspace.index_all()?;

        let path = workspace_dir.join("standalone.rs");
        std::fs::write(&path, "fn standalone_function() {}").unwrap();
        workspace.index_file_with_options(&path, false)?;

        let result = workspace.search("standalone_function", None)?;
        assert!(
            !result.is_empty(),
            "File indexed via index_file_with_options should be searchable"
        );

        Ok(())
    }
}