// fabryk_vector/builder.rs

1//! VectorIndexBuilder for constructing vector search indices.
2//!
3//! The builder orchestrates content discovery, text extraction, batch
4//! embedding, and index population:
5//!
6//! 1. Discover content files using glob patterns
7//! 2. Parse frontmatter and content
8//! 3. Call VectorExtractor to produce VectorDocuments
9//! 4. Batch embed documents via EmbeddingProvider
10//! 5. Insert into VectorBackend
11//!
12//! # Two-Phase Build
13//!
//! - Phase 1: Discover + extract all documents (extraction runs
//!   synchronously, reading and parsing each file in turn)
//! - Phase 2: Batch embed + insert (async, may be I/O-bound)
16
17use crate::backend::{SimpleVectorBackend, VectorBackend};
18use crate::embedding::EmbeddingProvider;
19use crate::extractor::VectorExtractor;
20use crate::types::{BuildError, EmbeddedDocument, VectorDocument, VectorIndexStats};
21use fabryk_content::markdown::extract_frontmatter;
22use fabryk_core::{Error, Result};
23use std::path::{Path, PathBuf};
24use std::sync::Arc;
25use std::time::Instant;
26
27// ============================================================================
28// Error handling options
29// ============================================================================
30
/// Options for handling errors during vector index building.
///
/// The strategy is consulted once for every content file whose extraction
/// fails. It is a plain, field-less enum, so it is cheap to copy and can be
/// compared directly.
#[derive(Clone, Copy, Debug, Default, PartialEq, Eq)]
pub enum ErrorHandling {
    /// Stop on first error: the build returns that error immediately.
    #[default]
    FailFast,
    /// Continue and collect errors: the failing file is counted as skipped
    /// and its error is recorded in the build statistics.
    Collect,
    /// Log and skip problematic files: like `Collect`, but a warning is
    /// also logged for each failing file.
    Skip,
}
42
43// ============================================================================
44// VectorIndexBuilder
45// ============================================================================
46
/// Builder for constructing vector search indices.
///
/// Orchestrates the full pipeline: discover files → extract documents →
/// batch embed → insert into backend.
///
/// Configure the builder with the `with_*` methods, then consume it with
/// `build()` (creates a new backend) or `build_append()` (adds to an
/// existing backend). A content path and an embedding provider are
/// required; both build methods return a configuration error if either
/// is missing.
///
/// # Example
///
/// ```rust,ignore
/// use fabryk_vector::{VectorIndexBuilder, MockEmbeddingProvider, MockVectorExtractor};
/// use std::sync::Arc;
///
/// let provider = Arc::new(MockEmbeddingProvider::new(384));
/// let extractor = MockVectorExtractor;
///
/// let (backend, stats) = VectorIndexBuilder::new(extractor)
///     .with_content_path("/data/concepts")
///     .with_embedding_provider(provider)
///     .build()
///     .await?;
/// ```
pub struct VectorIndexBuilder<E: VectorExtractor> {
    /// Extractor that turns a parsed content file into a `VectorDocument`.
    extractor: E,
    /// Root directory scanned for content files; `None` until configured.
    content_path: Option<PathBuf>,
    /// Provider used to embed document text; `None` until configured.
    provider: Option<Arc<dyn EmbeddingProvider>>,
    /// What to do when extracting a single file fails.
    error_handling: ErrorHandling,
    /// Number of documents sent to the provider per `embed_batch` call
    /// (`new` sets this to 64).
    batch_size: usize,
    /// Optional cache file used by `build()` to load or persist the index.
    cache_path: Option<PathBuf>,
    /// When `true`, `build()` ignores an existing cache and rebuilds.
    skip_cache: bool,
}
76
77impl<E: VectorExtractor> VectorIndexBuilder<E> {
78    /// Creates a new builder with the given extractor.
79    pub fn new(extractor: E) -> Self {
80        Self {
81            extractor,
82            content_path: None,
83            provider: None,
84            error_handling: ErrorHandling::default(),
85            batch_size: 64,
86            cache_path: None,
87            skip_cache: false,
88        }
89    }
90
91    /// Sets the content directory path.
92    pub fn with_content_path(mut self, path: impl Into<PathBuf>) -> Self {
93        self.content_path = Some(path.into());
94        self
95    }
96
97    /// Sets the embedding provider.
98    pub fn with_embedding_provider(mut self, provider: Arc<dyn EmbeddingProvider>) -> Self {
99        self.provider = Some(provider);
100        self
101    }
102
103    /// Sets the error handling strategy.
104    pub fn with_error_handling(mut self, handling: ErrorHandling) -> Self {
105        self.error_handling = handling;
106        self
107    }
108
109    /// Sets the batch size for embedding operations.
110    pub fn with_batch_size(mut self, size: usize) -> Self {
111        self.batch_size = size;
112        self
113    }
114
115    /// Sets the cache file path for vector index persistence.
116    ///
117    /// When set, the builder will:
118    /// 1. Check if the cache is fresh before building (by comparing content hashes)
119    /// 2. Load from cache on hit (fast path, avoids re-embedding)
120    /// 3. Save to cache after a successful build (for next time)
121    pub fn with_cache_path(mut self, path: impl Into<PathBuf>) -> Self {
122        self.cache_path = Some(path.into());
123        self
124    }
125
126    /// Forces a rebuild even if the cache is fresh.
127    pub fn skip_cache(mut self) -> Self {
128        self.skip_cache = true;
129        self
130    }
131
132    /// Builds the vector index.
133    ///
134    /// Returns a `SimpleVectorBackend` populated with embedded documents,
135    /// plus build statistics.
136    ///
137    /// # Phases
138    ///
139    /// 1. **Discover + Extract**: Find content files, parse frontmatter,
140    ///    call extractor to produce `VectorDocument`s.
141    /// 2. **Batch Embed + Insert**: Embed documents in batches via the
142    ///    provider, then insert into the backend.
143    pub async fn build(self) -> Result<(SimpleVectorBackend, VectorIndexStats)> {
144        let start = Instant::now();
145
146        let content_path = self
147            .content_path
148            .as_ref()
149            .ok_or_else(|| Error::config("Content path not set. Use with_content_path() first."))?
150            .clone();
151
152        let provider = self
153            .provider
154            .as_ref()
155            .ok_or_else(|| {
156                Error::config("Embedding provider not set. Use with_embedding_provider() first.")
157            })?
158            .clone();
159
160        // Check cache freshness (if cache configured and not skipped)
161        if let Some(ref cache_path) = self.cache_path
162            && !self.skip_cache
163        {
164            let content_hash = compute_content_hash(&content_path).await?;
165            if SimpleVectorBackend::is_cache_fresh(cache_path, &content_hash)
166                && let Ok(Some(backend)) =
167                    SimpleVectorBackend::load_cache(cache_path, provider.clone())
168            {
169                let doc_count = backend.document_count().unwrap_or(0);
170                log::info!(
171                    "Vector cache is fresh, loaded {} documents from {}",
172                    doc_count,
173                    cache_path.display()
174                );
175                let stats = VectorIndexStats {
176                    documents_indexed: doc_count,
177                    files_processed: 0,
178                    files_skipped: 0,
179                    embedding_dimension: provider.dimension(),
180                    content_hash,
181                    build_duration_ms: start.elapsed().as_millis() as u64,
182                    errors: Vec::new(),
183                    from_cache: true,
184                };
185                return Ok((backend, stats));
186            }
187        }
188
189        // Discover files
190        let files = discover_files(&content_path).await?;
191
192        let mut errors: Vec<BuildError> = Vec::new();
193        let mut documents: Vec<VectorDocument> = Vec::new();
194        let mut files_processed = 0usize;
195        let mut files_skipped = 0usize;
196
197        // ================================================================
198        // Phase 1: Discover + Extract documents
199        // ================================================================
200        for file_path in &files {
201            match self.extract_file(&content_path, file_path) {
202                Ok(doc) => {
203                    documents.push(doc);
204                }
205                Err(e) => {
206                    let build_error = BuildError {
207                        file: file_path.clone(),
208                        message: e.to_string(),
209                    };
210
211                    match self.error_handling {
212                        ErrorHandling::FailFast => return Err(e),
213                        ErrorHandling::Collect => {
214                            files_skipped += 1;
215                            errors.push(build_error);
216                        }
217                        ErrorHandling::Skip => {
218                            files_skipped += 1;
219                            log::warn!("Skipping {}: {}", file_path.display(), build_error.message);
220                            errors.push(build_error);
221                        }
222                    }
223                }
224            }
225            files_processed += 1;
226        }
227
228        // ================================================================
229        // Phase 2: Batch embed + insert
230        // ================================================================
231        let mut embedded_documents: Vec<EmbeddedDocument> = Vec::with_capacity(documents.len());
232
233        for chunk in documents.chunks(self.batch_size) {
234            let texts: Vec<&str> = chunk.iter().map(|d| d.text.as_str()).collect();
235            let embeddings = provider.embed_batch(&texts).await?;
236
237            for (doc, embedding) in chunk.iter().zip(embeddings.into_iter()) {
238                embedded_documents.push(EmbeddedDocument::new(doc.clone(), embedding));
239            }
240        }
241
242        let documents_indexed = embedded_documents.len();
243        let embedding_dimension = provider.dimension();
244
245        // Compute content hash
246        let content_hash = compute_content_hash(&content_path).await?;
247
248        // Build the backend
249        let mut backend = SimpleVectorBackend::new(provider);
250        backend.add_documents(embedded_documents);
251
252        let stats = VectorIndexStats {
253            documents_indexed,
254            files_processed,
255            files_skipped,
256            embedding_dimension,
257            content_hash: content_hash.clone(),
258            build_duration_ms: start.elapsed().as_millis() as u64,
259            errors,
260            from_cache: false,
261        };
262
263        // Save to cache after successful build
264        if let Some(ref cache_path) = self.cache_path
265            && let Err(e) = backend.save_cache(cache_path, &content_hash)
266        {
267            log::warn!("Failed to save vector cache: {e}");
268        }
269
270        Ok((backend, stats))
271    }
272
273    /// Extract a single file to a VectorDocument.
274    fn extract_file(&self, base_path: &Path, file_path: &Path) -> Result<VectorDocument> {
275        let content =
276            std::fs::read_to_string(file_path).map_err(|e| Error::io_with_path(e, file_path))?;
277
278        let fm_result = extract_frontmatter(&content)?;
279
280        let frontmatter = fm_result
281            .value()
282            .cloned()
283            .unwrap_or(yaml_serde::Value::Null);
284        let body = fm_result.body();
285
286        self.extractor
287            .extract_document(base_path, file_path, &frontmatter, body)
288    }
289
290    /// Append documents from a content path into an existing backend.
291    ///
292    /// Unlike `build()`, this does not create a new backend — it adds
293    /// embedded documents to the provided one. Use this to index multiple
294    /// content directories (potentially with different extractors) into
295    /// a single vector search backend.
296    ///
297    /// # Example
298    ///
299    /// ```rust,ignore
300    /// // Build initial index from concept cards
301    /// let (mut backend, stats1) = VectorIndexBuilder::new(card_extractor)
302    ///     .with_content_path(&cards_path)
303    ///     .with_embedding_provider(provider.clone())
304    ///     .build()
305    ///     .await?;
306    ///
307    /// // Append source documents with a different extractor
308    /// let stats2 = VectorIndexBuilder::new(source_extractor)
309    ///     .with_content_path(&sources_path)
310    ///     .with_embedding_provider(provider)
311    ///     .build_append(&mut backend)
312    ///     .await?;
313    /// ```
314    pub async fn build_append(self, backend: &mut SimpleVectorBackend) -> Result<VectorIndexStats> {
315        let start = Instant::now();
316
317        let content_path = self
318            .content_path
319            .as_ref()
320            .ok_or_else(|| Error::config("Content path not set. Use with_content_path() first."))?
321            .clone();
322
323        let provider = self
324            .provider
325            .as_ref()
326            .ok_or_else(|| {
327                Error::config("Embedding provider not set. Use with_embedding_provider() first.")
328            })?
329            .clone();
330
331        let files = discover_files(&content_path).await?;
332
333        let mut errors: Vec<BuildError> = Vec::new();
334        let mut documents: Vec<VectorDocument> = Vec::new();
335        let mut files_processed = 0usize;
336        let mut files_skipped = 0usize;
337
338        // Phase 1: Discover + Extract
339        for file_path in &files {
340            match self.extract_file(&content_path, file_path) {
341                Ok(doc) => {
342                    documents.push(doc);
343                }
344                Err(e) => {
345                    let build_error = BuildError {
346                        file: file_path.clone(),
347                        message: e.to_string(),
348                    };
349
350                    match self.error_handling {
351                        ErrorHandling::FailFast => return Err(e),
352                        ErrorHandling::Collect => {
353                            files_skipped += 1;
354                            errors.push(build_error);
355                        }
356                        ErrorHandling::Skip => {
357                            files_skipped += 1;
358                            log::warn!("Skipping {}: {}", file_path.display(), build_error.message);
359                            errors.push(build_error);
360                        }
361                    }
362                }
363            }
364            files_processed += 1;
365        }
366
367        // Phase 2: Batch embed + insert into existing backend
368        let mut embedded_documents: Vec<EmbeddedDocument> = Vec::with_capacity(documents.len());
369
370        for chunk in documents.chunks(self.batch_size) {
371            let texts: Vec<&str> = chunk.iter().map(|d| d.text.as_str()).collect();
372            let embeddings = provider.embed_batch(&texts).await?;
373
374            for (doc, embedding) in chunk.iter().zip(embeddings.into_iter()) {
375                embedded_documents.push(EmbeddedDocument::new(doc.clone(), embedding));
376            }
377        }
378
379        let documents_indexed = embedded_documents.len();
380        let embedding_dimension = provider.dimension();
381        let content_hash = compute_content_hash(&content_path).await?;
382
383        backend.add_documents(embedded_documents);
384
385        let stats = VectorIndexStats {
386            documents_indexed,
387            files_processed,
388            files_skipped,
389            embedding_dimension,
390            content_hash,
391            build_duration_ms: start.elapsed().as_millis() as u64,
392            errors,
393            from_cache: false,
394        };
395
396        log::info!(
397            "Appended {} vector documents from {} ({} errors)",
398            documents_indexed,
399            content_path.display(),
400            stats.errors.len(),
401        );
402
403        Ok(stats)
404    }
405}
406
407// ============================================================================
408// Helper functions
409// ============================================================================
410
411/// Discover content files in a directory.
412async fn discover_files(base_path: &Path) -> Result<Vec<PathBuf>> {
413    use fabryk_core::util::files::{FindOptions, find_all_files};
414
415    let files = find_all_files(base_path, FindOptions::markdown()).await?;
416    let paths: Vec<PathBuf> = files.into_iter().map(|f| f.path).collect();
417
418    Ok(paths)
419}
420
421/// Compute a content hash for freshness checking.
422///
423/// Hashes all markdown file contents in the directory using blake3.
424pub async fn compute_content_hash(content_path: &Path) -> Result<String> {
425    use fabryk_core::util::files::{FindOptions, find_all_files};
426
427    let files = find_all_files(content_path, FindOptions::markdown()).await?;
428
429    let mut hasher = blake3::Hasher::new();
430    let mut paths: Vec<PathBuf> = files.into_iter().map(|f| f.path).collect();
431    paths.sort(); // Deterministic ordering
432
433    for path in &paths {
434        if let Ok(content) = std::fs::read(path) {
435            hasher.update(path.to_string_lossy().as_bytes());
436            hasher.update(&content);
437        }
438    }
439
440    Ok(hasher.finalize().to_hex().to_string())
441}
442
443// ============================================================================
444// Tests
445// ============================================================================
446
447#[cfg(test)]
448mod tests {
449    use super::*;
450    use crate::backend::VectorBackend;
451    use crate::embedding::MockEmbeddingProvider;
452    use crate::extractor::MockVectorExtractor;
453    use tempfile::tempdir;
454
455    async fn setup_test_files() -> (tempfile::TempDir, PathBuf) {
456        let dir = tempdir().unwrap();
457        let content_dir = dir.path().join("content");
458        std::fs::create_dir(&content_dir).unwrap();
459
460        let file_a =
461            "---\ntitle: \"Concept A\"\ncategory: \"basics\"\n---\n\nContent for concept A.\n";
462        let file_b = "---\ntitle: \"Concept B\"\ncategory: \"advanced\"\ntier: \"intermediate\"\n---\n\nContent for concept B.\n";
463
464        std::fs::write(content_dir.join("concept-a.md"), file_a).unwrap();
465        std::fs::write(content_dir.join("concept-b.md"), file_b).unwrap();
466
467        (dir, content_dir)
468    }
469
470    #[tokio::test]
471    async fn test_builder_basic() {
472        let (_dir, content_dir) = setup_test_files().await;
473        let provider = Arc::new(MockEmbeddingProvider::new(8));
474
475        let (backend, stats) = VectorIndexBuilder::new(MockVectorExtractor)
476            .with_content_path(&content_dir)
477            .with_embedding_provider(provider)
478            .build()
479            .await
480            .unwrap();
481
482        assert_eq!(stats.files_processed, 2);
483        assert_eq!(stats.documents_indexed, 2);
484        assert_eq!(stats.embedding_dimension, 8);
485        assert!(stats.errors.is_empty());
486        assert_eq!(backend.document_count().unwrap(), 2);
487    }
488
489    #[tokio::test]
490    async fn test_builder_content_hash() {
491        let (_dir, content_dir) = setup_test_files().await;
492        let provider = Arc::new(MockEmbeddingProvider::new(8));
493
494        let (_, stats) = VectorIndexBuilder::new(MockVectorExtractor)
495            .with_content_path(&content_dir)
496            .with_embedding_provider(provider)
497            .build()
498            .await
499            .unwrap();
500
501        assert!(!stats.content_hash.is_empty());
502        // Hash should be hex string
503        assert!(stats.content_hash.chars().all(|c| c.is_ascii_hexdigit()));
504    }
505
506    #[tokio::test]
507    async fn test_builder_content_hash_deterministic() {
508        let (_dir, content_dir) = setup_test_files().await;
509
510        let hash1 = compute_content_hash(&content_dir).await.unwrap();
511        let hash2 = compute_content_hash(&content_dir).await.unwrap();
512
513        assert_eq!(hash1, hash2);
514    }
515
516    #[tokio::test]
517    async fn test_builder_content_hash_changes() {
518        let dir = tempdir().unwrap();
519        let content_dir = dir.path().join("content");
520        std::fs::create_dir(&content_dir).unwrap();
521
522        std::fs::write(
523            content_dir.join("test.md"),
524            "---\ntitle: Test\n---\nOriginal content",
525        )
526        .unwrap();
527
528        let hash1 = compute_content_hash(&content_dir).await.unwrap();
529
530        std::fs::write(
531            content_dir.join("test.md"),
532            "---\ntitle: Test\n---\nModified content",
533        )
534        .unwrap();
535
536        let hash2 = compute_content_hash(&content_dir).await.unwrap();
537
538        assert_ne!(hash1, hash2);
539    }
540
541    #[tokio::test]
542    async fn test_builder_missing_content_path() {
543        let provider = Arc::new(MockEmbeddingProvider::new(8));
544
545        let result = VectorIndexBuilder::new(MockVectorExtractor)
546            .with_embedding_provider(provider)
547            .build()
548            .await;
549
550        assert!(result.is_err());
551    }
552
553    #[tokio::test]
554    async fn test_builder_missing_provider() {
555        let dir = tempdir().unwrap();
556        let content_dir = dir.path().join("content");
557        std::fs::create_dir(&content_dir).unwrap();
558
559        let result = VectorIndexBuilder::new(MockVectorExtractor)
560            .with_content_path(&content_dir)
561            .build()
562            .await;
563
564        assert!(result.is_err());
565    }
566
567    #[tokio::test]
568    async fn test_builder_empty_directory() {
569        let dir = tempdir().unwrap();
570        let content_dir = dir.path().join("empty");
571        std::fs::create_dir(&content_dir).unwrap();
572        let provider = Arc::new(MockEmbeddingProvider::new(8));
573
574        let (backend, stats) = VectorIndexBuilder::new(MockVectorExtractor)
575            .with_content_path(&content_dir)
576            .with_embedding_provider(provider)
577            .build()
578            .await
579            .unwrap();
580
581        assert_eq!(stats.files_processed, 0);
582        assert_eq!(stats.documents_indexed, 0);
583        assert_eq!(backend.document_count().unwrap(), 0);
584    }
585
586    #[tokio::test]
587    async fn test_builder_error_handling_collect() {
588        let dir = tempdir().unwrap();
589        let content_dir = dir.path().join("content");
590        std::fs::create_dir(&content_dir).unwrap();
591
592        std::fs::write(
593            content_dir.join("valid.md"),
594            "---\ntitle: Valid\n---\nContent",
595        )
596        .unwrap();
597        // Create a file that won't parse as valid frontmatter
598        std::fs::write(content_dir.join("invalid.md"), "not yaml frontmatter").unwrap();
599
600        let provider = Arc::new(MockEmbeddingProvider::new(8));
601
602        let (_, stats) = VectorIndexBuilder::new(MockVectorExtractor)
603            .with_content_path(&content_dir)
604            .with_embedding_provider(provider)
605            .with_error_handling(ErrorHandling::Collect)
606            .build()
607            .await
608            .unwrap();
609
610        assert_eq!(stats.files_processed, 2);
611        // At least the valid file should be indexed
612        assert!(stats.documents_indexed >= 1);
613    }
614
615    #[tokio::test]
616    async fn test_builder_batch_size() {
617        let dir = tempdir().unwrap();
618        let content_dir = dir.path().join("content");
619        std::fs::create_dir(&content_dir).unwrap();
620
621        // Create more files than batch_size
622        for i in 0..5 {
623            let content = format!("---\ntitle: \"Doc {i}\"\n---\n\nContent {i}.\n");
624            std::fs::write(content_dir.join(format!("doc-{i}.md")), content).unwrap();
625        }
626
627        let provider = Arc::new(MockEmbeddingProvider::new(8));
628
629        let (backend, stats) = VectorIndexBuilder::new(MockVectorExtractor)
630            .with_content_path(&content_dir)
631            .with_embedding_provider(provider)
632            .with_batch_size(2) // Small batches
633            .build()
634            .await
635            .unwrap();
636
637        assert_eq!(stats.documents_indexed, 5);
638        assert_eq!(backend.document_count().unwrap(), 5);
639    }
640
641    #[tokio::test]
642    async fn test_builder_build_duration_tracked() {
643        let (_dir, content_dir) = setup_test_files().await;
644        let provider = Arc::new(MockEmbeddingProvider::new(8));
645
646        let (_, stats) = VectorIndexBuilder::new(MockVectorExtractor)
647            .with_content_path(&content_dir)
648            .with_embedding_provider(provider)
649            .build()
650            .await
651            .unwrap();
652
653        // Build should complete in reasonable time
654        assert!(stats.build_duration_ms < 10_000);
655    }
656
657    // ================================================================
658    // Cache tests
659    // ================================================================
660
661    #[tokio::test]
662    async fn test_builder_cache_hit() {
663        let (_dir, content_dir) = setup_test_files().await;
664        let cache_path = content_dir.parent().unwrap().join("vector-cache.json");
665        let provider = Arc::new(MockEmbeddingProvider::new(8));
666
667        // First build: cold (no cache)
668        let (backend1, stats1) = VectorIndexBuilder::new(MockVectorExtractor)
669            .with_content_path(&content_dir)
670            .with_embedding_provider(provider.clone())
671            .with_cache_path(&cache_path)
672            .build()
673            .await
674            .unwrap();
675        assert!(!stats1.from_cache);
676        assert!(cache_path.exists());
677
678        // Second build: warm (cache hit)
679        let (backend2, stats2) = VectorIndexBuilder::new(MockVectorExtractor)
680            .with_content_path(&content_dir)
681            .with_embedding_provider(provider)
682            .with_cache_path(&cache_path)
683            .build()
684            .await
685            .unwrap();
686        assert!(stats2.from_cache);
687        assert_eq!(
688            backend1.document_count().unwrap(),
689            backend2.document_count().unwrap()
690        );
691    }
692
693    #[tokio::test]
694    async fn test_builder_cache_miss_on_content_change() {
695        let (_dir, content_dir) = setup_test_files().await;
696        let cache_path = content_dir.parent().unwrap().join("vector-cache.json");
697        let provider = Arc::new(MockEmbeddingProvider::new(8));
698
699        // First build
700        let (_, stats1) = VectorIndexBuilder::new(MockVectorExtractor)
701            .with_content_path(&content_dir)
702            .with_embedding_provider(provider.clone())
703            .with_cache_path(&cache_path)
704            .build()
705            .await
706            .unwrap();
707        assert!(!stats1.from_cache);
708
709        // Add a new file (changes content hash)
710        let file_c = "---\ntitle: \"Concept C\"\ncategory: \"new\"\n---\n\nConcept C content.\n";
711        std::fs::write(content_dir.join("concept-c.md"), file_c).unwrap();
712
713        // Second build: cache miss
714        let (backend, stats2) = VectorIndexBuilder::new(MockVectorExtractor)
715            .with_content_path(&content_dir)
716            .with_embedding_provider(provider)
717            .with_cache_path(&cache_path)
718            .build()
719            .await
720            .unwrap();
721        assert!(!stats2.from_cache);
722        assert_eq!(backend.document_count().unwrap(), 3);
723    }
724
725    #[tokio::test]
726    async fn test_builder_skip_cache() {
727        let (_dir, content_dir) = setup_test_files().await;
728        let cache_path = content_dir.parent().unwrap().join("vector-cache.json");
729        let provider = Arc::new(MockEmbeddingProvider::new(8));
730
731        // First build: populates cache
732        VectorIndexBuilder::new(MockVectorExtractor)
733            .with_content_path(&content_dir)
734            .with_embedding_provider(provider.clone())
735            .with_cache_path(&cache_path)
736            .build()
737            .await
738            .unwrap();
739
740        // Second build with skip_cache: forces rebuild
741        let (_, stats) = VectorIndexBuilder::new(MockVectorExtractor)
742            .with_content_path(&content_dir)
743            .with_embedding_provider(provider)
744            .with_cache_path(&cache_path)
745            .skip_cache()
746            .build()
747            .await
748            .unwrap();
749        assert!(!stats.from_cache);
750        assert_eq!(stats.files_processed, 2);
751    }
752
753    #[tokio::test]
754    async fn test_builder_no_cache_path() {
755        let (_dir, content_dir) = setup_test_files().await;
756        let provider = Arc::new(MockEmbeddingProvider::new(8));
757
758        let (_, stats) = VectorIndexBuilder::new(MockVectorExtractor)
759            .with_content_path(&content_dir)
760            .with_embedding_provider(provider)
761            .build()
762            .await
763            .unwrap();
764        assert!(!stats.from_cache);
765    }
766}