Skip to main content

fabryk_vector/
builder.rs

1//! VectorIndexBuilder for constructing vector search indices.
2//!
3//! The builder orchestrates content discovery, text extraction, batch
4//! embedding, and index population:
5//!
6//! 1. Discover content files using glob patterns
7//! 2. Parse frontmatter and content
8//! 3. Call VectorExtractor to produce VectorDocuments
9//! 4. Batch embed documents via EmbeddingProvider
10//! 5. Insert into VectorBackend
11//!
12//! # Two-Phase Build
13//!
14//! - Phase 1: Discover + extract all documents (sync, CPU-bound)
15//! - Phase 2: Batch embed + insert (async, may be I/O-bound)
16
17use crate::backend::{SimpleVectorBackend, VectorBackend};
18use crate::embedding::EmbeddingProvider;
19use crate::extractor::VectorExtractor;
20use crate::types::{BuildError, EmbeddedDocument, VectorDocument, VectorIndexStats};
21use fabryk_content::markdown::extract_frontmatter;
22use fabryk_core::{Error, Result};
23use std::path::{Path, PathBuf};
24use std::sync::Arc;
25use std::time::Instant;
26
27// ============================================================================
28// Error handling options
29// ============================================================================
30
/// Strategy for dealing with per-file extraction errors while building a
/// vector index.
///
/// The strategy only affects the extraction phase. The variants can be
/// compared with `==`, which makes it easy to branch on or assert the
/// configured strategy.
#[derive(Clone, Debug, Default, PartialEq, Eq)]
pub enum ErrorHandling {
    /// Abort the build and return the first extraction error encountered.
    #[default]
    FailFast,
    /// Keep going: the failing file is counted as skipped and its error is
    /// recorded in the build statistics.
    Collect,
    /// Like [`ErrorHandling::Collect`], but additionally logs a warning for
    /// every file that is skipped.
    Skip,
}
42
43// ============================================================================
44// VectorIndexBuilder
45// ============================================================================
46
47/// Builder for constructing vector search indices.
48///
49/// Orchestrates the full pipeline: discover files → extract documents →
50/// batch embed → insert into backend.
51///
52/// # Example
53///
54/// ```rust,ignore
55/// use fabryk_vector::{VectorIndexBuilder, MockEmbeddingProvider, MockVectorExtractor};
56/// use std::sync::Arc;
57///
58/// let provider = Arc::new(MockEmbeddingProvider::new(384));
59/// let extractor = MockVectorExtractor;
60///
61/// let (backend, stats) = VectorIndexBuilder::new(extractor)
62///     .with_content_path("/data/concepts")
63///     .with_embedding_provider(provider)
64///     .build()
65///     .await?;
66/// ```
67pub struct VectorIndexBuilder<E: VectorExtractor> {
68    extractor: E,
69    content_path: Option<PathBuf>,
70    provider: Option<Arc<dyn EmbeddingProvider>>,
71    error_handling: ErrorHandling,
72    batch_size: usize,
73    cache_path: Option<PathBuf>,
74    skip_cache: bool,
75}
76
77impl<E: VectorExtractor> VectorIndexBuilder<E> {
78    /// Creates a new builder with the given extractor.
79    pub fn new(extractor: E) -> Self {
80        Self {
81            extractor,
82            content_path: None,
83            provider: None,
84            error_handling: ErrorHandling::default(),
85            batch_size: 64,
86            cache_path: None,
87            skip_cache: false,
88        }
89    }
90
91    /// Sets the content directory path.
92    pub fn with_content_path(mut self, path: impl Into<PathBuf>) -> Self {
93        self.content_path = Some(path.into());
94        self
95    }
96
97    /// Sets the embedding provider.
98    pub fn with_embedding_provider(mut self, provider: Arc<dyn EmbeddingProvider>) -> Self {
99        self.provider = Some(provider);
100        self
101    }
102
103    /// Sets the error handling strategy.
104    pub fn with_error_handling(mut self, handling: ErrorHandling) -> Self {
105        self.error_handling = handling;
106        self
107    }
108
109    /// Sets the batch size for embedding operations.
110    pub fn with_batch_size(mut self, size: usize) -> Self {
111        self.batch_size = size;
112        self
113    }
114
115    /// Sets the cache file path for vector index persistence.
116    ///
117    /// When set, the builder will:
118    /// 1. Check if the cache is fresh before building (by comparing content hashes)
119    /// 2. Load from cache on hit (fast path, avoids re-embedding)
120    /// 3. Save to cache after a successful build (for next time)
121    pub fn with_cache_path(mut self, path: impl Into<PathBuf>) -> Self {
122        self.cache_path = Some(path.into());
123        self
124    }
125
126    /// Forces a rebuild even if the cache is fresh.
127    pub fn skip_cache(mut self) -> Self {
128        self.skip_cache = true;
129        self
130    }
131
132    /// Builds the vector index.
133    ///
134    /// Returns a `SimpleVectorBackend` populated with embedded documents,
135    /// plus build statistics.
136    ///
137    /// # Phases
138    ///
139    /// 1. **Discover + Extract**: Find content files, parse frontmatter,
140    ///    call extractor to produce `VectorDocument`s.
141    /// 2. **Batch Embed + Insert**: Embed documents in batches via the
142    ///    provider, then insert into the backend.
143    pub async fn build(self) -> Result<(SimpleVectorBackend, VectorIndexStats)> {
144        let start = Instant::now();
145
146        let content_path = self
147            .content_path
148            .as_ref()
149            .ok_or_else(|| Error::config("Content path not set. Use with_content_path() first."))?
150            .clone();
151
152        let provider = self
153            .provider
154            .as_ref()
155            .ok_or_else(|| {
156                Error::config("Embedding provider not set. Use with_embedding_provider() first.")
157            })?
158            .clone();
159
160        // Check cache freshness (if cache configured and not skipped)
161        if let Some(ref cache_path) = self.cache_path {
162            if !self.skip_cache {
163                let content_hash = compute_content_hash(&content_path).await?;
164                if SimpleVectorBackend::is_cache_fresh(cache_path, &content_hash) {
165                    if let Ok(Some(backend)) =
166                        SimpleVectorBackend::load_cache(cache_path, provider.clone())
167                    {
168                        let doc_count = backend.document_count().unwrap_or(0);
169                        log::info!(
170                            "Vector cache is fresh, loaded {} documents from {}",
171                            doc_count,
172                            cache_path.display()
173                        );
174                        let stats = VectorIndexStats {
175                            documents_indexed: doc_count,
176                            files_processed: 0,
177                            files_skipped: 0,
178                            embedding_dimension: provider.dimension(),
179                            content_hash,
180                            build_duration_ms: start.elapsed().as_millis() as u64,
181                            errors: Vec::new(),
182                            from_cache: true,
183                        };
184                        return Ok((backend, stats));
185                    }
186                }
187            }
188        }
189
190        // Discover files
191        let files = discover_files(&content_path).await?;
192
193        let mut errors: Vec<BuildError> = Vec::new();
194        let mut documents: Vec<VectorDocument> = Vec::new();
195        let mut files_processed = 0usize;
196        let mut files_skipped = 0usize;
197
198        // ================================================================
199        // Phase 1: Discover + Extract documents
200        // ================================================================
201        for file_path in &files {
202            match self.extract_file(&content_path, file_path) {
203                Ok(doc) => {
204                    documents.push(doc);
205                }
206                Err(e) => {
207                    let build_error = BuildError {
208                        file: file_path.clone(),
209                        message: e.to_string(),
210                    };
211
212                    match self.error_handling {
213                        ErrorHandling::FailFast => return Err(e),
214                        ErrorHandling::Collect => {
215                            files_skipped += 1;
216                            errors.push(build_error);
217                        }
218                        ErrorHandling::Skip => {
219                            files_skipped += 1;
220                            log::warn!("Skipping {}: {}", file_path.display(), build_error.message);
221                            errors.push(build_error);
222                        }
223                    }
224                }
225            }
226            files_processed += 1;
227        }
228
229        // ================================================================
230        // Phase 2: Batch embed + insert
231        // ================================================================
232        let mut embedded_documents: Vec<EmbeddedDocument> = Vec::with_capacity(documents.len());
233
234        for chunk in documents.chunks(self.batch_size) {
235            let texts: Vec<&str> = chunk.iter().map(|d| d.text.as_str()).collect();
236            let embeddings = provider.embed_batch(&texts).await?;
237
238            for (doc, embedding) in chunk.iter().zip(embeddings.into_iter()) {
239                embedded_documents.push(EmbeddedDocument::new(doc.clone(), embedding));
240            }
241        }
242
243        let documents_indexed = embedded_documents.len();
244        let embedding_dimension = provider.dimension();
245
246        // Compute content hash
247        let content_hash = compute_content_hash(&content_path).await?;
248
249        // Build the backend
250        let mut backend = SimpleVectorBackend::new(provider);
251        backend.add_documents(embedded_documents);
252
253        let stats = VectorIndexStats {
254            documents_indexed,
255            files_processed,
256            files_skipped,
257            embedding_dimension,
258            content_hash: content_hash.clone(),
259            build_duration_ms: start.elapsed().as_millis() as u64,
260            errors,
261            from_cache: false,
262        };
263
264        // Save to cache after successful build
265        if let Some(ref cache_path) = self.cache_path {
266            if let Err(e) = backend.save_cache(cache_path, &content_hash) {
267                log::warn!("Failed to save vector cache: {e}");
268            }
269        }
270
271        Ok((backend, stats))
272    }
273
274    /// Extract a single file to a VectorDocument.
275    fn extract_file(&self, base_path: &Path, file_path: &Path) -> Result<VectorDocument> {
276        let content =
277            std::fs::read_to_string(file_path).map_err(|e| Error::io_with_path(e, file_path))?;
278
279        let fm_result = extract_frontmatter(&content)?;
280
281        let frontmatter = fm_result
282            .value()
283            .cloned()
284            .unwrap_or(serde_yaml::Value::Null);
285        let body = fm_result.body();
286
287        self.extractor
288            .extract_document(base_path, file_path, &frontmatter, body)
289    }
290
291    /// Append documents from a content path into an existing backend.
292    ///
293    /// Unlike `build()`, this does not create a new backend — it adds
294    /// embedded documents to the provided one. Use this to index multiple
295    /// content directories (potentially with different extractors) into
296    /// a single vector search backend.
297    ///
298    /// # Example
299    ///
300    /// ```rust,ignore
301    /// // Build initial index from concept cards
302    /// let (mut backend, stats1) = VectorIndexBuilder::new(card_extractor)
303    ///     .with_content_path(&cards_path)
304    ///     .with_embedding_provider(provider.clone())
305    ///     .build()
306    ///     .await?;
307    ///
308    /// // Append source documents with a different extractor
309    /// let stats2 = VectorIndexBuilder::new(source_extractor)
310    ///     .with_content_path(&sources_path)
311    ///     .with_embedding_provider(provider)
312    ///     .build_append(&mut backend)
313    ///     .await?;
314    /// ```
315    pub async fn build_append(self, backend: &mut SimpleVectorBackend) -> Result<VectorIndexStats> {
316        let start = Instant::now();
317
318        let content_path = self
319            .content_path
320            .as_ref()
321            .ok_or_else(|| Error::config("Content path not set. Use with_content_path() first."))?
322            .clone();
323
324        let provider = self
325            .provider
326            .as_ref()
327            .ok_or_else(|| {
328                Error::config("Embedding provider not set. Use with_embedding_provider() first.")
329            })?
330            .clone();
331
332        let files = discover_files(&content_path).await?;
333
334        let mut errors: Vec<BuildError> = Vec::new();
335        let mut documents: Vec<VectorDocument> = Vec::new();
336        let mut files_processed = 0usize;
337        let mut files_skipped = 0usize;
338
339        // Phase 1: Discover + Extract
340        for file_path in &files {
341            match self.extract_file(&content_path, file_path) {
342                Ok(doc) => {
343                    documents.push(doc);
344                }
345                Err(e) => {
346                    let build_error = BuildError {
347                        file: file_path.clone(),
348                        message: e.to_string(),
349                    };
350
351                    match self.error_handling {
352                        ErrorHandling::FailFast => return Err(e),
353                        ErrorHandling::Collect => {
354                            files_skipped += 1;
355                            errors.push(build_error);
356                        }
357                        ErrorHandling::Skip => {
358                            files_skipped += 1;
359                            log::warn!("Skipping {}: {}", file_path.display(), build_error.message);
360                            errors.push(build_error);
361                        }
362                    }
363                }
364            }
365            files_processed += 1;
366        }
367
368        // Phase 2: Batch embed + insert into existing backend
369        let mut embedded_documents: Vec<EmbeddedDocument> = Vec::with_capacity(documents.len());
370
371        for chunk in documents.chunks(self.batch_size) {
372            let texts: Vec<&str> = chunk.iter().map(|d| d.text.as_str()).collect();
373            let embeddings = provider.embed_batch(&texts).await?;
374
375            for (doc, embedding) in chunk.iter().zip(embeddings.into_iter()) {
376                embedded_documents.push(EmbeddedDocument::new(doc.clone(), embedding));
377            }
378        }
379
380        let documents_indexed = embedded_documents.len();
381        let embedding_dimension = provider.dimension();
382        let content_hash = compute_content_hash(&content_path).await?;
383
384        backend.add_documents(embedded_documents);
385
386        let stats = VectorIndexStats {
387            documents_indexed,
388            files_processed,
389            files_skipped,
390            embedding_dimension,
391            content_hash,
392            build_duration_ms: start.elapsed().as_millis() as u64,
393            errors,
394            from_cache: false,
395        };
396
397        log::info!(
398            "Appended {} vector documents from {} ({} errors)",
399            documents_indexed,
400            content_path.display(),
401            stats.errors.len(),
402        );
403
404        Ok(stats)
405    }
406}
407
408// ============================================================================
409// Helper functions
410// ============================================================================
411
412/// Discover content files in a directory.
413async fn discover_files(base_path: &Path) -> Result<Vec<PathBuf>> {
414    use fabryk_core::util::files::{FindOptions, find_all_files};
415
416    let files = find_all_files(base_path, FindOptions::markdown()).await?;
417    let paths: Vec<PathBuf> = files.into_iter().map(|f| f.path).collect();
418
419    Ok(paths)
420}
421
422/// Compute a content hash for freshness checking.
423///
424/// Hashes all markdown file contents in the directory using blake3.
425pub async fn compute_content_hash(content_path: &Path) -> Result<String> {
426    use fabryk_core::util::files::{FindOptions, find_all_files};
427
428    let files = find_all_files(content_path, FindOptions::markdown()).await?;
429
430    let mut hasher = blake3::Hasher::new();
431    let mut paths: Vec<PathBuf> = files.into_iter().map(|f| f.path).collect();
432    paths.sort(); // Deterministic ordering
433
434    for path in &paths {
435        if let Ok(content) = std::fs::read(path) {
436            hasher.update(path.to_string_lossy().as_bytes());
437            hasher.update(&content);
438        }
439    }
440
441    Ok(hasher.finalize().to_hex().to_string())
442}
443
444// ============================================================================
445// Tests
446// ============================================================================
447
448#[cfg(test)]
449mod tests {
450    use super::*;
451    use crate::backend::VectorBackend;
452    use crate::embedding::MockEmbeddingProvider;
453    use crate::extractor::MockVectorExtractor;
454    use tempfile::tempdir;
455
456    async fn setup_test_files() -> (tempfile::TempDir, PathBuf) {
457        let dir = tempdir().unwrap();
458        let content_dir = dir.path().join("content");
459        std::fs::create_dir(&content_dir).unwrap();
460
461        let file_a =
462            "---\ntitle: \"Concept A\"\ncategory: \"basics\"\n---\n\nContent for concept A.\n";
463        let file_b = "---\ntitle: \"Concept B\"\ncategory: \"advanced\"\ntier: \"intermediate\"\n---\n\nContent for concept B.\n";
464
465        std::fs::write(content_dir.join("concept-a.md"), file_a).unwrap();
466        std::fs::write(content_dir.join("concept-b.md"), file_b).unwrap();
467
468        (dir, content_dir)
469    }
470
471    #[tokio::test]
472    async fn test_builder_basic() {
473        let (_dir, content_dir) = setup_test_files().await;
474        let provider = Arc::new(MockEmbeddingProvider::new(8));
475
476        let (backend, stats) = VectorIndexBuilder::new(MockVectorExtractor)
477            .with_content_path(&content_dir)
478            .with_embedding_provider(provider)
479            .build()
480            .await
481            .unwrap();
482
483        assert_eq!(stats.files_processed, 2);
484        assert_eq!(stats.documents_indexed, 2);
485        assert_eq!(stats.embedding_dimension, 8);
486        assert!(stats.errors.is_empty());
487        assert_eq!(backend.document_count().unwrap(), 2);
488    }
489
490    #[tokio::test]
491    async fn test_builder_content_hash() {
492        let (_dir, content_dir) = setup_test_files().await;
493        let provider = Arc::new(MockEmbeddingProvider::new(8));
494
495        let (_, stats) = VectorIndexBuilder::new(MockVectorExtractor)
496            .with_content_path(&content_dir)
497            .with_embedding_provider(provider)
498            .build()
499            .await
500            .unwrap();
501
502        assert!(!stats.content_hash.is_empty());
503        // Hash should be hex string
504        assert!(stats.content_hash.chars().all(|c| c.is_ascii_hexdigit()));
505    }
506
507    #[tokio::test]
508    async fn test_builder_content_hash_deterministic() {
509        let (_dir, content_dir) = setup_test_files().await;
510
511        let hash1 = compute_content_hash(&content_dir).await.unwrap();
512        let hash2 = compute_content_hash(&content_dir).await.unwrap();
513
514        assert_eq!(hash1, hash2);
515    }
516
517    #[tokio::test]
518    async fn test_builder_content_hash_changes() {
519        let dir = tempdir().unwrap();
520        let content_dir = dir.path().join("content");
521        std::fs::create_dir(&content_dir).unwrap();
522
523        std::fs::write(
524            content_dir.join("test.md"),
525            "---\ntitle: Test\n---\nOriginal content",
526        )
527        .unwrap();
528
529        let hash1 = compute_content_hash(&content_dir).await.unwrap();
530
531        std::fs::write(
532            content_dir.join("test.md"),
533            "---\ntitle: Test\n---\nModified content",
534        )
535        .unwrap();
536
537        let hash2 = compute_content_hash(&content_dir).await.unwrap();
538
539        assert_ne!(hash1, hash2);
540    }
541
542    #[tokio::test]
543    async fn test_builder_missing_content_path() {
544        let provider = Arc::new(MockEmbeddingProvider::new(8));
545
546        let result = VectorIndexBuilder::new(MockVectorExtractor)
547            .with_embedding_provider(provider)
548            .build()
549            .await;
550
551        assert!(result.is_err());
552    }
553
554    #[tokio::test]
555    async fn test_builder_missing_provider() {
556        let dir = tempdir().unwrap();
557        let content_dir = dir.path().join("content");
558        std::fs::create_dir(&content_dir).unwrap();
559
560        let result = VectorIndexBuilder::new(MockVectorExtractor)
561            .with_content_path(&content_dir)
562            .build()
563            .await;
564
565        assert!(result.is_err());
566    }
567
568    #[tokio::test]
569    async fn test_builder_empty_directory() {
570        let dir = tempdir().unwrap();
571        let content_dir = dir.path().join("empty");
572        std::fs::create_dir(&content_dir).unwrap();
573        let provider = Arc::new(MockEmbeddingProvider::new(8));
574
575        let (backend, stats) = VectorIndexBuilder::new(MockVectorExtractor)
576            .with_content_path(&content_dir)
577            .with_embedding_provider(provider)
578            .build()
579            .await
580            .unwrap();
581
582        assert_eq!(stats.files_processed, 0);
583        assert_eq!(stats.documents_indexed, 0);
584        assert_eq!(backend.document_count().unwrap(), 0);
585    }
586
587    #[tokio::test]
588    async fn test_builder_error_handling_collect() {
589        let dir = tempdir().unwrap();
590        let content_dir = dir.path().join("content");
591        std::fs::create_dir(&content_dir).unwrap();
592
593        std::fs::write(
594            content_dir.join("valid.md"),
595            "---\ntitle: Valid\n---\nContent",
596        )
597        .unwrap();
598        // Create a file that won't parse as valid frontmatter
599        std::fs::write(content_dir.join("invalid.md"), "not yaml frontmatter").unwrap();
600
601        let provider = Arc::new(MockEmbeddingProvider::new(8));
602
603        let (_, stats) = VectorIndexBuilder::new(MockVectorExtractor)
604            .with_content_path(&content_dir)
605            .with_embedding_provider(provider)
606            .with_error_handling(ErrorHandling::Collect)
607            .build()
608            .await
609            .unwrap();
610
611        assert_eq!(stats.files_processed, 2);
612        // At least the valid file should be indexed
613        assert!(stats.documents_indexed >= 1);
614    }
615
616    #[tokio::test]
617    async fn test_builder_batch_size() {
618        let dir = tempdir().unwrap();
619        let content_dir = dir.path().join("content");
620        std::fs::create_dir(&content_dir).unwrap();
621
622        // Create more files than batch_size
623        for i in 0..5 {
624            let content = format!("---\ntitle: \"Doc {i}\"\n---\n\nContent {i}.\n");
625            std::fs::write(content_dir.join(format!("doc-{i}.md")), content).unwrap();
626        }
627
628        let provider = Arc::new(MockEmbeddingProvider::new(8));
629
630        let (backend, stats) = VectorIndexBuilder::new(MockVectorExtractor)
631            .with_content_path(&content_dir)
632            .with_embedding_provider(provider)
633            .with_batch_size(2) // Small batches
634            .build()
635            .await
636            .unwrap();
637
638        assert_eq!(stats.documents_indexed, 5);
639        assert_eq!(backend.document_count().unwrap(), 5);
640    }
641
642    #[tokio::test]
643    async fn test_builder_build_duration_tracked() {
644        let (_dir, content_dir) = setup_test_files().await;
645        let provider = Arc::new(MockEmbeddingProvider::new(8));
646
647        let (_, stats) = VectorIndexBuilder::new(MockVectorExtractor)
648            .with_content_path(&content_dir)
649            .with_embedding_provider(provider)
650            .build()
651            .await
652            .unwrap();
653
654        // Build should complete in reasonable time
655        assert!(stats.build_duration_ms < 10_000);
656    }
657
658    // ================================================================
659    // Cache tests
660    // ================================================================
661
662    #[tokio::test]
663    async fn test_builder_cache_hit() {
664        let (_dir, content_dir) = setup_test_files().await;
665        let cache_path = content_dir.parent().unwrap().join("vector-cache.json");
666        let provider = Arc::new(MockEmbeddingProvider::new(8));
667
668        // First build: cold (no cache)
669        let (backend1, stats1) = VectorIndexBuilder::new(MockVectorExtractor)
670            .with_content_path(&content_dir)
671            .with_embedding_provider(provider.clone())
672            .with_cache_path(&cache_path)
673            .build()
674            .await
675            .unwrap();
676        assert!(!stats1.from_cache);
677        assert!(cache_path.exists());
678
679        // Second build: warm (cache hit)
680        let (backend2, stats2) = VectorIndexBuilder::new(MockVectorExtractor)
681            .with_content_path(&content_dir)
682            .with_embedding_provider(provider)
683            .with_cache_path(&cache_path)
684            .build()
685            .await
686            .unwrap();
687        assert!(stats2.from_cache);
688        assert_eq!(
689            backend1.document_count().unwrap(),
690            backend2.document_count().unwrap()
691        );
692    }
693
694    #[tokio::test]
695    async fn test_builder_cache_miss_on_content_change() {
696        let (_dir, content_dir) = setup_test_files().await;
697        let cache_path = content_dir.parent().unwrap().join("vector-cache.json");
698        let provider = Arc::new(MockEmbeddingProvider::new(8));
699
700        // First build
701        let (_, stats1) = VectorIndexBuilder::new(MockVectorExtractor)
702            .with_content_path(&content_dir)
703            .with_embedding_provider(provider.clone())
704            .with_cache_path(&cache_path)
705            .build()
706            .await
707            .unwrap();
708        assert!(!stats1.from_cache);
709
710        // Add a new file (changes content hash)
711        let file_c = "---\ntitle: \"Concept C\"\ncategory: \"new\"\n---\n\nConcept C content.\n";
712        std::fs::write(content_dir.join("concept-c.md"), file_c).unwrap();
713
714        // Second build: cache miss
715        let (backend, stats2) = VectorIndexBuilder::new(MockVectorExtractor)
716            .with_content_path(&content_dir)
717            .with_embedding_provider(provider)
718            .with_cache_path(&cache_path)
719            .build()
720            .await
721            .unwrap();
722        assert!(!stats2.from_cache);
723        assert_eq!(backend.document_count().unwrap(), 3);
724    }
725
726    #[tokio::test]
727    async fn test_builder_skip_cache() {
728        let (_dir, content_dir) = setup_test_files().await;
729        let cache_path = content_dir.parent().unwrap().join("vector-cache.json");
730        let provider = Arc::new(MockEmbeddingProvider::new(8));
731
732        // First build: populates cache
733        VectorIndexBuilder::new(MockVectorExtractor)
734            .with_content_path(&content_dir)
735            .with_embedding_provider(provider.clone())
736            .with_cache_path(&cache_path)
737            .build()
738            .await
739            .unwrap();
740
741        // Second build with skip_cache: forces rebuild
742        let (_, stats) = VectorIndexBuilder::new(MockVectorExtractor)
743            .with_content_path(&content_dir)
744            .with_embedding_provider(provider)
745            .with_cache_path(&cache_path)
746            .skip_cache()
747            .build()
748            .await
749            .unwrap();
750        assert!(!stats.from_cache);
751        assert_eq!(stats.files_processed, 2);
752    }
753
754    #[tokio::test]
755    async fn test_builder_no_cache_path() {
756        let (_dir, content_dir) = setup_test_files().await;
757        let provider = Arc::new(MockEmbeddingProvider::new(8));
758
759        let (_, stats) = VectorIndexBuilder::new(MockVectorExtractor)
760            .with_content_path(&content_dir)
761            .with_embedding_provider(provider)
762            .build()
763            .await
764            .unwrap();
765        assert!(!stats.from_cache);
766    }
767}