1use crate::backend::{SimpleVectorBackend, VectorBackend};
18use crate::embedding::EmbeddingProvider;
19use crate::extractor::VectorExtractor;
20use crate::types::{BuildError, EmbeddedDocument, VectorDocument, VectorIndexStats};
21use fabryk_content::markdown::extract_frontmatter;
22use fabryk_core::{Error, Result};
23use std::path::{Path, PathBuf};
24use std::sync::Arc;
25use std::time::Instant;
26
/// Policy applied by the index builder when a single content file cannot be
/// read or turned into a document.
///
/// The enum is a plain, field-less flag, so it is `Copy` and comparable.
#[derive(Clone, Copy, Debug, Default, PartialEq, Eq)]
pub enum ErrorHandling {
    /// Abort the build and return the first error encountered (the default).
    #[default]
    FailFast,
    /// Keep going: the failure is recorded in the build statistics and the
    /// file is counted as skipped.
    Collect,
    /// Same as [`ErrorHandling::Collect`], and additionally logs a warning
    /// for every failing file.
    Skip,
}
42
/// Builder that discovers markdown files under a content directory, extracts
/// one document per file, embeds the documents in batches and loads them into
/// a `SimpleVectorBackend`.
///
/// Configure it with the `with_*` methods, then call `build` (fresh backend)
/// or `build_append` (add to an existing backend).
pub struct VectorIndexBuilder<E: VectorExtractor> {
    /// Turns a file's frontmatter and body into a `VectorDocument`.
    extractor: E,
    /// Root directory searched for markdown files; required before building.
    content_path: Option<PathBuf>,
    /// Provider used to embed document text; required before building.
    provider: Option<Arc<dyn EmbeddingProvider>>,
    /// What to do when a single file fails to extract.
    error_handling: ErrorHandling,
    /// Maximum number of documents sent to the provider per `embed_batch` call.
    batch_size: usize,
    /// Optional location of the on-disk cache read and written by `build`.
    cache_path: Option<PathBuf>,
    /// When `true`, `build` does not consult the cache (it is still written).
    skip_cache: bool,
}
76
77impl<E: VectorExtractor> VectorIndexBuilder<E> {
78 pub fn new(extractor: E) -> Self {
80 Self {
81 extractor,
82 content_path: None,
83 provider: None,
84 error_handling: ErrorHandling::default(),
85 batch_size: 64,
86 cache_path: None,
87 skip_cache: false,
88 }
89 }
90
91 pub fn with_content_path(mut self, path: impl Into<PathBuf>) -> Self {
93 self.content_path = Some(path.into());
94 self
95 }
96
97 pub fn with_embedding_provider(mut self, provider: Arc<dyn EmbeddingProvider>) -> Self {
99 self.provider = Some(provider);
100 self
101 }
102
103 pub fn with_error_handling(mut self, handling: ErrorHandling) -> Self {
105 self.error_handling = handling;
106 self
107 }
108
109 pub fn with_batch_size(mut self, size: usize) -> Self {
111 self.batch_size = size;
112 self
113 }
114
115 pub fn with_cache_path(mut self, path: impl Into<PathBuf>) -> Self {
122 self.cache_path = Some(path.into());
123 self
124 }
125
126 pub fn skip_cache(mut self) -> Self {
128 self.skip_cache = true;
129 self
130 }
131
132 pub async fn build(self) -> Result<(SimpleVectorBackend, VectorIndexStats)> {
144 let start = Instant::now();
145
146 let content_path = self
147 .content_path
148 .as_ref()
149 .ok_or_else(|| Error::config("Content path not set. Use with_content_path() first."))?
150 .clone();
151
152 let provider = self
153 .provider
154 .as_ref()
155 .ok_or_else(|| {
156 Error::config("Embedding provider not set. Use with_embedding_provider() first.")
157 })?
158 .clone();
159
160 if let Some(ref cache_path) = self.cache_path {
162 if !self.skip_cache {
163 let content_hash = compute_content_hash(&content_path).await?;
164 if SimpleVectorBackend::is_cache_fresh(cache_path, &content_hash) {
165 if let Ok(Some(backend)) =
166 SimpleVectorBackend::load_cache(cache_path, provider.clone())
167 {
168 let doc_count = backend.document_count().unwrap_or(0);
169 log::info!(
170 "Vector cache is fresh, loaded {} documents from {}",
171 doc_count,
172 cache_path.display()
173 );
174 let stats = VectorIndexStats {
175 documents_indexed: doc_count,
176 files_processed: 0,
177 files_skipped: 0,
178 embedding_dimension: provider.dimension(),
179 content_hash,
180 build_duration_ms: start.elapsed().as_millis() as u64,
181 errors: Vec::new(),
182 from_cache: true,
183 };
184 return Ok((backend, stats));
185 }
186 }
187 }
188 }
189
190 let files = discover_files(&content_path).await?;
192
193 let mut errors: Vec<BuildError> = Vec::new();
194 let mut documents: Vec<VectorDocument> = Vec::new();
195 let mut files_processed = 0usize;
196 let mut files_skipped = 0usize;
197
198 for file_path in &files {
202 match self.extract_file(&content_path, file_path) {
203 Ok(doc) => {
204 documents.push(doc);
205 }
206 Err(e) => {
207 let build_error = BuildError {
208 file: file_path.clone(),
209 message: e.to_string(),
210 };
211
212 match self.error_handling {
213 ErrorHandling::FailFast => return Err(e),
214 ErrorHandling::Collect => {
215 files_skipped += 1;
216 errors.push(build_error);
217 }
218 ErrorHandling::Skip => {
219 files_skipped += 1;
220 log::warn!("Skipping {}: {}", file_path.display(), build_error.message);
221 errors.push(build_error);
222 }
223 }
224 }
225 }
226 files_processed += 1;
227 }
228
229 let mut embedded_documents: Vec<EmbeddedDocument> = Vec::with_capacity(documents.len());
233
234 for chunk in documents.chunks(self.batch_size) {
235 let texts: Vec<&str> = chunk.iter().map(|d| d.text.as_str()).collect();
236 let embeddings = provider.embed_batch(&texts).await?;
237
238 for (doc, embedding) in chunk.iter().zip(embeddings.into_iter()) {
239 embedded_documents.push(EmbeddedDocument::new(doc.clone(), embedding));
240 }
241 }
242
243 let documents_indexed = embedded_documents.len();
244 let embedding_dimension = provider.dimension();
245
246 let content_hash = compute_content_hash(&content_path).await?;
248
249 let mut backend = SimpleVectorBackend::new(provider);
251 backend.add_documents(embedded_documents);
252
253 let stats = VectorIndexStats {
254 documents_indexed,
255 files_processed,
256 files_skipped,
257 embedding_dimension,
258 content_hash: content_hash.clone(),
259 build_duration_ms: start.elapsed().as_millis() as u64,
260 errors,
261 from_cache: false,
262 };
263
264 if let Some(ref cache_path) = self.cache_path {
266 if let Err(e) = backend.save_cache(cache_path, &content_hash) {
267 log::warn!("Failed to save vector cache: {e}");
268 }
269 }
270
271 Ok((backend, stats))
272 }
273
274 fn extract_file(&self, base_path: &Path, file_path: &Path) -> Result<VectorDocument> {
276 let content =
277 std::fs::read_to_string(file_path).map_err(|e| Error::io_with_path(e, file_path))?;
278
279 let fm_result = extract_frontmatter(&content)?;
280
281 let frontmatter = fm_result
282 .value()
283 .cloned()
284 .unwrap_or(serde_yaml::Value::Null);
285 let body = fm_result.body();
286
287 self.extractor
288 .extract_document(base_path, file_path, &frontmatter, body)
289 }
290
291 pub async fn build_append(self, backend: &mut SimpleVectorBackend) -> Result<VectorIndexStats> {
316 let start = Instant::now();
317
318 let content_path = self
319 .content_path
320 .as_ref()
321 .ok_or_else(|| Error::config("Content path not set. Use with_content_path() first."))?
322 .clone();
323
324 let provider = self
325 .provider
326 .as_ref()
327 .ok_or_else(|| {
328 Error::config("Embedding provider not set. Use with_embedding_provider() first.")
329 })?
330 .clone();
331
332 let files = discover_files(&content_path).await?;
333
334 let mut errors: Vec<BuildError> = Vec::new();
335 let mut documents: Vec<VectorDocument> = Vec::new();
336 let mut files_processed = 0usize;
337 let mut files_skipped = 0usize;
338
339 for file_path in &files {
341 match self.extract_file(&content_path, file_path) {
342 Ok(doc) => {
343 documents.push(doc);
344 }
345 Err(e) => {
346 let build_error = BuildError {
347 file: file_path.clone(),
348 message: e.to_string(),
349 };
350
351 match self.error_handling {
352 ErrorHandling::FailFast => return Err(e),
353 ErrorHandling::Collect => {
354 files_skipped += 1;
355 errors.push(build_error);
356 }
357 ErrorHandling::Skip => {
358 files_skipped += 1;
359 log::warn!("Skipping {}: {}", file_path.display(), build_error.message);
360 errors.push(build_error);
361 }
362 }
363 }
364 }
365 files_processed += 1;
366 }
367
368 let mut embedded_documents: Vec<EmbeddedDocument> = Vec::with_capacity(documents.len());
370
371 for chunk in documents.chunks(self.batch_size) {
372 let texts: Vec<&str> = chunk.iter().map(|d| d.text.as_str()).collect();
373 let embeddings = provider.embed_batch(&texts).await?;
374
375 for (doc, embedding) in chunk.iter().zip(embeddings.into_iter()) {
376 embedded_documents.push(EmbeddedDocument::new(doc.clone(), embedding));
377 }
378 }
379
380 let documents_indexed = embedded_documents.len();
381 let embedding_dimension = provider.dimension();
382 let content_hash = compute_content_hash(&content_path).await?;
383
384 backend.add_documents(embedded_documents);
385
386 let stats = VectorIndexStats {
387 documents_indexed,
388 files_processed,
389 files_skipped,
390 embedding_dimension,
391 content_hash,
392 build_duration_ms: start.elapsed().as_millis() as u64,
393 errors,
394 from_cache: false,
395 };
396
397 log::info!(
398 "Appended {} vector documents from {} ({} errors)",
399 documents_indexed,
400 content_path.display(),
401 stats.errors.len(),
402 );
403
404 Ok(stats)
405 }
406}
407
408async fn discover_files(base_path: &Path) -> Result<Vec<PathBuf>> {
414 use fabryk_core::util::files::{FindOptions, find_all_files};
415
416 let files = find_all_files(base_path, FindOptions::markdown()).await?;
417 let paths: Vec<PathBuf> = files.into_iter().map(|f| f.path).collect();
418
419 Ok(paths)
420}
421
422pub async fn compute_content_hash(content_path: &Path) -> Result<String> {
426 use fabryk_core::util::files::{FindOptions, find_all_files};
427
428 let files = find_all_files(content_path, FindOptions::markdown()).await?;
429
430 let mut hasher = blake3::Hasher::new();
431 let mut paths: Vec<PathBuf> = files.into_iter().map(|f| f.path).collect();
432 paths.sort(); for path in &paths {
435 if let Ok(content) = std::fs::read(path) {
436 hasher.update(path.to_string_lossy().as_bytes());
437 hasher.update(&content);
438 }
439 }
440
441 Ok(hasher.finalize().to_hex().to_string())
442}
443
#[cfg(test)]
mod tests {
    use super::*;
    use crate::backend::VectorBackend;
    use crate::embedding::MockEmbeddingProvider;
    use crate::extractor::MockVectorExtractor;
    use tempfile::tempdir;

    /// Creates a temp dir with a `content/` folder holding two markdown files.
    /// The `TempDir` guard is returned so the directory outlives the test body.
    async fn setup_test_files() -> (tempfile::TempDir, PathBuf) {
        let dir = tempdir().unwrap();
        let content_dir = dir.path().join("content");
        std::fs::create_dir(&content_dir).unwrap();

        let file_a =
            "---\ntitle: \"Concept A\"\ncategory: \"basics\"\n---\n\nContent for concept A.\n";
        let file_b = "---\ntitle: \"Concept B\"\ncategory: \"advanced\"\ntier: \"intermediate\"\n---\n\nContent for concept B.\n";

        std::fs::write(content_dir.join("concept-a.md"), file_a).unwrap();
        std::fs::write(content_dir.join("concept-b.md"), file_b).unwrap();

        (dir, content_dir)
    }

    /// A plain build indexes every file and reports matching statistics.
    #[tokio::test]
    async fn test_builder_basic() {
        let (_dir, content_dir) = setup_test_files().await;
        let provider = Arc::new(MockEmbeddingProvider::new(8));

        let (backend, stats) = VectorIndexBuilder::new(MockVectorExtractor)
            .with_content_path(&content_dir)
            .with_embedding_provider(provider)
            .build()
            .await
            .unwrap();

        assert_eq!(stats.files_processed, 2);
        assert_eq!(stats.documents_indexed, 2);
        assert_eq!(stats.embedding_dimension, 8);
        assert!(stats.errors.is_empty());
        assert_eq!(backend.document_count().unwrap(), 2);
    }

    /// The reported content hash is a non-empty hex string.
    #[tokio::test]
    async fn test_builder_content_hash() {
        let (_dir, content_dir) = setup_test_files().await;
        let provider = Arc::new(MockEmbeddingProvider::new(8));

        let (_, stats) = VectorIndexBuilder::new(MockVectorExtractor)
            .with_content_path(&content_dir)
            .with_embedding_provider(provider)
            .build()
            .await
            .unwrap();

        assert!(!stats.content_hash.is_empty());
        assert!(stats.content_hash.chars().all(|c| c.is_ascii_hexdigit()));
    }

    /// Hashing unchanged content twice yields the same value.
    #[tokio::test]
    async fn test_builder_content_hash_deterministic() {
        let (_dir, content_dir) = setup_test_files().await;

        let hash1 = compute_content_hash(&content_dir).await.unwrap();
        let hash2 = compute_content_hash(&content_dir).await.unwrap();

        assert_eq!(hash1, hash2);
    }

    /// Editing a file changes the content hash.
    #[tokio::test]
    async fn test_builder_content_hash_changes() {
        let dir = tempdir().unwrap();
        let content_dir = dir.path().join("content");
        std::fs::create_dir(&content_dir).unwrap();

        std::fs::write(
            content_dir.join("test.md"),
            "---\ntitle: Test\n---\nOriginal content",
        )
        .unwrap();

        let hash1 = compute_content_hash(&content_dir).await.unwrap();

        std::fs::write(
            content_dir.join("test.md"),
            "---\ntitle: Test\n---\nModified content",
        )
        .unwrap();

        let hash2 = compute_content_hash(&content_dir).await.unwrap();

        assert_ne!(hash1, hash2);
    }

    /// Building without a content path is a configuration error.
    #[tokio::test]
    async fn test_builder_missing_content_path() {
        let provider = Arc::new(MockEmbeddingProvider::new(8));

        let result = VectorIndexBuilder::new(MockVectorExtractor)
            .with_embedding_provider(provider)
            .build()
            .await;

        assert!(result.is_err());
    }

    /// Building without an embedding provider is a configuration error.
    #[tokio::test]
    async fn test_builder_missing_provider() {
        let dir = tempdir().unwrap();
        let content_dir = dir.path().join("content");
        std::fs::create_dir(&content_dir).unwrap();

        let result = VectorIndexBuilder::new(MockVectorExtractor)
            .with_content_path(&content_dir)
            .build()
            .await;

        assert!(result.is_err());
    }

    /// An empty content directory produces an empty index, not an error.
    #[tokio::test]
    async fn test_builder_empty_directory() {
        let dir = tempdir().unwrap();
        let content_dir = dir.path().join("empty");
        std::fs::create_dir(&content_dir).unwrap();
        let provider = Arc::new(MockEmbeddingProvider::new(8));

        let (backend, stats) = VectorIndexBuilder::new(MockVectorExtractor)
            .with_content_path(&content_dir)
            .with_embedding_provider(provider)
            .build()
            .await
            .unwrap();

        assert_eq!(stats.files_processed, 0);
        assert_eq!(stats.documents_indexed, 0);
        assert_eq!(backend.document_count().unwrap(), 0);
    }

    /// With `Collect`, a file without frontmatter does not abort the build.
    #[tokio::test]
    async fn test_builder_error_handling_collect() {
        let dir = tempdir().unwrap();
        let content_dir = dir.path().join("content");
        std::fs::create_dir(&content_dir).unwrap();

        std::fs::write(
            content_dir.join("valid.md"),
            "---\ntitle: Valid\n---\nContent",
        )
        .unwrap();
        std::fs::write(content_dir.join("invalid.md"), "not yaml frontmatter").unwrap();

        let provider = Arc::new(MockEmbeddingProvider::new(8));

        let (_, stats) = VectorIndexBuilder::new(MockVectorExtractor)
            .with_content_path(&content_dir)
            .with_embedding_provider(provider)
            .with_error_handling(ErrorHandling::Collect)
            .build()
            .await
            .unwrap();

        assert_eq!(stats.files_processed, 2);
        assert!(stats.documents_indexed >= 1);
    }

    /// A batch size smaller than the document count still indexes everything.
    #[tokio::test]
    async fn test_builder_batch_size() {
        let dir = tempdir().unwrap();
        let content_dir = dir.path().join("content");
        std::fs::create_dir(&content_dir).unwrap();

        for i in 0..5 {
            let content = format!("---\ntitle: \"Doc {i}\"\n---\n\nContent {i}.\n");
            std::fs::write(content_dir.join(format!("doc-{i}.md")), content).unwrap();
        }

        let provider = Arc::new(MockEmbeddingProvider::new(8));

        let (backend, stats) = VectorIndexBuilder::new(MockVectorExtractor)
            .with_content_path(&content_dir)
            .with_embedding_provider(provider)
            .with_batch_size(2)
            .build()
            .await
            .unwrap();

        assert_eq!(stats.documents_indexed, 5);
        assert_eq!(backend.document_count().unwrap(), 5);
    }

    /// The build duration is populated with a plausible value.
    #[tokio::test]
    async fn test_builder_build_duration_tracked() {
        let (_dir, content_dir) = setup_test_files().await;
        let provider = Arc::new(MockEmbeddingProvider::new(8));

        let (_, stats) = VectorIndexBuilder::new(MockVectorExtractor)
            .with_content_path(&content_dir)
            .with_embedding_provider(provider)
            .build()
            .await
            .unwrap();

        assert!(stats.build_duration_ms < 10_000);
    }

    /// A second build over unchanged content is served from the cache.
    #[tokio::test]
    async fn test_builder_cache_hit() {
        let (_dir, content_dir) = setup_test_files().await;
        let cache_path = content_dir.parent().unwrap().join("vector-cache.json");
        let provider = Arc::new(MockEmbeddingProvider::new(8));

        let (backend1, stats1) = VectorIndexBuilder::new(MockVectorExtractor)
            .with_content_path(&content_dir)
            .with_embedding_provider(provider.clone())
            .with_cache_path(&cache_path)
            .build()
            .await
            .unwrap();
        assert!(!stats1.from_cache);
        assert!(cache_path.exists());

        let (backend2, stats2) = VectorIndexBuilder::new(MockVectorExtractor)
            .with_content_path(&content_dir)
            .with_embedding_provider(provider)
            .with_cache_path(&cache_path)
            .build()
            .await
            .unwrap();
        assert!(stats2.from_cache);
        assert_eq!(
            backend1.document_count().unwrap(),
            backend2.document_count().unwrap()
        );
    }

    /// Adding a file invalidates the cache and triggers a full rebuild.
    #[tokio::test]
    async fn test_builder_cache_miss_on_content_change() {
        let (_dir, content_dir) = setup_test_files().await;
        let cache_path = content_dir.parent().unwrap().join("vector-cache.json");
        let provider = Arc::new(MockEmbeddingProvider::new(8));

        let (_, stats1) = VectorIndexBuilder::new(MockVectorExtractor)
            .with_content_path(&content_dir)
            .with_embedding_provider(provider.clone())
            .with_cache_path(&cache_path)
            .build()
            .await
            .unwrap();
        assert!(!stats1.from_cache);

        let file_c = "---\ntitle: \"Concept C\"\ncategory: \"new\"\n---\n\nConcept C content.\n";
        std::fs::write(content_dir.join("concept-c.md"), file_c).unwrap();

        let (backend, stats2) = VectorIndexBuilder::new(MockVectorExtractor)
            .with_content_path(&content_dir)
            .with_embedding_provider(provider)
            .with_cache_path(&cache_path)
            .build()
            .await
            .unwrap();
        assert!(!stats2.from_cache);
        assert_eq!(backend.document_count().unwrap(), 3);
    }

    /// `skip_cache` forces a rebuild even when a fresh cache exists.
    #[tokio::test]
    async fn test_builder_skip_cache() {
        let (_dir, content_dir) = setup_test_files().await;
        let cache_path = content_dir.parent().unwrap().join("vector-cache.json");
        let provider = Arc::new(MockEmbeddingProvider::new(8));

        VectorIndexBuilder::new(MockVectorExtractor)
            .with_content_path(&content_dir)
            .with_embedding_provider(provider.clone())
            .with_cache_path(&cache_path)
            .build()
            .await
            .unwrap();

        let (_, stats) = VectorIndexBuilder::new(MockVectorExtractor)
            .with_content_path(&content_dir)
            .with_embedding_provider(provider)
            .with_cache_path(&cache_path)
            .skip_cache()
            .build()
            .await
            .unwrap();
        assert!(!stats.from_cache);
        assert_eq!(stats.files_processed, 2);
    }

    /// Without a cache path the result never comes from the cache.
    #[tokio::test]
    async fn test_builder_no_cache_path() {
        let (_dir, content_dir) = setup_test_files().await;
        let provider = Arc::new(MockEmbeddingProvider::new(8));

        let (_, stats) = VectorIndexBuilder::new(MockVectorExtractor)
            .with_content_path(&content_dir)
            .with_embedding_provider(provider)
            .build()
            .await
            .unwrap();
        assert!(!stats.from_cache);
    }

    /// `build_append` adds the extracted documents to an existing backend.
    #[tokio::test]
    async fn test_builder_build_append() {
        let (_dir, content_dir) = setup_test_files().await;
        let provider = Arc::new(MockEmbeddingProvider::new(8));

        // Start from an empty backend built over a directory with no content.
        let empty_dir = content_dir.parent().unwrap().join("empty");
        std::fs::create_dir(&empty_dir).unwrap();
        let (mut backend, _) = VectorIndexBuilder::new(MockVectorExtractor)
            .with_content_path(&empty_dir)
            .with_embedding_provider(provider.clone())
            .build()
            .await
            .unwrap();
        assert_eq!(backend.document_count().unwrap(), 0);

        let stats = VectorIndexBuilder::new(MockVectorExtractor)
            .with_content_path(&content_dir)
            .with_embedding_provider(provider)
            .build_append(&mut backend)
            .await
            .unwrap();

        assert!(!stats.from_cache);
        assert_eq!(stats.files_processed, 2);
        assert_eq!(stats.documents_indexed, 2);
        assert_eq!(stats.embedding_dimension, 8);
        assert_eq!(backend.document_count().unwrap(), 2);
    }

    /// `build_append` rejects a missing content path and leaves the backend untouched.
    #[tokio::test]
    async fn test_builder_build_append_missing_content_path() {
        let dir = tempdir().unwrap();
        let empty_dir = dir.path().join("empty");
        std::fs::create_dir(&empty_dir).unwrap();
        let provider = Arc::new(MockEmbeddingProvider::new(8));

        let (mut backend, _) = VectorIndexBuilder::new(MockVectorExtractor)
            .with_content_path(&empty_dir)
            .with_embedding_provider(provider.clone())
            .build()
            .await
            .unwrap();

        let result = VectorIndexBuilder::new(MockVectorExtractor)
            .with_embedding_provider(provider)
            .build_append(&mut backend)
            .await;

        assert!(result.is_err());
        assert_eq!(backend.document_count().unwrap(), 0);
    }
}