1use crate::backend::{SimpleVectorBackend, VectorBackend};
18use crate::embedding::EmbeddingProvider;
19use crate::extractor::VectorExtractor;
20use crate::types::{BuildError, EmbeddedDocument, VectorDocument, VectorIndexStats};
21use fabryk_content::markdown::extract_frontmatter;
22use fabryk_core::{Error, Result};
23use std::path::{Path, PathBuf};
24use std::sync::Arc;
25use std::time::Instant;
26
/// Policy applied by the index builder when a single file fails extraction.
///
/// `PartialEq`/`Eq` are derived so the configured mode can be compared
/// directly (for example in assertions) instead of only being pattern-matched.
#[derive(Clone, Debug, Default, PartialEq, Eq)]
pub enum ErrorHandling {
    /// Abort the whole build and return the first extraction error (default).
    #[default]
    FailFast,
    /// Record the error in the build statistics and continue with the next file.
    Collect,
    /// Like [`ErrorHandling::Collect`], but additionally logs a warning for
    /// every file that is skipped.
    Skip,
}
42
/// Builder that scans a content directory, extracts one document per file,
/// embeds the documents in batches and loads them into a
/// [`SimpleVectorBackend`].
///
/// Configure it with the `with_*` methods and finish with `build` (creates a
/// new backend) or `build_append` (adds to an existing backend).
pub struct VectorIndexBuilder<E: VectorExtractor> {
    /// Turns a file's frontmatter and body into a `VectorDocument`.
    extractor: E,
    /// Root directory to scan; must be set before building.
    content_path: Option<PathBuf>,
    /// Embedding provider; must be set before building.
    provider: Option<Arc<dyn EmbeddingProvider>>,
    /// What to do when a single file fails extraction.
    error_handling: ErrorHandling,
    /// Number of document texts passed to each `embed_batch` call.
    batch_size: usize,
    /// Optional cache file that `build` may load from and writes to.
    cache_path: Option<PathBuf>,
    /// When `true`, `build` does not try to load the cache.
    skip_cache: bool,
}
76
77impl<E: VectorExtractor> VectorIndexBuilder<E> {
78 pub fn new(extractor: E) -> Self {
80 Self {
81 extractor,
82 content_path: None,
83 provider: None,
84 error_handling: ErrorHandling::default(),
85 batch_size: 64,
86 cache_path: None,
87 skip_cache: false,
88 }
89 }
90
91 pub fn with_content_path(mut self, path: impl Into<PathBuf>) -> Self {
93 self.content_path = Some(path.into());
94 self
95 }
96
97 pub fn with_embedding_provider(mut self, provider: Arc<dyn EmbeddingProvider>) -> Self {
99 self.provider = Some(provider);
100 self
101 }
102
103 pub fn with_error_handling(mut self, handling: ErrorHandling) -> Self {
105 self.error_handling = handling;
106 self
107 }
108
109 pub fn with_batch_size(mut self, size: usize) -> Self {
111 self.batch_size = size;
112 self
113 }
114
115 pub fn with_cache_path(mut self, path: impl Into<PathBuf>) -> Self {
122 self.cache_path = Some(path.into());
123 self
124 }
125
126 pub fn skip_cache(mut self) -> Self {
128 self.skip_cache = true;
129 self
130 }
131
132 pub async fn build(self) -> Result<(SimpleVectorBackend, VectorIndexStats)> {
144 let start = Instant::now();
145
146 let content_path = self
147 .content_path
148 .as_ref()
149 .ok_or_else(|| Error::config("Content path not set. Use with_content_path() first."))?
150 .clone();
151
152 let provider = self
153 .provider
154 .as_ref()
155 .ok_or_else(|| {
156 Error::config("Embedding provider not set. Use with_embedding_provider() first.")
157 })?
158 .clone();
159
160 if let Some(ref cache_path) = self.cache_path
162 && !self.skip_cache
163 {
164 let content_hash = compute_content_hash(&content_path).await?;
165 if SimpleVectorBackend::is_cache_fresh(cache_path, &content_hash)
166 && let Ok(Some(backend)) =
167 SimpleVectorBackend::load_cache(cache_path, provider.clone())
168 {
169 let doc_count = backend.document_count().unwrap_or(0);
170 log::info!(
171 "Vector cache is fresh, loaded {} documents from {}",
172 doc_count,
173 cache_path.display()
174 );
175 let stats = VectorIndexStats {
176 documents_indexed: doc_count,
177 files_processed: 0,
178 files_skipped: 0,
179 embedding_dimension: provider.dimension(),
180 content_hash,
181 build_duration_ms: start.elapsed().as_millis() as u64,
182 errors: Vec::new(),
183 from_cache: true,
184 };
185 return Ok((backend, stats));
186 }
187 }
188
189 let files = discover_files(&content_path).await?;
191
192 let mut errors: Vec<BuildError> = Vec::new();
193 let mut documents: Vec<VectorDocument> = Vec::new();
194 let mut files_processed = 0usize;
195 let mut files_skipped = 0usize;
196
197 for file_path in &files {
201 match self.extract_file(&content_path, file_path) {
202 Ok(doc) => {
203 documents.push(doc);
204 }
205 Err(e) => {
206 let build_error = BuildError {
207 file: file_path.clone(),
208 message: e.to_string(),
209 };
210
211 match self.error_handling {
212 ErrorHandling::FailFast => return Err(e),
213 ErrorHandling::Collect => {
214 files_skipped += 1;
215 errors.push(build_error);
216 }
217 ErrorHandling::Skip => {
218 files_skipped += 1;
219 log::warn!("Skipping {}: {}", file_path.display(), build_error.message);
220 errors.push(build_error);
221 }
222 }
223 }
224 }
225 files_processed += 1;
226 }
227
228 let mut embedded_documents: Vec<EmbeddedDocument> = Vec::with_capacity(documents.len());
232
233 for chunk in documents.chunks(self.batch_size) {
234 let texts: Vec<&str> = chunk.iter().map(|d| d.text.as_str()).collect();
235 let embeddings = provider.embed_batch(&texts).await?;
236
237 for (doc, embedding) in chunk.iter().zip(embeddings.into_iter()) {
238 embedded_documents.push(EmbeddedDocument::new(doc.clone(), embedding));
239 }
240 }
241
242 let documents_indexed = embedded_documents.len();
243 let embedding_dimension = provider.dimension();
244
245 let content_hash = compute_content_hash(&content_path).await?;
247
248 let mut backend = SimpleVectorBackend::new(provider);
250 backend.add_documents(embedded_documents);
251
252 let stats = VectorIndexStats {
253 documents_indexed,
254 files_processed,
255 files_skipped,
256 embedding_dimension,
257 content_hash: content_hash.clone(),
258 build_duration_ms: start.elapsed().as_millis() as u64,
259 errors,
260 from_cache: false,
261 };
262
263 if let Some(ref cache_path) = self.cache_path
265 && let Err(e) = backend.save_cache(cache_path, &content_hash)
266 {
267 log::warn!("Failed to save vector cache: {e}");
268 }
269
270 Ok((backend, stats))
271 }
272
273 fn extract_file(&self, base_path: &Path, file_path: &Path) -> Result<VectorDocument> {
275 let content =
276 std::fs::read_to_string(file_path).map_err(|e| Error::io_with_path(e, file_path))?;
277
278 let fm_result = extract_frontmatter(&content)?;
279
280 let frontmatter = fm_result
281 .value()
282 .cloned()
283 .unwrap_or(yaml_serde::Value::Null);
284 let body = fm_result.body();
285
286 self.extractor
287 .extract_document(base_path, file_path, &frontmatter, body)
288 }
289
290 pub async fn build_append(self, backend: &mut SimpleVectorBackend) -> Result<VectorIndexStats> {
315 let start = Instant::now();
316
317 let content_path = self
318 .content_path
319 .as_ref()
320 .ok_or_else(|| Error::config("Content path not set. Use with_content_path() first."))?
321 .clone();
322
323 let provider = self
324 .provider
325 .as_ref()
326 .ok_or_else(|| {
327 Error::config("Embedding provider not set. Use with_embedding_provider() first.")
328 })?
329 .clone();
330
331 let files = discover_files(&content_path).await?;
332
333 let mut errors: Vec<BuildError> = Vec::new();
334 let mut documents: Vec<VectorDocument> = Vec::new();
335 let mut files_processed = 0usize;
336 let mut files_skipped = 0usize;
337
338 for file_path in &files {
340 match self.extract_file(&content_path, file_path) {
341 Ok(doc) => {
342 documents.push(doc);
343 }
344 Err(e) => {
345 let build_error = BuildError {
346 file: file_path.clone(),
347 message: e.to_string(),
348 };
349
350 match self.error_handling {
351 ErrorHandling::FailFast => return Err(e),
352 ErrorHandling::Collect => {
353 files_skipped += 1;
354 errors.push(build_error);
355 }
356 ErrorHandling::Skip => {
357 files_skipped += 1;
358 log::warn!("Skipping {}: {}", file_path.display(), build_error.message);
359 errors.push(build_error);
360 }
361 }
362 }
363 }
364 files_processed += 1;
365 }
366
367 let mut embedded_documents: Vec<EmbeddedDocument> = Vec::with_capacity(documents.len());
369
370 for chunk in documents.chunks(self.batch_size) {
371 let texts: Vec<&str> = chunk.iter().map(|d| d.text.as_str()).collect();
372 let embeddings = provider.embed_batch(&texts).await?;
373
374 for (doc, embedding) in chunk.iter().zip(embeddings.into_iter()) {
375 embedded_documents.push(EmbeddedDocument::new(doc.clone(), embedding));
376 }
377 }
378
379 let documents_indexed = embedded_documents.len();
380 let embedding_dimension = provider.dimension();
381 let content_hash = compute_content_hash(&content_path).await?;
382
383 backend.add_documents(embedded_documents);
384
385 let stats = VectorIndexStats {
386 documents_indexed,
387 files_processed,
388 files_skipped,
389 embedding_dimension,
390 content_hash,
391 build_duration_ms: start.elapsed().as_millis() as u64,
392 errors,
393 from_cache: false,
394 };
395
396 log::info!(
397 "Appended {} vector documents from {} ({} errors)",
398 documents_indexed,
399 content_path.display(),
400 stats.errors.len(),
401 );
402
403 Ok(stats)
404 }
405}
406
407async fn discover_files(base_path: &Path) -> Result<Vec<PathBuf>> {
413 use fabryk_core::util::files::{FindOptions, find_all_files};
414
415 let files = find_all_files(base_path, FindOptions::markdown()).await?;
416 let paths: Vec<PathBuf> = files.into_iter().map(|f| f.path).collect();
417
418 Ok(paths)
419}
420
421pub async fn compute_content_hash(content_path: &Path) -> Result<String> {
425 use fabryk_core::util::files::{FindOptions, find_all_files};
426
427 let files = find_all_files(content_path, FindOptions::markdown()).await?;
428
429 let mut hasher = blake3::Hasher::new();
430 let mut paths: Vec<PathBuf> = files.into_iter().map(|f| f.path).collect();
431 paths.sort(); for path in &paths {
434 if let Ok(content) = std::fs::read(path) {
435 hasher.update(path.to_string_lossy().as_bytes());
436 hasher.update(&content);
437 }
438 }
439
440 Ok(hasher.finalize().to_hex().to_string())
441}
442
#[cfg(test)]
mod tests {
    use super::*;
    use crate::backend::VectorBackend;
    use crate::embedding::MockEmbeddingProvider;
    use crate::extractor::MockVectorExtractor;
    use tempfile::tempdir;

    /// Creates a temporary root with an empty sub-directory called `name`.
    ///
    /// The `TempDir` guard is returned so the directory lives as long as the
    /// test keeps it bound.
    fn new_content_dir(name: &str) -> (tempfile::TempDir, PathBuf) {
        let root = tempdir().unwrap();
        let content_dir = root.path().join(name);
        std::fs::create_dir(&content_dir).unwrap();
        (root, content_dir)
    }

    /// Writes `text` to `file_name` inside `dir`.
    fn write_file(dir: &Path, file_name: &str, text: &str) {
        std::fs::write(dir.join(file_name), text).unwrap();
    }

    /// Content directory holding the two standard fixture documents.
    fn setup_test_files() -> (tempfile::TempDir, PathBuf) {
        let (root, content_dir) = new_content_dir("content");

        write_file(
            &content_dir,
            "concept-a.md",
            "---\ntitle: \"Concept A\"\ncategory: \"basics\"\n---\n\nContent for concept A.\n",
        );
        write_file(
            &content_dir,
            "concept-b.md",
            "---\ntitle: \"Concept B\"\ncategory: \"advanced\"\ntier: \"intermediate\"\n---\n\nContent for concept B.\n",
        );

        (root, content_dir)
    }

    /// Mock provider with the embedding dimension used throughout these tests.
    fn mock_provider() -> Arc<MockEmbeddingProvider> {
        Arc::new(MockEmbeddingProvider::new(8))
    }

    /// Builder with the two mandatory settings already applied.
    fn builder_for(
        content_dir: &Path,
        provider: Arc<MockEmbeddingProvider>,
    ) -> VectorIndexBuilder<MockVectorExtractor> {
        VectorIndexBuilder::new(MockVectorExtractor)
            .with_content_path(content_dir)
            .with_embedding_provider(provider)
    }

    /// Cache file located next to (not inside) the content directory.
    fn cache_file_beside(content_dir: &Path) -> PathBuf {
        content_dir.parent().unwrap().join("vector-cache.json")
    }

    #[tokio::test]
    async fn test_builder_basic() {
        let (_root, content_dir) = setup_test_files();

        let (backend, stats) = builder_for(&content_dir, mock_provider())
            .build()
            .await
            .unwrap();

        assert_eq!(stats.files_processed, 2);
        assert_eq!(stats.documents_indexed, 2);
        assert_eq!(stats.embedding_dimension, 8);
        assert!(stats.errors.is_empty());
        assert_eq!(backend.document_count().unwrap(), 2);
    }

    #[tokio::test]
    async fn test_builder_content_hash() {
        let (_root, content_dir) = setup_test_files();

        let (_, stats) = builder_for(&content_dir, mock_provider())
            .build()
            .await
            .unwrap();

        // The hash must be a non-empty hex string.
        assert!(!stats.content_hash.is_empty());
        assert!(stats.content_hash.chars().all(|c| c.is_ascii_hexdigit()));
    }

    #[tokio::test]
    async fn test_builder_content_hash_deterministic() {
        let (_root, content_dir) = setup_test_files();

        let first = compute_content_hash(&content_dir).await.unwrap();
        let second = compute_content_hash(&content_dir).await.unwrap();

        assert_eq!(first, second);
    }

    #[tokio::test]
    async fn test_builder_content_hash_changes() {
        let (_root, content_dir) = new_content_dir("content");

        write_file(
            &content_dir,
            "test.md",
            "---\ntitle: Test\n---\nOriginal content",
        );
        let before = compute_content_hash(&content_dir).await.unwrap();

        write_file(
            &content_dir,
            "test.md",
            "---\ntitle: Test\n---\nModified content",
        );
        let after = compute_content_hash(&content_dir).await.unwrap();

        assert_ne!(before, after);
    }

    #[tokio::test]
    async fn test_builder_missing_content_path() {
        let outcome = VectorIndexBuilder::new(MockVectorExtractor)
            .with_embedding_provider(mock_provider())
            .build()
            .await;

        assert!(outcome.is_err());
    }

    #[tokio::test]
    async fn test_builder_missing_provider() {
        let (_root, content_dir) = new_content_dir("content");

        let outcome = VectorIndexBuilder::new(MockVectorExtractor)
            .with_content_path(&content_dir)
            .build()
            .await;

        assert!(outcome.is_err());
    }

    #[tokio::test]
    async fn test_builder_empty_directory() {
        let (_root, content_dir) = new_content_dir("empty");

        let (backend, stats) = builder_for(&content_dir, mock_provider())
            .build()
            .await
            .unwrap();

        assert_eq!(stats.files_processed, 0);
        assert_eq!(stats.documents_indexed, 0);
        assert_eq!(backend.document_count().unwrap(), 0);
    }

    #[tokio::test]
    async fn test_builder_error_handling_collect() {
        let (_root, content_dir) = new_content_dir("content");

        write_file(&content_dir, "valid.md", "---\ntitle: Valid\n---\nContent");
        write_file(&content_dir, "invalid.md", "not yaml frontmatter");

        let (_, stats) = builder_for(&content_dir, mock_provider())
            .with_error_handling(ErrorHandling::Collect)
            .build()
            .await
            .unwrap();

        assert_eq!(stats.files_processed, 2);
        assert!(stats.documents_indexed >= 1);
    }

    #[tokio::test]
    async fn test_builder_batch_size() {
        let (_root, content_dir) = new_content_dir("content");

        // Five documents with a batch size of two: the last batch is partial.
        (0..5).for_each(|i| {
            let content = format!("---\ntitle: \"Doc {i}\"\n---\n\nContent {i}.\n");
            write_file(&content_dir, &format!("doc-{i}.md"), &content);
        });

        let (backend, stats) = builder_for(&content_dir, mock_provider())
            .with_batch_size(2)
            .build()
            .await
            .unwrap();

        assert_eq!(stats.documents_indexed, 5);
        assert_eq!(backend.document_count().unwrap(), 5);
    }

    #[tokio::test]
    async fn test_builder_build_duration_tracked() {
        let (_root, content_dir) = setup_test_files();

        let (_, stats) = builder_for(&content_dir, mock_provider())
            .build()
            .await
            .unwrap();

        assert!(stats.build_duration_ms < 10_000);
    }

    #[tokio::test]
    async fn test_builder_cache_hit() {
        let (_root, content_dir) = setup_test_files();
        let cache_path = cache_file_beside(&content_dir);
        let provider = mock_provider();

        // The first build has nothing to load and must write the cache.
        let (built, first_stats) = builder_for(&content_dir, Arc::clone(&provider))
            .with_cache_path(&cache_path)
            .build()
            .await
            .unwrap();
        assert!(!first_stats.from_cache);
        assert!(cache_path.exists());

        // The second build sees unchanged content and loads the cache.
        let (loaded, second_stats) = builder_for(&content_dir, provider)
            .with_cache_path(&cache_path)
            .build()
            .await
            .unwrap();
        assert!(second_stats.from_cache);
        assert_eq!(
            built.document_count().unwrap(),
            loaded.document_count().unwrap()
        );
    }

    #[tokio::test]
    async fn test_builder_cache_miss_on_content_change() {
        let (_root, content_dir) = setup_test_files();
        let cache_path = cache_file_beside(&content_dir);
        let provider = mock_provider();

        let (_, first_stats) = builder_for(&content_dir, Arc::clone(&provider))
            .with_cache_path(&cache_path)
            .build()
            .await
            .unwrap();
        assert!(!first_stats.from_cache);

        // A new file changes the content hash, so the cache is stale.
        write_file(
            &content_dir,
            "concept-c.md",
            "---\ntitle: \"Concept C\"\ncategory: \"new\"\n---\n\nConcept C content.\n",
        );

        let (backend, second_stats) = builder_for(&content_dir, provider)
            .with_cache_path(&cache_path)
            .build()
            .await
            .unwrap();
        assert!(!second_stats.from_cache);
        assert_eq!(backend.document_count().unwrap(), 3);
    }

    #[tokio::test]
    async fn test_builder_skip_cache() {
        let (_root, content_dir) = setup_test_files();
        let cache_path = cache_file_beside(&content_dir);
        let provider = mock_provider();

        // Populate the cache.
        builder_for(&content_dir, Arc::clone(&provider))
            .with_cache_path(&cache_path)
            .build()
            .await
            .unwrap();

        // `skip_cache` must rebuild even though the cache is fresh.
        let (_, stats) = builder_for(&content_dir, provider)
            .with_cache_path(&cache_path)
            .skip_cache()
            .build()
            .await
            .unwrap();
        assert!(!stats.from_cache);
        assert_eq!(stats.files_processed, 2);
    }

    #[tokio::test]
    async fn test_builder_no_cache_path() {
        let (_root, content_dir) = setup_test_files();

        let (_, stats) = builder_for(&content_dir, mock_provider())
            .build()
            .await
            .unwrap();
        assert!(!stats.from_cache);
    }
}