Skip to main content

fabryk_vector/
types.rs

1//! Common types for the vector search module.
2//!
3//! These types are used across all vector backends and embedding providers,
4//! and are always available regardless of feature flags.
5
6use serde::{Deserialize, Serialize};
7use std::collections::HashMap;
8use std::path::PathBuf;
9
10// ============================================================================
11// Configuration
12// ============================================================================
13
/// Vector search configuration.
///
/// Controls backend selection, embedding model, storage paths, and behavior.
///
/// Fields carrying a `#[serde(default ...)]` attribute may be left out of the
/// serialized form; they then take the value noted on the field, which is the
/// same value the `Default` implementation of this type produces.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct VectorConfig {
    /// Backend type: "lancedb" or "simple".
    ///
    /// Stored as a free-form string; falls back to "lancedb" when omitted.
    #[serde(default = "default_backend")]
    pub backend: String,

    /// Embedding provider: "fastembed" or "mock".
    ///
    /// Stored as a free-form string; falls back to "fastembed" when omitted.
    #[serde(default = "default_provider")]
    pub provider: String,

    /// Embedding model name (e.g., "bge-small-en-v1.5").
    ///
    /// Falls back to "bge-small-en-v1.5" when omitted.
    #[serde(default = "default_model")]
    pub model: String,

    /// Embedding dimension (auto-detected if 0).
    ///
    /// Falls back to 0 when omitted.
    #[serde(default)]
    pub dimension: usize,

    /// Path to the vector database directory.
    pub db_path: Option<String>,

    /// Path to content for indexing.
    pub content_path: Option<String>,

    /// Path to cache directory for embedding models.
    pub cache_path: Option<String>,

    /// Whether vector search is enabled.
    ///
    /// Falls back to `true` when omitted.
    #[serde(default = "default_true")]
    pub enabled: bool,

    /// Default search result limit.
    ///
    /// Falls back to 10 when omitted.
    #[serde(default = "default_limit")]
    pub default_limit: usize,

    /// Default similarity threshold (0.0 to 1.0).
    ///
    /// Falls back to 0.0 when omitted. The range is not validated by this
    /// type.
    #[serde(default = "default_threshold")]
    pub similarity_threshold: f32,

    /// Batch size for embedding operations.
    ///
    /// Falls back to 64 when omitted.
    #[serde(default = "default_batch_size")]
    pub batch_size: usize,
}
60
/// Fallback for `VectorConfig::backend` when the field is absent.
fn default_backend() -> String {
    String::from("lancedb")
}
64
/// Fallback for `VectorConfig::provider` when the field is absent.
fn default_provider() -> String {
    String::from("fastembed")
}
68
/// Fallback for `VectorConfig::model` when the field is absent.
fn default_model() -> String {
    String::from("bge-small-en-v1.5")
}
72
/// Fallback for boolean settings that are on unless explicitly turned off
/// (used by `VectorConfig::enabled`).
fn default_true() -> bool {
    const ON: bool = true;
    ON
}
76
/// Fallback for `VectorConfig::default_limit` when the field is absent.
fn default_limit() -> usize {
    const LIMIT: usize = 10;
    LIMIT
}
80
/// Fallback for `VectorConfig::similarity_threshold` when the field is absent.
fn default_threshold() -> f32 {
    const THRESHOLD: f32 = 0.0;
    THRESHOLD
}
84
/// Fallback for `VectorConfig::batch_size` when the field is absent.
fn default_batch_size() -> usize {
    const BATCH: usize = 64;
    BATCH
}
88
89impl Default for VectorConfig {
90    fn default() -> Self {
91        Self {
92            backend: default_backend(),
93            provider: default_provider(),
94            model: default_model(),
95            dimension: 0,
96            db_path: None,
97            content_path: None,
98            cache_path: None,
99            enabled: default_true(),
100            default_limit: default_limit(),
101            similarity_threshold: default_threshold(),
102            batch_size: default_batch_size(),
103        }
104    }
105}
106
107// ============================================================================
108// Documents
109// ============================================================================
110
/// A document prepared for vector embedding.
///
/// Domain-agnostic representation: domains compose the `text` field with
/// whatever content should be embedded (title, description, body, etc.).
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct VectorDocument {
    /// Unique document identifier.
    pub id: String,

    /// Text to be embedded (pre-composed by the domain extractor).
    pub text: String,

    /// Optional category for filtering.
    ///
    /// Left out of the serialized form when `None`.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub category: Option<String>,

    /// Arbitrary metadata key-value pairs.
    ///
    /// Left out of the serialized form when empty; a missing field
    /// deserializes to an empty map.
    #[serde(default, skip_serializing_if = "HashMap::is_empty")]
    pub metadata: HashMap<String, String>,
}
131
132impl VectorDocument {
133    /// Create a new vector document.
134    pub fn new(id: impl Into<String>, text: impl Into<String>) -> Self {
135        Self {
136            id: id.into(),
137            text: text.into(),
138            category: None,
139            metadata: HashMap::new(),
140        }
141    }
142
143    /// Set the category.
144    pub fn with_category(mut self, category: impl Into<String>) -> Self {
145        self.category = Some(category.into());
146        self
147    }
148
149    /// Add a metadata key-value pair.
150    pub fn with_metadata(mut self, key: impl Into<String>, value: impl Into<String>) -> Self {
151        self.metadata.insert(key.into(), value.into());
152        self
153    }
154}
155
/// A document with its computed embedding vector.
///
/// Nothing in this type checks the embedding length against a configured
/// dimension; [`EmbeddedDocument::dimension`] simply reports the vector's
/// length.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct EmbeddedDocument {
    /// The original document.
    pub document: VectorDocument,

    /// The embedding vector.
    pub embedding: Vec<f32>,
}
165
166impl EmbeddedDocument {
167    /// Create a new embedded document.
168    pub fn new(document: VectorDocument, embedding: Vec<f32>) -> Self {
169        Self {
170            document,
171            embedding,
172        }
173    }
174
175    /// The embedding dimension.
176    pub fn dimension(&self) -> usize {
177        self.embedding.len()
178    }
179}
180
181// ============================================================================
182// Search types
183// ============================================================================
184
/// Parameters for a vector search request.
///
/// The derived `Default` yields an empty query with no limit, threshold,
/// category, or metadata filters.
#[derive(Debug, Clone, Default, Serialize, Deserialize)]
pub struct VectorSearchParams {
    /// Search query string (will be embedded).
    pub query: String,

    /// Maximum results to return.
    ///
    /// Left out of the serialized form when `None`.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub limit: Option<usize>,

    /// Minimum similarity score (0.0 to 1.0).
    ///
    /// Left out of the serialized form when `None`. The range is not
    /// validated by this type.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub similarity_threshold: Option<f32>,

    /// Filter by category.
    ///
    /// Left out of the serialized form when `None`.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub category: Option<String>,

    /// Metadata filters as key-value pairs.
    ///
    /// Left out of the serialized form when empty; a missing field
    /// deserializes to an empty map.
    #[serde(default, skip_serializing_if = "HashMap::is_empty")]
    pub metadata_filters: HashMap<String, String>,
}
207
208impl VectorSearchParams {
209    /// Create search params with a query string.
210    pub fn new(query: impl Into<String>) -> Self {
211        Self {
212            query: query.into(),
213            ..Default::default()
214        }
215    }
216
217    /// Set the result limit.
218    pub fn with_limit(mut self, limit: usize) -> Self {
219        self.limit = Some(limit);
220        self
221    }
222
223    /// Set the similarity threshold.
224    pub fn with_threshold(mut self, threshold: f32) -> Self {
225        self.similarity_threshold = Some(threshold);
226        self
227    }
228
229    /// Set a category filter.
230    pub fn with_category(mut self, category: impl Into<String>) -> Self {
231        self.category = Some(category.into());
232        self
233    }
234
235    /// Add a metadata filter.
236    pub fn with_filter(mut self, key: impl Into<String>, value: impl Into<String>) -> Self {
237        self.metadata_filters.insert(key.into(), value.into());
238        self
239    }
240}
241
/// A single vector search result.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct VectorSearchResult {
    /// Document identifier.
    pub id: String,

    /// Similarity score (0.0 to 1.0, higher is more similar).
    pub score: f32,

    /// Raw distance from the query vector.
    ///
    /// NOTE(review): the distance metric and its relation to `score` are not
    /// defined in this file; presumably backend-specific, confirm against
    /// the backend implementations.
    pub distance: f32,

    /// Metadata snapshot from the indexed document.
    ///
    /// Left out of the serialized form when empty; a missing field
    /// deserializes to an empty map.
    #[serde(default, skip_serializing_if = "HashMap::is_empty")]
    pub metadata: HashMap<String, String>,
}
258
/// Collection of vector search results.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct VectorSearchResults {
    /// Search result items, ordered by score (highest first).
    ///
    /// The ordering is a convention for producers; this type does not sort
    /// or verify it.
    pub items: Vec<VectorSearchResult>,

    /// Total number of matching documents.
    ///
    /// NOTE(review): presumably this can differ from `items.len()` when a
    /// limit truncates the results; confirm against the backends.
    pub total: usize,

    /// Backend that executed the search.
    pub backend: String,
}
271
272impl VectorSearchResults {
273    /// Create empty results.
274    pub fn empty(backend: &str) -> Self {
275        Self {
276            items: Vec::new(),
277            total: 0,
278            backend: backend.to_string(),
279        }
280    }
281}
282
283// ============================================================================
284// Index statistics
285// ============================================================================
286
/// Statistics from a vector index build operation.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct VectorIndexStats {
    /// Number of documents indexed.
    pub documents_indexed: usize,

    /// Number of files processed.
    pub files_processed: usize,

    /// Number of files skipped due to errors.
    pub files_skipped: usize,

    /// Embedding dimension used.
    pub embedding_dimension: usize,

    /// Content hash for freshness checking.
    ///
    /// Stored as an opaque string; the hashing scheme is not defined in this
    /// file.
    pub content_hash: String,

    /// Build duration in milliseconds.
    pub build_duration_ms: u64,

    /// Errors encountered (if not fail-fast).
    ///
    /// Left out of the serialized form when empty; a missing field
    /// deserializes to an empty list.
    #[serde(default, skip_serializing_if = "Vec::is_empty")]
    pub errors: Vec<BuildError>,

    /// Whether the result was loaded from cache.
    ///
    /// Deserializes to `false` when the field is missing.
    #[serde(default)]
    pub from_cache: bool,
}
316
/// An error that occurred during vector index building.
///
/// Plain data record collected in [`VectorIndexStats::errors`]; it does not
/// implement `std::error::Error`.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct BuildError {
    /// Path to the problematic file.
    pub file: PathBuf,
    /// Error message.
    pub message: String,
}
325
326// ============================================================================
327// Tests
328// ============================================================================
329
#[cfg(test)]
mod tests {
    use super::*;

    // ------------------------------------------------------------------------
    // VectorConfig tests
    // ------------------------------------------------------------------------

    /// `Default` yields the documented out-of-the-box settings.
    #[test]
    fn test_vector_config_default() {
        let cfg = VectorConfig::default();

        // Backend, provider, and model selection.
        assert_eq!(cfg.backend, "lancedb");
        assert_eq!(cfg.provider, "fastembed");
        assert_eq!(cfg.model, "bge-small-en-v1.5");
        assert_eq!(cfg.dimension, 0);

        // No paths are configured by default.
        assert_eq!(cfg.db_path, None);
        assert_eq!(cfg.content_path, None);
        assert_eq!(cfg.cache_path, None);

        // Behavioral knobs.
        assert!(cfg.enabled);
        assert_eq!(cfg.default_limit, 10);
        assert_eq!(cfg.similarity_threshold, 0.0);
        assert_eq!(cfg.batch_size, 64);
    }

    /// Explicitly set fields show up in the JSON output.
    #[test]
    fn test_vector_config_serialization() {
        let cfg = VectorConfig {
            backend: String::from("lancedb"),
            db_path: Some(String::from("/tmp/vectors")),
            ..VectorConfig::default()
        };

        let serialized = serde_json::to_string(&cfg).unwrap();
        assert!(serialized.contains("\"backend\":\"lancedb\""));
        assert!(serialized.contains("\"/tmp/vectors\""));
    }

    /// Fields missing from the input fall back to their serde defaults.
    #[test]
    fn test_vector_config_deserialization_with_defaults() {
        let cfg: VectorConfig = serde_json::from_str(r#"{"backend": "lancedb"}"#).unwrap();

        assert_eq!(cfg.backend, "lancedb");
        assert_eq!(cfg.default_limit, 10);
        assert!(cfg.enabled);
        assert_eq!(cfg.batch_size, 64);
    }

    // ------------------------------------------------------------------------
    // VectorDocument tests
    // ------------------------------------------------------------------------

    /// A fresh document carries only its id and text.
    #[test]
    fn test_vector_document_new() {
        let document = VectorDocument::new("doc-1", "Hello world");

        assert_eq!(document.id, "doc-1");
        assert_eq!(document.text, "Hello world");
        assert_eq!(document.category, None);
        assert_eq!(document.metadata.len(), 0);
    }

    /// `with_category` stores the given category.
    #[test]
    fn test_vector_document_with_category() {
        let document = VectorDocument::new("doc-1", "text").with_category("harmony");

        assert_eq!(document.category.as_deref(), Some("harmony"));
    }

    /// `with_metadata` accumulates key-value pairs.
    #[test]
    fn test_vector_document_with_metadata() {
        let document = VectorDocument::new("doc-1", "text")
            .with_metadata("author", "test")
            .with_metadata("tier", "beginner");
        let lookup = |key: &str| document.metadata.get(key).map(String::as_str);

        assert_eq!(document.metadata.len(), 2);
        assert_eq!(lookup("author"), Some("test"));
        assert_eq!(lookup("tier"), Some("beginner"));
    }

    /// A document survives a JSON round trip.
    #[test]
    fn test_vector_document_serialization() {
        let original = VectorDocument::new("doc-1", "text content")
            .with_category("test")
            .with_metadata("key", "value");

        let serialized = serde_json::to_string(&original).unwrap();
        assert!(serialized.contains("doc-1"));
        assert!(serialized.contains("text content"));
        assert!(serialized.contains("test"));

        let restored: VectorDocument = serde_json::from_str(&serialized).unwrap();
        assert_eq!(restored.id, "doc-1");
        assert_eq!(restored.text, "text content");
        assert_eq!(restored.category.as_deref(), Some("test"));
    }

    /// Unset optional fields are left out of the JSON entirely.
    #[test]
    fn test_vector_document_serialization_skips_empty() {
        let serialized = serde_json::to_string(&VectorDocument::new("doc-1", "text")).unwrap();

        // category and metadata should be omitted when empty/None
        assert!(!serialized.contains("category"));
        assert!(!serialized.contains("metadata"));
    }

    // ------------------------------------------------------------------------
    // EmbeddedDocument tests
    // ------------------------------------------------------------------------

    /// The constructor keeps both parts and `dimension` reports the length.
    #[test]
    fn test_embedded_document_new() {
        let embedded =
            EmbeddedDocument::new(VectorDocument::new("doc-1", "text"), vec![0.1, 0.2, 0.3]);

        assert_eq!(embedded.document.id, "doc-1");
        assert_eq!(embedded.embedding.len(), 3);
        assert_eq!(embedded.dimension(), 3);
    }

    // ------------------------------------------------------------------------
    // VectorSearchParams tests
    // ------------------------------------------------------------------------

    /// Default params are completely empty.
    #[test]
    fn test_search_params_default() {
        let defaults = VectorSearchParams::default();

        assert_eq!(defaults.query, "");
        assert_eq!(defaults.limit, None);
        assert_eq!(defaults.similarity_threshold, None);
        assert_eq!(defaults.category, None);
        assert_eq!(defaults.metadata_filters.len(), 0);
    }

    /// Each builder method sets exactly the field it names.
    #[test]
    fn test_search_params_builder() {
        let built = VectorSearchParams::new("semantic query")
            .with_limit(5)
            .with_threshold(0.5)
            .with_category("harmony")
            .with_filter("tier", "advanced");

        assert_eq!(built.query, "semantic query");
        assert_eq!(built.limit, Some(5));
        assert_eq!(built.similarity_threshold, Some(0.5));
        assert_eq!(built.category.as_deref(), Some("harmony"));
        assert_eq!(
            built.metadata_filters.get("tier").map(String::as_str),
            Some("advanced")
        );
    }

    /// Set fields are serialized; unset optional fields are skipped.
    #[test]
    fn test_search_params_serialization() {
        let with_limit = serde_json::to_string(&VectorSearchParams::new("test query").with_limit(10))
            .unwrap();
        assert!(with_limit.contains("test query"));
        assert!(with_limit.contains("10"));

        // Optional None fields should be skipped
        let query_only = serde_json::to_string(&VectorSearchParams::new("q")).unwrap();
        assert!(!query_only.contains("limit"));
        assert!(!query_only.contains("similarity_threshold"));
    }

    // ------------------------------------------------------------------------
    // VectorSearchResult tests
    // ------------------------------------------------------------------------

    /// Id and score appear in the JSON output.
    #[test]
    fn test_search_result_serialization() {
        let mut metadata = HashMap::new();
        metadata.insert("category".to_string(), "harmony".to_string());

        let hit = VectorSearchResult {
            id: String::from("doc-1"),
            score: 0.85,
            distance: 0.176,
            metadata,
        };

        let serialized = serde_json::to_string(&hit).unwrap();
        assert!(serialized.contains("doc-1"));
        assert!(serialized.contains("0.85"));
    }

    /// An empty metadata map is left out of the JSON output.
    #[test]
    fn test_search_result_empty_metadata_skipped() {
        let hit = VectorSearchResult {
            id: String::from("doc-1"),
            score: 0.5,
            distance: 1.0,
            metadata: HashMap::new(),
        };

        let serialized = serde_json::to_string(&hit).unwrap();
        assert!(!serialized.contains("metadata"));
    }

    // ------------------------------------------------------------------------
    // VectorSearchResults tests
    // ------------------------------------------------------------------------

    /// `empty` produces no items, a zero total, and the given backend name.
    #[test]
    fn test_search_results_empty() {
        let VectorSearchResults {
            items,
            total,
            backend,
        } = VectorSearchResults::empty("lancedb");

        assert!(items.is_empty());
        assert_eq!(total, 0);
        assert_eq!(backend, "lancedb");
    }

    // ------------------------------------------------------------------------
    // VectorIndexStats tests
    // ------------------------------------------------------------------------

    /// Counters and hash are serialized; an empty error list is omitted.
    #[test]
    fn test_index_stats_serialization() {
        let clean_build = VectorIndexStats {
            documents_indexed: 100,
            files_processed: 50,
            files_skipped: 2,
            embedding_dimension: 384,
            content_hash: String::from("abc123"),
            build_duration_ms: 1500,
            errors: Vec::new(),
            from_cache: false,
        };

        let serialized = serde_json::to_string(&clean_build).unwrap();
        assert!(serialized.contains("100"));
        assert!(serialized.contains("384"));
        assert!(serialized.contains("abc123"));

        // Empty errors should be omitted
        assert!(!serialized.contains("errors"));
    }

    /// A non-empty error list is serialized along with its messages.
    #[test]
    fn test_index_stats_with_errors() {
        let failure = BuildError {
            file: PathBuf::from("/test/bad.md"),
            message: String::from("parse error"),
        };
        let partial_build = VectorIndexStats {
            documents_indexed: 10,
            files_processed: 12,
            files_skipped: 2,
            embedding_dimension: 384,
            content_hash: String::from("hash"),
            build_duration_ms: 500,
            errors: vec![failure],
            from_cache: false,
        };

        let serialized = serde_json::to_string(&partial_build).unwrap();
        assert!(serialized.contains("errors"));
        assert!(serialized.contains("parse error"));
    }
}
583}