//! graphrag_core/vector/mod.rs
//!
//! Vector index, hash-based embedding generation, and vector math utilities.
1use crate::{GraphRAGError, Result};
2#[cfg(feature = "parallel-processing")]
3use crate::parallel::ParallelProcessor;
4use std::collections::hash_map::DefaultHasher;
5use std::collections::HashMap;
6use std::hash::{Hash, Hasher};
7
8#[cfg(feature = "vector-hnsw")]
9use instant_distance::{Builder, Point, Search};
10
11// Voy vector store module (WASM-optimized)
12// TODO: Re-enable when voy crate is properly configured
13// #[cfg(feature = "wasm")]
14// pub mod voy_store;
15
16// #[cfg(feature = "wasm")]
17// pub use voy_store::{VoyStore, VoyStoreStatistics};
18
/// Wrapper for Vec<f32> to implement Point trait for vector operations
#[derive(Debug, Clone, PartialEq)]
pub struct Vector(Vec<f32>);

impl Vector {
    /// Construct a vector wrapper around raw `f32` components.
    pub fn new(vector_data: Vec<f32>) -> Self {
        Vector(vector_data)
    }

    /// Borrow the underlying components as a slice.
    pub fn as_slice(&self) -> &[f32] {
        self.0.as_slice()
    }
}
34
#[cfg(feature = "vector-hnsw")]
impl Point for Vector {
    /// Euclidean (L2) distance; vectors of different lengths are treated as
    /// infinitely far apart.
    fn distance(&self, other: &Self) -> f32 {
        if self.0.len() != other.0.len() {
            return f32::INFINITY;
        }

        let squared_sum: f32 = self
            .0
            .iter()
            .zip(other.0.iter())
            .map(|(lhs, rhs)| (lhs - rhs).powi(2))
            .sum();
        squared_sum.sqrt()
    }
}
51
/// Vector index for semantic search
pub struct VectorIndex {
    /// HNSW index over all inserted vectors; `None` until `build_index` succeeds.
    #[cfg(feature = "vector-hnsw")]
    index: Option<instant_distance::HnswMap<Vector, String>>,
    /// Stand-in for the index when HNSW support is compiled out; becomes
    /// `Some(())` after `build_index` so `search` can still detect "built" state.
    #[cfg(not(feature = "vector-hnsw"))]
    index: Option<()>, // Placeholder when HNSW is not available
    /// Source of truth: id -> raw embedding; retained so the index can be rebuilt.
    embeddings: HashMap<String, Vec<f32>>,
    /// Optional helper that decides when batch operations should run on rayon.
    #[cfg(feature = "parallel-processing")]
    parallel_processor: Option<ParallelProcessor>,
}
62
63impl VectorIndex {
    /// Create a new vector index
    ///
    /// The index starts empty: no embeddings are stored and no search index
    /// has been built yet (`search` errors until `build_index` is called).
    pub fn new() -> Self {
        Self {
            index: None,
            embeddings: HashMap::new(),
            // Parallelism is opt-in via `with_parallel_processing`.
            #[cfg(feature = "parallel-processing")]
            parallel_processor: None,
        }
    }
73
    /// Create a new vector index with parallel processing support
    ///
    /// Batch operations (`batch_add_vectors`, `batch_search`,
    /// `compute_all_similarities`) consult `parallel_processor` to decide
    /// whether a workload is large enough to run on rayon.
    #[cfg(feature = "parallel-processing")]
    pub fn with_parallel_processing(parallel_processor: ParallelProcessor) -> Self {
        Self {
            index: None,
            embeddings: HashMap::new(),
            parallel_processor: Some(parallel_processor),
        }
    }
83
84    /// Add a vector to the index
85    pub fn add_vector(&mut self, id: String, embedding: Vec<f32>) -> Result<()> {
86        if embedding.is_empty() {
87            return Err(GraphRAGError::VectorSearch {
88                message: "Empty embedding vector".to_string(),
89            });
90        }
91
92        self.embeddings.insert(id, embedding);
93        Ok(())
94    }
95
96    /// Build the index from all added vectors
97    pub fn build_index(&mut self) -> Result<()> {
98        if self.embeddings.is_empty() {
99            return Err(GraphRAGError::VectorSearch {
100                message: "No embeddings to build index from".to_string(),
101            });
102        }
103
104        #[cfg(feature = "vector-hnsw")]
105        {
106            let points: Vec<Vector> = self
107                .embeddings
108                .values()
109                .map(|v| Vector::new(v.clone()))
110                .collect();
111
112            let values: Vec<String> = self.embeddings.keys().cloned().collect();
113
114            let builder = Builder::default();
115            let index = builder.build(points, values);
116
117            self.index = Some(index);
118        }
119
120        #[cfg(not(feature = "vector-hnsw"))]
121        {
122            println!(
123                "Warning: HNSW vector indexing not available. Install with --features vector-hnsw"
124            );
125            self.index = Some(());
126        }
127
128        Ok(())
129    }
130
131    /// Search for similar vectors
132    pub fn search(&self, query_embedding: &[f32], top_k: usize) -> Result<Vec<(String, f32)>> {
133        let _index = self
134            .index
135            .as_ref()
136            .ok_or_else(|| GraphRAGError::VectorSearch {
137                message: "Index not built. Call build_index() first.".to_string(),
138            })?;
139
140        #[cfg(feature = "vector-hnsw")]
141        {
142            let query_point = Vector::new(query_embedding.to_vec());
143            let mut search = Search::default();
144
145            let results = _index.search(&query_point, &mut search);
146
147            let mut scored_results = Vec::new();
148            for item in results.into_iter().take(top_k) {
149                let distance = item.distance;
150                // Convert distance to similarity using exponential decay for better score distribution
151                let similarity = (-distance).exp().clamp(0.0, 1.0);
152                scored_results.push((item.value.clone(), similarity));
153            }
154
155            Ok(scored_results)
156        }
157
158        #[cfg(not(feature = "vector-hnsw"))]
159        {
160            // Fallback to brute force similarity search
161            let query_vec = query_embedding;
162            let mut scored_results = Vec::new();
163
164            for (id, embedding) in &self.embeddings {
165                let similarity = self.cosine_similarity(query_vec, embedding);
166                scored_results.push((id.clone(), similarity));
167            }
168
169            // Sort by similarity (highest first) and take top_k
170            scored_results.sort_by(|a, b| b.1.partial_cmp(&a.1).unwrap());
171            scored_results.truncate(top_k);
172
173            Ok(scored_results)
174        }
175    }
176
177    /// Calculate cosine similarity between two vectors (fallback when HNSW is not available)
178    #[cfg(not(feature = "vector-hnsw"))]
179    fn cosine_similarity(&self, a: &[f32], b: &[f32]) -> f32 {
180        if a.len() != b.len() {
181            return 0.0;
182        }
183
184        let dot_product: f32 = a.iter().zip(b.iter()).map(|(x, y)| x * y).sum();
185        let norm_a: f32 = a.iter().map(|x| x * x).sum::<f32>().sqrt();
186        let norm_b: f32 = b.iter().map(|x| x * x).sum::<f32>().sqrt();
187
188        if norm_a == 0.0 || norm_b == 0.0 {
189            0.0
190        } else {
191            dot_product / (norm_a * norm_b)
192        }
193    }
194
    /// Get the number of vectors in the index
    ///
    /// Counts stored embeddings, whether or not `build_index` has run.
    pub fn len(&self) -> usize {
        self.embeddings.len()
    }

    /// Check if the index is empty
    pub fn is_empty(&self) -> bool {
        self.embeddings.is_empty()
    }

    /// Get embedding dimension (assuming all embeddings have the same dimension)
    ///
    /// Returns `None` when no vectors have been added. The dimension is read
    /// from an arbitrary stored embedding; mixed dimensions are not detected.
    pub fn dimension(&self) -> Option<usize> {
        self.embeddings.values().next().map(|v| v.len())
    }
209
210    /// Remove a vector from the index
211    pub fn remove_vector(&mut self, id: &str) -> Result<()> {
212        self.embeddings.remove(id);
213        // Note: instant-distance doesn't support removal, so we need to rebuild
214        if !self.embeddings.is_empty() {
215            self.build_index()?;
216        } else {
217            self.index = None;
218        }
219        Ok(())
220    }
221
222    /// Get all vector IDs
223    pub fn get_ids(&self) -> Vec<String> {
224        self.embeddings.keys().cloned().collect()
225    }
226
227    /// Check if a vector exists
228    pub fn contains(&self, id: &str) -> bool {
229        self.embeddings.contains_key(id)
230    }
231
232    /// Get embedding by ID
233    pub fn get_embedding(&self, id: &str) -> Option<&Vec<f32>> {
234        self.embeddings.get(id)
235    }
236
    /// Batch add multiple vectors in parallel with proper synchronization
    ///
    /// Delegates to the parallel path only when a processor was configured;
    /// otherwise inserts sequentially via `add_vector`.
    pub fn batch_add_vectors(&mut self, vectors: Vec<(String, Vec<f32>)>) -> Result<()> {
        #[cfg(feature = "parallel-processing")]
        // Clone the processor handle so `self` is free to be mutably borrowed
        // by the parallel helper.
        if let Some(processor) = self.parallel_processor.clone() {
            return self.batch_add_vectors_parallel(vectors, &processor);
        }

        // Sequential fallback
        for (id, embedding) in vectors {
            self.add_vector(id, embedding)?;
        }
        Ok(())
    }
250
251    /// Parallel batch vector addition with conflict detection and chunked processing
252    #[cfg(feature = "parallel-processing")]
253    fn batch_add_vectors_parallel(
254        &mut self,
255        vectors: Vec<(String, Vec<f32>)>,
256        processor: &ParallelProcessor,
257    ) -> Result<()> {
258        if !processor.should_use_parallel(vectors.len()) {
259            // Use sequential processing for small batches
260            for (id, embedding) in vectors {
261                self.add_vector(id, embedding)?;
262            }
263            return Ok(());
264        }
265
266        #[cfg(feature = "parallel-processing")]
267        {
268            use rayon::prelude::*;
269            use std::collections::HashMap;
270
271            // Pre-validate all vectors in parallel
272            let validation_results: std::result::Result<Vec<_>, crate::GraphRAGError> = vectors
273                .par_iter()
274                .map(|(id, embedding)| {
275                    if embedding.is_empty() {
276                        Err(crate::GraphRAGError::VectorSearch {
277                            message: format!("Empty embedding vector for ID: {id}"),
278                        })
279                    } else {
280                        Ok((id.clone(), embedding.clone()))
281                    }
282                })
283                .collect();
284
285            let validated_vectors = match validation_results {
286                Ok(vectors) => vectors,
287                Err(e) => {
288                    eprintln!("Vector validation failed: {e}");
289                    // Fall back to sequential processing with validation
290                    for (id, embedding) in vectors {
291                        self.add_vector(id, embedding)?;
292                    }
293                    return Ok(());
294                }
295            };
296
297            // Check for duplicate IDs and resolve conflicts
298            let mut unique_vectors = HashMap::new();
299            for (id, embedding) in validated_vectors {
300                if unique_vectors.contains_key(&id) {
301                    eprintln!("Warning: Duplicate vector ID '{id}' - using latest");
302                }
303                unique_vectors.insert(id, embedding);
304            }
305
306            // Convert to vector pairs for sequential insertion
307            let vector_pairs: Vec<_> = unique_vectors.into_iter().collect();
308
309            // Vector pairs are already validated and deduplicated
310
311            // Apply the validated vectors to the embeddings map sequentially
312            for (id, embedding) in vector_pairs {
313                self.embeddings.insert(id, embedding);
314            }
315
316            println!("Added {} vectors in parallel batch", vectors.len());
317        }
318
319        #[cfg(not(feature = "parallel-processing"))]
320        {
321            // Sequential fallback when parallel processing is not available
322            for (id, embedding) in vectors {
323                self.add_vector(id, embedding)?;
324            }
325        }
326
327        Ok(())
328    }
329
    /// Batch search for multiple queries in parallel
    ///
    /// Each query is answered by `search`; results keep the order of
    /// `queries`. Runs on rayon only when a processor is configured and deems
    /// the batch large enough, otherwise searches sequentially.
    ///
    /// # Errors
    /// Fails (short-circuiting on the first failing query) if the index has
    /// not been built.
    pub fn batch_search(
        &self,
        queries: &[Vec<f32>],
        top_k: usize,
    ) -> Result<Vec<Vec<(String, f32)>>> {
        #[cfg(feature = "parallel-processing")]
        {
            if let Some(processor) = &self.parallel_processor {
                if processor.should_use_parallel(queries.len()) {
                    use rayon::prelude::*;
                    // `collect` into Result stops at the first error.
                    return queries
                        .par_iter()
                        .map(|query| self.search(query, top_k))
                        .collect();
                }
            }
        }

        // Sequential fallback
        queries
            .iter()
            .map(|query| self.search(query, top_k))
            .collect()
    }
355
    /// Parallel similarity computation between all vectors with optimized chunking
    ///
    /// Returns cosine similarity for every unordered pair of stored vectors,
    /// keyed by `(id_a, id_b)`. Pairs with similarity <= 0.1 are omitted to
    /// bound memory use.
    pub fn compute_all_similarities(&self) -> HashMap<(String, String), f32> {
        #[cfg(feature = "parallel-processing")]
        if let Some(processor) = &self.parallel_processor {
            return self.compute_similarities_parallel(processor);
        }

        // Sequential fallback
        self.compute_similarities_sequential()
    }
366
367    /// Parallel similarity computation with work-stealing and memory optimization
368    #[cfg(feature = "parallel-processing")]
369    fn compute_similarities_parallel(
370        &self,
371        processor: &ParallelProcessor,
372    ) -> HashMap<(String, String), f32> {
373        let ids: Vec<String> = self.embeddings.keys().cloned().collect();
374        let total_pairs = (ids.len() * (ids.len() - 1)) / 2;
375
376        if !processor.should_use_parallel(total_pairs) {
377            return self.compute_similarities_sequential();
378        }
379
380        #[cfg(feature = "parallel-processing")]
381        {
382            use rayon::prelude::*;
383
384            // Pre-collect embeddings for efficient parallel access
385            let embedding_vec: Vec<(String, Vec<f32>)> = ids
386                .iter()
387                .filter_map(|id| {
388                    self.embeddings.get(id).map(|emb| (id.clone(), emb.clone()))
389                })
390                .collect();
391
392            if embedding_vec.len() < 2 {
393                return HashMap::new();
394            }
395
396            // Generate pairs for parallel processing
397            let mut pairs = Vec::new();
398            for i in 0..embedding_vec.len() {
399                for j in (i + 1)..embedding_vec.len() {
400                    pairs.push((i, j));
401                }
402            }
403
404            // Parallel similarity computation with chunked processing
405            let chunk_size = processor.config().chunk_batch_size.min(pairs.len());
406            let similarities: HashMap<(String, String), f32> = pairs
407                .par_chunks(chunk_size)
408                .map(|chunk| {
409                    let mut local_similarities = HashMap::new();
410
411                    for &(i, j) in chunk {
412                        let (first_id, first_emb) = &embedding_vec[i];
413                        let (second_id, second_emb) = &embedding_vec[j];
414
415                        let similarity = VectorUtils::cosine_similarity(first_emb, second_emb);
416
417                        // Only store similarities above a threshold to save memory
418                        if similarity > 0.1 {
419                            local_similarities.insert((first_id.clone(), second_id.clone()), similarity);
420                        }
421                    }
422
423                    local_similarities
424                })
425                .reduce(
426                    HashMap::new,
427                    |mut acc, chunk_similarities| {
428                        acc.extend(chunk_similarities);
429                        acc
430                    }
431                );
432
433            println!(
434                "Computed {} similarities from {} vectors in parallel",
435                similarities.len(),
436                embedding_vec.len()
437            );
438
439            similarities
440        }
441
442        #[cfg(not(feature = "parallel-processing"))]
443        {
444            self.compute_similarities_sequential()
445        }
446    }
447
448    /// Sequential similarity computation (fallback)
449    fn compute_similarities_sequential(&self) -> HashMap<(String, String), f32> {
450        let ids: Vec<String> = self.embeddings.keys().cloned().collect();
451        let mut similarities = HashMap::new();
452
453        for (i, id1) in ids.iter().enumerate() {
454            if let Some(emb1) = self.embeddings.get(id1) {
455                for id2 in ids.iter().skip(i + 1) {
456                    if let Some(emb2) = self.embeddings.get(id2) {
457                        let sim = VectorUtils::cosine_similarity(emb1, emb2);
458                        // Only store similarities above a threshold to save memory
459                        if sim > 0.1 {
460                            similarities.insert((id1.clone(), id2.clone()), sim);
461                        }
462                    }
463                }
464            }
465        }
466
467        similarities
468    }
469
470    /// Find vectors within a similarity threshold
471    pub fn find_similar(
472        &self,
473        query_embedding: &[f32],
474        threshold: f32,
475    ) -> Result<Vec<(String, f32)>> {
476        let results = self.search(query_embedding, self.len())?;
477        Ok(results
478            .into_iter()
479            .filter(|(_, similarity)| *similarity >= threshold)
480            .collect())
481    }
482
483    /// Calculate statistics about the index
484    pub fn statistics(&self) -> VectorIndexStatistics {
485        let dimension = self.dimension().unwrap_or(0);
486        let vector_count = self.len();
487
488        // Calculate basic statistics
489        let mut min_norm = f32::INFINITY;
490        let mut max_norm: f32 = 0.0;
491        let mut sum_norm = 0.0;
492
493        for embedding in self.embeddings.values() {
494            let norm = embedding.iter().map(|x| x * x).sum::<f32>().sqrt();
495            min_norm = min_norm.min(norm);
496            max_norm = max_norm.max(norm);
497            sum_norm += norm;
498        }
499
500        let avg_norm = if vector_count > 0 {
501            sum_norm / vector_count as f32
502        } else {
503            0.0
504        };
505
506        VectorIndexStatistics {
507            vector_count,
508            dimension,
509            min_norm,
510            max_norm,
511            avg_norm,
512            index_built: self.index.is_some(),
513        }
514    }
515}
516
impl Default for VectorIndex {
    /// Equivalent to [`VectorIndex::new`]: an empty, unbuilt index.
    fn default() -> Self {
        Self::new()
    }
}
522
/// Statistics about the vector index
///
/// A plain snapshot of index state. All fields are cheap scalars, so the
/// type derives `Copy` (and `PartialEq` for easy comparison in tests).
#[derive(Debug, Clone, Copy, PartialEq)]
pub struct VectorIndexStatistics {
    /// Total number of vectors in the index
    pub vector_count: usize,
    /// Dimensionality of vectors
    pub dimension: usize,
    /// Minimum vector norm
    pub min_norm: f32,
    /// Maximum vector norm
    pub max_norm: f32,
    /// Average vector norm
    pub avg_norm: f32,
    /// Whether the index has been built
    pub index_built: bool,
}
539
540impl VectorIndexStatistics {
541    /// Print statistics
542    pub fn print(&self) {
543        println!("Vector Index Statistics:");
544        println!("  Vector count: {}", self.vector_count);
545        println!("  Dimension: {}", self.dimension);
546        println!("  Index built: {}", self.index_built);
547        if self.vector_count > 0 {
548            println!("  Vector norms:");
549            println!("    Min: {:.4}", self.min_norm);
550            println!("    Max: {:.4}", self.max_norm);
551            println!("    Average: {:.4}", self.avg_norm);
552        }
553    }
554}
555
/// Utility functions for vector operations
///
/// Stateless namespace: all functionality lives in associated functions on
/// this zero-sized type.
pub struct VectorUtils;
558
/// Simple embedding generator using hash-based approach for consistent vectors
pub struct EmbeddingGenerator {
    /// Length of every produced embedding.
    dimension: usize,
    /// Cache of per-word unit vectors so repeated words are hashed only once.
    word_vectors: HashMap<String, Vec<f32>>,
}
564
565impl EmbeddingGenerator {
    /// Create a new embedding generator with specified dimension
    ///
    /// Starts with an empty word-vector cache.
    pub fn new(dimension: usize) -> Self {
        Self {
            dimension,
            word_vectors: HashMap::new(),
        }
    }
573
    /// Create a new embedding generator with parallel processing support
    ///
    /// NOTE(review): the processor is currently unused — generation is
    /// sequential regardless; the parameter is kept for API symmetry with
    /// `VectorIndex::with_parallel_processing`.
    #[cfg(feature = "parallel-processing")]
    pub fn with_parallel_processing(
        dimension: usize,
        _parallel_processor: ParallelProcessor,
    ) -> Self {
        Self {
            dimension,
            word_vectors: HashMap::new(),
        }
    }
585
586    /// Generate embedding for a text string
587    pub fn generate_embedding(&mut self, text: &str) -> Vec<f32> {
588        let words: Vec<&str> = text.split_whitespace().collect();
589        if words.is_empty() {
590            return vec![0.0; self.dimension];
591        }
592
593        // Get or create word vectors
594        let mut word_embeddings = Vec::new();
595        for word in &words {
596            let normalized_word = word.to_lowercase();
597            if !self.word_vectors.contains_key(&normalized_word) {
598                self.word_vectors.insert(
599                    normalized_word.clone(),
600                    self.generate_word_vector(&normalized_word),
601                );
602            }
603            word_embeddings.push(self.word_vectors[&normalized_word].clone());
604        }
605
606        // Average the word vectors
607        let mut result = vec![0.0; self.dimension];
608        for word_vec in word_embeddings {
609            for (i, value) in word_vec.iter().enumerate() {
610                result[i] += value;
611            }
612        }
613
614        // Normalize by number of words
615        let word_count = words.len() as f32;
616        for value in &mut result {
617            *value /= word_count;
618        }
619
620        // Normalize to unit vector
621        VectorUtils::normalize(&mut result);
622        result
623    }
624
625    /// Generate a consistent vector for a word using hash-based approach
626    fn generate_word_vector(&self, word: &str) -> Vec<f32> {
627        let mut vector = Vec::with_capacity(self.dimension);
628
629        // Use multiple hash seeds for better distribution
630        for i in 0..self.dimension {
631            let mut hasher = DefaultHasher::new();
632            word.hash(&mut hasher);
633            i.hash(&mut hasher);
634
635            let hash = hasher.finish();
636            // Convert hash to float in range [-1, 1]
637            let value = ((hash % 2000) as f32 - 1000.0) / 1000.0;
638            vector.push(value);
639        }
640
641        // Normalize to unit vector for better similarity properties
642        VectorUtils::normalize(&mut vector);
643        vector
644    }
645
646    /// Generate embeddings for multiple texts in batch with parallel processing
647    pub fn batch_generate(&mut self, texts: &[&str]) -> Vec<Vec<f32>> {
648        // Use sequential approach to avoid borrowing issues
649        let mut results = Vec::with_capacity(texts.len());
650        for text in texts {
651            results.push(self.generate_embedding(text));
652        }
653        results
654    }
655
656    /// Parallel batch generation with chunking for very large datasets
657    pub fn batch_generate_chunked(&mut self, texts: &[&str], chunk_size: usize) -> Vec<Vec<f32>> {
658        if texts.len() <= chunk_size {
659            return self.batch_generate(texts);
660        }
661
662        #[cfg(feature = "parallel-processing")]
663        {
664            use rayon::prelude::*;
665
666            // Process in chunks to manage memory usage
667            let results: Vec<Vec<f32>> = texts
668                .par_chunks(chunk_size)
669                .map(|chunk| {
670                    // Each chunk is processed with its own generator state
671                    let mut local_generator = EmbeddingGenerator::new(self.dimension);
672                    local_generator.word_vectors = self.word_vectors.clone(); // Share cached words
673
674                    chunk.iter().map(|&text| {
675                        local_generator.generate_embedding(text)
676                    }).collect::<Vec<_>>()
677                })
678                .flatten()
679                .collect();
680
681            // Update the main generator's word cache with new words from parallel processing
682            // Note: This is a simplified approach - in a more sophisticated implementation,
683            // we would merge the word caches from all parallel workers
684
685            println!(
686                "Generated {} embeddings in parallel chunks of size {}",
687                texts.len(),
688                chunk_size
689            );
690
691            results
692        }
693
694        #[cfg(not(feature = "parallel-processing"))]
695        {
696            // Sequential chunked processing when parallel is not available
697            let mut results = Vec::with_capacity(texts.len());
698
699            for chunk in texts.chunks(chunk_size) {
700                for &text in chunk {
701                    results.push(self.generate_embedding(text));
702                }
703            }
704
705            results
706        }
707    }
708
    /// Get the embedding dimension
    pub fn dimension(&self) -> usize {
        self.dimension
    }

    /// Get the number of cached word vectors
    pub fn cached_words(&self) -> usize {
        self.word_vectors.len()
    }

    /// Clear the word vector cache
    ///
    /// Subsequent embeddings re-derive word vectors (same values, since
    /// generation is deterministic).
    pub fn clear_cache(&mut self) {
        self.word_vectors.clear();
    }
723}
724
impl Default for EmbeddingGenerator {
    /// 128-dimensional generator with an empty word cache.
    fn default() -> Self {
        Self::new(128) // Default to 128-dimensional embeddings
    }
}
730
731impl VectorUtils {
732    /// Calculate cosine similarity between two vectors
733    pub fn cosine_similarity(a: &[f32], b: &[f32]) -> f32 {
734        if a.len() != b.len() {
735            return 0.0;
736        }
737
738        let dot_product: f32 = a.iter().zip(b.iter()).map(|(x, y)| x * y).sum();
739        let norm_a: f32 = a.iter().map(|x| x * x).sum::<f32>().sqrt();
740        let norm_b: f32 = b.iter().map(|x| x * x).sum::<f32>().sqrt();
741
742        if norm_a == 0.0 || norm_b == 0.0 {
743            0.0
744        } else {
745            dot_product / (norm_a * norm_b)
746        }
747    }
748
749    /// Calculate Euclidean distance between two vectors
750    pub fn euclidean_distance(a: &[f32], b: &[f32]) -> f32 {
751        if a.len() != b.len() {
752            return f32::INFINITY;
753        }
754
755        a.iter()
756            .zip(b.iter())
757            .map(|(x, y)| (x - y).powi(2))
758            .sum::<f32>()
759            .sqrt()
760    }
761
762    /// Normalize a vector to unit length
763    pub fn normalize(vector: &mut [f32]) {
764        let norm = vector.iter().map(|x| x * x).sum::<f32>().sqrt();
765        if norm > 0.0 {
766            for x in vector {
767                *x /= norm;
768            }
769        }
770    }
771
772    /// Generate a random vector (for testing)
773    pub fn random_vector(dimension: usize) -> Vec<f32> {
774        use std::collections::hash_map::DefaultHasher;
775        use std::hash::{Hash, Hasher};
776
777        let mut vector = Vec::with_capacity(dimension);
778        let mut hasher = DefaultHasher::new();
779
780        for i in 0..dimension {
781            i.hash(&mut hasher);
782            let hash = hasher.finish();
783            let value = ((hash % 1000) as f32 - 500.0) / 1000.0; // Range [-0.5, 0.5]
784            vector.push(value);
785        }
786
787        vector
788    }
789
790    /// Calculate the centroid of multiple vectors
791    pub fn centroid(vectors: &[Vec<f32>]) -> Option<Vec<f32>> {
792        if vectors.is_empty() {
793            return None;
794        }
795
796        let dimension = vectors[0].len();
797        if !vectors.iter().all(|v| v.len() == dimension) {
798            return None; // All vectors must have the same dimension
799        }
800
801        let mut centroid = vec![0.0; dimension];
802        for vector in vectors {
803            for (i, &value) in vector.iter().enumerate() {
804                centroid[i] += value;
805            }
806        }
807
808        let count = vectors.len() as f32;
809        for value in &mut centroid {
810            *value /= count;
811        }
812
813        Some(centroid)
814    }
815}
816
#[cfg(test)]
mod tests {
    use super::*;

    // Basic lifecycle: empty index, then one insert updates len/dimension.
    #[test]
    fn test_vector_index_creation() {
        let mut index = VectorIndex::new();
        assert!(index.is_empty());

        let embedding = vec![0.1, 0.2, 0.3];
        index.add_vector("test".to_string(), embedding).unwrap();

        assert!(!index.is_empty());
        assert_eq!(index.len(), 1);
        assert_eq!(index.dimension(), Some(3));
    }

    // End-to-end search: the nearest stored vector must rank first.
    #[test]
    fn test_vector_search() {
        let mut index = VectorIndex::new();

        // Add some test vectors
        index
            .add_vector("doc1".to_string(), vec![1.0, 0.0, 0.0])
            .unwrap();
        index
            .add_vector("doc2".to_string(), vec![0.0, 1.0, 0.0])
            .unwrap();
        index
            .add_vector("doc3".to_string(), vec![0.8, 0.2, 0.0])
            .unwrap();

        index.build_index().unwrap();

        // Search for similar vectors
        let query = vec![1.0, 0.0, 0.0];
        let results = index.search(&query, 2).unwrap();

        assert!(!results.is_empty());
        assert!(results.len() <= 2);

        // First result should be most similar
        assert_eq!(results[0].0, "doc1");
    }

    // Identical vectors -> similarity 1; orthogonal vectors -> 0.
    #[test]
    fn test_cosine_similarity() {
        let vec1 = vec![1.0, 0.0, 0.0];
        let vec2 = vec![1.0, 0.0, 0.0];
        let vec3 = vec![0.0, 1.0, 0.0];

        assert!((VectorUtils::cosine_similarity(&vec1, &vec2) - 1.0).abs() < 0.001);
        assert!((VectorUtils::cosine_similarity(&vec1, &vec3) - 0.0).abs() < 0.001);
    }

    // A 3-4-5 triangle normalizes to unit length.
    #[test]
    fn test_vector_normalization() {
        let mut vector = vec![3.0, 4.0];
        VectorUtils::normalize(&mut vector);

        let norm = vector.iter().map(|x| x * x).sum::<f32>().sqrt();
        assert!((norm - 1.0).abs() < 0.001);
    }

    // Component-wise mean of three 2-D vectors.
    #[test]
    fn test_centroid_calculation() {
        let vectors = vec![vec![1.0, 0.0], vec![0.0, 1.0], vec![1.0, 1.0]];

        let centroid = VectorUtils::centroid(&vectors).unwrap();
        assert!((centroid[0] - 2.0 / 3.0).abs() < 0.001);
        assert!((centroid[1] - 2.0 / 3.0).abs() < 0.001);
    }

    // Determinism, dimensionality, and unit-norm of generated embeddings.
    #[test]
    fn test_embedding_generator() {
        let mut generator = EmbeddingGenerator::new(64);

        let text1 = "hello world";
        let text2 = "hello world";
        let text3 = "goodbye world";

        let embedding1 = generator.generate_embedding(text1);
        let embedding2 = generator.generate_embedding(text2);
        let embedding3 = generator.generate_embedding(text3);

        // Same text should produce identical embeddings
        assert_eq!(embedding1, embedding2);

        // Different text should produce different embeddings
        assert_ne!(embedding1, embedding3);

        // Check dimension
        assert_eq!(embedding1.len(), 64);

        // Check that embeddings are normalized
        let norm1 = embedding1.iter().map(|x| x * x).sum::<f32>().sqrt();
        assert!((norm1 - 1.0).abs() < 0.001);
    }

    // Batch output preserves count, dimension, and distinctness per text.
    #[test]
    fn test_batch_embedding_generation() {
        let mut generator = EmbeddingGenerator::new(32);

        let texts = vec!["first text", "second text", "third text"];
        let embeddings = generator.batch_generate(&texts);

        assert_eq!(embeddings.len(), 3);
        assert!(embeddings.iter().all(|e| e.len() == 32));

        // Each embedding should be different
        assert_ne!(embeddings[0], embeddings[1]);
        assert_ne!(embeddings[1], embeddings[2]);
    }

    // Word-bag embeddings: shared words score higher than disjoint words.
    #[test]
    fn test_embedding_similarity() {
        let mut generator = EmbeddingGenerator::new(64);

        let similar1 = generator.generate_embedding("machine learning artificial intelligence");
        let similar2 = generator.generate_embedding("artificial intelligence machine learning");
        let different = generator.generate_embedding("cooking recipes kitchen");

        let sim1 = VectorUtils::cosine_similarity(&similar1, &similar2);
        let sim2 = VectorUtils::cosine_similarity(&similar1, &different);

        // Similar content should have higher similarity
        assert!(sim1 > sim2);
    }
}
945}