oxirs_vec/
lib.rs

//! # OxiRS Vector Search
//!
//! [![Version](https://img.shields.io/badge/version-0.1.0--alpha.2-orange)](https://github.com/cool-japan/oxirs/releases)
//! [![docs.rs](https://docs.rs/oxirs-vec/badge.svg)](https://docs.rs/oxirs-vec)
//!
//! **Status**: Alpha Release (v0.1.0-alpha.2)
//! ⚠️ APIs may change. Not recommended for production use.
//!
//! Vector index abstractions for semantic similarity and AI-augmented SPARQL querying.
//!
//! This crate provides comprehensive vector search capabilities for knowledge graphs,
//! enabling semantic similarity search, AI-augmented SPARQL queries, and hybrid
//! symbolic-vector operations.
//!
//! ## Features
//!
//! - **Multi-algorithm embeddings**: TF-IDF, sentence transformers, custom models
//! - **Advanced indexing**: HNSW, flat, quantized, and multi-index support
//! - **Rich similarity metrics**: Cosine, Euclidean, Pearson, Jaccard, and more
//! - **SPARQL integration**: `vec:similar` service functions and hybrid queries
//! - **Performance optimization**: Caching, batching, and parallel processing
//!
//! ## Quick Start
//!
//! ```rust,no_run
//! use oxirs_vec::{VectorStore, embeddings::EmbeddingStrategy};
//!
//! # fn main() -> anyhow::Result<()> {
//! // Create a vector store with sentence-transformer embeddings
//! let mut store = VectorStore::with_embedding_strategy(
//!     EmbeddingStrategy::SentenceTransformer
//! )?;
//!
//! // Index some content
//! store.index_resource("http://example.org/doc1".to_string(), "This is a document about AI")?;
//! store.index_resource("http://example.org/doc2".to_string(), "Machine learning tutorial")?;
//!
//! // Search for similar content
//! let results = store.similarity_search("artificial intelligence", 5)?;
//! # Ok(())
//! # }
//! ```

#![allow(dead_code)]

use anyhow::Result;
use std::collections::HashMap;

46pub mod adaptive_compression;
47pub mod adaptive_intelligent_caching;
48pub mod advanced_analytics;
49pub mod advanced_benchmarking;
50pub mod advanced_caching;
51pub mod advanced_metrics;
52pub mod advanced_result_merging;
53pub mod automl_optimization;
54pub mod benchmarking;
55pub mod cache_friendly_index;
56pub mod clustering;
57pub mod compression;
58#[cfg(feature = "content-processing")]
59pub mod content_processing;
60pub mod cross_language_alignment;
61pub mod cross_modal_embeddings;
62pub mod distributed_vector_search;
63pub mod embedding_pipeline;
64pub mod embeddings;
65pub mod enhanced_performance_monitoring;
66pub mod faiss_compatibility;
67pub mod faiss_gpu_integration;
68pub mod faiss_integration;
69pub mod faiss_migration_tools;
70pub mod faiss_native_integration;
71pub mod federated_search;
72pub mod gnn_embeddings;
73pub mod gpu;
74pub mod graph_aware_search;
75pub mod graph_indices;
76pub mod hierarchical_similarity;
77pub mod hnsw;
78pub mod huggingface;
79pub mod index;
80pub mod ivf;
81pub mod joint_embedding_spaces;
82pub mod kg_embeddings;
83pub mod lsh;
84pub mod mmap_advanced;
85pub mod mmap_index;
86pub mod opq;
87pub mod oxirs_arq_integration;
88pub mod performance_insights;
89pub mod pq;
90pub mod pytorch;
91pub mod quantum_search;
92pub mod random_utils;
93pub mod rdf_content_enhancement;
94pub mod rdf_integration;
95pub mod real_time_analytics;
96pub mod real_time_embedding_pipeline;
97pub mod real_time_updates;
98pub mod result_fusion;
99pub mod similarity;
100pub mod sparql_integration;
101pub mod sparql_service_endpoint;
102pub mod sparse;
103pub mod storage_optimizations;
104pub mod store_integration;
105pub mod structured_vectors;
106pub mod tensorflow;
107pub mod tree_indices;
108pub mod word2vec;
109
110// Python bindings module
111#[cfg(feature = "python")]
112pub mod python_bindings;
113
114// Re-export commonly used types
115pub use adaptive_compression::{
116    AdaptiveCompressor, CompressionMetrics, CompressionPriorities, MultiLevelCompression,
117    VectorStats,
118};
119pub use adaptive_intelligent_caching::{
120    AccessPatternAnalyzer, AdaptiveIntelligentCache, CacheConfiguration, CacheOptimizer,
121    CachePerformanceMetrics, CacheTier, MLModels, PredictivePrefetcher,
122};
123pub use advanced_analytics::{
124    AnomalyDetection, AnomalyDetector, AnomalyType, ImplementationEffort,
125    OptimizationRecommendation, PerformanceTrends, Priority, QualityAspect, QualityRecommendation,
126    QueryAnalytics, QueryAnomaly, RecommendationType, VectorAnalyticsEngine,
127    VectorDistributionAnalysis, VectorQualityAssessment,
128};
129pub use advanced_benchmarking::{
130    AdvancedBenchmarkConfig, AdvancedBenchmarkResult, AdvancedBenchmarkSuite, AlgorithmParameters,
131    BenchmarkAlgorithm, BuildTimeMetrics, CacheMetrics, DatasetQualityMetrics, DatasetStatistics,
132    DistanceStatistics, EnhancedBenchmarkDataset, HyperparameterTuner, IndexSizeMetrics,
133    LatencyMetrics, MemoryMetrics, ObjectiveFunction, OptimizationStrategy,
134    ParallelBenchmarkConfig, ParameterSpace, ParameterType, ParameterValue, PerformanceMetrics,
135    PerformanceProfiler, QualityDegradation, QualityMetrics, ScalabilityMetrics,
136    StatisticalAnalyzer, StatisticalMetrics, ThroughputMetrics,
137};
138pub use advanced_caching::{
139    BackgroundCacheWorker, CacheAnalysisReport, CacheAnalyzer, CacheConfig, CacheEntry,
140    CacheInvalidator, CacheKey, CacheStats, CacheWarmer, EvictionPolicy, InvalidationStats,
141    MultiLevelCache, MultiLevelCacheStats,
142};
143pub use advanced_result_merging::{
144    AdvancedResultMerger, ConfidenceInterval, DiversityConfig, DiversityMetric, FusionStatistics,
145    MergedResult, RankFusionAlgorithm, RankingFactor, ResultExplanation, ResultMergingConfig,
146    ResultMetadata, ScoreCombinationStrategy, ScoreNormalizationMethod, ScoredResult,
147    SourceContribution, SourceResult, SourceType,
148};
149pub use automl_optimization::{
150    AutoMLConfig, AutoMLOptimizer, AutoMLResults, AutoMLStatistics, IndexConfiguration,
151    IndexParameterSpace, OptimizationMetric, OptimizationTrial, ResourceConstraints, SearchSpace,
152    TrialResult,
153};
154pub use benchmarking::{
155    BenchmarkConfig, BenchmarkDataset, BenchmarkOutputFormat, BenchmarkResult, BenchmarkRunner,
156    BenchmarkSuite, BenchmarkTestCase, MemoryMetrics as BenchmarkMemoryMetrics,
157    PerformanceMetrics as BenchmarkPerformanceMetrics, QualityMetrics as BenchmarkQualityMetrics,
158    ScalabilityMetrics as BenchmarkScalabilityMetrics, SystemInfo,
159};
160pub use cache_friendly_index::{CacheFriendlyVectorIndex, IndexConfig as CacheFriendlyIndexConfig};
161pub use compression::{create_compressor, CompressionMethod, VectorCompressor};
162#[cfg(feature = "content-processing")]
163pub use content_processing::{
164    ChunkType, ChunkingStrategy, ContentChunk, ContentExtractionConfig, ContentLocation,
165    ContentProcessor, DocumentFormat, DocumentStructure, ExtractedContent, ExtractedImage,
166    ExtractedLink, ExtractedTable, FormatHandler, Heading, ProcessingStats, TocEntry,
167};
168pub use cross_modal_embeddings::{
169    AttentionMechanism, AudioData, AudioEncoder, CrossModalConfig, CrossModalEncoder, FusionLayer,
170    FusionStrategy, GraphData, GraphEncoder, ImageData, ImageEncoder, Modality, ModalityData,
171    MultiModalContent, TextEncoder, VideoData, VideoEncoder,
172};
173pub use distributed_vector_search::{
174    ConsistencyLevel, DistributedClusterStats, DistributedNodeConfig, DistributedQuery,
175    DistributedSearchResponse, DistributedVectorSearch, LoadBalancingAlgorithm, NodeHealthStatus,
176    PartitioningStrategy, QueryExecutionStrategy,
177};
178pub use embedding_pipeline::{
179    DimensionalityReduction, EmbeddingPipeline, NormalizationConfig, PostprocessingPipeline,
180    PreprocessingPipeline, TokenizerConfig, VectorNormalization,
181};
182pub use embeddings::{
183    EmbeddableContent, EmbeddingConfig, EmbeddingManager, EmbeddingStrategy, ModelDetails,
184    OpenAIConfig, OpenAIEmbeddingGenerator, SentenceTransformerGenerator, TransformerModelType,
185};
186pub use enhanced_performance_monitoring::{
187    Alert, AlertManager, AlertSeverity, AlertThresholds, AlertType, AnalyticsEngine,
188    AnalyticsReport, DashboardData, EnhancedPerformanceMonitor, ExportConfig, ExportDestination,
189    ExportFormat, LatencyDistribution, MonitoringConfig as EnhancedMonitoringConfig,
190    QualityMetrics as EnhancedQualityMetrics, QualityMetricsCollector, QualityStatistics,
191    QueryInfo, QueryMetricsCollector, QueryStatistics, QueryType, Recommendation,
192    RecommendationCategory, RecommendationPriority, SystemMetrics, SystemMetricsCollector,
193    SystemStatistics, TrendData, TrendDirection,
194};
195pub use faiss_compatibility::{
196    CompressionLevel, ConversionMetrics, ConversionResult, FaissCompatibility, FaissExportConfig,
197    FaissImportConfig, FaissIndexMetadata, FaissIndexType, FaissMetricType, FaissParameter,
198    SimpleVectorIndex,
199};
200pub use federated_search::{
201    AuthenticationConfig, FederatedSearchConfig, FederatedVectorSearch, FederationEndpoint,
202    PrivacyEngine, PrivacyMode, SchemaCompatibility, TrustManager,
203};
204pub use gnn_embeddings::{AggregatorType, GraphSAGE, GCN};
205pub use gpu::{
206    create_default_accelerator, create_memory_optimized_accelerator,
207    create_performance_accelerator, is_gpu_available, GpuAccelerator, GpuBuffer, GpuConfig,
208    GpuDevice, GpuExecutionConfig,
209};
210pub use graph_indices::{
211    DelaunayGraph, GraphIndex, GraphIndexConfig, GraphType, NSWGraph, ONNGGraph, PANNGGraph,
212    RNGGraph,
213};
214pub use hierarchical_similarity::{
215    ConceptHierarchy, HierarchicalSimilarity, HierarchicalSimilarityConfig,
216    HierarchicalSimilarityResult, HierarchicalSimilarityStats, SimilarityContext,
217    SimilarityExplanation, SimilarityTaskType,
218};
219pub use hnsw::{HnswConfig, HnswIndex};
220pub use index::{AdvancedVectorIndex, DistanceMetric, IndexConfig, IndexType, SearchResult};
221pub use ivf::{IvfConfig, IvfIndex, IvfStats, QuantizationStrategy};
222pub use joint_embedding_spaces::{
223    ActivationFunction, AlignmentPair, CLIPAligner, ContrastiveOptimizer, CrossModalAttention,
224    CurriculumLearning, DataAugmentation, DifficultySchedule, DomainAdapter, DomainStatistics,
225    JointEmbeddingConfig, JointEmbeddingSpace, LearningRateSchedule, LinearProjector,
226    PacingFunction, ScheduleType, TemperatureScheduler, TrainingStatistics,
227};
228pub use kg_embeddings::{
229    ComplEx, KGEmbedding, KGEmbeddingConfig, KGEmbeddingModel as KGModel, KGEmbeddingModelType,
230    RotatE, TransE, Triple,
231};
232pub use lsh::{LshConfig, LshFamily, LshIndex, LshStats};
233pub use mmap_index::{MemoryMappedIndexStats, MemoryMappedVectorIndex};
234pub use performance_insights::{
235    AlertingSystem, OptimizationRecommendations, PerformanceInsightsAnalyzer,
236    PerformanceTrends as InsightsPerformanceTrends, QueryComplexity,
237    QueryStatistics as InsightsQueryStatistics, ReportFormat, VectorStatistics,
238};
239pub use pq::{PQConfig, PQIndex, PQStats};
240pub use pytorch::{
241    ArchitectureType, CompileMode, DeviceManager, PyTorchConfig, PyTorchDevice, PyTorchEmbedder,
242    PyTorchModelManager, PyTorchModelMetadata, PyTorchTokenizer,
243};
244pub use quantum_search::{
245    QuantumSearchConfig, QuantumSearchResult, QuantumSearchStatistics, QuantumState,
246    QuantumVectorSearch,
247};
248pub use rdf_content_enhancement::{
249    ComponentWeights, MultiLanguageProcessor, PathConstraint, PathDirection, PropertyAggregator,
250    PropertyPath, RdfContentConfig, RdfContentProcessor, RdfContext, RdfEntity, RdfValue,
251    TemporalInfo,
252};
253pub use rdf_integration::{
254    RdfIntegrationStats, RdfTermMapping, RdfTermMetadata, RdfTermType, RdfVectorConfig,
255    RdfVectorIntegration, RdfVectorSearchResult, SearchMetadata,
256};
257pub use real_time_analytics::{
258    AlertSeverity as AnalyticsAlertSeverity, AlertType as AnalyticsAlertType, AnalyticsConfig,
259    AnalyticsEvent, AnalyticsReport as RealTimeAnalyticsReport,
260    DashboardData as RealTimeDashboardData, ExportFormat as AnalyticsExportFormat,
261    MetricsCollector, PerformanceMonitor, QueryMetrics, SystemMetrics as AnalyticsSystemMetrics,
262    VectorAnalyticsEngine as RealTimeVectorAnalyticsEngine,
263};
264pub use real_time_embedding_pipeline::{
265    AlertThresholds as PipelineAlertThresholds, AutoScalingConfig, CompressionConfig, ContentItem,
266    MonitoringConfig as PipelineMonitoringConfig, PipelineConfig as RealTimeEmbeddingConfig,
267    PipelineStatistics as PipelineStats, ProcessingPriority, ProcessingResult, ProcessingStatus,
268    RealTimeEmbeddingPipeline, VersioningStrategy,
269};
270pub use real_time_updates::{
271    BatchProcessor, RealTimeConfig, RealTimeVectorSearch, RealTimeVectorUpdater, UpdateBatch,
272    UpdateOperation, UpdatePriority, UpdateStats,
273};
274pub use result_fusion::{
275    FusedResults, FusionAlgorithm, FusionConfig, FusionQualityMetrics, FusionStats,
276    ResultFusionEngine, ScoreNormalizationStrategy, SourceResults, VectorSearchResult,
277};
278pub use similarity::{AdaptiveSimilarity, SemanticSimilarity, SimilarityConfig, SimilarityMetric};
279pub use sparql_integration::{
280    CrossLanguageProcessor, FederatedQueryResult, QueryExecutor, SparqlVectorFunctions,
281    SparqlVectorService, VectorOperation, VectorQuery, VectorQueryResult, VectorServiceArg,
282    VectorServiceConfig, VectorServiceResult,
283};
284pub use sparql_service_endpoint::{
285    AuthenticationInfo, AuthenticationType, CustomFunctionRegistry, FederatedOperation,
286    FederatedSearchResult, FederatedServiceEndpoint, FederatedVectorQuery, FunctionMetadata,
287    LoadBalancer, ParameterInfo, ParameterType as ServiceParameterType, PartialSearchResult,
288    QueryScope, ReturnType, ServiceCapability, ServiceEndpointManager, ServiceType,
289};
290pub use sparse::{COOMatrix, CSRMatrix, SparseVector};
291pub use storage_optimizations::{
292    CompressionType, MmapVectorFile, StorageConfig, StorageUtils, VectorBlock, VectorFileHeader,
293    VectorReader, VectorWriter,
294};
295pub use structured_vectors::{
296    ConfidenceScoredVector, HierarchicalVector, NamedDimensionVector, TemporalVector,
297    WeightedDimensionVector,
298};
299pub use tensorflow::{
300    OptimizationLevel, PreprocessingPipeline as TensorFlowPreprocessingPipeline, ServerConfig,
301    SessionConfig, TensorDataType, TensorFlowConfig, TensorFlowDevice, TensorFlowEmbedder,
302    TensorFlowModelInfo, TensorFlowModelServer, TensorSpec,
303};
304pub use tree_indices::{
305    BallTree, CoverTree, KdTree, RandomProjectionTree, TreeIndex, TreeIndexConfig, TreeType, VpTree,
306};
307pub use word2vec::{
308    AggregationMethod, OovStrategy, Word2VecConfig, Word2VecEmbeddingGenerator, Word2VecFormat,
309};
310
311/// Vector identifier type
312pub type VectorId = String;
313
314/// Batch search result type
315pub type BatchSearchResult = Vec<Result<Vec<(String, f32)>>>;
316
317/// Trait for vector store implementations
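///
/// # Example
///
/// A minimal sketch of driving an implementation through the trait; `VectorStore` is the
/// default implementation provided by this crate, and the vector values are illustrative.
///
/// ```
/// use oxirs_vec::{Vector, VectorStore, VectorStoreTrait};
///
/// # fn main() -> anyhow::Result<()> {
/// let mut store = VectorStore::new();
/// let _id = store.add_vector(Vector::new(vec![0.1, 0.2, 0.3]))?;
///
/// let hits = store.search_similar(&Vector::new(vec![0.1, 0.2, 0.3]), 5)?;
/// assert_eq!(hits.len(), 1);
/// # Ok(())
/// # }
/// ```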
318pub trait VectorStoreTrait: Send + Sync {
319    /// Insert a vector with metadata
320    fn insert_vector(&mut self, id: VectorId, vector: Vector) -> Result<()>;
321
322    /// Add a vector and return its ID
323    fn add_vector(&mut self, vector: Vector) -> Result<VectorId>;
324
325    /// Get a vector by its ID
326    fn get_vector(&self, id: &VectorId) -> Result<Option<Vector>>;
327
328    /// Get all vector IDs
329    fn get_all_vector_ids(&self) -> Result<Vec<VectorId>>;
330
331    /// Search for similar vectors
332    fn search_similar(&self, query: &Vector, k: usize) -> Result<Vec<(VectorId, f32)>>;
333
334    /// Remove a vector by ID
335    fn remove_vector(&mut self, id: &VectorId) -> Result<bool>;
336
337    /// Get the number of vectors stored
338    fn len(&self) -> usize;
339
340    /// Check if the store is empty
341    fn is_empty(&self) -> bool {
342        self.len() == 0
343    }
344}
345
346/// Precision types for vectors
347#[derive(Debug, Clone, Copy, PartialEq, serde::Serialize, serde::Deserialize)]
348pub enum VectorPrecision {
349    F32,
350    F64,
351    F16,
352    I8,
353    Binary,
354}
355
356/// Multi-precision vector with enhanced functionality
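///
/// # Example
///
/// A small illustration of the constructors defined below (the values are arbitrary).
///
/// ```
/// use oxirs_vec::{Vector, VectorPrecision};
///
/// let v = Vector::new(vec![1.0, 2.0, 3.0]);
/// assert_eq!(v.dimensions, 3);
/// assert_eq!(v.precision, VectorPrecision::F32);
///
/// // A packed binary vector counts 8 dimensions per byte.
/// let b = Vector::binary(vec![0b1010_1010]);
/// assert_eq!(b.dimensions, 8);
/// ```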
357#[derive(Debug, Clone, PartialEq, serde::Serialize, serde::Deserialize)]
358pub struct Vector {
359    pub dimensions: usize,
360    pub precision: VectorPrecision,
361    pub values: VectorData,
362    pub metadata: Option<std::collections::HashMap<String, String>>,
363}
364
365/// Vector data storage supporting multiple precisions
366#[derive(Debug, Clone, PartialEq, serde::Serialize, serde::Deserialize)]
367pub enum VectorData {
368    F32(Vec<f32>),
369    F64(Vec<f64>),
370    F16(Vec<u16>), // Using u16 to represent f16 bits
371    I8(Vec<i8>),
372    Binary(Vec<u8>), // Packed binary representation
373}
374
375impl Vector {
376    /// Create a new F32 vector from values
377    pub fn new(values: Vec<f32>) -> Self {
378        let dimensions = values.len();
379        Self {
380            dimensions,
381            precision: VectorPrecision::F32,
382            values: VectorData::F32(values),
383            metadata: None,
384        }
385    }
386
387    /// Create a new vector with specific precision
388    pub fn with_precision(values: VectorData) -> Self {
389        let (dimensions, precision) = match &values {
390            VectorData::F32(v) => (v.len(), VectorPrecision::F32),
391            VectorData::F64(v) => (v.len(), VectorPrecision::F64),
392            VectorData::F16(v) => (v.len(), VectorPrecision::F16),
393            VectorData::I8(v) => (v.len(), VectorPrecision::I8),
394            VectorData::Binary(v) => (v.len() * 8, VectorPrecision::Binary), // 8 bits per byte
395        };
396
397        Self {
398            dimensions,
399            precision,
400            values,
401            metadata: None,
402        }
403    }
404
405    /// Create a new vector with metadata
406    pub fn with_metadata(
407        values: Vec<f32>,
408        metadata: std::collections::HashMap<String, String>,
409    ) -> Self {
410        let dimensions = values.len();
411        Self {
412            dimensions,
413            precision: VectorPrecision::F32,
414            values: VectorData::F32(values),
415            metadata: Some(metadata),
416        }
417    }
418
419    /// Create F64 vector
420    pub fn f64(values: Vec<f64>) -> Self {
421        Self::with_precision(VectorData::F64(values))
422    }
423
424    /// Create F16 vector (using u16 representation)
425    pub fn f16(values: Vec<u16>) -> Self {
426        Self::with_precision(VectorData::F16(values))
427    }
428
429    /// Create I8 quantized vector
430    pub fn i8(values: Vec<i8>) -> Self {
431        Self::with_precision(VectorData::I8(values))
432    }
433
434    /// Create binary vector
435    pub fn binary(values: Vec<u8>) -> Self {
436        Self::with_precision(VectorData::Binary(values))
437    }
438
439    /// Get vector values as f32 (converting if necessary)
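    ///
    /// A short illustration of the widening conversions; the expected numbers follow the
    /// formulas used in the implementation below.
    ///
    /// ```
    /// use oxirs_vec::Vector;
    ///
    /// let v = Vector::i8(vec![64, -128]);
    /// assert_eq!(v.as_f32(), vec![0.5, -1.0]); // i8 values are scaled by 1/128
    ///
    /// let b = Vector::binary(vec![0b0000_0101]);
    /// assert_eq!(b.as_f32(), vec![1.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0]); // bit 0 first
    /// ```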
440    pub fn as_f32(&self) -> Vec<f32> {
441        match &self.values {
442            VectorData::F32(v) => v.clone(),
443            VectorData::F64(v) => v.iter().map(|&x| x as f32).collect(),
444            VectorData::F16(v) => v.iter().map(|&x| Self::f16_to_f32(x)).collect(),
            VectorData::I8(v) => v.iter().map(|&x| x as f32 / 128.0).collect(), // Scale to approximately [-1, 1]
446            VectorData::Binary(v) => {
447                let mut result = Vec::new();
448                for &byte in v {
449                    for bit in 0..8 {
450                        result.push(if (byte >> bit) & 1 == 1 { 1.0 } else { 0.0 });
451                    }
452                }
453                result
454            }
455        }
456    }
457
458    /// Convert f32 to f16 representation (simplified)
459    #[allow(dead_code)]
460    fn f32_to_f16(value: f32) -> u16 {
461        // Simplified f16 conversion - in practice, use proper IEEE 754 half-precision
462        let bits = value.to_bits();
463        let sign = (bits >> 31) & 0x1;
464        let exp = ((bits >> 23) & 0xff) as i32;
465        let mantissa = bits & 0x7fffff;
466
467        // Simplified conversion
468        let f16_exp = if exp == 0 {
469            0
470        } else {
471            (exp - 127 + 15).clamp(0, 31) as u16
472        };
473
474        let f16_mantissa = (mantissa >> 13) as u16;
475        ((sign as u16) << 15) | (f16_exp << 10) | f16_mantissa
476    }
477
478    /// Convert f16 representation to f32 (simplified)
479    fn f16_to_f32(value: u16) -> f32 {
480        // Simplified f16 conversion - in practice, use proper IEEE 754 half-precision
481        let sign = (value >> 15) & 0x1;
482        let exp = ((value >> 10) & 0x1f) as i32;
483        let mantissa = value & 0x3ff;
484
485        if exp == 0 {
486            if mantissa == 0 {
487                if sign == 1 {
488                    -0.0
489                } else {
490                    0.0
491                }
            } else {
                // Subnormal f16: reconstruct directly as mantissa * 2^-24 (with sign)
                let magnitude = (mantissa as f32) * 2.0f32.powi(-24);
                if sign == 1 {
                    -magnitude
                } else {
                    magnitude
                }
            }
498        } else {
499            let f32_exp = exp - 15 + 127;
500            let f32_mantissa = (mantissa as u32) << 13;
501            f32::from_bits(((sign as u32) << 31) | ((f32_exp as u32) << 23) | f32_mantissa)
502        }
503    }
504
505    /// Quantize f32 vector to i8
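    ///
    /// A short example of the min/max affine quantization implemented below; the exact
    /// values follow from `(x - min) / range * 254 - 127`.
    ///
    /// ```
    /// use oxirs_vec::Vector;
    ///
    /// assert_eq!(Vector::quantize_to_i8(&[0.0, 0.5, 1.0]), vec![-127, 0, 127]);
    ///
    /// // A constant input has zero range and quantizes to all zeros.
    /// assert_eq!(Vector::quantize_to_i8(&[3.0, 3.0]), vec![0, 0]);
    /// ```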
506    pub fn quantize_to_i8(values: &[f32]) -> Vec<i8> {
507        // Find min/max for normalization
508        let min_val = values.iter().fold(f32::INFINITY, |a, &b| a.min(b));
509        let max_val = values.iter().fold(f32::NEG_INFINITY, |a, &b| a.max(b));
510        let range = max_val - min_val;
511
512        if range == 0.0 {
513            vec![0; values.len()]
514        } else {
515            values
516                .iter()
517                .map(|&x| {
518                    let normalized = (x - min_val) / range; // 0 to 1
519                    let scaled = normalized * 254.0 - 127.0; // -127 to 127
520                    scaled.round().clamp(-127.0, 127.0) as i8
521                })
522                .collect()
523        }
524    }
525
526    /// Convert to binary representation using threshold
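    ///
    /// A small example of the threshold-based bit packing performed below; bit 0 of each
    /// output byte corresponds to the first input value.
    ///
    /// ```
    /// use oxirs_vec::Vector;
    ///
    /// let bits = Vector::to_binary(&[0.8, -0.3, 0.1, -0.9], 0.0);
    /// assert_eq!(bits, vec![0b0000_0101]);
    /// ```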
527    pub fn to_binary(values: &[f32], threshold: f32) -> Vec<u8> {
528        let mut binary = Vec::new();
529        let mut current_byte = 0u8;
530        let mut bit_position = 0;
531
532        for &value in values {
533            if value > threshold {
534                current_byte |= 1 << bit_position;
535            }
536
537            bit_position += 1;
538            if bit_position == 8 {
539                binary.push(current_byte);
540                current_byte = 0;
541                bit_position = 0;
542            }
543        }
544
545        // Handle remaining bits
546        if bit_position > 0 {
547            binary.push(current_byte);
548        }
549
550        binary
551    }
552
553    /// Calculate cosine similarity with another vector
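    ///
    /// A brief example; both vectors must have the same dimensionality or an error is returned.
    ///
    /// ```
    /// use oxirs_vec::Vector;
    ///
    /// let a = Vector::new(vec![1.0, 0.0]);
    /// let b = Vector::new(vec![0.0, 1.0]);
    /// assert!(a.cosine_similarity(&b).unwrap().abs() < 1e-6); // orthogonal vectors
    /// assert!((a.cosine_similarity(&a).unwrap() - 1.0).abs() < 1e-6);
    /// ```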
554    pub fn cosine_similarity(&self, other: &Vector) -> Result<f32> {
555        if self.dimensions != other.dimensions {
556            return Err(anyhow::anyhow!("Vector dimensions must match"));
557        }
558
559        let self_f32 = self.as_f32();
560        let other_f32 = other.as_f32();
561
562        let dot_product: f32 = self_f32.iter().zip(&other_f32).map(|(a, b)| a * b).sum();
563
564        let magnitude_self: f32 = self_f32.iter().map(|x| x * x).sum::<f32>().sqrt();
565        let magnitude_other: f32 = other_f32.iter().map(|x| x * x).sum::<f32>().sqrt();
566
567        if magnitude_self == 0.0 || magnitude_other == 0.0 {
568            return Ok(0.0);
569        }
570
571        Ok(dot_product / (magnitude_self * magnitude_other))
572    }
573
574    /// Calculate Euclidean distance to another vector
575    pub fn euclidean_distance(&self, other: &Vector) -> Result<f32> {
576        if self.dimensions != other.dimensions {
577            return Err(anyhow::anyhow!("Vector dimensions must match"));
578        }
579
580        let self_f32 = self.as_f32();
581        let other_f32 = other.as_f32();
582
583        let distance = self_f32
584            .iter()
585            .zip(&other_f32)
586            .map(|(a, b)| (a - b).powi(2))
587            .sum::<f32>()
588            .sqrt();
589
590        Ok(distance)
591    }
592
593    /// Calculate Manhattan distance (L1 norm) to another vector
594    pub fn manhattan_distance(&self, other: &Vector) -> Result<f32> {
595        if self.dimensions != other.dimensions {
596            return Err(anyhow::anyhow!("Vector dimensions must match"));
597        }
598
599        let self_f32 = self.as_f32();
600        let other_f32 = other.as_f32();
601
602        let distance = self_f32
603            .iter()
604            .zip(&other_f32)
605            .map(|(a, b)| (a - b).abs())
606            .sum();
607
608        Ok(distance)
609    }
610
611    /// Calculate Minkowski distance (general Lp norm) to another vector
612    pub fn minkowski_distance(&self, other: &Vector, p: f32) -> Result<f32> {
613        if self.dimensions != other.dimensions {
614            return Err(anyhow::anyhow!("Vector dimensions must match"));
615        }
616
617        if p <= 0.0 {
618            return Err(anyhow::anyhow!("p must be positive"));
619        }
620
621        let self_f32 = self.as_f32();
622        let other_f32 = other.as_f32();
623
624        if p == f32::INFINITY {
625            // Special case: Chebyshev distance
626            return self.chebyshev_distance(other);
627        }
628
629        let distance = self_f32
630            .iter()
631            .zip(&other_f32)
632            .map(|(a, b)| (a - b).abs().powf(p))
633            .sum::<f32>()
634            .powf(1.0 / p);
635
636        Ok(distance)
637    }
638
639    /// Calculate Chebyshev distance (L∞ norm) to another vector
640    pub fn chebyshev_distance(&self, other: &Vector) -> Result<f32> {
641        if self.dimensions != other.dimensions {
642            return Err(anyhow::anyhow!("Vector dimensions must match"));
643        }
644
645        let self_f32 = self.as_f32();
646        let other_f32 = other.as_f32();
647
648        let distance = self_f32
649            .iter()
650            .zip(&other_f32)
651            .map(|(a, b)| (a - b).abs())
652            .fold(0.0f32, |max, val| max.max(val));
653
654        Ok(distance)
655    }
656
657    /// Get vector magnitude (L2 norm)
658    pub fn magnitude(&self) -> f32 {
659        let values = self.as_f32();
660        values.iter().map(|x| x * x).sum::<f32>().sqrt()
661    }
662
663    /// Normalize vector to unit length
664    pub fn normalize(&mut self) {
665        let mag = self.magnitude();
666        if mag > 0.0 {
667            match &mut self.values {
668                VectorData::F32(values) => {
669                    for value in values {
670                        *value /= mag;
671                    }
672                }
673                VectorData::F64(values) => {
674                    let mag_f64 = mag as f64;
675                    for value in values {
676                        *value /= mag_f64;
677                    }
678                }
679                _ => {
680                    // For other types, convert to f32, normalize, then convert back
681                    let mut f32_values = self.as_f32();
682                    for value in &mut f32_values {
683                        *value /= mag;
684                    }
685                    self.values = VectorData::F32(f32_values);
686                    self.precision = VectorPrecision::F32;
687                }
688            }
689        }
690    }
691
692    /// Get a normalized copy of this vector
693    pub fn normalized(&self) -> Vector {
694        let mut normalized = self.clone();
695        normalized.normalize();
696        normalized
697    }
698
699    /// Add another vector (element-wise)
700    pub fn add(&self, other: &Vector) -> Result<Vector> {
701        if self.dimensions != other.dimensions {
702            return Err(anyhow::anyhow!("Vector dimensions must match"));
703        }
704
705        let self_f32 = self.as_f32();
706        let other_f32 = other.as_f32();
707
708        let result_values: Vec<f32> = self_f32
709            .iter()
710            .zip(&other_f32)
711            .map(|(a, b)| a + b)
712            .collect();
713
714        Ok(Vector::new(result_values))
715    }
716
717    /// Subtract another vector (element-wise)
718    pub fn subtract(&self, other: &Vector) -> Result<Vector> {
719        if self.dimensions != other.dimensions {
720            return Err(anyhow::anyhow!("Vector dimensions must match"));
721        }
722
723        let self_f32 = self.as_f32();
724        let other_f32 = other.as_f32();
725
726        let result_values: Vec<f32> = self_f32
727            .iter()
728            .zip(&other_f32)
729            .map(|(a, b)| a - b)
730            .collect();
731
732        Ok(Vector::new(result_values))
733    }
734
735    /// Scale vector by a scalar
736    pub fn scale(&self, scalar: f32) -> Vector {
737        let values = self.as_f32();
738        let scaled_values: Vec<f32> = values.iter().map(|x| x * scalar).collect();
739
740        Vector::new(scaled_values)
741    }
742
743    /// Get the number of dimensions in the vector
744    pub fn len(&self) -> usize {
745        self.dimensions
746    }
747
748    /// Check if vector is empty (zero dimensions)
749    pub fn is_empty(&self) -> bool {
750        self.dimensions == 0
751    }
752
    /// Get the vector's values as an owned `Vec<f32>` (equivalent to `as_f32`)
754    pub fn as_slice(&self) -> Vec<f32> {
755        self.as_f32()
756    }
757}
758
759/// Vector index trait for efficient similarity search
760pub trait VectorIndex: Send + Sync {
761    /// Insert a vector with associated URI
762    fn insert(&mut self, uri: String, vector: Vector) -> Result<()>;
763
764    /// Find k nearest neighbors
765    fn search_knn(&self, query: &Vector, k: usize) -> Result<Vec<(String, f32)>>;
766
767    /// Find all vectors within threshold similarity
768    fn search_threshold(&self, query: &Vector, threshold: f32) -> Result<Vec<(String, f32)>>;
769
770    /// Get a vector by its URI
771    fn get_vector(&self, uri: &str) -> Option<&Vector>;
772
773    /// Add a vector with associated ID and metadata
774    fn add_vector(
775        &mut self,
776        id: VectorId,
777        vector: Vector,
778        _metadata: Option<HashMap<String, String>>,
779    ) -> Result<()> {
780        // Default implementation that delegates to insert
781        self.insert(id, vector)
782    }
783
784    /// Update an existing vector
785    fn update_vector(&mut self, id: VectorId, vector: Vector) -> Result<()> {
786        // Default implementation that delegates to insert
787        self.insert(id, vector)
788    }
789
790    /// Update metadata for a vector
791    fn update_metadata(&mut self, _id: VectorId, _metadata: HashMap<String, String>) -> Result<()> {
792        // Default implementation (no-op)
793        Ok(())
794    }
795
796    /// Remove a vector by its ID
797    fn remove_vector(&mut self, _id: VectorId) -> Result<()> {
798        // Default implementation (no-op)
799        Ok(())
800    }
801}
802
803/// In-memory vector index implementation
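///
/// # Example
///
/// A brief usage sketch. The scores depend on the configured similarity metric, so only
/// the ranking (mirroring the crate's own unit test) is asserted here.
///
/// ```
/// use oxirs_vec::{MemoryVectorIndex, Vector, VectorIndex};
///
/// # fn main() -> anyhow::Result<()> {
/// let mut index = MemoryVectorIndex::new();
/// index.insert("a".to_string(), Vector::new(vec![1.0, 0.0]))?;
/// index.insert("b".to_string(), Vector::new(vec![0.0, 1.0]))?;
///
/// let hits = index.search_knn(&Vector::new(vec![1.0, 0.0]), 1)?;
/// assert_eq!(hits[0].0, "a");
/// # Ok(())
/// # }
/// ```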
804pub struct MemoryVectorIndex {
805    vectors: Vec<(String, Vector)>,
806    similarity_config: similarity::SimilarityConfig,
807}
808
809impl MemoryVectorIndex {
810    pub fn new() -> Self {
811        Self {
812            vectors: Vec::new(),
813            similarity_config: similarity::SimilarityConfig::default(),
814        }
815    }
816
817    pub fn with_similarity_config(config: similarity::SimilarityConfig) -> Self {
818        Self {
819            vectors: Vec::new(),
820            similarity_config: config,
821        }
822    }
823}
824
825impl Default for MemoryVectorIndex {
826    fn default() -> Self {
827        Self::new()
828    }
829}
830
831impl VectorIndex for MemoryVectorIndex {
832    fn insert(&mut self, uri: String, vector: Vector) -> Result<()> {
833        self.vectors.push((uri, vector));
834        Ok(())
835    }
836
837    fn search_knn(&self, query: &Vector, k: usize) -> Result<Vec<(String, f32)>> {
838        let metric = self.similarity_config.primary_metric;
839        let query_f32 = query.as_f32();
840        let mut similarities: Vec<(String, f32)> = self
841            .vectors
842            .iter()
843            .map(|(uri, vec)| {
844                let vec_f32 = vec.as_f32();
845                let sim = metric.similarity(&query_f32, &vec_f32).unwrap_or(0.0);
846                (uri.clone(), sim)
847            })
848            .collect();
849
        similarities.sort_by(|a, b| b.1.partial_cmp(&a.1).unwrap_or(std::cmp::Ordering::Equal));
851        similarities.truncate(k);
852
853        Ok(similarities)
854    }
855
856    fn search_threshold(&self, query: &Vector, threshold: f32) -> Result<Vec<(String, f32)>> {
857        let metric = self.similarity_config.primary_metric;
858        let query_f32 = query.as_f32();
859        let similarities: Vec<(String, f32)> = self
860            .vectors
861            .iter()
862            .filter_map(|(uri, vec)| {
863                let vec_f32 = vec.as_f32();
864                let sim = metric.similarity(&query_f32, &vec_f32).unwrap_or(0.0);
865                if sim >= threshold {
866                    Some((uri.clone(), sim))
867                } else {
868                    None
869                }
870            })
871            .collect();
872
873        Ok(similarities)
874    }
875
876    fn get_vector(&self, uri: &str) -> Option<&Vector> {
877        self.vectors.iter().find(|(u, _)| u == uri).map(|(_, v)| v)
878    }
879}
880
881/// Enhanced vector store with embedding management and advanced features
882pub struct VectorStore {
883    index: Box<dyn VectorIndex>,
884    embedding_manager: Option<embeddings::EmbeddingManager>,
885    config: VectorStoreConfig,
886}
887
888/// Configuration for vector store
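///
/// # Example
///
/// A sketch of overriding the defaults; the field values shown are illustrative.
///
/// ```
/// use oxirs_vec::{VectorStore, VectorStoreConfig};
///
/// let config = VectorStoreConfig {
///     similarity_threshold: 0.85,
///     max_results: 20,
///     ..Default::default()
/// };
/// let _store = VectorStore::new().with_config(config);
/// ```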
889#[derive(Debug, Clone)]
890pub struct VectorStoreConfig {
891    pub auto_embed: bool,
892    pub cache_embeddings: bool,
893    pub similarity_threshold: f32,
894    pub max_results: usize,
895}
896
897impl Default for VectorStoreConfig {
898    fn default() -> Self {
899        Self {
900            auto_embed: true,
901            cache_embeddings: true,
902            similarity_threshold: 0.7,
903            max_results: 100,
904        }
905    }
906}
907
908impl VectorStore {
909    /// Create a new vector store with default memory index
910    pub fn new() -> Self {
911        Self {
912            index: Box::new(MemoryVectorIndex::new()),
913            embedding_manager: None,
914            config: VectorStoreConfig::default(),
915        }
916    }
917
918    /// Create vector store with specific embedding strategy
919    pub fn with_embedding_strategy(strategy: embeddings::EmbeddingStrategy) -> Result<Self> {
920        let embedding_manager = embeddings::EmbeddingManager::new(strategy, 1000)?;
921
922        Ok(Self {
923            index: Box::new(MemoryVectorIndex::new()),
924            embedding_manager: Some(embedding_manager),
925            config: VectorStoreConfig::default(),
926        })
927    }
928
929    /// Create vector store with custom index
930    pub fn with_index(index: Box<dyn VectorIndex>) -> Self {
931        Self {
932            index,
933            embedding_manager: None,
934            config: VectorStoreConfig::default(),
935        }
936    }
937
938    /// Create vector store with custom index and embedding strategy
939    pub fn with_index_and_embeddings(
940        index: Box<dyn VectorIndex>,
941        strategy: embeddings::EmbeddingStrategy,
942    ) -> Result<Self> {
943        let embedding_manager = embeddings::EmbeddingManager::new(strategy, 1000)?;
944
945        Ok(Self {
946            index,
947            embedding_manager: Some(embedding_manager),
948            config: VectorStoreConfig::default(),
949        })
950    }
951
952    /// Set vector store configuration
953    pub fn with_config(mut self, config: VectorStoreConfig) -> Self {
954        self.config = config;
955        self
956    }
957
958    /// Index a resource with automatic embedding generation
959    pub fn index_resource(&mut self, uri: String, content: &str) -> Result<()> {
960        if let Some(ref mut embedding_manager) = self.embedding_manager {
961            let embeddable_content = embeddings::EmbeddableContent::Text(content.to_string());
962            let vector = embedding_manager.get_embedding(&embeddable_content)?;
963            self.index.insert(uri, vector)
964        } else {
965            // Generate a simple hash-based vector as fallback
966            let vector = self.generate_fallback_vector(content);
967            self.index.insert(uri, vector)
968        }
969    }
970
971    /// Index an RDF resource with structured content
972    pub fn index_rdf_resource(
973        &mut self,
974        uri: String,
975        label: Option<String>,
976        description: Option<String>,
977        properties: std::collections::HashMap<String, Vec<String>>,
978    ) -> Result<()> {
979        if let Some(ref mut embedding_manager) = self.embedding_manager {
980            let embeddable_content = embeddings::EmbeddableContent::RdfResource {
981                uri: uri.clone(),
982                label,
983                description,
984                properties,
985            };
986            let vector = embedding_manager.get_embedding(&embeddable_content)?;
987            self.index.insert(uri, vector)
988        } else {
989            Err(anyhow::anyhow!(
990                "Embedding manager required for RDF resource indexing"
991            ))
992        }
993    }
994
995    /// Index a pre-computed vector
996    pub fn index_vector(&mut self, uri: String, vector: Vector) -> Result<()> {
997        self.index.insert(uri, vector)
998    }
999
1000    /// Search for similar resources using text query
1001    pub fn similarity_search(&self, query: &str, limit: usize) -> Result<Vec<(String, f32)>> {
        // NOTE: `EmbeddingManager::get_embedding` requires `&mut self`, but this method
        // only has `&self`, so the query is currently embedded with the hash-based
        // fallback vector even when an embedding manager is configured.
        let query_vector = self.generate_fallback_vector(query);
1010
1011        self.index.search_knn(&query_vector, limit)
1012    }
1013
1014    /// Search for similar resources using a vector query
1015    pub fn similarity_search_vector(
1016        &self,
1017        query: &Vector,
1018        limit: usize,
1019    ) -> Result<Vec<(String, f32)>> {
1020        self.index.search_knn(query, limit)
1021    }
1022
1023    /// Find resources within similarity threshold
1024    pub fn threshold_search(&self, query: &str, threshold: f32) -> Result<Vec<(String, f32)>> {
1025        let query_vector = self.generate_fallback_vector(query);
1026        self.index.search_threshold(&query_vector, threshold)
1027    }
1028
1029    /// Advanced search with multiple options
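    ///
    /// A minimal sketch combining `SearchOptions`, `SearchQuery`, and `SearchType` with a
    /// pre-computed vector, so no embedding model is needed.
    ///
    /// ```
    /// use oxirs_vec::{SearchOptions, SearchQuery, SearchType, Vector, VectorStore};
    ///
    /// # fn main() -> anyhow::Result<()> {
    /// let mut store = VectorStore::new();
    /// store.index_vector("v1".to_string(), Vector::new(vec![1.0, 0.0]))?;
    ///
    /// let options = SearchOptions {
    ///     query: SearchQuery::Vector(Vector::new(vec![1.0, 0.0])),
    ///     search_type: SearchType::KNN(1),
    /// };
    /// assert_eq!(store.advanced_search(options)?.len(), 1);
    /// # Ok(())
    /// # }
    /// ```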
1030    pub fn advanced_search(&self, options: SearchOptions) -> Result<Vec<(String, f32)>> {
1031        let query_vector = match options.query {
1032            SearchQuery::Text(text) => self.generate_fallback_vector(&text),
1033            SearchQuery::Vector(vector) => vector,
1034        };
1035
1036        let results = match options.search_type {
1037            SearchType::KNN(k) => self.index.search_knn(&query_vector, k)?,
1038            SearchType::Threshold(threshold) => {
1039                self.index.search_threshold(&query_vector, threshold)?
1040            }
1041        };
1042
1043        Ok(results)
1044    }
1045
1046    fn generate_fallback_vector(&self, text: &str) -> Vector {
1047        // Simple hash-based vector generation for fallback
1048        use std::collections::hash_map::DefaultHasher;
1049        use std::hash::{Hash, Hasher};
1050
1051        let mut hasher = DefaultHasher::new();
1052        text.hash(&mut hasher);
1053        let hash = hasher.finish();
1054
1055        let mut values = Vec::with_capacity(384); // Standard embedding size
1056        let mut seed = hash;
1057
1058        for _ in 0..384 {
1059            seed = seed.wrapping_mul(1103515245).wrapping_add(12345);
1060            let normalized = (seed as f32) / (u64::MAX as f32);
1061            values.push((normalized - 0.5) * 2.0); // Range: -1.0 to 1.0
1062        }
1063
1064        Vector::new(values)
1065    }
1066
1067    /// Get embedding manager statistics
1068    pub fn embedding_stats(&self) -> Option<(usize, usize)> {
1069        self.embedding_manager.as_ref().map(|em| em.cache_stats())
1070    }
1071
1072    /// Build vocabulary for TF-IDF embeddings
1073    pub fn build_vocabulary(&mut self, documents: &[String]) -> Result<()> {
1074        if let Some(ref mut embedding_manager) = self.embedding_manager {
1075            embedding_manager.build_vocabulary(documents)
1076        } else {
1077            Ok(()) // No-op if no embedding manager
1078        }
1079    }
1080
1081    /// Calculate similarity between two resources by their URIs
1082    pub fn calculate_similarity(&self, uri1: &str, uri2: &str) -> Result<f32> {
1083        // If the URIs are identical, return perfect similarity
1084        if uri1 == uri2 {
1085            return Ok(1.0);
1086        }
1087
1088        // Get the vectors for both URIs
1089        let vector1 = self
1090            .index
1091            .get_vector(uri1)
1092            .ok_or_else(|| anyhow::anyhow!("Vector not found for URI: {}", uri1))?;
1093
1094        let vector2 = self
1095            .index
1096            .get_vector(uri2)
1097            .ok_or_else(|| anyhow::anyhow!("Vector not found for URI: {}", uri2))?;
1098
1099        // Calculate cosine similarity between the vectors
1100        vector1.cosine_similarity(vector2)
1101    }
1102
1103    /// Get a vector by its ID (delegates to VectorIndex)
1104    pub fn get_vector(&self, id: &str) -> Option<&Vector> {
1105        self.index.get_vector(id)
1106    }
1107}
1108
1109impl Default for VectorStore {
1110    fn default() -> Self {
1111        Self::new()
1112    }
1113}
1114
1115impl VectorStoreTrait for VectorStore {
1116    fn insert_vector(&mut self, id: VectorId, vector: Vector) -> Result<()> {
1117        self.index.insert(id, vector)
1118    }
1119
1120    fn add_vector(&mut self, vector: Vector) -> Result<VectorId> {
1121        // Generate a unique ID for the vector
1122        let id = format!("vec_{}", uuid::Uuid::new_v4());
1123        self.index.insert(id.clone(), vector)?;
1124        Ok(id)
1125    }
1126
1127    fn get_vector(&self, id: &VectorId) -> Result<Option<Vector>> {
1128        Ok(self.index.get_vector(id).cloned())
1129    }
1130
1131    fn get_all_vector_ids(&self) -> Result<Vec<VectorId>> {
1132        // For now, return empty vec as VectorIndex doesn't provide this method
1133        // This could be enhanced if the underlying index supports it
1134        Ok(Vec::new())
1135    }
1136
1137    fn search_similar(&self, query: &Vector, k: usize) -> Result<Vec<(VectorId, f32)>> {
1138        self.index.search_knn(query, k)
1139    }
1140
1141    fn remove_vector(&mut self, id: &VectorId) -> Result<bool> {
1142        // VectorIndex trait doesn't have remove, so we'll return false for now
1143        // This could be enhanced in the future if needed
1144        let _ = id;
1145        Ok(false)
1146    }
1147
1148    fn len(&self) -> usize {
1149        // VectorIndex trait doesn't have len, so we'll return 0 for now
1150        // This could be enhanced in the future if needed
1151        0
1152    }
1153}
1154
1155/// Search query types
1156#[derive(Debug, Clone)]
1157pub enum SearchQuery {
1158    Text(String),
1159    Vector(Vector),
1160}
1161
1162/// Search operation types
1163#[derive(Debug, Clone)]
1164pub enum SearchType {
1165    KNN(usize),
1166    Threshold(f32),
1167}
1168
1169/// Advanced search options
1170#[derive(Debug, Clone)]
1171pub struct SearchOptions {
1172    pub query: SearchQuery,
1173    pub search_type: SearchType,
1174}
1175
1176/// Vector operation results with enhanced metadata
1177#[derive(Debug, Clone)]
1178pub struct VectorOperationResult {
1179    pub uri: String,
1180    pub similarity: f32,
1181    pub vector: Option<Vector>,
1182    pub metadata: Option<std::collections::HashMap<String, String>>,
1183    pub rank: usize,
1184}
1185
1186/// Document batch processing utilities
1187pub struct DocumentBatchProcessor;
1188
1189impl DocumentBatchProcessor {
1190    /// Process multiple documents in batch for efficient indexing
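    ///
    /// A minimal sketch; without an embedding manager the store falls back to hash-based
    /// vectors, so this runs without any model setup.
    ///
    /// ```
    /// use oxirs_vec::{DocumentBatchProcessor, VectorStore};
    ///
    /// # fn main() -> anyhow::Result<()> {
    /// let mut store = VectorStore::new();
    /// let docs = vec![
    ///     ("http://example.org/a".to_string(), "first document".to_string()),
    ///     ("http://example.org/b".to_string(), "second document".to_string()),
    /// ];
    /// let outcomes = DocumentBatchProcessor::batch_index(&mut store, &docs)?;
    /// assert_eq!(outcomes.len(), 2);
    /// # Ok(())
    /// # }
    /// ```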
1191    pub fn batch_index(
1192        store: &mut VectorStore,
1193        documents: &[(String, String)], // (uri, content) pairs
1194    ) -> Result<Vec<Result<()>>> {
1195        let mut results = Vec::new();
1196
1197        for (uri, content) in documents {
1198            let result = store.index_resource(uri.clone(), content);
1199            results.push(result);
1200        }
1201
1202        Ok(results)
1203    }
1204
1205    /// Process multiple queries in batch
1206    pub fn batch_search(
1207        store: &VectorStore,
1208        queries: &[String],
1209        limit: usize,
1210    ) -> Result<BatchSearchResult> {
1211        let mut results = Vec::new();
1212
1213        for query in queries {
1214            let result = store.similarity_search(query, limit);
1215            results.push(result);
1216        }
1217
1218        Ok(results)
1219    }
1220}
1221
1222/// Error types specific to vector operations
1223#[derive(Debug, thiserror::Error)]
1224pub enum VectorError {
1225    #[error("Dimension mismatch: expected {expected}, got {actual}")]
1226    DimensionMismatch { expected: usize, actual: usize },
1227
1228    #[error("Empty vector")]
1229    EmptyVector,
1230
1231    #[error("Index not built")]
1232    IndexNotBuilt,
1233
1234    #[error("Embedding generation failed: {message}")]
1235    EmbeddingError { message: String },
1236
1237    #[error("SPARQL service error: {message}")]
1238    SparqlServiceError { message: String },
1239
1240    #[error("Compression error: {0}")]
1241    CompressionError(String),
1242
1243    #[error("Invalid dimensions: {0}")]
1244    InvalidDimensions(String),
1245
1246    #[error("Unsupported operation: {0}")]
1247    UnsupportedOperation(String),
1248
1249    #[error("Invalid data: {0}")]
1250    InvalidData(String),
1251
1252    #[error("IO error: {0}")]
1253    IoError(#[from] std::io::Error),
1254}
1255
1256/// Utility functions for vector operations
1257pub mod utils {
1258    use super::Vector;
1259
1260    /// Calculate centroid of a set of vectors
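    ///
    /// A small example; all input vectors must share the same dimensionality, otherwise
    /// `None` is returned.
    ///
    /// ```
    /// use oxirs_vec::{utils, Vector};
    ///
    /// let vectors = vec![Vector::new(vec![0.0, 2.0]), Vector::new(vec![2.0, 0.0])];
    /// let c = utils::centroid(&vectors).unwrap();
    /// assert_eq!(c.as_f32(), vec![1.0, 1.0]);
    /// ```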
1261    pub fn centroid(vectors: &[Vector]) -> Option<Vector> {
1262        if vectors.is_empty() {
1263            return None;
1264        }
1265
1266        let dimensions = vectors[0].dimensions;
1267        let mut sum_values = vec![0.0; dimensions];
1268
1269        for vector in vectors {
1270            if vector.dimensions != dimensions {
1271                return None; // Inconsistent dimensions
1272            }
1273
1274            let vector_f32 = vector.as_f32();
1275            for (i, &value) in vector_f32.iter().enumerate() {
1276                sum_values[i] += value;
1277            }
1278        }
1279
1280        let count = vectors.len() as f32;
1281        for value in &mut sum_values {
1282            *value /= count;
1283        }
1284
1285        Some(Vector::new(sum_values))
1286    }
1287
1288    /// Generate random vector for testing
1289    pub fn random_vector(dimensions: usize, seed: Option<u64>) -> Vector {
1290        use std::collections::hash_map::DefaultHasher;
1291        use std::hash::{Hash, Hasher};
1292
1293        let mut hasher = DefaultHasher::new();
1294        seed.unwrap_or(42).hash(&mut hasher);
1295        let mut rng_state = hasher.finish();
1296
1297        let mut values = Vec::with_capacity(dimensions);
1298        for _ in 0..dimensions {
1299            rng_state = rng_state.wrapping_mul(1103515245).wrapping_add(12345);
1300            let normalized = (rng_state as f32) / (u64::MAX as f32);
1301            values.push((normalized - 0.5) * 2.0); // Range: -1.0 to 1.0
1302        }
1303
1304        Vector::new(values)
1305    }
1306
1307    /// Convert vector to normalized unit vector
1308    pub fn normalize_vector(vector: &Vector) -> Vector {
1309        vector.normalized()
1310    }
1311}
1312
1313#[cfg(test)]
1314mod tests {
1315    use super::*;
1316    use crate::similarity::SimilarityMetric;
1317
1318    #[test]
1319    fn test_vector_creation() {
1320        let values = vec![1.0, 2.0, 3.0];
1321        let vector = Vector::new(values.clone());
1322
1323        assert_eq!(vector.dimensions, 3);
1324        assert_eq!(vector.precision, VectorPrecision::F32);
1325        assert_eq!(vector.as_f32(), values);
1326    }
1327
1328    #[test]
1329    fn test_multi_precision_vectors() {
1330        // Test F64 vector
1331        let f64_values = vec![1.0, 2.0, 3.0];
1332        let f64_vector = Vector::f64(f64_values.clone());
1333        assert_eq!(f64_vector.precision, VectorPrecision::F64);
1334        assert_eq!(f64_vector.dimensions, 3);
1335
1336        // Test I8 vector
1337        let i8_values = vec![100, -50, 0];
1338        let i8_vector = Vector::i8(i8_values);
1339        assert_eq!(i8_vector.precision, VectorPrecision::I8);
1340        assert_eq!(i8_vector.dimensions, 3);
1341
1342        // Test binary vector
1343        let binary_values = vec![0b10101010, 0b11110000];
1344        let binary_vector = Vector::binary(binary_values);
1345        assert_eq!(binary_vector.precision, VectorPrecision::Binary);
1346        assert_eq!(binary_vector.dimensions, 16); // 2 bytes * 8 bits
1347    }
1348
1349    #[test]
1350    fn test_vector_operations() {
1351        let v1 = Vector::new(vec![1.0, 2.0, 3.0]);
1352        let v2 = Vector::new(vec![4.0, 5.0, 6.0]);
1353
1354        // Test addition
1355        let sum = v1.add(&v2).unwrap();
1356        assert_eq!(sum.as_f32(), vec![5.0, 7.0, 9.0]);
1357
1358        // Test subtraction
1359        let diff = v2.subtract(&v1).unwrap();
1360        assert_eq!(diff.as_f32(), vec![3.0, 3.0, 3.0]);
1361
1362        // Test scaling
1363        let scaled = v1.scale(2.0);
1364        assert_eq!(scaled.as_f32(), vec![2.0, 4.0, 6.0]);
1365    }
1366
1367    #[test]
1368    fn test_cosine_similarity() {
1369        let v1 = Vector::new(vec![1.0, 0.0, 0.0]);
1370        let v2 = Vector::new(vec![1.0, 0.0, 0.0]);
1371        let v3 = Vector::new(vec![0.0, 1.0, 0.0]);
1372
1373        // Identical vectors should have similarity 1.0
1374        assert!((v1.cosine_similarity(&v2).unwrap() - 1.0).abs() < 0.001);
1375
1376        // Orthogonal vectors should have similarity 0.0
1377        assert!((v1.cosine_similarity(&v3).unwrap()).abs() < 0.001);
1378    }
1379
1380    #[test]
1381    fn test_vector_store() {
1382        let mut store = VectorStore::new();
1383
1384        // Test indexing
1385        store
1386            .index_resource("doc1".to_string(), "This is a test")
1387            .unwrap();
1388        store
1389            .index_resource("doc2".to_string(), "Another test document")
1390            .unwrap();
1391
1392        // Test searching
1393        let results = store.similarity_search("test", 5).unwrap();
1394        assert_eq!(results.len(), 2);
1395
1396        // Results should be sorted by similarity (descending)
1397        assert!(results[0].1 >= results[1].1);
1398    }
1399
1400    #[test]
1401    fn test_similarity_metrics() {
1402        let a = vec![1.0, 2.0, 3.0];
1403        let b = vec![4.0, 5.0, 6.0];
1404
1405        // Test different similarity metrics
1406        let cosine_sim = SimilarityMetric::Cosine.similarity(&a, &b).unwrap();
1407        let euclidean_sim = SimilarityMetric::Euclidean.similarity(&a, &b).unwrap();
1408        let manhattan_sim = SimilarityMetric::Manhattan.similarity(&a, &b).unwrap();
1409
1410        // All similarities should be between 0 and 1
1411        assert!((0.0..=1.0).contains(&cosine_sim));
1412        assert!((0.0..=1.0).contains(&euclidean_sim));
1413        assert!((0.0..=1.0).contains(&manhattan_sim));
1414    }
1415
1416    #[test]
1417    fn test_quantization() {
1418        let values = vec![1.0, -0.5, 0.0, 0.75];
1419        let quantized = Vector::quantize_to_i8(&values);
1420
1421        // Check that quantized values are in the expected range
1422        for &q in &quantized {
1423            assert!((-127..=127).contains(&q));
1424        }
1425    }
1426
1427    #[test]
1428    fn test_binary_conversion() {
1429        let values = vec![0.8, -0.3, 0.1, -0.9];
1430        let binary = Vector::to_binary(&values, 0.0);
1431
1432        // Should have 1 byte (4 values, each becomes 1 bit, packed into bytes)
1433        assert_eq!(binary.len(), 1);
1434
1435        // First bit should be 1 (0.8 > 0.0), second should be 0 (-0.3 < 0.0), etc.
1436        let byte = binary[0];
1437        assert_eq!(byte & 1, 1); // bit 0: 0.8 > 0.0
1438        assert_eq!((byte >> 1) & 1, 0); // bit 1: -0.3 < 0.0
1439        assert_eq!((byte >> 2) & 1, 1); // bit 2: 0.1 > 0.0
1440        assert_eq!((byte >> 3) & 1, 0); // bit 3: -0.9 < 0.0
1441    }
1442
1443    #[test]
1444    fn test_memory_vector_index() {
1445        let mut index = MemoryVectorIndex::new();
1446
1447        let v1 = Vector::new(vec![1.0, 0.0, 0.0]);
1448        let v2 = Vector::new(vec![0.0, 1.0, 0.0]);
1449
1450        index.insert("v1".to_string(), v1.clone()).unwrap();
1451        index.insert("v2".to_string(), v2.clone()).unwrap();
1452
1453        // Test KNN search
1454        let results = index.search_knn(&v1, 1).unwrap();
1455        assert_eq!(results.len(), 1);
1456        assert_eq!(results[0].0, "v1");
1457
1458        // Test threshold search
1459        let results = index.search_threshold(&v1, 0.5).unwrap();
1460        assert!(!results.is_empty());
1461    }
1462
1463    #[test]
1464    fn test_hnsw_index() {
1465        use crate::hnsw::{HnswConfig, HnswIndex};
1466
1467        let config = HnswConfig::default();
1468        let mut index = HnswIndex::new(config).unwrap();
1469
1470        let v1 = Vector::new(vec![1.0, 0.0, 0.0]);
1471        let v2 = Vector::new(vec![0.0, 1.0, 0.0]);
1472        let v3 = Vector::new(vec![0.0, 0.0, 1.0]);
1473
1474        index.insert("v1".to_string(), v1.clone()).unwrap();
1475        index.insert("v2".to_string(), v2.clone()).unwrap();
1476        index.insert("v3".to_string(), v3.clone()).unwrap();
1477
1478        // Test KNN search
1479        let results = index.search_knn(&v1, 2).unwrap();
1480        assert!(results.len() <= 2);
1481
1482        // The first result should be v1 itself (highest similarity)
1483        if !results.is_empty() {
1484            assert_eq!(results[0].0, "v1");
1485        }
1486    }
1487
1488    #[test]
1489    fn test_sparql_vector_service() {
1490        use crate::embeddings::EmbeddingStrategy;
1491        use crate::sparql_integration::{
1492            SparqlVectorService, VectorServiceArg, VectorServiceConfig, VectorServiceResult,
1493        };
1494
1495        let config = VectorServiceConfig::default();
1496        let mut service =
1497            SparqlVectorService::new(config, EmbeddingStrategy::SentenceTransformer).unwrap();
1498
1499        // Test vector similarity function
1500        let v1 = Vector::new(vec![1.0, 0.0, 0.0]);
1501        let v2 = Vector::new(vec![1.0, 0.0, 0.0]);
1502
1503        let args = vec![VectorServiceArg::Vector(v1), VectorServiceArg::Vector(v2)];
1504
1505        let result = service
1506            .execute_function("vector_similarity", &args)
1507            .unwrap();
1508
1509        match result {
1510            VectorServiceResult::Number(similarity) => {
1511                assert!((similarity - 1.0).abs() < 0.001); // Should be very similar
1512            }
1513            _ => panic!("Expected a number result"),
1514        }
1515
1516        // Test text embedding function
1517        let text_args = vec![VectorServiceArg::String("test text".to_string())];
1518        let embed_result = service.execute_function("embed_text", &text_args).unwrap();
1519
1520        match embed_result {
1521            VectorServiceResult::Vector(vector) => {
1522                assert_eq!(vector.dimensions, 384); // Default embedding size
1523            }
1524            _ => panic!("Expected a vector result"),
1525        }
1526    }
1527}