oxirs_vec/
lib.rs

//! # OxiRS Vector Search
//!
//! [![Version](https://img.shields.io/badge/version-0.1.0-blue)](https://github.com/cool-japan/oxirs/releases)
//! [![docs.rs](https://docs.rs/oxirs-vec/badge.svg)](https://docs.rs/oxirs-vec)
//!
//! **Status**: Production release (v0.1.0)
//! **Stability**: Public APIs are stable and backed by comprehensive tests and documentation.
//!
//! Vector index abstractions for semantic similarity and AI-augmented SPARQL querying.
//!
//! This crate provides comprehensive vector search capabilities for knowledge graphs,
//! enabling semantic similarity search, AI-augmented SPARQL queries, and hybrid
//! symbolic-vector operations.
//!
//! ## Features
//!
//! - **Multi-algorithm embeddings**: TF-IDF, sentence transformers, custom models
//! - **Advanced indexing**: HNSW, flat, quantized, and multi-index support
//! - **Rich similarity metrics**: Cosine, Euclidean, Pearson, Jaccard, and more
//! - **SPARQL integration**: `vec:similar` service functions and hybrid queries
//! - **Performance optimization**: Caching, batching, and parallel processing
//!
//! ## Quick Start
//!
//! ```rust
//! use oxirs_vec::{VectorStore, embeddings::EmbeddingStrategy};
//!
//! // Create vector store with sentence transformer embeddings
//! let mut store = VectorStore::with_embedding_strategy(
//!     EmbeddingStrategy::SentenceTransformer
//! ).unwrap();
//!
//! // Index some content
//! store
//!     .index_resource(
//!         "http://example.org/doc1".to_string(),
//!         "This is a document about AI",
//!     )
//!     .unwrap();
//! store
//!     .index_resource(
//!         "http://example.org/doc2".to_string(),
//!         "Machine learning tutorial",
//!     )
//!     .unwrap();
//!
//! // Search for similar content
//! let results = store
//!     .similarity_search("artificial intelligence", 5)
//!     .unwrap();
//!
//! println!("Found {} matching resources", results.len());
//! ```
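//!
//! ## Working with vectors directly
//!
//! The `Vector` type defined in this crate's root module supports multiple precisions and
//! basic similarity math. A minimal sketch, using only items defined in this module:
//!
//! ```rust
//! use oxirs_vec::Vector;
//!
//! let a = Vector::new(vec![1.0, 0.0, 0.0]);
//! let b = Vector::new(vec![0.0, 1.0, 0.0]);
//!
//! // Orthogonal vectors have cosine similarity ~0 and Euclidean distance sqrt(2).
//! assert!(a.cosine_similarity(&b).unwrap().abs() < 1e-6);
//! assert!((a.euclidean_distance(&b).unwrap() - 2f32.sqrt()).abs() < 1e-6);
//!
//! // `normalized` returns a unit-length copy.
//! assert!((a.normalized().magnitude() - 1.0).abs() < 1e-6);
//! ```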

#![allow(dead_code)]

use anyhow::Result;
use std::collections::HashMap;
59
60pub mod adaptive_compression;
61pub mod adaptive_intelligent_caching;
62pub mod advanced_analytics;
63pub mod advanced_benchmarking;
64pub mod advanced_caching;
65pub mod advanced_metrics;
66pub mod advanced_result_merging;
67pub mod automl_optimization;
68pub mod benchmarking;
69pub mod cache_friendly_index;
70pub mod clustering;
71pub mod compaction;
72pub mod compression;
73#[cfg(feature = "content-processing")]
74pub mod content_processing;
75pub mod crash_recovery;
76pub mod cross_language_alignment;
77pub mod cross_modal_embeddings;
78pub mod diskann;
79pub mod distance_metrics;
80pub mod distributed_vector_search;
81pub mod dynamic_index_selector;
82pub mod embedding_pipeline;
83pub mod embeddings;
84pub mod enhanced_performance_monitoring;
85pub mod faiss_compatibility;
86pub mod faiss_gpu_integration;
87pub mod faiss_integration;
88pub mod faiss_migration_tools;
89pub mod faiss_native_integration;
90pub mod federated_search;
91pub mod filtered_search;
92pub mod gnn_embeddings;
93pub mod gpu;
94pub mod gpu_benchmarks;
95pub mod graph_aware_search;
96pub mod graph_indices;
97pub mod hierarchical_similarity;
98pub mod hnsw;
99pub mod huggingface;
100pub mod hybrid_fusion;
101pub mod hybrid_search;
102pub mod index;
103pub mod ivf;
104pub mod joint_embedding_spaces;
105pub mod kg_embeddings;
106pub mod learned_index;
107pub mod lsh;
108pub mod mmap_advanced;
109pub mod mmap_index;
110pub mod multi_modal_search;
111pub mod multi_tenancy;
112pub mod nsg;
113pub mod opq;
114pub mod oxirs_arq_integration;
115pub mod performance_insights;
116pub mod persistence;
117pub mod personalized_search;
118pub mod pq;
119pub mod pytorch;
120pub mod quantum_search;
121pub mod query_planning;
122pub mod query_rewriter;
123pub mod random_utils;
124pub mod rdf_content_enhancement;
125pub mod rdf_integration;
126pub mod real_time_analytics;
127pub mod real_time_embedding_pipeline;
128pub mod real_time_updates;
129pub mod reranking;
130pub mod result_fusion;
131pub mod similarity;
132pub mod sparql_integration;
133pub mod sparql_service_endpoint;
134pub mod sparse;
135pub mod sq;
136pub mod storage_optimizations;
137pub mod store_integration;
138pub mod structured_vectors;
139pub mod tensorflow;
140pub mod tiering;
141pub mod tree_indices;
142pub mod validation;
143pub mod wal;
144pub mod word2vec;
145
146// Python bindings module
147#[cfg(feature = "python")]
148pub mod python_bindings;
149
150// Re-export commonly used types
151pub use adaptive_compression::{
152    AdaptiveCompressor, CompressionMetrics, CompressionPriorities, MultiLevelCompression,
153    VectorStats,
154};
155pub use adaptive_intelligent_caching::{
156    AccessPatternAnalyzer, AdaptiveIntelligentCache, CacheConfiguration, CacheOptimizer,
157    CachePerformanceMetrics, CacheTier, MLModels, PredictivePrefetcher,
158};
159pub use advanced_analytics::{
160    AnomalyDetection, AnomalyDetector, AnomalyType, ImplementationEffort,
161    OptimizationRecommendation, PerformanceTrends, Priority, QualityAspect, QualityRecommendation,
162    QueryAnalytics, QueryAnomaly, RecommendationType, VectorAnalyticsEngine,
163    VectorDistributionAnalysis, VectorQualityAssessment,
164};
165pub use advanced_benchmarking::{
166    AdvancedBenchmarkConfig, AdvancedBenchmarkResult, AdvancedBenchmarkSuite, AlgorithmParameters,
167    BenchmarkAlgorithm, BuildTimeMetrics, CacheMetrics, DatasetQualityMetrics, DatasetStatistics,
168    DistanceStatistics, EnhancedBenchmarkDataset, HyperparameterTuner, IndexSizeMetrics,
169    LatencyMetrics, MemoryMetrics, ObjectiveFunction, OptimizationStrategy,
170    ParallelBenchmarkConfig, ParameterSpace, ParameterType, ParameterValue, PerformanceMetrics,
171    PerformanceProfiler, QualityDegradation, QualityMetrics, ScalabilityMetrics,
172    StatisticalAnalyzer, StatisticalMetrics, ThroughputMetrics,
173};
174pub use advanced_caching::{
175    BackgroundCacheWorker, CacheAnalysisReport, CacheAnalyzer, CacheConfig, CacheEntry,
176    CacheInvalidator, CacheKey, CacheStats, CacheWarmer, EvictionPolicy, InvalidationStats,
177    MultiLevelCache, MultiLevelCacheStats,
178};
179pub use advanced_result_merging::{
180    AdvancedResultMerger, ConfidenceInterval, DiversityConfig, DiversityMetric, FusionStatistics,
181    MergedResult, RankFusionAlgorithm, RankingFactor, ResultExplanation, ResultMergingConfig,
182    ResultMetadata, ScoreCombinationStrategy, ScoreNormalizationMethod, ScoredResult,
183    SourceContribution, SourceResult, SourceType,
184};
185pub use automl_optimization::{
186    AutoMLConfig, AutoMLOptimizer, AutoMLResults, AutoMLStatistics, IndexConfiguration,
187    IndexParameterSpace, OptimizationMetric, OptimizationTrial, ResourceConstraints, SearchSpace,
188    TrialResult,
189};
190pub use benchmarking::{
191    BenchmarkConfig, BenchmarkDataset, BenchmarkOutputFormat, BenchmarkResult, BenchmarkRunner,
192    BenchmarkSuite, BenchmarkTestCase, MemoryMetrics as BenchmarkMemoryMetrics,
193    PerformanceMetrics as BenchmarkPerformanceMetrics, QualityMetrics as BenchmarkQualityMetrics,
194    ScalabilityMetrics as BenchmarkScalabilityMetrics, SystemInfo,
195};
196pub use cache_friendly_index::{CacheFriendlyVectorIndex, IndexConfig as CacheFriendlyIndexConfig};
197pub use compaction::{
198    CompactionConfig, CompactionManager, CompactionMetrics, CompactionResult, CompactionState,
199    CompactionStatistics, CompactionStrategy,
200};
201pub use compression::{create_compressor, CompressionMethod, VectorCompressor};
202#[cfg(feature = "content-processing")]
203pub use content_processing::{
204    ChunkType, ChunkingStrategy, ContentChunk, ContentExtractionConfig, ContentLocation,
205    ContentProcessor, DocumentFormat, DocumentStructure, ExtractedContent, ExtractedImage,
206    ExtractedLink, ExtractedTable, FormatHandler, Heading, ProcessingStats, TocEntry,
207};
208pub use crash_recovery::{CrashRecoveryManager, RecoveryConfig, RecoveryPolicy, RecoveryStats};
209pub use cross_modal_embeddings::{
210    AttentionMechanism, AudioData, AudioEncoder, CrossModalConfig, CrossModalEncoder, FusionLayer,
211    FusionStrategy, GraphData, GraphEncoder, ImageData, ImageEncoder, Modality, ModalityData,
212    MultiModalContent, TextEncoder, VideoData, VideoEncoder,
213};
214pub use diskann::{
215    DiskAnnBuildStats, DiskAnnBuilder, DiskAnnConfig, DiskAnnError, DiskAnnIndex, DiskAnnResult,
216    DiskStorage, IndexMetadata as DiskAnnIndexMetadata, MemoryMappedStorage, NodeId,
217    PruningStrategy, SearchMode as DiskAnnSearchMode, SearchStats as DiskAnnSearchStats,
218    StorageBackend, VamanaGraph, VamanaNode, VectorId as DiskAnnVectorId,
219};
220pub use distributed_vector_search::{
221    ConsistencyLevel, DistributedClusterStats, DistributedNodeConfig, DistributedQuery,
222    DistributedSearchResponse, DistributedVectorSearch, LoadBalancingAlgorithm, NodeHealthStatus,
223    PartitioningStrategy, QueryExecutionStrategy,
224};
225pub use dynamic_index_selector::{DynamicIndexSelector, IndexSelectorConfig};
226pub use embedding_pipeline::{
227    DimensionalityReduction, EmbeddingPipeline, NormalizationConfig, PostprocessingPipeline,
228    PreprocessingPipeline, TokenizerConfig, VectorNormalization,
229};
230pub use embeddings::{
231    EmbeddableContent, EmbeddingConfig, EmbeddingManager, EmbeddingStrategy, ModelDetails,
232    OpenAIConfig, OpenAIEmbeddingGenerator, SentenceTransformerGenerator, TransformerModelType,
233};
234pub use enhanced_performance_monitoring::{
235    Alert, AlertManager, AlertSeverity, AlertThresholds, AlertType, AnalyticsEngine,
236    AnalyticsReport, DashboardData, EnhancedPerformanceMonitor, ExportConfig, ExportDestination,
237    ExportFormat, LatencyDistribution, MonitoringConfig as EnhancedMonitoringConfig,
238    QualityMetrics as EnhancedQualityMetrics, QualityMetricsCollector, QualityStatistics,
239    QueryInfo, QueryMetricsCollector, QueryStatistics, QueryType, Recommendation,
240    RecommendationCategory, RecommendationPriority, SystemMetrics, SystemMetricsCollector,
241    SystemStatistics, TrendData, TrendDirection,
242};
243pub use faiss_compatibility::{
244    CompressionLevel, ConversionMetrics, ConversionResult, FaissCompatibility, FaissExportConfig,
245    FaissImportConfig, FaissIndexMetadata, FaissIndexType, FaissMetricType, FaissParameter,
246    SimpleVectorIndex,
247};
248pub use federated_search::{
249    AuthenticationConfig, FederatedSearchConfig, FederatedVectorSearch, FederationEndpoint,
250    PrivacyEngine, PrivacyMode, SchemaCompatibility, TrustManager,
251};
252pub use gnn_embeddings::{AggregatorType, GraphSAGE, GCN};
253pub use gpu::{
254    create_default_accelerator, create_memory_optimized_accelerator,
255    create_performance_accelerator, is_gpu_available, GpuAccelerator, GpuBuffer, GpuConfig,
256    GpuDevice, GpuExecutionConfig,
257};
258pub use gpu_benchmarks::{
259    BenchmarkResult as GpuBenchmarkResult, GpuBenchmarkConfig, GpuBenchmarkSuite,
260};
261pub use graph_indices::{
262    DelaunayGraph, GraphIndex, GraphIndexConfig, GraphType, NSWGraph, ONNGGraph, PANNGGraph,
263    RNGGraph,
264};
265pub use hierarchical_similarity::{
266    ConceptHierarchy, HierarchicalSimilarity, HierarchicalSimilarityConfig,
267    HierarchicalSimilarityResult, HierarchicalSimilarityStats, SimilarityContext,
268    SimilarityExplanation, SimilarityTaskType,
269};
270pub use hnsw::{HnswConfig, HnswIndex};
271pub use hybrid_fusion::{
272    FusedResult, HybridFusion, HybridFusionConfig, HybridFusionStatistics, HybridFusionStrategy,
273    NormalizationMethod,
274};
275pub use hybrid_search::{
276    Bm25Scorer, DocumentScore, HybridQuery, HybridResult, HybridSearchConfig, HybridSearchManager,
277    KeywordAlgorithm, KeywordMatch, KeywordSearcher, QueryExpander, RankFusion, RankFusionStrategy,
278    SearchMode, SearchWeights, TfidfScorer,
279};
280pub use index::{AdvancedVectorIndex, DistanceMetric, IndexConfig, IndexType, SearchResult};
281pub use ivf::{IvfConfig, IvfIndex, IvfStats, QuantizationStrategy};
282pub use joint_embedding_spaces::{
283    ActivationFunction, AlignmentPair, CLIPAligner, ContrastiveOptimizer, CrossModalAttention,
284    CurriculumLearning, DataAugmentation, DifficultySchedule, DomainAdapter, DomainStatistics,
285    JointEmbeddingConfig, JointEmbeddingSpace, LearningRateSchedule, LinearProjector,
286    PacingFunction, ScheduleType, TemperatureScheduler, TrainingStatistics,
287};
288pub use kg_embeddings::{
289    ComplEx, KGEmbedding, KGEmbeddingConfig, KGEmbeddingModel as KGModel, KGEmbeddingModelType,
290    RotatE, TransE, Triple,
291};
292pub use lsh::{LshConfig, LshFamily, LshIndex, LshStats};
293pub use mmap_index::{MemoryMappedIndexStats, MemoryMappedVectorIndex};
294pub use multi_tenancy::{
295    AccessControl, AccessPolicy, BillingEngine, BillingMetrics, BillingPeriod, IsolationLevel,
296    IsolationStrategy, MultiTenancyError, MultiTenancyResult, MultiTenantManager, NamespaceManager,
297    Permission, PricingModel, QuotaEnforcer, QuotaLimits, QuotaUsage, RateLimiter, ResourceQuota,
298    ResourceType, Role, Tenant, TenantConfig, TenantContext, TenantId, TenantManagerConfig,
299    TenantMetadata, TenantOperation, TenantStatistics, TenantStatus, UsageRecord,
300};
301pub use nsg::{DistanceMetric as NsgDistanceMetric, NsgConfig, NsgIndex, NsgStats};
302pub use performance_insights::{
303    AlertingSystem, OptimizationRecommendations, PerformanceInsightsAnalyzer,
304    PerformanceTrends as InsightsPerformanceTrends, QueryComplexity,
305    QueryStatistics as InsightsQueryStatistics, ReportFormat, VectorStatistics,
306};
307pub use pq::{PQConfig, PQIndex, PQStats};
308pub use pytorch::{
309    ArchitectureType, CompileMode, DeviceManager, PyTorchConfig, PyTorchDevice, PyTorchEmbedder,
310    PyTorchModelManager, PyTorchModelMetadata, PyTorchTokenizer,
311};
312pub use quantum_search::{
313    QuantumSearchConfig, QuantumSearchResult, QuantumSearchStatistics, QuantumState,
314    QuantumVectorSearch,
315};
316pub use query_planning::{
317    CostModel, IndexStatistics, QueryCharacteristics, QueryPlan, QueryPlanner, QueryStrategy,
318    VectorQueryType,
319};
320pub use query_rewriter::{
321    QueryRewriter, QueryRewriterConfig, QueryVectorStatistics, RewriteRule, RewrittenQuery,
322};
323pub use rdf_content_enhancement::{
324    ComponentWeights, MultiLanguageProcessor, PathConstraint, PathDirection, PropertyAggregator,
325    PropertyPath, RdfContentConfig, RdfContentProcessor, RdfContext, RdfEntity, RdfValue,
326    TemporalInfo,
327};
328pub use rdf_integration::{
329    RdfIntegrationStats, RdfTermMapping, RdfTermMetadata, RdfTermType, RdfVectorConfig,
330    RdfVectorIntegration, RdfVectorSearchResult, SearchMetadata,
331};
332pub use real_time_analytics::{
333    AlertSeverity as AnalyticsAlertSeverity, AlertType as AnalyticsAlertType, AnalyticsConfig,
334    AnalyticsEvent, AnalyticsReport as RealTimeAnalyticsReport,
335    DashboardData as RealTimeDashboardData, ExportFormat as AnalyticsExportFormat,
336    MetricsCollector, PerformanceMonitor, QueryMetrics, SystemMetrics as AnalyticsSystemMetrics,
337    VectorAnalyticsEngine as RealTimeVectorAnalyticsEngine,
338};
339pub use real_time_embedding_pipeline::{
340    AlertThresholds as PipelineAlertThresholds, AutoScalingConfig, CompressionConfig, ContentItem,
341    MonitoringConfig as PipelineMonitoringConfig, PipelineConfig as RealTimeEmbeddingConfig,
342    PipelineStatistics as PipelineStats, ProcessingPriority, ProcessingResult, ProcessingStatus,
343    RealTimeEmbeddingPipeline, VersioningStrategy,
344};
345pub use real_time_updates::{
346    BatchProcessor, RealTimeConfig, RealTimeVectorSearch, RealTimeVectorUpdater, UpdateBatch,
347    UpdateOperation, UpdatePriority, UpdateStats,
348};
349pub use reranking::{
350    CrossEncoder, CrossEncoderBackend, CrossEncoderModel, CrossEncoderReranker, DiversityReranker,
351    DiversityStrategy, FusionStrategy as RerankingFusionStrategy, ModelBackend, ModelConfig,
352    RerankingCache, RerankingCacheConfig, RerankingConfig, RerankingError, RerankingMode,
353    RerankingOutput, RerankingStats, Result as RerankingResult, ScoreFusion, ScoreFusionConfig,
354    ScoredCandidate,
355};
356pub use result_fusion::{
357    FusedResults, FusionAlgorithm, FusionConfig, FusionQualityMetrics, FusionStats,
358    ResultFusionEngine, ScoreNormalizationStrategy, SourceResults, VectorSearchResult,
359};
360pub use similarity::{AdaptiveSimilarity, SemanticSimilarity, SimilarityConfig, SimilarityMetric};
361pub use sparql_integration::{
362    CrossLanguageProcessor, FederatedQueryResult, QueryExecutor, SparqlVectorFunctions,
363    SparqlVectorService, VectorOperation, VectorQuery, VectorQueryResult, VectorServiceArg,
364    VectorServiceConfig, VectorServiceResult,
365};
366pub use sparql_service_endpoint::{
367    AuthenticationInfo, AuthenticationType, CustomFunctionRegistry, FederatedOperation,
368    FederatedSearchResult, FederatedServiceEndpoint, FederatedVectorQuery, FunctionMetadata,
369    LoadBalancer, ParameterInfo, ParameterType as ServiceParameterType, PartialSearchResult,
370    QueryScope, ReturnType, ServiceCapability, ServiceEndpointManager, ServiceType,
371};
372pub use sparse::{COOMatrix, CSRMatrix, SparseVector};
373pub use sq::{QuantizationMode, QuantizationParams, SqConfig, SqIndex, SqStats};
374pub use storage_optimizations::{
375    CompressionType, MmapVectorFile, StorageConfig, StorageUtils, VectorBlock, VectorFileHeader,
376    VectorReader, VectorWriter,
377};
378pub use structured_vectors::{
379    ConfidenceScoredVector, HierarchicalVector, NamedDimensionVector, TemporalVector,
380    WeightedDimensionVector,
381};
382pub use tensorflow::{
383    OptimizationLevel, PreprocessingPipeline as TensorFlowPreprocessingPipeline, ServerConfig,
384    SessionConfig, TensorDataType, TensorFlowConfig, TensorFlowDevice, TensorFlowEmbedder,
385    TensorFlowModelInfo, TensorFlowModelServer, TensorSpec,
386};
387pub use tiering::{
388    IndexMetadata, StorageTier, TierMetrics, TierStatistics, TierTransitionReason, TieringConfig,
389    TieringManager, TieringPolicy,
390};
391pub use tree_indices::{
392    BallTree, CoverTree, KdTree, RandomProjectionTree, TreeIndex, TreeIndexConfig, TreeType, VpTree,
393};
394pub use wal::{WalConfig, WalEntry, WalManager};
395pub use word2vec::{
396    AggregationMethod, OovStrategy, Word2VecConfig, Word2VecEmbeddingGenerator, Word2VecFormat,
397};
398
399/// Vector identifier type
400pub type VectorId = String;
401
402/// Batch search result type
403pub type BatchSearchResult = Vec<Result<Vec<(String, f32)>>>;
404
405/// Trait for vector store implementations
406pub trait VectorStoreTrait: Send + Sync {
407    /// Insert a vector with metadata
408    fn insert_vector(&mut self, id: VectorId, vector: Vector) -> Result<()>;
409
410    /// Add a vector and return its ID
411    fn add_vector(&mut self, vector: Vector) -> Result<VectorId>;
412
413    /// Get a vector by its ID
414    fn get_vector(&self, id: &VectorId) -> Result<Option<Vector>>;
415
416    /// Get all vector IDs
417    fn get_all_vector_ids(&self) -> Result<Vec<VectorId>>;
418
419    /// Search for similar vectors
420    fn search_similar(&self, query: &Vector, k: usize) -> Result<Vec<(VectorId, f32)>>;
421
422    /// Remove a vector by ID
423    fn remove_vector(&mut self, id: &VectorId) -> Result<bool>;
424
425    /// Get the number of vectors stored
426    fn len(&self) -> usize;
427
428    /// Check if the store is empty
429    fn is_empty(&self) -> bool {
430        self.len() == 0
431    }
432}
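
#[cfg(test)]
mod vector_store_trait_sketch {
    //! A minimal sketch (not part of the public API) showing how `VectorStoreTrait` can be
    //! used through a trait object. It relies only on `VectorStore`, which implements the
    //! trait later in this file; the `urn:example:*` IDs are placeholders.
    use super::*;

    #[test]
    fn use_store_through_trait_object() {
        let mut store = VectorStore::new();
        let store_ref: &mut dyn VectorStoreTrait = &mut store;

        // Insert a vector under an explicit ID, then search for its nearest neighbour.
        store_ref
            .insert_vector("urn:example:a".to_string(), Vector::new(vec![1.0, 0.0, 0.0]))
            .unwrap();
        let hits = store_ref
            .search_similar(&Vector::new(vec![1.0, 0.0, 0.0]), 1)
            .unwrap();
        assert_eq!(hits[0].0, "urn:example:a");
    }
}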
433
434/// Precision types for vectors
435#[derive(Debug, Clone, Copy, PartialEq, serde::Serialize, serde::Deserialize)]
436pub enum VectorPrecision {
437    F32,
438    F64,
439    F16,
440    I8,
441    Binary,
442}
443
444/// Multi-precision vector with enhanced functionality
445#[derive(Debug, Clone, PartialEq, serde::Serialize, serde::Deserialize)]
446pub struct Vector {
447    pub dimensions: usize,
448    pub precision: VectorPrecision,
449    pub values: VectorData,
450    pub metadata: Option<std::collections::HashMap<String, String>>,
451}
452
453/// Vector data storage supporting multiple precisions
454#[derive(Debug, Clone, PartialEq, serde::Serialize, serde::Deserialize)]
455pub enum VectorData {
456    F32(Vec<f32>),
457    F64(Vec<f64>),
458    F16(Vec<u16>), // Using u16 to represent f16 bits
459    I8(Vec<i8>),
460    Binary(Vec<u8>), // Packed binary representation
461}
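
#[cfg(test)]
mod vector_data_layout_sketch {
    //! Sketch illustrating the packed-binary layout used by `VectorData::Binary`: each byte
    //! holds 8 dimensions and `Vector::as_f32` unpacks bits least-significant bit first.
    //! Only types defined in this file are used.
    use super::*;

    #[test]
    fn binary_vectors_unpack_lsb_first() {
        // 0b0000_0001 sets only bit 0, so dimension 0 becomes 1.0 and dimensions 1..8 are 0.0.
        let v = Vector::binary(vec![0b0000_0001]);
        assert_eq!(v.dimensions, 8);
        let dense = v.as_f32();
        assert_eq!(dense[0], 1.0);
        assert!(dense[1..].iter().all(|&x| x == 0.0));
    }
}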
462
463impl Vector {
464    /// Create a new F32 vector from values
465    pub fn new(values: Vec<f32>) -> Self {
466        let dimensions = values.len();
467        Self {
468            dimensions,
469            precision: VectorPrecision::F32,
470            values: VectorData::F32(values),
471            metadata: None,
472        }
473    }
474
475    /// Create a new vector with specific precision
476    pub fn with_precision(values: VectorData) -> Self {
477        let (dimensions, precision) = match &values {
478            VectorData::F32(v) => (v.len(), VectorPrecision::F32),
479            VectorData::F64(v) => (v.len(), VectorPrecision::F64),
480            VectorData::F16(v) => (v.len(), VectorPrecision::F16),
481            VectorData::I8(v) => (v.len(), VectorPrecision::I8),
482            VectorData::Binary(v) => (v.len() * 8, VectorPrecision::Binary), // 8 bits per byte
483        };
484
485        Self {
486            dimensions,
487            precision,
488            values,
489            metadata: None,
490        }
491    }
492
493    /// Create a new vector with metadata
494    pub fn with_metadata(
495        values: Vec<f32>,
496        metadata: std::collections::HashMap<String, String>,
497    ) -> Self {
498        let dimensions = values.len();
499        Self {
500            dimensions,
501            precision: VectorPrecision::F32,
502            values: VectorData::F32(values),
503            metadata: Some(metadata),
504        }
505    }
506
507    /// Create F64 vector
508    pub fn f64(values: Vec<f64>) -> Self {
509        Self::with_precision(VectorData::F64(values))
510    }
511
512    /// Create F16 vector (using u16 representation)
513    pub fn f16(values: Vec<u16>) -> Self {
514        Self::with_precision(VectorData::F16(values))
515    }
516
517    /// Create I8 quantized vector
518    pub fn i8(values: Vec<i8>) -> Self {
519        Self::with_precision(VectorData::I8(values))
520    }
521
522    /// Create binary vector
523    pub fn binary(values: Vec<u8>) -> Self {
524        Self::with_precision(VectorData::Binary(values))
525    }
526
527    /// Get vector values as f32 (converting if necessary)
528    pub fn as_f32(&self) -> Vec<f32> {
529        match &self.values {
530            VectorData::F32(v) => v.clone(),
531            VectorData::F64(v) => v.iter().map(|&x| x as f32).collect(),
532            VectorData::F16(v) => v.iter().map(|&x| Self::f16_to_f32(x)).collect(),
            VectorData::I8(v) => v.iter().map(|&x| x as f32 / 127.0).collect(), // Normalize to [-1, 1] (matches quantize_to_i8's clamp to [-127, 127])
534            VectorData::Binary(v) => {
535                let mut result = Vec::new();
536                for &byte in v {
537                    for bit in 0..8 {
538                        result.push(if (byte >> bit) & 1 == 1 { 1.0 } else { 0.0 });
539                    }
540                }
541                result
542            }
543        }
544    }
545
546    /// Convert f32 to f16 representation (simplified)
547    #[allow(dead_code)]
548    fn f32_to_f16(value: f32) -> u16 {
549        // Simplified f16 conversion - in practice, use proper IEEE 754 half-precision
550        let bits = value.to_bits();
551        let sign = (bits >> 31) & 0x1;
552        let exp = ((bits >> 23) & 0xff) as i32;
553        let mantissa = bits & 0x7fffff;
554
555        // Simplified conversion
556        let f16_exp = if exp == 0 {
557            0
558        } else {
559            (exp - 127 + 15).clamp(0, 31) as u16
560        };
561
562        let f16_mantissa = (mantissa >> 13) as u16;
563        ((sign as u16) << 15) | (f16_exp << 10) | f16_mantissa
564    }
565
566    /// Convert f16 representation to f32 (simplified)
567    fn f16_to_f32(value: u16) -> f32 {
568        // Simplified f16 conversion - in practice, use proper IEEE 754 half-precision
569        let sign = (value >> 15) & 0x1;
570        let exp = ((value >> 10) & 0x1f) as i32;
571        let mantissa = value & 0x3ff;
572
573        if exp == 0 {
574            if mantissa == 0 {
575                if sign == 1 {
576                    -0.0
577                } else {
578                    0.0
579                }
580            } else {
                // Subnormal f16: value = (mantissa / 2^10) * 2^-14 = mantissa * 2^-24
                let magnitude = (mantissa as f32) * (2.0f32).powi(-24);
                if sign == 1 {
                    -magnitude
                } else {
                    magnitude
                }
585            }
586        } else {
587            let f32_exp = exp - 15 + 127;
588            let f32_mantissa = (mantissa as u32) << 13;
589            f32::from_bits(((sign as u32) << 31) | ((f32_exp as u32) << 23) | f32_mantissa)
590        }
591    }
592
593    /// Quantize f32 vector to i8
594    pub fn quantize_to_i8(values: &[f32]) -> Vec<i8> {
595        // Find min/max for normalization
596        let min_val = values.iter().fold(f32::INFINITY, |a, &b| a.min(b));
597        let max_val = values.iter().fold(f32::NEG_INFINITY, |a, &b| a.max(b));
598        let range = max_val - min_val;
599
600        if range == 0.0 {
601            vec![0; values.len()]
602        } else {
603            values
604                .iter()
605                .map(|&x| {
606                    let normalized = (x - min_val) / range; // 0 to 1
607                    let scaled = normalized * 254.0 - 127.0; // -127 to 127
608                    scaled.round().clamp(-127.0, 127.0) as i8
609                })
610                .collect()
611        }
612    }
613
614    /// Convert to binary representation using threshold
615    pub fn to_binary(values: &[f32], threshold: f32) -> Vec<u8> {
616        let mut binary = Vec::new();
617        let mut current_byte = 0u8;
618        let mut bit_position = 0;
619
620        for &value in values {
621            if value > threshold {
622                current_byte |= 1 << bit_position;
623            }
624
625            bit_position += 1;
626            if bit_position == 8 {
627                binary.push(current_byte);
628                current_byte = 0;
629                bit_position = 0;
630            }
631        }
632
633        // Handle remaining bits
634        if bit_position > 0 {
635            binary.push(current_byte);
636        }
637
638        binary
639    }
640
641    /// Calculate cosine similarity with another vector
642    pub fn cosine_similarity(&self, other: &Vector) -> Result<f32> {
643        if self.dimensions != other.dimensions {
644            return Err(anyhow::anyhow!("Vector dimensions must match"));
645        }
646
647        let self_f32 = self.as_f32();
648        let other_f32 = other.as_f32();
649
650        let dot_product: f32 = self_f32.iter().zip(&other_f32).map(|(a, b)| a * b).sum();
651
652        let magnitude_self: f32 = self_f32.iter().map(|x| x * x).sum::<f32>().sqrt();
653        let magnitude_other: f32 = other_f32.iter().map(|x| x * x).sum::<f32>().sqrt();
654
655        if magnitude_self == 0.0 || magnitude_other == 0.0 {
656            return Ok(0.0);
657        }
658
659        Ok(dot_product / (magnitude_self * magnitude_other))
660    }
661
662    /// Calculate Euclidean distance to another vector
663    pub fn euclidean_distance(&self, other: &Vector) -> Result<f32> {
664        if self.dimensions != other.dimensions {
665            return Err(anyhow::anyhow!("Vector dimensions must match"));
666        }
667
668        let self_f32 = self.as_f32();
669        let other_f32 = other.as_f32();
670
671        let distance = self_f32
672            .iter()
673            .zip(&other_f32)
674            .map(|(a, b)| (a - b).powi(2))
675            .sum::<f32>()
676            .sqrt();
677
678        Ok(distance)
679    }
680
681    /// Calculate Manhattan distance (L1 norm) to another vector
682    pub fn manhattan_distance(&self, other: &Vector) -> Result<f32> {
683        if self.dimensions != other.dimensions {
684            return Err(anyhow::anyhow!("Vector dimensions must match"));
685        }
686
687        let self_f32 = self.as_f32();
688        let other_f32 = other.as_f32();
689
690        let distance = self_f32
691            .iter()
692            .zip(&other_f32)
693            .map(|(a, b)| (a - b).abs())
694            .sum();
695
696        Ok(distance)
697    }
698
699    /// Calculate Minkowski distance (general Lp norm) to another vector
700    pub fn minkowski_distance(&self, other: &Vector, p: f32) -> Result<f32> {
701        if self.dimensions != other.dimensions {
702            return Err(anyhow::anyhow!("Vector dimensions must match"));
703        }
704
705        if p <= 0.0 {
706            return Err(anyhow::anyhow!("p must be positive"));
707        }
708
709        let self_f32 = self.as_f32();
710        let other_f32 = other.as_f32();
711
712        if p == f32::INFINITY {
713            // Special case: Chebyshev distance
714            return self.chebyshev_distance(other);
715        }
716
717        let distance = self_f32
718            .iter()
719            .zip(&other_f32)
720            .map(|(a, b)| (a - b).abs().powf(p))
721            .sum::<f32>()
722            .powf(1.0 / p);
723
724        Ok(distance)
725    }
726
727    /// Calculate Chebyshev distance (L∞ norm) to another vector
728    pub fn chebyshev_distance(&self, other: &Vector) -> Result<f32> {
729        if self.dimensions != other.dimensions {
730            return Err(anyhow::anyhow!("Vector dimensions must match"));
731        }
732
733        let self_f32 = self.as_f32();
734        let other_f32 = other.as_f32();
735
736        let distance = self_f32
737            .iter()
738            .zip(&other_f32)
739            .map(|(a, b)| (a - b).abs())
740            .fold(0.0f32, |max, val| max.max(val));
741
742        Ok(distance)
743    }
744
745    /// Get vector magnitude (L2 norm)
746    pub fn magnitude(&self) -> f32 {
747        let values = self.as_f32();
748        values.iter().map(|x| x * x).sum::<f32>().sqrt()
749    }
750
751    /// Normalize vector to unit length
752    pub fn normalize(&mut self) {
753        let mag = self.magnitude();
754        if mag > 0.0 {
755            match &mut self.values {
756                VectorData::F32(values) => {
757                    for value in values {
758                        *value /= mag;
759                    }
760                }
761                VectorData::F64(values) => {
762                    let mag_f64 = mag as f64;
763                    for value in values {
764                        *value /= mag_f64;
765                    }
766                }
767                _ => {
768                    // For other types, convert to f32, normalize, then convert back
769                    let mut f32_values = self.as_f32();
770                    for value in &mut f32_values {
771                        *value /= mag;
772                    }
773                    self.values = VectorData::F32(f32_values);
774                    self.precision = VectorPrecision::F32;
775                }
776            }
777        }
778    }
779
780    /// Get a normalized copy of this vector
781    pub fn normalized(&self) -> Vector {
782        let mut normalized = self.clone();
783        normalized.normalize();
784        normalized
785    }
786
787    /// Add another vector (element-wise)
788    pub fn add(&self, other: &Vector) -> Result<Vector> {
789        if self.dimensions != other.dimensions {
790            return Err(anyhow::anyhow!("Vector dimensions must match"));
791        }
792
793        let self_f32 = self.as_f32();
794        let other_f32 = other.as_f32();
795
796        let result_values: Vec<f32> = self_f32
797            .iter()
798            .zip(&other_f32)
799            .map(|(a, b)| a + b)
800            .collect();
801
802        Ok(Vector::new(result_values))
803    }
804
805    /// Subtract another vector (element-wise)
806    pub fn subtract(&self, other: &Vector) -> Result<Vector> {
807        if self.dimensions != other.dimensions {
808            return Err(anyhow::anyhow!("Vector dimensions must match"));
809        }
810
811        let self_f32 = self.as_f32();
812        let other_f32 = other.as_f32();
813
814        let result_values: Vec<f32> = self_f32
815            .iter()
816            .zip(&other_f32)
817            .map(|(a, b)| a - b)
818            .collect();
819
820        Ok(Vector::new(result_values))
821    }
822
823    /// Scale vector by a scalar
824    pub fn scale(&self, scalar: f32) -> Vector {
825        let values = self.as_f32();
826        let scaled_values: Vec<f32> = values.iter().map(|x| x * scalar).collect();
827
828        Vector::new(scaled_values)
829    }
830
831    /// Get the number of dimensions in the vector
832    pub fn len(&self) -> usize {
833        self.dimensions
834    }
835
836    /// Check if vector is empty (zero dimensions)
837    pub fn is_empty(&self) -> bool {
838        self.dimensions == 0
839    }
840
    /// Get vector values as an owned `Vec<f32>` (converting precision if necessary)
842    pub fn as_slice(&self) -> Vec<f32> {
843        self.as_f32()
844    }
845}
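
#[cfg(test)]
mod vector_quantization_sketch {
    //! Sketch of the quantization helpers and Lp distances defined on `Vector` above:
    //! `quantize_to_i8` min/max-normalizes into [-127, 127], `to_binary` thresholds values
    //! into a packed bit vector, and `minkowski_distance` generalizes the L1/L2 metrics.
    //! Only items from this file are used.
    use super::*;

    #[test]
    fn quantize_and_threshold_shapes() {
        let values = vec![-1.0f32, 0.0, 1.0];

        // Min/max quantization: -1.0 -> -127, 0.0 -> 0, 1.0 -> 127.
        let q = Vector::quantize_to_i8(&values);
        assert_eq!(q, vec![-127, 0, 127]);

        // Thresholding at 0.0 keeps only the strictly positive entry (packed LSB-first).
        let bits = Vector::to_binary(&values, 0.0);
        assert_eq!(bits, vec![0b0000_0100]);
    }

    #[test]
    fn minkowski_generalizes_manhattan_and_euclidean() {
        let a = Vector::new(vec![0.0, 0.0]);
        let b = Vector::new(vec![3.0, 4.0]);
        assert!((a.minkowski_distance(&b, 1.0).unwrap() - a.manhattan_distance(&b).unwrap()).abs() < 1e-5);
        assert!((a.minkowski_distance(&b, 2.0).unwrap() - a.euclidean_distance(&b).unwrap()).abs() < 1e-5);
        assert_eq!(a.chebyshev_distance(&b).unwrap(), 4.0);
    }
}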
846
847/// Vector index trait for efficient similarity search
848pub trait VectorIndex: Send + Sync {
849    /// Insert a vector with associated URI
850    fn insert(&mut self, uri: String, vector: Vector) -> Result<()>;
851
852    /// Find k nearest neighbors
853    fn search_knn(&self, query: &Vector, k: usize) -> Result<Vec<(String, f32)>>;
854
855    /// Find all vectors within threshold similarity
856    fn search_threshold(&self, query: &Vector, threshold: f32) -> Result<Vec<(String, f32)>>;
857
858    /// Get a vector by its URI
859    fn get_vector(&self, uri: &str) -> Option<&Vector>;
860
861    /// Add a vector with associated ID and metadata
862    fn add_vector(
863        &mut self,
864        id: VectorId,
865        vector: Vector,
866        _metadata: Option<HashMap<String, String>>,
867    ) -> Result<()> {
868        // Default implementation that delegates to insert
869        self.insert(id, vector)
870    }
871
872    /// Update an existing vector
873    fn update_vector(&mut self, id: VectorId, vector: Vector) -> Result<()> {
874        // Default implementation that delegates to insert
875        self.insert(id, vector)
876    }
877
878    /// Update metadata for a vector
879    fn update_metadata(&mut self, _id: VectorId, _metadata: HashMap<String, String>) -> Result<()> {
880        // Default implementation (no-op)
881        Ok(())
882    }
883
884    /// Remove a vector by its ID
885    fn remove_vector(&mut self, _id: VectorId) -> Result<()> {
886        // Default implementation (no-op)
887        Ok(())
888    }
889}
890
891/// In-memory vector index implementation
892pub struct MemoryVectorIndex {
893    vectors: Vec<(String, Vector)>,
894    similarity_config: similarity::SimilarityConfig,
895}
896
897impl MemoryVectorIndex {
898    pub fn new() -> Self {
899        Self {
900            vectors: Vec::new(),
901            similarity_config: similarity::SimilarityConfig::default(),
902        }
903    }
904
905    pub fn with_similarity_config(config: similarity::SimilarityConfig) -> Self {
906        Self {
907            vectors: Vec::new(),
908            similarity_config: config,
909        }
910    }
911}
912
913impl Default for MemoryVectorIndex {
914    fn default() -> Self {
915        Self::new()
916    }
917}
918
919impl VectorIndex for MemoryVectorIndex {
920    fn insert(&mut self, uri: String, vector: Vector) -> Result<()> {
921        // Check if vector already exists and update it
922        if let Some(pos) = self.vectors.iter().position(|(id, _)| id == &uri) {
923            self.vectors[pos] = (uri, vector);
924        } else {
925            self.vectors.push((uri, vector));
926        }
927        Ok(())
928    }
929
930    fn search_knn(&self, query: &Vector, k: usize) -> Result<Vec<(String, f32)>> {
931        let metric = self.similarity_config.primary_metric;
932        let query_f32 = query.as_f32();
933        let mut similarities: Vec<(String, f32)> = self
934            .vectors
935            .iter()
936            .map(|(uri, vec)| {
937                let vec_f32 = vec.as_f32();
938                let sim = metric.similarity(&query_f32, &vec_f32).unwrap_or(0.0);
939                (uri.clone(), sim)
940            })
941            .collect();
942
        similarities.sort_by(|a, b| b.1.partial_cmp(&a.1).unwrap_or(std::cmp::Ordering::Equal));
944        similarities.truncate(k);
945
946        Ok(similarities)
947    }
948
949    fn search_threshold(&self, query: &Vector, threshold: f32) -> Result<Vec<(String, f32)>> {
950        let metric = self.similarity_config.primary_metric;
951        let query_f32 = query.as_f32();
952        let similarities: Vec<(String, f32)> = self
953            .vectors
954            .iter()
955            .filter_map(|(uri, vec)| {
956                let vec_f32 = vec.as_f32();
957                let sim = metric.similarity(&query_f32, &vec_f32).unwrap_or(0.0);
958                if sim >= threshold {
959                    Some((uri.clone(), sim))
960                } else {
961                    None
962                }
963            })
964            .collect();
965
966        Ok(similarities)
967    }
968
969    fn get_vector(&self, uri: &str) -> Option<&Vector> {
970        self.vectors.iter().find(|(u, _)| u == uri).map(|(_, v)| v)
971    }
972
973    fn update_vector(&mut self, id: VectorId, vector: Vector) -> Result<()> {
974        if let Some(pos) = self.vectors.iter().position(|(uri, _)| uri == &id) {
975            self.vectors[pos] = (id, vector);
976            Ok(())
977        } else {
978            Err(anyhow::anyhow!("Vector with id '{}' not found", id))
979        }
980    }
981
982    fn remove_vector(&mut self, id: VectorId) -> Result<()> {
983        if let Some(pos) = self.vectors.iter().position(|(uri, _)| uri == &id) {
984            self.vectors.remove(pos);
985            Ok(())
986        } else {
987            Err(anyhow::anyhow!("Vector with id '{}' not found", id))
988        }
989    }
990}
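
#[cfg(test)]
mod memory_index_sketch {
    //! Sketch of the `MemoryVectorIndex` behaviour defined above: `insert` upserts by URI,
    //! `search_knn` returns the top-k matches, and `remove_vector` errors for unknown IDs.
    //! Scores come from the store's default `SimilarityConfig` metric; the `urn:*` URIs are
    //! placeholders.
    use super::*;

    #[test]
    fn insert_search_and_remove() {
        let mut index = MemoryVectorIndex::new();
        index.insert("urn:a".to_string(), Vector::new(vec![1.0, 0.0])).unwrap();
        index.insert("urn:b".to_string(), Vector::new(vec![0.0, 1.0])).unwrap();

        // k-NN returns at most k results, best match first.
        let hits = index.search_knn(&Vector::new(vec![1.0, 0.0]), 1).unwrap();
        assert_eq!(hits.len(), 1);
        assert_eq!(hits[0].0, "urn:a");

        // Re-inserting under the same URI replaces the stored vector.
        index.insert("urn:a".to_string(), Vector::new(vec![0.5, 0.5])).unwrap();
        assert_eq!(index.get_vector("urn:a").unwrap().as_f32(), vec![0.5, 0.5]);

        // Removal errors for unknown IDs and succeeds for known ones.
        assert!(index.remove_vector("urn:missing".to_string()).is_err());
        assert!(index.remove_vector("urn:a".to_string()).is_ok());
    }
}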
991
992/// Enhanced vector store with embedding management and advanced features
993pub struct VectorStore {
994    index: Box<dyn VectorIndex>,
995    embedding_manager: Option<embeddings::EmbeddingManager>,
996    config: VectorStoreConfig,
997}
998
999/// Configuration for vector store
1000#[derive(Debug, Clone)]
1001pub struct VectorStoreConfig {
1002    pub auto_embed: bool,
1003    pub cache_embeddings: bool,
1004    pub similarity_threshold: f32,
1005    pub max_results: usize,
1006}
1007
1008impl Default for VectorStoreConfig {
1009    fn default() -> Self {
1010        Self {
1011            auto_embed: true,
1012            cache_embeddings: true,
1013            similarity_threshold: 0.7,
1014            max_results: 100,
1015        }
1016    }
1017}
1018
1019impl VectorStore {
1020    /// Create a new vector store with default memory index
1021    pub fn new() -> Self {
1022        Self {
1023            index: Box::new(MemoryVectorIndex::new()),
1024            embedding_manager: None,
1025            config: VectorStoreConfig::default(),
1026        }
1027    }
1028
1029    /// Create vector store with specific embedding strategy
1030    pub fn with_embedding_strategy(strategy: embeddings::EmbeddingStrategy) -> Result<Self> {
1031        let embedding_manager = embeddings::EmbeddingManager::new(strategy, 1000)?;
1032
1033        Ok(Self {
1034            index: Box::new(MemoryVectorIndex::new()),
1035            embedding_manager: Some(embedding_manager),
1036            config: VectorStoreConfig::default(),
1037        })
1038    }
1039
1040    /// Create vector store with custom index
1041    pub fn with_index(index: Box<dyn VectorIndex>) -> Self {
1042        Self {
1043            index,
1044            embedding_manager: None,
1045            config: VectorStoreConfig::default(),
1046        }
1047    }
1048
1049    /// Create vector store with custom index and embedding strategy
1050    pub fn with_index_and_embeddings(
1051        index: Box<dyn VectorIndex>,
1052        strategy: embeddings::EmbeddingStrategy,
1053    ) -> Result<Self> {
1054        let embedding_manager = embeddings::EmbeddingManager::new(strategy, 1000)?;
1055
1056        Ok(Self {
1057            index,
1058            embedding_manager: Some(embedding_manager),
1059            config: VectorStoreConfig::default(),
1060        })
1061    }
1062
1063    /// Set vector store configuration
1064    pub fn with_config(mut self, config: VectorStoreConfig) -> Self {
1065        self.config = config;
1066        self
1067    }
1068
1069    /// Index a resource with automatic embedding generation
1070    pub fn index_resource(&mut self, uri: String, content: &str) -> Result<()> {
1071        if let Some(ref mut embedding_manager) = self.embedding_manager {
1072            let embeddable_content = embeddings::EmbeddableContent::Text(content.to_string());
1073            let vector = embedding_manager.get_embedding(&embeddable_content)?;
1074            self.index.insert(uri, vector)
1075        } else {
1076            // Generate a simple hash-based vector as fallback
1077            let vector = self.generate_fallback_vector(content);
1078            self.index.insert(uri, vector)
1079        }
1080    }
1081
1082    /// Index an RDF resource with structured content
1083    pub fn index_rdf_resource(
1084        &mut self,
1085        uri: String,
1086        label: Option<String>,
1087        description: Option<String>,
1088        properties: std::collections::HashMap<String, Vec<String>>,
1089    ) -> Result<()> {
1090        if let Some(ref mut embedding_manager) = self.embedding_manager {
1091            let embeddable_content = embeddings::EmbeddableContent::RdfResource {
1092                uri: uri.clone(),
1093                label,
1094                description,
1095                properties,
1096            };
1097            let vector = embedding_manager.get_embedding(&embeddable_content)?;
1098            self.index.insert(uri, vector)
1099        } else {
1100            Err(anyhow::anyhow!(
1101                "Embedding manager required for RDF resource indexing"
1102            ))
1103        }
1104    }
1105
1106    /// Index a pre-computed vector
1107    pub fn index_vector(&mut self, uri: String, vector: Vector) -> Result<()> {
1108        self.index.insert(uri, vector)
1109    }
1110
1111    /// Search for similar resources using text query
    pub fn similarity_search(&self, query: &str, limit: usize) -> Result<Vec<(String, f32)>> {
        // NOTE: embedding generation currently requires `&mut EmbeddingManager`, but this
        // method only takes `&self`, so text queries fall back to a deterministic hash-based
        // vector even when an embedding manager is configured.
        let query_vector = self.generate_fallback_vector(query);
        self.index.search_knn(&query_vector, limit)
    }
1124
1125    /// Search for similar resources using a vector query
1126    pub fn similarity_search_vector(
1127        &self,
1128        query: &Vector,
1129        limit: usize,
1130    ) -> Result<Vec<(String, f32)>> {
1131        self.index.search_knn(query, limit)
1132    }
1133
1134    /// Find resources within similarity threshold
1135    pub fn threshold_search(&self, query: &str, threshold: f32) -> Result<Vec<(String, f32)>> {
1136        let query_vector = self.generate_fallback_vector(query);
1137        self.index.search_threshold(&query_vector, threshold)
1138    }
1139
1140    /// Advanced search with multiple options
1141    pub fn advanced_search(&self, options: SearchOptions) -> Result<Vec<(String, f32)>> {
1142        let query_vector = match options.query {
1143            SearchQuery::Text(text) => self.generate_fallback_vector(&text),
1144            SearchQuery::Vector(vector) => vector,
1145        };
1146
1147        let results = match options.search_type {
1148            SearchType::KNN(k) => self.index.search_knn(&query_vector, k)?,
1149            SearchType::Threshold(threshold) => {
1150                self.index.search_threshold(&query_vector, threshold)?
1151            }
1152        };
1153
1154        Ok(results)
1155    }
1156
1157    fn generate_fallback_vector(&self, text: &str) -> Vector {
1158        // Simple hash-based vector generation for fallback
1159        use std::collections::hash_map::DefaultHasher;
1160        use std::hash::{Hash, Hasher};
1161
1162        let mut hasher = DefaultHasher::new();
1163        text.hash(&mut hasher);
1164        let hash = hasher.finish();
1165
1166        let mut values = Vec::with_capacity(384); // Standard embedding size
1167        let mut seed = hash;
1168
1169        for _ in 0..384 {
1170            seed = seed.wrapping_mul(1103515245).wrapping_add(12345);
1171            let normalized = (seed as f32) / (u64::MAX as f32);
1172            values.push((normalized - 0.5) * 2.0); // Range: -1.0 to 1.0
1173        }
1174
1175        Vector::new(values)
1176    }
1177
1178    /// Get embedding manager statistics
1179    pub fn embedding_stats(&self) -> Option<(usize, usize)> {
1180        self.embedding_manager.as_ref().map(|em| em.cache_stats())
1181    }
1182
1183    /// Build vocabulary for TF-IDF embeddings
1184    pub fn build_vocabulary(&mut self, documents: &[String]) -> Result<()> {
1185        if let Some(ref mut embedding_manager) = self.embedding_manager {
1186            embedding_manager.build_vocabulary(documents)
1187        } else {
1188            Ok(()) // No-op if no embedding manager
1189        }
1190    }
1191
1192    /// Calculate similarity between two resources by their URIs
1193    pub fn calculate_similarity(&self, uri1: &str, uri2: &str) -> Result<f32> {
1194        // If the URIs are identical, return perfect similarity
1195        if uri1 == uri2 {
1196            return Ok(1.0);
1197        }
1198
1199        // Get the vectors for both URIs
1200        let vector1 = self
1201            .index
1202            .get_vector(uri1)
1203            .ok_or_else(|| anyhow::anyhow!("Vector not found for URI: {}", uri1))?;
1204
1205        let vector2 = self
1206            .index
1207            .get_vector(uri2)
1208            .ok_or_else(|| anyhow::anyhow!("Vector not found for URI: {}", uri2))?;
1209
1210        // Calculate cosine similarity between the vectors
1211        vector1.cosine_similarity(vector2)
1212    }
1213
1214    /// Get a vector by its ID (delegates to VectorIndex)
1215    pub fn get_vector(&self, id: &str) -> Option<&Vector> {
1216        self.index.get_vector(id)
1217    }
1218
1219    /// Index a vector with metadata (stub)
1220    pub fn index_vector_with_metadata(
1221        &mut self,
1222        uri: String,
1223        vector: Vector,
1224        _metadata: HashMap<String, String>,
1225    ) -> Result<()> {
1226        // For now, just delegate to index_vector, ignoring metadata
1227        // Future: Extend VectorIndex trait to support metadata
1228        self.index_vector(uri, vector)
1229    }
1230
1231    /// Index a resource with metadata (stub)
1232    pub fn index_resource_with_metadata(
1233        &mut self,
1234        uri: String,
1235        content: &str,
1236        _metadata: HashMap<String, String>,
1237    ) -> Result<()> {
1238        // For now, just delegate to index_resource, ignoring metadata
1239        // Future: Store and utilize metadata
1240        self.index_resource(uri, content)
1241    }
1242
1243    /// Search with additional parameters (stub)
1244    pub fn similarity_search_with_params(
1245        &self,
1246        query: &str,
1247        limit: usize,
1248        _params: HashMap<String, String>,
1249    ) -> Result<Vec<(String, f32)>> {
1250        // For now, just delegate to similarity_search, ignoring params
1251        // Future: Use params for filtering, threshold, etc.
1252        self.similarity_search(query, limit)
1253    }
1254
1255    /// Vector search with additional parameters (stub)
1256    pub fn vector_search_with_params(
1257        &self,
1258        query: &Vector,
1259        limit: usize,
1260        _params: HashMap<String, String>,
1261    ) -> Result<Vec<(String, f32)>> {
1262        // For now, just delegate to similarity_search_vector, ignoring params
1263        // Future: Use params for filtering, distance metric selection, etc.
1264        self.similarity_search_vector(query, limit)
1265    }
1266
1267    /// Get all vector IDs (stub)
1268    pub fn get_vector_ids(&self) -> Result<Vec<String>> {
1269        // VectorIndex trait doesn't provide this method yet
1270        // Future: Add to VectorIndex trait or track separately
1271        Ok(Vec::new())
1272    }
1273
1274    /// Remove a vector by its URI (stub)
1275    pub fn remove_vector(&mut self, uri: &str) -> Result<()> {
1276        // Delegate to VectorIndex trait's remove_vector method
1277        self.index.remove_vector(uri.to_string())
1278    }
1279
1280    /// Get store statistics (stub)
1281    pub fn get_statistics(&self) -> Result<HashMap<String, String>> {
1282        // Return basic statistics as a map
1283        // Future: Provide comprehensive stats from index
1284        let mut stats = HashMap::new();
1285        stats.insert("type".to_string(), "VectorStore".to_string());
1286
1287        if let Some((cache_size, cache_capacity)) = self.embedding_stats() {
1288            stats.insert("embedding_cache_size".to_string(), cache_size.to_string());
1289            stats.insert(
1290                "embedding_cache_capacity".to_string(),
1291                cache_capacity.to_string(),
1292            );
1293        }
1294
1295        Ok(stats)
1296    }
1297
1298    /// Save store to disk (stub)
1299    pub fn save_to_disk(&self, _path: &str) -> Result<()> {
1300        // Stub implementation - serialization not yet implemented
1301        // Future: Serialize index and configuration to disk
1302        Err(anyhow::anyhow!("save_to_disk not yet implemented"))
1303    }
1304
1305    /// Load store from disk (stub)
1306    pub fn load_from_disk(_path: &str) -> Result<Self> {
1307        // Stub implementation - deserialization not yet implemented
1308        // Future: Deserialize index and configuration from disk
1309        Err(anyhow::anyhow!("load_from_disk not yet implemented"))
1310    }
1311
1312    /// Optimize the underlying index (stub)
1313    pub fn optimize_index(&mut self) -> Result<()> {
1314        // Stub implementation - optimization not yet implemented
1315        // Future: Trigger index compaction, rebalancing, etc.
1316        Ok(())
1317    }
1318}
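
#[cfg(test)]
mod vector_store_sketch {
    //! Sketch of `VectorStore` usage without an embedding manager: pre-computed vectors are
    //! indexed and compared directly, and `with_config` overrides the defaults. Only items
    //! defined in this file are used; the `urn:*` URIs are placeholders.
    use super::*;

    #[test]
    fn index_and_compare_precomputed_vectors() {
        let mut store = VectorStore::new().with_config(VectorStoreConfig {
            auto_embed: false,
            cache_embeddings: false,
            similarity_threshold: 0.5,
            max_results: 10,
        });

        store.index_vector("urn:x".to_string(), Vector::new(vec![1.0, 0.0])).unwrap();
        store.index_vector("urn:y".to_string(), Vector::new(vec![1.0, 0.0])).unwrap();

        // Identical stored vectors have cosine similarity 1.0.
        let sim = store.calculate_similarity("urn:x", "urn:y").unwrap();
        assert!((sim - 1.0).abs() < 1e-6);

        // Vector queries go straight to the underlying index.
        let hits = store.similarity_search_vector(&Vector::new(vec![1.0, 0.0]), 2).unwrap();
        assert_eq!(hits.len(), 2);
    }
}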
1319
1320impl Default for VectorStore {
1321    fn default() -> Self {
1322        Self::new()
1323    }
1324}
1325
1326impl VectorStoreTrait for VectorStore {
1327    fn insert_vector(&mut self, id: VectorId, vector: Vector) -> Result<()> {
1328        self.index.insert(id, vector)
1329    }
1330
1331    fn add_vector(&mut self, vector: Vector) -> Result<VectorId> {
1332        // Generate a unique ID for the vector
1333        let id = format!("vec_{}", uuid::Uuid::new_v4());
1334        self.index.insert(id.clone(), vector)?;
1335        Ok(id)
1336    }
1337
1338    fn get_vector(&self, id: &VectorId) -> Result<Option<Vector>> {
1339        Ok(self.index.get_vector(id).cloned())
1340    }
1341
1342    fn get_all_vector_ids(&self) -> Result<Vec<VectorId>> {
1343        // For now, return empty vec as VectorIndex doesn't provide this method
1344        // This could be enhanced if the underlying index supports it
1345        Ok(Vec::new())
1346    }
1347
1348    fn search_similar(&self, query: &Vector, k: usize) -> Result<Vec<(VectorId, f32)>> {
1349        self.index.search_knn(query, k)
1350    }
1351
    fn remove_vector(&mut self, id: &VectorId) -> Result<bool> {
        // Delegate to the underlying index; report `false` if the ID was not found
        match self.index.remove_vector(id.clone()) {
            Ok(()) => Ok(true),
            Err(_) => Ok(false),
        }
    }
1358
1359    fn len(&self) -> usize {
1360        // VectorIndex trait doesn't have len, so we'll return 0 for now
1361        // This could be enhanced in the future if needed
1362        0
1363    }
1364}
1365
1366/// Search query types
1367#[derive(Debug, Clone)]
1368pub enum SearchQuery {
1369    Text(String),
1370    Vector(Vector),
1371}
1372
1373/// Search operation types
1374#[derive(Debug, Clone)]
1375pub enum SearchType {
1376    KNN(usize),
1377    Threshold(f32),
1378}
1379
1380/// Advanced search options
1381#[derive(Debug, Clone)]
1382pub struct SearchOptions {
1383    pub query: SearchQuery,
1384    pub search_type: SearchType,
1385}
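
#[cfg(test)]
mod advanced_search_sketch {
    //! Sketch of `VectorStore::advanced_search` with the `SearchOptions` types defined above:
    //! a vector query combined with either a k-NN or a threshold search type. The `urn:*`
    //! URIs are placeholders.
    use super::*;

    #[test]
    fn knn_and_threshold_options() {
        let mut store = VectorStore::new();
        store.index_vector("urn:a".to_string(), Vector::new(vec![1.0, 0.0])).unwrap();
        store.index_vector("urn:b".to_string(), Vector::new(vec![0.0, 1.0])).unwrap();

        let knn = store
            .advanced_search(SearchOptions {
                query: SearchQuery::Vector(Vector::new(vec![1.0, 0.0])),
                search_type: SearchType::KNN(1),
            })
            .unwrap();
        assert_eq!(knn.len(), 1);

        // Threshold search keeps every match at or above the given similarity.
        let thresholded = store
            .advanced_search(SearchOptions {
                query: SearchQuery::Vector(Vector::new(vec![1.0, 0.0])),
                search_type: SearchType::Threshold(0.99),
            })
            .unwrap();
        assert!(thresholded.iter().all(|(_, s)| *s >= 0.99));
    }
}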
1386
1387/// Vector operation results with enhanced metadata
1388#[derive(Debug, Clone)]
1389pub struct VectorOperationResult {
1390    pub uri: String,
1391    pub similarity: f32,
1392    pub vector: Option<Vector>,
1393    pub metadata: Option<std::collections::HashMap<String, String>>,
1394    pub rank: usize,
1395}
1396
1397/// Document batch processing utilities
1398pub struct DocumentBatchProcessor;
1399
1400impl DocumentBatchProcessor {
1401    /// Process multiple documents in batch for efficient indexing
1402    pub fn batch_index(
1403        store: &mut VectorStore,
1404        documents: &[(String, String)], // (uri, content) pairs
1405    ) -> Result<Vec<Result<()>>> {
1406        let mut results = Vec::new();
1407
1408        for (uri, content) in documents {
1409            let result = store.index_resource(uri.clone(), content);
1410            results.push(result);
1411        }
1412
1413        Ok(results)
1414    }
1415
1416    /// Process multiple queries in batch
1417    pub fn batch_search(
1418        store: &VectorStore,
1419        queries: &[String],
1420        limit: usize,
1421    ) -> Result<BatchSearchResult> {
1422        let mut results = Vec::new();
1423
1424        for query in queries {
1425            let result = store.similarity_search(query, limit);
1426            results.push(result);
1427        }
1428
1429        Ok(results)
1430    }
1431}
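
#[cfg(test)]
mod batch_processing_sketch {
    //! Sketch of `DocumentBatchProcessor`: one result per (uri, content) pair on indexing and
    //! one result per query on search. With no embedding manager configured the store uses
    //! its hash-based fallback vectors, so this only exercises the batching plumbing. The
    //! `urn:doc*` URIs and query strings are placeholders.
    use super::*;

    #[test]
    fn batch_index_then_batch_search() {
        let mut store = VectorStore::new();
        let docs = vec![
            ("urn:doc1".to_string(), "vector search".to_string()),
            ("urn:doc2".to_string(), "knowledge graphs".to_string()),
        ];

        let index_results = DocumentBatchProcessor::batch_index(&mut store, &docs).unwrap();
        assert!(index_results.iter().all(|r| r.is_ok()));

        let queries = vec!["vector search".to_string()];
        let search_results = DocumentBatchProcessor::batch_search(&store, &queries, 2).unwrap();
        assert_eq!(search_results.len(), 1);
        assert_eq!(search_results[0].as_ref().unwrap().len(), 2);
    }
}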
1432
1433/// Error types specific to vector operations
1434#[derive(Debug, thiserror::Error)]
1435pub enum VectorError {
1436    #[error("Dimension mismatch: expected {expected}, got {actual}")]
1437    DimensionMismatch { expected: usize, actual: usize },
1438
1439    #[error("Empty vector")]
1440    EmptyVector,
1441
1442    #[error("Index not built")]
1443    IndexNotBuilt,
1444
1445    #[error("Embedding generation failed: {message}")]
1446    EmbeddingError { message: String },
1447
1448    #[error("SPARQL service error: {message}")]
1449    SparqlServiceError { message: String },
1450
1451    #[error("Compression error: {0}")]
1452    CompressionError(String),
1453
1454    #[error("Invalid dimensions: {0}")]
1455    InvalidDimensions(String),
1456
1457    #[error("Unsupported operation: {0}")]
1458    UnsupportedOperation(String),
1459
1460    #[error("Invalid data: {0}")]
1461    InvalidData(String),
1462
1463    #[error("IO error: {0}")]
1464    IoError(#[from] std::io::Error),
1465}
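
#[cfg(test)]
mod vector_error_sketch {
    //! Sketch showing that `VectorError` variants carry structured context and format through
    //! the `Display` impl derived by `thiserror` above. Only this file's types are used.
    use super::*;

    #[test]
    fn dimension_mismatch_formats_expected_and_actual() {
        let err = VectorError::DimensionMismatch { expected: 384, actual: 128 };
        assert_eq!(err.to_string(), "Dimension mismatch: expected 384, got 128");
    }
}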
1466
1467/// Utility functions for vector operations
1468pub mod utils {
1469    use super::Vector;
1470
1471    /// Calculate centroid of a set of vectors
    pub fn centroid(vectors: &[Vector]) -> Option<Vector> {
        if vectors.is_empty() {
            return None;
        }

        let dimensions = vectors[0].dimensions;
        let mut sum_values = vec![0.0; dimensions];

        for vector in vectors {
            if vector.dimensions != dimensions {
                return None; // Inconsistent dimensions
            }

            let vector_f32 = vector.as_f32();
            for (i, &value) in vector_f32.iter().enumerate() {
                sum_values[i] += value;
            }
        }

        let count = vectors.len() as f32;
        for value in &mut sum_values {
            *value /= count;
        }

        Some(Vector::new(sum_values))
    }

    /// Generate random vector for testing
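    ///
    /// The generator is a small seeded LCG, so the same seed always reproduces the
    /// same vector (sketch, with the same export assumptions as above):
    ///
    /// ```
    /// use oxirs_vec::utils::random_vector;
    ///
    /// let a = random_vector(8, Some(7));
    /// let b = random_vector(8, Some(7));
    /// assert_eq!(a.as_f32(), b.as_f32());
    /// ```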
    pub fn random_vector(dimensions: usize, seed: Option<u64>) -> Vector {
        use std::collections::hash_map::DefaultHasher;
        use std::hash::{Hash, Hasher};

        let mut hasher = DefaultHasher::new();
        seed.unwrap_or(42).hash(&mut hasher);
        let mut rng_state = hasher.finish();

        let mut values = Vec::with_capacity(dimensions);
        for _ in 0..dimensions {
            rng_state = rng_state.wrapping_mul(1103515245).wrapping_add(12345);
            let normalized = (rng_state as f32) / (u64::MAX as f32);
            values.push((normalized - 0.5) * 2.0); // Range: -1.0 to 1.0
        }

        Vector::new(values)
    }

    /// Convert vector to normalized unit vector
    pub fn normalize_vector(vector: &Vector) -> Vector {
        vector.normalized()
    }
}

#[cfg(test)]
mod tests {
    use super::*;
    use crate::similarity::SimilarityMetric;

    #[test]
    fn test_vector_creation() {
        let values = vec![1.0, 2.0, 3.0];
        let vector = Vector::new(values.clone());

        assert_eq!(vector.dimensions, 3);
        assert_eq!(vector.precision, VectorPrecision::F32);
        assert_eq!(vector.as_f32(), values);
    }

    #[test]
    fn test_multi_precision_vectors() {
        // Test F64 vector
        let f64_values = vec![1.0, 2.0, 3.0];
        let f64_vector = Vector::f64(f64_values.clone());
        assert_eq!(f64_vector.precision, VectorPrecision::F64);
        assert_eq!(f64_vector.dimensions, 3);

        // Test I8 vector
        let i8_values = vec![100, -50, 0];
        let i8_vector = Vector::i8(i8_values);
        assert_eq!(i8_vector.precision, VectorPrecision::I8);
        assert_eq!(i8_vector.dimensions, 3);

        // Test binary vector
        let binary_values = vec![0b10101010, 0b11110000];
        let binary_vector = Vector::binary(binary_values);
        assert_eq!(binary_vector.precision, VectorPrecision::Binary);
        assert_eq!(binary_vector.dimensions, 16); // 2 bytes * 8 bits
    }

    #[test]
    fn test_vector_operations() {
        let v1 = Vector::new(vec![1.0, 2.0, 3.0]);
        let v2 = Vector::new(vec![4.0, 5.0, 6.0]);

        // Test addition
        let sum = v1.add(&v2).unwrap();
        assert_eq!(sum.as_f32(), vec![5.0, 7.0, 9.0]);

        // Test subtraction
        let diff = v2.subtract(&v1).unwrap();
        assert_eq!(diff.as_f32(), vec![3.0, 3.0, 3.0]);

        // Test scaling
        let scaled = v1.scale(2.0);
        assert_eq!(scaled.as_f32(), vec![2.0, 4.0, 6.0]);
    }

    #[test]
    fn test_cosine_similarity() {
        let v1 = Vector::new(vec![1.0, 0.0, 0.0]);
        let v2 = Vector::new(vec![1.0, 0.0, 0.0]);
        let v3 = Vector::new(vec![0.0, 1.0, 0.0]);

        // Identical vectors should have similarity 1.0
        assert!((v1.cosine_similarity(&v2).unwrap() - 1.0).abs() < 0.001);

        // Orthogonal vectors should have similarity 0.0
        assert!((v1.cosine_similarity(&v3).unwrap()).abs() < 0.001);
    }

    #[test]
    fn test_vector_store() {
        let mut store = VectorStore::new();

        // Test indexing
        store
            .index_resource("doc1".to_string(), "This is a test")
            .unwrap();
        store
            .index_resource("doc2".to_string(), "Another test document")
            .unwrap();

        // Test searching
        let results = store.similarity_search("test", 5).unwrap();
        assert_eq!(results.len(), 2);

        // Results should be sorted by similarity (descending)
        assert!(results[0].1 >= results[1].1);
    }

    #[test]
    fn test_similarity_metrics() {
        let a = vec![1.0, 2.0, 3.0];
        let b = vec![4.0, 5.0, 6.0];

        // Test different similarity metrics
        let cosine_sim = SimilarityMetric::Cosine.similarity(&a, &b).unwrap();
        let euclidean_sim = SimilarityMetric::Euclidean.similarity(&a, &b).unwrap();
        let manhattan_sim = SimilarityMetric::Manhattan.similarity(&a, &b).unwrap();

        // All similarities should be between 0 and 1
        assert!((0.0..=1.0).contains(&cosine_sim));
        assert!((0.0..=1.0).contains(&euclidean_sim));
        assert!((0.0..=1.0).contains(&manhattan_sim));
    }

    #[test]
    fn test_quantization() {
        let values = vec![1.0, -0.5, 0.0, 0.75];
        let quantized = Vector::quantize_to_i8(&values);

        // Check that quantized values are in the expected range
        for &q in &quantized {
            assert!((-127..=127).contains(&q));
        }
    }

    #[test]
    fn test_binary_conversion() {
        let values = vec![0.8, -0.3, 0.1, -0.9];
        let binary = Vector::to_binary(&values, 0.0);

        // Should have 1 byte (4 values, each becomes 1 bit, packed into bytes)
        assert_eq!(binary.len(), 1);

        // First bit should be 1 (0.8 > 0.0), second should be 0 (-0.3 < 0.0), etc.
        let byte = binary[0];
        assert_eq!(byte & 1, 1); // bit 0: 0.8 > 0.0
        assert_eq!((byte >> 1) & 1, 0); // bit 1: -0.3 < 0.0
        assert_eq!((byte >> 2) & 1, 1); // bit 2: 0.1 > 0.0
        assert_eq!((byte >> 3) & 1, 0); // bit 3: -0.9 < 0.0
    }

    #[test]
    fn test_memory_vector_index() {
        let mut index = MemoryVectorIndex::new();

        let v1 = Vector::new(vec![1.0, 0.0, 0.0]);
        let v2 = Vector::new(vec![0.0, 1.0, 0.0]);

        index.insert("v1".to_string(), v1.clone()).unwrap();
        index.insert("v2".to_string(), v2.clone()).unwrap();

        // Test KNN search
        let results = index.search_knn(&v1, 1).unwrap();
        assert_eq!(results.len(), 1);
        assert_eq!(results[0].0, "v1");

        // Test threshold search
        let results = index.search_threshold(&v1, 0.5).unwrap();
        assert!(!results.is_empty());
    }

    #[test]
    fn test_hnsw_index() {
        use crate::hnsw::{HnswConfig, HnswIndex};

        let config = HnswConfig::default();
        let mut index = HnswIndex::new(config).unwrap();

        let v1 = Vector::new(vec![1.0, 0.0, 0.0]);
        let v2 = Vector::new(vec![0.0, 1.0, 0.0]);
        let v3 = Vector::new(vec![0.0, 0.0, 1.0]);

        index.insert("v1".to_string(), v1.clone()).unwrap();
        index.insert("v2".to_string(), v2.clone()).unwrap();
        index.insert("v3".to_string(), v3.clone()).unwrap();

        // Test KNN search
        let results = index.search_knn(&v1, 2).unwrap();
        assert!(results.len() <= 2);

        // The first result should be v1 itself (highest similarity)
        if !results.is_empty() {
            assert_eq!(results[0].0, "v1");
        }
    }

    #[test]
    fn test_sparql_vector_service() {
        use crate::embeddings::EmbeddingStrategy;
        use crate::sparql_integration::{
            SparqlVectorService, VectorServiceArg, VectorServiceConfig, VectorServiceResult,
        };

        let config = VectorServiceConfig::default();
        let mut service =
            SparqlVectorService::new(config, EmbeddingStrategy::SentenceTransformer).unwrap();

        // Test vector similarity function
        let v1 = Vector::new(vec![1.0, 0.0, 0.0]);
        let v2 = Vector::new(vec![1.0, 0.0, 0.0]);

        let args = vec![VectorServiceArg::Vector(v1), VectorServiceArg::Vector(v2)];

        let result = service
            .execute_function("vector_similarity", &args)
            .unwrap();

        match result {
            VectorServiceResult::Number(similarity) => {
                assert!((similarity - 1.0).abs() < 0.001); // Identical vectors: similarity ~1.0
            }
            _ => panic!("Expected a number result"),
        }

        // Test text embedding function
        let text_args = vec![VectorServiceArg::String("test text".to_string())];
        let embed_result = service.execute_function("embed_text", &text_args).unwrap();

        match embed_result {
            VectorServiceResult::Vector(vector) => {
                assert_eq!(vector.dimensions, 384); // Default embedding size
            }
            _ => panic!("Expected a vector result"),
        }
    }
}