oxirs_vec/
lib.rs

1//! # OxiRS Vector Search
2//!
//! [![Version](https://img.shields.io/badge/version-0.1.0--alpha.3-orange)](https://github.com/cool-japan/oxirs/releases)
4//! [![docs.rs](https://docs.rs/oxirs-vec/badge.svg)](https://docs.rs/oxirs-vec)
5//!
6//! **Status**: Alpha Release (v0.1.0-alpha.3)
7//! ⚠️ APIs may change. Not recommended for production use.
8//!
9//! Vector index abstractions for semantic similarity and AI-augmented SPARQL querying.
10//!
11//! This crate provides comprehensive vector search capabilities for knowledge graphs,
12//! enabling semantic similarity searches, AI-augmented SPARQL queries, and hybrid
13//! symbolic-vector operations.
14
15#![allow(dead_code)]
16//!
17//! ## Features
18//!
19//! - **Multi-algorithm embeddings**: TF-IDF, sentence transformers, custom models
20//! - **Advanced indexing**: HNSW, flat, quantized, and multi-index support
21//! - **Rich similarity metrics**: Cosine, Euclidean, Pearson, Jaccard, and more
22//! - **SPARQL integration**: `vec:similar` service functions and hybrid queries
23//! - **Performance optimization**: Caching, batching, and parallel processing
24//!
25//! ## Quick Start
26//!
27//! ```rust
28//! use oxirs_vec::{VectorStore, embeddings::EmbeddingStrategy};
29//!
30//! // Create vector store with sentence transformer embeddings
31//! let mut store = VectorStore::with_embedding_strategy(
32//!     EmbeddingStrategy::SentenceTransformer
33//! ).unwrap();
34//!
35//! // Index some content
36//! store
37//!     .index_resource(
38//!         "http://example.org/doc1".to_string(),
39//!         "This is a document about AI",
40//!     )
41//!     .unwrap();
42//! store
43//!     .index_resource(
44//!         "http://example.org/doc2".to_string(),
45//!         "Machine learning tutorial",
46//!     )
47//!     .unwrap();
48//!
49//! // Search for similar content
50//! let results = store
51//!     .similarity_search("artificial intelligence", 5)
52//!     .unwrap();
53//!
54//! println!("Found {} matching resources", results.len());
55//! ```
56
57use anyhow::Result;
58use std::collections::HashMap;
59
60pub mod adaptive_compression;
61pub mod adaptive_intelligent_caching;
62pub mod advanced_analytics;
63pub mod advanced_benchmarking;
64pub mod advanced_caching;
65pub mod advanced_metrics;
66pub mod advanced_result_merging;
67pub mod automl_optimization;
68pub mod benchmarking;
69pub mod cache_friendly_index;
70pub mod clustering;
71pub mod compression;
72#[cfg(feature = "content-processing")]
73pub mod content_processing;
74pub mod cross_language_alignment;
75pub mod cross_modal_embeddings;
76pub mod distributed_vector_search;
77pub mod embedding_pipeline;
78pub mod embeddings;
79pub mod enhanced_performance_monitoring;
80pub mod faiss_compatibility;
81pub mod faiss_gpu_integration;
82pub mod faiss_integration;
83pub mod faiss_migration_tools;
84pub mod faiss_native_integration;
85pub mod federated_search;
86pub mod gnn_embeddings;
87pub mod gpu;
88pub mod graph_aware_search;
89pub mod graph_indices;
90pub mod hierarchical_similarity;
91pub mod hnsw;
92pub mod huggingface;
93pub mod index;
94pub mod ivf;
95pub mod joint_embedding_spaces;
96pub mod kg_embeddings;
97pub mod lsh;
98pub mod mmap_advanced;
99pub mod mmap_index;
100pub mod opq;
101pub mod oxirs_arq_integration;
102pub mod performance_insights;
103pub mod pq;
104pub mod pytorch;
105pub mod quantum_search;
106pub mod random_utils;
107pub mod rdf_content_enhancement;
108pub mod rdf_integration;
109pub mod real_time_analytics;
110pub mod real_time_embedding_pipeline;
111pub mod real_time_updates;
112pub mod result_fusion;
113pub mod similarity;
114pub mod sparql_integration;
115pub mod sparql_service_endpoint;
116pub mod sparse;
117pub mod storage_optimizations;
118pub mod store_integration;
119pub mod structured_vectors;
120pub mod tensorflow;
121pub mod tree_indices;
122pub mod word2vec;
123
124// Python bindings module
125#[cfg(feature = "python")]
126pub mod python_bindings;
127
128// Re-export commonly used types
129pub use adaptive_compression::{
130    AdaptiveCompressor, CompressionMetrics, CompressionPriorities, MultiLevelCompression,
131    VectorStats,
132};
133pub use adaptive_intelligent_caching::{
134    AccessPatternAnalyzer, AdaptiveIntelligentCache, CacheConfiguration, CacheOptimizer,
135    CachePerformanceMetrics, CacheTier, MLModels, PredictivePrefetcher,
136};
137pub use advanced_analytics::{
138    AnomalyDetection, AnomalyDetector, AnomalyType, ImplementationEffort,
139    OptimizationRecommendation, PerformanceTrends, Priority, QualityAspect, QualityRecommendation,
140    QueryAnalytics, QueryAnomaly, RecommendationType, VectorAnalyticsEngine,
141    VectorDistributionAnalysis, VectorQualityAssessment,
142};
143pub use advanced_benchmarking::{
144    AdvancedBenchmarkConfig, AdvancedBenchmarkResult, AdvancedBenchmarkSuite, AlgorithmParameters,
145    BenchmarkAlgorithm, BuildTimeMetrics, CacheMetrics, DatasetQualityMetrics, DatasetStatistics,
146    DistanceStatistics, EnhancedBenchmarkDataset, HyperparameterTuner, IndexSizeMetrics,
147    LatencyMetrics, MemoryMetrics, ObjectiveFunction, OptimizationStrategy,
148    ParallelBenchmarkConfig, ParameterSpace, ParameterType, ParameterValue, PerformanceMetrics,
149    PerformanceProfiler, QualityDegradation, QualityMetrics, ScalabilityMetrics,
150    StatisticalAnalyzer, StatisticalMetrics, ThroughputMetrics,
151};
152pub use advanced_caching::{
153    BackgroundCacheWorker, CacheAnalysisReport, CacheAnalyzer, CacheConfig, CacheEntry,
154    CacheInvalidator, CacheKey, CacheStats, CacheWarmer, EvictionPolicy, InvalidationStats,
155    MultiLevelCache, MultiLevelCacheStats,
156};
157pub use advanced_result_merging::{
158    AdvancedResultMerger, ConfidenceInterval, DiversityConfig, DiversityMetric, FusionStatistics,
159    MergedResult, RankFusionAlgorithm, RankingFactor, ResultExplanation, ResultMergingConfig,
160    ResultMetadata, ScoreCombinationStrategy, ScoreNormalizationMethod, ScoredResult,
161    SourceContribution, SourceResult, SourceType,
162};
163pub use automl_optimization::{
164    AutoMLConfig, AutoMLOptimizer, AutoMLResults, AutoMLStatistics, IndexConfiguration,
165    IndexParameterSpace, OptimizationMetric, OptimizationTrial, ResourceConstraints, SearchSpace,
166    TrialResult,
167};
168pub use benchmarking::{
169    BenchmarkConfig, BenchmarkDataset, BenchmarkOutputFormat, BenchmarkResult, BenchmarkRunner,
170    BenchmarkSuite, BenchmarkTestCase, MemoryMetrics as BenchmarkMemoryMetrics,
171    PerformanceMetrics as BenchmarkPerformanceMetrics, QualityMetrics as BenchmarkQualityMetrics,
172    ScalabilityMetrics as BenchmarkScalabilityMetrics, SystemInfo,
173};
174pub use cache_friendly_index::{CacheFriendlyVectorIndex, IndexConfig as CacheFriendlyIndexConfig};
175pub use compression::{create_compressor, CompressionMethod, VectorCompressor};
176#[cfg(feature = "content-processing")]
177pub use content_processing::{
178    ChunkType, ChunkingStrategy, ContentChunk, ContentExtractionConfig, ContentLocation,
179    ContentProcessor, DocumentFormat, DocumentStructure, ExtractedContent, ExtractedImage,
180    ExtractedLink, ExtractedTable, FormatHandler, Heading, ProcessingStats, TocEntry,
181};
182pub use cross_modal_embeddings::{
183    AttentionMechanism, AudioData, AudioEncoder, CrossModalConfig, CrossModalEncoder, FusionLayer,
184    FusionStrategy, GraphData, GraphEncoder, ImageData, ImageEncoder, Modality, ModalityData,
185    MultiModalContent, TextEncoder, VideoData, VideoEncoder,
186};
187pub use distributed_vector_search::{
188    ConsistencyLevel, DistributedClusterStats, DistributedNodeConfig, DistributedQuery,
189    DistributedSearchResponse, DistributedVectorSearch, LoadBalancingAlgorithm, NodeHealthStatus,
190    PartitioningStrategy, QueryExecutionStrategy,
191};
192pub use embedding_pipeline::{
193    DimensionalityReduction, EmbeddingPipeline, NormalizationConfig, PostprocessingPipeline,
194    PreprocessingPipeline, TokenizerConfig, VectorNormalization,
195};
196pub use embeddings::{
197    EmbeddableContent, EmbeddingConfig, EmbeddingManager, EmbeddingStrategy, ModelDetails,
198    OpenAIConfig, OpenAIEmbeddingGenerator, SentenceTransformerGenerator, TransformerModelType,
199};
200pub use enhanced_performance_monitoring::{
201    Alert, AlertManager, AlertSeverity, AlertThresholds, AlertType, AnalyticsEngine,
202    AnalyticsReport, DashboardData, EnhancedPerformanceMonitor, ExportConfig, ExportDestination,
203    ExportFormat, LatencyDistribution, MonitoringConfig as EnhancedMonitoringConfig,
204    QualityMetrics as EnhancedQualityMetrics, QualityMetricsCollector, QualityStatistics,
205    QueryInfo, QueryMetricsCollector, QueryStatistics, QueryType, Recommendation,
206    RecommendationCategory, RecommendationPriority, SystemMetrics, SystemMetricsCollector,
207    SystemStatistics, TrendData, TrendDirection,
208};
209pub use faiss_compatibility::{
210    CompressionLevel, ConversionMetrics, ConversionResult, FaissCompatibility, FaissExportConfig,
211    FaissImportConfig, FaissIndexMetadata, FaissIndexType, FaissMetricType, FaissParameter,
212    SimpleVectorIndex,
213};
214pub use federated_search::{
215    AuthenticationConfig, FederatedSearchConfig, FederatedVectorSearch, FederationEndpoint,
216    PrivacyEngine, PrivacyMode, SchemaCompatibility, TrustManager,
217};
218pub use gnn_embeddings::{AggregatorType, GraphSAGE, GCN};
219pub use gpu::{
220    create_default_accelerator, create_memory_optimized_accelerator,
221    create_performance_accelerator, is_gpu_available, GpuAccelerator, GpuBuffer, GpuConfig,
222    GpuDevice, GpuExecutionConfig,
223};
224pub use graph_indices::{
225    DelaunayGraph, GraphIndex, GraphIndexConfig, GraphType, NSWGraph, ONNGGraph, PANNGGraph,
226    RNGGraph,
227};
228pub use hierarchical_similarity::{
229    ConceptHierarchy, HierarchicalSimilarity, HierarchicalSimilarityConfig,
230    HierarchicalSimilarityResult, HierarchicalSimilarityStats, SimilarityContext,
231    SimilarityExplanation, SimilarityTaskType,
232};
233pub use hnsw::{HnswConfig, HnswIndex};
234pub use index::{AdvancedVectorIndex, DistanceMetric, IndexConfig, IndexType, SearchResult};
235pub use ivf::{IvfConfig, IvfIndex, IvfStats, QuantizationStrategy};
236pub use joint_embedding_spaces::{
237    ActivationFunction, AlignmentPair, CLIPAligner, ContrastiveOptimizer, CrossModalAttention,
238    CurriculumLearning, DataAugmentation, DifficultySchedule, DomainAdapter, DomainStatistics,
239    JointEmbeddingConfig, JointEmbeddingSpace, LearningRateSchedule, LinearProjector,
240    PacingFunction, ScheduleType, TemperatureScheduler, TrainingStatistics,
241};
242pub use kg_embeddings::{
243    ComplEx, KGEmbedding, KGEmbeddingConfig, KGEmbeddingModel as KGModel, KGEmbeddingModelType,
244    RotatE, TransE, Triple,
245};
246pub use lsh::{LshConfig, LshFamily, LshIndex, LshStats};
247pub use mmap_index::{MemoryMappedIndexStats, MemoryMappedVectorIndex};
248pub use performance_insights::{
249    AlertingSystem, OptimizationRecommendations, PerformanceInsightsAnalyzer,
250    PerformanceTrends as InsightsPerformanceTrends, QueryComplexity,
251    QueryStatistics as InsightsQueryStatistics, ReportFormat, VectorStatistics,
252};
253pub use pq::{PQConfig, PQIndex, PQStats};
254pub use pytorch::{
255    ArchitectureType, CompileMode, DeviceManager, PyTorchConfig, PyTorchDevice, PyTorchEmbedder,
256    PyTorchModelManager, PyTorchModelMetadata, PyTorchTokenizer,
257};
258pub use quantum_search::{
259    QuantumSearchConfig, QuantumSearchResult, QuantumSearchStatistics, QuantumState,
260    QuantumVectorSearch,
261};
262pub use rdf_content_enhancement::{
263    ComponentWeights, MultiLanguageProcessor, PathConstraint, PathDirection, PropertyAggregator,
264    PropertyPath, RdfContentConfig, RdfContentProcessor, RdfContext, RdfEntity, RdfValue,
265    TemporalInfo,
266};
267pub use rdf_integration::{
268    RdfIntegrationStats, RdfTermMapping, RdfTermMetadata, RdfTermType, RdfVectorConfig,
269    RdfVectorIntegration, RdfVectorSearchResult, SearchMetadata,
270};
271pub use real_time_analytics::{
272    AlertSeverity as AnalyticsAlertSeverity, AlertType as AnalyticsAlertType, AnalyticsConfig,
273    AnalyticsEvent, AnalyticsReport as RealTimeAnalyticsReport,
274    DashboardData as RealTimeDashboardData, ExportFormat as AnalyticsExportFormat,
275    MetricsCollector, PerformanceMonitor, QueryMetrics, SystemMetrics as AnalyticsSystemMetrics,
276    VectorAnalyticsEngine as RealTimeVectorAnalyticsEngine,
277};
278pub use real_time_embedding_pipeline::{
279    AlertThresholds as PipelineAlertThresholds, AutoScalingConfig, CompressionConfig, ContentItem,
280    MonitoringConfig as PipelineMonitoringConfig, PipelineConfig as RealTimeEmbeddingConfig,
281    PipelineStatistics as PipelineStats, ProcessingPriority, ProcessingResult, ProcessingStatus,
282    RealTimeEmbeddingPipeline, VersioningStrategy,
283};
284pub use real_time_updates::{
285    BatchProcessor, RealTimeConfig, RealTimeVectorSearch, RealTimeVectorUpdater, UpdateBatch,
286    UpdateOperation, UpdatePriority, UpdateStats,
287};
288pub use result_fusion::{
289    FusedResults, FusionAlgorithm, FusionConfig, FusionQualityMetrics, FusionStats,
290    ResultFusionEngine, ScoreNormalizationStrategy, SourceResults, VectorSearchResult,
291};
292pub use similarity::{AdaptiveSimilarity, SemanticSimilarity, SimilarityConfig, SimilarityMetric};
293pub use sparql_integration::{
294    CrossLanguageProcessor, FederatedQueryResult, QueryExecutor, SparqlVectorFunctions,
295    SparqlVectorService, VectorOperation, VectorQuery, VectorQueryResult, VectorServiceArg,
296    VectorServiceConfig, VectorServiceResult,
297};
298pub use sparql_service_endpoint::{
299    AuthenticationInfo, AuthenticationType, CustomFunctionRegistry, FederatedOperation,
300    FederatedSearchResult, FederatedServiceEndpoint, FederatedVectorQuery, FunctionMetadata,
301    LoadBalancer, ParameterInfo, ParameterType as ServiceParameterType, PartialSearchResult,
302    QueryScope, ReturnType, ServiceCapability, ServiceEndpointManager, ServiceType,
303};
304pub use sparse::{COOMatrix, CSRMatrix, SparseVector};
305pub use storage_optimizations::{
306    CompressionType, MmapVectorFile, StorageConfig, StorageUtils, VectorBlock, VectorFileHeader,
307    VectorReader, VectorWriter,
308};
309pub use structured_vectors::{
310    ConfidenceScoredVector, HierarchicalVector, NamedDimensionVector, TemporalVector,
311    WeightedDimensionVector,
312};
313pub use tensorflow::{
314    OptimizationLevel, PreprocessingPipeline as TensorFlowPreprocessingPipeline, ServerConfig,
315    SessionConfig, TensorDataType, TensorFlowConfig, TensorFlowDevice, TensorFlowEmbedder,
316    TensorFlowModelInfo, TensorFlowModelServer, TensorSpec,
317};
318pub use tree_indices::{
319    BallTree, CoverTree, KdTree, RandomProjectionTree, TreeIndex, TreeIndexConfig, TreeType, VpTree,
320};
321pub use word2vec::{
322    AggregationMethod, OovStrategy, Word2VecConfig, Word2VecEmbeddingGenerator, Word2VecFormat,
323};
324
/// Vector identifier type
pub type VectorId = String;

/// Batch search result type
///
/// One entry per query in a batch; each entry is either that query's ranked
/// `(id, score)` pairs or that query's individual error.
pub type BatchSearchResult = Vec<Result<Vec<(String, f32)>>>;
330
/// Trait for vector store implementations
///
/// Implementations must be thread-safe (`Send + Sync`) so a store can be
/// shared across search workers. Whether `insert_vector` replaces an existing
/// entry with the same ID is implementation-defined from this interface alone.
pub trait VectorStoreTrait: Send + Sync {
    /// Insert a vector under the caller-supplied `id`.
    fn insert_vector(&mut self, id: VectorId, vector: Vector) -> Result<()>;

    /// Add a vector and return the ID assigned to it by the store.
    fn add_vector(&mut self, vector: Vector) -> Result<VectorId>;

    /// Look up a vector by ID; `Ok(None)` means the ID is unknown.
    fn get_vector(&self, id: &VectorId) -> Result<Option<Vector>>;

    /// Return the IDs of all stored vectors.
    fn get_all_vector_ids(&self) -> Result<Vec<VectorId>>;

    /// Return up to `k` `(id, score)` pairs most similar to `query`.
    fn search_similar(&self, query: &Vector, k: usize) -> Result<Vec<(VectorId, f32)>>;

    /// Remove a vector by ID.
    /// NOTE(review): the `bool` presumably reports whether an entry was
    /// actually removed — confirm against implementations.
    fn remove_vector(&mut self, id: &VectorId) -> Result<bool>;

    /// Get the number of vectors stored
    fn len(&self) -> usize;

    /// Check if the store is empty
    fn is_empty(&self) -> bool {
        self.len() == 0
    }
}
359
/// Precision types for vectors
#[derive(Debug, Clone, Copy, PartialEq, serde::Serialize, serde::Deserialize)]
pub enum VectorPrecision {
    /// 32-bit IEEE 754 floating point (the default precision).
    F32,
    /// 64-bit IEEE 754 floating point.
    F64,
    /// 16-bit half precision, stored as raw `u16` bit patterns.
    F16,
    /// 8-bit signed-integer quantization.
    I8,
    /// One bit per dimension, packed 8 dimensions per byte.
    Binary,
}
369
/// Multi-precision vector with enhanced functionality
#[derive(Debug, Clone, PartialEq, serde::Serialize, serde::Deserialize)]
pub struct Vector {
    /// Number of logical dimensions (for `Binary` data this is bits, i.e. bytes * 8).
    pub dimensions: usize,
    /// Storage precision; kept in sync with the active `values` variant.
    pub precision: VectorPrecision,
    /// The vector payload, stored in the precision named above.
    pub values: VectorData,
    /// Optional free-form key/value metadata attached to this vector.
    pub metadata: Option<std::collections::HashMap<String, String>>,
}
378
/// Vector data storage supporting multiple precisions
#[derive(Debug, Clone, PartialEq, serde::Serialize, serde::Deserialize)]
pub enum VectorData {
    /// Native single-precision values.
    F32(Vec<f32>),
    /// Double-precision values.
    F64(Vec<f64>),
    /// Half-precision values.
    F16(Vec<u16>), // Using u16 to represent f16 bits
    /// Signed 8-bit quantized values.
    I8(Vec<i8>),
    /// Bit-packed values, 8 dimensions per byte (least-significant bit first).
    Binary(Vec<u8>), // Packed binary representation
}
388
389impl Vector {
390    /// Create a new F32 vector from values
391    pub fn new(values: Vec<f32>) -> Self {
392        let dimensions = values.len();
393        Self {
394            dimensions,
395            precision: VectorPrecision::F32,
396            values: VectorData::F32(values),
397            metadata: None,
398        }
399    }
400
401    /// Create a new vector with specific precision
402    pub fn with_precision(values: VectorData) -> Self {
403        let (dimensions, precision) = match &values {
404            VectorData::F32(v) => (v.len(), VectorPrecision::F32),
405            VectorData::F64(v) => (v.len(), VectorPrecision::F64),
406            VectorData::F16(v) => (v.len(), VectorPrecision::F16),
407            VectorData::I8(v) => (v.len(), VectorPrecision::I8),
408            VectorData::Binary(v) => (v.len() * 8, VectorPrecision::Binary), // 8 bits per byte
409        };
410
411        Self {
412            dimensions,
413            precision,
414            values,
415            metadata: None,
416        }
417    }
418
419    /// Create a new vector with metadata
420    pub fn with_metadata(
421        values: Vec<f32>,
422        metadata: std::collections::HashMap<String, String>,
423    ) -> Self {
424        let dimensions = values.len();
425        Self {
426            dimensions,
427            precision: VectorPrecision::F32,
428            values: VectorData::F32(values),
429            metadata: Some(metadata),
430        }
431    }
432
    /// Create F64 vector
    pub fn f64(values: Vec<f64>) -> Self {
        Self::with_precision(VectorData::F64(values))
    }

    /// Create F16 vector (using u16 representation)
    ///
    /// Each `u16` is interpreted as raw IEEE 754 half-precision bits.
    pub fn f16(values: Vec<u16>) -> Self {
        Self::with_precision(VectorData::F16(values))
    }

    /// Create I8 quantized vector
    pub fn i8(values: Vec<i8>) -> Self {
        Self::with_precision(VectorData::I8(values))
    }

    /// Create binary vector
    ///
    /// Bytes are bit-packed: the resulting vector has `values.len() * 8` dimensions.
    pub fn binary(values: Vec<u8>) -> Self {
        Self::with_precision(VectorData::Binary(values))
    }
452
453    /// Get vector values as f32 (converting if necessary)
454    pub fn as_f32(&self) -> Vec<f32> {
455        match &self.values {
456            VectorData::F32(v) => v.clone(),
457            VectorData::F64(v) => v.iter().map(|&x| x as f32).collect(),
458            VectorData::F16(v) => v.iter().map(|&x| Self::f16_to_f32(x)).collect(),
459            VectorData::I8(v) => v.iter().map(|&x| x as f32 / 128.0).collect(), // Normalize to [-1, 1]
460            VectorData::Binary(v) => {
461                let mut result = Vec::new();
462                for &byte in v {
463                    for bit in 0..8 {
464                        result.push(if (byte >> bit) & 1 == 1 { 1.0 } else { 0.0 });
465                    }
466                }
467                result
468            }
469        }
470    }
471
472    /// Convert f32 to f16 representation (simplified)
473    #[allow(dead_code)]
474    fn f32_to_f16(value: f32) -> u16 {
475        // Simplified f16 conversion - in practice, use proper IEEE 754 half-precision
476        let bits = value.to_bits();
477        let sign = (bits >> 31) & 0x1;
478        let exp = ((bits >> 23) & 0xff) as i32;
479        let mantissa = bits & 0x7fffff;
480
481        // Simplified conversion
482        let f16_exp = if exp == 0 {
483            0
484        } else {
485            (exp - 127 + 15).clamp(0, 31) as u16
486        };
487
488        let f16_mantissa = (mantissa >> 13) as u16;
489        ((sign as u16) << 15) | (f16_exp << 10) | f16_mantissa
490    }
491
492    /// Convert f16 representation to f32 (simplified)
493    fn f16_to_f32(value: u16) -> f32 {
494        // Simplified f16 conversion - in practice, use proper IEEE 754 half-precision
495        let sign = (value >> 15) & 0x1;
496        let exp = ((value >> 10) & 0x1f) as i32;
497        let mantissa = value & 0x3ff;
498
499        if exp == 0 {
500            if mantissa == 0 {
501                if sign == 1 {
502                    -0.0
503                } else {
504                    0.0
505                }
506            } else {
507                // Denormalized number
508                let f32_exp = -14 - 127;
509                let f32_mantissa = (mantissa as u32) << 13;
510                f32::from_bits(((sign as u32) << 31) | ((f32_exp as u32) << 23) | f32_mantissa)
511            }
512        } else {
513            let f32_exp = exp - 15 + 127;
514            let f32_mantissa = (mantissa as u32) << 13;
515            f32::from_bits(((sign as u32) << 31) | ((f32_exp as u32) << 23) | f32_mantissa)
516        }
517    }
518
519    /// Quantize f32 vector to i8
520    pub fn quantize_to_i8(values: &[f32]) -> Vec<i8> {
521        // Find min/max for normalization
522        let min_val = values.iter().fold(f32::INFINITY, |a, &b| a.min(b));
523        let max_val = values.iter().fold(f32::NEG_INFINITY, |a, &b| a.max(b));
524        let range = max_val - min_val;
525
526        if range == 0.0 {
527            vec![0; values.len()]
528        } else {
529            values
530                .iter()
531                .map(|&x| {
532                    let normalized = (x - min_val) / range; // 0 to 1
533                    let scaled = normalized * 254.0 - 127.0; // -127 to 127
534                    scaled.round().clamp(-127.0, 127.0) as i8
535                })
536                .collect()
537        }
538    }
539
540    /// Convert to binary representation using threshold
541    pub fn to_binary(values: &[f32], threshold: f32) -> Vec<u8> {
542        let mut binary = Vec::new();
543        let mut current_byte = 0u8;
544        let mut bit_position = 0;
545
546        for &value in values {
547            if value > threshold {
548                current_byte |= 1 << bit_position;
549            }
550
551            bit_position += 1;
552            if bit_position == 8 {
553                binary.push(current_byte);
554                current_byte = 0;
555                bit_position = 0;
556            }
557        }
558
559        // Handle remaining bits
560        if bit_position > 0 {
561            binary.push(current_byte);
562        }
563
564        binary
565    }
566
567    /// Calculate cosine similarity with another vector
568    pub fn cosine_similarity(&self, other: &Vector) -> Result<f32> {
569        if self.dimensions != other.dimensions {
570            return Err(anyhow::anyhow!("Vector dimensions must match"));
571        }
572
573        let self_f32 = self.as_f32();
574        let other_f32 = other.as_f32();
575
576        let dot_product: f32 = self_f32.iter().zip(&other_f32).map(|(a, b)| a * b).sum();
577
578        let magnitude_self: f32 = self_f32.iter().map(|x| x * x).sum::<f32>().sqrt();
579        let magnitude_other: f32 = other_f32.iter().map(|x| x * x).sum::<f32>().sqrt();
580
581        if magnitude_self == 0.0 || magnitude_other == 0.0 {
582            return Ok(0.0);
583        }
584
585        Ok(dot_product / (magnitude_self * magnitude_other))
586    }
587
588    /// Calculate Euclidean distance to another vector
589    pub fn euclidean_distance(&self, other: &Vector) -> Result<f32> {
590        if self.dimensions != other.dimensions {
591            return Err(anyhow::anyhow!("Vector dimensions must match"));
592        }
593
594        let self_f32 = self.as_f32();
595        let other_f32 = other.as_f32();
596
597        let distance = self_f32
598            .iter()
599            .zip(&other_f32)
600            .map(|(a, b)| (a - b).powi(2))
601            .sum::<f32>()
602            .sqrt();
603
604        Ok(distance)
605    }
606
607    /// Calculate Manhattan distance (L1 norm) to another vector
608    pub fn manhattan_distance(&self, other: &Vector) -> Result<f32> {
609        if self.dimensions != other.dimensions {
610            return Err(anyhow::anyhow!("Vector dimensions must match"));
611        }
612
613        let self_f32 = self.as_f32();
614        let other_f32 = other.as_f32();
615
616        let distance = self_f32
617            .iter()
618            .zip(&other_f32)
619            .map(|(a, b)| (a - b).abs())
620            .sum();
621
622        Ok(distance)
623    }
624
625    /// Calculate Minkowski distance (general Lp norm) to another vector
626    pub fn minkowski_distance(&self, other: &Vector, p: f32) -> Result<f32> {
627        if self.dimensions != other.dimensions {
628            return Err(anyhow::anyhow!("Vector dimensions must match"));
629        }
630
631        if p <= 0.0 {
632            return Err(anyhow::anyhow!("p must be positive"));
633        }
634
635        let self_f32 = self.as_f32();
636        let other_f32 = other.as_f32();
637
638        if p == f32::INFINITY {
639            // Special case: Chebyshev distance
640            return self.chebyshev_distance(other);
641        }
642
643        let distance = self_f32
644            .iter()
645            .zip(&other_f32)
646            .map(|(a, b)| (a - b).abs().powf(p))
647            .sum::<f32>()
648            .powf(1.0 / p);
649
650        Ok(distance)
651    }
652
653    /// Calculate Chebyshev distance (L∞ norm) to another vector
654    pub fn chebyshev_distance(&self, other: &Vector) -> Result<f32> {
655        if self.dimensions != other.dimensions {
656            return Err(anyhow::anyhow!("Vector dimensions must match"));
657        }
658
659        let self_f32 = self.as_f32();
660        let other_f32 = other.as_f32();
661
662        let distance = self_f32
663            .iter()
664            .zip(&other_f32)
665            .map(|(a, b)| (a - b).abs())
666            .fold(0.0f32, |max, val| max.max(val));
667
668        Ok(distance)
669    }
670
671    /// Get vector magnitude (L2 norm)
672    pub fn magnitude(&self) -> f32 {
673        let values = self.as_f32();
674        values.iter().map(|x| x * x).sum::<f32>().sqrt()
675    }
676
    /// Normalize vector to unit length
    ///
    /// In-place: divides every component by the current magnitude. A zero
    /// vector is left untouched. F32/F64 storage is scaled directly; all
    /// other precisions (F16/I8/Binary) are re-materialized as F32, so this
    /// method may change `precision` to `F32` as a side effect.
    pub fn normalize(&mut self) {
        let mag = self.magnitude();
        if mag > 0.0 {
            match &mut self.values {
                VectorData::F32(values) => {
                    for value in values {
                        *value /= mag;
                    }
                }
                VectorData::F64(values) => {
                    let mag_f64 = mag as f64;
                    for value in values {
                        *value /= mag_f64;
                    }
                }
                _ => {
                    // Lossy precisions: expand to f32, normalize that, and keep
                    // the vector as F32 afterwards (no conversion back).
                    let mut f32_values = self.as_f32();
                    for value in &mut f32_values {
                        *value /= mag;
                    }
                    self.values = VectorData::F32(f32_values);
                    self.precision = VectorPrecision::F32;
                }
            }
        }
    }
705
706    /// Get a normalized copy of this vector
707    pub fn normalized(&self) -> Vector {
708        let mut normalized = self.clone();
709        normalized.normalize();
710        normalized
711    }
712
713    /// Add another vector (element-wise)
714    pub fn add(&self, other: &Vector) -> Result<Vector> {
715        if self.dimensions != other.dimensions {
716            return Err(anyhow::anyhow!("Vector dimensions must match"));
717        }
718
719        let self_f32 = self.as_f32();
720        let other_f32 = other.as_f32();
721
722        let result_values: Vec<f32> = self_f32
723            .iter()
724            .zip(&other_f32)
725            .map(|(a, b)| a + b)
726            .collect();
727
728        Ok(Vector::new(result_values))
729    }
730
731    /// Subtract another vector (element-wise)
732    pub fn subtract(&self, other: &Vector) -> Result<Vector> {
733        if self.dimensions != other.dimensions {
734            return Err(anyhow::anyhow!("Vector dimensions must match"));
735        }
736
737        let self_f32 = self.as_f32();
738        let other_f32 = other.as_f32();
739
740        let result_values: Vec<f32> = self_f32
741            .iter()
742            .zip(&other_f32)
743            .map(|(a, b)| a - b)
744            .collect();
745
746        Ok(Vector::new(result_values))
747    }
748
749    /// Scale vector by a scalar
750    pub fn scale(&self, scalar: f32) -> Vector {
751        let values = self.as_f32();
752        let scaled_values: Vec<f32> = values.iter().map(|x| x * scalar).collect();
753
754        Vector::new(scaled_values)
755    }
756
    /// Get the number of dimensions in the vector
    pub fn len(&self) -> usize {
        self.dimensions
    }

    /// Check if vector is empty (zero dimensions)
    pub fn is_empty(&self) -> bool {
        self.dimensions == 0
    }

    /// Get vector as slice of f32 values
    ///
    /// NOTE(review): despite the name this allocates a fresh `Vec<f32>`
    /// (it is an alias for [`Vector::as_f32`]); it does not borrow.
    pub fn as_slice(&self) -> Vec<f32> {
        self.as_f32()
    }
771}
772
/// Vector index trait for efficient similarity search
///
/// Entries are keyed by URI strings. The mutation methods (`add_vector`,
/// `update_vector`, `update_metadata`, `remove_vector`) have default
/// implementations, so simple indexes only need `insert`, the two search
/// methods, and `get_vector`.
pub trait VectorIndex: Send + Sync {
    /// Insert a vector with associated URI
    fn insert(&mut self, uri: String, vector: Vector) -> Result<()>;

    /// Find k nearest neighbors
    ///
    /// Returns `(uri, similarity)` pairs.
    fn search_knn(&self, query: &Vector, k: usize) -> Result<Vec<(String, f32)>>;

    /// Find all vectors within threshold similarity
    fn search_threshold(&self, query: &Vector, threshold: f32) -> Result<Vec<(String, f32)>>;

    /// Get a vector by its URI
    fn get_vector(&self, uri: &str) -> Option<&Vector>;

    /// Add a vector with associated ID and metadata
    ///
    /// Default: forwards to `insert` and DROPS the metadata — override if the
    /// backing index can persist metadata.
    fn add_vector(
        &mut self,
        id: VectorId,
        vector: Vector,
        _metadata: Option<HashMap<String, String>>,
    ) -> Result<()> {
        // Default implementation that delegates to insert
        self.insert(id, vector)
    }

    /// Update an existing vector
    ///
    /// NOTE(review): the default delegates to `insert`; for append-style
    /// indexes (e.g. `MemoryVectorIndex`, whose `insert` pushes) this adds a
    /// duplicate rather than replacing the entry — confirm per implementation.
    fn update_vector(&mut self, id: VectorId, vector: Vector) -> Result<()> {
        // Default implementation that delegates to insert
        self.insert(id, vector)
    }

    /// Update metadata for a vector
    ///
    /// Default is a silent no-op returning `Ok(())`.
    fn update_metadata(&mut self, _id: VectorId, _metadata: HashMap<String, String>) -> Result<()> {
        // Default implementation (no-op)
        Ok(())
    }

    /// Remove a vector by its ID
    ///
    /// Default is a silent no-op returning `Ok(())`.
    fn remove_vector(&mut self, _id: VectorId) -> Result<()> {
        // Default implementation (no-op)
        Ok(())
    }
}
816
/// In-memory vector index implementation
///
/// Stores `(uri, vector)` pairs in a flat list and answers queries by a
/// brute-force scan, scoring with the configured primary similarity metric.
/// Lookups are O(n), so this is best suited to small collections and tests.
pub struct MemoryVectorIndex {
    /// Insertion-ordered `(uri, vector)` pairs; duplicate URIs are kept as-is.
    vectors: Vec<(String, Vector)>,
    /// Determines which similarity metric scoring uses.
    similarity_config: similarity::SimilarityConfig,
}
822
823impl MemoryVectorIndex {
824    pub fn new() -> Self {
825        Self {
826            vectors: Vec::new(),
827            similarity_config: similarity::SimilarityConfig::default(),
828        }
829    }
830
831    pub fn with_similarity_config(config: similarity::SimilarityConfig) -> Self {
832        Self {
833            vectors: Vec::new(),
834            similarity_config: config,
835        }
836    }
837}
838
impl Default for MemoryVectorIndex {
    /// Equivalent to [`MemoryVectorIndex::new`].
    fn default() -> Self {
        Self::new()
    }
}
844
845impl VectorIndex for MemoryVectorIndex {
846    fn insert(&mut self, uri: String, vector: Vector) -> Result<()> {
847        self.vectors.push((uri, vector));
848        Ok(())
849    }
850
851    fn search_knn(&self, query: &Vector, k: usize) -> Result<Vec<(String, f32)>> {
852        let metric = self.similarity_config.primary_metric;
853        let query_f32 = query.as_f32();
854        let mut similarities: Vec<(String, f32)> = self
855            .vectors
856            .iter()
857            .map(|(uri, vec)| {
858                let vec_f32 = vec.as_f32();
859                let sim = metric.similarity(&query_f32, &vec_f32).unwrap_or(0.0);
860                (uri.clone(), sim)
861            })
862            .collect();
863
864        similarities.sort_by(|a, b| b.1.partial_cmp(&a.1).unwrap());
865        similarities.truncate(k);
866
867        Ok(similarities)
868    }
869
870    fn search_threshold(&self, query: &Vector, threshold: f32) -> Result<Vec<(String, f32)>> {
871        let metric = self.similarity_config.primary_metric;
872        let query_f32 = query.as_f32();
873        let similarities: Vec<(String, f32)> = self
874            .vectors
875            .iter()
876            .filter_map(|(uri, vec)| {
877                let vec_f32 = vec.as_f32();
878                let sim = metric.similarity(&query_f32, &vec_f32).unwrap_or(0.0);
879                if sim >= threshold {
880                    Some((uri.clone(), sim))
881                } else {
882                    None
883                }
884            })
885            .collect();
886
887        Ok(similarities)
888    }
889
890    fn get_vector(&self, uri: &str) -> Option<&Vector> {
891        self.vectors.iter().find(|(u, _)| u == uri).map(|(_, v)| v)
892    }
893}
894
/// Enhanced vector store with embedding management and advanced features.
pub struct VectorStore {
    // Underlying index that serves all inserts and searches.
    index: Box<dyn VectorIndex>,
    // Optional embedding generator; when absent, indexing falls back to
    // deterministic hash-based vectors.
    embedding_manager: Option<embeddings::EmbeddingManager>,
    // Store-level tuning knobs (thresholds, limits, caching flags).
    config: VectorStoreConfig,
}
901
/// Configuration for vector store.
#[derive(Debug, Clone)]
pub struct VectorStoreConfig {
    /// Automatically generate embeddings when indexing raw content.
    pub auto_embed: bool,
    /// Cache generated embeddings for reuse across queries.
    pub cache_embeddings: bool,
    /// Minimum similarity used by threshold-based searches.
    pub similarity_threshold: f32,
    /// Upper bound on the number of results a query may return.
    pub max_results: usize,
}
910
911impl Default for VectorStoreConfig {
912    fn default() -> Self {
913        Self {
914            auto_embed: true,
915            cache_embeddings: true,
916            similarity_threshold: 0.7,
917            max_results: 100,
918        }
919    }
920}
921
922impl VectorStore {
923    /// Create a new vector store with default memory index
924    pub fn new() -> Self {
925        Self {
926            index: Box::new(MemoryVectorIndex::new()),
927            embedding_manager: None,
928            config: VectorStoreConfig::default(),
929        }
930    }
931
932    /// Create vector store with specific embedding strategy
933    pub fn with_embedding_strategy(strategy: embeddings::EmbeddingStrategy) -> Result<Self> {
934        let embedding_manager = embeddings::EmbeddingManager::new(strategy, 1000)?;
935
936        Ok(Self {
937            index: Box::new(MemoryVectorIndex::new()),
938            embedding_manager: Some(embedding_manager),
939            config: VectorStoreConfig::default(),
940        })
941    }
942
943    /// Create vector store with custom index
944    pub fn with_index(index: Box<dyn VectorIndex>) -> Self {
945        Self {
946            index,
947            embedding_manager: None,
948            config: VectorStoreConfig::default(),
949        }
950    }
951
952    /// Create vector store with custom index and embedding strategy
953    pub fn with_index_and_embeddings(
954        index: Box<dyn VectorIndex>,
955        strategy: embeddings::EmbeddingStrategy,
956    ) -> Result<Self> {
957        let embedding_manager = embeddings::EmbeddingManager::new(strategy, 1000)?;
958
959        Ok(Self {
960            index,
961            embedding_manager: Some(embedding_manager),
962            config: VectorStoreConfig::default(),
963        })
964    }
965
966    /// Set vector store configuration
967    pub fn with_config(mut self, config: VectorStoreConfig) -> Self {
968        self.config = config;
969        self
970    }
971
972    /// Index a resource with automatic embedding generation
973    pub fn index_resource(&mut self, uri: String, content: &str) -> Result<()> {
974        if let Some(ref mut embedding_manager) = self.embedding_manager {
975            let embeddable_content = embeddings::EmbeddableContent::Text(content.to_string());
976            let vector = embedding_manager.get_embedding(&embeddable_content)?;
977            self.index.insert(uri, vector)
978        } else {
979            // Generate a simple hash-based vector as fallback
980            let vector = self.generate_fallback_vector(content);
981            self.index.insert(uri, vector)
982        }
983    }
984
985    /// Index an RDF resource with structured content
986    pub fn index_rdf_resource(
987        &mut self,
988        uri: String,
989        label: Option<String>,
990        description: Option<String>,
991        properties: std::collections::HashMap<String, Vec<String>>,
992    ) -> Result<()> {
993        if let Some(ref mut embedding_manager) = self.embedding_manager {
994            let embeddable_content = embeddings::EmbeddableContent::RdfResource {
995                uri: uri.clone(),
996                label,
997                description,
998                properties,
999            };
1000            let vector = embedding_manager.get_embedding(&embeddable_content)?;
1001            self.index.insert(uri, vector)
1002        } else {
1003            Err(anyhow::anyhow!(
1004                "Embedding manager required for RDF resource indexing"
1005            ))
1006        }
1007    }
1008
1009    /// Index a pre-computed vector
1010    pub fn index_vector(&mut self, uri: String, vector: Vector) -> Result<()> {
1011        self.index.insert(uri, vector)
1012    }
1013
1014    /// Search for similar resources using text query
1015    pub fn similarity_search(&self, query: &str, limit: usize) -> Result<Vec<(String, f32)>> {
1016        let query_vector = if let Some(ref _embedding_manager) = self.embedding_manager {
1017            let _embeddable_content = embeddings::EmbeddableContent::Text(query.to_string());
1018            // We need a mutable reference, but we only have an immutable one
1019            // For now, generate a fallback vector
1020            self.generate_fallback_vector(query)
1021        } else {
1022            self.generate_fallback_vector(query)
1023        };
1024
1025        self.index.search_knn(&query_vector, limit)
1026    }
1027
1028    /// Search for similar resources using a vector query
1029    pub fn similarity_search_vector(
1030        &self,
1031        query: &Vector,
1032        limit: usize,
1033    ) -> Result<Vec<(String, f32)>> {
1034        self.index.search_knn(query, limit)
1035    }
1036
1037    /// Find resources within similarity threshold
1038    pub fn threshold_search(&self, query: &str, threshold: f32) -> Result<Vec<(String, f32)>> {
1039        let query_vector = self.generate_fallback_vector(query);
1040        self.index.search_threshold(&query_vector, threshold)
1041    }
1042
1043    /// Advanced search with multiple options
1044    pub fn advanced_search(&self, options: SearchOptions) -> Result<Vec<(String, f32)>> {
1045        let query_vector = match options.query {
1046            SearchQuery::Text(text) => self.generate_fallback_vector(&text),
1047            SearchQuery::Vector(vector) => vector,
1048        };
1049
1050        let results = match options.search_type {
1051            SearchType::KNN(k) => self.index.search_knn(&query_vector, k)?,
1052            SearchType::Threshold(threshold) => {
1053                self.index.search_threshold(&query_vector, threshold)?
1054            }
1055        };
1056
1057        Ok(results)
1058    }
1059
1060    fn generate_fallback_vector(&self, text: &str) -> Vector {
1061        // Simple hash-based vector generation for fallback
1062        use std::collections::hash_map::DefaultHasher;
1063        use std::hash::{Hash, Hasher};
1064
1065        let mut hasher = DefaultHasher::new();
1066        text.hash(&mut hasher);
1067        let hash = hasher.finish();
1068
1069        let mut values = Vec::with_capacity(384); // Standard embedding size
1070        let mut seed = hash;
1071
1072        for _ in 0..384 {
1073            seed = seed.wrapping_mul(1103515245).wrapping_add(12345);
1074            let normalized = (seed as f32) / (u64::MAX as f32);
1075            values.push((normalized - 0.5) * 2.0); // Range: -1.0 to 1.0
1076        }
1077
1078        Vector::new(values)
1079    }
1080
1081    /// Get embedding manager statistics
1082    pub fn embedding_stats(&self) -> Option<(usize, usize)> {
1083        self.embedding_manager.as_ref().map(|em| em.cache_stats())
1084    }
1085
1086    /// Build vocabulary for TF-IDF embeddings
1087    pub fn build_vocabulary(&mut self, documents: &[String]) -> Result<()> {
1088        if let Some(ref mut embedding_manager) = self.embedding_manager {
1089            embedding_manager.build_vocabulary(documents)
1090        } else {
1091            Ok(()) // No-op if no embedding manager
1092        }
1093    }
1094
1095    /// Calculate similarity between two resources by their URIs
1096    pub fn calculate_similarity(&self, uri1: &str, uri2: &str) -> Result<f32> {
1097        // If the URIs are identical, return perfect similarity
1098        if uri1 == uri2 {
1099            return Ok(1.0);
1100        }
1101
1102        // Get the vectors for both URIs
1103        let vector1 = self
1104            .index
1105            .get_vector(uri1)
1106            .ok_or_else(|| anyhow::anyhow!("Vector not found for URI: {}", uri1))?;
1107
1108        let vector2 = self
1109            .index
1110            .get_vector(uri2)
1111            .ok_or_else(|| anyhow::anyhow!("Vector not found for URI: {}", uri2))?;
1112
1113        // Calculate cosine similarity between the vectors
1114        vector1.cosine_similarity(vector2)
1115    }
1116
1117    /// Get a vector by its ID (delegates to VectorIndex)
1118    pub fn get_vector(&self, id: &str) -> Option<&Vector> {
1119        self.index.get_vector(id)
1120    }
1121
1122    /// Index a vector with metadata (stub)
1123    pub fn index_vector_with_metadata(
1124        &mut self,
1125        uri: String,
1126        vector: Vector,
1127        _metadata: HashMap<String, String>,
1128    ) -> Result<()> {
1129        // For now, just delegate to index_vector, ignoring metadata
1130        // Future: Extend VectorIndex trait to support metadata
1131        self.index_vector(uri, vector)
1132    }
1133
1134    /// Index a resource with metadata (stub)
1135    pub fn index_resource_with_metadata(
1136        &mut self,
1137        uri: String,
1138        content: &str,
1139        _metadata: HashMap<String, String>,
1140    ) -> Result<()> {
1141        // For now, just delegate to index_resource, ignoring metadata
1142        // Future: Store and utilize metadata
1143        self.index_resource(uri, content)
1144    }
1145
1146    /// Search with additional parameters (stub)
1147    pub fn similarity_search_with_params(
1148        &self,
1149        query: &str,
1150        limit: usize,
1151        _params: HashMap<String, String>,
1152    ) -> Result<Vec<(String, f32)>> {
1153        // For now, just delegate to similarity_search, ignoring params
1154        // Future: Use params for filtering, threshold, etc.
1155        self.similarity_search(query, limit)
1156    }
1157
1158    /// Vector search with additional parameters (stub)
1159    pub fn vector_search_with_params(
1160        &self,
1161        query: &Vector,
1162        limit: usize,
1163        _params: HashMap<String, String>,
1164    ) -> Result<Vec<(String, f32)>> {
1165        // For now, just delegate to similarity_search_vector, ignoring params
1166        // Future: Use params for filtering, distance metric selection, etc.
1167        self.similarity_search_vector(query, limit)
1168    }
1169
1170    /// Get all vector IDs (stub)
1171    pub fn get_vector_ids(&self) -> Result<Vec<String>> {
1172        // VectorIndex trait doesn't provide this method yet
1173        // Future: Add to VectorIndex trait or track separately
1174        Ok(Vec::new())
1175    }
1176
1177    /// Remove a vector by its URI (stub)
1178    pub fn remove_vector(&mut self, uri: &str) -> Result<()> {
1179        // Delegate to VectorIndex trait's remove_vector method
1180        self.index.remove_vector(uri.to_string())
1181    }
1182
1183    /// Get store statistics (stub)
1184    pub fn get_statistics(&self) -> Result<HashMap<String, String>> {
1185        // Return basic statistics as a map
1186        // Future: Provide comprehensive stats from index
1187        let mut stats = HashMap::new();
1188        stats.insert("type".to_string(), "VectorStore".to_string());
1189
1190        if let Some((cache_size, cache_capacity)) = self.embedding_stats() {
1191            stats.insert("embedding_cache_size".to_string(), cache_size.to_string());
1192            stats.insert(
1193                "embedding_cache_capacity".to_string(),
1194                cache_capacity.to_string(),
1195            );
1196        }
1197
1198        Ok(stats)
1199    }
1200
1201    /// Save store to disk (stub)
1202    pub fn save_to_disk(&self, _path: &str) -> Result<()> {
1203        // Stub implementation - serialization not yet implemented
1204        // Future: Serialize index and configuration to disk
1205        Err(anyhow::anyhow!("save_to_disk not yet implemented"))
1206    }
1207
1208    /// Load store from disk (stub)
1209    pub fn load_from_disk(_path: &str) -> Result<Self> {
1210        // Stub implementation - deserialization not yet implemented
1211        // Future: Deserialize index and configuration from disk
1212        Err(anyhow::anyhow!("load_from_disk not yet implemented"))
1213    }
1214
1215    /// Optimize the underlying index (stub)
1216    pub fn optimize_index(&mut self) -> Result<()> {
1217        // Stub implementation - optimization not yet implemented
1218        // Future: Trigger index compaction, rebalancing, etc.
1219        Ok(())
1220    }
1221}
1222
impl Default for VectorStore {
    /// Equivalent to [`VectorStore::new`]: in-memory index, no embedding manager.
    fn default() -> Self {
        Self::new()
    }
}
1228
1229impl VectorStoreTrait for VectorStore {
1230    fn insert_vector(&mut self, id: VectorId, vector: Vector) -> Result<()> {
1231        self.index.insert(id, vector)
1232    }
1233
1234    fn add_vector(&mut self, vector: Vector) -> Result<VectorId> {
1235        // Generate a unique ID for the vector
1236        let id = format!("vec_{}", uuid::Uuid::new_v4());
1237        self.index.insert(id.clone(), vector)?;
1238        Ok(id)
1239    }
1240
1241    fn get_vector(&self, id: &VectorId) -> Result<Option<Vector>> {
1242        Ok(self.index.get_vector(id).cloned())
1243    }
1244
1245    fn get_all_vector_ids(&self) -> Result<Vec<VectorId>> {
1246        // For now, return empty vec as VectorIndex doesn't provide this method
1247        // This could be enhanced if the underlying index supports it
1248        Ok(Vec::new())
1249    }
1250
1251    fn search_similar(&self, query: &Vector, k: usize) -> Result<Vec<(VectorId, f32)>> {
1252        self.index.search_knn(query, k)
1253    }
1254
1255    fn remove_vector(&mut self, id: &VectorId) -> Result<bool> {
1256        // VectorIndex trait doesn't have remove, so we'll return false for now
1257        // This could be enhanced in the future if needed
1258        let _ = id;
1259        Ok(false)
1260    }
1261
1262    fn len(&self) -> usize {
1263        // VectorIndex trait doesn't have len, so we'll return 0 for now
1264        // This could be enhanced in the future if needed
1265        0
1266    }
1267}
1268
/// Search query types.
#[derive(Debug, Clone)]
pub enum SearchQuery {
    /// Free-text query, embedded before searching.
    Text(String),
    /// Pre-computed query vector, used as-is.
    Vector(Vector),
}
1275
/// Search operation types.
#[derive(Debug, Clone)]
pub enum SearchType {
    /// Return the k nearest neighbors (k is the payload).
    KNN(usize),
    /// Return all matches with similarity at or above the payload threshold.
    Threshold(f32),
}
1282
/// Advanced search options combining a query form with a search type,
/// consumed by [`VectorStore::advanced_search`].
#[derive(Debug, Clone)]
pub struct SearchOptions {
    /// What to search for (text or vector).
    pub query: SearchQuery,
    /// How to search (KNN or threshold).
    pub search_type: SearchType,
}
1289
/// Vector operation results with enhanced metadata.
#[derive(Debug, Clone)]
pub struct VectorOperationResult {
    /// URI of the matched resource.
    pub uri: String,
    /// Similarity score of the match.
    pub similarity: f32,
    /// The matched vector, when the caller requested it.
    pub vector: Option<Vector>,
    /// Optional key/value metadata associated with the match.
    pub metadata: Option<std::collections::HashMap<String, String>>,
    /// Position of this result within the result set (0 = best).
    pub rank: usize,
}
1299
/// Document batch processing utilities.
///
/// Stateless namespace struct: all functionality lives in associated functions.
pub struct DocumentBatchProcessor;
1302
1303impl DocumentBatchProcessor {
1304    /// Process multiple documents in batch for efficient indexing
1305    pub fn batch_index(
1306        store: &mut VectorStore,
1307        documents: &[(String, String)], // (uri, content) pairs
1308    ) -> Result<Vec<Result<()>>> {
1309        let mut results = Vec::new();
1310
1311        for (uri, content) in documents {
1312            let result = store.index_resource(uri.clone(), content);
1313            results.push(result);
1314        }
1315
1316        Ok(results)
1317    }
1318
1319    /// Process multiple queries in batch
1320    pub fn batch_search(
1321        store: &VectorStore,
1322        queries: &[String],
1323        limit: usize,
1324    ) -> Result<BatchSearchResult> {
1325        let mut results = Vec::new();
1326
1327        for query in queries {
1328            let result = store.similarity_search(query, limit);
1329            results.push(result);
1330        }
1331
1332        Ok(results)
1333    }
1334}
1335
/// Error types specific to vector operations.
#[derive(Debug, thiserror::Error)]
pub enum VectorError {
    /// Two vectors had different dimension counts where equal counts were required.
    #[error("Dimension mismatch: expected {expected}, got {actual}")]
    DimensionMismatch { expected: usize, actual: usize },

    /// An operation received a vector with no components.
    #[error("Empty vector")]
    EmptyVector,

    /// A search was attempted before the index was built.
    #[error("Index not built")]
    IndexNotBuilt,

    /// Embedding generation failed for the given content.
    #[error("Embedding generation failed: {message}")]
    EmbeddingError { message: String },

    /// A SPARQL vector service call failed.
    #[error("SPARQL service error: {message}")]
    SparqlServiceError { message: String },

    /// Vector (de)compression failed.
    #[error("Compression error: {0}")]
    CompressionError(String),

    /// A dimension specification was invalid.
    #[error("Invalid dimensions: {0}")]
    InvalidDimensions(String),

    /// The requested operation is not supported by this index/store.
    #[error("Unsupported operation: {0}")]
    UnsupportedOperation(String),

    /// Input data failed validation.
    #[error("Invalid data: {0}")]
    InvalidData(String),

    /// Underlying I/O failure (auto-converted from `std::io::Error`).
    #[error("IO error: {0}")]
    IoError(#[from] std::io::Error),
}
1369
1370/// Utility functions for vector operations
1371pub mod utils {
1372    use super::Vector;
1373
1374    /// Calculate centroid of a set of vectors
1375    pub fn centroid(vectors: &[Vector]) -> Option<Vector> {
1376        if vectors.is_empty() {
1377            return None;
1378        }
1379
1380        let dimensions = vectors[0].dimensions;
1381        let mut sum_values = vec![0.0; dimensions];
1382
1383        for vector in vectors {
1384            if vector.dimensions != dimensions {
1385                return None; // Inconsistent dimensions
1386            }
1387
1388            let vector_f32 = vector.as_f32();
1389            for (i, &value) in vector_f32.iter().enumerate() {
1390                sum_values[i] += value;
1391            }
1392        }
1393
1394        let count = vectors.len() as f32;
1395        for value in &mut sum_values {
1396            *value /= count;
1397        }
1398
1399        Some(Vector::new(sum_values))
1400    }
1401
1402    /// Generate random vector for testing
1403    pub fn random_vector(dimensions: usize, seed: Option<u64>) -> Vector {
1404        use std::collections::hash_map::DefaultHasher;
1405        use std::hash::{Hash, Hasher};
1406
1407        let mut hasher = DefaultHasher::new();
1408        seed.unwrap_or(42).hash(&mut hasher);
1409        let mut rng_state = hasher.finish();
1410
1411        let mut values = Vec::with_capacity(dimensions);
1412        for _ in 0..dimensions {
1413            rng_state = rng_state.wrapping_mul(1103515245).wrapping_add(12345);
1414            let normalized = (rng_state as f32) / (u64::MAX as f32);
1415            values.push((normalized - 0.5) * 2.0); // Range: -1.0 to 1.0
1416        }
1417
1418        Vector::new(values)
1419    }
1420
1421    /// Convert vector to normalized unit vector
1422    pub fn normalize_vector(vector: &Vector) -> Vector {
1423        vector.normalized()
1424    }
1425}
1426
#[cfg(test)]
mod tests {
    use super::*;
    use crate::similarity::SimilarityMetric;

    /// A freshly built f32 vector reports its values, precision, and dimensions.
    #[test]
    fn test_vector_creation() {
        let values = vec![1.0, 2.0, 3.0];
        let vector = Vector::new(values.clone());

        assert_eq!(vector.dimensions, 3);
        assert_eq!(vector.precision, VectorPrecision::F32);
        assert_eq!(vector.as_f32(), values);
    }

    /// Constructors for each precision tag the vector correctly; binary
    /// vectors count one dimension per bit.
    #[test]
    fn test_multi_precision_vectors() {
        // Test F64 vector
        let f64_values = vec![1.0, 2.0, 3.0];
        let f64_vector = Vector::f64(f64_values.clone());
        assert_eq!(f64_vector.precision, VectorPrecision::F64);
        assert_eq!(f64_vector.dimensions, 3);

        // Test I8 vector
        let i8_values = vec![100, -50, 0];
        let i8_vector = Vector::i8(i8_values);
        assert_eq!(i8_vector.precision, VectorPrecision::I8);
        assert_eq!(i8_vector.dimensions, 3);

        // Test binary vector
        let binary_values = vec![0b10101010, 0b11110000];
        let binary_vector = Vector::binary(binary_values);
        assert_eq!(binary_vector.precision, VectorPrecision::Binary);
        assert_eq!(binary_vector.dimensions, 16); // 2 bytes * 8 bits
    }

    /// Element-wise add, subtract, and scalar scaling behave as expected.
    #[test]
    fn test_vector_operations() {
        let v1 = Vector::new(vec![1.0, 2.0, 3.0]);
        let v2 = Vector::new(vec![4.0, 5.0, 6.0]);

        // Test addition
        let sum = v1.add(&v2).unwrap();
        assert_eq!(sum.as_f32(), vec![5.0, 7.0, 9.0]);

        // Test subtraction
        let diff = v2.subtract(&v1).unwrap();
        assert_eq!(diff.as_f32(), vec![3.0, 3.0, 3.0]);

        // Test scaling
        let scaled = v1.scale(2.0);
        assert_eq!(scaled.as_f32(), vec![2.0, 4.0, 6.0]);
    }

    /// Cosine similarity: 1.0 for identical vectors, 0.0 for orthogonal ones.
    #[test]
    fn test_cosine_similarity() {
        let v1 = Vector::new(vec![1.0, 0.0, 0.0]);
        let v2 = Vector::new(vec![1.0, 0.0, 0.0]);
        let v3 = Vector::new(vec![0.0, 1.0, 0.0]);

        // Identical vectors should have similarity 1.0
        assert!((v1.cosine_similarity(&v2).unwrap() - 1.0).abs() < 0.001);

        // Orthogonal vectors should have similarity 0.0
        assert!((v1.cosine_similarity(&v3).unwrap()).abs() < 0.001);
    }

    /// End-to-end VectorStore smoke test: index two documents (fallback
    /// embeddings, since no embedding manager), then search.
    #[test]
    fn test_vector_store() {
        let mut store = VectorStore::new();

        // Test indexing
        store
            .index_resource("doc1".to_string(), "This is a test")
            .unwrap();
        store
            .index_resource("doc2".to_string(), "Another test document")
            .unwrap();

        // Test searching
        let results = store.similarity_search("test", 5).unwrap();
        assert_eq!(results.len(), 2);

        // Results should be sorted by similarity (descending)
        assert!(results[0].1 >= results[1].1);
    }

    /// Each similarity metric normalizes its score into the [0, 1] range.
    #[test]
    fn test_similarity_metrics() {
        let a = vec![1.0, 2.0, 3.0];
        let b = vec![4.0, 5.0, 6.0];

        // Test different similarity metrics
        let cosine_sim = SimilarityMetric::Cosine.similarity(&a, &b).unwrap();
        let euclidean_sim = SimilarityMetric::Euclidean.similarity(&a, &b).unwrap();
        let manhattan_sim = SimilarityMetric::Manhattan.similarity(&a, &b).unwrap();

        // All similarities should be between 0 and 1
        assert!((0.0..=1.0).contains(&cosine_sim));
        assert!((0.0..=1.0).contains(&euclidean_sim));
        assert!((0.0..=1.0).contains(&manhattan_sim));
    }

    /// i8 quantization clamps every value into the symmetric [-127, 127] range.
    #[test]
    fn test_quantization() {
        let values = vec![1.0, -0.5, 0.0, 0.75];
        let quantized = Vector::quantize_to_i8(&values);

        // Check that quantized values are in the expected range
        for &q in &quantized {
            assert!((-127..=127).contains(&q));
        }
    }

    /// Binary conversion thresholds each value into a bit, LSB-first within a byte.
    #[test]
    fn test_binary_conversion() {
        let values = vec![0.8, -0.3, 0.1, -0.9];
        let binary = Vector::to_binary(&values, 0.0);

        // Should have 1 byte (4 values, each becomes 1 bit, packed into bytes)
        assert_eq!(binary.len(), 1);

        // First bit should be 1 (0.8 > 0.0), second should be 0 (-0.3 < 0.0), etc.
        let byte = binary[0];
        assert_eq!(byte & 1, 1); // bit 0: 0.8 > 0.0
        assert_eq!((byte >> 1) & 1, 0); // bit 1: -0.3 < 0.0
        assert_eq!((byte >> 2) & 1, 1); // bit 2: 0.1 > 0.0
        assert_eq!((byte >> 3) & 1, 0); // bit 3: -0.9 < 0.0
    }

    /// MemoryVectorIndex: KNN ranks the identical vector first; threshold
    /// search returns at least that match at 0.5.
    #[test]
    fn test_memory_vector_index() {
        let mut index = MemoryVectorIndex::new();

        let v1 = Vector::new(vec![1.0, 0.0, 0.0]);
        let v2 = Vector::new(vec![0.0, 1.0, 0.0]);

        index.insert("v1".to_string(), v1.clone()).unwrap();
        index.insert("v2".to_string(), v2.clone()).unwrap();

        // Test KNN search
        let results = index.search_knn(&v1, 1).unwrap();
        assert_eq!(results.len(), 1);
        assert_eq!(results[0].0, "v1");

        // Test threshold search
        let results = index.search_threshold(&v1, 0.5).unwrap();
        assert!(!results.is_empty());
    }

    /// HNSW index smoke test: approximate KNN should surface the query
    /// vector itself as the top hit when results are returned.
    #[test]
    fn test_hnsw_index() {
        use crate::hnsw::{HnswConfig, HnswIndex};

        let config = HnswConfig::default();
        let mut index = HnswIndex::new(config).unwrap();

        let v1 = Vector::new(vec![1.0, 0.0, 0.0]);
        let v2 = Vector::new(vec![0.0, 1.0, 0.0]);
        let v3 = Vector::new(vec![0.0, 0.0, 1.0]);

        index.insert("v1".to_string(), v1.clone()).unwrap();
        index.insert("v2".to_string(), v2.clone()).unwrap();
        index.insert("v3".to_string(), v3.clone()).unwrap();

        // Test KNN search
        let results = index.search_knn(&v1, 2).unwrap();
        assert!(results.len() <= 2);

        // The first result should be v1 itself (highest similarity)
        if !results.is_empty() {
            assert_eq!(results[0].0, "v1");
        }
    }

    /// SPARQL vector service: `vector_similarity` on identical vectors is ~1.0,
    /// and `embed_text` produces a 384-dimensional embedding.
    #[test]
    fn test_sparql_vector_service() {
        use crate::embeddings::EmbeddingStrategy;
        use crate::sparql_integration::{
            SparqlVectorService, VectorServiceArg, VectorServiceConfig, VectorServiceResult,
        };

        let config = VectorServiceConfig::default();
        let mut service =
            SparqlVectorService::new(config, EmbeddingStrategy::SentenceTransformer).unwrap();

        // Test vector similarity function
        let v1 = Vector::new(vec![1.0, 0.0, 0.0]);
        let v2 = Vector::new(vec![1.0, 0.0, 0.0]);

        let args = vec![VectorServiceArg::Vector(v1), VectorServiceArg::Vector(v2)];

        let result = service
            .execute_function("vector_similarity", &args)
            .unwrap();

        match result {
            VectorServiceResult::Number(similarity) => {
                assert!((similarity - 1.0).abs() < 0.001); // Should be very similar
            }
            _ => panic!("Expected a number result"),
        }

        // Test text embedding function
        let text_args = vec![VectorServiceArg::String("test text".to_string())];
        let embed_result = service.execute_function("embed_text", &text_args).unwrap();

        match embed_result {
            VectorServiceResult::Vector(vector) => {
                assert_eq!(vector.dimensions, 384); // Default embedding size
            }
            _ => panic!("Expected a vector result"),
        }
    }
}