oxirs_vec/lib.rs

//! # OxiRS Vector Search
//!
//! [![Version](https://img.shields.io/badge/version-0.1.0--beta.1-blue)](https://github.com/cool-japan/oxirs/releases)
//! [![docs.rs](https://docs.rs/oxirs-vec/badge.svg)](https://docs.rs/oxirs-vec)
//!
//! **Status**: Beta Release (v0.1.0-beta.1)
//! **Stability**: Public APIs are stable. Production-ready with comprehensive testing.
//!
//! Vector index abstractions for semantic similarity and AI-augmented SPARQL querying.
//!
//! This crate provides comprehensive vector search capabilities for knowledge graphs,
//! enabling semantic similarity searches, AI-augmented SPARQL queries, and hybrid
//! symbolic-vector operations.
//!
//! ## Features
//!
//! - **Multi-algorithm embeddings**: TF-IDF, sentence transformers, custom models
//! - **Advanced indexing**: HNSW, flat, quantized, and multi-index support
//! - **Rich similarity metrics**: Cosine, Euclidean, Pearson, Jaccard, and more
//! - **SPARQL integration**: `vec:similar` service functions and hybrid queries
//! - **Performance optimization**: Caching, batching, and parallel processing
//!
//! ## Quick Start
//!
//! ```rust
//! use oxirs_vec::{VectorStore, embeddings::EmbeddingStrategy};
//!
//! // Create a vector store with sentence-transformer embeddings
//! let mut store = VectorStore::with_embedding_strategy(
//!     EmbeddingStrategy::SentenceTransformer
//! ).unwrap();
//!
//! // Index some content
//! store
//!     .index_resource(
//!         "http://example.org/doc1".to_string(),
//!         "This is a document about AI",
//!     )
//!     .unwrap();
//! store
//!     .index_resource(
//!         "http://example.org/doc2".to_string(),
//!         "Machine learning tutorial",
//!     )
//!     .unwrap();
//!
//! // Search for similar content
//! let results = store
//!     .similarity_search("artificial intelligence", 5)
//!     .unwrap();
//!
//! println!("Found {} matching resources", results.len());
//! ```
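//!
//! ## Working with Vectors
//!
//! A minimal sketch of using the [`Vector`] type directly with the distance
//! helpers defined in this crate (cosine similarity and Euclidean distance):
//!
//! ```rust
//! use oxirs_vec::Vector;
//!
//! let a = Vector::new(vec![1.0, 0.0, 0.0]);
//! let b = Vector::new(vec![0.0, 1.0, 0.0]);
//!
//! // Orthogonal vectors have cosine similarity 0.0
//! assert!(a.cosine_similarity(&b).unwrap().abs() < 1e-6);
//!
//! // Euclidean distance between the two unit vectors is sqrt(2)
//! assert!((a.euclidean_distance(&b).unwrap() - 2.0_f32.sqrt()).abs() < 1e-6);
//! ```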

#![allow(dead_code)]

use anyhow::Result;
use std::collections::HashMap;

pub mod adaptive_compression;
pub mod adaptive_intelligent_caching;
pub mod advanced_analytics;
pub mod advanced_benchmarking;
pub mod advanced_caching;
pub mod advanced_metrics;
pub mod advanced_result_merging;
pub mod automl_optimization;
pub mod benchmarking;
pub mod cache_friendly_index;
pub mod clustering;
pub mod compression;
#[cfg(feature = "content-processing")]
pub mod content_processing;
pub mod cross_language_alignment;
pub mod cross_modal_embeddings;
pub mod distance_metrics;
pub mod distributed_vector_search;
pub mod embedding_pipeline;
pub mod embeddings;
pub mod enhanced_performance_monitoring;
pub mod faiss_compatibility;
pub mod faiss_gpu_integration;
pub mod faiss_integration;
pub mod faiss_migration_tools;
pub mod faiss_native_integration;
pub mod federated_search;
pub mod filtered_search;
pub mod gnn_embeddings;
pub mod gpu;
pub mod graph_aware_search;
pub mod graph_indices;
pub mod hierarchical_similarity;
pub mod hnsw;
pub mod huggingface;
pub mod index;
pub mod ivf;
pub mod joint_embedding_spaces;
pub mod kg_embeddings;
pub mod lsh;
pub mod mmap_advanced;
pub mod mmap_index;
pub mod opq;
pub mod oxirs_arq_integration;
pub mod performance_insights;
pub mod persistence;
pub mod pq;
pub mod pytorch;
pub mod quantum_search;
pub mod random_utils;
pub mod rdf_content_enhancement;
pub mod rdf_integration;
pub mod real_time_analytics;
pub mod real_time_embedding_pipeline;
pub mod real_time_updates;
pub mod result_fusion;
pub mod similarity;
pub mod sparql_integration;
pub mod sparql_service_endpoint;
pub mod sparse;
pub mod storage_optimizations;
pub mod store_integration;
pub mod structured_vectors;
pub mod tensorflow;
pub mod tree_indices;
pub mod validation;
pub mod word2vec;

// Python bindings module
#[cfg(feature = "python")]
pub mod python_bindings;

132// Re-export commonly used types
133pub use adaptive_compression::{
134    AdaptiveCompressor, CompressionMetrics, CompressionPriorities, MultiLevelCompression,
135    VectorStats,
136};
137pub use adaptive_intelligent_caching::{
138    AccessPatternAnalyzer, AdaptiveIntelligentCache, CacheConfiguration, CacheOptimizer,
139    CachePerformanceMetrics, CacheTier, MLModels, PredictivePrefetcher,
140};
141pub use advanced_analytics::{
142    AnomalyDetection, AnomalyDetector, AnomalyType, ImplementationEffort,
143    OptimizationRecommendation, PerformanceTrends, Priority, QualityAspect, QualityRecommendation,
144    QueryAnalytics, QueryAnomaly, RecommendationType, VectorAnalyticsEngine,
145    VectorDistributionAnalysis, VectorQualityAssessment,
146};
147pub use advanced_benchmarking::{
148    AdvancedBenchmarkConfig, AdvancedBenchmarkResult, AdvancedBenchmarkSuite, AlgorithmParameters,
149    BenchmarkAlgorithm, BuildTimeMetrics, CacheMetrics, DatasetQualityMetrics, DatasetStatistics,
150    DistanceStatistics, EnhancedBenchmarkDataset, HyperparameterTuner, IndexSizeMetrics,
151    LatencyMetrics, MemoryMetrics, ObjectiveFunction, OptimizationStrategy,
152    ParallelBenchmarkConfig, ParameterSpace, ParameterType, ParameterValue, PerformanceMetrics,
153    PerformanceProfiler, QualityDegradation, QualityMetrics, ScalabilityMetrics,
154    StatisticalAnalyzer, StatisticalMetrics, ThroughputMetrics,
155};
156pub use advanced_caching::{
157    BackgroundCacheWorker, CacheAnalysisReport, CacheAnalyzer, CacheConfig, CacheEntry,
158    CacheInvalidator, CacheKey, CacheStats, CacheWarmer, EvictionPolicy, InvalidationStats,
159    MultiLevelCache, MultiLevelCacheStats,
160};
161pub use advanced_result_merging::{
162    AdvancedResultMerger, ConfidenceInterval, DiversityConfig, DiversityMetric, FusionStatistics,
163    MergedResult, RankFusionAlgorithm, RankingFactor, ResultExplanation, ResultMergingConfig,
164    ResultMetadata, ScoreCombinationStrategy, ScoreNormalizationMethod, ScoredResult,
165    SourceContribution, SourceResult, SourceType,
166};
167pub use automl_optimization::{
168    AutoMLConfig, AutoMLOptimizer, AutoMLResults, AutoMLStatistics, IndexConfiguration,
169    IndexParameterSpace, OptimizationMetric, OptimizationTrial, ResourceConstraints, SearchSpace,
170    TrialResult,
171};
172pub use benchmarking::{
173    BenchmarkConfig, BenchmarkDataset, BenchmarkOutputFormat, BenchmarkResult, BenchmarkRunner,
174    BenchmarkSuite, BenchmarkTestCase, MemoryMetrics as BenchmarkMemoryMetrics,
175    PerformanceMetrics as BenchmarkPerformanceMetrics, QualityMetrics as BenchmarkQualityMetrics,
176    ScalabilityMetrics as BenchmarkScalabilityMetrics, SystemInfo,
177};
178pub use cache_friendly_index::{CacheFriendlyVectorIndex, IndexConfig as CacheFriendlyIndexConfig};
179pub use compression::{create_compressor, CompressionMethod, VectorCompressor};
180#[cfg(feature = "content-processing")]
181pub use content_processing::{
182    ChunkType, ChunkingStrategy, ContentChunk, ContentExtractionConfig, ContentLocation,
183    ContentProcessor, DocumentFormat, DocumentStructure, ExtractedContent, ExtractedImage,
184    ExtractedLink, ExtractedTable, FormatHandler, Heading, ProcessingStats, TocEntry,
185};
186pub use cross_modal_embeddings::{
187    AttentionMechanism, AudioData, AudioEncoder, CrossModalConfig, CrossModalEncoder, FusionLayer,
188    FusionStrategy, GraphData, GraphEncoder, ImageData, ImageEncoder, Modality, ModalityData,
189    MultiModalContent, TextEncoder, VideoData, VideoEncoder,
190};
191pub use distributed_vector_search::{
192    ConsistencyLevel, DistributedClusterStats, DistributedNodeConfig, DistributedQuery,
193    DistributedSearchResponse, DistributedVectorSearch, LoadBalancingAlgorithm, NodeHealthStatus,
194    PartitioningStrategy, QueryExecutionStrategy,
195};
196pub use embedding_pipeline::{
197    DimensionalityReduction, EmbeddingPipeline, NormalizationConfig, PostprocessingPipeline,
198    PreprocessingPipeline, TokenizerConfig, VectorNormalization,
199};
200pub use embeddings::{
201    EmbeddableContent, EmbeddingConfig, EmbeddingManager, EmbeddingStrategy, ModelDetails,
202    OpenAIConfig, OpenAIEmbeddingGenerator, SentenceTransformerGenerator, TransformerModelType,
203};
204pub use enhanced_performance_monitoring::{
205    Alert, AlertManager, AlertSeverity, AlertThresholds, AlertType, AnalyticsEngine,
206    AnalyticsReport, DashboardData, EnhancedPerformanceMonitor, ExportConfig, ExportDestination,
207    ExportFormat, LatencyDistribution, MonitoringConfig as EnhancedMonitoringConfig,
208    QualityMetrics as EnhancedQualityMetrics, QualityMetricsCollector, QualityStatistics,
209    QueryInfo, QueryMetricsCollector, QueryStatistics, QueryType, Recommendation,
210    RecommendationCategory, RecommendationPriority, SystemMetrics, SystemMetricsCollector,
211    SystemStatistics, TrendData, TrendDirection,
212};
213pub use faiss_compatibility::{
214    CompressionLevel, ConversionMetrics, ConversionResult, FaissCompatibility, FaissExportConfig,
215    FaissImportConfig, FaissIndexMetadata, FaissIndexType, FaissMetricType, FaissParameter,
216    SimpleVectorIndex,
217};
218pub use federated_search::{
219    AuthenticationConfig, FederatedSearchConfig, FederatedVectorSearch, FederationEndpoint,
220    PrivacyEngine, PrivacyMode, SchemaCompatibility, TrustManager,
221};
222pub use gnn_embeddings::{AggregatorType, GraphSAGE, GCN};
223pub use gpu::{
224    create_default_accelerator, create_memory_optimized_accelerator,
225    create_performance_accelerator, is_gpu_available, GpuAccelerator, GpuBuffer, GpuConfig,
226    GpuDevice, GpuExecutionConfig,
227};
228pub use graph_indices::{
229    DelaunayGraph, GraphIndex, GraphIndexConfig, GraphType, NSWGraph, ONNGGraph, PANNGGraph,
230    RNGGraph,
231};
232pub use hierarchical_similarity::{
233    ConceptHierarchy, HierarchicalSimilarity, HierarchicalSimilarityConfig,
234    HierarchicalSimilarityResult, HierarchicalSimilarityStats, SimilarityContext,
235    SimilarityExplanation, SimilarityTaskType,
236};
237pub use hnsw::{HnswConfig, HnswIndex};
238pub use index::{AdvancedVectorIndex, DistanceMetric, IndexConfig, IndexType, SearchResult};
239pub use ivf::{IvfConfig, IvfIndex, IvfStats, QuantizationStrategy};
240pub use joint_embedding_spaces::{
241    ActivationFunction, AlignmentPair, CLIPAligner, ContrastiveOptimizer, CrossModalAttention,
242    CurriculumLearning, DataAugmentation, DifficultySchedule, DomainAdapter, DomainStatistics,
243    JointEmbeddingConfig, JointEmbeddingSpace, LearningRateSchedule, LinearProjector,
244    PacingFunction, ScheduleType, TemperatureScheduler, TrainingStatistics,
245};
246pub use kg_embeddings::{
247    ComplEx, KGEmbedding, KGEmbeddingConfig, KGEmbeddingModel as KGModel, KGEmbeddingModelType,
248    RotatE, TransE, Triple,
249};
250pub use lsh::{LshConfig, LshFamily, LshIndex, LshStats};
251pub use mmap_index::{MemoryMappedIndexStats, MemoryMappedVectorIndex};
252pub use performance_insights::{
253    AlertingSystem, OptimizationRecommendations, PerformanceInsightsAnalyzer,
254    PerformanceTrends as InsightsPerformanceTrends, QueryComplexity,
255    QueryStatistics as InsightsQueryStatistics, ReportFormat, VectorStatistics,
256};
257pub use pq::{PQConfig, PQIndex, PQStats};
258pub use pytorch::{
259    ArchitectureType, CompileMode, DeviceManager, PyTorchConfig, PyTorchDevice, PyTorchEmbedder,
260    PyTorchModelManager, PyTorchModelMetadata, PyTorchTokenizer,
261};
262pub use quantum_search::{
263    QuantumSearchConfig, QuantumSearchResult, QuantumSearchStatistics, QuantumState,
264    QuantumVectorSearch,
265};
266pub use rdf_content_enhancement::{
267    ComponentWeights, MultiLanguageProcessor, PathConstraint, PathDirection, PropertyAggregator,
268    PropertyPath, RdfContentConfig, RdfContentProcessor, RdfContext, RdfEntity, RdfValue,
269    TemporalInfo,
270};
271pub use rdf_integration::{
272    RdfIntegrationStats, RdfTermMapping, RdfTermMetadata, RdfTermType, RdfVectorConfig,
273    RdfVectorIntegration, RdfVectorSearchResult, SearchMetadata,
274};
275pub use real_time_analytics::{
276    AlertSeverity as AnalyticsAlertSeverity, AlertType as AnalyticsAlertType, AnalyticsConfig,
277    AnalyticsEvent, AnalyticsReport as RealTimeAnalyticsReport,
278    DashboardData as RealTimeDashboardData, ExportFormat as AnalyticsExportFormat,
279    MetricsCollector, PerformanceMonitor, QueryMetrics, SystemMetrics as AnalyticsSystemMetrics,
280    VectorAnalyticsEngine as RealTimeVectorAnalyticsEngine,
281};
282pub use real_time_embedding_pipeline::{
283    AlertThresholds as PipelineAlertThresholds, AutoScalingConfig, CompressionConfig, ContentItem,
284    MonitoringConfig as PipelineMonitoringConfig, PipelineConfig as RealTimeEmbeddingConfig,
285    PipelineStatistics as PipelineStats, ProcessingPriority, ProcessingResult, ProcessingStatus,
286    RealTimeEmbeddingPipeline, VersioningStrategy,
287};
288pub use real_time_updates::{
289    BatchProcessor, RealTimeConfig, RealTimeVectorSearch, RealTimeVectorUpdater, UpdateBatch,
290    UpdateOperation, UpdatePriority, UpdateStats,
291};
292pub use result_fusion::{
293    FusedResults, FusionAlgorithm, FusionConfig, FusionQualityMetrics, FusionStats,
294    ResultFusionEngine, ScoreNormalizationStrategy, SourceResults, VectorSearchResult,
295};
296pub use similarity::{AdaptiveSimilarity, SemanticSimilarity, SimilarityConfig, SimilarityMetric};
297pub use sparql_integration::{
298    CrossLanguageProcessor, FederatedQueryResult, QueryExecutor, SparqlVectorFunctions,
299    SparqlVectorService, VectorOperation, VectorQuery, VectorQueryResult, VectorServiceArg,
300    VectorServiceConfig, VectorServiceResult,
301};
302pub use sparql_service_endpoint::{
303    AuthenticationInfo, AuthenticationType, CustomFunctionRegistry, FederatedOperation,
304    FederatedSearchResult, FederatedServiceEndpoint, FederatedVectorQuery, FunctionMetadata,
305    LoadBalancer, ParameterInfo, ParameterType as ServiceParameterType, PartialSearchResult,
306    QueryScope, ReturnType, ServiceCapability, ServiceEndpointManager, ServiceType,
307};
308pub use sparse::{COOMatrix, CSRMatrix, SparseVector};
309pub use storage_optimizations::{
310    CompressionType, MmapVectorFile, StorageConfig, StorageUtils, VectorBlock, VectorFileHeader,
311    VectorReader, VectorWriter,
312};
313pub use structured_vectors::{
314    ConfidenceScoredVector, HierarchicalVector, NamedDimensionVector, TemporalVector,
315    WeightedDimensionVector,
316};
317pub use tensorflow::{
318    OptimizationLevel, PreprocessingPipeline as TensorFlowPreprocessingPipeline, ServerConfig,
319    SessionConfig, TensorDataType, TensorFlowConfig, TensorFlowDevice, TensorFlowEmbedder,
320    TensorFlowModelInfo, TensorFlowModelServer, TensorSpec,
321};
322pub use tree_indices::{
323    BallTree, CoverTree, KdTree, RandomProjectionTree, TreeIndex, TreeIndexConfig, TreeType, VpTree,
324};
325pub use word2vec::{
326    AggregationMethod, OovStrategy, Word2VecConfig, Word2VecEmbeddingGenerator, Word2VecFormat,
327};
328
329/// Vector identifier type
330pub type VectorId = String;
331
332/// Batch search result type
333pub type BatchSearchResult = Vec<Result<Vec<(String, f32)>>>;
334
335/// Trait for vector store implementations
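///
/// A minimal sketch of driving the trait through the bundled [`VectorStore`]
/// implementation:
///
/// ```rust
/// use oxirs_vec::{Vector, VectorStore, VectorStoreTrait};
///
/// let mut store = VectorStore::new();
/// let id = store.add_vector(Vector::new(vec![0.1, 0.2, 0.3])).unwrap();
///
/// let hits = store.search_similar(&Vector::new(vec![0.1, 0.2, 0.3]), 1).unwrap();
/// assert_eq!(hits.len(), 1);
/// assert_eq!(hits[0].0, id);
/// ```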
336pub trait VectorStoreTrait: Send + Sync {
337    /// Insert a vector with metadata
338    fn insert_vector(&mut self, id: VectorId, vector: Vector) -> Result<()>;
339
340    /// Add a vector and return its ID
341    fn add_vector(&mut self, vector: Vector) -> Result<VectorId>;
342
343    /// Get a vector by its ID
344    fn get_vector(&self, id: &VectorId) -> Result<Option<Vector>>;
345
346    /// Get all vector IDs
347    fn get_all_vector_ids(&self) -> Result<Vec<VectorId>>;
348
349    /// Search for similar vectors
350    fn search_similar(&self, query: &Vector, k: usize) -> Result<Vec<(VectorId, f32)>>;
351
352    /// Remove a vector by ID
353    fn remove_vector(&mut self, id: &VectorId) -> Result<bool>;
354
355    /// Get the number of vectors stored
356    fn len(&self) -> usize;
357
358    /// Check if the store is empty
359    fn is_empty(&self) -> bool {
360        self.len() == 0
361    }
362}
363
364/// Precision types for vectors
365#[derive(Debug, Clone, Copy, PartialEq, serde::Serialize, serde::Deserialize)]
366pub enum VectorPrecision {
367    F32,
368    F64,
369    F16,
370    I8,
371    Binary,
372}
373
374/// Multi-precision vector with enhanced functionality
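///
/// A small sketch of constructing vectors at different precisions (everything
/// can be read back as `f32` via [`Vector::as_f32`]):
///
/// ```rust
/// use oxirs_vec::{Vector, VectorData, VectorPrecision};
///
/// let dense = Vector::new(vec![0.5, -0.25, 1.0]);
/// assert_eq!(dense.precision, VectorPrecision::F32);
///
/// let packed = Vector::with_precision(VectorData::Binary(vec![0b1010_1010]));
/// assert_eq!(packed.precision, VectorPrecision::Binary);
/// assert_eq!(packed.dimensions, 8); // one packed byte = 8 binary dimensions
/// ```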
375#[derive(Debug, Clone, PartialEq, serde::Serialize, serde::Deserialize)]
376pub struct Vector {
377    pub dimensions: usize,
378    pub precision: VectorPrecision,
379    pub values: VectorData,
380    pub metadata: Option<std::collections::HashMap<String, String>>,
381}
382
383/// Vector data storage supporting multiple precisions
384#[derive(Debug, Clone, PartialEq, serde::Serialize, serde::Deserialize)]
385pub enum VectorData {
386    F32(Vec<f32>),
387    F64(Vec<f64>),
388    F16(Vec<u16>), // Using u16 to represent f16 bits
389    I8(Vec<i8>),
390    Binary(Vec<u8>), // Packed binary representation
391}
392
393impl Vector {
394    /// Create a new F32 vector from values
395    pub fn new(values: Vec<f32>) -> Self {
396        let dimensions = values.len();
397        Self {
398            dimensions,
399            precision: VectorPrecision::F32,
400            values: VectorData::F32(values),
401            metadata: None,
402        }
403    }
404
405    /// Create a new vector with specific precision
406    pub fn with_precision(values: VectorData) -> Self {
407        let (dimensions, precision) = match &values {
408            VectorData::F32(v) => (v.len(), VectorPrecision::F32),
409            VectorData::F64(v) => (v.len(), VectorPrecision::F64),
410            VectorData::F16(v) => (v.len(), VectorPrecision::F16),
411            VectorData::I8(v) => (v.len(), VectorPrecision::I8),
412            VectorData::Binary(v) => (v.len() * 8, VectorPrecision::Binary), // 8 bits per byte
413        };
414
415        Self {
416            dimensions,
417            precision,
418            values,
419            metadata: None,
420        }
421    }
422
423    /// Create a new vector with metadata
424    pub fn with_metadata(
425        values: Vec<f32>,
426        metadata: std::collections::HashMap<String, String>,
427    ) -> Self {
428        let dimensions = values.len();
429        Self {
430            dimensions,
431            precision: VectorPrecision::F32,
432            values: VectorData::F32(values),
433            metadata: Some(metadata),
434        }
435    }
436
437    /// Create F64 vector
438    pub fn f64(values: Vec<f64>) -> Self {
439        Self::with_precision(VectorData::F64(values))
440    }
441
442    /// Create F16 vector (using u16 representation)
443    pub fn f16(values: Vec<u16>) -> Self {
444        Self::with_precision(VectorData::F16(values))
445    }
446
447    /// Create I8 quantized vector
448    pub fn i8(values: Vec<i8>) -> Self {
449        Self::with_precision(VectorData::I8(values))
450    }
451
452    /// Create binary vector
453    pub fn binary(values: Vec<u8>) -> Self {
454        Self::with_precision(VectorData::Binary(values))
455    }
456
457    /// Get vector values as f32 (converting if necessary)
458    pub fn as_f32(&self) -> Vec<f32> {
459        match &self.values {
460            VectorData::F32(v) => v.clone(),
461            VectorData::F64(v) => v.iter().map(|&x| x as f32).collect(),
462            VectorData::F16(v) => v.iter().map(|&x| Self::f16_to_f32(x)).collect(),
463            VectorData::I8(v) => v.iter().map(|&x| x as f32 / 128.0).collect(), // Normalize to [-1, 1]
464            VectorData::Binary(v) => {
465                let mut result = Vec::new();
466                for &byte in v {
467                    for bit in 0..8 {
468                        result.push(if (byte >> bit) & 1 == 1 { 1.0 } else { 0.0 });
469                    }
470                }
471                result
472            }
473        }
474    }
475
476    /// Convert f32 to f16 representation (simplified)
477    #[allow(dead_code)]
478    fn f32_to_f16(value: f32) -> u16 {
479        // Simplified f16 conversion - in practice, use proper IEEE 754 half-precision
480        let bits = value.to_bits();
481        let sign = (bits >> 31) & 0x1;
482        let exp = ((bits >> 23) & 0xff) as i32;
483        let mantissa = bits & 0x7fffff;
484
485        // Simplified conversion
486        let f16_exp = if exp == 0 {
487            0
488        } else {
489            (exp - 127 + 15).clamp(0, 31) as u16
490        };
491
492        let f16_mantissa = (mantissa >> 13) as u16;
493        ((sign as u16) << 15) | (f16_exp << 10) | f16_mantissa
494    }
495
    /// Convert f16 representation to f32 (simplified)
    fn f16_to_f32(value: u16) -> f32 {
        // Simplified f16 conversion - in practice, use proper IEEE 754 half-precision
        let sign = (value >> 15) & 0x1;
        let exp = ((value >> 10) & 0x1f) as i32;
        let mantissa = value & 0x3ff;

        if exp == 0 {
            if mantissa == 0 {
                if sign == 1 {
                    -0.0
                } else {
                    0.0
                }
            } else {
                // Subnormal number: value = mantissa * 2^-24
                let magnitude = (mantissa as f32) * (2.0f32).powi(-24);
                if sign == 1 {
                    -magnitude
                } else {
                    magnitude
                }
            }
        } else {
            let f32_exp = exp - 15 + 127;
            let f32_mantissa = (mantissa as u32) << 13;
            f32::from_bits(((sign as u32) << 31) | ((f32_exp as u32) << 23) | f32_mantissa)
        }
    }
522
523    /// Quantize f32 vector to i8
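    ///
    /// A small sketch of the scaling behaviour (the minimum value maps to -127,
    /// the maximum to 127):
    ///
    /// ```rust
    /// use oxirs_vec::Vector;
    ///
    /// let quantized = Vector::quantize_to_i8(&[-1.0, 0.0, 1.0]);
    /// assert_eq!(quantized, vec![-127, 0, 127]);
    /// ```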
524    pub fn quantize_to_i8(values: &[f32]) -> Vec<i8> {
525        // Find min/max for normalization
526        let min_val = values.iter().fold(f32::INFINITY, |a, &b| a.min(b));
527        let max_val = values.iter().fold(f32::NEG_INFINITY, |a, &b| a.max(b));
528        let range = max_val - min_val;
529
530        if range == 0.0 {
531            vec![0; values.len()]
532        } else {
533            values
534                .iter()
535                .map(|&x| {
536                    let normalized = (x - min_val) / range; // 0 to 1
537                    let scaled = normalized * 254.0 - 127.0; // -127 to 127
538                    scaled.round().clamp(-127.0, 127.0) as i8
539                })
540                .collect()
541        }
542    }
543
544    /// Convert to binary representation using threshold
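    ///
    /// A small sketch of the bit packing (values above the threshold become 1-bits,
    /// least-significant bit first):
    ///
    /// ```rust
    /// use oxirs_vec::Vector;
    ///
    /// let bits = Vector::to_binary(&[0.8, -0.3, 0.1], 0.0);
    /// assert_eq!(bits, vec![0b0000_0101]);
    /// ```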
545    pub fn to_binary(values: &[f32], threshold: f32) -> Vec<u8> {
546        let mut binary = Vec::new();
547        let mut current_byte = 0u8;
548        let mut bit_position = 0;
549
550        for &value in values {
551            if value > threshold {
552                current_byte |= 1 << bit_position;
553            }
554
555            bit_position += 1;
556            if bit_position == 8 {
557                binary.push(current_byte);
558                current_byte = 0;
559                bit_position = 0;
560            }
561        }
562
563        // Handle remaining bits
564        if bit_position > 0 {
565            binary.push(current_byte);
566        }
567
568        binary
569    }
570
571    /// Calculate cosine similarity with another vector
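    ///
    /// A minimal sketch: identical vectors score 1.0, orthogonal vectors 0.0.
    ///
    /// ```rust
    /// use oxirs_vec::Vector;
    ///
    /// let a = Vector::new(vec![1.0, 0.0]);
    /// let b = Vector::new(vec![0.0, 1.0]);
    /// assert!((a.cosine_similarity(&a).unwrap() - 1.0).abs() < 1e-6);
    /// assert!(a.cosine_similarity(&b).unwrap().abs() < 1e-6);
    /// ```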
572    pub fn cosine_similarity(&self, other: &Vector) -> Result<f32> {
573        if self.dimensions != other.dimensions {
574            return Err(anyhow::anyhow!("Vector dimensions must match"));
575        }
576
577        let self_f32 = self.as_f32();
578        let other_f32 = other.as_f32();
579
580        let dot_product: f32 = self_f32.iter().zip(&other_f32).map(|(a, b)| a * b).sum();
581
582        let magnitude_self: f32 = self_f32.iter().map(|x| x * x).sum::<f32>().sqrt();
583        let magnitude_other: f32 = other_f32.iter().map(|x| x * x).sum::<f32>().sqrt();
584
585        if magnitude_self == 0.0 || magnitude_other == 0.0 {
586            return Ok(0.0);
587        }
588
589        Ok(dot_product / (magnitude_self * magnitude_other))
590    }
591
592    /// Calculate Euclidean distance to another vector
593    pub fn euclidean_distance(&self, other: &Vector) -> Result<f32> {
594        if self.dimensions != other.dimensions {
595            return Err(anyhow::anyhow!("Vector dimensions must match"));
596        }
597
598        let self_f32 = self.as_f32();
599        let other_f32 = other.as_f32();
600
601        let distance = self_f32
602            .iter()
603            .zip(&other_f32)
604            .map(|(a, b)| (a - b).powi(2))
605            .sum::<f32>()
606            .sqrt();
607
608        Ok(distance)
609    }
610
611    /// Calculate Manhattan distance (L1 norm) to another vector
612    pub fn manhattan_distance(&self, other: &Vector) -> Result<f32> {
613        if self.dimensions != other.dimensions {
614            return Err(anyhow::anyhow!("Vector dimensions must match"));
615        }
616
617        let self_f32 = self.as_f32();
618        let other_f32 = other.as_f32();
619
620        let distance = self_f32
621            .iter()
622            .zip(&other_f32)
623            .map(|(a, b)| (a - b).abs())
624            .sum();
625
626        Ok(distance)
627    }
628
629    /// Calculate Minkowski distance (general Lp norm) to another vector
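    ///
    /// A small sketch: p = 1 reduces to Manhattan distance, p = 2 to Euclidean.
    ///
    /// ```rust
    /// use oxirs_vec::Vector;
    ///
    /// let a = Vector::new(vec![0.0, 0.0]);
    /// let b = Vector::new(vec![3.0, 4.0]);
    /// assert!((a.minkowski_distance(&b, 1.0).unwrap() - 7.0).abs() < 1e-4);
    /// assert!((a.minkowski_distance(&b, 2.0).unwrap() - 5.0).abs() < 1e-4);
    /// ```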
630    pub fn minkowski_distance(&self, other: &Vector, p: f32) -> Result<f32> {
631        if self.dimensions != other.dimensions {
632            return Err(anyhow::anyhow!("Vector dimensions must match"));
633        }
634
635        if p <= 0.0 {
636            return Err(anyhow::anyhow!("p must be positive"));
637        }
638
639        let self_f32 = self.as_f32();
640        let other_f32 = other.as_f32();
641
642        if p == f32::INFINITY {
643            // Special case: Chebyshev distance
644            return self.chebyshev_distance(other);
645        }
646
647        let distance = self_f32
648            .iter()
649            .zip(&other_f32)
650            .map(|(a, b)| (a - b).abs().powf(p))
651            .sum::<f32>()
652            .powf(1.0 / p);
653
654        Ok(distance)
655    }
656
657    /// Calculate Chebyshev distance (L∞ norm) to another vector
658    pub fn chebyshev_distance(&self, other: &Vector) -> Result<f32> {
659        if self.dimensions != other.dimensions {
660            return Err(anyhow::anyhow!("Vector dimensions must match"));
661        }
662
663        let self_f32 = self.as_f32();
664        let other_f32 = other.as_f32();
665
666        let distance = self_f32
667            .iter()
668            .zip(&other_f32)
669            .map(|(a, b)| (a - b).abs())
670            .fold(0.0f32, |max, val| max.max(val));
671
672        Ok(distance)
673    }
674
675    /// Get vector magnitude (L2 norm)
676    pub fn magnitude(&self) -> f32 {
677        let values = self.as_f32();
678        values.iter().map(|x| x * x).sum::<f32>().sqrt()
679    }
680
681    /// Normalize vector to unit length
682    pub fn normalize(&mut self) {
683        let mag = self.magnitude();
684        if mag > 0.0 {
685            match &mut self.values {
686                VectorData::F32(values) => {
687                    for value in values {
688                        *value /= mag;
689                    }
690                }
691                VectorData::F64(values) => {
692                    let mag_f64 = mag as f64;
693                    for value in values {
694                        *value /= mag_f64;
695                    }
696                }
697                _ => {
698                    // For other types, convert to f32, normalize, then convert back
699                    let mut f32_values = self.as_f32();
700                    for value in &mut f32_values {
701                        *value /= mag;
702                    }
703                    self.values = VectorData::F32(f32_values);
704                    self.precision = VectorPrecision::F32;
705                }
706            }
707        }
708    }
709
710    /// Get a normalized copy of this vector
711    pub fn normalized(&self) -> Vector {
712        let mut normalized = self.clone();
713        normalized.normalize();
714        normalized
715    }
716
717    /// Add another vector (element-wise)
718    pub fn add(&self, other: &Vector) -> Result<Vector> {
719        if self.dimensions != other.dimensions {
720            return Err(anyhow::anyhow!("Vector dimensions must match"));
721        }
722
723        let self_f32 = self.as_f32();
724        let other_f32 = other.as_f32();
725
726        let result_values: Vec<f32> = self_f32
727            .iter()
728            .zip(&other_f32)
729            .map(|(a, b)| a + b)
730            .collect();
731
732        Ok(Vector::new(result_values))
733    }
734
735    /// Subtract another vector (element-wise)
736    pub fn subtract(&self, other: &Vector) -> Result<Vector> {
737        if self.dimensions != other.dimensions {
738            return Err(anyhow::anyhow!("Vector dimensions must match"));
739        }
740
741        let self_f32 = self.as_f32();
742        let other_f32 = other.as_f32();
743
744        let result_values: Vec<f32> = self_f32
745            .iter()
746            .zip(&other_f32)
747            .map(|(a, b)| a - b)
748            .collect();
749
750        Ok(Vector::new(result_values))
751    }
752
753    /// Scale vector by a scalar
754    pub fn scale(&self, scalar: f32) -> Vector {
755        let values = self.as_f32();
756        let scaled_values: Vec<f32> = values.iter().map(|x| x * scalar).collect();
757
758        Vector::new(scaled_values)
759    }
760
761    /// Get the number of dimensions in the vector
762    pub fn len(&self) -> usize {
763        self.dimensions
764    }
765
766    /// Check if vector is empty (zero dimensions)
767    pub fn is_empty(&self) -> bool {
768        self.dimensions == 0
769    }
770
771    /// Get vector as slice of f32 values
772    pub fn as_slice(&self) -> Vec<f32> {
773        self.as_f32()
774    }
775}
776
777/// Vector index trait for efficient similarity search
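///
/// A minimal sketch using the in-memory [`MemoryVectorIndex`] implementation
/// (assuming the default similarity configuration):
///
/// ```rust
/// use oxirs_vec::{MemoryVectorIndex, Vector, VectorIndex};
///
/// let mut index = MemoryVectorIndex::new();
/// index.insert("a".to_string(), Vector::new(vec![1.0, 0.0])).unwrap();
/// index.insert("b".to_string(), Vector::new(vec![0.0, 1.0])).unwrap();
///
/// let hits = index.search_knn(&Vector::new(vec![1.0, 0.0]), 1).unwrap();
/// assert_eq!(hits.len(), 1);
/// ```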
778pub trait VectorIndex: Send + Sync {
779    /// Insert a vector with associated URI
780    fn insert(&mut self, uri: String, vector: Vector) -> Result<()>;
781
782    /// Find k nearest neighbors
783    fn search_knn(&self, query: &Vector, k: usize) -> Result<Vec<(String, f32)>>;
784
785    /// Find all vectors within threshold similarity
786    fn search_threshold(&self, query: &Vector, threshold: f32) -> Result<Vec<(String, f32)>>;
787
788    /// Get a vector by its URI
789    fn get_vector(&self, uri: &str) -> Option<&Vector>;
790
791    /// Add a vector with associated ID and metadata
792    fn add_vector(
793        &mut self,
794        id: VectorId,
795        vector: Vector,
796        _metadata: Option<HashMap<String, String>>,
797    ) -> Result<()> {
798        // Default implementation that delegates to insert
799        self.insert(id, vector)
800    }
801
802    /// Update an existing vector
803    fn update_vector(&mut self, id: VectorId, vector: Vector) -> Result<()> {
804        // Default implementation that delegates to insert
805        self.insert(id, vector)
806    }
807
808    /// Update metadata for a vector
809    fn update_metadata(&mut self, _id: VectorId, _metadata: HashMap<String, String>) -> Result<()> {
810        // Default implementation (no-op)
811        Ok(())
812    }
813
814    /// Remove a vector by its ID
815    fn remove_vector(&mut self, _id: VectorId) -> Result<()> {
816        // Default implementation (no-op)
817        Ok(())
818    }
819}
820
821/// In-memory vector index implementation
822pub struct MemoryVectorIndex {
823    vectors: Vec<(String, Vector)>,
824    similarity_config: similarity::SimilarityConfig,
825}
826
827impl MemoryVectorIndex {
828    pub fn new() -> Self {
829        Self {
830            vectors: Vec::new(),
831            similarity_config: similarity::SimilarityConfig::default(),
832        }
833    }
834
835    pub fn with_similarity_config(config: similarity::SimilarityConfig) -> Self {
836        Self {
837            vectors: Vec::new(),
838            similarity_config: config,
839        }
840    }
841}
842
843impl Default for MemoryVectorIndex {
844    fn default() -> Self {
845        Self::new()
846    }
847}
848
849impl VectorIndex for MemoryVectorIndex {
850    fn insert(&mut self, uri: String, vector: Vector) -> Result<()> {
851        self.vectors.push((uri, vector));
852        Ok(())
853    }
854
855    fn search_knn(&self, query: &Vector, k: usize) -> Result<Vec<(String, f32)>> {
856        let metric = self.similarity_config.primary_metric;
857        let query_f32 = query.as_f32();
858        let mut similarities: Vec<(String, f32)> = self
859            .vectors
860            .iter()
861            .map(|(uri, vec)| {
862                let vec_f32 = vec.as_f32();
863                let sim = metric.similarity(&query_f32, &vec_f32).unwrap_or(0.0);
864                (uri.clone(), sim)
865            })
866            .collect();
867
        similarities.sort_by(|a, b| b.1.partial_cmp(&a.1).unwrap_or(std::cmp::Ordering::Equal));
869        similarities.truncate(k);
870
871        Ok(similarities)
872    }
873
874    fn search_threshold(&self, query: &Vector, threshold: f32) -> Result<Vec<(String, f32)>> {
875        let metric = self.similarity_config.primary_metric;
876        let query_f32 = query.as_f32();
877        let similarities: Vec<(String, f32)> = self
878            .vectors
879            .iter()
880            .filter_map(|(uri, vec)| {
881                let vec_f32 = vec.as_f32();
882                let sim = metric.similarity(&query_f32, &vec_f32).unwrap_or(0.0);
883                if sim >= threshold {
884                    Some((uri.clone(), sim))
885                } else {
886                    None
887                }
888            })
889            .collect();
890
891        Ok(similarities)
892    }
893
894    fn get_vector(&self, uri: &str) -> Option<&Vector> {
895        self.vectors.iter().find(|(u, _)| u == uri).map(|(_, v)| v)
896    }
897}
898
899/// Enhanced vector store with embedding management and advanced features
900pub struct VectorStore {
901    index: Box<dyn VectorIndex>,
902    embedding_manager: Option<embeddings::EmbeddingManager>,
903    config: VectorStoreConfig,
904}
905
906/// Configuration for vector store
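///
/// A minimal sketch of overriding the defaults via [`VectorStore::with_config`]:
///
/// ```rust
/// use oxirs_vec::{VectorStore, VectorStoreConfig};
///
/// let _store = VectorStore::new().with_config(VectorStoreConfig {
///     auto_embed: true,
///     cache_embeddings: true,
///     similarity_threshold: 0.8,
///     max_results: 25,
/// });
/// ```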
907#[derive(Debug, Clone)]
908pub struct VectorStoreConfig {
909    pub auto_embed: bool,
910    pub cache_embeddings: bool,
911    pub similarity_threshold: f32,
912    pub max_results: usize,
913}
914
915impl Default for VectorStoreConfig {
916    fn default() -> Self {
917        Self {
918            auto_embed: true,
919            cache_embeddings: true,
920            similarity_threshold: 0.7,
921            max_results: 100,
922        }
923    }
924}
925
926impl VectorStore {
927    /// Create a new vector store with default memory index
928    pub fn new() -> Self {
929        Self {
930            index: Box::new(MemoryVectorIndex::new()),
931            embedding_manager: None,
932            config: VectorStoreConfig::default(),
933        }
934    }
935
936    /// Create vector store with specific embedding strategy
937    pub fn with_embedding_strategy(strategy: embeddings::EmbeddingStrategy) -> Result<Self> {
938        let embedding_manager = embeddings::EmbeddingManager::new(strategy, 1000)?;
939
940        Ok(Self {
941            index: Box::new(MemoryVectorIndex::new()),
942            embedding_manager: Some(embedding_manager),
943            config: VectorStoreConfig::default(),
944        })
945    }
946
947    /// Create vector store with custom index
948    pub fn with_index(index: Box<dyn VectorIndex>) -> Self {
949        Self {
950            index,
951            embedding_manager: None,
952            config: VectorStoreConfig::default(),
953        }
954    }
955
956    /// Create vector store with custom index and embedding strategy
957    pub fn with_index_and_embeddings(
958        index: Box<dyn VectorIndex>,
959        strategy: embeddings::EmbeddingStrategy,
960    ) -> Result<Self> {
961        let embedding_manager = embeddings::EmbeddingManager::new(strategy, 1000)?;
962
963        Ok(Self {
964            index,
965            embedding_manager: Some(embedding_manager),
966            config: VectorStoreConfig::default(),
967        })
968    }
969
970    /// Set vector store configuration
971    pub fn with_config(mut self, config: VectorStoreConfig) -> Self {
972        self.config = config;
973        self
974    }
975
976    /// Index a resource with automatic embedding generation
977    pub fn index_resource(&mut self, uri: String, content: &str) -> Result<()> {
978        if let Some(ref mut embedding_manager) = self.embedding_manager {
979            let embeddable_content = embeddings::EmbeddableContent::Text(content.to_string());
980            let vector = embedding_manager.get_embedding(&embeddable_content)?;
981            self.index.insert(uri, vector)
982        } else {
983            // Generate a simple hash-based vector as fallback
984            let vector = self.generate_fallback_vector(content);
985            self.index.insert(uri, vector)
986        }
987    }
988
989    /// Index an RDF resource with structured content
990    pub fn index_rdf_resource(
991        &mut self,
992        uri: String,
993        label: Option<String>,
994        description: Option<String>,
995        properties: std::collections::HashMap<String, Vec<String>>,
996    ) -> Result<()> {
997        if let Some(ref mut embedding_manager) = self.embedding_manager {
998            let embeddable_content = embeddings::EmbeddableContent::RdfResource {
999                uri: uri.clone(),
1000                label,
1001                description,
1002                properties,
1003            };
1004            let vector = embedding_manager.get_embedding(&embeddable_content)?;
1005            self.index.insert(uri, vector)
1006        } else {
1007            Err(anyhow::anyhow!(
1008                "Embedding manager required for RDF resource indexing"
1009            ))
1010        }
1011    }
1012
1013    /// Index a pre-computed vector
1014    pub fn index_vector(&mut self, uri: String, vector: Vector) -> Result<()> {
1015        self.index.insert(uri, vector)
1016    }
1017
1018    /// Search for similar resources using text query
1019    pub fn similarity_search(&self, query: &str, limit: usize) -> Result<Vec<(String, f32)>> {
1020        let query_vector = if let Some(ref _embedding_manager) = self.embedding_manager {
1021            let _embeddable_content = embeddings::EmbeddableContent::Text(query.to_string());
1022            // We need a mutable reference, but we only have an immutable one
1023            // For now, generate a fallback vector
1024            self.generate_fallback_vector(query)
1025        } else {
1026            self.generate_fallback_vector(query)
1027        };
1028
1029        self.index.search_knn(&query_vector, limit)
1030    }
1031
1032    /// Search for similar resources using a vector query
1033    pub fn similarity_search_vector(
1034        &self,
1035        query: &Vector,
1036        limit: usize,
1037    ) -> Result<Vec<(String, f32)>> {
1038        self.index.search_knn(query, limit)
1039    }
1040
1041    /// Find resources within similarity threshold
1042    pub fn threshold_search(&self, query: &str, threshold: f32) -> Result<Vec<(String, f32)>> {
1043        let query_vector = self.generate_fallback_vector(query);
1044        self.index.search_threshold(&query_vector, threshold)
1045    }
1046
1047    /// Advanced search with multiple options
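    ///
    /// A minimal sketch combining [`SearchOptions`], [`SearchQuery`], and [`SearchType`]:
    ///
    /// ```rust
    /// use oxirs_vec::{SearchOptions, SearchQuery, SearchType, VectorStore};
    ///
    /// let mut store = VectorStore::new();
    /// store.index_resource("doc1".to_string(), "vector search notes").unwrap();
    ///
    /// let options = SearchOptions {
    ///     query: SearchQuery::Text("vector search".to_string()),
    ///     search_type: SearchType::KNN(3),
    /// };
    /// let results = store.advanced_search(options).unwrap();
    /// assert!(results.len() <= 3);
    /// ```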
1048    pub fn advanced_search(&self, options: SearchOptions) -> Result<Vec<(String, f32)>> {
1049        let query_vector = match options.query {
1050            SearchQuery::Text(text) => self.generate_fallback_vector(&text),
1051            SearchQuery::Vector(vector) => vector,
1052        };
1053
1054        let results = match options.search_type {
1055            SearchType::KNN(k) => self.index.search_knn(&query_vector, k)?,
1056            SearchType::Threshold(threshold) => {
1057                self.index.search_threshold(&query_vector, threshold)?
1058            }
1059        };
1060
1061        Ok(results)
1062    }
1063
1064    fn generate_fallback_vector(&self, text: &str) -> Vector {
1065        // Simple hash-based vector generation for fallback
1066        use std::collections::hash_map::DefaultHasher;
1067        use std::hash::{Hash, Hasher};
1068
1069        let mut hasher = DefaultHasher::new();
1070        text.hash(&mut hasher);
1071        let hash = hasher.finish();
1072
1073        let mut values = Vec::with_capacity(384); // Standard embedding size
1074        let mut seed = hash;
1075
1076        for _ in 0..384 {
1077            seed = seed.wrapping_mul(1103515245).wrapping_add(12345);
1078            let normalized = (seed as f32) / (u64::MAX as f32);
1079            values.push((normalized - 0.5) * 2.0); // Range: -1.0 to 1.0
1080        }
1081
1082        Vector::new(values)
1083    }
1084
1085    /// Get embedding manager statistics
1086    pub fn embedding_stats(&self) -> Option<(usize, usize)> {
1087        self.embedding_manager.as_ref().map(|em| em.cache_stats())
1088    }
1089
1090    /// Build vocabulary for TF-IDF embeddings
1091    pub fn build_vocabulary(&mut self, documents: &[String]) -> Result<()> {
1092        if let Some(ref mut embedding_manager) = self.embedding_manager {
1093            embedding_manager.build_vocabulary(documents)
1094        } else {
1095            Ok(()) // No-op if no embedding manager
1096        }
1097    }
1098
1099    /// Calculate similarity between two resources by their URIs
1100    pub fn calculate_similarity(&self, uri1: &str, uri2: &str) -> Result<f32> {
1101        // If the URIs are identical, return perfect similarity
1102        if uri1 == uri2 {
1103            return Ok(1.0);
1104        }
1105
1106        // Get the vectors for both URIs
1107        let vector1 = self
1108            .index
1109            .get_vector(uri1)
1110            .ok_or_else(|| anyhow::anyhow!("Vector not found for URI: {}", uri1))?;
1111
1112        let vector2 = self
1113            .index
1114            .get_vector(uri2)
1115            .ok_or_else(|| anyhow::anyhow!("Vector not found for URI: {}", uri2))?;
1116
1117        // Calculate cosine similarity between the vectors
1118        vector1.cosine_similarity(vector2)
1119    }
1120
1121    /// Get a vector by its ID (delegates to VectorIndex)
1122    pub fn get_vector(&self, id: &str) -> Option<&Vector> {
1123        self.index.get_vector(id)
1124    }
1125
1126    /// Index a vector with metadata (stub)
1127    pub fn index_vector_with_metadata(
1128        &mut self,
1129        uri: String,
1130        vector: Vector,
1131        _metadata: HashMap<String, String>,
1132    ) -> Result<()> {
1133        // For now, just delegate to index_vector, ignoring metadata
1134        // Future: Extend VectorIndex trait to support metadata
1135        self.index_vector(uri, vector)
1136    }
1137
1138    /// Index a resource with metadata (stub)
1139    pub fn index_resource_with_metadata(
1140        &mut self,
1141        uri: String,
1142        content: &str,
1143        _metadata: HashMap<String, String>,
1144    ) -> Result<()> {
1145        // For now, just delegate to index_resource, ignoring metadata
1146        // Future: Store and utilize metadata
1147        self.index_resource(uri, content)
1148    }
1149
1150    /// Search with additional parameters (stub)
1151    pub fn similarity_search_with_params(
1152        &self,
1153        query: &str,
1154        limit: usize,
1155        _params: HashMap<String, String>,
1156    ) -> Result<Vec<(String, f32)>> {
1157        // For now, just delegate to similarity_search, ignoring params
1158        // Future: Use params for filtering, threshold, etc.
1159        self.similarity_search(query, limit)
1160    }
1161
1162    /// Vector search with additional parameters (stub)
1163    pub fn vector_search_with_params(
1164        &self,
1165        query: &Vector,
1166        limit: usize,
1167        _params: HashMap<String, String>,
1168    ) -> Result<Vec<(String, f32)>> {
1169        // For now, just delegate to similarity_search_vector, ignoring params
1170        // Future: Use params for filtering, distance metric selection, etc.
1171        self.similarity_search_vector(query, limit)
1172    }
1173
1174    /// Get all vector IDs (stub)
1175    pub fn get_vector_ids(&self) -> Result<Vec<String>> {
1176        // VectorIndex trait doesn't provide this method yet
1177        // Future: Add to VectorIndex trait or track separately
1178        Ok(Vec::new())
1179    }
1180
1181    /// Remove a vector by its URI (stub)
1182    pub fn remove_vector(&mut self, uri: &str) -> Result<()> {
1183        // Delegate to VectorIndex trait's remove_vector method
1184        self.index.remove_vector(uri.to_string())
1185    }
1186
1187    /// Get store statistics (stub)
1188    pub fn get_statistics(&self) -> Result<HashMap<String, String>> {
1189        // Return basic statistics as a map
1190        // Future: Provide comprehensive stats from index
1191        let mut stats = HashMap::new();
1192        stats.insert("type".to_string(), "VectorStore".to_string());
1193
1194        if let Some((cache_size, cache_capacity)) = self.embedding_stats() {
1195            stats.insert("embedding_cache_size".to_string(), cache_size.to_string());
1196            stats.insert(
1197                "embedding_cache_capacity".to_string(),
1198                cache_capacity.to_string(),
1199            );
1200        }
1201
1202        Ok(stats)
1203    }
1204
1205    /// Save store to disk (stub)
1206    pub fn save_to_disk(&self, _path: &str) -> Result<()> {
1207        // Stub implementation - serialization not yet implemented
1208        // Future: Serialize index and configuration to disk
1209        Err(anyhow::anyhow!("save_to_disk not yet implemented"))
1210    }
1211
1212    /// Load store from disk (stub)
1213    pub fn load_from_disk(_path: &str) -> Result<Self> {
1214        // Stub implementation - deserialization not yet implemented
1215        // Future: Deserialize index and configuration from disk
1216        Err(anyhow::anyhow!("load_from_disk not yet implemented"))
1217    }
1218
1219    /// Optimize the underlying index (stub)
1220    pub fn optimize_index(&mut self) -> Result<()> {
1221        // Stub implementation - optimization not yet implemented
1222        // Future: Trigger index compaction, rebalancing, etc.
1223        Ok(())
1224    }
1225}
1226
1227impl Default for VectorStore {
1228    fn default() -> Self {
1229        Self::new()
1230    }
1231}
1232
1233impl VectorStoreTrait for VectorStore {
1234    fn insert_vector(&mut self, id: VectorId, vector: Vector) -> Result<()> {
1235        self.index.insert(id, vector)
1236    }
1237
1238    fn add_vector(&mut self, vector: Vector) -> Result<VectorId> {
1239        // Generate a unique ID for the vector
1240        let id = format!("vec_{}", uuid::Uuid::new_v4());
1241        self.index.insert(id.clone(), vector)?;
1242        Ok(id)
1243    }
1244
1245    fn get_vector(&self, id: &VectorId) -> Result<Option<Vector>> {
1246        Ok(self.index.get_vector(id).cloned())
1247    }
1248
1249    fn get_all_vector_ids(&self) -> Result<Vec<VectorId>> {
1250        // For now, return empty vec as VectorIndex doesn't provide this method
1251        // This could be enhanced if the underlying index supports it
1252        Ok(Vec::new())
1253    }
1254
1255    fn search_similar(&self, query: &Vector, k: usize) -> Result<Vec<(VectorId, f32)>> {
1256        self.index.search_knn(query, k)
1257    }
1258
    fn remove_vector(&mut self, id: &VectorId) -> Result<bool> {
        // The underlying VectorIndex::remove_vector default is a no-op, so report
        // `false` until indexes implement real removal
        let _ = id;
        Ok(false)
    }
1265
1266    fn len(&self) -> usize {
1267        // VectorIndex trait doesn't have len, so we'll return 0 for now
1268        // This could be enhanced in the future if needed
1269        0
1270    }
1271}
1272
1273/// Search query types
1274#[derive(Debug, Clone)]
1275pub enum SearchQuery {
1276    Text(String),
1277    Vector(Vector),
1278}
1279
1280/// Search operation types
1281#[derive(Debug, Clone)]
1282pub enum SearchType {
1283    KNN(usize),
1284    Threshold(f32),
1285}
1286
1287/// Advanced search options
1288#[derive(Debug, Clone)]
1289pub struct SearchOptions {
1290    pub query: SearchQuery,
1291    pub search_type: SearchType,
1292}
1293
1294/// Vector operation results with enhanced metadata
1295#[derive(Debug, Clone)]
1296pub struct VectorOperationResult {
1297    pub uri: String,
1298    pub similarity: f32,
1299    pub vector: Option<Vector>,
1300    pub metadata: Option<std::collections::HashMap<String, String>>,
1301    pub rank: usize,
1302}
1303
1304/// Document batch processing utilities
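///
/// A minimal sketch of batch indexing followed by batch search:
///
/// ```rust
/// use oxirs_vec::{DocumentBatchProcessor, VectorStore};
///
/// let mut store = VectorStore::new();
/// let docs = vec![
///     ("http://example.org/a".to_string(), "alpha document".to_string()),
///     ("http://example.org/b".to_string(), "beta document".to_string()),
/// ];
///
/// let index_results = DocumentBatchProcessor::batch_index(&mut store, &docs).unwrap();
/// assert!(index_results.iter().all(|r| r.is_ok()));
///
/// let search_results =
///     DocumentBatchProcessor::batch_search(&store, &["alpha".to_string()], 5).unwrap();
/// assert_eq!(search_results.len(), 1);
/// ```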
1305pub struct DocumentBatchProcessor;
1306
1307impl DocumentBatchProcessor {
1308    /// Process multiple documents in batch for efficient indexing
1309    pub fn batch_index(
1310        store: &mut VectorStore,
1311        documents: &[(String, String)], // (uri, content) pairs
1312    ) -> Result<Vec<Result<()>>> {
1313        let mut results = Vec::new();
1314
1315        for (uri, content) in documents {
1316            let result = store.index_resource(uri.clone(), content);
1317            results.push(result);
1318        }
1319
1320        Ok(results)
1321    }
1322
1323    /// Process multiple queries in batch
1324    pub fn batch_search(
1325        store: &VectorStore,
1326        queries: &[String],
1327        limit: usize,
1328    ) -> Result<BatchSearchResult> {
1329        let mut results = Vec::new();
1330
1331        for query in queries {
1332            let result = store.similarity_search(query, limit);
1333            results.push(result);
1334        }
1335
1336        Ok(results)
1337    }
1338}
1339
1340/// Error types specific to vector operations
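///
/// A small sketch of the generated `Display` formatting:
///
/// ```rust
/// use oxirs_vec::VectorError;
///
/// let err = VectorError::DimensionMismatch { expected: 384, actual: 128 };
/// assert_eq!(err.to_string(), "Dimension mismatch: expected 384, got 128");
/// ```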
1341#[derive(Debug, thiserror::Error)]
1342pub enum VectorError {
1343    #[error("Dimension mismatch: expected {expected}, got {actual}")]
1344    DimensionMismatch { expected: usize, actual: usize },
1345
1346    #[error("Empty vector")]
1347    EmptyVector,
1348
1349    #[error("Index not built")]
1350    IndexNotBuilt,
1351
1352    #[error("Embedding generation failed: {message}")]
1353    EmbeddingError { message: String },
1354
1355    #[error("SPARQL service error: {message}")]
1356    SparqlServiceError { message: String },
1357
1358    #[error("Compression error: {0}")]
1359    CompressionError(String),
1360
1361    #[error("Invalid dimensions: {0}")]
1362    InvalidDimensions(String),
1363
1364    #[error("Unsupported operation: {0}")]
1365    UnsupportedOperation(String),
1366
1367    #[error("Invalid data: {0}")]
1368    InvalidData(String),
1369
1370    #[error("IO error: {0}")]
1371    IoError(#[from] std::io::Error),
1372}
1373
1374/// Utility functions for vector operations
1375pub mod utils {
1376    use super::Vector;
1377
1378    /// Calculate centroid of a set of vectors
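    ///
    /// A small sketch: the centroid is the element-wise mean of the inputs.
    ///
    /// ```rust
    /// use oxirs_vec::{utils, Vector};
    ///
    /// let c = utils::centroid(&[
    ///     Vector::new(vec![0.0, 2.0]),
    ///     Vector::new(vec![2.0, 0.0]),
    /// ]).unwrap();
    /// assert_eq!(c.as_f32(), vec![1.0, 1.0]);
    /// ```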
1379    pub fn centroid(vectors: &[Vector]) -> Option<Vector> {
1380        if vectors.is_empty() {
1381            return None;
1382        }
1383
1384        let dimensions = vectors[0].dimensions;
1385        let mut sum_values = vec![0.0; dimensions];
1386
1387        for vector in vectors {
1388            if vector.dimensions != dimensions {
1389                return None; // Inconsistent dimensions
1390            }
1391
1392            let vector_f32 = vector.as_f32();
1393            for (i, &value) in vector_f32.iter().enumerate() {
1394                sum_values[i] += value;
1395            }
1396        }
1397
1398        let count = vectors.len() as f32;
1399        for value in &mut sum_values {
1400            *value /= count;
1401        }
1402
1403        Some(Vector::new(sum_values))
1404    }
1405
1406    /// Generate random vector for testing
1407    pub fn random_vector(dimensions: usize, seed: Option<u64>) -> Vector {
1408        use std::collections::hash_map::DefaultHasher;
1409        use std::hash::{Hash, Hasher};
1410
1411        let mut hasher = DefaultHasher::new();
1412        seed.unwrap_or(42).hash(&mut hasher);
1413        let mut rng_state = hasher.finish();
1414
1415        let mut values = Vec::with_capacity(dimensions);
1416        for _ in 0..dimensions {
1417            rng_state = rng_state.wrapping_mul(1103515245).wrapping_add(12345);
1418            let normalized = (rng_state as f32) / (u64::MAX as f32);
1419            values.push((normalized - 0.5) * 2.0); // Range: -1.0 to 1.0
1420        }
1421
1422        Vector::new(values)
1423    }
1424
1425    /// Convert vector to normalized unit vector
1426    pub fn normalize_vector(vector: &Vector) -> Vector {
1427        vector.normalized()
1428    }
1429}
1430
1431#[cfg(test)]
1432mod tests {
1433    use super::*;
1434    use crate::similarity::SimilarityMetric;
1435
1436    #[test]
1437    fn test_vector_creation() {
1438        let values = vec![1.0, 2.0, 3.0];
1439        let vector = Vector::new(values.clone());
1440
1441        assert_eq!(vector.dimensions, 3);
1442        assert_eq!(vector.precision, VectorPrecision::F32);
1443        assert_eq!(vector.as_f32(), values);
1444    }
1445
1446    #[test]
1447    fn test_multi_precision_vectors() {
1448        // Test F64 vector
1449        let f64_values = vec![1.0, 2.0, 3.0];
1450        let f64_vector = Vector::f64(f64_values.clone());
1451        assert_eq!(f64_vector.precision, VectorPrecision::F64);
1452        assert_eq!(f64_vector.dimensions, 3);
1453
1454        // Test I8 vector
1455        let i8_values = vec![100, -50, 0];
1456        let i8_vector = Vector::i8(i8_values);
1457        assert_eq!(i8_vector.precision, VectorPrecision::I8);
1458        assert_eq!(i8_vector.dimensions, 3);
1459
1460        // Test binary vector
1461        let binary_values = vec![0b10101010, 0b11110000];
1462        let binary_vector = Vector::binary(binary_values);
1463        assert_eq!(binary_vector.precision, VectorPrecision::Binary);
1464        assert_eq!(binary_vector.dimensions, 16); // 2 bytes * 8 bits
1465    }
1466
1467    #[test]
1468    fn test_vector_operations() {
1469        let v1 = Vector::new(vec![1.0, 2.0, 3.0]);
1470        let v2 = Vector::new(vec![4.0, 5.0, 6.0]);
1471
1472        // Test addition
1473        let sum = v1.add(&v2).unwrap();
1474        assert_eq!(sum.as_f32(), vec![5.0, 7.0, 9.0]);
1475
1476        // Test subtraction
1477        let diff = v2.subtract(&v1).unwrap();
1478        assert_eq!(diff.as_f32(), vec![3.0, 3.0, 3.0]);
1479
1480        // Test scaling
1481        let scaled = v1.scale(2.0);
1482        assert_eq!(scaled.as_f32(), vec![2.0, 4.0, 6.0]);
1483    }
1484
1485    #[test]
1486    fn test_cosine_similarity() {
1487        let v1 = Vector::new(vec![1.0, 0.0, 0.0]);
1488        let v2 = Vector::new(vec![1.0, 0.0, 0.0]);
1489        let v3 = Vector::new(vec![0.0, 1.0, 0.0]);
1490
1491        // Identical vectors should have similarity 1.0
1492        assert!((v1.cosine_similarity(&v2).unwrap() - 1.0).abs() < 0.001);
1493
1494        // Orthogonal vectors should have similarity 0.0
1495        assert!((v1.cosine_similarity(&v3).unwrap()).abs() < 0.001);
1496    }
1497
1498    #[test]
1499    fn test_vector_store() {
1500        let mut store = VectorStore::new();
1501
1502        // Test indexing
1503        store
1504            .index_resource("doc1".to_string(), "This is a test")
1505            .unwrap();
1506        store
1507            .index_resource("doc2".to_string(), "Another test document")
1508            .unwrap();
1509
1510        // Test searching
1511        let results = store.similarity_search("test", 5).unwrap();
1512        assert_eq!(results.len(), 2);
1513
1514        // Results should be sorted by similarity (descending)
1515        assert!(results[0].1 >= results[1].1);
1516    }
1517
1518    #[test]
1519    fn test_similarity_metrics() {
1520        let a = vec![1.0, 2.0, 3.0];
1521        let b = vec![4.0, 5.0, 6.0];
1522
1523        // Test different similarity metrics
1524        let cosine_sim = SimilarityMetric::Cosine.similarity(&a, &b).unwrap();
1525        let euclidean_sim = SimilarityMetric::Euclidean.similarity(&a, &b).unwrap();
1526        let manhattan_sim = SimilarityMetric::Manhattan.similarity(&a, &b).unwrap();
1527
1528        // All similarities should be between 0 and 1
1529        assert!((0.0..=1.0).contains(&cosine_sim));
1530        assert!((0.0..=1.0).contains(&euclidean_sim));
1531        assert!((0.0..=1.0).contains(&manhattan_sim));
1532    }
1533
1534    #[test]
1535    fn test_quantization() {
1536        let values = vec![1.0, -0.5, 0.0, 0.75];
1537        let quantized = Vector::quantize_to_i8(&values);
1538
1539        // Check that quantized values are in the expected range
1540        for &q in &quantized {
1541            assert!((-127..=127).contains(&q));
1542        }
1543    }
1544
1545    #[test]
1546    fn test_binary_conversion() {
1547        let values = vec![0.8, -0.3, 0.1, -0.9];
1548        let binary = Vector::to_binary(&values, 0.0);
1549
1550        // Should have 1 byte (4 values, each becomes 1 bit, packed into bytes)
1551        assert_eq!(binary.len(), 1);
1552
1553        // First bit should be 1 (0.8 > 0.0), second should be 0 (-0.3 < 0.0), etc.
1554        let byte = binary[0];
1555        assert_eq!(byte & 1, 1); // bit 0: 0.8 > 0.0
1556        assert_eq!((byte >> 1) & 1, 0); // bit 1: -0.3 < 0.0
1557        assert_eq!((byte >> 2) & 1, 1); // bit 2: 0.1 > 0.0
1558        assert_eq!((byte >> 3) & 1, 0); // bit 3: -0.9 < 0.0
1559    }
1560
1561    #[test]
1562    fn test_memory_vector_index() {
1563        let mut index = MemoryVectorIndex::new();
1564
1565        let v1 = Vector::new(vec![1.0, 0.0, 0.0]);
1566        let v2 = Vector::new(vec![0.0, 1.0, 0.0]);
1567
1568        index.insert("v1".to_string(), v1.clone()).unwrap();
1569        index.insert("v2".to_string(), v2.clone()).unwrap();
1570
1571        // Test KNN search
1572        let results = index.search_knn(&v1, 1).unwrap();
1573        assert_eq!(results.len(), 1);
1574        assert_eq!(results[0].0, "v1");
1575
1576        // Test threshold search
1577        let results = index.search_threshold(&v1, 0.5).unwrap();
1578        assert!(!results.is_empty());
1579    }
1580
1581    #[test]
1582    fn test_hnsw_index() {
1583        use crate::hnsw::{HnswConfig, HnswIndex};
1584
1585        let config = HnswConfig::default();
1586        let mut index = HnswIndex::new(config).unwrap();
1587
1588        let v1 = Vector::new(vec![1.0, 0.0, 0.0]);
1589        let v2 = Vector::new(vec![0.0, 1.0, 0.0]);
1590        let v3 = Vector::new(vec![0.0, 0.0, 1.0]);
1591
1592        index.insert("v1".to_string(), v1.clone()).unwrap();
1593        index.insert("v2".to_string(), v2.clone()).unwrap();
1594        index.insert("v3".to_string(), v3.clone()).unwrap();
1595
1596        // Test KNN search
1597        let results = index.search_knn(&v1, 2).unwrap();
1598        assert!(results.len() <= 2);
1599
1600        // The first result should be v1 itself (highest similarity)
1601        if !results.is_empty() {
1602            assert_eq!(results[0].0, "v1");
1603        }
1604    }
1605
1606    #[test]
1607    fn test_sparql_vector_service() {
1608        use crate::embeddings::EmbeddingStrategy;
1609        use crate::sparql_integration::{
1610            SparqlVectorService, VectorServiceArg, VectorServiceConfig, VectorServiceResult,
1611        };
1612
1613        let config = VectorServiceConfig::default();
1614        let mut service =
1615            SparqlVectorService::new(config, EmbeddingStrategy::SentenceTransformer).unwrap();
1616
1617        // Test vector similarity function
1618        let v1 = Vector::new(vec![1.0, 0.0, 0.0]);
1619        let v2 = Vector::new(vec![1.0, 0.0, 0.0]);
1620
1621        let args = vec![VectorServiceArg::Vector(v1), VectorServiceArg::Vector(v2)];
1622
1623        let result = service
1624            .execute_function("vector_similarity", &args)
1625            .unwrap();
1626
1627        match result {
1628            VectorServiceResult::Number(similarity) => {
1629                assert!((similarity - 1.0).abs() < 0.001); // Should be very similar
1630            }
1631            _ => panic!("Expected a number result"),
1632        }
1633
1634        // Test text embedding function
1635        let text_args = vec![VectorServiceArg::String("test text".to_string())];
1636        let embed_result = service.execute_function("embed_text", &text_args).unwrap();
1637
1638        match embed_result {
1639            VectorServiceResult::Vector(vector) => {
1640                assert_eq!(vector.dimensions, 384); // Default embedding size
1641            }
1642            _ => panic!("Expected a vector result"),
1643        }
1644    }
1645}