oxirs_vec/
faiss_compatibility.rs

1//! FAISS Compatibility Layer
2//!
3//! This module provides compatibility with Facebook AI Similarity Search (FAISS)
4//! library, enabling import/export of vector indexes to/from FAISS format.
5//! This allows seamless integration with the broader ML ecosystem.
6
7use crate::{
8    hnsw::{HnswConfig, HnswIndex},
9    ivf::{IvfConfig, IvfIndex},
10    similarity::SimilarityMetric,
11    Vector, VectorIndex,
12};
13use anyhow::{anyhow, Result};
14use serde::{Deserialize, Serialize};
15use std::collections::HashMap;
16use std::fs::File;
17use std::io::{BufReader, BufWriter, Read, Write};
18use std::path::Path;
19
/// FAISS index types known to the compatibility layer.
///
/// Being listed here does not imply that import or export is implemented for
/// a variant; the import/export dispatchers in this file reject the variants
/// they do not handle with an error.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
pub enum FaissIndexType {
    /// Flat (brute force) index, L2 flavour
    IndexFlatL2,
    /// Flat (brute force) index, inner-product flavour
    IndexFlatIP,
    /// IVF with flat quantizer
    IndexIVFFlat,
    /// IVF with product quantization
    IndexIVFPQ,
    /// Hierarchical NSW (HNSW)
    IndexHNSWFlat,
    /// LSH (Locality Sensitive Hashing)
    IndexLSH,
    /// PCA + flat index
    IndexPCAFlat,
}
37
/// FAISS index metadata
///
/// Describes an exported or imported index: what kind of index it is, how
/// large it is, and where it came from.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct FaissIndexMetadata {
    /// Index family the data is stored as.
    pub index_type: FaissIndexType,
    /// Number of components per vector.
    pub dimension: usize,
    /// Number of vectors the index is declared to contain.
    pub num_vectors: usize,
    /// Distance/similarity metric associated with the index.
    pub metric_type: FaissMetricType,
    /// Free-form, index-specific parameters keyed by name.
    pub parameters: HashMap<String, FaissParameter>,
    /// Producer/version tag (the exporter in this file writes "oxirs-vec-1.0").
    pub version: String,
    /// Creation timestamp; the code in this file fills it with an RFC 3339 string.
    pub created_at: String,
}
49
/// FAISS metric types
///
/// These are the metrics the header writer in this file can encode.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
pub enum FaissMetricType {
    /// L2 (Euclidean) distance
    L2,
    /// Inner product (dot product)
    InnerProduct,
    /// Cosine similarity (normalized inner product)
    Cosine,
}
60
/// FAISS parameter values
///
/// A small dynamically-typed value used for the entries of
/// `FaissIndexMetadata::parameters`.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub enum FaissParameter {
    /// Signed integer value.
    Integer(i64),
    /// Floating-point value.
    Float(f64),
    /// Text value.
    String(String),
    /// Boolean flag.
    Boolean(bool),
}
69
/// FAISS compatibility layer for vector indexes
///
/// Entry point for exporting oxirs-vec indexes to, and importing them from,
/// the on-disk layout written by this module.
pub struct FaissCompatibility {
    /// Formats accepted by `is_format_supported` (used to validate imports).
    supported_formats: Vec<FaissIndexType>,
    /// Results of earlier exports, keyed by "<format:?>-<output path>".
    conversion_cache: HashMap<String, ConversionResult>,
}
75
/// Result of index conversion
#[derive(Debug, Clone)]
pub struct ConversionResult {
    /// Whether the conversion completed; `export_to_faiss` only ever builds a
    /// result with `true` because failures are returned as `Err`.
    pub success: bool,
    /// Metadata describing the converted index.
    pub metadata: FaissIndexMetadata,
    /// Timing and size figures collected for the conversion.
    pub performance_metrics: ConversionMetrics,
    /// Non-fatal observations, e.g. a requested format that differs from the
    /// heuristically detected one.
    pub warnings: Vec<String>,
}
84
/// Conversion performance metrics
#[derive(Debug, Clone, Default)]
pub struct ConversionMetrics {
    /// Wall-clock time the conversion took.
    pub conversion_time: std::time::Duration,
    /// Estimated memory footprint in bytes (an estimate, not a measurement).
    pub memory_used: usize,
    /// Number of vectors covered by the conversion.
    pub vectors_processed: usize,
    /// Estimated fraction of accuracy retained.
    pub accuracy_preserved: f32, // 0.0 to 1.0
}
93
/// FAISS export configuration
#[derive(Debug, Clone)]
pub struct FaissExportConfig {
    /// Format the index is written as; a mismatch with the heuristically
    /// detected format only produces a warning.
    pub target_format: FaissIndexType,
    /// Requested compression level.
    /// NOTE(review): not read by any code visible in this file.
    pub compression_level: CompressionLevel,
    /// Whether accuracy should be favoured during export.
    /// NOTE(review): not read by any code visible in this file.
    pub preserve_accuracy: bool,
    /// Whether metadata should be included in the output.
    /// NOTE(review): not read by any code visible in this file.
    pub include_metadata: bool,
    /// Number of vectors handled per chunk when writing vector data.
    pub chunk_size: usize,
}
103
/// FAISS import configuration
///
/// Plain value type; it derives `PartialEq`/`Eq` so configurations can be
/// compared directly (for example in tests or when detecting changes).
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct FaissImportConfig {
    /// Reject files whose index type is not in the supported-format list.
    pub validate_format: bool,
    /// Whether performance characteristics should be preserved on import.
    /// NOTE(review): not read by any code visible in this file.
    pub preserve_performance: bool,
    /// Whether an incompatible index should be rebuilt instead of rejected.
    /// NOTE(review): not read by any code visible in this file.
    pub rebuild_if_incompatible: bool,
    /// Number of vectors handled per batch while importing.
    pub batch_size: usize,
}
112
/// Compression levels for export
///
/// Variants are declared from no compression to the strongest setting. The
/// enum is a plain tag, so it derives the comparison and hashing traits to be
/// usable in equality checks and as a map key.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
pub enum CompressionLevel {
    /// No compression.
    None,
    /// Light compression.
    Low,
    /// Balanced compression (the export default in this file).
    Medium,
    /// Strong compression.
    High,
    /// Strongest available compression.
    Maximum,
}
122
123impl Default for FaissExportConfig {
124    fn default() -> Self {
125        Self {
126            target_format: FaissIndexType::IndexHNSWFlat,
127            compression_level: CompressionLevel::Medium,
128            preserve_accuracy: true,
129            include_metadata: true,
130            chunk_size: 10000,
131        }
132    }
133}
134
135impl Default for FaissImportConfig {
136    fn default() -> Self {
137        Self {
138            validate_format: true,
139            preserve_performance: true,
140            rebuild_if_incompatible: false,
141            batch_size: 5000,
142        }
143    }
144}
145
146impl FaissCompatibility {
147    /// Create a new FAISS compatibility layer
148    pub fn new() -> Self {
149        Self {
150            supported_formats: vec![
151                FaissIndexType::IndexFlatL2,
152                FaissIndexType::IndexFlatIP,
153                FaissIndexType::IndexIVFFlat,
154                FaissIndexType::IndexIVFPQ,
155                FaissIndexType::IndexHNSWFlat,
156                FaissIndexType::IndexLSH,
157            ],
158            conversion_cache: HashMap::new(),
159        }
160    }
161
162    /// Export an oxirs-vec index to FAISS format
163    pub fn export_to_faiss<T: VectorIndex>(
164        &mut self,
165        index: &T,
166        output_path: &Path,
167        config: &FaissExportConfig,
168    ) -> Result<ConversionResult> {
169        let start_time = std::time::Instant::now();
170        let mut warnings = Vec::new();
171
172        // Detect the appropriate FAISS format for the index
173        let detected_format = self.detect_optimal_faiss_format(index)?;
174        let target_format = if detected_format != config.target_format {
175            warnings.push(format!(
176                "Requested format {:?} differs from optimal format {:?}",
177                config.target_format, detected_format
178            ));
179            config.target_format
180        } else {
181            detected_format
182        };
183
184        // Create metadata
185        let metadata = FaissIndexMetadata {
186            index_type: target_format,
187            dimension: self.get_index_dimension(index)?,
188            num_vectors: self.get_index_size(index)?,
189            metric_type: self.convert_similarity_metric(self.get_index_metric(index)?),
190            parameters: self.extract_index_parameters(index, target_format)?,
191            version: "oxirs-vec-1.0".to_string(),
192            created_at: chrono::Utc::now().to_rfc3339(),
193        };
194
195        // Export the index data
196        self.write_faiss_index(index, output_path, &metadata, config)?;
197
198        let conversion_time = start_time.elapsed();
199        let performance_metrics = ConversionMetrics {
200            conversion_time,
201            memory_used: self.estimate_memory_usage(&metadata),
202            vectors_processed: metadata.num_vectors,
203            accuracy_preserved: self.estimate_accuracy_preservation(target_format, config),
204        };
205
206        let result = ConversionResult {
207            success: true,
208            metadata,
209            performance_metrics,
210            warnings,
211        };
212
213        // Cache the result
214        let cache_key = format!("{:?}-{}", target_format, output_path.display());
215        self.conversion_cache.insert(cache_key, result.clone());
216
217        Ok(result)
218    }
219
220    /// Import a FAISS index to oxirs-vec format
221    pub fn import_from_faiss(
222        &mut self,
223        input_path: &Path,
224        config: &FaissImportConfig,
225    ) -> Result<Box<dyn VectorIndex>> {
226        // Read and validate FAISS metadata
227        let metadata = self.read_faiss_metadata(input_path)?;
228
229        if config.validate_format && !self.is_format_supported(&metadata.index_type) {
230            return Err(anyhow!(
231                "Unsupported FAISS format: {:?}",
232                metadata.index_type
233            ));
234        }
235
236        // Create appropriate oxirs-vec index based on FAISS type
237        match metadata.index_type {
238            FaissIndexType::IndexHNSWFlat => self.import_hnsw_index(input_path, &metadata, config),
239            FaissIndexType::IndexIVFFlat | FaissIndexType::IndexIVFPQ => {
240                self.import_ivf_index(input_path, &metadata, config)
241            }
242            FaissIndexType::IndexFlatL2 | FaissIndexType::IndexFlatIP => {
243                self.import_flat_index(input_path, &metadata, config)
244            }
245            _ => Err(anyhow!(
246                "Import not yet implemented for {:?}",
247                metadata.index_type
248            )),
249        }
250    }
251
252    /// Convert HNSW index to FAISS format
253    fn export_hnsw_to_faiss(
254        &self,
255        index: &HnswIndex,
256        output_path: &Path,
257        metadata: &FaissIndexMetadata,
258        config: &FaissExportConfig,
259    ) -> Result<()> {
260        let file = File::create(output_path)?;
261        let mut writer = BufWriter::new(file);
262
263        // Write FAISS header
264        self.write_faiss_header(&mut writer, metadata)?;
265
266        // Write HNSW-specific data
267        self.write_hnsw_data(&mut writer, index, config)?;
268
269        // Write vectors in chunks
270        self.write_vectors_chunked(&mut writer, index, config.chunk_size)?;
271
272        writer.flush()?;
273        Ok(())
274    }
275
276    /// Convert IVF index to FAISS format
277    fn export_ivf_to_faiss(
278        &self,
279        index: &IvfIndex,
280        output_path: &Path,
281        metadata: &FaissIndexMetadata,
282        config: &FaissExportConfig,
283    ) -> Result<()> {
284        let file = File::create(output_path)?;
285        let mut writer = BufWriter::new(file);
286
287        // Write FAISS header
288        self.write_faiss_header(&mut writer, metadata)?;
289
290        // Write IVF-specific data
291        self.write_ivf_data(&mut writer, index, config)?;
292
293        // Write centroids and inverted lists
294        self.write_ivf_structure(&mut writer, index)?;
295
296        writer.flush()?;
297        Ok(())
298    }
299
300    /// Import HNSW index from FAISS format
301    fn import_hnsw_index(
302        &self,
303        input_path: &Path,
304        metadata: &FaissIndexMetadata,
305        config: &FaissImportConfig,
306    ) -> Result<Box<dyn VectorIndex>> {
307        let file = File::open(input_path)?;
308        let mut reader = BufReader::new(file);
309
310        // Skip FAISS header
311        self.skip_faiss_header(&mut reader)?;
312
313        // Read HNSW configuration
314        let hnsw_config = self.read_hnsw_config(&mut reader, metadata)?;
315
316        // Create new HNSW index
317        let mut index = HnswIndex::new(hnsw_config)?;
318
319        // Read and import vectors in batches
320        self.import_vectors_batched(&mut reader, &mut index, config.batch_size)?;
321
322        Ok(Box::new(index))
323    }
324
325    /// Import IVF index from FAISS format
326    fn import_ivf_index(
327        &self,
328        input_path: &Path,
329        metadata: &FaissIndexMetadata,
330        config: &FaissImportConfig,
331    ) -> Result<Box<dyn VectorIndex>> {
332        let file = File::open(input_path)?;
333        let mut reader = BufReader::new(file);
334
335        // Skip FAISS header
336        self.skip_faiss_header(&mut reader)?;
337
338        // Read IVF configuration
339        let ivf_config = self.read_ivf_config(&mut reader, metadata)?;
340
341        // Create new IVF index
342        let mut index = IvfIndex::new(ivf_config)?;
343
344        // Read centroids and structure
345        self.read_ivf_structure(&mut reader, &mut index)?;
346
347        // Import vectors
348        self.import_vectors_batched(&mut reader, &mut index, config.batch_size)?;
349
350        Ok(Box::new(index))
351    }
352
353    /// Import flat index from FAISS format
354    fn import_flat_index(
355        &self,
356        input_path: &Path,
357        metadata: &FaissIndexMetadata,
358        _config: &FaissImportConfig,
359    ) -> Result<Box<dyn VectorIndex>> {
360        let file = File::open(input_path)?;
361        let mut reader = BufReader::new(file);
362
363        // Skip FAISS header
364        self.skip_faiss_header(&mut reader)?;
365
366        // Create a simple in-memory index for flat FAISS indexes
367        let mut vectors = Vec::new();
368        let mut uris = Vec::new();
369
370        // Read all vectors
371        for i in 0..metadata.num_vectors {
372            let vector = self.read_vector(&mut reader, metadata.dimension)?;
373            vectors.push(vector);
374            uris.push(format!("faiss_vector_{i}"));
375        }
376
377        // Create a simple flat index implementation
378        Ok(Box::new(SimpleVectorIndex::new(vectors, uris)))
379    }
380
381    /// Detect optimal FAISS format for an index
382    fn detect_optimal_faiss_format<T: VectorIndex>(&self, index: &T) -> Result<FaissIndexType> {
383        let size = self.get_index_size(index)?;
384        let dimension = self.get_index_dimension(index)?;
385
386        // Use heuristics to determine best format
387        if size < 10000 {
388            // Small datasets - use flat index
389            Ok(FaissIndexType::IndexFlatL2)
390        } else if dimension > 1000 {
391            // High-dimensional - use IVF with PQ
392            Ok(FaissIndexType::IndexIVFPQ)
393        } else if size > 100000 {
394            // Large dataset - use HNSW
395            Ok(FaissIndexType::IndexHNSWFlat)
396        } else {
397            // Medium dataset - use IVF flat
398            Ok(FaissIndexType::IndexIVFFlat)
399        }
400    }
401
402    /// Check if a FAISS format is supported
403    fn is_format_supported(&self, format: &FaissIndexType) -> bool {
404        self.supported_formats.contains(format)
405    }
406
407    /// Convert similarity metric to FAISS metric type
408    fn convert_similarity_metric(&self, metric: SimilarityMetric) -> FaissMetricType {
409        match metric {
410            SimilarityMetric::Cosine => FaissMetricType::Cosine,
411            SimilarityMetric::Euclidean => FaissMetricType::L2,
412            SimilarityMetric::DotProduct => FaissMetricType::InnerProduct,
413            SimilarityMetric::Manhattan => FaissMetricType::L2, // Approximate with L2
414            // All other metrics approximate with L2 for FAISS compatibility
415            _ => FaissMetricType::L2,
416        }
417    }
418
419    /// Write FAISS header to file
420    fn write_faiss_header(
421        &self,
422        writer: &mut BufWriter<File>,
423        metadata: &FaissIndexMetadata,
424    ) -> Result<()> {
425        // FAISS magic number
426        writer.write_all(b"FAISS")?;
427
428        // Version
429        writer.write_all(&1u32.to_le_bytes())?;
430
431        // Index type identifier
432        let type_id = self.faiss_type_to_id(metadata.index_type);
433        writer.write_all(&type_id.to_le_bytes())?;
434
435        // Dimension
436        writer.write_all(&(metadata.dimension as u32).to_le_bytes())?;
437
438        // Number of vectors
439        writer.write_all(&(metadata.num_vectors as u64).to_le_bytes())?;
440
441        // Metric type
442        let metric_id = self.faiss_metric_to_id(metadata.metric_type);
443        writer.write_all(&metric_id.to_le_bytes())?;
444
445        Ok(())
446    }
447
448    /// Skip FAISS header when reading
449    fn skip_faiss_header(&self, reader: &mut BufReader<File>) -> Result<()> {
450        let mut magic = [0u8; 5];
451        reader.read_exact(&mut magic)?;
452
453        if &magic != b"FAISS" {
454            return Err(anyhow!("Invalid FAISS file format"));
455        }
456
457        // Skip version, type, dimension, count, metric
458        let mut buffer = [0u8; 21]; // 4 + 4 + 4 + 8 + 1
459        reader.read_exact(&mut buffer)?;
460
461        Ok(())
462    }
463
464    /// Write HNSW-specific data
465    fn write_hnsw_data(
466        &self,
467        writer: &mut BufWriter<File>,
468        index: &HnswIndex,
469        _config: &FaissExportConfig,
470    ) -> Result<()> {
471        // Write HNSW parameters
472        let config = index.config();
473        writer.write_all(&(config.m as u32).to_le_bytes())?;
474        writer.write_all(&(config.m_l0 as u32).to_le_bytes())?;
475        writer.write_all(&(config.ef as u32).to_le_bytes())?;
476        writer.write_all(&config.ml.to_le_bytes())?;
477
478        Ok(())
479    }
480
481    /// Write IVF-specific data
482    fn write_ivf_data(
483        &self,
484        writer: &mut BufWriter<File>,
485        index: &IvfIndex,
486        _config: &FaissExportConfig,
487    ) -> Result<()> {
488        // Write IVF parameters
489        let config = index.config();
490        writer.write_all(&(config.n_clusters as u32).to_le_bytes())?;
491        writer.write_all(&(config.n_probes as u32).to_le_bytes())?;
492
493        Ok(())
494    }
495
496    /// Write vectors in chunks for memory efficiency
497    fn write_vectors_chunked<T: VectorIndex>(
498        &self,
499        writer: &mut BufWriter<File>,
500        index: &T,
501        chunk_size: usize,
502    ) -> Result<()> {
503        let total_vectors = self.get_index_size(index)?;
504
505        for chunk_start in (0..total_vectors).step_by(chunk_size) {
506            let chunk_end = std::cmp::min(chunk_start + chunk_size, total_vectors);
507
508            for i in chunk_start..chunk_end {
509                if let Some(vector) = self.get_vector_at_index(index, i) {
510                    self.write_vector(writer, &vector)?;
511                }
512            }
513        }
514
515        Ok(())
516    }
517
518    /// Write a single vector to file
519    fn write_vector(&self, writer: &mut BufWriter<File>, vector: &Vector) -> Result<()> {
520        let data = vector.as_f32();
521        for &value in &data {
522            writer.write_all(&value.to_le_bytes())?;
523        }
524        Ok(())
525    }
526
527    /// Read a single vector from file
528    fn read_vector(&self, reader: &mut BufReader<File>, dimension: usize) -> Result<Vector> {
529        let mut data = vec![0.0f32; dimension];
530        for value in &mut data {
531            let mut bytes = [0u8; 4];
532            reader.read_exact(&mut bytes)?;
533            *value = f32::from_le_bytes(bytes);
534        }
535
536        Ok(Vector::new(data))
537    }
538
539    /// Utility methods for index introspection
540    fn get_index_dimension<T: VectorIndex>(&self, _index: &T) -> Result<usize> {
541        // This would need to be implemented based on the actual VectorIndex trait
542        // For now, return a default dimension
543        Ok(768) // Common dimension for transformer embeddings
544    }
545
    /// Report the number of vectors stored in `index`.
    ///
    /// Placeholder: the index is not inspected and `Ok(0)` is always returned.
    fn get_index_size<T: VectorIndex>(&self, _index: &T) -> Result<usize> {
        // This would need to be implemented based on the actual VectorIndex trait
        Ok(0) // Placeholder
    }
550
    /// Report the similarity metric used by `index`.
    ///
    /// Placeholder: the index is not inspected and cosine is always returned.
    fn get_index_metric<T: VectorIndex>(&self, _index: &T) -> Result<SimilarityMetric> {
        // This would need to be implemented based on the actual VectorIndex trait
        Ok(SimilarityMetric::Cosine) // Default
    }
555
    /// Fetch the vector stored at position `_idx` of `index`.
    ///
    /// Placeholder: always returns `None`, so callers that skip missing
    /// vectors currently write nothing.
    fn get_vector_at_index<T: VectorIndex>(&self, _index: &T, _idx: usize) -> Option<Vector> {
        // This would need to be implemented based on the actual VectorIndex trait
        None // Placeholder
    }
560
561    /// Helper methods for format conversion
562    fn faiss_type_to_id(&self, faiss_type: FaissIndexType) -> u32 {
563        match faiss_type {
564            FaissIndexType::IndexFlatL2 => 0,
565            FaissIndexType::IndexFlatIP => 1,
566            FaissIndexType::IndexIVFFlat => 2,
567            FaissIndexType::IndexIVFPQ => 3,
568            FaissIndexType::IndexHNSWFlat => 4,
569            FaissIndexType::IndexLSH => 5,
570            FaissIndexType::IndexPCAFlat => 6,
571        }
572    }
573
574    fn faiss_metric_to_id(&self, metric: FaissMetricType) -> u8 {
575        match metric {
576            FaissMetricType::L2 => 0,
577            FaissMetricType::InnerProduct => 1,
578            FaissMetricType::Cosine => 2,
579        }
580    }
581
582    fn extract_index_parameters<T: VectorIndex>(
583        &self,
584        _index: &T,
585        _format: FaissIndexType,
586    ) -> Result<HashMap<String, FaissParameter>> {
587        // Extract relevant parameters based on index type
588        let mut params = HashMap::new();
589        params.insert(
590            "created_by".to_string(),
591            FaissParameter::String("oxirs-vec".to_string()),
592        );
593        Ok(params)
594    }
595
596    fn estimate_memory_usage(&self, metadata: &FaissIndexMetadata) -> usize {
597        // Rough estimate based on vectors and dimension
598        metadata.num_vectors * metadata.dimension * 4 // 4 bytes per float
599    }
600
601    fn estimate_accuracy_preservation(
602        &self,
603        _format: FaissIndexType,
604        _config: &FaissExportConfig,
605    ) -> f32 {
606        // Conservative estimate
607        0.95 // 95% accuracy preservation
608    }
609
    // Additional helper methods for reading configurations

    /// Read the HNSW configuration for an index being imported.
    ///
    /// Placeholder: nothing is read from `_reader`; the default `HnswConfig`
    /// is returned. Note that the HNSW export path writes a parameter block
    /// after the header which this function does not consume.
    fn read_hnsw_config(
        &self,
        _reader: &mut BufReader<File>,
        _metadata: &FaissIndexMetadata,
    ) -> Result<HnswConfig> {
        // Read HNSW-specific configuration from FAISS file
        Ok(HnswConfig::default()) // Placeholder
    }
619
    /// Read the IVF configuration for an index being imported.
    ///
    /// Placeholder: nothing is read from `_reader`; the default `IvfConfig`
    /// is returned. Note that the IVF export path writes a parameter block
    /// after the header which this function does not consume.
    fn read_ivf_config(
        &self,
        _reader: &mut BufReader<File>,
        _metadata: &FaissIndexMetadata,
    ) -> Result<IvfConfig> {
        // Read IVF-specific configuration from FAISS file
        Ok(IvfConfig::default()) // Placeholder
    }
628
    /// Write the IVF centroids and inverted lists.
    ///
    /// Placeholder: writes nothing and always succeeds.
    fn write_ivf_structure(&self, _writer: &mut BufWriter<File>, _index: &IvfIndex) -> Result<()> {
        // Write IVF centroids and inverted lists
        Ok(()) // Placeholder
    }
633
    /// Restore the IVF centroids and structure into `_index`.
    ///
    /// Placeholder: reads nothing, leaves the index untouched and always
    /// succeeds.
    fn read_ivf_structure(
        &self,
        _reader: &mut BufReader<File>,
        _index: &mut IvfIndex,
    ) -> Result<()> {
        // Read IVF centroids and structure
        Ok(()) // Placeholder
    }
642
    /// Load vectors from `_reader` into `_index`, `_batch_size` at a time.
    ///
    /// Placeholder: reads nothing, inserts nothing and always succeeds, so
    /// indexes imported through this helper are currently empty.
    fn import_vectors_batched<T: VectorIndex>(
        &self,
        _reader: &mut BufReader<File>,
        _index: &mut T,
        _batch_size: usize,
    ) -> Result<()> {
        // Import vectors in batches for memory efficiency
        Ok(()) // Placeholder
    }
652
    /// Read the metadata describing the index stored at `_input_path`.
    ///
    /// Placeholder: the file is not opened. A fixed record is returned
    /// (HNSW-flat, 768 dimensions, zero vectors, cosine metric, version "1.0")
    /// stamped with the current UTC time, so every import is currently routed
    /// to the HNSW importer.
    fn read_faiss_metadata(&self, _input_path: &Path) -> Result<FaissIndexMetadata> {
        // Read and parse FAISS metadata from file
        Ok(FaissIndexMetadata {
            index_type: FaissIndexType::IndexHNSWFlat,
            dimension: 768,
            num_vectors: 0,
            metric_type: FaissMetricType::Cosine,
            parameters: HashMap::new(),
            version: "1.0".to_string(),
            created_at: chrono::Utc::now().to_rfc3339(),
        }) // Placeholder
    }
665
666    fn write_faiss_index<T: VectorIndex>(
667        &self,
668        index: &T,
669        output_path: &Path,
670        metadata: &FaissIndexMetadata,
671        config: &FaissExportConfig,
672    ) -> Result<()> {
673        match metadata.index_type {
674            FaissIndexType::IndexHNSWFlat => {
675                // Cast to HnswIndex for HNSW export
676                // This is a simplified approach - in practice, you'd need proper type handling
677                if let Some(hnsw_index) = self.try_cast_to_hnsw(index) {
678                    self.export_hnsw_to_faiss(hnsw_index, output_path, metadata, config)
679                } else {
680                    Err(anyhow!("Index is not an HNSW index"))
681                }
682            }
683            FaissIndexType::IndexIVFFlat | FaissIndexType::IndexIVFPQ => {
684                // Cast to IvfIndex for IVF export
685                if let Some(ivf_index) = self.try_cast_to_ivf(index) {
686                    self.export_ivf_to_faiss(ivf_index, output_path, metadata, config)
687                } else {
688                    Err(anyhow!("Index is not an IVF index"))
689                }
690            }
691            _ => Err(anyhow!(
692                "Export format not yet implemented: {:?}",
693                metadata.index_type
694            )),
695        }
696    }
697
    // Helper methods for type casting (simplified approach)

    /// View a generic index as an `HnswIndex`, if it is one.
    ///
    /// Placeholder: always returns `None`, so HNSW export currently fails with
    /// "Index is not an HNSW index".
    fn try_cast_to_hnsw<T: VectorIndex>(&self, _index: &T) -> Option<&HnswIndex> {
        // In practice, this would require proper downcasting or a different approach
        None // Placeholder
    }
703
    /// View a generic index as an `IvfIndex`, if it is one.
    ///
    /// Placeholder: always returns `None`, so IVF export currently fails with
    /// "Index is not an IVF index".
    fn try_cast_to_ivf<T: VectorIndex>(&self, _index: &T) -> Option<&IvfIndex> {
        // In practice, this would require proper downcasting or a different approach
        None // Placeholder
    }
708}
709
710impl Default for FaissCompatibility {
711    fn default() -> Self {
712        Self::new()
713    }
714}
715
/// Simple in-memory vector index for flat FAISS imports
///
/// Vectors and their URIs are kept in two parallel lists: the URI at position
/// `i` names the vector at position `i`.
pub struct SimpleVectorIndex {
    /// Stored vectors, in insertion order.
    vectors: Vec<Vector>,
    /// URI of each stored vector, in the same order as `vectors`.
    uris: Vec<String>,
}
721
722impl SimpleVectorIndex {
723    pub fn new(vectors: Vec<Vector>, uris: Vec<String>) -> Self {
724        Self { vectors, uris }
725    }
726}
727
728impl VectorIndex for SimpleVectorIndex {
729    fn insert(&mut self, uri: String, vector: Vector) -> Result<()> {
730        self.uris.push(uri);
731        self.vectors.push(vector);
732        Ok(())
733    }
734
735    fn search_knn(&self, query: &Vector, k: usize) -> Result<Vec<(String, f32)>> {
736        let mut results = Vec::new();
737
738        for (i, vector) in self.vectors.iter().enumerate() {
739            let similarity = self.compute_similarity(query, vector);
740            results.push((self.uris[i].clone(), similarity));
741        }
742
743        // Sort by similarity (descending) and take top k
744        results.sort_by(|a, b| b.1.partial_cmp(&a.1).unwrap_or(std::cmp::Ordering::Equal));
745        results.truncate(k);
746
747        Ok(results)
748    }
749
750    fn search_threshold(&self, query: &Vector, threshold: f32) -> Result<Vec<(String, f32)>> {
751        let mut results = Vec::new();
752
753        for (i, vector) in self.vectors.iter().enumerate() {
754            let similarity = self.compute_similarity(query, vector);
755            if similarity >= threshold {
756                results.push((self.uris[i].clone(), similarity));
757            }
758        }
759
760        // Sort by similarity (descending)
761        results.sort_by(|a, b| b.1.partial_cmp(&a.1).unwrap_or(std::cmp::Ordering::Equal));
762
763        Ok(results)
764    }
765
766    fn get_vector(&self, uri: &str) -> Option<&Vector> {
767        self.uris
768            .iter()
769            .position(|u| u == uri)
770            .map(|i| &self.vectors[i])
771    }
772}
773
774impl SimpleVectorIndex {
775    fn compute_similarity(&self, v1: &Vector, v2: &Vector) -> f32 {
776        // Simple cosine similarity implementation
777        let v1_data = v1.as_f32();
778        let v2_data = v2.as_f32();
779
780        if v1_data.len() != v2_data.len() {
781            return 0.0;
782        }
783
784        let dot_product: f32 = v1_data.iter().zip(v2_data.iter()).map(|(a, b)| a * b).sum();
785        let magnitude1: f32 = v1_data.iter().map(|x| x * x).sum::<f32>().sqrt();
786        let magnitude2: f32 = v2_data.iter().map(|x| x * x).sum::<f32>().sqrt();
787
788        if magnitude1 == 0.0 || magnitude2 == 0.0 {
789            0.0
790        } else {
791            dot_product / (magnitude1 * magnitude2)
792        }
793    }
794}
795
796/// Utility functions for FAISS compatibility
797pub mod utils {
798    use super::*;
799
800    /// Convert oxirs-vec similarity metric to FAISS metric
801    pub fn convert_metric(metric: SimilarityMetric) -> FaissMetricType {
802        match metric {
803            SimilarityMetric::Cosine => FaissMetricType::Cosine,
804            SimilarityMetric::Euclidean => FaissMetricType::L2,
805            SimilarityMetric::DotProduct => FaissMetricType::InnerProduct,
806            SimilarityMetric::Manhattan => FaissMetricType::L2,
807            // All other metrics approximate with L2 for FAISS compatibility
808            _ => FaissMetricType::L2,
809        }
810    }
811
812    /// Get recommended FAISS format for given constraints
813    pub fn recommend_faiss_format(
814        num_vectors: usize,
815        dimension: usize,
816        memory_constraint: Option<usize>,
817        accuracy_requirement: f32,
818    ) -> FaissIndexType {
819        if num_vectors < 1000 || accuracy_requirement > 0.99 {
820            FaissIndexType::IndexFlatL2
821        } else if dimension > 1000 || memory_constraint.is_some_and(|mem| mem < 1024 * 1024 * 1024)
822        {
823            FaissIndexType::IndexIVFPQ
824        } else if num_vectors > 100000 {
825            FaissIndexType::IndexHNSWFlat
826        } else {
827            FaissIndexType::IndexIVFFlat
828        }
829    }
830
831    /// Estimate memory requirements for FAISS format
832    pub fn estimate_memory_requirement(
833        format: FaissIndexType,
834        num_vectors: usize,
835        dimension: usize,
836    ) -> usize {
837        let base_memory = num_vectors * dimension * 4; // 4 bytes per float
838
839        match format {
840            FaissIndexType::IndexFlatL2 | FaissIndexType::IndexFlatIP => base_memory,
841            FaissIndexType::IndexIVFFlat => base_memory + (num_vectors / 100) * dimension * 4, // Centroids
842            FaissIndexType::IndexIVFPQ => base_memory / 8 + (num_vectors / 100) * dimension * 4, // PQ compression
843            FaissIndexType::IndexHNSWFlat => base_memory * 2, // Graph structure overhead
844            FaissIndexType::IndexLSH => base_memory / 2,      // Hash table compression
845            FaissIndexType::IndexPCAFlat => base_memory, // Assuming no dimension reduction for estimate
846        }
847    }
848}
849
// Unit tests for the compatibility layer's pure helpers and for the in-memory
// flat index; none of them touch the filesystem.
#[cfg(test)]
mod tests {
    use super::*;

    /// A freshly constructed layer advertises at least one supported format.
    #[test]
    fn test_faiss_compatibility_creation() {
        let faiss_compat = FaissCompatibility::new();
        assert!(!faiss_compat.supported_formats.is_empty());
    }

    /// The three directly representable metrics map to their FAISS equivalents.
    #[test]
    fn test_metric_conversion() {
        let faiss_compat = FaissCompatibility::new();

        assert_eq!(
            faiss_compat.convert_similarity_metric(SimilarityMetric::Cosine),
            FaissMetricType::Cosine
        );
        assert_eq!(
            faiss_compat.convert_similarity_metric(SimilarityMetric::Euclidean),
            FaissMetricType::L2
        );
        assert_eq!(
            faiss_compat.convert_similarity_metric(SimilarityMetric::DotProduct),
            FaissMetricType::InnerProduct
        );
    }

    /// k-NN over three orthogonal unit vectors returns the query's own vector first.
    #[test]
    fn test_simple_vector_index() {
        let vectors = vec![
            Vector::new(vec![1.0, 0.0, 0.0]),
            Vector::new(vec![0.0, 1.0, 0.0]),
            Vector::new(vec![0.0, 0.0, 1.0]),
        ];
        let uris = vec!["v1".to_string(), "v2".to_string(), "v3".to_string()];

        let index = SimpleVectorIndex::new(vectors, uris);

        let query = Vector::new(vec![1.0, 0.0, 0.0]);
        let results = index.search_knn(&query, 2).unwrap();

        assert_eq!(results.len(), 2);
        assert_eq!(results[0].0, "v1"); // Should be most similar to itself
    }

    /// The recommendation heuristic picks the expected format for each size class.
    #[test]
    fn test_format_recommendation() {
        use crate::faiss_compatibility::utils::recommend_faiss_format;

        // Small dataset
        assert_eq!(
            recommend_faiss_format(100, 128, None, 0.9),
            FaissIndexType::IndexFlatL2
        );

        // Large dataset
        assert_eq!(
            recommend_faiss_format(1000000, 128, None, 0.8),
            FaissIndexType::IndexHNSWFlat
        );

        // High dimensional with memory constraint
        assert_eq!(
            recommend_faiss_format(50000, 2048, Some(512 * 1024 * 1024), 0.8),
            FaissIndexType::IndexIVFPQ
        );
    }
}