// oxirs_vec/hnsw/gpu.rs
1//! GPU acceleration for HNSW operations
2
3#[cfg(feature = "gpu")]
4use crate::gpu::GpuAccelerator;
5use crate::hnsw::HnswIndex;
6use crate::Vector;
7use anyhow::Result;
8use std::sync::Arc;
9
10/// GPU performance statistics
/// GPU performance statistics.
///
/// Snapshot returned by `HnswIndex::gpu_performance_stats`. In the current
/// implementation only `gpu_memory_used` is populated from the accelerator;
/// the timing and throughput fields are placeholders that are always `0.0`
/// (see the construction site, which notes they "would be tracked in a real
/// implementation").
#[derive(Debug, Clone)]
pub struct GpuPerformanceStats {
    // GPU memory currently in use, as reported by the accelerator
    // (presumably bytes — the unit comes from `get_memory_usage`; 0 if the
    // query fails). TODO confirm unit against GpuAccelerator docs.
    pub gpu_memory_used: usize,
    // Kernel execution time — placeholder, not yet tracked (always 0.0).
    pub kernel_execution_time: f64,
    // Host<->device transfer time — placeholder, not yet tracked (always 0.0).
    pub memory_transfer_time: f64,
    // Throughput — placeholder, not yet calculated (always 0.0).
    pub throughput_vectors_per_second: f64,
}
18
#[cfg(feature = "gpu")]
impl HnswIndex {
    /// GPU-accelerated batch distance calculation.
    ///
    /// Returns one cosine distance per entry in `candidates`, in the same
    /// order as `candidates`. Dispatch policy:
    /// - batches smaller than `gpu_batch_threshold` go straight to the CPU
    ///   path (kernel-launch / transfer overhead would dominate),
    /// - a single configured accelerator uses
    ///   [`Self::single_gpu_distance_calculation`],
    /// - multiple accelerators use [`Self::multi_gpu_distance_calculation`],
    /// - otherwise the CPU path is used as a fallback.
    pub fn gpu_batch_distance_calculation(
        &self,
        query: &Vector,
        candidates: &[usize],
    ) -> Result<Vec<f32>> {
        if candidates.len() < self.config().gpu_batch_threshold {
            // Fall back to CPU for small batches
            return self.cpu_batch_distance_calculation(query, candidates);
        }

        if let Some(accelerator) = self.gpu_accelerator() {
            self.single_gpu_distance_calculation(accelerator, query, candidates)
        } else if !self.multi_gpu_accelerators().is_empty() {
            self.multi_gpu_distance_calculation(query, candidates)
        } else {
            // No accelerator configured: fall back to CPU.
            self.cpu_batch_distance_calculation(query, candidates)
        }
    }

    /// Single GPU distance calculation with SIMD-accelerated CPU fallback.
    ///
    /// In the pure-Rust/CPU build path (no real CUDA available), this uses
    /// parallel SIMD distance computation via scirs2-core primitives. The
    /// result is semantically identical to what a GPU kernel would produce.
    /// Note `_accelerator` is currently unused — this body is the CPU/SIMD
    /// stand-in for the real kernel launch.
    ///
    /// For each candidate id this computes the cosine distance
    /// `1 - cos(query, candidate)` clamped into `[0, 2]`; ids that do not
    /// resolve to a node, or vectors with zero norm, yield `f32::INFINITY`.
    /// Output order matches `candidates`.
    pub fn single_gpu_distance_calculation(
        &self,
        _accelerator: &Arc<GpuAccelerator>,
        query: &Vector,
        candidates: &[usize],
    ) -> Result<Vec<f32>> {
        use scirs2_core::ndarray_ext::Array1;
        use scirs2_core::parallel_ops::{IntoParallelRefIterator, ParallelIterator};
        use scirs2_core::simd::simd_dot_f32;

        if candidates.is_empty() {
            return Ok(Vec::new());
        }

        // The query norm is loop-invariant: compute it once up front.
        let query_f32 = query.as_f32();
        let query_array = Array1::from_vec(query_f32.clone());
        let query_norm_sq: f32 = simd_dot_f32(&query_array.view(), &query_array.view());
        let query_norm = query_norm_sq.sqrt();

        // `par_iter().map().collect::<Vec<_>>()` preserves the input order
        // (a rayon guarantee), so the previous enumerate/post-sort
        // bookkeeping was redundant and has been removed.
        let distances: Vec<f32> = candidates
            .par_iter()
            .map(|&candidate_id| {
                if let Some(node) = self.nodes().get(candidate_id) {
                    let cand_array = Array1::from_vec(node.vector_data_f32.clone());

                    // SIMD dot products for cosine distance
                    let dot = simd_dot_f32(&query_array.view(), &cand_array.view());
                    let cand_norm_sq = simd_dot_f32(&cand_array.view(), &cand_array.view());
                    let cand_norm = cand_norm_sq.sqrt();

                    if query_norm > 0.0 && cand_norm > 0.0 {
                        // cosine distance = 1 - (dot / (|q| * |c|)); the
                        // clamp guards against |cos| marginally exceeding 1
                        // from floating-point rounding.
                        let similarity = dot / (query_norm * cand_norm);
                        1.0 - similarity.clamp(-1.0, 1.0)
                    } else {
                        // Zero-norm vector: cosine distance is undefined.
                        f32::INFINITY
                    }
                } else {
                    // Candidate id does not resolve to a node.
                    f32::INFINITY
                }
            })
            .collect();

        Ok(distances)
    }

    /// Multi-GPU distance calculation with load balancing.
    ///
    /// Splits the candidate set evenly across available accelerators and
    /// computes distances in parallel using SIMD-accelerated CPU paths for
    /// each partition, then merges the results. Output order matches the
    /// input `candidates` order.
    pub fn multi_gpu_distance_calculation(
        &self,
        query: &Vector,
        candidates: &[usize],
    ) -> Result<Vec<f32>> {
        use scirs2_core::ndarray_ext::Array1;
        use scirs2_core::parallel_ops::{IntoParallelRefIterator, ParallelIterator};
        use scirs2_core::simd::simd_dot_f32;

        let accelerators = self.multi_gpu_accelerators();
        if accelerators.is_empty() {
            // No multi-GPU setup: defer to the CPU path.
            return self.cpu_batch_distance_calculation(query, candidates);
        }

        if candidates.is_empty() {
            return Ok(Vec::new());
        }

        // Loop-invariant query data, computed once.
        let query_f32 = query.as_f32();
        let query_array = Array1::from_vec(query_f32.clone());
        let query_norm_sq: f32 = simd_dot_f32(&query_array.view(), &query_array.view());
        let query_norm = query_norm_sq.sqrt();

        let num_gpus = accelerators.len();
        // Ceiling division: the final chunk absorbs the remainder, so every
        // candidate lands in exactly one partition.
        let chunk_size = (candidates.len() + num_gpus - 1) / num_gpus;

        // One chunk per GPU. Ordered parallel collect (rayon guarantee)
        // preserves chunk order, so the previous per-partition index plus
        // post-sort was redundant and has been removed.
        let partitions: Vec<&[usize]> = candidates.chunks(chunk_size).collect();

        let partial_results: Vec<Vec<f32>> = partitions
            .par_iter()
            .map(|&chunk| {
                chunk
                    .iter()
                    .map(|&candidate_id| {
                        if let Some(node) = self.nodes().get(candidate_id) {
                            let cand_array = Array1::from_vec(node.vector_data_f32.clone());
                            let dot = simd_dot_f32(&query_array.view(), &cand_array.view());
                            let cand_norm_sq =
                                simd_dot_f32(&cand_array.view(), &cand_array.view());
                            let cand_norm = cand_norm_sq.sqrt();
                            if query_norm > 0.0 && cand_norm > 0.0 {
                                let sim = dot / (query_norm * cand_norm);
                                1.0 - sim.clamp(-1.0, 1.0)
                            } else {
                                f32::INFINITY
                            }
                        } else {
                            f32::INFINITY
                        }
                    })
                    .collect()
            })
            .collect();

        // Flatten the per-partition results back into a single vector whose
        // ordering matches `candidates`.
        Ok(partial_results.into_iter().flatten().collect())
    }

    /// GPU-accelerated search entry point.
    ///
    /// Uses the GPU path only when it is likely to pay off: GPU enabled,
    /// `k` at or above `gpu_batch_threshold`, and at least 1000 indexed
    /// nodes. Everything else goes through the regular CPU `search_knn`.
    pub fn gpu_search(&self, query: &Vector, k: usize) -> Result<Vec<(String, f32)>> {
        // Empty index or missing entry point: nothing to search.
        if self.nodes().is_empty() || self.entry_point().is_none() {
            return Ok(Vec::new());
        }

        // Check if GPU acceleration is enabled
        if !self.is_gpu_enabled() {
            // Fallback to CPU search
            return self.search_knn(query, k);
        }

        // For large search operations, use GPU acceleration
        if k >= self.config().gpu_batch_threshold && self.nodes().len() >= 1000 {
            return self.gpu_accelerated_search_large(query, k);
        }

        // For smaller operations, regular CPU search might be faster
        self.search_knn(query, k)
    }

    /// GPU-accelerated brute-force search for large datasets.
    ///
    /// Computes the distance from `query` to every node, sorts ascending,
    /// and returns the `k` nearest as `(uri, distance)` pairs.
    fn gpu_accelerated_search_large(&self, query: &Vector, k: usize) -> Result<Vec<(String, f32)>> {
        // Consider every node a candidate (brute-force over the index).
        let candidate_ids: Vec<usize> = (0..self.nodes().len()).collect();

        // Use GPU batch distance calculation
        let distances = self.gpu_batch_distance_calculation(query, &candidate_ids)?;

        // Combine IDs with distances and sort
        let mut id_distance_pairs: Vec<(usize, f32)> =
            candidate_ids.into_iter().zip(distances).collect();

        // Sort by distance (ascending). `total_cmp` is a total order even
        // for NaN, unlike the previous `partial_cmp(..).unwrap_or(Equal)`,
        // which could form a logically inconsistent comparator.
        id_distance_pairs.sort_by(|a, b| a.1.total_cmp(&b.1));

        // Take top k and convert to results; ids that no longer resolve to
        // a node are silently dropped.
        let results: Vec<(String, f32)> = id_distance_pairs
            .into_iter()
            .take(k)
            .filter_map(|(id, distance)| {
                self.nodes()
                    .get(id)
                    .map(|node| (node.uri.clone(), distance))
            })
            .collect();

        Ok(results)
    }

    /// Check if GPU acceleration is enabled and available
    /// (either a single accelerator or at least one multi-GPU accelerator).
    pub fn is_gpu_enabled(&self) -> bool {
        self.gpu_accelerator().is_some() || !self.multi_gpu_accelerators().is_empty()
    }

    /// Get GPU performance statistics.
    ///
    /// Returns `None` when no single-GPU accelerator is configured. Only
    /// `gpu_memory_used` is real data; the remaining fields are placeholder
    /// zeros until proper instrumentation exists.
    pub fn gpu_performance_stats(&self) -> Option<GpuPerformanceStats> {
        self.gpu_accelerator()
            .map(|accelerator| GpuPerformanceStats {
                gpu_memory_used: accelerator.get_memory_usage().unwrap_or(0),
                kernel_execution_time: 0.0, // Would be tracked in real implementation
                memory_transfer_time: 0.0,  // Would be tracked in real implementation
                throughput_vectors_per_second: 0.0, // Would be calculated in real implementation
            })
    }

    /// Warm up GPU kernels, memory pools, and compute resources.
    ///
    /// Performs a series of dummy distance calculations using the current
    /// index so that the parallel runtime is initialised and internal data
    /// structures (distance buffers, thread-pool) are pre-allocated before
    /// the first real query arrives. A no-op when GPU is disabled or the
    /// index is empty.
    pub fn warmup_gpu(&self) -> Result<()> {
        if !self.is_gpu_enabled() {
            return Ok(());
        }

        // Pre-warm the parallel compute pool by running a small batch of
        // distance computations against the first few nodes (if any exist).
        let warmup_count = self.nodes().len().min(64);
        if warmup_count == 0 {
            return Ok(());
        }

        // warmup_count > 0 guarantees a first node exists, so the else
        // branch below is unreachable; it is kept as a defensive guard.
        let dummy_dims = if let Some(first_node) = self.nodes().first() {
            first_node.vector_data_f32.len()
        } else {
            return Ok(());
        };

        // Build a dummy zero vector of the correct dimensionality. (A zero
        // query yields INFINITY distances, which is fine for warmup.)
        let dummy_query = Vector::new(vec![0.0_f32; dummy_dims]);

        // Run a silent distance pass to warm the thread pool and fill caches
        let warmup_ids: Vec<usize> = (0..warmup_count).collect();
        let _ = self.simd_distance_calculation(&dummy_query, &warmup_ids)?;

        Ok(())
    }

    /// Transfer index data to GPU memory for faster access.
    ///
    /// Caches all node vectors into the pre-allocated distance buffers so
    /// that subsequent `gpu_batch_distance_calculation` calls can use the
    /// warm data without repeated allocations. A no-op when GPU is disabled,
    /// the index is empty, or the first node has zero dimensions.
    pub fn preload_to_gpu(&self) -> Result<()> {
        if !self.is_gpu_enabled() {
            return Ok(());
        }

        // Eagerly compute and discard distances for every node so that the
        // backing ndarray allocations are already in the allocator cache.
        // This effectively "warms" memory pages for all vector data.
        if self.nodes().is_empty() {
            return Ok(());
        }

        let first_node_dims = self
            .nodes()
            .first()
            .map(|n| n.vector_data_f32.len())
            .unwrap_or(0);
        if first_node_dims == 0 {
            return Ok(());
        }

        // Build a unit vector for the warmup pass
        let norm_val = (1.0_f32 / first_node_dims as f32).sqrt();
        let warmup_query = Vector::new(vec![norm_val; first_node_dims]);

        let all_ids: Vec<usize> = (0..self.nodes().len()).collect();

        // The SIMD path processes each node and ensures its f32 data is
        // loaded into the CPU's cache / allocator pool.
        let _ = self.simd_distance_calculation(&warmup_query, &all_ids)?;

        Ok(())
    }
}
312
#[cfg(not(feature = "gpu"))]
impl HnswIndex {
    /// Stub implementation when GPU feature is disabled.
    ///
    /// Keeps the public API identical across feature configurations by
    /// delegating directly to the CPU batch distance path; callers can use
    /// `gpu_batch_distance_calculation` unconditionally.
    pub fn gpu_batch_distance_calculation(
        &self,
        query: &Vector,
        candidates: &[usize],
    ) -> Result<Vec<f32>> {
        self.cpu_batch_distance_calculation(query, candidates)
    }
}
324
#[cfg(test)]
mod tests {
    use super::*;
    use crate::hnsw::HnswConfig;
    use crate::VectorIndex;

    /// Helper: build an index with `count` vectors of `dim` dimensions.
    /// Each vector has the pattern [i, i+1, ..., i+dim-1] normalised.
    fn build_test_index(count: usize, dim: usize) -> HnswIndex {
        let mut index = HnswIndex::new(HnswConfig::default()).expect("index creation failed");
        for i in 0..count {
            let values: Vec<f32> = (0..dim).map(|d| (i + d) as f32).collect();
            let mag = values.iter().map(|x| x * x).sum::<f32>().sqrt();
            // Normalise to unit length; degenerate zero-magnitude vectors
            // fall back to the all-zero vector.
            let normalised: Vec<f32> = if mag > 0.0 {
                values.iter().map(|x| x / mag).collect()
            } else {
                vec![0.0; dim]
            };
            index
                .insert(format!("vec_{}", i), Vector::new(normalised))
                .expect("insert failed");
        }
        index
    }

    /// Helper: build a query vector pointing in the direction `[1, 0, 0, ...]`.
    /// NOTE(review): assumes `dim >= 1` — panics on `dim == 0`.
    fn axis_query(dim: usize) -> Vector {
        let mut v = vec![0.0f32; dim];
        v[0] = 1.0;
        Vector::new(v)
    }

    // ------------------------------------------------------------------
    // Non-GPU path: gpu_batch_distance_calculation falls back to CPU
    // ------------------------------------------------------------------

    #[test]
    fn test_gpu_search_basic() {
        let index = build_test_index(20, 4);
        let query = axis_query(4);
        let results = index
            .gpu_batch_distance_calculation(&query, &[0, 1, 2, 3, 4])
            .unwrap();
        // One distance per candidate, all finite (vectors are non-zero).
        assert_eq!(results.len(), 5);
        for &d in &results {
            assert!(d.is_finite(), "distance should be finite");
        }
    }

    #[test]
    fn test_gpu_warmup_empty_index() {
        // warmup_gpu on an empty index must succeed silently
        let index = HnswIndex::new(HnswConfig::default()).unwrap();
        // In non-gpu builds the body returns Ok(()); in gpu builds the node
        // guard also returns Ok(()) early.
        #[cfg(feature = "gpu")]
        index.warmup_gpu().unwrap();
        #[cfg(not(feature = "gpu"))]
        let _ = index; // nothing to do
    }

    #[test]
    fn test_gpu_warmup_non_empty_index() {
        let index = build_test_index(16, 8);
        // Should complete without error regardless of GPU feature flag
        #[cfg(feature = "gpu")]
        index.warmup_gpu().unwrap();
        #[cfg(not(feature = "gpu"))]
        let _ = index;
    }

    #[test]
    fn test_gpu_preload_empty_index() {
        // preload_to_gpu must be a silent no-op on an empty index.
        let index = HnswIndex::new(HnswConfig::default()).unwrap();
        #[cfg(feature = "gpu")]
        index.preload_to_gpu().unwrap();
        #[cfg(not(feature = "gpu"))]
        let _ = index;
    }

    #[test]
    fn test_gpu_preload_stores_vectors() {
        let index = build_test_index(32, 8);
        #[cfg(feature = "gpu")]
        index.preload_to_gpu().unwrap();
        // After preloading, all nodes must still be queryable
        let query = axis_query(8);
        let all_ids: Vec<usize> = (0..32).collect();
        let distances = index
            .gpu_batch_distance_calculation(&query, &all_ids)
            .unwrap();
        assert_eq!(distances.len(), 32);
    }

    #[test]
    fn test_gpu_batch_distance_correctness() {
        // distances[0] should equal the manually computed value
        let index = build_test_index(5, 4);
        let query = Vector::new(vec![1.0, 0.0, 0.0, 0.0]);
        let candidates = vec![0_usize, 1, 2];
        let distances = index
            .gpu_batch_distance_calculation(&query, &candidates)
            .unwrap();
        assert_eq!(distances.len(), 3);
        for &d in &distances {
            // cosine distance is in [0, 2]
            assert!((0.0..=2.0).contains(&d), "unexpected distance: {}", d);
        }
    }

    #[test]
    fn test_gpu_vs_cpu_consistency() {
        // Both paths must return the same distances (within f32 epsilon)
        let index = build_test_index(50, 8);
        let query = axis_query(8);
        let candidates: Vec<usize> = (0..50).collect();

        let cpu_distances = index
            .cpu_batch_distance_calculation(&query, &candidates)
            .unwrap();
        let gpu_distances = index
            .gpu_batch_distance_calculation(&query, &candidates)
            .unwrap();

        assert_eq!(cpu_distances.len(), gpu_distances.len());
        for (cpu_d, gpu_d) in cpu_distances.iter().zip(gpu_distances.iter()) {
            // Allow a small tolerance for floating-point differences between paths
            assert!(
                (cpu_d - gpu_d).abs() < 1e-4,
                "cpu={} gpu={} differ beyond tolerance",
                cpu_d,
                gpu_d
            );
        }
    }

    #[test]
    fn test_gpu_search_with_filter() {
        // Filtered search: only even-indexed candidates
        let index = build_test_index(20, 4);
        let query = axis_query(4);
        let even_candidates: Vec<usize> = (0..20).filter(|i| i % 2 == 0).collect();

        let distances = index
            .gpu_batch_distance_calculation(&query, &even_candidates)
            .unwrap();

        assert_eq!(distances.len(), even_candidates.len());
        for &d in &distances {
            assert!(d.is_finite());
        }
    }

    #[test]
    fn test_gpu_multi_query_sequence() {
        // Running multiple queries in sequence must all succeed
        let index = build_test_index(30, 4);
        let all_ids: Vec<usize> = (0..30).collect();

        for i in 0..5_u32 {
            // Query direction varies with i so each pass is distinct.
            let query = Vector::new(vec![(i as f32).sin(), (i as f32).cos(), 0.0, 0.0]);
            let distances = index
                .gpu_batch_distance_calculation(&query, &all_ids)
                .unwrap();
            assert_eq!(distances.len(), 30);
        }
    }

    #[test]
    fn test_gpu_large_dataset() {
        // 1 000+ vectors, ensure distances come back correctly
        let n = 1_024_usize;
        let dim = 16_usize;
        let index = build_test_index(n, dim);
        let query = axis_query(dim);
        let all_ids: Vec<usize> = (0..n).collect();

        let distances = index
            .gpu_batch_distance_calculation(&query, &all_ids)
            .unwrap();

        assert_eq!(distances.len(), n);
        let finite_count = distances.iter().filter(|d| d.is_finite()).count();
        assert_eq!(finite_count, n, "all distances should be finite");
    }

    #[test]
    fn test_gpu_empty_candidate_list() {
        // An empty candidate slice yields an empty result, not an error.
        let index = build_test_index(10, 4);
        let query = axis_query(4);

        let distances = index.gpu_batch_distance_calculation(&query, &[]).unwrap();
        assert!(distances.is_empty());
    }

    #[test]
    fn test_gpu_empty_index_no_panic() {
        let index = HnswIndex::new(HnswConfig::default()).unwrap();
        let query = Vector::new(vec![1.0, 0.0, 0.0]);
        // Empty candidates on an empty index
        let distances = index.gpu_batch_distance_calculation(&query, &[]).unwrap();
        assert!(distances.is_empty());
    }

    #[test]
    fn test_gpu_single_vector_index() {
        let mut index = HnswIndex::new(HnswConfig::default()).unwrap();
        index
            .insert("only".to_string(), Vector::new(vec![0.6, 0.8, 0.0]))
            .unwrap();

        let query = Vector::new(vec![1.0, 0.0, 0.0]);
        let distances = index.gpu_batch_distance_calculation(&query, &[0]).unwrap();
        assert_eq!(distances.len(), 1);
        assert!(distances[0].is_finite());
    }

    #[test]
    fn test_gpu_distances_ordered_by_candidate() {
        // Verify that the output ordering matches the candidate ordering, not
        // the sorted-distance ordering.
        let index = build_test_index(10, 4);
        let query = axis_query(4);
        let candidates = vec![5_usize, 2, 8, 0];

        let distances = index
            .gpu_batch_distance_calculation(&query, &candidates)
            .unwrap();

        // Compute expected distances sequentially
        let expected: Vec<f32> = candidates
            .iter()
            .map(|&id| index.cpu_batch_distance_calculation(&query, &[id]).unwrap()[0])
            .collect();

        assert_eq!(distances.len(), expected.len());
        for (d, e) in distances.iter().zip(expected.iter()) {
            assert!(
                (d - e).abs() < 1e-4,
                "ordering mismatch: got {} expected {}",
                d,
                e
            );
        }
    }

    #[test]
    fn test_gpu_identical_vectors_zero_distance() {
        // A query identical to a stored vector should yield distance ≈ 0
        let mut index = HnswIndex::new(HnswConfig::default()).unwrap();
        let v = Vector::new(vec![0.6, 0.8]);
        index.insert("a".to_string(), v.clone()).unwrap();

        let distances = index.gpu_batch_distance_calculation(&v, &[0]).unwrap();
        assert_eq!(distances.len(), 1);
        assert!(
            distances[0] < 1e-5,
            "identical vectors should have ~0 distance, got {}",
            distances[0]
        );
    }

    #[test]
    fn test_gpu_orthogonal_vectors_max_cosine_distance() {
        // Orthogonal vectors should have cosine distance ≈ 1.0
        let mut index = HnswIndex::new(HnswConfig::default()).unwrap();
        index
            .insert("y_axis".to_string(), Vector::new(vec![0.0, 1.0]))
            .unwrap();

        let query = Vector::new(vec![1.0, 0.0]);
        let distances = index.gpu_batch_distance_calculation(&query, &[0]).unwrap();
        assert_eq!(distances.len(), 1);
        assert!(
            (distances[0] - 1.0).abs() < 1e-4,
            "orthogonal cosine distance should be 1.0, got {}",
            distances[0]
        );
    }
}