1#[cfg(feature = "gpu")]
4use crate::gpu::GpuAccelerator;
5use crate::hnsw::HnswIndex;
6use crate::Vector;
7use anyhow::Result;
8use std::sync::Arc;
9
10#[derive(Debug, Clone)]
12pub struct GpuPerformanceStats {
13 pub gpu_memory_used: usize,
14 pub kernel_execution_time: f64,
15 pub memory_transfer_time: f64,
16 pub throughput_vectors_per_second: f64,
17}
18
19#[cfg(feature = "gpu")]
20impl HnswIndex {
21 pub fn gpu_batch_distance_calculation(
23 &self,
24 query: &Vector,
25 candidates: &[usize],
26 ) -> Result<Vec<f32>> {
27 if candidates.len() < self.config().gpu_batch_threshold {
28 return self.cpu_batch_distance_calculation(query, candidates);
30 }
31
32 if let Some(accelerator) = self.gpu_accelerator() {
33 self.single_gpu_distance_calculation(accelerator, query, candidates)
34 } else if !self.multi_gpu_accelerators().is_empty() {
35 self.multi_gpu_distance_calculation(query, candidates)
36 } else {
37 self.cpu_batch_distance_calculation(query, candidates)
39 }
40 }
41
42 pub fn single_gpu_distance_calculation(
48 &self,
49 _accelerator: &Arc<GpuAccelerator>,
50 query: &Vector,
51 candidates: &[usize],
52 ) -> Result<Vec<f32>> {
53 use scirs2_core::ndarray_ext::Array1;
54 use scirs2_core::parallel_ops::{IntoParallelRefIterator, ParallelIterator};
55 use scirs2_core::simd::simd_dot_f32;
56
57 if candidates.is_empty() {
58 return Ok(Vec::new());
59 }
60
61 let query_f32 = query.as_f32();
62 let query_array = Array1::from_vec(query_f32.clone());
63 let query_norm_sq: f32 = simd_dot_f32(&query_array.view(), &query_array.view());
64 let query_norm = query_norm_sq.sqrt();
65
66 let indexed: Vec<(usize, usize)> = candidates.iter().copied().enumerate().collect();
69
70 let mut distances: Vec<(usize, f32)> = indexed
71 .par_iter()
72 .map(|&(pos, candidate_id)| {
73 let dist = if let Some(node) = self.nodes().get(candidate_id) {
74 let cand_f32 = &node.vector_data_f32;
75 let cand_array = Array1::from_vec(cand_f32.clone());
76
77 let dot = simd_dot_f32(&query_array.view(), &cand_array.view());
79 let cand_norm_sq = simd_dot_f32(&cand_array.view(), &cand_array.view());
80 let cand_norm = cand_norm_sq.sqrt();
81
82 if query_norm > 0.0 && cand_norm > 0.0 {
83 let similarity = dot / (query_norm * cand_norm);
85 1.0 - similarity.clamp(-1.0, 1.0)
86 } else {
87 f32::INFINITY
88 }
89 } else {
90 f32::INFINITY
91 };
92 (pos, dist)
93 })
94 .collect();
95
96 distances.sort_by_key(|(pos, _)| *pos);
98 Ok(distances.into_iter().map(|(_, d)| d).collect())
99 }
100
101 pub fn multi_gpu_distance_calculation(
107 &self,
108 query: &Vector,
109 candidates: &[usize],
110 ) -> Result<Vec<f32>> {
111 use scirs2_core::ndarray_ext::Array1;
112 use scirs2_core::parallel_ops::{IntoParallelRefIterator, ParallelIterator};
113 use scirs2_core::simd::simd_dot_f32;
114
115 let accelerators = self.multi_gpu_accelerators();
116 if accelerators.is_empty() {
117 return self.cpu_batch_distance_calculation(query, candidates);
118 }
119
120 if candidates.is_empty() {
121 return Ok(Vec::new());
122 }
123
124 let query_f32 = query.as_f32();
125 let query_array = Array1::from_vec(query_f32.clone());
126 let query_norm_sq: f32 = simd_dot_f32(&query_array.view(), &query_array.view());
127 let query_norm = query_norm_sq.sqrt();
128
129 let num_gpus = accelerators.len();
130 let chunk_size = (candidates.len() + num_gpus - 1) / num_gpus;
132
133 let partitions: Vec<(usize, &[usize])> =
134 candidates.chunks(chunk_size).enumerate().collect();
135
136 let mut partial_results: Vec<(usize, Vec<f32>)> = partitions
138 .par_iter()
139 .map(|&(partition_idx, chunk)| {
140 let chunk_distances: Vec<f32> = chunk
141 .iter()
142 .map(|&candidate_id| {
143 if let Some(node) = self.nodes().get(candidate_id) {
144 let cand_array = Array1::from_vec(node.vector_data_f32.clone());
145 let dot = simd_dot_f32(&query_array.view(), &cand_array.view());
146 let cand_norm_sq = simd_dot_f32(&cand_array.view(), &cand_array.view());
147 let cand_norm = cand_norm_sq.sqrt();
148 if query_norm > 0.0 && cand_norm > 0.0 {
149 let sim = dot / (query_norm * cand_norm);
150 1.0 - sim.clamp(-1.0, 1.0)
151 } else {
152 f32::INFINITY
153 }
154 } else {
155 f32::INFINITY
156 }
157 })
158 .collect();
159 (partition_idx, chunk_distances)
160 })
161 .collect();
162
163 partial_results.sort_by_key(|(idx, _)| *idx);
165 let distances: Vec<f32> = partial_results.into_iter().flat_map(|(_, v)| v).collect();
166
167 Ok(distances)
168 }
169
170 pub fn gpu_search(&self, query: &Vector, k: usize) -> Result<Vec<(String, f32)>> {
172 if self.nodes().is_empty() || self.entry_point().is_none() {
175 return Ok(Vec::new());
176 }
177
178 if !self.is_gpu_enabled() {
180 return self.search_knn(query, k);
182 }
183
184 if k >= self.config().gpu_batch_threshold && self.nodes().len() >= 1000 {
186 return self.gpu_accelerated_search_large(query, k);
187 }
188
189 self.search_knn(query, k)
191 }
192
193 fn gpu_accelerated_search_large(&self, query: &Vector, k: usize) -> Result<Vec<(String, f32)>> {
195 let candidate_ids: Vec<usize> = (0..self.nodes().len()).collect();
197
198 let distances = self.gpu_batch_distance_calculation(query, &candidate_ids)?;
200
201 let mut id_distance_pairs: Vec<(usize, f32)> =
203 candidate_ids.into_iter().zip(distances).collect();
204
205 id_distance_pairs
207 .sort_by(|a, b| a.1.partial_cmp(&b.1).unwrap_or(std::cmp::Ordering::Equal));
208
209 let results: Vec<(String, f32)> = id_distance_pairs
211 .into_iter()
212 .take(k)
213 .filter_map(|(id, distance)| {
214 self.nodes()
215 .get(id)
216 .map(|node| (node.uri.clone(), distance))
217 })
218 .collect();
219
220 Ok(results)
221 }
222
223 pub fn is_gpu_enabled(&self) -> bool {
225 self.gpu_accelerator().is_some() || !self.multi_gpu_accelerators().is_empty()
226 }
227
228 pub fn gpu_performance_stats(&self) -> Option<GpuPerformanceStats> {
230 self.gpu_accelerator()
231 .map(|accelerator| GpuPerformanceStats {
232 gpu_memory_used: accelerator.get_memory_usage().unwrap_or(0),
233 kernel_execution_time: 0.0, memory_transfer_time: 0.0, throughput_vectors_per_second: 0.0, })
237 }
238
239 pub fn warmup_gpu(&self) -> Result<()> {
246 if !self.is_gpu_enabled() {
247 return Ok(());
248 }
249
250 let warmup_count = self.nodes().len().min(64);
253 if warmup_count == 0 {
254 return Ok(());
255 }
256
257 let dummy_dims = if let Some(first_node) = self.nodes().first() {
258 first_node.vector_data_f32.len()
259 } else {
260 return Ok(());
261 };
262
263 let dummy_query = Vector::new(vec![0.0_f32; dummy_dims]);
265
266 let warmup_ids: Vec<usize> = (0..warmup_count).collect();
268 let _ = self.simd_distance_calculation(&dummy_query, &warmup_ids)?;
269
270 Ok(())
271 }
272
273 pub fn preload_to_gpu(&self) -> Result<()> {
279 if !self.is_gpu_enabled() {
280 return Ok(());
281 }
282
283 if self.nodes().is_empty() {
287 return Ok(());
288 }
289
290 let first_node_dims = self
291 .nodes()
292 .first()
293 .map(|n| n.vector_data_f32.len())
294 .unwrap_or(0);
295 if first_node_dims == 0 {
296 return Ok(());
297 }
298
299 let norm_val = (1.0_f32 / first_node_dims as f32).sqrt();
301 let warmup_query = Vector::new(vec![norm_val; first_node_dims]);
302
303 let all_ids: Vec<usize> = (0..self.nodes().len()).collect();
304
305 let _ = self.simd_distance_calculation(&warmup_query, &all_ids)?;
308
309 Ok(())
310 }
311}
312
313#[cfg(not(feature = "gpu"))]
314impl HnswIndex {
315 pub fn gpu_batch_distance_calculation(
317 &self,
318 query: &Vector,
319 candidates: &[usize],
320 ) -> Result<Vec<f32>> {
321 self.cpu_batch_distance_calculation(query, candidates)
322 }
323}
324
325#[cfg(test)]
326mod tests {
327 use super::*;
328 use crate::hnsw::HnswConfig;
329 use crate::VectorIndex;
330
331 fn build_test_index(count: usize, dim: usize) -> HnswIndex {
334 let mut index = HnswIndex::new(HnswConfig::default()).expect("index creation failed");
335 for i in 0..count {
336 let values: Vec<f32> = (0..dim).map(|d| (i + d) as f32).collect();
337 let mag = values.iter().map(|x| x * x).sum::<f32>().sqrt();
338 let normalised: Vec<f32> = if mag > 0.0 {
339 values.iter().map(|x| x / mag).collect()
340 } else {
341 vec![0.0; dim]
342 };
343 index
344 .insert(format!("vec_{}", i), Vector::new(normalised))
345 .expect("insert failed");
346 }
347 index
348 }
349
350 fn axis_query(dim: usize) -> Vector {
352 let mut v = vec![0.0f32; dim];
353 v[0] = 1.0;
354 Vector::new(v)
355 }
356
357 #[test]
362 fn test_gpu_search_basic() {
363 let index = build_test_index(20, 4);
364 let query = axis_query(4);
365 let results = index
366 .gpu_batch_distance_calculation(&query, &[0, 1, 2, 3, 4])
367 .unwrap();
368 assert_eq!(results.len(), 5);
369 for &d in &results {
370 assert!(d.is_finite(), "distance should be finite");
371 }
372 }
373
374 #[test]
375 fn test_gpu_warmup_empty_index() {
376 let index = HnswIndex::new(HnswConfig::default()).unwrap();
378 #[cfg(feature = "gpu")]
381 index.warmup_gpu().unwrap();
382 #[cfg(not(feature = "gpu"))]
383 let _ = index; }
385
386 #[test]
387 fn test_gpu_warmup_non_empty_index() {
388 let index = build_test_index(16, 8);
389 #[cfg(feature = "gpu")]
391 index.warmup_gpu().unwrap();
392 #[cfg(not(feature = "gpu"))]
393 let _ = index;
394 }
395
396 #[test]
397 fn test_gpu_preload_empty_index() {
398 let index = HnswIndex::new(HnswConfig::default()).unwrap();
399 #[cfg(feature = "gpu")]
400 index.preload_to_gpu().unwrap();
401 #[cfg(not(feature = "gpu"))]
402 let _ = index;
403 }
404
405 #[test]
406 fn test_gpu_preload_stores_vectors() {
407 let index = build_test_index(32, 8);
408 #[cfg(feature = "gpu")]
409 index.preload_to_gpu().unwrap();
410 let query = axis_query(8);
412 let all_ids: Vec<usize> = (0..32).collect();
413 let distances = index
414 .gpu_batch_distance_calculation(&query, &all_ids)
415 .unwrap();
416 assert_eq!(distances.len(), 32);
417 }
418
419 #[test]
420 fn test_gpu_batch_distance_correctness() {
421 let index = build_test_index(5, 4);
423 let query = Vector::new(vec![1.0, 0.0, 0.0, 0.0]);
424 let candidates = vec![0_usize, 1, 2];
425 let distances = index
426 .gpu_batch_distance_calculation(&query, &candidates)
427 .unwrap();
428 assert_eq!(distances.len(), 3);
429 for &d in &distances {
430 assert!((0.0..=2.0).contains(&d), "unexpected distance: {}", d);
432 }
433 }
434
435 #[test]
436 fn test_gpu_vs_cpu_consistency() {
437 let index = build_test_index(50, 8);
439 let query = axis_query(8);
440 let candidates: Vec<usize> = (0..50).collect();
441
442 let cpu_distances = index
443 .cpu_batch_distance_calculation(&query, &candidates)
444 .unwrap();
445 let gpu_distances = index
446 .gpu_batch_distance_calculation(&query, &candidates)
447 .unwrap();
448
449 assert_eq!(cpu_distances.len(), gpu_distances.len());
450 for (cpu_d, gpu_d) in cpu_distances.iter().zip(gpu_distances.iter()) {
451 assert!(
453 (cpu_d - gpu_d).abs() < 1e-4,
454 "cpu={} gpu={} differ beyond tolerance",
455 cpu_d,
456 gpu_d
457 );
458 }
459 }
460
461 #[test]
462 fn test_gpu_search_with_filter() {
463 let index = build_test_index(20, 4);
465 let query = axis_query(4);
466 let even_candidates: Vec<usize> = (0..20).filter(|i| i % 2 == 0).collect();
467
468 let distances = index
469 .gpu_batch_distance_calculation(&query, &even_candidates)
470 .unwrap();
471
472 assert_eq!(distances.len(), even_candidates.len());
473 for &d in &distances {
474 assert!(d.is_finite());
475 }
476 }
477
478 #[test]
479 fn test_gpu_multi_query_sequence() {
480 let index = build_test_index(30, 4);
482 let all_ids: Vec<usize> = (0..30).collect();
483
484 for i in 0..5_u32 {
485 let query = Vector::new(vec![(i as f32).sin(), (i as f32).cos(), 0.0, 0.0]);
486 let distances = index
487 .gpu_batch_distance_calculation(&query, &all_ids)
488 .unwrap();
489 assert_eq!(distances.len(), 30);
490 }
491 }
492
493 #[test]
494 fn test_gpu_large_dataset() {
495 let n = 1_024_usize;
497 let dim = 16_usize;
498 let index = build_test_index(n, dim);
499 let query = axis_query(dim);
500 let all_ids: Vec<usize> = (0..n).collect();
501
502 let distances = index
503 .gpu_batch_distance_calculation(&query, &all_ids)
504 .unwrap();
505
506 assert_eq!(distances.len(), n);
507 let finite_count = distances.iter().filter(|d| d.is_finite()).count();
508 assert_eq!(finite_count, n, "all distances should be finite");
509 }
510
511 #[test]
512 fn test_gpu_empty_candidate_list() {
513 let index = build_test_index(10, 4);
514 let query = axis_query(4);
515
516 let distances = index.gpu_batch_distance_calculation(&query, &[]).unwrap();
517 assert!(distances.is_empty());
518 }
519
520 #[test]
521 fn test_gpu_empty_index_no_panic() {
522 let index = HnswIndex::new(HnswConfig::default()).unwrap();
523 let query = Vector::new(vec![1.0, 0.0, 0.0]);
524 let distances = index.gpu_batch_distance_calculation(&query, &[]).unwrap();
526 assert!(distances.is_empty());
527 }
528
529 #[test]
530 fn test_gpu_single_vector_index() {
531 let mut index = HnswIndex::new(HnswConfig::default()).unwrap();
532 index
533 .insert("only".to_string(), Vector::new(vec![0.6, 0.8, 0.0]))
534 .unwrap();
535
536 let query = Vector::new(vec![1.0, 0.0, 0.0]);
537 let distances = index.gpu_batch_distance_calculation(&query, &[0]).unwrap();
538 assert_eq!(distances.len(), 1);
539 assert!(distances[0].is_finite());
540 }
541
542 #[test]
543 fn test_gpu_distances_ordered_by_candidate() {
544 let index = build_test_index(10, 4);
547 let query = axis_query(4);
548 let candidates = vec![5_usize, 2, 8, 0];
549
550 let distances = index
551 .gpu_batch_distance_calculation(&query, &candidates)
552 .unwrap();
553
554 let expected: Vec<f32> = candidates
556 .iter()
557 .map(|&id| index.cpu_batch_distance_calculation(&query, &[id]).unwrap()[0])
558 .collect();
559
560 assert_eq!(distances.len(), expected.len());
561 for (d, e) in distances.iter().zip(expected.iter()) {
562 assert!(
563 (d - e).abs() < 1e-4,
564 "ordering mismatch: got {} expected {}",
565 d,
566 e
567 );
568 }
569 }
570
571 #[test]
572 fn test_gpu_identical_vectors_zero_distance() {
573 let mut index = HnswIndex::new(HnswConfig::default()).unwrap();
575 let v = Vector::new(vec![0.6, 0.8]);
576 index.insert("a".to_string(), v.clone()).unwrap();
577
578 let distances = index.gpu_batch_distance_calculation(&v, &[0]).unwrap();
579 assert_eq!(distances.len(), 1);
580 assert!(
581 distances[0] < 1e-5,
582 "identical vectors should have ~0 distance, got {}",
583 distances[0]
584 );
585 }
586
587 #[test]
588 fn test_gpu_orthogonal_vectors_max_cosine_distance() {
589 let mut index = HnswIndex::new(HnswConfig::default()).unwrap();
591 index
592 .insert("y_axis".to_string(), Vector::new(vec![0.0, 1.0]))
593 .unwrap();
594
595 let query = Vector::new(vec![1.0, 0.0]);
596 let distances = index.gpu_batch_distance_calculation(&query, &[0]).unwrap();
597 assert_eq!(distances.len(), 1);
598 assert!(
599 (distances[0] - 1.0).abs() < 1e-4,
600 "orthogonal cosine distance should be 1.0, got {}",
601 distances[0]
602 );
603 }
604}