1#[cfg(feature = "gpu")]
4use crate::gpu::GpuAccelerator;
5use crate::hnsw::HnswIndex;
6use crate::Vector;
7use anyhow::Result;
8use std::sync::Arc;
9
/// Performance counters reported for the accelerator attached to an index.
///
/// NOTE(review): the units of the timing fields are not established anywhere
/// in this file — confirm against the producer before displaying them.
#[derive(Clone, Debug)]
pub struct GpuPerformanceStats {
    /// Accelerator memory currently in use (presumably bytes — TODO confirm).
    pub gpu_memory_used: usize,
    /// Time spent executing kernels.
    pub kernel_execution_time: f64,
    /// Time spent moving data to and from the accelerator.
    pub memory_transfer_time: f64,
    /// Number of vectors processed per second.
    pub throughput_vectors_per_second: f64,
}
18
19#[cfg(feature = "gpu")]
20impl HnswIndex {
21 pub fn gpu_batch_distance_calculation(
23 &self,
24 query: &Vector,
25 candidates: &[usize],
26 ) -> Result<Vec<f32>> {
27 if candidates.len() < self.config().gpu_batch_threshold {
28 return self.cpu_batch_distance_calculation(query, candidates);
30 }
31
32 if let Some(accelerator) = self.gpu_accelerator() {
33 self.single_gpu_distance_calculation(accelerator, query, candidates)
34 } else if !self.multi_gpu_accelerators().is_empty() {
35 self.multi_gpu_distance_calculation(query, candidates)
36 } else {
37 self.cpu_batch_distance_calculation(query, candidates)
39 }
40 }
41
42 pub fn single_gpu_distance_calculation(
48 &self,
49 _accelerator: &Arc<GpuAccelerator>,
50 query: &Vector,
51 candidates: &[usize],
52 ) -> Result<Vec<f32>> {
53 use scirs2_core::ndarray_ext::Array1;
54 use scirs2_core::parallel_ops::{IntoParallelRefIterator, ParallelIterator};
55 use scirs2_core::simd::simd_dot_f32;
56
57 if candidates.is_empty() {
58 return Ok(Vec::new());
59 }
60
61 let query_f32 = query.as_f32();
62 let query_array = Array1::from_vec(query_f32.clone());
63 let query_norm_sq: f32 = simd_dot_f32(&query_array.view(), &query_array.view());
64 let query_norm = query_norm_sq.sqrt();
65
66 let indexed: Vec<(usize, usize)> = candidates.iter().copied().enumerate().collect();
69
70 let mut distances: Vec<(usize, f32)> = indexed
71 .par_iter()
72 .map(|&(pos, candidate_id)| {
73 let dist = if let Some(node) = self.nodes().get(candidate_id) {
74 let cand_f32 = &node.vector_data_f32;
75 let cand_array = Array1::from_vec(cand_f32.clone());
76
77 let dot = simd_dot_f32(&query_array.view(), &cand_array.view());
79 let cand_norm_sq = simd_dot_f32(&cand_array.view(), &cand_array.view());
80 let cand_norm = cand_norm_sq.sqrt();
81
82 if query_norm > 0.0 && cand_norm > 0.0 {
83 let similarity = dot / (query_norm * cand_norm);
85 1.0 - similarity.clamp(-1.0, 1.0)
86 } else {
87 f32::INFINITY
88 }
89 } else {
90 f32::INFINITY
91 };
92 (pos, dist)
93 })
94 .collect();
95
96 distances.sort_by_key(|(pos, _)| *pos);
98 Ok(distances.into_iter().map(|(_, d)| d).collect())
99 }
100
101 pub fn multi_gpu_distance_calculation(
107 &self,
108 query: &Vector,
109 candidates: &[usize],
110 ) -> Result<Vec<f32>> {
111 use scirs2_core::ndarray_ext::Array1;
112 use scirs2_core::parallel_ops::{IntoParallelRefIterator, ParallelIterator};
113 use scirs2_core::simd::simd_dot_f32;
114
115 let accelerators = self.multi_gpu_accelerators();
116 if accelerators.is_empty() {
117 return self.cpu_batch_distance_calculation(query, candidates);
118 }
119
120 if candidates.is_empty() {
121 return Ok(Vec::new());
122 }
123
124 let query_f32 = query.as_f32();
125 let query_array = Array1::from_vec(query_f32.clone());
126 let query_norm_sq: f32 = simd_dot_f32(&query_array.view(), &query_array.view());
127 let query_norm = query_norm_sq.sqrt();
128
129 let num_gpus = accelerators.len();
130 let chunk_size = (candidates.len() + num_gpus - 1) / num_gpus;
132
133 let partitions: Vec<(usize, &[usize])> =
134 candidates.chunks(chunk_size).enumerate().collect();
135
136 let mut partial_results: Vec<(usize, Vec<f32>)> = partitions
138 .par_iter()
139 .map(|&(partition_idx, chunk)| {
140 let chunk_distances: Vec<f32> = chunk
141 .iter()
142 .map(|&candidate_id| {
143 if let Some(node) = self.nodes().get(candidate_id) {
144 let cand_array = Array1::from_vec(node.vector_data_f32.clone());
145 let dot = simd_dot_f32(&query_array.view(), &cand_array.view());
146 let cand_norm_sq = simd_dot_f32(&cand_array.view(), &cand_array.view());
147 let cand_norm = cand_norm_sq.sqrt();
148 if query_norm > 0.0 && cand_norm > 0.0 {
149 let sim = dot / (query_norm * cand_norm);
150 1.0 - sim.clamp(-1.0, 1.0)
151 } else {
152 f32::INFINITY
153 }
154 } else {
155 f32::INFINITY
156 }
157 })
158 .collect();
159 (partition_idx, chunk_distances)
160 })
161 .collect();
162
163 partial_results.sort_by_key(|(idx, _)| *idx);
165 let distances: Vec<f32> = partial_results.into_iter().flat_map(|(_, v)| v).collect();
166
167 Ok(distances)
168 }
169
170 pub fn gpu_search(&self, query: &Vector, k: usize) -> Result<Vec<(String, f32)>> {
172 if self.nodes().is_empty() || self.entry_point().is_none() {
175 return Ok(Vec::new());
176 }
177
178 if !self.is_gpu_enabled() {
180 return self.search_knn(query, k);
182 }
183
184 if k >= self.config().gpu_batch_threshold && self.nodes().len() >= 1000 {
186 return self.gpu_accelerated_search_large(query, k);
187 }
188
189 self.search_knn(query, k)
191 }
192
193 fn gpu_accelerated_search_large(&self, query: &Vector, k: usize) -> Result<Vec<(String, f32)>> {
195 let candidate_ids: Vec<usize> = (0..self.nodes().len()).collect();
197
198 let distances = self.gpu_batch_distance_calculation(query, &candidate_ids)?;
200
201 let mut id_distance_pairs: Vec<(usize, f32)> =
203 candidate_ids.into_iter().zip(distances).collect();
204
205 id_distance_pairs
207 .sort_by(|a, b| a.1.partial_cmp(&b.1).unwrap_or(std::cmp::Ordering::Equal));
208
209 let results: Vec<(String, f32)> = id_distance_pairs
211 .into_iter()
212 .take(k)
213 .filter_map(|(id, distance)| {
214 self.nodes()
215 .get(id)
216 .map(|node| (node.uri.clone(), distance))
217 })
218 .collect();
219
220 Ok(results)
221 }
222
223 pub fn is_gpu_enabled(&self) -> bool {
225 self.gpu_accelerator().is_some() || !self.multi_gpu_accelerators().is_empty()
226 }
227
228 pub fn gpu_performance_stats(&self) -> Option<GpuPerformanceStats> {
230 self.gpu_accelerator()
231 .map(|accelerator| GpuPerformanceStats {
232 gpu_memory_used: accelerator.get_memory_usage().unwrap_or(0),
233 kernel_execution_time: 0.0, memory_transfer_time: 0.0, throughput_vectors_per_second: 0.0, })
237 }
238
239 pub fn warmup_gpu(&self) -> Result<()> {
246 if !self.is_gpu_enabled() {
247 return Ok(());
248 }
249
250 let warmup_count = self.nodes().len().min(64);
253 if warmup_count == 0 {
254 return Ok(());
255 }
256
257 let dummy_dims = if let Some(first_node) = self.nodes().first() {
258 first_node.vector_data_f32.len()
259 } else {
260 return Ok(());
261 };
262
263 let dummy_query = Vector::new(vec![0.0_f32; dummy_dims]);
265
266 let warmup_ids: Vec<usize> = (0..warmup_count).collect();
268 let _ = self.simd_distance_calculation(&dummy_query, &warmup_ids)?;
269
270 Ok(())
271 }
272
273 pub fn preload_to_gpu(&self) -> Result<()> {
279 if !self.is_gpu_enabled() {
280 return Ok(());
281 }
282
283 if self.nodes().is_empty() {
287 return Ok(());
288 }
289
290 let first_node_dims = self
291 .nodes()
292 .first()
293 .map(|n| n.vector_data_f32.len())
294 .unwrap_or(0);
295 if first_node_dims == 0 {
296 return Ok(());
297 }
298
299 let norm_val = (1.0_f32 / first_node_dims as f32).sqrt();
301 let warmup_query = Vector::new(vec![norm_val; first_node_dims]);
302
303 let all_ids: Vec<usize> = (0..self.nodes().len()).collect();
304
305 let _ = self.simd_distance_calculation(&warmup_query, &all_ids)?;
308
309 Ok(())
310 }
311}
312
313#[cfg(not(feature = "gpu"))]
314impl HnswIndex {
315 pub fn gpu_batch_distance_calculation(
317 &self,
318 query: &Vector,
319 candidates: &[usize],
320 ) -> Result<Vec<f32>> {
321 self.cpu_batch_distance_calculation(query, candidates)
322 }
323}
324
#[cfg(test)]
mod tests {
    use super::*;
    use crate::hnsw::HnswConfig;
    use crate::VectorIndex;

    /// Builds an index of `count` unit-length vectors of dimension `dim`.
    /// Vector `i` is the normalised ramp `[i, i + 1, ..., i + dim - 1]`.
    fn build_test_index(count: usize, dim: usize) -> HnswIndex {
        /// Normalised ramp starting at `start`; all zeros if the ramp has no length.
        fn unit_ramp(start: usize, dim: usize) -> Vec<f32> {
            let raw: Vec<f32> = (0..dim).map(|d| (start + d) as f32).collect();
            let magnitude = raw.iter().map(|x| x * x).sum::<f32>().sqrt();
            if magnitude > 0.0 {
                raw.iter().map(|x| x / magnitude).collect()
            } else {
                vec![0.0; dim]
            }
        }

        let mut index = HnswIndex::new(HnswConfig::default()).expect("index creation failed");
        for i in 0..count {
            let uri = format!("vec_{}", i);
            index
                .insert(uri, Vector::new(unit_ramp(i, dim)))
                .expect("insert failed");
        }
        index
    }

    /// Unit vector along the first axis.
    fn axis_query(dim: usize) -> Vector {
        let mut components = vec![0.0f32; dim];
        components[0] = 1.0;
        Vector::new(components)
    }

    /// Index with default configuration and no vectors.
    fn empty_index() -> Result<HnswIndex> {
        let index = HnswIndex::new(HnswConfig::default())?;
        Ok(index)
    }

    /// Index holding exactly one vector stored under `uri`.
    fn index_with_one(uri: &str, vector: Vector) -> Result<HnswIndex> {
        let mut index = HnswIndex::new(HnswConfig::default())?;
        index.insert(uri.to_string(), vector)?;
        Ok(index)
    }

    /// Calls `warmup_gpu` when the `gpu` feature is on; otherwise a no-op.
    fn run_warmup(index: &HnswIndex) -> Result<()> {
        #[cfg(feature = "gpu")]
        index.warmup_gpu()?;
        #[cfg(not(feature = "gpu"))]
        let _ = index;
        Ok(())
    }

    /// Calls `preload_to_gpu` when the `gpu` feature is on; otherwise a no-op.
    fn run_preload(index: &HnswIndex) -> Result<()> {
        #[cfg(feature = "gpu")]
        index.preload_to_gpu()?;
        #[cfg(not(feature = "gpu"))]
        let _ = index;
        Ok(())
    }

    // A small batch returns one finite distance per candidate.
    #[test]
    fn test_gpu_search_basic() -> Result<()> {
        let index = build_test_index(20, 4);
        let results = index.gpu_batch_distance_calculation(&axis_query(4), &[0, 1, 2, 3, 4])?;
        assert_eq!(results.len(), 5);
        assert!(
            results.iter().all(|d| d.is_finite()),
            "distance should be finite"
        );
        Ok(())
    }

    // Warm-up on an empty index must succeed.
    #[test]
    fn test_gpu_warmup_empty_index() -> Result<()> {
        run_warmup(&empty_index()?)
    }

    // Warm-up on a populated index must succeed.
    #[test]
    fn test_gpu_warmup_non_empty_index() -> Result<()> {
        run_warmup(&build_test_index(16, 8))
    }

    // Preload on an empty index must succeed.
    #[test]
    fn test_gpu_preload_empty_index() -> Result<()> {
        run_preload(&empty_index()?)
    }

    // After preloading, a full batch still yields one distance per vector.
    #[test]
    fn test_gpu_preload_stores_vectors() -> Result<()> {
        let index = build_test_index(32, 8);
        run_preload(&index)?;

        let every_id: Vec<usize> = (0..32).collect();
        let distances = index.gpu_batch_distance_calculation(&axis_query(8), &every_id)?;
        assert_eq!(distances.len(), 32);
        Ok(())
    }

    // Distances stay inside the cosine-distance range [0, 2].
    #[test]
    fn test_gpu_batch_distance_correctness() -> Result<()> {
        let index = build_test_index(5, 4);
        let query = Vector::new(vec![1.0, 0.0, 0.0, 0.0]);
        let distances = index.gpu_batch_distance_calculation(&query, &[0_usize, 1, 2])?;
        assert_eq!(distances.len(), 3);
        for d in distances.iter().copied() {
            assert!((0.0..=2.0).contains(&d), "unexpected distance: {}", d);
        }
        Ok(())
    }

    // The dispatching entry point agrees with the CPU implementation.
    #[test]
    fn test_gpu_vs_cpu_consistency() -> Result<()> {
        let index = build_test_index(50, 8);
        let query = axis_query(8);
        let ids: Vec<usize> = (0..50).collect();

        let cpu_distances = index.cpu_batch_distance_calculation(&query, &ids)?;
        let gpu_distances = index.gpu_batch_distance_calculation(&query, &ids)?;

        assert_eq!(cpu_distances.len(), gpu_distances.len());
        for (cpu_d, gpu_d) in cpu_distances.iter().zip(&gpu_distances) {
            let gap = (cpu_d - gpu_d).abs();
            assert!(
                gap < 1e-4,
                "cpu={} gpu={} differ beyond tolerance",
                cpu_d,
                gpu_d
            );
        }
        Ok(())
    }

    // A caller-filtered candidate list (even ids only) is honoured.
    #[test]
    fn test_gpu_search_with_filter() -> Result<()> {
        let index = build_test_index(20, 4);
        let even_candidates: Vec<usize> = (0..20).step_by(2).collect();

        let distances = index.gpu_batch_distance_calculation(&axis_query(4), &even_candidates)?;

        assert_eq!(distances.len(), even_candidates.len());
        for d in &distances {
            assert!(d.is_finite());
        }
        Ok(())
    }

    // Repeated queries against the same index keep returning full batches.
    #[test]
    fn test_gpu_multi_query_sequence() -> Result<()> {
        let index = build_test_index(30, 4);
        let ids: Vec<usize> = (0..30).collect();

        for i in 0..5_u32 {
            let angle = i as f32;
            let query = Vector::new(vec![angle.sin(), angle.cos(), 0.0, 0.0]);
            let batch = index.gpu_batch_distance_calculation(&query, &ids)?;
            assert_eq!(batch.len(), 30);
        }
        Ok(())
    }

    // A 1024-vector batch yields only finite distances.
    #[test]
    fn test_gpu_large_dataset() -> Result<()> {
        let (n, dim) = (1_024_usize, 16_usize);
        let index = build_test_index(n, dim);
        let ids: Vec<usize> = (0..n).collect();

        let distances = index.gpu_batch_distance_calculation(&axis_query(dim), &ids)?;

        assert_eq!(distances.len(), n);
        let finite_count = distances.iter().filter(|d| d.is_finite()).count();
        assert_eq!(finite_count, n, "all distances should be finite");
        Ok(())
    }

    // No candidates in, no distances out.
    #[test]
    fn test_gpu_empty_candidate_list() -> Result<()> {
        let index = build_test_index(10, 4);
        let distances = index.gpu_batch_distance_calculation(&axis_query(4), &[])?;
        assert!(distances.is_empty());
        Ok(())
    }

    // An empty index with an empty candidate list must not panic.
    #[test]
    fn test_gpu_empty_index_no_panic() -> Result<()> {
        let index = empty_index()?;
        let query = Vector::new(vec![1.0, 0.0, 0.0]);
        let distances = index.gpu_batch_distance_calculation(&query, &[])?;
        assert!(distances.is_empty());
        Ok(())
    }

    // A one-vector index produces exactly one finite distance.
    #[test]
    fn test_gpu_single_vector_index() -> Result<()> {
        let index = index_with_one("only", Vector::new(vec![0.6, 0.8, 0.0]))?;

        let query = Vector::new(vec![1.0, 0.0, 0.0]);
        let distances = index.gpu_batch_distance_calculation(&query, &[0])?;
        assert_eq!(distances.len(), 1);
        assert!(distances[0].is_finite());
        Ok(())
    }

    // Output order follows the candidate list, not the node ids.
    #[test]
    fn test_gpu_distances_ordered_by_candidate() -> Result<()> {
        let index = build_test_index(10, 4);
        let query = axis_query(4);
        let candidates = [5_usize, 2, 8, 0];

        let distances = index.gpu_batch_distance_calculation(&query, &candidates)?;

        // Reference values: score each candidate on its own through the CPU path.
        let mut expected: Vec<f32> = Vec::with_capacity(candidates.len());
        for &id in &candidates {
            let single = index
                .cpu_batch_distance_calculation(&query, &[id])
                .expect("cpu distance calculation should succeed");
            expected.push(single[0]);
        }

        assert_eq!(distances.len(), expected.len());
        for (d, e) in distances.iter().zip(&expected) {
            assert!(
                (d - e).abs() < 1e-4,
                "ordering mismatch: got {} expected {}",
                d,
                e
            );
        }
        Ok(())
    }

    // A vector compared with itself is at (approximately) zero distance.
    #[test]
    fn test_gpu_identical_vectors_zero_distance() -> Result<()> {
        let v = Vector::new(vec![0.6, 0.8]);
        let index = index_with_one("a", v.clone())?;

        let distances = index.gpu_batch_distance_calculation(&v, &[0])?;
        assert_eq!(distances.len(), 1);
        assert!(
            distances[0] < 1e-5,
            "identical vectors should have ~0 distance, got {}",
            distances[0]
        );
        Ok(())
    }

    // Orthogonal unit vectors are at cosine distance 1.
    #[test]
    fn test_gpu_orthogonal_vectors_max_cosine_distance() -> Result<()> {
        let index = index_with_one("y_axis", Vector::new(vec![0.0, 1.0]))?;

        let query = Vector::new(vec![1.0, 0.0]);
        let distances = index.gpu_batch_distance_calculation(&query, &[0])?;
        assert_eq!(distances.len(), 1);
        assert!(
            (distances[0] - 1.0).abs() < 1e-4,
            "orthogonal cosine distance should be 1.0, got {}",
            distances[0]
        );
        Ok(())
    }
}