#[cfg(feature = "gpu")]
use crate::gpu::GpuAccelerator;
use crate::hnsw::HnswIndex;
use crate::Vector;
use anyhow::Result;
use std::sync::Arc;
/// Snapshot of GPU-related performance counters exposed by the index.
///
/// Units are not established anywhere in this file; the notes on each field
/// are assumptions to be confirmed against the producer of these values.
#[derive(Clone, Debug)]
pub struct GpuPerformanceStats {
    /// GPU memory currently in use (presumably bytes — TODO confirm).
    pub gpu_memory_used: usize,
    /// Time attributed to kernel execution (unit unspecified — TODO confirm).
    pub kernel_execution_time: f64,
    /// Time attributed to host/device memory transfers (unit unspecified — TODO confirm).
    pub memory_transfer_time: f64,
    /// Processing rate, in vectors per second.
    pub throughput_vectors_per_second: f64,
}
#[cfg(feature = "gpu")]
impl HnswIndex {
/// Computes the distance from `query` to every node id in `candidates`,
/// picking a backend for the batch.
///
/// Dispatch order:
/// 1. Batches smaller than `config().gpu_batch_threshold` go to the CPU path.
/// 2. If a single accelerator is configured, the single-GPU path is used.
/// 3. Otherwise, if any multi-GPU accelerators are configured, the multi-GPU
///    path is used.
/// 4. Otherwise the CPU path is used.
///
/// # Errors
/// Propagates whatever error the selected backend returns.
pub fn gpu_batch_distance_calculation(
    &self,
    query: &Vector,
    candidates: &[usize],
) -> Result<Vec<f32>> {
    // Only batches at or above the threshold are considered for a GPU path.
    if candidates.len() >= self.config().gpu_batch_threshold {
        if let Some(accelerator) = self.gpu_accelerator() {
            return self.single_gpu_distance_calculation(accelerator, query, candidates);
        }
        if !self.multi_gpu_accelerators().is_empty() {
            return self.multi_gpu_distance_calculation(query, candidates);
        }
    }
    self.cpu_batch_distance_calculation(query, candidates)
}
/// Computes cosine distances (`1 - cosine similarity`) from `query` to each
/// candidate node, returned in the same order as `candidates`.
///
/// The `_accelerator` argument is not used by this implementation: the work
/// is done with a parallel iterator and `simd_dot_f32` dot products.
///
/// A candidate yields `f32::INFINITY` when its id is not present in
/// `self.nodes()`, or when either the query norm or the candidate norm is
/// not strictly positive. The similarity is clamped to `[-1, 1]` before being
/// turned into a distance.
///
/// Returns an empty vector for an empty candidate list.
pub fn single_gpu_distance_calculation(
    &self,
    _accelerator: &Arc<GpuAccelerator>,
    query: &Vector,
    candidates: &[usize],
) -> Result<Vec<f32>> {
    use scirs2_core::ndarray_ext::Array1;
    use scirs2_core::parallel_ops::{IntoParallelRefIterator, ParallelIterator};
    use scirs2_core::simd::simd_dot_f32;

    if candidates.is_empty() {
        return Ok(Vec::new());
    }

    // The query norm is shared by every candidate, so compute it once.
    let query_array = Array1::from_vec(query.as_f32().clone());
    let query_norm_sq: f32 = simd_dot_f32(&query_array.view(), &query_array.view());
    let query_norm = query_norm_sq.sqrt();

    // Cosine distance between the query and one stored node.
    let distance_to = |candidate_id: usize| -> f32 {
        match self.nodes().get(candidate_id) {
            None => f32::INFINITY,
            Some(node) => {
                let cand_array = Array1::from_vec(node.vector_data_f32.clone());
                let dot = simd_dot_f32(&query_array.view(), &cand_array.view());
                let cand_norm = simd_dot_f32(&cand_array.view(), &cand_array.view()).sqrt();
                if query_norm > 0.0 && cand_norm > 0.0 {
                    let similarity = dot / (query_norm * cand_norm);
                    1.0 - similarity.clamp(-1.0, 1.0)
                } else {
                    f32::INFINITY
                }
            }
        }
    };

    // Tag each candidate with its input position so the output order can be
    // restored after the parallel pass.
    let positioned: Vec<(usize, usize)> = candidates.iter().copied().enumerate().collect();
    let mut scored: Vec<(usize, f32)> = positioned
        .par_iter()
        .map(|&(position, candidate_id)| (position, distance_to(candidate_id)))
        .collect();
    scored.sort_by_key(|entry| entry.0);

    Ok(scored.into_iter().map(|(_, distance)| distance).collect())
}
/// Computes cosine distances (`1 - cosine similarity`) from `query` to each
/// candidate node by splitting the candidates into one chunk per configured
/// accelerator and processing the chunks in parallel.
///
/// Only the *number* of accelerators is used here (to size the chunks); the
/// per-chunk arithmetic is done with `simd_dot_f32`.
///
/// Falls back to the CPU path when no multi-GPU accelerators are configured,
/// and returns an empty vector for an empty candidate list. A candidate
/// yields `f32::INFINITY` when its id is not present in `self.nodes()` or
/// when either norm is not strictly positive.
///
/// The result is in the same order as `candidates`.
pub fn multi_gpu_distance_calculation(
    &self,
    query: &Vector,
    candidates: &[usize],
) -> Result<Vec<f32>> {
    use scirs2_core::ndarray_ext::Array1;
    use scirs2_core::parallel_ops::{IntoParallelRefIterator, ParallelIterator};
    use scirs2_core::simd::simd_dot_f32;

    let devices = self.multi_gpu_accelerators();
    if devices.is_empty() {
        return self.cpu_batch_distance_calculation(query, candidates);
    }
    if candidates.is_empty() {
        return Ok(Vec::new());
    }

    let query_array = Array1::from_vec(query.as_f32().clone());
    let query_norm_sq: f32 = simd_dot_f32(&query_array.view(), &query_array.view());
    let query_norm = query_norm_sq.sqrt();

    // Ceiling division: every device gets at most `per_device` candidates.
    let device_count = devices.len();
    let per_device = (candidates.len() + device_count - 1) / device_count;

    // Cosine distance between the query and one stored node.
    let distance_to = |candidate_id: usize| -> f32 {
        match self.nodes().get(candidate_id) {
            None => f32::INFINITY,
            Some(node) => {
                let cand_array = Array1::from_vec(node.vector_data_f32.clone());
                let dot = simd_dot_f32(&query_array.view(), &cand_array.view());
                let cand_norm = simd_dot_f32(&cand_array.view(), &cand_array.view()).sqrt();
                if query_norm > 0.0 && cand_norm > 0.0 {
                    let sim = dot / (query_norm * cand_norm);
                    1.0 - sim.clamp(-1.0, 1.0)
                } else {
                    f32::INFINITY
                }
            }
        }
    };

    let partitions: Vec<(usize, &[usize])> =
        candidates.chunks(per_device).enumerate().collect();
    let mut per_partition: Vec<(usize, Vec<f32>)> = partitions
        .par_iter()
        .map(|&(partition_idx, chunk)| {
            let chunk_distances: Vec<f32> = chunk.iter().map(|&id| distance_to(id)).collect();
            (partition_idx, chunk_distances)
        })
        .collect();

    // Stitch the chunks back together in partition order.
    per_partition.sort_by_key(|entry| entry.0);
    let mut distances = Vec::with_capacity(candidates.len());
    for (_, chunk_distances) in per_partition {
        distances.extend(chunk_distances);
    }
    Ok(distances)
}
/// Returns up to `k` `(uri, distance)` results for `query`.
///
/// An index with no nodes or no entry point yields an empty result. The
/// exhaustive GPU-batched search is used only when all of the following hold:
/// GPU support is enabled, `k` is at least `config().gpu_batch_threshold`,
/// and the index holds at least 1000 nodes. In every other case the call is
/// forwarded to `search_knn`.
pub fn gpu_search(&self, query: &Vector, k: usize) -> Result<Vec<(String, f32)>> {
    // Minimum index size for which the exhaustive scan is attempted.
    const MIN_NODES_FOR_EXHAUSTIVE_SCAN: usize = 1000;

    let nothing_to_search = self.nodes().is_empty() || self.entry_point().is_none();
    if nothing_to_search {
        return Ok(Vec::new());
    }

    let use_exhaustive_scan = self.is_gpu_enabled()
        && k >= self.config().gpu_batch_threshold
        && self.nodes().len() >= MIN_NODES_FOR_EXHAUSTIVE_SCAN;

    if use_exhaustive_scan {
        self.gpu_accelerated_search_large(query, k)
    } else {
        self.search_knn(query, k)
    }
}
/// Exhaustive k-nearest search: scores every node in the index against
/// `query` via `gpu_batch_distance_calculation`, sorts by ascending
/// distance, and returns the first `k` `(uri, distance)` pairs.
///
/// Ties keep ascending node-id order because the sort is stable. NaN
/// distances are ordered after every other value (including
/// `f32::INFINITY`), so they can only appear once all comparable results
/// are exhausted.
///
/// # Errors
/// Propagates any error from the batch distance calculation.
fn gpu_accelerated_search_large(&self, query: &Vector, k: usize) -> Result<Vec<(String, f32)>> {
    use std::cmp::Ordering;

    let candidate_ids: Vec<usize> = (0..self.nodes().len()).collect();
    let distances = self.gpu_batch_distance_calculation(query, &candidate_ids)?;

    let mut ranked: Vec<(usize, f32)> = candidate_ids.into_iter().zip(distances).collect();

    // `partial_cmp(..).unwrap_or(Equal)` alone is not a total order when a NaN
    // is present (NaN would compare "equal" to everything), which makes the
    // sort result unspecified and may make `sort_by` panic. Handle NaN
    // explicitly so the comparator is a genuine total order.
    ranked.sort_by(|a, b| match (a.1.is_nan(), b.1.is_nan()) {
        (true, true) => Ordering::Equal,
        (true, false) => Ordering::Greater,
        (false, true) => Ordering::Less,
        // Neither side is NaN, so `partial_cmp` always yields `Some`.
        (false, false) => a.1.partial_cmp(&b.1).unwrap_or(Ordering::Equal),
    });

    let results: Vec<(String, f32)> = ranked
        .into_iter()
        .take(k)
        .filter_map(|(id, distance)| {
            self.nodes()
                .get(id)
                .map(|node| (node.uri.clone(), distance))
        })
        .collect();
    Ok(results)
}
/// Reports whether any accelerator is configured: either the single
/// accelerator or at least one multi-GPU accelerator.
pub fn is_gpu_enabled(&self) -> bool {
    if self.gpu_accelerator().is_some() {
        return true;
    }
    !self.multi_gpu_accelerators().is_empty()
}
/// Returns performance statistics for the single configured accelerator, or
/// `None` when there is none (multi-GPU accelerators are not consulted).
///
/// Only `gpu_memory_used` is populated from the accelerator; it falls back
/// to `0` when `get_memory_usage()` yields no value. The timing and
/// throughput fields are always reported as `0.0` by this function.
pub fn gpu_performance_stats(&self) -> Option<GpuPerformanceStats> {
    let accelerator = self.gpu_accelerator()?;
    let gpu_memory_used = accelerator.get_memory_usage().unwrap_or(0);
    Some(GpuPerformanceStats {
        gpu_memory_used,
        kernel_execution_time: 0.0,
        memory_transfer_time: 0.0,
        throughput_vectors_per_second: 0.0,
    })
}
/// Runs a throw-away distance pass over the first few nodes as a warm-up.
///
/// Does nothing when GPU support is not enabled or the index has no nodes.
/// Otherwise an all-zero query with the dimensionality of the first node is
/// scored against at most the first 64 node ids through
/// `simd_distance_calculation`; the distances are discarded.
///
/// # Errors
/// Propagates any error from `simd_distance_calculation`.
pub fn warmup_gpu(&self) -> Result<()> {
    // Upper bound on how many nodes take part in the warm-up pass.
    const MAX_WARMUP_NODES: usize = 64;

    if !self.is_gpu_enabled() {
        return Ok(());
    }

    let sample_size = self.nodes().len().min(MAX_WARMUP_NODES);
    if sample_size == 0 {
        return Ok(());
    }

    let dims = match self.nodes().first().map(|node| node.vector_data_f32.len()) {
        Some(len) => len,
        None => return Ok(()),
    };

    let zero_query = Vector::new(vec![0.0_f32; dims]);
    let sample_ids: Vec<usize> = (0..sample_size).collect();
    let _ = self.simd_distance_calculation(&zero_query, &sample_ids)?;
    Ok(())
}
/// Runs one distance pass over *every* node in the index and discards the
/// result.
///
/// Does nothing when GPU support is not enabled, the index is empty, or the
/// first node has zero dimensions. The probe query has every component set
/// to `sqrt(1 / dims)`, which gives it unit L2 norm.
///
/// NOTE(review): nothing in this function visibly uploads data to a device;
/// whether `simd_distance_calculation` touches the GPU is not shown here.
///
/// # Errors
/// Propagates any error from `simd_distance_calculation`.
pub fn preload_to_gpu(&self) -> Result<()> {
    if !self.is_gpu_enabled() || self.nodes().is_empty() {
        return Ok(());
    }

    let dims = self
        .nodes()
        .first()
        .map_or(0, |node| node.vector_data_f32.len());
    if dims == 0 {
        return Ok(());
    }

    let component = (1.0_f32 / dims as f32).sqrt();
    let probe = Vector::new(vec![component; dims]);
    let every_id: Vec<usize> = (0..self.nodes().len()).collect();
    let _ = self.simd_distance_calculation(&probe, &every_id)?;
    Ok(())
}
}
#[cfg(not(feature = "gpu"))]
impl HnswIndex {
    /// Build-without-`gpu` variant of the batch distance entry point.
    ///
    /// Keeps the same name and signature as the GPU-enabled method so callers
    /// compile either way; the work is always delegated to
    /// `cpu_batch_distance_calculation`.
    pub fn gpu_batch_distance_calculation(
        &self,
        query: &Vector,
        candidates: &[usize],
    ) -> Result<Vec<f32>> {
        Self::cpu_batch_distance_calculation(self, query, candidates)
    }
}
#[cfg(test)]
mod tests {
use super::*;
use crate::hnsw::HnswConfig;
use crate::VectorIndex;
/// Builds an index with `count` vectors of dimension `dim`, keyed
/// `vec_0`, `vec_1`, ….
///
/// Vector `i` has raw components `i, i + 1, …, i + dim - 1` and is scaled to
/// unit length before insertion; a raw vector with zero magnitude is stored
/// as all zeros. Panics if index creation or any insert fails.
fn build_test_index(count: usize, dim: usize) -> HnswIndex {
    let mut index = HnswIndex::new(HnswConfig::default()).expect("index creation failed");
    for i in 0..count {
        let mut components: Vec<f32> = (0..dim).map(|d| (i + d) as f32).collect();
        let magnitude = components.iter().map(|x| x * x).sum::<f32>().sqrt();
        if magnitude > 0.0 {
            for component in components.iter_mut() {
                *component /= magnitude;
            }
        } else {
            components = vec![0.0; dim];
        }
        let key = format!("vec_{}", i);
        index
            .insert(key, Vector::new(components))
            .expect("insert failed");
    }
    index
}
/// Returns the unit vector along the first axis: `[1, 0, 0, …]` with `dim`
/// components. Panics if `dim` is zero, since there is no first component.
fn axis_query(dim: usize) -> Vector {
    let mut components = vec![0.0f32; dim];
    components[0] = 1.0;
    Vector::new(components)
}
/// A small batch of valid candidate ids yields one finite distance per id.
#[test]
fn test_gpu_search_basic() -> Result<()> {
    let index = build_test_index(20, 4);
    let candidate_ids = [0_usize, 1, 2, 3, 4];
    let distances = index.gpu_batch_distance_calculation(&axis_query(4), &candidate_ids)?;
    assert_eq!(distances.len(), 5);
    for distance in distances {
        assert!(distance.is_finite(), "distance should be finite");
    }
    Ok(())
}
/// Warming up an empty index must succeed without doing any work.
#[test]
fn test_gpu_warmup_empty_index() -> Result<()> {
    let index = HnswIndex::new(HnswConfig::default())?;
    #[cfg(feature = "gpu")]
    {
        index.warmup_gpu()?;
    }
    // Without the `gpu` feature there is nothing to call; just mark the
    // index as used.
    #[cfg(not(feature = "gpu"))]
    let _ = &index;
    Ok(())
}
/// Warming up a populated index must succeed.
#[test]
fn test_gpu_warmup_non_empty_index() -> Result<()> {
    let index = build_test_index(16, 8);
    #[cfg(feature = "gpu")]
    {
        index.warmup_gpu()?;
    }
    // Without the `gpu` feature there is nothing to call; just mark the
    // index as used.
    #[cfg(not(feature = "gpu"))]
    let _ = &index;
    Ok(())
}
/// Preloading an empty index must succeed without doing any work.
#[test]
fn test_gpu_preload_empty_index() -> Result<()> {
    let index = HnswIndex::new(HnswConfig::default())?;
    #[cfg(feature = "gpu")]
    {
        index.preload_to_gpu()?;
    }
    // Without the `gpu` feature there is nothing to call; just mark the
    // index as used.
    #[cfg(not(feature = "gpu"))]
    let _ = &index;
    Ok(())
}
/// After an (optional) preload, a batch over all ids still returns one
/// distance per id.
#[test]
fn test_gpu_preload_stores_vectors() -> Result<()> {
    let index = build_test_index(32, 8);
    #[cfg(feature = "gpu")]
    {
        index.preload_to_gpu()?;
    }
    let every_id: Vec<usize> = (0..32).collect();
    let distances = index.gpu_batch_distance_calculation(&axis_query(8), &every_id)?;
    assert_eq!(distances.len(), 32);
    Ok(())
}
/// Every returned distance lies in `[0, 2]`.
#[test]
fn test_gpu_batch_distance_correctness() -> Result<()> {
    let index = build_test_index(5, 4);
    let query = Vector::new(vec![1.0, 0.0, 0.0, 0.0]);
    let candidate_ids = [0_usize, 1, 2];
    let distances = index.gpu_batch_distance_calculation(&query, &candidate_ids)?;
    assert_eq!(distances.len(), 3);
    for d in distances {
        let in_range = d >= 0.0 && d <= 2.0;
        assert!(in_range, "unexpected distance: {}", d);
    }
    Ok(())
}
/// The GPU entry point and the CPU path agree element-wise within `1e-4`.
#[test]
fn test_gpu_vs_cpu_consistency() -> Result<()> {
    let index = build_test_index(50, 8);
    let query = axis_query(8);
    let ids: Vec<usize> = (0..50).collect();

    let cpu_distances = index.cpu_batch_distance_calculation(&query, &ids)?;
    let gpu_distances = index.gpu_batch_distance_calculation(&query, &ids)?;
    assert_eq!(cpu_distances.len(), gpu_distances.len());

    let pairs = cpu_distances.iter().copied().zip(gpu_distances.iter().copied());
    for (cpu_d, gpu_d) in pairs {
        let gap = (cpu_d - gpu_d).abs();
        assert!(
            gap < 1e-4,
            "cpu={} gpu={} differ beyond tolerance",
            cpu_d,
            gpu_d
        );
    }
    Ok(())
}
/// Restricting the batch to the even ids returns one finite distance per
/// selected id.
#[test]
fn test_gpu_search_with_filter() -> Result<()> {
    let index = build_test_index(20, 4);
    let even_ids: Vec<usize> = (0..20).step_by(2).collect();
    let distances = index.gpu_batch_distance_calculation(&axis_query(4), &even_ids)?;
    assert_eq!(distances.len(), even_ids.len());
    for distance in distances {
        assert!(distance.is_finite());
    }
    Ok(())
}
/// Several different queries issued back to back against the same index
/// each return a full-length result.
#[test]
fn test_gpu_multi_query_sequence() -> Result<()> {
    let index = build_test_index(30, 4);
    let every_id: Vec<usize> = (0..30).collect();
    for step in 0..5_u32 {
        let angle = step as f32;
        let query = Vector::new(vec![angle.sin(), angle.cos(), 0.0, 0.0]);
        let distances = index.gpu_batch_distance_calculation(&query, &every_id)?;
        assert_eq!(distances.len(), 30);
    }
    Ok(())
}
/// A batch over 1024 sixteen-dimensional vectors returns a finite distance
/// for every one of them.
#[test]
fn test_gpu_large_dataset() -> Result<()> {
    let (n, dim) = (1_024_usize, 16_usize);
    let index = build_test_index(n, dim);
    let every_id: Vec<usize> = (0..n).collect();
    let distances = index.gpu_batch_distance_calculation(&axis_query(dim), &every_id)?;
    assert_eq!(distances.len(), n);
    let finite_count = distances.iter().filter(|d| d.is_finite()).count();
    assert_eq!(finite_count, n, "all distances should be finite");
    Ok(())
}
/// An empty candidate list on a populated index yields an empty result.
#[test]
fn test_gpu_empty_candidate_list() -> Result<()> {
    let index = build_test_index(10, 4);
    let no_candidates: [usize; 0] = [];
    let distances = index.gpu_batch_distance_calculation(&axis_query(4), &no_candidates)?;
    assert!(distances.is_empty());
    Ok(())
}
/// An empty candidate list on an empty index yields an empty result rather
/// than panicking.
#[test]
fn test_gpu_empty_index_no_panic() -> Result<()> {
    let index = HnswIndex::new(HnswConfig::default())?;
    let query = Vector::new(vec![1.0, 0.0, 0.0]);
    let no_candidates: [usize; 0] = [];
    let distances = index.gpu_batch_distance_calculation(&query, &no_candidates)?;
    assert!(distances.is_empty());
    Ok(())
}
/// An index holding exactly one vector returns exactly one finite distance.
#[test]
fn test_gpu_single_vector_index() -> Result<()> {
    let mut index = HnswIndex::new(HnswConfig::default())?;
    let only_vector = Vector::new(vec![0.6, 0.8, 0.0]);
    index.insert("only".to_string(), only_vector)?;

    let query = Vector::new(vec![1.0, 0.0, 0.0]);
    let distances = index.gpu_batch_distance_calculation(&query, &[0])?;
    assert_eq!(distances.len(), 1);
    assert!(distances[0].is_finite());
    Ok(())
}
/// Output position `i` corresponds to `candidates[i]`, even when the
/// candidate ids are given out of order. The reference values come from
/// scoring each id individually through the CPU path.
#[test]
fn test_gpu_distances_ordered_by_candidate() -> Result<()> {
    let index = build_test_index(10, 4);
    let query = axis_query(4);
    let candidates = [5_usize, 2, 8, 0];
    let distances = index.gpu_batch_distance_calculation(&query, &candidates)?;

    let mut expected: Vec<f32> = Vec::with_capacity(candidates.len());
    for &id in &candidates {
        let single = index
            .cpu_batch_distance_calculation(&query, &[id])
            .expect("cpu distance calculation should succeed");
        expected.push(single[0]);
    }

    assert_eq!(distances.len(), expected.len());
    for (got, want) in distances.iter().zip(expected.iter()) {
        let gap = (got - want).abs();
        assert!(
            gap < 1e-4,
            "ordering mismatch: got {} expected {}",
            got,
            want
        );
    }
    Ok(())
}
/// Querying with the exact vector that is stored must give a distance of
/// (approximately) zero.
#[test]
fn test_gpu_identical_vectors_zero_distance() -> Result<()> {
    let mut index = HnswIndex::new(HnswConfig::default())?;
    let v = Vector::new(vec![0.6, 0.8]);
    index.insert("a".to_string(), v.clone())?;
    let distances = index.gpu_batch_distance_calculation(&v, &[0])?;
    assert_eq!(distances.len(), 1);
    // Bound the magnitude, not just the upper side: a one-sided `< 1e-5`
    // check would also accept an arbitrarily negative (i.e. wrong) distance.
    assert!(
        distances[0].abs() < 1e-5,
        "identical vectors should have ~0 distance, got {}",
        distances[0]
    );
    Ok(())
}
/// Orthogonal vectors have cosine similarity 0, so the expected distance is
/// `1.0` (within `1e-4`).
#[test]
fn test_gpu_orthogonal_vectors_max_cosine_distance() -> Result<()> {
    let mut index = HnswIndex::new(HnswConfig::default())?;
    let y_axis = Vector::new(vec![0.0, 1.0]);
    index.insert("y_axis".to_string(), y_axis)?;

    let x_axis = Vector::new(vec![1.0, 0.0]);
    let distances = index.gpu_batch_distance_calculation(&x_axis, &[0])?;
    assert_eq!(distances.len(), 1);

    let deviation = (distances[0] - 1.0).abs();
    assert!(
        deviation < 1e-4,
        "orthogonal cosine distance should be 1.0, got {}",
        distances[0]
    );
    Ok(())
}
}