oxirs_vec/gpu/
accelerator.rs

1//! Main GPU accelerator implementation
2
3use super::{GpuBuffer, GpuConfig, GpuDevice, GpuPerformanceStats, KernelManager};
4use crate::similarity::SimilarityMetric;
5use anyhow::{anyhow, Result};
6use parking_lot::RwLock;
7use std::collections::HashMap;
8use std::sync::{Arc, Mutex};
9
/// CUDA stream handle
///
/// Owns a raw `cudaStream_t` together with the device it was created on
/// (CUDA streams are device-bound; see `create_cuda_stream`).
#[derive(Debug)]
pub struct CudaStream {
    // Raw cudaStream_t from cudaStreamCreate; in non-CUDA builds this is
    // a dummy non-null sentinel (1) and is never passed to the runtime.
    handle: *mut std::ffi::c_void,
    // Device the stream was created on (set via cudaSetDevice first).
    device_id: i32,
}

// SAFETY: `handle` is an opaque token; the CUDA runtime allows a stream
// to be used from any host thread. NOTE(review): this assumes callers
// never dereference the pointer on the Rust side — confirm no such use
// exists outside this file.
unsafe impl Send for CudaStream {}
unsafe impl Sync for CudaStream {}
19
/// CUDA kernel handle
///
/// Pairs the raw function/module handles with the kernel's name (the key
/// used in the accelerator's kernel cache). With the current stub
/// compiler (`compile_kernel`) both pointers are null.
#[derive(Debug)]
pub struct CudaKernel {
    // Raw device function handle passed to cudaLaunchKernel.
    function: *mut std::ffi::c_void,
    // Raw module handle the function was loaded from.
    module: *mut std::ffi::c_void,
    // Kernel name, e.g. "cosine_similarity" / "euclidean_distance".
    name: String,
}

// SAFETY: the handles are opaque tokens only ever handed back to the CUDA
// runtime, which is callable from any host thread. NOTE(review): sound
// only while no Rust code dereferences these pointers — confirm.
unsafe impl Send for CudaKernel {}
unsafe impl Sync for CudaKernel {}
30
/// Parameters for similarity kernel execution
///
/// Describes one batched similarity job: `query_count` x `db_count`
/// scores over vectors of length `dim`.
#[derive(Debug, Clone)]
pub struct SimilarityKernelParams {
    // Number of query vectors (rows of the result matrix).
    pub query_count: usize,
    // Number of database vectors (columns of the result matrix).
    pub db_count: usize,
    // Dimensionality of every vector.
    pub dim: usize,
    // Kernel name selected from the metric, e.g. "cosine_similarity" or
    // "euclidean_distance" — the CPU fallback dispatches on this string.
    pub metric: String,
}
39
/// GPU acceleration engine for vector operations
///
/// Owns the device handle, a pool of CUDA streams, a cache of compiled
/// kernels, and running performance statistics.
#[derive(Debug)]
pub struct GpuAccelerator {
    // Validated configuration this accelerator was built with.
    config: GpuConfig,
    // Device info queried at construction for the configured device id.
    device: GpuDevice,
    // NOTE(review): initialized empty in `new` and not read by any method
    // in this file — presumably reserved for buffer reuse; confirm.
    memory_pool: Arc<Mutex<Vec<GpuBuffer>>>,
    // Streams created up front (config.stream_count of them).
    stream_pool: Vec<CudaStream>,
    // Compiled-kernel cache keyed by kernel name.
    kernel_cache: Arc<RwLock<HashMap<String, CudaKernel>>>,
    // Shared stats; also handed out via `performance_stats()`.
    performance_stats: Arc<RwLock<GpuPerformanceStats>>,
    // Source of kernel code, consulted by `get_or_compile_kernel`.
    kernel_manager: KernelManager,
}

// SAFETY: all raw pointers held (via CudaStream/CudaKernel) are opaque
// CUDA handles usable from any host thread; shared mutable state is
// behind Mutex/RwLock. NOTE(review): depends on the Send/Sync claims on
// CudaStream and CudaKernel being sound — confirm together.
unsafe impl Send for GpuAccelerator {}
unsafe impl Sync for GpuAccelerator {}
54
55impl GpuAccelerator {
56    pub fn new(config: GpuConfig) -> Result<Self> {
57        config.validate()?;
58
59        let device = GpuDevice::get_device_info(config.device_id)?;
60        let memory_pool = Arc::new(Mutex::new(Vec::new()));
61        let stream_pool = Self::create_streams(config.stream_count, config.device_id)?;
62        let kernel_manager = KernelManager::new();
63
64        Ok(Self {
65            config,
66            device,
67            memory_pool,
68            stream_pool,
69            kernel_cache: Arc::new(RwLock::new(HashMap::new())),
70            performance_stats: Arc::new(RwLock::new(GpuPerformanceStats::new())),
71            kernel_manager,
72        })
73    }
74
75    fn create_streams(count: usize, device_id: i32) -> Result<Vec<CudaStream>> {
76        let mut streams = Vec::new();
77
78        for _ in 0..count {
79            let handle = Self::create_cuda_stream(device_id)?;
80            streams.push(CudaStream { handle, device_id });
81        }
82
83        Ok(streams)
84    }
85
86    #[allow(unused_variables)]
87    fn create_cuda_stream(device_id: i32) -> Result<*mut std::ffi::c_void> {
88        #[cfg(all(feature = "cuda", cuda_runtime_available))]
89        {
90            use cuda_runtime_sys::*;
91            unsafe {
92                let result = cudaSetDevice(device_id);
93                if result != cudaError_t::cudaSuccess {
94                    return Err(anyhow!("Failed to set CUDA device"));
95                }
96
97                let mut stream: cudaStream_t = std::ptr::null_mut();
98                let result = cudaStreamCreate(&mut stream);
99                if result != cudaError_t::cudaSuccess {
100                    return Err(anyhow!("Failed to create CUDA stream"));
101                }
102                Ok(stream as *mut std::ffi::c_void)
103            }
104        }
105
106        #[cfg(not(all(feature = "cuda", cuda_runtime_available)))]
107        {
108            // Fallback: return a dummy handle for testing
109            Ok(1 as *mut std::ffi::c_void)
110        }
111    }
112
113    /// Compute similarity between query vectors and database vectors
114    pub fn compute_similarity(
115        &self,
116        queries: &[f32],
117        database: &[f32],
118        query_count: usize,
119        db_count: usize,
120        dim: usize,
121        metric: SimilarityMetric,
122    ) -> Result<Vec<f32>> {
123        let timer = super::performance::GpuTimer::start("similarity_computation");
124
125        // Allocate GPU buffers
126        let mut query_buffer = GpuBuffer::new(queries.len(), self.config.device_id)?;
127        let mut db_buffer = GpuBuffer::new(database.len(), self.config.device_id)?;
128        let result_buffer = GpuBuffer::new(query_count * db_count, self.config.device_id)?;
129
130        // Copy data to GPU
131        query_buffer.copy_from_host(queries)?;
132        db_buffer.copy_from_host(database)?;
133
134        // Select appropriate kernel
135        let kernel_name = match metric {
136            SimilarityMetric::Cosine => "cosine_similarity",
137            SimilarityMetric::Euclidean => "euclidean_distance",
138            _ => return Err(anyhow!("Unsupported similarity metric for GPU")),
139        };
140
141        // Create kernel parameters
142        let params = SimilarityKernelParams {
143            query_count,
144            db_count,
145            dim,
146            metric: kernel_name.to_string(),
147        };
148
149        // Launch kernel
150        self.launch_similarity_kernel(
151            kernel_name,
152            &query_buffer,
153            &db_buffer,
154            &result_buffer,
155            &params,
156        )?;
157
158        // Copy results back
159        let mut results = vec![0.0f32; query_count * db_count];
160        result_buffer.copy_to_host(&mut results)?;
161
162        // Record performance
163        let duration = timer.stop();
164        self.performance_stats
165            .write()
166            .record_compute_operation(duration);
167
168        Ok(results)
169    }
170
171    fn launch_similarity_kernel(
172        &self,
173        kernel_name: &str,
174        query_buffer: &GpuBuffer,
175        db_buffer: &GpuBuffer,
176        result_buffer: &GpuBuffer,
177        params: &SimilarityKernelParams,
178    ) -> Result<()> {
179        #[cfg(all(feature = "cuda", cuda_runtime_available))]
180        {
181            // Get or compile kernel
182            let kernel = self.get_or_compile_kernel(kernel_name)?;
183
184            // Calculate grid and block dimensions
185            let (blocks, threads) = self
186                .device
187                .calculate_optimal_block_config(params.query_count * params.db_count);
188
189            // Launch kernel
190            self.launch_kernel_impl(
191                &kernel,
192                blocks,
193                threads,
194                &[
195                    query_buffer.ptr() as *mut std::ffi::c_void,
196                    db_buffer.ptr() as *mut std::ffi::c_void,
197                    result_buffer.ptr() as *mut std::ffi::c_void,
198                    &params.query_count as *const usize as *mut std::ffi::c_void,
199                    &params.db_count as *const usize as *mut std::ffi::c_void,
200                    &params.dim as *const usize as *mut std::ffi::c_void,
201                ],
202            )?;
203        }
204
205        #[cfg(not(all(feature = "cuda", cuda_runtime_available)))]
206        {
207            // Fallback CPU implementation for testing
208            self.compute_similarity_cpu(
209                query_buffer,
210                db_buffer,
211                result_buffer,
212                params,
213                kernel_name,
214            )?;
215        }
216
217        Ok(())
218    }
219
220    #[cfg(not(all(feature = "cuda", cuda_runtime_available)))]
221    fn compute_similarity_cpu(
222        &self,
223        _query_buffer: &GpuBuffer,
224        _db_buffer: &GpuBuffer,
225        _result_buffer: &GpuBuffer,
226        params: &SimilarityKernelParams,
227        _metric: &str,
228    ) -> Result<()> {
229        // Simplified CPU fallback
230        let query_data = vec![0.0f32; params.query_count * params.dim];
231        let db_data = vec![0.0f32; params.db_count * params.dim];
232        let mut results = vec![0.0f32; params.query_count * params.db_count];
233
234        // Copy data from "GPU" buffers (actually host memory in fallback)
235        // In real implementation, this would be proper GPU memory access
236
237        for i in 0..params.query_count {
238            for j in 0..params.db_count {
239                let query_vec = &query_data[i * params.dim..(i + 1) * params.dim];
240                let db_vec = &db_data[j * params.dim..(j + 1) * params.dim];
241
242                let similarity = match params.metric.as_str() {
243                    "cosine_similarity" => self.compute_cosine_similarity(query_vec, db_vec),
244                    "euclidean_distance" => self.compute_euclidean_distance(query_vec, db_vec),
245                    _ => 0.0,
246                };
247
248                results[i * params.db_count + j] = similarity;
249            }
250        }
251
252        Ok(())
253    }
254
255    #[cfg(not(all(feature = "cuda", cuda_runtime_available)))]
256    fn compute_cosine_similarity(&self, a: &[f32], b: &[f32]) -> f32 {
257        let dot: f32 = a.iter().zip(b.iter()).map(|(x, y)| x * y).sum();
258        let norm_a: f32 = a.iter().map(|x| x * x).sum::<f32>().sqrt();
259        let norm_b: f32 = b.iter().map(|x| x * x).sum::<f32>().sqrt();
260
261        if norm_a > 1e-8 && norm_b > 1e-8 {
262            dot / (norm_a * norm_b)
263        } else {
264            0.0
265        }
266    }
267
268    #[cfg(not(all(feature = "cuda", cuda_runtime_available)))]
269    fn compute_euclidean_distance(&self, a: &[f32], b: &[f32]) -> f32 {
270        a.iter()
271            .zip(b.iter())
272            .map(|(x, y)| (x - y).powi(2))
273            .sum::<f32>()
274            .sqrt()
275    }
276
277    fn get_or_compile_kernel(&self, name: &str) -> Result<CudaKernel> {
278        // Check if kernel is already compiled
279        if let Some(kernel) = self.kernel_cache.read().get(name) {
280            return Ok(CudaKernel {
281                function: kernel.function,
282                module: kernel.module,
283                name: kernel.name.clone(),
284            });
285        }
286
287        // Compile kernel
288        let kernel_source = self
289            .kernel_manager
290            .get_kernel(name)
291            .ok_or_else(|| anyhow!("Kernel {} not found", name))?;
292
293        let compiled_kernel = self.compile_kernel(name, kernel_source)?;
294
295        // Cache the compiled kernel
296        self.kernel_cache.write().insert(
297            name.to_string(),
298            CudaKernel {
299                function: compiled_kernel.function,
300                module: compiled_kernel.module,
301                name: compiled_kernel.name.clone(),
302            },
303        );
304
305        Ok(compiled_kernel)
306    }
307
308    #[allow(unused_variables)]
309    fn compile_kernel(&self, name: &str, source: &str) -> Result<CudaKernel> {
310        #[cfg(all(feature = "cuda", cuda_runtime_available))]
311        {
312            // In a real implementation, this would use NVRTC or similar to compile CUDA kernels
313            // For now, return a dummy kernel
314            Ok(CudaKernel {
315                function: std::ptr::null_mut(),
316                module: std::ptr::null_mut(),
317                name: name.to_string(),
318            })
319        }
320
321        #[cfg(not(all(feature = "cuda", cuda_runtime_available)))]
322        {
323            Ok(CudaKernel {
324                function: std::ptr::null_mut(),
325                module: std::ptr::null_mut(),
326                name: name.to_string(),
327            })
328        }
329    }
330
331    #[cfg(all(feature = "cuda", cuda_runtime_available))]
332    fn launch_kernel_impl(
333        &self,
334        kernel: &CudaKernel,
335        blocks: i32,
336        threads: i32,
337        args: &[*mut std::ffi::c_void],
338    ) -> Result<()> {
339        use cuda_runtime_sys::*;
340        unsafe {
341            let result = cudaLaunchKernel(
342                kernel.function,
343                dim3 {
344                    x: blocks as u32,
345                    y: 1,
346                    z: 1,
347                },
348                dim3 {
349                    x: threads as u32,
350                    y: 1,
351                    z: 1,
352                },
353                args.as_ptr() as *mut *mut std::ffi::c_void,
354                0,
355                std::ptr::null_mut(),
356            );
357            if result != cudaError_t::cudaSuccess {
358                return Err(anyhow!("Failed to launch kernel"));
359            }
360
361            // Synchronize
362            let result = cudaDeviceSynchronize();
363            if result != cudaError_t::cudaSuccess {
364                return Err(anyhow!("Kernel execution failed"));
365            }
366        }
367        Ok(())
368    }
369
370    /// Get device information
371    pub fn device(&self) -> &GpuDevice {
372        &self.device
373    }
374
375    /// Get configuration
376    pub fn config(&self) -> &GpuConfig {
377        &self.config
378    }
379
380    /// Get performance statistics
381    pub fn performance_stats(&self) -> Arc<RwLock<GpuPerformanceStats>> {
382        self.performance_stats.clone()
383    }
384
385    /// Synchronize all operations
386    pub fn synchronize(&self) -> Result<()> {
387        #[cfg(all(feature = "cuda", cuda_runtime_available))]
388        {
389            use cuda_runtime_sys::*;
390            unsafe {
391                let result = cudaDeviceSynchronize();
392                if result != cudaError_t::cudaSuccess {
393                    return Err(anyhow!("Failed to synchronize device"));
394                }
395            }
396        }
397        Ok(())
398    }
399
400    /// Reset performance statistics
401    pub fn reset_stats(&self) {
402        self.performance_stats.write().reset();
403    }
404
405    /// Get current GPU memory usage in bytes
406    pub fn get_memory_usage(&self) -> Result<usize> {
407        #[cfg(all(feature = "cuda", cuda_runtime_available))]
408        {
409            use cuda_runtime_sys::*;
410            unsafe {
411                let mut free: usize = 0;
412                let mut total: usize = 0;
413                let result = cudaMemGetInfo(&mut free as *mut usize, &mut total as *mut usize);
414                if result != cudaError_t::cudaSuccess {
415                    return Err(anyhow!("Failed to get memory info"));
416                }
417                Ok(total - free)
418            }
419        }
420        #[cfg(not(all(feature = "cuda", cuda_runtime_available)))]
421        {
422            // Return dummy value for testing
423            Ok(0)
424        }
425    }
426}
427
428impl Drop for GpuAccelerator {
429    fn drop(&mut self) {
430        // Cleanup CUDA streams
431        #[cfg(all(feature = "cuda", cuda_runtime_available))]
432        {
433            for stream in &self.stream_pool {
434                unsafe {
435                    let _ = cuda_runtime_sys::cudaStreamDestroy(
436                        stream.handle as cuda_runtime_sys::cudaStream_t,
437                    );
438                }
439            }
440        }
441    }
442}
443
/// Check if GPU acceleration is available
///
/// True only when the crate was built with the `cuda` feature, a CUDA
/// runtime was detected at build time, and at least one device exists.
pub fn is_gpu_available() -> bool {
    #[cfg(all(feature = "cuda", cuda_runtime_available))]
    {
        crate::gpu::device::GpuDevice::get_all_devices()
            .map(|devices| !devices.is_empty())
            .unwrap_or(false)
    }
    #[cfg(not(all(feature = "cuda", cuda_runtime_available)))]
    {
        false
    }
}
458
459/// Create a default GPU accelerator configuration
460pub fn create_default_accelerator() -> Result<GpuAccelerator> {
461    let config = GpuConfig::default();
462    GpuAccelerator::new(config)
463}
464
465/// Create a performance-optimized GPU accelerator
466pub fn create_performance_accelerator() -> Result<GpuAccelerator> {
467    let config = GpuConfig {
468        optimization_level: crate::gpu::OptimizationLevel::Performance,
469        precision_mode: crate::gpu::PrecisionMode::FP32,
470        memory_pool_size: 1024 * 1024 * 1024, // 1GB
471        batch_size: 10000,
472        enable_tensor_cores: true,
473        enable_mixed_precision: false,
474        ..Default::default()
475    };
476    GpuAccelerator::new(config)
477}
478
479/// Create a memory-optimized GPU accelerator
480pub fn create_memory_optimized_accelerator() -> Result<GpuAccelerator> {
481    let config = GpuConfig {
482        optimization_level: crate::gpu::OptimizationLevel::Balanced,
483        precision_mode: crate::gpu::PrecisionMode::FP16,
484        memory_pool_size: 256 * 1024 * 1024, // 256MB
485        batch_size: 1000,
486        enable_tensor_cores: true,
487        enable_mixed_precision: true,
488        ..Default::default()
489    };
490    GpuAccelerator::new(config)
491}