oxirs_vec/gpu/accelerator.rs

//! Main GPU accelerator implementation

use super::{GpuBuffer, GpuConfig, GpuDevice, GpuPerformanceStats, KernelManager};
use crate::similarity::SimilarityMetric;
use anyhow::{anyhow, Result};
use parking_lot::RwLock;
use std::collections::HashMap;
use std::sync::{Arc, Mutex};

/// CUDA stream handle
#[derive(Debug)]
pub struct CudaStream {
    handle: *mut std::ffi::c_void,
    device_id: i32,
}

unsafe impl Send for CudaStream {}
unsafe impl Sync for CudaStream {}

/// CUDA kernel handle
#[derive(Debug)]
pub struct CudaKernel {
    function: *mut std::ffi::c_void,
    module: *mut std::ffi::c_void,
    name: String,
}

unsafe impl Send for CudaKernel {}
unsafe impl Sync for CudaKernel {}

/// Parameters for similarity kernel execution
#[derive(Debug, Clone)]
pub struct SimilarityKernelParams {
    pub query_count: usize,
    pub db_count: usize,
    pub dim: usize,
    pub metric: String,
}

/// GPU acceleration engine for vector operations
#[derive(Debug)]
pub struct GpuAccelerator {
    config: GpuConfig,
    device: GpuDevice,
    memory_pool: Arc<Mutex<Vec<GpuBuffer>>>,
    stream_pool: Vec<CudaStream>,
    kernel_cache: Arc<RwLock<HashMap<String, CudaKernel>>>,
    performance_stats: Arc<RwLock<GpuPerformanceStats>>,
    kernel_manager: KernelManager,
}

unsafe impl Send for GpuAccelerator {}
unsafe impl Sync for GpuAccelerator {}

impl GpuAccelerator {
    pub fn new(config: GpuConfig) -> Result<Self> {
        config.validate()?;

        let device = GpuDevice::get_device_info(config.device_id)?;
        let memory_pool = Arc::new(Mutex::new(Vec::new()));
        let stream_pool = Self::create_streams(config.stream_count, config.device_id)?;
        let kernel_manager = KernelManager::new();

        Ok(Self {
            config,
            device,
            memory_pool,
            stream_pool,
            kernel_cache: Arc::new(RwLock::new(HashMap::new())),
            performance_stats: Arc::new(RwLock::new(GpuPerformanceStats::new())),
            kernel_manager,
        })
    }

    fn create_streams(count: usize, device_id: i32) -> Result<Vec<CudaStream>> {
        let mut streams = Vec::new();

        for _ in 0..count {
            let handle = Self::create_cuda_stream(device_id)?;
            streams.push(CudaStream { handle, device_id });
        }

        Ok(streams)
    }

    #[allow(unused_variables)]
    fn create_cuda_stream(device_id: i32) -> Result<*mut std::ffi::c_void> {
        #[cfg(feature = "cuda")]
        {
            use cuda_runtime_sys::*;
            unsafe {
                let result = cudaSetDevice(device_id);
                if result != cudaError_t::cudaSuccess {
                    return Err(anyhow!("Failed to set CUDA device"));
                }

                let mut stream: cudaStream_t = std::ptr::null_mut();
                let result = cudaStreamCreate(&mut stream);
                if result != cudaError_t::cudaSuccess {
                    return Err(anyhow!("Failed to create CUDA stream"));
                }
                Ok(stream as *mut std::ffi::c_void)
            }
        }

        #[cfg(not(feature = "cuda"))]
        {
            // Fallback: return a dummy handle for testing
            Ok(1 as *mut std::ffi::c_void)
        }
    }

    /// Compute similarity between query vectors and database vectors
    pub fn compute_similarity(
        &self,
        queries: &[f32],
        database: &[f32],
        query_count: usize,
        db_count: usize,
        dim: usize,
        metric: SimilarityMetric,
    ) -> Result<Vec<f32>> {
        let timer = super::performance::GpuTimer::start("similarity_computation");

        // Allocate GPU buffers
        let mut query_buffer = GpuBuffer::new(queries.len(), self.config.device_id)?;
        let mut db_buffer = GpuBuffer::new(database.len(), self.config.device_id)?;
        let result_buffer = GpuBuffer::new(query_count * db_count, self.config.device_id)?;

        // Copy data to GPU
        query_buffer.copy_from_host(queries)?;
        db_buffer.copy_from_host(database)?;

        // Select appropriate kernel
        let kernel_name = match metric {
            SimilarityMetric::Cosine => "cosine_similarity",
            SimilarityMetric::Euclidean => "euclidean_distance",
            _ => return Err(anyhow!("Unsupported similarity metric for GPU")),
        };

        // Create kernel parameters
        let params = SimilarityKernelParams {
            query_count,
            db_count,
            dim,
            metric: kernel_name.to_string(),
        };

        // Launch kernel
        self.launch_similarity_kernel(
            kernel_name,
            &query_buffer,
            &db_buffer,
            &result_buffer,
            &params,
        )?;

        // Copy results back
        let mut results = vec![0.0f32; query_count * db_count];
        result_buffer.copy_to_host(&mut results)?;

        // Record performance
        let duration = timer.stop();
        self.performance_stats
            .write()
            .record_compute_operation(duration);

        Ok(results)
    }

    fn launch_similarity_kernel(
        &self,
        kernel_name: &str,
        query_buffer: &GpuBuffer,
        db_buffer: &GpuBuffer,
        result_buffer: &GpuBuffer,
        params: &SimilarityKernelParams,
    ) -> Result<()> {
        #[cfg(feature = "cuda")]
        {
            // Get or compile kernel
            let kernel = self.get_or_compile_kernel(kernel_name)?;

            // Calculate grid and block dimensions
            let (blocks, threads) = self
                .device
                .calculate_optimal_block_config(params.query_count * params.db_count);

            // Launch kernel
            self.launch_kernel_impl(
                &kernel,
                blocks,
                threads,
                &[
                    query_buffer.ptr() as *mut std::ffi::c_void,
                    db_buffer.ptr() as *mut std::ffi::c_void,
                    result_buffer.ptr() as *mut std::ffi::c_void,
                    &params.query_count as *const usize as *mut std::ffi::c_void,
                    &params.db_count as *const usize as *mut std::ffi::c_void,
                    &params.dim as *const usize as *mut std::ffi::c_void,
                ],
            )?;
        }

        #[cfg(not(feature = "cuda"))]
        {
            // Fallback CPU implementation for testing
            self.compute_similarity_cpu(
                query_buffer,
                db_buffer,
                result_buffer,
                params,
                kernel_name,
            )?;
        }

        Ok(())
    }

    #[cfg(not(feature = "cuda"))]
    fn compute_similarity_cpu(
        &self,
        query_buffer: &GpuBuffer,
        db_buffer: &GpuBuffer,
        _result_buffer: &GpuBuffer,
        params: &SimilarityKernelParams,
        _metric: &str,
    ) -> Result<()> {
        // Simplified CPU fallback: in the non-CUDA build the "GPU" buffers are
        // backed by host memory, so their contents can be read back directly.
        let mut query_data = vec![0.0f32; params.query_count * params.dim];
        let mut db_data = vec![0.0f32; params.db_count * params.dim];
        let mut results = vec![0.0f32; params.query_count * params.db_count];

        query_buffer.copy_to_host(&mut query_data)?;
        db_buffer.copy_to_host(&mut db_data)?;

        for i in 0..params.query_count {
            for j in 0..params.db_count {
                let query_vec = &query_data[i * params.dim..(i + 1) * params.dim];
                let db_vec = &db_data[j * params.dim..(j + 1) * params.dim];

                let similarity = match params.metric.as_str() {
                    "cosine_similarity" => self.compute_cosine_similarity(query_vec, db_vec),
                    "euclidean_distance" => self.compute_euclidean_distance(query_vec, db_vec),
                    _ => 0.0,
                };

                results[i * params.db_count + j] = similarity;
            }
        }

        // NOTE: this simplified fallback only receives a shared reference to the
        // result buffer, so the computed scores are not written back here; a full
        // implementation would copy `results` into `_result_buffer`.
        Ok(())
    }

    #[cfg(not(feature = "cuda"))]
    fn compute_cosine_similarity(&self, a: &[f32], b: &[f32]) -> f32 {
        let dot: f32 = a.iter().zip(b.iter()).map(|(x, y)| x * y).sum();
        let norm_a: f32 = a.iter().map(|x| x * x).sum::<f32>().sqrt();
        let norm_b: f32 = b.iter().map(|x| x * x).sum::<f32>().sqrt();

        if norm_a > 1e-8 && norm_b > 1e-8 {
            dot / (norm_a * norm_b)
        } else {
            0.0
        }
    }

    #[cfg(not(feature = "cuda"))]
    fn compute_euclidean_distance(&self, a: &[f32], b: &[f32]) -> f32 {
        a.iter()
            .zip(b.iter())
            .map(|(x, y)| (x - y).powi(2))
            .sum::<f32>()
            .sqrt()
    }

    fn get_or_compile_kernel(&self, name: &str) -> Result<CudaKernel> {
        // Check if kernel is already compiled
        if let Some(kernel) = self.kernel_cache.read().get(name) {
            return Ok(CudaKernel {
                function: kernel.function,
                module: kernel.module,
                name: kernel.name.clone(),
            });
        }

        // Compile kernel
        let kernel_source = self
            .kernel_manager
            .get_kernel(name)
            .ok_or_else(|| anyhow!("Kernel {} not found", name))?;

        let compiled_kernel = self.compile_kernel(name, kernel_source)?;

        // Cache the compiled kernel
        self.kernel_cache.write().insert(
            name.to_string(),
            CudaKernel {
                function: compiled_kernel.function,
                module: compiled_kernel.module,
                name: compiled_kernel.name.clone(),
            },
        );

        Ok(compiled_kernel)
    }

    #[allow(unused_variables)]
    fn compile_kernel(&self, name: &str, source: &str) -> Result<CudaKernel> {
        #[cfg(feature = "cuda")]
        {
            // In a real implementation, this would use NVRTC or the driver API to
            // compile and load the CUDA kernel source. For now, return a dummy kernel.
            Ok(CudaKernel {
                function: std::ptr::null_mut(),
                module: std::ptr::null_mut(),
                name: name.to_string(),
            })
        }

        #[cfg(not(feature = "cuda"))]
        {
            Ok(CudaKernel {
                function: std::ptr::null_mut(),
                module: std::ptr::null_mut(),
                name: name.to_string(),
            })
        }
    }

    #[cfg(feature = "cuda")]
    fn launch_kernel_impl(
        &self,
        kernel: &CudaKernel,
        blocks: i32,
        threads: i32,
        args: &[*mut std::ffi::c_void],
    ) -> Result<()> {
        use cuda_runtime_sys::*;
        unsafe {
            let result = cudaLaunchKernel(
                kernel.function,
                dim3 {
                    x: blocks as u32,
                    y: 1,
                    z: 1,
                },
                dim3 {
                    x: threads as u32,
                    y: 1,
                    z: 1,
                },
                args.as_ptr() as *mut *mut std::ffi::c_void,
                0,
                std::ptr::null_mut(),
            );
            if result != cudaError_t::cudaSuccess {
                return Err(anyhow!("Failed to launch kernel"));
            }

            // Synchronize
            let result = cudaDeviceSynchronize();
            if result != cudaError_t::cudaSuccess {
                return Err(anyhow!("Kernel execution failed"));
            }
        }
        Ok(())
    }

    /// Get device information
    pub fn device(&self) -> &GpuDevice {
        &self.device
    }

    /// Get configuration
    pub fn config(&self) -> &GpuConfig {
        &self.config
    }

    /// Get performance statistics
    pub fn performance_stats(&self) -> Arc<RwLock<GpuPerformanceStats>> {
        self.performance_stats.clone()
    }

    /// Synchronize all operations
    pub fn synchronize(&self) -> Result<()> {
        #[cfg(feature = "cuda")]
        {
            use cuda_runtime_sys::*;
            unsafe {
                let result = cudaDeviceSynchronize();
                if result != cudaError_t::cudaSuccess {
                    return Err(anyhow!("Failed to synchronize device"));
                }
            }
        }
        Ok(())
    }

    /// Reset performance statistics
    pub fn reset_stats(&self) {
        self.performance_stats.write().reset();
    }

    /// Get current GPU memory usage in bytes
    pub fn get_memory_usage(&self) -> Result<usize> {
        #[cfg(feature = "cuda")]
        {
            use cuda_runtime_sys::*;
            unsafe {
                let mut free: usize = 0;
                let mut total: usize = 0;
                let result = cudaMemGetInfo(&mut free as *mut usize, &mut total as *mut usize);
                if result != cudaError_t::cudaSuccess {
                    return Err(anyhow!("Failed to get memory info"));
                }
                Ok(total - free)
            }
        }
        #[cfg(not(feature = "cuda"))]
        {
            // Return dummy value for testing
            Ok(0)
        }
    }
}
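
// Illustrative usage sketch (not part of the original API surface): how a caller
// might drive `compute_similarity` and interpret the returned scores. Results are
// laid out query-major: `results[i * db_count + j]` is the score between query `i`
// and database vector `j`. Assumes device 0 is usable or the non-CUDA fallback is
// active in this build.
#[allow(dead_code)]
fn example_compute_similarity_usage() -> Result<()> {
    let accelerator = GpuAccelerator::new(GpuConfig::default())?;

    // Two query vectors and three database vectors, all of dimension 4,
    // stored contiguously in row-major order.
    let queries: Vec<f32> = vec![
        1.0, 0.0, 0.0, 0.0, // query 0
        0.0, 1.0, 0.0, 0.0, // query 1
    ];
    let database: Vec<f32> = vec![
        1.0, 0.0, 0.0, 0.0, // db 0
        0.0, 1.0, 0.0, 0.0, // db 1
        0.5, 0.5, 0.0, 0.0, // db 2
    ];

    let results =
        accelerator.compute_similarity(&queries, &database, 2, 3, 4, SimilarityMetric::Cosine)?;
    assert_eq!(results.len(), 2 * 3);

    // Score of query 1 against database vector 2.
    let _score = results[1 * 3 + 2];
    Ok(())
}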

impl Drop for GpuAccelerator {
    fn drop(&mut self) {
        // Cleanup CUDA streams
        #[cfg(feature = "cuda")]
        {
            for stream in &self.stream_pool {
                unsafe {
                    let _ = cuda_runtime_sys::cudaStreamDestroy(
                        stream.handle as cuda_runtime_sys::cudaStream_t,
                    );
                }
            }
        }
    }
}

/// Check if GPU acceleration is available
pub fn is_gpu_available() -> bool {
    #[cfg(feature = "cuda")]
    {
        match crate::gpu::device::GpuDevice::get_all_devices() {
            Ok(devices) => !devices.is_empty(),
            Err(_) => false,
        }
    }
    #[cfg(not(feature = "cuda"))]
    {
        false
    }
}
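
// Illustrative sketch (hypothetical helper, not part of the original API): callers
// typically gate accelerator creation on `is_gpu_available()` and fall back to a
// CPU path otherwise. The factory functions used here are defined below in this
// module.
#[allow(dead_code)]
fn try_create_accelerator() -> Option<GpuAccelerator> {
    if is_gpu_available() {
        // Prefer the performance-oriented configuration when a device is present,
        // falling back to the default configuration if that fails.
        create_performance_accelerator()
            .or_else(|_| create_default_accelerator())
            .ok()
    } else {
        // No CUDA device (or the feature is disabled): let the caller use a CPU path.
        None
    }
}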

/// Create a GPU accelerator with the default configuration
pub fn create_default_accelerator() -> Result<GpuAccelerator> {
    let config = GpuConfig::default();
    GpuAccelerator::new(config)
}

/// Create a performance-optimized GPU accelerator
pub fn create_performance_accelerator() -> Result<GpuAccelerator> {
    let config = GpuConfig {
        optimization_level: crate::gpu::OptimizationLevel::Performance,
        precision_mode: crate::gpu::PrecisionMode::FP32,
        memory_pool_size: 1024 * 1024 * 1024, // 1GB
        batch_size: 10000,
        enable_tensor_cores: true,
        enable_mixed_precision: false,
        ..Default::default()
    };
    GpuAccelerator::new(config)
}

/// Create a memory-optimized GPU accelerator
pub fn create_memory_optimized_accelerator() -> Result<GpuAccelerator> {
    let config = GpuConfig {
        optimization_level: crate::gpu::OptimizationLevel::Balanced,
        precision_mode: crate::gpu::PrecisionMode::FP16,
        memory_pool_size: 256 * 1024 * 1024, // 256MB
        batch_size: 1000,
        enable_tensor_cores: true,
        enable_mixed_precision: true,
        ..Default::default()
    };
    GpuAccelerator::new(config)
}
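
// Minimal smoke tests (added as an illustrative sketch): they deliberately avoid
// constructing a `GpuAccelerator`, since that requires a usable device (or the
// fallback device query) in the test environment, and so can run whether or not
// the `cuda` feature is enabled.
#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn gpu_availability_query_does_not_panic() {
        // The result depends on build features and host hardware; we only check
        // that the probe itself is safe to call.
        let _available = is_gpu_available();
    }

    #[test]
    fn similarity_kernel_params_hold_their_fields() {
        let params = SimilarityKernelParams {
            query_count: 2,
            db_count: 3,
            dim: 4,
            metric: "cosine_similarity".to_string(),
        };
        assert_eq!(params.query_count * params.db_count, 6);
        assert_eq!(params.metric, "cosine_similarity");
    }
}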