1use super::{GpuBuffer, GpuConfig, GpuDevice, GpuPerformanceStats, KernelManager};
4use crate::similarity::SimilarityMetric;
5use anyhow::{anyhow, Result};
6use parking_lot::RwLock;
7use std::collections::HashMap;
8use std::sync::{Arc, Mutex};
9
/// A stream handle paired with the device ordinal it was created for.
///
/// The handle is kept as an untyped raw pointer so this type can be declared
/// whether or not the `cuda` feature is enabled.
#[derive(Debug)]
pub struct CudaStream {
    /// Raw stream handle produced by `GpuAccelerator::create_cuda_stream`.
    handle: *mut std::ffi::c_void,
    /// Ordinal of the device that was selected when the stream was created.
    device_id: i32,
}

// SAFETY: NOTE(review) — the raw pointer field makes this type `!Send`/`!Sync`
// by default. These impls assert that moving or sharing the handle between
// threads is sound; nothing in this file enforces that, so it presumably relies
// on the underlying runtime's thread-safety guarantees — confirm.
unsafe impl Send for CudaStream {}
unsafe impl Sync for CudaStream {}
19
/// Handles describing one compiled kernel, together with its lookup name.
///
/// The type is deliberately not `Clone`; code that needs a second copy
/// duplicates the fields by hand.
#[derive(Debug)]
pub struct CudaKernel {
    /// Raw handle to the kernel entry point (null when nothing was compiled).
    function: *mut std::ffi::c_void,
    /// Raw handle to the module the kernel belongs to (null when nothing was compiled).
    module: *mut std::ffi::c_void,
    /// Name the kernel was requested under.
    name: String,
}

// SAFETY: NOTE(review) — the raw pointer fields make this type `!Send`/`!Sync`
// by default. These impls assert that the handles may be moved and shared
// across threads; this file does not demonstrate why that holds — confirm.
unsafe impl Send for CudaKernel {}
unsafe impl Sync for CudaKernel {}
30
/// Problem dimensions and metric selector handed to a similarity kernel launch.
#[derive(Debug, Clone)]
pub struct SimilarityKernelParams {
    /// Number of query vectors.
    pub query_count: usize,
    /// Number of database vectors.
    pub db_count: usize,
    /// Number of components in each vector.
    pub dim: usize,
    /// Kernel name identifying the metric, e.g. `"cosine_similarity"` or
    /// `"euclidean_distance"`.
    pub metric: String,
}
39
40#[derive(Debug)]
42pub struct GpuAccelerator {
43 config: GpuConfig,
44 device: GpuDevice,
45 memory_pool: Arc<Mutex<Vec<GpuBuffer>>>,
46 stream_pool: Vec<CudaStream>,
47 kernel_cache: Arc<RwLock<HashMap<String, CudaKernel>>>,
48 performance_stats: Arc<RwLock<GpuPerformanceStats>>,
49 kernel_manager: KernelManager,
50}
51
52unsafe impl Send for GpuAccelerator {}
53unsafe impl Sync for GpuAccelerator {}
54
55impl GpuAccelerator {
56 pub fn new(config: GpuConfig) -> Result<Self> {
57 config.validate()?;
58
59 let device = GpuDevice::get_device_info(config.device_id)?;
60 let memory_pool = Arc::new(Mutex::new(Vec::new()));
61 let stream_pool = Self::create_streams(config.stream_count, config.device_id)?;
62 let kernel_manager = KernelManager::new();
63
64 Ok(Self {
65 config,
66 device,
67 memory_pool,
68 stream_pool,
69 kernel_cache: Arc::new(RwLock::new(HashMap::new())),
70 performance_stats: Arc::new(RwLock::new(GpuPerformanceStats::new())),
71 kernel_manager,
72 })
73 }
74
75 fn create_streams(count: usize, device_id: i32) -> Result<Vec<CudaStream>> {
76 let mut streams = Vec::new();
77
78 for _ in 0..count {
79 let handle = Self::create_cuda_stream(device_id)?;
80 streams.push(CudaStream { handle, device_id });
81 }
82
83 Ok(streams)
84 }
85
86 #[allow(unused_variables)]
87 fn create_cuda_stream(device_id: i32) -> Result<*mut std::ffi::c_void> {
88 #[cfg(feature = "cuda")]
89 {
90 use cuda_runtime_sys::*;
91 unsafe {
92 let result = cudaSetDevice(device_id);
93 if result != cudaError_t::cudaSuccess {
94 return Err(anyhow!("Failed to set CUDA device"));
95 }
96
97 let mut stream: cudaStream_t = std::ptr::null_mut();
98 let result = cudaStreamCreate(&mut stream);
99 if result != cudaError_t::cudaSuccess {
100 return Err(anyhow!("Failed to create CUDA stream"));
101 }
102 Ok(stream as *mut std::ffi::c_void)
103 }
104 }
105
106 #[cfg(not(feature = "cuda"))]
107 {
108 Ok(1 as *mut std::ffi::c_void)
110 }
111 }
112
113 pub fn compute_similarity(
115 &self,
116 queries: &[f32],
117 database: &[f32],
118 query_count: usize,
119 db_count: usize,
120 dim: usize,
121 metric: SimilarityMetric,
122 ) -> Result<Vec<f32>> {
123 let timer = super::performance::GpuTimer::start("similarity_computation");
124
125 let mut query_buffer = GpuBuffer::new(queries.len(), self.config.device_id)?;
127 let mut db_buffer = GpuBuffer::new(database.len(), self.config.device_id)?;
128 let result_buffer = GpuBuffer::new(query_count * db_count, self.config.device_id)?;
129
130 query_buffer.copy_from_host(queries)?;
132 db_buffer.copy_from_host(database)?;
133
134 let kernel_name = match metric {
136 SimilarityMetric::Cosine => "cosine_similarity",
137 SimilarityMetric::Euclidean => "euclidean_distance",
138 _ => return Err(anyhow!("Unsupported similarity metric for GPU")),
139 };
140
141 let params = SimilarityKernelParams {
143 query_count,
144 db_count,
145 dim,
146 metric: kernel_name.to_string(),
147 };
148
149 self.launch_similarity_kernel(
151 kernel_name,
152 &query_buffer,
153 &db_buffer,
154 &result_buffer,
155 ¶ms,
156 )?;
157
158 let mut results = vec![0.0f32; query_count * db_count];
160 result_buffer.copy_to_host(&mut results)?;
161
162 let duration = timer.stop();
164 self.performance_stats
165 .write()
166 .record_compute_operation(duration);
167
168 Ok(results)
169 }
170
171 fn launch_similarity_kernel(
172 &self,
173 kernel_name: &str,
174 query_buffer: &GpuBuffer,
175 db_buffer: &GpuBuffer,
176 result_buffer: &GpuBuffer,
177 params: &SimilarityKernelParams,
178 ) -> Result<()> {
179 #[cfg(feature = "cuda")]
180 {
181 let kernel = self.get_or_compile_kernel(kernel_name)?;
183
184 let (blocks, threads) = self
186 .device
187 .calculate_optimal_block_config(params.query_count * params.db_count);
188
189 self.launch_kernel_impl(
191 &kernel,
192 blocks,
193 threads,
194 &[
195 query_buffer.ptr() as *mut std::ffi::c_void,
196 db_buffer.ptr() as *mut std::ffi::c_void,
197 result_buffer.ptr() as *mut std::ffi::c_void,
198 ¶ms.query_count as *const usize as *mut std::ffi::c_void,
199 ¶ms.db_count as *const usize as *mut std::ffi::c_void,
200 ¶ms.dim as *const usize as *mut std::ffi::c_void,
201 ],
202 )?;
203 }
204
205 #[cfg(not(feature = "cuda"))]
206 {
207 self.compute_similarity_cpu(
209 query_buffer,
210 db_buffer,
211 result_buffer,
212 params,
213 kernel_name,
214 )?;
215 }
216
217 Ok(())
218 }
219
220 #[cfg(not(feature = "cuda"))]
221 fn compute_similarity_cpu(
222 &self,
223 _query_buffer: &GpuBuffer,
224 _db_buffer: &GpuBuffer,
225 _result_buffer: &GpuBuffer,
226 params: &SimilarityKernelParams,
227 _metric: &str,
228 ) -> Result<()> {
229 let query_data = vec![0.0f32; params.query_count * params.dim];
231 let db_data = vec![0.0f32; params.db_count * params.dim];
232 let mut results = vec![0.0f32; params.query_count * params.db_count];
233
234 for i in 0..params.query_count {
238 for j in 0..params.db_count {
239 let query_vec = &query_data[i * params.dim..(i + 1) * params.dim];
240 let db_vec = &db_data[j * params.dim..(j + 1) * params.dim];
241
242 let similarity = match params.metric.as_str() {
243 "cosine_similarity" => self.compute_cosine_similarity(query_vec, db_vec),
244 "euclidean_distance" => self.compute_euclidean_distance(query_vec, db_vec),
245 _ => 0.0,
246 };
247
248 results[i * params.db_count + j] = similarity;
249 }
250 }
251
252 Ok(())
253 }
254
255 #[cfg(not(feature = "cuda"))]
256 fn compute_cosine_similarity(&self, a: &[f32], b: &[f32]) -> f32 {
257 let dot: f32 = a.iter().zip(b.iter()).map(|(x, y)| x * y).sum();
258 let norm_a: f32 = a.iter().map(|x| x * x).sum::<f32>().sqrt();
259 let norm_b: f32 = b.iter().map(|x| x * x).sum::<f32>().sqrt();
260
261 if norm_a > 1e-8 && norm_b > 1e-8 {
262 dot / (norm_a * norm_b)
263 } else {
264 0.0
265 }
266 }
267
268 #[cfg(not(feature = "cuda"))]
269 fn compute_euclidean_distance(&self, a: &[f32], b: &[f32]) -> f32 {
270 a.iter()
271 .zip(b.iter())
272 .map(|(x, y)| (x - y).powi(2))
273 .sum::<f32>()
274 .sqrt()
275 }
276
277 fn get_or_compile_kernel(&self, name: &str) -> Result<CudaKernel> {
278 if let Some(kernel) = self.kernel_cache.read().get(name) {
280 return Ok(CudaKernel {
281 function: kernel.function,
282 module: kernel.module,
283 name: kernel.name.clone(),
284 });
285 }
286
287 let kernel_source = self
289 .kernel_manager
290 .get_kernel(name)
291 .ok_or_else(|| anyhow!("Kernel {} not found", name))?;
292
293 let compiled_kernel = self.compile_kernel(name, kernel_source)?;
294
295 self.kernel_cache.write().insert(
297 name.to_string(),
298 CudaKernel {
299 function: compiled_kernel.function,
300 module: compiled_kernel.module,
301 name: compiled_kernel.name.clone(),
302 },
303 );
304
305 Ok(compiled_kernel)
306 }
307
308 #[allow(unused_variables)]
309 fn compile_kernel(&self, name: &str, source: &str) -> Result<CudaKernel> {
310 #[cfg(feature = "cuda")]
311 {
312 use cuda_runtime_sys::*;
313 Ok(CudaKernel {
316 function: std::ptr::null_mut(),
317 module: std::ptr::null_mut(),
318 name: name.to_string(),
319 })
320 }
321
322 #[cfg(not(feature = "cuda"))]
323 {
324 Ok(CudaKernel {
325 function: std::ptr::null_mut(),
326 module: std::ptr::null_mut(),
327 name: name.to_string(),
328 })
329 }
330 }
331
/// Launches `kernel` on a one-dimensional grid and waits for it to finish.
///
/// `blocks` and `threads` are the grid and block sizes along `x`; the other
/// two dimensions are fixed at 1. `args` must hold one pointer per kernel
/// parameter, each pointing at that parameter's value. The launch uses no
/// dynamic shared memory and the null (default) stream.
///
/// # Errors
///
/// * `blocks` or `threads` is not strictly positive;
/// * the kernel has a null function handle;
/// * the launch or the subsequent device synchronisation reports an error.
#[cfg(feature = "cuda")]
fn launch_kernel_impl(
    &self,
    kernel: &CudaKernel,
    blocks: i32,
    threads: i32,
    args: &[*mut std::ffi::c_void],
) -> Result<()> {
    use cuda_runtime_sys::*;

    // Reject non-positive sizes up front: `as u32` would silently turn a
    // negative value into an enormous grid or block dimension.
    if blocks <= 0 || threads <= 0 {
        return Err(anyhow!(
            "Invalid kernel launch configuration: blocks={}, threads={}",
            blocks,
            threads
        ));
    }

    if kernel.function.is_null() {
        return Err(anyhow!(
            "Kernel {} has no compiled function handle",
            kernel.name
        ));
    }

    let grid = dim3 {
        x: blocks as u32,
        y: 1,
        z: 1,
    };
    let block = dim3 {
        x: threads as u32,
        y: 1,
        z: 1,
    };

    // SAFETY: `kernel.function` is non-null (checked above) and `args`
    // outlives the call; the caller is responsible for `args` matching the
    // kernel's parameter list.
    let launch_status = unsafe {
        cudaLaunchKernel(
            kernel.function,
            grid,
            block,
            args.as_ptr() as *mut *mut std::ffi::c_void,
            0,
            std::ptr::null_mut(),
        )
    };
    if launch_status != cudaError_t::cudaSuccess {
        return Err(anyhow!("Failed to launch kernel"));
    }

    // Block until the kernel has finished so execution errors surface here.
    // SAFETY: plain FFI call without arguments.
    let sync_status = unsafe { cudaDeviceSynchronize() };
    if sync_status != cudaError_t::cudaSuccess {
        return Err(anyhow!("Kernel execution failed"));
    }

    Ok(())
}
370
371 pub fn device(&self) -> &GpuDevice {
373 &self.device
374 }
375
376 pub fn config(&self) -> &GpuConfig {
378 &self.config
379 }
380
381 pub fn performance_stats(&self) -> Arc<RwLock<GpuPerformanceStats>> {
383 self.performance_stats.clone()
384 }
385
386 pub fn synchronize(&self) -> Result<()> {
388 #[cfg(feature = "cuda")]
389 {
390 use cuda_runtime_sys::*;
391 unsafe {
392 let result = cudaDeviceSynchronize();
393 if result != cudaError_t::cudaSuccess {
394 return Err(anyhow!("Failed to synchronize device"));
395 }
396 }
397 }
398 Ok(())
399 }
400
401 pub fn reset_stats(&self) {
403 self.performance_stats.write().reset();
404 }
405
406 pub fn get_memory_usage(&self) -> Result<usize> {
408 #[cfg(feature = "cuda")]
409 {
410 use cuda_runtime_sys::*;
411 unsafe {
412 let mut free: usize = 0;
413 let mut total: usize = 0;
414 let result = cudaMemGetInfo(&mut free as *mut usize, &mut total as *mut usize);
415 if result != cudaError_t::cudaSuccess {
416 return Err(anyhow!("Failed to get memory info"));
417 }
418 Ok(total - free)
419 }
420 }
421 #[cfg(not(feature = "cuda"))]
422 {
423 Ok(0)
425 }
426 }
427}
428
429impl Drop for GpuAccelerator {
430 fn drop(&mut self) {
431 #[cfg(feature = "cuda")]
433 {
434 for stream in &self.stream_pool {
435 unsafe {
436 let _ = cuda_runtime_sys::cudaStreamDestroy(
437 stream.handle as cuda_runtime_sys::cudaStream_t,
438 );
439 }
440 }
441 }
442 }
443}
444
/// Reports whether at least one GPU can be used.
///
/// With the `cuda` feature this is `true` when device enumeration succeeds
/// and returns a non-empty list; enumeration errors count as "not available".
/// Without the feature the answer is always `false`.
pub fn is_gpu_available() -> bool {
    #[cfg(feature = "cuda")]
    {
        matches!(
            crate::gpu::device::GpuDevice::get_all_devices(),
            Ok(devices) if !devices.is_empty()
        )
    }

    #[cfg(not(feature = "cuda"))]
    {
        false
    }
}
459
460pub fn create_default_accelerator() -> Result<GpuAccelerator> {
462 let config = GpuConfig::default();
463 GpuAccelerator::new(config)
464}
465
466pub fn create_performance_accelerator() -> Result<GpuAccelerator> {
468 let config = GpuConfig {
469 optimization_level: crate::gpu::OptimizationLevel::Performance,
470 precision_mode: crate::gpu::PrecisionMode::FP32,
471 memory_pool_size: 1024 * 1024 * 1024, batch_size: 10000,
473 enable_tensor_cores: true,
474 enable_mixed_precision: false,
475 ..Default::default()
476 };
477 GpuAccelerator::new(config)
478}
479
480pub fn create_memory_optimized_accelerator() -> Result<GpuAccelerator> {
482 let config = GpuConfig {
483 optimization_level: crate::gpu::OptimizationLevel::Balanced,
484 precision_mode: crate::gpu::PrecisionMode::FP16,
485 memory_pool_size: 256 * 1024 * 1024, batch_size: 1000,
487 enable_tensor_cores: true,
488 enable_mixed_precision: true,
489 ..Default::default()
490 };
491 GpuAccelerator::new(config)
492}