1use super::{GpuBuffer, GpuConfig, GpuDevice, GpuPerformanceStats, KernelManager};
4use crate::similarity::SimilarityMetric;
5use anyhow::{anyhow, Result};
6use parking_lot::RwLock;
7use std::collections::HashMap;
8use std::sync::{Arc, Mutex};
9
/// Raw stream handle paired with the device id it was created for.
///
/// In builds without the CUDA runtime the handle is a non-null dummy value
/// rather than a real stream (see `GpuAccelerator::create_cuda_stream`).
#[derive(Debug)]
pub struct CudaStream {
    /// Opaque stream pointer; only ever cast back to `cudaStream_t` for FFI calls.
    handle: *mut std::ffi::c_void,
    /// Device id that was passed in when the stream was created.
    device_id: i32,
}

// NOTE(review): raw pointers are neither `Send` nor `Sync` by default. These
// impls assert that the handle may be moved to and shared between threads;
// nothing in this file enforces that, so confirm it against the CUDA runtime's
// thread-safety guarantees.
unsafe impl Send for CudaStream {}
unsafe impl Sync for CudaStream {}
19
/// Handles describing one compiled kernel, as stored in the kernel cache.
///
/// `GpuAccelerator::compile_kernel` currently fills both pointers with null,
/// so a value of this type does not necessarily refer to a loaded kernel.
#[derive(Debug)]
pub struct CudaKernel {
    /// Function handle passed to `cudaLaunchKernel`.
    function: *mut std::ffi::c_void,
    /// Module handle; stored but not read anywhere in this file.
    module: *mut std::ffi::c_void,
    /// Kernel name, also used as the cache key.
    name: String,
}

// NOTE(review): these impls assert that the raw handles may be moved to and
// shared between threads. Nothing in this file enforces that — confirm.
unsafe impl Send for CudaKernel {}
unsafe impl Sync for CudaKernel {}
30
/// Shape and metric information handed to a similarity kernel launch.
#[derive(Debug, Clone)]
pub struct SimilarityKernelParams {
    /// Number of query vectors.
    pub query_count: usize,
    /// Number of database vectors.
    pub db_count: usize,
    /// Number of `f32` components per vector.
    pub dim: usize,
    /// Kernel name selecting the metric; `GpuAccelerator::compute_similarity`
    /// sets it to `"cosine_similarity"` or `"euclidean_distance"`.
    pub metric: String,
}
39
/// Entry point for GPU-backed similarity computation on a single device.
///
/// Without the CUDA runtime (`feature = "cuda"` plus the
/// `cuda_runtime_available` cfg) the same API is served by CPU fallbacks.
#[derive(Debug)]
pub struct GpuAccelerator {
    /// Configuration the accelerator was created with.
    config: GpuConfig,
    /// Description of the device selected by `config.device_id`.
    device: GpuDevice,
    /// Buffer pool; created empty in `new` and not otherwise used in this file.
    memory_pool: Arc<Mutex<Vec<GpuBuffer>>>,
    /// Streams created in `new` and destroyed when the accelerator is dropped.
    stream_pool: Vec<CudaStream>,
    /// Compiled kernels keyed by kernel name.
    kernel_cache: Arc<RwLock<HashMap<String, CudaKernel>>>,
    /// Shared statistics, updated after each similarity computation.
    performance_stats: Arc<RwLock<GpuPerformanceStats>>,
    /// Source of kernel code looked up by name before compilation.
    kernel_manager: KernelManager,
}

// NOTE(review): these impls assert thread-safety for the whole accelerator,
// including every field type declared outside this file. Nothing here proves
// that those types are safe to share — confirm before relying on it.
unsafe impl Send for GpuAccelerator {}
unsafe impl Sync for GpuAccelerator {}
54
55impl GpuAccelerator {
56 pub fn new(config: GpuConfig) -> Result<Self> {
57 config.validate()?;
58
59 let device = GpuDevice::get_device_info(config.device_id)?;
60 let memory_pool = Arc::new(Mutex::new(Vec::new()));
61 let stream_pool = Self::create_streams(config.stream_count, config.device_id)?;
62 let kernel_manager = KernelManager::new();
63
64 Ok(Self {
65 config,
66 device,
67 memory_pool,
68 stream_pool,
69 kernel_cache: Arc::new(RwLock::new(HashMap::new())),
70 performance_stats: Arc::new(RwLock::new(GpuPerformanceStats::new())),
71 kernel_manager,
72 })
73 }
74
75 fn create_streams(count: usize, device_id: i32) -> Result<Vec<CudaStream>> {
76 let mut streams = Vec::new();
77
78 for _ in 0..count {
79 let handle = Self::create_cuda_stream(device_id)?;
80 streams.push(CudaStream { handle, device_id });
81 }
82
83 Ok(streams)
84 }
85
86 #[allow(unused_variables)]
87 fn create_cuda_stream(device_id: i32) -> Result<*mut std::ffi::c_void> {
88 #[cfg(all(feature = "cuda", cuda_runtime_available))]
89 {
90 use cuda_runtime_sys::*;
91 unsafe {
92 let result = cudaSetDevice(device_id);
93 if result != cudaError_t::cudaSuccess {
94 return Err(anyhow!("Failed to set CUDA device"));
95 }
96
97 let mut stream: cudaStream_t = std::ptr::null_mut();
98 let result = cudaStreamCreate(&mut stream);
99 if result != cudaError_t::cudaSuccess {
100 return Err(anyhow!("Failed to create CUDA stream"));
101 }
102 Ok(stream as *mut std::ffi::c_void)
103 }
104 }
105
106 #[cfg(not(all(feature = "cuda", cuda_runtime_available)))]
107 {
108 Ok(1 as *mut std::ffi::c_void)
110 }
111 }
112
113 pub fn compute_similarity(
115 &self,
116 queries: &[f32],
117 database: &[f32],
118 query_count: usize,
119 db_count: usize,
120 dim: usize,
121 metric: SimilarityMetric,
122 ) -> Result<Vec<f32>> {
123 let timer = super::performance::GpuTimer::start("similarity_computation");
124
125 let mut query_buffer = GpuBuffer::new(queries.len(), self.config.device_id)?;
127 let mut db_buffer = GpuBuffer::new(database.len(), self.config.device_id)?;
128 let result_buffer = GpuBuffer::new(query_count * db_count, self.config.device_id)?;
129
130 query_buffer.copy_from_host(queries)?;
132 db_buffer.copy_from_host(database)?;
133
134 let kernel_name = match metric {
136 SimilarityMetric::Cosine => "cosine_similarity",
137 SimilarityMetric::Euclidean => "euclidean_distance",
138 _ => return Err(anyhow!("Unsupported similarity metric for GPU")),
139 };
140
141 let params = SimilarityKernelParams {
143 query_count,
144 db_count,
145 dim,
146 metric: kernel_name.to_string(),
147 };
148
149 self.launch_similarity_kernel(
151 kernel_name,
152 &query_buffer,
153 &db_buffer,
154 &result_buffer,
155 ¶ms,
156 )?;
157
158 let mut results = vec![0.0f32; query_count * db_count];
160 result_buffer.copy_to_host(&mut results)?;
161
162 let duration = timer.stop();
164 self.performance_stats
165 .write()
166 .record_compute_operation(duration);
167
168 Ok(results)
169 }
170
171 fn launch_similarity_kernel(
172 &self,
173 kernel_name: &str,
174 query_buffer: &GpuBuffer,
175 db_buffer: &GpuBuffer,
176 result_buffer: &GpuBuffer,
177 params: &SimilarityKernelParams,
178 ) -> Result<()> {
179 #[cfg(all(feature = "cuda", cuda_runtime_available))]
180 {
181 let kernel = self.get_or_compile_kernel(kernel_name)?;
183
184 let (blocks, threads) = self
186 .device
187 .calculate_optimal_block_config(params.query_count * params.db_count);
188
189 self.launch_kernel_impl(
191 &kernel,
192 blocks,
193 threads,
194 &[
195 query_buffer.ptr() as *mut std::ffi::c_void,
196 db_buffer.ptr() as *mut std::ffi::c_void,
197 result_buffer.ptr() as *mut std::ffi::c_void,
198 ¶ms.query_count as *const usize as *mut std::ffi::c_void,
199 ¶ms.db_count as *const usize as *mut std::ffi::c_void,
200 ¶ms.dim as *const usize as *mut std::ffi::c_void,
201 ],
202 )?;
203 }
204
205 #[cfg(not(all(feature = "cuda", cuda_runtime_available)))]
206 {
207 self.compute_similarity_cpu(
209 query_buffer,
210 db_buffer,
211 result_buffer,
212 params,
213 kernel_name,
214 )?;
215 }
216
217 Ok(())
218 }
219
220 #[cfg(not(all(feature = "cuda", cuda_runtime_available)))]
221 fn compute_similarity_cpu(
222 &self,
223 _query_buffer: &GpuBuffer,
224 _db_buffer: &GpuBuffer,
225 _result_buffer: &GpuBuffer,
226 params: &SimilarityKernelParams,
227 _metric: &str,
228 ) -> Result<()> {
229 let query_data = vec![0.0f32; params.query_count * params.dim];
231 let db_data = vec![0.0f32; params.db_count * params.dim];
232 let mut results = vec![0.0f32; params.query_count * params.db_count];
233
234 for i in 0..params.query_count {
238 for j in 0..params.db_count {
239 let query_vec = &query_data[i * params.dim..(i + 1) * params.dim];
240 let db_vec = &db_data[j * params.dim..(j + 1) * params.dim];
241
242 let similarity = match params.metric.as_str() {
243 "cosine_similarity" => self.compute_cosine_similarity(query_vec, db_vec),
244 "euclidean_distance" => self.compute_euclidean_distance(query_vec, db_vec),
245 _ => 0.0,
246 };
247
248 results[i * params.db_count + j] = similarity;
249 }
250 }
251
252 Ok(())
253 }
254
255 #[cfg(not(all(feature = "cuda", cuda_runtime_available)))]
256 fn compute_cosine_similarity(&self, a: &[f32], b: &[f32]) -> f32 {
257 let dot: f32 = a.iter().zip(b.iter()).map(|(x, y)| x * y).sum();
258 let norm_a: f32 = a.iter().map(|x| x * x).sum::<f32>().sqrt();
259 let norm_b: f32 = b.iter().map(|x| x * x).sum::<f32>().sqrt();
260
261 if norm_a > 1e-8 && norm_b > 1e-8 {
262 dot / (norm_a * norm_b)
263 } else {
264 0.0
265 }
266 }
267
268 #[cfg(not(all(feature = "cuda", cuda_runtime_available)))]
269 fn compute_euclidean_distance(&self, a: &[f32], b: &[f32]) -> f32 {
270 a.iter()
271 .zip(b.iter())
272 .map(|(x, y)| (x - y).powi(2))
273 .sum::<f32>()
274 .sqrt()
275 }
276
277 fn get_or_compile_kernel(&self, name: &str) -> Result<CudaKernel> {
278 if let Some(kernel) = self.kernel_cache.read().get(name) {
280 return Ok(CudaKernel {
281 function: kernel.function,
282 module: kernel.module,
283 name: kernel.name.clone(),
284 });
285 }
286
287 let kernel_source = self
289 .kernel_manager
290 .get_kernel(name)
291 .ok_or_else(|| anyhow!("Kernel {} not found", name))?;
292
293 let compiled_kernel = self.compile_kernel(name, kernel_source)?;
294
295 self.kernel_cache.write().insert(
297 name.to_string(),
298 CudaKernel {
299 function: compiled_kernel.function,
300 module: compiled_kernel.module,
301 name: compiled_kernel.name.clone(),
302 },
303 );
304
305 Ok(compiled_kernel)
306 }
307
308 #[allow(unused_variables)]
309 fn compile_kernel(&self, name: &str, source: &str) -> Result<CudaKernel> {
310 #[cfg(all(feature = "cuda", cuda_runtime_available))]
311 {
312 Ok(CudaKernel {
315 function: std::ptr::null_mut(),
316 module: std::ptr::null_mut(),
317 name: name.to_string(),
318 })
319 }
320
321 #[cfg(not(all(feature = "cuda", cuda_runtime_available)))]
322 {
323 Ok(CudaKernel {
324 function: std::ptr::null_mut(),
325 module: std::ptr::null_mut(),
326 name: name.to_string(),
327 })
328 }
329 }
330
331 #[cfg(all(feature = "cuda", cuda_runtime_available))]
332 fn launch_kernel_impl(
333 &self,
334 kernel: &CudaKernel,
335 blocks: i32,
336 threads: i32,
337 args: &[*mut std::ffi::c_void],
338 ) -> Result<()> {
339 use cuda_runtime_sys::*;
340 unsafe {
341 let result = cudaLaunchKernel(
342 kernel.function,
343 dim3 {
344 x: blocks as u32,
345 y: 1,
346 z: 1,
347 },
348 dim3 {
349 x: threads as u32,
350 y: 1,
351 z: 1,
352 },
353 args.as_ptr() as *mut *mut std::ffi::c_void,
354 0,
355 std::ptr::null_mut(),
356 );
357 if result != cudaError_t::cudaSuccess {
358 return Err(anyhow!("Failed to launch kernel"));
359 }
360
361 let result = cudaDeviceSynchronize();
363 if result != cudaError_t::cudaSuccess {
364 return Err(anyhow!("Kernel execution failed"));
365 }
366 }
367 Ok(())
368 }
369
    /// Returns the description of the device this accelerator was created for.
    pub fn device(&self) -> &GpuDevice {
        &self.device
    }
374
    /// Returns the configuration this accelerator was created with.
    pub fn config(&self) -> &GpuConfig {
        &self.config
    }
379
380 pub fn performance_stats(&self) -> Arc<RwLock<GpuPerformanceStats>> {
382 self.performance_stats.clone()
383 }
384
385 pub fn synchronize(&self) -> Result<()> {
387 #[cfg(all(feature = "cuda", cuda_runtime_available))]
388 {
389 use cuda_runtime_sys::*;
390 unsafe {
391 let result = cudaDeviceSynchronize();
392 if result != cudaError_t::cudaSuccess {
393 return Err(anyhow!("Failed to synchronize device"));
394 }
395 }
396 }
397 Ok(())
398 }
399
    /// Resets the accumulated performance statistics.
    ///
    /// Takes the write lock on the shared statistics for the duration of the
    /// reset.
    pub fn reset_stats(&self) {
        self.performance_stats.write().reset();
    }
404
405 pub fn get_memory_usage(&self) -> Result<usize> {
407 #[cfg(all(feature = "cuda", cuda_runtime_available))]
408 {
409 use cuda_runtime_sys::*;
410 unsafe {
411 let mut free: usize = 0;
412 let mut total: usize = 0;
413 let result = cudaMemGetInfo(&mut free as *mut usize, &mut total as *mut usize);
414 if result != cudaError_t::cudaSuccess {
415 return Err(anyhow!("Failed to get memory info"));
416 }
417 Ok(total - free)
418 }
419 }
420 #[cfg(not(all(feature = "cuda", cuda_runtime_available)))]
421 {
422 Ok(0)
424 }
425 }
426}
427
428impl Drop for GpuAccelerator {
429 fn drop(&mut self) {
430 #[cfg(all(feature = "cuda", cuda_runtime_available))]
432 {
433 for stream in &self.stream_pool {
434 unsafe {
435 let _ = cuda_runtime_sys::cudaStreamDestroy(
436 stream.handle as cuda_runtime_sys::cudaStream_t,
437 );
438 }
439 }
440 }
441 }
442}
443
/// Reports whether at least one GPU device can be used.
///
/// With the CUDA runtime available this is `true` when device enumeration
/// succeeds and returns a non-empty list; an enumeration error counts as
/// "not available". Without the CUDA runtime the answer is always `false`.
pub fn is_gpu_available() -> bool {
    #[cfg(all(feature = "cuda", cuda_runtime_available))]
    {
        crate::gpu::device::GpuDevice::get_all_devices()
            .map(|devices| !devices.is_empty())
            .unwrap_or(false)
    }
    #[cfg(not(all(feature = "cuda", cuda_runtime_available)))]
    {
        false
    }
}
458
459pub fn create_default_accelerator() -> Result<GpuAccelerator> {
461 let config = GpuConfig::default();
462 GpuAccelerator::new(config)
463}
464
465pub fn create_performance_accelerator() -> Result<GpuAccelerator> {
467 let config = GpuConfig {
468 optimization_level: crate::gpu::OptimizationLevel::Performance,
469 precision_mode: crate::gpu::PrecisionMode::FP32,
470 memory_pool_size: 1024 * 1024 * 1024, batch_size: 10000,
472 enable_tensor_cores: true,
473 enable_mixed_precision: false,
474 ..Default::default()
475 };
476 GpuAccelerator::new(config)
477}
478
479pub fn create_memory_optimized_accelerator() -> Result<GpuAccelerator> {
481 let config = GpuConfig {
482 optimization_level: crate::gpu::OptimizationLevel::Balanced,
483 precision_mode: crate::gpu::PrecisionMode::FP16,
484 memory_pool_size: 256 * 1024 * 1024, batch_size: 1000,
486 enable_tensor_cores: true,
487 enable_mixed_precision: true,
488 ..Default::default()
489 };
490 GpuAccelerator::new(config)
491}