pandrs 0.3.0

A high-performance DataFrame library for Rust, providing a pandas-like API with advanced features including SIMD optimization, parallel processing, and distributed computing.
//! CUDA-specific GPU operations
//!
//! This module provides CUDA implementations of the GPU operations used in the
//! operations module. It is only compiled when the 'cuda' feature is enabled
//! (surfaced throughout this file as the `cuda_available` cfg).
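//!
//! # Example
//!
//! A minimal sketch of how these entry points are meant to be driven. The
//! module path and the `GpuManager` construction are assumptions based on the
//! imports below, and the code needs a CUDA-capable build, so the example is
//! marked `ignore`:
//!
//! ```ignore
//! use ndarray::Array2;
//! use pandrs::gpu::operations::GpuMatrix;
//!
//! // `manager` would come from the crate's GPU initialization (not shown here).
//! let a = GpuMatrix { data: Array2::from_elem((2, 2), 1.0), on_gpu: false };
//! let b = GpuMatrix { data: Array2::from_elem((2, 2), 2.0), on_gpu: false };
//! let sum = pandrs::gpu::cuda::elementwise_add(&a, &b, &manager)?;
//! ```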

use ndarray::{Array1, Array2};
#[cfg(cuda_available)]
use std::sync::Arc;

use crate::error::{Error, Result};
use crate::gpu::operations::{GpuMatrix, GpuVector};
use crate::gpu::{GpuError, GpuManager};

// Import CUDA-specific dependencies when the feature is enabled
#[cfg(cuda_available)]
use cudarc::cublas::CudaBlas;
#[cfg(cuda_available)]
use cudarc::driver::{CudaContext as CudarcContext, CudaFunction, CudaSlice, CudaStream};

/// CUDA context wrapper for managing CUDA resources
#[cfg(cuda_available)]
pub struct PandrsGpuContext {
    /// CUDA context (cudarc 0.18+ renamed CudaDevice to CudaContext)
    context: Arc<CudarcContext>,
    /// CUDA stream for operations
    stream: Arc<CudaStream>,
    /// cuBLAS handle
    cublas: Arc<CudaBlas>,
    /// Whether the device supports compute capability 7.0+ (Volta or later)
    supports_tensor_cores: bool,
}

#[cfg(cuda_available)]
impl PandrsGpuContext {
    /// Create a new CUDA context
    pub fn new(device_id: i32) -> Result<Self> {
        // Initialize CUDA context (cudarc 0.18+ uses CudaContext instead of CudaDevice)
        let context = match CudarcContext::new(device_id as usize) {
            Ok(ctx) => ctx,
            Err(e) => {
                return Err(Error::from(GpuError::DeviceError(format!(
                    "Failed to initialize CUDA context: {}",
                    e
                ))))
            }
        };

        // Get default stream
        let stream = context.default_stream();

        // Initialize cuBLAS
        let cublas = match CudaBlas::new(stream.clone()) {
            Ok(cublas) => Arc::new(cublas),
            Err(e) => {
                return Err(Error::from(GpuError::DeviceError(format!(
                    "Failed to initialize cuBLAS: {}",
                    e
                ))))
            }
        };

        // Check compute capability
        // cudarc 0.18.x doesn't expose device properties easily
        // Assume modern GPU with tensor core support
        let supports_tensor_cores = true;

        Ok(PandrsGpuContext {
            context,
            stream,
            cublas,
            supports_tensor_cores,
        })
    }

    /// Get CUDA context
    pub fn context(&self) -> Arc<CudarcContext> {
        self.context.clone()
    }

    /// Get CUDA stream
    pub fn stream(&self) -> Arc<CudaStream> {
        self.stream.clone()
    }

    /// Get cuBLAS handle
    pub fn cublas(&self) -> Arc<CudaBlas> {
        self.cublas.clone()
    }

    /// Check if tensor cores are supported
    pub fn supports_tensor_cores(&self) -> bool {
        self.supports_tensor_cores
    }

    /// Load a CUDA kernel from PTX
    pub fn load_kernel(&self, name: &str, _ptx: &str) -> Result<CudaFunction> {
        // In cudarc 0.18.x, kernel loading requires load_module() first
        // For now, return an error since dynamic PTX loading requires proper module handling
        Err(Error::from(GpuError::DeviceError(format!(
            "Kernel '{}' not found. PTX loading requires module loading in cudarc 0.18.x.",
            name
        ))))
    }
}

/// Get or create a CUDA context for the specified device
#[cfg(cuda_available)]
fn get_cuda_context(manager: &GpuManager) -> Result<Arc<PandrsGpuContext>> {
    // In a real implementation this would maintain a cache of contexts (see
    // the sketch after this function); for simplicity we create a new one
    // each time.
    let context = PandrsGpuContext::new(manager.context().config().device_id)?;
    Ok(Arc::new(context))
}
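
// A minimal sketch of that cache (hypothetical helper, not yet wired into
// `get_cuda_context`): contexts are created once per device id and shared
// through `Arc` clones.
#[cfg(cuda_available)]
#[allow(dead_code)]
fn cached_cuda_context(device_id: i32) -> Result<Arc<PandrsGpuContext>> {
    use std::collections::HashMap;
    use std::sync::{Mutex, OnceLock};

    static CACHE: OnceLock<Mutex<HashMap<i32, Arc<PandrsGpuContext>>>> = OnceLock::new();
    let cache = CACHE.get_or_init(|| Mutex::new(HashMap::new()));

    let mut guard = cache.lock().map_err(|_| {
        Error::from(GpuError::DeviceError("Context cache poisoned".to_string()))
    })?;
    if let Some(ctx) = guard.get(&device_id) {
        return Ok(ctx.clone());
    }
    let ctx = Arc::new(PandrsGpuContext::new(device_id)?);
    guard.insert(device_id, ctx.clone());
    Ok(ctx)
}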

/// Matrix multiplication using CUDA
pub fn matrix_multiply(a: &GpuMatrix, b: &GpuMatrix, manager: &GpuManager) -> Result<GpuMatrix> {
    // Check if dimensions are compatible
    if a.data.shape()[1] != b.data.shape()[0] {
        return Err(Error::DimensionMismatch(format!(
            "Incompatible dimensions for matrix multiplication: {:?} and {:?}",
            a.data.shape(),
            b.data.shape()
        )));
    }

    #[cfg(cuda_available)]
    {
        let cuda_context = get_cuda_context(manager)?;
        let stream = cuda_context.stream();
        let cublas = cuda_context.cublas();

        // Get dimensions
        let m = a.data.shape()[0] as i32;
        let n = b.data.shape()[1] as i32;
        let k = a.data.shape()[1] as i32;

        // Allocate device memory
        let a_data = a
            .data
            .as_slice()
            .ok_or_else(|| Error::InvalidOperation("Data not in contiguous layout".into()))?;
        let b_data = b
            .data
            .as_slice()
            .ok_or_else(|| Error::InvalidOperation("Data not in contiguous layout".into()))?;

        // Copy data to device using stream (cudarc 0.18+ API uses clone_htod)
        let _d_a = match stream.clone_htod(a_data) {
            Ok(d_a) => d_a,
            Err(e) => {
                return Err(Error::from(GpuError::TransferError(format!(
                    "Failed to copy matrix A to device: {}",
                    e
                ))))
            }
        };

        let _d_b = match stream.clone_htod(b_data) {
            Ok(d_b) => d_b,
            Err(e) => {
                return Err(Error::from(GpuError::TransferError(format!(
                    "Failed to copy matrix B to device: {}",
                    e
                ))))
            }
        };

        // Allocate device memory for result
        let _d_c = match stream.alloc_zeros::<f64>((m * n) as usize) {
            Ok(d_c) => d_c,
            Err(e) => {
                return Err(Error::from(GpuError::DeviceError(format!(
                    "Failed to allocate device memory: {}",
                    e
                ))))
            }
        };

        // Set up parameters for GEMM
        let _alpha = 1.0f64;
        let _beta = 0.0f64;

        // Perform matrix multiplication
        // Note: a full GEMM here would require proper cuBLAS bindings for
        // cudarc 0.18.x, so return a placeholder error for now. (A host-side
        // reference implementation follows this function.)
        return Err(Error::from(GpuError::DeviceError(
            "GPU matrix multiplication not fully implemented for cudarc 0.18.x".to_string(),
        )));
    }

    #[cfg(not(cuda_available))]
    {
        // CUDA unavailable: return a zero matrix of the result shape as a placeholder
        let m = a.data.shape()[0];
        let n = b.data.shape()[1];
        let result_shape = (m, n);
        let result_data = Array2::zeros(result_shape);

        Ok(GpuMatrix {
            data: result_data,
            on_gpu: false,
        })
    }
}
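
// A host-side reference for what the GEMM path above should compute: a sketch
// using ndarray's `dot`, handy for validating a future device implementation
// (hypothetical helper, not part of the public API).
#[allow(dead_code)]
fn matrix_multiply_cpu(a: &GpuMatrix, b: &GpuMatrix) -> GpuMatrix {
    GpuMatrix {
        // `dot` on two 2-D arrays performs standard matrix multiplication.
        data: a.data.dot(&b.data),
        on_gpu: false,
    }
}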

/// Element-wise operation template for CUDA
///
/// The closure type parameter `F` is currently unused by the device path; the
/// sketch after this function shows how it could drive a host-side fallback.
#[cfg(cuda_available)]
fn elementwise_op<F>(
    a: &GpuMatrix,
    b: &GpuMatrix,
    manager: &GpuManager,
    op_name: &str,
    ptx_code: &str,
    op_type: &str,
) -> Result<GpuMatrix>
where
    F: Fn(f64, f64) -> f64,
{
    // Check if dimensions match
    if a.data.shape() != b.data.shape() {
        return Err(Error::DimensionMismatch(format!(
            "Incompatible dimensions for element-wise {}: {:?} and {:?}",
            op_type,
            a.data.shape(),
            b.data.shape()
        )));
    }

    let cuda_context = get_cuda_context(manager)?;
    let stream = cuda_context.stream();

    // Get dimensions
    let shape = a.data.shape();
    let total_elements = shape[0] * shape[1];

    // Load kernel
    let _kernel = cuda_context.load_kernel(op_name, ptx_code)?;

    // Allocate device memory
    let a_data = a.data.as_slice().ok_or_else(|| {
        Error::from(GpuError::DeviceError(
            "Matrix A is not contiguous in memory".to_string(),
        ))
    })?;
    let b_data = b.data.as_slice().ok_or_else(|| {
        Error::from(GpuError::DeviceError(
            "Matrix B is not contiguous in memory".to_string(),
        ))
    })?;

    // Copy data to device using stream (cudarc 0.18+ API uses clone_htod)
    let _d_a = match stream.clone_htod(a_data) {
        Ok(d_a) => d_a,
        Err(e) => {
            return Err(Error::from(GpuError::TransferError(format!(
                "Failed to copy matrix A to device: {}",
                e
            ))))
        }
    };

    let _d_b = match stream.clone_htod(b_data) {
        Ok(d_b) => d_b,
        Err(e) => {
            return Err(Error::from(GpuError::TransferError(format!(
                "Failed to copy matrix B to device: {}",
                e
            ))))
        }
    };

    // Allocate device memory for result
    let _d_c = match stream.alloc_zeros::<f64>(total_elements) {
        Ok(d_c) => d_c,
        Err(e) => {
            return Err(Error::from(GpuError::DeviceError(format!(
                "Failed to allocate device memory: {}",
                e
            ))))
        }
    };

    // Calculate grid and block dimensions (ceiling division, so every element
    // is covered by exactly one thread)
    let _block_size = 256;
    let _grid_size = (total_elements + _block_size - 1) / _block_size;

    // Launch kernel
    // Note: kernel launch API for cudarc 0.18.x
    // For now, return a placeholder error
    return Err(Error::from(GpuError::DeviceError(
        "GPU kernel launch not fully implemented for cudarc 0.18.x".to_string(),
    )));
}
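
// A sketch of how the unused closure parameter above could drive a host-side
// fallback (hypothetical helper): apply `op` pairwise over both matrices with
// `ndarray::Zip`. Shapes are assumed to be validated by the caller.
#[allow(dead_code)]
fn elementwise_op_cpu<F>(a: &GpuMatrix, b: &GpuMatrix, op: F) -> GpuMatrix
where
    F: Fn(f64, f64) -> f64,
{
    let mut result = Array2::zeros(a.data.raw_dim());
    ndarray::Zip::from(&mut result)
        .and(&a.data)
        .and(&b.data)
        .for_each(|r, &x, &y| *r = op(x, y));
    GpuMatrix {
        data: result,
        on_gpu: false,
    }
}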

/// Element-wise addition of matrices using CUDA
pub fn elementwise_add(a: &GpuMatrix, b: &GpuMatrix, manager: &GpuManager) -> Result<GpuMatrix> {
    // Check if dimensions match
    if a.data.shape() != b.data.shape() {
        return Err(Error::DimensionMismatch(format!(
            "Incompatible dimensions for element-wise addition: {:?} and {:?}",
            a.data.shape(),
            b.data.shape()
        )));
    }

    #[cfg(cuda_available)]
    {
        // PTX code for element-wise addition kernel
        const PTX_ADD: &str = r#"
        .version 7.0
        .target sm_70
        .address_size 64

        .visible .entry add(
            .param .u64 a,
            .param .u64 b,
            .param .u64 c,
            .param .u32 n
        )
        {
            .reg .pred 	%p<2>;
            .reg .b32 	%r<4>;
            .reg .b64 	%rd<8>;
            .reg .f64 	%fd<4>;

            ld.param.u64 	%rd1, [a];
            ld.param.u64 	%rd2, [b];
            ld.param.u64 	%rd3, [c];
            ld.param.u32 	%r1, [n];

            mov.u32 	%r2, %tid.x;
            mov.u32 	%r3, %ntid.x;
            mad.lo.s32 	%r2, %r3, %ctaid.x, %r2;

            setp.ge.s32	%p1, %r2, %r1;
            @%p1 bra 	$L__BB0_2;

            cvt.s64.s32	%rd4, %r2;
            mul.wide.s32 	%rd5, %r2, 8;
            add.s64 	%rd6, %rd1, %rd5;
            add.s64 	%rd7, %rd2, %rd5;
            ld.global.f64 	%fd1, [%rd6];
            ld.global.f64 	%fd2, [%rd7];
            add.f64 	%fd3, %fd1, %fd2;
            add.s64 	%rd4, %rd3, %rd5;
            st.global.f64 	[%rd4], %fd3;

        $L__BB0_2:
            ret;
        }
        "#;

        return elementwise_op::<fn(f64, f64) -> f64>(a, b, manager, "add", PTX_ADD, "addition");
    }

    #[cfg(not(cuda_available))]
    {
        // CUDA unavailable: return zeros of the same shape as a placeholder.
        // (`raw_dim()` is needed here: `shape()` returns `&[usize]`, which does
        // not satisfy `Array2::zeros`'s two-dimensional shape bound.)
        let result_data = Array2::zeros(a.data.raw_dim());

        Ok(GpuMatrix {
            data: result_data,
            on_gpu: false,
        })
    }
}

/// Element-wise subtraction of matrices using CUDA
pub fn elementwise_subtract(
    a: &GpuMatrix,
    b: &GpuMatrix,
    manager: &GpuManager,
) -> Result<GpuMatrix> {
    // Check if dimensions match
    if a.data.shape() != b.data.shape() {
        return Err(Error::DimensionMismatch(format!(
            "Incompatible dimensions for element-wise subtraction: {:?} and {:?}",
            a.data.shape(),
            b.data.shape()
        )));
    }

    #[cfg(cuda_available)]
    {
        // PTX code for element-wise subtraction kernel
        const PTX_SUB: &str = r#"
        .version 7.0
        .target sm_70
        .address_size 64

        .visible .entry subtract(
            .param .u64 a,
            .param .u64 b,
            .param .u64 c,
            .param .u32 n
        )
        {
            .reg .pred 	%p<2>;
            .reg .b32 	%r<4>;
            .reg .b64 	%rd<8>;
            .reg .f64 	%fd<4>;

            ld.param.u64 	%rd1, [a];
            ld.param.u64 	%rd2, [b];
            ld.param.u64 	%rd3, [c];
            ld.param.u32 	%r1, [n];

            mov.u32 	%r2, %tid.x;
            mov.u32 	%r3, %ntid.x;
            mad.lo.s32 	%r2, %r3, %ctaid.x, %r2;

            setp.ge.s32	%p1, %r2, %r1;
            @%p1 bra 	$L__BB0_2;

            cvt.s64.s32	%rd4, %r2;
            mul.wide.s32 	%rd5, %r2, 8;
            add.s64 	%rd6, %rd1, %rd5;
            add.s64 	%rd7, %rd2, %rd5;
            ld.global.f64 	%fd1, [%rd6];
            ld.global.f64 	%fd2, [%rd7];
            sub.f64 	%fd3, %fd1, %fd2;
            add.s64 	%rd4, %rd3, %rd5;
            st.global.f64 	[%rd4], %fd3;

        $L__BB0_2:
            ret;
        }
        "#;

        return elementwise_op::<fn(f64, f64) -> f64>(
            a,
            b,
            manager,
            "subtract",
            PTX_SUB,
            "subtraction",
        );
    }

    #[cfg(not(cuda_available))]
    {
        // CUDA unavailable: placeholder zeros (`raw_dim()` gives the typed 2-D shape)
        let result_data = Array2::zeros(a.data.raw_dim());

        Ok(GpuMatrix {
            data: result_data,
            on_gpu: false,
        })
    }
}

/// Element-wise multiplication of matrices using CUDA
pub fn elementwise_multiply(
    a: &GpuMatrix,
    b: &GpuMatrix,
    manager: &GpuManager,
) -> Result<GpuMatrix> {
    // Check if dimensions match
    if a.data.shape() != b.data.shape() {
        return Err(Error::DimensionMismatch(format!(
            "Incompatible dimensions for element-wise multiplication: {:?} and {:?}",
            a.data.shape(),
            b.data.shape()
        )));
    }

    #[cfg(cuda_available)]
    {
        // PTX code for element-wise multiplication kernel
        const PTX_MUL: &str = r#"
        .version 7.0
        .target sm_70
        .address_size 64

        .visible .entry multiply(
            .param .u64 a,
            .param .u64 b,
            .param .u64 c,
            .param .u32 n
        )
        {
            .reg .pred 	%p<2>;
            .reg .b32 	%r<4>;
            .reg .b64 	%rd<8>;
            .reg .f64 	%fd<4>;

            ld.param.u64 	%rd1, [a];
            ld.param.u64 	%rd2, [b];
            ld.param.u64 	%rd3, [c];
            ld.param.u32 	%r1, [n];

            mov.u32 	%r2, %tid.x;
            mov.u32 	%r3, %ntid.x;
            mad.lo.s32 	%r2, %r3, %ctaid.x, %r2;

            setp.ge.s32	%p1, %r2, %r1;
            @%p1 bra 	$L__BB0_2;

            cvt.s64.s32	%rd4, %r2;
            mul.wide.s32 	%rd5, %r2, 8;
            add.s64 	%rd6, %rd1, %rd5;
            add.s64 	%rd7, %rd2, %rd5;
            ld.global.f64 	%fd1, [%rd6];
            ld.global.f64 	%fd2, [%rd7];
            mul.f64 	%fd3, %fd1, %fd2;
            add.s64 	%rd4, %rd3, %rd5;
            st.global.f64 	[%rd4], %fd3;

        $L__BB0_2:
            ret;
        }
        "#;

        return elementwise_op::<fn(f64, f64) -> f64>(
            a,
            b,
            manager,
            "multiply",
            PTX_MUL,
            "multiplication",
        );
    }

    #[cfg(not(cuda_available))]
    {
        // CUDA unavailable: placeholder zeros (`raw_dim()` gives the typed 2-D shape)
        let result_data = Array2::zeros(a.data.raw_dim());

        Ok(GpuMatrix {
            data: result_data,
            on_gpu: false,
        })
    }
}

/// Element-wise division of matrices using CUDA
pub fn elementwise_divide(a: &GpuMatrix, b: &GpuMatrix, manager: &GpuManager) -> Result<GpuMatrix> {
    // Check if dimensions match
    if a.data.shape() != b.data.shape() {
        return Err(Error::DimensionMismatch(format!(
            "Incompatible dimensions for element-wise division: {:?} and {:?}",
            a.data.shape(),
            b.data.shape()
        )));
    }

    #[cfg(cuda_available)]
    {
        // PTX code for element-wise division kernel
        const PTX_DIV: &str = r#"
        .version 7.0
        .target sm_70
        .address_size 64

        .visible .entry divide(
            .param .u64 a,
            .param .u64 b,
            .param .u64 c,
            .param .u32 n
        )
        {
            .reg .b32 	%r<4>;
            .reg .b64 	%rd<8>;
            .reg .f64 	%fd<4>;
            .reg .pred 	%p<3>;  // %p1 and %p2 are both used below

            ld.param.u64 	%rd1, [a];
            ld.param.u64 	%rd2, [b];
            ld.param.u64 	%rd3, [c];
            ld.param.u32 	%r1, [n];

            mov.u32 	%r2, %tid.x;
            mov.u32 	%r3, %ntid.x;
            mad.lo.s32 	%r2, %r3, %ctaid.x, %r2;

            setp.ge.s32	%p1, %r2, %r1;
            @%p1 bra 	$L__BB0_2;

            cvt.s64.s32	%rd4, %r2;
            mul.wide.s32 	%rd5, %r2, 8;
            add.s64 	%rd6, %rd1, %rd5;
            add.s64 	%rd7, %rd2, %rd5;
            ld.global.f64 	%fd1, [%rd6];
            ld.global.f64 	%fd2, [%rd7];

            // Check for division by zero
            setp.eq.f64	%p2, %fd2, 0.0;
            @%p2 bra	$L__BB0_1;

            div.rn.f64 	%fd3, %fd1, %fd2;
            bra $L__BB0_3;

        $L__BB0_1:
            mov.f64 	%fd3, 0d7FF8000000000000;  // NaN

        $L__BB0_3:
            add.s64 	%rd4, %rd3, %rd5;
            st.global.f64 	[%rd4], %fd3;

        $L__BB0_2:
            ret;
        }
        "#;

        return elementwise_op::<fn(f64, f64) -> f64>(a, b, manager, "divide", PTX_DIV, "division");
    }

    #[cfg(not(cuda_available))]
    {
        // CUDA unavailable: placeholder zeros (`raw_dim()` gives the typed 2-D shape)
        let result_data = Array2::zeros(a.data.raw_dim());

        Ok(GpuMatrix {
            data: result_data,
            on_gpu: false,
        })
    }
}

/// Sum of all matrix elements using CUDA
pub fn matrix_sum(a: &GpuMatrix, manager: &GpuManager) -> Result<f64> {
    #[cfg(cuda_available)]
    {
        let cuda_context = get_cuda_context(manager)?;
        let stream = cuda_context.stream();
        let _cublas = cuda_context.cublas();

        // Get dimensions
        let shape = a.data.shape();
        let total_elements = shape[0] * shape[1];

        // Allocate device memory
        let a_data = a
            .data
            .as_slice()
            .ok_or_else(|| Error::InvalidOperation("Data not in contiguous layout".into()))?;

        // Copy data to device using stream (cudarc 0.18+ API)
        let _d_a = match stream.clone_htod(a_data) {
            Ok(d_a) => d_a,
            Err(e) => {
                return Err(Error::from(GpuError::TransferError(format!(
                    "Failed to copy matrix to device: {}",
                    e
                ))))
            }
        };

        // Create a vector of ones for reduction
        let ones = vec![1.0f64; total_elements];
        let _d_ones = match stream.clone_htod(&ones) {
            Ok(d_ones) => d_ones,
            Err(e) => {
                return Err(Error::from(GpuError::TransferError(format!(
                    "Failed to copy ones vector to device: {}",
                    e
                ))))
            }
        };

        // Allocate device memory for result
        let _d_sum = match stream.alloc_zeros::<f64>(1) {
            Ok(d_sum) => d_sum,
            Err(e) => {
                return Err(Error::from(GpuError::DeviceError(format!(
                    "Failed to allocate device memory: {}",
                    e
                ))))
            }
        };

        // Compute dot product: sum = A • ones
        let _incx = 1;
        let _incy = 1;
        // Note: CudaBlas API for cudarc 0.18.x
        return Err(Error::from(GpuError::DeviceError(
            "GPU BLAS operations not fully implemented for cudarc 0.18.x".to_string(),
        )));
    }

    #[cfg(not(cuda_available))]
    {
        // Placeholder result when CUDA is unavailable
        Ok(0.0)
    }
}

/// Sort matrix rows using CUDA
///
/// Note: the CUDA path currently returns an unsorted copy of the input; the
/// bitonic sort kernel below is a placeholder.
pub fn sort_matrix_rows(a: &GpuMatrix, manager: &GpuManager) -> Result<GpuMatrix> {
    #[cfg(cuda_available)]
    {
        // PTX code for bitonic sort kernel (simplified for demonstration)
        const _PTX_SORT: &str = r#"
        .version 7.0
        .target sm_70
        .address_size 64

        .visible .entry bitonic_sort(
            .param .u64 input,
            .param .u64 output,
            .param .u32 width,
            .param .u32 height
        )
        {
            // Implementation of bitonic sort would go here
            // This is a simplified placeholder
        }
        "#;

        let cuda_context = get_cuda_context(manager)?;
        let _stream = cuda_context.stream();

        // Get dimensions
        let shape = a.data.shape();
        let _height = shape[0];
        let _width = shape[1];
        let _total_elements = _height * _width;

        // For demonstration, we'll just copy the input to output
        // In a real implementation, we would perform a proper sort
        let _a_data = a
            .data
            .as_slice()
            .ok_or_else(|| Error::InvalidOperation("Data not in contiguous layout".into()))?;

        // Create result matrix with same dimensions
        let result_data = a.data.clone();

        Ok(GpuMatrix {
            data: result_data,
            on_gpu: false,
        })
    }

    #[cfg(not(cuda_available))]
    {
        // CUDA unavailable: placeholder zeros (`raw_dim()` gives the typed 2-D shape)
        let result_data = Array2::zeros(a.data.raw_dim());

        Ok(GpuMatrix {
            data: result_data,
            on_gpu: false,
        })
    }
}

/// Vector dot product using CUDA
pub fn vector_dot_product(a: &GpuVector, b: &GpuVector, manager: &GpuManager) -> Result<f64> {
    // Check if dimensions match
    if a.data.len() != b.data.len() {
        return Err(Error::DimensionMismatch(format!(
            "Incompatible dimensions for dot product: {} and {}",
            a.data.len(),
            b.data.len()
        )));
    }

    #[cfg(cuda_available)]
    {
        let cuda_context = get_cuda_context(manager)?;
        let stream = cuda_context.stream();
        let _cublas = cuda_context.cublas();

        // Get dimensions
        let _n = a.data.len() as i32;

        // Allocate device memory
        let a_data = a
            .data
            .as_slice()
            .ok_or_else(|| Error::InvalidOperation("Data not in contiguous layout".into()))?;
        let b_data = b
            .data
            .as_slice()
            .ok_or_else(|| Error::InvalidOperation("Data not in contiguous layout".into()))?;

        // Copy data to device using stream (cudarc 0.18+ API)
        let _d_a = match stream.clone_htod(a_data) {
            Ok(d_a) => d_a,
            Err(e) => {
                return Err(Error::from(GpuError::TransferError(format!(
                    "Failed to copy vector A to device: {}",
                    e
                ))))
            }
        };

        let _d_b = match stream.clone_htod(b_data) {
            Ok(d_b) => d_b,
            Err(e) => {
                return Err(Error::from(GpuError::TransferError(format!(
                    "Failed to copy vector B to device: {}",
                    e
                ))))
            }
        };

        // Allocate device memory for result
        let _d_result = match stream.alloc_zeros::<f64>(1) {
            Ok(d_result) => d_result,
            Err(e) => {
                return Err(Error::from(GpuError::DeviceError(format!(
                    "Failed to allocate device memory: {}",
                    e
                ))))
            }
        };

        // Perform dot product
        let _incx = 1;
        let _incy = 1;
        // Note: CudaBlas API for cudarc 0.18.x
        return Err(Error::from(GpuError::DeviceError(
            "GPU BLAS operations not fully implemented for cudarc 0.18.x".to_string(),
        )));
    }

    #[cfg(not(cuda_available))]
    {
        // Placeholder result when CUDA is unavailable
        Ok(0.0)
    }
}
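
// Host-side reference for the dot product above (hypothetical helper):
// `Array1::dot` computes the same inner product on the CPU.
#[allow(dead_code)]
fn vector_dot_product_cpu(a: &GpuVector, b: &GpuVector) -> f64 {
    a.data.dot(&b.data)
}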

/// Vector addition using CUDA
pub fn vector_add(a: &GpuVector, b: &GpuVector, manager: &GpuManager) -> Result<GpuVector> {
    // Check if dimensions match
    if a.data.len() != b.data.len() {
        return Err(Error::DimensionMismatch(format!(
            "Incompatible dimensions for vector addition: {} and {}",
            a.data.len(),
            b.data.len()
        )));
    }

    #[cfg(cuda_available)]
    {
        let cuda_context = get_cuda_context(manager)?;
        let stream = cuda_context.stream();
        let _cublas = cuda_context.cublas();

        // Get dimensions
        let _n = a.data.len() as i32;

        // Allocate device memory
        let a_data = a
            .data
            .as_slice()
            .ok_or_else(|| Error::InvalidOperation("Data not in contiguous layout".into()))?;
        let b_data = b
            .data
            .as_slice()
            .ok_or_else(|| Error::InvalidOperation("Data not in contiguous layout".into()))?;

        // Copy data to device using stream (cudarc 0.18+ API)
        let _d_a = match stream.clone_htod(a_data) {
            Ok(d_a) => d_a,
            Err(e) => {
                return Err(Error::from(GpuError::TransferError(format!(
                    "Failed to copy vector A to device: {}",
                    e
                ))))
            }
        };

        let _d_b = match stream.clone_htod(b_data) {
            Ok(d_b) => d_b,
            Err(e) => {
                return Err(Error::from(GpuError::TransferError(format!(
                    "Failed to copy vector B to device: {}",
                    e
                ))))
            }
        };

        // Allocate device memory for result (copy B as the starting point)
        let _d_result = match stream.clone_htod(b_data) {
            Ok(d_result) => d_result,
            Err(e) => {
                return Err(Error::from(GpuError::DeviceError(format!(
                    "Failed to allocate device memory: {}",
                    e
                ))))
            }
        };

        // Perform vector addition: result = a + b
        // Using axpy: y = alpha*x + y (with alpha=1.0, x=a, y=b/result)
        let _alpha = 1.0f64;
        let _incx = 1;
        let _incy = 1;
        // Note: CudaBlas API for cudarc 0.18.x
        return Err(Error::from(GpuError::DeviceError(
            "GPU BLAS operations not fully implemented for cudarc 0.18.x".to_string(),
        )));
    }

    #[cfg(not(cuda_available))]
    {
        // CUDA unavailable: return a zero vector of the same length as a placeholder
        let len = a.data.len();
        let result_data = Array1::zeros(len);

        Ok(GpuVector {
            data: result_data,
            on_gpu: false,
        })
    }
}

// Helper functions for memory management

/// Transfer a CPU matrix to GPU memory
#[cfg(cuda_available)]
fn to_gpu(matrix: &Array2<f64>, stream: &Arc<CudaStream>) -> Result<CudaSlice<f64>> {
    let data = matrix
        .as_slice()
        .ok_or_else(|| Error::InvalidOperation("Data not in contiguous layout".into()))?;
    let d_data = match stream.clone_htod(data) {
        Ok(d_data) => d_data,
        Err(e) => {
            return Err(Error::from(GpuError::TransferError(format!(
                "Failed to copy matrix to device: {}",
                e
            ))))
        }
    };

    Ok(d_data)
}

/// Transfer a GPU matrix to CPU memory
#[cfg(cuda_available)]
fn to_cpu<T: cudarc::driver::DeviceRepr + Copy + Default>(
    _stream: &Arc<CudaStream>,
    _d_data: &CudaSlice<T>,
    shape: (usize, usize),
) -> Result<Vec<T>> {
    // In cudarc 0.18.x the device-to-host copy goes through the stream
    // (e.g. a dtoh sync copy); until that is wired up, return a zero-value
    // placeholder of the right size.
    let total_elements = shape.0 * shape.1;
    Ok(vec![T::default(); total_elements])
}

/// Free GPU memory (no longer needed: cudarc's RAII handles release device
/// memory when dropped)
#[allow(dead_code)]
fn free_gpu(_gpu_ptr: *mut f64) -> Result<()> {
    // Modern CUDA crates free device memory automatically when the owning
    // slice is dropped, so there is nothing to do here.
    Ok(())
}
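
#[cfg(test)]
mod tests {
    use super::*;

    // Sanity checks for the host-side reference helpers sketched above; the
    // GPU paths need a physical device and are not exercised here.
    #[test]
    fn cpu_reference_matmul() {
        let a = GpuMatrix {
            data: Array2::from_elem((2, 3), 1.0),
            on_gpu: false,
        };
        let b = GpuMatrix {
            data: Array2::from_elem((3, 4), 2.0),
            on_gpu: false,
        };
        let c = matrix_multiply_cpu(&a, &b);
        assert_eq!(c.data.shape(), &[2, 4]);
        // Each entry is a sum of three 1.0 * 2.0 products.
        assert!((c.data[[0, 0]] - 6.0).abs() < 1e-12);
    }

    #[test]
    fn cpu_reference_dot_product() {
        let a = GpuVector {
            data: Array1::from_vec(vec![1.0, 2.0, 3.0]),
            on_gpu: false,
        };
        let b = GpuVector {
            data: Array1::from_vec(vec![4.0, 5.0, 6.0]),
            on_gpu: false,
        };
        // 1*4 + 2*5 + 3*6 = 32
        assert!((vector_dot_product_cpu(&a, &b) - 32.0).abs() < 1e-12);
    }
}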